In [23]:
# imports
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd

from pydataset import data

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

import matplotlib.pyplot as plt
import seaborn as sns

import graphviz
from graphviz import Graph

In [24]:
def split(df, stratify_by=None):
    """
    Crude train, validate, test split
    To stratify, send in a column name for the stratify_by argument
    """

    if stratify_by == None:
        train, test = train_test_split(df, test_size=.2, random_state=123)
        train, validate = train_test_split(train, test_size=.3, random_state=123)
    else:
        train, test = train_test_split(df, test_size=.2, random_state=123, stratify=df[stratify_by])
        train, validate = train_test_split(train, test_size=.3, random_state=123, stratify=train[stratify_by])

    return train, validate, test

In [25]:
# getting the data
df = pd.read_csv('titanic_df.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,deck,embark_town,alone
0,0,0,0,3,male,22.0,1,0,7.25,S,Third,,Southampton,0
1,1,1,1,1,female,38.0,1,0,71.2833,C,First,C,Cherbourg,0
2,2,2,1,3,female,26.0,0,0,7.925,S,Third,,Southampton,1
3,3,3,1,1,female,35.0,1,0,53.1,S,First,C,Southampton,0
4,4,4,0,3,male,35.0,0,0,8.05,S,Third,,Southampton,1


In [26]:
# cleaning up my column issues
df["is_female"] = df.sex == "Female"

df = df.drop(columns=['Unnamed: 0', 'deck', 'embark_town'])

# We'll want to encode the day variable, since there are 3 possibilities
embarked_dummy = pd.get_dummies(df[["embarked"]], drop_first=True)
class_dummy = pd.get_dummies(df[["class"]], drop_first=True)

In [27]:
# putting the three dataframes together
df = pd.concat([df, embarked_dummy, class_dummy], axis=1)

# drop the old columns
df = df.drop(columns=["sex", 'embarked', 'class'])
df.isna().sum()

passenger_id      0
survived          0
pclass            0
age             177
sibsp             0
parch             0
fare              0
alone             0
is_female         0
embarked_Q        0
embarked_S        0
class_Second      0
class_Third       0
dtype: int64

In [28]:
# Split the data
# stratifying means we're making representative datasets between train, validate, test
train, validate, test = split(df, stratify_by="survived")
train.head()

Unnamed: 0,passenger_id,survived,pclass,age,sibsp,parch,fare,alone,is_female,embarked_Q,embarked_S,class_Second,class_Third
583,583,0,1,36.0,0,0,40.125,1,False,0,0,0,0
165,165,1,3,9.0,0,2,20.525,0,False,0,1,0,1
50,50,0,3,7.0,4,1,39.6875,0,False,0,1,0,1
259,259,1,2,50.0,0,1,26.0,0,False,0,1,1,0
306,306,1,1,,0,0,110.8833,1,False,0,0,0,0


In [29]:
#Calculate our fill value using train dataset only.
avg_age = train.age.mean()

# Fill null values in all of our datasets using our hardcoded value.
train.age = train.age.fillna(avg_age)

validate.age = validate.age.fillna(avg_age)

test.age = test.age.fillna(avg_age)

In [21]:
# Setup our X inputs and y target variable for each split
X_train = train.drop(columns=['survived'])
y_train = train.survived # labeled data == supervise algorithm

X_validate = validate.drop(columns=['survived'])
y_validate = validate.survived

X_test = test.drop(columns=['survived'])
y_test = test.survived

#### 1. What is your baseline prediction? What is your baseline accuracy?

#### 2. Fit the decision tree classifier to your training sample and transform

In [30]:
# generating a blank, new Decision Tree model
clf = DecisionTreeClassifier(max_depth=4, random_state=123)

# training the model on the training data
# fitting == training the model
clf = clf.fit(X_train, y_train)
clf

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=4,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=123, splitter='best')

#### 3. Evaluate your in-sample results using the model score, confusion matrix, and classification report.

#### 4. Compute: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support

#### 5. Run through steps 2-4 using a different max_depth value.

#### 6. Which model performs better on your in-sample data?

#### 7. Which model performs best on your out-of-sample data, the validate set?