In [9]:
# imports
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd

from pydataset import data

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

import matplotlib.pyplot as plt
import seaborn as sns

import graphviz
from graphviz import Graph

ModuleNotFoundError: No module named 'graphviz'

In [10]:
def split(df, stratify_by=None):
    """
    Crude train, validate, test split
    To stratify, send in a column name for the stratify_by argument
    """

    if stratify_by == None:
        train, test = train_test_split(df, test_size=.2, random_state=123)
        train, validate = train_test_split(train, test_size=.3, random_state=123)
    else:
        train, test = train_test_split(df, test_size=.2, random_state=123, stratify=df[stratify_by])
        train, validate = train_test_split(train, test_size=.3, random_state=123, stratify=train[stratify_by])

    return train, validate, test

In [11]:
df = pd.read_csv('titanic_df.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,deck,embark_town,alone
0,0,0,0,3,male,22.0,1,0,7.25,S,Third,,Southampton,0
1,1,1,1,1,female,38.0,1,0,71.2833,C,First,C,Cherbourg,0
2,2,2,1,3,female,26.0,0,0,7.925,S,Third,,Southampton,1
3,3,3,1,1,female,35.0,1,0,53.1,S,First,C,Southampton,0
4,4,4,0,3,male,35.0,0,0,8.05,S,Third,,Southampton,1


In [12]:
df["is_female"] = df.sex == "Female"

df = df.drop(columns=['Unnamed: 0', 'deck', 'embark_town'])

# We'll want to encode the day variable, since there are 3 possibilities
dummy_df = pd.get_dummies(df[["embarked"]], drop_first=True)
dummy_df

Unnamed: 0,embarked_Q,embarked_S
0,0,1
1,0,0
2,0,1
3,0,1
4,0,1
...,...,...
886,0,1
887,0,1
888,0,1
889,0,0


In [13]:
df = pd.concat([df, dummy_df], axis=1)

# drop the old columns
df = df.drop(columns=["sex", 'embarked'])
df.head()

Unnamed: 0,passenger_id,survived,pclass,age,sibsp,parch,fare,class,alone,is_female,embarked_Q,embarked_S
0,0,0,3,22.0,1,0,7.25,Third,0,False,0,1
1,1,1,1,38.0,1,0,71.2833,First,0,False,0,0
2,2,1,3,26.0,0,0,7.925,Third,1,False,0,1
3,3,1,1,35.0,1,0,53.1,First,0,False,0,1
4,4,0,3,35.0,0,0,8.05,Third,1,False,0,1


In [14]:
# Split the data
# stratifying means we're making representative datasets between train, validate, test
train, validate, test = split(df, stratify_by="survived")
train.head()

Unnamed: 0,passenger_id,survived,pclass,age,sibsp,parch,fare,class,alone,is_female,embarked_Q,embarked_S
583,583,0,1,36.0,0,0,40.125,First,1,False,0,0
165,165,1,3,9.0,0,2,20.525,Third,0,False,0,1
50,50,0,3,7.0,4,1,39.6875,Third,0,False,0,1
259,259,1,2,50.0,0,1,26.0,Second,0,False,0,1
306,306,1,1,,0,0,110.8833,First,1,False,0,0


#### 1. What is your baseline prediction? What is your baseline accuracy?

#### 2. Fit the decision tree classifier to your training sample and transform

#### 3. Evaluate your in-sample results using the model score, confusion matrix, and classification report.

#### 4. Compute: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support

#### 5. Run through steps 2-4 using a different max_depth value.

#### 6. Which model performs better on your in-sample data?

#### 7. Which model performs best on your out-of-sample data, the validate set?