In [1]:
import pandas as pd

from sklearn.metrics import precision_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import classification_report

import env
import acquire
import prepare

from sklearn.model_selection import train_test_split

from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz
from sklearn.metrics import confusion_matrix

import matplotlib.pyplot as plt

import graphviz
from graphviz import Graph

In [2]:
titanic = acquire.get_titanic_data()
titanic_df = titanic.copy()
titanic_df.head()

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,deck,embark_town,alone
0,0,0,3,male,22.0,1,0,7.25,S,Third,,Southampton,0
1,1,1,1,female,38.0,1,0,71.2833,C,First,C,Cherbourg,0
2,2,1,3,female,26.0,0,0,7.925,S,Third,,Southampton,1
3,3,1,1,female,35.0,1,0,53.1,S,First,C,Southampton,0
4,4,0,3,male,35.0,0,0,8.05,S,Third,,Southampton,1


In [3]:
def prep_titanic(df):
    '''
    take in titanc dataframe, remove all rows where age or embarked is null, 
    get dummy variables for sex and embark_town, 
    and drop sex, deck, passenger_id, class, and embark_town. 
    '''

    df = df[(df.age.notna()) & (df.embarked.notna())]
    df = df.drop(columns=['deck', 'passenger_id', 'class'])

    dummy_df = pd.get_dummies(df[['sex', 'embark_town']], prefix=['sex', 'embark'])

    df = pd.concat([df, dummy_df.drop(columns=['sex_male'])], axis=1)

    df = df.drop(columns=['sex', 'embark_town', 'embarked']) 

    df = df.rename(columns={"sex_female": "is_female"})

    return df

In [4]:
titanic_df = prep_titanic(titanic_df)
titanic_df.head()

Unnamed: 0,survived,pclass,age,sibsp,parch,fare,alone,is_female,embark_Cherbourg,embark_Queenstown,embark_Southampton
0,0,3,22.0,1,0,7.25,0,0,0,0,1
1,1,1,38.0,1,0,71.2833,0,1,1,0,0
2,1,3,26.0,0,0,7.925,1,1,0,0,1
3,1,1,35.0,1,0,53.1,0,1,0,0,1
4,0,3,35.0,0,0,8.05,1,0,0,0,1


In [5]:
def train_validate_test_split(df, target):
    '''
    This function takes in a dataframe, the name of the target variable
    (for stratification purposes)
    and splits the data into train, validate and test. 
    Test is 20% of the original dataset, validate is .25*.90= 22.5% of the 
    original dataset, and train is .70*.80= 56% of the original dataset. 
    The function returns, in this order, train, validate and test dataframes. 
    '''
    train_validate, test = train_test_split(df, test_size=0.2,  
                                            stratify=df[target])
    
    
    train, validate = train_test_split(train_validate, test_size=0.3, 
                                       stratify=train_validate[target])
    return train, validate, test

In [6]:
train, validate, test = train_validate_test_split(titanic_df, target = 'survived')

In [52]:
train.shape, validate.shape, test.shape

((398, 11), (171, 11), (143, 11))

In [53]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 398 entries, 500 to 100
Data columns (total 11 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   survived            398 non-null    int64  
 1   pclass              398 non-null    int64  
 2   age                 398 non-null    float64
 3   sibsp               398 non-null    int64  
 4   parch               398 non-null    int64  
 5   fare                398 non-null    float64
 6   alone               398 non-null    int64  
 7   is_female           398 non-null    uint8  
 8   embark_Cherbourg    398 non-null    uint8  
 9   embark_Queenstown   398 non-null    uint8  
 10  embark_Southampton  398 non-null    uint8  
dtypes: float64(2), int64(5), uint8(4)
memory usage: 26.4 KB


In [7]:
train['baseline1'] = 0

validate['baseline1'] = 0

test['baseline1'] = 0

In [55]:
train.survived.value_counts()

0    237
1    161
Name: survived, dtype: int64

In [61]:
baseline_accuracy = (train.survived == train.baseline1).mean()
print(f'baseline accuracy: {baseline_accuracy:.2%}')

baseline_accuracy = (validate.survived == validate.baseline1).mean()
print(f'baseline accuracy: {baseline_accuracy:.2%}')

baseline_accuracy = (test.survived == test.baseline1).mean()
print(f'baseline accuracy: {baseline_accuracy:.2%}')

baseline accuracy: 59.55%
baseline accuracy: 59.65%
baseline accuracy: 59.44%


In [8]:
X_train = train.drop(columns=['survived', 'baseline1'])
y_train = train.survived

X_validate = validate.drop(columns=['survived', 'baseline1'])
y_validate = validate.survived

X_test = test.drop(columns=['survived', 'baseline1'])
y_test = test.survived

In [9]:
clf = DecisionTreeClassifier(max_depth=3)

In [27]:
clf2 = DecisionTreeClassifier(max_depth=8)

In [10]:
clf = clf.fit(X_train, y_train)

In [28]:
clf2 = clf2.fit(X_train, y_train)

In [11]:
dot_data = export_graphviz(clf, feature_names= X_train.columns, rounded=True, filled=True, out_file=None)
graph = graphviz.Source(dot_data) 

graph.render('titanic_decision_tree', view=True)

'titanic_decision_tree.pdf'

In [29]:
dot_data = export_graphviz(clf2, feature_names= X_train.columns, rounded=True, filled=True, out_file=None)
graph = graphviz.Source(dot_data) 

graph.render('titanic_decision_tree', view=True)

'titanic_decision_tree.pdf'

In [30]:
y_pred2 = clf2.predict(X_train)
y_pred2[0:4]

array([0, 0, 0, 0])

In [18]:
y_pred = clf.predict(X_train)
y_pred[0:4]

array([1, 0, 0, 1])

In [19]:
y_pred_proba = clf.predict_proba(X_train)
y_pred_proba[0:4]

array([[0.4       , 0.6       ],
       [0.90909091, 0.09090909],
       [0.90909091, 0.09090909],
       [0.42553191, 0.57446809]])

In [31]:
y_pred_proba2 = clf2.predict_proba(X_train)
y_pred_proba2[0:4]

array([[0.72727273, 0.27272727],
       [0.5       , 0.5       ],
       [0.9       , 0.1       ],
       [0.53333333, 0.46666667]])

In [20]:
print('Accuracy of Decision Tree classifier on training set: {:.2f}'
      .format(clf.score(X_train, y_train)))

Accuracy of Decision Tree classifier on training set: 0.84


In [32]:
print('Accuracy of Decision Tree classifier on training set: {:.2f}'
      .format(clf2.score(X_train, y_train)))

Accuracy of Decision Tree classifier on training set: 0.93


In [21]:
# confusion matrix

confusion_matrix(y_train, y_pred)

array([[193,  44],
       [ 21, 140]])

In [53]:
print("First Descision Tree Model")
print("-------------------------------")
#TP/TP+FN
print("True Positive Rate: ", round((193/ (193+21)), 2))

#FP / FP+TN
print("False Positive Rate: ", round((44 / (44 + 140)), 2))

#TN / TN+FP
print("True Negative Rate: ", round((140 / (140+44)), 2))

#FN / FN + TP
print("False Negative Rate: ", round((21 / (21+193)), 2))

First Descision Tree Model
-------------------------------
True Positive Rate:  0.9
False Positive Rate:  0.24
True Negative Rate:  0.76
False Negative Rate:  0.1


In [33]:
# confusion matrix

confusion_matrix(y_train, y_pred2)

array([[236,   1],
       [ 25, 136]])

In [54]:
print("Second Descision Tree Model")
print("-------------------------------")
#TP/TP+FN
print("True Positive Rate: ", round((236/ (236+25)), 2))

#FP / FP+TN
print("False Positive Rate: ", round((1 / (1 + 136)), 2))

#TN / TN+FP
print("True Negative Rate: ", round((136/ (136+1)), 2))

#FN / FN + TP
print("False Negative Rate: ", round((25/ (25+236)), 2))

Second Descision Tree Model
-------------------------------
True Positive Rate:  0.9
False Positive Rate:  0.01
True Negative Rate:  0.99
False Negative Rate:  0.1


In [22]:
y_train.value_counts()

0    237
1    161
Name: survived, dtype: int64

In [23]:
labels = sorted(y_train.unique())
print('Actual on the left, predicted on the top')
pd.DataFrame(confusion_matrix(y_train, y_pred), index=labels, columns=labels)

Actual on the left, predicted on the top


Unnamed: 0,0,1
0,193,44
1,21,140


In [34]:
labels = sorted(y_train.unique())
print('Actual on the left, predicted on the top')
pd.DataFrame(confusion_matrix(y_train, y_pred2), index=labels, columns=labels)

Actual on the left, predicted on the top


Unnamed: 0,0,1
0,236,1
1,25,136


In [24]:
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

           0       0.90      0.81      0.86       237
           1       0.76      0.87      0.81       161

    accuracy                           0.84       398
   macro avg       0.83      0.84      0.83       398
weighted avg       0.84      0.84      0.84       398



In [35]:
print(classification_report(y_train, y_pred2))

              precision    recall  f1-score   support

           0       0.90      1.00      0.95       237
           1       0.99      0.84      0.91       161

    accuracy                           0.93       398
   macro avg       0.95      0.92      0.93       398
weighted avg       0.94      0.93      0.93       398



# Evaluate Validate

In [25]:
print('Accuracy of Decision Tree classifier on validate set: {:.2f}'
     .format(clf.score(X_validate, y_validate)))

Accuracy of Decision Tree classifier on validate set: 0.77


In [36]:
print('Accuracy of Decision Tree classifier on validate set: {:.2f}'
     .format(clf2.score(X_validate, y_validate)))

Accuracy of Decision Tree classifier on validate set: 0.83


In [26]:
# And since accuracy isn't everything

# Produce y_predictions that come from the X_validate
y_pred = clf.predict(X_validate)

# Compare actual y values (from validate) to predicted y_values from the model run on X_validate
print(classification_report(y_validate, y_pred))


              precision    recall  f1-score   support

           0       0.82      0.78      0.80       102
           1       0.70      0.74      0.72        69

    accuracy                           0.77       171
   macro avg       0.76      0.76      0.76       171
weighted avg       0.77      0.77      0.77       171



In [38]:
# And since accuracy isn't everything

# Produce y_predictions that come from the X_validate
y_pred2 = clf2.predict(X_validate)

# Compare actual y values (from validate) to predicted y_values from the model run on X_validate
print(classification_report(y_validate, y_pred2))


              precision    recall  f1-score   support

           0       0.83      0.90      0.86       102
           1       0.83      0.72      0.78        69

    accuracy                           0.83       171
   macro avg       0.83      0.81      0.82       171
weighted avg       0.83      0.83      0.83       171

