## Decision Tree Exercises

In [1]:
# imports
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd

from pydataset import data

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

import matplotlib.pyplot as plt
import seaborn as sns

import graphviz
from graphviz import Graph

In [2]:
def split(df, stratify_by=None):
    """
    Crude train, validate, test split
    To stratify, send in a column name for the stratify_by argument
    """

    if stratify_by == None:
        train, test = train_test_split(df, test_size=.2, random_state=123)
        train, validate = train_test_split(train, test_size=.3, random_state=123)
    else:
        train, test = train_test_split(df, test_size=.2, random_state=123, stratify=df[stratify_by])
        train, validate = train_test_split(train, test_size=.3, random_state=123, stratify=train[stratify_by])

    return train, validate, test

In [3]:
# getting the data
df = pd.read_csv('titanic_df.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,deck,embark_town,alone
0,0,0,0,3,male,22.0,1,0,7.25,S,Third,,Southampton,0
1,1,1,1,1,female,38.0,1,0,71.2833,C,First,C,Cherbourg,0
2,2,2,1,3,female,26.0,0,0,7.925,S,Third,,Southampton,1
3,3,3,1,1,female,35.0,1,0,53.1,S,First,C,Southampton,0
4,4,4,0,3,male,35.0,0,0,8.05,S,Third,,Southampton,1


In [4]:
# cleaning up my column issues
df["is_female"] = df.sex == "Female"

df = df.drop(columns=['Unnamed: 0', 'deck', 'embark_town'])

# We'll want to encode the day variable, since there are 3 possibilities
embarked_dummy = pd.get_dummies(df[["embarked"]], drop_first=True)
class_dummy = pd.get_dummies(df[["class"]], drop_first=True)

In [5]:
# putting the three dataframes together
df = pd.concat([df, embarked_dummy, class_dummy], axis=1)

# drop the old columns
df = df.drop(columns=["sex", 'embarked', 'class', 'pclass'])
df.isna().sum()

passenger_id      0
survived          0
age             177
sibsp             0
parch             0
fare              0
alone             0
is_female         0
embarked_Q        0
embarked_S        0
class_Second      0
class_Third       0
dtype: int64

In [6]:
# Split the data
# stratifying means we're making representative datasets between train, validate, test
train, validate, test = split(df, stratify_by="survived")
print(train.survived.value_counts())

train

0    307
1    191
Name: survived, dtype: int64


Unnamed: 0,passenger_id,survived,age,sibsp,parch,fare,alone,is_female,embarked_Q,embarked_S,class_Second,class_Third
583,583,0,36.0,0,0,40.1250,1,False,0,0,0,0
165,165,1,9.0,0,2,20.5250,0,False,0,1,0,1
50,50,0,7.0,4,1,39.6875,0,False,0,1,0,1
259,259,1,50.0,0,1,26.0000,0,False,0,1,1,0
306,306,1,,0,0,110.8833,1,False,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
313,313,0,28.0,0,0,7.8958,1,False,0,1,0,1
636,636,0,32.0,0,0,7.9250,1,False,0,1,0,1
222,222,0,51.0,0,0,8.0500,1,False,0,1,0,1
485,485,0,,3,1,25.4667,0,False,0,1,0,1


In [7]:
# Calculate our fill value using train dataset only.
avg_age = train.age.mean()

# Fill null values in age column using our hardcoded value.
train.age = train.age.fillna(avg_age)

validate.age = validate.age.fillna(avg_age)

test.age = test.age.fillna(avg_age)

In [8]:
# Setup our X inputs and y target variable for each split
X_train = train.drop(columns=['survived'])
y_train = train.survived # labeled data == supervise algorithm

X_validate = validate.drop(columns=['survived'])
y_validate = validate.survived

X_test = test.drop(columns=['survived'])
y_test = test.survived

#### 1. What is your baseline prediction? What is your baseline accuracy?

In [9]:
# needs help
train["most_frequent"] = 0
baseline_accuracy = (train.survived == train.most_frequent).mean()
baseline_accuracy

0.6164658634538153

#### 2. Fit the decision tree classifier to your training sample and transform

In [10]:
# generating a blank, new Decision Tree model
clf = DecisionTreeClassifier(max_depth=3, random_state=123)

# training the first model on the training data
# fitting == training the model
clf1 = clf.fit(X_train, y_train)
clf1

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=3,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=123, splitter='best')

In [11]:
# visualizing the model
dot_data = export_graphviz(clf1, feature_names= X_train.columns, rounded=True, filled=True, out_file=None)
graph = graphviz.Source(dot_data) 

graph.render('titanic_decision_tree', view=True, format="pdf")

'titanic_decision_tree.pdf'

#### 3. Evaluate your in-sample results using the model score, confusion matrix, and classification report.

In [12]:
# making a set of predictions using this trained model
y_pred = clf1.predict(X_train)
y_pred[0:3]

array([1, 0, 0])

In [13]:
# evaluating the model with .score
print('Accuracy of Decision Tree classifier on training set: {:.2f}'
      .format(clf1.score(X_train, y_train)))

Accuracy of Decision Tree classifier on training set: 0.74


In [14]:
# confusion matrix for model
y_true = y_train
confusion_matrix(y_true, y_pred,
                 labels = [0, 1])

array([[262,  45],
       [ 86, 105]])

In [15]:
# classification report
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

           0       0.75      0.85      0.80       307
           1       0.70      0.55      0.62       191

    accuracy                           0.74       498
   macro avg       0.73      0.70      0.71       498
weighted avg       0.73      0.74      0.73       498



In [16]:
# clf was trained on X_train, y_train
# To evaluate the model trained on new data, the arguments coming into .score()
clf1.score(X_train, y_train)

0.7369477911646586

#### 4. Compute: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support

In [17]:
# did not survive = positive class
# survived = negative class

# manually adding the scores and printing 
tp = 118
tn = 265
fp = 42
fn = 73

print("Survived classifier (where 'did not survive' is the positive prediction)")
print("True Positives", tp)
print("False Positives", fp) # False positive: We predicted did not survive, but it is actually survived
print("False Negatives", fn) # False negative: We predicted survived, but it is actually did not survive
print("True Negatives", tn)

print("-------------")

accuracy = (tp + tn) / (tp + tn + fp + fn)
recall = tp / (tp + fn)
precision = tp / (tp + fp)

print("Accuracy is", accuracy)
print("Recall is", round(recall,2))
print("Precision is", round(precision,2))

Survived classifier (where 'did not survive' is the positive prediction)
True Positives 118
False Positives 42
False Negatives 73
True Negatives 265
-------------
Accuracy is 0.7690763052208835
Recall is 0.62
Precision is 0.74


#### 5. Comparison: Run through steps 2-4 using a different max_depth value.

In [18]:
# generating a blank, new Decision Tree model
clf = DecisionTreeClassifier(max_depth=5, random_state=123)

# training the model on the training data
# fitting == training the model
clf2 = clf.fit(X_train, y_train)
clf2

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=5,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=123, splitter='best')

In [19]:
# visualizing the second model
dot_data = export_graphviz(clf2, feature_names= X_train.columns, rounded=True, filled=True, out_file=None)
graph = graphviz.Source(dot_data) 

graph.render('titanic_decision_tree', view=True, format="pdf")

'titanic_decision_tree.pdf'

In [20]:
# making a set of predictions using this second trained model
y_pred = clf2.predict(X_train)
y_pred[0:3]

array([1, 0, 0])

In [21]:
# evaluating the second model with .score
print('Accuracy of Decision Tree classifier on training set: {:.2f}'
      .format(clf2.score(X_train, y_train)))

Accuracy of Decision Tree classifier on training set: 0.79


In [22]:
# confusion matrix for second model
y_true = y_train
confusion_matrix(y_true, y_pred,
                 labels = [0, 1])

array([[271,  36],
       [ 69, 122]])

In [23]:
# classification report for second model
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

           0       0.80      0.88      0.84       307
           1       0.77      0.64      0.70       191

    accuracy                           0.79       498
   macro avg       0.78      0.76      0.77       498
weighted avg       0.79      0.79      0.78       498



In [24]:
# clf was trained on X_train, y_train
# To evaluate the second model trained on new data, validate the arguments coming into .score()
clf2.score(X_validate, y_validate)

0.6915887850467289

In [25]:
# Let's evaluate the second model on out-of-sample data
print('Accuracy of Decision Tree classifier on validate set: {:.3f}'
     .format(clf2.score(X_validate, y_validate)))

Accuracy of Decision Tree classifier on validate set: 0.692


#### 6. Which model performs better on your in-sample data?

In [26]:
# first classification model

# Use the classification model trained on train data to make predictions on data
y_pred = clf1.predict(X_train)

# Compare actual y values from validate to predictions based on X_train
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

           0       0.75      0.85      0.80       307
           1       0.70      0.55      0.62       191

    accuracy                           0.74       498
   macro avg       0.73      0.70      0.71       498
weighted avg       0.73      0.74      0.73       498



In [27]:
# second classification model

# Use the classification model trained on train data to make predictions on validate data
y_pred = clf2.predict(X_train)

# Compare actual y values from validate to predictions based on X_validate
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

           0       0.80      0.88      0.84       307
           1       0.77      0.64      0.70       191

    accuracy                           0.79       498
   macro avg       0.78      0.76      0.77       498
weighted avg       0.79      0.79      0.78       498



#### 7. Which model performs best on your out-of-sample data, the validate set?

In [28]:
# first classification model

# Use the classification model trained on train data to make predictions on validate data
y_pred = clf1.predict(X_validate)

# Compare actual y values from validate to predictions based on X_validate
print(classification_report(y_validate, y_pred))

              precision    recall  f1-score   support

           0       0.72      0.85      0.78       132
           1       0.66      0.46      0.54        82

    accuracy                           0.70       214
   macro avg       0.69      0.66      0.66       214
weighted avg       0.69      0.70      0.69       214



In [29]:
#second classification model

# Use the classification model trained on train data to make predictions on validate data
y_pred = clf2.predict(X_validate)

# Compare actual y values from validate to predictions based on X_validate
print(classification_report(y_validate, y_pred))

              precision    recall  f1-score   support

           0       0.72      0.82      0.77       132
           1       0.62      0.49      0.55        82

    accuracy                           0.69       214
   macro avg       0.67      0.65      0.66       214
weighted avg       0.68      0.69      0.68       214



** Takeaways **
The second decision tree model in which the max_depth was increased by 2 fared slightly worse in most categories.

## Random Forest Exercises

#### 1. Fit the Random Forest classifier to your training sample and transform (i.e. make predictions on the training sample) setting the random_state accordingly and setting min_samples_leaf = 1 and max_depth = 10.

In [30]:
# additional import for random forest
from sklearn.ensemble import RandomForestClassifier

# getting the data
df = pd.read_csv('titanic_df.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,deck,embark_town,alone
0,0,0,0,3,male,22.0,1,0,7.25,S,Third,,Southampton,0
1,1,1,1,1,female,38.0,1,0,71.2833,C,First,C,Cherbourg,0
2,2,2,1,3,female,26.0,0,0,7.925,S,Third,,Southampton,1
3,3,3,1,1,female,35.0,1,0,53.1,S,First,C,Southampton,0
4,4,4,0,3,male,35.0,0,0,8.05,S,Third,,Southampton,1


In [31]:
# cleaning up my column issues
df["is_female"] = df.sex == "Female"

df = df.drop(columns=['Unnamed: 0', 'deck', 'embark_town'])

# We'll want to encode the day variable, since there are 3 possibilities
embarked_dummy = pd.get_dummies(df[["embarked"]], drop_first=True)
class_dummy = pd.get_dummies(df[["class"]], drop_first=True)

In [32]:
# putting the three dataframes together
df = pd.concat([df, embarked_dummy, class_dummy], axis=1)

# drop the old columns
df = df.drop(columns=["sex", 'embarked', 'class', 'pclass'])
df.isna().sum()

passenger_id      0
survived          0
age             177
sibsp             0
parch             0
fare              0
alone             0
is_female         0
embarked_Q        0
embarked_S        0
class_Second      0
class_Third       0
dtype: int64

In [33]:
# Split the data
# stratifying means we're making representative datasets between train, validate, test
train, validate, test = split(df, stratify_by="survived")
print(train.survived.value_counts())
train.head()

0    307
1    191
Name: survived, dtype: int64


Unnamed: 0,passenger_id,survived,age,sibsp,parch,fare,alone,is_female,embarked_Q,embarked_S,class_Second,class_Third
583,583,0,36.0,0,0,40.125,1,False,0,0,0,0
165,165,1,9.0,0,2,20.525,0,False,0,1,0,1
50,50,0,7.0,4,1,39.6875,0,False,0,1,0,1
259,259,1,50.0,0,1,26.0,0,False,0,1,1,0
306,306,1,,0,0,110.8833,1,False,0,0,0,0


In [34]:
# Calculate our fill value using train dataset only.
avg_age = train.age.mean()

# Fill null values in all of our datasets using our hardcoded value.
train.age = train.age.fillna(avg_age)

validate.age = validate.age.fillna(avg_age)

test.age = test.age.fillna(avg_age)

In [35]:
# Setup our X inputs and y target variable for each split
X_train = train.drop(columns=['survived'])
y_train = train.survived # labeled data == supervise algorithm

X_validate = validate.drop(columns=['survived'])
y_validate = validate.survived

X_test = test.drop(columns=['survived'])
y_test = test.survived

In [36]:
# running random forest
rf = RandomForestClassifier(bootstrap=True, 
                            class_weight=None, 
                            criterion='gini',
                            min_samples_leaf=1,
                            n_estimators=100,
                            max_depth=10, 
                            random_state=123)

In [37]:
rf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=10, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=123,
                       verbose=0, warm_start=False)

In [38]:
print(rf.feature_importances_)

[0.24605563 0.21915032 0.04980628 0.04156951 0.27117085 0.02411287
 0.         0.02102327 0.03148504 0.02188514 0.07374109]


In [39]:
y_pred = rf.predict(X_train)

In [40]:
y_pred_proba = rf.predict_proba(X_train)

#### 2. Evaluate your results using the model score, confusion matrix, and classification report.

In [41]:
print('Accuracy of random forest classifier on training set: {:.2f}'
     .format(rf.score(X_train, y_train)))

Accuracy of random forest classifier on training set: 0.97


In [42]:
print(confusion_matrix(y_train, y_pred,
                      labels = [0, 1]))

[[307   0]
 [ 17 174]]


In [43]:
# classification report
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

           0       0.95      1.00      0.97       307
           1       1.00      0.91      0.95       191

    accuracy                           0.97       498
   macro avg       0.97      0.96      0.96       498
weighted avg       0.97      0.97      0.97       498



#### 3. Print and clearly label the following: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.

In [44]:
# create a function to calculate these metrics
def get_metrics_binary(rf):
    '''
    get_metrics_binary takes in a confusion matrix (cnf) for a binary classifier and prints out metrics based on
    values in variables named X_train, y_train, and y_pred.
    
    return: a classification report as a transposed DataFrame
    '''
    accuracy = rf.score(X_train, y_train)
    class_report = pd.DataFrame(classification_report(y_train, y_pred, output_dict=True)).T
    conf = confusion_matrix(y_train, y_pred)
    tpr = conf[1][1] / conf[1].sum()
    fpr = conf[0][1] / conf[0].sum()
    tnr = conf[0][0] / conf[0].sum()
    fnr = conf[1][0] / conf[1].sum()
    print(f'''
    The accuracy for our model is {accuracy:.4}
    The True Positive Rate is {tpr:.3}, The False Positive Rate is {fpr:.3},
    The True Negative Rate is {tnr:.3}, and the False Negative Rate is {fnr:.3}
    ''')
    return class_report

In [45]:
# call our function
report_df = get_metrics_binary(rf)


    The accuracy for our model is 0.9659
    The True Positive Rate is 0.911, The False Positive Rate is 0.0,
    The True Negative Rate is 1.0, and the False Negative Rate is 0.089
    


#### 4. Run through steps increasing your min_samples_leaf and decreasing your max_depth.

In [46]:
# rerunning random forest
rf1 = RandomForestClassifier(bootstrap=True, 
                            class_weight=None, 
                            criterion='gini',
                            min_samples_leaf=3,
                            n_estimators=100,
                            max_depth=7, 
                            random_state=123)

In [47]:
rf1.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=7, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=3, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=123,
                       verbose=0, warm_start=False)

In [48]:
y_pred = rf1.predict(X_train)

In [49]:
print('Accuracy of random forest classifier on training set: {:.2f}'
     .format(rf1.score(X_train, y_train)))

Accuracy of random forest classifier on training set: 0.84


In [50]:
print(confusion_matrix(y_train, y_pred,
                      labels = [0, 1]))

[[286  21]
 [ 59 132]]


In [51]:
# classification report
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

           0       0.83      0.93      0.88       307
           1       0.86      0.69      0.77       191

    accuracy                           0.84       498
   macro avg       0.85      0.81      0.82       498
weighted avg       0.84      0.84      0.84       498



#### 5. What are the differences in the evaluation metrics? Which performs better on your in-sample data? Why?

In [52]:
print('Accuracy of random forest classifier on training set: {:.2f}'
     .format(rf.score(X_train, y_train)))
print('Accuracy of random forest classifier1 on training set: {:.2f}'
     .format(rf1.score(X_train, y_train)))

Accuracy of random forest classifier on training set: 0.97
Accuracy of random forest classifier1 on training set: 0.84


## KNN Exercises

#### 1. Fit a K-Nearest Neighbors classifier to your training sample and transform (i.e. make predictions on the training sample)

In [53]:
# additional import
from sklearn.neighbors import KNeighborsClassifier

# getting the data
df = pd.read_csv('titanic_df.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,deck,embark_town,alone
0,0,0,0,3,male,22.0,1,0,7.25,S,Third,,Southampton,0
1,1,1,1,1,female,38.0,1,0,71.2833,C,First,C,Cherbourg,0
2,2,2,1,3,female,26.0,0,0,7.925,S,Third,,Southampton,1
3,3,3,1,1,female,35.0,1,0,53.1,S,First,C,Southampton,0
4,4,4,0,3,male,35.0,0,0,8.05,S,Third,,Southampton,1


In [54]:
# cleaning up my column issues
df["is_female"] = df.sex == "Female"

df = df.drop(columns=['Unnamed: 0', 'deck', 'embark_town'])

# We'll want to encode the day variable, since there are 3 possibilities
embarked_dummy = pd.get_dummies(df[["embarked"]], drop_first=True)
class_dummy = pd.get_dummies(df[["class"]], drop_first=True)

# putting the three dataframes together
df = pd.concat([df, embarked_dummy, class_dummy], axis=1)

# drop the old columns
df = df.drop(columns=["sex", 'embarked', 'class', 'pclass'])

In [55]:
# Split the data
# stratifying means we're making representative datasets between train, validate, test
train, validate, test = split(df, stratify_by="survived")

print(train.survived.value_counts())
train.head()

0    307
1    191
Name: survived, dtype: int64


Unnamed: 0,passenger_id,survived,age,sibsp,parch,fare,alone,is_female,embarked_Q,embarked_S,class_Second,class_Third
583,583,0,36.0,0,0,40.125,1,False,0,0,0,0
165,165,1,9.0,0,2,20.525,0,False,0,1,0,1
50,50,0,7.0,4,1,39.6875,0,False,0,1,0,1
259,259,1,50.0,0,1,26.0,0,False,0,1,1,0
306,306,1,,0,0,110.8833,1,False,0,0,0,0


In [56]:
# Calculate our fill value using train dataset only.
avg_age = train.age.mean()

# Fill null values in all of our datasets using our hardcoded value.
train.age = train.age.fillna(avg_age)

validate.age = validate.age.fillna(avg_age)

test.age = test.age.fillna(avg_age)

In [57]:
# Setup our X inputs and y target variable for each split
X_train = train.drop(columns=['survived'])
y_train = train.survived # labeled data == supervise algorithm

X_validate = validate.drop(columns=['survived'])
y_validate = validate.survived

X_test = test.drop(columns=['survived'])
y_test = test.survived

In [58]:
# making the model
clf1 = KNeighborsClassifier()

# fitting the model
clf1.fit(X_train, y_train)

# making predictions 
y_pred = clf1.predict(X_train)

#### 2. Evaluate your results using the model score, confusion matrix, and classification report.

In [59]:
# accuracy and confusion matrix
class_report1 = get_metrics_binary(clf1)


    The accuracy for our model is 0.743
    The True Positive Rate is 0.545, The False Positive Rate is 0.134,
    The True Negative Rate is 0.866, and the False Negative Rate is 0.455
    


In [60]:
# classification report
class_report1

Unnamed: 0,precision,recall,f1-score,support
0,0.753541,0.86645,0.806061,307.0
1,0.717241,0.544503,0.619048,191.0
accuracy,0.742972,0.742972,0.742972,0.742972
macro avg,0.735391,0.705476,0.712554,498.0
weighted avg,0.739619,0.742972,0.734335,498.0


#### 3. Print and clearly label the following: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.

In [61]:
# once again, the accuracy and confusion matrix
class_report1 = get_metrics_binary(clf1)


    The accuracy for our model is 0.743
    The True Positive Rate is 0.545, The False Positive Rate is 0.134,
    The True Negative Rate is 0.866, and the False Negative Rate is 0.455
    


In [62]:
# once again, the classification report
class_report1

Unnamed: 0,precision,recall,f1-score,support
0,0.753541,0.86645,0.806061,307.0
1,0.717241,0.544503,0.619048,191.0
accuracy,0.742972,0.742972,0.742972,0.742972
macro avg,0.735391,0.705476,0.712554,498.0
weighted avg,0.739619,0.742972,0.734335,498.0


#### 4. Run through steps 1-3 setting k to 10

In [63]:
# making the second model
clf2 = KNeighborsClassifier(n_neighbors=10)

# fitting the second model
clf2.fit(X_train, y_train)

# making predictions 
y_pred = clf2.predict(X_train)

In [64]:
# accuracy and confusion matrix
class_report2 = get_metrics_binary(clf2)


    The accuracy for our model is 0.6968
    The True Positive Rate is 0.33, The False Positive Rate is 0.0749,
    The True Negative Rate is 0.925, and the False Negative Rate is 0.67
    


In [65]:
# classification report
class_report2

Unnamed: 0,precision,recall,f1-score,support
0,0.68932,0.925081,0.789986,307.0
1,0.732558,0.329843,0.454874,191.0
accuracy,0.696787,0.696787,0.696787,0.696787
macro avg,0.710939,0.627462,0.62243,498.0
weighted avg,0.705904,0.696787,0.661459,498.0


#### 5. Run through steps 1-3 setting k to 20

In [66]:
# making the third model
clf3 = KNeighborsClassifier(n_neighbors=20)

# fitting the third model
clf3.fit(X_train, y_train)

# making predictions 
y_pred = clf3.predict(X_train)

In [67]:
# accuracy and confusion matrix
class_report3 = get_metrics_binary(clf3)


    The accuracy for our model is 0.6667
    The True Positive Rate is 0.23, The False Positive Rate is 0.0619,
    The True Negative Rate is 0.938, and the False Negative Rate is 0.77
    


In [68]:
# classification report
class_report3

Unnamed: 0,precision,recall,f1-score,support
0,0.662069,0.938111,0.77628,307.0
1,0.698413,0.230366,0.346457,191.0
accuracy,0.666667,0.666667,0.666667,0.666667
macro avg,0.680241,0.584239,0.561369,498.0
weighted avg,0.676008,0.666667,0.611428,498.0


#### 6. What are the differences in the evaluation metrics? Which performs better on your in-sample data? 

##### The first model perfomed best

#### 7. Which model performs best on our out-of-sample data from validate?

In [69]:
# first classification model

# Use the classification model trained on train data to make predictions on validate data
y_pred = clf1.predict(X_validate)

# Compare actual y values from validate to predictions based on X_validate
print(classification_report(y_validate, y_pred))

              precision    recall  f1-score   support

           0       0.66      0.75      0.70       132
           1       0.48      0.38      0.42        82

    accuracy                           0.61       214
   macro avg       0.57      0.56      0.56       214
weighted avg       0.59      0.61      0.60       214



In [70]:
# second  model

# Use the classification model trained on train data to make predictions on validate data
y_pred = clf2.predict(X_validate)

# Compare actual y values from validate to predictions based on X_validate
print(classification_report(y_validate, y_pred))

              precision    recall  f1-score   support

           0       0.64      0.87      0.74       132
           1       0.50      0.21      0.29        82

    accuracy                           0.62       214
   macro avg       0.57      0.54      0.52       214
weighted avg       0.59      0.62      0.57       214



In [71]:
# third model

# Use the classification model trained on train data to make predictions on validate data
y_pred = clf3.predict(X_validate)

# Compare actual y values from validate to predictions based on X_validate
print(classification_report(y_validate, y_pred))

              precision    recall  f1-score   support

           0       0.65      0.92      0.76       132
           1       0.63      0.21      0.31        82

    accuracy                           0.65       214
   macro avg       0.64      0.57      0.54       214
weighted avg       0.64      0.65      0.59       214



## Logistic Regression Exercises

#### Acquiring and cleaning the data

In [88]:
# additional import for these exercises
from sklearn.linear_model import LogisticRegression

# getting the data
df = pd.read_csv('titanic_df.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,deck,embark_town,alone
0,0,0,0,3,male,22.0,1,0,7.25,S,Third,,Southampton,0
1,1,1,1,1,female,38.0,1,0,71.2833,C,First,C,Cherbourg,0
2,2,2,1,3,female,26.0,0,0,7.925,S,Third,,Southampton,1
3,3,3,1,1,female,35.0,1,0,53.1,S,First,C,Southampton,0
4,4,4,0,3,male,35.0,0,0,8.05,S,Third,,Southampton,1


In [89]:
# cleaning up my column issues
df["is_female"] = df.sex == "Female"

# Handle missing ages
avg_age = df.age.mean()
df.age = df.age.fillna(avg_age)

# dropping unwanted columns
df = df.drop(columns=['Unnamed: 0', 'deck', 'embark_town', 'passenger_id', 'class'])

# We'll want to encode the day variable, since there are 3 possibilities
embarked_dummy = pd.get_dummies(df[["embarked"]], drop_first=True)

# putting the dataframes together
df = pd.concat([df, embarked_dummy], axis=1)

# drop the old columns
df = df.drop(columns=["sex", 'embarked'])
df.head()

Unnamed: 0,survived,pclass,age,sibsp,parch,fare,alone,is_female,embarked_Q,embarked_S
0,0,3,22.0,1,0,7.25,0,False,0,1
1,1,1,38.0,1,0,71.2833,0,False,0,0
2,1,3,26.0,0,0,7.925,1,False,0,1
3,1,1,35.0,1,0,53.1,0,False,0,1
4,0,3,35.0,0,0,8.05,1,False,0,1


In [90]:
# double check to make sure we don't have any nulls
df.isna().sum()

survived      0
pclass        0
age           0
sibsp         0
parch         0
fare          0
alone         0
is_female     0
embarked_Q    0
embarked_S    0
dtype: int64

In [91]:
# Split the datasets
train, validate, test = split(df, stratify_by="survived")

In [92]:
# Separate out our X and y values
X_train = train.drop(columns=["survived"])
y_train = train.survived

X_validate = validate.drop(columns=["survived"])
y_validate = validate.survived

X_test = test.drop(columns=["survived"])
y_test = test.survived

#### 1. Create a model that includes age in addition to fare and pclass. Does this model perform better than your baseline?

In [93]:
# Setting the baseline
# The most frequenly observed outcome will be our baseline
train.survived.value_counts()

0    307
1    191
Name: survived, dtype: int64

In [94]:
baseline_accuracy = (train.survived == 0).mean()
round(baseline_accuracy, 3)

0.616

In [95]:
# Create the logistic regression
logit = LogisticRegression(random_state=123)

# specify the features we're using
features = ["age", "pclass", "fare"]

# Fit a model using only these specified features
logit.fit(X_train[features], y_train)

# Since we .fit on a subset, we .predict on that same subset of features
y_pred = logit.predict(X_train[features])

print("Baseline is", round(baseline_accuracy, 2))
print("Logistic Regression using age, pclass, and fare features")
print('Accuracy of Logistic Regression classifier on training set: {:.2f}'
     .format(logit.score(X_train[features], y_train)))

Baseline is 0.62
Logistic Regression using age, pclass, and fare features
Accuracy of Logistic Regression classifier on training set: 0.69


#### 2. Include sex in your model as well. Note that you'll need to encode or create a dummy variable of this feature before including it in a model.

In [96]:
# Create the logistic regression
logit1 = LogisticRegression(random_state=123)

# specify the features we're using
features = ["age", "pclass", "fare", "is_female"]

# Fit a model using only these specified features
logit1.fit(X_train[features], y_train)

y_pred = logit1.predict(X_train[features])

print("Logistic Regression using age, pclass, fare, and gender features")
print('Accuracy of Logistic Regression classifier on training set: {:.2f}'
     .format(logit1.score(X_train[features], y_train)))

Logistic Regression using age, pclass, fare, and gender features
Accuracy of Logistic Regression classifier on training set: 0.69


#### 3. Try out other combinations of features and models.

In [97]:
# All features, all default hyperparameters
logit2 = LogisticRegression(random_state=123)

logit2.fit(X_train, y_train)

y_pred = logit2.predict(X_train)

print("Model trained on all features")
print('Accuracy of Logistic Regression classifier on training set: {:.2f}'
     .format(logit2.score(X_train, y_train)))

Model trained on all features
Accuracy of Logistic Regression classifier on training set: 0.72


In [99]:
# Only gender and fare
features = ["is_female", 'fare']

logit3 = LogisticRegression(random_state=123)

logit3.fit(X_train[features], y_train)

y_pred = logit3.predict(X_train[features])

accuracy = logit3.score(X_train[features], y_train)

print("Model trained using only gender and fare features")
print(f'Accuracy of Logistic Regression classifier on training set: {accuracy:.2}')

Model trained using only is_female and fare features
Accuracy of Logistic Regression classifier on training set: 0.65


In [100]:
# All features, but we'll use the class_weights to hold the actual ratios`
logit4 = LogisticRegression(random_state=123, class_weight='balanced')

logit4.fit(X_train, y_train)

y_pred = logit4.predict(X_train)

accuracy = logit4.score(X_train, y_train)

print("All Features and we're setting the class_weight hyperparameter")
print(f'Accuracy of Logistic Regression classifier on training set: {accuracy:.2}')

All Features and we're setting the class_weight hyperparameter
Accuracy of Logistic Regression classifier on training set: 0.71


#### 4. Use you best 3 models to predict and evaluate on your validate sample.

In [101]:
# Let's determine logit1's metrics on validate
features = ["age", "pclass", "fare", "is_female"]

y_pred = logit1.predict(X_validate[features])

print('Logit1 model using age, pclass, fare, and is_female as the features')
print(classification_report(y_validate, y_pred))

Logit1 model using age, pclass, fare, and is_female as the features
              precision    recall  f1-score   support

           0       0.71      0.90      0.79       132
           1       0.72      0.40      0.52        82

    accuracy                           0.71       214
   macro avg       0.71      0.65      0.65       214
weighted avg       0.71      0.71      0.69       214



In [102]:
# Logit2 uses all features
y_pred = logit2.predict(X_validate)

print("Logit2 model using all features and all model defaults")
print(classification_report(y_validate, y_pred))

Logit2 model using all features and all model defaults
              precision    recall  f1-score   support

           0       0.73      0.89      0.80       132
           1       0.73      0.46      0.57        82

    accuracy                           0.73       214
   macro avg       0.73      0.68      0.68       214
weighted avg       0.73      0.73      0.71       214



In [103]:
# Logit4 uses all features and class_weight='balanced'
y_pred = logit4.predict(X_validate)

print("Logit3 model using all features, class_weight='balanced', and all other hyperparameters as default")
print(classification_report(y_validate, y_pred))

Logit3 model using all features, class_weight='balanced', and all other hyperparameters as default
              precision    recall  f1-score   support

           0       0.79      0.73      0.76       132
           1       0.61      0.70      0.65        82

    accuracy                           0.71       214
   macro avg       0.70      0.71      0.71       214
weighted avg       0.72      0.71      0.72       214



#### 5. Choose you best model from the validation performation, and evaluate it on the test dataset. How do the performance metrics compare to validate? to train?

In [112]:
y_pred = logit2.predict(X_test)
y_pred_proba = logit2.predict_proba(X_test)

print("Model 1: ")

print('Accuracy: {:.2f}'.format(logit2.score(X_test, y_test)))

print(classification_report(y_test, y_pred))

Model 1: 
Accuracy: 0.71
              precision    recall  f1-score   support

           0       0.75      0.80      0.77       110
           1       0.64      0.57      0.60        69

    accuracy                           0.71       179
   macro avg       0.69      0.68      0.69       179
weighted avg       0.70      0.71      0.71       179

