In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

import warnings
warnings.filterwarnings("ignore")

import acquire, prepare

In [2]:
df = acquire.get_titanic_data()
df.head(2)

Unnamed: 0.1,Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,deck,embark_town,alone
0,0,0,0,3,male,22.0,1,0,7.25,S,Third,,Southampton,0
1,1,1,1,1,female,38.0,1,0,71.2833,C,First,C,Cherbourg,0


In [3]:
# prepare
train, validate, test = prepare.prep_titanic_data(df)
train.head()

Unnamed: 0.1,Unnamed: 0,survived,pclass,age,sibsp,parch,fare,alone,sex_male,embark_town_Queenstown,embark_town_Southampton
583,583,0,1,36.0,0,0,40.125,1,1,0,0
165,165,1,3,9.0,0,2,20.525,0,1,0,1
50,50,0,3,7.0,4,1,39.6875,0,1,0,1
259,259,1,2,50.0,0,1,26.0,0,0,0,1
306,306,1,1,29.678105,0,0,110.8833,1,0,0,0


In [4]:
X_train = train.drop(columns=['survived'])
y_train = train.survived

X_validate = validate.drop(columns=['survived'])
y_validate = validate.survived

X_test = test.drop(columns=['survived'])
y_test = test.survived

In [5]:
X_train.head()

Unnamed: 0.1,Unnamed: 0,pclass,age,sibsp,parch,fare,alone,sex_male,embark_town_Queenstown,embark_town_Southampton
583,583,1,36.0,0,0,40.125,1,1,0,0
165,165,3,9.0,0,2,20.525,0,1,0,1
50,50,3,7.0,4,1,39.6875,0,1,0,1
259,259,2,50.0,0,1,26.0,0,0,0,1
306,306,1,29.678105,0,0,110.8833,1,0,0,0


In [6]:
X_train.shape, X_validate.shape, X_test.shape

((498, 10), (214, 10), (179, 10))

In [7]:
# explore
baseline_pred = y_train.mode()
y_train

583    0
165    1
50     0
259    1
306    1
      ..
313    0
636    0
222    0
485    0
744    1
Name: survived, Length: 498, dtype: int64

In [8]:
baseline_prediction = y_train.mode()

# create a series of predictions with that value, 
# the same length as our training set
y_train_pred = pd.Series([0]*len(y_train))

# compute accuracy of baseline
cm = confusion_matrix(y_train, y_train_pred)
tn, fp, fn, tp = cm.ravel()

accuracy = (tp+tn)/(tn+fp+fn+tp)
accuracy

0.6164658634538153

#### 1. Fit the Random Forest classifier to your training sample and transform (i.e. make predictions on the training sample) setting the random_state accordingly and setting min_samples_leaf = 1 and max_depth = 10.

In [9]:
rf = RandomForestClassifier(min_samples_leaf=1, 
                            max_depth=10, 
                            random_state=123)

# Create the model by fitting the algorithm to X_train and y_train
rf.fit(X_train, y_train)

# Run the model on X_train to make predictions (y_train_pred)
y_train_pred = rf.predict(X_train)

#### 2. Evaluate your results using the model score, confusion matrix, and classification report.

In [10]:
# model score (accuracy)
rf.score(X_train, y_train)

0.9779116465863453

In [11]:
cm = confusion_matrix(y_train, y_train_pred)
cm

array([[307,   0],
       [ 11, 180]])

In [12]:
report = pd.DataFrame(classification_report(y_train, 
                                            y_train_pred, 
                                            output_dict=True))
report

Unnamed: 0,0,1,accuracy,macro avg,weighted avg
precision,0.965409,1.0,0.977912,0.982704,0.978676
recall,1.0,0.942408,0.977912,0.971204,0.977912
f1-score,0.9824,0.97035,0.977912,0.976375,0.977779
support,307.0,191.0,0.977912,498.0,498.0


#### 3. Print and clearly label the following: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.

In [13]:
tn, fp, fn, tp = cm.ravel()

accuracy = (tp + tn)/(tn + fp + fn + tp)
print(f"Accuracy: {accuracy}")

true_positive_rate = tp/(tp + fn)
print(f"True Positive Rate: {true_positive_rate}")

false_positive_rate = fp/(fp + tn)
print(f"False Positive Rate: {false_positive_rate}")

true_negative_rate = tn/(tn + fp)
print(f"True Negative Rate: {true_negative_rate}")

false_negative_rate = fn/(fn + tp)
print(f"False Negative Rate: {false_negative_rate}")

precision = tp/(tp + fp)
print(f"Precision: {precision}")

recall = tp/(tp + fn)
print(f"Recall: {recall}")

f1_score = 2*(precision*recall)/(precision+recall)
print(f"F1 Score: {f1_score}")

support_pos = tp + fn
print(f"Support (0): {support_pos}")

support_neg = fp + tn
print(f"Support (1): {support_neg}")

Accuracy: 0.9779116465863453
True Positive Rate: 0.9424083769633508
False Positive Rate: 0.0
True Negative Rate: 1.0
False Negative Rate: 0.05759162303664921
Precision: 1.0
Recall: 0.9424083769633508
F1 Score: 0.9703504043126685
Support (0): 191
Support (1): 307


#### 4. Run through steps increasing your min_samples_leaf and decreasing your max_depth.

In [14]:
metrics = []

for j in range (1, 10):
    for i in range(2, 10):
        rf = RandomForestClassifier(max_depth=i, 
                                    min_samples_leaf=j, 
                                    random_state=123)

        # Fit the model (on train and only train)
        rf = rf.fit(X_train, y_train)

        # We'll evaluate the model's performance on train, first
        in_sample_accuracy = rf.score(X_train, y_train)
    
        out_of_sample_accuracy = rf.score(X_validate, y_validate)

        output = {
            "min_samples_per_leaf": j,
            "max_depth": i,
            "train_accuracy": in_sample_accuracy,
            "validate_accuracy": out_of_sample_accuracy
        }
    
        metrics.append(output)

In [15]:
df = pd.DataFrame(metrics)

# compute difference in accuracy between train and validate
df["difference"] = df.train_accuracy - df.validate_accuracy

# sort the df by validate_accuracy (descending) and take top 10
df.sort_values(by=['validate_accuracy'], ascending=False).head(10)

Unnamed: 0,min_samples_per_leaf,max_depth,train_accuracy,validate_accuracy,difference
5,1,7,0.939759,0.82243,0.117329
3,1,5,0.87751,0.813084,0.064426
6,1,8,0.953815,0.813084,0.140731
15,2,9,0.933735,0.813084,0.120651
14,2,8,0.935743,0.813084,0.122659
30,4,8,0.901606,0.808411,0.093195
2,1,4,0.855422,0.808411,0.04701
13,2,7,0.909639,0.808411,0.101227
38,5,8,0.891566,0.803738,0.087828
35,5,5,0.859438,0.803738,0.055699


- A min_samples_per_leaf of 3 and a max_depth of 8 performs best on out-of-sample dataset and has a reasonable difference between train and validat.

#### 5. What are the differences in the evaluation metrics? Which performs better on your in-sample data? Why?

#### After making a few models, which one has the best performance (or closest metrics) on both train and validate?