In [1]:
# ignore warnings
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, export_graphviz


import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

import acquire
import prepare

df = acquire.get_titanic_data()

In [2]:
# check that I have the right data
df.head()

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,deck,embark_town,alone
0,0,0,3,male,22.0,1,0,7.25,S,Third,,Southampton,0
1,1,1,1,female,38.0,1,0,71.2833,C,First,C,Cherbourg,0
2,2,1,3,female,26.0,0,0,7.925,S,Third,,Southampton,1
3,3,1,1,female,35.0,1,0,53.1,S,First,C,Southampton,0
4,4,0,3,male,35.0,0,0,8.05,S,Third,,Southampton,1


In [3]:
# check for nulls
df.isna().sum()

passenger_id      0
survived          0
pclass            0
sex               0
age             177
sibsp             0
parch             0
fare              0
embarked          2
class             0
deck            688
embark_town       2
alone             0
dtype: int64

In [4]:
# get median age to fill in those nulls
median_age = df[df.age.notnull()].age.median()
median_age

28.0

In [5]:
# the pandas .median method ignores nulls
df.age.median()

28.0

In [6]:
# fill the nulls w/ the median
df.age = df.age.fillna(median_age)
print(f"Age nulls: {df.age.isna().sum()}")

Age nulls: 0


In [7]:
# drop some columns with nulls
# assign df again 
df = df.drop(columns=["deck", "embark_town", "embarked"])

In [8]:
# verify that nulls are gone
df.isna().sum()

passenger_id    0
survived        0
pclass          0
sex             0
age             0
sibsp           0
parch           0
fare            0
class           0
alone           0
dtype: int64

In [9]:
# need to encode sex so I can use it in my decision tree
def encode_gender(gender):
    '''
    Encode a single ``sex`` value as an integer: "male" -> 0, anything else -> 1.

    Values that are already 0 or 1 are returned unchanged, so re-running the
    cell that applies this function to an encoded column does not turn every
    row into 1.
    '''
    # already encoded (e.g. the apply cell was executed a second time)
    if gender in (0, 1):
        return gender
    return 0 if gender == "male" else 1

In [10]:
df.sex = df.sex.apply(encode_gender)
df.head()

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,class,alone
0,0,0,3,0,22.0,1,0,7.25,Third,0
1,1,1,1,1,38.0,1,0,71.2833,First,0
2,2,1,3,1,26.0,0,0,7.925,Third,1
3,3,1,1,1,35.0,1,0,53.1,First,0
4,4,0,3,0,35.0,0,0,8.05,Third,1


## Lecture sample

In [11]:
# Feature matrix for the lecture model; the commented-out line is the variant that also uses `sex`.
# X = df[['pclass', 'sex', 'fare', 'sibsp', 'parch', 'age']]
X = df[['pclass','age','fare','sibsp','parch']]
# Target as a 1-D Series: scikit-learn estimators expect y with shape (n_samples,),
# and a one-column DataFrame only works via an implicit (warned-about) conversion.
y = df['survived']

In [12]:
# setup the train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .30, random_state=123)

In [13]:
X_train.head()

Unnamed: 0,pclass,age,fare,sibsp,parch
416,2,34.0,32.5,1,1
801,2,31.0,26.25,1,1
512,1,36.0,26.2875,0,0
455,3,29.0,7.8958,0,0
757,2,18.0,11.5,0,0


In [14]:
# Create the model
rf = RandomForestClassifier(bootstrap=True,
                           class_weight=None,
                           criterion='gini',
                           min_samples_leaf=3,
                           n_estimators=100,
                           max_depth=3,
                           random_state=123)

In [15]:
# fit the model
rf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=3, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=3, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=123,
                       verbose=0, warm_start=False)

In [16]:
# print the fitted model's feature importances (one value per column of X_train)
print(rf.feature_importances_)

[0.27038416 0.18306563 0.41467092 0.08974103 0.04213826]


In [17]:
# estimate whether or not a passenger would survive using training data
y_pred = rf.predict(X_train)

In [18]:
# estimate the probability of a passenger surviving using the training data
y_pred_proba = rf.predict_proba(X_train)

In [19]:
# evaluate the model
print('Accuracy of random forest classifier on training set: {:.2f}'.format(rf.score(X_train, y_train)))

Accuracy of random forest classifier on training set: 0.75


In [20]:
# create a confusion matrix
confusion_matrix(y_train, y_pred)

array([[337,  42],
       [116, 128]])

In [21]:
# print classification report
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

           0       0.74      0.89      0.81       379
           1       0.75      0.52      0.62       244

    accuracy                           0.75       623
   macro avg       0.75      0.71      0.71       623
weighted avg       0.75      0.75      0.74       623



In [22]:
# Test the model
print('Accuracy of random forest classifier on test set: {:.2f}'.format(rf.score(X_test, y_test)))

Accuracy of random forest classifier on test set: 0.75


## Random Forest Exercise

In [23]:
def encode_sex(df):
    '''
    Return a copy of ``df`` whose ``sex`` column is 1 where the original
    value equals 'female' and 0 otherwise. The input frame is not modified.
    '''
    female_flag = df.sex == 'female'
    encoded = female_flag.astype(int)
    return df.assign(sex=encoded)

In [24]:
def get_splits(titanic):
    '''
    Returns X and y for train, validate and test datasets

    The frame is copied, passed through ``prepare.prep_titanic`` and narrowed
    to the modelling columns. The prepared train portion is then split again
    (85% train / 15% validate, ``random_state=123``) and the ``sex`` column of
    every X frame is encoded with ``encode_sex``.

    Parameters
    ----------
    titanic : DataFrame
        Titanic passenger data in whatever form ``prepare.prep_titanic``
        accepts. NOTE(review): presumably the raw frame returned by
        ``acquire.get_titanic_data()`` -- confirm against ``prepare``.

    Returns
    -------
    tuple
        ``(X_train, y_train, X_validate, y_validate, X_test, y_test)``.
        Each X holds the columns ``pclass``, ``sex``, ``age`` and ``alone``;
        each y is the ``survived`` column.
    '''
    # don't blow away our original data
    titanic = titanic.copy()

    # ignore warnings just for this block
    with warnings.catch_warnings():
        warnings.simplefilter('ignore')
        # prep_titanic also hands back a scaler and an encoder; they are not used here
        _scaler, _encoder, train, test = prepare.prep_titanic(titanic)

    # Which features are we going to look at?
    cols = ['survived', 'pclass', 'sex', 'age', 'alone']
    train = train[cols]
    test = test[cols]

    # validate data split
    # Use the `train_test_split` name imported at the top of the notebook: the
    # bare `sklearn` package name is never imported, so spelling this as
    # `sklearn.model_selection.train_test_split` raised a NameError.
    train, validate = train_test_split(
        train, train_size=.85, random_state=123
    )

    # split into X and y
    X_train, y_train = train.drop(columns='survived'), train.survived
    X_validate, y_validate = validate.drop(columns='survived'), validate.survived
    X_test, y_test = test.drop(columns='survived'), test.survived

    # NOTE(review): assumes prep_titanic leaves `sex` as the strings
    # 'male'/'female'; if it is already numeric, encode_sex yields all zeros.
    X_train = encode_sex(X_train)
    X_validate = encode_sex(X_validate)
    X_test = encode_sex(X_test)

    return X_train, y_train, X_validate, y_validate, X_test, y_test

In [None]:
logistic_regression_util.plot_metrics_by_thresholds(
    evaluation.actual, evaluation['survived ~ pclass + age + sex + alone']
)

It looks like the model with more features is better here.

In [None]:
t = .66

predictions = (evaluation['survived ~ pclass + age + sex + alone'] > t).astype(int)
actual = evaluation.actual

# confusion matrix
pd.crosstab(predictions, actual, normalize=True)

In [None]:
X_train, y_train, X_validate, y_validate, X_test, y_test = get_splits(titanic)

# TODO: allow for a threshold
# TODO: include precision and recall
def evaluate_model(c):
    '''
    Fit a logistic regression with ``C=c`` and summarize it.

    Reads the notebook-level ``X_train``, ``y_train``, ``X_validate`` and
    ``y_validate``: the model is fit on the train split and scored on the
    validate split.

    Parameters
    ----------
    c : float
        Value passed to ``LogisticRegression`` as its ``C`` hyperparameter.

    Returns
    -------
    dict
        ``{'C': c, 'accuracy': <score on validate>}`` plus one entry per
        column of ``X_train`` mapping the column name to its coefficient
        (taken from the first row of ``model.coef_``).
    '''
    # `LogisticRegression` comes from the notebook's import cell; the bare
    # `sklearn` package name is never imported, so the previous
    # `sklearn.linear_model.LogisticRegression` spelling raised a NameError.
    model = LogisticRegression(C=c)
    model.fit(X_train, y_train)
    accuracy = model.score(X_validate, y_validate)
    coefs = dict(zip(X_train.columns, model.coef_[0]))
    return {'C': c, 'accuracy': accuracy, **coefs}

models = [evaluate_model(c) for c in [.001, .01, .1, 1, 10, 100, 1000]]
(pd.DataFrame(models).round(3)
 .set_index(['C', 'accuracy'])
 .style
 .set_caption('Effect of differnt C values on accuracy (t=.5) and the resulting coefficients.')
 .set_precision(3)
#  .background_gradient('Blues')
#  .highlight_max() # for columns
#  .highlight_max(axis=1) # for rows
)

In [None]:
dict(zip(X_train.columns, model.coef_[0]))

5. **Bonus** How do different strategies for handling the missing values in the age column affect model performance?

6. **Bonus**: How do different strategies for encoding sex affect model performance?

7. **Bonus**: scikit-learn's LogisticRegression classifier applies a regularization penalty to the coefficients by default. This penalty causes the magnitude of the coefficients in the resulting model to be smaller than they otherwise would be. The strength of the penalty can be modified with the C hyperparameter. Small values of C correspond to a larger penalty, and large values of C correspond to a smaller penalty.

    Try out the following values for C and note how the coefficients and the model's performance on both the dataset it was trained on and on the validate split are affected.
                                C=.01,.1,1,10,100,1000

**Bonus Bonus**: how does scaling the data interact with your choice of C?

In [None]:
df = sns.load_dataset("titanic")
df.head()

In [None]:
# drop duplicate columns
# Also drop the columns that the Explore stage showed don't have much bearing on survival
df = df[["survived", "pclass", "sex", "age", "fare"]]
df.head()

In [None]:
def encode_gender(gender):
    '''
    Encode a single ``sex`` value as an integer: "male" -> 0, anything else -> 1.

    Values that are already 0 or 1 are returned unchanged, so re-running the
    cell that applies this function to an encoded column does not turn every
    row into 1.
    '''
    # already encoded (e.g. the apply cell was executed a second time)
    if gender in (0, 1):
        return gender
    return 0 if gender == "male" else 1

In [None]:
df.sex = df.sex.apply(encode_gender)
df.head()

In [None]:
df.info()

In [None]:
print(f"Survived nulls: {df.survived.isna().sum()}")
print(f"Class nulls:  {df.pclass.isna().sum()}")
print(f"Gender nulls: {df.sex.isna().sum()}")
print(f"Age nulls: {df.age.isna().sum()}")
print(f"Fare nulls: {df.fare.isna().sum()}")

In [None]:
# nice and clean
df.isna().sum()

In [None]:
# get the median age
median_age = df[df.age.notnull()].age.median()
median_age

In [None]:
# the pandas .median method ignores nulls
df.age.median()

In [None]:
# fill the nulls w/ the median
df.age = df.age.fillna(median_age)
print(f"Age nulls: {df.age.isna().sum()}")

In [None]:
# Setup the X and y variables
# X is every column except the target.
X = df.drop(columns="survived")
# y is kept 1-D (a Series): scikit-learn estimators expect a target of shape
# (n_samples,), and a one-column DataFrame only works via an implicit conversion.
y = df["survived"]

In [None]:
# Setup the train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .30, random_state = 123)

In [None]:
# for classification you can set the split criterion to gini or entropy (information gain); the default is gini.
clf = DecisionTreeClassifier(criterion='entropy', max_depth=8, random_state=123)

1. Fit the decision tree classifier to your training sample and transform (i.e. make predictions on the training sample)

In [None]:
clf.fit(X_train, y_train)

In [None]:
y_pred = clf.predict(X_train)
y_pred

In [None]:
print(classification_report(y_train, y_pred))

In [None]:
# Get the predicted y values from the X_test
y_pred = clf.predict(X_test)

In [None]:
print(f"Accuracy of Decision Tree on Test data is: {clf.score(X_test, y_test)}")

In [None]:
print(classification_report(y_test, y_pred))

In [None]:
# Export the fitted tree `clf` as DOT source and render it with graphviz.
# NOTE(review): `graphviz` is not imported in any cell shown in this notebook --
# add `import graphviz` to the imports cell before running this one.
dot_data = export_graphviz(clf, out_file=None) 
graph = graphviz.Source(dot_data) 

# NOTE(review): the output is named for iris, but `clf` was fit on the Titanic
# split above -- consider renaming the file.
graph.render('iris_decision_tree', view=True)

1. Fit the Random Forest classifier to your training sample and transform (i.e. make predictions on the training sample) setting the random_state accordingly and setting min_samples_leaf = 1 and max_depth = 20.

2. Evaluate your results using the model score, confusion matrix, and classification report.
   

3. Print and clearly label the following: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.
   

4. Run through steps increasing your min_samples_leaf to 5 and decreasing your max_depth to 3.
   

5. What are the differences in the evaluation metrics? Which performs better on your in-sample data? Why?