In [1]:
# imports...
import pandas as pd
import numpy as np
import os
from scipy import stats
# visualize
import seaborn as sns
import matplotlib.pyplot as plt
plt.rc('figure', figsize=(11, 9))
plt.rc('font', size=13)
# turn off pink warning boxes
import warnings
warnings.filterwarnings("ignore")
# acquire
from env import host, user, password
from pydataset import data
from sklearn.dummy import DummyClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.tree import DecisionTreeClassifier, plot_tree, export_text
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
# my fancy docs
import acquire
import prepare
import explore
import model_func
import mf

In [2]:
# need to convert

## Exercises
Using the titanic data, in your classification-exercises repository, create a notebook, model.ipynb where you will do the following:

In [3]:
# positive: survives
# negative: dies
# TP: predict survival, and the passenger actually survived
# TN: predict death, and passenger actually died
# FN: predict death, but passenger actually survived
# FP: predict survival, but passenger actually died

In [4]:
# lets bring in the data set
# titanic_df = acquire.new_titanic_data()
# now let's prep it
train, validate, test = prepare.prep_titanic_data(acquire.new_titanic_data(), column = 'age', method = 'median', dummies = ['embarked', 'sex'])
# split it
# train, validate, test = prepare.titanic_split(titanic_df)
# what do they look like?
train.shape, validate.shape, test.shape

((498, 11), (214, 11), (179, 11))

In [5]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 498 entries, 583 to 744
Data columns (total 11 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   passenger_id  498 non-null    int64  
 1   survived      498 non-null    int64  
 2   pclass        498 non-null    int64  
 3   age           498 non-null    float64
 4   sibsp         498 non-null    int64  
 5   parch         498 non-null    int64  
 6   fare          498 non-null    float64
 7   alone         498 non-null    int64  
 8   embarked_Q    498 non-null    uint8  
 9   embarked_S    498 non-null    uint8  
 10  sex_male      498 non-null    uint8  
dtypes: float64(2), int64(6), uint8(3)
memory usage: 36.5 KB


In [6]:
X_train, y_train = train.drop(columns='survived'), train.survived
X_validate, y_validate = validate.drop(columns='survived'), validate.survived
X_test, y_test = test.drop(columns='survived'), test.survived

In [7]:
# i want to see relationships between each column and 'survived'
# 'pclass' - chi2
observed1 = pd.crosstab(train.survived, train.pclass)
chi2, p, degf, expected = stats.chi2_contingency(observed1)
print('     pclass')
print('')
print(f'chi^2 = {chi2:.4f}')
print(f'p     = {p:.4f}')
print('------------------------------------')
# 'sibsp' - chi2
observed2 = pd.crosstab(train.survived, train.sibsp)
chi2, p, degf, expected = stats.chi2_contingency(observed2)
print('     sibsp')
print('')
print(f'chi^2 = {chi2:.4f}')
print(f'p     = {p:.4f}')
print('------------------------------------')
# 'parch' - chi2
observed3 = pd.crosstab(train.survived, train.parch)
chi2, p, degf, expected = stats.chi2_contingency(observed3)
print('     parch')
print('')
print(f'chi^2 = {chi2:.4f}')
print(f'p     = {p:.4f}')
print('-------------------------------------')
# 'alone' - chi2
observed4 = pd.crosstab(train.survived, train.alone)
chi2, p, degf, expected = stats.chi2_contingency(observed4)
print('     alone')
print('')
print(f'chi^2 = {chi2:.4f}')
print(f'p     = {p:.4f}')
print('-------------------------------------')
# 'embarked_Q' - chi2
observed5 = pd.crosstab(train.survived, train.embarked_Q)
chi2, p, degf, expected = stats.chi2_contingency(observed5)
print('     embarked_Q')
print('')
print(f'chi^2 = {chi2:.4f}')
print(f'p     = {p:.4f}')
print('-------------------------------------')
# 'embarked_S' - chi2
observed6 = pd.crosstab(train.survived, train.embarked_S)
chi2, p, degf, expected = stats.chi2_contingency(observed6)
print('     embarked_S')
print('')
print(f'chi^2 = {chi2:.4f}')
print(f'p     = {p:.4f}')
print('-------------------------------------')
# 'sex_male' - chi2
observed7 = pd.crosstab(train.survived, train.sex_male)
chi2, p, degf, expected = stats.chi2_contingency(observed7)
print('     sex_male')
print('')
print(f'chi^2 = {chi2:.4f}')
print(f'p     = {p:.4f}')
print('-------------------------------------')

     pclass

chi^2 = 55.2252
p     = 0.0000
------------------------------------
     sibsp

chi^2 = 24.8926
p     = 0.0004
------------------------------------
     parch

chi^2 = 15.4412
p     = 0.0086
-------------------------------------
     alone

chi^2 = 18.1920
p     = 0.0000
-------------------------------------
     embarked_Q

chi^2 = 0.3542
p     = 0.5517
-------------------------------------
     embarked_S

chi^2 = 12.3251
p     = 0.0004
-------------------------------------
     sex_male

chi^2 = 159.2890
p     = 0.0000
-------------------------------------


In [8]:
# now the continuous...
# 'age'
# 'fare'

# create survival_rate
survival_rate = train['survived'].mean()
# create age variable for ttest
age = train['age']
# ttest for age
alpha = 0.01
t, p = stats.ttest_1samp(age, survival_rate)
print('     age')
print('')
print('tscore:', t.round(2))
print('p/2:   ', p/2)
print('alpha: ', alpha)
print('-------------------------------------')
# create fare variable for ttest
fare = train['fare']
# ttest for fare
t, p = stats.ttest_1samp(fare, survival_rate)
print('     fare')
print('')
print('tscore:', t.round(2))
print('p/2:   ', p/2)
print('alpha: ', alpha)
print('-------------------------------------')

     age

tscore: 49.39
p/2:    3.8085496775045816e-194
alpha:  0.01
-------------------------------------
     fare

tscore: 14.55
p/2:    1.6753909571183626e-40
alpha:  0.01
-------------------------------------


In [9]:
# so which columns do I want to keep for my model?????
# 'sex_male', 'pclass', 'age', 'sibsp'
# and maybe 'alone'

In [None]:
X_cols = ['sex_male', 'pclass', 'age', 'sibsp']
y_col = 'survived'

X_train, y_train = train[X_cols], train[y_col]
X_validate, y_validate = validate[X_cols], validate[y_col]
X_test, y_test = test[X_cols], test[y_col]

In [None]:
# X_train is the DF w/o the 'survived' column
# y_train is the 'survived' column as a Series

1. What is your baseline prediction? What is your baseline accuracy? remember: your baseline prediction for a classification problem is predicting the most prevelant class in the training dataset (the mode). When you make those predictions, what is your accuracy? This is your baseline accuracy.

In [None]:
# what is the mode of 'survived'?
y_train.value_counts()
# death wins

In [None]:
# 1. Create the object
baseline = DummyClassifier(strategy='constant', constant=0)
# 2. Fit the object
baseline.fit(X_train, y_train)
# how does it do on training data set?
print('Baseline accuracy: %.4f' % baseline.score(X_train, y_train))

2. Fit the decision tree classifier to your training sample and transform (i.e. make predictions on the training sample)

In [None]:
# let's look w/ default hyperparameters
tree = DecisionTreeClassifier()
tree.fit(X_train, y_train)

In [None]:
# how does the DT perform w/ default hyperparameters?
print(f'training score: {tree.score(X_train, y_train):.2%}')
print(f'validate score: {tree.score(X_validate, y_validate):.2%}')

In [None]:
# what does the tree look like?
print(export_text(tree, feature_names=X_train.columns.tolist(), show_weights=True))

In [None]:
# this is garbage

plot_tree(tree, filled=True, rounded=True)

In [None]:
# now let's dive in and take a close look once we add some hyperparameters



In [None]:
tree1 = DecisionTreeClassifier(max_leaf_nodes=4)
tree1.fit(X_train, y_train)

In [None]:
print(f'training score: {tree1.score(X_train, y_train):.2%}')
print(f'validate score: {tree1.score(X_validate, y_validate):.2%}')

In [None]:
plot_tree(tree1, filled=True, rounded=True)

3. Evaluate your in-sample results using the model score, confusion matrix, and classification report.

In [None]:
print(f'Tree 1 training score: {tree1.score(X_train, y_train):.2%}')
print(f'Tree 1 validate score: {tree1.score(X_validate, y_validate):.2%}')

4. Compute: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.

In [None]:
# let's add the predictions to the df

train['prediction'] = tree1.predict(X_train)
train.head()

In [None]:
# confusion matrix
confusion_matrix(train.survived, train.prediction)

In [None]:
train['survived'].value_counts()

In [None]:
train.prediction.value_counts()

In [None]:
pre1_df = pd.DataFrame([['TN', 'FP'],['FN', 'TP']], index=['actual death', 'actual survived'], columns=['pred death', 'pred survived'])

In [None]:
# Carl Sagan went to outer space to find Madeleine's code...
pre1_df + ' : ' + confusion_matrix(train.survived, train.prediction).astype(str)

In [None]:
# 1st 4 create the variables
tp = 141
tn = 261
fp = 46
fn = 50
# these calculate the rates
tpr = tp/(tp+fn)
fpr = fp/(fp+tn)
tnr = tn/(tn+fp)
fnr = fn/(fn+tp)

In [None]:
# accuracy
accuracy_1 = (train.survived == train.prediction).mean()
# Precision
subset = train[train.prediction == 1]
precision_t1 = (subset.prediction == subset.survived).mean()
# Recall
subset = train[train.survived == 1]
recall_t1 = (subset.prediction == subset.survived).mean()
# f1-score


pd.DataFrame(classification_report(train.survived, train.prediction, output_dict=True)).T

In [None]:
print(f'The overall Accuracy is {accuracy_1:.2%}')
print(f'The True Positive rate is {tpr:.2%}')
print(f'The False Positive rate is {fpr:.2%}')
print(f'The True Negative rate is {tnr:.2%}')
print(f'The False Negative rate is {fnr:.2%}')
print(f'Precision for tree 1 is {precision_t1:.2%}')
print(f'Recall for tree 1 is {recall_t1:.2%}')

In [None]:
classification = classification_report(train.survived, train.prediction, target_names=['Survive', 'Death'])

In [None]:
accuracy = accuracy_score(train.survived, train.prediction)
precision = precision_score(train.survived, train.prediction, pos_label=1)
recall = recall_score(train.survived, train.prediction, pos_label=1)
classification = classification_report(train.survived, train.prediction, output_dict=True)
pd.DataFrame(classification)

5. Run through steps 2-4 using a different max_depth value.

In [None]:
tree2 = DecisionTreeClassifier(max_depth=4)
tree2.fit(X_train, y_train)

In [None]:
print(f'training score: {tree2.score(X_train, y_train):.2%}')
print(f'validate score: {tree2.score(X_validate, y_validate):.2%}')

In [None]:
train['prediction_2'] = tree2.predict(X_train)

In [None]:
pre2_df = pd.DataFrame([['TN', 'FP'],['FN', 'TP']], index=['actual death', 'actual survived'], columns=['pred_2 death', 'pred_2 survived'])

In [None]:
pre2_df + ' : ' + confusion_matrix(train.survived, train.prediction_2).astype(str)

In [None]:
tp = 140
tn = 281
fp = 26
fn = 51
tpr = tp/(tp+fn)
fpr = fp/(fp+tn)
tnr = tn/(tn+fp)
fnr = fn/(fn+tp)

In [None]:
# accuracy
accuracy_2 = (train.survived == train.prediction_2).mean()
# Precision
subset = train[train.prediction_2 == 1]
precision_t2 = (subset.prediction_2 == subset.survived).mean()
# Recall
subset = train[train.survived == 1]
recall_t2 = (subset.prediction_2 == subset.survived).mean()
# f1-score


class_report = pd.DataFrame(classification_report(train.survived, train.prediction_2, output_dict=True)).T

In [None]:
class_report = class_report.rename(index={'0': 'Died', '1': 'Survived'})

In [None]:
class_report

In [None]:
print(f'The overall Accuracy of Tree 2 is {accuracy_2:.2%}')
print(f'The True Positive rate of Tree 2 is {tpr:.2%}')
print(f'The False Positive rate Tree 2 is {fpr:.2%}')
print(f'The True Negative rate of Tree 2 is {tnr:.2%}')
print(f'The False Negative rate of Tree 2 is {fnr:.2%}')
print(f'Precision for tree 2 is {precision_t2:.2%}')
print(f'Recall for tree 2 is {recall_t2:.2%}')

6. Which model performs better on your in-sample data?

In [None]:
# model 2 performs better on my train set
# it is (at least slightly) overfit

7. Which model performs best on your out-of-sample data, the validate set?

In [None]:
# model 2 also performs better on the validate set

In [None]:
# save for later... Thanks Heather, and Parker, and Chad

def run_metrics(model, data_set):
    """
    This function takes in a model and ouputs metrics. 
    model = name of class model
    data_set = train, validate, test (AS A STRING)
    Will output the Precision Score, the classification report, and the confusion matrix
    It is advisable to print the name of the model you're working with before hand for clarity
    i.e. print('Metrics for Model 1 with Train data\n')
    """
    if data_set == 'train':
        X = X_train
        y = y_train
        df = train
    if data_set == 'validate':
        X = X_validate
        y = y_validate
        df = validate
    if data_set == 'test':
        X = X_test
        y = y_test
        df = test
    score = model.score(X, y)
    matrix = confusion_matrix(y, model.predict(X))
    tpr = matrix[1,1] / (matrix[1,1] + matrix[1,0])
    fpr = matrix[0,1] / (matrix[0,1] + matrix[0,0])
    tnr = matrix[0,0] / (matrix[0,0] + matrix[0,1])
    fnr = matrix[1,0] / (matrix[1,1] + matrix[1,0])
    print(f'{data_set} data set accuracy score: {score:.2%}')
    class_report = classification_report(y, model.predict(X), zero_division=True)
    print('-------------------------------')
    print(f'classification report')
    print(class_report)
    print ('-------------------------------')
    print('')
    print('confusion matrix')
    print(matrix)
    print(' ')
    print(f'{data_set} data set model metrics')
    print('---------------------------------')
    print(f'True positive rate for the model is {tpr:.2%}')
    print(f'False positive rate for the model is  {fpr:.2%}')
    print(f'True negative rate for the model is {tnr:.2%}')
    print(f'False negative rate for the model is {fnr:.2%}')

1. Fit the Random Forest classifier to your training sample and transform (i.e. make predictions on the training sample) setting the random_state accordingly and setting min_samples_leaf = 1 and max_depth = 10.



In [None]:
rf1 = RandomForestClassifier(bootstrap=True, 
                            class_weight=None, 
                            criterion='gini',
                            min_samples_leaf=1,
                            n_estimators=100,
                            max_depth=10, 
                            random_state=1221)

In [None]:
rf1.fit(X_train, y_train)

In [None]:
y_pred = rf1.predict(X_train)

In [None]:
y_pred_proba = rf1.predict_proba(X_train)

In [None]:
print('Accuracy of random forest classifier on training set: {:.2f}'
     .format(rf1.score(X_train, y_train)))

In [None]:
print(confusion_matrix(y_train, y_pred))

In [None]:
run_metrics(rf1, 'train')

2. Evaluate your results using the model score, confusion matrix, and classification report.

3. Print and clearly label the following: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.

4. Run through steps increasing your min_samples_leaf and decreasing your max_depth.

In [None]:
rf2 = RandomForestClassifier(bootstrap=True, 
                            class_weight=None, 
                            criterion='gini',
                            min_samples_leaf=3,
                            n_estimators=100,
                            max_depth=8, 
                            random_state=1221)

In [None]:
rf2.fit(X_train, y_train)
y_pred = rf1.predict(X_train)
y_pred_proba = rf1.predict_proba(X_train)
run_metrics(rf2, 'train')



In [None]:
rf3 = RandomForestClassifier(bootstrap=True, 
                            class_weight=None, 
                            criterion='gini',
                            min_samples_leaf=5,
                            n_estimators=100,
                            max_depth=8, 
                            random_state=1221)

In [None]:
rf3.fit(X_train, y_train)
y_pred = rf3.predict(X_train)
y_pred_proba = rf3.predict_proba(X_train)
run_metrics(rf3, 'train')

In [None]:
# rf 1 train vs validate
run_metrics(rf1, 'validate')

In [None]:
run_metrics(rf2, 'validate')

In [None]:
run_metrics(rf3, 'validate')

In [None]:
rf4 = RandomForestClassifier(bootstrap=True, 
                            class_weight=None, 
                            criterion='gini',
                            min_samples_leaf=3,
                            n_estimators=100,
                            max_depth=6, 
                            random_state=1221)

In [None]:
rf4.fit(X_train, y_train)
y_pred = rf4.predict(X_train)
y_pred_proba = rf4.predict_proba(X_train)
run_metrics(rf4, 'train')

In [None]:
run_metrics(rf4, 'validate')

5. What are the differences in the evaluation metrics? Which performs better on your in-sample data? Why?

Final Thoughts... After making a few models, which one has the best performance (or closest metrics) on both train and validate?

In [None]:
# model 1 is way overfit
# 91% accuracy on train, 79% on validate

# model 2 is still too overfit
# 87% accuracy on train, 81% on validate

# model 3 is tsimilar to model 2
# 86% accuracy on train, 79% on validate

# model 4 is the best so far...
# 86% accuracy on train, 81% on validate

In [None]:
run_metrics(tree2, 'train')

In [None]:
run_metrics(tree1, 'train')

In [None]:
# I like this function better...
model_func.model_performs(X_train, y_train, tree1)

## KNN Exercises

1. Fit a K-Nearest Neighbors classifier to your training sample and transform (i.e. make predictions on the training sample)

In [None]:
X_train = train[['pclass', 'sex_male', 'age']]
X_validate = validate[['pclass', 'sex_male', 'age']]
y_train = train.survived
y_validate = validate.survived

In [None]:
knn_1 = KNeighborsClassifier(n_neighbors=1)
knn_1.fit(X_train, y_train)
knn_1.score(X_train, y_train)

2. Evaluate your results using the model score, confusion matrix, and classification report.

In [None]:
model_func.model_performs(X_train, y_train, knn_1)

In [None]:
print(f'training score: {knn_1.score(X_train, y_train):.2%}')
print(f'validate score: {knn_1.score(X_validate, y_validate):.2%}')

3. Print and clearly label the following: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.

4. Run through steps 2-4 setting k to 10

In [None]:
knn_10 = KNeighborsClassifier(n_neighbors=10)
knn_10.fit(X_train, y_train)
knn_10.score(X_train, y_train)


In [None]:
model_func.model_performs(X_train, y_train, knn_10)

In [None]:
print(f'training score: {knn_10.score(X_train, y_train):.2%}')
print(f'validate score: {knn_10.score(X_validate, y_validate):.2%}')

5. Run through setps 2-4 setting k to 20

In [None]:
knn_20 = KNeighborsClassifier(n_neighbors=20)
knn_20.fit(X_train, y_train)
knn_20.score(X_train, y_train)

In [None]:
model_func.model_performs(X_train, y_train, knn_20)

In [None]:
print(f'training score: {knn_20.score(X_train, y_train):.2%}')
print(f'validate score: {knn_20.score(X_validate, y_validate):.2%}')

In [None]:
mf.model_performs(X_train, y_train, knn_20)

In [None]:
for k in range(2,40,2):
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train, y_train)
    accuracy = knn.score(X_validate, y_validate)
    print(f'{k:2d}: {accuracy:.2%}')

In [None]:
for k in range(2,40,2):
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train, y_train)
    print(f'Model: {k}')
    print(f'Training score: {knn.score(X_train, y_train):.2%}')
    print(f'Validate Score: {knn.score(X_validate, y_validate):.2%}')
    print('----------------------------------------------------')

In [None]:
# now that we have an overview, lets tighten it up...

for k in range(5,15):
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train, y_train)
    print(f'Model: {k}')
    print(f'Training score: {knn.score(X_train, y_train):.2%}')
    print(f'Validate Score: {knn.score(X_validate, y_validate):.2%}')
    print('----------------------------------------------------')

6. What are the differences in the evaluation metrics? Which performs better on your in-sample data? Why?

- The higher k, the better it performs on in-sample data (overfit)
- up until k = 12

7. Which model performs best on our out-of-sample data from validate?

- I think k=11 performs the best...
- on train= 78.92%
- on validate= 78.50%

In [None]:
knn_11 = KNeighborsClassifier(n_neighbors=11)
knn_11.fit(X_train, y_train)
knn_11.score(X_train, y_train)

In [None]:
model_func.model_performs(X_train, y_train, knn_11)

In [None]:
# # what if I simplified...
# X_train = train[['sex_male', 'pclass']]
# X_validate = validate[['sex_male', 'pclass']]
# y_train = train.survived
# y_validate = validate.survived




In [None]:
# for k in range(2,40,2):
#     knn = KNeighborsClassifier(n_neighbors=k)
#     knn.fit(X_train, y_train)
#     print(f'Model: {k}')
#     print(f'Training score: {knn.score(X_train, y_train):.2%}')
#     print(f'Validate Score: {knn.score(X_validate, y_validate):.2%}')
#     print('----------------------------------------------------')

In [None]:
mf.compare(knn_11, knn_20, X_train, y_train)

In [None]:
mf.compare_train_validate(knn_11, X_train, y_train, X_validate, y_validate)

## Logically Logistic 

1. Create a model that includes age in addition to fare and pclass. Does this model perform better than your baseline?



In [None]:
X_train = train[['pclass', 'fare', 'age']]
X_validate = validate[['pclass', 'fare', 'age']]
y_train = train.survived
y_validate = validate.survived

In [None]:
logit_1 = LogisticRegression(C=1, class_weight={0:61, 1:39}, random_state = 1221, intercept_scaling = 1, solver = 'lbfgs')

In [None]:
logit_1.fit(X_train, y_train)

In [None]:
print('Coefficient: \n', logit_1.coef_)
print('Intercept: \n', logit_1.intercept_)

In [None]:
y_pred = logit_1.predict(X_train)

In [None]:
y_pred_proba = logit_1.predict_proba(X_train)

In [None]:
print('Accuracy of Logistic Regression classifier 1 on training set: {:.2%}'.format(logit_1.score(X_train, y_train)))

2. Include sex in your model as well. Note that you'll need to encode or create a dummy variable of this feature before including it in a model.

In [None]:
X_train = train[['pclass', 'fare', 'age', 'sex_male']]
X_validate = validate[['pclass', 'fare', 'age', 'sex_male']]
y_train = train.survived
y_validate = validate.survived

In [None]:
logit_2 = LogisticRegression(C=1, class_weight={0:61, 1:39}, random_state = 1221, intercept_scaling = 1, solver = 'lbfgs')

In [None]:
logit_2.fit(X_train, y_train)

In [None]:
print('Coefficient: \n', logit_2.coef_)
print('Intercept: \n', logit_2.intercept_)

In [None]:
y_pred = logit_2.predict(X_train)
y_pred_proba = logit_2.predict_proba(X_train)

In [None]:
print('Accuracy of Logistic Regression classifier 2 on training set: {:.2%}'.format(logit_2.score(X_train, y_train)))

3. Try out other combinations of features and models.

In [None]:
X_train = train[['pclass', 'fare', 'age', 'sex_male']]
X_validate = validate[['pclass', 'fare', 'age', 'sex_male']]
y_train = train.survived
y_validate = validate.survived

In [None]:
logit_3 = LogisticRegression(C=0.5, random_state = 1221, solver = 'liblinear')

In [None]:
logit_3.fit(X_train, y_train)

In [None]:
print('Coefficient: \n', logit_3.coef_)
print('Intercept: \n', logit_3.intercept_)

In [None]:
y_pred = logit_3.predict(X_train)
y_pred_proba = logit_3.predict_proba(X_train)

In [None]:
print('Accuracy of Logistic Regression classifier 3 on training set: {:.2%}'.format(logit_3.score(X_train, y_train)))

In [None]:
X_train = train[['pclass', 'fare', 'age', 'sex_male']]
X_validate = validate[['pclass', 'fare', 'age', 'sex_male']]
y_train = train.survived
y_validate = validate.survived
logit_4 = LogisticRegression(C=0.5, random_state = 1221, fit_intercept=True, intercept_scaling=1, solver = 'liblinear')
logit_4.fit(X_train, y_train)
y_pred = logit_4.predict(X_train)
y_pred_proba = logit_4.predict_proba(X_train)
print('Coefficient: \n', logit_4.coef_)
print('Intercept: \n', logit_4.intercept_)
print('Accuracy of Logistic Regression classifier 4 on training set: {:.2%}'.format(logit_4.score(X_train, y_train)))

In [None]:
X_train = train[['pclass', 'fare', 'age', 'sex_male']]
X_validate = validate[['pclass', 'fare', 'age', 'sex_male']]
y_train = train.survived
y_validate = validate.survived
logit_5 = LogisticRegression(C=1, random_state = 1221, fit_intercept=True, intercept_scaling=1, solver = 'lbfgs')
logit_5.fit(X_train, y_train)
y_pred = logit_5.predict(X_train)
y_pred_proba = logit_5.predict_proba(X_train)
print('Coefficient: \n', logit_5.coef_)
print('Intercept: \n', logit_5.intercept_)
print('Accuracy of Logistic Regression classifier 4 on training set: {:.2%}'.format(logit_5.score(X_train, y_train)))

In [None]:
X_train = train[['pclass', 'age', 'sex_male']]
X_validate = validate[['pclass', 'age', 'sex_male']]
y_train = train.survived
y_validate = validate.survived
logit_6 = LogisticRegression(C=1, random_state = 1221, fit_intercept=True, intercept_scaling=1, solver = 'lbfgs')
logit_6.fit(X_train, y_train)
y_pred = logit_6.predict(X_train)
y_pred_proba = logit_6.predict_proba(X_train)
print('Coefficient: \n', logit_6.coef_)
print('Intercept: \n', logit_6.intercept_)
print('Accuracy of Logistic Regression classifier 4 on training set: {:.2%}'.format(logit_6.score(X_train, y_train)))

4. Use you best 3 models to predict and evaluate on your validate sample.

In [None]:
# what are my top three models?
# 5, 6, 4

In [None]:
# 5 - 80.72% on train, 78.04% on validate
X_train = train[['pclass', 'fare', 'age', 'sex_male']]
X_validate = validate[['pclass', 'fare', 'age', 'sex_male']]
y_train = train.survived
y_validate = validate.survived
logit_5 = LogisticRegression(C=1, random_state = 1221, fit_intercept=True, intercept_scaling=1, solver = 'lbfgs')
logit_5.fit(X_train, y_train)
y_pred = logit_5.predict(X_validate)
y_pred_proba = logit_5.predict_proba(X_validate)
print('Coefficient: \n', logit_5.coef_)
print('Intercept: \n', logit_5.intercept_)
print('Accuracy of Logistic Regression classifier 4 on training set: {:.2%}'.format(logit_5.score(X_validate, y_validate)))


In [None]:
X_train = train[['pclass', 'age', 'sex_male']]
X_validate = validate[['pclass', 'age', 'sex_male']]
y_train = train.survived
y_validate = validate.survived
logit_6 = LogisticRegression(C=1, random_state = 1221, fit_intercept=True, intercept_scaling=1, solver = 'lbfgs')
logit_6.fit(X_train, y_train)
y_pred = logit_6.predict(X_validate)
y_pred_proba = logit_6.predict_proba(X_validate)
print('Coefficient: \n', logit_6.coef_)
print('Intercept: \n', logit_6.intercept_)
print('Accuracy of Logistic Regression classifier 4 on training set: {:.2%}'.format(logit_6.score(X_validate, y_validate)))

In [None]:
# 4
X_train = train[['pclass', 'fare', 'age', 'sex_male']]
X_validate = validate[['pclass', 'fare', 'age', 'sex_male']]
y_train = train.survived
y_validate = validate.survived
logit_4 = LogisticRegression(C=0.5, random_state = 1221, fit_intercept=True, intercept_scaling=1, solver = 'liblinear')
logit_4.fit(X_train, y_train)
y_pred = logit_4.predict(X_validate)
y_pred_proba = logit_4.predict_proba(X_validate)
print('Coefficient: \n', logit_4.coef_)
print('Intercept: \n', logit_4.intercept_)
print('Accuracy of Logistic Regression classifier 4 on training set: {:.2%}'.format(logit_4.score(X_validate, y_validate)))

5. Choose you best model from the validation performation, and evaluate it on the test dataset. How do the performance metrics compare to validate? to train?

In [None]:
# 5 made it this far...
X_train = train[['pclass', 'fare', 'age', 'sex_male']]
X_validate = validate[['pclass', 'fare', 'age', 'sex_male']]
y_train = train.survived
y_validate = validate.survived
logit_5 = LogisticRegression(C=1, random_state = 1221, fit_intercept=True, intercept_scaling=1, solver = 'lbfgs')
logit_5.fit(X_train, y_train)
y_pred = logit_5.predict(X_test)
y_pred_proba = logit_5.predict_proba(X_test)
print('Coefficient: \n', logit_5.coef_)
print('Intercept: \n', logit_5.intercept_)
print('Accuracy of Logistic Regression classifier 5 on test set: {:.2%}'.format(logit_5.score(X_test, y_test)))

In [None]:
# Holy Coyote!!!!! That was big time overfit

In [None]:
# 4
X_train = train[['pclass', 'fare', 'age', 'sex_male']]
X_validate = validate[['pclass', 'fare', 'age', 'sex_male']]
y_train = train.survived
y_validate = validate.survived
logit_4 = LogisticRegression(C=1, random_state = 1221, fit_intercept=True, intercept_scaling=1, solver = 'liblinear')
logit_4.fit(X_train, y_train)
y_pred = logit_4.predict(X_test)
y_pred_proba = logit_4.predict_proba(X_test)
print('Coefficient: \n', logit_4.coef_)
print('Intercept: \n', logit_4.intercept_)
print('Accuracy of Logistic Regression classifier 4 on training set: {:.2%}'.format(logit_4.score(X_test, y_test)))