In [1]:
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd

from pydataset import data

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from env import host, user, password

import matplotlib.pyplot as plt
import seaborn as sns
import graphviz
from graphviz import Graph

## Start of Decision Tree

In [2]:
def get_titanic_data(host = host, user = user, password = password):
    """Load the ``passengers`` table from the ``titanic_db`` MySQL database.

    Parameters
    ----------
    host, user, password : str
        Connection details; they default to the values imported from ``env``.
        ``user`` and ``password`` are expected as plain (not already
        URL-encoded) strings.

    Returns
    -------
    pandas.DataFrame
        The result of ``SELECT * FROM passengers``.
    """
    from urllib.parse import quote

    db = 'titanic_db'
    # Percent-encode the credentials so characters such as '@', ':' or '/'
    # are not mistaken for URL delimiters when the connection string is parsed.
    safe_user = quote(user, safe='')
    safe_password = quote(password, safe='')
    url = f'mysql+pymysql://{safe_user}:{safe_password}@{host}/{db}'
    return pd.read_sql('SELECT * FROM passengers', url)

In [3]:
titanic = get_titanic_data()

In [4]:
titanic.head()

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,deck,embark_town,alone
0,0,0,3,male,22.0,1,0,7.25,S,Third,,Southampton,0
1,1,1,1,female,38.0,1,0,71.2833,C,First,C,Cherbourg,0
2,2,1,3,female,26.0,0,0,7.925,S,Third,,Southampton,1
3,3,1,1,female,35.0,1,0,53.1,S,First,C,Southampton,0
4,4,0,3,male,35.0,0,0,8.05,S,Third,,Southampton,1


In [5]:
# What is your baseline prediction?
# Based on the numbers baseline should be that passengers did not survive

titanic.groupby('survived').count()

Unnamed: 0_level_0,passenger_id,pclass,sex,age,sibsp,parch,fare,embarked,class,deck,embark_town,alone
survived,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
0,549,549,549,424,549,549,549,549,549,67,549,549
1,342,342,342,290,342,342,342,340,342,136,340,342


In [6]:
titanic['baseline'] = 0

In [7]:
549 / (549 + 342)

0.6161616161616161

In [8]:
titanic.isna().sum()

passenger_id      0
survived          0
pclass            0
sex               0
age             177
sibsp             0
parch             0
fare              0
embarked          2
class             0
deck            688
embark_town       2
alone             0
baseline          0
dtype: int64

In [9]:
titanic['sex'] = (titanic['sex'] == 'female')

In [10]:
titanic.drop(columns = ['embark_town'], inplace = True)

In [11]:
# Drop the sparsely populated deck column, discard rows that still contain
# nulls, then one-hot encode the embarked and class columns.
titanic = titanic.drop(columns=['deck']).dropna(how='any')
dummies_embarked = pd.get_dummies(titanic[['embarked']])
dummies_class = pd.get_dummies(titanic[['class']])
# The original categorical columns and the row identifier are no longer needed.
titanic = titanic.drop(columns=['embarked', 'passenger_id', 'class'])


In [12]:
titanic = pd.concat([titanic, dummies_embarked, dummies_class], axis=1)

In [13]:
# What is your baseline accuracy?
# 61.62%
titanic

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,alone,baseline,embarked_C,embarked_Q,embarked_S,class_First,class_Second,class_Third
0,0,3,False,22.0,1,0,7.2500,0,0,0,0,1,0,0,1
1,1,1,True,38.0,1,0,71.2833,0,0,1,0,0,1,0,0
2,1,3,True,26.0,0,0,7.9250,1,0,0,0,1,0,0,1
3,1,1,True,35.0,1,0,53.1000,0,0,0,0,1,1,0,0
4,0,3,False,35.0,0,0,8.0500,1,0,0,0,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
885,0,3,True,39.0,0,5,29.1250,0,0,0,1,0,0,0,1
886,0,2,False,27.0,0,0,13.0000,1,0,0,0,1,0,1,0
887,1,1,True,19.0,0,0,30.0000,1,0,0,0,1,1,0,0
889,1,1,False,26.0,0,0,30.0000,1,0,1,0,0,1,0,0


In [14]:
# Fit the decision tree classifier to your training sample and transform 
# (i.e. make predictions on the training sample)
train, test = train_test_split(titanic, test_size=.2, random_state=123, stratify=titanic['survived'])
train, validate = train_test_split(train, test_size=.3, random_state=123, stratify=train['survived'])

In [15]:
train.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,alone,baseline,embarked_C,embarked_Q,embarked_S,class_First,class_Second,class_Third
450,0,2,False,36.0,1,2,27.75,0,0,0,0,1,0,1,0
543,1,2,False,32.0,1,0,26.0,0,0,0,0,1,0,1,0
157,0,3,False,30.0,0,0,8.05,1,0,0,0,1,0,0,1
462,0,1,False,47.0,0,0,38.5,1,0,0,0,1,1,0,0
397,0,2,False,46.0,0,0,26.0,1,0,0,0,1,0,1,0


In [16]:
# Separate the predictors from the target column (`survived`) for each split.
# NOTE(review): the constant `baseline` column added earlier is still among the
# predictors here; it carries no information (importance 0.0 in the forests below).
x_train, y_train = train.drop(columns=['survived']), train['survived']

x_validate, y_validate = validate.drop(columns=['survived']), validate['survived']

x_test, y_test = test.drop(columns=['survived']), test['survived']

In [17]:
clf3 = DecisionTreeClassifier(max_depth=3, random_state=123)
clf3 = clf3.fit(x_train, y_train)
y3_pred = clf3.predict(x_train)

In [18]:
# Export the fitted max_depth=3 tree as Graphviz DOT text and render it to a PDF
# (view=True also asks graphviz to open the rendered file).
# NOTE(review): the output name says "tips" although this is the Titanic tree, and
# a later cell renders clf4 to the same name, which overwrites this PDF.
dot_data = export_graphviz(clf3, feature_names= x_train.columns, rounded=True, filled=True, out_file=None)
graph = graphviz.Source(dot_data) 

graph.render('tips_decision_tree', view=True, format="pdf")

'tips_decision_tree.pdf'

In [19]:
# Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, 
# precision, recall, f1-score, and support.


# True positive: survived, predicted survived
# True negative: drowned, predicted drowned
# False positive: drowned, predicted survived
# False negative: survived, predicted drowned

In [20]:
confusion_matrix(y_train, y3_pred)

array([[230,   7],
       [ 56, 105]])

In [21]:
print(classification_report(y_train, y3_pred))

              precision    recall  f1-score   support

           0       0.80      0.97      0.88       237
           1       0.94      0.65      0.77       161

    accuracy                           0.84       398
   macro avg       0.87      0.81      0.82       398
weighted avg       0.86      0.84      0.83       398



In [22]:
clf4 = DecisionTreeClassifier(max_depth=4, random_state=123)
clf4 = clf4.fit(x_train, y_train)
y4_pred = clf4.predict(x_train)

In [23]:
confusion_matrix(y_train, y4_pred)

array([[231,   6],
       [ 55, 106]])

In [24]:
print(classification_report(y_train, y4_pred))

              precision    recall  f1-score   support

           0       0.81      0.97      0.88       237
           1       0.95      0.66      0.78       161

    accuracy                           0.85       398
   macro avg       0.88      0.82      0.83       398
weighted avg       0.86      0.85      0.84       398



In [25]:
y3_val = clf3.predict(x_validate)
print(classification_report(y_validate, y3_val))

              precision    recall  f1-score   support

           0       0.75      0.95      0.84       102
           1       0.88      0.52      0.65        69

    accuracy                           0.78       171
   macro avg       0.81      0.74      0.75       171
weighted avg       0.80      0.78      0.76       171



In [26]:
y4_val = clf4.predict(x_validate)
print(classification_report(y_validate, y4_val))

              precision    recall  f1-score   support

           0       0.74      0.94      0.83       102
           1       0.86      0.52      0.65        69

    accuracy                           0.77       171
   macro avg       0.80      0.73      0.74       171
weighted avg       0.79      0.77      0.76       171



In [27]:
clf5 = DecisionTreeClassifier(max_depth=5, random_state=123)
clf5 = clf5.fit(x_train, y_train)
y5_pred = clf5.predict(x_train)

In [28]:
print(classification_report(y_train, y5_pred))

              precision    recall  f1-score   support

           0       0.84      0.94      0.89       237
           1       0.89      0.74      0.81       161

    accuracy                           0.86       398
   macro avg       0.87      0.84      0.85       398
weighted avg       0.86      0.86      0.86       398



In [29]:
y5_val = clf5.predict(x_validate)
print(classification_report(y_validate, y5_val))

              precision    recall  f1-score   support

           0       0.78      0.89      0.83       102
           1       0.80      0.62      0.70        69

    accuracy                           0.78       171
   macro avg       0.79      0.76      0.77       171
weighted avg       0.79      0.78      0.78       171



In [30]:
# Export the fitted max_depth=4 tree as Graphviz DOT text and render it to a PDF
# (view=True also asks graphviz to open the rendered file).
# NOTE(review): this reuses the file name of the earlier clf3 render, so the
# depth-3 PDF is overwritten; the name also says "tips" for a Titanic tree.
dot_data = export_graphviz(clf4, feature_names= x_train.columns, rounded=True, filled=True, out_file=None)
graph = graphviz.Source(dot_data) 

graph.render('tips_decision_tree', view=True, format="pdf")

'tips_decision_tree.pdf'

## Start of Random Forest

In [31]:
# Which model performs better on your in-sample data?
# For in-sample data, increasing the max depth will generally lead to a higher in-sample accuracy
# (0.84 at depth 3, 0.85 at depth 4, 0.86 at depth 5 here).

In [32]:
# Which model performs best on your out-of-sample data, the validate set?
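# Max depths of 3 and 5 tied on validate accuracy (0.78, versus 0.77 for depth 4); depth 5 had the
# higher weighted F1 (0.78 versus 0.76), so it is the best of the three on the validate data set.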

In [33]:
# Fit the Random Forest classifier to your training sample and transform 
# (i.e. make predictions on the training sample) 
# setting the random_state accordingly and setting min_samples_leaf = 1 and max_depth = 10.
# NOTE: the forests fit below use min_samples_leaf = 2, not the 1 the prompt asks for.
from sklearn.ensemble import RandomForestClassifier

In [34]:
rf10 = RandomForestClassifier(max_depth = 10, min_samples_leaf = 2, random_state=123)
rf10 = rf10.fit(x_train, y_train)
y10_pred = rf10.predict(x_train)

In [35]:
def gimme_them_weights(rf, x_train):
    """Pair each column of ``x_train`` with the matching feature importance of ``rf``.

    Parameters
    ----------
    rf : object exposing a ``feature_importances_`` array
        Presumably a fitted forest/tree estimator.
    x_train : pandas.DataFrame
        Frame whose column order is assumed to match the order the estimator
        was fit with -- TODO confirm at the call site.

    Returns
    -------
    dict
        Column name -> importance rounded to two decimals.
    """
    importances = rf.feature_importances_
    names = x_train.columns.tolist()
    # Round plain Python floats so the result prints cleanly.
    rounded = [round(weight, 2) for weight in importances.tolist()]
    return {name: rounded[position] for position, name in enumerate(names)}

In [36]:
gimme_them_weights(rf10, x_train)

{'pclass': 0.08,
 'sex': 0.32,
 'age': 0.19,
 'sibsp': 0.02,
 'parch': 0.04,
 'fare': 0.18,
 'alone': 0.01,
 'baseline': 0.0,
 'embarked_C': 0.03,
 'embarked_Q': 0.0,
 'embarked_S': 0.01,
 'class_First': 0.04,
 'class_Second': 0.01,
 'class_Third': 0.07}

In [37]:
print(classification_report(y_train, y10_pred))

              precision    recall  f1-score   support

           0       0.88      0.96      0.92       237
           1       0.94      0.81      0.87       161

    accuracy                           0.90       398
   macro avg       0.91      0.89      0.90       398
weighted avg       0.90      0.90      0.90       398



In [38]:
confusion_matrix(y_train, y10_pred)

array([[228,   9],
       [ 30, 131]])

In [39]:
rf6 = RandomForestClassifier(max_depth = 6, min_samples_leaf = 2, random_state=123)
rf6 = rf6.fit(x_train, y_train)
y6_pred = rf6.predict(x_train)

In [40]:
gimme_them_weights(rf6, x_train)

{'pclass': 0.1,
 'sex': 0.35,
 'age': 0.14,
 'sibsp': 0.03,
 'parch': 0.03,
 'fare': 0.16,
 'alone': 0.02,
 'baseline': 0.0,
 'embarked_C': 0.03,
 'embarked_Q': 0.0,
 'embarked_S': 0.02,
 'class_First': 0.04,
 'class_Second': 0.02,
 'class_Third': 0.07}

In [41]:
print(classification_report(y_train, y6_pred))

              precision    recall  f1-score   support

           0       0.85      0.96      0.90       237
           1       0.92      0.76      0.83       161

    accuracy                           0.88       398
   macro avg       0.89      0.86      0.87       398
weighted avg       0.88      0.88      0.87       398



In [42]:
confusion_matrix(y_train, y6_pred)

array([[227,  10],
       [ 39, 122]])

In [43]:
# Which model performs better on your in-sample data?
# The model with max depth of 10 performed better on in sample data than the model with a max depth of 6

In [44]:
y10_val = rf10.predict(x_validate)
print(classification_report(y_validate, y10_val))

              precision    recall  f1-score   support

           0       0.77      0.89      0.83       102
           1       0.79      0.61      0.69        69

    accuracy                           0.78       171
   macro avg       0.78      0.75      0.76       171
weighted avg       0.78      0.78      0.77       171



In [45]:
y6_val = rf6.predict(x_validate)
print(classification_report(y_validate, y6_val))

              precision    recall  f1-score   support

           0       0.77      0.93      0.84       102
           1       0.85      0.58      0.69        69

    accuracy                           0.79       171
   macro avg       0.81      0.76      0.77       171
weighted avg       0.80      0.79      0.78       171



## Start of KNN model

In [46]:
# Which model performs best on your out-of-sample data, the validate set?
# The model with a max depth of 6 performed better than the model with a max depth of 10

In [47]:
# Fit a K-Nearest Neighbors classifier to your training sample and transform 
# (i.e. make predictions on the training sample)
from sklearn.neighbors import KNeighborsClassifier

In [48]:
kn = KNeighborsClassifier(n_neighbors=5, weights='uniform')
kn = kn.fit(x_train, y_train)
kn_pred = kn.predict(x_train)

In [49]:
print(classification_report(y_train, kn_pred))

              precision    recall  f1-score   support

           0       0.80      0.86      0.83       237
           1       0.78      0.69      0.73       161

    accuracy                           0.79       398
   macro avg       0.79      0.78      0.78       398
weighted avg       0.79      0.79      0.79       398



In [50]:
print(confusion_matrix(y_train, kn_pred))

[[205  32]
 [ 50 111]]


In [51]:
# Run through steps 2-4 setting k to 10

kn10 = KNeighborsClassifier(n_neighbors=10, weights='uniform')
kn10 = kn10.fit(x_train, y_train)
kn10_pred = kn10.predict(x_train)
print(classification_report(y_train, kn10_pred))

              precision    recall  f1-score   support

           0       0.76      0.88      0.81       237
           1       0.76      0.58      0.66       161

    accuracy                           0.76       398
   macro avg       0.76      0.73      0.74       398
weighted avg       0.76      0.76      0.75       398



In [52]:
print(confusion_matrix(y_train, kn10_pred))

[[208  29]
 [ 67  94]]


In [53]:
# Run through steps 2-4 setting k to 20

kn20 = KNeighborsClassifier(n_neighbors=20, weights='uniform')
kn20 = kn20.fit(x_train, y_train)
kn20_pred = kn20.predict(x_train)
print(classification_report(y_train, kn20_pred))

              precision    recall  f1-score   support

           0       0.74      0.86      0.80       237
           1       0.73      0.55      0.63       161

    accuracy                           0.74       398
   macro avg       0.73      0.71      0.71       398
weighted avg       0.74      0.74      0.73       398



In [54]:
print(confusion_matrix(y_train, kn20_pred))

[[204  33]
 [ 72  89]]


In [55]:
# What are the differences in the evaluation metrics? Which performs better on your in-sample data? Why?
# When k was equal to 5 it yielded the most accurate results for our in sample data, though the 
# differences in accuracy were small.

In [56]:
# Which model performs best on our out-of-sample data from validate?
# When k was equal to 20, it performed best on our validate data

kn_val_pred = kn.predict(x_validate)
print(classification_report(y_validate, kn_val_pred))

              precision    recall  f1-score   support

           0       0.68      0.72      0.70       102
           1       0.54      0.49      0.52        69

    accuracy                           0.63       171
   macro avg       0.61      0.60      0.61       171
weighted avg       0.62      0.63      0.62       171



In [57]:
kn10_val_pred = kn10.predict(x_validate)
print(classification_report(y_validate, kn10_val_pred))

              precision    recall  f1-score   support

           0       0.68      0.84      0.75       102
           1       0.64      0.42      0.51        69

    accuracy                           0.67       171
   macro avg       0.66      0.63      0.63       171
weighted avg       0.67      0.67      0.66       171



In [58]:
kn20_val_pred = kn20.predict(x_validate)
print(classification_report(y_validate, kn20_val_pred))

              precision    recall  f1-score   support

           0       0.69      0.85      0.76       102
           1       0.67      0.43      0.53        69

    accuracy                           0.68       171
   macro avg       0.68      0.64      0.64       171
weighted avg       0.68      0.68      0.67       171



## Start of Logistic Regression

In [59]:
titanic = get_titanic_data()
titanic.drop(columns = ['deck'], inplace = True)
titanic.dropna(how = 'any', inplace = True)
titanic['sex'] = (titanic['sex'] == 'female')
dummies_embarked = pd.get_dummies(titanic[['embarked']])
dummies_class = pd.get_dummies(titanic[['class']])
titanic.drop(columns = ['embarked', 'passenger_id', 'class', 'embark_town'], inplace = True)
titanic = pd.concat([titanic, dummies_embarked, dummies_class], axis=1)

In [60]:
train, test = train_test_split(titanic, test_size=.2, random_state=123, stratify=titanic['survived'])
train, validate = train_test_split(train, test_size=.3, random_state=123, stratify=train['survived'])

In [61]:
x_train = train.drop(columns = ['survived'])
y_train = train.survived

x_validate = validate.drop(columns = ['survived'])
y_validate = validate.survived

x_test = test.drop(columns = ['survived'])
y_test = test.survived

In [62]:
def gimme_them_coef(logit2, x_train):
    """Pair each column of ``x_train`` with the matching coefficient of ``logit2``.

    Parameters
    ----------
    logit2 : object exposing a 2-D ``coef_`` array
        Only the first row (``coef_[0]``) is used.
    x_train : pandas.DataFrame
        Frame whose column order is assumed to match the order the model
        was fit with -- TODO confirm at the call site.

    Returns
    -------
    dict
        Column name -> coefficient rounded to two decimals.
    """
    first_row = logit2.coef_[0]
    names = x_train.columns.tolist()
    # Round plain Python floats so the result prints cleanly.
    rounded = [round(coefficient, 2) for coefficient in first_row.tolist()]
    return {name: rounded[position] for position, name in enumerate(names)}

In [63]:
# Create a model that includes age in addition to fare and pclass. 
# Does this model perform better than your baseline?
# Baseline = 59.6%

from sklearn.linear_model import LogisticRegression
logit2 = LogisticRegression(C=10, random_state=123)
logit2 = logit2.fit(x_train, y_train)

### MODEL MADE FROM DROPPING NULL AGES

In [64]:
y_predict = logit2.predict(x_train)
print(classification_report(y_train, y_predict))

              precision    recall  f1-score   support

           0       0.81      0.88      0.84       237
           1       0.80      0.69      0.74       161

    accuracy                           0.80       398
   macro avg       0.80      0.79      0.79       398
weighted avg       0.80      0.80      0.80       398



In [65]:
gimme_them_coef(logit2, x_train)

{'pclass': -0.48,
 'sex': 2.43,
 'age': -0.03,
 'sibsp': -0.32,
 'parch': -0.04,
 'fare': 0.0,
 'alone': -0.3,
 'embarked_C': 0.57,
 'embarked_Q': -0.26,
 'embarked_S': 0.17,
 'class_First': 0.74,
 'class_Second': 0.43,
 'class_Third': -0.69}

In [66]:
y_val = logit2.predict(x_validate)
print(classification_report(y_validate, y_val))

              precision    recall  f1-score   support

           0       0.77      0.92      0.84       102
           1       0.84      0.59      0.69        69

    accuracy                           0.79       171
   macro avg       0.80      0.76      0.77       171
weighted avg       0.80      0.79      0.78       171



In [67]:
titanic = get_titanic_data()
titanic.drop(columns = ['deck'], inplace = True)
titanic.dropna(how = 'any', inplace = True)
titanic['sex'] = (titanic['sex'] == 'female')
titanic.drop(columns = ['embarked', 'passenger_id', 'sibsp', 'class', 'embark_town', 'alone', 'parch'], inplace = True)

In [68]:
train, test = train_test_split(titanic, test_size=.2, random_state=123, stratify=titanic['survived'])
train, validate = train_test_split(train, test_size=.3, random_state=123, stratify=train['survived'])

In [69]:
x_train = train.drop(columns = ['survived'])
y_train = train.survived

x_validate = validate.drop(columns = ['survived'])
y_validate = validate.survived

x_test = test.drop(columns = ['survived'])
y_test = test.survived

### MODEL MADE FROM JUST FARE, SEX, AGE(DROPPED NA), AND CLASS

In [70]:
logit2 = LogisticRegression(C=10, random_state=123)
logit2 = logit2.fit(x_train, y_train)

In [71]:
y_predict = logit2.predict(x_train)
print(classification_report(y_train, y_predict))

              precision    recall  f1-score   support

           0       0.82      0.85      0.83       237
           1       0.76      0.72      0.74       161

    accuracy                           0.80       398
   macro avg       0.79      0.78      0.79       398
weighted avg       0.80      0.80      0.80       398



In [72]:
gimme_them_coef(logit2, x_train)

{'pclass': -1.32, 'sex': 2.4, 'age': -0.03, 'fare': 0.0}

In [73]:
y_val = logit2.predict(x_validate)
print(classification_report(y_validate, y_val))

              precision    recall  f1-score   support

           0       0.79      0.87      0.83       102
           1       0.78      0.65      0.71        69

    accuracy                           0.78       171
   macro avg       0.78      0.76      0.77       171
weighted avg       0.78      0.78      0.78       171



In [82]:
# Reload the raw data and rebuild the feature frame, this time keeping the rows
# with a missing age by filling them with the column mean.
titanic = get_titanic_data()
titanic.drop(columns = ['deck'], inplace = True)
# Assign the filled column back instead of calling fillna(inplace=True) on
# titanic['age']: that chained in-place call does not update the frame under
# pandas Copy-on-Write, and the warning about it is hidden by the
# filterwarnings("ignore") at the top of the notebook.
# NOTE(review): the mean is computed over all rows before the train/validate/test
# split, so validate/test ages influence the imputed value.
titanic['age'] = titanic['age'].fillna(titanic['age'].mean())
# Only the rows with a missing embarked/embark_town value are left to drop here.
titanic.dropna(how = 'any', inplace = True)
titanic['sex'] = (titanic['sex'] == 'female')
dummies_embarked = pd.get_dummies(titanic[['embarked']])
dummies_class = pd.get_dummies(titanic[['class']])
titanic.drop(columns = ['embarked', 'passenger_id', 'class', 'embark_town'], inplace = True)
titanic = pd.concat([titanic, dummies_embarked, dummies_class], axis=1)

In [83]:
train, test = train_test_split(titanic, test_size=.2, random_state=123, stratify=titanic['survived'])
train, validate = train_test_split(train, test_size=.3, random_state=123, stratify=train['survived'])

In [84]:
x_train = train.drop(columns = ['survived'])
y_train = train.survived

x_validate = validate.drop(columns = ['survived'])
y_validate = validate.survived

x_test = test.drop(columns = ['survived'])
y_test = test.survived

### MODEL MADE WITH IMPUTED AGE VALUES

In [85]:
logit2 = LogisticRegression(C=1000, random_state=123)
logit2 = logit2.fit(x_train, y_train)

In [86]:
y_predict = logit2.predict(x_train)
print(classification_report(y_train, y_predict))

              precision    recall  f1-score   support

           0       0.82      0.87      0.84       307
           1       0.77      0.69      0.73       190

    accuracy                           0.80       497
   macro avg       0.79      0.78      0.78       497
weighted avg       0.80      0.80      0.80       497



In [87]:
gimme_them_coef(logit2, x_train)

{'pclass': -0.24,
 'sex': 2.6,
 'age': -0.03,
 'sibsp': -0.58,
 'parch': -0.3,
 'fare': 0.0,
 'alone': -1.0,
 'embarked_C': 0.04,
 'embarked_Q': 0.5,
 'embarked_S': 0.18,
 'class_First': 1.08,
 'class_Second': 0.25,
 'class_Third': -0.61}

In [88]:
y_val = logit2.predict(x_validate)
print(classification_report(y_validate, y_val))

              precision    recall  f1-score   support

           0       0.82      0.87      0.85       132
           1       0.77      0.70      0.73        82

    accuracy                           0.80       214
   macro avg       0.80      0.78      0.79       214
weighted avg       0.80      0.80      0.80       214



In [89]:
# Choose your best model from the validation performance, and evaluate it on the test dataset.
# How do the performance metrics compare to validate? to train?
# Performance of the best model (logistic regression with mean-imputed age) was nearly the same
# across all data sets: accuracy 0.80 on train, 0.80 on validate, and 0.81 on test.

y_test_pre = logit2.predict(x_test)
print(classification_report(y_test, y_test_pre))

              precision    recall  f1-score   support

           0       0.85      0.85      0.85       110
           1       0.75      0.75      0.75        68

    accuracy                           0.81       178
   macro avg       0.80      0.80      0.80       178
weighted avg       0.81      0.81      0.81       178



## Dropping age test

In [90]:
# Reload the raw data and rebuild the feature frame without the age column.
titanic = get_titanic_data()
titanic.drop(columns = ['deck'], inplace = True)
# Age is filled before dropna so that rows with a missing age are kept; the age
# column itself is removed further down. The filled column is assigned back
# instead of using fillna(inplace=True) on titanic['age'], because that chained
# in-place call does not update the frame under pandas Copy-on-Write (and the
# warning is hidden by filterwarnings("ignore")), which would silently drop
# every row with a missing age in the next step.
titanic['age'] = titanic['age'].fillna(titanic['age'].mean())
titanic.dropna(how = 'any', inplace = True)
titanic['sex'] = (titanic['sex'] == 'female')
dummies_embarked = pd.get_dummies(titanic[['embarked']])
dummies_class = pd.get_dummies(titanic[['class']])
titanic.drop(columns = ['embarked', 'passenger_id', 'age', 'class', 'embark_town'], inplace = True)
titanic = pd.concat([titanic, dummies_embarked, dummies_class], axis=1)

In [91]:
train, test = train_test_split(titanic, test_size=.2, random_state=123, stratify=titanic['survived'])
train, validate = train_test_split(train, test_size=.3, random_state=123, stratify=train['survived'])

In [92]:
x_train = train.drop(columns = ['survived'])
y_train = train.survived

x_validate = validate.drop(columns = ['survived'])
y_validate = validate.survived

x_test = test.drop(columns = ['survived'])
y_test = test.survived

In [93]:
logit2 = LogisticRegression(C=1000, random_state=123)
logit2 = logit2.fit(x_train, y_train)

In [94]:
y_predict = logit2.predict(x_train)
print(classification_report(y_train, y_predict))

              precision    recall  f1-score   support

           0       0.82      0.85      0.84       307
           1       0.75      0.71      0.73       190

    accuracy                           0.80       497
   macro avg       0.79      0.78      0.78       497
weighted avg       0.80      0.80      0.80       497



In [95]:
y_val = logit2.predict(x_validate)
print(classification_report(y_validate, y_val))

              precision    recall  f1-score   support

           0       0.82      0.86      0.84       132
           1       0.75      0.70      0.72        82

    accuracy                           0.79       214
   macro avg       0.78      0.78      0.78       214
weighted avg       0.79      0.79      0.79       214



In [None]:
# How do different strategies for handling the missing values in the age column affect model performance?
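# Surprisingly, the strategy for handling missing ages made very little difference: validate accuracy
# was 0.79 when rows with null ages were dropped, 0.80 with mean-imputed ages, and 0.79 with the
# age column removed entirely.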