In [3]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
import sklearn.linear_model

# ignore warnings
import warnings
warnings.filterwarnings("ignore")


import matplotlib.pyplot as plt
import seaborn as sns


import acquire as ac
import prepare as pr

1. Create a model that includes only age, fare, and pclass. Does this model perform better than your baseline?

In [4]:
# Acquire the Titanic passenger data through the project's acquire module
df = ac.get_titanic_data()
# Preview the first rows to check which columns came back
df.head()

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,deck,embark_town,alone
0,0,0,3,male,22.0,1,0,7.25,S,Third,,Southampton,0
1,1,1,1,female,38.0,1,0,71.2833,C,First,C,Cherbourg,0
2,2,1,3,female,26.0,0,0,7.925,S,Third,,Southampton,1
3,3,1,1,female,35.0,1,0,53.1,S,First,C,Southampton,0
4,4,0,3,male,35.0,0,0,8.05,S,Third,,Southampton,1


In [5]:
# Clean the frame with the project's prep helper. Judging by the preview below,
# passenger_id, embarked, class and deck are no longer present afterwards.
# NOTE(review): this overwrites df, so re-running the cell feeds already-prepped
# data back into prep_titanic_2 — confirm the helper tolerates that.
df = pr.prep_titanic_2(df)
df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embark_town,alone
0,0,3,male,22.0,1,0,7.25,Southampton,0
1,1,1,female,38.0,1,0,71.2833,Cherbourg,0
2,1,3,female,26.0,0,0,7.925,Southampton,1
3,1,1,female,35.0,1,0,53.1,Southampton,0
4,0,3,male,35.0,0,0,8.05,Southampton,1


In [6]:
# One-hot encode embark_town: one indicator column per port, none dropped,
# appended to the right of the existing columns.
dumiies = pd.get_dummies(df["embark_town"], drop_first=False)
df = pd.concat(objs=[df, dumiies], axis="columns")

In [7]:
# One-hot encode sex into indicator columns, then remove the original string
# columns now that both sex and embark_town have numeric encodings.
sex_dummy = pd.get_dummies(df["sex"], drop_first=False)
drop_cols = ["sex", "embark_town"]
df = pd.concat(objs=[df, sex_dummy], axis="columns").drop(columns=drop_cols)

In [8]:
# Split the encoded frame into train, validate and test sets with the project's
# split helper.
# NOTE(review): the meaning of the second argument ('titanic') is defined inside
# prepare.split_data, which is not shown here — confirm it is what the helper expects.
train, validate, test = pr.split_data(df, 'titanic')
train.head()

Unnamed: 0,survived,pclass,age,sibsp,parch,fare,alone,Cherbourg,Queenstown,Southampton,female,male
474,0,3,22.0,0,0,9.8375,1,0,0,1,1,0
370,1,1,25.0,1,0,55.4417,0,1,0,0,0,1
573,1,3,30.0,0,0,7.75,1,0,1,0,1,0
110,0,1,47.0,0,0,52.0,1,0,0,1,0,1
167,0,3,45.0,1,4,27.9,0,0,0,1,1,0


In [9]:
# Separate each split into a feature matrix (every column except the target)
# and a target vector (the survived column).
X_train, y_train = train.drop(columns="survived"), train["survived"]
X_validate, y_validate = validate.drop(columns="survived"), validate["survived"]
X_test, y_test = test.drop(columns="survived"), test["survived"]

# Sanity-check the shapes of the three feature matrices
X_train.shape, X_validate.shape, X_test.shape

((498, 11), (214, 11), (179, 11))

In [10]:
# Baseline: predict "did not survive" (0) for every passenger, so the baseline
# accuracy is simply the share of zeros in the training target.
baseline_accuracy = train["survived"].eq(0).mean()
round(baseline_accuracy, 2)

0.62

# Question 1: 
Create a model that includes only age, fare, and pclass. Does this model perform better than your baseline?

In [11]:
# Question 1 model: logistic regression restricted to age, pclass and fare
features = ["age", "pclass", "fare"]
logit = LogisticRegression(random_state=123).fit(X_train[features], y_train)

# The model was fitted on a feature subset, so predict on that same subset
y_pred = logit.predict(X_train[features])

print("Baseline is", round(baseline_accuracy, 2))
print("Logistic Regression using age, pclass, and fare features")
print(f"Accuracy of Logistic Regression classifier on training set: {logit.score(X_train[features], y_train):.2f}")

Baseline is 0.62
Logistic Regression using age, pclass, and fare features
Accuracy of Logistic Regression classifier on training set: 0.74


# Question 2
Include sex in your model as well. Note that you'll need to encode or create a dummy variable of this feature before including it in a model.

In [13]:
# Question 2 model: age, pclass and fare plus sex, which enters the model
# through the `male` dummy column.
features = ["age", "pclass", "fare", "male"]
logit1 = LogisticRegression(random_state=123).fit(X_train[features], y_train)

# Predict on the same feature subset the model was fitted on
y_pred = logit1.predict(X_train[features])

print("Logistic Regression using age, pclass, fare, and gender features")
print(f"Accuracy of Logistic Regression classifier on training set: {logit1.score(X_train[features], y_train):.2f}")

Logistic Regression using age, pclass, fare, and gender features
Accuracy of Logistic Regression classifier on training set: 0.80


# Question 3: 
Try out other combinations of features and models.

In [14]:
# Every available feature, with all hyperparameters left at their defaults
logit2 = LogisticRegression(random_state=123).fit(X_train, y_train)

y_pred = logit2.predict(X_train)

print("Model trained on all features")
print(f"Accuracy of Logistic Regression classifier on training set: {logit2.score(X_train, y_train):.2f}")

Model trained on all features
Accuracy of Logistic Regression classifier on training set: 0.81


In [15]:

# All features, with class_weight='balanced': scikit-learn weights each class
# inversely to its frequency in y_train, so the less common class is not
# drowned out by the more common one during fitting.
logit3 = LogisticRegression(random_state=123, class_weight='balanced')

logit3.fit(X_train, y_train)

y_pred = logit3.predict(X_train)

accuracy = logit3.score(X_train, y_train)

print("All Features and we're setting the class_weight hyperparameter")
# Use .2f (two decimal places); a bare .2 means two significant digits and
# would print 0.70 as "0.7", unlike the other accuracy reports in this notebook.
print(f'Accuracy of Logistic Regression classifier on training set: {accuracy:.2f}')

All Features and we're setting the class_weight hyperparameter
Accuracy of Logistic Regression classifier on training set: 0.81


In [16]:
# Single-feature model: age only, all hyperparameters left at their defaults
features = ["age"]

logit4 = LogisticRegression(random_state=123)

logit4.fit(X_train[features], y_train)

# Predict and score on the same single-column subset used for fitting
y_pred = logit4.predict(X_train[features])

accuracy = logit4.score(X_train[features], y_train)

# The label describes the model actually fitted in this cell (age only, no
# class_weight), and the accuracy is printed with two fixed decimals.
print("Age as the only feature, all default hyperparameters")
print(f'Accuracy of Logistic Regression classifier on training set: {accuracy:.2f}')

All Features and we're setting the class_weight hyperparameter
Accuracy of Logistic Regression classifier on training set: 0.62


In [17]:
# Single-feature model: pclass only, all hyperparameters left at their defaults
features = ["pclass"]

logit5 = LogisticRegression(random_state=123)

logit5.fit(X_train[features], y_train)

# Predict and score on the same single-column subset used for fitting
y_pred = logit5.predict(X_train[features])
accuracy = logit5.score(X_train[features], y_train)

# The label describes the model actually fitted in this cell (pclass only, no
# class_weight). .2f keeps two decimals; the previous .2 spec printed "0.7".
print("Pclass as the only feature, all default hyperparameters")
print(f'Accuracy of Logistic Regression classifier on training set: {accuracy:.2f}')

All Features and we're setting the class_weight hyperparameter
Accuracy of Logistic Regression classifier on training set: 0.7


In [18]:
# All features with very strong regularization: C is the inverse of the
# regularization strength, so a C this close to 0 shrinks the coefficients
# heavily. No class_weight is set for this model.
logit6 = LogisticRegression(random_state=123, C=0.0001)

logit6.fit(X_train, y_train)

y_pred = logit6.predict(X_train)

accuracy = logit6.score(X_train, y_train)

print("All Features, C hyperparameter approaching 0")
print("Baseline is", round(baseline_accuracy, 2))
# .2f prints two fixed decimals, so it lines up with the rounded baseline above
print(f'Accuracy of this Logistic Regression on training set: {accuracy:.2f}')

All Features, C hyperparameter approaching 0
Baseline is 0.62
Accuracy of this Logistic Regression on training set: 0.67


# Question 1: 
Choose your best model from the validation performance, and evaluate it on the test dataset. How do the performance metrics compare to validate? To train?

In [21]:

# Evaluate logit1 on the validate split, using the same four features it was
# fitted on.
features = ["age", "pclass", "fare", "male"]

y_pred = logit1.predict(X_validate[features])

# The label names the `male` dummy that is actually in `features`; the frame
# has no is_female column.
print('Logit1 model using age, pclass, fare, and male as the features')
print(classification_report(y_validate, y_pred))

Logit1 model using age, pclass, fare, and is_female as the features
              precision    recall  f1-score   support

           0       0.80      0.75      0.78       132
           1       0.64      0.71      0.67        82

    accuracy                           0.73       214
   macro avg       0.72      0.73      0.72       214
weighted avg       0.74      0.73      0.74       214



In [22]:
# Validate-split report for logit2 (every feature, default hyperparameters)
y_pred = logit2.predict(X_validate)

print(
    "Logit2 model using all features and all model defaults",
    classification_report(y_validate, y_pred),
    sep="\n",
)

Logit2 model using all features and all model defaults
              precision    recall  f1-score   support

           0       0.80      0.79      0.79       132
           1       0.67      0.68      0.67        82

    accuracy                           0.75       214
   macro avg       0.73      0.74      0.73       214
weighted avg       0.75      0.75      0.75       214



In [23]:
# Validate-split report for logit3 (every feature, class_weight='balanced')
y_pred = logit3.predict(X_validate)

print(
    "Logit3 model using all features, class_weight='balanced', and all other hyperparameters as default",
    classification_report(y_validate, y_pred),
    sep="\n",
)

Logit3 model using all features, class_weight='balanced', and all other hyperparameters as default
              precision    recall  f1-score   support

           0       0.82      0.71      0.76       132
           1       0.62      0.76      0.68        82

    accuracy                           0.73       214
   macro avg       0.72      0.73      0.72       214
weighted avg       0.75      0.73      0.73       214



In [25]:
# Demo with logit3: put the per-class probabilities for the training rows into
# a labelled frame. Rows are numbered 0..n-1 in X_train's row order; they do
# not carry X_train's index labels.
y_pred_proba = pd.DataFrame(
    logit3.predict_proba(X_train),
    columns=['not-survived', 'survived'],
)
y_pred_proba.head()


Unnamed: 0,not-survived,survived
0,0.29237,0.70763
1,0.232574,0.767426
2,0.298946,0.701054
3,0.62182,0.37818
4,0.647529,0.352471


In [26]:

# Use a custom decision threshold of 0.57 instead of 0.5: a passenger is
# classified as survived only when the predicted survival probability
# exceeds the threshold.
t = 0.57

y_pred = y_pred_proba["survived"].gt(t).astype(int)
y_pred.head()

0    1
1    1
2    1
3    0
4    0
Name: survived, dtype: int64

In [27]:
# Classification report on the training split for the thresholded predictions.
# y_pred is labelled 0..n-1 while y_train keeps the train index labels, so this
# comparison relies on both being in the same row order.
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

           0       0.84      0.87      0.85       307
           1       0.77      0.74      0.76       191

    accuracy                           0.82       498
   macro avg       0.81      0.80      0.80       498
weighted avg       0.82      0.82      0.82       498



In [29]:
# NOTE(review): the recorded output shows this import failing with
# ModuleNotFoundError. logistic_regression_util is presumably a local helper
# file that has to be importable from this notebook's directory — confirm it
# exists, and consider moving the import up to the main imports cell.
import logistic_regression_util

ModuleNotFoundError: No module named 'logistic_regression_util'

In [28]:


# Plot metrics vs thresholds for the training-set survival probabilities.
# NOTE(review): this depends on the logistic_regression_util import succeeding;
# the recorded output shows ModuleNotFoundError, so no plot was produced.
logistic_regression_util.plot_metrics_by_thresholds(y_train, y_pred_proba.survived)

ModuleNotFoundError: No module named 'logistic_regression_util'