# Building Predictive Models

In [1]:
import os
import pandas as pd
import numpy as np

## Data Import

In [2]:
# Import processed data
processed_data_path = os.path.join(os.path.pardir, 'data', 'processed')
train_file_path = os.path.join(processed_data_path, 'train.csv')
test_file_path = os.path.join(processed_data_path, 'test.csv')
    
train_df = pd.read_csv(train_file_path, index_col = 'PassengerId')
test_df = pd.read_csv(test_file_path, index_col = 'PassengerId')

In [3]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 1 to 891
Data columns (total 34 columns):
Survived              891 non-null int64
Age                   891 non-null float64
Fare                  891 non-null float64
FamilySize            891 non-null int64
IsMother              891 non-null int64
IsMale                891 non-null int64
Deck_A                891 non-null int64
Deck_B                891 non-null int64
Deck_C                891 non-null int64
Deck_D                891 non-null int64
Deck_E                891 non-null int64
Deck_F                891 non-null int64
Deck_G                891 non-null int64
Deck_Z                891 non-null int64
Pclass_1              891 non-null int64
Pclass_2              891 non-null int64
Pclass_3              891 non-null int64
Title_Lady            891 non-null int64
Title_Master          891 non-null int64
Title_Miss            891 non-null int64
Title_Mr              891 non-null int64
Title_Mrs             891 non-

In [4]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 418 entries, 892 to 1309
Data columns (total 33 columns):
Age                   418 non-null float64
Fare                  418 non-null float64
FamilySize            418 non-null int64
IsMother              418 non-null int64
IsMale                418 non-null int64
Deck_A                418 non-null int64
Deck_B                418 non-null int64
Deck_C                418 non-null int64
Deck_D                418 non-null int64
Deck_E                418 non-null int64
Deck_F                418 non-null int64
Deck_G                418 non-null int64
Deck_Z                418 non-null int64
Pclass_1              418 non-null int64
Pclass_2              418 non-null int64
Pclass_3              418 non-null int64
Title_Lady            418 non-null int64
Title_Master          418 non-null int64
Title_Miss            418 non-null int64
Title_Mr              418 non-null int64
Title_Mrs             418 non-null int64
Title_Office          418 n

## Data Preparation

In [5]:
# We will be using uppercase X because the variable is a matrix. y is a 1-D array (vector), so we use lowercase
X = train_df.loc[:,'Age':].values.astype('float')
y = train_df['Survived'].ravel()

In [9]:
print(X.shape, y.shape)

(891, 33) (891,)


In [10]:
#train-test split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(712, 33) (712,)
(179, 33) (179,)


In [11]:
# Average survival in train and test datasets
print(f'mean survival in train: {y_train.mean():.3f}')
print(f'mean survival in test: {np.mean(y_test):.3f}')

mean survival in train: 0.383
mean survival in test: 0.385


In [12]:
# We can see that both test and train data have equal distribution of positive cases. 
# Also there is 39% positive result. This is almost balanced. If the conversion percentage is too small (2-5%) we will follow a slightly different approach

## Baseline Model

In [13]:
# Instantiate a data model
from sklearn.dummy import DummyClassifier
model_dummy = DummyClassifier(strategy='most_frequent', random_state=0)

In [14]:
# Train the data model
model_dummy.fit(X_train, y_train);

In [15]:
# determine the model performance (accuracy is the default)
print(f'score for baseline model: {model_dummy.score(X_test, y_test): .2f}')

score for baseline model:  0.61


In [17]:
# The base model as an accuracy of 0.61, meaning if we predict Survived values of 0 for all records,
# we will be 61% correct. Our predictive model musyt have a higher accuracy. Let's get more performance metrics
from sklearn.metrics import accuracy_score, recall_score, precision_score, confusion_matrix

print(f'Accuracy for the baseline model: {accuracy_score(y_test, model_dummy.predict(X_test)):.2f}')
print(f'precision for the baseline model: {precision_score(y_test, model_dummy.predict(X_test)):.2f}')
print(f'Recall for the baseline model: {recall_score(y_test, model_dummy.predict(X_test)):.2f}')
print(f'Confusion matrix for the baseline model: \n {confusion_matrix(y_test, model_dummy.predict(X_test))}')

Accuracy for the baseline model: 0.61
precision for the baseline model: 0.00
Recall for the baseline model: 0.00
Confusion matrix for the baseline model: 
 [[110   0]
 [ 69   0]]


## First Kaggle Submission

In [18]:
# Now that we have measured our base model using the split of our original train data, 
# let's use the base model to make predictions for the actual test data and submit to Kaggle to assess the performance

# convert to matrix
test_X = test_df.values.astype('float')

In [19]:
#get predictions of the generated matrix
predictions = model_dummy.predict(test_X)

In [20]:
# Generate a dataframe object for submission
submission_df = pd.DataFrame({'PassengerId': test_df.index, 'Survived': predictions})

In [21]:
submission_df.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,0


In [22]:
submission_data_path = os.path.join(os.path.pardir, 'data', 'external')
submission_data_file = os.path.join(submission_data_path, '01_dummy.csv')

# setting index to False ensures no additional index column is added to df.
submission_df.to_csv(submission_data_file, index=False)

In [23]:
# Putting all the steps of submission file creation into a single function
def get_submission_file(model, filename):
    test_X = test_df.values.astype('float')
    
    predictions = model.predict(test_X)
    submission_df = pd.DataFrame({'PassengerId': test_df.index, 'Survived': predictions})
    
    submission_data_path = os.path.join(os.path.pardir, 'data', 'external')
    submission_data_file = os.path.join(submission_data_path, filename)
    submission_df.to_csv(submission_data_file, index=False)

In [24]:
# Call the function t create the submission file
get_submission_file(model_dummy, '01_dummy.csv')

## Logistic Regression Model

In [26]:
# Let's create our predictive model using the logistic Regression Model and compare its performance with that of base model
from sklearn.linear_model import LogisticRegression
# create model
model_lr_1 = LogisticRegression(random_state=0)

#train model
model_lr_1.fit(X_train, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=0, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [27]:
#Evaluate model
print(f'Score for logistic regression version1: {model_lr_1.score(X_test, y_test): .2f}')

Score for logistic regression version1:  0.83


In [28]:
# This is an improvement over the base model accuracy of 0.61. Let's get more metrics
print(f'Accuracy for the baseline model: {accuracy_score(y_test, model_lr_1.predict(X_test)):.2f}')
print(f'precision for the baseline model: {precision_score(y_test, model_lr_1.predict(X_test)):.2f}')
print(f'Recall for the baseline model: {recall_score(y_test, model_lr_1.predict(X_test)):.2f}')
print(f'Confusion matrix for the baseline model: {confusion_matrix(y_test, model_lr_1.predict(X_test))}')

Accuracy for the baseline model: 0.83
precision for the baseline model: 0.78
Recall for the baseline model: 0.78
Confusion matrix for the baseline model: [[95 15]
 [15 54]]


In [29]:
# model coefficients
model_lr_1.coef_

array([[-0.02843207,  0.00455984, -0.49918331,  0.61664398, -0.83390092,
         0.12732472, -0.16977068, -0.39917376,  0.52202281,  1.1017798 ,
         0.40511429, -0.18462285, -0.3032014 ,  0.96090838,  0.4831779 ,
        -0.34461334,  0.28004258,  1.2400795 ,  0.57218845, -1.41524966,
         1.08114281, -0.19503338, -0.00245443, -0.46124295,  0.16186854,
         0.24686171,  0.27844768,  0.41229499,  0.49073604,  0.46124492,
         0.14749197,  0.37010802,  0.7293649 ]])

In [30]:
# Second (improved) Kaggle submission file
get_submission_file(model_dummy, '01_lr.csv')

## Part2

### Hyperparameter Optimization

In [35]:
# base model
model_lr = LogisticRegression(random_state=0, solver='liblinear')

In [36]:
from sklearn.model_selection import GridSearchCV

In [37]:
parameters = {'C': [1.0, 10.0, 50.0, 100.0, 1000.0], 'penalty': ['l1', 'l2']}
clf = GridSearchCV(model_lr, param_grid=parameters, cv=3)


clf.fit(X_train, y_train);

In [38]:
clf.best_params_

{'C': 1.0, 'penalty': 'l1'}

In [39]:
print(f'Best score: {clf.best_score_: .2f}')

Best score:  0.83


In [40]:
print(f'Score for logistic regression version2: {clf.score(X_test, y_test): .2f}')

Score for logistic regression version2:  0.83


### Normalization and Standardization

In [53]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler

#### Feature Normalization

In [54]:
# Create a scaler object(using the MinMaxScaler class), fit and normalize the train data
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)

In [55]:
# Let's confirm that the feature are actually normalized with a max of 1 and min value of 0
X_train_scaled[:,0].min(), X_train_scaled[:,0].max()

(0.0, 1.0)

In [56]:
# Normalise test data
X_test_scaled = scaler.transform(X_test)

#### Feature Standardization

In [62]:
# Create a scaler object(using the StabdardScaler class), fit and normalize the train data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

#### Create a model from the standardized and normalized data

In [63]:
# The model is created from the normalized and standardized data. The parameters are also optimized using GridsearchCV
model_lr = LogisticRegression(random_state=0, solver='liblinear')
parameters = {'C': [1.0, 10.0, 50.0, 100.0, 1000.0], 'penalty': ['l1', 'l2']}
clf = GridSearchCV(model_lr, param_grid=parameters, cv=3)
clf.fit(X_train_scaled, y_train);

In [64]:
clf.best_score_

0.8089887640449438

In [65]:
print(f'Score for logistic regression after standardisation: {clf.score(X_test_scaled, y_test): .2f}')

Score for logistic regression after standardisation:  0.84


### Model Persistence

In [66]:
import pickle

In [67]:
model_file_path = os.path.join(os.path.pardir, 'models', 'lr_model.pkl')
scaler_file_path = os.path.join(os.path.pardir, 'models', 'lr_scaler.pkl')

In [68]:
with open(model_file_path, 'wb') as model_file_pickle:
    pickle.dump(clf, model_file_pickle)

In [69]:
with open(scaler_file_path, 'wb') as scaler_file_pickle:
    pickle.dump(clf, scaler_file_pickle)