# Model Training and Predictions 

In this section, we will try to fit a classification model to predict whether a passenger would survive the Titanic

In [4]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn

In [5]:
train = pd.read_csv("../data/interim/train_processed.csv")
validation = pd.read_csv("../data/interim/test_processed.csv")
train.set_index('PassengerId', inplace = True)
validation.set_index('PassengerId', inplace = True)

In [6]:
train.columns

Index(['Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket',
       'Fare', 'Cabin', 'Embarked', 'filledAge', 'Title', 'sex_female',
       'sex_male', 'Pclass_1', 'Pclass_2', 'Pclass_3', 'Embarked_C',
       'Embarked_Q', 'Embarked_S', 'Title_Capt', 'Title_Col', 'Title_Don',
       'Title_Dr', 'Title_Jonkheer', 'Title_Lady', 'Title_Major',
       'Title_Master', 'Title_Miss', 'Title_Mlle', 'Title_Mme', 'Title_Mr',
       'Title_Mrs', 'Title_Ms', 'Title_Rev', 'Title_Sir',
       'Title_the Countess'],
      dtype='object')

In [7]:
selectedCols = ['Pclass_1','Pclass_2','SibSp', 'Parch','Fare', 'filledAge', 
                'sex_female','Embarked_C', 'Embarked_Q','Title_Capt', 'Title_Col',
                'Title_Don','Title_Dr', 'Title_Jonkheer', 'Title_Lady', 'Title_Major',
               'Title_Master', 'Title_Miss', 'Title_Mlle', 'Title_Mme',
               'Title_Mrs', 'Title_Ms', 'Title_Rev', 'Title_Sir',
               'Title_the Countess']
# Include all titles except Mr

In [8]:
y = train.Survived
X = train[selectedCols]

In [46]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = \
train_test_split(X, y, test_size=0.25)

### Train and Optimize Logistic Regression

In [47]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(C = 10, solver = 'liblinear')


In [48]:
lr.fit(X_train, y_train)

LogisticRegression(C=10, solver='liblinear')

In [49]:
y_train_pred = lr.predict(X_train)
y_test_pred = lr.predict(X_test)

In [50]:
# Get accuracy score
score_train = lr.score(X_train, y_train)
print("Accuracy score of training data is %.2f"%score_train)
score = lr.score(X_test, y_test)
print("Accuracy score for test dataset is %.2f"%score)

Accuracy score of training data is 0.84
Accuracy score for test dataset is 0.81


In [51]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import metrics

In [52]:
cm_train = metrics.confusion_matrix(y_train, y_train_pred)
print(cm_train)

[[361  43]
 [ 64 200]]


In [53]:
cm_test = metrics.confusion_matrix(y_test, y_test_pred)
print(cm_test)

[[117  28]
 [ 14  64]]


### Train and evaluate XGBoost

In [54]:
from sklearn import  metrics, model_selection
from xgboost import XGBClassifier

In [170]:
params = {
'booster' : 'gbtree',
'objective': 'binary:logitraw',

'max_depth': 1,

'learning_rate': 1,

'n_estimators':150,
'lambda' : 10, 
'alpha' : 0.1
}

xgb = XGBClassifier(**params)

In [171]:
xgb.fit(X_train, y_train)



XGBClassifier(alpha=0.1, base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='', lambda=10,
              learning_rate=1, max_delta_step=0, max_depth=1,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=150, n_jobs=4, num_parallel_tree=1,
              objective='binary:logitraw', random_state=0,
              reg_alpha=0.100000001, reg_lambda=10, scale_pos_weight=1,
              subsample=1, tree_method='exact', validate_parameters=1,
              verbosity=None)

In [172]:
# Predictions
y_train_pred = xgb.predict(X_train)
y_test_pred = xgb.predict(X_test)

# how did our model perform?
count_misclassified = (y_test != y_test_pred).sum()
print('Misclassified samples: {}'.format(count_misclassified))

score_train = metrics.accuracy_score(y_train, y_train_pred)
print("Accuracy score of training data is %.2f"%score_train)
score_test = metrics.accuracy_score(y_test, y_test_pred)
print("Accuracy score for test dataset is %.2f"%score_test)


Misclassified samples: 35
Accuracy score of training data is 0.86
Accuracy score for test dataset is 0.84


### Evaluate performance on validation set 

In [173]:
X_valid = validation[selectedCols]

In [174]:
validation.columns

Index(['Unnamed: 0', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch',
       'Ticket', 'Fare', 'Cabin', 'Embarked', 'filledAge', 'Title',
       'sex_female', 'sex_male', 'Pclass_1', 'Pclass_2', 'Pclass_3',
       'Embarked_C', 'Embarked_Q', 'Embarked_S', 'Title_Col', 'Title_Don',
       'Title_Dr', 'Title_Master', 'Title_Miss', 'Title_Mr', 'Title_Mrs',
       'Title_Ms', 'Title_Rev', 'Title_Capt', 'Title_Jonkheer', 'Title_Lady',
       'Title_Major', 'Title_Mlle', 'Title_Mme', 'Title_Sir',
       'Title_the Countess', 'Survived'],
      dtype='object')

In [178]:
submission_pred_xgb = xgb.predict(X_valid)
submission_pred_lr = lr.predict(X_valid)

In [176]:
validation['Survived'] = submission_pred_xgb

In [179]:
validation['Survived'].to_csv("../data/final/submissionXGB.csv")

In [180]:
validation['Survived'] = submission_pred_lr
validation['Survived'].to_csv("../data/final/submissionLR.csv")