# Modeling

In [44]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import re
import warnings
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
import xgboost as xgb
warnings.filterwarnings("ignore")

In [12]:
data = pd.read_csv('../step_data/feature_engineering.csv')
train_features = data.drop('Survived',axis=1)
train_labels = data['Survived']

* Setting Seed for reproducibility

In [9]:
SEED = 42

### Printing and Inspecting

* Data are already scaled, so we just need to feed some classifiers here

In [13]:
train_features.head()

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Sex_male,Embarked_Q,Embarked_S,Deck_B,Deck_C,Deck_D,Deck_E,Deck_F,Deck_G,Deck_T,Deck_U
0,1.0,0.293286,0.125,0.0,0.014151,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,0.0,0.510737,0.125,0.0,0.139136,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,0.347649,0.0,0.0,0.015469,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,0.0,0.469965,0.125,0.0,0.103644,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1.0,0.469965,0.0,0.0,0.015713,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [14]:
train_labels.head(20)

0     0.0
1     1.0
2     1.0
3     1.0
4     0.0
5     0.0
6     0.0
7     0.0
8     1.0
9     1.0
10    1.0
11    1.0
12    0.0
13    0.0
14    0.0
15    1.0
16    0.0
17    1.0
18    0.0
19    1.0
Name: Survived, dtype: float64

## Splitting on training and test set

In [60]:
X_train,X_test,y_train,y_test = train_test_split(train_features,
                                                 train_labels,
                                                 test_size=0.3,
                                                 stratify=labels,
                                                 random_state=SEED)

# K-NN

### Tuning

In [20]:
param_grid = {'n_neighbors':np.arange(1,10)} 

knn_temp = KNeighborsClassifier()                                      
knn_cv = GridSearchCV(knn_temp,param_grid,cv=3)

_ = knn_cv.fit(X_train,y_train)                                                 

print('Best n_neighbors '+str(knn_cv.best_params_),end='\n \n \n')          

print('Best score is '+ str(knn_cv.best_score_))  

Best n_neighbors {'n_neighbors': 5}
 
 
Best score is 0.8314288368636195


* So, the best number of neighbours is 5....We fit a KNN with 5 neighbours

* We also perform cross validation to see

In [22]:
knn_cross_val = KNeighborsClassifier(n_neighbors=5)

knn_cross_val_scores = cross_val_score(knn_cross_val, X_train, y_train, cv=5)

print(knn_cross_val_scores)

[0.832      0.856      0.816      0.81451613 0.78225806]


* Pretty Consistent overall for cross validation....

* We will now fit the classifier on whole X_train,y_train and test on X_test,y_test

### Testing and Evaluating on X_test,y_test

In [23]:
knn = KNeighborsClassifier(n_neighbors=5)

knn.fit(X_train,y_train)

predictions = knn.predict(X_test)

#### Classification Report

In [24]:
print(classification_report(y_test,predictions))

              precision    recall  f1-score   support

         0.0       0.81      0.81      0.81       165
         1.0       0.69      0.69      0.69       102

    accuracy                           0.76       267
   macro avg       0.75      0.75      0.75       267
weighted avg       0.76      0.76      0.76       267



#### Confusion Matrix

In [26]:
print(confusion_matrix(y_test,predictions))

[[134  31]
 [ 32  70]]


#### ROC_AUC Score

In [27]:
y_pred_proba = knn.predict_proba(X_test)[:,1]

knn_roc_auc_score = roc_auc_score(y_test,y_pred_proba)

print(knn_roc_auc_score)

0.8173202614379085


# Random Forest

### Tuning

In [29]:
rf = RandomForestClassifier(max_features='auto', oob_score=True, random_state=SEED, n_jobs=-1)

param_grid = { "criterion" : ["gini", "entropy"], "min_samples_leaf" : [1, 5, 10], "min_samples_split" : [2, 4, 10, 12, 16], 
              "n_estimators": [50, 100, 400, 700, 1000]}

gs = GridSearchCV(estimator=rf, param_grid=param_grid, scoring='accuracy', cv=3, n_jobs=-1)

gs = gs.fit(X_train, y_train)

print(gs.best_score_)
print(gs.best_params_)
#print(gs.cv_results_)

0.8474777034559643
{'criterion': 'entropy', 'min_samples_leaf': 1, 'min_samples_split': 4, 'n_estimators': 1000}


* So, we will set the best params and fit a Classifier

* We also perform cross validation to see how it goes

In [31]:
rf_cross_val = RandomForestClassifier(max_features='auto', oob_score=True, random_state=SEED, n_jobs=-1,
                              criterion='entropy',min_samples_leaf= 1, min_samples_split= 4, n_estimators= 1000)

rf_cross_val_scores = cross_val_score(rf_cross_val, X_train, y_train, cv=5)

print(rf_cross_val_scores)

[0.832      0.864      0.84       0.81451613 0.80645161]


### Testing and Evaluating on X_test,y_test

In [32]:
rf = RandomForestClassifier(max_features='auto', oob_score=True, random_state=42, n_jobs=-1,
                              criterion='entropy',min_samples_leaf= 1, min_samples_split= 4, n_estimators= 1000)

rf.fit(X_train,y_train)

predictions=rf.predict(X_test)

#### Classification Report

In [33]:
print(classification_report(y_test,predictions))

              precision    recall  f1-score   support

         0.0       0.82      0.85      0.83       165
         1.0       0.74      0.69      0.71       102

    accuracy                           0.79       267
   macro avg       0.78      0.77      0.77       267
weighted avg       0.79      0.79      0.79       267



#### Confusion Matrix

In [34]:
print(confusion_matrix(y_test,predictions))

[[141  24]
 [ 32  70]]


#### ROC_AUC_SCORE

In [35]:
y_pred_proba = rf.predict_proba(X_test)[:,1]

rf_roc_auc_score=roc_auc_score(y_test,y_pred_proba)

print(rf_roc_auc_score)

0.8371954842543077


# AdaBoost ( Adaptive Boosting )

### Tuning

In [37]:
dt = DecisionTreeClassifier(max_depth=1,random_state=SEED)

adb_clf = AdaBoostClassifier(base_estimator=dt,random_state=SEED)

param_grid = {'n_estimators':[10,20,50,100,200,500,1000,2000],'learning_rate':[.001,0.01,.1,1,10]}

gs = GridSearchCV(estimator=adb_clf, param_grid=param_grid, scoring='accuracy', cv=3)

gs = gs.fit(X_train, y_train)

print(gs.best_score_)
print(gs.best_params_)
#print(gs.cv_results_)

0.807398117180726
{'learning_rate': 1, 'n_estimators': 10}


### Cross Validation

In [39]:
dt = DecisionTreeClassifier(max_depth=1,random_state=SEED)

adb_clf_cross_val = AdaBoostClassifier(base_estimator=dt,n_estimators=10,learning_rate=1,random_state=SEED)

adb_cross_val_scores = cross_val_score(adb_clf_cross_val, X_train, y_train, cv=5)

print(adb_cross_val_scores)

[0.808      0.816      0.816      0.80645161 0.79032258]


### Testing and Evaluating on X_test,y_test

In [40]:
dt = DecisionTreeClassifier(max_depth=1,random_state=SEED)

adb_clf = AdaBoostClassifier(base_estimator=dt,n_estimators=10,learning_rate=1)

adb_clf.fit(X_train,y_train)


predictions=adb_clf.predict(X_test)

#### Classification Report

In [41]:
print(classification_report(y_test,predictions))

              precision    recall  f1-score   support

         0.0       0.86      0.78      0.82       165
         1.0       0.69      0.79      0.74       102

    accuracy                           0.78       267
   macro avg       0.77      0.78      0.78       267
weighted avg       0.79      0.78      0.79       267



#### Confusion Matrix

In [42]:
print(confusion_matrix(y_test,predictions))

[[128  37]
 [ 21  81]]


#### ROC_AUC_SCORE 

In [43]:
y_pred_proba = adb_clf.predict_proba(X_test)[:,1]
adb_clf_roc_auc_score = roc_auc_score(y_test,y_pred_proba)

print(adb_clf_roc_auc_score)

0.8357397504456328


# XGBOOST

### Tuning

In [47]:
xgb_clf=xgb.XGBClassifier()

param_grid = {
    'n_estimators': [100,200,500,750,1000],
    'subsample':[i/10.0 for i in range(6,10)],
    'colsample_bytree':[i/10.0 for i in range(6,10)],
    'learning_rate': [0.01, 0.02, 0.05, 0.1]
}



gs = GridSearchCV(estimator = xgb_clf, 
                       param_grid = param_grid, 
                       cv=5, verbose = False)

gs = gs.fit(X_train, y_train)

print(gs.best_score_)
print(gs.best_params_)
#print(gs.cv_results_)

































































































0.8410322580645161
{'colsample_bytree': 0.7, 'learning_rate': 0.01, 'n_estimators': 500, 'subsample': 0.6}


### Cross Validation

In [48]:
xgb_clf_cross_val = xgb.XGBClassifier(n_estimators=500,colsample_bytree=0.7,subsample=0.6,
                                    learning_rate=0.01,random_state=SEED)

xgb_cross_val_scores = cross_val_score(xgb_clf_cross_val, X_train, y_train, cv=5)

print(xgb_cross_val_scores)

[0.832      0.88       0.848      0.82258065 0.80645161]


### Testing and Evaluating on X_test,y_test

In [49]:
xgb_clf = xgb.XGBClassifier(n_estimators=500,colsample_bytree=0.7,subsample=0.6,learning_rate=0.01)

xgb_clf.fit(X_train,y_train)


predictions = xgb_clf.predict(X_test)



#### Classification Report

In [50]:
print(classification_report(y_test,predictions))

              precision    recall  f1-score   support

         0.0       0.80      0.88      0.84       165
         1.0       0.78      0.65      0.71       102

    accuracy                           0.79       267
   macro avg       0.79      0.77      0.77       267
weighted avg       0.79      0.79      0.79       267



#### Confusion Matrix

In [51]:
print(confusion_matrix(y_test,predictions))

[[146  19]
 [ 36  66]]


#### ROC_AUC_SCORE 

In [52]:
y_pred_proba = xgb_clf.predict_proba(X_test)[:,1]
xgb_clf_roc_auc_score = roc_auc_score(y_test,y_pred_proba)

print(xgb_clf_roc_auc_score)

0.8451574569221627


# Logistic Regression

### Tuning

In [53]:
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression()

param_grid = {'C':[0.001,0.1,1,2,5,7,9,10,20,50,100],'penalty':['l2','l2']}

gs = GridSearchCV(estimator=lr, param_grid=param_grid, scoring='accuracy', cv=3)

gs = gs.fit(X_train, y_train)

print(gs.best_score_)
print(gs.best_params_)
#print(gs.cv_results_)

0.80738263346959
{'C': 50, 'penalty': 'l2'}


### Cross Validation

In [54]:
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression

lr_cross_val = LogisticRegression(C=50,penalty='l2')

lr_cross_val_scores = cross_val_score(lr_cross_val, X_train, y_train, cv=5)

print(lr_cross_val_scores)

[0.816      0.824      0.768      0.80645161 0.7983871 ]


### Testing and Evaluating on X_test,y_test

In [56]:
lr = LogisticRegression(C=50,penalty='l2')

lr.fit(X_train,y_train)

predictions = lr.predict(X_test)

#### Classification Report

In [57]:
print(classification_report(y_test,predictions))

              precision    recall  f1-score   support

         0.0       0.84      0.82      0.83       165
         1.0       0.73      0.75      0.74       102

    accuracy                           0.80       267
   macro avg       0.79      0.79      0.79       267
weighted avg       0.80      0.80      0.80       267



#### Confusion Matrix

In [58]:
print(confusion_matrix(y_test,predictions))

[[136  29]
 [ 25  77]]


#### ROC_AUC_SCORE 

In [59]:
y_pred_proba = lr.predict_proba(X_test)[:,1]
xgb_clf_roc_auc_score = roc_auc_score(y_test,y_pred_proba)

print(xgb_clf_roc_auc_score)

0.8506833036244801


## Model Chosing

* We should not chose only based on the results on the holdout set. After all, we also performed Cross-Validation on all clasifiers, after feeding them by using the best parameters after greed-searching.

* Taking into account the consistency on cross-validation and the result on the acutal held-out dataset, we should go with XGboost here.