# Modeling

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import re

import warnings
warnings.filterwarnings("ignore")

In [2]:
data=pd.read_csv('D:/KAGGLE COMPETITIONS/Titanic/4. Analysis/train_after_feature_encoding.csv')
labels=pd.read_csv('D:/KAGGLE COMPETITIONS/Titanic/4. Analysis/labels.csv',header=None,index_col=0)

* Setting Seed for reproducibility

In [3]:
SEED=42

### Printing and Inspecting

* Data are already scaled, so we just need to feed some classifiers here

In [4]:
data.head(20)

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Sex_female,Sex_male,Embarked_Q,Embarked_S,Deck_B,Deck_C,Deck_D,Deck_E,Deck_F,Deck_G,Deck_T,Deck_U
0,1.0,0.293286,0.125,0.0,0.014151,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,0.0,0.510737,0.125,0.0,0.139136,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,0.347649,0.0,0.0,0.015469,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,0.0,0.469965,0.125,0.0,0.103644,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1.0,0.469965,0.0,0.0,0.015713,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
5,1.0,0.396963,0.0,0.0,0.01651,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
6,0.0,0.728187,0.0,0.0,0.101229,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
7,1.0,0.021473,0.375,0.166667,0.041136,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
8,1.0,0.361239,0.0,0.333333,0.021731,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
9,0.5,0.184561,0.125,0.0,0.058694,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [5]:
labels.head(20)

Unnamed: 0_level_0,1
0,Unnamed: 1_level_1
0,0
1,1
2,1
3,1
4,0
5,0
6,0
7,0
8,1
9,1


## Splitting on training and test set

In [6]:
from sklearn.model_selection import train_test_split

X_train,X_test,y_train,y_test=train_test_split(data,labels,test_size=0.3,stratify=labels,random_state=SEED)

# K-NN

### Tuning

In [7]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV


param_grid={'n_neighbors':np.arange(1,10)} 

knn_temp=KNeighborsClassifier()                                      
knn_cv=GridSearchCV(knn_temp,param_grid,cv=3)

_=knn_cv.fit(X_train,y_train)                                                 

print('Best n_neighbors '+str(knn_cv.best_params_),end='\n \n \n')          

print('Best score is '+ str(knn_cv.best_score_))  

Best n_neighbors {'n_neighbors': 5}
 
 
Best score is 0.8314606741573034


* So, the best number of neighbours is 5....We fit a KNN with 5 neighbours

* We also perform cross validation to see

In [8]:
from sklearn.model_selection import cross_val_score

knn_cross_val=KNeighborsClassifier(n_neighbors=5)

knn_cross_val_scores = cross_val_score(knn_cross_val, X_train, y_train, cv=5)

print(knn_cross_val_scores)

[0.832      0.872      0.824      0.832      0.76422764]


* Pretty Consistent overall for cross validation....

* We will now fit the classifier on whole X_train,y_train and test on X_test,y_test

### Testing and Evaluating on X_test,y_test

In [9]:
knn=KNeighborsClassifier(n_neighbors=5)

knn.fit(X_train,y_train)

predictions=knn.predict(X_test)

In [10]:
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import roc_auc_score

#### Classification Report

In [11]:
print(classification_report(y_test,predictions))

              precision    recall  f1-score   support

           0       0.83      0.81      0.82       165
           1       0.70      0.73      0.71       102

    accuracy                           0.78       267
   macro avg       0.77      0.77      0.77       267
weighted avg       0.78      0.78      0.78       267



#### Confusion Matrix

In [12]:
print(confusion_matrix(y_test,predictions))

[[134  31]
 [ 28  74]]


#### ROC_AUC Score

In [13]:
y_pred_proba=knn.predict_proba(X_test)[:,1]

knn_roc_auc_score=roc_auc_score(y_test,y_pred_proba)

print(knn_roc_auc_score)

0.8313131313131312


# Random Forest

### Tuning

In [14]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(max_features='auto', oob_score=True, random_state=SEED, n_jobs=-1)

param_grid = { "criterion" : ["gini", "entropy"], "min_samples_leaf" : [1, 5, 10], "min_samples_split" : [2, 4, 10, 12, 16], 
              "n_estimators": [50, 100, 400, 700, 1000]}

gs = GridSearchCV(estimator=rf, param_grid=param_grid, scoring='accuracy', cv=3, n_jobs=-1)

gs = gs.fit(X_train, y_train)

print(gs.best_score_)
print(gs.best_params_)
#print(gs.cv_results_)

0.8491171749598716
{'criterion': 'entropy', 'min_samples_leaf': 1, 'min_samples_split': 4, 'n_estimators': 1000}


In [15]:
print(gs.cv_results_)

{'mean_fit_time': array([0.1565961 , 0.19894306, 0.62008262, 1.22296834, 1.80116932,
       0.1813736 , 0.22138278, 0.6773231 , 1.2681636 , 1.67963346,
       0.1770951 , 0.17474016, 0.67548633, 1.37097708, 1.77173559,
       0.14603353, 0.22571747, 0.81751855, 1.02974486, 1.57763902,
       0.14403311, 0.17397801, 0.71674411, 1.19594407, 1.81591296,
       0.15626073, 0.18230422, 0.7501862 , 1.20854998, 1.7346278 ,
       0.14063819, 0.19271803, 0.7760946 , 1.13942973, 1.62275894,
       0.15105216, 0.18272861, 0.65481408, 1.21258481, 1.73827497,
       0.14236577, 0.19539611, 0.7551144 , 1.20150391, 1.57297254,
       0.1499393 , 0.19671019, 0.69146713, 1.22998961, 1.63811501,
       0.14428512, 0.19272161, 0.74540965, 1.17759124, 1.59230741,
       0.14063462, 0.18230343, 0.73372173, 1.15534782, 1.56868815,
       0.14636676, 0.1873757 , 0.71849585, 1.13868078, 1.61375109,
       0.14870071, 0.1867307 , 0.64156747, 1.17118621, 1.64241982,
       0.14063438, 0.20357839, 0.73183195, 1

* So, we will set the best params and fit a Classifier

* We also perform cross validation to see how it goes

In [16]:
from sklearn.model_selection import cross_val_score

rf_cross_val=RandomForestClassifier(max_features='auto', oob_score=True, random_state=SEED, n_jobs=-1,
                              criterion='entropy',min_samples_leaf= 1, min_samples_split= 4, n_estimators= 1000)

rf_cross_val_scores = cross_val_score(rf_cross_val, X_train, y_train, cv=5)

print(rf_cross_val_scores)

[0.832      0.872      0.856      0.816      0.79674797]


### Testing and Evaluating on X_test,y_test

In [17]:
rf=RandomForestClassifier(max_features='auto', oob_score=True, random_state=42, n_jobs=-1,
                              criterion='entropy',min_samples_leaf= 1, min_samples_split= 4, n_estimators= 1000)

rf.fit(X_train,y_train)

predictions=rf.predict(X_test)

#### Classification Report

In [18]:
print(classification_report(y_test,predictions))

              precision    recall  f1-score   support

           0       0.82      0.88      0.85       165
           1       0.79      0.70      0.74       102

    accuracy                           0.81       267
   macro avg       0.81      0.79      0.80       267
weighted avg       0.81      0.81      0.81       267



#### Confusion Matrix

In [19]:
print(confusion_matrix(y_test,predictions))

[[146  19]
 [ 31  71]]


#### ROC_AUC_SCORE

In [20]:
y_pred_proba=rf.predict_proba(X_test)[:,1]

rf_roc_auc_score=roc_auc_score(y_test,y_pred_proba)

print(rf_roc_auc_score)

0.851455733808675


# AdaBoost ( Adaptive Boosting )

### Tuning

In [21]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

dt=DecisionTreeClassifier(max_depth=1,random_state=SEED)

adb_clf=AdaBoostClassifier(base_estimator=dt,random_state=SEED)

param_grid = {'n_estimators':[10,20,50,100,200,500,1000,2000],'learning_rate':[.001,0.01,.1,1,10]}

gs = GridSearchCV(estimator=adb_clf, param_grid=param_grid, scoring='accuracy', cv=3)

gs = gs.fit(X_train, y_train)

print(gs.best_score_)
print(gs.best_params_)
#print(gs.cv_results_)

0.8154093097913323
{'learning_rate': 0.01, 'n_estimators': 1000}


### Cross Validation

In [22]:
from sklearn.model_selection import cross_val_score

dt=DecisionTreeClassifier(max_depth=1,random_state=SEED)

adb_clf_cross_val=AdaBoostClassifier(base_estimator=dt,n_estimators=1000,learning_rate=0.01,random_state=SEED)

adb_cross_val_scores = cross_val_score(adb_clf_cross_val, X_train, y_train, cv=5)

print(adb_cross_val_scores)

[0.808      0.848      0.808      0.816      0.80487805]


### Testing and Evaluating on X_test,y_test

In [23]:
dt=DecisionTreeClassifier(max_depth=1,random_state=SEED)

adb_clf=AdaBoostClassifier(base_estimator=dt,n_estimators=1000,learning_rate=0.01)

adb_clf.fit(X_train,y_train)


predictions=adb_clf.predict(X_test)

#### Classification Report

In [24]:
print(classification_report(y_test,predictions))

              precision    recall  f1-score   support

           0       0.88      0.82      0.85       165
           1       0.74      0.81      0.78       102

    accuracy                           0.82       267
   macro avg       0.81      0.82      0.81       267
weighted avg       0.83      0.82      0.82       267



#### Confusion Matrix

In [25]:
print(confusion_matrix(y_test,predictions))

[[136  29]
 [ 19  83]]


#### ROC_AUC_SCORE 

In [26]:
y_pred_proba=adb_clf.predict_proba(X_test)[:,1]
adb_clf_roc_auc_score=roc_auc_score(y_test,y_pred_proba)

print(adb_clf_roc_auc_score)

0.8694592988710635


# XGBOOST

In [27]:
import xgboost as xgb

### Tuning

In [28]:
xgb_clf=xgb.XGBClassifier()

param_grid = {
    'n_estimators': [100,200,500,750,1000],
    'subsample':[i/10.0 for i in range(6,10)],
    'colsample_bytree':[i/10.0 for i in range(6,10)],
    'learning_rate': [0.01, 0.02, 0.05, 0.1]
}



gs =GridSearchCV(estimator = xgb_clf, 
                       param_grid = param_grid, 
                        iid=False,
                       cv=5, verbose = False)

gs = gs.fit(X_train, y_train)

print(gs.best_score_)
print(gs.best_params_)
#print(gs.cv_results_)

0.8490016260162602
{'colsample_bytree': 0.7, 'learning_rate': 0.05, 'n_estimators': 500, 'subsample': 0.8}


### Cross Validation

In [29]:
from sklearn.model_selection import cross_val_score

xgb_clf_cross_val=xgb.XGBClassifier(n_estimators=500,colsample_bytree=0.7,subsample=0.8,
                                    learning_rate=0.05,random_state=SEED)

xgb_cross_val_scores = cross_val_score(xgb_clf_cross_val, X_train, y_train, cv=5)

print(xgb_cross_val_scores)

[0.832      0.904      0.84       0.832      0.80487805]


### Testing and Evaluating on X_test,y_test

In [30]:
xgb_clf=xgb.XGBClassifier(n_estimators=500,colsample_bytree=0.7,subsample=0.8,learning_rate=0.05)

xgb_clf.fit(X_train,y_train)


predictions=xgb_clf.predict(X_test)

#### Classification Report

In [31]:
print(classification_report(y_test,predictions))

              precision    recall  f1-score   support

           0       0.82      0.88      0.85       165
           1       0.79      0.69      0.73       102

    accuracy                           0.81       267
   macro avg       0.80      0.79      0.79       267
weighted avg       0.81      0.81      0.81       267



#### Confusion Matrix

In [32]:
print(confusion_matrix(y_test,predictions))

[[146  19]
 [ 32  70]]


#### ROC_AUC_SCORE 

In [33]:
y_pred_proba=xgb_clf.predict_proba(X_test)[:,1]
xgb_clf_roc_auc_score=roc_auc_score(y_test,y_pred_proba)

print(xgb_clf_roc_auc_score)

0.8475044563279857


# Logistic Regression

### Tuning

In [34]:
from sklearn.linear_model import LogisticRegression

lr=LogisticRegression()

param_grid={'C':[0.001,0.1,1,2,5,7,9,10,20,50,100],'penalty':['l2','l2']}

gs = GridSearchCV(estimator=lr, param_grid=param_grid, scoring='accuracy', cv=3)

gs = gs.fit(X_train, y_train)

print(gs.best_score_)
print(gs.best_params_)
#print(gs.cv_results_)

0.826645264847512
{'C': 5, 'penalty': 'l2'}


### Cross Validation

In [35]:
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression

lr_cross_val=LogisticRegression(C=5,penalty='l2')

lr_cross_val_scores = cross_val_score(lr_cross_val, X_train, y_train, cv=5)

print(lr_cross_val_scores)

[0.824      0.848      0.816      0.824      0.81300813]


### Testing and Evaluating on X_test,y_test

In [36]:
lr=LogisticRegression(C=5,penalty='l2')

lr.fit(X_train,y_train)

predictions=lr.predict(X_test)

#### Classification Report

In [37]:
print(classification_report(y_test,predictions))

              precision    recall  f1-score   support

           0       0.88      0.85      0.86       165
           1       0.77      0.80      0.78       102

    accuracy                           0.83       267
   macro avg       0.82      0.83      0.82       267
weighted avg       0.83      0.83      0.83       267



#### Confusion Matrix

In [38]:
print(confusion_matrix(y_test,predictions))

[[140  25]
 [ 20  82]]


#### ROC_AUC_SCORE 

In [39]:
y_pred_proba=lr.predict_proba(X_test)[:,1]
xgb_clf_roc_auc_score=roc_auc_score(y_test,y_pred_proba)

print(xgb_clf_roc_auc_score)

0.8693404634581104
