In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.model_selection import train_test_split
from sklearn import metrics

In [2]:
train = pd.read_csv('titanic_train.csv')

In [3]:
def impute_age(cols):
    Age = cols[0]
    Pclass = cols[1]
    
    if pd.isnull(Age):
        if Pclass == 1:
                    return 38
        elif Pclass == 2:
                    return 30
        else:
                    return 25
    else:
        return Age

In [4]:
train['Age'] = train[['Age','Pclass']].apply(impute_age,axis=1)

In [5]:
train.drop('Cabin',axis=1,inplace=True)
#train.dropna(inplace=True)
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,S


In [6]:
sex = pd.get_dummies(train['Sex'],drop_first=True)
embark = pd.get_dummies(train['Embarked'],drop_first=True)
#drop the sex,embarked,name and tickets columns
train.drop(['Sex','Embarked','Name','Ticket', 'PassengerId'],axis=1,inplace=True)
#concatenate new sex and embark column to our train dataframe
train = pd.concat([train,sex,embark],axis=1)
#check the head of dataframe
train.head()

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare,male,Q,S
0,0,3,22.0,1,0,7.25,1,0,1
1,1,1,38.0,1,0,71.2833,0,0,0
2,1,3,26.0,0,0,7.925,0,0,1
3,1,1,35.0,1,0,53.1,0,0,1
4,0,3,35.0,0,0,8.05,1,0,1


In [7]:
X_train, X_test, y_train, y_test = train_test_split(train.drop('Survived',axis=1), 
           train['Survived'], test_size=0.30, random_state=101)

## Random Forest

In [8]:
# Import the model we are using
from sklearn.ensemble import RandomForestClassifier
# Instantiate model with 1000 decision trees
rf = RandomForestClassifier(n_estimators = 1000, 
                            random_state = 100,
                            max_features = 'sqrt',
                            criterion="gini", 
                            max_depth=5, 
                            min_samples_split=2, 
                            min_samples_leaf=1, 
                            bootstrap=True, 
                            oob_score=True, 
                            verbose=1, 
                            n_jobs=-1)
# Train the model on training data
rf.fit(X_train,y_train)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:    0.3s
[Parallel(n_jobs=-1)]: Done 792 tasks      | elapsed:    0.6s
[Parallel(n_jobs=-1)]: Done 1000 out of 1000 | elapsed:    0.8s finished


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=5, max_features='sqrt', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=1000,
                       n_jobs=-1, oob_score=True, random_state=100, verbose=1,
                       warm_start=False)

In [9]:
pred_test = rf.predict(X_test)

[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 792 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done 1000 out of 1000 | elapsed:    0.2s finished


In [10]:
from sklearn.metrics import confusion_matrix
print(confusion_matrix(y_test,pred_test))

[[143  11]
 [ 43  71]]


In [11]:
from sklearn.metrics import classification_report
print(classification_report(y_test,pred_test))

              precision    recall  f1-score   support

           0       0.77      0.93      0.84       154
           1       0.87      0.62      0.72       114

    accuracy                           0.80       268
   macro avg       0.82      0.78      0.78       268
weighted avg       0.81      0.80      0.79       268



## Random Search

In [12]:
from sklearn.model_selection import RandomizedSearchCV
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 20)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(2, 10, num = 2)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
print(random_grid)

{'n_estimators': [200, 294, 389, 484, 578, 673, 768, 863, 957, 1052, 1147, 1242, 1336, 1431, 1526, 1621, 1715, 1810, 1905, 2000], 'max_features': ['auto', 'sqrt'], 'max_depth': [2, 10, None], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4], 'bootstrap': [True, False]}


In [13]:
rf = RandomForestClassifier()
rf_random = RandomizedSearchCV(estimator = rf, 
                               param_distributions = random_grid, 
                               n_iter = 100, 
                               cv = 5, 
                               verbose = 1, 
                               random_state =42, 
                               n_jobs = -1)

In [14]:
# Fit the random search model
rf_random.fit(X_train,y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   28.0s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:   32.9s finished


RandomizedSearchCV(cv=5, error_score='raise-deprecating',
                   estimator=RandomForestClassifier(bootstrap=True,
                                                    class_weight=None,
                                                    criterion='gini',
                                                    max_depth=None,
                                                    max_features='auto',
                                                    max_leaf_nodes=None,
                                                    min_impurity_decrease=0.0,
                                                    min_impurity_split=None,
                                                    min_samples_leaf=1,
                                                    min_samples_split=2,
                                                    min_weight_fraction_leaf=0.0,
                                                    n_estimators='warn',
                                                    n_jobs=None

In [15]:
rf_random.best_params_

{'n_estimators': 1621,
 'min_samples_split': 5,
 'min_samples_leaf': 4,
 'max_features': 'sqrt',
 'max_depth': None,
 'bootstrap': False}

In [16]:
rf_random.best_score_

0.8154093097913323

In [17]:
rf_random.best_estimator_.feature_importances_

array([0.12270171, 0.14883518, 0.04004811, 0.0280886 , 0.19926927,
       0.42497359, 0.00810158, 0.02798196])

In [18]:
pred_test1 = rf_random.predict(X_test)

In [19]:
from sklearn.metrics import confusion_matrix
print(confusion_matrix(y_test,pred_test1))

[[140  14]
 [ 37  77]]


In [20]:
from sklearn.metrics import classification_report
print(classification_report(y_test,pred_test1))

              precision    recall  f1-score   support

           0       0.79      0.91      0.85       154
           1       0.85      0.68      0.75       114

    accuracy                           0.81       268
   macro avg       0.82      0.79      0.80       268
weighted avg       0.81      0.81      0.81       268



## Grid Search

In [21]:
from sklearn.model_selection import GridSearchCV
# Create the parameter grid based on the results of random search 
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 5)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(2, 10, num = 2)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
param_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
print(param_grid)

{'n_estimators': [200, 650, 1100, 1550, 2000], 'max_features': ['auto', 'sqrt'], 'max_depth': [2, 10, None], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4], 'bootstrap': [True, False]}


In [22]:
rf = RandomForestClassifier()
rf_random = GridSearchCV(estimator = rf, 
                         param_grid = param_grid, 
                         cv = 2, 
                         verbose = 1, 
                         n_jobs = -1)

In [23]:
# Fit the random search model
rf_random.fit(X_train,y_train)

Fitting 2 folds for each of 540 candidates, totalling 1080 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   17.0s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:  3.5min
[Parallel(n_jobs=-1)]: Done 792 tasks      | elapsed:  6.3min
[Parallel(n_jobs=-1)]: Done 1080 out of 1080 | elapsed:  8.9min finished


GridSearchCV(cv=2, error_score='raise-deprecating',
             estimator=RandomForestClassifier(bootstrap=True, class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators='warn', n_jobs=None,
                                              oob_score=False,
                                              random_state=None, verbose=0,
                                              warm_start=False),
             iid

In [24]:
rf_random.best_params_

{'bootstrap': True,
 'max_depth': 10,
 'max_features': 'sqrt',
 'min_samples_leaf': 1,
 'min_samples_split': 5,
 'n_estimators': 1100}

In [25]:
rf_random.best_score_

0.8250401284109149

In [26]:
pred_test2 = rf_random.predict(X_test)

In [27]:
from sklearn.metrics import confusion_matrix
print(confusion_matrix(y_test,pred_test2))

[[140  14]
 [ 32  82]]


In [28]:
from sklearn.metrics import classification_report
print(classification_report(y_test,pred_test2))

              precision    recall  f1-score   support

           0       0.81      0.91      0.86       154
           1       0.85      0.72      0.78       114

    accuracy                           0.83       268
   macro avg       0.83      0.81      0.82       268
weighted avg       0.83      0.83      0.83       268



## Gradient Boosting Model

In [29]:
# Import the model we are using
from sklearn.ensemble import GradientBoostingClassifier
# Instantiate model with 1000 decision trees
gbm = GradientBoostingClassifier(n_estimators = 100, learning_rate = 0.05, random_state = 100, n_iter_no_change = 5, 
                                 max_depth=5, min_samples_split=2, min_samples_leaf=1, verbose=1)
# Train the model on training data
gbm.fit(X_train,y_train)

      Iter       Train Loss   Remaining Time 
         1           1.2579           27.05s
         2           1.2083           15.39s
         3           1.1636           10.22s
         4           1.1235            7.64s
         5           1.0871            6.06s
         6           1.0536            5.03s
         7           1.0227            4.29s
         8           0.9943            3.75s
         9           0.9668            3.33s
        10           0.9413            2.99s
        20           0.7514            1.39s
        30           0.6400            0.85s
        40           0.5647            0.57s


GradientBoostingClassifier(criterion='friedman_mse', init=None,
                           learning_rate=0.05, loss='deviance', max_depth=5,
                           max_features=None, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=100,
                           n_iter_no_change=5, presort='auto', random_state=100,
                           subsample=1.0, tol=0.0001, validation_fraction=0.1,
                           verbose=1, warm_start=False)

In [30]:
pred_test3 = gbm.predict(X_test)

In [31]:
from sklearn.metrics import confusion_matrix
print(confusion_matrix(y_test,pred_test3))

[[143  11]
 [ 36  78]]


In [32]:
from sklearn.metrics import classification_report
print(classification_report(y_test,pred_test3))

              precision    recall  f1-score   support

           0       0.80      0.93      0.86       154
           1       0.88      0.68      0.77       114

    accuracy                           0.82       268
   macro avg       0.84      0.81      0.81       268
weighted avg       0.83      0.82      0.82       268



## Random Search

In [33]:
from sklearn.model_selection import RandomizedSearchCV
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 20)]
# Learning rate
learning_rate = [0.01, 0.05, 0.1, 0.5]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(2, 10, num = 2)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'learning_rate': learning_rate,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf}
print(random_grid)

{'n_estimators': [200, 294, 389, 484, 578, 673, 768, 863, 957, 1052, 1147, 1242, 1336, 1431, 1526, 1621, 1715, 1810, 1905, 2000], 'learning_rate': [0.01, 0.05, 0.1, 0.5], 'max_features': ['auto', 'sqrt'], 'max_depth': [2, 10, None], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4]}


In [34]:
gbm = GradientBoostingClassifier()
gbm_random = RandomizedSearchCV(estimator = gbm, param_distributions = random_grid, cv = 2, verbose=1)

In [35]:
# Fit the random search model
gbm_random.fit(X_train,y_train)

Fitting 2 folds for each of 10 candidates, totalling 20 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:   17.2s finished


RandomizedSearchCV(cv=2, error_score='raise-deprecating',
                   estimator=GradientBoostingClassifier(criterion='friedman_mse',
                                                        init=None,
                                                        learning_rate=0.1,
                                                        loss='deviance',
                                                        max_depth=3,
                                                        max_features=None,
                                                        max_leaf_nodes=None,
                                                        min_impurity_decrease=0.0,
                                                        min_impurity_split=None,
                                                        min_samples_leaf=1,
                                                        min_samples_split=2,
                                                        min_weight_fraction_leaf=0.0,
                     

In [36]:
gbm_random.best_params_

{'n_estimators': 1147,
 'min_samples_split': 2,
 'min_samples_leaf': 2,
 'max_features': 'auto',
 'max_depth': 10,
 'learning_rate': 0.5}

In [37]:
gbm_random.best_score_

0.8105939004815409

In [38]:
pred_test4 = rf_random.predict(X_test)

In [39]:
from sklearn.metrics import confusion_matrix
print(confusion_matrix(y_test,pred_test4))

[[140  14]
 [ 32  82]]


In [40]:
from sklearn.metrics import classification_report
print(classification_report(y_test,pred_test4))

              precision    recall  f1-score   support

           0       0.81      0.91      0.86       154
           1       0.85      0.72      0.78       114

    accuracy                           0.83       268
   macro avg       0.83      0.81      0.82       268
weighted avg       0.83      0.83      0.83       268

