In [1]:
import numpy as np
import pandas as pd
import random 
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.model_selection import ParameterGrid
from tqdm import tqdm_notebook

In [2]:
train = pd.read_csv('./train.csv', index_col='PassengerId')
test = pd.read_csv('./test.csv', index_col='PassengerId')

In [3]:
data = pd.concat([train, test], axis=0)

In [4]:
cat_cols=['Pclass','Sex','Embarked', 'Status']

In [5]:
data['Status'] = data.Name.apply(lambda x: x.split(',')[1].split('.')[0])

In [6]:
#fill nan in Age by their mean values in Status group
data.Age = data.Age.fillna(data.Status.map(data.groupby('Status').Age.mean()))

In [7]:
data = pd.get_dummies(data, columns=cat_cols, drop_first=True)

In [8]:
cols = [x for x in data if x not in ['Survived', 'Ticket', 'Cabin', 'Name', 'Fare']]

In [9]:
X_train = data[~data.Survived.isnull()][cols]
y_train = data[~data.Survived.isnull()].Survived
X_test = data[data.Survived.isnull()][cols]

In [10]:
folds = StratifiedKFold(5)
folds = folds.split(X_train, y_train)
folds = list(folds)

In [11]:
ParamGrid = {'max_depth': np.arange(3, 7),
             'ntree': np.arange(50, 200, 10)}

In [12]:
best_score = -np.inf
best_params_rf = None
for param in tqdm_notebook(ParameterGrid(ParamGrid)):
    model = RandomForestClassifier(n_estimators=param['ntree'], max_depth=param['max_depth'], random_state=1)
    scores = cross_val_score(model, X_train.fillna(0), y_train, n_jobs=1, cv=folds)
    if best_score<np.mean(scores):
        best_score = np.mean(scores)
        best_params_rf = param
        print('Improved, best score is {}'.format(best_score))

A Jupyter Widget

Improved, best score is 0.8136756074129003
Improved, best score is 0.8147992029185183
Improved, best score is 0.8260668979607326
Improved, best score is 0.8294188532679951
Improved, best score is 0.8294377554049296
Improved, best score is 0.8305550738406838
Improved, best score is 0.8316850173435085



In [13]:
best_score = -np.inf
best_params_et = None
for param in tqdm_notebook(ParameterGrid(ParamGrid)):
    model = ExtraTreesClassifier(n_estimators=param['ntree'], max_depth=param['max_depth'], random_state=1)
    scores = cross_val_score(model, X_train.fillna(0), y_train, n_jobs=1, cv=folds)
    if best_score<np.mean(scores):
        best_score = np.mean(scores)
        best_params_et = param
        print('Improved, best score is {}'.format(best_score))

A Jupyter Widget

Improved, best score is 0.8058040199490245
Improved, best score is 0.8058291991558229
Improved, best score is 0.8181447393012309
Improved, best score is 0.8192620577369851
Improved, best score is 0.8203982783096737
Improved, best score is 0.820429876441022
Improved, best score is 0.8204487785779564
Improved, best score is 0.8249496504523215
Improved, best score is 0.8272095374579711
Improved, best score is 0.8272221625250419



In [23]:
#fit it for different random states for stability and reducing the variance
preds = np.zeros(len(X_train))
for rs in tqdm_notebook(np.arange(1, 11, 1)):
    model_rf = RandomForestClassifier(n_estimators=best_params_rf['ntree'], max_depth=best_params_rf['max_depth'], random_state=rs)
    model_rf.fit(X_train.fillna(0), y_train)
    model_et = ExtraTreesClassifier(n_estimators=best_params_et['ntree'], max_depth=best_params_et['max_depth'], random_state=rs)
    model_et.fit(X_train.fillna(0), y_train)
    preds += model_rf.predict_proba(X_test)[:, 1]
    preds += model_et.predict_proba(X_test)[:, 1]

A Jupyter Widget




ValueError: operands could not be broadcast together with shapes (891,) (418,) (891,) 

In [162]:
X_test['Survived'] = model.predict(X_test.fillna(0)).astype(int)

In [163]:
X_test[['Survived']].to_csv('submit.csv')