In [36]:
import pandas as pd
import numpy as np
import datetime as dt
import pickle

from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler, FunctionTransformer
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import Pipeline

import xgboost as xgb

from data import process

In [2]:
df = process.Data('train.csv').return_data()

In [3]:
df.head()

Unnamed: 0,Survived,Age,Fare,Master.,Miss.,Mr.,Mrs.,Other,A_CABIN,B_CABIN,...,class_2,class_3,female,male,C,Q,S,FamilyAboard,IsAlone,InCabin
0,0,22.0,7.25,0,0,1,0,0,0,0,...,0,1,0,1,0,0,1,1,False,False
1,1,38.0,71.2833,0,0,0,1,0,0,0,...,0,0,1,0,1,0,0,1,False,True
2,1,26.0,7.925,0,1,0,0,0,0,0,...,0,1,1,0,0,0,1,0,True,False
3,1,35.0,53.1,0,0,0,1,0,0,0,...,0,0,1,0,0,0,1,1,False,True
4,0,35.0,8.05,0,0,1,0,0,0,0,...,0,1,0,1,0,0,1,0,True,False


In [4]:
X = df.drop('Survived', axis=1)
y = df['Survived']

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=15144)

## Random Forest

In [8]:
param_grid = {'criterion':['gini', 'entropy'],
             'n_estimators':[10, 50, 100, 500, 1000],
              'max_features':['auto', 'log2', 1, 2, 5],
              'max_depth':[None, 1, 2, 3, 5, 10],
              'min_samples_split':[2, 4, 6, 10],
              'min_samples_leaf':[1, 2, 3, 5, 10],
             }
rf_gridsearch = GridSearchCV(RandomForestClassifier(random_state=15144),param_grid=param_grid, cv=5, scoring='accuracy')
rf_gridsearch.fit(X_train, y_train)
print(rf_gridsearch.best_score_)
print(rf_gridsearch.best_params_)
rf_params = rf_gridsearch.best_params_

0.8328651685393258
{'criterion': 'entropy', 'max_depth': None, 'max_features': 'auto', 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 50}


In [28]:
with open('rf_best_params.p', "wb" ) as dump:
    pickle.dump(rf_params, dump)
    #(pickle.dump(rf_params, open( 'rf_best_params.p', "wb" ))

In [6]:
with open('rf_best_params.p', "rb" ) as load:
    rf_params = pickle.load(load)

### Random Forest - Validation

In [7]:
rf = RandomForestClassifier(random_state=15144)

In [14]:
rf.set_params(**rf_params)
rf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=2, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=50,
                       n_jobs=None, oob_score=False, random_state=15144,
                       verbose=0, warm_start=False)

In [62]:
print('Train Set Score:', rf.score(X_train, y_train))
print('Test Set Score:', rf.score(X_test, y_test))
print('Cross Val Score:', np.mean(cross_val_score(rf, X, y, cv=10)))

Train Set Score: 0.9073033707865169
Test Set Score: 0.8770949720670391
Cross Val Score: 0.8394841675178754


## XG Boost Classifier

In [41]:
param_grid = {'eta':[0, 0.1, 0.3, 0.5],
              'gamma':[0, 1, 10],
              'max_depth':[1, 3, 6, 9, 12],
             }
gb_gridsearch = GridSearchCV(xgb.XGBClassifier(random_state=15144),param_grid=param_grid, cv=5, scoring='accuracy')
gb_gridsearch.fit(X_train, y_train)
print(gb_gridsearch.best_score_)
print(gb_gridsearch.best_params_)
gb_params = gb_gridsearch.best_params_

0.8216292134831461
{'eta': 0, 'gamma': 1, 'max_depth': 3}


In [23]:
gb = xgb.XGBClassifier(random_state=15144)
gb.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=3,
              min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
              nthread=None, objective='binary:logistic', random_state=15144,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)

In [61]:
print('Train Set Score:', gb.score(X_train, y_train))
print('Test Set Score:', gb.score(X_test, y_test))
print('Cross Val Score:', np.mean(cross_val_score(gb, X, y, cv=10)))

Train Set Score: 0.8834269662921348
Test Set Score: 0.8715083798882681
Cross Val Score: 0.8452153558052433


## Decision Tree Classifier

In [52]:
dt = DecisionTreeClassifier(random_state=15144)
dt.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=15144, splitter='best')

In [54]:
print('Train Set Score:', dt.score(X_train, y_train))
print('Test Set Score:', dt.score(X_test, y_test))
print('Cross Val Score:', np.mean(cross_val_score(dt, X, y, cv=10)))

Train Set Score: 0.9887640449438202
Test Set Score: 0.770949720670391
Cross Val Score: 0.7733906480535694


## Support Vector

In [71]:
sv = SVC(random_state=15144, gamma='auto')
sv.fit(X_train, y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
    max_iter=-1, probability=False, random_state=15144, shrinking=True,
    tol=0.001, verbose=False)

In [72]:
print('Train Set Score:', sv.score(X_train, y_train))
print('Test Set Score:', sv.score(X_test, y_test))
print('Cross Val Score:', np.mean(cross_val_score(sv, X, y, cv=10)))

Train Set Score: 0.8651685393258427
Test Set Score: 0.7039106145251397
Cross Val Score: 0.7611567926455567


## Ensemble Voting Classifier

In [79]:
voter = VotingClassifier([('rf', rf), ('gb', gb), ('dt', dt), ('sv', sv)], weights=[0.45, 0.45, 0.05, 0.05])

In [80]:
voter.fit(X_train, y_train)
voter.score(X_test, y_test)
print('Train Set Score:', voter.score(X_train, y_train))
print('Test Set Score:', voter.score(X_test, y_test))
print('Cross Val Score:', np.mean(cross_val_score(voter, X, y, cv=10)))

Train Set Score: 0.9087078651685393
Test Set Score: 0.8770949720670391
Cross Val Score: 0.841768811712632


# Submission

In [81]:
test_data = process.Data('test.csv')
df_test = test_data.return_data()

In [82]:
df_test.head()

Unnamed: 0,Age,Fare,Master.,Miss.,Mr.,Mrs.,Other,A_CABIN,B_CABIN,C_CABIN,...,class_2,class_3,female,male,C,Q,S,FamilyAboard,IsAlone,InCabin
0,34.5,7.8292,0,0,1,0,0,0,0,0,...,0,1,0,1,0,1,0,0,True,False
1,47.0,7.0,0,0,0,1,0,0,0,0,...,0,1,1,0,0,0,1,1,False,False
2,62.0,9.6875,0,0,1,0,0,0,0,0,...,1,0,0,1,0,1,0,0,True,False
3,27.0,8.6625,0,0,1,0,0,0,0,0,...,0,1,0,1,0,0,1,0,True,False
4,22.0,12.2875,0,0,0,1,0,0,0,0,...,0,1,1,0,0,0,1,2,False,False


In [83]:
predictions = voter.predict(df_test)

In [84]:
submission = test_data.return_prediction_df(predictions)

In [85]:
submission.to_csv('submission.csv', index=False)