# Supervised Classification with Titanic
Author: Nick Brooks

Date: Summer 2017

In [1]:
# data analysis and wrangling
import pandas as pd
import numpy as np
import random as rnd
import os

import pickle
import multiprocessing

# visualization
import seaborn as sns
import matplotlib.pyplot as plt
get_ipython().magic('matplotlib inline')

# machine learning models
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC, LinearSVC
#from sklearn import svm
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, BaggingClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import SGDClassifier

#Evalaluation
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.model_selection import cross_val_score

# Grid
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.feature_selection import RFE
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

#Performance
%load_ext memory_profiler

#Warnings
import warnings
warnings.filterwarnings('ignore')

## Load

In [2]:
#os.chdir(r"D:/My Computer/")
#os.getcwd()

In [3]:
#Titanic
path = r"C:/Users/Nicol/Google Drive/Learning/Jupyter/Titanic"
#path = r"/Users/nicapotato/Google Drive/Learning/Jupyter/Titanic"

#train_df = pd.read_csv(open(os.path.join(path, "clean_train.csv"), "r")) 
#test_df = pd.read_csv(open(os.path.join(path, "clean_test.csv"), "r"))

#train_df = pd.read_csv(open(os.path.join(path, "clean_train2.csv"), "r")) 
#test_df = pd.read_csv(open(os.path.join(path, "clean_test2.csv"), "r")) 

train_df = pd.read_csv(open(os.path.join(path, "clean_train_nick.csv"), "r"), index_col="PassengerId") 
test_df = pd.read_csv(open(os.path.join(path, "clean_test_nick.csv"), "r"), index_col="PassengerId") 

X = train_df.drop(["Survived"] , axis=1)
y = train_df["Survived"]

#test_df  = test_df.drop(["PassengerId"] , axis=1).copy()
print(X.shape, y.shape, test_df.shape)

results = pd.DataFrame()

def save(model, modelname):
    global results
    model.fit(X, y)
    submission = model.predict(test_df)
    df = pd.DataFrame({'PassengerId':test_df.index, 
                           'Survived':submission})
    df.to_csv((os.path.join(path,("submissions/{}.csv".format(modelname)))),header=True,index=False)
    
    # CV and Save Scores
    results = results.append({'Model': modelname,'Para': grid.best_params_, 'Test_Score': (grid.best_score_*100)}, ignore_index=True)
    
    print("Optimal Model CV Accuracy: {}".format(grid.best_score_*100))
    print("Optimal Model Parameters: {}".format(grid.best_params_))

    with open((os.path.join(path,(r"Pickle/{}.pickle".format(modelname)))), 'wb') as f: pickle.dump(model, f)
        
def norm_save(model, modelname):
    global results
    model.fit(X, y)
    submission = model.predict(test_df)
    df = pd.DataFrame({'PassengerId':test_df.index, 
                           'Survived':submission})
    results = results.append({'Model': modelname,'Para': model, 'Test_Score':score.mean()*100}, ignore_index=True)
    df.to_csv((os.path.join(path,("submissions/{}.csv".format(modelname)))),header=True,index=False)
    with open((os.path.join(path,(r"Pickle/{}.pickle".format(modelname)))), 'wb') as f: pickle.dump(model, f)

def ensembling(model, modelname):
    global results
    model.fit(X, y)
    submission = model.predict(test_df)
    df = pd.DataFrame({'PassengerId':test_df.index, 
                           'Survived':submission})
    results = results.append({'Model': modelname,'Para': model, 'Test_Score':metrics.accuracy_score(clf.predict(X_test), y_test)*100}, ignore_index=True)
    print(len(df))
    df.to_csv((os.path.join(path,(r"submissions/{}.csv".format(modelname)))),header=True,index=False)
    with open((os.path.join(path,(r"Pickle/{}.pickle".format(modelname)))), 'wb') as f: pickle.dump(model, f)

(891, 29) (891,) (418, 29)


In [4]:
print(y.value_counts(normalize=True))

0    0.616162
1    0.383838
Name: Survived, dtype: float64


In [5]:
# Should Balance This DataSet

# Should Perhaps Normalize Data
# Re-configurate input data by myself

In [6]:
# use train/test split with different random_state values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
X_train.shape, y_train.shape, X_test.shape, y_test.shape

# Stratified Cross Validation
cv = StratifiedShuffleSplit(n_splits=4, test_size=0.2)

In [7]:
print(X.info())
print(test_df.info())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 1 to 891
Data columns (total 29 columns):
Sex             891 non-null int64
Age             891 non-null float64
Fare            891 non-null float64
Embarked_C      891 non-null int64
Embarked_Q      891 non-null int64
Embarked_S      891 non-null int64
Title_Master    891 non-null int64
Title_Miss      891 non-null int64
Title_Mr        891 non-null int64
Title_Mrs       891 non-null int64
Title_Rare      891 non-null int64
Parch_0         891 non-null int64
Parch_1         891 non-null int64
Parch_2         891 non-null int64
Parch_3         891 non-null int64
Parch_4         891 non-null int64
Parch_5         891 non-null int64
Parch_6         891 non-null int64
Parch_9         891 non-null int64
SibSp_0         891 non-null int64
SibSp_1         891 non-null int64
SibSp_2         891 non-null int64
SibSp_3         891 non-null int64
SibSp_4         891 non-null int64
SibSp_5         891 non-null int64
SibSp_8         

# Generative Classification
Probabilistically determine the label from the features

## Gaussian

In [8]:
model = GaussianNB()

score = cross_val_score(model, X, y, cv=10, scoring='accuracy')
print(score.mean())
norm_save(GaussianNB(), "Gaussian")

0.407413744183


## Logistic Regression


In [9]:
model= LogisticRegression()
score = cross_val_score(model, X, y, cv=10, scoring='accuracy')
print(score.mean())
norm_save(LogisticRegression(), "Logistic_Regression")

0.820458234026


## Neural Net

### With ScikitLearn

In [10]:
from sklearn.neural_network import MLPClassifier

In [11]:
#?neural_network
MLPClassifier().get_params().keys()

dict_keys(['max_iter', 'shuffle', 'tol', 'batch_size', 'solver', 'beta_2', 'hidden_layer_sizes', 'random_state', 'learning_rate', 'momentum', 'epsilon', 'power_t', 'verbose', 'warm_start', 'validation_fraction', 'alpha', 'learning_rate_init', 'nesterovs_momentum', 'beta_1', 'early_stopping', 'activation'])

In [12]:
# Start with a RandomSearchCV to efficiently Narrow the Ballpark
param_grid ={'max_iter': np.logspace(1, 5, 5).astype("int32"),
             'hidden_layer_sizes': np.logspace(2, 3, 4).astype("int32"),}

model = MLPClassifier()

grid = RandomizedSearchCV(model,
                    param_grid, cv=cv, scoring='accuracy',
                    verbose=1)

grid.fit(X, y)
save(grid.best_estimator_, "RSNeural_Net")

Fitting 4 folds for each of 10 candidates, totalling 40 fits


[Parallel(n_jobs=1)]: Done  40 out of  40 | elapsed:   32.0s finished


Optimal Model CV Accuracy: 83.10055865921788
Optimal Model Parameters: {'max_iter': 1000, 'hidden_layer_sizes': 215}


# TensorFlow NN

# Non-Parametric

# Esemble Method

Means that a bunch of the model get created and are aggregated at the end for best performance.

## Bagging, Bootstrap

Aka Bootstrap- creates a bunch of trees using a random 3/4 the the data for each, while using sampling without replacement, which means that values may be sampled multiple times.

https://www.analyticsvidhya.com/blog/2015/06/tuning-random-forest-model/

HyperParameters:
- max_features: This is the random subset of features to be used for splitting node, the lower the better to reduce variance. For Classification model, ideal max_features = sqr(n_var)
- n_estimators: # of trees built before average prediciton is made
- min_sample_leaf: End node of trees. Too small = more noise. For Regression tree.
- n_jobs: computer processors utilized. -1 = no restrictions
- random_state: seed()

In [13]:
tree = DecisionTreeClassifier()
bag = BaggingClassifier(tree, n_estimators=300, max_samples=0.8,
                        random_state=1)

print(cross_val_score(bag, X, y, cv=10, scoring='accuracy').mean()*100)

82.8398876404


In [14]:
np.arange(20, 500, 25)

array([ 20,  45,  70,  95, 120, 145, 170, 195, 220, 245, 270, 295, 320,
       345, 370, 395, 420, 445, 470, 495])

In [15]:
param_grid ={'n_estimators': np.arange(20, 500, 25)}

tree = DecisionTreeClassifier()
#bag = BaggingClassifier(tree)

grid = RandomizedSearchCV(BaggingClassifier(tree),
                    param_grid, cv=cv, scoring='accuracy',
                    verbose=1)

grid.fit(X, y)
save(grid.best_estimator_, "Bagger_ensemble")

Fitting 4 folds for each of 10 candidates, totalling 40 fits


[Parallel(n_jobs=1)]: Done  40 out of  40 | elapsed:   14.4s finished


Optimal Model CV Accuracy: 83.37988826815642
Optimal Model Parameters: {'n_estimators': 245}


## Random Forest

Trees are created wih a randomly picked subset of observations and variables. More uncorrelated splits, less overemphasis on certain features.

In [16]:
model = RandomForestClassifier()
model.fit(X_train, y_train)
print(model.score(X_test, y_test))

0.793296089385


In [17]:
np.arange(6, 11, 1)

array([ 6,  7,  8,  9, 10])

In [18]:
RandomForestClassifier().get_params().keys()

dict_keys(['n_estimators', 'max_depth', 'oob_score', 'min_samples_split', 'min_weight_fraction_leaf', 'min_samples_leaf', 'class_weight', 'max_features', 'max_leaf_nodes', 'min_impurity_split', 'bootstrap', 'verbose', 'criterion', 'random_state', 'warm_start', 'n_jobs'])

In [19]:
param_grid ={'max_depth': np.arange(6, 11, 1),
             'n_estimators':np.arange(350, 450, 25),
             'max_features':np.arange(0.5,.81, 0.05),
            'max_leaf_nodes':np.arange(6, 10, 1)}
#param_grid ={'n_estimators':[200]}

from sklearn import feature_selection

#model = feature_selection.RFE(RandomForestClassifier())
model= RandomForestClassifier()

grid = RandomizedSearchCV(model,
                    param_grid, cv=cv,
                    scoring='accuracy',
                    verbose=1)

grid.fit(X, y)
save(grid.best_estimator_, "Random_Forest")

Fitting 4 folds for each of 10 candidates, totalling 40 fits


[Parallel(n_jobs=1)]: Done  40 out of  40 | elapsed:   16.5s finished


Optimal Model CV Accuracy: 82.68156424581005
Optimal Model Parameters: {'n_estimators': 425, 'max_depth': 7, 'max_features': 0.5, 'max_leaf_nodes': 8}


## Extremely Randomized Trees (ExtraTree)

## AdaBoostClassifier: Boosting Method

Method, similarly to deep learning, applies weights to all data points and optimizes them using the loss function. Fixes mistakes by assigning high weights to them during iterative process.

Iterates through multiple models in order to determine the best boundaries. It relies on using weak models to determine the pattern, and evantually creates a strong combination of them.

In [20]:
param_grid ={'n_estimators':np.arange(50, 301, 25),
            'learning_rate':np.arange(.1, 4, .5)}

grid = RandomizedSearchCV(AdaBoostClassifier(),
                    param_grid,cv=cv, scoring='accuracy',
                    verbose=1)

grid.fit(X, y);
save(grid.best_estimator_, "AdaBoost_Ensemble")

Fitting 4 folds for each of 10 candidates, totalling 40 fits


[Parallel(n_jobs=1)]: Done  40 out of  40 | elapsed:    7.0s finished


Optimal Model CV Accuracy: 82.12290502793296
Optimal Model Parameters: {'n_estimators': 125, 'learning_rate': 1.6000000000000001}


## Gradient Boosting Classifier

Part of the Generalized Boosting Algorithm family.

GB builds an additive model in a forward stage-wise fashion; it allows for the optimization of arbitrary differentiable loss functions.

Part of the generalized boosting algorithms. Can use more loss functions than AdaBoost, and uses gradients instead of high-weight data points.

In [21]:
#?GradientBoostingClassifier

In [22]:
param_grid ={'n_estimators':np.arange(100, 301, 25),
            'loss': ['deviance', 'exponential'],
            'learning_rate':np.arange(0.01, 0.32,.05),
            'max_depth': np.arange(2, 4.1, .5)}

grid = RandomizedSearchCV(GradientBoostingClassifier(),
                    param_grid,cv=cv,
                    scoring='accuracy',
                    verbose=1)

grid.fit(X, y)
save(grid.best_estimator_, "Gradient_Boosting")

Fitting 4 folds for each of 10 candidates, totalling 40 fits


[Parallel(n_jobs=1)]: Done  40 out of  40 | elapsed:    4.6s finished


Optimal Model CV Accuracy: 83.65921787709497
Optimal Model Parameters: {'n_estimators': 275, 'max_depth': 2.0, 'loss': 'deviance', 'learning_rate': 0.11}


## XGB eXtreme Gradient Boosting

Optimized Generalized Gradient Booster, developped in 2014, competetes will in Kaggle Competitions!

Install: https://www.ibm.com/developerworks/community/blogs/jfp/entry/Installing_XGBoost_For_Anaconda_on_Windows?lang=en

In [23]:
import xgboost as xgb
from xgboost.sklearn import XGBClassifier
num_rounds = 100
model = XGBClassifier(n_estimators = num_rounds,
                        objective= 'binary:logistic',
                        seed=2017)
# use early_stopping_rounds to stop the cv when there is no score imporovement
model.fit(X_train,y_train, early_stopping_rounds=20, eval_set=[(X_test,
y_test)], verbose=False)
score = cross_val_score(model, X_train,y_train, cv=cv)
print("\nxgBoost - CV Train : %.2f" % score.mean())
print("xgBoost - Train : %.2f" % metrics.accuracy_score(model.predict(X_train), y_train))
print("xgBoost - Test : %.2f" % metrics.accuracy_score(model.predict(X_test), y_test))

norm_save(model, "XGBsklearn")


xgBoost - CV Train : 0.81
xgBoost - Train : 0.87
xgBoost - Test : 0.83


In [24]:
xgtrain = xgb.DMatrix(X_train, label=y_train)
xgtest = xgb.DMatrix(X_test, label=y_test)

# set xgboost params
param = {'max_depth': 3,  # the maximum depth of each tree
         'objective': 'binary:logistic'}

clf_xgb_cv = xgb.cv(param, xgtrain, num_rounds, 
                    stratified=True, 
                    nfold=5, 
                    early_stopping_rounds=20)
print("Optimal number of trees/estimators is %i" % clf_xgb_cv.shape[0])

watchlist  = [(xgtest,'test'), (xgtrain,'train')]                
clf_xgb = xgb.train(param, xgtrain,clf_xgb_cv.shape[0], watchlist)

# predict function will produce the probability 
# so we'll use 0.5 cutoff to convert probability to class label
y_train_pred = (clf_xgb.predict(xgtrain, ntree_limit=clf_xgb.best_iteration) > 0.5).astype(int)
y_test_pred = (clf_xgb.predict(xgtest, ntree_limit=clf_xgb.best_iteration) > 0.5).astype(int)
score= metrics.accuracy_score(y_test_pred, y_test)
print("XGB - Train : %.2f" % metrics.accuracy_score(y_train_pred, y_train))
print("XGB - Test : %.2f" % score)
norm_save(model, "XGBstandard")

Optimal number of trees/estimators is 6
[0]	test-error:0.206704	train-error:0.158708
[1]	test-error:0.206704	train-error:0.158708
[2]	test-error:0.206704	train-error:0.158708
[3]	test-error:0.184358	train-error:0.151685
[4]	test-error:0.173184	train-error:0.150281
[5]	test-error:0.178771	train-error:0.143258
XGB - Train : 0.85
XGB - Test : 0.83


## KNN

In [25]:
param_grid ={'n_neighbors': np.arange(1,21,1),
            'weights':['uniform','distance']
            }

grid = GridSearchCV(KNeighborsClassifier(),
                    param_grid,cv=cv, scoring='accuracy',
                    verbose=1)

grid.fit(X, y)

save(grid.best_estimator_, "KNN")

Fitting 4 folds for each of 40 candidates, totalling 160 fits
Optimal Model CV Accuracy: 83.37988826815642
Optimal Model Parameters: {'weights': 'uniform', 'n_neighbors': 9}


[Parallel(n_jobs=1)]: Done 160 out of 160 | elapsed:    2.0s finished


# Discriminative Classification
Model new points by seeing where it falls upon a divide.
Fast prediction phase, work well in high dimensional data, versatile

Costly at high quantities of data

### Stochastic Gradient Descent

In [26]:
param_grid ={'loss':["hinge","log","modified_huber","squared_hinge","epsilon_insensitive","squared_epsilon_insensitive"]
            }

grid = GridSearchCV(SGDClassifier(),
                    param_grid,cv=cv, scoring='accuracy',
                    verbose=1)

grid.fit(X, y)
save(grid.best_estimator_, "StochasticGradientDescent")

Fitting 4 folds for each of 6 candidates, totalling 24 fits
Optimal Model CV Accuracy: 76.25698324022346
Optimal Model Parameters: {'loss': 'log'}


[Parallel(n_jobs=1)]: Done  24 out of  24 | elapsed:    0.0s finished


## Support Vector Classifier
Creates a linear divide between point to classify. Maximizes the distance of the discriminatory margin.

Hyperparameters:
- C: Hardness of the margin. Higher C, less softening.


Radial Basis Function (RBF)
- Gamma: how far the influence of a single training example raches. low=far, high=close, Inverse of the radius of influence of samples selected by the model as support vectors.

In [27]:
# Define Model
model = LinearSVC()
#Fit Model
scores= cross_val_score(model, X, y, cv=10, scoring='accuracy')
print(scores.mean()*100)

norm_save(model, "LinearSV")
#submit(svm.LinearSVC(), name="80linear_svc.csv")

82.717483827


### Radial Basis Function Kernel - SVC

In [28]:
param_grid = [
  {'C': np.arange(25,176,5),
   'gamma': np.logspace(1, -4, 10),
   'kernel': ['rbf'],
   "probability" : [True]}
 ]
model= SVC()
grid = GridSearchCV(model,
                    param_grid, cv=cv,
                    scoring='accuracy', verbose=1)

grid.fit(X, y)
save(grid.best_estimator_, "SVCrbf")

Fitting 4 folds for each of 310 candidates, totalling 1240 fits


[Parallel(n_jobs=1)]: Done 1240 out of 1240 | elapsed:  2.1min finished


Optimal Model CV Accuracy: 82.68156424581005
Optimal Model Parameters: {'C': 25, 'gamma': 0.016681005372000592, 'probability': True, 'kernel': 'rbf'}


### Linear SVC

In [29]:
param_grid = {'C': [1,10],'kernel':['linear'], "probability" : [True]}

model = SVC()
grid = GridSearchCV(model,
                    param_grid, cv=cv,
                    scoring='accuracy', verbose=1)

grid.fit(X, y)
save(grid.best_estimator_, "SVCLinear")

Fitting 4 folds for each of 2 candidates, totalling 8 fits
Optimal Model CV Accuracy: 82.54189944134079
Optimal Model Parameters: {'C': 1, 'kernel': 'linear', 'probability': True}


[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:    0.3s finished


## Pipeline: Principle Components Analysis and Support Vector Classifier

In [30]:
from sklearn.svm import SVC
from sklearn.decomposition import PCA
from sklearn.pipeline import make_pipeline
from sklearn.pipeline import Pipeline

pca = PCA()
svc = SVC(kernel= 'rbf')

model = Pipeline(steps=[('pca',pca),
                        ('svc', svc)])

param_grid = {'svc__C': np.logspace(-2, 5, 6),
              'svc__gamma': np.logspace(1, -7, 10),
             'pca__n_components': [10,15]}

grid = RandomizedSearchCV(model, param_grid,
                          cv=cv, verbose=1,
                         n_iter=20)

grid.fit(X, y)
save(grid.best_estimator_, "PCA_SVC")

Fitting 4 folds for each of 20 candidates, totalling 80 fits
Optimal Model CV Accuracy: 81.70391061452514
Optimal Model Parameters: {'svc__C': 100000.0, 'pca__n_components': 15, 'svc__gamma': 4.6415888336127818e-05}


[Parallel(n_jobs=1)]: Done  80 out of  80 | elapsed:    7.2s finished


## Results

In [31]:
results.to_csv("Titanic/results.csv",index_label=False)

In [32]:
results = pd.read_csv(open(os.path.join(path, "results.csv"), "r"))

In [35]:
results.sort_values(by=["Test_Score"], ascending=False)

Unnamed: 0,Model,Para,Test_Score
6,Gradient_Boosting,"{'n_estimators': 275, 'max_depth': 2.0, 'loss'...",83.659218
3,Bagger_ensemble,{'n_estimators': 245},83.379888
9,KNN,"{'weights': 'uniform', 'n_neighbors': 9}",83.379888
2,RSNeural_Net,"{'max_iter': 1000, 'hidden_layer_sizes': 215}",83.100559
4,Random_Forest,"{'n_estimators': 425, 'max_depth': 7, 'max_fea...",82.681564
8,XGBstandard,"XGBClassifier(base_score=0.5, booster='gbtree'...",82.681564
11,LinearSV,"LinearSVC(C=1.0, class_weight=None, dual=True,...",82.681564
12,SVCrbf,"{'C': 25, 'gamma': 0.016681005372000592, 'prob...",82.681564
13,SVCLinear,"{'C': 1, 'kernel': 'linear', 'probability': True}",82.541899
5,AdaBoost_Ensemble,"{'n_estimators': 125, 'learning_rate': 1.60000...",82.122905


## Model Ensemble

In [36]:
import matplotlib.pyplot as plt
%matplotlib inline

import pandas as pd
import numpy as np
import pickle

# set seed for reproducability
np.random.seed(2017)

import statsmodels.api as sm
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import GradientBoostingClassifier

# currently its available as part of mlxtend and not sklearn
from mlxtend.classifier import EnsembleVoteClassifier
from sklearn import cross_validation
from sklearn import metrics
from sklearn.cross_validation import train_test_split

## Voting

Hard- Mode
Soft- Probabilistic

In [37]:
dic = {}
for name in results["Model"]:
    open_file = open(os.path.join(path,"Pickle/{}.pickle".format(name)), "rb")
    dic[name] = pickle.load(open_file)
    open_file.close()

In [38]:
models= list(zip(dic.values(), dic.keys()))
clfs = []
print('5-fold cross validation:\n')
for clf, label in models:
    scores = cross_validation.cross_val_score(clf, X_train, y_train,cv=5, scoring='accuracy')
    print("Train CV Accuracy: %0.2f (+/- %0.2f) [%s]" % (scores.mean(), scores.std(), label))
    md = clf.fit(X, y)    
    clfs.append(md)
    print("Test Accuracy: %0.2f " % (metrics.accuracy_score(clf.predict(X_test), y_test)))

5-fold cross validation:

Train CV Accuracy: 0.83 (+/- 0.03) [SVCLinear]
Test Accuracy: 0.81 
Train CV Accuracy: 0.82 (+/- 0.03) [LinearSV]
Test Accuracy: 0.82 
Train CV Accuracy: 0.81 (+/- 0.01) [PCA_SVC]
Test Accuracy: 0.81 
Train CV Accuracy: 0.84 (+/- 0.02) [Gradient_Boosting]
Test Accuracy: 0.87 
Train CV Accuracy: 0.83 (+/- 0.02) [SVCrbf]
Test Accuracy: 0.82 
Train CV Accuracy: 0.82 (+/- 0.02) [KNN]
Test Accuracy: 0.84 
Train CV Accuracy: 0.83 (+/- 0.01) [XGBsklearn]
Test Accuracy: 0.86 
Train CV Accuracy: 0.82 (+/- 0.02) [RSNeural_Net]
Test Accuracy: 0.84 
Train CV Accuracy: 0.83 (+/- 0.01) [XGBstandard]
Test Accuracy: 0.86 
Train CV Accuracy: 0.79 (+/- 0.03) [StochasticGradientDescent]
Test Accuracy: 0.80 
Train CV Accuracy: 0.82 (+/- 0.02) [Logistic_Regression]
Test Accuracy: 0.82 
Train CV Accuracy: 0.83 (+/- 0.02) [AdaBoost_Ensemble]
Test Accuracy: 0.83 
Train CV Accuracy: 0.83 (+/- 0.03) [Bagger_ensemble]
Test Accuracy: 0.89 
Train CV Accuracy: 0.41 (+/- 0.02) [Gaussian]
Te

In [39]:
dic.keys()

dict_keys(['SVCLinear', 'LinearSV', 'PCA_SVC', 'Gradient_Boosting', 'SVCrbf', 'KNN', 'XGBsklearn', 'RSNeural_Net', 'XGBstandard', 'StochasticGradientDescent', 'Logistic_Regression', 'AdaBoost_Ensemble', 'Bagger_ensemble', 'Gaussian', 'Random_Forest'])

In [40]:
keys =['RSNeural_Net', 'Gradient_Boosting', 'SVCLinear', 'StochasticGradientDescent', 'SVCrbf',
                      #'LinearSV',
                      'AdaBoost_Ensemble', 'Random_Forest', 'XGBstandard',
                      #'PCA_SVC',
       'XGBsklearn', 'Bagger_ensemble', 'Gaussian',
                      'Logistic_Regression', 'KNN']

bestkeys =['Gradient_Boosting', 'SVCLinear','XGBstandard',
       'XGBsklearn', 'Bagger_ensemble']
soft= [dic.get(key) for key in bestkeys]

In [41]:
# ### Ensemble Voting
allmodel = [x for x in dic.values()]

# [LR, RF, SVMR, SVCLinear, KNC, GBC, ABC, BC,GAU]
# w = [0,1,2,1,1,1,1,3,0]

ECH = EnsembleVoteClassifier(allmodel, voting='hard')
ECS = EnsembleVoteClassifier(soft, voting='soft')

print('5-fold cross validation:\n')
for clf, label in zip([ECS, ECH], 
                      ['Ensemble Soft Voting',
                       'Ensemble Hard Voting']):
    scores = cross_validation.cross_val_score(clf, X_train, y_train, cv=5, scoring='accuracy')
    print("Train CV Accuracy: %0.2f (+/- %0.2f) [%s]" % (scores.mean(), scores.std(), label))
    md = clf.fit(X, y)    
    clfs.append(md)
    print("Test Accuracy: %0.2f " % (metrics.accuracy_score(clf.predict(X_test), y_test)))

5-fold cross validation:

Train CV Accuracy: 0.84 (+/- 0.02) [Ensemble Soft Voting]
Test Accuracy: 0.87 
Train CV Accuracy: 0.83 (+/- 0.02) [Ensemble Hard Voting]
Test Accuracy: 0.84 


In [42]:
ensembling(ECH,"Hard_ensemble1")
ensembling(ECS,"Soft_ensemble1")

418
418


## Stacked Generalization
Stacked generalized models

In [43]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler
from sklearn.cross_validation import train_test_split
from sklearn import cross_validation
from sklearn import metrics

from matplotlib.colors import ListedColormap
import matplotlib.pyplot as plt
%matplotlib inline

# Classifiers
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier

#np.random.seed(2017)  # seed to shuffle the train set

X = train_df.drop(["Survived"] , axis=1)
y = train_df["Survived"]

#test_df  = test_df.drop(["PassengerId"] , axis=1).copy()
print(X.shape, y.shape, test_df.shape)

#Normalize
X = StandardScaler().fit_transform(X)

# evaluate the model by splitting into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=2017)

kfold = cross_validation.StratifiedKFold(y=y_train, n_folds=5, random_state=2017)
num_trees = 10
verbose = True # to print the progress

clfs = [KNeighborsClassifier(),
        RandomForestClassifier(n_estimators=num_trees, random_state=2017),
        GradientBoostingClassifier(n_estimators=num_trees, random_state=2017)]

(891, 29) (891,) (418, 29)


In [44]:
# Creating train and test sets for blending
dataset_blend_train = np.zeros((X_train.shape[0], len(clfs)))
dataset_blend_test = np.zeros((X_test.shape[0], len(clfs)))
dataset_blend_test_df = np.zeros((test_df.shape[0], len(clfs)))

print('5-fold cross validation:\n')
for i, clf in enumerate(clfs):   
    scores = cross_validation.cross_val_score(clf, X_train, y_train, cv=kfold, scoring='accuracy')
    print("##### Base Model %0.0f #####" % i)
    print("Train CV Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std()))
    clf.fit(X_train, y_train)   
    print("Train Accuracy: %0.2f " % (metrics.accuracy_score(clf.predict(X_train), y_train)))
    dataset_blend_train[:,i] = clf.predict_proba(X_train)[:, 1]
    dataset_blend_test[:,i] = clf.predict_proba(X_test)[:, 1]
    dataset_blend_test_df[:,i] = clf.predict_proba(test_df)[:, 1]
    print("Test Accuracy: %0.2f " % (metrics.accuracy_score(clf.predict(X_test), y_test)))    

print("##### Meta Model #####")
clf = LogisticRegression()
scores = cross_validation.cross_val_score(clf, dataset_blend_train, y_train, cv=kfold, scoring='accuracy')
clf.fit(dataset_blend_train, y_train)
print("Train CV Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std()))
print("Train Accuracy: %0.2f " % (metrics.accuracy_score(clf.predict(dataset_blend_train), y_train)))
print("Test Accuracy: %0.2f " % (metrics.accuracy_score(clf.predict(dataset_blend_test), y_test)))

5-fold cross validation:

##### Base Model 0 #####
Train CV Accuracy: 0.79 (+/- 0.02)
Train Accuracy: 0.85 
Test Accuracy: 0.85 
##### Base Model 1 #####
Train CV Accuracy: 0.81 (+/- 0.02)
Train Accuracy: 0.92 
Test Accuracy: 0.85 
##### Base Model 2 #####
Train CV Accuracy: 0.82 (+/- 0.02)
Train Accuracy: 0.83 
Test Accuracy: 0.85 
##### Meta Model #####
Train CV Accuracy: 0.92 (+/- 0.01)
Train Accuracy: 0.92 
Test Accuracy: 0.84 


In [45]:
modelname= "stacked"
submission = clf.predict(dataset_blend_test_df)
df = pd.DataFrame({'PassengerId':test_df.index, 'Survived':submission})
results = results.append({'Model': modelname,'Para': clf, 'Test_Score':(metrics.accuracy_score(clf.predict(dataset_blend_test), y_test))*100}, ignore_index=True)
df.to_csv((os.path.join(path,("submissions/{}.csv".format(modelname)))),header=True,index=False)
with open((os.path.join(path,(r"Pickle/{}.pickle".format(modelname)))), 'wb') as f: pickle.dump(model, f)
len(clf.predict(dataset_blend_test_df))

418

In [46]:
results.sort_values(by=["Test_Score"], ascending=False)

Unnamed: 0,Model,Para,Test_Score
17,stacked,"LogisticRegression(C=1.0, class_weight=None, d...",84.357542
16,Soft_ensemble1,EnsembleVoteClassifier(clfs=[GradientBoostingC...,83.798883
15,Hard_ensemble1,"EnsembleVoteClassifier(clfs=[SVC(C=1, cache_si...",83.798883
6,Gradient_Boosting,"{'n_estimators': 275, 'max_depth': 2.0, 'loss'...",83.659218
9,KNN,"{'weights': 'uniform', 'n_neighbors': 9}",83.379888
3,Bagger_ensemble,{'n_estimators': 245},83.379888
2,RSNeural_Net,"{'max_iter': 1000, 'hidden_layer_sizes': 215}",83.100559
4,Random_Forest,"{'n_estimators': 425, 'max_depth': 7, 'max_fea...",82.681564
8,XGBstandard,"XGBClassifier(base_score=0.5, booster='gbtree'...",82.681564
11,LinearSV,"LinearSVC(C=1.0, class_weight=None, dual=True,...",82.681564


## Reflection on model

Ensemble models suggest that testing accuracy is in the high 80s, however when applied to Kaggle out of sample data, perfromance is consistently in the high 70s. This suggests that either the submission data is very different, or that my model is overfitting on the given data.

 - May want to explore models with a greater emphasis on randomness in order to tone down the overfitting.
 - Perhaps compare variable distribution between submission data and training data