# Supervised Classification with Titanic
### AKA "Going Wide"
Author: Nick Brooks

Date: Summer 2017

In [1]:
# data analysis and wrangling
import pandas as pd
import numpy as np
import random as rnd
import os

import pickle
import multiprocessing

# visualization
import seaborn as sns
import matplotlib.pyplot as plt
get_ipython().magic('matplotlib inline')

# Machine learning models
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, BaggingClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Perceptron, SGDClassifier
from sklearn.neural_network import MLPClassifier
from sklearn import feature_selection

import xgboost as xgb
from xgboost.sklearn import XGBClassifier

#Evalaluation
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.model_selection import cross_val_score

# Grid
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.feature_selection import RFE
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
import scipy.stats as st

# Pipeline
from sklearn.svm import SVC
from sklearn.decomposition import PCA
from sklearn.pipeline import make_pipeline
from sklearn.pipeline import Pipeline

# Esemble Voting
from mlxtend.classifier import EnsembleVoteClassifier
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report, accuracy_score

# Performance
%load_ext memory_profiler

# Stacking
from sklearn.preprocessing import StandardScaler
from sklearn import metrics
from matplotlib.colors import ListedColormap

# Warnings
import warnings
warnings.filterwarnings('ignore')

import time
start = time.time()

## Load

In [2]:
#os.chdir(r"C:\Users\Nicol\Google Drive\Learning\Jupyter\Classification and Regression\Titanic")
#os.getcwd()

In [3]:
## Master Parameters:
n_splits = 5
n_iter = 80

In [4]:
#Titanic
path = r"C:\Users\Nicol\Google Drive\Learning\Jupyter\Classification and Regression\Titanic"
#path = r"/Users/nicapotato/Google Drive/Learning/Jupyter/Titanic"

#train_df = pd.read_csv(open(os.path.join(path, "clean_train.csv"), "r")) 
#test_df = pd.read_csv(open(os.path.join(path, "clean_test.csv"), "r"))

#train_df = pd.read_csv(open(os.path.join(path, "clean_train2.csv"), "r")) 
#test_df = pd.read_csv(open(os.path.join(path, "clean_test2.csv"), "r")) 

train_df = pd.read_csv(open(os.path.join(path,
                "Data/clean_train_nick.csv"), "r"), index_col="PassengerId")
test_df = pd.read_csv(open(os.path.join(path,
                "Data/clean_test_nick.csv"), "r"), index_col="PassengerId") 

X = train_df.drop(["Survived"] , axis=1)
y = train_df["Survived"]

# use train/test split with different random_state values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
X_train.shape, y_train.shape, X_test.shape, y_test.shape

#test_df  = test_df.drop(["PassengerId"] , axis=1).copy()
print(X.shape, y.shape, test_df.shape)

results = pd.DataFrame(columns=['Model','Para','Test_Score','CV Mean','CV STDEV'])
ensemble_models= {}

def save(model, modelname):
    global results
    model.best_estimator_.fit(X, y)
    submission = model.predict(test_df)
    df = pd.DataFrame({'PassengerId':test_df.index, 
                           'Survived':submission})
    df.to_csv((os.path.join(path,("submissions/{}.csv".format(modelname)))),header=True,index=False)
    
    scores = cross_val_score(model.best_estimator_, X_train, y_train, cv=5, scoring='accuracy', verbose =0)
    CV_scores = scores.mean()
    STDev = scores.std()
    Test_scores = model.score(X_test, y_test)
    # print(metrics.accuracy_score(model.predict(X_test), y_test)) # Same

    # CV and Save scoress
    results = results.append({'Model': modelname,'Para': model.best_params_,'Test_Score': Test_scores,
                             'CV Mean':CV_scores, 'CV STDEV': STDev}, ignore_index=True)
    ensemble_models[modelname] = model.best_estimator_

    print("Train CV Accuracy: %0.2f (+/- %0.2f) [%s]" % (CV_scores, STDev, modelname))
    print("Optimal Model Parameters: {}".format(grid.best_params_))
    print('Test_Score:', Test_scores)

    # with open((os.path.join(path,(r"Pickle/{}.pickle".format(modelname)))), 'wb') as f: pickle.dump(model.best_estimator_, f)
        
def norm_save(model,score, modelname):
    global results
    model.fit(X, y)
    submission = model.predict(test_df)
    df = pd.DataFrame({'PassengerId':test_df.index, 
                           'Survived':submission})
    
    CV_Score = score.mean()
    Test_Score = model.score(X_test, y_test)
    STDev = score.std()
    
    # CV and Save Scores
    Test_Score = model.score(X_test, y_test)
    results = results.append({'Model': modelname,'Para': model,'Test_Score': Test_Score,
                             'CV Mean':CV_Score, 'CV STDEV': STDev}, ignore_index=True)
    ensemble_models[modelname] = model
    df.to_csv((os.path.join(path,("submissions/{}.csv".format(modelname)))),header=True,index=False)
    
    print("Train CV Accuracy: %0.2f (+/- %0.2f) [%s]" % (CV_Score, STDev, modelname))  
    print('Test_Score:', Test_Score)
    
    # with open((os.path.join(path,(r"Pickle/{}.pickle".format(modelname)))), 'wb') as f: pickle.dump(model, f)

(891, 8) (891,) (418, 8)


In [5]:
print(y.value_counts(normalize=True))
# Should Balance This DataSet through resampling

0    0.616162
1    0.383838
Name: Survived, dtype: float64


In [6]:
# Stratified Cross Validation
cv = StratifiedShuffleSplit(n_splits=n_splits, test_size=0.2)

In [7]:
print(X.info())
print(test_df.info())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 1 to 891
Data columns (total 8 columns):
Pclass      891 non-null int64
Sex         891 non-null int64
Age         891 non-null float64
SibSp       891 non-null int64
Parch       891 non-null int64
Fare        891 non-null float64
Embarked    891 non-null int64
Title       891 non-null int64
dtypes: float64(2), int64(6)
memory usage: 102.6 KB
None
<class 'pandas.core.frame.DataFrame'>
Int64Index: 418 entries, 892 to 1309
Data columns (total 8 columns):
Pclass      418 non-null int64
Sex         418 non-null int64
Age         418 non-null float64
SibSp       418 non-null int64
Parch       418 non-null int64
Fare        418 non-null float64
Embarked    418 non-null int64
Title       418 non-null int64
dtypes: float64(2), int64(6)
memory usage: 29.4 KB
None


# Generative Classification
Probabilistically determine the label from the features

## Gaussian

In [8]:
model = GaussianNB()

score = cross_val_score(model, X, y, cv=cv, scoring='accuracy')
norm_save(model,score, "Gaussian")

Train CV Accuracy: 0.77 (+/- 0.03) [Gaussian]
Test_Score: 0.821229050279


## Logistic Regression


In [9]:
model= LogisticRegression()
score = cross_val_score(model, X, y, cv=cv, scoring='accuracy')
norm_save(LogisticRegression(),score, "Logistic_Regression")

Train CV Accuracy: 0.78 (+/- 0.02) [Logistic_Regression]
Test_Score: 0.815642458101


## Neural Net

### With ScikitLearn

In [10]:
MLPClassifier().get_params().keys()

dict_keys(['epsilon', 'random_state', 'momentum', 'beta_1', 'beta_2', 'hidden_layer_sizes', 'early_stopping', 'learning_rate', 'tol', 'solver', 'activation', 'alpha', 'warm_start', 'learning_rate_init', 'max_iter', 'verbose', 'power_t', 'shuffle', 'nesterovs_momentum', 'validation_fraction', 'batch_size'])

In [11]:
# Start with a RandomSearchCV to efficiently Narrow the Ballpark
param_grid ={'max_iter': np.logspace(1, 5, 10).astype("int32"),
             'hidden_layer_sizes': np.logspace(2, 3, 4).astype("int32"),
             'activation':['identity', 'logistic', 'tanh', 'relu'],
             'learning_rate': ['adaptive'],
             'early_stopping': [True],
             'alpha': np.logspace(2, 3, 4).astype("int32")
            }

model = MLPClassifier()

grid = RandomizedSearchCV(model,
                    param_grid, cv=cv, scoring='accuracy',
                    verbose=1, n_iter=n_iter)

grid.fit(X, y)
save(grid, "RSNeural_Net")

Fitting 5 folds for each of 80 candidates, totalling 400 fits
Train CV Accuracy: 0.65 (+/- 0.03) [RSNeural_Net]
Optimal Model Parameters: {'hidden_layer_sizes': 464, 'early_stopping': True, 'learning_rate': 'adaptive', 'activation': 'identity', 'alpha': 100, 'max_iter': 4641}
Test_Score: 0.625698324022


[Parallel(n_jobs=1)]: Done 400 out of 400 | elapsed:   10.2s finished


# TensorFlow NN

# Non-Parametric

# Esemble Method

Means that a bunch of the model get created and are aggregated at the end for best performance.

## Bagging, Bootstrap

Aka Bootstrap- creates a bunch of trees using a random 3/4 the the data for each, while using sampling without replacement, which means that values may be sampled multiple times.

https://www.analyticsvidhya.com/blog/2015/06/tuning-random-forest-model/

HyperParameters:
- max_features: This is the random subset of features to be used for splitting node, the lower the better to reduce variance. For Classification model, ideal max_features = sqr(n_var)
- n_estimators: # of trees built before average prediciton is made
- min_sample_leaf: End node of trees. Too small = more noise. For Regression tree.
- n_jobs: computer processors utilized. -1 = no restrictions
- random_state: seed()

In [12]:
tree = DecisionTreeClassifier()
bag = BaggingClassifier(tree, n_estimators=300, max_samples=0.8,
                        random_state=1)

print(cross_val_score(bag, X, y, cv=10, scoring='accuracy').mean()*100)

82.5002837362


In [13]:
np.arange(20, 500, 25)

array([ 20,  45,  70,  95, 120, 145, 170, 195, 220, 245, 270, 295, 320,
       345, 370, 395, 420, 445, 470, 495])

In [14]:
param_grid ={'n_estimators': st.randint(20, 500)}

tree = DecisionTreeClassifier()
#bag = BaggingClassifier(tree)

grid = RandomizedSearchCV(BaggingClassifier(tree),
                    param_grid, cv=cv, scoring='accuracy',
                    verbose=1,n_iter=n_iter)

grid.fit(X, y)
save(grid, "Bagger_ensemble")

Fitting 5 folds for each of 80 candidates, totalling 400 fits


[Parallel(n_jobs=1)]: Done 400 out of 400 | elapsed:  2.2min finished


Train CV Accuracy: 0.81 (+/- 0.02) [Bagger_ensemble]
Optimal Model Parameters: {'n_estimators': 226}
Test_Score: 0.938547486034


## Random Forest

Trees are created wih a randomly picked subset of observations and variables. More uncorrelated splits, less overemphasis on certain features.

In [15]:
model = RandomForestClassifier()
model.fit(X_train, y_train)
print(model.score(X_test, y_test))

0.854748603352


In [16]:
RandomForestClassifier().get_params().keys()

dict_keys(['n_jobs', 'n_estimators', 'random_state', 'class_weight', 'min_impurity_split', 'verbose', 'min_samples_leaf', 'min_weight_fraction_leaf', 'warm_start', 'max_depth', 'oob_score', 'criterion', 'min_impurity_decrease', 'bootstrap', 'min_samples_split', 'max_features', 'max_leaf_nodes'])

In [17]:
param_grid ={'max_depth': st.randint(6, 11),
             'n_estimators':st.randint(300, 500),
             'max_features':np.arange(0.5,.81, 0.05),
            'max_leaf_nodes':st.randint(6, 10)}
#param_grid ={'n_estimators':[200]}

#model = feature_selection.RFE(RandomForestClassifier())
model= RandomForestClassifier()

grid = RandomizedSearchCV(model,
                    param_grid, cv=cv,
                    scoring='accuracy',
                    verbose=1,n_iter=n_iter)

grid.fit(X, y)
save(grid, "Random_Forest")

Fitting 5 folds for each of 80 candidates, totalling 400 fits


[Parallel(n_jobs=1)]: Done 400 out of 400 | elapsed:  2.6min finished


Train CV Accuracy: 0.81 (+/- 0.04) [Random_Forest]
Optimal Model Parameters: {'max_depth': 9, 'max_features': 0.65000000000000013, 'max_leaf_nodes': 7, 'n_estimators': 300}
Test_Score: 0.837988826816


## Extremely Randomized Trees (ExtraTree)

## AdaBoostClassifier: Boosting Method

Method, similarly to deep learning, applies weights to all data points and optimizes them using the loss function. Fixes mistakes by assigning high weights to them during iterative process.

Iterates through multiple models in order to determine the best boundaries. It relies on using weak models to determine the pattern, and evantually creates a strong combination of them.

In [18]:
param_grid ={'n_estimators':st.randint(50, 400),
            'learning_rate':np.arange(.1, 4, .5)}

grid = RandomizedSearchCV(AdaBoostClassifier(),
                    param_grid,cv=cv, scoring='accuracy',
                    verbose=1, n_iter=n_iter)

grid.fit(X, y);
save(grid, "AdaBoost_Ensemble")

Fitting 5 folds for each of 80 candidates, totalling 400 fits


[Parallel(n_jobs=1)]: Done 400 out of 400 | elapsed:  1.6min finished


Train CV Accuracy: 0.81 (+/- 0.01) [AdaBoost_Ensemble]
Optimal Model Parameters: {'learning_rate': 1.6000000000000001, 'n_estimators': 81}
Test_Score: 0.837988826816


## Gradient Boosting Classifier

Part of the Generalized Boosting Algorithm family.

GB builds an additive model in a forward stage-wise fashion; it allows for the optimization of arbitrary differentiable loss functions.

Part of the generalized boosting algorithms. Can use more loss functions than AdaBoost, and uses gradients instead of high-weight data points.

In [19]:
#?GradientBoostingClassifier

In [20]:
param_grid ={'n_estimators':st.randint(100, 400),
            'loss': ['deviance', 'exponential'],
            'learning_rate':np.arange(0.01, 0.32,.05),
            'max_depth': np.arange(2, 4.1, .5)}

grid = RandomizedSearchCV(GradientBoostingClassifier(),
                    param_grid,cv=cv,
                    scoring='accuracy',
                    verbose=1, n_iter=n_iter)

grid.fit(X, y)
save(grid, "Gradient_Boosting")

Fitting 5 folds for each of 80 candidates, totalling 400 fits


[Parallel(n_jobs=1)]: Done 400 out of 400 | elapsed:   42.6s finished


Train CV Accuracy: 0.81 (+/- 0.03) [Gradient_Boosting]
Optimal Model Parameters: {'learning_rate': 0.16000000000000003, 'max_depth': 3.5, 'n_estimators': 237, 'loss': 'deviance'}
Test_Score: 0.921787709497


## XGB - eXtreme Gradient Boosting

Optimized Generalized Gradient Booster, developped in 2014, competetes will in Kaggle Competitions!

Install: https://www.ibm.com/developerworks/community/blogs/jfp/entry/Installing_XGBoost_For_Anaconda_on_Windows?lang=en

In [21]:
# import os
# mingw_path = 'C:\\Program Files\\mingw-w64\\x86_64-5.3.0-posix-seh-rt_v4-rev0\\mingw64\\bin'
# os.environ['PATH'] = mingw_path + ';' + os.environ['PATH']
# import xgboost as xgb
# from xgboost.sklearn import XGBClassifier

In [22]:
# param_grid = {'max_depth': st.randint(1, 5),  # the maximum depth of each tree
#          'objective': 'binary:logistic'}

# model = XGBClassifier(n_estimators = num_rounds,
#                         objective= 'binary:logistic')

# grid = RandomizedSearchCV(model,
#                     param_grid,cv=cv,
#                     scoring='accuracy',
#                     verbose=1, n_iter=n_iter)
# grid.fit(X_train,y_train, early_stopping_rounds=20, eval_set=[(X_test,
# y_test)], verbose=False)
# #save(grid, "Gradient_Boosting")

In [23]:
XGBClassifier().get_params().keys()

dict_keys(['objective', 'n_jobs', 'colsample_bylevel', 'seed', 'colsample_bytree', 'random_state', 'learning_rate', 'n_estimators', 'missing', 'scale_pos_weight', 'min_child_weight', 'max_depth', 'reg_lambda', 'booster', 'max_delta_step', 'silent', 'subsample', 'reg_alpha', 'gamma', 'base_score', 'nthread'])

In [24]:
st.randint(3, 100)
st.uniform.cdf([0, 1, 2, 3, 4, 5], loc=1, scale=4)

array([ 0.  ,  0.  ,  0.25,  0.5 ,  0.75,  1.  ])

In [25]:
num_rounds = 100
one_to_left = st.beta(10, 1)  
from_zero_positive = st.expon(0, 50)

params = {  
    "n_estimators": st.randint(3, 100),
    "max_depth": st.randint(3, 40),
    "learning_rate": st.uniform(0.05, 0.4),
    "colsample_bytree": one_to_left,
    "subsample": one_to_left,
    "gamma": st.uniform(0, 10),
    'reg_alpha': from_zero_positive,
    "min_child_weight": from_zero_positive,
}

xgbreg = XGBClassifier(n_estimators = num_rounds,
                        objective= 'binary:logistic',
                       nthreads=-1)

grid = RandomizedSearchCV(xgbreg, params, n_jobs=1, verbose=1)  
grid.fit(X_train,y_train, verbose=False)
save(grid, "Sci_kit XGB")

Fitting 3 folds for each of 10 candidates, totalling 30 fits
Train CV Accuracy: 0.79 (+/- 0.04) [Sci_kit XGB]
Optimal Model Parameters: {'max_depth': 28, 'subsample': 0.90555175643033681, 'colsample_bytree': 0.98352497008302076, 'learning_rate': 0.22091698571780871, 'n_estimators': 28, 'reg_alpha': 4.4269488668367458, 'gamma': 1.7821128500742378, 'min_child_weight': 24.30807238193173}
Test_Score: 0.832402234637


[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:    0.2s finished


In [26]:
model = XGBClassifier(n_estimators = num_rounds,
                        objective= 'binary:logistic')

# use early_stopping_rounds to stop the cv when there is no score imporovement
model.fit(X_train,y_train, early_stopping_rounds=20, eval_set=[(X_test,
y_test)], verbose=False)
score = cross_val_score(model, X_train,y_train, cv=cv)
print("\nxgBoost - CV Train : %.2f" % score.mean())
print("xgBoost - Train : %.2f" % metrics.accuracy_score(model.predict(X_train), y_train))
print("xgBoost - Test : %.2f" % metrics.accuracy_score(model.predict(X_test), y_test))

norm_save(model,score, "XGBsklearn")


xgBoost - CV Train : 0.80
xgBoost - Train : 0.86
xgBoost - Test : 0.84
Train CV Accuracy: 0.80 (+/- 0.03) [XGBsklearn]
Test_Score: 0.860335195531


In [27]:
num_rounds = 100
xgtrain = xgb.DMatrix(X_train, label=y_train)
xgtest = xgb.DMatrix(X_test, label=y_test)

# set xgboost params
param = {'max_depth': 3,  # the maximum depth of each tree
         'objective': 'binary:logistic'}

clf_xgb_cv = xgb.cv(param, xgtrain, num_rounds, 
                    stratified=True, 
                    nfold=n_splits, 
                    early_stopping_rounds=20)
print("Optimal number of trees/estimators is %i" % clf_xgb_cv.shape[0])

watchlist  = [(xgtest,'test'), (xgtrain,'train')]                
clf_xgb = xgb.train(param, xgtrain,clf_xgb_cv.shape[0], watchlist)

# predict function will produce the probability 
# so we'll use 0.5 cutoff to convert probability to class label
y_train_pred = (clf_xgb.predict(xgtrain, ntree_limit=clf_xgb.best_iteration) > 0.5).astype(int)
y_test_pred = (clf_xgb.predict(xgtest, ntree_limit=clf_xgb.best_iteration) > 0.5).astype(int)
score= metrics.accuracy_score(y_test_pred, y_test)
print("XGB - Train : %.2f" % score)
print("XGB - Test : %.2f" % metrics.accuracy_score(y_train_pred, y_train))
norm_save(model,score, "XGBstandard")

Optimal number of trees/estimators is 6
[0]	test-error:0.201117	train-error:0.16573
[1]	test-error:0.206704	train-error:0.164326
[2]	test-error:0.206704	train-error:0.164326
[3]	test-error:0.206704	train-error:0.164326
[4]	test-error:0.167598	train-error:0.157303
[5]	test-error:0.167598	train-error:0.160112
XGB - Train : 0.83
XGB - Test : 0.84
Train CV Accuracy: 0.83 (+/- 0.00) [XGBstandard]
Test_Score: 0.860335195531


## KNN

In [28]:
param_grid ={'n_neighbors': st.randint(1,40),
            'weights':['uniform','distance']
            }

grid = RandomizedSearchCV(KNeighborsClassifier(),
                    param_grid,cv=cv, scoring='accuracy',
                    verbose=1, n_iter=n_iter)

grid.fit(X, y)

save(grid, "KNN")

Fitting 5 folds for each of 80 candidates, totalling 400 fits
Train CV Accuracy: 0.81 (+/- 0.03) [KNN]
Optimal Model Parameters: {'n_neighbors': 6, 'weights': 'uniform'}
Test_Score: 0.854748603352


[Parallel(n_jobs=1)]: Done 400 out of 400 | elapsed:    3.3s finished


# Discriminative Classification
Model new points by seeing where it falls upon a divide.
Fast prediction phase, work well in high dimensional data, versatile

Costly at high quantities of data

### Stochastic Gradient Descent

In [29]:
SGDClassifier().get_params().keys()

dict_keys(['epsilon', 'n_jobs', 'eta0', 'fit_intercept', 'warm_start', 'n_iter', 'random_state', 'learning_rate', 'verbose', 'alpha', 'average', 'max_iter', 'class_weight', 'power_t', 'penalty', 'shuffle', 'l1_ratio', 'tol', 'loss'])

In [30]:
param_grid ={'loss':["hinge","log","modified_huber","squared_hinge","epsilon_insensitive","squared_epsilon_insensitive"]
            }

grid = GridSearchCV(SGDClassifier(),
                    param_grid,cv=cv, scoring='accuracy',
                    verbose=1)

grid.fit(X, y)
save(grid, "StochasticGradientDescent")

Fitting 5 folds for each of 6 candidates, totalling 30 fits
Train CV Accuracy: 0.74 (+/- 0.07) [StochasticGradientDescent]
Optimal Model Parameters: {'loss': 'squared_hinge'}
Test_Score: 0.72625698324


[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:    0.0s finished


## Support Vector Classifier
Creates a linear divide between point to classify. Maximizes the distance of the discriminatory margin.

Hyperparameters:
- C: Hardness of the margin. Higher C, less softening.


Radial Basis Function (RBF)
- Gamma: how far the influence of a single training example raches. low=far, high=close, Inverse of the radius of influence of samples selected by the model as support vectors.

In [31]:
LinearSVC().get_params().keys()

dict_keys(['dual', 'random_state', 'class_weight', 'verbose', 'max_iter', 'intercept_scaling', 'penalty', 'C', 'fit_intercept', 'tol', 'loss', 'multi_class'])

In [32]:
# Define Model
model = LinearSVC()
#Fit Model
scores= cross_val_score(model, X, y, cv=cv, scoring='accuracy')
norm_save(model, scores, "LinearSV")

Train CV Accuracy: 0.80 (+/- 0.03) [LinearSV]
Test_Score: 0.815642458101


### Radial Basis Function Kernel - SVC

In [33]:
SVC().get_params().keys()

dict_keys(['probability', 'degree', 'random_state', 'class_weight', 'verbose', 'gamma', 'max_iter', 'decision_function_shape', 'cache_size', 'shrinking', 'C', 'coef0', 'tol', 'kernel'])

In [34]:
svc = SVC(kernel= 'rbf', probability=True)

model = Pipeline(steps=[('svc', svc)])


param_grid = {'svc__C': st.randint(1,10000),
              'svc__gamma': np.logspace(1, -7, 10)}

grid = RandomizedSearchCV(model, param_grid,
                          cv=cv, verbose=1, scoring='accuracy',
                         n_iter=n_iter)

grid.fit(X, y)
save(grid, "SVCrbf")

Fitting 5 folds for each of 80 candidates, totalling 400 fits


[Parallel(n_jobs=1)]: Done 400 out of 400 | elapsed:  2.2min finished


Train CV Accuracy: 0.82 (+/- 0.02) [SVCrbf]
Optimal Model Parameters: {'svc__C': 5364, 'svc__gamma': 0.0027825594022071257}
Test_Score: 0.837988826816


### Linear SVC

In [35]:
# param_grid = {'C':st.randint(1,40),'kernel':['linear'], "probability" : [True]}

# model = SVC()
# grid = RandomizedSearchCV(model,
#                     param_grid, cv=cv,
#                     scoring='accuracy', verbose=1)

# grid.fit(X, y)
# save(grid, "SVCLinear")

## Pipeline: Principle Components Analysis and Support Vector Classifier

In [36]:
from sklearn.svm import SVC
from sklearn.decomposition import PCA
from sklearn.pipeline import make_pipeline
from sklearn.pipeline import Pipeline

pca = PCA()
svc = SVC(kernel= 'rbf',probability=True)

model = Pipeline(steps=[('pca',pca),
                        ('svc', svc)])


param_grid = {'svc__C': st.randint(1,10000),
              'svc__gamma': np.logspace(1, -7, 10),
             'pca__n_components': st.randint(1,len(X.columns))}

grid = RandomizedSearchCV(model, param_grid,
                          cv=cv, verbose=1,
                         n_iter=n_iter)

grid.fit(X, y)
save(grid, "PCA_SVC")

Fitting 5 folds for each of 80 candidates, totalling 400 fits


[Parallel(n_jobs=1)]: Done 400 out of 400 | elapsed:  3.5min finished


Train CV Accuracy: 0.82 (+/- 0.02) [PCA_SVC]
Optimal Model Parameters: {'pca__n_components': 6, 'svc__C': 4902, 'svc__gamma': 0.0027825594022071257}
Test_Score: 0.832402234637


## Results

In [37]:
results.to_csv("Titanic/results.csv",index_label=False)

In [38]:
results = pd.read_csv(open(os.path.join(path, "results.csv"), "r"))

In [39]:
results = results.sort_values(by=["Test_Score"], ascending=False)
results

Unnamed: 0,Model,Para,Test_Score,CV Mean,CV STDEV
3,Bagger_ensemble,{'n_estimators': 226},0.938547,0.813237,0.021745
6,Gradient_Boosting,"{'learning_rate': 0.16000000000000003, 'max_de...",0.921788,0.809022,0.034348
8,XGBsklearn,"XGBClassifier(base_score=0.5, booster='gbtree'...",0.860335,0.797203,0.025407
9,XGBstandard,"XGBClassifier(base_score=0.5, booster='gbtree'...",0.860335,0.832402,0.0
10,KNN,"{'n_neighbors': 6, 'weights': 'uniform'}",0.854749,0.811859,0.032075
4,Random_Forest,"{'max_depth': 9, 'max_features': 0.65000000000...",0.837989,0.813208,0.039981
5,AdaBoost_Ensemble,"{'learning_rate': 1.6000000000000001, 'n_estim...",0.837989,0.810401,0.012438
13,SVCrbf,"{'svc__C': 5364, 'svc__gamma': 0.0027825594022...",0.837989,0.818832,0.022658
7,Sci_kit XGB,"{'max_depth': 28, 'subsample': 0.9055517564303...",0.832402,0.786506,0.044517
14,PCA_SVC,"{'pca__n_components': 6, 'svc__C': 4902, 'svc_...",0.832402,0.821639,0.017969


## Model Ensemble

## Voting

Hard- Mode
Soft- Probabilistic

In [40]:
# dic = {}
# for name in results["Model"]:
#     open_file = open(os.path.join(path,"Pickle/{}.pickle".format(name)), "rb")
#     dic[name] = pickle.load(open_file)
#     open_file.close()

In [41]:
models = list(zip(ensemble_models.values(), ensemble_models.keys()))
# results[["Best_estimator","Model"]].apply(tuple, axis=1)
# zip(results["Best_estimator"],results["Model"]) 

clfs = []
print('5-fold cross validation:\n')
for clf, label in models:
    scores = cross_val_score(clf, X_train, y_train,cv=5, scoring='accuracy', verbose=0)
    print("Train CV Accuracy: %0.2f (+/- %0.2f) [%s]" % (scores.mean(), scores.std(), label))
    md = clf.fit(X, y)    
    clfs.append(md)
    print("Test Accuracy: %0.2f " % (metrics.accuracy_score(clf.predict(X_test), y_test)))

5-fold cross validation:

Train CV Accuracy: 0.82 (+/- 0.02) [PCA_SVC]
Test Accuracy: 0.83 
Train CV Accuracy: 0.81 (+/- 0.02) [Bagger_ensemble]
Test Accuracy: 0.94 
Train CV Accuracy: 0.82 (+/- 0.04) [Random_Forest]
Test Accuracy: 0.84 
Train CV Accuracy: 0.81 (+/- 0.03) [Gradient_Boosting]
Test Accuracy: 0.92 
Train CV Accuracy: 0.81 (+/- 0.03) [XGBsklearn]
Test Accuracy: 0.86 
Train CV Accuracy: 0.81 (+/- 0.01) [AdaBoost_Ensemble]
Test Accuracy: 0.84 
Train CV Accuracy: 0.65 (+/- 0.03) [RSNeural_Net]
Test Accuracy: 0.73 
Train CV Accuracy: 0.77 (+/- 0.03) [LinearSV]
Test Accuracy: 0.82 
Train CV Accuracy: 0.79 (+/- 0.04) [Sci_kit XGB]
Test Accuracy: 0.83 
Train CV Accuracy: 0.81 (+/- 0.03) [XGBstandard]
Test Accuracy: 0.86 
Train CV Accuracy: 0.81 (+/- 0.03) [KNN]
Test Accuracy: 0.85 
Train CV Accuracy: 0.78 (+/- 0.03) [Logistic_Regression]
Test Accuracy: 0.82 
Train CV Accuracy: 0.74 (+/- 0.05) [StochasticGradientDescent]
Test Accuracy: 0.75 
Train CV Accuracy: 0.76 (+/- 0.04) [Gau

In [42]:
def ensembling(model, modelname):
    global results
    model.fit(X, y)
    submission = model.predict(test_df)
    df = pd.DataFrame({'PassengerId':test_df.index, 
                           'Survived':submission})
    results = results.append({'Model': modelname,'Para': model, 'CV Mean': None,
            'Test_Score':metrics.accuracy_score(clf.predict(X_test), y_test)}, ignore_index=True)
    ensemble_models[modelname] = model
    print(len(df))
    df.to_csv((os.path.join(path,(r"submissions/{}.csv".format(modelname)))),header=True,index=False)
    # with open((os.path.join(path,(r"Pickle/{}.pickle".format(modelname)))), 'wb') as f: pickle.dump(model, f)

In [43]:
## Add all models above X accuracy

In [44]:
results

Unnamed: 0,Model,Para,Test_Score,CV Mean,CV STDEV
3,Bagger_ensemble,{'n_estimators': 226},0.938547,0.813237,0.021745
6,Gradient_Boosting,"{'learning_rate': 0.16000000000000003, 'max_de...",0.921788,0.809022,0.034348
8,XGBsklearn,"XGBClassifier(base_score=0.5, booster='gbtree'...",0.860335,0.797203,0.025407
9,XGBstandard,"XGBClassifier(base_score=0.5, booster='gbtree'...",0.860335,0.832402,0.0
10,KNN,"{'n_neighbors': 6, 'weights': 'uniform'}",0.854749,0.811859,0.032075
4,Random_Forest,"{'max_depth': 9, 'max_features': 0.65000000000...",0.837989,0.813208,0.039981
5,AdaBoost_Ensemble,"{'learning_rate': 1.6000000000000001, 'n_estim...",0.837989,0.810401,0.012438
13,SVCrbf,"{'svc__C': 5364, 'svc__gamma': 0.0027825594022...",0.837989,0.818832,0.022658
7,Sci_kit XGB,"{'max_depth': 28, 'subsample': 0.9055517564303...",0.832402,0.786506,0.044517
14,PCA_SVC,"{'pca__n_components': 6, 'svc__C': 4902, 'svc_...",0.832402,0.821639,0.017969


In [45]:
prob_models = results[results.Model != 'LinearSV']

In [46]:
# results = results.sort_values(by=["Test_Score"], ascending=False)

# keys = results.Model[:10]
# allmodel= [ensemble_models.get(key) for key in results.Model[:10]]


# soft= [ensemble_models.get(key) for key in results.Model[:7]]


# keys =['RSNeural_Net', 'Gradient_Boosting', 'SVCLinear', 'StochasticGradientDescent', 'SVCrbf',
#                       #'LinearSV',
#                       'AdaBoost_Ensemble', 'Random_Forest', 'XGBstandard',
#                       #'PCA_SVC',
#        'XGBsklearn', 'Bagger_ensemble', 'Gaussian',
#                       'Logistic_Regression', 'KNN']

# keys = results.Model[:10]

# bestkeys =['Gradient_Boosting', 'SVCLinear','XGBstandard',
#        'XGBsklearn', 'Bagger_ensemble']

# bestkeys = results.Model[:10]
# for x in [""]
# del ensemble_models[PCA_SVC]

# soft= [ensemble_models.get(key) for key in bestkeys]

In [47]:
# ### Ensemble Voting
# w = [0,1,2,1,1,1,1,3,0]
hard_models = results
prob_models = results[results.Model != 'LinearSV']

for x in [2,3,5,7,10]:
    ECH = EnsembleVoteClassifier([ensemble_models.get(key) for key in hard_models.Model[:x]], voting='hard')
    ECS = EnsembleVoteClassifier([ensemble_models.get(key) for key in prob_models.Model[:x]], voting='soft')
    print('\n')
    print('{}-Voting Models: 5-fold cross validation:\n'.format(x))
    
    for clf, label in zip([ECS, ECH], 
                          ['{}-VM-Ensemble Soft Voting'.format(x),
                           '{}-VM-Ensemble Hard Voting'.format(x)]):
        scores = cross_val_score(clf, X_train, y_train, cv=5, scoring='accuracy')
        print("Train CV Accuracy: %0.2f (+/- %0.2f) [%s]" % (scores.mean(), scores.std(), label))
        md = clf.fit(X, y)    
        clfs.append(md)
        
        Test_Score = metrics.accuracy_score(clf.predict(X_test), y_test)
        print("Test Accuracy: %0.2f " % Test_Score)
        
        CV_Score = scores.mean()
        STDev = scores.std()
        
        submission = md.predict(test_df)
        df = pd.DataFrame({'PassengerId':test_df.index, 
                               'Survived':submission})
        global results
        results = results.append({'Model': label,'Para': clf, 'CV Mean': CV_Score,
                'Test_Score':Test_Score,'CV STDEV': STDev}, ignore_index=True)
        ensemble_models[label] = model
        df.to_csv((os.path.join(path,(r"submissions/{}.csv".format(label)))),header=True,index=False)



2-Voting Models: 5-fold cross validation:

Train CV Accuracy: 0.81 (+/- 0.02) [2-VM-Ensemble Soft Voting]
Test Accuracy: 0.93 
Train CV Accuracy: 0.81 (+/- 0.03) [2-VM-Ensemble Hard Voting]
Test Accuracy: 0.93 


3-Voting Models: 5-fold cross validation:

Train CV Accuracy: 0.81 (+/- 0.02) [3-VM-Ensemble Soft Voting]
Test Accuracy: 0.92 
Train CV Accuracy: 0.81 (+/- 0.03) [3-VM-Ensemble Hard Voting]
Test Accuracy: 0.92 


5-Voting Models: 5-fold cross validation:

Train CV Accuracy: 0.82 (+/- 0.02) [5-VM-Ensemble Soft Voting]
Test Accuracy: 0.88 
Train CV Accuracy: 0.81 (+/- 0.02) [5-VM-Ensemble Hard Voting]
Test Accuracy: 0.89 


7-Voting Models: 5-fold cross validation:

Train CV Accuracy: 0.82 (+/- 0.02) [7-VM-Ensemble Soft Voting]
Test Accuracy: 0.88 
Train CV Accuracy: 0.82 (+/- 0.02) [7-VM-Ensemble Hard Voting]
Test Accuracy: 0.86 


10-Voting Models: 5-fold cross validation:

Train CV Accuracy: 0.81 (+/- 0.02) [10-VM-Ensemble Soft Voting]
Test Accuracy: 0.87 
Train CV Accuracy

In [48]:
results.sort_values(by=["Test_Score"], ascending=False)

Unnamed: 0,Model,Para,Test_Score,CV Mean,CV STDEV
0,Bagger_ensemble,{'n_estimators': 226},0.938547,0.813237,0.021745
16,2-VM-Ensemble Hard Voting,EnsembleVoteClassifier(clfs=[BaggingClassifier...,0.932961,0.811829,0.0284
15,2-VM-Ensemble Soft Voting,EnsembleVoteClassifier(clfs=[BaggingClassifier...,0.927374,0.814616,0.024504
1,Gradient_Boosting,"{'learning_rate': 0.16000000000000003, 'max_de...",0.921788,0.809022,0.034348
17,3-VM-Ensemble Soft Voting,EnsembleVoteClassifier(clfs=[BaggingClassifier...,0.921788,0.814626,0.024441
18,3-VM-Ensemble Hard Voting,EnsembleVoteClassifier(clfs=[BaggingClassifier...,0.916201,0.814607,0.027604
20,5-VM-Ensemble Hard Voting,EnsembleVoteClassifier(clfs=[BaggingClassifier...,0.893855,0.813208,0.023742
21,7-VM-Ensemble Soft Voting,EnsembleVoteClassifier(clfs=[BaggingClassifier...,0.882682,0.821649,0.02095
19,5-VM-Ensemble Soft Voting,EnsembleVoteClassifier(clfs=[BaggingClassifier...,0.882682,0.823057,0.022203
23,10-VM-Ensemble Soft Voting,EnsembleVoteClassifier(clfs=[BaggingClassifier...,0.865922,0.813228,0.017787


## Stacked Generalization
Stacked generalized models

In [71]:
from sklearn import cross_validation

X = train_df.drop(["Survived"] , axis=1)
y = train_df["Survived"]

#test_df  = test_df.drop(["PassengerId"] , axis=1).copy()
print(X.shape, y.shape, test_df.shape)

#Normalize
X = StandardScaler().fit_transform(X)

# evaluate the model by splitting into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=2017)

kfold = cross_validation.StratifiedKFold(y=y_train, n_folds=5, random_state=2017)
num_trees = 10
verbose = True # to print the progress

clfs = [KNeighborsClassifier(),
        RandomForestClassifier(n_estimators=num_trees, random_state=2017),
        GradientBoostingClassifier(n_estimators=num_trees, random_state=2017)]

# Creating train and test sets for blending
dataset_blend_train = np.zeros((X_train.shape[0], len(clfs)))
dataset_blend_test = np.zeros((X_test.shape[0], len(clfs)))
dataset_blend_test_df = np.zeros((test_df.shape[0], len(clfs)))

print('5-fold cross validation:\n')
for i, clf in enumerate(clfs):   
    scores = cross_validation.cross_val_score(clf, X_train, y_train, cv=kfold, scoring='accuracy')
    print("##### Base Model %0.0f #####" % i)
    print("Train CV Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std()))
    clf.fit(X_train, y_train)   
    print("Train Accuracy: %0.2f " % (metrics.accuracy_score(clf.predict(X_train), y_train)))
    dataset_blend_train[:,i] = clf.predict_proba(X_train)[:, 1]
    dataset_blend_test[:,i] = clf.predict_proba(X_test)[:, 1]
    dataset_blend_test_df[:,i] = clf.predict_proba(test_df)[:, 1]
    print("Test Accuracy: %0.2f " % (metrics.accuracy_score(clf.predict(X_test), y_test)))    

print("##### Meta Model #####")
clf = LogisticRegression()
scores = cross_validation.cross_val_score(clf, dataset_blend_train, y_train, cv=kfold, scoring='accuracy')
clf.fit(dataset_blend_train, y_train)
print("Train CV Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std()))
print("Train Accuracy: %0.2f " % (metrics.accuracy_score(clf.predict(dataset_blend_train), y_train)))
print("Test Accuracy: %0.2f " % (metrics.accuracy_score(clf.predict(dataset_blend_test), y_test)))

(891, 8) (891,) (418, 8)
5-fold cross validation:

##### Base Model 0 #####
Train CV Accuracy: 0.80 (+/- 0.02)
Train Accuracy: 0.85 
Test Accuracy: 0.87 
##### Base Model 1 #####
Train CV Accuracy: 0.81 (+/- 0.02)
Train Accuracy: 0.92 
Test Accuracy: 0.84 
##### Base Model 2 #####
Train CV Accuracy: 0.82 (+/- 0.02)
Train Accuracy: 0.83 
Test Accuracy: 0.85 
##### Meta Model #####
Train CV Accuracy: 0.92 (+/- 0.01)
Train Accuracy: 0.92 
Test Accuracy: 0.83 


In [51]:
score = cross_val_score(clf, X, y, cv=cv, scoring='accuracy')
norm_save(clf, score, "stacked")

Train CV Accuracy: 0.78 (+/- 0.03) [stacked]
Test_Score: 0.821229050279


In [52]:
# modelname= "stacked"
# submission = clf.predict(dataset_blend_test_df)
# df = pd.DataFrame({'PassengerId':test_df.index, 'Survived':submission})
# results = results.append({'Model': modelname,'Para': clf, 'Test_Score':(metrics.accuracy_score(clf.predict(dataset_blend_test), y_test))}, ignore_index=True)
# df.to_csv((os.path.join(path,("submissions/{}.csv".format(modelname)))),header=True,index=False)
# with open((os.path.join(path,(r"Pickle/{}.pickle".format(modelname)))), 'wb') as f: pickle.dump(model, f)
# len(clf.predict(dataset_blend_test_df))

In [53]:
results.sort_values(by=["Test_Score"], ascending=False)

Unnamed: 0,Model,Para,Test_Score,CV Mean,CV STDEV
0,Bagger_ensemble,{'n_estimators': 226},0.938547,0.813237,0.021745
16,2-VM-Ensemble Hard Voting,EnsembleVoteClassifier(clfs=[BaggingClassifier...,0.932961,0.811829,0.0284
15,2-VM-Ensemble Soft Voting,EnsembleVoteClassifier(clfs=[BaggingClassifier...,0.927374,0.814616,0.024504
1,Gradient_Boosting,"{'learning_rate': 0.16000000000000003, 'max_de...",0.921788,0.809022,0.034348
17,3-VM-Ensemble Soft Voting,EnsembleVoteClassifier(clfs=[BaggingClassifier...,0.921788,0.814626,0.024441
18,3-VM-Ensemble Hard Voting,EnsembleVoteClassifier(clfs=[BaggingClassifier...,0.916201,0.814607,0.027604
20,5-VM-Ensemble Hard Voting,EnsembleVoteClassifier(clfs=[BaggingClassifier...,0.893855,0.813208,0.023742
21,7-VM-Ensemble Soft Voting,EnsembleVoteClassifier(clfs=[BaggingClassifier...,0.882682,0.821649,0.02095
19,5-VM-Ensemble Soft Voting,EnsembleVoteClassifier(clfs=[BaggingClassifier...,0.882682,0.823057,0.022203
23,10-VM-Ensemble Soft Voting,EnsembleVoteClassifier(clfs=[BaggingClassifier...,0.865922,0.813228,0.017787


## Best Model: Soft Voting Ensemble with Top Seven Models

In [72]:
X = train_df.drop(["Survived"] , axis=1)
y = train_df["Survived"]

# use train/test split with different random_state values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
X_train.shape, y_train.shape, X_test.shape, y_test.shape

evalmodel = EnsembleVoteClassifier([ensemble_models.get(key) for key in prob_models.Model[:7]], voting='soft')
evalmodel.fit(X_train, y_train)
y_pred = evalmodel.predict(X_test)
print(classification_report(y_test, y_pred))
confusion_matrix(y_test, y_pred)

# Output
clf = EnsembleVoteClassifier([ensemble_models.get(key) for key in prob_models.Model[:7]], voting='soft')
md = clf.fit(X, y)
df = pd.DataFrame({'PassengerId':test_df.index, 'Survived':md.predict(test_df)})
df.to_csv((os.path.join(path,("submissions/{}.csv".format("Soft_Voting_7_TopModel")))),header=True,index=False)

             precision    recall  f1-score   support

          0       0.85      0.91      0.88       109
          1       0.84      0.76      0.80        70

avg / total       0.85      0.85      0.85       179



In [70]:
end = time.time()
print("Model took %0.2f seconds to train"%(end - start))

Model took 6337.77 seconds to train


## Reflection on model

Ensemble models suggest that testing accuracy is in the high 80s, however when applied to Kaggle out of sample data, perfromance is consistently in the high 70s. This suggests that either the submission data is very different, or that my model is overfitting on the given data.

 - May want to explore models with a greater emphasis on randomness in order to tone down the overfitting.
 - Perhaps compare variable distribution between submission data and training data