In [1]:
import pandas as pd
import numpy as np
from numpy.random import normal
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, FunctionTransformer
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA

In [2]:
df = pd.read_csv('train_cleaned.csv')

In [3]:
df.head()

Unnamed: 0,Survived,Age,SibSp,Parch,Fare,Master.,Miss.,Mr.,Mrs.,Other,...,class_1,class_2,class_3,C,Q,S,female,male,FamilyAboard,IsAlone
0,0,22.0,1,0,7.25,0,0,1,0,0,...,0,0,1,0,0,1,0,1,1,False
1,1,38.0,1,0,71.2833,0,0,0,1,0,...,1,0,0,1,0,0,1,0,1,False
2,1,26.0,0,0,7.925,0,1,0,0,0,...,0,0,1,0,0,1,1,0,0,True
3,1,35.0,1,0,53.1,0,0,0,1,0,...,1,0,0,0,0,1,1,0,1,False
4,0,35.0,0,0,8.05,0,0,1,0,0,...,0,0,1,0,0,1,0,1,0,True


## Creation of Training/Test Data

In [4]:
X = df.drop('Survived', axis=1)
y = df['Survived']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=15144)

## Generic Decision Tree Model Evaluation

In [7]:
dt = DecisionTreeClassifier(random_state=15144)
dt.fit(X_train, y_train)
dt.score(X_test, y_test)

0.848314606741573

In [8]:
dt.get_params()

{'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'presort': False,
 'random_state': 15144,
 'splitter': 'best'}

In [76]:
param_grid = {'criterion':['gini', 'entropy'],
             'splitter':['best', 'random'],
             
             }
gridsearch = GridSearchCV(DecisionTreeClassifier(random_state=15144),param_grid=param_grid, cv=5, scoring='accuracy')

In [80]:
gridsearch.fit(X_train, y_train)

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=DecisionTreeClassifier(class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features=None,
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              presort=False, random_state=15144,
                                              splitter='best'),
             iid='warn', n_jobs=None,
             param_grid={'criterion': ['gini', 'entropy'],
                         'splitter': ['best', 'random']},
             pre_d

In [83]:
print(gridsearch.best_score_)
print(gridsearch.best_params_)

0.7468354430379747
{'criterion': 'entropy', 'splitter': 'best'}


## Generic RF Model Evaluation

In [9]:
rf = RandomForestClassifier(n_jobs=-1, n_estimators=500, random_state=15144)
rf.fit(X_train, y_train)
rf.score(X_test, y_test)

0.8314606741573034

## Grid Search Hyperparameters

In [86]:
param_grid = {'max_features':['auto'], 
              'n_estimators':[10, 50, 100, 200, 500], 
              'max_depth':[None, 5, 10],
              'min_samples_split':[2, 0.1, 0.2, 0.4],
             'random_state':[15144]}
gridsearch = GridSearchCV(RandomForestClassifier(n_jobs=-1), iid=False, cv=5, param_grid=param_grid, scoring='accuracy')

In [87]:
gridsearch.fit(X_train, y_train)

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=RandomForestClassifier(bootstrap=True, class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators='warn', n_jobs=-1,
                                              oob_score=False,
                                              random_state=None, verbose=0,
                                              warm_start=False),
             iid=F

In [88]:
print(gridsearch.best_score_)
print(gridsearch.best_params_)

0.8284711293102063
{'max_depth': 5, 'max_features': 'auto', 'min_samples_split': 2, 'n_estimators': 50, 'random_state': 15144}


## Ensemble Voting

In [10]:
dt = DecisionTreeClassifier(random_state=15144)
rf = RandomForestClassifier(n_jobs=-1, n_estimators=500, random_state=15144)

In [14]:
voter = VotingClassifier(estimators=[('dt',dt), ('rf',rf)], voting='soft')

In [15]:
voter.fit(X_train, y_train)

VotingClassifier(estimators=[('dt', DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf...jobs=-1,
            oob_score=False, random_state=15144, verbose=0,
            warm_start=False))],
         flatten_transform=None, n_jobs=1, voting='soft', weights=None)

In [16]:
voter.score(X_test, y_test)

0.848314606741573