# Exploratory Data Analysis



In [114]:
# Do the imports
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.cross_validation import cross_val_score
from sklearn.model_selection import GridSearchCV
import numpy as np

In [144]:
# Import the Dataset
titanic = pd.read_csv('data/train_cleaned.csv', index_col=0)

# Split the predictors and the target
predictors = titanic.drop(['Survived'], axis=1).as_matrix()
target = titanic.Survived.as_matrix()

# Creating the 'train-test-split'
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(predictors, target)

In [145]:
# Define basic functions to be used for model training

def show_metrics(model):
    train_performance = (model.predict(x_train)==y_train).mean()
    test_performance = (cross_val_score(model, x_test, y_test)).mean()
    print('Training Performance', train_performance)
    print('Test Performance', test_performance)

### Train a basic Decission Tree Classifier

In [150]:
x_train[0]

array([3, 1, 33.0, 0, 0, '347062', 7.775, nan, 0, 0, 1], dtype=object)

In [146]:
from sklearn.tree import DecisionTreeClassifier

dt_clf = DecisionTreeClassifier()
param_grid = {
    'criterion':['entropy', 'gini'],
    'splitter':['best', 'random'],
    'max_features':['auto', 'sqrt', 'log2'],
    'max_depth': np.linspace(10,12, 5)
}
dt_model = GridSearchCV(dt_clf, param_grid).fit(x_train, y_train)

ValueError: could not convert string to float: 'C103'

In [123]:
print(dt_model.best_params_)
show_metrics(dt_model)

{'splitter': 'best', 'max_features': 'auto', 'criterion': 'gini', 'max_depth': 10.5}
Training Performance 0.923652694611
Test Performance 0.708675799087


### Training an AdaBoost Regressor

In [17]:
from sklearn.ensemble import AdaBoostClassifier

In [140]:
adab_clf = AdaBoostClassifier(base_estimator=dt_model.best_estimator_)

param_grid = {

    'n_estimators': [1,2,3,5,8,10, 20],
    'learning_rate': [0.01,0.1,0.2,0.4, 0.6,0.8, 1],
}
model = GridSearchCV(adab_clf, param_grid).fit(x_train, y_train)

In [141]:
print(model.best_params_)
print('Accuracy on the Dataset:', accuracy_score(model.predict(x_train), y_train))
print('Accuracy on the Testset:', (cross_val_score(rf_clf, x_test, y_test, cv=20)).mean())

{'n_estimators': 3, 'learning_rate': 0.4}
Accuracy on the Dataset: 0.974550898204
Accuracy on the Testset: 0.772348484848


In [82]:
# Testing Performance with Cross Validation
(cross_val_score(rf_clf, x_test, y_test, cv=20)).mean()

0.7439393939393939

### Training a ExtraTree Classifier

In [22]:
from sklearn.ensemble import ExtraTreesClassifier

In [23]:
extr_clf = ExtraTreesClassifier().fit(x_train, y_train)

In [24]:
(cross_val_score(extr_clf, x_test, y_test)).mean()

0.75768645357686448

### Training a gradient Boosting Classifier

In [38]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV

In [39]:
gboost_clf = GradientBoostingClassifier()

In [71]:
# Using Grid Search CV to find the best hyperparamet tuning
param_grid = {
    'learning_rate': [0.01,0.05, 0.1, 0.15, 0.25],
    'n_estimators': [100, 150, 160, 170, 180, 200]
}

model = GridSearchCV(gboost_clf, param_grid, scoring='accuracy').fit(x_train, y_train)

In [73]:
# Estimating Modle Performance
(cross_val_score(model, x_test, y_test)).mean()

0.79829528158295282

In [74]:
model.best_estimator_

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.15, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_split=1e-07, min_samples_leaf=1,
              min_samples_split=2, min_weight_fraction_leaf=0.0,
              n_estimators=150, presort='auto', random_state=None,
              subsample=1.0, verbose=0, warm_start=False)

In [76]:
(model.predict(x_train) == y_train).mean()

0.94910179640718562

### Training a Random Forest Classifier

In [13]:
from sklearn.ensemble import RandomForestClassifier

In [11]:
rf_clf = RandomForestClassifier()

param_grid = {
    ''
}

# Search the best Model for the data
model = GridSearchCV(rf_clf, param_grid)

In [12]:
accuracy_score(rf_clf.predict(x_test), y_test)

0.7847533632286996

In [14]:
# Running on cross validation
scores = cross_val_score(rf_clf, x_test, y_test, cv=20)

In [16]:
scores.mean()

0.76287878787878782

### Training a Voting Forrest Classifier

In [30]:
from sklearn.ensemble import VotingClassifier

In [31]:
v_clf = VotingClassifier().fit(x_train, y_train)

TypeError: __init__() missing 1 required positional argument: 'estimators'