In [None]:
# Helper libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder,StandardScaler,FunctionTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

import seaborn as sns
sns.set()

import sys
# Report the interpreter version used to run this notebook (one item per line).
print("Python version", sys.version, "Version info.", sys.version_info, sep="\n")

In [None]:
# Labelled training passengers (includes the "Survived" column used below).
train = pd.read_csv(filepath_or_buffer="data/train.csv")
# Passengers to predict for the final submission.
test = pd.read_csv(filepath_or_buffer="data/test.csv")

In [None]:
# Target column and the remaining feature columns of the training data.
y = train["Survived"]
X_train = train.drop(columns=["Survived"])

# Preview the first five feature rows.
X_train.head()

In [None]:
# Number of passengers whose age is missing.
print(train["Age"].isna().sum())

# Set up the matplotlib figure: a 3x2 grid, one panel per feature.
f, axes = plt.subplots(3, 2, figsize=(10, 10))
sns.despine(left=True)

# NOTE(review): sns.distplot is deprecated in seaborn >= 0.11 (histplot/displot
# are the successors); kept here to stay compatible with the version in use.
sns.distplot(train["Age"].dropna(), ax=axes[0,0])
sns.distplot(train["Pclass"], ax=axes[0,1], kde=False)
sns.distplot(train["Parch"], ax=axes[1,0], kde=False)
sns.distplot(train["SibSp"], ax=axes[1,1], kde=False)
# "Fare" used to be drawn twice on this same axis, overlaying two identical plots.
sns.distplot(train["Fare"], ax=axes[2,0])
# Only five features are plotted, so hide the unused sixth panel.
axes[2,1].set_visible(False)

# Survival rate by sex, then broken down further by passenger class.
sns.catplot(x="Sex", y="Survived", kind="bar", data=train)
sns.catplot(x="Sex", y="Survived", kind="bar", data=train, hue="Pclass")

In [None]:
# Report which candidate feature columns contain missing values.
# The columns are read from `train` rather than `X_train`, so the report does
# not change if this cell is executed again after the split below.
for column in ["Pclass", "Sex", "SibSp", "Parch", "Embarked", "Cabin", "Fare"]:
    print(column + ": ", train[column].hasnans)

# Hold out 20% of the labelled data for evaluation.
# The split is derived from `train` directly: splitting `X_train` in place made
# the cell fail on a second run, because `X_train` had already been replaced by
# its own (shorter) training part while `y` still had the full length.
X_train, X_test, y_train, y_test = train_test_split(
    train.drop(["Survived"], axis=1),
    train["Survived"],
    test_size=0.20,
    random_state=42,
)

print(X_train.shape)
print(X_test.shape)

In [None]:
# Re-attach the labels to the training features so seaborn can plot survival.
data = X_train.copy()
data["Survived"] = y_train
# Survival rate per cabin, coloured by passenger class.
sns.catplot(data=data, x="Cabin", y="Survived", hue="Pclass", kind="bar")

In [None]:
# Shared one-hot encoder for categorical columns. With handle_unknown='ignore',
# categories not seen during fit are encoded as all zeros instead of raising.
onehotencoder = OneHotEncoder(dtype='int', categories='auto', handle_unknown='ignore')

# Numeric columns: fill missing values with the column mean, then standardize.
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())])

# Categorical columns: fill missing values with the most frequent category
# (no separate UNKNOWN class is created here), then one-hot encode.
onehot_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', onehotencoder)])

# Change sex from male or female to child if age is smaller or equal to 16
def mark_children(df):
    """Return the "Sex" column with passengers aged 16 or younger relabelled "child".

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns "Sex" and "Age".

    Returns
    -------
    numpy.ndarray
        Column vector of shape (n_rows, 1) holding the original sex label or
        "child".

    Notes
    -----
    Passengers whose age is missing keep their original sex label. The previous
    implementation tested ``Age > 16`` and replaced everything else, which also
    turned every passenger with an unknown age into a "child" because a
    comparison against NaN is always False.
    """
    # NaN <= 16 evaluates to False, so unknown ages are not treated as children.
    is_child = df["Age"] <= 16
    return df["Sex"].where(~is_child, "child").to_numpy().reshape(-1, 1)

# Relabel the sex column via mark_children, then one-hot encode the result
# so that children, men and women become separate indicator columns.
augment_sex = Pipeline(
    steps=[
        ("mark_children", FunctionTransformer(func=mark_children, validate=False)),
        ("onehot", onehotencoder),
    ]
)

def take_first_char(array):
    """Reduce each single-word row to the first character of its word.

    `array` is a sequence of one-element rows, e.g. [['word1'], ['word2']];
    the result is a column vector with one leading character per row.
    """
    initials = []
    for row in array:
        word = row[0]
        initials.append(word[0])
    return np.array(initials).reshape(-1, 1)

# Missing values become the constant "UNKNOWN", every value is then cut down
# to its leading character (take_first_char), and the result is one-hot encoded.
onehot_replace_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="constant", fill_value="UNKNOWN")),
        ("replacer", FunctionTransformer(func=take_first_char, validate=False)),
        ("onehot", onehotencoder),
    ]
)

# Full preprocessing step: each entry is (name, transformer, input columns).
# The trailing feature counts are the original author's notes.
ct = ColumnTransformer(
    transformers=[
        ("onehotencode", onehotencoder, ["Pclass", "SibSp", "Parch"]),  # 17 features
        ("impute-and-onehot-embarked", onehot_transformer, ["Embarked"]),  # 4 features
        ("impute-and-onehot-cabin", onehot_replace_transformer, ["Cabin"]),  # 9 features
        ("find-children", augment_sex, ["Sex", "Age"]),  # 3 features
        ("impute-and-scale-age,fare", numeric_transformer, ["Age", "Fare"]),  # 2 features
    ]
)

# Fit on the training features and display the transformed matrix.
ct.fit_transform(X_train)

In [None]:
# Baseline model: preprocessing followed by a 100-tree random forest.
# random_state is fixed so that the baseline scores are reproducible between
# runs, matching the seeds already used for the split and the random search.
clf = Pipeline(
    steps = [('data-transformer', ct),
             ('randomforest', RandomForestClassifier(n_estimators=100, random_state=42))]
)

clf.fit(X_train, y_train)

# The training score used to be computed and silently discarded (only the last
# expression of a cell is displayed); print it so it can be compared with the
# held-out score below.
print("Train accuracy:", clf.score(X_train, y_train))
y_pred = clf.predict(X_test)

# Held-out performance: confusion matrix and accuracy.
print(confusion_matrix(y_test, y_pred))
print(clf.score(X_test, y_test))

In [None]:
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split.
# 'auto' was dropped from this list: for classifiers it was only an alias of
# 'sqrt' (so the list held a single distinct choice) and scikit-learn >= 1.3
# rejects it. 'log2' gives the search a real second option.
max_features = ['sqrt', 'log2']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]

# Create the random grid; the 'randomforest__' prefix routes each parameter
# to the 'randomforest' step of the `clf` pipeline.
random_grid = {'randomforest__n_estimators': n_estimators,
               'randomforest__max_features': max_features,
               'randomforest__max_depth': max_depth,
               'randomforest__min_samples_split': min_samples_split,
               'randomforest__min_samples_leaf': min_samples_leaf,
               'randomforest__bootstrap': bootstrap}

# Sample 100 parameter combinations, each scored with 10-fold cross-validation
# on all available cores.
rf_random = RandomizedSearchCV(estimator = clf, 
                               param_distributions = random_grid, 
                               n_iter = 100, 
                               cv = 10, 
                               verbose=2, 
                               random_state=42,
                               n_jobs=-1
                              )

# Fit the random search model
rf_random.fit(X_train, y_train)

def evaluate(model, test_features, test_labels):
    """Print and return the accuracy of `model` as a percentage.

    Parameters
    ----------
    model : object
        Fitted estimator exposing ``score(features, labels)``.
    test_features, test_labels
        Evaluation data, passed unchanged to ``model.score``.

    Returns
    -------
    float
        ``model.score(test_features, test_labels)`` multiplied by 100.
    """
    # score() is the only call needed; the former separate predict() call
    # produced a value that was never used and doubled the inference work.
    accuracy = model.score(test_features, test_labels) * 100
    print('Model Performance')
    print('Accuracy = {:0.2f}%.'.format(accuracy))

    return accuracy

# Baseline: refit the untuned pipeline (fit returns the pipeline itself)
# and measure it on the held-out data.
base_accuracy = evaluate(clf.fit(X_train, y_train), X_test, y_test)

# Best pipeline found by the randomized search, measured the same way.
best_random = rf_random.best_estimator_
best_random_accuracy = evaluate(best_random, X_test, y_test)

In [None]:
print(rf_random.best_params_)

# Create the parameter grid based on the results of random search.
# Every parameter has a single candidate value, so the grid search below
# cross-validates exactly one configuration.
param_grid = {
    'randomforest__bootstrap': [True],
    'randomforest__max_depth': [200],
    'randomforest__max_features': [10],
    'randomforest__min_samples_leaf': [5],
    'randomforest__min_samples_split': [5],
    'randomforest__n_estimators': [700]
}

# Fresh pipeline for the grid search. The forest is seeded so that the
# cross-validation scores and the final submission model can be reproduced.
est = Pipeline(
    steps = [('data-transformer', ct),
             ('randomforest', RandomForestClassifier(random_state=42))]
)

# Instantiate the grid search model
grid_search = GridSearchCV(estimator = est, param_grid = param_grid, 
                          cv = 10, n_jobs = -1, verbose = 2)

In [None]:
# Fit the grid search to the data
grid_search.fit(X_train, y_train)

# A bare expression in the middle of a cell is never displayed, so the chosen
# parameters are printed explicitly.
print(grid_search.best_params_)

best_grid = grid_search.best_estimator_
grid_accuracy = evaluate(best_grid, X_test, y_test)

# Relative change in held-out accuracy compared with the untuned baseline.
print('Improvement of {:0.2f}%.'.format( 100 * (grid_accuracy - base_accuracy) / base_accuracy))


In [None]:
# Predict survival for the unlabelled passengers and pair each prediction
# with its passenger id.
results = pd.Series(best_grid.predict(test), name="Survived")
submission = pd.concat([test["PassengerId"], results], axis="columns")

# Write the submission file without the index column.
submission.to_csv("data/results.csv", index=False)

In [None]:
# Result log: a submission to Kaggle scored 78.21%.
# Parameter set recorded with that score (presumably the one submitted):
# {'randomforest__bootstrap': True,
#  'randomforest__max_depth': 200,
#  'randomforest__max_features': 10,
#  'randomforest__min_samples_leaf': 5,
#  'randomforest__min_samples_split': 5,
#  'randomforest__n_estimators': 700}

# Display the parameters selected by the grid search in the current session.
grid_search.best_params_
