In [1]:
# common stuff
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
%matplotlib inline
import seaborn as sns
%config InlineBackend.figure_format = 'retina'
from pylab import rcParams
rcParams['figure.figsize'] = (9, 6)

In [2]:
from sklearn import preprocessing

# Stuff we did earlier

train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
Ytrain = train.Survived
train.drop('Survived', axis=1, inplace=True)
train['is_test'] = 0
test['is_test'] = 1
df = pd.concat([train, test])
df["isMale"] = df.Sex.replace({"male": 1, "female":0})
df.drop(["Sex", "Ticket", "PassengerId"], axis=1, inplace=True)

# Some more transformations

df["CabinLetter"] = df.Cabin.str.split().str[0].str[:1]
df.drop(["Cabin"], axis=1, inplace=True)

df.CabinLetter.fillna('X', inplace=True)

le_cabinletter = preprocessing.LabelEncoder()
df.CabinLetter = le_cabinletter.fit_transform(df.CabinLetter)

df["Ref"] = df.Name.str.lower().str.split().apply(lambda x: [i for i in x if i in ["mr.", "mrs.", "miss."]]) \
    .apply(lambda x: x[0] if x else "")

le_ref = preprocessing.LabelEncoder()
df.Ref = le_ref.fit_transform(df.Ref)

df.drop(["Name"], axis=1, inplace=True)

df.Embarked.fillna('X', inplace=True)

le_embarked = preprocessing.LabelEncoder()
df.Embarked = le_embarked.fit_transform(df.Embarked)

imputer = preprocessing.Imputer(missing_values='NaN', strategy='mean', axis=0, verbose=0, copy=True)
imputer.fit(df[df.is_test == 0])
df_imp = imputer.transform(df)
df = pd.DataFrame(df_imp, columns = df.columns)

df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1309 entries, 0 to 1308
Data columns (total 10 columns):
Pclass         1309 non-null float64
Age            1309 non-null float64
SibSp          1309 non-null float64
Parch          1309 non-null float64
Fare           1309 non-null float64
Embarked       1309 non-null float64
is_test        1309 non-null float64
isMale         1309 non-null float64
CabinLetter    1309 non-null float64
Ref            1309 non-null float64
dtypes: float64(10)
memory usage: 112.5 KB


In [3]:
# Split, now we have our Xtrain, Xtest and Ytrain
Xtrain = df[df.is_test == 0]
Xtest = df[df.is_test == 1]
print(Xtrain.shape)
print(Xtest.shape)
print(Ytrain.shape)

(891, 10)
(418, 10)
(891,)


In [4]:
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier

depths = np.arange(1,10)
features_num = np.arange(1, Xtrain.shape[1] + 1)
grid = {'max_depth': depths, 'max_features': features_num}
gridsearch = GridSearchCV(DecisionTreeClassifier(), grid, scoring='neg_log_loss', cv=5)

In [5]:
%%time
gridsearch.fit(Xtrain, Ytrain)

CPU times: user 1.72 s, sys: 0 ns, total: 1.72 s
Wall time: 1.72 s


GridSearchCV(cv=5, error_score='raise',
       estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'max_depth': array([1, 2, 3, 4, 5, 6, 7, 8, 9]), 'max_features': array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring='neg_log_loss', verbose=0)

In [6]:
sorted(gridsearch.grid_scores_, key = lambda x: -x.mean_validation_score)[:5]



[mean: -0.46213, std: 0.03321, params: {'max_depth': 2, 'max_features': 8},
 mean: -0.46213, std: 0.03321, params: {'max_depth': 2, 'max_features': 10},
 mean: -0.46265, std: 0.03364, params: {'max_depth': 2, 'max_features': 9},
 mean: -0.48017, std: 0.04542, params: {'max_depth': 2, 'max_features': 5},
 mean: -0.49540, std: 0.04348, params: {'max_depth': 2, 'max_features': 6}]

In [7]:
clf_final = DecisionTreeClassifier(max_depth=3, max_features=8)
clf_final.fit(Xtrain, Ytrain)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=3,
            max_features=8, max_leaf_nodes=None, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best')

In [10]:
from sklearn.metrics import accuracy_score

Ytest_proba = clf_final.predict_proba(Xtest)
Ytest = clf_final.predict(Xtest)

Ytrain_guess = clf_final.predict(Xtrain)

accuracy_score(Ytrain, Ytrain_guess)

0.8204264870931538