# Load Dataset

In [19]:
import pandas as pd

dataset_file_name = './datasets/titanic/dataset.csv'

raw_df = pd.read_csv(dataset_file_name)
raw_df.head(2)

# Dataset contains
# PassengerId
# Survived
# Pclass
# Name 
# Sex
# Age
# SibSp: number of brothers/sisters aboard
# Parch: number of parents aboard
# Ticket: Id
# Fare
# Cabin: Id
# Embarked: C = Cherbourg, Q = Queenstown, S = Southampton

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C


# Keep revelant columns and clean strange lines

In [20]:
full_col = ["PassengerId", "Survived", "Pclass", "Sex", "Age", "SibSp", "Parch", "Fare", "Cabin", "Embarked"]
features = ["Pclass", "Sex", "Age", "SibSp", "Parch", "Fare", "Cabin", "Embarked"]

col_limited_df = raw_df[full_col]
df = col_limited_df.dropna()

df.head(2)

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
1,2,1,1,female,38.0,1,0,71.2833,C85,C
3,4,1,1,female,35.0,1,0,53.1,C123,S


In [21]:
df['Cabin'] = df['Cabin'].apply(lambda x: x[:1])
df.head(2)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
1,2,1,1,female,38.0,1,0,71.2833,C,C
3,4,1,1,female,35.0,1,0,53.1,C,S


# Encode classes

In [22]:
from sklearn import preprocessing
from collections import defaultdict

d = defaultdict(preprocessing.LabelEncoder)

fit = df.apply(lambda x: d[x.name].fit_transform(x))

fit.head(2)

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
1,0,1,0,0,32,1,0,57,2,0
3,1,1,0,0,28,1,0,44,2,2


# Split datasets

In [23]:
from sklearn.model_selection import train_test_split

train, test = train_test_split(fit, test_size=0.2)

# Train model

In [35]:
from sklearn.ensemble import RandomForestClassifier
import sklearn.metrics
from random import randint

max_score = 0
clf = None

for i in range(0,1000):
    if randint(0, 1) == 0: 
        c = 'gini' 
    else: 
        c = 'entropy'
    
    new_clf = RandomForestClassifier(
        criterion=c,
        max_depth=randint(2, 10000),
        min_samples_split=randint(2,100),
        min_samples_leaf=randint(2,100),
        n_estimators=randint(2,100),
        bootstrap=randint(0, 1) == 0
    )
    
    new_clf.fit(train[features], train.Survived)
    
    predicted = new_clf.predict(test[features])
    
    actual_score = sklearn.metrics.f1_score(test.Survived, predicted)
    
    if actual_score > max_score:
        max_score = actual_score
        print('loss {}'.format(actual_score))
        clf = new_clf

clf

loss 0.8666666666666666
loss 0.8813559322033898
loss 0.896551724137931


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=1235, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=2, min_samples_split=33,
            min_weight_fraction_leaf=0.0, n_estimators=97, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

# Predict then evaluate results

In [36]:
predicted = clf.predict(test[features])

sklearn.metrics.f1_score(test.Survived, predicted)

0.896551724137931