In [1]:
# Titanic Survivor Machine Learning Notebook: submission_score: 0.80382
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier 
from sklearn.model_selection import GridSearchCV, StratifiedKFold

In [2]:
# Import data and set the row offsets for training data, validation data and test data.
# Train and test rows are stacked into one frame so that every feature
# transformation below is applied to both in exactly the same way.
df = pd.read_csv("train.csv")
targets = df['Survived']
test_idx = len(df)                    # position of the first test row in the stacked frame
validation_idx = int(len(df) * 0.9)   # the last 10% of train.csv is held out for validation
df_test_data = pd.read_csv("test.csv")
# DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
# ignore_index=True gives the stacked frame a unique 0..n-1 index instead of
# repeating the row labels of the two source files.
df = pd.concat([df, df_test_data], ignore_index=True)
# The label must not be part of the feature frame (it is kept in `targets`).
df = df.drop(['Survived'], axis=1)

In [3]:
# Fill missing Fare with the mean fare of the passenger's Pclass.
# Only the Fare column is aggregated: averaging the whole frame (as
# pivot_table(df, aggfunc='mean') does) raises on the string columns in
# recent pandas versions. The statistic is the mean, as before, so the
# variable is named accordingly.
fare_mean_by_class = df.groupby('Pclass')['Fare'].mean()
# Vectorised fill: look up each row's class mean and use it only where Fare is NaN.
df['Fare'] = df['Fare'].fillna(df['Pclass'].map(fare_mean_by_class))

# Encode Sex as an integer flag: male -> 0, female -> 1.
# Any other value maps to NaN, which makes the int cast fail loudly.
sex_codes = {'male': 0, 'female': 1}
sex_encoded = df['Sex'].map(sex_codes)
df['Sex'] = sex_encoded.astype(int)

# One-hot encode Embarked.
# The prefix must be 'Embarked': the previous 'Title' prefix put these columns
# in the same namespace as the real Title dummies created further down.
embarked_dummies = pd.get_dummies(df['Embarked'], prefix='Embarked')
df = pd.concat([df, embarked_dummies], axis=1)

# Reduce Cabin to its first character ('U' when the cabin is missing),
# then one-hot encode that letter.
cabin_letter = df['Cabin'].map(lambda cabin: 'U' if pd.isnull(cabin) else cabin[0])
df['Cabin'] = cabin_letter
cabin_dummies = pd.get_dummies(df['Cabin'], prefix='Cabin')
df = pd.concat([df, cabin_dummies], axis=1)

# Add title as a derived feature.
# The raw title is the text between the last comma and the first period after
# it (assumes names are formatted like "Surname, Title. Given names").
# Raw titles are collapsed into a few groups; anything not listed becomes 'unk'.
title_dict = {
    'Capt': 'occ', 'Col': 'occ', 'Major': 'occ', 'Dr': 'occ', 'Rev': 'occ',
    'Don': 'royal', 'Dona': 'royal', 'Jonkheer': 'royal', 'Lady': 'royal',
    'Sir': 'royal', 'the Countess': 'royal',
    'Master': 'master',
    'Miss': 'miss', 'Ms': 'miss',
    'Mlle': 'miss',   # Mademoiselle is the unmarried form; it was wrongly grouped with 'mrs'
    'Mme': 'mrs', 'Mrs': 'mrs',
    'Mr': 'mr',
}
tmp_titles = [x.split(',')[-1].split('.')[0].strip() for x in df['Name']]
df['Title'] = [title_dict.get(x, 'unk') for x in tmp_titles]
df = pd.concat([df, pd.get_dummies(df['Title'], prefix='Title')], axis=1)

# Fill missing Ages with the median age of passengers that share the same
# Sex, Pclass and Title.
overall_age_median = df['Age'].median()   # taken from the observed ages, before any filling
df["Age"] = df.groupby(['Sex', 'Pclass', 'Title'])['Age'].transform(lambda x: x.fillna(x.median()))
# A group in which no age is known has a NaN median, so its rows are still
# missing after the group fill; fall back to the overall median so that no
# NaN reaches the feature matrix.
df["Age"] = df["Age"].fillna(overall_age_median)

# Family size (the passenger plus Parch plus SibSp) and three 0/1 bucket flags
# derived from it: travelling alone, family of 2-4, family of 5 or more.
family_size = df['Parch'] + df['SibSp'] + 1
df['FamilySize'] = family_size
df['SoloTraveler'] = (family_size == 1).astype(int)
df['SmallFamily'] = ((family_size > 1) & (family_size <= 4)).astype(int)
df['LargeFamily'] = (family_size > 4).astype(int)

# Ticket prefix as a derived feature, one-hot encoded.
def _ticket_prefix(raw_ticket):
    """Return the first space-separated token of the ticket with '/' and '.'
    removed, or 'UNK' when that token consists only of digits."""
    head = raw_ticket.replace('/', '').replace('.', '').split(' ')[0]
    if head.isdigit():
        return 'UNK'
    return head

df['Ticket'] = [_ticket_prefix(ticket) for ticket in df['Ticket']]
ticket_dummies = pd.get_dummies(df['Ticket'], prefix='Ticket')
df = pd.concat([df, ticket_dummies], axis=1)

In [4]:
# Delete columns that must not reach the model:
#  - Cabin, Embarked, Title, Ticket: string columns already expanded into dummies
#  - Name: free text, only used to derive Title
#  - PassengerId: an arbitrary row identifier, not a property of the passenger.
#    The ids needed for the submission file are read from df_test_data, so
#    removing the column here does not affect the output step.
df = df.drop(['Cabin', 'Embarked', 'Name', 'Title', 'Ticket', 'PassengerId'], axis=1)

In [5]:
# Build the training / validation / test matrices by slicing the stacked frame:
#   rows [0, validation_idx)         -> training
#   rows [validation_idx, test_idx)  -> validation (labelled, held out)
#   rows [test_idx, end)             -> unlabelled test rows to predict
# DataFrame.as_matrix() was removed in pandas 1.0; to_numpy() replaces it.
# The frame is converted once, and dtype=float yields a plain numeric array
# even when the dummy columns are stored as booleans.
features = df.to_numpy(dtype=float)
labels = targets.to_numpy()

X_training = features[:validation_idx, :]
y_training = labels[:validation_idx]

X_validation = features[validation_idx:test_idx, :]
y_validation = labels[validation_idx:test_idx]

X_solve = features[test_idx:, :]


In [6]:
# Build training model: exhaustive grid search over random-forest hyperparameters,
# scored by accuracy with stratified 10-fold cross-validation.
# NOTE: the grid has 3 * 4 * 6 * 3 * 3 * 2 = 1296 combinations, i.e. 12,960
# forest fits with 10 folds, so this cell is slow; trim the lists for a faster run.
parameter_grid = {
    'max_depth' : [6, 8, 10],
    'n_estimators': [300, 350, 450, 500],
    'max_features': [0.4,0.5,0.6,0.7,0.8,1.],
    'min_samples_split': [3, 5, 10],
    'min_samples_leaf': [2, 3, 10],
    'bootstrap': [True, False],
    }

forest = RandomForestClassifier(random_state=1)
# shuffle=True without a random_state draws different folds on every run, so
# the best score / best parameters are not reproducible; seed it like the forest.
cross_validation = StratifiedKFold(shuffle=True, n_splits=10, random_state=1)

grid_search = GridSearchCV(forest, scoring='accuracy', n_jobs=-1, param_grid=parameter_grid, cv=cross_validation)
grid_search.fit(X_training, y_training)

# The fitted search object is used directly as the model by the cells below.
model = grid_search
parameters = grid_search.best_params_

print('Best score: {}'.format(grid_search.best_score_))
print('Best parameters: {}'.format(grid_search.best_params_))


KeyboardInterrupt: 

In [None]:
# Validation test: fraction of held-out rows whose predicted label matches the true one.
y_pred = model.predict(X_validation).astype(int)
is_correct = (y_pred == y_validation)
print("Accuracy: {}".format(np.mean(is_correct)))

In [None]:
# Generate results: predict the test rows and write a two-column CSV
# (PassengerId, Survived), both as integers, without the index column.
output = model.predict(X_solve)
passenger_ids = df_test_data['PassengerId'].astype(int).values
result = np.column_stack((passenger_ids, output.astype(int)))
df_result = pd.DataFrame(result, columns=['PassengerId', 'Survived'])
df_result.to_csv("result.csv", index=False)