In [5]:
import csv as csv
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, KFold, StratifiedKFold, cross_val_score
from sklearn.preprocessing import LabelEncoder, LabelBinarizer
from sklearn_pandas import DataFrameMapper

In [6]:
df_train = pd.read_csv('train.csv', header=0, index_col='PassengerId')
df_test = pd.read_csv('test.csv', header=0, index_col='PassengerId')
# concatenate the data
df = pd.concat([df_train, df_test], keys=["train", "test"])

In [7]:
df

Unnamed: 0_level_0,Unnamed: 1_level_0,Age,Cabin,Embarked,Fare,Name,Parch,Pclass,Sex,SibSp,Survived,Ticket
Unnamed: 0_level_1,PassengerId,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
train,1,22.0,,S,7.2500,"Braund, Mr. Owen Harris",0,3,male,1,0.0,A/5 21171
train,2,38.0,C85,C,71.2833,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,1,female,1,1.0,PC 17599
train,3,26.0,,S,7.9250,"Heikkinen, Miss. Laina",0,3,female,0,1.0,STON/O2. 3101282
train,4,35.0,C123,S,53.1000,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,1,female,1,1.0,113803
train,5,35.0,,S,8.0500,"Allen, Mr. William Henry",0,3,male,0,0.0,373450
train,6,,,Q,8.4583,"Moran, Mr. James",0,3,male,0,0.0,330877
train,7,54.0,E46,S,51.8625,"McCarthy, Mr. Timothy J",0,1,male,0,0.0,17463
train,8,2.0,,S,21.0750,"Palsson, Master. Gosta Leonard",1,3,male,3,0.0,349909
train,9,27.0,,S,11.1333,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",2,3,female,0,1.0,347742
train,10,14.0,,C,30.0708,"Nasser, Mrs. Nicholas (Adele Achem)",0,2,female,1,1.0,237736


In [8]:
# new features from names
df['Title'] = df['Name'].apply(lambda c: c[c.index(',') + 2 : c.index('.')])
df['LastName'] = df['Name'].apply(lambda n: n[0:n.index(',')])
df['FamilySize'] = df['SibSp'] + df['Parch'] + 1
# replace missing data with the mode of the column
df.loc[df['Embarked'].isnull(), 'Embarked'] = df['Embarked'].mode()[0]
df.loc[df['Fare'].isnull(), 'Fare'] = df['Fare'].mode()[0]
# features for family
df['FamilyID'] = df['LastName'] + ':' + df['FamilySize'].apply(str)
df.loc[df['FamilySize'] <= 2, 'FamilyID'] = 'Small_Family'
# replace missing age data with median by title
df['AgeOriginallyNaN'] = df['Age'].isnull().astype(int)
medians_by_title = pd.DataFrame(df.groupby('Title')['Age'].median()) \
  .rename(columns = {'Age': 'AgeFilledMedianByTitle'})
df = df.merge(medians_by_title, left_on = 'Title', right_index = True) \
  .sort_index(level = 0).sort_index(level = 1)

df

Unnamed: 0_level_0,Unnamed: 1_level_0,Age,Cabin,Embarked,Fare,Name,Parch,Pclass,Sex,SibSp,Survived,Ticket,Title,LastName,FamilySize,FamilyID,AgeOriginallyNaN,AgeFilledMedianByTitle
Unnamed: 0_level_1,PassengerId,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
train,1,22.0,,S,7.2500,"Braund, Mr. Owen Harris",0,3,male,1,0.0,A/5 21171,Mr,Braund,2,Small_Family,0,29.0
train,2,38.0,C85,C,71.2833,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,1,female,1,1.0,PC 17599,Mrs,Cumings,2,Small_Family,0,35.5
train,3,26.0,,S,7.9250,"Heikkinen, Miss. Laina",0,3,female,0,1.0,STON/O2. 3101282,Miss,Heikkinen,1,Small_Family,0,22.0
train,4,35.0,C123,S,53.1000,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,1,female,1,1.0,113803,Mrs,Futrelle,2,Small_Family,0,35.5
train,5,35.0,,S,8.0500,"Allen, Mr. William Henry",0,3,male,0,0.0,373450,Mr,Allen,1,Small_Family,0,29.0
train,6,,,Q,8.4583,"Moran, Mr. James",0,3,male,0,0.0,330877,Mr,Moran,1,Small_Family,1,29.0
train,7,54.0,E46,S,51.8625,"McCarthy, Mr. Timothy J",0,1,male,0,0.0,17463,Mr,McCarthy,1,Small_Family,0,29.0
train,8,2.0,,S,21.0750,"Palsson, Master. Gosta Leonard",1,3,male,3,0.0,349909,Master,Palsson,5,Palsson:5,0,4.0
train,9,27.0,,S,11.1333,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",2,3,female,0,1.0,347742,Mrs,Johnson,3,Johnson:3,0,35.5
train,10,14.0,,C,30.0708,"Nasser, Mrs. Nicholas (Adele Achem)",0,2,female,1,1.0,237736,Mrs,Nasser,2,Small_Family,0,35.5


In [9]:

# reseparate using keys
df_train = df.ix['train']
df_test  = df.ix['test']

In [11]:
df_train

Unnamed: 0_level_0,Age,Cabin,Embarked,Fare,Name,Parch,Pclass,Sex,SibSp,Survived,Ticket,Title,LastName,FamilySize,FamilyID,AgeOriginallyNaN,AgeFilledMedianByTitle
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
1,22.0,,S,7.2500,"Braund, Mr. Owen Harris",0,3,male,1,0.0,A/5 21171,Mr,Braund,2,Small_Family,0,29.0
2,38.0,C85,C,71.2833,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,1,female,1,1.0,PC 17599,Mrs,Cumings,2,Small_Family,0,35.5
3,26.0,,S,7.9250,"Heikkinen, Miss. Laina",0,3,female,0,1.0,STON/O2. 3101282,Miss,Heikkinen,1,Small_Family,0,22.0
4,35.0,C123,S,53.1000,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,1,female,1,1.0,113803,Mrs,Futrelle,2,Small_Family,0,35.5
5,35.0,,S,8.0500,"Allen, Mr. William Henry",0,3,male,0,0.0,373450,Mr,Allen,1,Small_Family,0,29.0
6,,,Q,8.4583,"Moran, Mr. James",0,3,male,0,0.0,330877,Mr,Moran,1,Small_Family,1,29.0
7,54.0,E46,S,51.8625,"McCarthy, Mr. Timothy J",0,1,male,0,0.0,17463,Mr,McCarthy,1,Small_Family,0,29.0
8,2.0,,S,21.0750,"Palsson, Master. Gosta Leonard",1,3,male,3,0.0,349909,Master,Palsson,5,Palsson:5,0,4.0
9,27.0,,S,11.1333,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",2,3,female,0,1.0,347742,Mrs,Johnson,3,Johnson:3,0,35.5
10,14.0,,C,30.0708,"Nasser, Mrs. Nicholas (Adele Achem)",0,2,female,1,1.0,237736,Mrs,Nasser,2,Small_Family,0,35.5


In [13]:
X_train = df_train[df_train.columns.drop('Survived')]
y_train = df_train['Survived']

In [17]:
X_train

Unnamed: 0_level_0,Age,Cabin,Embarked,Fare,Name,Parch,Pclass,Sex,SibSp,Ticket,Title,LastName,FamilySize,FamilyID,AgeOriginallyNaN,AgeFilledMedianByTitle
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
1,22.0,,S,7.2500,"Braund, Mr. Owen Harris",0,3,male,1,A/5 21171,Mr,Braund,2,Small_Family,0,29.0
2,38.0,C85,C,71.2833,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,1,female,1,PC 17599,Mrs,Cumings,2,Small_Family,0,35.5
3,26.0,,S,7.9250,"Heikkinen, Miss. Laina",0,3,female,0,STON/O2. 3101282,Miss,Heikkinen,1,Small_Family,0,22.0
4,35.0,C123,S,53.1000,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,1,female,1,113803,Mrs,Futrelle,2,Small_Family,0,35.5
5,35.0,,S,8.0500,"Allen, Mr. William Henry",0,3,male,0,373450,Mr,Allen,1,Small_Family,0,29.0
6,,,Q,8.4583,"Moran, Mr. James",0,3,male,0,330877,Mr,Moran,1,Small_Family,1,29.0
7,54.0,E46,S,51.8625,"McCarthy, Mr. Timothy J",0,1,male,0,17463,Mr,McCarthy,1,Small_Family,0,29.0
8,2.0,,S,21.0750,"Palsson, Master. Gosta Leonard",1,3,male,3,349909,Master,Palsson,5,Palsson:5,0,4.0
9,27.0,,S,11.1333,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",2,3,female,0,347742,Mrs,Johnson,3,Johnson:3,0,35.5
10,14.0,,C,30.0708,"Nasser, Mrs. Nicholas (Adele Achem)",0,2,female,1,237736,Mrs,Nasser,2,Small_Family,0,35.5
