In [144]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt

In [145]:
df = pd.read_csv('train.csv')
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


## Pre process data
We need to consider the missing fields in different columns and whether we're using each feature or not. 


From the data analysis, here's some assumptions we're going to consider: 
* We're using Sex feature, as it seems that there more female survivors than male. 
* We're not considering Cabin, as there are way too missing data. 
* We're not considering Ticket nor PassengerId as these are unique identifiers with no further meaning. 
* At a first approach, we won't be using Name either, because it seems somehow redundant that Mr and Mrs map male and female. On a later attempt we might consider normalizing these titles to be the same whether it's  a male or a female.
* We're processing Embarked, but we need to polish this column (use __get_dummies__ to turn this categorical feature into two numeric features).



In [190]:
y = df['Survived']
y.value_counts()

0    549
1    342
Name: Survived, dtype: int64

In [195]:
import string
import re

def parseTitles(names):
    titles = []
    titles_pattern = '.+, ([a-zA-Z]+\.) .+'
    pattern = re.compile(titles_pattern) 
    
    for name in names:
        result = pattern.match(name) 
        
        title = result.group(1) if (result != None) else 'NaN'
        
        if title not in titles: 
            titles += [title]
            
    return titles

df['Title'] = df.apply(lambda person: parseTitles([person['Name']])[0], axis=1)

# For it is a female and it's age is around the average of the Mrs
df.loc[df['Title'] == 'NaN', 'Title'] = 'Mrs.'

# Assume that missing embarked data was from people that embarked on Queenstown (Q)
df.loc[df['Embarked'].isnull(), 'Embarked'] = 'Q'

In [150]:
# Assign missing Age values
title_age = df[['Title', 'Age']].groupby('Title').mean()
title_age

Unnamed: 0_level_0,Age
Title,Unnamed: 1_level_1
Capt.,70.0
Col.,58.0
Don.,40.0
Dr.,42.0
Jonkheer.,38.0
Lady.,48.0
Major.,48.5
Master.,4.574167
Miss.,21.773973
Mlle.,24.0


In [223]:
df.loc[df['Age'].isnull(), 'Age'] = df[df['Age'].isnull()].apply(lambda x: title_age.loc[x['Title']]['Age'], axis=1)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 13 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            891 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       891 non-null object
Title          891 non-null object
dtypes: float64(2), int64(5), object(6)
memory usage: 90.6+ KB


In [263]:
titles = parseTitles(df['Name'])

df['Title'] = df['Title'].apply(lambda x: titles.index(x))

ValueError: 0 is not in list

In [233]:
features = ['Pclass', 'Sex', 'Age', 'Fare', 'Embarked', 'SibSp', 'Parch', 'Title']
X = df[features]
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 8 columns):
Pclass      891 non-null int64
Sex         891 non-null object
Age         891 non-null float64
Fare        891 non-null float64
Embarked    891 non-null object
SibSp       891 non-null int64
Parch       891 non-null int64
Title       891 non-null int64
dtypes: float64(2), int64(4), object(2)
memory usage: 55.8+ KB


In [234]:
X_clean = pd.DataFrame(X)

# Clean Sex Category
X_clean = pd.get_dummies(X_clean)
X_clean.head()

Unnamed: 0,Pclass,Age,Fare,SibSp,Parch,Title,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S
0,3,22.0,7.25,1,0,0,0,1,0,0,1
1,1,38.0,71.2833,1,0,1,1,0,1,0,0
2,3,26.0,7.925,0,0,2,1,0,0,0,1
3,1,35.0,53.1,1,0,1,1,0,0,0,1
4,3,35.0,8.05,0,0,0,0,1,0,0,1


In [235]:
# Drop redudant Columns such as Sex_female and Embarked_C
X_clean.drop(axis=1, labels=['Sex_female', 'Embarked_C'], inplace=True)
X_clean.head()

Unnamed: 0,Pclass,Age,Fare,SibSp,Parch,Title,Sex_male,Embarked_Q,Embarked_S
0,3,22.0,7.25,1,0,0,1,0,1
1,1,38.0,71.2833,1,0,1,0,0,0
2,3,26.0,7.925,0,0,2,0,0,1
3,1,35.0,53.1,1,0,1,0,0,1
4,3,35.0,8.05,0,0,0,1,0,1


In [236]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression()
model.fit(X_clean, y)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [292]:
df_test = pd.read_csv('test.csv')
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
PassengerId    418 non-null int64
Pclass         418 non-null int64
Name           418 non-null object
Sex            418 non-null object
Age            332 non-null float64
SibSp          418 non-null int64
Parch          418 non-null int64
Ticket         418 non-null object
Fare           417 non-null float64
Cabin          91 non-null object
Embarked       418 non-null object
dtypes: float64(2), int64(4), object(5)
memory usage: 36.0+ KB


In [299]:
df_test['Title'] = df_test.apply(lambda person: parseTitles([person['Name']])[0], axis=1)
df_test.loc[df_test['Age'].isnull(), 'Age'] = df_test[df_test['Age'].isnull()].apply(lambda x: title_age.loc[x['Title']]['Age'], axis=1)
titles += [title for title in (df_test['Title'].unique()) if title not in titles ]
df_test['Title'] = df_test['Title'].apply(lambda x: titles.index(x))

features = ['Pclass', 'Sex', 'Age', 'Fare', 'Embarked', 'SibSp', 'Parch', 'Title']
X_test = df_test[features]


X_test_clean = pd.DataFrame(X_test)

# Clean Sex Category
X_test_clean = pd.get_dummies(X_test_clean)
X_test_clean.head()


# Assign median value to missing fare
fare_per_class = X_test_clean[['Pclass', 'Fare']].groupby('Pclass').mean()
X_test_clean.loc[X_test_clean['Fare'].isnull(), 'Fare'] = X_test_clean[X_test_clean['Fare'].isnull()].apply(lambda x: fare_per_class.loc[x['Pclass']]['Fare'], axis=1)

# Drop redudant Columns such as Sex_female and Embarked_C
X_test_clean.drop(axis=1, labels=['Sex_female', 'Embarked_C'], inplace=True)
X_test_clean.head()

df_test.info()
y_test_predict = model.predict(X_test_clean)
y_test_predict.shape

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 12 columns):
PassengerId    418 non-null int64
Pclass         418 non-null int64
Name           418 non-null object
Sex            418 non-null object
Age            418 non-null float64
SibSp          418 non-null int64
Parch          418 non-null int64
Ticket         418 non-null object
Fare           417 non-null float64
Cabin          91 non-null object
Embarked       418 non-null object
Title          418 non-null int64
dtypes: float64(2), int64(5), object(5)
memory usage: 39.3+ KB


(418,)

In [294]:
result = pd.DataFrame(df_test['PassengerId'])
result.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 1 columns):
PassengerId    418 non-null int64
dtypes: int64(1)
memory usage: 3.3 KB


In [300]:
result['Survived'] = y_test_predict

In [302]:
result.to_csv('test-result.csv', index=False)

# FUTURE WORK 
# APPLY PCA AND USE SCIKIT TOOLS FOR FEATURE EXTRACTION AND FEATURE SELECTION
# Search for better Algorithms... Naive bayes could possible behave well?