In [18]:
import numpy as np
import pandas as pd

In [19]:
print('Reading Data...')
# Age is read as float64 in both splits so the column has the same dtype once
# the frames are stacked.
train_data = pd.read_csv('../input/train.csv', dtype={'Age': np.float64})
# Fix: this previously re-read train.csv, which made the "test" split a second
# copy of the training rows. The held-out rows are expected in test.csv next to
# train.csv -- NOTE(review): confirm the file name in ../input.
test_data = pd.read_csv('../input/test.csv', dtype={'Age': np.float64})

Reading Data...


In [20]:
print('Cleaning Data...')

# Stack both splits so the features below are derived the same way for every
# row. ignore_index=True gives the stacked frame a unique 0..n-1 index instead
# of repeating each split's own row labels, which avoids duplicate-label
# pitfalls in later alignment-based operations.
combined2 = pd.concat([train_data, test_data], axis=0, ignore_index=True)

# Assign the filled column back instead of calling fillna(inplace=True) through
# attribute access: the in-place call acts on an intermediate Series and is not
# guaranteed to write through to the frame (pandas copy-on-write).
combined2['Embarked'] = combined2['Embarked'].fillna('S')

# Series.median() skips missing values by default, so no manual notnull()
# filter is needed before taking the median.
combined2['Fare'] = combined2['Fare'].fillna(combined2['Fare'].median())

Cleaning Data...


In [21]:
import re
def get_title(name):
    """Return the title embedded in a passenger name, or "" if none is found.

    The title is the first run of ASCII letters that is preceded by a space
    and immediately followed by a period, e.g. "Mr" in
    "Braund, Mr. Owen Harris".

    Args:
        name: Passenger name as a string.

    Returns:
        The matched title without the trailing period, or an empty string
        when the pattern does not occur in ``name``.
    """
    # Raw string literal: in a plain literal '\.' is an invalid escape
    # sequence, which newer Python versions warn about.
    title_search = re.search(r' ([A-Za-z]+)\.', name)
    # If a title exists, extract and return it.
    if title_search:
        return title_search.group(1)
    return ""

# Extract the title from every name, then translate it to an integer code.
# Several titles deliberately share a code (for example 'Miss' and 'Ms' are
# both 2). Series.map leaves NaN for any title missing from the mapping.
combined2['Title'] = combined2['Name'].map(get_title)
title_mapping = {
    'Mr': 1,
    'Miss': 2,
    'Mrs': 3,
    'Master': 4,
    'Dr': 5,
    'Rev': 6,
    'Major': 7,
    'Col': 7,
    'Mlle': 8,
    'Mme': 8,
    'Don': 7,
    'Dona': 10,
    'Lady': 10,
    'Countess': 10,
    'Jonkheer': 10,
    'Sir': 7,
    'Capt': 7,
    'Ms': 2,
}
combined2['TitleCat'] = combined2['Title'].map(title_mapping)

In [22]:
# Missing cabins get the placeholder '0'. The filled column is assigned back
# rather than filled in place through attribute access, which is not guaranteed
# to write through to the frame under pandas copy-on-write.
combined2['Cabin'] = combined2['Cabin'].fillna('0')

# CabinCat is the integer category code of the first character of the cabin
# string. pd.Categorical(...) replaces Categorical.from_array, which is no
# longer available in current pandas; .codes is a plain integer array.
combined2['CabinCat'] = pd.Categorical(combined2['Cabin'].str[0]).codes

# EmbarkedCat is the integer category code of the embarkation value; a missing
# value would be encoded as -1.
combined2['EmbarkedCat'] = pd.Categorical(combined2['Embarked']).codes

# Ticket is not used by the feature engineering in this notebook section.
combined2 = combined2.drop(['Ticket'], axis=1)

In [23]:
print('Consolidating Data...')
# Column layout of full_data: every combined2 column except Survived, then the
# Sex_* indicator columns, then Survived as the last column.
full_data = pd.concat(
    [
        combined2.drop('Survived', axis=1),
        pd.get_dummies(combined2['Sex'], prefix='Sex'),
        combined2['Survived'],
    ],
    axis=1,
)

Consolidating Data...


In [24]:
# FamilySize is the sum of the SibSp and Parch columns.
full_data['FamilySize'] = full_data['SibSp'].add(full_data['Parch'])
# NameLength is the number of characters in the raw Name string.
full_data['NameLength'] = full_data['Name'].map(len)

In [25]:

import operator
# Maps a "<last name><FamilySize>" key to its integer family id. It starts
# empty and is filled lazily by get_family_id.
family_id_mapping = {}


def get_family_id(row):
    """Return the integer family id for a passenger row, creating it if needed.

    The family key is the text before the first comma in ``row['Name']``
    concatenated with ``row['FamilySize']``. The first time a key is seen it
    is assigned the next consecutive id (1, 2, 3, ...) and stored in the
    module-level ``family_id_mapping``; later rows with the same key get the
    same id.

    Args:
        row: Mapping-like object (for example a DataFrame row) providing
            'Name' and 'FamilySize'.

    Returns:
        The integer id registered for the row's family key.
    """
    last_name = row['Name'].split(',')[0]
    family_id = '{0}{1}'.format(last_name, row['FamilySize'])

    if family_id not in family_id_mapping:
        # Ids are handed out consecutively starting at 1, so the largest id in
        # use always equals the number of entries. len() + 1 gives the next
        # free id without scanning the mapping for every new family, and it
        # also covers the empty-mapping case.
        family_id_mapping[family_id] = len(family_id_mapping) + 1
    return family_id_mapping[family_id]

# Compute one family id per row.
family_ids = full_data.apply(get_family_id, axis=1)

# There are a lot of distinct family ids, so every row whose FamilySize is
# below 3 is collapsed into the single shared code -1.
family_ids.loc[full_data['FamilySize'] < 3] = -1
full_data['FamilyId'] = family_ids

In [27]:
### Person label
# Passengers with an age strictly below this threshold are labelled 'child'.
child_age = 14


def get_person(passenger):
    """Classify an (age, sex) pair as 'child', 'female_adult' or 'male_adult'.

    Args:
        passenger: Two-element sequence unpacked as ``(age, sex)``.

    Returns:
        'child' when ``age < child_age``; otherwise 'female_adult' when
        ``sex == 'female'``, else 'male_adult'. A NaN age never compares
        below the threshold, so such rows are labelled by sex alone.
    """
    age, sex = passenger
    if age < child_age:
        return 'child'
    return 'female_adult' if sex == 'female' else 'male_adult'

# Label every row from its (Age, Sex) pair and append the labels as a new
# 'person' column.
full_data = pd.concat(
    [
        full_data,
        full_data[['Age', 'Sex']].apply(get_person, axis=1).to_frame('person'),
    ],
    axis=1,
)
# Append one indicator column per distinct person label.
dummies = pd.get_dummies(full_data['person'])
full_data = pd.concat([full_data, dummies], axis=1)

In [28]:
def process_surname(nm):
    """Return the lower-cased text that precedes the first comma in ``nm``.

    If ``nm`` contains no comma, the whole string is lower-cased and returned.
    """
    before_comma, _sep, _rest = nm.partition(',')
    return before_comma.lower()

# Lower-cased surname for every passenger, taken from the Name column.
full_data['surname'] = full_data['Name'].map(process_surname)

In [29]:
### Perishing females
# Distinct surnames of rows flagged female_adult that have Survived == 0 and at
# least one of Parch / SibSp greater than zero.
# NOTE(review): this list is built from the Survived label, so the feature
# derived from it may leak target information -- confirm this is intended.
perishing_female_surnames = list(set(
    full_data.loc[
        (full_data['female_adult'] == 1.0)
        & (full_data['Survived'] == 0.0)
        & ((full_data['Parch'] > 0) | (full_data['SibSp'] > 0)),
        'surname',
    ].values
))


def perishing_mother_wife(passenger):
    """Return 1.0 if the passenger's surname is in perishing_female_surnames.

    Args:
        passenger: Three-element sequence unpacked as
            ``(surname, Pclass, person)``; only the surname is used.

    Returns:
        1.0 when the surname is listed, otherwise 0.0.
    """
    family_name, _pclass, _label = passenger
    return float(family_name in perishing_female_surnames)


full_data['perishing_mother_wife'] = (
    full_data[['surname', 'Pclass', 'person']].apply(perishing_mother_wife, axis=1)
)

### Surviving males
# Distinct surnames of rows flagged male_adult that have Survived == 1 and at
# least one of Parch / SibSp greater than zero.
# NOTE(review): this list is built from the Survived label, so the feature
# derived from it may leak target information -- confirm this is intended.
surviving_male_surnames = list(set(
    full_data.loc[
        (full_data['male_adult'] == 1.0)
        & (full_data['Survived'] == 1.0)
        & ((full_data['Parch'] > 0) | (full_data['SibSp'] > 0)),
        'surname',
    ].values
))


def surviving_father_husband(passenger):
    """Return 1.0 if the passenger's surname is in surviving_male_surnames.

    Args:
        passenger: Three-element sequence unpacked as
            ``(surname, Pclass, person)``; only the surname is used.

    Returns:
        1.0 when the surname is listed, otherwise 0.0.
    """
    family_name, _pclass, _label = passenger
    return float(family_name in surviving_male_surnames)


full_data['surviving_father_husband'] = (
    full_data[['surname', 'Pclass', 'person']].apply(surviving_father_husband, axis=1)
)

full_data.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked,...,FamilySize,NameLength,FamilyId,person,child,female_adult,male_adult,surname,perishing_mother_wife,surviving_father_husband
0,1,3,"Braund, Mr. Owen Harris",male,22.0,1,0,7.25,0,S,...,1,23,-1,male_adult,0.0,0.0,1.0,braund,0.0,0.0
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,71.2833,C85,C,...,1,51,-1,female_adult,0.0,1.0,0.0,cumings,0.0,0.0
2,3,3,"Heikkinen, Miss. Laina",female,26.0,0,0,7.925,0,S,...,0,22,-1,female_adult,0.0,1.0,0.0,heikkinen,0.0,0.0
3,4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,53.1,C123,S,...,1,44,-1,female_adult,0.0,1.0,0.0,futrelle,0.0,0.0
4,5,3,"Allen, Mr. William Henry",male,35.0,0,0,8.05,0,S,...,0,24,-1,male_adult,0.0,0.0,1.0,allen,0.0,0.0


In [None]:
#Age Impute Location

