In [9]:
import numpy as np
import pandas as pd
import scipy as sp
import matplotlib.pyplot as plt
import seaborn as sns
import re

[Ensembling](http://scikit-learn.org/stable/modules/ensemble.html)  

In [14]:
# Load both splits, discard the leftover 'Unnamed: 0' column and tag each frame
# with its own marker column before stacking them.
# NOTE(review): train_in / test_in are not defined in the cells shown here --
# presumably set in an earlier cell; confirm on a fresh kernel.
df_train = (
    pd.read_csv(train_in)
    .drop(columns='Unnamed: 0')
    .assign(train=1)
)

# The test split receives a placeholder 'survived' value of 0.
df_test = (
    pd.read_csv(test_in)
    .drop(columns='Unnamed: 0')
    .assign(survived=0, test=1)
)

# 'train' is only set on training rows and 'test' only on test rows, so each
# marker is missing (NaN) on the rows coming from the other split.
df = pd.concat([df_train, df_test])

In [15]:
# Honorifics grouped by the numeric category they are encoded as.
_TITLES_BY_CODE = {
    1: ('Mr',),                                 # General adult male
    2: ('Mrs', 'Mme', 'Ms'),                    # General adult female
    3: ('Miss', 'Mlle'),                        # General young female
    4: ('Master',),                             # General young male
    5: ('Don', 'Sir', 'Jonkheer'),              # Noble male
    6: ('Rev', 'Dr', 'Major', 'Col', 'Capt'),   # Professional
    7: ('Lady', 'Countess', 'Dona'),            # Noble female
}

# Flat lookup: honorific -> category code.
title_codes = {
    honorific: code
    for code, honorifics in _TITLES_BY_CODE.items()
    for honorific in honorifics
}

# Family size counts siblings/spouses and parents/children plus the passenger
# themselves; the 'cabin' column is discarded.
df = (
    df.assign(family_size=df['sibsp'] + df['parch'] + 1)
    .drop(columns='cabin')
)

def title(row):
    """Extract the honorific from a passenger's name.

    The honorific is the first run of ASCII letters that is preceded by a
    space and immediately followed by a period (e.g. ``'Mr'`` in
    ``'Braund, Mr. Owen Harris'``).

    Parameters
    ----------
    row : mapping
        Any object supporting ``row['name']`` (a DataFrame row or a dict).

    Returns
    -------
    str
        The honorific without the trailing period, or ``""`` when the name
        is not a string or contains no such pattern.
    """
    name = row['name']
    # A missing name (e.g. NaN) is not a string and would make re.search raise.
    if not isinstance(name, str):
        return ""
    # Raw string: '\.' in a normal string literal is an invalid escape sequence.
    title_search = re.search(r' ([A-Za-z]+)\.', name)
    # If a title exists, extract and return it
    if title_search:
        return title_search.group(1)
    return ""

# Extract each passenger's honorific and translate it to its numeric category.
# The replaced values are assigned back explicitly: calling
# replace(..., inplace=True) on the df['title'] selection is a chained in-place
# operation, which pandas warns about and which does not update df when
# copy-on-write is enabled.
# Honorifics that are not keys of title_codes (including the "" returned when
# no honorific is found) are left unchanged.
df['title'] = df.apply(title, axis=1).replace(title_codes)

# Age assigned to a passenger whose age is missing, keyed by title code.
_IMPUTED_AGE_BY_TITLE = {
    1: 30,  # Mr
    2: 35,  # Mrs
    3: 21,  # Miss
    4: 4,   # Master
    5: 40,  # Noble male
    6: 50,  # Professional
    7: 40,  # Noble female
}


def infer_age(row):
    """Return the row's age, substituting a per-title default when it is missing.

    Parameters
    ----------
    row : mapping
        Any object supporting ``row['age']`` and ``row['title']``.

    Returns
    -------
    The recorded age when it is not null. Otherwise the default age for the
    row's title code (1-7), or ``None`` when the title is not one of those
    codes.
    """
    recorded_age = row['age']
    if not pd.isnull(recorded_age):
        return recorded_age
    # Unknown title codes fall through to None.
    return _IMPUTED_AGE_BY_TITLE.get(row['title'])

# Fill missing ages using the per-title defaults from infer_age.
df['age'] = df.apply(infer_age, axis=1)

# Getting the true cost of individual tickets
# The last whitespace-separated token of each ticket string is taken as the
# ticket number; passengers sharing that number form one group.
ticket_split = df['ticket'].str.split().tolist()
ticket_number = [x[-1] for x in ticket_split]
df['ticket_number'] = ticket_number

# One row per ticket number with the count of non-null passenger ids on it.
tb_ticket_counts = (
    df.groupby('ticket_number')['passengerid']
    .count()
    .rename('group_size')
    .reset_index()
)

# Join key is spelled out rather than inferred from whichever column names the
# two frames happen to share.
df = pd.merge(
    left=df,
    right=tb_ticket_counts,
    how='left',
    on='ticket_number'
)


def family_sizer(row):
    """Bucket ``row['family_size']`` into a category code.

    Returns 1 when the size equals 1, 2 for any other size below 5, and 3
    otherwise (which includes values that fail both comparisons, e.g. NaN).
    """
    members = row['family_size']
    if members == 1:
        return 1
    return 2 if members < 5 else 3
    
# Categorise every row's family size (1, 2 or 3) with family_sizer.
df['family_size_cat'] = df.apply(family_sizer, axis='columns')

def group_sizer(row):
    """Bucket ``row['group_size']`` into a category code.

    Returns 1 when the size equals 1, 2 for any other size below 5, and 3
    otherwise (which includes values that fail both comparisons, e.g. NaN).
    """
    travellers = row['group_size']
    if travellers == 1:
        return 1
    if travellers < 5:
        return 2
    return 3
    
# Categorise every row's ticket-group size (1, 2 or 3) with group_sizer.
df['group_size_cat'] = df.apply(group_sizer, axis='columns')

# One-hot encode the categorical columns; every indicator column is named
# '<feature>_<level>'.
df_sex = pd.get_dummies(df['sex']).add_prefix('sex_')

df_title = pd.get_dummies(df['title']).add_prefix('title_')

# Embarked levels are passed through int() before being formatted.
# NOTE(review): this assumes 'embarked' already holds numeric codes (int()
# would raise on port letters) -- confirm against the input files.
df_embarked = pd.get_dummies(df['embarked']).rename(
    columns=lambda level: 'embarked_{}'.format(int(level))
)

# One-hot encode the family-size and group-size categories.
df_family = pd.get_dummies(df['family_size_cat'])
df_family.columns = ['family_size_{}'.format(x) for x in df_family.columns]

df_group = pd.get_dummies(df['group_size_cat'])
df_group.columns = ['group_size_{}'.format(x) for x in df_group.columns]

# Append every set of indicator columns to the main frame.
df = pd.concat([df, df_sex, df_title, df_embarked, df_family, df_group], axis=1)

# Scaling
# NOTE(review): these scaling statements had been spliced into the middle of
# the df_family.columns list comprehension above; they are restored here as
# their own step after the concat -- confirm the intended cell order.
from sklearn.preprocessing import MinMaxScaler

# Fill missing fares with the median fare, then divide each fare by the number
# of passengers sharing the ticket number to get a per-passenger fare.
df['fare'] = df['fare'].fillna(df['fare'].median())
df['true_fare'] = (df['fare'] / df['group_size']).round(2)

scaler = MinMaxScaler()
# to_numpy() replaces as_matrix(), which was removed in pandas 1.0.
fare_array = df['true_fare'].to_numpy()
age_array = df['age'].to_numpy()
family_array = df['family_size'].to_numpy()
group_array = df['group_size'].to_numpy()

# The scaler expects 2-D input, hence the reshape to a single column.
# fit_transform refits the same scaler for each feature, so after this step
# `scaler` holds the fit for group_size only.
fare_scaled = scaler.fit_transform(fare_array.reshape(-1, 1))
age_scaled = scaler.fit_transform(age_array.reshape(-1, 1))
family_scaled = scaler.fit_transform(family_array.reshape(-1, 1))
group_scaled = scaler.fit_transform(group_array.reshape(-1, 1))

# Flatten the single-column results back to 1-D before storing them.
df['true_fare_scaled'] = fare_scaled.ravel()
df['age_scaled'] = age_scaled.ravel()
df['family_scaled'] = family_scaled.ravel()
df['group_scaled'] = group_scaled.ravel()





In [18]:
# Persist the combined, feature-engineered frame without the row index.
output_path = '../data/full_train_test.csv'
df.to_csv(output_path, index=False)