# Titanic Feature Engineering
## Nick's from Scratch

In [1]:
import numpy as np
import pandas as pd

In [2]:
train_df = pd.read_csv("Data/train.csv", index_col='PassengerId')
test_df = pd.read_csv("Data/test.csv", index_col='PassengerId')
Survived = train_df['Survived'].copy()
train_df = train_df.drop('Survived', axis=1)

In [3]:
test_df.shape, train_df.shape

((418, 10), (891, 10))

In [4]:
df = pd.concat([test_df, train_df])
traindex = train_df.index
testdex = test_df.index
print(test_df.equals(df.loc[testdex,:]))
print(train_df.equals(df.loc[traindex,:]))
del train_df
del test_df

True
True


#### Missing Values

In [5]:
# Proportion Missing Table:
settypes=df.dtypes.reset_index()
def missing(df):
    missing = df.isnull().sum(axis=0).reset_index()
    missing.columns = ['column_name', 'missing_count']
    missing['missing_ratio'] = missing['missing_count'] / df.shape[0]
    missing = pd.merge(missing,settypes, left_on='column_name', right_on='index',how='inner')
    missing = missing.loc[(missing['missing_ratio']>0)]\
    .sort_values(by=["missing_ratio"], ascending=False)
    print(missing)

In [6]:
missing(df)

  column_name  missing_count  missing_ratio     index        0
8       Cabin           1014       0.774637     Cabin   object
3         Age            263       0.200917       Age  float64
9    Embarked              2       0.001528  Embarked   object
7        Fare              1       0.000764      Fare  float64


#### Fix for Missing Values

In [7]:
# Drop Identification Variable and data with heavy missing
df= df.drop(['Ticket', 'Cabin'], axis=1)

In [8]:
df[df.isnull()['Embarked']]
# df[df.isnull().any(axis=1)] # View all missing

Unnamed: 0_level_0,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
62,1,"Icard, Miss. Amelie",female,38.0,0,0,80.0,
830,1,"Stone, Mrs. George Nelson (Martha Evelyn)",female,62.0,0,0,80.0,


In [9]:
df['Fare'] = df['Fare'].fillna(df['Fare'].mean())
df['Embarked'] = df['Embarked'].fillna(df['Embarked'].mode().iloc[0])

In [10]:
# Model Missing values for Age
# Add Model at some point
df['Age'] = df['Age'].fillna(df['Age'].mean())

In [12]:
df.describe(include=['O'])

Unnamed: 0,Name,Sex,Embarked
count,1309,1309,1309
unique,1307,2,3
top,"Connolly, Miss. Kate",male,S
freq,2,843,916


In [26]:
# No More Missing Values
missing(df)

Empty DataFrame
Columns: [column_name, missing_count, missing_ratio, index, 0]
Index: []


#### Titles

In [13]:
# Processing Passenger Title
df['Title'] = df.Name.str.extract(' ([A-Za-z]+)\.', expand=False)
pd.crosstab(df['Title'], df['Sex']).transpose()

Title,Capt,Col,Countess,Don,Dona,Dr,Jonkheer,Lady,Major,Master,Miss,Mlle,Mme,Mr,Mrs,Ms,Rev,Sir
Sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
female,0,0,1,0,1,1,0,1,0,0,260,2,1,0,197,2,0,0
male,1,4,0,1,0,7,1,0,2,61,0,0,0,757,0,0,8,1


In [14]:
import re

# Define function to extract titles from passenger names
def get_title(name):
    title_search = re.search(' ([A-Za-z]+)\.', name)
    # If the title exists, extract and return it.
    if title_search:
        return title_search.group(1)
    return ""

df['Title'] = df['Name'].apply(get_title)
# Group all non-common titles into one single grouping "Rare"

df['Title'] = df['Title'].replace(
    ['Lady', 'Countess','Capt', 'Col','Don',
     'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Rare')

df['Title'] = df['Title'].replace('Mlle', 'Miss')
df['Title'] = df['Title'].replace('Ms', 'Miss')
df['Title'] = df['Title'].replace('Mme', 'Mrs')

# Mapping titles
## Will Use Dummy instead
#title_mapping = {"Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Rare": 5}
#df['Title'] = df['Title'].map(title_mapping)
#df['Title'] = df['Title'].fillna(0)

df = df.drop('Name', axis=1)

#### Numeric Categories

In [15]:
## Assign Binary to Sex str
df['Sex'] = df['Sex'].map( {'female': 1, 'male': 0} ).astype(int)
# Title
df['Title'] = df['Title'].map( {'Mr': 0, 'Mrs': 1, 'Miss': 2, 'Master':3, 'Rare':4} ).astype(int)
# Embarked
df['Embarked'] = df['Embarked'].map( {'Q': 0, 'S': 1, 'C': 2, 'Master':3, 'Rare':4} ).astype(int)

In [18]:
for col in ['Embarked', 'Title', 'Parch','SibSp','Pclass']:
    print(col, df[col].unique())
#df.aggregate(np.unique, axis=1)

Embarked [0 1 2]
Title [0 1 2 3 4]
Parch [0 1 3 2 4 6 5 9]
SibSp [0 1 2 3 4 5 8]
Pclass [3 2 1]


In [19]:
## Apply Dummy to Appropriate Variables
# df = pd.get_dummies(df, columns=['Embarked', 'Title', 'Parch','SibSp','Pclass'], )

#### Standardization

In [16]:
# Scaler
from sklearn import preprocessing
for col in ['Fare', 'Age']:
    transf = df.Fare.reshape(-1,1)
    scaler = preprocessing.StandardScaler().fit(transf)
    df[col] = scaler.transform(transf)



In [17]:
df.head()

Unnamed: 0_level_0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Title
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
892,3,0,-0.492396,0,0,-0.492396,0,0
893,3,1,-0.508429,1,0,-0.508429,1,1
894,2,0,-0.456465,0,0,-0.456465,0,0
895,3,0,-0.476284,0,0,-0.476284,1,0
896,3,1,-0.406194,1,1,-0.406194,1,1


#### Recombine

In [20]:
train_df = df.loc[traindex, :]
train_df['Survived'] = Survived

In [21]:
#df.loc[traindex, :].to_csv((os.path.join(path,r"train_nick.csv")),header=True,index=True)
#df.loc[testdex, :].to_csv((os.path.join(path,r"test_nick.csv")),header=True,index=True)

#### Create File

In [22]:
train_df.to_csv('Data/clean_train_nick.csv',header=True,index=True)
df.loc[testdex, :].to_csv('Data/clean_test_nick.csv',header=True,index=True)

#### View Output

In [24]:
train_df.head()

Unnamed: 0_level_0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Title,Survived
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,3,0,-0.503595,1,0,-0.503595,1,0,0
2,1,1,0.734503,1,0,0.734503,2,1,1
3,3,1,-0.490544,0,0,-0.490544,1,2,1
4,1,1,0.382925,1,0,0.382925,1,1,1
5,3,0,-0.488127,0,0,-0.488127,1,0,0


In [25]:
df.describe()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Title
count,1309.0,1309.0,1309.0,1309.0,1309.0,1309.0,1309.0,1309.0
mean,2.294882,0.355997,7.548499000000001e-17,0.498854,0.385027,7.561221000000001e-17,1.112299,0.78304
std,0.837836,0.478997,1.000382,1.041658,0.86556,1.000382,0.536505,1.058092
min,1.0,0.0,-0.6437751,0.0,0.0,-0.6437751,0.0,0.0
25%,2.0,0.0,-0.4911082,0.0,0.0,-0.4911082,1.0,0.0
50%,3.0,0.0,-0.3643001,0.0,0.0,-0.3643001,1.0,0.0
75%,3.0,1.0,-0.0390664,1.0,0.0,-0.0390664,1.0,2.0
max,3.0,1.0,9.262219,8.0,9.0,9.262219,2.0,4.0
