## Import Required Libraries

In [1]:
import pandas as pd
import numpy as np
from numpy.random import normal
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, FunctionTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA

## Import data and inspect

In [2]:
df = pd.read_csv('train.csv')

In [36]:
df['Embarked'].value_counts().keys()[0]

'S'

In [3]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [92]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB


In [93]:
df.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


## Processing Titles

In [4]:
df['Titles'] = df['Name'].str.extract('(\w*\.)')

In [5]:
df['Titles'].value_counts()

Mr.          517
Miss.        182
Mrs.         125
Master.       40
Dr.            7
Rev.           6
Mlle.          2
Major.         2
Col.           2
Sir.           1
Countess.      1
Lady.          1
Capt.          1
Jonkheer.      1
Don.           1
Mme.           1
Ms.            1
Name: Titles, dtype: int64

In [6]:
df.loc[~df['Titles'].isin(['Mr.', 'Miss.', 'Mrs.', 'Master.']), 'Titles'] = 'Other'

In [7]:
df['Titles'].value_counts()

Mr.        517
Miss.      182
Mrs.       125
Master.     40
Other       27
Name: Titles, dtype: int64

In [8]:
df = pd.concat([df, pd.get_dummies(df['Titles'])], axis=1)

## Processing Age Related Data

In [9]:
df['Age'].isnull().sum()

177

There are 177 null values - we have two choices here: impute the data using the mean/median OR randomly assign a value. To randomly assign a value we will use the Numpy Random function passing in the index/passenger ID as the seed for reproducibility.

In [100]:
df['Age'].describe()

count    714.000000
mean      29.699118
std       14.526497
min        0.420000
25%       20.125000
50%       28.000000
75%       38.000000
max       80.000000
Name: Age, dtype: float64

#### Impute Age Using Randomly Assigned Value from Mean/Std (Option 1)

In [101]:
seed = np.random.RandomState(15144)
age_estimates = seed.normal(df['Age'].mean(), scale=df['Age'].std(), size=df['Age'].isnull().sum())

In [102]:
age_estimates = [x if x > 0 else df['Age'].mean() for x in age_estimates]
df.loc[df['Age'].isnull(), 'Age'] = age_estimates

In [103]:
df['Age'].describe()

count    891.000000
mean      29.549299
std       14.262134
min        0.420000
25%       20.000000
50%       28.000000
75%       38.000000
max       80.000000
Name: Age, dtype: float64

#### Imputing Age Using Median/Mean (Option 2)

In [10]:
mean_age_by_gender = df.groupby('Sex').mean()['Age'].to_dict()
null_age = df.loc[df['Age'].isna(), :].Sex.map(mean_age_by_gender)
df.loc[null_age.index, 'Age'] = null_age

## Processing Ticket Related Data

In [9]:
df['Ticket'] = df['Ticket'].str.replace('\.|/', '')

In [10]:
temp_ticket = df['Ticket'].str.extract('(\w*)?\s?(.*)')

In [11]:
for index, row in temp_ticket.iterrows():
    
    if not row[1]:
        
        temp_ticket.iloc[index][1] = row[0]
        temp_ticket.iloc[index][0] = ''

temp_ticket = pd.concat([temp_ticket[1], 
                         pd.get_dummies(temp_ticket[0])[temp_ticket[0].value_counts().keys()[1:10].tolist()]], 
                        axis=1)

In [12]:
temp_ticket[1] = temp_ticket[1].str.extract('(\d{2,})')
temp_ticket[1] = temp_ticket[1].fillna(0).astype(int)
temp_ticket.rename(columns={1:"TicketID"}, inplace=True)

## Mining Cabin Related Data

#### Extracting Cabin Info into Numerical Values

In [13]:
cabin_extract = df['Cabin'].str.extract('(\w)\d*$')
cabin_extract = pd.get_dummies(cabin_extract[0])
z = dict(zip(cabin_extract.columns.tolist(), [x + '_CABIN' for x in cabin_extract.columns.tolist()]))
cabin_extract.rename(columns=z, inplace=True)
df = pd.concat([df, cabin_extract], axis=1)


#cabin_extract = pd.concat([cabin_extract[1], pd.get_dummies(cabin_extract[0])], axis=1).fillna(0)

In [14]:
cabin_extract

Unnamed: 0,A_CABIN,B_CABIN,C_CABIN,D_CABIN,E_CABIN,F_CABIN,G_CABIN,T_CABIN
0,0,0,0,0,0,0,0,0
1,0,0,1,0,0,0,0,0
2,0,0,0,0,0,0,0,0
3,0,0,1,0,0,0,0,0
4,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0
6,0,0,0,0,1,0,0,0
7,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0


In [15]:
cabin_extract.rename(columns={1:'CabinNo'}, inplace=True)
z = dict(zip(cabin_extract.columns[1:].tolist(), [x + '_CABIN' for x in cabin_extract.columns[1:].tolist()]))
cabin_extract.rename(columns=z, inplace=True)
cabin_extract.loc[cabin_extract['CabinNo'] == '', 'CabinNo'] = 0
cabin_extract.loc[:, 'CabinNo'] = cabin_extract['CabinNo'].astype(int, errors='ignore')
cabin_extract.head()

KeyError: 'CabinNo'

In [111]:
df = pd.concat([df, cabin_extract], axis=1)

#### Creating Boolean Cabin Value

In [16]:
df.loc[:, 'InCabin'] = ~df['Cabin'].isna()

## Passenger Class Encoded

In [17]:
pclass = pd.get_dummies(df['Pclass'])

In [18]:
map = dict(zip(pclass.columns.tolist(), ['class_' + str(x) for x in pclass.columns.tolist()]))

In [19]:
pclass.rename(columns=map, inplace=True)

In [20]:
class_dummies = pd.get_dummies(df['Pclass'])
class_dummies_names = ['class_' + str(x) for x in class_dummies.columns.to_list()]
df[class_dummies_names] = class_dummies

## Embarked Encoded

In [21]:
df = pd.concat([df, pd.get_dummies(df['Embarked'])], axis=1)

## Gender Encoded

In [22]:
df[['female', 'male']] = pd.get_dummies(df['Sex'])

## Feature Engineering

In [116]:
# TO DO -- 

In [117]:
#df.loc[:, 'Child'] = df['Age'] <= 12

#Life expectancy in 1912 was ~53, well say someone is elderly if they are older than 60
#df.loc[:, 'Elderly'] = df['Age'] >= 60

In [24]:
df['FamilyAboard'] = df['Parch'] + df['SibSp']

In [25]:
df['IsAlone'] = df['FamilyAboard'] == 0

# -Age Imputing-

In [29]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,...,class_1,class_2,class_3,C,Q,S,female,male,FamilyAboard,IsAlone
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A5 21171,7.25,...,0,0,1,0,0,1,0,1,1,False
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,...,1,0,0,1,0,0,1,0,1,False
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STONO2 3101282,7.925,...,0,0,1,0,0,1,1,0,0,True
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,...,1,0,0,0,0,1,1,0,1,False
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,...,0,0,1,0,0,1,0,1,0,True


In [34]:
df.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked', 'Titles', 'Master.',
       'Miss.', 'Mr.', 'Mrs.', 'Other', 'A_CABIN', 'B_CABIN', 'C_CABIN',
       'D_CABIN', 'E_CABIN', 'F_CABIN', 'G_CABIN', 'T_CABIN', 'InCabin',
       'class_1', 'class_2', 'class_3', 'C', 'Q', 'S', 'female', 'male',
       'FamilyAboard', 'IsAlone'],
      dtype='object')

In [44]:
t = df.groupby(['Sex', 'Pclass','Titles']).mean()['Age'].reset_index()

In [45]:
t.head()

Unnamed: 0,Sex,Pclass,Titles,Age
0,female,1,Miss.,30.0
1,female,1,Mrs.,40.882353
2,female,1,Other,33.666667
3,female,2,Miss.,22.390625
4,female,2,Mrs.,33.682927


In [77]:
t.set_index(['Sex', 'Pclass', 'Titles']).loc[('dragon', 1, 'Miss.'), 'Age']

KeyError: ('dragon', 1, 'Miss.')

## Dropping unecessary data

In [120]:
df = df.drop(df[df['Embarked'].isna()].index)
drop_columns = ['PassengerId', 'Sex', 'Ticket', 'Cabin', 'Name', 'Pclass', 'Embarked', 'Titles']
df_dropped = df.drop(drop_columns, axis=1)

## Export Processed Data to Cleaned CSV

In [121]:
df_dropped.to_csv('train_cleaned.csv', index=False)