# Data Preprocessing
Titanic - by [Saniewski](https://github.com/Saniewski)

In [95]:
import numpy as np
import pandas as pd

In [96]:
# Read the raw CSV; the path is relative to the notebook's working directory.
titanic = pd.read_csv(filepath_or_buffer='./data/train.csv')

In [97]:
# Preview the first five rows of the raw frame.
titanic.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [98]:
# Work on an independent copy so the `titanic` frame stays exactly as loaded.
df = titanic.copy(deep=True)

## Simple columns and values preprocessing

Remove redundant ID column - `PassengerId`

In [99]:
# Drop the identifier column.
# Rebinding `df` (instead of `inplace=True`) together with errors='ignore' keeps this
# cell safe to re-run: a second execution is a no-op rather than a KeyError.
df = df.drop(columns=['PassengerId'], errors='ignore')
df

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


One-hot encode `Pclass` (passenger's socio-economic class), `Sex`, and `Embarked` columns.

In [100]:
# Encode the three categorical columns in a single call. `drop_first=True` omits the
# first level of each column (class 1, 'female', 'C'); the indicator columns are
# appended at the end of the frame and the source columns are removed.
df = pd.get_dummies(
    df,
    columns=['Pclass', 'Sex', 'Embarked'],
    drop_first=True,
).rename(columns={
    'Pclass_2': "2nd_class",
    'Pclass_3': "3rd_class",
    'Sex_male': 'male',
    'Embarked_Q': 'Queenstown',
    'Embarked_S': 'Southampton',
})
df

Unnamed: 0,Survived,Name,Age,SibSp,Parch,Ticket,Fare,Cabin,2nd_class,3rd_class,male,Queenstown,Southampton
0,0,"Braund, Mr. Owen Harris",22.0,1,0,A/5 21171,7.2500,,0,1,1,0,1
1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",38.0,1,0,PC 17599,71.2833,C85,0,0,0,0,0
2,1,"Heikkinen, Miss. Laina",26.0,0,0,STON/O2. 3101282,7.9250,,0,1,0,0,1
3,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",35.0,1,0,113803,53.1000,C123,0,0,0,0,1
4,0,"Allen, Mr. William Henry",35.0,0,0,373450,8.0500,,0,1,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,"Montvila, Rev. Juozas",27.0,0,0,211536,13.0000,,1,0,1,0,1
887,1,"Graham, Miss. Margaret Edith",19.0,0,0,112053,30.0000,B42,0,0,0,0,1
888,0,"Johnston, Miss. Catherine Helen ""Carrie""",,1,2,W./C. 6607,23.4500,,0,1,0,0,1
889,1,"Behr, Mr. Karl Howell",26.0,0,0,111369,30.0000,C148,0,0,1,0,0


Convert the `Cabin` column into a binary indicator of whether a passenger's cabin is known after the disaster (1) or missing (0). A lot of this data is missing, so instead of discarding the column we try to make use of what we have.

In [101]:
# Flag whether a cabin was recorded (1) or is missing (0).
# `notna()` recognises every missing-value representation, whereas an `x is np.nan`
# identity test only matches the np.nan singleton object.
# Reading from the untouched `titanic` frame keeps the cell re-runnable: applying the
# test to an already-converted 0/1 column would mark every passenger as having a cabin.
df['Cabin'] = titanic['Cabin'].notna().astype(int)
df

Unnamed: 0,Survived,Name,Age,SibSp,Parch,Ticket,Fare,Cabin,2nd_class,3rd_class,male,Queenstown,Southampton
0,0,"Braund, Mr. Owen Harris",22.0,1,0,A/5 21171,7.2500,0,0,1,1,0,1
1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",38.0,1,0,PC 17599,71.2833,1,0,0,0,0,0
2,1,"Heikkinen, Miss. Laina",26.0,0,0,STON/O2. 3101282,7.9250,0,0,1,0,0,1
3,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",35.0,1,0,113803,53.1000,1,0,0,0,0,1
4,0,"Allen, Mr. William Henry",35.0,0,0,373450,8.0500,0,0,1,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,"Montvila, Rev. Juozas",27.0,0,0,211536,13.0000,0,1,0,1,0,1
887,1,"Graham, Miss. Margaret Edith",19.0,0,0,112053,30.0000,1,0,0,0,0,1
888,0,"Johnston, Miss. Catherine Helen ""Carrie""",,1,2,W./C. 6607,23.4500,0,0,1,0,0,1
889,1,"Behr, Mr. Karl Howell",26.0,0,0,111369,30.0000,1,0,0,1,0,0


Remove `Ticket` column - not sure if it can be transformed in any useful way...

In [102]:
# Drop the free-form ticket identifier.
# Rebinding `df` (instead of `inplace=True`) together with errors='ignore' keeps this
# cell safe to re-run: a second execution is a no-op rather than a KeyError.
df = df.drop(columns=['Ticket'], errors='ignore')
df

Unnamed: 0,Survived,Name,Age,SibSp,Parch,Fare,Cabin,2nd_class,3rd_class,male,Queenstown,Southampton
0,0,"Braund, Mr. Owen Harris",22.0,1,0,7.2500,0,0,1,1,0,1
1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",38.0,1,0,71.2833,1,0,0,0,0,0
2,1,"Heikkinen, Miss. Laina",26.0,0,0,7.9250,0,0,1,0,0,1
3,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",35.0,1,0,53.1000,1,0,0,0,0,1
4,0,"Allen, Mr. William Henry",35.0,0,0,8.0500,0,0,1,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,"Montvila, Rev. Juozas",27.0,0,0,13.0000,0,1,0,1,0,1
887,1,"Graham, Miss. Margaret Edith",19.0,0,0,30.0000,1,0,0,0,0,1
888,0,"Johnston, Miss. Catherine Helen ""Carrie""",,1,2,23.4500,0,0,1,0,0,1
889,1,"Behr, Mr. Karl Howell",26.0,0,0,30.0000,1,0,0,1,0,0


Convert `Name` column to `Title` - this might be of use later...

In [103]:
def _title_from_name(name):
    """Return the text between the first ', ' and the first '. ' of `name`, keeping the dot.

    Raises ValueError (from `str.index`) when either separator is absent.
    """
    start = name.index(', ') + 2
    stop = name.index('. ') + 1
    return name[start:stop]


# Derive `Title` from `Name`, then discard the full name.
df['Title'] = df['Name'].map(_title_from_name)
df = df.drop(columns=['Name'])
df

Unnamed: 0,Survived,Age,SibSp,Parch,Fare,Cabin,2nd_class,3rd_class,male,Queenstown,Southampton,Title
0,0,22.0,1,0,7.2500,0,0,1,1,0,1,Mr.
1,1,38.0,1,0,71.2833,1,0,0,0,0,0,Mrs.
2,1,26.0,0,0,7.9250,0,0,1,0,0,1,Miss.
3,1,35.0,1,0,53.1000,1,0,0,0,0,1,Mrs.
4,0,35.0,0,0,8.0500,0,0,1,1,0,1,Mr.
...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,27.0,0,0,13.0000,0,1,0,1,0,1,Rev.
887,1,19.0,0,0,30.0000,1,0,0,0,0,1,Miss.
888,0,,1,2,23.4500,0,0,1,0,0,1,Miss.
889,1,26.0,0,0,30.0000,1,0,0,1,0,0,Mr.


One-hot encode `Title` column.

In [104]:
# One-hot encode `Title` directly on the frame. The empty prefix and separator keep the
# bare title text as the column name; `drop_first=True` omits the first title level.
df = pd.get_dummies(
    df,
    columns=['Title'],
    prefix='',
    prefix_sep='',
    drop_first=True,
)
df

Unnamed: 0,Survived,Age,SibSp,Parch,Fare,Cabin,2nd_class,3rd_class,male,Queenstown,...,Master.,Miss.,Mlle.,Mme.,Mr.,Mrs.,Ms.,Rev.,Sir.,the Countess.
0,0,22.0,1,0,7.2500,0,0,1,1,0,...,0,0,0,0,1,0,0,0,0,0
1,1,38.0,1,0,71.2833,1,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
2,1,26.0,0,0,7.9250,0,0,1,0,0,...,0,1,0,0,0,0,0,0,0,0
3,1,35.0,1,0,53.1000,1,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
4,0,35.0,0,0,8.0500,0,0,1,1,0,...,0,0,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,27.0,0,0,13.0000,0,1,0,1,0,...,0,0,0,0,0,0,0,1,0,0
887,1,19.0,0,0,30.0000,1,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
888,0,,1,2,23.4500,0,0,1,0,0,...,0,1,0,0,0,0,0,0,0,0
889,1,26.0,0,0,30.0000,1,0,0,1,0,...,0,0,0,0,1,0,0,0,0,0


## Imputation with ML

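Predict missing `Age` values. Rows with a known `Age` are used for training; rows where `Age` is missing are the ones to be filled in.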

In [112]:
# Build the mask once and reuse it: rows with a known `Age` go to `df_train`,
# rows with a missing `Age` go to `df_test`.
age_missing = df['Age'].isna()
df_train = df[~age_missing]
df_test = df[age_missing]

In [113]:
# Report (rows, columns) for the known-age and missing-age subsets, in that order.
for subset in (df_train, df_test):
    print(subset.shape)

(714, 27)
(177, 27)


In [114]:
from sklearn.model_selection import train_test_split

In [None]:
# `Age` is the value being imputed, so it is the target; every other column is a feature.
# train_test_split returns two splits per array passed in, so features and target must be
# passed separately to obtain the four outputs unpacked here (a single array yields only
# two and the unpacking raises ValueError).
X_train, X_valid, y_train, y_valid = train_test_split(
    df_train.drop(columns=['Age']),
    df_train['Age'],
    test_size=0.3,
    random_state=8801,
)

Drop records with `None` or `NaN` values.

In [93]:
# NOTE(review): this cell is disabled; presumably dropping incomplete rows is superseded
# by the `Age` imputation section — confirm before re-enabling.
# print(df.shape)
# df.dropna(inplace=True, axis=0)
# print(df.shape)

Save processed DataFrame to `./data/train_processed.csv`.

In [94]:
# NOTE(review): the export is disabled; presumably it should be re-enabled once the
# preprocessing is final — TODO confirm.
# df.to_csv('./data/train_processed.csv', index=False)