## Perform Preprocessing

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
import multiprocessing

%matplotlib inline

np.random.seed(42)

In [2]:
train_data = pd.read_csv('data/train.csv')
test_data = pd.read_csv('data/test.csv')
y_train = train_data['Survived']
train_data.drop(['Survived', 'Cabin', 'Ticket', 'Name'], axis=1, inplace=True)

In [3]:
#Reset index to PassengerId
train_data.set_index('PassengerId', inplace=True)

In [4]:
#Create list of attributes for preprocessing pipeline
num_attribs = ['Parch', 'Age', 'SibSp', 'Fare']
cat_attribs = ['Pclass', 'Embarked', 'Sex']

In [5]:
#Create pipeline for preprocessing
num_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("std_scaler", StandardScaler()),   
])

cat_pipeline = Pipeline([
    ("one_hot_encoder", OneHotEncoder()),
    ("imputer", SimpleImputer(strategy="median")),
])

In [6]:
#Create preprocessing pipeline
preprocess_pipeline = ColumnTransformer([
    ("num", num_pipeline, num_attribs),
    ("cat", cat_pipeline, cat_attribs),
])

X_train = preprocess_pipeline.fit_transform(train_data[num_attribs + cat_attribs])

X_train

array([[-0.47367361, -0.56573646,  0.43279337, ...,  0.        ,
         0.        ,  1.        ],
       [-0.47367361,  0.66386103,  0.43279337, ...,  0.        ,
         1.        ,  0.        ],
       [-0.47367361, -0.25833709, -0.4745452 , ...,  0.        ,
         1.        ,  0.        ],
       ...,
       [ 2.00893337, -0.1046374 ,  0.43279337, ...,  0.        ,
         1.        ,  0.        ],
       [-0.47367361, -0.25833709, -0.4745452 , ...,  0.        ,
         0.        ,  1.        ],
       [-0.47367361,  0.20276197, -0.4745452 , ...,  0.        ,
         0.        ,  1.        ]])

In [7]:
pd.to_pickle(X_train, '../data/X_train_v3.pkl')
pd.to_pickle(y_train, '../data/y_train_v3.pkl')
