## Setup

Import necessary libraries

In [28]:
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import os
# To enable import from future_encoders.py
import sys
sys.path.append(os.path.join(os.getcwd(), '../'))

%matplotlib inline

np.random.seed(42)

## Read Data

In [29]:
# Dataset directories, built relative to the notebook's current working directory.
DATASETS_PATH = os.path.join(os.getcwd(), '../', 'datasets')
TITANIC_PATH = os.path.join(DATASETS_PATH, 'titanic')


def load_titanic_data(filename):
    """Read one CSV file from the Titanic dataset directory.

    Parameters
    ----------
    filename : str
        File name joined onto ``TITANIC_PATH`` (e.g. ``'train.csv'``).

    Returns
    -------
    pandas.DataFrame
        The parsed contents of the CSV file.
    """
    csv_path = os.path.join(TITANIC_PATH, filename)
    titanic_frame = pd.read_csv(csv_path)
    return titanic_frame

# Load the two CSV files of the Titanic dataset directory.
train = load_titanic_data('train.csv')
test = load_titanic_data('test.csv')

# Preview the first rows of the training frame.
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


## Prepare Data

Drop the columns that have nothing to do with the survival rate.

In [36]:
# Remove the identifier-like columns, then summarise the numeric columns that remain.
train_without_id_name_ticket = train.drop(columns=["PassengerId", "Name", "Ticket"])
train_without_id_name_ticket.describe()

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,714.0,891.0,891.0,891.0
mean,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,0.0,1.0,0.42,0.0,0.0,0.0
25%,0.0,2.0,20.125,0.0,0.0,7.9104
50%,0.0,3.0,28.0,0.0,0.0,14.4542
75%,1.0,3.0,38.0,1.0,0.0,31.0
max,1.0,3.0,80.0,8.0,6.0,512.3292


Drop the columns with more than 50% missing values. The `get_missing_values_table` function is taken from https://towardsdatascience.com/a-complete-machine-learning-walk-through-in-python-part-one-c62152f39420

In [31]:
# Function to calculate missing values by column
def get_missing_values_table(df):
    """Summarise the missing values of a DataFrame, column by column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``df`` that contains at least one missing
        value, indexed by column name, with the columns
        ``'Missing Values'`` (count) and ``'% of Total Values'``
        (percentage of rows, rounded to one decimal). Rows are sorted by
        percentage, highest first.

    Side effects
    ------------
    Prints a two-line summary: the number of columns in ``df`` and how
    many of them have missing values.
    """
    # Total missing values per column (computed once, reused below).
    missing_count = df.isnull().sum()

    # Percentage of missing values. max(..., 1) guards the zero-row case:
    # 0 / 0 would yield NaN, and NaN would wrongly count as "missing".
    missing_percent = 100 * missing_count / max(len(df), 1)

    # Make a table with the results, naming the two columns directly.
    missing_table = pd.concat(
        [missing_count, missing_percent],
        axis=1,
        keys=['Missing Values', '% of Total Values'],
    )

    # Keep only columns that really have missing entries (filter on the
    # integer count rather than the float percentage), then sort by
    # percentage of missing values, descending.
    missing_table = (
        missing_table[missing_table['Missing Values'] > 0]
        .sort_values('% of Total Values', ascending=False)
        .round(1)
    )

    # Print some summary information
    print("Your selected dataframe has " + str(df.shape[1]) + " columns.\n"
          "There are " + str(missing_table.shape[0]) +
          " columns that have missing values.")

    # Return the dataframe with missing information
    return missing_table

# Build the missing-value summary for the reduced training frame and display it.
missing_values_table = get_missing_values_table(train_without_id_name_ticket)
missing_values_table

Your selected dataframe has 9 columns.
There are 3 columns that have missing values.


Unnamed: 0,Missing Values,% of Total Values
Cabin,687,77.1
Age,177,19.9
Embarked,2,0.2


In [32]:
# Names of the columns where more than half of the values are missing.
cols_with_over_50_missing = missing_values_table[missing_values_table["% of Total Values"] > 50].index
# Drop them from the frame the missing-value table was computed on.
# (The undefined name `train_without_id_and_name` used here before raised a
# NameError on a fresh kernel.)
train_useful = train_without_id_name_ticket.drop(list(cols_with_over_50_missing), axis=1)
train_useful.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22.0,1,0,7.25,S
1,1,1,female,38.0,1,0,71.2833,C
2,1,3,female,26.0,0,0,7.925,S
3,1,1,female,35.0,1,0,53.1,S
4,0,3,male,35.0,0,0,8.05,S


Here we split out the features and target

In [33]:
# Features: every column except the label. Target: an independent copy of the label column.
X_train = train_useful.drop(columns=['Survived'])
y_train = train_useful['Survived'].copy()

## Create Pipeline

In [35]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import StandardScaler, Imputer
from future_encoders import OneHotEncoder

class DataFrameSelector(BaseEstimator, TransformerMixin):
    """Transformer that keeps only the configured columns of its input.

    Parameters
    ----------
    attributes : label or list of labels
        Used to index the input in ``transform`` (``X[attributes]``).
    """

    def __init__(self, attributes):
        self.attributes = attributes

    def fit(self, X, y=None):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def transform(self, X, y=None):
        """Return ``X`` indexed by the configured attributes."""
        selected = X[self.attributes]
        return selected
    
class MostFrequentImputer(BaseEstimator, TransformerMixin):
    """Fill missing values with each column's most frequent value.

    ``fit`` stores the top ``value_counts()`` entry of every column in
    ``most_frequent_``; ``transform`` passes that Series to ``fillna``.
    """

    def fit(self, X, y=None):
        """Record the most frequent value of each column of ``X``."""
        top_values = []
        for column in X:
            # value_counts() orders by frequency, so the first index entry
            # is the most common value of this column.
            counts = X[column].value_counts()
            top_values.append(counts.index[0])
        self.most_frequent_ = pd.Series(top_values, index=X.columns)
        return self

    def transform(self, X, y=None):
        """Return ``X`` with missing entries replaced by the stored values."""
        filled = X.fillna(self.most_frequent_)
        return filled

# Columns handled as categories; every other feature column is treated as numerical.
cat_cols = ['Sex', 'Embarked', 'Pclass']
num_cols = list(X_train.drop(cat_cols, axis=1).columns)

# Numerical branch: select columns -> fill gaps with the median -> standardise.
# NOTE(review): `Imputer` was deprecated in scikit-learn 0.20 and removed in 0.22
# in favour of `sklearn.impute.SimpleImputer`; migrate when upgrading scikit-learn.
num_pipeline = Pipeline([
    ('num_selector', DataFrameSelector(num_cols)),
    ('median_imputer', Imputer(strategy='median')),
    ('std_scaler', StandardScaler()),
])

# Categorical branch: select columns -> fill gaps with the most frequent value -> one-hot encode.
cat_pipeline = Pipeline([
    ('cat_selector', DataFrameSelector(cat_cols)),
    ('most_frequent_imputer', MostFrequentImputer()),
    # Named after what the step does; the earlier label 'std_scaler' was a
    # copy-paste of the numerical branch and misleading in named_steps/set_params.
    ('one_hot_encoder', OneHotEncoder(sparse=False)),
])

# Run both branches on the same input and concatenate their outputs column-wise.
data_preparation_pipeline = FeatureUnion(transformer_list=[
    ("num_pipeline", num_pipeline),
    ("cat_pipeline", cat_pipeline),
])

X_train_prep = data_preparation_pipeline.fit_transform(X_train)
# Compare the shape before and after preparation.
print(X_train.shape)
print(X_train_prep.shape)

(891, 7)
(891, 12)
