# <a>Scikit Learn Pipeline</a>

This notebook explores the Titanic dataset, available on [Kaggle](https://www.kaggle.com/c/titanic/overview), and builds a basic ML pipeline. In this project, the data is already available after running `make get_data`, as described in the documentation.

In [1]:
import pandas as pd

from sklearn.model_selection import train_test_split

In [2]:
# Load the raw training file into a DataFrame and preview its first rows.
raw_csv_path = '../data/raw/train.csv'
df_titanic = pd.read_csv(raw_csv_path)
df_titanic.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


### <a>Data Dictionary</a>

| Variable | Definition | Key |
| --- | --- | --- |
| survival | Survival | 0 = No, 1 = Yes |
| pclass | Ticket class | 1 = 1st, 2 = 2nd, 3 = 3rd |
| sex | Sex |  |
| Age | Age in years |  |
| sibsp | # of siblings / spouses aboard the Titanic |  |
| parch | # of parents / children aboard the Titanic | 	|
| ticket | Ticket number | 	 |
| fare | Passenger fare | 	 |
| cabin | Cabin number | 	 |
| embarked | Port of Embarkation | C = Cherbourg, Q = Queenstown, S = Southampton |

### <a>Split First, Transform Later</a>

In [3]:
# Separate the target column from the predictors before any transformation.
y = df_titanic['Survived']
X = df_titanic.drop(columns=['Survived'])

In [4]:
# Hold out a test set before fitting any transformer; the fixed random_state
# makes the split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [5]:
# (rows, columns) of the training and test feature frames.
X_train.shape, X_test.shape

((668, 11), (223, 11))

### <a>Check if there's any missing data</a>

In [6]:
# Count missing values per column, on the training split only.
X_train.isna().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age            132
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          519
Embarked         2
dtype: int64

In [7]:
# Class to select features from dataframe by type (numerical vs categorical)
# Modified from
# https://marloz.github.io/projects/sklearn/pipeline/missing/preprocessing/2020/03/20/sklearn-pipelines-missing-values.html
from sklearn.base import BaseEstimator, TransformerMixin

class ColumnSelector(BaseEstimator, TransformerMixin):
    """Select the columns of a DataFrame that belong to one dtype family.

    Parameters
    ----------
    dtype : {'numerical', 'categorical'}
        'categorical' keeps the object-dtype columns; 'numerical' keeps every
        column that is *not* object dtype.

    Attributes
    ----------
    cols : list
        Column labels chosen during ``fit`` and reused by ``transform``.
    """

    _SUPPORTED_DTYPES = ('numerical', 'categorical')

    def __init__(self, dtype):
        # Validation is deferred to ``fit`` so the constructor only stores the
        # parameter, as scikit-learn's cloning machinery expects.
        self.dtype = dtype

    @staticmethod
    def _as_frame(X):
        """Return ``X`` unchanged if it is a DataFrame, else wrap it in one."""
        return X if isinstance(X, pd.DataFrame) else pd.DataFrame(X)

    def fit(self, X, y=None):
        """Record which columns of ``X`` match the requested dtype family.

        Parameters
        ----------
        X : DataFrame or array-like
            Training data; non-DataFrame input is wrapped in a DataFrame.
        y : ignored
            Present for scikit-learn API compatibility.

        Returns
        -------
        self

        Raises
        ------
        ValueError
            If ``dtype`` is not 'numerical' or 'categorical'. Failing here is
            clearer than the AttributeError that would otherwise surface in
            ``transform`` because ``cols`` was never set.
        """
        if self.dtype not in self._SUPPORTED_DTYPES:
            raise ValueError(
                "dtype must be one of {}, got {!r}".format(
                    self._SUPPORTED_DTYPES, self.dtype
                )
            )

        X = self._as_frame(X)

        if self.dtype == 'numerical':
            self.cols = X.select_dtypes(exclude='O').columns.tolist()
        else:
            self.cols = X.select_dtypes(include='O').columns.tolist()
        return self

    def transform(self, X):
        """Return a DataFrame holding only the columns selected in ``fit``.

        Non-DataFrame input is wrapped in a DataFrame first, mirroring ``fit``,
        so label-based selection works for array input as well.
        """
        return self._as_frame(X).loc[:, self.cols]


In [8]:
# Example: a tiny mixed-dtype frame (one text column, two integer columns)
# used to demonstrate the custom transformers.
df = pd.DataFrame(
    {
        'product': ['banana', 'soda', 'wine'],
        'price': [10, 20, 30],
        'quantity': [12, 6, 2],
    }
)
df

Unnamed: 0,product,price,quantity
0,banana,10,12
1,soda,20,6
2,wine,30,2


In [9]:
# Fit on the example frame (fit returns the selector itself), then show which
# columns were picked up as numerical.
num_selector = ColumnSelector(dtype='numerical').fit(df)
display(num_selector.cols)

['price', 'quantity']

In [10]:
# Same demonstration for the categorical (object-dtype) columns.
cat_selector = ColumnSelector(dtype='categorical').fit(df)
display(cat_selector.cols)

['product']

In [11]:
# Apply the fitted selector: only the columns stored in `cols` are returned.
num_selector.transform(df)

Unnamed: 0,price,quantity
0,10,12
1,20,6
2,30,2


In [12]:
# Instead of removing the columns directly, create a class to encapsulate the columns to remove
class columnDropperTransformer(BaseEstimator, TransformerMixin):
    """Pipeline step that removes a fixed set of columns from a DataFrame.

    Inheriting from ``BaseEstimator`` and ``TransformerMixin`` (already
    imported for ``ColumnSelector``) makes this a regular scikit-learn
    estimator: it gains ``get_params``/``set_params``, ``fit_transform`` and a
    readable repr, consistent with ``ColumnSelector``.

    Parameters
    ----------
    columns : list of str
        Labels of the columns to drop in ``transform``.
    """

    def __init__(self, columns):
        # Stored under the same name as the argument so that get_params and
        # clone can rebuild the transformer.
        self.columns = columns

    def fit(self, X, y=None):
        """Nothing is learned from the data; returns ``self`` for chaining."""
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` without the configured columns.

        ``y`` is accepted for backward compatibility and ignored.
        """
        return X.drop(columns=self.columns)

In [13]:
# Build and fit the dropper on the example frame (fit returns the transformer
# itself), then show the columns it is configured to remove.
column_dropper = columnDropperTransformer(columns=['price']).fit(df)
column_dropper.columns

['price']

### <a> Step 01 - feature types + address missing values</a>

In [14]:
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.impute import SimpleImputer


# Numerical branch (3 steps): drop the identifier column, keep the numerical
# columns, then replace missing values with 0.
num_pipe = Pipeline(steps=[
    ('drop_nums', columnDropperTransformer(['PassengerId'])),
    ('num_selector', ColumnSelector(dtype='numerical')),
    ('num_imputer', SimpleImputer(strategy='constant', fill_value=0)),
])

# Categorical branch (3 steps): drop the free-text columns, keep the
# categorical columns, then replace missing values with the string 'None'.
cat_pipe = Pipeline(steps=[
    ('drop_cats', columnDropperTransformer(['Name', 'Ticket'])),
    ('cat_selector', ColumnSelector(dtype='categorical')),
    ('cat_imputer', SimpleImputer(strategy='constant', fill_value='None')),
])

# Run both branches on the same input and concatenate their outputs.
pre_processing = FeatureUnion(transformer_list=[
    ('num_pipe', num_pipe),
    ('cat_pipe', cat_pipe),
])

pre_processing

FeatureUnion(transformer_list=[('num_pipe',
                                Pipeline(steps=[('drop_nums',
                                                 <__main__.columnDropperTransformer object at 0x7f6040e54610>),
                                                ('num_selector',
                                                 ColumnSelector(dtype='numerical')),
                                                ('num_imputer',
                                                 SimpleImputer(fill_value=0,
                                                               strategy='constant'))])),
                               ('cat_pipe',
                                Pipeline(steps=[('drop_cats',
                                                 <__main__.columnDropperTransformer object at 0x7f6040e18040>),
                                                ('cat_selector',
                                                 ColumnSelector(dtype='categorical')),
                                

### <a> Step 02 - categorical encoding</a>

In [15]:
from sklearn.preprocessing import OneHotEncoder

# Append one-hot encoding to the categorical pipeline.
# The guard keeps this cell idempotent: Pipeline step names must be unique, so
# appending 'ohe' a second time (e.g. when the cell is re-run) would make the
# pipeline fail validation when it is fitted.
cat_steps = pre_processing.get_params()['cat_pipe'].steps
if all(step_name != 'ohe' for step_name, _ in cat_steps):
    # Steps are (name, estimator) tuples, matching how the pipeline was built.
    # NOTE(review): `sparse=` was renamed to `sparse_output=` in
    # scikit-learn 1.2; switch the keyword when upgrading.
    cat_steps.append(
        ('ohe', OneHotEncoder(sparse=False, handle_unknown='ignore'))
    )

pre_processing

FeatureUnion(transformer_list=[('num_pipe',
                                Pipeline(steps=[('drop_nums',
                                                 <__main__.columnDropperTransformer object at 0x7f6040e54610>),
                                                ('num_selector',
                                                 ColumnSelector(dtype='numerical')),
                                                ('num_imputer',
                                                 SimpleImputer(fill_value=0,
                                                               strategy='constant'))])),
                               ('cat_pipe',
                                Pipeline(steps=[('drop_cats',
                                                 <__main__.columnDropperTransformer object at 0x7f6040e18040>),
                                                ('cat_selector',
                                                 ColumnSelector(dtype='categorical')),
                                

### <a> Step 03 - Create pipeline with preprocessing + rf model</a>

In [16]:
from sklearn.ensemble import RandomForestClassifier

# Random forest classifier with 100 trees; the seed is fixed so results are
# repeatable, and n_jobs=-1 asks scikit-learn to use all available cores.
model = RandomForestClassifier(
    n_estimators=100,
    n_jobs=-1,
    random_state=42,
)

In [17]:
# Chain preprocessing and the classifier into a single estimator so that both
# are fitted together on whatever data the pipeline is given.
estimator = Pipeline(steps=[
    ('preproc', pre_processing),
    ('model', model),
])

estimator

Pipeline(steps=[('preproc',
                 FeatureUnion(transformer_list=[('num_pipe',
                                                 Pipeline(steps=[('drop_nums',
                                                                  <__main__.columnDropperTransformer object at 0x7f6040e54610>),
                                                                 ('num_selector',
                                                                  ColumnSelector(dtype='numerical')),
                                                                 ('num_imputer',
                                                                  SimpleImputer(fill_value=0,
                                                                                strategy='constant'))])),
                                                ('cat_pipe',
                                                 Pipeline(steps=[('drop_cats',
                                                                  <__main__.columnDropperTransform

### <a> Step 04 - Pass Titanic through pipeline</a>

In [18]:
from sklearn.model_selection import cross_val_score
from statistics import mean

# Cross-validate the full pipeline on the training split only, then average
# the per-fold accuracies into a single score.
accuracies_cv = cross_val_score(
    estimator, X=X_train, y=y_train, scoring='accuracy'
)
rf_accuracy = mean(accuracies_cv)

rf_accuracy

0.8039726181124452