# Creating A ML Workflow With Kaggle Titanic Competition 

---

In this project, our goal is to create a machine learning workflow. We're going to explore a workflow to make competing in the Kaggle Titanic competition easier, using a pipeline of functions to reduce the number of dimensions we need to focus on.

By defining a workflow, we can create a framework with which to make iterating on ideas quicker and easier, allowing us to work more efficiently.

In [176]:
import pandas as pd
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "last"

## Read in and explore the dataset

In [177]:
train = pd.read_csv('train.csv')
holdout = pd.read_csv('test.csv')

In [178]:
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [179]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [180]:
train.describe(include = 'all')

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
count,891.0,891.0,891.0,891,891,714.0,891.0,891.0,891,891.0,204,889
unique,,,,891,2,,,,681,,147,3
top,,,,"Radeff, Mr. Alexander",male,,,,CA. 2343,,C23 C25 C27,S
freq,,,,1,577,,,,7,,4,644
mean,446.0,0.383838,2.308642,,,29.699118,0.523008,0.381594,,32.204208,,
std,257.353842,0.486592,0.836071,,,14.526497,1.102743,0.806057,,49.693429,,
min,1.0,0.0,1.0,,,0.42,0.0,0.0,,0.0,,
25%,223.5,0.0,2.0,,,20.125,0.0,0.0,,7.9104,,
50%,446.0,0.0,3.0,,,28.0,0.0,0.0,,14.4542,,
75%,668.5,1.0,3.0,,,38.0,1.0,0.0,,31.0,,


### Load functions from previous build

In [181]:
# %load functions.py
def process_missing(df):
    """Handle various missing values from the data set

    Usage
    ------

    holdout = process_missing(holdout)
    """
    df["Fare"] = df["Fare"].fillna(train["Fare"].mean())
    df["Embarked"] = df["Embarked"].fillna("S")
    return df

def process_age(df):
    """Process the Age column into pre-defined 'bins' 

    Usage
    ------

    train = process_age(train)
    """
    df["Age"] = df["Age"].fillna(-0.5)
    cut_points = [-1,0,5,12,18,35,60,100]
    label_names = ["Missing","Infant","Child","Teenager","Young Adult","Adult","Senior"]
    df["Age_categories"] = pd.cut(df["Age"],cut_points,labels=label_names)
    return df

def process_fare(df):
    """Process the Fare column into pre-defined 'bins' 

    Usage
    ------

    train = process_fare(train)
    """
    cut_points = [-1,12,50,100,1000]
    label_names = ["0-12","12-50","50-100","100+"]
    df["Fare_categories"] = pd.cut(df["Fare"],cut_points,labels=label_names)
    return df

def process_cabin(df):
    """Process the Cabin column into pre-defined 'bins' 

    Usage
    ------

    train process_cabin(train)
    """
    df["Cabin_type"] = df["Cabin"].str[0]
    df["Cabin_type"] = df["Cabin_type"].fillna("Unknown")
    df = df.drop('Cabin',axis=1)
    return df

def process_titles(df):
    """Extract and categorize the title from the name column 

    Usage
    ------

    train = process_titles(train)
    """
    titles = {
        "Mr" :         "Mr",
        "Mme":         "Mrs",
        "Ms":          "Mrs",
        "Mrs" :        "Mrs",
        "Master" :     "Master",
        "Mlle":        "Miss",
        "Miss" :       "Miss",
        "Capt":        "Officer",
        "Col":         "Officer",
        "Major":       "Officer",
        "Dr":          "Officer",
        "Rev":         "Officer",
        "Jonkheer":    "Royalty",
        "Don":         "Royalty",
        "Sir" :        "Royalty",
        "Countess":    "Royalty",
        "Dona":        "Royalty",
        "Lady" :       "Royalty"
    }
    extracted_titles = df["Name"].str.extract(' ([A-Za-z]+)\.',expand=False)
    df["Title"] = extracted_titles.map(titles)
    return df

def create_dummies(df,column_name):
    """Create Dummy Columns (One Hot Encoding) from a single Column

    Usage
    ------

    train = create_dummies(train,"Age")
    """
    dummies = pd.get_dummies(df[column_name],prefix=column_name)
    df = pd.concat([df,dummies],axis=1)
    return df

In [182]:
# Create a fucntion to process the dataset
def data_process(df):
    df = process_missing(df)
    df = process_age(df)
    df = process_fare(df)
    df = process_titles(df)
    df = process_cabin(df)
    for col in ['Age_categories', 'Fare_categories', 'Title', 'Cabin_type', 'Sex']:
        df = create_dummies(df, col)        
    return df

In [183]:
# Apply data_process to out dataset
train = data_process(train)
holdout = data_process(holdout)

### Inspect & visualize some features
---
Inspect columns:
* `Sibsp` - # of siblings / spouses aboard the Titanic
* `Parch` - # of parents / children aboard the Titanic

In [184]:
import plotly.express as px
import plotly.graph_objects as go

# Visualize the distribution of Sibsp & Parch
fig = px.histogram(train, x=['SibSp', 'Parch'], 
                   marginal = "box", 
                   opacity = 0.7, 
                   color_discrete_sequence=['#eb9a75','#757feb']
                   
                   )

"""
Use print(fig) to check out the structure and configuration of fig 
""" 

# Add centered plot title & update xaxis label
fig.update_layout(
    title={
        'text': 'Stacked distribution: number of siblings/spouses or parents/children',
        'y':0.95,
        'x':0.5,
        'xanchor': 'center',
        'yanchor': 'top'}, 
    xaxis = {
        'title': {
            'text': 'number of siblings/spouses or parents/children'}}
)

# Update traces name for legend labels
fig.data[0].name = 'siblings/spouses'
fig.data[2].name = 'parents/children'

fig.show()

In [185]:
# Create pivot table for both 'SibSp' & 'Parch'
sibsp_pivot = train.pivot_table(values = 'Survived', index = 'SibSp')
parch_pivot = train.pivot_table(values = 'Survived', index = 'Parch')

# Create a new column with summation of 'SibSp' & 'Parch' and generate survival rate
train['family_size'] = train.SibSp + train.Parch
holdout['family_size'] = holdout.SibSp + holdout.Parch
family_pivot = train.pivot_table(values = 'Survived', index = 'family_size')

In [186]:
from plotly.subplots import make_subplots
# Create fig
fig = go.Figure()

# Add first trace
fig.add_trace(go.Bar(name = 'siblings/spouses',
                     x = sibsp_pivot.index,
                     y = sibsp_pivot.Survived,
                    text = round(sibsp_pivot.Survived,2),
                    textposition='auto',
                    marker_color = '#eb9a75',
                    opacity = 0.8),
)

# Add second trace
fig.add_trace(go.Bar(name = 'parents/children',
                     x = parch_pivot.index,
                    y = parch_pivot.Survived,
                    marker_color = '#757feb',
                    text = round(parch_pivot.Survived, 2),
                    textposition = 'auto',
                    opacity = 0.8
                    )              
        )

# Add third trace
fig.add_trace(go.Bar(name = 'combined family size',
                    x = family_pivot.index, 
                    y = family_pivot.Survived,
                    marker_color = '#9ad687',         
                    text = round(family_pivot.Survived, 2),
                    textposition = 'auto',
                    opacity = 0.8
                    ))

# Update layout
fig.update_layout(title = 'Survival Rate Based On Number Of Relations On The Ship',
                  width = 1000,
                  height = 500,
                  barmode = 'group',
                  bargap = 0.2, # gap between bars of adjacent location coordinates.
                  bargroupgap=0.1, # gap between bars of the same location coordinate.
                  xaxis = dict(title = 'number',
                               tickvals = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
                              ),
                  yaxis = dict(title = 'survival rate'),
                  legend = dict(title = 'Relation type', 
                                yanchor = 'top',
                               y = 0.9,
                               xanchor = 'right',
                               x = 0.9)
  ) 


**Observations:**
* Combined family size gives us some new data range that's not available in either `SibSp` column or `Parch` column.
* There is no linear correlation on survival rate between the three relation types.
* The data is right skewed. The higher survival rate window for all three types for relations is at number 1,2,3. Survival rate is at highest for siblings/spouses number at 1, for parents/children number at 3, for combined family size at 3.
* Only 30% of the passengers who had no family members on board survived.
---
Based of this, we can come up with an idea for a new feature - was the passenger alone. This will be a binary column containing the value:

* `1` if the passenger has zero family members on board
* `0` if the passenger has one or more family members on board

## Feature engineering

In [187]:
# Create a function to categorize passengers if they are alone on board or not
def is_alone(df):
    df['is_alone'] = 0
    df.loc[df.family_size==0, 'is_alone'] = 1
    df.loc[df.family_size!=0, 'is_alone'] = 0
    return df

In [188]:
# Create new column `is_alone` using is_alone function
train = is_alone(train)
holdout = is_alone(holdout)

## Feature preparation

Accepts a dataframe as input
Performs data preparation for machine learning
Uses recursive feature elimination and the random forests algorithm to find the best-performing set of features

In [248]:
# Create a function for data preparation
from sklearn.feature_selection import RFECV
from sklearn.ensemble import RandomForestClassifier
import numpy as np

def select_features(df):
    # Drop non-numerical columns
    df = df.select_dtypes(include = np.number)
    
    # Drop rows with null values
    df = df.dropna()
    
    # Create training data
    all_X = df.drop(['PassengerId', 'Survived'], axis = 1)
    all_y = df.Survived
    
    # Perform recursive feature elimination using RFECV
    clf = RandomForestClassifier(random_state = 1)
    selector = RFECV(clf, cv = 10)
    selector.fit(all_X, all_y)
    
    # Assign best features
    best_features = all_X.columns[selector.support_]
    print(best_features)
    
    return best_features

In [249]:
features = select_features(train)

Index(['Pclass', 'Age', 'SibSp', 'Parch', 'Fare', 'Age_categories_Missing',
       'Age_categories_Infant', 'Age_categories_Young Adult',
       'Age_categories_Adult', 'Fare_categories_0-12', 'Fare_categories_12-50',
       'Fare_categories_50-100', 'Title_Master', 'Title_Miss', 'Title_Mr',
       'Title_Mrs', 'Cabin_type_E', 'Cabin_type_Unknown', 'Sex_female',
       'Sex_male', 'family_size'],
      dtype='object')


## Model selection and tuning

In [252]:
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression

# Create a function to select the best performing model
def select_model(df, features):
    # Create training data
    all_X = train[features]
    all_y = train.Survived
    
    # Create models to tryout and tune
    models = [
        {
            'name':'LogisticRegression',
            'estimator':LogisticRegression(),
            'hyperparameters':{
                'solver': ['newton-cg', 'lbfgs', 'liblinear']
            }
        },
        
        {'name':'KNeighborsClassifier',
         'estimator':KNeighborsClassifier(),
         'hyperparameters':{
             'n_neighbors': range(1,20,2),
             'weights': ['distance', 'uniform'],
             'algorithm': ['ball_tree', 'kd_tree', 'brute'],
             'p': [1,2]
         }
            
        },
        
        {
            'name':'RandomForestClassifier',
            'estimator':RandomForestClassifier(),
            'hyperparameters':{
                'n_estimators': [4, 6, 9],
                'criterion': ['entropy', 'gini'],
                'max_depth': [2, 5, 10],
                'max_features': ['log2', 'sqrt'],
                'min_samples_leaf': [1, 5, 8],
                'min_samples_split': [2, 3, 5]
            }
        }
    ]
    
    # Loop through the models to perform grid search
    for model in models:
        print(model['name'])
        
        grid = GridSearchCV(estimator = model['estimator'], 
                            param_grid = model['hyperparameters'],
                            cv = 10)
        
        # Grid search
        grid.fit(all_X, all_y)
        
        # Assign best results to the dictionary
        model['best_params'] = grid.best_params_ 
        model['best_score'] = grid.best_score_
        model['best_estimator'] = grid.best_estimator_
        
        print(grid.best_params_)
        print(grid.best_score_)
        
    return models

In [253]:
best_performances = select_model(train, features)

LogisticRegression



lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to th

{'solver': 'newton-cg'}
0.821598002496879
KNeighborsClassifier
{'algorithm': 'ball_tree', 'n_neighbors': 7, 'p': 1, 'weights': 'uniform'}
0.7822971285892635
RandomForestClassifier
{'criterion': 'gini', 'max_depth': 10, 'max_features': 'log2', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 6}
0.8384394506866417


In [254]:
best_performances

[{'name': 'LogisticRegression',
  'estimator': LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                     intercept_scaling=1, l1_ratio=None, max_iter=100,
                     multi_class='auto', n_jobs=None, penalty='l2',
                     random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                     warm_start=False),
  'hyperparameters': {'solver': ['newton-cg', 'lbfgs', 'liblinear']},
  'best_params': {'solver': 'newton-cg'},
  'best_score': 0.821598002496879,
  'best_estimator': LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                     intercept_scaling=1, l1_ratio=None, max_iter=100,
                     multi_class='auto', n_jobs=None, penalty='l2',
                     random_state=None, solver='newton-cg', tol=0.0001, verbose=0,
                     warm_start=False)},
 {'name': 'KNeighborsClassifier',
  'estimator': KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='m

## Prep submission file to Kaggle

In [255]:
# Create a function to prep submission file to Kaggle
def save_submission_file(model, features, filename = 'result'):
    
    # Make a prediction
    predictions = model.predict(holdout[features])
    
    # Format final result based on Kaggle rules
    result = pd.DataFrame(data ={'PassengerId':holdout.PassengerId,
                          'Survived':predictions})
    result.to_csv(filename+'.csv', index = False)

In [256]:
save_submission_file(best_performances[2]['best_estimator'], features, 'titanic_survival_predictions')