In [None]:
import re
import sys
import phik
import sklearn
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn import set_config
import matplotlib.pyplot as plt
from sklearn.tree import plot_tree
from sklearn.cluster import AgglomerativeClustering

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.model_selection import train_test_split
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.model_selection import KFold
from sklearn.feature_selection import RFECV
from sklearn.model_selection import RandomizedSearchCV
from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer

In [None]:
# Require Python 3.7.3 or newer.
assert sys.version_info >= (3, 7, 3)
# Require exactly this scikit-learn release.
# NOTE(review): presumably pinned because later cells rely on version-specific
# APIs — confirm before relaxing the check.
assert sklearn.__version__ == '0.23.2'

In [None]:
# Plot defaults: apply the seaborn theme first, then set the default figure size
sns.set_theme(style="whitegrid")
plt.rcParams.update({"figure.figsize": (12, 8)})

# Render scikit-learn estimators as diagrams in the notebook
set_config(display='diagram')

In [None]:
# Define paths to datasets and target/feature information
trainPath = 'train.csv'   # training data (read with the target column present)
testPath = 'test.csv'     # data to predict on
target = 'Survived'       # name of the target column
index = 'PassengerId'     # column used as the DataFrame index when reading the CSVs

In [None]:
# Explicitly define the dtype of every column read from the CSV files
dtypes = {
    'Survived': bool,
    'Pclass': int,
    'Name': 'category',
    'Sex': 'category',
    'Age': float,
    'SibSp': int,
    'Parch': int,
    'Ticket': 'category',
    'Fare': float,
    'Cabin': 'category',
    'Embarked': 'category',
}

## Exploratory Data Analysis
 - Here we explore the training dataset and relationships between features.
 - We also explore possible new features to engineer.
 - Test dataset is ignored.

In [None]:
# Read the training CSV, using `index` as the row index and the explicit `dtypes` mapping
train = pd.read_csv(trainPath, index_col=index, dtype=dtypes)
# Preview the first rows
train.head()

### View missing data
 - We will drop Cabin due to its high proportion of missing values.

In [None]:
def showMissing(df):
    """ Summarise the columns of ``df`` that contain missing values.

    Returns a DataFrame indexed by column name with the number of
    missing entries ('TotalNA') and their share of all rows ('PropNA'),
    ordered from most to least missing. Columns without missing
    values are omitted.
    """
    naCounts = df.isnull().sum().sort_values(ascending=False)
    # Keep only the columns that actually have gaps
    naCounts = naCounts[naCounts > 0]
    summary = naCounts.to_frame(name='TotalNA')
    return summary.assign(PropNA=summary['TotalNA'] / len(df))

In [None]:
# Missing-value counts and proportions for the training data
print(showMissing(train))

In [None]:
# Discard the Cabin column (mostly missing)
train = train.drop(columns='Cabin')

## Initial feature engineering
 - Prior to visualising the data there are some additional features we can extract based on our knowledge of the data.
   - **Family Size** (from Parch and SibSp)
     - The sum of Parch and SibSp + 1 gives total family size.
   - **Surname** (from Name)
     - Indicates family grouping.
   - **Title** (from Name)
     - Contains information on age, sex, social status.
   - **Woman or Child** (from Title and Sex)
     - We can engineer this group without using Age since boys have the title 'Master'.

In [None]:
# Family size: parents/children + siblings/spouses + the passenger themselves
train['FamSize'] = train['Parch'] + train['SibSp'] + 1
# Surname is the text before the first comma of Name
train['Surname'] = train['Name'].apply(lambda x: re.split(',', x)[0].strip())
# Title is the text between the first comma and the next comma or period.
# Raw string: '\.' in a normal string literal is an invalid escape sequence.
train['Title'] = train['Name'].apply(lambda x: re.split(r',|\.', x)[1].strip())
# Females and boys (title 'Master') form a single group
train['womanOrChild'] = (train['Sex'] == 'female') | (train['Title'] == 'Master')

In [None]:

# Check frequency of each title - 4 titles dominate so we will group all others together for now.
train['Title'].value_counts()

In [None]:
# Titles kept as-is; every other title is collapsed into 'Other' to reduce cardinality
standardTitles = ['Mr', 'Miss', 'Mrs', 'Master']
# Boolean mask of rows whose title is NOT one of the standard ones
otherTitles = ~train['Title'].isin(standardTitles)
train.loc[otherTitles, 'Title'] = 'Other'

### Family Survival
 - Here we create 2 boolean features to quantify family survival for a given surname.
 - Since most males died, if an adult male of the family survived then the rest of his family may have survived too.
 - Since most women and children survived, if one of them died then the rest of their family may have died also.
  - `famSurvive`: an adult male of the family survived.
  - `famDie`: a female or boy (title = 'Master') of the family died.

In [None]:
# Flag every passenger whose surname matches an adult male who
# travelled with family and survived
isAdultMale = (train['Sex'] == 'male') & (train['Title'] != 'Master')
maleSurviveWithFam = (
    isAdultMale & (train['Survived'] == 1) & (train['FamSize'] > 1))
maleNames = train.loc[maleSurviveWithFam, 'Surname']
train['famSurvive'] = train['Surname'].isin(maleNames)

In [None]:
# Flag every passenger whose surname matches a woman or boy who
# travelled with family and died
isWomanOrBoy = (train['Sex'] == 'female') | (train['Title'] == 'Master')
womenChildDieWithFam = (
    isWomanOrBoy & (train['Survived'] == 0) & (train['FamSize'] > 1))
womenChildNames = train.loc[womenChildDieWithFam, 'Surname']
train['famDie'] = train['Surname'].isin(womenChildNames)

### Check cardinality of non-numeric groups
  - High cardinality groups will be dropped for Phik analysis.

In [None]:
def checkCardinality(df):
    """ Count the distinct values of every non-numeric column.

    Returns a Series indexed by column name, ordered from the
    highest to the lowest number of unique values.
    """
    nonNumeric = df.select_dtypes(exclude='number')
    uniqueCounts = nonNumeric.apply(pd.Series.nunique)
    return uniqueCounts.sort_values(ascending=False)

In [None]:
# Number of unique values per non-numeric column of the training data
print(checkCardinality(train))

In [None]:
def plotMatrix(df, sortOn=None, mask=None):
    """ Plot a square association matrix (e.g. a phik matrix) as a heatmap.

    Parameters
    ----------
    df : DataFrame
        Square matrix whose index and columns carry the same labels.
    sortOn : label, optional
        If given, rows and columns are reordered by descending value
        of this column before plotting.
    mask : DataFrame or array-like of bool, optional
        Cells that are True are not drawn. It is reordered together
        with ``df`` so that it keeps masking the same cells.

    Returns
    -------
    (fig, ax) : the matplotlib Figure and Axes holding the heatmap.
    """
    fig, ax = plt.subplots()
    if sortOn is not None:
        # Labels ordered by their association with the sortOn column
        order = df[sortOn].sort_values(ascending=False).index
        if mask is not None:
            if isinstance(mask, pd.DataFrame):
                mask = mask.reindex(index=order, columns=order)
            else:
                # Positional masks follow the same permutation as the data
                rowPos = df.index.get_indexer(order)
                colPos = df.columns.get_indexer(order)
                mask = np.asarray(mask)[np.ix_(rowPos, colPos)]
        # Reorder rows and columns of the matrix that is actually plotted
        df = df.reindex(index=order, columns=order)
    # Plot heatmap; masked cells show the grey background
    heatmap = sns.heatmap(df, cmap='Reds', vmin=0, vmax=1, mask=mask, ax=ax)
    heatmap.set_facecolor('grey')
    return fig, ax

In [None]:
# Continuous columns (binned by phik) and high-cardinality columns to exclude
intervalCols = ['Age', 'Fare']
dropCols = ['Name', 'Ticket', 'Surname']
phikInput = train.drop(dropCols, axis=1)
# Pairwise phik association and its significance
phikMatrix = phikInput.phik_matrix(interval_cols=intervalCols)
sigMatrix = phikInput.significance_matrix(interval_cols=intervalCols)

In [None]:
# Heatmap of the phik matrix; cells equal to 1 are masked out
fig, ax = plotMatrix(phikMatrix, sortOn=target, mask=(phikMatrix == 1))

In [None]:
# Turn association into a distance (1 - phik) between features, target excluded
phikNoTarget = 1 - phikMatrix.drop(target, axis=0).drop(target, axis=1)
phikClusterer = AgglomerativeClustering(
    affinity='precomputed', linkage='average',
    n_clusters=None, distance_threshold=0.25)

# Associate cluster labels with feature names
labelledClusters = (pd.DataFrame(
    {'cluster': phikClusterer.fit(phikNoTarget).labels_,
     'feature': phikNoTarget.index}))

# Add association with target and sort
labelledClusters = (
    pd.merge(phikMatrix[target].drop(target), labelledClusters,
             left_index=True, right_on='feature')
    .sort_values(target, ascending=False))

# Group by cluster and print clustered features.
# Features are ordered within a cluster according to their association with
# the target, and clusters are ordered by their mean association with the target.
clusterFeatures = labelledClusters.groupby('cluster')
clusterFeatures = (
    pd.concat([clusterFeatures['feature'].apply(list),
               clusterFeatures[target].mean()], axis=1)
    # Sort on `target` rather than a hard-coded column name
    .sort_values(target, ascending=False))
print(clusterFeatures)

### Building a model
 - Here we build a custom transformer incorporating all of the preprocessing steps.
  - Using a custom transformer within a pipeline helps prevent data leakage and makes it easy to run the pipeline on test datasets.

In [None]:
class GroupImputer(BaseEstimator, TransformerMixin):
    """ Impute a single column, optionally by group, and return a pandas Series.

    Parameters
    ----------
    variable : str
        Column whose missing values are filled.
    by : list of str, optional
        Grouping columns, most significant first. Values are first
        looked up for the full grouping, then for progressively shorter
        prefixes of ``by``, and finally the ungrouped statistic is used.
        ``None`` (default) means no grouping.
    strategy : str or callable
        Aggregation passed to pandas ``agg`` (e.g. 'median'). The value
        'most_frequent' picks one of the modes at random.
    """

    def __init__(self, variable, by=None, strategy='median'):
        self.variable = variable
        # None instead of a mutable [] default, which would be shared by all instances
        self.by = by
        if strategy == 'most_frequent':
            self.strategy = lambda x: x.mode().sample(1).values[0]
        else:
            self.strategy = strategy
        self.maps = []

    def fit(self, X, y=None):
        """ Learn the ungrouped and per-group fill values from X. """
        if self.by is None:
            by = []
        elif isinstance(self.by, str):
            by = [self.by]
        else:
            by = list(self.by)
        # Store impute for ungrouped data
        self.simpleImpute = X[self.variable].agg(self.strategy)
        # Rebuild the maps on every fit so that refitting does not keep
        # (and give precedence to) the maps learned from earlier data
        maps = []
        # Store maps for all grouping levels, most specific first
        for i in range(len(by), 0, -1):
            subBy = by[:i]
            mapper = X.groupby(subBy)[self.variable].agg(self.strategy)
            if i == 1:
                # Single-column groups have scalar keys; wrap them in 1-tuples
                # so they match the row tuples built in transform()
                mapper = {(k,): v for k, v in mapper.to_dict().items()}
            maps.append((subBy, mapper))
        self.maps = maps
        return self

    def transform(self, X, y=None):
        """ Return X[variable] with missing values filled. """
        imputed = X[self.variable]
        for (by, mapper) in self.maps:
            # One tuple of group keys per row, mapped to the group's fill value
            fillVals = X[by].apply(tuple, axis=1).map(mapper)
            imputed = imputed.fillna(fillVals)
            if not imputed.isnull().values.any():
                break
        else:
            # Loop finished without break: some values may still be missing,
            # so replace remaining NaN with the ungrouped statistic
            imputed = imputed.fillna(self.simpleImpute)
        return imputed

### NoTransformer
 - In this analysis we make use of the ColumnTransformer to explicitly define a pre-processing step for each feature.
 - Any features not included in the ColumnTransformer are dropped.
 - This transformer allows us to pass the data through the ColumnTransformer and retain features that do not need pre-processing.

In [None]:
class NoTransformer(BaseEstimator, TransformerMixin):
    """ Identity transformer: passes its input through unchanged. """

    def fit(self, X, y=None):
        """ Nothing to learn; returns the transformer itself. """
        return self

    def transform(self, X, y=None):
        """ Return X exactly as received. """
        unchanged = X
        return unchanged

In [None]:
class FeatureEngineer(BaseEstimator, TransformerMixin):
    """ Custom imputation and feature engineering
        of Titanic dataset.

    fit() learns, from X and y, the surnames used for the family
    survival flags and the grouped Age imputer; transform() returns a
    copy of X with the engineered columns added and Age imputed.
    """

    def __init__(self):
        self._imputes = {}


    def initData(self, X):
        """ Feature engineering required for both fit & transform.
            Adds the columns to X in place. """
        X['Title'] = X['Name'].apply(self.getTitle)
        X['womanOrChild'] = (X['Sex'] == 'female') | (X['Title'] == 'Master')
        X['Surname'] = X['Name'].apply(self.getSurname)
        X['FamSize'] = X['Parch'] + X['SibSp'] + 1
        X['ageGroup'] = X.apply(self.estimateAgeGroup, axis=1)


    def fit(self, X, y=None):
        """ Learn family-survival surnames and Age imputation values. """
        X = X.copy()
        self.initData(X)
        # Store surnames of all adult males (with families) that survived
        maleSurviveWithFam = (
            (X['Sex'] == 'male') & (X['Title'] != 'Master') &
            (y == 1) & (X['FamSize'] > 1))
        self.maleNames = X.loc[maleSurviveWithFam, 'Surname']
        # Store surnames of all women and boys (with families) that died
        femaleDieWithFam = (
            ((X['Sex'] == 'female') | (X['Title'] == 'Master')) &
            (y == 0) & (X['FamSize'] > 1))
        self.femaleNames = X.loc[femaleDieWithFam, 'Surname']
        # Median Age per (ageGroup, Pclass), falling back to ageGroup, then overall
        self._imputes['Age'] = GroupImputer(
            'Age', by=['ageGroup', 'Pclass'], strategy='median').fit(X)
        return self


    def transform(self, X, y=None):
        """ Return a copy of X with engineered features and imputed Age. """
        X = X.copy()
        self.initData(X)
        X['Age'] = self._imputes['Age'].transform(X)
        # NOTE(review): when transform() is applied to the rows used in fit(),
        # a passenger's own outcome contributes to these flags — confirm this
        # is acceptable for model evaluation.
        X['famSurvive'] = X['Surname'].isin(self.maleNames)
        X['famDie'] = X['Surname'].isin(self.femaleNames)
        return X


    def estimateAgeGroup(self, X):
        """ Estimate age/sex group by title for age imputation.
            X is a single row. """
        # Assume unmarried with parents is a girl
        if X['Title'] == 'Miss' and X['Parch'] > 0:
            return 'girl'
        elif X['Title'] == 'Master':
            return 'boy'
        elif X['Sex'] == 'male':
            return 'man'
        else:
            return 'woman'


    def getTitle(self, x):
        """ Extract title from name (text between the first comma
            and the next comma or period). """
        # Raw string: '\.' is an invalid escape sequence in a normal literal
        return re.split(r',|\.', x)[1].strip()


    def getSurname(self, x):
        """ Extract surname from name (text before the first comma). """
        return re.split(',', x)[0].strip()

### Read training data
 - Separate feature matrix and target vector.
 - Split into training and validation datasets.
   - Since train_test_split returns a view we run the result through map() to yield a true copy of each.
 

In [None]:
# Feature matrix and target vector
X = pd.read_csv(trainPath, index_col=index, dtype=dtypes)
y = X.pop(target)

# 80/20 train/validation split; copy each part so later edits do not touch X or y
split = train_test_split(X, y, random_state=0, train_size=0.8, test_size=0.2)
X_train, X_valid, y_train, y_valid = (part.copy() for part in split)

In [None]:
# Embarked: fill missing values with the constant 'Unknown', then one-hot encode
# against the fixed categories C/Q/S (values outside that list are ignored)
EmbarkedTransformer = Pipeline(steps=[
    ('impute', SimpleImputer(strategy='constant', fill_value='Unknown')),
    ('encode', OneHotEncoder(
        handle_unknown='ignore', categories=[['C', 'Q', 'S']]))
])
# Age: k-means binning with ordinal bin codes
AgeDiscretizer = Pipeline(steps=[
    ('discrete', KBinsDiscretizer(encode='ordinal', strategy='kmeans'))
])

In [None]:
# One (name, transformer, columns) entry per feature group fed to the model
transformers = ([
    ('Pclass',   OneHotEncoder(handle_unknown='ignore', categories=[[1, 2, 3]]),  ['Pclass']),
    ('Embarked', EmbarkedTransformer, ['Embarked']),
    ('Age',      AgeDiscretizer, ['Age']),
    ('None',     NoTransformer(), ['famDie', 'famSurvive', 'womanOrChild', 'FamSize']),
])
# Columns not listed above are dropped
featureTransformer = ColumnTransformer(transformers=transformers, remainder='drop')

In [None]:
# Define a preModel pipeline distinct from modelling step:
# feature engineering first, then the per-column transforms
preProcessor = Pipeline(steps=[
    ('engineer',        FeatureEngineer()),
    ('columnTransform', featureTransformer),
])

### Assess feature importance
 - This function takes a preProcessor pipeline and a tree estimator to assess feature importance 

In [None]:
def plotFeatureImportance(X, y, prePreprocessor, estimator, vline=None):
    """ Fit a pre-processing pipeline followed by a tree-based estimator
        and draw a bar chart of the estimator's feature importances.

    prePreprocessor must be a Pipeline containing a step named
    'columnTransform'. If vline is given, a vertical line is drawn
    at that importance value.
    """
    clf = Pipeline(steps=[
        ('prePreprocessor', prePreprocessor),
        ('selector',        estimator)]).fit(X, y)
    fittedPre = clf.named_steps['prePreprocessor']
    columnTransformer = fittedPre.named_steps['columnTransform']
    # Use the pre-processor's own 'selector' step when it has one
    try:
        selector = fittedPre.named_steps['selector']
    except KeyError:
        selector = None
    featureNames = getFeatureNames(columnTransformer, selector)
    importances = clf.named_steps['selector'].feature_importances_
    features = pd.DataFrame(
        {'feature': featureNames, 'importance': importances})
    features = features.sort_values(by=['importance'], ascending=False)

    print(f'Total unfiltered features: {len(featureNames)}')
    fig, ax = plt.subplots()
    sns.barplot(y='feature', x='importance', data=features, ax=ax)
    if vline is not None:
        ax.axvline(vline)
    ax.set_ylabel('')
    ax.set_xlabel('Feature importance')
    fig.tight_layout()

In [None]:
def getFeatureNames(columnTransformer, selector=None):
    """ Extract output feature names from a fitted ColumnTransformer.

    If a transformer is a Pipeline, its last step is used to derive
    the names, so a one-hot encoding step should be the last step of
    that pipeline.

    Parameters
    ----------
    columnTransformer : fitted ColumnTransformer (``transformers_`` is read).
    selector : optional object with ``get_support()``; when given, only
        the names it marks as selected are returned.

    Returns
    -------
    numpy array of feature names, in transformer order.
    """
    colNames = np.array([])
    for stepName, fitted, columns in columnTransformer.transformers_:
        # Skip the remainder entry (present only when columns are left over)
        # and dropped transformers, instead of assuming the last entry is
        # always the remainder
        if stepName == 'remainder' or (isinstance(fitted, str) and fitted == 'drop'):
            continue
        columns = [columns] if isinstance(columns, str) else list(columns)
        if isinstance(fitted, Pipeline):
            transformer = fitted.steps[-1][1]
        else:
            transformer = fitted
        try:
            # One hot encoded names have x0_, x1_ etc.
            encodedNames = transformer.get_feature_names()
        except AttributeError:
            # No encoder names available: the input column names are kept
            names = columns
        else:
            names = []
            for encodedName in encodedNames:
                # Swap the positional 'x<i>' prefix for the true column name;
                # splitting on the first '_' also handles indices >= 10
                prefix, sep, category = encodedName.partition('_')
                position = prefix[1:]
                if (sep and prefix[:1] == 'x' and position.isdigit()
                        and int(position) < len(columns)):
                    names.append(f'{columns[int(position)]}_{category}')
                else:
                    names.append(encodedName)
        # Non-ordinal KBinsDiscretizer emits one output per bin of every column
        if (isinstance(transformer, KBinsDiscretizer)
                and transformer.encode != 'ordinal'):
            names = [f'{col}-{i}'
                     for col, n in zip(names, transformer.n_bins_)
                     for i in range(n)]
        colNames = np.append(colNames, names)
    if selector is not None:
        colNames = colNames[selector.get_support()]
    return colNames

In [None]:
# Note the plotFeatureImportance function assumes the columnTransformer is named 'columnTransform'
# Future update could loop through steps and find the ColumnTransformer object
selectEstimator = RandomForestClassifier(random_state=1, n_estimators=500, max_features='sqrt')
# 0.015 is drawn as a vertical reference line on the importance plot
plotFeatureImportance(X_train, y_train, preProcessor, selectEstimator, 0.015)

### Perform recursive feature elimination
  - Combine the preProcessor pipeline with feature selector.
  - Run feature selection and identify selected features.

In [None]:
# Configure the cross-validation procedure: 5 shuffled folds with a fixed seed
cv = KFold(n_splits=5, shuffle=True, random_state=1)
# Number of parallel jobs passed to the parameter searches
nJobs = 4

In [None]:
# Pre-processing followed by recursive feature elimination, scored by
# cross-validated accuracy using selectEstimator
featureSelector = Pipeline(steps=[
    ('preProcess',    preProcessor),
    ('selector',      RFECV(selectEstimator, cv=cv, scoring='accuracy')),
])
# Fit data to pipeline
featureSelector.fit(X_train, y_train)

#### View transformed data
 - Here we view our transformed and feature filtered dataset as a sanity check prior to running the model.
 - The custom function 'getFeatureNames' processes a columnTransformer object to extract the original feature names.
  - This allows us to view the processed numpy matrix as a labelled dataframe.
  - Ref: https://github.com/scikit-learn/scikit-learn/issues/12525 

In [None]:
# Extract columnTransformer and selector to extract feature names
columnTransformer = featureSelector.named_steps['preProcess'].named_steps['columnTransform']
selector = featureSelector.named_steps['selector']
# Boolean mask of the features kept by RFECV (reused later by FeatureFilter)
selectedFeatures = selector.get_support()
featureNames = getFeatureNames(columnTransformer, selector)
# Create dataframe of transformed data
transformedDF = pd.DataFrame(
    featureSelector.transform(X), 
    columns=featureNames)

# Symmetric difference between 'with' and 'without selection'
allFeatures = set(getFeatureNames(columnTransformer))

eliminatedFeatures = allFeatures ^ set(featureNames)
print(f'Eliminated features: {eliminatedFeatures}')

transformedDF.head()

### Estimator hypertuning
  - Build a fresh pipeline separate from the feature selector.
  - The custom FeatureFilter() transformer filters unselected features.
  - We do this because we don't want to perform feature selection within a parameter search.

#### FeatureFilter
 - In the previous step we created a separate pipeline to perform pre-processing and recursive feature elimination to determine the optimal features to pass to the model.
   - This pipeline returns a boolean array of features selected.
 - The FeatureFilter transformer is included in the main pipeline to filter features before passing to the model.

In [None]:
class FeatureFilter(BaseEstimator, TransformerMixin):
    """ Keep only the columns of a 2-D array selected by ``columns``
        (e.g. a boolean mask). """

    def __init__(self, columns):
        self.columns = columns

    def fit(self, X, y=None):
        """ Stateless: nothing is learned from the data. """
        return self

    def transform(self, X, y=None):
        """ Return all rows of X restricted to the selected columns. """
        keep = self.columns
        return X[:, keep]

In [None]:
# Pre-process, keep only the features selected by RFECV, then classify
fullModel = Pipeline(steps=[
    ('preProcess',    preProcessor),
    ('featureFilter', FeatureFilter(selectedFeatures)),
    ('model',         RandomForestClassifier(random_state=1)),
])

In [None]:
# Search space: number of Age bins plus random-forest hyperparameters
params = ({
    'preProcess__columnTransform__Age__discrete__n_bins':  range(2, 5),
    'model__n_estimators':      range(100, 1000, 10),
    'model__max_depth':         range(1, 20),
    'model__criterion':         ['gini', 'entropy'],
    # Upper bound is the number of features left after selection
    'model__max_features':      range(1, len(featureNames) + 1),
})

# 100 random draws, scored by cross-validated accuracy on the training split
gridSearch = RandomizedSearchCV(
    fullModel, params, scoring='accuracy', random_state=1,
    cv=cv, refit=True, n_jobs=nJobs, n_iter=100, verbose=1)
gridSearch.fit(X_train, y_train)

# Score of the refitted best estimator on the held-out validation split
score = gridSearch.score(X_valid, y_valid)
print(f'Best score: {score:.3f}')

In [None]:
# Predict on the test set with the refitted best estimator
X_test = pd.read_csv(testPath, index_col=index, dtype=dtypes)
currentPredict = gridSearch.predict(X_test)

# Reference labels to compare against
# NOTE(review): assumes the rows of submissionTrue.csv are in the same order as test.csv — confirm
correct = pd.read_csv('submissionTrue.csv')['Survived']
# NOTE(review): myBest is not used anywhere in the visible cells
myBest = pd.read_csv('submissionBest.csv')['Survived']
# Fraction of test predictions that match the reference labels
actualScore = (correct == currentPredict).sum() / len(correct)
print(actualScore)

### BayesSearchCV
  - Note: current version of skopt (0.8.1) not compatible with scikit-learn 0.24.1

In [None]:
# skopt search space over the random-forest hyperparameters
params = ({
    'model__n_estimators':      Integer(100, 1000),
    'model__max_depth':         Integer(3, 20),
    'model__criterion':         Categorical(['gini', 'entropy']),
    'model__max_features':      Integer(1, len(featureNames)),
})

# Note: this rebinds gridSearch, replacing the RandomizedSearchCV result
gridSearch = BayesSearchCV(
    fullModel, params, scoring='accuracy', random_state=1,
    cv=cv, refit=True, n_jobs=nJobs, n_iter=50)
# NOTE(review): fitted on the full X, y, whereas the randomized search used
# X_train, y_train — confirm this is intended
gridSearch.fit(X, y)