Progress list:

- ~~Develop basic pipeline that can process data~~
    - ~~NaN handling for categorical and numerical columns of DataFrame~~
    - ~~Model building - simple, defined RandomForestRegression to start with~~
- Enhance pipeline with cross-validation/gridsearch
- Improve feature engineering using insights
    - Handling of Ticket data (Ticket_num, Ticket_pre)
    - Name splitting (Title, First_name, Surname, Other_names, Maiden_names)
    - 'Is_alone' column, for Parch & SibSp == 0
    - Familial_rel column, try to work out role within family (e.g. Father, Mother, Grandfather, etc.)
    - Age inference (How best to do this?)
        - First step: view known age distributions for each title (see Explore_data) 
        - For 'Master' title passengers, use median of other 'Master's
        - May be able to simply use median ages for groups within Familial_rel column, if not, split data by Pclass and then do so
        - Split data by Pclass, 
            - then separate out Is_alone and allocate median to these passengers  
- Implement tests to check pipeline handling is going as expected
    - Test whether the output model is better than the baseline: doing nothing to the data (other than removing NaN values)
    - Test whether the data in the transformed columns is in the expected format

[Custom pipeline transformations](https://towardsdatascience.com/custom-transformers-and-ml-data-pipelines-with-python-20ea2a7adb65)

- Idea is to create a custom transformer class that handles the numerical or categorical columns named within in it using a specified method
- 'where will I find these base classes that come with most of the methods I need to write my transformer class on top of? Fret not. Scikit-Learn provides us with two great base classes, [TransformerMixin](https://scikit-learn.org/stable/modules/generated/sklearn.base.TransformerMixin.html) and [BaseEstimator](https://scikit-learn.org/stable/modules/generated/sklearn.base.BaseEstimator.html). Inheriting from TransformerMixin ensures that all we need to do is write our fit and transform methods and we get fit_transform for free.'

For my use case:

In [327]:
import re
import numpy as np 
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import FeatureUnion, Pipeline 

#Custom Transformer that extracts columns passed as argument to its constructor 
class FeatureSelector(BaseEstimator, TransformerMixin):
    #Class Constructor 
    def __init__(self, feature_names):
        self._feature_names = feature_names 
    
    #Return self nothing else to do here    
    def fit( self, X, y = None):
        return self 
    
    #Method that describes what we need this transformer to do
    def transform(self, X, y = None):
        return X[self._feature_names] 
    
# Custom transformer that:
# Extracts ticket_num from the Ticket column
# ~~Create a numerical column for 'shared_exact_ticket' if ticket_num is not unique
# ~~Create a numerical column for 'shared_adjacent_ticket' if ticket_num +/- 1 is not unique
# splits 'Name' column into Title, First_name, Surname, Maiden_first_name and Maiden_surname
# Creates a 'num_cabins' column by counting the number of spaces in the 'Cabin' column, Nan => 0
# ~~Create a 'Maiden_fam_aboard' column if Maiden_surname matches any instance in Surname column

## ~~ issues: not working with whole dataset, so will likely skew results! e.g. if shared ticket is in another part of the data
    
class CategoricalTransformer(BaseEstimator, TransformerMixin):
    # Return self nothing else to do here
    def __init__(self):
        pass
        
    # Return self nothing else to do here
    def fit(self, X, y = None):
        return self

    # Helper function to extract number of cabins (if NaN, 0)
    def get_num_cabins(self, obj):
        try:
            return str(obj).count(' ') + 1
        except:
            return 0
       
    # Helper function to extract surname from 'Name' column
    def get_surname(self, obj):
        Result = str(obj).split(sep = ", ")
        if Result:
            return Result[0]
        
    # Helper function to extract title from 'Name' column
    def get_title(self, obj):
        Result = str(obj).split(sep = ", ")[1].split(sep = ". ") # or re.search(r"(?<=, ).+?(?=\. )", str(obj)).group(1)
        if Result:
            return Result[0]
        
#     # Helper function to extract first name from 'Name' column
#     def get_first_name(self, obj):
#         Result = re.search(r"(?<=\. ).+?(?= )", str(obj))
#         if Result:
#             return Result[0]
#         else:
#             return 'None'
    
    # Helper function to extract maiden surname name from 'Name' column
    def get_maiden_surname(self, obj):
        Result = re.search(r"(?<= )[a-zA-Z]+?(?=\))", str(obj))
        if Result:
            return Result[0]
        else:
            return 'None'
    
#     # Helper function to extract maiden surname name from 'Name' column
#     def get_maiden_first_name(self, obj):
#         Result = re.search(r"(?<=\()[a-zA-Z]+?(?= )", str(obj))
#         if Result:
#             return Result[0]
#         else:
#             return 'None'
    
    # Helper function that gets the ticket number from 'Ticket' column
    def get_ticket_num(self, obj):
        Result = re.search(r"(?=(?:\D*\d))([a-zA-Z0-9]*$)", str(obj))
        if Result:
            return Result[0]
        else:
            return 0
        
    #Transformer method we wrote for this transformer 
    def transform(self, X , y = None):
        #Depending on constructor argument add num_cabins
        #using the helper functions written above 
        X.loc[:, 'Num_cabins'] = X['Cabin'].apply(self.get_num_cabins) 
        
        X.loc[:, 'Ticket_num'] = X['Ticket'].apply(self.get_ticket_num).astype(int) 
                
        X.loc[:, 'Title']             = X['Name'].apply(self.get_title)
#         X.loc[:, 'First_name']        = X['Name'].apply(self.get_first_name)
        X.loc[:, 'Surname']           = X['Name'].apply(self.get_surname)
#         X.loc[:, 'Maiden_first_name'] = X['Name'].apply(self.get_maiden_first_name)
        X.loc[:, 'Maiden_surname']    = X['Name'].apply(self.get_maiden_surname)
        
        X['Pclass'] = X['Pclass'].astype('category')
        
        #Drop unnecessary Name column 
        X = X.drop('Name', axis = 1 )
        X = X.drop('Cabin', axis = 1 )
#         X = X.drop('Ticket', axis = 1)
            
        #returns numpy array
        return X.values 

# Custom transformer to engineer features (create an 'Is_alone'
# column if SibSp & Parch are both 0)
class NumericalTransformer(BaseEstimator, TransformerMixin):
    #Class Constructor
    def __init__(self):
        pass
        
    #Return self, nothing else to do here
    def fit(self, X, y = None):
        return self 
    
    #Custom transform method we wrote that creates aformentioned features and drops redundant ones 
    def transform(self, X, y = None):
        X.loc[:,'Is_alone'] = np.where((X['SibSp'] == 0) & (X['Parch'] == 0), 1, 0).astype('bool')
        return X.values
            
# Cardinality: number of unique values in a column
# Select categorical columns with relatively low cardinality (convenient but arbitrary)
# categorical_features = [cname for cname in X.columns if X[cname].nunique() < 20 and X[cname].dtype == "object"]
categorical_features = ['Sex', 'Cabin', 'Ticket', 'Name', 'Pclass']

# Select numerical columns
# numerical_features = [cname for cname in X.columns if X[cname].dtype in ['int64', 'float64']]
numerical_features = ['Age', 'SibSp', 'Parch', 'Fare']

#Defining the steps in the categorical pipeline 
categorical_pipeline = Pipeline(steps = [
    ('cat_selector', FeatureSelector(categorical_features)),
    ('cat_transformer', CategoricalTransformer()),
    ('imputer', SimpleImputer(strategy = 'constant', 
                               fill_value = 'None')),
    ('one_hot_encoder', OneHotEncoder(sparse = False, 
                                      handle_unknown = 'ignore'))
])
    
#Defining the steps in the numerical pipeline     
numerical_pipeline = Pipeline(steps = [
    ('num_selector', FeatureSelector(numerical_features)),
    ('num_transformer', NumericalTransformer()),
    ('imputer', SimpleImputer(strategy = 'median'))
])

#Combining numerical and categorical piepline into one full big pipeline horizontally 
#using FeatureUnion
full_pipeline = FeatureUnion(transformer_list = [
    ('categorical_pipeline', categorical_pipeline),
    ('numerical_pipeline', numerical_pipeline)
])

In [332]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

# Import data
train_data = pd.read_csv('Data/train.csv')
test_data = pd.read_csv('Data/test.csv')

y = train_data['Survived'].copy()

X = train_data.drop('Survived', axis = 1).copy()
X_test = test_data.copy()

# Divide data into training and validation subsets
X_train, X_valid, y_train, y_valid = train_test_split(X, y,
                                                      train_size=0.8, test_size=0.2,
                                                      random_state=0)

from sklearn.svm import SVC
model = SVC(kernel="rbf", random_state=42)

# The full pipeline as a step in another pipeline with an estimator as the final
# step
full_pipeline_m = Pipeline(steps = [
    ('full_pipeline', full_pipeline),
    ('model', model)
])

#Can call fit on it just like any other pipeline
full_pipeline_m.fit(X_train, y_train)

scores = cross_val_score(full_pipeline_m, X, y, cv=10, scoring = "accuracy")
print("Scores:", scores)
print("Mean:", scores.mean())
print("Standard Deviation:", scores.std())

# param_grid = { 
#     'model__n_estimators': np.arange(1, 251, 25),
#     'model__max_features': ['auto', 'sqrt', 'log2'],
#     'model__max_depth' : [4,5,6,7,8],
#     'model__criterion' :['gini', 'entropy']
# }

# CV = GridSearchCV(full_pipeline_m, param_grid, iid = False, n_jobs= 1)
                  
# CV.fit(X_train, y_train)  
# print(CV.best_params_)   
# print(CV.best_score_)

# #Can predict with it like any other pipeline
y_pred = full_pipeline_m.predict(X_valid) 

error = mean_absolute_error(y_pred, y_valid)
1 - error

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs

KeyError: None

In [329]:
full_pipeline_m.fit(X, y)

y_pred = full_pipeline_m.predict(test_data) 

output = pd.DataFrame({'PassengerId' : test_data.PassengerId, 'Survived' : y_pred})
output.to_csv('RF_submission_5.csv', index = False)
print("Saved it!")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs

Saved it!


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [368]:
import re
import numpy as np 
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import FeatureUnion, Pipeline, make_pipeline

#Custom Transformer that extracts columns passed as argument to its constructor 
class FeatureSelector(BaseEstimator, TransformerMixin):
    #Class Constructor 
    def __init__(self, feature_names):
        self._feature_names = feature_names 
    
    #Return self nothing else to do here    
    def fit( self, X, y = None):
        return self 
    
    #Method that describes what we need this transformer to do
    def transform(self, X, y = None):
        return X[self._feature_names] 

class ColumnSelector(BaseEstimator, TransformerMixin):
    def __init__(self, columns):
        self.columns = columns

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        assert isinstance(X, pd.DataFrame)

        try:
            return X[self.columns]
        except KeyError:
            cols_error = list(set(self.columns) - set(X.columns))
            raise KeyError("The DataFrame does not include the columns: %s" % cols_error)
    
# Cardinality: number of unique values in a column
# Select categorical columns with relatively low cardinality (convenient but arbitrary)
# categorical_features = [cname for cname in X.columns if X[cname].nunique() < 20 and X[cname].dtype == "object"]
categorical_features = ['Sex', 'Cabin']

# Select numerical columns
# numerical_features = [cname for cname in X.columns if X[cname].dtype in ['int64', 'float64']]
numerical_features = ['Age', 'SibSp', 'Parch', 'Fare']

all_features = categorical_features + numerical_features

#Defining the steps in the categorical pipeline 
categorical_pipeline = Pipeline(steps = [
    ('cat_selector', FeatureSelector(categorical_features)),
#     ('cat_transformer', CategoricalTransformer()),
    ('imputer', SimpleImputer(strategy = 'most_frequent', 
                               fill_value = 'None')),
    ('one_hot_encoder', OneHotEncoder(sparse = True, 
                                      handle_unknown = 'ignore'))
])
    
#Defining the steps in the numerical pipeline     
numerical_pipeline = Pipeline(steps = [
    ('num_selector', FeatureSelector(numerical_features)),
#     ('num_transformer', NumericalTransformer()),
    ('imputer', SimpleImputer(strategy = 'median'))
])

#Combining numerical and categorical piepline into one full big pipeline horizontally 
#using FeatureUnion
preprocess_pipeline = make_pipeline(
    ColumnSelector(columns=all_features),
    FeatureUnion(transformer_list = [
        ('categorical_pipeline', categorical_pipeline),
        ('numerical_pipeline', numerical_pipeline)
    ])
)

In [369]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

# Import data
train_data = pd.read_csv('Data/train.csv')
test_data = pd.read_csv('Data/test.csv')

y = train_data['Survived'].copy()

X = train_data.drop('Survived', axis = 1).copy()
X_test = test_data.copy()

# Divide data into training and validation subsets
X_train, X_valid, y_train, y_valid = train_test_split(X, y,
                                                      train_size=0.8, test_size=0.2,
                                                      random_state=0)

from sklearn.svm import SVC
model = SVC(kernel="rbf", random_state=42)

# The full pipeline as a step in another pipeline with an estimator as the final
# step
full_pipeline_m = Pipeline(steps = [
    ('pre_pipeline', preprocess_pipeline),
    ('model', model)
])

#Can call fit on it just like any other pipeline
full_pipeline_m.fit(X_train, y_train)

param_grid = {
    "model__gamma": [0.1 * x for x in range(1, 6)]
}

classifier_model = GridSearchCV(full_pipeline_m, param_grid, cv=10)
classifier_model.fit(X_train, y_train)

y_score = classifier_model.decision_function(X_test)

fpr, tpr, thresholds = roc_curve(y_test, y_score)
roc_auc = roc_auc_score(y_test, y_score)

# Plot ROC curve
plt.figure(figsize=(16, 12))
plt.plot(fpr, tpr, label='ROC curve (area = %0.3f)' % roc_auc)
plt.plot([0, 1], [0, 1], 'k--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.0])
plt.xlabel('False Positive Rate (1 - Specificity)', size=16)
plt.ylabel('True Positive Rate (Sensitivity)', size=16)
plt.title('ROC Curve', size=20)
plt.legend(fontsize=14);

# param_grid = { 
#     'model__n_estimators': np.arange(1, 251, 25),
#     'model__max_features': ['auto', 'sqrt', 'log2'],
#     'model__max_depth' : [4,5,6,7,8],
#     'model__criterion' :['gini', 'entropy']
# }

# CV = GridSearchCV(full_pipeline_m, param_grid, iid = False, n_jobs= 1)
                  
# CV.fit(X_train, y_train)  
# print(CV.best_params_)   
# print(CV.best_score_)

# #Can predict with it like any other pipeline
y_pred = full_pipeline_m.predict(X_valid) 

error = mean_absolute_error(y_pred, y_valid)
1 - error



KeyError: None