### Hackathon pipeline template

This document sketches one way to build a modelling pipeline.

In [102]:
# general imports
import itertools
import re
from functools import reduce

import pandas as pd

# general imports from sklearn
from sklearn.base import BaseEstimator, TransformerMixin, clone
from sklearn.pipeline import Pipeline

# import for models
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR

# imports for transformation
from sklearn.preprocessing import PolynomialFeatures

# splitting for crossvalidation procedure
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import TimeSeriesSplit

# evaluation metrics
from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import mean_absolute_error as mae
from sklearn.metrics import explained_variance_score as ev
from sklearn.metrics import r2_score as r2

In [127]:
def clean_column_names(df):
    '''Strip every character that is not an ASCII letter from the column names.

    The frame is relabelled in place and also returned, so the call can be used
    either for its side effect or in an assignment.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose column labels should be cleaned. Labels are converted with
        ``str`` before cleaning, so non-string labels are accepted.

    Returns
    -------
    pandas.DataFrame
        The same object as ``df`` with cleaned column labels. A label without
        any letters becomes the empty string. Labels that differ only in
        non-letter characters end up identical.
    '''
    # Read the labels from the argument rather than from a notebook-level
    # variable, so the function works for any frame and on a fresh kernel.
    # "".join also copes with labels that contain no letters at all.
    df.columns = [
        "".join(re.findall("[a-zA-Z]", str(colname)))
        for colname in df.columns
    ]

    return df

In [269]:
# import data
# NOTE(review): the original comment cited
# https://archive.ics.uci.edu/ml/machine-learning-databases/forest-fires/
# as the source, but the file read here is named KNMI_20171121.txt -- confirm
# the actual provenance of the dataset.
data = pd.read_csv('KNMI_20171121.txt', header=48, sep=',')

# drop the row labelled 0, then shift the index so it starts at 0 again
data = data.drop(0)
data.index -= 1

# break down the YYYYMMDD column into integer year / month / day columns
data["YYYYMMDD"] = data["YYYYMMDD"].astype("str")
data['Year'] = data["YYYYMMDD"].str[0:4].astype("int")
data['MM'] = data["YYYYMMDD"].str[4:6].astype("int")
data['DD'] = data["YYYYMMDD"].str[6:8].astype("int")
data = data.drop(columns="YYYYMMDD")

# keep only the letters of every header (removes spaces and other symbols)
data = clean_column_names(data)

# remove object-typed columns
# The dtypes are collected through a dict on purpose: if cleaning produced
# duplicate labels, only the last dtype per label is inspected, and the drop
# below removes every column carrying a selected label.
dict_type = dict(data.dtypes)
objectcols = [key for key, value in dict_type.items() if "object" in str(value)]

# `columns=` instead of a positional axis: pandas >= 2.0 rejects drop(labels, 1)
data = data.drop(columns=objectcols)

data.head()

Unnamed: 0,DDVEC,FHVEC,FG,FHX,FHXH,FHN,FHNH,FXX,FXXH,Year,MM,DD
0,334.0,55.0,67.0,110.0,13.0,30.0,1.0,140.0,13.0,2016,11,1
1,320.0,96.0,99.0,120.0,8.0,70.0,23.0,180.0,13.0,2016,11,2
2,251.0,52.0,72.0,130.0,23.0,20.0,11.0,160.0,23.0,2016,11,3
3,197.0,97.0,100.0,140.0,3.0,70.0,22.0,180.0,3.0,2016,11,4
4,291.0,72.0,78.0,130.0,24.0,40.0,3.0,180.0,19.0,2016,11,5


In [270]:
# split data in train and test
# continue with train to prevent leakage
# The frame is split directly, so no index labels are fed into positional
# `iloc` lookups and `train` / `test` are DataFrames from the start.
# A fixed random_state makes the split (and every score below) reproducible.
train, test = train_test_split(data, test_size=0.3, random_state=42)

In [272]:
# create functions for pipeline
class NoTransformation(BaseEstimator, TransformerMixin):
    """Identity transformer that lets a pipeline step be skipped.

    It has no parameters and learns nothing: ``fit`` returns the transformer
    itself and ``transform`` hands back its input unchanged.
    """

    # No __init__ is defined: the original one only executed ``self = self``,
    # which has no effect, and the transformer takes no parameters.

    def fit(self, X, y=None):
        """Do nothing and return ``self``; ``X`` and ``y`` are ignored."""
        return self

    def transform(self, X):
        """Return ``X`` itself (the same object, not a copy)."""
        return X

def build_pipeline(model_name, trans_name, model_dict, trans_dict):
    """Assemble a two-step pipeline: a transformation followed by a model.

    Parameters
    ----------
    model_name : hashable
        Key into ``model_dict`` selecting the final estimator.
    trans_name : hashable
        Key into ``trans_dict`` selecting the transformer.
    model_dict : mapping
        Maps model names to estimator instances.
    trans_dict : mapping
        Maps transformation names to transformer instances.

    Returns
    -------
    sklearn.pipeline.Pipeline
        Unfitted pipeline with the steps ``'transformation'`` and
        ``'classifier'`` (step names kept as they were, so existing lookups by
        step name keep working).

    Raises
    ------
    KeyError
        If ``model_name`` or ``trans_name`` is not a key of its dictionary.
    """
    # Pipeline stores the objects it is given and fits them in place. Cloning
    # gives every pipeline its own unfitted copies, so fitting one pipeline
    # never changes another pipeline or the templates kept in the dictionaries.
    estimator = Pipeline([
        ('transformation', clone(trans_dict[trans_name])),
        ('classifier', clone(model_dict[model_name])),
    ])

    return estimator

In [273]:
# do things with train such as make dummies from month and day features
# you can do this with standard sklearn functions such as OneHotEncoder
# or you can make your own adaptations, create classes and functions like the NoTransformation class above

In [274]:
# candidate estimators and feature transformations, keyed by the short names
# that are used to select them when a pipeline is built

# estimators to compare
model_dict = dict(
    lr=LinearRegression(),
    svr=SVR(),
)

# transformations to compare
trans_dict = dict(
    none=NoTransformation(),  # identity step: the data is passed through untouched
    poly=PolynomialFeatures(2),
)

In [275]:
# name of the target column
y_variable = 'FG'

# cross-sectional data: plain K-fold cross-validation with five folds
k_fold = KFold(n_splits=5)

# time-series data: swap in TimeSeriesSplit instead
# tscv = TimeSeriesSplit(n_splits=5)

In [276]:
# Score every model/transformation combination on every cross-validation fold.

# Features and target are split once, outside the loops. `columns=` replaces
# the positional axis argument, which pandas >= 2.0 no longer accepts.
X_train = train.drop(columns=y_variable)
y_train = train[y_variable]

evaluation_rows = []
for fold, (train_indices, cv_indices) in enumerate(k_fold.split(train), start=1):
    # k_fold.split yields positional indices, hence iloc
    X_fit, y_fit = X_train.iloc[train_indices], y_train.iloc[train_indices]
    X_cv, y_cv = X_train.iloc[cv_indices], y_train.iloc[cv_indices]

    for model_name, trans_name in itertools.product(model_dict, trans_dict):
        estimator = build_pipeline(model_name, trans_name, model_dict, trans_dict)
        estimator.fit(X_fit, y_fit)
        y_pred = estimator.predict(X_cv)
        evaluation_rows.append([fold, model_name, trans_name,
                                mse(y_cv, y_pred),
                                mae(y_cv, y_pred),
                                ev(y_cv, y_pred),
                                r2(y_cv, y_pred)])

evaluation_values = pd.DataFrame(
    evaluation_rows,
    columns=['fold', 'model', 'transformation', 'mse', 'mae', 'ev', 'r2'],
)

# mean of every metric over the folds, per model/transformation combination
evaluation_values.groupby(['model', 'transformation']).agg('mean').drop(columns='fold')

Unnamed: 0_level_0,Unnamed: 1_level_0,mse,mae,ev,r2
model,transformation,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
lr,none,23.776878,3.561359,0.9713243,0.970349
lr,poly,36.843153,4.097276,0.9554151,0.954391
svr,none,856.843962,23.49979,1.66501e-05,-0.042555
svr,poly,856.854984,23.500487,4.4408920000000007e-17,-0.042569


In [277]:
# apply to test set
# Refit the 'lr' / 'none' combination on the complete training set and report
# the mean absolute error on the held-out test set.
estimator = build_pipeline('lr', 'none', model_dict, trans_dict)
# `columns=` replaces the positional axis argument, which pandas >= 2.0 rejects
estimator.fit(train.drop(columns=y_variable), train[y_variable])
y_pred = estimator.predict(test.drop(columns=y_variable))
mae(test[y_variable], y_pred)

3.2923818521834431