### Hackathon pipeline template

This document presents an idea of how to create a modelling pipeline.

In [8]:
# general imports
import pandas as pd
import itertools

# general imports from sklearn
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline

# import for models
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR

# imports for transformation
from sklearn.preprocessing import PolynomialFeatures

# splitting for crossvalidation procedure
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import TimeSeriesSplit

# evaluation metrics
from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import mean_absolute_error as mae
from sklearn.metrics import explained_variance_score as ev
from sklearn.metrics import r2_score as r2

In [9]:
# Transform month and day names to integer data
def clean_week_and_day(dataFrame):
    """Replace the textual ``month`` and ``day`` columns with integer codes.

    Months are given as lowercase three-letter abbreviations ("jan" .. "dec")
    and become 1 .. 12; days ("mon" .. "sun") become 1 .. 7.

    Parameters
    ----------
    dataFrame : pandas.DataFrame
        Frame with a ``month`` and a ``day`` column holding the abbreviations
        listed above. The two columns are overwritten in place.

    Returns
    -------
    pandas.DataFrame
        The same frame that was passed in, with both columns converted.

    Raises
    ------
    KeyError
        If a value in either column is not one of the known abbreviations.
    """
    month_transform = {
        "jan": 1,
        "feb": 2,
        "mar": 3,
        "apr": 4,
        "may": 5,
        "jun": 6,
        "jul": 7,
        "aug": 8,
        "sep": 9,
        "oct": 10,
        "nov": 11,
        "dec": 12,
    }

    day_transform = {
        "mon": 1,
        "tue": 2,
        "wed": 3,
        "thu": 4,
        "fri": 5,
        "sat": 6,
        "sun": 7,
    }

    # Index the dicts directly (rather than Series.map(dict)) so that an
    # unexpected value fails loudly instead of silently turning into NaN.
    dataFrame["month"] = dataFrame["month"].apply(lambda name: month_transform[name])
    dataFrame["day"] = dataFrame["day"].apply(lambda name: day_transform[name])

    # Return the argument itself, not a notebook-level global.
    return dataFrame

In [10]:
# import data: read the forest-fires table from 'forestfires.csv' (path is relative to the working directory)
# this dataset I retrieved from https://archive.ics.uci.edu/ml/machine-learning-databases/forest-fires/
data = pd.read_csv('forestfires.csv',sep=',')
# Instead of dropping, replace month and day by numeric values
# (month names become 1-12, day names become 1-7)
data = clean_week_and_day(data)
# show the first rows as a quick sanity check of the converted columns
data.head()

Unnamed: 0,X,Y,month,day,FFMC,DMC,DC,ISI,temp,RH,wind,rain,area
0,7,5,3,5,86.2,26.2,94.3,5.1,8.2,51,6.7,0.0,0.0
1,7,4,10,2,90.6,35.4,669.1,6.7,18.0,33,0.9,0.0,0.0
2,7,4,10,6,90.6,43.7,686.9,6.7,14.6,33,1.3,0.0,0.0
3,8,6,3,5,91.7,33.3,77.5,9.0,8.3,97,4.0,0.2,0.0
4,8,6,3,7,89.3,51.3,102.2,9.6,11.4,99,1.8,0.0,0.0


In [11]:
# split data in train and test
# continue with train to prevent leakage
# Splitting the DataFrame directly returns two DataFrames (30% of the rows go to test).
# random_state pins the shuffle so that re-running the notebook reproduces the same split.
train, test = train_test_split(data, test_size=0.3, random_state=42)

In [12]:
# create functions for pipeline
class NoTranformation(BaseEstimator, TransformerMixin):
    """Identity transformer.

    Acts as a placeholder step so that a pipeline can be built with
    "no transformation" as one of the options: fitting learns nothing and
    transforming hands the input back unchanged.
    """

    def __init__(self):
        # there is nothing to configure
        pass

    def fit(self, X, y=None):
        """Learn nothing; return the transformer itself."""
        return self

    def transform(self, X):
        """Return X exactly as it was received."""
        return X

def build_pipeline(model_name, trans_name, model_dict, trans_dict):
    """Combine one transformation and one model into a two-step Pipeline.

    Parameters
    ----------
    model_name : key into ``model_dict`` selecting the final estimator.
    trans_name : key into ``trans_dict`` selecting the transformation.
    model_dict, trans_dict : mappings from short names to the objects to use.

    Returns
    -------
    A ``Pipeline`` whose first step is named 'transformation' and whose
    second step is named 'classifier'. The objects stored in the dictionaries
    are handed to the Pipeline directly.
    """
    transformer = trans_dict[trans_name]
    model = model_dict[model_name]
    steps = [('transformation', transformer), ('classifier', model)]
    return Pipeline(steps)

In [13]:
# do things with train, such as making dummies from the month and day features
# you can do this with standard sklearn functions such as OneHotEncoder
# or you can make your own adaptations: create classes and functions like the NoTranformation class above

In [14]:
# Candidates to combine: every model below is paired with every transformation.
# The keys are the short names used to select a pipeline and to label results.

# models to apply
model_dict = dict(
    lr=LinearRegression(),
    svr=SVR(),
)

# transformation to apply
trans_dict = dict(
    none=NoTranformation(),  # simply passes the data
    poly=PolynomialFeatures(2),
)

In [15]:
# name of the target column
y_variable = 'area'

# cross-sectional data: K-fold cross-validation with 5 folds
k_fold = KFold(n_splits=5)

# for timeseries data, use TimeSeriesSplit
# tscv = TimeSeriesSplit(n_splits=5)

In [16]:
# Cross-validate every (model, transformation) combination on the training set only.
# Features and target are separated once; each fold then slices them by position.
# `axis` is passed by keyword: pandas 2.0 no longer accepts it positionally in drop().
X_train = train.drop(y_variable, axis=1)
y_train = train[y_variable]

evaluation_rows = []
for i, (train_indices, cv_indices) in enumerate(k_fold.split(train), start=1):
    X_fit, y_fit = X_train.iloc[train_indices], y_train.iloc[train_indices]
    X_cv, y_cv = X_train.iloc[cv_indices], y_train.iloc[cv_indices]
    for model_name, trans_name in itertools.product(model_dict, trans_dict):
        estimator = build_pipeline(model_name, trans_name, model_dict, trans_dict)
        estimator.fit(X_fit, y_fit)
        y_pred = estimator.predict(X_cv)
        # one row per fold and combination: fold number, labels, then the four metrics
        evaluation_rows.append([i, model_name, trans_name,
                                mse(y_cv, y_pred),
                                mae(y_cv, y_pred),
                                ev(y_cv, y_pred),
                                r2(y_cv, y_pred)])

evaluation_values = pd.DataFrame(evaluation_rows, columns=['fold', 'model', 'transformation', 'mse', 'mae', 'ev', 'r2'])
# average each metric over the folds; the fold number itself is not a metric, so drop it
evaluation_values.groupby(['model', 'transformation']).agg('mean').drop('fold', axis=1)

Unnamed: 0_level_0,Unnamed: 1_level_0,mse,mae,ev,r2
model,transformation,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
lr,none,2148.821051,16.460362,-0.092861,-0.099574
lr,poly,3919.335186,29.964649,-2.369855,-2.406261
svr,none,2208.209688,10.21596,-0.00042,-0.088197
svr,poly,2208.115351,10.223627,0.000195,-0.087899


In [17]:
# apply to test set: refit the chosen combination on the full training set,
# then report the mean absolute error on the held-out test rows
# (`axis` is passed by keyword: pandas 2.0 no longer accepts it positionally in drop())
estimator = build_pipeline('lr', 'none', model_dict, trans_dict)
estimator.fit(train.drop(y_variable, axis=1), train[y_variable])
y_pred = estimator.predict(test.drop(y_variable, axis=1))
mae(test[y_variable], y_pred)

25.605572541203376