### Hackathon pipeline template

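This document sketches one way to build a modelling pipeline: load the data, hold out a test set, cross-validate every combination of model and transformation, and score the chosen pipeline on the test set.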

In [14]:
# general imports
import pandas as pd
import itertools

# general imports from sklearn
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline

# import for models
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR

# imports for transformation
from sklearn.preprocessing import PolynomialFeatures

# splitting for crossvalidation procedure
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import TimeSeriesSplit

# evaluation metrics
from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import mean_absolute_error as mae
from sklearn.metrics import explained_variance_score as ev
from sklearn.metrics import r2_score as r2

In [15]:
# Load the forest-fires data from a local CSV file.
# Source: https://archive.ics.uci.edu/ml/machine-learning-databases/forest-fires/
data = pd.read_csv('forestfires.csv',sep=',')
# NOTE(review): disabled step that would drop the categorical 'month' and 'day'
# columns — confirm whether the CSV on disk still contains them.
#data = data.drop(['month', 'day'], 1)
data.head()

Unnamed: 0,X,Y,FFMC,DMC,DC,ISI,temp,RH,wind,rain,area
0,7,5,86.2,26.2,94.3,5.1,8.2,51,6.7,0.0,0.0
1,7,4,90.6,35.4,669.1,6.7,18.0,33,0.9,0.0,0.0
2,7,4,90.6,43.7,686.9,6.7,14.6,33,1.3,0.0,0.0
3,8,6,91.7,33.3,77.5,9.0,8.3,97,4.0,0.2,0.0
4,8,6,89.3,51.3,102.2,9.6,11.4,99,1.8,0.0,0.0


In [16]:
# Sanity check: (number of rows, number of columns) of the loaded frame
data.shape

(517, 11)

In [17]:
# Hold out 30% of the rows as a test set and continue with `train` only,
# so that nothing learned from `test` leaks into model selection.
# The frame is split directly (no detour via index labels + iloc, which is only
# correct for a default RangeIndex), and the fixed seed makes the split — and
# every score derived from it — reproducible across runs.
train, test = train_test_split(data, test_size=0.3, random_state=42)

In [18]:
# Peek at the first rows of the training split
train.head()

Unnamed: 0,X,Y,FFMC,DMC,DC,ISI,temp,RH,wind,rain,area
109,4,5,88.6,91.8,709.9,7.1,17.4,56,5.4,0.0,0.0
373,5,4,94.8,222.4,698.6,13.9,20.3,42,2.7,0.0,0.0
58,2,2,84.0,9.3,34.0,2.1,13.9,40,5.4,0.0,0.0
418,2,4,92.0,203.2,664.5,8.1,24.9,42,5.4,0.0,2.44
154,1,5,93.4,145.4,721.4,8.1,29.6,27,2.7,0.0,1.46


In [None]:
def make_dummies(data):
    """Return ``data`` with its categorical columns one-hot encoded.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to encode. The input frame itself is left untouched.

    Returns
    -------
    pandas.DataFrame
        Non-categorical columns come first, unchanged; every object/category
        column is replaced by one indicator column per distinct value, named
        ``<column>_<value>``.
    """
    # The frame is an explicit argument so the function works on whichever
    # split it is given instead of reaching for a notebook-level variable.
    return pd.get_dummies(data)

In [None]:
# Pass the training split through make_dummies; the result gets its own name
# so `train` itself stays as it was.
train_new = make_dummies(train)


In [19]:
# create functions for pipeline
class NoTransformation(BaseEstimator, TransformerMixin):
    """Pass-through transformer: lets a pipeline step be skipped by leaving
    the data untouched."""

    # No __init__ needed: the transformer has no hyper-parameters to store.

    def fit(self, X, y=None):
        """Nothing to learn; return the transformer itself so calls can be chained."""
        return self

    def transform(self, X):
        """Return ``X`` exactly as received."""
        return X

def build_pipeline(model_name, trans_name, model_dict, trans_dict):
    """Assemble a two-step pipeline: a transformation followed by a model.

    ``trans_name`` and ``model_name`` are keys into ``trans_dict`` and
    ``model_dict``; an unknown key raises ``KeyError``. The pipeline holds the
    very objects stored in the dictionaries (no copies are made).
    """
    steps = [
        ('transformation', trans_dict[trans_name]),
        ('classifier', model_dict[model_name]),
    ]
    return Pipeline(steps)

In [None]:
class MakeDummies(BaseEstimator, TransformerMixin):
    """Transformer that one-hot encodes the categorical columns of a DataFrame.

    ``fit`` records the column layout that encoding the fitting data produces;
    ``transform`` encodes new data and aligns it to that layout, so training
    and validation data always end up with the same columns even when a
    category is missing from one of them. Input that is not a DataFrame is
    passed through unchanged.
    """

    def fit(self, X, y=None):
        """Remember the columns that one-hot encoding ``X`` produces.

        ``y`` is accepted for pipeline compatibility and ignored.
        """
        # pd.get_dummies detects the categorical (object/category) columns itself.
        if isinstance(X, pd.DataFrame):
            self.columns_ = pd.get_dummies(X).columns
        else:
            self.columns_ = None
        return self

    def transform(self, X):
        """Return ``X`` one-hot encoded and aligned to the fitted columns."""
        columns = getattr(self, 'columns_', None)
        if columns is None or not isinstance(X, pd.DataFrame):
            # Unfitted, or not a DataFrame: nothing to encode, hand X back untouched.
            return X
        # Categories unseen during fit are dropped; categories absent from X
        # become all-zero columns.
        return pd.get_dummies(X).reindex(columns=columns, fill_value=0)

In [20]:
# do things with train, such as making dummies from the month and day features;
# you can do this with standard sklearn transformers such as OneHotEncoder,
# or you can write your own classes and functions, like the NoTransformation class above

In [21]:
# Candidate estimators and preprocessing steps, keyed by the short names
# used to look them up when a pipeline is built.

# models to apply
model_dict = dict(
    lr=LinearRegression(),
    svr=SVR(),
)

# transformations to apply
trans_dict = dict(
    makedummies=MakeDummies(),
    none=NoTransformation(),  # passes the data through unchanged
    poly=PolynomialFeatures(degree=2),
)

In [22]:
# name of the target column
y_variable = 'area'

# cross-sectional data: plain K-fold cross-validation
# (random_state only matters when shuffling, which is off here)
k_fold = KFold(n_splits=5)

# for time-series data, use TimeSeriesSplit instead
# tscv = TimeSeriesSplit(n_splits=5)

In [27]:
# Cross-validate every (model, transformation) combination on the training split
# and collect one row of metrics per fold and combination.

# Features and target are loop-invariant, so separate them once.
# `columns=` replaces the positional axis argument, which pandas >= 2.0 rejects.
X_train = train.drop(columns=y_variable)
y_train = train[y_variable]

records = []
for fold, (train_indices, cv_indices) in enumerate(k_fold.split(train), start=1):
    X_fit, y_fit = X_train.iloc[train_indices], y_train.iloc[train_indices]
    X_cv, y_cv = X_train.iloc[cv_indices], y_train.iloc[cv_indices]
    for model_name, trans_name in itertools.product(model_dict, trans_dict):
        estimator = build_pipeline(model_name, trans_name, model_dict, trans_dict)
        estimator.fit(X_fit, y_fit)
        y_pred = estimator.predict(X_cv)
        records.append([fold, model_name, trans_name,
                        mse(y_cv, y_pred),
                        mae(y_cv, y_pred),
                        ev(y_cv, y_pred),
                        r2(y_cv, y_pred)])

evaluation_values = pd.DataFrame(
    records,
    columns=['fold', 'model', 'transformation', 'mse', 'mae', 'ev', 'r2'])

In [28]:
# Mean of each metric over the folds, per (model, transformation) combination.
# `columns=` replaces the positional axis argument, which pandas >= 2.0 rejects.
evaluation_values.groupby(['model', 'transformation']).mean().drop(columns='fold')

Unnamed: 0_level_0,Unnamed: 1_level_0,mse,mae,ev,r2
model,transformation,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
lr,none,5741.604269,23.989574,-0.65059,-0.948198
lr,poly,8106.656659,34.493342,-7.008543,-7.643062
svr,none,5723.392178,14.914542,-0.000107,-0.120233
svr,poly,5723.35983,14.928392,0.00042,-0.120975


In [12]:
# Apply to the test set: refit the 'svr' + 'none' pipeline on the whole
# training split, then report the mean absolute error on the held-out rows.
# `columns=` replaces the positional axis argument, which pandas >= 2.0 rejects.
estimator = build_pipeline('svr', 'none', model_dict, trans_dict)
estimator.fit(train.drop(columns=y_variable), train[y_variable])

# NOTE(review): test_new is computed but not used by the prediction below,
# which still reads `test` — confirm which frame is intended.
test_new = make_dummies(test)

y_pred = estimator.predict(test.drop(columns=y_variable))
mae(test[y_variable], y_pred)

19.94732412946983