In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, FunctionTransformer
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer
from sklego.preprocessing import RepeatingBasisFunction
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV

## 1. Load the data and split it into X & y

In [2]:
# Read the training data; 'datetime' is parsed as a timestamp column.
df = pd.read_csv('../data/train.csv', parse_dates=['datetime'])

# Features: every column except 'count', 'casual' and 'registered'.
X = df.drop(columns=['count', 'casual', 'registered'])

# Target: 'count' on a log(1 + y) scale.
y = np.log1p(df['count'])

## 2. Create the pipeline, ColumnTransformer and model

In [3]:
# add the needed time features
def date_time_transformation(df):
    """Derive calendar features from the ``datetime`` column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a datetime-typed column named ``datetime``.

    Returns
    -------
    pandas.DataFrame
        A new frame with the columns ``hour``, ``dayofyear`` and ``year``
        added (in that order) and the ``datetime`` column removed.
        The frame passed in is left unmodified.
    """
    timestamps = df['datetime']
    # assign() builds a new frame, so the caller's data is never written to.
    # Assigning columns directly on the argument would silently add them to
    # the caller's frame and can trigger SettingWithCopyWarning on slices.
    return df.assign(
        hour=timestamps.dt.hour,
        dayofyear=timestamps.dt.dayofyear,
        year=timestamps.dt.year,
    ).drop(columns='datetime')

In [4]:
# Column-wise preprocessing. Columns not named in any transformer below are
# discarded (remainder='drop').
preprocessor = ColumnTransformer(
    remainder='drop',
    transformers=[
        # hour of day -> 18 repeating basis functions over the range 0..23
        (
            "hour",
            RepeatingBasisFunction(column="hour", input_range=(0, 23), n_periods=18, remainder="drop"),
            ['hour'],
        ),
        # day of year -> 12 repeating basis functions over the range 1..365
        (
            "month",
            RepeatingBasisFunction(column="dayofyear", input_range=(1, 365), n_periods=12, remainder="drop"),
            ['dayofyear'],
        ),
        # rescale the two continuous inputs
        ('numeric_polinomial', MinMaxScaler(), ['atemp', 'humidity']),
        # one-hot encode, dropping the first level of each feature.
        # NOTE(review): newer scikit-learn releases rename `sparse` to
        # `sparse_output` - confirm against the installed version.
        (
            'categorical',
            OneHotEncoder(drop='first', sparse=False, handle_unknown='ignore'),
            ['workingday', 'year'],
        ),
    ],
)


In [5]:
# Assemble the whole model as a single estimator. The steps run in this order:
#   1. derive the time features from 'datetime'
#   2. apply the column-wise preprocessing
#   3. expand to degree-2 polynomial features (no bias column)
#   4. fit a Ridge regression with alpha=2.0
pipeline = make_pipeline(
    FunctionTransformer(date_time_transformation),
    preprocessor,
    PolynomialFeatures(degree=2, interaction_only=False, include_bias=False),
    Ridge(alpha=2.0),
)

## 3. Fit the model with Xtrain and ytrain

In [6]:
# Hold out 20% of the rows (fixed seed) as a quick check that the pipeline
# runs end to end.
Xtrain, Xval, ytrain, yval = train_test_split(X, y, test_size=0.2, random_state=42)
pipeline.fit(Xtrain, ytrain)
# Displayed result: (score on the training rows, score on the held-out rows)
tuple(
    pipeline.score(features, target)
    for features, target in ((Xtrain, ytrain), (Xval, yval))
)

(0.9402634964339578, 0.941076413626622)

## 4. Hyperparameter optimization

In [7]:
# List every parameter of the pipeline (nested names are joined with `__`)
# to choose the ones to optimize.
pipeline.get_params()

{'memory': None,
 'steps': [('functiontransformer',
   FunctionTransformer(func=<function date_time_transformation at 0x7fd507eda5e0>)),
  ('columntransformer',
   ColumnTransformer(transformers=[('hour',
                                    RepeatingBasisFunction(column='hour',
                                                           input_range=(0, 23),
                                                           n_periods=18),
                                    ['hour']),
                                   ('month',
                                    RepeatingBasisFunction(column='dayofyear',
                                                           input_range=(1, 365)),
                                    ['dayofyear']),
                                   ('numeric_polinomial', MinMaxScaler(),
                                    ['atemp', 'humidity']),
                                   ('categorical',
                                    OneHotEncoder(drop='first',
             

In [8]:
# Search space for the grid search: 3 * 2 * 4 * 2 * 2 = 96 candidates.
# Keys are the nested parameter names reported by pipeline.get_params().
hyperparams = dict(
    # number of basis functions for the two cyclical features
    columntransformer__hour__n_periods=[14, 16, 18],
    columntransformer__month__n_periods=[24, 36],
    # regularization strength of the final Ridge step
    ridge__alpha=[5, 7, 9, 11],
    # shape of the polynomial expansion
    polynomialfeatures__include_bias=[True, False],
    polynomialfeatures__interaction_only=[True, False],
)

In [9]:
# Exhaustive search over `hyperparams`; each candidate is scored with
# 5-fold cross-validation.
g2 = GridSearchCV(estimator=pipeline, param_grid=hyperparams, cv=5)
# Fit on the complete X and y: the cross-validation folds take the place of
# a manual train/validation split.
g2.fit(X, y)

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('functiontransformer',
                                        FunctionTransformer(func=<function date_time_transformation at 0x7fd507eda5e0>)),
                                       ('columntransformer',
                                        ColumnTransformer(transformers=[('hour',
                                                                         RepeatingBasisFunction(column='hour',
                                                                                                input_range=(0,
                                                                                                             23),
                                                                                                n_periods=18),
                                                                         ['hour']),
                                                                        ('month',
                                        

In [10]:
# Show the parameter combination that the grid search selected as best
g2.best_params_

{'columntransformer__hour__n_periods': 14,
 'columntransformer__month__n_periods': 36,
 'polynomialfeatures__include_bias': True,
 'polynomialfeatures__interaction_only': False,
 'ridge__alpha': 11}

In [11]:
# Show the pipeline configured with the best parameters; this is the model
# the grid-search object uses to make predictions
g2.best_estimator_

Pipeline(steps=[('functiontransformer',
                 FunctionTransformer(func=<function date_time_transformation at 0x7fd507eda5e0>)),
                ('columntransformer',
                 ColumnTransformer(transformers=[('hour',
                                                  RepeatingBasisFunction(column='hour',
                                                                         input_range=(0,
                                                                                      23),
                                                                         n_periods=14),
                                                  ['hour']),
                                                 ('month',
                                                  RepeatingBasisFunction(column='dayofyear',
                                                                         input_range=(1,
                                                                                      365),
               

In [12]:
# Cross-validated score achieved by the best parameter combination
g2.best_score_

0.8779030168462711

In [13]:
# Show the complete grid-search results as a table
# (one row per parameter combination, with per-split and mean test scores)
pd.DataFrame(g2.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_columntransformer__hour__n_periods,param_columntransformer__month__n_periods,param_polynomialfeatures__include_bias,param_polynomialfeatures__interaction_only,param_ridge__alpha,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.246136,0.005181,0.028191,0.001901,14,24,True,True,5,"{'columntransformer__hour__n_periods': 14, 'co...",0.677934,0.871179,0.899845,0.911675,0.919802,0.856087,0.090587,92
1,0.248386,0.024574,0.029167,0.003955,14,24,True,True,7,"{'columntransformer__hour__n_periods': 14, 'co...",0.689304,0.875459,0.903392,0.912832,0.923214,0.860840,0.087227,86
2,0.244020,0.011036,0.028821,0.001616,14,24,True,True,9,"{'columntransformer__hour__n_periods': 14, 'co...",0.696897,0.877753,0.904646,0.913488,0.925174,0.863592,0.084800,79
3,0.245547,0.006566,0.028207,0.001318,14,24,True,True,11,"{'columntransformer__hour__n_periods': 14, 'co...",0.702373,0.879101,0.904946,0.913863,0.926404,0.865337,0.082945,73
4,0.253976,0.001434,0.028846,0.001304,14,24,True,False,5,"{'columntransformer__hour__n_periods': 14, 'co...",0.741289,0.878387,0.912124,0.914883,0.925531,0.874443,0.068426,43
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
91,0.573369,0.009927,0.034811,0.001323,18,36,False,True,11,"{'columntransformer__hour__n_periods': 18, 'co...",0.727655,0.878607,0.903608,0.909740,0.927314,0.869385,0.072564,54
92,0.591350,0.004173,0.036220,0.001829,18,36,False,False,5,"{'columntransformer__hour__n_periods': 18, 'co...",0.760379,0.877654,0.909572,0.908831,0.926371,0.876562,0.060186,32
93,0.587628,0.006148,0.036109,0.001733,18,36,False,False,7,"{'columntransformer__hour__n_periods': 18, 'co...",0.759583,0.880999,0.909067,0.909334,0.927206,0.877238,0.060658,18
94,0.594673,0.008238,0.036467,0.000715,18,36,False,False,9,"{'columntransformer__hour__n_periods': 18, 'co...",0.759356,0.882723,0.908330,0.909675,0.927780,0.877573,0.060826,14
