In [14]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, FunctionTransformer
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer
from sklego.preprocessing import RepeatingBasisFunction
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_log_error

## 1. Load the data and split into X & y

In [15]:
# Read the training set; the `datetime` column is parsed into timestamps on load.
df = pd.read_csv('../data/train.csv', parse_dates=['datetime'])
# Target: the `count` column.
y = df['count']
# Features: every column other than `count`, `casual` and `registered`.
X = df.drop(columns=['count', 'casual', 'registered'])
# The model is trained on log(1 + y) instead of y itself.
y_log = np.log1p(y)

## 2. Create pipeline, columntransformer and model

In [16]:
# add the needed time features
def date_time_transformation(df):
    """Derive calendar features from the ``datetime`` column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a datetime-typed ``datetime`` column.

    Returns
    -------
    pandas.DataFrame
        A new frame with ``hour``, ``dayofyear`` and ``year`` columns added
        and the ``datetime`` column removed. The input frame is not modified.
    """
    timestamps = df['datetime']
    # `assign` returns a new frame, so the caller's data (e.g. a train/test
    # slice) is never written to in place.
    return df.assign(
        hour=timestamps.dt.hour,
        dayofyear=timestamps.dt.dayofyear,
        year=timestamps.dt.year,
    ).drop(columns='datetime')

In [17]:
# Min-max scaling followed by a degree-4 polynomial expansion for the numeric columns.
# NOT USED: the cells shown here never reference these two names.
numerical_features = ['atemp', 'humidity']
numerical_transformer = make_pipeline(
    MinMaxScaler(),
    PolynomialFeatures(degree=4, interaction_only=False, include_bias=False),
)

In [18]:
# Column-wise preprocessing, using the optimized parameters:
#   * hour, dayofyear     -> RepeatingBasisFunction
#   * atemp, humidity     -> MinMaxScaler
#   * workingday, year    -> OneHotEncoder
# Every column not listed below is dropped.
# NOTE(review): `sparse=` was renamed `sparse_output=` in newer scikit-learn
# releases -- confirm against the version this notebook is pinned to.
preprocessor = ColumnTransformer(
    remainder='drop',
    transformers=[
        (
            "hour",
            RepeatingBasisFunction(column="hour", n_periods=14, input_range=(0, 23), remainder="drop"),
            ['hour'],
        ),
        (
            "month",
            RepeatingBasisFunction(column="dayofyear", n_periods=36, input_range=(1, 365), remainder="drop"),
            ['dayofyear'],
        ),
        (
            'numeric_polinomial',
            MinMaxScaler(),
            ['atemp', 'humidity'],
        ),
        (
            'categorical',
            OneHotEncoder(handle_unknown='ignore', drop='first', sparse=False),
            ['workingday', 'year'],
        ),
    ],
)


In [19]:
# Model pipeline, using the optimized parameters:
#   1. derive the time features from `datetime`
#   2. apply the column transformer
#   3. expand to degree-2 polynomial features
#   4. fit a ridge regression on the result
pipeline = make_pipeline(
    FunctionTransformer(date_time_transformation),
    preprocessor,
    PolynomialFeatures(degree=2, interaction_only=False, include_bias=False),
    Ridge(alpha=7),
)

## 3. Fit the model with Xtrain and ytrain

In [20]:
# Hold out 20% of the rows for validation; the fixed seed keeps the split repeatable.
Xtrain, Xval, ytrain_log, yval_log = train_test_split(
    X,
    y_log,
    random_state=42,
    test_size=0.2,
)

In [21]:
# Fit on the training split, then display the R2 score (on the log target)
# for the training and validation data.
pipeline.fit(Xtrain, ytrain_log)
(
    pipeline.score(Xtrain, ytrain_log),
    pipeline.score(Xval, yval_log),
)

(0.9422971471882301, 0.9411374571748645)

## 4. Calculate the MSLE

In [22]:
# Predict the logarithmic y for the training and validation splits (in that order).
y_pred_train_log, y_pred_val_log = (
    pipeline.predict(features) for features in (Xtrain, Xval)
)

In [23]:
# De-log the y predictions. `np.expm1` is the exact inverse of the `np.log1p`
# applied to the target, and is the same function used to recover the true
# values for the MSLE.
# Clip at zero: a predicted count cannot be negative, and
# mean_squared_log_error rejects negative inputs.
y_pred_train = np.clip(np.expm1(y_pred_train_log), 0, None)
y_pred_val = np.clip(np.expm1(y_pred_val_log), 0, None)

In [27]:
# MSLE for the training and validation data; the true values are recovered
# from the log targets with expm1 before comparing.
msle_ridge_train = mean_squared_log_error(
    y_true=np.expm1(ytrain_log),
    y_pred=y_pred_train,
)
msle_ridge_val = mean_squared_log_error(
    y_true=np.expm1(yval_log),
    y_pred=y_pred_val,
)
print(msle_ridge_train, msle_ridge_val)

0.1156175239380077 0.12116885048182596


## 5. Prepare Kaggle upload file

In [25]:
# Refit the pipeline on the full data set and display its score on that same data.
# (`fit` returns the fitted pipeline, so the score call can be chained.)
pipeline.fit(X, y_log).score(X, y_log)

0.9430448121405939

In [26]:
# import test-dataset
Xtest = pd.read_csv('../data/test.csv', parse_dates=['datetime'])
# keep the timestamps so they can be paired with the predictions
df_datetime = Xtest[['datetime']]
# make logarithmic predictions
y_pred_test_log = pipeline.predict(Xtest)
# convert log prediction to prediction: expm1 is the exact inverse of the
# log1p applied to the target; clip at zero because a count cannot be negative
y_pred_test = np.clip(np.expm1(y_pred_test_log), 0, None)
df_y_pred_test = pd.DataFrame(y_pred_test)
# merge X datetime with y (both frames share the default row index)
test_result = pd.merge(df_datetime, df_y_pred_test, left_index=True, right_index=True)
test_result.columns = ['datetime', 'count']
# create the upload CSV file
test_result.to_csv('bike_result.csv', index=False)