# Bike sharing demand in Washington, D.C.


## This gives a Kaggle score of ~0.5181


https://www.kaggle.com/c/bike-sharing-demand/data


- hourly rental data spanning two years
- the training set comprises the first 19 days of each month, while the test set covers the 20th to the end of the month
- goal: predict the total **count** of bikes rented during each hour covered by the test set, using only information available prior to the rental period
- evaluation based on [Root Mean Squared Logarithmic Error](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.mean_squared_log_error.html)
- **model should only use information which was available prior to the time for which it is forecasting**

In [None]:
import pandas as pd
import numpy as np
from sklearn import metrics
from sklearn.multioutput import MultiOutputRegressor
from sklearn.preprocessing import FunctionTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import PoissonRegressor
from sklearn.model_selection import train_test_split
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import PolynomialFeatures, OneHotEncoder, StandardScaler

### Get Data - Test Train Split

In [None]:
# Read the three competition CSV files; parse_dates=[0] asks pandas to parse
# the first column of each file as dates.
train, test, submission = (
    pd.read_csv(csv_path, parse_dates=[0])
    for csv_path in (
        './data/train.csv',
        './data/test.csv',
        './data/sampleSubmission.csv',
    )
)
# Last expression of the cell: preview the first rows of the training frame.
train.head()

### Explore Data

### Feature Engineering

In [None]:
def create_date_features(df):
    """Derive calendar features from the ``datetime`` column of *df*.

    Returns a new DataFrame with the columns ``month``, ``hour``, ``week``
    (taken from the ISO calendar) and ``weekday``, in that order.
    """
    stamps = df['datetime'].dt
    features = {
        'month': stamps.month,
        'hour': stamps.hour,
        'week': stamps.isocalendar().week,
        'weekday': stamps.weekday,
    }
    return pd.DataFrame(features)

### Define preprocessing pipeline

In [None]:
# The raw timestamp is expanded into calendar features, which are then
# one-hot encoded; categories unseen during fitting are ignored at transform.
datetime_pipeline = make_pipeline(
    FunctionTransformer(create_date_features, validate=False),
    OneHotEncoder(handle_unknown='ignore'),
)

# Column groups, each paired below with the preprocessing applied to it.
numeric_columns = ['temp', 'atemp', 'humidity', 'windspeed']
categorical_columns = ['season', 'weather']
flag_columns = ['workingday', 'holiday']

transformer = make_column_transformer(
    (datetime_pipeline, ['datetime']),
    (StandardScaler(), numeric_columns),
    (OneHotEncoder(handle_unknown='ignore'), categorical_columns),
    ('passthrough', flag_columns),
)


### Define model pipeline

In [None]:
# MultiOutputRegressor wraps the Poisson regressor so it can be fitted on a
# target with several columns.
count_regressor = MultiOutputRegressor(PoissonRegressor(max_iter=10000))

# Preprocessing -> polynomial feature expansion -> Poisson regression.
model_pipeline = make_pipeline(transformer, PolynomialFeatures(), count_regressor)

### Hyperparameter Optimization

In [None]:
# Show the parameter names exposed by the pipeline; the keys of param_grid
# are taken from this list.
model_pipeline.get_params().keys()

In [None]:
# Search space for the grid search, keyed by pipeline parameter name.
param_grid = {}
param_grid['polynomialfeatures__degree'] = [1, 2]
param_grid['multioutputregressor__estimator__alpha'] = [0.001, 0.01, 0.1, 1, 10]

### Define model input and output

In [None]:
# Features: every column of the training frame except the three count columns.
X_train = train.drop(columns=['casual', 'registered', 'count'])
# Targets: the casual and registered counts, kept as two separate columns.
y_train = train[['casual', 'registered']]

### Define customized evaluation metric

In [None]:
def rmsle(y, y_pred):
    """Return the root mean squared logarithmic error of the row totals.

    The columns of ``y`` and of ``y_pred`` are summed row-wise first, so the
    error is measured on the per-row totals rather than on each column.
    Negative predicted totals are replaced by zero before scoring.
    """
    actual_total = np.sum(y, axis=1)
    # Clamp negative totals to zero before they reach the log-based metric.
    predicted_total = np.maximum(np.sum(y_pred, axis=1), 0)
    msle = metrics.mean_squared_log_error(actual_total, predicted_total)
    return np.sqrt(msle)

# greater_is_better=False marks the metric as a loss (lower is better).
neg_rmsle_score = metrics.make_scorer(rmsle, greater_is_better=False)

### Define Grid Search and train

In [None]:
# Grid search over param_grid with cv=5, scored by the custom RMSLE scorer.
# refit=True so that cv.best_estimator_ is available afterwards.
cv = GridSearchCV(
    estimator=model_pipeline,
    param_grid=param_grid,
    scoring=neg_rmsle_score,
    cv=5,
    refit=True,
    return_train_score=True,
    n_jobs=4,
    verbose=1,
)
cv.fit(X_train, y_train)

# Keep the search results as a DataFrame for inspection.
cv_res = pd.DataFrame(cv.cv_results_)

### Check results

In [None]:
# Show the two tuned parameters next to the mean train and test scores;
# abs() displays the scores as positive values.
result_columns = [
    'param_polynomialfeatures__degree',
    'param_multioutputregressor__estimator__alpha',
    'mean_train_score',
    'mean_test_score',
]
cv_res[result_columns].abs()

In [None]:
# Parameter combination selected by the grid search.
cv.best_params_

## Make predictions on entire test set

In [None]:
# Predict both target columns for the test set and add them up per row.
y_pred = cv.best_estimator_.predict(test).sum(axis=1)
# Sanity check: no predicted total may be negative.
assert (y_pred >= 0).all()

## Make prediction on one sample and compare to ground truth

In [None]:
# Show one randomly chosen row of the test set.
test.sample(1)

In [None]:
# Draw one random training row, restricted to the columns the test set has.
test_x = train[test.columns].sample(1)

In [None]:
# Display the sampled row.
test_x

In [None]:
# Model prediction for the sampled row.
cv.best_estimator_.predict(test_x)

In [None]:
# Show the full training row for the sample, including its true counts.
# test_x.index holds an index label, so the lookup uses label-based .loc;
# positional .iloc would only match while train keeps a default 0..n-1 index.
train.loc[test_x.index]