In [84]:
import pandas as pd
import numpy as np

## Used for preprocesing and modelling
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.linear_model import Ridge
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split, RandomizedSearchCV

## Used for plotting
import altair as alt
import altair_ally as aly
aly.alt.data_transformers.enable('vegafusion')

df = pd.read_csv('data/SeoulBikeData.csv', encoding = 'latin-1')
print(df.head())

         Date  Rented Bike Count  Hour  Temperature(°C)  Humidity(%)  \
0  01/12/2017                254     0             -5.2           37   
1  01/12/2017                204     1             -5.5           38   
2  01/12/2017                173     2             -6.0           39   
3  01/12/2017                107     3             -6.2           40   
4  01/12/2017                 78     4             -6.0           36   

   Wind speed (m/s)  Visibility (10m)  Dew point temperature(°C)  \
0               2.2              2000                      -17.6   
1               0.8              2000                      -17.6   
2               1.0              2000                      -17.7   
3               0.9              2000                      -17.6   
4               2.3              2000                      -18.6   

   Solar Radiation (MJ/m2)  Rainfall(mm)  Snowfall (cm) Seasons     Holiday  \
0                      0.0           0.0            0.0  Winter  No Holiday   


In [85]:
### There's no missing value in the data.
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8760 entries, 0 to 8759
Data columns (total 14 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Date                       8760 non-null   object 
 1   Rented Bike Count          8760 non-null   int64  
 2   Hour                       8760 non-null   int64  
 3   Temperature(°C)            8760 non-null   float64
 4   Humidity(%)                8760 non-null   int64  
 5   Wind speed (m/s)           8760 non-null   float64
 6   Visibility (10m)           8760 non-null   int64  
 7   Dew point temperature(°C)  8760 non-null   float64
 8   Solar Radiation (MJ/m2)    8760 non-null   float64
 9   Rainfall(mm)               8760 non-null   float64
 10  Snowfall (cm)              8760 non-null   float64
 11  Seasons                    8760 non-null   object 
 12  Holiday                    8760 non-null   object 
 13  Functioning Day            8760 non-null   objec

In [86]:
## Renaming the colums for the simplicity
df=df.rename(columns={
    'Temperature(°C)':'Temperature',
    'Humidity(%)':'Humidity',
    'Rainfall(mm)':'Rainfall',
    'Snowfall (cm)':'Snowfall',
    'Wind speed (m/s)':'Wind speed',
    'Visibility (10m)':'Visibility',
    'Solar Radiation (MJ/m2)':'Radiation',
    'Dew point temperature(°C)':'Dew point temperature'})

In [87]:
#Convert the Date column in Datetime Dtype
df['Date']=pd.to_datetime(df['Date'], format = 'mixed')

# Extract features from the Date column
df['Year'] = df['Date'].dt.year
df['Month'] = df['Date'].dt.month
df['Day'] = df['Date'].dt.day
df['Weekday'] = df['Date'].dt.weekday
df = df.drop(['Date'], axis=1)  # Exclude unwanted columns

# Convert to categorical
df['Hour'] = df['Hour'].astype(str) 
df['Seasons'] = df['Seasons'].astype(str)

## Converting to binary for EDA and for values to feed into model
df['Holiday'] = df['Holiday'].apply(lambda x: 1 if x == "Holiday" else 0)
df['Functioning Day'] = df['Functioning Day'].apply(lambda x: 1 if x == "Yes" else 0)



1. Hour, Seasons should be OHE (Should be converted to Object/categorical first)
2. Holiday  and Functioning day should be binary encoded

In [88]:
train_df, test_df = train_test_split(df, test_size=0.3, random_state=123)
train_df

Unnamed: 0,Rented Bike Count,Hour,Temperature,Humidity,Wind speed,Visibility,Dew point temperature,Radiation,Rainfall,Snowfall,Seasons,Holiday,Functioning Day,Year,Month,Day,Weekday
3850,0,10,17.3,52,2.3,1235,7.3,2.38,0.0,0.0,Spring,0,0,2018,10,5,4
4491,562,3,19.6,68,1.7,1260,13.5,0.00,0.0,0.0,Summer,1,1,2018,6,6,2
3305,1632,17,18.3,29,4.3,1626,0.0,1.61,0.0,0.0,Spring,0,1,2018,4,17,1
2511,329,15,11.6,97,3.4,117,11.1,0.26,0.0,0.0,Spring,0,1,2018,3,15,3
2487,1025,15,21.7,39,3.3,1979,7.1,2.09,0.0,0.0,Spring,0,1,2018,3,14,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7382,0,14,24.3,39,1.7,2000,9.4,2.18,0.0,0.0,Autumn,0,0,2018,4,10,1
7763,1175,11,16.2,39,1.6,1580,2.2,2.05,0.0,0.0,Autumn,0,1,2018,10,20,5
5218,998,10,23.7,59,1.7,2000,15.2,0.89,0.0,0.0,Summer,0,1,2018,6,7,3
1346,54,2,-15.6,33,2.2,2000,-28.2,0.00,0.0,0.5,Winter,0,1,2018,1,26,4


In [89]:
##Temp - rental bike graph 
alt.Chart(train_df[train_df['Rented Bike Count']!=0]).mark_bar().encode(
    x = alt.X('Rented Bike Count', bin=alt.Bin(maxbins=30)),
    y = alt.Y('count()', title='Distribution of rental bikes'),
    tooltip = ['Rented Bike Count', 'count()']
)


In [90]:
##Hourly bike graph for seasons
alt.Chart(train_df).mark_line().encode(
    x = 'Hour',
    y = 'mean(Rented Bike Count)',
    color = 'Seasons',
    tooltip = ['Hour', 'mean(Rented Bike Count)']
)

In [91]:
## Correlation graph
aly.corr(train_df)

Here we can see that Temp and Dew point temp have a high correlation. For now we are dropping dew point temperature as most of the dew point temperature values are associated with temperature. Other values are not so highly correlated so not dropping them

In [None]:
## Model separation
X_train, y_train = train_df.drop("Rented Bike Count", axis = 1), train_df["Rented Bike Count"]
X_test, y_test = test_df.drop("Rented Bike Count", axis = 1), test_df["Rented Bike Count"]

In [None]:
# Define column transformer for preprocessing
column_transformer = make_column_transformer(
    (OneHotEncoder(), ['Hour', 'Seasons']),  # One-hot encode Hour and Seasons
    ("drop", ['Dew point temperature']),
    remainder='passthrough'  # Leave other columns as they are
)

# Ridge Regression Pipeline
ridge_pipeline = make_pipeline(
    column_transformer,
    StandardScaler(),
    Ridge()
)

# Decision Tree Pipeline
tree_pipeline = make_pipeline(
    column_transformer,
    StandardScaler(),
    DecisionTreeRegressor(random_state=42)
)


# Define parameter grids for RandomizedSearchCV
ridge_param_grid = {
    'ridge__alpha': np.logspace(-3, 3, 10)
}

tree_param_grid = {
    'decisiontreeregressor__max_depth': [None, 10, 20, 30, 40],
    'decisiontreeregressor__min_samples_split': [2, 5, 10],
    'decisiontreeregressor__min_samples_leaf': [1, 2, 4]
}

# Apply RandomizedSearchCV
ridge_search = RandomizedSearchCV(ridge_pipeline, ridge_param_grid, cv=5, n_iter=10, random_state=42)
tree_search = RandomizedSearchCV(tree_pipeline, tree_param_grid, cv=5, n_iter=10, random_state=42)

# Fit models
ridge_search.fit(X_train, y_train)
tree_search.fit(X_train, y_train)

## Best params
ridge_best_params = ridge_search.best_params_
tree_best_params = tree_search.best_params_

## Predicting accuracy on test set
ridge_score = ridge_search.score(X_test, y_test)
tree_score = tree_search.score(X_test, y_test)

ridge_best_params, ridge_score, tree_best_params, tree_score


({'ridge__alpha': np.float64(10.0)},
 0.6560417726146159,
 {'decisiontreeregressor__min_samples_split': 10,
  'decisiontreeregressor__min_samples_leaf': 4,
  'decisiontreeregressor__max_depth': 20},
 0.7942957692313539)