In [1]:
import os

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_log_error, mean_absolute_error, r2_score, mean_squared_error
from sklearn.model_selection import train_test_split, RandomizedSearchCV


In [2]:
# Read the raw CSV using pandas' default parsing.
# NOTE: the next cell reads the same file again (with timestamp parsing) and rebinds `df`.
df = pd.read_csv(filepath_or_buffer="bike_data.csv")

In [3]:
# Read the CSV with `timestamp` parsed as datetimes, then order the rows
# chronologically (oldest first). The sorted frame is bound back to `df`.
df = pd.read_csv(
    'bike_data.csv',
    low_memory=False,
    parse_dates=['timestamp'],
).sort_values(by=['timestamp'], ascending=True)

In [4]:
# Derive calendar features from the parsed `timestamp` column.
# `assign` returns a new frame, so `df` itself is left untouched.
df_tmp = df.assign(
    Year=df['timestamp'].dt.year,
    Month=df['timestamp'].dt.month,
    Day=df['timestamp'].dt.day,
    DayOfWeek=df['timestamp'].dt.dayofweek,
    Hour=df['timestamp'].dt.hour,
)

In [5]:
# Model inputs (X) and regression target (y), both taken from the feature frame.
# Note that `Year` is derived above but is not part of the input columns.
X = df_tmp.loc[:, [
    'station_id',
    'Hour',
    'Month',
    'Day',
    'DayOfWeek',
    'weather_sunny',
    'weather_rainy',
    'holidays_public_holiday',
]]
y = df_tmp.loc[:, 'demand']

In [6]:

# Hold out 20% of the rows for validation; the fixed seed keeps the split repeatable.
# NOTE(review): train_test_split shuffles by default, so validation rows are
# interleaved in time with training rows — confirm that is intended for
# timestamp-ordered data.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [8]:
# Hyperparameter tuning: randomized search over random-forest settings.
# Each key is a RandomForestRegressor parameter; each value is the pool of
# candidates that the search samples from.
rf_grid = {
    'n_estimators': np.arange(10, 100, 10),      # 10, 20, ..., 90
    'max_depth': [None, 3, 5, 10],
    'min_samples_split': np.arange(2, 20, 2),    # 2, 4, ..., 18
    'min_samples_leaf': np.arange(1, 20, 2),     # 1, 3, ..., 19
    # NOTE(review): the integer 1 means "one feature per split", whereas 1.0
    # would mean "all features" — confirm which was intended.
    'max_features': [0.5, 1, "sqrt"],
    # NOTE(review): presumably caps the rows drawn per tree to speed up the
    # search; every CV training fold must then hold at least 5000 rows — verify.
    'max_samples': [5000]
}

# 50 sampled candidates x 5 folds = 250 fits.
# `random_state` on the search itself seeds the candidate sampling; without it
# a different set of 50 combinations is drawn on every run, so the result is
# not reproducible even though the estimator is seeded.
rs_model = RandomizedSearchCV(
    RandomForestRegressor(n_jobs=-1, random_state=42),
    param_distributions=rf_grid,
    n_iter=50,
    cv=5,
    random_state=42,
    verbose=True,
)

rs_model.fit(X_train, y_train)

Fitting 5 folds for each of 50 candidates, totalling 250 fits


In [8]:
# Save the fitted search object (the whole RandomizedSearchCV, not only its
# best estimator) to disk.
MODEL_PATH = './bike_demand_prediction/bike_demand_rs_model.pkl'

# Create the target directory first: joblib.dump does not create missing
# parent directories, so on a fresh checkout the dump would otherwise fail
# after the whole search has already run.
os.makedirs(os.path.dirname(MODEL_PATH), exist_ok=True)

joblib.dump(rs_model, MODEL_PATH)

['./bike_demand_prediction/bike_demand_rs_model.pkl']