In [56]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_error
from scipy.stats import randint, uniform
import joblib

In [57]:
# Read the raw demand table; the file is decoded as ISO-8859-1 (the "encoding fix")
csv_path = 'demanddata.csv'
df = pd.read_csv(csv_path, encoding='ISO-8859-1')

In [58]:
# Show the header label followed by the current column names on the next line
print("Available Columns After Encoding:", df.columns.tolist(), sep="\n")

Available Columns After Encoding:
['Station Id', 'Date', 'Rented Bike Count', 'Hour', 'Temperature(è\x9a\x93)', 'Humidity(%)', 'Wind speed (m/s)', 'Visibility (10m)', 'Dew point temperature(è\x9a\x93)', 'Solar Radiation (MJ/m2)', 'Rainfall(mm)', 'Snowfall (cm)', 'Seasons', 'Holiday', 'Functioning Day']


In [62]:
# Map the raw CSV headers (including the two whose unit symbol decoded as
# stray bytes) onto plain snake_case-style identifiers.
column_renames = {
    "Station Id": "Station_Id",
    "Rented Bike Count": "Rented_Bike_Count",
    "Temperature(è\x9a\x93)": "Temperature",
    "Dew point temperature(è\x9a\x93)": "Dew_point_temperature",
    "Humidity(%)": "Humidity",
    "Wind speed (m/s)": "Wind_speed",
    "Visibility (10m)": "Visibility",
    "Solar Radiation (MJ/m2)": "Solar_Radiation",
    "Rainfall(mm)": "Rainfall",
    "Snowfall (cm)": "Snowfall",
    "Functioning Day": "Functioning_Day",
}
# Columns not listed in the mapping ('Date', 'Hour', 'Seasons', 'Holiday') keep their names.
df = df.rename(columns=column_renames)

In [63]:
# Convert the 'Date' strings (day/month/four-digit-year) into datetime values
date_format = '%d/%m/%Y'
df['Date'] = pd.to_datetime(df['Date'], format=date_format)

In [64]:
# Show the header label followed by the column names as they stand after renaming
print("Available Columns After Encoding:", df.columns.tolist(), sep="\n")

Available Columns After Encoding:
['Station_Id', 'Date', 'Rented_Bike_Count', 'Hour', 'Temperature', 'Humidity', 'Wind_speed', 'Visibility', 'Dew_point_temperature', 'Solar_Radiation', 'Rainfall', 'Snowfall', 'Seasons', 'Holiday', 'Functioning_Day']


In [65]:
# Derive calendar features from the parsed 'Date' column.
# Each new column name is paired with the pandas `.dt` attribute it is read from;
# the dict order fixes the order in which the columns are appended.
calendar_attributes = {
    'Year': 'year',
    'Month': 'month',
    'Day': 'day',
    'DayOfWeek': 'dayofweek',
}
for column_name, dt_attribute in calendar_attributes.items():
    df[column_name] = getattr(df['Date'].dt, dt_attribute)

In [66]:
# Replace each categorical column (station identifier included) with one
# indicator column per distinct value.
categorical_columns = ['Seasons', 'Holiday', 'Functioning_Day', 'Station_Id']
df = pd.get_dummies(df, columns=categorical_columns)

In [67]:
# Show the header label followed by the column names after one-hot encoding
print("Available Columns After Encoding:", df.columns.tolist(), sep="\n")

Available Columns After Encoding:
['Date', 'Rented_Bike_Count', 'Hour', 'Temperature', 'Humidity', 'Wind_speed', 'Visibility', 'Dew_point_temperature', 'Solar_Radiation', 'Rainfall', 'Snowfall', 'Year', 'Month', 'Day', 'DayOfWeek', 'Seasons_Autumn', 'Seasons_Spring', 'Seasons_Summer', 'Seasons_Winter', 'Holiday_Holiday', 'Holiday_No Holiday', 'Functioning_Day_No', 'Functioning_Day_Yes', 'Station_Id_1', 'Station_Id_2', 'Station_Id_3', 'Station_Id_4', 'Station_Id_5', 'Station_Id_6', 'Station_Id_7', 'Station_Id_8', 'Station_Id_9', 'Station_Id_10', 'Station_Id_11', 'Station_Id_12', 'Station_Id_13', 'Station_Id_14', 'Station_Id_15', 'Station_Id_16', 'Station_Id_17', 'Station_Id_18', 'Station_Id_19', 'Station_Id_20', 'Station_Id_21', 'Station_Id_22', 'Station_Id_23', 'Station_Id_24', 'Station_Id_25', 'Station_Id_26', 'Station_Id_27', 'Station_Id_28', 'Station_Id_29', 'Station_Id_30', 'Station_Id_31', 'Station_Id_32', 'Station_Id_33', 'Station_Id_34', 'Station_Id_35', 'Station_Id_36', 'Statio

In [68]:
# Build the model inputs (X) and the target (y).
# Numeric weather readings plus the hour and the derived calendar fields
numeric_features = [
    'Hour', 'Temperature', 'Humidity', 'Wind_speed', 'Visibility',
    'Dew_point_temperature', 'Solar_Radiation', 'Rainfall', 'Snowfall',
    'Year', 'Month', 'Day', 'DayOfWeek',
]
# Indicator columns produced by one-hot encoding season / holiday / functioning day
indicator_features = [
    'Seasons_Autumn', 'Seasons_Spring', 'Seasons_Summer', 'Seasons_Winter',
    'Holiday_Holiday', 'Holiday_No Holiday',
    'Functioning_Day_No', 'Functioning_Day_Yes',
]
# Every column whose name begins with 'Station_Id', in DataFrame column order
station_id_features = [name for name in df.columns if name.startswith('Station_Id')]

features = numeric_features + indicator_features + station_id_features

X = df[features]
y = df['Rented_Bike_Count']

In [69]:
# Hold out 20% of the rows for validation; random_state=42 seeds the shuffle
splits = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = splits


In [10]:
# Search space for the randomized hyperparameter search.
# The integer-valued settings are scipy `randint` distributions; 'max_features'
# is a plain list of candidate options.
param_dist = dict(
    n_estimators=randint(100, 300),
    max_depth=randint(10, 30),
    min_samples_split=randint(2, 10),
    min_samples_leaf=randint(1, 4),
    max_features=['sqrt', 'log2', None],
)

In [70]:
# Base (untuned) Random Forest regressor handed to the hyperparameter search;
# random_state=42 fixes the estimator's random seed.
rf = RandomForestRegressor(random_state=42)

In [71]:
# Configure the randomized search over `param_dist` around the base forest:
# 100 sampled configurations, each scored with 3-fold cross-validation,
# running on all available cores (n_jobs=-1) with per-fit progress messages.
random_search = RandomizedSearchCV(
    estimator=rf,
    param_distributions=param_dist,
    n_iter=100,
    cv=3,
    n_jobs=-1,
    verbose=2,
    random_state=42,
)


In [72]:
# Run the search. Only the training split is passed in, so the validation
# rows play no part in choosing hyperparameters.
random_search.fit(X_train, y_train)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


In [73]:
# Hyperparameter combination that the fitted search recorded as its best
# (exposed by RandomizedSearchCV as `best_params_`).
best_params = random_search.best_params_

In [74]:
# Take the tuned model straight from the search instead of training it again.
# The search was created without `refit=False`, so after `fit` it has already
# retrained the best configuration on all of X_train. That estimator is a clone
# of `rf` (random_state=42) with `best_params` applied, i.e. the same model a
# fresh RandomForestRegressor(**best_params, random_state=42).fit(X_train, y_train)
# would produce, without paying for a second full training run.
rf_best = random_search.best_estimator_
# Display the fitted estimator as the cell output
rf_best

In [75]:
# Generate predictions from the tuned forest for the held-out validation features
y_pred = rf_best.predict(X_val)

In [76]:
# Score the validation predictions against the true rental counts
mae = mean_absolute_error(y_val, y_pred)
r2 = r2_score(y_val, y_pred)
mse = mean_squared_error(y_val, y_pred)

# Report the chosen hyperparameters and each metric, one labelled line apiece
report_lines = [
    ("Best Hyperparameters: ", best_params),
    ("Mean Absolute Error: ", mae),
    ("R2 Score: ", r2),
    ("Mean Squared Error: ", mse),
]
for label, value in report_lines:
    print(label, value)

Best Hyperparameters:  {'max_depth': 24, 'max_features': None, 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 263}
Mean Absolute Error:  100.76876266710198
R2 Score:  0.9237076957341631
Mean Squared Error:  31786.901538468337


In [77]:
# Spot-check the first few validation predictions against the true counts.
# Slicing both sequences (rather than indexing 0..9 directly) shows at most ten
# pairs and simply prints fewer when the validation set is smaller, instead of
# raising IndexError.
print("Sample Predictions vs Actual:")
for predicted, actual in zip(y_pred[:10], y_val.iloc[:10]):
    print(f"Predicted: {predicted}, Actual: {actual}")

Sample Predictions vs Actual:
Predicted: 1759.026409258253, Actual: 1728
Predicted: 728.5244850734395, Actual: 822
Predicted: 672.7358229223247, Actual: 658
Predicted: 2382.7389884724485, Actual: 2716
Predicted: 748.5847275031686, Actual: 1083
Predicted: 777.5321609632446, Actual: 636
Predicted: 1714.1335475888716, Actual: 1537
Predicted: 722.2239739875678, Actual: 712
Predicted: 480.6325698593758, Actual: 425
Predicted: 503.36865381133435, Actual: 594


In [83]:
import joblib

# Serialize the tuned forest to a pickle file inside the sibling API directory
model_path = '../API/bike_demand_model.pkl'
joblib.dump(rf_best, model_path)

['../API/bike_demand_model.pkl']