In [34]:
import numpy as np
import matplotlib.pyplot as plt
import matplotlib
import pandas as pd
import seaborn as sns

from sklearn import tree
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

In [35]:
# Load the training set and turn the text label into a boolean target:
# 'high_bike_demand' -> True, 'low_bike_demand' -> False.
demand_to_bool = {'high_bike_demand': True, 'low_bike_demand': False}

data = pd.read_csv('training_data_VT2026.csv')
data['increase_stock'] = data['increase_stock'].map(demand_to_bool)


#Heat index
def heat_index(T, R):
    """
    Compute the heat index with the NWS Rothfusz regression.

    The regression is defined for temperatures in degrees Fahrenheit, so the
    input is converted from Celsius, the regression is evaluated, and the
    result is converted back to Celsius.

    T: air temperature in °C (scalar, NumPy array or pandas Series)
    R: relative humidity in % (same shape as T, or a scalar)
    Returns: heat index in °C, with the same shape as the inputs

    Note: the regression is only meaningful for warm, humid conditions
    (resulting heat index of roughly 80 °F / 26.7 °C or more); outside that
    range the returned values should not be interpreted physically.
    """
    # Rothfusz coefficients are calibrated for °F.
    T_f = T * 9 / 5 + 32

    HI_f = (
        -42.379
        + 2.04901523 * T_f
        + 10.14333127 * R
        - 0.22475541 * T_f * R
        - 0.00683783 * T_f**2
        - 0.05481717 * R**2
        + 0.00122874 * T_f**2 * R
        + 0.00085282 * T_f * R**2
        - 0.00000199 * T_f**2 * R**2
    )

    # Back to °C so the feature is on the same scale as the input temperature.
    return (HI_f - 32) * 5 / 9

#data['heat_index'] = heat_index(data['temp'], data['humidity'])


# Australian apparent temperature (Steadman formula)
# e = water vapour pressure in hPa:
#     e = rh/100 * 6.105 * exp(17.27*Ta / (237.7 + Ta))
# The exponent coefficient is 17.27 (it was previously mistyped as 1.27).
# Assumes 'temp' is in °C and 'humidity' in % - TODO confirm against the dataset.
e = data['humidity']*0.01*6.105*np.exp((17.27*data['temp'])/(237.7 + data['temp']))

# AT = Australian apparent temperature = Ta + 0.33*e - 0.70*ws - 4.00
# NOTE(review): the formula expects wind speed in m/s; check the unit of
# 'windspeed' before enabling this feature.
#data['AT'] = data['temp'] + 0.33*e - 0.7*data['windspeed'] - 4

In [36]:
# Optimized features
def optimized_features(data):
    """
    Return a copy of `data` extended with engineered time features.

    The input frame is not modified. It must contain the columns
    'hour_of_day', 'day_of_week' and 'month'.

    Added columns:
      is_daytime  1 when 6 <= hour_of_day <= 22, else 0
      rush_hour   1 when hour_of_day is in 7-9 or 16-19 (inclusive), else 0
      hour_sin, hour_cos    cyclic encoding of hour_of_day (period 24)
      day_sin, day_cos      cyclic encoding of day_of_week (period 7)
      month_sin, month_cos  cyclic encoding of month - 1 (period 12)
    """
    df_feat = data.copy()
    hour = df_feat['hour_of_day']

    df_feat['is_daytime'] = ((hour >= 6) & (hour <= 22)).astype(int)

    morning_rush = (hour >= 7) & (hour <= 9)
    evening_rush = (hour >= 16) & (hour <= 19)
    df_feat['rush_hour'] = (morning_rush | evening_rush).astype(int)

    # Cyclic encoding for time variables: sine and cosine together place each
    # value on a circle, so e.g. hour 23 ends up next to hour 0.
    df_feat['hour_sin'] = np.sin(2*np.pi*hour / 24)
    df_feat['hour_cos'] = np.cos(2*np.pi*hour / 24)

    # The cosine used to be written to 'day_sin' as well, which overwrote the
    # sine and left the encoding without a 'day_cos' column.
    df_feat['day_sin'] = np.sin(2*np.pi*df_feat['day_of_week'] / 7)
    df_feat['day_cos'] = np.cos(2*np.pi*df_feat['day_of_week'] / 7)

    df_feat['month_sin'] = np.sin(2*np.pi*(df_feat['month'] - 1) / 12)
    df_feat['month_cos'] = np.cos(2*np.pi*(df_feat['month'] - 1) / 12)

    return df_feat

# Feature engineering is currently switched off: `df` is the frame exactly as
# prepared in `data`. Swap the two lines below to use the engineered features.
#df = optimized_features(data)
df = data  # plain alias, not a copy: `df` and `data` are the same object

# Optional follow-up when the engineered features are enabled: drop the raw
# time columns that the cyclic encodings replace.
#df = df.drop(columns = ['hour_of_day', 'day_of_week', 'month'])


In [37]:
# One seed for both the split and the forest, so the run is repeatable.
state = 1

# Hold out 20 % of the rows for the final evaluation.
train, test = train_test_split(df, test_size=0.2, random_state=state)

# Separate the predictors from the 'increase_stock' target in each partition.
X_train, y_train = train.drop(columns=['increase_stock']), train['increase_stock']
X_test, y_test = test.drop(columns=['increase_stock']), test['increase_stock']

# Baseline: a 100-tree random forest with otherwise default settings.
model = RandomForestClassifier(n_estimators=100, random_state=state)
model.fit(X_train, y_train)
y_predict = model.predict(X_test)

# Report held-out accuracy and the per-class precision / recall / F1.
print("Accuracy:", accuracy_score(y_test, y_predict))
#pd.crosstab(y_predict, y_test)
print(classification_report(y_test, y_predict))

Accuracy: 0.909375
              precision    recall  f1-score   support

       False       0.93      0.96      0.95       270
        True       0.76      0.62      0.68        50

    accuracy                           0.91       320
   macro avg       0.84      0.79      0.81       320
weighted avg       0.90      0.91      0.91       320



Now with grid search hyperparameter optimization

Best test accuracy so far (0.919, versus 0.909 for the untuned random forest above).

In [38]:
from sklearn.model_selection import GridSearchCV


# Exhaustive search over 3 x 3 x 3 = 27 random-forest configurations.
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[None, 5, 10],
    min_samples_split=[2, 5, 10],
)

model = RandomForestClassifier(random_state=1)

# Every candidate is scored by 5-fold cross-validated F1 on the training split.
grid = GridSearchCV(estimator=model, param_grid=param_grid, scoring='f1', cv=5)
grid.fit(X_train, y_train)

# Evaluate the winning configuration on the held-out test split.
best_model = grid.best_estimator_
y_predict = best_model.predict(X_test)

print("Best parameters:", grid.best_params_)
# Note: best_score_ is the mean cross-validated F1 (scoring='f1'), not accuracy.
#print("Best accuracy:", grid.best_score_)
print("Accuracy:", accuracy_score(y_test, y_predict))
print(classification_report(y_test, y_predict))



Best parameters: {'max_depth': None, 'min_samples_split': 2, 'n_estimators': 200}
Accuracy: 0.91875
              precision    recall  f1-score   support

       False       0.93      0.97      0.95       270
        True       0.82      0.62      0.70        50

    accuracy                           0.92       320
   macro avg       0.87      0.80      0.83       320
weighted avg       0.91      0.92      0.91       320



Now with random search hyperparameter optimization

In [39]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint

# Integer ranges to sample from; scipy's randint excludes the upper bound.
param_dist = dict(
    n_estimators=randint(50, 300),
    max_depth=randint(3, 20),
    min_samples_split=randint(2, 10),
)

model = RandomForestClassifier(random_state=1)

# Draw 40 random configurations, each scored by 5-fold cross-validated F1.
random_search = RandomizedSearchCV(
    estimator=model,
    param_distributions=param_dist,
    n_iter=40,
    scoring='f1',
    cv=5,
    random_state=1,
)
random_search.fit(X_train, y_train)

# Evaluate the winning configuration on the held-out test split.
best_model = random_search.best_estimator_
y_predict = best_model.predict(X_test)

print("Best parameters:", random_search.best_params_)
# Note: best_score_ is the mean cross-validated F1 (scoring='f1'), not accuracy.
#print("Best accuracy:", random_search.best_score_)
print("Accuracy:", accuracy_score(y_test, y_predict))
print(classification_report(y_test, y_predict))

Best parameters: {'max_depth': 14, 'min_samples_split': 6, 'n_estimators': 80}
Accuracy: 0.90625
              precision    recall  f1-score   support

       False       0.93      0.97      0.95       270
        True       0.76      0.58      0.66        50

    accuracy                           0.91       320
   macro avg       0.84      0.77      0.80       320
weighted avg       0.90      0.91      0.90       320

