# Importing libraries

Importing libraries for managing the data

In [5]:
import pandas as pd
import numpy as np

Library for splitting the data into training and testing data

In [6]:
from sklearn.model_selection import train_test_split

Importing library that allows us to use Extreme Gradiant Boosting Regression algorithm

In [7]:
from xgboost import XGBRegressor

Importing library that allows us to use a Random Forest Regression model

In [8]:
from sklearn.ensemble import RandomForestRegressor

Libraries for using tensorflow (neural network algorithm)

In [9]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras import Input
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.callbacks import LearningRateScheduler
from tensorflow.keras.layers import BatchNormalization

Prediction evaluation libraries

In [10]:
from sklearn.metrics import mean_squared_error, mean_squared_log_error

Library that allows us to scale the features of our data (important for neural networks)

In [11]:
from sklearn.preprocessing import StandardScaler

# Loading data

File "bikesRentedPerHourCombinedData.csv" contains hourly data from the autumn of 2010 to the autumn of 2024. It lists how many bike rides was done on a certain hour and also the weather conditions.

In [12]:
data = pd.read_csv("bikesRentedPerHourCombinedData.csv")

In [13]:
data

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,count
0,2010-09-20 11:00:00,2,0,1,1,23.1,21.2,34.0,13.0,2
1,2010-09-20 12:00:00,2,0,1,1,24.3,22.7,30.0,12.6,17
2,2010-09-20 13:00:00,2,0,1,1,25.1,23.5,29.0,12.6,11
3,2010-09-20 14:00:00,2,0,1,1,25.4,23.6,30.0,12.8,6
4,2010-09-20 15:00:00,2,0,1,1,25.6,23.2,30.0,13.0,12
...,...,...,...,...,...,...,...,...,...,...
123728,2024-10-31 19:00:00,3,0,1,1,21.8,20.6,65.0,18.8,1561
123729,2024-10-31 20:00:00,3,0,1,1,22.3,20.7,61.0,20.4,1249
123730,2024-10-31 21:00:00,3,0,1,1,21.9,20.2,61.0,20.4,1182
123731,2024-10-31 22:00:00,3,0,1,1,21.5,19.6,61.0,20.9,939


# Creating new columns

Converting *object* type "*datetime*" column into *datetime* type. This is important because then we can divide that column into more descriptive columns

In [14]:
data["datetime"] = pd.to_datetime(data["datetime"])

Splitting "*datetime*" column into "*year*", "*month*", "*day*" and "*hour*" columns

In [15]:
datetime = data["datetime"]

data["year"] = datetime.dt.year
data["month"] = datetime.dt.month
data["day"] = datetime.dt.day
data["hour"] = datetime.dt.hour

Creating a "*dayofweek*" column to improve the performance of the machine learning model

In [16]:
data["dayofweek"] = datetime.dt.dayofweek

In [17]:
data

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,count,year,month,day,hour,dayofweek
0,2010-09-20 11:00:00,2,0,1,1,23.1,21.2,34.0,13.0,2,2010,9,20,11,0
1,2010-09-20 12:00:00,2,0,1,1,24.3,22.7,30.0,12.6,17,2010,9,20,12,0
2,2010-09-20 13:00:00,2,0,1,1,25.1,23.5,29.0,12.6,11,2010,9,20,13,0
3,2010-09-20 14:00:00,2,0,1,1,25.4,23.6,30.0,12.8,6,2010,9,20,14,0
4,2010-09-20 15:00:00,2,0,1,1,25.6,23.2,30.0,13.0,12,2010,9,20,15,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
123728,2024-10-31 19:00:00,3,0,1,1,21.8,20.6,65.0,18.8,1561,2024,10,31,19,3
123729,2024-10-31 20:00:00,3,0,1,1,22.3,20.7,61.0,20.4,1249,2024,10,31,20,3
123730,2024-10-31 21:00:00,3,0,1,1,21.9,20.2,61.0,20.4,1182,2024,10,31,21,3
123731,2024-10-31 22:00:00,3,0,1,1,21.5,19.6,61.0,20.9,939,2024,10,31,22,3


# Removing unnecessary columns

Removing "*datetime*" column, since machine learning models usually can't read it. We already converted and divided it into appropriate columns.

In [18]:
data_modified = data.drop("datetime", axis=1)

Removing "*day*" column because it causes overfitting. Instead "*holiday*", "*workingday*" and "*dayofweek*" columns are much better to describe the daily data.

In [19]:
data_modified.drop("day", inplace=True, axis=1)

We removed early years (2010–2012) due to the low volume of bike rides compared to the rest of the dataset. Additionally, we excluded the pandemic years (2020–2021) as they experienced significant movement restrictions due to COVID-19, which could confuse the model. If future predictions include scenarios with similar restrictions in Los Angeles, we should consider adding a new feature, such as "*is_moving_restricted*", to capture this context.

In [20]:
years_to_remove = [2010, 2011, 2012, 2020, 2021]
data_modified = data_modified[~data_modified['year'].isin(years_to_remove)]

# Modifying "*year*" column

The model's performance is unaffected by whether the years start from 2013 or 0. For simplicity, we adjusted the dataset so that the year starts from 0 by subtracting 2013.

In [21]:
data_modified["year"] = data_modified["year"] - 2013

# Splitting data for training and testing

Splitting data into features (X) and target (y)

In [22]:
X = data_modified.drop("count", axis=1)
y = data_modified["count"]

Splitting data into training and testing sets. Training set is for training the models and testing set is for evaluating the models

In [23]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Training an extreme gradiant boosting (XGBoost) regression model

Calculating the mean of the y_target. This is the base score that the XGBoost regression model starts predicting from

In [24]:
base_score = np.mean(y_train)

Creating the model and training it

In [25]:
xgb_model = XGBRegressor(objective="count:poisson", base_score=base_score, random_state=0)
xgb_model.fit(X_train, y_train)

# Evaluating the XGBoost model

Calculating variance

In [26]:
variance = np.var(y_test)
print(f"Variance: {variance}")

Variance: 165408.03545859834


Calculating a baseline prediction

In [27]:
baseline_pred = np.full_like(y_test, np.mean(y_train))
baseline_mse = mean_squared_error(y_test, baseline_pred)
print(f"Baseline mean squared error: {baseline_mse}")

Baseline mean squared error: 165412.25990601612


Predicting test data

In [28]:
y_pred = xgb_model.predict(X_test)

Calculating model's mean squared error

In [29]:
mse = mean_squared_error(y_test, y_pred)
print(f"Model's mean squared error: {mse}")

Model's mean squared error: 9505.974453519078


Considering the variance of the test set and the baseline mean squared error, the model's mean squared error is significantly lower, indicating that the model performs satisfactorily and provides meaningful predictions compared to the baseline.

Calculating model's mean squared logarithmic error

In [30]:
msle = mean_squared_log_error(y_test, y_pred)
print(f"Model's mean squared logarithmic error: {msle}")

Model's mean squared logarithmic error: 0.15937568466626098


Calculating model's root mean squared logarithmic error

In [31]:
rmsle = np.sqrt(msle)
print(f"Model's root mean squared logarithmic error: {rmsle}")

Model's root mean squared logarithmic error: 0.39921884307514965


# Preparing data for random forest regression model

One hot encoding categorical columns

In [32]:
one_hot_encoded_columns = ["season", "weather", "dayofweek"]

for feature in one_hot_encoded_columns:
    one_hot_encoded_feature = pd.get_dummies(data_modified[feature], prefix=feature)
    data_modified = pd.concat([data_modified, one_hot_encoded_feature], axis=1)
    data_modified.drop(feature, inplace=True, axis=1)

# Splitting data for training and testing

Splitting data into features (X) and target (y)

In [661]:
X = data_modified.drop("count", axis=1)
y = data_modified["count"]

Splitting data into training and testing sets. Training set is for training the models and testing set is for evaluating the models

In [662]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Training a random forest regression model

Creating a model and training it. We also used gridsearch for hyperparameter tuning (in *kaggle_model.ipynb*), but found best results by just using default parameters.

In [663]:
rf = RandomForestRegressor(random_state=0, n_jobs=-1)
rf.fit(X_train, y_train)

# Evaluating the random forest regression model

Predicting test data

In [664]:
y_pred = rf.predict(X_test)

Calculating model's mean squared error

In [665]:
mse = mean_squared_error(y_test, y_pred)
print(f"Model's mean squared error: {mse}")

Model's mean squared error: 10251.710185014794


Calculating model's mean squared logarithmic error

In [666]:
msle = mean_squared_log_error(y_test, y_pred)
print(f"Model's mean squared logarithmic error: {msle}")

Model's mean squared logarithmic error: 0.1596628873445851


Calculating model's root mean squared logarithmic error

In [667]:
rmsle = np.sqrt(msle)
print(f"Model's root mean squared logarithmic error: {rmsle}")

Model's root mean squared logarithmic error: 0.3995783869838121


# Training a tensorflow model

In [668]:
X_tf = X.copy()

We applied a cyclical transformation to the "*month*" and "*hour*" columns to capture their periodic nature. This is important because certain values, such as hours 23 and 0, or months December and January, are as close to each other as any other consecutive values. Without this transformation, their relationship would not be properly represented in the model.

The transformation uses sine and cosine functions to map these cyclical values onto a continuous circle, ensuring the model can interpret their periodicity. The original month and hour columns were dropped after the transformation.

In [669]:
X_tf["hour_sin"] = np.sin(2 * np.pi * X_tf["hour"] / 24)
X_tf["hour_cos"] = np.cos(2 * np.pi * X_tf["hour"] / 24)

X_tf["month_sin"] = np.sin(2 * np.pi * X_tf["month"] / 12)
X_tf["month_cos"] = np.cos(2 * np.pi * X_tf["month"] / 12)

In [670]:
X_tf.drop(["month", "hour"], inplace=True, axis=1)

We scaled the features to standardize them with a mean of 0 and a standard deviation of 1. This is essential for TensorFlow models, as neural networks are sensitive to the scale of input data. Without scaling, training can be slower, less stable, and biased toward features with larger magnitudes.

In [671]:
scaler = StandardScaler()
X_tf = scaler.fit_transform(X_tf)

In [672]:
y_tf = np.array(y).reshape(-1, 1)

Splitting data into training and testing sets

In [673]:
X_train, X_test, y_train, y_test = train_test_split(X_tf, y_tf, test_size=0.2, random_state=0)

Creating the model. To ensure predictions are always 0 or positive, we use an exponential activation function in the final layer. The model includes multiple dense layers with ReLU activation, batch normalization for stability, and dropout for regularization.

In [674]:
model = Sequential([
    Input(shape=(X_train.shape[1],)),
    Dense(128, activation='relu'),
    BatchNormalization(),
    Dropout(0.2),
    Dense(64, activation="relu"),
    BatchNormalization(),
    Dense(32, activation='relu'),
    BatchNormalization(),
    Dense(16, activation='relu'),
    Dense(1, activation='exponential')
])

In [675]:
model.compile(optimizer='adam', loss='mean_squared_error', metrics=['mae'])

Creating an early stopping function to prevent overfitting during training

In [676]:
early_stopping = EarlyStopping(monitor='val_loss', patience=20, restore_best_weights=True)

Creating a function to reduce learning rate every 10 epochs.

In [677]:
def lr_scheduler(epoch, lr):
    if epoch % 10 == 0 and epoch > 0:
        lr = lr * 0.9  # Reduce by 10% every 10 epochs
    return lr

lr_schedule = LearningRateScheduler(lr_scheduler)

Training the model

In [678]:
history = model.fit(
    X_train, y_train,
    epochs=200,
    batch_size=32,
    validation_split=0.2,
    callbacks=[early_stopping, lr_schedule],
    verbose=1
)

Epoch 1/200
[1m1724/1724[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 5ms/step - loss: 226601.3125 - mae: 324.5024 - val_loss: 65836.0156 - val_mae: 161.4161 - learning_rate: 0.0010
Epoch 2/200
[1m1724/1724[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 3ms/step - loss: 63001.7852 - mae: 164.8329 - val_loss: 29298.2363 - val_mae: 109.4128 - learning_rate: 0.0010
Epoch 3/200
[1m1724/1724[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 2ms/step - loss: 43368.0820 - mae: 137.0326 - val_loss: 26669.5000 - val_mae: 104.7171 - learning_rate: 0.0010
Epoch 4/200
[1m1724/1724[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 3ms/step - loss: 36863.9531 - mae: 125.7334 - val_loss: 21252.8828 - val_mae: 92.7118 - learning_rate: 0.0010
Epoch 5/200
[1m1724/1724[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 3ms/step - loss: 31903.7891 - mae: 116.8325 - val_loss: 19546.7930 - val_mae: 89.1735 - learning_rate: 0.0010
Epoch 6/200
[1m1724/1724[0m [32m━━━━━━━━━━

# Evaluating the TensorFlow model

Predicting test data

In [679]:
y_pred = model.predict(X_test)

[1m539/539[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step


Calculating model's mean squared error

In [680]:
mse = mean_squared_error(y_test, y_pred)
print(f"Model's mean squared error: {mse}")

Model's mean squared error: 9377.793409113228


Calculating model's mean squared logarithmic error

In [681]:
msle = mean_squared_log_error(y_test, y_pred)
print(f"Model's mean squared logarithmic error: {msle}")

Model's mean squared logarithmic error: 0.20799011207647666


Calculating model's root mean squared logarithmic error

In [682]:
rmsle = np.sqrt(msle)
print(f"Model's root mean squared logarithmic error: {rmsle}")

Model's root mean squared logarithmic error: 0.45605932955754414
