# Importing libraries

Importing libraries for managing the data

In [43]:
import pandas as pd
import numpy as np

Importing the library that provides the Extreme Gradient Boosting (XGBoost) regression algorithm

In [44]:
from xgboost import XGBRegressor

Importing library that allows us to use a Random Forest Regression model

In [45]:
from sklearn.ensemble import RandomForestRegressor

Libraries for using tensorflow (neural network algorithm)

In [96]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras import Input
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.callbacks import LearningRateScheduler
from tensorflow.keras.layers import BatchNormalization

Prediction evaluation libraries

In [47]:
from sklearn.metrics import mean_squared_error

Library that allows us to scale the features of our data (important for neural networks)

In [48]:
from sklearn.preprocessing import StandardScaler

# Loading data

File "*kaggleData/train.csv*" contains hourly data from 2011 to 2012 (both inclusive) for training the machine learning models. Each row lists how many bikes were rented during a given hour, together with the weather conditions for that hour. Part of the data has been held out in the "*kaggleData/test.csv*" file, where the hourly bike rental counts have been removed. Predictions must be made for this held-out data and then submitted to the Kaggle competition.

In [49]:
# Labelled hourly records used to fit every model in this notebook.
kaggle_training_data = pd.read_csv("kaggleData/train.csv")
# Unlabelled records (no "count" column) for which predictions are submitted.
X_test = pd.read_csv("kaggleData/test.csv")

In [50]:
kaggle_training_data

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count
0,2011-01-01 00:00:00,1,0,0,1,9.84,14.395,81,0.0000,3,13,16
1,2011-01-01 01:00:00,1,0,0,1,9.02,13.635,80,0.0000,8,32,40
2,2011-01-01 02:00:00,1,0,0,1,9.02,13.635,80,0.0000,5,27,32
3,2011-01-01 03:00:00,1,0,0,1,9.84,14.395,75,0.0000,3,10,13
4,2011-01-01 04:00:00,1,0,0,1,9.84,14.395,75,0.0000,0,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...
10881,2012-12-19 19:00:00,4,0,1,1,15.58,19.695,50,26.0027,7,329,336
10882,2012-12-19 20:00:00,4,0,1,1,14.76,17.425,57,15.0013,10,231,241
10883,2012-12-19 21:00:00,4,0,1,1,13.94,15.910,61,15.0013,4,164,168
10884,2012-12-19 22:00:00,4,0,1,1,13.94,17.425,61,6.0032,12,117,129


In [51]:
X_test

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed
0,2011-01-20 00:00:00,1,0,1,1,10.66,11.365,56,26.0027
1,2011-01-20 01:00:00,1,0,1,1,10.66,13.635,56,0.0000
2,2011-01-20 02:00:00,1,0,1,1,10.66,13.635,56,0.0000
3,2011-01-20 03:00:00,1,0,1,1,10.66,12.880,56,11.0014
4,2011-01-20 04:00:00,1,0,1,1,10.66,12.880,56,11.0014
...,...,...,...,...,...,...,...,...,...
6488,2012-12-31 19:00:00,1,0,1,2,10.66,12.880,60,11.0014
6489,2012-12-31 20:00:00,1,0,1,2,10.66,12.880,60,11.0014
6490,2012-12-31 21:00:00,1,0,1,1,10.66,12.880,60,11.0014
6491,2012-12-31 22:00:00,1,0,1,1,10.66,13.635,56,8.9981


# Creating new columns

Converting object type "datetime" column into datetime type. This is important because then we can divide that column into more descriptive columns

In [52]:
# Parse the timestamp strings so the .dt accessor can be used on both frames.
for frame in (kaggle_training_data, X_test):
    frame["datetime"] = pd.to_datetime(frame["datetime"])

Splitting "datetime" column into "year", "month", "day" and "hour" columns

In [53]:
# Add one integer column per calendar component, identically for train and test.
for frame in (kaggle_training_data, X_test):
    timestamps = frame["datetime"].dt
    for component in ("year", "month", "day", "hour"):
        frame[component] = getattr(timestamps, component)

Creating a "dayofweek" column to improve the performance of the machine learning models

In [54]:
# Weekday index taken from the timestamp (2011-01-01, a Saturday, maps to 5).
for frame in (kaggle_training_data, X_test):
    frame["dayofweek"] = frame["datetime"].dt.dayofweek

# Modifying "year" column

The model's performance is unaffected by whether the years start from 2011 or 0. For simplicity, we shifted the "year" column so that it starts from 0 by subtracting 2011 (2011 becomes 0 and 2012 becomes 1).

In [55]:
# Shift the calendar year so that 2011 maps to 0 and 2012 maps to 1.
# Not idempotent: running this cell twice shifts the values a second time.
for frame in (kaggle_training_data, X_test):
    frame["year"] = frame["year"] - 2011

# Removing unnecessary columns

Removing the "casual" and "registered" columns from the training set, since they are used neither as features nor as target values (they are also absent from the test set)

In [56]:
# Keep only the columns that are either model features or the "count" target.
kaggle_training_data = kaggle_training_data.drop(columns=["casual", "registered"])

Saving the "datetime" column of the test set to later on use it in submission file

In [57]:
# Keep the timestamps before the column is dropped from X_test;
# results_to_file() reads this notebook-level name when building a submission.
submission_datetime = X_test["datetime"]

Removing "datetime" column, since machine learning models usually can't read it. We already converted and divided it into appropriate columns.

In [58]:
# The raw timestamp is no longer needed: its components already exist as columns.
kaggle_training_data = kaggle_training_data.drop(columns="datetime")
X_test = X_test.drop(columns="datetime")

Removing the "month" column, since the model produces better results without it

In [59]:
# Build the model-specific frames without "month"; the source frames stay untouched.
kaggle_training_data_xgb, X_test_xgb = (
    frame.drop(columns=["month"]) for frame in (kaggle_training_data, X_test)
)

In [60]:
kaggle_training_data_xgb

Unnamed: 0,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,count,year,day,hour,dayofweek
0,1,0,0,1,9.84,14.395,81,0.0000,16,0,1,0,5
1,1,0,0,1,9.02,13.635,80,0.0000,40,0,1,1,5
2,1,0,0,1,9.02,13.635,80,0.0000,32,0,1,2,5
3,1,0,0,1,9.84,14.395,75,0.0000,13,0,1,3,5
4,1,0,0,1,9.84,14.395,75,0.0000,1,0,1,4,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...
10881,4,0,1,1,15.58,19.695,50,26.0027,336,1,19,19,2
10882,4,0,1,1,14.76,17.425,57,15.0013,241,1,19,20,2
10883,4,0,1,1,13.94,15.910,61,15.0013,168,1,19,21,2
10884,4,0,1,1,13.94,17.425,61,6.0032,129,1,19,22,2


In [61]:
X_test_xgb

Unnamed: 0,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,year,day,hour,dayofweek
0,1,0,1,1,10.66,11.365,56,26.0027,0,20,0,3
1,1,0,1,1,10.66,13.635,56,0.0000,0,20,1,3
2,1,0,1,1,10.66,13.635,56,0.0000,0,20,2,3
3,1,0,1,1,10.66,12.880,56,11.0014,0,20,3,3
4,1,0,1,1,10.66,12.880,56,11.0014,0,20,4,3
...,...,...,...,...,...,...,...,...,...,...,...,...
6488,1,0,1,2,10.66,12.880,60,11.0014,1,31,19,0
6489,1,0,1,2,10.66,12.880,60,11.0014,1,31,20,0
6490,1,0,1,1,10.66,12.880,60,11.0014,1,31,21,0
6491,1,0,1,1,10.66,13.635,56,8.9981,1,31,22,0


# Splitting training data into features (X) and target (y)

Splitting the training data so we could use it to train the model

In [68]:
# Target: hourly rental count. Features: every remaining column.
y_train_xgb = kaggle_training_data_xgb["count"]
X_train_xgb = kaggle_training_data_xgb.drop(columns=["count"])

# Training an extreme gradient boosting (XGBoost) regression model

Creating a model and training it. We also used grid search for hyperparameter tuning (in kaggle_model.ipynb), but found the best results by keeping the default hyperparameters and only setting the objective to "reg:gamma".

In [69]:
# All hyperparameters are left at their defaults except the objective and the seed.
# NOTE(review): XGBoost documents "reg:gamma" as gamma regression with a log link,
# which presumably requires strictly positive targets — confirm "count" is never 0.
xgb_model = XGBRegressor(objective="reg:gamma", random_state=0)
# fit() is the last expression of the cell, so the fitted estimator is displayed.
xgb_model.fit(X_train_xgb, y_train_xgb)

# Predicting the test results and writing them into a file

Predicting test data

In [72]:
# Predicted hourly counts for the test rows, in the same row order as X_test_xgb.
y_test_xgb = xgb_model.predict(X_test_xgb)

Creating a function to save the test predictions into a submission file

In [79]:
def results_to_file(y_test, file_name, datetimes=None, output_dir="./submissions/"):
    """Write predictions to a submission CSV with "datetime" and "count" columns.

    Args:
        y_test: Predicted counts, one per test row. A NumPy column vector of
            shape (n, 1) is flattened to shape (n,) before use.
        file_name: Name of the CSV file to create, without the ".csv" extension.
        datetimes: Timestamps matching the predictions row by row. Defaults to
            the notebook-level ``submission_datetime``.
        output_dir: Directory the file is written to. It is created when it
            does not exist yet.
    """
    import os  # local import: the notebook's import cells do not bring in os

    if datetimes is None:
        datetimes = submission_datetime

    # Accept an (n, 1) array directly so callers do not have to flatten it themselves.
    if isinstance(y_test, np.ndarray) and y_test.ndim == 2 and y_test.shape[1] == 1:
        y_test = y_test.ravel()

    prediction = pd.DataFrame({
        "datetime": datetimes,
        "count": y_test
    })

    # to_csv() does not create missing directories, so make sure the target exists.
    os.makedirs(output_dir, exist_ok=True)
    prediction.to_csv(os.path.join(output_dir, file_name + ".csv"), index=False)

Saving the results into a file

In [80]:
# Writes ./submissions/submission_xgb_gamma_16.csv from the XGBoost predictions.
results_to_file(y_test_xgb, "submission_xgb_gamma_16")

Competition score (Root Mean Squared Logarithmic Error): 0.39899

# Preparing data for random forest regression model

One hot encoding categorical features

In [84]:
X_train_rfr = X_train_xgb.copy()
y_train_rfr = y_train_xgb.copy()

X_test_rfr = X_test_xgb.copy()

one_hot_encoded_columns = ["season", "weather", "dayofweek"]

# Replace each categorical column with one indicator column per category,
# named "<column>_<value>" and appended after the untouched columns.
X_train_rfr = pd.get_dummies(X_train_rfr, columns=one_hot_encoded_columns)
X_test_rfr = pd.get_dummies(X_test_rfr, columns=one_hot_encoded_columns)

# The two frames are encoded independently, so a category missing from one of
# them would produce different column sets. Force the test frame onto the exact
# columns (and column order) the model is trained on: indicators absent from the
# test data are added as all-False, and test-only indicators are dropped.
X_test_rfr = X_test_rfr.reindex(columns=X_train_rfr.columns, fill_value=False)

In [85]:
X_train_rfr

Unnamed: 0,holiday,workingday,temp,atemp,humidity,windspeed,year,day,hour,season_1,...,weather_2,weather_3,weather_4,dayofweek_0,dayofweek_1,dayofweek_2,dayofweek_3,dayofweek_4,dayofweek_5,dayofweek_6
0,0,0,9.84,14.395,81,0.0000,0,1,0,True,...,False,False,False,False,False,False,False,False,True,False
1,0,0,9.02,13.635,80,0.0000,0,1,1,True,...,False,False,False,False,False,False,False,False,True,False
2,0,0,9.02,13.635,80,0.0000,0,1,2,True,...,False,False,False,False,False,False,False,False,True,False
3,0,0,9.84,14.395,75,0.0000,0,1,3,True,...,False,False,False,False,False,False,False,False,True,False
4,0,0,9.84,14.395,75,0.0000,0,1,4,True,...,False,False,False,False,False,False,False,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10881,0,1,15.58,19.695,50,26.0027,1,19,19,False,...,False,False,False,False,False,True,False,False,False,False
10882,0,1,14.76,17.425,57,15.0013,1,19,20,False,...,False,False,False,False,False,True,False,False,False,False
10883,0,1,13.94,15.910,61,15.0013,1,19,21,False,...,False,False,False,False,False,True,False,False,False,False
10884,0,1,13.94,17.425,61,6.0032,1,19,22,False,...,False,False,False,False,False,True,False,False,False,False


In [86]:
X_test_rfr

Unnamed: 0,holiday,workingday,temp,atemp,humidity,windspeed,year,day,hour,season_1,...,weather_2,weather_3,weather_4,dayofweek_0,dayofweek_1,dayofweek_2,dayofweek_3,dayofweek_4,dayofweek_5,dayofweek_6
0,0,1,10.66,11.365,56,26.0027,0,20,0,True,...,False,False,False,False,False,False,True,False,False,False
1,0,1,10.66,13.635,56,0.0000,0,20,1,True,...,False,False,False,False,False,False,True,False,False,False
2,0,1,10.66,13.635,56,0.0000,0,20,2,True,...,False,False,False,False,False,False,True,False,False,False
3,0,1,10.66,12.880,56,11.0014,0,20,3,True,...,False,False,False,False,False,False,True,False,False,False
4,0,1,10.66,12.880,56,11.0014,0,20,4,True,...,False,False,False,False,False,False,True,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6488,0,1,10.66,12.880,60,11.0014,1,31,19,True,...,True,False,False,True,False,False,False,False,False,False
6489,0,1,10.66,12.880,60,11.0014,1,31,20,True,...,True,False,False,True,False,False,False,False,False,False
6490,0,1,10.66,12.880,60,11.0014,1,31,21,True,...,False,False,False,True,False,False,False,False,False,False
6491,0,1,10.66,13.635,56,8.9981,1,31,22,True,...,False,False,False,True,False,False,False,False,False,False


# Training a random forest regression model

Creating a model and training it. We also used gridsearch for hyperparameter tuning (in kaggle_model.ipynb), but found best results by just using default parameters.

In [87]:
# Default hyperparameters; random_state pins the seed and n_jobs=-1 is passed
# to parallelise the work (scikit-learn documents -1 as "use all processors").
rfr = RandomForestRegressor(random_state=0, n_jobs=-1)
# fit() is the last expression of the cell, so the fitted estimator is displayed.
rfr.fit(X_train_rfr, y_train_rfr)

# Predicting the test results and writing them into a file

Predicting test data

In [89]:
# Predicted hourly counts for the one-hot encoded test rows.
y_test_rfr = rfr.predict(X_test_rfr)

Saving the results into a file

In [91]:
# Writes ./submissions/submission_rfr_9.csv from the random forest predictions.
results_to_file(y_test_rfr, "submission_rfr_9")

Competition score (Root Mean Squared Logarithmic Error): 0.42960

# Preparing data for a Tensorflow model

We applied a cyclical transformation to the "hour" feature to capture its periodic nature. This is important because certain values, such as hours 23 and 0, are as close to each other as any other consecutive values. Without this transformation, their relationship would not be properly represented in the model.

The transformation uses sine and cosine functions to map these cyclical values onto a continuous circle, ensuring the model can interpret their periodicity. The original "hour" column was dropped after the transformation.

In [93]:
# Start from the one-hot encoded frames; copies keep the random forest inputs intact.
X_train_tf, X_test_tf = X_train_rfr.copy(), X_test_rfr.copy()
y_train_tf = y_train_rfr.copy()

# Project the hour onto the unit circle (24 hours = one full turn) and discard
# the raw column, so that hour 23 and hour 0 end up next to each other.
for frame in (X_train_tf, X_test_tf):
    hour_angle = 2 * np.pi * frame["hour"] / 24
    frame["hour_sin"] = np.sin(hour_angle)
    frame["hour_cos"] = np.cos(hour_angle)
    frame.drop(columns="hour", inplace=True)

We scaled the features to standardize them with a mean of 0 and a standard deviation of 1. This is essential for TensorFlow models, as neural networks are sensitive to the scale of input data. Without scaling, training can be slower, less stable, and biased toward features with larger magnitudes.

In [97]:
# StandardScaler standardises every column, "day" included, to zero mean and
# unit variance using statistics estimated from the training data. A separate
# manual rescaling of "day" with hard-coded constants is therefore redundant:
# standardising an affinely rescaled column gives the same values.
scaler = StandardScaler()
# Fit on the training features only, then reuse those statistics for the test
# features so both are transformed identically.
X_train_tf = scaler.fit_transform(X_train_tf)
X_test_tf = scaler.transform(X_test_tf)

# Column vector of shape (n, 1), matching the single output unit of the network.
y_train_tf = np.array(y_train_tf).reshape(-1, 1)

# Training a Tensorflow model

Creating the model. To ensure predictions are always positive, we use an exponential activation function in the final layer. The model includes multiple dense layers with ReLU activation, batch normalization for stability, and dropout for regularization.

In [99]:
# Seed Python, NumPy and TensorFlow before any weights are created, so weight
# initialisation, dropout masks and batch shuffling are repeatable from run to
# run, in line with random_state=0 used for the other two models.
tf.keras.utils.set_random_seed(0)

model = Sequential([
    # One input per scaled feature column.
    Input(shape=(X_train_tf.shape[1],)),
    Dense(128, activation='relu'),
    BatchNormalization(),
    Dropout(0.2),
    Dense(64, activation="relu"),
    BatchNormalization(),
    Dense(32, activation='relu'),
    BatchNormalization(),
    Dense(16, activation='relu'),
    # exp() of the last pre-activation keeps every predicted count positive.
    Dense(1, activation='exponential')
])

In [100]:
# Adam optimiser, mean squared error as the training loss, MAE reported alongside.
# NOTE(review): the competition score quoted in this notebook is RMSLE, while the
# loss optimised here is plain MSE — consider whether a logarithmic loss
# (e.g. 'mean_squared_logarithmic_error') would match the metric better.
model.compile(optimizer='adam', loss='mean_squared_error', metrics=['mae'])

Creating an early stopping callback to prevent overfitting: training stops once the validation loss has not improved for 20 epochs, and the best weights are restored

In [104]:
# Watch the validation loss, allow 20 epochs without improvement, then roll the
# model back to the weights of the best epoch.
early_stopping = EarlyStopping(monitor='val_loss', patience=20, restore_best_weights=True)

Creating a function that reduces the learning rate by 10% every 10 epochs.

In [102]:
def lr_scheduler(epoch, lr):
    """Return the learning rate to use for ``epoch``.

    The rate passed in is multiplied by 0.9 at every 10th epoch after epoch 0
    (10, 20, 30, ...) and returned unchanged for all other epochs.
    """
    at_decay_step = epoch > 0 and epoch % 10 == 0
    return lr * 0.9 if at_decay_step else lr

# Keras callback wrapper around lr_scheduler; passed to model.fit() via callbacks.
lr_schedule = LearningRateScheduler(lr_scheduler)

Training the model

In [105]:
# Train for at most 200 epochs; the EarlyStopping callback may end the run sooner.
# NOTE(review): validation_split=0.2 holds out part of the training rows for the
# val_loss that EarlyStopping monitors. Keras documents this as the last 20% of
# the rows, taken before shuffling — confirm that is the intended hold-out.
history = model.fit(
    X_train_tf, y_train_tf,
    epochs=200,
    batch_size=32,
    validation_split=0.2,
    callbacks=[early_stopping, lr_schedule],
    verbose=1
)

Epoch 1/200
273/273 ━━━━━━━━━━━━━━━━━━━━ 4s 4ms/step - loss: 47869.5977 - mae: 156.3402 - val_loss: 77789.7969 - val_mae: 207.8105 - learning_rate: 0.0010
Epoch 2/200
273/273 ━━━━━━━━━━━━━━━━━━━━ 1s 3ms/step - loss: 27932.5195 - mae: 114.5333 - val_loss: 39570.9766 - val_mae: 133.5121 - learning_rate: 0.0010
Epoch 3/200
273/273 ━━━━━━━━━━━━━━━━━━━━ 1s 3ms/step - loss: 23840.6699 - mae: 103.0940 - val_loss: 34918.5352 - val_mae: 125.2165 - learning_rate: 0.0010
Epoch 4/200
273/273 ━━━━━━━━━━━━━━━━━━━━ 1s 3ms/step - loss: 31093.9512 - mae: 118.4119 - val_loss: 28570.6602 - val_mae: 113.2879 - learning_rate: 0.0010
Epoch 5/200
273/273 ━━━━━━━━━━━━━━━━━━━━ 1s 3ms/step - loss: 15088.7979 - mae: 82.4744 - val_loss: 19036.7168 - val_mae: 92.9247 - learning_rate: 0.0010
Epoch 6/200
273/273 ━━━━━━━━━━━━━━━━━━━━

# Predicting the test results and writing them into a file

Predicting test data

In [106]:
# Network predictions for the scaled test features: one row per test sample,
# one column for the single output unit.
y_test_tf = model.predict(X_test_tf)

[1m203/203[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step


Saving the results into a file

In [109]:
# Take the single value out of each prediction row so "count" gets a flat list.
y_test_tf = [element[0] for element in y_test_tf]
# Writes ./submissions/submission_tf_6.csv from the neural network predictions.
results_to_file(y_test_tf, "submission_tf_6")

Competition score (Root Mean Squared Logarithmic Error): 0.50451