In [11]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as mp
import seaborn as sb
import sklearn
import tensorflow
from pathlib import Path
import os

from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression, LassoCV, RidgeCV
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error, root_mean_squared_error
from sklearn.model_selection import GridSearchCV, train_test_split
from xgboost import XGBRegressor
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import Callback, EarlyStopping
from tensorflow.keras.layers import Dense, Dropout

In [26]:
project_root = Path().resolve().parent
# print(project_root)

train_data_path = os.path.join(project_root, "data", "train.csv")
train_data = pd.read_csv(train_data_path)
train_data = train_data.iloc[:,1:]

test_data_path = os.path.join(project_root, "data", "test.csv")
test_data = pd.read_csv(test_data_path)
test_data = test_data.iloc[:,1:]

X = train_data.drop(columns=['BeatsPerMinute'])
y = train_data['BeatsPerMinute']

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.30, random_state=42)

LinearRegression

In [74]:
linear_model = LinearRegression()
print("Model name:", linear_model)
linear_model.fit(X_train, y_train)
linear_predictions = linear_model.predict(X_val)
linear_mae = mean_absolute_error(y_val, linear_predictions)
linear_mape = round(mean_absolute_percentage_error(y_val, linear_predictions),2)
print(f"MAE:{linear_mae:.2f} and model's predictions are off by MAPE:{linear_mape}%")

Model name: LinearRegression()
MAE:21.20 and model's predictions are off by MAPE:0.2%


Lasso and Ridge Regression

In [75]:
#RidgeCV model
ridge_cv_model = RidgeCV(alphas=[0.1, 0.4, 0.6, 0.8, 0.9, 1.0, 1.5, 3.0, 4.5, 6, 8.0, 10.0], cv=10)
print("Model name:", ridge_cv_model)

ridge_cv_model.fit(X_train, y_train)
print("Best alpha value:", ridge_cv_model.alpha_)

ridge_cv_predictions = ridge_cv_model.predict(X_val)

ridge_cv_mae = mean_absolute_error(y_val, ridge_cv_predictions)
ridge_cv_mape = round(mean_absolute_percentage_error(y_val, ridge_cv_predictions),2)
print(f"MAE:{ridge_cv_mae:.2f} and model's predictions are off by MAPE:{ridge_cv_mape}%")
print()

#LassoCV model
lasso_cv_model = LassoCV(eps=0.5, alphas=[0.1, 0.4, 0.6, 0.8, 0.9, 1.0], cv=10, random_state=32)
print("Model name:", lasso_cv_model)

lasso_cv_model.fit(X_train, y_train)
print("Best alpha value:", lasso_cv_model.alpha_)

lasso_cv_predictions = lasso_cv_model.predict(X_val)

lasso_cv_mae = mean_absolute_error(y_val, lasso_cv_predictions)
lasso_cv_mape = round(mean_absolute_percentage_error(y_val, lasso_cv_predictions),2)
print(f"MAE:{lasso_cv_mae:.2f} and model's predictions are off by MAPE:{lasso_cv_mape}%")


Model name: RidgeCV(alphas=[0.1, 0.4, 0.6, 0.8, 0.9, 1.0, 1.5, 3.0, 4.5, 6, 8.0, 10.0],
        cv=10)
Best alpha value: 10.0
MAE:21.20 and model's predictions are off by MAPE:0.2%

Model name: LassoCV(alphas=[0.1, 0.4, 0.6, 0.8, 0.9, 1.0], cv=10, eps=0.5, random_state=32)
Best alpha value: 0.1
MAE:21.21 and model's predictions are off by MAPE:0.2%


GradientBoosting and XGBoost

In [76]:
gradient_boost = GradientBoostingRegressor()
gradient_boost.fit(X_train, y_train)
print("Model name:", gradient_boost)

gradient_boost_predictions = gradient_boost.predict(X_val)
gradient_boost_mae = mean_absolute_error(y_val, gradient_boost_predictions)
gradient_boost_mape = round(mean_absolute_percentage_error(y_val, gradient_boost_predictions),2)
print(f"MAE:{gradient_boost_mae:.2f} and model's predictions are off by MAPE:{gradient_boost_mape}%")

Model name: GradientBoostingRegressor()
MAE:21.20 and model's predictions are off by MAPE:0.2%


Tensorflow

In [77]:
model = Sequential()
model.add(Dense(units=60, activation='relu'))
model.add(Dense(units=40))
model.add(Dense(units=1))

model.compile(optimizer=Adam(learning_rate=0.01), loss='mae', metrics=['mae'])

model.fit(X_train, 
          y_train, 
          batch_size=512, 
          epochs=50, 
          validation_data=(X_val, y_val),
          callbacks=EarlyStopping(monitor='val_loss', patience=15, mode='min', restore_best_weights=True), 
          verbose=1)

neural_network_predictions = model.predict(X_val)

neural_network_mae = mean_absolute_error(y_val, neural_network_predictions)
neural_network_mape = round(mean_absolute_percentage_error(y_val, neural_network_predictions),2)
print(f"MAE:{neural_network_mae:.2f} and model's predictions are off by MAPE:{neural_network_mape}%")

Epoch 1/50
[1m717/717[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3ms/step - loss: 1129.7933 - mae: 1129.7933 - val_loss: 39.5157 - val_mae: 39.5157
Epoch 2/50
[1m717/717[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - loss: 41.3976 - mae: 41.3976 - val_loss: 44.4099 - val_mae: 44.4099
Epoch 3/50
[1m717/717[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - loss: 34.2966 - mae: 34.2966 - val_loss: 29.8492 - val_mae: 29.8492
Epoch 4/50
[1m717/717[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - loss: 30.9145 - mae: 30.9145 - val_loss: 32.6338 - val_mae: 32.6338
Epoch 5/50
[1m717/717[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - loss: 29.8811 - mae: 29.8811 - val_loss: 29.6132 - val_mae: 29.6132
Epoch 6/50
[1m717/717[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - loss: 28.2748 - mae: 28.2748 - val_loss: 27.5918 - val_mae: 27.5918
Epoch 7/50
[1m717/717[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37

MonteCarlo

In [80]:
n_simulations = 100
mae_list = []

for i in range(n_simulations):
    #split the data randomly in each iteration
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.30)
    
    #build model and fit training data
    model = LinearRegression()
    model.fit(X_train, y_train)
    
    #make predictions
    predictions = model.predict(X_val)
    
    #error metrics
    mae = mean_absolute_error(y_val, predictions)
    mae_list.append(mae)
    
print(mae_list)

average_mae = np.mean(mae_list)
print(f"Mean of MAE is {average_mae}")

[21.228402869035687, 21.224508351201536, 21.193057752265872, 21.19017205235903, 21.176995661106766, 21.129580166202256, 21.26433336104149, 21.25399601879798, 21.199790443276136, 21.2044180548612, 21.268340330799113, 21.200315788258745, 21.167092602455025, 21.21462549251294, 21.206749866519743, 21.15743414592626, 21.196793469334335, 21.212244810516946, 21.208623458901705, 21.14924610892513, 21.202395301441484, 21.212838000559966, 21.198992725295973, 21.2084324105051, 21.156079287494535, 21.222035358823316, 21.253925557402393, 21.19335720160567, 21.222273394789173, 21.134154197572965, 21.279609084723933, 21.152932556808505, 21.195886365515648, 21.219058832857385, 21.214067865201365, 21.214510975152116, 21.1888405802031, 21.197595633410266, 21.229697552575857, 21.215387789852205, 21.27114792137005, 21.182388069054007, 21.17549914387814, 21.118858349912195, 21.188204166875426, 21.132944836361457, 21.137505157354, 21.193267768366287, 21.231440802167782, 21.22234776833173, 21.237656091131672

Make predictions on actual test data using the best fit model

In [None]:
test_data_predictions = pd.DataFrame(linear_model.predict(test_data))

submission_file = pd.read_csv("sample_submission.csv")
submission_file = submission_file.iloc[:,1]

final_predictions = pd.concat([submission_file, test_data_predictions], axis=1, ignore_index=True)
final_predictions.columns = [['id', 'BeatsPerMinute']]

final_predictions.to_csv("predictions.csv", index=False)