<span style="color: Blue;">**Predicting the Beats-per-Minute of Songs**</span>

**Dataset Description:**

This dataset (train & test) was created using a deep learning model trained on the BPM Prediction Challenge dataset. The features are similar to the original dataset, but not exactly the same.

I will first fit a Linear Regression baseline, then build the main model using <span style="color: Blue;">XGBoost</span>, and assess performance with the Root Mean Squared Error (RMSE) between predicted and actual values.

**Data Source:** `kaggle competitions download -c playground-series-s5e9`

In [None]:
# Importing required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Read the training split (expected in the working directory) into a DataFrame
df = pd.read_csv(filepath_or_buffer='train.csv')

In [None]:
# Show the first five rows to get a feel for the columns and their values
df.head(n=5)

Unnamed: 0,id,road_type,num_lanes,curvature,speed_limit,lighting,weather,road_signs_present,public_road,time_of_day,holiday,school_season,num_reported_accidents,accident_risk
0,0,urban,2,0.06,35,daylight,rainy,False,True,afternoon,False,True,1,0.13
1,1,urban,4,0.99,35,daylight,clear,True,False,evening,True,True,0,0.35
2,2,rural,4,0.63,70,dim,clear,False,True,morning,True,False,2,0.3
3,3,highway,4,0.07,35,dim,rainy,True,True,morning,False,False,1,0.21
4,4,rural,1,0.58,60,daylight,foggy,False,False,evening,True,False,1,0.56


In [None]:
# Report the size of the frame: shape is (rows, columns)
n_rows, n_cols = df.shape
print("Number of rows: ", n_rows)
print("Number of columns: ", n_cols)

Number of rows:  517754
Number of columns:  14


In [None]:
# Descriptive statistics, transposed so each described column is one row
df.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
id,517754.0,258876.5,149462.849974,0.0,129438.25,258876.5,388314.75,517753.0
num_lanes,517754.0,2.491511,1.120434,1.0,1.0,2.0,3.0,4.0
curvature,517754.0,0.488719,0.272563,0.0,0.26,0.51,0.71,1.0
speed_limit,517754.0,46.112575,15.788521,25.0,35.0,45.0,60.0,70.0
num_reported_accidents,517754.0,1.18797,0.895961,0.0,1.0,1.0,2.0,7.0
accident_risk,517754.0,0.352377,0.166417,0.0,0.23,0.34,0.46,1.0


In [None]:
# Total number of missing cells across the whole frame (0 means no gaps)
df.isna().sum().sum()

np.int64(0)

In [None]:
# Count rows that exactly repeat an earlier row
df.duplicated(keep='first').sum()

np.int64(0)

In [None]:
# Dataset information: index range, and per-column non-null counts and dtypes.
# Useful here to spot which columns are non-numeric (object/bool) before modeling.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 517754 entries, 0 to 517753
Data columns (total 14 columns):
 #   Column                  Non-Null Count   Dtype  
---  ------                  --------------   -----  
 0   id                      517754 non-null  int64  
 1   road_type               517754 non-null  object 
 2   num_lanes               517754 non-null  int64  
 3   curvature               517754 non-null  float64
 4   speed_limit             517754 non-null  int64  
 5   lighting                517754 non-null  object 
 6   weather                 517754 non-null  object 
 7   road_signs_present      517754 non-null  bool   
 8   public_road             517754 non-null  bool   
 9   time_of_day             517754 non-null  object 
 10  holiday                 517754 non-null  bool   
 11  school_season           517754 non-null  bool   
 12  num_reported_accidents  517754 non-null  int64  
 13  accident_risk           517754 non-null  float64
dtypes: bool(4), float64(

In [None]:
# Move 'id' from a regular column to the row index, so it is no longer among
# the columns that later cells select as features
df = df.set_index(keys="id")

In [None]:
# Draw five rows at random (no seed, so the rows differ on every run)
df.sample(n=5)

Unnamed: 0_level_0,road_type,num_lanes,curvature,speed_limit,lighting,weather,road_signs_present,public_road,time_of_day,holiday,school_season,num_reported_accidents,accident_risk
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
132726,urban,2,0.84,25,daylight,foggy,False,False,afternoon,True,True,1,0.35
90488,rural,3,0.13,45,dim,clear,False,False,afternoon,False,True,1,0.06
143772,urban,3,0.39,35,daylight,foggy,False,True,afternoon,True,False,1,0.25
71482,highway,1,0.91,70,daylight,rainy,False,False,evening,False,False,3,0.68
261969,urban,1,0.61,45,daylight,foggy,False,True,morning,True,False,1,0.4


<span style="color: Red;">**Using Linear Regression**</span>


In [None]:
# Check skewness of the numeric features.
# numeric_only=True restricts the computation to numeric columns; without it,
# any string-valued column in the frame makes DataFrame.skew() raise
# "TypeError: could not convert string to float".
col = df.skew(numeric_only=True)
print(col)

TypeError: could not convert string to float: 'urban'

Since the skewness values are close to zero (i.e., the numeric features are fairly symmetric and roughly normally distributed), I will use StandardScaler to standardize the features before modeling.

In [None]:
# Separate the target (y) from the predictors (X) for the linear model.
# NOTE(review): the df.head()/df.info() output captured earlier in this notebook
# lists 'accident_risk' (not 'BeatsPerMinute') as the last column, plus several
# string columns -- confirm train.csv is the BPM dataset before running this.
y_lin = df['BeatsPerMinute']
X_lin = df.drop(columns='BeatsPerMinute')

In [None]:
# Standardize every feature to zero mean and unit variance
from sklearn.preprocessing import StandardScaler
ss = StandardScaler()
# NOTE(review): the scaler is fitted on all rows of X_lin, i.e. before the
# train/test split made further down, so test-set statistics influence the
# scaling. Fitting on the training rows only would avoid that.
X_scaled = ss.fit(X_lin).transform(X_lin)

In [None]:
# Wrap the scaled array in a DataFrame for inspection.
# It is kept in its own variable: `df.X_scaled = ...` only sets a Python
# attribute on `df` (pandas does not create a column that way and emits a
# UserWarning), which is easy to lose and can shadow a real column name.
# Reusing X_lin's index keeps each scaled row labelled with its original id.
X_scaled_df = pd.DataFrame(X_scaled, columns=X_lin.columns, index=X_lin.index)
X_scaled_df.head()

In [None]:
# Import libraries for modeling
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [None]:
# Hold out 20% of the standardized rows for testing; the fixed seed keeps the
# partition identical between runs
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled,
    y_lin,
    test_size=0.2,
    random_state=5,
)

In [None]:
# Initialize Linear Regression Model
# Constructed with scikit-learn's default settings; it is fitted in the next cell.
lr = LinearRegression()

In [None]:
# Fit the linear model on the standardized training split
lr.fit(X=X_train, y=y_train)

In [None]:
# Generate predictions for the held-out rows
y_pred = lr.predict(X=X_test)

In [None]:
# Evaluate the Linear Regression model on the held-out split.
# mean_squared_error is already imported in the modeling-imports cell, so it is
# not imported again here. RMSE is printed next to MSE because RMSE is the
# metric this notebook says it uses to assess models; being the square root of
# MSE, it is expressed in the same units as the target.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
print ('Mean Squared Error', mse)
print('Root Mean Squared Error', rmse)

In [None]:
# Population variance (ddof=0) of the held-out target. This equals the MSE of
# always predicting the test-set mean, so it is a reference point for the MSE.
var = np.var(y_test, ddof=0)
var

<span style="color: Red;">**Extreme Gradient Boosting**</span>

In [None]:
# Unscaled predictors and target for the tree-based model
# NOTE(review): the df.info() output captured earlier in this notebook lists
# 'accident_risk', not 'BeatsPerMinute' -- confirm the target column name.
y = df['BeatsPerMinute']
X = df.drop(columns='BeatsPerMinute')

In [None]:
# 80/20 split of the unscaled data for XGBoost. The double-underscore names keep
# this split distinct from the Linear Regression split (X_train, X_test, ...).
X__train, X__test, y__train, y__test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [None]:
# Import libraries for XGBoost and Hyperparameter Tuning
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from xgboost import XGBRegressor

In [None]:
# Baseline XGBoost regressor with the library's default hyperparameters.
# Note: the name `xgb` is rebound to a differently configured model in the
# hyperparameter-search cell further down.
xgb = XGBRegressor()

In [None]:
# Fit the baseline XGBoost model on the unscaled training split
xgb.fit(X=X__train, y=y__train)

In [None]:
# Baseline XGBoost predictions for the held-out rows
y_pred_xgb = xgb.predict(X=X__test)

In [None]:
# Score the baseline XGBoost model on the held-out split.
# Reuses y_pred_xgb from the previous cell instead of calling
# xgb.predict(X__test) a second time for the same result.
mse_xgb = mean_squared_error(y__test, y_pred_xgb)
print('Mean Squared Error', mse_xgb)

In [None]:
# Split a validation set for early stopping.
# This must come from the XGBoost split (X__train / y__train). The similarly
# named X_train / y_train belong to the Linear Regression section: they are
# standardized and were split with a different seed, so training on them would
# (a) fit the model on scaled features while it is later scored on the unscaled
# X__test and df_test, and (b) leak rows of X__test into training.
X_train_sub, X_val, y_train_sub, y_val = train_test_split(
    X__train, y__train, test_size=0.2, random_state=1
)

# Define expanded parameter grid
param_grid = {
    "n_estimators": [300, 400, 500],
    "learning_rate": [0.05, 0.1, 0.2],
    "max_depth": [3, 5, 7, 10],
    "subsample": [0.6, 0.8, 1.0],           # row sampling
    "colsample_bytree": [0.6, 0.8, 1.0],    # feature sampling
    "gamma": [0, 0.1, 0.3, 0.5],            # pruning regularization
    "min_child_weight": [1, 3, 5, 7],       # minimum data in leaf
    "reg_alpha": [0, 0.1, 0.5],             # L1 regularization
    "reg_lambda": [1, 1.5, 2],              # L2 regularization
}

# Initialize model with early stopping configured on the estimator itself.
# Note: this rebinds `xgb`, replacing the baseline model fitted earlier.
xgb = XGBRegressor(
    random_state=1,
    tree_method="hist",          # faster training
    eval_metric="rmse",          # metric monitored on eval_set
    early_stopping_rounds=50     # stop when eval RMSE has not improved for 50 rounds
)

# Setup RandomizedSearchCV
random_search = RandomizedSearchCV(
    estimator=xgb,
    param_distributions=param_grid,
    n_iter=50,   # number of random combinations to try
    scoring="neg_root_mean_squared_error",
    cv=5,
    verbose=2,
    random_state=1,
    n_jobs=-1,
    error_score="raise"   # surface fit failures instead of scoring them as NaN
)

# Fit the search. The extra keyword arguments are forwarded to XGBRegressor.fit,
# so every candidate is early-stopped against the same (X_val, y_val) set.
random_search.fit(
    X_train_sub, y_train_sub,
    eval_set=[(X_val, y_val)],
    verbose=False
)

# Best parameters and score (the scorer is negated RMSE, hence the sign flip)
print("Best parameters:", random_search.best_params_)
print("Best RMSE:", -random_search.best_score_)



In [None]:
# Held-out MSE of the tuned model; predict() on the search object delegates to
# the refitted best estimator. This overwrites the baseline value in mse_xgb.
tuned_pred = random_search.predict(X__test)
mse_xgb = mean_squared_error(y__test, tuned_pred)
print('Mean Squared Error', mse_xgb)

<span style="color: Red;">**Loading Test Data and Generating Kaggle Submission Sample**</span>

In [None]:
# Read the competition's test split (no target column expected)
df_test = pd.read_csv(filepath_or_buffer='test.csv')

In [None]:
# Total number of missing cells in the test frame
df_test.isna().sum().sum()

In [None]:
# Look at five random test rows
df_test.sample(n=5)

In [None]:
# Index the test rows by 'id', mirroring the training frame, so the id is not
# passed to the model as a feature and is carried into the submission index
df_test = df_test.set_index('id')

In [None]:
# Last five test rows, to confirm 'id' is now the index
df_test.tail(n=5)

In [None]:
# Predict the target for every test row with the tuned model
BeatsPerMinute = random_search.predict(X=df_test)

In [None]:
# df_test.index.name = "ID"

In [None]:
# Assemble the submission: one prediction column, indexed by the test ids
submission = pd.Series(
    BeatsPerMinute, index=df_test.index, name='BeatsPerMinute'
).to_frame()

submission.head()

In [None]:
# Write the submission file; index=True keeps the id index as the first column
submission.to_csv(path_or_buf='submission.csv', index=True)