In [12]:
pip install xgboost

Note: you may need to restart the kernel to use updated packages.


In [13]:
import os

import pandas as pd
import numpy as np

from xgboost import XGBRegressor
from sklearn.base import clone
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import RandomizedSearchCV

In [14]:
# Load the processed training data.
# The CSV location can be overridden through the PODCAST_TRAIN_CSV environment
# variable so the notebook is not tied to one machine's directory layout; when
# the variable is unset, the original path is used unchanged.
TRAIN_CSV_PATH = os.environ.get(
    'PODCAST_TRAIN_CSV',
    '/Users/sa26/Documents/GitHub/Predict-Podcast-Listening-Time/data/processed/processed_train.csv',
)
df = pd.read_csv(TRAIN_CSV_PATH)

# Column names, dtypes and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 749947 entries, 0 to 749946
Data columns (total 31 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   Episode_Title                749947 non-null  float64
 1   Episode_Length_minutes       662860 non-null  float64
 2   Host_Popularity_percentage   749947 non-null  float64
 3   Guest_Popularity_percentage  749947 non-null  float64
 4   Number_of_Ads                749947 non-null  float64
 5   Listening_Time_minutes       749947 non-null  float64
 6   Top_Podcast                  749947 non-null  float64
 7   Genre_Business               749947 non-null  bool   
 8   Genre_Comedy                 749947 non-null  bool   
 9   Genre_Education              749947 non-null  bool   
 10  Genre_Health                 749947 non-null  bool   
 11  Genre_Lifestyle              749947 non-null  bool   
 12  Genre_Music                  749947 non-null  bool   
 13 

In [15]:
# Separate the regression target from the predictor columns.
y = df['Listening_Time_minutes']
X = df.drop(columns='Listening_Time_minutes')

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Baseline: XGBoost with its default hyperparameters,
# scored by mean squared error on the held-out test split.
xgb = XGBRegressor().fit(X_train, y_train)
xgb_mse = mean_squared_error(y_true=y_test, y_pred=xgb.predict(X_test))
print(f"XGBoost MSE: {xgb_mse}")

# Baseline: Random Forest with default hyperparameters,
# scored by mean squared error on the held-out test split.
# A random forest is a stochastic learner (each tree is fit on a bootstrap
# sample by default), so an unseeded model gives a slightly different MSE on
# every run. The seed is pinned to 42, the same value used for the train/test
# split, so the reported number can be reproduced.
rf = RandomForestRegressor(random_state=42)
rf.fit(X_train, y_train)
rf_mse = mean_squared_error(y_test, rf.predict(X_test))
print(f"Random Forest MSE: {rf_mse}")

XGBoost MSE: 170.0925087453645
Random Forest MSE: 160.23212858005616


Random Forest performed better (lower MSE) than XGBoost. I will use hyperparameter tuning to try to improve performance. For reference, the 1st-place entry in the Kaggle competition had a calculated MSE of ~132.4.

In [16]:
# Candidate hyperparameter values for the XGBoost search
# (3 * 3 * 3 * 3 * 2 * 2 = 324 possible combinations).
xgb_param_grid = dict(
    n_estimators=[100, 200, 300],
    learning_rate=[0.01, 0.1, 0.2],
    max_depth=[3, 4, 5],
    gamma=[0, 0.1, 0.2],
    subsample=[0.8, 1.0],
    colsample_bytree=[0.8, 1.0],
)

# Randomized search for XGBoost: 200 sampled configurations, each scored with
# 3-fold cross-validation (600 fits before the final refit).
# The search itself already runs on every core (n_jobs=-1). XGBoost also uses
# all cores per model by default, and letting both layers do so creates thread
# contention that slows the whole search down. The search therefore runs over
# an unfitted copy of the baseline estimator pinned to a single thread; the
# baseline `xgb` object is left untouched.
# Note: the refit best estimator inherits n_jobs=1.
xgb_search = RandomizedSearchCV(
    estimator=clone(xgb).set_params(n_jobs=1),
    param_distributions=xgb_param_grid,
    n_iter=200,
    cv=3,
    scoring='neg_mean_squared_error',
    random_state=42,
    n_jobs=-1,
)
xgb_search.fit(X_train, y_train)

# Score the winning configuration of the search on the held-out test split
# and report it together with its hyperparameters.
best_xgb = xgb_search.best_estimator_
xgb_mse = mean_squared_error(y_true=y_test, y_pred=best_xgb.predict(X_test))
print(
    f"XGBoost MSE after tuning: {xgb_mse}",
    f"XGBoost Best Parameters: {xgb_search.best_params_}",
    sep="\n",
)

XGBoost MSE after tuning: 169.68827035558482
XGBoost Best Parameters: {'subsample': 1.0, 'n_estimators': 300, 'max_depth': 5, 'learning_rate': 0.2, 'gamma': 0.1, 'colsample_bytree': 1.0}


XGBoost did not improve much after hyperparameter tuning (MSE 170.09 → 169.69).
I used the following code to tune the Random Forest hyperparameters, but I had to stop it because it was still running after one hour!

# Define hyperparameter grid for Random Forest

# Candidate hyperparameter values for the Random Forest search
# (3 * 4 * 3 * 3 = 108 possible combinations).
rf_param_grid = dict(
    n_estimators=[100, 200, 300],
    max_depth=[None, 10, 20, 30],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

# Randomized Search for Random Forest

# 50 sampled candidates, each scored with 3-fold cross-validation
# (150 model fits before the final refit), run on all available cores.
rf_search = RandomizedSearchCV(
    estimator=rf,
    param_distributions=rf_param_grid,
    n_iter=50,
    cv=3,
    scoring='neg_mean_squared_error',
    random_state=42,
    n_jobs=-1,
)
rf_search.fit(X_train, y_train)

# Best Random Forest model

# Score the winning configuration of the search on the held-out test split
# and report it together with its hyperparameters.
best_rf = rf_search.best_estimator_
rf_mse = mean_squared_error(y_true=y_test, y_pred=best_rf.predict(X_test))
print(
    f"Random Forest MSE after tuning: {rf_mse}",
    f"Random Forest Best Parameters: {rf_search.best_params_}",
    sep="\n",
)