In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostRegressor, Pool
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import ExtraTreesRegressor

from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from scipy import stats

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.compose import ColumnTransformer

SEED = 42
n_splits = 8
n_estimators=5000
early_stopping_rounds = 100

In [2]:
# train_data = pd.read_csv('../data/raw/train.csv')  #index_col='id'
# test_data = pd.read_csv('../data/raw/test.csv') # , index_col='id'
data = pd.read_csv('../data/raw/podcast_dataset.csv')

# print("train_data shape :",train_data.shape)
# print("test_data shape :",test_data.shape)
print("data shape :",data.shape)

data shape : (52500, 11)


In [3]:
TARGET = 'Listening_Time_minutes'
data_clean = data.dropna(subset=[TARGET]).drop_duplicates()
print("data shape after dropping na and duplicates :",data.shape)

data shape after dropping na and duplicates : (52500, 11)


In [4]:
X= data_clean.drop(columns=[TARGET])
y= data_clean[TARGET]
train, test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print(f"Train shape: {train.shape}")
print(f"Test  shape: {test.shape}")
print(f"Orig_clean  shape: {data_clean.shape}")

Train shape: (35894, 10)
Test  shape: (8974, 10)
Orig_clean  shape: (44868, 11)


In [5]:
from sklearn.base import BaseEstimator, TransformerMixin
class PodcastPreprocessor(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.vectorizer = TfidfVectorizer()
        self.genre_mapping = {
            'Music': 0, 'True Crime': 1, 'Health': 2, 'Education': 3,
            'Technology': 4, 'Business': 5, 'Lifestyle': 6,
            'Sports': 7, 'Comedy': 8, 'News': 9
        }
        self.day_mapping = {
            'Tuesday': 0, 'Monday': 1, 'Wednesday': 2,
            'Saturday': 3, 'Friday': 4, 'Thursday': 5, 'Sunday': 6
        }
        self.time_mapping = {
            'Night': 0, 'Afternoon': 1, 'Morning': 2, 'Evening': 3
        }
        self.label_encoders = {}
        self.num_medians = {}
        self.feature_names_ = None

    def _data_process(self, df):
        df = df.copy()

        # Your feature engineering
        df['Episode_Title_num'] = (
            df['Episode_Title'].astype(str).str.replace('Episode ', '', regex=False).astype(int)
        )
        # numeric medians applied later
        df['Episode_Sentiment'] = df['Episode_Sentiment'].replace(
            {'Neutral': 0, 'Positive': 1, 'Negative': -1}
        )

        df['Ad_Density'] = df['Number_of_Ads'] / (df['Episode_Length_minutes'] + 1e-3)
        df['Popularity_Diff'] = df['Host_Popularity_percentage'] - df['Guest_Popularity_percentage']
        df['Popularity_Interaction'] = df['Host_Popularity_percentage'] * df['Guest_Popularity_percentage']
        df['Host_Popularity_squared'] = df['Host_Popularity_percentage'] ** 2
        df['Popularity_Average'] = (
            df['Host_Popularity_percentage'] + df['Guest_Popularity_percentage']
        ) / 2
        
        df['Genre_Num'] = df['Genre'].map(self.genre_mapping)
        df['Publication_Day_Num'] = df['Publication_Day'].map(self.day_mapping)
        df['Publication_Time_Num'] = df['Publication_Time'].map(self.time_mapping)

        return df

    def fit(self, X, y=None):
        X = self._data_process(X)

        # 1) Fit TF-IDF on Podcast_Name
        tfidf_train = self.vectorizer.fit_transform(X['Podcast_Name'])
        tfidf_df = pd.DataFrame(
            tfidf_train.toarray(),
            columns=self.vectorizer.get_feature_names_out(),
            index=X.index
        )

        # 2) Fill numeric medians and store them
        num_cols = X.select_dtypes(include=['number']).columns.tolist()
        for col in num_cols:
            median_val = X[col].median()
            self.num_medians[col] = median_val
            X[col] = X[col].fillna(median_val)

        # 3) Label encode categorical columns
        cat_cols = X.select_dtypes(exclude=['number']).columns.tolist()
        X[cat_cols] = X[cat_cols].fillna("Missing")
        for col in cat_cols:
            le = LabelEncoder()
            X[col] = le.fit_transform(X[col])
            self.label_encoders[col] = le

        # 4) Combine tabular + TF-IDF and remember feature order
        X_full = pd.concat([X, tfidf_df], axis=1)
        self.feature_names_ = X_full.columns.tolist()

        return self

    def transform(self, X):
        X = self._data_process(X)

        # 1) TF-IDF using existing vocab
        tfidf_test = self.vectorizer.transform(X['Podcast_Name'])
        tfidf_df = pd.DataFrame(
            tfidf_test.toarray(),
            columns=self.vectorizer.get_feature_names_out(),
            index=X.index
        )

        # 2) Fill numeric using training medians
        for col, median_val in self.num_medians.items():
            if col in X.columns:
                X[col] = X[col].fillna(median_val)

        # 3) Apply label encoders (handle unknowns as "Missing" if needed)
        cat_cols = X.select_dtypes(exclude=['number']).columns.tolist()
        X[cat_cols] = X[cat_cols].fillna("Missing")
        for col, le in self.label_encoders.items():
            # Map unknown labels to a fallback if required
            X[col] = X[col].map(lambda v: v if v in le.classes_ else "Missing")
            # Ensure encoder knows "Missing"
            if "Missing" not in le.classes_:
                le.classes_ = np.append(le.classes_, "Missing")
            X[col] = le.transform(X[col])

        X_full = pd.concat([X, tfidf_df], axis=1)

        # Reindex to match training feature order
        X_full = X_full.reindex(columns=self.feature_names_, fill_value=0)

        return X_full.values

In [6]:
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
import joblib

preprocessor = PodcastPreprocessor()
model = RandomForestRegressor(random_state=42)

pipe = Pipeline([
    ("preprocess", preprocessor),
    ("model", model)
])

In [None]:
# pipe.fit(train, y_train)
preprocessor = PodcastPreprocessor()
X_train_transformed = preprocessor.fit_transform(train)
joblib.dump(preprocessor, "podcast_model.pkl")

['podcast_model.pkl']

XGBoost

In [8]:
from xgboost import XGBRegressor

SEED = 42

xgb_base = XGBRegressor(
    objective="reg:squarederror",
    eval_metric="rmse",
    random_state=SEED,
    tree_method="hist",   # or gpu_hist
)


In [9]:
param_dist = {
    "n_estimators": [300, 500, 800, 1000],
    "max_depth": [6, 8, 10, 12, 15],
    "learning_rate": [0.03, 0.05, 0.08, 0.1],
    "subsample": [0.7, 0.8, 1.0],
    "colsample_bytree": [0.5, 0.7, 0.9],
    "reg_alpha": [0.0, 0.5, 0.8, 1.0],
    "reg_lambda": [1.0, 2.0, 4.0, 6.0],
}


In [10]:
import dagshub
import mlflow

mlflow.set_tracking_uri('https://dagshub.com/NitinNandeshwar/Podcast-Listening-Time-Prediction.mlflow')
dagshub.init(repo_owner='NitinNandeshwar', repo_name='Podcast-Listening-Time-Prediction', mlflow=True)

mlflow.set_experiment("xgboost_podcast_regression")

<Experiment: artifact_location='mlflow-artifacts:/68a4d8b968d247d7bec901a8bbcaccfa', creation_time=1765668177842, experiment_id='0', last_update_time=1765668177842, lifecycle_stage='active', name='xgboost_podcast_regression', tags={}>

In [11]:
from sklearn.model_selection import RandomizedSearchCV, KFold
import mlflow
import mlflow.xgboost

kf = KFold(n_splits=5, shuffle=True, random_state=SEED)

search = RandomizedSearchCV(
    estimator=xgb_base,
    param_distributions=param_dist,
    n_iter=30,
    scoring="neg_root_mean_squared_error",
    cv=kf,
    n_jobs=-1,
    verbose=2,
    random_state=SEED,
)


with mlflow.start_run(run_name="xgb_random_search"):
    mlflow.autolog()

    search.fit(X_train_transformed, y_train)

    best_rmse = -search.best_score_
    best_params = search.best_params_

    mlflow.log_metric("best_cv_rmse", best_rmse)
    mlflow.log_params(best_params)

    best_model = search.best_estimator_


2025/12/13 23:25:22 INFO mlflow.tracking.fluent: Autologging successfully enabled for xgboost.
2025/12/13 23:25:22 INFO mlflow.tracking.fluent: Autologging successfully enabled for lightgbm.
2025/12/13 23:25:22 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.


Fitting 5 folds for each of 30 candidates, totalling 150 fits


2025/12/13 23:53:10 INFO mlflow.sklearn.utils: Logging the 5 best runs, 25 runs will be omitted.
2025/12/13 23:53:14 INFO mlflow.tracking._tracking_service.client: üèÉ View run serious-deer-952 at: https://dagshub.com/NitinNandeshwar/Podcast-Listening-Time-Prediction.mlflow/#/experiments/0/runs/15d9f3e216c24a27bd3a7b668191d1a9.
2025/12/13 23:53:14 INFO mlflow.tracking._tracking_service.client: üß™ View experiment at: https://dagshub.com/NitinNandeshwar/Podcast-Listening-Time-Prediction.mlflow/#/experiments/0.
2025/12/13 23:53:16 INFO mlflow.tracking._tracking_service.client: üèÉ View run languid-shrimp-113 at: https://dagshub.com/NitinNandeshwar/Podcast-Listening-Time-Prediction.mlflow/#/experiments/0/runs/91fb55c81fc54f79a5c0bf03f8d24ee7.
2025/12/13 23:53:16 INFO mlflow.tracking._tracking_service.client: üß™ View experiment at: https://dagshub.com/NitinNandeshwar/Podcast-Listening-Time-Prediction.mlflow/#/experiments/0.
2025/12/13 23:53:17 INFO mlflow.tracking._tracking_service.cl

In [12]:
best_params

{'subsample': 1.0,
 'reg_lambda': 1.0,
 'reg_alpha': 0.8,
 'n_estimators': 300,
 'max_depth': 6,
 'learning_rate': 0.05,
 'colsample_bytree': 0.9}

In [13]:
best_rmse


13.665979088451019

In [14]:
joblib.dump(best_model, "model.pkl")

['model.pkl']

In [16]:
mlflow.log_artifact("podcast_model.pkl")
mlflow.log_artifact("model.pkl")