In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error
from catboost import CatBoostRegressor, Pool
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

# ===== LOAD DATA =====
train_df = pd.read_csv("train.csv")
test_df  = pd.read_csv("test.csv")
test_df["Listening_Time_minutes"] = np.nan
full_df = pd.concat([train_df, test_df], ignore_index=True)

# ===== CREATE NEW FEATURES =====
def enrich_features(df):
    # Temporal (assuming a datetime column exists — else skip)
    if 'Release_Date' in df.columns:
        df['Release_Date'] = pd.to_datetime(df['Release_Date'])
        df['Release_Weekday'] = df['Release_Date'].dt.weekday
        df['Is_Weekend'] = df['Release_Weekday'] >= 5
        df['Release_Hour'] = df['Release_Date'].dt.hour
        df['Hour_Bucket'] = pd.cut(df['Release_Hour'], bins=[-1, 6, 12, 18, 24], labels=['Night', 'Morning', 'Afternoon', 'Evening'])
    else:
        df['Is_Weekend'] = 0
        df['Hour_Bucket'] = 'Unknown'

    # Interaction and ratios
    df['Ad_Density'] = df['Number_of_Ads'] / (df['Episode_Length_minutes'] + 1e-3)
    df['HostGuest_Mult'] = df['Host_Popularity_percentage'] * df['Guest_Popularity_percentage']
    return df

full_df = enrich_features(full_df)

# ===== FEATURE ENCODING =====
categorical_cols = ['Hour_Bucket']
full_df[categorical_cols] = full_df[categorical_cols].astype(str)

# ===== FEATURES TO USE =====
features = [
    'Episode_Length_minutes',
    'Host_Popularity_percentage',
    'Guest_Popularity_percentage',
    'Number_of_Ads',
    'Ad_Density',
    'HostGuest_Mult',
    'Is_Weekend',
    'Hour_Bucket'
]

# ===== FILL MISSING =====
for col in features:
    if full_df[col].dtype == 'object':
        full_df[col] = full_df[col].fillna('Unknown')
    else:
        full_df[col] = full_df[col].fillna(full_df[col].mean())

# ===== SPLIT =====
train_proc = full_df[~full_df['Listening_Time_minutes'].isna()]
test_proc  = full_df[ full_df['Listening_Time_minutes'].isna()]

X = train_proc[features]
y = train_proc['Listening_Time_minutes'].astype(float)
X_test = test_proc[features]
test_ids = test_proc['id']

# ===== LOG TRANSFORM TARGET =====
y_log = np.log1p(y)  # log(1 + y) to handle 0s

# ===== SPLIT TRAIN/VAL =====
X_train, X_val, y_train_log, y_val_log = train_test_split(
    X, y_log, test_size=0.2, random_state=42
)

train_pool = Pool(X_train, y_train_log, cat_features=categorical_cols)
val_pool   = Pool(X_val, y_val_log, cat_features=categorical_cols)

# ===== CATBOOST: QUANTILE LOSS =====
model = CatBoostRegressor(
    iterations=1000,
    learning_rate=0.05,
    depth=6,
    loss_function='Quantile:alpha=0.5',  # Median regression
    random_seed=42,
    verbose=100
)

model.fit(train_pool, eval_set=val_pool)

# ===== VALIDATION METRICS (inverse log) =====
val_preds_log = model.predict(X_val)
val_preds = np.expm1(val_preds_log)  # inverse of log1p

y_val_true = np.expm1(y_val_log)
rmse = mean_squared_error(y_val_true, val_preds, squared=False)
mae  = mean_absolute_error(y_val_true, val_preds)

print(f"🌟 CatBoost Quantile | RMSE: {rmse:.2f} | MAE: {mae:.2f}")

# ===== RETRAIN ON FULL DATA =====
full_pool = Pool(X, y_log, cat_features=categorical_cols)
model.fit(full_pool, verbose=0)

# ===== PREDICT TEST SET =====
test_preds_log = model.predict(X_test)
test_preds = np.expm1(test_preds_log)

# ===== SAVE PREDICTIONS =====
pd.DataFrame({
    'id': test_ids,
    'Listening_Time_minutes': test_preds
}).to_csv("submission.csv", index=False)

print("✅ Saved submission.csv")


0:	learn: 0.3017241	test: 0.3018072	best: 0.3018072 (0)	total: 104ms	remaining: 1m 43s
100:	learn: 0.1322671	test: 0.1314672	best: 0.1314672 (100)	total: 2.81s	remaining: 25s
200:	learn: 0.1314686	test: 0.1306866	best: 0.1306866 (200)	total: 5.47s	remaining: 21.7s
300:	learn: 0.1311230	test: 0.1304155	best: 0.1304155 (300)	total: 7.96s	remaining: 18.5s
400:	learn: 0.1309206	test: 0.1302665	best: 0.1302665 (400)	total: 10.4s	remaining: 15.5s
500:	learn: 0.1307143	test: 0.1301229	best: 0.1301229 (500)	total: 12.8s	remaining: 12.7s
600:	learn: 0.1305719	test: 0.1300248	best: 0.1300248 (600)	total: 15.2s	remaining: 10.1s
700:	learn: 0.1304595	test: 0.1299528	best: 0.1299528 (697)	total: 17.7s	remaining: 7.54s
800:	learn: 0.1304024	test: 0.1299202	best: 0.1299200 (799)	total: 20.1s	remaining: 4.98s
900:	learn: 0.1303063	test: 0.1298593	best: 0.1298591 (899)	total: 22.5s	remaining: 2.47s
999:	learn: 0.1301999	test: 0.1297890	best: 0.1297890 (999)	total: 24.9s	remaining: 0us

bestTest = 0.129

In [2]:
# ====================== SETUP ======================
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from catboost import CatBoostRegressor, Pool
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
import warnings
warnings.filterwarnings("ignore")

# ====================== LOAD DATA ======================
train_df = pd.read_csv("train.csv")
test_df  = pd.read_csv("test.csv")
test_df["Listening_Time_minutes"] = np.nan

full_df = pd.concat([train_df, test_df], ignore_index=True)

# ====================== FEATURE ENGINEERING ======================
numerical_features = [
    "Episode_Length_minutes", "Guest_Popularity_percentage", "Number_of_Ads"
]
categorical_features = [
    "Podcast_Name", "Episode_Title", "Genre", "Publication_Day", 
    "Publication_Time", "Episode_Sentiment"
]

# Impute
num_imputer = SimpleImputer(strategy="median")
full_df[numerical_features] = num_imputer.fit_transform(full_df[numerical_features])

cat_imputer = SimpleImputer(strategy="most_frequent")
full_df[categorical_features] = cat_imputer.fit_transform(full_df[categorical_features])

# ====================== AGGREGATED FEATURES ======================
# Podcast-Level
podcast_stats = full_df.groupby("Podcast_Name").agg(
    avg_listen=('Listening_Time_minutes', 'mean'),
    std_listen=('Listening_Time_minutes', 'std'),
    count_listen=('Listening_Time_minutes', 'count')
).reset_index()
full_df = full_df.merge(podcast_stats, on="Podcast_Name", how="left")

# Genre-Level
genre_stats = full_df.groupby("Genre").agg(
    genre_avg_listen=('Listening_Time_minutes', 'mean'),
    genre_count=('Listening_Time_minutes', 'count')
).reset_index()
full_df = full_df.merge(genre_stats, on="Genre", how="left")

# Day-Level
day_stats = full_df.groupby("Publication_Day").agg(
    day_avg_listen=('Listening_Time_minutes', 'mean'),
).reset_index()
full_df = full_df.merge(day_stats, on="Publication_Day", how="left")

# ====================== ENCODING ======================
label_cols = ["Podcast_Name", "Genre", "Publication_Day", "Publication_Time", "Episode_Sentiment"]
for col in label_cols:
    le = LabelEncoder()
    full_df[col] = le.fit_transform(full_df[col])

# ====================== FINAL FEATURES ======================
drop_cols = ["id", "Episode_Title"]
features = [col for col in full_df.columns if col not in drop_cols + ["Listening_Time_minutes"]]

train_proc = full_df[~full_df["Listening_Time_minutes"].isna()].copy()
test_proc  = full_df[full_df["Listening_Time_minutes"].isna()].copy()

X = train_proc[features]
y = train_proc["Listening_Time_minutes"]
X_test = test_proc[features]
test_ids = test_proc["id"]

# ====================== TRAIN-VAL SPLIT ======================
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# ====================== MODEL TRAINING ======================
model = CatBoostRegressor(
    iterations=1000,
    learning_rate=0.05,
    depth=6,
    loss_function='Quantile:alpha=0.5',  # Robust to outliers
    verbose=100,
    random_seed=42
)

model.fit(X_train, y_train, eval_set=(X_val, y_val))

# ====================== EVALUATION ======================
from sklearn.metrics import mean_squared_error, mean_absolute_error

val_preds = model.predict(X_val)
rmse = mean_squared_error(y_val, val_preds, squared=False)
mae  = mean_absolute_error(y_val, val_preds)

print(f"Hybrid CatBoost RMSE: {rmse:.2f}")
print(f"Hybrid CatBoost MAE : {mae:.2f}")

# ====================== FINAL PREDICTIONS ======================
model.fit(X, y, verbose=0)
final_preds = model.predict(X_test)

submission = pd.DataFrame({
    "id": test_ids,
    "Listening_Time_minutes": final_preds
})
submission.to_csv("submission_hybrid_catboost.csv", index=False)
print("✅ submission_hybrid_catboost.csv saved.")


0:	learn: 10.8645389	test: 10.8618389	best: 10.8618389 (0)	total: 67.9ms	remaining: 1m 7s
100:	learn: 4.8190445	test: 4.8009239	best: 4.8009239 (100)	total: 2.99s	remaining: 26.6s
200:	learn: 4.7937773	test: 4.7783754	best: 4.7783754 (200)	total: 5.73s	remaining: 22.8s
300:	learn: 4.7805808	test: 4.7681149	best: 4.7681149 (300)	total: 8.53s	remaining: 19.8s
400:	learn: 4.7688199	test: 4.7589815	best: 4.7589815 (400)	total: 11.2s	remaining: 16.7s
500:	learn: 4.7588773	test: 4.7515607	best: 4.7515607 (500)	total: 13.8s	remaining: 13.7s
600:	learn: 4.7516191	test: 4.7469805	best: 4.7469805 (600)	total: 16.4s	remaining: 10.9s
700:	learn: 4.7451453	test: 4.7431704	best: 4.7431704 (700)	total: 19.1s	remaining: 8.13s
800:	learn: 4.7391308	test: 4.7401318	best: 4.7401318 (800)	total: 21.7s	remaining: 5.39s
900:	learn: 4.7334524	test: 4.7370737	best: 4.7370737 (900)	total: 24.3s	remaining: 2.67s
999:	learn: 4.7281489	test: 4.7341173	best: 4.7341173 (999)	total: 26.9s	remaining: 0us

bestTest = 