Import Libraries

In [14]:
import pandas as pd
import numpy as np
import os
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression

# Optional: import LGBM and XGB if available
try:
    from lightgbm import LGBMClassifier
except ImportError:
    LGBMClassifier = None
try:
    from xgboost import XGBClassifier
except ImportError:
    XGBClassifier = None

Load the dataset

In [15]:
# Change to your working directory
os.chdir('/home/piyush/umc301/iisc-umc-301-kaggle-competition-1')

# Read train and test data
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

Feature Engineering Function

In [16]:
def add_features(df):
    df = df.copy()
    # Basic features
    if "song_duration_ms" in df.columns:
        df["duration_min"] = df["song_duration_ms"] / 60000.0
        df["log_duration"] = np.log1p(df["song_duration_ms"])
    if "energy" in df.columns and "danceability" in df.columns:
        df["energy_dance_ratio"] = df["energy"] / (df["danceability"] + 1e-6)
        df["energy_plus_dance"] = df["energy"] + df["danceability"]
        df["energy_times_dance"] = df["energy"] * df["danceability"]
    if "loudness" in df.columns and "energy" in df.columns:
        df["loudness_energy_ratio"] = df["loudness"] / (df["energy"] + 1e-6)
        df["loudness_times_energy"] = df["loudness"] * df["energy"]
    if "audio_valence" in df.columns and "energy" in df.columns:
        df["valence_energy_ratio"] = df["audio_valence"] / (df["energy"] + 1e-6)
    if "tempo" in df.columns and "time_signature" in df.columns:
        df["tempo_per_measure"] = df["tempo"] / (df["time_signature"].replace(0, 1))
    # Polynomial features
    for col in ["acousticness", "danceability", "energy", "instrumentalness", "liveness"]:
        if col in df.columns:
            df[f"log_{col}"] = np.log1p(df[col])
            df[f"{col}_squared"] = df[col] ** 2
    # Binning
    if "loudness" in df.columns:
        df["loudness_bin"] = pd.cut(df["loudness"], bins=5, labels=False)
    if "tempo" in df.columns:
        df["tempo_bin"] = pd.cut(df["tempo"], bins=5, labels=False)
    if "energy" in df.columns:
        df["energy_bin"] = pd.cut(df["energy"], bins=5, labels=False)
    if "danceability" in df.columns:
        df["danceability_bin"] = pd.cut(df["danceability"], bins=5, labels=False)
    # Extra interactions
    if "energy" in df.columns and "loudness" in df.columns and "danceability" in df.columns:
        df["energy_loudness_dance"] = df["energy"] * df["loudness"] * df["danceability"]
    if "acousticness" in df.columns and "instrumentalness" in df.columns:
        df["acoustic_instrumental"] = df["acousticness"] * df["instrumentalness"]
    if "liveness" in df.columns and "speechiness" in df.columns:
        df["liveness_speech"] = df["liveness"] * df["speechiness"]
    # Treat key, audio_mode, time_signature as categorical if present
    for col in ["key", "audio_mode", "time_signature"]:
        if col in df.columns:
            df[col] = df[col].astype("category")
    return df

Apply Feature Engineering

In [17]:
train = add_features(train)
test = add_features(test)

Prepare Data for Modeling and Imputation

In [18]:
train_id = train['id']
test_id = test['id']
y = train['song_popularity']

X_train = train.drop(['id', 'song_popularity'], axis=1)
X_test = test.drop(['id'], axis=1)

X_train = X_train.apply(pd.to_numeric, errors='coerce')
X_test = X_test.apply(pd.to_numeric, errors='coerce')

Impute Missing Values using MICE

In [19]:
imputer = IterativeImputer(random_state=42, initial_strategy='mean', max_iter=10)
X_full = pd.concat([X_train, X_test], axis=0)
X_full_imputed = imputer.fit_transform(X_full)
X_train_imputed = X_full_imputed[:len(X_train)]
X_test_imputed = X_full_imputed[len(X_train):]

Model Training with Cross-Validation

In [20]:
NFOLDS = 5
skf = StratifiedKFold(n_splits=NFOLDS, shuffle=True, random_state=42)
test_preds_hgb = np.zeros(len(X_test_imputed))
test_preds_lgb = np.zeros(len(X_test_imputed))
test_preds_xgb = np.zeros(len(X_test_imputed))
oof_hgb = np.zeros(len(X_train_imputed))
oof_lgb = np.zeros(len(X_train_imputed))
oof_xgb = np.zeros(len(X_train_imputed))

for fold, (tr_idx, val_idx) in enumerate(skf.split(X_train_imputed, y)):
    X_tr, X_val = X_train_imputed[tr_idx], X_train_imputed[val_idx]
    y_tr, y_val = y.iloc[tr_idx], y.iloc[val_idx]
    # HistGradientBoosting
    clf_hgb = HistGradientBoostingClassifier(random_state=42, max_iter=1200, learning_rate=0.01, max_leaf_nodes=63)
    clf_hgb.fit(X_tr, y_tr)
    oof_hgb[val_idx] = clf_hgb.predict_proba(X_val)[:,1]
    test_preds_hgb += clf_hgb.predict_proba(X_test_imputed)[:,1] / NFOLDS
    # LightGBM
    if LGBMClassifier is not None:
        clf_lgb = LGBMClassifier(n_estimators=1200, learning_rate=0.01, max_depth=11, subsample=0.8, colsample_bytree=0.8, reg_alpha=1.0, reg_lambda=1.0, random_state=42)
        clf_lgb.fit(X_tr, y_tr)
        oof_lgb[val_idx] = clf_lgb.predict_proba(X_val)[:,1]
        test_preds_lgb += clf_lgb.predict_proba(X_test_imputed)[:,1] / NFOLDS
    # XGBoost
    if XGBClassifier is not None:
        clf_xgb = XGBClassifier(n_estimators=1200, learning_rate=0.01, max_depth=11, subsample=0.8, colsample_bytree=0.8, reg_alpha=1.0, reg_lambda=1.0, random_state=42, use_label_encoder=False, eval_metric='logloss')
        clf_xgb.fit(X_tr, y_tr)
        oof_xgb[val_idx] = clf_xgb.predict_proba(X_val)[:,1]
        test_preds_xgb += clf_xgb.predict_proba(X_test_imputed)[:,1] / NFOLDS

[LightGBM] [Info] Number of positive: 8746, number of negative: 15254
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.004766 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 9186
[LightGBM] [Info] Number of data points in the train set: 24000, number of used features: 39
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.364417 -> initscore=-0.556245
[LightGBM] [Info] Start training from score -0.556245


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[LightGBM] [Info] Number of positive: 8746, number of negative: 15254
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003014 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 9185
[LightGBM] [Info] Number of data points in the train set: 24000, number of used features: 39
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.364417 -> initscore=-0.556245
[LightGBM] [Info] Start training from score -0.556245


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[LightGBM] [Info] Number of positive: 8746, number of negative: 15254
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003239 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 9185
[LightGBM] [Info] Number of data points in the train set: 24000, number of used features: 39
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.364417 -> initscore=-0.556245
[LightGBM] [Info] Start training from score -0.556245


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[LightGBM] [Info] Number of positive: 8745, number of negative: 15255
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.004549 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 9186
[LightGBM] [Info] Number of data points in the train set: 24000, number of used features: 39
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.364375 -> initscore=-0.556425
[LightGBM] [Info] Start training from score -0.556425


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[LightGBM] [Info] Number of positive: 8745, number of negative: 15255
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003168 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 9185
[LightGBM] [Info] Number of data points in the train set: 24000, number of used features: 39
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.364375 -> initscore=-0.556425
[LightGBM] [Info] Start training from score -0.556425


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Evaluate Model Performance

In [21]:
print("CV AUC HGB:", roc_auc_score(y, oof_hgb))
if LGBMClassifier is not None:
    print("CV AUC LGB:", roc_auc_score(y, oof_lgb))
if XGBClassifier is not None:
    print("CV AUC XGB:", roc_auc_score(y, oof_xgb))

CV AUC HGB: 0.5567289610983427
CV AUC LGB: 0.5596839571833769
CV AUC XGB: 0.5486616001038055


Weighted Ensemble (Based on AUC Scores)

In [22]:
weights = []
auc_hgb = roc_auc_score(y, oof_hgb)
weights.append(auc_hgb)
ensemble = test_preds_hgb * auc_hgb
if LGBMClassifier is not None:
    auc_lgb = roc_auc_score(y, oof_lgb)
    weights.append(auc_lgb)
    ensemble += test_preds_lgb * auc_lgb
if XGBClassifier is not None:
    auc_xgb = roc_auc_score(y, oof_xgb)
    weights.append(auc_xgb)
    ensemble += test_preds_xgb * auc_xgb
ensemble /= sum(weights)

Stacking (meta-model)

In [23]:
if (LGBMClassifier is not None) and (XGBClassifier is not None):
    stack_X = np.vstack([oof_hgb, oof_lgb, oof_xgb]).T
    stack_X_test = np.vstack([test_preds_hgb, test_preds_lgb, test_preds_xgb]).T
    meta = LogisticRegression(max_iter=1000)
    meta.fit(stack_X, y)
    stacked_preds = meta.predict_proba(stack_X_test)[:,1]
    submission = pd.DataFrame({'id': test_id, 'song_popularity': stacked_preds})
    submission.to_csv('submission_stacked.csv', index=False)

Ensembling -> Normal average and Save Predictions

In [24]:
if (LGBMClassifier is not None) and (XGBClassifier is not None):
    print("Predictions saved to submission_stacked.csv (stacked meta-model)")
else:
    submission = pd.DataFrame({'id': test_id, 'song_popularity': ensemble})
    submission.to_csv('submission.csv', index=False)
    print("Predictions saved to submission.csv (weighted ensemble)")

    # Optionally, save individual model predictions for comparison
    pd.DataFrame({'id': test_id, 'song_popularity': test_preds_hgb}).to_csv('submission_hgb.csv', index=False)
    if LGBMClassifier is not None:
        pd.DataFrame({'id': test_id, 'song_popularity': test_preds_lgb}).to_csv('submission_lgb.csv', index=False)
    if XGBClassifier is not None:
        pd.DataFrame({'id': test_id, 'song_popularity': test_preds_xgb}).to_csv('submission_xgb.csv', index=False)

Predictions saved to submission_stacked.csv (stacked meta-model)
