In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.preprocessing import LabelEncoder, QuantileTransformer
from pytorch_tabnet.tab_model import TabNetClassifier
import lightgbm as lgb
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report
import torch


# --- Feature Engineering ---
def engineer_features(df_in):
    df_out = df_in.copy()
    if 'id' in df_out.columns:
        df_out.drop('id', axis=1, inplace=True)
    for col in df_out.select_dtypes(include=np.number).columns:
        df_out[col].fillna(df_out[col].median(), inplace=True)
    for col in df_out.select_dtypes(include=['object', 'category']).columns:
        df_out[col].fillna(df_out[col].mode()[0], inplace=True)
    df_out['key_mode'] = df_out['key'].astype(str) + '_' + df_out['audio_mode'].astype(str)
    df_out['tempo_category'] = pd.cut(df_out['tempo'], bins=[0, 90, 120, 150, 250],
                                      labels=['Slow', 'Medium', 'Fast', 'Very Fast'])
    df_out['loudness_scaled'] = (df_out['loudness'] - df_out['loudness'].min()) / \
                                (df_out['loudness'].max() - df_out['loudness'].min())
    df_out['party_index'] = df_out['danceability'] * df_out['energy'] * df_out['loudness_scaled']
    df_out['ballad_index'] = df_out['acousticness'] / (df_out['energy'] + 1e-6)
    df_out['vocal_focus'] = 1 - df_out['instrumentalness']
    df_out['studio_polish'] = 1 - df_out['liveness']
    df_out['rhythmic_vs_acoustic'] = df_out['danceability'] - df_out['acousticness']
    df_out['duration_log'] = np.log1p(df_out['song_duration_ms'])
    df_out['key_mode'] = df_out['key_mode'].astype('category')
    df_out['tempo_category'] = df_out['tempo_category'].astype('category')
    df_out.drop(['loudness', 'song_duration_ms', 'key', 'audio_mode', 'tempo'], axis=1, inplace=True)
    return df_out


# --- Validation Phase ---
def run_ultimate_ensemble_pipeline():
    try:
        df = pd.read_csv('train.csv')
    except FileNotFoundError:
        print("❌ Error: 'train.csv' not found.")
        return
    
    TARGET_COLUMN = 'song_popularity'
    train_df, test_df = train_test_split(
        df, test_size=0.2, random_state=42, stratify=df[TARGET_COLUMN]
    )
    print(f"📊 Data split: {len(train_df)} training samples, {len(test_df)} hold-out test samples.")

    # Feature engineering
    X_train = engineer_features(train_df.drop(TARGET_COLUMN, axis=1))
    y_train = train_df[TARGET_COLUMN]
    X_test = engineer_features(test_df.drop(TARGET_COLUMN, axis=1))
    y_test = test_df[TARGET_COLUMN]

    # Preprocessing
    X_train_lgbm, X_test_lgbm = X_train.copy(), X_test.copy()
    categorical_features = X_train.select_dtypes(include=['category', 'object']).columns.tolist()
    numerical_features = X_train.select_dtypes(include=np.number).columns.tolist()

    for col in categorical_features:
        le = LabelEncoder()
        X_train[col] = le.fit_transform(X_train[col])
        X_test[col] = le.transform(X_test[col])
        X_train_lgbm[col] = X_train_lgbm[col].astype('category')
        X_test_lgbm[col] = X_test_lgbm[col].astype('category')

    scaler = QuantileTransformer(output_distribution='normal', random_state=42)
    X_train[numerical_features] = scaler.fit_transform(X_train[numerical_features])
    X_test[numerical_features] = scaler.transform(X_test[numerical_features])

    cat_idxs = [X_train.columns.get_loc(col) for col in categorical_features]
    cat_dims = [X_train[col].nunique() for col in categorical_features]

    X_train_tabnet_np = X_train.values
    X_test_tabnet_np = X_test.values
    y_train_np = y_train.values
    y_test_np = y_test.values

    # Class imbalance handling
    scale_pos_weight = y_train.value_counts()[0] / y_train.value_counts()[1]
    print(f"Calculated scale_pos_weight for LightGBM: {scale_pos_weight:.2f}")

    # K-Fold Training
    N_SPLITS = 5
    skf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=42)
    test_preds_tabnet, test_preds_lgbm = [], []

    print(f"\n⏳ Starting {N_SPLITS}-Fold Ensemble Training...")
    for fold, (train_idx, val_idx) in enumerate(skf.split(X_train, y_train_np)):
        print(f"\n--- Fold {fold+1}/{N_SPLITS} ---")

        # TabNet
        tn_clf = TabNetClassifier(
            cat_idxs=cat_idxs, cat_dims=cat_dims, n_d=16, n_a=16, n_steps=4,
            gamma=1.5, lambda_sparse=1e-4,
            optimizer_fn=torch.optim.AdamW, optimizer_params=dict(lr=2e-2),
            scheduler_fn=torch.optim.lr_scheduler.ReduceLROnPlateau,
            scheduler_params={"mode": "max", "factor": 0.5, "patience": 5},
            mask_type='entmax', device_name='cuda' if torch.cuda.is_available() else 'cpu',
            verbose=0
        )
        tn_clf.fit(
            X_train_tabnet_np[train_idx], y_train_np[train_idx],
            eval_set=[(X_train_tabnet_np[val_idx], y_train_np[val_idx])],
            patience=20, batch_size=1024, virtual_batch_size=256, weights=1, max_epochs=100
        )
        test_preds_tabnet.append(tn_clf.predict_proba(X_test_tabnet_np)[:, 1])

        # LightGBM
        lgbm_params = {
            'objective': 'binary', 'metric': 'auc', 'boosting_type': 'gbdt',
            'colsample_bytree': 0.662, 'learning_rate': 0.0216, 'n_estimators': 1000,
            'num_leaves': 55, 'subsample': 0.657, 'scale_pos_weight': scale_pos_weight,
            'random_state': 42, 'n_jobs': -1, 'verbose': -1
        }
        lgbm_clf = lgb.LGBMClassifier(**lgbm_params)
        lgbm_clf.fit(
            X_train_lgbm.iloc[train_idx], y_train.iloc[train_idx],
            eval_set=[(X_train_lgbm.iloc[val_idx], y_train.iloc[val_idx])],
            eval_metric='auc',
            callbacks=[lgb.early_stopping(100, verbose=False)]
        )
        test_preds_lgbm.append(lgbm_clf.predict_proba(X_test_lgbm)[:, 1])

    # Evaluation
    final_test_preds_tabnet = np.mean(test_preds_tabnet, axis=0)
    final_test_preds_lgbm = np.mean(test_preds_lgbm, axis=0)
    final_test_preds_ensemble = (final_test_preds_tabnet + final_test_preds_lgbm) / 2

    print("\n🏆 FINAL PERFORMANCE (Hold-Out Set)")
    acc_ensemble = accuracy_score(y_test_np, (final_test_preds_ensemble > 0.5).astype(int))
    auc_ensemble = roc_auc_score(y_test_np, final_test_preds_ensemble)
    print(f"Ensemble Accuracy: {acc_ensemble:.4f} | AUC: {auc_ensemble:.4f}")
    print("\nClassification Report:")
    print(classification_report(y_test_np, (final_test_preds_ensemble > 0.5).astype(int)))


# --- Submission Phase ---
def generate_submission2():
    print("\n📥 Training full ultimate ensemble on train.csv and predicting test.csv...")

    df_train = pd.read_csv("train.csv")
    df_test = pd.read_csv("test.csv")

    TARGET_COLUMN = "song_popularity"
    X_full = engineer_features(df_train.drop(TARGET_COLUMN, axis=1))
    y_full = df_train[TARGET_COLUMN]
    X_test = engineer_features(df_test.copy())
    ids = df_test["id"]

    # Preprocessing for both models
    X_full_lgbm, X_test_lgbm = X_full.copy(), X_test.copy()
    categorical_features = X_full.select_dtypes(include=['category', 'object']).columns.tolist()
    numerical_features = X_full.select_dtypes(include=np.number).columns.tolist()

    for col in categorical_features:
        le = LabelEncoder()
        X_full[col] = le.fit_transform(X_full[col])
        X_test[col] = le.transform(X_test[col])
        X_full_lgbm[col] = X_full_lgbm[col].astype('category')
        X_test_lgbm[col] = X_test_lgbm[col].astype('category')

    scaler = QuantileTransformer(output_distribution='normal', random_state=42)
    X_full[numerical_features] = scaler.fit_transform(X_full[numerical_features])
    X_test[numerical_features] = scaler.transform(X_test[numerical_features])

    cat_idxs = [X_full.columns.get_loc(col) for col in categorical_features]
    cat_dims = [X_full[col].nunique() for col in categorical_features]

    X_full_tabnet_np = X_full.values
    y_full_np = y_full.values
    X_test_tabnet_np = X_test.values

    scale_pos_weight = y_full.value_counts()[0] / y_full.value_counts()[1]

    # K-Fold Training on full dataset
    N_SPLITS = 5
    skf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=42)
    test_preds_tabnet, test_preds_lgbm = [], []

    for fold, (train_idx, val_idx) in enumerate(skf.split(X_full, y_full_np)):
        print(f"--- Fold {fold+1}/{N_SPLITS} ---")

        # TabNet
        tn_clf = TabNetClassifier(
            cat_idxs=cat_idxs, cat_dims=cat_dims, n_d=16, n_a=16, n_steps=4,
            gamma=1.5, lambda_sparse=1e-4,
            optimizer_fn=torch.optim.AdamW, optimizer_params=dict(lr=2e-2),
            scheduler_fn=torch.optim.lr_scheduler.ReduceLROnPlateau,
            scheduler_params={"mode": "max", "factor": 0.5, "patience": 5},
            mask_type='entmax', device_name='cuda' if torch.cuda.is_available() else 'cpu',
            verbose=0
        )
        tn_clf.fit(
            X_full_tabnet_np[train_idx], y_full_np[train_idx],
            eval_set=[(X_full_tabnet_np[val_idx], y_full_np[val_idx])],
            patience=20, batch_size=1024, virtual_batch_size=256, weights=1, max_epochs=100
        )
        test_preds_tabnet.append(tn_clf.predict_proba(X_test_tabnet_np)[:, 1])

        # LightGBM
        lgbm_params = {
            'objective': 'binary', 'metric': 'auc', 'boosting_type': 'gbdt',
            'colsample_bytree': 0.662, 'learning_rate': 0.0216, 'n_estimators': 1000,
            'num_leaves': 55, 'subsample': 0.657, 'scale_pos_weight': scale_pos_weight,
            'random_state': 42, 'n_jobs': -1, 'verbose': -1
        }
        lgbm_clf = lgb.LGBMClassifier(**lgbm_params)
        lgbm_clf.fit(
            X_full_lgbm.iloc[train_idx], y_full.iloc[train_idx],
            eval_set=[(X_full_lgbm.iloc[val_idx], y_full.iloc[val_idx])],
            eval_metric='auc',
            callbacks=[lgb.early_stopping(100, verbose=False)]
        )
        test_preds_lgbm.append(lgbm_clf.predict_proba(X_test_lgbm)[:, 1])

    # Ensemble prediction
    final_test_preds_tabnet = np.mean(test_preds_tabnet, axis=0)
    final_test_preds_lgbm = np.mean(test_preds_lgbm, axis=0)
    final_test_preds_ensemble = (final_test_preds_tabnet + final_test_preds_lgbm) / 2
    y_pred = (final_test_preds_ensemble > 0.5).astype(int)

    # Save submission
    submission = pd.DataFrame({
        "id": ids,
        "song_popularity": y_pred
    })
    submission.to_csv("Submission2.csv", index=False)
    print("✅ Submission2.csv created successfully!")


if __name__ == "__main__":
    run_ultimate_ensemble_pipeline()
    generate_submission2()


📊 Data split: 24000 training samples, 6000 hold-out test samples.


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_out[col].fillna(df_out[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_out[col].fillna(df_out[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we ar

Calculated scale_pos_weight for LightGBM: 1.74

⏳ Starting 5-Fold Ensemble Training...

--- Fold 1/5 ---

Early stopping occurred at epoch 52 with best_epoch = 32 and best_val_0_auc = 0.54739





--- Fold 2/5 ---

Early stopping occurred at epoch 52 with best_epoch = 32 and best_val_0_auc = 0.56332





--- Fold 3/5 ---

Early stopping occurred at epoch 24 with best_epoch = 4 and best_val_0_auc = 0.55046





--- Fold 4/5 ---

Early stopping occurred at epoch 35 with best_epoch = 15 and best_val_0_auc = 0.57184





--- Fold 5/5 ---

Early stopping occurred at epoch 22 with best_epoch = 2 and best_val_0_auc = 0.55134





🏆 FINAL PERFORMANCE (Hold-Out Set)
Ensemble Accuracy: 0.5880 | AUC: 0.5684

Classification Report:
              precision    recall  f1-score   support

           0       0.66      0.74      0.70      3814
           1       0.42      0.32      0.36      2186

    accuracy                           0.59      6000
   macro avg       0.54      0.53      0.53      6000
weighted avg       0.57      0.59      0.57      6000


📥 Training full ultimate ensemble on train.csv and predicting test.csv...


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_out[col].fillna(df_out[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_out[col].fillna(df_out[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we ar

--- Fold 1/5 ---

Early stopping occurred at epoch 32 with best_epoch = 12 and best_val_0_auc = 0.55046




--- Fold 2/5 ---

Early stopping occurred at epoch 34 with best_epoch = 14 and best_val_0_auc = 0.56672




--- Fold 3/5 ---

Early stopping occurred at epoch 37 with best_epoch = 17 and best_val_0_auc = 0.57663




--- Fold 4/5 ---

Early stopping occurred at epoch 71 with best_epoch = 51 and best_val_0_auc = 0.56599




--- Fold 5/5 ---

Early stopping occurred at epoch 28 with best_epoch = 8 and best_val_0_auc = 0.55325




✅ Submission2.csv created successfully!
