In [6]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix
import warnings

warnings.filterwarnings("ignore")

def engineer_features(df_in, df_ref=None):
    # This function is correct and does not need changes.
    df_out = df_in.copy()
    if df_ref is None: df_ref = df_out
    if 'id' in df_out.columns: df_out.drop('id', axis=1, inplace=True)
    df_out['loudness_scaled'] = (df_out['loudness'] - df_ref['loudness'].min()) / (df_ref['loudness'].max() - df_ref['loudness'].min())
    df_out['duration_log'] = np.log1p(df_out['song_duration_ms'])
    df_out['tempo_log'] = np.log1p(df_out['tempo'])
    df_out['mood_index'] = df_out['audio_valence'] * df_out['energy']
    df_out['valence_category'] = pd.cut(df_out['audio_valence'], bins=[0, 0.25, 0.5, 0.75, 1.0], labels=['Very Sad', 'Sad', 'Happy', 'Very Happy'])
    df_out['speech_to_acoustic'] = df_out['speechiness'] / (df_out['acousticness'] + 1e-6)
    df_out['talky_song_index'] = df_out['speechiness'] / (df_out['instrumentalness'] + 1e-6)
    df_out['key_mode_signature'] = df_out['key'].astype(str) + '_' + df_out['audio_mode'].astype(str) + '_' + df_out['time_signature'].astype(str)
    df_out['tempo_dance_prod'] = df_out['tempo'] * df_out['danceability']
    df_out['energy_sq'] = df_out['energy'] ** 2
    df_out['danceability_sq'] = df_out['danceability'] ** 2
    df_out['vocal_focus'] = 1 - df_out['instrumentalness']
    df_out['studio_polish'] = 1 - df_out['liveness']
    cat_cols = ['key', 'audio_mode', 'time_signature', 'valence_category', 'key_mode_signature']
    for col in cat_cols:
        if col in df_out.columns: df_out[col] = df_out[col].astype('category')
    df_out.drop(['loudness', 'song_duration_ms'], axis=1, inplace=True, errors='ignore')
    return df_out

def train_xgb_ensemble_and_predict():
    # --- 1. Load and Prepare Data ---
    print("STEP 1: Loading and preparing data...")
    df_train = pd.read_csv("train.csv")
    df_test = pd.read_csv("test.csv")
    ids = df_test['id']
    TARGET_COLUMN = "song_popularity"

    X = engineer_features(df_train.drop(TARGET_COLUMN, axis=1), df_train)
    y = df_train[TARGET_COLUMN]
    X_test = engineer_features(df_test, df_train)

    train_cols = X.columns
    test_cols = X_test.columns
    missing_in_test = set(train_cols) - set(test_cols)
    for c in missing_in_test:
        X_test[c] = 0
    X_test = X_test[train_cols]
    
    numerical_features = X.select_dtypes(include=np.number).columns.tolist()
    categorical_features = X.select_dtypes(include=['category']).columns.tolist()
    print("✅ Data preparation complete.")

    # --- 2. Define XGBoost Model Configurations ---
    print("\nSTEP 2: Defining XGBoost models...")
    
    # NEW: Calculate the scale_pos_weight for balancing classes
    class_counts = y.value_counts()
    scale_pos_weight_value = class_counts[0] / class_counts[1]
    print(f"⚖️ Class Weight (scale_pos_weight) calculated as: {scale_pos_weight_value:.2f}")
    
    common_params = {
        'objective': 'binary:logistic', 'eval_metric': 'logloss',
        'n_estimators': 2000, 'early_stopping_rounds': 50,
        'use_label_encoder': False, 'seed': 42,
        'scale_pos_weight': scale_pos_weight_value  # MODIFIED: Added balanced weight
    }
    
    xgb_models_params = [
        {**common_params, 'learning_rate': 0.05, 'max_depth': 5, 'subsample': 0.8, 'colsample_bytree': 0.8},
        {**common_params, 'learning_rate': 0.1, 'max_depth': 3, 'subsample': 0.6, 'colsample_bytree': 0.6},
        {**common_params, 'learning_rate': 0.02, 'max_depth': 4, 'subsample': 0.9, 'colsample_bytree': 0.7}
    ]
    preprocessor = ColumnTransformer(transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_features)
    ], remainder='passthrough')
    print("✅ Models defined.")

    # --- 3. Train Ensemble with Cross-Validation ---
    print("\nSTEP 3: Training XGBoost ensemble with 10-fold CV...")
    cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
    
    fold_accuracies, fold_aucs = [], []
    total_cm = np.zeros((2, 2), dtype=int)
    final_test_predictions = []

    for fold, (train_idx, val_idx) in enumerate(cv.split(X, y)):
        X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
        X_val, y_val = X.iloc[val_idx], y.iloc[val_idx]
        
        X_train_processed = preprocessor.fit_transform(X_train)
        X_val_processed = preprocessor.transform(X_val)

        fold_val_preds, fold_test_preds = [], []
        
        for i, params in enumerate(xgb_models_params):
            xgb_model = xgb.XGBClassifier(**params)
            xgb_model.fit(X_train_processed, y_train, eval_set=[(X_val_processed, y_val)], verbose=False)
            fold_val_preds.append(xgb_model.predict_proba(X_val_processed)[:, 1])
            X_test_processed = preprocessor.transform(X_test)
            fold_test_preds.append(xgb_model.predict_proba(X_test_processed)[:, 1])

        avg_val_proba = np.mean(fold_val_preds, axis=0)
        blend_val_labels = np.round(avg_val_proba)
        
        acc_blend = accuracy_score(y_val, blend_val_labels)
        auc_blend = roc_auc_score(y_val, avg_val_proba)
        cm = confusion_matrix(y_val, blend_val_labels)
        
        fold_accuracies.append(acc_blend)
        fold_aucs.append(auc_blend)
        total_cm += cm
        
        print(f"--- Fold {fold+1}/10 | BLEND Acc: {acc_blend:.4f} | BLEND AUC: {auc_blend:.4f} ---")
        print(f"Confusion Matrix:\n{cm}\n")
        
        avg_test_proba = np.mean(fold_test_preds, axis=0)
        final_test_predictions.append(avg_test_proba)

    # --- 4. Report Final Averages & Metrics ---
    print("\n" + "="*50)
    print("🏆 FINAL VALIDATION METRICS 🏆")
    print("="*50)
    
    print(f"Average Accuracy: {np.mean(fold_accuracies):.4f} ± {np.std(fold_accuracies):.4f}")
    print(f"Average AUC Score: {np.mean(fold_aucs):.4f} ± {np.std(fold_aucs):.4f}")
    
    print("\nTotal Confusion Matrix (across all folds):")
    print(total_cm)
    
    tn, fp, fn, tp = total_cm.ravel()
    print(f"\nTrue Negatives (TN): {tn} (Correctly predicted 'unpopular')")
    print(f"False Positives (FP): {fp} (Incorrectly predicted 'popular')")
    print(f"False Negatives (FN): {fn} (Incorrectly predicted 'unpopular')")
    print(f"True Positives (TP): {tp} (Correctly predicted 'popular')")
    print("="*50 + "\n")

    # --- 5. Create Submission File ---
    final_preds = np.mean(final_test_predictions, axis=0)
    final_labels = np.round(final_preds).astype(int)

    submission = pd.DataFrame({"id": ids, "song_popularity": final_labels})
    submission.to_csv("Submission15_XGB_Balanced.csv", index=False)
    print("✅ Submission15_XGB_Balanced.csv created successfully!")

if __name__ == "__main__":
    train_xgb_ensemble_and_predict()

STEP 1: Loading and preparing data...
✅ Data preparation complete.

STEP 2: Defining XGBoost models...
⚖️ Class Weight (scale_pos_weight) calculated as: 1.74
✅ Models defined.

STEP 3: Training XGBoost ensemble with 10-fold CV...
--- Fold 1/10 | BLEND Acc: 0.5603 | BLEND AUC: 0.5796 ---
Confusion Matrix:
[[1111  796]
 [ 523  570]]

--- Fold 2/10 | BLEND Acc: 0.5477 | BLEND AUC: 0.5652 ---
Confusion Matrix:
[[1029  878]
 [ 479  614]]

--- Fold 3/10 | BLEND Acc: 0.5760 | BLEND AUC: 0.5829 ---
Confusion Matrix:
[[1134  773]
 [ 499  594]]

--- Fold 4/10 | BLEND Acc: 0.5573 | BLEND AUC: 0.5825 ---
Confusion Matrix:
[[1066  841]
 [ 487  606]]

--- Fold 5/10 | BLEND Acc: 0.5627 | BLEND AUC: 0.5859 ---
Confusion Matrix:
[[1080  827]
 [ 485  608]]

--- Fold 6/10 | BLEND Acc: 0.5630 | BLEND AUC: 0.5903 ---
Confusion Matrix:
[[1090  817]
 [ 494  599]]

--- Fold 7/10 | BLEND Acc: 0.5373 | BLEND AUC: 0.5598 ---
Confusion Matrix:
[[1021  886]
 [ 502  591]]

--- Fold 8/10 | BLEND Acc: 0.5653 | BLEND 

In [8]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix
import warnings

warnings.filterwarnings("ignore")

def engineer_features(df_in, df_ref=None):
    # This function is correct and does not need changes.
    df_out = df_in.copy()
    if df_ref is None: df_ref = df_out
    if 'id' in df_out.columns: df_out.drop('id', axis=1, inplace=True)
    df_out['loudness_scaled'] = (df_out['loudness'] - df_ref['loudness'].min()) / (df_ref['loudness'].max() - df_ref['loudness'].min())
    df_out['duration_log'] = np.log1p(df_out['song_duration_ms'])
    df_out['tempo_log'] = np.log1p(df_out['tempo'])
    df_out['mood_index'] = df_out['audio_valence'] * df_out['energy']
    df_out['valence_category'] = pd.cut(df_out['audio_valence'], bins=[0, 0.25, 0.5, 0.75, 1.0], labels=['Very Sad', 'Sad', 'Happy', 'Very Happy'])
    df_out['speech_to_acoustic'] = df_out['speechiness'] / (df_out['acousticness'] + 1e-6)
    df_out['talky_song_index'] = df_out['speechiness'] / (df_out['instrumentalness'] + 1e-6)
    df_out['key_mode_signature'] = df_out['key'].astype(str) + '_' + df_out['audio_mode'].astype(str) + '_' + df_out['time_signature'].astype(str)
    df_out['tempo_dance_prod'] = df_out['tempo'] * df_out['danceability']
    df_out['energy_sq'] = df_out['energy'] ** 2
    df_out['danceability_sq'] = df_out['danceability'] ** 2
    df_out['vocal_focus'] = 1 - df_out['instrumentalness']
    df_out['studio_polish'] = 1 - df_out['liveness']
    cat_cols = ['key', 'audio_mode', 'time_signature', 'valence_category', 'key_mode_signature']
    for col in cat_cols:
        if col in df_out.columns: df_out[col] = df_out[col].astype('category')
    df_out.drop(['loudness', 'song_duration_ms'], axis=1, inplace=True, errors='ignore')
    return df_out

def train_lgbm_balanced_and_predict():
    # --- 1. Load and Prepare Data ---
    print("STEP 1: Loading and preparing data for LightGBM...")
    df_train = pd.read_csv("train.csv")
    df_test = pd.read_csv("test.csv")
    ids = df_test['id']
    TARGET_COLUMN = "song_popularity"

    X = engineer_features(df_train.drop(TARGET_COLUMN, axis=1), df_train)
    y = df_train[TARGET_COLUMN]
    X_test = engineer_features(df_test, df_train)

    train_cols = X.columns
    test_cols = X_test.columns
    missing_in_test = set(train_cols) - set(test_cols)
    for c in missing_in_test:
        X_test[c] = 0
    X_test = X_test[train_cols]
    
    numerical_features = X.select_dtypes(include=np.number).columns.tolist()
    categorical_features = X.select_dtypes(include=['category']).columns.tolist()
    print("✅ Data preparation complete.")

    # --- 2. Define Balanced LightGBM Model ---
    print("\nSTEP 2: Defining balanced LightGBM model...")
    class_counts = y.value_counts()
    scale_pos_weight_value = class_counts[0] / class_counts[1]
    print(f"⚖️ Class Weight (scale_pos_weight) calculated as: {scale_pos_weight_value:.2f}")
    
    lgbm_params = {
        'objective': 'binary', 'random_state': 42, 'n_jobs': -1,
        'learning_rate': 0.031, 'n_estimators': 640, 'num_leaves': 44,
        'max_depth': 2, 'min_child_samples': 44, 'subsample': 0.805,
        'colsample_bytree': 0.594,
        'scale_pos_weight': scale_pos_weight_value # Apply the balanced weight
    }
    lgbm_model = lgb.LGBMClassifier(**lgbm_params)
    
    preprocessor = ColumnTransformer(transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_features)
    ], remainder='passthrough')
    
    pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('classifier', lgbm_model)])
    print("✅ Model defined.")

    # --- 3. Train and Evaluate with Cross-Validation ---
    print("\nSTEP 3: Training LightGBM with 10-fold CV...")
    cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
    
    fold_accuracies, fold_aucs = [], []
    total_cm = np.zeros((2, 2), dtype=int)
    final_test_predictions = []

    for fold, (train_idx, val_idx) in enumerate(cv.split(X, y)):
        X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
        X_val, y_val = X.iloc[val_idx], y.iloc[val_idx]
        
        pipeline.fit(X_train, y_train)
        
        val_proba = pipeline.predict_proba(X_val)[:, 1]
        val_labels = np.round(val_proba)
        
        acc = accuracy_score(y_val, val_labels)
        auc = roc_auc_score(y_val, val_proba)
        cm = confusion_matrix(y_val, val_labels)
        
        fold_accuracies.append(acc)
        fold_aucs.append(auc)
        total_cm += cm
        
        print(f"--- Fold {fold+1}/10 | Acc: {acc:.4f} | AUC: {auc:.4f} ---")
        print(f"Confusion Matrix:\n{cm}\n")
        
        test_proba = pipeline.predict_proba(X_test)[:, 1]
        final_test_predictions.append(test_proba)

    # --- 4. Report Final Averages & Metrics ---
    print("\n" + "="*50)
    print("🏆 FINAL VALIDATION METRICS (LGBM) 🏆")
    print("="*50)
    print(f"Average Accuracy: {np.mean(fold_accuracies):.4f} ± {np.std(fold_accuracies):.4f}")
    print(f"Average AUC Score: {np.mean(fold_aucs):.4f} ± {np.std(fold_aucs):.4f}")
    print("\nTotal Confusion Matrix (across all folds):")
    print(total_cm)
    tn, fp, fn, tp = total_cm.ravel()
    print(f"\nTrue Negatives (TN): {tn}\nFalse Positives (FP): {fp}\nFalse Negatives (FN): {fn}\nTrue Positives (TP): {tp}")
    print("="*50 + "\n")

    # --- 5. Create Submission File ---
    final_preds = np.mean(final_test_predictions, axis=0)
    final_labels = np.round(final_preds).astype(int)
    submission = pd.DataFrame({"id": ids, "song_popularity": final_labels})
    submission.to_csv("Submission3.csv", index=False)
    print("✅ Submission3.csv created successfully!")

if __name__ == "__main__":
    train_lgbm_balanced_and_predict()

STEP 1: Loading and preparing data for LightGBM...
✅ Data preparation complete.

STEP 2: Defining balanced LightGBM model...
⚖️ Class Weight (scale_pos_weight) calculated as: 1.74
✅ Model defined.

STEP 3: Training LightGBM with 10-fold CV...
[LightGBM] [Info] Number of positive: 9839, number of negative: 17161
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003689 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4989
[LightGBM] [Info] Number of data points in the train set: 27000, number of used features: 91
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.364407 -> initscore=-0.556285
[LightGBM] [Info] Start training from score -0.556285
--- Fold 1/10 | Acc: 0.5527 | AUC: 0.5772 ---
Confusion Matrix:
[[1059  848]
 [ 494  599]]

[LightGBM] [Info] Number of positive: 9839, number of negative: 17161
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001811 seconds

In [9]:
import pandas as pd
import numpy as np
import catboost as cb
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix
import warnings

warnings.filterwarnings("ignore")

def engineer_features(df_in, df_ref=None):
    # This function is correct and does not need changes.
    df_out = df_in.copy()
    if df_ref is None: df_ref = df_out
    if 'id' in df_out.columns: df_out.drop('id', axis=1, inplace=True)
    df_out['loudness_scaled'] = (df_out['loudness'] - df_ref['loudness'].min()) / (df_ref['loudness'].max() - df_ref['loudness'].min())
    df_out['duration_log'] = np.log1p(df_out['song_duration_ms'])
    df_out['tempo_log'] = np.log1p(df_out['tempo'])
    df_out['mood_index'] = df_out['audio_valence'] * df_out['energy']
    df_out['valence_category'] = pd.cut(df_out['audio_valence'], bins=[0, 0.25, 0.5, 0.75, 1.0], labels=['Very Sad', 'Sad', 'Happy', 'Very Happy'])
    df_out['speech_to_acoustic'] = df_out['speechiness'] / (df_out['acousticness'] + 1e-6)
    df_out['talky_song_index'] = df_out['speechiness'] / (df_out['instrumentalness'] + 1e-6)
    df_out['key_mode_signature'] = df_out['key'].astype(str) + '_' + df_out['audio_mode'].astype(str) + '_' + df_out['time_signature'].astype(str)
    df_out['tempo_dance_prod'] = df_out['tempo'] * df_out['danceability']
    df_out['energy_sq'] = df_out['energy'] ** 2
    df_out['danceability_sq'] = df_out['danceability'] ** 2
    df_out['vocal_focus'] = 1 - df_out['instrumentalness']
    df_out['studio_polish'] = 1 - df_out['liveness']
    cat_cols = ['key', 'audio_mode', 'time_signature', 'valence_category', 'key_mode_signature']
    for col in cat_cols:
        if col in df_out.columns: df_out[col] = df_out[col].astype('category')
    df_out.drop(['loudness', 'song_duration_ms'], axis=1, inplace=True, errors='ignore')
    return df_out

def train_catboost_balanced_and_predict():
    # --- 1. Load and Prepare Data ---
    print("STEP 1: Loading and preparing data for CatBoost...")
    df_train = pd.read_csv("train.csv")
    df_test = pd.read_csv("test.csv")
    ids = df_test['id']
    TARGET_COLUMN = "song_popularity"

    X = engineer_features(df_train.drop(TARGET_COLUMN, axis=1), df_train)
    y = df_train[TARGET_COLUMN]
    X_test = engineer_features(df_test, df_train)

    train_cols = X.columns
    test_cols = X_test.columns
    missing_in_test = set(train_cols) - set(test_cols)
    for c in missing_in_test:
        X_test[c] = 0
    X_test = X_test[train_cols]
    
    numerical_features = X.select_dtypes(include=np.number).columns.tolist()
    categorical_features = X.select_dtypes(include=['category']).columns.tolist()
    print("✅ Data preparation complete.")

    # --- 2. Define Balanced CatBoost Model ---
    print("\nSTEP 2: Defining balanced CatBoost model...")
    catboost_model = cb.CatBoostClassifier(
        iterations=2000, learning_rate=0.05, depth=6,
        loss_function='Logloss', eval_metric='Accuracy',
        random_seed=42, verbose=0,
        auto_class_weights='Balanced', # Apply the balanced weight
        early_stopping_rounds=50
    )
    print("✅ Model defined.")

    # --- 3. Train and Evaluate with Cross-Validation ---
    print("\nSTEP 3: Training CatBoost with 10-fold CV...")
    cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
    
    fold_accuracies, fold_aucs = [], []
    total_cm = np.zeros((2, 2), dtype=int)
    final_test_predictions = []

    for fold, (train_idx, val_idx) in enumerate(cv.split(X, y)):
        X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
        X_val, y_val = X.iloc[val_idx], y.iloc[val_idx]
        
        scaler = StandardScaler()
        X_train_scaled_num = pd.DataFrame(scaler.fit_transform(X_train[numerical_features]), columns=numerical_features, index=X_train.index)
        X_val_scaled_num = pd.DataFrame(scaler.transform(X_val[numerical_features]), columns=numerical_features, index=X_val.index)
        
        X_train_processed = pd.concat([X_train_scaled_num, X_train[categorical_features]], axis=1)
        X_val_processed = pd.concat([X_val_scaled_num, X_val[categorical_features]], axis=1)
        
        for col in categorical_features:
            X_train_processed[col] = X_train_processed[col].astype(str)
            X_val_processed[col] = X_val_processed[col].astype(str)
            
        catboost_model.fit(X_train_processed, y_train, 
                           cat_features=categorical_features, 
                           eval_set=(X_val_processed, y_val))
                           
        val_proba = catboost_model.predict_proba(X_val_processed)[:, 1]
        val_labels = np.round(val_proba)
        
        acc = accuracy_score(y_val, val_labels)
        auc = roc_auc_score(y_val, val_proba)
        cm = confusion_matrix(y_val, val_labels)
        
        fold_accuracies.append(acc)
        fold_aucs.append(auc)
        total_cm += cm
        
        print(f"--- Fold {fold+1}/10 | Acc: {acc:.4f} | AUC: {auc:.4f} ---")
        print(f"Confusion Matrix:\n{cm}\n")
        
        X_test_scaled_num = pd.DataFrame(scaler.transform(X_test[numerical_features]), columns=numerical_features, index=X_test.index)
        X_test_processed = pd.concat([X_test_scaled_num, X_test[categorical_features]], axis=1)
        for col in categorical_features:
            X_test_processed[col] = X_test_processed[col].astype(str)
        test_proba = catboost_model.predict_proba(X_test_processed)[:, 1]
        final_test_predictions.append(test_proba)

    # --- 4. Report Final Averages & Metrics ---
    print("\n" + "="*50)
    print("🏆 FINAL VALIDATION METRICS (CatBoost) 🏆")
    print("="*50)
    print(f"Average Accuracy: {np.mean(fold_accuracies):.4f} ± {np.std(fold_accuracies):.4f}")
    print(f"Average AUC Score: {np.mean(fold_aucs):.4f} ± {np.std(fold_aucs):.4f}")
    print("\nTotal Confusion Matrix (across all folds):")
    print(total_cm)
    tn, fp, fn, tp = total_cm.ravel()
    print(f"\nTrue Negatives (TN): {tn}\nFalse Positives (FP): {fp}\nFalse Negatives (FN): {fn}\nTrue Positives (TP): {tp}")
    print("="*50 + "\n")

    # --- 5. Create Submission File ---
    final_preds = np.mean(final_test_predictions, axis=0)
    final_labels = np.round(final_preds).astype(int)
    submission = pd.DataFrame({"id": ids, "song_popularity": final_labels})
    submission.to_csv("Submission17_CatBoost_Balanced.csv", index=False)
    print("✅ Submission17_CatBoost_Balanced.csv created successfully!")

if __name__ == "__main__":
    train_catboost_balanced_and_predict()

STEP 1: Loading and preparing data for CatBoost...
✅ Data preparation complete.

STEP 2: Defining balanced CatBoost model...
✅ Model defined.

STEP 3: Training CatBoost with 10-fold CV...
--- Fold 1/10 | Acc: 0.5393 | AUC: 0.5616 ---
Confusion Matrix:
[[1001  906]
 [ 476  617]]

--- Fold 2/10 | Acc: 0.5290 | AUC: 0.5635 ---
Confusion Matrix:
[[932 975]
 [438 655]]

--- Fold 3/10 | Acc: 0.5523 | AUC: 0.5635 ---
Confusion Matrix:
[[1012  895]
 [ 448  645]]

--- Fold 4/10 | Acc: 0.5503 | AUC: 0.5717 ---
Confusion Matrix:
[[1001  906]
 [ 443  650]]

--- Fold 5/10 | Acc: 0.5500 | AUC: 0.5790 ---
Confusion Matrix:
[[995 912]
 [438 655]]

--- Fold 6/10 | Acc: 0.5620 | AUC: 0.5880 ---
Confusion Matrix:
[[1040  867]
 [ 447  646]]

--- Fold 7/10 | Acc: 0.5470 | AUC: 0.5645 ---
Confusion Matrix:
[[999 908]
 [451 642]]

--- Fold 8/10 | Acc: 0.5590 | AUC: 0.5775 ---
Confusion Matrix:
[[1047  860]
 [ 463  630]]

--- Fold 9/10 | Acc: 0.5297 | AUC: 0.5490 ---
Confusion Matrix:
[[958 948]
 [463 631]]



In [None]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix
from sklearn.utils import shuffle
from sklearn.impute import SimpleImputer
import warnings

warnings.filterwarnings("ignore")

def print_metrics(y_true, y_pred_proba, model_name):
    """A helper function to print a full report for a model's predictions."""
    y_pred_labels = np.round(y_pred_proba)
    acc = accuracy_score(y_true, y_pred_labels)
    auc = roc_auc_score(y_true, y_pred_proba)
    cm = confusion_matrix(y_true, y_pred_labels)
    tn, fp, fn, tp = cm.ravel()
    
    print("-" * 50)
    print(f"🏆 METRICS FOR: {model_name} 🏆")
    print("="*50)
    print(f"Average Accuracy: {acc:.4f}")
    print(f"Average AUC Score:  {auc:.4f}")
    print("\nTotal Confusion Matrix:")
    print(cm)
    print(f"\nTN: {tn} | FP: {fp} | FN: {fn} | TP: {tp}")
    print("-" * 50 + "\n")


def run_10_model_xgb_ensemble():
    # --- 1. Load and Undersample Data ---
    print("STEP 1: Loading and undersampling data...")
    df_train_full = pd.read_csv("train.csv")
    df_test = pd.read_csv("test.csv")
    ids = df_test['id']
    TARGET_COLUMN = "song_popularity"

    df_majority = df_train_full[df_train_full[TARGET_COLUMN] == 0]
    df_minority = df_train_full[df_train_full[TARGET_COLUMN] == 1]
    
    n_minority = len(df_minority)
    df_majority_downsampled = df_majority.sample(n=n_minority, random_state=42)
    
    df_train = pd.concat([df_majority_downsampled, df_minority])
    df_train = shuffle(df_train, random_state=42)
    print(f"Using balanced training data size: {len(df_train)}")

    # --- 2. Prepare Data (NO FEATURE ENGINEERING) ---
    print("\nSTEP 2: Preparing raw data and defining preprocessor...")
    
    X = df_train.drop(columns=['id', TARGET_COLUMN])
    y = df_train[TARGET_COLUMN]
    X_test = df_test.drop(columns=['id'])

    train_cols = X.columns
    test_cols = X_test.columns
    missing_in_test = set(train_cols) - set(test_cols)
    for c in missing_in_test:
        X_test[c] = 0
    X_test = X_test[train_cols]
    
    categorical_features = ['key', 'audio_mode', 'time_signature']
    numerical_features = [col for col in X.columns if col not in categorical_features]
    
    numeric_transformer = Pipeline(steps=[('imputer', SimpleImputer(strategy='median')), ('scaler', StandardScaler())])
    categorical_transformer = Pipeline(steps=[('imputer', SimpleImputer(strategy='most_frequent')), ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))])
    preprocessor = ColumnTransformer(transformers=[('num', numeric_transformer, numerical_features), ('cat', categorical_transformer, categorical_features)])
    print("✅ Data preparation complete.")

    # --- 3. Define a Single, Strong XGBoost Configuration ---
    print("\nSTEP 3: Defining XGBoost base model...")
    # We will use this same configuration 10 times with different random seeds
    xgb_params = {
        'objective': 'binary:logistic',
        'eval_metric': 'logloss',
        'use_label_encoder': False,
        'learning_rate': 0.05,
        'max_depth': 5,
        'n_estimators': 1000, # Increased estimators for a stronger base model
        'subsample': 0.8,
        'colsample_bytree': 0.8,
        'early_stopping_rounds': 50
    }
    print("✅ Model defined.")

    # --- 4. Train 10-Model Ensemble with Cross-Validation ---
    print("\nSTEP 4: Training 10-model XGBoost ensemble with 10-fold CV...")
    cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
    
    oof_preds = []
    oof_true = []
    final_test_predictions = []

    for fold, (train_idx, val_idx) in enumerate(cv.split(X, y)):
        print(f"--- Processing Fold {fold+1}/10 ---")
        X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
        X_val, y_val = X.iloc[val_idx], y.iloc[val_idx]
        
        oof_true.extend(y_val)
        
        X_train_processed = preprocessor.fit_transform(X_train)
        X_val_processed = preprocessor.transform(X_val)
        X_test_processed = preprocessor.transform(X_test)

        fold_val_preds = []
        fold_test_preds = []
        
        # Inner loop: train 10 models with different seeds
        for i in range(10):
            model = xgb.XGBClassifier(**xgb_params, seed=i) # Use loop index as the seed
            model.fit(X_train_processed, y_train, eval_set=[(X_val_processed, y_val)], verbose=False)
            
            fold_val_preds.append(model.predict_proba(X_val_processed)[:, 1])
            fold_test_preds.append(model.predict_proba(X_test_processed)[:, 1])

        # Blend predictions for this fold
        oof_preds.extend(np.mean(fold_val_preds, axis=0))
        final_test_predictions.append(np.mean(fold_test_preds, axis=0))

    print("✅ Training complete.")

    # --- 5. Report Final Metrics for the Ensemble ---
    print("\nSTEP 5: Reporting final validation metrics for the 10-model blend...")
    print_metrics(oof_true, np.array(oof_preds), '10-Model XGB Ensemble (Blended)')

    # --- 6. Create Submission File ---
    final_preds = np.mean(final_test_predictions, axis=0)
    final_labels = (final_preds > 0.5).astype(int)
    
    submission = pd.DataFrame({"id": ids, "song_popularity": final_labels})
    submission.to_csv("Submission3.csv", index=False)
    print("✅ Submission3.csv created successfully!")

if __name__ == "__main__":
    run_10_model_xgb_ensemble()

STEP 1: Loading and undersampling data...
Using balanced training data size: 21864

STEP 2: Preparing raw data and defining preprocessor...
✅ Data preparation complete.

STEP 3: Defining multiple XGBoost configurations...
✅ Models defined.

STEP 4: Training 20-model ensemble with 10-fold CV...
--- Processing Fold 1/10 ---
--- Processing Fold 2/10 ---
--- Processing Fold 3/10 ---
--- Processing Fold 4/10 ---
--- Processing Fold 5/10 ---
--- Processing Fold 6/10 ---
--- Processing Fold 7/10 ---
--- Processing Fold 8/10 ---
--- Processing Fold 9/10 ---
--- Processing Fold 10/10 ---
✅ Training complete.

STEP 5: Reporting final validation metrics for the Mega ensemble...
--------------------------------------------------
🏆 METRICS FOR: "Mega" 20-Model XGB Ensemble 🏆
Average Accuracy: 0.5558
Average AUC Score:  0.5768

Total Confusion Matrix:
[[5856 5076]
 [4637 6295]]

TN: 5856 | FP: 5076 | FN: 4637 | TP: 6295
--------------------------------------------------

✅ Submission3.csv created su

In [None]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, accuracy_score # ADDED accuracy_score
from scipy.special import expit

# Boosting libraries
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

RANDOM_SEED = 23

# ------------------------
# Helpers
# ------------------------
def safe_predict_proba(pipe, X):
    """Return probability-like scores for AUC from pipeline or model."""
    if hasattr(pipe, "predict_proba"):
        return pipe.predict_proba(X)[:, 1]
    elif hasattr(pipe, "decision_function"):
        return expit(pipe.decision_function(X))
    else:
        return pipe.predict(X)

def get_oof_preds(model, X, y, X_test, preprocessor, n_splits=7):
    """Generates Out-of-Fold (OOF) predictions for stacking."""
    kf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=RANDOM_SEED)
    oof = np.zeros(len(X))
    test_preds = np.zeros(len(X_test))

    for train_idx, val_idx in kf.split(X, y):
        X_tr, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_tr = y.iloc[train_idx]

        pipe = Pipeline([("pre", preprocessor), ("model", model)])
        pipe.fit(X_tr, y_tr)
        oof[val_idx] = safe_predict_proba(pipe, X_val)
        test_preds += safe_predict_proba(pipe, X_test) / n_splits
    return oof, test_preds

# ------------------------
# Load data
# ------------------------
print("STEP 1: Loading data...")
train_full = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")
id_col = "id"
target = "song_popularity"

# --- Using our Undersampling strategy ---
print("STEP 2: Balancing data with Random Undersampling...")
df_majority = train_full[train_full[target] == 0]
df_minority = train_full[train_full[target] == 1]
df_majority_downsampled = df_majority.sample(n=len(df_minority), random_state=54)
train = pd.concat([df_majority_downsampled, df_minority]).sample(frac=1, random_state=54)
print(f"Using balanced training data of size: {len(train)}")

# ------------------------
# Feature engineering
# ------------------------
print("STEP 3: Performing feature engineering...")
def feature_engineer(df):
    df = df.copy()
    eps = 1e-6
    if "song_duration_ms" in df.columns:
        df["duration_min"] = df["song_duration_ms"] / 60000.0
    df["energy_by_dance"] = df["energy"] / (df["danceability"] + eps)
    df["tempo_valence"] = df["tempo"] * df["audio_valence"]
    df["acoustic_to_energy"] = df["acousticness"] / (df["energy"] + eps)
    df["speech_to_dance"] = df["speechiness"] / (df["danceability"] + eps)
    df["complexity"] = df["acousticness"] + df["instrumentalness"]
    df["tempo_bin"] = pd.cut(df["tempo"].fillna(df["tempo"].median()), bins=[-1, 80, 110, 140, 300], labels=[0,1,2,3]).astype(float)
    return df

train = feature_engineer(train)
test = feature_engineer(test)
print("✅ Feature engineering complete.")

# ------------------------
# CV-safe target encoding
# ------------------------
print("STEP 4: Performing CV-safe Target Encoding...")
def cv_target_encode(train_df, test_df, col, target, n_splits=5):
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=RANDOM_SEED)
    oof = np.zeros(len(train_df))
    global_mean = train_df[target].mean()
    for tr_idx, val_idx in skf.split(train_df, train_df[target]):
        tr = train_df.iloc[tr_idx]
        val = train_df.iloc[val_idx]
        mapping = tr.groupby(col)[target].mean()
        oof[val_idx] = val[col].map(mapping).fillna(global_mean)
    test_map = train_df.groupby(col)[target].mean()
    test_enc = test_df[col].map(test_map).fillna(global_mean)
    return oof, test_enc

for c in ["key", "time_signature", "audio_mode"]:
    oof_enc, test_enc = cv_target_encode(train, test, c, target, n_splits=7)
    train[f"{c}_te"] = oof_enc
    test[f"{c}_te"] = test_enc
print("✅ Target encoding complete.")

# ------------------------
# Prepare features & preprocessing
# ------------------------
print("STEP 5: Preparing final data and preprocessor...")
drop_cols = [id_col, target, "song_duration_ms"]
X = train.drop(columns=drop_cols, errors='ignore')
y = train[target].astype(int)
X_test = test.drop(columns=[id_col, "song_duration_ms"], errors='ignore')

# Align columns
X_test = X_test[X.columns]

numeric_features = list(X.columns)
num_transformer = Pipeline(steps=[("imputer", SimpleImputer(strategy="median")), ("scaler", StandardScaler())])
preprocessor = ColumnTransformer(transformers=[("num", num_transformer, numeric_features)])
print("✅ Data ready.")

# ------------------------
# Define Base Models
# ------------------------
print("\nSTEP 6: Defining pre-tuned base models...")
lgbm = LGBMClassifier(random_state=RANDOM_SEED, n_estimators=1000, learning_rate=0.05, num_leaves=31, n_jobs=-1, verbosity=-1)
xgb = XGBClassifier(random_state=RANDOM_SEED, n_estimators=1000, learning_rate=0.05, max_depth=5, use_label_encoder=False, eval_metric="logloss", n_jobs=-1)
cat = CatBoostClassifier(random_state=RANDOM_SEED, n_estimators=1000, learning_rate=0.05, depth=6, verbose=0)
rf = RandomForestClassifier(n_estimators=500, max_depth=10, random_state=RANDOM_SEED, n_jobs=-1)
logreg = LogisticRegression(max_iter=2000, random_state=RANDOM_SEED)

base_models = [
    ("lgbm", lgbm),
    ("xgb", xgb),
    ("cat", cat),
    ("rf", rf),
    ("logreg", logreg)
]
print("✅ Base models defined.")

# ------------------------
# Build OOF meta-features
# ------------------------
print("\nSTEP 7: Generating Out-of-Fold (OOF) predictions for stacking...")
oof_preds = np.zeros((len(X), len(base_models)))
test_preds = np.zeros((len(X_test), len(base_models)))

for i, (name, model) in enumerate(base_models):
    print(f"--- Generating OOF for: {name} ---")
    oof, tpred = get_oof_preds(model, X, y, X_test, preprocessor, n_splits=7)
    oof_preds[:, i] = oof
    test_preds[:, i] = tpred
    print(f"{name} OOF AUC: {roc_auc_score(y, oof):.5f}")

# ------------------------
# Train Meta-learner
# ------------------------
print("\nSTEP 8: Training meta-learner...")
meta_model = LogisticRegression(C=0.1, solver="liblinear")
meta_model.fit(oof_preds, y)

final_oof = meta_model.predict_proba(oof_preds)[:, 1]
final_test_preds = meta_model.predict_proba(test_preds)[:, 1]

# --- MODIFIED: Calculate and print both AUC and Accuracy ---
final_auc = roc_auc_score(y, final_oof)
final_oof_labels = np.round(final_oof)
final_accuracy = accuracy_score(y, final_oof_labels)

print(f"\n🏆 Final Stacked Ensemble OOF AUC: {final_auc:.5f}")
print(f"🎯 Final Stacked Ensemble OOF Accuracy: {final_accuracy:.5f}")

# ------------------------
# Save submission
# ------------------------
submission = pd.DataFrame({id_col: test[id_col], target: final_test_preds})
submission.to_csv("Submission3.csv", index=False)
print("\n✅ Submission3.csv created successfully!")

STEP 1: Loading data...
STEP 2: Balancing data with Random Undersampling...
Using balanced training data of size: 21864
STEP 3: Performing feature engineering...
✅ Feature engineering complete.
STEP 4: Performing CV-safe Target Encoding...
✅ Target encoding complete.
STEP 5: Preparing final data and preprocessor...
✅ Data ready.

STEP 6: Defining pre-tuned base models...
✅ Base models defined.

STEP 7: Generating Out-of-Fold (OOF) predictions for stacking...
--- Generating OOF for: lgbm ---
lgbm OOF AUC: 0.54456
--- Generating OOF for: xgb ---
xgb OOF AUC: 0.53809
--- Generating OOF for: cat ---
cat OOF AUC: 0.55368
--- Generating OOF for: rf ---
rf OOF AUC: 0.56535
--- Generating OOF for: logreg ---
logreg OOF AUC: 0.54801

STEP 8: Training meta-learner...

🏆 Final Stacked Ensemble OOF AUC: 0.56341
🎯 Final Stacked Ensemble OOF Accuracy: 0.54331

✅ Submission3.csv created successfully!


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from scipy.special import expit

from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

import optuna

import warnings
warnings.filterwarnings("ignore")



RANDOM_SEED = 54

# Helpers Functions
def safe_predict_proba(pipe, X):
    """Return probability-like scores for AUC from pipeline or model."""
    if hasattr(pipe, "predict_proba"):
        return pipe.predict_proba(X)[:, 1]
    elif hasattr(pipe, "decision_function"):
        return expit(pipe.decision_function(X))
    else:
        return pipe.predict(X)

def get_oof_preds(model, X, y, X_test, preprocessor, n_splits=5):
    kf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=54)
    oof = np.zeros(len(X))
    test_preds = np.zeros(len(X_test))

    for train_idx, val_idx in kf.split(X, y):
        X_tr, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_tr, y_val = y.iloc[train_idx], y.iloc[val_idx]
        pipe = Pipeline([("pre", preprocessor), ("model", model)])
        pipe.fit(X_tr, y_tr)
        oof[val_idx] = safe_predict_proba(pipe, X_val)
        test_preds += safe_predict_proba(pipe, X_test) / n_splits
    
    return oof, test_preds


# Load data
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

id_col = "id"
target = "song_popularity"


# Feature engineering
def feature_engineer(df):
    df = df.copy()
    if "song_duration_ms" in df.columns:
        df["duration_min"] = df["song_duration_ms"] / 60000.0
        df["log_duration"] = np.log1p(df["song_duration_ms"])

    eps = 1e-6
    df["energy_by_dance"] = df["energy"] / (df["danceability"] + eps)
    df["dance_x_energy"] = df["danceability"] * df["energy"]
    df["tempo_valence"] = df["tempo"] * df["audio_valence"]
    df["acoustic_to_energy"] = df["acousticness"] / (df["energy"] + eps)
    df["speech_to_dance"] = df["speechiness"] / (df["danceability"] + eps)
    df["mood_balance"] = df["energy"] - df["audio_valence"]
    df["complexity"] = df["acousticness"] + df["instrumentalness"] + df["liveness"]
    df["tempo_bin"] = pd.cut(df["tempo"].fillna(df["tempo"].median()),
                             bins=[-1, 80, 110, 140, 300], labels=[0,1,2,3]).astype(float)
    
    return df

train = feature_engineer(train)
test = feature_engineer(test)


# CV-safe target encoding
def cv_target_encode(train_df, test_df, col, target, n_splits=5):
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=54)
    oof = np.zeros(len(train_df))
    global_mean = train_df[target].mean()
    for tr_idx, val_idx in skf.split(train_df, train_df[target]):
        tr = train_df.iloc[tr_idx]
        val = train_df.iloc[val_idx]
        mapping = tr.groupby(col)[target].mean()
        oof[val_idx] = val[col].map(mapping).fillna(global_mean)
    test_map = train_df.groupby(col)[target].mean()
    test_enc = test_df[col].map(test_map).fillna(global_mean)
    return oof, test_enc

for c in ["key", "time_signature", "audio_mode"]:
    oof_enc, test_enc = cv_target_encode(train, test, c, target, n_splits=5)
    train[f"{c}_te"] = oof_enc
    test[f"{c}_te"] = test_enc


# Prepare features & preprocessing
drop_cols = [id_col, target]
X = train.drop(columns=drop_cols)
y = train[target].astype(int)
X_test = test.drop(columns=[id_col])

numeric = list(X.columns)  # all numeric now, no categorical left

num_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="mean")),
    ("scaler", StandardScaler())
])

preprocessor = ColumnTransformer([
    ("num", num_transformer, numeric)
])


# Optuna tuning functions
def tune_lgbm(X, y, preprocessor, n_trials=40):
    def objective(trial):
        params = {
            "n_estimators": 1000,
            "learning_rate": trial.suggest_loguniform("learning_rate", 0.01, 0.2),
            "num_leaves": trial.suggest_int("num_leaves", 16, 128),
            "max_depth": trial.suggest_int("max_depth", 3, 12),
            "min_child_samples": trial.suggest_int("min_child_samples", 5, 100),
            "subsample": trial.suggest_uniform("subsample", 0.6, 1.0),
            "colsample_bytree": trial.suggest_uniform("colsample_bytree", 0.6, 1.0),
            "reg_alpha": trial.suggest_float("reg_alpha", 0.0, 5.0),
            "reg_lambda": trial.suggest_float("reg_lambda", 0.0, 5.0),
            "random_state": 54,
            "n_jobs": 4
        }
        model = LGBMClassifier(**params)
        score = cross_val_score(Pipeline([("pre", preprocessor), ("m", model)]),
                                X, y, cv=3, scoring="roc_auc", n_jobs=1).mean()
        return score
    study = optuna.create_study(direction="maximize")
    study.optimize(objective, n_trials=n_trials, show_progress_bar=True)
    best_params = study.best_params
    best = LGBMClassifier(**best_params, n_estimators=1000, random_state=54, n_jobs=-1)
    return best

def tune_cat(X, y, preprocessor, n_trials=20):
    
    def objective(trial):
        params = {
            "iterations": 1000,
            "learning_rate": trial.suggest_loguniform("learning_rate", 0.01, 0.2),
            "depth": trial.suggest_int("depth", 4, 10),
            "l2_leaf_reg": trial.suggest_loguniform("l2_leaf_reg", 1e-1, 10.0),
            "border_count": trial.suggest_int("border_count", 32, 255),
            "random_seed": 54,
            "verbose": 0
        }
        model = CatBoostClassifier(**params)
        score = cross_val_score(Pipeline([("pre", preprocessor), ("m", model)]),X, y, cv=3, scoring="roc_auc", n_jobs=1).mean()
        return score
    
    study = optuna.create_study(direction="maximize")
    study.optimize(objective, n_trials=n_trials, show_progress_bar=True)
    best_params = study.best_params
    best = CatBoostClassifier(**best_params, iterations=1000, random_seed=54, verbose=100)
    return best

def tune_xgb(X, y, preprocessor, n_trials=40):
    def objective(trial):
        params = {
            "n_estimators": 1000,
            "learning_rate": trial.suggest_loguniform("learning_rate", 0.01, 0.2),
            "max_depth": trial.suggest_int("max_depth", 3, 10),
            "subsample": trial.suggest_uniform("subsample", 0.6, 1.0),
            "colsample_bytree": trial.suggest_uniform("colsample_bytree", 0.6, 1.0),
            "gamma": trial.suggest_uniform("gamma", 0.0, 5.0),
            "reg_alpha": trial.suggest_float("reg_alpha", 0.0, 5.0),
            "reg_lambda": trial.suggest_float("reg_lambda", 0.0, 5.0),
            "random_state": 54,
            "n_jobs": 4,
            "use_label_encoder": False,
            "eval_metric": "auc"
        }
        model = XGBClassifier(**params)
        score = cross_val_score(Pipeline([("pre", preprocessor), ("m", model)]), X, y, cv=3, scoring="roc_auc", n_jobs=1).mean()
        return score
    study = optuna.create_study(direction="maximize")
    study.optimize(objective, n_trials=n_trials, show_progress_bar=True)
    best_params = study.best_params
    best = XGBClassifier(**best_params, n_estimators=1000, random_state=54, use_label_encoder=False, eval_metric="auc", n_jobs=-1)
    return best


# Run tuning
print("Tuning CatBoost...")
best_cat = tune_cat(X, y, preprocessor, n_trials=15)

print("Tuning LightGBM...")
best_lgbm = tune_lgbm(X, y, preprocessor, n_trials=30)

print("Tuning XGBoost...")
best_xgb = tune_xgb(X, y, preprocessor, n_trials=30)



# Other models
rf = RandomForestClassifier(n_estimators=400, max_depth=12, random_state=54, n_jobs=-1)
logreg = LogisticRegression(max_iter=5000, solver="saga", penalty="elasticnet", l1_ratio=0.5, random_state=54)

base_models = [
    ("lgbm", best_lgbm),
    ("xgb", best_xgb),
    ("cat", best_cat),
    ("rf", rf),
    ("logreg", logreg)
]


# Build OOF meta-features
oof_preds = np.zeros((len(X), len(base_models)))
test_preds = np.zeros((len(X_test), len(base_models)))

for i, (name, model) in enumerate(base_models):
    print(f"\nGenerating OOF for: {name}")
    oof, tpred = get_oof_preds(model, X, y, X_test, preprocessor, n_splits=7)
    oof_preds[:, i] = oof
    test_preds[:, i] = tpred
    print(f"{name} OOF AUC: {roc_auc_score(y, oof):.5f}")


# Meta-learner (Dynamic weigths)
print("\nTraining meta-learner (LogisticRegression)...")
meta_model = LogisticRegression(max_iter=5000, solver="lbfgs")
meta_model.fit(oof_preds, y)

final_oof = meta_model.predict_proba(oof_preds)[:, 1]
final_test_preds = meta_model.predict_proba(test_preds)[:, 1]

print("Stacked (meta) OOF AUC:", roc_auc_score(y, final_oof))


submission = pd.DataFrame({id_col: test[id_col], target: final_test_preds})
submission.to_csv("submission3.csv", index=False)
print("Saved submission3.csv")

In [39]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import roc_auc_score, accuracy_score, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.linear_model import LogisticRegression

# ---------------------------
# 1. Load data
# ---------------------------
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

X = train.drop(columns=["song_popularity"])
y = train["song_popularity"]

# ---------------------------
# 2. Balance dataset (0:1 ≈ 1.25:1)
# ---------------------------
df = pd.concat([X, y], axis=1)
df_1 = df[df.song_popularity == 1]
df_0 = df[df.song_popularity == 0].sample(int(len(df_1)*1.25), random_state=154)

df_bal = pd.concat([df_1, df_0]).sample(frac=1, random_state=54)  # shuffle
X_bal = df_bal.drop(columns=["song_popularity"])
y_bal = df_bal["song_popularity"]

# ---------------------------
# 3. Tuned model parameters
# ---------------------------
lgbm_params = {
    "learning_rate": 0.0112,
    "num_leaves": 97,
    "max_depth": 3,
    "min_child_samples": 100,
    "subsample": 0.8805,
    "colsample_bytree": 0.9248,
    "reg_alpha": 1.5212,
    "reg_lambda": 4.8888,
    "n_estimators": 1000
}

xgb_params = {
    "learning_rate": 0.0138,
    "max_depth": 5,
    "subsample": 0.8216,
    "colsample_bytree": 0.8920,
    "gamma": 3.8185,
    "reg_alpha": 3.4388,
    "reg_lambda": 4.3441,
    "n_estimators": 1000,
    "use_label_encoder": False,
    "eval_metric": "logloss"
}

cat_params = {
    "learning_rate": 0.0102,
    "depth": 4,
    "l2_leaf_reg": 0.2337,
    "border_count": 133,
    "verbose": 0
}

rf_params = {
    "n_estimators": 500,
    "max_depth": None,
    "random_state": 154
}

# ---------------------------
# 4. Function to get OOF predictions and validation metrics
# ---------------------------
def get_oof(model, X, y, X_test, n_splits=5):
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=54)
    oof_preds = np.zeros(len(X))
    test_preds = np.zeros(len(X_test))
    
    for trn_idx, val_idx in skf.split(X, y):
        X_tr, X_val = X.iloc[trn_idx], X.iloc[val_idx]
        y_tr, y_val = y.iloc[trn_idx], y.iloc[val_idx]
        
        model.fit(X_tr, y_tr)
        val_pred_proba = model.predict_proba(X_val)[:,1]
        val_pred_label = model.predict(X_val)
        oof_preds[val_idx] = val_pred_proba
        test_preds += model.predict_proba(X_test)[:,1] / n_splits
        
        # Print fold metrics
        print("Fold metrics:")
        print("Accuracy:", accuracy_score(y_val, val_pred_label))
        print("AUC:", roc_auc_score(y_val, val_pred_proba))
        print("Confusion Matrix:\n", confusion_matrix(y_val, val_pred_label))
        print("-"*30)
    
    auc = roc_auc_score(y, oof_preds)
    return oof_preds, test_preds, auc

# ---------------------------
# 5. Generate OOF for each base model
# ---------------------------
print("\n--- Training LGBM ---")
oof_lgbm, pred_lgbm, auc_lgbm = get_oof(LGBMClassifier(**lgbm_params), X_bal, y_bal, test)

print("\n--- Training XGB ---")
oof_xgb, pred_xgb, auc_xgb = get_oof(XGBClassifier(**xgb_params), X_bal, y_bal, test)

print("\n--- Training CatBoost ---")
oof_cat, pred_cat, auc_cat = get_oof(CatBoostClassifier(**cat_params), X_bal, y_bal, test)

print("\n--- Training RandomForest ---")
oof_rf, pred_rf, auc_rf = get_oof(RandomForestClassifier(**rf_params), X_bal, y_bal, test)

print("\nOOF AUCs:")
print("LGBM:", auc_lgbm)
print("XGB:", auc_xgb)
print("CatBoost:", auc_cat)
print("RF:", auc_rf)

# ---------------------------
# 6. Train meta-learner
# ---------------------------
X_meta = np.vstack([oof_lgbm, oof_xgb, oof_cat, oof_rf]).T
meta_model = LogisticRegression()
meta_model.fit(X_meta, y_bal)

X_test_meta = np.vstack([pred_lgbm, pred_xgb, pred_cat, pred_rf]).T
submission_preds = meta_model.predict_proba(X_test_meta)[:,1]

# ---------------------------
# 7. Save submission
# ---------------------------
submission = pd.DataFrame({"id": test["id"], "song_popularity": submission_preds})
submission.to_csv("Submission5.csv", index=False)
print("Saved Submission5.csv")



--- Training LGBM ---
Fold metrics:
Accuracy: 0.5617886178861788
AUC: 0.5773359560226071
Confusion Matrix:
 [[2275  458]
 [1698  489]]
------------------------------
Fold metrics:
Accuracy: 0.5686991869918699
AUC: 0.5626083411088809
Confusion Matrix:
 [[2261  472]
 [1650  537]]
------------------------------
Fold metrics:
Accuracy: 0.5637324659483635
AUC: 0.5694970723116101
Confusion Matrix:
 [[2248  485]
 [1661  525]]
------------------------------
Fold metrics:
Accuracy: 0.5616995324252897
AUC: 0.5694895400963254
Confusion Matrix:
 [[2234  499]
 [1657  529]]
------------------------------
Fold metrics:
Accuracy: 0.5647489327099003
AUC: 0.5575394294731901
Confusion Matrix:
 [[2205  528]
 [1613  573]]
------------------------------

--- Training XGB ---
Fold metrics:
Accuracy: 0.5654471544715447
AUC: 0.5773972736813734
Confusion Matrix:
 [[2258  475]
 [1663  524]]
------------------------------
Fold metrics:
Accuracy: 0.5615853658536586
AUC: 0.5682800990652445
Confusion Matrix:
 [[223

In [41]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score, accuracy_score, confusion_matrix
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

# -----------------------------
# 1. Load datasets
# -----------------------------
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

target_col = "song_popularity"
X = train.drop(columns=[target_col])
y = train[target_col]
X_test = test.copy()

# -----------------------------
# 2. Feature engineering
# -----------------------------
def feature_engineering(df):
    df = df.copy()
    # Example interactions/ratios
    df['dance_energy'] = df['danceability'] * df['energy']
    df['acoustic_instr_ratio'] = df['acousticness'] / (df['instrumentalness'] + 1e-6)
    df['tempo_bins'] = pd.cut(df['tempo'], bins=5, labels=False)
    df['loudness_bins'] = pd.cut(df['loudness'], bins=5, labels=False)
    df['acoustic_dance'] = df['acousticness'] * df['danceability']
    df.fillna(0, inplace=True)  # Fill missing values
    return df

X = feature_engineering(X)
X_test = feature_engineering(X_test)

# -----------------------------
# 3. Define base models (tuned params)
# -----------------------------
lgbm_model = LGBMClassifier(
    learning_rate=0.0112, num_leaves=97, max_depth=3,
    min_child_samples=100, subsample=0.8805, colsample_bytree=0.9248,
    reg_alpha=1.521, reg_lambda=4.888, n_estimators=1000
)

xgb_model = XGBClassifier(
    learning_rate=0.0138, max_depth=5, subsample=0.8216,
    colsample_bytree=0.892, gamma=3.818, reg_alpha=3.439,
    reg_lambda=4.344, n_estimators=1000, use_label_encoder=False,
    eval_metric='logloss'
)

cat_model = CatBoostClassifier(
    learning_rate=0.0102, depth=4, l2_leaf_reg=0.2336, border_count=133,
    n_estimators=1000, verbose=0
)

rf_model = RandomForestClassifier(
    n_estimators=500, max_depth=10, min_samples_leaf=4, random_state=42
)

# Meta-learner
meta_model = GradientBoostingClassifier(
    n_estimators=500, learning_rate=0.05, max_depth=3
)

# -----------------------------
# 4. Helper function for OOF
# -----------------------------
def get_oof(model, X, y, X_test, n_splits=5):
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
    oof_train = np.zeros((X.shape[0],))
    oof_test = np.zeros((X_test.shape[0], n_splits))
    
    for i, (train_idx, val_idx) in enumerate(skf.split(X, y)):
        X_tr, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_tr, y_val = y.iloc[train_idx], y.iloc[val_idx]
        
        model.fit(X_tr, y_tr)
        oof_train[val_idx] = model.predict_proba(X_val)[:,1]
        oof_test[:,i] = model.predict_proba(X_test)[:,1]
    
    oof_test_mean = oof_test.mean(axis=1)
    return oof_train.reshape(-1,1), oof_test_mean

# -----------------------------
# 5. Generate OOF for base models
# -----------------------------
print("--- Generating OOF for LGBM ---")
oof_lgbm, pred_lgbm = get_oof(lgbm_model, X, y, X_test)
print("--- Generating OOF for XGB ---")
oof_xgb, pred_xgb = get_oof(xgb_model, X, y, X_test)
print("--- Generating OOF for CatBoost ---")
oof_cat, pred_cat = get_oof(cat_model, X, y, X_test)
print("--- Generating OOF for RF ---")
oof_rf, pred_rf = get_oof(rf_model, X, y, X_test)

# -----------------------------
# 6. Stack OOF predictions
# -----------------------------
X_meta = np.hstack([oof_lgbm, oof_xgb, oof_cat, oof_rf])

# Meta OOF using CV
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
meta_oof = np.zeros(y.shape[0])
meta_test = np.zeros(X_test.shape[0])
for train_idx, val_idx in skf.split(X_meta, y):
    X_tr, X_val = X_meta[train_idx], X_meta[val_idx]
    y_tr, y_val = y.iloc[train_idx], y.iloc[val_idx]
    
    meta_model.fit(X_tr, y_tr)
    meta_oof[val_idx] = meta_model.predict_proba(X_val)[:,1]
    meta_test += meta_model.predict_proba(np.hstack([pred_lgbm.reshape(-1,1),
                                                     pred_xgb.reshape(-1,1),
                                                     pred_cat.reshape(-1,1),
                                                     pred_rf.reshape(-1,1)]))[:,1] / skf.n_splits

# -----------------------------
# 7. Evaluation
# -----------------------------
val_pred_labels = (meta_oof >= 0.5).astype(int)
val_acc = accuracy_score(y, val_pred_labels)
val_auc = roc_auc_score(y, meta_oof)
cm = confusion_matrix(y, val_pred_labels)

print("\n--- Stacked Meta Model Evaluation ---")
print(f"Validation Accuracy: {val_acc:.4f}, ROC-AUC: {val_auc:.4f}")
print("Confusion Matrix:")
print(cm)

# -----------------------------
# 8. Create submission
# -----------------------------
submission = pd.DataFrame({
    "id": test["id"],
    "song_popularity": (meta_test >= 0.5).astype(int)  # 0 or 1
})
submission.to_csv("Submission6.csv", index=False)
print("Saved Submission6.csv")


--- Generating OOF for LGBM ---
--- Generating OOF for XGB ---
--- Generating OOF for CatBoost ---
--- Generating OOF for RF ---

--- Stacked Meta Model Evaluation ---
Validation Accuracy: 0.6323, ROC-AUC: 0.5621
Confusion Matrix:
[[18591   477]
 [10553   379]]
Saved Submission6.csv


In [42]:
import pandas as pd

# Read the old submission
submission3 = pd.read_csv("submission3.csv")

# Apply threshold 0.4
submission3["song_popularity"] = (submission3["song_popularity"] > 0.4).astype(int)

# Save as new submission
submission3.to_csv("submission7.csv", index=False)

print("submission7.csv created with threshold 0.4")


submission7.csv created with threshold 0.4
