In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/xgboost_model/pytorch/default/1/xgboost_model.pkl
/kaggle/input/catboost_model/pytorch/default/1/catboost_model.pkl
/kaggle/input/lightgbm_model/pytorch/default/1/lightgbm_model.pkl
/kaggle/input/iisc-umc-301-kaggle-competition-1/sample_submission.csv
/kaggle/input/iisc-umc-301-kaggle-competition-1/train.csv
/kaggle/input/iisc-umc-301-kaggle-competition-1/test.csv


In [None]:
import xgboost as xgb
from catboost import CatBoostClassifier
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.model_selection import KFold
import optuna
from sklearn.preprocessing import LabelEncoder

In [3]:
train_df = pd.read_csv('/kaggle/input/iisc-umc-301-kaggle-competition-1/train.csv')
test_df = pd.read_csv('/kaggle/input/iisc-umc-301-kaggle-competition-1/test.csv')

In [4]:
X = train_df.drop(['id', 'song_popularity'], axis=1)
y = train_df['song_popularity']
X_test = test_df.drop('id', axis=1)

print(X.shape, y.shape, X_test.shape)

test_ids = test_df['id']

(30000, 13) (30000,) (10000, 13)


In [5]:
X_train = X
y_train = y

In [None]:
# KNN Imputation Function
def knn_impute(X_train, X_test, n_neighbors=5):
    # Combine train + test for consistent encoding + imputation
    combined = pd.concat([X_train, X_test], axis=0, ignore_index=True)

    # Separate categorical and numerical columns
    cat_cols = combined.select_dtypes(include=['object', 'category']).columns
    num_cols = combined.select_dtypes(include=['number']).columns

    # Encode categorical features
    encoders = {}
    for col in cat_cols:
        le = LabelEncoder()
        # Handle missing
        combined[col] = combined[col].astype(str)
        combined[col] = combined[col].replace("nan", pd.NA)
        combined[col] = combined[col].fillna("MISSING")
        combined[col] = le.fit_transform(combined[col])
        encoders[col] = le

    
    imputer = KNNImputer(n_neighbors=n_neighbors, weights="uniform")
    imputed_data = imputer.fit_transform(combined)

    
    imputed_df = pd.DataFrame(imputed_data, columns=combined.columns)

    
    for col in cat_cols:
        le = encoders[col]
        imputed_df[col] = imputed_df[col].round().astype(int)  # round float to int
        imputed_df[col] = le.inverse_transform(imputed_df[col])

    
    imputed_train = imputed_df.iloc[:len(X_train), :].reset_index(drop=True)
    imputed_test = imputed_df.iloc[len(X_train):, :].reset_index(drop=True)

    return imputed_train, imputed_test


X_train_imputed, X_test_imputed = knn_impute(X_train, X_test, n_neighbors=5)


In [None]:
#Check if any missing values remain
X_train_imputed.isna().sum()

song_duration_ms    0
acousticness        0
danceability        0
energy              0
instrumentalness    0
key                 0
liveness            0
loudness            0
audio_mode          0
speechiness         0
tempo               0
time_signature      0
audio_valence       0
dtype: int64

In [None]:
#Optuna Objective Function for XGBoost for Hyperparameter Tuning

def objective_xgb(trial):
    params = {
        "objective": "binary:logistic",
        "eval_metric": "auc",
        "booster": "gbtree",
        "tree_method": "hist",
        "device" : "cuda",
        "random_state": 42,

        # Search space
        "eta": trial.suggest_float("eta", 0.01, 0.3, log=True),
        "max_depth": trial.suggest_int("max_depth", 3, 10),
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 10),
        "subsample": trial.suggest_float("subsample", 0.5, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
        "gamma": trial.suggest_float("gamma", 0, 5),
        "lambda": trial.suggest_float("lambda", 1e-3, 10.0, log=True),  # L2
        "alpha": trial.suggest_float("alpha", 1e-3, 10.0, log=True)     # L1
    }

    kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    aucs = []

    for train_idx, val_idx in kf.split(X_train_imputed, y):
        X_tr, X_val = X_train_imputed.iloc[train_idx], X_train_imputed.iloc[val_idx]
        y_tr, y_val = y.iloc[train_idx], y.iloc[val_idx]

        dtrain = xgb.DMatrix(X_tr, label=y_tr)
        dval = xgb.DMatrix(X_val, label=y_val)

        model = xgb.train(
            params,
            dtrain,
            num_boost_round=2000,
            evals=[(dval, "validation")],
            early_stopping_rounds=100,
            verbose_eval=False
        )

        preds = model.predict(dval)
        aucs.append(roc_auc_score(y_val, preds))

    return sum(aucs) / len(aucs)


In [9]:
study_xgb = optuna.create_study(direction="maximize")
study_xgb.optimize(objective_xgb, n_trials=50)
print("Best params XGB:", study_xgb.best_params)
xgb_best_params = study_xgb.best_params

[I 2025-09-24 05:22:02,073] A new study created in memory with name: no-name-12a95e1a-c1b3-4a6f-b8f5-59c59647a529
[I 2025-09-24 05:22:03,909] Trial 0 finished with value: 0.5608436288249685 and parameters: {'eta': 0.2868266411134794, 'max_depth': 3, 'min_child_weight': 2, 'subsample': 0.5789873150730352, 'colsample_bytree': 0.5767779123224004, 'gamma': 2.9299336173840738, 'lambda': 9.986921374118948, 'alpha': 3.070989456982558}. Best is trial 0 with value: 0.5608436288249685.
[I 2025-09-24 05:22:06,238] Trial 1 finished with value: 0.5713267625520043 and parameters: {'eta': 0.04997558672583577, 'max_depth': 3, 'min_child_weight': 2, 'subsample': 0.9639044721860399, 'colsample_bytree': 0.6793852932055418, 'gamma': 3.262065114324999, 'lambda': 0.007349247245322492, 'alpha': 0.03349455785974895}. Best is trial 1 with value: 0.5713267625520043.
[I 2025-09-24 05:22:09,120] Trial 2 finished with value: 0.5723466007930321 and parameters: {'eta': 0.01991777529091761, 'max_depth': 5, 'min_child

Best params XGB: {'eta': 0.046541262043125975, 'max_depth': 3, 'min_child_weight': 1, 'subsample': 0.7152458479019418, 'colsample_bytree': 0.7664791561705949, 'gamma': 4.831132594660023, 'lambda': 2.78695701133032, 'alpha': 0.1749431723443901}


In [None]:
# Optuna Objective Function for CatBoost for Hyperparameter Tuning

def objective_cat(trial):
    params = {
        "iterations": 2000,
        "loss_function": "Logloss",
        "eval_metric": "AUC",
        "random_seed": 42,
        "task_type": "GPU",    
        "verbose": False,

        # Search space
        "depth": trial.suggest_int("depth", 4, 10),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3, log=True),
        "l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 1, 10, log=True),
        "bagging_temperature": trial.suggest_float("bagging_temperature", 0.0, 5.0),
        "border_count": trial.suggest_int("border_count", 32, 255),
        "random_strength": trial.suggest_float("random_strength", 0.0, 10.0)
    }

    kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    aucs = []

    for train_idx, val_idx in kf.split(X_train_imputed, y):
        X_tr, X_val = X_train_imputed.iloc[train_idx], X_train_imputed.iloc[val_idx]
        y_tr, y_val = y.iloc[train_idx], y.iloc[val_idx]


        model = CatBoostClassifier(**params)
        model.fit(X_tr, y_tr, eval_set=(X_val, y_val), early_stopping_rounds=100)

        preds = model.predict_proba(X_val)[:, 1]
        aucs.append(roc_auc_score(y_val, preds))

    return sum(aucs) / len(aucs)


In [11]:
study_cat = optuna.create_study(direction="maximize")
study_cat.optimize(objective_cat, n_trials=50)
print("Best params CatBoost:", study_cat.best_params)
cat_best_params = study_cat.best_params

[I 2025-09-24 05:25:29,807] A new study created in memory with name: no-name-11c6d095-86a1-4468-843e-c717275c1e89
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
[I 2025-09-24 05:28:44,027] Trial 0 finished with value: 0.573190953891166 and parameters: {'depth': 10, 'learning_rate': 0.02924997938685539, 'l2_leaf_reg': 1.7994973780371595, 'bagging_temperature': 0.5160752580946026, 'border_count': 87, 'random_strength': 6.207733375940205}. Best is trial 0 with value: 0.573190953891166.
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU

Best params CatBoost: {'depth': 4, 'learning_rate': 0.0653519914018847, 'l2_leaf_reg': 4.533500125999296, 'bagging_temperature': 3.5000012999587784, 'border_count': 198, 'random_strength': 5.014308358082966}


In [None]:
# Optuna Objective Function for LightGBM for Hyperparameter Tuning

def objective(trial):
    params = {
        "objective": "binary",
        "metric": "auc",
        "boosting_type": "gbdt",
        "verbosity": -1,
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3, log=True),
        "num_leaves": trial.suggest_int("num_leaves", 16, 256),
        "max_depth": trial.suggest_int("max_depth", 3, 12),
        "min_child_samples": trial.suggest_int("min_child_samples", 10, 100),
        "subsample": trial.suggest_float("subsample", 0.5, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
        "reg_alpha": trial.suggest_float("reg_alpha", 1e-8, 10.0, log=True),
        "reg_lambda": trial.suggest_float("reg_lambda", 1e-8, 10.0, log=True),
        "min_split_gain": trial.suggest_float("min_split_gain", 0.0, 1.0),
        "n_estimators": 2000,
        "device": "gpu"
    }

    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    aucs = []

    for train_idx, val_idx in cv.split(X_train_imputed, y):
        X_tr, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_tr, y_val = y.iloc[train_idx], y.iloc[val_idx]

        model = lgb.LGBMClassifier(**params)
        model.fit(
            X_tr, y_tr,
            eval_set=[(X_val, y_val)],
            eval_metric="auc",
        )
        preds = model.predict_proba(X_val)[:, 1]
        aucs.append(roc_auc_score(y_val, preds))

    return np.mean(aucs)

# Run Optuna
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=50) 

print("Best trial:", study.best_trial.params)
print("Best AUC:", study.best_value)


[I 2025-09-24 06:20:22,724] A new study created in memory with name: no-name-653246f6-cab1-47e8-b874-f7c171a56f4f
[I 2025-09-24 06:20:39,269] Trial 0 finished with value: 0.5467089101629585 and parameters: {'learning_rate': 0.16921750760498647, 'num_leaves': 81, 'max_depth': 5, 'min_child_samples': 89, 'subsample': 0.6369770647558126, 'colsample_bytree': 0.5136933014998049, 'reg_alpha': 0.0035501638752162843, 'reg_lambda': 0.0018769776781935857, 'min_split_gain': 0.18739595146995425}. Best is trial 0 with value: 0.5467089101629585.
[I 2025-09-24 06:20:49,624] Trial 1 finished with value: 0.5574028398162978 and parameters: {'learning_rate': 0.18351965022435784, 'num_leaves': 125, 'max_depth': 5, 'min_child_samples': 93, 'subsample': 0.5435050332118655, 'colsample_bytree': 0.8735178147870959, 'reg_alpha': 0.44962786034799956, 'reg_lambda': 6.601937597162703e-07, 'min_split_gain': 0.39972873264896225}. Best is trial 1 with value: 0.5574028398162978.
[I 2025-09-24 06:21:04,074] Trial 2 fin

Best trial: {'learning_rate': 0.011569231480988288, 'num_leaves': 16, 'max_depth': 10, 'min_child_samples': 15, 'subsample': 0.547247457489464, 'colsample_bytree': 0.5507146543429488, 'reg_alpha': 3.6300463205930353, 'reg_lambda': 1.3018760604860127e-07, 'min_split_gain': 0.8935026096031666}
Best AUC: 0.5728629770674987


In [None]:
#Divide the training data into training and validation sets 
#To tune the weights for the weighted average of the models
X_train, X_val, y_train, y_val = train_test_split(X_train_imputed, y, test_size=0.2, random_state=2025, stratify=y)

In [None]:
# Train final LightGBM model with best hyperparameters
lgb_best_params = study.best_params
lgb_best_params["objective"] = "binary"
lgb_best_params["metric"] = "auc"

train_data = lgb.Dataset(X_train, label=y_train)
data = lgb.Dataset(X_train_imputed, label = y)
lgb_model = lgb.train(
    lgb_best_params,
    train_data,
    num_boost_round=study.best_trial.number
)
val_preds = lgb_model.predict(X_val)
val_auc = roc_auc_score(y_val, val_preds)

print("Validation AUC:", val_auc)

Validation AUC: 0.565508520397956


In [None]:
# Prepare train, validation and final data for XGBoost
dtrain = xgb.DMatrix(X_train, label=y_train)
dval = xgb.DMatrix(X_val, label=y_val)
ddata = xgb.DMatrix(X_train_imputed, label = y)

In [None]:
# Train final XGBoost model with best hyperparameters
xgb_model = xgb.train(
    xgb_best_params,
    dtrain,
    num_boost_round=2000,
    evals=[(dval, "validation")],
    early_stopping_rounds=100,
    verbose_eval=False
)

In [None]:
# Train final CatBoost model with best hyperparameters
cat_model = CatBoostClassifier(**cat_best_params, iterations=2000, task_type="GPU", verbose=0)
cat_model.fit(X_train, y_train)

<catboost.core.CatBoostClassifier at 0x7f372f3633d0>

In [None]:
# Evaluate CatBoost on validation set
y_pred_cat = cat_model.predict_proba(X_val)[:, 1]

auc_score = roc_auc_score(y_val, y_pred_cat)

print(f'AUC Score: {auc_score}')

AUC Score: 0.5697026316584874


In [None]:
# Evaluate XGBoost on validation set
y_pred_xgb = xgb_model.predict(dval)

auc_score = roc_auc_score(y_val, y_pred_xgb)

print(f'AUC Score: {auc_score}')

AUC Score: 0.563286965583052


In [None]:
# Ensemble Predictions of XGBoost and CatBoost with weighted average
weights = [0.7, 0.3]  # [XGB, Cat]
ensemble_proba = weights[0] * y_pred_xgb + weights[1] * y_pred_cat

print("Ensemble AUC:", roc_auc_score(y_val, ensemble_proba))

Ensemble AUC: 0.571816479086296


In [None]:
# Optimize weights for ensemble of CatBoost, LightGBM, and XGBoost using Optuna

# validation probabilities (already computed earlier)
cat_val_proba = cat_model.predict_proba(X_val)[:, 1]
lgb_val_proba = lgb_model.predict(X_val)
xgb_val_proba = xgb_model.predict(dval)

def objective(trial):
    # suggest weights
    w1 = trial.suggest_float("cat_weight", 0.0, 1.0)
    w2 = trial.suggest_float("lgb_weight", 0.0, 1.0)
    w3 = 1.0 - w1 - w2
    
    if w3 < 0:  # invalid combination
        return 0.0
    
    # weighted ensemble
    ensemble_proba = w1 * cat_val_proba + w2 * lgb_val_proba + w3 * xgb_val_proba
    
    # AUC with probabilities
    auc = roc_auc_score(y_val, ensemble_proba)
    return auc

study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=200)

best_params = study.best_params
best_auc = study.best_value

# final weights
w1 = best_params["cat_weight"]
w2 = best_params["lgb_weight"]
w3 = 1 - w1 - w2

print(f"Best Weights -> CatBoost: {w1:.3f}, LightGBM: {w2:.3f}, XGBoost: {w3:.3f}")
print(f"Best Validation AUC: {best_auc:.4f}")


[I 2025-09-24 06:37:10,391] A new study created in memory with name: no-name-3d50e940-364f-4d2f-ba28-a0a0271557e8
[I 2025-09-24 06:37:10,393] Trial 0 finished with value: 0.0 and parameters: {'cat_weight': 0.7949002323807223, 'lgb_weight': 0.6691411139789754}. Best is trial 0 with value: 0.0.
[I 2025-09-24 06:37:10,398] Trial 1 finished with value: 0.5717753391823162 and parameters: {'cat_weight': 0.5112271622876129, 'lgb_weight': 0.08306891303350816}. Best is trial 1 with value: 0.5717753391823162.
[I 2025-09-24 06:37:10,399] Trial 2 finished with value: 0.0 and parameters: {'cat_weight': 0.8078078549196919, 'lgb_weight': 0.9789012312753728}. Best is trial 1 with value: 0.5717753391823162.
[I 2025-09-24 06:37:10,404] Trial 3 finished with value: 0.5668100046489291 and parameters: {'cat_weight': 0.05690927864109707, 'lgb_weight': 0.08183172647585901}. Best is trial 1 with value: 0.5717753391823162.
[I 2025-09-24 06:37:10,405] Trial 4 finished with value: 0.0 and parameters: {'cat_weigh

Best Weights -> CatBoost: 0.320, LightGBM: 0.144, XGBoost: 0.536
Best Validation AUC: 0.5721


In [None]:
# Retrain models on full training data with best hyperparameters
xgb_model = xgb.train(
    xgb_best_params,
    ddata,
    num_boost_round=2000,
    evals=[(dval, "validation")],
    early_stopping_rounds=100,
    verbose_eval=False
)
cat_model.fit(X_train_imputed, y)
lgb_model = lgb.train(
    lgb_best_params,
    data,
    num_boost_round=study.best_trial.number
)

In [None]:
# Final Ensemble Predictions on Test Set
dtest = xgb.DMatrix(X_test_imputed)
test_data = lgb.Dataset(X_test_imputed)
y_test_prob_xgb = xgb_model.predict(dtest)
y_test_prob_cat = cat_model.predict_proba(X_test_imputed)[:, 1]
y_test_prob_lgb= lgb_model.predict(X_test_imputed)
weights = [w1, w2, w3]
en_prob = weights[0]*y_test_prob_cat + weights[1]*y_test_prob_lgb + weights[2]*y_test_prob_xgb

In [None]:
# Save to CSV
results_df_1 = pd.DataFrame({
    'id' : test_ids,
    'song_popularity': en_prob
})
results_df_1.to_csv('/kaggle/working/predictions_prob_en_3.csv', index=False)

In [None]:
# Save models using joblib
import joblib

joblib.dump(cat_model, "catboost_model.pkl")
joblib.dump(lgb_model, "lightgbm_model.pkl")
joblib.dump(xgb_model, "xgboost_model.pkl")

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from lightgbm import LGBMClassifier

X_adv = pd.concat([X_train, X_val])
y_adv = np.concatenate([np.zeros(len(X_train)), np.ones(len(X_val))])
Xtr, Xva, ytr, yva = train_test_split(X_adv, y_adv, test_size=0.2, stratify=y_adv, random_state=42)
m = LGBMClassifier(n_estimators=200).fit(Xtr, ytr)
print("Adversarial AUC:", roc_auc_score(yva, m.predict_proba(Xva)[:,1]))


In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import QuantileTransformer

def transform_features(df, skew_threshold=1.0, use_quantile=False):
    df_transformed = df.copy()
    
    numeric_cols = df_transformed.select_dtypes(include=[np.number]).columns
    
    # 1. Apply log1p for skewed features
    for col in numeric_cols:
        skew_val = df_transformed[col].skew()
        if abs(skew_val) > skew_threshold:
            df_transformed[col] = np.log1p(df_transformed[col].clip(lower=0))  # clip negatives
            print(f"Applied log1p to: {col} (skew={skew_val:.2f})")
    
    # 2. Optionally apply quantile transformation
    # if use_quantile:
    #     qt = QuantileTransformer(output_distribution="normal", random_state=42)
    #     df_transformed[numeric_cols] = qt.fit_transform(df_transformed[numeric_cols])
    #     print("Applied QuantileTransformer to numeric features")
    
    return df_transformed


In [None]:
X_train_transformed = transform_features(X_train, skew_threshold=1.0, use_quantile=False)
X_val_transformed   = transform_features(X_val, skew_threshold=1.0, use_quantile=False)

In [None]:
dtrain_trans = xgb.DMatrix(X_train_transformed, label = y_train)
dval_trans = xgb.DMatrix(X_val_transformed, label = y_val)

xgb_model = xgb.train(
    xgb_best_params,
    dtrain_trans,
    num_boost_round=2000,
    evals=[(dval, "validation")],
    early_stopping_rounds=100,
    verbose_eval=False
)

In [None]:
cat_model = CatBoostClassifier(**cat_best_params, iterations=2000, task_type="GPU", verbose=0)
cat_model.fit(X_train, y_train)

In [None]:
y_pred_cat = cat_model.predict_proba(X_val_transformed)[:, 1]
# y_pred = (y_pred_proba > 0.35).astype(int)

auc_score = roc_auc_score(y_val, y_pred_cat)

print(f'AUC Score: {auc_score}')

In [None]:
y_pred_xgb = xgb_model.predict(dval_trans)
# y_pred = (y_pred_proba > 0.35).astype(int)

auc_score = roc_auc_score(y_val, y_pred_xgb)

print(f'AUC Score: {auc_score}')

In [None]:
weights = [0.7, 0.3]  # [XGB, Cat]
ensemble_proba = weights[0] * y_pred_xgb + weights[1] * y_pred_cat


print("Ensemble AUC:", roc_auc_score(y_val, ensemble_proba))

In [None]:
# best_params = {'learning_rate': 0.16480795387917288, 'depth': 5, 'l2_leaf_reg': 0.0005322391948166793, 'random_strength': 7.786614741276959, 'bootstrap_type': 'Bernoulli', 'subsample': 0.5875758867895423}

In [None]:
# Optuna Objective Function for XGBoost with transformed features
def objective(trial):
    # Define hyperparameter search space
    params = {
        "objective": "binary:logistic",
        "eval_metric": "auc",
        "tree_method": "hist", 
        "device": "cuda",      
        "eta": trial.suggest_float("eta", 0.01, 0.3),
        "max_depth": trial.suggest_int("max_depth", 3, 12),
        "subsample": trial.suggest_float("subsample", 0.5, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
        "gamma": trial.suggest_float("gamma", 0.0, 5.0),
        "lambda": trial.suggest_float("lambda", 1e-8, 10.0, log=True),
        "alpha": trial.suggest_float("alpha", 1e-8, 10.0, log=True),
    }

    # Stratified k-fold CV
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    aucs = []

    for train_idx, val_idx in skf.split(X_train, y_train):
        X_tr, X_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
        y_tr, y_val = y_train.iloc[train_idx], y_train.iloc[val_idx]

        dtrain = xgb.DMatrix(X_tr, label=y_tr)
        dval = xgb.DMatrix(X_val, label=y_val)

        model = xgb.train(
            params,
            dtrain,
            num_boost_round=1000,
            evals=[(dval, "validation")],
            early_stopping_rounds=50,
            verbose_eval=False
        )

        preds_prob = model.predict(dval)
        preds = (preds_prob > 0.35).astype(int)
        aucs.append(roc_auc_score(y_val, preds))

    return np.mean(aucs)


# Run Optuna
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=50) 

print("Best params:", study.best_trial.params)
print("Best AUC:", study.best_trial.value)


In [None]:
xgb_best_params = {'eta': 0.01109609133854797, 'max_depth': 3, 'subsample': 0.8281878643153439, 'colsample_bytree': 0.9931500979412877, 'gamma': 0.06097481929038295, 'lambda': 1.3089984380695487, 'alpha': 0.010773357859785837, 'seed': 42}
dtrain = xgb.DMatrix(X_train, label=y_train)
dval = xgb.DMatrix(X_val, label=y_val)

In [None]:
# Train final XGBoost model with best hyperparameters
xgb_model = xgb.train(
    xgb_best_params,
    dtrain,
    num_boost_round=150,
    evals=[(dval, "validation")],
    verbose_eval=False
)

In [None]:
# Define XGBoost Classifier model
model = xgb.XGBClassifier(
    objective='binary:logistic',
    n_estimators=150,
    learning_rate=0.179,
    max_depth=11,
    subsample=0.7,
    colsample_bytree=0.7,
    gamma= 0.21,
    reg_lambda= 2.35e-08,
    alpha= 2.05e-06,
    eval_metric='auc',
    tree_method='hist',
    device='cuda'
)

In [None]:
# Train XGBoost Classifier on train set and evaluate on validation set
model.fit(X_train, y_train)

y_prob = model.predict_proba(X_val)[:,1]
y_pred = (y_prob > 0.35).astype(int)
auc_score = roc_auc_score(y_val, y_pred)

print(f'AUC Score: {auc_score}')


In [None]:
# import numpy as np
# import pandas as pd
# import os
# import xgboost as xgb
# from sklearn.model_selection import KFold
# from sklearn.metrics import roc_auc_score

# Install Optuna
# !pip install optuna

# import optuna

# --- Data Loading ---
# train_df = pd.read_csv('/kaggle/input/iisc-umc-301-kaggle-competition-1/train.csv')
# X = train_df.drop(['id', 'song_popularity'], axis=1)
# y = train_df['song_popularity']

# --- Define the Optuna Objective Function ---
# def objective(trial):
#     # Suggest hyperparameters to the trial
#     param = {
#         'objective': 'binary:logistic',
#         'eval_metric': 'auc',
#         'tree_method': 'hist',
#         'device':'cuda',
        
#         'n_estimators': trial.suggest_int('n_estimators', 100, 500),
#         'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.2, log=True),
#         'max_depth': trial.suggest_int('max_depth', 3, 15),
#         'subsample': trial.suggest_float('subsample', 0.6, 1.0),
#         'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
#         'gamma': trial.suggest_float('gamma', 0.0, 1.0),
#         'lambda': trial.suggest_float('lambda', 1e-8, 1.0, log=True),
#         'alpha': trial.suggest_float('alpha', 1e-8, 1.0, log=True),
#     }

#     # Set up K-fold cross-validation
#     kf = KFold(n_splits=5, shuffle=True, random_state=42)
#     auc_scores = []
    
#     # Loop through each fold
#     for train_index, val_index in kf.split(X, y):
#         X_train, X_val = X.iloc[train_index], X.iloc[val_index]
#         y_train, y_val = y.iloc[train_index], y.iloc[val_index]

#         # Initialize and fit the model with the current trial's parameters
#         model = xgb.XGBClassifier(**param, early_stopping_rounds=50,)
#         model.fit(X_train, y_train, 
#                   eval_set=[(X_val, y_val)], 
#                   verbose=False
#                 )
        
#         # Predict probabilities and calculate AUC
#         y_pred_proba = model.predict(X_val)
#         fold_auc = roc_auc_score(y_val, y_pred_proba)
#         auc_scores.append(fold_auc)

#     # Return the average AUC score across all folds
#     return np.mean(auc_scores)

# # --- Run the Optuna Study ---
# # Create a study object and specify that we want to maximize the AUC score
# study = optuna.create_study(direction='maximize')
# # Run the optimization for a specified number of trials
# study.optimize(objective, n_trials=200)

# # --- Print the results ---
# print("\n--- Optuna Optimization Results ---")
# print(f"Best AUC score: {study.best_value:.4f}")
# print("Best hyperparameters found:")
# for key, value in study.best_params.items():
#     print(f"  {key}: {value}")

In [None]:
# Define the XGBoost Classifier with the best hyperparameters found by Optuna
new_model = xgb.XGBClassifier(
        objective='binary:logistic',
        n_estimators= 223,
        learning_rate= 0.19146753175063527,
        max_depth= 14,
        subsample= 0.6004353115430108,
        colsample_bytree= 0.9343428565854878,
        gamma= 0.2092019760043383,
        reg_lambda= 2.349046247166798e-08,
        alpha= 2.048689819651056e-06,
        eval_metric='auc',
        tree_method='hist',
        device = 'cuda'
)
    

In [None]:
# Train the new model on the entire training set and evaluate on validation set
new_model.fit(X_train, y_train)
y_pred_proba_new = new_model.predict_proba(X_val)[:, 1]

auc_score = roc_auc_score(y_val, y_pred_proba_new)

print(f'AUC Score: {auc_score}')

In [None]:
# Make predictions on the test set using the new model
y_test_2 = new_model.predict(X_test)

results_df_1 = pd.DataFrame({
    'id' : test_ids,
    'song_popularity': y_test_2
})
results_df_1.to_csv('/kaggle/working/predictions_7.csv', index=False)

In [None]:
# Try using RandomForestClassifier for classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline

In [None]:
kf = KFold(n_splits=5, shuffle=True, random_state=42)
auc_scores = []

In [None]:
# Optuna Objective Function for RandomForest with KNN Imputer

# def objective(trial):
#     n_neighbors = trial.suggest_int('imputer__n_neighbors', 2, 10)
#     n_estimators = trial.suggest_int('classifier__n_estimators', 50, 500)
#     max_depth = trial.suggest_int('classifier__max_depth', 2, 32)
#     min_samples_split = trial.suggest_int('classifier__min_samples_split', 2, 20)
#     min_samples_leaf = trial.suggest_int('classifier__min_samples_leaf', 1, 20)
#     max_features = trial.suggest_categorical('classifier__max_features', ['sqrt', 'log2', None])

#     pipeline = Pipeline([
#         ('imputer', KNNImputer()),
#         ('classifier', RandomForestClassifier(random_state=42))
#     ])
    
#     params = {
#         'imputer__n_neighbors': n_neighbors,
#         'classifier__n_estimators': n_estimators,
#         'classifier__max_depth': max_depth,
#         'classifier__min_samples_split': min_samples_split,
#         'classifier__min_samples_leaf': min_samples_leaf,
#         'classifier__max_features': max_features
#     }
#     pipeline.set_params(**params)
    
#     pipeline.fit(X_train, y_train)

#     y_pred_proba = pipeline.predict(X_val)
#     auc_score = roc_auc_score(y_val, y_pred_proba)

#     return auc_score

# study = optuna.create_study(direction='maximize')
# study.optimize(objective, n_trials=100)

# print("Best hyperparameters:", study.best_params)
# print("Best AUC Score:", study.best_value)

# best_params = study.best_params
# best_model = RandomForestClassifier(random_state=42)
# best_model.set_params(**{k.replace('classifier__', ''): v for k, v in best_params.items() if k.startswith('classifier__')})

# best_imputer = KNNImputer()
# best_imputer.set_params(**{k.replace('imputer__', ''): v for k, v in best_params.items() if k.startswith('imputer__')})

# final_pipeline = Pipeline([
#     ('imputer', best_imputer),
#     ('classifier', best_model)
# ])

# final_pipeline.fit(X_train, y_train)

# y_pred_proba = final_pipeline.predict(X_val)
# final_auc = roc_auc_score(y_val, y_pred_proba)
# print(f"AUC score of the final model on validation set: {final_auc}")

In [None]:
# Manual K-Fold Cross-Validation with RandomForest and KNN Imputer

# for fold, (train_index, val_index) in enumerate(kf.split(X, y)):
#     print(f"--- Fold {fold+1}/5 ---")
    
#     X_train, X_val = X.iloc[train_index], X.iloc[val_index]
#     y_train, y_val = y.iloc[train_index], y.iloc[val_index]

#     imputer = KNNImputer(n_neighbors=5)
#     imputer.fit(X_train)
#     X_train_imputed = imputer.transform(X_train)
#     X_val_imputed = imputer.transform(X_val)
    
#     scaler = StandardScaler()
#     scaler.fit(X_train_imputed)
#     X_train_scaled = scaler.transform(X_train_imputed)
#     X_val_scaled = scaler.transform(X_val_imputed)
    
#     model = RandomForestClassifier(
#         n_estimators=200, 
#         max_depth=10, 
#         random_state=42, 
#         n_jobs=-1
#     )
    
#     model.fit(X_train_scaled, y_train)
    
#     y_pred_proba = model.predict(X_val_scaled)
    
#     fold_auc = roc_auc_score(y_val, y_pred_proba)
#     auc_scores.append(fold_auc)
#     print(f"Fold {fold+1} AUC: {fold_auc:.4f}")

# print("\n--- Cross-Validation Results ---")
# print(f"Average AUC: {np.mean(auc_scores):.4f}")
# print(f"Standard Deviation: {np.std(auc_scores):.4f}")

In [None]:
# Try using CatBoostClassifier for classification
from catboost import CatBoostClassifier, Pool
from sklearn.model_selection import StratifiedKFold

In [None]:
# Optuna Objective Function for CatBoost with pruning

def objective(trial):
    params = {
        'iterations': trial.suggest_int('iterations', 100, 1000),
        'learning_rate': trial.suggest_float('learning_rate', 0.05, 0.3, log=True),
        'depth': trial.suggest_int('depth', 4, 10),
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1e-8, 10.0, log=True),
        'bootstrap_type': trial.suggest_categorical('bootstrap_type', ['Bayesian', 'Bernoulli']),
        'eval_metric': 'AUC',
        'early_stopping_rounds': 1000,
        'verbose': 0,
        'random_seed': 42
    }
    
    model = CatBoostClassifier(**params)
    
    pruning_callback = optuna.integration.CatBoostPruningCallback(trial, 'AUC')
    
    train_pool = Pool(X_train, y_train)
    val_pool = Pool(X_val, y_val)
    
    model.fit(train_pool, eval_set=val_pool, callbacks=[pruning_callback])
    
    preds_proba = model.predict(X_val)
    
    auc_score = roc_auc_score(y_val, preds_proba)
    return auc_score

study = optuna.create_study(direction='maximize', pruner=optuna.pruners.MedianPruner(n_startup_trials=5, n_warmup_steps=30))
study.optimize(objective, n_trials=100)



In [None]:
# print("Best trial:")
# trial = study.best_trial
# print(f"  Value: {trial.value}")
# print("  Params:")
# for key, value in trial.params.items():
#     print(f"    {key}: {value}")

In [None]:
# Train final CatBoost model with best hyperparameters

cat_model = CatBoostClassifier(
    verbose=0,
    random_state=42,
    iterations = 520,
    learning_rate = 0.17,
    depth = 9,
    l2_leaf_reg = 5.79e-07,
    bootstrap_type = 'Bayesian',
    task_type='GPU'
)
cat_model.fit(X_train, y_train)

# Make predictions


In [None]:
# Evaluate CatBoost on validation set
y_pred_cat_prob = cat_model.predict_proba(X_val)[:, 1]
y_pred_cat = (y_pred_cat_prob > 0.5).astype(int)
cat_auc = roc_auc_score(y_val, y_pred_cat_prob)
print("CatBoost AUC: ",cat_auc)


In [None]:
# Make predictions on the test set

y_test_cat_proba = cat_model.predict_proba(X_test)[:, 1]
y_test_cat = (y_test_cat_proba > 0.35).astype(int)

results_df_cat = pd.DataFrame({
    'id' : test_ids,
    'song_popularity': y_test_cat
})
results_df_cat.to_csv('/kaggle/working/predictions_cat_3.csv', index=False)

In [None]:
# Feature Importance from CatBoost and XGBoost

feature_importance_cat = cat_model.get_feature_importance()
feature_importance_xgb = model.feature_importances_
feature_names = X_train.columns 


importance_df_cat = pd.DataFrame({
    'feature': feature_names,
    'importance': feature_importance_cat
}).sort_values(by='importance', ascending=False)

importance_df = pd.DataFrame({
    'feature': feature_names,
    'importance': feature_importance_xgb
}).sort_values(by='importance', ascending=False)

print(importance_df)
print(importance_df_cat)

In [None]:
# Make predictions on the test set using XGBoost and CatBoost and ensemble them

y_test_en_prob = (y_test_cat_proba + y_test_1_proba)/2

# results_df_en = pd.DataFrame({
#     'id' : test_ids,
#     'song_popularity': y_test_en
# })
# results_df_1.to_csv('/kaggle/working/predictions_en.csv', index=False)

In [None]:
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
import xgboost as xgb
from catboost import CatBoostClassifier

# Using Stratified K-Fold Cross-Validation to evaluate XGBoost and CatBoost

# Define models (reuse your tuned params)
xgb_model = xgb.XGBClassifier(
    objective='binary:logistic',
    n_estimators=150,
    learning_rate=0.179,
    max_depth=11,
    subsample=0.7,
    colsample_bytree=0.7,
    gamma=0.21,
    reg_lambda=2.35e-08,
    alpha=2.05e-06,
    eval_metric='auc',
    tree_method='hist',
    device='cuda',
    random_state=42,
    use_label_encoder=False
)

cat_model = CatBoostClassifier(
    verbose=0,
    random_state=42,
    iterations = 520,
    learning_rate = 0.17,
    depth = 9,
    l2_leaf_reg = 5.79e-07,
    bootstrap_type = 'Bayesian',
    task_type='GPU'
)

# Stratified KFold
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

xgb_scores = []
cat_scores = []

for train_idx, val_idx in skf.split(X, y):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
    
    # XGBoost
    xgb_model.fit(X_train, y_train)
    xgb_pred_proba = xgb_model.predict_proba(X_val)[:, 1]
    xgb_pred = (xgb_pred_proba > 0.35).astype(int)
    xgb_auc = roc_auc_score(y_val, xgb_pred)
    xgb_scores.append(xgb_auc)
    
    # CatBoost
    cat_model.fit(X_train, y_train)
    cat_pred_proba = cat_model.predict_proba(X_val)[:, 1]
    cat_pred = (cat_pred_proba > 0.35).astype(int)
    cat_auc = roc_auc_score(y_val, cat_pred)
    cat_scores.append(cat_auc)

print(f"XGBoost CV AUC: {np.mean(xgb_scores):.4f} ± {np.std(xgb_scores):.4f}")
print(f"CatBoost CV AUC: {np.mean(cat_scores):.4f} ± {np.std(cat_scores):.4f}")



In [None]:
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
import xgboost as xgb
from catboost import CatBoostClassifier

# Using Stratified K-Fold Cross-Validation with imbalance handling

# Compute imbalance ratio
pos = y.sum()
neg = len(y) - pos
scale_pos_weight = neg / pos
print(f"scale_pos_weight (neg/pos): {scale_pos_weight:.2f}")

# Define models with imbalance handling
xgb_model = xgb.XGBClassifier(
    objective='binary:logistic',
    n_estimators=150,
    learning_rate=0.179,
    max_depth=11,
    subsample=0.7,
    colsample_bytree=0.7,
    gamma=0.21,
    reg_lambda=2.35e-08,
    alpha=2.05e-06,
    eval_metric='auc',
    tree_method='hist',
    device='cuda',
    random_state=42,
    use_label_encoder=False,
    scale_pos_weight=scale_pos_weight
)

cat_model = CatBoostClassifier(
    verbose=0,
    random_state=42,
    iterations=520,
    learning_rate=0.17,
    depth=9,
    l2_leaf_reg=5.79e-07,
    bootstrap_type='Bayesian',
    task_type='GPU',
    class_weights=[1.0, scale_pos_weight]  # weight negative=1, positive=ratio
)

# Stratified KFold
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

xgb_scores = []
cat_scores = []

for train_idx, val_idx in skf.split(X, y):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
    
    # XGBoost
    xgb_model.fit(X_train, y_train)
    xgb_pred_proba = xgb_model.predict_proba(X_val)[:, 1]
    xgb_pred = (xgb_pred_proba > 0.5).astype(int)
    xgb_auc = roc_auc_score(y_val, xgb_pred)
    xgb_scores.append(xgb_auc)
    
    # CatBoost
    cat_model.fit(X_train, y_train)
    cat_pred_proba = cat_model.predict_proba(X_val)[:, 1]
    cat_pred = (cat_pred_proba > 0.5).astype(int)
    cat_auc = roc_auc_score(y_val, cat_pred)
    cat_scores.append(cat_auc)

print(f"XGBoost (weighted) CV AUC: {np.mean(xgb_scores):.4f} ± {np.std(xgb_scores):.4f}")
print(f"CatBoost (weighted) CV AUC: {np.mean(cat_scores):.4f} ± {np.std(cat_scores):.4f}")


In [None]:
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
import xgboost as xgb
from catboost import CatBoostClassifier

# Using Stratified K-Fold Cross-Validation with imbalance handling and ensembling

# imbalance ratio
pos = y.sum()
neg = len(y) - pos
scale_pos_weight = neg / pos
print(f"scale_pos_weight (neg/pos): {scale_pos_weight:.2f}")

# Define models
xgb_model = xgb.XGBClassifier(
    objective='binary:logistic',
    n_estimators=150,
    learning_rate=0.179,
    max_depth=11,
    subsample=0.7,
    colsample_bytree=0.7,
    gamma=0.21,
    reg_lambda=2.35e-08,
    alpha=2.05e-06,
    eval_metric='auc',
    tree_method='hist',
    device='cuda',
    random_state=42,
    use_label_encoder=False
)

cat_model = CatBoostClassifier(
    verbose=0,
    random_state=42,
    iterations=520,
    learning_rate=0.17,
    depth=9,
    l2_leaf_reg=5.79e-07,
    bootstrap_type='Bayesian',
    task_type='GPU',
    class_weights=[1.0, scale_pos_weight]
)

# Stratified KFold
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

xgb_scores = []
cat_scores = []
ensemble_scores = []

# ensemble weights (you can tune these!)
cat_weight = 0.7
xgb_weight = 0.3

for train_idx, val_idx in skf.split(X, y):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
    
    # XGBoost
    xgb_model.fit(X_train, y_train)
    xgb_pred_proba = xgb_model.predict_proba(X_val)[:, 1]
    xgb_pred = (xgb_pred_proba > 0.5).astype(int)
    xgb_auc = roc_auc_score(y_val, xgb_pred)
    xgb_scores.append(xgb_auc)
    
    # CatBoost
    cat_model.fit(X_train, y_train)
    cat_pred_proba = cat_model.predict_proba(X_val)[:, 1]
    cat_pred = (cat_pred_proba > 0.5).astype(int)
    cat_auc = roc_auc_score(y_val, cat_pred)
    cat_scores.append(cat_auc)
    
    # Weighted Ensemble
    ensemble_pred = xgb_weight * xgb_pred + cat_weight * cat_pred
    ensemble_auc = roc_auc_score(y_val, ensemble_pred)
    ensemble_scores.append(ensemble_auc)

print(f"XGBoost CV AUC: {np.mean(xgb_scores):.4f} ± {np.std(xgb_scores):.4f}")
print(f"CatBoost CV AUC: {np.mean(cat_scores):.4f} ± {np.std(cat_scores):.4f}")
print(f"Ensemble ({xgb_weight:.1f}*XGB + {cat_weight:.1f}*Cat) CV AUC: "
      f"{np.mean(ensemble_scores):.4f} ± {np.std(ensemble_scores):.4f}")


In [None]:
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
import xgboost as xgb
from catboost import CatBoostClassifier

# Using Stratified K-Fold Cross-Validation with imbalance handling and ensembling with different weights

# imbalance ratio
pos = y.sum()
neg = len(y) - pos
scale_pos_weight = neg / pos
print(f"scale_pos_weight (neg/pos): {scale_pos_weight:.2f}")

# Define models
xgb_model = xgb.XGBClassifier(
    objective='binary:logistic',
    n_estimators=150,
    learning_rate=0.179,
    max_depth=11,
    subsample=0.7,
    colsample_bytree=0.7,
    gamma=0.21,
    reg_lambda=2.35e-08,
    alpha=2.05e-06,
    eval_metric='auc',
    tree_method='hist',
    device='cuda',
    random_state=42,
    use_label_encoder=False
)

cat_model = CatBoostClassifier(
    verbose=0,
    random_state=42,
    iterations=520,
    learning_rate=0.17,
    depth=9,
    l2_leaf_reg=5.79e-07,
    bootstrap_type='Bayesian',
    task_type='GPU',
    class_weights=[1.0, scale_pos_weight]
)

# Stratified KFold
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# weight search
weight_grid = np.linspace(0, 1, 11)  # 0.0, 0.1, ... 1.0
results = {}

for w in weight_grid:
    ensemble_scores = []
    for train_idx, val_idx in skf.split(X, y):
        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

        # Fit models
        xgb_model.fit(X_train, y_train)
        xgb_pred_proba = xgb_model.predict_proba(X_val)[:, 1]
        xgb_pred = (xgb_pred_proba > 0.5).astype(int)

        cat_model.fit(X_train, y_train)
        cat_pred = cat_model.predict_proba(X_val)[:, 1]

        # Ensemble with current weight
        ensemble_pred = w * xgb_pred + (1 - w) * cat_pred
        ensemble_auc = roc_auc_score(y_val, ensemble_pred)
        ensemble_scores.append(ensemble_auc)

    results[w] = (np.mean(ensemble_scores), np.std(ensemble_scores))

# Display results sorted by AUC
for w, (mean_auc, std_auc) in sorted(results.items(), key=lambda x: -x[1][0]):
    print(f"Weight {w:.1f} (XGB {w:.1f}, Cat {1-w:.1f}) "
          f"=> CV AUC: {mean_auc:.4f} ± {std_auc:.4f}")


In [None]:
# Predict on test set with best ensemble weights
y_test_cat_proba = cat_model.predict_proba(X_test)[:, 1]
y_test_cat = (y_test_cat_proba > 0.5).astype(int)

results_df_cat = pd.DataFrame({
    'id' : test_ids,
    'song_popularity': y_test_cat
})
results_df_cat.to_csv('/kaggle/working/predictions_cat_5.csv', index=False)

In [None]:
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from catboost import CatBoostClassifier

# Using Stratified K-Fold Cross-Validation with CatBoost, imbalance handling, and bagging

# imbalance ratio
pos = y.sum()
neg = len(y) - pos
scale_pos_weight = neg / pos
print(f"scale_pos_weight (neg/pos): {scale_pos_weight:.2f}")

# Bagging settings
n_bags = 5
seeds = [42, 99, 123, 2024, 2025]  # you can expand this list

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

bagged_scores = []

for train_idx, val_idx in skf.split(X, y):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    bagged_preds_proba = np.zeros(len(X_val))

    for seed in seeds[:n_bags]:
        cat_model = CatBoostClassifier(
            verbose=0,
            random_state=seed,
            iterations=520,
            learning_rate=0.17,
            depth=9,
            l2_leaf_reg=5.79e-07,
            bootstrap_type='Bayesian',
            task_type='GPU',
            class_weights=[1.0, scale_pos_weight]
        )

        cat_model.fit(X_train, y_train)
        bagged_preds_proba += cat_model.predict_proba(X_val)[:, 1]

    # Average predictions from all bags
    bagged_preds_proba /= n_bags
    

    # Get hard labels if needed
    bagged_preds = (bagged_preds_proba > 0.5).astype(int)
    bagged_auc = roc_auc_score(y_val, bagged_preds)
    bagged_scores.append(bagged_auc)

print(f"Bagged CatBoost (n={n_bags}) CV AUC: {np.mean(bagged_scores):.4f} ± {np.std(bagged_scores):.4f}")


In [None]:
import optuna
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from catboost import CatBoostClassifier

# Optuna Objective Function for CatBoost with more hyperparameters

def objective(trial):
    params = {
        "iterations": 5000,  # we’ll use early stopping anyway
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3),
        "depth": trial.suggest_int("depth", 4, 12),
        "l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 1e-8, 10.0, log=True),
        "random_strength": trial.suggest_float("random_strength", 0.1, 10.0),
        "task_type": "GPU", 
        "loss_function": "Logloss",
        "eval_metric": "AUC",
        "verbose": 0,
        "random_seed": 42
    }

    # Choose bootstrap type
    bootstrap_type = trial.suggest_categorical("bootstrap_type", ["Bayesian", "Bernoulli", "Poisson"])
    params["bootstrap_type"] = bootstrap_type

    if bootstrap_type == "Bayesian":
        params["bagging_temperature"] = trial.suggest_float("bagging_temperature", 0, 10)
    else:
        params["subsample"] = trial.suggest_float("subsample", 0.5, 1.0)

    # Cross-validation
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    aucs = []

    for train_idx, val_idx in skf.split(X_train, y_train):
        X_tr, X_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
        y_tr, y_val = y_train.iloc[train_idx], y_train.iloc[val_idx]

        model = CatBoostClassifier(**params)
        model.fit(X_tr, y_tr,
                  eval_set=(X_val, y_val),
                  early_stopping_rounds=100,
                  verbose=0)

        preds = model.predict_proba(X_val)[:, 1]
        aucs.append(roc_auc_score(y_val, preds))

    return sum(aucs) / len(aucs)

# Run Optuna
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=50)  # increase n_trials for better tuning

print("Best trial:", study.best_trial.params)


In [None]:
# Extract best parameters
best_params = study.best_trial.params
print("Best trial:", best_params)


# Bagging CatBoost with Best Params
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
val_preds_bagged = np.zeros(len(X_train))

n_bags = 5
for seed in range(n_bags):
    model = CatBoostClassifier(
        **best_params,
        iterations=2000,
        eval_metric="AUC",
        random_seed=seed,
        task_type="GPU",
        verbose=0
    )

    for train_idx, val_idx in kf.split(X_train, y_train):
        X_tr, X_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
        y_tr, y_val = y_train.iloc[train_idx], y_train.iloc[val_idx]

        model.fit(X_tr, y_tr, eval_set=(X_val, y_val), early_stopping_rounds=100, verbose=0)
        val_preds_bagged[val_idx] += model.predict_proba(X_val)[:, 1]

val_preds_bagged /= n_bags
val_auc = roc_auc_score(y_train, val_preds_bagged)
print("Bagged CatBoost Validation AUC:", val_auc)

# Threshold Tuning
best_thresh = 0.5
best_auc = 0

for thresh in np.linspace(0.3, 0.7, 41):  # step of 0.01
    preds = (val_preds_bagged > thresh).astype(int)
    auc = roc_auc_score(y_train, preds)
    if auc > best_auc:
        best_auc = auc
        best_thresh = thresh

print("Best Threshold:", best_thresh, "AUC at threshold:", best_auc)

In [None]:
from sklearn.metrics import roc_auc_score
import numpy as np

# Optuna Objective Function for LightGBM imbalance handling

def objective(trial):
    params = {
        "objective": "binary",
        "metric": "auc",
        "boosting_type": "gbdt",
        "n_estimators": 10000,  # large, let early stopping cut it down
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3, log=True),
        "num_leaves": trial.suggest_int("num_leaves", 16, 512, log=True),
        "max_depth": trial.suggest_int("max_depth", -1, 16),
        "min_child_samples": trial.suggest_int("min_child_samples", 5, 200),
        "subsample": trial.suggest_float("subsample", 0.5, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
        "reg_lambda": trial.suggest_float("reg_lambda", 1e-8, 10.0, log=True),
        "reg_alpha": trial.suggest_float("reg_alpha", 1e-8, 10.0, log=True),
        "random_state": 42,
        "n_jobs": -1,
        "class_weight": "balanced",
        "device": "gpu"
        
    }

    kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    aucs = []

    for train_idx, val_idx in kf.split(X, y):
        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

        model = lgb.LGBMClassifier(**params)

        model.fit(
            X_train, y_train,
            eval_set=[(X_val, y_val)],
            eval_metric="auc",
            callbacks=[lgb.early_stopping(stopping_rounds=50, verbose=False)]
        )

        y_val_pred = model.predict_proba(X_val)[:, 1]
        y_pred = (y_val_pred > 0.5).astype(int)
        aucs.append(roc_auc_score(y_val, y_pred))

    return np.mean(aucs)

# Run Optuna
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=50, show_progress_bar=True)

print("Best AUC:", study.best_value)
print("Best params:", study.best_params)

In [None]:
import lightgbm as lgb
import pandas as pd

# Extract best parameters and evaluate on validation set
# Make predictions on test set

best_params = study.best_params
best_params.update({
    "objective": "binary",
    "metric": "auc",
    "boosting_type": "gbdt",
    "n_estimators": 10000,   # use early stopping
    "random_state": 42,
    "class_weight": "balanced",
    "device": "gpu"
})

# Train final LightGBM model
lgb_model = lgb.LGBMClassifier(**best_params)

lgb_model.fit(
    X_train, y_train,
    eval_set=[(X_val, y_val)],
    eval_metric="auc",
    callbacks=[lgb.early_stopping(stopping_rounds=50, verbose=False)]
)

# Predict on test
y_test_lgb_proba = lgb_model.predict_proba(X_test)[:, 1]
y_test_lgb = (y_test_lgb_proba > 0.5).astype(int)

# # Save results
# results_df_lgb = pd.DataFrame({
#     "id": test_ids,
#     "song_popularity": y_test_lgb
# })

# results_df_lgb.to_csv("/kaggle/working/predictions_lgb.csv", index=False)


In [None]:
from sklearn.metrics import roc_auc_score
import numpy as np
import pandas as pd

# Validation Ensemble of CatBoost and LightGBM with Probabilities

cat_val_proba = cat_model.predict_proba(X_val)[:, 1]
lgb_val_proba = lgb_model.predict_proba(X_val)[:, 1]

# ensemble weights
w_cat, w_lgb = 0.5, 0.5  # equal weights

# ensemble probabilities
val_ensemble_proba = w_cat * cat_val_proba + w_lgb * lgb_val_proba

# compute AUC on probabilities
val_auc = roc_auc_score(y_val, val_ensemble_proba)
print(f"Validation AUC (ensemble): {val_auc:.4f}")

# ---- Final Test Predictions ----
# get test probabilities
cat_test_proba = cat_model.predict_proba(X_test)[:, 1]
lgb_test_proba = lgb_model.predict_proba(X_test)[:, 1]

# ensemble probabilities
test_ensemble_proba = w_cat * cat_test_proba + w_lgb * lgb_test_proba

# convert to binary 0/1 
test_ensemble_pred = (test_ensemble_proba > 0.5).astype(int)

# save to csv 
results_df = pd.DataFrame({
    'id': test_ids,
    'song_popularity': test_ensemble_pred
})
results_df.to_csv('/kaggle/working/predictions_ensemble.csv', index=False)
print("Submission file saved: predictions_ensemble.csv")
