# Data processing

In [1]:
import pandas as pd

In [2]:
# -------------------------------
# 1. Load the data
# -------------------------------
train_df = pd.read_csv(r"E:\project\MLComp\FindIT\2025\data\train.csv")
target_df = pd.read_csv(r"E:\project\MLComp\FindIT\2025\data\target.csv")
test_df = pd.read_csv(r"E:\project\MLComp\FindIT\2025\data\test.csv")
submission_format = pd.read_csv(r"E:\project\MLComp\FindIT\2025\data\submission_format.csv")
# Concatenate train and target as they are already in order.
train_df["coppaRisk"] = target_df["coppaRisk"]

# -------------------------------
# 2. New Missing‐Value and Flag Features
# -------------------------------

# 2.1 countryCode: treat NaN as its own category 'nan'
for df in [train_df, test_df]:
    df["countryCode"] = df["countryCode"].fillna("nan")  # fill missing categoricals with 'nan' :contentReference[oaicite:0]{index=0}

# 2.2 downloads → Downloads_avg: parse ranges; if downloads is NaN, set Downloads_avg = 0
def parse_downloads(x):
    if pd.isna(x):
        return 0.0  # missing downloads → zero average :contentReference[oaicite:1]{index=1}
    parts = x.split('-')
    if len(parts)==2:
        return (float(parts[0].strip()) + float(parts[1].strip())) / 2.0
    return np.nan

for df in [train_df, test_df]:
    if 'downloads' in df.columns:
        df["Downloads_avg"] = df["downloads"].apply(parse_downloads)

# 2.3 hasTermsOfServiceLink & hasTermsOfServiceLinkRating: missing → new category 'None'
for df in [train_df, test_df]:
    df[["hasTermsOfServiceLink", "hasTermsOfServiceLinkRating"]] = \
        df[["hasTermsOfServiceLink", "hasTermsOfServiceLinkRating"]].fillna("None")  # new category :contentReference[oaicite:2]{index=2}

# 2.4 adSpent: flag missingness and replace missing with -1
for df in [train_df, test_df]:
    df["adSpend_isna"] = df["adSpent"].isna().astype(int)  # 1 if missing, 0 otherwise :contentReference[oaicite:3]{index=3}
    df["adSpent"] = df["adSpent"].fillna(-1)               # missing adSpent → -1 :contentReference[oaicite:4]{index=4}

# 2.5 averageUserRating must be integer 0–5
# for df in [train_df, test_df]:
#     df["averageUserRating"] = df["averageUserRating"].round().astype(int)
#     df["averageUserRating"] = np.clip(df["averageUserRating"], 0, 5)  # ensure within [0,5] :contentReference[oaicite:5]{index=5}

# 2.6 appContentBrandSafetyRating: treat NaN as its own category 'nan'
for df in [train_df, test_df]:
    df["appContentBrandSafetyRating"] = df["appContentBrandSafetyRating"].fillna("nan")  # new category :contentReference[oaicite:6]{index=6}

# -------------------------------
# 3. Data Cleaning Functions
# -------------------------------
def clean_developer_country(df):
    mask = df["developerCountry"].isin([
        "ADDRESS NOT LISTED IN PLAYSTORE",
        "CANNOT IDENTIFY COUNTRY",
        "PERSONAL DATA, CAN NOT BE PUBLICLY DISCLOSED ACCORDING TO APPLICABLE LAWS.",
        "STATUTORY MASKING ENABLED"
    ])
    df.loc[mask, "developerCountry"] = "Empty"
    return df

train_df = clean_developer_country(train_df)
test_df = clean_developer_country(test_df)

# -------------------------------
# 4. Define Feature Groups
# -------------------------------
numerical_cols = [
    'userRatingCount', 'isCorporateEmailScore', 'adSpent',
    'appAge', 'averageUserRating', 'Downloads_avg'
]
binary_cols = ['hasPrivacyLink', 'hasTermsOfServiceLink']
nominal_cols = [
    'developerCountry', 'countryCode', 'primaryGenreName',
    'deviceType', 'appContentBrandSafetyRating'
]
ordinal_cols = [
    'hasTermsOfServiceLinkRating',
    'appDescriptionBrandSafetyRating',
    'mfaRating'
]
ordinal_mapping = {'low': 1, 'medium': 2, 'high': 3}

# -------------------------------
# 5. Imputation Functions
# -------------------------------
from sklearn.experimental import enable_iterative_imputer  # noqa
from sklearn.impute import KNNImputer, IterativeImputer
from sklearn.preprocessing import OrdinalEncoder

def impute_nominal_knn(df, columns):
    temp = df[columns].replace("", np.nan).copy()
    encoder = OrdinalEncoder(
        handle_unknown="use_encoded_value", unknown_value=-1
    )
    encoded = encoder.fit_transform(temp)
    imputed = KNNImputer(n_neighbors=5).fit_transform(encoded)  # KNN imputation :contentReference[oaicite:7]{index=7}
    decoded = encoder.inverse_transform(np.round(imputed).astype(int))
    df[columns] = decoded
    return df

def impute_ordinal_numerical_iter(df, columns):
    df_num = df[columns].replace("", np.nan).astype(float)
    df[columns] = IterativeImputer(random_state=42).fit_transform(df_num)  # MICE imputation :contentReference[oaicite:8]{index=8}
    return df
# -------------------------------
# 6. Preprocess the datasets
# -------------------------------
def preprocess_df(df):
    # df = impute_nominal_knn(df, nominal_cols)  # fill nominal categoricals
    for col in ordinal_cols:
        df[col] = df[col].map(ordinal_mapping)
    impute_cols = ordinal_cols + numerical_cols
    # df = impute_ordinal_numerical_iter(df, impute_cols)
    # for col in ordinal_cols:
    #     df[col] = np.round(df[col]).astype(int)
    # One-hot encode nominal columns (now including appContentBrandSafetyRating)
    from sklearn.preprocessing import OneHotEncoder
    df = pd.get_dummies(df, columns=nominal_cols, drop_first=True)  # OHE :contentReference[oaicite:9]{index=9}
    for col in binary_cols + ['adSpend_isna']:
        df[col] = df[col].apply(lambda x: 1 if str(x).lower() in ["true", "1"] else 0)
    return df

train_processed = preprocess_df(train_df.copy())
test_processed  = preprocess_df(test_df.copy())
# 7. Final Alignment and Split
# -------------------------------
train_features = train_processed.drop(['coppaRisk', 'downloads'], axis=1, errors='ignore')
y = (train_processed['coppaRisk'].astype(str).str.lower() == 'true').astype(int)
test_features = test_processed.reindex(columns=train_features.columns, fill_value=0)

# Clean column names
import re
train_features.columns = train_features.columns.map(lambda x: re.sub(r'[^A-Za-z0-9_]+', '', x))
test_features.columns  = test_features.columns.map(lambda x: re.sub(r'[^A-Za-z0-9_]+', '', x))

# Stratified split
from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split(
    train_features, y, test_size=0.2, random_state=42, stratify=y
)

# Only catboost

In [3]:
import numpy as np
import optuna
from catboost import CatBoostClassifier
from sklearn.metrics import roc_auc_score

# Configuration
N_MODELS = 9
SEED     = 42
ROUNDS   = 300   # you can adjust

# Make sure you’ve already done:
# from sklearn.model_selection import train_test_split
# X_train, X_val, y_train, y_val = train_test_split( ... stratify=y ... )

def objective(trial):
    # 1) sample hyperparameters
    params = {
        'learning_rate':      trial.suggest_loguniform('learning_rate', 1e-3, 1e-1),
        'depth':              trial.suggest_int('depth', 4, 10),
        'l2_leaf_reg':        trial.suggest_loguniform('l2_leaf_reg', 1e-8, 10.0),
        'min_data_in_leaf':   trial.suggest_int('min_data_in_leaf', 5, 100),
        'border_count':       trial.suggest_int('border_count', 32, 255),
        'random_strength':    trial.suggest_float('random_strength', 0.0, 1.0),
        'subsample':          trial.suggest_float('subsample', 0.4, 1.0),
        'rsm':                trial.suggest_float('rsm', 0.4, 1.0),
        'bootstrap_type':     'Bernoulli',            # supports subsample on CPU
        'objective':          'Logloss',
        'eval_metric':        'AUC',
        'verbose':            False,
        'random_seed':        SEED,
        'task_type':          'CPU'
    }

    # 2) majority/minority splits on X_train
    arr = y_train.values
    maj_idx = np.where(arr == 0)[0]
    rar_idx = np.where(arr == 1)[0]
    rng = np.random.RandomState(SEED)
    splits = np.array_split(rng.permutation(maj_idx), N_MODELS)

    # 3) train an ensemble on X_train → predict on X_val
    preds = np.zeros(len(X_val), dtype=float)
    for grp in splits:
        idx = np.sort(np.concatenate([rar_idx, grp]))
        X_sub, y_sub = X_train.iloc[idx], y_train.iloc[idx]
        m = CatBoostClassifier(**params, iterations=ROUNDS)
        m.fit(X_sub, y_sub)
        preds += m.predict_proba(X_val)[:, 1]
    preds /= N_MODELS

    return roc_auc_score(y_val, preds)

# run Optuna
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=100, timeout=6000)

print("▶ Best CatBoost parameters:")
for k, v in study.best_trial.params.items():
    print(f"   {k}: {v}")
print(f"▶ Best hold-out AUC: {study.best_value:.4f}")


[I 2025-04-28 08:20:21,378] A new study created in memory with name: no-name-95e1939e-2c41-467f-9d2e-e4004d73fd8a
  'learning_rate':      trial.suggest_loguniform('learning_rate', 1e-3, 1e-1),
  'l2_leaf_reg':        trial.suggest_loguniform('l2_leaf_reg', 1e-8, 10.0),
[I 2025-04-28 08:20:26,441] Trial 0 finished with value: 0.8944539847899635 and parameters: {'learning_rate': 0.006255888445794001, 'depth': 4, 'l2_leaf_reg': 1.1164171204811146, 'min_data_in_leaf': 48, 'border_count': 186, 'random_strength': 0.3127155827025371, 'subsample': 0.4780949240439404, 'rsm': 0.765934571418941}. Best is trial 0 with value: 0.8944539847899635.
  'learning_rate':      trial.suggest_loguniform('learning_rate', 1e-3, 1e-1),
  'l2_leaf_reg':        trial.suggest_loguniform('l2_leaf_reg', 1e-8, 10.0),
[I 2025-04-28 08:20:33,912] Trial 1 finished with value: 0.8990067264190233 and parameters: {'learning_rate': 0.00897724153139161, 'depth': 7, 'l2_leaf_reg': 0.24567385068743638, 'min_data_in_leaf': 29, 

▶ Best CatBoost parameters:
   learning_rate: 0.03688688094534395
   depth: 7
   l2_leaf_reg: 0.06202998934307267
   min_data_in_leaf: 93
   border_count: 141
   random_strength: 0.8195253675619897
   subsample: 0.46528052286516963
   rsm: 0.5808697982329277
▶ Best hold-out AUC: 0.9032


In [4]:
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from catboost import CatBoostClassifier

# pull in the best params & fixed iteration count
best = study.best_trial.params
ROUNDS = 300  # same as your Cell 1 ROUNDS
cb_params = {
    'learning_rate':    best['learning_rate'],
    'depth':            best['depth'],
    'l2_leaf_reg':      best['l2_leaf_reg'],
    'min_data_in_leaf': best['min_data_in_leaf'],
    'border_count':     best['border_count'],
    'random_strength':  best['random_strength'],
    'subsample':        best['subsample'],
    'rsm':              best['rsm'],
    'bootstrap_type':   'Bernoulli',
    'objective':        'Logloss',
    'eval_metric':      'AUC',
    'task_type':        'CPU',
    'verbose':          False,
    'random_seed':      SEED
}

kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)
fold_aucs = []

for fold, (tr_idx, va_idx) in enumerate(kf.split(train_features, y), 1):
    X_tr, X_va = train_features.iloc[tr_idx], train_features.iloc[va_idx]
    y_tr, y_va = y.iloc[tr_idx], y.iloc[va_idx]

    # majority/minority in this fold
    arr = y_tr.values
    maj_idx = np.where(arr==0)[0]
    rar_idx = np.where(arr==1)[0]
    splits = np.array_split(np.random.RandomState(SEED).permutation(maj_idx), N_MODELS)

    # ensemble for this fold
    preds = np.zeros(len(X_va))
    for grp in splits:
        idx = np.sort(np.concatenate([rar_idx, grp]))
        m = CatBoostClassifier(**cb_params, iterations=ROUNDS)
        m.fit(X_tr.iloc[idx], y_tr.iloc[idx])
        preds += m.predict_proba(X_va)[:,1]
    preds /= N_MODELS

    auc = roc_auc_score(y_va, preds)
    fold_aucs.append(auc)
    print(f"Fold {fold} AUC: {auc:.4f}")

print(f"\nMean CV AUC: {np.mean(fold_aucs):.4f} ± {np.std(fold_aucs):.4f}")


Fold 1 AUC: 0.8901
Fold 2 AUC: 0.8990
Fold 3 AUC: 0.8895
Fold 4 AUC: 0.8855
Fold 5 AUC: 0.8890

Mean CV AUC: 0.8906 ± 0.0045


In [5]:

import numpy as np
import joblib
from catboost import CatBoostClassifier

# pull best params & rounds
best = study.best_trial.params

cb_params = {
    'learning_rate':    best['learning_rate'],
    'depth':            best['depth'],
    'l2_leaf_reg':      best['l2_leaf_reg'],
    'min_data_in_leaf': best['min_data_in_leaf'],
    'border_count':     best['border_count'],
    'random_strength':  best['random_strength'],
    'subsample':        best['subsample'],
    'rsm':              best['rsm'],
    'bootstrap_type':   'Bernoulli',
    'objective':        'Logloss',
    'eval_metric':      'AUC',
    'task_type':        'CPU',
    'verbose':          False,
    'random_seed':      SEED
}

# prepare splits on full train
arr = y.values
maj_idx = np.where(arr==0)[0]
rar_idx = np.where(arr==1)[0]
splits = np.array_split(np.random.RandomState(SEED).permutation(maj_idx), N_MODELS)

ensemble = []
for i, grp in enumerate(splits, 1):
    idx = np.sort(np.concatenate([rar_idx, grp]))
    X_sub, y_sub = train_features.iloc[idx], y.iloc[idx]
    print(f"Ensembling member {i}: majority={len(grp)}, minority={len(rar_idx)}")
    m = CatBoostClassifier(**cb_params, iterations=ROUNDS)
    m.fit(X_sub, y_sub)
    ensemble.append(m)

# predict test
test_preds = np.mean([m.predict_proba(test_features)[:,1] for m in ensemble], axis=0)

# write submission
sub = submission_format.copy()
sub['coppaRisk'] = test_preds
sub.to_csv('catboost_majority_ensemble.csv', index=False)
print(" Written catboost_majority_ensemble.csv")

# save models
joblib.dump(ensemble, 'catboost_majority_ensemble.pkl')
print(" Saved catboost_majority_ensemble.pkl")


Ensembling member 1: majority=701, minority=696
Ensembling member 2: majority=701, minority=696
Ensembling member 3: majority=701, minority=696
Ensembling member 4: majority=701, minority=696
Ensembling member 5: majority=700, minority=696
Ensembling member 6: majority=700, minority=696
Ensembling member 7: majority=700, minority=696
Ensembling member 8: majority=700, minority=696
Ensembling member 9: majority=700, minority=696
 Written catboost_majority_ensemble.csv
 Saved catboost_majority_ensemble.pkl


In [6]:
best

{'learning_rate': 0.03688688094534395,
 'depth': 7,
 'l2_leaf_reg': 0.06202998934307267,
 'min_data_in_leaf': 93,
 'border_count': 141,
 'random_strength': 0.8195253675619897,
 'subsample': 0.46528052286516963,
 'rsm': 0.5808697982329277}

{'learning_rate': 0.06834041243586388,
 'depth': 7,
 'l2_leaf_reg': 9.363606651596735,
 'min_data_in_leaf': 43,
 'border_count': 148,
 'random_strength': 0.9514868218379223,
 'subsample': 0.6011198068621887,
 'rsm': 0.718895532911034}

In [7]:
best

{'learning_rate': 0.03688688094534395,
 'depth': 7,
 'l2_leaf_reg': 0.06202998934307267,
 'min_data_in_leaf': 93,
 'border_count': 141,
 'random_strength': 0.8195253675619897,
 'subsample': 0.46528052286516963,
 'rsm': 0.5808697982329277}

{'learning_rate': 0.06834041243586388,
 'depth': 7,
 'l2_leaf_reg': 9.363606651596735,
 'min_data_in_leaf': 43,
 'border_count': 148,
 'random_strength': 0.9514868218379223,
 'subsample': 0.6011198068621887,
 'rsm': 0.718895532911034}

# Lightgbm

In [8]:
# ┌───────────────────────────────────────────────────────────────┐
# Cell B1: Tune LightGBM via Optuna + majority/minority hold-out
# └───────────────────────────────────────────────────────────────┘
import numpy as np
import optuna
import lightgbm as lgb
from sklearn.metrics import roc_auc_score

N_MODELS = 9
SEED     = 42
ROUNDS   = 300

def objective_lgb(trial):
    params = {
        'learning_rate':      trial.suggest_loguniform('learning_rate', 1e-4, 1e-1),
        'num_leaves':         trial.suggest_int('num_leaves', 20, 300),
        'max_depth':          trial.suggest_int('max_depth', 3, 15),
        'feature_fraction':   trial.suggest_float('feature_fraction', 0.4, 1.0),
        'bagging_fraction':   trial.suggest_float('bagging_fraction', 0.4, 1.0),
        'bagging_freq':       trial.suggest_int('bagging_freq', 1, 10),
        'reg_alpha':          trial.suggest_loguniform('reg_alpha', 1e-8, 10.0),
        'reg_lambda':         trial.suggest_loguniform('reg_lambda', 1e-8, 10.0),
        'min_child_samples':  trial.suggest_int('min_child_samples', 5, 200),
        'objective':          'binary',
        'metric':             'auc',
        'verbosity':          -1,
        'boosting_type':      'gbdt',
        'seed':               SEED
    }

    preds = np.zeros(len(X_val), dtype=float)
    arr = y_train.values
    maj_idx = np.where(arr == 0)[0]
    rar_idx = np.where(arr == 1)[0]
    rng = np.random.RandomState(SEED)
    splits = np.array_split(rng.permutation(maj_idx), N_MODELS)

    for grp in splits:
        idx = np.sort(np.concatenate([rar_idx, grp]))
        X_sub, y_sub = X_train.iloc[idx], y_train.iloc[idx]
        dsub = lgb.Dataset(X_sub, label=y_sub)
        m = lgb.train(params, dsub, num_boost_round=ROUNDS)
        preds += m.predict(X_val)

    preds /= N_MODELS
    return roc_auc_score(y_val, preds)

study_lgb = optuna.create_study(direction='maximize')
study_lgb.optimize(objective_lgb, n_trials=200, timeout=3600)

best_lgb = study_lgb.best_trial.params
print("▶ Best LGB params:", best_lgb)
print(f"▶ Hold-out AUC: {study_lgb.best_value:.4f}")


[I 2025-04-28 08:40:44,933] A new study created in memory with name: no-name-0136c882-a6e5-4d5d-a775-ea4a22ff1599
  'learning_rate':      trial.suggest_loguniform('learning_rate', 1e-4, 1e-1),
  'reg_alpha':          trial.suggest_loguniform('reg_alpha', 1e-8, 10.0),
  'reg_lambda':         trial.suggest_loguniform('reg_lambda', 1e-8, 10.0),
[I 2025-04-28 08:40:45,461] Trial 0 finished with value: 0.8014479772248815 and parameters: {'learning_rate': 0.05484028608626748, 'num_leaves': 244, 'max_depth': 13, 'feature_fraction': 0.8262147168545062, 'bagging_fraction': 0.7199385928488657, 'bagging_freq': 5, 'reg_alpha': 7.458714146483146e-07, 'reg_lambda': 2.5086344632339282e-08, 'min_child_samples': 159}. Best is trial 0 with value: 0.8014479772248815.
  'learning_rate':      trial.suggest_loguniform('learning_rate', 1e-4, 1e-1),
  'reg_alpha':          trial.suggest_loguniform('reg_alpha', 1e-8, 10.0),
  'reg_lambda':         trial.suggest_loguniform('reg_lambda', 1e-8, 10.0),
[I 2025-04-

▶ Best LGB params: {'learning_rate': 0.007505906858196698, 'num_leaves': 177, 'max_depth': 10, 'feature_fraction': 0.6009712042224474, 'bagging_fraction': 0.764757532881105, 'bagging_freq': 6, 'reg_alpha': 3.783155365098816e-08, 'reg_lambda': 0.0034876075212315833, 'min_child_samples': 5}
▶ Hold-out AUC: 0.9048


In [9]:
# ┌───────────────────────────────────────────────────────────────┐
# Cell B2: 5-Fold Stratified CV of LightGBM majority/minority
# └───────────────────────────────────────────────────────────────┘
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
import lightgbm as lgb

ROUNDS = 300
lgb_params = dict(best_lgb)
lgb_params.update({
    'objective':'binary','metric':'auc',
    'verbosity':-1,'boosting_type':'gbdt','seed':SEED
})

kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)
cv_aucs = []
for fold, (tr, va) in enumerate(kf.split(train_features, y), 1):
    preds = np.zeros(len(va), dtype=float)
    X_tr, X_va = train_features.iloc[tr], train_features.iloc[va]
    y_tr, y_va = y.iloc[tr], y.iloc[va]

    arr = y_tr.values
    maj_idx = np.where(arr == 0)[0]
    rar_idx = np.where(arr == 1)[0]
    splits = np.array_split(np.random.RandomState(SEED).permutation(maj_idx), N_MODELS)

    for grp in splits:
        idx = np.sort(np.concatenate([rar_idx, grp]))
        m = lgb.train(lgb_params, lgb.Dataset(X_tr.iloc[idx], label=y_tr.iloc[idx]), num_boost_round=ROUNDS)
        preds += m.predict(X_va)

    preds /= N_MODELS
    auc = roc_auc_score(y_va, preds)
    cv_aucs.append(auc)
    print(f"[LGB CV] Fold {fold} AUC: {auc:.4f}")

print(f"[LGB CV] Mean AUC: {np.mean(cv_aucs):.4f} ± {np.std(cv_aucs):.4f}")


[LGB CV] Fold 1 AUC: 0.8877
[LGB CV] Fold 2 AUC: 0.9060
[LGB CV] Fold 3 AUC: 0.8966
[LGB CV] Fold 4 AUC: 0.8831
[LGB CV] Fold 5 AUC: 0.8860
[LGB CV] Mean AUC: 0.8919 ± 0.0084


In [10]:
# ┌───────────────────────────────────────────────────────────────┐
# Cell B3: Retrain full LightGBM ensemble on ALL data & submit
# └───────────────────────────────────────────────────────────────┘
import numpy as np
import joblib
import lightgbm as lgb

ROUNDS = 300
lgb_models = []
arr = y.values
maj_idx = np.where(arr == 0)[0]
rar_idx = np.where(arr == 1)[0]
splits = np.array_split(np.random.RandomState(SEED).permutation(maj_idx), N_MODELS)

for grp in splits:
    idx = np.sort(np.concatenate([rar_idx, grp]))
    m = lgb.train(lgb_params, lgb.Dataset(train_features.iloc[idx], label=y.iloc[idx]), num_boost_round=ROUNDS)
    lgb_models.append(m)

test_preds = np.mean([m.predict(test_features) for m in lgb_models], axis=0)
sub = submission_format.copy()
sub['coppaRisk'] = test_preds
sub.to_csv('lgb_majority_ensemble.csv', index=False)
print("✅ lgb_majority_ensemble.csv saved")
joblib.dump(lgb_models, 'lgb_majority_ensemble.pkl')


✅ lgb_majority_ensemble.csv saved


['lgb_majority_ensemble.pkl']

# Xgboost

In [11]:
# ┌───────────────────────────────────────────────────────────────┐
# Cell A1: Tune XGBoost via Optuna + majority/minority hold-out
# └───────────────────────────────────────────────────────────────┘
import numpy as np
import optuna
import xgboost as xgb
from sklearn.metrics import roc_auc_score

N_MODELS = 9
SEED     = 42
ROUNDS   = 300  # n_estimators

def objective_xgb(trial):
    params = {
        'learning_rate':      trial.suggest_loguniform('learning_rate', 1e-3, 1e-1),
        'max_depth':          trial.suggest_int('max_depth', 3, 12),
        'min_child_weight':   trial.suggest_int('min_child_weight', 1, 10),
        'gamma':              trial.suggest_float('gamma', 0.0, 2.0),
        'subsample':          trial.suggest_float('subsample', 0.4, 1.0),
        'colsample_bytree':   trial.suggest_float('colsample_bytree', 0.4, 1.0),
        'reg_alpha':          trial.suggest_loguniform('reg_alpha', 1e-8, 10.0),
        'reg_lambda':         trial.suggest_loguniform('reg_lambda', 1e-8, 10.0),
        'objective':          'binary:logistic',
        'eval_metric':        'auc',
        'use_label_encoder':  False,
        'verbosity':          0,
        'seed':               SEED,
        'tree_method':        'hist'
    }

    # majority/minority hold-out on X_train→X_val
    preds = np.zeros(len(X_val), dtype=float)
    arr = y_train.values
    maj_idx = np.where(arr == 0)[0]
    rar_idx = np.where(arr == 1)[0]
    rng = np.random.RandomState(SEED)
    splits = np.array_split(rng.permutation(maj_idx), N_MODELS)

    for grp in splits:
        idx = np.sort(np.concatenate([rar_idx, grp]))
        X_sub, y_sub = X_train.iloc[idx], y_train.iloc[idx]
        model = xgb.XGBClassifier(**params, n_estimators=ROUNDS)
        model.fit(X_sub, y_sub)
        preds += model.predict_proba(X_val)[:,1]

    preds /= N_MODELS
    return roc_auc_score(y_val, preds)

study_xgb = optuna.create_study(direction='maximize')
study_xgb.optimize(objective_xgb, n_trials=200, timeout=3600)

best_xgb = study_xgb.best_trial.params
print("▶ Best XGB params:", best_xgb)
print(f"▶ Hold-out AUC: {study_xgb.best_value:.4f}")


[I 2025-04-28 08:48:16,788] A new study created in memory with name: no-name-004a48e4-caa6-49d5-aac7-56ee779795db
  'learning_rate':      trial.suggest_loguniform('learning_rate', 1e-3, 1e-1),
  'reg_alpha':          trial.suggest_loguniform('reg_alpha', 1e-8, 10.0),
  'reg_lambda':         trial.suggest_loguniform('reg_lambda', 1e-8, 10.0),
[I 2025-04-28 08:48:21,365] Trial 0 finished with value: 0.891087922683265 and parameters: {'learning_rate': 0.08485771845062871, 'max_depth': 4, 'min_child_weight': 6, 'gamma': 1.137573522662165, 'subsample': 0.6340960965264234, 'colsample_bytree': 0.6198424359241197, 'reg_alpha': 0.10474740926422153, 'reg_lambda': 3.0548537071359494e-08}. Best is trial 0 with value: 0.891087922683265.
  'learning_rate':      trial.suggest_loguniform('learning_rate', 1e-3, 1e-1),
  'reg_alpha':          trial.suggest_loguniform('reg_alpha', 1e-8, 10.0),
  'reg_lambda':         trial.suggest_loguniform('reg_lambda', 1e-8, 10.0),
[I 2025-04-28 08:48:26,276] Trial 1 

▶ Best XGB params: {'learning_rate': 0.08881182046802887, 'max_depth': 11, 'min_child_weight': 1, 'gamma': 1.5132941267619924, 'subsample': 0.9809952824406312, 'colsample_bytree': 0.4242910594890692, 'reg_alpha': 0.0405420760888111, 'reg_lambda': 3.6925176750154237e-06}
▶ Hold-out AUC: 0.9063


In [12]:
# ┌───────────────────────────────────────────────────────────────┐
# Cell A2: 5-Fold Stratified CV of XGBoost majority/minority
# └───────────────────────────────────────────────────────────────┘
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
import xgboost as xgb

ROUNDS = 300
xgb_params = dict(best_xgb)  # copy best_xgb
xgb_params.update({
    'objective':'binary:logistic','eval_metric':'auc',
    'use_label_encoder':False,'verbosity':0,'seed':SEED,'tree_method':'hist'
})

kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)
cv_aucs = []

for fold, (tr, va) in enumerate(kf.split(train_features, y), 1):
    preds = np.zeros(len(va), dtype=float)
    X_tr, X_va = train_features.iloc[tr], train_features.iloc[va]
    y_tr, y_va = y.iloc[tr], y.iloc[va]

    arr = y_tr.values
    maj_idx = np.where(arr == 0)[0]
    rar_idx = np.where(arr == 1)[0]
    splits = np.array_split(np.random.RandomState(SEED).permutation(maj_idx), N_MODELS)

    for grp in splits:
        idx = np.sort(np.concatenate([rar_idx, grp]))
        m = xgb.XGBClassifier(**xgb_params, n_estimators=ROUNDS)
        m.fit(X_tr.iloc[idx], y_tr.iloc[idx])
        preds += m.predict_proba(X_va)[:,1]

    preds /= N_MODELS
    auc = roc_auc_score(y_va, preds)
    cv_aucs.append(auc)
    print(f"[XGB CV] Fold {fold} AUC: {auc:.4f}")

print(f"[XGB CV] Mean AUC: {np.mean(cv_aucs):.4f} ± {np.std(cv_aucs):.4f}")


[XGB CV] Fold 1 AUC: 0.8865
[XGB CV] Fold 2 AUC: 0.9066
[XGB CV] Fold 3 AUC: 0.8958
[XGB CV] Fold 4 AUC: 0.8877
[XGB CV] Fold 5 AUC: 0.8839
[XGB CV] Mean AUC: 0.8921 ± 0.0083


In [13]:
# ┌───────────────────────────────────────────────────────────────┐
# Cell A3: Retrain full XGBoost ensemble on ALL data & submit
# └───────────────────────────────────────────────────────────────┘
import numpy as np
import joblib
import xgboost as xgb

# rebuild XGBoost ensemble on full train_features+y
xgb_models = []
arr = y.values
maj_idx = np.where(arr == 0)[0]
rar_idx = np.where(arr == 1)[0]
splits = np.array_split(np.random.RandomState(SEED).permutation(maj_idx), N_MODELS)

for grp in splits:
    idx = np.sort(np.concatenate([rar_idx, grp]))
    m = xgb.XGBClassifier(**xgb_params, n_estimators=ROUNDS)
    m.fit(train_features.iloc[idx], y.iloc[idx])
    xgb_models.append(m)

# predict test
test_preds = np.mean([m.predict_proba(test_features)[:,1] for m in xgb_models], axis=0)
sub = submission_format.copy()
sub['coppaRisk'] = test_preds
sub.to_csv('xgb_majority_ensemble.csv', index=False)
print("✅ xgb_majority_ensemble.csv saved")
joblib.dump(xgb_models, 'xgb_majority_ensemble.pkl')


✅ xgb_majority_ensemble.csv saved


['xgb_majority_ensemble.pkl']

# Cat + Lgb + Xgb

In [14]:
# ┌───────────────────────────────────────────────────────────────┐
# Cell C: Blend all three 9-model ensembles & CV + Submission
# └───────────────────────────────────────────────────────────────┘
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
import joblib
import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostClassifier

# load your 9-model ensembles
cb_models  = joblib.load('catboost_majority_ensemble.pkl')
xgb_models = joblib.load('xgb_majority_ensemble.pkl')
lgb_models = joblib.load('lgb_majority_ensemble.pkl')
ALL_MODELS = cb_models + xgb_models + lgb_models

# 1) 5-fold Stratified CV on train_features,y
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)
cv_aucs = []
for fold, (tr, va) in enumerate(kf.split(train_features, y), 1):
    preds = np.zeros(len(va), dtype=float)
    X_va = train_features.iloc[va]
    y_va = y.iloc[va]
    for m in ALL_MODELS:
        if isinstance(m, lgb.Booster):
            preds += m.predict(X_va)
        else:
            preds += m.predict_proba(X_va)[:,1]
    preds /= len(ALL_MODELS)
    auc = roc_auc_score(y_va, preds)
    cv_aucs.append(auc)
    print(f"[Blend CV] Fold {fold} AUC: {auc:.4f}")

print(f"[Blend CV] Mean AUC: {np.mean(cv_aucs):.4f} ± {np.std(cv_aucs):.4f}")

# 2) Full test prediction & submission
test_preds = np.zeros(len(test_features), dtype=float)
for m in ALL_MODELS:
    if isinstance(m, lgb.Booster):
        test_preds += m.predict(test_features)
    else:
        test_preds += m.predict_proba(test_features)[:,1]
test_preds /= len(ALL_MODELS)

sub = submission_format.copy()
sub['coppaRisk'] = test_preds
sub.to_csv('full_27_model_ensemble.csv', index=False)
print("✅ full_27_model_ensemble.csv saved")


[Blend CV] Fold 1 AUC: 0.9524
[Blend CV] Fold 2 AUC: 0.9610
[Blend CV] Fold 3 AUC: 0.9556
[Blend CV] Fold 4 AUC: 0.9496
[Blend CV] Fold 5 AUC: 0.9486
[Blend CV] Mean AUC: 0.9534 ± 0.0045
✅ full_27_model_ensemble.csv saved
