In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/playground-series-s5e12/sample_submission.csv
/kaggle/input/playground-series-s5e12/train.csv
/kaggle/input/playground-series-s5e12/test.csv


In [3]:
# ===============================================================
# GPU AutoML - XGBoost, LightGBM, CatBoost
# ===============================================================

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, mean_squared_error

# ML Models
from xgboost import XGBClassifier, XGBRegressor
from lightgbm import LGBMClassifier, LGBMRegressor
from catboost import CatBoostClassifier, CatBoostRegressor

# ------------------ Random seed ------------------
RANDOM_STATE = 42

# ------------------ User Inputs ------------------
train_path = "/kaggle/input/playground-series-s5e12/train.csv"
test_path  = "/kaggle/input/playground-series-s5e12/test.csv"
target_col = "diagnosed_diabetes"
task_type = "classification"  # 'regression' or 'classification'

# ------------------ Load data ------------------
train_df = pd.read_csv(train_path)
test_df  = pd.read_csv(test_path)

# ------------------ Data cleaning ------------------
def clean_data(df):
    df = df.copy()
    for col in df.select_dtypes(include=np.number).columns:
        df[col] = df[col].fillna(df[col].median())
    for col in df.select_dtypes(include='object').columns:
        df[col] = df[col].fillna(df[col].mode()[0])
        df[col] = LabelEncoder().fit_transform(df[col])
    return df

train_df = clean_data(train_df)
test_df  = clean_data(test_df)

# ------------------ Split features & target ------------------
X_train = train_df.drop(columns=[target_col])
y_train = train_df[target_col]
X_test  = test_df.drop(columns=[target_col]) if target_col in test_df.columns else test_df
y_test  = test_df[target_col] if target_col in test_df.columns else None

# ------------------ Split validation ------------------
X_train_sub, X_val, y_train_sub, y_val = train_test_split(
    X_train, y_train, test_size=0.2, random_state=RANDOM_STATE
)

print(f"Task Type: {task_type}, Train: {X_train_sub.shape}, Val: {X_val.shape}")


Task Type: classification, Train: (560000, 25), Val: (140000, 25)


In [4]:
# ------------------ Initialize GPU models ------------------
models = {}

if task_type == 'regression':
    models['XGBRegressor'] = XGBRegressor(booster='gbtree', tree_method='hist', eval_metric='rmse',
                                         verbosity=0, enable_categorical=True, random_state=RANDOM_STATE, device='cuda', n_jobs=-1)
    models['LGBMRegressor'] = LGBMRegressor(random_state=RANDOM_STATE, n_jobs=-1, device='gpu')
    models['CatBoostRegressor'] = CatBoostRegressor(random_state=RANDOM_STATE, verbose=0, task_type='GPU', bootstrap_type='Bernoulli')
else:
    models['XGBClassifier'] = XGBClassifier(booster='gbtree', tree_method='hist', eval_metric='auc',
                                           verbosity=0, enable_categorical=True, random_state=RANDOM_STATE,
                                           device='cuda', n_jobs=-1, early_stopping_rounds=100)
    models['LGBMClassifier'] = LGBMClassifier(random_state=RANDOM_STATE, n_jobs=-1, device='gpu')
    models['CatBoostClassifier'] = CatBoostClassifier(random_state=RANDOM_STATE, verbose=0, task_type='GPU', bootstrap_type='Bernoulli')

# ------------------ Hyperparameter grids ------------------
param_grids = {
    'XGBClassifier': {
        'n_estimators': [800, 1500, 3000, 4500],
        'learning_rate': [0.005, 0.01, 0.03],
        'max_depth': [3, 5, 7],
        'subsample': [0.7, 0.85, 0.95],
        'colsample_bytree': [0.4, 0.6, 0.8],
        'min_child_weight': [1, 5, 10],
        'gamma': [0.0, 0.01, 0.1],
        'reg_alpha': [0.0, 1e-4, 1e-2],
        'reg_lambda': [1e-3, 1e-1, 1.0]
    },
    'LGBMClassifier': {
        'n_estimators': [500, 1000, 2000, 5000],
        'learning_rate': [0.005, 0.01, 0.05],
        'num_leaves': [31, 63, 127],
        'max_depth': [-1, 6, 10],
        'subsample': [0.7, 0.85, 0.95],
        'colsample_bytree': [0.6, 0.8, 1.0]
    },
    'CatBoostClassifier': {
        'iterations': [800, 1500, 3000],
        'learning_rate': [0.01, 0.03, 0.1],
        'depth': [4, 6, 8],
        'l2_leaf_reg': [1, 3, 5, 9],
        'subsample': [0.7, 0.85, 1.0],
        'border_count': [64, 128, 254],
        'bootstrap_type': ['Bernoulli']  # Required for GPU
    }
}

# ------------------ Helper to sample hyperparameters ------------------
import random
def sample_params(model_name, param_grid):
    return {k: random.choice(v) for k, v in param_grid.get(model_name, {}).items()}


In [5]:
results = []

for name, base_model in models.items():
    print(f"\nTraining {name} on GPU...")

    # Sample hyperparameters
    sampled_params = sample_params(name, param_grids)

    # Rebuild CatBoost with sampled params (cannot set after fit)
    if name.startswith("CatBoost"):
        model = type(base_model)(**{**base_model.get_params(), **sampled_params})
    else:
        model = base_model
        if sampled_params:
            model.set_params(**sampled_params)

    # -------------------- Train --------------------
    if name.startswith("XGB"):
        model.fit(X_train_sub, y_train_sub, eval_set=[(X_val, y_val)], verbose=False)
        y_pred_train = model.predict(X_train_sub)
        y_pred_val = model.predict(X_val)

        hyperparams_display = {
            'n_estimators': model.best_iteration if hasattr(model, 'best_iteration') else model.n_estimators,
            'learning_rate': model.learning_rate,
            'max_depth': model.max_depth,
            'subsample': model.subsample,
            'colsample_bytree': model.colsample_bytree,
            'min_child_weight': model.min_child_weight,
            'gamma': model.gamma,
            'reg_alpha': model.reg_alpha,
            'reg_lambda': model.reg_lambda
        }

    elif name.startswith("LGBM"):
        model.fit(X_train_sub, y_train_sub)
        y_pred_train = model.predict(X_train_sub)
        y_pred_val = model.predict(X_val)

        hyperparams_display = {
            'n_estimators': model.n_estimators,
            'learning_rate': model.learning_rate,
            'num_leaves': model.num_leaves,
            'max_depth': model.max_depth,
            'subsample': model.subsample,
            'colsample_bytree': model.colsample_bytree
        }

    elif name.startswith("CatBoost"):
        model.fit(X_train_sub, y_train_sub, eval_set=(X_val, y_val), use_best_model=True, verbose=False)
        y_pred_train = model.predict(X_train_sub)
        y_pred_val = model.predict(X_val)

        params = model.get_params()
        hyperparams_display = {
            'iterations': model.tree_count_,
            'learning_rate': params.get('learning_rate'),
            'depth': params.get('depth'),
            'l2_leaf_reg': params.get('l2_leaf_reg'),
            'subsample': params.get('subsample'),
            'border_count': params.get('border_count'),
            'bootstrap_type': params.get('bootstrap_type'),
            'best_iteration': model.get_best_iteration()
        }

    # -------------------- Scoring --------------------
    if task_type == 'regression':
        train_score = mean_squared_error(y_train_sub, y_pred_train, squared=False)
        val_score = mean_squared_error(y_val, y_pred_val, squared=False)
        overfit = "Yes" if train_score < val_score else "No"
    else:
        train_score = accuracy_score(y_train_sub, y_pred_train)
        val_score = accuracy_score(y_val, y_pred_val)
        overfit = "Yes" if train_score > val_score else "No"

    results.append({
        'Model': name,
        'Train_Score': train_score,
        'Validation_Score': val_score,
        'Overfitting': overfit,
        'Used_Params': hyperparams_display
    })

# -------------------- Display --------------------
results_df = pd.DataFrame(results).sort_values('Validation_Score', ascending=False if task_type=='classification' else True)

for _, row in results_df.iterrows():
    print(f"\nüîπ {row['Model']}")
    print(f"Train Score: {row['Train_Score']:.4f}")
    print(f"Validation Score: {row['Validation_Score']:.4f}")
    print(f"Overfitting: {row['Overfitting']}")
    print("Hyperparameters Used:")
    for k, v in row['Used_Params'].items():
        print(f"  - {k}: {v}")

# -------------------- Best Model --------------------
best_row = results_df.iloc[0] if task_type=='classification' else results_df.iloc[-1]
print("\n" + "="*60)
print("üèÜ BEST MODEL OVERALL")
print("="*60)
print(f"Model Name       : {best_row['Model']}")
print(f"Train Score      : {best_row['Train_Score']:.6f}")
print(f"Validation Score : {best_row['Validation_Score']:.6f}")
print(f"Overfitting      : {best_row['Overfitting']}")
print("\nüîß Hyperparameters Used:")
for k, v in best_row['Used_Params'].items():
    print(f"{k}: {v}")



Training XGBClassifier on GPU...

Training LGBMClassifier on GPU...
[LightGBM] [Info] Number of positive: 348936, number of negative: 211064
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 1900
[LightGBM] [Info] Number of data points in the train set: 560000, number of used features: 25
[LightGBM] [Info] Using GPU Device: Tesla T4, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...




[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 20 dense feature groups (10.68 MB) transferred to GPU in 0.019400 secs. 1 sparse feature groups
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.623100 -> initscore=0.502727
[LightGBM] [Info] Start training from score 0.502727

Training CatBoostClassifier on GPU...

üîπ LGBMClassifier
Train Score: 0.7123
Validation Score: 0.6842
Overfitting: Yes
Hyperparameters Used:
  - n_estimators: 2000
  - learning_rate: 0.01
  - num_leaves: 127
  - max_depth: -1
  - subsample: 0.95
  - colsample_bytree: 1.0

üîπ XGBClassifier
Train Score: 0.6877
Validation Score: 0.6804
Overfitting: Yes
Hyperparameters Used:
  - n_estimators: 799
  - learning_rate: 0.01
  - max_depth: 7
  - subsample: 0.95
  - colsample_bytree: 0.6
  - min_child_weight: 1
  - gamma: 0.01
  - reg_alpha: 0.01
  - reg_lambda: 1.0

üîπ CatBoostClassifier
Train Score: 0.6795
Validation Score: 0.6757
Overfitting: Yes
Hyp

In [10]:
import pandas as pd

# -------------------- Flatten results --------------------
flattened_results = []
for r in results:
    flat_dict = r.copy()
    hyperparams = flat_dict.pop('Used_Params')
    for k, v in hyperparams.items():
        flat_dict[k] = v
    flattened_results.append(flat_dict)

# Create DataFrame
results_df = pd.DataFrame(flattened_results)

# Sort by Validation Score
results_df = results_df.sort_values(
    'Validation_Score', 
    ascending=False if task_type=='classification' else True
).reset_index(drop=True)

# Mark best model
best_index = 0 if task_type=='classification' else results_df['Validation_Score'].idxmin()
results_df['Best_Model üèÜ'] = ''
results_df.loc[best_index, 'Best_Model üèÜ'] = 'üèÜ'

# -------------------- Styling --------------------
def highlight_models(row):
    if row['Best_Model üèÜ'] == 'üèÜ':
        return ['background-color: #b6fcd5'] * len(row)  # Light green for best model
    elif row['Overfitting'] == 'Yes':
        return ['background-color: #fcb6b6'] * len(row)  # Light red for overfitting
    else:
        return [''] * len(row)  # No highlight

# Display styled DataFrame
pd.set_option('display.max_columns', None)
styled_df = results_df.style.apply(highlight_models, axis=1)
display(styled_df)


Unnamed: 0,Model,Train_Score,Validation_Score,Overfitting,n_estimators,learning_rate,max_depth,subsample,colsample_bytree,min_child_weight,gamma,reg_alpha,reg_lambda,num_leaves,iterations,depth,l2_leaf_reg,border_count,bootstrap_type,best_iteration,Best_Model üèÜ
0,LGBMClassifier,0.712311,0.684243,Yes,2000.0,0.01,-1.0,0.95,1.0,,,,,127.0,,,,,,,üèÜ
1,XGBClassifier,0.687661,0.680436,Yes,799.0,0.01,7.0,0.95,0.6,1.0,0.01,0.01,1.0,,,,,,,,
2,CatBoostClassifier,0.679461,0.675743,Yes,,0.1,,1.0,,,,,,,792.0,4.0,9.0,64.0,Bernoulli,791.0,


In [16]:
import pandas as pd
import numpy as np

# -------------------- Flatten results --------------------
flattened_results = []
for r in results:
    flat_dict = r.copy()
    hyperparams = flat_dict.pop('Used_Params')
    for k, v in hyperparams.items():
        flat_dict[k] = v
    flattened_results.append(flat_dict)

# Create DataFrame
results_df = pd.DataFrame(flattened_results)

# -------------------- Compute Gap (Train - Val) --------------------
results_df['Gap (Train - Val)'] = abs(results_df['Train_Score'] - results_df['Validation_Score']).round(4)

# -------------------- Add Star Ranking based on Gap --------------------
# Smaller gap = better (fewer stars), larger gap = more stars
n = len(results_df)
# Rank by Gap ascending (smallest = rank 1)
results_df['Gap_Rank'] = results_df['Gap (Train - Val)'].rank(method='min').astype(int)

# Function to convert rank to stars
def gap_to_stars(rank, n):
    if rank <= n/3:
        return '‚≠ê'           # least overfit
    elif rank <= 2*n/3:
        return '‚≠ê‚≠ê'
    else:
        return '‚≠ê‚≠ê‚≠ê'         # most overfit

results_df['Gap_Stars'] = results_df['Gap_Rank'].apply(lambda x: gap_to_stars(x, n))

# -------------------- Compute Overfitting Score --------------------
if task_type == 'classification':
    results_df['Overfit_Score'] = results_df['Train_Score'] - results_df['Validation_Score']
    results_df['Sort_Validation'] = results_df['Validation_Score']  # Higher is better
    results_df['Sort_Overfit'] = -results_df['Overfit_Score']
else:
    results_df['Overfit_Score'] = results_df['Validation_Score'] - results_df['Train_Score']
    results_df['Sort_Validation'] = -results_df['Validation_Score']  # Lower is better
    results_df['Sort_Overfit'] = results_df['Overfit_Score']

# -------------------- Rank Overfitting --------------------
results_df['Overfit_Rank'] = results_df['Overfit_Score'].rank(method='min', ascending=False).astype(int)

# -------------------- Combined Rank --------------------
results_df = results_df.sort_values(['Sort_Validation', 'Sort_Overfit'], ascending=True).reset_index(drop=True)
results_df['Combined_Rank'] = range(1, len(results_df)+1)

# -------------------- Mark Best Model üèÜ --------------------
best_index = 0 if task_type=='classification' else results_df['Validation_Score'].idxmax()
results_df['Best_Model üèÜ'] = ''
results_df.loc[best_index, 'Best_Model üèÜ'] = 'üèÜ'

# -------------------- Styling --------------------
def highlight_models(row):
    if row['Best_Model üèÜ'] == 'üèÜ':
        return ['background-color: #b6fcd5'] * len(row)  # Green for best
    elif row['Overfit_Score'] > 0:
        return ['background-color: #fcb6b6'] * len(row)  # Red for overfitting
    else:
        return [''] * len(row)

pd.set_option('display.max_columns', None)
styled_df = results_df.style.apply(highlight_models, axis=1)
display(styled_df)


Unnamed: 0,Model,Train_Score,Validation_Score,Overfitting,n_estimators,learning_rate,max_depth,subsample,colsample_bytree,min_child_weight,gamma,reg_alpha,reg_lambda,num_leaves,iterations,depth,l2_leaf_reg,border_count,bootstrap_type,best_iteration,Gap (Train - Val),Gap_Rank,Gap_Stars,Overfit_Score,Sort_Validation,Sort_Overfit,Overfit_Rank,Combined_Rank,Best_Model üèÜ
0,CatBoostClassifier,0.679461,0.675743,Yes,,0.1,,1.0,,,,,,,792.0,4.0,9.0,64.0,Bernoulli,791.0,0.0037,1,‚≠ê,0.003718,0.675743,-0.003718,3,1,üèÜ
1,XGBClassifier,0.687661,0.680436,Yes,799.0,0.01,7.0,0.95,0.6,1.0,0.01,0.01,1.0,,,,,,,,0.0072,2,‚≠ê‚≠ê,0.007225,0.680436,-0.007225,2,2,
2,LGBMClassifier,0.712311,0.684243,Yes,2000.0,0.01,-1.0,0.95,1.0,,,,,127.0,,,,,,,0.0281,3,‚≠ê‚≠ê‚≠ê,0.028068,0.684243,-0.028068,1,3,
