In [1]:
!pip install catboost

Collecting catboost
  Downloading catboost-1.2.8-cp312-cp312-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.8-cp312-cp312-manylinux2014_x86_64.whl (99.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 MB[0m [31m9.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.8


In [2]:
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from catboost import CatBoostClassifier

In [3]:
# Mount Google Drive and load the competition train/test tables.
from google.colab import drive
drive.mount('/content/drive')

# Single place to edit if the data files move.
DATA_DIR = "/content/drive/MyDrive/Kagglethon"

train = pd.read_csv(f"{DATA_DIR}/train.csv")
test  = pd.read_csv(f"{DATA_DIR}/test.csv")

TARGET = "Outage_Risk"
FEATURES = ["Feature_2", "Feature_3"]

# Fail early, with the offending names, if the files do not have the columns
# the rest of the notebook reads ("id" is needed when building the submission).
missing_in_train = [col for col in [*FEATURES, TARGET] if col not in train.columns]
missing_in_test = [col for col in ["id", *FEATURES] if col not in test.columns]
if missing_in_train or missing_in_test:
    raise KeyError(
        f"Missing columns - train: {missing_in_train}, test: {missing_in_test}"
    )

# Model inputs (still containing any missing values) and the binary target.
X_train_raw = train[FEATURES]
y = train[TARGET]
X_test_raw = test[FEATURES]

print(f"Train shape: {train.shape}")
print(f"Test shape: {test.shape}")
print(f"Available columns: {train.columns.tolist()}")

Mounted at /content/drive
Train shape: (7500, 8)
Test shape: (2500, 7)
Available columns: ['id', 'Feature_1', 'Feature_2', 'Feature_3', 'Feature_4', 'Feature_5', 'Feature_6', 'Outage_Risk']


In [4]:
# Median imputation: the fill values are learned from the training features
# only and then applied unchanged to both train and test.
# Note: the fit uses every training row, i.e. it happens before any CV split.
imputer = SimpleImputer(strategy="median").fit(X_train_raw)
X_train = imputer.transform(X_train_raw)
X_test = imputer.transform(X_test_raw)

In [5]:

# Seed-averaged, stratified K-fold training of a CatBoost classifier.
#
# For every seed a model is trained on each fold; its validation predictions
# fill that seed's out-of-fold (OOF) vector and its test predictions are
# averaged over folds. OOF and test vectors are then averaged over seeds into
# `oof_preds` and `test_preds`.
#
# Evaluation caveats:
#   * The validation fold is also the early-stopping eval_set, so the stopping
#     point is chosen on the same rows the fold AUC is reported on; the
#     per-fold AUCs are therefore optimistic.
#   * The per-seed CV AUC pools predictions produced by different fold models
#     into a single ranking, so it need not equal the mean of the fold AUCs.

N_FOLDS = 5
seeds = [42, 123, 456, 789, 2024]
# Derived from the list so the seed averages stay correctly scaled when seeds
# are added or removed.
N_SEEDS = len(seeds)

# Hyper-parameters shared by every fold/seed model; only random_seed varies.
CATBOOST_PARAMS = dict(
    iterations=1200,
    learning_rate=0.035,
    depth=6,
    l2_leaf_reg=3,
    # NOTE(review): CatBoost documents min_data_in_leaf as applying to the
    # Depthwise/Lossguide grow policies; with the default policy it may have
    # no effect - confirm.
    min_data_in_leaf=20,
    bagging_temperature=0.5,
    loss_function="Logloss",
    eval_metric="AUC",
    # One entry per feature column, in FEATURES order. In CatBoost's
    # convention 0 = unconstrained and 1 = non-decreasing.
    monotone_constraints=[0, 1],
    verbose=False,
)

oof_preds = np.zeros(len(train))
test_preds = np.zeros(len(test))

# The splitter is seeded once, so every model seed sees identical folds; only
# CatBoost's own randomness changes between repetitions.
skf = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=42)

for seed_idx, seed in enumerate(seeds):
    print(f"\n{'='*50}")
    print(f"SEED {seed_idx+1}/{N_SEEDS}: {seed}")
    print(f"{'='*50}")

    seed_oof = np.zeros(len(train))
    seed_test = np.zeros(len(test))

    for fold, (tr_idx, val_idx) in enumerate(skf.split(X_train, y)):
        X_tr, X_val = X_train[tr_idx], X_train[val_idx]
        y_tr, y_val = y.iloc[tr_idx], y.iloc[val_idx]

        model = CatBoostClassifier(**CATBOOST_PARAMS, random_seed=seed)

        # Stop once validation AUC has not improved for 100 rounds.
        model.fit(
            X_tr, y_tr,
            eval_set=(X_val, y_val),
            early_stopping_rounds=100,
            verbose=False
        )

        # Probability of the positive class.
        val_preds = model.predict_proba(X_val)[:, 1]
        test_fold_preds = model.predict_proba(X_test)[:, 1]

        # Each training row is written exactly once per seed (by the fold that
        # held it out); test predictions are averaged across the folds.
        seed_oof[val_idx] = val_preds
        seed_test += test_fold_preds / N_FOLDS

        fold_auc = roc_auc_score(y_val, val_preds)
        print(f"  Fold {fold+1} AUC: {fold_auc:.6f}")

    # Accumulate this seed's contribution to the seed-averaged ensemble.
    oof_preds += seed_oof / N_SEEDS
    test_preds += seed_test / N_SEEDS

    seed_auc = roc_auc_score(y, seed_oof)
    print(f"  → Seed {seed} CV AUC: {seed_auc:.6f}")


SEED 1/5: 42
  Fold 1 AUC: 0.704193
  Fold 2 AUC: 0.738080
  Fold 3 AUC: 0.698334
  Fold 4 AUC: 0.725934
  Fold 5 AUC: 0.727223
  → Seed 42 CV AUC: 0.679844

SEED 2/5: 123
  Fold 1 AUC: 0.707884
  Fold 2 AUC: 0.738923
  Fold 3 AUC: 0.698412
  Fold 4 AUC: 0.725072
  Fold 5 AUC: 0.722950
  → Seed 123 CV AUC: 0.672498

SEED 3/5: 456
  Fold 1 AUC: 0.707110
  Fold 2 AUC: 0.738472
  Fold 3 AUC: 0.699585
  Fold 4 AUC: 0.725060
  Fold 5 AUC: 0.720038
  → Seed 456 CV AUC: 0.670711

SEED 4/5: 789
  Fold 1 AUC: 0.706084
  Fold 2 AUC: 0.739431
  Fold 3 AUC: 0.698376
  Fold 4 AUC: 0.725549
  Fold 5 AUC: 0.719226
  → Seed 789 CV AUC: 0.702152

SEED 5/5: 2024
  Fold 1 AUC: 0.711874
  Fold 2 AUC: 0.738178
  Fold 3 AUC: 0.699496
  Fold 4 AUC: 0.725650
  Fold 5 AUC: 0.722478
  → Seed 2024 CV AUC: 0.679055


In [6]:
# =========================================
# FINAL CV SCORE
# =========================================
# AUC of the seed-averaged out-of-fold predictions against the training target.
final_cv_auc = roc_auc_score(y, oof_preds)

rule = "=" * 50
print(
    f"\n{rule}",
    f"FINAL CV AUC (Seed-Averaged): {final_cv_auc:.6f}",
    rule,
    sep="\n",
)


FINAL CV AUC (Seed-Averaged): 0.704625


In [7]:

# Refit a single model on all training rows and compare its in-sample AUC with
# the seed-averaged CV AUC as a rough over/under-fitting signal.
# This model has no eval_set, so it is fitted without the early stopping the
# CV models used; treat the gap as indicative only.
final_model = CatBoostClassifier(
    verbose=False,
    random_seed=42,
    monotone_constraints=[0, 1],
    bagging_temperature=0.5,
    min_data_in_leaf=20,
    l2_leaf_reg=3,
    depth=6,
    learning_rate=0.035,
    iterations=1200,
)
final_model.fit(X_train, y)

# In-sample probability of the positive class and the resulting AUC.
train_preds_check = final_model.predict_proba(X_train)[:, 1]
train_auc = roc_auc_score(y, train_preds_check)
auc_gap = train_auc - final_cv_auc

# Heuristic bands: gap above 0.10 -> overfitting, below 0.02 -> underfitting.
if auc_gap > 0.10:
    verdict = "  ⚠️  WARNING: Possible overfitting!"
elif auc_gap < 0.02:
    verdict = "  ⚠️  WARNING: Possible underfitting - consider adding features!"
else:
    verdict = "  ✅ Good generalization"

print(f"\nOverfitting Check:")
print(f"  Train AUC: {train_auc:.6f}")
print(f"  CV AUC:    {final_cv_auc:.6f}")
print(f"  Gap:       {auc_gap:.6f}")
print(verdict)


Overfitting Check:
  Train AUC: 0.748969
  CV AUC:    0.704625
  Gap:       0.044344
  ✅ Good generalization


In [8]:

# Write the seed- and fold-averaged test probabilities to a CSV with one row
# per test id.
SUBMISSION_PATH = "submission_optimized_monotonic.csv"

# Refuse to write a file built from a broken ensemble (NaN/inf or values that
# are not probabilities).
assert np.isfinite(test_preds).all(), "test_preds contains NaN or inf"
assert ((test_preds >= 0) & (test_preds <= 1)).all(), "test_preds outside [0, 1]"

submission = pd.DataFrame({
    "id": test["id"],
    "Outage_Risk": test_preds
})

submission.to_csv(SUBMISSION_PATH, index=False)
print(f"\n✅ Submission saved: {SUBMISSION_PATH}")
print(f"   Shape: {submission.shape}")
print(f"   Prediction range: [{test_preds.min():.4f}, {test_preds.max():.4f}]")


✅ Submission saved: submission_optimized_monotonic.csv
   Shape: (2500, 2)
   Prediction range: [0.2702, 0.6736]
