In [1]:
!pip install catboost

import warnings

import joblib
import numpy as np
import pandas as pd
from catboost import CatBoostClassifier
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import GroupKFold, cross_val_score, cross_validate
from sklearn.preprocessing import StandardScaler

warnings.filterwarnings("ignore")

# Step 1: Read the dataset from disk.
df = pd.read_csv('water_potability_final.csv')

# Step 2: Split off the label column; every remaining column is a feature.
y = df['Potability']
X = df.drop(columns='Potability')

# Step 3: Standardise the features with a scaler fitted on all rows, then
# re-wrap the resulting array in a DataFrame so the column names survive.
scaler = StandardScaler()
X_scaled = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)

# Persist the fitted scaler so the same transform can be re-applied at inference.
joblib.dump(scaler, "scaler_catboost.pkl")
print(" Scaler fitted and saved successfully as 'scaler_catboost.pkl'")

# Artificial CV groups: row i is assigned to group (i mod 5).
groups = np.mod(np.arange(X_scaled.shape[0]), 5)

#  Step 4: Configure the CatBoost classifier.
# The hyper-parameters are collected in one mapping and unpacked into the
# constructor; the original note describes them as the best Optuna params.
cat_params = {
    "iterations": 733,
    "depth": 9,
    "learning_rate": 0.06430747110839902,
    "l2_leaf_reg": 3.564319988017288,
    "bagging_temperature": 0.005960502387030066,
    "border_count": 166,
    "random_strength": 1.173166172791412,
    # Per-class weights; presumably ordered as (class 0, class 1) — TODO confirm.
    "class_weights": [1, 1.5],
    "loss_function": 'Logloss',
    "eval_metric": 'Accuracy',
    "verbose": 0,
    "random_seed": 42,
}
cat_best = CatBoostClassifier(**cat_params)

# Step 5: Cross-validation
# Each metric is wrapped with make_scorer, so it is evaluated on the class
# labels predicted for every held-out fold.
scoring = {
    'accuracy': make_scorer(accuracy_score),
    'precision': make_scorer(precision_score),
    'recall': make_scorer(recall_score),
    'f1': make_scorer(f1_score)
}

# Five folds over the artificial `groups` labels defined earlier in this cell.
gkf = GroupKFold(n_splits=5)

# NOTE(review): X_scaled comes from a scaler fitted on ALL rows before the
# split, so the held-out folds contributed to the scaling statistics.
# Confirm this is acceptable for the reported scores.

# Score every metric in a single pass. cross_validate fits the model once per
# fold and applies all scorers to that fit; calling cross_val_score once per
# metric refitted the same model for each of the four metrics on every fold.
cv_scores = cross_validate(
    cat_best, X_scaled, y, groups=groups, cv=gkf, scoring=scoring
)

# Mean of the per-fold test scores, keyed by the metric names used in `scoring`.
cv_results = {metric: cv_scores[f"test_{metric}"].mean() for metric in scoring}

results = pd.DataFrame([cv_results])
print("\n Cross-validation performance (Scaled + Balanced):")
print(results)

# Step 6: Fit the final model on every scaled row (no hold-out here).
cat_best.fit(X=X_scaled, y=y)

# Step 7: Write the fitted model to disk and report where it went.
model_file = "water_potability_catboost_scaled_model.cbm"
cat_best.save_model(model_file)
print("\n Model retrained and saved as 'water_potability_catboost_scaled_model.cbm'")

Collecting catboost
  Downloading catboost-1.2.8-cp312-cp312-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.8-cp312-cp312-manylinux2014_x86_64.whl (99.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 MB[0m [31m8.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.8
 Scaler fitted and saved successfully as 'scaler_catboost.pkl'

 Cross-validation performance (Scaled + Balanced):
   accuracy  precision    recall        f1
0  0.824575    0.79606  0.873378  0.832796

 Model retrained and saved as 'water_potability_catboost_scaled_model.cbm'
