In [3]:
import sys
!{sys.executable} -m pip install catboost
from catboost import CatBoostClassifier
import pandas as pd
import joblib
import numpy as np
from sklearn.metrics import precision_recall_curve, f1_score

# Step 1: Restore the persisted CatBoost model and the scaler saved with it.
# NOTE: joblib.load unpickles the file, which can execute arbitrary code;
# only load scaler files that come from a trusted source.
MODEL_PATH = "water_potability_catboost_scaled_model.cbm"
SCALER_PATH = "scaler_catboost.pkl"
DATA_PATH = "water_potability_final.csv"
TARGET_COLUMN = "Potability"

scaler = joblib.load(SCALER_PATH)
model = CatBoostClassifier()
model.load_model(MODEL_PATH)

# Step 2: Read the dataset and separate the feature columns from the label.
df = pd.read_csv(DATA_PATH)
y = df.loc[:, TARGET_COLUMN]
X = df.drop(columns=TARGET_COLUMN)

# Step 3: Run the features through the already-fitted scaler (no refit here).
X_scaled = scaler.transform(X)

#  Step 4: Find optimal F1 threshold
# NOTE(review): the threshold is tuned on the same data the model was trained
# on, so the reported precision/recall are optimistic. Tune it on a held-out
# validation split before relying on it.
y_prob = model.predict_proba(X_scaled)[:, 1]
prec, rec, thresholds = precision_recall_curve(y, y_prob)

# F1 is undefined (0/0) wherever precision and recall are both zero. Define it
# as 0 there so that np.argmax is not hijacked by a NaN entry.
pr_sum = prec + rec
f1_scores = np.divide(
    2 * prec * rec,
    pr_sum,
    out=np.zeros_like(pr_sum, dtype=float),
    where=pr_sum > 0,
)

# precision_recall_curve returns one more precision/recall value than
# thresholds: the final (precision=1, recall=0) point has no threshold.
# Exclude it from the search so best_idx is always a valid threshold index.
best_idx = np.argmax(f1_scores[:-1])
best_threshold = thresholds[best_idx]

print(f" Best threshold for max F1: {best_threshold:.3f}")
print(f"Precision: {prec[best_idx]:.3f}, Recall: {rec[best_idx]:.3f}")

#  Step 5: Define sample input
# NOTE(review): these values look already standardized (some are negative),
# yet they are passed through scaler.transform below. Confirm whether the
# scaler expects raw measurements here; if so, this sample is scaled twice.
sample = pd.DataFrame([{
    "ph": 1.88,
    "Hardness": 0.37,
    "Solids": 1.514,
    "Chloramines": 0.77,
    "Sulfate": 0.101,
    "Conductivity": 0.448,
    "Organic_carbon": -1.263,
    "Trihalomethanes": -0.984,
    "Turbidity": 0.711
}])

# Put the columns in the same order as the feature frame X that was scaled
# above, so each value reaches the scaler/model in the matching position.
# A missing feature raises KeyError here instead of being silently misread.
sample = sample[X.columns]
sample_scaled = scaler.transform(sample)

#  Step 6: Predict & decide
probs = model.predict_proba(sample_scaled)[0]
prob_safe = probs[1]

print(f"\n Predicted Probabilities → [Not Safe: {probs[0]:.4f}, Safe: {prob_safe:.4f}]")

# precision_recall_curve defines each threshold for "score >= threshold", so
# the comparison must be inclusive to match how best_threshold was chosen.
is_safe = prob_safe >= best_threshold

if is_safe:
    print("\n Water is SAFE for drinking ✅")
else:
    print("\n Water is NOT SAFE for drinking ❌")

# Report the probability of the class that was actually announced. With a
# custom threshold, max(probs) can belong to the opposite class.
decision_prob = prob_safe if is_safe else probs[0]
print(f"\n Prediction Confidence: {decision_prob*100:.2f}%")

 Best threshold for max F1: 0.876
Precision: 1.000, Recall: 1.000

 Predicted Probabilities → [Not Safe: 0.0223, Safe: 0.9777]

 Water is SAFE for drinking ✅

 Prediction Confidence: 97.77%


https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
