In [6]:
import numpy as np
from collections import Counter

# Models & utils
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, balanced_accuracy_score, classification_report, confusion_matrix
import joblib

# XGBoost (installed on your Mac)
from xgboost import XGBClassifier

# ---------------------------
# 1) Load data and select features
# ---------------------------
df = pd.read_csv("waterDataset.csv")

# Keep the six core features (you trained your latest model on these)
FEATURES = ["EC", "TDS", "Na", "TH", "Cl", "pH"]
assert all(c in df.columns for c in FEATURES), f"Missing expected columns. Found: {df.columns}"

# ---------------------------
# 2) Map 5 classes -> 3 classes (as requested)
#    Good bucket:      ["Excellent","Good"]
#    Poor bucket:      ["Very Poor yet Drinkable","Poor"]
#    Unsuitable bucket:["Unsuitable for Drinking"]
# ---------------------------
def map_to_3_classes(label: str) -> str:
    if label in ["Excellent", "Good"]:
        return "Good"
    elif label in ["Poor", "Very Poor yet Drinkable"]:
        return "Poor"
    elif label == "Unsuitable for Drinking":
        return "Unsuitable"
    else:
        return None  # unexpected

df["WQ_3C"] = df["Water Quality Classification"].map(map_to_3_classes)
df = df.dropna(subset=["WQ_3C"])

X = df[FEATURES].copy()
y = df["WQ_3C"].copy()

print("Rows:", len(df))
print("Class counts (3-class):")
print(y.value_counts())

# ---------------------------
# 3) Compute class weights (for imbalance)
# ---------------------------
class_counts = y.value_counts()
total = len(y)
class_weights = {cls: total/(len(class_counts)*cnt) for cls, cnt in class_counts.items()}
print("\nClass weights:", class_weights)

def per_sample_weights(y_series, cw):
    return y_series.map(cw).values

# ---------------------------
# 4) Define candidates
#    A) Logistic Regression (multinomial) with scaler + class_weight='balanced'
#    B) XGBoost with sample_weight per fold
# ---------------------------
logreg_pipe = Pipeline([
    ("scaler", StandardScaler()),
    ("clf", LogisticRegression(
        multi_class="multinomial",
        solver="lbfgs",
        max_iter=3000,
        class_weight="balanced",
        n_jobs=-1
    ))
])

xgb = XGBClassifier(
    objective="multi:softprob",
    num_class=3,
    n_estimators=600,
    max_depth=7,
    learning_rate=0.06,
    subsample=0.9,
    colsample_bytree=0.9,
    reg_lambda=1.0,
    random_state=42,
    tree_method="hist"
)

candidates = [
    ("LogReg", logreg_pipe, "linear"),
    ("XGBoost", xgb, "tree"),
]

# ---------------------------
# 5) Manual CV loop (so we can pass sample_weight cleanly)
# ---------------------------
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

def evaluate_model(name, model, X, y):
    f1s, bals = [], []
    all_cm = np.zeros((3,3), dtype=int)  # in order [Good, Poor, Unsuitable]
    order = ["Good", "Poor", "Unsuitable"]

    for fold, (tr, te) in enumerate(cv.split(X, y), start=1):
        X_tr, X_te = X.iloc[tr], X.iloc[te]
        y_tr, y_te = y.iloc[tr], y.iloc[te]

        if name == "XGBoost":
            # per-sample weights for training folds
            sw = per_sample_weights(y_tr, class_weights)
            model.fit(X_tr, y_tr, sample_weight=sw)
        else:
            model.fit(X_tr, y_tr)

        y_pred = model.predict(X_te)

        f1 = f1_score(y_te, y_pred, average="macro")
        ba = balanced_accuracy_score(y_te, y_pred)

        f1s.append(f1)
        bals.append(ba)

        # accumulate confusion matrix in a fixed label order
        cm = confusion_matrix(y_te, y_pred, labels=order)
        all_cm += cm

        print(f"[{name}] Fold {fold}: macro-F1={f1:.4f} | bal-acc={ba:.4f}")

    print(f"\n{name} | macro-F1: {np.mean(f1s):.4f} ± {np.std(f1s):.4f} | "
          f"bal-acc: {np.mean(bals):.4f} ± {np.std(bals):.4f}")
    print("\nConfusion matrix (summed over folds) in order [Good, Poor, Unsuitable]:")
    print(pd.DataFrame(all_cm, index=order, columns=order))
    return np.mean(f1s), np.mean(bals)

results = []
for name, model, kind in candidates:
    print("\n" + "="*60)
    print(f"Evaluating: {name}")
    f1m, bam = evaluate_model(name, model, X, y)
    results.append({"model": name, "kind": kind, "f1_macro_mean": f1m, "bal_acc_mean": bam})

res_df = pd.DataFrame(results).sort_values(by=["f1_macro_mean","bal_acc_mean"], ascending=False)
print("\n\nCV ranking:")
print(res_df)

# ---------------------------
# 6) Fit BEST model on FULL data & save
# ---------------------------
best_name = res_df.iloc[0]["model"]
print(f"\nBest model selected: {best_name}")

if best_name == "XGBoost":
    sw_full = per_sample_weights(y, class_weights)
    xgb.fit(X, y, sample_weight=sw_full)
    best_model = xgb
else:
    logreg_pipe.fit(X, y)
    best_model = logreg_pipe

out_file = "water_quality_rf_model_3class.pkl" if best_name=="XGBoost" else "water_quality_logreg_3class.pkl"
joblib.dump(best_model, out_file)
print(f"✅ Saved best 3-class model as: {out_file}")

# ---------------------------
# 7) Quick sanity check on FULL fit
# ---------------------------
y_pred_full = best_model.predict(X)
print("\nFull-data classification report (sanity check, not a test metric):")
print(classification_report(y, y_pred_full, digits=4))

Rows: 19029
Class counts (3-class):
WQ_3C
Poor          10026
Unsuitable     6608
Good           2395
Name: count, dtype: int64

Class weights: {'Poor': 0.632655096748454, 'Unsuitable': 0.9598970944309927, 'Good': 2.648434237995825}

Evaluating: LogReg




[LogReg] Fold 1: macro-F1=0.9691 | bal-acc=0.9840




[LogReg] Fold 2: macro-F1=0.9695 | bal-acc=0.9841




[LogReg] Fold 3: macro-F1=0.9708 | bal-acc=0.9826
[LogReg] Fold 4: macro-F1=0.9606 | bal-acc=0.9794
[LogReg] Fold 5: macro-F1=0.9633 | bal-acc=0.9795

LogReg | macro-F1: 0.9667 ± 0.0040 | bal-acc: 0.9819 ± 0.0021

Confusion matrix (summed over folds) in order [Good, Poor, Unsuitable]:
            Good  Poor  Unsuitable
Good        2387     8           0
Poor         307  9596         123
Unsuitable     0    53        6555

Evaluating: XGBoost




ValueError: Invalid classes inferred from unique values of `y`.  Expected: [0 1 2], got ['Good' 'Poor' 'Unsuitable']

In [8]:
# ============================================
# 03_model_selection.ipynb  (single fixed cell)
# ============================================
import pandas as pd
import numpy as np
from collections import Counter

from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, balanced_accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder
import joblib

from xgboost import XGBClassifier

# ---------------------------
# 1) Load data & select features
# ---------------------------
df = pd.read_csv("waterDataset.csv")
FEATURES = ["EC", "TDS", "Na", "TH", "Cl", "pH"]
assert all(c in df.columns for c in FEATURES), f"Missing columns. Found: {df.columns}"

# ---------------------------
# 2) Collapse 5 classes -> 3 classes (as discussed)
# ---------------------------
def map_to_3_classes(label: str) -> str:
    if label in ["Excellent", "Good"]:
        return "Good"
    elif label in ["Poor", "Very Poor yet Drinkable"]:
        return "Poor"
    elif label == "Unsuitable for Drinking":
        return "Unsuitable"
    else:
        return None

df["WQ_3C"] = df["Water Quality Classification"].map(map_to_3_classes)
df = df.dropna(subset=["WQ_3C"])

X = df[FEATURES].copy()
y = df["WQ_3C"].copy()

print("Rows:", len(df))
print("Class counts (3-class):")
print(y.value_counts())

# ---------------------------
# 3) Class weights for imbalance
# ---------------------------
class_counts = y.value_counts()
K = len(class_counts)
N = len(y)
class_weights = {cls: N/(K*cnt) for cls, cnt in class_counts.items()}
print("\nClass weights:", class_weights)

def per_sample_weights(y_series, cw):
    return y_series.map(cw).values

# ---------------------------
# 4) Models
# ---------------------------
logreg_pipe = Pipeline([
    ("scaler", StandardScaler()),
    ("clf", LogisticRegression(
        solver="lbfgs",
        max_iter=3000,
        class_weight="balanced",
        n_jobs=-1
    ))
])

# For XGB we will encode y -> integers [0,1,2]
le = LabelEncoder()
y_int = le.fit_transform(y)               # ['Good','Poor','Unsuitable'] -> e.g. [0,1,2]
label_order = list(le.classes_)           # keep string order for reporting

xgb = XGBClassifier(
    objective="multi:softprob",
    num_class=3,
    n_estimators=600,
    max_depth=7,
    learning_rate=0.06,
    subsample=0.9,
    colsample_bytree=0.9,
    reg_lambda=1.0,
    random_state=42,
    tree_method="hist"
)

candidates = [
    ("LogReg", logreg_pipe, "linear"),
    ("XGBoost", xgb, "tree"),
]

# ---------------------------
# 5) Manual CV (so we can pass weights to XGB)
# ---------------------------
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

def evaluate_logreg(model, X, y):
    f1s, bals = [], []
    order = ["Good", "Poor", "Unsuitable"]
    all_cm = np.zeros((3,3), dtype=int)

    for fold, (tr, te) in enumerate(cv.split(X, y), start=1):
        X_tr, X_te = X.iloc[tr], X.iloc[te]
        y_tr, y_te = y.iloc[tr], y.iloc[te]

        model.fit(X_tr, y_tr)
        y_pred = model.predict(X_te)

        f1 = f1_score(y_te, y_pred, average="macro")
        ba = balanced_accuracy_score(y_te, y_pred)
        f1s.append(f1); bals.append(ba)

        cm = confusion_matrix(y_te, y_pred, labels=order)
        all_cm += cm
        print(f"[LogReg] Fold {fold}: macro-F1={f1:.4f} | bal-acc={ba:.4f}")

    print(f"\nLogReg | macro-F1: {np.mean(f1s):.4f} ± {np.std(f1s):.4f} | "
          f"bal-acc: {np.mean(bals):.4f} ± {np.std(bals):.4f}")
    print("\nConfusion matrix (summed) [Good, Poor, Unsuitable]:")
    print(pd.DataFrame(all_cm, index=order, columns=order))
    return np.mean(f1s), np.mean(bals)

def evaluate_xgb(model, X, y_str, y_int):
    f1s, bals = [], []
    order = label_order  # ['Good','Poor','Unsuitable']
    all_cm = np.zeros((3,3), dtype=int)

    for fold, (tr, te) in enumerate(cv.split(X, y_int), start=1):
        X_tr, X_te = X.iloc[tr], X.iloc[te]
        y_tr_int, y_te_int = y_int[tr], y_int[te]
        y_tr_str, y_te_str = y_str.iloc[tr], y_str.iloc[te]

        sw = per_sample_weights(pd.Series(y_tr_str), class_weights)
        model.fit(X_tr, y_tr_int, sample_weight=sw)

        y_pred_int = model.predict(X_te)
        y_pred_str = le.inverse_transform(y_pred_int)   # back to strings

        f1 = f1_score(y_str.iloc[te], y_pred_str, average="macro")
        ba = balanced_accuracy_score(y_str.iloc[te], y_pred_str)
        f1s.append(f1); bals.append(ba)

        cm = confusion_matrix(y_str.iloc[te], y_pred_str, labels=order)
        all_cm += cm
        print(f"[XGB ] Fold {fold}: macro-F1={f1:.4f} | bal-acc={ba:.4f}")

    print(f"\nXGBoost | macro-F1: {np.mean(f1s):.4f} ± {np.std(f1s):.4f} | "
          f"bal-acc: {np.mean(bals):.4f} ± {np.std(bals):.4f}")
    print("\nConfusion matrix (summed) [Good, Poor, Unsuitable]:")
    print(pd.DataFrame(all_cm, index=order, columns=order))
    return np.mean(f1s), np.mean(bals)

# Run both
print("\n" + "="*60 + "\nEvaluating: LogReg")
log_f1, log_ba = evaluate_logreg(logreg_pipe, X, y)

print("\n" + "="*60 + "\nEvaluating: XGBoost")
xgb_f1, xgb_ba = evaluate_xgb(xgb, X, y, y_int)

res_df = pd.DataFrame([
    {"model": "LogReg",  "f1_macro_mean": log_f1, "bal_acc_mean": log_ba, "kind": "linear"},
    {"model": "XGBoost", "f1_macro_mean": xgb_f1, "bal_acc_mean": xgb_ba, "kind": "tree"},
]).sort_values(by=["f1_macro_mean","bal_acc_mean"], ascending=False)
print("\n\nCV ranking:")
print(res_df)

# ---------------------------
# 6) Fit BEST on FULL data & save
# ---------------------------
best_name = res_df.iloc[0]["model"]
print(f"\nBest model selected: {best_name}")

if best_name == "XGBoost":
    sw_full = per_sample_weights(y, class_weights)
    # need integer labels for fit, but weights from string y are fine
    xgb.fit(X, y_int, sample_weight=per_sample_weights(y, class_weights))
    best_model = xgb
    out_file = "water_quality_xgb_3class.pkl"
else:
    logreg_pipe.fit(X, y)
    best_model = logreg_pipe
    out_file = "water_quality_logreg_3class.pkl"

joblib.dump(best_model, out_file)
print(f"✅ Saved best 3-class model as: {out_file}")

# ---------------------------
# 7) Sanity check
# ---------------------------
if best_name == "XGBoost":
    y_pred_full = le.inverse_transform(best_model.predict(X))
else:
    y_pred_full = best_model.predict(X)

print("\nFull-data classification report (sanity check):")
print(classification_report(y, y_pred_full, digits=4))


Rows: 19029
Class counts (3-class):
WQ_3C
Poor          10026
Unsuitable     6608
Good           2395
Name: count, dtype: int64

Class weights: {'Poor': 0.632655096748454, 'Unsuitable': 0.9598970944309927, 'Good': 2.648434237995825}

Evaluating: LogReg
[LogReg] Fold 1: macro-F1=0.9691 | bal-acc=0.9840
[LogReg] Fold 2: macro-F1=0.9695 | bal-acc=0.9841
[LogReg] Fold 3: macro-F1=0.9708 | bal-acc=0.9826
[LogReg] Fold 4: macro-F1=0.9606 | bal-acc=0.9794
[LogReg] Fold 5: macro-F1=0.9633 | bal-acc=0.9795

LogReg | macro-F1: 0.9667 ± 0.0040 | bal-acc: 0.9819 ± 0.0021

Confusion matrix (summed) [Good, Poor, Unsuitable]:
            Good  Poor  Unsuitable
Good        2387     8           0
Poor         307  9596         123
Unsuitable     0    53        6555

Evaluating: XGBoost
[XGB ] Fold 1: macro-F1=0.9851 | bal-acc=0.9855
[XGB ] Fold 2: macro-F1=0.9839 | bal-acc=0.9874
[XGB ] Fold 3: macro-F1=0.9847 | bal-acc=0.9842
[XGB ] Fold 4: macro-F1=0.9835 | bal-acc=0.9847
[XGB ] Fold 5: macro-F1=0.98