# AWARE — Random Forest (updated for `water_dataX.csv`)

This notebook:

1. Loads `/mnt/data/water_dataX.csv`
2. Cleans & renames columns
3. Creates `Risk` column using WHO/BIS-aligned thresholds
4. Trains a Random Forest classifier
5. Saves the trained model and imputer
6. Provides a `BLANK_INPUT` cell to make predictions on new data

Run cells top-to-bottom. If your CSV is elsewhere, change `DATA_PATH` in the first code cell.

In [78]:
# Cell: imports and settings
import pandas as pd
import numpy as np
import random
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline
import joblib
import os

# Reproducibility
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)
random.seed(RANDOM_STATE)

# --- DEFAULT PATHS ---
WINDOWS_DATA = Path(r"C:\Users\prana\Documents\PROJECTS\AWARE\extract\water_dataX.csv")
WINDOWS_MODEL = Path(r"C:\Users\prana\Documents\PROJECTS\AWARE\extract\rf_water_model.joblib")

MAC_DATA = Path("/Users/anishsharma/Developer/SE project/extract/water_dataX.csv")
MAC_MODEL = Path("/Users/anishsharma/Developer/SE project/extract/rf_water_model.joblib")

# Allow environment override
ENV_DATA = os.getenv("AWARE_DATA_PATH")
ENV_MODEL = os.getenv("AWARE_MODEL_OUTPUT")

# --- PICK DATA_PATH WITH TRY LOGIC ---
if ENV_DATA:
    DATA_PATH = Path(ENV_DATA)
elif WINDOWS_DATA.exists():
    DATA_PATH = WINDOWS_DATA
elif MAC_DATA.exists():
    DATA_PATH = MAC_DATA
else:
    raise FileNotFoundError(
        f"Could not find dataset.\nTried:\n  {WINDOWS_DATA}\n  {MAC_DATA}\n"
    )

# --- FIXED MODEL_OUTPUT LOGIC ---
if ENV_MODEL:
    MODEL_OUTPUT = Path(ENV_MODEL)
elif os.name == "nt":  # Windows
    MODEL_OUTPUT = WINDOWS_MODEL
else:                  # macOS or Linux
    MODEL_OUTPUT = MAC_MODEL

print("DATA_PATH =", DATA_PATH)
print("MODEL_OUTPUT =", MODEL_OUTPUT)

DATA_PATH = /Users/anishsharma/Developer/SE project/extract/water_dataX.csv
MODEL_OUTPUT = /Users/anishsharma/Developer/SE project/extract/rf_water_model.joblib


In [79]:
# Cell: load data
# Try UTF-8 first (recommended)
try:
    df = pd.read_csv(DATA_PATH, encoding='utf-8', low_memory=False)
except UnicodeDecodeError:
    print("UTF-8 failed, trying latin1…")
    df = pd.read_csv(DATA_PATH, encoding='latin1', low_memory=False)

# Optional: parse dates after load
if 'sample_date' in df.columns:
    df['sample_date'] = pd.to_datetime(df['sample_date'], errors='coerce', dayfirst=True)

print("Loaded shape:", df.shape)
display(df.head())
df.info()


UTF-8 failed, trying latin1…
Loaded shape: (1991, 12)


Unnamed: 0,STATION CODE,LOCATIONS,STATE,Temp,D.O. (mg/l),PH,CONDUCTIVITY (µmhos/cm),B.O.D. (mg/l),NITRATENAN N+ NITRITENANN (mg/l),FECAL COLIFORM (MPN/100ml),TOTAL COLIFORM (MPN/100ml)Mean,year
0,1393,"DAMANGANGA AT D/S OF MADHUBAN, DAMAN",DAMAN & DIU,30.6,6.7,7.5,203,NAN,0.1,11,27,2014
1,1399,ZUARI AT D/S OF PT. WHERE KUMBARJRIA CANAL JOI...,GOA,29.8,5.7,7.2,189,2,0.2,4953,8391,2014
2,1475,ZUARI AT PANCHAWADI,GOA,29.5,6.3,6.9,179,1.7,0.1,3243,5330,2014
3,3181,RIVER ZUARI AT BORIM BRIDGE,GOA,29.7,5.8,6.9,64,3.8,0.5,5382,8443,2014
4,3182,RIVER ZUARI AT MARCAIM JETTY,GOA,29.5,5.8,7.3,83,1.9,0.4,3428,5500,2014


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1991 entries, 0 to 1990
Data columns (total 12 columns):
 #   Column                            Non-Null Count  Dtype 
---  ------                            --------------  ----- 
 0   STATION CODE                      1991 non-null   object
 1   LOCATIONS                         1991 non-null   object
 2   STATE                             1991 non-null   object
 3   Temp                              1991 non-null   object
 4   D.O. (mg/l)                       1991 non-null   object
 5   PH                                1991 non-null   object
 6   CONDUCTIVITY (µmhos/cm)           1991 non-null   object
 7   B.O.D. (mg/l)                     1991 non-null   object
 8   NITRATENAN N+ NITRITENANN (mg/l)  1991 non-null   object
 9   FECAL COLIFORM (MPN/100ml)        1991 non-null   object
 10  TOTAL COLIFORM (MPN/100ml)Mean    1991 non-null   object
 11  year                              1991 non-null   int64 
dtypes: int64(1), object(

In [80]:
# Cell: clean & rename columns (improved)
import re
from difflib import get_close_matches

def normalize_colname(s):
    """Normalize a column name to a simple form for matching."""
    if pd.isna(s):
        return ""
    s = str(s)
    # remove BOM/newlines/tabs, collapse spaces, strip
    s = s.replace('\ufeff', '').replace('\r', ' ').replace('\n', ' ').replace('\t', ' ')
    s = re.sub(r'\s+', ' ', s).strip()
    return s.lower()

if df is not None:
    # canonical rename map (lowercase keys)
    rename_map = {
        'temp': 'Temp',
        'd.o. (mg/l)': 'DO',  # keep original patterns if present
        'd.o. (mg/l)': 'DO',
        'ph': 'pH',
        'conductivity (µmhos/cm)': 'Conductivity',
        'b.o.d. (mg/l)': 'BOD',
        'nitrate': 'Nitrate',
        'fecal coliform (mpn/100ml)': 'FecalColiform',
        'total coliform (mpn/100ml)': 'TotalColiform',
        'year': 'Year',
        'station code': 'StationCode',
        'locations': 'MonitoringLocation',
        'state': 'State'
    }

    # Build a normalized -> original column mapping
    orig_cols = list(df.columns)
    norm_map = {orig: normalize_colname(orig) for orig in orig_cols}

    # Attempt mapping using exact normalized matches first
    col_map = {}
    for orig, norm in norm_map.items():
        if norm in rename_map:
            col_map[orig] = rename_map[norm]
        else:
            # heuristic substring matches for common patterns (nitrate, ph, conductivity, coliform, etc.)
            if 'nitrate' in norm or 'nitrat' in norm or 'no3' in norm:
                col_map[orig] = 'Nitrate'
            elif 'coliform' in norm and 'fecal' in norm:
                col_map[orig] = 'FecalColiform'
            elif 'total coliform' in norm or (('coliform' in norm) and ('total' in norm)):
                col_map[orig] = 'TotalColiform'
            elif 'conduct' in norm:
                col_map[orig] = 'Conductivity'
            elif norm.startswith('ph') or norm == 'ph':
                col_map[orig] = 'pH'
            elif norm.startswith('b.o.d') or 'bod' in norm:
                col_map[orig] = 'BOD'
            elif norm.startswith('d.o') or 'do ' in norm:
                col_map[orig] = 'DO'
            # you can add more heuristics here
            else:
                # optional: fuzzy match to known keys (helpful for typos)
                matches = get_close_matches(norm, list(rename_map.keys()), n=1, cutoff=0.8)
                if matches:
                    col_map[orig] = rename_map[matches[0]]
                # else: leave unmapped

    # Apply renaming
    df = df.rename(columns=col_map)
    print("Applied column renames (sample):")
    for k, v in list(col_map.items())[:20]:
        print(f"  {k} -> {v}")

    # Standardize whitespace in column names (safe canonical names)
    df.columns = [c.strip() if isinstance(c, str) else c for c in df.columns]

    # Expected numeric columns (canonical names)
    numeric_cols = ['Temp', 'DO', 'pH', 'Conductivity', 'BOD', 'Nitrate', 'FecalColiform', 'TotalColiform']

    # Pre-clean numeric text function
    def clean_numeric_series(s):
        # convert to string, remove commas, remove unit strings and footnote markers, convert >, < to numeric approx
        s = s.astype(str).str.strip()
        # handle '>' and '<' by removing symbol (could also prefix a flag if needed)
        s = s.str.replace(r'[<>]', '', regex=True)
        # remove commas and common non-numeric chars (units)
        s = s.str.replace(r',', '', regex=True)
        s = s.str.replace(r'[^\d\.\-eE]', '', regex=True)  # keep digits, dot, minus, exponent
        # empty strings -> NaN
        s = s.replace({'': np.nan, 'nan': np.nan, 'None': np.nan})
        return s

    # Create missing-indicator dictionary and coerce
    missing_report = {}
    for c in numeric_cols:
        if c in df.columns:
            before_nonnull = df[c].notna().sum()
            # clean then coerce
            df[c] = clean_numeric_series(df[c])
            df[c] = pd.to_numeric(df[c], errors='coerce')
            after_nonnull = df[c].notna().sum()
            coerced_to_nan = before_nonnull - after_nonnull
            missing_report[c] = {'present': True, 'coerced_to_nan': coerced_to_nan, 'non_null_after': after_nonnull}
        else:
            # column missing entirely
            df[c] = np.nan
            missing_report[c] = {'present': False, 'coerced_to_nan': None, 'non_null_after': 0}

    # Print missing / coercion summary
    print("\nNumeric columns summary:")
    for c, info in missing_report.items():
        if info['present']:
            print(f" {c}: non-null after coercion = {info['non_null_after']} (coerced_to_nan = {info['coerced_to_nan']})")
        else:
            print(f" {c}: MISSING (created column filled with NaN)")

    # Optional: flag suspicious values (simple example)
    if 'pH' in df.columns:
        bad_ph = df.loc[~df['pH'].between(0, 14) & df['pH'].notna(), 'pH']
        if len(bad_ph) > 0:
            print(f"\nWarning: {len(bad_ph)} pH values outside 0-14 range (these will remain in df as-is).")

    # Drop rows with no StationCode AND all numeric cols missing (your previous logic)
    if 'StationCode' in df.columns:
        df = df[df['StationCode'].notna()]

    # Show shape and head
    df = df.reset_index(drop=True)
    print('\nAfter cleaning shape:', df.shape)
    display(df[numeric_cols].head())

else:
    print('Dataset not loaded.')

Applied column renames (sample):
  STATION CODE -> StationCode
  LOCATIONS -> MonitoringLocation
  STATE -> State
  Temp -> Temp
  D.O. (mg/l) -> DO
  PH -> pH
  CONDUCTIVITY (µmhos/cm) -> Conductivity
  B.O.D. (mg/l) -> BOD
  NITRATENAN N+ NITRITENANN (mg/l) -> Nitrate
  FECAL COLIFORM (MPN/100ml) -> FecalColiform
  TOTAL COLIFORM (MPN/100ml)Mean -> TotalColiform
  year -> Year

Numeric columns summary:
 Temp: non-null after coercion = 1899 (coerced_to_nan = 92)
 DO: non-null after coercion = 1960 (coerced_to_nan = 31)
 pH: non-null after coercion = 1983 (coerced_to_nan = 8)
 Conductivity: non-null after coercion = 1966 (coerced_to_nan = 25)
 BOD: non-null after coercion = 1948 (coerced_to_nan = 43)
 Nitrate: non-null after coercion = 1766 (coerced_to_nan = 225)
 FecalColiform: non-null after coercion = 1675 (coerced_to_nan = 316)
 TotalColiform: non-null after coercion = 1859 (coerced_to_nan = 132)


After cleaning shape: (1991, 12)


Unnamed: 0,Temp,DO,pH,Conductivity,BOD,Nitrate,FecalColiform,TotalColiform
0,30.6,6.7,7.5,203.0,,0.1,11.0,27.0
1,29.8,5.7,7.2,189.0,2.0,0.2,4953.0,8391.0
2,29.5,6.3,6.9,179.0,1.7,0.1,3243.0,5330.0
3,29.7,5.8,6.9,64.0,3.8,0.5,5382.0,8443.0
4,29.5,5.8,7.3,83.0,1.9,0.4,3428.0,5500.0


In [81]:
# Cell: create WHO-based Risk column (improved, vectorized, configurable)
import numpy as np

if df is not None:
    # --- Configurable thresholds (document units) ---
    # Note: all thresholds assume units are consistent with cleaned numeric columns.
    # e.g. pH (unitless), DO (mg/L), BOD (mg/L), Conductivity (µS/cm), Nitrate (mg/L),
    # TotalColiform (MPN/100 mL), FecalColiform (MPN/100 mL)
    THRESHOLDS = {
        'pH':        {'low_high': (6.5, 8.5), 'medium_ranges': [(6.5, 7.0), (8.0, 8.5)]},  # keep for doc
        'DO':        {'high': 3.0, 'medium': 5.0},            # <3 -> High, <5 -> Medium, else Low
        'BOD':       {'medium': 1.0, 'high': 3.0},            # >3 High, >1 Medium
        'Conductivity': {'medium': 500, 'high': 1500},       # >1500 High, >500 Medium
        'Nitrate':   {'medium': 10, 'high': 45},             # >45 High, >10 Medium
        'TotalColiform': {'medium': 500, 'high': 2500},      # >2500 High, >500 Medium
        'FecalColiform': {'medium': 100, 'high': 500}        # >500 High, >100 Medium
    }

    # risk score mapping and inverse
    RISK_TO_SCORE = {'Low': 0, 'Medium': 1, 'High': 2}
    SCORE_TO_RISK = {v: k for k, v in RISK_TO_SCORE.items()}

    # helper to safely get series or a nan-series if missing
    def get_series(col):
        if col in df.columns:
            return df[col]
        else:
            # create column of NaNs to avoid KeyError; we'll log later
            df[col] = np.nan
            return df[col]

    # create missing-indicator columns & ensure numeric dtype
    params = ['pH', 'DO', 'BOD', 'Conductivity', 'Nitrate', 'TotalColiform', 'FecalColiform']
    for p in params:
        if p not in df.columns:
            print(f"Warning: column '{p}' not found. Creating as NaN.")
            df[p] = np.nan
        # missing indicator
        df[f'{p}_missing'] = df[p].isna().astype(int)

    # Vectorized risk assignment - produce numeric score columns (0,1,2) quickly
    # pH is special because both low and high extremes are risky
    # Initialize score columns with NaN
    for p in params:
        df[f'{p}_score'] = np.nan

    # pH handling (two-sided)
    if 'pH' in THRESHOLDS:
        col = df['pH']
        # High: pH < 6.5 or pH > 8.5
        high_mask = col < 6.5
        high_mask |= col > 8.5
        # Medium: (6.5 <= pH < 7.0) or (8.0 < pH <= 8.5)
        medium_mask = ((col >= 6.5) & (col < 7.0)) | ((col > 8.0) & (col <= 8.5))
        low_mask = (~high_mask) & (~medium_mask) & (col.notna())
        df.loc[high_mask, 'pH_score'] = RISK_TO_SCORE['High']
        df.loc[medium_mask, 'pH_score'] = RISK_TO_SCORE['Medium']
        df.loc[low_mask, 'pH_score'] = RISK_TO_SCORE['Low']

    # generic pattern for parameters where ">" means worse (e.g., BOD, Conductivity, Nitrate, Coliforms)
    for p in ['DO', 'BOD', 'Conductivity', 'Nitrate', 'TotalColiform', 'FecalColiform']:
        s = df[p]
        t = THRESHOLDS.get(p, {})
        if p == 'DO':
            # for DO lower is worse
            high_mask = s < t['high']   # <3 -> High
            medium_mask = (s < t['medium']) & (~high_mask)  # <5 and >=3 -> Medium
            low_mask = (~high_mask) & (~medium_mask) & (s.notna())
        else:
            high_mask = s > t['high']
            medium_mask = (s > t['medium']) & (~high_mask)
            low_mask = (~high_mask) & (~medium_mask) & (s.notna())

        df.loc[high_mask, f'{p}_score'] = RISK_TO_SCORE['High']
        df.loc[medium_mask, f'{p}_score'] = RISK_TO_SCORE['Medium']
        df.loc[low_mask, f'{p}_score'] = RISK_TO_SCORE['Low']

    # Log parameter-level distributions
    print("Per-parameter risk counts (score):")
    for p in params:
        counts = df[f'{p}_score'].value_counts(dropna=False).sort_index()
        # map index numbers back to risk labels for readability
        readable = {SCORE_TO_RISK.get(int(k), 'NaN') if not np.isnan(k) else 'NaN': v for k, v in counts.items()}
        print(f" {p}: {readable}")

    # --- Combine into overall risk ---
    # approach: take the maximum score across parameters (worst-case); alternative: weighted max
    score_cols = [f'{p}_score' for p in params]
    # convert to numeric and ignore NaNs for max
    df['overall_score'] = df[score_cols].max(axis=1, skipna=True)

    # If all score columns are NaN for a row, overall_score will be NaN -> keep as NaN (no data)
    # Map back to labels
    df['Risk'] = df['overall_score'].map(SCORE_TO_RISK)

    # Optional: if you want to require at least N non-missing parameter scores to set overall risk:
    # df['non_missing_scores'] = df[score_cols].notna().sum(axis=1)
    # df.loc[df['non_missing_scores'] < 2, 'Risk'] = np.nan  # require at least 2 params present (example)

    # Print distribution
    print("\nOverall Risk distribution (including NaN):")
    display(df['Risk'].value_counts(dropna=False))

    # Optional: flag suspicious values
    bad_ph = df.loc[~df['pH'].between(0, 14) & df['pH'].notna(), 'pH']
    if len(bad_ph) > 0:
        print(f"Warning: {len(bad_ph)} pH values outside 0-14 range. Inspect these rows manually.")

else:
    print('Dataset not loaded.')

Per-parameter risk counts (score):
 pH: {'Low': 1259, 'Medium': 508, 'High': 216, 'NaN': 8}
 DO: {'Low': 1729, 'Medium': 172, 'High': 59, 'NaN': 31}
 BOD: {'Low': 406, 'Medium': 930, 'High': 612, 'NaN': 43}
 Conductivity: {'Low': 1428, 'Medium': 293, 'High': 245, 'NaN': 25}
 Nitrate: {'Low': 1711, 'Medium': 53, 'High': 2, 'NaN': 225}
 TotalColiform: {'Low': 951, 'Medium': 502, 'High': 406, 'NaN': 132}
 FecalColiform: {'Low': 647, 'Medium': 482, 'High': 546, 'NaN': 316}

Overall Risk distribution (including NaN):


Risk
High      1203
Medium     714
Low         70
NaN          4
Name: count, dtype: int64



In [82]:
# Cell: create WHO-based Risk column (single copy-paste updated cell)


# --- CONFIG ---
# Documented thresholds (units must match cleaned numeric columns)
THRESHOLDS = {
    'pH':            {'low': 6.5, 'high': 8.5, 'medium_low': 6.5, 'medium_high': 7.0, 'medium2_low': 8.0, 'medium2_high': 8.5},
    'DO':            {'high': 3.0, 'medium': 5.0},            # DO (mg/L): <3 High, <5 Medium, else Low
    'BOD':           {'medium': 1.0, 'high': 3.0},            # BOD (mg/L): >3 High, >1 Medium
    'Conductivity':  {'medium': 500, 'high': 1500},          # µS/cm or µmhos/cm
    'Nitrate':       {'medium': 10, 'high': 45},             # mg/L
    'TotalColiform': {'medium': 500, 'high': 2500},          # MPN/100 mL
    'FecalColiform': {'medium': 100, 'high': 500}            # MPN/100 mL
}

# Controls
RISK_TO_SCORE = {'Low': 0, 'Medium': 1, 'High': 2}
SCORE_TO_RISK = {v: k for k, v in RISK_TO_SCORE.items()}
PARAMS = ['pH', 'DO', 'BOD', 'Conductivity', 'Nitrate', 'TotalColiform', 'FecalColiform']
MIN_NON_MISSING = 2   # require at least this many parameter scores to set an overall Risk
AGG_METHOD = 'max'    # 'max' (default worst-case) or 'weighted' (see WEIGHTS)
WEIGHTS = {'pH':1.0, 'DO':1.0, 'BOD':1.0, 'Conductivity':0.5, 'Nitrate':0.8, 'TotalColiform':1.0, 'FecalColiform':1.2}
MISSING_WARNING_PCT = 0.5    # warn if param missing in >50% rows
HIGH_PCT_THRESHOLD = 0.3     # warn if param High in >30% rows
KEEP_INTERMEDIATE = True     # set False to drop *_score and *_missing columns after

# --- Implementation ---
if df is None:
    print("Dataset not loaded (df is None).")
else:
    # Ensure required columns exist (create as NaN and warn if missing)
    for p in PARAMS:
        if p not in df.columns:
            print(f"Warning: column '{p}' not found. Creating as NaN.")
            df[p] = np.nan
        # missing indicator
        df[f'{p}_missing'] = df[p].isna().astype('int8')

    # Initialize score columns as float (NaN-able) and then convert to nullable Int
    for p in PARAMS:
        df[f'{p}_score'] = np.nan

    # --- pH (two-sided) ---
    p = 'pH'
    if p in THRESHOLDS:
        col = df[p]
        low_thresh, high_thresh = THRESHOLDS[p]['low'], THRESHOLDS[p]['high']
        # High if outside [low_thresh, high_thresh]
        high_mask = (col < low_thresh) | (col > high_thresh)
        medium_mask = (((col >= THRESHOLDS[p]['medium_low']) & (col < THRESHOLDS[p]['medium_high'])) |
                       ((col > THRESHOLDS[p]['medium2_low']) & (col <= THRESHOLDS[p]['medium2_high'])))
        low_mask = (~high_mask) & (~medium_mask) & col.notna()
        df.loc[high_mask, f'{p}_score'] = RISK_TO_SCORE['High']
        df.loc[medium_mask, f'{p}_score'] = RISK_TO_SCORE['Medium']
        df.loc[low_mask, f'{p}_score'] = RISK_TO_SCORE['Low']

    # --- Generic rules for other params (direction: higher is worse except DO) ---
    for p in [x for x in PARAMS if x != 'pH']:
        s = df[p]
        t = THRESHOLDS.get(p, {})
        if p == 'DO':
            # lower is worse for DO
            high_mask = s < t.get('high', np.nan)
            medium_mask = (s < t.get('medium', np.nan)) & (~high_mask)
            low_mask = (~high_mask) & (~medium_mask) & s.notna()
        else:
            high_mask = s > t.get('high', np.nan)
            medium_mask = (s > t.get('medium', np.nan)) & (~high_mask)
            low_mask = (~high_mask) & (~medium_mask) & s.notna()

        df.loc[high_mask, f'{p}_score'] = RISK_TO_SCORE['High']
        df.loc[medium_mask, f'{p}_score'] = RISK_TO_SCORE['Medium']
        df.loc[low_mask, f'{p}_score'] = RISK_TO_SCORE['Low']

    # Convert score columns to nullable integers for clarity
    score_cols = [f'{p}_score' for p in PARAMS]
    for sc in score_cols:
        df[sc] = df[sc].astype('Int64')  # keeps NaNs

    # --- Aggregation: overall_score ---
    if AGG_METHOD == 'max':
        # worst-case: take max of scores (ignoring NaNs)
        df['overall_score'] = df[score_cols].max(axis=1, skipna=True)
    elif AGG_METHOD == 'weighted':
        # weighted average across available scores then round to nearest integer score
        weight_arr = np.array([WEIGHTS[p] for p in PARAMS], dtype=float)
        scores_arr = df[score_cols].to_numpy(dtype=float)  # NaNs remain
        # numerator: sum(score * weight) ignoring NaNs
        numer = np.nansum(np.where(np.isnan(scores_arr), 0, scores_arr * weight_arr), axis=1)
        denom = np.nansum(np.where(np.isnan(scores_arr), 0, weight_arr), axis=1)
        # avoid division by zero: where denom==0 => set overall_score NaN
        overall = np.where(denom > 0, numer / denom, np.nan)
        # round to nearest integer score 0/1/2
        overall_rounded = np.round(overall).astype('Int64')
        df['overall_score'] = pd.Series(overall_rounded, index=df.index)

    # Require minimum non-missing parameter scores
    df['non_missing_scores'] = df[score_cols].notna().sum(axis=1)
    df.loc[df['non_missing_scores'] < MIN_NON_MISSING, 'overall_score'] = pd.NA

    # Map numeric overall_score back to Risk labels and make ordered categorical
    df['Risk'] = df['overall_score'].map(SCORE_TO_RISK)
    df['Risk'] = pd.Categorical(df['Risk'], categories=['Low', 'Medium', 'High'], ordered=True)

    # --- Top driver(s): which parameter(s) had the overall_score ---
    # Vectorized computation for top drivers (fast)
    score_matrix = df[score_cols].to_numpy(dtype=float)  # shape (n_rows, n_params)
    # Replace NaN with -inf to avoid selecting them as top drivers
    nan_mask = np.isnan(score_matrix)
    score_matrix_for_argmax = np.where(nan_mask, -9999.0, score_matrix)
    # max per row
    row_max = np.max(score_matrix_for_argmax, axis=1)
    top_driver_list = []
    for i, rm in enumerate(row_max):
        if rm == -9999.0:
            top_driver_list.append(pd.NA)
            continue
        drivers = [PARAMS[j] for j, val in enumerate(score_matrix_for_argmax[i]) if val == rm]
        top_driver_list.append(",".join(drivers) if drivers else pd.NA)
    df['top_driver'] = pd.Series(top_driver_list, index=df.index)

    # --- Diagnostics & warnings ---
    print("Per-parameter risk (score) counts:")
    for p in PARAMS:
        counts = df[f'{p}_score'].value_counts(dropna=False).sort_index()
        readable = { (SCORE_TO_RISK.get(int(k), 'NaN') if not pd.isna(k) else 'NaN'): v for k, v in counts.items() }
        print(f"  {p}: {readable}")

    print("\nOverall Risk distribution (including NaN):")
    display(df['Risk'].value_counts(dropna=False))

    # Data-quality warnings
    for p in PARAMS:
        pct_missing = df[p].isna().mean()
        if pct_missing > MISSING_WARNING_PCT:
            print(f"Warning: {p} missing in {pct_missing:.1%} of rows.")
        high_pct = (df[f'{p}_score'] == RISK_TO_SCORE['High']).mean()
        if high_pct > HIGH_PCT_THRESHOLD:
            print(f"Warning: {p} marked High in {high_pct:.1%} of rows — verify units/thresholds.")

    # Flag suspicious numeric ranges (example: pH outside 0-14)
    if 'pH' in df.columns:
        bad_ph_count = (~df['pH'].between(0, 14) & df['pH'].notna()).sum()
        if bad_ph_count > 0:
            print(f"Warning: {bad_ph_count} pH values outside 0-14 range. Inspect these rows manually.")

    # Optionally drop intermediate columns to keep dataframe clean
    if not KEEP_INTERMEDIATE:
        drop_cols = score_cols + [f'{p}_missing' for p in PARAMS] + ['non_missing_scores']
        df.drop(columns=[c for c in drop_cols if c in df.columns], inplace=True)

    # finished
    print("\nRisk computation complete.")

Per-parameter risk (score) counts:
  pH: {'Low': np.int64(1259), 'Medium': np.int64(508), 'High': np.int64(216), 'NaN': np.int64(8)}
  DO: {'Low': np.int64(1729), 'Medium': np.int64(172), 'High': np.int64(59), 'NaN': np.int64(31)}
  BOD: {'Low': np.int64(406), 'Medium': np.int64(930), 'High': np.int64(612), 'NaN': np.int64(43)}
  Conductivity: {'Low': np.int64(1428), 'Medium': np.int64(293), 'High': np.int64(245), 'NaN': np.int64(25)}
  Nitrate: {'Low': np.int64(1711), 'Medium': np.int64(53), 'High': np.int64(2), 'NaN': np.int64(225)}
  TotalColiform: {'Low': np.int64(951), 'Medium': np.int64(502), 'High': np.int64(406), 'NaN': np.int64(132)}
  FecalColiform: {'Low': np.int64(647), 'Medium': np.int64(482), 'High': np.int64(546), 'NaN': np.int64(316)}

Overall Risk distribution (including NaN):


Risk
High      1203
Medium     714
Low         70
NaN          4
Name: count, dtype: int64


Risk computation complete.


In [83]:
# ==== DATA PREPARATION BEFORE TRAINING ====

# 1. Remove rows with missing Risk labels
missing_before = df['Risk'].isna().sum()
if missing_before > 0:
    print(f"Dropping {missing_before} rows with missing Risk labels before training.")
    df = df[df['Risk'].notna()].reset_index(drop=True)

# 2. Encode target
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y_encoded = le.fit_transform(df['Risk'])
print("Encoded classes:", list(le.classes_))

# 3. Define features
features = ['Temp','DO','pH','Conductivity','BOD','Nitrate','FecalColiform','TotalColiform']
features = [f for f in features if f in df.columns]

# 4. Build X DataFrame
X = df[features].copy()

print("Prepared X and y_encoded:")
print("X shape:", X.shape)
print("y shape:", y_encoded.shape)

Dropping 4 rows with missing Risk labels before training.
Encoded classes: ['High', 'Low', 'Medium']
Prepared X and y_encoded:
X shape: (1987, 8)
y shape: (1987,)


In [84]:
# ==== TRAINING CELL (clean, minimal, final) ====

from sklearn.model_selection import train_test_split, RandomizedSearchCV, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.calibration import CalibratedClassifierCV
from sklearn.inspection import permutation_importance
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, classification_report
import joblib
import numpy as np

RANDOM_STATE = 42
TEST_SIZE = 0.2
CV_FOLDS = 5
N_ITER = 20

# X must be a DataFrame and y must be encoded before this cell
numeric_cols = ['Temp','DO','pH','Conductivity','BOD','Nitrate','FecalColiform','TotalColiform']
numeric_cols = [c for c in numeric_cols if c in X.columns]
categorical_cols = [c for c in X.columns if c not in numeric_cols]

# --- Preprocessor ---
preprocessor = ColumnTransformer([
    ('num', Pipeline([
        ('impute', SimpleImputer(strategy='median')),
        ('scale', StandardScaler())
    ]), numeric_cols),
    
    ('cat', Pipeline([
        ('impute', SimpleImputer(strategy='most_frequent')),
        ('ohe', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
    ]), categorical_cols)
])

# --- Full pipeline ---
pipeline = Pipeline([
    ('preproc', preprocessor),
    ('clf', RandomForestClassifier(random_state=RANDOM_STATE, n_jobs=-1))
])

# --- Hyperparameter search ---
param_dist = {
    'clf__n_estimators': [100, 200, 300, 500],
    'clf__max_depth': [None, 8, 12, 20],
    'clf__min_samples_split': [2, 5, 10],
    'clf__min_samples_leaf': [1, 2, 4],
    'clf__max_features': ['sqrt', 'log2', 0.5],
    'clf__class_weight': [None, 'balanced']
}

# Compute minimum class size and adjust CV folds
class_counts = pd.Series(y_encoded).value_counts()
min_count = int(class_counts.min())
print("Class counts:", class_counts.to_dict())

CV_FOLDS = min(5, max(2, min_count))  # adaptive CV folds
print("Using CV_FOLDS =", CV_FOLDS)

cv = StratifiedKFold(n_splits=CV_FOLDS, shuffle=True, random_state=RANDOM_STATE)

search = RandomizedSearchCV(
    estimator=pipeline,
    param_distributions=param_dist,
    n_iter=N_ITER,
    scoring='f1_macro',
    cv=cv,
    random_state=RANDOM_STATE,
    n_jobs=-1,
    verbose=1,
    refit=True
)

# --- Split ---
X_train, X_test, y_train, y_test = train_test_split(
    X, y_encoded, test_size=TEST_SIZE, random_state=RANDOM_STATE, stratify=y_encoded
)

search.fit(X_train, y_train)
best_model = search.best_estimator_

# --- Eval ---
y_pred = best_model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred, target_names=list(le.classes_)))

# --- Save artifact ---
artifact = {
    'pipeline': best_model,
    'label_encoder': le,
    'features': list(X.columns),
    'best_params': search.best_params_
}

joblib.dump(artifact, MODEL_OUTPUT, compress=3)
print("Saved model to:", MODEL_OUTPUT)

Class counts: {0: 1203, 2: 714, 1: 70}
Using CV_FOLDS = 5
Fitting 5 folds for each of 20 candidates, totalling 100 fits
Accuracy: 0.9849246231155779
              precision    recall  f1-score   support

        High       1.00      0.99      1.00       241
         Low       0.92      0.79      0.85        14
      Medium       0.97      0.99      0.98       143

    accuracy                           0.98       398
   macro avg       0.96      0.92      0.94       398
weighted avg       0.98      0.98      0.98       398

Saved model to: /Users/anishsharma/Developer/SE project/extract/rf_water_model.joblib


In [85]:
# test examples

artifact = joblib.load(MODEL_OUTPUT)
pipeline = artifact['pipeline']
le = artifact['label_encoder']
features = artifact['features']

# 1) Sanity: shape and classes
print("Pipeline loaded. Classes:", le.classes_)

# 2) Single sample prediction and probabilities
sample = X_test.iloc[[0]][features]  # use a real row
pred = pipeline.predict(sample)[0]
proba = pipeline.predict_proba(sample)[0]
print("Pred:", le.inverse_transform([pred]) if hasattr(le, 'inverse_transform') else pred)
for cls, p in zip(pipeline.classes_, proba):
    print(f"{cls}: {p*100:.4f}%")

# 3) Batch shape test
batch = X_test.iloc[:5][features]
assert pipeline.predict_proba(batch).shape == (5, len(pipeline.classes_))

# 4) Missing value handling test (should not raise)
test_missing = batch.copy()
test_missing.iloc[0, :] = np.nan
_ = pipeline.predict_proba(test_missing)  # should run without crashing if pipeline handles missing

Pipeline loaded. Classes: ['High' 'Low' 'Medium']
Pred: ['High']
0: 98.5915%
1: 0.1380%
2: 1.2705%


In [86]:
# Some gpt test cell

# Robust loader + tester for saved model artifact
import joblib
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline

# Load artifact
artifact = joblib.load(MODEL_OUTPUT)
print("Artifact type:", type(artifact))

# If artifact is a dict-like, print keys
if isinstance(artifact, dict):
    print("Artifact keys:", list(artifact.keys()))
else:
    # Not a dict - print repr and try to treat as pipeline directly
    print("Artifact is not a dict. repr:")
    print(repr(artifact))

# Attempt to locate pipeline / model / components
pipeline = None
le = None
features = None

if isinstance(artifact, dict):
    # Common variants handled:
    # 1) {'pipeline': <Pipeline>, 'label_encoder': le, 'features': [...]}
    if 'pipeline' in artifact:
        pipeline = artifact['pipeline']
        le = artifact.get('label_encoder', None)
        features = artifact.get('features', None)
        print("Using artifact['pipeline'].")
    # 2) {'model': <clf>, 'imputer': <preproc>, 'label_encoder': le, 'features': [...]}
    elif 'model' in artifact:
        model = artifact['model']
        imputer = artifact.get('imputer', None) or artifact.get('preprocessor', None)
        le = artifact.get('label_encoder', None) or artifact.get('le', None)
        features = artifact.get('features', None) or artifact.get('feature_names', None)
        if imputer is not None:
            # build a small pipeline (imputer might be a transformer; if it's a bare SimpleImputer it's still fine)
            pipeline = Pipeline([('preproc', imputer), ('clf', model)])
            print("Rebuilt pipeline from artifact['imputer'] + artifact['model'].")
        else:
            # no preprocessor: create a minimal pipeline that just contains the model
            pipeline = Pipeline([('clf', model)])
            print("Created pipeline from artifact['model'] (no imputer found).")
    # 3) older saving: saved classifier directly (not in dict)
    else:
        # check for common alternatives
        possible_le_keys = [k for k in artifact.keys() if 'label' in k.lower() or 'le' == k.lower()]
        if possible_le_keys:
            print("Found potential label encoder keys:", possible_le_keys)
        # try to find model-like object
        # fallback: if dict contains only one sklearn estimator, try to use it
        # not doing that automatically to avoid speculation
        print("Unhandled artifact dict layout. Please examine keys above.")
else:
    # artifact not a dict: maybe it is directly the pipeline/model
    try:
        # If it looks like a pipeline or estimator, use it directly
        from sklearn.base import BaseEstimator
        if isinstance(artifact, BaseEstimator) or hasattr(artifact, 'predict'):
            pipeline = artifact
            print("Artifact appears to be a model/pipeline object and will be used directly.")
    except Exception:
        pass

# At this point, pipeline may or may not be set
if pipeline is None:
    raise KeyError("Could not find a pipeline/model in the artifact automatically. "
                   "Please inspect the artifact keys printed above and let me know its structure, "
                   "or re-save your artifact as: joblib.dump({'pipeline': pipeline, 'label_encoder': le, 'features': features}, MODEL_OUTPUT)")

# If features missing, try to infer from saved metadata or ask user fallback
if features is None:
    # try to infer from pipeline if it has named transformer with get_feature_names_out
    try:
        if hasattr(pipeline, 'named_steps') and 'preproc' in pipeline.named_steps:
            pre = pipeline.named_steps['preproc']
            if hasattr(pre, 'get_feature_names_out'):
                features = list(pre.get_feature_names_out())
                print("Inferred feature names from pipeline.preproc.get_feature_names_out().")
    except Exception:
        pass

if features is None:
    # fallback: try artifact['features'] variations, else use X_test columns if available
    if isinstance(artifact, dict) and 'features' in artifact:
        features = artifact['features']
    elif 'X_test' in globals():
        print("Using X_test columns as features (ensure X_test exists and is correct).")
        features = list(X_test.columns)
    else:
        print("Features not found. You must supply `features` list or create X_test in the session.")
        raise KeyError("Missing 'features' information. Provide artifact['features'] or set X_test in the session.")

# Try to extract label encoder if None
if le is None:
    if isinstance(artifact, dict):
        for k in artifact.keys():
            if 'label' in k.lower() or k.lower() in ('le', 'label_encoder', 'labelencoder'):
                le = artifact[k]
                print(f"Using label encoder from artifact['{k}'].")
                break

# Now we have pipeline, features, possibly le. Run the same tests as before
print("\nReady to run quick tests.")
print("Pipeline type:", type(pipeline))
print("Number of features expected:", len(features))

# Create X_test if not present
if 'X_test' not in globals():
    print("X_test not found in memory. Creating a dummy X_test with NaNs for feature-shape checks.")
    X_test = pd.DataFrame([ {f: np.nan for f in features} ])

# Single-sample prediction
sample = X_test.iloc[[0]][features]
pred = pipeline.predict(sample)[0]
try:
    pred_label = le.inverse_transform([pred])[0] if (le is not None and hasattr(le, 'inverse_transform')) else pred
except Exception:
    pred_label = pred

print("\nSingle-sample prediction (raw):", pred)
print("Single-sample prediction (label):", pred_label)

# Probabilities (if available)
if hasattr(pipeline, "predict_proba"):
    proba = pipeline.predict_proba(sample)[0]
    cls_names = getattr(pipeline, "classes_", getattr(le, "classes_", None))
    if cls_names is None:
        cls_names = list(range(len(proba)))
    print("\nProbabilities:")
    for cls, p in zip(cls_names, proba):
        print(f" {cls}: {p*100:.4f}%")
else:
    print("Pipeline has no predict_proba method. Skipping probabilities.")

# Batch test
batch = X_test.iloc[:5][features]
if hasattr(pipeline, "predict_proba"):
    probs_batch = pipeline.predict_proba(batch)
    print("\nBatch predict_proba shape:", probs_batch.shape)
else:
    print("Batch predict_proba skipped (no predict_proba).")

# Missing-value test
test_missing = batch.copy()
test_missing.iloc[0, :] = np.nan
try:
    _ = pipeline.predict_proba(test_missing) if hasattr(pipeline, "predict_proba") else pipeline.predict(test_missing)
    print("Missing-value handling test: OK (no exception).")
except Exception as e:
    print("Missing-value handling test: FAILED. Exception:", e)

Artifact type: <class 'dict'>
Artifact keys: ['pipeline', 'label_encoder', 'features', 'best_params']
Using artifact['pipeline'].

Ready to run quick tests.
Pipeline type: <class 'sklearn.pipeline.Pipeline'>
Number of features expected: 8

Single-sample prediction (raw): 0
Single-sample prediction (label): High

Probabilities:
 0: 98.5915%
 1: 0.1380%
 2: 1.2705%

Batch predict_proba shape: (5, 3)
Missing-value handling test: OK (no exception).


In [87]:
# Use this cell to run a realistic single-row prediction with precise percentage output
import numpy as np
import pandas as pd
import joblib

# Load artifact if not already loaded
if 'artifact' not in globals():
    artifact = joblib.load(MODEL_OUTPUT)

# Reconstruct pipeline if not present
if 'pipeline' not in globals():
    if 'model' in artifact and ('imputer' in artifact or 'preprocessor' in artifact):
        imputer = artifact.get('imputer', artifact.get('preprocessor'))
        pipeline = Pipeline([('preproc', imputer), ('clf', artifact['model'])])
    elif isinstance(artifact, dict) and 'model' in artifact:
        pipeline = Pipeline([('clf', artifact['model'])])
    else:
        pipeline = artifact  # fallback if artifact is pipeline

le = artifact.get('label_encoder', artifact.get('le', None))
features = artifact.get('features', None)
if features is None:
    raise KeyError("Artifact missing 'features'. Provide artifact['features'] or set features list.")

# Create a realistic test sample:
# 1) Prefer to use a real row from the cleaned df if available
if 'df' in globals() and set(features).issubset(set(df.columns)):
    # pick a real non-NaN row if possible
    sample_row = df.dropna(subset=features).sample(n=1, random_state=42).iloc[0][features]
    sample_df = pd.DataFrame([sample_row.values], columns=features)
    print("Using a real row from df for prediction.")
else:
    # 2) fallback: build a median-based sample so values are numeric and sensible
    print("df not available or missing features — constructing median-based sample.")
    sample_values = {}
    for f in features:
        if f in globals().get('df', pd.DataFrame()).columns:
            sample_values[f] = df[f].median(skipna=True)
        else:
            # reasonable generic defaults (tweak if you want)
            if 'pH' in f.lower():
                sample_values[f] = 7.2
            elif 'do' in f.lower():
                sample_values[f] = 5.0
            elif 'bod' in f.lower():
                sample_values[f] = 1.0
            elif 'conduct' in f.lower():
                sample_values[f] = 200.0
            elif 'nitrate' in f.lower():
                sample_values[f] = 2.0
            elif 'coliform' in f.lower():
                sample_values[f] = 50.0
            else:
                sample_values[f] = 0.0
    sample_df = pd.DataFrame([sample_values], columns=features)

# Ensure the DataFrame has exact feature names and order
sample_df = sample_df[features]

# Run prediction
# --- Run prediction safely (no warnings, preserves feature names if possible) ---
preproc = pipeline.named_steps.get('preproc')
clf = pipeline.named_steps.get('clf')

try:
    # if the preprocessor supports get_feature_names_out, we build a named DataFrame
    transformed_cols = preproc.get_feature_names_out()
    X_trans = preproc.transform(sample_df)
    X_trans_df = pd.DataFrame(X_trans, columns=transformed_cols, index=sample_df.index)

    # direct classifier prediction (no feature-name warnings)
    probs = clf.predict_proba(X_trans_df)[0]
    pred_raw = clf.predict(X_trans_df)[0]

except Exception as e:
    # fallback to pipeline.predict_proba if preproc does NOT support named columns
    print("Could not create named DataFrame from preprocessor (fallback). Error:", e)
    probs = pipeline.predict_proba(sample_df)[0]
    pred_raw = pipeline.predict(sample_df)[0]

# decode label
try:
    pred_label = le.inverse_transform([pred_raw])[0]
except Exception:
    pred_label = pred_raw

# get human-readable class names
encoded_classes = getattr(pipeline, "classes_", None)
if le is not None and encoded_classes is not None:
    try:
        readable_classes = le.inverse_transform(encoded_classes)
    except Exception:
        readable_classes = encoded_classes
else:
    readable_classes = encoded_classes if encoded_classes is not None else [f"class_{i}" for i in range(len(probs))]
    # Print precise percentages
    print("\nPrediction result:")
    print(" Predicted label (raw):", pred_raw)
    print(" Predicted label (decoded):", pred_label)
    print("\nProbabilities (precise percentages):")
    for cls_name, p in zip(readable_classes, probs):
        print(f" {cls_name}: {p*100:.4f}%")
    print("\nPredicted probability (top):", round(100.0 * probs.max(), 4), "%")

Using a real row from df for prediction.




In [88]:
# Robust prediction cell — replace your old BLANK_INPUT cell with this
import joblib
import numpy as np
import pandas as pd
from pathlib import Path
import warnings

# put your sample values here
BLANK_INPUT = {
    'Temp': 29.5,
    'DO': 5.8,
    'pH': 7.2,
    'Conductivity': 150,
    'BOD': 2.0,
    'Nitrate': 0.5,
    'FecalColiform': 120,
    'TotalColiform': 900
}

if not Path(MODEL_OUTPUT).exists():
    raise FileNotFoundError(f"Model artifact not found at {MODEL_OUTPUT}. Run training cell first.")

obj = joblib.load(MODEL_OUTPUT)
print("Loaded artifact keys:", list(obj.keys()) if isinstance(obj, dict) else "artifact is not a dict")

# Try to get pipeline, label encoder, features in a robust way
pipeline = None
le = None
feat_list = None

if isinstance(obj, dict):
    # preferred modern structure: 'pipeline' saved
    if 'pipeline' in obj:
        pipeline = obj['pipeline']
        le = obj.get('label_encoder', obj.get('le', None))
        feat_list = obj.get('features', None)
        print("Using artifact['pipeline'].")
    # older structure: 'model' + 'imputer'
    elif 'model' in obj and ('imputer' in obj or 'preprocessor' in obj):
        model = obj['model']
        imputer = obj.get('imputer', obj.get('preprocessor'))
        try:
            from sklearn.pipeline import Pipeline
            pipeline = Pipeline([('preproc', imputer), ('clf', model)])
            print("Rebuilt pipeline from artifact['imputer'] + artifact['model'].")
        except Exception as e:
            raise RuntimeError("Failed to rebuild pipeline from model+imputer: " + str(e))
        le = obj.get('label_encoder', obj.get('le', None))
        feat_list = obj.get('features', None)
    # artifact saved as single model object in dict (rare)
    elif 'model' in obj:
        pipeline = obj['model']
        le = obj.get('label_encoder', obj.get('le', None))
        feat_list = obj.get('features', None)
        print("Using artifact['model'] directly.")
    else:
        # fallback: maybe artifact contains pipeline under some other name
        # try to detect the first sklearn estimator-looking object
        possible_keys = [k for k, v in obj.items() if hasattr(v, 'predict')]
        if possible_keys:
            pipeline = obj[possible_keys[0]]
            le = obj.get('label_encoder', obj.get('le', None))
            feat_list = obj.get('features', None)
            print(f"Using artifact['{possible_keys[0]}'] as pipeline (fallback).")
else:
    # not a dict; maybe the artifact *is* the pipeline
    if hasattr(obj, 'predict'):
        pipeline = obj
        print("Artifact is a model/pipeline object directly.")
    else:
        raise RuntimeError("Loaded artifact format not recognized. Please re-save artifact with {'pipeline': pipeline, 'label_encoder': le, 'features': features}.")

# sanity checks
if pipeline is None:
    raise RuntimeError("Could not extract pipeline from artifact. Inspect artifact structure.")

# Get features list; if missing try pipeline/preprocessor or user-provided X
if feat_list is None:
    # try to get from pipeline.preproc if available
    try:
        pre = pipeline.named_steps.get('preproc', None)
        if pre is not None and hasattr(pre, 'get_feature_names_out'):
            feat_list = list(pre.get_feature_names_out())
            print("Inferred feature names from preprocessor.get_feature_names_out().")
    except Exception:
        pass

if feat_list is None:
    # as last resort try existing df (if available)
    if 'df' in globals():
        feat_list = [c for c in BLANK_INPUT.keys() if c in df.columns]
        if not feat_list:
            # use df numeric feature intersection
            feat_list = list(df.columns)
            print("Fallback: using df.columns as features (verify ordering).")
    else:
        raise KeyError("features list not found in artifact and df not available. Provide artifact['features'] or set feat_list manually.")

# Build input DataFrame with exact ordering
x_new = pd.DataFrame([BLANK_INPUT])
# Add any missing columns from feat_list with NaN
for c in feat_list:
    if c not in x_new.columns:
        x_new[c] = np.nan
# Reindex into the exact order
x_new = x_new.reindex(columns=feat_list)

# Run prediction using pipeline (which handles preprocessing)
# Suppress the benign sklearn warning about feature names if it appears — optional
with warnings.catch_warnings():
    warnings.filterwarnings("ignore", message="X does not have valid feature names")
    # predict probabilities if available
    if hasattr(pipeline, "predict_proba"):
        probs = pipeline.predict_proba(x_new)[0]
    else:
        probs = None
    pred_raw = pipeline.predict(x_new)[0]

# Decode predicted label
pred_label = None
if le is not None:
    try:
        # label encoder expects array-like numeric labels or encoded labels
        pred_label = le.inverse_transform([pred_raw])[0]
    except Exception:
        try:
            # sometimes pipeline.predict already returns decoded strings
            pred_label = pred_raw
        except Exception:
            pred_label = pred_raw
else:
    # try pipeline.classes_ -> maybe they are string labels already
    if hasattr(pipeline, "classes_"):
        classes = pipeline.classes_
        try:
            pred_label = classes[list(classes).index(pred_raw)] if pred_raw in classes else pred_raw
        except Exception:
            pred_label = pred_raw
    else:
        pred_label = pred_raw

# Print results
print("\nPrediction result:")
print(" Predicted (raw):", pred_raw)
print(" Predicted (decoded):", pred_label)

if probs is not None:
    # find readable class names
    class_names = getattr(pipeline, "classes_", None)
    if le is not None and class_names is not None:
        try:
            readable = le.inverse_transform(class_names)
        except Exception:
            readable = class_names
    else:
        readable = class_names if class_names is not None else [f"class_{i}" for i in range(len(probs))]

    print("\nProbabilities (precise percentages):")
    for cls_name, p in zip(readable, probs):
        print(f" {cls_name}: {p*100:.4f}%")
    print("\nPredicted probability (top):", round(100.0 * np.max(probs), 4), "%")
else:
    print("\nNo predict_proba available for this model. Only label predicted.")

Loaded artifact keys: ['pipeline', 'label_encoder', 'features', 'best_params']
Using artifact['pipeline'].

Prediction result:
 Predicted (raw): 2
 Predicted (decoded): Medium

Probabilities (precise percentages):
 High: 8.4022%
 Low: 5.0351%
 Medium: 86.5628%

Predicted probability (top): 86.5628 %
