# AWARE — Random Forest (updated for `water_dataX.csv`)

This notebook:

1. Loads `water_dataX.csv` from the location set in `DATA_PATH` (first code cell)
2. Cleans & renames columns
3. Creates `Risk` column using WHO/BIS-aligned thresholds
4. Trains a Random Forest classifier
5. Saves the trained model together with the imputer, label encoder, and feature list
6. Provides a `BLANK_INPUT` cell to make predictions on new data

Run cells top-to-bottom. If your CSV is elsewhere, change `DATA_PATH` in the first code cell.

In [41]:
# Cell: imports and settings
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
import joblib

import os  # stdlib; used only for the environment-variable overrides below

# Input CSV and output model locations.
# The defaults are the original machine-specific Windows paths, so an existing
# setup behaves exactly as before. On any other machine, set AWARE_DATA_PATH /
# AWARE_MODEL_OUTPUT before starting the kernel (or edit the defaults here)
# rather than changing paths in later cells.
DATA_PATH = os.environ.get(
    'AWARE_DATA_PATH',
    r"C:\Users\prana\Documents\PROJECTS\AWARE\extract\water_dataX.csv",
)
MODEL_OUTPUT = os.environ.get(
    'AWARE_MODEL_OUTPUT',
    r"C:\Users\prana\Documents\PROJECTS\AWARE\extract\rf_water_model.joblib",
)
print('DATA_PATH =', DATA_PATH)
print('MODEL_OUTPUT =', MODEL_OUTPUT)


DATA_PATH = C:\Users\prana\Documents\PROJECTS\AWARE\extract\water_dataX.csv
MODEL_OUTPUT = C:\Users\prana\Documents\PROJECTS\AWARE\extract\rf_water_model.joblib


In [42]:
# Cell: load data
# `df` is bound to the loaded frame only after the read and the preview have
# both succeeded; on any failure it stays None, which later cells test for.
df = None
try:
    # latin1 is used for decoding; presumably the file is not UTF-8 (the
    # conductivity header contains a micro sign) - confirm before changing.
    loaded = pd.read_csv(DATA_PATH, encoding='latin1')
    print('Loaded shape:', loaded.shape)
    display(loaded.head())
    df = loaded
except Exception as load_error:
    # Best-effort by design: report the problem and leave df as None.
    print('Error loading CSV:', load_error)


Loaded shape: (1991, 12)


Unnamed: 0,STATION CODE,LOCATIONS,STATE,Temp,D.O. (mg/l),PH,CONDUCTIVITY (µmhos/cm),B.O.D. (mg/l),NITRATENAN N+ NITRITENANN (mg/l),FECAL COLIFORM (MPN/100ml),TOTAL COLIFORM (MPN/100ml)Mean,year
0,1393,"DAMANGANGA AT D/S OF MADHUBAN, DAMAN",DAMAN & DIU,30.6,6.7,7.5,203,NAN,0.1,11,27,2014
1,1399,ZUARI AT D/S OF PT. WHERE KUMBARJRIA CANAL JOI...,GOA,29.8,5.7,7.2,189,2,0.2,4953,8391,2014
2,1475,ZUARI AT PANCHAWADI,GOA,29.5,6.3,6.9,179,1.7,0.1,3243,5330,2014
3,3181,RIVER ZUARI AT BORIM BRIDGE,GOA,29.7,5.8,6.9,64,3.8,0.5,5382,8443,2014
4,3182,RIVER ZUARI AT MARCAIM JETTY,GOA,29.5,5.8,7.3,83,1.9,0.4,3428,5500,2014


In [43]:
# Cell: clean & rename columns
if df is not None:
    def _header_key(name):
        """Return a lookup key for a column header that ignores case and whitespace layout.

        Runs of whitespace (including stray line endings) collapse to a single
        space and the text is case-folded, so differently padded or cased
        spellings of the same header produce the same key.
        """
        return ' '.join(str(name).split()).casefold()

    # Known raw headers -> short names used by the rest of the notebook.
    # Padding/case variants need no separate entries because both sides of the
    # comparison go through _header_key.
    rename_map = {
        'Temp': 'Temp',
        'D.O. (mg/l)': 'DO',
        'PH': 'pH',
        'CONDUCTIVITY (µmhos/cm)': 'Conductivity',
        'CONDUCTIVITY (µmhos/cm)Mean': 'Conductivity',
        'CONDUCTIVITY (µmhos/cm) Mean': 'Conductivity',
        'B.O.D. (mg/l)': 'BOD',
        'NITRATENAN N+ NITRITENANN': 'Nitrate',
        'NITRATENAN N+ NITRITENANN (mg/l)': 'Nitrate',
        'FECAL COLIFORM (MPN/100ml)': 'FecalColiform',
        'TOTAL COLIFORM (MPN/100ml)Mean': 'TotalColiform',
        'TOTAL COLIFORM (MPN/100ml)': 'TotalColiform',
        'TOTAL COLIFORM (MPN/100ml) Mean': 'TotalColiform',
        'year': 'Year',
        'STATION CODE': 'StationCode',
        'LOCATIONS': 'MonitoringLocation',
        'STATE': 'State'
    }
    header_lookup = {_header_key(raw): short for raw, short in rename_map.items()}

    # Build the actual rename mapping for the columns present in this file.
    col_map = {}
    claimed_targets = set()
    for c in df.columns:
        key = _header_key(c)
        target = header_lookup.get(key)
        # Fallback for other spellings of the combined nitrate/nitrite header.
        if target is None and 'nitratenan' in key and 'nitritenann' in key:
            target = 'Nitrate'
        if target is None:
            continue
        if target in claimed_targets:
            # Two source columns resolving to one name would create duplicate
            # labels and break the per-column numeric coercion below, so the
            # first match wins and later ones are left under their raw name.
            print(f'Skipping column {c!r}: {target!r} is already mapped from another column.')
            continue
        claimed_targets.add(target)
        col_map[c] = target
    df = df.rename(columns=col_map)

    # Coerce numeric columns; unparseable entries (e.g. the 'NAN' text
    # placeholder visible in the raw preview) become NaN.
    numeric_cols = ['Temp','DO','pH','Conductivity','BOD','Nitrate','FecalColiform','TotalColiform']
    absent_cols = [c for c in numeric_cols if c not in df.columns]
    for c in numeric_cols:
        if c in absent_cols:
            df[c] = np.nan
        else:
            df[c] = pd.to_numeric(df[c], errors='coerce')
    if absent_cols:
        # Make the fabricated columns visible instead of failing mysteriously later.
        print('Warning: expected columns not found; created as all-NaN:', absent_cols)

    # Drop rows that have no station code. This filters on StationCode alone:
    # such rows are removed whether or not they carry measurements.
    if 'StationCode' in df.columns:
        df = df[df['StationCode'].notna()]
    df = df.reset_index(drop=True)
    print('After cleaning shape:', df.shape)
    display(df[numeric_cols].head())
else:
    print('Dataset not loaded.')

After cleaning shape: (1991, 12)


Unnamed: 0,Temp,DO,pH,Conductivity,BOD,Nitrate,FecalColiform,TotalColiform
0,30.6,6.7,7.5,203.0,,0.1,11.0,27.0
1,29.8,5.7,7.2,189.0,2.0,0.2,4953.0,8391.0
2,29.5,6.3,6.9,179.0,1.7,0.1,3243.0,5330.0
3,29.7,5.8,6.9,64.0,3.8,0.5,5382.0,8443.0
4,29.5,5.8,7.3,83.0,1.9,0.4,3428.0,5500.0


In [44]:
# Cell: create WHO-based Risk column
if df is None:
    print('Dataset not loaded.')
else:
    def _risk_above(value, high_limit, medium_limit):
        """Grade a reading for which larger values are worse.

        Returns NaN for a missing reading, 'High' when value > high_limit,
        'Medium' when value > medium_limit, and 'Low' otherwise.
        """
        if pd.isna(value):
            return np.nan
        if value > high_limit:
            return 'High'
        return 'Medium' if value > medium_limit else 'Low'

    def ph_risk(v):
        """pH grade: 'Low' within 7.0-8.0, 'Medium' elsewhere in 6.5-8.5, 'High' outside that range."""
        if pd.isna(v):
            return np.nan
        if not (6.5 <= v <= 8.5):
            return 'High'
        return 'Low' if 7.0 <= v <= 8.0 else 'Medium'

    def do_risk(v):
        """Dissolved-oxygen grade (lower is worse): 'High' below 3.0, 'Medium' below 5.0, else 'Low'."""
        if pd.isna(v):
            return np.nan
        if v >= 5.0:
            return 'Low'
        return 'High' if v < 3.0 else 'Medium'

    def bod_risk(v):
        """BOD grade: 'High' above 3.0, 'Medium' above 1.0, else 'Low'."""
        return _risk_above(v, 3.0, 1.0)

    def cond_risk(v):
        """Conductivity grade: 'High' above 1500, 'Medium' above 500, else 'Low'."""
        return _risk_above(v, 1500, 500)

    def nitrate_risk(v):
        """Nitrate grade: 'High' above 45, 'Medium' above 10, else 'Low'."""
        return _risk_above(v, 45, 10)

    def totalcol_risk(v):
        """Total-coliform grade: 'High' above 2500, 'Medium' above 500, else 'Low'."""
        return _risk_above(v, 2500, 500)

    def fecalcol_risk(v):
        """Fecal-coliform grade: 'High' above 500, 'Medium' above 100, else 'Low'."""
        return _risk_above(v, 500, 100)

    # Output column -> (measurement column, grading function), in the order the
    # per-parameter columns are added to the frame.
    risk_rules = {
        'pH_Risk': ('pH', ph_risk),
        'DO_Risk': ('DO', do_risk),
        'BOD_Risk': ('BOD', bod_risk),
        'Cond_Risk': ('Conductivity', cond_risk),
        'Nitrate_Risk': ('Nitrate', nitrate_risk),
        'TotalCol_Risk': ('TotalColiform', totalcol_risk),
        'FecalCol_Risk': ('FecalColiform', fecalcol_risk),
    }
    for risk_col, (measure_col, grade) in risk_rules.items():
        df[risk_col] = df[measure_col].apply(grade)

    def combine_overall(row):
        """Return the worst grade among the row's per-parameter risks, or NaN if none is available."""
        grades = {row[col] for col in risk_rules if pd.notna(row[col])}
        for level in ('High', 'Medium'):
            if level in grades:
                return level
        return 'Low' if grades else np.nan

    df['Risk'] = df.apply(combine_overall, axis=1)
    print('Risk distribution:')
    display(df['Risk'].value_counts(dropna=False))

Risk distribution:


Risk
High      1203
Medium     714
Low         70
NaN          4
Name: count, dtype: int64

In [45]:
# Cell: prepare data for training (remove NaN Risk values)
if df is None or 'Risk' not in df.columns:
    print('Dataset not loaded or Risk column not found. Run previous cells first.')
else:
    # Model inputs, in the column order that is also stored with the saved model.
    features = ['Temp', 'DO', 'pH', 'Conductivity', 'BOD', 'Nitrate', 'FecalColiform', 'TotalColiform']

    # Rows without a Risk label cannot be used for supervised training.
    labelled_rows = df['Risk'].notna()
    df_clean = df.loc[labelled_rows].copy()
    print(f'Original rows: {len(df)}, After removing NaN Risk: {len(df_clean)}')
    print(f'Removed {len(df) - len(df_clean)} rows with NaN Risk values')

    # Feature matrix and target, copied so later edits do not touch df_clean.
    X = df_clean.loc[:, features].copy()
    y = df_clean.loc[:, 'Risk'].copy()

    print(f'\nFinal data shape: X={X.shape}, y={y.shape}')
    print(f'Risk distribution after cleaning:\n{y.value_counts()}')

    # Fill remaining feature gaps with per-column medians; the result is
    # re-wrapped so it keeps the feature names and the original row index.
    imputer = SimpleImputer(strategy='median').fit(X)
    X_imputed = pd.DataFrame(imputer.transform(X), columns=features, index=X.index)

    # Turn the string labels into integer class ids.
    le = LabelEncoder().fit(y)
    y_encoded = le.transform(y)

    print(f'\nEncoded labels: {dict(zip(le.classes_, range(len(le.classes_))))}')
    print(f'Features: {features}')
    print(f'\nX_imputed shape: {X_imputed.shape}')
    print(f'y_encoded shape: {y_encoded.shape}')
    print('\nData is ready for training!')

Original rows: 1991, After removing NaN Risk: 1987
Removed 4 rows with NaN Risk values

Final data shape: X=(1987, 8), y=(1987,)
Risk distribution after cleaning:
Risk
High      1203
Medium     714
Low         70
Name: count, dtype: int64

Encoded labels: {'High': 0, 'Low': 1, 'Medium': 2}
Features: ['Temp', 'DO', 'pH', 'Conductivity', 'BOD', 'Nitrate', 'FecalColiform', 'TotalColiform']

X_imputed shape: (1987, 8)
y_encoded shape: (1987,)

Data is ready for training!


In [46]:
# Cell: train Random Forest
# Every name this cell reads from the preparation cell is checked up front, so
# a skipped cell yields the friendly message instead of a NameError mid-run.
if all(name in globals() for name in ('X', 'y_encoded', 'le', 'features')):
    # Split the un-imputed features first, then learn the imputation medians
    # from the training rows only. Fitting the imputer on all rows before the
    # split (as the preparation cell does for X_imputed) lets test-set
    # statistics leak into training and into the reported scores.
    X_train_raw, X_test_raw, y_train, y_test = train_test_split(
        X, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded
    )
    imputer = SimpleImputer(strategy='median')
    # transform() returns plain arrays; re-wrap them so the forest is trained
    # with feature names, matching the DataFrame passed at prediction time.
    X_train = pd.DataFrame(imputer.fit_transform(X_train_raw), columns=features, index=X_train_raw.index)
    X_test = pd.DataFrame(imputer.transform(X_test_raw), columns=features, index=X_test_raw.index)

    rf = RandomForestClassifier(n_estimators=300, max_depth=12, random_state=42)
    rf.fit(X_train, y_train)
    y_pred = rf.predict(X_test)
    print('Accuracy:', accuracy_score(y_test, y_pred))
    print(classification_report(y_test, y_pred, target_names=le.classes_))

    # Save model + the imputer it was trained with + label encoder + feature
    # order: everything the prediction cell needs to reproduce the pipeline.
    joblib.dump({'model': rf, 'imputer': imputer, 'label_encoder': le, 'features': features}, MODEL_OUTPUT)
    print('Saved model to', MODEL_OUTPUT)
else:
    print('Data not prepared. Run the data preparation cell (Cell 5) first.')

Accuracy: 0.9824120603015075
              precision    recall  f1-score   support

        High       1.00      1.00      1.00       241
         Low       1.00      0.64      0.78        14
      Medium       0.96      0.99      0.98       143

    accuracy                           0.98       398
   macro avg       0.99      0.88      0.92       398
weighted avg       0.98      0.98      0.98       398

Saved model to C:\Users\prana\Documents\PROJECTS\AWARE\extract\rf_water_model.joblib


In [47]:
# Cell: BLANK_INPUT for prediction
# Replace values in BLANK_INPUT with new sample values
# Note: Nitrite removed as it's not in the dataset
BLANK_INPUT = {
    'Temp': 29.5,
    'DO': 5.8,
    'pH': 7.2,
    'Conductivity': 150,
    'BOD': 2.0,
    'Nitrate': 0.5,
    'FecalColiform': 120,
    'TotalColiform': 900
}

if Path(MODEL_OUTPUT).exists():
    # NOTE: joblib.load unpickles the file, which can execute arbitrary code.
    # Only load artifacts written by this notebook's training cell.
    obj = joblib.load(MODEL_OUTPUT)
    model = obj['model']
    imputer = obj['imputer']
    le = obj['label_encoder']
    feat_list = obj['features']

    # The reindex below silently drops keys that are not model features and
    # turns absent features into NaN (which the imputer then fills). Surface
    # both cases so a misspelt key cannot quietly change the prediction.
    unknown_keys = [k for k in BLANK_INPUT if k not in feat_list]
    missing_feats = [f for f in feat_list if f not in BLANK_INPUT]
    if unknown_keys:
        print('Warning: ignoring keys that are not model features:', unknown_keys)
    if missing_feats:
        print('Warning: no value given for', missing_feats, '- the saved imputer will fill them in.')

    x_new = pd.DataFrame([BLANK_INPUT])
    # ensure columns are in the order the model was trained with
    x_new = x_new.reindex(columns=feat_list)
    x_new_imputed = imputer.transform(x_new)
    # Convert back to DataFrame with feature names to match training data format
    x_new_imputed = pd.DataFrame(x_new_imputed, columns=feat_list, index=x_new.index)
    pred = model.predict(x_new_imputed)
    print('Predicted (encoded):', pred)
    print('Predicted Risk:', le.inverse_transform(pred)[0])
else:
    print('Model not found. Run training cell first.')

Predicted (encoded): [2]
Predicted Risk: Medium
