# Random Forest classifier for Project AWARE

This notebook follows the SRS and these steps:

1. load data
2. select features
3. clean up data
4. split data into train and test
5. create and train a Random Forest model
6. evaluate and produce predictions (Low/Medium/High)

The notebook also includes a blank input cell where you can enter feature values and get a prediction.


In [1]:
# Imports and settings
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
import joblib

In [2]:
# Paths - set DATA_PATH to your CSV file path
# NOTE: this is an absolute, machine-specific path; change it when running elsewhere.
DATA_PATH = r"""C:\Users\prana\Documents\PROJECTS\AWARE\extract\WQ_combined_clean.csv"""
# File the training cell writes the fitted model bundle to and the prediction cell reads back.
MODEL_OUTPUT = 'rf_aware_model.joblib'
TARGET_COLUMN = None  # if you know the target column (e.g., 'Risk'), set it here

# Echo the configured value itself (not a second copy of the literal) so the printed
# path can never drift from what the loading cell actually uses.
print('DATA_PATH =', DATA_PATH)

DATA_PATH = C:\Users\prana\Documents\PROJECTS\AWARE\extract\WQ_combined_clean.csv


In [None]:
# 1) Load data
# Only the read itself is guarded. A failure while previewing the frame must not be
# reported as a CSV-loading problem, nor discard a dataset that was loaded successfully.
try:
    df = pd.read_csv(DATA_PATH)
except Exception as e:
    # Best-effort: keep the notebook runnable; downstream cells check `df is None`.
    print('Error loading CSV. Please check DATA_PATH. Error:', e)
    df = None
else:
    print('Loaded dataset with shape:', df.shape)
    display(df.head())


In [None]:
# 2) Select features
# Resolves `target_col` (explicit TARGET_COLUMN, else a conventionally named label column)
# and builds `feature_cols` from every numeric column except the target.
if df is None:
    print('Dataset not loaded; set DATA_PATH and re-run.')
else:
    target_col = None
    if TARGET_COLUMN:
        # An explicit setting always wins over auto-detection.
        target_col = TARGET_COLUMN
        if target_col not in df.columns:
            # A misspelled name would otherwise pass silently and only surface later
            # as a generic "no target" message.
            print('Warning: TARGET_COLUMN', repr(target_col),
                  'is not a column of the dataset, so no target will be used. Available columns:',
                  list(df.columns))
    else:
        # Auto-detect target column (case-insensitive match on common label names)
        label_names = {'risk', 'risk_level', 'label'}
        possible = [c for c in df.columns if c.lower() in label_names]
        if possible:
            target_col = possible[0]
        else:
            # No label column: point out columns that could serve as a proxy target,
            # but do not pick one automatically.
            proxies = ['cases','case_count','diarrhea_cases','diarrhoea_cases','turbidity','ph']
            found = [c for c in df.columns if c.lower() in proxies]
            if found:
                print('No explicit target column found. You have columns that might be proxies:', found)
                print('You should set TARGET_COLUMN manually. For now notebook will proceed but will NOT train a classifier until a target is specified.')
    print('TARGET_COLUMN =', target_col)

    # Choose numeric features by default (exclude target)
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    if target_col and target_col in df.columns:
        feature_cols = [c for c in numeric_cols if c != target_col]
    else:
        feature_cols = list(numeric_cols)
    print('Selected feature columns (numeric):', feature_cols)


In [None]:
# 3) Cleanup data
# Produces `X_processed` (median-imputed numeric features) and `y_processed`
# (integer-encoded target, or None when no usable target column is configured).
if df is not None:
    # Reset the encoder on every run so a stale `le` from an earlier execution is never
    # saved alongside a model whose target needed no encoding.
    le = None

    X = df[feature_cols].copy()
    # A column with no observed values has no median; SimpleImputer discards such columns
    # by default, which would make the imputed array narrower than `X.columns`.
    empty_cols = [c for c in X.columns if X[c].isna().all()]
    if empty_cols:
        print('Dropping feature columns with no observed values:', empty_cols)
        X = X.drop(columns=empty_cols)

    # Impute numeric missing values with median
    # NOTE(review): the imputer is fitted on all rows, before the train/test split.
    imputer = SimpleImputer(strategy='median')
    X_imputed = pd.DataFrame(imputer.fit_transform(X), columns=X.columns)
    print('Missing values per column after imputation:')
    print(X_imputed.isna().sum())

    # Handle target encoding if present
    y = None
    if 'target_col' in globals() and target_col and target_col in df.columns:
        y = df[target_col].copy()

        # Rows without a target cannot be used for supervised training; encoding them
        # via astype(str) would otherwise invent a bogus 'nan' class.
        labelled = y.notna().to_numpy()
        if not labelled.all():
            print('Dropping', int((~labelled).sum()), 'rows with a missing target value.')
            X_imputed = X_imputed.loc[labelled].reset_index(drop=True)
            y = y[labelled].reset_index(drop=True)

        # Text-like targets (object, categorical, or pandas' dedicated string dtype)
        # are label-encoded directly.
        is_text_like = (
            y.dtype == object
            or y.dtype.name == 'category'
            or pd.api.types.is_string_dtype(y)
        )
        if is_text_like:
            le = LabelEncoder()
            y = le.fit_transform(y.astype(str))
            print('Encoded target classes:', list(le.classes_))
        else:
            # if numeric continuous target, try to bin into Low/Medium/High
            if y.nunique()>3:
                # Tercile binning; pd.qcut raises if the quantile edges are not unique
                # (e.g. a heavily repeated value), in which case choose bins manually.
                y_binned = pd.qcut(y, q=3, labels=['Low','Medium','High'])
                le = LabelEncoder()
                y = le.fit_transform(y_binned.astype(str))
                print('Binned continuous target into classes:', list(le.classes_))
            else:
                print('Numeric target with few unique values; used as-is.')
    else:
        print('No target found yet. Set TARGET_COLUMN and re-run if you want to train.')

    X_processed = X_imputed
    y_processed = y
else:
    print('No dataframe loaded; cannot clean data.')


In [None]:
# 4) Split and 5) Train Random Forest
# Splits 80/20, fits the forest, reports test metrics and saves everything needed for
# later prediction into a single joblib bundle.
if df is not None and y_processed is not None:
    # Stratified splitting needs at least two samples of every class; fall back to a
    # plain random split (with a notice) instead of failing when a class is too rare.
    _, class_counts = np.unique(y_processed, return_counts=True)
    stratify_on = y_processed if class_counts.min() >= 2 else None
    if stratify_on is None:
        print('Warning: at least one class has fewer than 2 samples; splitting without stratification.')

    X_train, X_test, y_train, y_test = train_test_split(
        X_processed, y_processed, test_size=0.2, random_state=42, stratify=stratify_on
    )
    print('Train shape:', X_train.shape, 'Test shape:', X_test.shape)

    # Train Random Forest
    rf = RandomForestClassifier(n_estimators=200, max_depth=None, random_state=42)
    rf.fit(X_train, y_train)
    print('Model trained.')

    # Evaluate
    y_pred = rf.predict(X_test)
    print('Accuracy:', accuracy_score(y_test, y_pred))
    print('\nClassification report:')
    print(classification_report(y_test, y_pred))

    # Save model and imputer and label encoder (if exists).
    # The feature column order is stored too, so the bundle can be used for prediction
    # without any in-memory state from this notebook session.
    joblib.dump(
        {
            'model': rf,
            'imputer': imputer,
            'label_encoder': globals().get('le', None),
            'feature_columns': list(X_processed.columns),
        },
        MODEL_OUTPUT,
    )
    print('Saved model object to', MODEL_OUTPUT)
else:
    print('Either dataset not loaded or target not found. If you want to train, set TARGET_COLUMN to the name of the target column (e.g., "Risk") and re-run.')


In [None]:
# 6) Prediction on blank input
# This cell provides a BLANK_INPUT where you should fill a dictionary with feature values (numeric) and run the cell to get a prediction.

BLANK_INPUT = {
    # Example: 'turbidity': 5.2, 'ph': 7.1, 'case_count': 2
    # <-- Replace keys/values with the exact feature columns listed earlier.
}

if BLANK_INPUT:
    # Load the bundle written by the training cell.
    # NOTE: joblib files are pickle-based; only load files you created or trust.
    try:
        obj = joblib.load(MODEL_OUTPUT)
    except FileNotFoundError:
        obj = None
        print('Model file not found:', MODEL_OUTPUT, '- run the training cell first.')

    if obj is not None:
        model = obj['model']
        imputer = obj['imputer']
        le = obj.get('label_encoder', None)

        # Determine the feature columns (and their order) from the saved artefacts so the
        # cell also works on a fresh kernel; the in-memory frame is only a last resort.
        expected_cols = obj.get('feature_columns')
        if expected_cols is None:
            expected_cols = getattr(imputer, 'feature_names_in_', None)
        if expected_cols is None:
            expected_cols = X_processed.columns
        expected_cols = list(expected_cols)

        # Surface key mismatches: unknown keys are ignored and absent features are
        # filled with the training median, which is easy to miss after a typo.
        unknown_keys = [k for k in BLANK_INPUT if k not in expected_cols]
        missing_cols = [c for c in expected_cols if c not in BLANK_INPUT]
        if unknown_keys:
            print('Warning: ignoring keys that are not model features:', unknown_keys)
        if missing_cols:
            print('Warning: features not provided (imputed with training median):', missing_cols)

        # Create DataFrame from BLANK_INPUT and align it to the training column order
        x_new = pd.DataFrame([BLANK_INPUT]).reindex(columns=expected_cols)
        # Impute
        x_new_imputed = pd.DataFrame(imputer.transform(x_new), columns=x_new.columns)
        pred = model.predict(x_new_imputed)
        if le is not None:
            try:
                pred_label = le.inverse_transform(pred)
            except Exception as err:
                # Best-effort: fall back to the encoded value, but say why.
                print('Could not decode prediction with the saved label encoder:', err)
                pred_label = pred
        else:
            pred_label = pred
        print('Prediction (encoded):', pred)
        print('Prediction (label):', pred_label)
else:
    print('BLANK_INPUT is empty. Fill BLANK_INPUT with feature values and re-run this cell to get a prediction.')


## Next steps / Notes

- Open this notebook on your machine (Jupyter / JupyterLab / VS Code) and set `DATA_PATH` if it's different.
- If the dataset already contains a `Risk` (Low/Medium/High) column, set `TARGET_COLUMN='Risk'` to train directly.
- If target isn't present, decide which column to use as a proxy (e.g., case counts or an existing risk score) and set `TARGET_COLUMN`.
- Customize feature selection if you want to include categorical columns (the current notebook uses only numeric features by default).

If you want, I can also: 
- return a Python script (.py) instead, or
- run the training here and show evaluation results (if you upload the CSV to this environment).