In [4]:
!pip install catboost



PATHOGEN PREDICTION

In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, accuracy_score
from catboost import CatBoostClassifier
import joblib

df_pathogen = pd.read_csv("/content/pathogen_dataset.csv")

# ================================
# 2. Preprocessing
# ================================
# Drop non-feature columns
drop_cols = ["Sample_ID", "Pathogen_Type"]
feature_cols = [col for col in df_pathogen.columns if col not in drop_cols + ["Pathogen_Name"]]

X = df_pathogen[feature_cols].copy()
y = df_pathogen["Pathogen_Name"].astype(str)

# Convert numeric safely
X = X.apply(pd.to_numeric, errors="coerce")
X = X.fillna(X.median())

# Encode target
pathogen_encoder = LabelEncoder()
y_encoded = pathogen_encoder.fit_transform(y)
joblib.dump(pathogen_encoder, "pathogen_label_encoder.pkl")

# ================================
# 3. Train/Test Split
# ================================
X_train, X_test, y_train, y_test = train_test_split(
    X, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded
)

# ================================
# 4. Train CatBoost Model
# ================================
cat_model = CatBoostClassifier(
    iterations=1000,
    depth=8,
    learning_rate=0.05,
    loss_function="MultiClass",
    eval_metric="TotalF1",
    random_seed=42,
    verbose=200
)

cat_model.fit(X_train, y_train, eval_set=(X_test, y_test), use_best_model=True)

# ================================
# 5. Evaluation
# ================================
# y_pred = cat_model.predict(X_test)

# print("✅ Accuracy:", accuracy_score(y_test, y_pred))
# print("\nClassification Report:\n", classification_report(y_test, y_pred, target_names=pathogen_encoder.classes_))
# Get class probabilities
y_proba = cat_model.predict_proba(X_test)

# Example: first sample probabilities
print("Probabilities:", y_proba[0])

# Map back to pathogen names
pathogen_classes = pathogen_encoder.classes_
proba_dict = dict(zip(pathogen_classes, y_proba[0]))
print("Pathogen Probabilities:", proba_dict)


# ================================
# 6. Save Model
# ================================
cat_model.save_model("pathogen_prediction_model.cbm")
print("✅ Pathogen Prediction Model saved as pathogen_prediction_model.cbm")


0:	learn: 0.9347955	test: 0.9590283	best: 0.9590283 (0)	total: 19.3ms	remaining: 19.2s
200:	learn: 0.9598940	test: 0.9590283	best: 0.9590283 (0)	total: 25.1s	remaining: 1m 39s
400:	learn: 1.0000000	test: 0.9590283	best: 0.9590283 (0)	total: 37.9s	remaining: 56.5s
600:	learn: 1.0000000	test: 0.9489651	best: 0.9590283 (0)	total: 50.8s	remaining: 33.7s
800:	learn: 1.0000000	test: 0.9489651	best: 0.9590283 (0)	total: 1m 3s	remaining: 15.8s
999:	learn: 1.0000000	test: 0.9489651	best: 0.9590283 (0)	total: 1m 16s	remaining: 0us

bestTest = 0.9590283401
bestIteration = 0

Shrink model to first 1 iterations.
Probabilities: [0.1197628  0.1197628  0.12122331 0.1197628  0.1197628  0.1197628
 0.16019987 0.1197628 ]
Pathogen Probabilities: {'Clean Water': np.float64(0.11976280326892626), 'Dengue Virus': np.float64(0.11976280326892626), 'E. coli O157:H7': np.float64(0.12122330729101641), 'Norovirus': np.float64(0.11976280326892626), 'Plasmodium falciparum': np.float64(0.11976280326892626), 'Salmonell

Pathogen Model (LightGBM)

In [6]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import (
    train_test_split, RandomizedSearchCV
)
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import (
    accuracy_score, classification_report
)
from lightgbm import LGBMClassifier
import joblib
import warnings

warnings.filterwarnings('ignore')
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)

def load_data(path='/content/pathogen_dataset.csv'):
    df = pd.read_csv(path)
    df.rename(columns={
        'Temperature (°C)':            'temp',
        'pH':                          'ph',
        'BOD (mg/L)':                   'bod',
        'Dissolved_Oxygen (mg/L)':      'do',
        'ORP (mV)':                     'orp',
        'Salinity (‰)':                 'salinity',
        'Turbidity (NTU)':              'turbidity',
        'Total_Coliform (MPN/100mL)':   'Total_Coliform',
        'Fecal_Coliform (MPN/100mL)':   'Fecal_Coliform',
        'Severity_Factor (1-10)':       'severity',
        'Contagiousness_Factor (1-10)': 'contagiousness',
        'Outbreak_Factor (1-10)':       'outbreak'
    }, inplace=True)
    numeric_cols = ['temp','ph','bod','do','orp','salinity','turbidity',
                    'Total_Coliform','Fecal_Coliform',
                    'severity','contagiousness','outbreak']
    df[numeric_cols] = df[numeric_cols].astype(float)
    return df

def engineer_features(df):
    df['coliform_ratio'] = df['Fecal_Coliform'] / (df['Total_Coliform'] + 1)
    df['od_ratio']       = df['bod'] / (df['do'] + 0.1)
    df['sal_temp']       = df['salinity'] * df['temp']
    df['ph_turb']        = df['ph'] * df['turbidity']
    return df

def prepare_data(df):
    features = [
        'temp','ph','bod','do','orp','salinity','turbidity',
        'Total_Coliform','Fecal_Coliform',
        'coliform_ratio','od_ratio','sal_temp','ph_turb',
        'severity','contagiousness','outbreak'
    ]
    # Remove the following line:
    # df['type_enc'] = LabelEncoder().fit_transform(df['Pathogen_Type'])

    # And remove the following line:
    # features += ['type_enc']

    X = df[features].fillna(0)
    target_encoder = LabelEncoder()
    y = target_encoder.fit_transform(df['Pathogen_Name'])

    (X_train, X_test,
     y_train, y_test) = train_test_split(
        X, y,
        test_size=0.2,
        random_state=RANDOM_STATE,
        stratify=y
    )
    return (X_train, X_test, y_train, y_test), target_encoder

def tune_lightgbm(X_train, y_train):
    param_dist = {
        'n_estimators':    [100,200,300],
        'max_depth':       [6,8,10],
        'learning_rate':   [0.05,0.1,0.2],
        'num_leaves':      [31,50,70],
        'subsample':       [0.8,0.9,1.0],
        'colsample_bytree':[0.8,0.9,1.0]
    }
    model = LGBMClassifier(
        objective='multiclass',
        random_state=RANDOM_STATE,
        n_jobs=-1,
        verbose=-1
    )
    rs = RandomizedSearchCV(
        model,
        param_distributions=param_dist,
        n_iter=30,
        cv=5,
        scoring='accuracy',
        n_jobs=-1,
        random_state=RANDOM_STATE,
        verbose=1
    )
    rs.fit(X_train, y_train)
    return rs.best_estimator_, rs.best_params_, rs.best_score_

def evaluate(model, X_train, X_test, y_train, y_test, target_encoder):
    y_tr = model.predict(X_train)
    y_te = model.predict(X_test)
    y_te_proba = model.predict_proba(X_test)
    pred_names = target_encoder.inverse_transform(y_te)

    print("Train Accuracy:", accuracy_score(y_train, y_tr))
    print("Test Accuracy: ", accuracy_score(y_test, y_te))
    print("\nClassification Report:\n", classification_report(y_test, y_te))

    print("\n🔍 SAMPLE PREDICTIONS:")
    for i in range(min(5, len(y_te))):
        conf = np.max(y_te_proba[i])
        print(f"Sample {i+1}: Predicted = {pred_names[i]}, Confidence = {conf:.3f}")

    return y_te, y_te_proba, pred_names

def main():
    df = load_data()
    df = engineer_features(df)
    (X_train, X_test, y_train, y_test), target_encoder = prepare_data(df)

    print("Tuning LightGBM…")
    best_lgbm, best_params, best_cv = tune_lightgbm(X_train, y_train)
    print("Best CV:", best_cv)
    print("Best Params:", best_params)

    print("\nEvaluating tuned model:")
    # Correctly capture the return values from the evaluate function
    y_pred, y_pred_proba, predicted_pathogens = evaluate(best_lgbm, X_train, X_test, y_train, y_test, target_encoder)

    # Save model
    joblib.dump(best_lgbm, 'pathogen_model.pkl')
    print("\nModel saved to pathogen_model.pkl")

if __name__ == "__main__":
    main()


Tuning LightGBM…
Fitting 5 folds for each of 30 candidates, totalling 150 fits
Best CV: 0.915
Best Params: {'subsample': 1.0, 'num_leaves': 50, 'n_estimators': 300, 'max_depth': 6, 'learning_rate': 0.05, 'colsample_bytree': 0.9}

Evaluating tuned model:
Train Accuracy: 1.0
Test Accuracy:  0.93

Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00         3
           1       1.00      1.00      1.00         6
           2       1.00      0.93      0.96        14
           3       0.85      0.73      0.79        15
           4       1.00      0.86      0.92         7
           5       0.80      0.94      0.86        17
           6       0.95      1.00      0.98        20
           7       1.00      1.00      1.00        18

    accuracy                           0.93       100
   macro avg       0.95      0.93      0.94       100
weighted avg       0.93      0.93      0.93       100


🔍 SAMPLE PREDICTIONS:
Sample

Pathogen to Disease Mapping

In [7]:
# Pathogen to Disease Mapping
pathogen_to_disease = {
    "Vibrio cholerae": "Cholera",
    "Salmonella typhi": "Typhoid",
    "E. coli O157:H7": "Acute Diarrheal Disease",
    "Shigella": "Dysentery",
    "Norovirus": "Acute Gastroenteritis",
    "Plasmodium falciparum": "Malaria",
    "Dengue Virus": "Dengue Fever",
    "Clean Water": "No Disease"
}


In [8]:
def map_pathogen_to_disease(pathogen_probs, pathogen_to_disease):
    disease_probs = {}

    for pathogen, prob in pathogen_probs.items():
        disease = pathogen_to_disease.get(pathogen, None)
        if disease:
            if disease not in disease_probs:
                disease_probs[disease] = 0
            disease_probs[disease] = max(disease_probs[disease], prob)
    return disease_probs

# Example usage
pathogen_probs = {
    "Vibrio cholerae": 0.65,
    "Salmonella typhi": 0.20,
    "E. coli O157:H7": 0.10,
    "Shigella": 0.05
}

disease_probs = map_pathogen_to_disease(pathogen_probs, pathogen_to_disease)
print(disease_probs)


{'Cholera': 0.65, 'Typhoid': 0.2, 'Acute Diarrheal Disease': 0.1, 'Dysentery': 0.05}


Final Fusion

In [11]:
import joblib
import pickle
import numpy as np
import pandas as pd

# -----------------------------
# Load Models & Encoders
# -----------------------------
# Pathogen model (LightGBM)
pathogen_model = joblib.load("pathogen_model.pkl")
pathogen_encoder = joblib.load("pathogen_label_encoder.pkl")

# WQI model + encoders
wqi_model = joblib.load("water_quality_model.pkl")
kmeans = joblib.load("kmeans_model.pkl")
with open("label_encoders_wqi.pkl", "rb") as f:
    wqi_encoders = pickle.load(f)

# Outbreak model
outbreak_model = joblib.load("disease_outbreak_model.pkl")
season_encoder = joblib.load("season_label_encoder.pkl")
disease_encoder = joblib.load("disease_label_encoder.pkl")

# -----------------------------
# Constants
# -----------------------------
weights = {
    'BOD(mg/L) Max': 20,
    'BOD(mg/L) Min': 12,
    'Dissolved Oxygen(mg/L) Min': 10,
    'pH Min': 8,
    'pH Max': 7,
    'Dissolved Oxygen(mg/L) Max': 7,
    'Conductivity(µmho/cm) Min': 6,
    'Conductivity(µmho/cm) Max': 6,
    'Fecal Coliform(MPN/100ml) Max': 6,
    'Fecal Coliform(MPN/100ml) Min': 5,
    'Total Coliform(MPN/100ml) Max': 5,
    'Total Coliform(MPN/100ml) Min': 4,
    'Fecal Streptococci(MPN/100ml) Max': 4,
    'Fecal Streptococci(MPN/100ml) Min': 3,
    'Nitrate N(mg/L) Max': 5,
    'Nitrate N(mg/L) Min': 4,
    'Temperature (°C) Max': 5,
    'Temperature (°C) Min': 4,
    'Avg_Temp': 4,
    'Avg_Humidity': 3,
    'Avg_WindRegion_Cluster': 6,
    'Region_Cluster': 7,
    'Type': 7,
    'State': 3
}
total_weight = sum(weights.values())
normalized_weights = {k: v/total_weight for k, v in weights.items()}

pathogen_to_disease = {
    "Vibrio cholerae": "Cholera",
    "Salmonella typhi": "Typhoid",
    "E. coli O157:H7": "Acute Diarrheal Disease",
    "Shigella": "Dysentery",
    "Norovirus": "Acute Gastroenteritis",
    "Plasmodium falciparum": "Malaria",
    "Dengue Virus": "Dengue Fever",
    "Clean Water": "No Disease"
}

# -----------------------------
# Helper Functions
# -----------------------------
def get_season(month):
    if month in [12, 1, 2]:
        return "Winter"
    elif month in [3, 4, 5]:
        return "Summer"
    elif month in [6, 7, 8, 9]:
        return "Monsoon"
    elif month in [10, 11]:
        return "Post-Monsoon"
    return "Unknown"

def preprocess_wqi_input(user_input: dict):
    processed = {}

    # Region_Cluster assignment
    if "Latitude" in user_input and "Longitude" in user_input:
        coords = pd.DataFrame([[user_input["Latitude"], user_input["Longitude"]]],
                              columns=["Latitude", "Longitude"])
        user_input["Region_Cluster"] = int(kmeans.predict(coords)[0])
    else:
        user_input["Region_Cluster"] = 0

    # Encode categoricals with safe fallback
    for col, le in wqi_encoders.items():
        if col in user_input:
            val = str(user_input[col])
            if val in le.classes_:
                processed[col] = le.transform([val])[0]
            else:
                processed[col] = -1
        else:
            processed[col] = 0

    # Apply weights
    for attr, weight in normalized_weights.items():
        if attr in processed:
            val = processed[attr]   # encoded categorical
        else:
            val = user_input.get(attr, 0)
            try: val = float(val)
            except: val = 0
        processed[f"{attr}_weighted"] = val * weight

    return processed

def predict_wqi(user_input: dict):
    processed_input = preprocess_wqi_input(user_input)
    ordered_features = [f for f in wqi_model.feature_names_in_]
    feature_vector = np.array([[processed_input.get(f, 0) for f in ordered_features]])
    return wqi_model.predict(feature_vector)[0]

def preprocess_outbreak_input(user_input: dict):
    processed = user_input.copy()
    # Region cluster
    if "Latitude" in user_input and "Longitude" in user_input:
        coords = pd.DataFrame([[user_input["Latitude"], user_input["Longitude"]]],
                              columns=["Latitude", "Longitude"])
        processed["Region_Cluster"] = int(kmeans.predict(coords)[0])
    else:
        processed["Region_Cluster"] = 0
    # Season encoding
    if "Month" in user_input:
        season_str = get_season(user_input["Month"])
        if season_str in season_encoder.classes_:
            processed["Season"] = season_encoder.transform([season_str])[0]
        else:
            processed["Season"] = -1
    else:
        processed["Season"] = -1
    return processed

# -----------------------------
# Fusion Function
# -----------------------------
def predict_disease_outbreak(user_input):
    # --- Pathogen model ---
    feature_order = pathogen_model.feature_name_
    # engineer features if missing
    if "coliform_ratio" not in user_input:
        user_input["coliform_ratio"] = user_input["Fecal_Coliform"] / (user_input["Total_Coliform"] + 1)
    if "od_ratio" not in user_input:
        user_input["od_ratio"] = user_input["BOD (mg/L)"] / (user_input["Dissolved_Oxygen (mg/L)"] + 0.1)
    if "sal_temp" not in user_input:
        user_input["sal_temp"] = user_input["Salinity (‰)"] * user_input["Temperature (°C)"]
    if "ph_turb" not in user_input:
        user_input["ph_turb"] = user_input["pH"] * user_input["Turbidity (NTU)"]
    pathogen_X = np.array([[user_input.get(f, 0) for f in feature_order]])
    pathogen_probs = pathogen_model.predict_proba(pathogen_X)[0]
    pathogen_classes = pathogen_encoder.classes_
    pathogen_dict = dict(zip(pathogen_classes, pathogen_probs))

    disease_probs_from_pathogen = {}
    for pathogen, prob in pathogen_dict.items():
        disease = pathogen_to_disease.get(pathogen, None)
        if disease:
            disease_probs_from_pathogen[disease] = max(disease_probs_from_pathogen.get(disease, 0), prob)

    # --- WQI model ---
    wqi_value = predict_wqi(user_input)
    polluted_flag = int(wqi_value < 50)

    # --- Outbreak model ---
    processed_outbreak = preprocess_outbreak_input(user_input)
    outbreak_features = np.array([[processed_outbreak.get(f, 0) for f in outbreak_model.feature_names_in_]])
    outbreak_probs = outbreak_model.predict_proba(outbreak_features)[0]
    outbreak_classes = disease_encoder.classes_
    outbreak_dict = dict(zip(outbreak_classes, outbreak_probs))

    # --- Fusion ---
    final_probs = {}
    for disease in set(list(disease_probs_from_pathogen.keys()) + list(outbreak_dict.keys())):
        final_probs[disease] = (
            0.5 * disease_probs_from_pathogen.get(disease, 0) +
            0.3 * outbreak_dict.get(disease, 0) +
            0.2 * polluted_flag
        )

    return {
        "WQI": wqi_value,
        "Polluted": bool(polluted_flag),
        "Final Disease Probabilities": final_probs
    }

# -----------------------------
# Example Usage
# -----------------------------
sample_input = {
    "Latitude": 10.123,
    "Longitude": 76.456,
    "Month": 7,
    "Temperature (°C)": 28.5,
    "pH": 6.8,
    "BOD (mg/L)": 3.1,
    "Dissolved_Oxygen (mg/L)": 5.2,
    "ORP (mV)": 100,
    "Salinity (‰)": 1.2,
    "Turbidity (NTU)": 10,
    "Total_Coliform": 500,
    "Fecal_Coliform": 200,
    "LAI": 25,
    "preci": 0.05,
    "Avg_Temp": 27,
    "Avg_Humidity": 70,
    "Avg_WindRegion_Cluster": 2,
    "Type": "River",
    "State": "Kerala"
}

result = predict_disease_outbreak(sample_input)
for i in result:
  if i=="Final Disease Probabilities":
    for j in result[i]:
      print(j,":",result[i][j])
  else:
    print(i,":",result[i],"\n")



WQI : 22.06758836712114 

Polluted : True 

Cholera : 0.20600804582280727
No Disease : 0.6388680658670449
Acute Diarrhoeal Disease : 0.29033333333333333
Acute Encephalitis Syndrome : 0.2
Acute Diarrheal Disease : 0.25930601483623616
Dysentery : 0.20181201664279858
Malaria : 0.20000175419039026
Chikungunya/ Dengue : 0.2492280701754386
Dengue Fever : 0.20000279728655046
Typhoid : 0.20000093527692392
Chikungunya : 0.2312017543859649
Chikungunya/Dengue : 0.31723684210526315
Acute Gastroenteritis : 0.2060003700772486
