In [4]:
!pip install catboost

Collecting catboost
  Downloading catboost-1.2.8-cp312-cp312-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.8-cp312-cp312-manylinux2014_x86_64.whl (99.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 MB[0m [31m8.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.8


In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, accuracy_score
from imblearn.over_sampling import SMOTE
import lightgbm as lgb
from catboost import CatBoostClassifier, Pool
import joblib
from sklearn.ensemble import RandomForestClassifier

In [6]:
# ================================
# 1. Load Dataset
# ================================
df = pd.read_csv("/content/Historical_data.csv")

# Load saved KMeans model
kmeans = joblib.load("/content/kmeans_model.pkl")
coords = df[['Latitude', 'Longitude']].copy()
df['Region_Cluster'] = kmeans.predict(coords)

# Drop unnecessary index column
df = df.drop(columns=[c for c in df.columns if "Unnamed" in c], errors="ignore")

In [7]:
# ================================
# 2. Clean Numerical Columns
# ================================
# Explicit conversion for Cases/Deaths
for col in ['Cases', 'Deaths']:
    if col in df.columns:
        df[col] = pd.to_numeric(df[col], errors='coerce')

# Force ALL numeric-looking columns to numeric
for col in df.columns:
    if df[col].dtype == 'object':
        df[col] = pd.to_numeric(df[col], errors='ignore')

numeric_cols = df.select_dtypes(include=np.number).columns
df[numeric_cols] = df[numeric_cols].apply(lambda x: pd.to_numeric(x, errors='coerce'))
df[numeric_cols] = df[numeric_cols].fillna(df[numeric_cols].median())

# Fill categorical missing values with mode
cat_cols = df.select_dtypes(exclude=np.number).columns
for col in cat_cols:
    df[col] = df[col].fillna(df[col].mode()[0])

  df[col] = pd.to_numeric(df[col], errors='ignore')


In [8]:
# ================================
# 3. Add Season Feature (if Month exists)
# ================================
def get_season(month):
    if month in [12, 1, 2]:
        return "Winter"
    elif month in [3, 4, 5]:
        return "Summer"
    elif month in [6, 7, 8, 9]:
        return "Monsoon"
    elif month in [10, 11]:
        return "Post-Monsoon"
    return "Unknown"


if 'Month' in df.columns:
    df['Season'] = df['Month'].apply(get_season)

# Encode Season
if 'Season' in df.columns:
    season_encoder = LabelEncoder()
    df['Season'] = season_encoder.fit_transform(df['Season'])
    joblib.dump(season_encoder, "season_label_encoder.pkl")

In [9]:
disease_counts = df['Disease'].value_counts()
disease_counts

Unnamed: 0_level_0,count
Disease,Unnamed: 1_level_1
Acute Diarrhoeal Disease,5126
Dengue,1619
Chikungunya,731
Cholera,666
Malaria,544
Acute Encephalitis Syndrome,111
Acute Gastroenteritis,100
Dengue And Chikungunya,53
Suspected Dengue,14
Dengue And Malaria,3


In [10]:
# ================================
# 4. Encode Target Variable
# ================================
label_encoder = LabelEncoder()
df["Disease"] = label_encoder.fit_transform(df["Disease"].astype(str))

# Save encoder for later use
joblib.dump(label_encoder, "disease_label_encoder.pkl")


['disease_label_encoder.pkl']

In [11]:
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8985 entries, 0 to 8984
Data columns (total 16 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   week_of_outbreak  8985 non-null   object 
 1   state_ut          8985 non-null   object 
 2   District          8985 non-null   object 
 3   Disease           8985 non-null   int64  
 4   Cases             8985 non-null   float64
 5   Deaths            8985 non-null   float64
 6   Day               8985 non-null   int64  
 7   Month             8985 non-null   int64  
 8   Year              8985 non-null   int64  
 9   Latitude          8985 non-null   float64
 10  Longitude         8985 non-null   float64
 11  Precipitation     8985 non-null   float64
 12  LAI               8985 non-null   float64
 13  Temperature       8985 non-null   float64
 14  Region_Cluster    8985 non-null   int32  
 15  Season            8985 non-null   int64  
dtypes: float64(7), int32(1), int64(5), object(

Unnamed: 0,week_of_outbreak,state_ut,District,Disease,Cases,Deaths,Day,Month,Year,Latitude,Longitude,Precipitation,LAI,Temperature,Region_Cluster,Season
0,1st week,Meghalaya,East Jaintia Hills,0,160.0,1.0,2,1,2022,25.251576,92.48405,0.020354,34.5,291.533333,3,3
1,2nd week,Maharashtra,Gadchiroli,15,7.0,2.0,10,1,2022,19.75907,80.162281,0.007479,9.0,299.97,0,3
2,3rd week,Tamil Nadu,Pudukottai,0,8.0,1.0,18,1,2022,10.382651,78.819126,0.107413,12.0,300.766667,8,3
3,3rd week,Gujarat,Patan,0,7.0,1.0,11,1,2022,23.774057,71.683735,0.065094,9.0,299.08,5,3
4,3rd week,Kerala,Ernakulam,0,14.0,1.0,24,12,2021,9.98408,76.274146,0.041256,33.0,303.028,8,3


Feature Selection

In [12]:
# ================================
# 6. Feature Selection
# ================================
X = df.drop(columns=["Disease", "Cases", "Deaths","state_ut","District","Day","Month","Year","week_of_outbreak"])
y = df["Disease"]

# ================================
# 6.5. Drop rare classes (< 60 samples)
# ================================
min_samples = 60
valid_classes = y.value_counts()[y.value_counts() >= min_samples].index
mask = y.isin(valid_classes)

X = X[mask]
y = y[mask]

# ================================
# 7. Train/Test Split
# ================================
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

smote = SMOTE(random_state=42, k_neighbors=3)
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)

Random Forest

In [13]:
# ================================
# 8. Train Model
# ================================
rf_model = RandomForestClassifier(
    n_estimators=200,
    max_depth=15,
    class_weight="balanced",
    random_state=42
)

rf_model.fit(X_train_res, y_train_res)

# ================================
# 9. Evaluation
# ================================
y_pred = rf_model.predict(X_test)
print("✅ Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

# ================================
# 10. Save Model
# ================================
joblib.dump(rf_model, "disease_outbreak_model.pkl")
print("✅ Model saved as disease_outbreak_model.pkl")


✅ Accuracy: 0.5460674157303371

Classification Report:
               precision    recall  f1-score   support

           0       0.81      0.59      0.68      1026
           1       0.27      0.45      0.34        22
           2       0.10      0.20      0.13        20
           3       0.32      0.62      0.42       146
           6       0.27      0.41      0.33       133
           7       0.49      0.48      0.48       324
          15       0.32      0.45      0.38       109

    accuracy                           0.55      1780
   macro avg       0.37      0.46      0.40      1780
weighted avg       0.63      0.55      0.57      1780

✅ Model saved as disease_outbreak_model.pkl


In [14]:
from google.colab import files
files.download("/content/disease_outbreak_model.pkl")  # replace with your filename


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Pipeline

In [15]:
import joblib
import numpy as np
import pandas as pd

# -----------------------------
# Load Models & Encoders
# -----------------------------
outbreak_model = joblib.load("/content/disease_outbreak_model.pkl")
kmeans = joblib.load("kmeans_model.pkl")
season_encoder = joblib.load("season_label_encoder.pkl")
disease_encoder = joblib.load("disease_label_encoder.pkl")

# -----------------------------
# Season Mapping Function
# -----------------------------
def get_season(month):
    if month in [12, 1, 2]:
        return "Winter"
    elif month in [3, 4, 5]:
        return "Summer"
    elif month in [6, 7, 8, 9]:
        return "Monsoon"
    elif month in [10, 11]:
        return "Post-Monsoon"
    return "Unknown"

# -----------------------------
# Preprocessing Function
# -----------------------------
def preprocess_outbreak_input(user_input: dict):
    """
    Convert raw outbreak input (lat, lon, month, etc.)
    into processed features for the outbreak model.
    """

    processed = user_input.copy()

    # 1. Assign Region_Cluster using KMeans
    if "Latitude" in user_input and "Longitude" in user_input:
        coords = pd.DataFrame(
            [[user_input["Latitude"], user_input["Longitude"]]],
            columns=["Latitude", "Longitude"]
        )
        processed["Region_Cluster"] = int(kmeans.predict(coords)[0])
    else:
        processed["Region_Cluster"] = 0

    # 2. Encode Season from Month
    if "Month" in user_input:
        season_str = get_season(user_input["Month"])
        if season_str in season_encoder.classes_:
            processed["Season"] = season_encoder.transform([season_str])[0]
        else:
            processed["Season"] = -1  # unseen fallback
    else:
        processed["Season"] = -1

    return processed

# -----------------------------
# Prediction Function
# -----------------------------
def predict_outbreak(user_input: dict):
    """
    Predict outbreak probabilities for given user input.
    """
    processed_input = preprocess_outbreak_input(user_input)

    # Build feature vector in correct order
    feature_order = outbreak_model.feature_names_in_
    feature_vector = np.array([[processed_input.get(f, 0) for f in feature_order]])

    # Predict probabilities
    probs = outbreak_model.predict_proba(feature_vector)[0]
    classes = disease_encoder.classes_

    return dict(zip(classes, probs))

# -----------------------------
# Example Usage
# -----------------------------
sample_input = {
    "Latitude": 10.123,
    "Longitude": 76.456,
    "Month": 7,
    "Temp": 29,
    "preci": 0.15,
    "LAI": 25,
    "Cases": 50,
    "Deaths": 2
}

result = predict_outbreak(sample_input)
print("Predicted Outbreak Probabilities:")
for i in result:
  print(i,result[i])

Predicted Outbreak Probabilities:
Acute Diarrhoeal Disease 0.3011111111111111
Acute Encephalitis Syndrome 0.0
Acute Gastroenteritis 0.02
Chikungunya 0.10400584795321638
Chikungunya/ Dengue 0.16409356725146199
Chikungunya/Dengue 0.3907894736842106
Cholera 0.02


