In [1]:
## importing necessary libraries 

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

%matplotlib inline
import warnings
warnings.filterwarnings("ignore")


  from pandas.core import (


In [2]:
df = pd.read_csv("cleaned_dengue_data.csv")  ## data loaded

In [3]:
df.head()

Unnamed: 0,Patient_ID,Age,Gender,Ward,Date_of_diagnosis,Temperature,Humidity,Rainfall,Symptoms,Platelet_Count,Diagnosis,Outcome,Month,Year,Age_group
0,P00002,48,F,Mehdipatnam,2022-06-21,25.7,82.8,45.0,"Fever, joint pain",110619,Dengue+,Recovered,6,2022,Adult
1,P00003,19,M,Serilingampally,2022-04-01,29.8,63.6,0.0,"Fever, retro-orbital pain",62974,Dengue+,Hospitalized,4,2022,Young Adult
2,P00006,33,M,Mehdipatnam,2023-07-26,23.8,92.9,10.0,"Fever, retro-orbital pain",78755,Dengue+,Hospitalized,7,2023,Young Adult
3,P00010,49,F,Kukatpally,2022-08-20,26.0,79.1,1.0,"Fever, bleeding",20787,Dengue+,Critical,8,2022,Adult
4,P00011,44,F,Musheerabad,2024-07-14,25.0,82.9,60.0,"Fever, muscle pain",134421,Dengue+,Recovered,7,2024,Adult


In [4]:
df.shape

(7971, 15)

In [5]:
## so here there is no use of patient_id and data_of_diagnosis because we have already created year and month



### dropping of unnecessary columns

In [6]:
df.drop(columns=["Patient_ID","Date_of_diagnosis"],inplace=True)

In [7]:
df.shape

(7971, 13)

In [8]:
df.head()

Unnamed: 0,Age,Gender,Ward,Temperature,Humidity,Rainfall,Symptoms,Platelet_Count,Diagnosis,Outcome,Month,Year,Age_group
0,48,F,Mehdipatnam,25.7,82.8,45.0,"Fever, joint pain",110619,Dengue+,Recovered,6,2022,Adult
1,19,M,Serilingampally,29.8,63.6,0.0,"Fever, retro-orbital pain",62974,Dengue+,Hospitalized,4,2022,Young Adult
2,33,M,Mehdipatnam,23.8,92.9,10.0,"Fever, retro-orbital pain",78755,Dengue+,Hospitalized,7,2023,Young Adult
3,49,F,Kukatpally,26.0,79.1,1.0,"Fever, bleeding",20787,Dengue+,Critical,8,2022,Adult
4,44,F,Musheerabad,25.0,82.9,60.0,"Fever, muscle pain",134421,Dengue+,Recovered,7,2024,Adult


## Encoding

In [9]:
le = LabelEncoder()   


In [10]:
df['Gender'] = le.fit_transform(df['Gender'])  ## binary category (label encoding)

In [11]:
age_order = {'Child': 0, 'Young Adult': 1, 'Adult': 2, 'Senior': 3}  
df['Age_group'] = df['Age_group'].map(age_order)   ## ordinal encoding



In [12]:
df = pd.get_dummies(df, columns=['Ward', 'Diagnosis'], drop_first=True)  ## One hot encoding


In [13]:
symptom_set = set() 
df['Symptoms'].str.split(', ').apply(symptom_set.update)  ## extracting the unique symptoms from data.

0       None
1       None
2       None
3       None
4       None
        ... 
7966    None
7967    None
7968    None
7969    None
7970    None
Name: Symptoms, Length: 7971, dtype: object

In [14]:
symptom_set  ## set of unique symptoms

{'Fever',
 'bleeding',
 'chills',
 'fatigue',
 'headache',
 'joint pain',
 'muscle pain',
 'nausea',
 'rash',
 'retro-orbital pain'}

In [15]:
## encoding each as binary column
for symptom in symptom_set:
    df[f"Symptom_{symptom}"] = df["Symptoms"].apply(lambda x:1 if symptom in x else 0)

    

In [16]:
df.head()

Unnamed: 0,Age,Gender,Temperature,Humidity,Rainfall,Symptoms,Platelet_Count,Outcome,Month,Year,...,Symptom_muscle pain,Symptom_fatigue,Symptom_Fever,Symptom_headache,Symptom_joint pain,Symptom_nausea,Symptom_bleeding,Symptom_retro-orbital pain,Symptom_chills,Symptom_rash
0,48,0,25.7,82.8,45.0,"Fever, joint pain",110619,Recovered,6,2022,...,0,0,1,0,1,0,0,0,0,0
1,19,1,29.8,63.6,0.0,"Fever, retro-orbital pain",62974,Hospitalized,4,2022,...,0,0,1,0,0,0,0,1,0,0
2,33,1,23.8,92.9,10.0,"Fever, retro-orbital pain",78755,Hospitalized,7,2023,...,0,0,1,0,0,0,0,1,0,0
3,49,0,26.0,79.1,1.0,"Fever, bleeding",20787,Critical,8,2022,...,0,0,1,0,0,0,1,0,0,0
4,44,0,25.0,82.9,60.0,"Fever, muscle pain",134421,Recovered,7,2024,...,1,0,1,0,0,0,0,0,0,0


In [17]:
df.drop(columns=["Symptoms"],inplace=True) ## dropping of Previous Symptoms column

In [18]:
df['Outcome_encoded'] = le.fit_transform(df['Outcome'])  ## label encoding

In [19]:
df

Unnamed: 0,Age,Gender,Temperature,Humidity,Rainfall,Platelet_Count,Outcome,Month,Year,Age_group,...,Symptom_fatigue,Symptom_Fever,Symptom_headache,Symptom_joint pain,Symptom_nausea,Symptom_bleeding,Symptom_retro-orbital pain,Symptom_chills,Symptom_rash,Outcome_encoded
0,48,0,25.7,82.8,45.0,110619,Recovered,6,2022,2,...,0,1,0,1,0,0,0,0,0,3
1,19,1,29.8,63.6,0.0,62974,Hospitalized,4,2022,1,...,0,1,0,0,0,0,1,0,0,2
2,33,1,23.8,92.9,10.0,78755,Hospitalized,7,2023,1,...,0,1,0,0,0,0,1,0,0,2
3,49,0,26.0,79.1,1.0,20787,Critical,8,2022,2,...,0,1,0,0,0,1,0,0,0,0
4,44,0,25.0,82.9,60.0,134421,Recovered,7,2024,2,...,0,1,0,0,0,0,0,0,0,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7966,20,1,24.1,92.5,3.0,136281,Recovered,8,2022,1,...,0,1,0,1,0,0,0,0,0,3
7967,34,1,26.1,80.6,0.2,127239,Recovered,8,2024,1,...,0,1,0,0,0,0,0,0,0,3
7968,27,1,27.4,71.4,2.6,24424,Critical,5,2023,1,...,0,1,0,0,0,0,1,0,0,0
7969,33,1,23.2,70.2,0.0,33809,Critical,12,2024,1,...,0,1,1,1,0,0,0,0,0,0


In [20]:
df.columns

Index(['Age', 'Gender', 'Temperature', 'Humidity', 'Rainfall',
       'Platelet_Count', 'Outcome', 'Month', 'Year', 'Age_group',
       'Ward_Banjara Hills', 'Ward_Begumpet', 'Ward_Charminar',
       'Ward_Gachibowli', 'Ward_Jubilee Hills', 'Ward_Kukatpally',
       'Ward_LB Nagar', 'Ward_Malakpet', 'Ward_Mehdipatnam',
       'Ward_Musheerabad', 'Ward_Quthbullapur', 'Ward_Secunderabad',
       'Ward_Serilingampally', 'Ward_Uppal', 'Symptom_muscle pain',
       'Symptom_fatigue', 'Symptom_Fever', 'Symptom_headache',
       'Symptom_joint pain', 'Symptom_nausea', 'Symptom_bleeding',
       'Symptom_retro-orbital pain', 'Symptom_chills', 'Symptom_rash',
       'Outcome_encoded'],
      dtype='object')

In [21]:
df.to_csv('preprocessed_dengue_data.csv', index=False)


In [None]:
df['High_Risk'] = df['Outcome'].apply(lambda x: 1 if x in ['Critical', 'Dead'] else 0)
## model2

In [None]:
# Group by Ward and Month to count cases
demand = df.copy()
demand['Ward'] = df.filter(like='Ward_').idxmax(axis=1).str.replace('Ward_', '')
ward_demand = demand.groupby(['Ward', 'Month']).size().reset_index(name='Case_Count')

# Define demand level
def demand_label(x):
    if x >= 100:
        return 'High'
    elif x >= 50:
        return 'Medium'
    else:
        return 'Low'

ward_demand['Demand_Level'] = ward_demand['Case_Count'].apply(demand_label)

# Merge back into df
df['Ward'] = demand['Ward']  # Add actual ward names
df = df.merge(ward_demand[['Ward', 'Month', 'Demand_Level']], on=['Ward', 'Month'], how='left')

# Encode demand level
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
df['Demand_encoded'] = le.fit_transform(df['Demand_Level'])


# model 3