In [2]:
import pandas as pd

# Read the two input CSVs and give the columns used later their
# capitalised names. Columns not listed in a mapping keep their names.
df_disease = pd.read_csv(
    "../../data/idsp.csv", encoding='ISO-8859-1'
).rename(columns={
    'year': 'Year',
    'state': 'State',
    'disease_illness_name': 'Disease',
    'cases': 'Cases',
})

df_aqi = pd.read_csv("../../data/aqi.csv").rename(columns={
    'date': 'Date',
    'state': 'State',
    'aqi_value': 'AQI',
})

# Parse the date columns as day-first dates. With errors='coerce', values
# that cannot be parsed become NaT instead of raising, so the derived
# Year/Month of those rows are NaN.
df_aqi['Date'] = pd.to_datetime(
    df_aqi['Date'], dayfirst=True, errors='coerce'
)
df_disease['Date'] = pd.to_datetime(
    df_disease['outbreak_starting_date'], dayfirst=True, errors='coerce'
)

# 'Year' and 'Month' are taken from the parsed date in both tables; for the
# disease table this replaces whatever the renamed 'Year' column held.
for frame in (df_aqi, df_disease):
    frame['Year'] = frame['Date'].dt.year
    frame['Month'] = frame['Date'].dt.month

#print(df_disease)

# The analysis window is the most recent year found in the disease data
# plus the two years before it; the same window is applied to both tables.
latest_year = df_disease['Year'].max()
last_3_years = [latest_year, latest_year - 1, latest_year - 2]

# .copy() detaches the filtered rows from the source frames so that columns
# can be added to them later without touching the originals.
recent_disease = df_disease.loc[df_disease['Year'].isin(last_3_years)].copy()
recent_aqi = df_aqi.loc[df_aqi['Year'].isin(last_3_years)].copy()

# Step 1: sum the cases for every (State, Disease) pair in the window.
# as_index=False keeps the grouping keys as ordinary columns.
state_disease_cases = recent_disease.groupby(
    ['State', 'Disease'], as_index=False
)['Cases'].sum()

# Step 2: rank diseases inside each state by total cases, highest first.
# method='first' breaks ties by row order, so every rank in a state is unique.
state_disease_cases['Rank'] = state_disease_cases.groupby('State')['Cases'].rank(
    method='first', ascending=False
)

# Keep only the two highest-ranked diseases of each state.
top2_diseases_per_state = state_disease_cases.loc[
    state_disease_cases['Rank'].le(2)
]

# Step 3: mean AQI of each state across the selected years, returned as a
# two-column frame: 'State' and 'Avg_AQI'.
avg_aqi_per_state = (
    recent_aqi.groupby('State')['AQI'].mean().reset_index(name='Avg_AQI')
)

# Steps 4-5: attach each state's average AQI to its top-2 disease rows, then
# order the rows by state and rank. The left join keeps every disease row;
# states with no AQI match get NaN in 'Avg_AQI'.
final_df = top2_diseases_per_state.merge(
    avg_aqi_per_state, how='left', on='State'
).sort_values(by=['State', 'Rank'])

# Show the result; uncomment the next line to also write it to disk.
#final_df.to_csv("top2_diseases_per_state_with_aqi.csv", index=False)
print(final_df)


                          State                  Disease  Cases  Rank  \
0   Andaman and Nicobar Islands  Acute Diarrheal Disease     94   1.0   
1   Andaman and Nicobar Islands          Fever with Rash      8   2.0   
2                Andhra Pradesh  Acute Diarrheal Disease   3189   1.0   
3                Andhra Pradesh                  Cholera   1081   2.0   
4                     Arunachal                   Rabies      1   1.0   
..                          ...                      ...    ...   ...   
70                Uttar Pradesh           Food Poisoning   2218   2.0   
72                  Uttarakhand                   Dengue   1092   1.0   
71                  Uttarakhand  Acute Diarrheal Disease    626   2.0   
73                  West Bengal  Acute Diarrheal Disease   3276   1.0   
74                  West Bengal           Food Poisoning   2360   2.0   

       Avg_AQI  
0    57.706897  
1    57.706897  
2    78.529013  
3    78.529013  
4          NaN  
..         ...  
70  