**Get city wise pollutants**

In [1]:
import pandas as pd

# Load and parse data
file_path = "../../data/aqi.csv"
df = pd.read_csv(file_path)

# Parse dates safely
df["date"] = pd.to_datetime(df["date"], format="%d-%m-%Y", errors="coerce")

# Filter for date range (2022–2025)
df = df[(df["date"] >= "2022-01-01") & (df["date"] <= "2025-05-31")].copy()

# Step 1: Compute average AQI per city (area)
avg_aqi = (
    df.groupby("area")["aqi_value"]
    .mean()
    .reset_index()
    .rename(columns={"aqi_value": "avg_aqi_2022_2025"})
)

# Step 2: Clean and explode prominent pollutants
pollutant_df = df[["area", "prominent_pollutants"]].dropna().copy()
pollutant_df["prominent_pollutants"] = pollutant_df["prominent_pollutants"].str.replace(" ", "")
pollutant_df = pollutant_df.assign(prominent_pollutants=pollutant_df["prominent_pollutants"].str.split(','))
pollutant_exploded = pollutant_df.explode("prominent_pollutants")

# Step 3: Count each pollutant per area
pollutant_counts = (
    pollutant_exploded
    .groupby(["area", "prominent_pollutants"])
    .size()
    .reset_index(name="count")
)

# Step 4: Get top 5 pollutants with counts for each city
def get_top_pollutants(df_sub):
    df_sorted = df_sub.sort_values("count", ascending=False).head(5)
    return pd.Series({
        f"top{i+1}_pollutant": f"{row.prominent_pollutants} ({row.count})"
        for i, row in enumerate(df_sorted.itertuples(index=False))
    })

top5_pollutants = (
    pollutant_counts
    .groupby("area")
    .apply(get_top_pollutants)
    .reset_index()
)

# Step 5: Merge with average AQI
final = top5_pollutants.merge(avg_aqi, on="area", how="left")

# Rename for clarity
final = final.rename(columns={"area": "City"})
final = final.rename(columns={0: "pollutants"})  # if 0 exists

# Step 6: Count occurrences of each air_quality_status per area
status_counts = (
    df.groupby(['area', 'air_quality_status'])
    .size()
    .unstack(fill_value=0)  # Converts rows to columns per status
    .reset_index()
)

# Optional: Rename columns for clarity
status_counts.columns.name = None # remove group name
status_counts = status_counts.rename(columns={"area": "City"})


# Merge air quality status counts with final output
final = final.merge(status_counts, on="City", how="left")

# Output
final.to_csv("city_pollutants_aqi_summary.csv", index=False)
print(final.head(10))

       City         level_1   pollutants  avg_aqi_2022_2025  Good  Moderate  \
0  Agartala  top1_pollutant  PM2.5 (583)         126.697585   220       249   
1  Agartala  top2_pollutant   PM10 (457)         126.697585   220       249   
2  Agartala  top3_pollutant      O3 (64)         126.697585   220       249   
3  Agartala  top4_pollutant     SO2 (52)         126.697585   220       249   
4  Agartala  top5_pollutant       CO (2)         126.697585   220       249   
5      Agra  top1_pollutant   PM10 (953)          84.232889   248       330   
6      Agra  top2_pollutant  PM2.5 (522)          84.232889   248       330   
7      Agra  top3_pollutant     O3 (180)          84.232889   248       330   
8      Agra  top4_pollutant    NO2 (105)          84.232889   248       330   
9      Agra  top5_pollutant     SO2 (64)          84.232889   248       330   

   Poor  Satisfactory  Severe  Very Poor  
0   215           318       0         33  
1   215           318       0         33  
2

  .apply(get_top_pollutants)


**Get citywise monthly average**

In [3]:
# Load and parse data
file_path = "../../data/aqi.csv"
df = pd.read_csv(file_path)

df["date"] = pd.to_datetime(df["date"], dayfirst=True, errors="coerce")
df["month"] = df["date"].dt.to_period("M")

monthly_avg = (
    df.groupby(["area", "month","state"])["aqi_value"]
    .mean()
    .reset_index()
)
monthly_avg["month"] = monthly_avg["month"].astype(str)
monthly_avg.to_csv("all_citywise_trend.csv")


**Merge Population and per capita income for Tier1 and Tier2 cities**

In [4]:
# Load AQI data (must have: City, aqi_value)
df_aqi = pd.read_csv("city_pollutants_aqi_summary.csv")  # Make sure it has a column like: City, aqi_value

# Load income/population data (must have: City, population, per_capita_income)
df_income = pd.read_csv("income.csv")  # Columns: City, population, per_capita_income

# Standardize city names (remove leading/trailing spaces, lowercase for consistency)
for df in [df_aqi, df_income]:
    df['City'] = df['City'].str.strip().str.lower()

# Merge in steps
merged = df_aqi.merge(df_income, on='City', how='inner')
#merged = merged.merge(df_income, on='City', how='left')

# Capitalize city names for readability
merged['City'] = merged['City'].str.title()
merged['0'] = merged['pollutants']
# Save merged output (optional)
merged.to_csv("merged_city_data.csv", index=False)

# Preview merged result
print(merged.head(10))

        City         level_1   pollutants  avg_aqi_2022_2025  Good  Moderate  \
0  Ahmedabad  top1_pollutant  PM2.5 (708)         113.598573    26       632   
1  Ahmedabad  top2_pollutant   PM10 (675)         113.598573    26       632   
2  Ahmedabad  top3_pollutant    NO2 (138)         113.598573    26       632   
3  Ahmedabad  top4_pollutant     O3 (130)         113.598573    26       632   
4  Ahmedabad  top5_pollutant      CO (95)         113.598573    26       632   
5  Bengaluru  top1_pollutant  PM10 (1035)          74.885333   169       143   
6  Bengaluru  top2_pollutant  PM2.5 (490)          74.885333   169       143   
7  Bengaluru  top3_pollutant     CO (338)          74.885333   169       143   
8  Bengaluru  top4_pollutant     O3 (156)          74.885333   169       143   
9  Bengaluru  top5_pollutant     NO2 (55)          74.885333   169       143   

   Poor  Satisfactory  Severe  Very Poor  population  per_capita_income  \
0    33           430       0          0    

In [7]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

# Step 1: Load your two datasets
df_aqi = pd.read_csv("merged_city_data.csv")      # includes AQI + population + income
df_city = pd.read_csv("income.csv")           # only includes population, income

# Step 2: Clean and standardize city names for merge
df_aqi["City"] = df_aqi["City"].str.strip().str.title()
df_city["City"] = df_city["City"].str.strip().str.title()
# Clean population and income
#df_city["population"] = df_city["population"].str.replace(",", "").astype(int)
#df_city["per_capita_income"] = df_city["per_capita_income"].str.replace(",", "").astype(int)

# Cap PCI at 300,000 INR
df_city["per_capita_income_clean"] = df_city["per_capita_income"]

# Step 1: Group AQI per city
df_aqi_clean = df_aqi.groupby("City", as_index=False)["avg_aqi_2022_2025"].mean()

# 🔴 Optional: Make sure city metadata has one row per city
# If df_city has duplicate city entries, this will remove them
df_city_dedup = df_city.drop_duplicates(subset="City")

# Step 2: Merge
df_merged = pd.merge(df_city_dedup, df_aqi_clean, on="City", how="left")

# ✅ New line — create scored_df AFTER merge
scored_df = df_merged.copy()

# Step 5: Check for missing values after merge (optional)
missing_aqi = scored_df[scored_df["avg_aqi_2022_2025"].isna()]
if not missing_aqi.empty:
    print("Warning: AQI missing for the following cities:")
    print(missing_aqi["City"].unique())

# Step 6: Clean numeric columns
for col in ["population", "per_capita_income_clean"]:
    scored_df[col] = (
        scored_df[col]
        .astype(str)
        .str.replace(",", "")
        .astype(float)
    )

scored_df["avg_aqi_2022_2025"] = pd.to_numeric(scored_df["avg_aqi_2022_2025"], errors="coerce")

# Drop missing AQI rows
scored_df.dropna(subset=["avg_aqi_2022_2025"], inplace=True)

# Normalize
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
scored_df[["Population_Score", "Income_Score", "AQI_Score"]] = scaler.fit_transform(
    scored_df[["population", "per_capita_income_clean", "avg_aqi_2022_2025"]]
)

# Composite score
scored_df["Composite_Score"] = (
  0.3 *  scored_df["Population_Score"] +
  0.2 * scored_df["Income_Score"] +
  0.5 *  scored_df["AQI_Score"]
)


# Sort and save
top_cities = scored_df.sort_values("Composite_Score", ascending=False)
top_cities.to_csv("scored_city_market_ranking.csv", index=False)
print(top_cities[["City", "Composite_Score"]].round(2))


                  City  Composite_Score
0                Delhi             0.91
1               Mumbai             0.45
16           Ghaziabad             0.43
10               Patna             0.43
2            Bengaluru             0.39
22          Chandigarh             0.38
15              Meerut             0.36
5            Ahmedabad             0.35
7              Kolkata             0.30
6                 Pune             0.30
9               Jaipur             0.30
8                Surat             0.29
4            Hyderabad             0.29
11             Lucknow             0.29
26              Nagpur             0.26
23              Bhopal             0.24
13       Visakhapatnam             0.23
20               Kochi             0.23
25              Kanpur             0.21
3              Chennai             0.19
24              Indore             0.18
14              Nashik             0.13
12          Coimbatore             0.09
21            Varanasi             0.05


In [8]:
# Create formatted columns
top_cities["Population (2025 est.)"] = (top_cities["population"] / 1e6).round(1).astype(str) + "M"
top_cities["PCI (₹ lakh)"] = (top_cities["per_capita_income_clean"] / 1e5).round(2).astype(str)  # ₹1 lakh = 100,000
top_cities["Composite Score"] = top_cities["Composite_Score"].round(2)

# AQI Rank logic
def classify_aqi_rank(aqi):
    if aqi >= 200:
        return "Very High"
    elif aqi >= 100:
        return "High"
    elif aqi >= 50:
        return "Moderate"
    else:
        return "Low"

top_cities["AQI_Rank*"] = top_cities["avg_aqi_2022_2025"].apply(classify_aqi_rank)

# Display formatted table
formatted = top_cities[["City", "AQI_Rank*", "Population (2025 est.)", "PCI (₹ lakh)", "Composite_Score"]]
formatted.to_csv("composite_score.csv")
print(formatted.head(10).to_string(index=False))


      City AQI_Rank* Population (2025 est.) PCI (₹ lakh)  Composite_Score
     Delhi Very High                  34.7M         4.61         0.908824
    Mumbai      High                  22.1M         4.13         0.451488
 Ghaziabad      High                   1.2M         1.35         0.432555
     Patna      High                   2.7M          1.2         0.431216
 Bengaluru  Moderate                  14.4M          7.6         0.386382
Chandigarh      High                   1.3M          4.0         0.377751
    Meerut      High                   1.9M          1.3         0.356111
 Ahmedabad      High                   9.1M         3.89         0.352562
   Kolkata      High                  15.8M         1.54         0.303717
      Pune      High                   7.5M         2.78         0.299606
