In [None]:
import numpy as np
import pandas as pd
from glob import glob
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler

In [None]:
pd.set_option("display.max_rows", 2000)
pd.set_option("display.max_columns", None)

In [3]:
enrol_df = pd.read_csv("enrolment.csv")
demo_df  = pd.read_csv("demographic.csv")
bio_df   = pd.read_csv("biometric.csv")

all_dfs = [enrol_df, demo_df, bio_df]

print("Loaded datasets:")
print(f"  Enrolment: {len(enrol_df):,} rows")
print(f"  Demographic: {len(demo_df):,} rows")
print(f"  Biometric: {len(bio_df):,} rows")


Loaded datasets:
  Enrolment: 93,184 rows
  Demographic: 183,245 rows
  Biometric: 424,731 rows


In [None]:
print("=== Enrolment Columns ===")
print(enrol_df.columns.tolist())
enrol_df.head()

=== Enrolment Columns ===
['date', 'state', 'district', 'pincode', 'age_0_5', 'age_5_17', 'age_18_greater']


Unnamed: 0,date,state,district,pincode,age_0_5,age_5_17,age_18_greater
0,01-09-2025,Maharashtra,Ahmadnagar,413701,1,2,1
1,01-09-2025,Maharashtra,Ahmadnagar,413703,1,1,0
2,01-09-2025,Maharashtra,Ahmadnagar,413705,3,0,0
3,01-09-2025,Maharashtra,Ahmadnagar,414002,2,1,0
4,01-09-2025,Maharashtra,Ahmadnagar,414006,2,0,0


In [None]:
print("Unique districts BEFORE cleanup:")
print(f"  Enrolment: {enrol_df['district'].nunique()}")
print(f"  Demographic: {demo_df['district'].nunique()}")
print(f"  Biometric: {bio_df['district'].nunique()}")

Unique districts BEFORE cleanup:
  Enrolment: 53
  Demographic: 53
  Biometric: 52


In [None]:
print("=== Enrolment Districts ===")
print(sorted(enrol_df["district"].unique()))
print("\n=== Demographic Districts ===")
print(sorted(demo_df["district"].unique()))
print("\n=== Biometric Districts ===")
print(sorted(bio_df["district"].unique()))

=== Enrolment Districts ===
['Ahilyanagar', 'Ahmadnagar', 'Ahmed Nagar', 'Ahmednagar', 'Akola', 'Amravati', 'Aurangabad', 'Beed', 'Bhandara', 'Bid', 'Buldana', 'Buldhana', 'Chandrapur', 'Chatrapati Sambhaji Nagar', 'Chhatrapati Sambhajinagar', 'Dharashiv', 'Dhule', 'Gadchiroli', 'Gondia', 'Gondiya', 'Gondiya *', 'Hingoli', 'Hingoli *', 'Jalgaon', 'Jalna', 'Kolhapur', 'Latur', 'Mumbai', 'Mumbai City', 'Mumbai Suburban', 'Mumbai( Sub Urban )', 'Nagpur', 'Nanded', 'Nandurbar', 'Nandurbar *', 'Nashik', 'Osmanabad', 'Palghar', 'Parbhani', 'Pune', 'Raigad', 'Raigarh', 'Raigarh(MH)', 'Ratnagiri', 'Sangli', 'Satara', 'Sindhudurg', 'Solapur', 'Thane', 'Wardha', 'Washim', 'Washim *', 'Yavatmal']

=== Demographic Districts ===
['Ahilyanagar', 'Ahmadnagar', 'Ahmed Nagar', 'Akola', 'Amravati', 'Aurangabad', 'Beed', 'Bhandara', 'Bid', 'Buldana', 'Buldhana', 'Chandrapur', 'Chatrapati Sambhaji Nagar', 'Chhatrapati Sambhajinagar', 'Dharashiv', 'Dhule', 'Dist : Thane', 'Gadchiroli', 'Gondia', 'Gondiya',

In [None]:
import re

def clean_district_name(name):
    """Basic text cleaning: lowercase, strip, remove *, normalize spaces"""
    if pd.isna(name):
        return name
    name = str(name).lower().strip()
    name = name.replace('*', '') 
    name = re.sub(r'\s+', ' ', name) 
    name = name.strip()
    return name

print("=== Step 1: Basic Text Cleaning ===")
for df_name, df in [("Enrolment", enrol_df), ("Demographic", demo_df), ("Biometric", bio_df)]:
    before = df['district'].nunique()
    df['district'] = df['district'].apply(clean_district_name)
    after = df['district'].nunique()
    print(f"  {df_name}: {before} -> {after} unique districts")

canonical_map = {
    'ahmadnagar': 'ahilyanagar',
    'ahmed nagar': 'ahilyanagar',
    'ahmednagar': 'ahilyanagar',
    'aurangabad': 'chhatrapati sambhajinagar',
    'chatrapati sambhaji nagar': 'chhatrapati sambhajinagar',
    'osmanabad': 'dharashiv',
    'bid': 'beed',
    'buldana': 'buldhana',
    'gondiya': 'gondia',
    'raigarh': 'raigad',
    'raigarh(mh)': 'raigad',
    'mumbai city': 'mumbai',
    'mumbai( sub urban )': 'mumbai suburban',
    'dist : thane': 'thane',
}

print("\n=== Step 2: Canonical Mapping Replacements ===")
all_replacements = [] 

for df_name, df in [("Enrolment", enrol_df), ("Demographic", demo_df), ("Biometric", bio_df)]:
    replacements_made = {}
    for old_name, new_name in canonical_map.items():
        if old_name == new_name: 
            continue
        count = (df['district'] == old_name).sum()
        if count > 0:
            replacements_made[old_name] = (new_name, count)
            all_replacements.append({'dataset': df_name, 'raw_name': old_name, 'canonical_name': new_name, 'rows': count})
    
    df['district'] = df['district'].replace(canonical_map)
    
    if replacements_made:
        print(f"\n{df_name}:")
        for old, (new, cnt) in sorted(replacements_made.items()):
            print(f"  '{old}' -> '{new}' ({cnt:,} rows)")
    else:
        print(f"\n{df_name}: No additional replacements needed")

print("\n=== Step 3: Final District Counts ===")
enrol_count = enrol_df['district'].nunique()
demo_count = demo_df['district'].nunique()
bio_count = bio_df['district'].nunique()
print(f"  Enrolment: {enrol_count}")
print(f"  Demographic: {demo_count}")
print(f"  Biometric: {bio_count}")

max_count = max(enrol_count, demo_count, bio_count)
if 34 <= max_count <= 38:
    print(f"\n✓ District count ({max_count}) is within expected range (34-38)")
else:
    print(f"\n⚠ WARNING: District count ({max_count}) outside expected range (34-38)")
    print("  Check for remaining duplicates:")
    all_districts = sorted(set(enrol_df['district'].unique()) | set(demo_df['district'].unique()) | set(bio_df['district'].unique()))
    for d in all_districts:
        print(f"    - {d}")

if all_replacements:
    mapping_df = pd.DataFrame(all_replacements)
    mapping_df.to_csv('mh_district_name_mapping.csv', index=False)
    print(f"\n✓ Saved district name mapping to mh_district_name_mapping.csv ({len(all_replacements)} replacements)")


=== Step 1: Basic Text Cleaning ===
  Enrolment: 36 -> 36 unique districts
  Demographic: 36 -> 36 unique districts
  Biometric: 36 -> 36 unique districts

=== Step 2: Canonical Mapping Replacements ===

Enrolment: No additional replacements needed

Demographic: No additional replacements needed

Biometric: No additional replacements needed

=== Step 3: Final District Counts ===
  Enrolment: 36
  Demographic: 36
  Biometric: 36

✓ District count (36) is within expected range (34-38)


In [None]:
enrol_df["date"] = pd.to_datetime(enrol_df["date"], dayfirst=True)
demo_df["date"] = pd.to_datetime(demo_df["date"], dayfirst=True)
bio_df["date"] = pd.to_datetime(bio_df["date"], dayfirst=True)

for df in all_dfs:
    df["month"] = df["date"].dt.month

print("Date range:")
print(f"  Enrolment: {enrol_df['date'].min()} to {enrol_df['date'].max()}")
print(f"  Demographic: {demo_df['date'].min()} to {demo_df['date'].max()}")
print(f"  Biometric: {bio_df['date'].min()} to {bio_df['date'].max()}")

Date range:
  Enrolment: 2025-03-09 00:00:00 to 2026-01-03 00:00:00
  Demographic: 2025-03-01 00:00:00 to 2026-01-03 00:00:00
  Biometric: 2025-03-01 00:00:00 to 2026-01-03 00:00:00


In [None]:
print("=== Enrolment Info ===")
enrol_df.info()
print("\n=== Demographic Info ===")
demo_df.info()
print("\n=== Biometric Info ===")
bio_df.info()

=== Enrolment Info ===
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 93184 entries, 0 to 93183
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   date            93184 non-null  datetime64[ns]
 1   state           93184 non-null  object        
 2   district        93184 non-null  object        
 3   pincode         93184 non-null  int64         
 4   age_0_5         93184 non-null  int64         
 5   age_5_17        93184 non-null  int64         
 6   age_18_greater  93184 non-null  int64         
 7   month           93184 non-null  int32         
dtypes: datetime64[ns](1), int32(1), int64(4), object(2)
memory usage: 5.3+ MB

=== Demographic Info ===
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 183245 entries, 0 to 183244
Data columns (total 7 columns):
 #   Column         Non-Null Count   Dtype         
---  ------         --------------   -----         
 0   date           183245 non

In [None]:
enrol_pincodes = set(enrol_df["pincode"].dropna())
demo_pincodes = set(demo_df["pincode"].dropna())
bio_pincodes = set(bio_df["pincode"].dropna())

print("Pincode analysis:")
print(f"  Enrolment pincodes: {len(enrol_pincodes)}")
print(f"  Demographic pincodes: {len(demo_pincodes)}")
print(f"  Biometric pincodes: {len(bio_pincodes)}")

only_in_enrol = enrol_pincodes - demo_pincodes - bio_pincodes
only_in_demo = demo_pincodes - enrol_pincodes - bio_pincodes
only_in_bio = bio_pincodes - enrol_pincodes - demo_pincodes
in_all = enrol_pincodes & demo_pincodes & bio_pincodes

print(f"\n  Pincodes only in enrolment: {len(only_in_enrol)}")
print(f"  Pincodes only in demographic: {len(only_in_demo)}")
print(f"  Pincodes only in biometric: {len(only_in_bio)}")
print(f"  Pincodes in all three: {len(in_all)}")

print(f"\nNULL pincodes:")
print(f"  Enrolment: {enrol_df['pincode'].isna().sum()}")
print(f"  Demographic: {demo_df['pincode'].isna().sum()}")
print(f"  Biometric: {bio_df['pincode'].isna().sum()}")

Pincode analysis:
  Enrolment pincodes: 1585
  Demographic pincodes: 1609
  Biometric pincodes: 1621

  Pincodes only in enrolment: 0
  Pincodes only in demographic: 1
  Pincodes only in biometric: 13
  Pincodes in all three: 1585

NULL pincodes:
  Enrolment: 0
  Demographic: 0
  Biometric: 0


In [None]:
enrol_agg = enrol_df.groupby(["state", "district", "month"])[["age_0_5", "age_5_17", "age_18_greater"]].sum().reset_index()
demo_agg = demo_df.groupby(["state", "district", "month"])[["demo_age_5_17", "demo_age_17_"]].sum().reset_index()
bio_agg = bio_df.groupby(["state", "district", "month"])[["bio_age_5_17", "bio_age_17_"]].sum().reset_index()

combined_df = enrol_agg.merge(demo_agg, on=["state", "district", "month"], how="left").merge(bio_agg, on=["state", "district", "month"], how="left")
combined_df.fillna(0, inplace=True)

print(f"Combined monthly table: {len(combined_df)} rows")
combined_df.head()

Combined monthly table: 282 rows


Unnamed: 0,state,district,month,age_0_5,age_5_17,age_18_greater,demo_age_5_17,demo_age_17_,bio_age_5_17,bio_age_17_
0,Maharashtra,ahilyanagar,1,1117,209,1,128.0,1708.0,3491,4383
1,Maharashtra,ahilyanagar,4,71,15,10,0.0,0.0,8618,16428
2,Maharashtra,ahilyanagar,6,905,887,48,0.0,0.0,14713,34733
3,Maharashtra,ahilyanagar,7,182,158,28,0.0,0.0,15042,29652
4,Maharashtra,ahilyanagar,9,2223,1286,33,1678.0,11661.0,117785,36773


In [None]:
combined_df["E"] = combined_df["age_0_5"] + combined_df["age_5_17"] + combined_df["age_18_greater"]
combined_df["DU"] = combined_df["demo_age_5_17"] + combined_df["demo_age_17_"]
combined_df["BU"] = combined_df["bio_age_5_17"] + combined_df["bio_age_17_"]
combined_df["U"] = combined_df["DU"] + combined_df["BU"]
combined_df["T"] = combined_df["E"] + combined_df["U"]

combined_df.head(20)

Unnamed: 0,state,district,month,age_0_5,age_5_17,age_18_greater,demo_age_5_17,demo_age_17_,bio_age_5_17,bio_age_17_,E,DU,BU,U,T
0,Maharashtra,ahilyanagar,1,1117,209,1,128.0,1708.0,3491,4383,1327,1836.0,7874,9710.0,11037.0
1,Maharashtra,ahilyanagar,4,71,15,10,0.0,0.0,8618,16428,96,0.0,25046,25046.0,25142.0
2,Maharashtra,ahilyanagar,6,905,887,48,0.0,0.0,14713,34733,1840,0.0,49446,49446.0,51286.0
3,Maharashtra,ahilyanagar,7,182,158,28,0.0,0.0,15042,29652,368,0.0,44694,44694.0,45062.0
4,Maharashtra,ahilyanagar,9,2223,1286,33,1678.0,11661.0,117785,36773,3542,13339.0,154558,167897.0,171439.0
5,Maharashtra,ahilyanagar,10,1474,353,21,1199.0,35740.0,33707,39925,1848,36939.0,73632,110571.0,112419.0
6,Maharashtra,ahilyanagar,11,1781,413,48,2016.0,118258.0,19013,30946,2242,120274.0,49959,170233.0,172475.0
7,Maharashtra,ahilyanagar,12,4061,311,3,1877.0,42892.0,19828,29159,4375,44769.0,48987,93756.0,98131.0
8,Maharashtra,akola,1,359,36,8,35.0,384.0,1136,2060,403,419.0,3196,3615.0,4018.0
9,Maharashtra,akola,6,700,243,20,40.0,222.0,7295,19847,963,262.0,27142,27404.0,28367.0


In [None]:
district_monthly_counts = combined_df.groupby(["district", "month"]).agg(
    total_months=("month", "count"),
    active_months=("T", lambda x: (x > 0).sum())
).reset_index()

district_monthly_counts["zero_months"] = district_monthly_counts["total_months"] - district_monthly_counts["active_months"]
district_monthly_counts["activity_ratio"] = district_monthly_counts["active_months"] / district_monthly_counts["total_months"]
district_monthly_counts["zero_month_ratio"] = district_monthly_counts["zero_months"] / district_monthly_counts["total_months"]

combined_df = combined_df.merge(
    district_monthly_counts[["district", "month", "activity_ratio", "zero_month_ratio"]],
    on=["district", "month"],
    how="left"
)

district_monthly_counts.head()

Unnamed: 0,district,month,total_months,active_months,zero_months,activity_ratio,zero_month_ratio
0,ahilyanagar,1,1,1,0,1.0,0.0
1,ahilyanagar,4,1,1,0,1.0,0.0
2,ahilyanagar,6,1,1,0,1.0,0.0
3,ahilyanagar,7,1,1,0,1.0,0.0
4,ahilyanagar,9,1,1,0,1.0,0.0


In [None]:
district_volume_metrics = combined_df.groupby(["state", "district"]).agg(
    avg_monthly_enrolment=("E", "mean"),
    monthly_valatility=("T", lambda x: x.std(ddof=0) / x.mean() if x.mean() > 0 else 0),
    peak_load_ratio=("T", lambda x: x.max() / x.mean() if x.mean() > 0 else 0)
).reset_index()

print(f"District volume metrics: {len(district_volume_metrics)} districts")
district_volume_metrics.head()

District volume metrics: 36 districts


Unnamed: 0,state,district,avg_monthly_enrolment,monthly_valatility,peak_load_ratio
0,Maharashtra,ahilyanagar,1954.75,0.686793,2.008469
1,Maharashtra,akola,876.714286,0.503653,1.70413
2,Maharashtra,amravati,1226.0,0.474771,1.664638
3,Maharashtra,beed,1442.777778,0.548287,1.845016
4,Maharashtra,bhandara,420.5,0.476813,1.419819


In [None]:
district_update_burden = combined_df.groupby(["state", "district"]).agg(
    avg_monthly_enrollments=("E", "sum"),
    avg_monthly_demo_updates=("DU", "sum"),
    avg_monthly_bio_updates=("BU", "sum")
).reset_index()

district_update_burden["U"] = district_update_burden["avg_monthly_demo_updates"] + district_update_burden["avg_monthly_bio_updates"]
district_update_burden["biometric_burden"] = district_update_burden["avg_monthly_bio_updates"] / (district_update_burden["avg_monthly_bio_updates"] + district_update_burden["avg_monthly_demo_updates"])
district_update_burden["update_dominant"] = np.where(district_update_burden["U"] > district_update_burden["avg_monthly_enrollments"], 1, 0)
district_update_burden["enrollment_update_balance"] = district_update_burden["avg_monthly_enrollments"] / (district_update_burden["avg_monthly_enrollments"] + district_update_burden["U"])

# Handle NaN from division by zero
district_update_burden["biometric_burden"] = district_update_burden["biometric_burden"].fillna(0.5)
district_update_burden["enrollment_update_balance"] = district_update_burden["enrollment_update_balance"].fillna(0.5)

district_update_burden.head()

Unnamed: 0,state,district,avg_monthly_enrollments,avg_monthly_demo_updates,avg_monthly_bio_updates,U,biometric_burden,update_dominant,enrollment_update_balance
0,Maharashtra,ahilyanagar,15638,217157.0,454196,671353.0,0.676538,1,0.022763
1,Maharashtra,akola,6137,59817.0,172566,232383.0,0.742593,1,0.025729
2,Maharashtra,amravati,8582,93336.0,326550,419886.0,0.777711,1,0.02003
3,Maharashtra,beed,12985,149564.0,335632,485196.0,0.691745,1,0.026065
4,Maharashtra,bhandara,2523,36597.0,94490,131087.0,0.720819,1,0.018883


In [None]:
district_ratios = combined_df.groupby(["state", "district"]).agg(
    activity_ratio=("activity_ratio", "mean"),
    zero_month_ratio=("zero_month_ratio", "mean")
).reset_index()

district_first_month = combined_df.sort_values(["state", "district", "month"]).groupby(["state", "district"], as_index=False).first()
district_first_month = district_first_month[["state", "district", "month", "age_0_5", "age_5_17", "age_18_greater",
                                              "demo_age_5_17", "demo_age_17_", "bio_age_5_17", "bio_age_17_",
                                              "E", "DU", "BU", "U", "T"]]

district_df = district_first_month.merge(
    district_ratios, on=["state", "district"], how="left"
).merge(
    district_volume_metrics, on=["state", "district"], how="left"
).merge(
    district_update_burden[["state", "district", "biometric_burden", "update_dominant", "enrollment_update_balance"]],
    on=["state", "district"], how="left"
)

print(f"District-level feature table: {len(district_df)} districts")
district_df.head()

District-level feature table: 36 districts


Unnamed: 0,state,district,month,age_0_5,age_5_17,age_18_greater,demo_age_5_17,demo_age_17_,bio_age_5_17,bio_age_17_,E,DU,BU,U,T,activity_ratio,zero_month_ratio,avg_monthly_enrolment,monthly_valatility,peak_load_ratio,biometric_burden,update_dominant,enrollment_update_balance
0,Maharashtra,ahilyanagar,1,1117,209,1,128.0,1708.0,3491,4383,1327,1836.0,7874,9710.0,11037.0,1.0,0.0,1954.75,0.686793,2.008469,0.676538,1,0.022763
1,Maharashtra,akola,1,359,36,8,35.0,384.0,1136,2060,403,419.0,3196,3615.0,4018.0,1.0,0.0,876.714286,0.503653,1.70413,0.742593,1,0.025729
2,Maharashtra,amravati,1,351,44,16,65.0,775.0,1355,4055,411,840.0,5410,6250.0,6661.0,1.0,0.0,1226.0,0.474771,1.664638,0.777711,1,0.02003
3,Maharashtra,beed,1,372,106,8,84.0,1152.0,1628,2385,486,1236.0,4013,5249.0,5735.0,1.0,0.0,1442.777778,0.548287,1.845016,0.691745,1,0.026065
4,Maharashtra,bhandara,1,125,7,0,19.0,254.0,481,956,132,273.0,1437,1710.0,1842.0,1.0,0.0,420.5,0.476813,1.419819,0.720819,1,0.018883


In [None]:
def normalize(x):
    maxx, minx = x.max(), x.min()
    if maxx == minx:
        return x * 0 + 0.5
    normalized = (x - minx) / (maxx - minx)
    return normalized

def inverse_normalize(x):
    inversed = 1 - normalize(x)
    return inversed

print("Normalization functions defined.")

Normalization functions defined.


In [None]:
district_df["access"] = (district_df["activity_ratio"] + normalize(district_df["avg_monthly_enrolment"])) / 2
district_df["responsiveness"] = normalize(district_df["U"] / (district_df["E"] + district_df["U"]))
district_df["inclusion"] = normalize((district_df["age_0_5"] + district_df["age_5_17"]) / district_df["E"])
district_df["stability"] = (inverse_normalize(district_df["monthly_valatility"]) + inverse_normalize(district_df["peak_load_ratio"])) / 2
district_df["visibility"] = district_df["activity_ratio"]

district_df["DEI"] = (district_df["access"] + district_df["responsiveness"] + district_df["inclusion"] + district_df["stability"] + district_df["visibility"]) / 5

district_df["ASS"] = (inverse_normalize(district_df["activity_ratio"]) + inverse_normalize(district_df["avg_monthly_enrolment"])) / 2
district_df["UBS"] = (normalize(district_df["biometric_burden"]) + normalize(district_df["update_dominant"])) / 2
district_df["SRS"] = (normalize(district_df["monthly_valatility"]) + normalize(district_df["zero_month_ratio"])) / 2

district_df.head()

Unnamed: 0,state,district,month,age_0_5,age_5_17,age_18_greater,demo_age_5_17,demo_age_17_,bio_age_5_17,bio_age_17_,E,DU,BU,U,T,activity_ratio,zero_month_ratio,avg_monthly_enrolment,monthly_valatility,peak_load_ratio,biometric_burden,update_dominant,enrollment_update_balance,access,responsiveness,inclusion,stability,visibility,DEI,ASS,UBS,SRS
0,Maharashtra,ahilyanagar,1,1117,209,1,128.0,1708.0,3491,4383,1327,1836.0,7874,9710.0,11037.0,1.0,0.0,1954.75,0.686793,2.008469,0.676538,1,0.022763,0.681634,0.0,0.992404,0.26609,1.0,0.588026,0.568366,0.432253,0.724105
1,Maharashtra,akola,1,359,36,8,35.0,384.0,1136,2060,403,419.0,3196,3615.0,4018.0,1.0,0.0,876.714286,0.503653,1.70413,0.742593,1,0.025729,0.567232,0.253533,0.799901,0.723472,1.0,0.668828,0.682768,0.543609,0.396888
2,Maharashtra,amravati,1,351,44,16,65.0,775.0,1355,4055,411,840.0,5410,6250.0,6661.0,1.0,0.0,1226.0,0.474771,1.664638,0.777711,1,0.02003,0.604299,0.744441,0.607591,0.791965,1.0,0.749659,0.645701,0.602812,0.345285
3,Maharashtra,beed,1,372,106,8,84.0,1152.0,1628,2385,486,1236.0,4013,5249.0,5735.0,1.0,0.0,1442.777778,0.548287,1.845016,0.691745,1,0.026065,0.627303,0.451389,0.834074,0.583468,1.0,0.699247,0.622697,0.457889,0.476636
4,Maharashtra,bhandara,1,125,7,0,19.0,254.0,481,956,132,273.0,1437,1710.0,1842.0,1.0,0.0,420.5,0.476813,1.419819,0.720819,1,0.018883,0.518819,0.617774,1.0,0.893023,1.0,0.805923,0.731181,0.506902,0.348935


In [None]:
district_df.drop(["access", "responsiveness", "inclusion", "stability", "visibility"], axis=1, inplace=True)

print("Columns after dropping intermediates:")
print(district_df.columns.tolist())

Columns after dropping intermediates:
['state', 'district', 'month', 'age_0_5', 'age_5_17', 'age_18_greater', 'demo_age_5_17', 'demo_age_17_', 'bio_age_5_17', 'bio_age_17_', 'E', 'DU', 'BU', 'U', 'T', 'activity_ratio', 'zero_month_ratio', 'avg_monthly_enrolment', 'monthly_valatility', 'peak_load_ratio', 'biometric_burden', 'update_dominant', 'enrollment_update_balance', 'DEI', 'ASS', 'UBS', 'SRS']


In [None]:
up_analysis_cols = [
    'state', 'district', 'month', 'age_0_5', 'age_5_17', 'age_18_greater',
    'demo_age_5_17', 'demo_age_17_', 'bio_age_5_17', 'bio_age_17_',
    'E', 'DU', 'BU', 'U', 'T', 'activity_ratio', 'zero_month_ratio',
    'avg_monthly_enrolment', 'monthly_valatility', 'peak_load_ratio',
    'biometric_burden', 'update_dominant', 'enrollment_update_balance',
    'DEI', 'ASS', 'UBS', 'SRS'
]

up_final_cols = ['state', 'district', 'DEI', 'ASS', 'UBS', 'SRS']

district_df = district_df[up_analysis_cols]

mh_cols = district_df.columns.tolist()
assert mh_cols == up_analysis_cols, f"Column mismatch!\nExpected: {up_analysis_cols}\nGot: {mh_cols}\nDiff: {set(mh_cols) ^ set(up_analysis_cols)}"
print("✓ Column schema matches UP exactly")

for col in ['DEI', 'ASS', 'UBS', 'SRS']:
    min_val = district_df[col].min()
    max_val = district_df[col].max()
    assert min_val >= 0, f"{col} has negative values: {min_val}"
    assert max_val <= 1, f"{col} exceeds 1: {max_val}"
    print(f"✓ {col}: range [{min_val:.4f}, {max_val:.4f}]")

print(f"\n✓ All {len(district_df)} districts validated successfully")

✓ Column schema matches UP exactly
✓ DEI: range [0.4827, 0.8517]
✓ ASS: range [0.2500, 0.7500]
✓ UBS: range [0.2500, 0.7500]
✓ SRS: range [0.2500, 0.7500]

✓ All 36 districts validated successfully


In [None]:
district_df.to_csv("maharashtra_district_analysis.csv", index=False)
print("Saved: maharashtra_district_analysis.csv")

final_df = district_df[["state", "district", "DEI", "ASS", "UBS", "SRS"]]
final_df.to_csv("maharashtra_district_final_scores.csv", index=False)
print("Saved: maharashtra_district_final_scores.csv")

print(f"\n=== OUTPUT SUMMARY ===")
print(f"Total districts: {len(district_df)}")
print(f"\nScore statistics:")
print(district_df[["DEI", "ASS", "UBS", "SRS"]].describe())

Saved: maharashtra_district_analysis.csv
Saved: maharashtra_district_final_scores.csv

=== OUTPUT SUMMARY ===
Total districts: 36

Score statistics:
             DEI        ASS        UBS        SRS
count  36.000000  36.000000  36.000000  36.000000
mean    0.692989   0.617087   0.448445   0.464109
std     0.092824   0.111182   0.112037   0.148819
min     0.482672   0.250000   0.250000   0.250000
25%     0.626163   0.591295   0.357133   0.356615
50%     0.690468   0.641457   0.426292   0.435627
75%     0.762774   0.689163   0.528468   0.558405
max     0.851740   0.750000   0.750000   0.750000


In [None]:
print("=== Maharashtra District Analysis (first 10 rows) ===")
district_df.head(10)

=== Maharashtra District Analysis (first 10 rows) ===


Unnamed: 0,state,district,month,age_0_5,age_5_17,age_18_greater,demo_age_5_17,demo_age_17_,bio_age_5_17,bio_age_17_,E,DU,BU,U,T,activity_ratio,zero_month_ratio,avg_monthly_enrolment,monthly_valatility,peak_load_ratio,biometric_burden,update_dominant,enrollment_update_balance,DEI,ASS,UBS,SRS
0,Maharashtra,ahilyanagar,1,1117,209,1,128.0,1708.0,3491,4383,1327,1836.0,7874,9710.0,11037.0,1.0,0.0,1954.75,0.686793,2.008469,0.676538,1,0.022763,0.588026,0.568366,0.432253,0.724105
1,Maharashtra,akola,1,359,36,8,35.0,384.0,1136,2060,403,419.0,3196,3615.0,4018.0,1.0,0.0,876.714286,0.503653,1.70413,0.742593,1,0.025729,0.668828,0.682768,0.543609,0.396888
2,Maharashtra,amravati,1,351,44,16,65.0,775.0,1355,4055,411,840.0,5410,6250.0,6661.0,1.0,0.0,1226.0,0.474771,1.664638,0.777711,1,0.02003,0.749659,0.645701,0.602812,0.345285
3,Maharashtra,beed,1,372,106,8,84.0,1152.0,1628,2385,486,1236.0,4013,5249.0,5735.0,1.0,0.0,1442.777778,0.548287,1.845016,0.691745,1,0.026065,0.699247,0.622697,0.457889,0.476636
4,Maharashtra,bhandara,1,125,7,0,19.0,254.0,481,956,132,273.0,1437,1710.0,1842.0,1.0,0.0,420.5,0.476813,1.419819,0.720819,1,0.018883,0.805923,0.731181,0.506902,0.348935
5,Maharashtra,buldhana,1,393,61,50,53.0,793.0,1532,2800,504,846.0,4332,5178.0,5682.0,1.0,0.0,1445.285714,0.613899,2.035546,0.669467,1,0.024421,0.482672,0.622431,0.420331,0.593865
6,Maharashtra,chandrapur,1,225,10,9,43.0,568.0,1149,2101,244,611.0,3250,3861.0,4105.0,1.0,0.0,683.625,0.593624,2.13094,0.660738,1,0.01662,0.665667,0.703258,0.405615,0.55764
7,Maharashtra,chhatrapati sambhajinagar,1,611,290,6,165.0,1483.0,2303,3512,907,1648.0,5815,7463.0,8370.0,1.0,0.0,2432.111111,0.526909,1.885142,0.664096,1,0.040035,0.684214,0.517709,0.411277,0.43844
8,Maharashtra,dharashiv,1,282,68,1,44.0,676.0,990,1387,351,720.0,2377,3097.0,3448.0,1.0,0.0,799.142857,0.623544,2.31054,0.568429,1,0.022396,0.602929,0.691,0.25,0.611098
9,Maharashtra,dhule,1,310,93,5,31.0,540.0,1139,1930,408,571.0,3069,3640.0,4048.0,1.0,0.0,1798.75,0.42144,1.659018,0.779885,1,0.049938,0.735696,0.584921,0.606477,0.25


In [None]:
print("=== Maharashtra District Final Scores ===")
final_df.sort_values("DEI", ascending=False).head(20)

=== Maharashtra District Final Scores ===


Unnamed: 0,state,district,DEI,ASS,UBS,SRS
17,Maharashtra,mumbai,0.85174,0.451623,0.515433,0.289141
30,Maharashtra,sindhudurg,0.843373,0.75,0.389966,0.432814
19,Maharashtra,nagpur,0.812126,0.599563,0.536889,0.265906
33,Maharashtra,wardha,0.810521,0.728811,0.612924,0.267393
4,Maharashtra,bhandara,0.805923,0.731181,0.506902,0.348935
27,Maharashtra,ratnagiri,0.805331,0.718924,0.496558,0.468755
32,Maharashtra,thane,0.797057,0.25,0.352058,0.305794
15,Maharashtra,kolhapur,0.787496,0.608453,0.357291,0.444755
25,Maharashtra,pune,0.783207,0.325784,0.380821,0.311371
18,Maharashtra,mumbai suburban,0.755964,0.530325,0.294934,0.29688
