In [1]:
import numpy as np
import pandas as pd
from glob import glob
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler

In [2]:
pd.set_option("display.max_rows", 2000)
pd.set_option("display.max_columns", None)

In [3]:
enrol_df = pd.read_csv("enrolment.csv")
demo_df = pd.read_csv("demographic.csv")
bio_df = pd.read_csv("biometric.csv")

In [4]:
all_dfs = [enrol_df, demo_df, bio_df]

In [5]:
enrol_df.head()

Unnamed: 0,date,state,district,pincode,age_0_5,age_5_17,age_18_greater
0,01-04-2025,Rajasthan,Jalor,343049,54,27,14
1,01-04-2025,Rajasthan,Jodhpur,342301,210,101,12
2,01-05-2025,Rajasthan,Sirohi,307026,246,301,10
3,01-05-2025,Rajasthan,Bikaner,334001,421,164,20
4,01-06-2025,Rajasthan,Jaisalmer,345028,46,74,14


In [6]:
enrol_df["date"] = pd.to_datetime(enrol_df["date"], dayfirst=True)
demo_df["date"] = pd.to_datetime(demo_df["date"], dayfirst=True)
bio_df["date"] = pd.to_datetime(bio_df["date"], dayfirst=True)

In [7]:
print("Initial district counts:")
print(enrol_df["district"].nunique(), demo_df["district"].nunique(), bio_df["district"].nunique())

Initial district counts:
45 46 46


In [8]:
enrol_df["district"].value_counts().sort_index()

district
Ajmer               3041
Alwar               3236
Balotra                2
Banswara            1244
Baran                986
Barmer              2065
Beawar                 1
Bharatpur           2460
Bhilwara            2298
Bikaner             1850
Bundi               1136
Chittaurgarh         298
Chittorgarh         1679
Churu               1810
Dausa               1760
Deeg                  46
Deeg                   9
Dhaulpur             100
Dholpur             1017
Didwana-Kuchaman       4
Dungarpur           1764
Ganganagar          1576
Hanumangarh         1122
Jaipur              5597
Jaisalmer            719
Jalor               1571
Jalore                 6
Jhalawar            1352
Jhunjhunu            308
Jhunjhunun          2876
Jodhpur             2973
Karauli             1720
Khairthal-Tijara       1
Kota                2044
Kotputli-Behror        2
Nagaur              3242
Pali                2840
Pratapgarh           857
Rajsamand           1505
Salumbar        

In [9]:
bio_df["district"].value_counts().sort_index()

district
Ajmer               11821
Alwar               10690
Balotra               471
Banswara             4027
Baran                3179
Barmer               6119
Beawar                401
Bharatpur            7260
Bhilwara             7333
Bikaner              5927
Bundi                3888
Chittaurgarh         2413
Chittorgarh          7043
Churu                5564
Dausa                5872
Deeg                  457
Deeg                  796
Dhaulpur              516
Dholpur              2923
Didwana-Kuchaman      714
Dungarpur            5634
Ganganagar           6262
Hanumangarh          4177
Jaipur              17826
Jaisalmer            2855
Jalor                4668
Jalore                946
Jhalawar             4072
Jhunjhunu            3769
Jhunjhunun          10574
Jodhpur              9395
Karauli              5521
Khairthal-Tijara      540
Kota                 6644
Kotputli-Behror       592
Nagaur              10369
Pali                 9640
Phalodi               263
Pra

In [10]:
demo_df["district"].value_counts().sort_index()

district
Ajmer               5108
Alwar               4643
Balotra              483
Banswara            1776
Baran               1405
Barmer              2640
Beawar               454
Bharatpur           3211
Bhilwara            3174
Bikaner             2660
Bundi               1692
Chittaurgarh         893
Chittorgarh         2952
Churu               2451
Dausa               2563
Deeg                 210
Deeg                 647
Dhaulpur             168
Dholpur             1324
Didwana-Kuchaman     808
Dungarpur           2476
Ganganagar          2714
Hanumangarh         1796
Jaipur              7778
Jaisalmer           1227
Jalor               2018
Jalore               934
Jhalawar            1819
Jhunjhunu           2022
Jhunjhunun          4572
Jodhpur             4087
Karauli             2505
Khairthal-Tijara     435
Kota                2968
Kotputli-Behror      651
Nagaur              4471
Pali                4183
Phalodi              284
Pratapgarh          1343
Rajsamand       

In [11]:
import re

def clean_district_name(name):
    """Basic text cleaning: lowercase, strip, remove *, normalize spaces"""
    if pd.isna(name):
        return name
    name = str(name).lower().strip()
    name = name.replace('*', '') 
    name = re.sub(r'\s+', ' ', name) 
    name = name.strip()
    return name

print("=== Step 1: Basic Text Cleaning ===")
for df_name, df in [("Enrolment", enrol_df), ("Demographic", demo_df), ("Biometric", bio_df)]:
    before = df['district'].nunique()
    df['district'] = df['district'].apply(clean_district_name)
    after = df['district'].nunique()
    print(f"  {df_name}: {before} -> {after} unique districts")

=== Step 1: Basic Text Cleaning ===
  Enrolment: 45 -> 44 unique districts
  Demographic: 46 -> 45 unique districts
  Biometric: 46 -> 45 unique districts


In [None]:
cleanup_map = {
    "jalor": "jalore",
    "jhunjhunun": "jhunjhunu",
    "chittaurgarh": "chittorgarh",
    "dhaulpur": "dholpur",
    "ganganagar": "sri ganganagar",
}

print("\n=== Step 2: Canonical Mapping ===")
for df_name, df in [("Enrolment", enrol_df), ("Demographic", demo_df), ("Biometric", bio_df)]:
    replacements = {}
    for old_name, new_name in cleanup_map.items():
        if old_name == new_name:
            continue
        count = (df['district'] == old_name).sum()
        if count > 0:
            replacements[old_name] = (new_name, count)
    
    df['district'] = df['district'].replace(cleanup_map)
    
    if replacements:
        print(f"\n{df_name}:")
        for old, (new, cnt) in sorted(replacements.items()):
            print(f"  '{old}' -> '{new}' ({cnt:,} rows)")
    else:
        print(f"\n{df_name}: No additional replacements needed")

print("\n=== Step 3: Final District Counts ===")
print(enrol_df["district"].nunique(), demo_df["district"].nunique(), bio_df["district"].nunique())


=== Step 2: Canonical Mapping ===

Enrolment:
  'chittaurgarh' -> 'chittorgarh' (298 rows)
  'dhaulpur' -> 'dholpur' (100 rows)
  'ganganagar' -> 'sri ganganagar' (1,576 rows)
  'jalor' -> 'jalore' (1,571 rows)
  'jhunjhunun' -> 'jhunjhunu' (2,876 rows)

Demographic:
  'chittaurgarh' -> 'chittorgarh' (893 rows)
  'dhaulpur' -> 'dholpur' (168 rows)
  'ganganagar' -> 'sri ganganagar' (2,714 rows)
  'jalor' -> 'jalore' (2,018 rows)
  'jhunjhunun' -> 'jhunjhunu' (4,572 rows)

Biometric:
  'chittaurgarh' -> 'chittorgarh' (2,413 rows)
  'dhaulpur' -> 'dholpur' (516 rows)
  'ganganagar' -> 'sri ganganagar' (6,262 rows)
  'jalor' -> 'jalore' (4,668 rows)
  'jhunjhunun' -> 'jhunjhunu' (10,574 rows)

=== Step 3: Final District Counts ===
40 41 41


In [13]:
all_districts = sorted(set(enrol_df['district'].unique()) | 
                       set(demo_df['district'].unique()) | 
                       set(bio_df['district'].unique()))
print(f"Total unique districts across all datasets: {len(all_districts)}")
for d in all_districts:
    print(f"  - {d}")

Total unique districts across all datasets: 41
  - ajmer
  - alwar
  - balotra
  - banswara
  - baran
  - barmer
  - beawar
  - bharatpur
  - bhilwara
  - bikaner
  - bundi
  - chittorgarh
  - churu
  - dausa
  - deeg
  - dholpur
  - didwana-kuchaman
  - dungarpur
  - hanumangarh
  - jaipur
  - jaisalmer
  - jalore
  - jhalawar
  - jhunjhunu
  - jodhpur
  - karauli
  - khairthal-tijara
  - kota
  - kotputli-behror
  - nagaur
  - pali
  - phalodi
  - pratapgarh
  - rajsamand
  - salumbar
  - sawai madhopur
  - sikar
  - sirohi
  - sri ganganagar
  - tonk
  - udaipur


In [14]:
print(enrol_df["pincode"].nunique(), demo_df["pincode"].nunique(), bio_df["pincode"].nunique())

979 990 993


In [15]:
enrol_pincodes = set(enrol_df["pincode"].dropna())
demo_pincodes = set(demo_df["pincode"].dropna())
bio_pincodes = set(bio_df["pincode"].dropna())

print("Enrollment pincodes:", len(enrol_pincodes))
print("Demographic pincodes:", len(demo_pincodes))
print("Biometric pincodes:", len(bio_pincodes))
print()

only_in_enrol = enrol_pincodes - demo_pincodes - bio_pincodes
only_in_demo = demo_pincodes - enrol_pincodes - bio_pincodes
only_in_bio = bio_pincodes - enrol_pincodes - demo_pincodes
in_all = enrol_pincodes & demo_pincodes & bio_pincodes

print(f"Pincodes only in enrollment: {len(only_in_enrol)}")
print(f"Pincodes only in demographic: {len(only_in_demo)}")
print(f"Pincodes only in biometric: {len(only_in_bio)}")
print(f"Pincodes in all three: {len(in_all)}")
print()

print("NULL pincodes:")
print(f"Enrollment: {enrol_df['pincode'].isna().sum()}")
print(f"Demographic: {demo_df['pincode'].isna().sum()}")
print(f"Biometric: {bio_df['pincode'].isna().sum()}")

Enrollment pincodes: 979
Demographic pincodes: 990
Biometric pincodes: 993

Pincodes only in enrollment: 0
Pincodes only in demographic: 0
Pincodes only in biometric: 2
Pincodes in all three: 978

NULL pincodes:
Enrollment: 0
Demographic: 0
Biometric: 0


In [16]:
for df in all_dfs:
    df["month"] = df["date"].dt.month

In [17]:
enrol_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 67523 entries, 0 to 67522
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   date            67523 non-null  datetime64[ns]
 1   state           67523 non-null  object        
 2   district        67523 non-null  object        
 3   pincode         67523 non-null  int64         
 4   age_0_5         67523 non-null  int64         
 5   age_5_17        67523 non-null  int64         
 6   age_18_greater  67523 non-null  int64         
 7   month           67523 non-null  int32         
dtypes: datetime64[ns](1), int32(1), int64(4), object(2)
memory usage: 3.9+ MB


In [18]:
demo_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 105878 entries, 0 to 105877
Data columns (total 7 columns):
 #   Column         Non-Null Count   Dtype         
---  ------         --------------   -----         
 0   date           105878 non-null  datetime64[ns]
 1   state          105878 non-null  object        
 2   district       105878 non-null  object        
 3   pincode        105878 non-null  int64         
 4   demo_age_5_17  105878 non-null  int64         
 5   demo_age_17_   105878 non-null  int64         
 6   month          105878 non-null  int32         
dtypes: datetime64[ns](1), int32(1), int64(3), object(2)
memory usage: 5.3+ MB


In [19]:
bio_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 235880 entries, 0 to 235879
Data columns (total 7 columns):
 #   Column        Non-Null Count   Dtype         
---  ------        --------------   -----         
 0   date          235880 non-null  datetime64[ns]
 1   state         235880 non-null  object        
 2   district      235880 non-null  object        
 3   pincode       235880 non-null  int64         
 4   bio_age_5_17  235880 non-null  int64         
 5   bio_age_17_   235880 non-null  int64         
 6   month         235880 non-null  int32         
dtypes: datetime64[ns](1), int32(1), int64(3), object(2)
memory usage: 11.7+ MB


In [20]:
enrol_df.head()

Unnamed: 0,date,state,district,pincode,age_0_5,age_5_17,age_18_greater,month
0,2025-04-01,Rajasthan,jalore,343049,54,27,14,4
1,2025-04-01,Rajasthan,jodhpur,342301,210,101,12,4
2,2025-05-01,Rajasthan,sirohi,307026,246,301,10,5
3,2025-05-01,Rajasthan,bikaner,334001,421,164,20,5
4,2025-06-01,Rajasthan,jaisalmer,345028,46,74,14,6


In [21]:
demo_df.head()

Unnamed: 0,date,state,district,pincode,demo_age_5_17,demo_age_17_,month
0,2025-03-01,Rajasthan,sirohi,307513,18,201,3
1,2025-03-01,Rajasthan,jaipur,303903,47,647,3
2,2025-03-01,Rajasthan,jaisalmer,345021,76,745,3
3,2025-03-01,Rajasthan,nagaur,341022,81,1073,3
4,2025-03-01,Rajasthan,jodhpur,342901,85,982,3


In [22]:
bio_df.head()

Unnamed: 0,date,state,district,pincode,bio_age_5_17,bio_age_17_,month
0,2025-03-01,Rajasthan,dausa,303501,298,463,3
1,2025-03-01,Rajasthan,jaisalmer,345027,306,319,3
2,2025-03-01,Rajasthan,bikaner,334602,100,130,3
3,2025-03-01,Rajasthan,rajsamand,313341,173,159,3
4,2025-03-01,Rajasthan,sikar,332031,77,251,3


In [23]:
enrol_agg = enrol_df.groupby(["state", "district", "month"])[["age_0_5", "age_5_17", "age_18_greater"]].sum().reset_index()
demo_agg = demo_df.groupby(["state", "district", "month"])[["demo_age_5_17", "demo_age_17_"]].sum().reset_index()
bio_agg = bio_df.groupby(["state", "district", "month"])[["bio_age_5_17", "bio_age_17_"]].sum().reset_index()

combined_df = enrol_agg.merge(demo_agg, on = ["state", "district", "month"], how = "left").merge(bio_agg, on = ["state", "district", "month"], how = "left")
combined_df.fillna(0, inplace=True)
combined_df.head()

Unnamed: 0,state,district,month,age_0_5,age_5_17,age_18_greater,demo_age_5_17,demo_age_17_,bio_age_5_17,bio_age_17_
0,Rajasthan,ajmer,1,389,212,7,161.0,1519.0,1518.0,1463.0
1,Rajasthan,ajmer,4,1007,196,36,827.0,6198.0,7015.0,9850.0
2,Rajasthan,ajmer,6,1057,623,21,645.0,5375.0,11507.0,13169.0
3,Rajasthan,ajmer,7,673,241,18,1284.0,7687.0,12359.0,10676.0
4,Rajasthan,ajmer,9,3125,980,17,1246.0,7841.0,24130.0,7804.0


In [24]:
combined_df["E"] = combined_df["age_0_5"] + combined_df["age_5_17"] + combined_df["age_18_greater"]
combined_df["DU"] = combined_df["demo_age_5_17"] + combined_df["demo_age_17_"]
combined_df["BU"] = combined_df["bio_age_5_17"] + combined_df["bio_age_17_"]
combined_df["U"] = combined_df["DU"] + combined_df["BU"]
combined_df["T"] = combined_df["E"] + combined_df["U"]
combined_df.head(20)

Unnamed: 0,state,district,month,age_0_5,age_5_17,age_18_greater,demo_age_5_17,demo_age_17_,bio_age_5_17,bio_age_17_,E,DU,BU,U,T
0,Rajasthan,ajmer,1,389,212,7,161.0,1519.0,1518.0,1463.0,608,1680.0,2981.0,4661.0,5269.0
1,Rajasthan,ajmer,4,1007,196,36,827.0,6198.0,7015.0,9850.0,1239,7025.0,16865.0,23890.0,25129.0
2,Rajasthan,ajmer,6,1057,623,21,645.0,5375.0,11507.0,13169.0,1701,6020.0,24676.0,30696.0,32397.0
3,Rajasthan,ajmer,7,673,241,18,1284.0,7687.0,12359.0,10676.0,932,8971.0,23035.0,32006.0,32938.0
4,Rajasthan,ajmer,9,3125,980,17,1246.0,7841.0,24130.0,7804.0,4122,9087.0,31934.0,41021.0,45143.0
5,Rajasthan,ajmer,10,1164,403,6,708.0,5469.0,7855.0,7869.0,1573,6177.0,15724.0,21901.0,23474.0
6,Rajasthan,ajmer,11,1967,552,66,1261.0,14641.0,7348.0,9007.0,2585,15902.0,16355.0,32257.0,34842.0
7,Rajasthan,ajmer,12,1264,397,6,2187.0,20339.0,12284.0,11331.0,1667,22526.0,23615.0,46141.0,47808.0
8,Rajasthan,alwar,1,572,410,4,277.0,2063.0,2609.0,2295.0,986,2340.0,4904.0,7244.0,8230.0
9,Rajasthan,alwar,4,784,209,39,574.0,4625.0,10615.0,14555.0,1032,5199.0,25170.0,30369.0,31401.0


In [25]:
district_monthly_counts = combined_df.groupby(["district", "month"]).agg(total_months = ("month", "count"), active_months = ("T", lambda x : (x > 0).sum())).reset_index()
district_monthly_counts.head()

Unnamed: 0,district,month,total_months,active_months
0,ajmer,1,1,1
1,ajmer,4,1,1
2,ajmer,6,1,1
3,ajmer,7,1,1
4,ajmer,9,1,1


In [26]:
district_monthly_counts["zero_months"] = district_monthly_counts["total_months"] - district_monthly_counts["active_months"]
district_monthly_counts["activity_ratio"] = district_monthly_counts["active_months"] / district_monthly_counts["total_months"]
district_monthly_counts["zero_month_ratio"] = district_monthly_counts["zero_months"] / district_monthly_counts["total_months"]

combined_df = combined_df.merge(district_monthly_counts[["district", "month", "activity_ratio", "zero_month_ratio"]], on = ["district", "month"], how = "left")
combined_df.head()

Unnamed: 0,state,district,month,age_0_5,age_5_17,age_18_greater,demo_age_5_17,demo_age_17_,bio_age_5_17,bio_age_17_,E,DU,BU,U,T,activity_ratio,zero_month_ratio
0,Rajasthan,ajmer,1,389,212,7,161.0,1519.0,1518.0,1463.0,608,1680.0,2981.0,4661.0,5269.0,1.0,0.0
1,Rajasthan,ajmer,4,1007,196,36,827.0,6198.0,7015.0,9850.0,1239,7025.0,16865.0,23890.0,25129.0,1.0,0.0
2,Rajasthan,ajmer,6,1057,623,21,645.0,5375.0,11507.0,13169.0,1701,6020.0,24676.0,30696.0,32397.0,1.0,0.0
3,Rajasthan,ajmer,7,673,241,18,1284.0,7687.0,12359.0,10676.0,932,8971.0,23035.0,32006.0,32938.0,1.0,0.0
4,Rajasthan,ajmer,9,3125,980,17,1246.0,7841.0,24130.0,7804.0,4122,9087.0,31934.0,41021.0,45143.0,1.0,0.0


In [27]:
print("Columns:", combined_df.columns.tolist())

Columns: ['state', 'district', 'month', 'age_0_5', 'age_5_17', 'age_18_greater', 'demo_age_5_17', 'demo_age_17_', 'bio_age_5_17', 'bio_age_17_', 'E', 'DU', 'BU', 'U', 'T', 'activity_ratio', 'zero_month_ratio']


In [28]:
district_volume_metrics = combined_df.groupby(["state", "district"]).agg(avg_monthly_enrolment = ("E", "mean"),
                                               monthly_valatility = ("T", lambda x: x.std(ddof=0) / x.mean() if x.mean() > 0 else 0),
                                               peak_load_ratio = ("T", lambda x: x.max() / x.mean() if x.mean() > 0 else 0)).reset_index()


combined_df = combined_df.merge(district_volume_metrics, on=["state", "district"], how="left")
combined_df.head()

Unnamed: 0,state,district,month,age_0_5,age_5_17,age_18_greater,demo_age_5_17,demo_age_17_,bio_age_5_17,bio_age_17_,E,DU,BU,U,T,activity_ratio,zero_month_ratio,avg_monthly_enrolment,monthly_valatility,peak_load_ratio
0,Rajasthan,ajmer,1,389,212,7,161.0,1519.0,1518.0,1463.0,608,1680.0,2981.0,4661.0,5269.0,1.0,0.0,1803.375,0.40584,1.548437
1,Rajasthan,ajmer,4,1007,196,36,827.0,6198.0,7015.0,9850.0,1239,7025.0,16865.0,23890.0,25129.0,1.0,0.0,1803.375,0.40584,1.548437
2,Rajasthan,ajmer,6,1057,623,21,645.0,5375.0,11507.0,13169.0,1701,6020.0,24676.0,30696.0,32397.0,1.0,0.0,1803.375,0.40584,1.548437
3,Rajasthan,ajmer,7,673,241,18,1284.0,7687.0,12359.0,10676.0,932,8971.0,23035.0,32006.0,32938.0,1.0,0.0,1803.375,0.40584,1.548437
4,Rajasthan,ajmer,9,3125,980,17,1246.0,7841.0,24130.0,7804.0,4122,9087.0,31934.0,41021.0,45143.0,1.0,0.0,1803.375,0.40584,1.548437


In [29]:
district_update_burden = combined_df.groupby(["state", "district"]).agg(avg_monthly_enrollments = ("E", "sum"), avg_monthly_demo_updates = ("DU", "sum"), avg_monthly_bio_updates = ("BU", "sum")).reset_index()

district_update_burden["U"] = district_update_burden["avg_monthly_demo_updates"] + district_update_burden["avg_monthly_bio_updates"]
district_update_burden["biometric_burden"] = district_update_burden["avg_monthly_bio_updates"] / (district_update_burden["avg_monthly_bio_updates"] + district_update_burden["avg_monthly_demo_updates"])
district_update_burden["update_dominant"] = np.where(district_update_burden["U"] > district_update_burden["avg_monthly_enrollments"], 1, 0)
district_update_burden["enrollment_update_balance"] = district_update_burden["avg_monthly_enrollments"] / (district_update_burden["avg_monthly_enrollments"] + district_update_burden["U"])

combined_df = combined_df.merge(district_update_burden[["state", "district", "biometric_burden", "update_dominant", "enrollment_update_balance"]], on=["state", "district"], how="left")
combined_df.head()

Unnamed: 0,state,district,month,age_0_5,age_5_17,age_18_greater,demo_age_5_17,demo_age_17_,bio_age_5_17,bio_age_17_,E,DU,BU,U,T,activity_ratio,zero_month_ratio,avg_monthly_enrolment,monthly_valatility,peak_load_ratio,biometric_burden,update_dominant,enrollment_update_balance
0,Rajasthan,ajmer,1,389,212,7,161.0,1519.0,1518.0,1463.0,608,1680.0,2981.0,4661.0,5269.0,1.0,0.0,1803.375,0.40584,1.548437,0.667253,1,0.058409
1,Rajasthan,ajmer,4,1007,196,36,827.0,6198.0,7015.0,9850.0,1239,7025.0,16865.0,23890.0,25129.0,1.0,0.0,1803.375,0.40584,1.548437,0.667253,1,0.058409
2,Rajasthan,ajmer,6,1057,623,21,645.0,5375.0,11507.0,13169.0,1701,6020.0,24676.0,30696.0,32397.0,1.0,0.0,1803.375,0.40584,1.548437,0.667253,1,0.058409
3,Rajasthan,ajmer,7,673,241,18,1284.0,7687.0,12359.0,10676.0,932,8971.0,23035.0,32006.0,32938.0,1.0,0.0,1803.375,0.40584,1.548437,0.667253,1,0.058409
4,Rajasthan,ajmer,9,3125,980,17,1246.0,7841.0,24130.0,7804.0,4122,9087.0,31934.0,41021.0,45143.0,1.0,0.0,1803.375,0.40584,1.548437,0.667253,1,0.058409


In [30]:
cols_to_drop = [c for c in combined_df.columns if c.endswith('_y')]
if cols_to_drop:
    combined_df.drop(cols_to_drop, axis=1, inplace=True)
    combined_df.rename(columns={c: c.replace('_x', '') for c in combined_df.columns if c.endswith('_x')}, inplace=True)
print("Columns after cleanup:", combined_df.columns.tolist())

Columns after cleanup: ['state', 'district', 'month', 'age_0_5', 'age_5_17', 'age_18_greater', 'demo_age_5_17', 'demo_age_17_', 'bio_age_5_17', 'bio_age_17_', 'E', 'DU', 'BU', 'U', 'T', 'activity_ratio', 'zero_month_ratio', 'avg_monthly_enrolment', 'monthly_valatility', 'peak_load_ratio', 'biometric_burden', 'update_dominant', 'enrollment_update_balance']


In [31]:
combined_df = combined_df.groupby(["state", "district"], as_index = False).first()
combined_df.head()

Unnamed: 0,state,district,month,age_0_5,age_5_17,age_18_greater,demo_age_5_17,demo_age_17_,bio_age_5_17,bio_age_17_,E,DU,BU,U,T,activity_ratio,zero_month_ratio,avg_monthly_enrolment,monthly_valatility,peak_load_ratio,biometric_burden,update_dominant,enrollment_update_balance
0,Rajasthan,ajmer,1,389,212,7,161.0,1519.0,1518.0,1463.0,608,1680.0,2981.0,4661.0,5269.0,1.0,0.0,1803.375,0.40584,1.548437,0.667253,1,0.058409
1,Rajasthan,alwar,1,572,410,4,277.0,2063.0,2609.0,2295.0,986,2340.0,4904.0,7244.0,8230.0,1.0,0.0,2992.0,0.419058,1.647541,0.75268,1,0.06821
2,Rajasthan,balotra,1,0,1,0,4.0,23.0,0.0,0.0,1,27.0,0.0,27.0,28.0,1.0,0.0,1.0,0.902778,1.902778,0.482578,1,0.003472
3,Rajasthan,banswara,1,192,239,2,92.0,864.0,683.0,813.0,433,956.0,1496.0,2452.0,2885.0,1.0,0.0,1442.833333,0.51192,1.717438,0.610976,1,0.067025
4,Rajasthan,baran,1,172,98,3,61.0,662.0,585.0,1154.0,273,723.0,1739.0,2462.0,2735.0,1.0,0.0,918.875,0.543877,1.939508,0.694387,1,0.058418


In [32]:
def normalize(x):
    maxx, minx = x.max(), x.min()
    if maxx == minx:
        return x * 0 + 0.5
    normalized = (x - minx) / (maxx - minx)
    return normalized

def inverse_normalize(x):
    inversed = 1 - normalize(x)
    return inversed

In [33]:
combined_df["access"] = (combined_df["activity_ratio"] + normalize(combined_df["avg_monthly_enrolment"])) / 2
combined_df["responsiveness"] = normalize(combined_df["U"] / (combined_df["E"] + combined_df["U"]))
combined_df["inclusion"] = normalize((combined_df["age_0_5"] + combined_df["age_5_17"]) / combined_df["E"])
combined_df["stability"] = (inverse_normalize(combined_df["monthly_valatility"]) + inverse_normalize(combined_df["peak_load_ratio"])) / 2
combined_df["visibility"] = combined_df["activity_ratio"]

combined_df["DEI"] = (combined_df["access"] + combined_df["responsiveness"] + combined_df["inclusion"] + combined_df["stability"] + combined_df["visibility"]) / 5
combined_df["ASS"] = (inverse_normalize(combined_df["activity_ratio"]) + inverse_normalize(combined_df["avg_monthly_enrolment"])) / 2
combined_df["UBS"] = (normalize(combined_df["biometric_burden"]) + normalize(combined_df["update_dominant"])) / 2
combined_df["SRS"] = (normalize(combined_df["monthly_valatility"]) + normalize(combined_df["zero_month_ratio"])) / 2

combined_df.head()

Unnamed: 0,state,district,month,age_0_5,age_5_17,age_18_greater,demo_age_5_17,demo_age_17_,bio_age_5_17,bio_age_17_,E,DU,BU,U,T,activity_ratio,zero_month_ratio,avg_monthly_enrolment,monthly_valatility,peak_load_ratio,biometric_burden,update_dominant,enrollment_update_balance,access,responsiveness,inclusion,stability,visibility,DEI,ASS,UBS,SRS
0,Rajasthan,ajmer,1,389,212,7,161.0,1519.0,1518.0,1463.0,608,1680.0,2981.0,4661.0,5269.0,1.0,0.0,1803.375,0.40584,1.548437,0.667253,1,0.058409,0.720723,0.479025,0.988487,0.546515,1.0,0.74695,0.529277,0.583626,0.47067
1,Rajasthan,alwar,1,572,410,4,277.0,2063.0,2609.0,2295.0,986,2340.0,4904.0,7244.0,8230.0,1.0,0.0,2992.0,0.419058,1.647541,0.75268,1,0.06821,0.866285,0.458933,0.995943,0.497258,1.0,0.763684,0.383715,0.62634,0.477857
2,Rajasthan,balotra,1,0,1,0,4.0,23.0,0.0,0.0,1,27.0,0.0,27.0,28.0,1.0,0.0,1.0,0.902778,1.902778,0.482578,1,0.003472,0.5,0.841735,1.0,0.125892,1.0,0.693525,0.75,0.491289,0.740873
3,Rajasthan,banswara,1,192,239,2,92.0,864.0,683.0,813.0,433,956.0,1496.0,2452.0,2885.0,1.0,0.0,1442.833333,0.51192,1.717438,0.610976,1,0.067025,0.67657,0.321086,0.995381,0.417093,1.0,0.682026,0.57343,0.555488,0.528349
4,Rajasthan,baran,1,172,98,3,61.0,662.0,585.0,1154.0,273,723.0,1739.0,2462.0,2735.0,1.0,0.0,918.875,0.543877,1.939508,0.694387,1,0.058418,0.612405,0.549924,0.989011,0.305447,1.0,0.691358,0.637595,0.597193,0.545726


In [34]:
combined_df.drop(["access", "responsiveness", "inclusion", "stability", "visibility"], axis=1, inplace=True)
combined_df.head()

Unnamed: 0,state,district,month,age_0_5,age_5_17,age_18_greater,demo_age_5_17,demo_age_17_,bio_age_5_17,bio_age_17_,E,DU,BU,U,T,activity_ratio,zero_month_ratio,avg_monthly_enrolment,monthly_valatility,peak_load_ratio,biometric_burden,update_dominant,enrollment_update_balance,DEI,ASS,UBS,SRS
0,Rajasthan,ajmer,1,389,212,7,161.0,1519.0,1518.0,1463.0,608,1680.0,2981.0,4661.0,5269.0,1.0,0.0,1803.375,0.40584,1.548437,0.667253,1,0.058409,0.74695,0.529277,0.583626,0.47067
1,Rajasthan,alwar,1,572,410,4,277.0,2063.0,2609.0,2295.0,986,2340.0,4904.0,7244.0,8230.0,1.0,0.0,2992.0,0.419058,1.647541,0.75268,1,0.06821,0.763684,0.383715,0.62634,0.477857
2,Rajasthan,balotra,1,0,1,0,4.0,23.0,0.0,0.0,1,27.0,0.0,27.0,28.0,1.0,0.0,1.0,0.902778,1.902778,0.482578,1,0.003472,0.693525,0.75,0.491289,0.740873
3,Rajasthan,banswara,1,192,239,2,92.0,864.0,683.0,813.0,433,956.0,1496.0,2452.0,2885.0,1.0,0.0,1442.833333,0.51192,1.717438,0.610976,1,0.067025,0.682026,0.57343,0.555488,0.528349
4,Rajasthan,baran,1,172,98,3,61.0,662.0,585.0,1154.0,273,723.0,1739.0,2462.0,2735.0,1.0,0.0,918.875,0.543877,1.939508,0.694387,1,0.058418,0.691358,0.637595,0.597193,0.545726


In [35]:
print(f"Total districts: {len(combined_df)}")
print(f"\nScore summary statistics:")
for col in ['DEI', 'ASS', 'UBS', 'SRS']:
    print(f"  {col}: min={combined_df[col].min():.4f}, max={combined_df[col].max():.4f}, mean={combined_df[col].mean():.4f}")

for col in ['DEI', 'ASS', 'UBS', 'SRS']:
    assert combined_df[col].min() >= 0, f"{col} has negative values"
    assert combined_df[col].max() <= 1, f"{col} exceeds 1"
print("\n✓ All scores in valid [0, 1] range")

Total districts: 40

Score summary statistics:
  DEI: min=0.5302, max=0.9000, mean=0.7223
  ASS: min=0.2500, max=0.7500, mean=0.5830
  UBS: min=0.2500, max=0.7500, mean=0.5768
  SRS: min=0.2500, max=0.7500, mean=0.4901

✓ All scores in valid [0, 1] range


In [36]:
combined_df.to_csv("rajasthan_district_analysis.csv", index=False)
final_df = combined_df[["state", "district", "DEI", "ASS", "UBS", "SRS"]]
final_df.to_csv("rajasthan_district_final_scores.csv", index=False)
print("✓ Saved rajasthan_district_analysis.csv")
print("✓ Saved rajasthan_district_final_scores.csv")

✓ Saved rajasthan_district_analysis.csv
✓ Saved rajasthan_district_final_scores.csv


In [37]:
final_df.sort_values('DEI', ascending=False)

Unnamed: 0,state,district,DEI,ASS,UBS,SRS
6,Rajasthan,beawar,0.9,0.75,0.392789,0.25
33,Rajasthan,salumbar,0.898448,0.75,0.405585,0.25
28,Rajasthan,kotputli-behror,0.851674,0.749878,0.25,0.25
19,Rajasthan,jaipur,0.812235,0.25,0.578499,0.447581
1,Rajasthan,alwar,0.763684,0.383715,0.62634,0.477857
18,Rajasthan,hanumangarh,0.758575,0.646169,0.591202,0.476435
23,Rajasthan,jhunjhunu,0.748488,0.552142,0.598466,0.488835
35,Rajasthan,sikar,0.747153,0.562483,0.603208,0.473069
0,Rajasthan,ajmer,0.74695,0.529277,0.583626,0.47067
29,Rajasthan,nagaur,0.745825,0.432654,0.636655,0.474577
