In [147]:
import numpy as np
import pandas as pd
from glob import glob
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler

In [148]:
pd.set_option("display.max_rows", 2000)
pd.set_option("display.max_columns", None)

In [149]:
enrol_df = pd.read_csv(r"D:\UIDAI hackathon\UP\Uttar Pradesh_enroll.csv")
demo_df = pd.read_csv(r"D:\UIDAI hackathon\UP\demographic.csv")
bio_df = pd.read_csv(r"D:\UIDAI hackathon\UP\biometric.csv")

In [150]:
up_df = [enrol_df, demo_df, bio_df]

In [151]:
enrol_df.head()

Unnamed: 0,date,state,district,pincode,age_0_5,age_5_17,age_18_greater
0,01-04-2025,Uttar Pradesh,Maharajganj,273305,141,176,13
1,01-04-2025,Uttar Pradesh,Banda,210001,250,177,30
2,01-04-2025,Uttar Pradesh,Lucknow,226005,83,92,23
3,01-04-2025,Uttar Pradesh,Kanpur Nagar,208004,73,44,16
4,01-04-2025,Uttar Pradesh,Bahraich,271824,68,58,11


In [152]:
enrol_df["date"] = pd.to_datetime(enrol_df["date"], dayfirst=True)
demo_df["date"] = pd.to_datetime(demo_df["date"], dayfirst=True)
bio_df["date"] = pd.to_datetime(bio_df["date"], dayfirst=True)

In [153]:
print(enrol_df["district"].nunique(), demo_df["district"].nunique(), bio_df["district"].nunique())

89 89 95


### Record Redundancy

In [191]:
raw_df = pd.concat(up_df, ignore_index=True)

district_redundancy = raw_df.groupby("district").size().reset_index(name="raw_record_count")

district_redundancy_median = district_redundancy["raw_record_count"].median()
district_redundancy_75th_percentile = district_redundancy["raw_record_count"].quantile(0.75)

def redundancy_flag(x):
    if x > district_redundancy_75th_percentile:
        return "Red"
    elif x > district_redundancy_median:
        return "Amber"
    else:
        return "Green"

district_redundancy["redundancy_flag"] = (
    district_redundancy["raw_record_count"].apply(redundancy_flag)
)

print(district_redundancy[district_redundancy["redundancy_flag"] == "Red"]["district"].count())
district_redundancy.head()


19


Unnamed: 0,district,raw_record_count,redundancy_flag
0,agra,11851,Amber
1,aligarh,14394,Red
2,ambedkar nagar,12373,Amber
3,amethi,14305,Red
4,amroha,6549,Green


In [155]:
enrol_df["district"].value_counts().sort_index()

district
Agra                          2361
Aligarh                       2477
Allahabad                     3119
Ambedkar Nagar                2085
Amethi                        2161
Amroha                         974
Auraiya                       1065
Ayodhya                        788
Azamgarh                      3536
Baghpat                       1148
Bagpat                           4
Bahraich                      1868
Ballia                        2732
Balrampur                     1130
Banda                          888
Bara Banki                    2513
Barabanki                      424
Bareilly                      1890
Basti                         1944
Bhadohi                        448
Bijnor                        2077
Budaun                        1678
Bulandshahar                   169
Bulandshahr                   2624
Chandauli                     1293
Chitrakoot                     583
Deoria                        2031
Etah                          1023
Etawah     

In [156]:
bio_df["district"].value_counts().sort_index()

district
Agra                           6574
Aligarh                        8231
Allahabad                      9953
Ambedkar Nagar                 7171
Ambedkar Nagar *                  1
Amethi                         8557
Amroha                         3075
Auraiya                        3881
Auraiya *                         1
Ayodhya                        7513
Azamgarh                      11236
Baghpat                        4084
Baghpat *                         3
Bagpat                           42
Bahraich                       5345
Ballia                         8845
Balrampur                      3299
Banda                          2906
Bara Banki                     8847
Barabanki                      3139
Bareilly                       5942
Basti                          6436
Bhadohi                        2887
Bijnor                         6313
Budaun                         5754
Bulandshahar                   1102
Bulandshahr                   10026
Chandauli          

In [157]:
demo_df["district"].value_counts().sort_index()

district
Agra                          2916
Aligarh                       3686
Allahabad                     4280
Ambedkar Nagar                3116
Amethi                        3587
Amroha                        1300
Auraiya                       1716
Ayodhya                       3067
Azamgarh                      4904
Baghpat                       1789
Baghpat *                        2
Bagpat                          16
Bahraich                      2313
Ballia                        3974
Balrampur                     1445
Banda                         1212
Bara Banki                    3819
Barabanki                     1309
Bareilly                      2560
Basti                         2776
Bhadohi                       1226
Bijnor                        2794
Budaun                        2519
Bulandshahar                   465
Bulandshahr                   4370
Chandauli                     1922
Chandauli *                      1
Chitrakoot                     766
Chitrakoot 

In [158]:
cleanup_map = {
    "Ambedkar Nagar *": "Ambedkar Nagar",
    "Gautam Buddh Nagar": "Gautam Buddha Nagar",
    "Auraiya *": "Auraiya",
    "Chandauli *": "Chandauli",
    "Chitrakoot *": "Chitrakoot",
    "Gautam Buddha Nagar *": "Gautam Buddh Nagar",
    "Jyotiba Phule Nagar *": "Amroha",
    "Mahoba *": "Mahoba",
    "Sant Kabir Nagar *": "Sant Kabir Nagar",
    "Baghpat *": "Baghpat",
    "Chandauli *": "Chandauli",
    "Allahabad": "Prayagraj",
    "Faizabad": "Ayodhya",
    "Jyotiba Phule Nagar": "Amroha",
    "Bara Banki": "Barabanki",
    "Bulandshahar": "Bulandshahr",
    "Kushi Nagar": "Kushinagar",
    "Kushinagar *": "Kushinagar",
    "Rae Bareli": "Raebareli",
    "Siddharth Nagar": "Siddharthnagar",
    "Shravasti": "Shrawasti",
    "Mahrajganj": "Maharajganj",
    "Bagpat": "Baghpat",
    "Sant Ravidas Nagar": "Bhadohi",
    "Sant Ravidas Nagar Bhadohi": "Bhadohi"
}

for df in up_df:
    df["district"] = df["district"].replace(cleanup_map)
    
print(enrol_df["district"].nunique(), demo_df["district"].nunique(), bio_df["district"].nunique())

75 75 76


In [159]:
enrol_df["district"] = enrol_df["district"].str.lower()
demo_df["district"] = demo_df["district"].str.lower()
bio_df["district"] = bio_df["district"].str.lower()

In [160]:
print(enrol_df["pincode"].nunique(), demo_df["pincode"].nunique(), bio_df["pincode"].nunique())

1742 1762 1771


In [161]:
# Analyze pincode differences
enrol_pincodes = set(enrol_df["pincode"].dropna())
demo_pincodes = set(demo_df["pincode"].dropna())
bio_pincodes = set(bio_df["pincode"].dropna())

print("Enrollment pincodes:", len(enrol_pincodes))
print("Demographic pincodes:", len(demo_pincodes))
print("Biometric pincodes:", len(bio_pincodes))
print()

# Find differences
only_in_enrol = enrol_pincodes - demo_pincodes - bio_pincodes
only_in_demo = demo_pincodes - enrol_pincodes - bio_pincodes
only_in_bio = bio_pincodes - enrol_pincodes - demo_pincodes
in_all = enrol_pincodes & demo_pincodes & bio_pincodes

print(f"Pincodes only in enrollment: {len(only_in_enrol)}")
print(f"Pincodes only in demographic: {len(only_in_demo)}")
print(f"Pincodes only in biometric: {len(only_in_bio)}")
print(f"Pincodes in all three: {len(in_all)}")
print()

# Check for NULL/NaN values
print("NULL pincodes:")
print(f"Enrollment: {enrol_df["pincode"].isna().sum()}")
print(f"Demographic: {demo_df["pincode"].isna().sum()}")
print(f"Biometric: {bio_df["pincode"].isna().sum()}")
print()

# Show some examples of unique pincodes
print("Sample pincodes only in enrollment:", list(only_in_enrol))
print("Sample pincodes only in demographic:", list(only_in_demo))
print("Sample pincodes only in biometric:", list(only_in_bio))

Enrollment pincodes: 1742
Demographic pincodes: 1762
Biometric pincodes: 1771

Pincodes only in enrollment: 0
Pincodes only in demographic: 3
Pincodes only in biometric: 9
Pincodes in all three: 1739

NULL pincodes:
Enrollment: 0
Demographic: 0
Biometric: 0

Sample pincodes only in enrollment: []
Sample pincodes only in demographic: [204106, 209748, 272141]
Sample pincodes only in biometric: [230145, 229315, 244101, 230123, 272269, 243406, 209746, 221719, 401208]


In [163]:
enrol_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 131509 entries, 0 to 131508
Data columns (total 7 columns):
 #   Column          Non-Null Count   Dtype         
---  ------          --------------   -----         
 0   date            131509 non-null  datetime64[ns]
 1   state           131509 non-null  object        
 2   district        131509 non-null  object        
 3   pincode         131509 non-null  int64         
 4   age_0_5         131509 non-null  int64         
 5   age_5_17        131509 non-null  int64         
 6   age_18_greater  131509 non-null  int64         
dtypes: datetime64[ns](1), int64(4), object(2)
memory usage: 7.0+ MB


In [164]:
demo_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 196621 entries, 0 to 196620
Data columns (total 6 columns):
 #   Column         Non-Null Count   Dtype         
---  ------         --------------   -----         
 0   date           196621 non-null  datetime64[ns]
 1   state          196621 non-null  object        
 2   district       196621 non-null  object        
 3   pincode        196621 non-null  int64         
 4   demo_age_5_17  196621 non-null  int64         
 5   demo_age_17_   196621 non-null  int64         
dtypes: datetime64[ns](1), int64(3), object(2)
memory usage: 9.0+ MB


In [165]:
bio_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 449911 entries, 0 to 449910
Data columns (total 6 columns):
 #   Column        Non-Null Count   Dtype         
---  ------        --------------   -----         
 0   date          449911 non-null  datetime64[ns]
 1   state         449911 non-null  object        
 2   district      449911 non-null  object        
 3   pincode       449911 non-null  int64         
 4   bio_age_5_17  449911 non-null  int64         
 5   bio_age_17_   449911 non-null  int64         
dtypes: datetime64[ns](1), int64(3), object(2)
memory usage: 20.6+ MB


In [166]:
enrol_df.head()

Unnamed: 0,date,state,district,pincode,age_0_5,age_5_17,age_18_greater
0,2025-04-01,Uttar Pradesh,maharajganj,273305,141,176,13
1,2025-04-01,Uttar Pradesh,banda,210001,250,177,30
2,2025-04-01,Uttar Pradesh,lucknow,226005,83,92,23
3,2025-04-01,Uttar Pradesh,kanpur nagar,208004,73,44,16
4,2025-04-01,Uttar Pradesh,bahraich,271824,68,58,11


In [167]:
demo_df.head()

Unnamed: 0,date,state,district,pincode,demo_age_5_17,demo_age_17_
0,2025-03-01,Uttar Pradesh,varanasi,221307,138,1301
1,2025-03-01,Uttar Pradesh,budaun,202523,102,1712
2,2025-03-01,Uttar Pradesh,fatehpur,212631,77,1221
3,2025-03-01,Uttar Pradesh,pilibhit,262203,78,1073
4,2025-03-01,Uttar Pradesh,deoria,274208,50,590


In [168]:
bio_df.head()

Unnamed: 0,date,state,district,pincode,bio_age_5_17,bio_age_17_
0,2025-04-01,Uttar Pradesh,mau,121705,137,55
1,2025-04-01,Uttar Pradesh,hardoi,241121,462,139
2,2025-04-01,Uttar Pradesh,kanpur nagar,208008,19,41
3,2025-04-01,Uttar Pradesh,azamgarh,224166,42,27
4,2025-04-01,Uttar Pradesh,ballia,221717,226,292


In [171]:
enrol_agg = enrol_df.groupby("district")[["age_0_5", "age_5_17", "age_18_greater"]].sum().reset_index()
demo_agg = demo_df.groupby("district")[["demo_age_5_17", "demo_age_17_"]].sum().reset_index()
bio_agg = bio_df.groupby("district")[["bio_age_5_17", "bio_age_17_"]].sum().reset_index()

In [172]:
combined_df = enrol_agg.merge(demo_agg, on = "district", how = "inner").merge(bio_agg, on = "district", how = "inner")
combined_df.head()

Unnamed: 0,district,age_0_5,age_5_17,age_18_greater,demo_age_5_17,demo_age_17_,bio_age_5_17,bio_age_17_
0,agra,17964,14564,1067,18590,182869,185441,121256
1,aligarh,15189,13336,666,15213,189100,177603,85999
2,ambedkar nagar,4423,5057,60,8096,75040,69682,53502
3,amethi,4512,4035,69,6894,71361,71953,52955
4,amroha,6577,3177,83,6910,88444,77106,44993


In [173]:
combined_df["total_enrolled"] = combined_df["age_0_5"] + combined_df["age_5_17"] + combined_df["age_18_greater"]
combined_df["total_demo"] = combined_df["demo_age_5_17"] + combined_df["demo_age_17_"]
combined_df["total_bio"] = combined_df["bio_age_5_17"] + combined_df["bio_age_17_"]
combined_df.head()

Unnamed: 0,district,age_0_5,age_5_17,age_18_greater,demo_age_5_17,demo_age_17_,bio_age_5_17,bio_age_17_,total_enrolled,total_demo,total_bio
0,agra,17964,14564,1067,18590,182869,185441,121256,33595,201459,306697
1,aligarh,15189,13336,666,15213,189100,177603,85999,29191,204313,263602
2,ambedkar nagar,4423,5057,60,8096,75040,69682,53502,9540,83136,123184
3,amethi,4512,4035,69,6894,71361,71953,52955,8616,78255,124908
4,amroha,6577,3177,83,6910,88444,77106,44993,9837,95354,122099


In [174]:
combined_df["total_interactions"] = combined_df["total_enrolled"] + combined_df["total_demo"] + combined_df["total_bio"]
combined_df.head()

Unnamed: 0,district,age_0_5,age_5_17,age_18_greater,demo_age_5_17,demo_age_17_,bio_age_5_17,bio_age_17_,total_enrolled,total_demo,total_bio,total_interactions
0,agra,17964,14564,1067,18590,182869,185441,121256,33595,201459,306697,541751
1,aligarh,15189,13336,666,15213,189100,177603,85999,29191,204313,263602,497106
2,ambedkar nagar,4423,5057,60,8096,75040,69682,53502,9540,83136,123184,215860
3,amethi,4512,4035,69,6894,71361,71953,52955,8616,78255,124908,211779
4,amroha,6577,3177,83,6910,88444,77106,44993,9837,95354,122099,227290


### Enrollment-to-Update Ratio

In [175]:
combined_df["etu_ratio"] = combined_df["total_enrolled"] / (combined_df["total_demo"] + combined_df["total_bio"])
combined_df.head()

Unnamed: 0,district,age_0_5,age_5_17,age_18_greater,demo_age_5_17,demo_age_17_,bio_age_5_17,bio_age_17_,total_enrolled,total_demo,total_bio,total_interactions,etu_ratio
0,agra,17964,14564,1067,18590,182869,185441,121256,33595,201459,306697,541751,0.066112
1,aligarh,15189,13336,666,15213,189100,177603,85999,29191,204313,263602,497106,0.062385
2,ambedkar nagar,4423,5057,60,8096,75040,69682,53502,9540,83136,123184,215860,0.046239
3,amethi,4512,4035,69,6894,71361,71953,52955,8616,78255,124908,211779,0.042409
4,amroha,6577,3177,83,6910,88444,77106,44993,9837,95354,122099,227290,0.045237


In [176]:
etu_median = combined_df["etu_ratio"].median()
etu_25_percentile = combined_df["etu_ratio"].quantile(0.25)

red, amber, green = [], [], []

for index, row in combined_df.iterrows():
    if row["etu_ratio"] < etu_25_percentile:
        red.append(row["district"])
    elif row["etu_ratio"] < etu_median:
        amber.append(row["district"])
    else:
        green.append(row["district"])
        
print("Red Zone Districts:", red)
print("Amber Zone Districts:", amber)
print("Green Zone Districts:", green)

Red Zone Districts: ['auraiya', 'ballia', 'barabanki', 'bhadohi', 'bijnor', 'deoria', 'farrukhabad', 'hamirpur', 'jalaun', 'jhansi', 'kushinagar', 'lalitpur', 'mahoba', 'mau', 'muzaffarnagar', 'prayagraj', 'raebareli', 'sant kabir nagar', 'sonbhadra']
Amber Zone Districts: ['ambedkar nagar', 'amethi', 'amroha', 'ayodhya', 'bulandshahr', 'chandauli', 'firozabad', 'ghazipur', 'hathras', 'jaunpur', 'kannauj', 'kasganj', 'kaushambi', 'mirzapur', 'pratapgarh', 'saharanpur', 'sambhal', 'shamli']
Green Zone Districts: ['agra', 'aligarh', 'azamgarh', 'baghpat', 'bahraich', 'balrampur', 'banda', 'bareilly', 'basti', 'budaun', 'chitrakoot', 'etah', 'etawah', 'fatehpur', 'gautam buddha nagar', 'ghaziabad', 'gonda', 'gorakhpur', 'hapur', 'hardoi', 'kanpur dehat', 'kanpur nagar', 'kheri', 'lucknow', 'maharajganj', 'mainpuri', 'mathura', 'meerut', 'moradabad', 'pilibhit', 'rampur', 'shahjahanpur', 'shrawasti', 'siddharthnagar', 'sitapur', 'sultanpur', 'unnao', 'varanasi']


### Biometric Burden Ratio

In [177]:
combined_df["bio_burder"] = combined_df["total_bio"] / (combined_df["total_demo"] + combined_df["total_bio"])

bio_burder_median = combined_df["bio_burder"].median()
bio_burder_80th_percentile = combined_df["bio_burder"].quantile(0.80)

red, amber, green = [], [], []

for index, row in combined_df.iterrows():
    if row["bio_burder"] >= bio_burder_80th_percentile:
        red.append(row["district"])
    elif (row["bio_burder"] >= bio_burder_median) and (row["bio_burder"] < bio_burder_80th_percentile):
        amber.append(row["district"])
    else:
        green.append(row["district"])

print("Red Zone Districts:", red)
print("Amber Zone Districts:", amber)
print("Green Zone Districts:", green)

Red Zone Districts: ['ayodhya', 'bahraich', 'barabanki', 'chitrakoot', 'etah', 'hardoi', 'hathras', 'jhansi', 'kasganj', 'kaushambi', 'lalitpur', 'mahoba', 'shamli', 'sitapur', 'sonbhadra']
Amber Zone Districts: ['agra', 'ambedkar nagar', 'amethi', 'auraiya', 'ballia', 'banda', 'basti', 'bhadohi', 'deoria', 'etawah', 'farrukhabad', 'gonda', 'hamirpur', 'jalaun', 'jaunpur', 'kannauj', 'mirzapur', 'muzaffarnagar', 'pratapgarh', 'prayagraj', 'saharanpur', 'sant kabir nagar', 'sultanpur']
Green Zone Districts: ['aligarh', 'amroha', 'azamgarh', 'baghpat', 'balrampur', 'bareilly', 'bijnor', 'budaun', 'bulandshahr', 'chandauli', 'fatehpur', 'firozabad', 'gautam buddha nagar', 'ghaziabad', 'ghazipur', 'gorakhpur', 'hapur', 'kanpur dehat', 'kanpur nagar', 'kheri', 'kushinagar', 'lucknow', 'maharajganj', 'mainpuri', 'mathura', 'mau', 'meerut', 'moradabad', 'pilibhit', 'raebareli', 'rampur', 'sambhal', 'shahjahanpur', 'shrawasti', 'siddharthnagar', 'unnao', 'varanasi']


### Identity Stability Index

In [178]:
combined_df["instability_index"] = (1 - ((combined_df["total_demo"] + combined_df["total_bio"]) / (combined_df["total_enrolled"] + combined_df["total_demo"] + combined_df["total_bio"])))

instability_median = combined_df["instability_index"].median()
instability_25th_percentile = combined_df["instability_index"].quantile(0.25)

red, amber, green = [], [], []

for index, row in combined_df.iterrows():
    if row["instability_index"] < instability_25th_percentile:
        red.append(row["district"])
    elif (row["instability_index"] >= instability_25th_percentile) and (row["instability_index"] < instability_median):
        amber.append(row["district"])
    else:
        green.append(row["district"])

print("Red Zone Districts:", red)
print("Amber Zone Districts:", amber)
print("Green Zone Districts:", green)

Red Zone Districts: ['auraiya', 'ballia', 'barabanki', 'bhadohi', 'bijnor', 'deoria', 'farrukhabad', 'hamirpur', 'jalaun', 'jhansi', 'kushinagar', 'lalitpur', 'mahoba', 'mau', 'muzaffarnagar', 'prayagraj', 'raebareli', 'sant kabir nagar', 'sonbhadra']
Amber Zone Districts: ['ambedkar nagar', 'amethi', 'amroha', 'ayodhya', 'bulandshahr', 'chandauli', 'firozabad', 'ghazipur', 'hathras', 'jaunpur', 'kannauj', 'kasganj', 'kaushambi', 'mirzapur', 'pratapgarh', 'saharanpur', 'sambhal', 'shamli']
Green Zone Districts: ['agra', 'aligarh', 'azamgarh', 'baghpat', 'bahraich', 'balrampur', 'banda', 'bareilly', 'basti', 'budaun', 'chitrakoot', 'etah', 'etawah', 'fatehpur', 'gautam buddha nagar', 'ghaziabad', 'gonda', 'gorakhpur', 'hapur', 'hardoi', 'kanpur dehat', 'kanpur nagar', 'kheri', 'lucknow', 'maharajganj', 'mainpuri', 'mathura', 'meerut', 'moradabad', 'pilibhit', 'rampur', 'shahjahanpur', 'shrawasti', 'siddharthnagar', 'sitapur', 'sultanpur', 'unnao', 'varanasi']


### Relative Update Dominance Flag

In [179]:
combined_df["update_dominance_flag"] = 0

for index, row in combined_df.iterrows():
    if row["etu_ratio"] < etu_median:
        combined_df.loc[index, "update_dominance_flag"] = 1
    else:
        combined_df.loc[index, "update_dominance_flag"] = 0
        
combined_df.head()

Unnamed: 0,district,age_0_5,age_5_17,age_18_greater,demo_age_5_17,demo_age_17_,bio_age_5_17,bio_age_17_,total_enrolled,total_demo,total_bio,total_interactions,etu_ratio,bio_burder,instability_index,update_dominance_flag
0,agra,17964,14564,1067,18590,182869,185441,121256,33595,201459,306697,541751,0.066112,0.603549,0.062012,0
1,aligarh,15189,13336,666,15213,189100,177603,85999,29191,204313,263602,497106,0.062385,0.563354,0.058722,0
2,ambedkar nagar,4423,5057,60,8096,75040,69682,53502,9540,83136,123184,215860,0.046239,0.597053,0.044195,1
3,amethi,4512,4035,69,6894,71361,71953,52955,8616,78255,124908,211779,0.042409,0.614817,0.040684,1
4,amroha,6577,3177,83,6910,88444,77106,44993,9837,95354,122099,227290,0.045237,0.561496,0.04328,1


In [180]:
combined_df[combined_df["update_dominance_flag"] == 1]["district"]

2       ambedkar nagar
3               amethi
4               amroha
5              auraiya
6              ayodhya
10              ballia
13           barabanki
16             bhadohi
17              bijnor
19         bulandshahr
20           chandauli
22              deoria
25         farrukhabad
27           firozabad
30            ghazipur
33            hamirpur
36             hathras
37              jalaun
38             jaunpur
39              jhansi
40             kannauj
43             kasganj
44           kaushambi
46          kushinagar
47            lalitpur
50              mahoba
53                 mau
55            mirzapur
57       muzaffarnagar
59          pratapgarh
60           prayagraj
61           raebareli
63          saharanpur
64             sambhal
65    sant kabir nagar
67              shamli
71           sonbhadra
Name: district, dtype: object

*observation needed*

### combined_df[["district", "total_enrolled", "total_updates", "update_dominance_flag"]].head()

In [181]:
combined_df[["district", "total_enrolled", "update_dominance_flag"]].head()

Unnamed: 0,district,total_enrolled,update_dominance_flag
0,agra,33595,0
1,aligarh,29191,0
2,ambedkar nagar,9540,1
3,amethi,8616,1
4,amroha,9837,1


### District Operational Stress Score

District Operational Stress Score=
4
1
	​

[(1−E/U Ratio)+Biometric Burden+(1−Identity Stability)+Update Dominance Flag]


In [182]:
combined_df.columns

Index(['district', 'age_0_5', 'age_5_17', 'age_18_greater', 'demo_age_5_17',
       'demo_age_17_', 'bio_age_5_17', 'bio_age_17_', 'total_enrolled',
       'total_demo', 'total_bio', 'total_interactions', 'etu_ratio',
       'bio_burder', 'instability_index', 'update_dominance_flag'],
      dtype='object')

In [183]:
weights = 0.25
scaler = MinMaxScaler()

norm_cols = ["etu_ratio", "bio_burder", "instability_index"]

for col in norm_cols:
    combined_df[f"{col}_norm"] = scaler.fit_transform(combined_df[[col]])

In [184]:
combined_df["etu_stress"] = 1 - combined_df["etu_ratio_norm"]
combined_df["stability_stress"] = 1 - combined_df["instability_index_norm"]
combined_df["bio_stress"] = combined_df["bio_burder_norm"]

combined_df["stress_score"] = weights * (combined_df["etu_stress"] + combined_df["stability_stress"] + combined_df["bio_stress"] + combined_df["update_dominance_flag"])
combined_df.head(10)

Unnamed: 0,district,age_0_5,age_5_17,age_18_greater,demo_age_5_17,demo_age_17_,bio_age_5_17,bio_age_17_,total_enrolled,total_demo,total_bio,total_interactions,etu_ratio,bio_burder,instability_index,update_dominance_flag,etu_ratio_norm,bio_burder_norm,instability_index_norm,etu_stress,stability_stress,bio_stress,stress_score
0,agra,17964,14564,1067,18590,182869,185441,121256,33595,201459,306697,541751,0.066112,0.603549,0.062012,0,0.473374,0.732919,0.491024,0.526626,0.508976,0.732919,0.44213
1,aligarh,15189,13336,666,15213,189100,177603,85999,29191,204313,263602,497106,0.062385,0.563354,0.058722,0,0.424006,0.553154,0.441357,0.575994,0.558643,0.553154,0.421948
2,ambedkar nagar,4423,5057,60,8096,75040,69682,53502,9540,83136,123184,215860,0.046239,0.597053,0.044195,1,0.21009,0.703867,0.222062,0.78991,0.777938,0.703867,0.817929
3,amethi,4512,4035,69,6894,71361,71953,52955,8616,78255,124908,211779,0.042409,0.614817,0.040684,1,0.159354,0.783313,0.169054,0.840646,0.830946,0.783313,0.863726
4,amroha,6577,3177,83,6910,88444,77106,44993,9837,95354,122099,227290,0.045237,0.561496,0.04328,1,0.196822,0.544842,0.208237,0.803178,0.791763,0.544842,0.784946
5,auraiya,2560,2031,44,5194,48104,55581,32684,4635,53298,88265,146198,0.032742,0.623503,0.031704,1,0.031271,0.822163,0.033486,0.968729,0.966514,0.822163,0.939351
6,ayodhya,5735,4216,42,7824,80534,90208,63660,9993,88358,153868,252219,0.041255,0.635225,0.03962,1,0.144059,0.874587,0.152998,0.855941,0.847002,0.874587,0.894382
7,azamgarh,7488,15839,355,16795,151220,121802,119162,23682,168015,240964,432661,0.057905,0.589184,0.054736,0,0.364651,0.668675,0.381181,0.635349,0.618819,0.668675,0.480711
8,baghpat,3947,3767,188,5128,54456,47802,34083,7902,59584,81885,149371,0.055857,0.578819,0.052902,0,0.337513,0.622319,0.353497,0.662487,0.646503,0.622319,0.482827
9,bahraich,15302,23788,2493,10165,136083,171020,75538,41583,146248,246558,434389,0.105861,0.627684,0.095728,0,1.0,0.84086,1.0,0.0,0.0,0.84086,0.210215


In [185]:
combined_df["stress_score"].median()

0.5646722528054884

In [186]:
combined_df.sort_values(by = "stress_score", ascending = False)[["district", "stress_score"]].head(10)


Unnamed: 0,district,stress_score
71,sonbhadra,1.0
47,lalitpur,0.948089
60,prayagraj,0.946275
5,auraiya,0.939351
50,mahoba,0.930948
39,jhansi,0.926852
13,barabanki,0.917901
33,hamirpur,0.906152
6,ayodhya,0.894382
65,sant kabir nagar,0.890189


In [187]:
combined_df[combined_df["stress_score"] >= 0.8][["district", "stress_score"]]

Unnamed: 0,district,stress_score
2,ambedkar nagar,0.817929
3,amethi,0.863726
5,auraiya,0.939351
6,ayodhya,0.894382
10,ballia,0.868402
13,barabanki,0.917901
16,bhadohi,0.884982
17,bijnor,0.818438
22,deoria,0.86907
25,farrukhabad,0.865762


In [190]:
combined_df.to_csv(r"up\analysis.csv", index = False)