In [21]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
import glob
import os

In [22]:
# Folder that holds the raw CSV exports. The previous expression,
# os.path.dirname(os.path.abspath('__file__')), passed the *string literal*
# '__file__' rather than a real path, so it only ever resolved to the current
# working directory. Ask for that directory explicitly instead.
base_path = os.getcwd()


def _find_csvs(prefix):
    """Return the sorted paths of every '<prefix>*.csv' file in base_path."""
    # glob.escape stops characters such as '[' or '*' in the folder name from
    # being interpreted as wildcard syntax.
    pattern = os.path.join(glob.escape(base_path), f'{prefix}*.csv')
    return sorted(glob.glob(pattern))


def _read_and_stack(paths, label):
    """Read every CSV in `paths` and stack them into one frame with a fresh index.

    Raises FileNotFoundError when `paths` is empty, instead of letting
    pd.concat fail with its generic "No objects to concatenate" error.
    """
    if not paths:
        raise FileNotFoundError(f'No {label} CSV files found in {base_path}')
    return pd.concat([pd.read_csv(path) for path in paths], ignore_index=True)


enrolment_files = _find_csvs('enrolment')
demographic_files = _find_csvs('demographic')
biometric_files = _find_csvs('biometric')

print(f'Enrolment files: {len(enrolment_files)}')
print(f'Demographic files: {len(demographic_files)}')
print(f'Biometric files: {len(biometric_files)}')

df_enrolment = _read_and_stack(enrolment_files, 'enrolment')
df_demographic = _read_and_stack(demographic_files, 'demographic')
df_biometric = _read_and_stack(biometric_files, 'biometric')

print('\nConsolidated shapes:')
print(f'Enrolment: {df_enrolment.shape}')
print(f'Demographic: {df_demographic.shape}')
print(f'Biometric: {df_biometric.shape}')

Enrolment files: 6
Demographic files: 5
Biometric files: 5

Consolidated shapes:
Enrolment: (504, 7)
Demographic: (1935, 6)
Biometric: (4654, 6)


In [23]:
# Convert the day-month-year text in each table's 'date' column into real
# timestamps. The explicit format means values that do not match raise an
# error instead of being guessed at.
for raw_frame in (df_enrolment, df_demographic, df_biometric):
    raw_frame['date'] = pd.to_datetime(raw_frame['date'], format='%d-%m-%Y')

In [24]:
# List the distinct district spellings found in each table so naming variants
# can be spotted before the tables are joined on this column.
for table_label, table in (
    ('Enrolment', df_enrolment),
    ('Demographic', df_demographic),
    ('Biometric', df_biometric),
):
    print(f'{table_label} districts:', table['district'].unique())

Enrolment districts: ['Dadra & Nagar Haveli' 'Dadra and Nagar Haveli' 'Daman' 'Diu'
 'Dadra And Nagar Haveli']
Demographic districts: ['Dadra & Nagar Haveli' 'Dadra and Nagar Haveli' 'Diu' 'Daman']
Biometric districts: ['Dadra & Nagar Haveli' 'Dadra and Nagar Haveli' 'Diu' 'Daman']


In [25]:
# Canonical (lower-case) spelling for each known district variant. Keys must
# already be lower-case with single spaces, because that is the form
# standardize_district() looks up.
cleanup_map = {
    'dadra & nagar haveli': 'dadra and nagar haveli',
    'dadra and nagar haveli': 'dadra and nagar haveli',
    'daman': 'daman',
    'diu': 'diu'
}

def standardize_district(name):
    """Return the canonical lower-case spelling of a district name.

    The value is stringified, lower-cased, and has every run of whitespace
    (leading, trailing, or internal; spaces or tabs) collapsed to a single
    space before it is looked up in cleanup_map. Names that are not in the map
    are returned in that normalised form rather than rejected.

    Because of the str() call, a missing value comes back as the text 'nan'.
    """
    # split() with no argument splits on any whitespace run and discards
    # leading/trailing blanks, so re-joining with one space also repairs
    # variants such as 'Dadra  &  Nagar Haveli' that a plain strip() would miss.
    key = ' '.join(str(name).lower().split())
    return cleanup_map.get(key, key)

# Rewrite the district column of every table with its standardised spelling.
for table in (df_enrolment, df_demographic, df_biometric):
    table['district'] = table['district'].apply(standardize_district)

# Print the distinct names again so the effect of the cleanup is visible.
print('\nAfter cleanup:')
for table_label, table in (
    ('Enrolment', df_enrolment),
    ('Demographic', df_demographic),
    ('Biometric', df_biometric),
):
    print(f'{table_label} districts:', sorted(table['district'].unique()))


After cleanup:
Enrolment districts: ['dadra and nagar haveli', 'daman', 'diu']
Demographic districts: ['dadra and nagar haveli', 'daman', 'diu']
Biometric districts: ['dadra and nagar haveli', 'daman', 'diu']


In [26]:
# Add the calendar month number of each record as a grouping key.
# NOTE(review): only the month is kept, not the year, so the same month from
# two different years would fall into one bucket -- confirm the data covers a
# single year.
for table in (df_enrolment, df_demographic, df_biometric):
    table['month'] = table['date'].dt.month

In [27]:
# Sum the three enrolment age bands per (district, month).
enrolment_bands = ['age_0_5', 'age_5_17', 'age_18_greater']
enrolment_agg = (
    df_enrolment
    .groupby(['district', 'month'])[enrolment_bands]
    .sum()
    .reset_index()
)

# E = enrolments across all age bands.
enrolment_agg['E'] = (
    enrolment_agg['age_0_5']
    + enrolment_agg['age_5_17']
    + enrolment_agg['age_18_greater']
)
print('Enrolment aggregated:', enrolment_agg.shape)

Enrolment aggregated: (19, 6)


In [28]:
# Sum the two demographic-update age bands per (district, month).
demographic_bands = ['demo_age_5_17', 'demo_age_17_']
demographic_agg = (
    df_demographic
    .groupby(['district', 'month'])[demographic_bands]
    .sum()
    .reset_index()
)

# DU = demographic updates across both age bands.
demographic_agg['DU'] = (
    demographic_agg['demo_age_5_17'] + demographic_agg['demo_age_17_']
)
print('Demographic aggregated:', demographic_agg.shape)

Demographic aggregated: (19, 5)


In [29]:
# Sum the two biometric-update age bands per (district, month).
biometric_bands = ['bio_age_5_17', 'bio_age_17_']
biometric_agg = (
    df_biometric
    .groupby(['district', 'month'])[biometric_bands]
    .sum()
    .reset_index()
)

# BU = biometric updates across both age bands.
biometric_agg['BU'] = (
    biometric_agg['bio_age_5_17'] + biometric_agg['bio_age_17_']
)
print('Biometric aggregated:', biometric_agg.shape)

Biometric aggregated: (33, 5)


In [30]:
# Full outer join of the three monthly tables on (district, month): a pair that
# appears in any table is kept, and the columns of the tables that lack it are
# filled with 0.
join_keys = ['district', 'month']
merged = (
    enrolment_agg
    .merge(demographic_agg, on=join_keys, how='outer')
    .merge(biometric_agg, on=join_keys, how='outer')
    .fillna(0)
)

print('Merged shape:', merged.shape)
print('Merged columns:', merged.columns.tolist())

Merged shape: (33, 12)
Merged columns: ['district', 'month', 'age_0_5', 'age_5_17', 'age_18_greater', 'E', 'demo_age_5_17', 'demo_age_17_', 'DU', 'bio_age_5_17', 'bio_age_17_', 'BU']


In [31]:
# U = all updates (demographic + biometric); T = all transactions.
merged['U'] = merged['DU'] + merged['BU']
merged['T'] = merged['E'] + merged['U']

# Updates as a fraction of all transactions. A zero total is swapped for NaN
# before dividing so those rows end up as 0 rather than a division by zero.
nonzero_total = merged['T'].replace(0, np.nan)
merged['activity_ratio'] = (merged['U'] / nonzero_total).fillna(0)

print('Computed E, DU, BU, U, T, activity_ratio')
preview_cols = ['district', 'month', 'E', 'DU', 'BU', 'U', 'T', 'activity_ratio']
print(merged[preview_cols].head(10))

Computed E, DU, BU, U, T, activity_ratio
                 district  month      E      DU    BU        U        T  \
0  dadra and nagar haveli      1   22.0   115.0   269    384.0    406.0   
1  dadra and nagar haveli      3    0.0  1259.0  3725   4984.0   4984.0   
2  dadra and nagar haveli      4    0.0   561.0  3599   4160.0   4160.0   
3  dadra and nagar haveli      5    0.0     0.0  5904   5904.0   5904.0   
4  dadra and nagar haveli      6  399.0     0.0  4197   4197.0   4596.0   
5  dadra and nagar haveli      7  467.0     0.0  3609   3609.0   4076.0   
6  dadra and nagar haveli      8    0.0     0.0  5113   5113.0   5113.0   
7  dadra and nagar haveli      9  409.0  2004.0  8551  10555.0  10964.0   
8  dadra and nagar haveli     10  126.0  1043.0  3516   4559.0   4685.0   
9  dadra and nagar haveli     11  189.0  1617.0  3035   4652.0   4841.0   

   activity_ratio  
0        0.945813  
1        1.000000  
2        1.000000  
3        1.000000  
4        0.913185  
5        0.88

In [32]:
# Collapse the monthly rows to one row per district: every count column is
# summed, while activity_ratio is the plain mean of the monthly ratios.
summed_cols = [
    'age_0_5', 'age_5_17', 'age_18_greater',
    'demo_age_5_17', 'demo_age_17_',
    'bio_age_5_17', 'bio_age_17_',
    'E', 'DU', 'BU', 'U', 'T',
]
district_spec = {col: 'sum' for col in summed_cols}
district_spec['activity_ratio'] = 'mean'
district_agg = merged.groupby('district').agg(district_spec).reset_index()

# Attach the month of each district's first row in `merged`.
# NOTE(review): "first" follows the row order of `merged`, not necessarily the
# smallest month number -- confirm that ordering is what is intended.
first_month = merged.groupby('district', as_index=False)['month'].first()
district_agg = district_agg.merge(first_month, on='district')

print('District aggregated with raw age bands:')
print(district_agg.columns.tolist())

District aggregated with raw age bands:
['district', 'age_0_5', 'age_5_17', 'age_18_greater', 'demo_age_5_17', 'demo_age_17_', 'bio_age_5_17', 'bio_age_17_', 'E', 'DU', 'BU', 'U', 'T', 'activity_ratio', 'month']


In [33]:
# Number of distinct months seen anywhere in the merged data.
total_months = merged['month'].nunique()

# For each district, how many distinct months had any activity (T > 0).
active_months = (
    merged.loc[merged['T'] > 0]
    .groupby('district')['month']
    .nunique()
)

# A month is a "zero month" for a district when the district shows no activity
# in it. That covers two cases: a (district, month) row that exists with
# T == 0, and -- the case the previous row-count approach missed -- a month
# for which the district has no row at all in `merged`. Counting
# total_months - active_months handles both.
# Districts with no active month are absent from `active_months`, hence the
# fillna(0) after the lookup.
months_with_activity = district_agg['district'].map(active_months).fillna(0)
district_agg['zero_month_ratio'] = (total_months - months_with_activity) / total_months

print('Zero month ratio computed')

Zero month ratio computed


In [34]:
# Per-district statistics of the monthly enrolment total E.
monthly_enrolment = (
    merged.groupby('district')['E']
    .agg(
        avg_monthly_enrolment='mean',
        monthly_valatility='std',
        peak_enrolment='max',
    )
    .reset_index()
)

# std is NaN when it cannot be computed (e.g. a single month); use 0 instead.
monthly_enrolment['monthly_valatility'] = monthly_enrolment['monthly_valatility'].fillna(0)

# Peak month relative to the average month; a zero average falls back to 1.
nonzero_average = monthly_enrolment['avg_monthly_enrolment'].replace(0, np.nan)
peak_vs_average = monthly_enrolment['peak_enrolment'] / nonzero_average
monthly_enrolment['peak_load_ratio'] = peak_vs_average.fillna(1)

# peak_enrolment was only needed for the ratio, so it is not carried over.
district_agg = district_agg.merge(
    monthly_enrolment.drop(columns=['peak_enrolment']),
    on='district',
)

print('Monthly metrics added')

Monthly metrics added


In [35]:
# Biometric updates as a fraction of all updates (0 when there are no updates).
nonzero_updates = district_agg['U'].replace(0, np.nan)
district_agg['biometric_burden'] = (district_agg['BU'] / nonzero_updates).fillna(0)

# 1 when updates outnumber enrolments, otherwise 0.
district_agg['update_dominant'] = district_agg['U'].gt(district_agg['E']).astype(int)

# Enrolments as a fraction of all transactions (0.5 when there are none).
nonzero_transactions = district_agg['T'].replace(0, np.nan)
district_agg['enrollment_update_balance'] = (
    district_agg['E'] / nonzero_transactions
).fillna(0.5)

print('Additional features added')
print(district_agg.columns.tolist())

Additional features added
['district', 'age_0_5', 'age_5_17', 'age_18_greater', 'demo_age_5_17', 'demo_age_17_', 'bio_age_5_17', 'bio_age_17_', 'E', 'DU', 'BU', 'U', 'T', 'activity_ratio', 'month', 'zero_month_ratio', 'avg_monthly_enrolment', 'monthly_valatility', 'peak_load_ratio', 'biometric_burden', 'update_dominant', 'enrollment_update_balance']


In [36]:
def normalize(series):
    """Min-max scale `series` so its smallest value maps to 0 and its largest to 1.

    When every value is identical there is no range to scale by, so a series
    of 0.5 with the same index is returned instead.
    """
    lowest, highest = series.min(), series.max()
    if highest == lowest:
        return pd.Series(0.5, index=series.index)
    value_range = highest - lowest
    return (series - lowest) / value_range

def inverse_normalize(series):
    """Return 1 minus the normalize() result, so the largest input scores lowest."""
    scaled = normalize(series)
    return 1 - scaled

In [37]:
# Component scores where a larger raw value gives a larger score.
direct_scores = {
    'access': 'T',
    'responsiveness': 'activity_ratio',
    'inclusion': 'avg_monthly_enrolment',
}
for score_name, raw_col in direct_scores.items():
    district_agg[score_name] = normalize(district_agg[raw_col])

# Component scores where a larger raw value gives a smaller score.
inverted_scores = {
    'stability': 'monthly_valatility',
    'visibility': 'zero_month_ratio',
}
for score_name, raw_col in inverted_scores.items():
    district_agg[score_name] = inverse_normalize(district_agg[raw_col])

# Each sub-index is the simple average of two components.
access_plus_inclusion = district_agg['access'] + district_agg['inclusion']
district_agg['ASS'] = access_plus_inclusion / 2

response_plus_burden = district_agg['responsiveness'] + district_agg['biometric_burden']
district_agg['UBS'] = response_plus_burden / 2

stability_plus_visibility = district_agg['stability'] + district_agg['visibility']
district_agg['SRS'] = stability_plus_visibility / 2

# DEI is the weighted sum of the sub-indices: 40% ASS, 30% UBS, 30% SRS.
weighted_ass = district_agg['ASS'] * 0.4
weighted_ubs = district_agg['UBS'] * 0.3
weighted_srs = district_agg['SRS'] * 0.3
district_agg['DEI'] = weighted_ass + weighted_ubs + weighted_srs

print('Scores computed')

Scores computed


In [38]:
# Constant label written to every row.
STATE_NAME = 'Dadra & Nagar Haveli and Daman & Diu'

# DataFrame.insert raises ValueError when the column already exists, which made
# this cell fail if it was run a second time. Dropping any earlier 'state'
# column first (errors='ignore' makes that a no-op on the first run) keeps the
# cell safe to re-run while still putting the column in first position.
district_agg = district_agg.drop(columns=['state'], errors='ignore')
district_agg.insert(0, 'state', STATE_NAME)

print('State column added')
print(district_agg.columns.tolist())

State column added
['state', 'district', 'age_0_5', 'age_5_17', 'age_18_greater', 'demo_age_5_17', 'demo_age_17_', 'bio_age_5_17', 'bio_age_17_', 'E', 'DU', 'BU', 'U', 'T', 'activity_ratio', 'month', 'zero_month_ratio', 'avg_monthly_enrolment', 'monthly_valatility', 'peak_load_ratio', 'biometric_burden', 'update_dominant', 'enrollment_update_balance', 'access', 'responsiveness', 'inclusion', 'stability', 'visibility', 'ASS', 'UBS', 'SRS', 'DEI']


In [39]:
# Output column layout, assembled group by group. Columns not listed here
# (the intermediate component scores) are left out of the final frame.
identifier_cols = ['state', 'district', 'month']
enrolment_band_cols = ['age_0_5', 'age_5_17', 'age_18_greater']
demographic_band_cols = ['demo_age_5_17', 'demo_age_17_']
biometric_band_cols = ['bio_age_5_17', 'bio_age_17_']
volume_cols = ['E', 'DU', 'BU', 'U', 'T']
ratio_cols = ['activity_ratio', 'zero_month_ratio']
monthly_metric_cols = ['avg_monthly_enrolment', 'monthly_valatility', 'peak_load_ratio']
derived_feature_cols = ['biometric_burden', 'update_dominant', 'enrollment_update_balance']
index_cols = ['DEI', 'ASS', 'UBS', 'SRS']

final_column_order = (
    identifier_cols
    + enrolment_band_cols
    + demographic_band_cols
    + biometric_band_cols
    + volume_cols
    + ratio_cols
    + monthly_metric_cols
    + derived_feature_cols
    + index_cols
)

district_agg = district_agg.loc[:, final_column_order]

print('Column order matched to Rajasthan schema:')
print(district_agg.columns.tolist())

Column order matched to Rajasthan schema:
['state', 'district', 'month', 'age_0_5', 'age_5_17', 'age_18_greater', 'demo_age_5_17', 'demo_age_17_', 'bio_age_5_17', 'bio_age_17_', 'E', 'DU', 'BU', 'U', 'T', 'activity_ratio', 'zero_month_ratio', 'avg_monthly_enrolment', 'monthly_valatility', 'peak_load_ratio', 'biometric_burden', 'update_dominant', 'enrollment_update_balance', 'DEI', 'ASS', 'UBS', 'SRS']


In [40]:
# Sanity check: report the observed range of each index and warn when a value
# falls outside [0, 1]. This only prints; it does not stop the notebook.
score_cols = ['DEI', 'ASS', 'UBS', 'SRS']

for score in score_cols:
    lowest = district_agg[score].min()
    highest = district_agg[score].max()
    print(f'{score}: min={lowest:.4f}, max={highest:.4f}')
    if lowest < 0 or highest > 1:
        print(f'  WARNING: {score} is outside [0, 1] range!')

column_count = district_agg.shape[1]
has_missing = district_agg.isna().to_numpy().any()
print(f'\nTotal columns: {column_count}')
print(f'Any NaN values: {has_missing}')

DEI: min=0.4914, max=0.6013
ASS: min=0.0000, max=1.0000
UBS: min=0.4209, max=0.8880
SRS: min=0.2500, max=0.7500

Total columns: 27
Any NaN values: False


In [41]:
# Show the headline indices per district, highest DEI first.
print('\nFinal DEI Scores:')
headline_cols = ['state', 'district', 'DEI', 'ASS', 'UBS', 'SRS']
ranked_scores = district_agg[headline_cols].sort_values('DEI', ascending=False)
print(ranked_scores)


Final DEI Scores:
                                  state                district       DEI  \
0  Dadra & Nagar Haveli and Daman & Diu  dadra and nagar haveli  0.601260   
1  Dadra & Nagar Haveli and Daman & Diu                   daman  0.508897   
2  Dadra & Nagar Haveli and Daman & Diu                     diu  0.491399   

        ASS       UBS       SRS  
0  1.000000  0.420866  0.250000  
1  0.064965  0.878068  0.731636  
2  0.000000  0.887996  0.750000  


In [42]:
# Write the full per-district table, without the index column.
analysis_csv = 'dnhdd_district_analysis.csv'
district_agg.to_csv(analysis_csv, index=False)
print(f'Saved: {analysis_csv}')

# Write a slim companion file holding only the identifiers and the four indices.
scores_csv = 'dnhdd_district_final_scores.csv'
final_scores = district_agg[['state', 'district', 'DEI', 'ASS', 'UBS', 'SRS']]
final_scores.to_csv(scores_csv, index=False)
print(f'Saved: {scores_csv}')

Saved: dnhdd_district_analysis.csv
Saved: dnhdd_district_final_scores.csv
