In [87]:

import pandas as pd
import numpy as np
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

In [100]:
def load_csvs_from_folder(folder_path):
    dfs = []
    for file in sorted(os.listdir(folder_path)):
        if file.endswith(".csv"):
            file_path = os.path.join(folder_path, file)
            dfs.append(pd.read_csv(file_path))
    return pd.concat(dfs, ignore_index=True)


In [101]:
enrolment_folder = os.path.join(dataset_root, "api_data_aadhar_enrolment")

df_enr= load_csvs_from_folder(enrolment_folder)

print("Enrolment shape:", df_enr.shape)

demographic_folder = os.path.join(dataset_root, "api_data_aadhar_demographic")

df_demo = load_csvs_from_folder(demographic_folder)

print("Demographic shape:", df_demo.shape)

biometric_folder = os.path.join(dataset_root, "api_data_aadhar_biometric")

df_bio= load_csvs_from_folder(biometric_folder)

print("Biometric shape:", df_bio.shape)



In [102]:
for name, df in {
    "Enrolment": df_enr,
    "Demographic": df_demo,
    "Biometric": df_bio
}.items():
    print(
        name,
        df.duplicated(['date','state','district','pincode']).sum()
    )


Enrolment shape: (1006029, 7)
Demographic shape: (2071700, 6)
Biometric shape: (1861108, 6)


In [103]:
df_enr.describe()

Enrolment 22957
Demographic 473601
Biometric 94896


In [109]:
df_demo.describe()

Unnamed: 0,pincode,demo_age_5_17,demo_age_17_
count,2071700.0,2071700.0,2071700.0
mean,527831.8,2.347552,21.44701
std,197293.3,14.90355,125.2498
min,100000.0,0.0,0.0
25%,396469.0,0.0,2.0
50%,524322.0,1.0,6.0
75%,695507.0,2.0,15.0
max,855456.0,2690.0,16166.0


In [108]:
df_bio.describe()

Unnamed: 0,pincode,bio_age_5_17,bio_age_17_
count,1861108.0,1861108.0,1861108.0
mean,521761.2,18.39058,19.09413
std,198162.7,83.70421,88.06502
min,110001.0,0.0,0.0
25%,391175.0,1.0,1.0
50%,522401.0,3.0,4.0
75%,686636.2,11.0,10.0
max,855456.0,8002.0,7625.0


In [111]:
df_bio['date'] = pd.to_datetime(df_bio['date'],dayfirst=True)
df_bio['pincode'] = df_bio['pincode'].astype(str)
df_demo['date'] = pd.to_datetime(df_demo['date'],dayfirst=True)
df_demo['pincode'] = df_demo['pincode'].astype(str)
df_enr['date'] = pd.to_datetime(df_enr['date'],dayfirst=True)
df_enr['pincode'] = df_enr['pincode'].astype(str)


In [None]:
df_enr = df_enr.drop_duplicates()
df_bio = df_bio.drop_duplicates()
df_demo = df_demo.drop_duplicates()

(1598099, 6)

In [116]:
df_enr.shape

(983072, 7)

In [117]:
df_bio.shape

(1766212, 6)

In [118]:
df_demo.shape

(1598099, 6)

In [119]:
df_enr = df_enr.groupby(
    ['date','state','district','pincode'],
    as_index=False
).sum()
df_enr.shape

(983072, 7)

In [120]:
df_bio = df_bio.groupby(
    ['date','state','district','pincode'],
    as_index=False
).sum()
df_bio.shape


(1766212, 6)

In [121]:
df_demo = df_demo.groupby(
    ['date','state','district','pincode'],
    as_index=False
).sum()
df_demo.shape


(1598099, 6)

In [125]:
if df_demo.duplicated(['date','state','district','pincode']).sum() == 0:
    print("true")


true


In [133]:
# Age columns must be non-negative
(df_bio[['bio_age_5_17','bio_age_17_']] >= 0).all()
    


bio_age_5_17    True
bio_age_17_     True
dtype: bool

In [134]:
df_final = df_enr.merge(
    df_demo,
    on=['date','state','district','pincode'],
    how='outer'
)


In [135]:
df_final = df_final.merge(
    df_bio,
    on=['date','state','district','pincode'],
    how='outer'
)


In [146]:
df_final.isna().sum().sort_values(ascending=False)


age_0_5           1347396
age_5_17          1347396
age_18_greater    1347396
demo_age_5_17      732369
demo_age_17_       732369
bio_age_5_17       564256
bio_age_17_        564256
date                    0
state                   0
district                0
pincode                 0
dtype: int64

In [147]:
df_final.isna().mean().sort_values(ascending=False)


age_0_5           0.578165
age_5_17          0.578165
age_18_greater    0.578165
demo_age_5_17     0.314258
demo_age_17_      0.314258
bio_age_5_17      0.242121
bio_age_17_       0.242121
date              0.000000
state             0.000000
district          0.000000
pincode           0.000000
dtype: float64

In [148]:
count_cols = [
    col for col in df_final.columns
    if col not in ['date', 'state', 'district', 'pincode']
]

df_final[count_cols] = df_final[count_cols].fillna(0)


In [149]:
df_final.head

<bound method NDFrame.head of               date        state                    district pincode  age_0_5  \
0       2025-03-02    Meghalaya            East Khasi Hills  793121     11.0   
1       2025-03-09        Bihar                   Bhagalpur  812005     13.0   
2       2025-03-09        Bihar                   Madhubani  847108     18.0   
3       2025-03-09        Bihar             Purbi Champaran  845304     18.0   
4       2025-03-09        Bihar             Purbi Champaran  845418     30.0   
...            ...          ...                         ...     ...      ...   
2330463 2025-12-29  West Bengal              South Dinajpur  733153      0.0   
2330464 2025-12-29  West Bengal  South Twenty Four Parganas  743348      0.0   
2330465 2025-12-29  West Bengal  South Twenty Four Parganas  743610      0.0   
2330466 2025-12-29  West Bengal              West Midnapore  721146      0.0   
2330467 2025-12-29  West Bengal              West Midnapore  721303      0.0   

         