In [1]:
from pathlib import Path
import pandas as pd

def load_uidai_folder(folder_path):
    """Read every ``*.csv`` file directly inside ``folder_path`` into one frame.

    The files are read in sorted path order so that the row order of the
    combined frame is reproducible; directory listing order is otherwise
    filesystem-dependent.

    Parameters
    ----------
    folder_path : str or path-like
        Directory that is scanned (non-recursively) for ``*.csv`` files.

    Returns
    -------
    pandas.DataFrame
        The rows of all files stacked on top of each other, with a fresh
        ``0..n-1`` index.

    Raises
    ------
    ValueError
        If the folder contains no ``*.csv`` file (this also covers a
        mistyped or missing folder, for which the glob yields nothing).
    """
    files = sorted(Path(folder_path).glob("*.csv"))
    print(f"Found {len(files)} files in {folder_path}")

    # pd.concat on an empty sequence fails with the opaque message
    # "No objects to concatenate"; raise the same exception type but
    # name the folder so the cause is obvious.
    if not files:
        raise ValueError(f"No CSV files found in {folder_path}")

    return pd.concat(
        (pd.read_csv(csv_file) for csv_file in files),
        ignore_index=True,
    )


In [2]:
# Root folder of the raw exports, relative to the notebook's working directory.
BASE_PATH = "../data"

# One combined raw frame per export folder under BASE_PATH.
enrolment_df = load_uidai_folder(
    folder_path=f"{BASE_PATH}/api_data_aadhar_enrolment"
)
demo_df = load_uidai_folder(
    folder_path=f"{BASE_PATH}/api_data_aadhar_demographic"
)
bio_df = load_uidai_folder(
    folder_path=f"{BASE_PATH}/api_data_aadhar_biometric"
)

Found 3 files in ../data/api_data_aadhar_enrolment
Found 5 files in ../data/api_data_aadhar_demographic
Found 4 files in ../data/api_data_aadhar_biometric


In [3]:
def basic_sanity(df, name):
    """Print a quick structural summary of ``df``.

    Writes three things to stdout: the frame's shape (prefixed with
    ``name``), its column labels as a list, and the number of null
    values in each column. Nothing is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to summarise.
    name : str
        Label shown in front of the shape line.
    """
    shape_header = f"\n{name} SHAPE:"
    print(shape_header, df.shape)

    column_labels = list(df.columns)
    print("COLUMNS:", column_labels)

    print("\nNULL COUNT:")
    null_counts = df.isna().sum()  # isna is an alias of isnull
    print(null_counts)

In [4]:
# Print the shape / columns / null summary for each raw frame in turn.
for label, frame in (
    ("ENROLMENT", enrolment_df),
    ("DEMOGRAPHIC", demo_df),
    ("BIOMETRIC", bio_df),
):
    basic_sanity(frame, label)


ENROLMENT SHAPE: (1006029, 7)
COLUMNS: ['date', 'state', 'district', 'pincode', 'age_0_5', 'age_5_17', 'age_18_greater']

NULL COUNT:
date              0
state             0
district          0
pincode           0
age_0_5           0
age_5_17          0
age_18_greater    0
dtype: int64

DEMOGRAPHIC SHAPE: (2071700, 6)
COLUMNS: ['date', 'state', 'district', 'pincode', 'demo_age_5_17', 'demo_age_17_']

NULL COUNT:
date             0
state            0
district         0
pincode          0
demo_age_5_17    0
demo_age_17_     0
dtype: int64

BIOMETRIC SHAPE: (1861108, 6)
COLUMNS: ['date', 'state', 'district', 'pincode', 'bio_age_5_17', 'bio_age_17_']

NULL COUNT:
date            0
state           0
district        0
pincode         0
bio_age_5_17    0
bio_age_17_     0
dtype: int64


In [6]:
def add_year_month(df):
    """Return a copy of ``df`` with a parsed ``date`` and a ``year_month`` column.

    The input frame is left untouched: re-running the calling cell, or
    inspecting the raw frame afterwards, still sees the original data.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``date`` column holding either datetimes or
        day-first date strings.

    Returns
    -------
    pandas.DataFrame
        New frame with the same columns as ``df`` (``date`` converted to
        datetime if it was not already) plus a trailing ``year_month``
        column of ``"YYYY-MM"`` strings.

    Raises
    ------
    KeyError
        If ``df`` has no ``date`` column.
    ValueError
        If a ``date`` value cannot be parsed (``errors="raise"``).
    """
    dates = df["date"]

    # Parse only when needed so the function can be applied repeatedly.
    if not pd.api.types.is_datetime64_any_dtype(dates):
        dates = pd.to_datetime(
            dates,
            dayfirst=True,
            errors="raise",
        )

    # assign() builds a new frame instead of writing into the caller's one;
    # `date` keeps its position and `year_month` is appended last.
    return df.assign(
        date=dates,
        year_month=dates.dt.to_period("M").astype(str),
    )

In [7]:
# Rebind each frame to the result of add_year_month.
enrolment_df = enrolment_df.pipe(add_year_month)
demo_df = demo_df.pipe(add_year_month)
bio_df = bio_df.pipe(add_year_month)


In [8]:
# Show the first five date / year_month pairs of the enrolment frame.
enrolment_df.loc[:, ["date", "year_month"]].head(5)

Unnamed: 0,date,year_month
0,2025-03-02,2025-03
1,2025-03-09,2025-03
2,2025-03-09,2025-03
3,2025-03-09,2025-03
4,2025-03-09,2025-03


In [9]:
def aggregate_district_month(df, value_cols):
    """Sum ``value_cols`` for every (state, district, year_month) combination.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``state``, ``district`` and ``year_month``
        columns plus the columns named in ``value_cols``.
    value_cols : list of str
        Columns to total within each group.

    Returns
    -------
    pandas.DataFrame
        One row per group, with the three key columns followed by the
        summed value columns and a fresh ``0..n-1`` index.
    """
    group_keys = ["state", "district", "year_month"]
    grouped = df.groupby(group_keys)
    totals = grouped[value_cols].sum()
    # Move the group keys out of the index so they come back as columns.
    return totals.reset_index()

In [10]:
# District-by-month totals for each dataset, using that dataset's count columns.
enrolment_signal = aggregate_district_month(
    df=enrolment_df,
    value_cols=["age_0_5", "age_5_17", "age_18_greater"],
)

demo_signal = aggregate_district_month(
    df=demo_df,
    value_cols=["demo_age_5_17", "demo_age_17_"],
)

bio_signal = aggregate_district_month(
    df=bio_df,
    value_cols=["bio_age_5_17", "bio_age_17_"],
)

In [11]:
# Display the type of each aggregated result as a 3-tuple.
tuple(type(signal) for signal in (enrolment_signal, demo_signal, bio_signal))

(pandas.core.frame.DataFrame,
 pandas.core.frame.DataFrame,
 pandas.core.frame.DataFrame)

In [12]:
# Report (rows, columns) of each aggregated frame.
for label, signal in (
    ("Enrolment:", enrolment_signal),
    ("Demographic:", demo_signal),
    ("Biometric:", bio_signal),
):
    print(label, signal.shape)

Enrolment: (5062, 6)
Demographic: (6072, 5)
Biometric: (8507, 5)


In [18]:
import os

# Create the output folder (and any missing parents); no error if it exists.
# NOTE(review): the raw data is read from "../data" but this path is
# "data/processed" relative to the working directory - confirm the output
# is meant to live next to the notebook rather than under "../data".
Path("data/processed").mkdir(parents=True, exist_ok=True)


In [19]:
# Write each aggregated frame to its CSV file, without the row index.
for signal_frame, csv_path in (
    (enrolment_signal, "data/processed/enrolment_district_month.csv"),
    (demo_signal, "data/processed/demographic_district_month.csv"),
    (bio_signal, "data/processed/biometric_district_month.csv"),
):
    signal_frame.to_csv(csv_path, index=False)
