In [None]:
import pandas as pd
import numpy as np  
from pathlib import Path

In [61]:
PROCESSED_PATH = Path("../datasets/processed")
PROCESSED_PATH.mkdir(parents=True, exist_ok=True)

In [48]:
df_enrolment = pd.read_csv('../datasets/raw/api_data_aadhar_enrolment_0_500000.csv')
df_enrolment.head() 

Unnamed: 0,date,state,district,pincode,age_0_5,age_5_17,age_18_greater
0,02-03-2025,Meghalaya,East Khasi Hills,793121,11,61,37
1,09-03-2025,Karnataka,Bengaluru Urban,560043,14,33,39
2,09-03-2025,Uttar Pradesh,Kanpur Nagar,208001,29,82,12
3,09-03-2025,Uttar Pradesh,Aligarh,202133,62,29,15
4,09-03-2025,Karnataka,Bengaluru Urban,560016,14,16,21


In [31]:
df_demographic = pd.read_csv('../datasets/raw/api_data_aadhar_demographic_0_500000.csv')
df_demographic.head()

Unnamed: 0,date,state,district,pincode,demo_age_5_17,demo_age_17_
0,01-03-2025,Uttar Pradesh,Gorakhpur,273213,49,529
1,01-03-2025,Andhra Pradesh,Chittoor,517132,22,375
2,01-03-2025,Gujarat,Rajkot,360006,65,765
3,01-03-2025,Andhra Pradesh,Srikakulam,532484,24,314
4,01-03-2025,Rajasthan,Udaipur,313801,45,785


In [3]:
df_biometeric = pd.read_csv('../datasets/raw/api_data_aadhar_biometric_0_500000.csv')
df_biometeric.head()


Unnamed: 0,date,state,district,pincode,bio_age_5_17,bio_age_17_
0,01-03-2025,Haryana,Mahendragarh,123029,280,577
1,01-03-2025,Bihar,Madhepura,852121,144,369
2,01-03-2025,Jammu and Kashmir,Punch,185101,643,1091
3,01-03-2025,Bihar,Bhojpur,802158,256,980
4,01-03-2025,Tamil Nadu,Madurai,625514,271,815


In [9]:
def standardize_date(df, date_col):
    """Convert date column to YYYY-MM period format"""
    df[date_col] = pd.to_datetime(df[date_col], errors="coerce")
    df["period"] = df[date_col].dt.to_period("M").astype(str)
    return df

In [10]:
def standardize_geography(df, state_col, district_col):
    df[state_col] = df[state_col].astype(str).str.strip().str.title()
    df[district_col] = df[district_col].fillna("STATE_TOTAL")
    df[district_col] = df[district_col].astype(str).str.strip().str.title()
    return df

In [12]:
def clean_numeric_counts(df, count_columns):
    for col in count_columns:
        df[col] = pd.to_numeric(df[col], errors="coerce").fillna(0)
        df[col] = df[col].clip(lower=0).astype(int)
    return df

In [14]:
def resolve_duplicates(df, group_cols, sum_cols):
    return df.groupby(group_cols, as_index=False)[sum_cols].sum()

In [22]:
print(list(df_enrolment.columns))


['date', 'state', 'district', 'pincode', 'age_0_5', 'age_5_17', 'age_18_greater', 'period']


In [49]:
age_cols = ["age_0_5", "age_5_17", "age_18_greater"]

df_enrolment[age_cols] = df_enrolment[age_cols].fillna(0)

df_enrolment["enrolment_count"] = (
    df_enrolment["age_0_5"]
    + df_enrolment["age_5_17"]
    + df_enrolment["age_18_greater"]
)


In [55]:
df_enrolment = standardize_date(df_enrolment, "date")
df_enrolment = standardize_geography(df_enrolment, "state", "district")
df_enrolment = clean_numeric_counts(df_enrolment, ["enrolment_count"])



df_enrolment = resolve_duplicates(
    df_enrolment,
    group_cols=["state", "district", "period"],
    sum_cols=["enrolment_count"]
    )


df_enrolment.head()

Unnamed: 0,state,district,period,enrolment_count
0,100000,100000,2025-02,3
1,100000,100000,2025-03,1
2,100000,100000,2025-08,1
3,100000,100000,2025-09,1
4,100000,100000,2025-11,2


In [56]:
df_enrolment.tail()

Unnamed: 0,state,district,period,enrolment_count
11396,West Bengal,West Midnapore,2025-12,45
11397,West Bengal,West Midnapore,NaT,703
11398,Westbengal,Hooghly,2025-05,1
11399,Westbengal,Hooghly,2025-12,1
11400,Westbengal,Hooghly,NaT,1


In [26]:
print(list(df_demographic.columns))


['date', 'state', 'district', 'pincode', 'demo_age_5_17', 'demo_age_17_', 'period']


In [35]:
age_cols = ["demo_age_5_17", "demo_age_17_"]
df_demographic[age_cols] = df_demographic[age_cols].fillna(0)

df_demographic["demographic_update_count"] = (
    df_demographic["demo_age_5_17"]
    + df_demographic["demo_age_17_"]
)


In [40]:
df_demographic = standardize_date(df_demographic, "date")
df_demographic = standardize_geography(df_demographic, "state", "district")
df_demographic = clean_numeric_counts(df_demographic, ["demographic_update_count"])


df_demographic = resolve_duplicates(
    df_demographic,
    group_cols=["state", "district", "period"],
    sum_cols=["demographic_update_count"]
    )


df_demographic.head()

Unnamed: 0,state,district,period,demographic_update_count
0,Andaman & Nicobar Islands,Andamans,2025-01,13
1,Andaman & Nicobar Islands,Andamans,2025-02,8
2,Andaman & Nicobar Islands,Andamans,2025-03,26
3,Andaman & Nicobar Islands,Andamans,2025-04,19
4,Andaman & Nicobar Islands,Andamans,2025-05,6


In [57]:
df_demographic.tail()

Unnamed: 0,state,district,period,demographic_update_count
12034,Westbengal,Hooghly,2025-11,2
12035,Westbengal,Hooghly,2025-12,3
12036,Westbengal,Hooghly,NaT,18
12037,Westbengal,Howrah,2025-08,2
12038,Westbengal,Howrah,NaT,2


In [33]:
print(list(df_biometeric.columns))


['date', 'state', 'district', 'pincode', 'bio_age_5_17', 'bio_age_17_']


In [37]:
# Biometric age-wise columns
bio_age_cols = [
    "bio_age_5_17",
    "bio_age_17_"
]

bio_age_cols = [c for c in bio_age_cols if c in df_biometeric.columns]

df_biometeric[bio_age_cols] = df_biometeric[bio_age_cols].fillna(0)

df_biometeric["biometric_update_count"] = (
    df_biometeric[bio_age_cols].sum(axis=1)
)

In [39]:
df_biometeric = standardize_date(df_biometeric, "date")
df_biometeric = standardize_geography(df_biometeric, "state", "district")
df_biometeric = clean_numeric_counts(df_biometeric, ["biometric_update_count"])


df_biometeric = resolve_duplicates(
    df_biometeric,
    group_cols=["state", "district", "period"],
    sum_cols=["biometric_update_count"]
)

df_biometeric.head()

Unnamed: 0,state,district,period,biometric_update_count
0,Andaman & Nicobar Islands,Andamans,2025-01,993
1,Andaman & Nicobar Islands,Andamans,2025-02,24
2,Andaman & Nicobar Islands,Andamans,2025-03,22
3,Andaman & Nicobar Islands,Andamans,2025-04,12
4,Andaman & Nicobar Islands,Andamans,2025-05,16


In [58]:
df_biometeric.tail()

Unnamed: 0,state,district,period,biometric_update_count
11974,Westbengal,Hooghly,2025-01,2
11975,Westbengal,Hooghly,2025-03,1
11976,Westbengal,Hooghly,2025-11,1
11977,Westbengal,Hooghly,2025-12,2
11978,Westbengal,Howrah,2025-07,1


In [62]:
df_enrolment.to_csv(PROCESSED_PATH / "enrolment_cleaned.csv", index=False)
df_demographic.to_csv(PROCESSED_PATH / "demographic_updates_cleaned.csv", index=False)
df_biometeric.to_csv(PROCESSED_PATH / "biometric_updates_cleaned.csv", index=False)


print("Cleaned datasets saved to data/processed/")

Cleaned datasets saved to data/processed/


In [63]:
print("\nNull checks:")
print(df_enrolment.isnull().sum())


print("\nZero-heavy rows (example):")
print(df_enrolment[df_enrolment["enrolment_count"] == 0].head())


Null checks:
state              0
district           0
period             0
enrolment_count    0
dtype: int64

Zero-heavy rows (example):
Empty DataFrame
Columns: [state, district, period, enrolment_count]
Index: []


In [65]:
print("\nNull checks:")
print(df_demographic.isnull().sum())


print("\nZero-heavy rows (example):")
print(df_demographic[df_demographic["demographic_update_count"] == 0].head())


Null checks:
state                       0
district                    0
period                      0
demographic_update_count    0
dtype: int64

Zero-heavy rows (example):
                state      district   period  demographic_update_count
1816            Bihar     Samstipur  2025-07                         0
3667  Jammu & Kashmir  Leh (Ladakh)  2025-06                         0


In [67]:
print("\nNull checks:")
print(df_biometeric.isnull().sum())


print("\nZero-heavy rows (example):")
print(df_biometeric[df_biometeric["biometric_update_count"] == 0].head())


Null checks:
state                     0
district                  0
period                    0
biometric_update_count    0
dtype: int64

Zero-heavy rows (example):
Empty DataFrame
Columns: [state, district, period, biometric_update_count]
Index: []
