<a href="https://colab.research.google.com/github/UIDAI-Hackathon-6395/aadhaar-data-cleaning/blob/main/demographic_data_cleaning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import re
from pathlib import Path
from difflib import get_close_matches

# Number of CSV rows loaded into memory at a time while cleaning.
CHUNK_SIZE = 200_000

# Reference tables with the canonical state and district names.
STATE_MASTER_CSV = "Indian_States.csv"
DISTRICT_MASTER_CSV = "Districts_in_India.csv"

# Raw demographic extracts to clean, processed in the order listed.
DEMOGRAPHIC_FILES = [
    "api_data_aadhar_demographic_0_500000.csv",
    "api_data_aadhar_demographic_500000_1000000.csv",
    "api_data_aadhar_demographic_1000000_1500000.csv",
    "api_data_aadhar_demographic_1500000_2000000.csv",
    "api_data_aadhar_demographic_2000000_2071700.csv",
]

# Canonical state names used as fuzzy-match targets by standardize_state().
state_master = pd.read_csv(STATE_MASTER_CSV)
# Normalise the headers the same way as for the district table below, so a
# header such as "State " still resolves to the "state" column.
state_master.columns = state_master.columns.str.lower().str.strip()
state_master["state"] = state_master["state"].str.lower().str.strip()
# Blank cells are float NaN; difflib.get_close_matches() raises TypeError on
# non-string candidates, so keep them out of the candidate list.
STATE_LIST = state_master["state"].dropna().tolist()

# Canonical district names, lower-cased and trimmed like the state names.
district_master = pd.read_csv(DISTRICT_MASTER_CSV)
district_master.columns = district_master.columns.str.lower().str.strip()
district_master["state"] = district_master["state"].str.lower().str.strip()
district_master["district"] = district_master["district"].str.lower().str.strip()

# Maps each state name to the list of its district names; used by
# standardize_district() to restrict fuzzy matching to the row's own state.
# Rows with a missing state or district are skipped so every list holds
# strings only.
DISTRICT_LOOKUP = (
    district_master.dropna(subset=["state", "district"])
    .groupby("state")["district"]
    .apply(list)
    .to_dict()
)

def clean_columns(cols):
    """Return column labels normalised to lower snake_case.

    Each label is trimmed, lower-cased, has its spaces turned into
    underscores, and has runs of underscores collapsed to a single one.

    Args:
        cols: A pandas Index (or Series) of string labels; anything exposing
            the ``.str`` accessor works.

    Returns:
        An object of the same kind as ``cols`` holding the cleaned labels.
    """
    trimmed_lower = cols.str.strip().str.lower()
    underscored = trimmed_lower.str.replace(" ", "_")
    # "a  b" became "a__b" above; squeeze repeated underscores back to one.
    return underscored.str.replace(r"_+", "_", regex=True)

def clean_text(x):
    """Normalise a free-text name to lower-case ASCII letters and single spaces.

    The value is lower-cased, every character other than ``a-z`` and
    whitespace is removed, whitespace runs are collapsed to one space, and
    the result is trimmed.

    Args:
        x: The raw cell value (any scalar); missing values are accepted.

    Returns:
        The cleaned string, or ``None`` when ``x`` is missing or nothing but
        whitespace is left after cleaning.
    """
    if pd.isna(x):
        return None
    text = re.sub(r"[^a-z\s]", "", str(x).lower())
    # Trim only AFTER removing characters: deleting a leading or trailing
    # digit/symbol can expose whitespace (e.g. "west bengal 1" or "1 2"),
    # which would otherwise survive as a stray space or a " " result.
    text = re.sub(r"\s+", " ", text).strip()
    return text or None

# Alternate or older spellings mapped to the name they should be matched as.
# The "&" forms are kept for callers that pass raw text; the forms without
# "&" are what clean_text() actually produces (it deletes "&"), so they are
# the ones that can match inside the cleaning pipeline.
STATE_FIXES = {
    "orissa": "odisha",
    "odisa": "odisha",
    "uttaranchal": "uttarakhand",
    "pondicherry": "puducherry",
    "jammu & kashmir": "jammu and kashmir",
    "jammu kashmir": "jammu and kashmir",
    "andaman & nicobar": "andaman and nicobar islands",
    "andaman nicobar": "andaman and nicobar islands",
    "dadra nagar haveli": "dadra and nagar haveli and daman and diu",
    "daman and diu": "dadra and nagar haveli and daman and diu",
}


def standardize_state(x):
    """Map a cleaned state name onto its closest canonical spelling.

    Known aliases are first rewritten via ``STATE_FIXES``; the result is then
    fuzzy-matched against ``STATE_LIST`` with a similarity cutoff of 0.85.

    Args:
        x: A state name, expected to be already lower-cased/cleaned.

    Returns:
        ``None`` if ``x`` is empty or ``None``; otherwise the best match from
        ``STATE_LIST``, or the (alias-resolved) input when nothing is similar
        enough.
    """
    if not x:
        return None

    x = STATE_FIXES.get(x, x)
    match = get_close_matches(x, STATE_LIST, n=1, cutoff=0.85)
    return match[0] if match else x

def standardize_district(state, district):
    """Map a district name onto the closest known district of its state.

    Args:
        state: The state name used as the key into ``DISTRICT_LOOKUP``.
        district: The district name to standardise.

    Returns:
        ``None`` when either argument is empty or ``None``. Otherwise the
        best fuzzy match (similarity cutoff 0.85) among the districts listed
        for ``state``, or ``district`` unchanged when the state has no
        listed districts or none is similar enough.
    """
    if not (state and district):
        return None

    known_districts = DISTRICT_LOOKUP.get(state, [])
    if known_districts:
        closest = get_close_matches(district, known_districts, n=1, cutoff=0.85)
        if closest:
            return closest[0]

    # Unknown state, or nothing close enough: keep the caller's spelling.
    return district

# Clean every demographic extract chunk by chunk and write the result to
# "cleaned_<original file name>" in the current working directory.
for file in DEMOGRAPHIC_FILES:
    print(f"Processing: {file}")
    output_file = "cleaned_" + Path(file).name

    # The first chunk (re)creates the output file and writes the header;
    # every later chunk is appended without one.
    first_write = True

    for chunk in pd.read_csv(file, chunksize=CHUNK_SIZE):
        chunk.columns = clean_columns(chunk.columns)

        # After header cleaning the 17+ column can show up as "demo_age_17_";
        # give it its final name before selecting columns.
        if "demo_age_17_" in chunk.columns:
            chunk = chunk.rename(columns={"demo_age_17_": "demo_age_17_plus"})

        # Keep only the columns of interest, in output order.
        chunk = chunk[
            [
                "date",
                "state",
                "district",
                "pincode",
                "demo_age_5_17",
                "demo_age_17_plus",
            ]
        ]

        # Dates that do not parse as DD-MM-YYYY become NaT and are dropped below.
        chunk["date"] = pd.to_datetime(
            chunk["date"], format="%d-%m-%Y", errors="coerce"
        )

        # Counts that are not numeric become NaN and are filled with 0 below.
        numeric_cols = ["demo_age_5_17", "demo_age_17_plus"]
        for col in numeric_cols:
            chunk[col] = pd.to_numeric(chunk[col], errors="coerce")

        chunk["state"] = chunk["state"].apply(clean_text)
        chunk["district"] = chunk["district"].apply(clean_text)

        # standardize_state()/standardize_district() do fuzzy matching, which
        # is far too slow to repeat for every row. Resolve each distinct
        # value once per chunk and look the answer up per row instead.
        state_map = {
            name: standardize_state(name)
            for name in chunk["state"].dropna().unique()
        }
        # Missing states are not in the map and therefore come back as None.
        chunk["state"] = [state_map.get(name) for name in chunk["state"]]

        both_present = chunk["state"].notna() & chunk["district"].notna()
        pairs = chunk.loc[both_present, ["state", "district"]].drop_duplicates()
        district_map = {
            (state_name, district_name): standardize_district(
                state_name, district_name
            )
            for state_name, district_name in zip(pairs["state"], pairs["district"])
        }
        # Rows lacking a state or district get None, exactly what
        # standardize_district() returns for them.
        chunk["district"] = [
            district_map.get(pair)
            for pair in zip(chunk["state"], chunk["district"])
        ]

        # Drop incomplete rows BEFORE stringifying the pincode: astype(str)
        # turns a missing value into the literal text "nan", which dropna()
        # would no longer recognise as missing.
        chunk = chunk.dropna(
            subset=["date", "state", "district", "pincode"]
        )

        # A chunk that contained a missing pincode is read as float, so its
        # values stringify as "110001.0"; strip that suffix so every chunk
        # writes pincodes in the same form.
        chunk["pincode"] = (
            chunk["pincode"].astype(str).str.replace(r"\.0$", "", regex=True)
        )

        chunk[numeric_cols] = chunk[numeric_cols].fillna(0).astype(int)

        # Explicit check instead of `assert`, which is skipped under `python -O`.
        nan_count = int(chunk.isna().sum().sum())
        if nan_count:
            raise ValueError(
                f"{nan_count} missing value(s) remain after cleaning a chunk of {file}"
            )

        chunk.to_csv(
            output_file,
            mode="w" if first_write else "a",
            index=False,
            header=first_write
        )
        first_write = False

    print(f"✅ Saved cleaned demographic file with NO NaNs: {output_file}")


Processing: api_data_aadhar_demographic_0_500000.csv
✅ Saved cleaned demographic file with NO NaNs: cleaned_api_data_aadhar_demographic_0_500000.csv
Processing: api_data_aadhar_demographic_500000_1000000.csv
✅ Saved cleaned demographic file with NO NaNs: cleaned_api_data_aadhar_demographic_500000_1000000.csv
Processing: api_data_aadhar_demographic_1000000_1500000.csv
✅ Saved cleaned demographic file with NO NaNs: cleaned_api_data_aadhar_demographic_1000000_1500000.csv
Processing: api_data_aadhar_demographic_1500000_2000000.csv
✅ Saved cleaned demographic file with NO NaNs: cleaned_api_data_aadhar_demographic_1500000_2000000.csv
Processing: api_data_aadhar_demographic_2000000_2071700.csv
✅ Saved cleaned demographic file with NO NaNs: cleaned_api_data_aadhar_demographic_2000000_2071700.csv
