<a href="https://colab.research.google.com/github/UIDAI-Hackathon-6395/aadhaar-data-cleaning/blob/main/enrolment_data_cleaning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import re
from pathlib import Path
from difflib import get_close_matches

# Raw enrolment extracts to clean. Each file is read in chunks and written to
# a "cleaned_<name>" file next to it.
ENROLMENT_FILES = [
    "api_data_aadhar_enrolment_0_500000.csv",
    "api_data_aadhar_enrolment_500000_1000000.csv",
    "api_data_aadhar_enrolment_1000000_1006029.csv",
]

# Reference tables holding the canonical state and district names.
STATE_MASTER_CSV = "Indian_States.csv"
DISTRICT_MASTER_CSV = "Districts_in_India.csv"

# Rows per chunk handed to pd.read_csv(chunksize=...).
CHUNK_SIZE = 200_000

# Rows rejected from every input file are collected in this single file.
REMOVED_FILE = "removed_enrolment_records.csv"

state_master = pd.read_csv(STATE_MASTER_CSV)
state_master["state"] = state_master["state"].str.lower().str.strip()

# Candidate list for fuzzy state matching. Missing and blank names are
# dropped because difflib.get_close_matches() raises TypeError on a NaN
# candidate; duplicates are dropped because they only add comparison work.
STATE_LIST = [
    name
    for name in state_master["state"].dropna().drop_duplicates().tolist()
    if name
]

district_master = pd.read_csv(DISTRICT_MASTER_CSV)
district_master.columns = district_master.columns.str.lower().str.strip()
district_master["state"] = district_master["state"].str.lower().str.strip()
district_master["district"] = district_master["district"].str.lower().str.strip()

# state name -> list of its district names, used as the fuzzy-match
# candidates for that state. Rows with a missing state or district are left
# out so no NaN ever reaches get_close_matches().
DISTRICT_LOOKUP = (
    district_master.dropna(subset=["state", "district"])
    .groupby("state")["district"]
    .apply(lambda names: names.drop_duplicates().tolist())
    .to_dict()
)

def clean_columns(cols):
    """Return column labels normalised to lower-case snake_case.

    Each label is stripped of surrounding whitespace, lower-cased, has its
    spaces turned into underscores, and finally has any run of underscores
    collapsed into a single one.

    Args:
        cols: A pandas Index (or Series) of string labels exposing ``.str``.

    Returns:
        A new Index/Series of the same length with the cleaned labels.
    """
    trimmed = cols.str.strip()
    lowered = trimmed.str.lower()
    underscored = lowered.str.replace(" ", "_")
    # "a  b" became "a__b" in the previous step; squeeze it back to "a_b".
    return underscored.str.replace(r"_+", "_", regex=True)

# Compiled once: clean_text() is applied to every row of every chunk.
_NON_LETTER_RE = re.compile(r"[^a-z\s]")
_WHITESPACE_RUN_RE = re.compile(r"\s+")


def clean_text(x):
    """Normalise a free-text name to lower-case letters and single spaces.

    The value is lower-cased, every character other than ``a-z`` and
    whitespace is removed, runs of whitespace are collapsed to one space and
    surrounding whitespace is trimmed.

    Args:
        x: Any scalar; missing values (None / NaN / NaT) are accepted.

    Returns:
        The cleaned string, or None when the value is missing or nothing is
        left after cleaning.
    """
    if pd.isna(x):
        return None
    text = _NON_LETTER_RE.sub("", str(x).lower())
    # Trim only AFTER the non-letter characters are gone: removing them can
    # expose new leading/trailing blanks ("123 delhi" -> " delhi"), which a
    # strip performed up front would miss.
    text = _WHITESPACE_RUN_RE.sub(" ", text).strip()
    return text or None

# Legacy or misspelt state names mapped to the spelling used for matching.
# Kept at module level so the table is not rebuilt on every call.
_STATE_FIXES = {
    "orissa": "odisha",
    "odisa": "odisha",
    "uttaranchal": "uttarakhand",
    "pondicherry": "puducherry",
}


def standardize_state(x):
    """Map a cleaned state name onto its entry in ``STATE_LIST``.

    Known legacy spellings are corrected first; the result is then matched
    against ``STATE_LIST``, exactly if possible and otherwise fuzzily with a
    similarity cutoff of 0.85.

    Args:
        x: A cleaned (lower-case) state name, or a missing value.

    Returns:
        The matching name from ``STATE_LIST``; the corrected input unchanged
        when nothing is similar enough; None when the input is empty or not a
        string.
    """
    # Non-string input (None, NaN, numbers) is treated as missing rather than
    # being passed to difflib, which cannot compare non-sequences.
    if not isinstance(x, str) or not x:
        return None
    x = _STATE_FIXES.get(x, x)
    # An exact hit is what the fuzzy matcher would return anyway (ratio 1.0),
    # so skip the much more expensive similarity scan for it.
    if x in STATE_LIST:
        return x
    match = get_close_matches(x, STATE_LIST, n=1, cutoff=0.85)
    return match[0] if match else x

def standardize_district(state, district):
    """Map a cleaned district name onto the known districts of ``state``.

    Candidates are the names listed for ``state`` in ``DISTRICT_LOOKUP``. An
    exact match is returned directly; otherwise the closest candidate with a
    similarity of at least 0.85 is used.

    Args:
        state: Standardised state name used as the lookup key.
        district: Cleaned (lower-case) district name.

    Returns:
        The matching candidate; ``district`` unchanged when the state is
        unknown or no candidate is similar enough; None when either argument
        is empty or ``district`` is not a string.
    """
    # A non-string district (e.g. NaN) is treated as missing instead of being
    # handed to difflib, which cannot compare non-sequences.
    if not state or not isinstance(district, str) or not district:
        return None
    candidates = DISTRICT_LOOKUP.get(state, [])
    # An exact hit is what the fuzzy matcher would return anyway (ratio 1.0),
    # so avoid scanning every candidate for the common, already-clean case.
    if district in candidates:
        return district
    match = get_close_matches(district, candidates, n=1, cutoff=0.85)
    return match[0] if match else district

# REMOVED_FILE is shared by all input files, so only the very first batch of
# rejected rows creates it (mode "w", with header); later batches append.
removed_first_write = True

for file in ENROLMENT_FILES:
    print(f"Processing: {file}")
    output_file = "cleaned_" + Path(file).name
    # Same create-then-append scheme, but per cleaned output file.
    first_write = True

    for chunk in pd.read_csv(file, chunksize=CHUNK_SIZE):
        chunk.columns = clean_columns(chunk.columns)

        # Keep only the columns of interest. The explicit copy makes the
        # column assignments below operate on an independent frame instead of
        # a slice (which would raise SettingWithCopyWarning).
        chunk = chunk[
            ["date", "state", "district", "pincode",
             "age_0_5", "age_5_17", "age_18_greater"]
        ].copy()

        # Unparseable dates become NaT and are rejected further down.
        chunk["date"] = pd.to_datetime(chunk["date"], format="%d-%m-%Y", errors="coerce")

        numeric_cols = ["age_0_5", "age_5_17", "age_18_greater"]
        for col in numeric_cols:
            chunk[col] = pd.to_numeric(chunk[col], errors="coerce")

        chunk["state"] = chunk["state"].apply(clean_text).apply(standardize_state)
        # The district is matched against the districts of the row's already
        # standardised state. Iterating the two columns in lockstep gives the
        # same result as a row-wise DataFrame.apply without building a Series
        # object for every row.
        chunk["district"] = [
            standardize_district(state, clean_text(district))
            for state, district in zip(chunk["state"], chunk["district"])
        ]

        # Convert pincodes to text while keeping missing values missing: a
        # plain astype(str) would turn NaN into the string "nan", which the
        # missing-value check below can never detect. A column containing NaN
        # is read as float, so also drop the ".0" that conversion leaves.
        pincode_missing = chunk["pincode"].isna()
        chunk["pincode"] = (
            chunk["pincode"].astype(str)
            .str.replace(r"\.0$", "", regex=True)
            .mask(pincode_missing)
        )

        # A row is rejected when any of its identifying fields is missing.
        invalid_mask = (
            chunk[["date", "state", "district", "pincode"]].isna().any(axis=1)
        )
        invalid_rows = chunk[invalid_mask]
        valid_rows = chunk[~invalid_mask].copy()

        if not invalid_rows.empty:
            invalid_rows.to_csv(
                REMOVED_FILE,
                mode="w" if removed_first_write else "a",
                index=False,
                header=removed_first_write
            )
            removed_first_write = False

        # Missing counts are written as 0. Each column is replaced outright:
        # assigning through .loc[:, cols] writes into the existing float
        # columns in place and would silently discard the integer dtype.
        for col in numeric_cols:
            valid_rows[col] = valid_rows[col].fillna(0).astype(int)

        valid_rows.to_csv(
            output_file,
            mode="w" if first_write else "a",
            index=False,
            header=first_write
        )
        first_write = False

    print(f"‚úÖ Saved cleaned file: {output_file}")

print(f"üìÅ Removed records saved in: {REMOVED_FILE}")


Processing: api_data_aadhar_enrolment_0_500000.csv
‚úÖ Saved cleaned file: cleaned_api_data_aadhar_enrolment_0_500000.csv
Processing: api_data_aadhar_enrolment_500000_1000000.csv
‚úÖ Saved cleaned file: cleaned_api_data_aadhar_enrolment_500000_1000000.csv
Processing: api_data_aadhar_enrolment_1000000_1006029.csv
‚úÖ Saved cleaned file: cleaned_api_data_aadhar_enrolment_1000000_1006029.csv
üìÅ Removed records saved in: removed_enrolment_records.csv
