In [3]:
import pandas as pd

enrolment_df = pd.read_parquet("../sql/data/data_cache/enrolment_raw.parquet")
demographic_df = pd.read_parquet("../sql/data/data_cache/demographic_raw.parquet")
biometric_df = pd.read_parquet("../sql/data/data_cache/biometric_raw.parquet")

In [4]:
def unique_states(df, name):
    states = sorted(df["state"].dropna().unique())
    print(f"\n{name} states ({len(states)}):")
    for s in states:
        print(s)
    return set(states)

enrolment_states = unique_states(enrolment_df, "Enrolment")
demographic_states = unique_states(demographic_df, "Demographic")
biometric_states = unique_states(biometric_df, "Biometric")


Enrolment states (55):
100000
Andaman & Nicobar Islands
Andaman and Nicobar Islands
Andhra Pradesh
Arunachal Pradesh
Assam
Bihar
Chandigarh
Chhattisgarh
Dadra & Nagar Haveli
Dadra and Nagar Haveli
Dadra and Nagar Haveli and Daman and Diu
Daman & Diu
Daman and Diu
Delhi
Goa
Gujarat
Haryana
Himachal Pradesh
Jammu & Kashmir
Jammu And Kashmir
Jammu and Kashmir
Jharkhand
Karnataka
Kerala
Ladakh
Lakshadweep
Madhya Pradesh
Maharashtra
Manipur
Meghalaya
Mizoram
Nagaland
ODISHA
Odisha
Orissa
Pondicherry
Puducherry
Punjab
Rajasthan
Sikkim
Tamil Nadu
Telangana
The Dadra And Nagar Haveli And Daman And Diu
Tripura
Uttar Pradesh
Uttarakhand
WEST BENGAL
WESTBENGAL
West  Bengal
West Bangal
West Bengal
West bengal
Westbengal
andhra pradesh

Demographic states (65):
100000
Andaman & Nicobar Islands
Andaman and Nicobar Islands
Andhra Pradesh
Arunachal Pradesh
Assam
BALANAGAR
Bihar
Chandigarh
Chhatisgarh
Chhattisgarh
Dadra & Nagar Haveli
Dadra and Nagar Haveli
Dadra and Nagar Haveli and Daman and Diu
Dam

In [5]:
print("\nOnly in enrolment:")
print(enrolment_states - demographic_states - biometric_states)

print("\nOnly in demographic:")
print(demographic_states - enrolment_states - biometric_states)

print("\nOnly in biometric:")
print(biometric_states - enrolment_states - demographic_states)


Only in enrolment:
{'Jammu And Kashmir', 'The Dadra And Nagar Haveli And Daman And Diu'}

Only in demographic:
{'Madanapalle', 'Raja Annamalai Puram', 'Nagpur', 'Puttenahalli', 'BALANAGAR', 'Darbhanga', 'West Bengli', 'Jaipur'}

Only in biometric:
{'Tamilnadu'}


In [1]:
CANONICAL_STATES = {
    "Andaman and Nicobar Islands",
    "Andhra Pradesh",
    "Arunachal Pradesh",
    "Assam",
    "Bihar",
    "Chandigarh",
    "Chhattisgarh",
    "Dadra and Nagar Haveli and Daman and Diu",
    "Delhi",
    "Goa",
    "Gujarat",
    "Haryana",
    "Himachal Pradesh",
    "Jammu and Kashmir",
    "Jharkhand",
    "Karnataka",
    "Kerala",
    "Ladakh",
    "Lakshadweep",
    "Madhya Pradesh",
    "Maharashtra",
    "Manipur",
    "Meghalaya",
    "Mizoram",
    "Nagaland",
    "Odisha",
    "Puducherry",
    "Punjab",
    "Rajasthan",
    "Sikkim",
    "Tamil Nadu",
    "Telangana",
    "Tripura",
    "Uttar Pradesh",
    "Uttarakhand",
    "West Bengal",
}

# --------- ADDED (data issues found in Excel files) ---------

CANONICAL_STATES.add("Arunachal Pradesh")
CANONICAL_STATES.add("Andaman and Nicobar Islands")

STATE_CANONICAL_MAP = {
    # Andaman & Nicobar
    "Andaman & Nicobar Islands": "Andaman and Nicobar Islands",
    "Andaman And Nicobar": "Andaman and Nicobar Islands",

    # Dadra / Daman merge
    "Dadra & Nagar Haveli": "Dadra and Nagar Haveli and Daman and Diu",
    "Dadra and Nagar Haveli": "Dadra and Nagar Haveli and Daman and Diu",
    "Daman & Diu": "Dadra and Nagar Haveli and Daman and Diu",
    "Daman and Diu": "Dadra and Nagar Haveli and Daman and Diu",
    "The Dadra And Nagar Haveli And Daman And Diu": "Dadra and Nagar Haveli and Daman and Diu",

    # Jammu & Kashmir
    "Jammu & Kashmir": "Jammu and Kashmir",
    "Jammu And Kashmir": "Jammu and Kashmir",

    # Odisha
    "Orissa": "Odisha",
    "ODISHA": "Odisha",
    "odisha": "Odisha",

    # Puducherry
    "Pondicherry": "Puducherry",

    # Tamil Nadu
    "Tamilnadu": "Tamil Nadu",

    # Uttaranchal
    "Uttaranchal": "Uttarakhand",

    # West Bengal variants
    "WEST BENGAL": "West Bengal",
    "WESTBENGAL": "West Bengal",
    "West  Bengal": "West Bengal",
    "West Bangal": "West Bengal",
    "West Bengli": "West Bengal",
    "West bengal": "West Bengal",
    "Westbengal": "West Bengal",
    "west Bengal": "West Bengal",

    # Andhra Pradesh
    "andhra pradesh": "Andhra Pradesh",

    # --------- ADDED (typos found in Excel) ---------
    "Arunanchal Pradesh": "Arunachal Pradesh",
    "Arunanchal pradesh": "Arunachal Pradesh",
}

# --------- ADDED (safe normalization helper) ---------

def normalize_state(state):
    if state is None:
        return None
    state=str(state).strip()
    state=STATE_CANONICAL_MAP.get(state,state)
    return state if state in CANONICAL_STATES else None


In [7]:
def normalize_state(df):
    df = df.copy()

    # Trim whitespace
    df["state"] = df["state"].str.strip()

    # Apply canonical mapping
    df["state"] = df["state"].replace(STATE_CANONICAL_MAP)

    # Flag invalid states (districts, garbage, codes)
    df["state_is_valid"] = df["state"].isin(CANONICAL_STATES)

    return df

In [8]:
enrolment_df = normalize_state(enrolment_df)
demographic_df = normalize_state(demographic_df)
biometric_df = normalize_state(biometric_df)

In [9]:
def invalid_states(df, name):
    invalid = sorted(df.loc[~df["state_is_valid"], "state"].unique())
    print(f"\n{name} invalid states:")
    for s in invalid:
        print(s)

invalid_states(enrolment_df, "Enrolment")
invalid_states(demographic_df, "Demographic")
invalid_states(biometric_df, "Biometric")


Enrolment invalid states:
100000

Demographic invalid states:
100000
BALANAGAR
Chhatisgarh
Darbhanga
Jaipur
Madanapalle
Nagpur
Puttenahalli
Raja Annamalai Puram

Biometric invalid states:
Chhatisgarh


In [10]:
# SANITY CHECK

def valid_state_set(df):
    return set(df.loc[df["state_is_valid"], "state"].unique())

all_valid_states = (
    valid_state_set(enrolment_df)
    | valid_state_set(demographic_df)
    | valid_state_set(biometric_df)
)

print("Valid states after normalization:", len(all_valid_states))
print(sorted(all_valid_states))

Valid states after normalization: 36
['Andaman and Nicobar Islands', 'Andhra Pradesh', 'Arunachal Pradesh', 'Assam', 'Bihar', 'Chandigarh', 'Chhattisgarh', 'Dadra and Nagar Haveli and Daman and Diu', 'Delhi', 'Goa', 'Gujarat', 'Haryana', 'Himachal Pradesh', 'Jammu and Kashmir', 'Jharkhand', 'Karnataka', 'Kerala', 'Ladakh', 'Lakshadweep', 'Madhya Pradesh', 'Maharashtra', 'Manipur', 'Meghalaya', 'Mizoram', 'Nagaland', 'Odisha', 'Puducherry', 'Punjab', 'Rajasthan', 'Sikkim', 'Tamil Nadu', 'Telangana', 'Tripura', 'Uttar Pradesh', 'Uttarakhand', 'West Bengal']


In [11]:
enrolment_clean = enrolment_df[enrolment_df["state_is_valid"]].copy()
demographic_clean = demographic_df[demographic_df["state_is_valid"]].copy()
biometric_clean = biometric_df[biometric_df["state_is_valid"]].copy()

for df in (enrolment_clean, demographic_clean, biometric_clean):
    df.drop(columns=["state_is_valid"], inplace=True)

In [12]:
def row_loss(raw, clean, name):
    lost = len(raw) - len(clean)
    print(f"{name}: dropped {lost} rows ({lost / len(raw):.4%})")

row_loss(enrolment_df, enrolment_clean, "Enrolment")
row_loss(demographic_df, demographic_clean, "Demographic")
row_loss(biometric_df, biometric_clean, "Biometric")

Enrolment: dropped 22 rows (0.0022%)
Demographic: dropped 17 rows (0.0008%)
Biometric: dropped 5 rows (0.0003%)


In [14]:
enrolment_clean.to_parquet("../sql/data/data_cache/enrolment_clean.parquet", engine="fastparquet", index=False)

demographic_clean.to_parquet("../sql/data/data_cache/demographic_clean.parquet", engine="fastparquet", index=False)

biometric_clean.to_parquet("../sql/data/data_cache/biometric_clean.parquet", engine="fastparquet", index=False)