In [1]:
import pandas as pd
import re
import unicodedata
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
%matplotlib inline 

from collections import defaultdict

In [2]:
base_dir = r'/kaggle/input/adhaar-biometric-registration-2025'
dfs = []
for idx,files in enumerate(os.listdir(base_dir)):
    print(f'count: {idx+1}')
    dfs.append(pd.read_csv(os.path.join(base_dir,files)))
    print(f"Unique Count States: {dfs[idx]['state'].nunique()}")
    print(f"Unique Count Districts: {dfs[idx]['district'].nunique()}")

count: 1
Unique Count States: 53
Unique Count Districts: 947
count: 2
Unique Count States: 56
Unique Count Districts: 945
count: 3
Unique Count States: 53
Unique Count Districts: 943
count: 4
Unique Count States: 55
Unique Count Districts: 950


- data is messy and raw - needs aggresive cleaning
- I'll follow the cleaning pattern used in last notebook
- raw names -> normalize names -> clean name -> minimal mapping -> return title()

# Data Cleaning

## State names cleaning

In [3]:
for idx,df in enumerate(dfs):
    print(f'Df Idx: {idx+1}')
    print(df['state'].unique())

Df Idx: 1
['Goa' 'Gujarat' 'Haryana' 'Himachal Pradesh' 'Jammu & Kashmir'
 'Jammu and Kashmir' 'Jharkhand' 'Karnataka' 'Kerala'
 'Andaman & Nicobar Islands' 'Andaman and Nicobar Islands'
 'Andhra Pradesh' 'Mizoram' 'Nagaland' 'Odisha' 'Orissa' 'Pondicherry'
 'Puducherry' 'Punjab' 'Rajasthan' 'Sikkim' 'Tamil Nadu' 'Telangana'
 'Tripura' 'Uttar Pradesh' 'Uttarakhand' 'West Bengal' 'Ladakh'
 'Lakshadweep' 'Madhya Pradesh' 'Maharashtra' 'Manipur' 'Meghalaya'
 'Arunachal Pradesh' 'Assam' 'Bihar' 'Chandigarh' 'Chhattisgarh'
 'Dadra and Nagar Haveli' 'Daman & Diu' 'Delhi' 'Daman and Diu'
 'West  Bengal' 'Dadra and Nagar Haveli and Daman and Diu'
 'Dadra & Nagar Haveli' 'West bengal' 'West Bangal' 'ODISHA' 'WEST BENGAL'
 'andhra pradesh' 'odisha' 'Westbengal' 'WESTBENGAL']
Df Idx: 2
['Haryana' 'Bihar' 'Jammu and Kashmir' 'Tamil Nadu' 'Maharashtra'
 'Gujarat' 'Odisha' 'West Bengal' 'Kerala' 'Rajasthan' 'Punjab'
 'Himachal Pradesh' 'Uttar Pradesh' 'Assam' 'Uttarakhand' 'Madhya Pradesh'
 'Karnata

In [4]:
## Cleaning Pipeline
def normalize_state(s):
    if pd.isna(s):
        return np.nan

    s = str(s)

    ##encoding 
    s = unicodedata.normalize('NFKD',s)
    s = s.encode('ascii','ignore').decode('ascii')

    #string manipulation
    s = s.lower()
    s = s.replace('&','and')
    ## removing punctuations/spaces
    s = re.sub(r"[\(\)\[\]\.,:]"," ",s)
    s = s.replace('/'," ")
    s = re.sub(r"\s+"," ",s)
    s = s.strip()
    ##pure numbers
    if re.fullmatch(r"\d+",s):
        return np.nan
    return s


##minimal canonical map
STATE_MAP = {

    # ---------------- Simple aliases ----------------
    "orissa": "odisha",
    "uttaranchal": "uttarakhand",
    "pondicherry": "puducherry",
    "tamilnadu": "tamil nadu",
    "chhatisgarh": "chhattisgarh",
    "west bangal": "west bengal",
    "westbengal": "west bengal",

    # ---------------- Merged UT (FORCE CANONICAL) ----------------
    "dadra and nagar haveli": 
        "dadra and nagar haveli and daman and diu",

    "daman and diu": 
        "dadra and nagar haveli and daman and diu",

    "dadra and nagar haveli and daman and diu": 
        "dadra and nagar haveli and daman and diu",

    # ---------------- Symbol normalization ----------------
    "jammu and kashmir": "jammu and kashmir",
    "andaman and nicobar islands": "andaman and nicobar islands",
    "andhra pradesh": "andhra pradesh",
    "odisha": "odisha",
    "west bengal": "west bengal",
}


def clean_state_column(df,col='state'):
    df = df.copy()

    df['_norm_state'] = df[col].apply(normalize_state)

    df['_mapped_state'] = df['_norm_state'].map(
        lambda x: STATE_MAP.get(x,x) if pd.notna(x) else np.nan
    )

    df['state_clean'] = (
        df['_mapped_state']
        .str.title()
        .str.replace(r'\s+'," ",regex=True)
        .str.strip()
    )

    return df

In [5]:
state_cleaned_df = [clean_state_column(df) for df in dfs]

for idx,df in enumerate(state_cleaned_df):
    print(f'DF index: {idx+1}')
    print(df['state_clean'].nunique())
    print(df['state_clean'].unique())

DF index: 1
36
['Goa' 'Gujarat' 'Haryana' 'Himachal Pradesh' 'Jammu And Kashmir'
 'Jharkhand' 'Karnataka' 'Kerala' 'Andaman And Nicobar Islands'
 'Andhra Pradesh' 'Mizoram' 'Nagaland' 'Odisha' 'Puducherry' 'Punjab'
 'Rajasthan' 'Sikkim' 'Tamil Nadu' 'Telangana' 'Tripura' 'Uttar Pradesh'
 'Uttarakhand' 'West Bengal' 'Ladakh' 'Lakshadweep' 'Madhya Pradesh'
 'Maharashtra' 'Manipur' 'Meghalaya' 'Arunachal Pradesh' 'Assam' 'Bihar'
 'Chandigarh' 'Chhattisgarh' 'Dadra And Nagar Haveli And Daman And Diu'
 'Delhi']
DF index: 2
36
['Haryana' 'Bihar' 'Jammu And Kashmir' 'Tamil Nadu' 'Maharashtra'
 'Gujarat' 'Odisha' 'West Bengal' 'Kerala' 'Rajasthan' 'Punjab'
 'Himachal Pradesh' 'Uttar Pradesh' 'Assam' 'Uttarakhand' 'Madhya Pradesh'
 'Karnataka' 'Andhra Pradesh' 'Telangana' 'Goa' 'Nagaland' 'Jharkhand'
 'Delhi' 'Chhattisgarh' 'Meghalaya' 'Chandigarh' 'Puducherry' 'Manipur'
 'Sikkim' 'Tripura' 'Mizoram' 'Arunachal Pradesh' 'Ladakh'
 'Dadra And Nagar Haveli And Daman And Diu' 'Andaman And Nicobar I

### Final cleaned state should have a count of 36 (28 states and 8 Union Territories)

## District Name Cleaning

In [6]:
for idx,df in enumerate(state_cleaned_df):
    print(f'DF index: {idx+1}')
    print(df['district'].nunique())
    print(df['district'].unique())

DF index: 1
947
['North Goa' 'South Goa' 'Ahmadabad' 'Ahmedabad' 'Amreli' 'Anand'
 'Arvalli' 'Banaskantha' 'Bharuch' 'Bhavnagar' 'Botad' 'Chhotaudepur'
 'Dahod' 'Devbhumi Dwarka' 'Dohad' 'Gandhinagar' 'Gir Somnath' 'Jamnagar'
 'Junagadh' 'Kachchh' 'Kheda' 'Mahesana' 'Mahisagar' 'Morbi' 'Narmada'
 'Navsari' 'Panchmahals' 'Patan' 'Porbandar' 'Rajkot' 'Sabarkantha'
 'Surat' 'Surendra Nagar' 'Surendranagar' 'Tapi' 'The Dangs' 'Vadodara'
 'Valsad' 'Ambala' 'Bhiwani' 'Charkhi Dadri' 'Faridabad' 'Fatehabad'
 'Gurgaon' 'Hisar' 'Jhajjar' 'Jind' 'Kaithal' 'Karnal' 'Mahendragarh'
 'Mewat' 'Palwal' 'Panchkula' 'Panipat' 'Rewari' 'Rohtak' 'Sirsa'
 'Sonipat' 'Yamuna Nagar' 'Bilaspur' 'Chamba' 'Hamirpur' 'Kangra'
 'Kinnaur' 'Kullu' 'Mandi' 'Shimla' 'Sirmaur' 'Solan' 'Una' 'Kathua'
 'Anantnag' 'Badgam' 'Bandipore' 'Baramula' 'Budgam' 'Doda' 'Ganderbal'
 'Jammu' 'Kulgam' 'Kupwara' 'Leh' 'Pulwama' 'Punch' 'Rajouri' 'Reasi'
 'Samba' 'Srinagar' 'Udhampur' 'Bokaro' 'Chatra' 'Deoghar' 'Dhanbad'
 'Dumka' 'Ea

In [7]:
##district cleaning pipeline

def normalize_district(s):
    if pd.isna(s):
        return np.nan

    s = str(s)

    s = unicodedata.normalize("NFKD",s) ##decomposes the characters to base level unicode representation
    s = s.encode('ascii','ignore').decode('ascii')
    
    s = s.lower()
    s = s.replace('&','and')
    s = s.replace("−", "-").replace("–", "-").replace("—", "-") ##hyphen normalizatoin

    s = re.sub(r"[\(\)\[\]\{\}]"," ",s) ##removing junk brackets
    s = re.sub(r"[^\w\s\-]"," ",s) ##junk punctuations

    for k, v in NUMBER_WORDS.items():
        s = re.sub(rf"\b{k}\b", v, s)
        
    s = re.sub(r"\b20\s+4\b", "24", s)
    s = re.sub(r"\s+"," ",s).strip() ##collapse spaces

    if re.fullmatch(r'\d+',s):
        return np.nan ## district names can't be numbers

    return s

##allowed directions
DIRECTION_ONLY = {
        "north", "south", "east", "west",
    "north east", "north west", "south east", "south west"
}

##sub localities
GARBAGE_PATTERNS = [
    r"\bnear\b",
    r"\broad\b",
    r"\bcolony\b",
    r"\bcross\b",
    r"\bthana\b",
    r"\bsector\b",
    r"\bward\b",
    r"\bblock\b",
]


EXPLICIT_GARBAGE = {
    "domjur",
    "bardez",
    "tiswadi",
    "najafgarh",
    "bally jagachha",
    "south dumdum m",
}


def is_garbage(s):
    if s is None or pd.isna(s):
        return True

    s = s.strip()

    if len(s) <= 2:
        return True

    if s in DIRECTION_ONLY:
        return True

    if s in EXPLICIT_GARBAGE:
        return True


    for pat in GARBAGE_PATTERNS:
        if re.search(pat,s):
            return True

    return False


NUMBER_WORDS = {
    "one": "1", "two": "2", "three": "3", "four": "4",
    "five": "5", "six": "6", "seven": "7", "eight": "8",
    "nine": "9", "ten": "10", "twenty": "20",
}

#minimal cannonical map
CANONICAL_MAP = {

    # ---- spelling duplicates ----
    "viluppuram": "villupuram",
    "rangareddi": "rangareddy",
    "buldana": "buldhana",
    "hazaribag": "hazaribagh",
    "malda": "maldah",
    "puruliya": "purulia",
    "jhunjhunun": "jhunjhunu",
    "khorda": "khordha",

    # ---- Andhra normalization ----
    "ananthapur": "ananthapuramu",
    "anantapur": "ananthapuramu",

    #------ Karnataka-----
    # 'Chickmagalur':'Chikmagalur',
    "Chikkaballapur":"Chikmagalur",
    "Chikkamagaluru":"Chikmagalur",
    "Chikmagalur":"Chikmagalur",

    # ---- Numeric normalization ----
    "north 24 parganas": "north 24 parganas",
    "north twenty four parganas": "north 24 parganas",
    "south 24 parganas": "south 24 parganas",
    "south twenty four parganas": "south 24 parganas",

    # ---- Renamed districts ----
    "allahabad": "prayagraj",
    "aurangabad bh": "chhatrapati sambhajinagar",
    "osmanabad": "dharashiv",

    # ---- Merged UT ----
    "diu": "dadra and nagar haveli and daman and diu",
    "dadra and nagar haveli": "dadra and nagar haveli and daman and diu",
    "daman and diu": "dadra and nagar haveli and daman and diu",
}

CANONICAL_MAP.update({

    # ---- West Bengal ----
    "hugli": "hooghly",
    "hooghiy": "hooghly",
    "haora": "howrah",
    "medinipur": "paschim medinipur",

    # ---- Odisha ----
    "baleswar": "baleshwar",
    "sonapur": "subarnapur",

    # ---- Maharashtra ----
    "ahmadnagar": "ahilyanagar",
    "ahmednagar": "ahilyanagar",
    "chatrapati sambhaji nagar": "chhatrapati sambhajinagar",
    "raigarh mh": "raigarh",

    # ---- Tamil Nadu ----
    "kanyakumari": "kanniyakumari",
    "thiruvarur": "tiruvarur",
    "tuticorin": "thoothukkudi",
    "tirupathur": "tirupattur",

    # ---- Telangana ----
    "k v rangareddy": "rangareddy",

    # ---- Assam ----
    "north cachar hills": "dima hasao",

    # ---- Garbage collapses ----
    "andamans": np.nan,
    "nicobars": np.nan,
    "leh ladakh": "leh",
    "bijapur kar": "bijapur",
    "mammit": "mamit",
})


def make_key(s):
    if pd.isna(s):
        return np.nan

    s = s.lower()

    s = re.sub(r"[\s\-]", "", s)
    s = re.sub(r"[aeiou]","",s)
    return s


def clean_district_column(df, col="district"):
    df = df.copy()

    df["_raw"] = df[col]

    # normalize
    df["_norm"] = df[col].apply(normalize_district)

    # garbage removal
    df.loc[df["_norm"].apply(is_garbage), "_norm"] = np.nan

    # canonical mapping
    df["_canon"] = df["_norm"].map(
        lambda x: CANONICAL_MAP.get(x, x) if pd.notna(x) else np.nan
    )

    # build grouping key
    df["_key"] = df["_canon"].apply(make_key)

    # auto-collapse by dominant spelling
    key_to_canon = {}

    valid = df.dropna(subset=["_canon", "_key"])

    for key, grp in valid.groupby("_key"):
        canonical = grp["_canon"].value_counts().idxmax()
        key_to_canon[key] = canonical

    df["district_clean"] = df["_key"].map(key_to_canon)

    # formatting
    df["district_clean"] = (
        df["district_clean"]
        .str.replace("-", " ", regex=False)
        .str.title()
        .str.replace(" And ", " and ", regex=False)
        .str.strip()
    )

    return df

In [8]:
cleaned_df = [clean_district_column(df) for df in state_cleaned_df]

for idx,df in enumerate(cleaned_df):
    print(f'Df Idx: {idx+1}')
    print(df['district_clean'].nunique())
    print(df['district_clean'].unique())

Df Idx: 1
793
['North Goa' 'South Goa' 'Ahmedabad' 'Amreli' 'Anand' 'Arvalli'
 'Banaskantha' 'Bharuch' 'Bhavnagar' 'Botad' 'Chhotaudepur' 'Dahod'
 'Devbhumi Dwarka' 'Gandhinagar' 'Gir Somnath' 'Jamnagar' 'Junagadh'
 'Kachchh' 'Kheda' 'Mahesana' 'Mahisagar' 'Morbi' 'Narmada' 'Navsari'
 'Panchmahals' 'Patna' 'Porbandar' 'Rajkot' 'Sabarkantha' 'Surat'
 'Surendra Nagar' 'Tapi' 'The Dangs' 'Vadodara' 'Valsad' 'Ambala'
 'Bhiwani' 'Charkhi Dadri' 'Faridabad' 'Fatehabad' 'Gurgaon' 'Hisar'
 'Jhajjar' 'Jind' 'Kaithal' 'Kurnool' 'Mahendragarh' 'Mewat' 'Palwal'
 'Panchkula' 'Panipat' 'Rewari' 'Rohtak' 'Sirsa' 'Sonipat' 'Yamuna Nagar'
 'Bilaspur' 'Chamba' 'Hamirpur' 'Kangra' 'Kannur' 'Kullu' 'Mandi' 'Shimla'
 'Sirmaur' 'Solan' 'Una' 'Kathua' 'Anantnag' 'Badgam' 'Bandipore'
 'Baramula' 'Doda' 'Ganderbal' 'Jammu' 'Kulgam' 'Kupwara' 'Leh' 'Pulwama'
 'Punch' 'Rajouri' 'Reasi' 'Samba' 'Srinagar' 'Udhampur' 'Bokaro' 'Chatra'
 'Deoghar' 'Dhanbad' 'Dumka' 'East Singhbhum' 'Garhwa' 'Giridih' 'Godda'
 'Gumla

- initial pruning reveals some more collapse like spelling and numbers
- India has about 773 districts

## Concatenating all the cleaned dataframes in one big data frame

In [9]:
cleaned_df[0].head()

Unnamed: 0,date,state,district,pincode,bio_age_5_17,bio_age_17_,_norm_state,_mapped_state,state_clean,_raw,_norm,_canon,_key,district_clean
0,19-09-2025,Goa,North Goa,403502,0,4,goa,goa,Goa,North Goa,north goa,north goa,nrthg,North Goa
1,19-09-2025,Goa,North Goa,403508,1,4,goa,goa,Goa,North Goa,north goa,north goa,nrthg,North Goa
2,19-09-2025,Goa,North Goa,403513,2,0,goa,goa,Goa,North Goa,north goa,north goa,nrthg,North Goa
3,19-09-2025,Goa,North Goa,403527,2,2,goa,goa,Goa,North Goa,north goa,north goa,nrthg,North Goa
4,19-09-2025,Goa,South Goa,403601,7,3,goa,goa,Goa,South Goa,south goa,south goa,sthg,South Goa


In [10]:
df = pd.concat([df[['date','state_clean','district_clean','pincode','bio_age_5_17','bio_age_17_']] for df in cleaned_df],axis=0).reset_index()
df.shape

(1861108, 7)

In [11]:
df.head()

Unnamed: 0,index,date,state_clean,district_clean,pincode,bio_age_5_17,bio_age_17_
0,0,19-09-2025,Goa,North Goa,403502,0,4
1,1,19-09-2025,Goa,North Goa,403508,1,4
2,2,19-09-2025,Goa,North Goa,403513,2,0
3,3,19-09-2025,Goa,North Goa,403527,2,2
4,4,19-09-2025,Goa,South Goa,403601,7,3


In [12]:
df.to_csv('cleaned_df.csv',index=False)