In [2]:
import pandas as pd

In [4]:
biometric_final_df=pd.read_csv("../Dataset/api_data_aadhar_biometric/aadhar_biometric_final.csv")
demographic_final_df=pd.read_csv("../Dataset/api_data_aadhar_demographic/aadhaar_demographic_final.csv")
enrolment_final_df=pd.read_csv("../Dataset/api_data_aadhar_enrolment/aadhar_enrolment_final.csv")

In [5]:
for df in [biometric_final_df, demographic_final_df, enrolment_final_df]:
    df['join_date'] = pd.to_datetime(
        df['date'],
        format='mixed',
        dayfirst=True,
        errors='coerce'
    )
bio_demo_df = pd.merge(
    biometric_final_df,
    demographic_final_df,
    on=['join_date', 'state', 'district', 'pincode'],
    how='outer',
    suffixes=('_bio', '_demo')
)
final_df = pd.merge(
    bio_demo_df,
    enrolment_final_df,
    on=['join_date', 'state', 'district', 'pincode'],
    how='outer',
    suffixes=('', '_enr')
)
final_df['date_final'] = (
    final_df['date_bio']
    .combine_first(final_df['date_demo'])
    .combine_first(final_df['date'])
)



In [6]:
final_df=final_df.drop(columns=['date_bio','date_final','date','date_demo'],errors='ignore')

In [7]:
final_df.head()

Unnamed: 0,state,district,pincode,bio_age_5_17,bio_age_17_,join_date,demo_age_5_17,demo_age_17_,age_0_5,age_5_17,age_18_greater
0,Andaman & Nicobar Islands,Andamans,744101,16.0,193.0,2025-03-01,,,,,
1,Andaman and Nicobar Islands,Nicobar,744301,101.0,48.0,2025-03-01,16.0,180.0,,,
2,Andaman and Nicobar Islands,Nicobar,744302,15.0,12.0,2025-03-01,,,,,
3,Andaman and Nicobar Islands,Nicobar,744303,46.0,27.0,2025-03-01,,,,,
4,Andaman and Nicobar Islands,Nicobar,744304,16.0,14.0,2025-03-01,,,,,


# State Normalisation


In [8]:
import json
import pandas as pd
import re
from rapidfuzz import process, fuzz


In [11]:
# Load JSON
with open("../Dataset/GeoJSON/states_districts.json", "r", encoding="utf-8") as f:
    state_data = json.load(f)

# Extract official state names
OFFICIAL_STATES = [s["state"] for s in state_data["states"]]

In [12]:
OFFICIAL_STATES


['Andhra Pradesh',
 'Arunachal Pradesh',
 'Assam',
 'Bihar',
 'Chhattisgarh',
 'Goa',
 'Gujarat',
 'Haryana',
 'Himachal Pradesh',
 'Jharkhand',
 'Karnataka',
 'Kerala',
 'Madhya Pradesh',
 'Maharashtra',
 'Odisha',
 'Punjab',
 'Rajasthan',
 'Tamil Nadu',
 'Telangana',
 'Uttar Pradesh',
 'West Bengal',
 'Andaman and Nicobar Islands',
 'Chandigarh',
 'Dadra and Nagar Haveli and Daman and Diu',
 'Delhi',
 'Jammu and Kashmir',
 'Ladakh',
 'Lakshadweep',
 'Puducherry']

In [13]:
def normalize_text(text):
    if pd.isna(text):
        return None

    text = text.lower()
    text = text.replace("&", "and")
    text = re.sub(r"[^a-z\s]", "", text)   # remove punctuation
    text = re.sub(r"\s+", " ", text).strip()
    return text


In [14]:
# Create mapping of normalized â†’ official
normalized_state_map = {
    normalize_text(state): state for state in OFFICIAL_STATES
}

normalized_state_keys = list(normalized_state_map.keys())


In [15]:
def match_state_fuzzy(state_name, threshold=85):
    if pd.isna(state_name):
        return pd.Series([None, None])

    normalized = normalize_text(state_name)

    match, score, _ = process.extractOne(
        normalized,
        normalized_state_keys,
        scorer=fuzz.token_sort_ratio
    )

    if score >= threshold:
        return pd.Series([normalized_state_map[match], score])
    else:
        return pd.Series([None, score])


In [19]:
final_df[["state_normalized", "state_match_score"]] = (
    final_df["state"].apply(match_state_fuzzy)
)


In [31]:
final_df["state"].unique()

array(['Andaman & Nicobar Islands', 'Andaman and Nicobar Islands',
       'Andhra Pradesh', 'Arunachal Pradesh', 'Assam', 'Bihar',
       'Chandigarh', 'Chhattisgarh', 'Dadra & Nagar Haveli',
       'Dadra and Nagar Haveli',
       'Dadra and Nagar Haveli and Daman and Diu', 'Daman & Diu',
       'Daman and Diu', 'Delhi', 'Goa', 'Gujarat', 'Haryana',
       'Himachal Pradesh', 'Jammu and Kashmir', 'Jharkhand', 'Karnataka',
       'Kerala', 'Ladakh', 'Lakshadweep', 'Madhya Pradesh', 'Maharashtra',
       'Manipur', 'Meghalaya', 'Mizoram', 'Nagaland', 'Odisha', 'Orissa',
       'Pondicherry', 'Puducherry', 'Punjab', 'Rajasthan', 'Sikkim',
       'Tamil Nadu', 'Telangana', 'Tripura', 'Uttar Pradesh',
       'Uttarakhand', 'West Bengal',
       'The Dadra And Nagar Haveli And Daman And Diu',
       'Jammu And Kashmir', 'Jammu & Kashmir', 'ODISHA', 'WEST BENGAL',
       'WESTBENGAL', 'West  Bengal', 'West bengal', 'Westbengal',
       'andhra pradesh', 'odisha', 'west Bengal', '100000', 'We

In [30]:
final_df["state_normalized"].unique()


array(['Andaman and Nicobar Islands', 'Andhra Pradesh',
       'Arunachal Pradesh', 'Assam', 'Bihar', 'Chandigarh',
       'Chhattisgarh', nan, 'Dadra and Nagar Haveli and Daman and Diu',
       'Delhi', 'Goa', 'Gujarat', 'Haryana', 'Himachal Pradesh',
       'Jammu and Kashmir', 'Jharkhand', 'Karnataka', 'Kerala', 'Ladakh',
       'Lakshadweep', 'Madhya Pradesh', 'Maharashtra', 'Odisha',
       'Puducherry', 'Punjab', 'Rajasthan', 'Tamil Nadu', 'Telangana',
       'Uttar Pradesh', 'West Bengal'], dtype=object)

In [33]:
import pandas as pd
import re

# Normalize raw state text
final_df["state"] = (
    final_df["state"]
    .astype(str)
    .str.strip()
    .str.lower()
    .str.replace(r"\s+", " ", regex=True)
    .str.replace("&", "and")
)

# Explicit state corrections (canonical mapping)
state_corrections = {

    # Andaman & Nicobar
    "andaman and nicobar islands": "Andaman and Nicobar Islands",

    # Andhra Pradesh
    "andhra pradesh": "Andhra Pradesh",

    # Telangana
    "telangana": "Telangana",
    
    #Tamil Nadu
    "tamil nadu": "Tamil Nadu",
    "tamilnadu": "Tamil Nadu",

    # Bihar
    "bihar": "Bihar",

    # Chhattisgarh
    "chhatisgarh": "Chhattisgarh",

    # Dadra & Nagar Haveli and Daman & Diu (merged UT)
    "dadra and nagar haveli": "Dadra and Nagar Haveli and Daman and Diu",
    "daman and diu": "Dadra and Nagar Haveli and Daman and Diu",
    "the dadra and nagar haveli and daman and diu": "Dadra and Nagar Haveli and Daman and Diu",

    # Delhi
    "delhi": "Delhi",

    # Jammu & Kashmir
    "jammu and kashmir": "Jammu and Kashmir",

    # Odisha
    "odisha": "Odisha",
    "orissa": "Odisha",

    # Puducherry
    "pondicherry": "Puducherry",

    # Uttarakhand
    "uttaranchal": "Uttarakhand",

    # West Bengal variants
    "west bengal": "West Bengal",
    "west bengli": "West Bengal",
    "west bangal": "West Bengal",
    "westbengal": "West Bengal",
}

# Apply corrections
final_df["state"] = final_df["state"].replace(state_corrections)

# Final formatting
final_df["state"] = final_df["state"].str.title()

# Inspect results
unique_states = sorted(final_df["state"].unique())

print("Total number of unique states:", len(unique_states))
for state in unique_states:
    print(state)

Total number of unique states: 42
100000
Andaman And Nicobar Islands
Andhra Pradesh
Arunachal Pradesh
Assam
Balanagar
Bihar
Chandigarh
Chhattisgarh
Dadra And Nagar Haveli And Daman And Diu
Darbhanga
Delhi
Goa
Gujarat
Haryana
Himachal Pradesh
Jaipur
Jammu And Kashmir
Jharkhand
Karnataka
Kerala
Ladakh
Lakshadweep
Madanapalle
Madhya Pradesh
Maharashtra
Manipur
Meghalaya
Mizoram
Nagaland
Odisha
Puducherry
Punjab
Puttenahalli
Rajasthan
Sikkim
Tamil Nadu
Telangana
Tripura
Uttar Pradesh
Uttarakhand
West Bengal
