In [1]:
# Standard library
import datetime as dt
import warnings
from pathlib import Path

# Core
import numpy as np
import pandas as pd

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Warnings control
warnings.filterwarnings("ignore")  # blanket filter: every warning category is hidden from here on

# Display settings (optional): no cap on the number of columns shown, display width of 100 characters
for option_name, option_value in (
    ("display.max_columns", None),
    ("display.width", 100),
):
    pd.set_option(option_name, option_value)

In [2]:
# Read the four CSV chunks, keeping one variable per file (same order as the paths below).
(
    api_data_aadhar_biometric_0_500000,
    api_data_aadhar_biometric_1000000_1500000,
    api_data_aadhar_biometric_1500000_1861108,
    api_data_aadhar_biometric_500000_1000000,
) = map(
    pd.read_csv,
    (
        'Datasets/api_data_aadhar_biometric/api_data_aadhar_biometric_0_500000.csv',
        'Datasets/api_data_aadhar_biometric/api_data_aadhar_biometric_1000000_1500000.csv',
        'Datasets/api_data_aadhar_biometric/api_data_aadhar_biometric_1500000_1861108.csv',
        'Datasets/api_data_aadhar_biometric/api_data_aadhar_biometric_500000_1000000.csv',
    ),
)

In [3]:
dfs = [
    api_data_aadhar_biometric_0_500000,
    api_data_aadhar_biometric_1000000_1500000,
    api_data_aadhar_biometric_1500000_1861108,
    api_data_aadhar_biometric_500000_1000000,
]

# Preview each chunk: banner, first five rows, DataFrame.size (rows x columns), separator.
for i, df in enumerate(dfs, start=1):
    print(f"\n===== DataFrame {i} =====", df.head(5), df.size, "-" * 40, sep="\n")


===== DataFrame 1 =====
         date              state      district  pincode  bio_age_5_17  bio_age_17_
0  01-03-2025            Haryana  Mahendragarh   123029           280          577
1  01-03-2025              Bihar     Madhepura   852121           144          369
2  01-03-2025  Jammu and Kashmir         Punch   185101           643         1091
3  01-03-2025              Bihar       Bhojpur   802158           256          980
4  01-03-2025         Tamil Nadu       Madurai   625514           271          815
3000000
----------------------------------------

===== DataFrame 2 =====
         date             state      district  pincode  bio_age_5_17  bio_age_17_
0  07-11-2025           Haryana  Yamuna Nagar   135002             4            6
1  07-11-2025           Haryana   Yamunanagar   135001             1            2
2  07-11-2025  Himachal Pradesh      Bilaspur   174004             2            2
3  07-11-2025  Himachal Pradesh      Bilaspur   174005             1       

In [4]:
dfs = [
    api_data_aadhar_biometric_0_500000,
    api_data_aadhar_biometric_1000000_1500000,
    api_data_aadhar_biometric_1500000_1861108,
    api_data_aadhar_biometric_500000_1000000,
]

# Column-name mismatch to repair; frames that already use "bio_age_17_" are unaffected.
column_renames = {"bio_age_17": "bio_age_17_"}

# Each chunk is modified in place, so the individually named chunk variables change too.
for i, df in enumerate(dfs, start=1):

    # 1) Rename column (fix mismatch)
    df.rename(columns=column_renames, inplace=True)

    # 2) Change data types
    # errors="coerce": any date that does not match DD-MM-YYYY becomes NaT instead of raising.
    df["date"] = pd.to_datetime(df["date"], format="%d-%m-%Y", errors="coerce")
    for col in ("state", "district"):
        df[col] = df[col].astype("category")
    # Pincodes are stored as text and left-padded with zeros to six characters.
    df["pincode"] = df["pincode"].astype("string").str.zfill(6)
    for col in ("bio_age_5_17", "bio_age_17_"):
        df[col] = df[col].astype("int32")

    # 3) Report
    print(f"\n===== DataFrame {i} =====", df.head(5), sep="\n")
    print("Rows, Cols:", df.shape)
    print("-" * 40)


===== DataFrame 1 =====
        date              state      district pincode  bio_age_5_17  bio_age_17_
0 2025-03-01            Haryana  Mahendragarh  123029           280          577
1 2025-03-01              Bihar     Madhepura  852121           144          369
2 2025-03-01  Jammu and Kashmir         Punch  185101           643         1091
3 2025-03-01              Bihar       Bhojpur  802158           256          980
4 2025-03-01         Tamil Nadu       Madurai  625514           271          815
Rows, Cols: (500000, 6)
----------------------------------------

===== DataFrame 2 =====
        date             state      district pincode  bio_age_5_17  bio_age_17_
0 2025-11-07           Haryana  Yamuna Nagar  135002             4            6
1 2025-11-07           Haryana   Yamunanagar  135001             1            2
2 2025-11-07  Himachal Pradesh      Bilaspur  174004             2            2
3 2025-11-07  Himachal Pradesh      Bilaspur  174005             1            0

In [5]:
# Stack every frame in dfs row-wise and renumber the index from 0.
final_df = pd.concat(objs=dfs, axis=0, ignore_index=True)
final_df.head(n=5)

Unnamed: 0,date,state,district,pincode,bio_age_5_17,bio_age_17_
0,2025-03-01,Haryana,Mahendragarh,123029,280,577
1,2025-03-01,Bihar,Madhepura,852121,144,369
2,2025-03-01,Jammu and Kashmir,Punch,185101,643,1091
3,2025-03-01,Bihar,Bhojpur,802158,256,980
4,2025-03-01,Tamil Nadu,Madurai,625514,271,815


In [6]:
# Distinct raw state labels, in order of first appearance.
final_df["state"].drop_duplicates().tolist()

['Haryana',
 'Bihar',
 'Jammu and Kashmir',
 'Tamil Nadu',
 'Maharashtra',
 'Gujarat',
 'Odisha',
 'West Bengal',
 'Kerala',
 'Rajasthan',
 'Punjab',
 'Himachal Pradesh',
 'Uttar Pradesh',
 'Assam',
 'Uttarakhand',
 'Madhya Pradesh',
 'Karnataka',
 'Andhra Pradesh',
 'Telangana',
 'Goa',
 'Nagaland',
 'Jharkhand',
 'Delhi',
 'Chhattisgarh',
 'Meghalaya',
 'Chandigarh',
 'Orissa',
 'Puducherry',
 'Pondicherry',
 'Manipur',
 'Sikkim',
 'Tripura',
 'Mizoram',
 'Arunachal Pradesh',
 'Ladakh',
 'Dadra and Nagar Haveli and Daman and Diu',
 'Daman and Diu',
 'Andaman and Nicobar Islands',
 'Andaman & Nicobar Islands',
 'Dadra and Nagar Haveli',
 'Lakshadweep',
 'Daman & Diu',
 'Dadra & Nagar Haveli',
 'Jammu & Kashmir',
 'WESTBENGAL',
 'andhra pradesh',
 'Westbengal',
 'West  Bengal',
 'WEST BENGAL',
 'West Bangal',
 'ODISHA',
 'odisha',
 'West bengal',
 'west Bengal',
 'Uttaranchal',
 'Chhatisgarh',
 'Tamilnadu']

In [7]:
# ✅ Clean state column: trim, collapse repeated whitespace, title-case
state_raw = final_df["state"]
state_clean = (
    state_raw.astype(str)
    .str.strip()
    .str.replace(r"\s+", " ", regex=True)
    .str.title()
)

# ✅ Fix duplicates / wrong spellings
# Keys are spelled exactly as the labels look AFTER the normalisation above
# (single spaces, title case), so "and" appears as "And". Identity entries and
# keys containing double spaces were dropped because they can never change a value.
merged_ut = "Dadra & Nagar Haveli and Daman & Diu"
state_fix = {
    # Odisha
    "Orissa": "Odisha",

    # West Bengal
    "Westbengal": "West Bengal",
    "West Beng al": "West Bengal",
    "West Bangal": "West Bengal",

    # Tamil Nadu
    "Tamilnadu": "Tamil Nadu",

    # Uttarakhand
    "Uttaranchal": "Uttarakhand",

    # Chhattisgarh
    "Chhatisgarh": "Chhattisgarh",

    # Jammu & Kashmir
    "Jammu And Kashmir": "Jammu & Kashmir",

    # Andaman & Nicobar
    "Andaman And Nicobar Islands": "Andaman & Nicobar Islands",

    # Puducherry
    "Pondicherry": "Puducherry",

    # Merge UT name variations
    "Daman And Diu": merged_ut,
    "Daman & Diu": merged_ut,
    "Dadra And Nagar Haveli": merged_ut,
    "Dadra & Nagar Haveli": merged_ut,
    "Dadra And Nagar Haveli And Daman And Diu": merged_ut,
    # Title-casing turns the canonical label's "and" into "And" (also on a re-run
    # of this cell); map it back in the same pass instead of a second replace().
    "Dadra & Nagar Haveli And Daman & Diu": merged_ut,
}

# astype(str) renders missing values as the text "nan" (title-cased to "Nan");
# .where() restores them as missing so they never show up as a state called "Nan".
final_df["state"] = state_clean.replace(state_fix).where(state_raw.notna())

# ✅ Check final unique states
final_df["state"].unique().tolist()

['Haryana',
 'Bihar',
 'Jammu & Kashmir',
 'Tamil Nadu',
 'Maharashtra',
 'Gujarat',
 'Odisha',
 'West Bengal',
 'Kerala',
 'Rajasthan',
 'Punjab',
 'Himachal Pradesh',
 'Uttar Pradesh',
 'Assam',
 'Uttarakhand',
 'Madhya Pradesh',
 'Karnataka',
 'Andhra Pradesh',
 'Telangana',
 'Goa',
 'Nagaland',
 'Jharkhand',
 'Delhi',
 'Chhattisgarh',
 'Meghalaya',
 'Chandigarh',
 'Puducherry',
 'Manipur',
 'Sikkim',
 'Tripura',
 'Mizoram',
 'Arunachal Pradesh',
 'Ladakh',
 'Dadra & Nagar Haveli and Daman & Diu',
 'Andaman & Nicobar Islands',
 'Lakshadweep']

In [8]:
# Distinct raw district labels, in order of first appearance.
pd.unique(final_df["district"]).tolist()

['Mahendragarh',
 'Madhepura',
 'Punch',
 'Bhojpur',
 'Madurai',
 'Ratnagiri',
 'Anand',
 'Gandhinagar',
 'Dhenkanal',
 'Valsad',
 'Salem',
 'Hooghly',
 'Paschim Medinipur',
 'Wayanad',
 'Sawai Madhopur',
 'Vaishali',
 'Karur',
 'Rupnagar',
 'Sabarkantha',
 'Una',
 'Bhilwara',
 'Bara Banki',
 'Cachar',
 'Dehradun',
 'Chamba',
 'Shajapur',
 'The Nilgiris',
 'Wardha',
 'Nabarangapur',
 'Shaheed Bhagat Singh Nagar',
 'Davangere',
 'Dahod',
 'Tiruppur',
 'Vizianagaram',
 'Moga',
 'Rajkot',
 'Hyderabad',
 'Sundergarh',
 'Kurnool',
 'Warangal',
 'Tirunelveli',
 'South Goa',
 'Anantapur',
 'Thiruvananthapuram',
 'Srikakulam',
 'Nizamabad',
 'Cuttack',
 'Baran',
 'Guntur',
 'Mysuru',
 'Purba Medinipur',
 'Ernakulam',
 'Balaghat',
 'Sivaganga',
 'Harda *',
 'Mokokchung',
 'Uttara Kannada',
 'Perambalur',
 'Udupi',
 'Ananthapur',
 'Coimbatore',
 'West Singhbhum',
 'Tumakuru',
 'Birbhum',
 'Visakhapatnam',
 'Hardoi',
 'Siddharthnagar',
 'Dhanbad',
 'Darbhanga',
 'Jabalpur',
 'North 24 Parganas',


In [9]:
# Normalise district labels. Stray characters are removed BEFORE whitespace is
# collapsed: stripping "*" or "?" from the middle of a name can leave two spaces
# side by side, and those must be squeezed afterwards so the label can match the
# single-spaced keys used for spelling fixes.
district_raw = final_df["district"]
district_clean = (
    district_raw.astype(str)
    .str.replace("\xa0", " ", regex=False)    # non-breaking space -> normal space
    .str.replace("*", "", regex=False)        # remove *
    .str.replace("?", "", regex=False)        # remove ?
    .str.replace(r"\s+", " ", regex=True)     # extra spaces
    .str.strip()
    .str.title()
)

# astype(str) renders missing values as the text "nan" (title-cased to "Nan");
# .where() restores them as missing so they are not counted as a district.
final_df["district"] = district_clean.where(district_raw.notna())

In [10]:
# Spelling corrections for district names: variant spelling -> canonical spelling.
# The district column is title-cased before this mapping is applied, so every key
# must be written in its title-cased form (str.title() capitalises the letter after
# any non-letter, e.g. "K.v. Rangareddy" becomes "K.V. Rangareddy"). Keys in any
# other casing can never match and are therefore not listed.
district_fix = {

    # ✅ Odisha
    "Khorda": "Khordha",
    "Anugul": "Angul",
    "Baleshwar": "Baleswar",
    "Baudh": "Boudh",
    "Sonapur": "Subarnapur",
    "Debagarh": "Deogarh",

    # ✅ West Bengal
    "Koch Bihar": "Cooch Behar",
    "Malda": "Maldah",
    "Hawrah": "Howrah",
    "Hugli": "Hooghly",
    "Darjiling": "Darjeeling",
    "Puruliya": "Purulia",
    "Burdwan": "Bardhaman",
    "Barddhaman": "Bardhaman",
    "West Midnapore": "Paschim Medinipur",
    "East Midnapore": "Purba Medinipur",
    "East Midnapur": "Purba Medinipur",
    "North Twenty Four Parganas": "North 24 Parganas",
    "South Twenty Four Parganas": "South 24 Parganas",
    "South 24 Pargana": "South 24 Parganas",

    # ✅ Tamil Nadu
    "The Nilgiris": "Nilgiris",
    "Tuticorin": "Thoothukkudi",
    "Kanyakumari": "Kanniyakumari",
    "Villupuram": "Viluppuram",
    "Tirupathur": "Tirupattur",

    # ✅ Karnataka
    "Bangalore": "Bengaluru",
    "Belgaum": "Belagavi",
    "Shimoga": "Shivamogga",
    "Tumkur": "Tumakuru",
    "Mysore": "Mysuru",
    "Hasan": "Hassan",
    "Bellary": "Ballari",

    # ✅ Andhra Pradesh
    "Ananthapur": "Anantapur",
    "Ananthapuramu": "Anantapur",
    "Cuddapah": "Kadapa",
    "Y. S. R": "Kadapa",

    # ✅ Telangana
    "Karim Nagar": "Karimnagar",
    "Rangareddi": "Rangareddy",
    "K.V. Rangareddy": "Rangareddy",
    "K.V.Rangareddy": "Rangareddy",

    # ✅ J&K / Ladakh
    "Punch": "Poonch",
    "Badgam": "Budgam",
    "Baramula": "Baramulla",
    "Shupiyan": "Shopian",
}

# Swap every district that appears as a key in district_fix for its mapped spelling;
# all other values are left as they are.
final_df["district"] = final_df["district"].replace(to_replace=district_fix)


In [11]:
# Distinct district labels after the spelling fixes, in order of first appearance.
final_df["district"].drop_duplicates().tolist()

['Mahendragarh',
 'Madhepura',
 'Poonch',
 'Bhojpur',
 'Madurai',
 'Ratnagiri',
 'Anand',
 'Gandhinagar',
 'Dhenkanal',
 'Valsad',
 'Salem',
 'Hooghly',
 'Paschim Medinipur',
 'Wayanad',
 'Sawai Madhopur',
 'Vaishali',
 'Karur',
 'Rupnagar',
 'Sabarkantha',
 'Una',
 'Bhilwara',
 'Bara Banki',
 'Cachar',
 'Dehradun',
 'Chamba',
 'Shajapur',
 'Nilgiris',
 'Wardha',
 'Nabarangapur',
 'Shaheed Bhagat Singh Nagar',
 'Davangere',
 'Dahod',
 'Tiruppur',
 'Vizianagaram',
 'Moga',
 'Rajkot',
 'Hyderabad',
 'Sundergarh',
 'Kurnool',
 'Warangal',
 'Tirunelveli',
 'South Goa',
 'Anantapur',
 'Thiruvananthapuram',
 'Srikakulam',
 'Nizamabad',
 'Cuttack',
 'Baran',
 'Guntur',
 'Mysuru',
 'Purba Medinipur',
 'Ernakulam',
 'Balaghat',
 'Sivaganga',
 'Harda',
 'Mokokchung',
 'Uttara Kannada',
 'Perambalur',
 'Udupi',
 'Coimbatore',
 'West Singhbhum',
 'Tumakuru',
 'Birbhum',
 'Visakhapatnam',
 'Hardoi',
 'Siddharthnagar',
 'Dhanbad',
 'Darbhanga',
 'Jabalpur',
 'North 24 Parganas',
 'Leh',
 'West Godav

In [12]:
# Treat en dashes and hyphens as word separators, then tidy up the spacing they leave behind.
district_names = final_df["district"]
for dash in ("–", "-"):
    district_names = district_names.str.replace(dash, " ", regex=False)
district_names = district_names.str.replace(r"\s+", " ", regex=True)
final_df["district"] = district_names.str.strip()

In [13]:
# Row count per district label (most frequent first) plus the number of distinct labels.
district_col = final_df["district"]
wrong = district_col.value_counts()
unique_total = district_col.nunique()

print("✅ Total Unique Districts:", unique_total)
print("\nTop 50 districts:\n", wrong.head(50))

✅ Total Unique Districts: 889

Top 50 districts:
 district
Anantapur             18372
Bengaluru             16803
Belagavi              14424
Bardhaman             14038
North 24 Parganas     13429
Viluppuram            13123
Pune                  11586
Thrissur              11165
East Godavari         10647
Purba Medinipur       10438
Karimnagar            10163
Warangal               9976
Palakkad               9605
Hyderabad              9422
South 24 Parganas      9349
Ernakulam              9113
Medak                  9020
Tirunelveli            9003
Khordha                8918
Kanniyakumari          8916
Thiruvananthapuram     8913
Mysuru                 8779
West Godavari          8625
Nalgonda               8461
Paschim Medinipur      8453
Tumakuru               8372
Vellore                8335
Krishna                8309
Guntur                 8208
Baleswar               8192
Kollam                 7997
Kadapa                 7826
Khammam                7810
Kottayam         

In [14]:
# Write the cleaned frame to disk without the index column.
# to_csv does not create missing folders, so make sure the output directory
# exists first; otherwise a fresh checkout fails here with an OSError.
clean_csv_path = Path("Datasets/Clean Datasets/api_data_aadhar_biometric.csv")
clean_csv_path.parent.mkdir(parents=True, exist_ok=True)
final_df.to_csv(clean_csv_path, index=False, encoding="utf-8")