In [1]:
# Core
import numpy as np
import pandas as pd

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Date & Time
import datetime as dt

# Warnings control
import warnings
warnings.filterwarnings("ignore")

# Display settings (optional)
pd.set_option("display.max_columns", None)
pd.set_option("display.width", 100)

In [2]:
api_data_aadhar_demographic_0_500000 = pd.read_csv('Datasets/api_data_aadhar_demographic/api_data_aadhar_demographic_0_500000.csv')
api_data_aadhar_demographic_1000000_1500000 = pd.read_csv('Datasets/api_data_aadhar_demographic/api_data_aadhar_demographic_1000000_1500000.csv')
api_data_aadhar_demographic_1500000_2000000 = pd.read_csv('Datasets/api_data_aadhar_demographic/api_data_aadhar_demographic_1500000_2000000.csv')
api_data_aadhar_demographic_2000000_2071700 = pd.read_csv('Datasets/api_data_aadhar_demographic/api_data_aadhar_demographic_2000000_2071700.csv')
api_data_aadhar_demographic_500000_1000000 = pd.read_csv('Datasets/api_data_aadhar_demographic/api_data_aadhar_demographic_500000_1000000.csv')

In [3]:
dfs = [api_data_aadhar_demographic_0_500000, api_data_aadhar_demographic_1000000_1500000, api_data_aadhar_demographic_1500000_2000000, api_data_aadhar_demographic_2000000_2071700, api_data_aadhar_demographic_500000_1000000]

for i, df in enumerate(dfs, start=1):
    print(f"\n===== DataFrame {i} =====")
    print(df.head(5))
    print(df.size)
    print("-" * 40)


===== DataFrame 1 =====
         date           state    district  pincode  demo_age_5_17  demo_age_17_
0  01-03-2025   Uttar Pradesh   Gorakhpur   273213             49           529
1  01-03-2025  Andhra Pradesh    Chittoor   517132             22           375
2  01-03-2025         Gujarat      Rajkot   360006             65           765
3  01-03-2025  Andhra Pradesh  Srikakulam   532484             24           314
4  01-03-2025       Rajasthan     Udaipur   313801             45           785
3000000
----------------------------------------

===== DataFrame 2 =====
         date       state    district  pincode  demo_age_5_17  demo_age_17_
0  01-11-2025  Puducherry    Karaikal   609603              0             4
1  01-11-2025  Puducherry    Karaikal   609605              0             3
2  01-11-2025  Puducherry    Karaikal   609606              0             5
3  01-11-2025  Puducherry  Puducherry   605011              2             6
4  01-11-2025  Puducherry  Puducherry   6

In [4]:
dfs = [
    api_data_aadhar_demographic_0_500000,
    api_data_aadhar_demographic_500000_1000000,
    api_data_aadhar_demographic_1000000_1500000,
    api_data_aadhar_demographic_1500000_2000000,
    api_data_aadhar_demographic_2000000_2071700
]

# ✅ Clean + change datatypes
for i, df in enumerate(dfs, start=1):

    # ✅ common columns
    df["date"] = pd.to_datetime(df["date"], format="%d-%m-%Y", errors="coerce")
    df["state"] = df["state"].astype("category")
    df["district"] = df["district"].astype("category")
    df["pincode"] = df["pincode"].astype("string").str.zfill(6)

    print(f"\n===== DataFrame {i} =====")
    print(df.head(5))
    print("Rows, Cols:", df.shape)
    print("-" * 40)

# ✅ concat all
final_demographic_df = pd.concat(dfs, ignore_index=True)

print("\n✅ Final Demographic DataFrame Shape:", final_demographic_df.shape)



===== DataFrame 1 =====
        date           state    district pincode  demo_age_5_17  demo_age_17_
0 2025-03-01   Uttar Pradesh   Gorakhpur  273213             49           529
1 2025-03-01  Andhra Pradesh    Chittoor  517132             22           375
2 2025-03-01         Gujarat      Rajkot  360006             65           765
3 2025-03-01  Andhra Pradesh  Srikakulam  532484             24           314
4 2025-03-01       Rajasthan     Udaipur  313801             45           785
Rows, Cols: (500000, 6)
----------------------------------------

===== DataFrame 2 =====
        date        state    district pincode  demo_age_5_17  demo_age_17_
0 2025-09-19  Maharashtra      Satara  415517              0             2
1 2025-09-19  Maharashtra      Satara  415518              0             2
2 2025-09-19  Maharashtra      Satara  415520              0             3
3 2025-09-19  Maharashtra      Satara  415539              1             5
4 2025-09-19  Maharashtra  Sindhudurg  416

In [5]:
final_demographic_df['state'].unique().tolist()

['Uttar Pradesh',
 'Andhra Pradesh',
 'Gujarat',
 'Rajasthan',
 'Karnataka',
 'West Bengal',
 'Telangana',
 'Odisha',
 'Maharashtra',
 'Kerala',
 'Bihar',
 'Tamil Nadu',
 'Madhya Pradesh',
 'Assam',
 'Tripura',
 'Arunachal Pradesh',
 'Punjab',
 'Jharkhand',
 'Delhi',
 'Chandigarh',
 'Chhattisgarh',
 'Jammu and Kashmir',
 'Mizoram',
 'Nagaland',
 'Himachal Pradesh',
 'Goa',
 'Haryana',
 'Meghalaya',
 'Uttarakhand',
 'Manipur',
 'Daman and Diu',
 'Puducherry',
 'Sikkim',
 'Ladakh',
 'Dadra and Nagar Haveli and Daman and Diu',
 'Dadra and Nagar Haveli',
 'Orissa',
 'Pondicherry',
 'Andaman & Nicobar Islands',
 'Andaman and Nicobar Islands',
 'west Bengal',
 'Daman & Diu',
 'West  Bengal',
 'odisha',
 'Jammu & Kashmir',
 'Lakshadweep',
 'Dadra & Nagar Haveli',
 'Westbengal',
 'andhra pradesh',
 'WEST BENGAL',
 'West Bangal',
 'West bengal',
 'ODISHA',
 'WESTBENGAL',
 'Chhatisgarh',
 'West Bengli',
 'Darbhanga',
 'Puttenahalli',
 'BALANAGAR',
 'Uttaranchal',
 '100000',
 'Jaipur',
 'Madanapa

In [6]:
import numpy as np

# ✅ 1) Basic cleaning
final_demographic_df["state"] = (
    final_demographic_df["state"].astype(str)
    .str.strip()
    .str.replace(r"\s+", " ", regex=True)
    .str.title()
)

# ✅ 2) Fix wrong spellings / duplicates
final_demographic_df["state"] = final_demographic_df["state"].replace({

    # Odisha
    "Orissa": "Odisha",
    "Odisha": "Odisha",

    # West Bengal
    "Westbengal": "West Bengal",
    "West  Bengal": "West Bengal",
    "West Bangal": "West Bengal",
    "West Bengli": "West Bengal",
    "West Bengal": "West Bengal",

    # Andhra Pradesh
    "Andhra Pradesh": "Andhra Pradesh",

    # Uttarakhand
    "Uttaranchal": "Uttarakhand",

    # Jammu & Kashmir
    "Jammu And Kashmir": "Jammu & Kashmir",

    # Andaman & Nicobar
    "Andaman And Nicobar Islands": "Andaman & Nicobar Islands",
    "Andaman & Nicobar Islands": "Andaman & Nicobar Islands",

    # Puducherry
    "Pondicherry": "Puducherry",

    # Merge UT names (official combined UT)
    "Daman And Diu": "Dadra & Nagar Haveli and Daman & Diu",
    "Daman & Diu": "Dadra & Nagar Haveli and Daman & Diu",
    "Dadra And Nagar Haveli": "Dadra & Nagar Haveli and Daman & Diu",
    "Dadra And Nagar Haveli And Daman And Diu": "Dadra & Nagar Haveli and Daman & Diu",
})

# ✅ 3) Remove invalid state values (district names / numbers)
valid_states = [
    "Andhra Pradesh","Arunachal Pradesh","Assam","Bihar","Chhattisgarh","Goa","Gujarat","Haryana",
    "Himachal Pradesh","Jharkhand","Karnataka","Kerala","Madhya Pradesh","Maharashtra","Manipur",
    "Meghalaya","Mizoram","Nagaland","Odisha","Punjab","Rajasthan","Sikkim","Tamil Nadu","Telangana",
    "Tripura","Uttar Pradesh","Uttarakhand","West Bengal",
    "Andaman & Nicobar Islands","Chandigarh","Delhi","Jammu & Kashmir","Ladakh","Lakshadweep",
    "Puducherry","Dadra & Nagar Haveli and Daman & Diu"
]

final_demographic_df.loc[~final_demographic_df["state"].isin(valid_states), "state"] = np.nan

# ✅ Check result
print(sorted(final_demographic_df["state"].dropna().unique().tolist()))
print("Missing state rows:", final_demographic_df["state"].isna().sum())

['Andaman & Nicobar Islands', 'Andhra Pradesh', 'Arunachal Pradesh', 'Assam', 'Bihar', 'Chandigarh', 'Chhattisgarh', 'Dadra & Nagar Haveli and Daman & Diu', 'Delhi', 'Goa', 'Gujarat', 'Haryana', 'Himachal Pradesh', 'Jammu & Kashmir', 'Jharkhand', 'Karnataka', 'Kerala', 'Ladakh', 'Lakshadweep', 'Madhya Pradesh', 'Maharashtra', 'Manipur', 'Meghalaya', 'Mizoram', 'Nagaland', 'Odisha', 'Puducherry', 'Punjab', 'Rajasthan', 'Sikkim', 'Tamil Nadu', 'Telangana', 'Tripura', 'Uttar Pradesh', 'Uttarakhand', 'West Bengal']
Missing state rows: 117


In [7]:
final_demographic_df['district'].unique().tolist()

['Gorakhpur',
 'Chittoor',
 'Rajkot',
 'Srikakulam',
 'Udaipur',
 'Sikar',
 'Tumakuru',
 'Kurnool',
 'Paschim Medinipur',
 'Ghazipur',
 'Patan',
 'Mulugu',
 'Ganganagar',
 'Nayagarh',
 'Nashik',
 'Shivamogga',
 'Thrissur',
 'Hassan',
 'Patna',
 'Belgaum',
 'Kancheepuram',
 'Jabalpur',
 'Chennai',
 'Tinsukia',
 'Jamui',
 'Gaya',
 'Bengaluru',
 'Tiruppur',
 'Jalgaon',
 'Dhalai',
 'Nabarangapur',
 'Chittorgarh',
 'Darbhanga',
 'Purnia',
 'Muzaffarnagar',
 'Jaipur',
 'Churu',
 'Bharatpur',
 'Jamnagar',
 'Howrah',
 'Ahmedabad',
 'West Kameng',
 'Jalor',
 'Belagavi',
 'Bahraich',
 'Nagapattinam',
 'Sheikhpura',
 'Amritsar',
 'Mysore',
 'Alluri Sitharama Raju',
 'Sitamarhi',
 'Khunti',
 'Saharanpur',
 'Kanyakumari',
 'Prakasam',
 'Ananthapuramu',
 'Hooghly',
 'Central Delhi',
 'Deoria',
 'Giridih',
 'Koraput',
 'Shahjahanpur',
 'Chhotaudepur',
 'Jalandhar',
 'Kannauj',
 'Visakhapatnam',
 'Jodhpur',
 'Karimnagar',
 'Amethi',
 'Hyderabad',
 'Salem',
 'Cuddalore',
 'Sabarkantha',
 'Kolar',
 'Nag

In [8]:
final_demographic_df["district"] = (
    final_demographic_df["district"].astype(str)
    .str.strip()
    .str.replace(r"\xa0", " ", regex=True)
    .str.replace(r"\s+", " ", regex=True)
    .str.replace(r"\*", "", regex=True)
    .str.replace(r"[^\w\s\-\.\(\)&/]", "", regex=True)  # removes weird symbols
    .str.title()
)

In [9]:
district_fix = {
    # Odisha
    "Khorda": "Khordha",
    "Anugul": "Angul",
    "Baleshwar": "Baleswar",
    "Baudh": "Boudh",
    "Sonapur": "Subarnapur",

    # West Bengal
    "Koch Bihar": "Cooch Behar",
    "Malda": "Maldah",
    "Hawrah": "Howrah",
    "Hugli": "Hooghly",
    "Darjiling": "Darjeeling",
    "Puruliya": "Purulia",
    "Burdwan": "Bardhaman",
    "Barddhaman": "Bardhaman",
    "West Midnapore": "Paschim Medinipur",
    "East Midnapore": "Purba Medinipur",
    "North Twenty Four Parganas": "North 24 Parganas",
    "South Twenty Four Parganas": "South 24 Parganas",
    "South 24 Pargana": "South 24 Parganas",

    # Tamil Nadu
    "The Nilgiris": "Nilgiris",
    "Tuticorin": "Thoothukkudi",
    "Kanyakumari": "Kanniyakumari",
    "Villupuram": "Viluppuram",
    "Tirupathur": "Tirupattur",

    # Karnataka
    "Bangalore": "Bengaluru",
    "Belgaum": "Belagavi",
    "Shimoga": "Shivamogga",
    "Tumkur": "Tumakuru",
    "Mysore": "Mysuru",
    "Hasan": "Hassan",
    "Bellary": "Ballari",

    # Andhra Pradesh / Telangana
    "Cuddapah": "Kadapa",
    "Y. S. R": "Kadapa",
    "K.V.Rangareddy": "Rangareddy",
    "K.v. Rangareddy": "Rangareddy",
    "Rangareddi": "Rangareddy",
}

final_demographic_df["district"] = final_demographic_df["district"].replace(district_fix)


In [10]:
bad_keywords = ["Near", "Road", "Colony", "Cross", "Garden", "Hospital", "University", "Thana", "Dist", "Meera"]

mask_bad = final_demographic_df["district"].str.contains("|".join(bad_keywords), case=False, na=False)
mask_num = final_demographic_df["district"].str.fullmatch(r"\d+", na=False)

final_demographic_df.loc[mask_bad | mask_num, "district"] = np.nan


In [11]:
print("✅ Unique districts:", final_demographic_df["district"].nunique())
print("❌ Missing districts (cleaned to NaN):", final_demographic_df["district"].isna().sum())
print(sorted(final_demographic_df["district"].dropna().unique())[:50])


✅ Unique districts: 917
❌ Missing districts (cleaned to NaN): 7472
['Adilabad', 'Agar Malwa', 'Agra', 'Ahilyanagar', 'Ahmadabad', 'Ahmadnagar', 'Ahmed Nagar', 'Ahmedabad', 'Aizawl', 'Ajmer', 'Akola', 'Alappuzha', 'Aligarh', 'Alipurduar', 'Alirajpur', 'Allahabad', 'Alluri Sitharama Raju', 'Almora', 'Alwar', 'Ambala', 'Ambedkar Nagar', 'Amethi', 'Amravati', 'Amreli', 'Amritsar', 'Amroha', 'Anakapalli', 'Anand', 'Anantapur', 'Ananthapur', 'Ananthapuramu', 'Anantnag', 'Andamans', 'Angul', 'Anjaw', 'Annamayya', 'Anugal', 'Anuppur', 'Araria', 'Ariyalur', 'Arvalli', 'Arwal', 'Ashok Nagar', 'Auraiya', 'Aurangabad', 'Aurangabad(Bh)', 'Ayodhya', 'Azamgarh', 'Badgam', 'Bagalkot']


In [12]:
final_demographic_df.to_csv("Datasets/Clean Datasets/api_data_aadhar_demographic.csv", index=False, encoding="utf-8")