In [14]:
import pandas as pd
import glob
import os

DATA_DIR = "data"

# Combine the given CSVs into one compiled frame holding, for every PIN code,
# the sum of the per-age-group count columns.
# A source file is accepted when it contains all columns of one of these groups.
BIO_COLS   = ["bio_age_5_17", "bio_age_17_"]
DEMO_COLS  = ["demo_age_5_17", "demo_age_17_"]
ENROL_COLS = ["age_0_5", "age_5_17", "age_18_greater"]

all_rows = []

# sorted() keeps the processing order (and therefore the log output) stable between runs
for file in sorted(glob.glob(os.path.join(DATA_DIR, "*.csv"))):
    df = pd.read_csv(file)

    # the PIN column is the first one whose name mentions "pin" (case-insensitive)
    pin_col = next((c for c in df.columns if "pin" in str(c).lower()), None)
    if pin_col is None:
        print(f"No pincode column found in {file}")
        continue

    # pick the first column group that is fully present in this file
    count_cols = next(
        (cols for cols in (BIO_COLS, DEMO_COLS, ENROL_COLS)
         if set(cols).issubset(df.columns)),
        None,
    )
    if count_cols is None:
        print(f"Unknown format: {file}")
        continue

    # coerce before summing so a stray non-numeric cell cannot break the row sum
    people = df[count_cols].apply(pd.to_numeric, errors="coerce").sum(axis=1)

    temp = pd.DataFrame({
        "pincode": df[pin_col],
        "people": people
    })

    all_rows.append(temp)

if not all_rows:
    # pd.concat([]) would fail with an opaque "No objects to concatenate"
    raise ValueError(f"No usable CSV files found in {DATA_DIR!r}")

# combining everything
master = pd.concat(all_rows, ignore_index=True)

# cleaning data
# - rows without a PIN code cannot be attributed to any location, so they are dropped
#   (otherwise they would be grouped under the bogus key "000nan")
# - a PIN column parsed as float stringifies as "110001.0"; strip that suffix
#   before left-padding to six digits
master = master.dropna(subset=["pincode"]).assign(
    pincode=lambda d: (
        d["pincode"]
        .astype(str)
        .str.strip()
        .str.replace(r"\.0$", "", regex=True)
        .str.zfill(6)
    ),
    people=lambda d: pd.to_numeric(d["people"], errors="coerce").fillna(0),
)

# final compiled df: one row per PIN code with the summed count
final = master.groupby("pincode", as_index=False)["people"].sum()

print("Compilation complete")
print(final.head())
print(f"Total pincodes: {len(final)}")


Compilation complete
  pincode  people
0  100000     220
1  110001    5592
2  110002   11061
3  110003    9920
4  110004     178
Total pincodes: 19815


In [15]:
import pgeocode
import folium
from folium.plugins import HeatMap

# geocode pincodes
nomi = pgeocode.Nominatim("IN")
geo = nomi.query_postal_code(final["pincode"].tolist())

# The coordinates are attached positionally (.values), so this relies on pgeocode
# returning one row per requested code, in the order requested.
final = final.assign(lat=geo.latitude.values, lon=geo.longitude.values)

# dropping invalid locations; .copy() makes the result an independent frame so
# later column assignments on `final` do not trigger SettingWithCopyWarning
final = final.dropna(subset=["lat", "lon"]).copy()

# heatmap data: [lat, lon, weight]
# NOTE(review): the weight is the raw people count. The underlying heat layer may
# clamp intensities at a small maximum, in which case scaling the weights into
# [0, 1] would give a more informative map. Confirm against the folium version in use.
heat_data = final[["lat", "lon", "people"]].values.tolist()

# creating india map
m = folium.Map(location=[22.0, 80.0], zoom_start=5)

HeatMap(
    heat_data,
    radius=14,
    blur=20,
    max_zoom=8
).add_to(m)

# creating output directory if it doesn't exist (exist_ok avoids the check-then-create race)
os.makedirs("output", exist_ok=True)

m.save("output/india_pincode_heatmap.html")

print("Heatmap saved to output/india_pincode_heatmap.html")

Heatmap saved to output/india_pincode_heatmap.html


We collect data on Aadhaar centres from the UIDAI website. We fetch only a small subset of PIN codes so as not to overload the official website with needless requests.

In [17]:
import requests
import pandas as pd
import time
import os
import sys

# Endpoint queried once per PIN code (sent as the `sno` query parameter); the JSON
# reply is read for its "centerCount" field.
API_URL = "https://bhuvan-app3.nrsc.gov.in/aadhaar/usrtask/app_specific/get/getpinDetails.php"

# CSV that accumulates (pincode, center_count) rows; it is also read back at start-up
# so an interrupted run can resume.
OUTPUT_FILE = "pincode_center_counts_subset.csv"

# Pause between requests (passed to time.sleep, i.e. seconds).
SLEEP_TIME = 0.35
# Number of buffered results that triggers a write of OUTPUT_FILE.
SAVE_EVERY = 100

def print_progress(current, total, start_time):
    """Write a one-line text progress bar with an ETA to stdout.

    The line starts with a carriage return and has no trailing newline, so
    successive calls overwrite one another in the terminal/notebook output.

    Args:
        current: Number of items handled so far.
        total: Total number of items. A value <= 0 is rendered as complete
            instead of raising ZeroDivisionError.
        start_time: ``time.time()`` timestamp taken when processing started;
            used to extrapolate the remaining time.
    """
    bar_length = 30

    # Clamp to [0, 1] so an empty workload or an overshoot cannot divide by zero
    # or produce a bar longer than bar_length.
    if total > 0:
        progress = min(max(current / total, 0.0), 1.0)
    else:
        progress = 1.0

    filled = int(bar_length * progress)
    bar = "█" * filled + "-" * (bar_length - filled)

    # ETA = average seconds per processed item * items still outstanding
    elapsed = time.time() - start_time
    rate = elapsed / current if current > 0 else 0
    remaining = max(rate * (total - current), 0)

    mins, secs = divmod(int(remaining), 60)
    hrs, mins = divmod(mins, 60)

    sys.stdout.write(
        f"\r[{bar}] {progress*100:5.1f}% "
        f"| {current}/{total} "
        f"| ETA: {hrs:02d}:{mins:02d}:{secs:02d}"
    )
    sys.stdout.flush()

# ensuring correct dtypes; assign() rebinds `final` instead of writing into a frame
# that may be a filtered slice of an earlier one
final = final.assign(
    pincode=final["pincode"].astype(str),
    people=pd.to_numeric(final["people"], errors="coerce"),
)

# sorting by population descending
final_sorted = final.sort_values(
    by="people",
    ascending=False,
    na_position="last"
)

# selecting aadhar centers of interest: the 200 most populous PIN codes plus
# the 200 PIN codes at ranks 10000-10199
subset_10000_10200 = final_sorted.iloc[10000:10200]
top_200 = final_sorted.head(200)

# combining the df
final_subset = pd.concat([top_200, subset_10000_10200], ignore_index=True)

pincodes = final_subset["pincode"].unique()
total_pins = len(pincodes)

print("Total PINs to fetch:", total_pins)

RESULT_COLUMNS = ["pincode", "center_count"]

# resume support
if os.path.exists(OUTPUT_FILE):
    # dtype=str keeps PIN codes exactly as written; letting pandas infer integers
    # would drop leading zeros and those PINs would never match `done_pins`
    df_done = pd.read_csv(OUTPUT_FILE, dtype={"pincode": str})
    # rows without a count are failed fetches from an earlier run: drop them so
    # they are retried instead of being treated as finished
    df_done = df_done.dropna(subset=["center_count"]).reset_index(drop=True)
else:
    df_done = pd.DataFrame(columns=RESULT_COLUMNS)
done_pins = set(df_done["pincode"])

# count only PINs of the current subset, so the numbers stay correct even when the
# output file contains PINs from a different selection
remaining_pins = [pin for pin in pincodes if pin not in done_pins]

print("Already processed:", total_pins - len(remaining_pins))
print("Remaining:", len(remaining_pins))


def _save_results(done, buffered):
    """Append the buffered result dicts to `done`, write OUTPUT_FILE, return the new frame."""
    new_rows = pd.DataFrame(buffered, columns=RESULT_COLUMNS)
    merged = new_rows if done.empty else pd.concat([done, new_rows], ignore_index=True)
    merged.to_csv(OUTPUT_FILE, index=False)
    return merged


results_buffer = []
failed_pins = []  # (pincode, error) pairs, reported after the loop
processed = total_pins - len(remaining_pins)
start_time = time.time()

try:
    for pin in remaining_pins:
        try:
            params = {
                "sno": pin,
                "str": ""
            }

            r = requests.get(API_URL, params=params, timeout=10)
            r.raise_for_status()
            data = r.json()

            center_count = data.get("centerCount", 0)

        except Exception as exc:
            # best effort: record the failure and carry on with the next PIN
            center_count = None
            failed_pins.append((pin, repr(exc)))

        results_buffer.append({
            "pincode": pin,
            "center_count": center_count
        })

        # throttle after failures too, so an erroring server is not hammered
        time.sleep(SLEEP_TIME)

        processed += 1
        print_progress(processed, total_pins, start_time)

        if len(results_buffer) >= SAVE_EVERY:
            df_done = _save_results(df_done, results_buffer)
            results_buffer.clear()
finally:
    # final save; in a `finally` so an interrupted run keeps what it already fetched
    if results_buffer:
        df_done = _save_results(df_done, results_buffer)

print("\n✅ Finished fetching center counts for subset")
if failed_pins:
    print(
        f"{len(failed_pins)} PIN(s) failed and will be retried on the next run; "
        f"first errors: {failed_pins[:3]}"
    )


Total PINs to fetch: 400
Already processed: 400
Remaining: 0

✅ Finished fetching center counts for subset


In [18]:
# Rank every PIN code by its aggregated count, largest first, and show the leaders.
final_sorted_by_people = final.sort_values("people", ascending=False)
print(final_sorted_by_people.head(5))

     pincode  people        lat        lon
92    110094  190580  28.663867  77.227767
2694  244001  189541  28.864100  78.826653
58    110059  183169  28.655300  77.065700
1648  202001  152776  27.916500  78.064556
2868  247001  147476  29.949038  77.544543
