In [None]:
import pandas as pd
import matplotlib.pyplot as plt


# --- Load inputs -----------------------------------------------------------
# Two cached parquet tables (demographic and biometric, per the file names)
# that each carry a "state" column; their rows are counted per state below.
demo = pd.read_parquet("../sql/data/data_cache/demographic_clean.parquet")
bio = pd.read_parquet("../sql/data/data_cache/biometric_clean.parquet")

# Centre counts per state, with columns renamed to the names used downstream.
centres = pd.read_csv("../sql/data/Aadhar.csv").rename(
    columns={"State": "state", "No. of centres": "centres"}
)

# Population per state.  The "population" entry maps the column to itself and
# is kept only so the mapping lists every column used downstream.
population = pd.read_csv("../sql/data/clean_population.csv").rename(
    columns={"state_name": "state", "population": "population"}
)

# --- Count rows per state across both update tables -------------------------
all_update_rows = pd.concat([demo, bio])
updates = (
    all_update_rows
    .groupby("state")
    .size()
    .reset_index(name="total_updates")
)

# --- Force the numeric columns to numbers; unparseable cells become NaN -----
for _frame, _column in (
    (centres, "centres"),
    (updates, "total_updates"),
    (population, "population"),
):
    _frame[_column] = pd.to_numeric(_frame[_column], errors="coerce")


def normalize_state(s):
    """Return state names from *s* in a canonical, join-friendly spelling.

    The following steps are applied to every non-missing value:

    * cast to ``str`` and lower-case;
    * replace ``&`` with the word ``and`` (padded with spaces so that
      ``"a&b"`` and ``"a & b"`` both become ``"a and b"``);
    * collapse runs of whitespace to a single space and trim the ends;
    * drop the suffix-like text ``" islands"``;
    * rewrite two known spelling variants (``"dadra nagar haveli"`` and
      ``"arunanchal pradesh"``) to their canonical forms.

    Args:
        s: pandas Series of state names.

    Returns:
        A Series aligned with *s*. Missing values in *s* stay missing instead
        of turning into the literal text ``"nan"``/``"none"``.
    """
    cleaned = (
        s.astype(str)
        .str.lower()
        # Pad with spaces so an "&" glued to its neighbours still yields
        # separate words; the surplus spaces are collapsed on the next line.
        .str.replace("&", " and ", regex=False)
        .str.replace(r"\s+", " ", regex=True)
        .str.strip()
        .str.replace(" islands", "", regex=False)
        .str.replace("dadra nagar haveli", "dadra and nagar haveli", regex=False)
        .str.replace("arunanchal pradesh", "arunachal pradesh", regex=False)
    )
    # astype(str) stringifies missing values; mask them back to missing so they
    # cannot masquerade as a state called "nan".
    return cleaned.where(s.notna())

# Bring the state names of all three tables to a common spelling so that they
# can be joined on "state".
centres["state"] = normalize_state(centres["state"])
updates["state"] = normalize_state(updates["state"])
population["state"] = normalize_state(population["state"])

# `updates` was counted per *raw* state name.  Spelling variants that normalise
# to the same key would otherwise remain separate rows, duplicating that state
# after the merge and splitting its count.  Collapse them by summing.
updates = updates.groupby("state", as_index=False)["total_updates"].sum()

# Inner joins: only states present in all three tables are kept.
df = centres.merge(updates, on="state", how="inner")
df = df.merge(population, on="state", how="inner")

# Discard rows that cannot yield a meaningful ratio: missing inputs, or a
# zero/negative denominator.
df = df.dropna(subset=["centres", "total_updates", "population"])
df = df[(df["centres"] > 0) & (df["population"] > 0)].copy()

# Derived metrics.
df["updates_per_centre"] = df["total_updates"] / df["centres"]
df["centres_per_100k_population"] = (df["centres"] / df["population"]) * 100000

# Human-readable label for the charts; the merged union territory's name is
# abbreviated.
df["state_display"] = df["state"].str.title()
df["state_display"] = df["state_display"].str.replace(
    "Dadra And Nagar Haveli And Daman And Diu",
    "Dadra & NH and D&D",
    regex=False
)


def _plot_ranked_bars(data, value_col, *, color, xlabel, title, label_format):
    """Show a horizontal bar chart of ``value_col``, largest value on top.

    Args:
        data: DataFrame containing ``value_col`` and a ``state_display``
            column used for the row labels.
        value_col: Name of the numeric column to rank and plot.
        color: Fill colour of the bars.
        xlabel: Label for the x axis.
        title: Chart title.
        label_format: Format spec (e.g. ``".0f"``) for the value printed at
            the end of each bar.

    Returns:
        ``(ranked, fig, ax)`` where ``ranked`` is ``data`` sorted by
        ``value_col`` in descending order.
    """
    ranked = data.sort_values(value_col, ascending=False)
    values = ranked[value_col]
    # Explicit numeric positions keep every bar and its value label on the same
    # row even when two rows share a display name (string categories would
    # merge such rows onto a single position).
    positions = list(range(len(ranked)))

    fig, ax = plt.subplots(figsize=(12, 10))

    ax.barh(
        positions,
        values,
        edgecolor="white",
        linewidth=0.5,
        color=color,
    )
    ax.set_yticks(positions)
    ax.set_yticklabels(ranked["state_display"])

    ax.set_xlabel(xlabel, fontsize=12, fontweight="bold")
    ax.set_title(title, fontsize=14, fontweight="bold", pad=20)

    ax.grid(axis="x", linestyle="--", alpha=0.3)
    ax.set_axisbelow(True)
    ax.spines["top"].set_visible(False)
    ax.spines["right"].set_visible(False)

    # Print each value at the tip of its bar.
    for position, value in zip(positions, values):
        ax.text(value, position, format(value, label_format), va="center", fontsize=9)

    # Position 0 (the largest value) should appear at the top of the chart.
    ax.invert_yaxis()
    plt.tight_layout()
    plt.show()
    return ranked, fig, ax


df_updates, fig, ax = _plot_ranked_bars(
    df,
    "updates_per_centre",
    color="#2E86AB",
    xlabel="Average Updates per Aadhaar Centre",
    title="Aadhaar Updates per Centre by State/UT",
    label_format=".0f",
)

df_centres, fig, ax = _plot_ranked_bars(
    df,
    "centres_per_100k_population",
    color="#A23B72",
    xlabel="Centres per 100,000 Population",
    title="Aadhaar Centre Infrastructure Density by State/UT",
    label_format=".1f",
)


# Print mean / median / min / max (two decimals) for each derived metric.
_summary_sections = (
    ("\n=== UPDATES PER CENTRE STATISTICS ===", "updates_per_centre"),
    ("\n=== CENTRES PER 100,000 POPULATION STATISTICS ===", "centres_per_100k_population"),
)

for _heading, _metric in _summary_sections:
    _series = df[_metric]
    print(_heading)
    for _label, _value in (
        ("Mean", _series.mean()),
        ("Median", _series.median()),
        ("Min", _series.min()),
        ("Max", _series.max()),
    ):
        print(f"{_label}: {_value:.2f}")


FileNotFoundError: [Errno 2] No such file or directory: '../sql/data/State_wise_Total_Updates.xlsx'