In [None]:
# Age-wise distribution of Aadhaar enrolments, built from the CLEANED
# enrolment file: one bar per age bucket, summed over all retained rows.
import matplotlib.pyplot as plt
import pandas as pd

df_age = pd.read_csv("../data/processed/cleaned/enrolment_clean.csv")

# Drop rows whose location fields carry the "<unset>" placeholder.
# NOTE(review): if `pincode` is read as a numeric column, comparing it with a
# string is True for every row and that clause filters nothing - confirm dtype.
df_age = df_age[
    (df_age["state"] != "<unset>") &
    (df_age["district"] != "<unset>") &
    (df_age["pincode"] != "<unset>")
]

# Column totals for the three age buckets, relabelled for display.
age_totals = df_age[["age_0_5", "age_5_17", "age_17_plus"]].sum()
# The range labels previously contained a mis-encoded en dash that rendered as
# garbage on the axis; named escapes keep the source ASCII-only so the
# character cannot be mangled again.
age_totals.index = ["0\N{EN DASH}5 years", "5\N{EN DASH}17 years", "17+ years"]

plt.figure(figsize=(8, 5))
age_totals.plot(kind="bar")

# The embedded newlines are used as vertical padding around the text.
plt.title("\nAge-wise Distribution of Aadhaar Enrolments\n")
plt.xlabel("\nAge Group\n")
plt.ylabel("Total Enrolments\n")
plt.xticks(rotation=0)
plt.tight_layout()
plt.show()



In [None]:
# Daily enrolment totals across all rows, drawn as an interactive Plotly line.
import pandas as pd
import plotly.express as px

df = pd.read_csv("../data/processed/analysis/enrolment_analysis.csv").assign(
    date=lambda frame: pd.to_datetime(frame["date"])
)

# One summed value per date (a Series indexed by date).
daily_enrolment = df.groupby("date")["total_enrolment"].sum()

# Series -> two-column DataFrame ("date", "total_enrolment") for px.line.
daily_enrolment_df = daily_enrolment.rename_axis("date").reset_index(
    name="total_enrolment"
)

fig = px.line(
    daily_enrolment_df,
    title="Total Aadhaar Enrolment Over Time",
    x="date",
    y="total_enrolment",
)
fig.show()


In [None]:
# Horizontal bar chart of the ten states with the largest summed enrolment.
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

sns.set_theme(style="whitegrid")

df = pd.read_csv("../data/processed/analysis/enrolment_analysis.csv")

# Sum per state, largest first, with `state` restored as a regular column.
# NOTE(review): state names are used as-is here; later cells strip/title-case
# them before grouping - confirm no spelling variants split a state's total.
state_enrolment = (
    df.groupby("state")["total_enrolment"]
    .agg("sum")
    .sort_values(ascending=False)
    .reset_index()
)

# The frame is already sorted, so the first ten rows are the top ten.
top_states = state_enrolment.iloc[:10]

plt.figure(figsize=(10, 5))
sns.barplot(x="total_enrolment", y="state", data=top_states)

plt.gca().set(
    title="Top 10 States by Total Aadhaar Enrolment",
    xlabel="Total Enrolments",
    ylabel="State",
)
plt.show()


In [None]:
# Reload the enrolment analysis table and keep only one state's rows; the
# heatmap cell that follows reads `state_df` and `TOP_STATE`.
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

sns.set_theme(style="white")

df = pd.read_csv("../data/processed/analysis/enrolment_analysis.csv")

# Hard-coded state of interest.
# NOTE(review): described as the highest-enrolment state, but it is not derived
# from the data here - confirm against the top-states chart.
TOP_STATE = "Uttar Pradesh"
state_df = df.loc[df["state"].eq(TOP_STATE)]


In [None]:
# District x date heatmap of enrolment for the state selected in `state_df`.

# Total enrolment per (district, date) pair.
district_time = (
    state_df
    .groupby(["district", "date"])["total_enrolment"]
    .sum()
    .reset_index()
)

# Wide layout: one row per district, one column per date. Pairs with no rows
# in `district_time` become NaN in the pivoted frame.
# NOTE(review): `date` is whatever dtype the preceding load produced; if it is
# still text, the column order is lexicographic - confirm the format sorts
# chronologically.
heatmap_data = district_time.pivot(
    index="district",
    columns="date",
    values="total_enrolment"
)

plt.figure(figsize=(12, 8))
sns.heatmap(
    heatmap_data,
    cmap="YlOrRd",
    linewidths=0.1
)

# The title separator was a mis-encoded em dash that rendered as garbage;
# a named escape keeps the source ASCII-only.
plt.title(f"District-wise Aadhaar Enrolment Heatmap \N{EM DASH} {TOP_STATE}")
plt.xlabel("Date")
plt.ylabel("District")
plt.show()


In [None]:
# Load the updates analysis table. This rebinds `df`, which the cells above
# used for the enrolment table, so cells below this point see updates data.
df = pd.read_csv("../data/processed/analysis/updates_analysis.csv")


In [None]:
# Bar chart of total updates per update type.
update_counts = df.groupby("update_type", as_index=False)["total_updates"].sum()

sns.barplot(x="update_type", y="total_updates", data=update_counts)
plt.gca().set(
    title="Distribution of Aadhaar Update Types",
    xlabel="Update Type",
    ylabel="Total Updates",
)
plt.show()


In [None]:
# Ten states with the largest total update volume.
#
# State names are normalised (whitespace stripped, title-cased) BEFORE the
# aggregation. Title-casing only after the groupby left case/whitespace
# variants of one state as separate groups, which could then appear as
# duplicate labels in the top-10 chart. `assign` leaves `df` itself unchanged.
state_updates = (
    df.assign(state=df["state"].str.strip().str.title())
    .groupby("state")["total_updates"]
    .sum()
    .sort_values(ascending=False)
    .head(10)
    .reset_index()
)

sns.barplot(data=state_updates, x="total_updates", y="state")
plt.title("Top States by Aadhaar Update Volume")
plt.xlabel("Total Updates")
plt.ylabel("State")
plt.show()


In [None]:
# Load the enrolment and updates analysis tables under explicit names
# (`enrol_df`, `upd_df`) so the cells below do not depend on whatever `df`
# currently points to.
enrol_df = pd.read_csv("../data/processed/analysis/enrolment_analysis.csv")
upd_df   = pd.read_csv("../data/processed/analysis/updates_analysis.csv")



In [None]:
# Normalize state names in both datasets so they can be matched on `state`:
# cast to str, strip surrounding whitespace, title-case. The columns are
# overwritten in place; re-running the cell gives the same result.
# NOTE(review): `astype(str)` turns a missing state into the text "nan"
# ("Nan" after title-casing), which would then form its own group - confirm
# there are no missing states.
enrol_df["state"] = enrol_df["state"].astype(str).str.strip().str.title()
upd_df["state"]   = upd_df["state"].astype(str).str.strip().str.title()


In [None]:
# Per-state totals from each dataset (Series indexed by state name).
enrol_state = enrol_df.groupby("state")["total_enrolment"].sum()
update_state = upd_df.groupby("state")["total_updates"].sum()

# States present in both datasets - a sanity check on the name normalisation.
common_states = enrol_state.index.intersection(update_state.index)

print("Common states:", len(common_states))
# Last expression: the notebook displays the first ten common state names.
common_states[:10]



In [None]:
# Ten states with the highest updates-per-enrolment ratio.
enrol_state = enrol_df.groupby("state")["total_enrolment"].sum()
update_state = upd_df.groupby("state")["total_updates"].sum()

# Division aligns the two Series on state name; a state missing from either
# side yields NaN and is dropped. Zero-enrolment states are masked to NaN in
# the denominator first: otherwise the division produces `inf`, which
# `dropna()` does not remove, and such a state would sort to the top.
ratio_df = (
    (update_state / enrol_state.where(enrol_state > 0))
    .dropna()
    .reset_index()
)
ratio_df.columns = ["state", "update_enrolment_ratio"]

top_ratio = ratio_df.sort_values(
    "update_enrolment_ratio", ascending=False
).head(10)

sns.barplot(
    data=top_ratio,
    x="update_enrolment_ratio",
    y="state"
)

plt.title("States with High Update-to-Enrolment Ratio\n")
plt.xlabel("Update / Enrolment Ratio")
plt.ylabel("State")
plt.show()


In [None]:
# Build the per-state update/enrolment ratio table and a filtered variant that
# excludes the lowest-enrolment quartile.

# Normalize state names (idempotent, so safe even if already done).
enrol_df["state"] = enrol_df["state"].astype(str).str.strip().str.title()
upd_df["state"]   = upd_df["state"].astype(str).str.strip().str.title()

# Per-state totals.
enrol_state = enrol_df.groupby("state")["total_enrolment"].sum()
update_state = upd_df.groupby("state")["total_updates"].sum()

# One chained expression, so the ratio column is never assigned onto a
# boolean-filtered slice (which can raise pandas' chained-assignment warning):
#   1. inner-join the two totals on the state index,
#   2. keep only states with non-zero enrolment (guards against division by
#      zero; small-but-positive totals are handled by the quantile filter),
#   3. compute the ratio,
#   4. restore `state` as a regular column.
ratio_df = (
    enrol_state.to_frame("total_enrolment")
    .merge(
        update_state.to_frame("total_updates"),
        left_index=True,
        right_index=True,
        how="inner",
    )
    .loc[lambda totals: totals["total_enrolment"] > 0]
    .assign(
        update_enrolment_ratio=lambda totals: (
            totals["total_updates"] / totals["total_enrolment"]
        )
    )
    .reset_index()
)

# Drop states at or below the 25th percentile of total enrolment so that
# small-volume states do not dominate the ratio ranking.
ratio_df_filtered = ratio_df[
    ratio_df["total_enrolment"] >
    ratio_df["total_enrolment"].quantile(0.25)
]


In [None]:
# Order the filtered states by ratio, smallest first, for plotting.
# NOTE: the following cell repeats this exact statement before drawing, so
# this cell is redundant when the notebook is run top to bottom.
plot_df = ratio_df_filtered.sort_values(
    "update_enrolment_ratio", ascending=True
)


In [None]:
# Dot plot of the update/enrolment ratio for the states that survived the
# low-volume filter, ordered by ratio.
import matplotlib.pyplot as plt
import seaborn as sns

# Ascending sort (the default direction).
plot_df = ratio_df_filtered.sort_values(by="update_enrolment_ratio")

plt.figure(figsize=(10, 6))

sns.scatterplot(
    x="update_enrolment_ratio",
    y="state",
    data=plot_df,
    color="#4C72B0",
    s=120,
)

plt.title(
    "States with High Update-to-Enrolment Ratio\n(Proxy Indicator of Operational Difficulty)",
    fontsize=13,
)
plt.gca().set(xlabel="Update / Enrolment Ratio", ylabel="State")

# Light vertical guides only, to help read values off the x-axis.
plt.grid(axis="x", linestyle="--", alpha=0.4)
plt.tight_layout()
plt.show()


In [None]:
# Reload both analysis tables and build date-sorted working copies
# (`enrol`, `updates`) for the time-series / anomaly cells below.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

enrol_df = pd.read_csv("../data/processed/analysis/enrolment_analysis.csv")
upd_df   = pd.read_csv("../data/processed/analysis/updates_analysis.csv")

# Parse enrolment dates up front. Leaving them as text made the first anomaly
# plot treat every date as a category and made the sort lexicographic rather
# than chronological. The format matches the explicit conversion this notebook
# applies to `enrol["date"]` further down; `enrol_df` itself is left untouched.
enrol = (
    enrol_df
    .assign(date=pd.to_datetime(enrol_df["date"], format="%Y-%m-%d"))
    .sort_values("date")
)

# NOTE(review): nothing visible confirms the updates date format, so these
# dates are left as loaded and sorted as-is - parse them once it is confirmed.
updates = upd_df.sort_values("date")


In [None]:
# Row-level enrolment total: the three age buckets added together, with a
# missing bucket counted as zero.
enrol["age_based_total"] = (
    enrol[["age_0_5", "age_5_17", "age_17_plus"]]
    .fillna(0)
    .sum(axis=1)
)


In [None]:
# Daily totals with a rolling 3-sigma flag, plotted with flagged days in red.
daily_enrol = enrol.groupby("date", as_index=False)["age_based_total"].sum()

window = 14

# Rolling statistics over the trailing `window` rows; the first window-1 rows
# have no value, so they can never be flagged.
_rolling_total = daily_enrol["age_based_total"].rolling(window)
daily_enrol["mean"] = _rolling_total.mean()
daily_enrol["std"] = _rolling_total.std()

# Flag days further than three rolling standard deviations from the rolling mean.
# NOTE(review): the window includes the day being tested, and within a 14-point
# sample no point can lie more than 13/sqrt(14) (about 3.47) sample standard
# deviations from the mean, so this rule can only fire on extreme days -
# confirm that is intended.
daily_enrol["anomaly"] = (
    (daily_enrol["age_based_total"] - daily_enrol["mean"])
    .abs()
    .gt(3 * daily_enrol["std"])
)

plt.figure(figsize=(12, 6))
plt.plot(
    daily_enrol["date"],
    daily_enrol["age_based_total"],
    label="Enrollment (Age-wise)",
)
plt.scatter(
    daily_enrol.loc[daily_enrol["anomaly"], "date"],
    daily_enrol.loc[daily_enrol["anomaly"], "age_based_total"],
    color="red",
)

plt.gca().set(
    title="Sudden Spikes & Drops in Enrollment (Age-wise Sum)",
    xlabel="Date",
    ylabel="Total Enrollment",
)
plt.legend()
plt.tight_layout()
plt.show()


In [None]:
# Convert `enrol["date"]` to datetimes using an explicit YYYY-MM-DD format, so
# later cells can use date-aware axes and the `.dt` accessor.
enrol["date"] = pd.to_datetime(enrol["date"], format="%Y-%m-%d")

In [None]:
# Rebuild the per-date totals from `enrol`. This rebinds `daily_enrol`, so the
# `mean` / `std` / `anomaly` columns added by the earlier anomaly cell are
# discarded and recomputed further down.
daily_enrol = (
    enrol.groupby("date", as_index=False)["age_based_total"]
         .sum()
)


In [None]:
# Daily enrolment line with one tick per month and a fixed 2025 viewing window.
import matplotlib.dates as mdates
import matplotlib.pyplot as plt

plt.figure(figsize=(12, 6))
plt.plot(daily_enrol["date"], daily_enrol["age_based_total"])

_ax = plt.gca()
# Monthly major ticks, labelled like "Mar 2025".
_ax.xaxis.set_major_locator(mdates.MonthLocator())
_ax.xaxis.set_major_formatter(mdates.DateFormatter("%b %Y"))

# Hard-coded window; data outside March-December 2025 is not shown.
_ax.set_xlim(pd.Timestamp("2025-03-01"), pd.Timestamp("2025-12-31"))

plt.xlabel("Month (2025)")
plt.ylabel("Total Enrolment")
plt.title("Sudden Spikes & Drops in Enrollment")
plt.tight_layout()
plt.show()



In [None]:
# Inspection only: display the current column names of `daily_enrol`.
daily_enrol.columns


In [None]:
# Rolling mean and standard deviation of the daily total over `window` rows.
window = 14

_rolling_total = daily_enrol["age_based_total"].rolling(window)
daily_enrol["mean"] = _rolling_total.mean()
daily_enrol["std"] = _rolling_total.std()


In [None]:
# Drop the leading rows that have no rolling mean/std yet.
# NOTE: the next cell repeats this with `.copy()`; running both is harmless.
daily_enrol = daily_enrol.dropna(subset=["mean", "std"])


In [None]:
# Same filter as the previous cell, but with an explicit copy so that the
# column assignment in the next cell writes to an independent frame.
daily_enrol = daily_enrol.dropna(subset=["mean", "std"]).copy()


In [None]:
# Flag days whose total is more than two rolling standard deviations away from
# the rolling mean (a looser threshold than the 3-sigma rule used earlier).
_deviation = (daily_enrol["age_based_total"] - daily_enrol["mean"]).abs()
daily_enrol["anomaly"] = _deviation.gt(2 * daily_enrol["std"])



In [None]:
# Inspection only: how many days were flagged (True) versus not (False).
daily_enrol["anomaly"].value_counts()


In [None]:
# Dates of the flagged days (a Series of `date` values).
anomaly_dates = daily_enrol.loc[daily_enrol["anomaly"], "date"]

In [None]:
# All row-level enrolment records that fall on a flagged date. This keeps every
# district that has any record on those dates, not only the districts that
# caused the spike or drop.
enrol_anomaly_days = enrol[enrol["date"].isin(anomaly_dates)]


In [None]:
# Inspection only: number of distinct flagged dates present in the records.
enrol_anomaly_days["date"].nunique()


In [None]:
# Total enrolment per district summed over the flagged dates, largest first.
district_contrib = (
    enrol_anomaly_days
    .groupby("district", as_index=False)["age_based_total"]
    .sum()
    .sort_values(by="age_based_total", ascending=False)
)

In [None]:
# Inspection only: plain-text view of the ten largest contributors.
print(district_contrib.head(10))


In [None]:
# `district_contrib` is already sorted descending, so these are the ten
# districts with the most enrolment on flagged dates.
top10 = district_contrib.head(10)


In [None]:
# Horizontal bars for the top-10 districts, each annotated with its value.
plt.figure(figsize=(10, 6))
bars = plt.barh(top10["district"], top10["age_based_total"], color="darkblue")
# Put the first (largest) district at the top.
plt.gca().invert_yaxis()

# Shared text styling for the in-bar labels.
_label_style = dict(
    va="center",
    ha="right",
    color="white",
    fontsize=10,
    fontweight="bold",
)

# Add labels inside bars: right-aligned just short of each bar's end and
# vertically centred on the bar.
for bar in bars:
    width = bar.get_width()
    plt.text(
        width * 0.98,
        bar.get_y() + bar.get_height() / 2,
        f"{int(width):,} Enrolments",
        **_label_style,
    )

plt.title("Top 10 Districts Driving Enrollment Anomalies")
plt.xlabel("Enrollment Volume on Anomaly Days")
plt.tight_layout()
plt.show()


In [None]:
# Per-district anomaly detection: fit one Isolation Forest on each district's
# daily totals and collect the days it flags.
from sklearn.ensemble import IsolationForest
import pandas as pd

district_results = []

for district, df_d in enrol.groupby("district"):

    # Daily totals for this district, in date order.
    daily_d = (
        df_d.groupby("date", as_index=False)["age_based_total"]
        .sum()
        .sort_values("date")
    )

    # Districts with fewer than 30 daily rows are skipped.
    if len(daily_d) < 30:
        continue

    # Two features: the day's volume and its weekday (needs a datetime `date`).
    daily_d["day_of_week"] = daily_d["date"].dt.weekday
    X = daily_d[["age_based_total", "day_of_week"]]

    # Fixed seed for repeatable results. The contamination value was chosen to
    # align with the notebook's observed ~6-8% anomaly rate.
    iso = IsolationForest(
        contamination=0.08,
        n_estimators=200,
        random_state=42,
    )

    # fit_predict labels outliers as -1.
    daily_d["ml_anomaly"] = iso.fit_predict(X) == -1
    daily_d["district"] = district

    # Only the flagged days are kept for this district.
    district_results.append(daily_d.loc[daily_d["ml_anomaly"]])


In [None]:
# Stack the per-district flagged days into one frame with a fresh index.
# NOTE(review): `pd.concat` raises on an empty list, so this fails if every
# district was skipped by the minimum-rows check in the loop above.
ml_district_anomalies = pd.concat(district_results, ignore_index=True)


In [None]:
# Districts flagged by ML
ml_districts = set(ml_district_anomalies["district"])

# Districts active during statistical anomaly days
stats_districts = set(
    enrol_anomaly_days["district"]
)

# Overlap
overlap_districts = ml_districts.intersection(stats_districts)

print("ML districts:", len(ml_districts))
print("Statistical anomaly districts:", len(stats_districts))
print("Overlap:", len(overlap_districts))


In [None]:
# Cumulative count of distinct districts seen on statistically flagged dates.

# Unique (date, district) pairs, in date order.
stats_daily = (
    enrol_anomaly_days.loc[:, ["date", "district"]]
    .drop_duplicates()
    .sort_values("date")
)

# One row per date holding the array of districts active on that date.
stats_cumulative = (
    stats_daily
    .groupby("date")["district"]
    .unique()
    .reset_index()
)

# Walk the dates in order, growing the set of districts seen so far and
# recording its size after each date.
seen = set()
cum_counts = []

for districts in stats_cumulative["district"]:
    seen.update(districts)
    cum_counts.append(len(seen))

stats_cumulative["cum_unique_districts"] = cum_counts


In [None]:
# Cumulative count of distinct districts flagged by the Isolation Forest.

# Unique (date, district) pairs, in date order.
ml_daily = (
    ml_district_anomalies.loc[:, ["date", "district"]]
    .drop_duplicates()
    .sort_values("date")
)

# One row per date holding the array of districts flagged on that date.
ml_cumulative = (
    ml_daily
    .groupby("date")["district"]
    .unique()
    .reset_index()
)

# Walk the dates in order, growing the set of districts seen so far and
# recording its size after each date.
seen = set()
cum_counts = []

for districts in ml_cumulative["district"]:
    seen.update(districts)
    cum_counts.append(len(seen))

ml_cumulative["cum_unique_districts"] = cum_counts


In [None]:
# Overlay the two cumulative-coverage curves (statistical rule vs ML).
plt.figure(figsize=(10, 6))

# Same drawing call for both curves; only the source frame and label differ.
for _frame, _label in (
    (stats_cumulative, "Statistical Detection"),
    (ml_cumulative, "ML Detection (Isolation Forest)"),
):
    plt.plot(
        _frame["date"],
        _frame["cum_unique_districts"],
        label=_label,
        linewidth=2,
    )

plt.gca().set(
    xlabel="Date",
    ylabel="Cumulative Unique Districts",
    title="Cumulative District Coverage: Statistical vs ML Detection",
)
plt.legend()
plt.tight_layout()
plt.show()


In [None]:
# Top-10 PIN codes by mean enrolment demand within one district.
import pandas as pd
import matplotlib.pyplot as plt

# Load cleaned data (PIN code level).
df = pd.read_csv("../data/processed/cleaned/enrolment_clean.csv")

# District under study (hard-coded example).
# NOTE(review): the filter below matches on district name only; if the same
# name occurs in more than one state the rows are mixed together - confirm, and
# filter on state as well if needed.
DISTRICT = "Aurangabad"

# Row-level demand = sum of the three age buckets. A missing bucket counts as
# zero, consistent with how `age_based_total` is built elsewhere in this
# notebook; a plain `+` would turn the whole row's total into NaN.
df["total_demand"] = (
    df["age_0_5"].fillna(0)
    + df["age_5_17"].fillna(0)
    + df["age_17_plus"].fillna(0)
)

# Rows for the chosen district.
district_df = df[df["district"] == DISTRICT]

# Mean demand per PIN code, ten largest.
# NOTE(review): this is the mean over each PIN code's rows; it equals a daily
# average only if the file has one row per PIN code per day - confirm before
# relying on the axis label.
pincode_trend = (
    district_df
    .groupby("pincode", as_index=False)["total_demand"]
    .mean()
    .sort_values("total_demand", ascending=False)
    .head(10)
)

plt.figure(figsize=(6, 9))

# PIN codes are cast to str so they are drawn as category labels rather than
# positioned on a numeric axis.
plt.barh(
    pincode_trend["pincode"].astype(str),
    pincode_trend["total_demand"]
)

plt.xlabel("Average Daily Enrolments")
plt.ylabel("PIN Code")
plt.title(
    f"Top 10 PIN Codes Driving Aadhaar Demand\n({DISTRICT})"
)

# Largest PIN code at the top.
plt.gca().invert_yaxis()
plt.grid(axis="x", alpha=0.3)
plt.tight_layout()
plt.show()
