# Aadhaar Infrastructure Predictive Analytics

This notebook runs the predictive analytics pipeline for Aadhaar infrastructure planning. 
It forecasts enrolment and update demand and provides infrastructure recommendations.

In [None]:
# Install dependencies if running in a new Colab environment
!pip install -r requirements.txt

## 1. Run Data Pipeline
Ingest the daily CSVs, aggregate them to a monthly level, and build the master dataset, which is saved to `output/master_dataset_monthly.csv`.

In [None]:
import os

from src.pipeline import build_mid_master_df

# All input and output paths in this notebook are resolved from the current
# working directory.
base_dir = os.getcwd()

# Build the master dataset (per the section text above: daily CSVs ingested
# and aggregated to a monthly level).
master_df = build_mid_master_df(base_dir)

# DataFrame.to_csv does not create missing parent directories, so make sure
# the output folder exists before the first write (fresh checkout / Colab).
os.makedirs(os.path.join(base_dir, "output"), exist_ok=True)
master_df.to_csv(os.path.join(base_dir, "output", "master_dataset_monthly.csv"), index=False)
print("Master Dataset created.")

# Preview the first rows as the cell output.
master_df.head()

## 2. Generate Forecasts
Forecast demand for Enrolment, Biometric Updates, and Demographic Updates for the next 6 months.

In [None]:
# pandas is not referenced directly in this cell; the import is kept in case
# later cells rely on `pd` being in scope.
import pandas as pd
from src.forecast import generate_forecasts

# Produce the forecasts from the master dataset built in the previous step.
forecast_df = generate_forecasts(master_df)

# Save the forecasts as CSV under the output folder (no index column).
forecast_csv_path = os.path.join(base_dir, "output", "district_forecasts.csv")
forecast_df.to_csv(forecast_csv_path, index=False)

print("Forecasts generated.")

# Preview the first rows as the cell output.
forecast_df.head()

## 3. Infrastructure Recommendations
Calculate stress scores and recommend the number of Enrolment/Update kits required per district.

In [None]:
from src.analytics import calculate_derived_indicators, generate_recommendations

# Forecast columns that are averaged per (state, district) over all forecast
# rows, giving one mean-demand row per district.
forecast_columns = [
    'Forecast_Enrolment',
    'Forecast_Bio_Updates',
    'Forecast_Demo_Updates',
]
district_groups = forecast_df.groupby(['state', 'district'])
avg_fc_df = district_groups[forecast_columns].mean().reset_index()

# Derive the indicators from the averaged forecasts, then turn them into
# the infrastructure plan.
indicators_df = calculate_derived_indicators(avg_fc_df)
plan_df = generate_recommendations(indicators_df)

# Save the plan as CSV under the output folder (no index column).
plan_csv_path = os.path.join(base_dir, "output", "infrastructure_plan.csv")
plan_df.to_csv(plan_csv_path, index=False)
print("Infrastructure Plan Generated.")

# Show the key recommendation columns for the first 10 rows.
preview_columns = ['state', 'district', 'Rec_Enrolment_Kits', 'Rec_Update_Kits', 'Zone_Category']
plan_df[preview_columns].head(10)

## 4. Visualizations
Visualize high-demand zones: a bar chart of the 20 districts with the highest Update Stress Score, coloured by state.

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Keep the 20 rows with the highest Update_Stress_Score, highest first.
top_stress = plan_df.sort_values(by='Update_Stress_Score', ascending=False).head(20)

# Horizontal bar chart drawn on an explicit Axes: one bar per district,
# coloured by state (dodge=False keeps bars at full width, not split by hue).
fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(
    data=top_stress,
    x='Update_Stress_Score',
    y='district',
    hue='state',
    dodge=False,
    ax=ax,
)
ax.set_title('Top 20 Districts by Biometric Update Stress')
ax.set_xlabel('Stress Score (0-100)')
plt.show()