# UIDAI National Hackathon: Strategic Insights & Advanced Analytics
## Author: Senior Data Scientist, UIDAI

This notebook consolidates the entire analytical pipeline for the Aadhaar enrolment and update datasets (March - December 2025). It includes data cleaning, multi-layer EDA, clustering analysis, and the Service Stress Index (SSI) along with advanced visualizations.

### 1. Setup & Environment Initialisation

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import glob
import os
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

# Plot styling shared by every figure produced below
plt.style.use('bmh')
plt.rcParams.update({
    'figure.figsize': (12, 8),
    'font.size': 11,
})

# All paths are resolved relative to the project root (GitHub portability)
base_dir = '.'
visuals_dir = os.path.join(base_dir, 'visuals')
# exist_ok=True makes re-running this cell harmless when the folder already exists
os.makedirs(visuals_dir, exist_ok=True)

print('Environment Ready.')

ModuleNotFoundError: No module named 'pandas'

### 2. Data Consolidation & Cleaning
Merging the CSV chunks of each category (enrolment, biometric updates, demographic updates) into one DataFrame per category, parsing the dates and normalising state and district names.

In [None]:
def load_and_clean(category_folder, date_format='%d-%m-%Y'):
    """Load every CSV chunk of one dataset category into a single cleaned DataFrame.

    Args:
        category_folder: Folder name, relative to ``base_dir``, that holds the
            ``*.csv`` chunks of one category.
        date_format: ``strftime``-style pattern used to parse the ``date`` column.

    Returns:
        A DataFrame with all chunks concatenated under a fresh 0..n-1 index,
        ``date`` parsed to datetime, and ``state`` / ``district`` stripped of
        surrounding whitespace and upper-cased.

    Raises:
        FileNotFoundError: If the folder contains no ``*.csv`` files.
    """
    folder_path = os.path.join(base_dir, category_folder)
    # glob yields files in arbitrary (filesystem) order; sorting makes the
    # row order of the concatenated frame reproducible across machines.
    files = sorted(glob.glob(os.path.join(folder_path, '*.csv')))
    if not files:
        # Without this guard pd.concat fails with an opaque
        # "No objects to concatenate" that hides the real cause.
        raise FileNotFoundError(f'No CSV files found in {folder_path!r}')

    df = pd.concat((pd.read_csv(f) for f in files), ignore_index=True)
    df['date'] = pd.to_datetime(df['date'], format=date_format)
    # Normalise the text keys so spelling/case variants group together later
    df['state'] = df['state'].str.strip().str.upper()
    df['district'] = df['district'].str.strip().str.upper()
    return df

# Load the three dataset categories, one folder each, in a fixed order
en_df, bio_df, demo_df = (
    load_and_clean(folder)
    for folder in (
        'api_data_aadhar_enrolment (2)',
        'api_data_aadhar_biometric',
        'api_data_aadhar_demographic',
    )
)

# Row counts per category as a quick sanity check on the load
print(f'Enrolment Records: {len(en_df)}')
print(f'Biometric Records: {len(bio_df)}')
print(f'Demographic Records: {len(demo_df)}')

### 3. Feature Engineering & Multi-Dimensional Aggregation
Creating a master daily state-level summary and derived metrics such as total updates (biometric + demographic) and total activity (enrolments + updates).

In [None]:
# Per-record totals: add up the age-band counts of each dataset
en_df['total_enrolment'] = en_df.loc[:, ['age_0_5', 'age_5_17', 'age_18_greater']].sum(axis=1)
bio_df['total_biometric'] = bio_df.loc[:, ['bio_age_5_17', 'bio_age_17_']].sum(axis=1)
demo_df['total_demographic'] = demo_df.loc[:, ['demo_age_5_17', 'demo_age_17_']].sum(axis=1)

# Collapse every dataset to one row per (date, state)
date_state = ['date', 'state']
en_agg = en_df.groupby(date_state)['total_enrolment'].sum().reset_index()
bio_agg = bio_df.groupby(date_state)['total_biometric'].sum().reset_index()
demo_agg = demo_df.groupby(date_state)['total_demographic'].sum().reset_index()

# Outer joins keep (date, state) pairs that appear in only some datasets;
# the totals missing for such pairs are then filled with 0
master_agg = pd.merge(en_agg, bio_agg, on=date_state, how='outer')
master_agg = pd.merge(master_agg, demo_agg, on=date_state, how='outer')
master_agg = master_agg.fillna(0)

# Derived columns: calendar month label and combined volumes
master_agg['month'] = master_agg['date'].dt.strftime('%Y-%m')
master_agg['total_updates'] = master_agg['total_biometric'].add(master_agg['total_demographic'])
master_agg['total_activity'] = master_agg['total_enrolment'].add(master_agg['total_updates'])

print('Master aggregation completed.')

### 4. Advanced Clustering Analysis
Grouping states by their operational behaviour, i.e. whether their activity is enrolment-, biometric-, or demographic-dominant.

In [None]:
# Whole-period totals per state: the raw material for the behavioural profile
state_behavior = master_agg.groupby('state').agg({
    'total_enrolment': 'sum',
    'total_biometric': 'sum',
    'total_demographic': 'sum',
    'total_activity': 'sum'
}).reset_index()

# Keep only states with more than 5000 total transactions so that the shares
# below are not computed from a handful of records
state_behavior = state_behavior[state_behavior['total_activity'] > 5000].copy()
X = state_behavior[['total_enrolment', 'total_biometric', 'total_demographic']]
# Convert volumes to row-wise shares so clusters reflect the activity mix, not state size
X_norm = X.div(X.sum(axis=1), axis=0)

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_norm)
# n_init is pinned explicitly: its default changed from 10 to 'auto' in
# scikit-learn 1.4, so leaving it implicit makes the cluster assignment depend
# on the installed version (and triggers a FutureWarning on 1.2/1.3).
kmeans = KMeans(n_clusters=3, n_init=10, random_state=42)
state_behavior['cluster'] = kmeans.fit_predict(X_scaled)

# NEW: Clustering Visualization
plt.figure(figsize=(10, 8))
scatter = plt.scatter(X_norm['total_enrolment'], X_norm['total_biometric'],
                      c=state_behavior['cluster'], cmap='viridis', s=100, alpha=0.7, edgecolors='k')
plt.xlabel('Proportion of Enrolment')
plt.ylabel('Proportion of Biometric Updates')
plt.title('State Clusters: Operational Archetypes', fontsize=16, fontweight='bold')

# Label only the high-volume states (> 1,000,000 transactions) to keep the plot legible.
# X_norm shares state_behavior's index, so the same boolean mask selects matching rows.
major_states = state_behavior['total_activity'] > 1000000
for state_name, enrol_share, bio_share in zip(
    state_behavior.loc[major_states, 'state'],
    X_norm.loc[major_states, 'total_enrolment'],
    X_norm.loc[major_states, 'total_biometric'],
):
    plt.annotate(state_name, (enrol_share, bio_share), fontsize=9)

plt.colorbar(scatter, label='Cluster ID')
plt.savefig(os.path.join(visuals_dir, 'state_clusters_scatter.png'), dpi=300)
plt.show()

### 5. Service Stress Index (SSI)
A composite index that combines each state's update-to-enrolment ratio with the volatility of its daily update volume to identify infrastructure hotspots.

In [None]:
# Exclude the 1st of every month before measuring stress.
# NOTE(review): presumably those dates carry bulk/backlog uploads that would
# distort the volatility measure - confirm with the data owners.
master_clean = master_agg[master_agg['date'].dt.day != 1].copy()
state_metrics = master_clean.groupby('state').agg({
    'total_enrolment': 'sum',
    'total_biometric': 'sum',
    'total_demographic': 'sum',
    'total_updates': 'sum'
}).reset_index()

# Update-to-enrolment ratio; the +10 keeps the denominator away from zero for
# states with almost no enrolments
state_metrics['ratio'] = state_metrics['total_updates'] / (state_metrics['total_enrolment'] + 10)
# Standard deviation of the daily update totals per state. Selecting the column
# before aggregating gives the same values as groupby.apply(lambda ...) without
# applying a function over the grouping columns (deprecated in pandas 2.2).
daily_vol = master_clean.groupby('state')['total_updates'].std().reset_index(name='volatility')
state_metrics = state_metrics.merge(daily_vol, on='state')
# Volatility relative to a rough per-day volume (+1 avoids division by zero).
# NOTE(review): 300 looks like an approximate day count for the period - confirm.
state_metrics['vol_index'] = state_metrics['volatility'] / (state_metrics['total_updates'] / 300 + 1)


def norm(s):
    """Min-max scale ``s`` to roughly 0-100; the 1e-6 guards against a zero range."""
    return 100 * (s - s.min()) / (s.max() - s.min() + 1e-6)


# Filter BEFORE normalising: otherwise states that are dropped anyway (1000 or
# fewer enrolments) would set the min/max and compress the scores of the states
# that are actually ranked.
state_metrics = state_metrics[state_metrics['total_enrolment'] > 1000].copy()
# SSI is an equal-weight blend of the two normalised components
state_metrics['SSI'] = (norm(state_metrics['ratio']) * 0.5) + (norm(state_metrics['vol_index']) * 0.5)
state_metrics = state_metrics.sort_values('SSI', ascending=False)

# Top 15 states, reversed so the highest score is drawn at the top of the chart
top_hotspots = state_metrics.head(15).iloc[::-1]
plt.figure(figsize=(10, 8))
plt.barh(top_hotspots['state'], top_hotspots['SSI'], color='maroon')
plt.title('Critical Hotspots: Service Stress Index', fontsize=16, fontweight='bold')
plt.xlabel('Stress Score (0-100)')
plt.tight_layout()
plt.savefig(os.path.join(visuals_dir, 'final_ssi_chart.png'), dpi=300)
plt.show()

### 6. NEW: Geographic Demand Concentration (Lorenz Curve)
Visualising the inequality of UIDAI service demand across states. The further a curve bows below the line of perfect equality, the more that demand is concentrated in a small number of high-activity states.

In [None]:
def plot_lorenz_curve(data, label, color):
    """Draw the Lorenz curve of ``data`` on the current matplotlib axes.

    The values are sorted in ascending order and the cumulative share of the
    total is plotted against the cumulative share of observations. The curve
    is anchored at (0, 0) so that it is directly comparable with the
    perfect-equality diagonal.

    Args:
        data: One-dimensional collection of non-negative amounts (one per unit).
        label: Legend label for the curve.
        color: Matplotlib colour of the curve.

    Returns:
        None. The curve is added to the current axes.
    """
    sorted_data = np.sort(np.asarray(data, dtype=float))
    total = sorted_data.sum()
    if sorted_data.size == 0 or total == 0:
        # Shares are undefined here; keep the legend entry but draw no curve
        # instead of dividing by zero.
        plt.plot([], [], label=label, color=color, linewidth=3)
        return

    # Prepend the origin: 0% of the units hold 0% of the total. Without it the
    # first point sits at x=0 with a non-zero share and the whole curve is
    # shifted against the population axis.
    cum_data = np.concatenate(([0.0], np.cumsum(sorted_data) / total))
    cum_pts = np.linspace(0, 1, len(cum_data))
    plt.plot(cum_pts, cum_data, label=label, color=color, linewidth=3)

# Square 10x10-inch figure for the concentration curves
plt.figure(figsize=(10, 10))

# One Lorenz curve per demand type, drawn in legend order
for column, legend_label, line_color in (
    ('total_updates', 'Updates Demand', 'orange'),
    ('total_enrolment', 'Enrolment Demand', 'green'),
):
    plot_lorenz_curve(state_metrics[column], legend_label, line_color)

# Diagonal reference: every state carrying an equal share of demand
plt.plot([0, 1], [0, 1], 'k--', label='Perfect Equality')

# Titles, axis labels, legend and grid on the axes that now hold the curves
ax = plt.gca()
ax.set_title('Geographic Demand Concentration: Enrolment vs Updates', fontsize=16, fontweight='bold')
ax.set_xlabel('Cumulative Proportion of States')
ax.set_ylabel('Cumulative Proportion of Demand')
ax.legend()
ax.grid(True)

plt.savefig(os.path.join(visuals_dir, 'lorenz_concentration.png'), dpi=300)
plt.show()