# UIDAI Hackathon - Data Visualization

## Objective
This notebook creates comprehensive interactive visualizations for the Aadhaar datasets:
- Temporal trend visualizations (daily, weekly, monthly)
- Geographical distribution maps
- State and district comparisons
- Age group demographic charts
- Seasonality patterns
- Update ratio visualizations
- Distribution and outlier analysis

**Author:** Harsh Vardhan  
**Date:** January 13, 2026  
**Input:** Enrolment, demographic and biometric datasets, loaded and cleaned in Section 2 of this notebook  
**Output:** Interactive HTML visualizations written to the `outputs/` directory

## 1. Setup Environment

In [1]:
# Standard libraries
import sys
import os
from pathlib import Path
import warnings
warnings.filterwarnings('ignore')

# Data manipulation
import pandas as pd
import numpy as np

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Configure plotting defaults for the matplotlib/seaborn figures
sns.set_style("whitegrid")
plt.rcParams['figure.figsize'] = (12, 6)

# Locate the project root.
# The original author's checkout path remains the default, so existing runs
# behave exactly as before; set the IDENTITYLAB_ROOT environment variable to
# point the notebook at a different checkout without editing this cell.
_default_project_root = r'c:\Users\harsh\OneDrive - Indian Institute of Information Technology, Nagpur\IIIT Nagpur\6th Semester\Projects\IdentityLab'
project_root = Path(os.environ.get('IDENTITYLAB_ROOT', _default_project_root))

# Make the modules under <project_root>/src importable. The membership check
# keeps sys.path from accumulating duplicate entries when the cell is re-run.
_src_dir = str(project_root / 'src')
if _src_dir not in sys.path:
    sys.path.append(_src_dir)

# Import custom modules (resolved through the src directory added above)
from data_loader import AadhaarDataLoader
from preprocessing import AadhaarDataPreprocessor
from analysis import AadhaarAnalyzer
from visualization import AadhaarVisualizer

print("✓ Environment setup complete")
print(f"✓ Project root: {project_root}")

✓ Environment setup complete
✓ Project root: c:\Users\harsh\OneDrive - Indian Institute of Information Technology, Nagpur\IIIT Nagpur\6th Semester\Projects\IdentityLab


## 2. Load and Prepare Data

In [None]:
# Instantiate the helpers shared by the rest of the notebook: the loader is
# given the project root, the preprocessor and analyzer take no arguments.
loader = AadhaarDataLoader(str(project_root))
preprocessor, analyzer = AadhaarDataPreprocessor(), AadhaarAnalyzer()

# Progress banner followed by a 60-character rule
banner_rule = "-" * 60
print("Loading and cleaning datasets...", banner_rule, sep="\n")

In [None]:
# Enrolment dataset: load the raw records, run the enrolment cleaning step,
# then remove exact duplicate rows.
df_enrolment_raw = loader.load_enrolment_data()
df_enrolment = preprocessor.clean_enrolment_data(df_enrolment_raw).drop_duplicates()
enrolment_row_count = len(df_enrolment)
print(f"✓ Enrolment: {enrolment_row_count:,} records")

In [None]:
# Demographic dataset: load the raw records, run the demographic cleaning
# step, then remove exact duplicate rows.
df_demographic_raw = loader.load_demographic_data()
df_demographic = preprocessor.clean_demographic_data(df_demographic_raw).drop_duplicates()
demographic_row_count = len(df_demographic)
print(f"✓ Demographic: {demographic_row_count:,} records")

In [None]:
# Biometric dataset: load the raw records, run the biometric cleaning step,
# then remove exact duplicate rows.
df_biometric_raw = loader.load_biometric_data()
df_biometric = preprocessor.clean_biometric_data(df_biometric_raw).drop_duplicates()
biometric_row_count = len(df_biometric)
print(f"✓ Biometric: {biometric_row_count:,} records")

## 3. Initialize Visualizer

In [None]:
# Create the visualizer, pointing it at <project_root>/outputs
outputs_path = project_root / 'outputs'
viz = AadhaarVisualizer(output_dir=str(outputs_path))
print("✓ Visualizer initialized")
print(f"✓ Output directory: {outputs_path}")

## 4. Temporal Analysis Visualizations

Create time series plots to show trends over time.

In [None]:
# Aggregate each dataset's total column with freq='M'. Later cells read a
# 'date' column and a '<metric>_sum' column from each result.
print("Preparing temporal aggregations...")
monthly_freq = 'M'
enrol_monthly = analyzer.temporal_aggregation(
    df_enrolment, 'total_enrolments', freq=monthly_freq
)
demo_monthly = analyzer.temporal_aggregation(
    df_demographic, 'total_demo_updates', freq=monthly_freq
)
bio_monthly = analyzer.temporal_aggregation(
    df_biometric, 'total_bio_updates', freq=monthly_freq
)
print("✓ Monthly aggregations ready")

In [None]:
# Time series of the monthly enrolment sums.
# NOTE(review): save_path is a bare file name; presumably the visualizer
# resolves it against its output directory - confirm in visualization.py.
enrol_ts_title = 'Monthly Enrolment Trends (March - December 2025)'
fig = viz.plot_time_series(
    enrol_monthly, 'date', 'total_enrolments_sum', enrol_ts_title,
    save_path='enrolment_time_series.html',
)
fig.show()
print("✓ Enrolment time series saved")

In [None]:
# Time series of the monthly demographic-update sums.
# NOTE(review): save_path is a bare file name; presumably the visualizer
# resolves it against its output directory - confirm in visualization.py.
demo_ts_title = 'Monthly Demographic Updates Trends'
fig = viz.plot_time_series(
    demo_monthly, 'date', 'total_demo_updates_sum', demo_ts_title,
    save_path='demographic_time_series.html',
)
fig.show()
print("✓ Demographic time series saved")

In [None]:
# Time series of the monthly biometric-update sums.
# NOTE(review): save_path is a bare file name; presumably the visualizer
# resolves it against its output directory - confirm in visualization.py.
bio_ts_title = 'Monthly Biometric Updates Trends'
fig = viz.plot_time_series(
    bio_monthly, 'date', 'total_bio_updates_sum', bio_ts_title,
    save_path='biometric_time_series.html',
)
fig.show()
print("✓ Biometric time series saved")

In [None]:
# Overlay the three monthly series on a single figure for direct comparison.
# Each entry is (legend label, monthly frame, value column); traces are added
# in this order, which also fixes the legend order.
series_specs = [
    ('Enrolments', enrol_monthly, 'total_enrolments_sum'),
    ('Demographic Updates', demo_monthly, 'total_demo_updates_sum'),
    ('Biometric Updates', bio_monthly, 'total_bio_updates_sum'),
]

fig = go.Figure()
for trace_label, monthly_frame, value_column in series_specs:
    fig.add_trace(go.Scatter(
        x=monthly_frame['date'],
        y=monthly_frame[value_column],
        mode='lines+markers',
        name=trace_label,
        line=dict(width=3),
    ))

# Shared hover across series, legend pinned to the top-left corner
layout_options = dict(
    title='Aadhaar Activity Comparison: Enrolments vs Updates',
    xaxis_title='Date',
    yaxis_title='Count',
    hovermode='x unified',
    template='plotly_white',
    legend=dict(x=0.01, y=0.99),
)
fig.update_layout(**layout_options)

# This figure is built by hand, so it is written to the output directory here
fig.write_html(f"{viz.output_dir}/combined_time_series.html")
fig.show()
print("✓ Combined time series saved")

## 5. Distribution Visualizations

Analyze the distribution of enrolments and updates.

In [None]:
# Distribution plot of the 'total_enrolments' column of the cleaned enrolment data
enrol_dist_title = 'Distribution of Daily Enrolments'
fig = viz.plot_distribution(
    df_enrolment, 'total_enrolments', enrol_dist_title,
    save_path='enrolment_distribution.html',
)
fig.show()
print("✓ Enrolment distribution saved")

In [None]:
# Distribution plot of the 'total_demo_updates' column of the cleaned demographic data
demo_dist_title = 'Distribution of Daily Demographic Updates'
fig = viz.plot_distribution(
    df_demographic, 'total_demo_updates', demo_dist_title,
    save_path='demographic_distribution.html',
)
fig.show()
print("✓ Demographic distribution saved")

In [None]:
# Distribution plot of the 'total_bio_updates' column of the cleaned biometric data
bio_dist_title = 'Distribution of Daily Biometric Updates'
fig = viz.plot_distribution(
    df_biometric, 'total_bio_updates', bio_dist_title,
    save_path='biometric_distribution.html',
)
fig.show()
print("✓ Biometric distribution saved")

## 6. Geographical Analysis

Visualize state and district-level patterns.

In [None]:
# Top 15 states ranked on total enrolments. `top_states_enrol` is reused by
# the summary dashboard cell, so its name must stay as is.
state_count = 15
top_states_enrol = analyzer.top_n_analysis(
    df_enrolment, 'state', 'total_enrolments', n=state_count
)
fig = viz.plot_top_n_bar(
    top_states_enrol, 'state', 'total_total_enrolments',
    n=state_count,
    title='Top 15 States by Total Enrolments',
    save_path='top_states_enrolment.html',
)
fig.show()
print("✓ Top states visualization saved")

In [None]:
# Top 20 districts ranked on total enrolments
district_count = 20
top_districts_enrol = analyzer.top_n_analysis(
    df_enrolment, 'district', 'total_enrolments', n=district_count
)
fig = viz.plot_top_n_bar(
    top_districts_enrol, 'district', 'total_total_enrolments',
    n=district_count,
    title='Top 20 Districts by Total Enrolments',
    save_path='top_districts_enrolment.html',
)
fig.show()
print("✓ Top districts visualization saved")

In [None]:
# State-level aggregation for updates (both frames are kept in full; the
# biometric one is plotted in the next cell)
demo_by_state = analyzer.geographical_aggregation(df_demographic, 'state', 'total_demo_updates')
bio_by_state = analyzer.geographical_aggregation(df_biometric, 'state', 'total_bio_updates')

# Top states by demographic updates.
# Rank explicitly on the plotted column instead of taking .head(15): head()
# is only the "top 15" if geographical_aggregation already returns rows
# sorted descending, which cannot be verified from this notebook. nlargest
# yields the same rows when the input is already sorted that way.
top_demo_states = demo_by_state.nlargest(15, 'total_demo_updates_sum')
fig = viz.plot_top_n_bar(
    top_demo_states,
    'state',
    'total_demo_updates_sum',
    n=15,
    title='Top 15 States by Demographic Updates',
    save_path='top_states_demographic.html'
)
fig.show()
print("✓ Top states demographic visualization saved")

In [None]:
# Top states by biometric updates.
# Rank explicitly on the plotted column instead of taking .head(15): head()
# is only the "top 15" if the state aggregation is already sorted descending,
# which cannot be verified from this notebook. nlargest yields the same rows
# when the input is already sorted that way.
top_bio_states = bio_by_state.nlargest(15, 'total_bio_updates_sum')
fig = viz.plot_top_n_bar(
    top_bio_states,
    'state',
    'total_bio_updates_sum',
    n=15,
    title='Top 15 States by Biometric Updates',
    save_path='top_states_biometric.html'
)
fig.show()
print("✓ Top states biometric visualization saved")

## 7. Seasonality Patterns

Visualize monthly patterns and seasonal trends.

In [None]:
# Seasonal-pattern chart for the 'total_enrolments' column
enrol_season_title = 'Enrolment Seasonality - Monthly Average Pattern'
fig = viz.plot_seasonal_pattern(
    df_enrolment, 'total_enrolments', enrol_season_title,
    save_path='enrolment_seasonality.html',
)
fig.show()
print("✓ Enrolment seasonality saved")

In [None]:
# Seasonal-pattern chart for the 'total_demo_updates' column
demo_season_title = 'Demographic Update Seasonality - Monthly Average Pattern'
fig = viz.plot_seasonal_pattern(
    df_demographic, 'total_demo_updates', demo_season_title,
    save_path='demographic_seasonality.html',
)
fig.show()
print("✓ Demographic seasonality saved")

In [None]:
# Seasonal-pattern chart for the 'total_bio_updates' column
bio_season_title = 'Biometric Update Seasonality - Monthly Average Pattern'
fig = viz.plot_seasonal_pattern(
    df_biometric, 'total_bio_updates', bio_season_title,
    save_path='biometric_seasonality.html',
)
fig.show()
print("✓ Biometric seasonality saved")

## 8. Age Group Analysis

Compare distributions across age groups.

In [None]:
# Donut chart of total enrolments per age-group column.
# `age_groups` and `age_totals` are reused by later cells.
age_groups = ['age_0_5', 'age_5_17', 'age_18_greater']
age_totals = df_enrolment[age_groups].sum()

# Display labels and colours are listed in the same order as `age_groups`
age_labels = ['Age 0-5', 'Age 5-17', 'Age 18+']
slice_colors = ['#636EFA', '#EF553B', '#00CC96']
age_donut = go.Pie(
    labels=age_labels,
    values=age_totals.values,
    hole=0.3,
    marker=dict(colors=slice_colors),
)

fig = go.Figure(data=[age_donut])
fig.update_layout(title='Enrolment Distribution by Age Group', template='plotly_white')

# Hand-built figure, so it is written to the output directory here
fig.write_html(f"{viz.output_dir}/age_group_distribution.html")
fig.show()
print("✓ Age group distribution saved")

In [None]:
# Age-group totals for the ten states returned by the top-n analysis.
# Relies on `age_groups` from the age-group pie chart cell.
top_10_states = analyzer.top_n_analysis(
    df_enrolment, 'state', 'total_enrolments', n=10
)['state'].tolist()

# Keep only rows for those states, then sum each age column per state
in_top_10 = df_enrolment['state'].isin(top_10_states)
age_by_state = df_enrolment[in_top_10].groupby('state')[age_groups].sum()

# reset_index() turns the 'state' group key back into a regular column
fig = viz.plot_comparison(
    age_by_state.reset_index(), 'state', age_groups,
    'Age Group Distribution - Top 10 States',
    save_path='age_group_by_state.html',
)
fig.show()
print("✓ Age group by state comparison saved")

## 9. Update Ratio Analysis

Visualize the ratio of updates to enrolments.

In [None]:
# State-level update ratios for each update dataset against enrolments.
# Later cells read 'state' and 'update_ratio' columns from both results.
ratio_geo_level = 'state'
demo_ratio = analyzer.calculate_update_ratio(
    df_enrolment, df_demographic, geo_level=ratio_geo_level
)
bio_ratio = analyzer.calculate_update_ratio(
    df_enrolment, df_biometric, geo_level=ratio_geo_level
)

print("✓ Update ratios calculated")

In [None]:
# Top states by demographic update ratio.
# Rank explicitly on 'update_ratio' instead of taking .head(15): head() is
# only the "top 15" if calculate_update_ratio already returns rows sorted
# descending, which cannot be verified from this notebook. nlargest yields
# the same rows when the input is already sorted that way.
top_demo_ratio = demo_ratio.nlargest(15, 'update_ratio')
fig = viz.plot_top_n_bar(
    top_demo_ratio,
    'state',
    'update_ratio',
    n=15,
    title='Top 15 States by Demographic Update Ratio (%)',
    save_path='demographic_update_ratio.html'
)
fig.show()
print("✓ Demographic update ratio visualization saved")

In [None]:
# Top states by biometric update ratio.
# Rank explicitly on 'update_ratio' instead of taking .head(15): head() is
# only the "top 15" if calculate_update_ratio already returns rows sorted
# descending, which cannot be verified from this notebook. nlargest yields
# the same rows when the input is already sorted that way.
top_bio_ratio = bio_ratio.nlargest(15, 'update_ratio')
fig = viz.plot_top_n_bar(
    top_bio_ratio,
    'state',
    'update_ratio',
    n=15,
    title='Top 15 States by Biometric Update Ratio (%)',
    save_path='biometric_update_ratio.html'
)
fig.show()
print("✓ Biometric update ratio visualization saved")

In [None]:
# Side-by-side comparison of the two update ratios per state.
# Each ratio frame is reduced to (state, ratio) and its ratio column renamed,
# so the two can sit in one table after the merge.
demo_side = demo_ratio[['state', 'update_ratio']].rename(
    columns={'update_ratio': 'demographic_ratio'}
)
bio_side = bio_ratio[['state', 'update_ratio']].rename(
    columns={'update_ratio': 'biometric_ratio'}
)

# Default (inner) merge on 'state', then the first 15 rows.
# NOTE(review): which states count as the "top 15" depends on the row order
# of demo_ratio - confirm the intended ranking.
comparison_df = demo_side.merge(bio_side, on='state').head(15)

ratio_columns = ['demographic_ratio', 'biometric_ratio']
fig = viz.plot_comparison(
    comparison_df, 'state', ratio_columns,
    'Update Ratio Comparison - Top 15 States',
    save_path='update_ratio_comparison.html',
)
fig.show()
print("✓ Update ratio comparison saved")

## 10. Correlation Heatmap

Visualize correlations between different metrics.

In [None]:
# Heatmap over the enrolment columns listed below: the three age-group
# counts, their total, and the 'month' / 'day_of_week' columns.
enrol_corr_cols = [
    'age_0_5', 'age_5_17', 'age_18_greater',
    'total_enrolments',
    'month', 'day_of_week',
]
heatmap_title = 'Enrolment Data Correlation Matrix'
fig = viz.plot_heatmap(
    df_enrolment, enrol_corr_cols, heatmap_title,
    save_path='enrolment_correlation.html',
)
fig.show()
print("✓ Enrolment correlation heatmap saved")

## 11. Day of Week Analysis

Analyze patterns by day of week.

In [None]:
# Mean enrolments per weekday.
# 'day_of_week' values are used as list indices into `day_names`
# (0 -> 'Monday' ... 6 -> 'Sunday').
day_names = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
dow_enrol = df_enrolment.groupby('day_of_week')['total_enrolments'].mean().reset_index()
dow_enrol['day_name'] = dow_enrol['day_of_week'].map(day_names.__getitem__)

weekday_bars = go.Bar(
    x=dow_enrol['day_name'],
    y=dow_enrol['total_enrolments'],
    marker_color='indianred',
)
fig = go.Figure()
fig.add_trace(weekday_bars)

layout_options = dict(
    title='Average Enrolments by Day of Week',
    xaxis_title='Day of Week',
    yaxis_title='Average Enrolments',
    template='plotly_white',
)
fig.update_layout(**layout_options)

# Hand-built figure, so it is written to the output directory here
fig.write_html(f"{viz.output_dir}/day_of_week_pattern.html")
fig.show()
print("✓ Day of week pattern saved")

## 12. Summary Dashboard

Create a comprehensive multi-panel dashboard.

In [None]:
# Six-panel summary dashboard on a 3x2 grid.
# Reuses results from earlier cells: enrol_monthly, age_totals,
# top_states_enrol, dow_enrol and demo_ratio.
from plotly.subplots import make_subplots

panel_titles = (
    'Monthly Enrolments',
    'Age Group Distribution',
    'Top 10 States',
    'Seasonality Pattern',
    'Day of Week Pattern',
    'Update Ratios',
)
# The pie chart needs a 'domain' cell; every other panel is a cartesian one
panel_specs = [
    [{'type': 'scatter'}, {'type': 'domain'}],
    [{'type': 'bar'}, {'type': 'scatter'}],
    [{'type': 'bar'}, {'type': 'bar'}],
]
fig = make_subplots(rows=3, cols=2, subplot_titles=panel_titles, specs=panel_specs)

# Data prepared specifically for the dashboard panels
top_10_data = top_states_enrol.head(10)
monthly_avg = df_enrolment.groupby('month')['total_enrolments'].mean()
month_names = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
# 'month' values are 1-based, hence the -1 when indexing the label list
month_labels = [month_names[month_no - 1] for month_no in monthly_avg.index]
top_ratio_states = demo_ratio.head(10)

# (row, col, trace), listed in the order the traces are added to the figure
dashboard_panels = [
    # 1. Monthly enrolments
    (1, 1, go.Scatter(x=enrol_monthly['date'], y=enrol_monthly['total_enrolments_sum'],
                      mode='lines+markers', name='Enrolments')),
    # 2. Age distribution pie
    (1, 2, go.Pie(labels=['Age 0-5', 'Age 5-17', 'Age 18+'], values=age_totals.values,
                  name='Age Groups')),
    # 3. Top 10 states (horizontal bars)
    (2, 1, go.Bar(x=top_10_data['total_total_enrolments'], y=top_10_data['state'],
                  orientation='h', name='States')),
    # 4. Seasonality
    (2, 2, go.Scatter(x=month_labels, y=monthly_avg.values,
                      mode='lines+markers', name='Seasonality')),
    # 5. Day of week
    (3, 1, go.Bar(x=dow_enrol['day_name'], y=dow_enrol['total_enrolments'],
                  name='Day Pattern')),
    # 6. Update ratios (horizontal bars)
    (3, 2, go.Bar(x=top_ratio_states['update_ratio'], y=top_ratio_states['state'],
                  orientation='h', name='Update Ratio')),
]
for grid_row, grid_col, panel_trace in dashboard_panels:
    fig.add_trace(panel_trace, row=grid_row, col=grid_col)

# Legend is hidden: each panel already carries its own subplot title
fig.update_layout(
    title_text='Aadhaar Data Analysis Dashboard',
    showlegend=False,
    height=1200,
    template='plotly_white',
)

fig.write_html(f"{viz.output_dir}/comprehensive_dashboard.html")
fig.show()
print("✓ Comprehensive dashboard saved")

## 13. Visualization Summary

In [None]:
# List all generated visualizations
output_dir = project_root / 'outputs'

# Every entry in the output directory whose name ends in '.html', sorted by
# name. This counts whatever is in the directory, so HTML files left over
# from earlier runs are included in the total as well.
html_files = sorted(
    entry.name for entry in output_dir.iterdir() if entry.name.endswith('.html')
)

rule = "=" * 80
print(rule)
print("VISUALIZATION SUMMARY")
print(rule)
print(f"\nTotal visualizations created: {len(html_files)}")
print(f"Output directory: {output_dir}")
print("\nGenerated Files:")
for i, file in enumerate(html_files, 1):
    file_size = (output_dir / file).stat().st_size / 1024
    print(f"  {i}. {file} ({file_size:.1f} KB)")

# (category, number of HTML files this notebook saves for it, description).
# The counts mirror the save calls in the cells above: the geographical
# section saves 4 files (states and districts by enrolment, states by
# demographic and by biometric updates), and the day-of-week chart is listed
# as its own category instead of being left out.
visualization_categories = [
    ("Temporal Analysis", 4, "Time series trends"),
    ("Distributions", 3, "Statistical distributions"),
    ("Geographical", 4, "State/district patterns"),
    ("Seasonality", 3, "Monthly patterns"),
    ("Age Groups", 2, "Demographic breakdowns"),
    ("Update Ratios", 3, "Update efficiency metrics"),
    ("Correlations", 1, "Variable relationships"),
    ("Day of Week", 1, "Weekday patterns"),
    ("Dashboard", 1, "Comprehensive overview"),
]

print("\n" + rule)
print("✓ All visualizations complete!")
print("\nVisualization Categories:")
for position, (category, file_count, description) in enumerate(visualization_categories, 1):
    noun = "file" if file_count == 1 else "files"
    print(f"{position}. {category} ({file_count} {noun}) - {description}")
print("\nAll visualizations are interactive HTML files that can be opened in a browser.")