# 05: Visualization Dashboard - UIDAI Hackathon PS-1

**Predictive Analysis of Aadhaar Update Demand**

This notebook creates publication-ready visualizations using **sampled data** to prevent memory issues.

## Strategy:
- Work with small data samples
- Create static plots only
- Save figures immediately
- Clear memory after each section
- Use try-except blocks for robustness

## 1. Setup & Memory Management

In [None]:
# Import essential libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import gc
import os
from pathlib import Path
import warnings
warnings.filterwarnings('ignore')

# Configure plotting
%matplotlib inline
sns.set_style("whitegrid")
plt.rcParams['figure.dpi'] = 150
plt.rcParams['savefig.dpi'] = 150
plt.rcParams['font.size'] = 10
plt.rcParams['axes.titlesize'] = 12
plt.rcParams['axes.labelsize'] = 10

print("✓ Libraries imported successfully")

In [None]:
# All figures are written below ../outputs/figures; make sure it exists
output_dir = Path('..') / 'outputs' / 'figures'
output_dir.mkdir(parents=True, exist_ok=True)

print(f"✓ Output directory created: {output_dir}")

In [None]:
# Memory management function
def clear_memory():
    """Close all plots and run garbage collection.

    Closes every open matplotlib figure via ``plt.close('all')`` and then
    triggers ``gc.collect()``. Returns nothing.
    """
    plt.close('all')
    gc.collect()
    
def save_and_clear(filename, fig=None):
    """Write a figure into ``output_dir`` and then free plotting memory.

    Args:
        filename: File name (relative to ``output_dir``) for the saved image.
        fig: Figure to save. When omitted, the save goes through
            ``plt.savefig`` instead of a specific figure object.

    After saving, a confirmation line is printed and ``clear_memory()`` runs.
    """
    target = output_dir / filename
    writer = plt.savefig if fig is None else fig.savefig
    writer(target, bbox_inches='tight', dpi=150)
    print(f"  → Saved: {filename}")
    clear_memory()

print("✓ Helper functions defined")

## 2. Load Data Efficiently

In [None]:
# Load feature matrix sample (10k rows)
print("Loading feature matrix sample...")
feature_matrix = pd.read_csv('../outputs/results/feature_matrix_sample.csv')

# Convert date-like columns.
# Only non-numeric columns are parsed: the substring 'date' also occurs in
# 'update', and pd.to_datetime() on a numeric column does not fail - it reads
# the numbers as offsets from the epoch - so a column such as "update_count"
# would silently become 1970 timestamps and stop being usable as a measure.
date_cols = [
    col for col in feature_matrix.columns
    if ('date' in col.lower() or 'time' in col.lower())
    and not pd.api.types.is_numeric_dtype(feature_matrix[col])
]
for col in date_cols:
    try:
        feature_matrix[col] = pd.to_datetime(feature_matrix[col])
    except (ValueError, TypeError, OverflowError):
        # Name looked date-like but the values are not dates; keep the column as loaded
        pass

print(f"  Shape: {feature_matrix.shape}")
print(f"  Memory: {feature_matrix.memory_usage(deep=True).sum() / 1024**2:.2f} MB")
print("✓ Feature matrix loaded")

In [None]:
# Load predictions (sample if needed)
# `predictions` is set to None when the file is missing so that later cells
# can skip the plots that depend on it.
print("\nLoading predictions...")
try:
    predictions = pd.read_csv('../outputs/results/predictions.csv')

    # Sample if too large
    if len(predictions) > 100000:
        predictions = predictions.sample(n=100000, random_state=42)
        print(f"  Sampled to 100k rows")

    # Convert date-like columns.
    # Numeric columns are skipped on purpose: 'date' is a substring of
    # 'update' (e.g. "predicted_updates"), and pd.to_datetime() would turn
    # such numeric values into epoch-based timestamps without raising.
    for col in predictions.columns:
        name = col.lower()
        if 'date' not in name and 'time' not in name:
            continue
        if pd.api.types.is_numeric_dtype(predictions[col]):
            continue
        try:
            predictions[col] = pd.to_datetime(predictions[col])
        except (ValueError, TypeError, OverflowError):
            # Values are not parseable as dates; leave the column untouched
            pass

    print(f"  Shape: {predictions.shape}")
    print(f"  Memory: {predictions.memory_usage(deep=True).sum() / 1024**2:.2f} MB")
    print("✓ Predictions loaded")
except FileNotFoundError:
    print("  ⚠ Predictions file not found, will skip related visualizations")
    predictions = None

In [None]:
# Regional classification is optional: `regional_data` becomes None when the
# file is absent, and later cells check for that before plotting.
print("\nLoading regional classification...")
try:
    regional_data = pd.read_csv('../outputs/results/regional_classification.csv')
except FileNotFoundError:
    print("  ⚠ Regional classification file not found, will skip related visualizations")
    regional_data = None
else:
    print(f"  Shape: {regional_data.shape}")
    print(f"  Memory: {regional_data.memory_usage(deep=True).sum() / 1024**2:.2f} MB")
    print("✓ Regional data loaded")

In [None]:
# Load additional result files
print("\nLoading additional result files...")


def _load_optional_results(path, label):
    """Read an optional results CSV, returning None when it cannot be used.

    Args:
        path: Location of the CSV file.
        label: Human-readable name used in the printed status line.

    Returns:
        The loaded DataFrame, or None if the file is missing or unreadable.
        A missing file and an unreadable file are reported with different
        messages; interrupts (KeyboardInterrupt/SystemExit) are not swallowed.
    """
    try:
        frame = pd.read_csv(path)
    except FileNotFoundError:
        print(f"  ⚠ {label} not found")
        return None
    except (OSError, ValueError) as exc:
        # pandas' EmptyDataError and ParserError derive from ValueError
        print(f"  ⚠ {label} could not be read: {exc}")
        return None
    print(f"  {label}: {frame.shape}")
    return frame


feature_importance = _load_optional_results(
    '../outputs/results/feature_importance.csv', 'Feature importance')
model_comparison = _load_optional_results(
    '../outputs/results/model_comparison.csv', 'Model comparison')

print("\n✓ Data loading complete")

## 3. Temporal Analysis

In [None]:
# Temporal analysis: a 2x2 figure (daily trend, monthly totals, weekday
# spread, average by calendar month) saved as temporal_analysis.png.
print("Creating temporal analysis...")

try:
    # Find date and demand columns
    date_col = None
    demand_col = None

    # Look for date columns. Any datetime64 flavour is accepted (tz-aware or
    # a non-nanosecond unit), not only the exact 'datetime64[ns]' dtype.
    for col in feature_matrix.columns:
        if 'date' in col.lower() and pd.api.types.is_datetime64_any_dtype(feature_matrix[col]):
            date_col = col
            break

    # Look for demand/update columns (first numeric match wins)
    for col in feature_matrix.columns:
        if any(word in col.lower() for word in ['demand', 'update', 'count', 'volume']):
            if pd.api.types.is_numeric_dtype(feature_matrix[col]):
                demand_col = col
                break

    temp_data = None
    if date_col and demand_col:
        # Prepare data
        temp_data = feature_matrix[[date_col, demand_col]].copy()
        temp_data = temp_data.dropna()
        temp_data = temp_data.sort_values(date_col)

    if temp_data is None:
        print(f"  ⚠ Required columns not found (date: {date_col}, demand: {demand_col})")
    elif temp_data.empty:
        # Nothing to draw once rows with missing values are removed
        print(f"  ⚠ No usable rows for temporal analysis (date: {date_col}, demand: {demand_col})")
    else:
        # Create figure
        fig, axes = plt.subplots(2, 2, figsize=(12, 8))
        fig.suptitle('Temporal Analysis of Aadhaar Update Demand', fontsize=14, fontweight='bold')

        # Plot 1: Daily trend (most recent 90 dates).
        # Rows are summed per calendar day first, so several rows sharing a
        # date yield one point per day instead of a zig-zag over raw rows.
        daily = temp_data.groupby(temp_data[date_col].dt.normalize())[demand_col].sum()
        recent_data = daily.tail(90)
        axes[0, 0].plot(recent_data.index, recent_data.values, linewidth=1.5, color='#2E86AB')
        axes[0, 0].set_title('Daily Update Demand (Last 90 Days)')
        axes[0, 0].set_xlabel('Date')
        axes[0, 0].set_ylabel('Update Demand')
        axes[0, 0].tick_params(axis='x', rotation=45)
        axes[0, 0].grid(True, alpha=0.3)

        # Plot 2: Monthly aggregation
        temp_data['month'] = temp_data[date_col].dt.to_period('M')
        monthly = temp_data.groupby('month')[demand_col].sum().reset_index()
        monthly['month'] = monthly['month'].astype(str)
        axes[0, 1].bar(range(len(monthly)), monthly[demand_col], color='#A23B72', alpha=0.7)
        axes[0, 1].set_title('Monthly Aggregated Updates')
        axes[0, 1].set_xlabel('Month')
        axes[0, 1].set_ylabel('Total Updates')
        axes[0, 1].set_xticks(range(len(monthly)))
        axes[0, 1].set_xticklabels(monthly['month'], rotation=45, ha='right')
        axes[0, 1].grid(True, alpha=0.3, axis='y')

        # Plot 3: Day of week pattern (only weekdays present in the data)
        temp_data['dayofweek'] = temp_data[date_col].dt.day_name()
        day_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
        present_days = set(temp_data['dayofweek'])
        days_shown = [day for day in day_order if day in present_days]
        axes[1, 0].boxplot([temp_data.loc[temp_data['dayofweek'] == day, demand_col].values
                            for day in days_shown],
                           patch_artist=True,
                           boxprops=dict(facecolor='#F18F01', alpha=0.6))
        # Tick labels are set explicitly rather than through boxplot's
        # `labels=` keyword, which newer matplotlib releases deprecate.
        axes[1, 0].set_xticks(range(1, len(days_shown) + 1))
        axes[1, 0].set_xticklabels([day[:3] for day in days_shown])
        axes[1, 0].set_title('Day of Week Pattern')
        axes[1, 0].set_xlabel('Day')
        axes[1, 0].set_ylabel('Update Demand')
        axes[1, 0].grid(True, alpha=0.3, axis='y')

        # Plot 4: Month-wise summary
        temp_data['month_num'] = temp_data[date_col].dt.month
        monthly_avg = temp_data.groupby('month_num')[demand_col].mean().reset_index()
        month_names = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
        axes[1, 1].bar(monthly_avg['month_num'], monthly_avg[demand_col], color='#06A77D', alpha=0.7)
        axes[1, 1].set_title('Average Demand by Month')
        axes[1, 1].set_xlabel('Month')
        axes[1, 1].set_ylabel('Average Demand')
        axes[1, 1].set_xticks(monthly_avg['month_num'])
        axes[1, 1].set_xticklabels([month_names[m-1] for m in monthly_avg['month_num']], rotation=45)
        axes[1, 1].grid(True, alpha=0.3, axis='y')

        plt.tight_layout()
        save_and_clear('temporal_analysis.png', fig)
        print("✓ Temporal analysis complete")

except Exception as e:
    # Notebook-level boundary: report the problem and release any open figure
    print(f"  ✗ Error in temporal analysis: {str(e)}")
    clear_memory()

## 4. Geographic Analysis

In [None]:
# Geographic analysis: a 2x2 figure (top states, top districts, state
# distribution, text summary) saved as geographic_analysis.png.
print("Creating geographic analysis...")

try:
    # Find geographic columns. The first matching column is used, which is
    # the same rule the executive summary applies to its state column.
    state_col = next((col for col in feature_matrix.columns if 'state' in col.lower()), None)
    district_col = next((col for col in feature_matrix.columns if 'district' in col.lower()), None)

    # Row counts per state (value_counts ignores missing values)
    state_counts = feature_matrix[state_col].value_counts().head(15) if state_col else None

    if not state_col:
        print(f"  ⚠ State column not found")
    elif state_counts.empty:
        # Every value in the state column is missing; there is nothing to rank
        print(f"  ⚠ State column '{state_col}' has no values")
    else:
        fig, axes = plt.subplots(2, 2, figsize=(12, 8))
        fig.suptitle('Geographic Analysis of Aadhaar Updates', fontsize=14, fontweight='bold')

        # Plot 1: Top 15 states
        axes[0, 0].barh(range(len(state_counts)), state_counts.values, color='#2E86AB', alpha=0.7)
        axes[0, 0].set_yticks(range(len(state_counts)))
        axes[0, 0].set_yticklabels(state_counts.index)
        axes[0, 0].set_title('Top 15 States by Update Volume')
        axes[0, 0].set_xlabel('Number of Updates')
        axes[0, 0].invert_yaxis()
        axes[0, 0].grid(True, alpha=0.3, axis='x')

        # Plot 2: Top 15 districts (if available)
        if district_col:
            district_counts = feature_matrix[district_col].value_counts().head(15)
            axes[0, 1].barh(range(len(district_counts)), district_counts.values, color='#A23B72', alpha=0.7)
            axes[0, 1].set_yticks(range(len(district_counts)))
            axes[0, 1].set_yticklabels(district_counts.index)
            axes[0, 1].set_title('Top 15 Districts by Update Volume')
            axes[0, 1].set_xlabel('Number of Updates')
            axes[0, 1].invert_yaxis()
            axes[0, 1].grid(True, alpha=0.3, axis='x')
        else:
            axes[0, 1].text(0.5, 0.5, 'District data not available',
                           ha='center', va='center', fontsize=12)
            axes[0, 1].axis('off')

        # Plot 3: State-wise row counts, 15 largest in ascending order
        state_stats = feature_matrix.groupby(state_col).size().sort_values(ascending=True).tail(15)
        axes[1, 0].barh(range(len(state_stats)), state_stats.values, color='#F18F01', alpha=0.7)
        axes[1, 0].set_yticks(range(len(state_stats)))
        axes[1, 0].set_yticklabels(state_stats.index)
        axes[1, 0].set_title('State-wise Update Distribution')
        axes[1, 0].set_xlabel('Update Count')
        axes[1, 0].invert_yaxis()
        axes[1, 0].grid(True, alpha=0.3, axis='x')

        # Plot 4: Geographic summary statistics
        total_states = feature_matrix[state_col].nunique()
        total_updates = len(feature_matrix)
        avg_per_state = total_updates / total_states if total_states > 0 else 0

        summary_text = f"Geographic Summary\n\n"
        summary_text += f"Total States: {total_states}\n"
        if district_col:
            summary_text += f"Total Districts: {feature_matrix[district_col].nunique()}\n"
        summary_text += f"Total Updates: {total_updates:,}\n"
        summary_text += f"Avg Updates/State: {avg_per_state:.0f}\n\n"
        summary_text += f"Top State: {state_counts.index[0]}\n"
        summary_text += f"Top State Updates: {state_counts.values[0]:,}"

        axes[1, 1].text(0.1, 0.5, summary_text, fontsize=10,
                       verticalalignment='center', family='monospace',
                       bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.3))
        axes[1, 1].axis('off')

        plt.tight_layout()
        save_and_clear('geographic_analysis.png', fig)
        print("✓ Geographic analysis complete")

except Exception as e:
    # Notebook-level boundary: report the problem and release any open figure
    print(f"  ✗ Error in geographic analysis: {str(e)}")
    clear_memory()

## 5. Demand Forecast

In [None]:
# Demand forecast: actual-vs-predicted scatter plus either a recent time
# series (when a datetime column exists) or a residual histogram, saved as
# demand_forecast.png.
print("Creating demand forecast visualization...")

try:
    if predictions is not None:
        # Find actual and predicted columns. Only numeric columns qualify and
        # the first match wins, so a column such as "prediction_date" or a
        # trailing "predicted_upper" cannot displace the real prediction.
        numeric_cols = [col for col in predictions.columns
                        if pd.api.types.is_numeric_dtype(predictions[col])]
        actual_col = next((col for col in numeric_cols
                           if 'actual' in col.lower() or 'true' in col.lower()), None)
        predicted_col = next((col for col in numeric_cols
                              if col != actual_col
                              and ('predict' in col.lower() or 'forecast' in col.lower())), None)
        # Any datetime64 flavour is accepted, not only exactly 'datetime64[ns]'
        date_col = next((col for col in predictions.columns
                         if ('date' in col.lower() or 'time' in col.lower())
                         and pd.api.types.is_datetime64_any_dtype(predictions[col])), None)

        pred_data = None
        if actual_col and predicted_col:
            pred_data = predictions[[actual_col, predicted_col]].dropna()

        if pred_data is None:
            print(f"  ⚠ Required columns not found (actual: {actual_col}, predicted: {predicted_col})")
        elif pred_data.empty:
            # Metrics and axis limits would all be NaN; skip the figure
            print(f"  ⚠ No usable rows for forecast metrics (actual: {actual_col}, predicted: {predicted_col})")
        else:
            # Calculate metrics
            rmse = np.sqrt(np.mean((pred_data[actual_col] - pred_data[predicted_col])**2))
            mae = np.mean(np.abs(pred_data[actual_col] - pred_data[predicted_col]))

            fig, axes = plt.subplots(2, 1, figsize=(12, 8))
            fig.suptitle('Demand Forecast Performance', fontsize=14, fontweight='bold')

            # Plot 1: Actual vs Predicted scatter (at most 5000 points)
            sample_size = min(5000, len(pred_data))
            sample_data = pred_data.sample(n=sample_size, random_state=42)

            axes[0].scatter(sample_data[actual_col], sample_data[predicted_col],
                          alpha=0.3, s=20, color='#2E86AB')

            # Add diagonal line
            min_val = min(sample_data[actual_col].min(), sample_data[predicted_col].min())
            max_val = max(sample_data[actual_col].max(), sample_data[predicted_col].max())
            axes[0].plot([min_val, max_val], [min_val, max_val], 'r--', linewidth=2, label='Perfect Prediction')

            axes[0].set_title(f'Actual vs Predicted (RMSE: {rmse:.2f}, MAE: {mae:.2f})')
            axes[0].set_xlabel('Actual Demand')
            axes[0].set_ylabel('Predicted Demand')
            axes[0].legend()
            axes[0].grid(True, alpha=0.3)

            # Plot 2: Time series (if date available)
            if date_col:
                time_data = predictions[[date_col, actual_col, predicted_col]].dropna()
                # Sum per calendar day, then keep the most recent 30 dates, so
                # the panel shows 30 days rather than the last 30 raw rows.
                daily = (time_data
                         .groupby(time_data[date_col].dt.normalize())[[actual_col, predicted_col]]
                         .sum()
                         .tail(30))

                axes[1].plot(daily.index, daily[actual_col],
                           marker='o', linewidth=2, label='Actual', color='#2E86AB')
                axes[1].plot(daily.index, daily[predicted_col],
                           marker='s', linewidth=2, label='Predicted', color='#F18F01')
                axes[1].set_title('Actual vs Predicted (Last 30 Days)')
                axes[1].set_xlabel('Date')
                axes[1].set_ylabel('Demand')
                axes[1].legend()
                axes[1].tick_params(axis='x', rotation=45)
                axes[1].grid(True, alpha=0.3)
            else:
                # Show residuals histogram instead
                residuals = pred_data[actual_col] - pred_data[predicted_col]
                axes[1].hist(residuals, bins=50, color='#A23B72', alpha=0.7, edgecolor='black')
                axes[1].axvline(x=0, color='red', linestyle='--', linewidth=2)
                axes[1].set_title('Prediction Error Distribution')
                axes[1].set_xlabel('Residual (Actual - Predicted)')
                axes[1].set_ylabel('Frequency')
                axes[1].grid(True, alpha=0.3, axis='y')

            plt.tight_layout()
            save_and_clear('demand_forecast.png', fig)
            print("✓ Demand forecast visualization complete")
    else:
        print("  ⚠ Predictions data not available")

except Exception as e:
    # Notebook-level boundary: report the problem and release any open figure
    print(f"  ✗ Error in demand forecast: {str(e)}")
    clear_memory()

## 6. Regional Classification

In [None]:
# Regional classification: a 2x2 figure (cluster shares, cluster sizes,
# per-cluster feature means, sample regions) saved as regional_clusters.png.
print("Creating regional classification visualization...")

try:
    if regional_data is not None:
        # Find cluster column (first column whose name looks like a grouping)
        cluster_col = None
        for col in regional_data.columns:
            if 'cluster' in col.lower() or 'class' in col.lower() or 'category' in col.lower():
                cluster_col = col
                break

        if cluster_col:
            fig, axes = plt.subplots(2, 2, figsize=(12, 8))
            fig.suptitle('Regional Classification Analysis', fontsize=14, fontweight='bold')

            # Plot 1: Cluster distribution (pie chart)
            cluster_counts = regional_data[cluster_col].value_counts()
            colors = plt.cm.Set3(range(len(cluster_counts)))
            axes[0, 0].pie(cluster_counts.values, labels=cluster_counts.index, autopct='%1.1f%%',
                          colors=colors, startangle=90)
            axes[0, 0].set_title('Cluster Distribution')

            # Plot 2: Cluster sizes (bar chart)
            axes[0, 1].bar(range(len(cluster_counts)), cluster_counts.values,
                          color=colors, alpha=0.7, edgecolor='black')
            axes[0, 1].set_xticks(range(len(cluster_counts)))
            axes[0, 1].set_xticklabels(cluster_counts.index)
            axes[0, 1].set_title('Regions per Cluster')
            axes[0, 1].set_xlabel('Cluster')
            axes[0, 1].set_ylabel('Number of Regions')
            axes[0, 1].grid(True, alpha=0.3, axis='y')

            # Plot 3: Cluster characteristics (if numeric columns available)
            numeric_cols = regional_data.select_dtypes(include=[np.number]).columns.tolist()
            if cluster_col in numeric_cols:
                numeric_cols.remove(cluster_col)

            if len(numeric_cols) >= 2:
                # Take the first 2-3 numeric columns
                feature_cols = numeric_cols[:3]
                cluster_means = regional_data.groupby(cluster_col)[feature_cols].mean()

                x = np.arange(len(cluster_means.index))
                width = 0.25

                for i, col in enumerate(feature_cols):
                    axes[1, 0].bar(x + i*width, cluster_means[col], width,
                                 label=col[:20], alpha=0.7)

                axes[1, 0].set_xlabel('Cluster')
                axes[1, 0].set_ylabel('Average Value')
                axes[1, 0].set_title('Cluster Characteristics')
                # Centre each tick under its group of bars; the offset depends
                # on how many bars (2 or 3) were actually drawn per cluster.
                axes[1, 0].set_xticks(x + width * (len(feature_cols) - 1) / 2)
                axes[1, 0].set_xticklabels(cluster_means.index)
                axes[1, 0].legend()
                axes[1, 0].grid(True, alpha=0.3, axis='y')
            else:
                axes[1, 0].text(0.5, 0.5, 'Insufficient numeric features\nfor cluster analysis',
                               ha='center', va='center', fontsize=12)
                axes[1, 0].axis('off')

            # Plot 4: Sample regions per cluster
            summary_text = "Sample Regions per Cluster\n\n"
            region_col = None
            for col in regional_data.columns:
                if 'region' in col.lower() or 'state' in col.lower() or 'district' in col.lower():
                    region_col = col
                    break

            if region_col:
                # Iterate the labels from value_counts(), which excludes
                # missing values: sorting string labels mixed with NaN would
                # raise TypeError and discard the whole figure.
                for cluster in sorted(cluster_counts.index):
                    cluster_regions = regional_data[regional_data[cluster_col] == cluster][region_col].head(3).tolist()
                    summary_text += f"Cluster {cluster}:\n"
                    for region in cluster_regions:
                        summary_text += f"  • {str(region)[:25]}\n"
                    summary_text += "\n"
            else:
                summary_text += f"Total Clusters: {len(cluster_counts)}\n"
                summary_text += f"Total Regions: {len(regional_data)}\n"
                if len(cluster_counts) > 0:
                    summary_text += f"Largest Cluster: {cluster_counts.index[0]}\n"
                    summary_text += f"Regions in Largest: {cluster_counts.values[0]}"

            axes[1, 1].text(0.05, 0.95, summary_text, fontsize=9,
                           verticalalignment='top', family='monospace',
                           bbox=dict(boxstyle='round', facecolor='lightblue', alpha=0.3))
            axes[1, 1].axis('off')

            plt.tight_layout()
            save_and_clear('regional_clusters.png', fig)
            print("✓ Regional classification visualization complete")
        else:
            print("  ⚠ Cluster column not found")
    else:
        print("  ⚠ Regional data not available")

except Exception as e:
    # Notebook-level boundary: report the problem and release any open figure
    print(f"  ✗ Error in regional classification: {str(e)}")
    clear_memory()

## 7. Feature Importance

In [None]:
# Feature importance: horizontal bar chart of the 20 highest-scoring
# features, saved as feature_analysis.png.
print("Creating feature importance visualization...")

try:
    if feature_importance is None:
        print("  ⚠ Feature importance data not available")
    else:
        # Identify the label column and the score column by name; when
        # several columns match, the last one seen is kept.
        feature_col = None
        importance_col = None
        for column_name in feature_importance.columns:
            lowered = column_name.lower()
            if 'feature' in lowered or 'name' in lowered:
                feature_col = column_name
            elif 'importance' in lowered or 'score' in lowered or 'weight' in lowered:
                importance_col = column_name

        if not (feature_col and importance_col):
            print(f"  ⚠ Required columns not found (feature: {feature_col}, importance: {importance_col})")
        else:
            # Rank and keep the 20 largest scores
            top_features = feature_importance.nlargest(20, importance_col)
            scores = top_features[importance_col].values
            positions = range(len(top_features))

            fig, ax = plt.subplots(figsize=(12, 8))

            # One colour per bar, taken evenly from the RdYlGn colormap
            colors = plt.cm.RdYlGn(np.linspace(0.3, 0.9, len(top_features)))

            ax.barh(positions, scores, color=colors, alpha=0.8, edgecolor='black')

            ax.set_yticks(positions)
            ax.set_yticklabels(top_features[feature_col].values)
            ax.set_xlabel('Importance Score', fontsize=12)
            ax.set_title('Top 20 Feature Importance', fontsize=14, fontweight='bold', pad=20)
            ax.invert_yaxis()
            ax.grid(True, alpha=0.3, axis='x')

            # Print each score just past the end of its bar
            for row, score in enumerate(scores):
                ax.text(score, row, f' {score:.4f}', va='center', fontsize=8)

            plt.tight_layout()
            save_and_clear('feature_analysis.png', fig)
            print("✓ Feature importance visualization complete")

except Exception as e:
    print(f"  ✗ Error in feature importance: {str(e)}")
    clear_memory()

## 8. Model Performance

In [None]:
# Model performance: a 2x2 figure (RMSE bars, R² bars, residual histogram,
# text summary) saved as model_performance.png.
print("Creating model performance visualization...")

try:
    if model_comparison is None:
        print("  ⚠ Model comparison data not available")
    else:
        fig, axes = plt.subplots(2, 2, figsize=(12, 8))
        fig.suptitle('Model Performance Comparison', fontsize=14, fontweight='bold')
        rmse_ax, r2_ax = axes[0, 0], axes[0, 1]
        residual_ax, summary_ax = axes[1, 0], axes[1, 1]

        def show_placeholder(ax, message):
            """Replace a panel with a centred note."""
            ax.text(0.5, 0.5, message, ha='center', va='center')
            ax.axis('off')

        def show_metric_bars(ax, metric_col, bar_color, title, y_label):
            """Draw one bar per row of model_comparison for the given metric."""
            positions = range(len(model_comparison))
            ax.bar(positions, model_comparison[metric_col],
                   color=bar_color, alpha=0.7, edgecolor='black')
            ax.set_xticks(positions)
            ax.set_xticklabels(model_comparison[model_col], rotation=45, ha='right')
            ax.set_title(title)
            ax.set_ylabel(y_label)
            ax.grid(True, alpha=0.3, axis='y')

        # Model name column: first name-like column, else the first column
        all_columns = model_comparison.columns
        model_col = next((name for name in all_columns
                          if 'model' in name.lower() or 'name' in name.lower()), None)
        if model_col is None and len(all_columns) > 0:
            model_col = all_columns[0]

        # Panel 1: RMSE comparison
        rmse_cols = [name for name in all_columns if 'rmse' in name.lower()]
        if rmse_cols and model_col:
            rmse_col = rmse_cols[0]
            show_metric_bars(rmse_ax, rmse_col, '#2E86AB',
                             'RMSE Comparison (Lower is Better)', 'RMSE')
        else:
            show_placeholder(rmse_ax, 'RMSE data not available')

        # Panel 2: R² comparison
        r2_cols = [name for name in all_columns
                   if 'r2' in name.lower() or 'r_squared' in name.lower()]
        if r2_cols and model_col:
            r2_col = r2_cols[0]
            show_metric_bars(r2_ax, r2_col, '#06A77D',
                             'R² Score Comparison (Higher is Better)', 'R² Score')
        else:
            show_placeholder(r2_ax, 'R² data not available')

        # Panel 3: residual histogram, when predictions were loaded
        if predictions is None:
            show_placeholder(residual_ax, 'Prediction data not available')
        else:
            actual_col = None
            predicted_col = None
            for name in predictions.columns:
                lowered = name.lower()
                if 'actual' in lowered:
                    actual_col = name
                elif 'predict' in lowered:
                    predicted_col = name

            if not (actual_col and predicted_col):
                show_placeholder(residual_ax, 'Residual data not available')
            else:
                residuals = (predictions[actual_col] - predictions[predicted_col]).dropna()
                residual_ax.hist(residuals, bins=50, color='#A23B72', alpha=0.7, edgecolor='black')
                residual_ax.axvline(x=0, color='red', linestyle='--', linewidth=2)
                residual_ax.set_title('Residual Distribution')
                residual_ax.set_xlabel('Residual (Actual - Predicted)')
                residual_ax.set_ylabel('Frequency')
                residual_ax.grid(True, alpha=0.3, axis='y')

        # Panel 4: text summary naming the best model per metric
        if not model_col:
            show_placeholder(summary_ax, 'Model summary not available')
        else:
            pieces = ["Model Performance Summary\n\n",
                      f"Models Compared: {len(model_comparison)}\n\n"]

            if rmse_cols:
                best_model_idx = model_comparison[rmse_cols[0]].idxmin()
                pieces.append(f"Best Model (RMSE):\n")
                pieces.append(f"  {model_comparison.loc[best_model_idx, model_col]}\n")
                pieces.append(f"  RMSE: {model_comparison.loc[best_model_idx, rmse_cols[0]]:.4f}\n\n")

            if r2_cols:
                best_model_idx = model_comparison[r2_cols[0]].idxmax()
                pieces.append(f"Best Model (R²):\n")
                pieces.append(f"  {model_comparison.loc[best_model_idx, model_col]}\n")
                pieces.append(f"  R²: {model_comparison.loc[best_model_idx, r2_cols[0]]:.4f}")

            summary_text = "".join(pieces)
            summary_ax.text(0.1, 0.5, summary_text, fontsize=10,
                            verticalalignment='center', family='monospace',
                            bbox=dict(boxstyle='round', facecolor='lightgreen', alpha=0.3))
            summary_ax.axis('off')

        plt.tight_layout()
        save_and_clear('model_performance.png', fig)
        print("✓ Model performance visualization complete")

except Exception as e:
    print(f"  ✗ Error in model performance: {str(e)}")
    clear_memory()

## 9. Executive Summary

In [None]:
# Executive summary: one infographic-style page of headline metric cards,
# saved as executive_summary.png. Cards are added only for data that loaded.
print("Creating executive summary visualization...")

try:
    fig, ax = plt.subplots(figsize=(12, 8))
    ax.axis('off')

    # Create infographic-style summary
    fig.suptitle('UIDAI Hackathon PS-1: Executive Summary',
                fontsize=16, fontweight='bold', y=0.95)

    # Calculate key metrics
    metrics = []

    # Metric 1: Total data points
    metrics.append({
        'title': 'Total Records',
        'value': f"{len(feature_matrix):,}",
        'subtitle': 'Data Points Analyzed'
    })

    # Metric 2: Geographic coverage (first column whose name contains 'state')
    state_col = next((col for col in feature_matrix.columns if 'state' in col.lower()), None)
    if state_col:
        metrics.append({
            'title': 'States Covered',
            'value': f"{feature_matrix[state_col].nunique()}",
            'subtitle': 'Geographic Regions'
        })

    # Metric 3: Model performance.
    # Column keywords match the demand-forecast cell ('actual'/'true' and
    # 'predict'/'forecast') so both cells agree on which columns hold the
    # values; only numeric columns qualify.
    if predictions is not None:
        numeric_cols = [col for col in predictions.columns
                        if pd.api.types.is_numeric_dtype(predictions[col])]
        actual_col = next((col for col in numeric_cols
                           if 'actual' in col.lower() or 'true' in col.lower()), None)
        predicted_col = next((col for col in numeric_cols
                              if col != actual_col
                              and ('predict' in col.lower() or 'forecast' in col.lower())), None)

        if actual_col and predicted_col:
            pred_data = predictions[[actual_col, predicted_col]].dropna()
            if not pred_data.empty:  # otherwise the card would just read "nan"
                rmse = np.sqrt(np.mean((pred_data[actual_col] - pred_data[predicted_col])**2))
                metrics.append({
                    'title': 'Model RMSE',
                    'value': f"{rmse:.2f}",
                    'subtitle': 'Prediction Error'
                })

    # Metric 4: Features analyzed
    if feature_importance is not None:
        metrics.append({
            'title': 'Features',
            'value': f"{len(feature_importance)}",
            'subtitle': 'Variables Analyzed'
        })

    # Metric 5: Regional clusters (same keywords as the regional
    # classification cell, so the card appears whenever that figure does)
    if regional_data is not None:
        cluster_col = next((col for col in regional_data.columns
                            if 'cluster' in col.lower()
                            or 'class' in col.lower()
                            or 'category' in col.lower()), None)
        if cluster_col:
            metrics.append({
                'title': 'Clusters',
                'value': f"{regional_data[cluster_col].nunique()}",
                'subtitle': 'Regional Groups'
            })

    # Metric 6: Models compared
    if model_comparison is not None:
        metrics.append({
            'title': 'Models Tested',
            'value': f"{len(model_comparison)}",
            'subtitle': 'ML Algorithms'
        })

    # Layout metrics in a grid, three cards per row
    cols = 3
    colors = ['#2E86AB', '#A23B72', '#F18F01', '#06A77D', '#C73E1D', '#6A4C93']

    for idx, metric in enumerate(metrics):
        grid_row, grid_col = divmod(idx, cols)
        card_color = colors[idx % len(colors)]

        # Card centre in figure coordinates
        x = 0.15 + grid_col * 0.30
        y = 0.75 - grid_row * 0.35

        # Draw card background. add_artist() registers the patch with the
        # figure; appending to fig.patches directly skips that step.
        rect = plt.Rectangle((x-0.12, y-0.12), 0.24, 0.24,
                            facecolor=card_color,
                            alpha=0.15, transform=fig.transFigure)
        fig.add_artist(rect)

        # Add text
        fig.text(x, y+0.05, metric['title'],
                fontsize=11, ha='center', weight='bold',
                transform=fig.transFigure)
        fig.text(x, y, metric['value'],
                fontsize=24, ha='center', weight='bold',
                color=card_color,
                transform=fig.transFigure)
        fig.text(x, y-0.06, metric['subtitle'],
                fontsize=9, ha='center', style='italic',
                transform=fig.transFigure)

    # Add footer
    fig.text(0.5, 0.05, 'Predictive Analysis of Aadhaar Update Demand',
            fontsize=10, ha='center', style='italic',
            transform=fig.transFigure)

    save_and_clear('executive_summary.png', fig)
    print("✓ Executive summary visualization complete")

except Exception as e:
    # Notebook-level boundary: report the problem and release any open figure
    print(f"  ✗ Error in executive summary: {str(e)}")
    clear_memory()

## 10. Final Summary

In [None]:
# Report every PNG found in the output directory, with its size
banner = "=" * 60
print("\n" + banner)
print("VISUALIZATION COMPLETE")
print(banner)

figure_files = list(output_dir.glob('*.png'))
if not figure_files:
    print("\n⚠ No figures were generated")
else:
    print(f"\n✓ Generated {len(figure_files)} visualizations:")
    for fig_file in sorted(figure_files):
        size_kb = fig_file.stat().st_size / 1024  # bytes -> KB
        print(f"  • {fig_file.name} ({size_kb:.1f} KB)")
    print(f"\nAll figures saved to: {output_dir}")

# Close any remaining figures and collect garbage one last time
clear_memory()
print("\n✓ Memory cleaned")
print("\n" + banner)