# Aadhaar Data Exploratory Data Analysis (EDA)

This notebook analyzes the aggregated Aadhaar enrolment data to uncover insights about:
1. Age Distribution (0-5, 5-17, 18+)
2. Geographic Trends (State/District wise)
3. Enrolment correlations

In [None]:
import warnings

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

# Notebook-wide plot styling: seaborn "whitegrid" theme and a 12x6 default figure.
sns.set_theme(style="whitegrid")
plt.rcParams.update({'figure.figsize': (12, 6)})

## 1. Data Loading

In [None]:
# Load the processed dataset.
# Fail fast when the file is missing: merely printing a message would leave
# `df` undefined (or stale from a previous run), and every later cell would
# then break with a confusing, unrelated error.
try:
    df = pd.read_csv('../processed_records.csv')
except FileNotFoundError as exc:
    raise FileNotFoundError(
        "Error: '../processed_records.csv' not found. "
        "Please run the backend data processor first."
    ) from exc

# Every column referenced by the analysis cells below must be present.
required_columns = ['state', 'district', 'age_0_5', 'age_5_17', 'age_18_greater']
missing_columns = [name for name in required_columns if name not in df.columns]
if missing_columns:
    raise ValueError(
        f"'../processed_records.csv' is missing expected columns: {missing_columns}"
    )

print(f"Data Loaded Successfully! Shape: {df.shape}")

In [None]:
# Show the first five records as a quick sanity check of the parsed columns.
df.head(n=5)

In [None]:
# Data Info
# Prints a concise summary of the frame (columns, dtypes, non-null counts,
# memory usage) so missing values and wrongly typed columns are spotted early.
df.info()

### Detailed Column Statistics
Summary statistics (count, mean, standard deviation, min, quartiles, max) for the three age-group columns.

In [None]:
# Descriptive statistics for the three age buckets, one row per bucket
# (transposed so the statistics read across as columns).
age_columns = ['age_0_5', 'age_5_17', 'age_18_greater']
df.loc[:, age_columns].describe().transpose()

## 2. Univariate Analysis: Age Distribution
We will visualize the distribution of enrolment counts across the three age buckets.

In [None]:
# Silence FutureWarning notices so they do not clutter the figure output.
# NOTE: this filter is process-wide and stays active for every later cell
# executed in the same kernel.
warnings.simplefilter(action='ignore', category=FutureWarning)

# One (column, panel title, colour) entry per age bucket, left to right.
age_panels = [
    ('age_0_5', 'Distribution: 0-5 Years', 'skyblue'),
    ('age_5_17', 'Distribution: 5-17 Years', 'orange'),
    ('age_18_greater', 'Distribution: 18+ Years', 'green'),
]

fig, axes = plt.subplots(1, len(age_panels), figsize=(18, 5))

# Draw a 30-bin histogram with a KDE overlay for each bucket on its own axis.
for ax, (column, title, color) in zip(axes, age_panels):
    sns.histplot(df[column], bins=30, kde=True, ax=ax, color=color)
    ax.set_title(title)
    ax.set_xlabel('Count')

plt.tight_layout()
plt.show()

## 3. Geographic Analysis: State-wise Enrolment

In [None]:
# Sum each age bucket per state, add a combined total, and keep the 15 states
# with the largest totals (sorted descending).
state_groups = (
    df.groupby('state')[['age_0_5', 'age_5_17', 'age_18_greater']]
    .sum()
    .reset_index()
    .assign(total=lambda t: t['age_0_5'] + t['age_5_17'] + t['age_18_greater'])
    .sort_values(by='total', ascending=False)
    .head(15)
)

# Long format (one row per state / age bucket) so seaborn can group bars by hue.
state_melted = pd.melt(
    state_groups,
    id_vars='state',
    value_vars=['age_0_5', 'age_5_17', 'age_18_greater'],
    var_name='Age Group',
    value_name='Count',
)

state_fig, state_ax = plt.subplots(figsize=(15, 8))
sns.barplot(data=state_melted, x='state', y='Count', hue='Age Group', ax=state_ax)
state_ax.set_title('Top 15 States by Enrolment Count (Age Breakdown)')
# Slanted, right-aligned labels keep long state names from overlapping.
plt.xticks(rotation=45, ha='right')
plt.show()

## 4. Correlation Analysis

In [None]:
# Pairwise correlation between the three age-bucket columns,
# rendered as an annotated heatmap (two decimals per cell).
corr = df.loc[:, ['age_0_5', 'age_5_17', 'age_18_greater']].corr()

corr_fig, corr_ax = plt.subplots(figsize=(8, 6))
sns.heatmap(corr, annot=True, cmap='coolwarm', fmt=".2f", ax=corr_ax)
corr_ax.set_title('Correlation Heatmap: Age Groups')
plt.show()

## 5. Advanced Analysis
### 5.1 Violin Plot: Age Group Density Comparison

In [None]:
# Reshape wide -> long: one row per (record, age bucket) so the three buckets
# can be drawn side by side on a shared axis.
df_melt = df.melt(value_vars=['age_0_5', 'age_5_17', 'age_18_greater'], var_name='Age Group', value_name='Population')

plt.figure(figsize=(10, 6))
# `palette` without `hue` is deprecated in seaborn 0.13; mapping the x variable
# to `hue` and hiding the redundant legend gives the same per-bucket colours.
sns.violinplot(
    x='Age Group',
    y='Population',
    hue='Age Group',
    data=df_melt,
    palette='muted',
    legend=False,
)
plt.title('Violin Plot: Population Density by Age Group')
# Log scale to handle large disparities between records.
# NOTE(review): zero counts cannot be drawn on a log axis - confirm whether
# rows with 0 need special handling.
plt.yscale('log')
plt.show()

### 5.2 Top 10 Districts by Adult Population (18+)

In [None]:
# Aggregate adult (18+) counts per district. District names are not unique
# across states, so the grouping key is (state, district); grouping on the
# district name alone would silently merge same-named districts into one bar.
district_group = df.groupby(['state', 'district'])[['age_18_greater']].sum().reset_index()
top_districts = (
    district_group.sort_values(by='age_18_greater', ascending=False).head(10).copy()
)
# Unambiguous axis label, e.g. "District (State)".
top_districts['district_label'] = (
    top_districts['district'].astype(str) + ' (' + top_districts['state'].astype(str) + ')'
)

plt.figure(figsize=(12, 6))
# `palette` without `hue` is deprecated in seaborn 0.13; map the x variable to
# `hue` and hide the redundant legend to keep one colour per bar.
sns.barplot(
    data=top_districts,
    x='district_label',
    y='age_18_greater',
    hue='district_label',
    palette='Blues_d',
    legend=False,
)
plt.title('Top 10 Districts by Adult Population (18+)')
plt.xlabel('District (State)')
# Right-align the rotated labels so each one ends under its own bar
# (same convention as the state-wise chart).
plt.xticks(rotation=45, ha='right')
plt.show()

### 5.3 Pair Plot: Variable Relationships

In [None]:
# Scatter-matrix of the three age buckets; the diagonal shows each bucket's
# KDE instead of a histogram.
pair_columns = ['age_0_5', 'age_5_17', 'age_18_greater']
sns.pairplot(data=df.loc[:, pair_columns], diag_kind='kde')
# y slightly above 1 keeps the title clear of the top row of panels.
plt.suptitle('Pair Plot of Age Groups', y=1.02)
plt.show()