# In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# --- 1. Data Loading & Merging ---
print("Loading datasets...")
# The enrolment data arrives as three split CSV files; paths are resolved
# relative to the current working directory.
file1, file2, file3 = (
    'api_data_aadhar_enrolment_0_500000.csv',
    'api_data_aadhar_enrolment_500000_1000000.csv',
    'api_data_aadhar_enrolment_1000000_1006029.csv',
)

# Read every part into its own frame, in file order.
df1, df2, df3 = (pd.read_csv(part) for part in (file1, file2, file3))

# Stack the parts into one master DataFrame; ignore_index gives the result a
# fresh 0..n-1 index instead of repeating each part's own row numbers.
df = pd.concat([df1, df2, df3], ignore_index=True)
print(f"Data Merged. Total Records: {len(df)}")

# --- 2. Data Cleaning & Feature Engineering ---
# Convert date to a datetime column for time-series analysis. Values that do
# not match DD-MM-YYYY are coerced to NaT instead of raising.
raw_dates = df['date']
df['date'] = pd.to_datetime(raw_dates, format='%d-%m-%Y', errors='coerce')

# Coercion is silent, so report how many dates were present but unparseable.
# pandas leaves NaT keys out of date-based grouping, so such rows would
# otherwise disappear from time-series aggregates without any signal.
unparsed_dates = int((df['date'].isna() & raw_dates.notna()).sum())
if unparsed_dates:
    print(
        f"Warning: {unparsed_dates} date value(s) did not match DD-MM-YYYY "
        "and were set to NaT."
    )

# Create a 'Total Operations' column (sum of all age groups).
df['total_ops'] = df['age_0_5'] + df['age_5_17'] + df['age_18_greater']

# --- 3. Exploratory Data Analysis (EDA) ---

# Insight 1: Which States have the highest workload?
# Sum total_ops per state, rank descending, and keep the first ten.
ops_by_state = df.groupby('state')['total_ops'].sum()
state_ops = ops_by_state.sort_values(ascending=False).head(10)

# Horizontal bar chart: totals on the x axis, state names on the y axis.
state_fig, state_ax = plt.subplots(figsize=(12, 6))
sns.barplot(
    x=state_ops.values,
    y=state_ops.index,
    palette='viridis',
    ax=state_ax,
)
state_ax.set_title('Top 10 States by Aadhaar Operations Volume')
state_ax.set_xlabel('Total Operations')
state_ax.set_ylabel('State')
state_fig.tight_layout()
# Write the image to disk before show(), for use in the report.
state_fig.savefig('insight_1_state_volume.png')
plt.show()

# Insight 2: Demographics - Who is updating the most?
# Column totals for the three age buckets, paired with display labels and
# slice colours in the same order.
age_columns = ['age_0_5', 'age_5_17', 'age_18_greater']
age_labels = ['Infants (0-5)', 'School Age (5-17)', 'Adults (18+)']
age_colors = ['#ff9999', '#66b3ff', '#99ff99']
age_sums = df[age_columns].sum()

# Pie chart of each bucket's share, annotated with one-decimal percentages.
demo_fig, demo_ax = plt.subplots(figsize=(8, 8))
demo_ax.pie(age_sums, labels=age_labels, autopct='%1.1f%%', colors=age_colors)
demo_ax.set_title('Demographic Distribution of Updates')
demo_fig.savefig('insight_2_demographics.png')
plt.show()

# Insight 3: Time Series Trend (identifying peaks)
# Sum total_ops into calendar-month bins labelled by month end. An explicit
# MonthEnd offset replaces the 'M' string alias: pandas 2.2 deprecated 'M' in
# favour of 'ME', while older pandas rejects 'ME', so the offset object is the
# form that behaves the same on both sides of that change.
monthly_trend = df.resample(pd.offsets.MonthEnd(), on='date')['total_ops'].sum()
plt.figure(figsize=(12, 6))
plt.plot(monthly_trend.index, monthly_trend.values, marker='o', linestyle='-', color='b')
plt.title('Monthly Operation Trends (Peak Detection)')
plt.grid(True)
# Write the image to disk before show().
plt.savefig('insight_3_timeline.png')
plt.show()

# --- 4. The "Model" (Resource Optimization Recommendation) ---
# Heuristic: rank (state, district) pairs by their summed 0-5 enrolment and
# surface the five largest as candidates for extra child enrolment centres.
# NOTE: no centre-count data is consulted here; that these districts have few
# centres is an assumption of the proposal, not something this code checks.
print("\n--- RESOURCE ALLOCATION INSIGHTS ---")
infant_totals = df.groupby(['state', 'district'])['age_0_5'].sum()
child_heavy_districts = infant_totals.sort_values(ascending=False).head(5)
print("Recommendation: Prioritize additional Child Enrollment Centers in these districts:")
print(child_heavy_districts)

# Optional: persist the enriched frame (parsed dates plus total_ops) for a
# dashboard, without writing the row index.
df.to_csv('processed_master_data.csv', index=False)