# Purpose

## (Make sure to run all the state_wise cleaning files before this)
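Prepare analysis-ready datasets from cleaned Aadhaar data.

- **Input:** `data/processed/cleaned/` (`enrolment_clean.csv`, `demographic_clean.csv`, `biometric_clean.csv`)
- **Output:** `data/processed/analysis/` (`enrolment_analysis.csv`, `updates_analysis.csv`)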


In [1]:
import pandas as pd
from pathlib import Path

# Directory layout, expressed relative to the notebook's working directory.
BASE_PATH = Path("..", "data", "processed")
CLEAN_PATH = BASE_PATH.joinpath("cleaned")      # read from by the cells below
ANALYSIS_PATH = BASE_PATH.joinpath("analysis")  # written to by the cells below

# Create the output directory (and any missing parents) up front; a no-op if it already exists.
ANALYSIS_PATH.mkdir(parents=True, exist_ok=True)


In [2]:
# Enrolment → enrolment_analysis.csv

# Load the cleaned enrolment table and derive the unified age buckets:
#   child_count      = age_0_5
#   non_child_count  = age_5_17 + age_17_plus
#   total_enrolment  = child_count + non_child_count
# The derived columns stay on `enrol` so later cells can inspect them.
enrol = pd.read_csv(CLEAN_PATH / "enrolment_clean.csv").assign(
    child_count=lambda df: df["age_0_5"],
    non_child_count=lambda df: df["age_5_17"] + df["age_17_plus"],
    total_enrolment=lambda df: df["child_count"] + df["non_child_count"],
)

# Only the date/geography keys and the derived counts go into the analysis file.
export_cols = [
    "date", "state", "district", "pincode",
    "child_count", "non_child_count", "total_enrolment",
]
enrolment_analysis = enrol[export_cols]

enrolment_analysis.to_csv(ANALYSIS_PATH / "enrolment_analysis.csv", index=False)

print("✅ enrolment_analysis.csv created")


✅ enrolment_analysis.csv created


In [3]:
# Sanity check: list every column now present on `enrol`.
print(list(enrol.columns))


['date', 'state', 'district', 'pincode', 'age_0_5', 'age_5_17', 'age_17_plus', 'state_clean', 'district_clean', 'child_count', 'non_child_count', 'total_enrolment']


In [4]:
# ================================
# Updates Analysis Dataset Creation
# ================================
# Stacks the cleaned demographic and biometric update tables into one
# long-format frame (one `update_type` label per row), adds a per-row
# `total_updates` count, and writes the result to updates_analysis.csv.

# Load cleaned datasets
demo = pd.read_csv(CLEAN_PATH / "demographic_clean.csv")
bio = pd.read_csv(CLEAN_PATH / "biometric_clean.csv")

# Explicitly set update_type (overwrite if exists)
demo["update_type"] = "demographic"
bio["update_type"] = "biometric"

# Merge both updates
updates = pd.concat([demo, bio], ignore_index=True)

# ----------------
# Normalize geography fields
# ----------------
# Cast to pandas' nullable "string" dtype instead of `str`: astype(str) turns a
# missing value into the literal text "nan", which .str.title() then rewrites to
# "Nan" and exports as if it were a real state/district/pincode. With the
# "string" dtype missing values stay missing through strip/title and are written
# as empty fields.
# NOTE(review): if `pincode` was parsed as float (e.g. because of missing
# values) it is rendered like "110001.0" — confirm the cleaned inputs keep it integral.
for col in ["state", "district", "pincode"]:
    updates[col] = (
        updates[col]
        .astype("string")
        .str.strip()
        .str.title()
    )

# ----------------
# Compute total updates
# (no 0–5 age group exists for updates)
# ----------------
# Every column whose name contains "_age_" is treated as an age-bucket count.
# sum(axis=1) skips NaN by default, so a row is totalled over whichever of
# those columns it actually has values for.
age_counts = updates.filter(like="_age_")
if age_counts.shape[1] == 0:
    # Without this guard an upstream column rename would silently yield
    # total_updates == 0 for every row.
    raise ValueError(
        "No '_age_' columns found in the demographic/biometric data; "
        "cannot compute total_updates"
    )
updates["total_updates"] = age_counts.sum(axis=1)

# ----------------
# Final analysis dataframe
# ----------------
updates_analysis = updates[
    [
        "date",
        "state",
        "district",
        "pincode",
        "update_type",
        "total_updates"
    ]
]

# ----------------
# Save final analysis file
# ----------------
updates_analysis.to_csv(
    ANALYSIS_PATH / "updates_analysis.csv",
    index=False
)

print("✅ updates_analysis.csv created with BOTH demographic and biometric updates")
print(updates_analysis["update_type"].value_counts())


✅ updates_analysis.csv created with BOTH demographic and biometric updates
update_type
biometric      1766159
demographic    1598010
Name: count, dtype: int64
