# Purpose

**Prerequisite:** run all of the state_wise cleaning files before running this notebook.

Prepare analysis-ready datasets from the cleaned Aadhaar data.

- Input: `data/processed/cleaned/`
- Output: `data/processed/analysis/`


In [16]:
import pandas as pd
from pathlib import Path

# Directory layout, expressed relative to the current working directory:
# cleaned inputs are read from CLEAN_PATH, analysis outputs are written to ANALYSIS_PATH.
BASE_PATH = Path("..", "data", "processed")
CLEAN_PATH = BASE_PATH.joinpath("cleaned")
ANALYSIS_PATH = BASE_PATH.joinpath("analysis")

# Create the output directory (and any missing parents); no error if it already exists.
ANALYSIS_PATH.mkdir(exist_ok=True, parents=True)


In [17]:
# Enrolment → enrolment_analysis.csv
# Loads the cleaned enrolment table, adds a per-row total over the three
# original age groups, and exports the analysis-ready columns.
# (pandas is already imported in the setup cell at the top of the notebook.)

enrol = pd.read_csv(CLEAN_PATH.joinpath("enrolment_clean.csv"))

# Row-wise total over ONLY the three original age groups.
enrol["total_enrolment"] = (
    enrol["age_0_5"].add(enrol["age_5_17"]).add(enrol["age_17_plus"])
)

# Columns kept in the final analysis table, in export order.
enrolment_columns = [
    "date",
    "state",
    "district",
    "pincode",
    "age_0_5",
    "age_5_17",
    "age_17_plus",
    "total_enrolment",
]
enrolment_analysis = enrol.loc[:, enrolment_columns]

# Export without the DataFrame index.
enrolment_output_file = ANALYSIS_PATH / "enrolment_analysis.csv"
enrolment_analysis.to_csv(enrolment_output_file, index=False)

print("✅ enrolment_analysis.csv created with 3 age groups (0–5, 5–17, 17+)")



✅ enrolment_analysis.csv created with 3 age groups (0–5, 5–17, 17+)


In [18]:
# Sanity check: list the columns currently present on the enrolment frame.
print(list(enrol.columns))


['date', 'state', 'district', 'pincode', 'age_0_5', 'age_5_17', 'age_17_plus', 'total_enrolment']


In [19]:
# Updates → updates_analysis.csv
# Stacks the cleaned demographic and biometric update tables into one long
# table tagged by update_type, normalizes the geography fields, derives
# per-row totals, and exports the analysis-ready columns.
# (pandas is already imported in the setup cell at the top of the notebook.)

# Load cleaned datasets
demo = pd.read_csv(CLEAN_PATH / "demographic_clean.csv")
bio = pd.read_csv(CLEAN_PATH / "biometric_clean.csv")

# Fail fast if either input lacks a column used below. pd.concat would
# otherwise align on the union of columns and silently fill the gaps with NaN,
# which would surface only as NaN counts/totals in the exported file.
required_update_columns = [
    "date",
    "state",
    "district",
    "pincode",
    "age_5_17",
    "age_17_plus",
]
for source_name, source_frame in (
    ("demographic_clean.csv", demo),
    ("biometric_clean.csv", bio),
):
    missing_columns = [
        name for name in required_update_columns
        if name not in source_frame.columns
    ]
    if missing_columns:
        raise KeyError(
            f"{source_name} is missing required columns: {missing_columns}"
        )

# Explicitly set update_type
demo["update_type"] = "demographic"
bio["update_type"] = "biometric"

# Merge both updates
updates = pd.concat([demo, bio], ignore_index=True)

# ----------------
# Normalize geography fields
# ----------------
# Cast to the nullable "string" dtype rather than str: astype(str) would turn
# missing values into the literal text "nan" (then "Nan" after title-casing),
# hiding nulls behind a fake place name. With "string", missing values stay
# <NA> through strip/title and are written to the CSV as empty fields.
# Title-casing leaves all-digit pincodes unchanged.
for col in ["state", "district", "pincode"]:
    updates[col] = (
        updates[col]
        .astype("string")
        .str.strip()
        .str.title()
    )

# ----------------
# Explicit age-group counts (NO assumptions)
# ----------------
updates["age_5_17_count"] = updates["age_5_17"]
updates["age_17_plus_count"] = updates["age_17_plus"]

# Total updates
updates["total_updates"] = (
    updates["age_5_17_count"]
    + updates["age_17_plus_count"]
)

# ----------------
# Final analysis dataframe
# ----------------
updates_analysis = updates[
    [
        "date",
        "state",
        "district",
        "pincode",
        "update_type",
        "age_5_17_count",
        "age_17_plus_count",
        "total_updates",
    ]
]

# ----------------
# Save final analysis file
# ----------------
updates_analysis.to_csv(
    ANALYSIS_PATH / "updates_analysis.csv",
    index=False
)

print("✅ updates_analysis.csv created with age-group counts (5–17, 17+)")
print(updates_analysis["update_type"].value_counts())



✅ updates_analysis.csv created with age-group counts (5–17, 17+)
update_type
biometric      1766159
demographic    1598010
Name: count, dtype: int64
