In [1]:
import pandas as pd
import numpy as np
import os

In [2]:
RAW_DATA_PATH = "../Data/Raw"
PROCESSED_DATA_PATH = "../Data/Processed"

ENROLMENT_FILE = "enrollment_data.csv"
DEMOGRAPHIC_FILE = "demographic_update_data.csv"

In [3]:
enrolment_df = pd.read_csv(os.path.join(RAW_DATA_PATH, ENROLMENT_FILE))
demo_df = pd.read_csv(os.path.join(RAW_DATA_PATH, DEMOGRAPHIC_FILE))

print("Enrolment Data Shape:", enrolment_df.shape)
print("Demographic Data Shape:", demo_df.shape)

enrolment_df.head(), demo_df.head()

Enrolment Data Shape: (1006029, 7)
Demographic Data Shape: (1389150, 6)


(         date          state          district  pincode  age_0_5  age_5_17  \
 0  02-03-2025      Meghalaya  East Khasi Hills   793121       11        61   
 1  09-03-2025      Karnataka   Bengaluru Urban   560043       14        33   
 2  09-03-2025  Uttar Pradesh      Kanpur Nagar   208001       29        82   
 3  09-03-2025  Uttar Pradesh           Aligarh   202133       62        29   
 4  09-03-2025      Karnataka   Bengaluru Urban   560016       14        16   
 
    age_18_greater  
 0              37  
 1              39  
 2              12  
 3              15  
 4              21  ,
          date           state    district  pincode  demo_age_5_17  \
 0  01-03-2025   Uttar Pradesh   Gorakhpur   273213           49.0   
 1  01-03-2025  Andhra Pradesh    Chittoor   517132           22.0   
 2  01-03-2025         Gujarat      Rajkot   360006           65.0   
 3  01-03-2025  Andhra Pradesh  Srikakulam   532484           24.0   
 4  01-03-2025       Rajasthan     Udaipur   31

In [4]:
enrolment_df['date'] = pd.to_datetime(
    enrolment_df['date'],
    dayfirst=True,
    errors='coerce'
)

demo_df['date'] = pd.to_datetime(
    demo_df['date'],
    dayfirst=True,
    errors='coerce'
)

print("Invalid enrolment dates:", enrolment_df['date'].isna().sum())
print("Invalid demographic dates:", demo_df['date'].isna().sum())


Invalid enrolment dates: 0
Invalid demographic dates: 0


In [5]:
enrolment_df['month'] = enrolment_df['date'].dt.to_period('M')
demo_df['month'] = demo_df['date'].dt.to_period('M')

In [6]:
enrolment_df[['age_0_5', 'age_5_17', 'age_18_greater']] = (
    enrolment_df[['age_0_5', 'age_5_17', 'age_18_greater']].fillna(0)
)

demo_df[['demo_age_5_17', 'demo_age_17_']] = (
    demo_df[['demo_age_5_17', 'demo_age_17_']].fillna(0)
)

In [7]:
enrolment_df['total_enrolment'] = (
    enrolment_df['age_0_5'] +
    enrolment_df['age_5_17'] +
    enrolment_df['age_18_greater']
)

In [8]:
enrolment_monthly = enrolment_df.groupby(
    ['month', 'state', 'district'],
    as_index=False
).agg({
    'age_0_5': 'sum',
    'age_5_17': 'sum',
    'age_18_greater': 'sum',
    'total_enrolment': 'sum'
})

enrolment_monthly.head()

Unnamed: 0,month,state,district,age_0_5,age_5_17,age_18_greater,total_enrolment
0,2025-03,Andhra Pradesh,Spsr Nellore,43,44,29,116
1,2025-03,Assam,Baksa,11,14,13,38
2,2025-03,Assam,Barpeta,24,34,10,68
3,2025-03,Assam,Bongaigaon,25,33,31,89
4,2025-03,Assam,Chirang,12,29,15,56


In [9]:
demo_monthly = demo_df.groupby(
    ['month', 'state', 'district'],
    as_index=False
).agg({
    'demo_age_5_17': 'sum',
    'demo_age_17_': 'sum'
})

demo_monthly['total_demo_updates'] = (
    demo_monthly['demo_age_5_17'] +
    demo_monthly['demo_age_17_']
)

demo_monthly.head()

Unnamed: 0,month,state,district,demo_age_5_17,demo_age_17_,total_demo_updates
0,2025-03,Andaman and Nicobar Islands,Nicobar,16.0,180.0,196.0
1,2025-03,Andaman and Nicobar Islands,North And Middle Andaman,20.0,402.0,422.0
2,2025-03,Andaman and Nicobar Islands,South Andaman,48.0,279.0,327.0
3,2025-03,Andhra Pradesh,Adilabad,261.0,2034.0,2295.0
4,2025-03,Andhra Pradesh,Alluri Sitharama Raju,334.0,2743.0,3077.0


In [10]:
os.makedirs(PROCESSED_DATA_PATH, exist_ok=True)

enrolment_monthly.to_csv(
    os.path.join(PROCESSED_DATA_PATH, "enrollment_monthly.csv"),
    index=False
)

demo_monthly.to_csv(
    os.path.join(PROCESSED_DATA_PATH, "demographic_monthly.csv"),
    index=False
)

print("Cleaned datasets saved successfully!")

Cleaned datasets saved successfully!
