# UIDAI Data Hackathon 2026

## Project Title
Identifying Inequality, Data Quality Issues, and Anomalous Patterns in Aadhaar Enrolment and Updates

## Notebook 01
Data Cleaning & Preprocessing


In [3]:
# Basic libraries
import pandas as pd
import numpy as np

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Settings
# Lift the cap on how many columns pandas shows when displaying a DataFrame
pd.options.display.max_columns = None
# Use the "seaborn-v0_8" matplotlib style sheet for every figure
plt.style.use("seaborn-v0_8")


In [4]:
# File handling libraries
import glob
import os


In [5]:
# -------- Enrolment Data --------
# Read every CSV matching the pattern below and stack them into one frame.
enrolment_path = "../data/enrolment.csv/*.csv"
# glob.glob returns matches in arbitrary (filesystem-dependent) order; sorting
# keeps the concatenated row order identical across runs and machines.
enrolment_files = sorted(glob.glob(enrolment_path))

# Fail with a clear message instead of pandas' "No objects to concatenate".
if not enrolment_files:
    raise FileNotFoundError(
        f"No enrolment CSV files found matching {enrolment_path!r}"
    )

# ignore_index=True discards each file's own row index and renumbers 0..n-1.
enrol_df = pd.concat(
    (pd.read_csv(file) for file in enrolment_files),
    ignore_index=True
)

print("Enrolment Data Loaded:", enrol_df.shape)
enrol_df.head()


Enrolment Data Loaded: (1006029, 7)


Unnamed: 0,date,state,district,pincode,age_0_5,age_5_17,age_18_greater
0,02-03-2025,Meghalaya,East Khasi Hills,793121,11,61,37
1,09-03-2025,Karnataka,Bengaluru Urban,560043,14,33,39
2,09-03-2025,Uttar Pradesh,Kanpur Nagar,208001,29,82,12
3,09-03-2025,Uttar Pradesh,Aligarh,202133,62,29,15
4,09-03-2025,Karnataka,Bengaluru Urban,560016,14,16,21


In [6]:
# -------- Demographic Update Data --------
# Read every CSV matching the pattern below and stack them into one frame.
demo_path = "../data/demographic_updates.csv/*.csv"
# glob.glob returns matches in arbitrary (filesystem-dependent) order; sorting
# keeps the concatenated row order identical across runs and machines.
demo_files = sorted(glob.glob(demo_path))

# Fail with a clear message instead of pandas' "No objects to concatenate".
if not demo_files:
    raise FileNotFoundError(
        f"No demographic update CSV files found matching {demo_path!r}"
    )

# ignore_index=True discards each file's own row index and renumbers 0..n-1.
demo_df = pd.concat(
    (pd.read_csv(file) for file in demo_files),
    ignore_index=True
)

print("Demographic Update Data Loaded:", demo_df.shape)
demo_df.head()



Demographic Update Data Loaded: (2071700, 6)


Unnamed: 0,date,state,district,pincode,demo_age_5_17,demo_age_17_
0,01-03-2025,Uttar Pradesh,Gorakhpur,273213,49,529
1,01-03-2025,Andhra Pradesh,Chittoor,517132,22,375
2,01-03-2025,Gujarat,Rajkot,360006,65,765
3,01-03-2025,Andhra Pradesh,Srikakulam,532484,24,314
4,01-03-2025,Rajasthan,Udaipur,313801,45,785


In [7]:
# -------- Biometric Update Data --------
# Read every CSV matching the pattern below and stack them into one frame.
bio_path = "../data/biometric_updates.csv/api_data_aadhar_biometric/*.csv"
# glob.glob returns matches in arbitrary (filesystem-dependent) order; sorting
# keeps the concatenated row order identical across runs and machines.
bio_files = sorted(glob.glob(bio_path))

# Fail with a clear message instead of pandas' "No objects to concatenate".
if not bio_files:
    raise FileNotFoundError(
        f"No biometric update CSV files found matching {bio_path!r}"
    )

# ignore_index=True discards each file's own row index and renumbers 0..n-1.
bio_df = pd.concat(
    (pd.read_csv(file) for file in bio_files),
    ignore_index=True
)

print("Biometric Update Data Loaded:", bio_df.shape)
bio_df.head()


Biometric Update Data Loaded: (1861108, 6)


Unnamed: 0,date,state,district,pincode,bio_age_5_17,bio_age_17_
0,01-03-2025,Haryana,Mahendragarh,123029,280,577
1,01-03-2025,Bihar,Madhepura,852121,144,369
2,01-03-2025,Jammu and Kashmir,Punch,185101,643,1091
3,01-03-2025,Bihar,Bhojpur,802158,256,980
4,01-03-2025,Tamil Nadu,Madurai,625514,271,815


In [8]:
# List the column names of each loaded frame, one labelled line per dataset
column_report = [
    ("Enrolment Columns:\n", enrol_df),
    ("\nDemographic Update Columns:\n", demo_df),
    ("\nBiometric Update Columns:\n", bio_df),
]
for heading, frame in column_report:
    print(heading, frame.columns.tolist())


Enrolment Columns:
 ['date', 'state', 'district', 'pincode', 'age_0_5', 'age_5_17', 'age_18_greater']

Demographic Update Columns:
 ['date', 'state', 'district', 'pincode', 'demo_age_5_17', 'demo_age_17_']

Biometric Update Columns:
 ['date', 'state', 'district', 'pincode', 'bio_age_5_17', 'bio_age_17_']


In [9]:
# Convert date column to datetime
# The sample rows shown after loading use day-month-year strings such as
# "02-03-2025". Passing the format explicitly parses them strictly;
# dayfirst=True on its own is only a preference, not a strict rule.
# NOTE(review): assumes every source file uses DD-MM-YYYY — a value in any
# other layout will now raise instead of being guessed at.
DATE_FORMAT = "%d-%m-%Y"

enrol_df['date'] = pd.to_datetime(enrol_df['date'], format=DATE_FORMAT)
demo_df['date'] = pd.to_datetime(demo_df['date'], format=DATE_FORMAT)
bio_df['date'] = pd.to_datetime(bio_df['date'], format=DATE_FORMAT)

print("Date conversion done")


Date conversion done


In [10]:
# Normalise the place-name columns in all three frames:
# trim surrounding whitespace, then convert to Title Case.
for df in (enrol_df, demo_df, bio_df):
    for name_col in ('state', 'district'):
        trimmed = df[name_col].str.strip()
        df[name_col] = trimmed.str.title()

print("State and district names standardized")


State and district names standardized


In [11]:
# Write each cleaned frame to its own CSV (row index omitted) for later notebooks
cleaned_outputs = {
    "../data/enrolment_cleaned.csv": enrol_df,
    "../data/demographic_updates_cleaned.csv": demo_df,
    "../data/biometric_updates_cleaned.csv": bio_df,
}
# Dicts preserve insertion order, so files are written in the order listed above.
for out_path, cleaned in cleaned_outputs.items():
    cleaned.to_csv(out_path, index=False)

print("Cleaned datasets saved successfully")


Cleaned datasets saved successfully
