# UIDAI Hackathon - Data Cleaning & Preprocessing

## Objective
This notebook performs comprehensive data cleaning and preprocessing on the Aadhaar datasets:
- Remove duplicates and invalid records
- Normalize dates and text fields
- Validate pincodes and geographical data
- Engineer temporal features
- Create derived metrics
- Detect and handle outliers

**Author:** Harsh Vardhan  
**Date:** January 13, 2026  
**Input:** Raw CSV data from Dataset/  
**Output:** Cleaned DataFrames ready for analysis

## 1. Setup Environment

In [None]:
# Standard libraries
import sys
import os
from pathlib import Path
import warnings
warnings.filterwarnings('ignore')

# Data manipulation
import pandas as pd
import numpy as np

# Locate the project root. The path below is the original author's checkout and
# stays the default; set the IDENTITYLAB_ROOT environment variable to run this
# notebook from another machine without editing the cell.
DEFAULT_PROJECT_ROOT = r'c:\Users\harsh\OneDrive - Indian Institute of Information Technology, Nagpur\IIIT Nagpur\6th Semester\Projects\IdentityLab'
project_root = Path(os.environ.get('IDENTITYLAB_ROOT', DEFAULT_PROJECT_ROOT))

# Add src directory to path. The membership check keeps sys.path from gaining
# a duplicate entry each time this cell is re-run in the same kernel.
src_dir = str(project_root / 'src')
if src_dir not in sys.path:
    sys.path.append(src_dir)

# Import custom modules (found via the src/ entry added to sys.path above)
from data_loader import AadhaarDataLoader
from preprocessing import AadhaarDataPreprocessor, detect_outliers

print("✓ Environment setup complete")
print(f"✓ Project root: {project_root}")

## 2. Load Raw Data

In [None]:
# Build the data loader, handing it the project root as a plain string
root_as_text = str(project_root)
loader = AadhaarDataLoader(root_as_text)

# Banner shown ahead of the per-dataset load cells
print("Loading raw datasets...", "-" * 60, sep="\n")

In [None]:
# Read the raw enrolment dataset through the loader, then report its length
df_enrolment_raw = loader.load_enrolment_data()
enrolment_raw_count = len(df_enrolment_raw)
print(f"✓ Loaded {enrolment_raw_count:,} raw enrolment records")

In [None]:
# Read the raw demographic dataset through the loader, then report its length
df_demographic_raw = loader.load_demographic_data()
demographic_raw_count = len(df_demographic_raw)
print(f"✓ Loaded {demographic_raw_count:,} raw demographic records")

In [None]:
# Read the raw biometric dataset through the loader, then report its length
df_biometric_raw = loader.load_biometric_data()
biometric_raw_count = len(df_biometric_raw)
print(f"✓ Loaded {biometric_raw_count:,} raw biometric records")

## 3. Data Cleaning Pipeline

Apply systematic cleaning using the AadhaarDataPreprocessor class.

In [None]:
# Initialize preprocessor
# One AadhaarDataPreprocessor instance is created here and reused by the
# enrolment, demographic and biometric cleaning cells and by the cleaning
# report cell later in this notebook.
preprocessor = AadhaarDataPreprocessor()
print("✓ Preprocessor initialized")

### 3.1 Clean Enrolment Data

In [None]:
# Clean enrolment data
df_enrolment = preprocessor.clean_enrolment_data(df_enrolment_raw)

print("\nEnrolment Data Sample After Cleaning:")
display(df_enrolment.head())

# Report only the columns that are absent from the raw frame, so the output
# matches its "New Columns Added" heading instead of echoing every column.
# NOTE(review): if clean_enrolment_data adds columns to its input in place,
# this list will be empty — confirm against preprocessing.py.
new_enrolment_columns = [
    column for column in df_enrolment.columns
    if column not in df_enrolment_raw.columns
]
print("\nNew Columns Added:")
print(new_enrolment_columns)

### 3.2 Clean Demographic Data

In [None]:
# Clean demographic data
df_demographic = preprocessor.clean_demographic_data(df_demographic_raw)

print("\nDemographic Data Sample After Cleaning:")
display(df_demographic.head())

# Report only the columns that are absent from the raw frame, so the output
# matches its "New Columns Added" heading instead of echoing every column.
# NOTE(review): if clean_demographic_data adds columns to its input in place,
# this list will be empty — confirm against preprocessing.py.
new_demographic_columns = [
    column for column in df_demographic.columns
    if column not in df_demographic_raw.columns
]
print("\nNew Columns Added:")
print(new_demographic_columns)

### 3.3 Clean Biometric Data

In [None]:
# Clean biometric data
df_biometric = preprocessor.clean_biometric_data(df_biometric_raw)

print("\nBiometric Data Sample After Cleaning:")
display(df_biometric.head())

# Report only the columns that are absent from the raw frame, so the output
# matches its "New Columns Added" heading instead of echoing every column.
# NOTE(review): if clean_biometric_data adds columns to its input in place,
# this list will be empty — confirm against preprocessing.py.
new_biometric_columns = [
    column for column in df_biometric.columns
    if column not in df_biometric_raw.columns
]
print("\nNew Columns Added:")
print(new_biometric_columns)

## 4. Cleaning Report

Review what was cleaned in each dataset.

In [None]:
# Print comprehensive cleaning report
# Called on the same preprocessor instance that ran the three clean_* steps;
# presumably it summarizes what those calls recorded — confirm the exact
# contents against print_cleaning_report in preprocessing.py.
preprocessor.print_cleaning_report()

## 5. Handle Duplicates

Remove duplicate records identified during exploration.

In [None]:
# Remove duplicates
def _drop_duplicates_with_report(frame, label):
    """Drop fully duplicated rows from ``frame`` and print a one-line summary.

    Args:
        frame: DataFrame to de-duplicate. It is not modified;
            ``drop_duplicates`` returns a new frame.
        label: Dataset name used as the prefix of the printed line.

    Returns:
        Tuple ``(deduplicated_frame, rows_before, rows_removed)``.
    """
    rows_before = len(frame)
    deduplicated = frame.drop_duplicates()
    rows_removed = rows_before - len(deduplicated)
    # Guard the percentage: an empty frame would otherwise raise ZeroDivisionError.
    removed_pct = rows_removed / rows_before * 100 if rows_before else 0.0
    print(f"{label}: Removed {rows_removed:,} duplicates ({removed_pct:.2f}%)")
    return deduplicated, rows_before, rows_removed


print("Removing duplicate records...")
print("-" * 60)

# Enrolment
df_enrolment, enrol_before, enrol_removed = _drop_duplicates_with_report(df_enrolment, "Enrolment")

# Demographic
df_demographic, demo_before, demo_removed = _drop_duplicates_with_report(df_demographic, "Demographic")

# Biometric
df_biometric, bio_before, bio_removed = _drop_duplicates_with_report(df_biometric, "Biometric")

## 6. Outlier Detection

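Identify outliers in the enrolment, demographic-update, and biometric-update totals using the IQR method (`detect_outliers` is called with `threshold=3.0`).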

In [None]:
# Flag enrolment rows via detect_outliers (IQR method); the result is used
# below as a boolean row mask over df_enrolment.
outliers_enrol = detect_outliers(df_enrolment, 'total_enrolments', method='iqr', threshold=3.0)

# Work out the flagged count, its share of all rows, and the flagged rows once
enrol_outlier_count = outliers_enrol.sum()
enrol_outlier_share = enrol_outlier_count/len(df_enrolment)*100
enrol_outlier_rows = df_enrolment[outliers_enrol]

print(f"Outliers in Enrolment Data: {enrol_outlier_count:,} ({enrol_outlier_share:.2f}%)")
print("\nOutlier Statistics:")
print(enrol_outlier_rows['total_enrolments'].describe())

# Preview the first ten flagged records with their identifying columns
print("\nSample Outlier Records:")
enrol_preview_columns = ['date', 'state', 'district', 'total_enrolments']
display(enrol_outlier_rows[enrol_preview_columns].head(10))

In [None]:
# Flag demographic rows via detect_outliers (IQR method); the result is used
# below as a boolean row mask over df_demographic.
outliers_demo = detect_outliers(df_demographic, 'total_demo_updates', method='iqr', threshold=3.0)

# Work out the flagged count and its share of all rows before printing
demo_outlier_count = outliers_demo.sum()
demo_outlier_share = demo_outlier_count/len(df_demographic)*100

print(f"Outliers in Demographic Data: {demo_outlier_count:,} ({demo_outlier_share:.2f}%)")
print("\nOutlier Statistics:")
demo_outlier_rows = df_demographic[outliers_demo]
print(demo_outlier_rows['total_demo_updates'].describe())

In [None]:
# Flag biometric rows via detect_outliers (IQR method); the result is used
# below as a boolean row mask over df_biometric.
outliers_bio = detect_outliers(df_biometric, 'total_bio_updates', method='iqr', threshold=3.0)

# Work out the flagged count and its share of all rows before printing
bio_outlier_count = outliers_bio.sum()
bio_outlier_share = bio_outlier_count/len(df_biometric)*100

print(f"Outliers in Biometric Data: {bio_outlier_count:,} ({bio_outlier_share:.2f}%)")
print("\nOutlier Statistics:")
bio_outlier_rows = df_biometric[outliers_bio]
print(bio_outlier_rows['total_bio_updates'].describe())

## 7. Data Validation

Verify cleaned data quality.

In [None]:
# Print the earliest and latest 'date' value of each cleaned dataset
print("Date Range Validation:")
print("-" * 60)
for dataset_label, dataset_frame in (
    ("Enrolment", df_enrolment),
    ("Demographic", df_demographic),
    ("Biometric", df_biometric),
):
    date_column = dataset_frame['date']
    print(f"{dataset_label}: {date_column.min()} to {date_column.max()}")

In [None]:
# Count the distinct 'state' and 'district' values in each cleaned dataset
print("\nGeographical Coverage:")
print("-" * 60)
for dataset_label, dataset_frame in (
    ("Enrolment", df_enrolment),
    ("Demographic", df_demographic),
    ("Biometric", df_biometric),
):
    state_count = dataset_frame['state'].nunique()
    district_count = dataset_frame['district'].nunique()
    print(f"{dataset_label} - States: {state_count}, Districts: {district_count}")

In [None]:
# Validate pincode format
def _invalid_pincode_rows(frame):
    """Return the rows of ``frame`` whose 'pincode' is not exactly six ASCII digits.

    A length-only check would accept values such as 'ABCDEF'; matching the
    whole string against ``[0-9]{6}`` enforces both length and digit content.
    """
    # na=False makes missing pincodes count as invalid instead of leaving NaN in the mask.
    is_valid = frame['pincode'].str.fullmatch(r'[0-9]{6}', na=False)
    return frame[~is_valid]


print("\nPincode Validation:")
print("-" * 60)

# Check if all pincodes are 6 digits
invalid_pincodes_enrol = _invalid_pincode_rows(df_enrolment)
invalid_pincodes_demo = _invalid_pincode_rows(df_demographic)
invalid_pincodes_bio = _invalid_pincode_rows(df_biometric)

print(f"Invalid pincodes in Enrolment: {len(invalid_pincodes_enrol)}")
print(f"Invalid pincodes in Demographic: {len(invalid_pincodes_demo)}")
print(f"Invalid pincodes in Biometric: {len(invalid_pincodes_bio)}")

# The verdict must cover all three datasets; checking enrolment alone would
# report success even when demographic or biometric pincodes are invalid.
total_invalid_pincodes = (
    len(invalid_pincodes_enrol) + len(invalid_pincodes_demo) + len(invalid_pincodes_bio)
)
print("\n✓ All pincodes are properly formatted (6 digits)" if total_invalid_pincodes == 0 else "⚠ Some pincodes need fixing")

## 8. Final Summary

Summary of cleaned datasets ready for analysis.

In [None]:
# Create summary table: one row per dataset comparing the raw frame with the
# cleaned frame (record counts, removal percentage, column count, memory).
summary_data = []

datasets = {
    'Enrolment': (df_enrolment_raw, df_enrolment),
    'Demographic': (df_demographic_raw, df_demographic),
    'Biometric': (df_biometric_raw, df_biometric)
}

for name, (raw_df, clean_df) in datasets.items():
    raw_count = len(raw_df)
    removed_count = raw_count - len(clean_df)
    # Guard the percentage: an empty raw frame would otherwise raise ZeroDivisionError.
    removal_pct = removed_count / raw_count * 100 if raw_count else 0.0
    summary_data.append({
        'Dataset': name,
        'Raw Records': f"{raw_count:,}",
        'Clean Records': f"{len(clean_df):,}",
        'Removed': f"{removed_count:,}",
        'Removal %': f"{removal_pct:.2f}%",
        'Final Columns': clean_df.shape[1],
        # deep=True also counts the contents of object (e.g. string) columns
        'Memory (MB)': f"{clean_df.memory_usage(deep=True).sum() / 1024**2:.2f}"
    })

summary_df = pd.DataFrame(summary_data)

print("="*80)
print("DATA CLEANING SUMMARY")
print("="*80)
display(summary_df)

print("\n✓ Data cleaning complete!")
print("\nCleaned datasets are ready for:")
print("1. Temporal analysis and trend detection")
print("2. Geographical pattern analysis")
print("3. Cross-dataset correlation studies")
print("4. Visualization and reporting")

## 9. Optional: Save Cleaned Data

Save cleaned datasets for reuse (optional step).

In [None]:
# Optional export of the cleaned datasets. Set SAVE_CLEANED_DATA to True to
# write them as CSV files under <project_root>/outputs/cleaned_data; the
# default (False) writes nothing, matching the previous commented-out state.
SAVE_CLEANED_DATA = False

if SAVE_CLEANED_DATA:
    output_dir = project_root / 'outputs' / 'cleaned_data'
    output_dir.mkdir(parents=True, exist_ok=True)

    # index=False keeps the DataFrame index out of the CSV files
    df_enrolment.to_csv(output_dir / 'enrolment_cleaned.csv', index=False)
    df_demographic.to_csv(output_dir / 'demographic_cleaned.csv', index=False)
    df_biometric.to_csv(output_dir / 'biometric_cleaned.csv', index=False)

    print(f"✓ Cleaned data saved to {output_dir}")

print("Cleaned data retained in memory for analysis")