# Data Cleaning and Preprocessing

Objective:
- Standardize column names
- Handle missing and inconsistent values
- Extract temporal information
- Prepare clean datasets for analysis


In [None]:
import pandas as pd
import numpy as np
import os


In [None]:
# Locations of the raw CSV exports, one folder per dataset
# (paths are relative to the notebook's working directory).
enrolment_folder, demographic_folder, biometric_folder = (
    "../data/raw/enrolment/",
    "../data/raw/demographic_updates/",
    "../data/raw/biometric_updates/",
)

def load_multiple_csv(folder_path):
    """Load every ``.csv`` file in a folder into one combined DataFrame.

    Each file is read with ``pd.read_csv`` and tagged with a ``source_file``
    column holding its file name, so rows can be traced back to their origin.

    Args:
        folder_path: Directory to scan (non-recursively) for ``.csv`` files.

    Returns:
        A DataFrame with the rows of all files stacked in file-name order
        and a fresh 0..n-1 index.

    Raises:
        ValueError: If the folder contains no ``.csv`` files.
    """
    # os.listdir() returns entries in arbitrary order; sort so the row order
    # of the combined frame is reproducible across runs and machines.
    csv_files = sorted(f for f in os.listdir(folder_path) if f.endswith(".csv"))
    if not csv_files:
        # pd.concat([]) would fail with an opaque "No objects to concatenate".
        raise ValueError(f"No .csv files found in {folder_path!r}")

    frames = []
    for file_name in csv_files:
        frame = pd.read_csv(os.path.join(folder_path, file_name))
        frame["source_file"] = file_name
        frames.append(frame)
    return pd.concat(frames, ignore_index=True)

# Read each raw folder into its own combined DataFrame.
enrolment_df, demographic_df, biometric_df = [
    load_multiple_csv(raw_folder)
    for raw_folder in (enrolment_folder, demographic_folder, biometric_folder)
]


In [None]:
def standardize_columns(df):
    """Normalize the column labels of ``df`` in place and return it.

    Labels are stripped of surrounding whitespace, lower-cased, and have
    their spaces replaced by underscores. The same DataFrame object that
    was passed in is returned.
    """
    labels = df.columns.str.strip()
    labels = labels.str.lower()
    labels = labels.str.replace(" ", "_")
    df.columns = labels
    return df

# Apply the same column-name normalization to all three datasets.
enrolment_df, demographic_df, biometric_df = [
    standardize_columns(frame)
    for frame in (enrolment_df, demographic_df, biometric_df)
]


In [None]:
# Report missing-value counts per column for every dataset. Each summary is
# printed explicitly because a notebook cell only displays its last bare
# expression, which would hide the first two results.
for dataset_name, dataset in (
    ("enrolment", enrolment_df),
    ("demographic", demographic_df),
    ("biometric", biometric_df),
):
    print(f"Missing values per column - {dataset_name}:")
    print(dataset.isna().sum(), end="\n\n")



In [None]:
def _drop_empty_rows(df):
    """Drop rows that carry no data apart from the ``source_file`` tag.

    Returns ``df`` unchanged when it has no columns other than
    ``source_file``; otherwise returns the result of ``dropna``.
    """
    # ``source_file`` is filled in for every row at load time, so a plain
    # dropna(how="all") can never match; judge emptiness on the other columns.
    data_columns = [col for col in df.columns if col != "source_file"]
    if not data_columns:
        # No data columns to judge emptiness by; leave the frame untouched.
        return df
    return df.dropna(how="all", subset=data_columns)


enrolment_df = _drop_empty_rows(enrolment_df)
demographic_df = _drop_empty_rows(demographic_df)
biometric_df = _drop_empty_rows(biometric_df)


In [None]:
def extract_year(df):
    """Add a ``year`` column taken from the ``source_file`` column.

    The value is the first run of four consecutive digits found in each
    file name, kept as text; rows whose file name has no such run get a
    missing value. ``df`` is modified in place and returned.
    """
    year_pattern = r"(\d{4})"
    file_names = df["source_file"]
    df["year"] = file_names.str.extract(year_pattern)
    return df

# Derive the year column for all three datasets.
enrolment_df, demographic_df, biometric_df = [
    extract_year(frame)
    for frame in (enrolment_df, demographic_df, biometric_df)
]


In [None]:
# Convert the extracted year text to numbers; values that cannot be parsed
# become NaN (errors="coerce") instead of raising.
for frame in (enrolment_df, demographic_df, biometric_df):
    frame["year"] = pd.to_numeric(frame["year"], errors="coerce")


In [None]:
# Make sure the output folder exists, then write each cleaned dataset to
# its own CSV file without the DataFrame index.
os.makedirs("../data/processed/", exist_ok=True)

for frame, target_path in (
    (enrolment_df, "../data/processed/enrolment_cleaned.csv"),
    (demographic_df, "../data/processed/demographic_update_cleaned.csv"),
    (biometric_df, "../data/processed/biometric_update_cleaned.csv"),
):
    frame.to_csv(target_path, index=False)


### Cleaning Summary

- Combined multiple CSV files into unified datasets
- Standardized column naming conventions
- Handled missing values by removing rows that contain no data (`dropna(how="all")`); partially missing values are left unchanged
- Extracted the year from each source file name and converted it to a numeric column for temporal analysis
- Saved cleaned datasets for downstream analysis
