# Notebook 01: Data Overview and Loading

Objectives:
- Load Aadhaar Enrolment, Demographic Update, and Biometric Update datasets
- Understand structure, columns, and basic statistics
- Prepare for cleaning and analysis in later notebooks


In [None]:
import pandas as pd
import numpy as np
import os

In [None]:
# Relative locations of the raw CSV folders, one per dataset.
enrolment_folder, demographic_folder, biometric_folder = (
    "../data/raw/enrolment/",
    "../data/raw/demographic_updates/",
    "../data/raw/biometric_updates/",
)


In [None]:
def load_multiple_csv(folder_path):
    """Load every CSV file in a folder into one combined DataFrame.

    Each file is read with ``pd.read_csv`` and tagged with a
    ``source_file`` column holding its file name, so every row can be
    traced back to the file it came from.

    Args:
        folder_path: Directory containing the ``.csv`` files.

    Returns:
        A single DataFrame with the rows of all files stacked in
        file-name order and a fresh 0..n-1 index.

    Raises:
        ValueError: If the folder contains no ``.csv`` files.
    """
    # os.listdir returns entries in arbitrary order; sort so the combined
    # row order is reproducible across runs and machines.
    csv_files = sorted(f for f in os.listdir(folder_path) if f.endswith(".csv"))
    if not csv_files:
        # pd.concat([]) would raise an opaque "No objects to concatenate";
        # fail with the same exception type but name the offending folder.
        raise ValueError(f"No .csv files found in {folder_path!r}")

    frames = [
        pd.read_csv(os.path.join(folder_path, name)).assign(source_file=name)
        for name in csv_files
    ]
    return pd.concat(frames, ignore_index=True)


In [None]:
# Build one combined DataFrame per dataset from its raw CSV folder.
enrolment_df = load_multiple_csv(folder_path=enrolment_folder)
demographic_df = load_multiple_csv(folder_path=demographic_folder)
biometric_df = load_multiple_csv(folder_path=biometric_folder)


In [None]:
# A notebook cell only renders the value of its last expression, so the
# first two previews would be silently dropped. Print each one explicitly
# so all three datasets are visible.
print("Enrolment:")
print(enrolment_df.head())
print("Demographic updates:")
print(demographic_df.head())
print("Biometric updates:")
print(biometric_df.head())


In [None]:
# List the column labels of each dataset, one Index per line.
print(
    enrolment_df.columns,
    demographic_df.columns,
    biometric_df.columns,
    sep="\n",
)


In [None]:
# Print a structural summary of each combined DataFrame using pandas'
# DataFrame.info(), called once per dataset.
enrolment_df.info()
demographic_df.info()
biometric_df.info()


### Initial Observations

- Each dataset consists of multiple CSV files combined into a single dataframe
- The originating file name is preserved in a `source_file` column for traceability
- Geographic identifiers such as State, District, and PIN Code are present
- Time-based analysis is possible using year information taken from the file names or from columns in the data
- Further cleaning and standardisation are required before analysis
