In [9]:
import pandas as pd
import glob
import os


In [10]:
os.getcwd()


'd:\\UIDAI Data Hackathon 2026\\notebooks'

In [11]:
# --- Function to load CSV chunks ---
def load_dataset_chunks(folder_path, file_pattern):
    import pandas as pd
    import glob
    import os
    
    search_path = os.path.join(folder_path, file_pattern)
    files = glob.glob(search_path)
    print(f"Searching: {search_path}")
    print(f"Found {len(files)} files.")
    
    if len(files) == 0:
        return None
    
    li = []
    for filename in files:
        # Phase 0 speed: first 100k rows per file
        df = pd.read_csv(filename, nrows=100000)
        li.append(df)
    
    return pd.concat(li, axis=0, ignore_index=True)

# --- Test Enrolment Loader ---
enrol_folder = "../data/api_data_aadhar_enrolment"
df_enrol = load_dataset_chunks(enrol_folder, "*.csv")

print(f"Enrolment shape: {df_enrol.shape if df_enrol is not None else 'Not Found'}")


Searching: ../data/api_data_aadhar_enrolment\*.csv
Found 3 files.
Enrolment shape: (206029, 7)


In [12]:
# --- Vital stats check for Enrolment ---

print("COLUMNS:")
print(df_enrol.columns.tolist())
print("\n")

print("NULL COUNT:")
print(df_enrol.isnull().sum())
print("\n")

# Date-related columns
date_cols = [c for c in df_enrol.columns if 'date' in c.lower() or 'year' in c.lower() or 'month' in c.lower()]
print("DATE-RELATED COLUMNS FOUND:")
print(date_cols)
print("\n")

# Geography checks
if 'District' in df_enrol.columns:
    print("Unique Districts:", df_enrol['District'].nunique())
    print("Null Districts:", df_enrol['District'].isnull().sum())

if 'State' in df_enrol.columns:
    print("Unique States:", df_enrol['State'].nunique())


COLUMNS:
['date', 'state', 'district', 'pincode', 'age_0_5', 'age_5_17', 'age_18_greater']


NULL COUNT:
date              0
state             0
district          0
pincode           0
age_0_5           0
age_5_17          0
age_18_greater    0
dtype: int64


DATE-RELATED COLUMNS FOUND:
['date']




In [13]:
# --- Load Demographic Update Data ---

demo_folder = "../data/api_data_aadhar_demographic"
df_demo = load_dataset_chunks(demo_folder, "*.csv")

print(f"Demographic shape: {df_demo.shape if df_demo is not None else 'Not Found'}")


Searching: ../data/api_data_aadhar_demographic\*.csv
Found 5 files.
Demographic shape: (471700, 6)


In [14]:
# --- Vital stats check for Demographic Update ---

print("COLUMNS:")
print(df_demo.columns.tolist())
print("\n")

print("NULL COUNT:")
print(df_demo.isnull().sum())
print("\n")

# Date-related columns
date_cols = [c for c in df_demo.columns if 'date' in c.lower() or 'year' in c.lower() or 'month' in c.lower()]
print("DATE-RELATED COLUMNS FOUND:")
print(date_cols)
print("\n")

# Geography checks
if 'District' in df_demo.columns:
    print("Unique Districts:", df_demo['District'].nunique())
    print("Null Districts:", df_demo['District'].isnull().sum())

if 'State' in df_demo.columns:
    print("Unique States:", df_demo['State'].nunique())


COLUMNS:
['date', 'state', 'district', 'pincode', 'demo_age_5_17', 'demo_age_17_']


NULL COUNT:
date             0
state            0
district         0
pincode          0
demo_age_5_17    0
demo_age_17_     0
dtype: int64


DATE-RELATED COLUMNS FOUND:
['date']




In [15]:
# --- Load Biometric Update Data ---

bio_folder = "../data/api_data_aadhar_biometric"
df_bio = load_dataset_chunks(bio_folder, "*.csv")

print(f"Biometric shape: {df_bio.shape if df_bio is not None else 'Not Found'}")


Searching: ../data/api_data_aadhar_biometric\*.csv
Found 4 files.
Biometric shape: (400000, 6)


In [16]:
# --- Vital stats check for Biometric Update ---

print("COLUMNS:")
print(df_bio.columns.tolist())
print("\n")

print("NULL COUNT:")
print(df_bio.isnull().sum())
print("\n")

# Date-related columns
date_cols = [c for c in df_bio.columns if 'date' in c.lower() or 'year' in c.lower() or 'month' in c.lower()]
print("DATE-RELATED COLUMNS FOUND:")
print(date_cols)
print("\n")

# Geography checks
if 'District' in df_bio.columns:
    print("Unique Districts:", df_bio['District'].nunique())
    print("Null Districts:", df_bio['District'].isnull().sum())

if 'State' in df_bio.columns:
    print("Unique States:", df_bio['State'].nunique())


COLUMNS:
['date', 'state', 'district', 'pincode', 'bio_age_5_17', 'bio_age_17_']


NULL COUNT:
date            0
state           0
district        0
pincode         0
bio_age_5_17    0
bio_age_17_     0
dtype: int64


DATE-RELATED COLUMNS FOUND:
['date']


