In [11]:
import pandas as pd
import glob
import os

def concatenate_csvs(input_folder, output_path):
    """
    Concatenates all CSV files in a folder into a single DataFrame
    and saves it to output_path.
    """

    csv_files = glob.glob(os.path.join(input_folder, "*.csv"))
    print(f"Found {len(csv_files)} CSV files")

    df_list = []

    for file in csv_files:
        print(f"Reading {file}")
        df = pd.read_csv(file)
        df["source_file"] = os.path.basename(file)
        df_list.append(df)

    combined_df = pd.concat(df_list, ignore_index=True)

    print("Final shape:", combined_df.shape)

    combined_df.to_csv(output_path, index=False)
    print(f"Saved combined file to {output_path}")

    return combined_df

In [12]:
enrolment_df = concatenate_csvs(
    input_folder="data/enrolment/raw",
    output_path="data/enrolment/clean/enrolment_combined.csv"
)

Found 3 CSV files
Reading data/enrolment/raw\api_data_aadhar_enrolment_0_500000.csv
Reading data/enrolment/raw\api_data_aadhar_enrolment_1000000_1006029.csv
Reading data/enrolment/raw\api_data_aadhar_enrolment_500000_1000000.csv
Final shape: (1006029, 8)
Saved combined file to data/enrolment/clean/enrolment_combined.csv


In [13]:
demographic_df = concatenate_csvs(
    input_folder="data/demographic/raw",
    output_path="data/demographic/clean/demographic_combined.csv"
)

Found 5 CSV files
Reading data/demographic/raw\api_data_aadhar_demographic_0_500000.csv
Reading data/demographic/raw\api_data_aadhar_demographic_1000000_1500000.csv
Reading data/demographic/raw\api_data_aadhar_demographic_1500000_2000000.csv
Reading data/demographic/raw\api_data_aadhar_demographic_2000000_2071700.csv
Reading data/demographic/raw\api_data_aadhar_demographic_500000_1000000.csv
Final shape: (2071700, 7)
Saved combined file to data/demographic/clean/demographic_combined.csv


In [14]:
biometric_df = concatenate_csvs(
    input_folder="data/biometric/raw",
    output_path="data/biometric/clean/biometric_combined.csv"
)

Found 4 CSV files
Reading data/biometric/raw\api_data_aadhar_biometric_0_500000.csv
Reading data/biometric/raw\api_data_aadhar_biometric_1000000_1500000.csv
Reading data/biometric/raw\api_data_aadhar_biometric_1500000_1861108.csv
Reading data/biometric/raw\api_data_aadhar_biometric_500000_1000000.csv
Final shape: (1861108, 7)
Saved combined file to data/biometric/clean/biometric_combined.csv


In [15]:
print(enrolment_df.info())
print(enrolment_df.head())
print(enrolment_df.isna().sum())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1006029 entries, 0 to 1006028
Data columns (total 8 columns):
 #   Column          Non-Null Count    Dtype 
---  ------          --------------    ----- 
 0   date            1006029 non-null  object
 1   state           1006029 non-null  object
 2   district        1006029 non-null  object
 3   pincode         1006029 non-null  int64 
 4   age_0_5         1006029 non-null  int64 
 5   age_5_17        1006029 non-null  int64 
 6   age_18_greater  1006029 non-null  int64 
 7   source_file     1006029 non-null  object
dtypes: int64(4), object(4)
memory usage: 61.4+ MB
None
         date          state          district  pincode  age_0_5  age_5_17  \
0  02-03-2025      Meghalaya  East Khasi Hills   793121       11        61   
1  09-03-2025      Karnataka   Bengaluru Urban   560043       14        33   
2  09-03-2025  Uttar Pradesh      Kanpur Nagar   208001       29        82   
3  09-03-2025  Uttar Pradesh           Aligarh   202133     

In [16]:
print(biometric_df.info())
print(biometric_df.head())
print(biometric_df.isna().sum())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1861108 entries, 0 to 1861107
Data columns (total 7 columns):
 #   Column        Dtype 
---  ------        ----- 
 0   date          object
 1   state         object
 2   district      object
 3   pincode       int64 
 4   bio_age_5_17  int64 
 5   bio_age_17_   int64 
 6   source_file   object
dtypes: int64(3), object(4)
memory usage: 99.4+ MB
None
         date              state      district  pincode  bio_age_5_17  \
0  01-03-2025            Haryana  Mahendragarh   123029           280   
1  01-03-2025              Bihar     Madhepura   852121           144   
2  01-03-2025  Jammu and Kashmir         Punch   185101           643   
3  01-03-2025              Bihar       Bhojpur   802158           256   
4  01-03-2025         Tamil Nadu       Madurai   625514           271   

   bio_age_17_                             source_file  
0          577  api_data_aadhar_biometric_0_500000.csv  
1          369  api_data_aadhar_biometric_0_5

In [17]:
print(demographic_df.info())
print(demographic_df.head())
print(demographic_df.isna().sum())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2071700 entries, 0 to 2071699
Data columns (total 7 columns):
 #   Column         Dtype 
---  ------         ----- 
 0   date           object
 1   state          object
 2   district       object
 3   pincode        int64 
 4   demo_age_5_17  int64 
 5   demo_age_17_   int64 
 6   source_file    object
dtypes: int64(3), object(4)
memory usage: 110.6+ MB
None
         date           state    district  pincode  demo_age_5_17  \
0  01-03-2025   Uttar Pradesh   Gorakhpur   273213             49   
1  01-03-2025  Andhra Pradesh    Chittoor   517132             22   
2  01-03-2025         Gujarat      Rajkot   360006             65   
3  01-03-2025  Andhra Pradesh  Srikakulam   532484             24   
4  01-03-2025       Rajasthan     Udaipur   313801             45   

   demo_age_17_                               source_file  
0           529  api_data_aadhar_demographic_0_500000.csv  
1           375  api_data_aadhar_demographic_0_500000

In [18]:
len(enrolment_df)

1006029

In [19]:
len(demographic_df)

2071700

In [20]:
len(biometric_df)

1861108

In [21]:
print(
    enrolment_df['date'].min(), enrolment_df['date'].max(),
    demographic_df['date'].min(), demographic_df['date'].max(),
    biometric_df['date'].min(), biometric_df['date'].max(), sep="\n\n"
)

01-04-2025

31-12-2025

01-03-2025

31-10-2025

01-03-2025

31-10-2025


In [22]:
(enrolment_df[['age_0_5','age_5_17','age_18_greater']] < 0).sum()
enrolment_df[['age_0_5','age_5_17','age_18_greater']].describe()


Unnamed: 0,age_0_5,age_5_17,age_18_greater
count,1006029.0,1006029.0,1006029.0
mean,3.525709,1.710074,0.1673441
std,17.53851,14.36963,3.220525
min,0.0,0.0,0.0
25%,1.0,0.0,0.0
50%,2.0,0.0,0.0
75%,3.0,1.0,0.0
max,2688.0,1812.0,855.0


In [23]:
(demographic_df[['demo_age_5_17','demo_age_17_']] < 0).sum()
demographic_df[['demo_age_5_17','demo_age_17_']].describe()


Unnamed: 0,demo_age_5_17,demo_age_17_
count,2071700.0,2071700.0
mean,2.347552,21.44701
std,14.90355,125.2498
min,0.0,0.0
25%,0.0,2.0
50%,1.0,6.0
75%,2.0,15.0
max,2690.0,16166.0


In [24]:
(biometric_df[['bio_age_5_17','bio_age_17_']] < 0).sum()
biometric_df[['bio_age_5_17','bio_age_17_']].describe()

Unnamed: 0,bio_age_5_17,bio_age_17_
count,1861108.0,1861108.0
mean,18.39058,19.09413
std,83.70421,88.06502
min,0.0,0.0
25%,1.0,1.0
50%,3.0,4.0
75%,11.0,10.0
max,8002.0,7625.0


In [25]:
print("State: ",enrolment_df['state'].nunique())
print("District: ",enrolment_df['district'].nunique())

State:  55
District:  985


In [26]:
print("State: ",biometric_df['state'].nunique())
print("District: ",biometric_df['district'].nunique())

State:  57
District:  974


In [27]:
print("State: ",demographic_df['state'].nunique())
print("District: ",demographic_df['district'].nunique())

State:  65
District:  983


In [28]:
print(enrolment_df[['age_0_5','age_5_17','age_18_greater']].sum()
,demographic_df[['demo_age_5_17','demo_age_17_']].sum()
,biometric_df[['bio_age_5_17','bio_age_17_']].sum(),sep="\n\n")

age_0_5           3546965
age_5_17          1720384
age_18_greater     168353
dtype: int64

demo_age_5_17     4863424
demo_age_17_     44431763
dtype: int64

bio_age_5_17    34226855
bio_age_17_     35536240
dtype: int64


In [29]:
for df in [enrolment_df, demographic_df, biometric_df]:
    df['date'] = pd.to_datetime(df['date'], errors='coerce')

In [30]:
df['date'].isna().sum()

np.int64(944100)

In [31]:
enrolment_df.duplicated().sum()

np.int64(299669)

In [32]:
demographic_df.duplicated().sum()

np.int64(317166)

In [33]:
enrolment_df.duplicated().sum()

np.int64(299669)

In [34]:
enrolment_df['pincode'].dtype
demographic_df['pincode'].dtype
biometric_df['pincode'].dtype

dtype('int64')

In [35]:
key_cols = ['date', 'state', 'district', 'pincode']
enrolment_df.duplicated(subset=key_cols).sum()

np.int64(658089)

In [36]:
enrolment_df.groupby(key_cols).size().sort_values(ascending=False).head()

date        state        district     pincode
2025-03-11  West Bengal  Kolkata      700044     2
2025-02-11  Karnataka    Chitradurga  577543     2
            Bihar        Gaya         804407     2
2025-03-11  West Bengal  Kolkata      700046     2
2025-02-11  Bihar        Gaya         805109     2
dtype: int64

In [37]:
enrolment_df = enrolment_df.drop_duplicates()
demographic_df = demographic_df.drop_duplicates()
biometric_df = biometric_df.drop_duplicates()

In [38]:
enrolment_df['date'].astype(str).value_counts().head(10)

date
NaT           386642
2025-02-11     17521
2025-09-09     16789
2025-08-09     16768
2025-10-09     16518
2025-12-09     16107
2025-01-09     15971
2025-11-09     15950
2025-02-09     15622
2025-05-11     15587
Name: count, dtype: int64

In [39]:
for df in [enrolment_df, demographic_df, biometric_df]:
    df['date'] = pd.to_datetime(
        df['date'],
        format='mixed',
        dayfirst=True,
        errors='coerce'
    )

In [40]:
df['date'].isna().sum(), len(df)

(np.int64(790395), 1663824)

In [41]:
print(enrolment_df.duplicated().sum(),
demographic_df.duplicated().sum(),
biometric_df.duplicated().sum())

0 0 0


In [42]:
print(len(enrolment_df), len(demographic_df), len(biometric_df))

706360 1754534 1663824


In [43]:
enrolment_df = enrolment_df.reset_index(drop=True)
demographic_df = demographic_df.reset_index(drop=True)
biometric_df = biometric_df.reset_index(drop=True)

In [44]:
enrolment_df.to_csv(
    "data/enrolment/clean/enrolment_final.csv",
    index=False
)

demographic_df.to_csv(
    "data/demographic/clean/demographic_final.csv",
    index=False
)

biometric_df.to_csv(
    "data/biometric/clean/biometric_final.csv",
    index=False
)

# Data Freeze

The datasets have been concatenated, cleaned, deduplicated, and validated.
All subsequent analysis and feature engineering are performed using these final cleaned datasets.