In [1]:
import pandas as pd
from pathlib import Path

DATA_DIR = Path(r"api_data_aadhar_demographic/api_data_aadhar_demographic")

files = sorted(DATA_DIR.glob("api_data_aadhar_demographic*.csv"))
dfs = []

for file in files:
    print(f"Reading {file.name}")
    df = pd.read_csv(file)
    dfs.append(df)

# Concatenate all files
df_all = pd.concat(dfs, ignore_index=True)

# Optional: ensure date is datetime
df_all["date"] = pd.to_datetime(df_all["date"], dayfirst=True, errors="coerce")

# Sort by state (and date within state)
df_all = df_all.sort_values(
    by=["state", "date", "district"],
    ascending=[True, True, True]
)

demographic_concat = df_all
demographic_concat.head()


Reading api_data_aadhar_demographic_0_500000.csv
Reading api_data_aadhar_demographic_1000000_1500000.csv
Reading api_data_aadhar_demographic_1500000_2000000.csv
Reading api_data_aadhar_demographic_2000000_2071700.csv
Reading api_data_aadhar_demographic_500000_1000000.csv


Unnamed: 0,date,state,district,pincode,demo_age_5_17,demo_age_17_
1507370,2025-12-20,100000,100000,100000,0,1
1866861,2025-12-23,100000,100000,100000,0,1
6660,2025-09-01,Andaman & Nicobar Islands,Andamans,744103,0,1
420893,2025-09-01,Andaman & Nicobar Islands,Andamans,744105,0,1
420894,2025-09-01,Andaman & Nicobar Islands,Andamans,744106,0,2


In [2]:
dup_rows = demographic_concat[
    demographic_concat.duplicated(subset=['date', 'pincode'], keep=False)
]

dup_rows.sort_values(by=['date', 'pincode'], inplace=True)
dup_rows


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dup_rows.sort_values(by=['date', 'pincode'], inplace=True)


Unnamed: 0,date,state,district,pincode,demo_age_5_17,demo_age_17_
1158258,2025-03-01,Delhi,New Delhi,110001,74,437
1902486,2025-03-01,Delhi,New Delhi,110001,74,437
414420,2025-03-01,Delhi,Central Delhi,110002,65,714
741892,2025-03-01,Delhi,Central Delhi,110002,65,714
1158074,2025-03-01,Delhi,North Delhi,110006,196,2469
...,...,...,...,...,...,...
1852143,2025-12-29,Bihar,Purnia,855105,1,2
1566974,2025-12-29,Bihar,Kishanganj,855108,10,135
1852085,2025-12-29,Bihar,Kishanganj,855108,10,135
1566949,2025-12-29,Bihar,East Champaran,855456,1,0


In [3]:
import pandas as pd
from pathlib import Path

DATA_DIR = Path(r"api_data_aadhar_biometric/api_data_aadhar_biometric")

files = sorted(DATA_DIR.glob("api_data_aadhar_biometric*.csv"))
dfs = []

for file in files:
    print(f"Reading {file.name}")
    df = pd.read_csv(file)
    dfs.append(df)

# Concatenate all files
df_all = pd.concat(dfs, ignore_index=True)

# Optional: ensure date is datetime
df_all["date"] = pd.to_datetime(df_all["date"], dayfirst=True, errors="coerce")

# Sort by state (and date within state)
df_all = df_all.sort_values(
    by=["state", "date", "district"],
    ascending=[True, True, True]
)

biometric_concat = df_all
biometric_concat.head()


Reading api_data_aadhar_biometric_0_500000.csv
Reading api_data_aadhar_biometric_1000000_1500000.csv
Reading api_data_aadhar_biometric_1500000_1861108.csv
Reading api_data_aadhar_biometric_500000_1000000.csv


Unnamed: 0,date,state,district,pincode,bio_age_5_17,bio_age_17_
4687,2025-03-01,Andaman & Nicobar Islands,Andamans,744101,16,193
38211,2025-04-01,Andaman & Nicobar Islands,Andamans,744101,17,167
55493,2025-05-01,Andaman & Nicobar Islands,Andamans,744106,10,30
59019,2025-05-01,Andaman & Nicobar Islands,Andamans,744101,12,128
67056,2025-06-01,Andaman & Nicobar Islands,Andamans,744101,11,129


In [4]:
biometric_concat = biometric_concat.drop_duplicates(keep="first")
print("Remaining duplicates:", biometric_concat[['date','state','district','pincode']].duplicated().sum())


Remaining duplicates: 0


In [5]:
import pandas as pd
from pathlib import Path

DATA_DIR = Path(r"api_data_aadhar_enrolment/api_data_aadhar_enrolment")

files = sorted(DATA_DIR.glob("api_data_aadhar_enrolment*.csv"))
dfs = []

for file in files:
    print(f"Reading {file.name}")
    df = pd.read_csv(file)
    dfs.append(df)

# Concatenate all files
df_all = pd.concat(dfs, ignore_index=True)

# Optional: ensure date is datetime
df_all["date"] = pd.to_datetime(df_all["date"], dayfirst=True, errors="coerce")

# Sort by state (and date within state)
df_all = df_all.sort_values(
    by=["state", "date", "district"],
    ascending=[True, True, True]
)

enrolment_concat = df_all
enrolment_concat.tail()


Reading api_data_aadhar_enrolment_0_500000.csv
Reading api_data_aadhar_enrolment_1000000_1006029.csv
Reading api_data_aadhar_enrolment_500000_1000000.csv


Unnamed: 0,date,state,district,pincode,age_0_5,age_5_17,age_18_greater
114368,2025-09-09,andhra pradesh,rangareddi,501218,1,0,0
199918,2025-09-15,andhra pradesh,chittoor,517520,1,0,0
537588,2025-10-29,andhra pradesh,chittoor,517520,1,0,0
573505,2025-10-31,andhra pradesh,chittoor,517520,1,0,0
910884,2025-12-23,andhra pradesh,chittoor,517520,1,0,0


In [6]:
enrolment_concat.rename(
    columns={
        'age_0_5': 'enrol_age_0_5',
        'age_5_17': 'enrol_age_5_17',
        'age_18_greater': 'enrol_age_18_greater'
    },
    inplace=True
)
enrolment_concat.head()


Unnamed: 0,date,state,district,pincode,enrol_age_0_5,enrol_age_5_17,enrol_age_18_greater
23108,2025-09-02,100000,100000,100000,0,0,3
46946,2025-09-03,100000,100000,100000,0,0,1
97816,2025-09-08,100000,100000,100000,0,0,1
115798,2025-09-09,100000,100000,100000,0,0,1
153156,2025-09-11,100000,100000,100000,0,0,2


In [7]:
"""keys = ['date', 'state', 'district', 'pincode']

enrolment_concat = enrolment_concat.groupby(
    keys, as_index=False
).sum(numeric_only=True, min_count=1)

demographic_concat = demographic_concat.groupby(
    keys, as_index=False
).sum(numeric_only=True, min_count=1)

biometric_concat = biometric_concat.groupby(
    keys, as_index=False
).sum(numeric_only=True, min_count=1)"""

# yeh nhi use kra tha

"keys = ['date', 'state', 'district', 'pincode']\n\nenrolment_concat = enrolment_concat.groupby(\n    keys, as_index=False\n).sum(numeric_only=True, min_count=1)\n\ndemographic_concat = demographic_concat.groupby(\n    keys, as_index=False\n).sum(numeric_only=True, min_count=1)\n\nbiometric_concat = biometric_concat.groupby(\n    keys, as_index=False\n).sum(numeric_only=True, min_count=1)"

In [8]:
# Check for duplicates in each dataset BEFORE merging
print("Enrolment duplicates:", enrolment_concat.duplicated(subset=['date', 'state', 'district', 'pincode']).sum())
print("Demographic duplicates:", demographic_concat.duplicated(subset=['date', 'state', 'district', 'pincode']).sum())
print("Biometric duplicates:", biometric_concat.duplicated(subset=['date', 'state', 'district', 'pincode']).sum())


Enrolment duplicates: 22957
Demographic duplicates: 473601
Biometric duplicates: 0


In [9]:
# Remove duplicates from each dataset before merging
keys = ['date', 'state', 'district', 'pincode']

enrolment_concat = enrolment_concat.drop_duplicates(subset=keys, keep='first')
demographic_concat = demographic_concat.drop_duplicates(subset=keys, keep='first')
biometric_concat = biometric_concat.drop_duplicates(subset=keys, keep='first')

print(f"After deduplication:")
print(f"Enrolment: {enrolment_concat.shape[0]} rows")
print(f"Demographic: {demographic_concat.shape[0]} rows")
print(f"Biometric: {biometric_concat.shape[0]} rows")


After deduplication:
Enrolment: 983072 rows
Demographic: 1598099 rows
Biometric: 1766212 rows


In [10]:
keys = ['date', 'state', 'district', 'pincode']
df_merged = (
    enrolment_concat
    .merge(demographic_concat, on=keys, how='outer')
    .merge(biometric_concat, on=keys, how='outer')
)

In [11]:
print(f"Merged dataframe shape: {df_merged.shape}")
print(f"Columns: {df_merged.columns.tolist()}")


Merged dataframe shape: (2330468, 11)
Columns: ['date', 'state', 'district', 'pincode', 'enrol_age_0_5', 'enrol_age_5_17', 'enrol_age_18_greater', 'demo_age_5_17', 'demo_age_17_', 'bio_age_5_17', 'bio_age_17_']


In [12]:
# Check for duplicates in MERGED dataframe
keys = ['date', 'state', 'district', 'pincode']

dup_rows = df_merged[df_merged.duplicated(subset=keys, keep=False)]
print(f"Number of duplicate rows in merged data: {len(dup_rows)}")


Number of duplicate rows in merged data: 0


In [13]:
dup_rows.sort_values(by=keys)

Unnamed: 0,date,state,district,pincode,enrol_age_0_5,enrol_age_5_17,enrol_age_18_greater,demo_age_5_17,demo_age_17_,bio_age_5_17,bio_age_17_


In [14]:
df_merged.head()

Unnamed: 0,date,state,district,pincode,enrol_age_0_5,enrol_age_5_17,enrol_age_18_greater,demo_age_5_17,demo_age_17_,bio_age_5_17,bio_age_17_
0,2025-03-01,Andaman & Nicobar Islands,Andamans,744101,,,,,,16.0,193.0
1,2025-03-01,Andaman and Nicobar Islands,Nicobar,744301,,,,16.0,180.0,101.0,48.0
2,2025-03-01,Andaman and Nicobar Islands,Nicobar,744302,,,,,,15.0,12.0
3,2025-03-01,Andaman and Nicobar Islands,Nicobar,744303,,,,,,46.0,27.0
4,2025-03-01,Andaman and Nicobar Islands,Nicobar,744304,,,,,,16.0,14.0


In [15]:
df_merged.to_csv("merged_v4.csv", index=False)

In [16]:
df_merged.shape

(2330468, 11)

In [17]:
df_merged = df_merged.drop_duplicates(keep="first")

In [18]:
cols = ["date", "state", "district", "pincode"]

num_rows_with_common_keys = df_merged.duplicated(subset=cols, keep=False).sum()
num_rows_with_common_keys


np.int64(0)

In [19]:
biometric_concat[biometric_concat['pincode']==515001]

Unnamed: 0,date,state,district,pincode,bio_age_5_17,bio_age_17_
10835,2025-03-01,Andhra Pradesh,Anantapur,515001,257,428
17012,2025-03-01,Andhra Pradesh,Ananthapur,515001,764,182
5396,2025-03-01,Andhra Pradesh,Ananthapuramu,515001,857,213
25085,2025-04-01,Andhra Pradesh,Anantapur,515001,286,346
25975,2025-04-01,Andhra Pradesh,Ananthapur,515001,827,177
...,...,...,...,...,...,...
1299423,2025-12-27,Andhra Pradesh,Ananthapuramu,515001,18,9
1332443,2025-12-28,Andhra Pradesh,Ananthapuramu,515001,5,1
1351880,2025-12-29,Andhra Pradesh,Anantapur,515001,4,6
1347363,2025-12-29,Andhra Pradesh,Ananthapur,515001,5,6


In [20]:
demographic_concat.shape[0]

1598099