In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats 

In [2]:
demo=pd.read_csv(r'C:\Users\Lenovo\Desktop\UIDAI_Data_Hackathon_2026\data\combined\combined_aadhar_demographic.csv')

In [3]:
enrol=pd.read_csv(r"C:\Users\Lenovo\Desktop\UIDAI_Data_Hackathon_2026\data\combined\combined_aadhar_enrolment.csv")

In [4]:
bio=pd.read_csv(r"C:\Users\Lenovo\Desktop\UIDAI_Data_Hackathon_2026\data\combined\combined_aadhar_biometric.csv")

In [5]:
for df in [enrol, demo, bio]:
    df['state'] = df['state'].str.strip().str.title()
    df['district'] = df['district'].str.strip().str.title()
    df['date'] = pd.to_datetime(df['date'], dayfirst=True)


In [6]:
merged_1 = enrol.merge(
    demo,
    on=['date', 'state', 'district', 'pincode'],
    how='outer'
)


In [7]:
aadhar_df = merged_1.merge(
    bio,
    on=['date', 'state', 'district', 'pincode'],
    how='outer'
)


In [8]:
aadhar_df

Unnamed: 0,date,state,district,pincode,age_0_5,age_5_17,age_18_greater,demo_age_5_17,demo_age_17_,bio_age_5_17,bio_age_17_
0,2025-03-01,Andaman & Nicobar Islands,Andamans,744101,,,,,,16.0,193.0
1,2025-03-01,Andaman And Nicobar Islands,Nicobar,744301,,,,16.0,180.0,101.0,48.0
2,2025-03-01,Andaman And Nicobar Islands,Nicobar,744301,,,,16.0,180.0,101.0,48.0
3,2025-03-01,Andaman And Nicobar Islands,Nicobar,744302,,,,,,15.0,12.0
4,2025-03-01,Andaman And Nicobar Islands,Nicobar,744303,,,,,,46.0,27.0
...,...,...,...,...,...,...,...,...,...,...,...
2952416,2025-12-31,West Bengal,West Midnapore,721426,0.0,1.0,0.0,,,,
2952417,2025-12-31,West Bengal,West Midnapore,721504,1.0,0.0,0.0,,,,
2952418,2025-12-31,West Bengal,West Midnapore,721506,1.0,1.0,0.0,,,,
2952419,2025-12-31,West Bengal,West Midnapore,721507,1.0,0.0,0.0,,,,


In [9]:
aadhar_df.shape


(2952421, 11)

In [10]:
aadhar_df.isna().sum()


date                    0
state                   0
district                0
pincode                 0
age_0_5           1696652
age_5_17          1696652
age_18_greater    1696652
demo_age_5_17      768897
demo_age_17_       768897
bio_age_5_17       693611
bio_age_17_        693611
dtype: int64

In [11]:
aadhar_df.duplicated(
    subset=['date','state','district','pincode']
).sum()


np.int64(625161)

In [12]:
keys = ['date', 'state', 'district', 'pincode']

enrol_dups = enrol.duplicated(subset=keys, keep=False).sum()
demo_dups  = demo.duplicated(subset=keys, keep=False).sum()
bio_dups   = bio.duplicated(subset=keys, keep=False).sum()

enrol_dups, demo_dups, bio_dups


(np.int64(47437), np.int64(949382), np.int64(193491))

In [13]:
enrol_clean = (
    enrol
    .groupby(keys, as_index=False)
    .agg({
        'age_0_5': 'sum',
        'age_5_17': 'sum',
        'age_18_greater': 'sum'
    })
)


In [14]:
demo_clean = (
    demo
    .groupby(keys, as_index=False)
    .agg({
        'demo_age_5_17': 'sum',
        'demo_age_17_': 'sum'
    })
)


In [15]:
bio_clean = (
    bio
    .groupby(keys, as_index=False)
    .agg({
        'bio_age_5_17': 'sum',
        'bio_age_17_': 'sum'
    })
)


In [16]:
enrol_clean.duplicated(subset=keys).sum()  # MUST be 0


np.int64(0)

In [17]:
demo_clean.duplicated(subset=keys).sum() 

np.int64(0)

In [18]:
bio_clean.duplicated(subset=keys).sum() 

np.int64(0)

In [19]:
aadhar_df = (
    enrol_clean
    .merge(demo_clean, on=keys, how='outer')
    .merge(bio_clean, on=keys, how='outer')
)


In [20]:
aadhar_df.duplicated(subset=keys).sum()


np.int64(0)

In [21]:
activity_cols = [
    'age_0_5','age_5_17','age_18_greater',
    'demo_age_5_17','demo_age_17_',
    'bio_age_5_17','bio_age_17_'
]

aadhar_df[activity_cols] = aadhar_df[activity_cols].fillna(0).astype(int)


In [22]:
aadhar_df.duplicated(subset=keys).sum()


np.int64(0)

In [23]:
(aadhar_df[activity_cols] < 0).sum()


age_0_5           0
age_5_17          0
age_18_greater    0
demo_age_5_17     0
demo_age_17_      0
bio_age_5_17      0
bio_age_17_       0
dtype: int64

In [24]:
aadhar_df['total_activity'] = aadhar_df[activity_cols].sum(axis=1)
aadhar_df['total_activity'].describe()

count    2.327260e+06
mean     5.349380e+01
std      2.795450e+02
min      0.000000e+00
25%      4.000000e+00
50%      1.200000e+01
75%      3.600000e+01
max      4.586100e+04
Name: total_activity, dtype: float64

In [25]:
aadhar_df.to_csv(r"C:\Users\Lenovo\Desktop\UIDAI_Data_Hackathon_2026\data\combined\final_aadhar_data.csv", index=False)

In [29]:
aadhar_df

Unnamed: 0,date,state,district,pincode,age_0_5,age_5_17,age_18_greater,demo_age_5_17,demo_age_17_,bio_age_5_17,bio_age_17_,total_activity
0,2025-03-01,Andaman & Nicobar Islands,Andamans,744101,0,0,0,0,0,16,193,209
1,2025-03-01,Andaman And Nicobar Islands,Nicobar,744301,0,0,0,32,360,101,48,541
2,2025-03-01,Andaman And Nicobar Islands,Nicobar,744302,0,0,0,0,0,15,12,27
3,2025-03-01,Andaman And Nicobar Islands,Nicobar,744303,0,0,0,0,0,46,27,73
4,2025-03-01,Andaman And Nicobar Islands,Nicobar,744304,0,0,0,0,0,16,14,30
...,...,...,...,...,...,...,...,...,...,...,...,...
2327255,2025-12-31,West Bengal,West Midnapore,721426,0,1,0,0,0,0,0,1
2327256,2025-12-31,West Bengal,West Midnapore,721504,1,0,0,0,0,0,0,1
2327257,2025-12-31,West Bengal,West Midnapore,721506,1,1,0,0,0,0,0,2
2327258,2025-12-31,West Bengal,West Midnapore,721507,1,0,0,0,0,0,0,1


In [26]:
aadhar_df.shape

(2327260, 12)

In [27]:
aadhar_df.isna().sum()

date              0
state             0
district          0
pincode           0
age_0_5           0
age_5_17          0
age_18_greater    0
demo_age_5_17     0
demo_age_17_      0
bio_age_5_17      0
bio_age_17_       0
total_activity    0
dtype: int64

In [28]:
aadhar_df.duplicated(subset=keys).sum()

np.int64(0)