In [1]:
import pandas as pd

In [2]:
# Load the merged raw biometric CSV into a DataFrame.
raw_biometric_path = "../data/processed/interim/biometric_raw_merged.csv"
biometric_df = pd.read_csv(raw_biometric_path)

In [3]:
# Preview the first rows to sanity-check the load.
biometric_df.head()

Unnamed: 0,date,state,district,pincode,bio_age_5_17,bio_age_17_
0,01-03-2025,Haryana,Mahendragarh,123029,280,577
1,01-03-2025,Bihar,Madhepura,852121,144,369
2,01-03-2025,Jammu and Kashmir,Punch,185101,643,1091
3,01-03-2025,Bihar,Bhojpur,802158,256,980
4,01-03-2025,Tamil Nadu,Madurai,625514,271,815


In [4]:
# List the column labels as read from the file, before the rename below.
biometric_df.columns


Index(['date', 'state', 'district', 'pincode', 'bio_age_5_17', 'bio_age_17_'], dtype='object')

In [5]:
# The raw header "bio_age_17_" ends in a dangling underscore; give it an explicit name.
column_renames = {"bio_age_17_": "bio_age_17_plus"}
biometric_df = biometric_df.rename(columns=column_renames)


In [6]:
# (rows, columns) of the frame as loaded, before any rows are dropped.
biometric_df.shape

(1861108, 6)

In [7]:
# Normalise state labels: trim surrounding whitespace, then Title Case each word.
state_normalized = biometric_df["state"].str.strip().str.title()
biometric_df["state"] = state_normalized

In [8]:
# Normalise district labels the same way as states: trim, then Title Case.
district_normalized = biometric_df["district"].str.strip().str.title()
biometric_df["district"] = district_normalized

In [9]:
# Parse the day-first "DD-MM-YYYY" strings with an explicit format.
# `dayfirst=True` alone is only a hint to the parser; naming the format makes
# the conversion strict (a value that does not match raises instead of being
# reinterpreted) and avoids per-element format guessing on a large column.
biometric_df["date"] = pd.to_datetime(biometric_df["date"], format="%d-%m-%Y")

In [10]:
# Confirm column dtypes after the date conversion above.
biometric_df.dtypes

date               datetime64[ns]
state                      object
district                   object
pincode                     int64
bio_age_5_17                int64
bio_age_17_plus             int64
dtype: object

In [11]:
# Remove rows that are exact duplicates across all columns, keeping the first occurrence.
# NOTE(review): this runs before the state-name harmonisation further down, so rows
# that only become identical after that mapping are not removed here.
biometric_df = biometric_df.drop_duplicates(keep="first")


In [12]:
# (rows, columns) after removing exact duplicate rows.
biometric_df.shape

(1766159, 6)

In [13]:
# Count rows whose state or district holds the value 100000 (as text or number).
sentinel_values = ["100000", 100000]
has_sentinel = (
    biometric_df["district"].isin(sentinel_values)
    | biometric_df["state"].isin(sentinel_values)
)
int(has_sentinel.sum())


0

In [14]:
# Re-check the shape; the lookup above only counts matching rows and does not filter,
# so the frame is unchanged at this point.
biometric_df.shape

(1766159, 6)

In [15]:
# Number of missing values in each column.
biometric_df.isnull().sum()


date               0
state              0
district           0
pincode            0
bio_age_5_17       0
bio_age_17_plus    0
dtype: int64

In [16]:
# Row count per state label; useful for spotting variant spellings of the same state.
biometric_df["state"].value_counts()


state
Tamil Nadu                                  174934
Andhra Pradesh                              160231
Uttar Pradesh                               147138
Maharashtra                                 143609
Karnataka                                   135773
West Bengal                                 125316
Kerala                                       93951
Gujarat                                      84630
Odisha                                       83191
Bihar                                        78078
Telangana                                    77850
Rajasthan                                    76698
Madhya Pradesh                               66020
Punjab                                       46414
Assam                                        44418
Jharkhand                                    35044
Chhattisgarh                                 30048
Himachal Pradesh                             28723
Haryana                                      25083
Uttarakhand              

In [17]:
# Canonical state name -> variant spellings that should be rewritten to it.
# Keys are written in Title Case because the state column is title-cased earlier.
state_aliases = {
    "Odisha": ["Orissa"],
    "Puducherry": ["Pondicherry"],
    "West Bengal": ["West Bangal", "Westbengal", "West  Bengal"],
    "Jammu And Kashmir": ["Jammu & Kashmir"],
    "Chhattisgarh": ["Chhatisgarh"],
    "Uttarakhand": ["Uttaranchal"],
    "Tamil Nadu": ["Tamilnadu"],
    "Dadra And Nagar Haveli And Daman And Diu": [
        "Dadra And Nagar Haveli",
        "Dadra & Nagar Haveli",
        "Daman And Diu",
        "Daman & Diu",
    ],
    "Andaman And Nicobar Islands": ["Andaman & Nicobar Islands"],
}

# Flatten to the variant -> canonical lookup consumed by Series.replace.
biometric_state_mapping = {
    alias: canonical
    for canonical, aliases in state_aliases.items()
    for alias in aliases
}


In [18]:
# Rewrite variant state spellings to their canonical names; labels that are not
# keys of the mapping pass through unchanged.
harmonised_states = biometric_df["state"].replace(to_replace=biometric_state_mapping)
biometric_df["state"] = harmonised_states


In [19]:
# Fix Hyderabad → Telangana:
# every row whose district reads "Hyderabad" is assigned to the state "Telangana",
# whatever state label it carried before.

# Re-normalise defensively so the comparison does not depend on earlier cells having run.
district_labels = biometric_df["district"].astype(str).str.strip().str.title()
mask = district_labels.eq("Hyderabad")

# Capture the count before mutating so the message reports what was touched.
affected_rows = mask.sum()
biometric_df.loc[mask, "state"] = "Telangana"

print(f"✔ Hyderabad correction applied to {affected_rows} rows")


✔ Hyderabad correction applied to 8831 rows


In [20]:
# Re-inspect the per-state row counts after the mapping and the Hyderabad reassignment.
biometric_df["state"].value_counts()


state
Tamil Nadu                                  174935
Andhra Pradesh                              156043
Uttar Pradesh                               147138
Maharashtra                                 143609
Karnataka                                   135773
West Bengal                                 125413
Odisha                                       95958
Kerala                                       93951
Gujarat                                      84630
Telangana                                    82038
Bihar                                        78078
Rajasthan                                    76698
Madhya Pradesh                               66020
Punjab                                       46414
Assam                                        44418
Jharkhand                                    35044
Chhattisgarh                                 30053
Himachal Pradesh                             28723
Haryana                                      25083
Uttarakhand              

In [21]:
# Drop rows whose state, district or pincode is the placeholder "<unset>".
#
# The comparison is done on a stripped, lower-cased string view of each column:
#   * state/district were Title-Cased earlier, which turns "<unset>" into
#     "<Unset>", so an exact match against "<unset>" can never succeed;
#   * pincode is numeric, and a number never compares equal to a string, so it
#     is cast to text first (a no-op match for real pincodes).
before = len(biometric_df)

key_columns = ["state", "district", "pincode"]
normalized_keys = biometric_df[key_columns].astype(str).apply(
    lambda column: column.str.strip().str.lower()
)
has_unset = normalized_keys.eq("<unset>").any(axis=1)

biometric_df = biometric_df[~has_unset]

after = len(biometric_df)

print(f"Removed {before - after} rows with <unset> in key columns")


Removed 0 rows with <unset> in key columns


In [22]:
# Write the cleaned frame to disk; index=False leaves the row index out of the file.
cleaned_output_path = "../data/processed/cleaned/biometric_clean.csv"
biometric_df.to_csv(cleaned_output_path, index=False)
