In [31]:
import pandas as pd

In [32]:
biometric_df = pd.read_csv("../data/processed/interim/biometric_raw_merged.csv")

In [33]:
biometric_df.head()

Unnamed: 0,date,state,district,pincode,bio_age_5_17,bio_age_17_
0,01-03-2025,Haryana,Mahendragarh,123029,280,577
1,01-03-2025,Bihar,Madhepura,852121,144,369
2,01-03-2025,Jammu and Kashmir,Punch,185101,643,1091
3,01-03-2025,Bihar,Bhojpur,802158,256,980
4,01-03-2025,Tamil Nadu,Madurai,625514,271,815


In [34]:
biometric_df.columns


Index(['date', 'state', 'district', 'pincode', 'bio_age_5_17', 'bio_age_17_'], dtype='object')

In [35]:
biometric_df = biometric_df.rename(columns={
    "bio_age_5_17" : "age_5_17",
    "bio_age_17_": "age_17_plus"
})


In [36]:
biometric_df.shape

(1861108, 6)

In [37]:
biometric_df["state"] = biometric_df["state"].str.strip().str.title()

In [38]:
biometric_df["district"] = biometric_df["district"].str.strip().str.title()

In [39]:
biometric_df["date"] = pd.to_datetime(biometric_df["date"], dayfirst=True)

In [40]:
biometric_df.dtypes

date           datetime64[ns]
state                  object
district               object
pincode                 int64
age_5_17                int64
age_17_plus             int64
dtype: object

In [41]:
biometric_df = biometric_df.drop_duplicates()


In [42]:
biometric_df.shape

(1766159, 6)

In [43]:
biometric_df[
    biometric_df["district"].isin(["100000", 100000]) |
    biometric_df["state"].isin(["100000", 100000])
].shape[0]


0

In [44]:
biometric_df.shape

(1766159, 6)

In [45]:
biometric_df.isnull().sum()


date           0
state          0
district       0
pincode        0
age_5_17       0
age_17_plus    0
dtype: int64

In [46]:
biometric_df["state"].value_counts()


state
Tamil Nadu                                  174934
Andhra Pradesh                              160231
Uttar Pradesh                               147138
Maharashtra                                 143609
Karnataka                                   135773
West Bengal                                 125316
Kerala                                       93951
Gujarat                                      84630
Odisha                                       83191
Bihar                                        78078
Telangana                                    77850
Rajasthan                                    76698
Madhya Pradesh                               66020
Punjab                                       46414
Assam                                        44418
Jharkhand                                    35044
Chhattisgarh                                 30048
Himachal Pradesh                             28723
Haryana                                      25083
Uttarakhand              

In [47]:
biometric_state_mapping = {
    "Orissa": "Odisha",
    "Pondicherry": "Puducherry",

    "West Bangal": "West Bengal",
    "Westbengal": "West Bengal",
    "West  Bengal": "West Bengal",

    "Jammu & Kashmir": "Jammu And Kashmir",

    "Chhatisgarh": "Chhattisgarh",
    "Uttaranchal": "Uttarakhand",

    "Tamilnadu": "Tamil Nadu",

    "Dadra And Nagar Haveli": "Dadra And Nagar Haveli And Daman And Diu",
    "Dadra & Nagar Haveli": "Dadra And Nagar Haveli And Daman And Diu",
    "Daman And Diu": "Dadra And Nagar Haveli And Daman And Diu",
    "Daman & Diu": "Dadra And Nagar Haveli And Daman And Diu",

    "Andaman & Nicobar Islands": "Andaman And Nicobar Islands"
}


In [48]:
biometric_df["state"] = biometric_df["state"].replace(biometric_state_mapping)


In [49]:
telangana_districts = [
    "Adilabad"
    "Hyderabad",
    "K.V.Rangareddy",
    "K.V. Rangareddy",
    "Karim Nagar",
    "Karimnagar",
    "Khammam",
    "Mahabub Nagar",
    "Mahabubnagar",
    "Mahbubnagar",
    "Medak",
    "Nalgonda",
    "Nizamabad",
    "Rangareddi",
    "Warangal",
    "Y.S.R. Kadapa"
    # add more if needed
]


mask = (
    biometric_df["district"]
    .astype(str)
    .str.strip()
    .str.title()
    .isin(telangana_districts)
)

affected_rows = mask.sum()

biometric_df.loc[mask, "state"] = "Telangana"

print(f"✔ Telangana correction applied to {affected_rows} rows")


✔ Telangana correction applied to 70039 rows


In [50]:
# -----------------------------
# Fix Kamrup → Assam
# -----------------------------

mask = (
    biometric_df["district"].astype(str).str.strip().str.title() == "Kamrup"
)

affected_rows = mask.sum()

biometric_df.loc[mask, "state"] = "Assam"

print(f"✔ Kamrup correction applied to {affected_rows} rows")

✔ Kamrup correction applied to 2469 rows


In [51]:
## -----------------------------
# Fix Leh → Ladakh
# -----------------------------

mask = biometric_df["district"].astype(str).str.strip().str.title().isin(
    ["Leh", "Leh(Ladakh)"]
)

affected_rows = mask.sum()
biometric_df.loc[mask, "state"] = "Ladakh"

print(f"✔ Leh correction applied to {affected_rows} rows")

✔ Leh correction applied to 650 rows


In [52]:
## -----------------------------
# Fix Leh → Ladakh
# -----------------------------

mask = (
    biometric_df["district"].astype(str).str.strip().str.title() == "Leh"
)

affected_rows = mask.sum()
biometric_df.loc[mask, "state"] = "Ladakh"

print(f"✔ Leh correction applied to {affected_rows} rows")

✔ Leh correction applied to 650 rows


In [53]:
## -----------------------------
# Fix Leh → Ladakh
# -----------------------------

mask = (
    biometric_df["district"].astype(str).str.strip().str.title() == "Leh(Ladakh)"
)

affected_rows = mask.sum()
biometric_df.loc[mask, "state"] = "Ladakh"

print(f"✔ Leh correction applied to {affected_rows} rows")

✔ Leh correction applied to 0 rows


In [54]:
biometric_df["state"].value_counts()


state
Tamil Nadu                                  174935
Uttar Pradesh                               147138
Maharashtra                                 143609
Karnataka                                   135773
Andhra Pradesh                              130521
West Bengal                                 125413
Telangana                                   107560
Odisha                                       95958
Kerala                                       93951
Gujarat                                      84630
Bihar                                        78078
Rajasthan                                    76698
Madhya Pradesh                               66020
Punjab                                       46414
Assam                                        44418
Jharkhand                                    35044
Chhattisgarh                                 30053
Himachal Pradesh                             28723
Haryana                                      25083
Uttarakhand              

In [55]:
before = len(biometric_df)

biometric_df = biometric_df[
    (biometric_df["state"] != "<unset>") &
    (biometric_df["district"] != "<unset>") &
    (biometric_df["pincode"] != "<unset>")
]

after = len(biometric_df)

print(f"Removed {before - after} rows with <unset> in key columns")


Removed 0 rows with <unset> in key columns


In [56]:
biometric_df.to_csv(
    "../data/processed/cleaned/biometric_clean.csv",
    index=False
)
