In [None]:
import pandas as pd
# Load the merged raw demographic CSV from the interim data folder
# (path is relative to the notebook's working directory).
demographic_df = pd.read_csv("../data/processed/interim/demographic_raw_merged.csv")


In [None]:
# Preview the first rows of the loaded frame.
demographic_df.head()

In [None]:
# Show the column labels present in the loaded data.
demographic_df.columns

In [None]:
# (rows, columns) before any cleaning; baseline for the later shape checks.
demographic_df.shape

In [None]:
# Normalise the two location columns: trim surrounding whitespace and
# title-case every value so spelling/casing variants compare equal later on.
# Note: str.title() also capitalises a letter that follows punctuation, so a
# value such as "<unset>" becomes "<Unset>" after this step.
for location_col in ("state", "district"):
    demographic_df[location_col] = (
        demographic_df[location_col].str.strip().str.title()
    )

In [None]:
# Replace the trailing-underscore column label with a descriptive one.
# Labels not listed in the mapping are left unchanged.
demographic_df = demographic_df.rename(
    mapper={"demo_age_17_": "demo_age_17_plus"},
    axis="columns",
)


In [None]:
# Convert the "date" column to datetimes. dayfirst=True tells pandas to read
# ambiguous day/month strings with the day first.
demographic_df["date"] = pd.to_datetime(demographic_df["date"], dayfirst=True)

In [None]:
# Count rows whose district is the value 100000 (checked both as text and as
# an integer) -- presumably a placeholder rather than a real district name.
len(demographic_df.loc[demographic_df["district"].isin(["100000", 100000])])


In [None]:
# Count rows whose state is the value 100000 (checked both as text and as
# an integer) -- presumably a placeholder rather than a real state name.
len(demographic_df.loc[demographic_df["state"].isin(["100000", 100000])])


In [None]:
# Keep only rows where neither location column holds the value 100000
# (text or integer form). A row is dropped if either column matches.
demographic_df = demographic_df[
    ~demographic_df["district"].isin(["100000", 100000])
    & ~demographic_df["state"].isin(["100000", 100000])
]

In [None]:
# (rows, columns) after removing the rows whose state/district was 100000.
demographic_df.shape

In [None]:
# Remove rows that are exact duplicates across all columns, keeping the
# first occurrence (pandas default).
demographic_df = demographic_df.drop_duplicates()

In [None]:
# (rows, columns) after de-duplication.
demographic_df.shape

In [None]:
# Number of missing values in each column.
demographic_df.isnull().sum()


In [None]:
# Intermediate export of the frame as cleaned so far (row index omitted).
# The final export at the end of this notebook writes to the same path and
# replaces this file.
demographic_df.to_csv(path_or_buf="../data/processed/cleaned/demographic_clean.csv", index=False)


In [None]:
# Inspect the distinct state labels and how many rows carry each one.
demographic_df["state"].value_counts()


In [None]:
# Lookup from a variant/misspelled/legacy state label to its canonical name.
# It is written as (canonical, variants) groups and flattened into the
# variant -> canonical dict that Series.replace expects.
state_mapping = {
    variant: canonical
    for canonical, variants in (
        (
            "West Bengal",
            ("West Bangal", "Westbengal", "West  Bengal", "West Bengli"),
        ),
        ("Odisha", ("Orissa",)),
        ("Jammu And Kashmir", ("Jammu & Kashmir",)),
        ("Puducherry", ("Pondicherry",)),
        (
            # The two former union territories are folded into the merged one.
            "Dadra And Nagar Haveli And Daman And Diu",
            (
                "Dadra And Nagar Haveli",
                "Dadra & Nagar Haveli",
                "Daman And Diu",
                "Daman & Diu",
            ),
        ),
        ("Andaman And Nicobar Islands", ("Andaman & Nicobar Islands",)),
        ("Uttarakhand", ("Uttaranchal",)),
        ("Chhattisgarh", ("Chhatisgarh",)),
    )
    for variant in variants
}

In [None]:
# Swap every state label that appears as a key in state_mapping for its
# canonical spelling; labels not in the mapping are left untouched.
demographic_df["state"] = demographic_df["state"].replace(state_mapping)


In [None]:
# Lookup for "state" values that are actually city/locality names: each
# place name maps to the state it should be recorded under.
state_correction = dict(
    [
        ("Darbhanga", "Bihar"),
        ("Jaipur", "Rajasthan"),
        ("Nagpur", "Maharashtra"),
        ("Puttenahalli", "Karnataka"),
        ("Balanagar", "Telangana"),
        ("Madanapalle", "Andhra Pradesh"),
        ("Raja Annamalai Puram", "Tamil Nadu"),
    ]
)


In [None]:
# Replace place names found in the "state" column with the state given in
# state_correction; values not listed there are left as they are.
demographic_df["state"] = demographic_df["state"].replace(state_correction)


In [None]:
# -----------------------------
# Fix Hyderabad → Telangana
# -----------------------------
# Every row whose district is Hyderabad gets its state overwritten with
# "Telangana", whatever the state column held before.

mask = (
    demographic_df["district"]
    .astype(str)   # non-string values become text so the .str steps apply
    .str.strip()
    .str.title()
    .eq("Hyderabad")
)
affected_rows = mask.sum()  # number of rows the correction touches

demographic_df.loc[mask, "state"] = "Telangana"
print(f"✔ Hyderabad correction applied to {affected_rows} rows")


In [None]:
# -----------------------------
# Fix Adilabad → Telangana
# -----------------------------
# Every row whose district is Adilabad gets its state overwritten with
# "Telangana", whatever the state column held before.

mask = (
    demographic_df["district"].astype(str).str.strip().str.title() == "Adilabad"
)

# Number of rows the correction touches; used only for the log line.
affected_rows = mask.sum()

demographic_df.loc[mask, "state"] = "Telangana"

# The message names Adilabad so this step is distinguishable from the
# Hyderabad correction in the notebook output.
print(f"✔ Adilabad correction applied to {affected_rows} rows")


In [None]:
# Re-inspect the state labels and their row counts after the corrections.
demographic_df["state"].value_counts()


In [None]:
# Remove rows with <unset> in key location columns
#
# The match ignores case and surrounding whitespace. "state" and "district"
# were title-cased earlier in this notebook, and str.title() turns "<unset>"
# into "<Unset>", so an exact comparison with "<unset>" would never match
# those two columns.


def _is_unset(column):
    """Return a boolean Series that is True where the value is "<unset>".

    Values are converted to text first, so numeric columns and missing
    values are handled without error; neither can equal "<unset>", so such
    rows are kept.
    """
    return column.astype(str).str.strip().str.lower() == "<unset>"


before = len(demographic_df)

# A row is dropped when any one of the three key columns is unset.
demographic_df = demographic_df[
    ~(
        _is_unset(demographic_df["state"])
        | _is_unset(demographic_df["district"])
        | _is_unset(demographic_df["pincode"])
    )
]

after = len(demographic_df)

print(f"Removed {before - after} rows with <unset> in key columns")


In [None]:
# Final export of the cleaned frame (row index omitted). This writes to the
# same path as the earlier intermediate export and replaces that file.
demographic_df.to_csv(path_or_buf="../data/processed/cleaned/demographic_clean.csv", index=False)
