In [None]:
import pandas as pd

In [None]:
# Load the merged raw enrolment extract that this notebook cleans.
enrolment_df = pd.read_csv(
    filepath_or_buffer="../data/processed/interim/enrolment_raw_merged.csv"
)


In [None]:
# Preview the first rows of the freshly loaded frame.
enrolment_df.head()

In [None]:
# List the column names available in the raw extract.
enrolment_df.columns

In [None]:
# Rename the adult age-band column.
# NOTE(review): the source name says "18 and greater" but the new name says
# "17 plus" - confirm which age band the data actually represents.
enrolment_df = enrolment_df.rename(
    {"age_18_greater": "age_17_plus"},
    axis="columns",
)

In [None]:
# Trim surrounding whitespace and Title-Case both location columns so that
# spelling variants differing only in case/padding collapse together.
# Note: the .str accessor yields NaN for any non-string value.
for location_col in ("state", "district"):
    enrolment_df[location_col] = (
        enrolment_df[location_col].str.strip().str.title()
    )


In [None]:
# Count missing values per column.
enrolment_df.isnull().sum()

In [None]:
# (rows, columns) before duplicate rows are dropped.
enrolment_df.shape


In [None]:
# Keep the first occurrence of every fully identical row and discard the rest.
is_repeat_row = enrolment_df.duplicated(keep="first")
enrolment_df = enrolment_df[~is_repeat_row]

In [None]:
# Convert the "date" column to datetime; dayfirst=True makes ambiguous
# values be read as day/month rather than month/day.
parsed_dates = pd.to_datetime(enrolment_df["date"], dayfirst=True)
enrolment_df["date"] = parsed_dates


In [None]:
# Check column dtypes after the date conversion.
enrolment_df.dtypes

In [None]:
# (rows, columns) after duplicate rows were dropped.
enrolment_df.shape


In [None]:
# Show rows whose district is the literal string "100000"
# (presumably a placeholder rather than a real district - TODO confirm).
enrolment_df[enrolment_df["district"] == "100000"]


In [None]:
# Number of rows whose district is the literal string "100000".
enrolment_df[enrolment_df["district"] == "100000"].shape[0]

In [None]:
# Show rows whose state is the literal string "100000"
# (presumably a placeholder rather than a real state - TODO confirm).
enrolment_df[enrolment_df["state"] == "100000"]

In [None]:
# Number of rows whose state is the literal string "100000".
enrolment_df[enrolment_df["state"] == "100000"].shape[0]

In [None]:
# Discard every row where either location column holds the value 100000,
# matched both as a string and as an integer.
placeholder_codes = ["100000", 100000]
has_placeholder = (
    enrolment_df["district"].isin(placeholder_codes)
    | enrolment_df["state"].isin(placeholder_codes)
)
enrolment_df = enrolment_df.loc[~has_placeholder]


In [None]:
# (rows, columns) after the "100000" rows were removed.
enrolment_df.shape


In [None]:
# Row count per state value, used to spot spelling variants before mapping.
enrolment_df["state"].value_counts()

In [None]:
# Canonical state / union-territory name -> spelling variants to fold into it.
# Variants are written in Title Case, matching the normalised "state" column.
_state_variants = {
    "Odisha": ["Orissa"],
    "Jammu And Kashmir": ["Jammu & Kashmir"],
    "Puducherry": ["Pondicherry"],
    "West Bengal": ["West  Bengal", "West Bangal", "Westbengal"],
    "Dadra And Nagar Haveli And Daman And Diu": [
        "Dadra And Nagar Haveli",
        "Dadra & Nagar Haveli",
        "Daman And Diu",
        "Daman & Diu",
        "The Dadra And Nagar Haveli And Daman And Diu",
    ],
    "Andaman And Nicobar Islands": ["Andaman & Nicobar Islands"],
}

# Flatten to the {variant: canonical} lookup that Series.replace expects.
enrolment_state_mapping = {
    variant: canonical
    for canonical, variants in _state_variants.items()
    for variant in variants
}


In [None]:
# -----------------------------
# Fix Hyderabad → Telangana
# Every row whose district is Hyderabad has its state overwritten with
# "Telangana", whatever state it carried before.
# -----------------------------

# Re-normalise defensively so the match does not depend on earlier cells.
mask = (
    enrolment_df["district"].astype(str).str.strip().str.title() == "Hyderabad"
)

# Number of rows the correction touches (True counts as 1).
affected_rows = mask.sum()

enrolment_df.loc[mask, "state"] = "Telangana"

print(f"✔ Hyderabad correction applied to {affected_rows} rows")


In [None]:
# -----------------------------
# Fix Adilabad → Telangana
# Every row whose district is Adilabad has its state overwritten with
# "Telangana", whatever state it carried before.
# -----------------------------

# Re-normalise defensively so the match does not depend on earlier cells.
mask = (
    enrolment_df["district"].astype(str).str.strip().str.title() == "Adilabad"
)

# Number of rows the correction touches (True counts as 1).
affected_rows = mask.sum()

enrolment_df.loc[mask, "state"] = "Telangana"

# The message names the district actually corrected in this cell.
print(f"✔ Adilabad correction applied to {affected_rows} rows")


In [None]:
# Fold known spelling variants of state names into their canonical form;
# values that are not keys of the mapping pass through unchanged.
state_standardised = enrolment_df["state"].replace(
    to_replace=enrolment_state_mapping
)
enrolment_df["state"] = state_standardised


In [None]:
# normalize state & district names
enrolment_df["state_clean"] = (
    enrolment_df["state"]
    .astype(str)
    .str.strip()
    .str.lower()
)

enrolment_df["district_clean"] = (
    enrolment_df["district"]
    .astype(str)
    .str.strip()
    .str.lower()
)



In [None]:
def _is_unset(column):
    """Flag values equal to "<unset>", ignoring case and surrounding whitespace.

    The column is cast to str first so numeric columns (e.g. a numeric
    pincode) can be compared without errors.
    """
    return column.astype(str).str.strip().str.lower() == "<unset>"


before = len(enrolment_df)

# "state" and "district" were Title-Cased earlier in this notebook, which turns
# "<unset>" into "<Unset>"; an exact comparison against "<unset>" would never
# match, so compare on a lower-cased form instead.
unset_rows = (
    _is_unset(enrolment_df["state"])
    | _is_unset(enrolment_df["district"])
    | _is_unset(enrolment_df["pincode"])
)

enrolment_df = enrolment_df[~unset_rows]

after = len(enrolment_df)

print(f"Removed {before - after} rows with <unset> in key columns")


In [None]:
# Row count per state after the corrections and filtering above.
enrolment_df["state"].value_counts()

In [None]:
# Persist the cleaned frame without the index. As far as this notebook shows,
# the helper columns state_clean / district_clean are written out as well.
enrolment_df.to_csv(
    path_or_buf="../data/processed/cleaned/enrolment_clean.csv", index=False
)


In [None]:
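# NOTE: everything below is a disabled, one-off generator that writes one
# district-cleaning notebook per state; it is kept commented out for reference.
# import pandas as pd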
# import nbformat
# from nbformat.v4 import new_notebook, new_markdown_cell, new_code_cell
# from pathlib import Path
#
# # =====================================================
# # Paths
# # =====================================================
# CLEAN_DIR = Path("../data/processed/cleaned")
# NOTEBOOK_DIR = Path("../Notebooks/state_wise_cleaning")
# REFERENCE_DIR = Path("../data/processed/reference")
#
# NOTEBOOK_DIR.mkdir(parents=True, exist_ok=True)
# REFERENCE_DIR.mkdir(parents=True, exist_ok=True)
#
# # =====================================================
# # Load enrolment data to get states (source of truth)
# # =====================================================
# enrol_df = pd.read_csv(CLEAN_DIR / "enrolment_clean.csv")
#
# enrol_df["state"] = enrol_df["state"].astype(str).str.strip().str.title()
# enrol_df["district"] = enrol_df["district"].astype(str).str.strip().str.title()
#
# states = sorted(enrol_df["state"].dropna().unique())
#
# print(f"Creating notebooks for {len(states)} states")
#
#
# # =====================================================
# # Notebook template generation
# # =====================================================
# for state in states:
#     safe_state = state.replace(" ", "_")
#     nb_path = NOTEBOOK_DIR / f"{safe_state}_district_cleaning.ipynb"
#
#     nb = new_notebook(cells=[
#
#         # ----------------------------
#         # Markdown intro
#         # ----------------------------
#         new_markdown_cell(
#             f"# District Cleaning — {state}\n\n"
#             "This notebook standardizes **district names** for this state across:\n"
#             "- Enrolment data\n"
#             "- Demographic update data\n"
#             "- Biometric update data\n\n"
#             "**All data is saved back to the same cleaned files.**"
#         ),
#
#         # ----------------------------
#         # Load datasets
#         # ----------------------------
#         new_code_cell(
#             """import pandas as pd
# from pathlib import Path
#
# pd.set_option("display.max_rows", None)
# pd.set_option("display.width", None)
#
# CLEAN_DIR = Path("../../data/processed/cleaned")
#
# enrol_df = pd.read_csv(CLEAN_DIR / "enrolment_clean.csv")
# demo_df  = pd.read_csv(CLEAN_DIR / "demographic_clean.csv")
# bio_df   = pd.read_csv(CLEAN_DIR / "biometric_clean.csv")
#
# for df in [enrol_df, demo_df, bio_df]:
#     df["state"] = df["state"].astype(str).str.strip().str.title()
#     df["district"] = df["district"].astype(str).str.strip().str.title()
#
# print("✅ All datasets loaded and normalized (Title Case)")
# """
#         ),
#
#         # ----------------------------
#         # Print districts (FULL HEIGHT)
#         # ----------------------------
#         new_code_cell(
#             f"""STATE_NAME = "{state}"
#
# districts = sorted(
#     set(
#         enrol_df.loc[enrol_df["state"] == STATE_NAME, "district"].dropna()
#         .tolist()
#     )
# )
#
# print(f"State: {{STATE_NAME}}")
# print(f"Number of unique districts: {{len(districts)}}")
#
# pd.DataFrame(
#     {{"District Name": districts}}
# )
# """
#         ),
#
#         # ----------------------------
#         # Mapping instructions
#         # ----------------------------
#         new_markdown_cell(
#             "## District Mapping\n\n"
#             "Add mappings in **Title Case only**.\n\n"
#             "Format:\n"
#             "```python\n"
#             "DISTRICT_MAPPING = {\n"
#             "    \"Correct District\": [\"Wrong Name 1\", \"Wrong Name 2\"],\n"
#             "}\n"
#             "```"
#         ),
#
#         # ----------------------------
#         # Apply mapping (SAFE for all datasets)
#         # ----------------------------
#         new_code_cell(
#             f"""DISTRICT_MAPPING = {{
#     # "Correct District": ["Wrong Variant 1", "Wrong Variant 2"]
# }}
#
# def apply_mapping(df, state, mapping, label):
#     total = 0
#     for correct, wrongs in mapping.items():
#         mask = (
#             (df["state"] == state) &
#             (df["district"].isin(wrongs))
#         )
#         count = mask.sum()
#         df.loc[mask, "district"] = correct
#         total += count
#         if count > 0:
#             print(f"✔ {{label}} → {{correct}} : {{count}} rows fixed")
#     return total
#
# total_fixes = 0
# total_fixes += apply_mapping(enrol_df, STATE_NAME, DISTRICT_MAPPING, "Enrolment")
# total_fixes += apply_mapping(demo_df,  STATE_NAME, DISTRICT_MAPPING, "Demographic")
# total_fixes += apply_mapping(bio_df,   STATE_NAME, DISTRICT_MAPPING, "Biometric")
#
# print(f"✅ Total fixes in {{STATE_NAME}}: {{total_fixes}}")
# """
#         ),
#
#         # ----------------------------
#         # Save back to SAME files
#         # ----------------------------
#         new_code_cell(
#             """enrol_df.to_csv(CLEAN_DIR / "enrolment_clean.csv", index=False)
# demo_df.to_csv(CLEAN_DIR / "demographic_clean.csv", index=False)
# bio_df.to_csv(CLEAN_DIR / "biometric_clean.csv", index=False)
#
# print("💾 All cleaned files saved successfully (overwritten)")
# """
#         )
#     ])
#
#     with open(nb_path, "w", encoding="utf-8") as f:
#         nbformat.write(nb, f)
#
#     print(f"✔ Created: {nb_path}")
#
# print("🎉 All state-wise district cleaning notebooks generated successfully.")
