In [None]:
import os
import pandas as pd

# -------------------------
# PATHS
# -------------------------
# Root folders: logs_folder holds the case dictionary JSON,
# out_folder holds the master CSV and the cleaned CSV.
logs_folder = r"D:\TRAINING MODEL\logs"
out_folder  = r"D:\TRAINING MODEL\data\processed"

# Files
case_dict_file = os.path.join(logs_folder, "case_dictionary.json")
master_csv   = os.path.join(out_folder, "master_dataset.csv")              # original input, only ever read here
cleaned_csv  = os.path.join(out_folder, "master_dataset_cleaned.csv")      # processed output of this cell

# -------------------------
# LOAD FILE
# -------------------------
# Read the wide master table and trim stray whitespace from every header
# so later lookups by column name are reliable.
final_df = pd.read_csv(master_csv).rename(columns=str.strip)

# -------------------------
# CREATE MAPPING DICTIONARIES
# -------------------------

# Sex encoding
sex_map = dict(Male=1, Female=0)

# Age range encoding: brackets are listed youngest to oldest and the code
# is simply the bracket's position in that list.
age_brackets = (
    "Under 1",
    "1-4",
    "5-9",
    "10-14",
    "15-18",
    "19-24",
    "25-29",
    "30-34",
    "35-39",
    "40-44",
    "45-49",
    "50-54",
    "55-59",
    "60-64",
    "65-69",
)
age_map = {label: code for code, label in enumerate(age_brackets)}
# Every spelling of the open-ended top bracket shares the next code (15).
age_map.update(dict.fromkeys(("70", "70 Over", "70 & OVER"), len(age_brackets)))

# Consultation_Type encoding: codes start at 1 and follow order of first
# appearance in the data (missing values get no code).
consult_values = final_df["Consultation_Type"].dropna().unique()
consult_map = dict(zip(consult_values, range(1, len(consult_values) + 1)))

# Case encoding: same scheme as Consultation_Type.
case_values = final_df["Case"].dropna().unique()
case_map = dict(zip(case_values, range(1, len(case_values) + 1)))

# -------------------------
# BUILD AGE+SEX MAPPING DICTIONARY
# -------------------------
# Identifier columns are selected by name; every other column is treated as
# an "<age range> <sex>" count column. Selecting by name (instead of slicing
# columns[3:]) keeps the reshape correct even if the CSV column order changes.
id_cols = ["Month_year", "Consultation_Type", "Case"]
value_cols = [col for col in final_df.columns if col not in id_cols]

# mapping_dict: column header -> {"Age_range": ..., "Sex": ...}.
# The last whitespace-separated token is the sex; everything before it is
# the age range (e.g. "70 & OVER Male" -> "70 & OVER" / "Male").
mapping_dict = {}
for col in value_cols:
    col_clean = col.strip()
    parts = col_clean.split()
    if len(parts) < 2:
        # No separate sex token: leave the column out so its rows fall back
        # to "Unknown" below.
        continue
    *age_tokens, sex = parts
    mapping_dict[col_clean] = {"Age_range": " ".join(age_tokens), "Sex": sex}

# -------------------------
# RESHAPE INTO LONG FORMAT
# -------------------------
# One output row per (identifier row, age/sex column); the original header
# goes to "Age_Sex" and the cell value to "Total".
reshaped_df = final_df.melt(
    id_vars=id_cols,
    value_vars=value_cols,
    var_name="Age_Sex",
    value_name="Total",
)

# Clean Age_Sex column so it matches the stripped keys of mapping_dict
reshaped_df["Age_Sex"] = reshaped_df["Age_Sex"].str.strip()

# Map Age_range and Sex; headers that could not be parsed become "Unknown"
reshaped_df["Age_range"] = reshaped_df["Age_Sex"].map(
    lambda name: mapping_dict.get(name, {}).get("Age_range", "Unknown")
)
reshaped_df["Sex"] = reshaped_df["Age_Sex"].map(
    lambda name: mapping_dict.get(name, {}).get("Sex", "Unknown")
)

# -------------------------
# SPLIT MONTH_YEAR INTO NUMERIC MONTH + YEAR
# -------------------------
# Parse once, then derive both parts from the parsed series. Values that
# cannot be parsed become NaT (errors="coerce"), so their Month/Year are
# missing rather than raising.
month_year_ts = pd.to_datetime(reshaped_df["Month_year"], errors="coerce")
reshaped_df["Month_year"] = month_year_ts
reshaped_df["Month"] = month_year_ts.dt.month
reshaped_df["Year"] = month_year_ts.dt.year

# -------------------------
# ENCODE TO NUMERIC
# -------------------------
# Replace each categorical column with its integer code. Any value missing
# from its mapping is encoded as -1.
column_encodings = (
    ("Sex", sex_map),
    ("Age_range", age_map),
    ("Consultation_Type", consult_map),
    ("Case", case_map),
)
for column, encoding in column_encodings:
    reshaped_df[column] = reshaped_df[column].map(encoding).fillna(-1).astype(int)

# -------------------------
# FINAL NUMERIC STRUCTURE
# -------------------------
# Keep only the model-ready columns, in this order; helper columns such as
# Age_Sex and the parsed Month_year are dropped.
final_columns = ["Year", "Month", "Consultation_Type", "Case", "Sex", "Age_range", "Total"]
reshaped_df = reshaped_df[final_columns]

# -------------------------
# SAVE TO CLEANED CSV
# -------------------------
# Written without the index so the file contains only the columns above.
reshaped_df.to_csv(cleaned_csv, index=False)

row_count, column_count = reshaped_df.shape
print(f"✅ Cleaned numeric CSV saved as: {cleaned_csv}")
print(f"Number of rows: {row_count}, Number of columns: {column_count}")

# -------------------------
# PRINT ENCODINGS
# -------------------------
# Echo every label -> code table so the numeric CSV can be decoded later.
print("\n🔑 Encodings Used:")
for label, encoding in (
    ("Sex:", sex_map),
    ("Age_range:", age_map),
    ("Consultation_Type:", consult_map),
    ("Case:", case_map),
):
    print(label, encoding)


📂 No cleaned file found, processing from original...
✅ Cleaned dataset created: D:\TRAINING MODEL\data\processed\master_dataset_cleaned.csv
🎯 Working with dataset: D:\TRAINING MODEL\data\processed\master_dataset_cleaned.csv
✅ Final columns: ['Month_year', 'Consultation_Type', 'Case', 'Sex', 'Age_Range', 'Count']
  Month_year  Consultation_Type  Case  Sex  Age_Range Count
0   2023 - 4                NaN   217  NaN        NaN  Male
1   2023 - 4                NaN   218  NaN        NaN  Male
2   2023 - 4                NaN   219  NaN        NaN  Male
3   2023 - 4                NaN   220  NaN        NaN  Male
4   2023 - 4                NaN   221  NaN        NaN  Male
5   2023 - 4                NaN   222  NaN        NaN  Male
6   2023 - 4                NaN   223  NaN        NaN  Male
7   2023 - 4                NaN   224  NaN        NaN  Male
8   2023 - 4                NaN   225  NaN        NaN  Male
9   2023 - 4                NaN   226  NaN        NaN  Male
