In [1]:
import os
import pandas as pd

# -------------------------
# PATHS
# -------------------------
# Everything lives under a single base folder: logs in one subfolder,
# processed data in another.
base_folder = r"D:\TRAINING MODEL"
logs_folder = os.path.join(base_folder, "logs")
out_folder = os.path.join(base_folder, "data", "processed")

# Make sure both working directories exist before anything touches them.
for _required_dir in (out_folder, logs_folder):
    os.makedirs(_required_dir, exist_ok=True)


# Files
#   master_csv     - original dataset (keep safe)
#   cleaned_csv    - processed dataset (the one we keep using)
#   case_dict_file - location of the case dictionary JSON inside the logs folder
master_csv, cleaned_csv = (
    os.path.join(out_folder, file_name)
    for file_name in ("master_dataset.csv", "master_dataset_cleaned.csv")
)
case_dict_file = os.path.join(logs_folder, "case_dictionary.json")

# -------------------------
# LOAD FILE
# -------------------------
# Read the master dataset. If the file is missing, say where it was expected
# and stop with a non-zero exit status instead of continuing without data.
try:
    final_df = pd.read_csv(master_csv)
except FileNotFoundError:
    print(f"Error: The file was not found at {master_csv}")
    print("Please make sure the 'master_dataset.csv' file exists in the correct directory.")
    # `exit()` is an interactive convenience injected by the `site` module: it
    # is not guaranteed to exist and, called without arguments, reports
    # success (status 0). Raising SystemExit directly always works and
    # signals the failure to whoever launched the script.
    raise SystemExit(1) from None


# Strip spaces from all column headers so later name-based lookups match
final_df.columns = final_df.columns.str.strip()

# -------------------------
# CREATE MAPPING DICTIONARIES
# -------------------------

# Sex encoding: Male -> 1, Female -> 0
sex_map = dict(Male=1, Female=0)

# Age range encoding: the bands below 70 are numbered 0..14 in ascending order.
age_map = dict(zip(
    (
        "Under 1", "1-4", "5-9", "10-14", "15-18",
        "19-24", "25-29", "30-34", "35-39", "40-44",
        "45-49", "50-54", "55-59", "60-64", "65-69",
    ),
    range(15),
))
# Three spellings of the top band are accepted; all of them share code 15.
age_map.update(dict.fromkeys(("70", "70 Over", "70 & OVER"), 15))

def _first_seen_codes(column):
    """Number the distinct non-null values of *column* 1, 2, 3, ... in order of first appearance."""
    distinct_values = column.dropna().unique()
    return dict(zip(distinct_values, range(1, len(distinct_values) + 1)))


# Consultation_Type encoding
consult_map = _first_seen_codes(final_df["Consultation_Type"])

# Case encoding
case_map = _first_seen_codes(final_df["Case"])

# -------------------------
# BUILD AGE+SEX MAPPING DICTIONARY
# -------------------------
# The identifier columns are picked by NAME and every other column is treated
# as an "<age range> <sex>" count column. Slicing by position (columns[3:])
# would silently melt an identifier column as data if the CSV column order
# ever changed.
id_columns = ["Month_year", "Consultation_Type", "Case"]
age_sex_columns = [col for col in final_df.columns if col not in id_columns]

mapping_dict = {}
for col in age_sex_columns:
    col_clean = col.strip()
    parts = col_clean.split()
    # A usable header needs at least an age part and a sex part; the last
    # whitespace-separated token is the sex, everything before it is the age range.
    if len(parts) >= 2:
        sex = parts[-1]
        age = " ".join(parts[:-1])
        mapping_dict[col_clean] = {"Age_range": age, "Sex": sex}

# -------------------------
# RESHAPE INTO LONG FORMAT
# -------------------------
# One output row per (Month_year, Consultation_Type, Case, age/sex column),
# with the cell value stored in "Total".
reshaped_df = final_df.melt(
    id_vars=id_columns,
    value_vars=age_sex_columns,
    var_name="Age_Sex",
    value_name="Total"
)

# Clean Age_Sex column so its labels match the keys of mapping_dict
reshaped_df["Age_Sex"] = reshaped_df["Age_Sex"].str.strip()

# Map Age_range and Sex safely: headers that could not be split into
# age + sex (and therefore have no mapping_dict entry) become "Unknown".
unknown_entry = {"Age_range": "Unknown", "Sex": "Unknown"}
reshaped_df["Age_range"] = reshaped_df["Age_Sex"].map(
    lambda label: mapping_dict.get(label, unknown_entry)["Age_range"]
)
reshaped_df["Sex"] = reshaped_df["Age_Sex"].map(
    lambda label: mapping_dict.get(label, unknown_entry)["Sex"]
)

# -------------------------
# SPLIT MONTH_YEAR INTO NUMERIC MONTH + YEAR
# -------------------------
# Values that cannot be parsed as dates become NaT (errors="coerce"), which
# in turn leaves Month and Year missing for those rows.
# NOTE(review): no explicit `format=` is passed, so pandas has to infer the
# date layout - consider supplying one once the layout of Month_year is confirmed.
month_year_parsed = pd.to_datetime(reshaped_df["Month_year"], errors="coerce")
reshaped_df["Month_year"] = month_year_parsed
reshaped_df["Month"] = month_year_parsed.dt.month
reshaped_df["Year"] = month_year_parsed.dt.year

# -------------------------
# HANDLE MISSING/EMPTY 'TOTAL' VALUES
# -------------------------
# Convert 'Total' to numbers; anything that is empty or not numeric becomes
# NaN (errors='coerce') and is then counted as 0 before the integer cast.
total_numeric = pd.to_numeric(reshaped_df["Total"], errors='coerce')
reshaped_df["Total"] = total_numeric.fillna(0).astype(int)


# -------------------------
# ENCODE TO NUMERIC
# -------------------------
# Replace each categorical column with its integer code. Values that have no
# entry in the corresponding map are encoded as -1.
for column_name, code_table in (
    ("Sex", sex_map),
    ("Age_range", age_map),
    ("Consultation_Type", consult_map),
    ("Case", case_map),
):
    encoded = reshaped_df[column_name].map(code_table)
    reshaped_df[column_name] = encoded.fillna(-1).astype(int)

# -------------------------
# FINAL NUMERIC STRUCTURE
# -------------------------
final_columns = ["Year", "Month", "Consultation_Type", "Case", "Sex", "Age_range", "Total"]

# Keep only the final columns, discard rows whose year or month could not be
# parsed, then store Year and Month as integers (they hold missing values,
# and so cannot be integer-typed, until those rows are gone).
reshaped_df = (
    reshaped_df[final_columns]
    .dropna(subset=['Year', 'Month'])
    .astype({'Year': int, 'Month': int})
)


# -------------------------
# SAVE TO CLEANED CSV
# -------------------------
# Write the long-format numeric table without the DataFrame index column.
reshaped_df.to_csv(cleaned_csv, index=False)

print(f"✅ Cleaned numeric CSV saved as: {cleaned_csv}")
print(f"Number of rows: {reshaped_df.shape[0]}, Number of columns: {reshaped_df.shape[1]}")

# -------------------------
# PRINT ENCODINGS
# -------------------------
# Show every label -> code table so the numeric CSV can be decoded later.
print("\n🔑 Encodings Used:")
for encoding_label, encoding_table in (
    ("Sex:", sex_map),
    ("Age_range:", age_map),
    ("Consultation_Type:", consult_map),
    ("Case:", case_map),
):
    print(encoding_label, encoding_table)


  reshaped_df["Month_year"] = pd.to_datetime(reshaped_df["Month_year"], errors="coerce")


✅ Cleaned numeric CSV saved as: D:\TRAINING MODEL\data\processed\master_dataset_cleaned.csv
Number of rows: 20032, Number of columns: 7

🔑 Encodings Used:
Sex: {'Male': 1, 'Female': 0}
Age_range: {'Under 1': 0, '1-4': 1, '5-9': 2, '10-14': 3, '15-18': 4, '19-24': 5, '25-29': 6, '30-34': 7, '35-39': 8, '40-44': 9, '45-49': 10, '50-54': 11, '55-59': 12, '60-64': 13, '65-69': 14, '70': 15, '70 Over': 15, '70 & OVER': 15}
Consultation_Type: {1: 1, 2: 2, 3: 3}
Case: {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20, 21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26, 27: 27, 28: 28, 29: 29, 30: 30, 31: 31, 32: 32, 33: 33, 34: 34, 35: 35, 36: 36, 37: 37, 38: 38, 39: 39, 40: 40, 41: 41, 42: 42, 43: 43, 44: 44, 45: 45, 46: 46, 47: 47, 48: 48, 49: 49, 50: 50, 51: 51, 52: 52, 53: 53, 54: 54, 55: 55, 56: 56, 57: 57, 58: 58, 59: 59, 60: 60, 61: 61, 62: 62, 63: 63, 64: 64, 65: 65, 66: 66, 67: 67, 68: 68, 69: 69,