In [None]:
import os
import pandas as pd

# -------------------------
# CONFIG
# -------------------------
# The source and destination workbooks sit side by side in one folder.
data_dir = "cleanExcel"
input_file = os.path.join(data_dir, "cleanedBook.xlsx")
output_file = os.path.join(data_dir, "numericBook.xlsx")

# -------------------------
# LOAD FILE
# -------------------------
final_df = pd.read_excel(input_file)

# Trim surrounding whitespace from every header so later lookups by column
# name are not defeated by stray spaces in the sheet.
stripped_headers = final_df.columns.str.strip()
final_df.columns = stripped_headers

# -------------------------
# CREATE MAPPING DICTIONARIES
# -------------------------

# Sex encoding: Male -> 1, Female -> 0.
sex_map = dict(Male=1, Female=0)

# Age range encoding: the bounded bands are numbered 0..14 in ascending order.
age_map = {
    band: code
    for code, band in enumerate(
        (
            "Under 1",
            "1-4",
            "5-9",
            "10-14",
            "15-18",
            "19-24",
            "25-29",
            "30-34",
            "35-39",
            "40-44",
            "45-49",
            "50-54",
            "55-59",
            "60-64",
            "65-69",
        )
    )
}
# The open-ended top band is accepted under three spellings, all sharing code 15.
for top_band_alias in ("70", "70 Over", "70 & OVER"):
    age_map[top_band_alias] = 15

# Consultation_Type encoding
consult_map = {name: idx for idx, name in enumerate(final_df["Consultation_Type"].dropna().unique(), start=1)}

# Case encoding
case_map = {name: idx for idx, name in enumerate(final_df["Case"].dropna().unique(), start=1)}

# -------------------------
# BUILD AGE+SEX MAPPING DICTIONARY
# -------------------------
# The identifier columns are picked by name, and every other column is treated
# as a value column. Slicing by position (columns[3:]) silently misbehaves if
# the sheet's column order ever differs from Month_year/Consultation_Type/Case
# first; selecting by name gives the same result for that layout and stays
# correct for any other.
id_columns = ["Month_year", "Consultation_Type", "Case"]
value_columns = [col for col in final_df.columns if col not in id_columns]

# Each value header is split on whitespace: the last word is taken as the sex
# and everything before it (re-joined with single spaces) as the age range.
# Headers with fewer than two words get no entry and fall back to "Unknown"
# below.
# NOTE(review): presumably headers look like "<age range> <sex>" — confirm
# against the cleaned workbook.
mapping_dict = {}
for col in value_columns:
    col_clean = col.strip()
    parts = col_clean.split()
    if len(parts) >= 2:
        mapping_dict[col_clean] = {
            "Age_range": " ".join(parts[:-1]),
            "Sex": parts[-1],
        }

# -------------------------
# RESHAPE INTO LONG FORMAT
# -------------------------
# One output row per (original row, value column); the former column header
# lands in "Age_Sex" and the cell value in "Total".
reshaped_df = final_df.melt(
    id_vars=id_columns,
    value_vars=value_columns,
    var_name="Age_Sex",
    value_name="Total"
)

# Strip spaces from Age_Sex column so it matches the mapping_dict keys
reshaped_df["Age_Sex"] = reshaped_df["Age_Sex"].str.strip()

# Map Age_range and Sex safely: headers missing from mapping_dict become "Unknown"
unknown_entry = {"Age_range": "Unknown", "Sex": "Unknown"}
reshaped_df["Age_range"] = reshaped_df["Age_Sex"].map(
    lambda header: mapping_dict.get(header, unknown_entry)["Age_range"]
)
reshaped_df["Sex"] = reshaped_df["Age_Sex"].map(
    lambda header: mapping_dict.get(header, unknown_entry)["Sex"]
)

# -------------------------
# SPLIT MONTH_YEAR INTO NUMERIC MONTH + YEAR
# -------------------------
# Parse once; values that cannot be read as dates are coerced to NaT instead of
# raising, so their Month/Year come out missing.
parsed_month_year = pd.to_datetime(reshaped_df["Month_year"], errors="coerce")
reshaped_df["Month_year"] = parsed_month_year
reshaped_df["Month"] = parsed_month_year.dt.month
reshaped_df["Year"] = parsed_month_year.dt.year

# -------------------------
# ENCODE TO NUMERIC
# -------------------------
# Replace each label column with its integer code. Labels that are absent from
# the corresponding mapping (including missing values) are encoded as -1.
column_encodings = (
    ("Sex", sex_map),
    ("Age_range", age_map),
    ("Consultation_Type", consult_map),
    ("Case", case_map),
)
for column_name, code_lookup in column_encodings:
    coded = reshaped_df[column_name].map(code_lookup)
    reshaped_df[column_name] = coded.fillna(-1).astype(int)

# -------------------------
# FINAL NUMERIC STRUCTURE
# -------------------------
# Keep only the encoded columns, in the order they should appear in the sheet.
final_columns = ["Year", "Month", "Consultation_Type", "Case", "Sex", "Age_range", "Total"]
reshaped_df = reshaped_df[final_columns]

# -------------------------
# SAVE TO NEW EXCEL
# -------------------------
reshaped_df.to_excel(output_file, index=False)

row_count, column_count = reshaped_df.shape
print(f"✅ Numeric Excel saved as: {output_file}")
print(f"Number of rows: {row_count}, Number of columns: {column_count}")

# -------------------------
# PRINT ENCODINGS
# -------------------------
# Echo every lookup table so the numeric codes in the workbook can be decoded.
print("\n🔑 Encodings Used:")
for heading, encoding in (
    ("Sex:", sex_map),
    ("Age_range:", age_map),
    ("Consultation_Type:", consult_map),
    ("Case:", case_map),
):
    print(heading, encoding)

In [None]:

# Sum "Total" for one slice of the encoded long-format table.
#
# The filter has to run on reshaped_df: that is the frame holding the numeric
# "Case", "Sex" and "Age_range" codes and the "Total" column. final_df is still
# the wide sheet with text labels and has no "Sex"/"Age_range" columns, so
# filtering it raises KeyError on a fresh run.
case_code = 2  # code assigned to a case label by case_map
sex_code = 1   # 1 = Male in sex_map
age_code = 3   # 3 = "10-14" in age_map

selected_rows = (
    (reshaped_df["Case"] == case_code)
    & (reshaped_df["Sex"] == sex_code)
    & (reshaped_df["Age_range"] == age_code)
)
case2_male_total = reshaped_df.loc[selected_rows, "Total"].sum()

# The message names every filter applied, including the age-range restriction.
print(f"✅ Total patients for Case {case_code} (Male, Age_range {age_code}): {case2_male_total}")
