# In [None]:
# Work 11: HFRS Calculation & Visualization Notebook: ICD-10-Based Risk Scoring and Distribution
# [W11.HFRS.5.Visualize_the_HFRS.ipynb]

################
# "This notebook calculates the Hospital Frailty Risk Score (HFRS) from ICD-10 codes and then plots the risk distribution for each dataset.
#  It also reports the number of patients without diagnoses and assigns every patient to a risk category."

########################################################################################################
#  Sequence list
########################################################################################################
#
# 1: File paths for HFRS calculation
#    - Specifies where to read each dataset (dgn_path, kertomus_path, til_tap_path)
#    - Specifies the ICD-10 points file (icd_points_path)
#
# 2: Load data (CSV files)
#    - Reads dgn_df, kertomus_df, til_tap_df using pandas
#
# 3: Load ICD-10 code points (Excel file)
#    - Reads icd_points_df and creates the icd_points dictionary
#
# 4: Calculate HFRS and save results for each dataset
#    - 4.1: Reshape data with pd.melt (long format)
#    - 4.2: Truncate ICD-10 codes
#    - 4.3: Map truncated ICD-10 codes to HFRS point values
#    - 4.4: Print any unmatched ICD codes
#    - 4.5: Sum HFRS per patient and save CSV
#    - 4.6: Print number of patients without diagnoses
#
# 5: Load HFRS scores from CSV for plotting (plot_risk_distribution function)
#
# 6: Define risk categories (Low, Intermediate, High) based on HFRS values
#
# 7: Apply risk categorization to each patient’s HFRS score
#
# 8: Calculate proportions of each risk category
#
# 9: Plot the risk distribution (bar chart)
#
# 10: Add percentage values on top of the bars
#
# 11: Save plots into folder (as PNG files)
#
# 12: File paths for HFRS score CSV outputs
#    - dgn_file_path, kertomus_file_path, til_tap_file_path
#
# 13: Output paths for plots
#    - dgn_plot_path, kertomus_plot_path, til_tap_plot_path
#
# 14: Plotting
#    - Call plot_risk_distribution for each dataset to generate and save final plots
#
########################################################################################################
########################################################################################################

import pandas as pd
import matplotlib.pyplot as plt


def truncate_icd10_code(icd_code):
    """Return the three-character ICD-10 category of a diagnosis code.

    The code is stripped of surrounding whitespace and upper-cased before it
    is cut to its first three characters (the letter and the first two
    digits), so that values such as ``" f05.1"`` still produce the lookup key
    ``"F05"``.

    Args:
        icd_code: ICD-10 code as a string, e.g. ``"F05.1"``.

    Returns:
        The normalized code truncated to at most three characters. Inputs
        shorter than three characters are returned normalized but otherwise
        unchanged.
    """
    return icd_code.strip().upper()[:3]


def calculate_hfrs(df, icd_points, icd_columns, output_path):
    """Compute per-patient HFRS totals from ICD-10 codes and save them as CSV.

    The diagnosis columns are reshaped to long format (one row per patient and
    diagnosis), every code is truncated with ``truncate_icd10_code``, mapped
    to its point value and summed per ``Potilas_ID``. Along the way the
    function prints the codes that have no entry in ``icd_points``, a sample
    of the mapping, and the number of patients without any diagnosis.

    Args:
        df: DataFrame with a ``Potilas_ID`` column and the columns named in
            ``icd_columns``.
        icd_points: Mapping from truncated ICD-10 code to HFRS points, in any
            form accepted by ``Series.map``.
        icd_columns: Names of the columns of ``df`` that hold ICD-10 codes.
        output_path: Destination of the CSV with the columns ``Potilas_ID``
            and ``HFRS``.

    Returns:
        None. The result is written to ``output_path``.
    """
    # Count patients BEFORE rows with missing codes are removed; counting
    # afterwards would always report zero patients without diagnoses.
    total_patients = df["Potilas_ID"].nunique()

    # 4.1: Reshape to long format, one row per (patient, diagnosis column)
    long_df = df.melt(
        id_vars=["Potilas_ID"],
        value_vars=icd_columns,
        var_name="Diagnosis_Type",
        value_name="ICD_code",
    )
    long_df = long_df.dropna(subset=["ICD_code"])

    # 4.2: Truncate ICD-10 codes
    codes = long_df["ICD_code"].astype(str).apply(truncate_icd10_code)

    # 4.3: Map truncated codes to points; codes without an entry stay NaN here
    # so they can be told apart from codes that are worth 0 points.
    points = codes.map(icd_points)
    long_df = long_df.assign(ICD_code=codes, HFRS=points.fillna(0).astype(float))

    # 4.4: Print unmatched ICD-10 codes
    unmatched_codes = codes[points.isna()].unique()
    if len(unmatched_codes) > 0:
        print(f"Unmatched ICD-10 codes in {output_path}: {unmatched_codes}")

    # Print a few rows to verify HFRS mapping
    print(f"Sample HFRS mapping for {output_path}:")
    print(long_df[["Potilas_ID", "ICD_code", "HFRS"]].head(10))

    # 4.5: Sum HFRS per patient and save CSV
    # NOTE(review): patients without any diagnosis have no rows left at this
    # point and are therefore not written to the CSV.
    hfrs_scores = long_df.groupby("Potilas_ID")["HFRS"].sum().reset_index()
    hfrs_scores.to_csv(output_path, index=False)
    print(f"HFRS points were calculated and saved to '{output_path}'.")

    # 4.6: Number of patients whose diagnosis columns were all missing
    patients_with_diagnoses = long_df["Potilas_ID"].nunique()
    patients_without_diagnoses = total_patients - patients_with_diagnoses
    print(f"Number of patients without diagnoses in {output_path}: {patients_without_diagnoses}")


# 1: File paths for HFRS calculation
dgn_path = "/home/work/dataset1.csv"
kertomus_path = "/home/work/dataset2.csv"
til_tap_path = "/home/work/dataset3.csv"
icd_points_path = "https://raw.githubusercontent.com/Tupatuko2023/Python-R-Scripts/main/tables/W8.HFRS.ICD-10_points.xlsx"

# 2: Load data
# The source files are pipe-separated; only the patient id and the diagnosis
# columns are read.
dgn_df = pd.read_csv(dgn_path, sep="|", usecols=["Potilas_ID", "ICD_code"])
kertomus_df = pd.read_csv(kertomus_path, sep="|", usecols=["Potilas_ID", "ICD_code"])
til_tap_df = pd.read_csv(til_tap_path, sep="|", usecols=["Potilas_ID", "ICD_code1", "ICD_code2"])

# 3: Load ICD-10 code points
icd_points_df = pd.read_excel(icd_points_path)
# Points written with a decimal comma ("1,5") are converted to a decimal
# point. Only string cells are rewritten, so the conversion also works when
# pandas has already parsed some or all cells as numbers (the .str accessor
# would raise on a numeric column and return NaN for numeric cells in a mixed
# column).
icd_point_values = (
    icd_points_df["Points"]
    .map(lambda value: value.replace(",", ".") if isinstance(value, str) else value)
    .astype(float)
)
icd_points = dict(zip(icd_points_df["ICD-10 Code"], icd_point_values))

# 4: Calculate HFRS and save results for each dataset
calculate_hfrs(dgn_df, icd_points, ["ICD_code"], "/home/work/dataset1_hfrs.csv")
calculate_hfrs(kertomus_df, icd_points, ["ICD_code"], "/home/work/dataset2_hfrs.csv")
calculate_hfrs(til_tap_df, icd_points, ["ICD_code1", "ICD_code2"], "/home/work/dataset3_hfrs.csv")


def plot_risk_distribution(file_path, output_path, dataset_name):
    """Plot the share of patients per HFRS risk category and save the figure.

    Scores are read from ``file_path``, each patient is assigned to
    "Low risk" (HFRS < 5), "Intermediate risk" (5 <= HFRS <= 15) or
    "High risk" (HFRS > 15), and the percentage of patients in each category
    is drawn as a bar chart. The bars always appear in the order low,
    intermediate, high with the colors green, orange, red; a category without
    patients is drawn as a 0 % bar.

    Args:
        file_path: CSV file containing an ``HFRS`` column.
        output_path: File name the figure is saved to.
        dataset_name: Label used in the plot title and in progress messages.

    Returns:
        None. The figure is written to ``output_path``.
    """
    # Fixed category order with matching colors. value_counts() sorts by
    # frequency, so relying on its order would pair colors with the wrong
    # categories whenever "Low risk" is not the most common one.
    risk_order = ["Low risk", "Intermediate risk", "High risk"]
    risk_colors = ["green", "orange", "red"]

    # 5: Load HFRS scores
    hfrs_scores_df = pd.read_csv(file_path)
    print(f"5: Loaded the HFRS scores for {dataset_name}")

    # Verify distribution of HFRS scores
    print(f"Distribution of HFRS scores in {dataset_name}:")
    print(hfrs_scores_df["HFRS"].describe())

    # 6: Define risk categories
    def categorize_risk(hfrs):
        if hfrs < 5:
            return "Low risk"
        elif 5 <= hfrs <= 15:
            return "Intermediate risk"
        else:
            return "High risk"

    print(f"6: Defined the risk categories for {dataset_name}")

    # 7: Apply risk categorization
    hfrs_scores_df["Risk_Category"] = hfrs_scores_df["HFRS"].apply(categorize_risk)
    print(f"7: Applied the risk categorization for {dataset_name}")

    # 8: Calculate proportions (in percent), aligned to the fixed order
    risk_distribution = (
        hfrs_scores_df["Risk_Category"]
        .value_counts(normalize=True)
        .reindex(risk_order, fill_value=0.0)
        * 100
    )
    print(f"8: Calculated the proportions for {dataset_name}")

    # 9: Plot the risk distribution
    fig, ax = plt.subplots(figsize=(8, 6))
    try:
        bars = ax.bar(risk_order, risk_distribution.to_numpy(), color=risk_colors)
        ax.set_title(f"Risk Distribution for {dataset_name}")
        ax.set_xlabel("Risk Category")
        ax.set_ylabel("Proportion (%)")
        ax.tick_params(axis="x", labelrotation=0)
        print(f"9: Plotted the risk distribution for {dataset_name}")

        # 10: Adding the percentage values on top of the bars
        for bar in bars:
            height = bar.get_height()
            ax.annotate(
                f"{height:.2f}%",
                xy=(bar.get_x() + bar.get_width() / 2, height),
                xytext=(0, 3),  # 3 points vertical offset
                textcoords="offset points",
                ha="center",
                va="bottom",
            )
        print(f"10: Added the percentage values for {dataset_name}")

        # 11: Save plots into folder
        fig.savefig(output_path)
        print(f"11: Saved {dataset_name} plot into folder {output_path}")
    finally:
        # Release the figure even if plotting or saving fails
        plt.close(fig)


# 12: File paths for HFRS score CSV outputs
# These are the files written by the calculate_hfrs calls in step 4, so the
# plots always reflect the scores that were just calculated.
dgn_file_path = "/home/work/dataset1_hfrs.csv"
kertomus_file_path = "/home/work/dataset2_hfrs.csv"
til_tap_file_path = "/home/work/dataset3_hfrs.csv"

# 13: Output paths for plots
dgn_plot_path = (
    "/home/HUSTIETOALLAS/ext13144568/mounts/research/Tomi_K/plots/k4rl_dgn_risk_distribution.png"
)
kertomus_plot_path = "/home/HUSTIETOALLAS/ext13144568/mounts/research/Tomi_K/plots/k4rl_kertomus_risk_distribution.png"
til_tap_plot_path = "/home/HUSTIETOALLAS/ext13144568/mounts/research/Tomi_K/plots/k4rl_til_tap_risk_distribution.png"

# 14: Plotting
# One bar chart per dataset; the last argument is the label shown in the plot
# title and in the progress messages.
plot_risk_distribution(dgn_file_path, dgn_plot_path, "dgn_df")
plot_risk_distribution(kertomus_file_path, kertomus_plot_path, "kertomus_df")
plot_risk_distribution(til_tap_file_path, til_tap_plot_path, "til_tap_df")


########################################################################################################
########################################################################################################