In [None]:
# Work 10: ICD-10–Based HFRS Score Calculation and Risk Classification Visualization: A Jupyter Notebook 
# [W10.HFRS.4.HFRS_Scores_Risk_Classification.ipynb]

# "This Notebook calculates ICD-10–based Hospital Frailty Risk Score (HFRS) scores, prints unmatched codes, 
#  counts the patients without diagnoses, and visualizes the patient risk categories as bar charts."

########################################################################################################
#  Sequence list
########################################################################################################

# 1. Define `truncate_icd10_code` function
#    - Truncates the ICD-10 code to the first letter and first two digits.

# 2. Define `calculate_hfrs` function
#    2.1. Reshape data using `pd.melt` and retain only non-null ICD codes.
#    2.2. Convert ICD codes to strings, truncate them, and map each to HFRS points.
#    2.3. Print any unmatched ICD-10 codes.
#    2.4. Print a sample of rows to verify HFRS mapping correctness.
#    2.5. Sum HFRS points per patient and export the results to a CSV file.
#    2.6. Calculate and print the number of patients without diagnoses.

# 3. Specify file paths for HFRS calculation
#    - Paths to the three input datasets and the ICD-10 points spreadsheet.

# 4. Load data from CSV
#    - Use `pd.read_csv` to load relevant columns from each dataset.

# 5. Load ICD-10 code points from Excel
#    - Convert the points column to float and map the ICD-10 codes to points.

# 6. Define `plot_risk_distribution` function
#    6.1. Load the previously calculated HFRS scores from CSV.
#    6.2. Print distribution statistics of HFRS scores.
#    6.3. Define risk categories (Low, Intermediate, High) based on HFRS score thresholds.
#    6.4. Assign each patient’s HFRS score to a risk category.
#    6.5. Calculate proportions of patients in each category.
#    6.6. Plot the risk distribution as a bar chart.
#    6.7. Annotate bars with category percentages.
#    6.8. Save the plot to the specified output path and close it.

# 7. Specify file paths for the HFRS CSV outputs
#    - Points to the CSV files created in step 2.5 (e.g., dgn_file_path, kertomus_file_path, til_tap_file_path).

# 8. Specify output paths for the risk distribution plots
#    - Points to the locations for saving the final bar charts (e.g., dgn_plot_path, kertomus_plot_path, til_tap_plot_path).

# 9. Generate and save risk distribution plots
#    - Call `plot_risk_distribution` for each dataset (dgn, kertomus, til_tap) 
#      using the paths from steps 7 and 8.

########################################################################################################
########################################################################################################

import pandas as pd
import matplotlib.pyplot as plt

# 1. Define `truncate_icd10_code` function
def truncate_icd10_code(icd_code):
    """
    Return the three-character ICD-10 category of a diagnosis code.

    Surrounding whitespace is removed and the code is upper-cased before it
    is cut to its first three characters (the leading letter and the two
    digits that follow it in a well-formed code). This way ' f05.9 ' and
    'F059' both become 'F05'. Codes shorter than three characters are
    returned as they are after this normalization.

    Args:
        icd_code: ICD-10 code as a string.

    Returns:
        The normalized code, limited to at most three characters.

    Raises:
        TypeError: If `icd_code` is not a string.
    """
    if not isinstance(icd_code, str):
        raise TypeError(
            f"icd_code must be a string, got {type(icd_code).__name__}"
        )
    # Normalize before slicing: a leading blank would otherwise use up one of
    # the three characters and the result could never match a table key.
    return icd_code.strip().upper()[:3]

# 2. Define `calculate_hfrs` function
#    2.1. Reshape data using `pd.melt` and retain only non-null ICD codes.
#    2.2. Convert ICD codes to strings, truncate them, and map each to HFRS points.
#    2.3. Print any unmatched ICD-10 codes.
#    2.4. Print a sample of rows to verify HFRS mapping correctness.
#    2.5. Sum HFRS points per patient and export the results to a CSV file.
#    2.6. Calculate and print the number of patients without any diagnoses.
def calculate_hfrs(df, icd_points, icd_columns, output_path):
    """
    Calculate the HFRS points per patient and export them to a CSV file.

    The diagnosis columns are reshaped to one row per (patient, diagnosis),
    every ICD-10 code is truncated with `truncate_icd10_code` and looked up
    in `icd_points`; codes that are not in the table contribute 0 points.
    The points are then summed per `Potilas_ID`.

    Args:
        df: DataFrame with a 'Potilas_ID' column and the columns named in
            `icd_columns`.
        icd_points: Mapping from truncated ICD-10 code to HFRS points.
        icd_columns: List of the column names in `df` that hold ICD-10 codes.
        output_path: Path of the CSV file to write. The file gets the columns
            'Potilas_ID' and 'HFRS'; patients without any diagnosis are not
            written to it.

    Returns:
        None. The scores are written to `output_path` and the unmatched
        codes, a sample of the mapping and the number of patients without
        diagnoses are printed.
    """
    # Count the patients before the diagnosis-free rows are discarded;
    # counting afterwards would always report zero patients without diagnoses.
    total_patients = df['Potilas_ID'].nunique()

    long_df = df.melt(
        id_vars=['Potilas_ID'],
        value_vars=icd_columns,
        var_name='Diagnosis_Type',
        value_name='ICD_code'
    )
    long_df.dropna(subset=['ICD_code'], inplace=True)
    long_df['ICD_code'] = long_df['ICD_code'].astype(str).apply(truncate_icd10_code)

    # Keep the raw lookup result so that codes missing from the table can be
    # told apart from codes that are listed with zero points.
    looked_up_points = long_df['ICD_code'].map(icd_points)
    unmatched_codes = long_df.loc[looked_up_points.isna(), 'ICD_code'].unique()
    long_df['HFRS'] = looked_up_points.fillna(0).astype(float)

    if len(unmatched_codes) > 0:
        print(f"Unmatched ICD-10 codes in {output_path}: {unmatched_codes}")

    print(f"Sample HFRS mapping for {output_path}:")
    print(long_df[['Potilas_ID', 'ICD_code', 'HFRS']].head(10))

    # NOTE(review): a code that occurs several times for the same patient is
    # added once per occurrence here - confirm whether each code should only
    # count once per patient.
    hfrs_scores = long_df.groupby('Potilas_ID')['HFRS'].sum().reset_index()
    hfrs_scores.to_csv(output_path, index=False)
    print(f"HFRS points were calculated and saved to '{output_path}'.")

    patients_with_diagnoses = long_df['Potilas_ID'].nunique()
    patients_without_diagnoses = total_patients - patients_with_diagnoses
    print(f"Number of patients without diagnoses in {output_path}: {patients_without_diagnoses}")

# 3. Specify file paths for HFRS calculation
dgn_path = '/home/work/dataset1.csv'
kertomus_path = '/home/work/dataset2.csv'
til_tap_path = '/home/work/dataset3.csv'
icd_points_path = 'https://raw.githubusercontent.com/Tupatuko2023/Python-R-Scripts/main/tables/W8.HFRS.ICD-10_points.xlsx'

# 4. Load data from CSV
#    The files are pipe-delimited; only the patient identifier and the
#    diagnosis columns are read.
dgn_df = pd.read_csv(dgn_path, sep='|', usecols=['Potilas_ID', 'ICD_code'])
kertomus_df = pd.read_csv(kertomus_path, sep='|', usecols=['Potilas_ID', 'ICD_code'])
til_tap_df = pd.read_csv(til_tap_path, sep='|', usecols=['Potilas_ID', 'ICD_code1', 'ICD_code2'])

# 5. Load ICD-10 code points from Excel
icd_points_df = pd.read_excel(icd_points_path)
# The points are converted to text first: the `.str` accessor only works on
# strings, so a column that Excel stores as numbers would raise an error and
# numeric cells in a mixed column would silently turn into NaN. After that
# a decimal comma ("1,8") is replaced by a decimal point.
points_as_float = (
    icd_points_df['Points']
    .astype(str)
    .str.replace(',', '.', regex=False)
    .astype(float)
)
icd_points = dict(zip(icd_points_df['ICD-10 Code'], points_as_float))

# 6. Define `plot_risk_distribution` function
#    6.1. Load the previously calculated HFRS scores from CSV.
#    6.2. Print distribution statistics of HFRS scores.
#    6.3. Define risk categories (Low, Intermediate, High).
#    6.4. Assign each patient’s HFRS score to a risk category.
#    6.5. Calculate proportions of patients in each category.
#    6.6. Plot the distribution as a bar chart.
#    6.7. Annotate bars with category percentages.
#    6.8. Save and close the plot.
def plot_risk_distribution(file_path, output_path, dataset_name):
    """
    Plot the share of patients per HFRS risk category as a bar chart.

    The scores are read from the 'HFRS' column of the CSV file and assigned
    to a category: below 5 is 'Low risk', 5 to 15 (inclusive) is
    'Intermediate risk' and above 15 is 'High risk'. Missing scores are left
    uncategorized and do not enter the proportions. The bars are always
    drawn in the order Low, Intermediate, High with the colours green,
    orange and red; a category without patients is shown as a 0 % bar.

    Args:
        file_path: Path of the CSV file that holds the 'HFRS' column.
        output_path: Path of the image file the chart is saved to.
        dataset_name: Name used in the chart title and the printed messages.

    Returns:
        None. The chart is written to `output_path` and progress messages
        are printed.
    """
    # Fixed category order and colours. The colours must be tied to the
    # category names because `value_counts` sorts by frequency, not by risk.
    category_colors = {
        'Low risk': 'green',
        'Intermediate risk': 'orange',
        'High risk': 'red',
    }

    hfrs_scores_df = pd.read_csv(file_path)
    print(f"6.1: Loaded the HFRS scores for {dataset_name}")

    print(f"6.2: Distribution of HFRS scores in {dataset_name}:")
    print(hfrs_scores_df['HFRS'].describe())

    def categorize_risk(hfrs):
        # A missing score compares False with everything and would otherwise
        # end up in the last branch, i.e. be counted as high risk.
        if pd.isna(hfrs):
            return None
        if hfrs < 5:
            return 'Low risk'
        if hfrs <= 15:
            return 'Intermediate risk'
        return 'High risk'

    print(f"6.3: Defined the risk categories for {dataset_name}")

    hfrs_scores_df['Risk_Category'] = hfrs_scores_df['HFRS'].apply(categorize_risk)
    print(f"Applied the risk categorization for {dataset_name}")

    risk_share = hfrs_scores_df['Risk_Category'].value_counts(normalize=True) * 100
    # Bring the categories into the fixed order and add the empty ones.
    risk_distribution = risk_share.reindex(list(category_colors), fill_value=0.0)
    print(f"Calculated the proportions for {dataset_name}")

    fig, ax = plt.subplots(figsize=(8, 6))
    try:
        bars = ax.bar(
            risk_distribution.index,
            risk_distribution.values,
            color=[category_colors[category] for category in risk_distribution.index],
        )
        ax.set_title(f'Risk Distribution for {dataset_name}')
        ax.set_xlabel('Risk Category')
        ax.set_ylabel('Proportion (%)')
        ax.tick_params(axis='x', labelrotation=0)
        print(f"Plotted the risk distribution for {dataset_name}")

        for bar in bars:
            height = bar.get_height()
            ax.annotate(f'{height:.2f}%',
                        xy=(bar.get_x() + bar.get_width() / 2, height),
                        xytext=(0, 3),
                        textcoords="offset points",
                        ha='center', va='bottom')
        print(f"Added the percentage values for {dataset_name}")

        fig.savefig(output_path)
        print(f"Saved {dataset_name} plot into folder {output_path}")
    finally:
        # Close the figure even when plotting or saving fails, so that
        # repeated calls do not pile up open figures.
        plt.close(fig)

# 7. Specify file paths for the HFRS CSV outputs
dgn_file_path = '/home/work/dataset1_hfrs.csv'
kertomus_file_path = '/home/work/dataset2_hfrs.csv'
til_tap_file_path = '/home/work/dataset3_hfrs.csv'

# 8. Specify output paths for the risk distribution plots
dgn_plot_path = '/home/work/plots/k4rl_dgn_risk_distribution.png'
kertomus_plot_path = '/home/work/plots/k4rl_kertomus_risk_distribution.png'
til_tap_plot_path = '/home/work/plots/k4rl_til_tap_risk_distribution.png'

# Calculate the HFRS scores and write the CSV files named in step 7.
# Without these calls the data loaded in steps 4 and 5 is never used and the
# plots in step 9 would depend on CSV files left over from an earlier run.
# The diagnosis columns are the ones that were read for each dataset in step 4.
calculate_hfrs(dgn_df, icd_points, ['ICD_code'], dgn_file_path)
calculate_hfrs(kertomus_df, icd_points, ['ICD_code'], kertomus_file_path)
calculate_hfrs(til_tap_df, icd_points, ['ICD_code1', 'ICD_code2'], til_tap_file_path)

# 9. Generate and save risk distribution plots
plot_risk_distribution(dgn_file_path, dgn_plot_path, "dgn_df")
plot_risk_distribution(kertomus_file_path, kertomus_plot_path, "kertomus_df")
plot_risk_distribution(til_tap_file_path, til_tap_plot_path, "til_tap_df")
