In [1]:
"""
Feature: charlson_score
Calculate Charlson Comorbidity Index score
"""
import pandas as pd
import os
from config_paths import *
from utils import drop_columns

In [2]:
# Read dataset
df = pd.read_csv(dataset_path)
initial_row_count = len(df)

# Load diagnoses
# icd_code must be read as text: with type inference, an all-numeric chunk is
# parsed as integers and ICD-9 codes lose their leading zeros (e.g. "04100"
# would become 4100), which a later astype(str) cannot restore.
diagnoses = pd.read_csv(
    os.path.join(hosp_path, "diagnoses_icd.csv"),
    usecols=["subject_id", "hadm_id", "icd_code", "icd_version"],
    dtype={"icd_code": str},
)
# Missing codes are still NaN after the read; cast so every entry is a str.
diagnoses["icd_code"] = diagnoses["icd_code"].astype(str)

# Category names in the order they appear in the returned dict.
_CHARLSON_CATEGORIES = (
    "myocardial_infarction", "congestive_heart_failure",
    "peripheral_vascular_disease", "cerebrovascular_disease",
    "dementia", "chronic_pulmonary_disease",
    "rheumatic_disease", "peptic_ulcer_disease",
    "mild_liver_disease", "diabetes_without_complication",
    "diabetes_with_complication", "paraplegia",
    "renal_disease", "cancer",
    "moderate_severe_liver_disease", "metastatic_cancer",
    "aids_hiv",
)


def _code_in_range(code, start, stop, low, high):
    """Return True when ``code[start:stop]`` is all digits and within ``[low, high]``."""
    digits = code[start:stop]
    return digits.isdigit() and low <= int(digits) <= high


def charlson_comorbidity(icd_codes):
    """Calculate Charlson comorbidities from ICD codes.

    Parameters
    ----------
    icd_codes : iterable of (code, version)
        ``code`` is an ICD code (dots are removed and letters upper-cased
        before matching); ``version`` is 9 or 10. Pairs with any other
        version are ignored.

    Returns
    -------
    dict
        Maps each of the 17 Charlson category names to 1 if at least one
        code matched that category's prefix rules, otherwise 0.
    """
    comorbidity = dict.fromkeys(_CHARLSON_CATEGORIES, 0)

    for code, version in icd_codes:
        c = str(code).replace(".", "").upper()
        # Slices never raise, so empty or short codes simply match nothing.
        fourth = c[3:4]
        fifth = c[4:5]

        if version == 9:
            if c.startswith(("410", "412")):
                comorbidity["myocardial_infarction"] = 1
            # For 402.xx / 404.xx the FIFTH digit encodes heart failure
            # (402.x1, 404.x1, 404.x3); the fourth digit only distinguishes
            # malignant / benign / unspecified hypertension.
            if (
                c.startswith(("428", "39891"))
                or (c.startswith("402") and fifth == "1")
                or (c.startswith("404") and fifth in ("1", "3"))
            ):
                comorbidity["congestive_heart_failure"] = 1
            if c.startswith(("440", "441", "4439", "7854")):
                comorbidity["peripheral_vascular_disease"] = 1
            if _code_in_range(c, 0, 3, 430, 438):
                comorbidity["cerebrovascular_disease"] = 1
            if c.startswith("290"):
                comorbidity["dementia"] = 1
            if _code_in_range(c, 0, 3, 490, 496):
                comorbidity["chronic_pulmonary_disease"] = 1
            if _code_in_range(c, 0, 3, 710, 714):
                comorbidity["rheumatic_disease"] = 1
            if _code_in_range(c, 0, 3, 531, 534):
                comorbidity["peptic_ulcer_disease"] = 1
            if c.startswith("571"):
                comorbidity["mild_liver_disease"] = 1
            if c.startswith(("5722", "5723", "5724", "5728")):
                comorbidity["moderate_severe_liver_disease"] = 1
            if c.startswith("250") and fourth in ("0", "1", "2", "3"):
                comorbidity["diabetes_without_complication"] = 1
            if c.startswith("250") and fourth in ("4", "5", "6", "7", "8", "9"):
                comorbidity["diabetes_with_complication"] = 1
            if c.startswith(("342", "343", "3441")):
                comorbidity["paraplegia"] = 1
            if c.startswith(("585", "586")):
                comorbidity["renal_disease"] = 1
            # 196-199 are secondary (metastatic) neoplasms; keep them out of
            # the plain cancer category so one code is not counted twice,
            # mirroring the disjoint ICD-10 ranges below.
            if _code_in_range(c, 0, 3, 140, 195):
                comorbidity["cancer"] = 1
            if _code_in_range(c, 0, 3, 196, 199):
                comorbidity["metastatic_cancer"] = 1
            # "042".."044" parse to 42..44 because int() drops the leading zero.
            if _code_in_range(c, 0, 3, 42, 44):
                comorbidity["aids_hiv"] = 1

        elif version == 10:
            if c.startswith(("I21", "I22", "I23", "I24", "I25")):
                comorbidity["myocardial_infarction"] = 1
            if c.startswith(("I50", "I11", "I13")):
                comorbidity["congestive_heart_failure"] = 1
            if c.startswith("I") and _code_in_range(c, 1, 3, 70, 79):
                comorbidity["peripheral_vascular_disease"] = 1
            if (c.startswith("I") and _code_in_range(c, 1, 3, 60, 69)) or c.startswith("G45"):
                comorbidity["cerebrovascular_disease"] = 1
            if c.startswith(("F01", "F02", "F03")):
                comorbidity["dementia"] = 1
            if c.startswith("J") and _code_in_range(c, 1, 3, 40, 44):
                comorbidity["chronic_pulmonary_disease"] = 1
            if c.startswith(("M05", "M06", "M32")):
                comorbidity["rheumatic_disease"] = 1
            if c.startswith("K") and _code_in_range(c, 1, 3, 25, 28):
                comorbidity["peptic_ulcer_disease"] = 1
            if (c.startswith("K") and _code_in_range(c, 1, 3, 70, 73)) or c.startswith("K76"):
                comorbidity["mild_liver_disease"] = 1
            if c.startswith(("K721", "K729", "K766")):
                comorbidity["moderate_severe_liver_disease"] = 1
            # The diabetes complication level is the FOURTH character
            # (E11.9 -> "9", E11.3x -> "3"). Testing the whole string would
            # match the category digits themselves (the "3" in E13) or a
            # trailing "9" of a complication code such as E11.319.
            if c.startswith(("E10", "E11", "E13")) and fourth == "9":
                comorbidity["diabetes_without_complication"] = 1
            if c.startswith(("E10", "E11", "E12", "E13", "E14")) and \
               fourth in ("2", "3", "4", "5", "6", "7"):
                comorbidity["diabetes_with_complication"] = 1
            if c.startswith(("G81", "G82")):
                comorbidity["paraplegia"] = 1
            if c.startswith(("N18", "N19")):
                comorbidity["renal_disease"] = 1
            if c.startswith("C") and _code_in_range(c, 1, 3, 0, 75):
                comorbidity["cancer"] = 1
            if c.startswith("C") and _code_in_range(c, 1, 3, 76, 80):
                comorbidity["metastatic_cancer"] = 1
            if c.startswith(("B20", "B21", "B22", "B24")):
                comorbidity["aids_hiv"] = 1

    return comorbidity

# Points contributed by each Charlson category when it is present.
_CHARLSON_WEIGHTS = {
    "myocardial_infarction": 1,
    "congestive_heart_failure": 1,
    "peripheral_vascular_disease": 1,
    "cerebrovascular_disease": 1,
    "dementia": 1,
    "chronic_pulmonary_disease": 1,
    "rheumatic_disease": 1,
    "peptic_ulcer_disease": 1,
    "mild_liver_disease": 1,
    "diabetes_without_complication": 1,
    "diabetes_with_complication": 2,
    "paraplegia": 2,
    "renal_disease": 2,
    "cancer": 2,
    "moderate_severe_liver_disease": 3,
    "metastatic_cancer": 6,
    "aids_hiv": 6,
}

# (severe form, mild form) pairs describing the same underlying condition.
_CHARLSON_HIERARCHY = (
    ("diabetes_with_complication", "diabetes_without_complication"),
    ("moderate_severe_liver_disease", "mild_liver_disease"),
    ("metastatic_cancer", "cancer"),
)


def compute_charlson_score(comorb):
    """Compute Charlson score from comorbidities.

    Parameters
    ----------
    comorb : dict
        Maps each Charlson category name to 0 or 1. It is not modified.

    Returns
    -------
    int
        Weighted sum of the present categories (weights 1, 2, 3 or 6).
        When both the mild and the severe form of a condition are flagged
        (diabetes, liver disease, cancer), only the severe form is scored.

    Raises
    ------
    KeyError
        If a category is missing from ``comorb``.
    """
    # A mild form is superseded by its severe counterpart so the same
    # condition is not counted at two severity levels.
    superseded = {mild for severe, mild in _CHARLSON_HIERARCHY if comorb[severe]}
    return sum(
        weight * comorb[name]
        for name, weight in _CHARLSON_WEIGHTS.items()
        if name not in superseded
    )

def charlson_age_points(age):
    """Return the Charlson age adjustment for ``age``.

    Missing ages score 0. Otherwise the result is 4 from age 80, 3 from 70,
    2 from 60, 1 from 50, and 0 below 50.
    """
    if pd.isna(age):
        return 0
    # Cut-offs are checked from oldest to youngest; the first one reached wins.
    for points, cutoff in ((4, 80), (3, 70), (2, 60), (1, 50)):
        if age >= cutoff:
            return points
    return 0

# Collect the (icd_code, icd_version) pairs recorded for each admission
grouped = diagnoses.groupby("hadm_id").apply(
    lambda admission: list(zip(admission["icd_code"], admission["icd_version"])),
    include_groups=False,
)

# Base (age-free) Charlson score for every admission
charlson_results = {
    hadm_id: {"charlson_base": compute_charlson_score(charlson_comorbidity(icd_list))}
    for hadm_id, icd_list in grouped.items()
}

# One row per admission: hadm_id, charlson_base
charlson_df = (
    pd.DataFrame.from_dict(charlson_results, orient="index")
    .reset_index()
    .rename(columns={"index": "hadm_id"})
)

# Attach the subject_id that belongs to each hadm_id
admission_keys = diagnoses[["subject_id", "hadm_id"]].drop_duplicates()
charlson_df = charlson_df.merge(admission_keys, on="hadm_id", how="left")

# Left-join onto the dataset so rows without diagnoses are kept
df = df.merge(
    charlson_df[["subject_id", "hadm_id", "charlson_base"]],
    on=["subject_id", "hadm_id"],
    how="left",
)

# Final score = base score (0 when no diagnoses matched) + age points
df = df.assign(charlson_age_points=df["anchor_age"].apply(charlson_age_points))
df = df.assign(charlson_score=df["charlson_base"].fillna(0) + df["charlson_age_points"])

# The two helper columns are no longer needed
df = df.drop(columns=["charlson_base", "charlson_age_points"])

# Store the score as an integer, treating any remaining gaps as 0
df = df.assign(charlson_score=df["charlson_score"].fillna(0).astype(int))

# Write the dataset back to the file it was read from
df.to_csv(dataset_path, index=False)

# Report the row count against the count taken before the merge
final_row_count = len(df)
print(f"\nTotal rows: {final_row_count}")
print(f"Expected total: {initial_row_count}")
if final_row_count != initial_row_count:
    print(f"⚠ Row count mismatch! Expected {initial_row_count}, got {final_row_count} (difference: {final_row_count - initial_row_count})")
else:
    print(f"✓ Row count matches expected total ({initial_row_count})")
print(f"Dataset shape: {df.shape}")
print(f"Feature 'charlson_score' added. Range: {df['charlson_score'].min()} - {df['charlson_score'].max()}")
print(f"Dataset shape: {df.shape}")



Total rows: 158020
Expected total: 158020
✓ Row count matches expected total (158020)
Dataset shape: (158020, 49)
Feature 'charlson_score' added. Range: 0 - 22
Dataset shape: (158020, 49)


In [3]:
# Display the column index of df after the charlson_score step.
df.columns

Index(['subject_id', 'hadm_id', 'cauti_type', 'cauti_icd_codes',
       'other_uti_icd_codes', 'remaining_icd_codes', 'gender', 'anchor_age',
       'admittime', 'dischtime', 'admission_type', 'admission_location',
       'discharge_location', 'race', 'catheter_procedures',
       'catheter_procedures_ir', 'catheter_used_in_procedures_ir',
       'catheter_insertion_date_ir', 'catheter_removal_date_ir',
       'catheter_outputevents', 'catheter_used_in_output_events',
       'datetimeevents', 'catheter_used_in_datetime_events',
       'insertion_date_from_datetimeevents',
       'removal_date_from_datetimeevents', 'catheter_procedure_events',
       'catheter_used_in_procedure_events', 'catheter_size_from_chartevents',
       'index', 'catheter_clinical_notes', 'catheter_used_in_clinical_notes',
       'catheter_present', 'final_cauti_flag', 'final_insertion_date',
       'final_removal_date', 'catheter_duration_days', 'BMI_in_admission',
       'BMI_computed', 'BMI_last_year', 'BMI', 

In [5]:
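# Optional rollback (kept disabled): uncomment to drop the charlson_score
# column from df and re-save the dataset.
# cols_to_drop = [
#  "charlson_score"
# ]
# df = drop_columns(df, cols_to_drop)
# df.to_csv(dataset_path, index=False)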