In [None]:
# Work 18: Calculation of Modified Charlson Comorbidity Index (CCI) Scores Using ICD-10 Codes:
#  [W18.CCI.5.calculates_moCCI_scores.ipynb]

# "This Jupyter notebook calculates Modified Charlson Comorbidity Index (CCI) scores from ICD-10 codes, cleans 
#   data, and saves results to a CSV file."

# Note: This notebook is a work in progress and currently identifies only about 20% of CCI ICD-codes from the dataset.

########################################################################################################
#  Sequence list
########################################################################################################

# 1: Paths to datasets
# 2: Load the main dataset
# 3: Load ICD-10 -> CCI scores and conditions
# 4: Create dictionaries for Weight and Condition
# 4.1: If a subgroup is found in the mapping, ensure the base code is also added
# 4.2: Add the base code (3 characters) if it is not already present
# 5: Updated function for cleaning and formatting ICD codes
# 5.1: Remove non-alphanumeric characters
# 5.2: If the code is longer than 3 characters and the fourth character is not a digit, truncate after three characters
# 5.3: Ensure the first letter is uppercase
# 5.4: If the code is shorter than 4 characters, pad with 'x'
# 5.5: Return only the first 4 characters
# 6: Preprocess ICD_code and format the codes
# 7: Calculate the average length of ICD codes in both datasets
# 8: Debug: Empty values after preprocessing
# 9: Add scores and conditions
# 9.1: Add CCI_Score column
# 9.2: Add Condition column
# 10: Analyze empty ICD_code values
# 11: Debug: ICD codes not found in the mapping
# 12: Calculate how many ICD_code values are found in the mapping
# 13: Save the dataset to a file
# 14: Display execution time

########################################################################################################
########################################################################################################

import pandas as pd
import time
import re  

# Start the timer; the elapsed time is reported in step 14
start_time = time.time()

# 1: Paths to datasets (input data, ICD-10 -> CCI mapping table, scored output)
data_path, cci_map_path, output_path = (
    '/home/work/pp_all_data.csv',
    'https://raw.githubusercontent.com/Tupatuko2023/Python-R-Scripts/main/tables/W16.mo_cci_no_dots.csv',
    '/home/work/pp_all_data_with_cci.csv',
)

print("1: Paths to datasets loaded")

# 2: Load the main dataset
# All columns are read as text (dtype=str) rather than with inferred types.
data = pd.read_csv(filepath_or_buffer=data_path, dtype=str)

# 3: Load ICD-10 -> CCI scores and conditions
# Also read as text; the 'Weight' column is converted with int() in step 4.
cci_mapping = pd.read_csv(filepath_or_buffer=cci_map_path, dtype=str)

# 4: Create dictionaries for Weight and Condition
# Both dictionaries are keyed by the ICD-10 code exactly as it appears in the
# mapping table and are always updated together.
cci_weight_map = {}
cci_condition_map = {}

# 4.1: If a subgroup is found in the mapping, ensure the base code is also added
for record in cci_mapping.to_dict(orient='records'):
    code = record['ICD-10 Code']
    weight = int(record['Weight'])
    condition = record['Condition']

    # An explicit row always sets (or overwrites) the entry for its own code.
    cci_weight_map[code] = weight
    cci_condition_map[code] = condition

    # 4.2: Add the base code (3 characters) if it is not already present
    if len(code) <= 3:
        continue
    prefix = code[:3]
    if prefix in cci_weight_map:
        # The base code is already known; keep its existing weight and condition.
        continue
    cci_weight_map[prefix] = weight
    cci_condition_map[prefix] = condition

print("4: Datasets loaded and CCI mapping created")

# 5: Updated function for cleaning and formatting ICD codes
def clean_and_format_icd10_code(icd_code):
    """Normalize a raw ICD-10 code to a fixed 4-character form.

    The code is stripped of every character that is not an ASCII letter or
    digit, cut to three characters when its fourth character is not a digit,
    given an uppercase first character, right-padded with 'x' to four
    characters, and finally limited to its first four characters.

    Examples: 'I21.0' -> 'I210', 'i21' -> 'I21x', 'I21A' -> 'I21x'.

    Args:
        icd_code: Raw code value. Anything that is not a ``str`` (for example
            ``None`` or a float NaN) is treated as missing.

    Returns:
        The 4-character code, or ``''`` when the input is not a string or
        contains no letters or digits at all.
    """
    if not isinstance(icd_code, str):
        return ''  # Return empty if the code is not a string

    # 5.1: Remove non-alphanumeric characters
    cleaned_code = re.sub(r'[^A-Za-z0-9]', '', icd_code)

    # Nothing usable is left (e.g. '', ' ' or '-'). Report the code as empty
    # instead of padding it to the placeholder 'xxxx', which the empty-value
    # checks in steps 8 and 10 (comparison against '') would not recognise.
    if not cleaned_code:
        return ''

    # 5.2: If the code is longer than 3 characters and the fourth character is not a digit, truncate after three characters
    if len(cleaned_code) > 3 and not cleaned_code[3].isdigit():
        cleaned_code = cleaned_code[:3]

    # 5.3: Ensure the first letter is uppercase
    cleaned_code = cleaned_code[0].upper() + cleaned_code[1:]

    # 5.4: If the code is shorter than 4 characters, pad with 'x'
    if len(cleaned_code) < 4:
        cleaned_code = cleaned_code.ljust(4, 'x')

    # 5.5: Return only the first 4 characters
    return cleaned_code[:4]


# 6: Preprocess ICD_code and format the codes
data['ICD_code'] = data['ICD_code'].apply(clean_and_format_icd10_code)
cci_mapping['ICD-10 Code'] = cci_mapping['ICD-10 Code'].apply(clean_and_format_icd10_code)

# The lookup dictionaries were built in step 4 from the raw mapping codes, so
# formatting the mapping column above does not change their keys. Dataset codes
# are now in the formatted 4-character form (a 3-character code is padded with
# 'x'), which a raw 3-character key can never equal. Add the formatted form of
# every existing key so the lookups in step 9 compare like with like.
# Existing entries are never overwritten; iteration follows insertion order, so
# when several raw keys share one formatted form the first one inserted wins.
for raw_code in list(cci_weight_map):
    formatted_code = clean_and_format_icd10_code(raw_code)
    if formatted_code and formatted_code not in cci_weight_map:
        cci_weight_map[formatted_code] = cci_weight_map[raw_code]
        cci_condition_map[formatted_code] = cci_condition_map[raw_code]

print("6: ICD codes cleaned and formatted")

# 7: Calculate the average length of ICD codes in both datasets
def calculate_avg_icd_length(column):
    """Return the mean string length of the values in a pandas Series.

    Values that are not ``str`` (for example ``None`` or NaN) count as
    length 0 and are still included in the mean.
    """
    def _code_length(value):
        if not isinstance(value, str):
            return 0
        return len(value)

    lengths = column.apply(_code_length)
    return lengths.mean()

# Average code length after formatting, for the dataset and for the mapping table
avg_length_mapping = calculate_avg_icd_length(cci_mapping['ICD-10 Code'])
avg_length_data = calculate_avg_icd_length(data['ICD_code'])

print(f"7: Average length of ICD codes in the dataset: {avg_length_data:.2f}")
print(f"7: Average length of ICD codes in the mapping: {avg_length_mapping:.2f}")

# 8: Debug: Empty values after preprocessing
# Select the rows whose formatted code is the empty string and export them.
is_empty_code = data['ICD_code'] == ''
original_icd_empty = data.loc[is_empty_code, 'ICD_code']
print(f"8: Empty ICD_code values after preprocessing: {len(original_icd_empty)}")
original_icd_empty.to_csv(
    '/home/HUSTIETOALLAS/ext13144568/mounts/research/Tomi_K/4A/Aineistot/CCI_4.empty_icd_codes.csv',
    index=False,
)

# 9: Add scores and conditions
def assign_cci_data(data, weight_map, condition_map):
    """Add 'CCI_Score' and 'Condition' columns looked up from 'ICD_code'.

    The DataFrame is modified in place and the same object is returned.

    Args:
        data: DataFrame with an 'ICD_code' column.
        weight_map: Mapping from ICD code to integer CCI weight.
        condition_map: Mapping from ICD code to condition name.

    Returns:
        ``data`` with the two new columns. Codes missing from ``weight_map``
        score 0; codes missing from ``condition_map`` get 'Unknown'.
    """
    codes = data['ICD_code']

    # 9.1: Add CCI_Score column
    looked_up_weights = codes.map(weight_map)
    data['CCI_Score'] = looked_up_weights.fillna(0).astype(int)

    # 9.2: Add Condition column
    looked_up_conditions = codes.map(condition_map)
    data['Condition'] = looked_up_conditions.fillna('Unknown')

    return data

# Score the cleaned dataset using the lookup dictionaries from step 4
data_with_scores = assign_cci_data(data, cci_weight_map, cci_condition_map)

print("9: ICD codes scored and conditions added")

# 10: Analyze empty ICD_code values
# A row counts as empty when its code is null or the empty string.
scored_codes = data_with_scores['ICD_code']
null_code_count = scored_codes.isnull().sum()
blank_code_count = (scored_codes == '').sum()
empty_icd_count = null_code_count + blank_code_count
total_rows = len(data_with_scores)
empty_icd_percentage = empty_icd_count / total_rows * 100

print(f"10.1: Empty ICD_code rows: {empty_icd_count}")
print(f"10.2: Percentage of the entire dataset: {empty_icd_percentage:.2f}%")

# 11: Debug: ICD codes not found in the mapping
# Keep only non-empty codes that are not a key of the weight dictionary.
dataset_codes = data['ICD_code']
is_unmapped = ~dataset_codes.isin(cci_weight_map.keys())
is_non_empty = dataset_codes != ''
not_in_mapping = dataset_codes[is_unmapped & is_non_empty]
not_in_mapping.to_csv(
    '/home/HUSTIETOALLAS/ext13144568/mounts/research/Tomi_K/4A/Aineistot/CCI_4.debug_not_in_mapping.csv',
    index=False,
)

print(f"11: Codes not found in the mapping saved to file: {len(not_in_mapping)}")


# 12: Calculate how many ICD_code values are found in the mapping
is_mapped = data_with_scores['ICD_code'].isin(cci_weight_map.keys())
mapped_codes = data_with_scores.loc[is_mapped, 'ICD_code']
mapped_count = len(mapped_codes)
# The percentage is relative to all rows, including rows with an empty code.
mapped_percentage = mapped_count / total_rows * 100

print(f"12.1: Number of ICD_code rows found in the mapping: {mapped_count}")
print(f"12.2: Percentage found in the mapping: {mapped_percentage:.2f}%")

# 13: Save the dataset to a file
data_with_scores.to_csv(output_path, index=False)

print(f"13: Results saved to file: {output_path}")

# 14: Display execution time
end_time = time.time()
elapsed_seconds = end_time - start_time
print(f"14: Total execution time: {elapsed_seconds:.2f} seconds")