# In [None]:
# Work 17: Preprocessing and Truncating ICD-10 Codes in Large Medical Datasets 
# [W17.CCI.4.procc_CCI_data.ipynb]

# "This Jupyter notebook preprocesses medical data by truncating ICD-10 codes, removing empty rows, 
#   converting dates, and saving the cleaned dataset."

########################################################################################################
#  Sequence list
########################################################################################################

# 1: Function to truncate diagnosis code
# 2: File paths
# 3: Chunk size
# 4: Load data in chunks
# 5: Process each chunk
#    - Remove empty rows
#    - Truncate ICD codes
#    - Convert date
# 6: Combine all processed chunks and save

########################################################################################################
########################################################################################################

import pandas as pd

# 1: Function to truncate diagnosis code
def truncate_icd10_code(icd_code):
    """Return at most the first six characters of an ICD-10 code.

    The cut is purely positional: whatever occupies the first six character
    positions (letter, digits, and the dot if it falls within them) is kept,
    e.g. "A01.2345" becomes "A01.23".

    Null values (as detected by ``pd.isnull``) and codes shorter than six
    characters are returned unchanged.
    """
    max_length = 6  # Number of leading characters to keep
    # Nothing to cut for missing values or codes already below the limit.
    if pd.isnull(icd_code) or len(icd_code) < max_length:
        return icd_code
    return icd_code[:max_length]

# 2: File paths
input_path = '/home/work/all_data.csv'
output_path = '/home/work/pp_all_data.csv'

# 3: Chunk size
chunk_size = 100000  # Rows read and processed per iteration

# Columns that are loaded from the input; a row is kept only if all of them are present
required_columns = ['Patient_ID', 'date', 'ICD_code']

# 4: Load data in chunks
# dtype=str reads every column as text; chunksize makes read_csv return an
# iterator of DataFrames instead of loading the whole file at once.
chunks = pd.read_csv(input_path, dtype=str, chunksize=chunk_size, usecols=required_columns)

print("Preprocessing started")

# 5: Process each chunk and write it out immediately
# Each cleaned chunk is appended to the output file rather than collected in a
# list and concatenated at the end, so only one chunk is held in memory at a time.
# NOTE: if the run is interrupted, the output file contains only the chunks
# written so far.
for i, chunk in enumerate(chunks):
    print(f"Processing chunk {i+1}")
    is_first_chunk = i == 0

    cleaned = chunk.dropna(subset=required_columns)  # Remove rows with a missing value
    cleaned = cleaned.assign(
        # Truncate ICD codes
        ICD_code=cleaned['ICD_code'].apply(truncate_icd10_code),
        # Convert date to YYYY-MM-DD; values that cannot be parsed are coerced
        # to missing and end up as empty fields in the output.
        date=pd.to_datetime(cleaned['date'], errors='coerce').dt.strftime('%Y-%m-%d'),
    )

    # The first chunk creates/overwrites the file and writes the header;
    # later chunks are appended without repeating the header.
    cleaned.to_csv(
        output_path,
        mode='w' if is_first_chunk else 'a',
        header=is_first_chunk,
        index=False,
    )

# 6: All chunks have been written to the output file
print("6: Processed data saved:", output_path)