<h1 align="center">CSV Preprocessing</h1>

## Global Variables

### Project Specific Variables

In [None]:
# ---------------------------------------------------------------------------
# Input location
# ---------------------------------------------------------------------------
# File paths are built as CSV_FOLDER + <file name>, so the trailing slash
# is part of the value.
CSV_FOLDER = "../../data/"
# CSV_FOLDER = "/home/pyuser/data/"

# ---------------------------------------------------------------------------
# Input file names (relative to CSV_FOLDER)
# ---------------------------------------------------------------------------
CSV_LABELS_FILE = "paradise_csi.csv"
CSV_PATIENTS_FILE = "PatientIds.csv"
CSV_ARCHIMED_DATA = "ArchiMed_Data.csv"
CSV_TO_EXPLORE_1 = "paradise_csi_drop_non_nan_w_classes.csv"
CSV_TO_EXPLORE_2 = "paradise_csi_w_classes_w_non_nan.csv"

# ---------------------------------------------------------------------------
# Parsing options
# ---------------------------------------------------------------------------
# Field delimiter, e.g. ',' or '\t'.
CSV_SEPARATOR = ","
# Columns to load; an empty list means "load every column".
IMPORT_COLUMNS = []
# Number of rows read per chunk when a file is parsed.
CHUNK_SIZE = 50_000

# ---------------------------------------------------------------------------
# Project specific variables
# ---------------------------------------------------------------------------
EXAM_CODE_START = "2020-128 01-"


### Colors

In [None]:
# ANSI escape sequences for coloured console output, keyed by a one-letter
# code. 'W' is the reset sequence (ESC[0m): it is emitted after a coloured
# span to restore the terminal's default style.
ANSI = dict(
    R='\033[91m',  # Red
    G='\033[92m',  # Green
    B='\033[94m',  # Blue
    Y='\033[93m',  # Yellow
    W='\033[0m',   # Reset to default style
)

## Import CSVs to Dataframe

### CSV Import

In [None]:
import pandas as pd
import numpy as np

def load_csv(file_name, sep=CSV_SEPARATOR):
    """Read one CSV file from CSV_FOLDER into a single DataFrame.

    The file is parsed in chunks of CHUNK_SIZE rows and the chunks are
    concatenated with a fresh 0..n-1 index. IMPORT_COLUMNS restricts the
    columns that are read; an empty list means every column is loaded.
    A confirmation line is printed once the file has been read.

    Args:
        file_name: Name of the file inside CSV_FOLDER.
        sep: Field delimiter. Defaults to CSV_SEPARATOR.

    Returns:
        A pandas DataFrame containing all rows of the file.

    Raises:
        Whatever pandas raises while opening or parsing the file
        (e.g. FileNotFoundError, pandas.errors.ParserError).
    """
    chunks = pd.read_csv(
        CSV_FOLDER + file_name,
        sep=sep,
        usecols=IMPORT_COLUMNS if IMPORT_COLUMNS else None,
        chunksize=CHUNK_SIZE,
    )
    frame = pd.concat(chunks, ignore_index=True)
    print(f"{ANSI['G']}Successfully imported{ANSI['W']} {file_name}")
    return frame


# Import CSV files into dataframes
try:
    df_labels = load_csv(CSV_LABELS_FILE)              # labels data
    df_patients = load_csv(CSV_PATIENTS_FILE)          # patient data
    df_archimed = load_csv(CSV_ARCHIMED_DATA, sep=';')  # ArchiMed CSV separator is ';'
    df_to_explore_1 = load_csv(CSV_TO_EXPLORE_1)       # CSV to explore 1
    df_to_explore_2 = load_csv(CSV_TO_EXPLORE_2)       # CSV to explore 2
except Exception as e:
    print(f"{ANSI['R']}Error importing CSV files: {str(e)}{ANSI['W']}")
    # Re-raise so execution stops here: swallowing the error would leave the
    # dataframes undefined and make later cells fail with an unrelated
    # NameError that hides the real cause.
    raise


## Rename columns

### ArchiMed CSV

In [None]:
# Raw ArchiMed header -> short, identifier-style column name.
# Headers missing from df_archimed are silently skipped by rename(), so
# running this cell a second time is harmless. 'FileID' is kept as is.
archimed_column_names = {
    # Not really the patient's name but the exam code
    "Exam": 'ExamCode',
    # ID of the image series
    'Instance Number': 'InstanceNumber',
    # ID of the patient's hospital stay
    'Admission ID - (0038,0010)': 'AdmissionID',
    # Type of the image (PRIMARY, etc.)
    'Image Type - (0008,0008)': 'ImageType',
    # Description of the image series
    'Series Description - (0008,103E)': 'SeriesDescription',
    # Description of the image derivation
    'Derivation Description - (0008,2111)': 'DerivationDescription',
}
df_archimed.rename(columns=archimed_column_names, inplace=True)

### Labeled Data CSV

In [None]:
# Give the label columns the same names the ArchiMed frame uses, so both
# frames can later be joined on them.
labels_column_names = {
    # Last 4 digits of the exam code
    'number': 'ExamCodeEnd',
    # ID of the patient's hospital stay
    'id_number': 'AdmissionID',
}
df_labels.rename(columns=labels_column_names, inplace=True)

## Check ArchiMed data to merge with Labeled data

In [None]:
# Row mask: True where ImageType contains 'PRIMARY'. Rows with a missing
# ImageType count as non-PRIMARY (na=False).
is_primary = df_archimed['ImageType'].str.contains('PRIMARY', na=False)

# ArchiMed rows restricted to PRIMARY images
df_archimed_primary = df_archimed[is_primary]

# Number of PRIMARY images for every exam that has at least one
primary_images_per_exam = df_archimed_primary.groupby('ExamCode').size()

# How many exams share each PRIMARY-image count, ordered by that count
exams_distribution = primary_images_per_exam.value_counts().sort_index()
print(f"{ANSI['G']}Distribution of PRIMARY images per exam:{ANSI['W']}")
for n_primary, n_exams in exams_distribution.items():
    print(f"  {n_exams} exams have {n_primary} PRIMARY images")

# Exams absent from primary_images_per_exam have no PRIMARY image at all
total_exams = df_archimed['ExamCode'].nunique()
exams_with_primary = len(primary_images_per_exam)
exams_with_no_primary = total_exams - exams_with_primary
exams_with_multiple = int((primary_images_per_exam > 1).sum())

print(f"\n{ANSI['G']}Summary:{ANSI['W']}")
print(f"  {exams_with_primary} exams have at least one PRIMARY image")
print(f"  {exams_with_no_primary} exams have no PRIMARY images")
print(f"  {exams_with_multiple} exams have multiple PRIMARY images")

print(f"\n{ANSI['G']}Successfully filtered{ANSI['W']} PRIMARY images from ArchiMed data")


In [None]:
# Exams (keyed by ExamCode) holding more than one PRIMARY image, largest
# count first
has_several_primary = primary_images_per_exam > 1
series_with_multiple = (
    primary_images_per_exam[has_several_primary]
    .sort_values(ascending=False)
)

print(f"{ANSI['G']}Series with multiple PRIMARY images:{ANSI['W']}")
for code, n_primary in series_with_multiple.items():
    print(f"  ExamCode: {code} ({n_primary} primary images)")


## Merge ArchiMed Data with Labeled Data

In [None]:
# Join ArchiMed rows to their labels through the numeric exam-code suffix.

# The join key is the last four characters of ExamCode, cast to int. The
# cast raises if a suffix is missing or not numeric.
exam_code_suffix = df_archimed['ExamCode'].str[-4:]
df_archimed['ExamCodeEnd'] = exam_code_suffix.astype(int)

# NOTE(review): this joins the full df_archimed, not df_archimed_primary —
# confirm that non-PRIMARY images are meant to be kept here.
# NOTE(review): if both frames carry an 'AdmissionID' column after the
# renames, pandas will suffix them as AdmissionID_x / AdmissionID_y.
df_merged = df_archimed.merge(df_labels, on='ExamCodeEnd', how='inner')

# Rows of either input whose key never made it into the inner join
matched_keys = df_merged['ExamCodeEnd']
archimed_matched = df_archimed['ExamCodeEnd'].isin(matched_keys)
labels_matched = df_labels['ExamCodeEnd'].isin(matched_keys)
df_unmerged_archimed = df_archimed[~archimed_matched]
df_unmerged_labels = df_labels[~labels_matched]

print(f"\n{ANSI['G']}Successfully merged{ANSI['W']} ArchiMed data with Labeled data")
print(f"Number of rows in merged dataset: {len(df_merged)}")
print(f"Number of unmerged rows from ArchiMed: {len(df_unmerged_archimed)}")
print(f"Number of unmerged rows from Labels: {len(df_unmerged_labels)}")


In [None]:
# Second attempt for the label rows left over by the ExamCodeEnd join:
# match them to ArchiMed rows through AdmissionID instead.
df_merged_labels_from_unmerged = df_unmerged_labels.merge(
    df_archimed,
    on='AdmissionID',
    how='inner',
)

print(f"\n{ANSI['G']}Successfully merged{ANSI['W']} unmerged Labeled data with ArchiMed data")
print(f"Number of rows in merged dataset: {len(df_merged_labels_from_unmerged)}")

## Delete Series that are neither 'Chest' nor 'LIT' when an exam has multiple series

In [None]:
# Number of distinct 'Serie' values per exam, and the exams having several
series_per_exam = df_merged.groupby('ExamCode')['Serie'].nunique()
exams_with_multiple = series_per_exam.loc[series_per_exam > 1].index

# A row is dropped when its exam has several series AND its own 'Serie'
# mentions neither 'Chest' nor 'LIT' (case-insensitive). A missing 'Serie'
# counts as "no match" (na=False), so such rows are dropped as well.
in_multi_series_exam = df_merged['ExamCode'].isin(exams_with_multiple)
is_chest_or_lit = df_merged['Serie'].str.contains('Chest|LIT', case=False, na=False)
mask = in_multi_series_exam & ~is_chest_or_lit

# Keep a copy of the dropped rows for later inspection, then drop them
df_lines_removed = df_merged[mask].copy()
df_merged = df_merged[~mask]

print(f"\n{ANSI['G']}Successfully removed non-Chest series{ANSI['W']} from exams with multiple series")
print(f"Number of rows removed: {mask.sum()}")
print(f"Number of rows remaining: {len(df_merged)}")
print("Removed rows saved in df_lines_removed")

In [None]:
# Sanity check: every exam that lost rows in the previous step must still
# have at least one row left in df_merged.

# Exams that had at least one row removed
removed_exam_codes = df_lines_removed['ExamCode'].unique()

# Distinct series still present for those exams
rows_still_present = df_merged[df_merged['ExamCode'].isin(removed_exam_codes)]
remaining_series = rows_still_present.groupby('ExamCode')['Serie'].nunique()

# An affected exam that is missing from remaining_series has no rows left
exam_survived = np.isin(removed_exam_codes, remaining_series.index)
empty_exams = removed_exam_codes[~exam_survived]

if len(empty_exams) == 0:
    print(f"\n{ANSI['G']}All exams still have at least one series{ANSI['W']} after removing non-Chest series")
    print("Minimum series per exam:", remaining_series.min())
    print("Maximum series per exam:", remaining_series.max())
    print(f"Average series per exam: {remaining_series.mean():.2f}")
else:
    print(f"\n{ANSI['R']}Warning:{ANSI['W']} {len(empty_exams)} exams have no remaining series after removal:")
    print(empty_exams)