<h1 align="center">CSV Preprocessing</h1>

## Global Variables

### Project Specific Variables

In [None]:
# --- Input location -------------------------------------------------------
# Folder that holds every CSV listed below. Kept as a plain string with a
# trailing "/" because the import cell builds paths with CSV_FOLDER + name.
CSV_FOLDER = "../../data/"
# CSV_FOLDER = "/home/pyuser/data/"

# --- Input file names (relative to CSV_FOLDER) ----------------------------
CSV_LABELS_FILE = "paradise_csi.csv"
CSV_PATIENTS_FILE = "PatientIds.csv"
CSV_ARCHIMED_DATA = "ArchiMed_Data.csv"
CSV_TO_EXPLORE_1 = "paradise_csi_drop_non_nan_w_classes.csv"
CSV_TO_EXPLORE_2 = "paradise_csi_w_classes_w_non_nan.csv"

# --- Reader options -------------------------------------------------------
# Default field separator (e.g. ',' or '\t'). The ArchiMed file is read with
# ';' instead, which is hard-coded in the import cell.
CSV_SEPARATOR = ","
# Columns to load from each file; an empty list means "load every column".
IMPORT_COLUMNS = []
# Number of rows read per chunk when a CSV is loaded piecewise.
CHUNK_SIZE = 50_000

# --- Project specific -----------------------------------------------------
# Not used by the cells visible here; presumably the prefix that exam codes
# start with - TODO confirm against the cells that consume it.
EXAM_CODE_START = "2020-128 01"


### Colors

In [None]:
# ANSI SGR escape sequences for colored console output, keyed by a
# one-letter code. Wrap text as f"{ANSI['G']}text{ANSI['W']}".
ANSI = dict(
    R='\033[91m',  # bright red
    G='\033[92m',  # bright green
    B='\033[94m',  # bright blue
    Y='\033[93m',  # bright yellow
    W='\033[0m',   # reset: restores the terminal's default style (not literally white)
)

## Import CSV to Dataframe

### CSV Import

In [None]:
import pandas as pd

def load_csv_in_chunks(file_name, separator=None):
    """Load one CSV from CSV_FOLDER into a single DataFrame, reading it in chunks.

    The file is read CHUNK_SIZE rows at a time and the chunks are concatenated
    with a fresh 0..n-1 index. When IMPORT_COLUMNS is non-empty only those
    columns are read; otherwise every column is read.

    Args:
        file_name: Name of the CSV file, relative to CSV_FOLDER.
        separator: Field separator. Defaults to CSV_SEPARATOR (looked up at
            call time) when None.

    Returns:
        The loaded DataFrame, or None when the import failed. A failure is
        reported on stdout in red together with the file name instead of
        being raised, so the remaining files are still attempted.
    """
    if separator is None:
        separator = CSV_SEPARATOR
    try:
        chunks = pd.read_csv(
            CSV_FOLDER + file_name,
            sep=separator,
            usecols=IMPORT_COLUMNS if IMPORT_COLUMNS else None,
            chunksize=CHUNK_SIZE,
        )
        dataframe = pd.concat(chunks, ignore_index=True)
    except Exception as e:
        # Best effort, like the rest of this cell: report and carry on, but
        # say which file failed so the problem can be located.
        print(f"{ANSI['R']}Error importing {file_name}: {str(e)}{ANSI['W']}")
        return None
    print(f"{ANSI['G']}Successfully imported{ANSI['W']} {file_name}")
    return dataframe


# Import CSV files into dataframes.
# Each file is loaded independently, so one missing or malformed file no
# longer prevents the files after it from being imported. A failed import
# binds None to its name, which keeps a stale DataFrame from an earlier run
# of this cell from being reused silently.
df_labels = load_csv_in_chunks(CSV_LABELS_FILE)
df_patients = load_csv_in_chunks(CSV_PATIENTS_FILE)
df_archimed = load_csv_in_chunks(CSV_ARCHIMED_DATA, separator=';')  # ArchiMed CSV separator is ';'
df_to_explore_1 = load_csv_in_chunks(CSV_TO_EXPLORE_1)
df_to_explore_2 = load_csv_in_chunks(CSV_TO_EXPLORE_2)


In [None]:
# Give the ArchiMed columns short, readable names. The original headers carry
# a " - (gggg,eeee)" suffix (it looks like the DICOM tag of each attribute).
# rename() ignores keys that are absent, so re-running this cell is harmless.
df_archimed.rename(
    columns={
        "Patient's Name - (0010,0010)": 'ExamCode',
        'Instance Number - (0020,0013)': 'InstanceNumber',
        'Admission ID - (0038,0010)': 'AdmissionID',
        'Image Type - (0008,0008)': 'ImageType',
        'Derivation Description - (0008,2111)': 'Derivation',
    },
    inplace=True,
)

# Boolean mask of the rows whose ImageType mentions PRIMARY. Rows with a
# missing ImageType are treated as non-PRIMARY (na=False).
is_primary = df_archimed['ImageType'].str.contains('PRIMARY', na=False)

# Number of PRIMARY images for each ExamCode ("series" below means one
# ExamCode group). Exam codes without any PRIMARY row do not appear here.
primary_images_per_series = df_archimed[is_primary].groupby('ExamCode').size()

# How many series share each PRIMARY-image count, ordered by that count
series_distribution = primary_images_per_series.value_counts().sort_index()
print(f"{ANSI['G']}Distribution of PRIMARY images per series:{ANSI['W']}")
for count, num_series in series_distribution.items():
    print(f"  {num_series} series have {count} PRIMARY images")

# Totals for the summary
total_series = df_archimed['ExamCode'].nunique()
series_with_primary = len(primary_images_per_series)
# Whatever is not in the PRIMARY tally has no PRIMARY image at all
series_with_no_primary = total_series - series_with_primary
series_with_multiple = int((primary_images_per_series > 1).sum())

print(f"\n{ANSI['G']}Summary:{ANSI['W']}")
print(f"  {series_with_primary} series have at least one PRIMARY image")
print(f"  {series_with_no_primary} series have no PRIMARY images")
print(f"  {series_with_multiple} series have multiple PRIMARY images")

# Keep only the PRIMARY rows for the rest of the notebook
df_archimed_primary = df_archimed[is_primary]
print(f"{ANSI['G']}Successfully filtered{ANSI['W']} PRIMARY images from ArchiMed data")
