<h1 align="center">CSV Preprocessing</h1>

## Global Variables

### Project Specific Variables

In [None]:
# --- CSV input -------------------------------------------------------------
# Folder that holds the CSV files (joined with CSV_LABELS_FILE to build the path)
CSV_FOLDER = "../../data/Paradise_CSV/"
# Alternative location:
# CSV_FOLDER = "/home/pyuser/data/"
# File name of the labeled-data CSV inside CSV_FOLDER
CSV_LABELS_FILE = "Labeled_Data_RAW_Sample.csv"
# Field separator of the CSV file, e.g. ',' or '\t'
CSV_SEPARATOR = ";"
# Columns to import; an empty list imports every column
IMPORT_COLUMNS = []
# Number of rows read per chunk when importing the CSV
CHUNK_SIZE = 50_000

# --- Download --------------------------------------------------------------
# Destination folder for the downloaded DICOM files
DOWNLOAD_PATH = '../../data/Paradise_DICOMs'
# Destination folder for the converted PNG images
IMAGES_PATH = '../../data/Paradise_Images'
# If True, DICOM files are converted to PNG after each download batch
CONVERT = True

# --- Conversion ------------------------------------------------------------
# Number of files per batch (progress report and optional conversion per batch)
BATCH_SIZE = 10
# Bit depth of the output images (8, 12, or 16)
BIT_DEPTH = 8
# If True, output files are written into per-exam subfolders
CREATE_SUBFOLDERS = False
# If True, delete the DICOM file and its containing subfolder after conversion
DELETE_DICOM = False
# Monochrome type (1 or 2) to use for converted images
MONOCHROME = 1
# Optional target width (currently disabled):
# RESIZE_X = 518
# Target height of the converted images in pixels
RESIZE_Y = 750

### Colors

In [None]:
# ANSI escape sequences for colorized console output.
# Typical use: f"{ANSI['R']}message{ANSI['W']}" (color on, message, reset).
ANSI = dict(
    R='\033[91m',  # Bright red
    G='\033[92m',  # Bright green
    B='\033[94m',  # Bright blue
    Y='\033[93m',  # Bright yellow
    W='\033[0m',   # Reset to the terminal default (not literally white)
)

In [None]:
# ArchiMed connector module (external dependency, not part of the standard library).
# Alternative import style: from ArchiMedConnector.A3_Connector import A3_Connector
import ArchiMedConnector.A3_Connector as A3_Conn

# Create the single connector instance shared by the notebook; it is used by
# download_archimed_files for getUserInfos() and downloadFile().
# NOTE(review): presumably the constructor picks up credentials/configuration on
# its own - confirm against the ArchiMedConnector documentation.
# With the alternative import above this would read: a3conn = A3_Connector()
a3conn = A3_Conn.A3_Connector()

import pandas as pd
import os
import pydicom
import numpy as np
from PIL import Image
import glob
from tqdm import tqdm
import shutil
import warnings
import matplotlib.pyplot as plt
import seaborn as sns

## Import CSVs to Dataframe

### CSV Import

### Function

In [None]:
def import_csv_to_dataframe(file_path, separator=';', columns=None, chunk_size=None):
    """
    Load a delimited text file into a single pandas DataFrame.

    Args:
        file_path (str): Location of the CSV file to read
        separator (str): Field delimiter used in the file
        columns (list): Columns to load; None or an empty list loads every column
        chunk_size (int): Rows per chunk for chunked reading; a falsy value
            (None or 0) reads the whole file in one pass

    Returns:
        pandas.DataFrame: The loaded data, or None if reading failed
        (the error is printed instead of raised)
    """
    try:
        # An empty selection means "no restriction" for pandas
        selected = None
        if columns and len(columns) > 0:
            selected = columns

        if not chunk_size:
            # Single-pass read
            return pd.read_csv(file_path, sep=separator, usecols=selected)

        # Chunked read: gather all pieces, then stitch them into one frame
        # with a fresh 0..n-1 index
        reader = pd.read_csv(
            file_path,
            sep=separator,
            usecols=selected,
            chunksize=chunk_size,
        )
        pieces = list(reader)
        return pd.concat(pieces, ignore_index=True)
    except Exception as err:
        print(f"{ANSI['R']}Error importing CSV: {err}{ANSI['W']}")
        return None

### Execution

In [None]:
# Build the full path of the labels file and load it into a DataFrame
csv_path = os.path.join(CSV_FOLDER, CSV_LABELS_FILE)
print(f"{ANSI['B']}Importing labeled data from: {csv_path}{ANSI['W']}")

df_labeled_data = import_csv_to_dataframe(
    csv_path,
    CSV_SEPARATOR,
    IMPORT_COLUMNS,
    CHUNK_SIZE,
)

# import_csv_to_dataframe returns None when the import failed
if df_labeled_data is None:
    print(f"{ANSI['R']}Failed to import labeled data{ANSI['W']}")
else:
    print(f"{ANSI['G']}Successfully imported {len(df_labeled_data)} rows of labeled data{ANSI['W']}")
    # Preview the first rows in the notebook output
    display(df_labeled_data.head())


## Download the files from ArchiMed

### Function

In [None]:
def collect_metadata(file_ids):
    """
    Collect metadata from DICOM file headers for one or more FileIDs.

    Each file is looked up at ``<DOWNLOAD_PATH>/<FileID>/<FileID>.dcm``, where
    DOWNLOAD_PATH is the module-level download folder.

    Parameters:
    -----------
    file_ids : str, int or iterable of str/int
        A single FileID (string or integer) or an iterable of FileIDs to process

    Returns:
    --------
    pandas.DataFrame
        One row per DICOM file that could be read. The 'FileID' column holds
        the ID exactly as it was passed in; every other column holds the
        string form of one public, non-callable, non-sequence attribute of the
        dataset. An empty DataFrame is returned when nothing was collected.
        Missing or unreadable files are reported on stdout and skipped.
    """
    # Wrap any scalar ID into a list. Strings are iterable but must be treated
    # as one ID; integers (the type used by download_archimed_files) are not
    # iterable at all and previously raised a TypeError in the loop below.
    if isinstance(file_ids, (str, bytes)) or not hasattr(file_ids, '__iter__'):
        file_ids = [file_ids]

    # Attributes that expose the image itself rather than header metadata.
    # NOTE(review): 'pixel_array' is skipped because reading it decodes the
    # whole image where dir() exposes it - confirm against the pydicom version in use.
    skipped_attrs = ('PixelData', 'pixel_array')

    metadata_list = []

    for file_id in file_ids:
        # os.path.join only accepts strings, FileIDs are frequently integers
        file_name = str(file_id)
        dicom_file_path = os.path.join(DOWNLOAD_PATH, file_name, f"{file_name}.dcm")

        try:
            if not os.path.exists(dicom_file_path):
                print(f"{ANSI['R']}DICOM file not found: {dicom_file_path}{ANSI['W']}")
                continue

            # Read the DICOM file
            dicom_data = pydicom.dcmread(dicom_file_path)

            # Keep the ID in its original type so callers can compare against it
            metadata = {'FileID': file_id}

            for attr in dir(dicom_data):
                # Skip private names and pixel data
                if attr.startswith('_') or attr in skipped_attrs:
                    continue
                try:
                    # Single lookup; a failing attribute is recorded, not fatal
                    value = getattr(dicom_data, attr)
                    # Skip methods and sequence attributes (complex nested objects)
                    if callable(value) or isinstance(value, pydicom.sequence.Sequence):
                        continue
                    metadata[attr] = str(value)
                except Exception as e:
                    metadata[attr] = f"Error: {str(e)}"

            metadata_list.append(metadata)
        except Exception as e:
            print(f"{ANSI['R']}Error processing {file_id}: {str(e)}{ANSI['W']}")

    # Create DataFrame from the collected metadata
    if metadata_list:
        return pd.DataFrame(metadata_list)

    print(f"{ANSI['Y']}No metadata collected.{ANSI['W']}")
    return pd.DataFrame()

In [None]:
def _find_dicom_files(import_folder):
    """Return every .dcm/.DCM file below import_folder, each path listed once."""
    matches = []
    for ext in ('.dcm', '.DCM'):  # Common DICOM extensions
        matches.extend(glob.glob(os.path.join(import_folder, '**/*' + ext), recursive=True))
    # Both patterns can match the same file where matching is case-insensitive;
    # dict.fromkeys removes duplicates while keeping the discovery order.
    return list(dict.fromkeys(matches))


def _scale_to_bit_depth(pixel_array, bits_stored, bit_depth):
    """Scale pixel values into [0, 2**bit_depth - 1] and cast to uint8/uint16."""
    output_max_value = (2 ** bit_depth) - 1
    max_possible_value = (2 ** bits_stored) - 1
    max_pixel_value = pixel_array.max()

    scaled = pixel_array.astype(np.float64)
    if max_pixel_value > 0:
        # Normalize by the brightest pixel, capped at the nominal stored range
        scaled = (scaled / min(max_pixel_value, max_possible_value)) * output_max_value

    # Negative values or values above the nominal range would otherwise wrap
    # around when cast to an unsigned integer type.
    scaled = np.clip(scaled, 0, output_max_value)
    return scaled.astype(np.uint8 if bit_depth <= 8 else np.uint16)


def _target_size(original_size, resize_x, resize_y):
    """Return the (width, height) to resize to, or None if no resize was requested."""
    if resize_x is None and resize_y is None:
        return None

    original_width, original_height = original_size
    if resize_x is not None and resize_y is not None:
        # Both dimensions specified, resize to exact dimensions
        return (resize_x, resize_y)
    if resize_x is not None:
        # Only width specified, derive height from the aspect ratio (at least 1 px)
        return (resize_x, max(1, int(resize_x * (original_height / original_width))))
    # Only height specified, derive width from the aspect ratio (at least 1 px)
    return (max(1, int(resize_y * (original_width / original_height))), resize_y)


def convert_dicom_to_png(
    import_folder,
    export_folder,
    bit_depth=8,
    create_subfolders=False,
    resize_x=None,
    resize_y=None,
    monochrome=2,
    delete_dicom=True
):
    """
    Convert all DICOM files in import_folder (including subfolders) to PNG format.

    Parameters:
    -----------
    import_folder : str
        Path to folder containing DICOM files to convert
    export_folder : str
        Path to folder where PNG files will be saved (created if missing)
    bit_depth : int
        Bit depth for output images (8, 12, or 16); 8 is stored as uint8,
        12 and 16 as uint16
    create_subfolders : bool
        If True, write each PNG into a subfolder named after the file's
        StudyDescription (falling back to the name of the DICOM's parent folder)
    resize_x : int or None
        Width to resize images to (if None, no resizing unless resize_y is specified)
    resize_y : int or None
        Height to resize images to (if None, no resizing unless resize_x is specified)
    monochrome : int
        Monochrome type (1 or 2) of the output images. Files whose
        PhotometricInterpretation is the other MONOCHROME type are inverted;
        files without that header are assumed to already match.
    delete_dicom : bool
        If True, delete the DICOM file after conversion, and its containing
        subfolder if that becomes empty (never the import folder itself)

    Returns:
    --------
    dict
        Counts with the keys "successful", "skipped" (could not be read as a
        DICOM image), "failed" (error during conversion) and "total"

    Raises:
    -------
    ValueError
        If bit_depth is not 8, 12 or 16, or monochrome is not 1 or 2
    """
    # Validate bit depth
    if bit_depth not in [8, 12, 16]:
        raise ValueError("bit_depth must be 8, 12, or 16")

    # Validate monochrome
    if monochrome not in [1, 2]:
        raise ValueError("monochrome must be 1 or 2")

    # Create export folder if it doesn't exist
    os.makedirs(export_folder, exist_ok=True)

    dicom_files = _find_dicom_files(import_folder)
    output_max_value = (2 ** bit_depth) - 1
    # Normalized once so a trailing separator cannot defeat the comparison below
    import_root = os.path.normpath(import_folder)

    # Initialize counters
    successful = 0
    failed = 0
    skipped = 0

    # Suppress specific pydicom warnings about character sets.
    # NOTE: this filter is process-wide and stays active after the call.
    warnings.filterwarnings("ignore", category=UserWarning, module="pydicom.charset")

    # Process each DICOM file
    for dicom_path in tqdm(dicom_files, desc="Converting DICOM files to PNG", total=len(dicom_files)):
        try:
            # Files that cannot be read as a DICOM image are skipped, not failed
            try:
                ds = pydicom.dcmread(dicom_path)
                pixel_array = ds.pixel_array
            except Exception:
                skipped += 1
                continue

            # Monochrome type of the source; assume the target type when the
            # header does not say MONOCHROME1/MONOCHROME2
            dicom_monochrome = monochrome
            photometric = getattr(ds, 'PhotometricInterpretation', None)
            if photometric == 'MONOCHROME1':
                dicom_monochrome = 1
            elif photometric == 'MONOCHROME2':
                dicom_monochrome = 2

            # Stored bit depth: BitsStored, else BitsAllocated, else 14
            bits_stored = getattr(ds, 'BitsStored', getattr(ds, 'BitsAllocated', 14))
            pixel_array = _scale_to_bit_depth(pixel_array, bits_stored, bit_depth)

            # Invert when the source monochrome type differs from the requested one
            if dicom_monochrome != monochrome:
                pixel_array = output_max_value - pixel_array

            # Convert to PIL Image
            img = Image.fromarray(pixel_array)

            # Resize if requested, keeping the aspect ratio when only one
            # dimension is given
            new_size = _target_size(img.size, resize_x, resize_y)
            if new_size is not None:
                img = img.resize(new_size, Image.LANCZOS)

            # Determine output path
            base_filename = os.path.splitext(os.path.basename(dicom_path))[0]
            if create_subfolders:
                exam_code = str(getattr(ds, 'StudyDescription', os.path.basename(os.path.dirname(dicom_path))))
                output_dir = os.path.join(export_folder, exam_code)
                os.makedirs(output_dir, exist_ok=True)
            else:
                output_dir = export_folder

            # Save as PNG
            img.save(os.path.join(output_dir, f"{base_filename}.png"))
            successful += 1

            # Delete DICOM file and its containing folder if requested
            if delete_dicom:
                os.remove(dicom_path)

                dicom_folder = os.path.dirname(dicom_path)
                # Never delete the main import folder
                if os.path.normpath(dicom_folder) != import_root:
                    try:
                        # Only remove the folder once it is empty
                        if not os.listdir(dicom_folder):
                            shutil.rmtree(dicom_folder)
                    except Exception as e:
                        print(f"{ANSI['Y']}Warning: Could not delete folder {dicom_folder}: {str(e)}{ANSI['W']}")

        except Exception as e:
            print(f"{ANSI['R']}Error converting {dicom_path}: {str(e)}{ANSI['W']}")
            failed += 1

    # Summary
    return {
        "successful": successful,
        "skipped": skipped,
        "failed": failed,
        "total": len(dicom_files)
    }

In [None]:
def download_archimed_files(dataframe, download_path, file_id_column='FileID', batch_size=20, convert=False):
    """
    Downloads files from ArchiMed based on FileIDs in the dataframe.

    Every unique, non-missing FileID is stored as
    ``<download_path>/<FileID>/<FileID>.dcm``; files that already exist are
    not downloaded again. Header metadata is collected for each file that is
    present afterwards.

    Args:
        dataframe (pandas.DataFrame): DataFrame containing FileIDs
        download_path (str): Path where to save downloaded files
        file_id_column (str): Name of the column containing FileIDs (default: 'FileID')
        batch_size (int): Number of files per batch; progress is reported (and
            conversion optionally run) after each batch. Values below 1 are treated as 1.
        convert (bool): If True, convert downloaded DICOM files to PNG after each batch (default: False)

    Returns:
        pandas.DataFrame: Header metadata of the files that were downloaded or
        already present, with 'FileID' as integer. Empty if the FileID column
        is missing or no metadata could be collected.
    """

    # Create download directory if it doesn't exist
    os.makedirs(download_path, exist_ok=True)

    # Get user info for verification
    user_info = a3conn.getUserInfos()
    print(f"{ANSI['G']}ArchiMed Connector Authentication Information")
    print(f"{ANSI['B']}Username:{ANSI['W']} {user_info.get('userInfos', {}).get('login', 'Unknown')}")
    print(f"{ANSI['B']}User level:{ANSI['W']} {user_info.get('userInfos', {}).get('level', 'Unknown')}")
    print(f"{ANSI['B']}Native Groups:{ANSI['W']} {', '.join(user_info.get('nativeGroups', ['None']))}")
    print(f"{ANSI['B']}Authorized studies:{ANSI['W']} {', '.join(user_info.get('authorizedStudies', ['None']))}")
    print(f"{ANSI['B']}Authorized temporary storages:{ANSI['W']} {', '.join(user_info.get('authorizedTmpStorages', ['None']))}")

    # Check if the FileID column exists
    if file_id_column not in dataframe.columns:
        print(f"{ANSI['R']}Error: Column '{file_id_column}' not found in dataframe{ANSI['W']}")
        return pd.DataFrame()

    # Unique FileIDs without missing values. Dropping NaN here (instead of
    # skipping inside the loop) keeps the progress counters accurate and
    # guarantees that the final-batch trigger below fires.
    file_ids = dataframe[file_id_column].dropna().unique()
    total_files = len(file_ids)

    # Guard against batch_size <= 0, which would break the modulo below
    if batch_size < 1:
        batch_size = 1

    print(f"\n{ANSI['B']}Starting download of {ANSI['W']}{total_files}{ANSI['B']} files to{ANSI['W']} {download_path}\n")

    failed_files = []
    batch_files = []
    all_metadata = pd.DataFrame()  # Store all metadata records
    downloaded_count = 0
    skipped_count = 0

    for i, file_id in enumerate(file_ids):
        try:
            # IDs may arrive as floats (e.g. from a column that contained NaN)
            file_id = int(file_id)

            # Define output path for this file
            file_output_path = os.path.join(download_path, f"{file_id}")
            dicom_file_path = os.path.join(file_output_path, f"{file_id}.dcm")

            # Check if the file already exists
            if os.path.exists(dicom_file_path):
                print(f"{ANSI['Y']}File {ANSI['W']}{file_id}{ANSI['Y']} already exists, skipping download (Progress: {ANSI['W']}{((i+1)/total_files)*100:.1f}%{ANSI['Y']} - {ANSI['W']}{i+1}/{total_files}{ANSI['Y']}){ANSI['W']}")
                batch_files.append(dicom_file_path)
                skipped_count += 1
            else:
                print(f"{ANSI['B']}Downloading file {ANSI['W']}{file_id}{ANSI['B']} (Progress: {ANSI['W']}{((i+1)/total_files)*100:.1f}%{ANSI['B']} - {ANSI['W']}{i+1}/{total_files}{ANSI['B']}) from{ANSI['W']} ArchiMed")

                # Download the file
                result = a3conn.downloadFile(
                    file_id,
                    asStream=False,
                    destDir=file_output_path,
                    filename=f"{file_id}.dcm",
                    inWorklist=False
                )

                downloaded_count += 1
                batch_files.append(result)

            # Collect metadata for this file. The ID is passed as a string
            # because collect_metadata builds file paths from it; passing the
            # integer made every file end up in the except branch below.
            # NOTE(review): collect_metadata reads from the module-level
            # DOWNLOAD_PATH, not from download_path - they must point to the
            # same folder.
            file_metadata = collect_metadata(str(file_id))

            if not file_metadata.empty:
                # Keep FileID as integer so the duplicate check compares like with like
                file_metadata['FileID'] = file_id
                if all_metadata.empty or not (all_metadata['FileID'] == file_id).any():
                    all_metadata = pd.concat([all_metadata, file_metadata], ignore_index=True)

        except Exception as e:
            failed_files.append(file_id)
            print(f"{ANSI['R']}Error downloading file ID {file_id}: {str(e)}{ANSI['W']}")

        # Batch boundary handling lives outside the try block so that a failure
        # on the last file of a batch does not skip conversion and progress.
        if (i + 1) % batch_size == 0 or (i + 1) == total_files:

            # Convert batch if requested.
            # NOTE(review): convert_dicom_to_png scans the whole download
            # folder, so files from earlier batches are converted again unless
            # DELETE_DICOM removes them.
            if convert and batch_files:
                try:
                    convert_dicom_to_png(
                        import_folder=download_path,
                        export_folder=IMAGES_PATH,
                        bit_depth=BIT_DEPTH,
                        create_subfolders=CREATE_SUBFOLDERS,
                        delete_dicom=DELETE_DICOM,
                        monochrome=MONOCHROME,
                        resize_y=RESIZE_Y,
                    )
                except Exception as e:
                    print(f"{ANSI['R']}Error during conversion: {str(e)}{ANSI['W']}")

            batch_files = []
            print(f"{ANSI['Y']}Progress:{ANSI['W']} {i + 1}/{total_files} {ANSI['B']}files processed {ANSI['W']}({ANSI['B']}{((i + 1) / total_files * 100):.1f}%{ANSI['W']})\n")

    # Summary
    print(f"\n{ANSI['G']}Download complete: {downloaded_count} files downloaded successfully{ANSI['W']}")
    if skipped_count > 0:
        print(f"{ANSI['Y']}Skipped {skipped_count} files (already downloaded){ANSI['W']}")
    if failed_files:
        print(f"{ANSI['R']}Failed to download {len(failed_files)} files{ANSI['W']}")

    return all_metadata

### Execution

In [None]:
# Fetch the files referenced by the labeled data (needs a successful CSV import)
if df_labeled_data is None:
    print(f"{ANSI['R']}Cannot download files: No labeled data available{ANSI['W']}")
else:
    print(f"{ANSI['Y']}Starting download of ArchiMed files...{ANSI['W']}\n")
    metadata_df = download_archimed_files(
        df_labeled_data,
        DOWNLOAD_PATH,
        batch_size=BATCH_SIZE,
        convert=CONVERT,
    )
    # An empty frame means no DICOM header could be read
    if metadata_df.empty:
        print(f"{ANSI['Y']}Files downloaded but no metadata was collected{ANSI['W']}")
    else:
        print(f"{ANSI['G']}Downloaded files successfully to {ANSI['W']}{DOWNLOAD_PATH}")
        print(f"{ANSI['B']}Metadata collected for {ANSI['W']}{len(metadata_df)}{ANSI['B']} images{ANSI['W']}")

## Correlation Matrix

### Function

In [None]:
def display_correlation_matrix(dataframe, gradient_name='coolwarm', figsize=(10, 8)):
    """
    Display a correlation matrix for the given dataframe.

    Only numeric columns take part in the correlation. Nothing is plotted
    (a notice is printed instead) when there is no dataframe or it has no
    numeric columns.

    Parameters:
    -----------
    dataframe : pandas.DataFrame or None
        The dataframe containing the data to analyze
    gradient_name : str, default='coolwarm'
        The name of the color gradient to use for the heatmap
    figsize : tuple, default=(10, 8)
        Width and height passed to plt.figure

    Returns:
    --------
    None
        Displays the correlation matrix as a heatmap
    """
    # The CSV import returns None on failure; do not crash on that
    if dataframe is None:
        print(f"{ANSI['Y']}No data available for the correlation matrix.{ANSI['W']}")
        return None

    # Select only numeric columns
    numeric_df = dataframe.select_dtypes(include=['number'])
    if numeric_df.shape[1] == 0:
        print(f"{ANSI['Y']}No numeric columns found; correlation matrix not displayed.{ANSI['W']}")
        return None

    # Calculate the correlation matrix
    corr_matrix = numeric_df.corr()

    # Create a figure with the requested size
    plt.figure(figsize=figsize)

    # Create a heatmap
    sns.heatmap(
        corr_matrix,
        annot=True,              # Show correlation values
        cmap=gradient_name,      # Color map
        linewidths=0.5,          # Width of the grid lines
        vmin=-1, vmax=1,         # Value range
        fmt='.2f',               # Format of the annotations
        square=True              # Make cells square-shaped
    )

    # Add title and adjust layout
    plt.title('Numeric Parameters Correlation Matrix', fontsize=14, pad=20)
    plt.tight_layout()

    # Display the plot
    plt.show()

    return None

### Execution

In [None]:
# Plot only when the CSV import succeeded (df_labeled_data is None on failure),
# mirroring the guard used before the download step
if df_labeled_data is not None:
    display_correlation_matrix(df_labeled_data)
else:
    print(f"{ANSI['R']}Cannot display correlation matrix: No labeled data available{ANSI['W']}")