<h1 align="center">CSV Preprocessing</h1>

## Global Variables

### Project Specific Variables

In [None]:
# CSV input settings
# Folder that holds the CSV file; joined with CSV_LABELS_FILE to build the import path.
CSV_FOLDER = "../../data/"
# Alternative data location (kept for reference, currently disabled):
# CSV_FOLDER = "/home/pyuser/data/"
# File name of the labeled-data CSV inside CSV_FOLDER.
CSV_LABELS_FILE = "Labeled_Data_RAW_Sample.csv"
CSV_SEPARATOR = ";"  # Field separator passed to the CSV reader, e.g. ',' or '\t'
IMPORT_COLUMNS = []  # Columns to load; an empty list means "import all columns"
CHUNK_SIZE = 50000  # Number of rows read per chunk while importing the CSV

# Folder the ArchiMed files are downloaded to; the DICOM-to-PNG example cell
# also uses it as both its input and its output folder.
DOWNLOAD_PATH = '../../data/Paradise_Images'

### Colors

In [None]:
# ANSI escape codes for colored output.
# Usage: f"{ANSI['R']}message{ANSI['W']}" - a color code switches the color on
# and 'W' switches it off again at the end of the message.
ANSI = {
    'R' : '\033[91m',  # Red
    'G' : '\033[92m',  # Green
    'B' : '\033[94m',  # Blue
    'Y' : '\033[93m',  # Yellow
    'W' : '\033[0m',  # Reset to the terminal's default style (not literally white)
}

## Import CSVs to Dataframe

### CSV Import

In [None]:
import pandas as pd
import os

def import_csv_to_dataframe(file_path, separator=';', columns=None, chunk_size=None):
    """
    Load a CSV file into a single pandas DataFrame.

    Args:
        file_path (str): Location of the CSV file to read
        separator (str): Field delimiter used in the file
        columns (list): Columns to keep; None or an empty list loads every column
        chunk_size (int): Rows to read per chunk; a falsy value reads the file in one go

    Returns:
        pandas.DataFrame: The loaded data, or None when reading failed
        (the error is printed instead of being raised)
    """
    try:
        # An empty selection is treated the same as "no selection"
        selected_columns = None
        if columns and len(columns) > 0:
            selected_columns = columns

        if not chunk_size:
            # Single-shot read of the whole file
            return pd.read_csv(file_path, sep=separator, usecols=selected_columns)

        # Chunked read: collect every piece, then stitch them together with a fresh index
        reader = pd.read_csv(
            file_path,
            sep=separator,
            usecols=selected_columns,
            chunksize=chunk_size,
        )
        pieces = list(reader)
        return pd.concat(pieces, ignore_index=True)
    except Exception as err:
        print(f"{ANSI['R']}Error importing CSV: {err}{ANSI['W']}")
        return None

In [None]:
# Load the labeled data CSV configured in the global variables
csv_path = os.path.join(CSV_FOLDER, CSV_LABELS_FILE)
print(f"{ANSI['B']}Importing labeled data from: {csv_path}{ANSI['W']}")

df_labeled_data = import_csv_to_dataframe(csv_path, CSV_SEPARATOR, IMPORT_COLUMNS, CHUNK_SIZE)

# The loader returns None on failure, so report that case first
if df_labeled_data is None:
    print(f"{ANSI['R']}Failed to import labeled data{ANSI['W']}")
else:
    row_count = len(df_labeled_data)
    print(f"{ANSI['G']}Successfully imported {row_count} rows of labeled data{ANSI['W']}")
    # Preview the first rows
    display(df_labeled_data.head())



## Download the files from ArchiMed

In [None]:
def download_archimed_files(dataframe, download_path, file_id_column='FileID', batch_size=100):
    """
    Downloads files from ArchiMed based on FileIDs in the dataframe.

    Each unique, non-missing FileID is downloaded once and saved in
    download_path under a file named after the (integer) FileID. A failure on
    one file is printed and recorded, and the remaining files are still
    attempted.

    Args:
        dataframe (pandas.DataFrame): DataFrame containing FileIDs
        download_path (str): Path where to save downloaded files (created if missing)
        file_id_column (str): Name of the column containing FileIDs (default: 'FileID')
        batch_size (int): Print a progress line every batch_size processed files;
            a falsy value prints only the final progress line

    Returns:
        list: One entry per successful download, holding whatever
        ``A3_Connector.downloadFile`` returned for that file
        (NOTE(review): presumably the saved file path - confirm against the connector).
        An empty list is returned when file_id_column is not in the dataframe.
    """
    # Imported lazily so the notebook can be loaded without the ArchiMed connector installed
    from ArchiMedConnector.A3_Connector import A3_Connector
    import os

    # Create download directory if it doesn't exist
    os.makedirs(download_path, exist_ok=True)

    # Initialize ArchiMed connector
    a3conn = A3_Connector()

    # Get user info for verification
    user_info = a3conn.getUserInfos()
    print(f"{ANSI['B']}Connected to ArchiMed as:{ANSI['W']} {user_info.get('login', 'Unknown')}")

    # Check if the FileID column exists
    if file_id_column not in dataframe.columns:
        print(f"{ANSI['R']}Error: Column '{file_id_column}' not found in dataframe{ANSI['W']}")
        return []

    # Drop missing IDs before de-duplicating so that the total and the progress
    # percentages only count files that will actually be requested.
    file_ids = dataframe[file_id_column].dropna().unique()
    total_files = len(file_ids)

    print(f"{ANSI['B']}Starting download of {total_files} files to{ANSI['W']} {download_path}")

    downloaded_files = []
    failed_files = []

    for processed, file_id in enumerate(file_ids, start=1):
        try:
            # IDs read from CSV may arrive as floats (e.g. 123.0) or strings
            file_id = int(file_id)

            # Define output path for this file
            file_output_path = os.path.join(download_path, f"{file_id}")

            # Download the file
            result = a3conn.downloadFile(
                file_id,
                asStream=False,
                filePath=file_output_path,
                inWorklist=False
            )

            downloaded_files.append(result)

        except Exception as e:
            failed_files.append(file_id)
            print(f"{ANSI['R']}Error downloading file ID {file_id}: {str(e)}{ANSI['W']}")

        # Report after every processed file, successful or not, so a failure on
        # a batch boundary (or on the last file) does not swallow the progress line.
        if (batch_size and processed % batch_size == 0) or processed == total_files:
            print(f"{ANSI['G']}Progress: {processed}/{total_files} files processed ({(processed / total_files * 100):.1f}%){ANSI['W']}")

    # Summary
    print(f"{ANSI['G']}Download complete: {len(downloaded_files)} files downloaded successfully{ANSI['W']}")
    if failed_files:
        print(f"{ANSI['R']}Failed to download {len(failed_files)} files{ANSI['W']}")

    return downloaded_files

In [None]:
# Download the ArchiMed files referenced by the labeled data
# (skipped when the CSV import above failed and left df_labeled_data as None)
if df_labeled_data is not None:
    print(f"{ANSI['B']}Starting download of ArchiMed files...{ANSI['W']}")
    downloaded_files = download_archimed_files(
        dataframe=df_labeled_data,
        download_path=DOWNLOAD_PATH
    )
    # Report the configured DOWNLOAD_PATH: `download_path` is only a parameter
    # name inside download_archimed_files and does not exist at notebook level.
    print(f"{ANSI['G']}Downloaded {len(downloaded_files)} files to {DOWNLOAD_PATH}{ANSI['W']}")
else:
    print(f"{ANSI['R']}Cannot download files: No labeled data available{ANSI['W']}")

## Convert DICOM files to PNG files

In [None]:
import os
import pydicom
import numpy as np
from PIL import Image
import glob
from tqdm import tqdm

def convert_dicom_to_png(
    import_folder,
    export_folder,
    bit_depth=8,
    create_subfolders=False,
    resize_x=None,
    resize_y=None
):
    """
    Convert all DICOM files in import_folder (including subfolders) to PNG format.

    Files are identified by content rather than by extension: every regular
    file below import_folder is handed to pydicom, and anything that cannot be
    read as DICOM with pixel data is counted as skipped. Each candidate file is
    examined exactly once.

    Parameters:
    -----------
    import_folder : str
        Path to folder containing DICOM files to convert
    export_folder : str
        Path to folder where PNG files will be saved (created if missing)
    bit_depth : int
        Bit depth for output images (8, 12, or 16). Pixel values are scaled so
        that the image maximum maps to 255, 4095 or 65535 respectively;
        12- and 16-bit output is stored as uint16.
    create_subfolders : bool
        If True, save each PNG in a subfolder of export_folder named after the
        file's DICOM StudyDescription, falling back to the name of the file's
        parent folder when that attribute is absent
    resize_x : int or None
        Width to resize images to
    resize_y : int or None
        Height to resize images to. Resizing is applied only when both
        resize_x and resize_y are given.

    Returns:
    --------
    dict
        Summary of conversion process with the keys "successful", "skipped",
        "failed" and "total" (number of candidate files examined)

    Raises:
    -------
    ValueError
        If bit_depth is not 8, 12, or 16
    """
    # Validate bit depth
    if bit_depth not in (8, 12, 16):
        raise ValueError("bit_depth must be 8, 12, or 16")

    # Output value range and storage dtype for each supported bit depth
    scaling = {
        8: (255, np.uint8),
        12: (4095, np.uint16),
        16: (65535, np.uint16),
    }
    max_value, out_dtype = scaling[bit_depth]

    # Create export folder if it doesn't exist
    os.makedirs(export_folder, exist_ok=True)

    # Find all candidate files recursively. A single '**/*' pattern already
    # covers '.dcm', '.DCM' and extension-less files; globbing per extension as
    # well would list (and convert) the same file more than once. Directories
    # are filtered out so they are not reported as skipped files.
    dicom_files = [
        path
        for path in glob.glob(os.path.join(import_folder, '**/*'), recursive=True)
        if os.path.isfile(path)
    ]

    # Initialize counters
    successful = 0
    failed = 0
    skipped = 0

    print(f"{ANSI['B']}Converting {len(dicom_files)} DICOM files to PNG...{ANSI['W']}")

    # Process each candidate file
    for dicom_path in tqdm(dicom_files):
        # Anything that cannot be read as DICOM with pixel data is skipped, not
        # failed. `Exception` (rather than a bare except) keeps Ctrl-C working.
        try:
            ds = pydicom.dcmread(dicom_path)
            pixel_array = ds.pixel_array
        except Exception:
            skipped += 1
            continue

        try:
            # Scale so the brightest pixel maps to the top of the target range.
            # An image whose maximum is not positive is only cast, which also
            # avoids dividing by zero.
            # NOTE(review): negative pixel values are not handled specially here.
            peak = pixel_array.max()
            if peak > 0:
                pixel_array = ((pixel_array / peak) * max_value).astype(out_dtype)
            else:
                pixel_array = pixel_array.astype(out_dtype)

            # Convert to PIL Image
            img = Image.fromarray(pixel_array)

            # Resize if specified
            if resize_x is not None and resize_y is not None:
                img = img.resize((resize_x, resize_y), Image.LANCZOS)

            # Determine output path
            # NOTE: files that share a base name end up at the same output path,
            # so the one processed later overwrites the earlier one.
            base_filename = os.path.splitext(os.path.basename(dicom_path))[0]
            if create_subfolders:
                parent_name = os.path.basename(os.path.dirname(dicom_path))
                exam_code = str(getattr(ds, 'StudyDescription', parent_name))
                target_folder = os.path.join(export_folder, exam_code)
                os.makedirs(target_folder, exist_ok=True)
            else:
                target_folder = export_folder
            output_path = os.path.join(target_folder, f"{base_filename}.png")

            # Save as PNG
            img.save(output_path)
            successful += 1

        except Exception as e:
            print(f"{ANSI['R']}Error converting {dicom_path}: {str(e)}{ANSI['W']}")
            failed += 1

    # Summary
    print(f"{ANSI['G']}Conversion complete:{ANSI['W']}")
    print(f"  - {successful} files successfully converted")
    print(f"  - {skipped} files skipped (not valid DICOM)")
    print(f"  - {failed} files failed to convert")

    return {
        "successful": successful,
        "skipped": skipped,
        "failed": failed,
        "total": len(dicom_files)
    }

In [None]:
# Example usage: read the DICOM files from the download folder and write the
# 8-bit PNGs into that same folder, without per-exam subfolders
convert_dicom_to_png(DOWNLOAD_PATH, DOWNLOAD_PATH, bit_depth=8, create_subfolders=False)