In [23]:
import pandas as pd
import pydicom
import os 
import numpy as np
import warnings
import math
import random
from joblib import Parallel, delayed
import multiprocessing as mp
from pathlib import Path
# Names of the label columns written to the classification CSVs.
# Order matters: create_datafames_for builds its label vector by position and treats the
# final entry ('Aneurysm Present') as a summary flag that is set whenever any of the
# preceding location labels is set.
LABEL_COLS = [
    'Left Infraclinoid Internal Carotid Artery',
    'Right Infraclinoid Internal Carotid Artery',
    'Left Supraclinoid Internal Carotid Artery',
    'Right Supraclinoid Internal Carotid Artery',
    'Left Middle Cerebral Artery',
    'Right Middle Cerebral Artery',
    'Anterior Communicating Artery',
    'Left Anterior Cerebral Artery',
    'Right Anterior Cerebral Artery',
    'Left Posterior Communicating Artery',
    'Right Posterior Communicating Artery',
    'Basilar Tip',
    'Other Posterior Circulation',
    'Aneurysm Present',
]
# Filesystem layout; every path is relative to the notebook's working directory.
WORK_DIR = Path("./")  # not referenced by the cells visible in this notebook
DATA_DIR = Path("./data")  # output root: the export functions create test/, train/ and validation/ below it
SERIES_DIR = Path("./series")  # input root: one sub-folder per SeriesInstanceUID holding .dcm files
PROCESSING_DIR = Path ("./processing_csvs")  # location of the pre-processed *_proc.csv tables

In [24]:
# Load the three pre-processed metadata tables.
df_localizers = pd.read_csv(PROCESSING_DIR / "train_localizers_proc.csv")
df_train = pd.read_csv(PROCESSING_DIR / "train_proc.csv")
df_dimensions = pd.read_csv(PROCESSING_DIR / "dimensions_proc.csv")

print("Change in dataset sizes:")
print(len(df_localizers), len(df_train), len(df_dimensions))

# (SeriesInstanceUID, SOPInstanceUID) pairs whose files are not present in the provided data.
to_remove = [
    ("1.2.826.0.1.3680043.8.498.11145695452143851764832708867797988068", "1.2.826.0.1.3680043.8.498.11359680660692538603323710088085312565"),
    ("1.2.826.0.1.3680043.8.498.35204126697881966597435252550544407444", "1.2.826.0.1.3680043.8.498.50473067775982707701946022117324201859"),
]

# Drop the missing data: only the exact slice is removed from the localizers, whereas the
# whole series is removed from the train and dimensions tables.
for series_uid, sop_uid in to_remove:
    is_missing_slice = (
        (df_localizers["SeriesInstanceUID"] == series_uid)
        & (df_localizers["SOPInstanceUID"] == sop_uid)
    )
    df_localizers = df_localizers[~is_missing_slice]
    df_train = df_train[df_train["SeriesInstanceUID"] != series_uid]
    df_dimensions = df_dimensions[df_dimensions["SeriesInstanceUID"] != series_uid]

# Left joins keep every remaining training series, with or without localizer rows.
df_joined = df_train.merge(df_localizers, on='SeriesInstanceUID', how='left')
df_ultimate = df_joined.merge(df_dimensions, on="SeriesInstanceUID", how="left")
print(len(df_localizers), len(df_train), len(df_dimensions))

Change in dataset sizes:
2251 4348 4402
2249 4346 4400


In [25]:
# The test set is every series whose 'Shape' string contains exactly two commas,
# reduced to one row per series.
# Exploration notes kept from the original cell:
#   - grouping by "Depth" showed a depth of 1 everywhere, i.e. a single DICOM file per series;
#   - the same (SeriesInstanceUID, SOPInstanceUID) can repeat for different locations and
#     coordinates, hence the de-duplication on SeriesInstanceUID.
has_two_commas = df_ultimate['Shape'].str.count(',') == 2
test_set = df_ultimate.loc[has_two_commas].drop_duplicates(subset='SeriesInstanceUID')

# SINGLE THREAD
def process_dicom_files(series_root_path, output_path):
    """Sequentially export the pixel data of every test series to ``.npy``.

    Iterates over the ``SeriesInstanceUID`` column of the module-level ``test_set``.
    A series is exported only when ``<series_root_path>/<uid>`` exists and holds exactly
    one ``.dcm`` file; the array is written to ``<output_path>/test/<uid>/<name>.npy``.
    Progress and skip reasons are printed. Returns ``None``.
    """
    test_folder = os.path.join(output_path, 'test')
    os.makedirs(test_folder, exist_ok=True)

    for series_instance_uid in test_set['SeriesInstanceUID']:
        uid_text = str(series_instance_uid)
        source_folder = os.path.join(series_root_path, uid_text)

        if os.path.exists(source_folder):
            dcm_files = [name for name in os.listdir(source_folder) if name.endswith('.dcm')]
            if len(dcm_files) == 1:
                dcm_name = dcm_files[0]
                pixel_array = pydicom.dcmread(os.path.join(source_folder, dcm_name)).pixel_array

                # One output folder per series, mirroring the input layout.
                target_folder = os.path.join(test_folder, uid_text)
                os.makedirs(target_folder, exist_ok=True)
                output_file_path = os.path.join(target_folder, dcm_name.replace('.dcm', '.npy'))
                np.save(output_file_path, pixel_array)
                print(f"Processed series UID {series_instance_uid}, saved pixel data to {output_file_path}")
            else:
                print(f"Series UID {series_instance_uid} has {len(dcm_files)} DICOM files instead of one.")
        else:
            print(f"Series folder for UID {series_instance_uid} does not exist.")


In [26]:

def process_single_test_series(series_instance_uid, series_root_path, output_path):
    """Export the single DICOM slice of one test series to ``.npy``.

    Returns an empty string on success, or a newline-terminated message explaining why
    the series was skipped (folder missing, or not exactly one ``.dcm`` file).
    The array is written to ``<output_path>/test/<uid>/<name>.npy``.
    """
    uid_text = str(series_instance_uid)
    source_folder = os.path.join(series_root_path, uid_text)
    if not os.path.exists(source_folder):
        return f"Series folder for UID {series_instance_uid} does not exist.\n"

    dcm_files = [name for name in os.listdir(source_folder) if name.endswith('.dcm')]
    if len(dcm_files) != 1:
        return f"Series UID {series_instance_uid} has {len(dcm_files)} DICOM files instead of one.\n"

    dcm_name = dcm_files[0]
    pixel_array = pydicom.dcmread(os.path.join(source_folder, dcm_name)).pixel_array

    # Every series writes into its own folder, so parallel workers never share a target.
    target_folder = os.path.join(output_path, 'test', uid_text)
    os.makedirs(target_folder, exist_ok=True)
    np.save(os.path.join(target_folder, dcm_name.replace('.dcm', '.npy')), pixel_array)

    return ""

def process_dicom_files_parallel(series_root_path, output_path, n_jobs=-1):
    """
    Export every test series to ``.npy`` using joblib workers.

    The series are taken from the module-level ``test_set``; each one is handled by
    ``process_single_test_series``. Skip messages returned by the workers are printed
    after all jobs have finished.
    n_jobs: number of parallel jobs (-1 uses all available cores)
    """
    os.makedirs(os.path.join(output_path, 'test'), exist_ok=True)

    series_ids = test_set['SeriesInstanceUID'].tolist()
    print(f"Processing {len(series_ids)} series...")

    worker = delayed(process_single_test_series)
    jobs = (worker(series_id, series_root_path, output_path) for series_id in series_ids)
    results = Parallel(n_jobs=n_jobs, verbose=1)(jobs)

    # Workers return "" on success, so only skip messages survive the filter.
    for message in filter(None, results):
        print(message, end="")

    print("Processing complete!")

In [169]:
# UNCOMMENT IF YOU WANT TO RELOAD THE DATA
# process_dicom_files_parallel(str(SERIES_DIR), str(DATA_DIR))
def find_sop_instance_id(series_instance_id, base_dir):
    """Return the SOPInstanceUID of the exported slice of a series.

    Looks in ``<base_dir>/<series_instance_id>`` for ``.npy`` files and returns the name
    of the first one (in sorted order) without its ``.npy`` extension.

    Args:
        series_instance_id: name of the series folder below ``base_dir``.
        base_dir: directory containing one folder per exported series.

    Returns:
        The file stem as a string, or ``None`` when the series folder does not exist or
        contains no ``.npy`` file.
    """
    series_path = os.path.join(base_dir, series_instance_id)
    if not os.path.isdir(series_path):
        return None

    suffix = '.npy'
    # Sorted so the result does not depend on the arbitrary order of os.listdir, and
    # restricted to .npy files so stray files in the folder are never reported as a UID.
    npy_files = sorted(name for name in os.listdir(series_path) if name.endswith(suffix))
    if not npy_files:
        return None

    # Slice the extension off. str.rstrip('.npy') would strip any trailing run of the
    # characters '.', 'n', 'p', 'y' and can therefore eat into the name itself.
    return npy_files[0][:-len(suffix)]

def update_dataframe_with_sop_instance_id(df, base_dir):
    """Fill ``df['SOPInstanceUID']`` from the exported files and return ``df``.

    For each ``SeriesInstanceUID`` the value is whatever ``find_sop_instance_id`` reports
    for that series below ``base_dir``. The frame is modified in place; the same object
    is returned for convenience.
    """
    series_uids = df['SeriesInstanceUID']
    df['SOPInstanceUID'] = series_uids.apply(
        lambda series_uid: find_sop_instance_id(series_uid, base_dir=base_dir)
    )
    return df

# Record, for every test series, the SOPInstanceUID of the slice that was exported to .npy
# and persist the resulting table. The lookup folder is derived from DATA_DIR (the root the
# export step writes to) instead of repeating the './data/test' literal, so the two cannot
# drift apart when DATA_DIR is changed.
df = update_dataframe_with_sop_instance_id(test_set, str(DATA_DIR / 'test'))
df.to_csv('./test.csv', index=False)

In [156]:
# Positive set v1: every localizer row whose series is not part of the test set.
# (A series can appear several times here: one row per annotated location.)
test_set_ids = set(test_set['SeriesInstanceUID'])
belongs_to_test_set = df_localizers['SeriesInstanceUID'].isin(test_set_ids)
positive_set = df_localizers[~belongs_to_test_set]
positive_set_ids = set(positive_set['SeriesInstanceUID'])

# One row per annotated slice; the original run reported 2099 positive slices.
positive_set.groupby(["SeriesInstanceUID", "SOPInstanceUID"]).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,coordinates,location
SeriesInstanceUID,SOPInstanceUID,Unnamed: 2_level_1,Unnamed: 3_level_1
1.2.826.0.1.3680043.8.498.10005158603912009425635473100344077317,1.2.826.0.1.3680043.8.498.10775329348174902199350466348663848346,1,1
1.2.826.0.1.3680043.8.498.10022796280698534221758473208024838831,1.2.826.0.1.3680043.8.498.53868409774237283281776807176852774246,1,1
1.2.826.0.1.3680043.8.498.10023411164590664678534044036963716636,1.2.826.0.1.3680043.8.498.24186535344744886473554579401056227253,1,1
1.2.826.0.1.3680043.8.498.10030095840917973694487307992374923817,1.2.826.0.1.3680043.8.498.75217084841854214544099244823406151875,1,1
1.2.826.0.1.3680043.8.498.10034081836061566510187499603024895557,1.2.826.0.1.3680043.8.498.71237104731452368587327801789352569583,1,1
...,...,...,...
1.2.826.0.1.3680043.8.498.99887675554378211308175946117895608384,1.2.826.0.1.3680043.8.498.10885430363476327277192154022897733247,1,1
1.2.826.0.1.3680043.8.498.99887675554378211308175946117895608384,1.2.826.0.1.3680043.8.498.75978746530527925899354153686225196613,1,1
1.2.826.0.1.3680043.8.498.99892390884723813599532075083872271516,1.2.826.0.1.3680043.8.498.12398549862508001109149426855485142650,2,2
1.2.826.0.1.3680043.8.498.99892390884723813599532075083872271516,1.2.826.0.1.3680043.8.498.21598979799967012280125410147654260304,2,2


In [157]:

def process_single_positive_series(series_instance_uid, series_root_path, output_folder, df_ultimate):
    """Export every annotated slice of one positive series to ``.npy``.

    Args:
        series_instance_uid: UID of the series; also the name of its folder below
            ``series_root_path``.
        series_root_path: directory containing one folder of ``.dcm`` files per series.
        output_folder: target directory; slices are written to
            ``<output_folder>/<uid>/<sop>.npy``. A folder whose last path component is
            ``validation`` marks the samples as ``'val'``, anything else as ``'train'``.
        df_ultimate: table with ``SeriesInstanceUID`` and ``SOPInstanceUID`` columns that
            lists the annotated slices of each series.

    Returns:
        A dict with
        ``positions_sampled``: relative position of every exported slice within its series,
        ``(InstanceNumber - min) / (max - min)``;
        ``validation_train_split``: ``(series_uid, sop, position, 'train' | 'val')`` tuples;
        ``processed_count``: number of slices written;
        ``warnings``: human-readable messages for everything that was skipped.
        Problems with individual files are reported through ``warnings``, never raised.
    """
    series_folder_path = os.path.join(series_root_path, str(series_instance_uid))

    results = {
        'positions_sampled': [],
        'validation_train_split': [],
        'processed_count': 0,
        'warnings': []
    }

    if not os.path.exists(series_folder_path):
        results['warnings'].append(f"Series folder for UID {series_instance_uid} does not exist.")
        return results

    sop_instances = set(df_ultimate[df_ultimate["SeriesInstanceUID"]==series_instance_uid]["SOPInstanceUID"])

    dcm_file_paths = [os.path.join(series_folder_path, f) for f in os.listdir(series_folder_path) if f.endswith('.dcm')]

    if len(dcm_file_paths) == 0:
        results['warnings'].append(f"No DICOM files found in series folder: {series_folder_path}")
        return results

    # Read only the headers to learn the range of instance numbers in this series.
    dcm_instance_numbers = []
    for dcm in dcm_file_paths:
        ds = pydicom.dcmread(dcm, stop_before_pixels=True)
        # getattr + None check: a tag that is present but empty must not reach max()/min().
        header_instance_number = getattr(ds, 'InstanceNumber', None)
        if header_instance_number is not None:
            dcm_instance_numbers.append(header_instance_number)
        else:
            results['warnings'].append(f"No instance number found in DICOM file: {dcm} in series {series_instance_uid}")
    if not dcm_instance_numbers:
        results['warnings'].append(f"No valid instance numbers found in DICOM files for series {series_instance_uid}")
        maximum = len(dcm_file_paths)  # Fallback if no instance numbers found
        minimum = 0
    else:
        maximum = max(dcm_instance_numbers)
        minimum = min(dcm_instance_numbers)
    instance_span = maximum - minimum

    # Decide the split from the final path component only; a substring test on the whole
    # path would mislabel training data whenever a parent directory contains 'validation'.
    is_validation_folder = os.path.basename(os.path.normpath(output_folder)) == 'validation'
    output_type = 'val' if is_validation_folder else 'train'

    for sop in sop_instances:
        dcm_file_path = os.path.join(series_folder_path, f"{sop}.dcm")
        if not os.path.exists(dcm_file_path):
            results['warnings'].append(f"No DICOM file found {dcm_file_path}.")
            continue

        try:
            dcm_data = pydicom.dcmread(dcm_file_path)
            pixel_array = dcm_data.pixel_array
            instance_number = dcm_data.InstanceNumber

            # A series whose slices all share one instance number has no extent. Define the
            # position as 0.0 instead of dividing by zero, which previously raised inside
            # this try block and silently dropped the annotated slice.
            if instance_span:
                sample_position = (instance_number - minimum) / instance_span
            else:
                sample_position = 0.0

            output_series_folder = os.path.join(output_folder, str(series_instance_uid))
            os.makedirs(output_series_folder, exist_ok=True)
            output_file_path = os.path.join(output_series_folder, f"{sop}.npy")
            np.save(output_file_path, pixel_array)

            results['positions_sampled'].append(sample_position)
            results['validation_train_split'].append((series_instance_uid, sop, sample_position, output_type))
            results['processed_count'] += 1

        except Exception as e:
            # Best effort by design: one unreadable slice must not abort the whole series.
            results['warnings'].append(f"Error processing {dcm_file_path}: {str(e)}")

    return results

def process_dicom_files_positives_parallel(series_root_path, output_path, positive_set_ids, df_ultimate, n_jobs=-1, seed=None):
    """
    Export all positive series in parallel, split 80/20 into train and validation.

    The split is made per series (never per slice), so all slices of one series end up
    on the same side.

    Args:
        series_root_path: directory containing one folder of ``.dcm`` files per series.
        output_path: output root; ``train/`` and ``validation/`` are created below it.
        positive_set_ids: collection of SeriesInstanceUIDs to export.
        df_ultimate: table listing the annotated SOPInstanceUIDs of every series.
        n_jobs: number of joblib workers (-1 uses all available cores).
        seed: optional seed for a reproducible split. When given, the ids are sorted
            first so the result does not depend on set iteration order. ``None``
            (default) keeps the previous behaviour of shuffling with the global
            ``random`` state.

    Returns:
        ``(validation_train_split, positions_sampled)`` where the first item maps
        ``'train'`` / ``'val'`` to lists of ``(series_uid, sop, position)`` tuples and the
        second is the flat list of relative slice positions of all exported slices.
    """
    train_folder = os.path.join(output_path, 'train')
    validation_folder = os.path.join(output_path, 'validation')

    os.makedirs(train_folder, exist_ok=True)
    os.makedirs(validation_folder, exist_ok=True)

    # Pre-determine the train/validation split at series level.
    l_positive_set_ids = list(positive_set_ids)
    if seed is None:
        random.shuffle(l_positive_set_ids)
    else:
        l_positive_set_ids.sort()
        random.Random(seed).shuffle(l_positive_set_ids)

    train_count = int(len(l_positive_set_ids) * 0.8)
    train_series = l_positive_set_ids[:train_count]
    val_series = l_positive_set_ids[train_count:]

    print(f"Processing {len(train_series)} training series and {len(val_series)} validation series...")

    # One task per series: (series id, folder that encodes its side of the split).
    tasks = [(series_id, train_folder) for series_id in train_series]
    tasks += [(series_id, validation_folder) for series_id in val_series]

    print("Starting parallel processing...")
    results = Parallel(n_jobs=n_jobs, verbose=1)(
        delayed(process_single_positive_series)(series_id, series_root_path, output_folder, df_ultimate)
        for series_id, output_folder in tasks
    )

    # Merge the per-series results.
    validation_train_split = {'val': [], 'train': []}
    positions_sampled = []
    total_processed = 0

    for result in results:
        positions_sampled.extend(result['positions_sampled'])
        total_processed += result['processed_count']

        for series_id, sop, pos, output_type in result['validation_train_split']:
            validation_train_split[output_type].append((series_id, sop, pos))

        # Worker-side problems are surfaced here, in the parent process.
        for warning in result['warnings']:
            warnings.warn(warning)

    print(f"Processing complete! Processed {total_processed} files total.")
    print(f"Train samples: {len(validation_train_split['train'])}")
    print(f"Validation samples: {len(validation_train_split['val'])}")

    return validation_train_split, positions_sampled



In [158]:
def create_datafames_for(array, df_localizers, df_train, skip_loc=False):
    """Build the classification table and the localizer table for a list of slices.

    Args:
        array: iterable of ``(series_uid, sop_uid, sample_position)`` tuples.
        df_localizers: annotation table with ``SeriesInstanceUID``, ``SOPInstanceUID``,
            ``coordinates`` and ``location`` columns.
        df_train: series table with ``SeriesInstanceUID`` and ``Modality`` columns.
        skip_loc: when true, no localizer rows are collected.

    Returns:
        ``(class_dataframe, locators_dataframe)``. The first has one row per slice with
        its modality and one 0/1 column per entry of ``LABEL_COLS``; the second has one
        row per annotation of each slice (empty when ``skip_loc`` is true).
    """

    def annotations_of(series_uid, sop_uid):
        # All localizer rows that belong to exactly this slice.
        same_series = df_localizers['SeriesInstanceUID'] == series_uid
        same_slice = df_localizers['SOPInstanceUID'] == sop_uid
        return df_localizers[same_series & same_slice]

    def label_vector(series_uid, sop_uid):
        annotated_locations = annotations_of(series_uid, sop_uid)['location'].values
        flags = np.zeros(len(LABEL_COLS))
        flags[np.isin(LABEL_COLS, annotated_locations)] = 1
        # The last label is the summary flag: set as soon as any location is present.
        if np.any(flags[:-1]):
            flags[-1] = 1
        return flags

    class_data, locators_data = [], []

    for series_uid, sop, sample_position in array:
        flags = label_vector(series_uid, sop)
        series_rows = df_train[df_train['SeriesInstanceUID'] == series_uid]
        modality = series_rows['Modality'].iloc[0]

        class_row = {
            'SeriesInstanceUID': series_uid,
            'SOPInstanceUID': sop,
            'Modality': modality
        }
        class_row.update(zip(LABEL_COLS, flags))
        class_data.append(class_row)

        if skip_loc:
            continue

        slice_annotations = annotations_of(series_uid, sop)[['coordinates', 'location']].values
        locators_data.extend(
            {
                'SeriesInstanceUID': series_uid,
                'SOPInstanceUID': sop,
                'Coordinates': coordinate,
                'Coordinates Type': a_type,
                'Sample position': sample_position
            }
            for coordinate, a_type in slice_annotations
        )

    # Materialise both frames once, from the collected records.
    return pd.DataFrame(class_data), pd.DataFrame(locators_data)

In [159]:
# Export the annotated (positive) slices to .npy below DATA_DIR/train and DATA_DIR/validation.
# The series-level split is shuffled randomly on every run and nothing in this notebook removes
# previously exported files, so delete the contents of the train and validation folders before
# re-running this cell; otherwise slices from an earlier split stay on disk.
val_train_split, positions_sampled = process_dicom_files_positives_parallel(str(SERIES_DIR), str(DATA_DIR), positive_set_ids, df_ultimate)

Processing 1376 training series and 344 validation series...
Starting parallel processing...


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 11 concurrent workers.


  warn_and_log(


Processing complete! Processed 2069 files total.
Train samples: 1645
Validation samples: 424


In [None]:
# Build the classification and localizer tables for the exported positive slices.
train_array = val_train_split['train']
val_array = val_train_split['val']
df_training_class_positive, df_training_labels_positive = create_datafames_for(train_array, df_localizers, df_train)
df_validation_class_positive, df_validation_labels_positive = create_datafames_for(val_array, df_localizers, df_train)

# Consistency check: every (series, slice) pair must appear exactly once in the class table
# and must have at least one row in the labels table. Previously the training comparison was
# evaluated and thrown away, so only the validation result was ever displayed.
slice_key = ['SeriesInstanceUID', 'SOPInstanceUID']
training_is_consistent = (
    len(df_training_labels_positive.groupby(slice_key).count())
    == len(df_training_class_positive.groupby(slice_key).count())
    == len(df_training_class_positive)
)
validation_is_consistent = (
    len(df_validation_labels_positive.groupby(slice_key).count())
    == len(df_validation_class_positive.groupby(slice_key).count())
    == len(df_validation_class_positive)
)
# Displayed as the cell result: True only when both splits pass.
training_is_consistent and validation_is_consistent



True

In [161]:
# Negative samples: training series that are neither in the positive set nor in the test set.
negative_set = df_train[~df_train['SeriesInstanceUID'].isin(positive_set_ids)]
negative_set = negative_set[~negative_set['SeriesInstanceUID'].isin(test_set_ids)]

# Some of the remaining series are still flagged 'Aneurysm Present' == 1 even though they are
# not part of the positive set. They cannot serve as negatives, so they are excluded as well.
positives_in_negative_set = set(negative_set[negative_set['Aneurysm Present']==1]['SeriesInstanceUID'])
negative_set = negative_set[~negative_set['SeriesInstanceUID'].isin(positives_in_negative_set)]

# Bring the modality mix of the negatives closer to that of the positives.
# Counts recorded from the original run:
#   negatives before balancing     positives
#   CTA           860              CTA           994
#   MRA           622              MRA           477
#   MRI T2        622              MRI T2        199
#   MRI T1post    228              MRI T1post     78
# Hence 300 negative T2 series and 100 negative T1post series are dropped.
negative_set_t2 = negative_set[negative_set['Modality'] == 'MRI T2']
negative_set_t1post = negative_set[negative_set['Modality'] == 'MRI T1post']

negative_set_t2_ids = set(negative_set_t2['SeriesInstanceUID'])
negative_set_t1post_ids = set(negative_set_t1post['SeriesInstanceUID'])

# sorted() makes the dropped subset reproducible: slicing list(set) depends on the set's
# iteration order, which for strings changes from one interpreter run to the next.
to_remove_t2 = sorted(negative_set_t2_ids)[:300]
to_remove_t1post = sorted(negative_set_t1post_ids)[:100]

negative_set = negative_set[~negative_set['SeriesInstanceUID'].isin(to_remove_t2)]
negative_set = negative_set[~negative_set['SeriesInstanceUID'].isin(to_remove_t1post)]

negative_set_ids = set(negative_set['SeriesInstanceUID'])
# Counts after balancing (original run):
#   CTA           860
#   MRA           622
#   MRI T2        322
#   MRI T1post    128

In [162]:
def rescale_mean_and_std(normalized_mean, normalized_std, x, y):
    """Map a mean and standard deviation given on [0, 1] onto the interval [x, y].

    Returns ``(rescaled_mean, rescaled_std)``: the mean is shifted and stretched, the
    standard deviation is only stretched by the interval width.
    """
    interval_width = y - x
    return x + normalized_mean * interval_width, normalized_std * interval_width

def process_single_series(series_instance_uid, series_root_path, output_path, mean, std_deviation, is_validation=False):
    """Export up to two slices of one series as ``.npy`` files.

    The first slice is the one whose InstanceNumber is closest to a draw from a normal
    distribution; the second is picked uniformly from the slices with a different
    InstanceNumber (skipped when there is none).

    Args:
        series_instance_uid: UID of the series; also its folder name below
            ``series_root_path``.
        series_root_path: directory containing one folder of ``.dcm`` files per series.
        output_path: output root; files go to ``<output_path>/train|validation/<uid>/``.
        mean: mean of the normalised slice position (0 = first slice, 1 = last slice).
        std_deviation: standard deviation of the normalised slice position.
        is_validation: write below ``validation/`` instead of ``train/``.

    Returns:
        ``None`` when the series folder is missing or holds no ``.dcm`` file, otherwise a
        list of ``(series_uid, file_stem, InstanceNumber / max InstanceNumber)`` tuples,
        one per exported slice.
    """
    series_folder_path = os.path.join(series_root_path, str(series_instance_uid))
    if not os.path.exists(series_folder_path):
        return None

    dcm_suffix = '.dcm'
    dcm_names = [f for f in os.listdir(series_folder_path) if f.endswith(dcm_suffix)]
    if not dcm_names:
        return None

    # (path, InstanceNumber, file stem) for every slice, ordered by InstanceNumber.
    dcm_files = []
    for file_name in dcm_names:
        dcm_file_path = os.path.join(series_folder_path, file_name)
        header = pydicom.dcmread(dcm_file_path, stop_before_pixels=True)
        # Slice the extension off: str.rstrip(".dcm") strips any trailing run of the
        # characters '.', 'd', 'c', 'm' and can eat into the name itself.
        dcm_files.append((dcm_file_path, header.InstanceNumber, file_name[:-len(dcm_suffix)]))
    dcm_files.sort(key=lambda x: x[1])

    min_instance = dcm_files[0][1]
    max_instance = dcm_files[-1][1]

    # Each series gets its own output folder, so parallel workers never collide.
    folder_name = 'validation' if is_validation else 'train'
    output_series_folder = os.path.join(output_path, folder_name, str(series_instance_uid))
    os.makedirs(output_series_folder, exist_ok=True)

    results = []

    def export_slice(dcm):
        # Write one slice to disk and record it in ``results``.
        dcm_path, instance_number, stem = dcm
        pixel_array = pydicom.dcmread(dcm_path).pixel_array
        np.save(os.path.join(output_series_folder, f"{stem}.npy"), pixel_array)
        # Guard the degenerate case of a series whose largest InstanceNumber is 0.
        relative_position = instance_number / max_instance if max_instance else 0.0
        results.append((series_instance_uid, stem, relative_position))

    # Sample 1: close to the given distribution. The normalised mean/std are mapped onto the
    # actual InstanceNumber range of this series, because the draw is compared against
    # InstanceNumber values; mapping onto [0, number of files] only matches when the
    # numbering happens to run from 1 to N.
    rescaled_mean, rescaled_std = rescale_mean_and_std(mean, std_deviation, min_instance, max_instance)
    sampled_dcm = np.random.normal(rescaled_mean, rescaled_std)
    closest_dcm = min(dcm_files, key=lambda x: abs(x[1] - sampled_dcm))
    export_slice(closest_dcm)

    # Sample 2: uniformly random among the slices with a different InstanceNumber. Checking
    # the candidate list itself (rather than the file count) avoids calling np.random.choice
    # on an empty list when several files share a single InstanceNumber.
    remaining_dcms = [dcm for dcm in dcm_files if dcm[1] != closest_dcm[1]]
    if remaining_dcms:
        export_slice(remaining_dcms[np.random.choice(len(remaining_dcms))])

    return results

def process_dicom_files_negative_parallel(series_root_path, output_path, previous_samples, negative_set_ids, n_jobs=-1, seed=None):
    """
    Export slices of the negative series in parallel, split 80/20 into train/validation.

    Args:
        series_root_path: directory containing one folder of ``.dcm`` files per series.
        output_path: output root; ``train/`` and ``validation/`` are created below it.
        previous_samples: normalised slice positions of already exported slices; their
            mean and standard deviation steer which slice is picked from each series.
        negative_set_ids: collection of SeriesInstanceUIDs to export.
        n_jobs: number of parallel jobs (-1 uses all available cores).
        seed: optional seed for a reproducible train/validation split. When given, the
            ids are sorted first so the result does not depend on set iteration order.
            ``None`` (default) keeps the previous behaviour of shuffling with the global
            ``random`` state. The per-series slice sampling inside the workers is not
            affected by this seed.

    Returns:
        Dict mapping ``'train'`` and ``'val'`` to lists of the tuples returned by
        ``process_single_series``.
    """
    print("Using joblib for parallel processing...")

    os.makedirs(os.path.join(output_path, 'train'), exist_ok=True)
    os.makedirs(os.path.join(output_path, 'validation'), exist_ok=True)

    mean = np.mean(previous_samples)
    std_deviation = np.std(previous_samples)

    # Pre-determine the train/validation split at series level.
    negative_set_list = list(negative_set_ids)
    if seed is None:
        random.shuffle(negative_set_list)
    else:
        negative_set_list.sort()
        random.Random(seed).shuffle(negative_set_list)
    split_point = int(len(negative_set_list) * 0.8)

    print(f"Processing {len(negative_set_list)} negative series...")
    print(f"Train: {split_point}, Validation: {len(negative_set_list) - split_point}")

    # Everything at or after split_point belongs to the validation side.
    results = Parallel(n_jobs=n_jobs, verbose=1)(
        delayed(process_single_series)(series_id, series_root_path, output_path, mean, std_deviation, index >= split_point)
        for index, series_id in enumerate(negative_set_list)
    )

    # Results come back in task order, so the index identifies the side again.
    validation_train_split = {'val': [], 'train': []}
    for index, result in enumerate(results):
        if result:  # None / empty when the series could not be processed
            split_key = 'val' if index >= split_point else 'train'
            validation_train_split[split_key].extend(result)

    print("Processing complete!")
    print(f"Train samples: {len(validation_train_split['train'])}")
    print(f"Validation samples: {len(validation_train_split['val'])}")

    return validation_train_split

In [163]:
# Export slices of the negative series below DATA_DIR/train and DATA_DIR/validation.
# positions_sampled (relative positions of the exported positive slices) supplies the mean and
# standard deviation that steer which slice is drawn from each negative series.
validation_train_split = process_dicom_files_negative_parallel(str(SERIES_DIR), str(DATA_DIR), positions_sampled, negative_set_ids)

Using joblib for parallel processing...
Processing 1904 negative series...
Train: 1523, Validation: 381


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 11 concurrent workers.


Processing complete!
Train samples: 3046
Validation samples: 762


In [164]:
# Build the classification tables for the exported negative slices.
# NOTE: train_array / val_array are rebound here; from this cell on they hold the negative
# samples, not the positive ones assigned in the earlier cell.
train_array = validation_train_split['train']
val_array = validation_train_split['val']
# skip_loc=True: no localizer rows are collected, so the second return value is an empty
# frame and is discarded.
df_training_class_negative, _ = create_datafames_for(train_array, df_localizers, df_train, skip_loc=True)
df_validation_class_negative, _ = create_datafames_for(val_array, df_localizers, df_train, skip_loc=True)
# The duplicate check for these tables follows in the next cell.


In [166]:
# Duplicate check: True when every (SeriesInstanceUID, SOPInstanceUID) pair occurs exactly once
# in the negative training table (number of groups equals number of rows).
len(df_training_class_negative.groupby(['SeriesInstanceUID', 'SOPInstanceUID']).count()) == len(df_training_class_negative)
# The equivalent check for the validation table is kept here, disabled:
#len(df_validation_class_negative.groupby(['SeriesInstanceUID', 'SOPInstanceUID']).count()) == len(df_validation_class_negative)

True

In [171]:
# Stack positive and negative training rows into one table and persist it.
training_parts = [df_training_class_positive, df_training_class_negative]
df_train_combined = pd.concat(training_parts, ignore_index=True)
df_train_combined.to_csv('train.csv', index=False)
print(f"Combined training dataframe saved with {len(df_train_combined)} rows")

Combined training dataframe saved with 4691 rows


In [170]:
# Stack positive and negative validation rows into one table and persist it.
validation_parts = [df_validation_class_positive, df_validation_class_negative]
df_val_combined = pd.concat(validation_parts, ignore_index=True)
df_val_combined.to_csv('validation.csv', index=False)
print(f"Combined validation dataframe saved with {len(df_val_combined)} rows")

Combined validation dataframe saved with 1186 rows


In [172]:
# Persist the localizer tables of the positive slices (one row per annotation, with its
# coordinates, location type and relative slice position) next to the classification CSVs.
df_validation_labels_positive.to_csv('validation_labels.csv', index=False)
df_training_labels_positive.to_csv('training_labels.csv', index=False)
