# Import Dependencies and Mount Google Drive

In [None]:
# Standard-library helpers: path handling, file copying and shuffling.
import os
import shutil
import random
# Progress bar shown while the copy loops run.
from tqdm import tqdm
# Colab-only helper: mounts Google Drive so that the dataset paths under
# /content/drive/MyDrive used throughout this notebook are reachable.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Locate the source data and define the new folders

In [33]:
# Source folders holding the NIfTI volumes and their segmentation labels.
images_dir = '/content/drive/MyDrive/Machine-Learning-Biomedicine/Pancreatic-Cancer/pancreas/nifti_files/images'
labels_dir = '/content/drive/MyDrive/Machine-Learning-Biomedicine/Pancreatic-Cancer/pancreas/nifti_files/labels'

# Full paths of every '.nii.gz' file in each folder. Despite the 'train_' prefix
# these lists cover ALL files found, since nothing has been split at this point.
# os.listdir returns entries in arbitrary order, so both listings are sorted:
# the result is deterministic, and index i of the two lists refers to the same
# case whenever image and label filenames match (as the pairing code later in
# this notebook assumes).
train_images = sorted(os.path.join(images_dir, image_name) for image_name in os.listdir(images_dir) if image_name.endswith('.nii.gz'))
train_labels = sorted(os.path.join(labels_dir, image_name) for image_name in os.listdir(labels_dir) if image_name.endswith('.nii.gz'))

In [34]:
# Report how many image and label files were found, one count per line
# (images first, then labels).
for found_paths in (train_images, train_labels):
    print(len(found_paths))

278
278


In [None]:
# List the contents of the source images folder.
!ls /content/drive/MyDrive/Machine-Learning-Biomedicine/Pancreatic-Cancer/pancreas/nifti_files/images

In [None]:
# List the contents of the source labels folder.
!ls /content/drive/MyDrive/Machine-Learning-Biomedicine/Pancreatic-Cancer/pancreas/nifti_files/labels

In [None]:
# Root folder that receives the train/test split; each split gets one folder for
# the volumes and one for the matching segmentations.
split_root = '/content/drive/MyDrive/Machine-Learning-Biomedicine/Pancreatic-Cancer/pancreas/Data_Train_Test'

train_volumes_dir = f'{split_root}/TrainVolumes'
train_segmentation_dir = f'{split_root}/TrainSegmentation'
test_volumes_dir = f'{split_root}/TestVolumes'
test_segmentation_dir = f'{split_root}/TestSegmentation'

In [None]:
# Create the four destination folders (train first, then test). exist_ok=True
# makes this cell safe to re-run when the folders are already there.
for split_dir in (
    train_volumes_dir,
    train_segmentation_dir,
    test_volumes_dir,
    test_segmentation_dir,
):
    os.makedirs(split_dir, exist_ok=True)

# Split into Train and Test (Ratio is 8:2)

In [None]:
# Get a list of all the image and label files.
# os.listdir returns names in arbitrary order, so the image list is sorted to
# make the seeded shuffle below reproducible across runs. It is also restricted
# to '.nii.gz' files so that any other entry in the folder cannot break the
# filename parsing in the loop.
image_files = sorted(name for name in os.listdir(images_dir) if name.endswith('.nii.gz'))
label_files = os.listdir(labels_dir)
available_labels = set(label_files)  # set for O(1) membership checks

# Pair the image and label files based on their filenames
image_label_pairs = []
missing_labels = []
for image_file in image_files:
    # Extract the pancreas number and group number from the filename
    # (expects names of the form 'pancreas_<number>_<group>.nii.gz')
    pancreas_num, group_num = map(int, image_file.replace('pancreas_', '').replace('.nii.gz', '').split('_'))
    # Build the name of the corresponding label file
    label_file = f'pancreas_{pancreas_num:03d}_{group_num}.nii.gz'
    # Record images whose label is absent instead of failing later mid-copy
    if label_file not in available_labels:
        missing_labels.append((image_file, label_file))
        continue
    # Add the image and label pair to the list
    image_label_pairs.append((image_file, label_file))

# Fail early, before any file is copied, if some image has no label on disk
if missing_labels:
    raise FileNotFoundError(
        f'{len(missing_labels)} image(s) have no matching label in {labels_dir}; '
        f'first (image, expected label) pairs: {missing_labels[:5]}'
    )

# Shuffle the image and label pairs (fixed seed + sorted input => same split every run)
random.seed(0)
random.shuffle(image_label_pairs)

# Set the split ratio for training and testing data
split_ratio = 0.8

In [18]:
# Calculate the number of training and testing samples
num_samples = len(image_label_pairs)
num_train_samples = int(num_samples * split_ratio)
num_test_samples = num_samples - num_train_samples

# Split the image and label pairs into training and testing data
train_image_label_pairs = image_label_pairs[:num_train_samples]
test_image_label_pairs = image_label_pairs[num_train_samples:]

# The two slices must partition the shuffled pairs exactly
assert len(train_image_label_pairs) == num_train_samples
assert len(test_image_label_pairs) == num_test_samples

# One entry per split: (name used in the progress message, pairs to copy,
# destination for volumes, destination for segmentations)
copy_plan = [
    ('training', train_image_label_pairs, train_volumes_dir, train_segmentation_dir),
    ('testing', test_image_label_pairs, test_volumes_dir, test_segmentation_dir),
]

# Refuse to copy on top of an older, different split: files left over from a
# previous run would stay in place and could put the same case in both the
# training and the testing folders. Nothing is copied until every destination
# has been checked.
for split_name, split_pairs, volumes_dir, segmentation_dir in copy_plan:
    expected_by_dir = {
        volumes_dir: {image_file for image_file, _ in split_pairs},
        segmentation_dir: {label_file for _, label_file in split_pairs},
    }
    for dest_dir, expected_files in expected_by_dir.items():
        stale_files = sorted(
            name for name in os.listdir(dest_dir)
            if name.endswith('.nii.gz') and name not in expected_files
        )
        if stale_files:
            raise RuntimeError(
                f'{dest_dir} already contains {len(stale_files)} file(s) that are not part of '
                f'the current {split_name} split (e.g. {stale_files[:3]}); clear the folder '
                'before copying to avoid train/test leakage.'
            )

# Copy the training and testing data to their respective directories
for split_name, split_pairs, volumes_dir, segmentation_dir in copy_plan:
    print(f'Copying {split_name} data...')
    for image_file, label_file in tqdm(split_pairs):
        shutil.copy(os.path.join(images_dir, image_file), os.path.join(volumes_dir, image_file))
        shutil.copy(os.path.join(labels_dir, label_file), os.path.join(segmentation_dir, label_file))

print('Data splitting complete!')

Copying training data...


100%|██████████| 222/222 [01:36<00:00,  2.31it/s]


Copying testing data...


100%|██████████| 56/56 [00:26<00:00,  2.09it/s]

Data splitting complete!





# Confirm Contents of folders

In [None]:
# List the files copied into the training volumes folder.
!ls /content/drive/MyDrive/Machine-Learning-Biomedicine/Pancreatic-Cancer/pancreas/Data_Train_Test/TrainVolumes

In [None]:
# List the files copied into the training segmentation folder.
!ls /content/drive/MyDrive/Machine-Learning-Biomedicine/Pancreatic-Cancer/pancreas/Data_Train_Test/TrainSegmentation

In [None]:
# List the files copied into the testing volumes folder.
!ls /content/drive/MyDrive/Machine-Learning-Biomedicine/Pancreatic-Cancer/pancreas/Data_Train_Test/TestVolumes

In [None]:
# List the files copied into the testing segmentation folder.
!ls /content/drive/MyDrive/Machine-Learning-Biomedicine/Pancreatic-Cancer/pancreas/Data_Train_Test/TestSegmentation

# Sanity Check

In [24]:
# NOTE(review): filecmp is not used in this cell; the import is kept in case
# other cells rely on it.
import filecmp

# Every volume must have a segmentation with the same filename, and vice versa.
# Both splits are checked in one run. The Test pair comes last so that dir1,
# dir2, files1 and files2 end up holding the Test values once the loop finishes.
directory_pairs = [
    (
        '/content/drive/MyDrive/Machine-Learning-Biomedicine/Pancreatic-Cancer/pancreas/Data_Train_Test/TrainVolumes',
        '/content/drive/MyDrive/Machine-Learning-Biomedicine/Pancreatic-Cancer/pancreas/Data_Train_Test/TrainSegmentation',
    ),
    (
        '/content/drive/MyDrive/Machine-Learning-Biomedicine/Pancreatic-Cancer/pancreas/Data_Train_Test/TestVolumes',
        '/content/drive/MyDrive/Machine-Learning-Biomedicine/Pancreatic-Cancer/pancreas/Data_Train_Test/TestSegmentation',
    ),
]

for dir1, dir2 in directory_pairs:
    # Get a list of all the files in each directory
    files1 = set(os.listdir(dir1))
    files2 = set(os.listdir(dir2))

    # Find the files that exist in dir1 but not in dir2
    in_dir1_not_dir2 = files1 - files2
    for file in sorted(in_dir1_not_dir2):
        print(f'{file} not in {dir2}')

    # Find the files that exist in dir2 but not in dir1
    in_dir2_not_dir1 = files2 - files1
    for file in sorted(in_dir2_not_dir1):
        print(f'{file} not in {dir1}')

    # Say so explicitly when the pair matches; silence alone is ambiguous
    if not in_dir1_not_dir2 and not in_dir2_not_dir1:
        print(f'OK: {dir1} and {dir2} contain the same {len(files1)} filenames')