# Installing Dependencies

In [None]:
# Mount Google Drive at /content/drive; every dataset path used below lives under that mount point.
# The google.colab module is only importable inside a Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
%%capture
# Install the third-party packages used by this notebook; %%capture hides pip's output.
# NOTE(review): no versions are pinned, so a fresh runtime may resolve different releases.
# NOTE(review): the notebook imports the standard-library `glob` and `shutil`, so `glob2`
# and `pytest-shutil` look unnecessary - confirm before removing them.
!pip install Monai
!pip install matplotlib
!pip install numpy
!pip install tqdm
!pip install glob2
!pip install dicom2nifti
!pip install pytest-shutil
!pip install nibabel

In [None]:
import torch
import nibabel as nib
import matplotlib.pyplot as plt
import os
from tqdm import tqdm
import pandas as pd

# Checking for GPU & Setting up images

In [None]:
# Ask PyTorch whether a CUDA device is usable and report where training will run
train_on_gpu = torch.cuda.is_available()

print(
    'CUDA is available. Training on GPU ...'
    if train_on_gpu
    else 'CUDA is not available. Training on CPU ...'
)

CUDA is available. Training on GPU ...


In [None]:
# Collect the full path of every .nii.gz file found directly inside the imagesTr folder.
# os.listdir returns names in arbitrary order, so the list is unsorted at this point.
# NOTE(review): images are read from `panc/` while the labels cell reads from
# `Task07_Pancreas/` - confirm both folders belong to the same copy of the dataset.
data = '/content/drive/MyDrive/Machine-Learning-Biomedicine/Pancreatic-Cancer/panc/imagesTr/'
train_images = [os.path.join(data, image_name) for image_name in os.listdir(data) if image_name.endswith('.nii.gz')]

In [None]:
# Collect the full path of every .nii.gz file found directly inside the labelsTr folder.
# os.listdir returns names in arbitrary order, so the list is unsorted at this point.
# NOTE(review): labels are read from `Task07_Pancreas/` while the images cell reads from
# `panc/` - confirm both folders belong to the same copy of the dataset.
labels = '/content/drive/MyDrive/Machine-Learning-Biomedicine/Pancreatic-Cancer/Task07_Pancreas/labelsTr/'
train_labels = [os.path.join(labels, image_name) for image_name in os.listdir(labels) if image_name.endswith('.nii.gz')]

In [None]:
# Compare image and label file names to find training images that have no label.
# Requires `train_images` and `train_labels` from the two directory-listing cells above.
train_images_set = {os.path.basename(image_path) for image_path in train_images}
train_labels_set = {os.path.basename(label_path) for label_path in train_labels}

# Names of images without a label of the same file name (sorted for stable output)
missing_label = sorted(train_images_set - train_labels_set)

# The list is empty when every image is labelled, so indexing missing_label[0]
# unconditionally would raise IndexError; report every entry instead.
if missing_label:
    for label_name in missing_label:
        print(f"Missing Label: {label_name}")
else:
    print("No missing labels: every image has a matching label file.")

NameError: ignored

In [None]:
# Print how many image paths and how many label paths were collected (one count per line)
for collected_paths in (train_images, train_labels):
    print(len(collected_paths))

281
281


In [None]:
def _by_file_name(path):
    """Sort key: the last '/'-separated component of a path."""
    return path.rsplit('/', 1)[-1]


# Order both lists by file name, then store them side by side so that row i
# holds the i-th image path next to the i-th label path.
train_images = sorted(train_images, key=_by_file_name)
train_labels = sorted(train_labels, key=_by_file_name)
train_df = pd.DataFrame({'image_path': train_images, 'label_path': train_labels})

# Check for invalid files (known issue: this section does not work yet)

In [None]:
# Sanity-check the first image/label pair of train_df before loading it with nibabel
first_pair = train_df.iloc[0]
image_path = first_pair.image_path
label_path = first_pair.label_path

# A file is loaded only when its name is not hidden (no leading '.') and it ends in .nii.gz;
# otherwise it is reported and the corresponding array is left undefined.
image_ok = image_path.endswith('.nii.gz') and not os.path.basename(image_path).startswith('.')
if image_ok:
    img = nib.load(image_path).get_fdata()
else:
    print(f"Invalid image file: {image_path}")

label_ok = label_path.endswith('.nii.gz') and not os.path.basename(label_path).startswith('.')
if label_ok:
    label = nib.load(label_path).get_fdata()
else:
    print(f"Invalid label file: {label_path}")

In [None]:
def move_to_invalid_folder(file_path, invalid_folder):
    """Move one file into ``invalid_folder`` and print a confirmation.

    Parameters
    ----------
    file_path : str
        Path of the file to move.
    invalid_folder : str
        Directory that receives the file; it is created when it does not exist.

    Returns
    -------
    str
        Path of the file after the move.
    """
    # Imported locally: this cell runs before the notebook's later `import shutil`,
    # so the module name is not defined yet on a fresh kernel.
    import shutil

    file_name = os.path.basename(file_path)
    destination_path = os.path.join(invalid_folder, file_name)

    # Make sure the destination directory exists before moving into it
    os.makedirs(invalid_folder, exist_ok=True)
    shutil.move(file_path, destination_path)
    print(f"Moved {file_name} to {invalid_folder}")
    return destination_path

def check_and_move_invalid_files(df, target_directory):
    """Move invalid image and label files out of the dataset.

    A file is treated as invalid when its name starts with '.' (hidden file) or does
    not end in '.nii.gz'. Paths that do not exist on disk cannot be moved, so they are
    reported and skipped instead of being passed to ``move_to_invalid_folder``.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with the columns 'image_path' and 'label_path'.
    target_directory : str
        Directory in which the 'invalid_images' and 'invalid_labels' folders are
        created (if needed) and filled.
    """
    invalid_images_folder = os.path.join(target_directory, "invalid_images")
    invalid_labels_folder = os.path.join(target_directory, "invalid_labels")

    # Create the folders if they don't exist
    os.makedirs(invalid_images_folder, exist_ok=True)
    os.makedirs(invalid_labels_folder, exist_ok=True)

    # Each path column is checked with the same rule and has its own destination folder
    column_destinations = (
        ('image_path', invalid_images_folder),
        ('label_path', invalid_labels_folder),
    )

    for _, row in df.iterrows():
        for column, invalid_folder in column_destinations:
            file_path = row[column]

            # A missing file cannot be moved; trying to would raise FileNotFoundError
            if not os.path.exists(file_path):
                print(f"Skipping missing file: {file_path}")
                continue

            file_name = os.path.basename(file_path)
            if file_name.startswith('.') or not file_name.endswith('.nii.gz'):
                move_to_invalid_folder(file_path, invalid_folder)

# Re-list the image and label files from the `pancreas/` copy of the dataset.
# NOTE(review): this rebinds `data`, `train_images` and `train_labels`, which earlier
# cells defined from other folders - confirm that overwriting them here is intended.
data = '/content/drive/MyDrive/Machine-Learning-Biomedicine/Pancreatic-Cancer/pancreas/'
train_images = sorted(os.path.join(data, 'imagesTr', image_name) for image_name in os.listdir(os.path.join(data, 'imagesTr')) if image_name.endswith('.nii.gz'))
train_labels = sorted(os.path.join(data, 'labelsTr', label_name) for label_name in os.listdir(os.path.join(data, 'labelsTr')) if label_name.endswith('.nii.gz'))

# Zip the two lists together to create pairs of (image_path, label_path).
# Both lists are sorted first because os.listdir returns names in arbitrary order,
# which would otherwise pair unrelated images and labels.
train_data = list(zip(train_images, train_labels))

# NOTE(review): the check below runs on `train_df` built earlier in the notebook,
# not on the lists created above - confirm which of the two is intended.
invalid_files_directory = "/content/drive/MyDrive/Machine-Learning-Biomedicine/Pancreatic-Cancer/invalid_files"

print("Checking for invalid files and moving them...")

check_and_move_invalid_files(train_df, invalid_files_directory)

print("All invalid files have been checked and moved.")


FileNotFoundError: ignored

# Plotting the Image

In [None]:
import ipywidgets as widgets
from IPython.display import display
from matplotlib.colors import from_levels_and_colors

In [None]:
%matplotlib inline
# Load the image and label volumes referenced by row 50 of train_df as arrays.
# plot_slice and the slider below read `img`, `label` and `my_cmap` as globals.
img = nib.load(train_df.iloc[50].image_path).get_fdata()
label = nib.load(train_df.iloc[50].label_path).get_fdata()

# Create a custom colormap
# Colors are RGBA: fully transparent, opaque green, opaque purple. With extend='max'
# the third color is used for values above the last level. The norm that
# from_levels_and_colors also returns is discarded here.
my_cmap, _ = from_levels_and_colors(levels=[0, 1, 2], colors=[[0., 0., 0., 0.],  [0., 1., 0., 1], [0.8, 0., 0.8, 1]], extend='max')

def plot_slice(slice_number):
    """Draw slice `slice_number` (third axis) of the global `img` with the matching
    slice of the global `label` overlaid using `my_cmap`."""
    _, ax = plt.subplots(figsize=(8, 8))

    # Grayscale image first, integer label mask on top
    ax.imshow(img[:, :, slice_number], cmap='gray')
    mask_slice = label[:, :, slice_number].astype('int')
    ax.imshow(mask_slice, cmap=my_cmap, vmin=0., vmax=1.)

    # The title counts slices from 1 while the index counts from 0
    ax.set_title(f"Slice {slice_number + 1}")
    ax.axis('off')
    plt.show()

# Create the interactive slider widget
# The slider range covers every valid index along the third axis of `img`.
slice_slider = widgets.IntSlider(value=0, min=0, max=img.shape[2] - 1, step=1, description='Slice:')
# Last expression of the cell: the returned widget calls plot_slice with the slider value.
widgets.interactive(plot_slice, slice_number=slice_slider)

Refer to `overlay.ipynb` to compare the new 64-slice NIfTI files.

# Preprocessing

In [None]:
import os
from glob import glob
import shutil
from tqdm import tqdm
import dicom2nifti
import numpy as np
import nibabel as nib
from monai.transforms import(
    Compose,
    AddChanneld,
    LoadImaged,
    Resized,
    ToTensord,
    Spacingd,
    Orientationd,
    ScaleIntensityRanged,
    CropForegroundd,
)
from monai.data import DataLoader, Dataset, CacheDataset
from monai.utils import set_determinism

# Testing

## Creating Groups

Already completed; this section should not need to be run again.

In [None]:
def create_groups(in_dir, out_dir, Number_slices):
    '''
    Split each patient's DICOM files into folders of exactly `Number_slices` files.

    For every patient folder inside `in_dir`, the files are taken in sorted name order
    and moved, `Number_slices` at a time, into folders named `<patient>_<group index>`
    inside `out_dir`. Files left over after the last complete group stay where they are.

    `in_dir`: the path to your folders that contain dicom files (one folder per patient)
    `out_dir`: the path where the group folders are created
    `Number_slices`: the number of slices every group must contain (positive integer)

    Raises `ValueError` when `Number_slices` is smaller than 1.
    '''
    if Number_slices < 1:
        raise ValueError('Number_slices must be a positive integer')

    for patient in tqdm(glob(in_dir + '/*')):
        patient_name = os.path.basename(os.path.normpath(patient))

        # List the files once, in a stable order, before anything is moved
        slice_files = sorted(glob(patient + '/*'))

        # Only complete groups are created
        number_folders = len(slice_files) // Number_slices

        for group_index in range(number_folders):
            output_path = os.path.join(out_dir, patient_name + '_' + str(group_index))
            os.makedirs(output_path, exist_ok=True)
            print('made path:' + output_path)

            # Move exactly Number_slices files; the files are moved (not copied)
            # so that no extra disk space is used
            first = group_index * Number_slices
            for file in slice_files[first:first + Number_slices]:
                shutil.move(file, output_path)

In [None]:
# in_path = '/content/drive/MyDrive/Machine-Learning-Biomedicine/Pancreatic-Cancer/pancreas/dicom_files/images'
# out_path = '/content/drive/MyDrive/Machine-Learning-Biomedicine/Pancreatic-Cancer/pancreas/dicom_groups/images'
in_path = '/content/drive/MyDrive/Machine-Learning-Biomedicine/Pancreatic-Cancer/pancreas/dicom_files/labels'
out_path = '/content/drive/MyDrive/Machine-Learning-Biomedicine/Pancreatic-Cancer/pancreas/dicom_groups/labels'

In [None]:
create_groups(in_path, out_path, 64)

In [None]:
!ls /content/drive/MyDrive/Machine-Learning-Biomedicine/Pancreatic-Cancer/pancreas/dicom_groups/images

In [None]:
!ls /content/drive/MyDrive/Machine-Learning-Biomedicine/Pancreatic-Cancer/pancreas/dicom_groups/labels

## Dcm 2 Nifti

Already completed; this section should not need to be run again.

In [None]:
def dcm2nifti(in_dir, out_dir):
    '''
    Convert every DICOM series folder found in `in_dir` into one compressed nifti file.

    `in_dir`: folder that holds one sub-folder per series (the groups created beforehand).
    `out_dir`: folder that receives the results; each file is named `<sub-folder name>.nii.gz`.
    '''
    series_folders = glob(in_dir + '/*')

    for series_folder in tqdm(series_folders):
        # The output file is named after the series folder itself
        series_name = os.path.basename(os.path.normpath(series_folder))
        nifti_path = os.path.join(out_dir, series_name + '.nii.gz')
        dicom2nifti.dicom_series_to_nifti(series_folder, nifti_path)

In [None]:
in_path_images = '/content/drive/MyDrive/Machine-Learning-Biomedicine/Pancreatic-Cancer/pancreas/dicom_groups/images'
in_path_labels = '/content/drive/MyDrive/Machine-Learning-Biomedicine/Pancreatic-Cancer/pancreas/dicom_groups/labels'
out_path_images = '/content/drive/MyDrive/Machine-Learning-Biomedicine/Pancreatic-Cancer/pancreas/nifti_files/images'
out_path_labels = '/content/drive/MyDrive/Machine-Learning-Biomedicine/Pancreatic-Cancer/pancreas/nifti_files/labels'

In [None]:
dcm2nifti(in_path_images, out_path_images) #out of 278
# dcm2nifti(in_path_labels, out_path_labels) #out of 278 hopefully...

100%|██████████| 278/278 [1:37:30<00:00, 21.04s/it]


## Find Empty

Be careful with this section. It might need to be rerun; this is unclear at the moment.

In [None]:
def find_empty(in_dir):
    '''
    Scan every file in `in_dir` with nibabel and collect the names of the volumes
    whose data holds more than 2 distinct values. Each collected name is also printed.

    NOTE(review): despite the function name, the names returned are those of volumes
    with MORE than 2 distinct values, not of empty ones - confirm the intended condition.

    Returns the list of collected file names.
    '''
    flagged_names = []

    for volume_path in tqdm(glob(os.path.join(in_dir, '*'))):
        distinct_values = np.unique(nib.load(volume_path).get_fdata())

        # Volumes with at most 2 distinct values are not reported
        if len(distinct_values) <= 2:
            continue

        volume_name = os.path.basename(os.path.normpath(volume_path))
        print(volume_name)
        flagged_names.append(volume_name)

    return flagged_names

In [None]:
# Folders scanned by find_empty in the next two cells.
# `in_dir` has to be defined here because the `find_empty(in_dir)` cell below uses it.
in_dir = '/content/drive/MyDrive/Machine-Learning-Biomedicine/Pancreatic-Cancer/pancreas/nifti_files/labels'
# NOTE(review): despite the `_lab` suffix this points at the images folder - confirm intent.
in_dir_lab = '/content/drive/MyDrive/Machine-Learning-Biomedicine/Pancreatic-Cancer/pancreas/nifti_files/images'

In [None]:
find_empty(in_dir_lab)

In [None]:
find_empty(in_dir)

## Preparing the Data

The code in this section is not run in this notebook. Run it through `final_preprocess.py` in main instead.

In [None]:
def preprocess_image(image, a_min=-200, a_max=200):
    """Rescale an image to the range [0, 1] and enhance its contrast with CLAHE.

    Parameters
    ----------
    image : numpy.ndarray
        Intensity values to process.
    a_min, a_max : number
        Accepted so existing callers keep working, but not used by the computation.
        NOTE(review): the intensity shift below uses the fixed constants 60, 200 and
        400 instead - confirm which window is intended.

    Returns
    -------
    numpy.ndarray
        Output of ``skimage.exposure.equalize_adapthist`` applied to the min-max
        normalized image.
    """
    # `exposure` is not imported anywhere at notebook level, so bring it in here
    from skimage import exposure

    # Fixed affine shift/scale of the intensities. No clipping happens, and the
    # min-max normalization below maps the result back onto [0, 1] regardless.
    adjusted_image = (image - 60 + 200) / 400

    # Normalize the adjusted image to the range [0, 1]
    lowest = np.min(adjusted_image)
    value_range = np.max(adjusted_image) - lowest
    if value_range == 0:
        # A constant image would divide 0 by 0 and fill the result with NaN
        normalized_image = np.zeros_like(adjusted_image, dtype=float)
    else:
        normalized_image = (adjusted_image - lowest) / value_range

    # Apply CLAHE enhancement
    enhanced_image = exposure.equalize_adapthist(normalized_image)

    return enhanced_image

def prepare(in_dir, pixdim=(1.5, 1.5, 1.0), a_min=-200, a_max=200, spatial_size=[128, 128, 64], cache=False):
    """Build the MONAI data loaders for the training and testing splits in ``in_dir``.

    Volumes are read from ``TrainVolumes`` / ``TestVolumes`` and segmentations from
    ``TrainSegmentation`` / ``TestSegmentation`` (``*.nii.gz`` files, sorted by path
    and paired by position).

    Parameters
    ----------
    in_dir : str
        Folder that contains the four split sub-folders named above.
    pixdim : tuple
        Voxel spacing passed to ``Spacingd``.
    a_min, a_max : number
        Intensity range passed to ``ScaleIntensityRanged`` (scaled to [0, 1] with
        clipping) and forwarded to ``preprocess_image``.
    spatial_size : list
        Output size passed to ``Resized``.
    cache : bool
        When true the datasets are ``CacheDataset`` objects with ``cache_rate=1.0``;
        otherwise plain ``Dataset`` objects.

    Returns
    -------
    tuple
        ``(train_loader, test_loader)``, both with ``batch_size=1``.
    """
    import warnings
    # Lambdad is not part of the notebook-level MONAI import list
    from monai.transforms import Lambdad

    set_determinism(seed=0)

    def combined_transforms(image):
        # Converted to a NumPy array first because preprocess_image relies on
        # NumPy reductions (np.min / np.max)
        return preprocess_image(np.asarray(image), a_min=a_min, a_max=a_max)

    def pair_files(volume_folder, segmentation_folder):
        # Pair the sorted volume and segmentation paths of one split by position
        volumes = sorted(glob(os.path.join(in_dir, volume_folder, "*.nii.gz")))
        segmentations = sorted(glob(os.path.join(in_dir, segmentation_folder, "*.nii.gz")))
        if len(volumes) != len(segmentations):
            # zip() silently drops the surplus files, which can also misalign the pairs
            warnings.warn(
                f"{volume_folder} has {len(volumes)} files but {segmentation_folder} "
                f"has {len(segmentations)}; unmatched files are ignored."
            )
        return [{"vol": image_name, "seg": label_name} for image_name, label_name in zip(volumes, segmentations)]

    def build_transforms():
        # The training and testing pipelines are identical, so both come from here
        return Compose(
            [
                LoadImaged(keys=["vol", "seg"]),
                AddChanneld(keys=["vol", "seg"]),
                Spacingd(keys=["vol", "seg"], pixdim=pixdim, mode=("bilinear", "nearest")),
                Orientationd(keys=["vol", "seg"], axcodes="RAS"),
                ScaleIntensityRanged(keys=["vol"], a_min=a_min, a_max=a_max, b_min=0.0, b_max=1.0, clip=True),
                CropForegroundd(keys=["vol", "seg"], source_key="vol"),
                # NOTE(review): Resized runs with its default interpolation for "seg"
                # as well - confirm that label values survive the resize unchanged.
                Resized(keys=["vol", "seg"], spatial_size=spatial_size),
                # A plain function cannot be placed in the pipeline with `keys=...`
                # (calling it that way raises TypeError); Lambdad applies it to "vol".
                Lambdad(keys=["vol"], func=combined_transforms),
                ToTensord(keys=["vol", "seg"]),
            ]
        )

    def make_loader(files):
        # Wrap one split in the requested dataset type and a batch-size-1 loader
        transform = build_transforms()
        if cache:
            dataset = CacheDataset(data=files, transform=transform, cache_rate=1.0)
        else:
            dataset = Dataset(data=files, transform=transform)
        return DataLoader(dataset, batch_size=1)

    train_files = pair_files("TrainVolumes", "TrainSegmentation")
    test_files = pair_files("TestVolumes", "TestSegmentation")

    return make_loader(train_files), make_loader(test_files)


In [None]:
data_dir = '/content/drive/MyDrive/Machine-Learning-Biomedicine/Pancreatic-Cancer/pancreas/Data_Train_Test/'

In [None]:
data_in = prepare(data_dir, cache = True)