In [None]:
pip install SimpleITK

Collecting SimpleITK
  Downloading SimpleITK-2.3.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (52.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m52.7/52.7 MB[0m [31m42.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: SimpleITK
Successfully installed SimpleITK-2.3.1


# Data Loading and Preprocessing


Defining paths and loading the dataset.

Getting lists of subset images files.

Combining file lists and removing duplicates.

Defining functions to create rectangular bounding boxes around nodules and to retrieve filenames.

Loading annotations and mapping series UIDs to filenames.

Constructing a DataFrame with image data, bounding boxes, and class labels.

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from torchvision.models.detection import retinanet_resnet50_fpn_v2, RetinaNet_ResNet50_FPN_V2_Weights
import matplotlib.pyplot as plt
from tqdm import tqdm
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from glob import glob
import cv2
import SimpleITK as sitk
from google.colab import drive
import os
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

# Mount Google Drive so the dataset on the shared drive is reachable
drive.mount('/content/drive')

# Define paths and load the dataset
file_path_0 = "/content/drive/Shareddrives/IA DL_project/ML IA/LUNA16/subsets/subset0"
file_path_1 = "/content/drive/Shareddrives/IA DL_project/ML IA/LUNA16/subsets/subset1"
annotations_path = "/content/drive/Shareddrives/IA DL_project/ML IA/LUNA16/annotations.csv"

# Getting list of image files from both subsets
file_list_0 = glob(file_path_0 + "/*.mhd")
file_list_1 = glob(file_path_1 + "/*.mhd")

# Combine file lists and remove duplicates.  Sorting gives a stable order:
# iterating a bare set of strings can differ from one kernel session to the
# next, which would reorder every table built from this list.
file_list = sorted(set(file_list_0 + file_list_1))

# List filenames in subset0 and subset1 directories to understand the format
print("Sample filenames in subset0:", [os.path.basename(f) for f in file_list_0][:5])
print("Sample filenames in subset1:", [os.path.basename(f) for f in file_list_1][:5])

# Look up the image file that belongs to a given series UID
def get_filename(file_list, case):
    """Return the first path in ``file_list`` whose base name contains ``case``.

    The comparison is a plain substring test against
    ``os.path.basename(path)``; directory components are ignored.

    Args:
        file_list: iterable of file paths.
        case: string to look for (the caller passes a series UID).

    Returns:
        The first matching path, or ``None`` when nothing matches.
    """
    candidates = (path for path in file_list if case in os.path.basename(path))
    return next(candidates, None)

# Load annotations
df_node = pd.read_csv(annotations_path)

# Extract series UIDs from filenames.  Only the final extension is stripped:
# splitting on the first '.' would keep just the text before the first dot,
# collapsing any dotted UID (and the uniqueness check below) to a fragment.
series_uids_from_files = [os.path.splitext(os.path.basename(f))[0] for f in file_list]
print("Sample series UIDs from filenames:", series_uids_from_files[:5])

# Verify series UID compatibility
unique_seriesuid_annotations = df_node['seriesuid'].nunique()
unique_seriesuid_images = len(set(series_uids_from_files))

print("Unique series UID in annotations:", unique_seriesuid_annotations)
print("Unique series UID in image files:", unique_seriesuid_images)

# Map the series UID to file names
df_node["file"] = df_node["seriesuid"].map(lambda file_name: get_filename(file_list, file_name))
df_node = df_node.dropna(subset=["file"])  # Drop rows where file mapping is not found

print("Annotations with associated files:", df_node.shape[0])
print(df_node.head())

# Define DataFrame columns
columns = ["seriesuid", "sliceindex", "imagedata", "bbox", "class"]
data = []

# Define target size for downsampling
target_size = (256, 256)

# Function to convert 3D center and diameter to 2D bounding box
def convert_to_2d_bounding_box(center, diam, spacing, origin):
    """Turn a centre point and diameter into an axis-aligned 2D pixel box.

    Only the first two components (X, Y) of ``center``, ``origin`` and
    ``spacing`` are used.  The centre is shifted by the origin and divided by
    the spacing; the diameter is divided by the spacing on each axis.

    Args:
        center: array-like supporting slicing and arithmetic (e.g. ``np.ndarray``).
        diam: diameter, in the same units as ``center``.
        spacing: per-axis spacing array.
        origin: per-axis origin array.

    Returns:
        ``[x_min, y_min, x_max, y_max]`` as Python ints (``int()`` truncation).
    """
    xy_spacing = spacing[:2]
    center_px = (center[:2] - origin[:2]) / xy_spacing
    half_extent_px = (diam / xy_spacing) / 2

    lower = center_px - half_extent_px
    upper = center_px + half_extent_px
    return [int(lower[0]), int(lower[1]), int(upper[0]), int(upper[1])]

# Function to load images and determine if they contain nodules
def load_images_with_nodules(file_list, annotations_df, target_size=(256, 256)):
    """Read every scan in ``file_list`` and build a per-slice table.

    For a scan with at least one row in ``annotations_df`` (matched on the
    ``file`` column), one record is produced per annotation whose slice index
    falls inside the volume: the resized slice, the rescaled bounding box and
    class ``1``.  For a scan with no annotations, one record is produced for
    every slice with a ``[0, 0, 0, 0]`` box and class ``0``.

    The ``seriesuid`` column holds the bare UID in both cases.  Rows for
    non-annotated scans previously stored the file name including its
    extension, which did not match the annotated rows.

    Args:
        file_list: paths readable by ``sitk.ReadImage``.
        annotations_df: DataFrame with columns ``file``, ``seriesuid``,
            ``coordX``, ``coordY``, ``coordZ`` and ``diameter_mm``.
        target_size: size passed to ``cv2.resize`` (width, height).

    Returns:
        DataFrame with columns ``seriesuid``, ``sliceindex``, ``imagedata``,
        ``bbox`` and ``class``.
    """
    records = []
    for img_file in tqdm(file_list):
        mini_df = annotations_df[annotations_df["file"] == img_file]
        itk_img = sitk.ReadImage(img_file)
        img_array = sitk.GetArrayFromImage(itk_img)  # indexed below as [slice, row, col]
        origin = np.array(itk_img.GetOrigin())
        spacing = np.array(itk_img.GetSpacing())
        original_size = img_array.shape[1:3]  # (rows, cols) of one slice

        # Per-scan box scale factors: x scales with columns, y with rows.
        scale_x = target_size[0] / original_size[1]
        scale_y = target_size[1] / original_size[0]

        if mini_df.shape[0] > 0:
            for _, row in mini_df.iterrows():
                node_x, node_y, node_z = row["coordX"], row["coordY"], row["coordZ"]
                diam = row["diameter_mm"]
                center = np.array([node_x, node_y, node_z])
                i_z = int(np.rint((node_z - origin[2]) / spacing[2]))

                if 0 <= i_z < img_array.shape[0]:  # Check if the z-index is within the slice range
                    bbox = convert_to_2d_bounding_box(center, diam, spacing, origin)
                    img_resized = cv2.resize(img_array[i_z], target_size, interpolation=cv2.INTER_AREA)

                    # Scale bounding box coordinates to match resized image
                    bbox_resized = [
                        int(bbox[0] * scale_x),
                        int(bbox[1] * scale_y),
                        int(bbox[2] * scale_x),
                        int(bbox[3] * scale_y),
                    ]

                    records.append([row["seriesuid"], i_z, img_resized, bbox_resized, 1])  # 1 for nodule class
        else:
            # Same identifier form as the annotated rows: base name without its extension.
            series_uid = os.path.splitext(os.path.basename(img_file))[0]
            for i in range(img_array.shape[0]):
                img_resized = cv2.resize(img_array[i], target_size, interpolation=cv2.INTER_AREA)
                records.append([series_uid, i, img_resized, [0, 0, 0, 0], 0])  # 0 for non-nodule class
    return pd.DataFrame(records, columns=["seriesuid", "sliceindex", "imagedata", "bbox", "class"])

# Build the per-slice table: one row per in-range annotation for scans that
# have annotations, and one row per slice for scans that have none.
df_slices = load_images_with_nodules(file_list, df_node)


In [None]:
# Tally the rows of each class in the slice table
is_nodule_row = df_slices['class'] == 1
is_background_row = df_slices['class'] == 0
num_images_with_nodules = int(is_nodule_row.sum())
num_images_without_nodules = int(is_background_row.sum())

print(f"Number of images with nodules: {num_images_with_nodules}")
print(f"Number of images without nodules: {num_images_without_nodules}")

# Sanity check: every row must belong to exactly one of the two classes
total_counted = num_images_with_nodules + num_images_without_nodules
assert total_counted == len(df_slices), "Counts do not match total number of images"


Number of images with nodules: 240
Number of images without nodules: 10234


## Part 3: Data Preprocessing and Augmentation

The `preprocess_and_save_images` function saves the preprocessed images as `.pt` files so that they load quickly during training. The `NoduleDataset` class loads these preprocessed images and their corresponding bounding boxes.

Data augmentation techniques such as random horizontal and vertical flips are applied to the images. The preprocessed images are then split into training and validation sets.
We also modify the RetinaNet model to handle our specific number of classes and add batch normalization to the convolutional layers.


In [None]:
import os
import numpy as np
from glob import glob
import torch
from torch.utils.data import Dataset, DataLoader
import matplotlib.pyplot as plt
from tqdm import tqdm
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import torchvision
import torch.optim as optim
import torch.nn as nn
import torchvision.transforms as T
from torchvision.ops.boxes import box_iou
from pycocotools.coco import COCO
from pycocotools.cocoeval import COCOeval
import torch.multiprocessing as mp

# Set start method for multiprocessing to 'spawn'.  force=True replaces a
# start method that may already have been set in this process.
# NOTE(review): with 'spawn', DataLoader workers have to unpickle the Dataset
# and collate function; objects defined only in notebook cells may not be
# importable from a worker -- confirm that num_workers > 0 works here.
mp.set_start_method('spawn', force=True)

# Function to preprocess and save images
def preprocess_and_save_images(df, save_dir):
    """Write every row of ``df`` to ``<save_dir>/<seriesuid>_<sliceindex>.pt``.

    Each file is a dict with keys ``image`` (float tensor with a leading
    channel axis added), ``bbox`` (list of ints) and ``class`` (int).  The
    metadata is stored as plain Python numbers so the file does not depend on
    the numeric wrapper types of the DataFrame.

    Rows that share ``seriesuid`` and ``sliceindex`` map to the same file
    name, so a later row overwrites an earlier one.

    Args:
        df: DataFrame with columns ``seriesuid``, ``sliceindex``,
            ``imagedata``, ``bbox`` and ``class``.
        save_dir: output directory; created if missing.
    """
    # exist_ok avoids the check-then-create race and is a no-op when the folder exists.
    os.makedirs(save_dir, exist_ok=True)

    for _, row in tqdm(df.iterrows(), total=len(df)):
        filename = f"{row['seriesuid']}_{row['sliceindex']}.pt"
        filepath = os.path.join(save_dir, filename)
        sample = {
            'image': torch.tensor(row['imagedata']).float().unsqueeze(0),
            'bbox': [int(v) for v in row['bbox']],
            'class': int(row['class']),
        }
        torch.save(sample, filepath)

# Preprocess and save images.  The target folder is under /content, outside
# the /content/drive mount point used for the dataset.
save_dir = '/content/preprocessed_images'
preprocess_and_save_images(df_slices, save_dir)

class NoduleDataset(Dataset):
    """Dataset over ``.pt`` samples holding ``image``, ``bbox`` and ``class``.

    Each item is ``(image, target)`` where ``target`` is a dict with keys
    ``boxes`` and ``labels``.  A sample whose ``class`` equals 1 yields one
    box and label ``1``; any other sample yields an empty ``(0, 4)`` box
    tensor and label ``0``.

    NOTE(review): ``transform`` is applied to the image only; ``boxes`` are
    returned unchanged, so a geometric transform (e.g. a flip) would leave
    them pointing at the un-transformed location.
    """

    def __init__(self, file_list, transform=None):
        self.file_list = file_list
        self.transform = transform

    def __len__(self):
        return len(self.file_list)

    def __getitem__(self, idx):
        sample = torch.load(self.file_list[idx])

        if sample['class'] == 1:
            boxes = torch.tensor(sample['bbox']).float().unsqueeze(0)  # [num_boxes, 4]
            labels = torch.tensor([1], dtype=torch.int64)  # 1 for nodules
        else:
            boxes = torch.empty((0, 4))  # no boxes for non-nodules
            labels = torch.tensor([0], dtype=torch.int64)
        target = {'boxes': boxes, 'labels': labels}

        image = sample['image']
        if self.transform:
            image = self.transform(image)
        return image, target

def custom_collate_fn(batch):
    """Regroup a sequence of samples into one tuple per field.

    ``[(a1, b1), (a2, b2)]`` becomes ``((a1, a2), (b1, b2))``.  Nothing is
    stacked, so fields of differing sizes can be batched together.
    """
    per_field = zip(*batch)
    return tuple(per_field)

import math

# Get list of preprocessed files.  glob returns files in arbitrary order, so
# sort them: otherwise the seeded split below is not reproducible.
file_list = sorted(glob(os.path.join(save_dir, '*.pt')))

# Split the file list into training and validation sets
train_files, val_files = train_test_split(file_list, test_size=0.2, random_state=42)

# Data augmentation.  NoduleDataset applies its `transform` to the image only,
# so random flips passed there would move the image but not its boxes.  The
# flips are therefore done in the training collate function, where image and
# boxes can be flipped together.
transform = None


def flip_augment_collate_fn(batch, p_hflip=0.5, p_vflip=0.5):
    """Collate ``(image, target)`` samples, randomly flipping each one.

    Every sample is independently flipped horizontally with probability
    ``p_hflip`` and vertically with probability ``p_vflip``; ``target['boxes']``
    (``[x_min, y_min, x_max, y_max]``) is mirrored along with the image.

    Returns the same structure as ``custom_collate_fn``:
    ``(tuple_of_images, tuple_of_targets)``.
    """
    images, targets = [], []
    for image, target in batch:
        boxes = target['boxes'].clone()
        height, width = image.shape[-2:]
        if torch.rand(1).item() < p_hflip:
            image = image.flip(-1)
            # Mirror x and swap min/max so that x_min <= x_max still holds.
            boxes[:, [0, 2]] = width - boxes[:, [2, 0]]
        if torch.rand(1).item() < p_vflip:
            image = image.flip(-2)
            boxes[:, [1, 3]] = height - boxes[:, [3, 1]]
        images.append(image)
        targets.append({**target, 'boxes': boxes})
    return tuple(images), tuple(targets)


# Create DataLoader with increased batch size
batch_size = 32  # Increased batch size (tried with 16, 32 and 64)
train_dataset = NoduleDataset(train_files, transform=transform)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=flip_augment_collate_fn, num_workers=2)

val_dataset = NoduleDataset(val_files)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, collate_fn=custom_collate_fn, num_workers=2)

# Load model
num_classes = 2
model = torchvision.models.detection.retinanet_resnet50_fpn_v2(weights=torchvision.models.detection.RetinaNet_ResNet50_FPN_V2_Weights.DEFAULT)
cls_head = model.head.classification_head
reg_head = model.head.regression_head

# Replace the output layers for our number of classes.  A default-initialised
# Conv2d starts every anchor near probability 0.5, which makes the initial
# focal loss very large; re-apply the initialisation torchvision uses for
# these layers (small normal weights, bias set for a 0.01 prior probability).
cls_head.cls_logits = nn.Conv2d(cls_head.cls_logits.in_channels, cls_head.num_anchors * num_classes, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
nn.init.normal_(cls_head.cls_logits.weight, std=0.01)
nn.init.constant_(cls_head.cls_logits.bias, -math.log((1 - 0.01) / 0.01))
cls_head.num_classes = num_classes

reg_head.bbox_reg = nn.Conv2d(reg_head.bbox_reg.in_channels, cls_head.num_anchors * 4, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
nn.init.normal_(reg_head.bbox_reg.weight, std=0.01)
nn.init.zeros_(reg_head.bbox_reg.bias)

# No BatchNorm wrapping is applied here: model.head's direct children are the
# classification/regression sub-heads rather than nn.Conv2d layers, so an
# isinstance(..., nn.Conv2d) filter over model.head.named_children() matches
# nothing and leaves the model unchanged.



## Part 4: Model Training Setup

Set up the training process for the model.
The model is moved to the GPU if available.
We use the AdamW optimizer with a tuned learning rate and a Cosine Annealing learning rate scheduler. Mixed precision training is enabled to speed up training and reduce memory usage.

The `train_one_epoch` function handles the training of the model for one epoch. It includes the forward pass, loss computation, backpropagation, and optimization steps.

We also guard against a NaN loss: if the average loss is NaN, it is replaced with a large value. NaNs occurred occasionally when training was unstable.


In [None]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
torch.cuda.empty_cache()
model.to(device)

epochs = 30  # Increased number of epochs for better training

# Setup optimizer with AdamW
learning_rate = 1e-4  # Tuned learning rate
optimizer = optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=0.0001)

# Learning rate scheduler.  T_max is tied to the number of epochs: with a
# shorter T_max the cosine curve reaches its minimum early and then rises
# back towards the initial learning rate for the remaining epochs.
scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=epochs)

# Using mixed precision training (only active when CUDA is available)
scaler = torch.cuda.amp.GradScaler(enabled=torch.cuda.is_available())

# Bookkeeping for the training and evaluation loop
train_losses = []
val_acc_scores = []
val_mse_scores = []
best_model_wts = None
best_val_acc = 0.0

def train_one_epoch(model, data_loader, optimizer, device, scaler):
    """Train ``model`` for one pass over ``data_loader`` and return the mean loss.

    Each batch is moved to ``device``, the model's loss dict is summed, and
    the step is taken through ``scaler`` (autocast is enabled only when CUDA
    is available).

    A batch whose summed loss is NaN/inf is skipped before ``backward``: it is
    not backpropagated and is excluded from the average, so a single bad batch
    no longer turns the whole epoch's loss into ``inf``.

    Args:
        model: detection model returning a dict of losses in training mode.
        data_loader: yields ``(images, targets)`` batches.
        optimizer: optimizer over ``model``'s parameters.
        device: device the batch is moved to.
        scaler: ``torch.cuda.amp.GradScaler`` used for backward and step.

    Returns:
        Mean loss over the batches that were used, or ``float('inf')`` when
        no batch produced a finite loss (including an empty loader).
    """
    model.train()
    use_amp = torch.cuda.is_available()
    epoch_loss = 0.0
    used_batches = 0
    for images, targets in tqdm(data_loader):
        images = [image.to(device) for image in images]
        targets = [{k: v.to(device) for k, v in t.items()} for t in targets]

        optimizer.zero_grad()
        with torch.cuda.amp.autocast(enabled=use_amp):
            loss_dict = model(images, targets)
            losses = sum(loss for loss in loss_dict.values())

        if not torch.isfinite(losses):
            print("Skipping batch with non-finite loss")
            continue

        scaler.scale(losses).backward()
        scaler.step(optimizer)
        scaler.update()

        epoch_loss += losses.item()
        used_batches += 1

    avg_loss = epoch_loss / used_batches if used_batches else float('inf')
    print(f"Epoch loss: {avg_loss}")
    return avg_loss



## Part 5: Model Evaluation and Visualization

We evaluate the model and visualize the predictions. The `evaluate` function calculates the accuracy and mean squared error (MSE) of the model on the validation set. It also displays a confusion matrix to show the classification performance of the model in distinguishing between nodules and non-nodules.

The `visualize_predictions` function plots the predicted and true bounding boxes on the images. This helps us visually inspect the performance of the model.

We train the model for multiple epochs (30 defined before), evaluate its performance on the validation set, and save the best model weights based on the validation accuracy. Finally, we plot the training loss and validation metrics over the epochs.


In [None]:
def evaluate(model, data_loader, device, score_threshold=0.5):
    """Compute image-level accuracy and box MSE on ``data_loader``.

    Ground-truth labels and predicted detections are different kinds of
    lists with different lengths, so they cannot be compared position by
    position.  Each image is instead reduced to one true and one predicted
    class:

    * true class: 1 if the target has at least one box, else 0;
    * predicted class: 1 if the model returns at least one detection with
      label 1 and score >= ``score_threshold``, else 0.

    The MSE is taken over images that are both truly and predictedly
    positive, between the highest-scoring kept detection and the target's
    first box.

    Args:
        model: detection model returning dicts with ``boxes``, ``scores``
            and ``labels`` in eval mode.
        data_loader: yields ``(images, targets)`` batches.
        device: device the images are moved to.
        score_threshold: minimum score for a detection to count.

    Returns:
        ``(accuracy, mse)``.  ``(0, inf)`` when the loader is empty; ``mse``
        is ``inf`` when no image contributes a box pair.
    """
    model.eval()
    y_true = []
    y_pred = []
    paired_pred_boxes = []
    paired_true_boxes = []
    mse_loss = nn.MSELoss()

    with torch.no_grad():
        for images, targets in tqdm(data_loader):
            images = [image.to(device) for image in images]
            outputs = model(images)

            for target, output in zip(targets, outputs):
                true_boxes = target['boxes'].cpu().float()
                scores = output['scores'].cpu().float()
                pred_boxes = output['boxes'].cpu().float()
                keep = (scores >= score_threshold) & (output['labels'].cpu() == 1)

                has_nodule = true_boxes.shape[0] > 0
                predicts_nodule = bool(keep.any())
                y_true.append(int(has_nodule))
                y_pred.append(int(predicts_nodule))

                if has_nodule and predicts_nodule:
                    # Highest-scoring kept detection; rejected ones are masked out.
                    best = scores.masked_fill(~keep, float('-inf')).argmax()
                    paired_pred_boxes.append(pred_boxes[best])
                    paired_true_boxes.append(true_boxes[0])

    if len(y_true) == 0:
        print("No predictions or labels to compare.")
        return 0, float('inf')  # Handle cases where there are no predictions

    y_true = np.array(y_true)
    y_pred = np.array(y_pred)

    accuracy = np.sum(y_true == y_pred) / len(y_true)
    if paired_pred_boxes:
        mse = mse_loss(torch.stack(paired_pred_boxes), torch.stack(paired_true_boxes)).item()
    else:
        mse = float('inf')

    print(f'y_true length: {len(y_true)}')
    print(f'y_pred length: {len(y_pred)}')

    # Confusion Matrix.  labels=[0, 1] keeps it 2x2 even if one class is absent,
    # so it always matches the two display labels.
    cm = confusion_matrix(y_true, y_pred, labels=[0, 1])
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=['Non-nodule', 'Nodule'])
    disp.plot(cmap=plt.cm.Blues)
    plt.show()

    return accuracy, mse

def visualize_predictions(model, data_loader, device, max_images=1):
    """Plot true (green) and predicted (red) boxes for a few images.

    The function returns as soon as ``max_images`` figures have been shown.
    A ``break`` inside the per-image loop only leaves that loop, so inference
    would otherwise run on every batch and draw one figure per batch.

    Args:
        model: detection model returning dicts with ``boxes`` in eval mode.
        data_loader: yields ``(images, targets)`` batches.
        device: device the images are moved to.
        max_images: number of figures to draw before returning.
    """
    model.eval()
    shown = 0
    with torch.no_grad():
        for images, targets in data_loader:
            if shown >= max_images:
                return
            images = [image.to(device) for image in images]
            outputs = model(images)
            for img, target, output in zip(images, targets, outputs):
                if shown >= max_images:
                    return
                img = img.cpu().numpy().transpose(1, 2, 0)
                # Min-max scale for display; a constant image would divide by zero.
                value_range = img.max() - img.min()
                if value_range > 0:
                    img = (img - img.min()) / value_range
                else:
                    img = np.zeros_like(img)
                plt.imshow(img, cmap='gray')

                true_boxes = target['boxes'].cpu().numpy()
                for box in true_boxes:
                    plt.gca().add_patch(plt.Rectangle((box[0], box[1]), box[2]-box[0], box[3]-box[1], fill=False, edgecolor='green', linewidth=2))

                pred_boxes = output['boxes'].cpu().numpy()
                for box in pred_boxes:
                    plt.gca().add_patch(plt.Rectangle((box[0], box[1]), box[2]-box[0], box[3]-box[1], fill=False, edgecolor='red', linewidth=2))

                plt.show()
                shown += 1


# Training and evaluation
import copy

for epoch in range(epochs):
    print(f"Epoch {epoch+1}/{epochs}")
    train_loss = train_one_epoch(model, train_loader, optimizer, device, scaler)
    train_losses.append(train_loss)

    # Evaluate once per epoch; a second pass over the validation set would
    # only repeat the same computation.
    val_acc, val_mse = evaluate(model, val_loader, device)
    val_acc_scores.append(val_acc)
    val_mse_scores.append(val_mse)
    visualize_predictions(model, val_loader, device)

    print(f'Epoch {epoch+1}/{epochs}, Training Loss: {train_loss}, Validation Accuracy: {val_acc}, Validation MSE: {val_mse}')

    # Adjust learning rate.  CosineAnnealingLR.step() takes no metric; a
    # positional argument is interpreted as the epoch index, so passing the
    # loss would set the schedule position to the loss value.
    scheduler.step()

    # Save best model weights
    if val_acc > best_val_acc:
        best_val_acc = val_acc
        # state_dict() returns references to the live parameters; copy them so
        # the in-memory "best" weights are not changed by later epochs.
        best_model_wts = copy.deepcopy(model.state_dict())
        torch.save(best_model_wts, 'best_model_weights.pth')
        print('Saved best model weights')

print(f"Total number of samples in training dataset: {len(train_dataset)}")

# Plotting loss and metrics
epochs_range = range(1, epochs + 1)
plt.figure(figsize=(12, 6))

plt.subplot(1, 2, 1)
plt.plot(epochs_range


## Testing Part

In [None]:
import os
import SimpleITK as sitk
import numpy as np
from glob import glob
import torch
from torch.utils.data import Dataset, DataLoader
import torchvision.transforms.functional as F
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc
from google.colab import drive

# Assuming NoduleDataset and other necessary functions are already defined

class NoduleDataset(Dataset):
    """Test-time dataset that reads image files with SimpleITK.

    Each item is ``(image, target)``: the min-max normalised array from the
    file with one leading axis added, and a placeholder target from
    ``get_bbox``.

    NOTE(review): this class has the same name as the ``NoduleDataset``
    defined for training earlier in the notebook and shadows it once this
    cell has run.
    """

    def __init__(self, file_list):
        self.file_list = file_list

    def __len__(self):
        return len(self.file_list)

    def __getitem__(self, idx):
        file_path = self.file_list[idx]
        image = self.load_image(file_path)
        bbox = self.get_bbox(file_path)
        return image, bbox

    @staticmethod
    def _normalize(img_array):
        """Min-max scale ``img_array`` to [0, 1] as float32.

        A constant array (max == min) is returned as all zeros instead of
        dividing by zero.
        """
        img_array = img_array.astype(np.float32)
        lowest = np.min(img_array)
        value_range = np.max(img_array) - lowest
        if value_range == 0:
            return np.zeros_like(img_array)
        return (img_array - lowest) / value_range

    def load_image(self, file_path):
        """Read ``file_path`` and return it as a normalised tensor.

        NOTE(review): the whole array from the file is returned with one
        leading axis added.  If the file holds a multi-slice volume the
        result is 4-D, whereas the training samples were single 2-D slices
        with a channel axis -- confirm that the model accepts this input.
        """
        itk_img = sitk.ReadImage(file_path)
        img_array = self._normalize(sitk.GetArrayFromImage(itk_img))
        img_array = np.expand_dims(img_array, axis=0)  # Add channel dimension
        return torch.tensor(img_array)

    def get_bbox(self, file_path):
        """Return a placeholder target; ``file_path`` is currently ignored."""
        # Implement the logic to get bounding box for the given image
        # Here we return a dummy bbox, replace this with your actual bbox logic
        bbox = [0, 0, 50, 50]  # Example bbox, replace with actual
        return {'boxes': torch.tensor([bbox], dtype=torch.float32), 'labels': torch.tensor([1], dtype=torch.int64)}

# Locate the subset9 image headers
subset9_path = "/content/drive/Shareddrives/IA DL_project/ML IA/LUNA16/subsets/subset9"
file_list = glob(f"{subset9_path}/*.mhd")

# One file per batch; samples are regrouped into per-field tuples, not stacked
subset9_dataset = NoduleDataset(file_list)
subset9_loader = DataLoader(
    subset9_dataset,
    batch_size=1,
    shuffle=False,
    collate_fn=lambda samples: tuple(zip(*samples)),
)

# Function to perform testing
def test_model(model, data_loader, device):
    model.eval()
    all_boxes = []
    all_scores = []
    all_labels = []
    all_true_boxes = []

    with torch.no_grad():
        for images, targets in data_loader:
            images = list(image.to(device) for image in images)
            outputs = model(images)

            for i, output in enumerate(outputs):
                pred_boxes = output['boxes'].cpu().numpy()
                pred_scores = output['scores'].cpu().numpy()
                pred_labels = output['labels'].cpu().numpy()
                true_boxes = targets[i]['boxes'].cpu().numpy()

                all_boxes.append(pred_boxes)
                all_scores.append(pred_scores)
                all_labels.append(pred_labels)
                all_true_boxes.append(true_boxes)

                # Visualize the detection
                image = images[i].cpu().numpy().squeeze(0)
                plt.imshow(image, cmap='gray')
                ax = plt.gca()
                for box in pred_boxes:
                    rect = plt.Rectangle((box[0], box[1]), box[2] - box[0], box[3] - box[1], fill=False, color='r')
                    ax.add_patch(rect)
                for box in true_boxes:
                    rect = plt.Rectangle((box[0], box[1]), box[2] - box[0], box[3] - box[1], fill=False, color='g')
                    ax.add_patch(rect)
                plt.show()

    return all_boxes, all_scores, all_labels, all_true_boxes

# Function to plot FROC
def compute_froc_points(true_boxes, pred_boxes, pred_scores, iou_threshold=0.5):
    """Compute the points of a FROC curve from per-image boxes and scores.

    Within each image, detections are visited from highest to lowest score.
    A detection is a true positive if its best IoU with a not-yet-matched
    ground-truth box is at least ``iou_threshold``; otherwise it is a false
    positive.  All detections are then ranked by score across images and
    accumulated.

    Args:
        true_boxes: per-image sequences of ``[x_min, y_min, x_max, y_max]``.
        pred_boxes: per-image sequences of predicted boxes, same format.
        pred_scores: per-image sequences of scores, aligned with ``pred_boxes``.
        iou_threshold: minimum IoU for a detection to match a ground-truth box.

    Returns:
        ``(false_positives_per_image, sensitivity)`` as two NumPy arrays with
        one entry per detection, ordered by decreasing score.  Both are empty
        when there are no images or no detections.
    """
    n_images = len(true_boxes)
    total_gt = 0
    scored_hits = []  # (score, is_true_positive) for every detection

    for gt, boxes, scores in zip(true_boxes, pred_boxes, pred_scores):
        gt = np.asarray(gt, dtype=float).reshape(-1, 4)
        boxes = np.asarray(boxes, dtype=float).reshape(-1, 4)
        scores = np.asarray(scores, dtype=float).reshape(-1)
        total_gt += len(gt)
        claimed = np.zeros(len(gt), dtype=bool)
        gt_area = (gt[:, 2] - gt[:, 0]) * (gt[:, 3] - gt[:, 1])

        for j in np.argsort(-scores, kind="stable"):
            hit = False
            if len(gt):
                inter_w = np.clip(np.minimum(boxes[j, 2], gt[:, 2]) - np.maximum(boxes[j, 0], gt[:, 0]), 0, None)
                inter_h = np.clip(np.minimum(boxes[j, 3], gt[:, 3]) - np.maximum(boxes[j, 1], gt[:, 1]), 0, None)
                inter = inter_w * inter_h
                box_area = (boxes[j, 2] - boxes[j, 0]) * (boxes[j, 3] - boxes[j, 1])
                union = box_area + gt_area - inter
                iou = np.where(union > 0, inter / np.where(union > 0, union, 1.0), 0.0)
                iou[claimed] = -1.0  # each ground-truth box can be matched once
                best = int(np.argmax(iou))
                if iou[best] >= iou_threshold:
                    claimed[best] = True
                    hit = True
            scored_hits.append((scores[j], hit))

    if n_images == 0 or not scored_hits:
        return np.zeros(0), np.zeros(0)

    scored_hits.sort(key=lambda item: -item[0])
    hits = np.array([is_hit for _, is_hit in scored_hits], dtype=bool)
    true_pos = np.cumsum(hits)
    false_pos = np.cumsum(~hits)
    sensitivity = true_pos / total_gt if total_gt else np.zeros(len(hits))
    return false_pos / n_images, sensitivity


# Function to plot FROC
def plot_froc(true_boxes, pred_boxes, pred_scores, iou_threshold=0.5):
    """Plot sensitivity against average false positives per image (FROC).

    Feeding ``roc_curve`` a label vector of length
    ``len(true_boxes) + len(pred_boxes)`` together with one score per
    prediction gives inputs of different lengths, and an ROC curve is not a
    FROC curve.  Detections are therefore matched to ground truth by IoU
    (see ``compute_froc_points``).

    Args:
        true_boxes: per-image ground-truth boxes.
        pred_boxes: per-image predicted boxes.
        pred_scores: per-image prediction scores.
        iou_threshold: IoU needed for a detection to count as a hit.
    """
    fps_per_image, sensitivity = compute_froc_points(true_boxes, pred_boxes, pred_scores, iou_threshold)

    # Plot the FROC curve
    plt.figure()
    plt.plot(fps_per_image, sensitivity, color='darkorange', lw=2, label='FROC (IoU >= %0.2f)' % iou_threshold)
    plt.ylim([0.0, 1.05])
    plt.xlabel('Average false positives per image')
    plt.ylabel('Sensitivity')
    plt.title('FROC Curve')
    plt.legend(loc="lower right")
    plt.show()

# Load the best model.  'best_model_weights.pth' was written from
# model.state_dict(), so it contains a dict of tensors rather than a model
# object: it has no .to() and must be loaded into an existing model.
# Note that this overwrites the weights currently held by `model`.
best_model = model
best_model.load_state_dict(torch.load('best_model_weights.pth', map_location=device))
best_model.to(device)

# Perform testing
all_boxes, all_scores, all_labels, all_true_boxes = test_model(best_model, subset9_loader, device)

# Plot FROC
plot_froc(all_true_boxes, all_boxes, all_scores)
