In [1]:
import os
import json
import numpy as np
from pathlib import Path
import torch
from torch.utils.data import Dataset, DataLoader
from PIL import Image
import matplotlib.pyplot as plt
from torchvision import transforms

class OCTDataset(Dataset):
    def __init__(self, root_dir, indices=None, train=True, is_gentuity=False, transform=None):
        
        self.root_dir = Path(root_dir)
        self.train = train
        self.is_gentuity = is_gentuity
        self.transform = transform
        
        if self.is_gentuity:
            # Gentuity dataset has separate train and test folders
            split_dir = "train" if self.train else "test"
            self.images_dir = self.root_dir / split_dir / "images"
            self.masks_dir = self.root_dir / split_dir / "annotations"
            self.samples = sorted(self.images_dir.glob("*.tiff"))
        else:
            # Terumo dataset has only train data
            self.images_dir = self.root_dir / "train" / "images"
            self.masks_dir = self.root_dir / "train" / "annotations"
            self.samples = sorted(self.images_dir.glob("*.tiff"))
        
        # Filter image paths using indices if provided
        if indices is not None:
            self.samples = [self.samples[i] for i in indices]
        
        else:
            self.samples = sorted(self.images_dir.glob("*.tiff"))
            
    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        # Load the image
        image_path = self.samples[idx]
        image = Image.open(image_path).convert("RGB")  # Keep it as a PIL Image
        # Get image height and width
        width, height = image.size

        # Load the corresponding mask
        mask_path = self.masks_dir / f"{image_path.stem}.json"
        with open(mask_path, "r") as f:
            mask_data = json.load(f)
        
        # Create a binary mask (0 and 1 values)
        mask = np.zeros((height, width), dtype=np.uint8)  # image.size gives (width, height)
        for coord in mask_data["mask"]:
            x, y = coord
            if 0 <= x < mask.shape[0] and 0 <= y < mask.shape[1]:
                mask[x, y] = 1
        
        mask = np.clip(mask, 0, 1).astype(np.uint8)
        
        mask = Image.fromarray(mask*255)
        
        # Apply the transformation if available
        if self.transform:
            # Convert image and mask to Tensor
            image = self.transform(image)
            mask = self.transform(mask)
        
        # Add channel dimension for mask
        mask = mask.unsqueeze(0) if len(mask.shape) == 2 else mask  # Add channel if needed

        unique_id = mask_data["unique_id"]
        
        return image, mask, image_path.stem, unique_id  # Returning image filename too

In [2]:
import torch.nn as nn
import torch.nn.functional as F

class AttentionBlock(nn.Module):
    def __init__(self, in_channels):
        super(AttentionBlock, self).__init__()
        self.conv1 = nn.Conv2d(in_channels, in_channels // 8, kernel_size=1)
        self.conv2 = nn.Conv2d(in_channels // 8, in_channels, kernel_size=1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        # Compute the attention weights
        attention = self.sigmoid(self.conv2(self.sigmoid(self.conv1(x))))
        return x * attention  # Element-wise multiplication

# Define a large U-Net model with Attn
class Net(nn.Module):
    def __init__(self, input_channels=3, output_channels=1, initial_filters=64):
        super(Net, self).__init__()

        self.encoder1 = self.conv_block(input_channels, initial_filters)
        self.encoder2 = self.conv_block(initial_filters, initial_filters * 2)
        self.encoder3 = self.conv_block(initial_filters * 2, initial_filters * 4)
        self.encoder4 = self.conv_block(initial_filters * 4, initial_filters * 8)
        self.encoder5 = self.conv_block(initial_filters * 8, initial_filters * 16)

        self.pool = nn.MaxPool2d(2)

        # Adjusting channels to match attention input
        self.upconv5 = nn.ConvTranspose2d(initial_filters * 16, initial_filters * 8, kernel_size=2, stride=2)
        self.attn5 = AttentionBlock(initial_filters * 16)  # Updated input channels
        self.decoder5 = self.conv_block(initial_filters * 16, initial_filters * 8)

        self.upconv4 = nn.ConvTranspose2d(initial_filters * 8, initial_filters * 4, kernel_size=2, stride=2)
        self.attn4 = AttentionBlock(initial_filters * 8)  # Updated input channels
        self.decoder4 = self.conv_block(initial_filters * 8, initial_filters * 4)

        self.upconv3 = nn.ConvTranspose2d(initial_filters * 4, initial_filters * 2, kernel_size=2, stride=2)
        self.attn3 = AttentionBlock(initial_filters * 4)  # Updated input channels
        self.decoder3 = self.conv_block(initial_filters * 4, initial_filters * 2)

        self.upconv2 = nn.ConvTranspose2d(initial_filters * 2, initial_filters, kernel_size=2, stride=2)
        self.attn2 = AttentionBlock(initial_filters * 2)  # Updated input channels
        self.decoder2 = self.conv_block(initial_filters * 2, initial_filters)

        self.final_conv = nn.Conv2d(initial_filters, output_channels, kernel_size=1)

    def conv_block(self, in_channels, out_channels):
        return nn.Sequential(
            nn.Conv2d(in_channels, out_channels, kernel_size=3, padding=1),
            nn.BatchNorm2d(out_channels),
            nn.ReLU(inplace=True),
            nn.Conv2d(out_channels, out_channels, kernel_size=3, padding=1),
            nn.BatchNorm2d(out_channels),
            nn.ReLU(inplace=True)
        )

    def forward(self, x):
        # Encoding path
        e1 = self.encoder1(x)
        e2 = self.encoder2(self.pool(e1))
        e3 = self.encoder3(self.pool(e2))
        e4 = self.encoder4(self.pool(e3))
        e5 = self.encoder5(self.pool(e4))

        # Decoding path with skip connections and attention
        d5 = torch.cat((self.upconv5(e5), e4), dim=1)
        d5 = self.attn5(d5)  # Apply attention
        d5 = self.decoder5(d5)

        d4 = torch.cat((self.upconv4(d5), e3), dim=1)
        d4 = self.attn4(d4)  # Apply attention
        d4 = self.decoder4(d4)

        d3 = torch.cat((self.upconv3(d4), e2), dim=1)
        d3 = self.attn3(d3)  # Apply attention
        d3 = self.decoder3(d3)

        d2 = torch.cat((self.upconv2(d3), e1), dim=1)
        d2 = self.attn2(d2)  # Apply attention
        d2 = self.decoder2(d2)

        # Final output layer
        out = self.final_conv(d2)

        return torch.sigmoid(out)  # Assuming binary segmentation

class DiceLoss(nn.Module):
    def __init__(self, weight=None, size_average=True):
        super(DiceLoss, self).__init__()

    def forward(self, inputs, targets, smooth=1):   
        #flatten label and prediction tensors
        inputs = inputs.view(-1)
        targets = targets.view(-1)
        
        # Calculate intersection and union
        intersection = torch.sum(inputs * targets)
        union = torch.sum(inputs) + torch.sum(targets)

        # Compute Dice Loss
        dice_loss = 1 - (2. * intersection + smooth) / (union + smooth)  
        
        return dice_loss

class DiceBCELoss(nn.Module):
    def __init__(self, weight=None, size_average=True):
        super(DiceBCELoss, self).__init__()
        self.bce_loss = nn.BCELoss()

    def forward(self, inputs, targets, smooth=1): 
        #flatten label and prediction tensors
        inputs_flatten = inputs.view(-1)
        targets_flatten = targets.view(-1)
        
        # Calculate intersection and union
        intersection = torch.sum(inputs_flatten * targets_flatten)
        union = torch.sum(inputs_flatten) + torch.sum(targets_flatten)

        # Compute BCE Loss
        bce = self.bce_loss(inputs, targets)

        # Compute Dice Loss
        dice_loss = 1 - (2. * intersection + smooth) / (union + smooth)  
        Dice_BCE = bce + dice_loss
        
        return Dice_BCE

In [3]:
import numpy as np
import os
import tempfile
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from filelock import FileLock
from torch.utils.data import random_split
import torchvision
import torchvision.transforms as transforms
from torch.optim.lr_scheduler import ReduceLROnPlateau
from typing import Dict
import ray
from ray import train, tune
from ray.train import Checkpoint
from ray.tune.schedulers import ASHAScheduler
from ray import tune, air
from ray.air import session
from ray.tune.search.optuna import OptunaSearch
from tqdm import tqdm
import pandas as pd
from sklearn.model_selection import StratifiedKFold
import neptune
os.environ["TUNE_WARN_EXCESSIVE_EXPERIMENT_CHECKPOINT_SYNC_THRESHOLD_S"] = "0"

def train_model_cv(config):
    if torch.cuda.is_available():
        device = torch.device("cuda:0")
    else:
        device = torch.device("cpu")
    net = Net().to(device)

    # Select optimizer based on the configuration
    if config["optimizer"] == "AdamW":
        optimizer = optim.AdamW(net.parameters(), lr=config["lr"])
    elif config["optimizer"] == "SGD":
        optimizer = optim.SGD(net.parameters(), lr=config["lr"], momentum=0.9)
    elif config["optimizer"] == "RMSprop":
        optimizer = optim.RMSprop(net.parameters(), lr=config["lr"])

    # Select loss function based on the configuration
    if config["loss_function"] == "DiceLoss":
        criterion = DiceLoss()
    elif config["loss_function"] == "DiceBCELoss":
        criterion = DiceBCELoss()
    elif config["loss_function"] == "BCELoss":
        criterion = nn.BCELoss()

    transform = transforms.Compose([
        transforms.Resize((256, 256), interpolation=Image.NEAREST),
        transforms.ToTensor(),
    ])

    # Load existing checkpoint through `get_checkpoint()` API.
    if train.get_checkpoint():
        loaded_checkpoint = train.get_checkpoint()
        with loaded_checkpoint.as_directory() as loaded_checkpoint_dir:
            model_state, optimizer_state = torch.load(
                os.path.join(loaded_checkpoint_dir, "checkpoint.pt")
            )
            net.load_state_dict(model_state)
            optimizer.load_state_dict(optimizer_state)

    root_dir = config["root_dir"]
    folds= config["folds"]
    
    with open(os.path.join(root_dir, "metadata.csv"), "r") as f:
        metadata_df = pd.read_csv(f)
        skf = StratifiedKFold(n_splits=folds)
        splits = list(skf.split(metadata_df, metadata_df["unique_id"]))

    for fold in range(folds):
        # Train and validate the model
        print(f"Training on fold {fold+1} out of {folds}")
        
        # Initialize Neptune run
        run = neptune.init_run(
            project="OCTAA/OCTSegmenter",
            api_token="eyJhcGlfYWRkcmVzcyI6Imh0dHBzOi8vYXBwLm5lcHR1bmUuYWkiLCJhcGlfdXJsIjoiaHR0cHM6Ly9hcHAubmVwdHVuZS5haSIsImFwaV9rZXkiOiI2MGU2NGNjMi0yNWE0LTRjNzgtOGNlNS1hZDdkMjJhYzYxMWUifQ==",
            name="training_and_validation",
            tags="terumo",
        )  # your credentials

        run["sys/group_tags"].add([
            str(config["loss_function"]), 
            str(config["optimizer"]), 
            f"Fold: {str(fold)}"
        ])  # Group tags

        # Log configuration parameters
        run["parameters"] = config

        train_indices, val_indices = splits[fold]

        train_dataset = OCTDataset(root_dir, indices=train_indices, transform=transform)
        val_dataset = OCTDataset(root_dir, indices=val_indices, transform=transform)

        trainloader = DataLoader(train_dataset, batch_size=config["batch_size"], shuffle=True)
        valloader = DataLoader(val_dataset, batch_size=config["batch_size"], shuffle=False)

        best_val_loss = float("inf")
        epochs = config["epochs"]
        no_improvement_epochs = 0
        patience = config["patience"]
        scheduler = ReduceLROnPlateau(optimizer, factor=0.1, patience=10)
        
        for epoch in range(epochs):
            net.train()
            running_loss = 0.0
            epoch_steps= 0

            for i, data in enumerate(trainloader):
                images, masks, _, _ = data
                images, masks = images.to(device), masks.to(device)

                optimizer.zero_grad()
                outputs = net(images)
                loss = criterion(outputs, masks)
                loss.backward()
                optimizer.step()

                running_loss += loss.item() * images.size(0)

                epoch_steps += 1
                if i % 10 == 9:  # print every 10 mini-batches
                    print("[%d, %5d] loss: %.3f" % (epoch + 1, i + 1,
                                                    loss.item()))


            # Calculate training loss and accuracy for the epoch
            train_loss = running_loss / len(trainloader.dataset)
            run["train_loss"].append(train_loss)  # Log training loss to neptune
            print(f"Epoch [{epoch+1}/{epochs}], Training Loss: {train_loss:.4f}")

            # Validation phase
            net.eval()
            val_loss = 0.0
            dice_loss = 0.0

            with torch.no_grad():  # No need to calculate gradients during validation
                for data in valloader:
                    images, masks, _, _ = data
                    images, masks = images.to(device), masks.to(device)

                    outputs = net(images)
                    loss = criterion(outputs, masks)
                    val_loss += loss.item() * images.size(0)

                    # Calculate Dice loss
                    dice = DiceLoss()
                    loss = dice(outputs, masks)
                    dice_loss += loss.item() * images.size(0)
            
            # Calculate validation loss and accuracy
            val_loss = val_loss / len(valloader.dataset)
            avg_dice_loss = dice_loss / len(valloader.dataset)
            scheduler.step(val_loss) # Adjust learning rate based on validation loss
            run["val_loss"].append(val_loss)  # Log validation loss
            run["dice_loss"].append(avg_dice_loss)  # Log Dice loss
            print(f"Epoch [{epoch+1}/{epochs}], Validation Loss: {val_loss:.4f}")

            with tempfile.TemporaryDirectory() as temp_checkpoint_dir:
                path = os.path.join(temp_checkpoint_dir, "checkpoint.pt")
                torch.save(
                    (net.state_dict(), optimizer.state_dict()), path
                )
                checkpoint = Checkpoint.from_directory(temp_checkpoint_dir)
                train.report(
                    {"loss": val_loss, "accuracy": 1 - avg_dice_loss, "dice_loss": avg_dice_loss, "fold": fold},
                    checkpoint=checkpoint,
                )

            # Check if validation loss improves
            if val_loss < best_val_loss:
                best_val_loss = val_loss
                no_improvement_epochs = 0
                print(f"Validation loss improved to {val_loss:.4f}. Saving checkpoint.")
                
            else:
                no_improvement_epochs += 1
                print(f"Validation loss did not improve. Best so far: {best_val_loss:.4f}")
            
            if no_improvement_epochs >= patience:
                print(f"Stopping early. No improvement in {patience} epochs.")
                run["early_stopping"] = True
                break

        run.stop()
        print("Finished Training")

    
def test_best_model(best_result):
    
    # Initialize Neptune run
    run = neptune.init_run(
        project="OCTAA/OCTSegmenter",
        api_token="eyJhcGlfYWRkcmVzcyI6Imh0dHBzOi8vYXBwLm5lcHR1bmUuYWkiLCJhcGlfdXJsIjoiaHR0cHM6Ly9hcHAubmVwdHVuZS5haSIsImFwaV9rZXkiOiI2MGU2NGNjMi0yNWE0LTRjNzgtOGNlNS1hZDdkMjJhYzYxMWUifQ==",
        name="best_model_test",
        tags="gentuity"  
    )  # your credentials

    # Log configuration parameters
    run["parameters"] = best_result.config

    if torch.cuda.is_available():
        device = torch.device("cuda:0")
    else:
        device = torch.device("cpu")
    best_trained_model = Net().to(device)

    checkpoint_path = os.path.join(best_result.checkpoint.to_directory(), "checkpoint.pt")

    model_state, optimizer_state = torch.load(checkpoint_path, weights_only=True)
    best_trained_model.load_state_dict(model_state)

    transform = transforms.Compose([
        transforms.Resize((256, 256), interpolation=Image.NEAREST),
        transforms.ToTensor(),
    ])

    root_dir = r"D:\OneDrive - Aarhus Universitet\9. Semester\Deep Learning\data_gentuity"

    test_dataset = OCTDataset(root_dir, transform=transform, train=False, is_gentuity=True)
    testloader = DataLoader(test_dataset, batch_size=best_result.config["batch_size"], shuffle=False)

    criterion = DiceLoss()

    total_loss = 0.0
    with torch.no_grad():  # Disable gradient calculation
        for data in testloader:
            images, masks, _, _ = data
            images, masks = images.to(device), masks.to(device)

            outputs = best_trained_model(images)
            predicted = (outputs > 0.5).float()
            loss = criterion(predicted, masks)
            total_loss += loss.item() * images.size(0)

    # Calculate average loss and accuracy
    total_loss /= len(testloader.dataset)
    accuracy = 1 - loss

    run["test_loss"] = total_loss
    run.stop()
    print(f"Test Loss: {total_loss:.4f}, Test Accuracy: {accuracy:.4f}")

In [None]:
from ray.train import RunConfig, CheckpointConfig

def plot_cv_indices(cv, X, y, ax, n_splits, lw=10, cmap_data="tab10"):
    """Create a sample plot for indices of a cross-validation object."""
    # Generate the training/testing visualizations for each CV split
    for ii, (tr, tt) in enumerate(cv.split(X=X, y=y)):
        # Fill in indices with the training/test groups
        indices = np.array([np.nan] * len(X))
        indices[tt] = 1  # Mark the test samples
        indices[tr] = 0  # Mark the train samples

        # Visualize the results for the current split
        # Train samples in light blue, validation samples in redder orange
        ax.scatter(
            range(len(indices)),
            [ii + 0.5] * len(indices),
            c=np.where(indices == 0, '#add8e6', '#ff4500'),  # Set light blue and redder orange
            marker="_",
            lw=lw,
        )

    # Plot the unique_id at the end (instead of class labels)
    ax.scatter(
        range(len(X)), [ii + 1.5] * len(X), c=y, marker="_", lw=lw, cmap=cmap_data
    )

    # Add a legend for train and validation splits
    from matplotlib.lines import Line2D
    legend_elements = [
        Line2D([0], [0], color='#add8e6', lw=4, label='Train'),
        Line2D([0], [0], color='#ff4500', lw=4, label='Validation'),
    ]
    ax.legend(handles=legend_elements, loc='upper right', fontsize=12)

    # Formatting
    yticklabels = list(range(n_splits)) + ["unique_id"]
    ax.set(
        yticks=np.arange(n_splits + 1) + 0.5,
        yticklabels=yticklabels,
        xlabel="Sample index",
        ylabel="CV iteration",
        ylim=[n_splits + 1.2, -0.2],
        xlim=[0, len(X)],
    )
    ax.set_title(f"{type(cv).__name__} Cross-Validation", fontsize=15)
    return ax

# Visualize splits
def visualize_cv_splits(metadata_df, n_splits=9):
    # Extract unique IDs and their corresponding target (unique_id)
    unique_ids = metadata_df["unique_id"].values

    # Initialize StratifiedKFold with n_splits
    skf = StratifiedKFold(n_splits=n_splits)

    # Create the plot
    fig, ax = plt.subplots(figsize=(10, 6))

    # Plot the cross-validation splits
    plot_cv_indices(
        skf, X=metadata_df, y=unique_ids, ax=ax, n_splits=n_splits
    )

    plt.show()

# Custom function to shorten trial directory names
def trial_dirname_creator(trial):
    # Shorten the trial name to only include key parameters
    return f"trial_{trial.trial_id}_lr={trial.config['lr']:.1e}_opt={trial.config['optimizer']}_bs={trial.config['batch_size']}"

def main(num_samples, gpus_per_trial, epochs, smoke_test, folds):
    if smoke_test:
        root_dir = r"D:\OneDrive - Aarhus Universitet\9. Semester\Deep Learning\data_terumo_smoke_test"
        with open(os.path.join(root_dir, "metadata.csv"), "r") as f:
            metadata_df = pd.read_csv(f)
            skf = StratifiedKFold(n_splits=folds)
            visualize_cv_splits(metadata_df, n_splits=folds)

    else:
        print("Using full dataset")
    
    config = {
        "root_dir": root_dir,
        "lr": tune.loguniform(1e-6, 1e-2),
        "epochs": epochs,
        "smoke_test": smoke_test,
        "batch_size": tune.choice([4]),
        "optimizer": tune.grid_search(["AdamW", "SGD", "RMSprop"]),
        "folds": folds,
        "patience": 20,
        "loss_function": tune.grid_search(["DiceLoss", "BCELoss", "DiceBCELoss"])
    }

    # ASHA SCHEDULER, BUT WILL NOT BE USED
    # scheduler = ASHAScheduler(
    #     max_t=5,
    #     grace_period=5,
    #     reduction_factor=2
    # )

    # Define your checkpoint configuration
    checkpoint_config = CheckpointConfig(
        num_to_keep=1,  # Only keep the best checkpoint
        checkpoint_score_attribute="loss",  # The metric used to determine the best checkpoint
        checkpoint_score_order="min",  # Keep the checkpoint with the lowest loss
    )

    # Define the run config with the checkpoint config
    run_config = RunConfig(checkpoint_config=checkpoint_config)

    tuner = tune.Tuner(
        tune.with_resources(
            tune.with_parameters(train_model_cv),
            resources={"cpu": 2, "gpu": gpus_per_trial}
        ),
        tune_config=tune.TuneConfig(
            metric="loss",
            mode="min",
            num_samples=num_samples,
            trial_dirname_creator=trial_dirname_creator,
        ),
        param_space=config,
        run_config=run_config,
    )
    results = tuner.fit()
    
    best_result = results.get_best_result("dice_loss", "min")

    print("Best trial config: {}".format(best_result.config))
    print("Best trial final validation loss: {}".format(
        best_result.metrics["loss"]))
    print("Best trial final validation accuracy: {}".format(
        best_result.metrics["accuracy"]))

    test_best_model(best_result)

main(num_samples=2, gpus_per_trial=1, epochs=2, smoke_test=True, folds=5)

0,1
Current time:,2024-11-28 11:41:14
Running for:,00:02:29.70
Memory:,12.6/15.9 GiB

Trial name,status,loc,batch_size,loss_function,lr,optimizer,iter,total time (s),loss,accuracy,dice_loss
train_model_cv_f10aa_00000,RUNNING,127.0.0.1:11856,4,DiceLoss,1.44375e-06,AdamW,3.0,132.168,0.834576,0.165424,0.834576
train_model_cv_f10aa_00001,PENDING,,4,BCELoss,7.35583e-06,AdamW,,,,,
train_model_cv_f10aa_00002,PENDING,,4,DiceBCELoss,0.000338883,AdamW,,,,,
train_model_cv_f10aa_00003,PENDING,,4,DiceLoss,1.13945e-06,SGD,,,,,
train_model_cv_f10aa_00004,PENDING,,4,BCELoss,0.000150183,SGD,,,,,
train_model_cv_f10aa_00005,PENDING,,4,DiceBCELoss,0.000150427,SGD,,,,,
train_model_cv_f10aa_00006,PENDING,,4,DiceLoss,5.66629e-05,RMSprop,,,,,
train_model_cv_f10aa_00007,PENDING,,4,BCELoss,2.26306e-05,RMSprop,,,,,
train_model_cv_f10aa_00008,PENDING,,4,DiceBCELoss,1.32408e-05,RMSprop,,,,,
train_model_cv_f10aa_00009,PENDING,,4,DiceLoss,1.87621e-05,AdamW,,,,,


[36m(train_model_cv pid=11856)[0m Training on fold 1 out of 5
[36m(train_model_cv pid=11856)[0m [neptune] [info   ] Neptune initialized. Open in the app: https://app.neptune.ai/OCTAA/OCTSegmenter/e/OCT-183
[36m(train_model_cv pid=11856)[0m [1,    10] loss: 0.909
[36m(train_model_cv pid=11856)[0m [1,    20] loss: 0.770
[36m(train_model_cv pid=11856)[0m [1,    30] loss: 0.908
[36m(train_model_cv pid=11856)[0m Epoch [1/2], Training Loss: 0.8519
[36m(train_model_cv pid=11856)[0m Epoch [1/2], Validation Loss: 0.8653


[36m(train_model_cv pid=11856)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=C:/Users/johan/ray_results/train_model_cv_2024-11-28_11-38-34/trial_f10aa_00000_lr=1.4e-06_opt=AdamW_bs=4/checkpoint_000000)


[36m(train_model_cv pid=11856)[0m Validation loss improved to 0.8653. Saving checkpoint.
[36m(train_model_cv pid=11856)[0m [2,    10] loss: 0.876
[36m(train_model_cv pid=11856)[0m [2,    20] loss: 0.905
[36m(train_model_cv pid=11856)[0m [2,    30] loss: 0.914
[36m(train_model_cv pid=11856)[0m Epoch [2/2], Training Loss: 0.8407
[36m(train_model_cv pid=11856)[0m Epoch [2/2], Validation Loss: 0.8504


[36m(train_model_cv pid=11856)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=C:/Users/johan/ray_results/train_model_cv_2024-11-28_11-38-34/trial_f10aa_00000_lr=1.4e-06_opt=AdamW_bs=4/checkpoint_000001)


[36m(train_model_cv pid=11856)[0m Validation loss improved to 0.8504. Saving checkpoint.
[36m(train_model_cv pid=11856)[0m [neptune] [info   ] Shutting down background jobs, please wait a moment...
[36m(train_model_cv pid=11856)[0m [neptune] [info   ] Done!
[36m(train_model_cv pid=11856)[0m [neptune] [info   ] Waiting for the remaining 8 operations to synchronize with Neptune. Do not kill this process.
[36m(train_model_cv pid=11856)[0m [neptune] [info   ] All 8 operations synced, thanks for waiting!
[36m(train_model_cv pid=11856)[0m [neptune] [info   ] Explore the metadata in the Neptune app: https://app.neptune.ai/OCTAA/OCTSegmenter/e/OCT-183/metadata
[36m(train_model_cv pid=11856)[0m Finished Training
[36m(train_model_cv pid=11856)[0m Training on fold 2 out of 5
[36m(train_model_cv pid=11856)[0m [neptune] [info   ] Neptune initialized. Open in the app: https://app.neptune.ai/OCTAA/OCTSegmenter/e/OCT-184
[36m(train_model_cv pid=11856)[0m [1,    10] loss: 0.858
[36

[36m(train_model_cv pid=11856)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=C:/Users/johan/ray_results/train_model_cv_2024-11-28_11-38-34/trial_f10aa_00000_lr=1.4e-06_opt=AdamW_bs=4/checkpoint_000002)


[36m(train_model_cv pid=11856)[0m Validation loss improved to 0.8346. Saving checkpoint.


2024-11-28 11:41:14,102	INFO tune.py:1009 -- Wrote the latest version of all result files and experiment state to 'C:/Users/johan/ray_results/train_model_cv_2024-11-28_11-38-34' in 0.0130s.


[36m(train_model_cv pid=11856)[0m [2,    10] loss: 0.854


2024-11-28 11:41:24,296	INFO tune.py:1041 -- Total run time: 159.97 seconds (149.68 seconds for the tuning loop).
Resume experiment with: Tuner.restore(path="C:/Users/johan/ray_results/train_model_cv_2024-11-28_11-38-34", trainable=...)
- train_model_cv_f10aa_00001: FileNotFoundError('Could not fetch metrics for train_model_cv_f10aa_00001: both result.json and progress.csv were not found at C:/Users/johan/ray_results/train_model_cv_2024-11-28_11-38-34/trial_f10aa_00001_lr=7.4e-06_opt=AdamW_bs=4')
- train_model_cv_f10aa_00002: FileNotFoundError('Could not fetch metrics for train_model_cv_f10aa_00002: both result.json and progress.csv were not found at C:/Users/johan/ray_results/train_model_cv_2024-11-28_11-38-34/trial_f10aa_00002_lr=3.4e-04_opt=AdamW_bs=4')
- train_model_cv_f10aa_00003: FileNotFoundError('Could not fetch metrics for train_model_cv_f10aa_00003: both result.json and progress.csv were not found at C:/Users/johan/ray_results/train_model_cv_2024-11-28_11-38-34/trial_f10aa_00

Best trial config: {'root_dir': 'D:\\OneDrive - Aarhus Universitet\\9. Semester\\Deep Learning\\data_terumo_smoke_test', 'lr': 1.4437474936625422e-06, 'epochs': 2, 'smoke_test': True, 'batch_size': 4, 'optimizer': 'AdamW', 'folds': 5, 'patience': 20, 'loss_function': 'DiceLoss'}
Best trial final validation loss: 0.8345764146910774
Best trial final validation accuracy: 0.16542358530892265
[neptune] [info   ] Neptune initialized. Open in the app: https://app.neptune.ai/OCTAA/OCTSegmenter/e/OCT-185


In [5]:
import os
from ray import tune
from ray.train import Result

storage_path = r"C:\Users\johan\ray_results"
exp_name = "train_model_cv_2024-11-26_18-30-35"
experiment_path = os.path.join(storage_path, exp_name)
print(f"Loading results from {experiment_path}...")

restored_tuner = tune.Tuner.restore(experiment_path, trainable=train_model_cv)
result_grid = restored_tuner.get_results()

# Check if there have been errors
if result_grid.errors:
    print("One of the trials failed!")
else:
    print("No errors!")
    
# Get the result with the maximum test set `mean_accuracy`
best_result: Result = result_grid.get_best_result()

# Load the best checkpoint if it exists
if best_result.checkpoint:
    checkpoint_path = os.path.join(best_result.checkpoint.to_directory(), "checkpoint.pt")
else:
    raise ValueError("No checkpoint found for the best result.")

Loading results from C:\Users\johan\ray_results\train_model_cv_2024-11-26_18-30-35...
One of the trials failed!


OSError: [WinError 112] Failed copying 'C:/Users/johan/ray_results/train_model_cv_2024-11-26_18-30-35/trial_2b60a_00035_lr=1.0e-04_opt=RMSprop_bs=4/checkpoint_000004/checkpoint.pt' to 'C:/Users/johan/AppData/Local/Temp/checkpoint_tmp_eb1ac5460b9145e2847271a019179487/checkpoint.pt'. Detail: [Windows error 112] Der er ikke tilstrækkelig plads på disken.


In [None]:
import os
from ray import tune
from ray.train import Result

storage_path = r"C:\Users\johan\ray_results"
exp_name = "train_model_cv_2024-11-26_16-31-22"
experiment_path = os.path.join(storage_path, exp_name)
print(f"Loading results from {experiment_path}...")

restored_tuner = tune.Tuner.restore(experiment_path, trainable=train_model_cv)
result_grid = restored_tuner.get_results()

# Check if there have been errors
if result_grid.errors:
    print("One of the trials failed!")
else:
    print("No errors!")

num_results = len(result_grid)
print("Number of results:", num_results)

# Iterate over results
for i, result in enumerate(result_grid):
    if result.error:
        print(f"Trial #{i} had an error:", result.error)
        continue

    print(f"Trial #{i} finished successfully with a loss of: {result.metrics['loss']}")

results_df = result_grid.get_dataframe()
results_df[["training_iteration", "loss"]]

print("Shortest training time:", results_df["time_total_s"].min())
print("Longest training time:", results_df["time_total_s"].max())

ax = None
for result in result_grid:
    label = f"lr={result.config['lr']:.4f}, batch_size={result.config['batch_size']}, optimizer={result.config['optimizer']}, fold={result.config['fold']}"
    if ax is None:
        ax = result.metrics_dataframe.plot("training_iteration", "loss", label=label)
    else:
        result.metrics_dataframe.plot("training_iteration", "loss", ax=ax, label=label)
ax.set_title("Loss vs. Training Iteration for All Trials")
ax.set_ylabel("Loss")

# Get the result with the maximum test set `mean_accuracy`
best_result: Result = result_grid.get_best_result()

print("Best trial config: {}".format(best_result.config))

# Get the best trial's final loss and accuracy
best_loss = best_result.metrics["loss"]
print("Best trial final test set loss: {}".format(best_loss))
best_accuracy = best_result.metrics["accuracy"]
print("Best trial final test set accuracy: {}".format(best_accuracy))

# Load the best model
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

best_trained_model = Net().to(device)

# Load the best checkpoint
checkpoint_path = os.path.join(best_result.checkpoint.to_directory(), "checkpoint.pt")

model_state, optimizer_state = torch.load(checkpoint_path, weights_only=True)
best_trained_model.load_state_dict(model_state)

# Set the model to evaluation mode
best_trained_model.eval()

# Load a sample image from the test dataset
root_dir = r"D:\OneDrive - Aarhus Universitet\9. Semester\Deep Learning\data_gentuity"
transform = transforms.Compose([
    transforms.Resize((256, 256), interpolation=Image.NEAREST),
    transforms.ToTensor(),
])
test_dataset = OCTDataset(root_dir, transform=transform, train=False, is_gentuity=True)
random_indices = np.random.choice(len(test_dataset), 1, replace=False)
sample_image, sample_mask, _, _ = test_dataset[random_indices[0]]  # Change the index to load a different sample

# Move the sample image to the appropriate device
sample_image = sample_image.to(device).unsqueeze(0)  # Add batch dimension

# Make a prediction
with torch.no_grad():
    prediction = best_trained_model(sample_image)

# Convert the prediction to a binary mask
predicted_mask = (prediction > 0.5).float()

# Plot the sample image, ground truth mask, and predicted mask
fig, ax = plt.subplots(1, 3, figsize=(18, 6))

ax[0].imshow(sample_image.squeeze().permute(1, 2, 0).cpu().numpy())
ax[0].set_title("Sample Image")
ax[0].axis('off')

ax[1].imshow(sample_mask.squeeze().cpu().numpy(), cmap='gray')
ax[1].set_title("Ground Truth Mask")
ax[1].axis('off')

ax[2].imshow(predicted_mask.squeeze().cpu().numpy(), cmap='gray')
ax[2].set_title("Predicted Mask")
ax[2].axis('off')

plt.show()

- train_model_cv_83259_00005: FileNotFoundError('Could not fetch metrics for train_model_cv_83259_00005: both result.json and progress.csv were not found at C:/Users/johan/ray_results/train_model_cv_2024-11-26_16-31-22/trial_83259_00005_lr=1.0e-04_opt=Adam_bs=4')
- train_model_cv_83259_00022: FileNotFoundError('Could not fetch metrics for train_model_cv_83259_00022: both result.json and progress.csv were not found at C:/Users/johan/ray_results/train_model_cv_2024-11-26_16-31-22/trial_83259_00022_lr=1.0e-04_opt=SGD_bs=4')
- train_model_cv_83259_00017: FileNotFoundError('Could not fetch metrics for train_model_cv_83259_00017: both result.json and progress.csv were not found at C:/Users/johan/ray_results/train_model_cv_2024-11-26_16-31-22/trial_83259_00017_lr=1.0e-04_opt=SGD_bs=4')
- train_model_cv_83259_00033: FileNotFoundError('Could not fetch metrics for train_model_cv_83259_00033: both result.json and progress.csv were not found at C:/Users/johan/ray_results/train_model_cv_2024-11-26_

Loading results from C:\Users\johan\ray_results\train_model_cv_2024-11-26_16-31-22...
One of the trials failed!
Number of results: 45


KeyError: 'loss'