# Mono and Stereo Model implementation
The following notebook implements a pipeline for training a Faster R-CNN with ResNet50 Backbone on our custom Dataset. 

## Installs
More packages than the ones listed here will most likely have to be installed. `torch` and `torchvision` are listed explicitly because these versions worked with the CUDA version available on bwVisu. If needed, uncomment the line and install the matching version. Note that newer versions of torch (torchvision) extend the available object detection models, which may be worth a look.

In [None]:
# Depending on the available cuda version, specific versions of torch have to be used
#!pip install torch==1.7.1+cu110 torchvision==0.8.2+cu110 -f https://download.pytorch.org/whl/torch_stable.html

## Imports

In [None]:
# Basic python imports
import os
from io import StringIO
from contextlib import redirect_stdout
import random
import json
from pprint import pprint

# Image and array handling imports
import cv2
import numpy as np
from PIL import Image
import matplotlib.pyplot as plt

# Torch imports
import torch
import torchvision
from torchvision.transforms import functional as func
from torchvision import transforms
from torch.utils.data import Dataset, DataLoader

# Torch model related imports
import torch.nn as nn
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor
from torchvision.models.detection.faster_rcnn import fasterrcnn_resnet50_fpn
from torch.optim.lr_scheduler import ReduceLROnPlateau

# Additional imports
from pycocotools.coco import COCO # Handling of coco annotation files
import albumentations as alb # Image augmentation

## Utility functions
The following is a collection of functions needed in various steps of the pipeline.

In [None]:
# Basic helper functions
def json_loader(path):
    '''Returns the parsed JSON content of the file at the given path.

    The file is opened inside a ``with`` block so the handle is closed as soon
    as parsing has finished (also when parsing raises).
    '''
    with open(path) as file:
        return json.load(file)
    
def pil_loader(path):
    '''Loads the image stored at the given filepath and hands it back as an RGB Pillow image'''
    # Pillow receives an explicitly opened file object instead of the path,
    # which avoids a ResourceWarning
    # (https://github.com/python-pillow/Pillow/issues/835)
    with open(path, 'rb') as handle, Image.open(handle) as source:
        rgb_image = source.convert('RGB')
    return rgb_image

def read_lines_from_text_file(filename):
    '''Returns the content of a text file as a list of lines (line endings removed)'''
    with open(filename, 'r') as handle:
        content = handle.read()
    return content.splitlines()

def seed_worker(worker_id):
    '''Seeds numpy and python's random module from the current torch seed.

    Intended as ``worker_init_fn`` of a DataLoader so that parallel workers
    provide consistent results. ``worker_id`` is accepted but not used.
    '''
    # Reduce the torch seed to 32 bit so numpy accepts it
    derived_seed = torch.initial_seed() % (1 << 32)
    for seed_function in (np.random.seed, random.seed):
        seed_function(derived_seed)


# Functions for handling the annotation files
def fileName2imageID(filename, json_decode):
    '''Returns the corresponding image id for a filename from a given annotation structure.

    An entry whose file name (with or without extension, ignoring folders)
    equals ``filename`` is preferred. A plain substring test alone would pick
    "img_10.png" for "img_1" if it happens to be listed first. If no exact match
    exists, the first entry containing ``filename`` as a substring is used.

    :param filename Name of the image, with or without file extension
    :param json_decode Decoded annotation file with an 'images' list whose
                       entries provide 'file_name' and 'id'
    :return The 'id' of the matching image entry
    :raises StopIteration if no entry matches at all
    '''
    images = json_decode['images']

    # First pass: exact match on the base name or on the base name without extension
    for entry in images:
        base_name = os.path.basename(entry['file_name'])
        if filename in (entry['file_name'], base_name, os.path.splitext(base_name)[0]):
            return entry['id']

    # Second pass: substring search as fallback
    result = next(z for z in images if filename in z['file_name'])
    return result['id']

def catId2Label(cat_Id, json_decode):
    '''Looks up the name of the category with the given id in an annotation structure.

    Raises StopIteration if the id is not listed under 'categories'.
    '''
    matching_names = (category['name']
                      for category in json_decode['categories']
                      if category['id'] == cat_Id)
    return next(matching_names)

def getAnnos(filename, json_decode, coco):
    '''Collects all annotations of one image.

    The image id is resolved from ``json_decode`` via ``fileName2imageID`` and
    then used to query the given COCO object.
    '''
    image_id = fileName2imageID(filename, json_decode)
    # Anything the COCO calls print to stdout is captured and discarded
    with redirect_stdout(StringIO()):
        return coco.loadAnns(coco.getAnnIds(image_id))

def mask_loader(path, img_name):
    '''Convert the annotations read from the annotation file to the format used by the model.

    The annotation file is parsed once by ``COCO``; the decoded content is then
    taken from ``coco.dataset`` instead of reading and parsing the file a
    second time.

    :param path Path of the COCO annotation file
    :param img_name Name of the image whose annotations are requested
    :return List with one dict per annotation:
            {'bboxes': <bbox as stored in the file>, 'labels': <category id>}
    '''
    # COCO outputs an annoying amount of information. This captures the output.
    with redirect_stdout(StringIO()):
        coco = COCO(path)
    annotations = getAnnos(img_name, coco.dataset, coco)
    return [{'bboxes': ann['bbox'], 'labels': ann['category_id']} for ann in annotations]
    
def mask_np_loader(path):
    '''Loads a mask stored as numpy file and keeps only its first channel (for working with masks instead of bounding boxes)'''
    stored_mask = np.load(path)
    # Only index 0 of the last axis is needed, the remaining channels are dropped
    return stored_mask[..., 0]


# Function to account for an issue with batch loading
def collate(batch):
    '''Regroups a batch of samples field by field: [(a1, b1), (a2, b2)] becomes ((a1, a2), (b1, b2))'''
    columns = zip(*batch)
    return tuple(columns)


# Postprocessing functions
def apply_nms(prediction, threshold):
    '''Applying Non-Maximum-Suppression to a given set of predictions and filtering out boxes with an iou greater than the threshold.

    :param prediction Dict with the tensors 'boxes', 'scores' and 'labels'
    :param threshold IoU threshold handed to ``torchvision.ops.nms``
    :return A new dict holding only the kept entries of 'boxes', 'scores' and
            'labels'. Other keys are carried over unchanged and the input dict
            itself is not modified.
    '''
    # torchvision returns the indices of the boxes to keep not the actual predictions
    keep = torchvision.ops.nms(prediction['boxes'], prediction['scores'], threshold)

    # Shallow copy: assigning into an alias of the input would silently
    # shrink the caller's prediction as well
    final_prediction = dict(prediction)
    for key in ('boxes', 'scores', 'labels'):
        final_prediction[key] = prediction[key][keep]

    return final_prediction

def split_classes(predictions):
    '''Generator function to split the set of predictions into smaller sets containing predictions for only one class.

    :param predictions Dict with the tensors 'boxes', 'labels' and 'scores'
    :return Yields one dict per label value occurring in 'labels'. Each is a
            shallow copy of the input in which the three tensors are reduced
            to the entries of that label. Nothing is yielded for empty labels.
    '''
    labels = predictions['labels']

    for class_label in set(labels.tolist()):
        # Boolean mask computed by torch in one step instead of comparing
        # every tensor element in a python loop
        keep = labels == class_label
        class_predictions = predictions.copy()
        for key in ('boxes', 'labels', 'scores'):
            class_predictions[key] = predictions[key][keep]
        yield class_predictions
        
def save_predictions_to_json(filenames, ground_truth, predictions, filepath):
    '''Writes the filenames, the ground truth targets and the predictions into one json file.

    ``ground_truth`` and ``predictions`` are sequences of dicts whose values
    offer ``tolist()`` (e.g. tensors); they are stored as nested lists.
    '''
    def as_plain_lists(entries):
        # json cannot serialize tensors, so every value is turned into a list
        return [{key: value.tolist() for key, value in entry.items()} for entry in entries]

    payload = {
        'filenames': filenames,
        'labels': as_plain_lists(ground_truth),
        'predictions': as_plain_lists(predictions),
    }

    with open(filepath, 'w') as out_file:
        json.dump(payload, out_file, indent=2)

## Custom Datasets

### Basic EndoDataset

In [None]:
class EndoDataset(Dataset):
    """ The endoscopic dataset requires the following folder structure:
    Surgery --> Video --> images which contain the images to be loaded
    This mono class works with split files, which are text files that contain the
    relative path and the image name. (It is a mono class because it returns a single image)
    The class reads the image file paths that are specified in the text file and loads the images.
    It applies the specified augmentation to the image, else it just converts it into a tensor.
    :returns Tuple of the pre-processed and augmented image as a tensor and the
             split-file entry it was loaded from
    """

    def __init__(self, data_root_folder=None,
                 filenames=None,
                 height=448,
                 width=448,
                 image_aug=None,
                 aug_prob=0.5,
                 camera="left",
                 image_ext='.png'):
        """
        :param data_root_folder Root folder all relative paths are joined to
        :param filenames List of split-file entries ("relative_path image_name [side]")
        :param height Target height used by the resize helpers
        :param width Target width used by the resize helpers
        :param image_aug Optional augmentation, called as image_aug(image=<np array>)["image"]
        :param aug_prob Stored on the instance; not evaluated inside this class
        :param camera "left" or "right", used when an entry carries no side
        :param image_ext File extension; '.npy' is loaded with numpy, everything else with Pillow
        """
        # The one-argument form super(EndoDataset) is an unbound proxy and
        # never reaches Dataset.__init__
        super().__init__()
        self.data_root_folder = data_root_folder
        self.filenames = filenames
        self.height = height
        self.width = width
        self.image_ext = image_ext
        self.camera = camera

        # Pick image loader based on image format
        self.image_loader = np.load if self.image_ext == '.npy' else pil_loader
        # Maps the side given in a split entry to the number of the image folder
        self.side_map = {"2": 2, "3": 3, "l": 2, "r": 3}
        self.cam_to_side = {"left": "l", "right": "r"}

        # Image pre-processing options
        self.image_aug = image_aug
        self.aug_prob = aug_prob

        # Pick resize function based on image format
        if self.image_ext == '.npy':
            # Output: Resized numpy array. cv2 expects the size as (width, height) in integers
            self.resize = lambda x: cv2.resize(x, (self.width, self.height), interpolation=cv2.INTER_NEAREST)
            self.resize_bigger = lambda x: cv2.resize(x, (int(self.width * 1.2), int(self.height * 1.2)),
                                                      interpolation=cv2.INTER_NEAREST)
        else:
            # Output: Resized PIL Image. The default interpolation of Resize is
            # bilinear, so no Pillow constant is needed (Image.LINEAR is not
            # available in current Pillow releases)
            self.resize = transforms.Resize((self.height, self.width))
            # Resize to dims slightly larger than given dims
            # Sometimes useful for aug together with crop function
            self.resize_bigger = transforms.Resize((int(self.height * 1.2),
                                                    int(self.width * 1.2)))

    def get_split_filename(self, filename):
        """ Splits a filename string comprising of "relative_path <space> image_name"
        :param filename A string comprising of "relative_path <space> image_name"
        :return (relative_path, image_name, side) with the side of the configured
                camera if the entry has exactly two parts, otherwise the list of
                whitespace separated parts as found in the entry
        """
        split_filename = filename.split()
        # folder, frame_num, side
        if len(split_filename) == 2:
            return split_filename[0], split_filename[1], self.cam_to_side[self.camera]
        return split_filename

    def make_image_path(self, data_root, rel_folder, image_name, side=None):
        """Combines the relative path with the data root to get the complete path of image
        :param side Key of ``side_map`` ("l", "r", "2" or "3"); any other value raises a KeyError
        :return <data_root>/<rel_folder>/image_0<2|3>/images/<image_name><image_ext>
        """
        path = (os.path.join(data_root, rel_folder,
                            "image_0{}".format(self.side_map[side]), "images", image_name) + self.image_ext)
        return path

    def preprocess(self, image):
        """Applies the optional image augmentation and converts the image to a tensor.
        No resizing happens here and ``aug_prob`` is not evaluated.
        """
        if self.image_aug:
            image = self.image_aug(image=np.asarray(image))["image"]  # alb needs np input
        return func.to_tensor(np.array(image))

    def __len__(self):
        return len(self.filenames)

    def __getitem__(self, index):
        """Returns the image with pre-proc transforms + aug applied to it"""
        args = self.get_split_filename(self.filenames[index])
        image = self.image_loader(self.make_image_path(self.data_root_folder, *args))
        image = self.preprocess(image)
        return image, self.filenames[index]

### Dataset Monovision (with BB)

In [None]:
class EndoMaskDataset(EndoDataset):
    """ Loads an image and its corresponding annotations ("mask")
    Some aug is performed common to both image and mask
    Some aug like color aug is performed only on the image
    :returns Pre-proc+aug image, mask, split-file entry
    """

    def __init__(self, mask_transform=None,
                 image_mask_aug=None,
                 mask_path_suffix="",
                 **kwargs):
        """
        :param mask_transform Optional list of transforms, combined with transforms.Compose
                              (stored only; not applied inside this class)
        :param image_mask_aug Optional augmentation applied to image and boxes together,
                              called with image=, bboxes= and class_labels=
        :param mask_path_suffix Stored only; not used when building the annotation path
        :param kwargs Forwarded to EndoDataset
        """
        super().__init__(**kwargs)
        self.mask_path_suffix = mask_path_suffix
        self.image_mask_aug = image_mask_aug
        # NOTE(review): mask_np_loader takes a single argument while __getitem__
        # passes two -- the '.npy' path looks unused, confirm before relying on it
        self.mask_loader = mask_np_loader if self.image_ext == '.npy' else mask_loader
        self.mask_transform = transforms.ToTensor() if mask_transform is None else transforms.Compose(mask_transform)

    def make_mask_path(self, data_root, rel_folder, image_name, side=None):
        """Combines the relative path with the data root to get the complete path of the
        annotation file. There is one file per image folder, so ``image_name`` does not
        influence the result.
        """
        path = os.path.join(data_root, rel_folder,
                            "image_0{}".format(self.side_map[side]), "annotations", "instances_default.json")
        return path

    def preprocess_image_mask(self, image, mask):
        """Augments image and annotations and converts them for the model.

        :param image Loaded image
        :param mask List of {'bboxes': [x, y, w, h], 'labels': category id} dicts
        :return (image tensor, target). With ``image_mask_aug`` set the target is
                {'boxes': float32 tensor (N, 4) as [x1, y1, x2, y2], 'labels': int64 tensor (N,)},
                also for N == 0. Without it the annotation list is returned unchanged.
        """
        if self.image_aug:
            image = self.image_aug(image=np.asarray(image))["image"]
        if self.image_mask_aug:
            bboxes = [ann['bboxes'] for ann in mask]
            labels = [ann['labels'] for ann in mask]

            # alb needs np input
            augmented = self.image_mask_aug(image=np.asarray(image), bboxes=bboxes, class_labels=labels)
            image = augmented["image"]
            # [x, y, w, h] -> [x1, y1, x2, y2]
            boxes = [[box[0], box[1], box[0] + box[2], box[1] + box[3]] for box in augmented['bboxes']]

            # Explicit dtypes and reshape(-1, 4) give images without any box an
            # empty target of matching size and the same dtypes as every other
            # image (instead of float64 zeros)
            mask = {'boxes': torch.as_tensor(boxes, dtype=torch.float32).reshape(-1, 4),
                    'labels': torch.as_tensor(augmented['class_labels'], dtype=torch.int64)}

        image = func.to_tensor(np.array(image))

        return image, mask

    def __getitem__(self, index):
        """Loads image and annotations for one split-file entry.
        :return image tensor, target (see preprocess_image_mask), split-file entry
        """
        args = self.get_split_filename(self.filenames[index])
        image_path = self.make_image_path(self.data_root_folder, *args)
        image = self.image_loader(image_path)
        mask_path = self.make_mask_path(self.data_root_folder, *args)
        # args[1] is the image name used to find the annotations inside the file
        mask = self.mask_loader(mask_path, args[1])
        image, mask = self.preprocess_image_mask(image=image, mask=mask)
        return image, mask, self.filenames[index]

### Dataset Stereovision

In [None]:
class StereoEndoDataset(EndoDataset):
    """ Stereo variant of EndoDataset: loads the left and the right image of a
    frame and stacks them along the channel axis.
    :returns Tuple of the stacked image tensor and the split-file entry
    """

    def get_split_filename(self, filename):
        """ Splits a filename string comprising of "surgery_number <space> image_name <space> side"
        :param filename A string comprising of "relative_path <space> image_name"
        :return (surgery_number, image_name) for entries with two parts, otherwise
                the list of whitespace separated parts
        """
        split_filename = filename.split()
        if len(split_filename) == 2:
            return split_filename[0], split_filename[1]
        return split_filename

    def preprocess(self, image_left, image_right):
        """Resizes and optionally augments both images, then concatenates them along
        the channel axis and converts the result to a tensor.
        The image augmentation is called once per image.
        """
        image_left = self.resize(image_left)
        image_right = self.resize(image_right)
        if self.image_aug:
            image_left = self.image_aug(image=np.asarray(image_left))["image"]  # alb needs np input
        if self.image_aug:
            image_right = self.image_aug(image=np.asarray(image_right))["image"]  # alb needs np input
        image = np.concatenate((np.array(image_left), np.array(image_right)), axis=2)
        return func.to_tensor(image)

    def __getitem__(self, index):
        """Returns the image with pre-proc transforms + aug applied to it"""
        args = self.get_split_filename(self.filenames[index])
        # Only folder and image name are taken from the entry. The side is set
        # explicitly for each camera, so a side stored in the entry (third
        # part) is overridden and entries without a side work as well.
        rel_folder, image_name = args[0], args[1]
        image_left = self.image_loader(self.make_image_path(self.data_root_folder, rel_folder, image_name, "l"))
        image_right = self.image_loader(self.make_image_path(self.data_root_folder, rel_folder, image_name, "r"))
        image = self.preprocess(image_left, image_right)
        return image, self.filenames[index]

### Dataset Stereovision (with BB)

In [None]:
class StereoEndoMaskDataset(EndoMaskDataset):
    """ Stereo variant of EndoMaskDataset.
    For every split-file entry the image of the given side (A) and of the other
    side (B) are loaded and stacked along the channel axis. The returned target
    holds the boxes and labels of image A only.
    :returns Stacked image tensor, target, split-file entry
    """

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        # Opposite camera for every side notation accepted by side_map
        self.other_cam = {"l": "r", "r": "l", "2": "3", "3": "2"}

    def get_filenumber(self, filename):
        """Returns the third '_'-separated part of the filename.
        NOTE(review): presumably names look like <surgery>_<0side>_<number>, see
        make_filename -- a surgery name containing '_' would break this.
        """
        filenumber = filename.split('_')[2]
        return filenumber

    def make_filename(self, surgery, filenumber, side):
        """Builds "<surgery>_0<2|3>_<filenumber>"; side may be a side_map key or already 2/3"""
        return f"{surgery}_0{self.side_map[str(side)]}_{filenumber}"

    def make_image_path(self, data_root, rel_folder, image_number, side=None):
        """Combines the relative path with the data root to get the complete path of image
        """
        side = self.side_map[side]
        path = (os.path.join(data_root, rel_folder,
                            f"image_0{side}", "images", self.make_filename(rel_folder, image_number, side) + self.image_ext))
        return path

    @staticmethod
    def _coco_to_corner_boxes(bboxes):
        """Converts [x, y, w, h] boxes into [x1, y1, x2, y2] boxes"""
        return [[box[0], box[1], box[0] + box[2], box[1] + box[3]] for box in bboxes]

    def _augment_with_seed(self, image, annotations, seed):
        """Runs image_mask_aug on one image after seeding python's and numpy's global RNG"""
        random.seed(seed)
        np.random.seed(seed)
        return self.image_mask_aug(image=np.asarray(image),
                                   bboxes=[ann['bboxes'] for ann in annotations],
                                   class_labels=[ann['labels'] for ann in annotations])

    def preprocess_image_mask(self, image_A, image_B, mask_A, mask_B):
        """Augments both images and converts the annotations of image A for the model.

        :param image_A Image of the side named in the split entry
        :param image_B Image of the other side
        :param mask_A List of {'bboxes': [x, y, w, h], 'labels': category id} for image A
        :param mask_B Same for image B; only needed to run the augmentation on image B
        :return (image tensor with the channels of A followed by those of B,
                 {'boxes': float32 tensor (N, 4) as [x1, y1, x2, y2], 'labels': int64 tensor (N,)})
        """
        if self.image_aug:
            image_A = self.image_aug(image=np.asarray(image_A))["image"]
        if self.image_aug:
            image_B = self.image_aug(image=np.asarray(image_B))["image"]

        if self.image_mask_aug:
            # The same seed is set before augmenting image A and image B to get
            # the same augmentation parameters for both. Python's and numpy's
            # generators are both seeded since it depends on the transform
            # which of them is drawn from.
            # NOTE(review): albumentations releases with an internal random
            # generator may ignore the global seeds -- confirm for the installed version.
            seed = random.randint(0, 2 ** 32 - 1)
            augmented_A = self._augment_with_seed(image_A, mask_A, seed)
            augmented_B = self._augment_with_seed(image_B, mask_B, seed)

            image_A = augmented_A["image"]
            image_B = augmented_B["image"]
            boxes_A = self._coco_to_corner_boxes(augmented_A['bboxes'])
            labels_A = augmented_A['class_labels']
        else:
            # Without a joint augmentation the annotations are converted as they are
            boxes_A = self._coco_to_corner_boxes(ann['bboxes'] for ann in mask_A)
            labels_A = [ann['labels'] for ann in mask_A]

        image = func.to_tensor(np.concatenate((np.array(image_A), np.array(image_B)), axis=2))
        # Explicit dtypes and reshape(-1, 4) also cover images without any box
        mask = {'boxes': torch.as_tensor(boxes_A, dtype=torch.float32).reshape(-1, 4),
                'labels': torch.as_tensor(labels_A, dtype=torch.int64)}
        return image, mask

    def __getitem__(self, index):
        """Loads both images and their annotations for one split-file entry"""
        args = self.get_split_filename(self.filenames[index])

        surgery = args[0]
        filenumber = self.get_filenumber(args[1])
        side = args[2]
        other_side = self.other_cam[side]

        image_A = self.image_loader(self.make_image_path(self.data_root_folder, surgery, filenumber, side))
        mask_A = self.mask_loader(self.make_mask_path(self.data_root_folder, surgery, filenumber, side),
                                  self.make_filename(surgery, filenumber, side))
        image_B = self.image_loader(self.make_image_path(self.data_root_folder, surgery, filenumber, other_side))
        mask_B = self.mask_loader(self.make_mask_path(self.data_root_folder, surgery, filenumber, other_side),
                                  self.make_filename(surgery, filenumber, other_side))
        image, mask = self.preprocess_image_mask(image_A, image_B, mask_A, mask_B)
        return image, mask, self.filenames[index]

## Defining Parameters

In [None]:
# Plug these parameters as part of your code

# Height of the input image to the model (target of the Resize augmentation)
HEIGHT =  256
# Width of the input image to the model (target of the Resize augmentation)
WIDTH = 256 
# Probability with which some random augmentations are applied
AUG_PROB = 0.5 
# Batch size of the dataloaders, or whatever fits in memory
BATCH_SIZE = 8
# Number of dataloader worker processes, or 16 depending on what kind of multi-processing you want.
# If doubtful just use the defaults and leave this to pytorch
NUM_WORKERS = 8
# Number of classes (plus one for the background class)
NUM_CLASSES = 6
# Number of epochs to train the models
NUM_EPOCHS = 100
# Number of input channels for the ResNet50 backbone (Only needed for stereo model)
NUM_CHANNELS = 6 

# Change this only if you are trying with another image format like .npy that is faster to load
IMAGE_EXT = ".png"
# Path to the images, example: /mnt/sds-hd/sd22a004/guest/dataset/instrument_detection_dataset_raw/
DATAROOT = "/mnt/sds-hd/sd22a004/guest/data_preprocessed/data_coco/"
# Base path to the location where model weights can be stored
BACKUP_PATH = "/mnt/sds-hd/sd22a004/guest/object_detection/model_backups"
# Base path to the location where predictions are stored
PREDICTION_BASE_PATH = '/mnt/sds-hd/sd22a004/guest/object_detection/predictions/'
# Base path to the location where split definitions are stored
SPLIT_BASE_PATH = "/mnt/sds-hd/sd22a004/guest/data_preprocessed/splits/data_splitted/test_cv_split_lr/"

# Depending on the availability of gpu this can be changed accordingly
# (the first GPU is used if there is one, otherwise the CPU)
if torch.cuda.device_count() > 0:
    DEVICE = torch.device('cuda:0') # alternative, last GPU: f'cuda:{torch.cuda.device_count() - 1}'
else:
    DEVICE = torch.device('cpu')

## Defining Augmentation

Training augmentation:  
Images have to be the same size when being fed into the model, so a `Resize` is necessary.  
The provided dataset is not consistent in the lighting and sometimes the 3D-printed mount has a different color. Therefore we chose `ColorJitter`, `RandomBrightnessContrast` and `RGBShift` to adjust for these issues.  
Some typical transformations (flips, rotation etc.) are used to account for changes in the camera angles and positions.  

Validation augmentation:  
We do not want images to be changed for the validation. Resizing is still necessary, so the model can handle the input.

In [None]:
# Training augmentation: resize to the model input size, then random color and
# geometric changes. The bounding boxes are transformed together with the image;
# they are passed as 'bboxes' and their labels in the field 'class_labels'.
aug = alb.Compose([alb.Resize(height=HEIGHT, width=WIDTH),
                   alb.ColorJitter(brightness=0.2,
                                  contrast=(0.3, 1.5),
                                  saturation=(0.5, 2),
                                  hue=0.1,
                                  p=AUG_PROB),
                   alb.HorizontalFlip(p=0.5),
                   alb.VerticalFlip(p=0.5),
                   alb.ShiftScaleRotate(p=0.5),
                   alb.RandomBrightnessContrast(p=0.3, brightness_limit=0.1),
                   alb.RGBShift(r_shift_limit=30, g_shift_limit=30, b_shift_limit=30, p=0.3)
                  ],
                  bbox_params = alb.BboxParams(format = 'coco', label_fields = ["class_labels"])
                )

# Validation augmentation: resize only, with the same bounding box handling
val_aug = alb.Compose([alb.Resize(height=HEIGHT, width=WIDTH),
                      ],
                     bbox_params = alb.BboxParams(format = 'coco', label_fields = ["class_labels"])
                    )

## Functions for training process

### Creating Dataset/Dataloader
These are wrapper functions to ease the defining of different datasets and dataloaders later on.

In [None]:
def get_dataset(DataSet, filenames, aug):
    '''Instantiates the given dataset class with the notebook-wide parameters.

    :param DataSet Dataset class to instantiate
    :param filenames Split-file entries the dataset should serve
    :param aug Augmentation handed over as ``image_mask_aug``
    '''
    dataset_options = dict(data_root_folder=DATAROOT,
                           filenames=filenames,
                           height=HEIGHT,
                           width=WIDTH,
                           image_ext=IMAGE_EXT,
                           image_mask_aug=aug)
    return DataSet(**dataset_options)

def get_dataloader(dataset, shuffle=True):
    '''Wraps a dataset into a DataLoader using the notebook-wide batch size and worker
    count, the custom ``collate`` function and seeded workers. Incomplete last batches are kept.
    '''
    loader_options = {
        'batch_size': BATCH_SIZE,
        'shuffle': shuffle,
        'drop_last': False,
        'collate_fn': collate,
        'num_workers': NUM_WORKERS,
        'worker_init_fn': seed_worker,
    }
    return DataLoader(dataset, **loader_options)

### Creating training function

In [None]:
def train(model, dataloader, num_epochs):
    '''Trains the model with SGD and reduces the learning rate when the loss stops improving.

    After every epoch the model weights and the number of finished epochs are
    written to BACKUP_PATH. The file names are built from the globals
    MODEL_DEF and FOLD, which have to be set before calling.

    :param model Detection model returning a dict of losses in training mode
    :param dataloader Yields (images, targets, filenames) batches
    :param num_epochs Number of epochs to train
    :raises ValueError if the dataloader does not provide any batch
    '''
    params = [p for p in model.parameters() if p.requires_grad]
    optimizer = torch.optim.SGD(params, lr=0.005, momentum=0.9, weight_decay=0.0005)
    # NOTE(review): np.exp(-4) is about 0.0183, not 1e-4 -- confirm which threshold is intended
    scheduler = ReduceLROnPlateau(optimizer, 'min', factor = 0.1, patience = 1, threshold = 1*np.exp(-4))

    len_dataloader = len(dataloader)
    if len_dataloader == 0:
        raise ValueError("The dataloader does not provide any batches to train on")

    for epoch in range(num_epochs):
        print(f"Epoch: {epoch + 1}")
        model.train()
        epoch_loss = 0.0
        for train_i, (train_imgs, train_annotations, filenames) in enumerate(dataloader, start=1):
            train_imgs = [train_img.to(DEVICE) for train_img in train_imgs]
            train_annotations = [{train_k: train_v.to(DEVICE) for train_k, train_v in train_t.items()} for train_t in train_annotations]

            train_loss_dict = model(train_imgs, train_annotations)
            train_losses = sum(train_loss for train_loss in train_loss_dict.values())
            optimizer.zero_grad()
            train_losses.backward()
            optimizer.step()

            # Plain float: keeps no reference to the autograd graph
            batch_loss = train_losses.item()
            epoch_loss += batch_loss

            print(f'\rIteration: {train_i}/{len_dataloader}, Loss: {batch_loss}', end="")
        print("")

        # Scheduler: reduce LR on plateau. The mean loss of the epoch is used
        # because the loss of the last batch alone is noisy and, with a
        # patience of 1, lowers the learning rate after almost every epoch.
        mean_loss = epoch_loss / len_dataloader
        lr = optimizer.param_groups[0]['lr']
        scheduler.step(mean_loss)

        # Save the model state and the number of the last trained epoch
        torch.save(model.state_dict(), os.path.join(BACKUP_PATH, f'pretrained_{MODEL_DEF}_{FOLD}.pth'))
        with open(os.path.join(BACKUP_PATH, '_trained_epochs.txt'), 'w') as file:
            file.write(f'Trained model: {MODEL_DEF}_{FOLD}\nFinished epochs: {epoch + 1}')

        # Display loss statistics (lr is the rate that was used during this epoch)
        print("Epoch {}/{}, Train-Loss: {:.3f}, LR: {:.10f}".format(epoch+1, num_epochs, mean_loss, lr))

### Creating validation function

In [None]:
def evaluate(model, dataloader):
    '''Runs the model on all batches of the dataloader and stores the predictions.

    Non-Maximum-Suppression is applied separately for every predicted class.
    File names, ground truth and reduced predictions are written to
    PREDICTION_BASE_PATH. The file name is built from the globals MODEL_DEF and FOLD.

    :param model Detection model returning one prediction dict per image in eval mode
    :param dataloader Yields (images, targets, filenames) batches
    '''
    len_val_dataloader = len(dataloader)
    model.eval()

    val_filenames = []
    val_gt = []
    val_predictions = []
    result_keys = ('boxes', 'labels', 'scores')

    # Iterate the loader that was passed in, not a global one
    for val_i, (val_imgs, val_annotations, filenames) in enumerate(dataloader, start=1):
        with torch.no_grad():
            val_imgs = [val_img.to(DEVICE) for val_img in val_imgs]
            val_annotations = [{val_k: val_v.to(DEVICE) for val_k, val_v in val_t.items()} for val_t in val_annotations]
            val_pred = model(val_imgs, val_annotations)

            # Non-Maximum-Suppression
            val_pred_reduced = []
            for pred in val_pred:
                # TODO: smaller threshold
                per_class = [apply_nms(class_predictions, 0.1) for class_predictions in split_classes(pred)]
                if per_class:
                    # Concatenating only the model outputs keeps their dtypes
                    # (integer labels stay integers)
                    nms_preds = {key: torch.cat([part[key] for part in per_class], 0) for key in result_keys}
                else:
                    # Nothing was predicted: the (empty) model output is used as it is
                    nms_preds = {key: pred[key] for key in result_keys}

                val_pred_reduced.append(nms_preds)

            val_filenames.extend(filenames)
            val_gt.extend(val_annotations)
            val_predictions.extend(val_pred_reduced)

            print(f'\rIteration: {val_i}/{len_val_dataloader}', end="")
    print("")

    path = os.path.join(PREDICTION_BASE_PATH, f"predictions_{MODEL_DEF}_{FOLD}.json")
    save_predictions_to_json(val_filenames, val_gt, val_predictions, path)

# Code for Stereovision

## Building the model

In [None]:
# The model needs the means and standard deviations for a normalization step. Usually they are predefined when using only 3 channels.
# For the stereo model they need to be specified for 6 channels. This function uses one batch to get an approximation of the values, which should be enough.
def get_means_and_std(dataloader):
    '''Estimates the mean and standard deviation of every image channel from the first batch.

    The statistics are computed per channel over all pixels of all images in
    the batch. (The standard deviation of the per-image standard deviations
    would only describe how much they differ between images.)

    :param dataloader Yields (images, targets, filenames) batches; the images are
                      channel-first tensors with the same number of channels
    :return (means, stds) as numpy arrays with one float64 entry per channel
    '''
    imgs, _, _ = next(iter(dataloader))
    # One row per channel holding the pixels of all images of the batch
    pixels = torch.cat([img.reshape(img.shape[0], -1).double() for img in imgs], dim=1)
    means = pixels.mean(dim=1).cpu().numpy()
    stds = pixels.std(dim=1).cpu().numpy()
    return means, stds

In [None]:
def get_stereo_model(n_classes, n_channels=6, means=[0.485, 0.456, 0.406, 0.485, 0.456, 0.406], stds=[0.229, 0.224, 0.225, 0.229, 0.224, 0.225]):
    '''Builds a Faster R-CNN (ResNet50 FPN backbone) whose input layer accepts n_channels channels.

    :param n_classes Number of classes of the new box predictor (including background)
    :param n_channels Number of input channels, 6 for two stacked RGB images
    :param means Per-channel means for the normalization step, one value per input channel
    :param stds Per-channel standard deviations, one value per input channel
    :return The adapted model
    '''
    # image_mean and image_std need arrays in the length n_channels. For convenience this is
    # calculated beforehand (Could also use standard values and expand them)
    model = fasterrcnn_resnet50_fpn(pretrained=True, image_mean=means, image_std=stds)

    # Since a new input layer would have randomly initialized parameters, the pretrained ones are
    # reused and repeated along the channel axis until n_channels channels are covered.
    # For 6 channels this is the pretrained kernel twice in a row.
    pretrained_weight = model.backbone.body.conv1.weight.detach()
    pretrained_channels = pretrained_weight.shape[1]
    repeats = -(-n_channels // pretrained_channels)  # ceiling division
    params_combined = pretrained_weight.repeat(1, repeats, 1, 1)[:, :n_channels].contiguous()

    # number of channels has to be adjusted for the input layer
    model.backbone.body.conv1 = nn.Conv2d(n_channels, 64, kernel_size=7, stride=2, padding=3, bias=False)
    # Replace the randomly initialized weights (the input layer is kept frozen)
    model.backbone.body.conv1.weight = nn.Parameter(params_combined, requires_grad=False)

    # get number of input features for the classifier
    in_features = model.roi_heads.box_predictor.cls_score.in_features

    # replace the pre-trained head with a new one
    model.roi_heads.box_predictor = FastRCNNPredictor(in_features, n_classes)

    return model

## Iterate Folds

In [None]:
# Each model should have its own definition if you wish to store them for later evaluation. Each fold gets a corresponding suffix so one model is saved for each fold.
# MODEL_DEF and FOLD are read as globals by train() and evaluate() to build the file names.
MODEL_DEF = 'stereo_100Epoch'

# Folds 1 to 4
for n_fold in range(1, 5):
    FOLD = f'fold_{n_fold}'
    # Template for the split files of this fold, filled with "train" / "val" below
    split_file_path = os.path.join(SPLIT_BASE_PATH, FOLD, "{}_files.txt")

    train_filenames = read_lines_from_text_file(split_file_path.format("train"))
    train_dataloader = get_dataloader(get_dataset(StereoEndoMaskDataset, train_filenames, aug))

    # Validation data: resize-only augmentation and a fixed order
    val_filenames = read_lines_from_text_file(split_file_path.format("val"))
    val_dataloader = get_dataloader(get_dataset(StereoEndoMaskDataset, val_filenames, val_aug), shuffle=False)
    
    # Normalization statistics for all input channels, estimated from one training batch
    means, stds = get_means_and_std(train_dataloader)
    model = get_stereo_model(NUM_CLASSES, NUM_CHANNELS, means, stds)
    model.to(DEVICE)
    
    train(model, train_dataloader, NUM_EPOCHS)
    evaluate(model, val_dataloader)

Epoch: 1
Iteration: 267/267, Loss: 0.220802431350540376
Epoch 1/100, Train-Loss: 0.221, LR: 0.0050000000
Epoch: 2
Iteration: 267/267, Loss: 0.014507347717881203
Epoch 2/100, Train-Loss: 0.015, LR: 0.0050000000
Epoch: 3
Iteration: 267/267, Loss: 0.090522070312682586
Epoch 3/100, Train-Loss: 0.091, LR: 0.0050000000
Epoch: 4
Iteration: 267/267, Loss: 0.192345406811887755
Epoch 4/100, Train-Loss: 0.192, LR: 0.0050000000
Epoch: 5
Iteration: 267/267, Loss: 0.155872911650890434
Epoch 5/100, Train-Loss: 0.156, LR: 0.0005000000
Epoch: 6
Iteration: 267/267, Loss: 0.144506101995987936
Epoch 6/100, Train-Loss: 0.145, LR: 0.0005000000
Epoch: 7
Iteration: 267/267, Loss: 0.078974601845248614
Epoch 7/100, Train-Loss: 0.079, LR: 0.0000500000
Epoch: 8
Iteration: 267/267, Loss: 0.182767569886499654
Epoch 8/100, Train-Loss: 0.183, LR: 0.0000500000
Epoch: 9
Iteration: 267/267, Loss: 0.069765631104917072
Epoch 9/100, Train-Loss: 0.070, LR: 0.0000050000
Epoch: 10
Iteration: 267/267, Loss: 0.12420806292183921

# Code for Monovision

## Building the model

In [None]:
def get_model_object_annotation(num_classes):
    '''Returns a Faster R-CNN (ResNet50-FPN) object detector pre-trained on COCO with a new box predictor for num_classes classes'''
    detector = fasterrcnn_resnet50_fpn(pretrained=True)

    # The replacement head has to accept the same number of input features
    # as the pre-trained classification layer it takes the place of.
    heads = detector.roi_heads
    heads.box_predictor = FastRCNNPredictor(
        heads.box_predictor.cls_score.in_features,
        num_classes,
    )

    return detector

## Iterate Folds

In [None]:
# Each model should have its own definition if you wish to store them for later evaluation. Each fold gets a corresponding suffix so one model is saved for each fold.
# NOTE(review): MODEL_DEF and FOLD are not passed to train/evaluate explicitly; presumably those functions read them as globals (predict() does) - confirm.
MODEL_DEF = 'mono_100Epochs'

# Train and evaluate one mono model for each of the folds 1 to 4.
for n_fold in range(1, 5):
    FOLD = f'fold_{n_fold}'
    # Path template: the "{}" placeholder is filled with "train" or "val" below.
    split_file_path = os.path.join(SPLIT_BASE_PATH, FOLD, "{}_files.txt")

    # Training data of this fold, built with the training augmentation `aug`.
    train_filenames = read_lines_from_text_file(split_file_path.format("train"))
    train_dataloader = get_dataloader(get_dataset(EndoMaskDataset, train_filenames, aug))

    # Validation data is loaded without shuffling, exactly like in the stereo fold loop
    # and for the test set, so the stored predictions keep the order of the split file.
    val_filenames = read_lines_from_text_file(split_file_path.format("val"))
    val_dataloader = get_dataloader(get_dataset(EndoMaskDataset, val_filenames, val_aug), shuffle=False)

    model = get_model_object_annotation(NUM_CLASSES)
    model.to(DEVICE)

    train(model, train_dataloader, NUM_EPOCHS)
    evaluate(model, val_dataloader)

## Plot augmented images

In [None]:
# Fetch the first batch of the training loader to visually check the augmentations.
train_imgs, train_annotations, filenames = next(iter(train_dataloader))

# Size the grid from the batch that was actually returned instead of BATCH_SIZE:
# a batch shorter than BATCH_SIZE would otherwise raise an IndexError and a batch
# with more than 16 images would not fit into the fixed 8x2 grid.
# For up to 16 images the layout is the same 8x2 grid as before.
n_cols = 2
n_rows = max(8, (len(train_imgs) + n_cols - 1) // n_cols)

fig = plt.figure(figsize=(10,25))
for i, (img, filename) in enumerate(zip(train_imgs, filenames), start=1):
    plt.subplot(n_rows, n_cols, i)
    plt.axis("off")
    plt.title(filename)
    # Move the first axis to the end before plotting
    # (presumably channels-first -> channels-last as expected by imshow - confirm).
    plt.imshow(np.transpose(img, (1, 2, 0)))
plt.show()
plt.close(fig)

# Code for predicting on the test set

### Defining Parameters

In [None]:
# Text file that is read with read_lines_from_text_file to get the test filenames.
TEST_SPLIT_PATH = "/mnt/sds-hd/sd22a004/guest/data_preprocessed/splits/data_splitted_testset/test_cv_split_lr/test_file.txt"
# Folder that is passed to the test dataset as data_root_folder.
TEST_DATAROOT = "/mnt/sds-hd/sd22a004/guest/data_preprocessed/data_testset/"
# Stored state dict that is loaded into the mono model before predicting.
MODEL_PATH = "/mnt/sds-hd/sd22a004/guest/trained_models/pretrained_mono_100epochs_fold_1.pth"
# MODEL_DEF and FOLD are inserted into the name of the prediction file written by predict().
MODEL_DEF = "mono_100epochs"
FOLD = "test"

### Creating prediction function 

In [None]:
def predict(model, dataloader):
    '''Runs the model on every batch of a dataloader, applies NMS per class and saves the predictions to a json file'''
    num_batches = len(dataloader)
    model.eval()

    all_filenames = []
    all_predictions = []
    result_keys = ('boxes', 'labels', 'scores')

    for batch_idx, (batch_imgs, filenames) in enumerate(dataloader, start=1):
        with torch.no_grad():
            batch_imgs = [img.to(DEVICE) for img in batch_imgs]
            raw_preds = model(batch_imgs)

            # Non-Maximum-Suppression: every class of an image is filtered on its own
            # (0.1 is the threshold handed to apply_nms) and the survivors are merged
            # back into a single prediction dict per image.
            for raw_pred in raw_preds:
                merged = {key: torch.Tensor([]).to(DEVICE) for key in result_keys}
                for per_class in split_classes(raw_pred):
                    kept = apply_nms(per_class, 0.1)
                    for key in result_keys:
                        merged[key] = torch.cat((merged[key], kept[key]), 0)

                all_predictions.append(merged)

            all_filenames.extend(filenames)

            # '\r' rewrites the same console line so only the latest progress is visible
            print(f'\rIteration: {batch_idx}/{num_batches}', end="")
    print("")

    # The empty list fills the second argument of save_predictions_to_json
    # (presumably the ground truth, which is not available here - confirm).
    path = os.path.join(PREDICTION_BASE_PATH, f"predictions{MODEL_DEF}_{FOLD}.json")
    save_predictions_to_json(all_filenames, [], all_predictions, path)

### Building/Loading the model

In [None]:
# Rebuild the mono architecture and restore the weights stored at MODEL_PATH.
model = get_model_object_annotation(NUM_CLASSES)
# map_location loads the stored tensors onto the CPU; the model is moved to DEVICE afterwards.
model.load_state_dict(torch.load(MODEL_PATH, map_location=torch.device('cpu')))
model.to(DEVICE)

In [None]:
# Read the test filenames from the split file.
test_filenames = read_lines_from_text_file(TEST_SPLIT_PATH)

# The only transformation applied to the test images is a resize to HEIGHT x WIDTH.
test_dataset = EndoDataset(data_root_folder=TEST_DATAROOT,
                      filenames=test_filenames,
                      height=HEIGHT,
                      width=WIDTH,
                      image_ext=IMAGE_EXT,
                      image_aug=alb.Compose([alb.Resize(height=HEIGHT, width=WIDTH)])
                     )

# No shuffling for the test set.
test_dataloader = get_dataloader(test_dataset, shuffle=False)

In [None]:
# Run the loaded model on the test set; predict() hands the results to save_predictions_to_json.
predict(model, test_dataloader)

# Additional Content for future implementations

In [None]:
#!pip install torchmetrics

In [None]:
# Can be used in the validation run
from torchmetrics.detection.mean_ap import MeanAveragePrecision

# class_metrics=True adds the per-class entries (map_per_class, mar_100_per_class) to the result.
metric = MeanAveragePrecision(class_metrics=True)
metric.to(DEVICE)
# NOTE(review): val_predictions and val_gt are not defined in this cell; presumably they are the
# prediction and ground truth lists collected during the validation run - confirm before use.
metric.update(val_predictions, val_gt)

pprint(metric.compute())

{'map': tensor(0.1248),
 'map_50': tensor(0.3331),
 'map_75': tensor(0.0629),
 'map_large': tensor(0.1676),
 'map_medium': tensor(0.0795),
 'map_per_class': tensor([ 0.2363, -1.0000,  0.0134, -1.0000]),
 'map_small': tensor(0.0032),
 'mar_1': tensor(0.2499),
 'mar_10': tensor(0.3000),
 'mar_100': tensor(0.3000),
 'mar_100_per_class': tensor([ 0.4028, -1.0000,  0.1972, -1.0000]),
 'mar_large': tensor(0.3633),
 'mar_medium': tensor(0.2367),
 'mar_small': tensor(0.0500)}
