In [1]:
import torch
import pandas as pd
import os
import numpy as np
from PIL import Image
from torch.utils.data import Dataset
from torchvision.transforms import transforms
from pathlib import Path


class MimicCxrReportsEpisodes(Dataset):
    """
    MIMIC-CXR Reports Only
    Todo: Insert references to the database here!
    Removes '_' from reports
    Truncates the reports to 512 tokens by removing the beginning of the report (Usually where the 'wet read' resides)
    """

    def __init__(self, root_text, csv_path, tokenizer, n_way, k_shot, k_query, num_episodes, mode, max_length=512):

        # Check if mode contains an accepted value
        if mode not in ('base', 'novel'):
            raise Exception("Selected 'mode' is not valid")

        # Initialise variables
        self.root_text = root_text
        self.max_length = max_length
        self.tokenizer = tokenizer
        self.n_way = n_way
        self.k_shot = k_shot
        self.k_query = k_query
        self.num_episodes = num_episodes
        
        # Load data
        csv_data = pd.read_csv(csv_path)

        if mode == 'base':
            self.dict_labels = {
                'Atelectasis': 0,
                'Cardiomegaly': 1,
                'Consolidation': 2,
                'Edema': 3,
                'No Finding': 4,
                'Pneumonia': 5,
            }
            # Filters for novel classes
            data = csv_data[(csv_data.split == "base_train") | (csv_data.split == "base_validate")]

        else:
            self.dict_labels = {
                'Enlarged Cardiomediastinum': 0,
                'Fracture': 1,
                'Lung Lesion': 2,
                'Lung Opacity': 3,
                'Pleural Effusion': 4,
                'Pneumothorax': 5
            }
            data = csv_data[csv_data.split == "novel"]  # Filters for novel classes
            
        # Converts classes to numeric values
        self.data = data.assign(labels=data["labels"].apply(lambda x: self.dict_labels[x]))
            
        # Create Episodes
        self.support_episodes = []  # List of training episodes (support set)
        self.query_episodes = []  # List of testing episodes (query set)
        for i in range(self.num_episodes):  # for each batch
            # 1.select n_way classes randomly
            selected_cls = np.random.choice(len(self.dict_labels), self.n_way, False)  # no duplicate
            np.random.shuffle(selected_cls)
            df_support = pd.DataFrame()
            df_query = pd.DataFrame()
            for cls in selected_cls:
                df_cls = self.data[self.data.labels == cls]
                # 2. select k_shot + k_query for each class
                selected_idx = np.random.choice(len(df_cls), self.k_shot + self.k_query, False)
                np.random.shuffle(selected_idx)

                # Index of samples for the support and query set
                support_idx = selected_idx[:self.k_shot]
                query_idx = selected_idx[self.k_shot:]

                df_support = df_support.append(df_cls.iloc[support_idx])
                df_query = df_query.append(df_cls.iloc[query_idx])

            # Shuffle the indexes so that it is no longer ordered by class
            df_support = df_support.sample(frac=1)
            df_query = df_query.sample(frac=1)

            self.support_episodes.append(df_support)
            self.query_episodes.append(df_query)
            
    def __len__(self):
        return self.num_episodes

    def __getitem__(self, idx):
        # Get a single episode
        support_set = self.support_episodes[idx]
        query_set = self.query_episodes[idx]
        
        # Labels ranging from 0 to (number of classes -1)
        support_labels = support_set.labels.tolist()
        query_labels = query_set.labels.tolist()

        # Convert labels to range from 0 to (n way-1) for loss calculation
        unique_labels = np.unique(support_labels)  # Unique labels are the same for support and query set
        converted_support_labels = support_labels
        converted_query_labels = query_labels
        for idx, val in enumerate(unique_labels):
            # Get indexes of labels that are equal to the iterated val
            idx_support = [x for x, label in enumerate(support_labels) if label == val]
            idx_query = [x for x, label in enumerate(query_labels) if label == val]

            # Replace old labels with new labels
            for idx_change in range(len(idx_support)):
                converted_support_labels[idx_support[idx_change]] = idx

            for idx_change in range(len(idx_query)):
                converted_query_labels[idx_query[idx_change]] = idx
        
        # Get the support set of texts and masks as tensors
        support_texts = torch.Tensor()
        support_masks = torch.Tensor()
        for i in range(len(support_set)):
            # Extract CSV data
            file_path = support_set.iloc[idx, 0]

            # Get text tensor and attention mask
            text_name = f'{file_path.split("/")[2]}.txt'  # Extract the study id to find the report
            text_path = Path(os.path.join(self.root_text, text_name))
            plain_text = text_path.read_text()
            plain_text = plain_text.replace('_', '')  # Remove all underscores from the text
            encoded_text = self.tokenizer.encode(plain_text, add_special_tokens=True)
            len_encoding = len(encoded_text)
            
            # Transform encodings to be of the same size
            if len_encoding > self.max_length:
                # Truncate to max length
                cutoff = len_encoding - self.max_length + 1  # The cutoff for the tokens to be deleted
                del encoded_text[1:cutoff]
                attention = [1] * self.max_length
            elif len_encoding < self.max_length:
                # Pad to max length
                num_padding = self.max_length - len_encoding
                encoded_text.extend([0] * num_padding)  # Padding token is 0
                attention = [1] * len_encoding
                attention.extend([0] * (self.max_length - len_encoding))
            else:
                # If equal size, create attention matrix
                attention = [1] * self.max_length
                
            # Append texts and attention masks to the tensor to be outputted
            torch.cat((support_texts, torch.tensor(encoded_text)))
            torch.cat((support_masks, torch.tensor(attention)))
        
        # Get the query set of texts and masks as tensors
        query_texts = torch.Tensor()
        query_masks = torch.Tensor()
        for i in range(len(query_set)):
            # Extract CSV data
            file_path = query_set.iloc[idx, 0]

            # Get text tensor and attention mask
            text_name = f'{file_path.split("/")[2]}.txt'  # Extract the study id to find the report
            text_path = Path(os.path.join(self.root_text, text_name))
            plain_text = text_path.read_text()
            plain_text = plain_text.replace('_', '')  # Remove all underscores from the text
            encoded_text = self.tokenizer.encode(plain_text, add_special_tokens=True)
            len_encoding = len(encoded_text)
            
            # Transform encodings to be of the same size
            if len_encoding > self.max_length:
                # Truncate to max length
                cutoff = len_encoding - self.max_length + 1  # The cutoff for the tokens to be deleted
                del encoded_text[1:cutoff]
                attention = [1] * self.max_length
            elif len_encoding < self.max_length:
                # Pad to max length
                num_padding = self.max_length - len_encoding
                encoded_text.extend([0] * num_padding)  # Padding token is 0
                attention = [1] * len_encoding
                attention.extend([0] * (self.max_length - len_encoding))
            else:
                # If equal size, create attention matrix
                attention = [1] * self.max_length
                
            # Append texts and attention masks to the tensor to be outputted
            torch.cat((query_texts, torch.Tensor(encoded_text)))
            torch.cat((query_masks, torch.tensor(attention)))

        return support_texts, support_masks, torch.LongTensor(support_labels), \
                    query_texts, query_masks, torch.LongTensor(query_labels)
        

In [2]:
import torch
import pandas as pd
import os
from torch.utils.data import DataLoader
import torch.optim as optim
from biobertology import get_tokenizer
import sys

sys.path.append('..')
from shared.models import *
# from shared.datasets import *
from shared.metrics import *


def train(text_inputs, attention_inputs, labels, model, criterion, device, optimizer, freeze=False):
    # Training loop
    model.train()

    # Freeze all layers except those indicated
    if freeze:
        for name, param in model.named_parameters():
            if name not in freeze:
                param.requires_grad = False

    # Train the entire support set in one batch
    optimizer.zero_grad()
    pred = model(text_inputs, attention_inputs)
    loss = criterion(pred, labels)
    loss.backward()
    optimizer.step()
    train_loss = loss.item()  # Running training loss

    return train_loss


def test(text_inputs, attention_inputs, labels, model, criterion, device, n_way):
    # An F1 Score of 0 indicates that it is invalid
    model.eval()
    true_positive = list(0. for i in range(n_way))  # Number of correctly predicted samples per class
    total_truth = list(0. for i in range(n_way))  # Number of ground truths per class
    predicted_positive = list(0. for i in range(n_way))  # Number of predicted samples per class
    correct_total = 0  # Total correctly predicted samples
    total = 0  # Total samples
    with torch.no_grad():
        # Test the entire query set in one batch
        pred = model(text_inputs, attention_inputs)
        loss = criterion(pred, labels)
        val_loss = loss.item()  # Running validation loss
        _, predicted = torch.max(pred, 1)
        correct = (predicted == labels).squeeze()  # Samples that are correctly predicted
        correct_total += (predicted == labels).sum().item()
        total += labels.size(0)

        for i in range(len(predicted)):
            label = labels[i]
            true_positive[label] += correct[i].item()
            total_truth[label] += 1
            predicted_positive[predicted[i].item()] += 1  # True Positive + False Positive

    accuracy, macro_accuracy, f1_score, class_f1 = metrics(true_positive, total_truth,
                                                           predicted_positive, correct_total, total)

    return val_loss, accuracy, macro_accuracy, f1_score, class_f1


def main():
    # Set Training Parameters
    n_way = 3
    k_shot = 20
    k_query = 16
    num_episodes = 20
    num_epochs = 20
    num_workers = 12
    bs = 4
    lr = 1e-4
    root_text = '../../../../scratch/rl80/mimic-cxr-2.0.0.physionet.org'
    path_biobert = '../results'
    path_splits = '../splits/splits.csv'  # Location of preprocessed splits
    path_results = '../../results'  # Folder to save the CSV results
    freeze = ['linear.weight', 'linear.bias']  # Freeze all layers except linear layers

    torch.cuda.set_device(0)
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    
    # Training tools
    criterion = nn.CrossEntropyLoss()
    tokenizer = get_tokenizer()

    # Load in data
    dataset = MimicCxrReportsEpisodes(root_text, path_splits, tokenizer, n_way, k_shot, k_query, num_episodes, 'novel')
    loader = DataLoader(dataset, batch_size=bs, shuffle=True, num_workers=num_workers)

    # Create Dataframe to export results to CSV
    df_results = pd.DataFrame(columns=['Epoch', 'Training Loss', 'Validation Loss', 'Accuracy', 'Macro Accuracy',
                                       'Macro-F1 Score'] + [str(x) + ' F1' for x in range(n_way)])

    # Iterate through batched episodes. One episode is one experiment
    for step, (support_imgs, support_masks, support_labels, query_imgs, query_masks, query_labels) in enumerate(loader):
        # Convert Tensors to appropriate device
        batch_support_x, batch_support_masks, batch_support_y, batch_query_x, batch_query_masks, batch_query_y = \
            support_imgs.to(device), support_masks.to(device), support_labels.to(device), \
            query_imgs.to(device), query_masks.to(device), query_labels.to(device)

        # [num_batch, training_sz, channels, height, width] = support_x.size()
        # num_batch = num of episodes
        # training_sz = size of support or query set
        num_batch = batch_support_x.size(0) # Number of episodes in the batch

        # Break down the batch of episodes into single episodes
        for i in range(num_batch):
            # Load in model and reset weights every episode/experiment
            model = SemanticNet(n_way, path_biobert).to(device)

            # Reset optimizer with model parameters
            optimizer = optim.Adam(model.parameters(), lr=lr)

            # Break down the sets into individual episodes
            support_x, support_y = batch_support_x[i], batch_support_y[i]
            query_x, query_y = batch_query_x[i], batch_query_y[i]

            # Variables for best epoch per experiment
            best_score = 0
            best_epoch = 0
            df_best = pd.DataFrame(columns=['Epoch', 'Training Loss', 'Validation Loss', 'Accuracy', 'Macro Accuracy',
                                       'Macro-F1 Score'] + [str(x) + ' F1' for x in range(n_way)]) # Track best epoch
            # Training and testing for specified epochs
            for epoch in range(num_epochs):
                # Training
                train_loss = train(support_x, support_y, model, criterion, device, optimizer, freeze=freeze)

                # Testing
                val_loss, acc, m_acc, macro_f1, class_f1 = test(query_x, query_y, model, criterion, device, n_way)

                # Find best epoch
                score = 0.5*acc + 0.5*macro_f1
                if score > best_score:
                    best_score = score
                    df_best.loc[0] = [epoch + 1, train_loss, val_loss, acc, m_acc, macro_f1] + class_f1

            # Print the best results per experiment
            print(
                f'[{int(df_best.iloc[0,0])}] t_loss: {df_best.iloc[0,1]} v_loss: {df_best.iloc[0,2]} '
                f'val_acc: {df_best.iloc[0,3]} f1: {df_best.iloc[0,5]}')

            # Record the best epoch to be saved into a CSV
            df_results = df_results.append(df_best.loc[0], ignore_index=True)

    # Create results folder if it does not exist
    if not os.path.exists(path_results):
        os.makedirs(path_results)

    # Export results to a CSV file
    df_results.to_csv(os.path.join(path_results, f'{k_shot}shot_semantic.csv'), index=False)


if __name__ == '__main__':
    main()


TypeError: Caught TypeError in DataLoader worker process 0.
Original Traceback (most recent call last):
  File "/home/ilu3/destinationPath/lib/python3.6/site-packages/torch/utils/data/_utils/worker.py", line 185, in _worker_loop
    data = fetcher.fetch(index)
  File "/home/ilu3/destinationPath/lib/python3.6/site-packages/torch/utils/data/_utils/fetch.py", line 44, in fetch
    data = [self.dataset[idx] for idx in possibly_batched_index]
  File "/home/ilu3/destinationPath/lib/python3.6/site-packages/torch/utils/data/_utils/fetch.py", line 44, in <listcomp>
    data = [self.dataset[idx] for idx in possibly_batched_index]
  File "<ipython-input-1-34be384eb770>", line 152, in __getitem__
    torch.cat((support_texts, encoded_text))
TypeError: expected Tensor as element 1 in argument 0, but got list


In [7]:
torch.tensor([0.1])

tensor([0.1000])

In [None]:
a, b, c, d, e, f, g = \
1, 2, 3, 4, 5, 6, 7

In [None]:
a

In [None]:
b

In [None]:
c

In [None]:
d

In [None]:
e

In [None]:
f

In [None]:
g