In [1]:
import torch
import pandas as pd
import os
from pathlib import Path
from PIL import Image
from torch.utils.data import Dataset, DataLoader
from torchvision.transforms import transforms
import torch.optim as optim
from biobertology import get_biobert, get_tokenizer
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification

import sys
sys.path.append('..')
from shared.models import *

In [2]:
class MimicCxrMulti(Dataset):
    """
    MIMIC-CXR-JPG Images and MIMIC-CXR Reports
    Todo: Insert references to the database here!
    Removes '_' from reports
    Truncates the reports to 512 tokens by removing the beginning of the report (Usually where the 'wet read' resides)
    """
    
    def __init__(self, root_image, root_text, csv_path, tokenizer, mode, resize=224, max_length=512):
        
        # Check if mode contains an accepted value
        if mode not in ('base_train', 'base_validate', 'novel_train', 'novel_validate'):
            raise Exception("Selected 'mode' is not valid")
        
        # Initialise variables
        self.root_text = root_text
        self.root_image = root_image
        self.resize = resize
        self.max_length = max_length
        self.transform = transforms.Compose([lambda x: Image.open(x).convert('L'), # Transforms for images
                                             transforms.Resize((self.resize, self.resize)),
                                             transforms.ToTensor()
                                             ])
        self.tokenizer = tokenizer
        
        # Load data
        csv_data = pd.read_csv(csv_path)
        self.data = csv_data[csv_data.split == mode]
        
        if mode == 'base_train' or mode == 'base_validate':
            self.dict_labels = {
                'Atelectasis': 0,
                'Cardiomegaly': 1,
                'Consolidation': 2,
                'Edema': 3,
                'Fracture': 4,
                'Lung Opacity': 5,
                'No Finding': 6,
                'Pneumonia': 7,
                'Pneumothorax': 8,
                'Support Devices': 9
            }
        else:
            self.dict_labels = {
                'Enlarged Cardiomediastinum': 0,
                'Lung Lesion': 1,
                'Pleural Effusion': 2,
            }
        
    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        # Extract CSV data
        file_path = self.data.iloc[idx, 0]
        label = self.data.iloc[idx, 1]
        
        # Get image tensor
        img_path = os.path.join(self.root_image, file_path) # Absolute file path to the JPG img
        img_tensor = self.transform(img_path)
        
        # Get text tensor and attention mask
        text_name = f'{file_path.split("/")[2]}.txt' # Extract the study id to find the report
        text_path = Path(os.path.join(self.root_text, text_name))
        plain_text = text_path.read_text()
        plain_text = plain_text.replace('_','') # Remove all underscores from the text
        encoded_text = self.tokenizer.encode(plain_text, add_special_tokens=True)
        
        # Transform encodings to be of the same size
        len_encoding = len(encoded_text)
        if len_encoding > self.max_length:
            # Truncate to max length
            cutoff = len_encoding - self.max_length + 1 # The cutoff for the tokens to be deleted
            del encoded_text[1:cutoff]
            attention = [1] * self.max_length
        elif len_encoding < self.max_length:
            # Pad to max length
            num_padding = self.max_length - len_encoding
            encoded_text.extend([0] * num_padding) # Padding token is 0
            attention = [1] * len_encoding
            attention.extend([0] * (self.max_length - len_encoding))
        else:
            # If equal size, create attention matrix
            attention = [1] * self.max_length
            
        text_tensor = torch.tensor(encoded_text)
        attention_tensor = torch.tensor(attention)
        
        return img_tensor, text_tensor, attention_tensor, self.dict_labels[label]

class MultiModalNet(nn.Module):
    def __init__(self, n_way, path_biobert):
        super(MultiModalNet, self).__init__()
        self.baseline = BaselineNet(n_way)
        self.biobert = get_biobert(model_dir=path_biobert, download=False)
        self.concat_linear = nn.Linear(13312, n_way) # 12544 + 768 = 13312 from baseline and biobert respectively
    def forward(self, image, text, attention_mask):
        _, image = self.baseline(image, extract_features=True) # baseline returns: logits, features
        _, text = self.biobert(text, attention_mask=attention_mask) # biobert returns: sequence output, pooled output
        x = torch.cat((image,text), 1)
        x = self.concat_linear(x)
        return x    

In [3]:
def train(model, train_loader, criterion, device, optimizer, freeze=False):
    # freeze accepts a list and represents the layers not to freeze
    model.train()

    # Freeze all layers except those indicated
    if freeze:
        for name, param in model.named_parameters():
            if name not in freeze:
                param.requires_grad = False

    train_loss = 0
    for step, (data_image, data_text, data_attention, data_labels) in enumerate(train_loader):
        image_inputs, labels = data_image.to(device), data_labels.to(device)
        text_inputs, attention_inputs = data_text.to(device), data_attention.to(device)
        optimizer.zero_grad()
        pred = model(image_inputs, text_inputs, attention_inputs)
        loss = criterion(pred, labels)
        loss.backward()
        optimizer.step()
        train_loss += loss.item()  # Running training loss

    return train_loss / (step+1)

In [4]:
# Variables
num_epochs = 5
num_workers = 3
bs = 32
n_way = 3
root_image = '../../../../scratch/rl80/mimic-cxr-jpg-2.0.0.physionet.org/files'
root_text = '../../../../scratch/rl80/mimic-cxr-2.0.0.physionet.org'
path_splits = '../splits/20_shot.csv'
path_biobert = './'
path_pretrained = '../results/basic/basic_36.pth' # Pretrained image model
freeze = ['concat_linear.weight', 'concat_linear.bias']

# Check for GPU device
torch.cuda.set_device(0)
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Load in model
model = MultiModalNet(n_way, path_biobert).to(device)
pretrained_dict = torch.load(path_pretrained)

# Convert image model to work with the multi modal model
multi_dict = {}
del pretrained_dict['linear.weight']  # Pretrained model is for 10-way, remove last layer for 3-way
del pretrained_dict['linear.bias']
for key, value in pretrained_dict.items():
    multi_dict[f'baseline.{key}'] = pretrained_dict[key] # The model has 'baseline.' in front of every image model key
model_dict = model.state_dict()
model_dict.update(multi_dict)
model.load_state_dict(model_dict)

# Check if the layers to be unfrozen are in the model
if type(freeze) != bool:
    check = all(item in model_dict for item in freeze)
    if check:
        print(f'The layers that will not be frozen are: {freeze}')
    else:
        raise Exception('Not all elements to stay unfrozen are in the model')

# Load in dataset
tokenizer = get_tokenizer()
train_dataset = MimicCxrMulti(root_image, root_text, path_splits, tokenizer, mode='novel_train')
train_loader = DataLoader(train_dataset, batch_size=bs, shuffle=True, num_workers=num_workers)

# Training
optimizer = optim.Adam(model.parameters(), lr=0.0001)
criterion = nn.CrossEntropyLoss()

for epoch in range(num_epochs):
    #train_loss = train(model, train_loader, criterion, device, optimizer)
        # freeze accepts a list and represents the layers not to freeze
    model.train()

    # Freeze all layers except those indicated
    if freeze:
        for name, param in model.named_parameters():
            if name not in freeze:
                param.requires_grad = False

    train_loss = 0
    for step, (data_image, data_text, data_attention, data_labels) in enumerate(train_loader):
        image_inputs, labels = data_image.to(device), data_labels.to(device)
        text_inputs, attention_inputs = data_text.to(device), data_attention.to(device)
        optimizer.zero_grad()
        pred = model(image_inputs, text_inputs, attention_inputs)
        loss = criterion(pred, labels)
        loss.backward()
        optimizer.step()
        train_loss += loss.item()  # Running training loss
    #val_loss, acc, m_acc, macro_f1, class_f1 = test(model, test_loader, criterion, device, n_way)

    #if (save_models):
    #    torch.save(model.state_dict(), os.path.join(path_models, f'basic_{epoch + 1}.pth'))  # Save the model

    # Append and report results
    #df_results.loc[epoch] = [epoch + 1, train_loss, val_loss, acc, m_acc, macro_f1] + class_f1
    #print(
    #    f'[{epoch + 1}] t_loss: {train_loss:.5f} v_loss: {val_loss:.5f} val_acc: {acc:.5f} '
    #    f'val_m_acc: {m_acc:.5f} f1: {macro_f1:.5f}')
    print(train_loss)

The layers that will not be frozen are: ['concat_linear.weight', 'concat_linear.bias']
2.452433466911316
2.0351550579071045
1.7757296562194824
1.5069501996040344
1.313522219657898


In [17]:
print(image_inputs.size())
print(text_inputs.size())
print(attention_inputs.size())
print(labels.size())
test1, test2, test3, test4 = train_dataset[1]
print(test1.size())
print(test2.size())
print(test3.size())
print(test4.size())

torch.Size([32, 1, 224, 224])
torch.Size([32, 1, 512])
torch.Size([32, 1, 512])
torch.Size([32])
torch.Size([1, 224, 224])
torch.Size([1, 512])
torch.Size([1, 512])


AttributeError: 'int' object has no attribute 'size'

In [11]:
biobert = get_biobert('./',download=False)
biobert.to(device)

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(28996, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
          

In [13]:
#biobert(text_inputs, attention_mask=attention_inputs)
biobert(text_inputs)

RuntimeError: number of dims don't match in permute

In [39]:
import os
count = 0
for filename in os.listdir(root_text):
    if filename.endswith(".txt"):
         # print(os.path.join(directory, filename))
        abs_path = Path(os.path.join(root_text, filename))
        encoded_text = tokenizer.encode(abs_path.read_text(), add_special_tokens=True)
        hold = len(encoded_text)
        if hold > count:
            count = hold
        continue
    else:
        continue

KeyboardInterrupt: 

In [19]:
csv_data = pd.read_csv(path_splits)
#data = csv_data[csv_data.split == 'novel_train']
data = csv_data[csv_data.split == 'novel_validate']

In [20]:
for index, row in data.iterrows():
    file_path = row['file_path']
    # Get paths for both images and text
    text_name = f'{file_path.split("/")[2]}.txt' # Extract the study id to find the report
    abs_path = Path(os.path.join(root_text, text_name))
    text = abs_path.read_text()
    text = text.replace('_','')
    encoded_text = tokenizer.encode(text, add_special_tokens=True)
    if len(encoded_text) > 511:
        hold=encoded_text
        print(file_path, len(encoded_text))

p16/p16104236/s57336365/caea42f2-5abeb0d6-c6753361-0252cae8-66323dcb.jpg 535
p17/p17673557/s52871653/be675e3a-b4ef502b-816a850e-4e47a8b6-cc00cf55.jpg 849


In [22]:
#tokenizer.encode(cleaned_text, add_special_tokens=True, max_length=512)
new_tensor = torch.tensor(encoded_text)
new_tensor.size()

torch.Size([126])

In [79]:
tokenizer.decode(hold)

[" WET READ : 8 : 38 AM ONE COULD ARGUE THAT AREAS OF LUCENCY WITHIN THE RIGHT MEDIASTINAL BORDER ARE MORE PROMINENT ON TODAY'S EXAMINATION, AS COMPARED TO THE MOST RECENT PRIOR XRAY, COMPATIBLE WITH KNOWN PNEUMOMEDIASTINUM AS SEEN ON PRIOR CHEST CT FROM. LUNG VOLUMES ARE DECREASED, ACCENTUATING BRONCHOVASCULAR STRUCTURES AND CAUSING INCREASED CROWDING OF THE LUNG BASES BILATERALLY. AGAIN SEEN ARE SMALL BILATERAL PLEURAL EFFUSIONS WITH INTERSTITIAL ABNORMALITY, MOST PRONOUNCED AT THE LEFT MID LUNG ZONE, SUGGESTIVE OF CHRONIC INTERSTITIAL LUNG DISEASE. FINDINGS DISCUSSED WITH DR BY NSR VIA PHONE ON AT 6 : 15 PM. WET READ VERSION # 1 6 : 21 PM ONE COULD ARGUE THAT AREAS OF LUCENCY WITHIN THE RIGHT MEDIASTINAL BORDER ARE MORE PROMINENT ON TODAY'S EXAMINATION, AS COMPARED TO THE MOST RECENT PRIOR XRAY, COMPATIBLE WITH KNOWN PNEUMOMEDIASTINUM AS SEEN ON PRIOR CHEST CT FROM. LUNG VOLUMES ARE DECREASED, ACCENTUATING BRONCHOVASCULAR STRUCTURES AND CAUSING INCREASED CROWDING OF THE LUNG BASES B

In [86]:
max_length = 512
print(len(hold))
if len(hold) > max_length:
    cutoff = len(hold) - max_length + 1 # The cutoff for the tokens to be deleted
    del hold[1:cutoff]
print(len(hold))

849
512


In [119]:
print(len(encoded_text))
if len(encoded_text) < max_length:
    num_padding = max_length - len(encoded_text)
    encoded_text.extend([0] * num_padding)
    print(len(encoded_text))

126
512


In [127]:
biobert(torch.tensor([encoded_text]), attention_mask = torch.tensor([attention]))

(tensor([[[ 0.0343,  0.0601, -0.1038,  ..., -0.0026,  0.1650,  0.0292],
          [-0.4967,  0.4419, -0.3455,  ...,  0.4246, -0.6533,  0.1716],
          [-0.4041, -0.0194,  0.0374,  ...,  0.4303, -0.2325, -0.3277],
          ...,
          [ 0.0118, -0.1907, -0.0836,  ...,  0.1552,  0.1065,  0.0623],
          [ 0.2091, -0.2067, -0.1239,  ..., -0.0492, -0.0890,  0.0951],
          [ 0.1902, -0.5259, -0.6845,  ..., -0.4168,  0.0479, -0.3069]]],
        grad_fn=<NativeLayerNormBackward>),
 tensor([[ 3.4797e-02,  6.9386e-02,  9.5684e-01, -9.9999e-01,  9.9998e-01,
           3.8128e-01,  2.6741e-02,  8.5040e-01, -3.2890e-02, -2.0456e-02,
           8.7006e-01,  9.9974e-01,  8.1255e-02, -8.8358e-01,  6.9053e-02,
           1.6002e-02,  9.9999e-01,  1.1773e-02, -9.9771e-01, -4.4217e-02,
          -7.1061e-03, -9.5842e-01,  1.5264e-01,  9.7861e-01, -4.8262e-02,
           7.2457e-02,  9.9969e-01,  9.9737e-01,  3.3668e-02, -1.1860e-02,
           5.8690e-03, -9.9999e-01,  9.9294e-01, -9.9969e

In [126]:
length = 126
attention = [1]* length
attention.extend([0] * (max_length - length))
len(attention)

512

In [125]:
    # Training Loop
    for epoch in range(num_epochs):
        train_loss = train(model, train_loader, criterion, device, optimizer)
        val_loss, acc, m_acc, macro_f1, class_f1 = test(model, test_loader, criterion, device, n_way)

        if (save_models):
            torch.save(model.state_dict(), os.path.join(path_models, f'basic_{epoch + 1}.pth'))  # Save the model

        # Append and report results
        df_results.loc[epoch] = [epoch + 1, train_loss, val_loss, acc, m_acc, macro_f1] + class_f1
        print(
            f'[{epoch + 1}] t_loss: {train_loss:.5f} v_loss: {val_loss:.5f} val_acc: {acc:.5f} '
            f'val_m_acc: {m_acc:.5f} f1: {macro_f1:.5f}')

NameError: name 'attention' is not defined

In [5]:
# Variables
num_epochs = 5
num_workers = 3
bs = 64
n_way = 3
root_image = '../../../../scratch/rl80/mimic-cxr-jpg-2.0.0.physionet.org/files'
root_text = '../../../../scratch/rl80/mimic-cxr-2.0.0.physionet.org'
path_splits = '../splits/20_shot.csv'
path_biobert = './'
path_pretrained = '../results/basic/basic_36.pth' # Pretrained image model

# Check for GPU device
torch.cuda.set_device(0)
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Load in model
model = MultiModalNet(n_way, path_biobert)

In [14]:
for key in model.named_parameters():
    print(key[0])

baseline.block1.0.weight
baseline.block1.0.bias
baseline.block1.1.weight
baseline.block1.1.bias
baseline.block2.0.weight
baseline.block2.0.bias
baseline.block2.1.weight
baseline.block2.1.bias
baseline.block3.0.weight
baseline.block3.0.bias
baseline.block3.1.weight
baseline.block3.1.bias
baseline.block4.0.weight
baseline.block4.0.bias
baseline.block4.1.weight
baseline.block4.1.bias
baseline.linear.weight
baseline.linear.bias
biobert.embeddings.word_embeddings.weight
biobert.embeddings.position_embeddings.weight
biobert.embeddings.token_type_embeddings.weight
biobert.embeddings.LayerNorm.weight
biobert.embeddings.LayerNorm.bias
biobert.encoder.layer.0.attention.self.query.weight
biobert.encoder.layer.0.attention.self.query.bias
biobert.encoder.layer.0.attention.self.key.weight
biobert.encoder.layer.0.attention.self.key.bias
biobert.encoder.layer.0.attention.self.value.weight
biobert.encoder.layer.0.attention.self.value.bias
biobert.encoder.layer.0.attention.output.dense.weight
biobert.en

In [13]:
tokenizer.encode_plus

'baseline.block1.0.weight'

In [None]:
# Variables
num_epochs = 5
num_workers = 3
bs = 64
n_way = 3
root_image = '../../../../scratch/rl80/mimic-cxr-jpg-2.0.0.physionet.org/files'
root_text = '../../../../scratch/rl80/mimic-cxr-2.0.0.physionet.org'
path_splits = '../splits/20_shot.csv'
path_biobert = ',/'
path_pretrained = '../results/basic/basic_36.pth' # Pretrained image model

# Check for GPU device
torch.cuda.set_device(0)
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Load in model
model = MultiModalNet(n_way, path_biobert)
pretrained_dict = torch.load(path_pretrained)
#model_dict = model.state_dict()
#model_dict.update(pretrained_dict)
#model.load_state_dict(model_dict)
model.load_state_dict(torch.load(path_pretrained))

# Load in model and dataset
optimizer = optim.Adam(model.parameters(), lr=0.0001)
criterion = nn.CrossEntropyLoss()
tokenizer = get_tokenizer()
train_dataset = MimicCxrMulti(root_image, root_text, csv_path, tokenizer, mode='novel_train')
train_loader = DataLoader(train_dataset, batch_size=bs, shuffle=True, num_workers=num_workers)


#biobert = get_biobert(model_dir=None, download=True)
#biobert = get_biobert(model_dir='./', download=False)