In [1]:
from transformers import BertModel
from transformers import BertTokenizer
from transformers import PretrainedConfig

# metrics
from sklearn.metrics import f1_score
from sklearn.metrics import jaccard_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import hamming_loss
from sklearn.metrics import multilabel_confusion_matrix

from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split

import json
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from torch import nn
from torch.nn import BCELoss
from collections import Counter

In [2]:
path_to_data = './data'

# create dataframe from sessions.json
df = pd.read_json(f'{path_to_data}/sessions.json')
df.head()

# create dictionaries for switching between symptom and id
id2sym = {}
sym2id = {}

with open(f'{path_to_data}/symptoms.json') as json_file:
    data = json.load(json_file)
    for sym in data:
        id2sym[sym['id']] = sym['name']
        sym2id[sym['name']] = sym['id']
        
        
# remove labels that have less than m occurrences
m = 0

labels_list = df['confirmed'].tolist()
labels_list = sum(labels_list, [])
c = Counter(labels_list)
for i in range(len(df)):
    to_remove = []
    
    # find labels that should be removed 
    for j in range(len(df['confirmed'][i])):
        if c[df['confirmed'][i][j]] < m:
            to_remove.append(j)
            
    # remove the labels
    shift = 0
    for j in range(len(to_remove)):
        df['confirmed'][i].pop(to_remove[j]-shift)
        shift += 1
    
        
# add column with the symptom names
sym_names = []

for syms in df['confirmed']:
    if len(syms) != 0:
        sym_names.append([id2sym[x] for x in syms])
    else:
        sym_names.append([])

df['labels'] = sym_names

# remove all rows with no confirmed labels
df = df[df['confirmed'].map(len) > 0]
df = df.reset_index(drop=True)

In [3]:
df.head()

Unnamed: 0,text,confirmed,suggested,labels
0,Slut på medicin.,"[89, 651]",[348],"[Känd astma, Känd lungsjukdom]"
1,Behöver att prata med psykolog angående använd...,"[116, 215]","[215, 348, 446]","[Nedstämdhet, Trötthet]"
2,Har fått besvärlig eksem på händerna,"[2, 141]",[141],"[Hudbesvär, Synliga hudbesvär]"
3,Muskelsvaghet och trötthet känner mig skakig o...,"[606, 215]","[12, 97, 215, 359, 518, 606]","[Muskelsvaghet, Trötthet]"
4,Svår smärta i vänsterhanden/handleden precis n...,"[682, 493]","[15, 28, 48, 54, 114, 148, 246, 313, 333, 339,...","[Smärta i handled eller fingrar, Förvärras av ..."


In [4]:
path_to_bert = r'./bert/bert-base-swedish-cased'
tok = BertTokenizer.from_pretrained(path_to_bert)

# train a multilabel_binarizer on the labels
labels = df['labels'].tolist()
multilab_bin = MultiLabelBinarizer()
multilab_bin.fit(labels)


class CustomDataset(Dataset):
    def __init__(self, dataframe, tokenizer, multilab_bin, max_len):
        self.tokenizer = tokenizer
        self.multilab_bin = multilab_bin
        self.data = dataframe
        self.text = self.data['text']
        self.labels = self.data['labels']
        self.max_len = max_len
        
    def __len__(self):
        return len(self.text)
    
    def __getitem__(self, index):
        text = str(self.text[index])
        text = ' '.join(text.split())
        
        inputs = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            truncation=True,
            pad_to_max_length=True,
            padding='max_length',
            return_token_type_ids=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs['token_type_ids']
        
        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'labels': torch.tensor(np.sum(self.multilab_bin.transform([self.labels[index]]), axis=0), dtype=torch.float)
        }        

In [5]:
batch_size = 20

train_dataset, test_dataset = train_test_split(df,
                                        random_state=42,
                                        test_size=0.2,
                                        shuffle=True)
train_dataset = train_dataset.reset_index(drop=True)
test_dataset = test_dataset.reset_index(drop=True)

train_set = CustomDataset(train_dataset, tok, multilab_bin, 500)
test_set = CustomDataset(test_dataset, tok, multilab_bin, 500)

train_params = {'batch_size': batch_size,
                'shuffle': True,
                'num_workers': 0
               }
test_params = {'batch_size': batch_size,
               'shuffle': True,
               'num_workers': 0
              }

train_loader = DataLoader(train_set, **train_params)
test_loader = DataLoader(test_set, **test_params)

In [6]:
# decide which device to use. use cuda if available
dev = ''
if torch.cuda.is_available():
    dev = 'cuda:0'
else:
    dev = 'cpu'
    
# TEMP: use cpu only
dev = 'cpu'

print(f'dev = {dev}')
print(f'Number of available GPUs: {torch.cuda.device_count()}')

device = torch.device(dev)

dev = cpu
Number of available GPUs: 1


In [7]:
class BERTClass(nn.Module):
    def __init__(self, output_dim, config):
        super(BERTClass, self).__init__()
        self.bert = BertModel.from_pretrained(path_to_bert)
        self.dropout = torch.nn.Dropout(config.hidden_dropout_prob)
        self.fc = nn.Linear(config.hidden_size, output_dim)
        self.sigm = nn.Sigmoid()
        
    def forward(self, ids, mask, token_type_ids):
        x = self.bert(ids, attention_mask=mask, token_type_ids=token_type_ids)
        #print(f'pool_out[1]: {pool_out[1].shape}')
        x = self.dropout(x[1])
        #print(dropout_out: {dropout_out.shape}')
        x = self.fc(x)
        #print(f'fc_out: {fc_out.shape}')
        output = self.sigm(x)
        return output

In [8]:
# compute the loss of an epoch by averaging all batch losses
def epoch_loss(model, data_loader, criterion):
    loss = 0
    batch_count = 0
    
    with torch.no_grad():
        for idx,batch in enumerate(data_loader):
            ids = batch['ids'].to(device, dtype=torch.long)
            mask = batch['mask'].to(device, dtype=torch.long)
            token_type_ids = batch['token_type_ids'].to(device, dtype=torch.long)
            labels = batch['labels'].to(device, dtype=torch.float)
            
            outputs = model(ids, mask, token_type_ids)
            loss += criterion(outputs, labels)
            batch_count += 1
    
    return loss / batch_count

In [9]:
# define the model
D_out = len(multilab_bin.classes_)
config = PretrainedConfig.from_json_file(f'{path_to_bert}/config.json')

model = BERTClass(D_out, config)
model.to(device)

# training loop
learning_rate = 0.0003

criterion = torch.nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

# losses over entire train-/test-set per epoch
train_losses = []
test_losses = []

num_epochs = 1
for epoch in range(num_epochs):
    for idx,batch in enumerate(train_loader):
        ids = batch['ids'].to(device, dtype=torch.long)
        mask = batch['mask'].to(device, dtype=torch.long)
        token_type_ids = batch['token_type_ids'].to(device, dtype=torch.long)
        labels = batch['labels'].to(device, dtype=torch.float)
        
        print(f'Batch {idx+1}')
        #print('forward pass ...')
        
        # forward pass
        outputs = model(ids, mask, token_type_ids)
        
        # zero the parameter gradients
        optimizer.zero_grad()
        
        # backward pass
        loss = criterion(outputs, labels)
        
        #print('backward pass')
        loss.backward()
        optimizer.step()
    
    # save the losses
    train_losses.append(epoch_loss(model, train_loader, criterion))
    test_losses.append(epoch_loss(model, test_loader, criterion))
    
    print(f'End of epoch {epoch+1}, Train Loss: {train_losses[-1]:.7f}, Test Loss: {test_losses[-1]:.7f}')


Batch 1
Batch 2
Batch 3
Batch 4
Batch 5
Batch 6
Batch 7


NameError: name 'th' is not defined

In [43]:
# get the predicitons and corresponding labels
def get_pred_true(model, data_loader, D_out):
    y_pred = np.zeros((1,197))
    y_true = np.zeros((1,197))
    
    with torch.no_grad():
        for idx,batch in enumerate(data_loader):
            ids = batch['ids'].to(device, dtype=torch.long)
            mask = batch['mask'].to(device, dtype=torch.long)
            token_type_ids = batch['token_type_ids'].to(device, dtype=torch.long)
            labels = batch['labels'].to(device, dtype=torch.float)
            
            outputs = model(ids, mask, token_type_ids)
            
            y_pred = np.concatenate((np.array(y_pred),outputs.numpy()), axis=0)
            y_true = np.concatenate((y_true,np.array(labels)), axis=0)

    return y_pred, y_true

In [44]:
y_pred, y_true = get_pred_true(model, test_loader, D_out)

# set all values above threshold to 1, else 0
th = 0.2
y_pred[y_pred > th] = 1
y_pred[y_pred <= th] = 0

print(f"Micro-average F1-score: {f1_score(y_true, y_pred, average='micro')}")
print(f"Weighted-average F1-score: {f1_score(y_true, y_pred, average='weighted', zero_division=1)}")
print(f"Macro-average F1-score: {f1_score(y_true, y_pred, average='macro', zero_division=1)}")
print(f"Sample-average Jaccard score: {jaccard_score(y_true, y_pred, average='samples', zero_division=1)}")
print(f"Accuracy (exact match): {accuracy_score(y_true, y_pred)}")
print(f"Hamming Loss: {hamming_loss(y_true, y_pred)}")

x = [x for x in range(1,num_epochs+1)]
plt.plot(x, test_losses)
plt.plot(x, train_losses)
plt.legend(['Test Loss', 'Train Loss'])
plt.xticks(x, x)
plt.xlabel('Epoch')
plt.ylabel('Loss')
#plt.gca().set_ylim([0,0.1])
plt.show()

(20, 197)
(1, 197)
(20, 197)
(21, 197)
(20, 197)
(41, 197)
(20, 197)
(61, 197)
Micro-average F1-score: 0.02742561448900388
Weighted-average F1-score: 0.28879462205641826
Macro-average F1-score: 0.14101956942975025
Sample-average Jaccard score: 0.026137703421278517
Accuracy (exact match): 0.012345679012345678
Hamming Loss: 0.47114119195337467


NameError: name 'plt' is not defined