In [116]:
%pip install transformers
%pip install accelerate -U
%pip install wandb

Collecting wandb
  Downloading wandb-0.15.10-py3-none-any.whl (2.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m14.2 MB/s[0m eta [36m0:00:00[0m
Collecting GitPython!=3.1.29,>=1.0.0 (from wandb)
  Downloading GitPython-3.1.35-py3-none-any.whl (188 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m188.8/188.8 kB[0m [31m17.0 MB/s[0m eta [36m0:00:00[0m
Collecting sentry-sdk>=1.0.0 (from wandb)
  Downloading sentry_sdk-1.30.0-py2.py3-none-any.whl (218 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m218.8/218.8 kB[0m [31m17.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting docker-pycreds>=0.4.0 (from wandb)
  Downloading docker_pycreds-0.4.0-py2.py3-none-any.whl (9.0 kB)
Collecting pathtools (from wandb)
  Downloading pathtools-0.1.2.tar.gz (11 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting setproctitle (from wandb)
  Downloading setproctitle-1.3.2-cp310-cp310-manylinux_2_5_x86_64.manyl

In [117]:
import torch
import torch.nn as nn
from torch.nn import BCEWithLogitsLoss
from torch.optim import AdamW
from torch.utils.data import Dataset, DataLoader
import numpy as np
from sklearn.metrics import f1_score

from transformers import AutoTokenizer, BertForTokenClassification, get_scheduler
import importlib
from importlib import reload
from tqdm.auto import tqdm

In [118]:
model_checkpoint = 'distilbert-base-uncased'
model_name = 'custom_model'

In [119]:
# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


In [120]:
using_wandb = False

In [122]:
import wandb
wandb.login()
using_wandb = True

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


In [123]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [124]:
# If you are using Colab
dir_path = "/content/drive/Othercomputers/my_computer/dl-nlp_project_named-entity-recognition/"
module_path = dir_path[9:].replace("/", ".")
# imports
data_module = importlib.import_module(module_path + "data")
load_data = data_module.load_data
extract_sentences_and_labels = data_module.extract_sentences_and_labels
generate_label_vocab = data_module.generate_label_vocab
encode_labels = data_module.encode_labels
build_label_to_idx = data_module.build_label_to_idx
build_idx_to_label = data_module.build_idx_to_label
build_word_to_idx = data_module.build_word_to_idx
build_idx_to_word = data_module.build_idx_to_word
split_data = data_module.split_data

In [125]:
reload(data_module)

<module 'drive.Othercomputers.my_computer.dl-nlp_project_named-entity-recognition.data' from '/content/drive/Othercomputers/my_computer/dl-nlp_project_named-entity-recognition/data.py'>

In [126]:
# If you are NOT using colab
# dir_path = ""
# from data_new import (
#     prepare_data_pipeline,
#     TRAIN_DATA_PATH,
#     TEST_DATA_PATH,
#     PAD,
#     tensor_to_sentences,
#     tensor_to_labels,
# )

In [127]:
train_file_path = dir_path + "data/train.json"
test_file_path = dir_path + "data/test.json"

In [128]:
train_data, test_data = load_data(train_file_path, test_file_path)
train_sentences, train_raw_labels = extract_sentences_and_labels(train_data)
test_sentences, test_raw_labels = extract_sentences_and_labels(test_data)

# Generate label vocabulary
label_vocab = generate_label_vocab(train_raw_labels + test_raw_labels)

# Encode labels pre-transformer
train_encoded_labels = encode_labels(train_raw_labels, label_vocab, train_sentences)
test_labels = encode_labels(test_raw_labels, label_vocab, test_sentences)

word_to_idx = build_word_to_idx(train_sentences + test_sentences)
idx_to_word = build_idx_to_word(word_to_idx)
label_to_idx = build_label_to_idx(label_vocab)
idx_to_label = build_idx_to_label(label_to_idx)

train_sentences, train_labels, val_sentences, val_labels = split_data(
    train_sentences, train_encoded_labels
)

In [129]:
SPECIAL_TOKEN = "<SPC>"

class Labels():
    def __init__(self, num_classes, names):
        super().__init__()
        names.append(SPECIAL_TOKEN)
        self.names = names
        self.num_classes = num_classes + 1

    def __getitem__(self, label_vector):
        return [
            self.names[idx]
            for idx, value in enumerate(label_vector)
            if value == 1
        ]

    def num_classes(self):
        return self.num_classes

    def decode(self, label_vector):
        return self.__getitem__(label_vector)

    def encode(self, names):
        indexes = []
        for name in names:
            index = self.names.index(name)
            indexes.append(index)
        tensor = torch.zeros(self.num_classes)
        for index in indexes:
            tensor[index] = 1
        return tensor

    def tensor2sentence(self, tensor):
        return [self.decode(vector) for vector in tensor]

ner_labels = Labels(num_classes=len(label_vocab), names=label_vocab)

In [130]:
id2label = ner_labels.decode
label2id = ner_labels.encode

In [131]:
class NERDataset(Dataset):
    def __init__(self, sentences, labels):
        self.sentences = sentences
        self.ner_labels = labels
        self.num_rows = len(sentences)
        self.input_ids = None
        self.attention_mask = None
        self.aligned_labels = None
        self.features = {'id': range(self.num_rows),
                         'tokens': self.sentences,
                         'ner_labels': self.ner_labels,
                         'input_ids': self.input_ids,
                         'attention_mask': self.attention_mask,
                         'labels': self.aligned_labels}
        self.tokenized = False

    def __getitem__(self, idx):
        if self.tokenized:
            item = {
                'id': idx,
                'tokens': self.sentences[idx],
                'ner_labels': self.ner_labels[idx],
                'input_ids': self.input_ids[idx],
                'attention_mask': self.attention_mask[idx],
                'labels': self.aligned_labels[idx]
            }
        else:
            item = {
                'id': idx,
                'tokens': self.sentences[idx],
                'ner_labels': self.ner_labels[idx],
            }
        return item

    def __len__(self):
        return self.num_rows

    def tokenize(self):
        tokenized_inputs = tokenize_and_align_labels(self[:])
        self.input_ids = torch.Tensor(tokenized_inputs['input_ids']).to(device)
        self.attention_mask = torch.Tensor(tokenized_inputs['attention_mask']).to(device)
        self.aligned_labels = torch.Tensor(tokenized_inputs['labels']).to(device)
        self.tokenized = True


In [132]:
datasets = {
    'train': NERDataset(train_sentences, train_labels),
    'val': NERDataset(val_sentences, val_labels),
    'test': NERDataset(test_sentences, test_labels)
}

In [133]:
ner_labels.names

['NumberAffected',
 'CTDesign',
 'ObjectiveDescription',
 'NumberPatientsArm',
 'PValueChangeValue',
 'ObservedResult',
 'FinalNumPatientsArm',
 'Title',
 'AggregationMethod',
 'DiffGroupAbsValue',
 'SdDevChangeValue',
 'ConfIntervalDiff',
 'Country',
 'PublicationYear',
 'Frequency',
 'DoseValue',
 'SubGroupDescription',
 'Precondition',
 'NumberPatientsCT',
 'PvalueDiff',
 'PercentageAffected',
 'Drug',
 'SdDevResValue',
 'TimePoint',
 'ConfIntervalChangeValue',
 'DoseDescription',
 'PMID',
 'Journal',
 'SdDevBL',
 'ResultMeasuredValue',
 'AvgAge',
 'ConclusionComment',
 'Author',
 'RelativeChangeValue',
 'MinAge',
 'AllocationRatio',
 '<SPC>']

In [134]:
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [135]:
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True, padding=True)

    label_list = []
    for i, labels in enumerate(examples['ner_labels']):
        word_ids = tokenized_inputs.word_ids(batch_index=i)  # Map tokens to their respective word.
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(label2id([SPECIAL_TOKEN]))
            elif word_idx != previous_word_idx:
                label_ids.append(torch.Tensor(labels[word_idx] + [0]))
            previous_word_idx = word_idx

        padded_length = len(tokenized_inputs['input_ids'][i])
        for i in range(padded_length - len(label_ids)):
            label_ids.append(label2id([SPECIAL_TOKEN]))
        label_ids = torch.stack(label_ids)
        label_list.append(label_ids)

    tokenized_inputs["labels"] = torch.stack(label_list)
    return tokenized_inputs

In [136]:
datasets['train'].tokenize()
datasets['val'].tokenize()
datasets['test'].tokenize()

In [137]:
print(datasets['train'][:]['input_ids'].shape)
print(datasets['train'][:]['labels'].shape)

torch.Size([1300, 269])
torch.Size([1300, 269, 37])


In [138]:
from transformers import DistilBertForTokenClassification
class CustomTokenClassification(DistilBertForTokenClassification):
    def __init__(self, config):
        super(CustomTokenClassification, self).__init__(config)
        self.loss_fct = BCEWithLogitsLoss()

    def forward(self, input_ids=None, attention_mask=None, labels=None, id=None, tokens=None, ner_labels=None, **kwargs):
        outputs = super().forward(input_ids=input_ids.int(), attention_mask=attention_mask.int(), **kwargs)
        return outputs['logits']


In [139]:
def mask_and_flatten_logits_and_labels(logits, labels):
    mask = labels[:, :, -1] != 1
    logits = logits[mask]
    labels = labels[mask]

    flat_logits = logits.view(-1, logits.shape[-1])
    flat_labels = labels.view(-1, labels.shape[-1])
    return flat_logits, flat_labels

In [140]:
model = CustomTokenClassification.from_pretrained(model_checkpoint, num_labels=ner_labels.num_classes)
model.to(device);

Some weights of CustomTokenClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [141]:
config = {}
config["num_epochs"] = 5
config["batch_size"] = 1
config["lr"] = 2e-5
config["num_warmup_steps"] = 0

In [142]:
if using_wandb:
  wandb.init(project="DL-NLP-Clinical-Trial-NER", config=config)
  config = wandb.config

[34m[1mwandb[0m: Currently logged in as: [33mreylord[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [143]:
label_list = ner_labels.names

train_dataloader = DataLoader(
    datasets['train'], shuffle=True, batch_size=config['batch_size']
)
val_dataloader = DataLoader(
    datasets['val'], shuffle=True, batch_size=config['batch_size']
)

optimizer = AdamW(model.parameters(), lr=config['lr'])

num_training_steps = config["num_epochs"] * len(train_dataloader)

lr_scheduler = get_scheduler(
    name="linear", optimizer=optimizer, num_warmup_steps=config["num_warmup_steps"], num_training_steps=num_training_steps
)

loss_fct = BCEWithLogitsLoss()

progress_bar = tqdm(range(num_training_steps))

for epoch in range(config["num_epochs"]):
    model.train()

    for batch in train_dataloader:
        running_loss = 0.0
        labels = batch.pop('labels')

        logits = model(**batch)

        flat_logits, flat_labels = mask_and_flatten_logits_and_labels(logits, labels)

        loss = loss_fct(flat_logits, flat_labels)
        loss.backward()

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()

        running_loss += loss.item() * labels.size(0)

        progress_bar.update(1)

    epoch_loss = running_loss / len(train_dataloader)
    progress_bar.write(f"Epoch {epoch}, Loss: {epoch_loss}")

    model.eval()

    preds = []
    true_labels = []

    for batch in val_dataloader:

        labels = batch.pop("labels")

        with torch.no_grad():
            logits = model(**batch)

        flat_logits, flat_labels = mask_and_flatten_logits_and_labels(logits, labels)

        pred = flat_logits.heaviside(torch.tensor(
            [0.0], device=device)).int().tolist()
        true_label = flat_labels.int().tolist()

        preds.extend(pred)
        true_labels.extend(true_label)

    f1 = f1_score(true_labels, preds, average='micro')
    progress_bar.write(f"f1 micro: {f1}")
    if using_wandb:
        wandb.log({"train_loss": epoch_loss, "micro_f1": f1, "epoch": epoch})
progress_bar.close()
model.save_pretrained(model_name)


  0%|          | 0/6500 [00:00<?, ?it/s]

Epoch 0, Loss: 3.8666014487926774e-05
f1 micro: 0.3398617511520737
Epoch 1, Loss: 4.885187014364279e-06
f1 micro: 0.6424778761061947
Epoch 2, Loss: 2.2005449192455184e-06
f1 micro: 0.6430839002267573
Epoch 3, Loss: 2.7753008672824273e-05
f1 micro: 0.6395400265369305
Epoch 4, Loss: 1.5823796678047914e-05
f1 micro: 0.647187091146969


In [144]:
test_dataloader = DataLoader(
    datasets['test'], shuffle=True, batch_size=1
)
model.eval()

preds = []
true_labels = []

progress_bar = tqdm(range(len(test_dataloader)))
for batch in test_dataloader:
    labels = batch.pop("labels")

    with torch.no_grad():
        logits = model(**batch)

    flat_logits, flat_labels = mask_and_flatten_logits_and_labels(logits, labels)

    pred = flat_logits.heaviside(torch.tensor(
        [0.0], device=device)).int().tolist()
    true_label = flat_labels.int().tolist()

    preds.extend(pred)
    true_labels.extend(true_label)

    progress_bar.update(1)

f1 = f1_score(true_labels, preds, average='micro')
progress_bar.write(f"f1 micro: {f1}")
if using_wandb:
    wandb.log({"test_micro_f1": f1})
    wandb.finish()
progress_bar.close()

  0%|          | 0/385 [00:00<?, ?it/s]

f1 micro: 0.6701423136536565


In [145]:
index = 10
examples = datasets['test'][index:index+1]
print(examples['tokens'][0])

labels = examples['labels']
with torch.no_grad():
    logits = model(**examples)

flat_logits, flat_labels = mask_and_flatten_logits_and_labels(logits, labels)

pred = flat_logits.heaviside(torch.tensor(
    [0.0], device=device)).int().tolist()
true_label = flat_labels.int().tolist()
f1 = f1_score(true_label, pred, average='micro')

print(ner_labels.tensor2sentence(true_label))
print(ner_labels.tensor2sentence(pred))
print(f"Micro-F1 Score: {f1:.3f}")

['METHODS', ':', 'This', '52', '-', 'week', ',', 'open', '-', 'label', ',', 'randomized', ',', 'multinational', ',', 'multicentre', 'trial', 'included', '310', 'subjects', 'with', 'type', '2', 'diabetes', '(', 'T2D', ')', 'on', 'premix', ',', 'with', 'or', 'without', 'metformin', ',', 'who', 'were', 'randomized', 'to', 'a', 'basal', '-', 'bolus', 'regimen', 'with', 'glargine', 'and', 'glulisine', '(', 'n', '=', '153', ';', 'mean', '+', '/', '-', 's', '.', 'd', '.']
[[], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], ['NumberPatientsCT'], ['Precondition'], ['Precondition'], ['Precondition'], ['Precondition'], ['Precondition'], ['Precondition'], ['Precondition'], ['Precondition'], ['Precondition'], ['Precondition'], ['Precondition'], ['Precondition'], ['Precondition'], ['Precondition'], ['Precondition'], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], ['NumberPatientsArm'], [], [], [], [], [], [], [], [], []]
[[], [], [], [], [], [], [], [], [], [

In [146]:
wandb.finish()

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
epoch,▁▃▅▆█
micro_f1,▁████
test_micro_f1,▁
train_loss,█▂▁▆▄

0,1
epoch,4.0
micro_f1,0.64719
test_micro_f1,0.67014
train_loss,2e-05
