In [None]:
%env CUDA_VISIBLE_DEVICES=0

SOURCE_DATA_FILE_NAME = 'tagged_bib_refs_fr.json'

MODEL_DIR = "./output"

!pip install transformers seqeval[gpu]
import itertools
import numpy as np
import pandas as pd
import torch
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from torch import cuda
from torch.utils.data import Dataset, DataLoader
from transformers import CamembertTokenizerFast, CamembertForTokenClassification

# Run on the GPU when CUDA is usable, otherwise fall back to the CPU.
if cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(device)

# Load the source file and preview its first rows.
data = pd.read_json(SOURCE_DATA_FILE_NAME)
print(data.head())

# Distinct labels found across every row of the `labels` column (np.unique
# returns them sorted), followed by the two lookup tables label <-> id.
unique_labels = np.unique(list(itertools.chain.from_iterable(data.labels)))
labels_to_ids = {label: label_id for label_id, label in enumerate(unique_labels)}
ids_to_labels = dict(enumerate(unique_labels))
labels_to_ids

In [None]:
# Hyper-parameters shared by the dataset, the data loaders and the optimizer.
MAX_LENGTH = 128             # length every example is padded / truncated to
TEST_SIZE = 0.2              # fraction of rows held out by train_test_split
TRAINING_BATCH_SIZE = 4
VALIDATION_BATCH_SIZE = 2
TRAINING_EPOCHS = 10
LEARNING_RATE = 1e-5

# Fast tokenizer loaded from the "camembert-base" checkpoint.
tokenizer = CamembertTokenizerFast.from_pretrained(
    "camembert-base"
)


class BibRefParserDataset(Dataset):
    """Torch dataset that tokenizes pre-split references and aligns word labels to tokens.

    Every row of ``dataframe`` provides ``words`` (a list of words) and
    ``labels`` (expected to hold one label per word). Label names are turned
    into ids through the module-level ``labels_to_ids`` mapping.
    """

    def __init__(self, dataframe, tokenizer, max_length):
        """Store the data source and the tokenization settings.

        Args:
            dataframe: frame with a ``words`` and a ``labels`` column.
            tokenizer: tokenizer called with ``is_split_into_words=True`` and
                ``return_offsets_mapping=True``.
            max_length: length every example is padded / truncated to.
        """
        self.dataframe = dataframe
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.length = len(dataframe)

    def __getitem__(self, index):
        """Return the tokenized example at position ``index``.

        Returns:
            dict of tensors: every entry produced by the tokenizer plus
            ``labels``, one label id per token. Tokens that belong to no word
            (offset end equal to 0) keep the value -100, which
            ``compute_precision`` masks out and which PyTorch's cross-entropy
            loss ignores by default.
        """
        # Positional access: works whether or not the frame index was reset.
        words = self.dataframe.words.iloc[index]
        word_labels = self.dataframe.labels.iloc[index]

        embedding = self.tokenizer(words,
                                   is_split_into_words=True,
                                   return_offsets_mapping=True,
                                   padding='max_length',
                                   truncation=True,
                                   max_length=self.max_length)

        label_ids = [labels_to_ids[label] for label in word_labels]
        offsets = embedding["offset_mapping"]
        # Use the tokenizer this dataset was built with (not a global one) and
        # convert all ids in a single call.
        tokens = self.tokenizer.convert_ids_to_tokens(embedding["input_ids"])

        token_labels = np.full(len(offsets), -100, dtype=int)

        word_position = -1
        for token_position, (start, end) in enumerate(offsets):
            if end == 0:
                # Token without any character span: left at -100.
                continue
            # A start offset of 0 marks the first piece of a word. The tokenizer
            # can also emit an isolated '▁' piece with that same offset; it must
            # not be counted as a new word.
            if start == 0 and tokens[token_position] != '▁':
                word_position += 1
            # An isolated '▁' met before any word has started stays at -100
            # instead of receiving the label of the last word (index -1).
            if word_position >= 0:
                token_labels[token_position] = label_ids[word_position]

        entry = {key: torch.as_tensor(val) for key, val in embedding.items()}
        entry['labels'] = torch.as_tensor(token_labels)

        return entry

    def __len__(self):
        """Number of rows of the wrapped frame."""
        return self.length

In [None]:
# Fixed seed so the train/validation split and the torch-side randomness
# (training shuffling, newly initialised weights) are repeatable across runs.
RANDOM_SEED = 42
torch.manual_seed(RANDOM_SEED)

training_dataset, testing_dataset = train_test_split(data, test_size=TEST_SIZE, random_state=RANDOM_SEED)

# BibRefParserDataset is indexed with positions 0..len-1, hence the fresh index.
training_dataset = training_dataset.reset_index(drop=True)
testing_dataset = testing_dataset.reset_index(drop=True)

training_torch_dataset = BibRefParserDataset(training_dataset, tokenizer, MAX_LENGTH)
validation_torch_dataset = BibRefParserDataset(testing_dataset, tokenizer, MAX_LENGTH)

training_torch_dataloader = DataLoader(training_torch_dataset,
                                       batch_size=TRAINING_BATCH_SIZE,
                                       shuffle=True,
                                       num_workers=0)
# Validation keeps the dataset order: shuffling brings nothing to evaluation and
# would make the flattened label / prediction lists differ from run to run.
validation_torch_dataloader = DataLoader(validation_torch_dataset,
                                         batch_size=VALIDATION_BATCH_SIZE,
                                         shuffle=False,
                                         num_workers=0)

# One output class per distinct label found in the data.
bibref_parser_model = CamembertForTokenClassification.from_pretrained('camembert-base', num_labels=len(labels_to_ids))
bibref_parser_model.to(device)
optimizer = torch.optim.Adam(params=bibref_parser_model.parameters(), lr=LEARNING_RATE)


In [None]:

def forward(batch, model):
    """Run ``model`` on one tokenized batch moved to the module-level ``device``.

    Args:
        batch: mapping holding ``input_ids`` and ``attention_mask`` tensors and,
            optionally, ``labels``.
        model: callable accepting the ``input_ids``, ``attention_mask`` and
            ``labels`` keyword arguments.

    Returns:
        Tuple ``(input_ids, labels, result)`` where the first two are the
        tensors sent to the model (``labels`` is ``None`` when the batch has
        none) and ``result`` is whatever ``model`` returned.
    """
    input_ids = batch['input_ids'].to(device, dtype=torch.long)
    attention_mask = batch['attention_mask'].to(device, dtype=torch.long)
    # Look for the key itself rather than testing the container type: a plain
    # dict without labels no longer raises KeyError, and any other mapping that
    # does carry labels no longer has them silently dropped.
    labels = batch['labels'].to(device, dtype=torch.long) if 'labels' in batch else None
    result = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
    return input_ids, labels, result


def predictions_from_logits(logits):
    """Return the highest-scoring class id for every token, as a flat tensor.

    Args:
        logits: tensor whose last dimension holds the per-class scores.

    Returns:
        1-D tensor with one class id per row of the flattened logits.
    """
    # The class count is read from the logits themselves, so the helper works
    # for any model and does not rely on a module-level model object.
    flat_logits = logits.reshape(-1, logits.shape[-1])
    predictions = torch.argmax(flat_logits, dim=1)
    return predictions


def compute_precision(labels, logits, sumed_labels=None, sumed_predictions=None):
    """Token-level accuracy of ``logits`` against ``labels``, skipping positions labelled -100.

    Despite its name, the value returned is the one computed by
    ``accuracy_score``.

    Args:
        labels: tensor of target ids; positions equal to -100 are ignored.
        logits: model scores, turned into class ids by ``predictions_from_logits``.
        sumed_labels: optional list extended in place with the kept target ids
            (one 0-d tensor per token).
        sumed_predictions: optional list extended in place with the kept
            predicted ids (one 0-d tensor per token).

    Returns:
        Accuracy over the kept positions of this batch.
    """
    # The accumulators default to None instead of []: a list default is created
    # once and shared by every call, so callers that pass nothing (the training
    # loop) would keep every batch tensor alive for the whole run.
    targets = labels.view(-1)
    predictions = predictions_from_logits(logits)
    keep = targets != -100
    kept_targets = torch.masked_select(targets, keep)
    kept_predictions = torch.masked_select(predictions, keep)
    if sumed_labels is not None:
        sumed_labels.extend(kept_targets)
    if sumed_predictions is not None:
        sumed_predictions.extend(kept_predictions)
    return accuracy_score(kept_targets.cpu().numpy(), kept_predictions.cpu().numpy())


def train(max_grad_norm=10):
    """Run one training epoch and print the mean loss and mean batch accuracy.

    Works on the module-level ``bibref_parser_model``, ``optimizer`` and
    ``training_torch_dataloader``.

    Args:
        max_grad_norm: maximum gradient norm handed to ``clip_grad_norm_``.
    """
    total_loss, total_accuracy, steps_counter = 0.0, 0.0, 0
    bibref_parser_model.train()

    for batch in training_torch_dataloader:
        _, labels, result = forward(batch, bibref_parser_model)

        optimizer.zero_grad()
        result.loss.backward()
        # Clipping has to sit between backward() and step(): before backward()
        # it would only rescale the gradients of the previous batch, which
        # zero_grad() discards anyway, and the update would never be bounded.
        torch.nn.utils.clip_grad_norm_(
            parameters=bibref_parser_model.parameters(), max_norm=max_grad_norm
        )
        optimizer.step()

        total_loss += result.loss.item()
        total_accuracy += compute_precision(labels, result.logits)
        steps_counter += 1

    epoch_loss = total_loss / steps_counter
    precision = total_accuracy / steps_counter
    print(f"Loss: {epoch_loss}")
    print(f"Precision: {precision}")



In [None]:
# Drop cached CUDA memory before the training run starts.
torch.cuda.empty_cache()

# One call to train() per epoch; the banner shows a 1-based epoch number.
epoch_indices = range(TRAINING_EPOCHS)
for epoch in epoch_indices:
    print(f"Beginning epoch n°: {epoch + 1}")
    train()

In [None]:
def validate(model, loader):
    """Evaluate ``model`` on every batch of ``loader`` and print the mean loss and accuracy.

    Args:
        model: model to evaluate; switched to eval mode.
        loader: iterable of batches accepted by ``forward``.

    Returns:
        Tuple ``(labels, predictions)``: two flat lists of label names (via
        ``ids_to_labels``), one entry per token kept by ``compute_precision``.
    """
    model.eval()

    total_loss, total_accuracy, steps_counter = 0.0, 0.0, 0
    validation_predictions, validation_labels = [], []

    with torch.no_grad():
        for batch in loader:
            # Evaluate the model received as argument, not the module-level one,
            # so the model put in eval mode is the one actually measured.
            _, batch_labels, result = forward(batch, model)
            total_loss += result.loss.item()
            total_accuracy += compute_precision(batch_labels, result.logits,
                                                validation_labels, validation_predictions)
            steps_counter += 1

    labels = [ids_to_labels[label_id.item()] for label_id in validation_labels]
    predictions = [ids_to_labels[label_id.item()] for label_id in validation_predictions]

    loss = total_loss / steps_counter
    precision = total_accuracy / steps_counter
    print(f"Loss: {loss}")
    print(f"Accuracy: {precision}")

    return labels, predictions


In [None]:
# Evaluate the fine-tuned model on the held-out loader; keeps the flat lists of
# true and predicted label names for the report.
labels, predictions = validate(
    model=bibref_parser_model,
    loader=validation_torch_dataloader,
)

In [None]:
from seqeval.metrics import classification_report

# The whole validation set is passed to seqeval as one single sequence.
print(
    classification_report(
        y_true=[labels],
        y_pred=[predictions],
    )
)

In [None]:
sentence = 'Amdaoud M. , Arcuri G., Levratto N. (2021b) "Healthcare system and social trust in the fight against COVID-19: the case of France", European Journal of Public Health, Volume 31, Issue 4, August 2021, Pages 895–900, https://doi.org/10.1093/eurpub/ckab112'
words = sentence.split()

inputs = tokenizer(words,
                   is_split_into_words=True,
                   return_offsets_mapping=True,
                   padding='max_length',
                   truncation=True,
                   max_length=MAX_LENGTH,
                   return_tensors="pt")

# Pure inference: eval mode and no gradient tracking. The names used here differ
# from `labels` / `predictions` so the validation results kept in those
# variables are not overwritten when this cell runs.
bibref_parser_model.eval()
with torch.no_grad():
    input_ids, unused_labels, result = forward(inputs, bibref_parser_model)

predicted_label_ids = predictions_from_logits(result.logits)

tokens = tokenizer.convert_ids_to_tokens(input_ids.squeeze(0).tolist())
token_predictions = [ids_to_labels[label_id] for label_id in predicted_label_ids.tolist()]
offsets = inputs["offset_mapping"].squeeze(0).tolist()

# Keep the prediction of the first real piece of every word: start offset 0,
# non-empty span, and not the isolated '▁' piece the tokenizer may insert.
# Words cut off by truncation at MAX_LENGTH get no prediction.
predictions_by_word = [
    label
    for token, label, (start, end) in zip(tokens, token_predictions, offsets)
    if start == 0 and end != 0 and token != '▁'
]

print(words)
print(predictions_by_word)

In [None]:
import os

os.makedirs(MODEL_DIR, exist_ok=True)
# save_pretrained stores the vocabulary together with the tokenizer
# configuration and special-token files, so the tokenizer can be reloaded from
# MODEL_DIR alongside the model; save_vocabulary only writes the vocabulary.
tokenizer.save_pretrained(MODEL_DIR)
bibref_parser_model.save_pretrained(MODEL_DIR)