**French corpus annotation using xlm-roberta-base fine tuned model for NER on Chia dataset**

Now, once we have trained the NER model on the Chia dataset, we are going to annotate the french translations of the corpus using the model. 

Remember that the main hypothesis for this annotation strategy is that a multilingual model, such as xlm-roberta-base, fine-tuned for a specific task (NER) in one of the languages it has been trained on, will be able to perfom well in the rest of the languages it has seen during pretraining.

In [93]:
# uncomment if working in colab
# from google.colab import drive
# drive.mount('/content/drive')

In [94]:
# uncomment if using colab
# !pip install -q -U git+https://github.com/huggingface/transformers.git
# !pip install -q -U datasets
# !pip install -q -U git+https://github.com/huggingface/accelerate.git

In [95]:
from transformers import pipeline, AutoTokenizer
from datasets import Dataset
import torch
import os
import pandas as pd
from tqdm import tqdm
import numpy as np

In [96]:
# dict for the entities (entity to int value)
sel_ent = {
    "O": 0,
    "B-Condition": 1,
    "I-Condition": 2,
    "B-Value": 3,
    "I-Value": 4,
    "B-Drug": 5,
    "I-Drug": 6,
    "B-Procedure": 7,
    "I-Procedure": 8,
    "B-Measurement": 9,
    "I-Measurement": 10,
    "B-Temporal": 11,
    "I-Temporal": 12,
    "B-Observation": 13,
    "I-Observation": 14,
    "B-Person": 15,
    "I-Person": 16
}
entities_list = list(sel_ent.keys())
sel_ent_inv = {v: k for k, v in sel_ent.items()}

In [153]:
# paths
root_path = '..' # comment if working in colab
# root_path = './drive/MyDrive/HandsOn-NLP' # uncomment if working in colab
data_path = f'{root_path}/data'
french_data_path = f'{data_path}/chia_criteria_french'
french_annotated_path = f'{data_path}/chia_criteria_french_annotated'
models_path = f'{root_path}/models'

In [98]:
base_model = 'xlm-roberta-base'

In [99]:
# load model
model = torch.load(f'{models_path}/chia-multilingual-ner.pt')
# load tokenizer
tokenizer = AutoTokenizer.from_pretrained(base_model)

In [100]:
# get all french texts
files = os.listdir(french_data_path)
sentences = []

for file in files:
    with open(f'{french_data_path}/{file}', 'r') as f:
        sentences.extend([(file,sentence[:-1])  for sentence in f.readlines() if sentence != '\n'])
len(sentences)

296

In [101]:
# clean all sentences (remove spaces before special characters...)
import re

def clean_text(text):
    text = re.sub(r'\s([?.!"](?:\s|$))', r'\1', text)
    text = re.sub(r'\s([,;:])(?:\s|$)', r'\1', text)
    text = re.sub(r'\s([)(])', r'\1', text)
    text = re.sub(r'\s([/])\s', r'\1', text)
    text = re.sub(r'\s([%])\s', r'\1', text)
    text = re.sub(r'\s([=])\s', r'\1', text)
    text = re.sub(r'\s([-])\s', r'\1', text)
    text = re.sub(r'\s([+])\s', r'\1', text)
    text = re.sub(r'\s([*])\s', r'\1', text)
    return text

sentences = [(file, clean_text(sentence)) for file,sentence in sentences]

In [102]:
sentences[0][1].split(' ')

['Adolescent(',
 '10-21',
 'ans)',
 'en',
 'cours',
 'de',
 'fusion',
 'de',
 'la',
 'colonne',
 'vertébrale',
 'pour',
 'la',
 'scoliose',
 'idiopathique,',
 'la',
 'spondylolisthésis',
 'ou',
 'la',
 'kyphose',
 'de',
 'Scheuermann.']

In [103]:
data_french = []
for file, sentence in sentences:
    data_french.append({
        'tokens': sentence.split(),
        'file': file
    })
data_french = Dataset.from_pandas(pd.DataFrame(data_french))
data_french

Dataset({
    features: ['tokens', 'file'],
    num_rows: 296
})

In [104]:
def tokenize_sentence(sentence, tokenizer):
    """
    Tokenize a sentence using the tokenizer and keeps the word ids
    inputs:
        sentence: str, sentence to tokenize
        tokenizer: tokenizer, tokenizer to use
    outputs:
        tokenized_sentence: dict, tokenized sentence
    """
    tokenized_sentence = tokenizer(sentence, is_split_into_words=True, truncation=True, padding='max_length', max_length=512)
    words_ids  = []
    for i in range(len(sentence)):
        word_ids_sentence = tokenized_sentence.word_ids(batch_index=i)
        words_ids.append(word_ids_sentence)
    tokenized_sentence['word_ids'] = words_ids
    return tokenized_sentence



In [105]:
data_french = data_french.map(lambda x: tokenize_sentence(x['tokens'], tokenizer), batched=True)

Map:   0%|          | 0/296 [00:00<?, ? examples/s]

In [106]:
data_french

Dataset({
    features: ['tokens', 'file', 'input_ids', 'attention_mask', 'word_ids'],
    num_rows: 296
})

In [116]:
# get just the input_ids and attention_mask
data_for_model = data_french.remove_columns(['file', 'tokens', 'word_ids'])

In [118]:
data_loader = torch.utils.data.DataLoader(data_for_model, batch_size=16)

In [108]:
annotated_sentences = []

In [109]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)

XLMRobertaForTokenClassification(
  (roberta): XLMRobertaModel(
    (embeddings): XLMRobertaEmbeddings(
      (word_embeddings): Embedding(250002, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): XLMRobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x XLMRobertaLayer(
          (attention): XLMRobertaAttention(
            (self): XLMRobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): XLMRobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bi

In [120]:
labels = []
for batch in tqdm(data_loader):
    
    batch['input_ids'] = torch.LongTensor(np.column_stack(np.array(batch['input_ids']))).to(device)
    batch['attention_mask'] = torch.LongTensor(np.column_stack(np.array(batch['attention_mask']))).to(device)
    batch_tokenizer = {'input_ids': batch['input_ids'], 'attention_mask': batch['attention_mask']}
    # break
    with torch.no_grad():
        outputs = model(**batch_tokenizer)
    
    labels_batch = torch.argmax(outputs.logits, dim=2).to('cpu').numpy()
    labels.extend([list(labels_batch[i]) for i in range(labels_batch.shape[0])])
    
    del batch
    del outputs
    torch.cuda.empty_cache()

100%|██████████| 19/19 [01:02<00:00,  3.29s/it]


In [123]:
sentences[0]

('NCT02464813_inc.bio_fr.txt',
 'Adolescent( 10-21 ans) en cours de fusion de la colonne vertébrale pour la scoliose idiopathique, la spondylolisthésis ou la kyphose de Scheuermann.')

In [126]:
print(data_french[0]['word_ids'])

[None, 0, 0, 1, 1, 2, 2, 3, 4, 5, 6, 7, 8, 9, 9, 10, 10, 10, 10, 11, 12, 13, 13, 13, 14, 14, 14, 14, 14, 15, 16, 16, 16, 16, 16, 16, 16, 17, 18, 19, 19, 19, 20, 21, 21, 21, 21, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, N

In [127]:
print([entities_list[l] for l in labels[0]])

['I-Procedure', 'B-Person', 'I-Person', 'B-Value', 'I-Value', 'I-Value', 'O', 'O', 'O', 'O', 'B-Procedure', 'I-Procedure', 'I-Procedure', 'I-Procedure', 'I-Procedure', 'I-Procedure', 'I-Procedure', 'I-Procedure', 'I-Procedure', 'O', 'O', 'B-Condition', 'I-Condition', 'I-Condition', 'I-Condition', 'I-Condition', 'I-Condition', 'I-Condition', 'O', 'O', 'B-Condition', 'I-Condition', 'I-Condition', 'I-Condition', 'I-Condition', 'I-Condition', 'I-Condition', 'O', 'O', 'B-Condition', 'I-Condition', 'I-Condition', 'I-Condition', 'O', 'I-Condition', 'I-Condition', 'B-Condition', 'I-Condition', 'I-Condition', 'I-Condition', 'I-Condition', 'I-Condition', 'I-Condition', 'I-Condition', 'I-Condition', 'I-Condition', 'I-Condition', 'I-Condition', 'I-Condition', 'I-Condition', 'I-Condition', 'I-Condition', 'I-Condition', 'I-Condition', 'I-Condition', 'I-Condition', 'I-Condition', 'I-Condition', 'I-Condition', 'I-Condition', 'I-Condition', 'I-Condition', 'I-Condition', 'I-Condition', 'I-Condition', 'I

In [144]:
def annotate_sentences(dataset, labels, entities_list,criteria = 'first_label'):
    """
    Annotate the sentences with the predicted labels
    inputs:
        dataset: dataset, dataset with the sentences
        labels: list, list of labels
        entities_list: list, list of entities
        criteria: str, criteria to use to select the label when the words pices have different labels
            - first_label: select the first label
            - majority: select the label with the majority
    outputs:
        annotated_sentences: list, list of annotated sentences
    """
    annotated_sentences = []
    for i in range(len(dataset)):
        # get just the tokens different from None
        sentence = dataset[i]
        word_ids = sentence['word_ids']
        sentence_labels = labels[i]
        annotated_sentence = [[] for _ in range(len(dataset[i]['tokens']))]
        for word_id, label in zip(word_ids, sentence_labels):
            if word_id is not None:
                annotated_sentence[word_id].append(label)
        annotated_sentence_filtered = []
        if criteria == 'first_label':
            annotated_sentence_filtered = [annotated_sentence[i][0] for i in range(len(annotated_sentence))]
        elif criteria == 'majority':
            annotated_sentence_filtered = [max(set(annotated_sentence[i]), key=annotated_sentence[i].count) for i in range(len(annotated_sentence))]

        annotated_sentences.append(annotated_sentence_filtered)
    return annotated_sentences

In [149]:
annotated_sentences_first = annotate_sentences(data_french, labels, entities_list, criteria='first_label')

In [150]:
annotated_sentences_max = annotate_sentences(data_french, labels, entities_list, criteria='majority')

In [151]:
dataset_annotated = Dataset.from_dict({'tokens': data_french['tokens'], 'annotated_labels': annotated_sentences_first, 'annotated_labels_max': annotated_sentences_max, 'file': data_french['file']})

In [152]:
dataset_annotated

Dataset({
    features: ['tokens', 'annotated_labels', 'annotated_labels_max', 'file'],
    num_rows: 296
})

In [158]:
# generate the files
for file in set(dataset_annotated['file']):
    file_data = dataset_annotated.filter(lambda x: x['file'] == file)
    with open(f'{french_annotated_path}/{file}', 'w') as f:
        for sentence, labels, labels_max in zip(file_data['tokens'], file_data['annotated_labels'], file_data['annotated_labels_max']):
            for token, label, label_max in zip(sentence, labels, labels_max):
                f.write(f'{token} {entities_list[label]} {entities_list[label_max]}\n')
            f.write('\n')

Filter:   0%|          | 0/296 [00:00<?, ? examples/s]

Filter:   0%|          | 0/296 [00:00<?, ? examples/s]

Filter:   0%|          | 0/296 [00:00<?, ? examples/s]

Filter:   0%|          | 0/296 [00:00<?, ? examples/s]

Filter:   0%|          | 0/296 [00:00<?, ? examples/s]

Filter:   0%|          | 0/296 [00:00<?, ? examples/s]

Filter:   0%|          | 0/296 [00:00<?, ? examples/s]

Filter:   0%|          | 0/296 [00:00<?, ? examples/s]

Filter:   0%|          | 0/296 [00:00<?, ? examples/s]

Filter:   0%|          | 0/296 [00:00<?, ? examples/s]

Filter:   0%|          | 0/296 [00:00<?, ? examples/s]

Filter:   0%|          | 0/296 [00:00<?, ? examples/s]

Filter:   0%|          | 0/296 [00:00<?, ? examples/s]

Filter:   0%|          | 0/296 [00:00<?, ? examples/s]

Filter:   0%|          | 0/296 [00:00<?, ? examples/s]

Filter:   0%|          | 0/296 [00:00<?, ? examples/s]

Filter:   0%|          | 0/296 [00:00<?, ? examples/s]

Filter:   0%|          | 0/296 [00:00<?, ? examples/s]

Filter:   0%|          | 0/296 [00:00<?, ? examples/s]

Filter:   0%|          | 0/296 [00:00<?, ? examples/s]

Filter:   0%|          | 0/296 [00:00<?, ? examples/s]

Filter:   0%|          | 0/296 [00:00<?, ? examples/s]

Filter:   0%|          | 0/296 [00:00<?, ? examples/s]

Filter:   0%|          | 0/296 [00:00<?, ? examples/s]

Filter:   0%|          | 0/296 [00:00<?, ? examples/s]

Filter:   0%|          | 0/296 [00:00<?, ? examples/s]

Filter:   0%|          | 0/296 [00:00<?, ? examples/s]

Filter:   0%|          | 0/296 [00:00<?, ? examples/s]

Filter:   0%|          | 0/296 [00:00<?, ? examples/s]

Filter:   0%|          | 0/296 [00:00<?, ? examples/s]

Filter:   0%|          | 0/296 [00:00<?, ? examples/s]

Filter:   0%|          | 0/296 [00:00<?, ? examples/s]

Filter:   0%|          | 0/296 [00:00<?, ? examples/s]

Filter:   0%|          | 0/296 [00:00<?, ? examples/s]

Filter:   0%|          | 0/296 [00:00<?, ? examples/s]

Filter:   0%|          | 0/296 [00:00<?, ? examples/s]

Filter:   0%|          | 0/296 [00:00<?, ? examples/s]

Filter:   0%|          | 0/296 [00:00<?, ? examples/s]

Filter:   0%|          | 0/296 [00:00<?, ? examples/s]

Filter:   0%|          | 0/296 [00:00<?, ? examples/s]

Filter:   0%|          | 0/296 [00:00<?, ? examples/s]

Filter:   0%|          | 0/296 [00:00<?, ? examples/s]

Filter:   0%|          | 0/296 [00:00<?, ? examples/s]

Filter:   0%|          | 0/296 [00:00<?, ? examples/s]

Filter:   0%|          | 0/296 [00:00<?, ? examples/s]

Filter:   0%|          | 0/296 [00:00<?, ? examples/s]

Filter:   0%|          | 0/296 [00:00<?, ? examples/s]

Filter:   0%|          | 0/296 [00:00<?, ? examples/s]

Filter:   0%|          | 0/296 [00:00<?, ? examples/s]