# Imports


In [1]:
!pip install transformers
!pip install sacremoses
!pip install sentencepiece
!pip install seqeval

Collecting transformers
  Downloading transformers-4.11.3-py3-none-any.whl (2.9 MB)
[K     |████████████████████████████████| 2.9 MB 5.4 MB/s 
[?25hCollecting sacremoses
  Downloading sacremoses-0.0.46-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 49.8 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 67.3 MB/s 
Collecting huggingface-hub>=0.0.17
  Downloading huggingface_hub-0.0.19-py3-none-any.whl (56 kB)
[K     |████████████████████████████████| 56 kB 4.1 MB/s 
Collecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 37.2 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers
  Attempti

In [2]:
#torch and tranformers for model and training
import torch  
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from torch.utils.data import TensorDataset
from transformers import XLMRobertaTokenizerFast              
from transformers import XLMRobertaForTokenClassification
from transformers import AdamW                            
from transformers import get_linear_schedule_with_warmup
from transformers import DataCollatorForTokenClassification
from transformers import Trainer, TrainingArguments
import sentencepiece

#sklearn for evaluation
from sklearn import preprocessing                       
from sklearn.metrics import classification_report        
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import ParameterGrid         
from sklearn.model_selection import ParameterSampler      
from sklearn.utils.fixes import loguniform

#nlp preprocessing
from nltk import ngrams                                 
from spacy.pipeline import SentenceSegmenter
from spacy.lang.en import English
from spacy.pipeline import Sentencizer
from sacremoses import MosesTokenizer, MosesDetokenizer


#utilities
from seqeval.metrics import accuracy_score, f1_score, precision_score, recall_score
import pandas as pd
import glob, os
import time
import datetime
import random
import numpy as np
import matplotlib.pyplot as plt
% matplotlib inline
import seaborn as sns
import pickle         # for saving data structures
from pynvml import *  # for checking gpu memory

In [3]:
# connect to GPU 
device = torch.device('cuda')

print('Connected to GPU:', torch.cuda.get_device_name(0))

Connected to GPU: Tesla P100-PCIE-16GB


# Prepare Data

In [9]:
%cd /content/drive/MyDrive/2-Job/Product\ and\ Code/TermExtraction/ACTER-master

/content/drive/MyDrive/2-Job/Product and Code/TermExtraction/ACTER-master


In [10]:
!ls

ACTER-master


Training Data: corp, wind

Validation Data: equi

Test Data: htfl

In [12]:
#load terms

#en
df_corp_terms_en=pd.read_csv('ACTER-master/en/corp/annotations/corp_en_terms_nes.ann', delimiter="\t", names=["Term", "Label"])  
df_equi_terms_en=pd.read_csv('ACTER-master/en/equi/annotations/equi_en_terms_nes.ann', delimiter="\t", names=["Term", "Label"])  
df_htfl_terms_en=pd.read_csv('ACTER-master/en/htfl/annotations/htfl_en_terms_nes.ann', delimiter="\t", names=["Term", "Label"])  
df_wind_terms_en=pd.read_csv('ACTER-master/en/wind/annotations/wind_en_terms_nes.ann', delimiter="\t", names=["Term", "Label"])  

#fr
df_corp_terms_fr=pd.read_csv('ACTER-master/fr/corp/annotations/corp_fr_terms_nes.ann', delimiter="\t", names=["Term", "Label"])  
df_equi_terms_fr=pd.read_csv('ACTER-master/fr/equi/annotations/equi_fr_terms_nes.ann', delimiter="\t", names=["Term", "Label"])  
df_htfl_terms_fr=pd.read_csv('ACTER-master/fr/htfl/annotations/htfl_fr_terms_nes.ann', delimiter="\t", names=["Term", "Label"])  
df_wind_terms_fr=pd.read_csv('ACTER-master/fr/wind/annotations/wind_fr_terms_nes.ann', delimiter="\t", names=["Term", "Label"])  

#nl
df_corp_terms_nl=pd.read_csv('ACTER-master/nl/corp/annotations/corp_nl_terms_nes.ann', delimiter="\t", names=["Term", "Label"])  
df_equi_terms_nl=pd.read_csv('ACTER-master/nl/equi/annotations/equi_nl_terms_nes.ann', delimiter="\t", names=["Term", "Label"])  
df_htfl_terms_nl=pd.read_csv('ACTER-master/nl/htfl/annotations/htfl_nl_terms_nes.ann', delimiter="\t", names=["Term", "Label"])  
df_wind_terms_nl=pd.read_csv('ACTER-master/nl/wind/annotations/wind_nl_terms_nes.ann', delimiter="\t", names=["Term", "Label"])  

labels=["Random", "Term"]

In [13]:
# show dataframe
df_wind_terms_en

Unnamed: 0,Term,Label
0,48/600,Named_Entity
1,4energia,Named_Entity
2,4energy,Named_Entity
3,"ab ""lietuvos energija""",Named_Entity
4,ab lietuvos elektrine,Named_Entity
...,...,...
1529,zhiquan,Named_Entity
1530,çetinkaya,Named_Entity
1531,çeti̇nkaya,Named_Entity
1532,çeşme,Named_Entity


**Functions for preprocessing and creating of Training Data**

In [14]:
#load all text files from folder into a string
def load_text_corpus(path):
  text_data=""
  print(glob.glob(path))
  for file in glob.glob(path+"*.txt"):
      print(file)
      with open(file) as f:
        temp_data = f.read()
        print(len(temp_data))
        text_data=text_data+" "+temp_data
  print(len(text_data))
  return text_data

In [41]:
#split in sentences and tokenize
def preprocess(text):
  #sentenize (from spacy)
  sentencizer = Sentencizer()
  nlp = English()
  nlp.add_pipe(sentencizer)
  doc = nlp(text)

  #tokenize
  sentence_list=[]
  mt = MosesTokenizer(lang='en')
  for s in doc.sents:
    tokenized_text = mt.tokenize(s, return_str=True)    #append tuple of tokens and original sentence
    sentence_list.append((tokenized_text.split(), s))     
  return sentence_list


In [51]:
#find indices of a sublist sub in a list l
def find_sub_list(subl,l):
    results=[]
    subllen=len(subl)
    for ind in (i for i,e in enumerate(l) if e==subl[0]):
        if l[ind:ind+subllen]==subl:
            results.append((ind,ind+subllen-1))

    return results

In [53]:
#input is list of sentences and dataframe containing terms
def create_training_data(sentence_list, df_terms, n):

  #create empty dataframe
  training_data = []

  md = MosesDetokenizer(lang='en')

  print(len(sentence_list))
  count=0

  for sen in sentence_list:
    count+=1
    s=sen[0]  #take first part of tuple, i.e. the tokens
    if count%100==0:
      print(count)

    #create label list, with "n" for non-terms, "B-T" for beginning of a term and "T" for the continuation of a term
    tags=["n"]*len(s)

    # 1-gram up to n-gram
    for i in range(1,min(n+1, len(s))):
      #create n-grams of this sentence

      n_grams = ngrams(s, i)

      #look if n-grams are in the annotation dataset
      for n_gram in n_grams: 
        n_gram_aslist=list(n_gram)
        n_gram=md.detokenize(n_gram)
        #print(n_gram) 
        context=str(sen[1]).strip()
        #if yes add an entry to the training data
        if n_gram.lower() in df_terms.values:
          #check where n_gram is in sentence and annotate it 
          #print(n_gram_aslist,s)
          sublist_indices=find_sub_list(n_gram_aslist, s)
          for indices in sublist_indices:
            for ind in range(indices[0],indices[1]+1):
              #if term start
              if ind==indices[0]:
                tags[ind]="B-T"
              #if continuation of a Term
              else: 
                tags[ind]="T"

    training_data.append((s,tags))
        

  return training_data

  

**Create Training Data**

In [54]:
#create trainings data for all corp texts
corp_text_en=load_text_corpus("ACTER-master/en/corp/texts/annotated/") # load text
corp_s_list=preprocess(corp_text_en)                                                # preprocess
#print(corp_s_list)
train_data_corp_en=create_training_data(corp_s_list, df_corp_terms_en, 6)           # create training data

['ACTER-master/en/corp/texts/annotated/']
ACTER-master/en/corp/texts/annotated/corp_en_01.txt
22924
ACTER-master/en/corp/texts/annotated/corp_en_02.txt
10531
ACTER-master/en/corp/texts/annotated/corp_en_06.txt
68867
ACTER-master/en/corp/texts/annotated/corp_en_12.txt
39055
ACTER-master/en/corp/texts/annotated/corp_en_19.txt
9404
ACTER-master/en/corp/texts/annotated/corp_en_04.txt
5408
ACTER-master/en/corp/texts/annotated/corp_en_10.txt
2280
ACTER-master/en/corp/texts/annotated/corp_en_08.txt
26854
ACTER-master/en/corp/texts/annotated/corp_en_11.txt
43972
ACTER-master/en/corp/texts/annotated/corp_en_09.txt
11955
ACTER-master/en/corp/texts/annotated/corp_en_05.txt
5911
ACTER-master/en/corp/texts/annotated/corp_en_07.txt
50648
297821
1594
100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500


In [57]:
train_data_corp_en

[(['Corruption', '?'], ['B-T', 'n']),
 (['Not',
   'in',
   'our',
   'company',
   '…',
   'Preventing',
   'corruption',
   'in',
   'corporate',
   'life',
   'Preface',
   'Conscious',
   'of',
   'its',
   'central',
   'position',
   'within',
   'the',
   'European',
   'Union',
   ',',
   'Belgium',
   'has',
   ',',
   'for',
   'many',
   'years',
   ',',
   'taken',
   'a',
   'firm',
   'line',
   'against',
   'corruption',
   'in',
   'national',
   'and',
   'international',
   'transactions',
   '.'],
  ['n',
   'n',
   'n',
   'B-T',
   'n',
   'n',
   'B-T',
   'n',
   'B-T',
   'n',
   'n',
   'n',
   'n',
   'n',
   'n',
   'n',
   'n',
   'n',
   'B-T',
   'T',
   'n',
   'B-T',
   'n',
   'n',
   'n',
   'n',
   'n',
   'n',
   'n',
   'n',
   'n',
   'n',
   'n',
   'B-T',
   'n',
   'n',
   'n',
   'n',
   'B-T',
   'n']),
 (['For',
   'this',
   'purpose',
   ',',
   'a',
   'major',
   'reform',
   'was',
   'carried',
   'out',
   'at',
   'the',
   'end',
  

In [58]:
#create trainings data for all wind texts
wind_text_en=load_text_corpus("ACTER-master/en/wind/texts/annotated/") # load text
wind_s_list=preprocess(wind_text_en)                                                # preprocess
train_data_wind_en=create_training_data(wind_s_list, df_wind_terms_en, 6)           # create training data

['ACTER-master/en/wind/texts/annotated/']
ACTER-master/en/wind/texts/annotated/wind_en_04.txt
27665
ACTER-master/en/wind/texts/annotated/wind_en_01.txt
167235
ACTER-master/en/wind/texts/annotated/wind_en_32.txt
11613
ACTER-master/en/wind/texts/annotated/wind_en_26.txt
10284
ACTER-master/en/wind/texts/annotated/wind_en_02.txt
89979
306781
1855
100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800


In [59]:
#create trainings data for all equi texts
equi_text_en=load_text_corpus("ACTER-master/en/equi/texts/annotated/") # load text
equi_s_list=preprocess(equi_text_en)                                                # preprocess
train_data_equi_en=create_training_data(equi_s_list, df_equi_terms_en, 6)           # create training data

['ACTER-master/en/equi/texts/annotated/']
ACTER-master/en/equi/texts/annotated/equi_en_009.txt
2520
ACTER-master/en/equi/texts/annotated/equi_en_008.txt
10706
ACTER-master/en/equi/texts/annotated/equi_en_004.txt
5133
ACTER-master/en/equi/texts/annotated/equi_en_007.txt
3045
ACTER-master/en/equi/texts/annotated/equi_en_006.txt
910
ACTER-master/en/equi/texts/annotated/equi_en_003.txt
9926
ACTER-master/en/equi/texts/annotated/equi_en_005.txt
2460
ACTER-master/en/equi/texts/annotated/equi_en_001.txt
33853
ACTER-master/en/equi/texts/annotated/equi_en_002.txt
30169
ACTER-master/en/equi/texts/annotated/equi_en_013.txt
6975
ACTER-master/en/equi/texts/annotated/equi_en_014.txt
5305
ACTER-master/en/equi/texts/annotated/equi_en_015.txt
4351
ACTER-master/en/equi/texts/annotated/equi_en_021.txt
15262
ACTER-master/en/equi/texts/annotated/equi_en_034.txt
13186
ACTER-master/en/equi/texts/annotated/equi_en_022.txt
2300
ACTER-master/en/equi/texts/annotated/equi_en_026.txt
14982
ACTER-master/en/equi/text

In [60]:
#create trainings data for all htfl texts
htfl_text_en=load_text_corpus("ACTER-master/en/htfl/texts/annotated/") # load text
htfl_s_list=preprocess(htfl_text_en)                                                # preprocess
train_data_htfl_en=create_training_data(htfl_s_list, df_htfl_terms_en, 6)           # create training data 

['ACTER-master/en/htfl/texts/annotated/']
ACTER-master/en/htfl/texts/annotated/htfl_en_006.txt
847
ACTER-master/en/htfl/texts/annotated/htfl_en_003.txt
1193
ACTER-master/en/htfl/texts/annotated/htfl_en_024.txt
1713
ACTER-master/en/htfl/texts/annotated/htfl_en_018.txt
750
ACTER-master/en/htfl/texts/annotated/htfl_en_001.txt
1181
ACTER-master/en/htfl/texts/annotated/htfl_en_009.txt
2867
ACTER-master/en/htfl/texts/annotated/htfl_en_014.txt
1890
ACTER-master/en/htfl/texts/annotated/htfl_en_010.txt
617
ACTER-master/en/htfl/texts/annotated/htfl_en_030.txt
1772
ACTER-master/en/htfl/texts/annotated/htfl_en_021.txt
990
ACTER-master/en/htfl/texts/annotated/htfl_en_022.txt
451
ACTER-master/en/htfl/texts/annotated/htfl_en_002.txt
1776
ACTER-master/en/htfl/texts/annotated/htfl_en_026.txt
2030
ACTER-master/en/htfl/texts/annotated/htfl_en_023.txt
1554
ACTER-master/en/htfl/texts/annotated/htfl_en_004.txt
1764
ACTER-master/en/htfl/texts/annotated/htfl_en_027.txt
1799
ACTER-master/en/htfl/texts/annotate

In [61]:
#fr
corp_text_fr=load_text_corpus("ACTER-master/fr/corp/texts/annotated/") # load text
corp_s_list=preprocess(corp_text_fr)                                                # preprocess
train_data_corp_fr=create_training_data(corp_s_list, df_corp_terms_fr, 6)           # create training data

wind_text_fr=load_text_corpus("ACTER-master/fr/wind/texts/annotated/") # load text
wind_s_list=preprocess(wind_text_fr)                                                # preprocess
train_data_wind_fr=create_training_data(wind_s_list, df_wind_terms_fr, 6)           # create training data

equi_text_fr=load_text_corpus("ACTER-master/fr/equi/texts/annotated/") # load text
equi_s_list=preprocess(equi_text_fr)                                                # preprocess
train_data_equi_fr=create_training_data(equi_s_list, df_equi_terms_fr, 6)           # create training data

htfl_text_fr=load_text_corpus("ACTER-master/fr/htfl/texts/annotated/") # load text
htfl_s_list=preprocess(htfl_text_fr)                                                # preprocess
train_data_htfl_fr=create_training_data(htfl_s_list, df_htfl_terms_fr, 6)           # create training data 

['ACTER-master/fr/corp/texts/annotated/']
ACTER-master/fr/corp/texts/annotated/corp_fr_08.txt
29365
ACTER-master/fr/corp/texts/annotated/corp_fr_06.txt
79134
ACTER-master/fr/corp/texts/annotated/corp_fr_02.txt
10756
ACTER-master/fr/corp/texts/annotated/corp_fr_11.txt
49155
ACTER-master/fr/corp/texts/annotated/corp_fr_04.txt
6115
ACTER-master/fr/corp/texts/annotated/corp_fr_10.txt
2345
ACTER-master/fr/corp/texts/annotated/corp_fr_07.txt
58901
ACTER-master/fr/corp/texts/annotated/corp_fr_19.txt
10162
ACTER-master/fr/corp/texts/annotated/corp_fr_12.txt
44073
ACTER-master/fr/corp/texts/annotated/corp_fr_05.txt
6539
ACTER-master/fr/corp/texts/annotated/corp_fr_01.txt
25577
ACTER-master/fr/corp/texts/annotated/corp_fr_09.txt
13267
335401
1551
100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
['ACTER-master/fr/wind/texts/annotated/']
ACTER-master/fr/wind/texts/annotated/wind_fr_06.txt
330323
ACTER-master/fr/wind/texts/annotated/wind_fr_10.txt
29655
359980
1944
100
200
300
400


In [62]:
#nl
corp_text_nl=load_text_corpus("ACTER-master/nl/corp/texts/annotated/") # load text
corp_s_list=preprocess(corp_text_nl)                                                # preprocess
train_data_corp_nl=create_training_data(corp_s_list, df_corp_terms_nl, 6)           # create training data

wind_text_nl=load_text_corpus("ACTER-master/nl/wind/texts/annotated/") # load text
wind_s_list=preprocess(wind_text_nl)                                                # preprocess
train_data_wind_nl=create_training_data(wind_s_list, df_wind_terms_nl, 6)           # create training data

equi_text_nl=load_text_corpus("ACTER-master/nl/equi/texts/annotated/") # load text
equi_s_list=preprocess(equi_text_nl)                                                # preprocess
train_data_equi_nl=create_training_data(equi_s_list, df_equi_terms_nl, 6)           # create training data

htfl_text_nl=load_text_corpus("ACTER-master/nl/htfl/texts/annotated/") # load text
htfl_s_list=preprocess(htfl_text_nl)                                                # preprocess
train_data_htfl_nl=create_training_data(htfl_s_list, df_htfl_terms_nl, 6)           # create training data 

['ACTER-master/nl/corp/texts/annotated/']
ACTER-master/nl/corp/texts/annotated/corp_nl_02.txt
11248
ACTER-master/nl/corp/texts/annotated/corp_nl_10.txt
2468
ACTER-master/nl/corp/texts/annotated/corp_nl_04.txt
5932
ACTER-master/nl/corp/texts/annotated/corp_nl_01.txt
24728
ACTER-master/nl/corp/texts/annotated/corp_nl_08.txt
28169
ACTER-master/nl/corp/texts/annotated/corp_nl_06.txt
77720
ACTER-master/nl/corp/texts/annotated/corp_nl_11.txt
47996
ACTER-master/nl/corp/texts/annotated/corp_nl_05.txt
6830
ACTER-master/nl/corp/texts/annotated/corp_nl_19.txt
10614
ACTER-master/nl/corp/texts/annotated/corp_nl_12.txt
43662
ACTER-master/nl/corp/texts/annotated/corp_nl_07.txt
58167
ACTER-master/nl/corp/texts/annotated/corp_nl_09.txt
13009
330555
1691
100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
['ACTER-master/nl/wind/texts/annotated/']
ACTER-master/nl/wind/texts/annotated/wind_nl_13.txt
10181
ACTER-master/nl/wind/texts/annotated/wind_nl_18.txt
10682
ACTER-master/nl/wind/tex

In [82]:
#concat trainingsdata
trainings_data = train_data_corp_en + train_data_wind_en

val_data = train_data_equi_en + train_data_equi_fr + train_data_equi_nl
val_data_en = train_data_equi_en
val_data_fr = train_data_equi_fr
val_data_nl = train_data_equi_nl

test_data = train_data_htfl_en + train_data_htfl_fr + train_data_htfl_nl
test_data_en = train_data_htfl_en
test_data_fr = train_data_htfl_fr
test_data_nl = train_data_htfl_nl

gold_set_for_validation = set(df_equi_terms_en["Term"]).union(set(df_equi_terms_fr["Term"])).union(set(df_equi_terms_nl["Term"])) 
gold_set_for_test = set(df_htfl_terms_en["Term"]).union(set(df_htfl_terms_fr["Term"])).union(set(df_htfl_terms_nl["Term"])) 


print(len(trainings_data))
print(len(val_data))
print(len(test_data))

3449
7978
6416


In [64]:
#seperate tokens and tags

#train
train_tags=[tup[1] for tup in trainings_data]
train_texts=[tup[0] for tup in trainings_data]

#val
val_tags=[tup[1] for tup in val_data]
val_texts=[tup[0] for tup in val_data]

val_tags_en=[tup[1] for tup in val_data_en]
val_texts_en=[tup[0] for tup in val_data_en]

val_tags_fr=[tup[1] for tup in val_data_fr]
val_texts_fr=[tup[0] for tup in val_data_fr]

val_tags_nl=[tup[1] for tup in val_data_nl]
val_texts_nl=[tup[0] for tup in val_data_nl]

#test
test_tags=[tup[1] for tup in test_data]
test_texts=[tup[0] for tup in test_data]

test_tags_en=[tup[1] for tup in test_data_en]
test_texts_en=[tup[0] for tup in test_data_en]

test_tags_fr=[tup[1] for tup in test_data_fr]
test_texts_fr=[tup[0] for tup in test_data_fr]

test_tags_nl=[tup[1] for tup in test_data_nl]
test_texts_nl=[tup[0] for tup in test_data_nl]

# Tokenize 

In [65]:
tokenizer = XLMRobertaTokenizerFast.from_pretrained("xlm-roberta-base")

Downloading:   0%|          | 0.00/4.83M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/8.68M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/512 [00:00<?, ?B/s]

In [66]:
#align labels with tokenization from XLM-R
label_list=["n", "B-T", "T"]
label_to_id = {l: i for i, l in enumerate(label_list)}
num_labels=len(label_list)

def tokenize_and_align_labels(texts, tags):
  tokenized_inputs = tokenizer(
      texts,
      padding=True,
      truncation=True,
      # We use this argument because the texts in our dataset are lists of words (with a label for each word).
      is_split_into_words=True,
  )
  labels = []
  for i, label in enumerate(tags):
      word_ids = tokenized_inputs.word_ids(batch_index=i)
      previous_word_idx = None
      label_ids = []
      for word_idx in word_ids:
          # Special tokens have a word id that is None. We set the label to -100 so they are automatically
          # ignored in the loss function.
          if word_idx is None:
              label_ids.append(-100)
          # We set the label for the first token of each word.
          elif word_idx != previous_word_idx:
              label_ids.append(label_to_id[label[word_idx]])
          # For the other tokens in a word, we set the label to either the current label or -100, depending on
          # the label_all_tokens flag.
          else:
              label_ids.append(-100)
          previous_word_idx = word_idx

      labels.append(label_ids)
  tokenized_inputs["labels"] = labels
  return tokenized_inputs  


train_input_and_labels = tokenize_and_align_labels(train_texts, train_tags)

val_input_and_labels = tokenize_and_align_labels(val_texts, val_tags)
val_input_and_labels_en = tokenize_and_align_labels(val_texts_en, val_tags_en)
val_input_and_labels_fr = tokenize_and_align_labels(val_texts_fr, val_tags_fr)
val_input_and_labels_nl = tokenize_and_align_labels(val_texts_nl, val_tags_nl)

test_input_and_labels = tokenize_and_align_labels(test_texts, test_tags)
test_input_and_labels_en = tokenize_and_align_labels(test_texts_en, test_tags_en)
test_input_and_labels_fr = tokenize_and_align_labels(test_texts_fr, test_tags_fr)
test_input_and_labels_nl = tokenize_and_align_labels(test_texts_nl, test_tags_nl)



In [67]:
# create dataset that can be used for training with the huggingface trainer
class OurDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = OurDataset(train_input_and_labels, train_input_and_labels["labels"])

val_dataset = OurDataset(val_input_and_labels, val_input_and_labels["labels"])
val_dataset_en = OurDataset(val_input_and_labels_en, val_input_and_labels_en["labels"])
val_dataset_fr = OurDataset(val_input_and_labels_fr, val_input_and_labels_fr["labels"])
val_dataset_nl = OurDataset(val_input_and_labels_nl, val_input_and_labels_nl["labels"])

test_dataset = OurDataset(test_input_and_labels, test_input_and_labels["labels"])
test_dataset_en = OurDataset(test_input_and_labels_en, test_input_and_labels_en["labels"])
test_dataset_fr = OurDataset(test_input_and_labels_fr, test_input_and_labels_fr["labels"])
test_dataset_nl = OurDataset(test_input_and_labels_nl, test_input_and_labels_nl["labels"])

# Training

In [68]:
# return the extracted terms given the token level prediction and the original texts

def extract_terms(token_predictions, val_texts):
  extracted_terms = set()
  # go over all predictions
  for i in range(len(token_predictions)):
    pred = token_predictions[i]
    txt  = val_texts[i]
    for j in range(len(pred)):
      # if right tag build term and add it to the set otherwise just continue
      if pred[j]=="B-T":
        term=txt[j]
        for k in range(j+1,len(pred)):
          if pred[k]=="T": term+=" "+txt[k]
          else: break
        extracted_terms.add(term)
  return extracted_terms

In [69]:
#compute the metrics TermEval style for Trainer
# this function always uses the val sets. thus, for the test set you need an additional function or exchange the marked values

def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    # Remove ignored index (special tokens)
    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    extracted_terms=extract_terms(true_predictions, val_texts) # VAL set hardcoded
    extracted_terms = set([item.lower() for item in extracted_terms])
    gold_set=gold_set_for_validation      # VAL set hardcoded

    true_pos=extracted_terms.intersection(gold_set)
    recall=len(true_pos)/len(gold_set)
    precision=len(true_pos)/len(extracted_terms)

    return {
        "precision": precision,
        "recall": recall,
        "f1": 2*(precision*recall)/(precision+recall),
    }

In [72]:
# training arguments

training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=1,              # total # of training epochs
    per_device_train_batch_size=8,  # batch size per device during training
    per_device_eval_batch_size=16,   # batch size for evaluation
    warmup_steps=0,                  # number of warmup steps for learning rate scheduler
    weight_decay=0,                  # strength of weight decay
    learning_rate=2e-5,
    logging_dir='./logs',            # directory for storing logs
    evaluation_strategy= "steps",     # or "epoch"
    eval_steps=100,
    #save_total_limit=1,
    load_best_model_at_end=True,   #loads the model with the best evaluation score
    metric_for_best_model="f1",
    greater_is_better=True
)

In [73]:
# initialize model
model = XLMRobertaForTokenClassification.from_pretrained("xlm-roberta-base", num_labels=num_labels)


Downloading:   0%|          | 0.00/1.04G [00:00<?, ?B/s]

Some weights of the model checkpoint at xlm-roberta-base were not used when initializing XLMRobertaForTokenClassification: ['lm_head.decoder.weight', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.bias']
- This IS expected if you are initializing XLMRobertaForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-st

In [74]:
# initialize huggingface trainer
trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
    )

In [75]:
# train
trainer.train()

***** Running training *****
  Num examples = 3449
  Num Epochs = 1
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 432


Step,Training Loss,Validation Loss,Precision,Recall,F1
100,No log,0.454291,0.496093,0.544074,0.518977
200,No log,0.488664,0.56633,0.536239,0.550874
300,No log,0.494014,0.579799,0.508815,0.541993
400,No log,0.533663,0.600605,0.486043,0.537285


***** Running Evaluation *****
  Num examples = 7978
  Batch size = 16
Saving model checkpoint to ./results/checkpoint-100
Configuration saved in ./results/checkpoint-100/config.json
Model weights saved in ./results/checkpoint-100/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-100/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-100/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 7978
  Batch size = 16
Saving model checkpoint to ./results/checkpoint-200
Configuration saved in ./results/checkpoint-200/config.json
Model weights saved in ./results/checkpoint-200/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-200/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-200/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 7978
  Batch size = 16
Saving model checkpoint to ./results/checkpoint-300
Configuration saved in ./results/checkpoint-300/config.json
Model w

TrainOutput(global_step=432, training_loss=0.1884145736694336, metrics={'train_runtime': 878.5197, 'train_samples_per_second': 3.926, 'train_steps_per_second': 0.492, 'total_flos': 901220661789696.0, 'train_loss': 0.1884145736694336, 'epoch': 1.0})

# Test Set Evaluation

In [79]:
#compute the metrics TermEval style for Trainer

def compute_metrics_test(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    # Remove ignored index (special tokens)
    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    extracted_terms=extract_terms(true_predictions, test_texts) # VAL set hardcoded
    extracted_terms = set([item.lower() for item in extracted_terms])
    gold_set=gold_set_for_test      # VAL set hardcoded

    true_pos=extracted_terms.intersection(gold_set)
    recall=len(true_pos)/len(gold_set)
    precision=len(true_pos)/len(extracted_terms)

    return {
        "precision": precision,
        "recall": recall,
        "f1": 2*(precision*recall)/(precision+recall),
    }

In [80]:
# initialize huggingface trainer
trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics_test,
    )

In [83]:
#test
test_predictions, test_labels, test_metrics = trainer.predict(test_dataset)
test_predictions = np.argmax(test_predictions, axis=2)
# Remove ignored index (special tokens)
true_test_predictions = [
    [label_list[p] for (p, l) in zip(test_prediction, test_label) if l != -100]
    for test_prediction, test_label in zip(test_predictions, test_labels)
]

***** Running Prediction *****
  Num examples = 6416
  Batch size = 16


In [85]:
# example output
i=1
print('{:>10}  {:>10}  {:>10}'.format("Text", "Label", "Prediction"))
for j in range(len(true_test_predictions[i])):
  print('{:>10}  {:>10}  {:>10}'.format(test_texts[i][j], test_tags[i][j], true_test_predictions[i][j]))

      Text       Label  Prediction
Management           n           n
        of           n           n
  patients         B-T           n
      with           n           n
     heart         B-T         B-T
   failure           T           T
         (           n           n
        HF         B-T         B-T
         )           n           n
         ,           n           n
  stemming           n           n
      from           n           n
  ischemic         B-T         B-T
       and           n           n
nonischemic         B-T         B-T
cardiomyopathies           T         B-T
         ,           n           n
 continues           n           n
        to           n           n
        be           n           n
problematic           n           n
         ,           n           n
   despite           n           n
       the           n           n
   inroads           n           n
      made           n           n
        in           n           n
       the  

In [86]:
def computeTermEvalMetrics(extracted_terms, gold_df):
  #make lower case cause gold standard is lower case
  extracted_terms = set([item.lower() for item in extracted_terms])
  gold_set=set(gold_df)
  true_pos=extracted_terms.intersection(gold_set)
  recall=len(true_pos)/len(gold_set)
  precision=len(true_pos)/len(extracted_terms)

  print("Intersection",len(true_pos))
  print("Gold",len(gold_set))
  print("Extracted",len(extracted_terms))
  print("Recall:", recall)
  print("Precision:", precision)
  print("F1:", 2*(precision*recall)/(precision+recall))

In [87]:
test_extracted_terms = extract_terms(true_test_predictions, test_texts)

In [88]:
computeTermEvalMetrics(test_extracted_terms, set(df_htfl_terms_en["Term"]).union(set(df_htfl_terms_fr["Term"])).union(set(df_htfl_terms_nl["Term"])))

Intersection 3944
Gold 6900
Extracted 6326
Recall: 0.5715942028985507
Precision: 0.6234587417009169
F1: 0.596401028277635
