<a href="https://colab.research.google.com/github/lwachowiak/Term-Extraction-With-Language-Models/blob/main/Term_Extraction_Token_Classifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Imports


In [None]:
!pip install transformers
!pip install sacremoses
!pip install sentencepiece
!pip install seqeval

In [4]:
#torch and transformers for model and training
import torch  
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from torch.utils.data import TensorDataset
from transformers import XLMRobertaTokenizerFast              
from transformers import XLMRobertaForTokenClassification
from transformers import AdamW                            
from transformers import get_linear_schedule_with_warmup
from transformers import DataCollatorForTokenClassification
from transformers import Trainer, TrainingArguments
import sentencepiece

#sklearn for evaluation
from sklearn import preprocessing                       
from sklearn.metrics import classification_report        
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import ParameterGrid         
from sklearn.model_selection import ParameterSampler      
from sklearn.utils.fixes import loguniform

#nlp preprocessing
from nltk import ngrams                                 
from spacy.pipeline import SentenceSegmenter
from spacy.lang.en import English
from spacy.pipeline import Sentencizer
from sacremoses import MosesTokenizer, MosesDetokenizer


#utilities
from seqeval.metrics import accuracy_score, f1_score, precision_score, recall_score
import pandas as pd
import glob, os
import time
import datetime
import random
import numpy as np
import matplotlib.pyplot as plt
% matplotlib inline
import seaborn as sns
import pickle         # for saving data structures
from pynvml import *  # for checking gpu memory

In [None]:
# Select the compute device: use the first CUDA GPU when one is attached,
# otherwise fall back to the CPU instead of raising on a runtime without GPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
    print('Connected to GPU:', torch.cuda.get_device_name(0))
else:
    device = torch.device('cpu')
    print('No GPU available, falling back to CPU.')

Connected to GPU: Tesla T4


# Prepare Data

Training Data: corp, wind

Validation Data: equi

Test Data: htfl

In [None]:
# Load the gold term annotation files (*_terms_nes.ann) for every domain and language.

def _read_term_annotations(ann_path):
    """Read one tab-separated .ann file into a DataFrame with the columns "Term" and "Label"."""
    return pd.read_csv(ann_path, delimiter="\t", names=["Term", "Label"])

# English
df_corp_terms_en = _read_term_annotations('ACTER-master/ACTER-master/en/corp/annotations/corp_en_terms_nes.ann')
df_equi_terms_en = _read_term_annotations('ACTER-master/ACTER-master/en/equi/annotations/equi_en_terms_nes.ann')
df_htfl_terms_en = _read_term_annotations('ACTER-master/ACTER-master/en/htfl/annotations/htfl_en_terms_nes.ann')
df_wind_terms_en = _read_term_annotations('ACTER-master/ACTER-master/en/wind/annotations/wind_en_terms_nes.ann')

# French
df_corp_terms_fr = _read_term_annotations('ACTER-master/ACTER-master/fr/corp/annotations/corp_fr_terms_nes.ann')
df_equi_terms_fr = _read_term_annotations('ACTER-master/ACTER-master/fr/equi/annotations/equi_fr_terms_nes.ann')
df_htfl_terms_fr = _read_term_annotations('ACTER-master/ACTER-master/fr/htfl/annotations/htfl_fr_terms_nes.ann')
df_wind_terms_fr = _read_term_annotations('ACTER-master/ACTER-master/fr/wind/annotations/wind_fr_terms_nes.ann')

# Dutch
df_corp_terms_nl = _read_term_annotations('ACTER-master/ACTER-master/nl/corp/annotations/corp_nl_terms_nes.ann')
df_equi_terms_nl = _read_term_annotations('ACTER-master/ACTER-master/nl/equi/annotations/equi_nl_terms_nes.ann')
df_htfl_terms_nl = _read_term_annotations('ACTER-master/ACTER-master/nl/htfl/annotations/htfl_nl_terms_nes.ann')
df_wind_terms_nl = _read_term_annotations('ACTER-master/ACTER-master/nl/wind/annotations/wind_nl_terms_nes.ann')

labels = ["Random", "Term"]

In [None]:
# Display the English "wind" term annotations (columns "Term" and "Label") as a quick sanity check.
df_wind_terms_en

Unnamed: 0,Term,Label
0,48/600,Named_Entity
1,4energia,Named_Entity
2,4energy,Named_Entity
3,"ab ""lietuvos energija""",Named_Entity
4,ab lietuvos elektrine,Named_Entity
...,...,...
1529,zhiquan,Named_Entity
1530,çetinkaya,Named_Entity
1531,çeti̇nkaya,Named_Entity
1532,çeşme,Named_Entity


**Functions for preprocessing the texts and creating the training data**

In [None]:
#load all text files from folder into a string
def load_text_corpus(path):
  """Read every file matched by ``path + "*.txt"`` into one string.

  Args:
    path: prefix the "*.txt" glob pattern is appended to. For a folder it
      has to end with a path separator, as in the calls in this notebook.

  Returns:
    The contents of all matched files, each preceded by a single space,
    concatenated in sorted file-name order ("" when nothing matches).
  """
  print(glob.glob(path))
  chunks=[]
  # sort the matches: glob yields them in arbitrary, file-system dependent
  # order, which would make the resulting corpus string non-reproducible
  for file in sorted(glob.glob(path+"*.txt")):
      print(file)
      # decode explicitly instead of relying on the platform default encoding
      # (assumes the corpus files are UTF-8 encoded)
      with open(file, encoding="utf-8") as f:
        temp_data = f.read()
      print(len(temp_data))
      chunks.append(" "+temp_data)
  # join once instead of growing the string inside the loop
  text_data="".join(chunks)
  print(len(text_data))
  return text_data

In [None]:
#split in sentences and tokenize
def preprocess(text):
  """Split ``text`` into sentences and tokenize every sentence.

  Sentence boundaries come from spaCy's rule-based ``Sentencizer`` added to a
  blank ``English`` pipeline; each sentence is then tokenized with the Moses
  tokenizer (``lang='en'`` is hard-coded here).

  Args:
    text: raw corpus text.

  Returns:
    A list with one ``(tokens, sentence)`` tuple per sentence. ``tokens`` is
    the list of whitespace-separated Moses tokens, ``sentence`` is the
    sentence object yielded by ``doc.sents``.
  """
  #sentenize (from spacy)
  # NOTE(review): passing the component instance to add_pipe looks like the
  # spaCy v2 calling convention - confirm against the installed spaCy version.
  sentencizer = Sentencizer()
  nlp = English()
  nlp.add_pipe(sentencizer)
  doc = nlp(text)

  #tokenize
  sentence_list=[]
  mt = MosesTokenizer(lang='en')
  for s in doc.sents:
    tokenized_text = mt.tokenize(s, return_str=True)
    #append tuple of tokens and original sentence; without this the function
    #always returned an empty list and no training data was created downstream
    sentence_list.append((tokenized_text.split(), s))
  return sentence_list


In [None]:
#locate every occurrence of the list subl inside the list l
def find_sub_list(subl,l):
    """Return the inclusive (first_index, last_index) pairs of all occurrences of ``subl`` in ``l``.

    Occurrences are reported from left to right; overlapping occurrences are
    all included. An empty list is returned when ``subl`` does not occur.
    """
    width=len(subl)
    return [
        (start,start+width-1)
        for start,element in enumerate(l)
        # cheap first-element check before comparing the whole window
        if element==subl[0] and l[start:start+width]==subl
    ]

In [None]:
#input is list of sentences and dataframe containing terms
def create_training_data(sentence_list, df_terms, n):
  """Tag every token of every sentence as term start, term continuation or non-term.

  All 1-grams up to n-grams of a sentence are detokenized, lower-cased and
  looked up among the cell values of ``df_terms``. Every occurrence of a
  matching n-gram is tagged "B-T" on its first token and "T" on the remaining
  ones. Longer n-grams are processed after shorter ones, so they overwrite
  tags set by shorter matches.

  Args:
    sentence_list: list of tuples whose first element is the token list of a
      sentence (as produced by ``preprocess``).
    df_terms: DataFrame holding the annotated terms.
    n: maximum n-gram length that is checked.

  Returns:
    A list of ``(tokens, tags)`` tuples, one per sentence, where ``tags`` has
    one of "n", "B-T" or "T" for every token.
  """
  training_data = []

  md = MosesDetokenizer(lang='en')

  # Hash all cell values once. The previous `in df_terms.values` test scanned
  # the complete array for every single n-gram; the set gives the same
  # membership result for the string lookups done below in constant time.
  known_terms = set(df_terms.values.ravel())

  print(len(sentence_list))

  for count, sen in enumerate(sentence_list, start=1):
    if count%100==0:print(count)

    s=sen[0]  #take first part of tuple, i.e. the tokens

    #create label list, with "n" for non-terms, "B-T" for beginning of a term and "T" for the continuation of a term
    tags=["n"]*len(s)

    # 1-gram up to n-gram
    for i in range(1,n+1):
      for n_gram in ngrams(s, i):
        n_gram_aslist=list(n_gram)
        #the annotated terms are matched in lower-cased, detokenized form
        if md.detokenize(n_gram).lower() not in known_terms:
          continue
        #annotate every position at which this n-gram occurs in the sentence
        for first, last in find_sub_list(n_gram_aslist, s):
          tags[first]="B-T"
          for ind in range(first+1,last+1):
            tags[ind]="T"

    training_data.append((s,tags))

  return training_data

  

**Create Training Data**

In [None]:
# Build the tagged data for the English "corp" texts (part of the training set).
# The last argument (6) is the longest n-gram length that is looked up in the term list.
corp_text_en=load_text_corpus("ACTER-master/ACTER-master/en/corp/texts/annotated/") # read all .txt files into one string
corp_s_list=preprocess(corp_text_en)                                                # split into sentences and tokenize
train_data_corp_en=create_training_data(corp_s_list, df_corp_terms_en, 6)           # tag the tokens of every sentence

In [None]:
# Build the tagged data for the English "wind" texts (part of the training set).
# The last argument (6) is the longest n-gram length that is looked up in the term list.
wind_text_en=load_text_corpus("ACTER-master/ACTER-master/en/wind/texts/annotated/") # read all .txt files into one string
wind_s_list=preprocess(wind_text_en)                                                # split into sentences and tokenize
train_data_wind_en=create_training_data(wind_s_list, df_wind_terms_en, 6)           # tag the tokens of every sentence

In [None]:
# Build the tagged data for the English "equi" texts (used as validation data further below).
# The last argument (6) is the longest n-gram length that is looked up in the term list.
equi_text_en=load_text_corpus("ACTER-master/ACTER-master/en/equi/texts/annotated/") # read all .txt files into one string
equi_s_list=preprocess(equi_text_en)                                                # split into sentences and tokenize
train_data_equi_en=create_training_data(equi_s_list, df_equi_terms_en, 6)           # tag the tokens of every sentence

In [None]:
# Build the tagged data for the English "htfl" texts (used as test data further below).
# The last argument (6) is the longest n-gram length that is looked up in the term list.
htfl_text_en=load_text_corpus("ACTER-master/ACTER-master/en/htfl/texts/annotated/") # read all .txt files into one string
htfl_s_list=preprocess(htfl_text_en)                                                # split into sentences and tokenize
train_data_htfl_en=create_training_data(htfl_s_list, df_htfl_terms_en, 6)           # tag the tokens of every sentence

In [None]:
# French: same three steps (load, preprocess, tag) for all four domains.
# Note: the *_s_list names are rebound here, so the English sentence lists are no longer
# available afterwards. preprocess/create_training_data configure Moses with lang='en'
# for these texts as well.
corp_text_fr=load_text_corpus("ACTER-master/ACTER-master/fr/corp/texts/annotated/") # load text
corp_s_list=preprocess(corp_text_fr)                                                # preprocess
train_data_corp_fr=create_training_data(corp_s_list, df_corp_terms_fr, 6)           # create training data

wind_text_fr=load_text_corpus("ACTER-master/ACTER-master/fr/wind/texts/annotated/") # load text
wind_s_list=preprocess(wind_text_fr)                                                # preprocess
train_data_wind_fr=create_training_data(wind_s_list, df_wind_terms_fr, 6)           # create training data

equi_text_fr=load_text_corpus("ACTER-master/ACTER-master/fr/equi/texts/annotated/") # load text
equi_s_list=preprocess(equi_text_fr)                                                # preprocess
train_data_equi_fr=create_training_data(equi_s_list, df_equi_terms_fr, 6)           # create training data

htfl_text_fr=load_text_corpus("ACTER-master/ACTER-master/fr/htfl/texts/annotated/") # load text
htfl_s_list=preprocess(htfl_text_fr)                                                # preprocess
train_data_htfl_fr=create_training_data(htfl_s_list, df_htfl_terms_fr, 6)           # create training data 

In [None]:
# Dutch: same three steps (load, preprocess, tag) for all four domains.
# Note: the *_s_list names are rebound once more. preprocess/create_training_data
# configure Moses with lang='en' for these texts as well.
corp_text_nl=load_text_corpus("ACTER-master/ACTER-master/nl/corp/texts/annotated/") # load text
corp_s_list=preprocess(corp_text_nl)                                                # preprocess
train_data_corp_nl=create_training_data(corp_s_list, df_corp_terms_nl, 6)           # create training data

wind_text_nl=load_text_corpus("ACTER-master/ACTER-master/nl/wind/texts/annotated/") # load text
wind_s_list=preprocess(wind_text_nl)                                                # preprocess
train_data_wind_nl=create_training_data(wind_s_list, df_wind_terms_nl, 6)           # create training data

equi_text_nl=load_text_corpus("ACTER-master/ACTER-master/nl/equi/texts/annotated/") # load text
equi_s_list=preprocess(equi_text_nl)                                                # preprocess
train_data_equi_nl=create_training_data(equi_s_list, df_equi_terms_nl, 6)           # create training data

htfl_text_nl=load_text_corpus("ACTER-master/ACTER-master/nl/htfl/texts/annotated/") # load text
htfl_s_list=preprocess(htfl_text_nl)                                                # preprocess
train_data_htfl_nl=create_training_data(htfl_s_list, df_htfl_terms_nl, 6)           # create training data 

In [None]:
# Assemble the train / validation / test splits from the per-domain data sets.

# training: English corp + wind
trainings_data = train_data_corp_en + train_data_wind_en

# validation: equi, per language and combined
val_data_en = train_data_equi_en
val_data_fr = train_data_equi_fr
val_data_nl = train_data_equi_nl
val_data = val_data_en + val_data_fr + val_data_nl

# test: htfl, per language and combined
test_data_en = train_data_htfl_en
test_data_fr = train_data_htfl_fr
test_data_nl = train_data_htfl_nl
test_data = test_data_en + test_data_fr + test_data_nl

# gold terms of the combined validation split (all three languages)
gold_set_for_validation = set().union(
    df_equi_terms_en["Term"],
    df_equi_terms_fr["Term"],
    df_equi_terms_nl["Term"],
)

# number of sentences per split
print(len(trainings_data), len(val_data), len(test_data), sep="\n")

3449
7978
6416


In [None]:
# Separate the (tokens, tags) tuples of every split into parallel lists.

def _split_tokens_and_tags(pairs):
    """Return (list of token lists, list of tag lists) for a list of (tokens, tags) tuples."""
    tokens = [pair[0] for pair in pairs]
    tags = [pair[1] for pair in pairs]
    return tokens, tags

# train
train_texts, train_tags = _split_tokens_and_tags(trainings_data)

# validation (combined and per language)
val_texts, val_tags = _split_tokens_and_tags(val_data)
val_texts_en, val_tags_en = _split_tokens_and_tags(val_data_en)
val_texts_fr, val_tags_fr = _split_tokens_and_tags(val_data_fr)
val_texts_nl, val_tags_nl = _split_tokens_and_tags(val_data_nl)

# test (combined and per language)
test_texts, test_tags = _split_tokens_and_tags(test_data)
test_texts_en, test_tags_en = _split_tokens_and_tags(test_data_en)
test_texts_fr, test_tags_fr = _split_tokens_and_tags(test_data_fr)
test_texts_nl, test_tags_nl = _split_tokens_and_tags(test_data_nl)

# Tokenize 

In [None]:
# Load the fast tokenizer that belongs to the "xlm-roberta-base" checkpoint.
tokenizer = XLMRobertaTokenizerFast.from_pretrained("xlm-roberta-base")

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=5069051.0, style=ProgressStyle(descript…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=9096718.0, style=ProgressStyle(descript…




In [None]:
# Map the word-level tags onto the sub-word tokens produced by the XLM-R tokenizer.
label_list = ["n", "B-T", "T"]
label_to_id = dict(zip(label_list, range(len(label_list))))
num_labels = len(label_list)

def tokenize_and_align_labels(texts, tags):
  """Tokenize pre-split sentences and attach one label id per sub-word token.

  Args:
    texts: list of sentences, each given as a list of words.
    tags: list of tag lists ("n", "B-T", "T"), parallel to ``texts``.

  Returns:
    The tokenizer output (padded and truncated) with an added "labels" entry.
    The first sub-token of every word carries the id of the word's tag; special
    tokens and all further sub-tokens of a word carry -100 so that the loss
    function ignores them.
  """
  encoded = tokenizer(
      texts,
      padding=True,
      truncation=True,
      # the sentences are already split into words (one tag per word)
      is_split_into_words=True,
  )

  aligned = []
  for row, word_tags in enumerate(tags):
      row_labels = []
      last_word = None
      for word in encoded.word_ids(batch_index=row):
          # a real label only where a new word begins; None marks special tokens
          starts_word = word is not None and word != last_word
          row_labels.append(label_to_id[word_tags[word]] if starts_word else -100)
          last_word = word
      aligned.append(row_labels)

  encoded["labels"] = aligned
  return encoded


# Tokenize every split and align its tags with the sub-word tokens.
# training data
train_input_and_labels = tokenize_and_align_labels(train_texts, train_tags)

# validation data: all languages combined, then per language
val_input_and_labels = tokenize_and_align_labels(val_texts, val_tags)
val_input_and_labels_en = tokenize_and_align_labels(val_texts_en, val_tags_en)
val_input_and_labels_fr = tokenize_and_align_labels(val_texts_fr, val_tags_fr)
val_input_and_labels_nl = tokenize_and_align_labels(val_texts_nl, val_tags_nl)

# test data: all languages combined, then per language
test_input_and_labels = tokenize_and_align_labels(test_texts, test_tags)
test_input_and_labels_en = tokenize_and_align_labels(test_texts_en, test_tags_en)
test_input_and_labels_fr = tokenize_and_align_labels(test_texts_fr, test_tags_fr)
test_input_and_labels_nl = tokenize_and_align_labels(test_texts_nl, test_tags_nl)



In [None]:
# torch Dataset wrapper so the tokenized data can be fed to the huggingface trainer
class OurDataset(torch.utils.data.Dataset):
    """Serves tokenized examples as dictionaries of tensors.

    ``encodings`` maps each feature name to a sequence with one entry per
    example; ``labels`` holds the label ids per example and determines the
    length of the dataset.
    """

    def __init__(self, encodings, labels):
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        # one tensor per feature of the requested example
        item = {}
        for key, val in self.encodings.items():
            item[key] = torch.tensor(val[idx])
        # "labels" is always taken from self.labels
        item['labels'] = torch.tensor(self.labels[idx])
        return item

# Wrap every tokenized split in an OurDataset; the labels are the aligned label ids
# stored under "labels" by tokenize_and_align_labels.
train_dataset = OurDataset(train_input_and_labels, train_input_and_labels["labels"])

# validation: combined and per language
val_dataset = OurDataset(val_input_and_labels, val_input_and_labels["labels"])
val_dataset_en = OurDataset(val_input_and_labels_en, val_input_and_labels_en["labels"])
val_dataset_fr = OurDataset(val_input_and_labels_fr, val_input_and_labels_fr["labels"])
val_dataset_nl = OurDataset(val_input_and_labels_nl, val_input_and_labels_nl["labels"])

# test: combined and per language
test_dataset = OurDataset(test_input_and_labels, test_input_and_labels["labels"])
test_dataset_en = OurDataset(test_input_and_labels_en, test_input_and_labels_en["labels"])
test_dataset_fr = OurDataset(test_input_and_labels_fr, test_input_and_labels_fr["labels"])
test_dataset_nl = OurDataset(test_input_and_labels_nl, test_input_and_labels_nl["labels"])

# Training

In [None]:
# turn token-level predictions back into the set of predicted terms

def extract_terms(token_predictions, val_texts):
  """Collect all terms marked by the predicted tags.

  Args:
    token_predictions: list of tag lists ("n", "B-T", "T"), one per sentence.
    val_texts: list of token lists, parallel to ``token_predictions``.

  Returns:
    A set of strings. Every "B-T" tag opens a term that is extended by the
    directly following "T" tags; the tokens are joined with single spaces.
    "T" tags that do not follow a "B-T" are ignored.
  """
  found = set()
  for idx in range(len(token_predictions)):
    tags = token_predictions[idx]
    words = val_texts[idx]
    for start, tag in enumerate(tags):
      if tag != "B-T":
        continue
      term = words[start]
      # consume the continuation tags that directly follow the term start
      nxt = start + 1
      while nxt < len(tags) and tags[nxt] == "T":
        term += " " + words[nxt]
        nxt += 1
      found.add(term)
  return found

In [None]:
#compute the metrics TermEval style for Trainer

def compute_metrics(p):
    """Term-level precision, recall and F1 of the predictions on the validation data.

    The predicted tags are converted into a set of lower-cased terms, which is
    compared with the gold term set.

    NOTE: the tokens and the gold set are read from the module-level names
    ``val_texts`` and ``gold_set_for_validation``, so the result is only
    meaningful when the predictions belong to the combined validation data.

    Args:
        p: pair ``(predictions, labels)``. The arg-max over axis 2 of
            ``predictions`` is used as predicted label id; positions whose
            label is -100 are dropped.

    Returns:
        dict with the keys "precision", "recall" and "f1". A score whose
        denominator is zero (nothing extracted, empty gold set, or
        precision + recall == 0) is reported as 0.0 instead of raising
        ZeroDivisionError.
    """
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    # Remove ignored index (special tokens and non-initial sub-tokens)
    true_predictions = [
        [label_list[pred_id] for (pred_id, gold_id) in zip(prediction, label) if gold_id != -100]
        for prediction, label in zip(predictions, labels)
    ]

    extracted_terms = extract_terms(true_predictions, val_texts)
    # the gold standard is lower case
    extracted_terms = {item.lower() for item in extracted_terms}
    gold_set = gold_set_for_validation

    true_pos = extracted_terms.intersection(gold_set)
    # a model that predicts only "n" yields no terms at all -> guard every division
    recall = len(true_pos)/len(gold_set) if gold_set else 0.0
    precision = len(true_pos)/len(extracted_terms) if extracted_terms else 0.0
    f1 = 2*(precision*recall)/(precision+recall) if precision+recall > 0 else 0.0

    return {
        "precision": precision,
        "recall": recall,
        "f1": f1,
    }

In [None]:
# Hyperparameters and bookkeeping options for the huggingface Trainer.

training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=1,              # total # of training epochs
    per_device_train_batch_size=8,  # batch size per device during training
    per_device_eval_batch_size=16,   # batch size for evaluation
    warmup_steps=0,                  # number of warmup steps for learning rate scheduler
    weight_decay=0,                  # strength of weight decay
    learning_rate=2e-5,              # initial learning rate
    logging_dir='./logs',            # directory for storing logs
    evaluation_strategy= "no",       # no evaluation during training; "steps" or "epoch" would enable it
    eval_steps=100,                  # evaluation interval, only relevant for evaluation_strategy="steps"
    #save_total_limit=1,
    # NOTE(review): with evaluation_strategy="no" no f1 score is computed during training, so there
    # is presumably nothing to select a "best" model from - confirm whether this combination is
    # intended (some transformers releases may reject it) or whether "steps" should be active.
    load_best_model_at_end=True,   #loads the model with the best evaluation score
    metric_for_best_model="f1",      # key of the compute_metrics result used for the comparison
    greater_is_better=True           # a higher f1 is better
)

In [None]:
# Initialize XLM-R ("xlm-roberta-base") for token classification with one output per entry of label_list.
model = XLMRobertaForTokenClassification.from_pretrained("xlm-roberta-base", num_labels=num_labels)


Some weights of the model checkpoint at xlm-roberta-base were not used when initializing XLMRobertaForTokenClassification: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing XLMRobertaForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-st

In [None]:
# Initialize the huggingface trainer with the English training data and the combined
# (en/fr/nl) validation data. compute_metrics reads the module-level val_texts and
# gold_set_for_validation, i.e. it is tied to val_dataset.
trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
    )

In [None]:
# Fine-tune the model on train_dataset with the settings from training_args.
trainer.train()

# Test Set Evaluation

In [None]:
# Predict the tags for the combined (en/fr/nl) test data.
# NOTE(review): the trainer was created with compute_metrics, which reads val_texts and
# gold_set_for_validation; if predict() invokes it, test_metrics would be computed against
# the validation data - verify. The term-level test scores are computed separately below.
test_predictions, test_labels, test_metrics = trainer.predict(test_dataset)
# most probable label id per token
test_predictions = np.argmax(test_predictions, axis=2)
# Remove ignored index (label -100: special tokens and non-initial sub-tokens)
true_test_predictions = [
    [label_list[p] for (p, l) in zip(test_prediction, test_label) if l != -100]
    for test_prediction, test_label in zip(test_predictions, test_labels)
]



In [None]:
# example output: tokens, gold tags and predicted tags of one test sentence
# (true_test_predictions is index-aligned with test_texts / test_tags; the previously used
# name true_test_predictions_en is not defined anywhere in this notebook)
i=1
print('{:>10}  {:>10}  {:>10}'.format("Text", "Label", "Prediction"))
for j in range(len(true_test_predictions[i])):
  print('{:>10}  {:>10}  {:>10}'.format(test_texts[i][j], test_tags[i][j], true_test_predictions[i][j]))

      Text       Label  Prediction
       The           n           n
  analysis           n           n
  included           n           n
         a           n           n
     large           n           n
     study           n           n
    sample           n           n
      with           n           n
      more           n           n
      than           n           n
    60,000           n           n
  patients         B-T           n
    across           n           n
      4372           n           n
 hospitals         B-T           n
         .           n           n


In [None]:
def computeTermEvalMetrics(extracted_terms, gold_df):
  """Print TermEval-style scores of the extracted terms against the gold terms.

  Args:
    extracted_terms: iterable of extracted term strings (lower-cased here).
    gold_df: iterable of gold terms; it is converted to a set.

  Prints the size of the intersection, of the gold set and of the extracted
  set, followed by recall, precision and F1. A score whose denominator is zero
  (nothing extracted, empty gold set, or precision + recall == 0) is reported
  as 0.0 instead of raising ZeroDivisionError.
  """
  #make lower case cause gold standard is lower case
  extracted_terms = {item.lower() for item in extracted_terms}
  gold_set=set(gold_df)
  true_pos=extracted_terms.intersection(gold_set)
  #guard every division: an empty prediction or gold set must not crash the evaluation
  recall=len(true_pos)/len(gold_set) if gold_set else 0.0
  precision=len(true_pos)/len(extracted_terms) if extracted_terms else 0.0
  f1=2*(precision*recall)/(precision+recall) if precision+recall > 0 else 0.0

  print("Intersection",len(true_pos))
  print("Gold",len(gold_set))
  print("Extracted",len(extracted_terms))
  print("Recall:", recall)
  print("Precision:", precision)
  print("F1:", f1)

In [None]:
# Collect the set of distinct terms predicted for the test sentences.
test_extracted_terms = extract_terms(true_test_predictions, test_texts)

In [None]:
# Score the extracted test terms against the union of the en/fr/nl "htfl" gold terms
# (prints the set sizes, recall, precision and F1).
computeTermEvalMetrics(test_extracted_terms, set(df_htfl_terms_en["Term"]).union(set(df_htfl_terms_fr["Term"])).union(set(df_htfl_terms_nl["Term"])))