In [None]:
!pip install transformers

In [None]:
#imports
from transformers import XLMRobertaTokenizerFast, pipeline, XLMRobertaForSequenceClassification, XLMRobertaForTokenClassification  
from spacy.pipeline import SentenceSegmenter
from spacy.lang.en import English
from spacy.pipeline import Sentencizer
#from sacremoses import MosesTokenizer, MosesDetokenizer 
import torch  
import itertools
from string import punctuation
import pandas as pd
from lxml import etree
import datetime
from graphviz import Digraph


# Pipeline

**Functions**

In [None]:
def extract_terms(pred, txt):
  """Collect the distinct terms marked in a tagged word sequence.

  pred -- one tag per word; "LABEL_1" opens a term, "LABEL_2" continues it,
          any other tag ends the current term
  txt  -- the words, aligned index by index with pred

  Returns the terms (words joined by single spaces and cleaned with
  remove_end_punctuation) in order of first appearance, without duplicates.
  A "LABEL_2" that does not directly follow an opened term is ignored.
  """
  found = []
  total = len(pred)
  for start, tag in enumerate(pred):
    if tag != "LABEL_1":
      continue
    # the opening word plus the unbroken run of continuation words right after it
    pieces = [txt[start]]
    for nxt in range(start + 1, total):
      if pred[nxt] != "LABEL_2":
        break
      pieces.append(txt[nxt])
    candidate = remove_end_punctuation(" ".join(pieces))
    if candidate not in found:
      found.append(candidate)
  return found

In [None]:
# Drop one trailing punctuation mark, but only when the rest of the word is free of punctuation.
def remove_end_punctuation(word):
  """Return word without its final character if that character is punctuation.

  The word is returned unchanged when any punctuation occurs before the last
  character (e.g. "substance/PT" or "(BPD)"), because such punctuation is
  treated as part of the term. "Punctuation" means the characters in
  string.punctuation.
  """
  stem, tail = word[:-1], word[-1:]
  # punctuation inside the word: leave everything as it is
  if any(ch in punctuation for ch in stem):
    return word
  if tail and tail in punctuation:
    return stem
  return word

print(punctuation)

!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~


In [None]:
def get_term_list(output):
  """Glue sub-word tokens back into whole words and keep one label per word.

  output -- list of dicts with at least the keys "word" and "entity", as
            produced by the term pipeline. A token whose "word" starts with
            "▁" opens a new word; tokens without that prefix are appended
            to the word opened before them.

  Returns (label_list, word_list): word_list holds the rebuilt words with the
  leading "▁" removed, label_list the "entity" of each word's first token.
  Tokens that appear before the first "▁" token are dropped.
  """
  word_list = []
  label_list = []
  for token in output:
    piece = token["word"]
    if piece[0] == "▁":
      # start of a new word: store it without the marker, labelled by this first token
      word_list.append(piece[1:])
      label_list.append(token["entity"])
    elif word_list:
      # continuation piece of the most recently opened word
      word_list[-1] += piece
  return label_list, word_list



In [None]:
# Split the raw text into sentences (word-level tokenization is not done here).
def preprocess(text):
  """Return the sentences of text as a list of strings.

  A blank English spaCy pipeline containing only a Sentencizer is built on
  each call and applied to text; every detected sentence span is converted
  with str(), so whitespace and line breaks inside a sentence are kept.

  NOTE(review): add_pipe is given a component instance here; newer spaCy
  releases expect the registered component name instead - confirm against the
  installed spaCy version.
  """
  nlp = English()
  nlp.add_pipe(Sentencizer())
  return [str(sentence) for sentence in nlp(text).sents]

**Load Models and Tokenizers**

In [None]:
# Load the saved term-extraction model (token classification head) from a local directory.
# The trailing note lists the checkpoint variants named by the author: en_only, fr, nl, all.
PATH = "./TermExtraction/saved models/tvt_en_only"    #en_only, fr, nl, all
model_TermExtraction = XLMRobertaForTokenClassification.from_pretrained(PATH)
print("Term Extraction Model loaded")

# Load the tokenizer from the "xlm-roberta-base" checkpoint name; this one tokenizer object
# is later handed to both the term pipeline and the relation pipeline.
tokenizer_TermExtraction = XLMRobertaTokenizerFast.from_pretrained("xlm-roberta-base")
print("Tokenizer loaded")

# Load the saved relation-extraction model (sequence classification head).
# PATH is reassigned here, so after this cell it holds the relation-model directory.
PATH = "./RelationExtraction/saved models/pipeline1803"    
model_RelationExtraction = XLMRobertaForSequenceClassification.from_pretrained(PATH)
print("Relation Extraction Model loaded")


Term Extraction Model loaded


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=5069051.0, style=ProgressStyle(descript…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=9096718.0, style=ProgressStyle(descript…


Tokenizer loaded
Relation Extraction Model loaded
Hierachy Model loaded


In [None]:
# Relation names; the entry at position i is the readable name for the generic label "LABEL_i"
# (assumed to match the label order the relation model was trained with - TODO confirm).
label_list = [
    'SYNONYM',
    'activityRelation (e1,e2)',
    'activityRelation (e2,e1)',
    'associativeRelation',
    'causalRelation (e1,e2)',
    'causalRelation (e2,e1)',
    'genericRelation (e1,e2)',
    'genericRelation (e2,e1)',
    'instrumentalRelation (e1,e2)',
    'instrumentalRelation (e2,e1)',
    'none',
    'originationRelation (e1,e2)',
    'originationRelation (e2,e1)',
    'partitiveRelation (e1,e2)',
    'partitiveRelation (e2,e1)',
    'spatialRelation (e1,e2)',
    'spatialRelation (e2,e1)',
]

# Generic label names as reported by the relation pipeline, one per entry of label_list.
wrong_labels = ["LABEL_" + str(index) for index in range(len(label_list))]
print(len(label_list), len(wrong_labels))

25 25


In [None]:
# Build the two inference pipelines used by the rest of the notebook.

# Term extraction: "ner" task, i.e. one predicted label per token of the input sentence.
pipeline_terms=pipeline("ner", model=model_TermExtraction, tokenizer=tokenizer_TermExtraction)

# Relation extraction: "sentiment-analysis" task, i.e. one label plus score for the whole input string.
# It is built with the same tokenizer object as the term pipeline.
pipeline_relation=pipeline("sentiment-analysis", model=model_RelationExtraction, tokenizer=tokenizer_TermExtraction)

**Read Text**

In [None]:
text = "' Vorhandener Wirkstoff\n\n„Prüfprogramm“ ist die Bezeichnung, die üblicherweise für das Arbeitsprogramm zur Prüfung in Biozidprodukten enthaltener alter Wirkstoffe verwendet wird. Das Programm wurde von der Europäischen Kommission unter der Biozidprodukte-Richtlinie (BPD) eingerichtet und wird unter der Verordnung über Biozidprodukte (BPR) fortgeführt.\n\nAlte Wirkstoffe sind jene Stoffe, die am 14. Mai 2000 als Wirkstoff eines Biozidprodukts auf dem Markt waren (für andere Zwecke als die wissenschaftliche oder produkt- und verfahrensorientierte Forschung und Entwicklung). Es wurden jene alten Wirkstoffe zur Überprüfung im Prüfprogramm akzeptiert, die als solche identifiziert wurden und für die eine Notifizierung gemäß Anhang II der Verordnung (EG) Nr. 1451/2007 der Kommission akzeptiert wurde.\n\nDie genauen Vorschriften für das Prüfprogramm wurden im Rahmen der neuen Verordnung zum Prüfprogramm (EU) Nr. 1062/2014, die die Verordnung (EG) Nr. 1451/2007 der Kommission aufhebt und ersetzt, an die Bestimmungen der BPR angepasst.\n\nDie in Artikel 89 der Verordnung (EU) Nr. 528/2012 festgelegten Übergangsmaßnahmen ermöglichen das Inverkehrbringen und die Verwendung von Biozidprodukten, die einen im Prüfprogramm (für eine bestimmte Produktart) enthaltenen Wirkstoff enthalten, vorbehaltlich der nationalen Vorschriften, bis drei Jahre nach ihrem Genehmigungsdatum (im Falle einer Nichtgenehmigung können kürzere Zeiträume gelten).\n\nIn Anhang II Teil 1 der Verordnung zum Prüfprogramm sind die Wirkstoffe aufgeführt, die derzeit geprüft werden.\n\nDarüber hinaus passt die Verordnung zum Prüfprogramm die Verfahren zur Bewertung von Dossiers an die Verfahren an, die in der BPR für neue Wirkstoffe oder in Verordnung (EU) Nr. 
88/2014 zur Änderung von Anhang I beschrieben sind.\n\nDes Weiteren sieht die Verordnung zum Prüfprogramm eine feste Rolle für die ECHA vor und legt Verfahren zum Beteiligen oder Ersetzen von Teilnehmern im Prüfprogramm in gegenseitigem Einvernehmen, zum Ausscheiden als Teilnehmer sowie zur Übernahme der Rolle eines Teilnehmers in bestimmten Situationen fest und führt die Möglichkeit ein, unter bestimmten Bedingungen Stoff/Produktart-Kombinationen in das Prüfprogramm aufzunehmen."

# Single Example of the Whole Pipeline

**Split corpus into sentences**

In [None]:
# Split the input text into a list of sentence strings and display the first one as a quick check.
sentences=preprocess(text)
sentences[0]

' Vorhandener Wirkstoff\n\n„Prüfprogramm“ ist die Bezeichnung, die üblicherweise für das Arbeitsprogramm zur Prüfung in Biozidprodukten enthaltener alter Wirkstoffe verwendet wird.'

**Extract Terms from corpus**

In [None]:
# Author's open question: alternatively call the tokenizer directly with dummy labels instead of
# the pipeline (the pipeline output causes problems with punctuation).

# Author's open question: could the pipeline be called once with the whole list of sentences?
# Result: terms_per_sentence[i] is the list of terms found in sentences[i].
terms_per_sentence=[]
for s in sentences:
  # raw pipeline output: one dict per sub-word token
  term_output=pipeline_terms(s)
  # rebuild full words from the sub-word tokens; each word gets the label of its first token
  labels, words=get_term_list(term_output)
  terms_per_sentence.append(extract_terms(labels,words))
  


In [None]:
terms_per_sentence

[['active substance', 'biocidal active substances', 'biocidal products'],
 ['European Commission',
  'Biocidal Products Directive',
  'Biocidal Products Regulation'],
 ['biocidal product'],
 ['Review Programme', 'Commission Regulation'],
 ['rules',
  'Review Programme',
  'BPR',
  'Review Programme Regulation',
  'Commission Regulation'],
 ['Regulation', 'biocidal products', 'Review', 'rules'],
 ['Review'],
 ['Review Programme',
  'Regulation',
  'dossier',
  'BPR',
  'active substances',
  'Review Programme Regulation',
  'ECHA',
  'substance/PT'],
 []]

In [None]:
# Flat list of all distinct terms over all sentences, in order of first appearance.
# dict.fromkeys keeps the first occurrence of every term and preserves insertion order.
term_list = list(dict.fromkeys(itertools.chain.from_iterable(terms_per_sentence)))

In [None]:
term_list

['active substance',
 'biocidal active substances',
 'biocidal products',
 'European Commission',
 'Biocidal Products Directive',
 'Biocidal Products Regulation',
 'biocidal product',
 'Review Programme',
 'Commission Regulation',
 'rules',
 'BPR',
 'Review Programme Regulation',
 'Regulation',
 'Review',
 'dossier',
 'active substances',
 'ECHA',
 'substance/PT']

In [None]:
# One concept record per extracted term. The numeric id equals the record's position in
# concept_list; "terms" starts with the single term and "relations" starts empty.
concept_list = [
    {"id": concept_id, "terms": [term], "relations": []}
    for concept_id, term in enumerate(term_list)
]

# Reverse lookup from a term string to the id of the concept that holds it.
term_to_id = {term: concept_id for concept_id, term in enumerate(term_list)}


In [None]:
concept_list

[{'id': 0, 'relations': [], 'terms': ['active substance']},
 {'id': 1, 'relations': [], 'terms': ['biocidal active substances']},
 {'id': 2, 'relations': [], 'terms': ['biocidal products.']},
 {'id': 3, 'relations': [], 'terms': ['European Commission']},
 {'id': 4, 'relations': [], 'terms': ['Biocidal Products Directive']},
 {'id': 5, 'relations': [], 'terms': ['Biocidal Products Regulation']},
 {'id': 6, 'relations': [], 'terms': ['biocidal product']},
 {'id': 7, 'relations': [], 'terms': ['Review Programme']},
 {'id': 8, 'relations': [], 'terms': ['Commission Regulation']},
 {'id': 9, 'relations': [], 'terms': ['rules']},
 {'id': 10, 'relations': [], 'terms': ['BPR']},
 {'id': 11, 'relations': [], 'terms': ['Review Programme Regulation']},
 {'id': 12, 'relations': [], 'terms': ['Regulation']},
 {'id': 13, 'relations': [], 'terms': ['biocidal products']},
 {'id': 14, 'relations': [], 'terms': ['Review']},
 {'id': 15, 'relations': [], 'terms': ['rules,']},
 {'id': 16, 'relations': [], 

**Extract relations per sentence**

In [None]:
# Classify the relation between every pair of terms that occur in the same sentence and store
# the result on the concept records (which are written to TBX further down).
#
# Every stored relation has the same shape: [relation name, target term, score].
#   "<name> (e1,e2)"     -> stored on the first term, pointing at the second
#   "<name> (e2,e1)"     -> stored on the second term, pointing at the first
#   associativeRelation  -> undirected, stored on the first term
#   SYNONYM              -> undirected, stored on the first term so that the merge step can find it
#   none                 -> nothing is stored
for sentence, terms in zip(sentences, terms_per_sentence):
  print("\n\nSentence:", sentence)
  print("Relations:")
  # every unordered pair of terms found in this sentence
  for e1, e2 in itertools.combinations(terms, 2):
    # model input: both terms, each closed with ". ", followed by the sentence itself
    model_input = e1 + ". " + e2 + ". " + sentence
    prediction = pipeline_relation(model_input)[0]
    score = round(prediction["score"], 3)
    # translate the generic "LABEL_i" name into the readable relation name
    true_rel = label_list[wrong_labels.index(prediction["label"])]

    if "(e1,e2)" in true_rel:
      # [:-8] cuts off the direction suffix " (e1,e2)"
      owner, target, name = e1, e2, true_rel[:-8]
    elif "(e2,e1)" in true_rel:
      owner, target, name = e2, e1, true_rel[:-8]
    elif true_rel in ("associativeRelation", "SYNONYM"):
      owner, target, name = e1, e2, true_rel
    else:
      owner = None
    if owner is not None:
      # at this stage a concept id is still the concept's position in concept_list
      concept_list[term_to_id[owner]]["relations"].append([name, target, score])

    print("{:30s}{:30s}{:30s}{:.3f}".format(e1, e2, true_rel, score))



Sentence:  Existing active substance

The Review Programme is the name commonly used for the work programme for the examination of existing biocidal active substances contained in biocidal products.
Relations:
active substance              biocidal active substances    partitiveRelation (e1,e2)     0.966000
active substance              biocidal products.            partitiveRelation (e1,e2)     0.875000
biocidal active substances    biocidal products.            partitiveRelation (e1,e2)     0.919000


Sentence: The programme was set up by the European Commission under the Biocidal Products Directive (BPD) and continues under the Biocidal Products Regulation (BPR).
Relations:
European Commission           Biocidal Products Directive   none                          0.559000
European Commission           Biocidal Products Regulation  none                          0.779000
Biocidal Products Directive   Biocidal Products Regulation  none                          0.861000


Sentence: 

E

**Clean Concept List**


In [None]:
# merge concept entries with synonym relations

def _merge_synonym_concepts(concepts):
  """Fold every concept that is a SYNONYM target of another concept into that concept.

  concepts -- list of dicts with the keys "id", "terms" and "relations"; a relation
              is a list whose first item is the relation name and whose second item
              is the target term (a string).

  The list and the concept dicts are modified in place; the same list is returned.
  The absorbing concept takes over the terms and relations of the absorbed one.
  Relations are still term based at this point, so no ids need to be rewritten here.
  """
  i = 0
  while i < len(concepts):
    concept = concepts[i]
    print("checking", concept["id"], concept["terms"])
    print(concept["relations"])
    # Index-based scan: relations taken over from an absorbed concept are appended to this
    # very list and are visited as well, so chains of synonyms collapse into one concept.
    k = 0
    while k < len(concept["relations"]):
      rel = concept["relations"][k]
      k += 1
      print("         ", rel)
      if rel[0] != "SYNONYM":
        continue
      e2_term = rel[1]  # this is a string
      # position of the concept that currently holds the synonym term (None if there is none)
      j = next((pos for pos, other in enumerate(concepts) if e2_term in other["terms"]), None)
      # nothing to merge if the term is unknown or already belongs to the current concept
      if j is None or concepts[j] is concept:
        continue
      e2_concept = concepts.pop(j)
      if j < i:
        # an entry in front of the current concept was removed: the current concept moved up by one
        i -= 1
      print("   merge", concept["id"], e2_concept["id"])
      concept["terms"].extend(e2_concept["terms"])
      concept["relations"].extend(e2_concept["relations"])
      # print updated relations
      print(concept["relations"])
    i += 1
  return concepts

concept_list = _merge_synonym_concepts(concept_list)


In [None]:
# Renumber the remaining concepts as "c1", "c2", ... (no gaps after the merge) and point every
# term of a concept at that new id, which also repairs entries of terms that changed concept.
for position, concept in enumerate(concept_list, start=1):
  concept["id"] = "c" + str(position)
  term_to_id.update(dict.fromkeys(concept["terms"], concept["id"]))

# Relations still name their target by term string; replace each target with its concept id.
# NOTE: this step is not repeatable - on a second run the targets are already ids and the
# lookup in term_to_id will typically fail with KeyError.
for concept in concept_list:
  for rel in concept["relations"]:
    rel[1] = term_to_id[rel[1]]


In [None]:
# Clean the relations of every concept:
#  - drop SYNONYM relations and relations that point back at their own concept
#    (the latter arise when two related concepts were merged); dropped relations are printed
#  - keep only one relation per (name, target) pair and remember the highest score seen
for concept in concept_list:
  legal_relations = []
  for rel in concept["relations"]:
    if rel[0] == "SYNONYM" or rel[1] == concept["id"]:
      print(rel)
      continue
    # the relation already kept with the same name and the same target, if any
    kept = next((r for r in legal_relations if r[0] == rel[0] and r[1] == rel[1]), None)
    if kept is None:
      legal_relations.append(rel)
    elif len(rel) > 2:
      # A relation may or may not carry a score as third item, so only compare
      # scores that actually exist instead of indexing position 2 blindly.
      if len(kept) > 2:
        kept[2] = max(kept[2], rel[2])
      else:
        kept.append(rel[2])
  concept["relations"] = legal_relations

**Write to TBX**

In [None]:
# Build the <tbxHeader> part of the document.
def write_header(root):
  """Append the TBX header to root.

  Creates tbxHeader/fileDesc/sourceDesc/p with the generator note and
  tbxHeader/encodingDesc/p (type="XCSURI") naming the XCS file.
  Returns nothing; root is modified in place.
  """
  header = etree.SubElement(root, "tbxHeader")

  source_desc = etree.SubElement(etree.SubElement(header, "fileDesc"), "sourceDesc")
  generator_note = etree.SubElement(source_desc, "p")
  generator_note.text = "TBX file automatically generated by Text2TCS (https://text2tcs.univie.ac.at/)"

  encoding_desc = etree.SubElement(header, "encodingDesc")
  xcs_reference = etree.SubElement(encoding_desc, "p", {"type": "XCSURI"})
  xcs_reference.text = "TBXXCSV02.xcs"

In [None]:
# Build the <text>/<body> part of the document: one <conceptEntry> per concept.
# A concept is a dict with "id" (string), "terms" (list of synonymous terms) and "relations".
def write_text(root, concept_list, lang=None):
  """Append <text><body> with one conceptEntry per concept to root.

  root         -- parent element, modified in place
  concept_list -- list of concept dicts; "id" is used as the XML id, every entry of
                  "terms" becomes a <termSec>/<term>, every relation becomes a
                  <descripGrp>/<descrip> whose type is the relation name (rel[0]) and
                  whose text is the target (rel[1])
  lang         -- language code for xml:lang and for the term ids; when omitted, the
                  notebook-level variable `language` is used

  Every entry also gets a <transacGrp> with the transaction type, the responsible
  tool and the current local time formatted as "%y-%m-%d_%Hh-%Mm".
  Returns nothing.
  """
  if lang is None:
    # fall back to the notebook-level setting; it must have been assigned before this call
    lang = language
  date_string = datetime.datetime.now().strftime("%y-%m-%d_%Hh-%Mm")
  body = etree.SubElement(etree.SubElement(root, "text"), "body")
  for concept in concept_list:
    conceptEntry = etree.SubElement(body, "conceptEntry", {"id": concept["id"]})
    # provenance of the entry
    transacGrp = etree.SubElement(conceptEntry, "transacGrp")
    etree.SubElement(transacGrp, "transac", {"type": "transactionType"}).text = "origination"
    etree.SubElement(transacGrp, "transacNote", {"type": "responsibility"}).text = "Text2TCS"
    etree.SubElement(transacGrp, "date").text = date_string
    # write all terms; term ids look like "<concept id>-<lang>-t<index>"
    langSec = etree.SubElement(conceptEntry, "langSec", {"{http://www.w3.org/XML/1998/namespace}lang": lang})
    for i, term in enumerate(concept["terms"]):
      term_id = concept["id"] + "-" + lang + "-t" + str(i)
      termSec = etree.SubElement(langSec, "termSec", {"id": term_id})
      etree.SubElement(termSec, "term").text = term
    # write all relations
    for rel in concept["relations"]:
      descripGrp = etree.SubElement(conceptEntry, "descripGrp")
      etree.SubElement(descripGrp, "descrip", {"type": rel[0]}).text = rel[1]

  


In [None]:
# TODO: implement automatic language detection or take the language from user input.
# Language code used for xml:lang (root element and langSec) and inside the term ids.
# NOTE(review): the sample text assigned to `text` is German while this is "en" - confirm the intended value.
language="en"

In [None]:
# Root <tbx> element with dialect, style, language and namespace declaration.
# NOTE(review): "xmlns" is passed as an ordinary attribute here rather than through a namespace
# map - confirm this is sufficient for the consumers of the file.
root = etree.Element("tbx", {"type":"TBX-Core", "style":"dca", "{http://www.w3.org/XML/1998/namespace}lang":language, "xmlns":"urn:iso:std:iso:30042:ed-2"})
# Two xml-model processing instructions that reference the TBX-Core schemas:
# pi2 points at the Schematron file, pi1 at the RELAX NG file.
pi2 = etree.ProcessingInstruction('xml-model', 'href="https://raw.githubusercontent.com/LTAC-Global/TBX-Core_dialect/master/Schemas/TBX-Core.sch" type="application/xml" schematypens="http://purl.oclc.org/dsdl/schematron"') 
pi1 = etree.ProcessingInstruction('xml-model', 'href="https://raw.githubusercontent.com/LTAC-Global/TBX-Core_dialect/master/Schemas/TBXcoreStructV03_TBX-Core_integrated.rng" type="application/xml" schematypens="http://relaxng.org/ns/structure/1.0"') 
tree = etree.ElementTree(root)
# Each call inserts directly in front of the root element, so the resulting order is pi1, pi2, root.
tree.getroot().addprevious(pi1)
tree.getroot().addprevious(pi2)

# Fill the document: header first, then one entry per concept.
write_header(root)
write_text(root, concept_list)

In [None]:
# tostring returns bytes when an encoding is given; decode them so the XML is printed as
# readable, indented text instead of a bytes repr with escaped line breaks.
print(etree.tostring(root, encoding='utf-8', xml_declaration=True, pretty_print=True).decode('utf-8'))

b'<?xml version=\'1.0\' encoding=\'utf-8\'?>\n<tbx style="dca" type="TBX-Core" xmlns="urn:iso:std:iso:30042:ed-2" xml:lang="en">\n  <tbxHeader>\n    <fileDesc>\n      <sourceDesc>\n        <p>TBX file automatically generated by Text2TCS (https://text2tcs.univie.ac.at/)</p>\n      </sourceDesc>\n    </fileDesc>\n    <encodingDesc>\n      <p type="XCSURI">TBXXCSV02.xcs</p>\n    </encodingDesc>\n  </tbxHeader>\n  <text>\n    <body>\n      <conceptEntry id="c1">\n        <transacGrp>\n          <transac type="transactionType">origination</transac>\n          <transacNote type="responsibility">Text2TCS</transacNote>\n          <date>21-03-08_21h-03m</date>\n        </transacGrp>\n        <langSec xml:lang="en">\n          <termSec id="c1-en-t0">\n            <term>active substance</term>\n          </termSec>\n          <termSec id="c1-en-t1">\n            <term>biocidal active substances</term>\n          </termSec>\n          <termSec id="c1-en-t2">\n            <term>biocidal products.</

In [None]:
# Write the document to "output.tbx" in the current working directory (UTF-8, with XML declaration, indented).
et=etree.ElementTree(root)
et.write("output.tbx", encoding='utf-8', xml_declaration=True, pretty_print=True)

**Graph Visualization**

In [None]:
def make_graph(concept_list, filename):
  """Build a Digraph with one box per concept and one edge per relation.

  concept_list -- concept dicts with "id", "terms" and "relations"; a relation's
                  first item is its name, its second item the id of the target concept
  filename     -- passed on to Digraph as the output file name

  A node is named "<id>\\n<list of terms>". Each relation whose target id matches a
  concept yields its own edge labelled with the relation name; associativeRelation
  edges are drawn without arrow head (dir="none"). Relations whose target id matches
  no concept are skipped. Returns the Digraph (nothing is rendered here).
  """
  def box_name(entry):
    # node identifier and caption at once: id on the first line, terms below it
    return entry["id"]+"\n"+str(entry["terms"])

  g = Digraph("G", filename=filename)
  # nodes
  for concept in concept_list:
    g.node(box_name(concept), shape="box")
  # edges
  for source in concept_list:
    node1 = box_name(source)
    for rel in source["relations"]:
      targets = [candidate for candidate in concept_list if candidate["id"] == rel[1]]
      for target in targets:
        extra = {"dir": "none"} if rel[0] == "associativeRelation" else {}
        g.edge(node1, box_name(target), label=str(rel[0]), **extra)
  return g

In [None]:
def make_graph_accumulated(concept_list, filename):
  """Build a bottom-to-top Digraph with at most one edge per ordered concept pair.

  concept_list -- concept dicts with "id", "terms" and "relations"; a relation's
                  first item is its name, its second item the id of the target concept
  filename     -- passed on to Digraph as the output file name

  All relations from one concept to another are drawn as a single edge whose label
  is the relation names joined by ", ". The edge is drawn without arrow head
  (dir="none") only when that combined label is exactly "associativeRelation".
  Returns the Digraph (nothing is rendered here).
  """
  g = Digraph("G", filename=filename)
  g.graph_attr["rankdir"] = "BT"  # lay the graph out from bottom to top

  # node identifier and caption at once: id on the first line, terms below it
  names = [entry["id"]+"\n"+str(entry["terms"]) for entry in concept_list]
  for name in names:
    g.node(name, shape="box")

  # one edge per ordered pair of concepts that has at least one relation
  for concept_1, node1 in zip(concept_list, names):
    for concept_2, node2 in zip(concept_list, names):
      shared = [rel[0] for rel in concept_1["relations"] if rel[1] == concept_2["id"]]
      if not shared:
        continue
      label = ", ".join(shared)
      if label == "associativeRelation":
        g.edge(node1, node2, label=label, dir="none")
      else:
        g.edge(node1, node2, label=label)
  return g

In [None]:
# Build the accumulated concept graph; as the last expression of the cell it is displayed by the notebook.
# NOTE(review): `text_id` is not assigned in any cell shown above - confirm where it comes from,
# otherwise this cell raises NameError on a fresh kernel. Also note that the id is appended
# after the ".gv" extension of the file name.
make_graph_accumulated(concept_list, "test_graph_acc.gv"+str(text_id))