STEP 1:
Extract texts from PubMed

In [None]:
import re
import requests
from xml.dom import minidom
import xml.etree.ElementTree as ET

# todo: substitute with correct values:
path_to_pubmed_references = "/content/drive/MyDrive/TextcorpusCreation/summary-geneexpres-set.txt"
path_to_extracted_texts = "/content/drive/MyDrive/ExtractedPubMedArticles/"

def get_pubmed_ids(filename):
  """
  extract PubMed ids of articles 
  from the file
  """
  pubmed_ids = []
  pmid_regex = r'PMID: [0-9]+;'
  with open(filename, "r") as f:
    for line in f:
      match = re.search(pmid_regex, line)
      if match:
        result = match.group().replace("PMID: ", "").replace(";", "")
        pubmed_ids.append(result)
  return pubmed_ids

def build_pubmed_links(pubmed_ids):
  """
  create links to the articles in xml-format
  """
  start = "https://www.ncbi.nlm.nih.gov/research/bionlp/RESTful/pmcoa.cgi/BioC_xml/"
  end = "/unicode"
  pubmed_links = []
  for pm_id in pubmed_ids:
    link = start + pm_id + end
    pubmed_links.append(link)
  return pubmed_links

def clean_data(content: str) -> str:
  # characters in texts to remove
  noise = ["\x80", "\x89", "±", "Â", "â", "Î","¤", 
         "\x93", "\x97", "Ã", "\x88", "\x92",
         "", "", "", "", "", "", "", "",
          "", "¶", "³", "®", "º",  "¼", "Î",
          "±", "", "Â", "²", "¤", "¥", "", "" ]
  # clean the text
  for char in noise:
    content = content.replace(char, "")
  return content

def get_method_description_from_article(uri):
  """
  Go to the article with the address 
  given as uri. The content is saved as an xml.
  We need only text located in the paragraph named Methods.
  So we extract content of the paragraph without headers
  """
  text = [] 
  try:
    response = requests.get(uri)
     # whole article
    content = response.text  
    root = ET.fromstring(content)

    # all children of the 'document'-element that belong to 'passage'-type
    passages = root.findall("./document/passage")  
    for passage in passages:
      infons = passage.findall("infon")    
      section_type = infons[0]    
      text_type = infons[1]

      # get passages that describe methods and materials
      if section_type.text == "METHODS" and text_type.text == "paragraph":
        content = passage.find("text")
        cleaned_text = clean_data(content.text)
        # add cleaned text         
        text.append(cleaned_text)
  except: 
    # write a link in the text file, if the article is not available 
    print("Article is unavailable:")
    print(uri)
  return text
   

def write_file(data, filename, path):
  """
  Write the given text into the file with the given name and
  save it in the given directory
  """
  file_location = path + filename + ".txt"
  with open(file_location, 'w') as f:
      f.write('\n'.join(data))

def extract_corpus():
  """
  Extract information that contains description of methods
  from PubMed articles. This info is a base for a textcorpus 
  needed to train our Transformer model
  """
  # a file with summary information about medical articles that found in PubMed
  results = "/content/drive/MyDrive/TextcorpusCreation/summary-geneexpres-set.txt"

  # directory where extracted texts will be saved
  path_to_textcorpus = "/content/drive/MyDrive/ExtractedPubMedArticles/"

  # collect pubmed article ids
  pubmed_ids = get_pubmed_ids(results)

  # create uris to pubmed articles
  pubmed_uris = build_pubmed_links(pubmed_ids)

  counter = 0
  extracted_texts_counter = 0
  for uri in pubmed_uris:
    # extract method descriptions, that will be used as text data
    content = get_method_description_from_article(uri)  
    if len(content) > 0:
      # determine a name of the file, where the extracted text will be saved
      file_name = pubmed_ids[counter]
      
      # write content in file
      write_file(content, file_name, path_to_textcorpus)
      extracted_texts_counter += 1
    counter += 1
 
  print("extracted texts:" + str(extracted_texts_counter))


# run the code
print(extract_corpus())





Article is unavailable:
https://www.ncbi.nlm.nih.gov/research/bionlp/RESTful/pmcoa.cgi/BioC_xml/29509546/unicode
Article is unavailable:
https://www.ncbi.nlm.nih.gov/research/bionlp/RESTful/pmcoa.cgi/BioC_xml/25502501/unicode
Article is unavailable:
https://www.ncbi.nlm.nih.gov/research/bionlp/RESTful/pmcoa.cgi/BioC_xml/32780725/unicode
Article is unavailable:
https://www.ncbi.nlm.nih.gov/research/bionlp/RESTful/pmcoa.cgi/BioC_xml/31990678/unicode
Article is unavailable:
https://www.ncbi.nlm.nih.gov/research/bionlp/RESTful/pmcoa.cgi/BioC_xml/26294732/unicode
Article is unavailable:
https://www.ncbi.nlm.nih.gov/research/bionlp/RESTful/pmcoa.cgi/BioC_xml/32790648/unicode
Article is unavailable:
https://www.ncbi.nlm.nih.gov/research/bionlp/RESTful/pmcoa.cgi/BioC_xml/31961829/unicode
Article is unavailable:
https://www.ncbi.nlm.nih.gov/research/bionlp/RESTful/pmcoa.cgi/BioC_xml/30126994/unicode
Article is unavailable:
https://www.ncbi.nlm.nih.gov/research/bionlp/RESTful/pmcoa.cgi/BioC_xml/

In [None]:
#check python version (just in case...)
import sys
print(sys.version)

3.7.13 (default, Apr 24 2022, 01:04:09) 
[GCC 7.5.0]


STEP 2

*   We have 220 extracted PubMed texts and the task is to label named entities in these texts. NEs of our interest are IVD concepts. 
*   We try to automate labelling steps as much as possible.
*   First of all, we extract already tagged NEs from available annotated data and save them in a file.
*   per line: {concept:[lemma1, lemma2...]}








In [None]:
import json
import nltk
from nltk.stem import WordNetLemmatizer 
nltk.download('wordnet')
nltk.download('punkt')

def find_substring(text: str, start: int, end: int) -> str:
  """
  Find a substring located in the text 
  between start and end index
  """
  substring = ""  
  for x in range(start, end):    
    substring += text[x]  
  return substring 

lemmatizer = WordNetLemmatizer()
def lemmatize(concept: str):
  """
  tokenize and lemmatize the given NE concept
  and get the result as a tuple(concept, [lemmas])
  """  
  word_list = nltk.word_tokenize(concept)
  lemmatized_output = [lemmatizer.lemmatize(w) for w in word_list]
  return (concept, lemmatized_output)

def write_data(data_to_write):
  """
  write extracted concepts from the dictionary
  in the file
  """
  # output info
  path_to_extracted_nes = "/content/drive/MyDrive/TextcorpusCreation/extracted_nes.txt" 
  with open(path_to_extracted_nes, 'w') as f:
    f.write(json.dumps(ivd_entries))

 
# extracted ivd concepts
ivd_entries = dict()
# path to a jsonl file with annotated texts
path_to_annotated = "/content/drive/MyDrive/TextcorpusCreation/textkorpus_doccano.jsonl"

#start point
with open(path_to_annotated) as f:
  lines = f.readlines()
  for line in lines:      
    line_dict = json.loads(line)
    #get labels
    labels = line_dict["label"]
    text = line_dict["data"]    
    #find IVD
    for label in labels:
      start = label[0]      
      end = label[1]     
      concept = find_substring(text, start, end)
      #normalize
      (ne, lemmas) = lemmatize(concept)
      #save them in dictionary
      if ne not in ivd_entries:
        ivd_entries[ne] = lemmas
# write extracted entities in the file        
write_data(ivd_entries)

        

STEP 3:
To automate extraction, a special dictionary of already known NEs is created.

To identify IVD concepts in unlabelled texts, following steps will be done:
*   read the new text
*   tokenize it
*   check if some entries from the extracted_nes.txt are available in the text, if yes -> label them as NE
*   check if some entries from above described dictionareis are available in the text. If yes -> label the as well
*   bring the data in the form that is expected by Label Studio
*   save in a json-file


STEP 3 and 4 are repeated as follows:


*   Annotate N texts (e.g. N = 10) using STEP 3
*   correct annotations using Label Studio
*   use manual annotations to extend a dictionary with NEs (extracted_nes.txt) using STEP 4
*   Repeat the cycle
















 







In [12]:
import glob
import json
import nltk
from nltk.stem import WordNetLemmatizer 
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('omw-1.4')

extracted_nes = "/content/drive/MyDrive/TextcorpusCreation/extracted_nes.txt"
path_to_unlabelled_texts = "/content/drive/MyDrive/ExtractedPubMedArticles/"

all_files = glob.glob(path_to_unlabelled_texts + "*.txt")

def read_data(path_to_file: str):
  """
  path_to_file: path to a file with extracted
  read extracted named entities and save them in ivd_dict
  dictionary for further processing
  """
  with open(extracted_nes) as f:
    data = f.read().strip()
    ivd_dict = json.loads(data)    
  return ivd_dict
    
def lemmatize_text(text: str):
  """
  tokenize and lemmatize the given text 
  and get the result as a list of tokens
  """  
  tokenized_text = []
  word_list = nltk.word_tokenize(text)
  for w in word_list:
    lemma = lemmatizer.lemmatize(w)
    tokenized_text.append(lemma)
  return tokenized_text

def handle_simple_concept(concept, concept_list, counter):
  """
  append an index of the given concept 
  in the list of concepts
  """
  new_item = [counter]
  if new_item not in concept_list:
    concept_list.append(new_item)

def handle_complex_concept(text_tokens, counter, concept, concept_list):
  """
  determine whetter found tokens are a NE, if yes -> determine indices of these
  tokens in the text, create a list of them and add this list to the global
  list of concept positions
  text_tokens: tokenized and lemmatized text
  counter: help variable to control a position of a token in the text
  concept: a known NE tokenized and lemmatized
  concept_list: a list of list. Each entry contains positions of a found concept
  when the first token of the concept is found in the text
  check if other tokens of the same concepts are available.
  """ 
  #add the found token in the list of concept tokens
  possible_concept_parts = [text_tokens[counter]]   
  concept_size = len(concept) 
  
  #collect tokens with relevant indices in the text    
  for i in range(1, concept_size): 
    if len(text_tokens) > (counter + i):
       possible_concept_parts.append(text_tokens[counter + i])
    else:
      print("text corpus length: " + str(len(text_tokens)))
      print("current concept counter: " + str(counter + i))
      continue;

  same_tokens = []
  #compare collected tokens with concept tokens
  for item in possible_concept_parts:
    if item in concept:
      same_tokens.append(item)

  #check if all text tokens are parts of the concept
  if len(same_tokens) == concept_size:
    #add indices of the found items to the concept list
    new_items = []
    for i in range(counter, counter + concept_size):
      new_items.append(i)
    if new_items not in concept_list:
      concept_list.append(new_items)   


def find_concepts(text, ivd_dict):
  """
  find NEs in the given text and save them
  in the list  
  """
  text_tokens = lemmatize_text(text)   
  #a list of indices of words that are NEs
  concepts = []
  #NE in dictionary
  for item in ivd_dict:  
    concept_tokens = ivd_dict[item]    
    #helper to get a token index
    counter = 0
    for token in text_tokens:
      #first token of the concept found
      if concept_tokens[0].lower() == token.lower():        
        if(len(concept_tokens) == 1):
          #concept consists of 1 word
          handle_simple_concept(token, concepts, counter)   
        else:
           #concept consists of several words
          handle_complex_concept(text_tokens, counter, concept_tokens, concepts)          
      counter += 1  
  return concepts

def find_concept_in_text(word_list, concept_tokens)-> str:
  """
  word_list: tokenized text
  concept_tokens: positions of concept tokens in text
  determine the whole concept string in the text
  """
  found = ""
  for index in concept_tokens:
    word = word_list[index]    
    found += word
    found += " " 
  return found.strip()

def get_string_start_end(text: str, substring: str): 
  """
  text: a text that contains a substring
  substring: substring in the text
  find all occurences of the substring in the text.
  return a list of tuples: the position of the first and the last character of 
  the given substring in the text
  """ 
  all_occurences = []
  first_pos = 0
  while True:
    first_pos = text.find(substring, first_pos)
    last_pos = first_pos + len(substring)-1
    if first_pos == -1:
       break; 
    result = (first_pos, last_pos)
    all_occurences.append(result)
    first_pos += len(substring)
  return all_occurences

def create_substring_info(start:int, end:int, concept:str):
  """
  start: start position of the concept in the text
  end: end position of the concept in the text
  concept: current concept
  create information for
  each found in the text concept.
  This information is saved in the list of
  dictionaries, a dictionary per concept  
  """
  substring_info = {}
  value = {"start":start, 
           "end":end,
           "text":substring,
           "labels":["MedTech"]}
  substring_info["value"] = value
  substring_info["id"] = str(substring_info_id)
  substring_info["from_name"]="label"
  substring_info["to_name"]="text"
  substring_info["type"]="labels"
  substring_info["origin"]="manual"
  return substring_info

def create_annotation_info(result):
  """
  create a part of concept information 
  for the key "annotations"
  """
  annotations = []
  annot_info = {
        "id":annotation_id,
        "completed_by": 1,
        "result":result,
        "was_cancelled": False,
        "ground_truth": False,
        "created_at":timestamp,
        "updated_at":timestamp,
        "lead_time": 4.362,
        "prediction": {},
        "result_count": 0,
        "task": task_id,
        "parent_prediction":None,
        "parent_annotation":None
    } 
  annotations.append(annot_info)
  return annotations 

def create_concept_info(annotations, line, file_name):
  """
  create a dictionary with all needed information about
  the concept
  """  
  concept_info={
      "id":entry_id,
      "annotations":annotations,
      "file_upload":file_name,
      "drafts":[],
      "predictions":[],
      "data":{
          "text": line.strip()
          },
      "meta": {},
      "created_at":timestamp,
      "updated_at":timestamp,
      "project":project
    }
  return concept_info

def write_json(json_info, filename, path):
  """
  Write the given text into the file with the given name and
  save it in the given directory
  """
  json_name = filename.replace("txt", "json")
  file_location = path + json_name
  with open(file_location, 'w') as f:
    json.dump(json_info, f)  

#file to label
unlabelled_text = "/content/drive/MyDrive/ExtractedPubMedArticles/29311296.txt"
file_with_Nes = "/content/drive/MyDrive/TextcorpusCreation/extracted_nes.txt"
path_to_ls_json = "/content/drive/MyDrive/TextcorpusCreation/ProcessedTexts/"
ivd_dict = read_data(file_with_Nes)
lemmatizer = WordNetLemmatizer()

#json creation helper
substring_info_id = 1000000000
entry_id = 1
annotation_id = 1
task_id = 1
project = 1
timestamp="2022-05-20T15:14:20.100993Z"
ls_json_list=[]

#startpoint
with open(unlabelled_text) as f:
  all_concepts = []  
  lines = f.readlines()
  whole_text = ' '.join(lines)
  for line in lines:    
    result = find_concepts(line, ivd_dict)     
    all_concepts.append(result) 
  
  line_counter = 0
  for line in lines:
    #help variable to control dublicated occurences
    concept_occurences = []
    print("----------")    
    current_concepts = all_concepts[line_counter]
    print("concepts in line:")
    print(current_concepts)   
    
    word_list = nltk.word_tokenize(line) 
    result = []   
    for concept in current_concepts:
      substring = find_concept_in_text(word_list, concept)
      print("concept substring")
      print(substring)
      #get start and end position of each concept occurence in text
      all_occurences = get_string_start_end(line, substring)
      for item in all_occurences:        
        if item not in concept_occurences:
          concept_occurences.append(item)
          print("added concept occurence item")
          print(item)

          #create Label Studio json           
          #1. key "result" in json
          if len(substring) > 0:    
            start = item[0] 
            #add 1 because of LS functionality
            end = item[1] + 1  
            concept_info = create_substring_info(start, end, substring)  
            result.append(concept_info)
            substring_info_id +=1
    # 2.key "annotations"in json   
    annotations = create_annotation_info(result)
    annotation_id += 1
    task_id +=1

    #3.create info about whole text    
    filename = unlabelled_text.split("/")[-1]
    concept_info = create_concept_info(annotations, line, filename)
    entry_id += 1
    project += 1
    line_counter += 1       
    ls_json_list.append(concept_info) 

  #convert result into json  
  endresult = json.dumps(ls_json_list)
  print(endresult)
  #write result in a file
  write_json(ls_json_list, filename, path_to_ls_json)
    

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


text corpus length: 187
current concept counter: 187
----------
concepts in line:
[]
----------
concepts in line:
[]
----------
concepts in line:
[[109]]
concept substring
HPLC
added concept occurence item
(593, 596)
----------
concepts in line:
[]
----------
concepts in line:
[[19], [23]]
concept substring
assay
added concept occurence item
(110, 114)
concept substring
Assay
added concept occurence item
(132, 136)
----------
concepts in line:
[[67], [66, 67], [77], [98], [137, 138], [70], [118], [138], [57], [168], [63], [101]]
concept substring
plate
added concept occurence item
(384, 388)
concept substring
12-well plate
added concept occurence item
(376, 388)
concept substring
DMSO
added concept occurence item
(442, 445)
added concept occurence item
(540, 543)
concept substring
DMSO
concept substring
culture medium
added concept occurence item
(715, 728)
concept substring
medium
added concept occurence item
(395, 400)
added concept occurence item
(633, 638)
added concept occurence i

STEP 4:
*   To extend the collected extracted NEs, automatically + manually annotated files will be searched for new concepts.
*   The extended collection of extracted NEs will be used for further NE identification in the untagged texts







In [1]:
import glob
import json
import nltk
from nltk.stem import WordNetLemmatizer 
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('omw-1.4')

path_to_labelled_texts = "/content/drive/MyDrive/TextcorpusCreation/AnnotatedTexts/131-140/"
extracted_nes = "/content/drive/MyDrive/TextcorpusCreation/extracted_nes.txt"


def read_data(path_to_file: str):
  """
  path_to_file: path to a file with extracted
  read extracted named entities and save them in ivd_dict
  dictionary for further processing
  """
  with open(path_to_file) as f:
    data = f.read().strip()
    ivd_dict = json.loads(data)    
  return ivd_dict

def write_data(data_to_write):
  """
  write extracted concepts from the dictionary
  in the file
  """
  with open(extracted_nes, 'w') as f:
    f.write(json.dumps(data_to_write))

def get_concepts(ls_json):
  """
  get a label studio json-file
  and get NEs that are saved in the nested json
  """
  found_concepts = []
  current_line = lines[0]
  info = json.loads(current_line)
  for item in info:
    annotations = item["annotations"]
    current_annotation = annotations[0]
    result = current_annotation["result"]
    for item in result:
      val = item["value"]
      concept = val["text"].strip()
      found_concepts.append(concept)
  return found_concepts
    
def lemmatize(concept: str):
  """
  tokenize and lemmatize the given NE concept
  and get the result as a tuple(concept, [lemmas])
  """  
  word_list = nltk.word_tokenize(concept)
  lemmatized_output = [lemmatizer.lemmatize(w) for w in word_list]
  return (concept, lemmatized_output)

#manually annotated texts
all_files = glob.glob(path_to_labelled_texts + "*.json")

#look-up
extracted_concepts = read_data(extracted_nes)

#nltk WordNetLemmatizer 
lemmatizer = WordNetLemmatizer()

#startpoint
for file in all_files:  
  with open(file) as f:    
    lines = f.readlines()
    concepts = get_concepts(lines)   
    for each in concepts:      
      (ne, lemmas) = lemmatize(each.lower())
      #save them in dictionary
      if ne not in extracted_concepts:
        #avoid dublicated values for keys like plate/plates
        if lemmas not in extracted_concepts.values():
          extracted_concepts[ne] = lemmas
          print(extracted_concepts[ne])   
#extend look-up with new concepts          
write_data(extracted_concepts)

print("done")


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


['annexin', 'v/pi', 'assay']
['cytokine', 'cocktail']
['abt-199']
['flow', 'cytometry']
['rppa']
['reverse', 'phase', 'protein', 'array']
['immunoblots']
['li-cor', 'odyssey', 'clx', 'infrared', 'imaging', 'system']
['real-time', 'rt-pcr', 'assay']
['taqman', 'human', 'apoptosis', 'array']
['microfluidic', 'card']
['direct-zol', 'miniprep', 'kit']
['nanostrings', 'ncounter', 'gex', 'technology']
['anti-human', 'cd3-percp']
['anti-cd56-apc']
['anti-cd16-pecy7']
['anti-human', 'cd4-bv421']
['anti-human', 'cd107a-bv500']
['anti-human', 'cd38-fitc']
['anti-human', 'hla-dr-pe']
['anti-human', 'nkg2d-bv500']
['ghost', 'red', '780']
['mouse', 'igg1-bv500']
['igg1-apc']
['igg2a-pe']
['bd', 'facsverse', 'flow', 'cytometer']
['ibm', 'spss', 'statistic', '22']
['mononuclear', 'cell', 'preparation', 'tube']
['all', 'prep', 'dna/rna/mirna', 'universal', 'kit']
['truseq', 'rna', 'sample', 'preparation', 'kit']
['illumina', 'hiseq', '2000']
['of', 'medium']
['somalogic', 'somascan', 'assay']
['assay'