In this notebook, various transformer models are fine-tuned on PubMed data.

---

It will be done in several steps:


*   Set up environment
*   Convert Label Studio data into format accepted by SpaCy
*   Fine-tune models
*   Compare results



STEP 1: Install spaCy

In [None]:
#install SpaCy3
!pip install spacy==3.2.0

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting spacy==3.2.0
  Downloading spacy-3.2.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (6.0 MB)
[K     |████████████████████████████████| 6.0 MB 11.5 MB/s 
Collecting thinc<8.1.0,>=8.0.12
  Downloading thinc-8.0.17-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (660 kB)
[K     |████████████████████████████████| 660 kB 62.7 MB/s 
[?25hCollecting pydantic!=1.8,!=1.8.1,<1.9.0,>=1.7.4
  Downloading pydantic-1.8.2-cp37-cp37m-manylinux2014_x86_64.whl (10.1 MB)
[K     |████████████████████████████████| 10.1 MB 28.0 MB/s 
Collecting typing-extensions<4.0.0.0,>=3.7.4
  Downloading typing_extensions-3.10.0.2-py3-none-any.whl (26 kB)
Collecting typer<0.5.0,>=0.3.0
  Downloading typer-0.4.2-py3-none-any.whl (27 kB)
Installing collected packages: typing-extensions, typer, pydantic, thinc, spacy
  Attempting uninstall: typing-extensions
    Found existing installatio

STEP 2: Convert Label Studio data into the JSON format accepted by spaCy
*   [["some text", {'entities': [(start_index, end_index, "label"), (start_index, end_index, "label")]}]]






In [None]:
import glob
import json
import os
import spacy
from spacy.tokens import DocBin
import itertools

# ATTENTION: adjust all paths below before running this code

# path to the directory that holds the data subdirectories; each subdirectory contains 10 annotated texts
path_to_labelled_texts = "/content/drive/MyDrive/TextcorpusCreation/ValidatedTexts"

def _list_json_files(directories):
  """
  directories: list of directory paths
  return: paths of all files ending in '.json' that lie directly inside
  the given directories, in the order the directories are given
  """
  return [
      os.path.join(directory, file_name)
      for directory in directories
      for file_name in os.listdir(directory)
      if file_name.endswith('.json')
  ]

def get_file_paths(base_dir=None, dev_dir_names=("100-120", "140-151")):
  """
  collect the paths of all JSON files below the corpus directory and
  split them into a dev-set and a train-set

  base_dir: directory whose subdirectories hold the annotated JSON files;
            when None, the module-level path_to_labelled_texts is used
  dev_dir_names: names of the subdirectories of base_dir that make up the
                 dev-set; every other subdirectory found by os.walk
                 (except notebook checkpoint directories) is train data
  return: (dev_files, train_files), two lists of JSON file paths
  """
  start_dir = path_to_labelled_texts if base_dir is None else base_dir

  # build the dev directories from start_dir instead of hard-coding absolute
  # paths, so that changing the base path relocates the whole split
  dev_subdirs = [os.path.join(start_dir, name) for name in dev_dir_names]

  #get all subdirectories that contain annotated data for train-set
  train_subdirs = [
      dir_path
      for dir_path, _, _ in os.walk(start_dir)
      if dir_path != start_dir
      and not dir_path.endswith("ipynb_checkpoints")
      and dir_path not in dev_subdirs
  ]
  print(train_subdirs)

  #collect texts for spacy dev-set
  dev_files = _list_json_files(dev_subdirs)
  print(dev_files)

  #collect texts for spacy train-set
  train_files = _list_json_files(train_subdirs)
  print(train_files)

  return dev_files, train_files

def read_data(path_to_file: str):
  """
  path_to_file: path to a LS Json file
  read the JSON file and return its parsed content
  (whatever json.load produces for that file)
  """
  # decode explicitly as UTF-8 (the encoding RFC 8259 prescribes for JSON)
  # instead of depending on the platform's default encoding
  with open(path_to_file, encoding="utf-8") as f:
    return json.load(f)

def get_entity_positions(ls_json, label="MedTech"):
  """
  ls_json: content of a Label Studio Json file (iterable of paragraph
           entries, each with data.text and a list of annotations)
  label: label attached to every collected entity
  concatenate the stripped paragraph texts (separated by one blank) and
  translate the paragraph-relative start/end of every annotated concept
  into positions in the concatenated text
  return: (text, list of (start_index, end_index, label))
  """
  pieces = []
  entities = []
  # length of the text assembled from all preceding paragraphs; advanced once
  # per paragraph so that paragraphs without annotations still count and
  # several annotations of one paragraph share the same offset
  offset = 0
  for par in ls_json:
    raw_text = par["data"]["text"]
    # annotation positions refer to the raw paragraph; stripping leading
    # whitespace moves every character to the left by this amount
    lead = len(raw_text) - len(raw_text.lstrip())
    text = raw_text.strip() + " "
    for annot in par["annotations"]:
      for res in annot["result"]:
        value = res["value"]
        start = max(value["start"] - lead, 0)
        end = max(value["end"] - lead, 0)
        entities.append((offset + start, offset + end, label))
    pieces.append(text)
    offset += len(text)

  total_text = "".join(pieces)
  # the final strip() also drops leading blanks (produced by empty leading
  # paragraphs); shift the positions accordingly
  shift = len(total_text) - len(total_text.lstrip())
  if shift:
    entities = [(max(s - shift, 0), max(e - shift, 0), l) for s, e, l in entities]
  return (total_text.strip(), entities)

def correct_entity_positions(doc, start, end, label):
  """
  find a token-aligned span for a character range whose boundaries may be
  off by one character

  doc: document object providing .text and .char_span(start, end, label=...)
  start, end: character positions of the concept in doc.text
  label: label passed on to doc.char_span
  return: the first non-None result of doc.char_span for the exact range,
          then end + 1, end - 1, start + 1, start - 1; None (after printing
          a notice) when none of these ranges yields a span
  """
  text_length = len(doc.text)
  # (shift of start, shift of end), in order of preference
  candidates = ((0, 0), (0, 1), (0, -1), (1, 0), (-1, 0))
  for start_shift, end_shift in candidates:
    new_start = start + start_shift
    new_end = end + end_shift
    # never query a range that is empty or lies outside the text
    if new_start < 0 or new_end > text_length or new_start >= new_end:
      continue
    span = doc.char_span(new_start, new_end, label=label)
    if span is not None:
      return span
  # report the text of the document itself rather than a global variable
  print(f"Skipping entity [{start}, {end}, {label}] in the following text because the character span '{doc.text[start:end]}' does not align with token boundaries:\n{repr(doc.text)}")
  return None

def remove_doublicated_entries(annotations):
  """
  remove overlapping entities and duplicates

  annotations: list of (start_index, end_index, label) entries; the list is
               modified in place
  rules, applied pairwise in list order (a is listed before b):
    - a covers b (or both are equal): b is removed
    - b covers a: a is removed
    - a and b overlap only partially: a is removed
  entries that only touch (end of one == start of the other) are both kept
  return: the same list object, now free of overlapping entries
  """
  dropped = set()  # indices of the entries to remove
  for (i, a), (j, b) in itertools.combinations(enumerate(annotations), 2):
    # an entry that is already dropped must not knock out further entries
    if i in dropped or j in dropped:
      continue
    if a[0] <= b[0] and a[1] >= b[1]:
      dropped.add(j)
    elif a[0] >= b[0] and a[1] <= b[1]:
      dropped.add(i)
    elif a[0] < b[1] and b[0] < a[1]:
      # partial overlap, regardless of which entry starts first
      dropped.add(i)

  # remove by index (not by value) so identical entries are handled correctly,
  # and assign through the slice so callers holding the list see the result
  annotations[:] = [entry for idx, entry in enumerate(annotations) if idx not in dropped]
  return annotations

#start point
# Convert the Label Studio files of the dev-set and of the train-set into
# spaCy DocBin files. Both sets go through exactly the same steps, so they
# are processed by one loop instead of two copied code sections.
train_data_path = "/content/drive/MyDrive/SpacyData/textkorpus150/val_train.spacy"
dev_data_path = "/content/drive/MyDrive/SpacyData/textkorpus150/val_dev.spacy"
dev_data, train_data = get_file_paths()
spacy_dev_items = []
spacy_train_items = []

nlp = spacy.blank("en")
db_train = DocBin()
db_dev = DocBin()

# (input files, collected spacy items, DocBin, output path); dev-set first
splits = [
  (dev_data, spacy_dev_items, db_dev, dev_data_path),
  (train_data, spacy_train_items, db_train, train_data_path),
]

# NOTE: the loops stay at module level on purpose. `text` and `entity_annot`
# thereby remain module-level names, which helper functions defined earlier
# in this file may read as globals.
for files, spacy_items, doc_bin, out_path in splits:
  for item in files:
    #read LS JSON
    info = read_data(item)
    #create items for spacy data format
    text, entities = get_entity_positions(info)
    #create span-index item
    spacy_items.append([text, {"entities" : entities}])

  for text, annotations in spacy_items:
    #process annotations
    doc = nlp.make_doc(text)
    entity_annot = annotations["entities"]
    corrected_entities = remove_doublicated_entries(entity_annot)
    ents = []
    for start, end, label in corrected_entities:
      span = correct_entity_positions(doc, start, end, label)
      if span is not None:
        ents.append(span)
    doc.ents = ents
    doc_bin.add(doc)
  #save data in SpaCy format
  doc_bin.to_disk(out_path)
print("done")

['/content/drive/MyDrive/TextcorpusCreation/ValidatedTexts/1-20', '/content/drive/MyDrive/TextcorpusCreation/ValidatedTexts/20-40', '/content/drive/MyDrive/TextcorpusCreation/ValidatedTexts/40-60', '/content/drive/MyDrive/TextcorpusCreation/ValidatedTexts/60-80', '/content/drive/MyDrive/TextcorpusCreation/ValidatedTexts/80-100', '/content/drive/MyDrive/TextcorpusCreation/ValidatedTexts/120-140']
['/content/drive/MyDrive/TextcorpusCreation/ValidatedTexts/100-120/cleaned_r26518356.json', '/content/drive/MyDrive/TextcorpusCreation/ValidatedTexts/100-120/cleaned_r26614966.json', '/content/drive/MyDrive/TextcorpusCreation/ValidatedTexts/100-120/cleaned_r26732679.json', '/content/drive/MyDrive/TextcorpusCreation/ValidatedTexts/100-120/cleaned_r26844283.json', '/content/drive/MyDrive/TextcorpusCreation/ValidatedTexts/100-120/cleaned_r26673617.json', '/content/drive/MyDrive/TextcorpusCreation/ValidatedTexts/100-120/cleaned_r26852132.json', '/content/drive/MyDrive/TextcorpusCreation/ValidatedTe

Install the libraries needed for training. Several install variants were tried because some of them failed with errors.

In [None]:
#change runtime to GPU and after that check CUDA version (current 11.2)
#!nvidia-smi

# install PyTorch 1.10.0 for CUDA 11.1
!pip3 install torch==1.10.0+cu111 -f https://download.pytorch.org/whl/torch_stable.html

# install spaCy transformers tuned for CUDA 11.1
!pip3 install -U spacy[cuda111,transformers]==3.2.0
!pip3 install transformers[sentencepiece]

# install spacy transformer pipeline
!python -m spacy download en_core_web_trf

# library, equivalent of NumPy library for GPU
#!pip3 install cupy

!pip3 install numpy


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in links: https://download.pytorch.org/whl/torch_stable.html
Collecting torch==1.10.0+cu111
  Downloading https://download.pytorch.org/whl/cu111/torch-1.10.0%2Bcu111-cp38-cp38-linux_x86_64.whl (2137.6 MB)
[K     |████████████▌                   | 834.1 MB 1.4 MB/s eta 0:15:57tcmalloc: large alloc 1147494400 bytes == 0x3926e000 @  0x7f7a81f9f615 0x5d631c 0x51e4f1 0x51e67b 0x4f7585 0x49ca7c 0x4fdff5 0x49caa1 0x4fdff5 0x49ced5 0x4f60a9 0x55f926 0x4f60a9 0x55f926 0x4f60a9 0x55f926 0x5d7c18 0x5d9412 0x586636 0x5d813c 0x55f3fd 0x55e571 0x5d7cf1 0x49ced5 0x55e571 0x5d7cf1 0x49ec69 0x5d7c18 0x49ca7c 0x4fdff5 0x49ced5
[K     |███████████████▉                | 1055.7 MB 1.4 MB/s eta 0:12:37tcmalloc: large alloc 1434370048 bytes == 0x7d8c4000 @  0x7f7a81f9f615 0x5d631c 0x51e4f1 0x51e67b 0x4f7585 0x49ca7c 0x4fdff5 0x49caa1 0x4fdff5 0x49ced5 0x4f60a9 0x55f926 0x4f60a9 0x55f926 0x4f60a9 0x55f

In [None]:
#validate compatibility of installed modules
!python -m spacy validate

⠙ Loading compatibility table...[2K[38;5;2m✔ Loaded compatibility table[0m
[1m
[38;5;4mℹ spaCy installation: /usr/local/lib/python3.7/dist-packages/spacy[0m

NAME              SPACY            VERSION                            
en_core_web_trf   >=3.2.0,<3.3.0   [38;5;2m3.2.0[0m   [38;5;2m✔[0m
en_core_web_sm    >=3.4.0,<3.5.0   [38;5;3m3.4.1[0m   --> 3.2.0     

[1m
Use the following commands to update the packages:
python -m spacy download en_core_web_sm



In [None]:
#download a current transformer pipeline if needed
!python -m spacy download en_core_web_trf

STEP 3: Create base configuration that can be adjusted for each model

In [None]:
#fill in base config with appropriate values. This configuration is needed for training with Spacy
!python -m spacy init fill-config /content/drive/MyDrive/SpacyData/base_config.cfg /content/drive/MyDrive/SpacyData/config.cfg 

In [None]:
#validate config file to avoid errors during the training
!python -m spacy debug data /content/drive/MyDrive/SpacyData/config.cfg

STEP 4: Train models: use various cfg-files to tune different transformers.
Default values: train and dev sets, number of epochs = 10.


In [None]:
#Tune PubmedBERT
!python -m spacy train /content/drive/MyDrive/SpacyData/pubmedbert_config.cfg --output /content/drive/MyDrive/SpacyData/models/pubmed_bert150

In [None]:
#Tune RoBERTa model
!python -m spacy train /content/drive/MyDrive/SpacyData/roberta_config.cfg --output /content/drive/MyDrive/SpacyData/models/roberta_150

In [None]:
#Tune SciBERT model (allenai/scibert_scivocab_uncased)
!python -m spacy train /content/drive/MyDrive/SpacyData/scibert_config.cfg --output /content/drive/MyDrive/SpacyData/models/scibert_150

[38;5;2m✔ Created output directory:
/content/drive/MyDrive/SpacyData/models/val_scibert_150[0m
[38;5;4mℹ Saving to output directory:
/content/drive/MyDrive/SpacyData/models/val_scibert_150[0m
[38;5;4mℹ Using CPU[0m
[1m
[2022-11-25 14:59:21,649] [INFO] Set up nlp object from config
INFO:spacy:Set up nlp object from config
[2022-11-25 14:59:21,663] [INFO] Pipeline: ['transformer', 'ner']
INFO:spacy:Pipeline: ['transformer', 'ner']
[2022-11-25 14:59:21,668] [INFO] Created vocabulary
INFO:spacy:Created vocabulary
[2022-11-25 14:59:21,671] [INFO] Finished initializing nlp object
INFO:spacy:Finished initializing nlp object
Downloading: 100% 385/385 [00:00<00:00, 292kB/s]
Downloading: 100% 223k/223k [00:00<00:00, 662kB/s]
Downloading: 100% 422M/422M [00:11<00:00, 39.7MB/s]
Some weights of the model checkpoint at allenai/scibert_scivocab_uncased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight',