# Install Required Packages

In [1]:
!pip install spacy
!pip install spacy-transformers

Collecting spacy-transformers
  Downloading spacy_transformers-1.3.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (197 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m197.8/197.8 kB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
Collecting transformers<4.37.0,>=3.4.0 (from spacy-transformers)
  Downloading transformers-4.36.2-py3-none-any.whl (8.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.2/8.2 MB[0m [31m104.8 MB/s[0m eta [36m0:00:00[0m
Collecting spacy-alignments<1.0.0,>=0.7.2 (from spacy-transformers)
  Downloading spacy_alignments-0.9.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (313 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m314.0/314.0 kB[0m [31m36.9 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.8.0->spacy-transformers)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runti

# Import Required Modules

In [2]:
import re
import spacy
from tqdm import tqdm # Used to display progress bar of loops
from spacy.tokens import DocBin
from spacy.util import filter_spans
from spacy_transformers import Transformer

  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(


In [None]:
# CHANGE PATH: point this at your copy of the dataset folder.
# Later cells build file paths by plain string concatenation
# (parent_path + name + ".ann"), so the trailing slash must be kept.
parent_path = "/MACCROBAT2020/" # This folder contains all of the ann and txt files

# Data Preparation
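*   Extracting the lines that start with "T" from each `.ann` file.
*   Turning each extracted line into an annotation `(label, start, end, term)` and collecting, per document, an "annotation group": the document text followed by its annotations.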

In [3]:
def get_annotations(ann_file_path : str , txt_file_path:str, tag_names:list):
  """
  Build the annotation group for one document from its .ann/.txt file pair.

  Only lines of the .ann file that start with "T" and whose label field is
  exactly one of `tag_names` are kept. Lines whose start/end fields are not
  plain integers (e.g. discontinuous spans such as "20;25") and lines with
  fewer than four whitespace-separated fields are skipped.

  ann_file_path : path of the annotation file
  txt_file_path : path of the matching raw text file
  tag_names     : entity labels to keep (matched case-sensitively)

  return format : [text_file_content, (entity_label, start, end, term), ... (entity_label, start, end, term)]
  where entity_label is upper-cased, start/end are ints and term is the
  remaining fields of the line joined by single spaces.
  """
  # Context managers close both files even if reading raises.
  with open(ann_file_path, "r") as ann_fhand:
    ann_file_content = ann_fhand.read()
  with open(txt_file_path, "r") as text_fhand:
    text_file_content = text_fhand.read()

  annotation_group = [text_file_content]
  wanted_labels = set(tag_names)

  # Processing into annotations
  for line in ann_file_content.split("\n"):
    if not line.startswith("T"):
      continue

    line_components = line.split()
    # Expected fields: id, label, start, end, then the term words.
    if len(line_components) < 4:
      continue
    entity_type, start_index, end_index, *term = line_components[1:]

    # Compare the label field itself: a tag name that merely appears in the
    # covered text of some other label must not make the line valid.
    if entity_type not in wanted_labels:
      continue
    if not (start_index.isdigit() and end_index.isdigit()):
      continue

    annotation = (entity_type.upper(), int(start_index), int(end_index), " ".join(term))

    # Add formed annotation to annotation group
    annotation_group.append(annotation)

  return annotation_group

In [4]:
# all_files.txt contains all the names of the ann/txt files, one per line.
# The context manager makes sure the handle is closed after reading.
with open(parent_path + "all_files.txt", "r") as fhand_filenames:
  all_filenames = fhand_filenames.readlines()

# Strip the line ending before testing the extension so the last entry is
# kept even when the list file has no trailing newline.
txt_suffix = ".txt"
basenames = [
  filename.strip()[:-len(txt_suffix)]
  for filename in all_filenames
  if filename.strip().endswith(txt_suffix)
]

all_annotation_groups = []

# All desired label/entity names
tag_names = ["Sign_symptom" , "Diagnostic_procedure", "Biological_structure", "Disease_disorder","Medication"]

# Commence with the extraction of annotations
for name in tqdm(basenames):
  ann_file_path = parent_path+name+".ann"
  txt_file_path = parent_path+name+".txt"

  # Collect the annotation group of each text file
  all_annotation_groups.append(get_annotations(ann_file_path, txt_file_path,tag_names))

100%|██████████| 200/200 [05:33<00:00,  1.67s/it]


# Data Preparation for training the model
*   Creating Spans
*   Excluding duplicate and overlapping spans

In [5]:
# A blank English pipeline is enough here: it is only used to tokenize the
# raw text into Doc objects that the entity spans can be attached to.
nlp = spacy.blank('en')
doc_bin = DocBin()

for group in tqdm(all_annotation_groups):
  # First element is the document text, the rest are annotation tuples.
  text, *labelled_offsets = group
  doc = nlp.make_doc(text)

  # Map character offsets onto token spans; char_span returns None when no
  # span can be formed, and those annotations are dropped.
  candidate_spans = [
    doc.char_span(begin, finish, label=label, alignment_mode="contract")
    for label, begin, finish, _term in labelled_offsets
  ]
  aligned_spans = [candidate for candidate in candidate_spans if candidate is not None]

  # filter_spans removes duplicate/overlapping spans so set_ents accepts them.
  doc.set_ents(filter_spans(aligned_spans))
  doc_bin.add(doc)

doc_bin.to_disk("train.spacy")

100%|██████████| 200/200 [00:01<00:00, 138.49it/s]


# Train the NER Model
*   Create a config file with custom settings.
*   Start the training of the model.



In [12]:
# Generate config.cfg for an English pipeline containing only the "ner"
# component, with GPU settings; --force overwrites an existing config.cfg.
! python -m spacy init config config.cfg --lang en --pipeline ner --gpu --force

  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(
[38;5;4mℹ Generated config template specific for your use case[0m
- Language: en
- Pipeline: ner
- Optimize for: efficiency
- Hardware: GPU
- Transformer: roberta-base
[38;5;2m✔ Auto-filled config with all values[0m
[38;5;2m✔ Saved config[0m
config.cfg
You can now add your data and train your pipeline:
python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


In [7]:
# Train with config.cfg on GPU 0, writing the output into the current directory.
# NOTE(review): --paths.dev points at the same train.spacy used for training,
# so the ENTS_F / ENTS_P / ENTS_R columns in the log are measured on training
# data, not on a held-out set.
! python -m spacy train config.cfg --output ./ --paths.train ./train.spacy --paths.dev ./train.spacy --gpu-id 0

[38;5;4mℹ Saving to output directory: .[0m
[38;5;4mℹ Using GPU: 0[0m
[1m
  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(
tokenizer_config.json: 100% 25.0/25.0 [00:00<00:00, 141kB/s]
config.json: 100% 481/481 [00:00<00:00, 2.91MB/s]
vocab.json: 100% 899k/899k [00:00<00:00, 1.94MB/s]
merges.txt: 100% 456k/456k [00:00<00:00, 1.98MB/s]
tokenizer.json: 100% 1.36M/1.36M [00:00<00:00, 2.93MB/s]
model.safetensors: 100% 499M/499M [00:01<00:00, 352MB/s]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
[38;5;2m✔ Initialized pipeline[0m
[1m
[38;5;4mℹ Pipeline: ['transformer', 'ner'][0m
[38;5;4mℹ Initial learn rate: 0.0[0m
E    #       LOSS TRANS...  LOSS NER  ENTS_F  ENTS_P  ENTS_R  SCORE 
---  ------  -------------  -

# Basic testing of NER Model

In [8]:
# Load the pipeline from the "model-best" directory (presumably the
# best-scoring checkpoint written by the `spacy train --output ./` run).
nlp_trained_model = spacy.load("model-best")



In [9]:
# Run the trained pipeline on a sample clinical narrative; the resulting Doc
# is what the next cell visualises.
doc = nlp_trained_model('''
Patient John Doe, a 45-year-old male, presented with a persistent cough, fever, and shortness of breath. The doctor ordered a chest X-ray and blood tests to diagnose the underlying condition. The X-ray revealed signs of pneumonia, and the blood work showed elevated white blood cell count, indicating an infection.

The patient was prescribed a course of Azithromycin, an antibiotic, to treat the bacterial pneumonia. Additionally, the doctor recommended taking Ibuprofen to alleviate the fever and body aches associated with the illness.

Mr. Doe has a history of hypertension and is currently on Lisinopril to manage his high blood pressure. He also has Type 2 diabetes mellitus and takes Metformin regularly to control his blood sugar levels.

During the follow-up visit, the physician noted that the patient's symptoms had improved, and the pneumonia was resolving. However, the doctor advised Mr. Doe to complete the entire course of antibiotics as prescribed to prevent a relapse.

Furthermore, the doctor recommended a pulmonary function test to evaluate the patient's lung capacity and rule out any underlying chronic respiratory conditions, such as asthma or chronic obstructive pulmonary disease (COPD).

In addition to the medical conditions, the patient reported experiencing occasional heartburn and gastric discomfort. The doctor suggested taking an over-the-counter antacid like Omeprazole to manage the symptoms of acid reflux.

Overall, with proper treatment and medication management, the patient's condition is expected to improve, and the risk of complications should be minimized.
''')

In [10]:
# Show the entities predicted for `doc` inline in the notebook (style="ent").
spacy.displacy.render(doc, style="ent", jupyter=True)

In [11]:
# Save the loaded pipeline to its own directory so it can be reloaded later
# with spacy.load(output_dir).
output_dir ="ner_model_gpu"
nlp_trained_model.to_disk(output_dir)