# Installation

In [1]:
# Optional: uncomment to install the third-party packages this notebook relies on
# (spacy-transformers for the transformer-based training pipeline, kagglehub for
# the dataset download, tqdm for the progress bar).
# !pip install spacy-transformers
# !pip install kagglehub
# !pip install tqdm

# Imports

In [15]:
import os
import sys
import json

import spacy
from spacy.tokens import DocBin
from tqdm import tqdm
from spacy.util import filter_spans

In [16]:
import IPython.display

# Register IPython.display under the legacy module name "IPython.core.display",
# so any later import of that name resolves to this module instead.
# NOTE(review): presumably a compatibility shim for displacy's Jupyter rendering
# on newer IPython releases — confirm it is still needed with the installed versions.
sys.modules["IPython.core.display"] = IPython.display

import spacy
from spacy import displacy

# Load Data

Source: https://www.kaggle.com/datasets/finalepoch/medical-ner

In [3]:
import kagglehub

# Download the latest version of the "finalepoch/medical-ner" dataset.
# The returned value is used below as the local directory holding the files;
# later cells read the dataset from `path`.
path = kagglehub.dataset_download("finalepoch/medical-ner")

# Show where the files were stored and which files the dataset contains.
print("Path to dataset files:", path)
print(f"Name of data files: {os.listdir(path)}")

  from .autonotebook import tqdm as notebook_tqdm


Path to dataset files: /Users/oleksandr/.cache/kagglehub/datasets/finalepoch/medical-ner/versions/5
Name of data files: ['Corona2.json']


In [4]:
# Read the annotated examples from the downloaded dataset.
# The encoding is pinned to UTF-8 (the standard JSON interchange encoding) so the
# text decodes the same way on every platform instead of depending on the
# locale's default encoding.
with open(os.path.join(path, 'Corona2.json'), 'r', encoding='utf-8') as f:
    data = json.load(f)

# Inspect the first example to see the annotation schema.
print(json.dumps(data['examples'][0], indent=4))

{
    "id": "18c2f619-f102-452f-ab81-d26f7e283ffe",
    "content": "While bismuth compounds (Pepto-Bismol) decreased the number of bowel movements in those with travelers' diarrhea, they do not decrease the length of illness.[91] Anti-motility agents like loperamide are also effective at reducing the number of stools but not the duration of disease.[8] These agents should be used only if bloody diarrhea is not present.[92]\n\nDiosmectite, a natural aluminomagnesium silicate clay, is effective in alleviating symptoms of acute diarrhea in children,[93] and also has some effects in chronic functional diarrhea, radiation-induced diarrhea, and chemotherapy-induced diarrhea.[45] Another absorbent agent used for the treatment of mild diarrhea is kaopectate.\n\nRacecadotril an antisecretory medication may be used to treat diarrhea in children and adults.[86] It has better tolerability than loperamide, as it causes less constipation and flatulence.[94]",
    "metadata": {},
    "annotations": [

# Data Preparation

We only need the text string, the entity start and end indices, and the entity type. 

In [5]:
# Reduce every raw example to the two things needed for training: the text and
# a list of (start, end, LABEL) character-offset triples, with the label
# upper-cased from the annotation's tag name.
training_data = {
    'classes': ['MEDICINE', "MEDICALCONDITION", "PATHOGEN"],
    'annotations': [],
}

for record in data['examples']:
    entity_triples = [
        (ann['start'], ann['end'], ann['tag_name'].upper())
        for ann in record['annotations']
    ]
    training_data['annotations'].append(
        {'text': record['content'], 'entities': entity_triples}
    )

# Show the first converted example as a sanity check.
print(json.dumps(training_data['annotations'][0], indent=4))

{
    "text": "While bismuth compounds (Pepto-Bismol) decreased the number of bowel movements in those with travelers' diarrhea, they do not decrease the length of illness.[91] Anti-motility agents like loperamide are also effective at reducing the number of stools but not the duration of disease.[8] These agents should be used only if bloody diarrhea is not present.[92]\n\nDiosmectite, a natural aluminomagnesium silicate clay, is effective in alleviating symptoms of acute diarrhea in children,[93] and also has some effects in chronic functional diarrhea, radiation-induced diarrhea, and chemotherapy-induced diarrhea.[45] Another absorbent agent used for the treatment of mild diarrhea is kaopectate.\n\nRacecadotril an antisecretory medication may be used to treat diarrhea in children and adults.[86] It has better tolerability than loperamide, as it causes less constipation and flatulence.[94]",
    "entities": [
        [
            360,
            371,
            "MEDICINE"
        

spaCy stores annotated training data in its binary `DocBin` format: https://spacy.io/api/docbin

In [6]:
nlp = spacy.blank("en")     # Blank English pipeline; used below only to tokenize text via make_doc
doc_bin = DocBin()          # Container that collects the annotated Doc objects for saving to disk

In [7]:
# Start from a fresh DocBin so this cell is idempotent: re-running it must not
# append the same documents a second time to a container created in another cell.
doc_bin = DocBin()

for training_example in tqdm(training_data['annotations']):
    text = training_example['text']
    labels = training_example['entities']
    doc = nlp.make_doc(text)  # tokenize only
    ents = []
    for start, end, label in labels:
        # Map the character offsets onto tokens. With alignment_mode="contract"
        # the span is shrunk to the tokens lying fully inside [start, end);
        # char_span returns None when no such token exists.
        span = doc.char_span(start, end, label=label, alignment_mode="contract")
        if span is None:
            print("Skipping entity")
        else:
            ents.append(span)
    # Drop duplicate/overlapping spans before assigning them as entities.
    filtered_ents = filter_spans(ents)
    doc.ents = filtered_ents
    doc_bin.add(doc)

doc_bin.to_disk("training_data.spacy")  # save the DocBin for `spacy train`

100%|██████████| 31/31 [00:00<00:00, 1145.35it/s]

Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity





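Generate `base_config.cfg` with the spaCy quickstart widget (this notebook trains a transformer-based `ner` pipeline) and save it next to this notebook: https://spacy.io/usage/training#quickstart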

In [11]:
# Expand the partial base_config.cfg into a complete config.cfg by auto-filling
# every setting the base file leaves out.
!python -m spacy init fill-config base_config.cfg config.cfg

[38;5;2m✔ Auto-filled config with all values[0m
[38;5;2m✔ Saved config[0m
config.cfg
You can now add your data and train your pipeline:
python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


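Now we have everything we need to train our model. Note that the command below passes the same file as both the training set and the development set, so the reported `ENTS_F`/`ENTS_P`/`ENTS_R` scores are measured on the training documents and show how well the model fits them, not how well it generalizes.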

In [12]:
# Train with config.cfg and save the resulting pipelines under the current
# directory (--output ./); a later cell loads "model-best" from there.
# NOTE(review): --paths.train and --paths.dev point at the same file.
# --gpu-id 0 runs the training on GPU 0.
!python -m spacy train config.cfg --output ./ --paths.train ./training_data.spacy --paths.dev ./training_data.spacy --gpu-id 0



[38;5;4mℹ Saving to output directory: .[0m
[38;5;4mℹ Using GPU: 0[0m
[1m
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
[38;5;2m✔ Initialized pipeline[0m
[1m
[38;5;4mℹ Pipeline: ['transformer', 'ner'][0m
[38;5;4mℹ Initial learn rate: 0.0[0m
E    #       LOSS TRANS...  LOSS NER  ENTS_F  ENTS_P  ENTS_R  SCORE 
---  ------  -------------  --------  ------  ------  ------  ------
  0       0        4156.60    592.70    0.36    0.33    0.39    0.00
 66     200      117291.70  33184.86   99.61   99.61   99.61    1.00
133     400         195.14    277.45  100.00  100.00  100.00    1.00
200     600          19.18     29.02  100.00  100.00  100.00    1.00
266     800          25.89     43.71  100.00  100.00  100.00    1.00
333    1000         

In [17]:
# Load the best checkpoint written by the training run.
nlp_ner = spacy.load("model-best")

# The sample text is assembled from adjacent string literals. Each fragment ends
# with an explicit space (or the text contains "\n"), because a backslash
# line-continuation inside a literal joins the lines with nothing in between and
# would fuse words such as "HIV-infected" + "individuals".
doc = nlp_ner(
    "Antiretroviral therapy (ART) is recommended for all HIV-infected "
    "individuals to reduce the risk of disease progression.\nART also is recommended "
    "for HIV-infected individuals for the prevention of transmission of HIV.\nPatients "
    "starting ART should be willing and able to commit to treatment and understand the "
    "benefits and risks of therapy and the importance of adherence. Patients may choose "
    "to postpone therapy, and providers, on a case-by-case basis, may elect to defer "
    "therapy on the basis of clinical and/or psychosocial factors."
)

# Highlight colour per entity label for the displaCy entity visualizer.
colors = {"PATHOGEN": "#F67DE3", "MEDICINE": "#7DF6D9", "MEDICALCONDITION":"#FFFFFF"}
options = {"colors": colors}

# Render the predicted entities inline in the notebook.
spacy.displacy.render(doc, style="ent", options=options, jupyter=True)

# References

[How To Train Custom Named Entity Recognition [NER] Model With SpaCy](https://www.newscatcherapi.com/blog/train-custom-named-entity-recognition-ner-model-with-spacy-v3)