In [None]:
import os
import random
import re

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from spacy.training import Example
from spacy.util import compounding, fix_random_seed, minibatch

The BioNNE-L Shared Task dataset is a biomedical natural language processing resource designed for Named Entity Recognition (NER) and relation extraction.
It contains scientific and clinical texts annotated with biomedical entities (e.g., genes, proteins, chemicals, diseases) and their relationships, making it useful for training and evaluating models in biomedical text mining.

In [3]:
!ls '/kaggle/input/bionnl-shared-task/NEREL-BIO-master/BioNNE-L_Shared_Task/data/texts/en'

dev  test  train


In [4]:
# Training split: folder of raw document texts and the TSV file with the entity annotations.
directory_texts, file_annotations = (
    '/kaggle/input/bionnl-shared-task/NEREL-BIO-master/BioNNE-L_Shared_Task/data/texts/en/train',
    '/kaggle/input/bionnl-shared-task/NEREL-BIO-master/BioNNE-L_Shared_Task/data/tsv/en/bionnel_en_train.tsv',
)

So the structure is as follows:
##### file_annotations: a tab-separated (.tsv) file, previewed below. Each row holds the document id, the entity text (a word or phrase), its entity type, and its span (the start-end character offsets inside the document). The spans are essential for NER training.
##### directory_texts: a folder of text files, one per document. Each file name is a document_id from file_annotations, so the texts and spans listed in file_annotations refer to these files. We store them in a dictionary with the document id as the key and the text as the value.

In [5]:
# Annotation table: one row per entity mention
# (document_id, text, entity_type, spans, UMLS_CUI).
df = pd.read_csv(file_annotations, sep="\t")
print(df.head())

# Map each document id (file name up to the first ".") to the raw document text.
id_to_text = {}

for file in os.listdir(directory_texts):
    file_path = os.path.join(directory_texts, file)
    # Skip sub-directories and other non-file entries instead of crashing in open().
    if not os.path.isfile(file_path):
        continue
    file_id = file.split(".")[0]
    # The spans index into the decoded text, so decode with an explicit encoding
    # rather than relying on the platform default.
    with open(file_path, "r", encoding="utf-8") as f:
        id_to_text[file_id] = f.read()

   document_id     text entity_type      spans  UMLS_CUI
0  25591652_en  Seizure        DISO    976-983  C0949003
1  25591652_en     AEDs        CHEM  1187-1191  C0887457
2  25591652_en     AEDs        CHEM  1849-1853  C0887457
3  25591652_en     AEDs        CHEM  2033-2037  C0887457
4  25591652_en     AEDs        CHEM  1654-1658  C0887457


Print the first 100 characters of the document "25842921_en".

In [6]:
id_to_text["25842921_en"][:100]

'[Functional morphology of ischemic cardiomyopathy]. OBJECTIVE To show that ischemic cardiomyopathy ('

Read the dataframe grouped by document id, so that for each document id we get the entity types and the spans of all its entities.

#### Then we get the text for each document id, iterate through its group and collect the start, end and label of every entity, and append the result to train_data in the format the NER pipe expects: (text, {"entities": entities}).


In [7]:
# Build training tuples in the form (text, {"entities": [(start, end, label), ...]}).
train_data = []
for doc_id, group in df.groupby("document_id"):
    # Annotations without a matching text file cannot be aligned to any text; skip them.
    if doc_id not in id_to_text:
        continue

    text = id_to_text[doc_id]
    entities = []

    for _, row in group.iterrows():
        label = row["entity_type"]
        try:
            # sometimes spans are like '476,492-500'
            spans = row["spans"].split(",")  # split multiple spans
            for sp in spans:
                start_end = sp.split("-")
                if len(start_end) != 2:
                    print(f"Skipping invalid span {sp}")
                    continue
                start, end = int(start_end[0]), int(start_end[1])
                entities.append((start, end, label))
        except (AttributeError, ValueError) as e:
            # AttributeError: the spans cell is not a string (e.g. a missing value).
            # ValueError: a fragment is not an integer.
            # Anything else (such as a missing column) is a real error and should surface.
            print(f"Skipping row {row} due to {e}")

    train_data.append((text, {"entities": entities}))


This is a quick check: the data is now ready, so what remains is to set up the model and run the training.

In [8]:
# Show the training example at index 1: a (text, {"entities": [...]}) tuple.
print(train_data[1])

('[Impact of bosentan therapy on stress-induced pulmonary hypertension in patients with systemic sclerosis]. \nAIM To describe hemodynamic and clinical changes in patients with elevated mean pulmonary artery pressure (MPAP) > 30 mm Hg during exercise and the impact of bosentan therapy on stress-induced pulmonary hypertension (SIPH).  \nSUBJECTS AND METHODS The study included 19 patients with systemic sclerosis (SDS) in whom possible causes of pulmonary hypertension (PH) (lung and left heart injuries and thromboembolism) were excluded. \nAll the patients underwent pulmonary artery catheterization at rest and during exercise.  \nThe hemodynamic (right atrial pressure (RAP), systolic and diastolic pressure, MPAP, pulmonary artery wedge pressure (PAWP), cardiac output (CO) by a thermodilution technique), clinical (demographic, immunological, and instrumental) parameters were analyzed and the risk of pulmonary arterial hypertension (PAH) was also calculated; 5 patients with SIPH received 16

library that has the models of ner

In [9]:
import spacy

Download the model we'll use: en_core_web_md (the medium-sized English core web model).

In [10]:
!python -m spacy download en_core_web_md

Collecting en-core-web-md==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.8.0/en_core_web_md-3.8.0-py3-none-any.whl (33.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m33.5/33.5 MB[0m [31m47.1 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: en-core-web-md
Successfully installed en-core-web-md-3.8.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_md')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


load the model

In [11]:
# Load the pretrained en_core_web_md pipeline that was downloaded above.
nlp = spacy.load("en_core_web_md")

print the pipes ner is already a pipe as you can see

In [12]:
# List the names of the components in the loaded pipeline.
nlp.pipe_names

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']

We keep the pipe we need, which is ner.

In [13]:
# Fetch the existing "ner" component from the pipeline.
# (Calling nlp("ner") would instead run the pipeline on the text "ner" and return a Doc.)
ner = nlp.get_pipe("ner")

I printed the entity labels that the "ner" pipe can already recognize.

In [14]:
# Entity labels known to the "ner" component of the loaded pipeline.
nlp.pipe_labels["ner"]

['CARDINAL',
 'DATE',
 'EVENT',
 'FAC',
 'GPE',
 'LANGUAGE',
 'LAW',
 'LOC',
 'MONEY',
 'NORP',
 'ORDINAL',
 'ORG',
 'PERCENT',
 'PERSON',
 'PRODUCT',
 'QUANTITY',
 'TIME',
 'WORK_OF_ART']

I created a blank pipeline from scratch and added a new ner pipe to it.

In [15]:
# Replace the pretrained pipeline with an empty English one, so that the only
# component is the freshly added (untrained) "ner" pipe.
nlp = spacy.blank("en")  # start from scratch
ner = nlp.add_pipe("ner")

in here i added the entities of my data that are stored in train_data to ner that i've just created from scratch

In [16]:
# Register every entity type found in the training annotations with the new NER pipe.
# dict.fromkeys keeps first-seen order while dropping repeats.
seen_labels = dict.fromkeys(
    entity[2]
    for _text, annotations in train_data
    for entity in annotations["entities"]
)
for entity_label in seen_labels:
    ner.add_label(entity_label)

len(ner.labels)

3

store in other_pipes all the pipes in nlp except ner

In [17]:
# Names of all pipeline components other than "ner".
other_pipes = [name for name in nlp.pipe_names if name != "ner"]

A function that removes overlaps, for example:
##### If we have a span for one word at [122-128] and another span at [125-130], the two spans overlap, which is incorrect: we can't have two entities sharing characters (or it is a problem introduced by the dataset owner when creating the dataset), so we remove such spans.

In [21]:
def remove_overlaps(entities):
    """Drop entity spans that overlap a span that was already kept.

    Spans are visited in ascending (start, end) order. A span is kept only if
    it starts at or after the end of the last kept span.

    Args:
        entities: iterable of (start, end, label) triples.

    Returns:
        A new list of (start, end, label) tuples without overlaps.
    """
    kept = []
    boundary = -1  # end offset of the most recently kept span
    for begin, stop, tag in sorted(entities, key=lambda span: (span[0], span[1])):
        if begin < boundary:
            # Starts inside the previously kept span: discard.
            continue
        kept.append((begin, stop, tag))
        boundary = stop
    return kept

# Same documents as train_data, but with overlapping entity spans removed.
train_data_fixed = [
    (text, {"entities": remove_overlaps(ann["entities"])})
    for text, ann in train_data
]


This function does some simple cleaning of a span string: it strips stray characters and parses the start and end offsets as integers.

In [22]:
def clean_span(span_str):
    """Parse a single ``"start-end"`` span string into integer offsets.

    Stray characters (spaces, letters, brackets, ...) are removed before
    parsing. Commas are kept as fragment separators, so a multi-fragment span
    such as ``"476,492-500"`` is rejected rather than being fused into the
    bogus offset ``476492``.

    Args:
        span_str: the raw span text, e.g. ``"976-983"``.

    Returns:
        ``(start, end)`` as integers, or ``None`` if the string does not hold
        exactly one parseable ``start-end`` fragment.
    """
    # Keep digits, dashes and commas; drop everything else.
    cleaned = re.sub(r"[^\d\-,]", "", span_str)
    fragments = [part for part in cleaned.split(",") if part]
    # One (start, end) pair cannot represent zero or several fragments.
    if len(fragments) != 1:
        return None
    fragment = fragments[0]
    if "-" not in fragment:
        return None
    start, end = fragment.split("-", 1)
    try:
        return int(start), int(end)
    except ValueError:
        return None

# 4️⃣ Function to remove overlapping entities
def remove_overlaps(entities):
    """Return the (start, end, label) entities with overlapping spans removed.

    Entities are processed in ascending (start, end) order and an entity is
    kept only when it starts at or after the end of the last kept entity.

    NOTE(review): this re-defines the remove_overlaps function from an earlier
    cell with the same logic; the later definition silently shadows the first.
    Consider keeping only one of them.
    """
    # Sort by start position, then by end position
    entities = sorted(entities, key=lambda x: (x[0], x[1]))
    clean_entities = []
    last_end = -1  # end offset of the last kept entity; -1 so the first one is always kept
    for start, end, label in entities:
        if start >= last_end:  # keep non-overlapping
            clean_entities.append((start, end, label))
            last_end = end
    return clean_entities

The part below is the training. First we disable all the other pipes in nlp and keep only "ner", then we initialize the optimizer. I trained for 150 epochs, which gave me the lowest loss. In each epoch we shuffle the data so that the model learns better, and I also use minibatches to increase speed.

In [None]:


# 8️⃣ Training loop
# Seed Python/NumPy (and the ML backend, if any) so that shuffling and
# weight initialisation are repeatable across runs.
SEED = 42
fix_random_seed(SEED)

# Training set: the overlap-free (text, {"entities": [...]}) tuples.
# Work on a copy because random.shuffle below reorders the list in place.
train_examples = list(train_data_fixed)
print(f"Total training examples: {len(train_examples)}")

other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "ner"]
# select_pipes/initialize are the spaCy v3 replacements for disable_pipes/begin_training.
with nlp.select_pipes(disable=other_pipes):
    optimizer = nlp.initialize()
    epochs = 150
    for epoch in range(epochs):
        random.shuffle(train_examples)
        losses = {}  # accumulated by nlp.update over all batches of this epoch
        # Batch size grows from 4 towards 32, multiplied by 1.5 after each batch.
        batches = minibatch(train_examples, size=compounding(4.0, 32.0, 1.5))
        for batch in batches:
            examples = [
                Example.from_dict(nlp.make_doc(text), annotations)
                for text, annotations in batch
            ]
            nlp.update(examples, sgd=optimizer, drop=0.3, losses=losses)
        print(f"Epoch {epoch+1}/{epochs} - Losses: {losses}")

# Save the trained model
output_dir = "./nerel_bio_ner_model"
nlp.to_disk(output_dir)
print(f"Model saved to {output_dir}")




Annotations preview:
    document_id     text entity_type      spans  UMLS_CUI
0  25591652_en  Seizure        DISO    976-983  C0949003
1  25591652_en     AEDs        CHEM  1187-1191  C0887457
2  25591652_en     AEDs        CHEM  1849-1853  C0887457
3  25591652_en     AEDs        CHEM  2033-2037  C0887457
4  25591652_en     AEDs        CHEM  1654-1658  C0887457
Number of docs loaded: 54
Total training examples: 54
Example: ('Objective.\nTo analyze epileptic seizure aggravation associated with antiepileptic drugs (AED) in adult patients.\nMaterial and methods.\nWe examined 1407 patients aged 18-89 years. \nDifferent patterns of seizure aggravation were identified in 103 patients. \nResults.\nAggravated seizures due the generic substitution were found in 32 patients. \nFirst was topiramate (TPM) (n=12), followed by valproates (VPA) (n=8), carbamazepine (CBZ) (n=5), lamotrigine (LTG) (n=1) and levetiracetam (LEV) (n=1). \nPatients with idiopathic generalized epilepsies (IGE) suffered aggr

[2025-09-07 13:13:31,706] [INFO] Created vocabulary
[2025-09-07 13:13:31,707] [INFO] Finished initializing nlp object
  d_xhat = N * dY - sum_dy - dist * var ** (-1.0) * sum_dy_dist


Epoch 1/150 - Losses: {'ner': 11967.277745723724}
Epoch 2/150 - Losses: {'ner': 5528.112783432007}
Epoch 3/150 - Losses: {'ner': 3125.5043714297935}
Epoch 4/150 - Losses: {'ner': 2931.766532329377}
Epoch 5/150 - Losses: {'ner': 2811.7167774140835}
Epoch 6/150 - Losses: {'ner': 2736.850614272058}
Epoch 7/150 - Losses: {'ner': 2484.6074265446514}
Epoch 8/150 - Losses: {'ner': 2336.8233747389168}
Epoch 9/150 - Losses: {'ner': 2233.771256131353}
Epoch 10/150 - Losses: {'ner': 2087.0974660967477}
Epoch 11/150 - Losses: {'ner': 1987.7381681975676}
Epoch 12/150 - Losses: {'ner': 1815.5571449412528}
Epoch 13/150 - Losses: {'ner': 1772.9244377919463}
Epoch 14/150 - Losses: {'ner': 1742.1013987311103}
Epoch 15/150 - Losses: {'ner': 1739.0836687487827}
Epoch 16/150 - Losses: {'ner': 1554.107002620399}
Epoch 17/150 - Losses: {'ner': 1494.991301888164}
Epoch 18/150 - Losses: {'ner': 1404.82836344851}
Epoch 19/150 - Losses: {'ner': 1384.8287853726015}
Epoch 20/150 - Losses: {'ner': 1325.236814480242

As you can see above, the lowest loss I could reach is about 31, which is still huge, so we expect poor performance from the model.

In [34]:
# Number of annotated entities in the training example at index 1
# (train_examples holds (text, {"entities": [...]}) tuples).
len(train_examples[1][1]["entities"])

60

Below is a test on an example the model was already trained on, so it only shows how well the model fits its training data. The results were very good: it detected all the entities and their types.


In [35]:
#  🔟 Test on an example
# Run the trained pipeline on a training document and list what it predicts.
test_text, test_ann = train_examples[1]
doc = nlp(test_text)
print("Entities predicted:")
i = 0  # stays 0 when no entity is predicted
for i, ent in enumerate(doc.ents, start=1):
    print(ent.text, ent.start_char, ent.end_char, ent.label_)

print(i)

Entities predicted:
skin 13 17 ANATOMY
neuropeptides 32 45 CHEM
neurotrophins 47 60 CHEM
receptors 71 80 CHEM
pathogenesis 88 100 DISO
dermatoses 104 114 DISO
neurotransmitters 149 166 CHEM
receptors 177 186 CHEM
itch 209 213 DISO
skin 242 246 ANATOMY
inflammatory response 247 268 DISO
psoriasis 286 295 DISO
atopic dermatitis 300 317 DISO
Skin 341 345 ANATOMY
biopsy specimens 346 362 ANATOMY
psoriasis 385 394 DISO
atopic dermatitis 416 433 DISO
protein 575 582 CHEM
PGP9.5 601 607 CHEM
amphiregulin 610 622 CHEM
semaphorin 624 634 CHEM
calcitonin 639 649 CHEM
CGRP 672 676 CHEM
CGRP-R 696 702 CHEM
nerve 705 710 ANATOMY
NGF 726 729 CHEM
TrkA 748 752 CHEM
substance P 758 769 CHEM
SP 771 773 CHEM
SP 792 794 CHEM
skin 989 993 ANATOMY
biopsy specimens 994 1010 ANATOMY
atopic dermatitis 1030 1047 DISO
psoriasis 1052 1061 DISO
amphiregulin 1094 1106 CHEM
NGF 1108 1111 CHEM
PGP9.5 1117 1123 CHEM
epidermal 1158 1167 ANATOMY
nerve 1168 1173 ANATOMY
nerve 1214 1219 ANATOMY
reduction factor 1220 1236

A simple helper that loads the data I'll test on and returns the document texts to pass to the model.

In [36]:
def convert_to_data(path_tsv , path_test):
    """Load the raw texts of every annotated document of a data split.

    Args:
        path_tsv: path to the tab-separated annotation file; it must have a
            ``document_id`` column.
        path_test: directory with one text file per document, named
            ``<document_id>.<extension>``.

    Returns:
        A list with the text of each document that appears both in the
        annotation file and in the directory, ordered by document id.
        Only the texts are returned; the gold annotations are not.
    """
    df = pd.read_csv(path_tsv, sep="\t")
    print(df.head())

    id_to_text = {}
    for file in os.listdir(path_test):
        file_path = os.path.join(path_test, file)
        # Ignore sub-directories and other non-file entries instead of crashing in open().
        if not os.path.isfile(file_path):
            continue
        file_id = file.split(".")[0]
        # The spans index into the decoded text, so decode with an explicit
        # encoding rather than the platform default.
        with open(file_path, "r", encoding="utf-8") as f:
            id_to_text[file_id] = f.read()

    # groupby yields the document ids in sorted order, which fixes the output order.
    return [
        id_to_text[doc_id]
        for doc_id, _group in df.groupby("document_id")
        if doc_id in id_to_text
    ]

# Test split: annotation table and folder of raw texts.
path_csv, path_test = (
    "/kaggle/input/bionnl-shared-task/NEREL-BIO-master/BioNNE-L_Shared_Task/data/tsv/en/bionnel_en_test.tsv",
    "/kaggle/input/bionnl-shared-task/NEREL-BIO-master/BioNNE-L_Shared_Task/data/texts/en/test",
)

# Texts of all annotated test documents.
test_data = convert_to_data(path_csv, path_test)

# Peek at one of the loaded documents.
print(test_data[10])

   document_id        text entity_type      spans
0  27456563_en  riboflavin        CHEM  1194-1204
1  27456563_en      myopia        DISO  1633-1639
2  27456563_en      stress        DISO    174-180
3  27456563_en      cornea     ANATOMY    188-194
4  27456563_en      cornea     ANATOMY    367-373
[The possibility of using standardized self-report anxiety and depression scales in elderly patients: anxiety scales/questionnaires].
AIM  To describe the specifics of using self-report anxiety scales in elderly patients, determine the parameters of their reliability and validity and develop recommendations on the use of these scales.
MATERIAL AND METHODS
The study included 234 patients, aged over 50 years, with non-psychotic anxiety disorders.
The following scales/questionnaires BAI, GAI, STAI, ZAS, HADS were used at baseline and 12 weeks after treatment.
Conditions of testing, form and content of instructions were similar.
Data analysis included the estimation of reliability and validity o

In [37]:
# Number of test documents returned by convert_to_data.
len(test_data)

153

In [None]:
# Full text of the first test document.
test_data[0]

This is a true test on data the model has never seen.
The results were good: it misses some entities, but it still detected 34 out of 57.

In [41]:

# Predict entities for the first test document and count them.
text = test_data[0]
print("predicted entities for the first text")
doc = nlp(text)
i = 0  # stays 0 when no entity is predicted
for i, ent in enumerate(doc.ents, start=1):
    print(ent.text , "|" , ent.label_)
print(i)

predicted entities for the first text
Riboflavin | CHEM
cross | ANATOMY
cornea | ANATOMY
cornea | ANATOMY
cornea | ANATOMY
cross | ANATOMY
vivo | ANATOMY
eyes | ANATOMY
eyes | ANATOMY
myopia | CHEM
corneal | ANATOMY
samples | ANATOMY
TN1S | CHEM
machine | CHEM
VisuMax | CHEM
WaveLight-FS200 | CHEM
vivo | ANATOMY
cross | ANATOMY
cornea | ANATOMY
Corneal | ANATOMY
corneal | ANATOMY
samples | ANATOMY
load | CHEM
MPa | CHEM
MPa | CHEM
mild to moderate | DISO
riboflavin photoprotection | DISO
fellow eye | ANATOMY
cornea | ANATOMY
cornea | ANATOMY
preliminary | ANATOMY
stroma | ANATOMY
riboflavin | ANATOMY
cross | ANATOMY
34


In [42]:
# Dev split: annotation table and folder of raw texts.
path_csv, path_test = (
    "/kaggle/input/bionnl-shared-task/NEREL-BIO-master/BioNNE-L_Shared_Task/data/tsv/en/bionnel_en_dev.tsv",
    "/kaggle/input/bionnl-shared-task/NEREL-BIO-master/BioNNE-L_Shared_Task/data/texts/en/dev",
)

# Texts of all annotated dev documents.
dev_data = convert_to_data(path_csv, path_test)

   document_id                       text entity_type      spans  UMLS_CUI
0  25726786_en                 depression        DISO    686-696  C0011570
1  25726786_en                        MCI        DISO    371-374  C1270972
2  25726786_en  Mild cognitive impairment        DISO       1-26  C1270972
3  25726786_en                   ischemia        DISO  1249-1257  C0022116
4  25726786_en                        CAD        DISO  1130-1133  C1956346


Another test: here too the model did a good job, and the results are acceptable.

In [44]:
# Predict entities for one dev document and count them.
# The message now names the document that is actually used (index 3, not the first one).
dev_index = 3
text = dev_data[dev_index]
print(f"predicted entities for dev text {dev_index}")
doc = nlp(text)
i = 0  # stays 0 when no entity is predicted
for i, ent in enumerate(doc.ents, start=1):
    print(ent.text , "|" , ent.label_)

print(i)

predicted entities for the first text
arbidol | CHEM
umifenovir | CHEM
influenza | DISO
placebo | CHEM
Arbidol | CHEM
umifenovir | CHEM
influenza | DISO
placebo | CHEM
influenza | DISO
acute respiratory tract infection | DISO
oral | ANATOMY
placebo | CHEM
umifenovir | CHEM
influenza | DISO
influenza | DISO
influenza | DISO
influenza | DISO
placebo | CHEM
umifenovir | CHEM
placebo | CHEM
viral | ANATOMY
influenza | DISO
placebo | CHEM
umifenovir | CHEM
influenza | DISO
appears | ANATOMY
26


In [45]:
import shutil

# Source folder to archive.
# NOTE(review): the model was saved to the relative path "./nerel_bio_ner_model";
# this absolute path matches it only when the working directory is /kaggle/working — confirm.
model_dir = "/kaggle/working/nerel_bio_ner_model"

# zip file path (will create nerel_bio_ner_model.zip in working directory)
# make_archive returns the path of the created archive, which is shown as the cell output.
shutil.make_archive("/kaggle/working/nerel_bio_ner_model", 'zip', model_dir)


'/kaggle/working/nerel_bio_ner_model.zip'

Save the trained model folder in the working directory as a zip archive.