<a href="https://colab.research.google.com/github/Shouvik-7/Pytorch_examples/blob/main/Spacy_Custom_NER_Youtube.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# training_data.py

# Each item is (text, {"entities": [(start_char, end_char, label), ...]}).
# Offsets are character indices into `text` with an exclusive end, so
# text[start_char:end_char] must be exactly the entity string noted in the
# trailing comment. Spans that do not line up with the text cannot be used
# as gold annotations, so keep these in sync whenever a sentence is edited.
TRAIN_DATA = [
    ("Google was founded by Larry Page and Sergey Brin.", {
        "entities": [
            (0, 6, "ORG"),     # "Google"
            (22, 32, "PER"),   # "Larry Page"
            (37, 48, "PER"),   # "Sergey Brin"
        ]
    }),
    ("Facebook is based in Menlo Park.", {
        "entities": [
            (0, 8, "ORG"),     # "Facebook"
            (21, 31, "GPE"),   # "Menlo Park"
        ]
    }),
    ("Elon Musk founded SpaceX.", {
        "entities": [
            (0, 9, "PER"),     # "Elon Musk"
            (18, 24, "ORG"),   # "SpaceX"
        ]
    }),
    ("Satya Nadella is the CEO of Microsoft.", {
        "entities": [
            (0, 13, "PER"),    # "Satya Nadella"
            (28, 37, "ORG"),   # "Microsoft"
        ]
    }),
    ("New York is a large city in the USA.", {
        "entities": [
            (0, 8, "GPE"),     # "New York"
            (32, 35, "GPE"),   # "USA"
        ]
    }),
]


In [None]:
# train_ner.py

import spacy
from spacy.training.example import Example
from spacy.util import minibatch, compounding
import random

# Train an NER component from scratch on TRAIN_DATA and save the resulting
# pipeline to the "custom_ner_model" directory.

# Create a blank English pipeline
nlp = spacy.blank("en")

# Create the NER pipe and add it to the pipeline (or reuse it if present)
if "ner" not in nlp.pipe_names:
    ner = nlp.add_pipe("ner", last=True)
else:
    ner = nlp.get_pipe("ner")

# Register every label that occurs in the training annotations
for _, annotations in TRAIN_DATA:
    for ent in annotations.get("entities", []):
        ner.add_label(ent[2])

# Build the Example objects once. Shuffling this local list (instead of
# TRAIN_DATA itself) keeps the shared training data untouched for other cells.
train_examples = [
    Example.from_dict(nlp.make_doc(text), annotations)
    for text, annotations in TRAIN_DATA
]

# Train with only the NER component enabled
with nlp.select_pipes(enable="ner"):
    # initialize() replaces the deprecated begin_training(); it returns the
    # optimizer that is passed explicitly to every update below.
    optimizer = nlp.initialize(get_examples=lambda: train_examples)
    for itn in range(30):  # Training iterations
        print(f"Iteration {itn}")
        random.shuffle(train_examples)
        losses = {}
        batches = minibatch(train_examples, size=compounding(2.0, 8.0, 1.5))
        for batch in batches:
            # One update per minibatch, so the batch sizes produced by
            # compounding() actually take effect.
            nlp.update(batch, sgd=optimizer, drop=0.3, losses=losses)
        print("Losses", losses)

# Save the model
nlp.to_disk("custom_ner_model")
print("Model saved to 'custom_ner_model'")


Iteration 0
Losses {'ner': np.float32(23.798244)}
Iteration 1




Losses {'ner': np.float32(21.034351)}
Iteration 2
Losses {'ner': np.float32(16.669342)}
Iteration 3
Losses {'ner': np.float32(9.79456)}
Iteration 4
Losses {'ner': np.float32(7.133154)}
Iteration 5
Losses {'ner': np.float32(5.64596)}
Iteration 6
Losses {'ner': np.float32(6.541515)}
Iteration 7
Losses {'ner': np.float32(14.309259)}
Iteration 8
Losses {'ner': np.float32(8.455862)}
Iteration 9
Losses {'ner': np.float32(7.0604196)}
Iteration 10
Losses {'ner': np.float32(5.4379606)}
Iteration 11
Losses {'ner': np.float32(3.0518181)}
Iteration 12
Losses {'ner': np.float32(1.8967929)}
Iteration 13
Losses {'ner': np.float32(1.3391008)}
Iteration 14
Losses {'ner': np.float32(1.3519479)}
Iteration 15
Losses {'ner': np.float32(0.9521105)}
Iteration 16
Losses {'ner': np.float32(2.443947)}
Iteration 17
Losses {'ner': np.float32(0.7021848)}
Iteration 18
Losses {'ner': np.float32(14.684381)}
Iteration 19
Losses {'ner': np.float32(2.0081055)}
Iteration 20
Losses {'ner': np.float32(2.365585)}
Iteration 

In [None]:
# test_ner.py

import spacy

# Reload the pipeline that was serialized to the "custom_ner_model" directory.
nlp = spacy.load("custom_ner_model")

# Run the loaded pipeline over a single test sentence.
test_text = "Bill Gates works at Microsoft and lives in Seattle."
doc = nlp(test_text)

# Collect the predicted entities, then print one "<text> <label>" line each.
predicted = [(span.text, span.label_) for span in doc.ents]
for ent_text, ent_label in predicted:
    print(ent_text, ent_label)


# Fine-tuning a pretrained spaCy NER model

In [None]:
# fine_tune_ner.py

import spacy
from spacy.training.example import Example
from spacy.util import minibatch, compounding
import random

# Fine-tune the NER component of the pretrained "en_core_web_sm" pipeline on
# TRAIN_DATA and save the result to the "fine_tuned_ner" directory.

# Load a pre-trained English model
nlp = spacy.load("en_core_web_sm")

# Get the NER pipe
ner = nlp.get_pipe("ner")

# Add new labels (if any)
for _, annotations in TRAIN_DATA:
    for ent in annotations.get("entities", []):
        ner.add_label(ent[2])

# Build the Example objects once. Shuffling this local list (instead of
# TRAIN_DATA itself) keeps the shared training data untouched for other cells.
train_examples = [
    Example.from_dict(nlp.make_doc(text), annotations)
    for text, annotations in TRAIN_DATA
]

# Update only the NER component; every other pipe is disabled inside the block
with nlp.select_pipes(enable="ner"):
    # resume_training() keeps the existing weights and returns the optimizer
    # that is passed explicitly to every update below.
    optimizer = nlp.resume_training()
    for itn in range(30):
        print(f"Iteration {itn}")
        random.shuffle(train_examples)
        losses = {}
        batches = minibatch(train_examples, size=compounding(2.0, 8.0, 1.5))
        for batch in batches:
            # One update per minibatch, so the batch sizes produced by
            # compounding() actually take effect.
            nlp.update(batch, sgd=optimizer, drop=0.3, losses=losses)
        print("Losses", losses)

# Save the fine-tuned model
nlp.to_disk("fine_tuned_ner")
print("Fine-tuned model saved to 'fine_tuned_ner'")


Iteration 0
Losses {'ner': np.float32(5.966311)}
Iteration 1
Losses {'ner': np.float32(6.5691113)}
Iteration 2
Losses {'ner': np.float32(6.0968003)}
Iteration 3
Losses {'ner': np.float32(6.871419)}
Iteration 4
Losses {'ner': np.float32(4.463703)}
Iteration 5
Losses {'ner': np.float32(6.2995243)}
Iteration 6
Losses {'ner': np.float32(3.2210028)}
Iteration 7
Losses {'ner': np.float32(2.3194242)}
Iteration 8
Losses {'ner': np.float32(2.5746086)}
Iteration 9
Losses {'ner': np.float32(2.6893277)}
Iteration 10
Losses {'ner': np.float32(2.0484698)}
Iteration 11
Losses {'ner': np.float32(12.110895)}
Iteration 12
Losses {'ner': np.float32(9.228569)}
Iteration 13
Losses {'ner': np.float32(5.0935326)}
Iteration 14
Losses {'ner': np.float32(2.5405564)}
Iteration 15
Losses {'ner': np.float32(1.7189946)}
Iteration 16
Losses {'ner': np.float32(2.6228707)}
Iteration 17
Losses {'ner': np.float32(3.15553)}
Iteration 18
Losses {'ner': np.float32(2.7659142)}
Iteration 19
Losses {'ner': np.float32(1.733785

In [None]:
# test_fine_tuned.py

import spacy

# Reload the pipeline that was serialized to the "fine_tuned_ner" directory.
nlp = spacy.load("fine_tuned_ner")

# Run the loaded pipeline over a single test sentence.
text = "Shouvik Sengupta founded Microsoft in Redmond."
doc = nlp(text)

# Collect the predicted entities, then print one "<text> <label>" line each.
predicted = [(span.text, span.label_) for span in doc.ents]
for ent_text, ent_label in predicted:
    print(ent_text, ent_label)


Shouvik Sengupta PER
Microsoft ORG
Redmond GPE


In [None]:
# Import displaCy explicitly so this cell does not depend on `spacy.displacy`
# having been loaded as a side effect of `import spacy`.
from spacy import displacy

# Expects `nlp` to be a pipeline loaded by an earlier cell.
doc = nlp("Shouvik Sengupta founded Microsoft in Redmond.")

# Highlight the recognized entities inline in the notebook output.
# To override the default label colors, pass e.g.
#   options={"colors": {"ORG": "#7DF6D9", "PER": "#F67DE3", "GPE": "#a6e22d"}}
displacy.render(doc, style="ent", jupyter=True)