<a href="https://colab.research.google.com/github/Raoina/NLP-Learning-Journey/blob/main/NER.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
TRAIN_DATA = [
    ("Apple is looking at buying U.K. startup for $1 billion", {
        "entities": [(0, 5, "ORG"), (27, 31, "LOC")]
    }),
    ("San Francisco considers banning sidewalk delivery robots", {
        "entities": [(0, 13, "LOC")]
    }),
]


In [2]:
import spacy
from spacy.training.example import Example

nlp = spacy.blank("en")

ner = nlp.add_pipe("ner")

for _, annotations in TRAIN_DATA:
    for ent in annotations.get("entities"):
        ner.add_label(ent[2])

import random

optimizer = nlp.begin_training()
for itn in range(10):
    random.shuffle(TRAIN_DATA)
    losses = {}
    for text, annotations in TRAIN_DATA:
        example = Example.from_dict(nlp.make_doc(text), annotations)
        nlp.update([example], drop=0.5, losses=losses)
    print(losses)

doc = nlp("Google is based in California.")
for ent in doc.ents:
    print(ent.text, ent.label_)


{'ner': np.float32(15.972323)}
{'ner': np.float32(15.047707)}
{'ner': np.float32(13.770542)}
{'ner': np.float32(12.311717)}
{'ner': np.float32(10.992633)}
{'ner': np.float32(9.279618)}
{'ner': np.float32(8.261411)}
{'ner': np.float32(6.19745)}
{'ner': np.float32(5.464408)}
{'ner': np.float32(4.825523)}


In [None]:
!pip install datasets

In [None]:
from datasets import load_dataset
dataset = load_dataset("eriktks/conll2003")

In [6]:
train_dataset = dataset['train']

In [7]:
labels = dataset['train'].features['ner_tags'].feature.names

print(labels)


['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']


In [8]:
def convert_to_spacy_format(dataset):
    spacy_data = []

    for example in dataset:
        tokens = example['tokens']
        tags = example['ner_tags']
        text = ' '.join(tokens)
        entities = []

        offset = 0
        for token, tag in zip(tokens, tags):
            tag_name = labels[tag]
            if tag_name != 'O':
                start = text.find(token, offset)
                end = start + len(token)
                ent_type = tag_name.split('-')[-1]
                entities.append((start, end, ent_type))
                offset = end
            else:
                offset = text.find(token, offset) + len(token)

        spacy_data.append((text, {'entities': entities}))

    return spacy_data

TRAIN_DATA = convert_to_spacy_format(train_dataset)


In [9]:
import spacy
from spacy.tokens import DocBin
from tqdm import tqdm

nlp = spacy.blank("en")
ner = nlp.add_pipe("ner")

for _, annotations in TRAIN_DATA:
    for ent in annotations.get("entities"):
        ner.add_label(ent[2])

db = DocBin()
for text, annot in tqdm(TRAIN_DATA):
    doc = nlp.make_doc(text)
    ents = []
    for start, end, label in annot["entities"]:
        span = doc.char_span(start, end, label=label)
        if span is not None:
            ents.append(span)
    doc.ents = ents
    db.add(doc)

db.to_disk("./train.spacy")


100%|██████████| 14041/14041 [00:03<00:00, 4041.22it/s]


In [None]:
!python -m spacy init config config.cfg --lang en --pipeline ner

In [11]:
!python -m spacy train config.cfg --output ./output --paths.train ./train.spacy --paths.dev ./train.spacy

[38;5;2m✔ Created output directory: output[0m
[38;5;4mℹ Saving to output directory: output[0m
[38;5;4mℹ Using CPU[0m
[1m
[38;5;2m✔ Initialized pipeline[0m
[1m
[38;5;4mℹ Pipeline: ['tok2vec', 'ner'][0m
[38;5;4mℹ Initial learn rate: 0.001[0m
E    #       LOSS TOK2VEC  LOSS NER  ENTS_F  ENTS_P  ENTS_R  SCORE 
---  ------  ------------  --------  ------  ------  ------  ------
  0       0          0.00     46.28    0.00    0.00    0.00    0.00
  0     200         79.75   3317.64   62.25   64.83   59.87    0.62
  0     400        276.63   2134.41   79.35   80.24   78.47    0.79
  0     600        229.00   1927.59   85.12   85.77   84.49    0.85
  0     800        404.25   1994.89   88.82   89.68   87.99    0.89
  0    1000        301.74   2121.26   91.42   91.50   91.33    0.91
  1    1200        364.63   2083.05   94.24   94.64   93.84    0.94
  1    1400        373.89   1569.38   95.40   95.37   95.44    0.95
  1    1600        443.72   1801.37   96.76   97.04   96.49    0.9

In [14]:
import spacy

nlp_ner = spacy.load("./output/model-best")

doc = nlp_ner("I am Rowaina Reda, I graduated form Computer and data Science faculty. I learn NLP course in Alexandria")
for ent in doc.ents:
    print(ent.text, ent.label_)


Rowaina PER
Reda PER
Computer LOC
NLP ORG
Alexandria LOC
