## Load Packages

In [31]:
from __future__ import unicode_literals, print_function
import plac
import random
from pathlib import Path
import spacy
from tqdm import tqdm 

In [42]:
from spacy.training.example import Example

In [32]:
import json

In [33]:
# !python3 -m spacy download en_core_web_lg

In [34]:
# Alternative model, left disabled: nlp1 = spacy.load('en_core_web_lg')
# Load the pretrained `en_core_web_sm` spaCy pipeline.
# NOTE(review): `nlp1` is not referenced again in the visible cells (training
# below builds its own `nlp` object), so this load may be removable; confirm.
nlp1 = spacy.load("en_core_web_sm")

## Train Data

In [36]:
# Training examples in the form (text, {"entities": [(start, end, label), ...]}).
TRAIN_DATA = []

# The dataset is read as JSON Lines: one JSON object per line, each carrying a
# "text" string and an "entities" list of [start, end, label] triples.
# Decode explicitly as UTF-8 so the result does not depend on the platform's
# default encoding.
with open("datasets/6_3.json", 'r', encoding="utf-8") as json_file:
    for line in json_file:
        # Skip blank lines (e.g. a trailing newline at the end of the file),
        # which json.loads would otherwise reject.
        if not line.strip():
            continue

        record = json.loads(line)
        text = record["text"]

        # Normalise each [start, end, label] triple to a tuple.
        entities = [(start, end, label) for start, end, label in record["entities"]]

        TRAIN_DATA.append((text, {"entities": entities}))

## Define our variables

In [37]:
# Pipeline to start from: None means a blank English pipeline is created,
# any other value is handed to spacy.load().
model = None
# Directory the trained pipeline is written to (and later reloaded from).
output_dir = 'output'
# Number of passes over the training data.
n_iter = 100

## Load the model

In [38]:
# Build the pipeline object that will be trained: start from scratch when no
# model was configured, otherwise load the configured one.
if model is None:
    nlp = spacy.blank('en')
    print("Created blank 'en' model")
else:
    nlp = spacy.load(model)
    print("Loaded model '%s'" % model)

Created blank 'en' model


## Set up the pipeline

In [41]:
# Make sure the pipeline has an NER component and keep a handle on it.
if 'ner' not in nlp.pipe_names:
    # In spaCy v3, add_pipe takes the registered factory name, creates the
    # component, inserts it into the pipeline and returns it. Passing the
    # returned component object to add_pipe a second time raises an error,
    # so a single call does all the work.
    ner = nlp.add_pipe('ner', last=True)
else:
    # The component is already present (e.g. a loaded model); reuse it.
    ner = nlp.get_pipe('ner')


## Train the Recognizer

In [43]:
# Register every entity label that occurs in the training annotations with the
# NER component; the label is the third field of each entity entry.
for _text, annots in TRAIN_DATA:
    for span in annots.get('entities'):
        label = span[2]
        ner.add_label(label)

# Train only the NER component: every other pipe is disabled while the
# `with` block is active and restored afterwards.
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
with nlp.select_pipes(disable=other_pipes):
    # Initialise the weights only when starting from a blank pipeline.
    # Re-initialising a loaded pipeline would discard its trained weights,
    # so in that case just obtain an optimizer to continue training.
    if model is None:
        optimizer = nlp.initialize()
    else:
        optimizer = nlp.resume_training()

    for itn in range(n_iter):
        # Present the examples in a different order on every pass.
        random.shuffle(TRAIN_DATA)
        # Per-epoch loss accumulator, filled in by nlp.update.
        losses = {}
        for text, annotations in tqdm(TRAIN_DATA):
            example = Example.from_dict(nlp.make_doc(text), annotations)
            nlp.update([example], drop=0.5, sgd=optimizer, losses=losses)
        print(losses)

  0%|          | 0/164 [00:00<?, ?it/s]

100%|██████████| 164/164 [00:07<00:00, 22.87it/s]


{'ner': 1347.6596243106585}


100%|██████████| 164/164 [00:06<00:00, 26.47it/s]


{'ner': 768.5579713233699}


100%|██████████| 164/164 [00:06<00:00, 25.90it/s]


{'ner': 638.4248130609822}


100%|██████████| 164/164 [00:06<00:00, 24.53it/s]


{'ner': 628.2462953847352}


100%|██████████| 164/164 [00:06<00:00, 23.85it/s]


{'ner': 552.6909909304702}


100%|██████████| 164/164 [00:06<00:00, 26.32it/s]


{'ner': 479.2872117487972}


100%|██████████| 164/164 [00:06<00:00, 26.27it/s]


{'ner': 451.02838698112447}


100%|██████████| 164/164 [00:06<00:00, 24.22it/s]


{'ner': 442.34009333692563}


100%|██████████| 164/164 [00:06<00:00, 25.65it/s]


{'ner': 402.8228329537828}


100%|██████████| 164/164 [00:06<00:00, 26.09it/s]


{'ner': 353.20408064627156}


100%|██████████| 164/164 [00:06<00:00, 26.34it/s]


{'ner': 367.15249799378273}


100%|██████████| 164/164 [00:06<00:00, 26.84it/s]


{'ner': 346.5758066381511}


100%|██████████| 164/164 [00:06<00:00, 23.80it/s]


{'ner': 383.4199035848401}


100%|██████████| 164/164 [00:07<00:00, 23.17it/s]


{'ner': 339.7968426342353}


100%|██████████| 164/164 [00:06<00:00, 26.07it/s]


{'ner': 291.384081483953}


100%|██████████| 164/164 [00:06<00:00, 25.30it/s]


{'ner': 293.0324880852138}


100%|██████████| 164/164 [00:06<00:00, 24.70it/s]


{'ner': 296.61789051471027}


100%|██████████| 164/164 [00:06<00:00, 25.10it/s]


{'ner': 315.779555556157}


100%|██████████| 164/164 [00:06<00:00, 24.02it/s]


{'ner': 343.3720032548357}


100%|██████████| 164/164 [00:06<00:00, 24.41it/s]


{'ner': 299.35940244064096}


100%|██████████| 164/164 [00:07<00:00, 22.28it/s]


{'ner': 326.2980042630612}


 60%|██████    | 99/164 [00:03<00:02, 27.68it/s]

## Test the trained model

In [1]:
# Run the in-memory trained pipeline over the training texts and print, for
# each one, the predicted entity spans and the per-token entity tags.
for sample_text, _gold in TRAIN_DATA:
    parsed = nlp(sample_text)
    found_entities = [(span.text, span.label_) for span in parsed.ents]
    print('Entities', found_entities)
    token_tags = [(tok.text, tok.ent_type_, tok.ent_iob) for tok in parsed]
    print('Tokens', token_tags)

NameError: name 'TRAIN_DATA' is not defined

## Save the model

In [None]:
# Persist the trained pipeline to disk (skipped when no output directory is set).
if output_dir is not None:
    output_dir = Path(output_dir)
    # Create the directory together with any missing parents; exist_ok makes
    # re-running the cell safe and avoids a separate exists() check.
    output_dir.mkdir(parents=True, exist_ok=True)
    nlp.to_disk(output_dir)
    print("Saved model to", output_dir)

## Test the saved model

In [None]:
# Held-out examples in the form (text, {"entities": [(start, end, label), ...]}).
TEST_DATA = []

# The dataset is read as JSON Lines: one JSON object per line, each carrying a
# "text" string and an "entities" list of [start, end, label] triples.
# Decode explicitly as UTF-8 so the result does not depend on the platform's
# default encoding.
with open("datasets/06_9.json", 'r', encoding="utf-8") as json_file:
    for line in json_file:
        # Skip blank lines (e.g. a trailing newline at the end of the file),
        # which json.loads would otherwise reject.
        if not line.strip():
            continue

        record = json.loads(line)
        text = record["text"]

        # Normalise each [start, end, label] triple to a tuple.
        entities = [(start, end, label) for start, end, label in record["entities"]]

        # Append to TEST_DATA, not TRAIN_DATA, so the held-out set is actually
        # populated and the training set is left untouched.
        TEST_DATA.append((text, {"entities": entities}))

In [None]:
# Reload the pipeline from disk and print its predictions for each test text:
# first the entity spans, then the per-token entity tags.
print("Loading from", output_dir)
nlp2 = spacy.load(output_dir)
for sample_text, _gold in TEST_DATA:
    parsed = nlp2(sample_text)
    found_entities = [(span.text, span.label_) for span in parsed.ents]
    print('Entities', found_entities)
    token_tags = [(tok.text, tok.ent_type_, tok.ent_iob) for tok in parsed]
    print('Tokens', token_tags)