In [1]:
import os
import random
import spacy
from spacy.training.example import Example

# Model Configs

In [3]:
# Base pretrained spaCy pipeline used the first time the notebook is run.
BASE_MODEL = "pt_core_news_lg"
# Custom fine-tuned model; this directory is written by `nlp.to_disk("ner_v0")`
# further down, so it only exists after the notebook has been run once.
CUSTOM_MODEL = "ner_v0"

# Prefer the fine-tuned model when it is available on disk, otherwise start
# from the base model so that "Restart & Run All" works on a fresh checkout.
model = CUSTOM_MODEL if os.path.isdir(CUSTOM_MODEL) else BASE_MODEL

# Load Model

In [4]:
# Load the pipeline selected in the config cell.
nlp = spacy.load(model)

# Make sure the pipeline has an NER component and keep a handle to it.
if 'ner' not in nlp.pipe_names:
    # spaCy v3: add_pipe takes the factory *name* and returns the component;
    # passing a component object (the v2 create_pipe/add_pipe pattern) raises.
    # NOTE(review): a freshly added component is uninitialized and presumably
    # needs nlp.initialize()/ner.initialize() before training - confirm.
    ner = nlp.add_pipe('ner', last=True)
else:
    ner = nlp.get_pipe('ner')

# Load Data

In [5]:
data_folder = "data"
# os.listdir returns entries in arbitrary order; sort so TRAIN_DATA (and the
# record displayed at the end of this cell) is the same on every run.
data_files = sorted(os.listdir(data_folder))

# Each file is expected to hold a list of records shaped like
# {'product': <text>, 'tags': [[label, start, end], ...]}.
TRAIN_DATA = []
for file_name in data_files:
    file_path = os.path.join(data_folder, file_name)
    if not os.path.isfile(file_path):
        # Skip sub-directories (e.g. Jupyter's .ipynb_checkpoints), which
        # would otherwise make open() fail.
        continue
    with open(file_path, 'r') as handle:
        # Read the contents of the file as a string
        json_str = handle.read()
    # SECURITY(review): eval() executes arbitrary code found in the file. Only
    # run this on trusted data; consider json.loads / ast.literal_eval once the
    # on-disk format is confirmed.
    TRAIN_DATA.extend(eval(json_str))
TRAIN_DATA[-1]

{'product': 'Drink Pronto Sabor Gin e Tônica Beats Drinks Gt Skol 269ml',
 'tags': [['B-PRO', 0, 5],
  ['B-ESP', 6, 12],
  ['O', 13, 18],
  ['B-ESP', 19, 22],
  ['O', 23, 24],
  ['B-ESP', 25, 31],
  ['I-ESP', 32, 37],
  ['B-ESP', 38, 44],
  ['I-ESP', 45, 47],
  ['B-MAR', 48, 52],
  ['B-TAM', 53, 58]]}

# Check Data

In [6]:
def check_data(data=None):
    """Print the indices of records whose tag spans do not tile the product text.

    A record is consistent when its first tag starts at offset 0, every
    following tag starts exactly one character after the previous tag's end,
    and the last tag ends at ``len(record['product'])``.

    Args:
        data: iterable of ``{'product': str, 'tags': [[label, start, end], ...]}``
            records. Defaults to the module-level ``TRAIN_DATA``.

    Raises:
        AssertionError: if the first tag of a record does not start at 0.

    Problems other than a bad first offset are reported with ``print`` rather
    than raised, so one call lists every offending record.
    """
    if data is None:
        data = TRAIN_DATA
    for idx, product in enumerate(data):
        tags = product['tags']
        if not tags:
            # No spans at all: report the record instead of failing on tags[0].
            print(idx)
            continue
        if tags[0][1] != 0:
            # Explicit raise (not `assert`) so the check survives `python -O`.
            raise AssertionError(
                f"record {idx}: first tag starts at {tags[0][1]}, expected 0"
            )
        end = tags[0][2]
        for tag in tags[1:]:
            start = tag[1]
            if start != end + 1:
                # Gap or overlap between consecutive spans.
                print(idx)
            end = tag[2]
        text_length = len(product['product'])
        if end != text_length:
            # Last span does not reach the end of the product text.
            print(end, text_length)
            print(idx)
# Validate the freshly loaded records; any index printed here is a record whose spans are inconsistent.
check_data()

# Parse Data

In [7]:
def _merge_continuation_tags(tags):
    """Fold each run of ``I-*`` tags into the tag that precedes the run.

    Args:
        tags: list of ``[label, start, end]`` entries in text order.

    Returns:
        A new list with no ``I-*`` entries; the end offset of every tag that
        was followed by ``I-*`` entries is extended to the end of the last
        entry in that run.

    Raises:
        ValueError: if an ``I-*`` tag appears before any other tag. The
            previous index arithmetic wrapped around to the end of the list
            in that case and silently stretched an unrelated tag.
    """
    merged = []
    for tag in tags:
        if tag[0].startswith('I-'):
            if not merged:
                raise ValueError(f"continuation tag {tag!r} has no preceding tag")
            # Extend the most recent non-continuation span.
            merged[-1][2] = tag[2]
        else:
            # Copy so the span can be extended without touching the input.
            merged.append(list(tag))
    return merged


# Collapse B-/I- sequences into single spans. Safe to re-run: once the I- tags
# are gone the merge is a no-op.
for elem in TRAIN_DATA:
    elem['tags'] = _merge_continuation_tags(elem['tags'])

In [8]:
# Re-validate after the I- tags were merged: spans must still cover each product text.
check_data()

# Train

In [9]:
# Entity labels used for visualisation; order matters because the labels are
# later zipped with a parallel list of colours.
# NOTE(review): the three-letter codes look like Portuguese abbreviations
# (PRO product, MAR brand, ESP specification, TAM size, QUA quantity) - confirm.
LABELS = ["O", "B-PRO", "B-MAR", "B-ESP", "B-TAM", "B-QUA"]

In [10]:
# Convert each record into the (text, {"entities": [(start, end, label), ...]})
# pair that Example.from_dict consumes in the training cell.
train_data_parsed = [
    (
        record['product'],
        {'entities': [(tag[1], tag[2], tag[0]) for tag in record['tags']]},
    )
    for record in TRAIN_DATA
]

In [11]:
# Number of iterations of the training loop below; each iteration goes over all examples.
n_iter=200

In [None]:
# Register every label that occurs in the annotations with the NER component.
# NOTE(review): this includes "O", so non-entity spans are trained as an
# entity type of their own - confirm that is intended.
for _, annotations in train_data_parsed:
    for ent in annotations.get('entities'):
        ner.add_label(ent[2])

# Pair each tokenized text with its gold annotations.
examples = [
    Example.from_dict(nlp.make_doc(text), annotations)
    for text, annotations in train_data_parsed
]

# Examples per optimizer step; small batches make the per-epoch shuffle matter.
batch_size = 8

other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
# select_pipes replaces disable_pipes, which is deprecated in spaCy v3.
with nlp.select_pipes(disable=other_pipes):  # only train NER
    optimizer = nlp.create_optimizer()
    for itn in range(n_iter):
        # Shuffle the list that is actually trained on (previously the unused
        # train_data_parsed list was shuffled, which had no effect).
        random.shuffle(examples)
        losses = {}
        for batch in spacy.util.minibatch(examples, size=batch_size):
            nlp.update(
                batch,
                drop=0.5,
                sgd=optimizer,
                losses=losses)
        # Losses accumulate over the batches of this iteration.
        print(losses)

# Validate

In [12]:
# One display colour per entry of LABELS, in the same order.
colors = [
    "#FFFF00",
    "#008000",
    "#FF0000",
    "#A020F0",
    "#f5f5dc",
    "#808080",
]
label_to_color = dict(zip(LABELS, colors))
# Options dict passed to displacy when rendering entities.
options = {"colors": label_to_color}

In [13]:
# Read the whole validation file into memory as text.
with open('data.json', 'r') as handle:
    json_str = handle.read()

# SECURITY(review): eval() runs whatever code the file contains. Only use this
# on trusted data; consider json.loads / ast.literal_eval once the file format
# is confirmed.
json_data = eval(json_str)

# Keep just the product names; the last one is displayed as a sanity check.
products = [record['product'] for record in json_data]
products[-1]

'Creme de Cebola Menos Sódio Maggi 61g'

In [14]:
# Save the trained pipeline to the "ner_v0" directory, the same name the config cell assigns to `model`.
nlp.to_disk("ner_v0")

In [15]:
# Run the pipeline on each validation product and render its predicted
# entities inline, coloured according to `options`.
for product in products:
    spacy.displacy.render(nlp(product), style="ent", options=options, jupyter=True)