In [1]:
from pathlib import Path
import spacy
import random
from spacy.training.example import Example
import os

# Model Configs

In [2]:
model = "pt_core_news_lg"
output_dir=Path("ner")

# Load Model

In [3]:
nlp = spacy.load(model)  

#set up the pipeline
if 'ner' not in nlp.pipe_names:
    ner = nlp.create_pipe('ner')
    nlp.add_pipe(ner, last=True)
else:
    ner = nlp.get_pipe('ner')

# Load Data

In [4]:
data_folder = "data"
data_files = os.listdir(data_folder)

TRAIN_DATA = []
for file in data_files:
    with open(f'{data_folder}/{file}', 'r') as file:
        # Read the contents of the file as a string
        json_str = file.read()
    TRAIN_DATA.extend(eval(json_str))
TRAIN_DATA[-1]

{'product': 'Drink Pronto Sabor Gin e Tônica Beats Drinks Gt Skol 269ml',
 'tags': [['B-PRO', 0, 5],
  ['B-ESP', 6, 12],
  ['O', 13, 18],
  ['B-ESP', 19, 22],
  ['O', 23, 24],
  ['B-ESP', 25, 31],
  ['I-ESP', 32, 37],
  ['B-ESP', 38, 44],
  ['I-ESP', 45, 47],
  ['B-MAR', 48, 52],
  ['B-TAM', 53, 58]]}

Check data

In [5]:
def check_data():
    for idx, product in enumerate(TRAIN_DATA):
        tags = product['tags']
        assert tags[0][1] == 0
        end = tags[0][2]
        for tag in tags[1:]:
            start = tag[1]
            try:
                assert start == end + 1
            except:
                print(idx)
            end = tag[2]
        try:
            assert end == len(product['product'])
        except:
            print(end, len(product['product']))
            print(idx)
check_data()

# Parse Data

In [6]:
for elem in TRAIN_DATA:
    tags = elem['tags']
    for idx in range(len(tags)):
        if tags[idx][0].startswith('I-'):
            r_iter = 1
            while tags[idx - r_iter][0].startswith('I-'):
                r_iter += 1
            tags[idx-r_iter][2] = tags[idx][2]
    elem['tags'] = [x for x in elem['tags'] if not x[0].startswith('I-')]    

In [7]:
check_data()

# Train

In [8]:
LABELS = [
    "O",
    "B-PRO",
    "B-MAR",
    "B-ESP",
    "B-TAM",
    "B-QUA",
]

In [9]:
train_data_parsed = []
for elem in TRAIN_DATA:
    text = elem['product']
    dic = {}
    dic['entities'] = [(x[1], x[2], x[0]) for x in elem['tags']]
    train_data_parsed.append((text, dic))

In [10]:
n_iter=200

In [11]:
for _, annotations in train_data_parsed:
    for ent in annotations.get('entities'):
        ner.add_label(ent[2])

examples = []
for text, annotations in train_data_parsed:
    example = Example.from_dict(nlp.make_doc(text), annotations)
    examples.append(example)

other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
with nlp.disable_pipes(*other_pipes):  # only train NER
    optimizer = nlp.create_optimizer()
    for itn in range(n_iter):
        random.shuffle(train_data_parsed)
        losses = {}
        nlp.update(
            examples=examples,
            drop=0.5,  
            sgd=optimizer,
            losses=losses)
        print(losses)

{'ner': 2805.475048196502}
{'ner': 2694.137901967857}
{'ner': 2642.671508669853}
{'ner': 2633.8405019607453}
{'ner': 2563.4608026742935}
{'ner': 2478.9382765907794}
{'ner': 2480.0443819761276}
{'ner': 2375.076074361801}
{'ner': 2241.653748035431}
{'ner': 2229.9303835630417}
{'ner': 2119.865308880806}
{'ner': 2082.8194093704224}
{'ner': 2003.0276726484299}
{'ner': 1935.2298483848572}
{'ner': 1886.6481873989105}
{'ner': 1918.1649994850159}
{'ner': 1902.1891714334488}
{'ner': 1930.6337124109268}
{'ner': 1899.3346998691559}
{'ner': 1845.881385564804}
{'ner': 1836.036756157875}
{'ner': 1807.5600908398628}
{'ner': 1786.383133172989}
{'ner': 1764.785614848137}
{'ner': 1766.3593258857727}
{'ner': 1733.9834020137787}
{'ner': 1742.6779681444168}
{'ner': 1744.5112590789795}
{'ner': 1720.0563331842422}
{'ner': 1691.0411868095398}
{'ner': 1680.445146203041}
{'ner': 1671.562949001789}
{'ner': 1712.4032965898514}
{'ner': 1687.4830968379974}
{'ner': 1746.5347197055817}
{'ner': 1811.2185112833977}
{'ne

# Validate

In [12]:
colors = ["#FFFF00", "#008000", "#FF0000", "#A020F0", "#f5f5dc", "#808080"]
label_to_color = {a: b for a, b in zip(LABELS, colors)}
options = {"colors": label_to_color} 

In [16]:
# Open the JSON file for reading
with open('data.json', 'r') as file:
    # Read the contents of the file as a string
    json_str = file.read()

# Use eval to parse the JSON string into a Python list
json_data = eval(json_str)

# Now, json_data contains a list of JSON objects
products = [x['product'] for x in json_data]
products[-1]

'Creme de Cebola Menos Sódio Maggi 61g'

In [17]:
for product in products:
    doc = nlp(product)
    spacy.displacy.render(doc, style="ent", options=options, jupyter=True)