In [1]:
from pathlib import Path
import spacy
import random
from spacy.training.example import Example
import os

# Model Configs

In [2]:
model = "pt_core_news_lg"
output_dir=Path("ner")

# Load Model

In [3]:
nlp = spacy.load(model)  

#set up the pipeline
if 'ner' not in nlp.pipe_names:
    ner = nlp.create_pipe('ner')
    nlp.add_pipe(ner, last=True)
else:
    ner = nlp.get_pipe('ner')

# Load Data

In [4]:
data_folder = "data"
data_files = os.listdir(data_folder)

TRAIN_DATA = []
for file in data_files:
    with open(f'{data_folder}/{file}', 'r') as file:
        # Read the contents of the file as a string
        json_str = file.read()
    TRAIN_DATA.extend(eval(json_str))
TRAIN_DATA[-1]

{'product': 'Drink Pronto Sabor Gin e Tônica Beats Drinks Gt Skol 269ml',
 'tags': [['B-PRO', 0, 5],
  ['B-ESP', 6, 12],
  ['O', 13, 18],
  ['B-ESP', 19, 22],
  ['O', 23, 24],
  ['B-ESP', 25, 31],
  ['I-ESP', 32, 37],
  ['B-ESP', 38, 44],
  ['I-ESP', 45, 47],
  ['B-MAR', 48, 52],
  ['B-TAM', 53, 58]]}

Check data

In [5]:
def check_data():
    for idx, product in enumerate(TRAIN_DATA):
        tags = product['tags']
        assert tags[0][1] == 0
        end = tags[0][2]
        for tag in tags[1:]:
            start = tag[1]
            try:
                assert start == end + 1
            except:
                print(idx)
            end = tag[2]
        try:
            assert end == len(product['product'])
        except:
            print(end, len(product['product']))
            print(idx)
check_data()

# Parse Data

In [6]:
for elem in TRAIN_DATA:
    tags = elem['tags']
    for idx in range(len(tags)):
        if tags[idx][0].startswith('I-'):
            r_iter = 1
            while tags[idx - r_iter][0].startswith('I-'):
                r_iter += 1
            tags[idx-r_iter][2] = tags[idx][2]
    elem['tags'] = [x for x in elem['tags'] if not x[0].startswith('I-')]    

In [7]:
check_data()

# Train

In [8]:
LABELS = [
    "O",
    "B-PRO",
    "B-MAR",
    "B-ESP",
    "B-TAM",
    "B-QUA",
]

In [9]:
train_data_parsed = []
for elem in TRAIN_DATA:
    text = elem['product']
    dic = {}
    dic['entities'] = [(x[1], x[2], x[0]) for x in elem['tags']]
    train_data_parsed.append((text, dic))

In [10]:
n_iter=100

In [11]:
for _, annotations in train_data_parsed:
    for ent in annotations.get('entities'):
        ner.add_label(ent[2])

examples = []
for text, annotations in train_data_parsed:
    example = Example.from_dict(nlp.make_doc(text), annotations)
    examples.append(example)

other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
with nlp.disable_pipes(*other_pipes):  # only train NER
    optimizer = nlp.create_optimizer()
    for itn in range(n_iter):
        random.shuffle(train_data_parsed)
        losses = {}
        nlp.update(
            examples=examples,
            drop=0.5,  
            sgd=optimizer,
            losses=losses)
        print(losses)

{'ner': 2813.980626344681}
{'ner': 2783.1789622306824}
{'ner': 2677.9091601371765}
{'ner': 2613.418913303758}
{'ner': 2554.893795967102}
{'ner': 2609.4687184095383}
{'ner': 2405.9029693603516}
{'ner': 2314.9414994716644}
{'ner': 2265.6971839666367}
{'ner': 2160.076962660998}
{'ner': 2106.7569868564606}
{'ner': 2030.7033240795135}
{'ner': 1956.338317990303}
{'ner': 1933.6746555566788}
{'ner': 1935.5505343675613}
{'ner': 1893.3204870224}
{'ner': 1898.5464707612991}
{'ner': 1881.3622817993164}
{'ner': 1888.2663218975067}
{'ner': 1849.60569357872}
{'ner': 1813.9232369661331}
{'ner': 1821.58562541008}
{'ner': 1762.6743241548538}
{'ner': 1750.4379618167877}
{'ner': 1771.1041090488434}
{'ner': 1739.1390050649643}
{'ner': 1740.1000369787216}
{'ner': 1713.021278977394}
{'ner': 1696.6705377101898}
{'ner': 1681.4146958589554}
{'ner': 1674.2626199126244}
{'ner': 1670.4209226965904}
{'ner': 1663.8306838870049}
{'ner': 1711.2012093663216}
{'ner': 1760.1138554215431}
{'ner': 1757.8908155560493}
{'ner

In [12]:
colors = ["#FFFF00", "#008000", "#FF0000", "#A020F0", "#f5f5dc", "#808080"]
label_to_color = {a: b for a, b in zip(LABELS, colors)}
options = {"colors": label_to_color} 

In [14]:
# Open the JSON file for reading
with open('data.json', 'r') as file:
    # Read the contents of the file as a string
    json_str = file.read()

# Use eval to parse the JSON string into a Python list
json_data = eval(json_str)

# Now, json_data contains a list of JSON objects
products = [x['product'] for x in json_data]
products

['Sabão Barra Artesanal Coco Qualitá Flow Pack 180g',
 'Vinho Tinto Chileno Reserva Carménère Medalla Real 750ml',
 'Pipoca de Micro Sabor Bacon Qualitá 100g',
 'Hambúrguer Sadia Tradicional 672g',
 'Cereal Matinal Duo Nescau 400g',
 'Bolo Coco Panco 300g',
 'Caldo Sabor Carne Qualitá 114g',
 'Twix Triplo Chocolate 40g',
 'Café Torrado e Moído Extra Forte Vácuo Pilão 500g',
 'Isotônico de Laranja Gatorade 500ml',
 'Sabonet em Barra Clássico Cerejeira Ylang Leve Mais Pague Menos Francis',
 'Limpador com Brilho Cerâmica e Porcelanato Qualitá 750ml',
 'Vinho Francês Tinto Seco Excellence Merlot Cabernet Sauvignon Bordeaux Garrafa 750ml',
 'Bebida de Amêndoa a Tal da Castanha 1l',
 'Batata Lisa Sour Cream Lays 135g',
 'Coca-Cola sem Açúcar 310ml',
 'Energético de Melancia Red Bull 250ml',
 'Requeijão Cremoso Danubio Light 220g',
 'Paçoca Rolha Açúcar Mascavo Qualitá 180g',
 'Manteiga e Margarina Leco Extra Cremosa com Sal 200g',
 'Shampoo Hidratação Intensa Dove 400ml',
 'Café Torrado e Mo

In [15]:
for product in products:
    doc = nlp(product)
    spacy.displacy.render(doc, style="ent", options=options, jupyter=True)