In [3]:
import ast
import json
import random
from pathlib import Path

import spacy
from spacy.lang.pt.examples import sentences

In [4]:
# Run configuration.
model = "pt_core_news_lg"    # name handed to spacy.load() to obtain the starting pipeline
output_dir = Path("ner")     # NOTE(review): not referenced by the cells visible here
n_iter = 100                 # number of passes made by the training loop

In [5]:
# Load the pretrained pipeline named in `model`. When no name is configured,
# start from a blank Portuguese pipeline so that `nlp` is always defined for
# the cells below (previously `model = None` ended in a NameError).
if model is not None:
    nlp = spacy.load(model)
    print("Loaded model '%s'" % model)
else:
    nlp = spacy.blank("pt")
    print("Created blank 'pt' model")

# Set up the pipeline: reuse the NER component when the pipeline already has
# one, otherwise append a new one at the end.
if 'ner' not in nlp.pipe_names:
    # spaCy v3 API: add_pipe() takes the registered component name and returns
    # the created component; passing a component object is rejected.
    ner = nlp.add_pipe('ner', last=True)
else:
    ner = nlp.get_pipe('ner')

Loaded model 'pt_core_news_lg'


In [125]:
with open('data/atacadao-sao-jose-dos-campos-shopping-jd-satelite.json', 'r', encoding='utf-8') as file:
    # Read the contents of the file as a string
    json_str = file.read()

# Parse the text strictly as data. eval() would execute any code found in the
# file and cannot read JSON's true/false/null, so use the JSON parser first and
# fall back to a literal-only parser for files written with Python syntax
# (e.g. single-quoted strings).
try:
    TRAIN_DATA = json.loads(json_str)
except json.JSONDecodeError:
    TRAIN_DATA = ast.literal_eval(json_str)

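Check data: every product's tags must start at offset 0, be separated by exactly one character, and end at the end of the product string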

In [126]:
# Sanity-check the annotations of every product:
#   * the first tag must start at character offset 0 (hard failure otherwise);
#   * every following tag must start exactly one character after the previous
#     tag's end offset;
#   * the last tag must end at the length of the product string.
# Violations of the last two rules are printed (product index) instead of
# raised, so that all offending records are reported in one pass.
for idx, product in enumerate(TRAIN_DATA):
    tags = product['tags']
    assert tags[0][1] == 0, f"product {idx}: first tag does not start at offset 0"
    end = tags[0][2]
    for tag in tags[1:]:
        start = tag[1]
        # Plain comparison rather than an assert inside a bare `except:`, which
        # hid unrelated errors and was skipped entirely under `python -O`.
        if start != end + 1:
            print(idx)
        end = tag[2]
    if end != len(product['product']):
        print(end, len(product['product']))
        print(idx)

Parse data: merge each run of I- continuation tags into the tag that precedes it

In [127]:
def merge_continuation_tags(tags):
    """Fold every 'I-' tag into the closest preceding non-'I-' tag.

    Each tag is a mutable sequence laid out as [label, start, end]. The end
    offset of the preceding non-'I-' tag is extended (in place) to the end
    offset of each 'I-' tag that follows it, and the 'I-' tags are dropped.

    Args:
        tags: list of [label, start, end] tags in text order.

    Returns:
        A new list holding only the non-'I-' tags, with extended end offsets.

    Raises:
        ValueError: if an 'I-' tag appears before any other tag. Walking
            backwards from such a tag used to wrap around to the end of the
            list and silently overwrite the last tag's end offset.
    """
    merged = []
    for tag in tags:
        if tag[0].startswith('I-'):
            if not merged:
                raise ValueError(
                    "continuation tag %r has no preceding tag to extend" % (tag,)
                )
            # Single forward pass: the tag to extend is always the last one kept.
            merged[-1][2] = tag[2]
        else:
            merged.append(tag)
    return merged


for elem in TRAIN_DATA:
    elem['tags'] = merge_continuation_tags(elem['tags'])
    

Check new data: repeat the span checks on the merged tags

In [130]:
# Validate the tag spans of every product:
#   * the first tag must start at character offset 0 (hard failure otherwise);
#   * every following tag must start exactly one character after the previous
#     tag's end offset;
#   * the last tag must end at the length of the product string.
# Violations of the last two rules are printed (product index) instead of
# raised, so that all offending records are reported in one pass.
for idx, product in enumerate(TRAIN_DATA):
    tags = product['tags']
    assert tags[0][1] == 0, f"product {idx}: first tag does not start at offset 0"
    end = tags[0][2]
    for tag in tags[1:]:
        start = tag[1]
        # Plain comparison rather than an assert inside a bare `except:`, which
        # hid unrelated errors and was skipped entirely under `python -O`.
        if start != end + 1:
            print(idx)
        end = tag[2]
    if end != len(product['product']):
        print(end, len(product['product']))
        print(idx)

# Train

In [138]:
# Tag vocabulary: "O" followed by one "B-" label per entity type.
# The order matters because the list is later zipped position-by-position
# with the display colours.
# NOTE(review): presumably PRO = product, MAR = brand, ESP = specification,
# TAM = size, QUA = quantity -- confirm against the annotation guidelines.
LABELS = ["O", "B-PRO", "B-MAR", "B-ESP", "B-TAM", "B-QUA"]

In [141]:
# Collect the label list and, for every product, its text together with the
# (start, end, label) tuples derived from its [label, start, end] tags.
training_data = {
    'classes': LABELS,
    'annotations': [
        {
            'text': record['product'],
            'entities': [(tag[1], tag[2], tag[0]) for tag in record['tags']],
        }
        for record in TRAIN_DATA
    ],
}

# Spot-check one converted record.
print(training_data['annotations'][4])

{'text': 'Sobremesa Frutap Doce de Leite 180g', 'entities': [(0, 9, 'B-PRO'), (10, 16, 'B-MAR'), (17, 30, 'B-ESP'), (31, 35, 'B-TAM')]}


In [155]:
# Pair each product string with its annotations in the
# (text, {'entities': [(start, end, label), ...]}) layout that is later
# passed to Example.from_dict.
train_test = [
    (
        record['product'],
        {'entities': [(tag[1], tag[2], tag[0]) for tag in record['tags']]},
    )
    for record in TRAIN_DATA
]

In [160]:
from spacy.training.example import Example

# Register every entity label that occurs in the annotations with the NER component.
for _, annotations in train_test:
    for ent in annotations.get('entities'):
        ner.add_label(ent[2])

# Convert the (text, annotations) pairs into spaCy Example objects once, up front.
examples = [
    Example.from_dict(nlp.make_doc(text), annotations)
    for text, annotations in train_test
]

other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
# select_pipes(disable=...) is the spaCy v3 replacement for the deprecated
# disable_pipes(); the disabled components are restored when the block exits.
with nlp.select_pipes(disable=other_pipes):  # only train NER
    optimizer = nlp.create_optimizer()
    for itn in range(n_iter):
        # Shuffle the list that is actually handed to nlp.update(). The
        # previous code shuffled `train_test`, which is no longer read once
        # `examples` has been built, so the shuffle had no effect on training.
        random.shuffle(examples)
        losses = {}
        nlp.update(
            examples=examples,
            drop=0.5,
            sgd=optimizer,
            losses=losses)
        print(losses)

{'ner': 980.3170143056195}
{'ner': 931.8181735295129}
{'ner': 922.7806548023792}
{'ner': 874.8278806060553}
{'ner': 846.1621073560018}
{'ner': 845.0349526717328}
{'ner': 824.102467879653}
{'ner': 801.1773087168112}
{'ner': 770.9181451685727}
{'ner': 739.9013715684414}
{'ner': 697.2733237072825}
{'ner': 706.1960806027055}
{'ner': 659.507222853601}
{'ner': 663.0485783805489}
{'ner': 648.7035458981991}
{'ner': 653.6652179658413}
{'ner': 628.3182065902947}
{'ner': 642.955263197422}
{'ner': 641.638463973999}
{'ner': 635.415625333786}
{'ner': 630.8640522956848}
{'ner': 624.4884921312332}
{'ner': 612.1227043569088}
{'ner': 602.0574856698513}
{'ner': 600.6464277803898}
{'ner': 598.1929865777493}
{'ner': 588.1275249123573}
{'ner': 585.6000021696091}
{'ner': 581.4426157251}
{'ner': 569.962310841307}
{'ner': 569.9583116956055}
{'ner': 563.904808960855}
{'ner': 564.1957620382309}
{'ner': 562.6455273590982}
{'ner': 556.8549888953567}
{'ner': 580.5258156713098}
{'ner': 574.1384381949902}
{'ner': 572

In [182]:
# One highlight colour per entry of LABELS, matched by position.
colors = ["#FFFF00", "#008000", "#FF0000", "#A020F0", "#f5f5dc", "#808080"]
label_to_color = dict(zip(LABELS, colors))

# Options dictionary handed to displacy.render when drawing the entities.
options = {"colors": label_to_color}

In [183]:
# Open the JSON file for reading
with open('data.json', 'r', encoding='utf-8') as file:
    # Read the contents of the file as a string
    json_str = file.read()

# Parse the text strictly as data. eval() would execute any code found in the
# file and cannot read JSON's true/false/null, so use the JSON parser first and
# fall back to a literal-only parser for files written with Python syntax
# (e.g. single-quoted strings).
try:
    json_data = json.loads(json_str)
except json.JSONDecodeError:
    json_data = ast.literal_eval(json_str)

# Now, json_data contains a list of JSON objects
products = [x['product'] for x in json_data]
products

['Adoçante Líquido Adocyl 200ml',
 'Shampoo Dabelle 250ml',
 'Restaurador Móveis Escuros Peroba 200ml',
 "Shampoo de Glicerina para Bebê Johnson's Baby 400ml",
 'Azeitona Verde Diza com Caroço Vidro 360g',
 'Massas Frescas para Pizza Massaleve Brotinho Pacote 300g',
 'Batata Canoa Sadia Pre Frita Cong 1,05kg',
 'Sabonete de Lavanda Siene 85g',
 'Cacau em Pó Nestlé 200g',
 'Kit Shampo 375ml e Condicionador 170ml Elseve Reparação Total 5 Embalagem 2 Un',
 'Mistura para Bolo Integral União Maçã e Canela Embalagem 400g',
 'Suco Del Valle 100% Maçã 1,5l',
 'Pão Integral 12 Grãos Pullman Pacote 450g',
 'Torcida Pão de Alho 38g',
 'Leite Condensado Itambé Lata 395g',
 'Combo 24 Heineken lata 350ml',
 'Gel Dental Liquifresh Ice Close Up 100g',
 'Geleia Frutas Amarelas Baldoni 270g',
 'Desodorante Aerosol Jasmin e Coco Bem Estar Suave 200ml',
 'Sabonete em Barra Botanicals Flor de Lótus Lux 85g',
 'Sopa Vono Chef Creme de Cebola 58g',
 'Bebida Láctea Elegê Morango e Coco 540g',
 'Barra de Choco

In [184]:
# Run the pipeline over every product name and draw the predicted entities,
# one visualisation per product, using the colour options.
# nlp.pipe() processes the texts in batches and yields the docs in input
# order, which avoids the per-call overhead of nlp(text) inside the loop.
for doc in nlp.pipe(products):
    spacy.displacy.render(doc, style="ent", options=options, jupyter=True)