Run this notebook only after the NER model has been trained (see `train.ipynb`).

# Load data from many markets

In [3]:
import ast
import os
from sys import stderr

import utils.scrapping

In [4]:
# Directory that holds the scraped, not-yet-tagged product files.
RAW_DATA_DIR = os.path.join("data", "raw")

# makedirs (unlike mkdir) also creates the missing parent "data" directory on a
# fresh checkout, and exist_ok=True keeps this cell safe to re-run.
os.makedirs(RAW_DATA_DIR, exist_ok=True)

In [None]:
# List the markets available for the given city. The second argument ("SP") is
# presumably the state abbreviation — confirm against utils.scrapping.
markets = utils.scrapping.list_markets_in_city("São José dos Campos", "SP")

In [None]:
# Display the number of entries in `markets`.
len(markets)

In [None]:
# Persist the product list of the first N markets as one file per market
# inside RAW_DATA_DIR.
N = 5
for market in markets[:N]:
    print(f"retrieving products of {market.name}...")

    # Fetch from the website first, then read the products back from the local
    # file (the meaning of the `True` flag is defined in utils.scrapping).
    utils.scrapping.load_products_from_website(market, True)
    products = utils.scrapping.load_products_from_local_file(market)

    if products is not None:
        # One output file per market, named after the market's slug.
        fname = os.path.join(RAW_DATA_DIR, market.slug_name + ".json")
        print(f"saving products of {market.name} into {fname}...")
        utils.scrapping.generate_data_file(products, fname)
    else:
        # Nothing could be loaded for this market: report it and move on.
        print("FAILED", file=stderr)
print("DONE")

The raw (i.e., not yet tagged) data is now available in `data/raw/`.

# Tag data

In [5]:
from utils.ner import NERModel
import json
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [6]:
# Directory that receives the tagged product files.
TAGGED_DATA_DIR = os.path.join("data", "tagged_bert")

# makedirs (unlike mkdir) also creates the missing parent "data" directory, and
# exist_ok=True keeps this cell safe to re-run.
os.makedirs(TAGGED_DATA_DIR, exist_ok=True)

In [7]:
# Instantiate the NER model that `tag_market` queries for tags. Presumably this
# loads the model trained in train.ipynb — confirm in utils.ner.
ner = NERModel()

In [8]:
def tag_market(m):
    """Return a record pairing a product with the tags predicted for it.

    Args:
        m: mapping with a 'product' entry; that value is passed to
            ``ner.retrieve_tags``.

    Returns:
        A new dict with exactly two keys: 'product' (copied from ``m``) and
        'tags' (the 'tags' entry of the NER response). Any other keys of
        ``m`` or of the response are dropped.
    """
    product_name = m['product']
    predicted_tags = ner.retrieve_tags(product_name)['tags']
    return dict(product=product_name, tags=predicted_tags)

# Tag every raw product file and write the result, under the same file name,
# into TAGGED_DATA_DIR.
# Only *.json entries are processed so that stray files or directories
# (e.g. .ipynb_checkpoints) do not crash the run; sorting makes the
# processing order deterministic across platforms.
for fname in sorted(os.listdir(RAW_DATA_DIR)):
    if not fname.endswith(".json"):
        continue

    print(f"tagging {fname}...")

    # read raw data:
    # ast.literal_eval only accepts Python literals, so a tampered file cannot
    # execute code (unlike eval).
    # TODO dump as json, load as json
    with open(os.path.join(RAW_DATA_DIR, fname), 'r') as f:
        raw_products = ast.literal_eval(f.read())

    # tag data (tqdm reads the total from len(raw_products)):
    market_products = [tag_market(product) for product in tqdm(raw_products)]

    # save tagged data:
    with open(os.path.join(TAGGED_DATA_DIR, fname), 'w') as f:
        json.dump(market_products, f, ensure_ascii=False)

tagging atacadao-sao-jose-dos-campos-shopping-jd-satelite.json...


  0%|          | 0/9443 [00:00<?, ?it/s]Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
100%|██████████| 9443/9443 [06:09<00:00, 25.53it/s]


tagging carrefour-hiper---sj-campos-serimbura.json...


100%|██████████| 10366/10366 [06:38<00:00, 26.04it/s]


tagging coop---jd-morumbi-jd-morumbi.json...


100%|██████████| 8996/8996 [05:46<00:00, 25.99it/s]


tagging dia-supermercado---aquarius-jardim-alvorada.json...


100%|██████████| 3420/3420 [02:07<00:00, 26.73it/s]


tagging extra-sjc-centro-jd-osvaldo-cruz.json...


100%|██████████| 10356/10356 [06:29<00:00, 26.57it/s]
