In [1]:
import logging
# Root logger (no name given), so the configuration below applies to every
# record that reaches the root.
logger = logging.getLogger()

formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')

# Handlers created by this cell carry these names so a re-run can find them.
FILE_HANDLER_NAME = "retrain-el-file"
STREAM_HANDLER_NAME = "retrain-el-stream"

# Drop the handlers installed by an earlier execution of this cell. Without
# this, each re-run would attach one more file/stream pair to the root logger
# and every message would be printed and written once per execution.
# Handlers installed by anything else are left alone.
for stale_handler in [
    h for h in logger.handlers
    if h.get_name() in (FILE_HANDLER_NAME, STREAM_HANDLER_NAME)
]:
    logger.removeHandler(stale_handler)
    stale_handler.close()  # also closes the previously opened log file

# Setup file handler (mode='a' appends to the existing log file)
fhandler = logging.FileHandler('retrain-wd-entity-linking.log', mode='a')
fhandler.set_name(FILE_HANDLER_NAME)
fhandler.setLevel(logging.DEBUG)
fhandler.setFormatter(formatter)

# Configure stream handler for the cells
chandler = logging.StreamHandler()
chandler.set_name(STREAM_HANDLER_NAME)
chandler.setLevel(logging.DEBUG)
chandler.setFormatter(formatter)

# Add both handlers
logger.addHandler(fhandler)
logger.addHandler(chandler)
logger.setLevel(logging.DEBUG)

# Bare expression kept for interactive inspection. It is not the last
# statement of the cell, so the notebook does not display its value.
logger.handlers

# Log Something
logger.info("Test info")
logger.debug("Test debug")
logger.error("Test error")

2019-12-02 19:51:01,529 - root - INFO - Test info
2019-12-02 19:51:01,531 - root - DEBUG - Test debug
2019-12-02 19:51:01,532 - root - ERROR - Test error


In [2]:
def run_el_toy_example(nlp):
    """Run ``nlp`` on a fixed sample passage and log what it recognised.

    The passage is processed, then logged at INFO level, followed by one INFO
    line per item of ``doc.ents`` in the form ``ent <text> <label_> <kb_id_>``.

    Args:
        nlp: Callable applied to the passage. Its result must expose ``ents``,
            an iterable of objects with string attributes ``text``,
            ``label_`` and ``kb_id_``.

    Returns:
        None. The function only logs through the module-level ``logger``.
    """
    sentences = (
        "In The Hitchhiker's Guide to the Galaxy, written by Douglas Adams, ",
        "Douglas reminds us to always bring our towel, even in China or Brazil. ",
        "The main character in Doug's novel is the man Arthur Dent, ",
        "but Dougledydoug doesn't write about George Washington or Homer Simpson.",
    )
    sample = "".join(sentences)

    parsed = nlp(sample)
    logger.info(sample)

    for entity in parsed.ents:
        fields = ("ent", entity.text, entity.label_, entity.kb_id_)
        logger.info(" ".join(fields))

In [3]:
import spacy
from spacy.util import minibatch, compounding
from pathlib import Path
import sys
import random
sys.path.insert(1, '/data/users/romain.claret/tm/spaCy/bin/wiki_entity_linking')
import kb_creator as kbc
import wikipedia_processor as wp
import entity_linker_evaluation as ele

# --- Locations ---------------------------------------------------------------
# Everything below is resolved relative to this data directory.
dir_kb = Path("/data/users/romain.claret/tm/data2/")

KB_FILE = "kb"                      # knowledge base file name
KB_MODEL_DIR = "nlp_kb"             # not referenced by the cells shown here
OUTPUT_MODEL_DIR = "nlp_custom_1"   # model directory loaded as the starting point

kb_path = dir_kb / KB_FILE
nlp_dir = dir_kb / OUTPUT_MODEL_DIR

# Training pieces x00 .. x06, relative to dir_kb.
paths = ["pieces/x{:02d}.jsonl".format(piece) for piece in range(7)]

# --- Hyperparameters ---------------------------------------------------------
epochs = 10                 # training passes over each piece
dropout = 0.5               # passed as `drop` to nlp.update
lr = 5e-3                   # assigned to optimizer.learn_rate
l2 = 1e-6                   # assigned to optimizer.L2
train_inst = 1000 * 1000    # `limit` when reading training data
dev_inst = 159 * 1000       # `limit` when reading dev data
labels_discard = None       # forwarded unchanged to wp.read_training

# Load the saved pipeline that training continues from.
logger.info("PRE 0: loading nlp model: {}".format(nlp_dir))
nlp = spacy.load(nlp_dir)

# Read the knowledge base with the helper from kb_creator.
logger.info("PRE 1: loading kb model: {}".format(kb_path))
kb = kbc.read_kb(nlp, kb_path)

logger.info("PRE 2: setup entity linker")
el_pipe = nlp.get_pipe("entity_linker")
# Every component except the entity linker; these are disabled while training.
other_pipes = [name for name in nlp.pipe_names if name != "entity_linker"]

# Create the optimizer with only the entity linker enabled.
with nlp.disable_pipes(*other_pipes):
    optimizer = nlp.begin_training()

# Plain attribute assignments, so they do not need the disabled-pipes context.
optimizer.learn_rate = lr
optimizer.L2 = l2

# Index into `paths` of the first piece to train on; earlier pieces are skipped.
start_from = 2

# For each remaining piece: load train/dev data, measure the baseline, train
# the entity linker for `epochs` passes, evaluate, and save the model under
# "nlp_custom_<i>".
for i, path in enumerate(paths):
    if i < start_from:
        continue

    # Use the piece of the *current* iteration. Indexing with `start_from`
    # here would retrain on the same file in every iteration while still
    # saving the result as nlp_custom_<i>.
    training_path = dir_kb / path
    logger.info("STEP 0: starting with: " + str(training_path))

    logger.info("STEP 1: loading training data")
    train_data = wp.read_training(
        nlp=nlp,
        entity_file_path=training_path,
        dev=False,
        limit=train_inst,
        kb=kb,
        labels_discard=labels_discard,
    )

    # Dev data is read from the same file with dev=True and without a KB.
    logger.info("STEP 2: loading dev data")
    dev_data = wp.read_training(
        nlp=nlp,
        entity_file_path=training_path,
        dev=True,
        limit=dev_inst,
        kb=None,
        labels_discard=labels_discard,
    )

    logger.info("STEP 3: evaluating the baseline")
    ele.measure_performance(dev_data, kb, el_pipe, baseline=True, context=False)

    logger.info("STEP 4: starting training")
    for itn in range(epochs):
        random.shuffle(train_data)
        losses = {}
        batches = minibatch(train_data, size=compounding(4.0, 128.0, 1.001))
        batchnr = 0  # counts successfully applied batches only

        with nlp.disable_pipes(*other_pipes):  # only train Entity Linking
            for batch in batches:
                try:
                    docs, golds = zip(*batch)
                    nlp.update(
                        docs=docs,
                        golds=golds,
                        sgd=optimizer,
                        drop=dropout,
                        losses=losses,
                    )
                    batchnr += 1
                except Exception as e:
                    # Best effort: a failing batch is reported and skipped so
                    # that a long run is not aborted by one bad batch.
                    logger.error("Error updating batch:" + str(e))

        if batchnr > 0:
            # Use the configured `logger` object, like every other message.
            logger.info("Epoch {}, train loss {}".format(itn, round(losses["entity_linker"] / batchnr, 2)))
            ele.measure_performance(dev_data, kb, el_pipe, baseline=False, context=True)
        else:
            # Make an epoch in which nothing was learned visible in the log.
            logger.warning("Epoch {}: no batch was updated successfully".format(itn))

    logger.info("STEP 5: evaluating training")
    ele.measure_performance(dev_data, kb, el_pipe)

    logger.info("STEP 6: evaluating with example")
    run_el_toy_example(nlp)

    logger.info("STEP 7: save current state of nlp model")
    nlp_output_dir = dir_kb / "nlp_custom_{}".format(i)
    nlp.to_disk(nlp_output_dir)

logger.info("Done!")

2019-12-02 19:51:01,893 - root - INFO - PRE 0: loading nlp model: /data/users/romain.claret/tm/data2/nlp_custom_1
2019-12-02 19:51:31,963 - root - INFO - PRE 1: loading kb model: /data/users/romain.claret/tm/data2/kb
2019-12-02 19:51:35,985 - root - INFO - PRE 2: setup entity linker
2019-12-02 19:51:36,936 - root - INFO - STEP 0: starting with: /data/users/romain.claret/tm/data2/pieces/x02.jsonl
2019-12-02 19:51:36,937 - root - INFO - STEP 1: loading training data
2019-12-02 19:51:36,938 - wikipedia_processor - INFO - Reading train data with limit 1000000
2019-12-03 02:20:42,017 - wikipedia_processor - INFO - Read 1000000 entities in 153807 articles
2019-12-03 02:20:42,017 - root - INFO - STEP 2: loading dev data
2019-12-03 02:20:42,018 - wikipedia_processor - INFO - Reading dev data with limit 159000
2019-12-03 03:07:44,415 - wikipedia_processor - INFO - Read 159005 entities in 20221 articles
2019-12-03 03:07:44,416 - root - INFO - STEP 3: evaluating the baseline
2019-12-03 03:07:48,8

NameError: name 'random' is not defined

In [None]:
import random
from spacy.util import minibatch, compounding

# Recovery cell: repeats steps 4-7 for data that is already loaded.
# NOTE(review): this cell defines none of its inputs. It relies on kernel state
# left by the previous cell: nlp, kb, el_pipe, other_pipes, optimizer,
# train_data, dev_data, epochs, dropout, dir_kb and the loop index `i`, which
# decides the output directory name. It will not run on a fresh kernel unless
# that cell has been executed first - confirm `i` before running.

logger.info("STEP 4: starting training")
for itn in range(epochs):
    random.shuffle(train_data)
    losses = {}
    batches = minibatch(train_data, size=compounding(4.0, 128.0, 1.001))
    batchnr = 0  # counts successfully applied batches only

    with nlp.disable_pipes(*other_pipes):
        for batch in batches:
            try:
                docs, golds = zip(*batch)
                nlp.update(
                    docs=docs,
                    golds=golds,
                    sgd=optimizer,
                    drop=dropout,
                    losses=losses,
                )
                batchnr += 1
            except Exception as e:
                # Best effort: a failing batch is reported and skipped.
                logger.error("Error updating batch:" + str(e))

    if batchnr > 0:
        # Use the configured `logger` object, like every other message.
        logger.info("Epoch {}, train loss {}".format(itn, round(losses["entity_linker"] / batchnr, 2)))
        ele.measure_performance(dev_data, kb, el_pipe, baseline=False, context=True)
    else:
        # Make an epoch in which nothing was learned visible in the log.
        logger.warning("Epoch {}: no batch was updated successfully".format(itn))

logger.info("STEP 5: evaluating training")
ele.measure_performance(dev_data, kb, el_pipe)

logger.info("STEP 6: evaluating with example")
run_el_toy_example(nlp)

logger.info("STEP 7: save current state of nlp model")
nlp_output_dir = dir_kb / "nlp_custom_{}".format(i)
nlp.to_disk(nlp_output_dir)

logger.info("Done!")

2019-12-03 11:11:53,880 - root - INFO - STEP 4: starting training
2019-12-03 11:51:00,289 - root - INFO - Epoch 0, train loss 0.18
2019-12-03 12:22:09,244 - entity_linker_evaluation - INFO - Context Only: F-score = 0.688 | Recall = 0.63 | Precision = 0.758 | F-score by label = {'CARDINAL': 0.22269807280513917, 'DATE': 0.10487353485502778, 'EVENT': 0.7352587244283996, 'FAC': 0.6566894594245904, 'GPE': 0.6790285219588665, 'LANGUAGE': 0.5528846153846154, 'LAW': 0.7163120567375886, 'LOC': 0.6448343079922028, 'MONEY': 0.48484848484848486, 'NORP': 0.3727645305514158, 'ORDINAL': 0.08421052631578947, 'ORG': 0.6747019471540159, 'PERCENT': 0.0, 'PERSON': 0.7646110702207148, 'PRODUCT': 0.6458247634718223, 'QUANTITY': 0.5569620253164558, 'TIME': 0.6823529411764706, 'WORK_OF_ART': 0.6146148782093482}
2019-12-03 12:54:18,952 - entity_linker_evaluation - INFO - Context And Prior: F-score = 0.772 | Recall = 0.706 | Precision = 0.851 | F-score by label = {'CARDINAL': 0.2569593147751606, 'DATE': 0.12831