In [2]:
import logging
logger = logging.getLogger()

formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')

# Setup file handler
fhandler  = logging.FileHandler('retrain-wd-entity-linking.log', mode='a')
fhandler.setLevel(logging.DEBUG)
fhandler.setFormatter(formatter)

# Configure stream handler for the cells
chandler = logging.StreamHandler()
chandler.setLevel(logging.DEBUG)
chandler.setFormatter(formatter)

# Add both handlers
logger.addHandler(fhandler)
logger.addHandler(chandler)
logger.setLevel(logging.DEBUG)

# Show the handlers
logger.handlers

# Log Something
logger.info("Test info")
logger.debug("Test debug")
logger.error("Test error")

2019-12-05 14:46:12,951 - root - INFO - Test info
2019-12-05 14:46:12,954 - root - DEBUG - Test debug
2019-12-05 14:46:12,956 - root - ERROR - Test error


In [3]:
def run_el_toy_example(nlp):
    text = (
        "In The Hitchhiker's Guide to the Galaxy, written by Douglas Adams, "
        "Douglas reminds us to always bring our towel, even in China or Brazil. "
        "The main character in Doug's novel is the man Arthur Dent, "
        "but Dougledydoug doesn't write about George Washington or Homer Simpson."
    )
    doc = nlp(text)
    logger.info(text)
    for ent in doc.ents:
        logger.info(" ".join(["ent", ent.text, ent.label_, ent.kb_id_]))

In [None]:
import spacy
from spacy.util import minibatch, compounding
from pathlib import Path
import sys
import random
sys.path.insert(1, '/data/users/romain.claret/tm/spaCy/bin/wiki_entity_linking')
import kb_creator as kbc
import wikipedia_processor as wp
import entity_linker_evaluation as ele

dir_kb = Path("/data/users/romain.claret/tm/data2/")
KB_FILE = "kb"
KB_MODEL_DIR = "nlp_kb"
OUTPUT_MODEL_DIR = "nlp_custom_3"
nlp_dir = dir_kb / OUTPUT_MODEL_DIR
kb_path = dir_kb / KB_FILE

paths = ["pieces/x00.jsonl",
        "pieces/x01.jsonl",
        "pieces/x02.jsonl",
        "pieces/x03.jsonl",
        "pieces/x04.jsonl",
        "pieces/x05.jsonl",
        "pieces/x06.jsonl"]

epochs=10
dropout=0.5
lr=0.005
l2=1e-6
train_inst=1000000
dev_inst=159000
labels_discard=None

logger.info("PRE 0: loading nlp model: "+str(nlp_dir))
nlp = spacy.load(nlp_dir)
logger.info("PRE 1: loading kb model: "+str(kb_path))
kb = kbc.read_kb(nlp, kb_path)
logger.info("PRE 2: setup entity linker")
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "entity_linker"]
el_pipe = nlp.get_pipe("entity_linker")

with nlp.disable_pipes(*other_pipes):  # only train Entity Linking
    optimizer = nlp.begin_training()
    optimizer.learn_rate = lr
    optimizer.L2 = l2

start_from = 4

for i, path in enumerate(paths):
    if i >= start_from:
        training_path = dir_kb / paths[i]
        logger.info("STEP 0: starting with: "+str(training_path))

        logger.info("STEP 1: loading training data")
        train_data = wp.read_training(
                nlp=nlp,
                entity_file_path=training_path,
                dev=False,
                limit=train_inst,
                kb=kb,
                labels_discard=labels_discard
            )

        logger.info("STEP 2: loading dev data")
        dev_data = wp.read_training(
            nlp=nlp,
            entity_file_path=training_path,
            dev=True,
            limit=dev_inst,
            kb=None,
            labels_discard=labels_discard
        )

        logger.info("STEP 3: evaluating the baseline")
        ele.measure_performance(dev_data, kb, el_pipe, baseline=True, context=False)

        logger.info("STEP 4: starting training")
        for itn in range(epochs):
            random.shuffle(train_data)
            losses = {}
            batches = minibatch(train_data, size=compounding(4.0, 128.0, 1.001))
            batchnr = 0

            with nlp.disable_pipes(*other_pipes):
                for batch in batches:
                    try:
                        docs, golds = zip(*batch)
                        nlp.update(
                            docs=docs,
                            golds=golds,
                            sgd=optimizer,
                            drop=dropout,
                            losses=losses,
                        )
                        batchnr += 1
                    except Exception as e:
                        logger.error("Error updating batch:" + str(e))
            if batchnr > 0:
                logging.info("Epoch {}, train loss {}".format(itn, round(losses["entity_linker"] / batchnr, 2)))
                ele.measure_performance(dev_data, kb, el_pipe, baseline=False, context=True)

        logger.info("STEP 5: evaluating training")
        ele.measure_performance(dev_data, kb, el_pipe)

        logger.info("STEP 6: evaluating with example")
        run_el_toy_example(nlp)

        logger.info("STEP 7: save current state of nlp model")
        nlp_output_dir = dir_kb / str("nlp_custom_"+str(i))
        nlp.to_disk(nlp_output_dir)
        
        del train_data
        del dev_data

logger.info("Done!")

2019-12-05 14:46:14,340 - root - INFO - PRE 0: loading nlp model: /data/users/romain.claret/tm/data2/nlp_custom_3
2019-12-05 14:47:00,499 - root - INFO - PRE 1: loading kb model: /data/users/romain.claret/tm/data2/kb
2019-12-05 14:47:04,524 - root - INFO - PRE 2: setup entity linker
2019-12-05 14:47:06,029 - root - INFO - STEP 0: starting with: /data/users/romain.claret/tm/data2/pieces/x04.jsonl
2019-12-05 14:47:06,030 - root - INFO - STEP 1: loading training data
2019-12-05 14:47:06,030 - wikipedia_processor - INFO - Reading train data with limit 1000000
 20%|█▉        | 195625/1000000 [1:16:10<7:26:10, 30.05it/s] 

In [None]:
#import random
#from spacy.util import minibatch, compounding
#
#logger.info("STEP 4: starting training")
#for itn in range(epochs):
#    random.shuffle(train_data)
#    losses = {}
#    batches = minibatch(train_data, size=compounding(4.0, 128.0, 1.001))
#    batchnr = 0
#
#    with nlp.disable_pipes(*other_pipes):
#        for batch in batches:
#            try:
#                docs, golds = zip(*batch)
#                nlp.update(
#                    docs=docs,
#                    golds=golds,
#                    sgd=optimizer,
#                    drop=dropout,
#                    losses=losses,
#                )
#                batchnr += 1
#            except Exception as e:
#                logger.error("Error updating batch:" + str(e))
#    if batchnr > 0:
#        logging.info("Epoch {}, train loss {}".format(itn, round(losses["entity_linker"] / batchnr, 2)))
#        ele.measure_performance(dev_data, kb, el_pipe, baseline=False, context=True)
#
#logger.info("STEP 5: evaluating training")
#ele.measure_performance(dev_data, kb, el_pipe)
#
#logger.info("STEP 6: evaluating with example")
#run_el_toy_example(nlp)
#
#logger.info("STEP 7: save current state of nlp model")
#nlp_output_dir = dir_kb / str("nlp_custom_"+str(i))
#nlp.to_disk(nlp_output_dir)
#
#logger.info("Done!")

In [5]:
logger.info("Done!")

2019-12-09 13:28:34,200 - root - INFO - Done!


In [7]:
#del train_data
del 

NameError: name 'dev_data' is not defined