In [1]:
import spacy

In [1]:
import logging
logger = logging.getLogger()
logging.basicConfig(format='%(asctime)s %(levelname)s:%(message)s', level=logging.DEBUG, datefmt='%I:%M:%S')
logger.debug("test")
logging.info('this is an info message')

11:12:49 DEBUG:test
11:12:49 INFO:this is an info message


In [1]:
import logging
logger = logging.getLogger()

formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')

# Setup file handler
fhandler  = logging.FileHandler('retrain-wd-entity-linking.log', mode='a')
fhandler.setLevel(logging.DEBUG)
fhandler.setFormatter(formatter)

# Configure stream handler for the cells
chandler = logging.StreamHandler()
chandler.setLevel(logging.DEBUG)
chandler.setFormatter(formatter)

# Add both handlers
logger.addHandler(fhandler)
logger.addHandler(chandler)
logger.setLevel(logging.DEBUG)

# Show the handlers
logger.handlers

# Log Something
logger.info("Test info")
logger.debug("Test debug")
logger.error("Test error")

2019-12-02 11:24:24,467 - root - INFO - Test info
2019-12-02 11:24:24,469 - root - DEBUG - Test debug
2019-12-02 11:24:24,470 - root - ERROR - Test error


In [3]:
from pathlib import Path

In [4]:
dir_kb = Path("/data/users/romain.claret/tm/data2/")
KB_FILE = "kb"
KB_MODEL_DIR = "nlp_kb"
OUTPUT_MODEL_DIR = "nlp_custom"

nlp_dir = dir_kb / OUTPUT_MODEL_DIR
kb_path = dir_kb / KB_FILE

In [11]:
import sys
sys.path.insert(1, '/data/users/romain.claret/tm/spaCy/bin/wiki_entity_linking')

In [28]:
epochs=10
dropout=0.5
lr=0.005
l2=1e-6
train_inst=1000000
dev_inst=159000
labels_discard=None

In [None]:
training_path = dir_kb / "pieces/x01.jsonl"

In [3]:
nlp = spacy.load(nlp_dir)

In [16]:
import kb_creator as kbc
kb = kbc.read_kb(nlp, kb_path)

In [5]:
nlp.pipeline

[('tagger', <spacy.pipeline.pipes.Tagger at 0x7fe5774bddd0>),
 ('parser', <spacy.pipeline.pipes.DependencyParser at 0x7fe577387280>),
 ('ner', <spacy.pipeline.pipes.EntityRecognizer at 0x7fe5773872f0>),
 ('entity_linker', <spacy.pipeline.pipes.EntityLinker at 0x7fe576667c90>)]

In [6]:
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "entity_linker"]

In [7]:
nlp.pipeline

[('tagger', <spacy.pipeline.pipes.Tagger at 0x7fe5774bddd0>),
 ('parser', <spacy.pipeline.pipes.DependencyParser at 0x7fe577387280>),
 ('ner', <spacy.pipeline.pipes.EntityRecognizer at 0x7fe5773872f0>),
 ('entity_linker', <spacy.pipeline.pipes.EntityLinker at 0x7fe576667c90>)]

In [34]:
el_pipe = nlp.get_pipe("entity_linker")

In [10]:
with nlp.disable_pipes(*other_pipes):  # only train Entity Linking
    optimizer = nlp.begin_training()
    optimizer.learn_rate = lr
    optimizer.L2 = l2

In [None]:
import wikipedia_processor as wp
train_data = wp.read_training(
        nlp=nlp,
        entity_file_path=training_path,
        dev=False,
        limit=train_inst,
        kb=kb,
        labels_discard=labels_discard
    )

01:44:59 INFO:Reading train data with limit 1000000
 27%|██▋       | 274523/1000000 [1:37:46<3:48:43, 52.86it/s] IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

 82%|████████▏ | 824119/1000000 [4:58:14<1:02:15, 47.08it/s] 

In [29]:
import wikipedia_processor as wp
dev_data = wp.read_training(
        nlp=nlp,
        entity_file_path=training_path,
        dev=True,
        limit=dev_inst,
        kb=None,
        labels_discard=labels_discard
    )

 63%|██████▎   | 99690/159000 [29:11<28:03, 35.24it/s]  IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [46]:
import entity_linker_evaluation as ele
ele.measure_performance(dev_data, kb, el_pipe, baseline=True, context=False)

12:22:06 INFO:Counts: {'CARDINAL': 254, 'DATE': 1693, 'EVENT': 1635, 'FAC': 4534, 'GPE': 38917, 'LANGUAGE': 536, 'LAW': 203, 'LOC': 4080, 'MONEY': 32, 'NORP': 5402, 'ORDINAL': 72, 'ORG': 32536, 'PERCENT': 2, 'PERSON': 61579, 'PRODUCT': 1765, 'QUANTITY': 43, 'TIME': 51, 'WORK_OF_ART': 5674}
12:22:06 INFO:Random: F-score = 0.55 | Recall = 0.507 | Precision = 0.602 | F-score by label = {'CARDINAL': 0.20824295010845983, 'DATE': 0.11950257810130421, 'EVENT': 0.39567128846804195, 'FAC': 0.633625619519634, 'GPE': 0.29359390822888853, 'LANGUAGE': 0.23076923076923075, 'LAW': 0.5714285714285714, 'LOC': 0.5399048625792812, 'MONEY': 0.5384615384615384, 'NORP': 0.277179050860276, 'ORDINAL': 0.16901408450704225, 'ORG': 0.5946940389608157, 'PERCENT': 0.0, 'PERSON': 0.7387328955309931, 'PRODUCT': 0.6228695382708398, 'QUANTITY': 0.5599999999999999, 'TIME': 0.46153846153846156, 'WORK_OF_ART': 0.6014630724521495}
12:22:06 INFO:Prior: F-score = 0.796 | Recall = 0.734 | Precision = 0.871 | F-score by label

In [47]:
ele.measure_performance(dev_data, kb, el_pipe)

12:23:36 INFO:Counts: {'CARDINAL': 254, 'DATE': 1693, 'EVENT': 1635, 'FAC': 4534, 'GPE': 38917, 'LANGUAGE': 536, 'LAW': 203, 'LOC': 4080, 'MONEY': 32, 'NORP': 5402, 'ORDINAL': 72, 'ORG': 32536, 'PERCENT': 2, 'PERSON': 61579, 'PRODUCT': 1765, 'QUANTITY': 43, 'TIME': 51, 'WORK_OF_ART': 5674}
12:23:36 INFO:Random: F-score = 0.55 | Recall = 0.506 | Precision = 0.601 | F-score by label = {'CARDINAL': 0.20390455531453364, 'DATE': 0.11950257810130421, 'EVENT': 0.40108217788298955, 'FAC': 0.6249841148811793, 'GPE': 0.29601378276365453, 'LANGUAGE': 0.2270168855534709, 'LAW': 0.6064139941690962, 'LOC': 0.5354122621564483, 'MONEY': 0.42307692307692313, 'NORP': 0.2826621289468708, 'ORDINAL': 0.14084507042253522, 'ORG': 0.5931809975756951, 'PERCENT': 0.0, 'PERSON': 0.73676909079444, 'PRODUCT': 0.6241090796405329, 'QUANTITY': 0.5866666666666667, 'TIME': 0.5054945054945055, 'WORK_OF_ART': 0.6010622306844373}
12:23:36 INFO:Prior: F-score = 0.796 | Recall = 0.734 | Precision = 0.871 | F-score by label 

In [None]:
for itn in range(epochs):
    random.shuffle(train_data)
    losses = {}
    batches = minibatch(train_data, size=compounding(4.0, 128.0, 1.001))
    batchnr = 0

    with nlp.disable_pipes(*other_pipes):
        for batch in batches:
            try:
                docs, golds = zip(*batch)
                nlp.update(
                    docs=docs,
                    golds=golds,
                    sgd=optimizer,
                    drop=dropout,
                    losses=losses,
                )
                batchnr += 1
            except Exception as e:
                logger.error("Error updating batch:" + str(e))
    if batchnr > 0:
        logging.info("Epoch {}, train loss {}".format(itn, round(losses["entity_linker"] / batchnr, 2)))
        ele.measure_performance(dev_data, kb, el_pipe, baseline=False, context=True)

In [50]:
ele.measure_performance(dev_data, kb, el_pipe)

02:22:58 INFO:Counts: {'CARDINAL': 254, 'DATE': 1693, 'EVENT': 1635, 'FAC': 4534, 'GPE': 38917, 'LANGUAGE': 536, 'LAW': 203, 'LOC': 4080, 'MONEY': 32, 'NORP': 5402, 'ORDINAL': 72, 'ORG': 32536, 'PERCENT': 2, 'PERSON': 61579, 'PRODUCT': 1765, 'QUANTITY': 43, 'TIME': 51, 'WORK_OF_ART': 5674}
02:22:58 INFO:Random: F-score = 0.55 | Recall = 0.507 | Precision = 0.602 | F-score by label = {'CARDINAL': 0.19956616052060738, 'DATE': 0.12132241431604489, 'EVENT': 0.3963476496449104, 'FAC': 0.6229508196721311, 'GPE': 0.2945145126714625, 'LANGUAGE': 0.20075046904315197, 'LAW': 0.553935860058309, 'LOC': 0.5422832980972515, 'MONEY': 0.4615384615384615, 'NORP': 0.28379655889582156, 'ORDINAL': 0.18309859154929575, 'ORG': 0.5938687436598408, 'PERCENT': 0.0, 'PERSON': 0.7388590114315055, 'PRODUCT': 0.6253486210102263, 'QUANTITY': 0.5066666666666667, 'TIME': 0.5714285714285715, 'WORK_OF_ART': 0.6028660186391422}
02:22:58 INFO:Prior: F-score = 0.796 | Recall = 0.734 | Precision = 0.871 | F-score by label 

In [52]:
def run_el_toy_example(nlp):
    text = (
        "In The Hitchhiker's Guide to the Galaxy, written by Douglas Adams, "
        "Douglas reminds us to always bring our towel, even in China or Brazil. "
        "The main character in Doug's novel is the man Arthur Dent, "
        "but Dougledydoug doesn't write about George Washington or Homer Simpson."
    )
    doc = nlp(text)
    logger.info(text)
    for ent in doc.ents:
        logger.info(" ".join(["ent", ent.text, ent.label_, ent.kb_id_]))
run_el_toy_example(nlp=nlp)

02:42:39 INFO:In The Hitchhiker's Guide to the Galaxy, written by Douglas Adams, Douglas reminds us to always bring our towel, even in China or Brazil. The main character in Doug's novel is the man Arthur Dent, but Dougledydoug doesn't write about George Washington or Homer Simpson.
02:42:39 INFO:ent Douglas Adams PERSON Q42
02:42:39 INFO:ent Douglas PERSON Q112220
02:42:39 INFO:ent China GPE Q148
02:42:39 INFO:ent Brazil GPE Q155
02:42:39 INFO:ent Doug PERSON Q1251705
02:42:39 INFO:ent Arthur Dent PERSON Q613901
02:42:39 INFO:ent Dougledydoug ORG NIL
02:42:39 INFO:ent George Washington PERSON Q23
02:42:39 INFO:ent Homer Simpson PERSON Q7810


In [53]:
nlp_output_dir = dir_kb / "nlp_custom"

In [54]:
nlp.to_disk(nlp_output_dir)

In [4]:
import spacy
from pathlib import Path
import sys
sys.path.insert(1, '/data/users/romain.claret/tm/spaCy/bin/wiki_entity_linking')
import kb_creator as kbc
import wikipedia_processor as wp
import entity_linker_evaluation as ele

dir_kb = Path("/data/users/romain.claret/tm/data2/")
KB_FILE = "kb"
KB_MODEL_DIR = "nlp_kb"
OUTPUT_MODEL_DIR = "nlp_custom_1"
nlp_dir = dir_kb / OUTPUT_MODEL_DIR
kb_path = dir_kb / KB_FILE

paths = ["pieces/x00.jsonl",
        "pieces/x01.jsonl",
        "pieces/x02.jsonl",
        "pieces/x03.jsonl",
        "pieces/x04.jsonl",
        "pieces/x05.jsonl",
        "pieces/x06.jsonl"]

epochs=10
dropout=0.5
lr=0.005
l2=1e-6
train_inst=1000000
dev_inst=159000
labels_discard=None

logger.info("PRE 0: loading nlp model: "+str(nlp_dir))
nlp = spacy.load(nlp_dir)
logger.info("PRE 1: loading kb model: "+str(kb_path))
kb = kbc.read_kb(nlp, kb_path)
logger.info("PRE 2: setup entity linker")
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "entity_linker"]
el_pipe = nlp.get_pipe("entity_linker")

with nlp.disable_pipes(*other_pipes):  # only train Entity Linking
    optimizer = nlp.begin_training()
    optimizer.learn_rate = lr
    optimizer.L2 = l2

start_from = 2

#for i, path in enumerate(paths):
#    if i >= start_from:
training_path = dir_kb / paths[start_from]
logger.info("STEP 0: starting with: "+str(training_path))

logger.info("STEP 1: loading training data")
train_data = wp.read_training(
        nlp=nlp,
        entity_file_path=training_path,
        dev=False,
        limit=train_inst,
        kb=kb,
        labels_discard=labels_discard
    )

logger.info("STEP 2: loading dev data")
dev_data = wp.read_training(
    nlp=nlp,
    entity_file_path=training_path,
    dev=True,
    limit=dev_inst,
    kb=None,
    labels_discard=labels_discard
)

logger.info("STEP 3: evaluating the baseline")
ele.measure_performance(dev_data, kb, el_pipe, baseline=True, context=False)

logger.info("STEP 4: starting training")
for itn in range(epochs):
    random.shuffle(train_data)
    losses = {}
    batches = minibatch(train_data, size=compounding(4.0, 128.0, 1.001))
    batchnr = 0

    with nlp.disable_pipes(*other_pipes):
        for batch in batches:
            try:
                docs, golds = zip(*batch)
                nlp.update(
                    docs=docs,
                    golds=golds,
                    sgd=optimizer,
                    drop=dropout,
                    losses=losses,
                )
                batchnr += 1
            except Exception as e:
                logger.error("Error updating batch:" + str(e))
    if batchnr > 0:
        logging.info("Epoch {}, train loss {}".format(itn, round(losses["entity_linker"] / batchnr, 2)))
        ele.measure_performance(dev_data, kb, el_pipe, baseline=False, context=True)

logger.info("STEP 5: evaluating training")
ele.measure_performance(dev_data, kb, el_pipe)

logger.info("STEP 6: evaluating with example")
run_el_toy_example(nlp)

logger.info("STEP 7: save current state of nlp model")
nlp_output_dir = dir_kb / str("nlp_custom_"+str(i))
nlp.to_disk(nlp_output_dir)
    
logger.info("Done!")

2019-12-02 18:08:24,740 - root - INFO - PRE 0: loading nlp model: /data/users/romain.claret/tm/data2/nlp_custom_1


KeyboardInterrupt: 

In [4]:


for i, path in enumerate(paths):
    start_from = 2
    if i >= start_from:
        nlp_output_dir = dir_kb / str("nlp_custom_"+str(i))
        print(nlp_output_dir)
    #continue_link_entities(path)

/data/users/romain.claret/tm/data2/nlp_custom_2
/data/users/romain.claret/tm/data2/nlp_custom_3
/data/users/romain.claret/tm/data2/nlp_custom_4
/data/users/romain.claret/tm/data2/nlp_custom_5
/data/users/romain.claret/tm/data2/nlp_custom_6


In [6]:
paths[2]

'pieces/x02.jsonl'

In [23]:
logger.info("STEP 4: Final performance measurement of Entity Linking pipe"+training_path)

03:20:21 INFO:STEP 4: Final performance measurement of Entity Linking pipelol


In [2]:
logger.info("STEP 0: starting with: "+str(training_path))

12:11:02 INFO:STEP 0: starting with: /data/users/romain.claret/tm/data2/pieces/x02.jsonl


In [3]:
print("test")

test


In [8]:
del nlp
del kb

NameError: name 'nlp' is not defined