In [1]:
import pickle
import random
import spacy
import pandas as pd
from spacy.util import minibatch, compounding
from spacy.kb import KnowledgeBase

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
with open('mapped_data.pkl', 'rb') as handle:
    mapped_data = pickle.load(handle)

In [3]:
data_filtered = pd.read_csv("filtered_data.csv", index_col=0)

In [4]:
# with open('df_filtered.pickle', 'rb') as handle:
#     data_filtered = pickle.load(handle)

In [5]:
items = pd.read_csv("./data/archive/item.csv")

In [6]:
items_aliases = pd.read_csv("./data/archive/item_aliases.csv")

In [7]:
items.head(5)

Unnamed: 0,item_id,en_label,en_description
0,1,Universe,totality of space and all contents
1,2,Earth,third planet from the Sun in the Solar System
2,3,life,matter capable of extracting energy from the e...
3,4,death,permanent cessation of vital functions
4,5,human,"common name of Homo sapiens, unique extant spe..."


In [8]:
items_aliases.head(5)

Unnamed: 0,item_id,en_alias
0,1,Our Universe
1,1,The Universe
2,1,The Cosmos
3,1,cosmos
4,2,Blue Planet


In [9]:
items_aliases[items_aliases["item_id"]==188285]

Unnamed: 0,item_id,en_alias


In [10]:
set_indexes = set(data_filtered.index.values)

In [11]:
set_indexes.update(set(data_filtered["Work_of_art"].values))

In [12]:
len(set_indexes)

171723

In [13]:
filtered_items = items[items["item_id"].isin(set_indexes)]

In [14]:
filtered_aliases = items_aliases[items_aliases["item_id"].isin(set_indexes)]

In [15]:
filtered_items = filtered_items.dropna()

In [16]:
filtered_items[filtered_items["en_label"] == "wayside cross"]

Unnamed: 0,item_id,en_label,en_description
1904910,2309609,wayside cross,"cross by a footpath, track or road"
45023997,66103239,wayside cross,"monument in Bechyňská Smoleč, Czechia"
45024026,66103270,wayside cross,iron cross between Bechyňská Smoleč and Černýš...
45024112,66103366,wayside cross,"monument in the village of Černýšovice, Czechia"
45024498,66103780,wayside cross,iron monument remembering 2 drowned people nea...


In [17]:
dupa = filtered_items.reset_index(drop=True).set_index(["en_label"]).sort_index()

In [18]:
dupa.loc["wayside cross"]

Unnamed: 0_level_0,item_id,en_description
en_label,Unnamed: 1_level_1,Unnamed: 2_level_1
wayside cross,66103270,iron cross between Bechyňská Smoleč and Černýš...
wayside cross,2309609,"cross by a footpath, track or road"
wayside cross,66103366,"monument in the village of Černýšovice, Czechia"
wayside cross,66103780,iron monument remembering 2 drowned people nea...
wayside cross,66103239,"monument in Bechyňská Smoleč, Czechia"


In [19]:
en_label = pd.unique(filtered_items.en_label)
for ind, label in enumerate(en_label):
    #filtered_items[filtered_items["en_label"] == label]
    print(f"\r{ind:6} / 79908",end="")
    dupa.loc[label]

 79907 / 79908

In [20]:
data_filtered

Unnamed: 0_level_0,Work_of_art
qid,Unnamed: 1_level_1
149,63412991
844,24856
904,21198342
1351,1344
7216,11424
...,...
67102008,24862
67102980,371752
67103629,11424
67103743,11424


In [21]:

#kb = KnowledgeBase(vocab=nlp.vocab, entity_vector_length=300)

In [22]:
#kb.dump("my_kb")

In [23]:
nlp = spacy.load("en_core_web_lg")
nlp_vectors = spacy.load("en_core_web_lg")

In [24]:
def create_kb(vocab):
    kb = KnowledgeBase(vocab=vocab, entity_vector_length=300)
    en_label = pd.unique(filtered_items.en_label)
    for ind, label in enumerate(en_label):
        print(f"\r{ind:6} / 79908",end="")
        res = dupa.loc[label]
        name = label
        if type(res) == pd.Series:
            qid = [res.values[0]]
            desc = [res.values[1]]
        else:
            qid = res.values[:,0]
            desc = res.values[:,1]

        for ind, qid_ in enumerate(qid):
            desc_enc = nlp_vectors(desc[ind]).vector
            kb.add_entity(entity=str(qid_), entity_vector=desc_enc, freq=342)

        probs = [1/len(qid)] * len(qid) if len(qid) > 1 else [1]
        try:
            kb.add_alias(alias=name, entities=list(map(str, qid)), probabilities=probs)
        except Exception as e:
            print(name)
            print(list(qid))
            print(probs)
            raise e
    return kb

In [25]:
config={"incl_prior":False}
entity_linker = nlp.add_pipe("entity_linker", config=config)
entity_linker.set_kb(create_kb)

 79907 / 79908

In [37]:
qids = list(set_indexes)

In [38]:
gold_ids = []
for text, annot in mapped_data:
    for span, links_dict in annot["links"].items():
        for link, value in links_dict.items():
            if value:
                gold_ids.append(link)

from collections import Counter
#print(Counter(gold_ids))

In [39]:
len(mapped_data)

19092

In [40]:
random.shuffle(mapped_data)

train_data = mapped_data
#test_data = mapped_data[18092:]

In [41]:
from spacy.training import Example

In [42]:
TRAIN_DOCS = []
if "sentencizer" not in nlp.pipe_names:
    nlp.add_pipe("sentencizer")
sentencizer = nlp.get_pipe("sentencizer")
for text, annotation in train_data:
    for span, code in annotation["links"].items():
        str_id = str(list(code.keys())[0])
        new_ann = {"entities": [(*span, "WORK_OF_ART")]} | {"links": {span: {str_id: list(code.values())[0]}}}
    try:
        example = Example.from_dict(nlp.make_doc(text), new_ann)
    except Exception as e:
        continue
    example.reference = sentencizer(example.reference)
    #print(example)
    TRAIN_DOCS.append(example)

In [43]:
len(TRAIN_DOCS)

18577

In [44]:
entity_linker.initialize(get_examples=lambda: TRAIN_DOCS)

In [45]:
from spacy.util import minibatch, compounding

In [46]:
import os
os.environ["KMP_DUPLICATE_LIB_OK"]="TRUE"

In [47]:
with nlp.select_pipes(enable=["entity_linker"]):   # train only the entity_linker
    optimizer = nlp.resume_training()
    for itn in range(500):   # 500 iterations takes about a minute to train
        random.shuffle(TRAIN_DOCS)
        batches = minibatch(TRAIN_DOCS, size=compounding(4.0, 32.0, 1.001))  # increasing batch sizes
        losses = {}
        for batch in batches:
            nlp.update(
                batch,   
                drop=0.2,      # prevent overfitting
                losses=losses,
                sgd=optimizer,
            )
        if itn % 50 == 0:
            print(itn, "Losses", losses)   # print the training loss
print(itn, "Losses", losses)

KeyboardInterrupt: 

In [154]:
filtered_items[filtered_items["item_id"]==188285]

Unnamed: 0,item_id,en_label,en_description
174729,188285,motet,choral musical composition


In [152]:
ttt = "The 'Tenebrae Responsories' by Tomás Luis de Victoria are a set of eighteen motets for four voices a cappella. The late Renaissance Spanish composer set the Responsories for Holy Week known as Tenebrae responsories. They are liturgical texts prescribed for use in the Catholic observances during the Triduum of the Holy Week, in the Matins of Maundy Thursday, Good Friday and Holy Saturday. The compositions were published in Rome in 1585. The eighteen Tenebrae Responsories are set for four voices each but with varying disposition of the voices soprano (S), alto (A), tenor (T) and bass (B). Soprano, tenor and bass are at times divided. Six responsories are dedicated to each Matins of Maundy Thursday ("", the Lord's supper), Good Friday, and Holy Saturday (""). *"
ttt[76:81]

'motet'

In [22]:
entity_linker = nlp.create_pipe("entity_linker", config={"incl_prior":False})
entity_linker.set_kb(kb)
nlp.add_pipe(entity_linker, last=True)

In [26]:
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "entity_linker"]
with nlp.disable_pipes(*other_pipes):   # train only the entity_linker
    optimizer = nlp.begin_training()
    for itn in range(500):   # 500 iterations takes about a minute to train
        random.shuffle(TRAIN_DOCS)
        batches = minibatch(TRAIN_DOCS, size=compounding(4.0, 32.0, 1.001))  # increasing batch sizes
        losses = {}
        for batch in batches:
            texts, annotations = zip(*batch)
            nlp.update(
                texts,  
                annotations,   
                drop=0.2,      # prevent overfitting
                losses=losses,
                sgd=optimizer,
            )
        if itn % 50 == 0:
            print(itn, "Losses", losses)   # print the training loss
print(itn, "Losses", losses)

RuntimeError: [E188] Could not match the gold entity links to entities in the doc - make sure the gold EL data refers to valid results of the named entity recognizer in the `nlp` pipeline.

In [23]:


# other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "entity_linker"]
# with nlp.disable_pipes(*other_pipes):
#     optimizer = nlp.begin_training()
#     for i in range(500):
#         random.shuffle(TRAIN_DOCS)#DOSTOSOWAĆ DANE
#         batches = minibatch(TRAIN_DOCS, size=compounding(4.0, 32.0, 1.001))
#         losses = {}
#         for batch in batches:
#             texts, annotations = zip(*batch)
#             nlp.update(
#                 texts,
#                 annotations,
#                 drop=0.2,
#                 losses=losses,
#                 sgd=optimizer
#             )
#         if  i % 50 == 0:
#             print(i, "Losses", losses)

# nlp.to_disk("my_nlp_el")

# with open("test_set.pkl", "wb") as f:
#     pickle.dump(f)

RuntimeError: [E188] Could not match the gold entity links to entities in the doc - make sure the gold EL data refers to valid results of the named entity recognizer in the `nlp` pipeline.

In [None]:
spacy

In [None]:
nlp = spacy.load(output_dir / "my_nlp_el")
text = ""
doc = nlp(text)
for ent in doc.ents:
    print(ent.text, ent.label_, ent.kb_id_)