In [1]:
import pickle
import random
import spacy
import pandas as pd
from spacy.util import minibatch, compounding
from spacy.kb import KnowledgeBase

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
with open('./data/dataset/2023-01-14T001854_dataset.pkl', 'rb') as handle:
    mapped_data = pickle.load(handle)

In [3]:
mapped_data[3]

('Schweitzer was born in Kaysersberg, Haute Alsace, the son of Louis Schweitzer and Adèle Schillinger. He spent his childhood in the Alsatian village of Gunsbach, where his father, the local Lutheran-Evangelical pastor of the EPCAAL, taught him how to play music. The tiny village became home to the Association Internationale Albert Schweitzer (AIAS). The medieval parish church of Gunsbach was shared by the Protestant and Catholic congregations, which held their prayers in different areas at different times on Sundays. This compromise arose after the Protestant Reformation and the Thirty Years\' War. Schweitzer, the pastor\'s son, grew up in this exceptional environment of religious tolerance, and developed the belief that true Christianity should always work towards a unity of faith and purpose. Schweitzer\'s first language was the Alsatian dialect of German language. At the Mulhouse gymnasium he received his "Abitur" (the certificate at the end of secondary education) in 1893. He stud

In [4]:
with open('./ppl_filtered.pkl', 'rb') as handle:
    ppl_filtered = pickle.load(handle)

In [47]:
items = pd.read_csv("./data/archive/item.csv")

In [6]:
items_aliases = pd.read_csv("./data/archive/item_aliases.csv")

In [7]:
items.head(5)

Unnamed: 0,item_id,en_label,en_description
0,1,Universe,totality of space and all contents
1,2,Earth,third planet from the Sun in the Solar System
2,3,life,matter capable of extracting energy from the e...
3,4,death,permanent cessation of vital functions
4,5,human,"common name of Homo sapiens, unique extant spe..."


In [8]:
items_aliases.head(5)

Unnamed: 0,item_id,en_alias
0,1,Our Universe
1,1,The Universe
2,1,The Cosmos
3,1,cosmos
4,2,Blue Planet


In [74]:
filtered_items = items[items["item_id"].isin(ppl_filtered)].reset_index(drop=True)

In [75]:
filtered_aliases = items_aliases[items_aliases["item_id"].isin(ppl_filtered)].reset_index(drop=True)

In [73]:
filtered_aliases

Unnamed: 0,item_id,en_alias
0,206,Stephen Joseph Harper
1,254,Mozart
2,254,Joannes Chrysostomus Wolfgangus Theophilus Ama...
3,254,W. A. Mozart
4,254,Johann Chrysostom Wolfgang Amadeus Mozart
...,...,...
27972,77193323,Lucien George Jr.
27973,77193323,Bow-Legged Lou
27974,77193643,Curt Bedeau
27975,77194102,Baby Gee


In [76]:
filtered_items

Unnamed: 0,item_id,en_label,en_description
0,206,Stephen Harper,22nd Prime Minister of Canada
1,254,Wolfgang Amadeus Mozart,Austrian composer of the Classical period
2,255,Ludwig van Beethoven,German classical and romantic composer
3,303,Elvis Presley,American singer and actor
4,392,Bob Dylan,"American recording artist; singer-songwriter, ..."
...,...,...,...
90014,77197048,Gregg Maizel,"musician, member of the band Vigil"
90015,77197304,Jo Connor,"musician, member of the band Vigil"
90016,77197507,Andy R,"musician, member of the band Vigil"
90017,77197510,X Factor,"musician, member of the band Vigil"


In [77]:
sorted_ppl = filtered_items.reset_index(drop=True).set_index(["en_label"]).sort_index()

In [108]:
sorted_ppl.loc["40"].values

array([[12580615, 'Korean R&B singer songwriter'],
       [3599072, 'Canadian hip-hop record producer']], dtype=object)

In [107]:
sorted_ppl

Unnamed: 0_level_0,item_id,en_description
en_label,Unnamed: 1_level_1,Unnamed: 2_level_1
Joseph Thorne Harris,22070195,English pianist and composer
!PAUS3,3466056,Ukrainian musician
"""Bassy"" Bob Brockmann",13416958,American music producer and musician
"""E""qual",11184600,Japanese hip-hop artist and producer
"""King"" Bennie Nawahi",1742005,American steel guitar master from Hawaii
...,...,...
Ștefan Mangoianu,21396231,Romanian composer
Ștefan Niculescu,397145,Romanian composer
小松耕輔,11461237,Japanese composer
袁惟仁,9340858,Taiwanese musician


In [28]:
nlp = spacy.load("en_core_web_lg")
nlp_vectors = spacy.load("en_core_web_lg")

In [29]:
def create_kb(vocab):
    kb = KnowledgeBase(vocab=vocab, entity_vector_length=300)
    added_labels = set()
    for ind, label in enumerate(sorted_ppl.index.values):
        print(f"\r{ind:6} / {len(sorted_ppl)-1}",end="")
        if label in added_labels:
            continue

        res = sorted_ppl.loc[label]
        name = label
        added_labels.update([name])
        if type(res) == pd.Series:
            qid = [res.values[0]]
            desc = [res.values[1]]
        else:
            qid = res.values[:,0]
            desc = res.values[:,1]

        for ind, qid_ in enumerate(qid):
            desc_enc = nlp_vectors(desc[ind]).vector
            kb.add_entity(entity=str(qid_), entity_vector=desc_enc, freq=342)

        probs = [1/len(qid)] * len(qid) if len(qid) > 1 else [1]
        # try:
        kb.add_alias(alias=name, entities=list(map(str, qid)), probabilities=probs)
        # except Exception as e:
        #     print(name)
        #     print(list(qid))
        #     print(probs)
        #     raise e
    return kb

In [30]:
config={"incl_prior":False}
entity_linker = nlp.add_pipe("entity_linker", config=config)
entity_linker.from_disk("./models/entity_linker_v1")

<spacy.pipeline.entity_linker.EntityLinker at 0x18b6e181160>

In [117]:
# config={"incl_prior":False}
# entity_linker = nlp.add_pipe("entity_linker", config=config)
# entity_linker.set_kb(create_kb)

 90018 / 90018

In [119]:
#entity_linker.to_disk("./models/entity_linker_v1")

In [31]:
#random.shuffle(mapped_data)

train_data = mapped_data[:1000]
#test_data = mapped_data[18092:]

In [32]:
from spacy.training import Example

In [33]:
TRAIN_DOCS = []
if "sentencizer" not in nlp.pipe_names:
    nlp.add_pipe("sentencizer")
sentencizer = nlp.get_pipe("sentencizer")
count = 0
for ind, (text, annotation) in enumerate(train_data):
    print(f"\r{ind:6} / {len(train_data)-1}",end="")
    for span, code in annotation["links"].items():
        str_id = str(list(code.keys())[0])
        new_ann = {"entities": [(*span, "PERSON")]} | {"links": {span: {str_id: list(code.values())[0]}}}
    try:
        example = Example.from_dict(nlp.make_doc(text), new_ann)
    except Exception as e:
        count += 1
        continue
    example.reference = sentencizer(example.reference)
    #print(example)
    TRAIN_DOCS.append(example)

   999 / 999

In [34]:
count

7

In [129]:
# with open("TRAIN_DOCS_DATA.pkl", "wb") as file:
#     pickle.dump(TRAIN_DOCS, file)

In [13]:
# with open("TRAIN_DOCS_DATA.pkl", "rb") as file:
#     TRAIN_DOCS=pickle.load(file)

In [35]:
entity_linker.initialize(get_examples=lambda: TRAIN_DOCS)

In [36]:
from spacy.util import minibatch, compounding

In [37]:
len(TRAIN_DOCS)

993

In [38]:
import os
os.environ["KMP_DUPLICATE_LIB_OK"]="TRUE"

In [39]:
spacy.prefer_gpu()

True

In [41]:
with nlp.select_pipes(enable=["entity_linker"]):   # train only the entity_linker
    optimizer = nlp.resume_training()
    for itn in range(50):   # 500 iterations takes about a minute to train
        batches = minibatch(TRAIN_DOCS, size=compounding(4.0, 32.0, 1.001))  # increasing batch sizes
        losses = {}
        for i, batch in enumerate(batches):
            print(f"\r{i:6}",end="")
            nlp.update(
                batch,   
                drop=0.2,      # prevent overfitting
                losses=losses,
                sgd=optimizer,
            )
        print()
        if itn % 10 == 0:
            print(itn, "Losses", losses)   # print the training loss
print(itn, "Losses", losses)

   243
0 Losses {'entity_linker': 0.6821393668651581}
   243
   243
   243
   243
   243
   243
   243
   243
   243
   243
10 Losses {'entity_linker': 0.6010531932115555}
   243
   243
   243
   243
   243
   243
   243
   243
   243
   243
20 Losses {'entity_linker': 0.4001265615224838}
   243
   243
   243
   243
   243
   243
   243
   243
   243
   243
30 Losses {'entity_linker': 0.28795699775218964}
   243
   243
   243
   243
   243
   243
   243
   243
   243
   243
40 Losses {'entity_linker': 0.19795291125774384}
   243
   243
   243
   243
   243
   243
   243
   243
   243
49 Losses {'entity_linker': 0.11580753326416016}


In [55]:
train_data[3][0]

'After his many various jobs, he decided that he wanted to write in Tin Pan Alley and signed several contracts with music publishers before his contracts were eventually terminated. His first song credit is listed as "In Love with the Memory of You", with music by William Schuman, published in 1931. Loesser\'s early lyrics included two hit songs of 1934, "Junk Man" and "I Wish I Were Twins" (both with music by Joe Meyer, and the latter with co-lyric credit to Eddie DeLange). However, they apparently did not help his reputation, and in later years, he never mentioned them. In the mid-1930s he would sing for his suppers at The Back Drop, a night spot on east 52nd Street along with composer Irving Actman, but during the day he worked on the staff of Leo Feist Inc. writing lyrics to Joseph Brandfon\'s music at $100 a week. After a year, Feist had not published any of them. He fared only slightly better collaborating with the future classical composer William Schuman, selling one song, that

In [58]:
text = "As a young musician, William Schuman compositions greatly influenced my understanding of the American classical music tradition."
doc = nlp(text)
for ent in doc.ents:
    print(ent.text, ent.label_, ent.kb_id_)

William Schuman PERSON 503225
American NORP NIL


In [52]:
items[items["item_id"] == 1238247]

Unnamed: 0,item_id,en_label,en_description
1027281,1238247,Mohammad-Reza Shajarian,Iranian singer and musician
