In [1]:
import pickle
import random
import spacy
import pandas as pd
from spacy.util import minibatch, compounding
from spacy.kb import KnowledgeBase

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
with open('./data/dataset/2023-01-14T001854_dataset.pkl', 'rb') as handle:
    mapped_data = pickle.load(handle)

In [3]:
with open('./ppl_filtered.pkl', 'rb') as handle:
    ppl_filtered = pickle.load(handle)

In [13]:
items = pd.read_csv("./data/archive/item.csv")

In [5]:
items_aliases = pd.read_csv("./data/archive/item_aliases.csv")

In [6]:
items.head(5)

Unnamed: 0,item_id,en_label,en_description
0,1,Universe,totality of space and all contents
1,2,Earth,third planet from the Sun in the Solar System
2,3,life,matter capable of extracting energy from the e...
3,4,death,permanent cessation of vital functions
4,5,human,"common name of Homo sapiens, unique extant spe..."


In [7]:
items_aliases.head(5)

Unnamed: 0,item_id,en_alias
0,1,Our Universe
1,1,The Universe
2,1,The Cosmos
3,1,cosmos
4,2,Blue Planet


In [8]:
filtered_items = items[items["item_id"].isin(ppl_filtered)].reset_index(drop=True)

In [9]:
filtered_items.item_id.values

array([     206,      254,      255, ..., 77197507, 77197510, 77224469],
      dtype=int64)

In [10]:
filtered_aliases = items_aliases[items_aliases["item_id"].isin(ppl_filtered)].reset_index(drop=True)

In [11]:
filtered_aliases

Unnamed: 0,item_id,en_alias
0,206,Stephen Joseph Harper
1,254,Mozart
2,254,Joannes Chrysostomus Wolfgangus Theophilus Ama...
3,254,W. A. Mozart
4,254,Johann Chrysostom Wolfgang Amadeus Mozart
...,...,...
27972,77193323,Lucien George Jr.
27973,77193323,Bow-Legged Lou
27974,77193643,Curt Bedeau
27975,77194102,Baby Gee


In [12]:
sorted_aliases = filtered_aliases.reset_index(drop=True).set_index(["en_alias"]).sort_index()

In [13]:
sorted_aliases

Unnamed: 0_level_0,item_id
en_alias,Unnamed: 1_level_1
"""Blind"" Boone",2906608
"""El Torito""",17278692
"""Fast"" Eddie Clarke",980220
"""Gatemouth"" Brown",1095432
"""Harmonica"" Smith",458486
...,...
프라임,16183261
한 태윤,3548661
한태윤,3548661
홍성지,2366485


In [14]:
sorted_aliases.index.values

array(['"Blind" Boone', '"El Torito"', '"Fast" Eddie Clarke', ..., '한태윤',
       '홍성지', '황찬성'], dtype=object)

In [15]:
filtered_items

Unnamed: 0,item_id,en_label,en_description
0,206,Stephen Harper,22nd Prime Minister of Canada
1,254,Wolfgang Amadeus Mozart,Austrian composer of the Classical period
2,255,Ludwig van Beethoven,German classical and romantic composer
3,303,Elvis Presley,American singer and actor
4,392,Bob Dylan,"American recording artist; singer-songwriter, ..."
...,...,...,...
90014,77197048,Gregg Maizel,"musician, member of the band Vigil"
90015,77197304,Jo Connor,"musician, member of the band Vigil"
90016,77197507,Andy R,"musician, member of the band Vigil"
90017,77197510,X Factor,"musician, member of the band Vigil"


In [16]:
sorted_ppl = filtered_items.reset_index(drop=True).set_index(["en_label"]).sort_index()

In [17]:
sorted_ppl.loc["40"].values

array([[12580615, 'Korean R&B singer songwriter'],
       [3599072, 'Canadian hip-hop record producer']], dtype=object)

In [18]:
sorted_ppl

Unnamed: 0_level_0,item_id,en_description
en_label,Unnamed: 1_level_1,Unnamed: 2_level_1
Joseph Thorne Harris,22070195,English pianist and composer
!PAUS3,3466056,Ukrainian musician
"""Bassy"" Bob Brockmann",13416958,American music producer and musician
"""E""qual",11184600,Japanese hip-hop artist and producer
"""King"" Bennie Nawahi",1742005,American steel guitar master from Hawaii
...,...,...
Ștefan Mangoianu,21396231,Romanian composer
Ștefan Niculescu,397145,Romanian composer
小松耕輔,11461237,Japanese composer
袁惟仁,9340858,Taiwanese musician


In [3]:
nlp = spacy.load("en_core_web_lg")
nlp_vectors = spacy.load("en_core_web_lg")

In [20]:
def create_kb(vocab):
    kb = KnowledgeBase(vocab=vocab, entity_vector_length=300)
    added_labels = set()
    for ind, label in enumerate(sorted_ppl.index.values):
        print(f"\r{ind:6} / {len(sorted_ppl)-1}",end="")
        if label in added_labels:
            continue

        res = sorted_ppl.loc[label]
        name = label
        added_labels.update([name])
        if type(res) == pd.Series:
            qid = [res.values[0]]
            desc = [res.values[1]]
        else:
            qid = res.values[:,0]
            desc = res.values[:,1]

        for ind, qid_ in enumerate(qid):
            desc_enc = nlp_vectors(desc[ind]).vector
            kb.add_entity(entity=str(qid_), entity_vector=desc_enc, freq=342)

        probs = [1/len(qid)] * len(qid) if len(qid) > 1 else [1]
        kb.add_alias(alias=name, entities=list(map(str, qid)), probabilities=probs)
    
    print()
    added_aliases = set()
    for ind, name in enumerate(sorted_aliases.index.values):
        print(f"\r{ind:6} / {len(sorted_aliases)-1}",end="")
        if (name in added_aliases) or (name in added_labels):
            continue
        
        res = sorted_aliases.loc[name]
        added_aliases.update([name])

        if type(res) == pd.Series:
            qids = [res.values[0]]
        else:
            qids = res.values[:,0]
        
        probs = [1/len(qids)] * len(qids) if len(qids) > 1 else [1]
        try:
            kb.add_alias(alias=name, entities=list(map(str, qids)), probabilities=probs)
        except Exception as e:
            print(e)
            print(name)
            print(list(map(str, qids)))
            print(probs)

    return kb

In [4]:
nlp.from_disk("./models/nlp_v2/")

<spacy.lang.en.English at 0x16ce4ee7730>

In [5]:
config={"incl_prior":False}
entity_linker = nlp.add_pipe("entity_linker", config=config)
entity_linker.from_disk("./models/entity_linker_v2_TRAINED")

<spacy.pipeline.entity_linker.EntityLinker at 0x16c9483c940>

In [22]:
resssb = sorted_aliases.loc["00029827555 IPI"]

In [23]:
# config={"incl_prior":False}
# entity_linker = nlp.add_pipe("entity_linker", config=config)
# entity_linker.set_kb(create_kb)

 90018 / 90018
 27976 / 27976

In [24]:
# entity_linker.to_disk("./models/entity_linker_v2")

In [6]:
#random.shuffle(mapped_data)

train_data = mapped_data[:300]
#test_data = mapped_data[18092:]

In [25]:
from spacy.training import Example

In [26]:
TRAIN_DOCS = []
if "sentencizer" not in nlp.pipe_names:
    nlp.add_pipe("sentencizer")
sentencizer = nlp.get_pipe("sentencizer")
count = 0
for ind, (text, annotation) in enumerate(train_data):
    print(f"\r{ind:6} / {len(train_data)-1}",end="")
    for span, code in annotation["links"].items():
        str_id = str(list(code.keys())[0])
        new_ann = {"entities": [(*span, "PERSON")]} | {"links": {span: {str_id: list(code.values())[0]}}}
    try:
        example = Example.from_dict(nlp.make_doc(text), new_ann)
    except Exception as e:
        count += 1
        continue
    example.reference = sentencizer(example.reference)
    #print(example)
    TRAIN_DOCS.append(example)

   299 / 299

In [28]:
count

2

In [49]:
# with open("TRAIN_DOCS_DATA.pkl", "wb") as file:
#     pickle.dump(TRAIN_DOCS, file)

In [50]:
# with open("TRAIN_DOCS_DATA.pkl", "rb") as file:
#     TRAIN_DOCS=pickle.load(file)

In [29]:
entity_linker.initialize(get_examples=lambda: TRAIN_DOCS)

In [27]:
from spacy.util import minibatch, compounding

In [31]:
len(TRAIN_DOCS)

298

In [28]:
import os
os.environ["KMP_DUPLICATE_LIB_OK"]="TRUE"

In [29]:
spacy.require_gpu(0)

True

In [34]:
with nlp.select_pipes(enable=["entity_linker"]):   # train only the entity_linker
    optimizer = nlp.resume_training()
    for itn in range(50):   # 500 iterations takes about a minute to train
        batches = minibatch(TRAIN_DOCS, size=compounding(4.0, 32.0, 1.001))  # increasing batch sizes
        losses = {}
        for i, batch in enumerate(batches):
            #print(f"\r{i:6}",end="")
            nlp.update(
                batch,   
                drop=0.2,      # prevent overfitting
                losses=losses,
                sgd=optimizer,
            )
        if itn % 10 == 0:
            print(itn, "Losses", losses)   # print the training loss
print(itn, "Losses", losses)

0 Losses {'entity_linker': 2.785752832889557}
10 Losses {'entity_linker': 0.816733717918396}
20 Losses {'entity_linker': 0.6053882539272308}
30 Losses {'entity_linker': 0.5001179724931717}
40 Losses {'entity_linker': 0.30261580646038055}
49 Losses {'entity_linker': 0.3449285328388214}


In [38]:
# entity_linker.to_disk("./models/entity_linker_v2_TRAINED")

In [39]:
# nlp.to_disk("./models/nlp_v2")

In [7]:
train_data[3][0]

'Schweitzer was born in Kaysersberg, Haute Alsace, the son of Louis Schweitzer and Adèle Schillinger. He spent his childhood in the Alsatian village of Gunsbach, where his father, the local Lutheran-Evangelical pastor of the EPCAAL, taught him how to play music. The tiny village became home to the Association Internationale Albert Schweitzer (AIAS). The medieval parish church of Gunsbach was shared by the Protestant and Catholic congregations, which held their prayers in different areas at different times on Sundays. This compromise arose after the Protestant Reformation and the Thirty Years\' War. Schweitzer, the pastor\'s son, grew up in this exceptional environment of religious tolerance, and developed the belief that true Christianity should always work towards a unity of faith and purpose. Schweitzer\'s first language was the Alsatian dialect of German language. At the Mulhouse gymnasium he received his "Abitur" (the certificate at the end of secondary education) in 1893. He studi

In [15]:
text= 'Bach'

In [175]:
def gen_from_ent(nlp, text):
    doc = nlp(text)

    print("<======== Detected ========>")

    for ent in doc.ents:
        if ent.kb_id_ != "NIL" :#and ent.label_ =="PERSON":
            print(ent.text, "https://www.wikidata.org/wiki/Q"+ent.kb_id_)
    
    
    print("\n<======== Not Detected ========>")
    
    for ent in doc.ents:
        if ent.kb_id_ == "NIL" and ent.label_ =="PERSON":
            print(ent.text, ent.label_,)

In [176]:
# doc = nlp(text)
# # for ent in doc.ents:
# #     if ent.label_ =="PERSON":
# #         print(ent.text, ent.kb_id_)

# print("<=================>")

# for ent in doc.ents:
#     if ent.kb_id_ != "NIL" :#and ent.label_ =="PERSON":
#         print(ent.text, "https://www.wikidata.org/wiki/Q"+ent.kb_id_)



# for ent in doc.ents:
#     if ent.kb_id_ == "NIL":
#         print(ent.text, ent.label_)


In [177]:
text1 = "Richard Wagner was a German composer and theatre director, primarily known for his operas. He is considered one of the most influential figures in Western classical music, and his works are still widely performed today. Wagner's use of leitmotifs, or recurring musical themes associated with specific characters, places, or ideas, was a major innovation in opera, and his use of chromatic harmony and orchestration was groundbreaking. Wagner's ideas and music have been the subject of much controversy, including his association with the Nazi Party and his anti-semitic writings. Despite this, his music is still widely regarded as some of the most powerful and evocative ever composed."

In [178]:
text2 = "Antonio Vivaldi was an Italian Baroque composer, virtuoso violinist, and priest. He is best known for his concertos, particularly The Four Seasons, a set of violin concertos that depict the seasons of the year. Vivaldi composed over 500 concertos, many of which were written for his all-female orchestra at the Ospedale della Pietà, an orphanage where he worked as a music teacher. His music was widely popular in his lifetime and was known for its technical brilliance, rhythmic vitality and melodic invention. He was a major influence on Johann Sebastian Bach and many other composers of his time. Vivaldi's music was rediscovered in the 20th century and is now considered an important part of the Baroque repertoire. He is also considered as one of the most important figures in the development of the concerto and the modern solo form."

In [179]:
text3 = "Wolfgang Amadeus Mozart was an Austrian composer and musician of the Classical period. He is widely considered to be one of the greatest composers of all time, and his works are still widely performed today. He began composing at the age of 5 and composed over 600 works, including operas, symphonies, chamber music, piano works, and religious music. Mozart's music is characterized by its melodic beauty, formal structure, and technical skill. His operas, such as The Marriage of Figaro, Don Giovanni, and The Magic Flute, are considered some of the greatest works in the genre and continue to be widely performed. His piano concertos and symphonies are also considered masterpieces of the Classical era. Mozart's influence on music was and still is immense, he exerted on the development of virtually every genre of Western music including opera, chamber music, and the piano concerto."

In [180]:
text4 = "Johann Sebastian Bach was a German composer and musician of the Baroque period. He is widely considered to be one of the greatest composers of all time and his music is known for its technical depth, emotional range, and profound faith. He composed in nearly every genre of his time, including sacred and secular vocal works (such as the St Matthew Passion and the Mass in B Minor), keyboard music (such as the Well-Tempered Clavier) and orchestral music (such as the Brandenburg Concertos). Bach's music is known for its complex counterpoint and use of fugues, as well as its emotional expressiveness. He was also a notable organist and many of his works were written specifically for the organ. Bach's music was not widely known during his lifetime, but his work was rediscovered in the 19th century and has since come to be considered a cornerstone of Western classical music. His music is still widely performed and studied today, and continues to be an inspiration to many composers."

In [188]:
text5 = """Johann Sebastian Bach, the renowned German composer and musician, had always dreamed of visiting the city of Paris. Finally, the opportunity presented itself and Bach eagerly set off on his journey.
As soon as he arrived in Paris, Bach was struck by the city's beauty and grandeur. He couldn't wait to start exploring and experiencing all that it had to offer. The first stop on his itinerary was a visit to the famous Notre-Dame Cathedral. As he walked through the ancient building, Bach couldn't help but feel a sense of awe and inspiration. He knew that he had to compose a piece of music to honor such a magnificent structure.
As the days passed, Bach spent his time visiting other famous landmarks such as the Eiffel Tower and the Louvre Museum. He also had the chance to meet with other musicians and composers of his time, and was thrilled to exchange ideas and learn from them. But the highlight of his trip was a concert he gave at the Palace of Versailles, playing his own compositions and impressing the royal court with his virtuosity.
As his trip came to an end, Bach realized that his time in Paris had been a truly transformative experience. He had not only been able to see and experience new things, but he had also been able to draw inspiration from them. Bach returned home with a renewed sense of purpose and an eagerness to continue composing and creating music that would inspire others just as Paris had inspired him."""

In [189]:
gen_from_ent(nlp, text1)

Richard Wagner https://www.wikidata.org/wiki/Q1511
Wagner https://www.wikidata.org/wiki/Q1511
Wagner https://www.wikidata.org/wiki/Q1511



In [190]:
gen_from_ent(nlp, text2)

Antonio Vivaldi https://www.wikidata.org/wiki/Q1340
Vivaldi https://www.wikidata.org/wiki/Q1340
Johann Sebastian Bach https://www.wikidata.org/wiki/Q1339
Vivaldi https://www.wikidata.org/wiki/Q1340



In [191]:
gen_from_ent(nlp, text3)

Wolfgang Amadeus Mozart https://www.wikidata.org/wiki/Q254
Mozart https://www.wikidata.org/wiki/Q254
Mozart https://www.wikidata.org/wiki/Q254

Don Giovanni PERSON


In [192]:
gen_from_ent(nlp, text4)

Johann Sebastian Bach https://www.wikidata.org/wiki/Q1339
Bach https://www.wikidata.org/wiki/Q1339
Bach https://www.wikidata.org/wiki/Q1339



In [193]:
gen_from_ent(nlp, text5)

Johann Sebastian Bach https://www.wikidata.org/wiki/Q1339
Paris https://www.wikidata.org/wiki/Q520621
Bach https://www.wikidata.org/wiki/Q1339
Paris https://www.wikidata.org/wiki/Q520621
Bach https://www.wikidata.org/wiki/Q1339
Bach https://www.wikidata.org/wiki/Q1339
Bach https://www.wikidata.org/wiki/Q1339
Bach https://www.wikidata.org/wiki/Q1339
Paris https://www.wikidata.org/wiki/Q520621
Bach https://www.wikidata.org/wiki/Q1339
Paris https://www.wikidata.org/wiki/Q520621

