In [23]:
import requests

def search_wikidata(term):
    base_url = "https://www.wikidata.org/w/api.php"
    params = {
        "action": "wbsearchentities",
        "language": "en",
        "format": "json",
        "search": term
    }
    response = requests.get(base_url, params=params)
    return response.json()

# Example usage
search_results = search_wikidata("Victoria Falls")
print(search_results)


{'searchinfo': {'search': 'Victoria Falls'}, 'search': [{'id': 'Q43278', 'title': 'Q43278', 'pageid': 45475, 'display': {'label': {'value': 'Victoria Falls', 'language': 'en'}, 'description': {'value': 'waterfall on the Zambezi River in Zambia and Zimbabwe', 'language': 'en'}}, 'repository': 'wikidata', 'url': '//www.wikidata.org/wiki/Q43278', 'concepturi': 'http://www.wikidata.org/entity/Q43278', 'label': 'Victoria Falls', 'description': 'waterfall on the Zambezi River in Zambia and Zimbabwe', 'match': {'type': 'label', 'language': 'en', 'text': 'Victoria Falls'}}, {'id': 'Q1393804', 'title': 'Q1393804', 'pageid': 1332889, 'display': {'label': {'value': 'Victoria Falls', 'language': 'en'}, 'description': {'value': 'town in the province of Matabeleland North, Zimbabwe', 'language': 'en'}}, 'repository': 'wikidata', 'url': '//www.wikidata.org/wiki/Q1393804', 'concepturi': 'http://www.wikidata.org/entity/Q1393804', 'label': 'Victoria Falls', 'description': 'town in the province of Matabe

In [24]:

def extract_entities(api_response):
    entities = []
    for item in api_response['search']:
        entity = {
            'id': item['id'],
            'label': item['label'],
            'description': item['description'],
            'url': f"https://www.wikidata.org/wiki/{item['id']}"
        }
        entities.append(entity)
    return entities

# Use the function on the API response you received
extracted_entities = extract_entities(search_results)
print(extracted_entities)


KeyError: 'description'

#### Load the Pre-trained Model and Tokenizer: We'll load DistilBERT and its tokenizer.

In [25]:
import torch
from transformers import DistilBertTokenizer, DistilBertModel

# Load pre-trained model and tokenizer
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model = DistilBertModel.from_pretrained('distilbert-base-uncased')

def get_sentence_embedding(sentence):
    # Tokenize the input sentence and convert to tensor
    inputs = tokenizer(sentence, return_tensors="pt", padding=True, truncation=True)
    
    # Get embeddings from the model
    with torch.no_grad():
        outputs = model(**inputs)
        # Use the pooled output for sentence-level embeddings
        embeddings = outputs.last_hidden_state.mean(dim=1)
    return embeddings

# Example usage
sentence = "Victoria Falls is one of the seven wonders of the world"
embedding = get_sentence_embedding(sentence)
print(embedding)


tensor([[-4.2955e-01,  9.1230e-02,  2.2616e-01,  2.9911e-02,  3.1104e-01,
         -6.6161e-03,  6.3255e-01,  7.9957e-01, -4.8268e-01, -1.6842e-01,
          1.2238e-01, -7.3071e-01, -1.1764e-01,  9.0988e-01, -2.2506e-01,
          1.1047e-01,  2.9680e-02,  7.9962e-02,  2.1271e-01,  2.1784e-01,
         -6.1431e-02, -2.8651e-01,  2.1794e-03,  2.5708e-01,  3.3211e-01,
          1.0114e-01, -2.0919e-01,  1.0943e-01,  3.4491e-02, -1.2300e-01,
          7.5801e-02,  8.5728e-02, -3.0309e-01, -2.1564e-01, -1.0767e-01,
         -1.1439e-01, -4.9221e-03, -4.0638e-01, -3.4736e-01,  3.8438e-01,
         -6.4663e-01, -4.6051e-01,  2.5702e-01, -6.5436e-02, -1.0081e-01,
         -1.2579e-01,  7.8719e-02, -2.2484e-01,  1.8120e-01, -1.3343e-01,
         -2.4357e-01,  3.4546e-01, -1.5124e-01,  1.2019e-01,  1.8515e-01,
          5.2130e-01, -1.8632e-01,  1.1250e-01, -5.1981e-01, -8.0578e-02,
         -3.1802e-02,  1.0935e-01, -9.7734e-02, -4.0052e-01, -1.9681e-01,
          1.0941e-01,  1.3305e-01,  4.

In [26]:
def get_entity_embedding(text):
    # Tokenize and get embedding
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
    with torch.no_grad():
        outputs = model(**inputs)
        embeddings = outputs.last_hidden_state.mean(dim=1)
    return embeddings

# Example usage for an entity label
entity_label = "Victoria Falls"
entity_embedding = get_entity_embedding(entity_label)
print(entity_embedding)


tensor([[ 1.1037e-01,  2.7955e-01, -1.7265e-01,  1.2097e-01,  7.5108e-02,
         -2.2447e-02,  5.9097e-01,  9.8301e-02, -2.3359e-01, -1.5754e-01,
         -2.4414e-02, -3.0062e-01,  1.1841e-01,  3.1373e-01, -3.4294e-01,
          3.1577e-03, -8.2589e-02, -8.3827e-02,  5.3908e-01, -4.4387e-02,
         -7.8509e-02, -3.7347e-01,  1.0616e-01,  2.6537e-01,  1.2256e-01,
          5.5676e-02, -2.7630e-02,  1.3146e-01, -8.9944e-02, -2.6673e-01,
         -2.2254e-02, -1.1912e-01,  3.8635e-02,  1.3671e-01, -6.4841e-02,
         -4.2411e-01,  3.8246e-02, -8.5504e-02, -3.0877e-01, -1.2501e-01,
         -3.2923e-01, -3.7611e-01, -2.1320e-01, -2.2347e-01,  2.9087e-01,
         -8.9807e-02, -2.0779e-01, -1.0389e-01, -2.7529e-01, -3.2148e-01,
         -4.2356e-01,  4.4369e-01, -2.8014e-01,  2.9709e-02,  1.4740e-01,
          2.5723e-01,  3.7854e-02,  2.7916e-02, -5.4874e-02,  4.5558e-03,
          1.8528e-01,  1.3546e-01, -9.5570e-02, -7.9707e-02,  2.3744e-01,
         -1.1839e-01,  3.2108e-01,  9.

In [27]:
from torch.nn.functional import cosine_similarity

# Assuming 'embedding' is the sentence embedding and 'entity_embedding' is obtained for an entity
similarity = cosine_similarity(embedding, entity_embedding)
print(similarity)


tensor([0.6224])


In [28]:
from torch.nn.functional import cosine_similarity

def find_best_entity_match(sentence, entities):
    sentence_embedding = get_sentence_embedding(sentence)
    best_entity = None
    highest_similarity = -1  # Start with the lowest possible similarity

    for entity in entities:
        entity_label = entity['label']
        entity_embedding = get_entity_embedding(entity_label)
        
        # Calculate similarity
        similarity = cosine_similarity(sentence_embedding, entity_embedding).item()  # Convert tensor to a Python float
        print(f"Similarity with {entity_label}: {similarity}")  # Optional: print similarity for each entity

        if similarity > highest_similarity:
            highest_similarity = similarity
            best_entity = entity

    return best_entity, highest_similarity

# Example usage
sentence = "Liverpool won the match today."
best_match, similarity_score = find_best_entity_match(sentence, extracted_entities)
print(f"Best Match: {best_match}, Similarity Score: {similarity_score}")


Similarity with Liverpool: 0.3676189184188843
Similarity with Liverpool F.C.: 0.527188777923584
Similarity with Liverpool: 0.3676189184188843
Similarity with Liverpool: 0.3676189184188843
Similarity with Liverpool: 0.3676189184188843
Similarity with Liverpool: 0.3676189184188843
Similarity with Liverpool: 0.3676189184188843
Best Match: {'id': 'Q1130849', 'label': 'Liverpool F.C.', 'description': 'Association football club in Liverpool, England', 'url': 'https://www.wikidata.org/wiki/Q1130849'}, Similarity Score: 0.527188777923584


In [29]:
import spacy

# Load spaCy's pre-trained English model
nlp = spacy.load("en_core_web_sm")

def extract_key_terms(sentence):
    doc = nlp(sentence)
    return [ent.text for ent in doc.ents]

# Example usage
sentence = "Victoria Falls is a waterfall in southern Africa on the Zambezi River at the border between Zambia and Zimbabwe."
key_terms = extract_key_terms(sentence)
print(key_terms)  # Should print entities like 'Eiffel Tower' and 'Paris'


['Victoria Falls', 'Africa', 'the Zambezi River', 'Zambia', 'Zimbabwe']


In [30]:
def search_wikidata(term):
    base_url = "https://www.wikidata.org/w/api.php"
    params = {
        "action": "wbsearchentities",
        "language": "en",
        "format": "json",
        "search": term
    }
    response = requests.get(base_url, params=params)
    return response.json()

# Example usage for each key term
for term in key_terms:
    wikidata_results = search_wikidata(term)
    print(f"Results for {term}: {wikidata_results}")


Results for Victoria Falls: {'searchinfo': {'search': 'Victoria Falls'}, 'search': [{'id': 'Q43278', 'title': 'Q43278', 'pageid': 45475, 'display': {'label': {'value': 'Victoria Falls', 'language': 'en'}, 'description': {'value': 'waterfall on the Zambezi River in Zambia and Zimbabwe', 'language': 'en'}}, 'repository': 'wikidata', 'url': '//www.wikidata.org/wiki/Q43278', 'concepturi': 'http://www.wikidata.org/entity/Q43278', 'label': 'Victoria Falls', 'description': 'waterfall on the Zambezi River in Zambia and Zimbabwe', 'match': {'type': 'label', 'language': 'en', 'text': 'Victoria Falls'}}, {'id': 'Q1393804', 'title': 'Q1393804', 'pageid': 1332889, 'display': {'label': {'value': 'Victoria Falls', 'language': 'en'}, 'description': {'value': 'town in the province of Matabeleland North, Zimbabwe', 'language': 'en'}}, 'repository': 'wikidata', 'url': '//www.wikidata.org/wiki/Q1393804', 'concepturi': 'http://www.wikidata.org/entity/Q1393804', 'label': 'Victoria Falls', 'description': 'to

In [35]:
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline

tokenizer = AutoTokenizer.from_pretrained("dslim/bert-base-NER")
model = AutoModelForTokenClassification.from_pretrained("dslim/bert-base-NER")

nlp = pipeline("ner", model=model, tokenizer=tokenizer)
example = "Most people in Berlin think that Angela Merkel is the Chancellor of Germany"

ner_results = nlp(example)
print(ner_results)


Some weights of the model checkpoint at dslim/bert-base-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


[{'entity': 'B-LOC', 'score': 0.9997603, 'index': 4, 'word': 'Berlin', 'start': 15, 'end': 21}, {'entity': 'B-PER', 'score': 0.9997875, 'index': 7, 'word': 'Angela', 'start': 33, 'end': 39}, {'entity': 'I-PER', 'score': 0.99973124, 'index': 8, 'word': 'Me', 'start': 40, 'end': 42}, {'entity': 'I-PER', 'score': 0.99930453, 'index': 9, 'word': '##rk', 'start': 42, 'end': 44}, {'entity': 'I-PER', 'score': 0.88110745, 'index': 10, 'word': '##el', 'start': 44, 'end': 46}, {'entity': 'B-LOC', 'score': 0.99969304, 'index': 15, 'word': 'Germany', 'start': 68, 'end': 75}]
