In [17]:
from SPARQLWrapper import SPARQLWrapper, JSON
from transformers import AutoTokenizer, AutoModel
import torch
from sklearn.metrics.pairwise import cosine_similarity

# Load KB-BERT tokenizer and model
# Tokenizer and encoder are loaded from the same checkpoint name, so it is
# spelled out once and reused for both calls.
KB_BERT_CHECKPOINT = "KB/bert-base-swedish-cased"
tokenizer = AutoTokenizer.from_pretrained(KB_BERT_CHECKPOINT)
model = AutoModel.from_pretrained(KB_BERT_CHECKPOINT)

# Define your SPARQL endpoint and query
sparql = SPARQLWrapper("https://query.wikidata.org/sparql")

# The encyclopedia entry to link: its headword is used to look up candidate
# Wikidata items, its definition text is what candidates are compared against.
headword = "Andelsmejeri"
definition = "kallas ett mejeri, anlagdt som samaffär af ett antal landtbrukare i samma trakt och drifvet för deras räkning, i det att inkomsten delas i förhållande till mängden af den mjölk, som en hvar af dem levererat. Denna föres som söt mjölk till mejeriet, och sedan den blifvit skummad, föres skummjölken tillbaka till producenterna. Det första andelsmejeriet i Danmark inrättades 1882 i närheten af Varde, men idén dertill utbredde sig hastigt, så att nu finnas omkr. 1,000 sådana, spridda öfver hela land"

# The headword is spliced into a double-quoted SPARQL string literal below.
# Escape the characters that would otherwise end the literal early or make it
# syntactically invalid (backslash first, so later escapes are not doubled).
escaped_headword = (
    headword.replace("\\", "\\\\")
    .replace('"', '\\"')
    .replace("\n", "\\n")
    .replace("\r", "\\r")
)

# Select items whose Swedish label equals the headword exactly and that have
# a Swedish description; at most 10 candidates are returned.
query = f"""
SELECT ?item ?itemLabel ?description WHERE {{
  ?item rdfs:label "{escaped_headword}"@sv.
  ?item schema:description ?description.
  FILTER(LANG(?description) = "sv").
  SERVICE wikibase:label {{ bd:serviceParam wikibase:language "[AUTO_LANGUAGE],sv". }}
}}
LIMIT 10
"""
sparql.setQuery(query)
sparql.setReturnFormat(JSON)
# Runs the query over the network and parses the JSON response.
results = sparql.query().convert()

# Extract results
# Flatten each SPARQL JSON binding into a plain dict holding only the values
# the rest of the script needs.
items = [
    {
        "uri": binding["item"]["value"],
        "label": binding["itemLabel"]["value"],
        "description": binding["description"]["value"],
    }
    for binding in results["results"]["bindings"]
]

# Function to compute sentence embeddings using KB-BERT
def get_embedding(text):
    """Return a mean-pooled KB-BERT embedding of ``text`` as a CPU tensor.

    The text is tokenized (truncated to at most 128 tokens), run through the
    module-level ``model`` without gradient tracking, and the final-layer
    token vectors are averaged along dimension 1. The leading dimension is
    then squeezed away, which for a single input string presumably leaves a
    1-D vector of the model's hidden size.
    """
    encoded = tokenizer(
        text,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=128,
    )
    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        token_states = model(**encoded).last_hidden_state
    # Use mean pooling over the token embeddings
    pooled = token_states.mean(dim=1)
    return pooled.squeeze(0).cpu()

# Encode definition and descriptions
definition_embedding = get_embedding(definition)

if items:
    # One embedding per candidate, stacked row-wise in the same order as
    # `items`, so row i of the matrix belongs to items[i].
    description_embeddings = torch.stack(
        [get_embedding(item["description"]) for item in items]
    )

    # Compute cosine similarity between definition and descriptions
    # (one query row against all candidate rows, flattened to one score each).
    cosine_scores = cosine_similarity(
        definition_embedding.unsqueeze(0).numpy(),
        description_embeddings.numpy(),
    ).flatten()
else:
    # The query matched nothing. torch.stack() raises on an empty list, so
    # skip the comparison and leave an empty score sequence for later steps.
    cosine_scores = []

# Attach scores to items and sort by similarity
# Scores are positionally aligned with `items`, so pair them up directly.
for candidate, similarity in zip(items, cosine_scores):
    candidate["score"] = similarity
# Best match first.
items.sort(key=lambda candidate: candidate["score"], reverse=True)

# Display results
# Each candidate is printed as four labelled lines followed by a blank line.
for entry in items:
    print(
        f"URI: {entry['uri']}",
        f"Label: {entry['label']}",
        f"Description: {entry['description']}",
        f"Score: {entry['score']:.4f}",
        "",
        sep="\n",
    )


URI: http://www.wikidata.org/entity/Q2211
Label: Malmö
Description: tätort i Skåne, Sverige
Score: 0.8758

URI: http://www.wikidata.org/entity/Q117125790
Label: Malmö
Description: tågfärja
Score: 0.8227

URI: http://www.wikidata.org/entity/Q1884148
Label: Malmö
Description: Wikimedia-förgreningssida
Score: 0.7590

URI: http://www.wikidata.org/entity/Q108747955
Label: Malmö
Description: teckning av Gustaf Wilhelm Palm
Score: 0.6061

