In [31]:
from SPARQLWrapper import SPARQLWrapper, JSON
from transformers import AutoTokenizer, AutoModel
import torch
from sklearn.metrics.pairwise import cosine_similarity
import requests

def fetch_coordinates(qid, *, timeout=30):
    """Return the (latitude, longitude) of a Wikidata entity, or None.

    Asks the Wikidata ``wbgetentities`` API for the claims of *qid* and reads
    the coordinate-location property (P625).

    Args:
        qid: Wikidata entity ID such as ``"Q138311"``.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        A ``(lat, lon)`` tuple from the first P625 claim that carries a
        value, or ``None`` when the entity is missing from the response or
        has no usable P625 claim.

    Raises:
        requests.HTTPError: The API answered with a 4xx/5xx status.
        requests.RequestException: Connection failure or timeout.
    """
    url = "https://www.wikidata.org/w/api.php"
    params = {
        "action": "wbgetentities",
        "ids": qid,
        "props": "claims",
        "format": "json"
    }
    # Without a timeout a stalled connection would block the caller forever.
    response = requests.get(url, params=params, timeout=timeout)
    # Fail loudly on HTTP errors instead of trying to parse an error page.
    response.raise_for_status()
    data = response.json()

    # Extract coordinates from property P625
    claims = data.get("entities", {}).get(qid, {}).get("claims", {})
    for claim in claims.get("P625", []):
        # A statement without a concrete value (Wikibase "novalue" /
        # "somevalue" snaks) has no "datavalue" key, so look it up
        # defensively and move on to the next claim.
        value = claim.get("mainsnak", {}).get("datavalue", {}).get("value")
        if value:
            return value["latitude"], value["longitude"]
    return None


def fetch_wikidata_entity(qid, *, timeout=30):
    """Fetch the full ``wbgetentities`` JSON document for a Wikidata entity.

    Args:
        qid: Wikidata entity ID such as ``"Q138311"``.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        The decoded JSON response of the Wikidata API as a dict.

    Raises:
        Exception: The API answered with a status code other than 200.
        requests.RequestException: Connection failure or timeout.
    """
    url = "https://www.wikidata.org/w/api.php"
    params = {
        "action": "wbgetentities",
        "ids": qid,
        "format": "json"
    }
    # Without a timeout a stalled connection would block the caller forever.
    response = requests.get(url, params=params, timeout=timeout)
    if response.status_code != 200:
        # Exception type and message are kept as-is for existing callers.
        raise Exception(f"Failed to fetch data: {response.status_code}")
    return response.json()

# Load the KB-BERT tokenizer and model; both come from the same pretrained
# checkpoint, so the identifier is spelled out only once
_KB_BERT_CHECKPOINT = "KB/bert-base-swedish-cased"
tokenizer = AutoTokenizer.from_pretrained(_KB_BERT_CHECKPOINT)
model = AutoModel.from_pretrained(_KB_BERT_CHECKPOINT)

# Define your SPARQL endpoint and query
sparql = SPARQLWrapper("https://query.wikidata.org/sparql")
headword = "Elizabeth"
definition = "stad i nordamerikanska staten New Jersey, vid Staten-Sound. 19 kilom. (2,5 mil) från New York, till hvilken den nästan kan anses som förstad. 20,832 innev. (1870). E. anlades 1665 och var 175557 New Jerseys hufvudstad."

# Escape backslashes and double quotes so that a headword containing them
# cannot end the SPARQL string literal early and produce a malformed query.
# Backslashes are handled first so the escapes added for quotes stay intact.
escaped_headword = headword.replace("\\", "\\\\").replace('"', '\\"')

# Find items whose Swedish label equals the headword and that have a
# Swedish description; at most 10 candidates are returned.
query = f"""
SELECT ?item ?itemLabel ?description WHERE {{
  ?item rdfs:label "{escaped_headword}"@sv.
  ?item schema:description ?description.
  FILTER(LANG(?description) = "sv").
  SERVICE wikibase:label {{ bd:serviceParam wikibase:language "[AUTO_LANGUAGE],sv". }}
}}
LIMIT 10
"""
sparql.setQuery(query)
sparql.setReturnFormat(JSON)
results = sparql.query().convert()

# Turn each result row into a candidate record, skipping rows whose
# description is empty or whitespace-only
items = [
    {
        "uri": row["item"]["value"],
        "label": row["itemLabel"]["value"],
        "description": row["description"]["value"],
    }
    for row in results["results"]["bindings"]
    if row["description"]["value"].strip()
]

# Function to compute sentence embeddings using KB-BERT
def get_embedding(text):
    """Embed *text* with KB-BERT using attention-masked mean pooling.

    The text is tokenized (truncated to at most 128 tokens) and run through
    the module-level ``model`` without gradient tracking. The token vectors
    of the last hidden state are averaged, counting only positions the
    tokenizer marked as real tokens in ``attention_mask``. For a single
    string nothing is padded, so this equals a plain mean over all tokens;
    for a list of strings it keeps padding from skewing the shorter texts.

    Args:
        text: The string to embed. A list of strings is also accepted by the
            tokenizer, in which case one row per string is returned.

    Returns:
        A CPU tensor holding the pooled embedding. The leading batch
        dimension is squeezed away when a single text is given.
    """
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=128)
    with torch.no_grad():
        outputs = model(**inputs)
    token_embeddings = outputs.last_hidden_state
    # Broadcast the mask over the hidden dimension so padded positions
    # contribute zero to the sum.
    mask = inputs["attention_mask"].unsqueeze(-1).to(token_embeddings.dtype)
    summed = (token_embeddings * mask).sum(dim=1)
    # clamp guards against division by zero for an all-padding row.
    token_counts = mask.sum(dim=1).clamp(min=1)
    return (summed / token_counts).squeeze(0).cpu()

# Rank the candidates against the definition and look up the best match
if not items:
    print("No valid items with descriptions found.")
else:
    # Embed the definition once and every candidate description
    definition_embedding = get_embedding(definition)
    description_embeddings = torch.stack(
        [get_embedding(candidate["description"]) for candidate in items]
    )

    # unsqueeze(0) turns the definition vector into a one-row matrix, so the
    # similarity call compares that row against one row per candidate
    query_matrix = definition_embedding.unsqueeze(0).numpy()
    candidate_matrix = description_embeddings.numpy()
    cosine_scores = cosine_similarity(query_matrix, candidate_matrix).flatten()

    # Store each candidate's score, then order the list best-first
    for candidate, score in zip(items, cosine_scores):
        candidate["score"] = score
    items.sort(key=lambda candidate: candidate["score"], reverse=True)

    # The last path segment of the entity URI is passed on as the QID
    parts = items[0]["uri"].split("/")
    data = fetch_coordinates(parts[-1])

    # Prints the (lat, lon) tuple, or None when no coordinates were found
    print(data)


(40.662222222222, -74.209166666667)
