In [54]:
import json
from langchain.docstore.document import Document
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.llms import HuggingFacePipeline
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
import re
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
import warnings
from SPARQLWrapper import SPARQLWrapper, JSON
import requests

warnings.filterwarnings("ignore")


In [49]:
# Load the QA dataset from the parent directory.
# The encoding is pinned to UTF-8 so that non-ASCII text in the file is decoded
# the same way on every platform instead of depending on the locale default.
with open("../world_leaders_qa_dataset.json", "r", encoding="utf-8") as f:
    kg_data = json.load(f)

print(f"loaded {len(kg_data)} QA items")


loaded 5 QA items


In [62]:
def get_wikidata_labels(qids, language='en'):
    """Resolve Wikidata entity IDs to human-readable labels.

    The IDs are sent to the ``wbgetentities`` endpoint in batches of 50.

    Args:
        qids: Sequence of entity ID strings (e.g. ``["Q38", "Q220"]``).
        language: Language code of the label to fetch.

    Returns:
        dict mapping every requested ID to its label in ``language``. An ID
        whose entity is absent from the response, or has no label in that
        language, maps to itself so callers always get a printable value.

    Raises:
        requests.HTTPError: if the API answers with an HTTP error status.
    """
    url = 'https://www.wikidata.org/w/api.php'
    labels = {}
    batch_size = 50  # Max per API call
    qids = list(qids)  # accept any iterable, not only sliceable sequences

    for start in range(0, len(qids), batch_size):
        batch = qids[start:start + batch_size]
        params = {
            'action': 'wbgetentities',
            'ids': '|'.join(batch),
            'format': 'json',
            'props': 'labels',
            'languages': language
        }
        # A timeout keeps a stalled connection from hanging the notebook cell.
        response = requests.get(url, params=params, timeout=30)
        response.raise_for_status()
        # An API-level error payload carries no "entities" key; treat it as
        # "nothing resolved" rather than failing with a KeyError.
        entities = response.json().get("entities") or {}
        for qid in batch:
            # Missing or redirected entities are not keyed by the requested
            # ID (or have no "labels"), so every lookup step is defensive.
            entity = entities.get(qid) or {}
            label = (entity.get("labels") or {}).get(language, {}).get("value")
            labels[qid] = label if label else qid
    return labels

def _is_qid(value):
    """Return True when ``value`` is a string shaped like a Wikidata item ID ("Q" + digits)."""
    return isinstance(value, str) and re.fullmatch(r"Q\d+", value) is not None


def extract_statements(item, map_qids=True, language='en'):
    """Render the triples of one QA item as plain-text statements.

    Each triple is a 5-element sequence; the first element is the subject,
    the fourth the relation and the fifth the object (the second and third
    elements are ignored). Every triple becomes one ``"<subject> <relation>
    <object>."`` line, and an optional ``context_hint`` is appended as a
    final ``"Hint: ..."`` line.

    Args:
        item: dict with an optional ``"triples"`` list and an optional
            ``"context_hint"`` entry.
        map_qids: When True, subjects/objects that are Wikidata item IDs are
            replaced by their labels via ``get_wikidata_labels``.
        language: Language code passed to the label lookup.

    Returns:
        The statements joined with newlines (empty string when there is
        nothing to render).
    """
    triples = item.get("triples", [])

    id_to_label = {}
    if map_qids:
        # Only genuine item IDs are looked up. A plain prefix check would also
        # send names such as "Quebec" to the API and would crash on
        # non-string values such as numbers.
        qids = {
            term
            for subj, _, _, _, obj in triples
            for term in (subj, obj)
            if _is_qid(term)
        }
        if qids:  # skip the network round trip when there is nothing to resolve
            id_to_label = get_wikidata_labels(sorted(qids), language)

    def _display(term):
        # Only strings can be keys of the label map; anything else is rendered as-is.
        return id_to_label.get(term, term) if isinstance(term, str) else term

    statements = [
        f"{_display(subj)} {rel} {_display(obj)}."
        for subj, _, _, rel, obj in triples
    ]

    if "context_hint" in item:
        statements.append(f"Hint: {item['context_hint']}")

    return "\n".join(statements)

def _to_document(item):
    """Wrap one QA item as a Document: rendered statements as text, question/answer as metadata."""
    return Document(
        page_content=extract_statements(item),
        metadata={
            "question": item.get("question", ""),
            "answer": item.get("answer", ""),
        },
    )


# One Document per QA item, in dataset order.
documents = [_to_document(entry) for entry in kg_data]
print(f"Prepared {len(documents)} documents.")


Prepared 5 documents.


In [63]:
# Sentence-embedding checkpoint used for semantic encoding.
embedding_model = "all-MiniLM-L6-v2"
embedder = HuggingFaceEmbeddings(model_name=embedding_model)

# Index the prepared documents in a FAISS vector store built with that embedder.
vectorstore = FAISS.from_documents(documents=documents, embedding=embedder)

print("vectorstore created")


vectorstore created


In [64]:
# Other checkpoints previously listed as alternatives for `model_id`:
#   phi       -> "microsoft/phi-1_5"
#   deepseek  -> "deepseek-ai/deepseek-coder-1.3b-instruct"
#   tinyllama -> "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T"

model_id = "deepseek-ai/deepseek-coder-1.3b-instruct"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id)

generator = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=32,       # short answers
    # Deterministic output is requested by turning sampling off (greedy
    # decoding). A temperature of 0.0 is not a valid sampling temperature and
    # is ignored when sampling is disabled, so it is not passed.
    do_sample=False,
    return_full_text=False,  # return only the generated continuation
)

# Expose the transformers pipeline through LangChain's LLM interface.
llm = HuggingFacePipeline(pipeline=generator)

Device set to use cpu


In [65]:
# Prompt text with two placeholders, {context} and {question}. The trailing
# "Answer:" leaves the completion slot for the model, and the instructions ask
# it to stop after answering the single question.
template = """
You are given the following knowledge facts:

{context}

Based on this information, answer the following question concisely.
Only answer the current question. Do not continue or generate other questions.

Question: {question}
Answer:"""

# Wrap the text in a PromptTemplate, declaring the two placeholders it expects.
prompt = PromptTemplate(
    input_variables=["context", "question"],
    template=template
)


In [66]:
# Retriever limited to k=1, i.e. only the single most relevant context is used.
top1_retriever = vectorstore.as_retriever(search_kwargs={"k": 1})

# "stuff" chain using the custom prompt; the retrieved source documents are
# returned alongside the answer so they can be inspected.
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=top1_retriever,
    chain_type_kwargs={"prompt": prompt},
    return_source_documents=True,
)

In [67]:
# Example: run one question through the retrieval QA chain and show the
# retrieved context, the first line of the model output and the expected answer.
query = "Who is the head of state of Italy?"
ground_truth = "Sergio Mattarella"

result = qa_chain({"query": query})
raw_answer = result["result"].strip()
# Keep only the first line of the generation.
short_answer = raw_answer.partition("\n")[0]

print("-- QUESTION --", query, sep="\n")

print("\n-- RETRIEVED CONTEXTS --")
for doc in result["source_documents"]:
    print("\n-- Context --")
    # Cap the preview at 800 characters per document.
    print(doc.page_content.strip()[:800])

print("\n-- MODEL ANSWER --", short_answer, sep="\n")

print("\n-- GROUND TRUTH --", ground_truth, sep="\n")


-- QUESTION --
Who is the head of state of Italy?

-- RETRIEVED CONTEXTS --

-- Context --
Sergio Mattarella image http://commons.wikimedia.org/wiki/Special:FilePath/Sergio%20Mattarella%20Presidente%20della%20Repubblica%20Italiana.jpg.
Sergio Mattarella place of birth Palermo.
Sergio Mattarella sex or gender male.
Sergio Mattarella father Bernardo Mattarella.
Sergio Mattarella mother Maria Buccellato.
Sergio Mattarella spouse Marisa Chiazzese.
Sergio Mattarella country of citizenship Italy.
Sergio Mattarella instance of human.
Sergio Mattarella position held President of Italy.
Sergio Mattarella child Bernardo Giorgio Mattarella.
Sergio Mattarella child Laura Mattarella.
Sergio Mattarella educated at Sapienza University of Rome.
Sergio Mattarella educated at University of Palermo.
Sergio Mattarella field of work politics.
Sergio Mattarella field of work law.
Sergio Mattarella me

-- MODEL ANSWER --
Sergio Mattarella, as he is the head of state in Italy.

-- GROUND TRUTH --
Sergio Matta

In [None]:
# Example input, context encoded, wrong answer: same question as before, kept
# for its recorded output.
query = "Who is the head of state of Italy?"
ground_truth = "Sergio Mattarella"

result = qa_chain({"query": query})
raw_answer = result["result"].strip()
first_line, _, _ = raw_answer.partition("\n")
short_answer = first_line

# Assemble the whole report first, then emit it in one go.
report = ["-- QUESTION --", query, "\n-- RETRIEVED CONTEXTS --"]
for doc in result["source_documents"]:
    report.append("\n-- Context --")
    report.append(doc.page_content.strip()[:800])  # preview capped at 800 chars
report.extend(["\n-- MODEL ANSWER --", short_answer])
report.extend(["\n-- GROUND TRUTH --", ground_truth])

print("\n".join(report))


-- QUESTION --
Who is the head of state of Italy?

-- RETRIEVED CONTEXTS --

-- Context --
Q3956186 image http://commons.wikimedia.org/wiki/Special:FilePath/Sergio%20Mattarella%20Presidente%20della%20Repubblica%20Italiana.jpg.
Q3956186 place of birth Palermo.
Q3956186 sex or gender male.
Q3956186 father Bernardo Mattarella.
Q3956186 mother Maria Buccellato.
Q3956186 spouse Marisa Chiazzese.
Q3956186 country of citizenship Italy.
Q3956186 instance of human.
Q3956186 position held President of Italy.
Q3956186 child Bernardo Giorgio Mattarella.
Q3956186 child Laura Mattarella.
Q3956186 educated at Sapienza University of Rome.
Q3956186 educated at University of Palermo.
Q3956186 field of work politics.
Q3956186 field of work law.
Q3956186 member of political party Democratic Party.
Q3956186 native language Italian.
Q3956186 occupation judge.
Q3956186 occupation lawyer.
Q3956186 occu

-- MODEL ANSWER --
Bernardo Mattarella, as he is the head of state of Italy.

-- GROUND TRUTH --
Sergio Mat

In [14]:
# Baseline: send the bare question straight to the LLM, without the prompt
# template or any retrieved context.
raw_query = "Who is the head of state of Italy?"
direct_response = llm(raw_query)

print("-- QUESTION --", raw_query, sep="\n")

print("\n-- MODEL RESPONSE WITHOUT CONTEXT --", direct_response.strip(), sep="\n")


-- QUESTION --
Who is the head of state of Italy?

-- MODEL RESPONSE WITHOUT CONTEXT --
SELECT Head_of_State FROM table WHERE Country = 'Italy';

-- Answer: The head of state of Italy is Carlo
