In [1]:
import os
os.environ['HF_HOME'] = '/workspace/.cache/'

In [2]:
from tqdm import tqdm
import networkx as nx
from qa.generator import QAGenerator
from qa.templates import TEMPLATES
from llm.manager import LLMManager, LLMModel

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
model = LLMModel(model_id="cjvt/GaMS-27B-Instruct")
llm = LLMManager(model, track_history=False)
judge = LLMManager(model, track_history=False)

Loading checkpoint shards: 100%|██████████| 12/12 [00:00<00:00, 184.52it/s]
Device set to use cuda


In [4]:
graph = nx.read_graphml("../data/municipalities_peaks_castles.graphml")

In [5]:
generator = QAGenerator()
qas = generator.generate_questions(graph, TEMPLATES, num_questions=1000, add_distractors=10)

Generated 1000 questions after 1103 attempts.


In [None]:
def evaluate_answer(user, correctx):
	response = judge.ask(f"Odgovor uporabnika: {user}\nPravilen odgovor: {correct}\nAli odgovor uporabnika vsebuje pravilne informacije? (da/ne)", max_new_tokens=1)
	if response.lower() in ["yes", "da", "ja", "y", "j", "d"]:
		return True
	elif response.lower() in ["no", "ne", "n"]:
		return False
	else:
		print("Unknown response:", response)

In [None]:
answersNoContext = []
for qa in tqdm(qas):
	answer = llm.ask(f"Vprašanje: {qa.question}")

	response = judge.ask(f"Odgovor uporabnika: {answer}\nPravilen odgovor: {qa.answer}\nAli odgovor uporabnika vsebuje pravilne informacije? (da/ne)", max_new_tokens=1)
	correct = evaluate_answer(answer, qa.answer)
	answersNoContext.append((correct, qa.answer, answer))

answersWithContext = []
for qa in tqdm(qas):
	context = [str(fact) for fact in qa.context_facts]
	answer = llm.ask(f"Kontekst: {' '.join(context)}\nVprašanje: {qa.question}")

	response = judge.ask(f"Odgovor uporabnika: {answer}\nPravilen odgovor: {qa.answer}\nAli odgovor uporabnika vsebuje pravilne informacije? (da/ne)", max_new_tokens=1)
	correct = evaluate_answer(answer, qa.answer)
	answersWithContext.append((correct, qa.answer, answer))

  0%|          | 5/1000 [00:05<16:16,  1.02it/s]You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
100%|██████████| 1000/1000 [14:42<00:00,  1.13it/s]
100%|██████████| 1000/1000 [10:41<00:00,  1.56it/s]


In [8]:
accuracyNoContext = sum(1 for correct, _, _ in answersNoContext if correct) / len(answersNoContext)
print(f"Accuracy without context: {accuracyNoContext:.1%}")
accuracyWithContext = sum(1 for correct, _, _ in answersWithContext if correct) / len(answersWithContext)
print(f"Accuracy with context: {accuracyWithContext:.1%}")

Accuracy without context: 9.2%
Accuracy with context: 69.1%


In [9]:
for correct, expected, actual in answersWithContext:
	if not correct:
		print(f"Expected: {expected}, Actual: {actual}")

Expected: 36.9 km², Actual: 36.9.
Expected: 23.0 km², Actual: 23.0.
Expected: 97.8 km², Actual: 97.8.
Expected: Občina Miren - Kostanjevica, Actual: Grad Miren stoji v občini Miren - Kostanjevica.
Expected: Občina Šoštanj, Actual: Grad Žamberk stoji v občini Šoštanj.
Expected: spomenik lokalnega pomena, Actual: Grad Rekštanj spada pod status spomenik lokalnega pomena.
Expected: Občina Šoštanj, Actual: Grad Kacenštajn stoji v občini Šoštanj.
Expected: 93.9 km², Actual: 93.9.
Expected: 22.9 km², Actual: 22.9.
Expected: spomenik lokalnega pomena, Actual: Grad Žužemberk spada pod spomenik lokalnega pomena.
Expected: 66.8 km², Actual: 66.8.
Expected: 59.4 km², Actual: 59.4.
Expected: Občina Rače - Fram, Actual: Grad Rače stoji v občini Rače - Fram.
Expected: Občina Vitanje, Actual: Vitanje.
Expected: gorenjska statistična regija, Actual: Vrh Šija leži v gorenjski statistični regiji.
Expected: Občina Cerknica, Actual: Vrh Slivnica leži v občini Cerknica.
Expected: 1522.0 m, Actual: Košenjak 