In [2]:
import os
import json

from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_community.llms import Ollama
from langchain.chains import RetrievalQA
from langchain_core.documents import Document
from groq import Groq

In [3]:
os.makedirs("data", exist_ok=True)

In [4]:

with open("data/simplified_squad_300.json", "r", encoding="utf-8") as f:
    data = json.load(f)

documents = [Document(page_content=d["context"]) for d in data]
embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
vectorstore = FAISS.from_documents(documents, embedding_model)
retriever = vectorstore.as_retriever(search_kwargs={"k": 10})

llm = Ollama(model="phi3")

rag_chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retriever,
    return_source_documents=False
)

  embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
  llm = Ollama(model="phi3")


In [5]:
# Baseline + Rag answers

baseline_results = []
for sample in data[:20]:
    question = sample["question"]
    gt_answer = sample["answer"]
    response = llm.invoke(question)
    baseline_results.append({
        "question": question,
        "ground_truth": gt_answer,
        "generated": response
    })

with open("data/baseline_answers.json", "w", encoding="utf-8") as f:
    json.dump(baseline_results, f, ensure_ascii=False, indent=2)
print("✅ baseline_answers.json готово")

rag_results = []
for sample in data[:20]:
    question = sample["question"]
    gt_answer = sample["answer"]
    result = rag_chain.run(question)
    rag_results.append({
        "question": question,
        "ground_truth": gt_answer,
        "generated": result
    })

with open("data/rag_answers.json", "w", encoding="utf-8") as f:
    json.dump(rag_results, f, ensure_ascii=False, indent=2)
print("✅ rag_answers.json готово")

✅ baseline_answers.json готово


  result = rag_chain.run(question)


✅ rag_answers.json готово


In [6]:
from lab3.SECRET import API
client = Groq(api_key=API)

In [7]:
# Judge

JUDGE_PROMPT_TEMPLATE = """
You are an impartial judge evaluating answer correctness.

QUESTION: {question}
GROUND TRUTH: {ground_truth}
CANDIDATE ANSWER: {generated}

On a scale from 1 to 5, where:
1 = completely wrong
2 = mostly wrong
3 = partially correct
4 = mostly correct
5 = perfect answer

Respond with ONLY the score.
"""

def evaluate_answer(question, ground_truth, generated):
    prompt = JUDGE_PROMPT_TEMPLATE.format(
        question=question,
        ground_truth=ground_truth,
        generated=generated
    )
    response = client.chat.completions.create(
        model="llama3-70b-8192",  # або будь-яка інша доступна модель
        messages=[{"role": "user", "content": prompt}]
    )
    return response.choices[0].message.content.strip()

In [8]:
# Evaluate

def evaluate_file(input_file, output_file):
    with open(f"data/{input_file}", "r", encoding="utf-8") as f:
        data = json.load(f)

    results = []
    for item in data:
        score = evaluate_answer(item["question"], item["ground_truth"], item["generated"])
        item["judge_score"] = score
        results.append(item)

    with open(f"data/{output_file}", "w", encoding="utf-8") as f:
        json.dump(results, f, ensure_ascii=False, indent=2)

evaluate_file("baseline_answers.json", "baseline_scored.json")
print("✅ baseline_scored.json оцінено")

evaluate_file("rag_answers.json", "rag_scored.json")
print("✅ rag_scored.json оцінено")

✅ baseline_scored.json оцінено
✅ rag_scored.json оцінено


In [9]:
# RECALL

test_samples = data[:100]
k_values = [10, 15, 20, 25, 30]
recall_scores = {k: 0 for k in k_values}

for sample in test_samples:
    question = sample["question"]
    ground_truth = sample["answer"].strip().lower()

    for k in k_values:
        retriever = vectorstore.as_retriever(search_kwargs={"k": k})
        retrieved_docs = retriever.invoke(question)
        hit = any(ground_truth in doc.page_content.lower() for doc in retrieved_docs)
        if hit:
            recall_scores[k] += 1

total = len(test_samples)
for k in k_values:
    recall_at_k = recall_scores[k] / total
    print(f"Recall@{k}: {recall_at_k:.2f}")

Recall@10: 1.00
Recall@15: 1.00
Recall@20: 1.00
Recall@25: 1.00
Recall@30: 1.00


In [10]:
def load_scores(filepath):
    with open(filepath, "r", encoding="utf-8") as f:
        data = json.load(f)
    scores = [int(item["judge_score"]) for item in data if str(item["judge_score"]).isdigit()]
    avg_score = sum(scores) / len(scores)
    return avg_score

rag_avg = load_scores("data/rag_scored.json")
baseline_avg = load_scores("data/baseline_scored.json")

print(f"Середня оцінка baseline: {baseline_avg:.2f}")
print(f"Середня оцінка RAG: {rag_avg:.2f}")

Середня оцінка baseline: 2.10
Середня оцінка RAG: 4.95
