In [1]:
from typing import List, Dict
import json
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from utils import *
from llm import OpenRouterLLM

  from .autonotebook import tqdm as notebook_tqdm


### Tạo chunk

In [2]:
import re
import json
from pathlib import Path
from nltk.tokenize import sent_tokenize, word_tokenize
import nltk

# Tải bộ tokenizer nếu chưa có
nltk.download("punkt", download_dir="nltk_data")
nltk.download("punkt_tab", download_dir="nltk_data") 

nltk.data.path.append("nltk_data")
# Hàm chia đoạn dài thành các chunk khoảng 150–300 từ
def split_passage_to_chunks(title, passage, min_words=150, max_words=300):
    sentences = sent_tokenize(passage)
    chunks = []
    current_chunk = []
    current_word_count = 0

    for sentence in sentences:
        words = word_tokenize(sentence)
        current_chunk.append(sentence)
        current_word_count += len(words)

        if current_word_count >= min_words:
            chunks.append({
                "title": title.strip(),
                "text": " ".join(current_chunk).strip(),
                "embedding": None
            })
            current_chunk = []
            current_word_count = 0

    # Nếu còn sót lại
    if current_chunk:
        chunks.append({
            "title": title.strip(),
            "text": " ".join(current_chunk).strip(),
            "embedding": None
        })

    return chunks

# Đọc file .txt đầu vào
def parse_corpus(file_path):
    with open(file_path, "r", encoding="utf-8") as f:
        text = f.read()

    # Tách theo Title:... và Passage:...
    raw_entries = re.findall(r"Title:(.*?)\nPassage:(.*?)(?=Title:|$)", text, re.DOTALL)

    all_chunks = []
    for title, passage in raw_entries:
        chunks = split_passage_to_chunks(title, passage)
        all_chunks.extend(chunks)

    return all_chunks

# Ghi ra file JSON
def save_to_json(data, output_path):
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(data, f, ensure_ascii=False, indent=2)


[nltk_data] Downloading package punkt to nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [3]:
input_file = "VDT2025_Multihop_RAG_dataset/multihoprag_corpus.txt"  
output_file = "VDT2025_Multihop_RAG_dataset/chunks.json"

chunks = parse_corpus(input_file)
save_to_json(chunks, output_file)

print(f"Đã tạo {len(chunks)} chunk và lưu vào {output_file}")

Đã tạo 7713 chunk và lưu vào VDT2025_Multihop RAG dataset/chunks.json


### Embedding

In [4]:
from sentence_transformers import SentenceTransformer
import json
from tqdm import tqdm
import torch
# Load chunks
with open("VDT2025_Multihop RAG dataset/chunks.json", "r", encoding="utf-8") as f:
    chunks = json.load(f)

# Load embedding model
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

model = SentenceTransformer("BAAI/bge-m3", device=device)

# Encode and update embeddings
for chunk in tqdm(chunks, desc="Encoding chunks"):
    embedding = model.encode(chunk["text"], normalize_embeddings=True)
    chunk["embedding"] = embedding.tolist()

# Save as knowledge base
output_file = "VDT2025_Multihop RAG dataset/knowledge_base.json"
with open(output_file, "w", encoding="utf-8") as f:
    json.dump(chunks, f, ensure_ascii=False, indent=2)

Using device: cuda


Encoding chunks: 100%|██████████| 7713/7713 [14:24<00:00,  8.92it/s]


### Tạo questions

In [5]:
with open("VDT2025_Multihop RAG dataset/MultiHopRAG.json", "r", encoding="utf-8") as f:
    test_set = json.load(f)

# Chuẩn hóa cho IRCoT
questions = []
for q in test_set:
    questions.append({
        "question": q["query"],
        "answer": q["answer"],
        "evidence": [e["title"] for e in q["evidence_list"]]
    })

### IRCOT

In [6]:
class IRCoTSystem:
    def __init__(self, knowledge_base, prompt_template, my_llm, embedding_model="BAAI/bge-m3"):
        self.knowledge_base = knowledge_base
        self.llm = my_llm
        self.embedding_model = SentenceTransformer(embedding_model)
        self.prompt_template = prompt_template

    def retrieve_initial(self, question, k=3):
        question_embedding = self.embedding_model.encode([question])[0]
        doc_embeddings = np.array([doc["embedding"] for doc in self.knowledge_base])
        sims = cosine_similarity([question_embedding], doc_embeddings)[0]
        top_ids = sims.argsort()[-k:][::-1]
        return [self.knowledge_base[i] for i in top_ids]

    def retrieve_with_cot(self, cot_sentence, k=1):
        cot_embedding = self.embedding_model.encode([cot_sentence])[0]
        doc_embeddings = np.array([doc["embedding"] for doc in self.knowledge_base])
        sims = cosine_similarity([cot_embedding], doc_embeddings)[0]
        top_ids = sims.argsort()[-k:][::-1]
        return [self.knowledge_base[i] for i in top_ids]

    def generate_next_cot(self, question, paragraphs, cot_so_far):
        formatted = ""
        for p in paragraphs:
            formatted += f"Title: {p['title']}\n{p['text']}\n\n"
        cot_text = "\n".join(cot_so_far)
        full_prompt = f"{formatted}Q: {question}\nA: {cot_text}\n"
        return self.llm.generate_response(self.prompt_template + full_prompt)

    def eval_evidence(self, output, target):
        f1 = cal_f1_score(output, target)
        acc = cal_accuracy_score(output, target)
        return f"F1: {f1:.2f}, Acc: {acc:.2f}"

    def answer_question(self, question_set, max_iterations=5):
        q = question_set["question"]
        gt = question_set["evidence"]
        paragraphs = self.retrieve_initial(q)
        cot_sentences = []

        for _ in range(max_iterations):
            next_cot = self.generate_next_cot(q, paragraphs, cot_sentences)
            cot_sentences.append(next_cot)
            if "answer is:" in next_cot.lower():
                break
            new_paras = self.retrieve_with_cot(next_cot)
            paragraphs.extend(new_paras)
            paragraphs = remove_duplicates(paragraphs)

        titles = extract_titles(paragraphs)
        eval_score = self.eval_evidence(titles, gt)
        return "\n".join(cot_sentences), eval_score, titles, gt


### Test answer

In [7]:
template = """
    Please generate your chain-of-thought reasoning step by step with provided evidences.  \n
    Your response should not 
    Once you are provided enough evidence, conclude with a sentence beginning with ‘Answer is:’ to state the final answer. \n

    
    For example:
    Q: Jeremy Theobald and Christopher Nolan share what profession?
    A: Jeremy Theobald is an actor and producer. Christopher Nolan is a director, producer, and screenwriter. Therefore, they
    both share the profession of being a producer. So the answer is: producer.
    Q: What film directed by Brian Patrick Butler was inspired by a film directed by F.W. Murnau?
    A: Brian Patrick Butler directed the film The Phantom Hour. The Phantom Hour was inspired by the films such as Nosferatu
    and The Cabinet of Dr. Caligari. Of these Nosferatu was directed by F.W. Murnau. So the answer is: The Phantom Hour.
    Q: How many episodes were in the South Korean television series in which Ryu Hye−young played Bo−ra?
    A: The South Korean television series in which Ryu Hye−young played Bo−ra is Reply 1988. The number of episodes Reply
    1988 has is 20. So the answer is: 20 \n \n

    Evidence: \n
    """
knowledge_base = json.load(open("VDT2025_Multihop RAG dataset/knowledge_base.json", "r", encoding="utf-8"))
ircot = IRCoTSystem(knowledge_base=knowledge_base, my_llm=OpenRouterLLM("deepseek/deepseek-chat-v3-0324:free"), prompt_template=template)

f1_scores = []
for i, qset in enumerate(questions[:10]):  # chạy 10 câu hỏi đầu tiên
    print(f"--- Question {i+1} ---")
    print("Question:", qset["question"])
    print("Answer:", qset["answer"])
    answer, score, titles, gt = ircot.answer_question(qset)
    print("Generated Reasoning:\n", answer)
    print("Titles:", titles)
    print("Ground Truth:", gt)
    print("Evaluation:", score)
    print("\n")
    # Extract F1 score from evaluation string
    f1 = float(score.split(",")[0].split(":")[1])
    f1_scores.append(f1)

avg_f1 = sum(f1_scores) / len(f1_scores)
print(f"Average F1 Score: {avg_f1:.2f}")

--- Question 1 ---
Question: Who is the individual associated with the cryptocurrency industry facing a criminal trial on fraud and conspiracy charges, as reported by both The Verge and TechCrunch, and is accused by prosecutors of committing fraud for personal gain?
Answer: Sam Bankman-Fried
Generated Reasoning:
 To determine the individual associated with the cryptocurrency industry facing a criminal trial on fraud and conspiracy charges, let's analyze the evidence step by step:

1. The first piece of evidence mentions "Sam Bankman-Fried" and describes his fall from grace in the crypto industry, implicating him in a fraud trial where the government must prove intent.

2. The second piece of evidence explicitly states that "Sam Bankman-Fried, founder of former Coinbase rival FTX, was found guilty of seven criminal fraud counts tied to the collapse of his exchange and the theft of customer funds."

3. The third piece of evidence further discusses the FTX trial and mentions Bankman-Fried

### Evaluate intinal retrival

In [8]:
def evaluate_init_retriever(system: IRCoTSystem, questions, k=3):
    f1s, accs = [], []

    for q in questions:
        retrieved = system.retrieve_initial(q["question"], k=k)
        predicted_titles = [doc["title"] for doc in retrieved]
        gold_titles = q["evidence"]

        print(f"Q: {q['question']}")
        print(f"GT: {gold_titles}")
        print(f"Predicted: {predicted_titles}")
        
        eval_result = system.eval_evidence(predicted_titles, gold_titles)
        print(f"{eval_result}\n")

        # Trích xuất F1, Acc từ chuỗi kết quả
        f1 = float(eval_result.split(",")[0].split(":")[1])
        acc = float(eval_result.split(",")[1].split(":")[1])
        f1s.append(f1)
        accs.append(acc)

    macro_f1 = sum(f1s) / len(f1s)
    accuracy = sum(accs) / len(accs)

    print(f"Retrieval Evaluation Results:")
    print(f"- Macro F1: {macro_f1:.2f}")
    print(f"- Accuracy (Hit@{k}): {accuracy:.2f}")

    return macro_f1, accuracy

In [9]:
f1, acc = evaluate_init_retriever(ircot, questions, k=3)
print(f"F1: {f1:.2f}, Acc: {acc:.2f}")

Q: Who is the individual associated with the cryptocurrency industry facing a criminal trial on fraud and conspiracy charges, as reported by both The Verge and TechCrunch, and is accused by prosecutors of committing fraud for personal gain?
GT: ['The FTX trial is bigger than Sam Bankman-Fried', 'SBF’s trial starts soon, but how did he — and FTX — get here?', 'Sam Altman backs teens’ startup, Google unveils the Pixel 8 and TikTok tests an ad-free tier']
Predicted: ['The FTX trial is bigger than Sam Bankman-Fried', 'Coinbase rallies more than 60% in same month that FTX and Binance founders brace for prison', 'The FTX trial is bigger than Sam Bankman-Fried']
F1: 0.33, Acc: 0.33

Q: Which individual is implicated in both inflating the value of a Manhattan apartment to a figure not yet achieved in New York City's real estate history, according to 'Fortune', and is also accused of adjusting this apartment's valuation to compensate for a loss in another asset's worth, as reported by 'The Age'