In [None]:
!pip install llama-cpp-python gradio langchain chromadb pypdf PyMuPDF sentence-transformers huggingface_hub


Collecting PyMuPDF
  Downloading pymupdf-1.25.3-cp39-abi3-win_amd64.whl.metadata (3.4 kB)
Downloading pymupdf-1.25.3-cp39-abi3-win_amd64.whl (16.5 MB)
   ---------------------------------------- 0.0/16.5 MB ? eta -:--:--
   - -------------------------------------- 0.8/16.5 MB 5.6 MB/s eta 0:00:03
   ------ --------------------------------- 2.6/16.5 MB 8.4 MB/s eta 0:00:02
   ----------- ---------------------------- 4.7/16.5 MB 9.2 MB/s eta 0:00:02
   ---------------- ----------------------- 6.8/16.5 MB 9.1 MB/s eta 0:00:02
   -------------------- ------------------- 8.7/16.5 MB 9.3 MB/s eta 0:00:01
   ------------------------ --------------- 10.0/16.5 MB 8.9 MB/s eta 0:00:01
   ----------------------------- ---------- 12.1/16.5 MB 8.8 MB/s eta 0:00:01
   --------------------------------- ------ 13.9/16.5 MB 9.0 MB/s eta 0:00:01
   ---------------------------------------  16.3/16.5 MB 9.3 MB/s eta 0:00:01
   ---------------------------------------- 16.5/16.5 MB 8.9 MB/s eta 0:00:00
Inst

# RAG with GUI to upload your doc and answer questions

In [2]:
import gradio as gr
from llama_cpp import Llama
from langchain.document_loaders import PyMuPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma
import logging

# Setup Logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)

# Load LLM model (Adjust path accordingly)
llm = Llama(
    model_path="D:\\Competitions\\39\\Q3\\Qwen_3B_GRPO_Enabled.gguf",
    n_ctx=4096,
    n_gpu_layers=-1
)

# Embedding Model
embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2"
)

# Initialize ChromaDB Instances (PDF & History)
pdf_db = Chroma(
    collection_name="pdf_context",
    embedding_function=embeddings,
    persist_directory="./pdf_db"
)
history_db = Chroma(
    collection_name="chat_history",
    embedding_function=embeddings,
    persist_directory="./history_db"
)

## PDF Processing Function (Avoid Duplicate Inserts)
def process_pdf(pdf_file):
    try:
        loader = PyMuPDFLoader(pdf_file.name)
        documents = loader.load()
        splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=50)
        texts = splitter.split_documents(documents)

        existing_docs = pdf_db.get()
        existing_texts = set(existing_docs['documents'])

        new_docs = [doc for doc in texts if doc.page_content not in existing_texts]

        if new_docs:
            pdf_db.add_documents(new_docs)
            pdf_db.persist()
            logging.info(f"Inserted {len(new_docs)} new documents.")
            return "✅ PDF processed successfully!"
        else:
            logging.info("PDF already processed. No new data inserted.")
            return "⚠️ PDF already processed. No new data inserted."

    except Exception as e:
        logging.error(f"Error processing PDF: {e}")
        return f"❌ Error processing PDF: {str(e)}"

# Corrected get_context function
def get_context(question):
    try:
        results = pdf_db.similarity_search(question, k=3)
        
        # Check type of results (Document or str) and handle accordingly
        context_list = []
        for res in results:
            if hasattr(res, 'page_content'):
                context_list.append(res.page_content)
            else:
                context_list.append(res)

        combined_context = "\n".join(context_list)
        logging.info(f"Retrieved context: {context_list}")
        
        return combined_context if context_list else ""
    except Exception as e:
        logging.error(f"Error retrieving context: {e}")
        return ""

# Improved answer_question function with robust error handling
def answer_question(question, chat_history):
    logging.info(f"Received question: {question}")
    context = get_context(question)

    if not context:
        error_msg = "⚠️ No relevant context found."
        logging.warning(error_msg)
        chat_history.append((question, error_msg))
        return chat_history

    SYSTEM_PROMPT = """
Respond in the following format:
<reasoning>
...
</reasoning>
<answer>
...
</answer>
"""

    prompt = f"""<|system|>
{SYSTEM_PROMPT}

Context:
{context}

<|user|>
{question}
<|assistant|>
"""

    try:
        output = llm(
            prompt,
            max_tokens=1024,
            temperature=0.7,
            top_p=0.95,
            stop=["<|user|>", "<|system|>", "<|assistant|>"]
        )

        # Extract the full response text
        response_text = output['choices'][0]['text'].strip()
        print("Response ",response_text)

        # Parse reasoning and answer from the response
        reasoning_start = response_text.find("<reasoning>")
        reasoning_end = response_text.find("</reasoning>")
        answer_start = response_text.find("<answer>")
        answer_end = response_text.find("</answer>")

        reasoning = ""
        answer = ""

        if reasoning_start != -1 and reasoning_end != -1:
            reasoning = response_text[reasoning_start + len("<reasoning>"):reasoning_end].strip()

        if answer_start != -1 and answer_end != -1:
            answer = response_text[answer_start + len("<answer>"):answer_end].strip()

        # Combine reasoning and answer for display
        formatted_response = f"**Reasoning:**\n{reasoning}\n\n**Answer:**\n{answer}"

        # Save interaction to history DB (avoid duplicates)
        interaction_record = f"User: {question}\nAssistant: {formatted_response}"
        
        existing_history_docs = history_db.get()
        existing_history_texts = set(existing_history_docs['documents'])

        if interaction_record not in existing_history_texts:
            history_db.add_texts([interaction_record])
            history_db.persist()
            logging.info("Interaction saved to history.")

        # Append to chat history
        chat_history.append((question, formatted_response))
        
    except Exception as e:
        error_msg = f"❌ Error generating response: {str(e)}"
        logging.error(error_msg)
        chat_history.append((question, error_msg))

    return chat_history


# Gradio UI (Continuous Chat Session with loading indicators)
with gr.Blocks() as demo:
    
    gr.Markdown("# 📄 PDF-based RAG Chatbot")

    with gr.Row():
        pdf_input = gr.File(label="Upload PDF Document 📁")
    
    upload_status = gr.Label("")
    upload_btn = gr.Button("Upload & Process PDF 🚀")

    upload_btn.click(
        fn=process_pdf,
        inputs=[pdf_input],
        outputs=[upload_status]
    )

    chatbot_ui = gr.Chatbot(label="Chat History 💬")
    
    question_input = gr.Textbox(label="Enter your question ❓")
    
    ask_btn = gr.Button("Get Answer ✨")

    ask_btn.click(
        fn=answer_question,
        inputs=[question_input, chatbot_ui],
        outputs=[chatbot_ui]
    )

demo.queue().launch()


llama_model_loader: loaded meta data with 26 key-value pairs and 434 tensors from D:\Competitions\39\Q3\Qwen_3B_GRPO_Enabled.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = qwen2
llama_model_loader: - kv   1:                               general.type str              = model
llama_model_loader: - kv   2:                               general.name str              = Qwen2.5 3b Instruct Unsloth Bnb 4bit
llama_model_loader: - kv   3:                       general.organization str              = Unsloth
llama_model_loader: - kv   4:                           general.finetune str              = instruct-unsloth-bnb-4bit
llama_model_loader: - kv   5:                           general.basename str              = qwen2.5
llama_model_loader: - kv   6:                         general.size_label str              = 3B
llama_m

* Running on local URL:  http://127.0.0.1:7860


2025-03-09 23:40:49,834 - INFO - HTTP Request: GET http://127.0.0.1:7860/gradio_api/startup-events "HTTP/1.1 200 OK"
2025-03-09 23:40:49,868 - INFO - HTTP Request: HEAD http://127.0.0.1:7860/ "HTTP/1.1 200 OK"



To create a public link, set `share=True` in `launch()`.




2025-03-09 23:40:50,371 - INFO - HTTP Request: GET https://api.gradio.app/pkg-version "HTTP/1.1 200 OK"


# Evaluator


In [None]:
import gradio as gr
from llama_cpp import Llama
from langchain.text_splitter import RecursiveCharacterTextSplitter
from transformers import AutoTokenizer, AutoModel
import torch
import logging
import fitz  # PyMuPDF for PDF text extraction

# Setup Logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Load LLM model (Adjust path accordingly)
llm = Llama(
    model_path="D:\\Competitions\\39\\gguf\\500_data_new.gguf",
    n_ctx=4096,
    n_gpu_layers=-1
)

# Load ColBERT model and tokenizer
colbert_tokenizer = AutoTokenizer.from_pretrained("colbert-ir/colbertv2.0")
colbert_model = AutoModel.from_pretrained("colbert-ir/colbertv2.0")

# Device configuration for ColBERT
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
colbert_model = colbert_model.to(device)

# Initialize storage for embeddings and text chunks
file_embeddings = []
file_text_chunks = []

## File Processing Function (Supports .pdf, .txt, .md)
def process_file(uploaded_file):
    try:
        file_extension = uploaded_file.name.split('.')[-1].lower()

        # Extract text based on file type
        if file_extension == 'pdf':
            # Process PDF files using PyMuPDF
            doc = fitz.open(uploaded_file.name)
            extracted_text = ""
            for page_num in range(len(doc)):
                page = doc[page_num]
                extracted_text += page.get_text()
        elif file_extension in ['txt', 'md']:
            # Process plain text or markdown files
            with open(uploaded_file.name, 'r', encoding='utf-8') as f:
                extracted_text = f.read()
        else:
            return "❌ Unsupported file type. Please upload a .pdf, .txt, or .md file."

        # Clean non-UTF-8 characters from extracted text
        cleaned_text = extracted_text.encode("utf-8", "ignore").decode("utf-8", "ignore")

        # Split cleaned text into chunks for indexing
        splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=50)
        texts = splitter.split_text(cleaned_text)

        # Index chunks into ColBERT or another vector database
        global file_embeddings, file_text_chunks
        file_embeddings.clear()
        file_text_chunks.clear()

        for chunk in texts:
            inputs = colbert_tokenizer(chunk, return_tensors="pt", truncation=True, max_length=512).to(device)
            with torch.no_grad():
                embedding = colbert_model(**inputs).last_hidden_state.mean(dim=1)  # Average pooling
                file_embeddings.append(embedding.cpu())
                file_text_chunks.append(chunk)

        logging.info(f"Processed and indexed {len(texts)} chunks from the uploaded {file_extension.upper()} file.")
        print("PDF Processed Successfully")
        return f"✅ {file_extension.upper()} file processed successfully!"

    except Exception as e:
        logging.error(f"Error processing file: {e}")
        return f"❌ Error processing file: {str(e)}"


## Retrieve Context Using ColBERT
def get_context(question):
    try:
        # Encode the query using ColBERT
        inputs = colbert_tokenizer(question, return_tensors="pt", truncation=True, max_length=512).to(device)
        with torch.no_grad():
            query_embedding = colbert_model(**inputs).last_hidden_state.mean(dim=1).cpu()

        # Compute similarity scores between query and stored embeddings
        scores = [torch.cosine_similarity(query_embedding, emb, dim=1).item() for emb in file_embeddings]

        # Get top-3 most relevant chunks based on similarity scores
        top_indices = sorted(range(len(scores)), key=lambda i: scores[i], reverse=True)[:3]
        top_chunks = [file_text_chunks[i] for i in top_indices]

        logging.info(f"Retrieved top-3 relevant chunks: {top_chunks}")
        
        return "\n".join(top_chunks) if top_chunks else "⚠️ No relevant context found."

    except Exception as e:
        logging.error(f"Error retrieving context: {e}")
        return "⚠️ Error retrieving context."


def answer_question(question, chat_history):
    logging.info(f"Received question: {question}")
    context = get_context(question)

    if not context or context == "⚠️ No relevant context found.":
        error_msg = "⚠️ No relevant context found."
        logging.warning(error_msg)
        chat_history.append((question, error_msg))
        return chat_history

    SYSTEM_PROMPT = """
Respond in the following format:
<reasoning>
...
</reasoning>
<answer>
...
</answer>
"""

    prompt = f"""<|system|>
{SYSTEM_PROMPT}

Context:
{context}

<|user|>
{question}
<|assistant|>
"""

    try:
        output = llm(
            prompt,
            max_tokens=1024,
            temperature=0.7,
            top_p=0.95,
            stop=["<|user|>", "<|system|>", "<|assistant|>"]
        )

        # Extract the full response text
        response_text = output['choices'][0]['text'].strip()
        print("Response:", response_text)

        # Parse reasoning and answer from the response
        reasoning_start = response_text.find("<reasoning>")
        reasoning_end = response_text.find("</reasoning>")
        answer_start = response_text.find("<answer>")
        answer_end = response_text.find("</answer>")

        reasoning = ""
        answer = ""

        if reasoning_start != -1 and reasoning_end != -1:
            reasoning = response_text[reasoning_start + len("<reasoning>"):reasoning_end].strip()

            # Check if <answer> tags exist properly
            if answer_start != -1 and answer_end != -1:
                answer = response_text[answer_start + len("<answer>"):answer_end].strip()
            else:
                # ELSE condition: <answer> tags not found correctly
                # Take everything after </reasoning>, remove any leftover tags
                remaining_text = response_text[reasoning_end + len("</reasoning>"):].strip()
                # Clean up any accidental tags
                remaining_text = remaining_text.replace("<answer>", "").replace("</answer>", "").strip()
                answer = remaining_text

        else:
            # If reasoning tags are missing entirely, treat whole text as answer
            reasoning = "Reasoning not explicitly provided."
            answer = response_text.replace("<answer>", "").replace("</answer>", "").strip()

        # Combine reasoning and answer for display
        formatted_response = f"**Reasoning:**\n{reasoning}\n\n**Answer:**\n{answer}"

        # Append to chat history
        chat_history.append((question, formatted_response))

    except Exception as e:
        error_msg = f"❌ Error generating response: {str(e)}"
        logging.error(error_msg)
        chat_history.append((question, error_msg))

    return chat_history



# Gradio UI (Continuous Chat Session with loading indicators)
with gr.Blocks() as demo:
    
    gr.Markdown("# 📄 Enhanced RAG Chatbot with Multi-format Support")

    with gr.Row():
        file_input = gr.File(label="Upload Document (.pdf/.txt/.md) 📁")
    
    upload_status = gr.Label("")
    upload_btn = gr.Button("Upload & Process File 🚀")

    upload_btn.click(
        fn=process_file,
        inputs=[file_input],
        outputs=[upload_status]
    )

    chatbot_ui = gr.Chatbot(label="Chat History 💬")
    
    question_input = gr.Textbox(label="Enter your question ❓")
    
    ask_btn = gr.Button("Get Answer ✨")

    ask_btn.click(
        fn=answer_question,
        inputs=[question_input, chatbot_ui],
        outputs=[chatbot_ui]
    )

demo.queue().launch()


llama_model_loader: loaded meta data with 26 key-value pairs and 434 tensors from D:\Competitions\39\gguf\500_data_new.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = qwen2
llama_model_loader: - kv   1:                               general.type str              = model
llama_model_loader: - kv   2:                               general.name str              = Qwen2.5 3b Instruct Unsloth Bnb 4bit
llama_model_loader: - kv   3:                       general.organization str              = Unsloth
llama_model_loader: - kv   4:                           general.finetune str              = instruct-unsloth-bnb-4bit
llama_model_loader: - kv   5:                           general.basename str              = qwen2.5
llama_model_loader: - kv   6:                         general.size_label str              = 3B
llama_model_l

* Running on local URL:  http://127.0.0.1:7876


2025-03-09 19:09:06,875 - INFO - HTTP Request: GET http://127.0.0.1:7876/gradio_api/startup-events "HTTP/1.1 200 OK"
2025-03-09 19:09:06,895 - INFO - HTTP Request: HEAD http://127.0.0.1:7876/ "HTTP/1.1 200 OK"



To create a public link, set `share=True` in `launch()`.




2025-03-09 19:09:07,408 - INFO - HTTP Request: GET https://api.gradio.app/pkg-version "HTTP/1.1 200 OK"
2025-03-09 19:09:38,356 - INFO - Processed and indexed 30 chunks from the uploaded PDF file.
2025-03-09 19:09:40,881 - INFO - Received question: What is the key difference between DeepSeek-R1-Zero and DeepSeek-R1?
2025-03-09 19:09:40,908 - INFO - Retrieved top-3 relevant chunks: ['DeepSeek-R1: Incentivizing Reasoning Capability in LLMs via\nReinforcement Learning\nDeepSeek-AI\nresearch@deepseek.com\nAbstract\nWe introduce our first-generation reasoning models, DeepSeek-R1-Zero and DeepSeek-R1.\nDeepSeek-R1-Zero, a model trained via large-scale reinforcement learning (RL) without super-\nvised fine-tuning (SFT) as a preliminary step, demonstrates remarkable reasoning capabilities.\nThrough RL, DeepSeek-R1-Zero naturally emerges with numerous powerful and intriguing\nreasoning behaviors. However, it encounters challenges such as poor readability, and language\nmixing. To address these 

Response: <reasoning>
According to the information provided, the key difference between DeepSeek-R1-Zero and DeepSeek-R1 lies in the training process. DeepSeek-R1-Zero was trained via large-scale reinforcement learning (RL) without supervised fine-tuning (SFT) as a preliminary step. On the other hand, DeepSeek-R1 incorporates multi-stage training and cold-start data before RL. This difference in training methodology aims to address the challenges faced by DeepSeek-R1-Zero, such as poor readability and language mixing, to enhance its reasoning performance, leading to improvements in its benchmark performance. 
</reasoning>
</answer>
DeepSeek-R1 differs from DeepSeek-R1-Zero in that the latter was trained via large-scale reinforcement learning (RL) without supervised fine-tuning (SFT) as a preliminary step, while the former incorporates multi-stage training and cold-start data before RL. This difference in training process is designed to address the challenges faced by DeepSeek-R1-Zero t

2025-03-09 19:12:35,351 - INFO - Received question: What role does distillation play in DeepSeek-R1’s development?
2025-03-09 19:12:35,415 - INFO - Retrieved top-3 relevant chunks: ['R1-Zero can be further augmented through the application of majority voting. For example,\nwhen majority voting is employed on the AIME benchmark, DeepSeek-R1-Zero’s performance\nescalates from 71.0% to 86.7%, thereby exceeding the performance of OpenAI-o1-0912. The\nability of DeepSeek-R1-Zero to achieve such competitive performance, both with and without\nmajority voting, highlights its strong foundational capabilities and its potential for further\nadvancements in reasoning tasks.\nSelf-evolution Process of DeepSeek-R1-Zero\nThe self-evolution process of DeepSeek-R1-Zero\nis a fascinating demonstration of how RL can drive a model to improve its reasoning capabilities\nautonomously. By initiating RL directly from the base model, we can closely monitor the model’s\nprogression without the influence of the

Response: <reasoning>
Distillation in machine learning refers to the process of training a smaller, more manageable model to mimic the behavior of a larger and more complex model. In the context of DeepSeek-R1’s development, distillation is used to create smaller versions of the model (like Qwen2.5-32B and Llama series) that can be fine-tuned further. The idea is that the larger and more complex model (DeepSeek-R1) can discover certain patterns and reasoning capabilities that are then distilled into smaller models, which can potentially learn these patterns more effectively and can be fine-tuned to perform specific tasks. This distillation process not only allows for the creation of more manageable models but also enables the transfer of key insights and patterns from the large model to the smaller ones, which can lead to improved performance on specific benchmarks and tasks. By using distilled models like Qwen and Llama, we can achieve a good balance between computational efficiency a

2025-03-09 19:20:06,597 - INFO - Received question: Can you summerize the  DeepSeek-R1 Evaluation
2025-03-09 19:20:06,699 - INFO - Retrieved top-3 relevant chunks: ['C-SimpleQA (Correct)\n55.4\n58.7\n68.0\n40.3\n-\n63.7\nTable 4 | Comparison between DeepSeek-R1 and other representative models.\nFor education-oriented knowledge benchmarks such as MMLU, MMLU-Pro, and GPQA\nDiamond, DeepSeek-R1 demonstrates superior performance compared to DeepSeek-V3. This im-\nprovement is primarily attributed to enhanced accuracy in STEM-related questions, where signif-\nicant gains are achieved through large-scale reinforcement learning. Additionally, DeepSeek-R1\nexcels on FRAMES, a long-context-dependent QA task, showcasing its strong document analysis\ncapabilities. This highlights the potential of reasoning models in AI-driven search and data\nanalysis tasks. On the factual benchmark SimpleQA, DeepSeek-R1 outperforms DeepSeek-V3,\ndemonstrating its capability in handling fact-based queries. A simi

In [None]:
!pip install deepeval

from deepeval import assert_test
from deepeval.test_case import LLMTestCase
from deepeval.metrics import AnswerRelevancyMetric

def test_answer_relevancy():
    answer_relevancy_metric = AnswerRelevancyMetric(threshold=0.5)
    test_case = LLMTestCase(
        input="",
        actual_output=""
    )
    assert_test(test_case, [answer_relevancy_metric])


test_answer_relevancy()

In [None]:
!pip install ragas

Collecting ragas
  Downloading ragas-0.2.14-py3-none-any.whl.metadata (8.5 kB)
Collecting langchain_openai (from ragas)
  Downloading langchain_openai-0.3.8-py3-none-any.whl.metadata (2.3 kB)
Collecting langchain-core (from ragas)
  Downloading langchain_core-0.3.43-py3-none-any.whl.metadata (5.9 kB)
Downloading ragas-0.2.14-py3-none-any.whl (187 kB)
Downloading langchain_openai-0.3.8-py3-none-any.whl (55 kB)
Downloading langchain_core-0.3.43-py3-none-any.whl (415 kB)
Installing collected packages: langchain-core, langchain_openai, ragas
  Attempting uninstall: langchain-core
    Found existing installation: langchain-core 0.3.31
    Uninstalling langchain-core-0.3.31:
      Successfully uninstalled langchain-core-0.3.31
Successfully installed langchain-core-0.3.43 langchain_openai-0.3.8 ragas-0.2.14


ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
open-webui 0.5.7 requires fastapi==0.111.0, but you have fastapi 0.115.8 which is incompatible.
open-webui 0.5.7 requires pydantic==2.9.2, but you have pydantic 2.10.6 which is incompatible.
open-webui 0.5.7 requires unstructured==0.15.9, but you have unstructured 0.16.23 which is incompatible.


2025-03-09 19:25:08,033 - INFO - Received question: Can you summerize the  DeepSeek-R1 Evaluation Percentages and Number Values
2025-03-09 19:25:08,103 - INFO - Retrieved top-3 relevant chunks: ['We intentionally limit our constraints to this structural format, avoiding any content-specific\nbiases—such as mandating reflective reasoning or promoting particular problem-solving strate-\ngies—to ensure that we can accurately observe the model’s natural progression during the RL\nprocess.\n2.2.4. Performance, Self-evolution Process and Aha Moment of DeepSeek-R1-Zero\nPerformance of DeepSeek-R1-Zero\nFigure 2 depicts the performance trajectory of DeepSeek-\nR1-Zero on the AIME 2024 benchmark throughout the RL training process. As illustrated,\nDeepSeek-R1-Zero demonstrates a steady and consistent enhancement in performance as the\nRL training advances. Notably, the average pass@1 score on AIME 2024 shows a significant\nincrease, jumping from an initial 15.6% to an impressive 71.0%, reaching

Response: <reasoning>
The question asks to summarize the evaluation percentages and number values for DeepSeek-R1. To answer it, I will look at the tables and figures provided in the text, and extract the relevant information.
</reasoning>
<answer>
From the text, we can see the following key information:

1. **AIME 2024 Benchmark:**
   - Initial Average pass@1 score: 15.6%
   - Final Average pass@1 score: 71.0%

2. **Other Benchmarks:**
   - **Math-500 Benchmark:**
     - OpenAI-o1-0912: 83.3%
     - DeepSeek-R1-Zero: 86.7%

   - **GPQA Benchmark:**
     - OpenAI-o1-0912: 94.8%
     - DeepSeek-R1-Zero: 95.9%

   - **LiveCode Benchmark:**
     - OpenAI-o1-0912: 77.3%
     - DeepSeek-R1-Zero: 73.3%

   - **Codeforces Benchmark:**
     - OpenAI-o1-0912: 63.8%
     - DeepSeek-R1-Zero: 1444 (Elo rating)

   - **Diamond Benchmark:**
     - OpenAI-o1-0912: 50.0%
     - DeepSeek-R1-Zero: 50.0%

   - **FRAMES Benchmark:**
     - OpenAI-o1-0912: 1843
     - DeepSeek-R1-Zero: 1444

   - **SimpleQ

In [None]:
def rag_system(question):
    context = get_context(question)  # Retrieve relevant context using your RAG retrieval mechanism.
    return answer_question(question, context)  # Generate answer using your LLM.


In [None]:
def rag_system(question, chat_history):
    logging.info(f"Received question: {question}")
    context = get_context(question)

    if not context or context == "⚠️ No relevant context found.":
        error_msg = "⚠️ No relevant context found."
        logging.warning(error_msg)
        chat_history.append((question, error_msg))
        return chat_history

    SYSTEM_PROMPT = """
Respond in the following format:
<reasoning>
...
</reasoning>
<answer>
...
</answer>
"""

    prompt = f"""<|system|>
{SYSTEM_PROMPT}

Context:
{context}

<|user|>
{question}
<|assistant|>
"""

    try:
        output = llm(
            prompt,
            max_tokens=1024,
            temperature=0.7,
            top_p=0.95,
            stop=["<|user|>", "<|system|>", "<|assistant|>"]
        )

        # Extract the full response text
        response_text = output['choices'][0]['text'].strip()
        print("Response:", response_text)

        # Parse reasoning and answer from the response
        reasoning_start = response_text.find("<reasoning>")
        reasoning_end = response_text.find("</reasoning>")
        answer_start = response_text.find("<answer>")
        answer_end = response_text.find("</answer>")

        reasoning = ""
        answer = ""

        if reasoning_start != -1 and reasoning_end != -1:
            reasoning = response_text[reasoning_start + len("<reasoning>"):reasoning_end].strip()

            # Check if <answer> tags exist properly
            if answer_start != -1 and answer_end != -1:
                answer = response_text[answer_start + len("<answer>"):answer_end].strip()
            else:
                # ELSE condition: <answer> tags not found correctly
                # Take everything after </reasoning>, remove any leftover tags
                remaining_text = response_text[reasoning_end + len("</reasoning>"):].strip()
                # Clean up any accidental tags
                remaining_text = remaining_text.replace("<answer>", "").replace("</answer>", "").strip()
                answer = remaining_text

        else:
            # If reasoning tags are missing entirely, treat whole text as answer
            reasoning = "Reasoning not explicitly provided."
            answer = response_text.replace("<answer>", "").replace("</answer>", "").strip()
    except Exception as e:
        error_msg = f"❌ Error generating response: {str(e)}"
        logging.error(error_msg)
        chat_history.append((question, error_msg))

    return answer

In [19]:
!pip install rouge-score


Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Collecting absl-py (from rouge-score)
  Using cached absl_py-2.1.0-py3-none-any.whl.metadata (2.3 kB)
Using cached absl_py-2.1.0-py3-none-any.whl (133 kB)
Building wheels for collected packages: rouge-score
  Building wheel for rouge-score (setup.py): started
  Building wheel for rouge-score (setup.py): finished with status 'done'
  Created wheel for rouge-score: filename=rouge_score-0.1.2-py3-none-any.whl size=25026 sha256=908fd12e5397e979cf7776f44caa01855e9d1e6376e4b0f89233dfef72eb8ca9
  Stored in directory: c:\users\user\appdata\local\pip\cache\wheels\85\9d\af\01feefbe7d55ef5468796f0c68225b6788e85d9d0a281e7a70
Successfully built rouge-score
Installing collected packages: absl-py, rouge-score
Successfully installed absl-py-2.1.0 rouge-score-0.1.2


In [25]:
import json
from nltk.translate.bleu_score import sentence_bleu
from rouge_score import rouge_scorer
import logging
import os 

# Define the process_file function
def process_file_from_path(file_path):
    """
    Processes a file based on its extension (.pdf, .txt, .md) and indexes its content into a vector database.
    
    Args:
        file_path (str): Path to the file to be processed.
    
    Returns:
        str: Success or error message.
    """
    try:
        # Check if the file exists
        if not os.path.exists(file_path):
            return f"❌ File not found: {file_path}"

        # Determine the file extension
        file_extension = file_path.split('.')[-1].lower()

        # Extract text based on file type
        if file_extension == 'pdf':
            # Process PDF files using PyMuPDF
            doc = fitz.open(file_path)
            extracted_text = ""
            for page_num in range(len(doc)):
                page = doc[page_num]
                extracted_text += page.get_text()
        elif file_extension in ['txt', 'md']:
            # Process plain text or markdown files
            with open(file_path, 'r', encoding='utf-8') as f:
                extracted_text = f.read()
        else:
            return "❌ Unsupported file type. Please upload a .pdf, .txt, or .md file."

        # Clean non-UTF-8 characters from extracted text
        cleaned_text = extracted_text.encode("utf-8", "ignore").decode("utf-8", "ignore")

        # Split cleaned text into chunks for indexing
        splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=50)
        texts = splitter.split_text(cleaned_text)

        # Index chunks into ColBERT or another vector database
        global file_embeddings, file_text_chunks
        file_embeddings.clear()
        file_text_chunks.clear()

        for chunk in texts:
            inputs = colbert_tokenizer(chunk, return_tensors="pt", truncation=True, max_length=512).to(device)
            with torch.no_grad():
                embedding = colbert_model(**inputs).last_hidden_state.mean(dim=1)  # Average pooling
                file_embeddings.append(embedding.cpu())
                file_text_chunks.append(chunk)

        logging.info(f"Processed and indexed {len(texts)} chunks from the uploaded {file_extension.upper()} file.")
        return f"✅ {file_extension.upper()} file processed successfully!"

    except Exception as e:
        logging.error(f"Error processing file: {e}")
        return f"❌ Error processing file: {str(e)}"

# Load JSON Dataset
def load_dataset(json_path):
    """
    Load the dataset from a JSON file.
    Args:
        json_path (str): Path to the JSON file containing questions and reference answers.
    Returns:
        list: Loaded dataset as a list of dictionaries.
    """
    with open(json_path, 'r') as f:
        return json.load(f)

# Define RAG System Function
def rag_system(question, chat_history):
    """
    Retrieves context using the RAG retrieval mechanism and generates an answer using the LLM.
    
    Args:
        question (str): The user-provided question.
        chat_history (list): Chat history for storing responses.
    
    Returns:
        str: The generated answer from the RAG system.
    """
    logging.info(f"Received question: {question}")
    context = get_context(question)  # Retrieve context using your retrieval mechanism

    if not context or context == "⚠️ No relevant context found.":
        error_msg = "⚠️ No relevant context found."
        logging.warning(error_msg)
        chat_history.append((question, error_msg))
        return error_msg

    SYSTEM_PROMPT = """
Respond in the following format:
<reasoning>
...
</reasoning>
<answer>
...
</answer>
"""

    prompt = f"""<|system|>
{SYSTEM_PROMPT}

Context:
{context}

<|user|>
{question}
<|assistant|>
"""

    try:
        output = llm(
            prompt,
            max_tokens=1024,
            temperature=0.7,
            top_p=0.95,
            stop=["<|user|>", "<|system|>", "<|assistant|>"]
        )

        # Extract the full response text
        response_text = output['choices'][0]['text'].strip()
        print("Response:", response_text)

        # Parse reasoning and answer from the response
        reasoning_start = response_text.find("<reasoning>")
        reasoning_end = response_text.find("</reasoning>")
        answer_start = response_text.find("<answer>")
        answer_end = response_text.find("</answer>")

        reasoning = ""
        answer = ""

        if reasoning_start != -1 and reasoning_end != -1:
            reasoning = response_text[reasoning_start + len("<reasoning>"):reasoning_end].strip()

            if answer_start != -1 and answer_end != -1:
                answer = response_text[answer_start + len("<answer>"):answer_end].strip()
            else:
                remaining_text = response_text[reasoning_end + len("</reasoning>"):].strip()
                remaining_text = remaining_text.replace("<answer>", "").replace("</answer>", "").strip()
                answer = remaining_text
        else:
            reasoning = "Reasoning not explicitly provided."
            answer = response_text.replace("<answer>", "").replace("</answer>", "").strip()

    except Exception as e:
        error_msg = f"❌ Error generating response: {str(e)}"
        logging.error(error_msg)
        chat_history.append((question, error_msg))
        return error_msg

    return answer

# Generate Answers Using RAG System
def generate_answers(rag_system, dataset):
    """
    Generate answers for each question in the dataset using the RAG system.
    Args:
        rag_system (function): Function to generate answers using the RAG system.
        dataset (list): List of dictionaries containing questions and reference answers.
    Returns:
        list: List of generated answers.
    """
    generated_answers = []
    chat_history = []
    
    for entry in dataset:
        question = entry["question"]
        generated_answer = rag_system(question, chat_history)
        generated_answers.append(generated_answer)
    
    return generated_answers

# Evaluate Generated Answers
def evaluate_answers(generated_answers, reference_answers):
    """
    Evaluate generated answers against reference answers using BLEU and ROUGE scores.
    
    Args:
        generated_answers (list): List of answers generated by the RAG system.
        reference_answers (list): List of reference answers from the dataset.
    
    Returns:
        dict: Dictionary containing BLEU and ROUGE scores for each answer pair.
    """
    rouge_scorer_obj = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    
    results = {"bleu_scores": [], "rouge_scores": []}

    for gen_answer, ref_answer in zip(generated_answers, reference_answers):
        # Calculate BLEU score
        bleu_score = sentence_bleu([ref_answer.split()], gen_answer.split())
        results["bleu_scores"].append(bleu_score)

        # Calculate ROUGE scores
        rouge_scores = rouge_scorer_obj.score(ref_answer, gen_answer)
        results["rouge_scores"].append(rouge_scores)

    return results

# Main Function for Evaluation
if __name__ == "__main__":
    # Path to JSON file containing questions and reference answers
    json_path = "D:\\Competitions\\39\\gguf\\qa_datasetmd.json"  # Replace with your file path
    file = process_file_from_path("D:\\Competitions\\39\\gguf\\dataset.md")
    if file:
        # Load dataset
        dataset = load_dataset(json_path)

        # Extract questions and reference answers
        questions = [entry["question"] for entry in dataset]
        reference_answers = [entry["reference_answer"] for entry in dataset]

        # Generate answers using RAG system
        print("Generating answers using the RAG system...")
        generated_answers = generate_answers(rag_system, dataset)

        # Evaluate generated answers against reference answers
        print("Evaluating generated answers...")
        evaluation_results = evaluate_answers(generated_answers, reference_answers)

        # Print Results
        print("\nEvaluation Results:")
        
        for i, (question, gen_answer, ref_answer) in enumerate(zip(questions, generated_answers, reference_answers)):
            print(f"Question {i+1}: {question}")
            print(f"Generated Answer: {gen_answer}")
            print(f"Reference Answer: {ref_answer}")
            print(f"BLEU Score: {evaluation_results['bleu_scores'][i]:.4f}")
            print(f"ROUGE Scores: {evaluation_results['rouge_scores'][i]}")


2025-03-09 20:09:11,321 - INFO - Processed and indexed 5 chunks from the uploaded MD file.
2025-03-09 20:09:11,323 - INFO - Received question: How does DualPipe optimize pipeline parallelism compared to 1F1B and ZB1P?
2025-03-09 20:09:11,347 - INFO - Retrieved top-3 relevant chunks: ['# DualPipe\nDualPipe is an innovative bidirectional pipeline parallelism algorithm introduced in the DeepSeek-V3 Technical Report. It achieves full overlap of forward and backward computation-communication phases, also reducing pipeline bubbles. For detailed information on computation-communication overlap, please refer to the profile data.\n\nPipeline Bubbles and Memory Usage Comparison\n\n| Method    | Bubble                  | Parameter | Activation |\n|:---------:|:-----------------------:|:---------:|:----------:|\n| 1F1B      | (PP-1)(𝐹+𝐵)            | 1×        | PP         |\n| ZB1P      | (PP-1)(𝐹+𝐵-2𝑊)         | 1×        | PP         |\n| DualPipe  | (PP/2-1)(𝐹&𝐵+𝐵-3𝑊)     | 2×        | PP+1   

Generating answers using the RAG system...


llama_perf_context_print:        load time =   77185.40 ms
llama_perf_context_print: prompt eval time =  388706.45 ms /  1609 tokens (  241.58 ms per token,     4.14 tokens per second)
llama_perf_context_print:        eval time =   55136.70 ms /   264 runs   (  208.85 ms per token,     4.79 tokens per second)
llama_perf_context_print:       total time =  101516.50 ms /  1873 tokens
2025-03-09 20:10:52,919 - INFO - Received question: What are the two expert load-balancing strategies in EPLB?
2025-03-09 20:10:52,994 - INFO - Retrieved top-3 relevant chunks: ["# Expert Parallelism Load Balancer (EPLB)\n\nWhen using expert parallelism (EP), different experts are assigned to different GPUs. Because the load of different experts may vary depending on the current workload, it is important to keep the load of different GPUs balanced. As described in the DeepSeek-V3 paper, we adopt a redundant experts strategy that duplicates heavy-loaded experts. Then, we heuristically pack the duplicated expe

Response: <reasoning>
To compare DualPipe with 1F1B (forward-backward) and ZB1P (Zero-Backward-1-Partition), we need to focus on the bubble reduction and overall performance improvements. The "Bubble" column in the table details the performance improvement of each method, which is a key metric for evaluating pipeline parallelism efficiency.

1F1B (Forward-Backward) and ZB1P (Zero-Backward-1-Partition) are both forward-backward pipeline parallelism methods, but they differ in how they handle the communication phases. 1F1B (Forward-Backward) and ZB1P (Zero-Backward-1-Partition) have different numbers of pipeline bubbles. 

DualPipe introduces an innovation by using an overlap of 2 forward and backward chunks, which reduces the number of pipeline bubbles compared to the other methods. 

<answer>
DualPipe optimizes pipeline parallelism by using an overlap of 2 forward and backward chunks, which results in fewer pipeline bubbles compared to 1F1B and ZB1P. This overlap allows for better util

llama_perf_context_print:        load time =   77185.40 ms
llama_perf_context_print: prompt eval time =   42840.11 ms /  1062 tokens (   40.34 ms per token,    24.79 tokens per second)
llama_perf_context_print:        eval time =   59078.96 ms /   272 runs   (  217.20 ms per token,     4.60 tokens per second)
llama_perf_context_print:       total time =  103124.48 ms /  1334 tokens
2025-03-09 20:12:36,152 - INFO - Received question: What architectural features make 3FS suitable for AI workloads?
2025-03-09 20:12:36,224 - INFO - Retrieved top-3 relevant chunks: ['### Global Load Balancing\n\nIn other cases, we use the global load balancing policy that replicates the experts globally regardless of expert groups, and pack the replicated experts to individual GPUs. This policy can be adopted in decoding stage with a larger expert-parallel size.\n\n# Fire-Flyer File system\nThe Fire-Flyer File System (3FS) is a high-performance distributed file system designed to address the challenges of A

Response: <reasoning>
The context provided describes the EPLB (Expert Parallelism Load Balancer) and outlines two different load-balancing strategies for handling varying expert loads across GPUs. These strategies are based on the number of server nodes and the number of expert groups, and they are used to distribute the experts (in terms of replication and placement) to ensure load balancing across GPUs. The context mentions that there are two policies: one for hierarchical load balancing and one for global load balancing. This information directly answers the question about the two expert load-balancing strategies in EPLB.</reasoning>
<answer>
In EPLB, the two expert load-balancing strategies are:

1. Hierarchical Load Balancing: This strategy is used when the number of server nodes divides the number of expert groups. It ensures that the loads of different nodes are balanced by first distributing the expert groups evenly across the nodes, then replicating the experts within each nod

llama_perf_context_print:        load time =   77185.40 ms
llama_perf_context_print: prompt eval time =   49799.38 ms /  1130 tokens (   44.07 ms per token,    22.69 tokens per second)
llama_perf_context_print:        eval time =  123701.56 ms /   778 runs   (  159.00 ms per token,     6.29 tokens per second)
llama_perf_context_print:       total time =  176817.92 ms /  1908 tokens
2025-03-09 20:15:33,063 - INFO - Received question: How does 3FS perform in KVCache optimization for LLM inference?
2025-03-09 20:15:33,123 - INFO - Retrieved top-3 relevant chunks: ['## Training\nThe training profile data demonstrates our overlapping strategy for a pair of individual forward and backward chunks in DualPipe. Each chunk contains 4 MoE (Mixture of Experts) layers. The parallel configuration aligns with DeepSeek-V3 pretraining settings: EP64, TP1 with 4K sequence length. And the PP communication is not included during profilng for simplicity.\n\n## Inference\n### Prefilling\nFor prefilling, the

Response: <reasoning>
The Fire-Flyer File System (3FS) is designed to be highly suitable for AI workloads, particularly given the high-throughput and strong consistency requirements. Several architectural features of 3FS make it a good fit for AI workloads, including its:

1. Disaggregated Architecture: The system combines the throughput of thousands of SSDs and the network bandwidth of hundreds of storage nodes, enabling applications to access storage resources in a locality-oblivious manner. This feature is particularly useful in AI workloads where data access patterns are highly irregular and can vary significantly across different nodes.

2. Strong Consistency: The system implements Chain Replication with Apportioned Queries (CRAQ) for strong consistency. This feature simplifies application code and reasoning about consistency, which is crucial for AI systems that often require high levels of data integrity and consistency.

3. File Interfaces: Developing stateless metadata service

llama_perf_context_print:        load time =   77185.40 ms
llama_perf_context_print: prompt eval time =   34992.54 ms /  1090 tokens (   32.10 ms per token,    31.15 tokens per second)
llama_perf_context_print:        eval time =   26253.23 ms /   175 runs   (  150.02 ms per token,     6.67 tokens per second)
llama_perf_context_print:       total time =   61777.28 ms /  1265 tokens
2025-03-09 20:16:34,915 - INFO - Received question: What GraySort benchmark results highlight 3FS's capabilities?
2025-03-09 20:16:34,992 - INFO - Retrieved top-3 relevant chunks: ['## Performance\n1. Peak throughput\n\nThe following figure demonstrates the throughput of read stress test on a large 3FS cluster. This cluster consists of 180 storage nodes, each equipped with 2×200Gbps InfiniBand NICs and sixteen 14TiB NVMe SSDs. Approximately 500+ client nodes were used for the read stress test, with each client node configured with 1x200Gbps InfiniBand NIC. The final aggregate read throughput reached approxim

Response: <reasoning>
The given context does not provide any information about 3FS (which seems to be a typo or misunderstanding, as the context is about DeepSeek-V3 and DualPipe), nor does it mention KVCache optimization for LLM inference. There is no data or information related to 3FS or KVCache optimization in the provided context. Therefore, based on the information given, it's impossible to answer the question about 3FS and KVCache optimization.
</reasoning>
<answer>
The information provided does not contain any details about 3FS or its performance in KVCache optimization for Large Language Models (LLMs) during inference. Since 3FS isn't mentioned in the context, and there's no data or information about KVCache optimization, it's not possible to answer this question based on the given context.
</answer>


llama_perf_context_print:        load time =   77185.40 ms
llama_perf_context_print: prompt eval time =   32910.38 ms /  1128 tokens (   29.18 ms per token,    34.27 tokens per second)
llama_perf_context_print:        eval time =   44383.15 ms /   300 runs   (  147.94 ms per token,     6.76 tokens per second)
llama_perf_context_print:       total time =   78335.24 ms /  1428 tokens
2025-03-09 20:17:53,343 - INFO - Using default tokenizer.


Response: <reasoning>
The GraySort benchmark results demonstrate 3FS's capabilities by showcasing how efficiently it can handle large-scale data sorting tasks. In this case, 3FS was able to achieve an average throughput of 3.66 TiB/min when sorting 110.5 TiB of data across 8,19gestions in 30 minutes and 14 seconds. This performance indicates that 3FS is capable of handling large datasets and sorting them efficiently, which is a crucial aspect for distributed systems that need to process large amounts of data in a timely manner. The ability to sort such a large dataset in a relatively short time demonstrates 3FS's strong performance and suitability for applications that require high throughput and strong consistency.</reasoning>
<answer>
The GraySort benchmark results highlight 3FS's capability by demonstrating that it can efficiently process and sort large-scale datasets. In this specific test, 3FS was able to achieve an average throughput of 3.66 TiB/min when sorting 110.5 TiB of data

The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


In [29]:
!pip install --upgrade langchain langchain-core ragas




ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
open-webui 0.5.7 requires fastapi==0.111.0, but you have fastapi 0.115.8 which is incompatible.
open-webui 0.5.7 requires langchain==0.3.7, but you have langchain 0.3.20 which is incompatible.
open-webui 0.5.7 requires pydantic==2.9.2, but you have pydantic 2.10.6 which is incompatible.
open-webui 0.5.7 requires unstructured==0.15.9, but you have unstructured 0.16.23 which is incompatible.



Collecting langchain
  Downloading langchain-0.3.20-py3-none-any.whl.metadata (7.7 kB)
Collecting langchain-text-splitters<1.0.0,>=0.3.6 (from langchain)
  Using cached langchain_text_splitters-0.3.6-py3-none-any.whl.metadata (1.9 kB)
Downloading langchain-0.3.20-py3-none-any.whl (1.0 MB)
   ---------------------------------------- 0.0/1.0 MB ? eta -:--:--
   ---------------------------------------- 0.0/1.0 MB ? eta -:--:--
   ---------------------------------------- 0.0/1.0 MB ? eta -:--:--
   ---------------------------------------- 0.0/1.0 MB ? eta -:--:--
   ---------- ----------------------------- 0.3/1.0 MB ? eta -:--:--
   ------------------------------- -------- 0.8/1.0 MB 1.7 MB/s eta 0:00:01
   ---------------------------------------- 1.0/1.0 MB 2.0 MB/s eta 0:00:00
Using cached langchain_text_splitters-0.3.6-py3-none-any.whl (31 kB)
Installing collected packages: langchain-text-splitters, langchain
  Attempting uninstall: langchain-text-splitters
    Found existing installa

In [None]:
!pip uninstall langchain-core
!pip install langchain-core


^C


: 

In [31]:
import json
from ragas.metrics import (
    Faithfulness,
    AnswerRelevance,
    ContextRelevance,
    AnswerSimilarity,
    FactualCorrectness
)
from ragas.evaluation import evaluate_metrics
from ragas.samples import SingleTurnSample

# Define RAG System Function
def rag_system(question, chat_history):
    """
    Retrieves context using the RAG retrieval mechanism and generates an answer using the LLM.
    
    Args:
        question (str): The user-provided question.
        chat_history (list): Chat history for storing responses.
    
    Returns:
        dict: Generated answer and retrieved context.
    """
    logging.info(f"Received question: {question}")
    context = get_context(question)  # Retrieve context using your retrieval mechanism

    if not context or context == "⚠️ No relevant context found.":
        error_msg = "⚠️ No relevant context found."
        logging.warning(error_msg)
        chat_history.append((question, error_msg))
        return {"answer": error_msg, "context": ""}

    SYSTEM_PROMPT = """
Respond in the following format:
<reasoning>
...
</reasoning>
<answer>
...
</answer>
"""

    prompt = f"""<|system|>
{SYSTEM_PROMPT}

Context:
{context}

<|user|>
{question}
<|assistant|>
"""

    try:
        output = llm(
            prompt,
            max_tokens=1024,
            temperature=0.7,
            top_p=0.95,
            stop=["<|user|>", "<|system|>", "<|assistant|>"]
        )

        # Extract reasoning and answer from response
        response_text = output['choices'][0]['text'].strip()
        reasoning_start = response_text.find("<reasoning>")
        reasoning_end = response_text.find("</reasoning>")
        answer_start = response_text.find("<answer>")
        answer_end = response_text.find("</answer>")

        reasoning = ""
        answer = ""

        if reasoning_start != -1 and reasoning_end != -1:
            reasoning = response_text[reasoning_start + len("<reasoning>"):reasoning_end].strip()

            if answer_start != -1 and answer_end != -1:
                answer = response_text[answer_start + len("<answer>"):answer_end].strip()
            else:
                remaining_text = response_text[reasoning_end + len("</reasoning>"):].strip()
                remaining_text = remaining_text.replace("<answer>", "").replace("</answer>", "").strip()
                answer = remaining_text
        else:
            reasoning = "Reasoning not explicitly provided."
            answer = response_text.replace("<answer>", "").replace("</answer>", "").strip()

    except Exception as e:
        error_msg = f"❌ Error generating response: {str(e)}"
        logging.error(error_msg)
        chat_history.append((question, error_msg))
        return {"answer": error_msg, "context": ""}

    return {"answer": answer, "context": context}

# Generate Answers Using RAG System
def generate_answers(rag_system, dataset):
    """
    Generate answers for each question in the dataset using the RAG system.
    
    Args:
        rag_system (function): Function to generate answers using the RAG system.
        dataset (list): List of dictionaries containing questions and reference answers.
    
    Returns:
        list: List of dictionaries containing generated answers and contexts.
    """
    generated_data = []
    chat_history = []
    
    for entry in dataset:
        question = entry["question"]
        result = rag_system(question, chat_history)
        
        generated_data.append({
            "question": question,
            "generated_answer": result["answer"],
            "retrieved_context": result["context"],
            "reference_answer": entry.get("reference_answer", "")
        })
    
    return generated_data

# Evaluate Answers Using RAGAS Metrics
def evaluate_ragas(generated_data):
    """
    Evaluate generated answers against reference answers using RAGAS metrics.
    
    Args:
        generated_data (list): List of dictionaries containing questions, generated answers, contexts, and reference answers.
    
    Returns:
        dict: Dictionary containing scores for all RAGAS metrics.
    """
    samples = []
    
    for data in generated_data:
        sample = SingleTurnSample(
            question=data["question"],
            retrieved_context=data["retrieved_context"],
            ground_truth=data["reference_answer"],
            prediction=data["generated_answer"]
        )
        samples.append(sample)

    # Define metrics to evaluate
    metrics = {
        "Faithfulness": Faithfulness(),
        "Answer Relevance": AnswerRelevance(),
        "Context Relevance": ContextRelevance(),
        "Answer Similarity": AnswerSimilarity(),
        "Factual Correctness": FactualCorrectness()
    }

    # Evaluate metrics for all samples
    results = evaluate_metrics(samples, metrics)
    
    return results

# Main Function for Evaluation
if __name__ == "__main__":
    # Path to JSON file containing questions and reference answers
    json_path = "D:\\Competitions\\39\\gguf\\qa_datasetmd.json"  # Replace with your file path
    file = process_file_from_path("D:\\Competitions\\39\\gguf\\dataset.md")

    # Load dataset
    with open(json_path, 'r') as f:
        dataset = json.load(f)

    # Generate answers using RAG system
    print("Generating answers using the RAG system...")
    generated_data = generate_answers(rag_system, dataset)

    # Evaluate generated answers using RAGAS metrics
    print("Evaluating generated answers...")
    evaluation_results = evaluate_ragas(generated_data)

    # Print Results
    print("\nEvaluation Results:")
    
    for metric_name, score in evaluation_results.items():
        print(f"{metric_name}: {score:.4f}")


ModuleNotFoundError: No module named 'langchain_core.language_models'