## Evaluate MCQ generation errors

## Imports

In [2]:
import sys
import os
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_community.vectorstores.utils import DistanceStrategy
import torch
from transformers import AutoTokenizer, AutoModel
from langchain.docstore.document import Document as LangchainDocument
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_community.vectorstores.utils import DistanceStrategy
import pandas as pd
import numpy as np 
from tqdm import tqdm
from typing import List
import json
import glob
import ollama
from ollama import chat
import re

## Functions

In [3]:
# Ollama model tag; passed as `model=` to the chat() calls made by the functions below.
model = 'qwen2.5:7b'

### clean

In [4]:
def validate_json_format_mcq(llm_output, type):
    """
    Attempts to extract and validate a JSON structure from the LLM output.

    The text spanning the first '{' to the last '}' of ``llm_output`` is parsed
    with ``json.loads`` and then checked against the schema selected by ``type``.

    Parameters:
    llm_output (str): Raw output from the LLM.
    type (str): 'question' (requires the keys "question" and "options") or
        'answer' (requires the keys "Answer" and "Justification").

    Returns:
    dict: The parsed JSON object if found and correctly formatted, otherwise None.
        For 'answer', "Answer" is rewritten to a single letter A-D when the
        original value is a longer string containing such an isolated letter.
        Any other ``type`` value yields None.
    """
    # NOTE: the parameter name `type` shadows the builtin, but callers pass it
    # by keyword, so it is kept for compatibility.
    required_keys = {
        'question': ("question", "options"),
        'answer': ("Answer", "Justification"),
    }.get(type)
    if required_keys is None:
        return None

    # Greedy match with DOTALL: grabs everything from the first '{' to the last
    # '}', so surrounding chatter or markdown fences are dropped before parsing.
    json_match = re.search(r'\{.*\}', llm_output, re.DOTALL)
    if not json_match:
        return None

    try:
        cleaned_json = json.loads(json_match.group())
    except json.JSONDecodeError:
        return None

    if not all(key in cleaned_json for key in required_keys):
        return None

    if type == 'answer':
        answer = cleaned_json["Answer"]
        # A list/number/null answer used to crash with TypeError (unhashable
        # value in the set test, or a non-string handed to re.search).
        if not isinstance(answer, str):
            return None
        if answer not in {'A', 'B', 'C', 'D'}:
            # Fall back to the first isolated capital letter A-D in the text.
            match = re.search(r'\b[A-D]\b', answer)
            if match is None:
                return None
            cleaned_json["Answer"] = match.group()

    return cleaned_json

In [5]:
def call_formatting_llm_mcq(llm_output, type):
    """
    Calls an LLM specialized in formatting text into the correct JSON format.

    The reply is passed through ``validate_json_format_mcq`` for both types, so
    the caller receives a checked dict or None, never raw text.

    Parameters:
    llm_output (str): Raw output from the initial LLM.
    type (str): question or answer

    Returns:
    dict: The validated JSON object ("question"/"options" for 'question',
        "Answer"/"Justification" for 'answer'), or None when the reformatted
        reply still fails validation.

    Raises:
    ValueError: If ``type`` is neither 'question' nor 'answer'.
    """
    # Prompt texts are kept verbatim (including their indentation) because they
    # are sent to the model as-is.
    if type == 'question':
        system_prompt = """You are an AI specialized in converting multiple-choice legal questions into JSON format.
        Ensure the output strictly follows this structure:
        ```json
        {"question": "...", "options": ["A ....", "B ...", "C ...", "D ..."]}
        """

    elif type == 'answer':
        system_prompt = """You are an AI specialized in converting legal answer into JSON format.
        Ensure the output strictly follows this structure:
        ```json
        {
        "Answer": "...", 
        "Justification": "..."
        }
        """

    else:
        # Previously an unknown type fell through and crashed with
        # UnboundLocalError on `system_prompt`; fail explicitly instead.
        raise ValueError(f"Unsupported type {type!r}: expected 'question' or 'answer'.")

    user_prompt = f"""
        The following text needs to be formatted as a valid JSON:
        {llm_output}
        
        Please convert it into the required JSON format.
        """

    response = chat(model=model, messages=[
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt},
    ])

    # The 'answer' path used to return the raw reply string unvalidated, which
    # callers would then treat as a finished dict. Validate both paths alike.
    return validate_json_format_mcq(response['message']['content'], type)

In [6]:
def clean_generate_mcq_output(llm_output, type):
    """
    Turn raw LLM text into a validated result, asking a formatting LLM for help if needed.

    The text is first checked directly with ``validate_json_format_mcq``. Only
    when that yields nothing usable is ``call_formatting_llm_mcq`` invoked on
    the same text.

    Parameters:
    llm_output (str): Raw output from the LLM.
    type (str): question or answer; forwarded unchanged to both helpers.

    Returns:
    The first truthy result obtained from direct validation or, failing that,
    from the formatting LLM.

    Raises:
    ValueError: If neither step produces a truthy result.
    """
    # `or` short-circuits, so the formatting LLM is only called when direct
    # validation returned a falsy value.
    cleaned = (
        validate_json_format_mcq(llm_output, type)
        or call_formatting_llm_mcq(llm_output, type)
    )
    if not cleaned:
        raise ValueError("Failed to convert LLM output into valid JSON format.")
    return cleaned

### get context

In [7]:
def get_context(query: str, k: int, knowledge_vector_db: FAISS):
    """
    Look up the documents most similar to a query in the vector store.

    Parameters:
    query (str): The input query for which context is needed.
    k (int): The number of documents to request (required; there is no default).
    knowledge_vector_db (FAISS): Store whose ``similarity_search`` is called.

    Returns:
    Whatever ``knowledge_vector_db.similarity_search(query=query, k=k)`` returns.
    """
    search = knowledge_vector_db.similarity_search
    return search(query=query, k=k)

### embeddings loader

In [8]:
def load_rag_embeddings(path, embeddings_model, device='cuda'):
    """
    Create the embedding model and load a FAISS index previously saved under ``path``.

    Parameters:
    path (str): Location passed to ``FAISS.load_local``.
    embeddings_model (str): Model name handed to ``HuggingFaceEmbeddings``.
    device (str): Device string placed in ``model_kwargs`` (default 'cuda'; use 'cpu' without an Nvidia GPU).

    Returns:
    The vector store returned by ``FAISS.load_local``.
    """
    model_options = {"device": device, "trust_remote_code": True}
    # Normalised embeddings: set to True for cosine similarity.
    encode_options = {"normalize_embeddings": True}

    embedder = HuggingFaceEmbeddings(
        model_name=embeddings_model,
        multi_process=True,
        model_kwargs=model_options,
        encode_kwargs=encode_options,
    )

    # NOTE(review): allow_dangerous_deserialization=True opts in to loading a
    # serialized index from disk; only point `path` at files you trust.
    vector_store = FAISS.load_local(
        path,
        embedder,
        allow_dangerous_deserialization=True,
    )
    print("embedding load")
    return vector_store

### generate mcq

In [9]:
# Upper bound on generated tokens; passed to chat() as the `num_predict` option in generate_mcq.
MAX_OUTPUT_TOKENS = 3000

In [10]:
def generate_mcq(questions: str, knowledge_vector_db: FAISS, max_attempts: int = 5) -> tuple:
    """
    Generates an MCQ question.

    Retrieves 3 context documents for ``questions``, asks the chat model for a
    new question, and retries until the reply can be cleaned into valid JSON
    or ``max_attempts`` is reached.

    Parameters:
    questions (str): String of validated mcq questions from one subcategory as example.
    knowledge_vector_db (FAISS): Vector store queried for context documents.
    max_attempts (int): Maximum number of generation attempts (default 5).

    Returns:
    tuple: ``(question, failed_attempts)`` on success, where ``question`` is
        {'question': '...', 'options': ['A ....', 'B ...', ...]} and
        ``failed_attempts`` is the number of attempts that failed before the
        successful one (0 when the first attempt worked).
        ``('Error', '>N')`` with N = ``max_attempts`` when every attempt failed.
    """
    # Retrieve context
    retrieved_docs = get_context(questions, 3, knowledge_vector_db)
    context = "\nExtracted documents:\n"
    # Double-quoted f-string so the inner ['ref'] lookup also parses on Python
    # versions older than 3.12 (nested same-type quotes need PEP 701).
    context += "".join(
        f"Content: {doc.page_content} \nSource: {doc.metadata['ref']}\n\n"
        for doc in retrieved_docs
    )

    # Build prompt
    # NOTE(review): the example below shows single-quoted keys/strings, which
    # is not valid JSON and cannot be parsed by json.loads if the model copies
    # it literally. The prompt is left untouched because it is the object of
    # this evaluation - confirm whether it should use double quotes.
    system_prompt = f"""
    You are an AI specialized in generating multiple-choice legal questions based on given legal texts.
    ### Instructions:
    - Generate a new legal multiple-choice question based on the provided context.
    - Ensure the question aligns with the style and complexity of the given examples.
    - Provide four answer options (A, B, C, D), with only one being correct.
    - Format the output strictly as a JSON object with the following structure:
        ```json
        {{'question': '...', 'options': ['A ....', 'B ...', 'C ...', 'D ...']}}
    """

    user_prompt = f"""
    ### Context:
    {context}

    ### Examples of Previous Questions:
    {questions}

    Generate a new question that follows the same format and is correct based on the context. Write it in a json.
    """

    # The prompt is identical on every attempt, so build the messages once.
    messages = [{"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt}]

    # Bounded retry loop; `attempt_count` is the number of failures so far.
    for attempt_count in range(max_attempts):
        question_mcq = chat(model=model,
                            messages=messages,
                            options={"num_predict": MAX_OUTPUT_TOKENS})

        # Put question in correct json format
        try:
            cleaned_question_mcq = clean_generate_mcq_output(question_mcq['message']['content'], type='question')
        except ValueError:
            print(f"Attempt {attempt_count + 1} failed. Retrying...")
        else:
            return cleaned_question_mcq, attempt_count  # If valid, return it

    # All attempts failed: return a sentinel pair rather than raising, so a
    # caller collecting many results can record the failure and keep going.
    return 'Error', f'>{max_attempts}'

## Evaluation

In [11]:
import random

def evaluate_generate_mcq_question(save_dir: str, data_dir: str, test_name: str, model, knowledge_vector_db: FAISS, n):
    """
    Evaluate generate_mcq by generating ``n`` questions and recording the retries each needed.

    Example questions are read from the JSON files in ``data_dir``; each
    generation is prompted with 3 of them picked at random.

    Parameters:
    save_dir (str): Path to save the evaluation file (created if missing).
    data_dir (str): Folder holding the mcq json files. Files whose name contains
        'solution', 'MOCK', 'open' or 'categories', or that do not end in
        '.json', are ignored.
    test_name (str): Name of the test, used as the output file name.
    model: Not used inside this function; generate_mcq reads the module-level ``model``.
    knowledge_vector_db (FAISS): Vector store forwarded to generate_mcq.
    n (int): Number of questions to generate.

    Returns:
    Nothing, but saves ``<save_dir>/<test_name>.json`` containing
    {'ai_questions': [...], 'attempt_counts': [...]}.

    Raises:
    ValueError: If ``n`` > 0 and fewer than 3 example questions were loaded.
    """
    examples_per_prompt = 3
    excluded_markers = ("solution", "MOCK", "open", "categories")

    # Get questions from mcq json files.
    questions = []
    # sorted(): os.listdir order is unspecified; a fixed order keeps the
    # sampling reproducible when the random module is seeded.
    for filename in sorted(os.listdir(data_dir)):
        # Require a real .json suffix: the former substring test ("json" in
        # the name) also let through files such as 'json_notes.txt'.
        if not filename.endswith(".json"):
            continue
        # Ignore files containing solutions, mock exams, open questions or categories.
        if any(marker in filename for marker in excluded_markers):
            continue

        # Load doc; every value of the top-level object is taken as one question.
        filepath = os.path.join(data_dir, filename)
        with open(filepath, 'r', encoding='utf-8') as file:
            doc = json.load(file)
        questions.extend(doc.values())

    # Fail early with a clear message instead of random.sample's generic
    # "sample larger than population" error.
    if n > 0 and len(questions) < examples_per_prompt:
        raise ValueError(
            f"Need at least {examples_per_prompt} example questions in {data_dir!r}, "
            f"found {len(questions)}."
        )

    ai_questions = []
    attempt_counts = []
    # Generate n AI questions.
    for i in range(n):
        # Get 3 random questions to serve as examples.
        questions_for_generation = random.sample(questions, examples_per_prompt)
        questions_for_generation_str = json.dumps(questions_for_generation, indent=4)
        print(questions_for_generation_str)

        # Generate new question
        new_question, attempt_count = generate_mcq(questions_for_generation_str, knowledge_vector_db)
        print(f'Question {i}: {new_question}')
        ai_questions.append(new_question)
        attempt_counts.append(attempt_count)

    result = {'ai_questions': ai_questions, 'attempt_counts': attempt_counts}

    # Save json. exist_ok avoids the check-then-create race of exists()+makedirs().
    os.makedirs(save_dir, exist_ok=True)
    output_path = os.path.join(save_dir, f"{test_name}.json")
    with open(output_path, 'w', encoding='utf-8') as file:
        json.dump(result, file, indent=4, ensure_ascii=False)


In [None]:
# knowledge_vector_db = load_rag_embeddings(path=f'../embeddings/rag_embeddings_sentence-transformers_all-MiniLM-L6-v2_chunk512',
#                                             embeddings_model='sentence-transformers/all-MiniLM-L6-v2', 
#                                             device='cuda')

embedding load


In [None]:
# evaluate_generate_mcq_question(save_dir='../outputs/evaluation2/', data_dir='../outputs/',
#                             test_name=f'evaluate_mcq_questions_generation', model=model,
#                             knowledge_vector_db=knowledge_vector_db, n=100)

[
    {
        "question": "Which of the following statements about filing third- party observations is correct?",
        "options": [
            "A. The person that filed third- party observations will be a party to the proceedings before the EPO, and the applicant is obliged to respond to third- party observations",
            "B. Third -party observations must be filed in writing and may not be filed anonymously",
            "C. Third -party observations are considered by the examining division only if they include a statement of the grounds on which they are based, and an official fee is paid",
            "D. Third -party observations can be filed after the publication of a European patent application and must be filed in an official language of the EPO"
        ]
    },
    {
        "question": "During oral proceedings, the opposition division decided to maintain the patent in amended form on the basis of a request in which the description contains a paragraph with handwrit

## Sanity check: verify that the new embeddings do not break MCQ generation

In [12]:
# Ollama model tag for this run (same value as the earlier assignment).
model = 'qwen2.5:7b'

# Load the FAISS index stored for the Alibaba-NLP/gte-large-en-v1.5 embedding model.
knowledge_vector_db = load_rag_embeddings(path='../embeddings/gte_large_en_v1_5/rag_embeddings_Alibaba-NLP_gte-large-en-v1.5', 
                                          embeddings_model='Alibaba-NLP/gte-large-en-v1.5', 
                                          device='cuda')

# Generate 100 questions against that index; the result is written to
# ../outputs/evaluation2/evaluate_mcq_questions_generation_newembeddings.json
evaluate_generate_mcq_question(save_dir='../outputs/evaluation2/', data_dir='../outputs/',
                            test_name=f'evaluate_mcq_questions_generation_newembeddings', model=model,
                            knowledge_vector_db=knowledge_vector_db, n=100)

embedding load
[
    {
        "question": "During the examination proceedings the applicant notes that claim 1 contains a typographical error. What is the last point in time for requesting correction of the error under Rule 139 EPC?",
        "options": [
            "A. One day before the decision to grant a European patent is handed over to the EPO postal service",
            "B. Date  of publication of the mention of the grant of the European patent",
            "C. Date of notification of the decision to grant",
            "D. Date of the communication under Rule 71(3) EPC"
        ]
    },
    {
        "question": "A British company buys a French firm with an IP portfolio which includes an international application that claims priority of a British national application. The international application is in French. The British company wants to pursue the invention as a European patent application. However, they would prefer that the language of the European proceedings be Engli