# Evaluation models

Compute real metrics to see how our models perform.

## Imports

In [1]:
import os
import json
import sys
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
import ollama
from ollama import chat
import re

## Temp utils functions

In [None]:
# Ask the local Ollama server (port 11434) to load qwen2.5:7b with keep_alive = -1.
# NOTE(review): presumably -1 keeps the model resident until it is explicitly unloaded — confirm in the Ollama API docs.
!curl -X POST http://localhost:11434/api/generate -H "Content-Type: application/json" -d "{\"model\": \"qwen2.5:7b\", \"keep_alive\": -1}"
# To unload a model and free up memory
# !curl -X POST http://localhost:11434/api/generate -H "Content-Type: application/json" -d "{\"model\": \"qwen2.5:7b\", \"keep_alive\": 0}"

# Embedding model used to encode queries; the saved index path below carries the same model name.
EMBEDDING_MODEL_NAME = "thenlper/gte-small"
# Load embeddings
embedding_model = HuggingFaceEmbeddings(
    model_name=EMBEDDING_MODEL_NAME,
    multi_process=True,
    model_kwargs={"device": "cuda"},  # runs on an Nvidia GPU; switch to 'cpu' if none is available
    encode_kwargs={"normalize_embeddings": True},  # Set `True` for cosine similarity
)
# Load the saved FAISS index from disk.
# NOTE(review): allow_dangerous_deserialization=True — only load index files from a trusted source.
KNOWLEDGE_VECTOR_DATABASE = FAISS.load_local("../embeddings/rag_embeddings_thenlper_gte-small", embedding_model, allow_dangerous_deserialization=True)

# Reader model
MODEL = 'qwen2.5:7b'
# Generation cap, sent to Ollama as `num_predict` by generate_mcq_answer
MAX_OUTPUT_TOKENS = 3000

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed

  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
100   154  100   113  100    41    508    184 --:--:-- --:--:-- --:--:--   693


{"model":"qwen2.5:7b","created_at":"2025-03-28T01:58:45.8198017Z","response":"","done":true,"done_reason":"load"}


In [2]:
# Maximum number of tokens the reader model may generate per call (sent to Ollama as `num_predict`)
MAX_OUTPUT_TOKENS = 3000

### utils

In [35]:
def get_context(query, KNOWLEDGE_VECTOR_DATABASE=KNOWLEDGE_VECTOR_DATABASE, k=5):
    """
    Fetch the documents most similar to a query from the vector store.

    Parameters:
    query: The input query; it is rendered as text before searching.
    KNOWLEDGE_VECTOR_DATABASE: Vector store exposing `similarity_search`. The default
        is the module-level store as it existed when this function was defined.
    k (int, optional): The number of documents to retrieve (default is 5).

    Returns:
    Whatever `similarity_search` returns for the `k` closest documents.
    """
    # Render the query as text so non-string inputs (e.g. a dict) can be searched.
    query_text = f'{query}'
    return KNOWLEDGE_VECTOR_DATABASE.similarity_search(query=query_text, k=k)

In [36]:
def call_formatting_llm_mcq(llm_output, type, model):
    """
    Ask an LLM to rewrite free-form text as JSON in a fixed schema.

    Parameters:
    llm_output (str): Raw output from the initial LLM.
    type (str): 'question' or 'answer'; selects the target JSON schema.
    model (str): Name of the Ollama model used for the formatting call.

    Returns:
    str: The raw text of the model's reply. It is expected to contain JSON, but it
    is neither parsed nor validated here.

    Raises:
    ValueError: If `type` is neither 'question' nor 'answer'.
    """
    # NOTE: `type` shadows the builtin, but callers pass it by keyword, so the name stays.
    if type == 'question':
        SYSTEM_PROMPT = """You are an AI specialized in converting multiple-choice legal questions into JSON format.
        Ensure the output strictly follows this structure:
        ```json
        {"question": "...", "options": ["A ....", "B ...", "C ...", "D ..."]}
        """

    elif type == 'answer':
        SYSTEM_PROMPT = """You are an AI specialized in converting legal answer into JSON format.
        Ensure the output strictly follows this structure:
        ```json
        {
        "Answer": "...", 
        "Justification": "..."
        }
        """

    else:
        # Fail fast with a clear message instead of hitting an unbound SYSTEM_PROMPT below.
        raise ValueError(f"Unsupported type {type!r}: expected 'question' or 'answer'.")

    user_prompt = f"""
        The following text needs to be formatted as a valid JSON:
        {llm_output}
        
        Please convert it into the required JSON format.
        """

    response = chat(model=model, messages=[
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": user_prompt},
    ])

    content = response['message']['content']
    # Echo the formatted reply so it shows up in the notebook output.
    print(content)
    return content

In [37]:
def _extract_answer_fields(text_json_like):
    """Return (answer, justification) found in JSON-like text, or raise ValueError."""
    # First choice: parse the outermost {...} span as real JSON. This copes with
    # escaped quotes inside the justification and with surrounding ```json fences.
    start = text_json_like.find('{')
    end = text_json_like.rfind('}')
    if 0 <= start < end:
        try:
            # strict=False tolerates raw newlines inside string values.
            parsed = json.loads(text_json_like[start:end + 1], strict=False)
        except ValueError:  # json.JSONDecodeError is a ValueError subclass
            parsed = None
        if isinstance(parsed, dict):
            answer = parsed.get('Answer')
            justification = parsed.get('Justification')
            # Same requirement as the regex below: both fields present and non-empty.
            if isinstance(answer, str) and answer and isinstance(justification, str) and justification:
                return answer, justification

    # Fallback: capture the content of "Answer" and "Justification" with a regex.
    pattern = r'"Answer":\s*"([^"]+)"\s*,\s*"Justification":\s*"([^"]+)"'
    matches = re.search(pattern, text_json_like)
    if matches is None:
        raise ValueError("Can't extract data.")
    return matches.group(1), matches.group(2)


def clean_output_v2(text, type, model):
    """
    Turn a free-form LLM answer into a clean `{'Answer', 'Justification'}` dict.

    The text is first sent to the formatting LLM to obtain JSON-like output. That
    output is parsed as JSON when possible; if parsing fails, a regex extraction
    is used instead.

    Parameters:
    text (str): Raw output from the answering LLM.
    type (str): Only 'answer' is handled; any other value returns None
        ('question' cleaning is not implemented yet).
    model (str): Name of the Ollama model used for the formatting call.

    Returns:
    dict | None: `{'Answer': ..., 'Justification': ...}`. 'Answer' is reduced to a
    single letter A-D when one stands alone in the extracted answer text.

    Raises:
    ValueError: If neither JSON parsing nor the regex can extract both fields.
    """
    if type != 'answer':
        return None

    text_json_like = call_formatting_llm_mcq(text, type, model)
    answer, justification = _extract_answer_fields(text_json_like)

    # Keep only an isolated valid letter, e.g. "B." or "Answer B" -> "B".
    letter = re.search(r'\b[A-D]\b', answer)
    if letter:
        answer = letter.group()

    return {'Answer': answer, 'Justification': justification}


## generate_mcq_answer

In [38]:
def generate_mcq_answer(question_mcq: str, model) -> dict:
    """
    Answer a multiple-choice question using retrieved legal context.

    Parameters:
    question_mcq (str): The MCQ to answer; any other object is converted with `str()`.
    model (str): Name of the Ollama model used both to answer and to format the answer.

    Returns:
    dict: `{'Answer': 'A'|'B'|'C'|'D', 'Justification': str}`; the justification ends
    with the list of sources (ref and url) of the retrieved documents.

    Raises:
    ValueError: If no valid answer could be produced after 3 attempts.
    """

    # Convert the question in string, in case the question is a json.
    question_mcq = str(question_mcq)

    # Retrieve context
    retrieved_docs = get_context(question_mcq, k=5)
    context = "\nExtracted documents:\n"
    # The outer quotes differ from the inner ones so this also parses before Python 3.12.
    context += "".join(
        f"Content: {doc.page_content} \nSource: {doc.metadata['ref']}\n\n"
        for doc in retrieved_docs
    )
    context_sources = "".join(
        f"\nSource: {doc.metadata['ref']}, Url: {doc.metadata.get('url', 'N/A')}"
        for doc in retrieved_docs
    )

    # Build prompt
    SYSTEM_PROMPT = """
    You are an AI specialized in answering legal multiple-choice questions based on provided legal texts.
    ### Instructions:
    - When given a multiple-choice legal question, provide the correct answer followed by an explanation.
    - Your answer should begin with the correct choice (e.g., "Answer A").
    - After that, explain why this choice is correct based on the provided legal context.
    - Then, explain why the other choices (B, C, D) are incorrect, using relevant legal reasoning from the context.
    - Use the legal context provided to back up your reasoning.
    - Make sure to clearly distinguish between the correct answer and the incorrect ones.
    """

    user_prompt = f"""
    ### Context:
    {context}

    ### Legal Question:
    {question_mcq}

    Answer the question by:
    1. Starting with the correct answer (e.g., "Answer A").
    2. Explaining why this choice is correct according to the provided legal text.
    3. Explaining why the other options (B, C, D) are incorrect based on the legal context.
    """

    max_attempts = 3  # Limit number of attempts to prevent infinite loops

    for attempt_count in range(1, max_attempts + 1):
        answer_mcq = chat(
            model=model,
            messages=[
                {"role": "system", "content": SYSTEM_PROMPT},
                {"role": "user", "content": user_prompt},
            ],
            options={"num_predict": MAX_OUTPUT_TOKENS},
        )

        # Put answer in correct json format; any ValueError triggers a fresh generation.
        try:
            cleaned_answer_mcq = clean_output_v2(answer_mcq['message']['content'], type='answer', model=model)
            if cleaned_answer_mcq['Answer'] not in {'A', 'B', 'C', 'D'}:
                raise ValueError("La réponse doit être 'A', 'B', 'C' ou 'D'.")
        except ValueError:
            print(f"Attempt {attempt_count} failed. Retrying...")
            continue

        cleaned_answer_mcq['Justification'] += f'\n\nSources:\n{context_sources}'
        return cleaned_answer_mcq

    # All attempts failed
    raise ValueError("Failed to generate a valid MCQ after multiple attempts.")


### evaluate_generate_mcq_answer

In [39]:
def evaluate_generate_mcq_answer(save_dir: str, data_dir: str, test_name: str, model):
    """
    Evaluate `generate_mcq_answer` on every MCQ file found in `data_dir`.

    Parameters:
    save_dir (str): Folder where the evaluation file is written (created if missing).
    data_dir (str): Folder holding the MCQ json files and their `*_solution.json` files.
    test_name (str): Name of the test; the report is saved as `<test_name>.json`.
    model (str): Name of the Ollama model forwarded to `generate_mcq_answer`.

    Returns:
    Nothing, but saves a json file in save_dir with the totals under 'informations'
    and, per input file, the questions, reference answers and AI answers under 'data'.
    """
    data = []

    # Counters for the overall score
    total_answer = 0
    correct_answer = 0

    # Name fragments marking files that are not MCQ question files
    skip_markers = ("solution", "MOCK", "open", "categories")

    for filename in os.listdir(data_dir):
        if "json" not in filename or any(marker in filename for marker in skip_markers):
            continue

        # Load questions
        filepath = os.path.join(data_dir, filename)
        with open(filepath, 'r', encoding='utf-8') as file:
            questions = list(json.load(file).values())

        # Load reference answers; the list stays empty when there is no solution file
        answers = []
        solution_filepath = filepath.rsplit('.json', 1)[0] + "_solution.json"
        if os.path.exists(solution_filepath):
            with open(solution_filepath, 'r', encoding='utf-8') as file:
                answers = list(json.load(file).values())

        # Generate AI answers
        ai_answers = []
        for i, raw_question in enumerate(questions):
            question = str(raw_question)
            print(question)

            try:
                ai_answer = generate_mcq_answer(question, model)
            except ValueError as err:
                # generate_mcq_answer raises ValueError once its retries are exhausted
                # (json.JSONDecodeError is a ValueError subclass too). Record the
                # failure rather than aborting the whole run.
                ai_answer = {'Answer': 'None', 'Justification': f'Error when parsing json. {err}'}
            ai_answers.append(ai_answer)

            # Only questions that have a reference answer count towards the score
            if i < len(answers):
                total_answer += 1
                if answers[i]['Answer'] == ai_answer['Answer']:
                    correct_answer += 1

        data.append({'filename': filename, 'questions': questions,
                     'answers': answers, 'ai_answers': ai_answers})

    # Assemble the report once every file has been processed
    informations = {'total_answer': total_answer, 'correct_answer': correct_answer}
    final_evaluation = {'informations': informations, 'data': data}

    # Save json
    os.makedirs(save_dir, exist_ok=True)
    output_path = os.path.join(save_dir, f"{test_name}.json")
    with open(output_path, 'w', encoding='utf-8') as file:
        json.dump(final_evaluation, file, indent=4, ensure_ascii=False)



In [40]:
# v0.1 ~ 28 min, max failed observed: 6
# Reader model under evaluation and a label for this run; the report is written to
# <save_dir>/evaluate_generate_mcq_answer_<name>.json
MODEL = 'qwen2.5:7b'
name = 'qwen2_5_7b_context5_without_guidelines'

evaluate_generate_mcq_answer(save_dir='../outputs/ai_answers_evaluation', 
                             data_dir='../outputs', 
                             test_name= f'evaluate_generate_mcq_answer_{name}',
                             model = MODEL)

{'question': 'An international application is published , together  with the search report drawn up by the CNIPA , 18 m onths + 1 day after the filing date (no priority claimed). The application has no more than 35 pages and 15 claims. Which statement reflects all actions the CN  applicant  needs to take for entry into the EP phase 25 months  after the filing date ? A request for early processing has been filed.', 'options': ['A. Complete and file Form 1200 and pay the filing fee and the search fee', 'B. Complete and file Form 1200 and pay the filing fee, the search fee and the renewal fee for the third year', 'C. Complete and file Form 1200, pay the filing fee and the search fee and appoint a representative', 'D. None of the above statements']}
 ```json
{
    "Answer": "B.",
    "Justification": "According to the EPO's Guidelines for Examination in European Patent Applications (Guidelines C-V, March 2022 version), when an international application is published with a search report dra

### generate_mcq()

In [9]:
# Data for tests: hand-written MCQ fixtures passed as raw text to generate_mcq_answer.
# NOTE(review): the objects are not comma-separated and the second one is missing its
# closing brace, so this string is not valid JSON.
test1 = """ 
{
        "question": "A communication pursuant to Article 94(3) EPC is dated 7 December 2022. In the communication a time limit of four months is set for replying to objections raised by the examining division.  Which of the following statements is not  correct?",
        "options": [
            "A.  The communication is deemed to be delivered on 17 December 2022",
            "B. The time limit for replying to the communication expires on 17 April 2023",
            "C.  Further processing for replying to the communication can be validly requested on 19 June 2023 at the latest",
            "D.  An extension of the time limit for filing the reply can be validly requested on 14 April 2023"
        ]
}
{
        "question": "You filed a European patent application which discloses a new amino acid sequence. This sequence is however only used in a single example and is not part of the claims. No sequence listing was filed.  Which of the following statements is correct?",
        "options": [
            "A. If you are invited by the EPO to file a sequence listing, you need to file it and pay a late furnishing fee to the EPO",
            "B. If you are invited by the EPO to file a sequence listing and do not react to this invitation in due time, the application is deemed to be withdrawn",
            "C. If you are invited by the EPO to file a sequence listing, you need to file it in PDF format",
            "D. You do not need to file any sequence listing because the amino acid sequence is not claimed"
        ]
 {
        "question": "During the examination proceedings the applicant notes that claim 1 contains a typographical error. What is the last point in time for requesting correction of the error under Rule 139 EPC?",
        "options": [
            "A. One day before the decision to grant a European patent is handed over to the EPO postal service",
            "B. Date  of publication of the mention of the grant of the European patent",
            "C. Date of notification of the decision to grant",
            "D. Date of the communication under Rule 71(3) EPC"
        ]
}
"""

# Second raw-text MCQ fixture.
# NOTE(review): the first object is missing its closing brace, so this string is not valid JSON.
test2 = """ 
{
        "question": "Antonia, an applicant living in Argentina and having Argentinian nationality, intends to file a European patent application EP -A and an international patent application PCT- A.  Which of the following statements is correct?",
        "options": [
            "A. Antonia can validly file PCT- A with the International Bureau of WIPO",
            "B. Antonia can validly file PCT -A with the EPO",
            "C. Antonia can validly file EP -A with the EPO",
            "D. None of the above"
        ]
    {
        "question": "Applicant X filed European patent application EP -X on 23 August 2019 as a first filing. The applicant filed application EP -Y as a divisional application of EP- X on 26 September 2022. Please consider the following statements:   2 I. The renewal fee for the fourth year for EP -X was due on 31 August 2022; the renewal fees for the third and fourth years for EP -Y were due on 26 September 2022 II. The renewal fee for the fourth year for EP -X can be validly paid on 28 February 2023 together with the necessary additional fee; the renewal fees for the third and fourth years for EP -Y can be validly paid without additional fee up to 31 January 2023  Which of the following items applies?",
        "options": [
            "A. Only statement I is correct",
            "B. Only statement II is correct",
            "C. Both statements I and II are correct",
            "D. Both statements I and II are incorrect"
        ]
    },
 {
        "question": "The European Patent Office as receiving Office for an international patent application invites the applicant to provide a missing abstract within a period of two months. The invitation is dated 2 December 2022 and is received by the applicant on 12 December 2022. The date of receipt stamped on the communication by the mail service provides evidence of the receipt date. What is the final date for responding to the invitation?",
        "options": [
            "A. 2 February 2023",
            "B. 13 February 2023",
            "C. 5 February 2023",
            "D. 6 February 2023"
        ]
}
"""

# Third raw-text MCQ fixture: three complete objects written back to back with no
# separators, so the string as a whole is not a single valid JSON document.
test3 = """ 
 {
        "question": "An international patent application was filed on 17 February 2022 claiming priority from a European patent application filed on 17 February 2021. The international search report together with the written opinion was transmitted to the applicant on 10 June 2022.   What is the last day for filing a demand for international preliminary examination?",
        "options": [
            "A. 17 December 2022",
            "B. 19 December 2022",
            "C. 10 September 2022",
            "D. 12 September 2022"
        ]
    }
 {
        "question": "A professional representative acts as agent for the following actions:  I.  Withdrawing an international patent application at the European Patent Office as receiving Office  II.  Filing a demand for international preliminary examination for an international patent application at the European Patent Office as International Preliminary Examining Authority   Which of the following statements is correct?",
        "options": [
            "A.  For both I and II a power of attorney signed by each applicant is always required",
            "B.  For I a power of attorney signed by each applicant is always required",
            "C.  For II a power of attorney signed by each applicant is always required",
            "D.  For both I and II the requirement to submit a power of attorney signed by each applicant may be waived   3"
        ]
    }
 {
        "question": "Which of the following steps is not a minimum requirement for entry into the European phase?",
        "options": [
            "A.  Supplying a translation if the Euro- PCT application was not published in one of the EPO's official languages",
            "B.  Specifying the application documents on which the European grant procedure is to be based",
            "C.  Filing the designation of the inventor",
            "D.  Paying the search fee if a supplementary European search report is to be drawn up"
        ]
    }
"""


In [None]:
from ollama import Client

def get_ollama_client(host='http://localhost:11434/', model_name='qwen2.5:7b'):
    """
    Create an Ollama client and pull the requested model through it.

    Parameters:
    host (str): Base URL of the Ollama server (defaults to the local server).
    model_name (str): Model to pull before the client is returned.

    Returns:
    Client: The `ollama.Client` bound to `host`.
    """
    client = Client(host)
    # Pull up front so that a missing model surfaces here rather than in the first chat call.
    client.pull(model_name)
    return client
# Shared client and generation settings, read as globals by generate_mcq_answer below
ollama_client = get_ollama_client()
model, max_output_tokens = "qwen2.5:7b", 3000


def generate_mcq_answer(question_mcq: str, knowledge_vector_db: FAISS) -> dict:
    """
    Answer a multiple-choice question using context retrieved from a given vector store.

    Relies on the module-level `ollama_client`, `model` and `max_output_tokens`.

    Parameters:
    question_mcq (str): The MCQ to answer; any other object is converted with `str()`.
    knowledge_vector_db (FAISS): Vector store searched for the supporting context.

    Returns:
    dict: `{'Answer': 'A'|'B'|'C'|'D', 'Justification': str}`; the justification ends
    with the list of sources (ref and url) of the retrieved documents.

    Raises:
    ValueError: If no valid answer could be produced after 3 attempts.
    """
    # Convert the question in string, in case the question is a json.
    question_mcq = str(question_mcq)

    # Retrieve context. Keywords are required here: get_context's second positional
    # parameter is the vector store, not `k`.
    retrieved_docs = get_context(question_mcq, KNOWLEDGE_VECTOR_DATABASE=knowledge_vector_db, k=5)
    context = "\nExtracted documents:\n"
    # The outer quotes differ from the inner ones so this also parses before Python 3.12.
    context += "".join(
        f"Content: {doc.page_content} \nSource: {doc.metadata['ref']}\n\n"
        for doc in retrieved_docs
    )
    context_sources = "".join(
        f"\nSource: {doc.metadata['ref']}, Url: {doc.metadata.get('url', 'N/A')}"
        for doc in retrieved_docs
    )

    # Build prompt
    system_prompt = """
    You are an AI specialized in answering legal multiple-choice questions based on provided legal texts.
    ### Instructions:
    - When given a multiple-choice legal question, provide the correct answer followed by an explanation.
    - Your answer should begin with the correct choice (e.g., "Answer A").
    - After that, explain why this choice is correct based on the provided legal context.
    - Then, explain why the other choices (B, C, D) are incorrect, using relevant legal reasoning from the context.
    - Use the legal context provided to back up your reasoning.
    - Make sure to clearly distinguish between the correct answer and the incorrect ones.
    """

    user_prompt = f"""
    ### Context:
    {context}

    ### Legal Question:
    {question_mcq}

    Answer the question by:
    1. Starting with the correct answer (e.g., "Answer A").
    2. Explaining why this choice is correct according to the provided legal text.
    3. Explaining why the other options (B, C, D) are incorrect based on the legal context.
    """

    max_attempts = 3  # Limit number of attempts to prevent infinite loops

    for attempt_count in range(1, max_attempts + 1):
        answer_mcq = ollama_client.chat(
            model=model,
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt},
            ],
            options={"num_predict": max_output_tokens},
        )

        # Put answer in correct json format; any ValueError triggers a fresh generation.
        try:
            # clean_output_v2 needs the model name for its formatting call.
            cleaned_answer_mcq = clean_output_v2(answer_mcq['message']['content'], type='answer', model=model)
            # Reject anything that is not a single option letter, as the evaluation expects.
            if cleaned_answer_mcq['Answer'] not in {'A', 'B', 'C', 'D'}:
                raise ValueError("Answer must be 'A', 'B', 'C' or 'D'.")
        except ValueError:
            print(f"Attempt {attempt_count} failed. Retrying...")
            continue

        # Add context to Justification
        cleaned_answer_mcq['Justification'] += f'\n\nSources:\n{context_sources}'
        return cleaned_answer_mcq

    # All attempts failed
    raise ValueError("Failed to generate a valid MCQ after multiple attempts.")

In [None]:
# Smoke test: answer the raw-text fixture test1 using the loaded vector store
generate_mcq_answer(test1, KNOWLEDGE_VECTOR_DATABASE)

In [None]:
# Smoke test: answer the raw-text fixture test2 using the loaded vector store
generate_mcq_answer(test2, KNOWLEDGE_VECTOR_DATABASE)

In [None]:
# Smoke test: answer the raw-text fixture test3 using the loaded vector store
generate_mcq_answer(test3, KNOWLEDGE_VECTOR_DATABASE)

## Embeddings

In [41]:
import torch
from transformers import AutoTokenizer, AutoModel
from langchain.docstore.document import Document as LangchainDocument
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_community.vectorstores.utils import DistanceStrategy
import pandas as pd
import numpy as np 
from tqdm import tqdm
from typing import List
import json
import glob
import os

In [42]:
def load_csv(data_dir):
    """
    Read the CSV files of `data_dir` and wrap every row as a LangChain document.

    Each row's "content" column becomes the page content; "ref" and "url" go into
    the metadata. With no CSV to read, the result is an empty list.
    """
    # Temporary, for testing: CSVs whose path contains "guidelines" are left out.
    csv_files = []
    for path in glob.glob(f"{data_dir}/*.csv"):
        if "guidelines" in path.lower():
            continue
        csv_files.append(path)

    frames = [pd.read_csv(path) for path in csv_files]
    if frames:
        df = pd.concat(frames, ignore_index=True)
    else:
        df = pd.DataFrame(columns=["ref", "url", "content"])

    # Build the list of documents for LangChain, with a progress bar over the rows
    documents = []
    for _, row in tqdm(df.iterrows(), total=len(df)):
        metadata = {"ref": row["ref"], "url": row["url"]}
        documents.append(LangchainDocument(page_content=row["content"], metadata=metadata))
    return documents


def load_exam_solutions(data_dir):
    """
    Build LangChain documents from the `*solution*.json` files of `data_dir`.

    For files with "open" in their name, every entry is used as page content as is.
    For the other files, only entries containing a "Justification" are kept and that
    justification is the page content. The metadata "ref" is the file name.
    """
    documents = []
    for path in glob.glob(f"{data_dir}/*solution*.json"):
        with open(path, "r", encoding="utf-8") as handle:
            solutions = json.load(handle)

        # File name without its directory, used as the document reference
        filename = os.path.basename(path)

        if "open" in filename:
            # Open-question solutions: keep every entry in full
            documents.extend(
                LangchainDocument(page_content=entry, metadata={"ref": filename})
                for entry in solutions.values()
            )
            continue

        # Other solutions: keep only the justification text, when there is one
        documents.extend(
            LangchainDocument(page_content=entry["Justification"], metadata={"ref": filename})
            for entry in solutions.values()
            if "Justification" in entry
        )
    return documents
    

def split_documents(chunk_size: int, knowledge_base: List[LangchainDocument], tokenizer_name: str, markdown_separators) -> List[LangchainDocument]:
    """
    Chunk every document of `knowledge_base` and drop chunks with repeated text.

    The splitter is built from the Hugging Face tokenizer `tokenizer_name`, with
    `chunk_size` as chunk size, a tenth of it as overlap, and `markdown_separators`
    as separators. When several chunks share the same text, only the first is kept.
    """
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
    splitter = RecursiveCharacterTextSplitter.from_huggingface_tokenizer(
        tokenizer,
        chunk_size=chunk_size,
        chunk_overlap=int(chunk_size / 10),
        add_start_index=True,
        strip_whitespace=True,
        separators=markdown_separators,
    )

    seen_texts = set()
    unique_chunks = []
    for doc in knowledge_base:
        for chunk in splitter.split_documents([doc]):
            # Skip any chunk whose text has already been collected
            if chunk.page_content in seen_texts:
                continue
            seen_texts.add(chunk.page_content)
            unique_chunks.append(chunk)

    return unique_chunks


def create_rag_embeddings(model_name, markdown_separators, data_dir, output_dir, device="cpu", chunk_size=512):
    """
    Build a FAISS index from the CSV and exam-solution files of `data_dir` and save it.

    Parameters:
    model_name (str): Hugging Face model used both as tokenizer for chunking and as
        embedding model.
    markdown_separators (list): Separators handed to the text splitter.
    data_dir (str): Folder containing the CSV files and `*solution*.json` files.
    output_dir (str): Folder under which the index is saved, as
        `rag_embeddings_<model_name with '/' replaced by '_'>`.
    device (str): Device given to the embedding model ("cpu" or "cuda").
    chunk_size (int): Chunk size passed to `split_documents` (default 512).

    Returns:
    Nothing; the index is written to disk.
    """
    csv_raw_knowledge = load_csv(data_dir)
    exam_raw_knowledge = load_exam_solutions(data_dir)
    raw_knowledge_base = csv_raw_knowledge + exam_raw_knowledge

    docs_processed = split_documents(
        chunk_size,  # We choose a chunk size adapted to our model
        raw_knowledge_base,
        model_name,
        markdown_separators,
    )

    embedding_model = HuggingFaceEmbeddings(
        model_name=model_name,
        multi_process=True,
        model_kwargs={"device": device},
        encode_kwargs={"normalize_embeddings": True},  # Set `True` for cosine similarity
    )

    # Compute embeddings (can take time ~7min on my laptop)
    knowledge_vector_database = FAISS.from_documents(
        docs_processed, embedding_model, distance_strategy=DistanceStrategy.COSINE
    )
    # Save embeddings
    knowledge_vector_database.save_local(f"{output_dir}/rag_embeddings_{model_name.replace('/', '_')}")


# Separators handed to the text splitter, listed from paragraph breaks down to the empty string.
markdown_separators = ["\n\n", "\n", ".", " ", ""]

# Input folder (CSV and exam JSON files) and root folder for the saved embeddings.
data_dir = '../outputs/'
output_dir = '../embeddings/'

### Generate embeddings

In [44]:
# Embedding model for this experiment and the folder that will receive its index
# (the index itself is saved in a `rag_embeddings_<model>` subfolder of it).
model_rag = 'thenlper/gte-small'
experiance_name = output_dir + 'gte_small_without_guidelines'

# Build and save the index on the GPU
create_rag_embeddings(model_rag, markdown_separators, data_dir, experiance_name, device="cuda")

100%|██████████| 2501/2501 [00:00<00:00, 26472.80it/s]


In [30]:
# Load embeddings: rebuild the embedding model and reload the index saved for this experiment
embedding_model = HuggingFaceEmbeddings(
    model_name=model_rag,
    multi_process=True,
    model_kwargs={"device": "cuda"},  # runs on an Nvidia GPU; switch to 'cpu' if none is available
    encode_kwargs={"normalize_embeddings": True},  # Set `True` for cosine similarity
)
# NOTE(review): allow_dangerous_deserialization=True — only load index files from a trusted source.
KNOWLEDGE_VECTOR_DATABASE = FAISS.load_local(f"{experiance_name}/rag_embeddings_thenlper_gte-small", embedding_model, allow_dangerous_deserialization=True)

## evaluate_generate_open_answer (not used)

In [5]:
def extract_score(text):
    """
    Return the digit that follows the first `[SCORE]` tag in `text`, or None.

    Whitespace, colons and markdown asterisks between the tag and the digit are
    tolerated, so `[SCORE] 4`, `[SCORE]: 4` and `**[SCORE]** 4` all give 4.
    """
    match = re.search(r"\[SCORE\][\s:*]*(\d)", text)
    return int(match.group(1)) if match else None

def big_model_evaluation(question: str, answer: str, ai_answer: str, model: str = 'qwen2.5:1.5b') -> tuple[int | None, str]:
    """
    Ask a judge LLM to grade an AI answer against the reference answer.

    Parameters:
    question (str): Question.
    answer (str): Real (reference) answer.
    ai_answer (str): AI answer to grade.
    model (str): Ollama model used as judge (default 'qwen2.5:1.5b').

    Returns:
    score (int | None): The digit found after `[SCORE]` in the judge's reply (the
        prompt asks for 0 to 5), or None when the reply contains no such tag.
    explanation (str): The judge's full reply.
    """
    # Convert the inputs in string, in case they are json-like objects.
    question = str(question)
    answer = str(answer)
    ai_answer = str(ai_answer)

    # Build prompt
    SYSTEM_PROMPT = """You are an AI expert in evaluating legal question-answering systems. Your task is to assess the performance of a Retrieval-Augmented Generation (RAG) model in answering legal examination questions.
    ### **Evaluation Criteria**:  
    1. **Accuracy (0-2 points):**  
    - 2: The answer is fully correct and aligns with the true answer.  
    - 1: The answer is partially correct but contains minor inaccuracies or incomplete reasoning.  
    - 0: The answer is incorrect.  

    2. **Legal Reasoning (0-2 points):**  
    - 2: The reasoning is well-structured, logical, and fully supported by legal sources.  
    - 1: The reasoning is somewhat logical but lacks clarity, depth, or partial sourcing.  
    - 0: The reasoning is weak, missing, or incorrect.  

    3. **Use of Sources (0-1 point):**  
    - 1: The answer correctly references legal sources supporting the argument.  
    - 0: The answer lacks proper sourcing or relies on incorrect sources.  

    ### **Output Format:**  
    Your evaluation must include:  
    - A **numerical grade (0-5)** formatted explicitly as `[SCORE] X`, where X is the final score.  
    - A **brief explanation** justifying the grade.  

    If the answer is incorrect or lacks sufficient reasoning, explain why. If the answer is correct but can be improved, provide suggestions.
    """

    user_prompt = f"""### **Legal Question:**  
    {question}  

    ### **True Answer:**  
    {answer}  

    ### **Model Answer:**  
    {ai_answer}  

    Evaluate the model answer based on:  
    1. **Accuracy** (0-2 points)  
    2. **Legal Reasoning** (0-2 points)  
    3. **Use of Sources** (0-1 point)  

    Provide a final score formatted as `[SCORE] X` and justify the score with a brief explanation.
    """

    # Query the judge; the reply gets its own name so the `answer` argument is not overwritten.
    response = chat(
        model=model,
        messages=[
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": user_prompt},
        ],
    )
    feedback = response['message']['content']

    # Extract score
    return extract_score(feedback), feedback
    



In [6]:
def evaluate_generate_open_answer(save_dir: str, data_dir: str, test_name: str):
    """
    Evaluate open-question answers by having a judge LLM grade them.

    Answer generation is still in development, so every AI answer is currently the
    placeholder 'In dev'.

    Parameters:
    save_dir (str): Folder where the evaluation file is written (created if missing).
    data_dir (str): Folder holding the open-question json files and their
        `*_solution.json` files.
    test_name (str): Name of the test; the report is saved as `<test_name>.json`.

    Returns:
    Nothing, but saves a json file in save_dir. 'informations' holds the average
    score as a fraction of the maximum (None when nothing could be graded); 'data'
    holds, per input file, the questions, answers, AI answers, scores and feedbacks.
    """
    data = []

    # Running totals for the overall grade
    sum_possible_score = 0
    sum_eval_score = 0

    # Name fragments marking files that are not open-question files
    skip_markers = ("solution", "MOCK", "mcq", "categories")

    for filename in os.listdir(data_dir):
        if "json" not in filename or any(marker in filename for marker in skip_markers):
            continue

        # Load questions
        filepath = os.path.join(data_dir, filename)
        with open(filepath, 'r', encoding='utf-8') as file:
            questions = list(json.load(file).values())

        # Load reference answers; the list stays empty when there is no solution file
        answers = []
        solution_filepath = filepath.rsplit('.json', 1)[0] + "_solution.json"
        if os.path.exists(solution_filepath):
            with open(solution_filepath, 'r', encoding='utf-8') as file:
                answers = list(json.load(file).values())

        # Placeholder AI answers: a plain string keeps the report JSON-serialisable
        # (a set would make json.dump fail).
        # ai_answer = generate_open_answer(question)
        ai_answers = ['In dev' for _ in questions]

        data.append({'filename': filename, 'questions': questions,
                     'answers': answers, 'ai_answers': ai_answers})

    # Get grade from the judge model
    for file in data:
        eval_scores = []
        eval_feedbacks = []

        # zip() stops at the shortest list, so questions without a reference answer
        # are not graded instead of raising an IndexError.
        for question, answer, ai_answer in zip(file['questions'], file['answers'], file['ai_answers']):
            eval_score, eval_feedback = big_model_evaluation(question, answer, ai_answer)
            eval_scores.append(eval_score)
            eval_feedbacks.append(eval_feedback)
            # A reply without a parsable score is recorded but left out of the average.
            if eval_score is not None:
                sum_possible_score += 5
                sum_eval_score += eval_score

        file['eval_scores'] = eval_scores
        file['eval_feedbacks'] = eval_feedbacks

    # No graded answer means no average, rather than a division by zero
    avg_eval_score = sum_eval_score / sum_possible_score if sum_possible_score else None
    # NOTE(review): the key is misspelled ('averega'); kept as is because readers of
    # the report may depend on it.
    informations = {'averega_eval_score': avg_eval_score}
    final_evaluation = {'informations': informations, 'data': data}

    # Save json
    os.makedirs(save_dir, exist_ok=True)
    output_path = os.path.join(save_dir, f"{test_name}.json")
    with open(output_path, 'w', encoding='utf-8') as file:
        json.dump(final_evaluation, file, indent=4, ensure_ascii=False)


In [10]:
# Run the open-answer evaluation; the report goes to <save_dir>/<test_name>.json.
# NOTE: the cell defining evaluate_generate_open_answer must have been run first,
# otherwise this call fails with a NameError.
evaluate_generate_open_answer(save_dir='../outputs/ai_answers_evaluation', 
                             data_dir='../outputs', 
                             test_name='evaluate_generate_open_answer_basemodel')

NameError: name 'evaluate_generate_open_answer' is not defined