# Evaluation models

Compute real metrics to see how our models perform.

## Imports

In [1]:
import os
import json
import sys

# Import our AI functions
# Add src path
sys.path.append(os.path.abspath("../src/models"))
from utils_temp import *
from answer import *

  embedding_model = HuggingFaceEmbeddings(


## evaluate_generate_mcq_answer

In [2]:
def evaluate_generate_mcq_answer(save_dir: str, data_dir: str, test_name: str):
    """
    Evaluate generate_mcq_answer on the MCQ json files found in data_dir.

    Each question file "<name>.json" is paired with "<name>_solution.json".
    Every question is sent to generate_mcq_answer and the 'Answer' field of
    the result is compared with the 'Answer' field of the matching solution.
    Questions and solutions are paired by their position in the json files.

    Parameters:
    save_dir (str): Path to save the evaluation file (created if missing).
    data_dir (str): Folder to load the json mcq and mcq_solution files.
    test_name (str): Name of the test, used to name the saved file.

    Returns:
    Nothing but save "<test_name>.json" in save_dir. The file contains
    'informations' (total and correct answer counts) and 'data' (one entry
    per evaluated file with its questions, answers and ai_answers).
    """

    # One entry per evaluated question file
    data = []

    # Initialise total answer and correct answer for evaluation
    total_answer = 0
    correct_answer = 0

    # sorted() makes the evaluation order reproducible between runs
    for filename in sorted(os.listdir(data_dir)):
        # Ignore solution, non-json, MOCK, open-question and categories files
        if "solution" in filename or "json" not in filename or "MOCK" in filename or 'open' in filename or 'categories' in filename:
            continue

        # Load questions
        filepath = os.path.join(data_dir, filename)
        with open(filepath, 'r', encoding='utf-8') as file:
            questions = list(json.load(file).values())

        # Load solutions (same file name with a "_solution" suffix)
        answers = []
        solution_filepath = filepath.rsplit('.json', 1)[0] + "_solution.json"
        if os.path.exists(solution_filepath):
            with open(solution_filepath, 'r', encoding='utf-8') as file:
                answers = list(json.load(file).values())

        # Without one solution per question the file cannot be scored
        if len(answers) != len(questions):
            print(f'Warning: skipping {filename}: {len(questions)} questions but {len(answers)} solutions')
            continue

        # Generate and score the AI answers
        ai_answers = []
        for raw_question, answer in zip(questions, answers):
            question = str(raw_question)
            print(question)

            try:
                ai_answer = generate_mcq_answer(question)
            except json.JSONDecodeError as err:
                # Count an unparsable model output as a wrong answer instead of aborting the run
                ai_answer = {'Answer': 'None', 'Justification': f'Error when parsing json. {err}'}
            ai_answers.append(ai_answer)

            # Check if correct answer
            total_answer += 1
            if answer['Answer'] == ai_answer['Answer']:
                correct_answer += 1

            print(f'Debug: Score = {correct_answer/total_answer}')

        # Record this file's results (inside the loop so every file is kept)
        data.append({'filename': filename, 'questions': questions,
                     'answers': answers, 'ai_answers': ai_answers})

    # Build the final report
    informations = {'total_answer': total_answer, 'correct_answer': correct_answer}
    final_evaluation = {'informations': informations, 'data': data}

    # Save json
    os.makedirs(save_dir, exist_ok=True)
    output_path = os.path.join(save_dir, f"{test_name}.json")
    with open(output_path, 'w', encoding='utf-8') as file:
        json.dump(final_evaluation, file, indent=4, ensure_ascii=False)



In [3]:
# v0.1 ~ 28 min, max failed observed: 6
# Run the MCQ evaluation on the json files in ../outputs; the report is written to
# ../outputs/ai_answers_evaluation/evaluate_generate_mcq_answer_basemodel_improvedjsonconverter.json
evaluate_generate_mcq_answer(save_dir='../outputs/ai_answers_evaluation', 
                             data_dir='../outputs', 
                             test_name='evaluate_generate_mcq_answer_basemodel_improvedjsonconverter')

{'question': 'A communication pursuant to Article 94(3) EPC is dated 7 December 2022. In the communication a time limit of four months is set for replying to objections raised by the examining division.  Which of the following statements is not  correct?', 'options': ['A.  The communication is deemed to be delivered on 17 December 2022', 'B. The time limit for replying to the communication expires on 17 April 2023', 'C.  Further processing for replying to the communication can be validly requested on 19 June 2023 at the latest', 'D.  An extension of the time limit for filing the reply can be validly requested on 14 April 2023']}
Answer before parsing: C
Answer after parsing: C
Debug: Score = 1.0
{'question': 'You filed a European patent application which discloses a new amino acid sequence. This sequence is however only used in a single example and is not part of the claims. No sequence listing was filed.  Which of the following statements is correct?', 'options': ['A. If you are invit

KeyboardInterrupt: 

## evaluate_generate_open_answer

In [None]:
def extract_score(text: str) -> "int | None":
    """
    Extract the grade announced by the first `[SCORE] X` marker in text.

    The marker may be followed by any mix of whitespace, ':', '*' or '`'
    before the digit, so variants such as "[SCORE]: 4" or "**[SCORE]** 4"
    are accepted as well as the exact "[SCORE] 4" form.

    Parameters:
    text (str): Text to search.

    Returns:
    The single digit following the marker as an int, or None when no
    marker followed by a digit is found.
    """
    # Local import: the notebook's import cell does not import re explicitly
    import re

    match = re.search(r"\[SCORE\][\s:*`]*(\d)", text)
    return int(match.group(1)) if match else None

def big_model_evaluation(question: str, answer: str, ai_answer: str,
                         model: str = 'qwen2.5:1.5b') -> "tuple[int | None, str]":
    """
    Ask a judge model to grade an AI answer against the reference answer.

    The judge receives a grading rubric (accuracy, legal reasoning, use of
    sources) as system prompt and is asked to output a `[SCORE] X` marker,
    which is then parsed with extract_score.

    Parameters:
    question (str): Question (converted with str(), so a json object also works).
    answer (str): Real answer.
    ai_answer (str): AI answer to grade.
    model (str): Name of the judge model passed to chat. Defaults to 'qwen2.5:1.5b'.

    Returns:
    score (int | None): The score parsed from the judge output, or None when
        no `[SCORE] X` marker could be found.
    explanation (str): The full text returned by the judge model.
    """

    # Convert the inputs to strings, in case they are json objects.
    question = str(question)
    answer = str(answer)
    ai_answer = str(ai_answer)

    # Build prompts (the text is sent to the model as-is, indentation included)
    SYSTEM_PROMPT = """You are an AI expert in evaluating legal question-answering systems. Your task is to assess the performance of a Retrieval-Augmented Generation (RAG) model in answering legal examination questions.
    ### **Evaluation Criteria**:  
    1. **Accuracy (0-2 points):**  
    - 2: The answer is fully correct and aligns with the true answer.  
    - 1: The answer is partially correct but contains minor inaccuracies or incomplete reasoning.  
    - 0: The answer is incorrect.  

    2. **Legal Reasoning (0-2 points):**  
    - 2: The reasoning is well-structured, logical, and fully supported by legal sources.  
    - 1: The reasoning is somewhat logical but lacks clarity, depth, or partial sourcing.  
    - 0: The reasoning is weak, missing, or incorrect.  

    3. **Use of Sources (0-1 point):**  
    - 1: The answer correctly references legal sources supporting the argument.  
    - 0: The answer lacks proper sourcing or relies on incorrect sources.  

    ### **Output Format:**  
    Your evaluation must include:  
    - A **numerical grade (0-5)** formatted explicitly as `[SCORE] X`, where X is the final score.  
    - A **brief explanation** justifying the grade.  

    If the answer is incorrect or lacks sufficient reasoning, explain why. If the answer is correct but can be improved, provide suggestions.
    """

    user_prompt = f"""### **Legal Question:**  
    {question}  

    ### **True Answer:**  
    {answer}  

    ### **Model Answer:**  
    {ai_answer}  

    Evaluate the model answer based on:  
    1. **Accuracy** (0-2 points)  
    2. **Legal Reasoning** (0-2 points)  
    3. **Use of Sources** (0-1 point)  

    Provide a final score formatted as `[SCORE] X` and justify the score with a brief explanation.
    """

    # Query the judge model; kept in its own name so the `answer` argument is not overwritten
    response = chat(model=model,
                    messages=[{"role": "system", "content": SYSTEM_PROMPT},
                              {"role": "user", "content": user_prompt}])
    feedback = response['message']['content']

    # Extract score (None when the judge did not follow the output format)
    score = extract_score(feedback)

    return score, feedback
    



In [8]:
def evaluate_generate_open_answer(save_dir: str, data_dir: str, test_name: str):
    """
    Evaluate the open-question answers with a judge model.

    Each question file "<name>.json" is paired with "<name>_solution.json".
    An AI answer is produced for every question (currently a placeholder),
    then big_model_evaluation grades it against the solution. Questions and
    solutions are paired by their position in the json files.

    Parameters:
    save_dir (str): Path to save the evaluation file (created if missing).
    data_dir (str): Folder to load the json open-question and solution files.
    test_name (str): Name of the test, used to name the saved file.

    Returns:
    Nothing but save "<test_name>.json" in save_dir. The file contains
    'informations' (average score as a fraction of the maximum, or null when
    nothing could be scored, plus the number of unscored answers) and 'data'
    (one entry per evaluated file).
    """

    # One entry per evaluated question file
    data = []

    # Initialize grade
    sum_possible_score = 0
    sum_eval_score = 0
    unscored_answers = 0

    # sorted() makes the evaluation order reproducible between runs
    for filename in sorted(os.listdir(data_dir)):
        # Ignore solution, non-json, MOCK, mcq and categories files
        if "solution" in filename or "json" not in filename or "MOCK" in filename or 'mcq' in filename or 'categories' in filename:
            continue

        # Load questions
        filepath = os.path.join(data_dir, filename)
        with open(filepath, 'r', encoding='utf-8') as file:
            questions = list(json.load(file).values())

        # Load solutions (same file name with a "_solution" suffix)
        answers = []
        solution_filepath = filepath.rsplit('.json', 1)[0] + "_solution.json"
        if os.path.exists(solution_filepath):
            with open(solution_filepath, 'r', encoding='utf-8') as file:
                answers = list(json.load(file).values())

        # Without one solution per question the file cannot be graded
        if len(answers) != len(questions):
            print(f'Warning: skipping {filename}: {len(questions)} questions but {len(answers)} solutions')
            continue

        # Generate AI answers.
        # TODO: replace the placeholder with generate_open_answer(str(question)) once it is available.
        # The placeholder is a plain string (not a set) so the report stays JSON-serialisable.
        ai_answers = ['In dev' for _ in questions]

        # Append to data
        data.append({'filename': filename, 'questions': questions,
                     'answers': answers, 'ai_answers': ai_answers})

    # Get grade from the judge model
    for file_entry in data:
        eval_scores = []
        eval_feedbacks = []

        # Evaluate each question
        for question, answer, ai_answer in zip(file_entry['questions'], file_entry['answers'], file_entry['ai_answers']):
            eval_score, eval_feedback = big_model_evaluation(question, answer, ai_answer)
            eval_scores.append(eval_score)
            eval_feedbacks.append(eval_feedback)

            # The judge may not output a parsable score; keep the feedback but leave it out of the average
            if eval_score is None:
                unscored_answers += 1
                continue
            sum_possible_score += 5
            sum_eval_score += eval_score

        # Add evaluations lists to file
        file_entry['eval_scores'] = eval_scores
        file_entry['eval_feedbacks'] = eval_feedbacks

    # No average can be computed when nothing was scored
    avg_eval_score = sum_eval_score / sum_possible_score if sum_possible_score else None
    # NOTE: the 'averega_eval_score' spelling is kept so previously saved reports stay comparable
    informations = {'averega_eval_score': avg_eval_score, 'unscored_answers': unscored_answers}
    final_evaluation = {'informations': informations, 'data': data}

    # Save json
    os.makedirs(save_dir, exist_ok=True)
    output_path = os.path.join(save_dir, f"{test_name}.json")
    with open(output_path, 'w', encoding='utf-8') as file:
        json.dump(final_evaluation, file, indent=4, ensure_ascii=False)


In [9]:
# Run the open-question evaluation on the json files in ../outputs; the report is written to
# ../outputs/ai_answers_evaluation/evaluate_generate_open_answer_basemodel.json
evaluate_generate_open_answer(save_dir='../outputs/ai_answers_evaluation', 
                             data_dir='../outputs', 
                             test_name='evaluate_generate_open_answer_basemodel')

TypeError: int() argument must be a string, a bytes-like object or a real number, not 'NoneType'