# Road to 100% Accuracy



## Imports

In [11]:
import sys
import os
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_community.vectorstores.utils import DistanceStrategy
import torch
from transformers import AutoTokenizer, AutoModel
from langchain.docstore.document import Document as LangchainDocument
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_community.vectorstores.utils import DistanceStrategy
import pandas as pd
import numpy as np 
from tqdm import tqdm
from typing import List
import json
import glob
import ollama
from ollama import chat
import re

# os.chdir(os.path.abspath('../../'))
# sys.path.append(os.getcwd())
# print("Working Directory:", os.getcwd())
# # from ai.src.create_embeddings import split_documents, load_exam_solutions, load_csv
# from ai.src.embedding_loader import load_rag_embeddings
# data_dir = os.path.abspath('ai/outputs/')
# output_dir = os.path.abspath('ai/embeddings/')

# Folder scanned for the CSV and JSON knowledge sources (see load_csv / load_exam_solutions).
data_dir = '../outputs'
# Folder under which the FAISS indexes built by create_rag_embeddings are saved.
output_dir = '../embeddings'

## Functions imported

### create_embeddings.py

In [12]:
# Separator list handed to RecursiveCharacterTextSplitter as `separators`,
# ordered from the coarsest boundary (blank line) to the finest (empty string).
markdown_separators = ["\n\n", "\n", ".", " ", ""]

def load_csv(data_dir):
    """
    Read every ``*.csv`` file directly inside `data_dir` and wrap each row
    as a LangChain document.

    Parameters:
    data_dir (str): Folder that is globbed for ``*.csv`` files.

    Returns:
    list: One LangchainDocument per CSV row, with the "content" column as
    page content and the "ref" / "url" columns as metadata. Empty when no
    CSV file is found.
    """
    frames = []
    for csv_path in glob.glob(f"{data_dir}/*.csv"):
        frames.append(pd.read_csv(csv_path))

    if frames:
        table = pd.concat(frames, ignore_index=True)
    else:
        # No CSV at all: fall back to an empty table with the expected columns.
        table = pd.DataFrame(columns=["ref", "url", "content"])

    documents = []
    for _, record in tqdm(table.iterrows(), total=len(table)):
        documents.append(
            LangchainDocument(
                page_content=record["content"],
                metadata={"ref": record["ref"], "url": record["url"]},
            )
        )
    return documents


def load_exam_solutions(data_dir):
    """
    Turn the exam solution JSON files of `data_dir` into LangChain documents.

    Only files whose name matches ``*solution*.json`` are read. For files
    whose name contains "open", every value of the JSON object becomes a
    document. For all other files, only entries that contain a
    "Justification" key are kept, and that justification is the page content.

    Parameters:
    data_dir (str): Folder that is globbed for solution files.

    Returns:
    list: LangchainDocument objects whose "ref" metadata is the source file name.
    """
    documents = []

    for path in glob.glob(f"{data_dir}/*solution*.json"):
        with open(path, "r", encoding="utf-8") as handle:
            payload = json.load(handle)

        # File name without its directory; used as the document reference.
        filename = os.path.basename(path)
        # Files with "open" in their name are ingested in full.
        keep_everything = "open" in filename

        for entry in payload.values():
            if keep_everything:
                documents.append(
                    LangchainDocument(page_content=entry, metadata={"ref": filename})
                )
            elif "Justification" in entry:
                documents.append(
                    LangchainDocument(
                        page_content=entry["Justification"],
                        metadata={"ref": filename},
                    )
                )

    return documents


def split_documents(chunk_size: int, knowledge_base: List[LangchainDocument], tokenizer_name: str, markdown_separators) -> List[LangchainDocument]:
    """
    Cut every document of `knowledge_base` into chunks of at most `chunk_size`
    tokens and drop chunks whose text was already produced.

    Parameters:
    chunk_size (int): Maximum chunk length, measured with the tokenizer below.
        The overlap between consecutive chunks is a tenth of this value.
    knowledge_base (list): Documents to split.
    tokenizer_name (str): Name passed to AutoTokenizer.from_pretrained.
    markdown_separators (list): Separators handed to the text splitter.

    Returns:
    list: The chunks in production order, keeping the first occurrence of
    each distinct page content.
    """
    splitter = RecursiveCharacterTextSplitter.from_huggingface_tokenizer(
        AutoTokenizer.from_pretrained(tokenizer_name),
        chunk_size=chunk_size,
        chunk_overlap=int(chunk_size / 10),
        add_start_index=True,
        strip_whitespace=True,
        separators=markdown_separators,
    )

    all_chunks = []
    for document in knowledge_base:
        all_chunks.extend(splitter.split_documents([document]))

    # De-duplicate on the chunk text while preserving the original order.
    seen_texts = set()
    unique_chunks = []
    for chunk in all_chunks:
        if chunk.page_content in seen_texts:
            continue
        seen_texts.add(chunk.page_content)
        unique_chunks.append(chunk)

    return unique_chunks


def create_rag_embeddings(model_name, chunk_size, markdown_separators, data_dir, output_dir, device="cuda"):
    """
    Build a FAISS index from the CSV and exam-solution sources and save it.

    Parameters:
    model_name (str): Hugging Face model used both as tokenizer for chunking
        and as embedding model.
    chunk_size (int): Maximum chunk size in tokens.
    markdown_separators (list): Separators used when splitting documents.
    data_dir (str): Folder holding the CSV / JSON sources.
    output_dir (str): Folder under which the index is written, as
        ``rag_embeddings_<model name with '/' replaced by '_'>_chunk<chunk_size>``.
    device (str): Device given to the embedding model (default "cuda").

    Returns:
    None. The index is written to disk.
    """
    # Gather the raw knowledge base: CSV rows first, then exam solutions.
    raw_knowledge_base = load_csv(data_dir) + load_exam_solutions(data_dir)

    chunks = split_documents(
        chunk_size,
        raw_knowledge_base,
        model_name,
        markdown_separators,
    )

    embedder = HuggingFaceEmbeddings(
        model_name=model_name,
        multi_process=True,
        model_kwargs={"device": device, "trust_remote_code": True},
        # Normalised vectors, as wanted for cosine similarity.
        encode_kwargs={"normalize_embeddings": True},
    )

    # Computing the embeddings is the slow step of this function.
    vector_store = FAISS.from_documents(
        chunks, embedder, distance_strategy=DistanceStrategy.COSINE
    )

    index_dir = f"{output_dir}/rag_embeddings_{model_name.replace('/', '_')}_chunk{chunk_size}"
    vector_store.save_local(index_dir)

### embeddings loader

In [13]:
def load_rag_embeddings(path, embeddings_model, device='cuda'):
    """
    Load a FAISS index previously saved on disk.

    Parameters:
    path (str): Folder containing the saved index.
    embeddings_model (str): Hugging Face embedding model name; it should be
        the model the index was built with.
    device (str): Device given to the embedding model (default 'cuda').

    Returns:
    FAISS: The loaded vector store.
    """
    encoder = HuggingFaceEmbeddings(
        model_name=embeddings_model,
        multi_process=True,
        model_kwargs={"device": device},
        # Normalised vectors, as wanted for cosine similarity.
        encode_kwargs={"normalize_embeddings": True},
    )

    # NOTE(review): allow_dangerous_deserialization=True unpickles the stored
    # docstore; only load index folders that were produced locally / are trusted.
    vector_store = FAISS.load_local(
        path,
        encoder,
        allow_dangerous_deserialization=True,
    )
    print("embedding load")
    return vector_store

### generate_mcq_answer

In [14]:
# Generation cap passed to the reader model as the `num_predict` option.
MAX_OUTPUT_TOKENS = 3000

In [15]:
def get_context(query: str, k: int, knowledge_vector_db: FAISS):
    """
    Fetch the documents of the vector store that are most similar to a query.

    Parameters:
    query (str): The input query for which context is needed.
    k (int): The number of documents to retrieve (required, no default).
    knowledge_vector_db (FAISS): Vector store to search.

    Returns:
    list: The documents returned by the similarity search.
    """
    return knowledge_vector_db.similarity_search(query=query, k=k)

In [16]:
def call_formatting_llm_mcq(llm_output, type, model):
    """
    Calls an LLM specialized in formatting text into the correct JSON format.

    Parameters:
    llm_output (str): Raw output from the initial LLM.
    type (str): Either 'question' or 'answer'; selects the target JSON schema.
    model (str): Name of the Ollama model used for the formatting call.

    Returns:
    str: The raw text returned by the formatting model. It is expected to be
    JSON-like but is NOT parsed or validated here.

    Raises:
    ValueError: If `type` is neither 'question' nor 'answer'. The check runs
    before any model call, so an invalid type never costs a request.
    """
    # NOTE: `type` shadows the builtin; the name is kept for caller compatibility.
    if type == 'question':
        SYSTEM_PROMPT = """You are an AI specialized in converting multiple-choice legal questions into JSON format.
        Ensure the output strictly follows this structure:
        ```json
        {"question": "...", "options": ["A ....", "B ...", "C ...", "D ..."]}
        """

    elif type == 'answer':
        SYSTEM_PROMPT = """You are an AI specialized in converting legal answer into JSON format.
        Ensure the output strictly follows this structure:
        ```json
        {
        "Answer": "...", 
        "Justification": "..."
        }
        """

    else:
        # Previously this fell through and crashed later on an unbound SYSTEM_PROMPT.
        raise ValueError(f"Unsupported type {type!r}: expected 'question' or 'answer'.")

    user_prompt = f"""
        The following text needs to be formatted as a valid JSON:
        {llm_output}
        
        Please convert it into the required JSON format.
        """

    response = chat(model=model, messages=[
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": user_prompt},
    ])

    return response['message']['content']

In [17]:
def clean_output_v2(text, type, model):
    """
    Convert the free-text answer of the reader model into a structured dict.

    A second model call first rewrites `text` in a JSON-like style. The result
    is then parsed as real JSON; if that fails (or the expected string fields
    are missing), a regex extraction is used as a fallback.

    Parameters:
    text (str): Raw output of the reader model.
    type (str): Kind of content to clean. Only 'answer' is implemented.
    model (str): Name of the Ollama model used for the formatting call.

    Returns:
    dict: {'Answer': ..., 'Justification': ...} when `type` is 'answer'. If an
    isolated letter A-D is found in the answer field, 'Answer' is reduced to
    that letter; otherwise the field is returned unchanged and the caller is
    expected to validate it.
    None: for any other `type` (the 'question' branch was never implemented).

    Raises:
    ValueError: If neither JSON parsing nor the regex can extract both fields.
    """
    if type != 'answer':
        # Nothing is implemented for other types; keep returning None as before.
        return None

    print(f"Answer before 2nd model: {text}")
    text_json_like = call_formatting_llm_mcq(text, type, model)
    print(f"Answer after 2nd model: {text_json_like}")

    answer = None
    justification = None

    # 1) Try real JSON parsing on the outermost {...} span. This copes with
    #    code fences around the object, escaped quotes inside the strings and
    #    any key order. strict=False tolerates raw newlines inside strings.
    start = text_json_like.find('{')
    end = text_json_like.rfind('}')
    if 0 <= start < end:
        try:
            parsed = json.loads(text_json_like[start:end + 1], strict=False)
        except ValueError:  # json.JSONDecodeError is a ValueError subclass
            parsed = None
        if isinstance(parsed, dict):
            parsed_answer = parsed.get('Answer')
            parsed_justification = parsed.get('Justification')
            # Only accept plain strings: callers concatenate to the justification.
            if isinstance(parsed_answer, str) and isinstance(parsed_justification, str):
                answer = parsed_answer
                justification = parsed_justification

    # 2) Fallback: regex capturing the content of "Answer" and "Justification".
    if answer is None:
        pattern = r'"Answer":\s*"([^"]+)"\s*,\s*"Justification":\s*"([^"]+)"'
        matches = re.search(pattern, text_json_like)
        if not matches:
            print(f'Erreur parsing: {text_json_like}')
            raise ValueError("Can't extract data.")
        answer = matches.group(1)
        justification = matches.group(2)

    # Reduce e.g. "Answer A" or "A." to the isolated letter when there is one.
    letter = re.search(r'\b[A-D]\b', answer)
    if letter:
        answer = letter.group()
        print(f'Final answer: {answer}')

    return {'Answer': answer, 'Justification': justification}


In [18]:
def generate_mcq_answer(question_mcq: str, model, knowledge_vector_db, reranker=False) -> dict:
    """
    Generates an answer for a MCQ question.

    The question is used to retrieve context from the vector store, the reader
    model answers in free text, and `clean_output_v2` turns that text into a
    structured answer. Up to three attempts are made.

    Parameters:
    question_mcq (str): The MCQ question (anything else is converted with str()).
    model (str): Name of the Ollama model used as reader and as formatter.
    knowledge_vector_db: Vector store queried for context.
    reranker: Falsy to use the 5 nearest documents directly; otherwise an
        object exposing `rerank(query, docs, k=...)`, applied to the 20
        nearest documents to keep 5.

    Returns:
    dict: {'Answer': 'A'|'B'|'C'|'D', 'Justification': text followed by the
    list of sources}. After three failed attempts the placeholder
    {'Answer': 'Error', 'Justification': 'Error parsing json.'} is returned.
    """

    # Convert the question in string, in case the question is a json.
    question_mcq = str(question_mcq)

    # Retrieve context
    if reranker:
        retrieved_docs = get_context(question_mcq, k=20, knowledge_vector_db=knowledge_vector_db)
        retrieved_docs = reranker.rerank(question_mcq, retrieved_docs, k=5)
    else:
        retrieved_docs = get_context(question_mcq, k=5, knowledge_vector_db=knowledge_vector_db)

    context = "\nExtracted documents:\n"
    context += "".join(
        f"Content: {doc.page_content} \nSource: {doc.metadata['ref']}\n\n"
        for doc in retrieved_docs
    )
    # Exam-solution documents carry no "url" metadata, hence the 'N/A' default.
    context_sources = "".join(
        f"\nSource: {doc.metadata['ref']}, Url: {doc.metadata.get('url', 'N/A')}"
        for doc in retrieved_docs
    )

    # Build prompt
    SYSTEM_PROMPT = """
    You are an AI specialized in answering legal multiple-choice questions based on provided legal texts.
    ### Instructions:
    - When given a multiple-choice legal question, provide the correct answer followed by an explanation.
    - Your answer should begin with the correct choice (e.g., "Answer A").
    - After that, explain why this choice is correct based on the provided legal context.
    - Then, explain why the other choices (B, C, D) are incorrect, using relevant legal reasoning from the context.
    - Use the legal context provided to back up your reasoning.
    - Make sure to clearly distinguish between the correct answer and the incorrect ones.
    """

    user_prompt = f"""
    ### Context:
    {context}

    ### Legal Question:
    {question_mcq}

    Answer the question by:
    1. Starting with the correct answer (e.g., "Answer A").
    2. Explaining why this choice is correct according to the provided legal text.
    3. Explaining why the other options (B, C, D) are incorrect based on the legal context.
    """

    max_attempts = 3  # Limit number of attempts to prevent infinite loops

    for attempt_count in range(1, max_attempts + 1):
        answer_mcq = chat(model=model,
                          messages=[{"role": "system", "content": SYSTEM_PROMPT},
                                    {"role": "user", "content": user_prompt}],
                          options={"num_predict": MAX_OUTPUT_TOKENS})

        # Put answer in correct json format; a ValueError means "try again".
        try:
            cleaned_answer_mcq = clean_output_v2(answer_mcq['message']['content'], type='answer', model=model)
            if cleaned_answer_mcq['Answer'] not in {'A', 'B', 'C', 'D'}:
                print(f"Erreur Answer est pas une lettre: Answer={cleaned_answer_mcq['Answer']}")
                raise ValueError("La réponse doit être 'A', 'B', 'C' ou 'D'.")
        except ValueError:
            print(f"Attempt {attempt_count} failed. Retrying...")
            continue

        cleaned_answer_mcq['Justification'] += f'\n\nSources:\n{context_sources}'
        return cleaned_answer_mcq

    # All attempts failed: return a placeholder so that an evaluation run can
    # continue. The key is spelled 'Justification' like in successful answers.
    return {'Answer': 'Error', 'Justification': 'Error parsing json.'}


### evaluate_mcq_answer

In [19]:
def evaluate_generate_mcq_answer(save_dir: str, data_dir: str, test_name: str, model, knowledge_vector_db: FAISS, reranker=False):
    """
    Evaluate generate_mcq_answer on the MCQ files of a folder.

    Every file of `data_dir` whose name contains "json" and none of
    "solution", "MOCK", "open" or "categories" is treated as a question file.
    Its reference answers are read from "<name>_solution.json" next to it.
    Questions and reference answers are paired by position, so both files are
    assumed to list their entries in the same order.

    Parameters:
    save_dir (str): Folder in which the evaluation file is written (created if missing).
    data_dir (str): Folder holding the MCQ and MCQ solution json files.
    test_name (str): Name of the test; the report is saved as "<test_name>.json".
    model: Reader model name forwarded to generate_mcq_answer.
    knowledge_vector_db (FAISS): Vector store forwarded to generate_mcq_answer.
    reranker: Optional reranker forwarded to generate_mcq_answer.

    Returns:
    Nothing, but saves a json file in save_dir with the global counters
    ('informations') and, per file, the questions, reference answers and AI
    answers ('data'). Only questions that have a reference answer are counted
    in 'total_answer'.
    """

    # Per-file results
    data = []

    # Initialise total answer and correct answer for evaluation
    total_answer = 0
    correct_answer = 0

    for filename in os.listdir(data_dir):
        # Ignore solution, MOCK, open-question and category files, and non-json files
        if "solution" in filename or "json" not in filename or "MOCK" in filename or 'open' in filename or 'categories' in filename:
            continue

        # Load questions
        filepath = os.path.join(data_dir, filename)
        with open(filepath, 'r', encoding='utf-8') as file:
            doc = json.load(file)
        questions = list(doc.values())

        # Load reference answers, if the solution file exists
        answers = []
        solution_filepath = filepath.rsplit('.json', 1)[0] + "_solution.json"
        if os.path.exists(solution_filepath):
            with open(solution_filepath, 'r', encoding='utf-8') as file:
                solution_doc = json.load(file)
            answers = list(solution_doc.values())
        else:
            print(f"No solution file found for {filename}: its questions will not be scored.")

        # Generate AI answers
        ai_answers = []
        for i, raw_question in enumerate(questions):
            question = str(raw_question)
            print(question)

            try:
                ai_answer = generate_mcq_answer(question, model, knowledge_vector_db, reranker)
            except json.JSONDecodeError as err:
                # Always bind ai_answer here, so that a failure never reuses
                # the answer of the previous question.
                ai_answer = {'Answer': 'None', 'Justification': f'Error when parsing json. {err}'}
            ai_answers.append(ai_answer)

            # Score only questions that have a reference answer
            if i >= len(answers):
                continue
            total_answer += 1
            if answers[i]['Answer'] == ai_answer['Answer']:
                correct_answer += 1

        data.append({'filename': filename, 'questions': questions,
                     'answers': answers, 'ai_answers': ai_answers})

    # Global counters, printed and stored once all files are processed
    informations = {'total_answer': total_answer, 'correct_answer': correct_answer}
    print(informations)
    final_evaluation = {'informations': informations, 'data': data}

    # Save json
    os.makedirs(save_dir, exist_ok=True)
    output_path = os.path.join(save_dir, f"{test_name}.json")
    with open(output_path, 'w', encoding='utf-8') as file:
        json.dump(final_evaluation, file, indent=4, ensure_ascii=False)


## Tune chunk size

In [20]:
# Embedding model read as a global by test_chunk_size.
embeddings_model = 'thenlper/gte-small'
# Reader model name, passed as `model` to the evaluation by the test helpers.
reader = 'qwen2.5:7b'

In [11]:
def test_chunk_size(chunk_size):
    """
    Build the index for `chunk_size`, reload it and run the MCQ evaluation.

    Uses the module-level `embeddings_model`, `reader`, `markdown_separators`,
    `data_dir` and `output_dir`. The report is written to
    '../outputs/evaluation2/'.

    Parameters:
    chunk_size (int): Chunk size in tokens used to build the index.
    """
    create_rag_embeddings(embeddings_model, chunk_size, markdown_separators, data_dir, output_dir)

    # Model name made usable in a file name.
    model_slug = embeddings_model.replace('/', '_')

    knowledge_vector_db = load_rag_embeddings(
        path=f'../embeddings/rag_embeddings_{model_slug}_chunk{chunk_size}',
        embeddings_model=embeddings_model,
        device='cuda',
    )
    evaluate_generate_mcq_answer(
        save_dir='../outputs/evaluation2/',
        data_dir='../outputs/',
        test_name=f'{model_slug}_qwen2_5_7b_chunk{chunk_size}',
        model=reader,
        knowledge_vector_db=knowledge_vector_db,
    )

### chunk_size = 256

In [12]:
# test_chunk_size(256)

### chunk_size = 512

In [13]:
# test_chunk_size(512)

### chunk_size = 768

In [14]:
# test_chunk_size(768)

In [15]:
# test_chunk_size(1024)

## Test better embeddings model

In [16]:
# Chunk size used for the embedding-model comparison below.
chunk_size = 512
# Reader model name, passed as `model` to the evaluation by the test helpers.
reader = 'qwen2.5:7b'

In [17]:
def test_embeddings_model(embeddings_model, chunk_size):
    """
    Build the index for an embedding model, reload it and run the MCQ evaluation.

    Uses the module-level `reader`, `markdown_separators`, `data_dir` and
    `output_dir`. The report is written to '../outputs/evaluation2/'.

    Parameters:
    embeddings_model (str): Hugging Face embedding model name.
    chunk_size (int): Chunk size in tokens used to build the index.
    """
    create_rag_embeddings(embeddings_model, chunk_size, markdown_separators, data_dir, output_dir)

    # Model name made usable in a file name.
    model_slug = embeddings_model.replace('/', '_')

    knowledge_vector_db = load_rag_embeddings(
        path=f'../embeddings/rag_embeddings_{model_slug}_chunk{chunk_size}',
        embeddings_model=embeddings_model,
        device='cuda',
    )
    evaluate_generate_mcq_answer(
        save_dir='../outputs/evaluation2/',
        data_dir='../outputs/',
        test_name=f'{model_slug}_qwen2_5_7b_chunk{chunk_size}',
        model=reader,
        knowledge_vector_db=knowledge_vector_db,
    )

### embeddings_model = nvidia/NV-Embed-v2 (NON CAR >30min pour créer un Embeddings)

### embeddings_model = Alibaba-NLP/gte-Qwen2-7B-instruct   (NON CAR >1h pour créer un Embeddings)

### embeddings_model = all-MiniLM-L6-v2

In [None]:
# test_embeddings_model(embeddings_model='sentence-transformers/all-MiniLM-L6-v2', chunk_size=512)

embedding load
{'question': 'A European patent was granted to a German company on 4  January 2023. No opposition was filed. Mr Li and Ms Smith were designated as inventors. On 12 October 2023,  the patent proprietor not ices that Ms Smith was erroneously designated as inventor and that Mr Li is the sole inventor.  Which of the following statements is correct?', 'options': ['A. Ms Smith will be deleted as inventor if the patent proprietor requests this in writing .', 'B. Ms Smith can no longer  be deleted as inventor because no proceedings are pending before the EPO .', "C. The patent proprietor can request that  Ms Smith be deleted as inventor but needs Mr Li 's consent  for this .", 'D. Ms Smith can only be deleted as inventor with her consent .']}
{'question': "On 10 October 2021, European patent attorney X filed European patent application EP -A with the EPO on the applicant's behalf. Application EP -A was published on 12 April 2023, together  with the extended European search repor

### embeddings_model = bge-large-en

In [None]:
# test_embeddings_model(embeddings_model='BAAI/bge-large-en-v1.5', chunk_size=512)

embedding load
{'question': 'A European patent was granted to a German company on 4  January 2023. No opposition was filed. Mr Li and Ms Smith were designated as inventors. On 12 October 2023,  the patent proprietor not ices that Ms Smith was erroneously designated as inventor and that Mr Li is the sole inventor.  Which of the following statements is correct?', 'options': ['A. Ms Smith will be deleted as inventor if the patent proprietor requests this in writing .', 'B. Ms Smith can no longer  be deleted as inventor because no proceedings are pending before the EPO .', "C. The patent proprietor can request that  Ms Smith be deleted as inventor but needs Mr Li 's consent  for this .", 'D. Ms Smith can only be deleted as inventor with her consent .']}
{'question': "On 10 October 2021, European patent attorney X filed European patent application EP -A with the EPO on the applicant's behalf. Application EP -A was published on 12 April 2023, together  with the extended European search repor

### embeddings_model = all-mpnet-base-v2

In [19]:
# test_embeddings_model(embeddings_model='sentence-transformers/all-mpnet-base-v2', chunk_size=512)

## Test New Prompt without model 2 for json parsing

Ce n'est pas terrible, donc on a remis l'ancien système. Peut-être avec un meilleur prompt ?

In [None]:
# def generate_mcq_answer(question_mcq: str, model, knowledge_vector_db, reranker=False) -> dict:
#     """
#     Generates an answer for a MCQ question.

#     Parameters:
#     question_mcq (str): The input question_mcq for which an answer is needed.

#     Returns:
#     answer (str): The generated response from the AI with the context used.
#     """

#     # Convert the question in string, in case the question is a json.
#     question_mcq = str(question_mcq)

#     # Retrieve context
#     if reranker:
#         retrieved_docs = get_context(question_mcq, k=20, knowledge_vector_db=knowledge_vector_db)
#         retrieved_docs = reranker.rerank(question_mcq, retrieved_docs, k=5)
#     else:
#         retrieved_docs = get_context(question_mcq, k=5, knowledge_vector_db=knowledge_vector_db)

#     context = "\nExtracted documents:\n"
#     context += "".join([f'Content: {doc.page_content} \nSource: {doc.metadata['ref']}\n\n' for i, doc in enumerate(retrieved_docs)])
#     context_sources = "".join([f'\nSource: {doc.metadata['ref']}, Url: {doc.metadata.get('url', 'N/A')}' for i, doc in enumerate(retrieved_docs)])

#     # Build prompt
#     SYSTEM_PROMPT = f"""You are an AI specialized in answering multiple-choice legal questions based on given legal texts.
#     ### Instructions:
#     - Provide the correct answer.
#     - After that, explain why this choice is correct based on the provided legal context.
#     - Then, explain why the other choices (B, C, D) are incorrect, using relevant legal reasoning from the context.
#     - Format the output strictly as a JSON object with the following structure exemple:
#         ```json
#         {{"Answer": "A", "Justification": "..."}}
#     The Answer field must only contain the letter of the answer which is A, B, C or D. All other explainations have to be in Justification.
#     """

#     user_prompt = f"""
#     ### Context:
#     {context}

#     ### Legal Question:
#     {question_mcq}

#     Answer the question by:
#     1. Starting with the correct answer (e.g., "Answer A").
#     2. Explaining why this choice is correct according to the provided legal text.
#     3. Explaining why the other options (B, C, D) are incorrect based on the legal context.
#     Make sure to answer with the correct json format:
#     ```json
#         {{"Answer": "...", "Justification": "..."}}
#     The Answer field must only contain the letter of the answer which is A, B, C or D. All other explainations have to be in Justification.
#     """

#     # Initial attempt to get the answer
#     attempt_count = 0
#     max_attempts = 3  # Limit number of attempts to prevent infinite loops

#     while attempt_count < max_attempts:
#         answer_mcq = chat(model=model,
#                             messages=[{"role":"system", "content":SYSTEM_PROMPT},
#                                       {"role":"user","content":user_prompt}],
#                             options = {"num_predict":MAX_OUTPUT_TOKENS}
#                             )
#         try:
#             # Put answer in correct json format
#             # Regex pattern to capture the content of "Answer" and "Justification"
#             pattern = r'"Answer":\s*"([^"]+)"\s*,\s*"Justification":\s*"([^"]+)"'
#             matches = re.search(pattern, answer_mcq['message']['content'])

#             if matches:
#                 answer = matches.group(1)
#                 # check for a valid letter isolated
#                 match = re.search(r'\b[A-D]\b', answer)
#                 if match:
#                     answer = match.group()
#                     print(f'Final answer: {answer}')

#                 justification = matches.group(2)
                
#                 # Add context
#                 justification += f'\n\nSources:{context_sources}'
#                 print(f'Answer: {answer}, Justification: {justification}')
#                 return {'Answer': answer, 'Justification': justification}
            
#             else:
#                 print(f'Erreur parsing: {answer_mcq['message']['content']}')
#                 raise ValueError("Can't extract data.")
            

        
#         except:
#             attempt_count += 1  # Increment attempt count
#             print(f"Attempt {attempt_count} failed. Retrying...")
    
#     # If all attempts fail, raise an exception or return None
#     return {'Answer': 'Error', 'Justirifation': 'Error parsing json.'}
#     raise ValueError("Failed to generate a valid MCQ after multiple attempts.")

In [15]:
# Temp pour voir si second modèle bousille réponse
# import sys
# # Ouvrir un fichier pour enregistrer les prints
# log_file = open("logs/test_if_model_2_is_wrong.log", "w")
# # Sauvegarder l'ancien stdout
# old_stdout = sys.stdout  
# # Rediriger stdout vers le fichier
# sys.stdout = log_file 

# Reload the all-MiniLM-L6-v2 / chunk 512 index and evaluate the reader on it.
knowledge_vector_db = load_rag_embeddings(
    path='../embeddings/rag_embeddings_sentence-transformers_all-MiniLM-L6-v2_chunk512',
    embeddings_model='sentence-transformers/all-MiniLM-L6-v2',
    device='cuda',
)
evaluate_generate_mcq_answer(
    save_dir='../outputs/evaluation2/',
    data_dir='../outputs/',
    test_name='sentence-transformers_all-MiniLM-L6-v2_qwen2_5_7b_chunk512_newPromptWithout2ndModel',
    model=reader,
    knowledge_vector_db=knowledge_vector_db,
)

# sys.stdout = old_stdout  
# log_file.close()

embedding load
{'question': 'A European patent was granted to a German company on 4  January 2023. No opposition was filed. Mr Li and Ms Smith were designated as inventors. On 12 October 2023,  the patent proprietor not ices that Ms Smith was erroneously designated as inventor and that Mr Li is the sole inventor.  Which of the following statements is correct?', 'options': ['A. Ms Smith will be deleted as inventor if the patent proprietor requests this in writing .', 'B. Ms Smith can no longer  be deleted as inventor because no proceedings are pending before the EPO .', "C. The patent proprietor can request that  Ms Smith be deleted as inventor but needs Mr Li 's consent  for this .", 'D. Ms Smith can only be deleted as inventor with her consent .']}
{'question': 'A European patent was granted to a German company on 4  January 2023. No opposition was filed. Mr Li and Ms Smith were designated as inventors. On 12 October 2023,  the patent proprietor not ices that Ms Smith was erroneously 

## Test Big Reader (qwen2.5-32B)

In [18]:
def test_reader_model(embeddings_model, chunk_size, reader):
    """
    Evaluate a reader model on an index that already exists on disk.

    Unlike the other test helpers, the index is not rebuilt here. The report
    is written to '../outputs/evaluation2/'.

    Parameters:
    embeddings_model (str): Embedding model name of the saved index.
    chunk_size (int): Chunk size of the saved index.
    reader (str): Ollama model name used to answer the questions.
    """
    # create_rag_embeddings(embeddings_model, chunk_size, markdown_separators, data_dir, output_dir)

    # Names made usable in a file name.
    model_slug = embeddings_model.replace('/', '_')

    knowledge_vector_db = load_rag_embeddings(
        path=f'../embeddings/rag_embeddings_{model_slug}_chunk{chunk_size}',
        embeddings_model=embeddings_model,
        device='cuda',
    )

    reader_slug = reader.replace('.', '_').replace(':', '_')

    evaluate_generate_mcq_answer(
        save_dir='../outputs/evaluation2/',
        data_dir='../outputs/',
        test_name=f'{model_slug}_chunk{chunk_size}_{reader_slug}',
        model=reader,
        knowledge_vector_db=knowledge_vector_db,
    )

In [None]:
# test_reader_model(embeddings_model='sentence-transformers/all-MiniLM-L6-v2', chunk_size=512, reader='qwen2.5:32b') 

embedding load
{'question': 'A European patent was granted to a German company on 4  January 2023. No opposition was filed. Mr Li and Ms Smith were designated as inventors. On 12 October 2023,  the patent proprietor not ices that Ms Smith was erroneously designated as inventor and that Mr Li is the sole inventor.  Which of the following statements is correct?', 'options': ['A. Ms Smith will be deleted as inventor if the patent proprietor requests this in writing .', 'B. Ms Smith can no longer  be deleted as inventor because no proceedings are pending before the EPO .', "C. The patent proprietor can request that  Ms Smith be deleted as inventor but needs Mr Li 's consent  for this .", 'D. Ms Smith can only be deleted as inventor with her consent .']}
Answer before 2nd model:  

### Answer:
**Answer: D**

### Explanation:

According to Rule 21(1) EPC as mentioned in one of the sources provided ("Legal basis A n incorrect designation of inventor shall be rectified upon request and only wi

## qwen2.5-32B without model 2 with new prompt and better parsing

In [28]:
def generate_mcq_answer(question_mcq: str, model, knowledge_vector_db, reranker=False) -> dict:
    """
    Generates an answer for a MCQ question, parsing the reader output directly.

    The reader model is asked to answer in a fixed markdown layout
    ("**Answer: X**" then "**Justification:**"), which is extracted with a
    regex; no second formatting model is involved. Up to three attempts are
    made; an output that does not match the layout counts as a failed attempt.

    Parameters:
    question_mcq (str): The MCQ question (anything else is converted with str()).
    model (str): Name of the Ollama model used as reader.
    knowledge_vector_db: Vector store queried for context.
    reranker: Falsy to use the 5 nearest documents directly; otherwise an
        object exposing `rerank(query, docs, k=...)`, applied to the 50
        nearest documents to keep 5.

    Returns:
    dict: {'Answer': 'A'|'B'|'C'|'D', 'Justification': text followed by the
    list of sources}. After three failed attempts the placeholder
    {'Answer': 'Error', 'Justification': 'Error parsing json.'} is returned.
    """

    # Convert the question in string, in case the question is a json.
    question_mcq = str(question_mcq)

    # Retrieve context
    if reranker:
        print('Rerank')
        retrieved_docs = get_context(question_mcq, k=50, knowledge_vector_db=knowledge_vector_db)
        retrieved_docs = reranker.rerank(question_mcq, retrieved_docs, k=5)
    else:
        retrieved_docs = get_context(question_mcq, k=5, knowledge_vector_db=knowledge_vector_db)

    context = "\nExtracted documents:\n"
    context += "".join(
        f"Content: {doc.page_content} \nSource: {doc.metadata['ref']}\n\n"
        for doc in retrieved_docs
    )
    # Exam-solution documents carry no "url" metadata, hence the 'N/A' default.
    context_sources = "".join(
        f"\nSource: {doc.metadata['ref']}, Url: {doc.metadata.get('url', 'N/A')}"
        for doc in retrieved_docs
    )

    # Build prompt
    SYSTEM_PROMPT = """You are an AI specialized in answering multiple-choice legal questions based on given legal texts.
    ### Instructions:
    - Always start your response with:  
    **Answer: X** (where X is A, B, C, or D).  
    - After that, provide a detailed justification.  
    - Explain why the selected answer is correct based on the legal context.  
    - Explain why the other choices (B, C, D) are incorrect, referencing the legal context.  

    The **Answer** field must contain only a single letter: A, B, C, or D.  
    All explanations must be in the **Justification** section.  
    Ensure strict adherence to this format to allow for structured extraction.
    """

    user_prompt = f"""
    ### Context:
    {context}

    ### Legal Question:
    {question_mcq}

    Respond strictly in the following format:

    **Answer: X**  
    **Justification:**  
    - Explanation of why X is correct.  
    - Explanation of why B, C, and D are incorrect.  

    Ensure that:  
    - The **Answer** field contains only a single letter (A, B, C, or D).  
    - All other explanations are in the **Justification** section.
    """

    # Captures the answer letter (group 1) and the justification text (group 2).
    # Group 1 can only be a single letter A-D, so no further validation is needed.
    answer_pattern = re.compile(
        r"\*\*Answer:\s*([A-D])\*\*\s*\n\s*\*\*Justification:\*\*\s*\n(.*)",
        re.DOTALL,
    )

    max_attempts = 3  # Limit number of attempts to prevent infinite loops

    for attempt_count in range(1, max_attempts + 1):
        answer_mcq = chat(model=model,
                          messages=[{"role": "system", "content": SYSTEM_PROMPT},
                                    {"role": "user", "content": user_prompt}],
                          options={"num_predict": MAX_OUTPUT_TOKENS})

        match = answer_pattern.search(answer_mcq['message']['content'])
        if match:
            extracted_answer = match.group(1)
            print(f'Final answer: {extracted_answer}')
            justification = match.group(2).strip()
            justification += f'\n\nSources:\n{context_sources}'
            return {"Answer": extracted_answer, "Justification": justification}

        # The output did not follow the layout: this is a failed attempt.
        # (Before, this case neither raised nor incremented the counter, so
        # the loop kept calling the model forever.)
        print(f"Attempt {attempt_count} failed. Retrying...")

    # All attempts failed: return a placeholder so that an evaluation run can
    # continue. The key is spelled 'Justification' like in successful answers.
    return {'Answer': 'Error', 'Justification': 'Error parsing json.'}

In [20]:
test_reader_model(embeddings_model='sentence-transformers/all-MiniLM-L6-v2', chunk_size=512, reader='qwen2.5:32b')

embedding load
{'question': 'A European patent was granted to a German company on 4  January 2023. No opposition was filed. Mr Li and Ms Smith were designated as inventors. On 12 October 2023,  the patent proprietor not ices that Ms Smith was erroneously designated as inventor and that Mr Li is the sole inventor.  Which of the following statements is correct?', 'options': ['A. Ms Smith will be deleted as inventor if the patent proprietor requests this in writing .', 'B. Ms Smith can no longer  be deleted as inventor because no proceedings are pending before the EPO .', "C. The patent proprietor can request that  Ms Smith be deleted as inventor but needs Mr Li 's consent  for this .", 'D. Ms Smith can only be deleted as inventor with her consent .']}
Final answer: D
{'question': "On 10 October 2021, European patent attorney X filed European patent application EP -A with the EPO on the applicant's behalf. Application EP -A was published on 12 April 2023, together  with the extended Europ

In [17]:
# test_reader_model(embeddings_model='sentence-transformers/all-MiniLM-L6-v2', chunk_size=512, reader='deepseek-r1') 

In [None]:
import json
import re

# Load the saved evaluation results (JSON file written by an earlier run).
with open("../outputs/evaluation2/sentence-transformers_all-MiniLM-L6-v2_chunk512_qwen2_5_32b_test.json", "r", encoding="utf-8") as f:
    file = json.load(f)

# Expected raw output layout: a "**Answer: X**" line (X in A-D), then a
# "**Justification:**" line, then the free-text justification (group 2).
pattern = re.compile(r"\*\*Answer:\s*([A-D])\*\*\s*\n\s*\*\*Justification:\*\*\s*\n(.*)", re.DOTALL)

for item in file.get("data", []):
    for answer_dict in item.get("ai_answers", []):
        text = answer_dict.get("Justification", "No Justification")

        parsed = pattern.search(text)
        if parsed is None:
            # Output does not follow the expected layout; nothing to print.
            continue

        extracted_answer, justification = parsed.group(1), parsed.group(2).strip()

        # Keep only a standalone answer letter.
        letter = re.search(r'\b[A-D]\b', extracted_answer)
        if letter is not None:
            extracted_answer = letter.group()
            print(f'Final answer: {extracted_answer}')

        print(f"Answer: {extracted_answer}")
        print(f"Justification: {justification}")
        print("-" * 50)  # separator between entries

Final answer: D
Answer: D
Justification: - Explanation of why D is correct. According to Rule 21(1) EPC, an incorrect designation of inventor shall be rectified upon request and only with the consent of the wrongly designated person. Since Ms Smith was erroneously designated as an inventor, she must give her consent for this correction to take place.
- Explanation of why B is incorrect. There is no requirement that proceedings must be pending before the EPO in order to rectify an incorrect designation of inventors. The issue can be addressed even if there are no pending proceedings.
- Explanation of why C is incorrect. Rule 21(1) does not require Mr Li's consent for this rectification; it requires only Ms Smith’s consent as she was the wrongly designated inventor.
- Explanation of why A is incorrect. While the patent proprietor can request that Ms Smith be deleted, such a change cannot occur without Ms Smith’s explicit consent per Rule 21(1) EPC.
---------------------------------------

On a généré avec un nouveau prompt sans modèle pour parser. L'idée est de voir le format de sortie. S'il est uniforme, de créer un script pour le parser, voir s'il s'en sort sur tous les cas. Ensuite on pourra mesurer l'accuracy, voir si ce prompt est aussi bon / meilleur que l'autre. Et si le parsing est bien, ça nous permet d'implémenter tout ça et d'avoir un modèle 32B sans nécessiter un autre modèle pour faire un json.

## Test different content in Retrieval

So far, we have tried many things, and the retrieval quality seems to be the bottleneck. We will therefore try not to use all of the available content in the retrieval index.

We identified an over-representation of Guidelines documents, so we will separate the embeddings into two indexes: one with the Guidelines and another with the articles. We will then take 3 articles and 2 guidelines for each question.

### Create embeddings

We move the EPC_guidelines file out of the outputs folder to generate the first index, and we do the opposite for the second one.
We do not use the exam solutions as sources.

In [None]:
# def create_rag_embeddings(model_name, chunk_size, markdown_separators, data_dir, output_dir, device="cuda"):
    
#     csv_raw_knowledge = load_csv(data_dir)
#     raw_knowledge_base = csv_raw_knowledge
    
#     docs_processed = split_documents(
#     chunk_size,  # We choose a chunk size adapted to our model
#     raw_knowledge_base,
#     model_name,
#     markdown_separators
#     )

#     embedding_model = HuggingFaceEmbeddings(
#     model_name=model_name,
#     multi_process=True,
#     model_kwargs={"device": device,  "trust_remote_code":True},  # replace 'cpu' by 'cuda' if you have Nvidia gpu
#     encode_kwargs={"normalize_embeddings": True},  # Set `True` for cosine similarity
#     )

#     # Compute embeddings (can take time ~7min on my laptop)
#     knowledge_vector_database = FAISS.from_documents(
#         docs_processed, embedding_model, distance_strategy=DistanceStrategy.COSINE
#     )
#     # Save embeddings
#     knowledge_vector_database.save_local(f"{output_dir}/rag_embeddings_{model_name.replace('/', '_')}_chunk{chunk_size}")

In [None]:
# create_rag_embeddings(model_name='sentence-transformers/all-MiniLM-L6-v2', 
#                       chunk_size=512, markdown_separators=markdown_separators, 
#                       data_dir='../outputs', output_dir='./')

100%|██████████| 3739/3739 [00:00<00:00, 25075.80it/s]


In [7]:
def test_custom(test_name, embeddings_model, embeddings_path, reader, reranker=False,
                device='cuda', save_dir='../outputs/evaluation2/', data_dir='../outputs/'):
    """Load a saved embeddings index and run the MCQ evaluation on it.

    The index must already exist at ``embeddings_path``; this function does
    not build embeddings.

    Args:
        test_name: Label forwarded to ``evaluate_generate_mcq_answer``
            (presumably used to name the results file; verify against that
            function).
        embeddings_model: Embedding model name passed to
            ``load_rag_embeddings``.
        embeddings_path: Directory of the saved index.
        reader: Name of the reader model, forwarded as ``model``.
        reranker: Forwarded unchanged to ``evaluate_generate_mcq_answer``;
            defaults to ``False``.
        device: Device string passed to ``load_rag_embeddings``.
        save_dir: Output directory forwarded to the evaluation.
        data_dir: Input data directory forwarded to the evaluation.
    """
    knowledge_vector_db = load_rag_embeddings(
        path=embeddings_path,
        embeddings_model=embeddings_model,
        device=device,
    )
    evaluate_generate_mcq_answer(
        save_dir=save_dir,
        data_dir=data_dir,
        test_name=test_name,
        model=reader,
        knowledge_vector_db=knowledge_vector_db,
        reranker=reranker,
    )

### Qu'avec Articles/Rules

Lancé avec le nouveau prompt sans 2nd model pour parsing.

In [27]:
# Evaluate the qwen2.5:7b reader against the index stored in the
# "..._onlyRulesArticles" embeddings directory (no reranker passed).
test_custom('sentence-transformers_all-MiniLM-L6-v2_chunk512_qwen2_5_7b_embeddingsOnlyRulesArticles',
           embeddings_model='sentence-transformers/all-MiniLM-L6-v2',
           embeddings_path='../embeddings/rag_embeddings_sentence-transformers_all-MiniLM-L6-v2_chunk512_onlyRulesArticles/',
           reader='qwen2.5:7b')

embedding load
{'question': 'A European patent was granted to a German company on 4  January 2023. No opposition was filed. Mr Li and Ms Smith were designated as inventors. On 12 October 2023,  the patent proprietor not ices that Ms Smith was erroneously designated as inventor and that Mr Li is the sole inventor.  Which of the following statements is correct?', 'options': ['A. Ms Smith will be deleted as inventor if the patent proprietor requests this in writing .', 'B. Ms Smith can no longer  be deleted as inventor because no proceedings are pending before the EPO .', "C. The patent proprietor can request that  Ms Smith be deleted as inventor but needs Mr Li 's consent  for this .", 'D. Ms Smith can only be deleted as inventor with her consent .']}
Final answer: A
{'question': "On 10 October 2021, European patent attorney X filed European patent application EP -A with the EPO on the applicant's behalf. Application EP -A was published on 12 April 2023, together  with the extended Europ

### Qu'avec Guidelines / Case Law

In [28]:
# Evaluate the qwen2.5:7b reader against the index stored in the
# "..._onlyGuidelinesAndLaws" embeddings directory (no reranker passed).
test_custom('sentence-transformers_all-MiniLM-L6-v2_chunk512_qwen2_5_7b_embeddingsOnlyGuidelinesAndLaws',
           embeddings_model='sentence-transformers/all-MiniLM-L6-v2',
           embeddings_path='../embeddings/rag_embeddings_sentence-transformers_all-MiniLM-L6-v2_chunk512_onlyGuidelinesAndLaws',
           reader='qwen2.5:7b')

embedding load
{'question': 'A European patent was granted to a German company on 4  January 2023. No opposition was filed. Mr Li and Ms Smith were designated as inventors. On 12 October 2023,  the patent proprietor not ices that Ms Smith was erroneously designated as inventor and that Mr Li is the sole inventor.  Which of the following statements is correct?', 'options': ['A. Ms Smith will be deleted as inventor if the patent proprietor requests this in writing .', 'B. Ms Smith can no longer  be deleted as inventor because no proceedings are pending before the EPO .', "C. The patent proprietor can request that  Ms Smith be deleted as inventor but needs Mr Li 's consent  for this .", 'D. Ms Smith can only be deleted as inventor with her consent .']}
Final answer: A
{'question': "On 10 October 2021, European patent attorney X filed European patent application EP -A with the EPO on the applicant's behalf. Application EP -A was published on 12 April 2023, together  with the extended Europ

### Vérifier si ce n'est pas le nouveau prompt qui fout la merde

Il a pas trop l'air. On perd un peu mais c'est 1 bonne réponse.

In [None]:
# # Bizarre les résultats, on vérifie rapidement si c'est le nouveau prompt qui foue la merde
# test_custom('sentence-transformers_all-MiniLM-L6-v2_chunk512_qwen2_5_7b_TestAvecNewPromptSansSecondModelVerifPerf_2',
#            embeddings_model='sentence-transformers/all-MiniLM-L6-v2',
#            embeddings_path='../embeddings/rag_embeddings_sentence-transformers_all-MiniLM-L6-v2_chunk512',
#            reader='qwen2.5:7b')

### Avec un mix des deux

A tester de piocher 3 dans guidelines 2 dans articles.

## Test Reranking

In [40]:
from sentence_transformers import CrossEncoder

class LocalReranker:
    """Rerank retrieved documents with a local cross-encoder model."""

    def __init__(self, model_name="cross-encoder/ms-marco-MiniLM-L-6-v2"):
        """Load the cross-encoder used to score (query, document) pairs.

        Args:
            model_name: Model name or path handed to ``CrossEncoder``.
        """
        self.model = CrossEncoder(model_name)

    def rerank(self, query, documents, k=5):
        """Return the ``k`` documents the cross-encoder scores highest.

        Args:
            query: The question text.
            documents: List of retrieved documents; each must expose a
                ``page_content`` attribute.
            k: Maximum number of documents to keep after reranking.

        Returns:
            A list of at most ``k`` documents, highest score first. An empty
            list is returned when ``documents`` is empty.
        """
        print("Rerank")
        if not documents:
            # Nothing to score: avoid handing an empty batch to the model.
            return []

        # One (question, document text) pair per candidate document.
        pairs = [(query, doc.page_content) for doc in documents]
        scores = self.model.predict(pairs)

        # Sort on the score only, so documents themselves are never compared.
        ranked = sorted(zip(scores, documents), key=lambda item: item[0], reverse=True)
        return [doc for _, doc in ranked[:k]]

# Usage: build the reranker once (this loads the cross-encoder model).
reranker = LocalReranker()

In [41]:
# Same evaluation on the full chunk512 index, this time passing the
# `reranker` object created above through to the evaluation.
test_custom('sentence-transformers_all-MiniLM-L6-v2_chunk512_qwen2_5_7b_TestAvecNewPromptSansSecondModel_Reranker',
           embeddings_model='sentence-transformers/all-MiniLM-L6-v2',
           embeddings_path='../embeddings/rag_embeddings_sentence-transformers_all-MiniLM-L6-v2_chunk512',
           reader='qwen2.5:7b', 
           reranker = reranker)

embedding load
{'question': 'A European patent was granted to a German company on 4  January 2023. No opposition was filed. Mr Li and Ms Smith were designated as inventors. On 12 October 2023,  the patent proprietor not ices that Ms Smith was erroneously designated as inventor and that Mr Li is the sole inventor.  Which of the following statements is correct?', 'options': ['A. Ms Smith will be deleted as inventor if the patent proprietor requests this in writing .', 'B. Ms Smith can no longer  be deleted as inventor because no proceedings are pending before the EPO .', "C. The patent proprietor can request that  Ms Smith be deleted as inventor but needs Mr Li 's consent  for this .", 'D. Ms Smith can only be deleted as inventor with her consent .']}
Rerank
Final answer: A
{'question': "On 10 October 2021, European patent attorney X filed European patent application EP -A with the EPO on the applicant's behalf. Application EP -A was published on 12 April 2023, together  with the extende

In [4]:
from colbert import Searcher

class ColbertReranker:
    """Reranker intended to score documents with a ColBERT ``Searcher``.

    NOTE(review): the ``Searcher`` calls used below (``from_pretrained``,
    ``index(texts=..., doc_ids=...)``, ``search(...)`` returning an object
    with ``doc_scores``/``doc_ids``) should be checked against the installed
    ``colbert`` package before relying on this class — TODO confirm.
    """

    def __init__(self, model_name="colbert-ir/colbertv2.0"):
        """
        Initialise a ColBERT reranker by loading the pre-trained model.
        """
        self.searcher = Searcher.from_pretrained(model_name, faiss_depth=100, nprobe=10)

    def rerank(self, query, documents, k=5):
        """
        Rank the documents by their relevance to the question.

        query: str -> The question asked.
        documents: list -> List of retrieved documents.
        k: int -> Number of documents to keep after reranking.

        Returns the k best reranked documents.
        """
        doc_texts = [doc.page_content for doc in documents]  # Extract the text of each document

        # Temporary indexing of the documents for reranking; ids are the
        # positions in `documents`, so results can be mapped back below.
        doc_ids = list(range(len(doc_texts)))
        self.searcher.index(texts=doc_texts, doc_ids=doc_ids)

        # Run the search, asking for k results
        results = self.searcher.search(query, k=k)

        # Sort the returned ids by score, highest first
        ranked_docs = sorted(zip(results.doc_scores, results.doc_ids), key=lambda x: x[0], reverse=True)

        # Map the ranked ids back to the original document objects
        reranked_docs = [documents[idx] for _, idx in ranked_docs]

        return reranked_docs  # Best documents, reranked

# Usage
reranker = ColbertReranker()

ImportError: cannot import name 'AdamW' from 'transformers' (c:\Users\Patrice\Documents\Projets\ia-pau-data-battle-mars-2025\.venv\Lib\site-packages\transformers\__init__.py)

In [None]:
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer

class Reranker:
    """Rerank documents with a Hugging Face sequence-classification model.

    Each (query, document) pair is tokenized with a 512-token limit and the
    model's logit is used as the relevance score.
    """

    def __init__(self, model_name='BAAI/bge-reranker-large'):
        """Load the tokenizer and model and switch the model to eval mode.

        Args:
            model_name: Model name or path passed to ``from_pretrained``.
        """
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModelForSequenceClassification.from_pretrained(model_name)
        self.model.eval()

    def rerank(self, query, documents, k=5):
        """Return the ``k`` documents with the highest relevance score.

        Args:
            query: The question text.
            documents: List of retrieved documents; each must expose a
                ``page_content`` attribute.
            k: Maximum number of documents to keep.

        Returns:
            A list of at most ``k`` documents, highest score first. An empty
            list is returned when ``documents`` is empty.
        """
        if not documents:
            # Nothing to score: skip tokenization and the forward pass.
            return []

        pairs = [[query, doc.page_content] for doc in documents]
        with torch.no_grad():  # inference only, no gradients needed
            inputs = self.tokenizer(pairs, padding=True, truncation=True, return_tensors='pt', max_length=512)
            scores = self.model(**inputs, return_dict=True).logits.view(-1).float()

        # Pair each document with its score as a plain float, so sorting
        # compares Python numbers instead of 0-d tensors.
        scored_docs = sorted(zip(documents, scores.tolist()), key=lambda x: x[1], reverse=True)
        return [doc for doc, _ in scored_docs[:k]]

In [25]:
# Build the Reranker with its default model ('BAAI/bge-reranker-large') and
# run the evaluation on the full chunk512 index with it.
reranker = Reranker()
test_custom('sentence-transformers_all-MiniLM-L6-v2_chunk512_qwen2_5_7b_TestAvecNewPromptSansSecondModel_RerankerBAAI_bge_reranker_large',
           embeddings_model='sentence-transformers/all-MiniLM-L6-v2',
           embeddings_path='../embeddings/rag_embeddings_sentence-transformers_all-MiniLM-L6-v2_chunk512',
           reader='qwen2.5:7b', 
           reranker = reranker)

embedding load
{'question': 'A European patent was granted to a German company on 4  January 2023. No opposition was filed. Mr Li and Ms Smith were designated as inventors. On 12 October 2023,  the patent proprietor not ices that Ms Smith was erroneously designated as inventor and that Mr Li is the sole inventor.  Which of the following statements is correct?', 'options': ['A. Ms Smith will be deleted as inventor if the patent proprietor requests this in writing .', 'B. Ms Smith can no longer  be deleted as inventor because no proceedings are pending before the EPO .', "C. The patent proprietor can request that  Ms Smith be deleted as inventor but needs Mr Li 's consent  for this .", 'D. Ms Smith can only be deleted as inventor with her consent .']}
Rerank
Final answer: A
{'question': "On 10 October 2021, European patent attorney X filed European patent application EP -A with the EPO on the applicant's behalf. Application EP -A was published on 12 April 2023, together  with the extende

In [26]:
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer

class Reranker:
    """Sequence-classification reranker that passes no explicit ``max_length``.

    NOTE(review): with ``truncation=True`` and no ``max_length``, Hugging Face
    tokenizers are documented to truncate to the model's maximum input
    length, so inputs are presumably still truncated — confirm.
    """

    def __init__(self, model_name='BAAI/bge-reranker-large'):
        """Load tokenizer and model from ``model_name``; put the model in eval mode."""
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        classifier = AutoModelForSequenceClassification.from_pretrained(model_name)
        classifier.eval()
        self.tokenizer = tokenizer
        self.model = classifier

    def rerank(self, query, documents, k=5):
        """Score every document against ``query`` and return the top ``k``.

        ``documents`` is a list of objects exposing ``page_content``; the
        result holds at most ``k`` of them, highest score first.
        """
        pairs = []
        for document in documents:
            pairs.append([query, document.page_content])

        with torch.no_grad():
            encoded = self.tokenizer(pairs, padding=True, truncation=True, return_tensors='pt')
            output = self.model(**encoded, return_dict=True)
            relevance = output.logits.view(-1).float()

        # Attach each score to its document, then order by score (descending).
        ranked = list(zip(documents, relevance))
        ranked.sort(key=lambda pair: pair[1], reverse=True)

        best = []
        for document, _score in ranked[:k]:
            best.append(document)
        return best
    
# Instantiate the Reranker variant defined just above (no explicit
# max_length in the tokenizer call) and run the evaluation with it.
reranker = Reranker()
test_custom('sentence-transformers_all-MiniLM-L6-v2_chunk512_qwen2_5_7b_TestAvecNewPromptSansSecondModel_RerankerBAAI_bge_reranker_large_sanslimitetoken',
           embeddings_model='sentence-transformers/all-MiniLM-L6-v2',
           embeddings_path='../embeddings/rag_embeddings_sentence-transformers_all-MiniLM-L6-v2_chunk512',
           reader='qwen2.5:7b', 
           reranker = reranker)

embedding load
{'question': 'A European patent was granted to a German company on 4  January 2023. No opposition was filed. Mr Li and Ms Smith were designated as inventors. On 12 October 2023,  the patent proprietor not ices that Ms Smith was erroneously designated as inventor and that Mr Li is the sole inventor.  Which of the following statements is correct?', 'options': ['A. Ms Smith will be deleted as inventor if the patent proprietor requests this in writing .', 'B. Ms Smith can no longer  be deleted as inventor because no proceedings are pending before the EPO .', "C. The patent proprietor can request that  Ms Smith be deleted as inventor but needs Mr Li 's consent  for this .", 'D. Ms Smith can only be deleted as inventor with her consent .']}
Rerank
Final answer: A
{'question': "On 10 October 2021, European patent attorney X filed European patent application EP -A with the EPO on the applicant's behalf. Application EP -A was published on 12 April 2023, together  with the extende

### Tester avec top50 dans le premier et on rerank 5 ? 
7/29 Pas fou.

In [29]:
# Re-run the reranker evaluation under a "_50context" test name.
# NOTE(review): no retrieval depth is passed in this call, so the "top 50"
# setting was presumably changed inside the evaluation code itself — confirm.
reranker = Reranker()
test_custom('sentence-transformers_all-MiniLM-L6-v2_chunk512_qwen2_5_7b_TestAvecNewPromptSansSecondModel_RerankerBAAI_bge_reranker_large_sanslimitetoken_50context',
           embeddings_model='sentence-transformers/all-MiniLM-L6-v2',
           embeddings_path='../embeddings/rag_embeddings_sentence-transformers_all-MiniLM-L6-v2_chunk512',
           reader='qwen2.5:7b', 
           reranker = reranker)

embedding load
{'question': 'A European patent was granted to a German company on 4  January 2023. No opposition was filed. Mr Li and Ms Smith were designated as inventors. On 12 October 2023,  the patent proprietor not ices that Ms Smith was erroneously designated as inventor and that Mr Li is the sole inventor.  Which of the following statements is correct?', 'options': ['A. Ms Smith will be deleted as inventor if the patent proprietor requests this in writing .', 'B. Ms Smith can no longer  be deleted as inventor because no proceedings are pending before the EPO .', "C. The patent proprietor can request that  Ms Smith be deleted as inventor but needs Mr Li 's consent  for this .", 'D. Ms Smith can only be deleted as inventor with her consent .']}
Rerank
Final answer: A
{'question': "On 10 October 2021, European patent attorney X filed European patent application EP -A with the EPO on the applicant's behalf. Application EP -A was published on 12 April 2023, together  with the extende

FileNotFoundError: [Errno 2] No such file or directory: '../outputs/evaluation2/sentence-transformers_all-MiniLM-L6-v2_chunk512_qwen2_5_7b_TestAvecNewPromptSansSecondModel_RerankerBAAI_bge_reranker_large_sanslimitetoken_50context.json'

## Test Prompt engineering

## Plot dynamic Embeddings

In [30]:
import json
import plotly.offline as pyo
import plotly.graph_objs as go
from sklearn.decomposition import PCA
import numpy as np

def generate_html_from_faiss(knowledge_vector_db, output_file="embeddings_visualization.html"):
    """Write an interactive 3D PCA scatter plot of a FAISS store to HTML.

    Every vector in the store's index is reconstructed, projected to three
    dimensions with PCA and plotted as one marker. Markers are grouped into
    one trace per source category, chosen from the document's ``ref``
    metadata, and show that ``ref`` on hover.

    Args:
        knowledge_vector_db: LangChain FAISS vector store exposing ``index``,
            ``index_to_docstore_id`` and ``docstore``.
        output_file: Path of the HTML file to write.

    Returns:
        ``output_file``, unchanged.
    """
    # Marker colour per category. A document belongs to the first category
    # whose name occurs in its ref; anything unmatched falls into "Other".
    category_colors = {
        "Case Law": "red",
        "Guidelines for Examination in the EPO": "green",
        "EPC Article": "blue",
        "EPC Rule": "orange",
        "EPC RFee": "purple",
        "PCT Article": "brown",
        "PCT Rule": "black",
        "Other": "gray"
    }

    index = knowledge_vector_db.index
    docstore = knowledge_vector_db.docstore
    # Maps a position in the FAISS index to its docstore id. Using it keeps
    # each vector paired with its own document instead of relying on the
    # docstore's internal ordering.
    id_by_position = knowledge_vector_db.index_to_docstore_id

    embeddings = []
    rows_by_category = {category: [] for category in category_colors}
    refs_by_category = {category: [] for category in category_colors}

    for i in range(index.ntotal):
        embeddings.append(index.reconstruct(i))

        doc_id = id_by_position.get(i)
        doc = docstore.search(doc_id) if doc_id is not None else None
        # A failed lookup does not yield a document object; fall back to a
        # positional label in that case.
        metadata = getattr(doc, "metadata", None) or {}
        ref = metadata.get("ref", f"doc_{i}")

        category = next((key for key in category_colors if key in ref), "Other")
        rows_by_category[category].append(i)
        refs_by_category[category].append(ref)

    embeddings = np.asarray(embeddings, dtype=np.float64)

    # Dimensionality reduction with PCA
    pca = PCA(n_components=3)
    reduced_embeddings = pca.fit_transform(embeddings)

    # One trace per category (including empty ones) so the legend is stable.
    traces = []
    for category, color in category_colors.items():
        rows = np.asarray(rows_by_category[category], dtype=int)
        points = reduced_embeddings[rows]

        traces.append(go.Scatter3d(
            x=points[:, 0], y=points[:, 1], z=points[:, 2],
            mode='markers',
            name=category,  # category name shown in the legend
            marker=dict(size=5, color=color),
            text=refs_by_category[category],
            hoverinfo='text'
        ))

    layout = go.Layout(
        title="Visualisation des Embeddings 3D (PCA)",
        legend=dict(
            title="Categories",
            itemsizing="constant"
        )
    )

    fig = go.Figure(data=traces, layout=layout)

    # Write the HTML file without opening a browser.
    pyo.plot(fig, filename=output_file, auto_open=False)
    print(f"Fichier HTML généré: {output_file}")

    return output_file


# Load the full chunk512 index and write its 3D visualisation to ../outputs.
# The last call's return value (the output path) is what the cell displays.
knowledge_vector_db = load_rag_embeddings(path='../embeddings/rag_embeddings_sentence-transformers_all-MiniLM-L6-v2_chunk512',
                                        embeddings_model='sentence-transformers/all-MiniLM-L6-v2', device='cuda')
generate_html_from_faiss(knowledge_vector_db, output_file="../outputs/embeddings_visualization.html")

embedding load
Fichier HTML généré: ../outputs/embeddings_visualization.html


'../outputs/embeddings_visualization.html'