# Answering questions

Create a model capable of answering questions with citations using knowledge base.

## Imports

In [10]:
from transformers import pipeline
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

## Reader model

Hugging face utilisent le modèle zephyr-7b-beta qui date un peu, et ils le quantize. Pour quantizer il faut un GPU. Je suppose qu'une fois la quantization faite on peut charger le modèle sur CPU.

Pour les tests j'ai pris un petit modèle pas trop mal classé dans le leaderborad. https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard#/?params=-1%2C3 


In [11]:
READER_MODEL_NAME = "Qwen/Qwen2.5-1.5B-Instruct"

In [12]:
model = AutoModelForCausalLM.from_pretrained(READER_MODEL_NAME)
tokenizer = AutoTokenizer.from_pretrained(READER_MODEL_NAME)

READER_LLM = pipeline(
    model=model,
    tokenizer=tokenizer,
    task="text-generation",
    do_sample=True,
    temperature=0.2,
    repetition_penalty=1.1,
    return_full_text=False,
    max_new_tokens=500,
)

Sliding Window Attention is enabled but not implemented for `sdpa`; unexpected results may be encountered.
Device set to use cpu


In [5]:
READER_LLM("What is 4+4? Answer:")

[{'generated_text': ' 8. What is the answer to this question?\nThe answer to this question is 8. \n\nTo break it down:\n\n1. The problem states "what is 4 + 4?"\n2. When you add two numbers together, you combine their values.\n3. In this case, we\'re adding 4 and 4.\n\nSo, when you add 4 and 4:\n- You have four units\n- You add another four units\n\nWhen you put these together, you get eight units in total.\n\nTherefore, the answer to the question "what is 4 + 4?" is indeed 8.'}]

## Prompt

In [4]:
prompt_in_chat_format = [
    {
        "role": "system",
        "content": """Use the information contained in the context to provide a comprehensive answer to the question.  
        - Answer only the question asked, in a concise and relevant manner.  
        - Always cite the sources used by indicating their.  
        - Explain why each reference was used to support the answer.  
        - If the answer cannot be deduced from the context, do not provide one.
        
        Exemple:
        - The correct answer is ...
        - Reference sources used: explain each reference and why you use them.
        - If the question was a multiple choice, explain why the other choise are wrong.
        """,
    },
    {
        "role": "user",
        "content": """Context:
{context}
---
Now here is the question you need to answer.

Question: {question}""",
    },
]
RAG_PROMPT_TEMPLATE = tokenizer.apply_chat_template(
    prompt_in_chat_format, tokenize=False, add_generation_prompt=True
)
print(RAG_PROMPT_TEMPLATE)

<|im_start|>system
Use the information contained in the context to provide a comprehensive answer to the question.  
        - Answer only the question asked, in a concise and relevant manner.  
        - Always cite the sources used by indicating their.  
        - Explain why each reference was used to support the answer.  
        - If the answer cannot be deduced from the context, do not provide one.

        Exemple:
        - The correct answer is ...
        - Reference sources used: explain each reference and why you use them.
        - If the question was a multiple choice, explain why the other choise are wrong.
        <|im_end|>
<|im_start|>user
Context:
{context}
---
Now here is the question you need to answer.

Question: {question}<|im_end|>
<|im_start|>assistant



## Code from notebook *1_Retrieval_capacity.ipynb* for the retrieval

In [9]:
# Load embeddings
from langchain.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings

EMBEDDING_MODEL_NAME = "thenlper/gte-small"

embedding_model = HuggingFaceEmbeddings(
    model_name=EMBEDDING_MODEL_NAME,
    multi_process=True,
    model_kwargs={"device": "cpu"},  # replace 'cpu' by 'cuda' if you have Nvidia gpu
    encode_kwargs={"normalize_embeddings": True},  # Set `True` for cosine similarity
)

KNOWLEDGE_VECTOR_DATABASE = FAISS.load_local("../outputs/rag_embeddings_thenlper_gte-small", embedding_model, allow_dangerous_deserialization=True)

  embedding_model = HuggingFaceEmbeddings(


## Test Reader

In [13]:
user_query = """Your Client, A Inc, is a sub-licensee under European patent application EP-1. Can the sub-licence be recorded in the European Patent Register?
 
A    No, it is not possible to record sub-licences in the European Patent Register.
 
B    Yes, any sub-licence can be recorded in the European Patent Register.
 
C    Yes, provided the licensee granting the sub-licence has recorded its licence in the European Patent Register.
"""

Correct answer:

The correct answer is C.

A sub-licence can only be recorded in the European Patent Register if it is granted by a licensee whose licence is recorded in 
the Register (Rule 24(b) EPC, implementing Article 73 EPC).

In [42]:
print(f"\nStarting retrieval for {user_query=}...")
retrieved_docs = KNOWLEDGE_VECTOR_DATABASE.similarity_search(query=user_query, k=5)


Starting retrieval for user_query='Your Client, A Inc, is a sub-licensee under European patent application EP-1. Can the sub-licence be recorded in the European Patent Register?\n\nA    No, it is not possible to record sub-licences in the European Patent Register.\n\nB    Yes, any sub-licence can be recorded in the European Patent Register.\n\nC    Yes, provided the licensee granting the sub-licence has recorded its licence in the European Patent Register.\n'...


In [41]:
context = "\nExtracted documents:\n"

context += "".join([f'Content: {doc.page_content} \nSource: {doc.metadata['ref']}\n\n' for i, doc in enumerate(retrieved_docs)])
print(context)

final_prompt = RAG_PROMPT_TEMPLATE.format(question=user_query, context=context)

# Redact an answer
answer = READER_LLM(final_prompt)[0]["generated_text"]
print(answer)


Extracted documents:
Content: Title: EPC Rule 24. Content: Rule 24 FootnoteRef37 Special entries for licence registrations A licence in respect of a European patent application shall be recorded (a) as an exclusive licence if the applicant and the licensee so request; (b) as a sub-licence where it is granted by a licensee whose licence is recorded in the European Patent Register. FootnoteRef37 See decision of the President of the EPO ( OJ EPO 2013, 600 ). See decisions of the President of the EPO of 09.02.2024 ( OJ EPO 2024, A17 and OJ EPO 2024, A18 ) and notice from the EPO of 09.02.2024 ( OJ EPO 2024, A22 ). 
Source: EPC Rule 24

Content: Title: Guidelines for Examination in the EPO, E-XIV, 6.1. Content: 6.1 Registration A European patent application may give rise to rightsin rem, may be licensed and may be the subject of legal means of execution. This includes contractual licences only (Art. 73). Licences and other rights may be geographically limited to parts of the territories of

## Reranking (PAS AU POINT A PARTIR D'ICI)

A good option for RAG is to retrieve more documents than you want in the end, then rerank the results with a more powerful retrieval model before keeping only the top_k.

In [6]:
!pip install ragatouille

Collecting ragatouille
  Using cached RAGatouille-0.0.9-py3-none-any.whl.metadata (28 kB)
Collecting llama-index (from ragatouille)
  Using cached llama_index-0.12.25-py3-none-any.whl.metadata (12 kB)
Collecting colbert-ai>=0.2.19 (from ragatouille)
  Using cached colbert_ai-0.2.21-py3-none-any.whl.metadata (12 kB)
Collecting onnx (from ragatouille)
  Using cached onnx-1.17.0.tar.gz (12.2 MB)
  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
[?25hCollecting srsly (from ragatouille)
  Using cached srsly-2.5.1-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (19 kB)
INFO: pip is looking at multiple versions of ragatouille to determine which version is compatible with other requirements. This could take a while.
Collecting ragatouille
  Using cached ragatouille-0.0.8.post4-py3-none-any.whl.metadata (15 kB)
Collecting colbert-ai==0.2.19 (from ragatouille)
 

In [15]:
from ragatouille import RAGPretrainedModel

RERANKER = RAGPretrainedModel.from_pretrained("colbert-ir/colbertv2.0")

ModuleNotFoundError: No module named 'ragatouille'

## Assemble it all !

In [14]:
from transformers import Pipeline
from langchain.docstore.document import Document as LangchainDocument
from typing import Optional, List, Tuple


def answer_with_rag(
    question: str,
    llm: Pipeline,
    knowledge_index: FAISS,
    reranker: Optional[RAGPretrainedModel] = None,
    num_retrieved_docs: int = 30,
    num_docs_final: int = 5,
) -> Tuple[str, List[LangchainDocument]]:
    # Gather documents with retriever
    print("=> Retrieving documents...")
    relevant_docs = knowledge_index.similarity_search(query=question, k=num_retrieved_docs)
    relevant_docs = [doc.page_content for doc in relevant_docs]  # Keep only the text

    # Optionally rerank results
    if reranker:
        print("=> Reranking documents...")
        relevant_docs = reranker.rerank(question, relevant_docs, k=num_docs_final)
        relevant_docs = [doc["content"] for doc in relevant_docs]

    relevant_docs = relevant_docs[:num_docs_final]

    # Build the final prompt
    context = "\nExtracted documents:\n"
    context += "".join([f"Document {str(i)}:::\n" + doc for i, doc in enumerate(relevant_docs)])

    final_prompt = RAG_PROMPT_TEMPLATE.format(question=question, context=context)

    # Redact an answer
    print("=> Generating answer...")
    answer = llm(final_prompt)[0]["generated_text"]

    return answer, relevant_docs

NameError: name 'RAGPretrainedModel' is not defined

In [None]:
# Test
user_query = """Your Client, A Inc, is a sub-licensee under European patent application EP-1. Can the sub-licence be recorded in the European Patent Register?
 
A    No, it is not possible to record sub-licences in the European Patent Register.
 
B    Yes, any sub-licence can be recorded in the European Patent Register.
 
C    Yes, provided the licensee granting the sub-licence has recorded its licence in the European Patent Register.
"""

answer, relevant_docs = answer_with_rag(user_query, READER_LLM, KNOWLEDGE_VECTOR_DATABASE, reranker=RERANKER)

print("==================================Answer==================================")
print(f"{answer}")
print("==================================Source docs==================================")
for i, doc in enumerate(relevant_docs):
    print(f"Document {i}------------------------------------------------------------")
    print(doc)