In [None]:
# Data link: https://blent-learning-user-ressources.s3.eu-west-3.amazonaws.com/projects/447dd4/DIC.zip
!mkdir -p ../data/
!curl -L -o ../data/DIC.zip https://blent-learning-user-ressources.s3.eu-west-3.amazonaws.com/projects/447dd4/DIC.zip
!unzip -d ../data ../data/DIC.zip
!rm ../data/DIC.zip

# Loading

In [None]:
!pip uninstall -y torchvision

In [None]:

import glob
from langchain_community.document_loaders import PyPDFLoader
import os

# Load every DIC PDF into a flat list of per-page documents.
documents = []
DIC_path = "../data/DIC/*.pdf"

# sorted() makes the load order, and therefore the order of `documents`,
# independent of the file system's directory listing order.
for file in sorted(glob.glob(DIC_path)):
    try:
        loader = PyPDFLoader(file)  # Returns a list of documents (one per page)
        pages = loader.load()
    except Exception as exc:
        # Best effort: report the failing file (with the reason) and keep going.
        print(f"Erreur survenue pour le fichier '{file}'. ({type(exc).__name__}: {exc})")
        continue

    # basename() extracts the file name regardless of how deep DIC_path is or
    # which path separator glob returned (the previous split("/")[3] broke on both).
    dic_name = os.path.basename(file)
    for i, doc in enumerate(pages):
        doc.metadata["dic_name"] = dic_name
        doc.metadata["page"] = i + 1           # 1-based page numbers
    documents += pages


In [None]:
# Quick sanity check of what was loaded: page count, then the first page.
print(len(documents))
if documents:
    print("metadata: ", documents[0].metadata)
    print(documents[0])
else:
    # Loading errors are only printed, so the list can legitimately be empty;
    # say so explicitly instead of failing with an IndexError on documents[0].
    print("Aucun document n'a été chargé : vérifiez le contenu de ../data/DIC/.")



# Chunking

In [None]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Split each page into overlapping chunks of at most 600 characters.
# Separators are tried in order, coarsest first: paragraph break, line break,
# space, then single characters. The previous order ["\n", "\n\n"] matched "\n"
# first, so the paragraph separator was never used, and without the " " / ""
# fallbacks a single line longer than chunk_size was emitted as an oversized chunk.
text_splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", " ", ""],
    chunk_size=600,
    chunk_overlap=60,
    length_function=len,
)

chunks = text_splitter.split_documents(documents=documents)
print(f"{len(chunks)} chunks ont été créés par le splitter à partir du document PDF.")

# Embedding

In [None]:
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_chroma import Chroma

def get_vectorstore(docs, model_name="intfloat/multilingual-e5-large", normalize_embeddings=True,
                    persist_directory="../data/vector_store"):
    """Embed `docs` and index them in a persisted Chroma vector store.

    Args:
        docs: Documents to index; each must expose `page_content` and `metadata`.
        model_name: Hugging Face model used to compute the embeddings.
        normalize_embeddings: Forwarded to the encoder via `encode_kwargs`.
        persist_directory: Directory where Chroma stores the collection.

    Returns:
        The Chroma vector store built from `docs`.

    Each document receives an id derived from its metadata and content, so
    indexing the same chunks again into an existing `persist_directory` targets
    the same records instead of appending duplicates (this relies on Chroma
    treating equal ids as the same record).
    """
    import hashlib
    import json

    encode_kwargs = {"normalize_embeddings": normalize_embeddings}
    embedding = HuggingFaceEmbeddings(model_name=model_name, encode_kwargs=encode_kwargs)

    ids = []
    occurrences = {}
    for doc in docs:
        payload = json.dumps(doc.metadata, sort_keys=True, default=str) + "\n" + doc.page_content
        digest = hashlib.sha256(payload.encode("utf-8")).hexdigest()
        # Identical chunks (same metadata and text) get a running suffix so
        # that ids stay unique within a single batch.
        rank = occurrences.get(digest, 0)
        occurrences[digest] = rank + 1
        ids.append(f"{digest}-{rank}")

    vector_store = Chroma.from_documents(
        documents=docs,
        embedding=embedding,
        ids=ids,
        persist_directory=persist_directory,
    )
    return vector_store

# Index the chunks, then expose the store as a retriever that returns the
# 5 most similar chunks for a query.
vector_store = get_vectorstore(docs=chunks)

retriever = vector_store.as_retriever(
    search_kwargs={"k": 5},
    search_type="similarity",
)



In [None]:
# Smoke test of the retriever: print the text of each chunk returned for a
# sample question.
results = retriever.invoke("Qu'est ce que l'OPCVM?")

for position, hit in enumerate(results):
    chunk_text = hit.page_content
    print("--- ")
    print(f"== Contenu du chunk {position} ==\n{chunk_text}")

# LLM

In [None]:
!huggingface-cli login --token=...

In [None]:
import transformers
import torch

from transformers import BitsAndBytesConfig

# Load the tokenizer and the causal language model used to generate answers.
model_id = "mistralai/Mistral-7B-Instruct-v0.3"

# Only the configuration is needed here. The previous code called
# AutoModelForCausalLM.from_pretrained, which loaded the full model weights a
# first time and then passed that model object as `config=` below.
model_config = transformers.AutoConfig.from_pretrained(model_id)
tokenizer = transformers.AutoTokenizer.from_pretrained(model_id)
model = transformers.AutoModelForCausalLM.from_pretrained(
    model_id,
    trust_remote_code=True,
    config=model_config,
    device_map="auto",  # let the library decide where to place the weights
)

In [None]:
from langchain_community.llms import HuggingFacePipeline

from transformers import pipeline

# Build the transformers text-generation pipeline first, then wrap it so that
# LangChain can use it as an LLM.
generation_pipeline = pipeline(
    "text-generation",
    tokenizer=tokenizer,
    model=model,
    do_sample=False,
    max_new_tokens=4096,
    # Important: return only the generated text, not the initial prompt.
    return_full_text=False,
)

llm = HuggingFacePipeline(pipeline=generation_pipeline)

# RAG Pipeline

In [None]:
from langchain_core.prompts import PromptTemplate
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain

# Prompt used to turn the retrieved context (plus the chat history) into an
# answer. It is assembled from its parts: the instruction sentences are joined
# by single spaces, the sections by blank lines.
_prompt_instructions = " ".join([
    "You are an assistant for question-answer tasks.",
    "Use the following pieces of retrieved context to answer the question.",
    "If you don't know the answer, just say that you don't know.",
    "Use three sentences maximum and keep the answer concise.",
])

_prompt_sections = [
    _prompt_instructions,
    "Chat history:\n{chat_history}",
    "Context:\n{context}",
    "Question: {question}",
    "Answer:",
]

prompt_template = PromptTemplate.from_template("\n\n".join(_prompt_sections))

# Conversation memory shared with the chain: it reads the user's input from
# "question", stores the chain's "answer", and exposes the accumulated history
# under "chat_history" as message objects.
memory = ConversationBufferMemory(
    input_key="question",
    output_key="answer",
    memory_key="chat_history",
    return_messages=True,
)

# Conversational RAG chain: retrieves chunks with `retriever`, answers with
# `llm` using the custom prompt, and also returns the chunks it relied on.
qa_chain = ConversationalRetrievalChain.from_llm(
    llm=llm,
    retriever=retriever,
    combine_docs_chain_kwargs={"prompt": prompt_template},
    return_source_documents=True,
    memory=memory,
)

def rag_pipeline(question: str):
    """Answer `question` with the conversational RAG chain.

    Args:
        question: The user's question, passed to the chain under the "question" key.

    Returns:
        A tuple `(answer, sources)` where `answer` is the chain's "answer" value
        and `sources` is a list of `{"dic_name": ..., "page": ...}` dicts, one
        per returned source document (a value is None when the metadata key is
        missing).
    """
    # Use .invoke() rather than calling the chain object directly: direct calls
    # are the deprecated entry point, and .invoke() matches how the retriever
    # is queried elsewhere in this notebook.
    result = qa_chain.invoke({"question": question})
    answer = result["answer"]

    # Simple formatting of the sources for the DIC files.
    sources = [
        {
            "dic_name": source_doc.metadata.get("dic_name"),
            "page": source_doc.metadata.get("page"),
        }
        for source_doc in result["source_documents"]
    ]

    return answer, sources

In [None]:
# Sample user question sent through the full RAG pipeline.
query = """
Donnes moi des informations sur l'OPCVM. quand a t'il ete creer et a quoi ca sert?
"""

# Run the request
answer, sources = rag_pipeline(question=query)

# Show the answer, then one line per source (file name and page).
print("Réponse :\n", answer)
print("\nSources :")
for source in sources:
    dic_name, page = source["dic_name"], source["page"]
    print(f"- {dic_name} (page {page})")