In [3]:
# Data link: https://blent-learning-user-ressources.s3.eu-west-3.amazonaws.com/projects/447dd4/DIC.zip
!mkdir -p ../data/
!curl -L -o ../data/DIC.zip https://blent-learning-user-ressources.s3.eu-west-3.amazonaws.com/projects/447dd4/DIC.zip
!unzip -d ../data ../data/DIC.zip
!rm ../data/DIC.zip

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 6707k  100 6707k    0     0  2473k      0  0:00:02  0:00:02 --:--:-- 2473k
Archive:  ../data/DIC.zip
replace ../data/DIC/Allianz.pdf? [y]es, [n]o, [A]ll, [N]one, [r]ename: ^C


# Loading

In [2]:
!pip uninstall -y torchvision

Found existing installation: torchvision 0.20.1+cu124
Uninstalling torchvision-0.20.1+cu124:
  Successfully uninstalled torchvision-0.20.1+cu124
[0m

In [None]:

import glob
from langchain_community.document_loaders import PyPDFLoader
import os

documents = []
DIC_path = os.path.join(os.path.expanduser("~/RAG_ALM_assistant"), "data/DIC/*.pdf")
#DIC_path = "../data/DIC/*.pdf"

for file in glob.glob(DIC_path):
    try:
        loader = PyPDFLoader(file)  # Retourne une liste de document (un pour chaque page)
        pages = loader.load()
        for i, doc in enumerate(pages):
            doc.metadata["dic_name"] = str(file).split("/")[3]
            doc.metadata["page"] = i + 1           # pages 1-based
        documents += pages
        
    except Exception:
        print(f"Erreur survenue pour le fichier '{file}'.")


Allianz.pdf
Carmigac.pdf
credit_mutuel_arkea.pdf
fidelity.pdf
fidelity2.pdf
FR0007040373_DIC_FR_20230630.pdf
FR0007050570_DIC_FR_20230630.pdf
FR0010902726_DIC_FR_20230831.pdf
FR0011443225_DIC_FR_20230630.pdf
FR0011443233_DIC_FR_20230630.pdf
FR001400BQ78_DIC_FR_20230630.pdf
Independance et expension.pdf
JP Morgan.pdf
KIDPRIIPs_260056_85665_FRA_FRA_20230601.pdf
KIDPRIIPs_359588_52248_FRA_FRA_20230824.pdf
KIDPRIIPs_359896_87124_FRA_FRA_20230830.pdf
KIDPRIIPs_375641_89603_FRA_FRA_20230905.pdf
KIDPRIIPs_388188_51570_FRA_FRA_20231030.pdf
KIDPRIIPs_388360_51598_FRA_FRA_20231030.pdf
KIDPRIIPs_388409_51596_FRA_FRA_20231030.pdf
KIDPRIIPs_388767_52272_FRA_FRA_20231030.pdf
KIDPRIIPs_389140_85227_FRA_FRA_20231030.pdf
KIDPRIIPs_389170_51360_FRA_FRA_20231030.pdf
KIDPRIIPs_389214_51576_FRA_FRA_20231030.pdf
KIDPRIIPs_390085_51571_FRA_FRA_20231030.pdf
KIDPRIIPs_390180_51557_FRA_FRA_20231030.pdf
KIDPRIIPs_391370_57904_FRA_FRA_20231030.pdf
KIDPRIIPs_391737_63121_FRA_FRA_20231030.pdf
KIDPRIIPs_391793_48169

In [25]:
print(len(documents))
print("metadata: ", documents[0].metadata)
print(documents[0])



173
metadata:  {'producer': 'Actuate PDF Writer (Low Resolution) 2.1', 'creator': 'Actuate e.Reports', 'creationdate': '2022-07-29T08:11:45+01:00', 'title': '', 'subject': '', 'author': 'IDS GmbH - Analysis and Reporting Services', 'keywords': 'FR0010032326 (22.08.2022)', 'source': '../data/DIC/Allianz.pdf', 'total_pages': 2, 'page': 1, 'page_label': '1', 'dic_name': 'Allianz.pdf'}
page_content=' I n f o r m a t i o n s   c l Ø s   p o u r   l  i n v e s t i s s e u r
 C e   d o c u m e n t   f o u r n i t   d e s   i n f o r m a t i o n s   e s s e n t i e l l e s   a u x   i n v e s t i s s e u r s   d e   c e t   O P C V M .   I l   n e   s  a g i t   p a s   d  u n   d o c u m e n t   p r o m o t i o n n e l .   L e s
 i n f o r m a t i o n s   q u  i l   c o n t i e n t   v o u s   s o n t   f o u r n i e s   c o n f o r m Ø m e n t   à   u n e   o b l i g a t i o n   l Ø g a l e ,   a f i n   d e   v o u s   a i d e r   à   c o m p r e n d r e   e n   q u o i
 c o n s i s t e

# Chunking

In [30]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(
    separators=["\n", "\n\n"],
    chunk_size = 600,
    chunk_overlap = 60,
    length_function = len
)

chunks = text_splitter.split_documents(documents=documents)
print(f"{len(chunks)} chunks ont été créés par le splitter à partir du document PDF.")

1696 chunks ont été créés par le splitter à partir du document PDF.


# Embedding

In [None]:
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_chroma import Chroma

def get_vectorstore(docs, model_name="intfloat/multilingual-e5-large", normalize_embeddings=True):
    encode_kwargs = {"normalize_embeddings": normalize_embeddings}
    embedding = HuggingFaceEmbeddings(model_name=model_name, encode_kwargs=encode_kwargs)
    vectore_store = Chroma.from_documents(documents=docs, embedding=embedding, persist_directory="../data/vector_store")
    return vectore_store

vector_store = get_vectorstore(chunks)

retriever = vector_store.as_retriever(search_type = 'similarity', search_kwargs={'k':5})



In [9]:
results = retriever.invoke("Qu'est ce que l'OPCVM?")

for i, result in enumerate(results):
    print("--- ")
    print(f"== Contenu du chunk {i} ==\n{result.page_content}")

--- 
== Contenu du chunk 0 ==
Cet OPCVM est un OPCVM de capitalisation (revenus intégralement 
réinvestis).
Précision sur le niveau de risque :
L ’indicateur de risque de niveau 5 reflète principalement le risque dû à 
son exposition aux actions du secteur immobilier de l’Union européenne. 
Il intègre le risque de change qui découle des investissements dans des 
devises autres que l’Euro.
De plus, la catégorie de risque associée à cet OPCVM n’est pas garantie 
et est susceptible d’évoluer dans le temps.
Cet OPCVM ne bénéficie pas de garantie en capital.
Les données historiques, telles que celles utilisées pour calculer
--- 
== Contenu du chunk 1 ==
Cet OPCVM est un OPCVM de capitalisation (revenus intégralement 
réinvestis).
Précision sur le niveau de risque :
L ’indicateur de risque de niveau 5 reflète principalement le risque dû à 
son exposition aux actions du secteur immobilier de l’Union européenne. 
Il intègre le risque de change qui découle des investissements dans des 
devises 

# LLM

In [10]:
!huggingface-cli login --token=...

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: read).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [16]:
import transformers
import torch

from transformers import BitsAndBytesConfig

model_id = "mistralai/Mistral-7B-Instruct-v0.3"

model_config = transformers.AutoModelForCausalLM.from_pretrained(model_id)
tokenizer = transformers.AutoTokenizer.from_pretrained(model_id)
model = transformers.AutoModelForCausalLM.from_pretrained(
    model_id,
    trust_remote_code = True,
    config = model_config,
    device_map = 'auto'
)

Downloading shards:   0%|          | 0/3 [00:00<?, ?it/s]

model-00001-of-00003.safetensors:  45%|####4     | 2.22G/4.95G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/4.55G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/587k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [17]:
from langchain_community.llms import HuggingFacePipeline

from transformers import pipeline

llm = HuggingFacePipeline(
    pipeline = pipeline(
        'text-generation',
        model=model, 
        tokenizer=tokenizer,
        max_new_tokens = 4096,
        do_sample = False,
        return_full_text = False # Très important ! On ne veut pas le prompt initial
    )
)

  llm = HuggingFacePipeline(


# RAG Pipeline

In [None]:
from langchain_core.prompts import PromptTemplate
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain

prompt_template = PromptTemplate.from_template(
    "You are an assistant for question-answer tasks. "
    "Use the following pieces of retrieved context to answer the question. "
    "If you don't know the answer, just say that you don't know. "
    "Use three sentences maximum and keep the answer concise.\n\n"
    "Chat history:\n{chat_history}\n\n"
    "Context:\n{context}\n\n"
    "Question: {question}\n\n"
    "Answer:"
)

memory = ConversationBufferMemory(
    memory_key="chat_history",
    return_messages=True,
    output_key="answer",
    input_key="question"
)

qa_chain = ConversationalRetrievalChain.from_llm(
    llm=llm,
    retriever=retriever,
    memory=memory,
    return_source_documents=True,
    combine_docs_chain_kwargs={"prompt": prompt_template}
)

def rag_pipeline(question: str):
    result = qa_chain({"question": question})
    answer = result["answer"]
    source_docs = result["source_documents"]

    # formatage simple des sources pour les DIC
    sources = []
    for d in source_docs:
        m = d.metadata
        sources.append({
            "dic_name": m.get("dic_name"),
            "page": m.get("page")
        })

    return answer, sources

In [33]:
query = """
Donnes moi des informations sur l'OPCVM. quand a t'il ete creer et a quoi ca sert?
"""

# Effectuer une requête
answer, sources = rag_pipeline(query)

print("Réponse :\n", answer)
print("\nSources :")
for s in sources:
    print(f"- {s['dic_name']} (page {s['page']})")

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Réponse :
 
The Organisme de Placement Collectif en Valeurs Mobilières (OPCVM) was created for a duration of 99 years. It is an investment product that allows collective investment in securities.

Sources :
- FR0007050570_DIC_FR_20230630.pdf (page 1)
- FR0010902726_DIC_FR_20230831.pdf (page 1)
- FR0011443225_DIC_FR_20230630.pdf (page 1)
- FR0011443233_DIC_FR_20230630.pdf (page 1)
- FR0010902726_DIC_FR_20230831.pdf (page 2)
