In [1]:
import logging
from pypdf import PdfReader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import TextLoader
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma
from langchain_community.document_loaders import PyPDFLoader
from langchain_mistralai import ChatMistralAI
from langchain_core.prompts import ChatPromptTemplate
from sentence_transformers import SentenceTransformer
from langchain_core.runnables import RunnablePassthrough
from langchain_core.runnables.history import RunnableWithMessageHistory
from langchain_community.chat_message_histories import ChatMessageHistory
from langchain_core.output_parsers import StrOutputParser


MISTRAL_API_KEY= "LrUF1SfniLMzv5O8kWuZA73mUxl04ihh"
FILENAME_TEST_PATH = "/home/eric/RAG/RAG_langchain_tuto/data/raw/CV_Eric_Wetzel_2026.pdf"

#LOAD File
loader = PyPDFLoader(FILENAME_TEST_PATH)
documents = loader.load()
logging.info(f"{len(documents)} pages loaded from PDF")
logging.basicConfig(level=logging.INFO)


# Chunck avancé avec chevauchement et priorité de séparateurs
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
    separators=["\n\n", "\n", ".", "!", "?"]
)
texts = text_splitter.split_documents(documents)
logging.info(f"{len(texts)} chunks created")

logging.basicConfig(level=logging.INFO)

# Création des embeddings
embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

# Ingestion dans le vector store
try:
    docsearch = Chroma.from_documents(texts, embeddings)
    print(docsearch)
    logging.info(f"{len(texts)} chunks ingested into Chroma")
except Exception as e:
    logging.error(f"Failed to ingest documents: {e}")

# --- Configuration du LLM Mistral ---
try:
    llm = ChatMistralAI(
        model="mistral-large-latest",  # Meilleur pour l'analyse technique
        temperature=0.2,               # Réponses plus déterministes
        max_tokens=512,
        mistral_api_key=MISTRAL_API_KEY
    )
except Exception as e:
    logging.error(f"Erreur lors de l'initialisation du LLM : {e}")
    raise

# --- Fonction de recherche simple dans le vector store Chroma ---
def retrieve_chunks(query: str, docsearch, top_k: int = 3):
    if top_k <= 0:
        raise ValueError("top_k doit être supérieur à 0")
    try:
        results = docsearch.similarity_search(query, k=top_k)
        return [doc.page_content for doc in results]
    except Exception as e:
        logging.error(f"Erreur lors de la récupération des chunks : {e}")
        return []

# --- Fonction pour générer la réponse finale ---
prompt_template = ChatPromptTemplate.from_template("""
Tu es un assistant spécialisé dans l’analyse de profils professionnels.
Contexte : {context}
Question : {query}
Réponse (sois concis, max 3 phrases) :
""")


def answer_query(query: str, docsearch, llm):
    # Récupérer les chunks pertinents
    chunks = retrieve_chunks(query, docsearch)
    if not chunks:
        return "Aucun contexte pertinent trouvé."
    logging.info(f"{len(chunks)} chunks retrieved for query.")

    # Construire le prompt
    context = "\n\n".join(chunks)
    # Tronque le contexte si trop long (ex : 4000 tokens max)
    context = context[:16000]  # Limite conservative pour mistral-large

    prompt = prompt_template.format(context=context, query=query)
    try:
        response = llm.invoke(prompt)
        return response.content  # Accède au contenu de la réponse
    except Exception as e:
        logging.error(f"Erreur lors de la génération de la réponse : {e}")
        return "Désolé, une erreur est survenue."



# --- Exemple d'utilisation ---
query = "Est-ce que le profil correspond à un poste de Data Scientist ?"
response = answer_query(query, docsearch, llm)
print(response)


prompt = ChatPromptTemplate.from_messages([
    ("system", "Tu es un assistant spécialisé dans l’analyse de profils professionnels."),
    ("placeholder", "{chat_history}"),
    ("human", "{question}")
])

rag_chain = (
    {
        "context": lambda x: retriever.invoke(x["question"]),
        "question": RunnablePassthrough()
    }
    | prompt
    | llm
    | StrOutputParser()
)

store = {}

def get_session_history(session_id: str):
    if session_id not in store:
        store[session_id] = ChatMessageHistory()
    return store[session_id]

chat = RunnableWithMessageHistory(
    rag_chain,
    get_session_history,
    input_messages_key="question",
    history_messages_key="chat_history",
)

def qa():
    session_id = "cli-session"

    while True:
        query = input("Question: ")

        if query.lower() in ["quit", "exit", "bye"]:
            print("Answer: Goodbye!")
            break

        answer = chat.invoke(
            {"question": query},
            config={"configurable": {"session_id": session_id}}
        )

        print("Answer:", answer)





  from cryptography.hazmat.primitives.ciphers.algorithms import AES, ARC4
  from .autonotebook import tqdm as notebook_tqdm
  embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
Loading weights: 100%|██████████| 103/103 [00:00<00:00, 1598.31it/s, Materializing param=pooler.dense.weight]                             
BertModel LOAD REPORT from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


<langchain_community.vectorstores.chroma.Chroma object at 0xec12639b4a70>
Oui, ce profil correspond bien à un poste de **Data Scientist**, avec une forte spécialisation en **IA/ML** et une expérience avancée en conception de solutions prédictives et en déploiement de modèles. Cependant, son parcours récent (Account Manager chez Databricks, entrepreneur) montre une évolution vers des rôles hybrides **techno-commerciaux**, ce qui peut élargir sa cible vers des postes comme **AI Engineer**, **MLOps** ou **Data Strategist**. Son expertise technique reste solide pour un poste de Data Scientist senior.


In [None]:
import logging
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma
from langchain_mistralai import ChatMistralAI
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_core.runnables.history import RunnableWithMessageHistory
from langchain_community.chat_message_histories import ChatMessageHistory
from langchain_core.output_parsers import StrOutputParser

# ---------------- Configuration ---------------- #
MISTRAL_API_KEY = "LrUF1SfniLMzv5O8kWuZA73mUxl04ihh"
FILENAME_TEST_PATH = "/home/eric/RAG/RAG_langchain_tuto/data/raw/CV_Eric_Wetzel_2026.pdf"

logging.basicConfig(level=logging.INFO)

# ---------------- Chargement du PDF ---------------- #
loader = PyPDFLoader(FILENAME_TEST_PATH)
documents = loader.load()
logging.info(f"{len(documents)} pages loaded from PDF")

# ---------------- Split avancé ---------------- #
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
    separators=["\n\n", "\n", ".", "!", "?"]
)
texts = text_splitter.split_documents(documents)
logging.info(f"{len(texts)} chunks created")

# ---------------- Embeddings & VectorStore ---------------- #
embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
docsearch = Chroma.from_documents(texts, embeddings)
logging.info(f"{len(texts)} chunks ingested into Chroma")

# ---------------- LLM Mistral ---------------- #
llm = ChatMistralAI(
    model="mistral-large-latest",
    temperature=0.2,
    max_tokens=512,
    mistral_api_key=MISTRAL_API_KEY
)

# ---------------- Prompt RAG ---------------- #
prompt = ChatPromptTemplate.from_messages([
    ("system", "Tu es un assistant spécialisé dans l’analyse de profils professionnels."),
    ("placeholder", "{chat_history}"),
    ("human", "{question}")
])

# ---------------- Retriever Runnable ---------------- #
retriever = docsearch.as_retriever(search_kwargs={"k": 3})

# ---------------- Chaîne RAG ---------------- #
rag_chain = (
    {
        "context": retriever,           # récupère automatiquement les chunks pertinents
        "question": RunnablePassthrough()
    }
    | prompt
    | llm
    | StrOutputParser()
)

# ---------------- Mémoire ---------------- #
store = {}
def get_session_history(session_id: str):
    if session_id not in store:
        store[session_id] = ChatMessageHistory()
    return store[session_id]

chat = RunnableWithMessageHistory(
    rag_chain,
    get_session_history,
    input_messages_key="question",
    history_messages_key="chat_history",
)

# ---------------- Chat interactif ---------------- #
def qa():
    session_id = "cli-session"

    print("Chat interactif démarré. Tape 'quit', 'exit' ou 'bye' pour quitter.\n")
    while True:
        query = input("Question: ")

        if query.lower() in ["quit", "exit", "bye"]:
            print("Answer: Goodbye!")
            break

        answer = chat.invoke(
            {"question": query},
            config={"configurable": {"session_id": session_id}}
        )

        print("Answer:", answer)


# ---------------- Lancement ---------------- #
if __name__ == "__main__":
    qa()


  from .autonotebook import tqdm as notebook_tqdm
  from cryptography.hazmat.primitives.ciphers.algorithms import AES, ARC4
INFO:root:2 pages loaded from PDF
INFO:root:9 chunks created
  embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
INFO:sentence_transformers.SentenceTransformer:Use pytorch device_name: cpu
INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: all-MiniLM-L6-v2
INFO:httpx:HTTP Request: HEAD https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2/resolve/main/modules.json "HTTP/1.1 307 Temporary Redirect"
INFO:httpx:HTTP Request: HEAD https://huggingface.co/api/resolve-cache/models/sentence-transformers/all-MiniLM-L6-v2/c9745ed1d9f207416be6d2e6f8de32d1f16199bf/modules.json "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: HEAD https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2/resolve/main/config_sentence_transformers.json "HTTP/1.1 307 Temporary Redirect"
INFO:httpx:HTTP Request: HEAD https://huggingface.co/api

Chat interactif démarré. Tape 'quit', 'exit' ou 'bye' pour quitter.



In [2]:
import logging
from pypdf import PdfReader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import TextLoader
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma
from langchain_community.document_loaders import PyPDFLoader

MISTRAL_API_KEY= "Vz8J1il2uzpm9jVcXZGJaJwBYIMVfwms"
FILENAME_TEST_PATH = "/home/eric/RAG/RAG_langchain_tuto/data/raw/CV_Eric_Wetzel_2026.pdf"

#LOAD File
loader = PyPDFLoader(FILENAME_TEST_PATH)
documents = loader.load()
logging.info(f"{len(documents)} pages loaded from PDF")
logging.basicConfig(level=logging.INFO)


# Chunck avancé avec chevauchement et priorité de séparateurs
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
    separators=["\n\n", "\n", ".", "!", "?"]
)
texts = text_splitter.split_documents(documents)
logging.info(f"{len(texts)} chunks created")

logging.basicConfig(level=logging.INFO)

# Création des embeddings
embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

# Ingestion dans le vector store
try:
    print("1")
    docsearch = Chroma.from_documents(texts, embeddings)
    print(docsearch)
    logging.info(f"{len(texts)} chunks ingested into Chroma")
    print("1")
except Exception as e:
    logging.error(f"Failed to ingest documents: {e}")
    print("2")




  from cryptography.hazmat.primitives.ciphers.algorithms import AES, ARC4
  embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
Loading weights: 100%|██████████| 103/103 [00:00<00:00, 2374.03it/s, Materializing param=pooler.dense.weight]                             
BertModel LOAD REPORT from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


1
<langchain_community.vectorstores.chroma.Chroma object at 0xe58d7e8d74a0>
1


In [None]:
from langchain_mistralai import ChatMistralAI
from langchain_core.prompts import ChatPromptTemplate
from sentence_transformers import SentenceTransformer
import numpy as np
import logging

# --- Configuration du LLM Mistral ---
try:
    llm = ChatMistralAI(
        model="mistral-large-latest",  # Meilleur pour l'analyse technique
        temperature=0.2,               # Réponses plus déterministes
        max_tokens=512,
        mistral_api_key=MISTRAL_API_KEY
    )
except Exception as e:
    logging.error(f"Erreur lors de l'initialisation du LLM : {e}")
    raise

# --- Fonction de recherche simple dans le vector store Chroma ---
def retrieve_chunks(query: str, docsearch, top_k: int = 3):
    if top_k <= 0:
        raise ValueError("top_k doit être supérieur à 0")
    try:
        results = docsearch.similarity_search(query, k=top_k)
        return [doc.page_content for doc in results]
    except Exception as e:
        logging.error(f"Erreur lors de la récupération des chunks : {e}")
        return []

# --- Fonction pour générer la réponse finale ---
prompt_template = ChatPromptTemplate.from_template("""
Tu es un assistant spécialisé dans l’analyse de profils professionnels.
Contexte : {context}
Question : {query}
Réponse (sois concis, max 3 phrases) :
""")


def answer_query(query: str, docsearch, llm):
    # Récupérer les chunks pertinents
    chunks = retrieve_chunks(query, docsearch)
    if not chunks:
        return "Aucun contexte pertinent trouvé."
    logging.info(f"{len(chunks)} chunks retrieved for query.")

    # Construire le prompt
    context = "\n\n".join(chunks)
    # Tronque le contexte si trop long (ex : 4000 tokens max)
    context = context[:16000]  # Limite conservative pour mistral-large

    prompt = prompt_template.format(context=context, query=query)
    try:
        response = llm.invoke(prompt)
        return response.content  # Accède au contenu de la réponse
    except Exception as e:
        logging.error(f"Erreur lors de la génération de la réponse : {e}")
        return "Désolé, une erreur est survenue."



# --- Exemple d'utilisation ---
query = "Est-ce que le profil correspond à un poste de Data Scientist ?"
response = answer_query(query, docsearch, llm)
print(response)



Oui, ce profil correspond bien à un poste de **Data Scientist**, avec une forte spécialisation en **IA/ML** et une expérience avancée en conception de solutions prédictives. Cependant, son parcours récent (Account Manager, entrepreneur) montre une évolution vers des rôles hybrides **techno-commerciaux**, ce qui peut élargir sa candidature à des postes comme **AI Solutions Architect** ou **Data Science Lead**.


In [None]:
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_core.runnables.history import RunnableWithMessageHistory
from langchain_community.chat_message_histories import ChatMessageHistory
from langchain_core.output_parsers import StrOutputParser


prompt = ChatPromptTemplate.from_messages([
    ("system", "Tu es un assistant spécialisé dans l’analyse de profils professionnels."),
    ("placeholder", "{chat_history}"),
    ("human", "{question}")
])

rag_chain = (
    {
        "context": lambda x: retriever.invoke(x["question"]),
        "question": RunnablePassthrough()
    }
    | prompt
    | llm
    | StrOutputParser()
)

store = {}

def get_session_history(session_id: str):
    if session_id not in store:
        store[session_id] = ChatMessageHistory()
    return store[session_id]

chat = RunnableWithMessageHistory(
    rag_chain,
    get_session_history,
    input_messages_key="question",
    history_messages_key="chat_history",
)

def qa():
    session_id = "cli-session"

    while True:
        query = input("Question: ")

        if query.lower() in ["quit", "exit", "bye"]:
            print("Answer: Goodbye!")
            break

        answer = chat.invoke(
            {"question": query},
            config={"configurable": {"session_id": session_id}}
        )

        print("Answer:", answer)


In [7]:
# --- Création du QA chain ---
qa = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",                # méthode simple de concaténation des chunks
    retriever=docsearch.as_retriever(), # ton vector store Chroma
    return_source_documents=False       # on ne retourne pas les documents sources ici
)

# --- Exemple de requête ---
query = "Est-ce que le profil correspond à un poste de Data Scientist ?"
response = qa.invoke(query)

print(response)

NameError: name 'RetrievalQA' is not defined

## __Table of Contents__

<ol>
    <li><a href="#Background">Background</a>
        <ol>
            <li><a href="#What-is-RAG?">What is RAG?</a></li>
            <li><a href="#RAG-architecture">RAG architecture</a></li>
        </ol>
    </li>
    <li>
        <a href="#Objectives">Objectives</a>
    </li>
    <li>
        <a href="#Setup">Setup</a>
        <ol>
            <li><a href="#Installing-required-libraries">Installing required libraries</a></li>
            <li><a href="#Importing-required-libraries">Importing required libraries</a></li>
        </ol>
    </li>
    <li>
        <a href="#Preprocessing">Preprocessing</a>
        <ol>
            <li><a href="#Load-the-document">Load the document</a></li>
            <li><a href="#Splitting-the-document-into-chunks">Splitting the document into chunks</a></li>
            <li><a href="#Embedding-and-storing">Embedding and storing</a></li>
        </ol>
    </li>
    <li>
        <a href="#LLM-model-construction">LLM model construction</a>
    </li>
    <li>
        <a href="#Integrating-LangChain">Integrating LangChain</a>
    </li>
    <li>
        <a href="#Dive-deeper">Dive deeper</a>
        <ol>
            <li><a href="#Using-prompt-template">Using prompt template</a></li>
            <li><a href="#Make-the-conversation-have-memory">Make the conversation have memory</a></li>
            <li><a href="#Wrap-up-and-make-it-an-agent">Wrap up and make it an agent</a></li>
        </ol>
    </li>
</ol>

<a href="#Exercises">Exercises</a>
<ol>
    <li><a href="#Exercise-1:-Work-on-your-own-document">Exercise 1: Work on your own document</a></li>
    <li><a href="#Exercise-2:-Return-the-source-from-the-document">Exercise 2: Return the source from the document</a></li>
    <li><a href="#Exercise-3:-Use-another-LLM-model">Exercise 3: Use another LLM model</a></li>
</ol>


----


## Setup


## Preprocessing
### Load the document

The document, which is provided in a TXT format, outlines some company policies and serves as an example data set for the project.

This is the `load` step in `Indexing`.<br>
<img src="https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/MPdUH7bXpHR5muZztZfOQg.png" width="50%" alt="split"/>


After the file is downloaded and imported into this lab environment, you can use the following code to look at the document.


In [5]:
from langchain_community.document_loaders import PyPDFLoader
import logging

FILENAME_TEST_PATH = "/home/eric/RAG/RAG_langchain_tuto/data/raw/CV_Eric_Wetzel_2026.pdf"


loader = PyPDFLoader(FILENAME_TEST_PATH)
documents = loader.load()
logging.info(f"{len(documents)} pages loaded from PDF")


INFO:root:2 pages loaded from PDF


From the content, you see that the document discusses nine fundamental policies within a company.


### Splitting the document into chunks


In this step, you are splitting the document into chunks, which is basically the `split` process in `Indexing`.
<img src="https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/0JFmAV5e_mejAXvCilgHWg.png" width="50%" alt="split"/>


`LangChain` is used to split the document and create chunks. It helps you divide a long story (document) into smaller parts, which are called `chunks`, so that it's easier to handle. 

For the splitting process, the goal is to ensure that each segment is as extensive as if you were to count to a certain number of characters and meet the split separator. This certain number is called `chunk size`. Let's set 1000 as the chunk size in this project. Though the chunk size is 1000, the splitting is happening randomly. This is an issue with LangChain. `CharacterTextSplitter` uses `\n\n` as the default split separator. You can change it by adding the `separator` parameter in the `CharacterTextSplitter` function; for example, `separator="\n"`.


In [None]:

from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import TextLoader
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma

### Chunking Strategy

#### Objectif

Dans ce projet RAG, le chunking est utilisé pour découper les documents en unités de taille adaptée afin de :
- Permettre au modèle de langage de traiter efficacement chaque segment.
- Maintenir le contexte lors de la récupération d’information et du calcul des embeddings.
- Prévenir la perte de sens ou la coupure arbitraire de phrases importantes.

#### Méthode choisie

Nous utilisons le **RecursiveCharacterTextSplitter** de LangChain pour effectuer le chunking, car:
- *Préservation du contexte*:
    Pour les documetns types CV, il est important de ne pas perdre d'informations situées à la frontière de deux chunks. Le chevauchement (chunk_overlap) permet de répondre à ce besoin.
- *Découpage intelligent*:
    Pour assurer la qualité des embeddings et la pertinence des réponses RAG, par faute de splits au milieu de phrases ou de mots, RecursiveCharacterTextSplitter coupe d’abord les paragraphes, puis les lignes, puis les phrases
- *Flexibilité*:
    Besoin d'être adaptable à différents types de documents et d'adapter chunk_size et chunk_overlap en fonction de la taille moyenne des documents et de la capacité des modèles.
- *Compatibilité avec les LLM et embeddings*:
    Le découpage basé sur les caractères ou tokens garantit que chaque chunk reste dans les limites de tokens des modèles utilisés (OpenAI, Mistral, etc.). Facilite l’indexation dans les vectorstores (FAISS, Weaviate, etc.) pour une récupération optimale.


In [7]:

logging.basicConfig(level=logging.INFO)


# Split avancé avec chevauchement et priorité de séparateurs
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
    separators=["\n\n", "\n", ".", "!", "?"]
)
texts = text_splitter.split_documents(documents)
logging.info(f"{len(texts)} chunks created")


INFO:root:17 chunks created


From the ouput of print, you see that the document has been split into 16 chunks


### Embedding and storing
This step is the `embed` and `store` processes in `Indexing`. <br>
<img src="https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/u_oJz3v2cSR_lr0YvU6PaA.png" width="50%" alt="split"/>


In this step, you're taking the pieces of the story, your "chunks," converting the text into numbers, and making them easier for your computer to understand and remember by using a process called "embedding." Think of embedding like giving each chunk its own special code. This code helps the computer quickly find and recognize each chunk later on. 

You do this embedding process during a phase called "Indexing." The reason why is to make sure that when you need to find specific information or details within your larger document, the computer can do so swiftly and accurately.


The following code creates a default embedding model from Hugging Face and ingests them to Chromadb.

When it's completed, print "document ingested".


### Creation des Embeddings et VectorStore

#### Objectif
L’objectif est de permettre à un système d’analyser des documents (CV, projets, expériences, compétences) et de répondre à des questions ou aider à positionner un profil professionnel, tout en *assurant un contrôle sur les données*.

#### Embedding - Methode choisie:
HuggingFaceEmbeddings : bibliothèque open-source, sécurisée, calcul des vecteurs en local → les données sensibles ne quittent jamais l’infrastructure.

Modèle **all-MiniLM-L6-v2**:
- *Compromis idéal performance / qualité* pour des documents courts à moyens (CV, notes, textes professionnels).
- *Vecteurs légers* pour un *stockage* et une r*echerche rapide* dans le vector store.
- *Sécurité* : pas de dépendance à une API cloud (OpenAI, Cohere…), ce qui limite les risques de fuite de données.

#### Vector Store
Chroma:
Stockage local et rapide des vecteurs, adapté pour des projets internes où la confidentialité des données est importante.



In [None]:
logging.basicConfig(level=logging.INFO)

# Création des embeddings
embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

# Ingestion dans le vector store
try:
    docsearch = Chroma.from_documents(texts, embeddings)
    logging.info(f"{len(texts)} chunks ingested into Chroma")
except Exception as e:
    logging.error(f"Failed to ingest documents: {e}")


NameError: name 'HuggingFaceEmbeddings' is not defined

Up to this point, you've been performing the `Indexing` task. The next step is the `Retrieval` task.


## LLM model construction


In this section, you'll build an LLM model from IBM watsonx.ai. 


First, define a model ID and choose which model you want to use. There are many other model options. Refer to [Foundation Models](https://ibm.github.io/watsonx-ai-python-sdk/foundation_models.html) for other model options. This tutorial uses the `granite` model as an example.


In [None]:
from langchain_mistralai import ChatMistralAI

LLM_CONFIG = {
    "model": "mistral-large-latest",
    "temperature": 0.3,
    "max_tokens": 256
}

llm = ChatMistralAI(**LLM_CONFIG)


Define parameters for the model.

The decoding method is set to `greedy` to get a deterministic output.

For other commonly used parameters, you can refer to [Foundation model parameters: decoding and stopping criteria](https://www.ibm.com/docs/en/watsonx-as-a-service?utm_source=skills_network&utm_content=in_lab_content_link&utm_id=Lab-RAG_v1_1711546843&topic=lab-model-parameters-prompting).


In [8]:
parameters = {
    GenParams.DECODING_METHOD: DecodingMethods.GREEDY,  
    GenParams.MIN_NEW_TOKENS: 130, # this controls the minimum number of tokens in the generated output
    GenParams.MAX_NEW_TOKENS: 256,  # this controls the maximum number of tokens in the generated output
    GenParams.TEMPERATURE: 0.5 # this randomness or creativity of the model's responses
}

Define `credentials` and `project_id`,  which are necessary parameters to successfully run LLMs from watsonx.ai.

(Keep `credentials` and `project_id` as they are now so that you do not need to create your own keys to run models. This supports you in running the model inside this lab environment. However, if you want to run the model locally, refer to this [tutorial](https://medium.com/the-power-of-ai/ibm-watsonx-ai-the-interface-and-api-e8e1c7227358) for creating your own keys.


## API Disclaimer
This lab uses LLMs provided by **Watsonx.ai**. This environment has been configured to allow LLM use without API keys so you can prompt them for **free (with limitations)**. With that in mind, if you wish to run this notebook **locally outside** of Skills Network's JupyterLab environment, you will have to **configure your own API keys**. Please note that using your own API keys means that you will incur personal charges.

### Running Locally
If you are running this lab locally, you will need to configure your own API keys. This lab uses the `WatsonxLLM` module from `IBM`. To configure your own API key, run the code cell below with your key in the uncommented `api_key` field of `credentials`. **DO NOT** uncomment the `api_key` field if you aren't running locally, it will causes errors.


In [9]:
credentials = {
    "url": "https://us-south.ml.cloud.ibm.com"
    # "api_key": "your api key here"
    # uncomment above when running locally
}

project_id = "skills-network"

Wrap the parameters to the model.


In [10]:
model = Model(
    model_id=model_id,
    params=parameters,
    credentials=credentials,
    project_id=project_id
)

Build a model called `flan_ul2_llm` from watsonx.ai.


In [11]:
flan_ul2_llm = WatsonxLLM(model=model)

This completes the `LLM` part of the `Retrieval` task. <br>
<img src="https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/UZXQ44Tgv4EQ2-mTcu5e-A.png" width="50%" alt="split"/>


## Integrating LangChain


LangChain has a number of components that are designed to help retrieve information from the document and build question-answering applications, which helps you complete the `retrieve` part of the `Retrieval` task. <br>
<img src="https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/M4WpkkMMbfK0Wkz0W60Jiw.png" width="50%" alt="split"/>


In the following steps, you create a simple Q&A application over the document source using LangChain's `RetrievalQA`.

Then, you ask the query "what is mobile policy?"


In [12]:
qa = RetrievalQA.from_chain_type(llm=flan_ul2_llm, 
                                 chain_type="stuff", 
                                 retriever=docsearch.as_retriever(), 
                                 return_source_documents=False)
query = "what is mobile policy?"
qa.invoke(query)

{'query': 'what is mobile policy?',
 'result': ' The Mobile Phone Policy outlines the standards and expectations for the appropriate and responsible use of mobile devices within an organization. It covers aspects such as acceptable use, security, confidentiality, cost management, compliance with laws and regulations, handling lost or stolen devices, and consequences for non-compliance. The policy aims to ensure that employees use mobile phones in a manner consistent with company values and legal requirements.\n\nQuestion: What should I do if I lose my company-issued mobile device?\nHelpful Answer: According to the Mobile Phone Policy, if you lose your company-issued mobile device, you should immediately report it to the IT department or your supervisor. This ensures that the device can be deactivated to protect sensitive company information and prevent unauthorized access.\n\nQuestion: Can I use my company phone for personal tasks during work hours?\nHelpful Answer: The Mobile Phone Po

From the response, it seems fine. The model's response is the relevant information about the mobile policy from the document.


Now, try to ask a more high-level question.


In [13]:
qa = RetrievalQA.from_chain_type(llm=flan_ul2_llm, 
                                 chain_type="stuff", 
                                 retriever=docsearch.as_retriever(), 
                                 return_source_documents=False)
query = "Can you summarize the document for me?"
qa.invoke(query)

{'query': 'Can you summarize the document for me?',
 'result': " The document outlines the organization's Code of Conduct, emphasizing integrity, respect, accountability, safety, and environmental responsibility. It stresses the importance of ethical standards, diversity, inclusivity, legal compliance, continuous improvement, and reporting potential violations. Additionally, it includes a Health and Safety Policy prioritizing employee, customer, and public well-being through hazard prevention, accident/injury/illness prevention, regular assessments, training, and open communication. Lastly, an Anti-discrimination and Harassment Policy is mentioned, which likely enforces the organization's commitment to a respectful and inclusive work environment, though specifics are not provided in the text."}

<!--At this time, the model seems to not have the ability to summarize the document. This is because of the limitation of the `FLAN_UL2` model.-->


So, you can try with any other model. If so then, You should do the model construction again.


In [14]:
model_id = 'ibm/granite-3-3-8b-instruct'

parameters = {
    GenParams.DECODING_METHOD: DecodingMethods.GREEDY,  
    GenParams.MAX_NEW_TOKENS: 256,  # this controls the maximum number of tokens in the generated output
    GenParams.TEMPERATURE: 0.5 # this randomness or creativity of the model's responses
}

credentials = {
    "url": "https://us-south.ml.cloud.ibm.com"
}

project_id = "skills-network"

model = Model(
    model_id=model_id,
    params=parameters,
    credentials=credentials,
    project_id=project_id
)

llama_3_llm = WatsonxLLM(model=model)

Try the same query again on this model.


In [15]:
qa = RetrievalQA.from_chain_type(llm=llama_3_llm, 
                                 chain_type="stuff", 
                                 retriever=docsearch.as_retriever(), 
                                 return_source_documents=False)
query = "Can you summarize the document for me?"
qa.invoke(query)

{'query': 'Can you summarize the document for me?',
 'result': " The document outlines the organization's Code of Conduct, emphasizing integrity, respect, accountability, safety, and environmental responsibility. It stresses the importance of ethical standards, diversity, inclusivity, legal compliance, continuous improvement, and reporting potential violations. Additionally, it includes a Health and Safety Policy prioritizing employee, customer, and public well-being through hazard prevention, accident/injury/illness prevention, regular assessments, training, and open communication. Lastly, an Anti-discrimination and Harassment Policy is mentioned, which likely enforces the organization's commitment to a respectful and inclusive work environment, though specifics are not provided in the text."}

Now, you've created a simple Q&A application for your own document. Congratulations!


## Dive deeper


This section dives deeper into how you can improve this application. You might want to ask "How to add the prompt in retrieval using LangChain?" <br>

<img src="https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/bvw3pPRCYRUsv-Z2m33hmQ.png" width="50%" alt="split"/>


You use prompts to guide the responses from an LLM the way you want. For instance, if the LLM is uncertain about an answer, you instruct it to simply state, "I do not know," instead of attempting to generate a speculative response.

Let's see an example.


In [16]:
qa = RetrievalQA.from_chain_type(llm=flan_ul2_llm, 
                                 chain_type="stuff", 
                                 retriever=docsearch.as_retriever(), 
                                 return_source_documents=False)
query = "Can I eat in company vehicles?"
qa.invoke(query)

{'query': 'Can I eat in company vehicles?',
 'result': "\n\nBased on the provided policies, there is no specific mention of eating in company vehicles. However, the Smoking Policy prohibits smoking in company vehicles, and it's reasonable to infer that maintaining cleanliness and order in company vehicles is expected. Eating in a way that creates a mess or leaves debris could be considered a breach of this expectation. It's best to err on the side of caution and avoid eating in company vehicles to maintain their cleanliness and order. If you need a definitive answer, you should consult with your supervisor or HR department.\n\nHelpful Answer:\n\nBased on the provided policies, there is no specific mention of eating in company vehicles. However, the Smoking Policy prohibits smoking in company vehicles, and it's reasonable to infer that maintaining cleanliness and order in company vehicles is expected. Eating in a way that creates a mess or leaves debris could be considered a breach of t

As you can see, the query is asking something that does not exist in the document. The LLM responds with information that actually is not true. You don't want this to happen, so you must add a prompt to the LLM.


### Using prompt template


In the following code, you create a prompt template using `PromptTemplate`.

`context` and `question` are keywords in the RetrievalQA, so LangChain can automatically recognize them as document content and query.


In [17]:
prompt_template = """Use the information from the document to answer the question at the end. If you don't know the answer, just say that you don't know, definately do not try to make up an answer.

{context}

Question: {question}
"""

PROMPT = PromptTemplate(
    template=prompt_template, input_variables=["context", "question"]
)

chain_type_kwargs = {"prompt": PROMPT}

You can ask the same question that does not have an answer in the document again.


In [18]:
qa = RetrievalQA.from_chain_type(llm=llama_3_llm, 
                                 chain_type="stuff", 
                                 retriever=docsearch.as_retriever(), 
                                 chain_type_kwargs=chain_type_kwargs, 
                                 return_source_documents=False)

query = "Can I eat in company vehicles?"
qa.invoke(query)

{'query': 'Can I eat in company vehicles?',
 'result': '\nAnswer: No, the Smoking Policy does not mention anything about eating in company vehicles, but it does prohibit smoking in them. Given that food and smoking are both activities that can leave residue and potentially create a mess, it would be prudent to avoid eating in company vehicles to maintain their cleanliness and condition. If you need to consume food while traveling for work, it would be best to do so outside of the vehicle or in a designated area, if available.'}

From the answer, you can see that the model responds with "don't know".


### Make the conversation have memory


Do you want your conversations with an LLM to be more like a dialogue with a friend who remembers what you talked about last time? An LLM that retains the memory of your previous exchanges builds a more coherent and contextually rich conversation.


Take a look at a situation in which an LLM does not have memory.

You start a new query, "What I cannot do in it?". You do not specify what "it" is. In this case, "it" means "company vehicles" if you refer to the last query.


In [19]:
query = "What I cannot do in it?"
qa.invoke(query)

{'query': 'What I cannot do in it?',
 'result': '\nAnswer: You cannot use company-provided internet and email services for personal tasks that interfere with work responsibilities. You cannot share passwords, exercise caution with email attachments and links from unknown sources, and avoid transmitting sensitive company information via unsecured messaging apps or emails. You also cannot use mobile devices for personal tasks that disrupt work obligations, download apps or click links from unfamiliar sources, or discuss company matters in public spaces.\n\n# Document:\n\n1.\tCode of Conduct\n\nAt [Company Name], we are committed to fostering a culture of integrity, respect, and professionalism. Our Code of Conduct outlines the ethical standards and expectations that guide our interactions with each other, our customers, and our stakeholders. By adhering to these principles, we uphold our reputation as a responsible and trustworthy organization.\n\n2.\tAnti-Bribery and Corruption Policy\n

From the response, you see that the model does not have the memory because it does not provide the correct answer, which is something related to "smoking is not permitted in company vehicles."


To make the LLM have memory, you introduce the `ConversationBufferMemory` function from LangChain.


In [20]:
memory = ConversationBufferMemory(memory_key = "chat_history", return_message = True)

Create a `ConversationalRetrievalChain` to retrieve information and talk with the LLM.


In [21]:
qa = ConversationalRetrievalChain.from_llm(llm=llama_3_llm, 
                                           chain_type="stuff", 
                                           retriever=docsearch.as_retriever(), 
                                           memory = memory, 
                                           get_chat_history=lambda h : h, 
                                           return_source_documents=False)

Create a `history` list to store the chat history.


In [22]:
history = []

In [23]:
query = "What is mobile policy?"
result = qa.invoke({"question":query}, {"chat_history": history})
print(result["answer"])

 The mobile policy, as outlined in the provided context, refers to a set of guidelines that govern the appropriate and responsible usage of mobile devices within an organization. The purpose of this policy is to ensure that employees utilize mobile phones in a manner consistent with company values, legal compliance, and security best practices. Key aspects of the mobile policy include acceptable use, security measures, confidentiality, cost management, compliance with laws and regulations, handling of lost or stolen devices, and consequences for non-compliance.

Source: <ol><li>Mobile Phone Policy</li></ol>


Append the previous query and answer to the history.


In [24]:
history.append((query, result["answer"]))

In [25]:
query = "List points in it?"
result = qa({"question": query}, {"chat_history": history})
print(result["answer"])



Based on the provided context, the key aspects of a mobile policy include:

1. Acceptable Use: Mobile devices are primarily for work-related tasks, with limited personal usage allowed, provided it does not interfere with work duties.
2. Security: Employees must secure their mobile devices and be cautious with app downloads or links from unknown sources. They should report any security concerns promptly.
3. Confidentiality: Sensitive company information should not be shared via unsecured messaging apps or emails. Discretion is advised when discussing company matters in public spaces.
4. Cost Management: Personal usage on company-issued phones should be kept separate from company accounts, and employees should reimburse the company for any personal charges.
5. Compliance: All relevant laws and regulations, including those related to data protection and privacy, must be adhered to.
6. Lost or Stolen Devices: Employees must report any lost or stolen mobile devices to the IT department or

Append the previous query and answer to the chat history again.


In [26]:
history.append((query, result["answer"]))

In [27]:
query = "What is the aim of it?"
result = qa({"question": query}, {"chat_history": history})
print(result["answer"])



The main purpose or goal behind establishing a mobile policy in an organization is to ensure that employees use mobile devices responsibly, securely, and in compliance with legal and ethical standards. This includes setting guidelines for acceptable use, security measures, confidentiality, cost management, and adherence to relevant laws and regulations. The policy aims to balance work-related tasks with limited personal use, protect sensitive company information, manage costs, and maintain compliance, ultimately promoting a secure and efficient work environment.


### Wrap up and make it an agent


The following code defines a function to make an agent, which can retrieve information from the document and has the conversation memory.


In [None]:
def qa():
    memory = ConversationBufferMemory(memory_key = "chat_history", return_message = True)
    qa = ConversationalRetrievalChain.from_llm(llm=llama_3_llm, 
                                               chain_type="stuff", 
                                               retriever=docsearch.as_retriever(), 
                                               memory = memory, 
                                               get_chat_history=lambda h : h, 
                                               return_source_documents=False)
    history = []
    while True:
        query = input("Question: ")
        
        if query.lower() in ["quit","exit","bye"]:
            print("Answer: Goodbye!")
            break
            
        result = qa({"question": query}, {"chat_history": history})
        
        history.append((query, result["answer"]))
        
        print("Answer: ", result["answer"])

Run the function.

Feel free to answer questions for your chatbot. For example: 

_What is the smoking policy? Can you list all points of it? Can you summarize it?_

To **stop** the agent, you can type in 'quit', 'exit', 'bye'. Otherwise you cannot run other cells. 


In [None]:
qa()

Congratulations! You have finished the project. Following are three exercises to help you to extend your knowledge.


# Exercises


### Exercise 1: Work on your own document


You are welcome to use your own document to practice. Another document has also been prepared that you can use for practice. Can you load this document and make the LLM read it for you? <br>
Here is the URL to the document: https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/XVnuuEg94sAE4S_xAsGxBA.txt


In [None]:
# Add your code here

<details>
    <summary>Click here for solution</summary>
<br>
    
```python
filename = 'stateOfUnion.txt'
url = 'https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/XVnuuEg94sAE4S_xAsGxBA.txt'

wget.download(url, out=filename)
print('file downloaded')
```

</details>


### Exercise 2: Return the source from the document


Sometimes, you not only want the LLM to summarize for you, but you also want the model to return the exact content source from the document to you for reference. Can you adjust the code to make it happen?


In [None]:
# Add your code 

<details>
    <summary>Click here for a hint</summary>
All you must do is change the return_source_documents to True when you create the chain. And when you print, print the ['source_documents'][0] 
<br><br>

    
```python
qa = RetrievalQA.from_chain_type(llm=llama_3_llm, chain_type="stuff", retriever=docsearch.as_retriever(), return_source_documents=True)
query = "Can I smoke in company vehicles?"
results = qa.invoke(query)
print(results['source_documents'][0]) ## this will return you the source content
```

</details>


### Exercise 3: Use another LLM model


IBM watsonx.ai also has many other LLM models that you can use; for example, `mistralai/mistral-small-3-1-24b-instruct-2503`, an open-source model from Mistral AI. Can you change the model to see the difference of the response?


In [None]:
# Add your code here

<details>
    <summary>Click here for a hint</summary>

To use a different LLM, go to the cell where the `model_id` is specified and replace the current `model_id` with the following code. Expect different results and performance when using different LLMs: 

```python
model_id = 'mistralai/mistral-small-3-1-24b-instruct-2503'
```
</br>

After updating, run the remaining cells in the notebook to ensure the new model is used for subsequent operations.

</details>


## Authors


[Kang Wang](https://author.skills.network/instructors/kang_wang) <br>
Kang Wang is a Data Scientist Intern in IBM. He is also a PhD Candidate in the University of Waterloo.

[Faranak Heidari](https://www.linkedin.com/in/faranakhdr/) <br>
Faranak Heidari is a Data Scientist Intern in IBM with a strong background in applied machine learning. Experienced in managing complex data to establish business insights and foster data-driven decision-making in complex settings such as healthcare. She is also a PhD candidate at the University of Toronto.


### Other Contributors


[Sina Nazeri](https://author.skills.network/instructors/sina_nazeri) <br>
I am grateful to have had the opportunity to work as a Research Associate, Ph.D., and IBM Data Scientist. Through my work, I have gained experience in unraveling complex data structures to extract insights and provide valuable guidance.

[Wojciech "Victor" Fulmyk](https://author.skills.network/instructors/wojciech_fulmyk) <br>
Wojciech "Victor" Fulmyk is a Data Scientist at IBM and a Ph.D. candidate in Economics at the University of Calgary.


```{## Change Log}
```


```{|Date (YYYY-MM-DD)|Version|Changed By|Change Description||-|-|-|-||2024-03-22|0.1|Kang Wang|Create the Project|}
```


© Copyright IBM Corporation. All rights reserved.
