In [6]:
import logging
import os
import pathlib
import tempfile
from typing import Any
import pickle

from langchain_community.document_loaders.epub import UnstructuredEPubLoader
from langchain_community.document_loaders.pdf import PyPDFLoader
from langchain_community.document_loaders.text import TextLoader
from langchain_community.document_loaders.word_document import (UnstructuredWordDocumentLoader)
from langchain_core.documents import Document
from streamlit.logger import get_logger

# Root logging configuration: INFO level, UTF-8 encoded output.
logging.basicConfig(level=logging.INFO, encoding="utf-8")
# Module-level logger obtained through Streamlit's logging helper.
LOGGER = get_logger(__name__)

#Custom class loader: la classe EpubReader hérite de UnstructuredEPubLoader et le configure en mode "fast" (mode="elements", strategy="fast") pour l'extraction.

#DocumentLoader class: classe centrale qui gère le chargement de fichiers selon différents formats.

#load_document: fonction utilitaire qui prend un chemin de fichier, détermine son extension, initialise le loader approprié à partir du mapping de DocumentLoader et retourne le contenu sous forme de liste d'objets Document.

In [11]:
class EpubReader(UnstructuredEPubLoader):
    """EPUB loader that always runs with ``mode="elements"`` and ``strategy="fast"``."""

    def __init__(self, file_path: str | list[str], **unstructured_kwargs: Any):
        """Forward ``file_path`` and any extra keyword arguments to the parent loader.

        ``mode`` and ``strategy`` are fixed by this class; supplying either of
        them in ``unstructured_kwargs`` raises ``TypeError`` (duplicate keyword).
        """
        super().__init__(
            file_path,
            mode="elements",
            strategy="fast",
            **unstructured_kwargs,
        )

class DocumentLoaderException(Exception):
    """Raised when a file's extension has no registered document loader."""

class DocumentLoader:
    """Registry of the file extensions that can be loaded and their loader classes."""

    # Maps a file suffix (leading dot included, lowercase) to the loader class
    # that is instantiated with the file path.
    supported_extensions = {
        # Portable documents and plain text
        ".pdf": PyPDFLoader,
        ".txt": TextLoader,
        # E-books, via the preconfigured EPUB loader
        ".epub": EpubReader,
        # Word documents (modern and legacy formats share one loader)
        ".docx": UnstructuredWordDocumentLoader,
        ".doc": UnstructuredWordDocumentLoader,
    }

def load_document(temp_filepath: str) -> list[Document]:
    """Load a file and return its content as a list of documents.

    Args:
        temp_filepath: Path of the file to load; its suffix selects the loader.

    Returns:
        The documents produced by the selected loader's ``load()`` method.

    Raises:
        DocumentLoaderException: If the suffix is not a key of
            ``DocumentLoader.supported_extensions``.
    """
    ext = pathlib.Path(temp_filepath).suffix
    # Registry keys are lowercase, so normalise the suffix to accept e.g. ".PDF".
    loader_cls = DocumentLoader.supported_extensions.get(ext.lower())
    if loader_cls is None:
        raise DocumentLoaderException(
            f"Invalid extension type {ext}, cannot load this type of file"
        )
    # The loaded documents must be returned: callers extend a list with them.
    return loader_cls(temp_filepath).load()

def load_documents(temp_filepath: str) -> list[Document]:
    """Load a file, log how many documents were produced, and return them.

    Args:
        temp_filepath: Path of the file to load; its suffix selects the loader.

    Returns:
        The documents produced by the selected loader's ``load()`` method.

    Raises:
        DocumentLoaderException: If the suffix is not a key of
            ``DocumentLoader.supported_extensions``.
    """
    ext = pathlib.Path(temp_filepath).suffix
    # Registry keys are lowercase, so normalise the suffix to accept e.g. ".PDF".
    loader_cls = DocumentLoader.supported_extensions.get(ext.lower())
    if loader_cls is None:
        raise DocumentLoaderException(
            f"Invalid extension type {ext}, cannot load this type of file"
        )
    docs = loader_cls(temp_filepath).load()
    # Log a summary only: dumping every document's full text floods the log.
    logging.info("Loaded %d document(s) from %s", len(docs), temp_filepath)
    return docs

#LLM.py

In [None]:
#LLM.py
from langchain.embeddings import CacheBackedEmbeddings
from langchain.storage import LocalFileStore
from langchain_groq import ChatGroq
from langchain_openai import OpenAIEmbeddings

from config import set_environment

# langchain_openai exports `OpenAIEmbeddings` (capital "AI"); importing the
# misspelled `OpenAiEmbeddings` raised ImportError and aborted this cell before
# set_environment() could run. The alias keeps code using the old spelling working.
OpenAiEmbeddings = OpenAIEmbeddings

# Defined in the local `config` module (not shown here); presumably exports the
# API keys needed by the model clients -- TODO confirm.
set_environment()

ImportError: cannot import name 'CacheBackedEmbeddings' from 'langchain_core.embeddings' (/home/eric/.cache/pypoetry/virtualenvs/rag-langchain-tuto-5V8OYf-9-py3.12/lib/python3.12/site-packages/langchain_core/embeddings/__init__.py)

In [None]:
# Chat model used by the graph nodes for answering, reviewing and revising.
chat_model = ChatGroq(
    model="deepseek-r1-distill-llama-70b",
    # Sampling
    temperature=0,
    # No explicit cap on output length or request duration
    max_tokens=None,
    timeout=None,
    # Retry budget for failed requests
    max_retries=2,
)

In [21]:
# Imported here under its real name: langchain_openai exports `OpenAIEmbeddings`
# (capital "AI"); `OpenAiEmbeddings` does not exist in that package.
from langchain_openai import OpenAIEmbeddings

# On-disk byte store that holds the embedding vectors already computed.
store = LocalFileStore("./cache/")
underlying_embeddings = OpenAIEmbeddings(
    model="text-embedding-3-large",
)

# Avoid unnecessary costs by caching the embeddings: repeated texts are served
# from the store, which lowers API cost and speeds up repeated requests.
# The cache namespace is the model name, keeping vectors of different models apart.
EMBEDDINGS = CacheBackedEmbeddings.from_bytes_store(
    underlying_embeddings, store, namespace=underlying_embeddings.model
)



NameError: name 'LocalFileStore' is not defined

#Document Retrieval

In [19]:
import os
import tempfile
from typing import List, Any

from langchain_core.callbacks import CallbackManagerForRetrieverRun
from langchain_core.documents import Document
from langchain_core.retrievers import BaseRetriever
from langchain_core.vectorstores import InMemoryVectorStore
from langchain_text_splitters import RecursiveCharacterTextSplitter

#from chapter4.document_loader import load_document
#from chapter4.llms import EMBEDDINGS


In [20]:
# Chunked documents are kept in an in-memory vector store that embeds through
# the cached embeddings, which allows fast similarity checks.
VECTOR_STORE = InMemoryVectorStore(
    embedding=EMBEDDINGS,
)


NameError: name 'EMBEDDINGS' is not defined

In [17]:
def split_documents(
    docs: List[Document], chunk_size: int = 1500, chunk_overlap: int = 200
) -> list[Document]:
    """Split each document into overlapping chunks.

    Args:
        docs: Documents to split.
        chunk_size: Maximum size of a chunk, as measured by the splitter.
        chunk_overlap: Amount of content shared by two consecutive chunks.

    Returns:
        The chunks produced by ``RecursiveCharacterTextSplitter.split_documents``.
    """
    # The keyword names are `chunk_size` / `chunk_overlap`; the former
    # `chunck_*` spelling was rejected by the splitter's constructor.
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap
    )
    return text_splitter.split_documents(docs)

NameError: name 'List' is not defined

In [16]:
# Custom retriever: extends the base retriever and manages its own document list.
class DocumentRetriever(BaseRetriever):
    """Retriever returning the ``k`` stored chunks most similar to the user query.

    Chunks are indexed in the module-level ``VECTOR_STORE``, which is shared by
    every instance; ``documents`` only tracks what this instance has added.
    """

    # Source documents registered through this instance.
    documents: List[Document] = []
    # Number of chunks returned per query.
    k: int = 5

    def model_post_init(self, ctx: Any) -> None:
        """Index the documents supplied at construction time."""
        self.store_documents(self.documents)

    @staticmethod
    def store_documents(docs: List[Document]) -> None:
        """Split ``docs`` into chunks and add them to the vector store."""
        if not docs:
            # Nothing to index: skip the split and embedding round-trip.
            return
        splits = split_documents(docs)
        VECTOR_STORE.add_documents(splits)

    def add_uploaded_docs(self, uploaded_files):
        """Load uploaded files and index their content.

        Args:
            uploaded_files: Iterable of objects exposing a ``name`` attribute and
                a ``getvalue()`` method returning the file's bytes (presumably
                Streamlit uploaded files -- TODO confirm).
        """
        docs = []
        # The context manager removes the temporary directory once every file
        # has been loaded, instead of leaving the cleanup to garbage collection.
        with tempfile.TemporaryDirectory() as temp_dir:
            for file in uploaded_files:
                # Keep only the base name so an upload cannot escape temp_dir.
                temp_filepath = os.path.join(temp_dir, os.path.basename(file.name))
                with open(temp_filepath, "wb") as f:
                    f.write(file.getvalue())
                # Load only after the file is closed, so all bytes are on disk.
                docs.extend(load_document(temp_filepath))
        self.documents.extend(docs)
        self.store_documents(docs)

    def _get_relevant_documents(self, query, *, run_manager: CallbackManagerForRetrieverRun) -> List[Document]:
        """Sync implementation for the retriever: similarity search on the query."""
        if len(self.documents) == 0:
            return []
        return VECTOR_STORE.similarity_search(query=query, k=self.k)
    


NameError: name 'BaseRetriever' is not defined

#Designing the state graph

In [None]:
from typing import Annotated
from langchain_core.documents import Document
from langchain_core.messages import AIMessage
from langchain_core.prompts import ChatPromptTemplate
from langgraph.checkpoint.memory import MemorySaver
from langgraph.constants import END
from langgraph.graph import START, StateGraph, add_messages
from typing_extensions import List, TypedDict

#from chapter4.llms import chat_model
#from chapter4.retriever import DocumentRetriever

In [None]:
# System prompt (written in French) passed as the "system" message when the
# chat prompt template is built. It casts the model as an assistant that analyses
# professional profiles and forbids inventing skills or experience.
system_prompt = (
    """Tu es un assistant intelligent spécialisé dans l’analyse de profils professionnels.
    Ton rôle est d’aider un utilisateur à comprendre si son expérience, ses compétences et ses attentes correspondent à ses objectifs de carrière.
    Tu dois :
    - Lire et analyser toutes les informations fournies sur le profil de l’utilisateur (CV, projets, compétences, expériences, objectifs).
    - Évaluer de manière objective la correspondance entre les compétences et les aspirations professionnelles de l’utilisateur et le type de poste ou de mission qu’il recherche.
    - Fournir des recommandations claires pour positionner le profil de manière pertinente : points forts à mettre en avant, compétences à valoriser, éventuels écarts à combler.
    - Être concis, structuré et orienté action, comme un conseiller de confiance pour un recrutement ou une évolution de carrière.
    - Ne jamais inventer de compétences ou d’expériences qui n’existent pas dans les données fournies.""")

In [None]:
# Retriever instance queried by the retrieval step of the graph.
retriever = DocumentRetriever()

# Chat prompt: the system instructions followed by the user turn.
# - The comma after the "system" tuple is required; without it Python tried to
#   call the first tuple with the second one's contents and raised TypeError.
# - `{context}` is part of the template so the retrieved document text passed
#   alongside `question` actually reaches the model instead of being ignored.
prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_prompt),
        ("human", "Contexte :\n{context}\n\nQuestion : {question}"),
    ]
)

In [None]:
class State(TypedDict):
    """State shared by the graph nodes; each node returns a partial update."""

    # Declared but not written by any node shown in this notebook.
    question: str
    # Documents returned by the retrieval step.
    context: List[Document]
    # Draft answer produced by the generation step.
    answer: str
    # Text of the review feedback ("" when the review found nothing).
    issues_report: str
    # Whether the review flagged issues; must be declared here so the node's
    # update for this key is kept in the state.
    issues_detected: bool
    # Conversation history, merged with the add_messages reducer.
    messages: Annotated[list, add_messages]

In [None]:
def retrieve(state: State):
    """Query the retriever with the latest message and return the hits as ``context``."""
    latest_message = state["messages"][-1]
    context_docs = retriever.invoke(latest_message.content)
    print(context_docs)
    return dict(context=context_docs)

def generate(state: State):
    """Build the prompt from the retrieved context and the latest message, then answer.

    Returns a partial state update holding the model's reply under ``answer``.
    """
    # Concatenate the text of every retrieved document, blank-line separated.
    page_texts = [doc.page_content for doc in state["context"]]
    docs_content = "\n\n".join(page_texts)

    prompt_inputs = {
        "question": state["messages"][-1].content,
        "context": docs_content,
    }
    reply = chat_model.invoke(prompt.invoke(prompt_inputs))
    print(reply.content)
    return dict(answer=reply.content)

def double_check(state: State):
    """Ask the chat model to review the generated answer and record the verdict.

    Returns:
        A partial state update with:
        - ``issues_report``: the text following "ISSUES FOUND" in the review,
          or "" when the marker is absent;
        - ``issues_detected``: True when the marker is present, else False.
    """
    result = chat_model.invoke(
        [{
            "role": "user",
            "content": (
                # Trailing space keeps the two sentences from running together.
                "Review the following project documentation for compliance with our corporate standards. "
                f"Return 'ISSUES FOUND' followed by any detected issues, or 'NO ISSUES': {state['answer']}"
            ),
        }]
    )
    # The model returns a message object; the review text lives in `.content`.
    review_text = result.content
    # NOTE(review): a reply such as "NO ISSUES FOUND" also contains the marker
    # below and would be treated as a finding -- confirm the model's wording.
    if "ISSUES FOUND" in review_text:
        print("issues detected")
        return {
            "issues_report": review_text.split("ISSUES FOUND", 1)[1].strip(),
            "issues_detected": True,
        }
    print("no issues detected")
    return {
        "issues_report": "",
        "issues_detected": False,
    }



In [None]:
def doc_finalizer(state: State):
    """Finalize the answer, revising it first when the review flagged issues.

    Returns:
        A partial state update appending one AI message to ``messages``: the
        revised document when ``issues_detected`` is truthy, otherwise the
        original ``answer`` unchanged.
    """
    # .get() tolerates a state where the review flag was never written.
    if state.get("issues_detected"):
        # The message list is the positional input of invoke(); it does not
        # accept a `messages=` keyword.
        response = chat_model.invoke(
            [{
                "role": "user",
                "content": (
                    f"Revise the following documentation to address these feedback points: {state['issues_report']}\n"
                    f"Original Document: {state['answer']}\n"
                    f"Always return the full revised document, even if no changes are needed."
                )
            }]
        )
        return {
            "messages": [AIMessage(response.content)]
        }
    return {
        "messages": [AIMessage(state["answer"])]
    }

In [None]:
# Linear pipeline: retrieve -> generate -> double_check -> doc_finalizer.
# The first step must be the `retrieve` node function (which reads the state and
# returns a state update), not the `retriever` object it queries internally.
graph_builder = StateGraph(State).add_sequence(
    [retrieve, generate, double_check, doc_finalizer]
)
# Nodes added by add_sequence are named after their functions, so the entry
# edge targets "retrieve".
graph_builder.add_edge(START, "retrieve")
graph_builder.add_edge("doc_finalizer", END)

# In-memory checkpointer; runs are keyed by the thread_id given in `config`.
memory = MemorySaver()
graph = graph_builder.compile(checkpointer=memory)
config = {"configurable": {"thread_id": "abc123"}}

In [None]:
from IPython.display import Image, display

# Render the compiled graph as a Mermaid PNG and show it inline.
graph_png = graph.get_graph().draw_mermaid_png()
display(Image(graph_png))

TEST

In [None]:
from langchain_core.messages import HumanMessage

# Smoke test: send a single human message through the graph and print the
# content of the last message in the resulting state.
input_messages = [HumanMessage("What's the square root of 10?")]
response = graph.invoke(dict(messages=input_messages), config=config)
final_message = response["messages"][-1]
print(final_message.content)