In [1]:
!pip install pymupdf


Defaulting to user installation because normal site-packages is not writeable
Collecting pymupdf
  Obtaining dependency information for pymupdf from https://files.pythonhosted.org/packages/71/c2/a9059607f80dcaf2392f991748cfc53456820392c0220cff02572653512a/pymupdf-1.25.5-cp39-abi3-win_amd64.whl.metadata
  Downloading pymupdf-1.25.5-cp39-abi3-win_amd64.whl.metadata (3.4 kB)
Downloading pymupdf-1.25.5-cp39-abi3-win_amd64.whl (16.6 MB)
   ---------------------------------------- 0.0/16.6 MB ? eta -:--:--
   ---------------------------------------- 0.0/16.6 MB ? eta -:--:--
   ---------------------------------------- 0.2/16.6 MB 2.3 MB/s eta 0:00:08
    --------------------------------------- 0.4/16.6 MB 3.5 MB/s eta 0:00:05
   - -------------------------------------- 0.7/16.6 MB 4.7 MB/s eta 0:00:04
   -- ------------------------------------- 1.2/16.6 MB 6.0 MB/s eta 0:00:03
   ---- ----------------------------------- 1.7/16.6 MB 6.8 MB/s eta 0:00:03
   ---- -------------------------------



In [2]:
!pip install langdetect


Defaulting to user installation because normal site-packages is not writeable
Collecting langdetect
  Downloading langdetect-1.0.9.tar.gz (981 kB)
     ---------------------------------------- 0.0/981.5 kB ? eta -:--:--
     - ----------------------------------- 30.7/981.5 kB 660.6 kB/s eta 0:00:02
     ---- --------------------------------- 122.9/981.5 kB 1.4 MB/s eta 0:00:01
     ------- ------------------------------ 204.8/981.5 kB 1.8 MB/s eta 0:00:01
     ---------- --------------------------- 276.5/981.5 kB 2.1 MB/s eta 0:00:01
     -------------- ----------------------- 368.6/981.5 kB 1.8 MB/s eta 0:00:01
     ------------------- ------------------ 501.8/981.5 kB 1.9 MB/s eta 0:00:01
     ------------------------ ------------- 624.6/981.5 kB 2.0 MB/s eta 0:00:01
     --------------------------- ---------- 706.6/981.5 kB 1.9 MB/s eta 0:00:01
     --------------------------------- ---- 860.2/981.5 kB 2.1 MB/s eta 0:00:01
     -------------------------------------  972.8/981.5 kB 2

In [3]:
!pip install openai  python-dotenv langchain langchain-openai chromadb


Defaulting to user installation because normal site-packages is not writeable
Collecting langchain-openai
  Obtaining dependency information for langchain-openai from https://files.pythonhosted.org/packages/07/7e/0d8838972ffead497b40cd42a1676f9ad90427d422c92dff2fb5461c4308/langchain_openai-0.3.12-py3-none-any.whl.metadata
  Downloading langchain_openai-0.3.12-py3-none-any.whl.metadata (2.3 kB)
Collecting langchain-core<1.0.0,>=0.3.45 (from langchain)
  Obtaining dependency information for langchain-core<1.0.0,>=0.3.45 from https://files.pythonhosted.org/packages/01/78/39209de2ccc45a18e4bfa644a9846ec72831b464172b27dee156a622b599/langchain_core-0.3.50-py3-none-any.whl.metadata
  Downloading langchain_core-0.3.50-py3-none-any.whl.metadata (5.9 kB)
Downloading langchain_openai-0.3.12-py3-none-any.whl (61 kB)
   ---------------------------------------- 0.0/61.3 kB ? eta -:--:--
   --------------------------------- ------ 51.2/61.3 kB 1.3 MB/s eta 0:00:01
   ---------------------------------

In [1]:
import os
import json
import openai
import fitz  # PyMuPDF pour extraire le texte des PDFs
from langdetect import detect  # Pour détecter la langue du texte
from dotenv import load_dotenv

# Charger les variables d'environnement
load_dotenv()
openai.api_key = os.getenv("OPENAI_API_KEY")
if not openai.api_key:
    raise ValueError("La clé API OpenAI n'a pas été trouvée. Vérifiez votre fichier .env.")

# Imports LangChain
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_community.vectorstores import Chroma
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document

# 1. Fonction pour charger et traiter les PDFs
def charger_donnees_pdf(dossier_pdf):
    """
    Parcourt tous les fichiers PDF du dossier_pdf,
    extrait le texte et crée une liste de Documents LangChain.
    """
    documents = []
    for nom_fichier in os.listdir(dossier_pdf):
        if nom_fichier.endswith(".pdf"):
            chemin_fichier = os.path.join(dossier_pdf, nom_fichier)
            try:
                with fitz.open(chemin_fichier) as pdf:
                    texte = "".join(page.get_text("text") for page in pdf)
                    documents.append(
                        Document(
                            page_content=texte,
                            metadata={"source": nom_fichier, "langue": "en"}  # Tous les documents sont en anglais
                        )
                    )
            except Exception as e:
                print(f"❌ Erreur lors du traitement de {nom_fichier} : {e}")
    return documents

# 2. Fonction pour indexer les documents dans Chroma
def preparer_et_indexer_documents(documents, chemin_chroma):
    splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)
    docs_split = splitter.split_documents(documents)
    embeddings = OpenAIEmbeddings(openai_api_key=openai.api_key)
    vecteur_store = Chroma(
        collection_name="preferences_services_eco_chatbot",
        embedding_function=embeddings,
        persist_directory=chemin_chroma
    )
    vecteur_store.add_documents(docs_split)
    vecteur_store.persist()
    return vecteur_store

# 3. Construire la chaîne RAG
def construire_chatbot(vectorstore, temperature=0.3):
    llm = ChatOpenAI(model_name="gpt-3.5-turbo", openai_api_key=openai.api_key, temperature=temperature)
    memoire = ConversationBufferMemory(memory_key="chat_history", return_messages=True)
    return ConversationalRetrievalChain.from_llm(
        llm=llm,
        retriever=vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": 4}),
        memory=memoire,
        verbose=True
    )

# 4. Mode interactif
def mode_terminal():
    print("\n🔍 Chargement et indexation des documents...")
    docs = charger_donnees_pdf("./load_documents_pdf")
    vecteurs = preparer_et_indexer_documents(docs, "./embeddings_pdf2")
    chatbot = construire_chatbot(vecteurs)
    print("\n🤖 Chatbot prêt ! Posez vos questions (ou tapez 'exit'):\n")
    
    while True:
        question = input("> ")
        if question.lower() in ["exit", "quit"]:
            break
        langue_utilisateur = detect(question)
        reponse = chatbot.invoke({"question": question})
        texte_reponse = reponse["answer"]
        
        # Assurer que la réponse est dans la même langue que la question
        if langue_utilisateur == "fr" and detect(texte_reponse) == "en":
            traduction = ChatOpenAI(model_name="gpt-4-turbo", openai_api_key=openai.api_key, temperature=0.3)
            texte_reponse = traduction.invoke(f"Traduisez en français: {texte_reponse}")
        elif langue_utilisateur == "en" and detect(texte_reponse) == "fr":
            traduction = ChatOpenAI(model_name="gpt-4-turbo", openai_api_key=openai.api_key, temperature=0.3)
            texte_reponse = traduction.invoke(f"Translate to English: {texte_reponse}")
        
        print("\n🧠 Réponse :", texte_reponse, "\n")

if __name__ == "__main__":
    mode_terminal()



🔍 Chargement et indexation des documents...


  vecteur_store = Chroma(
  vecteur_store.persist()
  memoire = ConversationBufferMemory(memory_key="chat_history", return_messages=True)



🤖 Chatbot prêt ! Posez vos questions (ou tapez 'exit'):



LangDetectException: No features in text.