Test des fonctionnalités de MongoDB

In [23]:
from pymongo import MongoClient
# from PyPDF2 import PdfReader
from langchain_openai import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
import os
from dotenv import load_dotenv

# Load environment variables from the .env file
load_dotenv()

In [24]:
# Read the required settings from the process environment via os.environ.
# A missing or empty value stops the notebook here with a message naming it,
# instead of surfacing later as a None handed to MongoClient or used as a
# database / collection name.
openai_api_key = os.environ.get("OPENAI_API_KEY")
mongodb_conn_string = os.environ.get("MONGODB_CONN_STRING")
db_name = os.environ.get("DB_NAME")
collection_name = os.environ.get("COLLECTION_NAME")

_missing_env_vars = [
    env_name
    for env_name, env_value in (
        ("OPENAI_API_KEY", openai_api_key),
        ("MONGODB_CONN_STRING", mongodb_conn_string),
        ("DB_NAME", db_name),
        ("COLLECTION_NAME", collection_name),
    )
    if not env_value
]
if _missing_env_vars:
    raise RuntimeError(
        "Missing required environment variable(s): " + ", ".join(_missing_env_vars)
    )



In [25]:
# Open a MongoDB client with the connection string read from the environment,
# then select the target database and collection by their configured names.
client = MongoClient(mongodb_conn_string)
db = client[db_name]
collection = db[collection_name]

In [26]:
from langchain_community.document_loaders import PyPDFLoader
# Step 1: Load — create one PyPDFLoader per source PDF.
pdf_paths = (
    "Code_de_la_commande_publique.pdf",
    "Vade-mecum_complet.pdf",
)
loaders = [PyPDFLoader(pdf_path) for pdf_path in pdf_paths]

In [27]:
# Step 2: Transform — first gather the documents produced by every loader
# into a single list (they are split into chunks afterwards).
data = []
for loader in loaders:
    data += loader.load()

In [28]:
# Split the loaded documents into chunks targeting at most 1000 characters
# (measured with len), with no overlap between consecutive chunks.
# Separators are tried in order: paragraph break, line break, sentence end,
# single space. The sentence-end separator is a regex look-behind, so it is
# written as a raw string and the splitter is told to treat separators as
# regular expressions; otherwise the pattern is escaped and searched for as
# literal text, which never occurs in the documents.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=0,
    separators=["\n\n", "\n", r"(?<=\. )", " "],
    is_separator_regex=True,
    length_function=len,
)
docs = text_splitter.split_documents(data)
print(f"Split into {len(docs)} docs")

Split into 3494 docs


In [29]:
# Google Drive share links, keyed by PDF file name.
# Each link is the same share URL with only the Drive file id changing.
drive_url_template = "https://drive.google.com/file/d/{file_id}/view?usp=sharing"
drive_file_ids = (
    ("Code_de_la_commande_publique.pdf", "1YnpoJhBJ-RkKxt1o2BXQFgbWSkOrPNSJ"),
    ("Vade-mecum_complet.pdf", "1eIgLDBh8XqHGHrVRITPNqwuaWH_miKZ_"),
)
pdf_links = {
    filename: drive_url_template.format(file_id=file_id)
    for filename, file_id in drive_file_ids
}


In [30]:
# Initialize OpenAI embeddings with the API key already read into
# `openai_api_key`, rather than looking the environment variable up a
# second time.
embeddings = OpenAIEmbeddings(openai_api_key=openai_api_key)

In [31]:
# Attach the Google Drive link to every stored document whose "source" field
# equals the PDF file name.
for pdf_filename, pdf_link in pdf_links.items():
    update_result = collection.update_many(
        {"source": pdf_filename},
        {"$set": {"link": pdf_link}},
    )

    # matched_count is the number of documents selected by the filter, while
    # modified_count only counts those whose stored value actually changed.
    # Re-running this cell therefore gives modified_count == 0 even though
    # nothing went wrong, so the two situations are reported separately.
    if update_result.modified_count > 0:
        print(f"Link added successfully for '{pdf_filename}'.")
    elif update_result.matched_count > 0:
        print(f"Link already up to date for '{pdf_filename}'.")
    else:
        print(f"Failed to add link for '{pdf_filename}'.")

Link added successfully for 'Code_de_la_commande_publique.pdf'.
Link added successfully for 'Vade-mecum_complet.pdf'.
