In [1]:
!pip install -q -r requirements.txt


[notice] A new release of pip is available: 23.2.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
# --- Imports ---
from langchain_community.document_loaders import DirectoryLoader, UnstructuredFileLoader
from langchain_text_splitters import CharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_mongodb.vectorstores import MongoDBAtlasVectorSearch
from pymongo import MongoClient
from dotenv import load_dotenv

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
import os
import nltk

nltk.download("punkt_tab")

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [4]:
# --- Load environment variables ---
load_dotenv()
MONGODB_URI = os.getenv("MONGODB_URI")
MONGODB_DB = os.getenv("MONGODB_DB", "my_db")
MONGODB_COLLECTION = os.getenv("MONGODB_COLLECTION", "vector_docs")

In [6]:
# --- Configuration ---
docs_dir_path = "C:/Users/HP/OneDrive/Desktop/Prototype_BrainyDocs/Backend/Docs_dir"


In [7]:
# --- Connect to MongoDB ---
client = MongoClient(MONGODB_URI)
db = client[MONGODB_DB]
collection = db[MONGODB_COLLECTION] 

In [8]:
# --- Embedding model ---
embedding = HuggingFaceEmbeddings()

In [9]:
# --- Load documents from folder ---
loader = DirectoryLoader(
    path=docs_dir_path,
    glob="*.pdf",
    loader_cls=UnstructuredFileLoader
)
documents = loader.load()



In [10]:
# --- Split documents into smaller chunks ---
text_splitter = CharacterTextSplitter(chunk_size=1200, chunk_overlap=250)
text_chunks = text_splitter.split_documents(documents)

Created a chunk of size 2874, which is longer than the specified 1200
Created a chunk of size 2225, which is longer than the specified 1200
Created a chunk of size 5902, which is longer than the specified 1200
Created a chunk of size 1494, which is longer than the specified 1200
Created a chunk of size 3585, which is longer than the specified 1200
Created a chunk of size 3609, which is longer than the specified 1200
Created a chunk of size 6469, which is longer than the specified 1200
Created a chunk of size 3475, which is longer than the specified 1200
Created a chunk of size 3562, which is longer than the specified 1200
Created a chunk of size 2101, which is longer than the specified 1200
Created a chunk of size 1285, which is longer than the specified 1200
Created a chunk of size 1318, which is longer than the specified 1200
Created a chunk of size 1303, which is longer than the specified 1200
Created a chunk of size 1259, which is longer than the specified 1200
Created a chunk of s

In [22]:
# Create vector store in MongoDB
vector_store = MongoDBAtlasVectorSearch.from_documents(
    documents=text_chunks,
    embedding=embedding,
    collection=collection,      
    index_name="vector_index",
)

print("Uploaded embeddings to MongoDB Atlas!")


✅ Uploaded embeddings to MongoDB Atlas!
