In [19]:
from pinecone import Pinecone, ServerlessSpec
import os
import pinecone
from sentence_transformers import SentenceTransformer
from sentence_transformers import SentenceTransformer
from dotenv import load_dotenv
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from uuid import uuid4 

embedding_model = SentenceTransformer("all-mpnet-base-v2")

load_dotenv()

True

In [20]:
# Initialize Pinecone client
pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))
index_name = 'procurement-chatbot'  # make sure this is correct in your .env

if not index_name:
    raise ValueError("INDEX_NAME is not set in your .env file")
index = pc.Index('procurement-chatbot')

In [21]:
def load_pdfs_from_folder(folder_path="data"):
    if not os.path.exists(folder_path):
        raise FileNotFoundError(f"Folder '{folder_path}' does not exist. Check the path.")

    documents = []
    for filename in os.listdir(folder_path):
        if filename.endswith(".pdf"):
            loader = PyPDFLoader(os.path.join(folder_path, filename))
            pages = loader.load_and_split()
            for i, page in enumerate(pages):
                documents.append({
                    "content": page.page_content,
                    "metadata": {"filename": filename, "page_number": i + 1}
                })
    return documents

In [22]:
def split_documents(documents):
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=512,
        chunk_overlap=100,
        length_function=len,
        is_separator_regex=True,
    )

    chunks = []
    for doc in documents:
        split_texts = text_splitter.split_text(doc["content"])
        for i, chunk_content in enumerate(split_texts):
            chunks.append({
                "content": chunk_content,
                "metadata": {**doc["metadata"], "chunk_id": i}
            })
    return chunks

In [23]:
def get_embedding(text="None"):
    embedding = embedding_model.encode(text).tolist()
    return embedding

def upsert_chunks_to_pinecone(index, chunks, batch_size=100):
    vectors = []
    for i, chunk in enumerate(chunks):
        content = chunk["content"]
        metadata = chunk.get("metadata", {})

        metadata["text"] = content

        embedding = get_embedding(content) # Get embedding after 'content' is part of metadata
        vector_id = str(uuid4())
        vectors.append((vector_id, embedding, metadata))

        if len(vectors) == batch_size or i == len(chunks) - 1:
            index.upsert(vectors=vectors)
            print(f"Upserted batch ending at chunk {i + 1}")
            vectors = []
    print(f"All {len(chunks)} vectors upserted to Pinecone.")

In [24]:
# Replace with actual path
data_path = r"C:\Users\Taj\Documents\Procurement_chatbot\data"

# Load, split, embed, and upsert
documents = load_pdfs_from_folder(data_path)
chunks = split_documents(documents)

print(f"Loaded {len(documents)} pages")
print(f"Generated {len(chunks)} chunks")

upsert_chunks_to_pinecone(index, chunks)

Loaded 238 pages
Generated 860 chunks
Upserted batch ending at chunk 100
Upserted batch ending at chunk 200
Upserted batch ending at chunk 300
Upserted batch ending at chunk 400
Upserted batch ending at chunk 500
Upserted batch ending at chunk 600
Upserted batch ending at chunk 700
Upserted batch ending at chunk 800
Upserted batch ending at chunk 860
All 860 vectors upserted to Pinecone.
