In [4]:
import os
from langchain_community.embeddings import OllamaEmbeddings
from langchain_community.vectorstores.chroma import Chroma
from langchain_experimental.text_splitter import SemanticChunker
from langchain.schema import Document  # Import the Document class
import pdfplumber
from werkzeug.utils import secure_filename
from datetime import datetime

In [5]:
# Runtime configuration. Every setting can be overridden by an environment
# variable of the same name; the second argument is the fallback value.
CHROMA_PATH = os.environ.get('CHROMA_PATH', 'chroma_embeddings')  # Chroma persistence directory
COLLECTION_NAME = os.environ.get('COLLECTION_NAME', 'local-rag')  # Chroma collection name
TEXT_EMBEDDING_MODEL = os.environ.get('TEXT_EMBEDDING_MODEL', 'nomic-embed-text')  # Ollama embedding model
TEMP_FOLDER = os.environ.get('TEMP_FOLDER', './_temp')  # where save_file() writes uploads

In [6]:
# Shared Ollama embedding client, built once with the model named by
# TEXT_EMBEDDING_MODEL. It is handed to Chroma as the embedding function and to
# SemanticChunker for semantic splitting.
# NOTE(review): the recorded cell output echoes this line, which looks like a
# deprecation warning for this class — confirm against the installed LangChain version.
embedding = OllamaEmbeddings(model=TEXT_EMBEDDING_MODEL, show_progress=True)

  embedding = OllamaEmbeddings(model=TEXT_EMBEDDING_MODEL, show_progress=True)


In [7]:
def get_vector_db():
    """Build the Chroma vector store used by this notebook.

    Returns:
        A ``Chroma`` instance bound to the ``COLLECTION_NAME`` collection,
        using ``CHROMA_PATH`` as its persistence directory and the
        module-level ``embedding`` object as its embedding function.
    """
    return Chroma(
        embedding_function=embedding,
        persist_directory=CHROMA_PATH,
        collection_name=COLLECTION_NAME,
    )

In [8]:
def allowed_file(filename):
    """Return True when ``filename`` ends in a ``pdf`` extension.

    The extension is the text after the last ``.`` and is compared
    case-insensitively. Names that contain no ``.`` are rejected.
    """
    _stem, dot, extension = filename.rpartition('.')
    if not dot:
        # No '.' anywhere in the name, so there is no extension to check.
        return False
    return extension.lower() in {'pdf'}

In [9]:
def save_file(file):
    """Save an uploaded file into ``TEMP_FOLDER`` under a timestamp-prefixed name.

    Args:
        file: Upload object exposing a ``filename`` attribute and a
            ``save(path)`` method (presumably a Werkzeug ``FileStorage`` —
            confirm against the caller).

    Returns:
        The path the file was written to:
        ``TEMP_FOLDER/<timestamp>_<sanitised original name>``.
    """
    # Nothing else in this notebook creates TEMP_FOLDER, so make sure the
    # destination directory exists before writing into it.
    os.makedirs(TEMP_FOLDER, exist_ok=True)

    # The timestamp prefix keeps repeated uploads of the same name apart;
    # secure_filename() sanitises the client-supplied name.
    ts = datetime.now().timestamp()
    filename = str(ts) + "_" + secure_filename(file.filename)
    file_path = os.path.join(TEMP_FOLDER, filename)
    file.save(file_path)

    return file_path

In [10]:
def load_and_split_data(file_path):
    """Extract the text of a PDF and split it into semantic chunks.

    Args:
        file_path: Path of the PDF to read with pdfplumber.

    Returns:
        The list of ``Document`` chunks produced by ``SemanticChunker``
        from the concatenated page text. The input document carries
        ``{"source": <file_path>}`` as metadata so stored chunks can be
        traced back to their PDF.
    """
    with pdfplumber.open(file_path) as pdf:
        # Guard against pages for which extract_text() yields None or an empty
        # string (e.g. pages without a text layer — behaviour varies by
        # pdfplumber version, confirm); `+=` on None would raise TypeError.
        page_texts = [page.extract_text() or "" for page in pdf.pages]

    # Join pages with a newline so the last word of one page is not fused with
    # the first word of the next.
    text = "\n".join(page_texts)

    # Wrap the text as a single Document and record which file it came from.
    documents = [Document(page_content=text, metadata={"source": str(file_path)})]

    # Semantic chunking reuses the module-level embedding client.
    chunker = SemanticChunker(embeddings=embedding)
    return chunker.split_documents(documents)

In [11]:
def embed_multiple_pdfs(file_paths):
    """Chunk, embed and store each PDF in ``file_paths`` in the vector DB.

    Paths that are not existing regular files, or that ``allowed_file``
    rejects, are reported and skipped. So are files that yield no chunks.

    Args:
        file_paths: Iterable of PDF file paths.

    Returns:
        None. Progress and a final summary are printed.
    """
    db = get_vector_db()
    stored = 0
    skipped = 0

    for file_path in file_paths:
        # isfile() rather than exists(): a directory whose name ends in
        # ".pdf" must not reach the PDF loader.
        if not (os.path.isfile(file_path) and allowed_file(file_path)):
            print(f"File '{file_path}' is either invalid or does not exist.")
            skipped += 1
            continue

        # Load and split data from the PDF file
        chunks = load_and_split_data(file_path)
        if not chunks:
            # Nothing to insert; skip rather than call the store with an empty batch.
            print(f"File '{file_path}' produced no text chunks and was skipped.")
            skipped += 1
            continue

        # Add documents to the database
        db.add_documents(chunks)
        stored += 1

    # Persist the changes in the database.
    # NOTE(review): the recorded output echoes the persist() call, which looks
    # like a deprecation warning — only call it while the store still exposes it.
    persist = getattr(db, "persist", None)
    if callable(persist):
        persist()

    # Only claim full success when nothing was skipped.
    if skipped == 0:
        print("All documents have been processed and stored in ChromaDB.")
    else:
        print(f"{stored} document(s) stored in ChromaDB; {skipped} skipped.")

In [12]:
# Folder holding the source filings. Override it with the PDF_DIR environment
# variable so the notebook does not depend on one machine's directory layout;
# the default keeps the original location.
PDF_DIR = os.getenv('PDF_DIR', r"C:\Users\somaa\Desktop\RAG_INternship")

# The 10-K filings to embed, relative to PDF_DIR.
PDF_FILENAMES = (
    "goog-10-k-2023 (1).pdf",
    "tsla-20231231-gen.pdf",
    "uber-10-k-2023.pdf",
)

pdf_paths = [os.path.join(PDF_DIR, name) for name in PDF_FILENAMES]


In [13]:
# Embed and store every PDF listed in pdf_paths. The recorded run shows
# 169 progress-bar steps taking roughly ten minutes, so this cell is slow.
embed_multiple_pdfs(pdf_paths)

  db = Chroma(






OllamaEmbeddings: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████| 169/169 [10:11<00:00,  3.62s/it]

All documents have been processed and stored in ChromaDB.


  db.persist()


In [16]:
def inspect_stored_embeddings(limit=None):
    """Print a short preview of the chunks and embeddings held in the vector DB.

    For each stored entry this prints the first 200 characters of its text
    and the first 5 values of its embedding vector.

    Args:
        limit: Maximum number of stored entries to fetch and print.
            ``None`` (the default) shows every entry, as before.
    """
    db = get_vector_db()

    # Fetch the stored documents with their embedding vectors through the
    # store's public get(), not the private _collection attribute.
    # Metadata is not requested here.
    collection_data = db.get(include=['embeddings', 'documents'], limit=limit)
    embeddings = collection_data['embeddings']
    documents = collection_data['documents']

    # The loop variable is `vector`, not `embedding`, so it does not shadow
    # the module-level embedding client.
    for idx, (vector, document) in enumerate(zip(embeddings, documents), start=1):
        print(f"Document {idx}:")
        print(f"Content: {document[:200]}...")  # Show first 200 characters of the document
        print(f"Embedding (first 5 dimensions): {vector[:5]}...\n")  # First 5 dimensions, for brevity

# Print a preview (first 200 characters of text, first 5 embedding values)
# of the entries currently stored in the vector DB.
inspect_stored_embeddings()


Document 1:
Content: UNITED STATES
SECURITIES AND EXCHANGE COMMISSION
Washington, D.C. 20549
___________________________________________
FORM 10-K
___________________________________________
(Mark One)
☒ ANNUAL REPORT PUR...
Embedding (first 5 dimensions): [ 0.21078122  0.5575794  -2.25715327 -0.26436976  0.51947939]...

Document 2:
Content: Yes ☐ No ☒
Indicate by check mark whether the registrant (1) has filed all reports required to be filed by Section 13 or 15(d) of the Securities
Exchange Act of 1934 during the preceding 12 months (or...
Embedding (first 5 dimensions): [ 0.3889789   0.89617568 -3.19549012  0.04632065  0.74298537]...

Document 3:
Content: ☒
If securities are registered pursuant to Section 12(b) of the Act, indicate by check mark whether the financial statements of the
registrant included in the filing reflect the correction of an error...
Embedding (first 5 dimensions): [-0.02432753  1.30236888 -2.90390491 -1.33956206 -0.26673007]...

Document 4:
Content: Risk Facto