In [30]:
from dotenv import load_dotenv
import os
import time
import numpy as np
import pandas as pd
from typing import List, Dict, Any
import base64
load_dotenv()

True

In [2]:
import os
os.environ['HF_TOKEN']=os.getenv("HF_TOKEN")

In [None]:
# Install Required Libraries
# !pip install langchain langchain-community langchain-google-genai
# !pip install pymongo motor
# !pip install sentence-transformers
# !pip install pypdf2 pdfplumber PyMuPDF
# !pip install faiss-cpu
# !pip install rank-bm25
# !pip install python-docx
# !pip install google-generativeai
# !pip install clip-by-openai torch torchvision
# !pip install transformers
#!pip install langchain-experimental
#!pip install clip

In [4]:
from langchain.document_loaders import PyPDFLoader
from langchain_experimental.text_splitter import SemanticChunker
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS, MongoDBAtlasVectorSearch
from langchain.schema import Document
from langchain.prompts import PromptTemplate
from langchain_google_genai import ChatGoogleGenerativeAI, GoogleGenerativeAIEmbeddings

In [5]:
# Additional imports
import fitz  # PyMuPDF
import google.generativeai as genai
import faiss
from rank_bm25 import BM25Okapi
from docx import Document as DocxDocument
import pymongo
import clip
import torch
from PIL import Image

  from .autonotebook import tqdm as notebook_tqdm


In [25]:
# Cell 3: Configuration
GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")

PDF_PATH = "llama2.pdf"

os.environ["GOOGLE_API_KEY"] = GOOGLE_API_KEY
genai.configure(api_key=GOOGLE_API_KEY)

In [7]:
# import pymongo
# import sys

# # Replace the placeholder data with your Atlas connection string. Be sure it includes
# # a valid username and password! Note that in a production environment,
# # you should not store your password in plain-text here.

# try:
#   client = pymongo.MongoClient(MONGODB_URI)
  
# # return a friendly error if a URI error is thrown 
# except pymongo.errors.ConfigurationError:
#   print("An Invalid URI host error was received. Is your Atlas host name correct in your connection string?")
#   sys.exit(1)

In [8]:
# Extract PDF Data
def extract_pdf_content(pdf_path: str) -> List[Document]:
    documents = []
    pdf_doc = fitz.open(pdf_path)
    
    print(f"Processing {len(pdf_doc)} pages...")
    
    for page_num in range(len(pdf_doc)):
        page = pdf_doc[page_num]
        
        # Extract text
        text = page.get_text()
        if text.strip():
            doc = Document(
                page_content=text,
                metadata={'page': page_num + 1, 'type': 'text'}
            )
            documents.append(doc)
        
        # Extract images as base64
        images = page.get_images()
        for img_idx, img in enumerate(images):
            try:
                xref = img[0]
                pix = fitz.Pixmap(pdf_doc, xref)
                if pix.n - pix.alpha < 4:
                    img_data = pix.tobytes("png")
                    img_b64 = base64.b64encode(img_data).decode()
                    
                    doc = Document(
                        page_content=f"Image on page {page_num + 1}",
                        metadata={
                            'page': page_num + 1, 
                            'type': 'image',
                            'image_data': img_b64
                        }
                    )
                    documents.append(doc)
                pix = None
            except:
                continue
        
        # Extract tables
        tables = page.find_tables()
        for table_idx, table in enumerate(tables):
            try:
                table_data = table.extract()
                table_text = '\n'.join([' | '.join([str(cell) for cell in row if cell]) for row in table_data if row])
                
                doc = Document(
                    page_content=table_text,
                    metadata={'page': page_num + 1, 'type': 'table'}
                )
                documents.append(doc)
            except:
                continue
    
    pdf_doc.close()
    return documents

# Extract documents
documents = extract_pdf_content(PDF_PATH)
print(f"Extracted {len(documents)} documents")

Processing 77 pages...
Extracted 143 documents


In [9]:
# Initialize embeddings for semantic chunking
base_embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# Use SemanticChunker for better semantic chunking
semantic_splitter = SemanticChunker(
    embeddings=base_embeddings,
    breakpoint_threshold_type="percentile",
    breakpoint_threshold_amount=95
)

  base_embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")


In [10]:
# Apply semantic chunking
chunked_docs = []
for doc in documents:
    if doc.metadata['type'] == 'text':
        # Use semantic chunking for text
        chunks = semantic_splitter.split_text(doc.page_content)
        for i, chunk in enumerate(chunks):
            chunked_doc = Document(
                page_content=chunk,
                metadata={**doc.metadata, 'chunk_id': i}
            )
            chunked_docs.append(chunked_doc)
    else:
        # Keep images and tables as is
        chunked_docs.append(doc)

print(f"Created {len(chunked_docs)} chunks using SemanticChunker")

Created 327 chunks using SemanticChunker


In [11]:
#!pip install git+https://github.com/openai/CLIP.git
import clip
import torch



In [12]:
#Create Multimodal Embeddings
# Load CLIP model for multimodal embeddings
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")
clip_model, clip_preprocess = clip.load("ViT-B/32", device=device)

# Text embeddings
text_embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

Using device: cpu


In [13]:
def create_multimodal_embeddings(docs):
    embeddings_list = []
    
    for doc in docs:
        if doc.metadata['type'] == 'text':
            # Text embedding
            embedding = text_embeddings.embed_query(doc.page_content)
            embeddings_list.append(embedding)
            
        elif doc.metadata['type'] == 'image':
            # Image embedding using CLIP
            try:
                img_data = base64.b64decode(doc.metadata['image_data'])
                image = Image.open(io.BytesIO(img_data))
                image_input = clip_preprocess(image).unsqueeze(0).to(device)
                
                with torch.no_grad():
                    image_features = clip_model.encode_image(image_input)
                    # Normalize and convert to list
                    embedding = image_features.cpu().numpy().flatten().tolist()
                    # Pad/truncate to match text embedding dimension (384)
                    if len(embedding) > 384:
                        embedding = embedding[:384]
                    else:
                        embedding.extend([0.0] * (384 - len(embedding)))
                    
                embeddings_list.append(embedding)
            except:
                # Fallback to text embedding of description
                embedding = text_embeddings.embed_query(doc.page_content)
                embeddings_list.append(embedding)
                
        elif doc.metadata['type'] == 'table':
            # Table embedding (treat as text)
            embedding = text_embeddings.embed_query(doc.page_content)
            embeddings_list.append(embedding)
    
    return embeddings_list

# Create multimodal embeddings
print("Creating multimodal embeddings...")
start_time = time.time()
embedded_chunks = create_multimodal_embeddings(chunked_docs)
embedding_time = time.time() - start_time
print(f"Multimodal embeddings created in {embedding_time:.2f} seconds")

Creating multimodal embeddings...
Multimodal embeddings created in 9.58 seconds


In [None]:
from urllib.parse import quote_plus
#from langchain_community.vectorstores import MongoDBAtlasVectorSearch

from pymongo import MongoClient

# Encode credentials
username = quote_plus(os.getenv("MONGODB_USERNAME"))
password = quote_plus(os.getenv("MONGODB_PASSWORD"))

# Replace with your actual cluster host (e.g., cluster0.abcde.mongodb.net)
MONGODB_URI = (
    f"mongodb+srv://{username}:{password}@cluster0.gk1jj9u.mongodb.net/?retryWrites=true&w=majority&appName=Cluster0"
)


client = MongoClient(MONGODB_URI)
db = client["rag_database_new"]
collection = db["document_embeddings_manual"]

# Just check connection
print("✅ MongoDB connection successful!")



# # MongoDB Atlas Vector Search setup
# vector_store = MongoDBAtlasVectorSearch.from_documents(
#     documents=chunked_docs,
#     embedding=text_embeddings,
#     connection_string=MONGODB_URI,
#     database_name="rag_database",
#     collection_name="document_embeddings",
#     index_name="vector_index"
# )

# print("MongoDB Atlas Vector Store created!")



# Store documents with multimodal embeddings
print("Storing multimodal embeddings in MongoDB...")
for i, (doc, embedding) in enumerate(zip(chunked_docs, embedded_chunks)):
    document = {
        "_id": f"doc_{i}",
        "content": doc.page_content,
        "metadata": doc.metadata,
        "embedding": embedding,
        "content_type": doc.metadata['type']
    }
    collection.replace_one({"_id": f"doc_{i}"}, document, upsert=True)

# Create MongoDB Atlas Vector Search Index (run this in MongoDB Atlas UI)
index_definition = {
    "fields": [{
        "type": "vector",
        "path": "embedding",
        "numDimensions": 384,
        "similarity": "cosine"
    }]
}
print("MongoDB setup complete!")




✅ MongoDB connection successful!
Storing multimodal embeddings in MongoDB...
MongoDB setup complete!


In [15]:
# Create Multiple Index Types
# Flat Index (FAISS)
print("Creating Flat index...")
flat_index = faiss.IndexFlatIP(384)  # Inner product for cosine similarity
flat_index.add(np.array(embedded_chunks).astype('float32'))

# HNSW Index
print("Creating HNSW index...")
hnsw_index = faiss.IndexHNSWFlat(384, 32)
hnsw_index.add(np.array(embedded_chunks).astype('float32'))

# IVF Index
print("Creating IVF index...")
quantizer = faiss.IndexFlatIP(384)
ivf_index = faiss.IndexIVFFlat(quantizer, 384, min(100, len(embedded_chunks)//10))
ivf_index.train(np.array(embedded_chunks).astype('float32'))
ivf_index.add(np.array(embedded_chunks).astype('float32'))

print("All indexes created!")

Creating Flat index...
Creating HNSW index...
Creating IVF index...
All indexes created!


In [16]:
# Create Retrieval Pipeline
def retrieve_documents(query: str, index_type: str = "flat", k: int = 5):
    # Create multimodal query embedding
    query_embedding = text_embeddings.embed_query(query)
    query_vector = np.array([query_embedding]).astype('float32')
    
    # Retrieve based on index type
    start_time = time.time()
    
    if index_type == "flat":
        scores, indices = flat_index.search(query_vector, k)
    elif index_type == "hnsw":
        scores, indices = hnsw_index.search(query_vector, k)
    elif index_type == "ivf":
        scores, indices = ivf_index.search(query_vector, k)
    
    retrieval_time = time.time() - start_time
    
    # Get retrieved documents
    retrieved_docs = []
    for idx, score in zip(indices[0], scores[0]):
        if idx < len(chunked_docs):
            retrieved_docs.append({
                'document': chunked_docs[idx],
                'score': float(score),
                'content_type': chunked_docs[idx].metadata['type']
            })
    
    return retrieved_docs, retrieval_time

In [17]:
# Test Retrieval Speed
test_query = "explain about llama2"

# Test all index types
index_types = ["flat", "hnsw", "ivf"]
speed_results = {}

for idx_type in index_types:
    docs, ret_time = retrieve_documents(test_query, idx_type, k=5)
    speed_results[idx_type] = ret_time
    print(f"{idx_type.upper()} Index: {ret_time:.4f} seconds")

# Find fastest
fastest_index = min(speed_results, key=speed_results.get)
print(f"Fastest index: {fastest_index.upper()}")

FLAT Index: 0.0010 seconds
HNSW Index: 0.0000 seconds
IVF Index: 0.0000 seconds
Fastest index: HNSW


In [18]:
def calculate_accuracy_scores(query: str, retrieved_docs: List, k: int = 5):
    query_embedding = text_embeddings.embed_query(query)
    
    accuracy_scores = []
    for item in retrieved_docs[:k]:
        if item['content_type'] == 'text':
            doc_embedding = text_embeddings.embed_query(item['document'].page_content)
        else:
            # For images/tables, use the pre-computed embedding
            doc_idx = chunked_docs.index(item['document'])
            doc_embedding = embedded_chunks[doc_idx]
        
        # Cosine similarity
        cosine_sim = np.dot(query_embedding, doc_embedding) / (
            np.linalg.norm(query_embedding) * np.linalg.norm(doc_embedding)
        )
        accuracy_scores.append({
            'score': cosine_sim,
            'content_type': item['content_type'],
            'page': item['document'].metadata.get('page', 'N/A')
        })
    
    return accuracy_scores

# Test accuracy
retrieved_docs, _ = retrieve_documents(test_query, fastest_index, k=5)
accuracy_scores = calculate_accuracy_scores(test_query, retrieved_docs)

print("Accuracy Scores:")
for i, score_info in enumerate(accuracy_scores):
    print(f"Document {i+1}: {score_info['score']:.4f} (Type: {score_info['content_type']}, Page: {score_info['page']})")


Accuracy Scores:
Document 1: 0.5991 (Type: text, Page: 2)
Document 2: 0.5936 (Type: text, Page: 2)
Document 3: 0.5429 (Type: table, Page: 77)
Document 4: 0.5348 (Type: table, Page: 77)
Document 5: 0.5249 (Type: text, Page: 77)


In [19]:
# Reranking with BM25
# Prepare BM25
tokenized_docs = [doc.page_content.split() for doc in chunked_docs]
bm25 = BM25Okapi(tokenized_docs)

def rerank_with_bm25(query: str, retrieved_docs: List, top_k: int = 3):
    # Get BM25 scores
    tokenized_query = query.split()
    bm25_scores = bm25.get_scores(tokenized_query)
    
    # Rerank retrieved documents
    reranked = []
    for item in retrieved_docs:
        doc_idx = chunked_docs.index(item['document'])
        combined_score = item['score'] * 0.7 + bm25_scores[doc_idx] * 0.3
        reranked.append({
            'document': item['document'],
            'vector_score': item['score'],
            'bm25_score': bm25_scores[doc_idx],
            'combined_score': combined_score
        })
    
    # Sort by combined score
    reranked.sort(key=lambda x: x['combined_score'], reverse=True)
    return reranked[:top_k]

# Apply reranking
reranked_docs = rerank_with_bm25(test_query, retrieved_docs, top_k=3)
print("Reranked documents ready!")

Reranked documents ready!


In [33]:
reranked_docs

[{'document': Document(metadata={'page': 77, 'type': 'text', 'chunk_id': 2}, page_content='The fine-tuning data includes publicly available instruction datasets, as\nwell as over one million new human-annotated examples. Neither the pretraining\nnor the fine-tuning datasets include Meta user data. Data Freshness\nThe pretraining data has a cutoff of September 2022, but some tuning data is\nmore recent, up to July 2023. Evaluation Results\nSee evaluations for pretraining (Section 2); fine-tuning (Section 3); and safety (Section 4). Ethical Considerations and Limitations (Section 5.2)\nLlama 2 is a new technology that carries risks with use. Testing conducted to date has been in\nEnglish, and has not covered, nor could it cover all scenarios. For these reasons, as with all LLMs,\nLlama 2’s potential outputs cannot be predicted in advance, and the model may in some instances\nproduce inaccurate or objectionable responses to user prompts. Therefore, before deploying any\napplications of Ll

In [20]:
# Create Prompt Template
prompt_template = PromptTemplate(
    input_variables=["context", "question"],
    template="""You are a helpful AI assistant. Use the following context to answer the question.
    
Context:
{context}

Question: {question}

Answer: Provide a comprehensive answer based on the context above. If the context doesn't contain enough information, say so clearly."""
)

In [24]:
# !pip uninstall protobuf -y
# !pip install protobuf==4.23.3



In [22]:
# Generate Output with LLM
llm = ChatGoogleGenerativeAI(model="gemini-1.5-flash", temperature=0.1)

def generate_answer(query: str, retrieved_docs: List):
    # Prepare context
    context = "\n\n".join([
        f"Document {i+1}: {doc['document'].page_content}" 
        for i, doc in enumerate(retrieved_docs)
    ])
    
    # Generate response
    formatted_prompt = prompt_template.format(context=context, question=query)
    response = llm.invoke(formatted_prompt)
    
    return response.content

# Generate answer
answer = generate_answer(test_query, reranked_docs)
print("Generated Answer:")
print(answer)

Generated Answer:
Llama 2 is a large language model (LLM) pretrained on 2 trillion tokens of publicly available data (cutoff September 2022).  Its fine-tuning involved publicly available instruction datasets and over one million new human-annotated examples.  Importantly, neither the pretraining nor fine-tuning data included Meta user data.  Some of the fine-tuning data is more recent, with a cutoff of July 2023.

Llama 2 is intended for commercial and research use in English.  The tuned models are designed for assistant-like chat applications, while the pretrained models can be adapted for various natural language generation tasks.  Its use is restricted; it cannot be used in ways that violate applicable laws or regulations, in languages other than English, or in any manner prohibited by Llama 2's Acceptable Use Policy and Licensing Agreement.

While testing has been conducted in English, it hasn't covered all scenarios.  Therefore, Llama 2 may produce inaccurate or objectionable resp

In [23]:
# Create DOCX Output
def create_docx_output(query: str, answer: str, retrieved_docs: List, filename: str = "rag_output.docx"):
    doc = DocxDocument()
    
    # Title
    doc.add_heading('RAG Pipeline Output', 0)
    
    # Query
    doc.add_heading('Query:', level=1)
    doc.add_paragraph(query)
    
    # Answer
    doc.add_heading('Generated Answer:', level=1)
    doc.add_paragraph(answer)
    
    # Retrieved Documents
    doc.add_heading('Retrieved Documents:', level=1)
    for i, item in enumerate(retrieved_docs):
        doc.add_heading(f'Document {i+1}:', level=2)
        doc.add_paragraph(f"Score: {item.get('combined_score', item.get('score', 0)):.4f}")
        doc.add_paragraph(f"Content: {item['document'].page_content[:500]}...")
        doc.add_paragraph(f"Metadata: {item['document'].metadata}")
    
    doc.save(filename)
    print(f"Output saved to {filename}")

# Create DOCX output
create_docx_output(test_query, answer, reranked_docs)

print("RAG Pipeline Complete!")

Output saved to rag_output.docx
RAG Pipeline Complete!
