In [2]:
%pip install -qU numpy pandas chromadb PyPDF2 litellm python-dotenv

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.3.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


### Setup LLM API Key

In [3]:
import os 
from dotenv import load_dotenv


# Load key/value pairs from a local `.env` file into the process environment so the
# API key does not have to be hardcoded in the notebook.
load_dotenv()
# NOTE(review): the re-export below is intentionally disabled. It is a no-op when the
# key is already in the environment and raises TypeError when GOOGLE_API_KEY is unset
# (os.getenv returns None, which os.environ cannot store).
# os.environ["GOOGLE_API_KEY"] = os.getenv("GOOGLE_API_KEY")




True

### 1. Data Preparation

1.1 Load PDF

In [4]:
import requests  # Import the requests library to handle HTTP requests

# URL of the PDF file to download
file_url = r"https://ijsret.com/wp-content/uploads/2024/09/IJSRET_V10_issue5_474.pdf"

# Local path and filename where the PDF will be saved
pdf_path = "multi_modal_rag.pdf"

# Send the HTTP GET request. The timeout keeps the cell from hanging indefinitely
# if the server stalls (requests has no default timeout).
response = requests.get(file_url, timeout=30)

# Fail loudly on 4xx/5xx responses; otherwise an HTML error page would be written
# to disk and only surface later as a confusing PDF parsing failure.
response.raise_for_status()

# Open the file in binary write mode and save the downloaded bytes
with open(pdf_path, "wb") as file:
    file.write(response.content)


1.2 Extract Text From PDF

In [5]:
from PyPDF2 import PdfReader
from typing import Optional

# Per-page text of the most recently extracted PDF; refilled on every extract_text call.
pdf_pages = []

def extract_text(pdf_path: str) -> Optional[str]:
    """
    Extract text from all pages of a given PDF file.

    Side effect: the module-level ``pdf_pages`` list is emptied and then filled
    with one string per page. Resetting it first makes the function idempotent;
    without the reset, calling it a second time would append the same pages
    again and return duplicated text.

    Args:
        pdf_path (str): Path to the PDF file.

    Returns:
        Optional[str]: Page texts joined with newlines, or None if extraction
        fails (the error is printed rather than raised).
    """
    # Clear in place (not rebind) so any other reference to the list stays valid.
    pdf_pages.clear()

    try:
        with open(pdf_path, "rb") as file:
            pdf_reader = PdfReader(file)

            for page in pdf_reader.pages:
                # Fall back to "" so a page that yields no text cannot break the join.
                pdf_pages.append(page.extract_text() or "")

        return "\n".join(pdf_pages)

    except Exception as e:
        print(f"Failed to extract text from {pdf_path}: {e}")
        return None

In [6]:
pdf_text = extract_text("multi_modal_rag.pdf")

# extract_text signals failure by returning None. Stop here with a clear message
# instead of letting the chunking step fail later with an unrelated AttributeError.
if pdf_text is None:
    raise RuntimeError("PDF text extraction failed; see the message printed above.")

### 2. Text Chunking

In [7]:
from typing import List
import re 
from collections import deque


def text_chunk(text: str, max_length: int = 1000) -> List[str]:
    """
    Split a text into chunks made of whole sentences.

    Newlines are treated as spaces, and the text is split after sentence-ending
    punctuation (. ! ?). Sentences are then packed greedily into chunks, joined
    by a single space, without exceeding ``max_length`` characters per chunk.

    A single sentence longer than ``max_length`` is kept intact as its own
    chunk, so that chunk can exceed the limit.

    Args:
        text (str): The input text to be chunked.
        max_length (int): Maximum length of each chunk, including the spaces
            that join its sentences.

    Returns:
        List[str]: Text chunks with no leading or trailing whitespace. Empty
        or whitespace-only input yields an empty list.
    """
    # Split after sentence-ending punctuation so it stays attached to its sentence.
    sentences = re.split(r"(?<=[.!?])\s+", text.replace("\n", " "))

    chunks: List[str] = []
    current = ""

    for raw_sentence in sentences:
        sentence = raw_sentence.strip()
        if not sentence:
            continue

        if not current:
            # First sentence of a chunk: no separator, so no leading space.
            current = sentence
        elif len(current) + 1 + len(sentence) > max_length:
            # The +1 counts the joining space, so a chunk never overshoots the limit.
            chunks.append(current)
            current = sentence
        else:
            current = f"{current} {sentence}"

    if current:
        chunks.append(current)
    return chunks

In [8]:
# Split the extracted PDF text into sentence-aligned chunks using the default max_length.
chunks = text_chunk(pdf_text)

In [9]:
# Preview only the first few chunks; displaying the whole list floods the notebook output.
chunks[:3]

[' © 202 4 IJSRET   2399     International Journal of Scientific Research &  Engineering Trends                                                                                                          Volume 10, Issue 5, Sept -Oct-2024, ISSN (Online): 2395 -566X     Advanced Multi  Model RAG Application   Professor Disha Nagpure, Sujal Pore, Shardul Deshmukh, Aditya Suryawanshi    Department of Artificial Intelligence and Machine Learning,   Alard College of  Engineering and Management, Marunji, Pune, Maharashtra, India   Abstract - This paper presents a modular, context -aware multimodal Retrieval -Augmented Generation (RAG) application that  leverages both chain -based and agentic execution strategies. Powered by Gemini 1.5 F lash as the core language model, the  system integrates Langchain and Langsmith frameworks to enable dynamic document retrieval, task orchestration, and  seamless handling of multiple data sources.',
 "Key features include a YouTube summarizer using transcript A

In [10]:
# Sanity check: number of chunks produced by text_chunk.
print(len(chunks))

42


### 3. Create Vector Store

In [11]:
# Provides the `SentenceTransformer` class imported in the next cell.
%pip install -qU sentence-transformers

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.3.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [12]:
import warnings
# NOTE(review): this suppresses every warning category for the rest of the session,
# presumably to hide noisy import-time warnings from the libraries below. Confirm
# that is the intent; a narrower filter would avoid masking real deprecation notices.
warnings.filterwarnings("ignore")
import chromadb 
from chromadb.utils import embedding_functions
from chromadb.api.models import Collection
from sentence_transformers import SentenceTransformer

# TODO: To be continued