In [9]:
import fitz
import os
import numpy as np
import json
import re
from openai import OpenAI
import torch
import pickle
from sentence_transformers import SentenceTransformer, SimilarityFunction
from tqdm import tqdm

In [5]:
# Load the pre-trained model
embedder = SentenceTransformer("all-MiniLM-L6-v2") # Use this model to create the embeddings

# OpenAI client; the API key is read from the OPENAI_API_KEY environment variable
# (os.getenv yields None when the variable is not set).
client = OpenAI(
    api_key=os.getenv("OPENAI_API_KEY")
)

In [2]:
def extract_text_from_pdf(pdf_path):
    """
    Extract text content from a PDF file with page separation.

    Pages whose stripped text is 50 characters or shorter are skipped, so the
    result may contain fewer entries than the PDF has pages. The document is
    closed before the function returns, including when extraction fails.

    Args:
        pdf_path (str): Path to the PDF file

    Returns:
        List[Dict]: One dict per kept page with the keys "text" (the string
            returned by ``page.get_text()``) and "metadata" (a dict holding
            "source", the given path, and "page", the 1-based page number)
    """
    print(f"Extracting text from {pdf_path}...")  # Print the path of the PDF being processed
    pages = []  # Pages that carry enough text to be worth indexing

    # Open the document as a context manager so its file handle is released
    # even if text extraction raises part-way through.
    with fitz.open(pdf_path) as pdf:
        # Number pages from 1 so the metadata matches what a PDF viewer shows
        for page_number, page in enumerate(pdf, start=1):
            text = page.get_text()  # Extract text from the current page

            # Skip pages with very little text (50 characters or fewer)
            if len(text.strip()) > 50:
                pages.append({
                    "text": text,
                    "metadata": {
                        "source": pdf_path,  # Source file path
                        "page": page_number  # Page number (1-based index)
                    }
                })

    print(f"Extracted {len(pages)} pages with content")  # Print the number of pages extracted
    return pages

In [3]:
def chunk_text(text, metadata, chunk_size=1000, overlap=200):
    """
    Split text into overlapping chunks while preserving metadata.

    Chunks start every ``chunk_size - overlap`` characters. A chunk whose
    stripped text is 50 characters or shorter is dropped. The ``metadata``
    argument itself is never modified; each chunk gets its own shallow copy.

    Args:
        text (str): Input text to chunk
        metadata (Dict): Metadata to preserve
        chunk_size (int): Size of each chunk in characters; must be positive
        overlap (int): Overlap between chunks in characters; must satisfy
            0 <= overlap < chunk_size

    Returns:
        List[Dict]: List of text chunks with metadata. Each chunk's metadata
            is the input metadata plus "chunk_index", "start_char",
            "end_char" and "is_summary" (always False here)

    Raises:
        ValueError: If chunk_size is not positive, or overlap is negative or
            not smaller than chunk_size
    """
    # Validate up front: a zero step makes range() fail with an unrelated
    # message, and a negative step silently produces no chunks at all.
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")
    if not 0 <= overlap < chunk_size:
        raise ValueError(
            f"overlap must satisfy 0 <= overlap < chunk_size, "
            f"got overlap={overlap} with chunk_size={chunk_size}"
        )

    step = chunk_size - overlap  # Distance between consecutive chunk starts
    chunks = []

    for start in range(0, len(text), step):
        piece = text[start:start + chunk_size]  # May be shorter at the end of the text

        # Skip very small chunks (50 characters or fewer after stripping)
        if len(piece.strip()) <= 50:
            continue

        # Copy the metadata so chunks do not share (or mutate) the caller's dict
        chunk_metadata = metadata.copy()
        chunk_metadata.update({
            "chunk_index": len(chunks),  # Index among the chunks that were kept
            "start_char": start,  # Start character index of the chunk
            "end_char": start + len(piece),  # End character index of the chunk
            "is_summary": False  # Flag indicating this is not a summary
        })

        chunks.append({
            "text": piece,
            "metadata": chunk_metadata
        })

    return chunks

In [4]:
class SimpleVectorStore:
    """
    A simple vector store implementation using NumPy.

    Items are kept in three parallel lists (vectors, texts, metadata) that
    share the same index; search is a linear scan using cosine similarity.
    """
    def __init__(self):
        self.vectors = []  # List to store vector embeddings
        self.texts = []  # List to store text content
        self.metadata = []  # List to store metadata

    def add_item(self, text, embedding, metadata=None):
        """
        Add an item to the vector store.

        Args:
            text (str): Text content
            embedding (List[float]): Vector embedding
            metadata (Dict, optional): Additional metadata; an empty dict is
                stored when this is None (or otherwise falsy)
        """
        self.vectors.append(np.array(embedding))  # Append the embedding as a numpy array
        self.texts.append(text)  # Append the text content
        self.metadata.append(metadata or {})  # Append the metadata or an empty dict if None

    def similarity_search(self, query_embedding, k=5, filter_func=None):
        """
        Find the most similar items to a query embedding.

        Similarity is the cosine of the angle between the query and each
        stored vector. When either vector has zero norm the cosine is
        undefined; such pairs are scored 0.0 instead of NaN so that the
        ranking stays well defined.

        Args:
            query_embedding (List[float]): Query embedding vector
            k (int): Number of results to return; values <= 0 yield no results
            filter_func (callable, optional): Called with an item's metadata;
                items for which it returns a falsy value are skipped

        Returns:
            List[Dict]: Up to k items sorted by descending similarity, each
                with the keys "text", "metadata" and "similarity" (float)
        """
        if not self.vectors:
            return []  # Return an empty list if there are no vectors

        # Convert query embedding to numpy array; its norm is the same for
        # every comparison, so compute it once outside the loop.
        query_vector = np.array(query_embedding)
        query_norm = np.linalg.norm(query_vector)

        scored = []
        for idx, vector in enumerate(self.vectors):
            # Skip if doesn't pass the filter
            if filter_func and not filter_func(self.metadata[idx]):
                continue

            denominator = query_norm * np.linalg.norm(vector)
            if denominator > 0:
                similarity = float(np.dot(query_vector, vector) / denominator)
            else:
                # Zero-norm query or item: avoid 0/0 -> NaN, which would
                # make the sort order below unpredictable.
                similarity = 0.0
            scored.append((idx, similarity))

        # Sort by similarity (descending); ties keep insertion order
        scored.sort(key=lambda pair: pair[1], reverse=True)

        # Return top k results (max() keeps a negative k from slicing from the end)
        return [
            {
                "text": self.texts[idx],  # Add the text content
                "metadata": self.metadata[idx],  # Add the metadata
                "similarity": score  # Add the similarity score
            }
            for idx, score in scored[:max(k, 0)]
        ]

In [None]:
def create_embeddings(text):
    """
    Encode text with the module-level ``embedder``.

    Args:
        text (str or List[str]): A single text or a list of texts; it is
            handed to ``embedder.encode`` unchanged

    Returns:
        The value returned by ``embedder.encode(text)`` — presumably one
        vector for a single string and one vector per entry for a list
        (TODO confirm against the sentence-transformers documentation)
    """
    return embedder.encode(text)

In [10]:
def generate_page_summary(page_text):
    """
    Generate a concise summary of a page.

    Only the first 6000 characters of the page are sent to the model; the
    request goes through the module-level OpenAI ``client``.

    Args:
        page_text (str): Text content of the page

    Returns:
        str: Generated summary (the content of the first returned choice)
    """
    # Instructions for the summarization model. The four-space indent on the
    # continuation lines is part of the prompt text that is sent.
    system_prompt = (
        "You are an expert summarization system.\n"
        "    Create a detailed summary of the provided text. \n"
        "    Focus on capturing the main topics, key information, and important facts.\n"
        "    Your summary should be comprehensive enough to understand what the page contains\n"
        "    but more concise than the original."
    )

    # Cap the input length. The limit counts characters, not tokens; slicing
    # leaves text that is already short enough untouched.
    char_limit = 6000
    truncated_text = page_text[:char_limit]

    conversation = [
        {"role": "system", "content": system_prompt},  # System message to guide the assistant
        {"role": "user", "content": f"Please summarize this text:\n\n{truncated_text}"},  # Text to summarize
    ]

    # Ask the chat model for the summary
    completion = client.chat.completions.create(
        model="gpt-3.5-turbo-1106",
        messages=conversation,
        temperature=0.3,
    )

    return completion.choices[0].message.content

In [11]:
def process_document_hierarchically(pdf_path, chunk_size=1000, chunk_overlap=200):
    """
    Process a document into hierarchical indices.

    Builds two levels for the same PDF: one generated summary per extracted
    page, and overlapping text chunks of every page. Both levels are embedded
    and placed in separate vector stores.

    Args:
        pdf_path (str): Path to the PDF file
        chunk_size (int): Size of each detailed chunk
        chunk_overlap (int): Overlap between chunks

    Returns:
        Tuple[SimpleVectorStore, SimpleVectorStore]: Summary and detailed vector stores
    """
    def _populate(store, entries, vectors):
        # Insert each entry together with the vector at the same position
        for position, entry in enumerate(entries):
            store.add_item(
                text=entry["text"],
                embedding=vectors[position],
                metadata=entry["metadata"]
            )
        return store

    # Extract pages from PDF
    pages = extract_text_from_pdf(pdf_path)

    # Upper level: one summary per page, tagged with the page's metadata
    print("Generating page summaries...")
    summaries = []
    for i, page in enumerate(pages):
        print(f"Summarizing page {i+1}/{len(pages)}...")
        summaries.append({
            "text": generate_page_summary(page["text"]),
            "metadata": {**page["metadata"], "is_summary": True}
        })

    # Lower level: chunks of every page, flattened into a single list
    detailed_chunks = [
        chunk
        for page in pages
        for chunk in chunk_text(page["text"], page["metadata"], chunk_size, chunk_overlap)
    ]
    print(f"Created {len(detailed_chunks)} detailed chunks")

    # Embed each level with one call
    print("Creating embeddings for summaries...")
    summary_embeddings = create_embeddings([entry["text"] for entry in summaries])

    print("Creating embeddings for detailed chunks...")
    chunk_embeddings = create_embeddings([entry["text"] for entry in detailed_chunks])

    # Fill one vector store per level
    summary_store = _populate(SimpleVectorStore(), summaries, summary_embeddings)
    detailed_store = _populate(SimpleVectorStore(), detailed_chunks, chunk_embeddings)

    print(f"Created vector stores with {len(summaries)} summaries and {len(detailed_chunks)} chunks")
    return summary_store, detailed_store

In [12]:

def retrieve_hierarchically(query, summary_store, detailed_store, k_summaries=3, k_chunks=5):
    """
    Retrieve information using hierarchical indices.

    The query is first matched against page summaries; the detailed search
    is then restricted to chunks whose page belongs to one of the matched
    summaries. Each returned chunk is annotated in place with a "summary"
    key holding the text of the summary for its page.

    Args:
        query (str): User query
        summary_store (SimpleVectorStore): Store of document summaries
        detailed_store (SimpleVectorStore): Store of detailed chunks
        k_summaries (int): Number of summaries to retrieve
        k_chunks (int): Number of chunks to retrieve per summary

    Returns:
        List[Dict]: Retrieved chunks with relevance scores
    """
    print(f"Performing hierarchical retrieval for query: {query}")

    # The same query embedding is used at both levels
    query_embedding = create_embeddings(query)

    # Level 1: find the most relevant page summaries
    summary_results = summary_store.similarity_search(query_embedding, k=k_summaries)
    print(f"Retrieved {len(summary_results)} relevant summaries")

    # Pages that the matched summaries belong to
    relevant_pages = [hit["metadata"]["page"] for hit in summary_results]

    # Level 2: search chunks, admitting only those from the relevant pages;
    # the budget is k_chunks for every matched summary.
    detailed_results = detailed_store.similarity_search(
        query_embedding,
        k=k_chunks * len(relevant_pages),
        filter_func=lambda chunk_meta: chunk_meta["page"] in relevant_pages
    )
    print(f"Retrieved {len(detailed_results)} detailed chunks from relevant pages")

    # Attach the first matching page summary to every chunk
    for hit in detailed_results:
        hit_page = hit["metadata"]["page"]
        page_summary = next(
            (candidate for candidate in summary_results if candidate["metadata"]["page"] == hit_page),
            None
        )
        if page_summary is not None:
            hit["summary"] = page_summary["text"]

    return detailed_results

In [13]:
def generate_response(query, retrieved_chunks):
    """
    Generate a response based on the query and retrieved chunks.

    Every chunk is rendered as "[Page N]: text"; the rendered chunks are
    joined with blank lines and sent, together with the question, to the
    chat model through the module-level OpenAI ``client``.

    Args:
        query (str): User query
        retrieved_chunks (List[Dict]): Retrieved chunks from hierarchical search

    Returns:
        str: Generated response (the content of the first returned choice)
    """
    # Build the context: one page-labelled entry per chunk, blank-line separated
    context = "\n\n".join(
        f"[Page {chunk['metadata']['page']}]: {chunk['text']}"
        for chunk in retrieved_chunks
    )

    # Instructions that keep the assistant grounded in the supplied context
    system_message = (
        "You are a helpful AI assistant answering questions based on the provided context.\n"
        "Use the information from the context to answer the user's question accurately.\n"
        "If the context doesn't contain relevant information, acknowledge that.\n"
        "Include page numbers when referencing specific information."
    )

    conversation = [
        {"role": "system", "content": system_message},  # System message to guide the assistant
        {"role": "user", "content": f"Context:\n\n{context}\n\nQuestion: {query}"},  # Context and query
    ]

    # Ask the chat model for the answer
    completion = client.chat.completions.create(
        model="gpt-3.5-turbo-1106",
        messages=conversation,
        temperature=0.2,
    )

    return completion.choices[0].message.content

In [14]:
def hierarchical_rag(query, pdf_path, chunk_size=1000, chunk_overlap=200,
                    k_summaries=3, k_chunks=5, regenerate=False):
    """
    Complete hierarchical RAG pipeline.

    The vector stores are cached as two pickle files in the current working
    directory. The cache key is the PDF's base name plus ``chunk_size`` and
    ``chunk_overlap``, so calling again with different chunking builds fresh
    stores instead of silently reusing ones created with other settings.
    PDFs in different directories that share a base name still share a cache;
    pass ``regenerate=True`` to force a rebuild.

    Args:
        query (str): User query
        pdf_path (str): Path to the PDF document
        chunk_size (int): Size of each detailed chunk
        chunk_overlap (int): Overlap between chunks
        k_summaries (int): Number of summaries to retrieve
        k_chunks (int): Number of chunks to retrieve per summary
        regenerate (bool): Whether to regenerate vector stores

    Returns:
        Dict: Results with the keys "query", "response", "retrieved_chunks",
            "summary_count" and "detailed_count" (the last two are the number
            of texts held by the summary and detailed stores)
    """
    # Create store filenames for caching. The chunking parameters are part of
    # the name because they determine the content of the detailed store.
    cache_prefix = f"{os.path.basename(pdf_path)}_cs{chunk_size}_ov{chunk_overlap}"
    summary_store_file = f"{cache_prefix}_summary_store.pkl"
    detailed_store_file = f"{cache_prefix}_detailed_store.pkl"

    # Both files must be present for the cache to be usable
    cache_available = os.path.exists(summary_store_file) and os.path.exists(detailed_store_file)

    if regenerate or not cache_available:
        print("Processing document and creating vector stores...")
        # Process the document to create hierarchical indices and vector stores
        summary_store, detailed_store = process_document_hierarchically(
            pdf_path, chunk_size, chunk_overlap
        )

        # Save both stores for future use
        with open(summary_store_file, 'wb') as f:
            pickle.dump(summary_store, f)

        with open(detailed_store_file, 'wb') as f:
            pickle.dump(detailed_store, f)
    else:
        print("Loading existing vector stores...")
        # NOTE(security): pickle.load can execute arbitrary code from the file.
        # This is acceptable only because these files are written by this
        # function; never point the cache at files from an untrusted source.
        with open(summary_store_file, 'rb') as f:
            summary_store = pickle.load(f)

        with open(detailed_store_file, 'rb') as f:
            detailed_store = pickle.load(f)

    # Retrieve relevant chunks hierarchically using the query
    retrieved_chunks = retrieve_hierarchically(
        query, summary_store, detailed_store, k_summaries, k_chunks
    )

    # Generate a response based on the retrieved chunks
    response = generate_response(query, retrieved_chunks)

    return {
        "query": query,
        "response": response,
        "retrieved_chunks": retrieved_chunks,
        "summary_count": len(summary_store.texts),
        "detailed_count": len(detailed_store.texts)
    }

In [15]:
pdf_path = "AI_Information.pdf"  # Path to the PDF document
query = "How do transformers handle sequential data compared to RNNs?"  # User's question

# Run the full pipeline. Every definition cell above — including the one that
# defines create_embeddings — must have been executed first; otherwise this call
# fails with a NameError only after all page summaries have been generated.
hierarchical_rag(query=query, pdf_path=pdf_path)

Processing document and creating vector stores...
Extracting text from AI_Information.pdf...
Extracted 15 pages with content
Generating page summaries...
Summarizing page 1/15...
Summarizing page 2/15...
Summarizing page 3/15...
Summarizing page 4/15...
Summarizing page 5/15...
Summarizing page 6/15...
Summarizing page 7/15...
Summarizing page 8/15...
Summarizing page 9/15...
Summarizing page 10/15...
Summarizing page 11/15...
Summarizing page 12/15...
Summarizing page 13/15...
Summarizing page 14/15...
Summarizing page 15/15...
Created 47 detailed chunks
Creating embeddings for summaries...


NameError: name 'create_embeddings' is not defined