In [1]:
import fitz
import os
import numpy as np
import json
import re
from openai import OpenAI
import torch
import pickle
from sentence_transformers import SentenceTransformer, SimilarityFunction
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the pre-trained sentence-embedding model
embedder = SentenceTransformer("all-MiniLM-L6-v2") # Use this model to create the embeddings

# OpenAI client; the API key is read from the OPENAI_API_KEY environment variable
client = OpenAI(
    api_key=os.getenv("OPENAI_API_KEY")
)

In [3]:
def extract_text_from_pdf(pdf_path, min_chars=50):
    """
    Extract text content from a PDF file with page separation.

    Args:
        pdf_path (str): Path to the PDF file
        min_chars (int): A page is kept only when its stripped text is
            longer than this many characters (default 50)

    Returns:
        List[Dict]: One entry per kept page, each with a "text" string and a
            "metadata" dict holding the "source" path and the 1-based "page" number
    """
    print(f"Extracting text from {pdf_path}...")  # Print the path of the PDF being processed
    pages = []  # Pages that carry enough text to be worth indexing

    # Open the document as a context manager so the underlying file is closed
    # even if text extraction fails part-way through (it was previously never closed).
    with fitz.open(pdf_path) as pdf:
        for page_num in range(len(pdf)):
            text = pdf[page_num].get_text()  # Extract text from the current page

            # Skip pages with very little text (e.g. blank or image-only pages)
            if len(text.strip()) > min_chars:
                pages.append({
                    "text": text,
                    "metadata": {
                        "source": pdf_path,  # Source file path
                        "page": page_num + 1  # Page number (1-based index)
                    }
                })

    print(f"Extracted {len(pages)} pages with content")  # Print the number of pages extracted
    return pages

In [21]:
def chunk_text(text, chunk_size=1000, overlap=200):
    """
    Split text into overlapping chunks.

    Consecutive chunks start `chunk_size - overlap` characters apart, so each
    chunk repeats the last `overlap` characters of the previous one. The final
    chunk may be shorter than `chunk_size`.

    Args:
        text (str): Input text to chunk
        chunk_size (int): Size of each chunk in characters (must be > 0)
        overlap (int): Overlap between chunks in characters
            (must satisfy 0 <= overlap < chunk_size)

    Returns:
        List[Dict]: List of chunks, each with "text" and a "metadata" dict
            holding "start_pos" and "end_pos" offsets into the original text

    Raises:
        ValueError: If chunk_size is not positive, or overlap is negative or
            not smaller than chunk_size
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")
    # overlap == chunk_size would give a zero step (range() error) and
    # overlap > chunk_size a negative step (silently no chunks at all).
    if not 0 <= overlap < chunk_size:
        raise ValueError(
            f"overlap must satisfy 0 <= overlap < chunk_size, "
            f"got overlap={overlap}, chunk_size={chunk_size}"
        )

    step = chunk_size - overlap  # Distance between consecutive chunk starts
    chunks = []

    # Every start is < len(text), so each slice holds at least one character
    for start in range(0, len(text), step):
        piece = text[start:start + chunk_size]
        chunks.append({
            "text": piece,
            "metadata": {
                "start_pos": start,  # Start position of the chunk in the original text
                "end_pos": start + len(piece)  # End position (exclusive) in the original text
            }
        })

    print(f"Created {len(chunks)} text chunks")  # Print the number of chunks created
    return chunks

In [5]:
class SimpleVectorStore:
    """
    A simple in-memory vector store implementation using NumPy.

    Items are kept in three parallel lists (vectors, texts, metadata) that
    share the same index; search is a linear scan using cosine similarity.
    """
    def __init__(self):
        self.vectors = []  # List to store vector embeddings (as numpy arrays)
        self.texts = []  # List to store text content
        self.metadata = []  # List to store metadata dicts

    def add_item(self, text, embedding, metadata=None):
        """
        Add an item to the vector store.

        Args:
            text (str): Text content
            embedding (List[float]): Vector embedding
            metadata (Dict, optional): Additional metadata
        """
        self.vectors.append(np.array(embedding))  # Append the embedding as a numpy array
        self.texts.append(text)  # Append the text content
        self.metadata.append(metadata or {})  # Append the metadata or an empty dict if None

    def similarity_search(self, query_embedding, k=5, filter_func=None):
        """
        Find the most similar items to a query embedding.

        Args:
            query_embedding (List[float]): Query embedding vector
            k (int): Maximum number of results to return; values <= 0 yield
                an empty list
            filter_func (callable, optional): Called with each item's metadata
                dict; items for which it returns a falsy value are skipped

        Returns:
            List[Dict]: Up to k items sorted by descending cosine similarity,
                each with "text", "metadata" and a float "similarity". Items
                with equal scores keep their insertion order.
        """
        if not self.vectors:
            return []  # Return an empty list if there are no vectors

        # Convert query embedding to numpy array
        query_vector = np.array(query_embedding)
        # The query norm does not change per item, so compute it once
        query_norm = np.linalg.norm(query_vector)

        # Calculate similarities using cosine similarity
        similarities = []
        for i, vector in enumerate(self.vectors):
            # Skip if doesn't pass the filter
            if filter_func and not filter_func(self.metadata[i]):
                continue

            denominator = query_norm * np.linalg.norm(vector)
            if denominator == 0:
                # Cosine similarity is undefined for a zero-length vector;
                # score it 0.0 instead of producing NaN, which would make the
                # sort below order results unpredictably.
                similarity = 0.0
            else:
                similarity = np.dot(query_vector, vector) / denominator
            similarities.append((i, similarity))  # Append index and similarity score

        # Sort by similarity (descending); list.sort is stable, so ties keep insertion order
        similarities.sort(key=lambda item: item[1], reverse=True)

        # max(k, 0) keeps a negative k from being read as a "from the end" slice bound
        return [
            {
                "text": self.texts[idx],  # Add the text content
                "metadata": self.metadata[idx],  # Add the metadata
                "similarity": float(score)  # Add the similarity score as a plain float
            }
            for idx, score in similarities[:max(k, 0)]
        ]

In [6]:
def create_embeddings(text):
    """
    Encode text with the module-level `embedder` model.

    Args:
        text (str or List[str]): Input text, or a list of texts to encode together

    Returns:
        The value of `embedder.encode(text)`, passed through unchanged.
        Callers in this notebook pass a list and index the result once per
        input text.
    """
    # Single delegation point so the embedding backend is chosen in one place
    return embedder.encode(text)

In [7]:
def process_document(pdf_path, chunk_size=1000, chunk_overlap=200):
    """
    Build a searchable vector store from a PDF for RAG.

    Pipeline: extract text page by page, split each page into overlapping
    chunks, embed all chunk texts in one call, then index them.

    Args:
        pdf_path (str): Path to the PDF file
        chunk_size (int): Size of each chunk in characters
        chunk_overlap (int): Overlap between chunks in characters

    Returns:
        SimpleVectorStore: Vector store containing document chunks
    """
    all_chunks = []
    for page in extract_text_from_pdf(pdf_path):
        # Chunking is done per page, so a chunk never spans two pages
        page_chunks = chunk_text(page["text"], chunk_size, chunk_overlap)

        # Merge the page's source/page-number metadata into every chunk
        for chunk in page_chunks:
            chunk["metadata"].update(page["metadata"])

        all_chunks.extend(page_chunks)

    # Embed every chunk text in a single call
    print("Creating embeddings for chunks...")
    chunk_embeddings = create_embeddings([chunk["text"] for chunk in all_chunks])

    # Index each chunk alongside its embedding
    vector_store = SimpleVectorStore()
    for chunk, embedding in zip(all_chunks, chunk_embeddings):
        vector_store.add_item(
            text=chunk["text"],
            embedding=embedding,
            metadata=chunk["metadata"]
        )

    print(f"Vector store created with {len(all_chunks)} chunks")
    return vector_store

In [14]:
def generate_hypothetical_document(query, desired_length=1000):
    """
    Ask the chat model to write a document that would answer the query.

    Args:
        query (str): User query
        desired_length (int): Target length in characters; it is only stated
            in the prompt, not enforced on the returned text

    Returns:
        str: Content of the first completion choice
    """
    # System instructions; the literal (including its embedded indentation)
    # is sent to the model exactly as written here.
    instructions = f"""You are an expert document creator. 
    Given a question, generate a detailed document that would directly answer this question.
    The document should be approximately {desired_length} characters long and provide an in-depth, 
    informative answer to the question. Write as if this document is from an authoritative source
    on the subject. Include specific details, facts, and explanations.
    Do not mention that this is a hypothetical document - just write the content directly."""

    conversation = [
        {"role": "system", "content": instructions},
        {"role": "user", "content": f"Question: {query}\n\nGenerate a document that fully answers this question:"},
    ]

    # Low temperature keeps the generated document close to deterministic
    completion = client.chat.completions.create(
        model="gpt-3.5-turbo-1106",
        messages=conversation,
        temperature=0.1
    )

    return completion.choices[0].message.content

In [15]:
def generate_response(query, relevant_chunks):
    """
    Answer the query with the chat model, using retrieved chunks as context.

    Args:
        query (str): User query
        relevant_chunks (List[Dict]): Retrieved chunks; only each chunk's
            "text" entry is used

    Returns:
        str: Content of the first completion choice
    """
    # Chunk texts are separated by a blank line to form the context block
    context = "\n\n".join(chunk["text"] for chunk in relevant_chunks)

    conversation = [
        {"role": "system", "content": "You are a helpful assistant. Answer the question based on the provided context."},
        {"role": "user", "content": f"Context:\n{context}\n\nQuestion: {query}"},
    ]

    # max_tokens caps the length of the generated answer
    completion = client.chat.completions.create(
        model="gpt-3.5-turbo-1106",
        messages=conversation,
        temperature=0.5,
        max_tokens=500
    )

    return completion.choices[0].message.content

In [22]:
def hyde_rag(query, pdf_path, k=5, should_generate_response=True):
    """
    Perform RAG using Hypothetical Document Embedding (HyDE).

    The embedding of a generated hypothetical answer document, not of the
    query itself, is used to retrieve chunks. The PDF is extracted, chunked
    and embedded again on every call.

    Args:
        query (str): User query
        pdf_path (str): Path to the PDF file to search
        k (int): Number of chunks to retrieve
        should_generate_response (bool): Whether to generate a final response

    Returns:
        Dict: "query", "hypothetical_document" and "retrieved_chunks", plus
            "response" when should_generate_response is true
    """
    print(f"\n=== Processing query with HyDE: {query} ===\n")

    # Step 1: have the model write a document that would answer the query
    print("Generating hypothetical document...")
    hyde_doc = generate_hypothetical_document(query)
    print(f"Generated hypothetical document of {len(hyde_doc)} characters")

    # Step 2: embed that document (a one-element batch, hence the [0])
    print("Creating embedding for hypothetical document...")
    hyde_vector = create_embeddings([hyde_doc])[0]

    # Step 3: index the PDF and retrieve the chunks closest to the hypothetical document
    print(f"Retrieving {k} most similar chunks...")
    store = process_document(pdf_path)
    matches = store.similarity_search(hyde_vector, k=k)

    outcome = {
        "query": query,
        "hypothetical_document": hyde_doc,
        "retrieved_chunks": matches
    }

    # Step 4: the final answer is optional
    if not should_generate_response:
        return outcome

    print("Generating final response...")
    outcome["response"] = generate_response(query, matches)
    return outcome

In [23]:
# Example run: answer one question about the PDF using the HyDE pipeline
pdf_path = "AI_Information.pdf"  # Path to the PDF document (relative path)
query = "How do transformers handle sequential data compared to RNNs?"  # User's question

# Retrieve the 6 most similar chunks; as the cell's last expression, the returned dict is displayed
hyde_rag(query=query, pdf_path=pdf_path, k=6)


=== Processing query with HyDE: How do transformers handle sequential data compared to RNNs? ===

Generating hypothetical document...
Generated hypothetical document of 1692 characters
Creating embedding for hypothetical document...
Retrieving 6 most similar chunks...
Extracting text from AI_Information.pdf...
Extracted 15 pages with content
Created 4 text chunks
Created 4 text chunks
Created 3 text chunks
Created 3 text chunks
Created 3 text chunks
Created 3 text chunks
Created 3 text chunks
Created 3 text chunks
Created 3 text chunks
Created 3 text chunks
Created 3 text chunks
Created 3 text chunks
Created 3 text chunks
Created 4 text chunks
Created 3 text chunks
Creating embeddings for chunks...
Vector store created with 48 chunks
Generating final response...


{'query': 'How do transformers handle sequential data compared to RNNs?',
 'hypothetical_document': 'Transformers and Recurrent Neural Networks (RNNs) are both used for handling sequential data in machine learning and natural language processing tasks. However, they have different approaches to processing sequential data.\n\nRNNs process sequential data by iterating through the input sequence one element at a time, maintaining a hidden state that captures information about the sequence seen so far. This allows RNNs to capture dependencies and patterns in sequential data. However, RNNs suffer from issues such as vanishing gradients and difficulty in capturing long-range dependencies.\n\nOn the other hand, transformers handle sequential data using an attention mechanism that allows them to capture dependencies between all elements in the sequence simultaneously. This attention mechanism enables transformers to capture long-range dependencies more effectively than RNNs. Additionally, tran