In [9]:
import fitz
import os
import numpy as np
import json
import re
from openai import OpenAI
import torch
from sentence_transformers import SentenceTransformer, SimilarityFunction
from tqdm import tqdm
import os
import numpy as np
from rank_bm25 import BM25Okapi
import fitz
from openai import OpenAI
import re
import json
import time
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Load the pre-trained model
embedder = SentenceTransformer("all-MiniLM-L6-v2") # Fazer o embedding usando esse

client = OpenAI(
    api_key=os.getenv("OPENAI_API_KEY")
)

In [3]:
def extract_text_from_pdf(pdf_path):
    """
    Extracts text from a PDF file and prints the first `num_chars` characters.

    Args:
    pdf_path (str): Path to the PDF file.

    Returns:
    str: Extracted text from the PDF.
    """
    # Open the PDF file
    mypdf = fitz.open(pdf_path)
    all_text = ""  # Initialize an empty string to store the extracted text

    # Iterate through each page in the PDF
    for page_num in range(mypdf.page_count):
        page = mypdf[page_num]  # Get the page
        text = page.get_text("text")  # Extract text from the page
        all_text += text  # Append the extracted text to the all_text string

    return all_text  # Return the extracted text

In [4]:
def chunk_text(text, chunk_size=1000, chunk_overlap=200):
    """
    Split text into overlapping chunks.
    
    Args:
        text (str): Input text to chunk
        chunk_size (int): Size of each chunk in characters
        chunk_overlap (int): Overlap between chunks in characters
        
    Returns:
        List[Dict]: List of chunks with text and metadata
    """
    chunks = []  # Initialize an empty list to store chunks
    
    # Iterate over the text with the specified chunk size and overlap
    for i in range(0, len(text), chunk_size - chunk_overlap):
        chunk = text[i:i + chunk_size]  # Extract a chunk of the specified size
        if chunk:  # Ensure we don't add empty chunks
            chunk_data = {
                "text": chunk,  # The chunk text
                "metadata": {
                    "start_char": i,  # Start character index of the chunk
                    "end_char": i + len(chunk)  # End character index of the chunk
                }
            }
            chunks.append(chunk_data)  # Add the chunk data to the list
    
    print(f"Created {len(chunks)} text chunks")  # Print the number of created chunks
    return chunks  # Return the list of chunks

In [5]:
def clean_text(text):
    """
    Clean text by removing extra whitespace and special characters.
    
    Args:
        text (str): Input text
        
    Returns:
        str: Cleaned text
    """
    # Replace multiple whitespace characters (including newlines and tabs) with a single space
    text = re.sub(r'\s+', ' ', text)
    
    # Fix common OCR issues by replacing tab and newline characters with a space
    text = text.replace('\\t', ' ')
    text = text.replace('\\n', ' ')
    
    # Remove any leading or trailing whitespace and ensure single spaces between words
    text = ' '.join(text.split())
    
    return text

In [10]:
def create_embeddings(text):
    """
    Create embeddings for the given text.
    
    Args:
        texts (str or List[str]): Input text(s)
        model (str): Embedding model name
        
    Returns:
        List[List[float]]: Embedding vector(s)
    """
    # Create embeddings for the current batch
    response = embedder.encode(text)
    return response  # Return the embeddings
        

In [18]:
class SimpleVectorStore:
    """
    A simple vector store implementation using NumPy.
    """
    def __init__(self):
        self.vectors = []  # List to store embedding vectors
        self.texts = []  # List to store text content
        self.metadata = []  # List to store metadata
    
    def add_item(self, text, embedding, metadata=None):
        """
        Add an item to the vector store.
        
        Args:
            text (str): The text content
            embedding (List[float]): The embedding vector
            metadata (Dict, optional): Additional metadata
        """
        self.vectors.append(np.array(embedding))  # Append the embedding vector
        self.texts.append(text)  # Append the text content
        self.metadata.append(metadata or {})  # Append the metadata (or empty dict if None)
    
    def add_items(self, items, embeddings):
        """
        Add multiple items to the vector store.
        
        Args:
            items (List[Dict]): List of text items
            embeddings (List[List[float]]): List of embedding vectors
        """
        for i, (item, embedding) in enumerate(zip(items, embeddings)):
            self.add_item(
                text=item["text"],  # Extract text from item
                embedding=embedding,  # Use corresponding embedding
                metadata={**item.get("metadata", {}), "index": i}  # Merge item metadata with index
            )
    
    def similarity_search_with_scores(self, query_embedding, k=5):
        """
        Find the most similar items to a query embedding with similarity scores.
        
        Args:
            query_embedding (List[float]): Query embedding vector
            k (int): Number of results to return
            
        Returns:
            List[Tuple[Dict, float]]: Top k most similar items with scores
        """
        if not self.vectors:
            return []  # Return empty list if no vectors are stored
        
        # Convert query embedding to numpy array
        query_vector = np.array(query_embedding)
        
        # Calculate similarities using cosine similarity
        similarities = []
        for i, vector in enumerate(self.vectors):
            similarity = cosine_similarity([query_vector], [vector])[0][0]  # Compute cosine similarity
            similarities.append((i, similarity))  # Append index and similarity score
        
        # Sort by similarity (descending)
        similarities.sort(key=lambda x: x[1], reverse=True)
        
        # Return top k results with scores
        results = []
        for i in range(min(k, len(similarities))):
            idx, score = similarities[i]
            results.append({
                "text": self.texts[idx],  # Retrieve text by index
                "metadata": self.metadata[idx],  # Retrieve metadata by index
                "similarity": float(score)  # Add similarity score
            })
        
        return results
    
    def get_all_documents(self):
        """
        Get all documents in the store.
        
        Returns:
            List[Dict]: All documents
        """
        return [{"text": text, "metadata": meta} for text, meta in zip(self.texts, self.metadata)]  # Combine texts and metadata

In [7]:
def create_bm25_index(chunks):
    """
    Create a BM25 index from the given chunks.
    
    Args:
        chunks (List[Dict]): List of text chunks
        
    Returns:
        BM25Okapi: A BM25 index
    """
    # Extract text from each chunk
    texts = [chunk["text"] for chunk in chunks]
    
    # Tokenize each document by splitting on whitespace
    tokenized_docs = [text.split() for text in texts]
    
    # Create the BM25 index using the tokenized documents
    bm25 = BM25Okapi(tokenized_docs)
    
    # Print the number of documents in the BM25 index
    print(f"Created BM25 index with {len(texts)} documents")
    
    return bm25

In [8]:
def bm25_search(bm25, chunks, query, k=5):
    """
    Search the BM25 index with a query.
    
    Args:
        bm25 (BM25Okapi): BM25 index
        chunks (List[Dict]): List of text chunks
        query (str): Query string
        k (int): Number of results to return
        
    Returns:
        List[Dict]: Top k results with scores
    """
    # Tokenize the query by splitting it into individual words
    query_tokens = query.split()
    
    # Get BM25 scores for the query tokens against the indexed documents
    scores = bm25.get_scores(query_tokens)
    
    # Initialize an empty list to store results with their scores
    results = []
    
    # Iterate over the scores and corresponding chunks
    for i, score in enumerate(scores):
        # Create a copy of the metadata to avoid modifying the original
        metadata = chunks[i].get("metadata", {}).copy()
        # Add index to metadata
        metadata["index"] = i
        
        results.append({
            "text": chunks[i]["text"],
            "metadata": metadata,  # Add metadata with index
            "bm25_score": float(score)
        })
    
    # Sort the results by BM25 score in descending order
    results.sort(key=lambda x: x["bm25_score"], reverse=True)
    
    # Return the top k results
    return results[:k]

In [24]:
def fusion_retrieval(query, chunks, vector_store, bm25_index, k=5, alpha=0.5):
    """
    Perform fusion retrieval combining vector-based and BM25 search.
    
    Args:
        query (str): Query string
        chunks (List[Dict]): Original text chunks
        vector_store (SimpleVectorStore): Vector store
        bm25_index (BM25Okapi): BM25 index
        k (int): Number of results to return
        alpha (float): Weight for vector scores (0-1), where 1-alpha is BM25 weight
        
    Returns:
        List[Dict]: Top k results based on combined scores
    """
    print(f"Performing fusion retrieval for query: {query}")
    
    # Define small epsilon to avoid division by zero
    epsilon = 1e-8
    
    # Get vector search results
    query_embedding = create_embeddings(query)  # Create embedding for the query
    vector_results = vector_store.similarity_search_with_scores(query_embedding, k=len(chunks))  # Perform vector search
    
    # Get BM25 search results
    bm25_results = bm25_search(bm25_index, chunks, query, k=len(chunks))  # Perform BM25 search
    
    # Create dictionaries to map document index to score
    vector_scores_dict = {result["metadata"]["index"]: result["similarity"] for result in vector_results}
    bm25_scores_dict = {result["metadata"]["index"]: result["bm25_score"] for result in bm25_results}
    
    # Ensure all documents have scores for both methods
    all_docs = vector_store.get_all_documents()
    combined_results = []
    
    for i, doc in enumerate(all_docs):
        vector_score = vector_scores_dict.get(i, 0.0)  # Get vector score or 0 if not found
        bm25_score = bm25_scores_dict.get(i, 0.0)  # Get BM25 score or 0 if not found
        combined_results.append({
            "text": doc["text"],
            "metadata": doc["metadata"],
            "vector_score": vector_score,
            "bm25_score": bm25_score,
            "index": i
        })
    
    # Extract scores as arrays
    vector_scores = np.array([doc["vector_score"] for doc in combined_results])
    bm25_scores = np.array([doc["bm25_score"] for doc in combined_results])
    
    print(f"vector scores: {vector_scores}")
    print(f"Alpha: {alpha}")
    print(f"1 - Alpha: {1 - alpha}")
    print(f"bm25 scores: {bm25_scores}")

    # Normalize scores
    norm_vector_scores = (vector_scores - np.min(vector_scores)) / (np.max(vector_scores) - np.min(vector_scores) + epsilon)
    norm_bm25_scores = (bm25_scores - np.min(bm25_scores)) / (np.max(bm25_scores) - np.min(bm25_scores) + epsilon)
    
    # Compute combined scores
    combined_scores = alpha * norm_vector_scores + (1 - alpha) * norm_bm25_scores
    
    # Add combined scores to results
    for i, score in enumerate(combined_scores):
        combined_results[i]["combined_score"] = float(score)
    
    # Sort by combined score (descending)
    combined_results.sort(key=lambda x: x["combined_score"], reverse=True)
    
    # Return top k results
    top_results = combined_results[:k]
    
    print(f"Retrieved {len(top_results)} documents with fusion retrieval")
    return top_results

In [12]:
def process_document(pdf_path, chunk_size=1000, chunk_overlap=200):
    """
    Process a document for fusion retrieval.
    
    Args:
        pdf_path (str): Path to the PDF file
        chunk_size (int): Size of each chunk in characters
        chunk_overlap (int): Overlap between chunks in characters
        
    Returns:
        Tuple[List[Dict], SimpleVectorStore, BM25Okapi]: Chunks, vector store, and BM25 index
    """
    # Extract text from the PDF file
    text = extract_text_from_pdf(pdf_path)
    
    # Clean the extracted text to remove extra whitespace and special characters
    cleaned_text = clean_text(text)
    
    # Split the cleaned text into overlapping chunks
    chunks = chunk_text(cleaned_text, chunk_size, chunk_overlap)
    
    # Extract the text content from each chunk for embedding creation
    chunk_texts = [chunk["text"] for chunk in chunks]
    print("Creating embeddings for chunks...")
    
    # Create embeddings for the chunk texts
    embeddings = create_embeddings(chunk_texts)
    
    # Initialize the vector store
    vector_store = SimpleVectorStore()
    
    # Add the chunks and their embeddings to the vector store
    vector_store.add_items(chunks, embeddings)
    print(f"Added {len(chunks)} items to vector store")
    
    # Create a BM25 index from the chunks
    bm25_index = create_bm25_index(chunks)
    
    # Return the chunks, vector store, and BM25 index
    return chunks, vector_store, bm25_index

In [13]:
def generate_response(query, context):
    """
    Generate a response based on the query and context.
    
    Args:
        query (str): User query
        context (str): Context from retrieved documents
        
    Returns:
        str: Generated response
    """
    # Define the system prompt to guide the AI assistant
    system_prompt = """You are a helpful AI assistant. Answer the user's question based on the provided context. 
    If the context doesn't contain relevant information to answer the question fully, acknowledge this limitation."""

    # Format the user prompt with the context and query
    user_prompt = f"""Context:
    {context}

    Question: {query}

    Please answer the question based on the provided context."""

    # Generate the response using the OpenAI API
    response = client.chat.completions.create(
        model="gpt-3.5-turbo-1106",  # Specify the model to use
        messages=[
            {"role": "system", "content": system_prompt},  # System message to guide the assistant
            {"role": "user", "content": user_prompt}  # User message with context and query
        ],
        temperature=0.1  # Set the temperature for response generation
    )
    
    # Return the generated response
    return response.choices[0].message.content

In [14]:
def answer_with_fusion_rag(query, chunks, vector_store, bm25_index, k=5, alpha=0.5):
    """
    Answer a query using fusion RAG.
    
    Args:
        query (str): User query
        chunks (List[Dict]): Text chunks
        vector_store (SimpleVectorStore): Vector store
        bm25_index (BM25Okapi): BM25 index
        k (int): Number of documents to retrieve
        alpha (float): Weight for vector scores
        
    Returns:
        Dict: Query results including retrieved documents and response
    """
    # Retrieve documents using fusion retrieval method
    retrieved_docs = fusion_retrieval(query, chunks, vector_store, bm25_index, k=k, alpha=alpha)
    
    # Format the context from the retrieved documents by joining their text with separators
    context = "\n\n---\n\n".join([doc["text"] for doc in retrieved_docs])
    
    # Generate a response based on the query and the formatted context
    response = generate_response(query, context)
    
    # Return the query, retrieved documents, and the generated response
    return {
        "query": query,
        "retrieved_documents": retrieved_docs,
        "response": response
    }

In [27]:
chunks, vector_store, bm25_index = process_document("AI_Information.pdf")

Created 42 text chunks
Creating embeddings for chunks...
Added 42 items to vector store
Created BM25 index with 42 documents


In [29]:
answer_with_fusion_rag(
    query="What is artificial intelligence?",
    chunks=chunks,
    vector_store=vector_store,
    bm25_index=create_bm25_index(chunks)
)  # Example query to test the fusion RAG system

Created BM25 index with 42 documents
Performing fusion retrieval for query: What is artificial intelligence?
vector scores: [0.7433635  0.54601061 0.43958056 0.42608011 0.36637625 0.48235208
 0.53574812 0.56143618 0.40263274 0.40873906 0.41222674 0.37130743
 0.40511107 0.49916551 0.42931235 0.38744962 0.38743073 0.36604521
 0.44279024 0.48129061 0.46574518 0.42452952 0.43626469 0.4350239
 0.37664813 0.41399723 0.40133178 0.4530105  0.3458541  0.46268743
 0.37514341 0.43351191 0.37324965 0.43635452 0.42823768 0.39975524
 0.37688848 0.41114178 0.37470832 0.39046559 0.49881044 0.39450854]
Alpha: 0.5
1 - Alpha: 0.5
bm25 scores: [3.66658871 1.13984143 0.97593044 3.67773839 0.67955219 1.33035843
 1.17446639 1.17446639 1.14194556 1.15689489 0.96905085 1.14194556
 0.         0.         0.69559887 1.15689489 0.73007846 0.70032378
 0.9645181  1.13774503 0.99716803 0.         0.70511332 0.
 0.71737875 0.         0.68633778 0.         0.70511332 1.13984143
 0.97362642 1.14194556 1.01683717 0.     

{'query': 'What is artificial intelligence?',
 'retrieved_documents': [{'text': 'Understanding Artificial Intelligence Chapter 1: Introduction to Artificial Intelligence Artificial intelligence (AI) refers to the ability of a digital computer or computer-controlled robot to perform tasks commonly associated with intelligent beings. The term is frequently applied to the project of developing systems endowed with the intellectual processes characteristic of humans, such as the ability to reason, discover meaning, generalize, or learn from past experience. Over the past few decades, advancements in computing power and data availability have significantly accelerated the development and deployment of AI. Historical Context The idea of artificial intelligence has existed for centuries, often depicted in myths and fiction. However, the formal field of AI research began in the mid-20th century. The Dartmouth Workshop in 1956 is widely considered the birthplace of AI. Early AI research focused