In [1]:
import fitz
import os
import numpy as np
import json
import re
from openai import OpenAI
import torch
from sentence_transformers import SentenceTransformer, SimilarityFunction
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the pre-trained model
embedder = SentenceTransformer("all-MiniLM-L6-v2") # Fazer o embedding usando esse

client = OpenAI(
    api_key=os.getenv("OPENAI_API_KEY")
)

In [3]:
def extract_text_from_pdf(pdf_path):
    """
    Extracts text from a PDF file and prints the first `num_chars` characters.

    Args:
    pdf_path (str): Path to the PDF file.

    Returns:
    str: Extracted text from the PDF.
    """
    # Open the PDF file
    mypdf = fitz.open(pdf_path)
    all_text = ""  # Initialize an empty string to store the extracted text

    # Iterate through each page in the PDF
    for page_num in range(mypdf.page_count):
        page = mypdf[page_num]  # Get the page
        text = page.get_text("text")  # Extract text from the page
        all_text += text  # Append the extracted text to the all_text string

    return all_text  # Return the extracted text

In [4]:
def chunk_text(text, chunk_size=800, overlap=100):
    """
    Split text into overlapping chunks.
    
    Args:
        text (str): Input text to chunk
        chunk_size (int): Size of each chunk in characters
        overlap (int): Overlap between chunks in characters
        
    Returns:
        List[Dict]: List of chunk dictionaries with text and metadata
    """
    chunks = []  # Initialize an empty list to store the chunks
    
    # Iterate over the text with the specified chunk size and overlap
    for i in range(0, len(text), chunk_size - overlap):
        chunk = text[i:i + chunk_size]  # Extract a chunk of the specified size
        if chunk:  # Ensure we don't add empty chunks
            chunks.append({
                "text": chunk,  # The chunk text
                "chunk_id": len(chunks) + 1,  # Unique ID for the chunk
                "start_char": i,  # Starting character index of the chunk
                "end_char": i + len(chunk)  # Ending character index of the chunk
            })
    
    print(f"Created {len(chunks)} text chunks")  # Print the number of created chunks
    return chunks  # Return the list of chunks

In [16]:
def create_embeddings(text):
    """
    Create embeddings for the given text.
    
    Args:
        texts (str or List[str]): Input text(s)
        model (str): Embedding model name
        
    Returns:
        List[List[float]]: Embedding vector(s)
    """
    # Create embeddings for the current batch
    response = embedder.encode(text)
    return response  # Return the embeddings
        

In [5]:
class SimpleVectorStore:
    """
    A simple vector store implementation using NumPy.
    """
    def __init__(self):
        # Initialize lists to store vectors, texts, and metadata
        self.vectors = []
        self.texts = []
        self.metadata = []
    
    def add_item(self, text, embedding, metadata=None):
        """
        Add an item to the vector store.
        
        Args:
            text (str): The text content
            embedding (List[float]): The embedding vector
            metadata (Dict, optional): Additional metadata
        """
        # Append the embedding, text, and metadata to their respective lists
        self.vectors.append(np.array(embedding))
        self.texts.append(text)
        self.metadata.append(metadata or {})
    
    def add_items(self, texts, embeddings, metadata_list=None):
        """
        Add multiple items to the vector store.
        
        Args:
            texts (List[str]): List of text contents
            embeddings (List[List[float]]): List of embedding vectors
            metadata_list (List[Dict], optional): List of metadata dictionaries
        """
        # If no metadata list is provided, create an empty dictionary for each text
        if metadata_list is None:
            metadata_list = [{} for _ in range(len(texts))]
        
        # Add each text, embedding, and metadata to the store
        for text, embedding, metadata in zip(texts, embeddings, metadata_list):
            self.add_item(text, embedding, metadata)
    
    def similarity_search(self, query_embedding, k=5):
        """
        Find the most similar items to a query embedding.
        
        Args:
            query_embedding (List[float]): Query embedding vector
            k (int): Number of results to return
            
        Returns:
            List[Dict]: Top k most similar items
        """
        # Return an empty list if there are no vectors in the store
        if not self.vectors:
            return []
        
        # Convert query embedding to a numpy array
        query_vector = np.array(query_embedding)
        
        # Calculate similarities using cosine similarity
        similarities = []
        for i, vector in enumerate(self.vectors):
            similarity = np.dot(query_vector, vector) / (np.linalg.norm(query_vector) * np.linalg.norm(vector))
            similarities.append((i, similarity))
        
        # Sort by similarity in descending order
        similarities.sort(key=lambda x: x[1], reverse=True)
        
        # Collect the top k results
        results = []
        for i in range(min(k, len(similarities))):
            idx, score = similarities[i]
            results.append({
                "text": self.texts[idx],
                "metadata": self.metadata[idx],
                "similarity": float(score)  # Convert to float for JSON serialization
            })
        
        return results

In [6]:
def generate_propositions(chunk):
    """
    Generate atomic, self-contained propositions from a text chunk.
    
    Args:
        chunk (Dict): Text chunk with content and metadata
        
    Returns:
        List[str]: List of generated propositions
    """
    # System prompt to instruct the AI on how to generate propositions
    system_prompt = """Please break down the following text into simple, self-contained propositions. 
    Ensure that each proposition meets the following criteria:

    1. Express a Single Fact: Each proposition should state one specific fact or claim.
    2. Be Understandable Without Context: The proposition should be self-contained, meaning it can be understood without needing additional context.
    3. Use Full Names, Not Pronouns: Avoid pronouns or ambiguous references; use full entity names.
    4. Include Relevant Dates/Qualifiers: If applicable, include necessary dates, times, and qualifiers to make the fact precise.
    5. Contain One Subject-Predicate Relationship: Focus on a single subject and its corresponding action or attribute, without conjunctions or multiple clauses.

    Output ONLY the list of propositions without any additional text or explanations."""

    # User prompt containing the text chunk to be converted into propositions
    user_prompt = f"Text to convert into propositions:\n\n{chunk['text']}"
    
    # Generate response from the model
    response = client.chat.completions.create(
        model="gpt-3.5-turbo-1106",  # Using a stronger model for accurate proposition generation
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt}
        ],
        temperature=0
    )
    
    # Extract propositions from the response
    raw_propositions = response.choices[0].message.content.strip().split('\n')
    
    # Clean up propositions (remove numbering, bullets, etc.)
    clean_propositions = []
    for prop in raw_propositions:
        # Remove numbering (1., 2., etc.) and bullet points
        cleaned = re.sub(r'^\s*(\d+\.|\-|\*)\s*', '', prop).strip()
        if cleaned and len(cleaned) > 10:  # Simple filter for empty or very short propositions
            clean_propositions.append(cleaned)
    
    return clean_propositions

In [7]:
def evaluate_proposition(proposition, original_text):
    """
    Evaluate a proposition's quality based on accuracy, clarity, completeness, and conciseness.
    
    Args:
        proposition (str): The proposition to evaluate
        original_text (str): The original text for comparison
        
    Returns:
        Dict: Scores for each evaluation dimension
    """
    # System prompt to instruct the AI on how to evaluate the proposition
    system_prompt = """You are an expert at evaluating the quality of propositions extracted from text.
    Rate the given proposition on the following criteria (scale 1-10):

    - Accuracy: How well the proposition reflects information in the original text
    - Clarity: How easy it is to understand the proposition without additional context
    - Completeness: Whether the proposition includes necessary details (dates, qualifiers, etc.)
    - Conciseness: Whether the proposition is concise without losing important information

    The response must be in valid JSON format with numerical scores for each criterion:
    {"accuracy": X, "clarity": X, "completeness": X, "conciseness": X}
    """

    # User prompt containing the proposition and the original text
    user_prompt = f"""Proposition: {proposition}

    Original Text: {original_text}

    Please provide your evaluation scores in JSON format."""

    # Generate response from the model
    response = client.chat.completions.create(
        model="gpt-3.5-turbo-1106",  # Using a stronger model for accurate evaluation
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt}
        ],
        response_format={"type": "json_object"},
        temperature=0
    )
    
    # Parse the JSON response
    try:
        scores = json.loads(response.choices[0].message.content.strip())
        return scores
    except json.JSONDecodeError:
        # Fallback if JSON parsing fails
        return {
            "accuracy": 5,
            "clarity": 5,
            "completeness": 5,
            "conciseness": 5
        }


In [8]:
def process_document_into_propositions(pdf_path, chunk_size=800, chunk_overlap=100, 
                                      quality_thresholds=None):
    """
    Process a document into quality-checked propositions.
    
    Args:
        pdf_path (str): Path to the PDF file
        chunk_size (int): Size of each chunk in characters
        chunk_overlap (int): Overlap between chunks in characters
        quality_thresholds (Dict): Threshold scores for proposition quality
        
    Returns:
        Tuple[List[Dict], List[Dict]]: Original chunks and proposition chunks
    """
    # Set default quality thresholds if not provided
    if quality_thresholds is None:
        quality_thresholds = {
            "accuracy": 7,
            "clarity": 7,
            "completeness": 7,
            "conciseness": 7
        }
    
    # Extract text from the PDF file
    text = extract_text_from_pdf(pdf_path)
    
    # Create chunks from the extracted text
    chunks = chunk_text(text, chunk_size, chunk_overlap)
    
    # Initialize a list to store all propositions
    all_propositions = []
    
    print("Generating propositions from chunks...")
    for i, chunk in enumerate(chunks):
        print(f"Processing chunk {i+1}/{len(chunks)}...")
        
        # Generate propositions for the current chunk
        chunk_propositions = generate_propositions(chunk)
        print(f"Generated {len(chunk_propositions)} propositions")
        
        # Process each generated proposition
        for prop in chunk_propositions:
            proposition_data = {
                "text": prop,
                "source_chunk_id": chunk["chunk_id"],
                "source_text": chunk["text"]
            }
            all_propositions.append(proposition_data)
    
    # Evaluate the quality of the generated propositions
    print("\nEvaluating proposition quality...")
    quality_propositions = []
    
    for i, prop in enumerate(all_propositions):
        if i % 10 == 0:  # Status update every 10 propositions
            print(f"Evaluating proposition {i+1}/{len(all_propositions)}...")
            
        # Evaluate the quality of the current proposition
        scores = evaluate_proposition(prop["text"], prop["source_text"])
        prop["quality_scores"] = scores
        
        # Check if the proposition passes the quality thresholds
        passes_quality = True
        for metric, threshold in quality_thresholds.items():
            if scores.get(metric, 0) < threshold:
                passes_quality = False
                break
        
        if passes_quality:
            quality_propositions.append(prop)
        else:
            print(f"Proposition failed quality check: {prop['text'][:50]}...")
    
    print(f"\nRetained {len(quality_propositions)}/{len(all_propositions)} propositions after quality filtering")
    
    return chunks, quality_propositions

In [9]:
def build_vector_stores(chunks, propositions):
    """
    Build vector stores for both chunk-based and proposition-based approaches.
    
    Args:
        chunks (List[Dict]): Original document chunks
        propositions (List[Dict]): Quality-filtered propositions
        
    Returns:
        Tuple[SimpleVectorStore, SimpleVectorStore]: Chunk and proposition vector stores
    """
    # Create vector store for chunks
    chunk_store = SimpleVectorStore()
    
    # Extract chunk texts and create embeddings
    chunk_texts = [chunk["text"] for chunk in chunks]
    print(f"Creating embeddings for {len(chunk_texts)} chunks...")
    chunk_embeddings = create_embeddings(chunk_texts)
    
    # Add chunks to vector store with metadata
    chunk_metadata = [{"chunk_id": chunk["chunk_id"], "type": "chunk"} for chunk in chunks]
    chunk_store.add_items(chunk_texts, chunk_embeddings, chunk_metadata)
    
    # Create vector store for propositions
    prop_store = SimpleVectorStore()
    
    # Extract proposition texts and create embeddings
    prop_texts = [prop["text"] for prop in propositions]
    print(f"Creating embeddings for {len(prop_texts)} propositions...")
    prop_embeddings = create_embeddings(prop_texts)
    
    # Add propositions to vector store with metadata
    prop_metadata = [
        {
            "type": "proposition", 
            "source_chunk_id": prop["source_chunk_id"],
            "quality_scores": prop["quality_scores"]
        } 
        for prop in propositions
    ]
    prop_store.add_items(prop_texts, prop_embeddings, prop_metadata)
    
    return chunk_store, prop_store

In [10]:
def retrieve_from_store(query, vector_store, k=5):
    """
    Retrieve relevant items from a vector store based on query.
    
    Args:
        query (str): User query
        vector_store (SimpleVectorStore): Vector store to search
        k (int): Number of results to retrieve
        
    Returns:
        List[Dict]: Retrieved items with scores and metadata
    """
    # Create query embedding
    query_embedding = create_embeddings(query)
    
    # Search vector store for the top k most similar items
    results = vector_store.similarity_search(query_embedding, k=k)
    
    return results

In [11]:
def compare_retrieval_approaches(query, chunk_store, prop_store, k=5):
    """
    Compare chunk-based and proposition-based retrieval for a query.
    
    Args:
        query (str): User query
        chunk_store (SimpleVectorStore): Chunk-based vector store
        prop_store (SimpleVectorStore): Proposition-based vector store
        k (int): Number of results to retrieve from each store
        
    Returns:
        Dict: Comparison results
    """
    print(f"\n=== Query: {query} ===")
    
    # Retrieve results from the proposition-based vector store
    print("\nRetrieving with proposition-based approach...")
    prop_results = retrieve_from_store(query, prop_store, k)
    
    # Retrieve results from the chunk-based vector store
    print("Retrieving with chunk-based approach...")
    chunk_results = retrieve_from_store(query, chunk_store, k)
    
    # Display proposition-based results
    print("\n=== Proposition-Based Results ===")
    for i, result in enumerate(prop_results):
        print(f"{i+1}) {result['text']} (Score: {result['similarity']:.4f})")
    
    # Display chunk-based results
    print("\n=== Chunk-Based Results ===")
    for i, result in enumerate(chunk_results):
        # Truncate text to keep the output manageable
        truncated_text = result['text'][:150] + "..." if len(result['text']) > 150 else result['text']
        print(f"{i+1}) {truncated_text} (Score: {result['similarity']:.4f})")
    
    # Return the comparison results
    return {
        "query": query,
        "proposition_results": prop_results,
        "chunk_results": chunk_results
    }

In [18]:
def generate_response(query, results, result_type="proposition"):
    """
    Generate a response based on retrieved results.
    
    Args:
        query (str): User query
        results (List[Dict]): Retrieved items
        result_type (str): Type of results ('proposition' or 'chunk')
        
    Returns:
        str: Generated response
    """
    # Combine retrieved texts into a single context string
    context = "\n\n".join([result["text"] for result in results])
    
    # System prompt to instruct the AI on how to generate the response
    system_prompt = f"""You are an AI assistant answering questions based on retrieved information.
Your answer should be based on the following {result_type}s that were retrieved from a knowledge base.
If the retrieved information doesn't answer the question, acknowledge this limitation."""

    # User prompt containing the query and the retrieved context
    user_prompt = f"""Query: {query}

Retrieved {result_type}s:
{context}

Please answer the query based on the retrieved information."""

    # Generate the response using the OpenAI client
    response = client.chat.completions.create(
        model="gpt-3.5-turbo-1106",
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt}
        ],
        temperature=0.2
    )
    
    # Return the generated response text
    return response.choices[0].message.content


In [19]:
# Define the path to the PDF file
pdf_path = "AI_Information.pdf"
query = "what are the concerns about AI?"

# Process document into propositions and chunks
chunks, propositions = process_document_into_propositions("")

# Build vector stores for chunks and propositions
chunk_store, prop_store = build_vector_stores(chunks, propositions)

# Initialize a list to store results for each query
results = []

# Run tests for each query
print(f"Query: {query}")

# Get retrieval results from both chunk-based and proposition-based approaches
retrieval_results = compare_retrieval_approaches(query, chunk_store, prop_store)

# Generate responses based on the retrieved proposition-based results
print("\nGenerating response from proposition-based results...")
prop_response = generate_response(
    query, 
    retrieval_results["proposition_results"], 
    "proposition"
)

# Generate responses based on the retrieved chunk-based results
print("Generating response from chunk-based results...")
chunk_response = generate_response(
    query, 
    retrieval_results["chunk_results"], 
    "chunk"
)

Created 0 text chunks
Generating propositions from chunks...

Evaluating proposition quality...

Retained 0/0 propositions after quality filtering
Creating embeddings for 0 chunks...
Creating embeddings for 0 propositions...
Query: what are the concerns about AI?

=== Query: what are the concerns about AI? ===

Retrieving with proposition-based approach...
Retrieving with chunk-based approach...

=== Proposition-Based Results ===

=== Chunk-Based Results ===

Generating response from proposition-based results...
Generating response from chunk-based results...


In [20]:
prop_response

'The concerns about AI include potential job displacement due to automation, ethical considerations regarding AI decision-making, privacy and security risks related to data collection and usage, and the potential for AI to perpetuate existing biases and inequalities. Additionally, there are concerns about the potential for AI to be used for malicious purposes, such as cyber attacks or misinformation campaigns.'

In [21]:
chunk_response

'The concerns about AI include potential job displacement due to automation, ethical considerations regarding AI decision-making, privacy and security risks related to data collection and usage, and the potential for bias in AI algorithms. Additionally, there are concerns about the potential misuse of AI for malicious purposes and the need for regulations to ensure responsible development and deployment of AI technologies.'