In [1]:
import fitz
import os
import numpy as np
import json
import re
from openai import OpenAI
import torch
from sentence_transformers import SentenceTransformer, SimilarityFunction
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the pre-trained model
embedder = SentenceTransformer("all-MiniLM-L6-v2") # Fazer o embedding usando esse

client = OpenAI(
    api_key=os.getenv("OPENAI_API_KEY")
)

In [3]:
def rewrite_query(original_query, model="gpt-3.5-turbo-1106"):
    """
    Rewrites a query to make it more specific and detailed for better retrieval.
    
    Args:
        original_query (str): The original user query
        model (str): The model to use for query rewriting
        
    Returns:
        str: The rewritten query
    """
    # Define the system prompt to guide the AI assistant's behavior
    system_prompt = "You are an AI assistant specialized in improving search queries. Your task is to rewrite user queries to be more specific, detailed, and likely to retrieve relevant information."
    
    # Define the user prompt with the original query to be rewritten
    user_prompt = f"""
    Rewrite the following query to make it more specific and detailed. Include relevant terms and concepts that might help in retrieving accurate information.
    
    Original query: {original_query}
    
    Rewritten query:
    """
    
    # Generate the rewritten query using the specified model
    response = client.chat.completions.create(
        model=model,
        temperature=0.0,  # Low temperature for deterministic output
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt}
        ]
    )
    
    # Return the rewritten query, stripping any leading/trailing whitespace
    return response.choices[0].message.content.strip()

In [4]:
def generate_step_back_query(original_query, model="gpt-3.5-turbo-1106"):
    """
    Generates a more general 'step-back' query to retrieve broader context.
    
    Args:
        original_query (str): The original user query
        model (str): The model to use for step-back query generation
        
    Returns:
        str: The step-back query
    """
    # Define the system prompt to guide the AI assistant's behavior
    system_prompt = "You are an AI assistant specialized in search strategies. Your task is to generate broader, more general versions of specific queries to retrieve relevant background information."
    
    # Define the user prompt with the original query to be generalized
    user_prompt = f"""
    Generate a broader, more general version of the following query that could help retrieve useful background information.
    
    Original query: {original_query}
    
    Step-back query:
    """
    
    # Generate the step-back query using the specified model
    response = client.chat.completions.create(
        model=model,
        temperature=0.1,  # Slightly higher temperature for some variation
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt}
        ]
    )
    
    # Return the step-back query, stripping any leading/trailing whitespace
    return response.choices[0].message.content.strip()

In [5]:
def decompose_query(original_query, num_subqueries=4, model="gpt-3.5-turbo-1106"):
    """
    Decomposes a complex query into simpler sub-queries.
    
    Args:
        original_query (str): The original complex query
        num_subqueries (int): Number of sub-queries to generate
        model (str): The model to use for query decomposition
        
    Returns:
        List[str]: A list of simpler sub-queries
    """
    # Define the system prompt to guide the AI assistant's behavior
    system_prompt = "You are an AI assistant specialized in breaking down complex questions. Your task is to decompose complex queries into simpler sub-questions that, when answered together, address the original query."
    
    # Define the user prompt with the original query to be decomposed
    user_prompt = f"""
    Break down the following complex query into {num_subqueries} simpler sub-queries. Each sub-query should focus on a different aspect of the original question.
    
    Original query: {original_query}
    
    Generate {num_subqueries} sub-queries, one per line, in this format:
    1. [First sub-query]
    2. [Second sub-query]
    And so on...
    """
    
    # Generate the sub-queries using the specified model
    response = client.chat.completions.create(
        model=model,
        temperature=0.2,  # Slightly higher temperature for some variation
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt}
        ]
    )
    
    # Process the response to extract sub-queries
    content = response.choices[0].message.content.strip()
    
    # Extract numbered queries using simple parsing
    lines = content.split("\n")
    sub_queries = []
    
    for line in lines:
        if line.strip() and any(line.strip().startswith(f"{i}.") for i in range(1, 10)):
            # Remove the number and leading space
            query = line.strip()
            query = query[query.find(".")+1:].strip()
            sub_queries.append(query)
    
    return sub_queries

In [6]:
# Example query
original_query = "What are the impacts of AI on job automation and employment?"

# Apply query transformations
print("Original Query:", original_query)

# Query Rewriting
rewritten_query = rewrite_query(original_query)
print("\n1. Rewritten Query:")
print(rewritten_query)

# Step-back Prompting
step_back_query = generate_step_back_query(original_query)
print("\n2. Step-back Query:")
print(step_back_query)

# Sub-query Decomposition
sub_queries = decompose_query(original_query, num_subqueries=4)
print("\n3. Sub-queries:")
for i, query in enumerate(sub_queries, 1):
    print(f"   {i}. {query}")

Original Query: What are the impacts of AI on job automation and employment?

1. Rewritten Query:
What are the specific industries or job roles that are most affected by AI-driven automation, and what are the potential implications for employment rates and job displacement?

2. Step-back Query:
How does technology impact the workforce and employment trends?

3. Sub-queries:
   1. What are the specific job roles or industries most affected by AI-driven automation?
   2. How does AI impact the creation of new job opportunities in various sectors?
   3. What are the potential societal and economic implications of widespread job automation due to AI?
   4. How do government policies and regulations influence the relationship between AI, job automation, and employment?


In [7]:
def extract_text_from_pdf(pdf_path):
    """
    Extracts text from a PDF file and prints the first `num_chars` characters.

    Args:
    pdf_path (str): Path to the PDF file.

    Returns:
    str: Extracted text from the PDF.
    """
    # Open the PDF file
    mypdf = fitz.open(pdf_path)
    all_text = ""  # Initialize an empty string to store the extracted text

    # Iterate through each page in the PDF
    for page_num in range(mypdf.page_count):
        page = mypdf[page_num]  # Get the page
        text = page.get_text("text")  # Extract text from the page
        all_text += text  # Append the extracted text to the all_text string

    return all_text  # Return the extracted text

In [8]:
def chunk_text(text, n, overlap):
    """
    Chunks the given text into segments of n characters with overlap.

    Args:
    text (str): The text to be chunked.
    n (int): The number of characters in each chunk.
    overlap (int): The number of overlapping characters between chunks.

    Returns:
    List[str]: A list of text chunks.
    """
    chunks = []

    for i in range(0, len(text), n-overlap):
        chunks.append(text[i:i+n])
    return chunks

In [9]:
def generate_questions(text_chunk, num_questions=5, model="gpt-3.5-turbo-1106"):
    """
    Generates relevant questions that can be answered from the given text chunk.

    Args:
    text_chunk (str): The text chunk to generate questions from.
    num_questions (int): Number of questions to generate.
    model (str): The model to use for question generation.

    Returns:
    List[str]: List of generated questions.
    """
    # Define the system prompt to guide the AI's behavior
    system_prompt = "You are an expert at generating relevant questions from text. Create concise questions that can be answered using only the provided text. Focus on key information and concepts."
    
    # Define the user prompt with the text chunk and the number of questions to generate
    user_prompt = f"""
    Based on the following text, generate {num_questions} different questions that can be answered using only this text:

    {text_chunk}
    
    Format your response as a numbered list of questions only, with no additional text.
    """
    
    # Generate questions using the OpenAI API
    response = client.chat.completions.create(
        model=model,
        temperature=0.7,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt}
        ]
    )
    
    # Extract and clean questions from the response
    questions_text = response.choices[0].message.content.strip()
    questions = []
    
    # Extract questions using regex pattern matching
    for line in questions_text.split('\n'):
        # Remove numbering and clean up whitespace
        cleaned_line = re.sub(r'^\d+\.\s*', '', line.strip())
        if cleaned_line and cleaned_line.endswith('?'):
            questions.append(cleaned_line)
    
    return questions

In [10]:
def create_embeddings(text):
    response = embedder.encode(text)
    return response

In [11]:
class SimpleVectorStore:
    def __init__(self):
        self.vectors = []
        self.texts = []
        self.metadata = []

    def add_item(self, text, embedding, metadata=None):
        self.vectors.append(embedding)
        self.texts.append(text)
        self.metadata.append(metadata or {})
    
    def similarity_search(self, query_embedding, k=5):
        if not self.vectors:
            return []
        
        # Convert query embedding to numpy array
        query_vector = np.array(query_embedding)
        
        # Calculate similarities using cosine similarity
        similarities = []
        for i, vector in enumerate(self.vectors):
            similarity = np.dot(query_vector, vector) / (np.linalg.norm(query_vector) * np.linalg.norm(vector))
            similarities.append((i, similarity))
        
        # Sort by similarity (descending)
        similarities.sort(key=lambda x: x[1], reverse=True)
        
        # Return top k results
        results = []
        for i in range(min(k, len(similarities))):
            idx, score = similarities[i]
            results.append({
                "text": self.texts[idx],
                "metadata": self.metadata[idx],
                "similarity": score
            })
        
        return results

In [14]:
def process_document(pdf_path, chunk_size=1000, chunk_overlap=200):
    """
    Process a document for RAG.

    Args:
    pdf_path (str): Path to the PDF file.
    chunk_size (int): Size of each chunk in characters.
    chunk_overlap (int): Overlap between chunks in characters.

    Returns:
    SimpleVectorStore: A vector store containing document chunks and their embeddings.
    """
    print("Extracting text from PDF...")
    extracted_text = extract_text_from_pdf(pdf_path)
    
    print("Chunking text...")
    chunks = chunk_text(extracted_text, chunk_size, chunk_overlap)
    print(f"Created {len(chunks)} text chunks")
    
    print("Creating embeddings for chunks...")
    # Create embeddings for all chunks at once for efficiency
    chunk_embeddings = create_embeddings(chunks)
    
    # Create vector store
    store = SimpleVectorStore()
    
    # Add chunks to vector store
    for i, (chunk, embedding) in enumerate(zip(chunks, chunk_embeddings)):
        store.add_item(
            text=chunk,
            embedding=embedding,
            metadata={"index": i, "source": pdf_path}
        )
    
    print(f"Added {len(chunks)} chunks to the vector store")
    return store

In [31]:
def transformed_search(query, vector_store, transformation_type, k):
    """
    Search using a transformed query.
    
    Args:
        query (str): Original query
        vector_store (SimpleVectorStore): Vector store to search
        transformation_type (str): Type of transformation ('rewrite', 'step_back', or 'decompose')
        top_k (int): Number of results to return
        
    Returns:
        List[Dict]: Search results
    """
    
    results = []
    
    if transformation_type == "rewrite":
        # Query rewriting
        transformed_query = rewrite_query(query)
        
        # Create embedding for transformed query
        query_embedding = create_embeddings(transformed_query)
        
        # Search with rewritten query
        results = vector_store.similarity_search(query_embedding, k=k)
        
    elif transformation_type == "step_back":
        # Step-back prompting
        transformed_query = generate_step_back_query(query)
        
        # Create embedding for transformed query
        query_embedding = create_embeddings(transformed_query)
        
        # Search with step-back query
        results = vector_store.similarity_search(query_embedding, k=k)
        
    elif transformation_type == "decompose":
        # Sub-query decomposition
        sub_queries = decompose_query(query)
        
        # Create embeddings for all sub-queries
        sub_query_embeddings = create_embeddings(sub_queries)
        
        # Search with each sub-query and combine results
        all_results = []
        for i, embedding in enumerate(sub_query_embeddings):
            sub_results = vector_store.similarity_search(embedding, k=k)  # Get fewer results per sub-query
            all_results.extend(sub_results)
        
        # Remove duplicates (keep highest similarity score)
        seen_texts = {}
        for result in all_results:
            text = result["text"]
            if text not in seen_texts or result["similarity"] > seen_texts[text]["similarity"]:
                seen_texts[text] = result
        
        # Sort by similarity and take top_k
        results = sorted(seen_texts.values(), key=lambda x: x["similarity"], reverse=True)[:k]
        
    else:
        # Regular search without transformation
        query_embedding = create_embeddings(query)
        results = vector_store.similarity_search(query_embedding, k=k)
    
    return results

In [18]:
def prepare_context(search_results):
    """
    Prepares a unified context from search results for response generation.

    Args:
    search_results (List[Dict]): Results from semantic search.

    Returns:
    str: Combined context string.
    """
    # Extract unique chunks referenced in the results
    chunk_indices = set()
    context_chunks = []
    
    # First add direct chunk matches
    for result in search_results:
        if result["metadata"]["type"] == "chunk":
            chunk_indices.add(result["metadata"]["index"])
            context_chunks.append(f"Chunk {result['metadata']['index']}:\n{result['text']}")
    
    # Then add chunks referenced by questions
    for result in search_results:
        if result["metadata"]["type"] == "question":
            chunk_idx = result["metadata"]["chunk_index"]
            if chunk_idx not in chunk_indices:
                chunk_indices.add(chunk_idx)
                context_chunks.append(f"Chunk {chunk_idx} (referenced by question '{result['text']}'):\n{result['metadata']['original_chunk']}")
    
    # Combine all context chunks
    full_context = "\n\n".join(context_chunks)
    return full_context

In [19]:
def generate_response(query, context, model="gpt-3.5-turbo-1106"):
    """
    Generates a response based on the query and context.

    Args:
    query (str): User's question.
    context (str): Context information retrieved from the vector store.
    model (str): Model to use for response generation.

    Returns:
    str: Generated response.
    """
    system_prompt = "You are an AI assistant that strictly answers based on the given context. If the answer cannot be derived directly from the provided context, respond with: 'I do not have enough information to answer that.'"
    
    user_prompt = f"""
        Context:
        {context}

        Question: {query}

        Please answer the question based only on the context provided above. Be concise and accurate.
    """
    
    response = client.chat.completions.create(
        model=model,
        temperature=0,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt}
        ]
    )
    
    return response.choices[0].message.content

In [20]:
def rag_with_query_transformation(pdf_path, query, transformation_type=None):
    """
    Run complete RAG pipeline with optional query transformation.
    
    Args:
        pdf_path (str): Path to PDF document
        query (str): User query
        transformation_type (str): Type of transformation (None, 'rewrite', 'step_back', or 'decompose')
        
    Returns:
        Dict: Results including query, transformed query, context, and response
    """
    # Process the document to create a vector store
    vector_store = process_document(pdf_path)
    
    # Apply query transformation and search
    if transformation_type:
        # Perform search with transformed query
        results = transformed_search(query, vector_store, transformation_type)
    else:
        # Perform regular search without transformation
        query_embedding = create_embeddings(query)
        results = vector_store.similarity_search(query_embedding, k=3)
    
    # Combine context from search results
    context = "\n\n".join([f"PASSAGE {i+1}:\n{result['text']}" for i, result in enumerate(results)])
    
    # Generate response based on the query and combined context
    response = generate_response(query, context)
    
    # Return the results including original query, transformation type, context, and response
    return {
        "original_query": query,
        "transformation_type": transformation_type,
        "context": context,
        "response": response
    }

In [21]:
import json
import asyncio
import numpy as np
import pandas as pd

# Initialize the SentenceTransformer model and set the similarity function
model = SentenceTransformer("all-MiniLM-L6-v2")
model.similarity_fn_name = SimilarityFunction.DOT

In [22]:
extracted_text = extract_text_from_pdf("AI_Information.pdf")
text_chunks = chunk_text(extracted_text, 1000, 200)

In [23]:
text_embedding = create_embeddings(text_chunks)

In [24]:
vector_store = SimpleVectorStore()

In [33]:
async def process_validation_data(k, transformation_type="rewrite"):
    system_prompt = """You are an AI assistant that strictly answers based on the given context. If the answer cannot be derived directly and exactly from the provided context, respond with: 'I do not have enough information to answer that.'
    First think about the keywords from the question and then use them to elaborate the answer.
    The response needs to be just the answer sentence
    
    """
    # Load the validation data from the JSON file
    with open('val.json') as f:
        data = json.load(f)

    # List to store the results for each sample
    results = []

    # Iterate over each example in the validation data
    for idx, item in enumerate(data):
        query = item['question']
        ideal_answer = item['ideal_answer']
        
        # Retrieve the top k most relevant context chunks
        top_chunks = transformed_search(query, vector_store, transformation_type, k=k)
        
        # Create the user prompt by combining all context chunks and the query
        context_prompt = "\n".join([
            f"Context {i + 1}:\n{chunk}\n=====================================\n"
            for i, chunk in enumerate(top_chunks)
        ])
        user_prompt = f"{context_prompt}\nQuestion: {query}"
        
        # Generate the AI response using the system prompt and the user prompt
        ai_response = generate_response(system_prompt, user_prompt)
        
        # Evaluate similarity using SentenceTransformer
        # Encode the AI response and ideal answer
        embedding_response = model.encode([ai_response])
        embedding_ideal = model.encode([ideal_answer])
        # Compute similarity score (result is a 1x1 matrix; extract the single value)
        similarity_matrix = model.similarity(embedding_response, embedding_ideal)
        score = similarity_matrix[0][0].numpy()
        
        # Prepare the result dictionary with dynamic context columns
        result = {
            "Query": query,
            "Ideal Answer": ideal_answer,
            "AI Response": ai_response,
            "Score": score
        }
        # Add each context as its own column
        for i, chunk in enumerate(top_chunks):
            result[f"Context {i + 1}"] = chunk
        
        # Append the result to the list
        results.append(result)

    # Create a DataFrame from the results
    df = pd.DataFrame(results)
    return df

In [29]:
import nest_asyncio
nest_asyncio.apply()

In [32]:
for k in range(1, 9):
    print(asyncio.run(process_validation_data(k=k))['Score'].mean())

0.41870908737182616
0.41870908737182616
0.49298410415649413
0.49298410415649413
0.41870908737182616
0.49298410415649413
0.49298410415649413
0.49298410415649413


In [34]:
for k in range(1, 9):
    print(asyncio.run(process_validation_data(k=k, transformation_type='step_back'))['Score'].mean())

0.49298410415649413
0.49298410415649413
0.49298410415649413
0.49298410415649413
0.49298410415649413
0.49298410415649413
0.49298410415649413
0.49298410415649413


In [35]:
for k in range(1, 9):
    print(asyncio.run(process_validation_data(k=k, transformation_type='decompose'))['Score'].mean())

0.49298410415649413
0.49298410415649413
0.49298410415649413
0.49298410415649413
0.41870908737182616
0.49298410415649413
0.49298410415649413
0.49298410415649413
