In [1]:
import fitz
import os
import numpy as np
import json
from openai import OpenAI
import torch
from sentence_transformers import SentenceTransformer, SimilarityFunction
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from dotenv import load_dotenv
load_dotenv()

True

In [3]:
client = OpenAI(
    api_key=os.getenv("OPENAI_API_KEY")
)

In [4]:
# Load the pre-trained model
embedder = SentenceTransformer("all-MiniLM-L6-v2") # Fazer o embedding usando esse

In [5]:
pdf_path = "/home/patrick/rag_from_scratch/AI_Information.pdf"

Extracting the text from pdf file

In [6]:
def extract_text_from_pdf(pdf_path):
    mypdf = fitz.open(pdf_path)
    all_text = "" # Initizalizing an empty string to store the extarcted text

    # Iterating through each page in the pdf
    for page_num in range(mypdf.page_count):
        page = mypdf[page_num]
        text = page.get_text("text")
        all_text += text

    return all_text

In [7]:
def generate_chunk_header(chunk, model="gpt-3.5-turbo-1106"):
    """
    Generates a title/header for a given text chunk using an LLM.

    Args:
    chunk (str): The text chunk to summarize as a header.
    model (str): The model to be used for generating the header. Default is "meta-llama/Llama-3.2-3B-Instruct".

    Returns:
    str: Generated header/title.
    """
    # Define the system prompt to guide the AI's behavior
    system_prompt = "Generate a concise and informative title for the given text."
    
    # Generate a response from the AI model based on the system prompt and text chunk
    response = client.chat.completions.create(
        model=model,
        temperature=0,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": chunk}
        ]
    )

    # Return the generated header/title, stripping any leading/trailing whitespace
    return response.choices[0].message.content.strip()

In [8]:
def chunk_text_with_headers(text, n, overlap):
    """
    Chunks text into smaller segments and generates headers.

    Args:
    text (str): The full text to be chunked.
    n (int): The chunk size in characters.
    overlap (int): Overlapping characters between chunks.

    Returns:
    List[dict]: A list of dictionaries with 'header' and 'text' keys.
    """
    chunks = []  # Initialize an empty list to store chunks

    # Iterate through the text with the specified chunk size and overlap
    for i in range(0, len(text), n - overlap):
        chunk = text[i:i + n]  # Extract a chunk of text
        header = generate_chunk_header(chunk)  # Generate a header for the chunk using LLM
        chunks.append({"header": header, "text": chunk})  # Append the header and chunk to the list

    return chunks  # Return the list of chunks with headers

In [9]:
text = extract_text_from_pdf(pdf_path)

In [10]:
# Chunk the extracted text with headers
# We use a chunk size of 1000 characters and an overlap of 200 characters
text_chunks = chunk_text_with_headers(text, 1000, 200)

# Print a sample chunk with its generated header
print("Sample Chunk:")
print("Header:", text_chunks[0]['header'])
print("Content:", text_chunks[0]['text'])

Sample Chunk:
Header: "An Overview of Artificial Intelligence: History, Development, and Applications"
Content: Understanding Artificial Intelligence 
Chapter 1: Introduction to Artificial Intelligence 
Artificial intelligence (AI) refers to the ability of a digital computer or computer-controlled robot 
to perform tasks commonly associated with intelligent beings. The term is frequently applied to 
the project of developing systems endowed with the intellectual processes characteristic of 
humans, such as the ability to reason, discover meaning, generalize, or learn from past 
experience. Over the past few decades, advancements in computing power and data availability 
have significantly accelerated the development and deployment of AI. 
Historical Context 
The idea of artificial intelligence has existed for centuries, often depicted in myths and fiction. 
However, the formal field of AI research began in the mid-20th century. The Dartmouth Workshop 
in 1956 is widely considered the b

In [11]:
def create_embeddings(text):
    response = embedder.encode(text)
    return response

response = create_embeddings(text_chunks)

In [12]:
# Generate embeddings for each chunk
embeddings = []  # Initialize an empty list to store embeddings

# Iterate through each text chunk with a progress bar
for chunk in tqdm(text_chunks, desc="Generating embeddings"):
    # Create an embedding for the chunk's text
    text_embedding = create_embeddings(chunk["text"])
    # Create an embedding for the chunk's header
    header_embedding = create_embeddings(chunk["header"])
    # Append the chunk's header, text, and their embeddings to the list
    embeddings.append({"header": chunk["header"], "text": chunk["text"], "embedding": text_embedding, "header_embedding": header_embedding})

Generating embeddings: 100%|██████████| 42/42 [00:05<00:00,  7.81it/s]


In [13]:
def cosine_similarity(vec1, vec2):
    return np.dot(vec1, vec2) / (np.linalg.norm(vec1) * np.linalg.norm(vec2))

In [14]:
def semantic_search(query, chunks, k=5):
    query_embedding = create_embeddings(query)

    similarities = []  # Initialize a list to store similarity scores
    
    # Iterate through each chunk to calculate similarity scores
    for chunk in chunks:
        # Compute cosine similarity between query embedding and chunk text embedding
        sim_text = cosine_similarity(np.array(query_embedding), np.array(chunk["embedding"]))
        # Compute cosine similarity between query embedding and chunk header embedding
        sim_header = cosine_similarity(np.array(query_embedding), np.array(chunk["header_embedding"]))
        # Calculate the average similarity score
        avg_similarity = (sim_text + sim_header) / 2
        # Append the chunk and its average similarity score to the list
        similarities.append((chunk, avg_similarity))

    # Sort the chunks based on similarity scores in descending order
    similarities.sort(key=lambda x: x[1], reverse=True)
    # Return the top-k most relevant chunks
    return [x[0] for x in similarities[:k]]

In [23]:
chunks

NameError: name 'chunks' is not defined

In [26]:
with open('val.json') as f:
    data = json.load(f)
query = data[0]['question']

top_chunks = semantic_search(query, embeddings, k=2)

# Print the results
print("Query:", query)
for i, chunk in enumerate(top_chunks):
    print(f"Header {i+1}: {chunk['header']}")
    print(f"Content:\n{chunk['text']}\n")

Query: What is 'Explainable AI' and why is it considered important?
Header 1: "Advancements in AI Research: Environmental Monitoring, Deep Learning, and Explainable AI"
Content:
 incidents. 
Environmental Monitoring 
AI-powered environmental monitoring systems track air and water quality, detect pollution, and 
support environmental protection efforts. These systems provide real-time data, identify 
pollution sources, and inform environmental policies. 
Chapter 15: The Future of AI Research 
Advancements in Deep Learning 
Continued advancements in deep learning are expected to drive further breakthroughs in AI. 
Research is focused on developing more efficient and interpretable deep learning models, as well 
as exploring new architectures and training techniques. 
Explainable AI (XAI) 
Explainable AI (XAI) aims to make AI systems more transparent and understandable. Research in 
XAI focuses on developing methods for explaining AI decisions, enhancing trust, and improving 
accountabilit

In [16]:
system_prompt = "You are an AI assistant that strictly answers based on the given context. If the answer cannot be derived directly from the provided context, respond with: 'I do not have enough information to answer that.'"

def generate_response(system_prompt, user_message, model="gpt-4o-mini"):
    response = client.chat.completions.create(
        model=model,
        temperature=0,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_message}
        ]
    )
    return response

In [17]:
import json
import asyncio
import numpy as np
import pandas as pd

# Initialize the SentenceTransformer model and set the similarity function
model = SentenceTransformer("all-MiniLM-L6-v2")
model.similarity_fn_name = SimilarityFunction.DOT

In [1]:
async def process_validation_data(k):
    system_prompt = """You are an AI assistant that strictly answers based on the given context. If the answer cannot be derived directly and exactly from the provided context, respond with: 'I do not have enough information to answer that.'
    First think about the keywords from the question and then use them to elaborate the answer.
    The response needs to be just the answer sentence
    
    """
    # Load the validation data from the JSON file
    with open('val.json') as f:
        data = json.load(f)

    # List to store the results for each sample
    results = []

    # Iterate over each example in the validation data
    for idx, item in enumerate(data):
        query = item['question']
        ideal_answer = item['ideal_answer']
        
        # Retrieve the top k most relevant context chunks
        top_chunks = semantic_search(query, embeddings, k=k)
        
        # Create the user prompt by combining all context chunks and the query
        context_prompt = "\n".join([
            f"Context {i + 1}:\n{chunk}\n=====================================\n"
            for i, chunk in enumerate(top_chunks)
        ])
        user_prompt = f"{context_prompt}\nQuestion: {query}"
        
        # Generate the AI response using the system prompt and the user prompt
        ai_response = generate_response(system_prompt, user_prompt).choices[0].message.content
        
        # Evaluate similarity using SentenceTransformer
        # Encode the AI response and ideal answer
        embedding_response = model.encode([ai_response])
        embedding_ideal = model.encode([ideal_answer])
        # Compute similarity score (result is a 1x1 matrix; extract the single value)
        similarity_matrix = model.similarity(embedding_response, embedding_ideal)
        score = similarity_matrix[0][0].numpy()
        
        # Prepare the result dictionary with dynamic context columns
        result = {
            "Query": query,
            "Ideal Answer": ideal_answer,
            "AI Response": ai_response,
            "Score": score
        }
        # Add each context as its own column
        for i, chunk in enumerate(top_chunks):
            result[f"Context {i + 1}"] = chunk
        
        # Append the result to the list
        results.append(result)

    # Create a DataFrame from the results
    df = pd.DataFrame(results)
    return df

In [19]:
import nest_asyncio
nest_asyncio.apply()

In [28]:
# Run the asynchronous function and store the DataFrame in df
df = asyncio.run(process_validation_data(k=2))
df

Unnamed: 0,Query,Ideal Answer,AI Response,Score,Context 1,Context 2
0,What is 'Explainable AI' and why is it conside...,Explainable AI (XAI) aims to make AI systems m...,Explainable AI (XAI) aims to make AI systems m...,0.9669191,"{'header': '""Advancements in AI Research: Envi...","{'header': '""An Overview of Artificial Intelli..."
1,Can AI be used to predict earthquakes?,I don't have enough information to answer that.,I do not have enough information to answer that.,0.9543278,{'header': 'The Role of Government and Policy ...,{'header': 'Applications of AI in Environmenta...
2,What are some of the ethical concerns related ...,I don't have enough information to answer that.,I do not have enough information to answer that.,0.9543278,"{'header': '""AI Applications in Entertainment,...","{'header': '""Core Concepts of Artificial Intel..."
3,How does AI contribute to personalized medicine?,AI enables personalized medicine by analyzing ...,AI enables personalized medicine by analyzing ...,0.9669296,{'header': 'The Impact of Artificial Intellige...,{'header': 'The Impact of AI in Education and ...
4,Does the document mention any specific compani...,I don't have enough information to answer that.,I do not have enough information to answer that.,0.9543278,"{'header': '""An Overview of Artificial Intelli...","{'header': '""Challenges and Strategies in AI D..."
5,What is the role of AI in smart grids?,AI optimizes energy distribution in smart grid...,I do not have enough information to answer that.,-0.02291282,"{'header': '""AI Applications in Smart Cities: ...",{'header': 'The Role of Government and Policy ...
6,"Can AI write a complete, original novel?",I don't have enough information to answer that.,I do not have enough information to answer that.,0.9543278,{'header': 'The Impact of AI on Creative Proce...,"{'header': '""An Overview of Artificial Intelli..."
7,What is a 'cobot'?,It mentions collaborative settings (cobots) in...,A 'cobot' is a collaborative robot that works ...,0.74602586,"{'header': '""AI and Robotics: Integration, Typ...","{'header': '""Applications and Advancements in ..."
8,What is Direct Air Capture (DAC) used for?,DAC technology removes CO2 directly from the a...,I do not have enough information to answer that.,0.04811383,{'header': 'Applications of AI in Environmenta...,"{'header': '""Applications of AI in Healthcare ..."
9,Is AI currently being used to control nuclear ...,I don't have enough information to answer that.,I do not have enough information to answer that.,0.9543278,{'header': 'The Role of Government and Policy ...,"{'header': '""Ethical and Security Consideratio..."


In [29]:
for k in range(1, 9):
    print(asyncio.run(process_validation_data(k=k))['Score'].mean())

0.7480759620666504
0.7481794357299805
0.7481794357299805


KeyboardInterrupt: 