In [39]:
import fitz
import os
import numpy as np
import json
import torch
from sentence_transformers import SentenceTransformer
from openai import OpenAI

In [40]:
from dotenv import load_dotenv
load_dotenv()

True

In [41]:
# Load the pre-trained model
embedder = SentenceTransformer("all-MiniLM-L6-v2") # Fazer o embedding usando esse

KeyboardInterrupt: 

In [4]:
client = OpenAI(
    api_key=os.getenv("OPENAI_API_KEY")
)

In [5]:
pdf_path = "/home/patrick/rag_from_scratch/AI_Information.pdf"

In [6]:
def extract_text_from_pdf(pdf_path):
    mypdf = fitz.open(pdf_path)
    all_text = "" # Initizalizing an empty string to store the extarcted text

    # Iterating through each page in the pdf
    for page_num in range(mypdf.page_count):
        page = mypdf[page_num]
        all_text += page.get_text("text") + " "

    return all_text.strip()

In [7]:
extracted_text = extract_text_from_pdf(pdf_path)

In [None]:
def create_embeddings(text):
    response = embedder.encode(text)
    return response

# Splitting text into sentences (basic split)
sentences = extracted_text.split(". ")

# Generate embeddings for each sentence
embeddings = [create_embeddings(sentence) for sentence in sentences]

print(f"Generated {len(embeddings)} sentence embeddings.")

Generated 257 sentence embeddings.


In [9]:
def cosine_similarity(vec1, vec2):
    return np.dot(vec1, vec2) / (np.linalg.norm(vec1) * np.linalg.norm(vec2))

In [10]:
similarities = [cosine_similarity(embeddings[i], embeddings[i + 1]) for i in range(len(embeddings) - 1)]

In [14]:
def compute_breakpoints(similarities, method="percentile", threshold=90):
    """
    Computes chunking breakpoints based on similarity drops.

    Args:
    similarities (List[float]): List of similarity scores between sentences.
    method (str): 'percentile', 'standard_deviation', or 'interquartile'.
    threshold (float): Threshold value (percentile for 'percentile', std devs for 'standard_deviation').

    Returns:
    List[int]: Indices where chunk splits should occur.
    """
    # Determine the threshold value based on the selected method
    if method == "percentile":
        # Calculate the Xth percentile of the similarity scores
        threshold_value = np.percentile(similarities, threshold)
    elif method == "standard_deviation":
        # Calculate the mean and standard deviation of the similarity scores
        mean = np.mean(similarities)
        std_dev = np.std(similarities)
        # Set the threshold value to mean minus X standard deviations
        threshold_value = mean - (threshold * std_dev)
    elif method == "interquartile":
        # Calculate the first and third quartiles (Q1 and Q3)
        q1, q3 = np.percentile(similarities, [25, 75])
        # Set the threshold value using the IQR rule for outliers
        threshold_value = q1 - 1.5 * (q3 - q1)
    else:
        # Raise an error if an invalid method is provided
        raise ValueError("Invalid method. Choose 'percentile', 'standard_deviation', or 'interquartile'.")

    # Identify indices where similarity drops below the threshold value
    return [i for i, sim in enumerate(similarities) if sim < threshold_value]

# Compute breakpoints using the percentile method with a threshold of 90
breakpoints = compute_breakpoints(similarities, method="percentile", threshold=90)

In [16]:
def split_into_chunks(sentences, breakpoints):
    """
    Splits sentences into semantic chunks.

    Args:
    sentences (List[str]): List of sentences.
    breakpoints (List[int]): Indices where chunking should occur.

    Returns:
    List[str]: List of text chunks.
    """
    chunks = []  # Initialize an empty list to store the chunks
    start = 0  # Initialize the start index

    # Iterate through each breakpoint to create chunks
    for bp in breakpoints:
        # Append the chunk of sentences from start to the current breakpoint
        chunks.append(". ".join(sentences[start:bp + 1]) + ".")
        start = bp + 1  # Update the start index to the next sentence after the breakpoint
    # Append the remaining sentences as the last chunk
    chunks.append(". ".join(sentences[start:]))
    return chunks  # Return the list of chunks

# Create chunks using the split_into_chunks function
text_chunks = split_into_chunks(sentences, breakpoints)

# Print the number of chunks created
print(f"Number of semantic chunks: {len(text_chunks)}")

# Print the first chunk to verify the result
print("\nFirst text chunk:")
print(text_chunks[0])

Number of semantic chunks: 231

First text chunk:
Understanding Artificial Intelligence 
Chapter 1: Introduction to Artificial Intelligence 
Artificial intelligence (AI) refers to the ability of a digital computer or computer-controlled robot 
to perform tasks commonly associated with intelligent beings.


In [None]:
def create_embeddings_semantic_chunk(text_chunks):
    return [create_embeddings(chunk) for chunk in text_chunks]

chunk_embeddings = create_embeddings_semantic_chunk(text_chunks)

In [19]:
def semantic_search(query, text_chunks, chunk_embeddings, k=5):
    query_embedding = create_embeddings(query)
    
    similarities = [cosine_similarity(query_embedding, emb) for emb in chunk_embeddings]
    
    top_indices = np.argsort(similarities)[-k:][::-1]
    
    return [text_chunks[i] for i in top_indices]

In [27]:
# Load the validation data from a JSON file
with open('val.json') as f:
    data = json.load(f)

# Extract the first query from the validation data
query = data[0]['question']

# Get top 2 relevant chunks
top_chunks = semantic_search(query, text_chunks, chunk_embeddings, k=8)

# Print the query
print(f"Query: {query}")

# Print the top 2 most relevant text chunks
for i, chunk in enumerate(top_chunks):
    print(f"Context {i+1}:\n{chunk}\n{'='*40}")

Query: What is 'Explainable AI' and why is it considered important?
Context 1:

Explainable AI (XAI) 
Explainable AI (XAI) aims to make AI systems more transparent and understandable. Research in 
XAI focuses on developing methods for explaining AI decisions, enhancing trust, and improving 
accountability.
Context 2:
Key trends and areas of development include: 
Explainable AI (XAI) 
Explainable AI (XAI) aims to make AI systems more transparent and understandable. XAI 
techniques are being developed to provide insights into how AI models make decisions, 
enhancing trust and accountability.
Context 3:
Explainable AI (XAI) 
techniques aim to make AI decisions more understandable, enabling users to assess their 
fairness and accuracy.
Context 4:

Transparency and Explainability 
Transparency and explainability are essential for building trust in AI systems.
Context 5:

Transparency and Explainability 
Many AI systems, particularly deep learning models, are "black boxes," making it difficu

In [29]:
# Define the system prompt for the AI assistant
system_prompt = "You are an AI assistant that strictly answers based on the given context. If the answer cannot be derived directly and exactly from the provided context, respond with: 'I do not have enough information to answer that.'"

def generate_response(system_prompt, user_message, model="gpt-3.5-turbo-1106"):
    """
    Generates a response from the AI model based on the system prompt and user message.

    Args:
    system_prompt (str): The system prompt to guide the AI's behavior.
    user_message (str): The user's message or query.
    model (str): The model to be used for generating the response. Default is "meta-llama/Llama-2-7B-chat-hf".

    Returns:
    dict: The response from the AI model.
    """
    response = client.chat.completions.create(
        model=model,
        temperature=0,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_message}
        ]
    )
    return response

# Create the user prompt based on the top chunks
user_prompt = "\n".join([f"Context {i + 1}:\n{chunk}\n=====================================\n" for i, chunk in enumerate(top_chunks)])
user_prompt = f"{user_prompt}\nQuestion: {query}"

# Generate AI response
ai_response = generate_response(system_prompt, user_prompt)

In [30]:
ai_response.choices[0].message.content

'Explainable AI (XAI) aims to make AI systems more transparent and understandable. It is considered important because it enhances trust, improves accountability, and enables users to assess the fairness and accuracy of AI decisions.'

# Evaluating the rag

In [None]:
import json
import asyncio
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer, SimilarityFunction

# Initialize the SentenceTransformer model and set the similarity function
model = SentenceTransformer("all-MiniLM-L6-v2")
model.similarity_fn_name = SimilarityFunction.DOT

In [80]:
async def process_validation_data(k):
    system_prompt = """You are an AI assistant that strictly answers based on the given context. If the answer cannot be derived directly and exactly from the provided context, respond with: 'I do not have enough information to answer that.'
    First think about the keywords from the question and then use them to elaborate the answer.
    The response needs to be just the answer sentence
    
    """
    # Load the validation data from the JSON file
    with open('val.json') as f:
        data = json.load(f)

    # List to store the results for each sample
    results = []

    # Iterate over each example in the validation data
    for idx, item in enumerate(data):
        query = item['question']
        ideal_answer = item['ideal_answer']
        
        # Retrieve the top k most relevant context chunks
        top_chunks = semantic_search(query, text_chunks, chunk_embeddings, k=k)
        
        # Create the user prompt by combining all context chunks and the query
        context_prompt = "\n".join([
            f"Context {i + 1}:\n{chunk}\n=====================================\n"
            for i, chunk in enumerate(top_chunks)
        ])
        user_prompt = f"{context_prompt}\nQuestion: {query}"
        
        # Generate the AI response using the system prompt and the user prompt
        ai_response = generate_response(system_prompt, user_prompt).choices[0].message.content
        
        # Evaluate similarity using SentenceTransformer
        # Encode the AI response and ideal answer
        embedding_response = model.encode([ai_response])
        embedding_ideal = model.encode([ideal_answer])
        # Compute similarity score (result is a 1x1 matrix; extract the single value)
        similarity_matrix = model.similarity(embedding_response, embedding_ideal)
        score = similarity_matrix[0][0].numpy()
        
        # Prepare the result dictionary with dynamic context columns
        result = {
            "Query": query,
            "Ideal Answer": ideal_answer,
            "AI Response": ai_response,
            "Score": score
        }
        # Add each context as its own column
        for i, chunk in enumerate(top_chunks):
            result[f"Context {i + 1}"] = chunk
        
        # Append the result to the list
        results.append(result)

    # Create a DataFrame from the results
    df = pd.DataFrame(results)
    return df

In [81]:
# Run the asynchronous function and store the DataFrame in df
df = asyncio.run(process_validation_data(k=2))
df

Unnamed: 0,Query,Ideal Answer,AI Response,Score,Context 1,Context 2
0,What is 'Explainable AI' and why is it conside...,Explainable AI (XAI) aims to make AI systems m...,Explainable AI (XAI) aims to make AI systems m...,0.96724045,\nExplainable AI (XAI) \nExplainable AI (XAI) ...,Key trends and areas of development include: \...
1,Can AI be used to predict earthquakes?,I don't have enough information to answer that.,"Yes, AI can be used to predict earthquakes by ...",0.049676944,\nDisaster Response \nAI enhances disaster res...,\nChapter 2: Core Concepts of Artificial Intel...
2,What are some of the ethical concerns related ...,I don't have enough information to answer that.,"Promoting fairness, transparency, and accounta...",0.17125088,\nEthical Considerations \nAddressing the ethi...,\nEthical Considerations \nAddressing the ethi...
3,How does AI contribute to personalized medicine?,AI enables personalized medicine by analyzing ...,AI contributes to personalized medicine by ana...,0.9515934,\nPersonalized Medicine \nAI enables personali...,\nThese applications include: \nHealthcare \nA...
4,Does the document mention any specific compani...,I don't have enough information to answer that.,I do not have enough information to answer that.,0.9543278,"This includes investing in basic research, app...",AI-powered tools are used in research and deve...
5,What is the role of AI in smart grids?,AI optimizes energy distribution in smart grid...,"AI in smart grids enhances grid stability, red...",0.86632425,"AI-powered systems enhance grid stability, red...","AI-powered systems can monitor equipment, dete..."
6,"Can AI write a complete, original novel?",I don't have enough information to answer that.,"Yes, AI can write a complete, original novel.",0.1000002,\nAI in Writing and Content Creation \nAI is u...,\n AI-Generated Art \nAI algorithms can create...
7,What is a 'cobot'?,It mentions collaborative settings (cobots) in...,A 'cobot' is a collaborative robot that works ...,0.68394816,"AI enhances their precision, efficiency, and a...","AI enables these robots to navigate, interact ..."
8,What is Direct Air Capture (DAC) used for?,DAC technology removes CO2 directly from the a...,I do not have enough information to answer that.,0.04811383,These systems use real-time data to manage tra...,"AI-powered systems analyze data from sensors, ..."
9,Is AI currently being used to control nuclear ...,I don't have enough information to answer that.,I do not have enough information to answer that.,0.9543278,\nWeaponization of AI \nThe potential use of A...,AI-powered systems improve situational awarene...


In [82]:
for k in range(1, 9):
    print(asyncio.run(process_validation_data(k=k))['Score'].mean())


0.6451980590820312
0.5746803283691406
0.6475242614746094
0.5636902809143066
0.568789291381836
0.5751715660095215
0.5748960494995117
0.5668364524841308
