In [1]:
import fitz
import os
import numpy as np
import json
from openai import OpenAI
import torch
from sentence_transformers import SentenceTransformer

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from dotenv import load_dotenv
load_dotenv()

True

In [3]:
# Create the OpenAI client; the API key is read from the OPENAI_API_KEY environment variable.
client = OpenAI(
    api_key=os.getenv("OPENAI_API_KEY")
)

In [4]:
# Load the pre-trained model
embedder = SentenceTransformer("all-MiniLM-L6-v2") # Use this model to compute the embeddings

In [5]:
# NOTE(review): absolute, user-specific path — the notebook will not run on another machine without editing this.
pdf_path = "/home/patrick/rag_from_scratch/AI_Information.pdf"

Extracting the text from the PDF file

In [6]:
def extract_text_from_pdf(pdf_path):
    """Return the plain text of every page of a PDF, concatenated in page order.

    Args:
        pdf_path: Path of the PDF file to open with PyMuPDF (``fitz``).

    Returns:
        str: The text of all pages joined with no separator between pages.
    """
    # The context manager closes the document even if a page fails to extract;
    # the original left the file handle open.
    with fitz.open(pdf_path) as mypdf:
        return "".join(
            mypdf[page_num].get_text("text")
            for page_num in range(mypdf.page_count)
        )

In [7]:
def chunk_text(text, n, overlap):
    """Split ``text`` into character chunks of length ``n`` that overlap.

    Consecutive chunks start ``n - overlap`` characters apart, so each chunk
    shares its last ``overlap`` characters with the start of the next one.
    The final chunk may be shorter than ``n``.

    Args:
        text: The string to split.
        n: Maximum number of characters per chunk; must be positive.
        overlap: Number of characters shared by consecutive chunks; must
            satisfy ``0 <= overlap < n``.

    Returns:
        list[str]: The chunks in order; empty when ``text`` is empty.

    Raises:
        ValueError: If ``n`` is not positive or ``overlap`` is outside
            ``[0, n)``.
    """
    if n <= 0:
        raise ValueError("n must be a positive number of characters")
    # overlap >= n would make the step zero (range() error) or negative
    # (silently producing no chunks at all).
    if not 0 <= overlap < n:
        raise ValueError("overlap must satisfy 0 <= overlap < n")

    step = n - overlap
    return [text[start:start + n] for start in range(0, len(text), step)]

In [8]:
text = extract_text_from_pdf(pdf_path)

In [9]:
# 1000-character chunks; consecutive chunks start 900 characters apart, so they share 100 characters.
text_chunks = chunk_text(text, 1000, 100)

In [10]:

print("Number of text chunks:", len(text_chunks))

print("\nFirst text chunk:")
print(text_chunks[0])

Number of text chunks: 38

First text chunk:
Understanding Artificial Intelligence 
Chapter 1: Introduction to Artificial Intelligence 
Artificial intelligence (AI) refers to the ability of a digital computer or computer-controlled robot 
to perform tasks commonly associated with intelligent beings. The term is frequently applied to 
the project of developing systems endowed with the intellectual processes characteristic of 
humans, such as the ability to reason, discover meaning, generalize, or learn from past 
experience. Over the past few decades, advancements in computing power and data availability 
have significantly accelerated the development and deployment of AI. 
Historical Context 
The idea of artificial intelligence has existed for centuries, often depicted in myths and fiction. 
However, the formal field of AI research began in the mid-20th century. The Dartmouth Workshop 
in 1956 is widely considered the birthplace of AI. Early AI research focused on problem-solving 
and 

In [11]:
def create_embeddings(text):
    """Encode ``text`` with the notebook-level ``embedder`` model.

    Args:
        text: Input forwarded unchanged to ``embedder.encode``. This notebook
            passes both a single query string and a list of chunk strings.

    Returns:
        Whatever ``embedder.encode(text)`` returns.
    """
    return embedder.encode(text)

# Embed every chunk. Despite its name, `response` holds the chunk embeddings and is
# passed to semantic_search by later cells as the collection to search.
response = create_embeddings(text_chunks)

In [12]:
response

array([[-0.05094514,  0.01012436,  0.0405515 , ...,  0.14755581,
         0.01451863, -0.04068931],
       [-0.06135775, -0.0272468 ,  0.02331264, ...,  0.00723255,
         0.01329521, -0.04890287],
       [-0.05693215,  0.00113145, -0.03345631, ...,  0.00174798,
         0.00373301, -0.01378473],
       ...,
       [ 0.01580146, -0.06684798, -0.00377489, ..., -0.00862874,
        -0.01902532, -0.00202416],
       [-0.02450312,  0.07355722, -0.01570883, ..., -0.0277919 ,
        -0.0052187 , -0.00850473],
       [ 0.00582304,  0.00249693,  0.00914683, ...,  0.03049997,
         0.04059248, -0.02348365]], shape=(38, 384), dtype=float32)

In [14]:
def cosine_similarity(vec1, vec2):
    """Return the cosine similarity of two vectors.

    Args:
        vec1: First vector (array-like accepted by NumPy).
        vec2: Second vector, same length as ``vec1``.

    Returns:
        ``dot(vec1, vec2) / (||vec1|| * ||vec2||)``, or ``0.0`` when either
        vector has zero norm.
    """
    norm_product = np.linalg.norm(vec1) * np.linalg.norm(vec2)
    # A zero vector has no direction; dividing would give NaN (plus a
    # RuntimeWarning), and NaN scores break any ranking sort that uses them.
    if norm_product == 0:
        return 0.0
    return np.dot(vec1, vec2) / norm_product

In [15]:
def semantic_search(query, text_chunks, embeddings, k=5):
    """Return the ``k`` chunks whose embeddings are most similar to the query.

    Args:
        query: Query text; embedded with ``create_embeddings``.
        text_chunks: Chunk strings, index-aligned with ``embeddings``.
        embeddings: Iterable with one embedding per chunk.
        k: Number of chunks to return.

    Returns:
        list: Up to ``k`` entries of ``text_chunks``, ordered from highest to
        lowest cosine similarity with the query.
    """
    query_vector = create_embeddings(query)

    # Pair each chunk position with its similarity to the query.
    scored = [
        (position, cosine_similarity(np.array(query_vector), np.array(vector)))
        for position, vector in enumerate(embeddings)
    ]

    # Highest similarity first; keep only the best k positions.
    ranked = sorted(scored, key=lambda pair: pair[1], reverse=True)
    return [text_chunks[position] for position, _ in ranked[:k]]

In [16]:
# Load the validation set and use the question of its first item as the demo query.
with open('val.json') as f:
    data = json.load(f)
query = data[0]['question']

# Retrieve the two chunks most similar to the query.
top_chunks = semantic_search(query, text_chunks, response, k=2)

print("Query:", query)

# Print each retrieved chunk, numbered from 1, followed by a separator line.
for i, chunk in enumerate(top_chunks):
    print(f"Context {i + 1}:\n{chunk}\n=====================================")

Query: What is 'Explainable AI' and why is it considered important?
Context 1:
eal-time data, identify 
pollution sources, and inform environmental policies. 
Chapter 15: The Future of AI Research 
Advancements in Deep Learning 
Continued advancements in deep learning are expected to drive further breakthroughs in AI. 
Research is focused on developing more efficient and interpretable deep learning models, as well 
as exploring new architectures and training techniques. 
Explainable AI (XAI) 
Explainable AI (XAI) aims to make AI systems more transparent and understandable. Research in 
XAI focuses on developing methods for explaining AI decisions, enhancing trust, and improving 
accountability. 
AI and Neuroscience 
The intersection of AI and neuroscience is a promising area of research. Understanding the 
human brain can inspire new AI algorithms and architectures, while AI can provide insights into 
brain function and cognition. 
AI Safety and Security 
Ensuring the safety and securi

In [17]:
system_prompt = "You are an AI assistant that strictly answers based on the given context. If the answer cannot be derived directly from the provided context, respond with: 'I do not have enough information to answer that.'"

def generate_response(system_prompt, user_message, model="gpt-3.5-turbo-1106"):
    """Send one system/user message pair to the chat completions endpoint.

    Args:
        system_prompt: Content of the "system" message.
        user_message: Content of the "user" message.
        model: Name of the chat model to query.

    Returns:
        The object returned by ``client.chat.completions.create``, unmodified;
        callers read the text from ``.choices[0].message.content``.
    """
    conversation = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_message},
    ]
    # temperature=0 is passed on every request.
    return client.chat.completions.create(
        model=model,
        temperature=0,
        messages=conversation,
    )

# Build the user message: each retrieved chunk as a numbered context block, then the question.
user_prompt = "\n".join([f"Context {i + 1}:\n{chunk}\n=====================================\n" for i, chunk in enumerate(top_chunks)])
user_prompt = f"{user_prompt}\nQuestion: {query}"

# Ask the model to answer the query using the retrieved context.
ai_response = generate_response(system_prompt, user_prompt)

In [18]:
ai_response.choices[0].message.content

'Explainable AI (XAI) aims to make AI systems more transparent and understandable. It is considered important because it focuses on developing methods for explaining AI decisions, enhancing trust, and improving accountability.'

In [19]:
# Grading instructions for the judge model: 1 = very close, 0.5 = partially aligned, 0 = incorrect or unsatisfactory.
evaluate_system_prompt = "You are an intelligent evaluation system tasked with assessing the AI assistant's responses. If the AI assistant's response is very close to the true response, assign a score of 1. If the response is incorrect or unsatisfactory in relation to the true response, assign a score of 0. If the response is partially aligned with the true response, assign a score of 0.5."

# NOTE(review): the grading instructions are sent as the system message and are also appended to the end of this user message.
evaluation_prompt = f"User Query: {query}\nAI Response:\n{ai_response.choices[0].message.content}\nTrue Response: {data[0]['ideal_answer']}\n{evaluate_system_prompt}"

evaluation_response = generate_response(evaluate_system_prompt, evaluation_prompt)

print(evaluation_response.choices[0].message.content)

The AI response is very close to the true response, capturing the essence of Explainable AI and its importance. It correctly mentions that Explainable AI aims to make AI systems more transparent and understandable, and it highlights the importance of building trust and accountability. Therefore, the score is 1.


In [20]:
import json
import asyncio
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer, SimilarityFunction

# Initialize the SentenceTransformer model and set the similarity function
# NOTE(review): this loads the same "all-MiniLM-L6-v2" checkpoint that `embedder` already holds.
model = SentenceTransformer("all-MiniLM-L6-v2")
# model.similarity will score pairs with the dot product.
model.similarity_fn_name = SimilarityFunction.DOT

In [23]:
async def process_validation_data(k):
    """Run the retrieve / answer / score pipeline over every item in ``val.json``.

    For each validation item the top ``k`` chunks are retrieved with
    ``semantic_search`` (over the notebook-level ``text_chunks`` and
    ``response`` embeddings), an answer is generated with
    ``generate_response``, and the answer is scored against the item's
    ``ideal_answer`` with ``model.similarity`` on their embeddings.

    The function is declared ``async`` but contains no ``await``, so all of
    the work runs synchronously once the coroutine is driven.

    Args:
        k: Number of context chunks to retrieve per question.

    Returns:
        pandas.DataFrame: One row per validation item with the columns
        "Query", "Ideal Answer", "AI Response", "Score" (a Python float) and
        one "Context i" column per retrieved chunk.
    """
    system_prompt = """You are an AI assistant that strictly answers based on the given context. If the answer cannot be derived directly and exactly from the provided context, respond with: 'I do not have enough information to answer that.'
    First think about the keywords from the question and then use them to elaborate the answer.
    The response needs to be just the answer sentence
    
    """
    # Load the validation data from the JSON file
    with open('val.json') as f:
        data = json.load(f)

    # List to store the results for each sample
    results = []

    # Iterate over each example in the validation data
    for item in data:
        query = item['question']
        ideal_answer = item['ideal_answer']

        # Retrieve the top k most relevant context chunks
        top_chunks = semantic_search(query, text_chunks, response, k=k)

        # Create the user prompt by combining all context chunks and the query
        context_prompt = "\n".join([
            f"Context {i + 1}:\n{chunk}\n=====================================\n"
            for i, chunk in enumerate(top_chunks)
        ])
        user_prompt = f"{context_prompt}\nQuestion: {query}"

        # Generate the AI response using the system prompt and the user prompt
        ai_response = generate_response(system_prompt, user_prompt).choices[0].message.content

        # Evaluate similarity using SentenceTransformer
        # Encode the AI response and ideal answer
        embedding_response = model.encode([ai_response])
        embedding_ideal = model.encode([ideal_answer])
        # Compute similarity score (result is a 1x1 matrix; extract the single value)
        similarity_matrix = model.similarity(embedding_response, embedding_ideal)
        # .item() yields a plain Python float, so the "Score" column gets a
        # numeric dtype instead of an object column of 0-d arrays.
        score = similarity_matrix[0][0].item()

        # Prepare the result dictionary with dynamic context columns
        result = {
            "Query": query,
            "Ideal Answer": ideal_answer,
            "AI Response": ai_response,
            "Score": score
        }
        # Add each context as its own column
        for i, chunk in enumerate(top_chunks):
            result[f"Context {i + 1}"] = chunk

        # Append the result to the list
        results.append(result)

    # Create a DataFrame from the results
    df = pd.DataFrame(results)
    return df

In [27]:
import nest_asyncio
nest_asyncio.apply()

In [28]:
# Run the asynchronous function and store the DataFrame in df
df = asyncio.run(process_validation_data(k=2))
df

Unnamed: 0,Query,Ideal Answer,AI Response,Score,Context 1,Context 2
0,What is 'Explainable AI' and why is it conside...,Explainable AI (XAI) aims to make AI systems m...,Explainable AI (XAI) aims to make AI systems m...,0.9859673,"eal-time data, identify \npollution sources, a...",nt aligns with societal values. Education and ...
1,Can AI be used to predict earthquakes?,I don't have enough information to answer that.,I do not have enough information to answer that.,0.9543278,ts that support conservation efforts. \n \n \n...,AI. Early AI research focused on problem-solv...
2,What are some of the ethical concerns related ...,I don't have enough information to answer that.,"Bias and fairness, transparency and explainabi...",0.19652365,he entertainment industry uses AI for content ...,"inability \nMany AI systems, particularly deep..."
3,How does AI contribute to personalized medicine?,AI enables personalized medicine by analyzing ...,AI contributes to personalized medicine by ana...,0.9515934,ng areas for improvement. \nEducational Data M...,"is used in self-driving cars, medical imaging..."
4,Does the document mention any specific compani...,I don't have enough information to answer that.,I do not have enough information to answer that.,0.9543278,Understanding Artificial Intelligence \nChapte...,regulation of AI is a complex and evolving ar...
5,What is the role of AI in smart grids?,AI optimizes energy distribution in smart grid...,AI optimizes energy management in smart cities...,0.7971191,"estion, and enhance \npublic transit. These sy...",ts that support conservation efforts. \n \n \n...
6,"Can AI write a complete, original novel?",I don't have enough information to answer that.,"AI is used to write articles, generate content...",0.04168408,existing art and generate new pieces that exhi...,Understanding Artificial Intelligence \nChapte...
7,What is a 'cobot'?,It mentions collaborative settings (cobots) in...,A 'cobot' is a type of industrial robot that w...,0.74303055,"adapt to changing \nenvironments, and interac...","more pervasive, there will be a growing need f..."
8,What is Direct Air Capture (DAC) used for?,DAC technology removes CO2 directly from the a...,I do not have enough information to answer that.,0.04811383,"estion, and enhance \npublic transit. These sy...",nd engaging interactions. AI-powered character...
9,Is AI currently being used to control nuclear ...,I don't have enough information to answer that.,I do not have enough information to answer that.,0.9543278,es and ethical frameworks for \nAI development...,ts that support conservation efforts. \n \n \n...


In [29]:
# Sweep k (the number of retrieved chunks) from 1 to 8 and print the mean score for each value.
# Each iteration re-runs the full validation set through process_validation_data.
for k in range(1, 9):
    print(asyncio.run(process_validation_data(k=k))['Score'].mean())

0.657020092010498
0.736910343170166
0.5688803672790528
0.5720257759094238
0.6665711402893066
0.6588428497314454
0.656826400756836
0.662020492553711
