In [1]:
import fitz
import os
import numpy as np
import json
from openai import OpenAI

In [12]:
from dotenv import load_dotenv
# Load variables from a local .env file into the process environment;
# presumably this is where OPENAI_API_KEY (read when the client is created)
# comes from -- confirm against your setup.
load_dotenv()

True

In [2]:
# Location of the source PDF. Set RAG_PDF_PATH (in the environment or in .env)
# to run the notebook on another machine; when it is unset the original
# location is used, so existing runs behave exactly as before.
pdf_path = os.getenv("RAG_PDF_PATH", "/home/patrick/rag_from_scratch/AI_Information.pdf")

Extracting the text from the PDF file

In [3]:
def extract_text_from_pdf(pdf_path):
    """Return the plain text of every page of a PDF, concatenated in page order.

    Args:
        pdf_path: Path of the PDF file; passed unchanged to ``fitz.open``.

    Returns:
        str: The result of ``page.get_text("text")`` for each page, joined
        with no separator between pages.
    """
    # Opening the document as a context manager closes it (releasing the file
    # handle) even if text extraction fails part-way through.
    with fitz.open(pdf_path) as mypdf:
        # One join instead of repeated `+=` avoids rebuilding the accumulated
        # string for every page.
        return "".join(page.get_text("text") for page in mypdf)

In [4]:
def chunk_text(text, n, overlap):
    """Split ``text`` into chunks of at most ``n`` characters.

    Consecutive chunks start ``n - overlap`` characters apart, so each chunk
    begins with the last ``overlap`` characters of the previous one.

    Args:
        text: The string to split.
        n: Maximum chunk length in characters; must be positive.
        overlap: Number of characters shared by consecutive chunks; must
            satisfy ``0 <= overlap < n``.

    Returns:
        list[str]: The chunks in order of appearance (empty for empty text).
        Chunks near the end of the text may be shorter than ``n``.

    Raises:
        ValueError: If ``n`` is not positive or ``overlap`` is outside
            ``[0, n)``. Without this check an oversized overlap silently
            produced no chunks and a negative one silently skipped text.
    """
    if n <= 0:
        raise ValueError(f"n must be positive, got {n}")
    if not 0 <= overlap < n:
        raise ValueError(f"overlap must satisfy 0 <= overlap < n, got overlap={overlap}, n={n}")

    step = n - overlap
    return [text[start:start + n] for start in range(0, len(text), step)]

In [5]:
# Read the whole PDF into a single string
text = extract_text_from_pdf(pdf_path=pdf_path)

In [8]:
# Chunks of up to 1000 characters, consecutive chunks sharing 100 characters
text_chunks = chunk_text(text, n=1000, overlap=100)

In [9]:

# Report how many chunks the text was split into
print("Number of text chunks:", len(text_chunks))

# Show the first chunk as a sample of the chunking result
print("\nFirst text chunk:", text_chunks[0], sep="\n")

Number of text chunks: 38

First text chunk:
Understanding Artificial Intelligence 
Chapter 1: Introduction to Artificial Intelligence 
Artificial intelligence (AI) refers to the ability of a digital computer or computer-controlled robot 
to perform tasks commonly associated with intelligent beings. The term is frequently applied to 
the project of developing systems endowed with the intellectual processes characteristic of 
humans, such as the ability to reason, discover meaning, generalize, or learn from past 
experience. Over the past few decades, advancements in computing power and data availability 
have significantly accelerated the development and deployment of AI. 
Historical Context 
The idea of artificial intelligence has existed for centuries, often depicted in myths and fiction. 
However, the formal field of AI research began in the mid-20th century. The Dartmouth Workshop 
in 1956 is widely considered the birthplace of AI. Early AI research focused on problem-solving 
and 

In [17]:
# Create the OpenAI client. The API key is read from the OPENAI_API_KEY
# environment variable; no base URL is passed here.
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

In [20]:
def create_embeddings(text, model="text-embedding-3-small"):
    """Request embeddings for ``text`` from the module-level ``client``.

    Args:
        text: Forwarded unchanged as the ``input`` argument of the API call.
            This notebook passes both a single query string and a list of
            chunk strings.
        model: Name of the embedding model to use.

    Returns:
        The response object returned by ``client.embeddings.create``; callers
        in this notebook read vectors from ``response.data[i].embedding``.
    """
    request = {"model": model, "input": text}
    return client.embeddings.create(**request)
# Embed all chunks with one call; the response is reused by the search cell
response = create_embeddings(text=text_chunks)

In [22]:
def cosine_similarity(vec1, vec2):
    """Return the cosine of the angle between two vectors.

    Args:
        vec1: First vector (anything ``np.dot`` / ``np.linalg.norm`` accept).
        vec2: Second vector, same length as ``vec1``.

    Returns:
        The dot product divided by the product of the Euclidean norms, or
        ``0.0`` when either vector has zero norm (the quotient would
        otherwise be nan and emit a RuntimeWarning).
    """
    norm_product = np.linalg.norm(vec1) * np.linalg.norm(vec2)
    if norm_product == 0:
        # Direction is undefined for a zero vector; treat it as "not similar"
        # so downstream ranking never has to compare against nan.
        return 0.0
    return np.dot(vec1, vec2) / norm_product

In [26]:
def semantic_search(query, text_chunks, embeddings, k=5):
    """Return the ``k`` chunks whose embeddings are most similar to ``query``.

    Args:
        query: Query text; embedded with ``create_embeddings``.
        text_chunks: Sequence of chunk strings, indexed in parallel with
            ``embeddings``.
        embeddings: Iterable of objects exposing an ``.embedding`` vector,
            one per chunk (this notebook passes ``response.data``).
        k: Maximum number of chunks to return.

    Returns:
        list: Up to ``k`` entries of ``text_chunks``, ordered from most to
        least similar by ``cosine_similarity``.
    """
    # Embed and convert the query once: it is identical for every chunk, so
    # there is no reason to rebuild the array inside the loop.
    query_vector = np.array(create_embeddings(query).data[0].embedding)

    scored = [
        (index, cosine_similarity(query_vector, np.array(item.embedding)))
        for index, item in enumerate(embeddings)
    ]

    # sorted() is stable (also with reverse=True), so chunks with equal
    # scores keep their original document order.
    ranked = sorted(scored, key=lambda pair: pair[1], reverse=True)
    return [text_chunks[index] for index, _ in ranked[:k]]

In [27]:
# Load the validation set. JSON text is UTF-8, so decode it explicitly rather
# than with the platform's default encoding (which differs across systems).
with open('val.json', encoding='utf-8') as f:
    data = json.load(f)
# Use the first validation question as the query
query = data[0]['question']

# Retrieve the two chunks most similar to the query
top_chunks = semantic_search(query, text_chunks, response.data, k=2)

print("Query:", query)

# Show each retrieved chunk, numbered from 1
for i, chunk in enumerate(top_chunks):
    print(f"Context {i + 1}:\n{chunk}\n=====================================")

Query: What is 'Explainable AI' and why is it considered important?
Context 1:
es and ethical frameworks for 
AI development and deployment is crucial. 
Weaponization of AI 
The potential use of AI in autonomous weapons systems raises significant ethical and security 
concerns. International discussions and regulations are needed to address the risks associated 
with AI-powered weapons. 
Chapter 5: The Future of Artificial Intelligence 
The future of AI is likely to be characterized by continued advancements and broader adoption 
across various domains. Key trends and areas of development include: 
Explainable AI (XAI) 
Explainable AI (XAI) aims to make AI systems more transparent and understandable. XAI 
techniques are being developed to provide insights into how AI models make decisions, 
enhancing trust and accountability. 
AI at the Edge 
AI at the edge involves processing data locally on devices, rather than relying on cloud-based 
servers. This approach reduces latency, improves 

In [None]:
# Instructs the model to answer only from the supplied context and to use a
# fixed refusal sentence otherwise.
system_prompt = (
    "You are an AI assistant that strictly answers based on the given context. "
    "If the answer cannot be derived directly from the provided context, "
    "respond with: 'I do not have enough information to answer that.'"
)

def generate_response(system_prompt, user_message, model="gpt-3.5-turbo-1106"):
    """Send a system + user message pair to the chat completions endpoint.

    Args:
        system_prompt: Content of the ``system`` message.
        user_message: Content of the ``user`` message.
        model: Chat model name.

    Returns:
        The response object returned by ``client.chat.completions.create``;
        callers in this notebook read ``response.choices[0].message.content``.
    """
    conversation = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_message},
    ]
    # temperature=0 is passed on every call
    return client.chat.completions.create(
        model=model,
        temperature=0,
        messages=conversation,
    )

# One numbered, separator-terminated section per retrieved chunk
user_prompt = "\n".join(
    f"Context {i + 1}:\n{chunk}\n=====================================\n"
    for i, chunk in enumerate(top_chunks)
)
# The question goes after all of the context sections
user_prompt = f"{user_prompt}\nQuestion: {query}"

ai_response = generate_response(system_prompt=system_prompt, user_message=user_prompt)

In [35]:
# Display the message text of the first choice in the chat completion
ai_response.choices[0].message.content

'Explainable AI (XAI) aims to make AI systems more transparent and understandable by providing insights into how AI models make decisions. It is considered important for enhancing trust and accountability in AI systems.'

In [36]:
# Scoring rubric for the evaluator: 1 = very close, 0.5 = partially aligned,
# 0 = incorrect or unsatisfactory.
evaluate_system_prompt = (
    "You are an intelligent evaluation system tasked with assessing the AI assistant's responses. "
    "If the AI assistant's response is very close to the true response, assign a score of 1. "
    "If the response is incorrect or unsatisfactory in relation to the true response, assign a score of 0. "
    "If the response is partially aligned with the true response, assign a score of 0.5."
)

# The user message carries the query, the generated answer, the reference
# answer, and the rubric text once more at the end.
evaluation_prompt = (
    f"User Query: {query}\n"
    f"AI Response:\n{ai_response.choices[0].message.content}\n"
    f"True Response: {data[0]['ideal_answer']}\n"
    f"{evaluate_system_prompt}"
)

evaluation_response = generate_response(
    system_prompt=evaluate_system_prompt,
    user_message=evaluation_prompt,
)

print(evaluation_response.choices[0].message.content)

The AI assistant's response is very close to the true response, capturing the essence of Explainable AI and its importance. Therefore, the score is 1.
