In [7]:
# download data
!mkdir data
!wget --user-agent "Mozilla" "https://raw.githubusercontent.com/run-llama/llama_index/main/docs/docs/examples/data/paul_graham/paul_graham_essay.txt" -O "data/paul_graham_essay.txt"

--2024-06-28 15:08:01--  https://raw.githubusercontent.com/run-llama/llama_index/main/docs/docs/examples/data/paul_graham/paul_graham_essay.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.108.133, 2606:50c0:8000::154, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 75042 (73K) [text/plain]
Saving to: ‘data/naive_rag/paul_graham_essay.txt’


2024-06-28 15:09:11 (1.09 KB/s) - ‘data/naive_rag/paul_graham_essay.txt’ saved [75042/75042]



In [16]:
# create a probe
! echo "SigAIoT will discuss RAG on the morning of July 4th, 2024." > data/probe.txt

In [5]:
# common configs
import os
from dotenv import load_dotenv
from llama_index.core import SimpleDirectoryReader, VectorStoreIndex
from llama_index.core.ingestion import IngestionPipeline
from llama_index.core.node_parser import SentenceSplitter
from llama_index.llms.openai import OpenAI


from utils_fn.helpers import print_qa

load_dotenv()

txt_file_path = "data/paul_graham_essay.txt" 
probe_file_path = "data/probe.txt"
gpt35_llm = OpenAI(model="gpt-3.5-turbo", api_key=os.environ.get("OPENAI_API_KEY"))

# 1. Naive RAG

In [8]:
# ingestion
documents = SimpleDirectoryReader(input_files=[txt_file_path, probe_file_path]).load_data()
nodes = IngestionPipeline(transformations=[SentenceSplitter(chunk_size=256, chunk_overlap=0)]).run(documents)
# index
index = VectorStoreIndex.from_documents(documents)
# query
vector_retriever = index.as_retriever(similarity_top_k=5)
query_engine = index.as_query_engine(similarity_top_k=5, llm=gpt35_llm)


Parsing nodes: 0it [00:00, ?it/s]


In [9]:
q1 = "what were things that the author worked on before college?"
print_qa(q1, gpt35_llm.complete(q1), query_engine.query(q1))

[92mQuestion:[0m what were things that the author worked on before college? 
[92mLLM:[0m Before college, the author worked on various projects and activities such as writing for the school newspaper, participating in debate club, volunteering at a local animal shelter, and interning at a marketing firm. They also worked part-time jobs at a grocery store and a coffee shop to save money for college. 
[92mRAG:[0m The author worked on writing short stories and programming, particularly on an IBM 1401 using an early version of Fortran during 9th grade. Later on, the author started programming on microcomputers like the TRS-80 and wrote simple games, a rocket prediction program, and a word processor.


In [10]:
q2 = "What will be discussed on 2024-7-4?"
print_qa(q2, gpt35_llm.complete(q2), query_engine.query(q2))

[92mQuestion:[0m What will be discussed on 2024-7-4? 
[92mLLM:[0m It is difficult to predict the specific topics that will be discussed on July 4, 2024, as it will depend on current events, political developments, and cultural trends at that time. However, it is likely that discussions on Independence Day in the United States will focus on themes related to freedom, democracy, patriotism, and national identity. Other potential topics could include current political issues, social movements, international relations, and cultural events happening around that time. 
[92mRAG:[0m The discussion on July 4, 2024, will likely revolve around the progress and developments related to the new Lisp language called Bel that was created by Paul Graham in Arc.
