In [None]:
# download data
!mkdir data
!wget --user-agent "Mozilla" "https://raw.githubusercontent.com/run-llama/llama_index/main/docs/docs/examples/data/paul_graham/paul_graham_essay.txt" -O "data/paul_graham_essay.txt"

In [None]:
# create a probe
# Writes one known sentence to data/probe.txt (overwriting any previous copy).
# The file is loaded together with the essay in the ingestion cell, and the
# "2024-7-4" question further down asks about exactly this sentence.
! echo "SigAIoT will discuss RAG on the morning of July 4th, 2024." > data/probe.txt

# 1. Naive RAG

In [5]:
# common configs
import os
from dotenv import load_dotenv
from llama_index.core import SimpleDirectoryReader, VectorStoreIndex
from llama_index.core.ingestion import IngestionPipeline
from llama_index.core.node_parser import SentenceSplitter
from llama_index.llms.openai import OpenAI
from llama_index.embeddings.openai import OpenAIEmbedding


from utils_fn.helpers import print_qa

# Load settings from a local .env file before anything reads the environment.
load_dotenv()

# Input files: the essay corpus and the one-sentence probe.
txt_file_path = "data/paul_graham_essay.txt"
probe_file_path = "data/probe.txt"

# Chat model shared by every cell below; the API key is taken from the environment.
gpt35_llm = OpenAI(
    model="gpt-3.5-turbo",
    api_key=os.getenv("OPENAI_API_KEY"),
)

In [None]:
# ingestion: read the essay and the probe file into Document objects
documents = SimpleDirectoryReader(
    input_files=[txt_file_path, probe_file_path],
).load_data()

# index: split the documents with a SentenceSplitter and build the vector index
index = VectorStoreIndex.from_documents(
    documents,
    transformations=[
        SentenceSplitter(chunk_size=256, chunk_overlap=0),
    ],
)

# query: the engine retrieves the top 5 matches and answers with the shared LLM
query_engine = index.as_query_engine(
    llm=gpt35_llm,
    similarity_top_k=5,
)

In [None]:
# Ask the same question of the bare LLM and of the RAG query engine,
# then print the two answers together.
q1 = "what were things that the author worked on before college?"
q1_llm_answer = gpt35_llm.complete(q1)
q1_rag_answer = query_engine.query(q1)
print_qa(q1, q1_llm_answer, q1_rag_answer)

[92mQuestion:[0m what were things that the author worked on before college? 
[92mLLM:[0m Before college, the author worked on various projects and activities such as writing for the school newspaper, participating in debate club, volunteering at a local animal shelter, and interning at a marketing firm. They also worked part-time jobs, such as babysitting and tutoring, to save money for college. Additionally, the author was involved in extracurricular activities such as playing sports and participating in community service projects. 
[92mRAG:[0m The author worked on writing and programming before college.


In [None]:
# Probe question: ask the bare LLM and the RAG query engine about the date
# mentioned in data/probe.txt, then print the two answers together.
q2 = "What will be discussed on 2024-7-4?"
q2_llm_answer = gpt35_llm.complete(q2)
q2_rag_answer = query_engine.query(q2)
print_qa(q2, q2_llm_answer, q2_rag_answer)

[92mQuestion:[0m What will be discussed on 2024-7-4? 
[92mLLM:[0m It is difficult to predict exactly what will be discussed on July 4, 2024, as it will depend on current events, political developments, and cultural trends at that time. However, it is likely that discussions on this date will revolve around Independence Day celebrations in the United States, as well as topics such as politics, social issues, and current events. Additionally, there may be discussions about historical events and milestones that have occurred on July 4th in the past. 
[92mRAG:[0m RAG will be discussed on the morning of July 4th, 2024.


# 2. Naive RAG = Parsing & Ingestion + Querying 

In [None]:
# Parsing: file -> Document
# Report the container type and size, leave one blank line, then show the first Document.
print(f"documents is a {type(documents)} of length {len(documents)}", end="\n\n")
print(f"Each element mainly contains:\n{documents[0]}")

documents is a <class 'list'> of length 2

Each element mainly contains:
Doc ID: 66f6baf9-6a7c-4680-afd2-a94622e04b40
Text: What I Worked On    February 2021    Before college the two main
things I worked on, outside of school, were writing and programming. I
didn't write essays. I wrote what beginning writers were supposed to
write then, and probably still are: short stories. My stories were
awful. They had hardly any plot, just characters with strong feelings,
whic...


In [None]:
# Ingestion: Document -> Node
# Run the documents through an explicit pipeline (same splitter settings as the
# index cell, followed by an embedding step) so individual nodes can be inspected.
pipeline = IngestionPipeline(
    transformations=[
        SentenceSplitter(chunk_size=256, chunk_overlap=0),
        OpenAIEmbedding(),
    ]
)
nodes = pipeline.run(documents=documents)

first_node = nodes[0]

print(f"nodes is a {type(nodes)} of length {len(nodes)}", end="\n\n")
# The number after each label is a length: characters of text, then embedding dimensions.
print(f"Node.text: {len(first_node.text)}")
print(f"{first_node.text}", end="\n\n")
print(f"Node.embedding: {len(first_node.embedding)}")
# Show only the leading values: dumping the whole vector floods the cell output
# and adds nothing beyond the length printed above.
print(f"{first_node.embedding[:5]} ...")

nodes is a <class 'list'> of length 77

Node.text: 944
What I Worked On

February 2021

Before college the two main things I worked on, outside of school, were writing and programming. I didn't write essays. I wrote what beginning writers were supposed to write then, and probably still are: short stories. My stories were awful. They had hardly any plot, just characters with strong feelings, which I imagined made them deep.

The first programs I tried writing were on the IBM 1401 that our school district used for what was then called "data processing." This was in 9th grade, so I was 13 or 14. The school district's 1401 happened to be in the basement of our junior high school, and my friend Rich Draves and I got permission to use it. It was like a mini Bond villain's lair down there, with all these alien-looking machines — CPU, disk drives, printer, card reader — sitting up on a raised floor under bright fluorescent lights.

The language we used was an early version of Fortran.

Node.em

In [None]:
# Ingestion: Node -> Index
# Prints the index's ref_doc_info: a dict keyed by document ID whose values are
# RefDocInfo objects carrying a list of node_ids (presumably the nodes produced
# from that document — confirm against the LlamaIndex docs).
# NOTE(review): this prints the whole mapping, which is long for the essay.
print(index.ref_doc_info)

{'66f6baf9-6a7c-4680-afd2-a94622e04b40': RefDocInfo(node_ids=['f44e55af-a1c7-48d8-be43-1211130b82f0', '0ce292a0-5b4b-47ea-81de-8d5076b905c2', '983ed55f-1d07-49eb-b9b1-951c6a7d482e', 'd71d6d09-48ca-4b4c-a62f-3f9ea0d04af1', 'b8b1acbd-5a25-49a3-8c90-3e600e9fe27a', 'dd2ec332-c346-4279-a45f-bd67ed612e3d', '7a45d79d-9e5a-4e67-be92-a959bfab0a99', 'add1893a-e3c6-4ee8-acf8-e77886b20b23', '98470a45-e633-4849-9007-8ce3286de241', 'a9323836-ea07-40fd-908f-5fe60b820eb2', 'c1f14d36-8df2-4223-ab0b-cc11f84515e5', '6ff82a6b-14bd-4702-a470-501ffeca727e', 'cc2ce042-ebfc-456c-99eb-aa026e24b63e', '98317929-540e-43e6-bc54-7c7e7584cab6', '63b0d1e2-bf28-4416-a95d-2323478f2807', 'dd8483fc-5dc2-4383-8eb7-15ff2d4d0d08', '9c926134-11fb-4ce9-8ce4-e8bf46dac8d5', '39cce5cc-74d4-4ae4-9d64-4ebad14fc4e2', '1d08dce2-ecd4-4e92-9366-bcfb3e06878f', '140d8f9f-d61b-483c-8dfc-ba052c5d993d', 'afd34fd4-3e6b-4c3b-bcb8-cc689abb7b28', '05c17c0f-0671-440b-93e4-98afaf2a91af', '307c05fb-692f-4b31-b4db-64a9472848e7', '17b1d5e4-b408-443

In [None]:
# Query: Index -> Retrieval
# Retrieve the top 5 matches for the probe question and print each one;
# each printed entry shows a node ID, a text preview and a score.
vector_retriever = index.as_retriever(similarity_top_k=5)
retrieved_nodes = vector_retriever.retrieve(q2)
for scored_node in retrieved_nodes:
    print(scored_node)

Node ID: d579e4c9-810c-4cf6-ba05-5bc0375cbbfc
Text: SigAIoT will discuss RAG on the morning of July 4th, 2024.
Score:  0.806

Node ID: 71a93361-8c15-40e6-a69f-97f845a9dfdf
Text: I used to fly up to Oregon to visit her regularly, and I had a
lot of time to think on those flights. On one of them I realized I was
ready to hand YC over to someone else.    I asked Jessica if she
wanted to be president, but she didn't, so we decided we'd try to
recruit Sam Altman. We talked to Robert and Trevor and we agreed to
make it a comp...
Score:  0.737

Node ID: d71d6d09-48ca-4b4c-a62f-3f9ea0d04af1
Text: Though I liked programming, I didn't plan to study it in
college. In college I was going to study philosophy, which sounded
much more powerful. It seemed, to my naive high school self, to be the
study of the ultimate truths, compared to which the things studied in
other fields would be mere domain knowledge. What I discovered when I
got to colle...
Score:  0.736

Node ID: 3df33bea-f393-406d-872d-b278f

In [None]:
# Query: Retrieval -> Generation
# Look up the response synthesizer's text-QA prompt and display its raw template
# string (the last expression in the cell is rendered as its output).
text_qa_prompt = query_engine.get_prompts()["response_synthesizer:text_qa_template"]
text_qa_prompt.get_template()

'Context information is below.\n---------------------\n{context_str}\n---------------------\nGiven the context information and not prior knowledge, answer the query.\nQuery: {query_str}\nAnswer: '