# We will follow 5 steps to build this RAG system:
1) Data Ingestion
2) Indexing and Storing
3) Retrieval
4) Response Synthesis
5) Query/Chat Engine

# Data Ingestion

In [1]:
%%capture
!pip install llama-index

In [2]:
import os
import yaml

# Load the credentials mapping from a local YAML file so the API key is never
# written into the notebook itself.
with open('chatgpt-api-credentials.yml') as file:
    openai_key = yaml.safe_load(file)

# Fail with an explicit message when the file is empty, is not a mapping, or has
# no usable 'OPENAI-API-KEY' entry, instead of an opaque TypeError/KeyError below.
if not isinstance(openai_key, dict) or not openai_key.get('OPENAI-API-KEY'):
    raise KeyError(
        "'OPENAI-API-KEY' is missing or empty in chatgpt-api-credentials.yml"
    )

# Export the key under OPENAI_API_KEY for the rest of the session
# (presumably read by the OpenAI-backed llama-index classes used later).
os.environ['OPENAI_API_KEY'] = openai_key['OPENAI-API-KEY']

In [3]:
from llama_index.core import SimpleDirectoryReader

# Point the reader at the single PDF used throughout this notebook, then load it
# into a list of document objects.
pdf_reader = SimpleDirectoryReader(input_files=['./transformers.pdf'])
documents = pdf_reader.load_data()

In [4]:
# Number of document objects the reader produced from the PDF.
len(documents)

15

### Embedding Model

In [5]:
from llama_index.embeddings.openai import OpenAIEmbedding
# Embedding model that is handed to the vector index (via `embed_model=`) in the indexing step.
embedding_model = OpenAIEmbedding(model="text-embedding-3-large")

### LLM

In [6]:
from llama_index.llms.openai import OpenAI
# LLM that is passed to the response synthesizer and the query engine further down.
llm = OpenAI(model="gpt-4")

# Indexing

In [8]:
from llama_index.core import VectorStoreIndex

# Build the vector index from the loaded documents, using the embedding model
# configured earlier instead of the library default.
index = VectorStoreIndex.from_documents(
    documents,
    embed_model=embedding_model,
)

In [9]:
# Retriever over the index; no arguments are passed, so library defaults apply.
retriever = index.as_retriever()

In [10]:
# Fetch the nodes the retriever returns for a sample question.
retrieved_nodes = retriever.retrieve("What is self attention?")

In [19]:
# How many nodes came back for the sample question.
len(retrieved_nodes)

2

# Response Synthesis

In [20]:
from llama_index.core import get_response_synthesizer

# Response synthesizer driven by the LLM configured earlier.
response_synthesizer = get_response_synthesizer(llm=llm)

# Query Engine

In [21]:
# Assemble the query engine from the index, passing both the LLM and the
# response synthesizer built in the previous step.
query_engine = index.as_query_engine(
    llm=llm,
    response_synthesizer=response_synthesizer,
)


In [22]:
# Run one question through the query engine and keep the result for inspection.
response = query_engine.query("what is self attention?")

In [23]:
# Display the full response object (answer text together with its source nodes).
response

Response(response='Self-attention, also known as scaled dot-product attention, is a mechanism in which the input consists of queries and keys of a certain dimension, and values of another dimension. The dot products of the query with all keys are computed, divided by the square root of the dimension of the keys, and a softmax function is applied to obtain the weights on the values. This attention function can be computed on a set of queries simultaneously, with the keys and values also packed together into matrices. This mechanism is used to allow the model to focus on different parts of the input sequence when producing an output.', source_nodes=[NodeWithScore(node=TextNode(id_='4724d83f-6043-4cb7-ad31-137841e47252', embedding=None, metadata={'page_label': '4', 'file_name': 'transformers.pdf', 'file_path': 'transformers.pdf', 'file_type': 'application/pdf', 'file_size': 2215244, 'creation_date': '2024-09-10', 'last_modified_date': '2024-03-27'}, excluded_embed_metadata_keys=['file_nam

In [27]:
# List the attributes and methods available on the response object.
dir(response)

['__annotations__',
 '__class__',
 '__dataclass_fields__',
 '__dataclass_params__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__match_args__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 'get_formatted_sources',
 'metadata',
 'response',
 'source_nodes']

In [28]:
# Only the generated answer text, without the source nodes.
response.response

'Self-attention, also known as scaled dot-product attention, is a mechanism in which the input consists of queries and keys of a certain dimension, and values of another dimension. The dot products of the query with all keys are computed, divided by the square root of the dimension of the keys, and a softmax function is applied to obtain the weights on the values. This attention function can be computed on a set of queries simultaneously, with the keys and values also packed together into matrices. This mechanism is used to allow the model to focus on different parts of the input sequence when producing an output.'

# Simplified version of all the above steps

In [31]:
from llama_index.llms.openai import OpenAI
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.core import SimpleDirectoryReader, VectorStoreIndex

# Condensed version of the pipeline: load the PDF, build the index, create the
# query engine. The imports are repeated so this cell can be read on its own.

# The file name is spelled exactly as in the ingestion step ('./transformers.pdf');
# a capitalised 'Transformers.pdf' would not resolve on a case-sensitive filesystem.
documents = SimpleDirectoryReader(input_files=['./transformers.pdf']).load_data()

# Models: the LLM generates the answers, the embedding model is used for indexing.
llm = OpenAI(model="gpt-4")
embed_model = OpenAIEmbedding(model="text-embedding-3-large")

# Index the documents with the chosen embedding model.
index = VectorStoreIndex.from_documents(documents, embed_model=embed_model)

# Only the LLM is passed here; unlike the step-by-step version, no explicit
# response synthesizer is supplied.
query_engine = index.as_query_engine(llm=llm)