<a href="https://colab.research.google.com/github/SantosCristiano/artificial-intelligence-python/blob/main/%E3%80%90public%E3%80%91Llamaindex_Advanced_RAG_video.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>


# LlamaIndex - Advanced RAG - video

Useful links:
- https://www.llamaindex.ai/


In [None]:
%%capture
!pip install llama-index >> null
!pip install openai >> null
!pip install pypdf >> null   # for reading PDF files
!pip install docx2txt > null # for reading MS doc files

In [None]:
import os
import openai

import logging
import sys
from pprint import pprint

# Send log records at INFO and above to stdout through exactly one handler.
# force=True (Python 3.8+) drops any handlers already attached to the root
# logger before installing this one, so no additional StreamHandler has to be
# added by hand -- stacking a second stdout handler printed every message twice.
logging.basicConfig(stream=sys.stdout, level=logging.INFO, force=True)

from llama_index import (
    VectorStoreIndex,
    SimpleDirectoryReader,
    load_index_from_storage,
    StorageContext,
    ServiceContext,
    Document
)

from llama_index.llms import OpenAI, Anthropic
from llama_index.node_parser import SentenceWindowNodeParser, HierarchicalNodeParser, get_leaf_nodes
from llama_index.text_splitter import SentenceSplitter
from llama_index.embeddings import OpenAIEmbedding, HuggingFaceEmbedding
from llama_index.schema import MetadataMode
from llama_index.postprocessor import MetadataReplacementPostProcessor

# from IPython.display import Markdown, display
# from transformers import AutoTokenizer, T5ForConditionalGeneration

# Step 0:  Authentication with Org ID and API Key

In [None]:
# Never paste a real key into the notebook: it would be saved with the file and
# leak when the notebook is shared. Take it from the OPENAI_API_KEY environment
# variable, and fall back to a hidden prompt when the variable is unset or empty.
from getpass import getpass

openai_key = os.environ.get("OPENAI_API_KEY") or getpass("OpenAI API key: ")
#org_ID = "xxxxxxxxxxxx" #<--- Your Organization ID

In [None]:
# Store the key on the openai module's `api_key` attribute.
# NOTE(review): presumably the llama_index OpenAI wrappers used later read it from there -- confirm.
openai.api_key = openai_key

# Step 1:  Fetch Data and Store into local directory

In [None]:
# create local directory and retrieve file from external source
!mkdir -p 'my_data'
!wget 'https://www.gutenberg.org/cache/epub/72306/pg72306.txt' -O './my_data/teahistory.txt'
!wget 'https://www.gutenberg.org/cache/epub/11367/pg11367.txt' -O './my_data/chinahistory.txt'

--2023-12-13 08:45:56--  https://www.gutenberg.org/cache/epub/72306/pg72306.txt
Resolving www.gutenberg.org (www.gutenberg.org)... 152.19.134.47, 2610:28:3090:3000:0:bad:cafe:47
Connecting to www.gutenberg.org (www.gutenberg.org)|152.19.134.47|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 493827 (482K) [text/plain]
Saving to: ‘./my_data/teahistory.txt’


2023-12-13 08:46:09 (3.60 MB/s) - ‘./my_data/teahistory.txt’ saved [493827/493827]

--2023-12-13 08:46:09--  https://www.gutenberg.org/cache/epub/11367/pg11367.txt
Resolving www.gutenberg.org (www.gutenberg.org)... 152.19.134.47, 2610:28:3090:3000:0:bad:cafe:47
Connecting to www.gutenberg.org (www.gutenberg.org)|152.19.134.47|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 977274 (954K) [text/plain]
Saving to: ‘./my_data/chinahistory.txt’


2023-12-13 08:46:21 (5.09 MB/s) - ‘./my_data/chinahistory.txt’ saved [977274/977274]



# Step 2:  Load files into "Document" Objects

In [None]:
 # Read the files under ./my_data/ into a list of Document objects.
 documents = SimpleDirectoryReader(input_dir="./my_data/").load_data()

# Step 2B (Optional):  Inspect the documents object

In [None]:
# Inspect the documents
# Each document holds the full text of a book, so pretty-printing the whole list
# floods the output. Show the count, then each document's metadata and only the
# first few hundred characters of its text.
PREVIEW_CHARS = 300  # characters of body text shown per document

print("length of doc: " + str(len(documents)))
print("----")
for doc in documents:
    pprint(doc.metadata)
    print(doc.text[:PREVIEW_CHARS] + " ...")
    print("----")


length of doc: 2
----
[Document(id_='569f5583-9b5a-4ac8-9c0b-d998a0355ef3', embedding=None, metadata={'file_path': 'my_data/chinahistory.txt', 'file_name': 'chinahistory.txt', 'file_type': 'text/plain', 'file_size': 977274, 'creation_date': '2023-12-13', 'last_modified_date': '2023-12-05', 'last_accessed_date': '2023-12-13'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={}, hash='53a758c229f0cfbd63ac4e8e5ab7cfc33250a17894a63be43e688edbd6b2f056', text='\ufeffThe Project Gutenberg eBook of A History of China\n    \nThis ebook is for the use of anyone anywhere in the United States and\nmost other parts of the world at no cost and with almost no restrictions\nwhatsoever. You may copy it, give it away or re-use it under the terms\nof the Project Gutenberg License in

In [None]:
# Show the metadata of every loaded document. A cell only displays the value of
# its final expression, so the lookups are gathered into one list instead of
# being written as separate statements (where all but the last are discarded).
[doc.metadata for doc in documents]

{'file_path': 'my_data/teahistory.txt',
 'file_name': 'teahistory.txt',
 'file_type': 'text/plain',
 'file_size': 493827,
 'creation_date': '2023-12-13',
 'last_modified_date': '2023-12-04',
 'last_accessed_date': '2023-12-13'}

# Step 3:  Node Parsing & Indexing (Base & Sentence Window Method)

In [None]:
# Chat model that both service contexts are built with later in the notebook.
llm = OpenAI(temperature=0.1, model="gpt-3.5-turbo")

# Baseline splitter, constructed with no arguments.
base_node_parser = SentenceSplitter()

# Sentence-window parser: the window size and the two metadata keys
# ("window" and "original_text") are spelled out explicitly.
sentence_node_parser = SentenceWindowNodeParser.from_defaults(
    original_text_metadata_key="original_text",
    window_metadata_key="window",
    window_size=3,
)


In [None]:
# Split the same documents twice: once with the baseline splitter and once with
# the sentence-window parser.
base_nodes = base_node_parser.get_nodes_from_documents(documents)
nodes = sentence_node_parser.get_nodes_from_documents(documents)

In [None]:
# Print the node at index 100 from each node list under a labelled banner.
for heading, collection in (("SENTENCE NODES", nodes), ("BASE NODES", base_nodes)):
    print("---------")
    print(heading)
    print("---------")
    print(collection[100])

---------
SENTENCE NODES
---------
Node ID: 1c0fcf4a-6ead-4662-bca7-067306d7db53
Text: We have no desire to show that China's history is the most
glorious or her civilization the oldest in the world.
---------
BASE NODES
---------
Node ID: 69b770a8-68ce-4b2f-91ce-a6678cb04b39
Text: This one fact alone demonstrates that the Hsia rejected Chinese
culture and were nationalistic Hun. Thus there were now two realms in
North China, one undergoing progressive sinification, the other
falling back to the old traditions of the Huns.  3 _Rise of the Toba
to a great Power_  The present province of Szechwan, in the west, had
belonged t...


In [None]:
# Show the baseline node at index 100 as a plain dict so its fields
# (id_, embedding, metadata, relationships, ...) can be read one by one.
dict(base_nodes[100])

{'id_': '69b770a8-68ce-4b2f-91ce-a6678cb04b39',
 'embedding': None,
 'metadata': {'file_path': 'my_data/chinahistory.txt',
  'file_name': 'chinahistory.txt',
  'file_type': 'text/plain',
  'file_size': 977274,
  'creation_date': '2023-12-13',
  'last_modified_date': '2023-12-05',
  'last_accessed_date': '2023-12-13'},
 'excluded_embed_metadata_keys': ['file_name',
  'file_type',
  'file_size',
  'creation_date',
  'last_modified_date',
  'last_accessed_date'],
 'excluded_llm_metadata_keys': ['file_name',
  'file_type',
  'file_size',
  'creation_date',
  'last_modified_date',
  'last_accessed_date'],
 'relationships': {<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='569f5583-9b5a-4ac8-9c0b-d998a0355ef3', node_type=<ObjectType.DOCUMENT: '4'>, metadata={'file_path': 'my_data/chinahistory.txt', 'file_name': 'chinahistory.txt', 'file_type': 'text/plain', 'file_size': 977274, 'creation_date': '2023-12-13', 'last_modified_date': '2023-12-05', 'last_accessed_date': '2023-12-13'}, has

In [None]:
# Sentence-window pipeline: service context plus a vector index over the
# sentence-window nodes.
ctx_sentence = ServiceContext.from_defaults(
    llm=llm,
    embed_model=OpenAIEmbedding(embed_batch_size=50),
    node_parser=sentence_node_parser,
)
sentence_index = VectorStoreIndex(nodes, service_context=ctx_sentence)

# Baseline pipeline: same LLM and embedding settings, baseline parser and nodes.
ctx_base = ServiceContext.from_defaults(
    llm=llm,
    embed_model=OpenAIEmbedding(embed_batch_size=50),
    node_parser=base_node_parser,
)
base_index = VectorStoreIndex(base_nodes, service_context=ctx_base)

# Step 4:  Save to Persistent Storage

In [None]:
# Write each index's storage context to its own directory.
for built_index, target_dir in (
    (sentence_index, "./sentence_index"),
    (base_index, "./base_index"),
):
    built_index.storage_context.persist(persist_dir=target_dir)


In [None]:
# Download to own computer for backup

!zip -r ./indexes.zip ./*_index

from google.colab import files
files.download("./indexes.zip")

  adding: base_index/ (stored 0%)
  adding: base_index/image__vector_store.json (deflated 19%)
  adding: base_index/graph_store.json (stored 0%)
  adding: base_index/index_store.json (deflated 68%)
  adding: base_index/docstore.json (deflated 76%)
  adding: base_index/default__vector_store.json (deflated 62%)
  adding: sentence_index/ (stored 0%)
  adding: sentence_index/image__vector_store.json (deflated 19%)
  adding: sentence_index/graph_store.json (stored 0%)
  adding: sentence_index/index_store.json (deflated 68%)
  adding: sentence_index/docstore.json (deflated 94%)
  adding: sentence_index/default__vector_store.json (deflated 63%)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

# Step 5:  Retrieve from Storage

In [None]:
# Re-open each persisted directory as a storage context (sentence index first,
# then the baseline index).
SC_retrieved_sentence, SC_retrieved_base = (
    StorageContext.from_defaults(persist_dir=folder)
    for folder in ("./sentence_index", "./base_index")
)

In [None]:
# Load one index from each of the two storage contexts.
retrieved_sentence_index, retrieved_base_index = (
    load_index_from_storage(stored_context)
    for stored_context in (SC_retrieved_sentence, SC_retrieved_base)
)

# Step 6: Create query engine

In [None]:
# MetadataReplacementPostProcessor is already imported in the notebook's import
# cell, so it is not imported a second time here.

# Both engines retrieve the same number of nodes so that their answers can be
# compared like for like; change it in one place.
SIMILARITY_TOP_K = 5

sentence_query_engine = retrieved_sentence_index.as_query_engine(
    similarity_top_k=SIMILARITY_TOP_K,
    verbose=True,
    # The postprocessor is pointed at the "window" metadata key -- the same key
    # the sentence-window node parser was configured to write.
    node_postprocessors=[
        MetadataReplacementPostProcessor(target_metadata_key="window")
    ],
)

base_query_engine = retrieved_base_index.as_query_engine(
    similarity_top_k=SIMILARITY_TOP_K,
    verbose=True,
)

# Step 7:  Inference

In [None]:
# Prompt sent to both query engines; adjacent string literals are joined into
# one string, so the text is unchanged.
question = (
    "Something happened in the United States 10 years after the first "
    "American ships sailed for China which could have made it more "
    "expensive to purchase tea. what happened that year? "
    "Try to break down your answer into steps."
)

In [None]:
# Put the question to the baseline engine and print its answer.
base_response = base_query_engine.query(question)
print(base_response)

1. The first American ships sailed for China in 1784, bringing back 880,000 pounds of Tea.
2. During 1786-87, five other ships brought over 1,000,000 pounds of Tea to the United States.
3. In 1790, the earliest official record of the importation of Tea into the United States was made.
4. The order of increase for its importation, value, and consumption in the country by decades since 1790 is provided.
5. Something happened in the United States 10 years after the first American ships sailed for China, which could have made it more expensive to purchase tea.
6. To find out what happened that year, we need to look at the information provided in the context.


In [None]:
# Put the same question to the sentence-window engine and print its answer.
sentence_response = sentence_query_engine.query(question)
print(sentence_response)

Step 1: The first American ships sailed for China in 1784, bringing back 880,000 pounds of tea. 

Step 2: During 1786-87, five other ships brought over 1,000,000 pounds of tea to the United States. 

Step 3: In 1794, the rates of duty on tea were increased by 75 percent on direct importations and 100 percent on indirect importations. 

Step 4: Therefore, 10 years after the first American ships sailed for China, the United States increased the rates of duty on tea, which could have made it more expensive to purchase tea.
