In [None]:
!pip install llama-index transformers



# Setup

#### Download Data

In [None]:
!mkdir -p 'data/paul_graham/'
!wget 'https://raw.githubusercontent.com/jerryjliu/llama_index/main/docs/examples/data/paul_graham/paul_graham_essay.txt' -O 'data/paul_graham/paul_graham_essay.txt'

--2023-10-28 06:09:40--  https://raw.githubusercontent.com/jerryjliu/llama_index/main/docs/examples/data/paul_graham/paul_graham_essay.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 75042 (73K) [text/plain]
Saving to: ‘data/paul_graham/paul_graham_essay.txt’


2023-10-28 06:09:40 (2.58 MB/s) - ‘data/paul_graham/paul_graham_essay.txt’ saved [75042/75042]



#### Load Data

In [None]:
from llama_index import SimpleDirectoryReader

# Point the reader at the download directory and load its contents;
# `documents` is the input to every index built further down.
reader = SimpleDirectoryReader(input_dir="./data/paul_graham/")
documents = reader.load_data()

# Building a QA System with an Open-Source LLM

In [None]:
import os
from getpass import getpass

import openai
from llama_index import ServiceContext, VectorStoreIndex
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.llms.anyscale import Anyscale

# Credentials must never be stored in the notebook. Each one is taken from the
# environment and, if it is missing, prompted for once and cached in os.environ
# so that later cells in this kernel session do not have to prompt again.
# NOTE(review): the Anyscale token and OpenAI key that used to be hardcoded in
# this cell should be treated as leaked and revoked with the providers.
if not os.environ.get("ANYSCALE_ENDPOINT_TOKEN"):
    os.environ["ANYSCALE_ENDPOINT_TOKEN"] = getpass("Anyscale endpoint token: ")
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass("OpenAI API key: ")

# Later cells reference ANYSCALE_ENDPOINT_TOKEN by name, so keep it defined here.
ANYSCALE_ENDPOINT_TOKEN = os.environ["ANYSCALE_ENDPOINT_TOKEN"]
openai.api_key = os.environ["OPENAI_API_KEY"]

# Define LLM: Llama-2 13B chat, addressed through the Anyscale LLM class.
llm = Anyscale(
    model="meta-llama/Llama-2-13b-chat-hf",
    api_key=ANYSCALE_ENDPOINT_TOKEN,
)

# Define Embedding Model: OpenAI's text-embedding-ada-002.
embed_model = OpenAIEmbedding(model="text-embedding-ada-002")

# Bundle the LLM and the embedding model into one service context.
service_context = ServiceContext.from_defaults(
    llm=llm,
    embed_model=embed_model,
)

# Create index over the loaded documents using that service context.
index = VectorStoreIndex.from_documents(documents, service_context=service_context)

In [None]:
# Set up a query engine on top of the index; no arguments are passed, so the
# library defaults apply.
query_engine = index.as_query_engine()

In [None]:
# Ask the question; `response` is rendered by the next cell.
response = query_engine.query("why did paul graham start YC?")

In [None]:
from html import escape

from IPython.display import display, HTML

# The answer is free-form model output, so escape it before interpolating it
# into markup: a stray "<" or "&" would otherwise be interpreted as HTML.
# str() keeps the previous rendering ("None") if no answer text is present.
answer_html = escape(str(response.response))
display(HTML(f'<p style="font-size:20px">{answer_html}</p>'))

# Building a QA System with an Open-Source LLM and Open-Source Embeddings

In [None]:
from llama_index.embeddings import HuggingFaceEmbedding

# LLM: same Anyscale-hosted Llama-2 chat model as in the previous section.
llm = Anyscale(
    api_key=ANYSCALE_ENDPOINT_TOKEN,
    model="meta-llama/Llama-2-13b-chat-hf",
)

# Embeddings: BAAI/bge-small-en-v1.5 loaded through HuggingFaceEmbedding.
embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")

# Service context tying the LLM and the embedding model together.
service_context = ServiceContext.from_defaults(
    embed_model=embed_model,
    llm=llm,
)

# Rebuild the index over the same documents with the new service context.
index = VectorStoreIndex.from_documents(
    documents,
    service_context=service_context,
)

In [None]:
# Set up a query engine on the index built with the HuggingFace embedding model;
# no arguments are passed, so the library defaults apply.
query_engine = index.as_query_engine()

In [None]:
%%timeit -r 1 -n 1
response = query_engine.query("why did paul graham start YC?")

from IPython.display import display, HTML
display(HTML(f'<p style="font-size:20px">{response.response}</p>'))



APIError: ignored

### Let's use Optimum Embeddings from HuggingFace

You can install the dependencies with `pip install transformers optimum[exporters]`.


In [None]:
!pip install transformers optimum onnxruntime onnx

Collecting optimum[exporters]
  Downloading optimum-1.13.2.tar.gz (300 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m301.0/301.0 kB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting coloredlogs (from optimum[exporters])
  Downloading coloredlogs-15.0.1-py2.py3-none-any.whl (46 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m46.0/46.0 kB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
Collecting datasets (from optimum[exporters])
  Downloading datasets-2.14.5-py3-none-any.whl (519 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m519.6/519.6 kB[0m [31m11.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting onnx (from optimum[exporters])
  Downloading onnx-1.14.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (14.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━

First, we need to create the ONNX model. ONNX models provide improved inference speeds and can be used across platforms (e.g. in Transformers.js).

In [None]:
from llama_index.embeddings import OptimumEmbedding

# Export BAAI/bge-small-en-v1.5 and save the result under ./bge_onnx; the next
# cell loads the model back from that folder.
OptimumEmbedding.create_and_save_optimum_model("BAAI/bge-small-en-v1.5", "./bge_onnx")

Framework not specified. Using pt to export to ONNX.
Using the export variant default. Available variants are:
	- default: The default ONNX variant.
Using framework PyTorch: 2.0.1+cu118
Overriding 1 configuration item(s)
	- use_cache -> False


verbose: False, log level: Level.ERROR

Saved optimum model to ./bge_onnx. Use it with `embed_model = OptimumEmbedding(folder_name='./bge_onnx')`.


In [None]:
# Load the embedding model from the ./bge_onnx folder written by the export cell.
embed_model = OptimumEmbedding(folder_name="./bge_onnx")

In [None]:
# LLM: Anyscale-hosted Llama-2 chat model.
llm = Anyscale(
    api_key=ANYSCALE_ENDPOINT_TOKEN,
    model="meta-llama/Llama-2-13b-chat-hf",
)

# Service context combining the LLM with the `embed_model` defined in the
# preceding cell.
service_context = ServiceContext.from_defaults(
    embed_model=embed_model,
    llm=llm,
)

# Build the index over the documents with this service context.
index = VectorStoreIndex.from_documents(
    documents,
    service_context=service_context,
)

In [None]:
# Create a query engine on the index built with the Optimum embedding model;
# no arguments are passed, so the library defaults apply.
query_engine = index.as_query_engine()

In [None]:
from html import escape

from IPython.display import display, HTML

response = query_engine.query("why did paul graham start YC?")

# The answer is free-form model output, so escape it before interpolating it
# into markup: a stray "<" or "&" would otherwise be interpreted as HTML.
answer_html = escape(str(response.response))
display(HTML(f'<p style="font-size:20px">{answer_html}</p>'))

9.35 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


# Customizing the chunk size, chunk overlap, LLM context window, and number of output tokens

In [None]:
import os
from getpass import getpass

from llama_index import ServiceContext, LLMPredictor, PromptHelper, VectorStoreIndex
from llama_index.llms import OpenAI
from llama_index.node_parser import SimpleNodeParser
from llama_index.embeddings import HuggingFaceEmbedding
from llama_index.llms.anyscale import Anyscale

# Define LLM
# The Anyscale token must never be stored in the notebook. It is taken from the
# environment and, if it is missing, prompted for once and cached in os.environ
# so that other cells in this kernel session do not have to prompt again.
# NOTE(review): the token that used to be hardcoded in this cell should be
# treated as leaked and revoked.
if not os.environ.get("ANYSCALE_ENDPOINT_TOKEN"):
    os.environ["ANYSCALE_ENDPOINT_TOKEN"] = getpass("Anyscale endpoint token: ")
ANYSCALE_ENDPOINT_TOKEN = os.environ["ANYSCALE_ENDPOINT_TOKEN"]

llm = Anyscale(
    model="meta-llama/Llama-2-13b-chat-hf",
    api_key=ANYSCALE_ENDPOINT_TOKEN,
)

# loads BAAI/bge-small-en-v1.5
embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")

# Create Node Parser: chunk_size and chunk_overlap control how documents are split.
node_parser = SimpleNodeParser.from_defaults(chunk_size=2000, chunk_overlap=100)

# Create PromptHelper: context window, number of output tokens, and the
# chunk-overlap ratio.
prompt_helper = PromptHelper(
    context_window=4096,
    num_output=512,
    chunk_overlap_ratio=0.1,
)

# Customise LLM, embedding model, node parser and prompt helper in one service context.
service_context = ServiceContext.from_defaults(
    llm=llm,
    embed_model=embed_model,
    node_parser=node_parser,
    prompt_helper=prompt_helper,
)

# Create Index
index = VectorStoreIndex.from_documents(documents, service_context=service_context)

In [None]:
# Set up a query engine on the index built with the customised service context;
# no arguments are passed, so the library defaults apply.
query_engine = index.as_query_engine()

In [None]:
from html import escape

from IPython.display import display, HTML

response = query_engine.query("why did paul graham start YC?")

# The answer is free-form model output, so escape it before interpolating it
# into markup: a stray "<" or "&" would otherwise be interpreted as HTML.
answer_html = escape(str(response.response))
display(HTML(f'<p style="font-size:20px">{answer_html}</p>'))

# Saving and Loading the Index

In [None]:
from llama_index import StorageContext, load_index_from_storage
from llama_index.node_parser import SimpleNodeParser

# Parse the documents into nodes (chunk size 2000, chunk overlap 100).
node_parser = SimpleNodeParser.from_defaults(chunk_overlap=100, chunk_size=2000)
nodes = node_parser.get_nodes_from_documents(documents)

# Storage context created with the default stores.
storage_context = StorageContext.from_defaults()

# Build the index directly from the parsed nodes.
index = VectorStoreIndex(
    nodes,
    service_context=service_context,
    storage_context=storage_context,
)

# Save the index by persisting its storage context to the "storage" directory.
index.storage_context.persist(persist_dir="storage")

In [None]:
from html import escape

from IPython.display import display, HTML

# To load the index later, first set up a storage context pointing at the
# persist directory; this loads the persisted stores from persist_dir.
storage_context = StorageContext.from_defaults(persist_dir="storage")

# Then load the index object. If the persist dir holds several indexes, also
# pass index_id=... to select one.
loaded_index = load_index_from_storage(
    storage_context=storage_context,
    service_context=service_context,
)

# Set up the query engine (top 3 most similar nodes) and run the query.
query_engine = loaded_index.as_query_engine(similarity_top_k=3)
response = query_engine.query("why did paul graham start YC?")

# Show the synthesized response. It is free-form model output, so escape it
# before interpolating it into markup.
answer_html = escape(str(response.response))
display(HTML(f'<p style="font-size:20px">{answer_html}</p>'))

# Counting Prompt Tokens and Checking the Underlying Prompt

In [None]:
from llama_index import set_global_service_context
from llama_index.callbacks import CallbackManager, TokenCountingHandler
import tiktoken

# Count tokens with the tiktoken encoder that matches gpt-3.5-turbo.
gpt35_encode = tiktoken.encoding_for_model("gpt-3.5-turbo").encode
token_counter = TokenCountingHandler(tokenizer=gpt35_encode)

# Register the counter as the only callback handler.
callback_manager = CallbackManager([token_counter])

llm = OpenAI(model="gpt-3.5-turbo")

# Service context carrying the LLM and the callback manager ...
service_context = ServiceContext.from_defaults(
    callback_manager=callback_manager,
    llm=llm,
)

# ... registered as the global default.
set_global_service_context(service_context)

In [None]:
# No service_context is passed here; the previous cell registered one globally
# with set_global_service_context, which is presumably what gets used.
index = VectorStoreIndex.from_documents(documents)

In [None]:
# Embedding tokens recorded by the counter so far (i.e. while building the index).
print(token_counter.total_embedding_token_count)

16662


Let's reset the token counts so that the next measurement covers only the query.



In [None]:
# Reset the counter before measuring the query.
token_counter.reset_counts()

In [None]:
# Check the embedding token count again after the reset.
print(token_counter.total_embedding_token_count)

0


In [None]:
# Query engine with similarity_top_k=4, then run one query so the token
# counter has something to report in the following cells.
query_engine = index.as_query_engine(similarity_top_k=4)
response = query_engine.query("Why did author start YC?")

In [None]:
# Label/value pairs for the token summary, each followed by a newline item;
# print() joins all items with single spaces, exactly as a flat argument list would.
token_summary = [
    "Embedding Tokens: ", token_counter.total_embedding_token_count, "\n",
    "LLM Prompt Tokens: ", token_counter.prompt_llm_token_count, "\n",
    "LLM Completion Tokens: ", token_counter.completion_llm_token_count, "\n",
    "Total LLM Token Count: ", token_counter.total_llm_token_count, "\n",
]
print(*token_summary)

Embedding Tokens:  15 
 LLM Prompt Tokens:  9037 
 LLM Completion Tokens:  310 
 Total LLM Token Count:  9347 



In [None]:
# Inspect the first LLM call recorded by the token counter: the prompt text,
# the completion text, and the token count of each.
first_llm_event = token_counter.llm_token_counts[0]

print("prompt: ", first_llm_event.prompt, "...\n")
print("prompt token count: ", first_llm_event.prompt_token_count, "\n")

print("completion: ", first_llm_event.completion, "...\n")
print("completion token count: ", first_llm_event.completion_token_count, "\n")

print("total token count", first_llm_event.total_token_count)

prompt:  system: You are an expert Q&A system that is trusted around the world.
Always answer the query using the provided context information, and not prior knowledge.
Some rules to follow:
1. Never directly reference the given context in your answer.
2. Avoid statements like 'Based on the context, ...' or 'The context information ...' or anything along those lines.
user: Context information is below.
---------------------
What I Worked On

February 2021

Before college the two main things I worked on, outside of school, were writing and programming. I didn't write essays. I wrote what beginning writers were supposed to write then, and probably still are: short stories. My stories were awful. They had hardly any plot, just characters with strong feelings, which I imagined made them deep.

The first programs I tried writing were on the IBM 1401 that our school district used for what was then called "data processing." This was in 9th grade, so I was 13 or 14. The school district's 1401 