# Testing the model

## 1. Install dependencies

In [1]:
!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu117 --upgrade
     

!pip install langchain langchain_core einops accelerate transformers bitsandbytes scipy
     

!pip install xformers sentencepiece 
     

!pip install llama-index
%pip install llama-index-embeddings-huggingface
%pip install llama-index-embeddings-langchain
%pip install llama-index-llms-huggingface


!pip install python-dotenv

Looking in indexes: https://download.pytorch.org/whl/cu117
[0mCollecting langchain
  Downloading langchain-0.1.20-py3-none-any.whl.metadata (13 kB)
Collecting langchain_core
  Downloading langchain_core-0.1.52-py3-none-any.whl.metadata (5.9 kB)
Collecting einops
  Downloading einops-0.8.0-py3-none-any.whl.metadata (12 kB)
Collecting accelerate
  Downloading accelerate-0.30.1-py3-none-any.whl.metadata (18 kB)
Collecting transformers
  Downloading transformers-4.40.2-py3-none-any.whl.metadata (137 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m138.0/138.0 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hCollecting bitsandbytes
  Downloading bitsandbytes-0.43.1-py3-none-manylinux_2_24_x86_64.whl.metadata (2.2 kB)
Collecting scipy
  Downloading scipy-1.13.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (60 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.6/60.6 kB[0m [31m16.2 MB/s[0m eta [36m0:00:00[

In [2]:
# Import transformer classes for generation

from transformers import AutoTokenizer, AutoModelForCausalLM, TextStreamer, BitsAndBytesConfig

# Import torch for datatype attributes

import torch

import os
from dotenv import load_dotenv

In [None]:
# Define model variable and load token from .env file

load_dotenv()

name = "meta-llama/Llama-2-70b-chat-hf"
token = env

## 2. Instantiate the tokenizer

In [4]:
# Instantiate tokenizer

tokenizer = AutoTokenizer.from_pretrained(
    name, 
    cache_dir='./model/', 
    use_auth_token=token)



tokenizer_config.json:   0%|          | 0.00/1.62k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

## 3. Instantiate the model

In [5]:
# First define quantization settings (to reduce memory usage and computational requirements), then instantiate model

quantization_config = BitsAndBytesConfig(load_in_4bit=True)

model = AutoModelForCausalLM.from_pretrained(
    name, 
    cache_dir='./model/', 
    use_auth_token=token, 
    torch_dtype=torch.float16,
    quantization_config=quantization_config,
    rope_scaling={"type": "dynamic", "factor": 2}) 




config.json:   0%|          | 0.00/614 [00:00<?, ?B/s]

`low_cpu_mem_usage` was None, now set to True since model is quantized.


model.safetensors.index.json:   0%|          | 0.00/66.7k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/15 [00:00<?, ?it/s]

model-00001-of-00015.safetensors:   0%|          | 0.00/9.85G [00:00<?, ?B/s]

model-00002-of-00015.safetensors:   0%|          | 0.00/9.80G [00:00<?, ?B/s]

model-00003-of-00015.safetensors:   0%|          | 0.00/9.97G [00:00<?, ?B/s]

model-00004-of-00015.safetensors:   0%|          | 0.00/9.80G [00:00<?, ?B/s]

model-00005-of-00015.safetensors:   0%|          | 0.00/9.80G [00:00<?, ?B/s]

model-00006-of-00015.safetensors:   0%|          | 0.00/9.80G [00:00<?, ?B/s]

model-00007-of-00015.safetensors:   0%|          | 0.00/9.97G [00:00<?, ?B/s]

model-00008-of-00015.safetensors:   0%|          | 0.00/9.80G [00:00<?, ?B/s]

model-00009-of-00015.safetensors:   0%|          | 0.00/9.80G [00:00<?, ?B/s]

model-00010-of-00015.safetensors:   0%|          | 0.00/9.80G [00:00<?, ?B/s]

model-00011-of-00015.safetensors:   0%|          | 0.00/9.97G [00:00<?, ?B/s]

model-00012-of-00015.safetensors:   0%|          | 0.00/9.80G [00:00<?, ?B/s]

model-00013-of-00015.safetensors:   0%|          | 0.00/9.80G [00:00<?, ?B/s]

model-00014-of-00015.safetensors:   0%|          | 0.00/9.50G [00:00<?, ?B/s]

model-00015-of-00015.safetensors:   0%|          | 0.00/524M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/15 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/188 [00:00<?, ?B/s]

## 4. Try the model

In [6]:
# Setup a prompt, pass it to the tokenizer and set up the text streamer

prompt = "### User:Who is the richest man in  \
          the world and what is his networth? \
          ### Assistant:"

inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

In [7]:
# Run the model
output = model.generate(**inputs, 
                        streamer=streamer, 
                        use_cache=True, 
                        max_new_tokens=float('inf'))



The richest man in the world is currently Bernard Arnault, with an estimated net worth of over $200 billion. He is the CEO of LVMH, a luxury goods conglomerate that owns brands such as Louis Vuitton, Moet Hennessy, and Hennessy. His net worth is estimated to be around $203 billion, according to Forbes.

However, it's important to note that the ranking of the richest people in the world can change constantly, as the net worth of individuals can fluctuate based on various factors such as market conditions, business performance, and investments.

It's also worth mentioning that there are other billionaires who are not far behind Bernard Arnault in terms of net worth, such as Jeff Bezos, Bill Gates, and Warren Buffett. These individuals have also built impressive fortunes through their successful business ventures and investments.


In [10]:
# Convert the output tokens back to text 

output_text = tokenizer.decode(output[0], skip_special_tokens=True)
output_text

"### User:Who is the richest man in            the world and what is his networth?           ### Assistant: The richest man in the world is currently Bernard Arnault, with an estimated net worth of over $200 billion. He is the CEO of LVMH, a luxury goods conglomerate that owns brands such as Louis Vuitton, Moet Hennessy, and Hennessy. His net worth is estimated to be around $203 billion, according to Forbes.\n\nHowever, it's important to note that the ranking of the richest people in the world can change constantly, as the net worth of individuals can fluctuate based on various factors such as market conditions, business performance, and investments.\n\nIt's also worth mentioning that there are other billionaires who are not far behind Bernard Arnault in terms of net worth, such as Jeff Bezos, Bill Gates, and Warren Buffett. These individuals have also built impressive fortunes through their successful business ventures and investments."

# RAG with LlamaIndex

## 1. Import the prompt wrapper for llamaindex

In [11]:

# Import the prompt wrapper...but for llama index

from llama_index.core.prompts.prompts import SimpleInputPrompt



# Create a system prompt 
system_prompt = """[INST] <>
You are a helpful, respectful and honest assistant. Always answer as 
helpfully as possible, while being safe. Your answers should not include
any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content.
Please ensure that your responses are socially unbiased and positive in nature.

If a question does not make any sense, or is not factually coherent, explain 
why instead of answering something not correct. If you don't know the answer 
to a question, please don't share false information.

Your goal is to provide answers relating to the financial performance of 
the company.<>
"""
# Throw together the query wrapper
query_wrapper_prompt = SimpleInputPrompt("{query_str} [/INST]")


# Complete the query prompt
query_wrapper_prompt.format(query_str='hello')

'hello [/INST]'

## 2. Create a HF LLM using the llama index HuggingFaceLLM wrapper

In [12]:
# Import the llama index HF Wrapper
from llama_index.llms.huggingface import HuggingFaceLLM

# # Create a HF LLM using the llama index wrapper 
llm = HuggingFaceLLM(context_window=4096,
                    max_new_tokens=256,
                    system_prompt=system_prompt,
                    query_wrapper_prompt=query_wrapper_prompt,
                    model=model,
                    tokenizer=tokenizer)


The model `StabilityAI/stablelm-tuned-alpha-3b` and tokenizer `meta-llama/Llama-2-70b-chat-hf` are different, please ensure that they are compatible.


## 3. Import embedding classes and define embedding instance

In [13]:
# Import embeddings wrapper
from llama_index.embeddings.langchain import LangchainEmbedding

# Import HF embeddings - need these to represent document chunks
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
     

# # Create and dl embeddings instance  
embeddings=LangchainEmbedding(
    HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
)

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

## 4. Define a new service context instance with our transformers model as the llm. 
### (This is needed because the default llm for LlamaIndex is OpenAI, not Llama).

In [14]:

# Import dependencies to change service context. This is needed because by default, LlamaIndex works with OpenAI

from llama_index.core import set_global_service_context
from llama_index.core import ServiceContext
     

# # Create new service context instance
service_context = ServiceContext.from_defaults(
    chunk_size=1024,
    llm=llm,
    embed_model=embeddings
)

# # Set the service context
set_global_service_context(service_context)

  service_context = ServiceContext.from_defaults(


## 5. Import dependencies for loading, storing and then retrieving documents.
### (download_loader to load documents and VectorStoreIndex for storing the document chunks)

In [17]:
!pip install --upgrade pymupdf

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting pymupdf
  Downloading PyMuPDF-1.24.3-cp310-none-manylinux2014_x86_64.whl.metadata (3.4 kB)
Collecting PyMuPDFb==1.24.3 (from pymupdf)
  Downloading PyMuPDFb-1.24.3-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (1.4 kB)
Downloading PyMuPDF-1.24.3-cp310-none-manylinux2014_x86_64.whl (3.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.5/3.5 MB[0m [31m20.7 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hDownloading PyMuPDFb-1.24.3-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (15.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m15.8/15.8 MB[0m [31m66.0 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: PyMuPDFb, pymupdf
Successfully installed PyMuPDFb-1.24.3 pymupdf-1.24.3
[0m

In [18]:

# Import dependencies to load documents 

from llama_index.core import VectorStoreIndex, download_loader
from pathlib import Path
     

# Download PDF Loader 
PyMuPDFReader = download_loader("PyMuPDFReader")
# Create PDF Loader
loader = PyMuPDFReader()
# Load documents 
documents = loader.load(file_path=Path('annual_report.pdf'), metadata=True)
     

# Create an index - we'll be able to query this in a sec
index = VectorStoreIndex.from_documents(documents)

  PyMuPDFReader = download_loader("PyMuPDFReader")




[0m

## 6. Set up a query_engine instance that allows us to query the Vector Database using our llm (Llama 2).

In [41]:
# Setup index query engine using LLM 
query_engine = index.as_query_engine()
     

# Test out a query in natural
response = query_engine.query("What was Revenue in FY21 and FY22?")
response.response

' Revenue in FY21 was £145.8 million, and in FY22 it was £163.8 million, which is an increase of 12%.'

### Yay! The result is 100% accurate!!