
**Question Answering using the Llama-2 7B model with 4-bit quantization**

In [None]:
# LlamaIndex itself.
%pip install llama-index
# Hugging Face libraries; bitsandbytes backs the BitsAndBytesConfig quantization used further down.
%pip install transformers accelerate bitsandbytes
# LlamaIndex integration packages: web readers, Hugging Face LLM and Hugging Face embeddings.
%pip install llama-index-readers-web
%pip install llama-index-llms-huggingface
%pip install llama-index-embeddings-huggingface
# OpenAI program/agent integrations — not referenced by the cells visible here; presumably used later (verify).
%pip install llama-index-program-openai
%pip install llama-index-agent-openai

In [None]:
!pip install pdfminer.six

In [25]:
import time

### Data

In [17]:
# PDF reader: extract the raw text from the PDF file

from pdfminer.high_level import extract_text
from llama_index.core import Document

def extract_pdf_text(pdf_path):
    """Return the text of the PDF at ``pdf_path`` without surrounding whitespace.

    The file is opened in binary mode and handed to pdfminer's
    ``extract_text``; the extracted string is stripped before it is returned.
    """
    with open(pdf_path, 'rb') as pdf_file:
        return extract_text(pdf_file).strip()

# Read the PDF's text and wrap it in a single Document; the one-element list is
# what gets passed to VectorStoreIndex.from_documents when the index is built.
text = extract_pdf_text("/content/blade runner 2049.pdf")
documents = [Document(text=text)]

### LLM

This should run on a T4 GPU in the free tier of Google Colab.

In [4]:
# Hugging Face API token used for downloading Llama-2.
# Never hard-code a real token in the notebook: it is read from the HF_TOKEN
# environment variable, falling back to a hidden interactive prompt.
# NOTE: the token that used to be committed on this line must be treated as
# leaked and revoked at https://huggingface.co/settings/tokens.
import os
from getpass import getpass

hf_token = os.environ.get("HF_TOKEN") or getpass("Hugging Face token: ")

In [18]:
# Set up the LLM with 4-bit quantization

import torch
from transformers import BitsAndBytesConfig
from llama_index.core.prompts import PromptTemplate
from llama_index.llms.huggingface import HuggingFaceLLM

# bitsandbytes settings for loading the model weights in 4-bit precision.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    # "nf4" quantization type with double quantization enabled.
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    # Compute dtype is float16.
    bnb_4bit_compute_dtype=torch.float16,
)

# Llama-2 7B chat model wrapped as a LlamaIndex LLM.
llm = HuggingFaceLLM(
    # The same Hugging Face repo id is used for the model and its tokenizer.
    model_name="meta-llama/Llama-2-7b-chat-hf",
    tokenizer_name="meta-llama/Llama-2-7b-chat-hf",
    # Every query string is wrapped in the [INST] ... [/INST] markers.
    query_wrapper_prompt=PromptTemplate("<s> [INST] {query_str} [/INST] "),
    context_window=3900,
    device_map="auto",
    # The access token goes to both loaders; the model additionally receives
    # the 4-bit quantization settings.
    tokenizer_kwargs=dict(token=hf_token),
    model_kwargs=dict(token=hf_token, quantization_config=quantization_config),
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [6]:
from llama_index.core import Settings

# Register the quantized Llama-2 model and a BGE-small embedding model on
# LlamaIndex's Settings object so later cells need not pass them explicitly.
# The "local:" prefix presumably makes LlamaIndex load the embedding model
# locally from Hugging Face (the download log below this cell suggests so) — verify.
Settings.llm = llm
Settings.embed_model = "local:BAAI/bge-small-en-v1.5"

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/94.8k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/743 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/133M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [7]:
from llama_index.core import VectorStoreIndex

# Build a vector store index over the loaded documents
# (presumably embedded with Settings.embed_model — verify).
vector_index = VectorStoreIndex.from_documents(documents)

In [9]:
from llama_index.core.response.notebook_utils import display_response

## Basic Query Engine

### Tree Summarize

In [29]:
# Query the vector index with the "tree_summarize" response mode, chosen here
# for a fast, good-quality response; other response modes such as "compact" or
# "refine" can be tried as well.
# The elapsed time is measured with time.perf_counter(), a monotonic
# high-resolution clock, instead of the wall clock time.time(), which can jump
# when the system clock is adjusted.
start = time.perf_counter()
query_engine = vector_index.as_query_engine(response_mode="tree_summarize")
response = query_engine.query("How many male and female characters are in the movie?")
end = time.perf_counter()
print("The time taken is : ", end-start)
print("------------------------------------------------------------------------------")
display_response(response)

The time taken is :  9.142175197601318
------------------------------------------------------------------------------


**`Final Response:`** Based on the information provided in the context, there are 4 male characters and 3 female characters in the movie:

Male Characters:

1. K
2. Luv
3. Joi
4. Deckard

Female Characters:

1. Mariette
2. The Girl
3. Nianader Wallace