In [1]:
import torch

from llama_index.core import (
    VectorStoreIndex,
    SimpleDirectoryReader,
    Settings
)

from llama_index.llms.huggingface import HuggingFaceLLM
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core.prompts import PromptTemplate

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Read the files found under the relative "data" directory into a list of documents.
documents = SimpleDirectoryReader(input_dir="data").load_data()

In [3]:
# System instruction handed to the LLM together with every query.
system_prompt = """
You are a Q&A assistant.
Your goal is to answer question as accurately as possible based on the instructions and context provided.
"""

# Prompt wrapper
# The model loaded later in this notebook is a Llama-2 *chat* checkpoint, which
# is fine-tuned on user turns delimited by [INST] ... [/INST]. The previous
# <|USER|>/<|ASSISTANT|> markers come from the StableLM chat format and carry no
# special meaning for Llama-2, so the wrapper uses the Llama-2 delimiters.
query_wrapper_prompt = PromptTemplate(
    "[INST] {query_str} [/INST]"
)

In [None]:
# Shell escape: log this environment in to the Hugging Face Hub via the CLI.
# NOTE(review): presumably needed because the meta-llama checkpoint loaded
# further down is access-gated — confirm, and make sure no token ends up in
# the saved cell output.
!huggingface-cli login

In [None]:
# One Hub repo id is used for both the model weights and the tokenizer.
llama_checkpoint = "meta-llama/Llama-2-7b-chat-hf"

# Options passed as generate_kwargs: sampling disabled, temperature 0.0.
# NOTE(review): with do_sample=False the temperature value is likely unused —
# confirm against the transformers generation docs before relying on it.
generation_options = dict(temperature=0.0, do_sample=False)

# Options passed as model_kwargs: float16 dtype combined with 8-bit loading.
loading_options = dict(torch_dtype=torch.float16, load_in_8bit=True)

# Local Hugging Face LLM wrapper, configured with the prompts defined earlier.
llm = HuggingFaceLLM(
    model_name=llama_checkpoint,
    tokenizer_name=llama_checkpoint,
    device_map="auto",
    model_kwargs=loading_options,
    generate_kwargs=generation_options,
    context_window=4096,
    max_new_tokens=256,
    system_prompt=system_prompt,
    query_wrapper_prompt=query_wrapper_prompt,
)

In [None]:
# Sentence-transformers checkpoint used to turn text into embedding vectors.
embedding_checkpoint = "sentence-transformers/all-mpnet-base-v2"
embed_model = HuggingFaceEmbedding(model_name=embedding_checkpoint)

In [None]:
# Register the notebook-wide defaults on the llama_index Settings object.
# Split documents with a sentence-aware splitter using a chunk size of 1024.
Settings.node_parser = SentenceSplitter(chunk_size=1024)
# Embedding model and LLM created in the cells above.
Settings.embed_model = embed_model
Settings.llm = llm

In [None]:
# Build a vector index over the documents loaded earlier.
index = VectorStoreIndex.from_documents(documents=documents)

# Obtain a query engine from the index with its default configuration.
query_engine = index.as_query_engine()

# Smoke-test the pipeline with a broad question and print the answer.
first_question = "What is this document about?"
response = query_engine.query(first_question)
print(response)

In [None]:
# Ask a topic-specific question. The query text is sent to the query engine
# verbatim, so the earlier typo ("ia") is corrected to keep the request
# well-formed.
response = query_engine.query("What is attention in transformers")

In [None]:
# Show the text form of the most recent query result.
print(str(response))