In [None]:
!pip install llama-index llama-index-llms-huggingface llama-index-embeddings-huggingface llama-index-readers-web transformers accelerate -q

## Getting Started

In [None]:
from llama_index.core import ServiceContext
from llama_index.core import Settings
from llama_index.core import VectorStoreIndex
from llama_index.core.prompts.base import PromptTemplate
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.llms.huggingface import HuggingFaceLLM
from llama_index.readers.web import BeautifulSoupWebReader

In [None]:
def messages_to_prompt(messages):
  """Render a chat history as a single Zephyr-style prompt string.

  Args:
    messages: iterable of objects exposing ``role`` and ``content``.
      Only the roles 'system', 'user' and 'assistant' are rendered;
      a message with any other role is skipped.

  Returns:
    The concatenated prompt string. It always begins with a system block
    (an empty one is inserted when the history does not start with one)
    and always ends with an open assistant tag, matching the layout of
    the query wrapper prompt used for plain completions.
  """
  prompt = ""
  for message in messages:
    if message.role == 'system':
      prompt += f"<|system|>\n{message.content}</s>\n"
    elif message.role == 'user':
      prompt += f"<|user|>\n{message.content}</s>\n"
    elif message.role == 'assistant':
      prompt += f"<|assistant|>\n{message.content}</s>\n"

  # The chat format starts with a system block; insert an empty one if absent.
  if not prompt.startswith("<|system|>\n"):
    prompt = "<|system|>\n</s>\n" + prompt

  # Leave an open assistant tag so generation continues as the assistant's reply.
  prompt += "<|assistant|>\n"

  # The original fell off the end here and implicitly returned None.
  return prompt

In [None]:
# Single checkpoint location, passed as both model_name and tokenizer_name.
zephyr_path = "/home/aac/zephyr-7b-alpha"

# Wrapper applied around a plain query: empty system turn, the query as the
# user turn, then an open assistant turn.
zephyr_query_template = PromptTemplate("<|system|>\n</s>\n<|user|>\n{query_str}</s>\n<|assistant|>\n")

# Sampling options passed through as generate_kwargs.
sampling_options = dict(do_sample=True, temperature=0.7, top_k=50, top_p=0.95)

llm = HuggingFaceLLM(
    model_name=zephyr_path,
    tokenizer_name=zephyr_path,
    device_map="cuda",
    context_window=3900,
    max_new_tokens=256,
    query_wrapper_prompt=zephyr_query_template,
    messages_to_prompt=messages_to_prompt,
    model_kwargs=dict(use_safetensors=False),
    generate_kwargs=sampling_options,
)

## Raw Prompting

In [None]:
# Ask the LLM the question directly, with no supporting context supplied.
# `question` is reused by the prompt-engineering and query-engine cells below.
question = "How does Paul Graham recommend to work hard? Can you list it as steps"
response = llm.complete(question)
print(response)

### Prompt Engineering

In [None]:
# Page used as the source of context for the question.
url = "https://paulgraham.com/hwh.html"

# load_data is given a one-element list of URLs; `documents` is reused by the
# prompt-building cell and by the index-building cell further down.
documents = BeautifulSoupWebReader().load_data([url])

In [None]:
# Use the text of the first loaded document as the context.
context = documents[0].text
# Inline the full context and the question into one prompt that instructs the
# model to answer "I don't know" when the context does not contain the answer.
# NOTE(review): the whole document text is inlined here; confirm it fits within
# the LLM's configured context_window=3900.
prompt = f"""Answer the question based on the context below. If the
question cannot be answered using the information provided answer
with "I don't know".

Context: {context}

Question: {question}

Answer: """
response = llm.complete(prompt)
print(response)

In [None]:
# Run the same prompt again. The LLM is configured with do_sample=True, so the
# output may differ from the previous cell's.
response = llm.complete(prompt)
print(response)

### Retrieval-Augmented Generation

In [None]:
service_context = ServiceContext.from_defaults(llm=llm, embed_model="local:BAAI/bge-base-en-v1.5", chunk_size=256, chunk_overlap=32)

In [None]:
# build basic RAG system
index = VectorStoreIndex.from_documents(documents, service_context=service_context)
query_engine = index.as_query_engine(similarity_top_k=8)

In [None]:
# Ask the same question as in the raw-prompting cell, this time through the
# query engine built on the vector index.
response = query_engine.query(question)
print(response)