In [1]:
import os

from dotenv import load_dotenv, find_dotenv
# Read the local .env file; the return value is bound to `_` and never used.
_ = load_dotenv(find_dotenv())

# Hugging Face access token taken from the environment (None when HF_TOKEN is unset).
hf_token = os.environ.get("HF_TOKEN")

In [2]:

from transformers import AutoTokenizer

# Tokenizer for the Llama 3 8B Instruct checkpoint, fetched with the
# Hugging Face token read from the environment.
tokenizer = AutoTokenizer.from_pretrained(
    "meta-llama/Meta-Llama-3-8B-Instruct", token=hf_token
)

# Token ids passed as `stopping_ids` when the LLM is built: the tokenizer's
# EOS id followed by the id of the "<|eot_id|>" token.
stopping_ids = [tokenizer.eos_token_id, tokenizer.convert_tokens_to_ids("<|eot_id|>")]

  from .autonotebook import tqdm as notebook_tqdm
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [3]:
import torch
from llama_index.llms.huggingface import HuggingFaceLLM

# Optional quantization to 4bit
# from transformers import BitsAndBytesConfig

# quantization_config = BitsAndBytesConfig(
#     load_in_4bit=True,
#     bnb_4bit_compute_dtype=torch.float16,
#     bnb_4bit_quant_type="nf4",
#     bnb_4bit_use_double_quant=True,
# )

# Local LLM wrapper: model and tokenizer both come from the same
# Llama 3 8B Instruct repository and share the same access token.
llm = HuggingFaceLLM(
    model_name="meta-llama/Meta-Llama-3-8B-Instruct",
    tokenizer_name="meta-llama/Meta-Llama-3-8B-Instruct",
    model_kwargs={
        "token": hf_token,
        # To use 4-bit instead: comment out the torch_dtype entry and
        # uncomment the quantization_config entry (and its definition above).
        "torch_dtype": torch.bfloat16,
        # "quantization_config": quantization_config
    },
    tokenizer_kwargs={"token": hf_token},
    # Sampling settings forwarded to generation.
    generate_kwargs={"do_sample": True, "temperature": 0.6, "top_p": 0.9},
    stopping_ids=stopping_ids,
)


Loading checkpoint shards: 100%|██████████| 4/4 [00:04<00:00,  1.08s/it]
Some parameters are on the meta device device because they were offloaded to the cpu.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [4]:

# Alternatively, deploy the model on a Hugging Face Inference Endpoint and query it remotely.

from llama_index.llms.huggingface import HuggingFaceInferenceAPI

# NOTE: this rebinds `llm`, so the HuggingFaceLLM instance created in the
# earlier cell is no longer reachable through this name.
llm = HuggingFaceInferenceAPI(
    token=hf_token,
    model_name="meta-llama/Meta-Llama-3-8B-Instruct",
)

  llm = HuggingFaceInferenceAPI(


In [5]:
# Request a completion for a single prompt and print its text form.
response = llm.complete("How to make a cake?")
print(str(response))

 A simple recipe for beginners
Making a cake is a fun and rewarding process, and with this simple recipe, you can create a delicious and moist cake that's perfect for any occasion. Here's a step-by-step guide to making a cake for beginners:

Ingredients:

* 2 cups all-purpose flour
* 1 teaspoon baking powder
* 1 teaspoon baking soda
* 1 teaspoon salt
* 1 cup granulated sugar
* 1/2 cup unsalted butter, softened
* 2 large eggs
* 2 teaspoons vanilla extract
* 1 cup whole milk, at room temperature

Instructions:

1. Preheat your oven to 350°F (180°C). Grease two 9-inch (23cm) round cake pans and line the bottoms with parchment paper.
2. In a medium-sized bowl, whisk together the flour, baking powder, baking soda, and salt. Set aside.
3. In a large mixing bowl, use an electric mixer to cream together the sugar and butter until light and fluffy, about 2-3 minutes.
4. Beat in the eggs one at a time, allowing each egg to fully incorporate before adding the next. Beat in the vanilla extract.
5.

In [6]:
import requests

# URL the raw HTTP requests are sent to.
API_URL = "https://api-inference.huggingface.co/models/meta-llama/Meta-Llama-3-8B-Instruct"

# Bearer-style Authorization header built from the token; plain string
# concatenation raises a TypeError here if hf_token is None.
headers = {
    "Authorization": "Bearer " + hf_token,
}

def query(payload, timeout=60):
    """POST ``payload`` as JSON to ``API_URL`` and return the decoded JSON reply.

    Args:
        payload: JSON-serializable request body, e.g. ``{"inputs": "..."}``.
        timeout: Seconds to wait for the server before giving up. Defaults
            to 60; pass ``None`` to wait indefinitely (the previous behavior).

    Returns:
        The response body parsed from JSON.

    Raises:
        requests.exceptions.Timeout: If the server does not respond within
            ``timeout`` seconds.
        ValueError: If the response body is not valid JSON (requests'
            JSON decode errors subclass ValueError).
    """
    # requests applies no timeout by default, so an unresponsive server
    # would otherwise block this call (and the notebook cell) forever.
    response = requests.post(API_URL, headers=headers, json=payload, timeout=timeout)
    # The HTTP status is deliberately not checked: the body is returned as-is
    # so callers can inspect it. NOTE(review): presumably the API reports
    # errors as JSON bodies - confirm before relying on this.
    return response.json()
	
# Send one text prompt and keep the decoded reply; the prompt's trailing
# space is part of the input.
output = query(
    {"inputs": "Can you please let us know more details about your "}
)