In [1]:
import os

from dotenv import load_dotenv, find_dotenv
# Locate the nearest .env file and merge its entries into the process environment.
dotenv_path = find_dotenv()
_ = load_dotenv(dotenv_path)

# Hugging Face access token; None when HF_TOKEN is not set.
hf_token = os.environ.get("HF_TOKEN")

In [2]:

from transformers import AutoTokenizer

# Hub repository whose tokenizer is loaded (authenticated with hf_token).
llama3_repo_id = "meta-llama/Meta-Llama-3-8B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(llama3_repo_id, token=hf_token)

# Token ids handed to the LLM as stop markers: the tokenizer's EOS id plus
# the id of the "<|eot_id|>" special token.
eot_token_id = tokenizer.convert_tokens_to_ids("<|eot_id|>")
stopping_ids = [tokenizer.eos_token_id, eot_token_id]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [3]:
import torch
from llama_index.llms.huggingface import HuggingFaceLLM

# Optional quantization to 4bit
from transformers import BitsAndBytesConfig

# 4-bit loading options: NF4 quantization type, double quantization enabled,
# float16 as the compute dtype.
bnb_4bit_options = {
    "load_in_4bit": True,
    "bnb_4bit_quant_type": "nf4",
    "bnb_4bit_use_double_quant": True,
    "bnb_4bit_compute_dtype": torch.float16,
}
quantization_config = BitsAndBytesConfig(**bnb_4bit_options)

# Local LLM
llm = HuggingFaceLLM(
    model_name="meta-llama/Meta-Llama-3-8B-Instruct",
    # Forwarded to the underlying model loader.
    model_kwargs={
        "token": hf_token,
        "torch_dtype": torch.bfloat16,  # comment this line and uncomment below to use 4bit
        # "quantization_config": quantization_config
    },
    # Forwarded to every generate() call: nucleus sampling at a moderate temperature.
    generate_kwargs={
        "do_sample": True,
        "temperature": 0.6,
        "top_p": 0.9,
        # Without an explicit pad token, generate() falls back to the EOS id and
        # logs "Setting `pad_token_id` to `eos_token_id`" on each call; setting
        # it here keeps the output clean.
        "pad_token_id": tokenizer.eos_token_id,
    },
    tokenizer_name="meta-llama/Meta-Llama-3-8B-Instruct",
    tokenizer_kwargs={"token": hf_token},
    # Generation halts as soon as any of these token ids is produced.
    stopping_ids=stopping_ids,
)




Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Some parameters are on the meta device device because they were offloaded to the cpu.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [4]:

## Alternatively, deploy the model on a Hugging Face Inference Endpoint and query it remotely instead of loading it locally

from llama_index.llms.huggingface import HuggingFaceInferenceAPI

# llm = HuggingFaceInferenceAPI(
#     model_name="meta-llama/Meta-Llama-3-8B-Instruct",
#     token=hf_token
# )

In [5]:
# Single-shot text completion against the locally loaded model.
cheesecake_prompt = "How to make a cheesecake?"
response = llm.complete(cheesecake_prompt)
print(response)

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


 A simple recipe to make a delicious cheesecake
How to make a cheesecake? A simple recipe to make a delicious cheesecake
Cheesecake is a classic dessert that is loved by many. It's a creamy, rich, and decadent treat that can be made in a variety of flavors and textures. In this article, we'll provide a simple recipe to make a delicious cheesecake.
Ingredients:
For the crust:
* 1 1/2 cups graham cracker crumbs
* 1/4 cup granulated sugar
* 6 tablespoons (3/4 stick) unsalted butter, melted

For the cheesecake:
* 2 pounds cream cheese, softened
* 3 large eggs
* 1/2 cup granulated sugar
* 1/2 cup sour cream
* 1 teaspoon vanilla extract
* 1/4 teaspoon salt

Instructions:
1. Preheat your oven to 350°F (180°C).
2. Prepare the crust: In a medium bowl, mix together the graham cracker crumbs, sugar, and melted butter until well combined. Press the mixture into the bottom of a 9-inch springform pan.
3. Prepare the cheesecake: In a large mixing bowl, beat the cream


In [6]:
import requests

# Hosted inference URL for the base (non-Instruct) Meta-Llama-3-8B repository.
API_URL = "https://api-inference.huggingface.co/models/meta-llama/Meta-Llama-3-8B"

# Bearer-token auth header; the concatenation raises TypeError if hf_token is None.
headers = dict(Authorization="Bearer " + hf_token)

def query(payload, timeout=60):
    """POST ``payload`` as JSON to ``API_URL`` and return the decoded JSON reply.

    Args:
        payload: JSON-serialisable request body (e.g. ``{"inputs": "..."}``).
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` waits indefinitely, which can hang the cell.

    Returns:
        The parsed JSON body of the response. The HTTP status is not checked,
        so an error payload returned by the API is passed through as-is.

    Raises:
        requests.exceptions.Timeout: If the server does not answer in time.
    """
    response = requests.post(API_URL, headers=headers, json=payload, timeout=timeout)
    return response.json()
	
# Open-ended prompt for the model to continue; note the intentional trailing space.
completion_payload = {"inputs": "Can you please let us know more details about your "}
output = query(completion_payload)
print(output)

{'error': 'The model meta-llama/Meta-Llama-3-8B is too large to be loaded automatically (16GB > 10GB). Please use Spaces (https://huggingface.co/spaces) or Inference Endpoints (https://huggingface.co/inference-endpoints).'}


In [7]:
import llama_cpp