In [1]:
import torch
from transformers import pipeline
from dotenv import load_dotenv
import os
# Load environment variables from a .env file
load_dotenv()

# Access environment variables
hf_token = os.getenv("HF_TOKEN")

  from .autonotebook import tqdm as notebook_tqdm


In [7]:
!pip install transformers bitsandbytes

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




In [2]:
model_id = "meta-llama/Llama-3.2-3B-Instruct"
pipe = pipeline(
    "text-generation",
    model=model_id,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    token=hf_token,
)

Loading checkpoint shards: 100%|██████████| 2/2 [00:05<00:00,  2.73s/it]
Some parameters are on the meta device because they were offloaded to the cpu.


In [3]:
torch.cuda.is_available()

True

In [4]:
messages = [
    {"role": "system", "content": "You are a helpful assistant"},
    {"role": "user", "content": "What does the fox say?"},
]
outputs = pipe(
    messages,
    max_new_tokens=2000,
)
print(outputs[0]["generated_text"][-1])

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Starting from v4.46, the `logits` model output will have the same type as the model (except at train time, where it will always be FP32)


{'role': 'assistant', 'content': 'According to the famous song by the Norwegian comedy duo Ylvis, the fox says:\n\n"Ring-ding-ding-ding-dingeringeding!\nWa-pa-pa-pa-pa-pa-pow!\nHatee-hatee-hatee-ho!\nJoff-tchoff-tchoff-tchoffo-joff!"\n\nHowever, in reality, foxes don\'t actually make these sounds. They do make various vocalizations, such as barks, howls, and screams, but not exactly like what\'s depicted in the song.'}


## Quantize the model

In [2]:
from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer
import torch
import bitsandbytes as bnb  # Required for quantization

# Define the model and tokenizer
model_id = "meta-llama/Llama-3.2-3B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Load the model with 8-bit quantization
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    load_in_4bit=True,  
    device_map="cuda",  # Automatically place on the correct device
)

# Initialize the pipeline
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    torch_dtype=torch.bfloat16,  # You can still use bf16, or float16
    device_map="cuda",
)

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.
Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.10s/it]


In [3]:
messages = [
    {"role": "system", "content": "You are a helpful assistant"},
    {"role": "user", "content": "How are you doing today?"},
]
# Now you can use the pipeline as usual
output = pipe(messages, max_new_tokens=2000)
print(output)

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Starting from v4.46, the `logits` model output will have the same type as the model (except at train time, where it will always be FP32)


[{'generated_text': [{'role': 'system', 'content': 'You are a helpful assistant'}, {'role': 'user', 'content': 'How are you doing today?'}, {'role': 'assistant', 'content': "I'm doing well, thank you for asking. I'm a large language model, so I don't have emotions or feelings like humans do, but I'm always ready to help and provide information to the best of my abilities. How about you? Is there anything I can assist you with?"}]}]
