In [8]:
%pip install bitsandbytes

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Note: you may need to restart the kernel to use updated packages.


## Test

In [17]:
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import torch

# Model details
model_name = "NousResearch/Hermes-3-Llama-3.2-3B"
cache_dir = "/scratch/gilbreth/anand173/model_cache"

# Configure 4-bit quantization
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",             # Use nf4 quantization for better accuracy
    bnb_4bit_compute_dtype=torch.float16,  # FP16 for GPU computations
)

# Load tokenizer
print("Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    cache_dir=cache_dir,
)

# Load model with manual device mapping
print("Loading model with 4-bit quantization...")
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,       # Use 4-bit quantization
    device_map="auto",                    # Automatically allocate across CPU and GPU
    cache_dir=cache_dir,                  # Cache directory for efficient storage
)

# Ensure pad token is set
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
model.config.pad_token_id = tokenizer.pad_token_id

print("Model and tokenizer loaded successfully!")

# Test query
test_query = "What is the capital of France?"

# Tokenize input
print("Tokenizing input...")
inputs = tokenizer(
    test_query,
    return_tensors="pt",
    truncation=True,
    max_length=128,
    padding=True
).to("cuda:0")  # Ensure tensors are on GPU

# Generate output
print("Generating response...")
outputs = model.generate(
    **inputs,
    max_new_tokens=50,              # Limit the number of tokens in the response
    eos_token_id=tokenizer.eos_token_id
)

# Decode and print the response
response = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(f"Response: {response}")

Loading tokenizer...
Loading model with 4-bit quantization...


ValueError: Some modules are dispatched on the CPU or the disk. Make sure you have enough GPU RAM to fit the quantized model. If you want to dispatch the model on the CPU or the disk while keeping these modules in 32-bit, you need to set `llm_int8_enable_fp32_cpu_offload=True` and pass a custom `device_map` to `from_pretrained`. Check https://huggingface.co/docs/transformers/main/en/main_classes/quantization#offload-between-cpu-and-gpu for more details. 