In [8]:
# Install bitsandbytes into the kernel's environment; the test cell below builds a BitsAndBytesConfig for 4-bit loading. A kernel restart may be needed afterwards.
%pip install bitsandbytes

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Note: you may need to restart the kernel to use updated packages.


## Test

In [7]:
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import torch

# Smoke test: load the model with 4-bit quantization and run one short
# generation to confirm that the model and tokenizer work end to end.

# Define model name and cache directory
model_name = "NousResearch/Hermes-3-Llama-3.2-3B"
cache_dir = "/scratch/gilbreth/anand173/model_cache"

# Configuration for 4-bit quantization
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,  # Enable 4-bit quantization of the weights
    bnb_4bit_compute_dtype=torch.float16,  # dtype used for the actual matmuls
    bnb_4bit_use_double_quant=True,  # Also quantize the quantization constants (saves memory)
    bnb_4bit_quant_type="nf4"  # Use NF4 quantization
)

# Load tokenizer
print("Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir=cache_dir)

# Load model with quantization and device mapping
print("Loading model with 4-bit quantization...")
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,  # Apply 4-bit quantization
    device_map="auto",  # Automatically place model layers across available devices
    torch_dtype=torch.float16,  # dtype for the non-quantized modules
    cache_dir=cache_dir
)

# Ensure pad token is set (fall back to EOS when the tokenizer defines none)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
model.config.pad_token_id = tokenizer.pad_token_id

print("Model and tokenizer loaded successfully!")

# Define a test query
test_query = "What is the capital of France?"

# Tokenize the input and move it to the device the model was actually placed
# on. With device_map="auto" that is not guaranteed to be the default "cuda"
# device, so a hard-coded target can cause a device-mismatch error.
inputs = tokenizer(
    test_query,
    return_tensors="pt",
    truncation=True,
    padding=True
).to(model.device)

# Number of prompt tokens; used below to strip the echoed prompt from the output
prompt_length = inputs["input_ids"].shape[-1]

# Generate the output
print("Generating response...")
outputs = model.generate(
    **inputs,
    max_new_tokens=50,           # Limit the number of tokens in the response
    eos_token_id=tokenizer.eos_token_id,
    # Pass the pad token explicitly: generate() reads it from the generation
    # config / call arguments, not necessarily from model.config set above.
    pad_token_id=tokenizer.pad_token_id
)

# Decode only the newly generated tokens; outputs[0] starts with the prompt
# tokens, which would otherwise be repeated in the printed response.
response = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)
print(f"Response: {response}")

Model not found locally. Downloading...


Fetching 10 files:   0%|          | 0/10 [00:00<?, ?it/s]

README.md:   0%|          | 0.00/16.8k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/955 [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/214 [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/1.46G [00:00<?, ?B/s]

.gitattributes:   0%|          | 0.00/1.57k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/444 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/20.9k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/50.3k [00:00<?, ?B/s]

Model downloaded to: /scratch/gilbreth/anand173/model_cache/models--NousResearch--Hermes-3-Llama-3.2-3B/snapshots/f6a109fe836b13b6905f8c16a7388f2f557c3974
Loading tokenizer...
Loading model...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

OutOfMemoryError: CUDA out of memory. Tried to allocate 48.00 MiB. GPU 0 has a total capacty of 23.50 GiB of which 47.94 MiB is free. Process 108824 has 224.00 MiB memory in use. Process 82764 has 354.00 MiB memory in use. Process 21456 has 21.75 GiB memory in use. Including non-PyTorch memory, this process has 1.10 GiB memory in use. Of the allocated memory 848.00 MiB is allocated by PyTorch, and 10.00 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF