## Test

In [4]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

# Hugging Face checkpoint under test and the directory its files are cached in
model_name = "NousResearch/Hermes-3-Llama-3.2-3B"
cache_dir = "/scratch/gilbreth/anand173/model_cache"

# Fetch (or reuse from cache_dir) the tokenizer that matches the checkpoint
print("Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir=cache_dir)

# Fetch (or reuse from cache_dir) the causal-LM weights
print("Loading model...")
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    cache_dir=cache_dir,
    device_map="auto",           # let the library decide layer placement
    torch_dtype=torch.bfloat16,  # load weights as bfloat16
)

# Padding needs a pad token: fall back to EOS when the tokenizer has none,
# then mirror the resulting id onto the model config.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
model.config.pad_token_id = tokenizer.pad_token_id

# Prompt used for the smoke test
test_query = "What is the capital of France?"

# Tokenize the prompt into PyTorch tensors (input ids + attention mask).
# The tensors are moved to the device the model itself reports instead of a
# hard-coded "cuda": the model is loaded with device_map="auto", so it is not
# guaranteed to live on a CUDA device (e.g. on a CPU-only node).
inputs = tokenizer(
    test_query,
    return_tensors="pt",
    truncation=True,
    padding=True
).to(model.device)

# Generate a continuation of the tokenized prompt
print("Generating response...")
outputs = model.generate(
    **inputs,
    max_new_tokens=50,           # cap on the number of newly generated tokens
    eos_token_id=tokenizer.eos_token_id,
    # Pass the pad token explicitly so generation does not depend on the
    # value set on model.config being picked up implicitly.
    pad_token_id=tokenizer.pad_token_id
)

# Decode the first (only) returned sequence, dropping special tokens.
# NOTE: the decoded sequence contains the prompt followed by the new tokens.
response = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(f"Response: {response}")

Loading tokenizer...


tokenizer_config.json:   0%|          | 0.00/50.3k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/444 [00:00<?, ?B/s]

Loading model...


config.json:   0%|          | 0.00/955 [00:00<?, ?B/s]



model.safetensors.index.json:   0%|          | 0.00/20.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]