# Single-Question Inference (2 Base Models)

Use this notebook to ask one question to either Mistral-7B-Instruct or Llama-3-8B-Instruct.


## Notes
- A GPU is recommended (7B/8B models are large).
- Llama-3 models are gated: accept the Meta license on Hugging Face and provide an access token through the `HF_TOKEN` environment variable.
- Models are pre-cached on the cluster at `/data/cat/ws/albu670g-qa-model/models`; if that directory is not available, the notebook falls back to `../models`.


In [1]:
# Core deps
import os

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

UsageError: Line magic function `%export` not found.


In [2]:
# Hugging Face access token (needed for gated repositories such as Llama-3).
# Export HF_TOKEN in the shell before starting the kernel; never commit a real
# token to the notebook. `setdefault` keeps a token that is already present in
# the environment instead of overwriting it with an empty string.
os.environ.setdefault("HF_TOKEN", "")

In [3]:
# Supported checkpoints, keyed by a short alias
MODEL_IDS = dict(
    mistral="mistralai/Mistral-7B-Instruct-v0.2",
    llama3="meta-llama/Meta-Llama-3-8B-Instruct",
)

# Alias of the checkpoint to load; set to "llama3" to use the Llama model instead
selected_key = "mistral"
model_id = MODEL_IDS[selected_key]

# Where model files are cached: the shared cluster directory when it exists,
# otherwise a local "../models" directory resolved against the working directory
CACHE_DIR = os.path.abspath("/data/cat/ws/albu670g-qa-model/models")
CACHE_DIR = CACHE_DIR if os.path.isdir(CACHE_DIR) else os.path.abspath("../models")


In [None]:
# Choose the execution device together with its precision:
# float16 when CUDA is available, float32 on CPU
device, dtype = (
    ("cuda", torch.float16) if torch.cuda.is_available() else ("cpu", torch.float32)
)

# Tokenizer and weights are both resolved through the configured cache directory
tokenizer = AutoTokenizer.from_pretrained(model_id, cache_dir=CACHE_DIR)
model = AutoModelForCausalLM.from_pretrained(
    model_id, cache_dir=CACHE_DIR, dtype=dtype
).to(device)

# Put the model in evaluation mode
model.eval()


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [None]:
def build_prompt(question: str) -> str:
    """Build a model-appropriate prompt for a single user question.

    When the tokenizer defines a chat template, the prompt is rendered through
    it (untokenized, with the generation prompt appended). Otherwise a plain
    "System/User/Assistant" text layout is used.

    Args:
        question: The user question to ask the model.

    Returns:
        The prompt text, ending where the assistant's reply should begin.

    Raises:
        Exception: Whatever ``tokenizer.apply_chat_template`` raises if the
            template rejects even a single user turn.
    """
    system_prompt = "You are a helpful assistant."

    if not getattr(tokenizer, "chat_template", None):
        return f"System: {system_prompt}\nUser: {question}\nAssistant:"

    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": question},
    ]
    try:
        return tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True,
        )
    except Exception:
        # Some chat templates only accept alternating user/assistant turns and
        # raise on a "system" role (reported for some Mistral-7B-Instruct
        # template revisions -- verify against the cached tokenizer). The
        # template error type comes from jinja2, which this notebook does not
        # import, hence the broad catch. Retry with the system prompt folded
        # into the user turn; if that fails too, the error propagates.
        merged_messages = [
            {"role": "user", "content": f"{system_prompt}\n\n{question}"},
        ]
        return tokenizer.apply_chat_template(
            merged_messages,
            tokenize=False,
            add_generation_prompt=True,
        )


In [None]:
def ask_model(question: str, max_new_tokens: int = 128, temperature: float = 0.0) -> str:
    """Run a single-question generation and return the model answer.

    Args:
        question: The user question; it is turned into a prompt by ``build_prompt``.
        max_new_tokens: Upper bound on the number of newly generated tokens.
        temperature: Sampling temperature. Values ``<= 0`` disable sampling;
            positive values enable sampling with that temperature.

    Returns:
        The decoded continuation only (prompt tokens removed, special tokens
        skipped), with surrounding whitespace stripped.
    """
    prompt = build_prompt(question)

    # A prompt rendered by a chat template normally starts with the BOS token
    # already. Letting the tokenizer add special tokens again would duplicate
    # it, so only add them when the prompt does not start with BOS.
    bos_token = getattr(tokenizer, "bos_token", None)
    prompt_has_bos = bool(bos_token) and prompt.startswith(bos_token)
    inputs = tokenizer(
        prompt,
        return_tensors="pt",
        add_special_tokens=not prompt_has_bos,
    ).to(device)

    do_sample = temperature > 0
    gen_kwargs = {
        "max_new_tokens": max_new_tokens,
        "do_sample": do_sample,
    }
    if do_sample:
        gen_kwargs["temperature"] = temperature

    # Prefer the tokenizer's pad token and fall back to EOS. Explicit None
    # checks are used because a token id of 0 is valid but falsy.
    pad_token_id = tokenizer.pad_token_id
    if pad_token_id is None:
        pad_token_id = tokenizer.eos_token_id
    if pad_token_id is not None:
        gen_kwargs["pad_token_id"] = pad_token_id

    with torch.no_grad():
        output_ids = model.generate(**inputs, **gen_kwargs)

    # The output holds prompt + continuation; keep only the continuation.
    prompt_length = inputs["input_ids"].shape[-1]
    generated_ids = output_ids[0][prompt_length:]
    return tokenizer.decode(generated_ids, skip_special_tokens=True).strip()


In [None]:
# Example query: send one question through the model and show its reply
question = "What is the capital of France?"
answer = ask_model(question=question)
print(answer)
