<a href="https://colab.research.google.com/github/PhilipQuirke/quanta_maths/blob/main/notebooks/QCatGen_Run.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Use the %pip magic (not !pip) so the package is installed into the
# environment of the kernel that is running this notebook.
%pip install -U transformers

In [None]:
# Arithmetic tasks that every model is tested on.
tasks = [
    "minimum", "maximum",
    "sum", "difference",
    "product", "average",
    # NOTE(review): the original comment here reads "Excluded for now as too
    # hard", yet the entry is still part of this list — confirm how (or
    # whether) downstream code skips it.
    "exponential",
]

# Prompt sent for each task; {x}, {y} and {task} are filled in per test case.
prompt_template = (
    "Answer minimally: "
    "Given the numbers {x} and {y} calculate the {task}"
)

In [None]:
# Cached shortlist: open-source models that passed the first 6 tasks,
# 5 instances each. Every entry records the short model name, its Hugging Face
# repo id, the model page URL and free-form notes; some entries also carry an
# 'instruct_variant' or 'base_model' repo id.
cached_good_open_models_6tasks_5instances = [
    dict(
        name='meta-llama/llama-3.2-90b-vision-instruct',
        hf_repo='meta-llama/Llama-3.2-90B-Vision-Instruct',
        url='https://huggingface.co/meta-llama/Llama-3.2-90B-Vision-Instruct',
        notes='Multimodal (text + images), vision reasoning capabilities',
    ),
    dict(
        name='meta-llama/llama-4-maverick',
        hf_repo='meta-llama/Llama-4-Maverick-17B-128E',
        url='https://huggingface.co/meta-llama/Llama-4-Maverick-17B-128E',
        notes='17B active params (~400B total), 128 experts, natively multimodal, 1M context',
        instruct_variant='meta-llama/Llama-4-Maverick-17B-128E-Instruct',
    ),
    dict(
        name='meta-llama/llama-4-scout',
        hf_repo='meta-llama/Llama-4-Scout-17B-16E',
        url='https://huggingface.co/meta-llama/Llama-4-Scout-17B-16E',
        notes='17B active params (~109B total), 16 experts, natively multimodal, 10M context, fits on single H100 GPU',
        instruct_variant='meta-llama/Llama-4-Scout-17B-16E-Instruct',
    ),
    dict(
        name='nvidia/llama-3.1-nemotron-70b-instruct',
        hf_repo='nvidia/Llama-3.1-Nemotron-70B-Instruct-HF',
        url='https://huggingface.co/nvidia/Llama-3.1-Nemotron-70B-Instruct-HF',
        notes='70B model fine-tuned by NVIDIA using RLHF, #1 on Arena Hard/AlpacaEval 2 LC/MT-Bench as of Oct 2024, trained for helpfulness',
        base_model='meta-llama/Llama-3.1-70B-Instruct',
    ),
    dict(
        name='qwen/qwen-2.5-coder-32b-instruct',
        hf_repo='Qwen/Qwen2.5-Coder-32B-Instruct',
        url='https://huggingface.co/Qwen/Qwen2.5-Coder-32B-Instruct',
        notes='SOTA open-source code LLM, matches GPT-4o coding abilities, 128K context, 5.5T tokens training',
    ),
]

## Local Inference on GPU
Model page: https://huggingface.co/nvidia/Llama-3.1-Nemotron-70B-Instruct-HF

⚠️ If the generated code snippets do not work, please open an issue on the [model repo](https://huggingface.co/nvidia/Llama-3.1-Nemotron-70B-Instruct-HF) and/or on [huggingface.js](https://github.com/huggingface/huggingface.js/blob/main/packages/tasks/src/model-libraries-snippets.ts) 🙏

In [None]:
# High-level route: build a text-generation pipeline for the Nemotron model
# and pass it a single-turn chat message.
from transformers import pipeline

messages = [{"role": "user", "content": "Who are you?"}]

pipe = pipeline(
    task="text-generation",
    model="nvidia/Llama-3.1-Nemotron-70B-Instruct-HF",
)

# Bare last expression so the notebook displays the pipeline's result.
pipe(messages)

In [None]:
# Lower-level route: load the tokenizer and the model directly.
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("nvidia/Llama-3.1-Nemotron-70B-Instruct-HF")
model = AutoModelForCausalLM.from_pretrained("nvidia/Llama-3.1-Nemotron-70B-Instruct-HF")

# Single-turn conversation in chat-message format.
messages = [{"role": "user", "content": "Who are you?"}]

# Apply the chat template, requesting tokenized PyTorch tensors in a dict.
inputs = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    tokenize=True,
    return_dict=True,
    return_tensors="pt",
)
# Move the inputs onto whichever device the model is on.
inputs = inputs.to(model.device)

outputs = model.generate(**inputs, max_new_tokens=40)

# Skip the first `prompt_length` positions of the output (the length of the
# input ids) and decode only what follows.
prompt_length = inputs["input_ids"].shape[-1]
print(tokenizer.decode(outputs[0][prompt_length:]))

## Remote Inference via Inference Providers
Ensure you have a valid **HF_TOKEN** set in your environment. You can get your token from [your settings page](https://huggingface.co/settings/tokens). Note: running this may incur charges above the free tier.
The following Python example shows how to run the model remotely on HF Inference Providers, automatically selecting an available inference provider for you.
For more information on how to use the Inference Providers, please refer to our [documentation and guides](https://huggingface.co/docs/inference-providers/en/index).

In [None]:
import os
from getpass import getpass


def _ensure_hf_token() -> None:
    """Make sure the HF_TOKEN environment variable holds a non-empty value.

    A token that is already present in the environment is left untouched, so a
    real token is never overwritten by a placeholder. When it is missing, the
    user is prompted for it with hidden input, which keeps the secret out of
    the notebook source.

    Raises:
        ValueError: if the user submits an empty token.
    """
    if os.environ.get("HF_TOKEN"):
        return
    token = getpass("Enter your Hugging Face token (HF_TOKEN): ").strip()
    if not token:
        raise ValueError("HF_TOKEN is required for remote inference")
    os.environ["HF_TOKEN"] = token


_ensure_hf_token()

In [None]:
import os
from openai import OpenAI

# OpenAI client pointed at the Hugging Face router, authenticated with the
# HF_TOKEN environment variable.
client = OpenAI(base_url="https://router.huggingface.co/v1", api_key=os.environ["HF_TOKEN"])

# Single user question sent as a one-message chat.
question = {"role": "user", "content": "What is the capital of France?"}

completion = client.chat.completions.create(
    model="nvidia/Llama-3.1-Nemotron-70B-Instruct-HF",
    messages=[question],
)

# Print the message of the first returned choice.
print(completion.choices[0].message)