# NanoChat Inference & CUDA Check

This notebook checks for CUDA availability, downloads the model artifacts, and runs a simple inference using the `nanochat` model.

In [3]:
import torch

print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")

if torch.cuda.is_available():
    print(f"Device name: {torch.cuda.get_device_name(0)}")
    print(f"Device count: {torch.cuda.device_count()}")
else:
    print("CUDA is NOT available. Running on CPU or other device.")

PyTorch version: 2.9.1+cu128
CUDA available: True
Device name: NVIDIA RTX A6000
Device count: 4


In [None]:
import os
import requests
from pathlib import Path

def download_file(url, dest_path):
    if os.path.exists(dest_path):
        print(f"File already exists: {dest_path}")
        return
    print(f"Downloading {url} to {dest_path}...")
    response = requests.get(url, stream=True)
    response.raise_for_status()
    with open(dest_path, 'wb') as f:
        for chunk in response.iter_content(chunk_size=8192):
            f.write(chunk)
    print(f"Downloaded {dest_path}")

home = Path.home()
cache_dir = home / ".cache" / "nanochat"
tokenizer_dir = cache_dir / "tokenizer"
checkpoints_dir = cache_dir / "chatsft_checkpoints" / "d32"

tokenizer_dir.mkdir(parents=True, exist_ok=True)
checkpoints_dir.mkdir(parents=True, exist_ok=True)

base_url = "https://huggingface.co/karpathy/nanochat-d32/resolve/main"

download_file(f"{base_url}/token_bytes.pt", tokenizer_dir / "token_bytes.pt")
download_file(f"{base_url}/tokenizer.pkl", tokenizer_dir / "tokenizer.pkl")

download_file(f"{base_url}/meta_000650.json", checkpoints_dir / "meta_000650.json")
download_file(f"{base_url}/model_000650.pt", checkpoints_dir / "model_000650.pt")

print("All files downloaded successfully.")

Downloading https://huggingface.co/karpathy/nanochat-d32/resolve/main/token_bytes.pt to /home/rs63759/.cache/nanochat/tokenizer/token_bytes.pt...
Downloaded /home/rs63759/.cache/nanochat/tokenizer/token_bytes.pt
Downloading https://huggingface.co/karpathy/nanochat-d32/resolve/main/tokenizer.pkl to /home/rs63759/.cache/nanochat/tokenizer/tokenizer.pkl...
Downloaded /home/rs63759/.cache/nanochat/tokenizer/tokenizer.pkl
Downloading https://huggingface.co/karpathy/nanochat-d32/resolve/main/meta_000650.json to /home/rs63759/.cache/nanochat/chatsft_checkpoints/d32/meta_000650.json...
Downloaded /home/rs63759/.cache/nanochat/chatsft_checkpoints/d32/meta_000650.json
Downloading https://huggingface.co/karpathy/nanochat-d32/resolve/main/model_000650.pt to /home/rs63759/.cache/nanochat/chatsft_checkpoints/d32/model_000650.pt...
Downloaded /home/rs63759/.cache/nanochat/chatsft_checkpoints/d32/model_000650.pt
All files downloaded successfully.


In [None]:
import sys
import os

if os.getcwd() not in sys.path:
    sys.path.append(os.getcwd())

from nanochat.common import compute_init, autodetect_device_type
from nanochat.checkpoint_manager import load_model
from nanochat.engine import Engine
from contextlib import nullcontext

SOURCE = "sft"  # sft/mid/rl options
DEVICE_TYPE = "" # default is cuda
DTYPE = "bfloat16" 

print("Initializing...")

# setup device
device_type = autodetect_device_type() if DEVICE_TYPE == "" else DEVICE_TYPE
ddp, ddp_rank, ddp_local_rank, ddp_world_size, device = compute_init(device_type)

print(f"Using device: {device}")

ptdtype = torch.float32 if DTYPE == 'float32' else torch.bfloat16
autocast_ctx = torch.amp.autocast(device_type=device_type, dtype=ptdtype) if device_type == "cuda" else nullcontext()

# load model
try:
    model, tokenizer, meta = load_model(SOURCE, device, phase="eval", model_tag="d32")
    print("Model loaded successfully.")
except Exception as e:
    print(f"Error loading model: {e}")
    print("Make sure you have trained a model or downloaded checkpoints.")
    raise e

engine = Engine(model, tokenizer)

2025-11-23 19:41:50,143 - nanochat.common - [32m[1mINFO[0m - Distributed world size: 1
2025-11-23 19:41:50,146 - nanochat.checkpoint_manager - [32m[1mINFO[0m - Loading model from /home/rs63759/.cache/nanochat/chatsft_checkpoints/d32 with step 650


Initializing...
Autodetected device type: cuda
Using device: cuda
Error loading model: CUDA out of memory. Tried to allocate 64.00 MiB. GPU 0 has a total capacity of 47.41 GiB of which 6.19 MiB is free. Process 2859735 has 46.37 GiB memory in use. Including non-PyTorch memory, this process has 1.00 GiB memory in use. Of the allocated memory 768.00 MiB is allocated by PyTorch, and 0 bytes is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
Make sure you have trained a model or downloaded checkpoints.


OutOfMemoryError: CUDA out of memory. Tried to allocate 64.00 MiB. GPU 0 has a total capacity of 47.41 GiB of which 6.19 MiB is free. Process 2859735 has 46.37 GiB memory in use. Including non-PyTorch memory, this process has 1.00 GiB memory in use. Of the allocated memory 768.00 MiB is allocated by PyTorch, and 0 bytes is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:
def chat(prompt, temperature=0.6, top_k=50, max_tokens=256):
    """Runs a single turn of chat inference."""
    
    bos = tokenizer.get_bos_token_id()
    user_start = tokenizer.encode_special("<|user_start|>")
    user_end = tokenizer.encode_special("<|user_end|>")
    assistant_start = tokenizer.encode_special("<|assistant_start|>")
    assistant_end = tokenizer.encode_special("<|assistant_end|>")
    
    conversation_tokens = [bos]
    conversation_tokens.append(user_start)
    conversation_tokens.extend(tokenizer.encode(prompt))
    conversation_tokens.append(user_end)
    conversation_tokens.append(assistant_start)
    
    generate_kwargs = {
        "num_samples": 1,
        "max_tokens": max_tokens,
        "temperature": temperature,
        "top_k": top_k,
    }
    
    print(f"User: {prompt}")
    print("Assistant: ", end="", flush=True)
    
    response_tokens = []
    with autocast_ctx:
        for token_column, token_masks in engine.generate(conversation_tokens, **generate_kwargs):
            token = token_column[0]
            response_tokens.append(token)
            token_text = tokenizer.decode([token])
            print(token_text, end="", flush=True)
    print("\n")

In [None]:
chat("Hello! Who are you?")

In [None]:
chat("Write a short poem about coding.")