In [26]:
%pip install gdown

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.3.1 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


Setting up Modal

In [None]:
import os

import modal

# Modal app that every @app.function in this notebook is registered on.
app = modal.App("llama-3-1-perplexity-eval")

# Container image shared by the perplexity and llama-bench functions below.
# It starts from an NVIDIA CUDA "devel" base image, installs the build toolchain and
# the Python packages the remote functions import, then clones llama.cpp and builds
# only the `llama-bench` target with Ninja.
image = (
    modal.Image.from_registry("nvidia/cuda:12.1.1-devel-ubuntu22.04", add_python="3.13")
    # System packages needed to clone and compile llama.cpp.
    .apt_install("git", "cmake", "build-essential", "libcurl4-openssl-dev", "ninja-build", "curl")
    # Python packages imported inside the remote function bodies (versions are not pinned).
    .pip_install("transformers", "torch", "accelerate", "hf_transfer", "huggingface_hub", "datasets", "tqdm")
    # Opt in to the hf_transfer download backend of huggingface_hub.
    .env({"HF_HUB_ENABLE_HF_TRANSFER": "1"})
    .run_commands(
        # The clone is not pinned to a tag or commit, so rebuilding the image may pick up a newer llama.cpp.
        "git clone https://github.com/ggerganov/llama.cpp.git /root/llama.cpp",
        # NOTE(review): -DGGML_NATIVE=ON presumably tunes the build for the CPU of the machine
        # that builds the image; confirm the runtime workers expose the same instruction set.
        "cd /root/llama.cpp && mkdir build && cd build && cmake .. -GNinja -DGGML_NATIVE=ON && ninja llama-bench"
    )
)

# Add your Hugging Face access token below. I removed mine because pushing a file containing an HF token to GitHub is not allowed.

# Prefer the HF_TOKEN environment variable so the real token never has to be written
# into (and accidentally committed with) this notebook. When the variable is not set,
# the placeholder string is used as before and must be replaced by hand.
HF_TOKEN = os.environ.get("HF_TOKEN", "ADD HUGGINGFACE MODEL TOKEN HERE")

Running Perplexity on GPU

In [45]:
@app.function(
    image=image, 
    gpu="A10G", 
    timeout=3600, 
    secrets=[modal.Secret.from_dict({"HF_TOKEN": HF_TOKEN})]
)
def calculate_perplexity_wiki2(model_id: str, max_length: int = 2048, stride: int = 512):
    """Compute sliding-window perplexity of a causal LM on the WikiText-2 raw test split.

    The whole test split is joined with blank lines, tokenized once, and scored in
    overlapping windows: each window is at most ``max_length`` tokens long and starts
    ``stride`` tokens after the previous one. Only the tokens that were not already
    scored by an earlier window contribute to the loss; the rest serve as context.

    Args:
        model_id: Identifier passed to ``AutoTokenizer.from_pretrained`` and
            ``AutoModelForCausalLM.from_pretrained``.
        max_length: Maximum window length in tokens (default 2048). It is clamped to
            ``model.config.max_position_embeddings`` so a window never exceeds the
            context length the model reports.
        stride: Number of tokens the window advances per step (default 512).

    Returns:
        ``{"WikiText-2 Perplexity": <float rounded to 4 decimals>}``.

    Raises:
        ValueError: if ``max_length`` or ``stride`` is not positive, if ``stride`` is
            larger than the effective window (tokens would be skipped), or if the
            tokenized dataset yields no token that can be scored.
    """
    # Imported inside the function body: these packages are installed in the container
    # image, and the body runs remotely.
    import math

    import torch
    from transformers import AutoModelForCausalLM, AutoTokenizer
    from datasets import load_dataset
    from tqdm import tqdm

    if max_length <= 0 or stride <= 0:
        raise ValueError("max_length and stride must be positive integers")

    print("Loading model and dataset...")
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    model = AutoModelForCausalLM.from_pretrained(
        model_id, 
        torch_dtype=torch.float16, 
        device_map="auto"
    )
    model.eval()

    # Load WikiText-2 Test Set
    test = load_dataset("wikitext", "wikitext-2-raw-v1", split="test")
    encodings = tokenizer("\n\n".join(test["text"]), return_tensors="pt")

    # The model may advertise a very long context; evaluate with the requested window,
    # but never with more tokens than the model configuration allows.
    window = min(max_length, model.config.max_position_embeddings)
    if stride > window:
        raise ValueError("stride must not exceed the evaluation window, otherwise tokens are skipped")
    seq_len = encodings.input_ids.size(1)

    nll_sum = 0.0   # total negative log-likelihood, accumulated as a Python float
    n_tokens = 0    # number of tokens that actually contributed to nll_sum
    prev_end_loc = 0

    # Sliding window perplexity calculation
    print("Calculating PPL...")
    for begin_loc in tqdm(range(0, seq_len, stride)):
        end_loc = min(begin_loc + window, seq_len)
        trg_len = end_loc - prev_end_loc  # tokens not yet scored by an earlier window
        input_ids = encodings.input_ids[:, begin_loc:end_loc].to("cuda")
        target_ids = input_ids.clone()
        target_ids[:, :-trg_len] = -100 # Mask tokens we aren't predicting

        # The model shifts labels by one position internally and returns the MEAN loss
        # over the remaining unmasked labels. Count exactly those labels so the mean can
        # be turned back into a sum; the first position never has a prediction.
        num_loss_tokens = int((target_ids[:, 1:] != -100).sum().item())

        # A one-token window has nothing to predict; its mean loss would be undefined.
        if num_loss_tokens > 0:
            with torch.no_grad():
                outputs = model(input_ids, labels=target_ids)
            nll_sum += outputs.loss.float().item() * num_loss_tokens
            n_tokens += num_loss_tokens

        prev_end_loc = end_loc
        if end_loc == seq_len:
            break

    if n_tokens == 0:
        raise ValueError("The tokenized dataset contains no token that can be scored")

    ppl = math.exp(nll_sum / n_tokens)
    return {"WikiText-2 Perplexity": round(ppl, 4)}

Running bench on CPU

In [None]:
@app.function(
    image=image, 
    cpu=8.0, 
    memory=32768,
    timeout=3600,
    secrets=[modal.Secret.from_dict({"HF_TOKEN": HF_TOKEN})]
)
def run_fp16_baseline_bench():
    """Download the FP16 GGUF baseline model and benchmark it on CPU with llama-bench.

    Returns:
        A string. On success it is llama-bench's stdout followed by a newline and its
        stderr; if llama-bench exits with a non-zero status, a final line reporting the
        exit code is appended. If the download fails or the llama-bench binary cannot
        be found, a human-readable error message is returned instead of raising.
    """
    from huggingface_hub import hf_hub_download
    import subprocess
    import os

    # VERIFIED REPO AND FILENAME FOR FP16
    repo_id = "matrixportal/Llama-3.1-8B-Instruct-GGUF"
    filename = "llama-3.1-8b-instruct-f16.gguf"
    
    print(f"Downloading FP16 Baseline from {repo_id}...")
    try:
        # The token lookup stays inside the try block so a missing HF_TOKEN is reported
        # through the same "Download Failed" message as any other download error.
        model_path = hf_hub_download(
            repo_id=repo_id, 
            filename=filename, 
            token=os.environ["HF_TOKEN"]
        )
    except Exception as e:
        return f"Download Failed. Ensure you have accepted the Llama 3.1 terms on HF. Error: {e}"
    
    # Report the file size (should be ~16GB) so a truncated download is easy to spot.
    size_gb = os.path.getsize(model_path) / (1024**3)
    print(f"✅ Success: {filename} is {size_gb:.2f} GB")

    # The binary location depends on the llama.cpp build layout; try both known places.
    candidate_paths = (
        "/root/llama.cpp/build/bin/llama-bench",
        "/root/llama.cpp/build/llama-bench",
    )
    bench_path = next((path for path in candidate_paths if os.path.exists(path)), None)
    if bench_path is None:
        return "llama-bench binary not found. Looked in: " + ", ".join(candidate_paths)

    # Hardware benchmark. The command is passed as an argument list (no shell), so the
    # model path needs no quoting. -t 8 matches the cpu=8.0 requested in the decorator.
    cmd = [bench_path, "-m", model_path, "-p", "512", "-n", "128", "-t", "8"]
    print("Running FP16 Baseline Inference...")
    result = subprocess.run(cmd, capture_output=True, text=True)

    report = result.stdout + "\n" + result.stderr
    if result.returncode != 0:
        # Make a failed run distinguishable from a successful one in the returned text.
        report += f"\n[llama-bench exited with code {result.returncode}]"
    return report

In [60]:
# Stream Modal's build/runtime logs into the notebook while the app is running,
# execute the CPU benchmark remotely, then print its raw text under a banner.
with modal.enable_output(), app.run():
    print("Deploying container to Modal...")
    raw_output = run_fp16_baseline_bench.remote()

    print(
        "\n" + "=" * 50,
        "RAW FP16 BENCHMARK OUTPUT",
        "=" * 50,
        raw_output,
        sep="\n",
    )

[2K[32mâœ“[0m Initialized. [37mView run at [0m
[4;37mhttps://modal.com/apps/rayedhafeez2/main/ap-S4OhdERbFyc8qyw1qA74Wk[0m
[2K[34m-[0m Initializing...
[2K[34m|[0m Creating objects...objects...
[2K[1A[2K[34m-[0m Creating objects...run_fp16_baseline_bench...
[90mâ””â”€â”€ [0mðŸ”¨ Created function run_fp16_baseline_bench.
[1A[2K[1A[2K[32mâœ“[0m Created objects.
[90mâ””â”€â”€ [0mðŸ”¨ Created function run_fp16_baseline_bench.
[2KDeploying container to Modal...
[2K[34m-[0m [34mWorker assigned...[0m [37mView app at [0m
[2K[1A[2K[34m|[0m [34mRunning...[0m [37mView app at [0myw1qA74Wk[0m
[2K[1A[2K[34m-[0m [34mLoading images (1 containers initializing)...[0m [37mView app at [0m
[2K[1A[2K[34m|[0m [34mLoading images (1 containers initializing)...[0m [37mView app at [0m
[2K[1A[2K[34mdal.com/apps/rayedhafeez2/main/ap-S4OhdERbFyc8qyw1qA74Wk[0m
== CUDA ==
[2K[34m|[0m[34m [0m[34mLoading images (1 containers initializing)...[0m

Checking CPU Specifications


In [None]:
@app.function(cpu=8.0, memory=32768)
def identify_cpu_hardware():
    """Report the CPU vendor and AVX-512 VNNI support of the worker running this function.

    ``lscpu`` is executed once and its output is filtered in Python, so a missing
    "Vendor ID" or "Flags" line yields an empty string / ``False`` instead of an
    exception from a failed ``grep``.

    Returns:
        A dict with three keys:
        ``"Vendor"`` - the ``lscpu`` line(s) containing "Vendor ID", stripped;
        ``"Ice Lake Capable (AVX-512 VNNI)"`` - whether "avx512_vnni" appears in the
        line(s) containing "Flags";
        ``"Is Intel?"`` - whether "GenuineIntel" appears in the vendor line(s).
    """
    import subprocess

    lscpu_lines = subprocess.check_output(["lscpu"]).decode().splitlines()

    def lines_containing(needle):
        # Same selection as `grep needle`: every output line that contains the text.
        return "\n".join(line for line in lscpu_lines if needle in line)

    # Check vendor and specific flags
    vendor = lines_containing("Vendor ID")
    flags = lines_containing("Flags")

    # Only the vendor string and the avx512_vnni flag are inspected here.
    # NOTE(review): the original note listed 'avx512_bitalg' as a second Ice Lake
    # identifier, but it has never been checked by this function - confirm whether the
    # "Ice Lake Capable" result should also require it.
    is_intel = "GenuineIntel" in vendor
    has_avx512_vnni = "avx512_vnni" in flags

    return {
        "Vendor": vendor.strip(),
        "Ice Lake Capable (AVX-512 VNNI)": has_avx512_vnni,
        "Is Intel?": is_intel
    }

# Run the app for the duration of this block and print the dict returned by the
# remote CPU probe.
with app.run():
    print(identify_cpu_hardware.remote())

{'Vendor': 'Vendor ID:           GenuineIntel', 'Ice Lake Capable (AVX-512 VNNI)': True, 'Is Intel?': True}
