# Perplexity test on Llama GGUF model

This script was written and tested on Google Colab; it is highly advised to run it there on a GPU runtime.

To set it up, fill in the configuration constants at the beginning of the script (the names written in upper case).

In [None]:
# Install the Python dependencies (unsloth from GitHub plus its companion libraries).
print("Installing dependencies and downloading model")
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
# Choose the xformers requirement from the installed torch version:
# torch older than 2.4.0 gets the pinned 0.0.27 release, anything newer gets the unpinned package.
from torch import __version__; from packaging.version import Version as V
xformers = "xformers==0.0.27" if V(__version__) < V("2.4.0") else "xformers"
# --no-deps installs only the listed packages, without resolving their own dependencies.
!pip install -qqq --no-deps {xformers} trl peft accelerate bitsandbytes triton --progress-bar off
from unsloth import FastLanguageModel
from huggingface_hub import hf_hub_download, list_repo_files


# URL of the raw text file used as test data; it is downloaded to wiki.test.raw.
TEST_DATA_URL = "" # e.g. https://cosmo.zip/pub/datasets/wikitext-2-raw/wiki.test.raw
# Hugging Face repository id that is searched for .gguf model files.
REPO_ID = ""
# File name inside REPO_ID that is downloaded when the automatic selection fails.
SELECTED_MANUAL_FILE_FROM_REPO= "" # e.g. FineLlama-3.1-8B-Q8_0.gguf



# Compile llama.cpp with CUDA
# Fetch the llama.cpp sources into ./llama.cpp and enter the checkout.
!git clone https://github.com/ggerganov/llama.cpp
%cd llama.cpp
# Remove any existing build directory so the build starts from an empty tree.
!rm -rf build
!mkdir build
%cd build
# Configure the project with CUDA support switched on (GGML_CUDA=ON).
!cmake .. -DGGML_CUDA=ON
# Build the Release configuration with 4 parallel jobs.
!cmake --build . --config Release -j 4
# Go back up two levels, to the directory that contains the llama.cpp checkout.
%cd ../..


# Resolve which GGUF file to use from REPO_ID and download it.
# On success `model_path` holds the value returned by hf_hub_download
# (the path of the downloaded file), which is later passed to llama-perplexity.
try:
    files = list_repo_files(REPO_ID)
    # Filter .gguf files
    gguf_files = [f for f in files if f.endswith('.gguf')]

    if not gguf_files:
        raise ValueError("No GGUF files found in the repository")

    # Prefer Q8_0, otherwise take the first one
    selected_file = next((f for f in gguf_files if "Q8_0" in f), gguf_files[0])
    print(f"Selected model file: {selected_file}")

    # Download model
    model_path = hf_hub_download(repo_id=REPO_ID, filename=selected_file)

except Exception as e:
    # Deliberately broad: any failure of the automatic path (listing, selection
    # or download) should give the manually configured file a chance.
    print(f"Error locating/downloading model: {e}")
    if not SELECTED_MANUAL_FILE_FROM_REPO:
        # No manual file configured: retrying with an empty filename would only
        # hide the real problem behind a second, unrelated error.
        print("SELECTED_MANUAL_FILE_FROM_REPO is not set, cannot fall back to a manual file")
        raise
    # Fallback to manual definition if automatic selection fails
    print(f"Falling back to manually selected file: {SELECTED_MANUAL_FILE_FROM_REPO}")
    model_path = hf_hub_download(repo_id=REPO_ID, filename=SELECTED_MANUAL_FILE_FROM_REPO)

# Download test data
!wget -O wiki.test.raw {TEST_DATA_URL}

print("Running Perplexity Test...")

# The binary location might vary slightly depending on cmake version
perplexity_bin = "./llama.cpp/build/bin/llama-perplexity"

# Arguments:
# -m: Model path
# -f: Input file
# -ngl 99: Offload 99 layers to GPU
# -c 2048: Context size
cmd = f"{perplexity_bin} -m '{model_path}' -f wiki.test.raw -ngl 99 -c 2048"

!{cmd}

print("Benchmark complete")