# Lab 3.2.4 Solutions: GGUF Conversion

This notebook contains solutions to the exercises from Lab 3.2.4.

---

In [None]:
# Common imports
import gc
import os
import re
import subprocess
import sys
import time

import torch

def clear_memory():
    """Run Python's garbage collector, then empty the CUDA cache when CUDA is available."""
    gc.collect()
    # Nothing more to release on machines without CUDA.
    if not torch.cuda.is_available():
        return
    torch.cuda.empty_cache()

# Expected location of the local llama.cpp checkout ("~" is expanded to the user's home directory).
# The functions below build the paths of the conversion script and binaries from this value.
LLAMA_CPP = os.path.expanduser("~/llama.cpp")
# Report the resolved path and whether it exists on disk.
print(f"llama.cpp path: {LLAMA_CPP}")
print(f"Exists: {os.path.exists(LLAMA_CPP)}")

## Exercise 1: Convert a Different Model

Convert Mistral-7B or Llama-2-7B to GGUF format with multiple quantization types.

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer

def convert_to_gguf(
    model_id: str,
    output_dir: str = "./gguf_models",
    quant_types: list = ["Q4_K_M", "Q5_K_M", "Q8_0"]
) -> dict:
    """
    Convert a HuggingFace model to GGUF format with multiple quantization types.

    Pipeline: download the model and re-save it locally, convert that copy to
    an F16 GGUF file with llama.cpp's convert_hf_to_gguf.py, then run
    llama-quantize once per requested quantization type. A summary table is
    printed at the end.

    Args:
        model_id: HuggingFace model ID (e.g., "mistralai/Mistral-7B-v0.1")
        output_dir: Directory to save GGUF files
        quant_types: List of GGUF quantization types (only read, never mutated)

    Returns:
        Dictionary mapping each created type ('F16' plus every quantization
        that succeeded) to {'path': <file path>, 'size_gb': <size in GB>}.
        Quantization types that fail are reported on stdout and omitted.

    Raises:
        RuntimeError: If convert_hf_to_gguf.py is missing, or the F16
            conversion does not produce a non-empty file.
        FileNotFoundError: If quantization was requested but the
            llama-quantize binary is missing.
    """
    model_name = model_id.split('/')[-1]
    convert_script = os.path.join(LLAMA_CPP, "convert_hf_to_gguf.py")
    quantize_bin = os.path.join(LLAMA_CPP, "build", "bin", "llama-quantize")

    # Fail fast: verify the llama.cpp tooling BEFORE the (potentially
    # multi-gigabyte) model download instead of discovering it afterwards.
    if not os.path.exists(convert_script):
        raise RuntimeError(f"F16 conversion failed: {convert_script} not found")
    if quant_types and not os.path.exists(quantize_bin):
        raise FileNotFoundError(
            f"{quantize_bin} not found. "
            "Make sure llama.cpp is built with: cmake --build build"
        )

    os.makedirs(output_dir, exist_ok=True)

    # Step 1: Download and save HuggingFace model
    print(f"Step 1: Downloading {model_id}...")
    # One sub-directory per model: converting a second model into the same
    # output_dir must not mix its files with leftovers from the first one.
    hf_dir = os.path.join(output_dir, "hf_model", model_name)
    os.makedirs(hf_dir, exist_ok=True)

    try:
        tokenizer = AutoTokenizer.from_pretrained(model_id)
        model = AutoModelForCausalLM.from_pretrained(
            model_id,
            torch_dtype=torch.float16
        )
        tokenizer.save_pretrained(hf_dir)
        model.save_pretrained(hf_dir)
        # Drop the in-memory weights before the external tools run.
        del model
        clear_memory()
        print(f"  Saved to {hf_dir}")
    except Exception as e:
        print(f"  Error downloading model: {e}")
        raise

    # Step 2: Convert to GGUF F16
    print("Step 2: Converting to GGUF F16...")
    f16_path = os.path.join(output_dir, f"{model_name}-f16.gguf")

    # Run the converter with this kernel's interpreter so it sees the same
    # installed packages; fall back to "python3" if the path is unavailable.
    python_exe = sys.executable or "python3"
    result = subprocess.run(
        [python_exe, convert_script, hf_dir, "--outfile", f16_path, "--outtype", "f16"],
        capture_output=True,
        text=True
    )

    # A zero exit code alone is not enough: the file must exist and be non-empty,
    # otherwise the size/compression maths below would fail.
    f16_bytes = os.path.getsize(f16_path) if os.path.exists(f16_path) else 0
    if result.returncode != 0 or f16_bytes == 0:
        print(f"  Error: {result.stderr}")
        raise RuntimeError("F16 conversion failed")

    f16_size = f16_bytes / 1e9
    print(f"  Created: {f16_path} ({f16_size:.2f} GB)")

    # Step 3: Quantize to different types
    print("Step 3: Quantizing...")

    results = {'F16': {'path': f16_path, 'size_gb': f16_size}}

    for qtype in quant_types:
        print(f"  Creating {qtype}...")
        output_path = os.path.join(output_dir, f"{model_name}-{qtype}.gguf")

        result = subprocess.run(
            [quantize_bin, f16_path, output_path, qtype],
            capture_output=True,
            text=True
        )

        # Treat a missing or empty output file as a failure (avoids dividing by zero).
        out_bytes = os.path.getsize(output_path) if os.path.exists(output_path) else 0
        if result.returncode == 0 and out_bytes > 0:
            size_gb = out_bytes / 1e9
            results[qtype] = {'path': output_path, 'size_gb': size_gb}
            print(f"    Size: {size_gb:.2f} GB ({f16_size/size_gb:.1f}x compression)")
        else:
            print(f"    Failed: {result.stderr}")

    # Summary (every entry in results has a non-zero size, so the ratio is safe)
    print("\n" + "="*50)
    print("GGUF Conversion Summary")
    print("="*50)
    print(f"{'Type':<12} {'Size (GB)':>12} {'Compression':>12}")
    print("-"*50)
    for qtype, data in results.items():
        compression = f16_size / data['size_gb']
        print(f"{qtype:<12} {data['size_gb']:>12.2f} {compression:>11.1f}x")

    return results

# Example usage (uncomment to run; each call first downloads the model via from_pretrained):
# results = convert_to_gguf("mistralai/Mistral-7B-v0.1")
# results = convert_to_gguf("meta-llama/Llama-2-7b-hf")
# Confirmation that this cell ran and convert_to_gguf is now defined.
print("GGUF conversion function defined")

## Exercise 2: Perplexity Evaluation with llama.cpp

Use llama.cpp's built-in perplexity tool to evaluate quality.

In [None]:
def _parse_perplexity(stdout: str, stderr: str = ""):
    """Extract the perplexity value from llama-perplexity output; None if absent."""
    # llama-perplexity reports its result as "Final estimate: PPL = 5.9066 +/- 0.03312".
    # Depending on the build this line lands on stdout or stderr, so check both.
    final_estimate = re.compile(r"Final estimate:\s*PPL\s*=\s*(\d+(?:\.\d+)?)")
    for stream in (stdout, stderr):
        hits = final_estimate.findall(stream or "")
        if hits:
            return float(hits[-1])

    # Fallback: a "perplexity: <number>" style line on stdout only. stderr is
    # excluded on purpose because progress lines there also begin with
    # "perplexity:" and would produce false matches.
    legacy_hits = re.findall(
        r"perplexity\S*\s+[=:]*(\d+(?:\.\d+)?)", stdout or "", flags=re.IGNORECASE
    )
    if legacy_hits:
        return float(legacy_hits[-1])
    return None


def evaluate_gguf_perplexity(
    model_path: str,
    test_file: str,
    n_gpu_layers: int = 99,
    n_ctx: int = None
) -> "float | None":
    """
    Evaluate GGUF model perplexity using llama.cpp.

    Args:
        model_path: Path to GGUF file
        test_file: Path to text file for evaluation
        n_gpu_layers: Number of layers to offload to GPU
        n_ctx: Optional context size passed to llama-perplexity as "-c".
            When None the tool's own default is used. A smaller value is
            useful for short test files, since the tool needs enough tokens
            to fill its evaluation windows.

    Returns:
        Perplexity value, or None when the llama-perplexity binary is missing
        or no perplexity could be read from its output.
    """
    perplexity_bin = os.path.join(LLAMA_CPP, "build", "bin", "llama-perplexity")

    if not os.path.exists(perplexity_bin):
        print(f"Error: {perplexity_bin} not found")
        print("Make sure llama.cpp is built with: cmake --build build")
        return None

    print(f"Evaluating perplexity on {model_path}...")
    print(f"Test file: {test_file}")

    cmd = [
        perplexity_bin,
        "-m", model_path,
        "-f", test_file,
        "-ngl", str(n_gpu_layers)
    ]
    if n_ctx is not None:
        cmd += ["-c", str(n_ctx)]

    result = subprocess.run(cmd, capture_output=True, text=True)

    perplexity = _parse_perplexity(result.stdout, result.stderr)

    if perplexity is None:
        # Surface why no value was found instead of failing silently: the tool
        # writes its explanations (bad path, too few tokens, ...) to stderr.
        print(f"Could not read a perplexity value (exit code {result.returncode})")
        for line in (result.stderr or "").strip().splitlines()[-5:]:
            print(f"  {line}")
    else:
        print(f"Final estimate: PPL = {perplexity}")

    return perplexity


def create_test_file(filepath: str = "./test_data.txt") -> str:
    """
    Write a fixed ten-sentence sample text to ``filepath`` and return that path.

    The sentences are joined with newlines (no trailing newline) and any
    existing file at ``filepath`` is overwritten.
    """
    sentences = (
        "Machine learning is a subset of artificial intelligence that enables systems to learn from data.",
        "Deep learning uses neural networks with multiple layers to extract complex patterns.",
        "Natural language processing allows computers to understand and generate human language.",
        "Computer vision enables machines to interpret and analyze visual information from the world.",
        "Reinforcement learning trains agents through rewards and penalties in an environment.",
        "Transfer learning leverages knowledge from one task to improve performance on another.",
        "The transformer architecture revolutionized natural language processing in 2017.",
        "Attention mechanisms help models focus on relevant parts of the input.",
        "Large language models can generate human-like text and answer questions.",
        "Quantization reduces model precision to enable efficient deployment.",
    )

    with open(filepath, 'w') as handle:
        handle.write("\n".join(sentences))

    print(f"Created test file: {filepath}")
    return filepath


# Example usage (uncomment to run; the GGUF path below is a placeholder for an existing file):
# test_file = create_test_file()
# ppl = evaluate_gguf_perplexity("./model-Q4_K_M.gguf", test_file)
# Confirmation that this cell ran and both helpers are now defined.
print("Perplexity evaluation functions defined")

In [None]:
# Compare perplexity across quantization types

def compare_gguf_quality(
    model_dir: str,
    model_name: str,
    quant_types: list = ["F16", "Q8_0", "Q5_K_M", "Q4_K_M", "Q4_0", "Q2_K"]
) -> dict:
    """
    Score every available GGUF variant of a model and print a comparison table.

    A sample text is written to ./perplexity_test.txt, then each requested
    quantization whose file exists in ``model_dir`` is evaluated with
    evaluate_gguf_perplexity. Missing files are skipped with a message.

    Args:
        model_dir: Directory containing GGUF files
        model_name: Base model name (e.g., "Mistral-7B-v0.1")
        quant_types: Quantization types to compare

    Returns:
        Dictionary mapping each evaluated type to
        {'perplexity': <value or None>, 'size_gb': <file size in GB>}
    """
    eval_text_path = create_test_file("./perplexity_test.txt")

    scores = {}
    for qtype in quant_types:
        # The F16 file uses a lowercase "f16" suffix; every other type uses its own name.
        gguf_name = f"{model_name}-f16.gguf" if qtype == "F16" else f"{model_name}-{qtype}.gguf"
        filepath = os.path.join(model_dir, gguf_name)

        if not os.path.exists(filepath):
            print(f"Skipping {qtype}: {filepath} not found")
            continue

        print(f"\nEvaluating {qtype}...")
        measured_ppl = evaluate_gguf_perplexity(filepath, eval_text_path)
        scores[qtype] = {
            'perplexity': measured_ppl,
            'size_gb': os.path.getsize(filepath) / 1e9
        }

    # Nothing was evaluated, so there is no table to print.
    if not scores:
        return scores

    print("\n" + "="*60)
    print("GGUF Quality Comparison")
    print("="*60)
    print(f"{'Type':<12} {'Size (GB)':>12} {'Perplexity':>12} {'PPL Delta':>12}")
    print("-"*60)

    # Deltas are measured against the F16 entry; without one every delta is "N/A".
    reference_ppl = scores.get('F16', {}).get('perplexity', 0)

    for qtype, data in scores.items():
        ppl = data['perplexity']

        ppl_str = "N/A"
        if ppl:
            ppl_str = f"{ppl:.2f}"

        delta_str = "N/A"
        if ppl and reference_ppl:
            delta = ppl - reference_ppl
            if delta > 0:
                delta_str = f"+{delta:.2f}"
            else:
                delta_str = f"{delta:.2f}"

        print(f"{qtype:<12} {data['size_gb']:>12.2f} {ppl_str:>12} {delta_str:>12}")

    return scores

# Example usage (uncomment to run; expects "<model_name>-<type>.gguf" files inside the given directory):
# results = compare_gguf_quality("./gguf_models", "Mistral-7B-v0.1")
# Confirmation that this cell ran and compare_gguf_quality is now defined.
print("Quality comparison function defined")

---

## Key Takeaways

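1. **GGUF is portable** - The same file runs on every llama.cpp backend: CPU, CUDA GPUs, Apple Metal, and more
2. **K-quants are smart** - Mixed-precision variants such as Q4_K_M keep the most quantization-sensitive tensors at higher precision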
3. **Q4_K_M is the sweet spot** - Best balance of size and quality
4. **Use llama-perplexity** - Built-in tool for quality evaluation