# Lab 3.2.6: GGUF Conversion - Solutions

This notebook contains solutions for all exercises in Lab 3.2.6.

In [None]:
import numpy as np
import struct
from pathlib import Path

## Exercise 1 Solution: GGUF Format Parser

Implement a parser to read GGUF file headers.

In [None]:
class GGUFParser:
    """
    Parser for the header section of a GGUF file.

    GGUF is the standard format for llama.cpp models. The parser reads the
    fixed header, the metadata key/value pairs and the tensor info table;
    it does not load any tensor data.
    """

    GGUF_MAGIC = 0x46554747  # the bytes b'GGUF' read as a little-endian uint32
    GGUF_VERSION = 3
    # GGUF v1 stored counts and string lengths as 32-bit integers. This parser
    # reads them as 64-bit (the v2+ layout), so v1 files must be rejected
    # rather than silently misparsed.
    MIN_SUPPORTED_VERSION = 2

    # GGML types
    GGML_TYPES = {
        0: 'F32',
        1: 'F16',
        2: 'Q4_0',
        3: 'Q4_1',
        6: 'Q5_0',
        7: 'Q5_1',
        8: 'Q8_0',
        9: 'Q8_1',
        10: 'Q2_K',
        11: 'Q3_K',
        12: 'Q4_K',
        13: 'Q5_K',
        14: 'Q6_K',
        15: 'Q8_K',
        16: 'IQ2_XXS',
        17: 'IQ2_XS',
        18: 'IQ3_XXS',
        19: 'IQ1_S',
        20: 'IQ4_NL',
    }

    # Metadata value-type id -> struct format for the fixed-size scalar types.
    # BOOL (7), STRING (8) and ARRAY (9) need special handling in _read_value.
    _SCALAR_FORMATS = {
        0: '<B',   # UINT8
        1: '<b',   # INT8
        2: '<H',   # UINT16
        3: '<h',   # INT16
        4: '<I',   # UINT32
        5: '<i',   # INT32
        6: '<f',   # FLOAT32
        10: '<Q',  # UINT64
        11: '<q',  # INT64
        12: '<d',  # FLOAT64
    }

    def __init__(self, filepath: str):
        self.filepath = filepath
        self.metadata = {}
        self.tensors = {}

    def _read_exact(self, f, size: int) -> bytes:
        """
        Read exactly ``size`` bytes from ``f``.

        Raises:
            ValueError: If the file ends before ``size`` bytes are available.
        """
        data = f.read(size)
        if len(data) != size:
            raise ValueError(
                f"Truncated GGUF file: expected {size} bytes, got {len(data)}"
            )
        return data

    def _read_scalar(self, f, fmt: str):
        """Read one fixed-size value described by the struct format ``fmt``."""
        return struct.unpack(fmt, self._read_exact(f, struct.calcsize(fmt)))[0]

    def _read_string(self, f) -> str:
        """Read a GGUF string (uint64 length prefix followed by UTF-8 bytes)."""
        length = self._read_scalar(f, '<Q')
        return self._read_exact(f, length).decode('utf-8')

    def _read_value(self, f, value_type: int):
        """
        Read a metadata value based on type.

        Args:
            f: Binary file object positioned at the start of the value
            value_type: GGUF metadata value-type id

        Returns:
            The decoded value (int, float, bool, str, or list for arrays)

        Raises:
            ValueError: If ``value_type`` is unknown or the file is truncated
        """
        fmt = self._SCALAR_FORMATS.get(value_type)
        if fmt is not None:
            return self._read_scalar(f, fmt)
        if value_type == 7:  # BOOL, stored as one byte
            return self._read_scalar(f, '<B') != 0
        if value_type == 8:  # STRING
            return self._read_string(f)
        if value_type == 9:  # ARRAY: element type, element count, elements
            array_type = self._read_scalar(f, '<I')
            array_len = self._read_scalar(f, '<Q')
            return [self._read_value(f, array_type) for _ in range(array_len)]
        raise ValueError(f"Unknown value type: {value_type}")

    def parse_header(self) -> dict:
        """
        Parse GGUF file header.

        On success ``self.metadata`` and ``self.tensors`` are replaced with
        the parsed content; on failure they are left untouched.

        Returns:
            Dictionary with file information (version, tensor_count,
            metadata_count, metadata, tensors)

        Raises:
            ValueError: If the magic is wrong, the version is older than
                ``MIN_SUPPORTED_VERSION``, a value type is unknown, or the
                file is truncated
        """
        # Parse into locals so a failure halfway through does not leave the
        # instance holding a partially filled result.
        metadata = {}
        tensors = {}

        with open(self.filepath, 'rb') as f:
            # Read magic
            magic = self._read_scalar(f, '<I')
            if magic != self.GGUF_MAGIC:
                raise ValueError(f"Invalid GGUF magic: {hex(magic)}")

            # Read version
            version = self._read_scalar(f, '<I')
            if version < self.MIN_SUPPORTED_VERSION:
                raise ValueError(
                    f"Unsupported GGUF version: {version} "
                    f"(need >= {self.MIN_SUPPORTED_VERSION})"
                )

            # Read counts
            tensor_count = self._read_scalar(f, '<Q')
            metadata_count = self._read_scalar(f, '<Q')

            # Read metadata: key string, value-type id, then the value itself
            for _ in range(metadata_count):
                key = self._read_string(f)
                value_type = self._read_scalar(f, '<I')
                metadata[key] = self._read_value(f, value_type)

            # Read tensor info: name, dimensions, GGML type id, data offset
            for _ in range(tensor_count):
                name = self._read_string(f)
                n_dims = self._read_scalar(f, '<I')
                dims = [self._read_scalar(f, '<Q') for _ in range(n_dims)]
                ggml_type = self._read_scalar(f, '<I')
                offset = self._read_scalar(f, '<Q')

                tensors[name] = {
                    'dims': dims,
                    'type': self.GGML_TYPES.get(ggml_type, f'UNKNOWN({ggml_type})'),
                    'offset': offset
                }

        self.metadata = metadata
        self.tensors = tensors

        return {
            'version': version,
            'tensor_count': tensor_count,
            'metadata_count': metadata_count,
            'metadata': self.metadata,
            'tensors': self.tensors
        }


# Usage walkthrough only: parsing for real would need an actual GGUF file.
parser_demo_lines = (
    "GGUF Parser Implementation",
    "=" * 50,
    "\nUsage:",
    "  parser = GGUFParser('model.gguf')",
    "  info = parser.parse_header()",
    "  print(info['metadata'])",
    "  print(info['tensors'])",
)
for demo_line in parser_demo_lines:
    print(demo_line)

## Exercise 2 Solution: Quantization Type Comparison

Compare different GGUF quantization types.

In [None]:
def simulate_gguf_quantization(values: np.ndarray, quant_type: str) -> tuple:
    """
    Simulate different GGUF quantization types.

    This is a simplified model: every block gets one min/max-derived scale
    and one minimum, and values are rounded to ``bits``-bit integers. It does
    not reproduce the real llama.cpp block layouts.

    Args:
        values: Values to quantize (any shape; processed in flattened order)
        quant_type: One of Q2_K, Q3_K, Q4_0, Q4_K, Q5_0, Q5_K, Q6_K, Q8_0

    Returns:
        Tuple of (dequantized_values, effective_bits, mse), where
        dequantized_values is float32 with the same shape as ``values``

    Raises:
        ValueError: If ``quant_type`` is unknown or ``values`` is empty
    """
    # Block sizes for different types
    block_configs = {
        'Q2_K': {'bits': 2, 'block_size': 256, 'super_block': True},
        'Q3_K': {'bits': 3, 'block_size': 256, 'super_block': True},
        'Q4_0': {'bits': 4, 'block_size': 32, 'super_block': False},
        'Q4_K': {'bits': 4, 'block_size': 256, 'super_block': True},
        'Q5_0': {'bits': 5, 'block_size': 32, 'super_block': False},
        'Q5_K': {'bits': 5, 'block_size': 256, 'super_block': True},
        'Q6_K': {'bits': 6, 'block_size': 256, 'super_block': True},
        'Q8_0': {'bits': 8, 'block_size': 32, 'super_block': False},
    }

    if quant_type not in block_configs:
        raise ValueError(f"Unknown quant type: {quant_type}")

    config = block_configs[quant_type]
    bits = config['bits']
    block_size = config['block_size']
    super_block = config['super_block']

    source = np.asarray(values)
    flat = source.astype(np.float32).ravel()
    n = flat.size
    if n == 0:
        raise ValueError("values must contain at least one element")

    # Pad the tail by repeating the last real value. Zero padding would widen
    # the min/max range of the final block (e.g. for all-positive data) and
    # inflate its quantization error.
    num_blocks = (n + block_size - 1) // block_size
    pad = num_blocks * block_size - n
    blocks = np.pad(flat, (0, pad), mode='edge').reshape(num_blocks, block_size)

    # Quantize every block at once using its own min/max range
    qmax = 2 ** bits - 1
    block_min = blocks.min(axis=1, keepdims=True)
    span = blocks.max(axis=1, keepdims=True) - block_min
    # A constant block has zero span; use scale 1.0 so the division is safe
    # (all codes are then 0 and dequantize back to the block minimum).
    scale = np.where(span > 0, span / qmax, np.float32(1.0)).astype(np.float32)

    codes = np.clip(np.rint((blocks - block_min) / scale), 0, qmax)
    restored = (codes * scale + block_min).astype(np.float32)
    dequantized = restored.ravel()[:n].reshape(source.shape)

    # Calculate effective bits (including scale overhead)
    weight_bits = n * bits
    scale_bits = num_blocks * 16  # FP16 scale
    min_bits = num_blocks * 16 if not super_block else num_blocks * 8
    total_bits = weight_bits + scale_bits + min_bits
    effective_bits = total_bits / n

    # MSE against the caller's original values
    mse = np.mean((source - dequantized) ** 2)

    return dequantized, effective_bits, mse


# Compare all quantization types on the same synthetic weight vector
np.random.seed(42)
weights = np.random.randn(4096).astype(np.float32) * 0.02

quant_types = ['Q2_K', 'Q3_K', 'Q4_0', 'Q4_K', 'Q5_0', 'Q5_K', 'Q6_K', 'Q8_0']

print("GGUF Quantization Type Comparison")
print("="*60)
print(f"{'Type':<10} {'Eff. Bits':<12} {'Compression':<12} {'MSE':<15}")
print("-"*60)

results = {}
for qt in quant_types:
    # The dequantized values are not needed for the summary table
    _, eff_bits, mse = simulate_gguf_quantization(weights, qt)
    compression = 32 / eff_bits  # relative to 32-bit float storage
    results[qt] = {'eff_bits': eff_bits, 'compression': compression, 'mse': mse}
    # Attach the "x" suffix before padding so the MSE column stays aligned
    # with its header.
    ratio = f"{compression:.1f}x"
    print(f"{qt:<10} {eff_bits:<12.2f} {ratio:<12} {mse:<15.8f}")

print("\nRecommendations:")
print("  Q4_K: Best balance for most use cases")
print("  Q5_K: Good quality, slightly larger")
print("  Q2_K: Maximum compression, use for testing")

## Exercise 3 Solution: Custom GGUF Writer

Implement a simple GGUF file writer.

In [None]:
class GGUFWriter:
    """
    Simple GGUF file writer.

    Collects metadata and tensors in memory, then ``write()`` emits the
    header, the metadata key/value pairs, the tensor info table and the
    tensor data. Only F32 and F16 tensor storage is implemented.
    """

    GGUF_MAGIC = 0x46554747
    GGUF_VERSION = 3
    # Tensor data and every tensor offset are aligned to this many bytes
    # (the GGUF default when no general.alignment key is written).
    ALIGNMENT = 32

    # Metadata type name -> (GGUF value-type id, struct format).
    # Strings have no fixed format; they are written length-prefixed.
    _TYPE_MAP = {
        'uint8': (0, '<B'),
        'int8': (1, '<b'),
        'uint16': (2, '<H'),
        'int16': (3, '<h'),
        'uint32': (4, '<I'),
        'int32': (5, '<i'),
        'float32': (6, '<f'),
        'bool': (7, '<B'),
        'string': (8, None),
        'uint64': (10, '<Q'),
        'int64': (11, '<q'),
        'float64': (12, '<d'),
    }

    def __init__(self, filepath: str):
        self.filepath = filepath
        self.metadata = {}
        self.tensors = []
        self.tensor_data = []

    @staticmethod
    def _pick_int_type(value: int) -> str:
        """
        Choose the narrowest of uint32/int32/uint64/int64 that holds ``value``.

        Non-negative values that fit 32 bits stay ``uint32`` so existing
        callers see the same type as before.

        Raises:
            ValueError: If ``value`` does not fit in 64 bits
        """
        if 0 <= value <= 0xFFFFFFFF:
            return 'uint32'
        if -0x80000000 <= value < 0:
            return 'int32'
        if 0 < value <= 0xFFFFFFFFFFFFFFFF:
            return 'uint64'
        if -0x8000000000000000 <= value < 0:
            return 'int64'
        raise ValueError(f"Integer {value} does not fit in 64 bits")

    @classmethod
    def _align(cls, size: int) -> int:
        """Round ``size`` up to the next multiple of ``ALIGNMENT``."""
        return ((size + cls.ALIGNMENT - 1) // cls.ALIGNMENT) * cls.ALIGNMENT

    def add_metadata(self, key: str, value, value_type: str = 'auto'):
        """
        Add metadata to GGUF file.

        Args:
            key: Metadata key
            value: Metadata value (NumPy scalars are converted to Python ones)
            value_type: Type hint. 'auto' infers from the Python type; 'int'
                picks an integer width from the value; 'float' means
                'float32'. Exact type names ('uint8', 'int64', 'float64',
                'string', 'bool', ...) are also accepted.

        Raises:
            ValueError: If the type cannot be inferred, the hint is not a
                supported type name, or an integer does not fit in 64 bits
        """
        if isinstance(value, np.generic):
            value = value.item()

        if value_type == 'auto':
            if isinstance(value, str):
                value_type = 'string'
            elif isinstance(value, bool):  # must precede int: bool is an int
                value_type = 'bool'
            elif isinstance(value, int):
                value_type = self._pick_int_type(value)
            elif isinstance(value, float):
                value_type = 'float32'
            else:
                raise ValueError(f"Cannot auto-detect type for {type(value)}")
        elif value_type == 'int':
            value_type = self._pick_int_type(value)
        elif value_type == 'float':
            value_type = 'float32'

        # Fail here rather than with a KeyError in the middle of write()
        if value_type not in self._TYPE_MAP:
            raise ValueError(f"Unsupported metadata type: {value_type}")

        self.metadata[key] = (value, value_type)

    def add_tensor(self, name: str, data: np.ndarray, quant_type: str = 'F32'):
        """
        Add tensor to GGUF file.

        Args:
            name: Tensor name
            data: Tensor data
            quant_type: 'F32' or 'F16'. Any other value is stored as F32
                (with a warning) because quantized encodings are not
                implemented.
        """
        data = np.asarray(data)

        # Convert to appropriate dtype
        if quant_type == 'F32':
            data = data.astype(np.float32)
            ggml_type = 0
        elif quant_type == 'F16':
            data = data.astype(np.float16)
            ggml_type = 1
        else:
            # For quantized types, would need actual quantization
            import warnings
            warnings.warn(
                f"Quantization type {quant_type!r} is not implemented; "
                f"storing tensor {name!r} as F32",
                stacklevel=2,
            )
            data = data.astype(np.float32)
            ggml_type = 0

        # NOTE(review): dims are recorded in NumPy shape order. gguf-py's
        # writer appears to emit them reversed (ggml order) - confirm before
        # loading files produced here with llama.cpp.
        self.tensors.append({
            'name': name,
            'dims': list(data.shape),
            'type': ggml_type
        })
        self.tensor_data.append(data)

    def _write_string(self, f, s: str):
        """Write GGUF string (uint64 byte length followed by UTF-8 bytes)."""
        data = s.encode('utf-8')
        f.write(struct.pack('<Q', len(data)))
        f.write(data)

    def _write_metadata_value(self, f, value, value_type: str):
        """Write the value-type id followed by the encoded metadata value."""
        type_id, fmt = self._TYPE_MAP[value_type]
        f.write(struct.pack('<I', type_id))

        if value_type == 'string':
            self._write_string(f, value)
        elif value_type == 'bool':
            f.write(struct.pack(fmt, 1 if value else 0))
        else:
            f.write(struct.pack(fmt, value))

    def write(self):
        """
        Write GGUF file.

        Layout: header, metadata, tensor info table, zero padding up to the
        alignment boundary, then each tensor's bytes padded to the alignment.
        """
        with open(self.filepath, 'wb') as f:
            # Write header
            f.write(struct.pack('<I', self.GGUF_MAGIC))
            f.write(struct.pack('<I', self.GGUF_VERSION))
            f.write(struct.pack('<Q', len(self.tensors)))
            f.write(struct.pack('<Q', len(self.metadata)))

            # Write metadata
            for key, (value, value_type) in self.metadata.items():
                self._write_string(f, key)
                self._write_metadata_value(f, value, value_type)

            # Write tensor info. Offsets are relative to the start of the
            # data section and must each be a multiple of the alignment.
            current_offset = 0
            for tensor, data in zip(self.tensors, self.tensor_data):
                self._write_string(f, tensor['name'])
                f.write(struct.pack('<I', len(tensor['dims'])))
                for dim in tensor['dims']:
                    f.write(struct.pack('<Q', dim))
                f.write(struct.pack('<I', tensor['type']))
                f.write(struct.pack('<Q', current_offset))

                # Next tensor starts at the next aligned position
                current_offset = self._align(current_offset + data.nbytes)

            # Pad so the data section itself starts on an aligned position
            current_pos = f.tell()
            f.write(b'\x00' * (self._align(current_pos) - current_pos))

            # Write tensor data, padding each tensor so the file positions
            # match the offsets recorded in the info table
            for data in self.tensor_data:
                f.write(data.tobytes())
                f.write(b'\x00' * (self._align(data.nbytes) - data.nbytes))

        print(f"Written GGUF file: {self.filepath}")
        print(f"  Tensors: {len(self.tensors)}")
        print(f"  Metadata: {len(self.metadata)}")


# Writer demo: build a small llama-style GGUF file with one embedding tensor
print("GGUF Writer Implementation")
print("=" * 50)

# Target file for the sample model
writer = GGUFWriter('../data/sample.gguf')

# Metadata entries, registered in the order listed
sample_metadata = (
    ('general.architecture', 'llama'),
    ('general.name', 'test-model'),
    ('llama.context_length', 4096),
    ('llama.embedding_length', 4096),
    ('llama.block_count', 32),
)
for meta_key, meta_value in sample_metadata:
    writer.add_metadata(meta_key, meta_value)

# Reproducible random embedding table used as the only tensor
np.random.seed(42)
embedding = np.random.randn(32000, 4096).astype(np.float32) * 0.02
writer.add_tensor('token_embd.weight', embedding)

# Emit the file to disk
writer.write()

## Summary

Key findings:

1. **GGUF format** is well-structured with magic, version, metadata, and tensors
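2. **K-quant types** (Q4_K, Q5_K) provide better quality than simple quants in real llama.cpp models; the simplified simulation above uses a single scale per 256-value block, so it does not reproduce that advantage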
3. **Q4_K** is the recommended choice for most applications
4. **GGUF files** can be parsed and written with standard binary I/O