In [1]:
def calculate_gpu_memory(parameters: int, quant_bits: int) -> float:
    """
    Estimate the GPU memory required for serving a model.

    The estimate starts from a 32-bit baseline (4 bytes per parameter),
    scales it down by the quantization ratio ``32 / quant_bits`` and then
    adds a fixed 20% overhead:

        memory_gb = parameters * 4 / (32 / quant_bits) * 1.2 / 1e9

    Args:
        parameters (int): The number of parameters in the model (P).
            Must not be negative.
        quant_bits (int): The number of bits used for quantization (Q).
            Must be positive.

    Returns:
        float: Required GPU memory in gigabytes (GB), where 1 GB = 10^9 bytes.

    Raises:
        ValueError: If ``parameters`` is negative or ``quant_bits`` is not
            positive.
    """
    # Reject inputs that would otherwise divide by zero or silently yield a
    # negative memory size.
    if quant_bits <= 0:
        raise ValueError(f"quant_bits must be positive, got {quant_bits!r}")
    if parameters < 0:
        raise ValueError(f"parameters must not be negative, got {parameters!r}")

    bytes_per_param = 4  # 4 bytes for each parameter (32-bit precision)
    overhead_factor = 1.2  # 20% overhead

    # Memory at 32-bit precision, reduced by the quantization ratio.
    memory_bytes = (parameters * bytes_per_param) / (32 / quant_bits)

    # Add the overhead
    memory_bytes *= overhead_factor

    # Convert to gigabytes (1 GB = 10^9 bytes)
    return memory_bytes / 1e9

# Demo: estimate the serving memory for a 7B-parameter model at 8-bit precision.
parameters = 7_000_000_000  # 7 billion parameters
quant_bits = 8  # 8-bit quantization
required_memory = calculate_gpu_memory(
    parameters=parameters,
    quant_bits=quant_bits,
)
print(f"Required GPU Memory: {required_memory:.2f} GB")

Required GPU Memory: 8.40 GB
