In [None]:
import torch
import time

def gpu_benchmark(sizes=(10000, 20000), iterations=10, warmup_iterations=10):
    """Benchmark square matrix multiplication on the first CUDA device.

    For every entry in ``sizes`` two random ``size x size`` matrices are
    created on ``cuda:0`` and multiplied ``iterations`` times. The average
    time per multiplication and the resulting throughput are printed as a
    table and also returned.

    Args:
        sizes: Iterable of matrix edge lengths to benchmark.
        iterations: Number of timed multiplications per size (must be >= 1).
        warmup_iterations: Number of small untimed multiplications run once
            up front (must be >= 0).

    Returns:
        A list of ``(size, avg_time_seconds, tflops)`` tuples, one per
        benchmarked size. The list is empty when CUDA is not available.

    Raises:
        ValueError: If ``iterations`` < 1 or ``warmup_iterations`` < 0.
    """
    if iterations < 1:
        raise ValueError("iterations must be at least 1")
    if warmup_iterations < 0:
        raise ValueError("warmup_iterations must not be negative")

    results = []

    if not torch.cuda.is_available():
        print("CUDA is not available on this system.")
        return results

    # Get the GPU name
    device = torch.device("cuda:0")
    gpu_name = torch.cuda.get_device_name(device)
    print(f"Using GPU: {gpu_name}\n")

    # Warm up GPU
    print("Warming up the GPU...")
    for _ in range(warmup_iterations):
        torch.randn(1000, 1000, device=device).matmul(torch.randn(1000, 1000, device=device))
    torch.cuda.synchronize(device)

    for size in sizes:
        print(f"Benchmarking matrix multiplication of size {size}x{size}...")
        a = torch.randn(size, size, device=device)
        b = torch.randn(size, size, device=device)

        # One untimed multiplication at the real size so that any one-time
        # setup cost for this shape is not charged to the measurement.
        a.matmul(b)
        # CUDA kernels are launched asynchronously: wait for the operand
        # generation and the untimed run to finish before starting the clock.
        torch.cuda.synchronize(device)

        # perf_counter is monotonic and high resolution, unlike time.time().
        start_time = time.perf_counter()
        for _ in range(iterations):
            # The product is intentionally not kept; holding it would only
            # raise peak GPU memory.
            a.matmul(b)
        torch.cuda.synchronize(device)
        elapsed = time.perf_counter() - start_time

        avg_time = elapsed / iterations
        # A dense n x n matmul performs about 2 * n^3 floating point operations.
        tflops = 2 * (size ** 3) / (avg_time * 1e12)
        results.append((size, avg_time, tflops))

        # Drop the operands before the next (possibly larger) size is allocated.
        del a, b

    print("\nBenchmark Results:")
    print(f"{'Size':<10}{'Avg Time (s)':<15}{'TFLOPS':<10}")
    for size, avg_time, tflops in results:
        print(f"{size:<10}{avg_time:<15.6f}{tflops:<10.2f}")

    return results

# Run the benchmark only when this file is executed directly, not when it is
# imported as a module.
if __name__ == "__main__":
    gpu_benchmark()
