In [1]:
import torch
import time

# Matrix-multiplication micro-benchmark: times repeated square matmuls on the
# selected device and prints the total elapsed wall time.

# Benchmark parameters (tune these to change the workload).
MATRIX_SIZE = 10000   # both operands are MATRIX_SIZE x MATRIX_SIZE
WARMUP_ITERS = 10     # untimed iterations run before measuring
BENCH_ITERS = 50      # timed iterations

# 1. Select the device: CUDA when available, otherwise fall back to the CPU
use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")
print("Using device: ", device, f"({torch.cuda.get_device_name(device)})" if use_cuda else "")

# 2. Create the operands directly on the target device
#    (avoids allocating them on the host first and copying them over)
x = torch.randn(MATRIX_SIZE, MATRIX_SIZE, device=device)
y = torch.randn(MATRIX_SIZE, MATRIX_SIZE, device=device)

# 3. Warm-up (crucial for accurate timing)
for _ in range(WARMUP_ITERS):
    z = torch.matmul(x, y)
# CUDA kernels are launched asynchronously, so wait for them to finish.
# torch.cuda.synchronize() raises when no usable CUDA device is present,
# so it must be skipped on the CPU fallback path.
if use_cuda:
    torch.cuda.synchronize()

# 4. Benchmark
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# intervals; time.time() can jump if the system clock is adjusted.
start_time = time.perf_counter()
for _ in range(BENCH_ITERS):
    z = torch.matmul(x, y)
if use_cuda:
    torch.cuda.synchronize()  # include all queued GPU work in the measurement
end_time = time.perf_counter()

print(f"Time taken: {(end_time - start_time):.4f} seconds")


Using device:  cuda (NVIDIA GeForce RTX 4070)
Time taken: 5.1684 seconds
