In [1]:
import shutil
import subprocess
import time

import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from IPython.display import clear_output

In [2]:
def check_gpu():
    """Print a snapshot of GPU status and PyTorch CUDA memory usage.

    Two reports are printed to stdout:

    1. The output of the ``nvidia-smi`` command-line tool, if it can be found
       on ``PATH``. When the tool is missing or cannot be executed, a short
       explanatory message is printed instead of failing.
    2. If ``torch.cuda.is_available()``, the name, total memory, allocated
       memory and reserved ("cached") memory of the current CUDA device as
       reported by PyTorch, in GB (bytes / 1e9).

    Returns:
        None. All information is printed.
    """
    print("GPU Status:")

    # Run nvidia-smi through subprocess rather than the IPython-only `!cmd`
    # syntax so this function is valid Python outside a notebook as well.
    smi_path = shutil.which("nvidia-smi")
    if smi_path is None:
        print("nvidia-smi not found on PATH; skipping driver-level report.")
    else:
        try:
            result = subprocess.run(
                [smi_path], capture_output=True, text=True, timeout=30
            )
        except (OSError, subprocess.TimeoutExpired) as exc:
            print(f"nvidia-smi could not be run: {exc}")
        else:
            print(result.stdout, end="")
            if result.stderr:
                # Surface driver errors instead of silently dropping them.
                print(result.stderr, end="")

    if torch.cuda.is_available():
        # Query every metric for the same (current) device; memory_allocated()
        # and memory_reserved() default to the current device, so the name and
        # total memory should describe that device too.
        device_index = torch.cuda.current_device()
        properties = torch.cuda.get_device_properties(device_index)
        print("\nPyTorch GPU Info:")
        print(f"GPU Device: {torch.cuda.get_device_name(device_index)}")
        print(f"Total GPU Memory: {properties.total_memory / 1e9:.2f} GB")
        print(f"Allocated Memory: {torch.cuda.memory_allocated(device_index) / 1e9:.2f} GB")
        print(f"Cached Memory: {torch.cuda.memory_reserved(device_index) / 1e9:.2f} GB")

In [3]:
class SimpleNN(nn.Module):
    """Fully connected feed-forward network with ReLU between linear layers.

    With the default arguments the architecture is
    ``Linear(1000, 2000) -> ReLU -> Linear(2000, 1000) -> ReLU -> Linear(1000, 100)``.

    Args:
        input_dim: Number of input features per sample.
        hidden_dims: Widths of the hidden layers, in order. An empty sequence
            yields a single ``Linear(input_dim, output_dim)`` layer.
        output_dim: Number of output features per sample.
    """

    def __init__(self, input_dim=1000, hidden_dims=(2000, 1000), output_dim=100):
        super().__init__()
        widths = [input_dim, *hidden_dims, output_dim]

        # Build Linear/ReLU pairs in order so that, for the default widths, the
        # submodule indices (0, 2, 4 for the Linear layers) stay the same as a
        # hand-written Sequential.
        modules = []
        for in_features, out_features in zip(widths[:-1], widths[1:]):
            modules.append(nn.Linear(in_features, out_features))
            modules.append(nn.ReLU())
        modules.pop()  # the output layer is not followed by an activation

        self.layers = nn.Sequential(*modules)

    def forward(self, x):
        """Apply the layer stack to ``x`` and return the result."""
        return self.layers(x)

In [4]:
def train_model(model, num_batches=100, batch_size=1000, monitor_every=10,
                input_dim=1000, output_dim=100, pause_seconds=1):
    """Train ``model`` on synthetic Gaussian data while reporting GPU usage.

    Each step draws a fresh random input/target pair, computes the MSE loss
    and applies one Adam update. The model is moved to CUDA when available,
    otherwise it stays on the CPU.

    Args:
        model: ``nn.Module`` mapping ``(batch_size, input_dim)`` inputs to
            ``(batch_size, output_dim)`` outputs.
        num_batches: Number of optimisation steps to run.
        batch_size: Number of samples per synthetic batch.
        monitor_every: Report progress and GPU status every this many
            batches (starting with batch 0). ``0`` or ``None`` disables
            monitoring entirely.
        input_dim: Feature width of the synthetic inputs.
        output_dim: Feature width of the synthetic targets.
        pause_seconds: Seconds to sleep after each report so the output stays
            readable; values ``<= 0`` skip the pause.

    Returns:
        None. The model is updated in place and progress is printed.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)
    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters())

    # Guard against monitor_every=0 (ZeroDivisionError in the modulo below)
    # and let callers switch reporting off explicitly.
    monitoring = bool(monitor_every) and monitor_every > 0

    for batch in range(num_batches):
        # Allocate the batch directly on the target device instead of creating
        # it on the CPU and copying it over.
        x = torch.randn(batch_size, input_dim, device=device)
        y = torch.randn(batch_size, output_dim, device=device)

        optimizer.zero_grad()
        output = model(x)
        loss = criterion(output, y)

        loss.backward()
        optimizer.step()

        if monitoring and batch % monitor_every == 0:
            # Replace the previous report rather than appending to it.
            clear_output(wait=True)
            print(f"Batch {batch}/{num_batches}")
            print(f"Loss: {loss.item():.4f}")
            check_gpu()
            if pause_seconds > 0:
                time.sleep(pause_seconds)

In [5]:
# Seed PyTorch's random number generators before the model is built so the
# weight initialisation and the synthetic batches (and hence the printed loss
# values) are repeatable across "Restart Kernel -> Run All", up to any
# nondeterminism in the GPU kernels themselves.
SEED = 42
torch.manual_seed(SEED)

# Baseline snapshot taken before the model is created or moved to the GPU.
print("Initial GPU Status:")
check_gpu()

print("\nStarting Training...")
model = SimpleNN()
train_model(model, num_batches=100)

# Snapshot after training; the model is still referenced by `model`, so any
# memory it occupies is included in this report.
print("\nFinal GPU Status:")
check_gpu()

Batch 90/100
Loss: 0.9951
GPU Status:
Wed Dec  4 17:29:57 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.216.03             Driver Version: 535.216.03   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA A100-PCIE-40GB          On  | 00000000:81:00.0 Off |                    0 |
| N/A   38C    P0             128W / 250W |   3578MiB / 40960MiB |      0%      Default |
|                                         |                      |             Disabled |
+-----------------------------------------+----------------------+----------------------+
                              