In [2]:
import os, base64
from IPython.display import HTML, display
import torch
import torch.nn as nn
import torch.optim as optim
from torch.profiler import profile, ProfilerActivity
import torch.cuda as cuda
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_digits
from torch.utils.data import TensorDataset, DataLoader

# Fail early with a clear message: the rest of this script (model placement,
# CUDA profiling, torch.cuda.synchronize) cannot run without a GPU.
if not cuda.is_available():
    raise RuntimeError("CUDA device required: torch.cuda.is_available() returned False")

# Report the *current* device, i.e. the one the bare "cuda" string resolves to,
# instead of hard-coding index 0.
print(f"GPU: {cuda.get_device_name()}")
device = "cuda"

# Data: scikit-learn digits, 64 flattened features per sample -> 10 classes
digits = load_digits()
X, Y = digits.data, digits.target
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=11)

# Features are cast to float32. Labels are cast explicitly to int64:
# nn.CrossEntropyLoss expects class-index targets of dtype long, and NumPy's
# default integer width is platform-dependent.
train_dataset = TensorDataset(torch.tensor(X_train, dtype=torch.float32),
                              torch.tensor(Y_train, dtype=torch.long))
test_dataset = TensorDataset(torch.tensor(X_test, dtype=torch.float32),
                             torch.tensor(Y_test, dtype=torch.long))

train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
# Evaluation gains nothing from shuffling; keep the test order deterministic.
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)

# Two-layer MLP classifier: 64 input features -> 128 hidden units (ReLU)
# -> 10 class logits. Layers are created in forward order.
layers = [
    nn.Linear(64, 128),
    nn.ReLU(),
    nn.Linear(128, 10),
]
model = nn.Sequential(*layers)
model = model.to(device)

# Cross-entropy on the logits, optimised with plain SGD.
loss_fn = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.001)

# Pull one mini-batch from the training loader and move it to the device.
# This single batch is the input of the profiled training step.
it = iter(train_loader)
batch = next(it)
x = batch[0].to(device)        # features, [batch, 64]
targets = batch[1].to(device)  # class labels, [batch]

# Status line describing the network that is actually profiled
# (64 -> 128 -> 10); the batch dimension is read from the tensor itself.
print(f"Input: {x.shape} → Linear(64,128) → ReLU → Linear(128,10) → Output: [{x.shape[0]},10]")
print("🚀 Profiling nn.Sequential P100...")

# Un-profiled warm-up step. The first CUDA forward/backward pays one-time
# initialisation costs (e.g. library handle creation, lazy kernel loading,
# allocator growth) that would otherwise dominate the trace of such a small
# model. Side effect: the profiled step below is the second SGD update.
optimizer.zero_grad()
loss_fn(model(x), targets).backward()
optimizer.step()
# Drain all pending GPU work so none of it leaks into the profiled region.
torch.cuda.synchronize()

activities = [ProfilerActivity.CPU, ProfilerActivity.CUDA]
with profile(activities=activities,
             with_stack=True,
             record_shapes=True,
             with_modules=True) as prof:

    # One complete training step: forward, loss, backward, parameter update.
    optimizer.zero_grad()
    outputs = model(x)
    loss = loss_fn(outputs, targets)
    loss.backward()
    optimizer.step()

    # Kernel launches are asynchronous; wait here so the GPU work completes
    # before the profiler context exits.
    torch.cuda.synchronize()

# Export trace in Chrome trace-event JSON format
trace_path = "/tmp/trace_linear_P100.json"
prof.export_chrome_trace(trace_path)

# Download button: embed the exported trace as a base64 data URI so it can be
# saved directly from the notebook output.
with open(trace_path, "rb") as f:
    data = base64.b64encode(f.read()).decode()

# Labels describe the profiled model: Linear(64,128) -> ReLU -> Linear(128,10).
# Base64 output only contains characters that are safe inside an HTML attribute.
html = f"""
<div style="background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); 
            color: white; padding: 30px; border-radius: 20px; text-align: center; 
            box-shadow: 0 15px 35px rgba(0,0,0,0.3);">
    <h2>✅ nn.Sequential (2× nn.Linear) P100</h2>
    <p><b>Batch 64 | [64,64] → Linear(64,128) → ReLU → Linear(128,10) | Sync optimizado</b></p>
    <p>🔍 <b>Busca: aten::addmm (GEMM CUDA) | Stream 7 | cudaMemcpyHtoD</b></p>
    <a href="data:application/json;base64,{data}" 
       download="trace_Sequential2Linear_P100.json" 
       style="background: #FF6B6B; color: white; padding: 20px 60px; 
              font-size: 22px; font-weight: bold; border-radius: 15px; 
              box-shadow: 0 10px 25px rgba(255,107,107,0.4); 
              text-decoration: none;">
        🚀 DESCARGAR trace_Sequential2Linear_P100.json
    </a>
    <br><br>
    <small>⚡ addmm CUDA kernel ~100-300µs | PCIe bound pequeño batch | Perfecto baseline</small>
</div>
"""
display(HTML(html))



GPU: Tesla P100-PCIE-16GB
Input: torch.Size([64, 64]) → Linear(64,10) → Output: [64,10]
🚀 Profiling nn.Linear P100...
