In [1]:
import torch
import torch.nn as nn
import os
from torch.utils.data import Dataset, DataLoader
import tqdm.auto

In [2]:
def get_device():
    """Return the torch device to run on.

    Returns:
        torch.device: ``cuda:0`` when ``torch.cuda.is_available()`` reports a
        usable GPU, otherwise ``cpu``.
    """
    device_name = "cuda:0" if torch.cuda.is_available() else "cpu"
    return torch.device(device_name)

In [3]:
# Select the compute device once and show which one was picked.
# NOTE(review): in the cells visible here, neither the models nor the tensors
# are moved to `device`, so everything below runs on the CPU regardless.
device = get_device()
print(device)

cpu


In [4]:
class VanishingNet(nn.Module):
    """Deep sigmoid MLP used in this notebook to inspect per-layer gradient norms.

    The model is ``num_hidden_layers`` repetitions of ``Linear -> Sigmoid``
    followed by a single ``Linear`` output layer, stored as an
    ``nn.Sequential`` in ``self.network``. With the default arguments this is
    the 784 -> (512 x 10) -> 10 stack, so parameter names run from
    ``network.0.weight`` to ``network.20.bias``.

    Args:
        in_features: Size of each input sample.
        hidden_features: Width of every hidden layer.
        num_hidden_layers: Number of ``Linear -> Sigmoid`` pairs.
        out_features: Size of each output sample.

    Raises:
        ValueError: If ``num_hidden_layers`` is negative.
    """

    def __init__(self, in_features=784, hidden_features=512,
                 num_hidden_layers=10, out_features=10):
        super().__init__()
        if num_hidden_layers < 0:
            raise ValueError("num_hidden_layers must be non-negative")

        # Layers are created strictly in forward order so that the default
        # parameter initialisation consumes the RNG in the same sequence as a
        # hand-written Sequential would.
        layers = []
        fan_in = in_features
        for _ in range(num_hidden_layers):
            layers.append(nn.Linear(fan_in, hidden_features))
            layers.append(nn.Sigmoid())
            fan_in = hidden_features
        # The output layer has no activation after it.
        layers.append(nn.Linear(fan_in, out_features))

        self.network = nn.Sequential(*layers)

    def forward(self, x):
        """Apply the sequential stack to ``x`` and return its output."""
        return self.network(x)

In [5]:
# Seed the global RNG so that Restart & Run All reproduces the same initial
# weights, input and target (and therefore the same gradient norms).
torch.manual_seed(0)

# Initialize the network and a dummy input
model = VanishingNet()
input_data = torch.randn(1, 784)  # A single random 784-feature sample

# Define a loss function and an optimizer.
# The optimizer is only constructed here; no optimizer step is taken in the
# cells visible in this notebook.
criterion = nn.MSELoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)

In [6]:
# backward() accumulates into .grad, so clear anything left over from an
# earlier execution of this cell; otherwise re-running it inflates the norms.
model.zero_grad()

# Forward pass
output = model(input_data)
target = torch.randn(1, 10)  # Random target
loss = criterion(output, target)

# Backward pass
loss.backward()

In [7]:
# Report the gradient norm of every trainable parameter, in layer order
for name, param in model.named_parameters():
    if not param.requires_grad:
        continue  # frozen parameters are not reported
    grad_norm = param.grad.norm()
    print(f"Gradients for {name}: {grad_norm}")

Gradients for network.0.weight: 7.15666175210572e-08
Gradients for network.0.bias: 2.520903263913965e-09
Gradients for network.2.weight: 2.152677467393005e-07
Gradients for network.2.bias: 1.82332673404062e-08
Gradients for network.4.weight: 1.4937688774807611e-06
Gradients for network.4.bias: 1.3155286637811514e-07
Gradients for network.6.weight: 1.1041301149816718e-05
Gradients for network.6.bias: 9.594491530151572e-07
Gradients for network.8.weight: 7.962269592098892e-05
Gradients for network.8.bias: 6.92343792252359e-06
Gradients for network.10.weight: 0.0005476514925248921
Gradients for network.10.bias: 4.826540680369362e-05
Gradients for network.12.weight: 0.003924783784896135
Gradients for network.12.bias: 0.0003431191435083747
Gradients for network.14.weight: 0.027769051492214203
Gradients for network.14.bias: 0.0024162540212273598
Gradients for network.16.weight: 0.196633979678154
Gradients for network.16.bias: 0.01708238758146763
Gradients for network.18.weight: 1.44890189170

In [8]:
class ExplodingNet(nn.Module):
    """Small ReLU MLP: 784 -> 256 -> 128 -> 64 -> 10.

    The layers live in ``self.network`` (an ``nn.Sequential``) as three
    ``Linear -> ReLU`` pairs followed by a final ``Linear`` layer. The class
    itself keeps PyTorch's default parameter initialisation.
    """

    def __init__(self):
        super().__init__()
        widths = (784, 256, 128, 64, 10)

        stack = []
        # Hidden layers: each Linear is followed by a ReLU.
        for fan_in, fan_out in zip(widths[:-2], widths[1:-1]):
            stack += [nn.Linear(fan_in, fan_out), nn.ReLU()]
        # Output layer: no activation afterwards.
        stack.append(nn.Linear(widths[-2], widths[-1]))

        self.network = nn.Sequential(*stack)

    def forward(self, x):
        """Run ``x`` through the sequential stack and return the result."""
        result = self.network(x)
        return result

In [9]:
# Seed the global RNG so that Restart & Run All reproduces the same input,
# parameter values and target (and therefore the same gradient norms).
torch.manual_seed(0)

# Initialize the network and a dummy input
model = ExplodingNet()
input_data = torch.randn(1, 784)  # A single random 784-feature sample

# Define a loss function and an optimizer.
# The optimizer is only constructed here; no optimizer step is taken in the
# cells visible in this notebook.
criterion = nn.MSELoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)

# Overwrite every parameter -- weights AND biases -- with large values drawn
# uniformly between 10 and 20 to provoke very large gradients.
# NOTE(review): all parameters end up positive while the input has random
# sign; the first-layer pre-activations can then all be negative, in which
# case ReLU zeroes them and the earliest layers get exactly-zero gradients
# (the recorded output shows 0.0 for network.0). Presumably not the intended
# demonstration -- confirm.
for param in model.parameters():
    nn.init.uniform_(param, a=10.0, b=20.0)

In [10]:
# backward() accumulates into .grad, so clear anything left over from an
# earlier execution of this cell; otherwise re-running it inflates the norms.
model.zero_grad()

# Forward pass
output = model(input_data)
target = torch.randn(1, 10)  # Random target
loss = criterion(output, target)

# Backward pass
loss.backward()

In [11]:
# Report the gradient norm of every trainable parameter, in layer order
for name, param in model.named_parameters():
    if not param.requires_grad:
        continue  # frozen parameters are not reported
    grad_norm = param.grad.norm()
    print(f"Gradients for {name}: {grad_norm}")

Gradients for network.0.weight: 0.0
Gradients for network.0.bias: 0.0
Gradients for network.2.weight: 0.0
Gradients for network.2.bias: 9009178345472.0
Gradients for network.4.weight: 1178562199552.0
Gradients for network.4.bias: 6631713280.0
Gradients for network.6.weight: 4210119671808.0
Gradients for network.6.bias: 17728724.0
