In [1]:
# !pip install torch -U

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting torch
  Downloading torch-2.0.0-cp39-cp39-manylinux1_x86_64.whl (619.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m619.9/619.9 MB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cudnn-cu11==8.5.0.96
  Downloading nvidia_cudnn_cu11-8.5.0.96-2-py3-none-manylinux1_x86_64.whl (557.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m557.1/557.1 MB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-cupti-cu11==11.7.101
  Downloading nvidia_cuda_cupti_cu11-11.7.101-py3-none-manylinux1_x86_64.whl (11.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.8/11.8 MB[0m [31m84.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting nvidia-cuda-runtime-cu11==11.7.99
  Downloading nvidia_cuda_runtime_cu11-11.7.99-py3-none-manylinux1_x86_64.whl (849 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [

In [2]:
import torch
from torch import nn
from torch.utils.data import DataLoader,TensorDataset,random_split
from torchvision import datasets, transforms

import matplotlib.pyplot as plt
import torchvision
import numpy as np
import time, os, copy, random
print("PyTorch Version: ",torch.__version__)
print("Torchvision Version: ",torchvision.__version__)

  warn(f"Failed to load image Python extension: {e}")


PyTorch Version:  2.0.0+cu117
Torchvision Version:  0.14.1+cu116


# Create artifical data

Set random seeds.


In [6]:
torch.manual_seed(0)
random.seed(0)
np.random.seed(0)

Set the number of batches and the batch size. For these early tests, 2 batches of size 3 should give good insights while not being to complicated.

In [4]:
n_batches = 5
batch_size = 3
n_samples = n_batches * batch_size

The network architecture, initial weights and test data is similar to this source:
https://www.kaggle.com/code/sironghuang/understanding-pytorch-hooks.

In the linked notebook, only one datapoint is evaluated. Here, this datapoint will be repeated to include the effects of using batches.

In [8]:
artifical_data = torch.empty((n_samples,2))
artifical_data[:,0] = 0.05
artifical_data[:,1] = 0.1
print(f'dataset size :{artifical_data.shape}')
print(f'single sample, size: {artifical_data[0,:].shape} | values: {artifical_data[0,:]}')

dataset size :torch.Size([15, 2])
single sample, size: torch.Size([2]) | values: tensor([0.0500, 0.1000])


In [9]:
artifical_labels = torch.empty_like(artifical_data)
artifical_labels[:,0] = 0.01
artifical_labels[:,1] = 0.99
print(f'label set size :{artifical_labels.shape}')
print(f'single label, size: {artifical_labels[0,:].shape} | values: {artifical_labels[0,:]}')

label set size :torch.Size([15, 2])
single label, size: torch.Size([2]) | values: tensor([0.0100, 0.9900])


Next, the datasets and dataloader are created from the tensors. The first 4*batch_size samples are being used as the training set and the remaining batch_size samples are the test set. The splitting of datasets is not necessary for now but will make extension easy later on.

Tensordata requires a 2D tensor, where each line represents one training sample. Targets may be 1-D or 2-D.

In [10]:
train_set = TensorDataset(artifical_data[:4*batch_size,], artifical_labels[:4*batch_size,])
train_loader = DataLoader(train_set, batch_size=batch_size, shuffle=False)
print(f'Number of batches in the training set is {len(train_set)}')

Number of batches in the training set is 12


In [11]:
eval_set = TensorDataset(artifical_data[4*batch_size:,], artifical_labels[4*batch_size:,])
eval_loader = DataLoader(train_set, batch_size=batch_size, shuffle=False)
print(f'Number of batches in the evaluation set is {len(eval_set)}')

Number of batches in the evaluation set is 3


In [12]:
dataloaders = {'train':train_loader,
               'eval':eval_loader}

# Create sample model

The model architecture and weights are taken from [here](https://www.kaggle.com/code/sironghuang/understanding-pytorch-hooks) for reference.

In [13]:
class TestModel(nn.Module):
  def __init__(self, dropout_rate = 0.5):
        super().__init__()
        self.fc1 = nn.Linear(2,2)
        self.s1 = nn.Sigmoid()
        self.fc2 = nn.Linear(2,2)
        self.s2 = nn.Sigmoid()
        self.fc1.weight = torch.nn.Parameter(torch.Tensor([[0.15,0.2],[0.250,0.30]]))
        self.fc1.bias = torch.nn.Parameter(torch.Tensor([0.35]))
        self.fc2.weight = torch.nn.Parameter(torch.Tensor([[0.4,0.45],[0.5,0.55]]))
        self.fc2.bias = torch.nn.Parameter(torch.Tensor([0.6]))

  def forward(self, x):
      x = self.fc1(x)
      x = self.s1(x)
      x = self.fc2(x)
      x = self.s2(x)
      return x

In [14]:
model = TestModel()
print(model)

TestModel(
  (fc1): Linear(in_features=2, out_features=2, bias=True)
  (s1): Sigmoid()
  (fc2): Linear(in_features=2, out_features=2, bias=True)
  (s2): Sigmoid()
)


In [15]:
for name, param in model.named_parameters():
    print(f"Layer: {name} | Size: {param.size()} | Values : {param[:2]} \n")

Layer: fc1.weight | Size: torch.Size([2, 2]) | Values : tensor([[0.1500, 0.2000],
        [0.2500, 0.3000]], grad_fn=<SliceBackward0>) 

Layer: fc1.bias | Size: torch.Size([1]) | Values : tensor([0.3500], grad_fn=<SliceBackward0>) 

Layer: fc2.weight | Size: torch.Size([2, 2]) | Values : tensor([[0.4000, 0.4500],
        [0.5000, 0.5500]], grad_fn=<SliceBackward0>) 

Layer: fc2.bias | Size: torch.Size([1]) | Values : tensor([0.6000], grad_fn=<SliceBackward0>) 



# Prepare optimizer and loss function

In [16]:
sgd_parameters = {
    'lr':1e-3,        # undefined
    'momentum':0,   # 0
    'dampening':0,    # 0
    'weight_decay':0  # 0
}
optimizer = torch.optim.SGD(model.parameters(), **sgd_parameters)

In [17]:
loss_fn = nn.MSELoss()

# Hooks

Create two hooks for debugging purposes:
- the forward hook will print the input and output tensor produced during the forward pass.
- the backward hook will print the gradient of the output (the gradient coming from the loss) and the gradient input (the gradient used for following calculations closer to the input layers) during the backward pass. 

In [None]:
def forward_debug_hook(module, input, output):
  print('forward hook')
  print(input)
  print(output)

def backward_debug_hook(module, grad_input, grad_output):
   print('backward hook')
   print(grad_input)
   print(grad_output)

# Model training

In [None]:
def train_model(model, dataloaders, loss_fn, optimizer, num_epochs=5):
    since = time.time()

    val_acc_history = []

    best_model_wts = copy.deepcopy(model.state_dict())
    best_acc = 0.0

    n_train_batches = len(dataloaders['train'])

    for epoch in range(num_epochs):
        print('Epoch {}/{}'.format(epoch, num_epochs - 1))
        print('-' * 10)

        ############ train phase ############
        phase = 'train'
        model.train()

        running_loss = 0.0
        running_corrects = 0

        for batch, (inputs, labels) in enumerate(dataloaders[phase]):
          optimizer.zero_grad()


          # handlef = affected_layer.register_forward_hook(forward_debug_hook)
          handleb = affected_layer.register_full_backward_hook(backward_debug_hook)
          # Get model outputs and calculate loss

          print('*'*5 + 'forward pass' + '*'*5)
          outputs = model(inputs)
          print('outputs')
          print(outputs)

          # outputs.backward(torch.tensor([[0.7414,-0.2171],[0.7414,-0.2171],[0.7414,-0.2171]],dtype=torch.float),retain_graph=True)

          print('*'*5 + 'loss calculation' + '*'*5)
          loss = loss_fn(outputs, labels.float())
          print('loss')
          print(loss)

          preds = (outputs>0.5).int()
          

          # backward + optimize
          print('*'*5 + 'backward pass' + '*'*5)
          
          # print(affected_layer._backward_hooks)
          loss.backward()          
          print('weights grad')
          print(affected_layer.weight.grad)
          print('bias grad')
          print(affected_layer.bias.grad)

          # handlef.remove()
          handleb.remove()

          # print(affected_layer._backward_hooks)
          optimizer.step()

          running_loss += loss.item() * inputs.size(0)
          running_corrects += torch.sum(preds == labels.data)

        epoch_loss = running_loss / len(dataloaders[phase].dataset)
        epoch_acc = running_corrects.double() / len(dataloaders[phase].dataset)
        print('{} Loss: {:.4f} Acc: {:.4f}'.format(phase, epoch_loss, epoch_acc))

        ############ eval phase ############
        phase = 'eval'
        model.eval()

        running_loss = 0.0
        running_corrects = 0

        for batch, (inputs, labels) in enumerate(dataloaders[phase]):
          # disable gradient tracking for speedup
          with torch.set_grad_enabled(phase == 'train'):
            outputs = model(inputs)
            loss = loss_fn(outputs, labels.float())
            preds = (outputs>0.5).int()

          running_loss += loss.item() * inputs.size(0)
          running_corrects += torch.sum(preds == labels.data)

        epoch_loss = running_loss / len(dataloaders[phase].dataset)
        epoch_acc = running_corrects.double() / len(dataloaders[phase].dataset)
        print('{} Loss: {:.4f} Acc: {:.4f}'.format(phase, epoch_loss, epoch_acc))  

        val_acc_history.append(epoch_acc)
        print()


    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(time_elapsed // 60, time_elapsed % 60))

    return val_acc_history

In [None]:
hist = train_model(model,
                   dataloaders,
                   loss_fn,
                   optimizer,
                   num_epochs=1
                   )

In [None]:
print(affected_layer._backward_hooks)
print(catch_hook.caught_grad)
catch_hook.close()
print(affected_layer._backward_hooks)

Erkenntnisse:
- layer.weight.grad wird erst in backward() befüllt.
- Im hook: gradient_out ist der bisherige **Gesamt**gradient. gradient_in ist  gradient_out*dgradient_out/dgradient_in.
- Der gradient für z.B. weight wird berechnet durch entsprechendes summe_batch(gradient_out*dgradient_out/dgewicht).
- um also den zur Idee passenden gradienten abzufangen, nutze einen full_backward_hook auf der passenden layer (hier z.B. fc2) und extrahiere den gradient_out.
- das wiedereinsetzen sollte ebenfalls durch einen hook funktionieren


# Miscellanous code
This is mostly code that was used to comprehend and retrace what is happening under the hood.

In [None]:
# artifical_labels = torch.randint(low=0,high=2,size=(n_samples,1))
# artifical_labels_batch = artifical_labels[0:batch_size]
# print(artifical_labels_batch.shape)
# print(artifical_labels_batch)

In [None]:
# all points inside the unit-sphere with radius 0.7 are marked as 1
# artifical_labels = ((artifical_data[:,0]**2 + artifical_data[:,1]**2) < 0.7).int().unsqueeze(1)
artifical_labels = torch.empty_like(artifical_data)
artifical_labels[:,0] = 0.01
artifical_labels[:,1] = 0.99

artifical_labels_batch = artifical_labels[0:batch_size,]
print(artifical_labels_batch.shape)
print(artifical_labels_batch)

In [None]:
model.eval()
output = model(artifical_data_sample)
print(output.shape)
print(output)

In [None]:
output_batch = model(artifical_data_batch)
print(output_batch.shape)
print(output_batch)

In [None]:
for name, parameter in model.linear_relu_stack.named_parameters():
  print(name)
  print(parameter)

In [None]:
input = artifical_data_sample = artifical_data_batch[:2,:]
print(input)

In [None]:
after0 = model.linear_relu_stack[0](input)
# = print(torch.matmul(model.linear_relu_stack[0].weight,input.t())+model.linear_relu_stack[0].bias)
after0

In [None]:
after1 = model.linear_relu_stack[1](after0)
after1

In [None]:
after2 = model.linear_relu_stack[2](after1)
after2

In [None]:
after3 = model.linear_relu_stack[3](after2)
after3

In [None]:
after4 = model.linear_relu_stack[4](after3)
after4

In [None]:
after5 = model.linear_relu_stack[5](after4)
after5