In [None]:
from time import perf_counter

import torch
import numpy as np
from matplotlib import pyplot as plt

from torch import nn
from torch.utils.data import DataLoader
from torchvision import datasets
from torchvision.transforms import ToTensor, Compose, Lambda
from torch.utils.tensorboard import SummaryWriter

%load_ext tensorboard

In [None]:
# Noise augmentation for the training split only.
# ToTensor() scales MNIST pixels into [0, 1], so the injected noise has to stay well
# below that range. The previous Uniform(-5, 5) noise was an order of magnitude larger
# than the signal: in the recorded run the training accuracy stalled near 36% while the
# noise-free validation accuracy still reached ~84%.
NOISE_HALF_WIDTH = 0.1  # noise is drawn uniformly from [-NOISE_HALF_WIDTH, NOISE_HALF_WIDTH)

train_data = datasets.MNIST(
    root="data",
    train=True,
    download=True,
    transform=Compose([
        ToTensor(),
        # A fresh noise tensor with the image's shape is sampled for every item access.
        Lambda(lambda x: x + torch.distributions.Uniform(-NOISE_HALF_WIDTH, NOISE_HALF_WIDTH).sample(x.shape)),
    ]))

# The test split is left clean so validation metrics are measured on undistorted digits.
test_data = datasets.MNIST(
    root="data",
    train=False,
    download=True,
    transform=ToTensor())

100%|██████████| 9.91M/9.91M [00:00<00:00, 58.7MB/s]
100%|██████████| 28.9k/28.9k [00:00<00:00, 1.76MB/s]
100%|██████████| 1.65M/1.65M [00:00<00:00, 14.4MB/s]
100%|██████████| 4.54k/4.54k [00:00<00:00, 1.80MB/s]


In [None]:
# Mini-batch size shared by the training and the test loader.
batch_size = 128

# Training loader: reshuffled every epoch, incomplete final batch dropped.
train_dataloader = DataLoader(
    train_data,
    batch_size=batch_size,
    shuffle=True,
    drop_last=True,
    num_workers=8,
)
# Test loader: default ordering, final partial batch kept.
test_dataloader = DataLoader(
    test_data,
    batch_size=batch_size,
    num_workers=8,
)

# Look at the first test batch only, to confirm tensor shapes and the label dtype.
for X, y in test_dataloader:
    print(f"Shape of X [N, C, H, W]: {X.shape}")
    print(f"Shape of y: {y.shape} {y.dtype}")
    break



Shape of X [N, C, H, W]: torch.Size([128, 1, 28, 28])
Shape of y: torch.Size([128]) torch.int64


In [None]:
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using {device} device")

# MLP hyperparameters: number of hidden Linear+ReLU pairs, their width, and the size of
# a flattened 28x28 input image.
n_layers = 2
hidden_dim = 128
input_dim = 784

# Flatten -> (Linear -> ReLU) * n_layers -> Linear(10 classes).
# Submodule names ("a<i>", "b<i>", "final") are what print(model) and
# model.named_parameters() report.
model = nn.Sequential(nn.Flatten())
in_features = input_dim
for ind in range(n_layers):
    model.add_module(f"a{ind}", nn.Linear(in_features, hidden_dim))
    model.add_module(f"b{ind}", nn.ReLU())
    in_features = hidden_dim  # every layer after the first is hidden_dim -> hidden_dim
model.add_module("final", nn.Linear(hidden_dim, 10))

model = model.to(device)
print(model)

# Cross-entropy on the raw logits, optimised with plain SGD.
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.03)

Using cpu device
Sequential(
  (0): Flatten(start_dim=1, end_dim=-1)
  (a0): Linear(in_features=784, out_features=128, bias=True)
  (b0): ReLU()
  (a1): Linear(in_features=128, out_features=128, bias=True)
  (b1): ReLU()
  (final): Linear(in_features=128, out_features=10, bias=True)
)


In [None]:
def train_model(model: nn.Module,
                loss_fn: nn.Module,
                optimizer: torch.optim.Optimizer,
                training_loader: DataLoader,
                validation_loader: DataLoader,
                n_epochs: int,
                verbose: bool = True):
    """Train ``model`` for ``n_epochs`` epochs and validate after each epoch.

    Args:
        model: Network to optimise; switched to train mode at the start of every epoch.
        loss_fn: Loss applied to ``model`` outputs and labels.
        optimizer: Optimizer that updates the parameters of ``model``.
        training_loader: Yields ``(input_batch, label_batch)`` pairs for training.
        validation_loader: Loader handed to ``evaluate`` after every epoch.
        n_epochs: Number of passes over ``training_loader``.
        verbose: If true, print timing and metrics for every epoch.

    Returns:
        Dict with keys ``"train_loss"``, ``"train_accuracy"``, ``"val_loss"`` and
        ``"val_accuracy"``, each a NumPy array holding one value per epoch.
    """
    # len(loader) is the true number of batches per epoch. Dividing the dataset size by
    # batch_size undercounts by one whenever a trailing partial batch is kept
    # (drop_last=False) and raises if the loader has no fixed batch_size.
    batches_per_epoch = len(training_loader)
    print("Running {} epochs at {} steps per epoch.".format(n_epochs, batches_per_epoch))

    # note, for training we only track the average over the epoch.
    # this is somewhat imprecise, as the model changes over the epoch.
    # so the metrics at the end of the epoch will usually be better than at the start,
    # but we average over everything.
    # we could record train metrics more often to get a better picture of training progress.
    train_losses = []
    train_accuracies = []
    val_losses = []
    val_accuracies = []

    for epoch in range(n_epochs):
        if verbose:
            print("Starting epoch {}...".format(epoch + 1), end=" ")

        start_time = perf_counter()
        epoch_train_losses = []
        epoch_train_accuracies = []

        model.train()
        for input_batch, label_batch in training_loader:
            batch_loss, batch_accuracy = train_step(input_batch, label_batch, model, loss_fn, optimizer)
            epoch_train_losses.append(batch_loss.item())
            epoch_train_accuracies.append(batch_accuracy.item())

        # Only the training pass is timed; validation below is excluded.
        time_taken = perf_counter() - start_time

        # evaluate after each epoch
        val_loss, val_accuracy = evaluate(model, validation_loader, loss_fn)

        val_losses.append(val_loss.item())
        val_accuracies.append(val_accuracy.item())
        train_losses.append(np.mean(epoch_train_losses))
        train_accuracies.append(np.mean(epoch_train_accuracies))

        if verbose:
            print("Time taken: {} seconds".format(time_taken))
            print("\tTrain/val loss: {} / {}".format(train_losses[-1], val_losses[-1]))
            print("\tTrain/val accuracy: {} / {}".format(train_accuracies[-1], val_accuracies[-1]))

    return {"train_loss": np.array(train_losses), "train_accuracy": np.array(train_accuracies),
            "val_loss": np.array(val_losses), "val_accuracy": np.array(val_accuracies)}


def train_step(input_batch: torch.Tensor,
               label_batch: torch.Tensor,
               model: nn.Module,
               loss_fn: nn.Module,
               optimizer: torch.optim.Optimizer):
    """Run a single optimisation step on one batch.

    Both tensors are moved to the notebook-level ``device`` before the forward pass.

    Args:
        input_batch: Batch of model inputs.
        label_batch: Targets matching ``input_batch``.
        model: Network being trained.
        loss_fn: Loss applied to the model outputs and ``label_batch``.
        optimizer: Optimizer stepped once with the gradients of this batch.

    Returns:
        ``(batch_loss, batch_accuracy)``. The loss is detached from the autograd graph
        so that callers storing it do not keep the graph alive; the accuracy is
        whatever ``accuracy(label_batch, output_batch)`` returns.
    """
    input_batch = input_batch.to(device)
    label_batch = label_batch.to(device)

    output_batch = model(input_batch)
    batch_loss = loss_fn(output_batch, label_batch)

    batch_loss.backward()
    optimizer.step()
    # Clear the gradients right away so the next step starts from zero.
    optimizer.zero_grad()

    with torch.no_grad():
        batch_accuracy = accuracy(label_batch, output_batch)
    return batch_loss.detach(), batch_accuracy


def evaluate(model: nn.Module,
             dataloader: DataLoader,
             loss_fn: nn.Module):
    """Compute the mean loss and the accuracy of ``model`` over ``dataloader``.

    The model is put into eval mode and left there; the caller must switch back to
    train mode before training continues.

    Args:
        model: Network to evaluate.
        dataloader: Yields ``(input_batch, label_batch)`` pairs.
        loss_fn: Loss applied to predictions and labels. It is assumed to return the
            mean over the batch (the ``nn.CrossEntropyLoss`` default) - confirm if a
            different reduction is used.

    Returns:
        ``(val_loss, val_accuracy)`` as 0-dim tensors. Both are per-sample averages
        over every example actually seen; an empty loader yields NaN for both.
    """
    model.eval()
    # Tensor accumulators keep the return type a tensor in every case (callers use .item()).
    total_loss = torch.zeros((), device=device)
    correct = torch.zeros((), device=device)
    n_samples = 0

    with torch.no_grad():
        for input_batch, label_batch in dataloader:
            input_batch = input_batch.to(device)
            label_batch = label_batch.to(device)
            predictions = model(input_batch)

            # Weight each batch-mean loss by its batch size: averaging the batch means
            # directly over-weights a short final batch.
            batch_n = label_batch.shape[0]
            total_loss += loss_fn(predictions, label_batch) * batch_n
            correct += (predictions.argmax(dim=1) == label_batch).sum()
            n_samples += batch_n

    val_loss = total_loss / n_samples
    val_accuracy = correct / n_samples
    return val_loss, val_accuracy


def accuracy(labels: torch.tensor,
             outputs: torch.tensor) -> torch.tensor:
    """Return the fraction of entries whose arg-max over the last axis of ``outputs``
    equals the corresponding entry of ``labels``, as a 0-dim float tensor."""
    predicted_classes = outputs.argmax(dim=-1)
    is_correct = predicted_classes == labels
    return is_correct.to(torch.float32).mean()

In [None]:
# Train for 15 epochs, validating on the test loader after every epoch.
# The returned metrics dict is the cell's last expression, so the notebook displays it.
train_model(
    model=model,
    loss_fn=loss_fn,
    optimizer=optimizer,
    training_loader=train_dataloader,
    validation_loader=test_dataloader,
    n_epochs=15,
)

Running 15 epochs at 468 steps per epoch.
Starting epoch 1... Time taken: 19.704791262000015 seconds
	Train/val loss: 2.196601983573702 / 1.7648091316223145
	Train/val accuracy: 0.19766626602564102 / 0.6640999913215637
Starting epoch 2... Time taken: 18.082438747000026 seconds
	Train/val loss: 1.931831603630995 / 1.4513981342315674
	Train/val accuracy: 0.31547142094017094 / 0.7355999946594238
Starting epoch 3... Time taken: 19.866539338999985 seconds
	Train/val loss: 1.877176999536335 / 1.3872276544570923
	Train/val accuracy: 0.33842481303418803 / 0.7653999924659729
Starting epoch 4... Time taken: 18.624165148999964 seconds
	Train/val loss: 1.8575618534516065 / 1.3524956703186035
	Train/val accuracy: 0.35067775106837606 / 0.7914000153541565
Starting epoch 5... Time taken: 19.594487929999957 seconds
	Train/val loss: 1.849320159253911 / 1.3355538845062256
	Train/val accuracy: 0.3540831997863248 / 0.7961999773979187
Starting epoch 6... Time taken: 18.32497814300001 seconds
	Train/val loss

{'train_loss': array([2.19660198, 1.9318316 , 1.877177  , 1.85756185, 1.84932016,
        1.83745398, 1.83189816, 1.82954613, 1.82622564, 1.82364054,
        1.82369965, 1.81682925, 1.81087966, 1.80923808, 1.8116803 ]),
 'train_accuracy': array([0.19766627, 0.31547142, 0.33842481, 0.35067775, 0.3540832 ,
        0.35738849, 0.35972556, 0.36010951, 0.36092748, 0.36229634,
        0.36196247, 0.36436632, 0.36708734, 0.3679554 , 0.36656985]),
 'val_loss': array([1.76480913, 1.45139813, 1.38722765, 1.35249567, 1.33555388,
        1.34642339, 1.3165592 , 1.31610191, 1.32215524, 1.31860948,
        1.31535983, 1.31714785, 1.30332327, 1.29230058, 1.28957999]),
 'val_accuracy': array([0.66409999, 0.73559999, 0.76539999, 0.79140002, 0.79619998,
        0.7985    , 0.80339998, 0.78850001, 0.81370002, 0.82770002,
        0.82929999, 0.82260001, 0.85479999, 0.83050001, 0.8387    ])}

In [None]:
%tensorboard --logdir=runs

In [None]:
from collections import defaultdict

# Activation statistics per hooked layer, keyed by the module's name inside the model.
# Every forward pass through a hooked layer appends one {"mean": float} record, so the
# lists grow for as long as the model is used; train_step only reads the newest entry.
activation_stats = defaultdict(list)

# Activation layer types whose outputs are monitored. The MLP built above uses nn.ReLU,
# so matching only nn.Sigmoid registered no hooks at all and nothing was ever recorded.
MONITORED_ACTIVATIONS = (nn.ReLU, nn.Sigmoid, nn.Tanh)


def get_activation_hook(name):
    """Build a forward hook that records the mean output of the layer called ``name``.

    Args:
        name: Key under which the records are stored in ``activation_stats``.

    Returns:
        A callable with the ``(module, input, output)`` forward-hook signature.
    """
    def hook(model, input, output):
        # detach(): the statistic must not become part of the autograd graph.
        act = output.detach()
        activation_stats[name].append({
            "mean": act.mean().item()
        })
    return hook


# Remove hooks left over from a previous run of this cell; otherwise every re-run would
# stack one more hook per layer and each forward pass would be recorded repeatedly.
for stale_handle in globals().get("activation_hook_handles", []):
    stale_handle.remove()

# Requires `model` from the model-definition cell to exist in the kernel.
activation_hook_handles = [
    module.register_forward_hook(get_activation_hook(name))
    for name, module in model.named_modules()
    if isinstance(module, MONITORED_ACTIVATIONS)
]

from torch.utils.tensorboard import SummaryWriter
from typing import Optional

def train_model(model: nn.Module,
                loss_fn: nn.Module,
                optimizer: torch.optim.Optimizer,
                training_loader: DataLoader,
                validation_loader: DataLoader,
                n_epochs: int,
                verbose: bool = True,
                logdir: Optional[str] = None):
    """Train ``model`` for ``n_epochs`` epochs, validating and logging to TensorBoard.

    Args:
        model: Network to optimise; switched to train mode at the start of every epoch.
        loss_fn: Loss applied to ``model`` outputs and labels.
        optimizer: Optimizer that updates the parameters of ``model``.
        training_loader: Yields ``(input_batch, label_batch)`` pairs for training.
        validation_loader: Loader handed to ``evaluate`` after every epoch.
        n_epochs: Number of passes over ``training_loader``.
        verbose: If true, print timing and metrics for every epoch. Does not affect
            what is written to TensorBoard.
        logdir: Directory passed to ``SummaryWriter``; ``None`` selects its default.

    Returns:
        Dict with keys ``"train_loss"``, ``"train_accuracy"``, ``"val_loss"`` and
        ``"val_accuracy"``, each a NumPy array holding one value per epoch.
    """
    # len(loader) is the true number of batches per epoch. Dividing the dataset size by
    # batch_size undercounts when a trailing partial batch is kept, which would make the
    # global step of an epoch's last batch collide with the first step of the next epoch.
    batches_per_epoch = len(training_loader)
    print("Running {} epochs at {} steps per epoch.".format(n_epochs, batches_per_epoch))

    # note, for training we only track the average over the epoch.
    # this is somewhat imprecise, as the model changes over the epoch.
    # so the metrics at the end of the epoch will usually be better than at the start,
    # but we average over everything.
    # we could record train metrics more often to get a better picture of training progress.
    train_losses = []
    train_accuracies = []
    val_losses = []
    val_accuracies = []

    # The context manager closes the writer (flushing pending events to disk) even if
    # training is interrupted by an exception.
    with SummaryWriter(logdir) as writer:
        for epoch in range(n_epochs):
            if verbose:
                print("Starting epoch {}...".format(epoch + 1), end=" ")

            start_time = perf_counter()
            epoch_train_losses = []
            epoch_train_accuracies = []

            # Global step of the epoch's first batch. Setting it here keeps total_step
            # defined for the calls below even if the loader yields no batches.
            total_step = epoch * batches_per_epoch

            model.train()
            for batch_ind, (input_batch, label_batch) in enumerate(training_loader):
                total_step = epoch * batches_per_epoch + batch_ind

                batch_loss, batch_accuracy = train_step(input_batch, label_batch, model, loss_fn,
                                                        optimizer, writer, total_step)
                epoch_train_losses.append(batch_loss.item())
                epoch_train_accuracies.append(batch_accuracy.item())

            # Only the training pass is timed; validation below is excluded.
            time_taken = perf_counter() - start_time

            # evaluate after each epoch
            val_loss, val_accuracy = evaluate(model, validation_loader, loss_fn, writer, total_step)

            val_losses.append(val_loss.item())
            val_accuracies.append(val_accuracy.item())
            train_losses.append(np.mean(epoch_train_losses))
            train_accuracies.append(np.mean(epoch_train_accuracies))

            # Log the epoch *mean* (previously the last batch's loss was written under
            # this tag, and only when verbose was enabled).
            writer.add_scalar("epoch_train_loss", float(train_losses[-1]), total_step)

            if verbose:
                print("Time taken: {} seconds".format(time_taken))
                print("\tTrain/val loss: {} / {}".format(train_losses[-1], val_losses[-1]))
                print("\tTrain/val accuracy: {} / {}".format(train_accuracies[-1], val_accuracies[-1]))

    return {"train_loss": np.array(train_losses), "train_accuracy": np.array(train_accuracies),
            "val_loss": np.array(val_losses), "val_accuracy": np.array(val_accuracies)}


def train_step(input_batch: torch.Tensor,
               label_batch: torch.Tensor,
               model: nn.Module,
               loss_fn: nn.Module,
               optimizer: torch.optim.Optimizer,
               writer: SummaryWriter,
               total_step: int):
    """Run one optimisation step on a batch and log diagnostics to TensorBoard.

    Written at ``total_step``: the batch loss, a histogram of every parameter, and for
    every parameter that has a gradient its mean, L2 norm and histogram. The first
    image of the batch is logged whenever ``total_step`` is a multiple of 100, and the
    newest entry of each ``activation_stats`` list is logged as the layer's mean
    activation.

    Args:
        input_batch: Batch of model inputs; moved to the notebook-level ``device``.
        label_batch: Targets matching ``input_batch``; moved to ``device`` as well.
        model: Network being trained.
        loss_fn: Loss applied to the model outputs and ``label_batch``.
        optimizer: Optimizer stepped once with the gradients of this batch.
        writer: TensorBoard writer receiving all diagnostics.
        total_step: Global step index used as the x-axis for everything logged.

    Returns:
        ``(batch_loss, batch_accuracy)``. The loss is detached from the autograd graph.
    """
    input_batch = input_batch.to(device)
    label_batch = label_batch.to(device)
    output_batch = model(input_batch)

    batch_loss = loss_fn(output_batch, label_batch)
    writer.add_scalar("step_train_loss", batch_loss.item(), total_step)

    batch_loss.backward()

    # Gradients only exist between backward() and zero_grad(), so they are logged here.
    with torch.no_grad():
        for name, parameter in model.named_parameters():
            writer.add_histogram(name, parameter, total_step)

            grad = parameter.grad
            if grad is None:
                # Frozen or unused parameters carry no gradient; skip them instead of
                # failing on `None ** 2`.
                continue
            writer.add_scalars(f'gradients_{name}', {
                'mean': grad.mean().item()
            }, total_step)
            writer.add_histogram(f'gradients_{name}', grad, total_step)
            # L2 norm over all elements of the gradient.
            writer.add_scalar("gradient_" + name, torch.sqrt((grad ** 2).sum()).item(), total_step)

        if total_step % 100 == 0:
            writer.add_images("images/train", input_batch[:1], total_step)

    # The forward hooks appended their records during model(input_batch) above, so the
    # last entry of each list belongs to this batch.
    for layer_name, stats in activation_stats.items():
        if stats:
            writer.add_scalars(f'activations_{layer_name}', {
                'mean': stats[-1]['mean']
            }, total_step)

    optimizer.step()
    optimizer.zero_grad()
    with torch.no_grad():
        batch_accuracy = accuracy(label_batch, output_batch)
    return batch_loss.detach(), batch_accuracy

def evaluate(model: nn.Module,
             dataloader: DataLoader,
             loss_fn: nn.Module,
             writer: SummaryWriter,
             total_step: int):
    """Compute mean loss and accuracy over ``dataloader`` and log one sample image.

    The model is put into eval mode and left there; the caller must switch back to
    train mode before training continues.

    Args:
        model: Network to evaluate.
        dataloader: Yields ``(input_batch, label_batch)`` pairs.
        loss_fn: Loss applied to predictions and labels. It is assumed to return the
            mean over the batch (the ``nn.CrossEntropyLoss`` default) - confirm if a
            different reduction is used.
        writer: TensorBoard writer that receives the sample image.
        total_step: Global step under which the image is logged.

    Returns:
        ``(val_loss, val_accuracy)`` as 0-dim tensors. Both are per-sample averages
        over every example actually seen; an empty loader yields NaN for both.
    """
    model.eval()
    # Tensor accumulators keep the return type a tensor in every case (callers use .item()).
    total_loss = torch.zeros((), device=device)
    correct = torch.zeros((), device=device)
    n_samples = 0

    with torch.no_grad():
        for batch_ind, (input_batch, label_batch) in enumerate(dataloader):
            input_batch = input_batch.to(device)
            label_batch = label_batch.to(device)
            predictions = model(input_batch)

            # Weight each batch-mean loss by its batch size: averaging the batch means
            # directly over-weights a short final batch.
            batch_n = label_batch.shape[0]
            total_loss += loss_fn(predictions, label_batch) * batch_n
            correct += (predictions.argmax(dim=1) == label_batch).sum()
            n_samples += batch_n

            if batch_ind == 0:
                # Log one image per evaluation, taken from the first batch. The earlier
                # `total_step % 100 == 0` guard was checked once against the step index
                # of the epoch's last training batch, which is rarely a multiple of 100
                # (never, with 468 steps per epoch), so no image was ever written.
                writer.add_images("images/validate", input_batch[:1], total_step)

    val_loss = total_loss / n_samples
    val_accuracy = correct / n_samples
    return val_loss, val_accuracy

NameError: name 'model' is not defined

In [None]:
from torchvision.transforms import ToTensor, Compose, Lambda
# Training split: every image tensor gets uniform noise drawn from [-0.1, 0.1) added to
# it each time it is loaded.
train_data = datasets.MNIST(
    root="data",
    train=True,
    download=True,
    transform=Compose([
        ToTensor(),
        Lambda(lambda image: image + torch.distributions.Uniform(low=-0.1, high=0.1).sample(sample_shape=image.shape)),
    ]),
)

# Test split: converted to tensors only, no noise.
test_data = datasets.MNIST(root="data", train=False, download=True, transform=ToTensor())