In [1]:
import os

import torch
from torch import nn
from torch.utils.data import DataLoader
from torchvision.datasets import MNIST
from torchvision.models import resnet18
from torchvision.transforms import Compose, Normalize, ToTensor

import ignite
from ignite.engine import Engine, Events, create_supervised_trainer, create_supervised_evaluator
from ignite.metrics import Accuracy, Loss
from ignite.handlers import ModelCheckpoint
from ignite.contrib.handlers import TensorboardLogger, global_step_from_engine
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Root folder that holds the MNIST files. Set the MNIST_DATA_DIR environment variable
# to run the notebook on another machine; the original folder remains the default.
data_path = os.environ.get("MNIST_DATA_DIR", r"C:\Users\Shizh\OneDrive - Maastricht University\Data")
# Last expression of the cell: displays the installed Ignite version.
ignite.__version__

'0.4.13'

In [2]:
class Net(nn.Module):
    """ResNet-18 wrapper adapted for single-channel images such as MNIST.

    Args:
        num_classes: Size of the output layer (number of class logits).
        in_channels: Number of channels of the input images (1 for grayscale).
    """

    def __init__(self, num_classes: int = 10, in_channels: int = 1):
        super().__init__()

        # Size the output layer for our classes instead of the 1000-class default.
        self.model = resnet18(num_classes=num_classes)

        # Replace the input layer so it accepts `in_channels` channels (grayscale
        # for MNIST) instead of RGB images; 3x3 kernel, padding 1, default stride.
        self.model.conv1 = nn.Conv2d(
            in_channels, 64, kernel_size=3, padding=1, bias=False
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Pass the batch ``x`` through the wrapped ResNet and return its output."""
        return self.model(x)


# Default arguments reproduce the MNIST setup: 10 classes, 1 input channel.
model = Net().to(device)

In [3]:
# Convert images to tensors, then normalise with mean 0.1307 and std 0.3081.
data_transform = Compose([ToTensor(), Normalize((0.1307,), (0.3081,))])

# Training split, read from the local copy under `data_path` (no download attempted);
# batches are reshuffled on every pass.
train_dataset = MNIST(root=data_path, train=True, transform=data_transform, download=False)
train_loader = DataLoader(train_dataset, batch_size=128, shuffle=True)

# Held-out split, iterated in a fixed order.
val_dataset = MNIST(root=data_path, train=False, transform=data_transform, download=False)
val_loader = DataLoader(val_dataset, batch_size=128, shuffle=False)

In [4]:
# Sanity check: pull one batch from the training loader and display the image tensor shape.
first_batch = next(iter(train_loader))
imgs, labels = first_batch
imgs.shape

torch.Size([128, 1, 28, 28])

In [5]:
# Classification loss applied to the model's outputs and the integer labels.
criterion = nn.CrossEntropyLoss()
# RMSprop over every parameter of the model.
optimizer = torch.optim.RMSprop(params=model.parameters(), lr=0.005)

Note: this is the part where Ignite automatically generates the training loop (and the evaluation loops) for us.

In [6]:
# Ready-made training engine built from the model, optimizer and loss.
trainer = create_supervised_trainer(model, optimizer, criterion, device=device)

# Metrics computed by the evaluators, keyed by the name they are reported under.
val_metrics = dict(accuracy=Accuracy(), loss=Loss(criterion))

# Two identically configured evaluators: one is run on the training data,
# the other on the validation data.
train_evaluator, val_evaluator = (
    create_supervised_evaluator(model, metrics=val_metrics, device=device)
    for _ in range(2)
)

Alternatively, to have more control over the training loop, define the training and validation steps yourself and wrap them in `Engine`, as in the code below:

In [None]:
def train_step(engine, batch):
    """Run one optimisation step on ``batch`` and return the loss as a Python float.

    ``batch[0]`` holds the inputs and ``batch[1]`` the targets; both are moved
    to ``device`` before the forward pass. ``engine`` is not used.
    """
    model.train()
    optimizer.zero_grad()
    inputs = batch[0].to(device)
    targets = batch[1].to(device)
    batch_loss = criterion(model(inputs), targets)
    batch_loss.backward()
    optimizer.step()
    return batch_loss.item()


# Engine that calls train_step once per batch.
trainer = Engine(train_step)

def validation_step(engine, batch):
    """Return ``(predictions, targets)`` for ``batch`` without tracking gradients.

    The model is switched to eval mode and both tensors are moved to ``device``.
    ``engine`` is not used.
    """
    model.eval()
    inputs, targets = batch[0].to(device), batch[1].to(device)
    with torch.no_grad():
        predictions = model(inputs)
    return predictions, targets


# Separate engines so training-set and validation-set results are kept apart.
train_evaluator = Engine(validation_step)
val_evaluator = Engine(validation_step)

# Register each metric on both evaluators under its dictionary key
# (train_evaluator first, then val_evaluator).
for evaluator in (train_evaluator, val_evaluator):
    for name, metric in val_metrics.items():
        metric.attach(evaluator, name)

We can customize the code further by adding all kinds of event handlers. Engine allows adding handlers on various events that are triggered during the run. When an event is triggered, attached handlers (functions) are executed. Thus, for logging purposes we add a function to be executed at the end of every log_interval-th iteration:

In [7]:
# How many batches to wait before logging training status
log_interval = 100


def log_training_loss(engine):
    """Print the current epoch, iteration and batch loss of ``engine``.

    ``engine.state.output`` is formatted with ``:.2f``, so it must be a number.
    """
    print(f"Epoch[{engine.state.epoch}], Iter[{engine.state.iteration}] Loss: {engine.state.output:.2f}")


# Filter the event so the handler runs only on every `log_interval`-th iteration;
# without the filter it fires after every batch and `log_interval` is never used.
trainer.add_event_handler(Events.ITERATION_COMPLETED(every=log_interval), log_training_loss)

<ignite.engine.events.RemovableEventHandle at 0x27afc184a10>

After an epoch ends during training, we can compute the training and validation metrics by running train_evaluator on train_loader and val_evaluator on val_loader respectively. Hence we will attach two additional handlers to trainer when an epoch completes:

In [7]:
@trainer.on(Events.EPOCH_COMPLETED)
def log_training_results(trainer):
    """Evaluate on the training data after each epoch and print accuracy and loss."""
    # Engine.run returns the evaluator's state, which carries the computed metrics.
    metrics = train_evaluator.run(train_loader).metrics
    print(f"Training Results - Epoch[{trainer.state.epoch}] Avg accuracy: {metrics['accuracy']:.2f} Avg loss: {metrics['loss']:.2f}")


@trainer.on(Events.EPOCH_COMPLETED)
def log_validation_results(trainer):
    """Evaluate on the validation data after each epoch and print accuracy and loss."""
    # Engine.run returns the evaluator's state, which carries the computed metrics.
    metrics = val_evaluator.run(val_loader).metrics
    print(f"Validation Results - Epoch[{trainer.state.epoch}] Avg accuracy: {metrics['accuracy']:.2f} Avg loss: {metrics['loss']:.2f}")

We can use ModelCheckpoint() as shown below to save the n_saved best models determined by a metric (here accuracy) after each epoch is completed. We attach model_checkpoint to val_evaluator because we want the two models with the highest accuracies on the validation dataset rather than the training dataset. This is why we defined two separate evaluators (val_evaluator and train_evaluator) before.

In [10]:
# Score used to rank checkpoints: the current value of a metric from val_metrics
def score_function(engine):
    """Return the "accuracy" entry of ``engine.state.metrics``."""
    computed_metrics = engine.state.metrics
    return computed_metrics["accuracy"]

# Checkpoint to store n_saved best models wrt score function
model_checkpoint = ModelCheckpoint(
    "checkpoint",
    n_saved=2,
    filename_prefix="best",
    score_function=score_function,
    score_name="accuracy",
    global_step_transform=global_step_from_engine(trainer), # helps fetch the trainer's state
    # The default (require_empty=True) raises when the directory already holds
    # checkpoint files, so re-running the notebook would fail on this cell.
    # NOTE(review): files left by earlier runs presumably stay in "checkpoint"
    # and are not counted towards n_saved; clear the folder for a clean ranking.
    require_empty=False,
)

# Run the checkpoint handler each time a val_evaluator run completes
val_evaluator.add_event_handler(Events.COMPLETED, model_checkpoint, {"model": model})


<ignite.engine.events.RemovableEventHandle at 0x28cabef8810>

We will use TensorboardLogger() to log trainer’s loss, and training and validation metrics separately.

In [11]:
# Tensorboard logger writing into the "tb-logger" directory
tb_logger = TensorboardLogger(log_dir="tb-logger")


def _as_batch_loss(loss):
    """Wrap the trainer's output under the "batch_loss" name for logging."""
    return {"batch_loss": loss}


# Plot the trainer's loss every 100 iterations under the "training" tag
tb_logger.attach_output_handler(
    trainer,
    event_name=Events.ITERATION_COMPLETED(every=100),
    tag="training",
    output_transform=_as_batch_loss,
)

# Plot all metrics of both evaluators after each of their epochs, using the
# trainer's epoch as the global step
evaluators_by_tag = {"training": train_evaluator, "validation": val_evaluator}
for tag, evaluator in evaluators_by_tag.items():
    tb_logger.attach_output_handler(
        evaluator,
        event_name=Events.EPOCH_COMPLETED,
        tag=tag,
        metric_names="all",
        global_step_transform=global_step_from_engine(trainer),
    )

In [12]:
# Train for 5 passes over the training loader; the returned state is displayed as the cell output.
trainer.run(data=train_loader, max_epochs=5)

Training Results - Epoch[1] Avg accuracy: 0.97 Avg loss: 0.11
Validation Results - Epoch[1] Avg accuracy: 0.97 Avg loss: 0.11
Training Results - Epoch[2] Avg accuracy: 0.98 Avg loss: 0.06
Validation Results - Epoch[2] Avg accuracy: 0.98 Avg loss: 0.06
Training Results - Epoch[3] Avg accuracy: 0.99 Avg loss: 0.03
Validation Results - Epoch[3] Avg accuracy: 0.99 Avg loss: 0.04
Training Results - Epoch[4] Avg accuracy: 0.96 Avg loss: 0.11
Validation Results - Epoch[4] Avg accuracy: 0.96 Avg loss: 0.12
Training Results - Epoch[5] Avg accuracy: 0.99 Avg loss: 0.03
Validation Results - Epoch[5] Avg accuracy: 0.99 Avg loss: 0.04


State:
	iteration: 2345
	epoch: 5
	epoch_length: 469
	max_epochs: 5
	output: 0.044322434812784195
	batch: <class 'list'>
	metrics: <class 'dict'>
	dataloader: <class 'torch.utils.data.dataloader.DataLoader'>
	seed: <class 'NoneType'>
	times: <class 'dict'>

When training has finished, run
~~~
tensorboard --logdir=.
~~~
to view the training log.

In [5]:
# Close the logger, then inspect the results in TensorBoard
tb_logger.close()

# Load the TensorBoard notebook extension (IPython line magic)
%load_ext tensorboard

# Start TensorBoard inside the notebook, using the current directory as the log directory
%tensorboard --logdir=.

The tensorboard extension is already loaded. To reload it, use:
  %reload_ext tensorboard
