# Training a PyTorch MNIST model with MlFlow and Optuna

## Imports

In [51]:
from __future__ import print_function
import os
import time

import optuna
import mlflow
import torch

import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from torchvision import datasets, transforms
from torch.optim.lr_scheduler import StepLR
from mlflow import pytorch
from pprint import pformat
from urllib.parse import urlparse

## Network 

In [52]:
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.conv1 = nn.Conv2d(1, 32, 3, 1)
        self.conv2 = nn.Conv2d(32, 64, 3, 1)
        self.dropout1 = nn.Dropout2d(0.25)
        self.dropout2 = nn.Dropout2d(0.5)
        self.fc1 = nn.Linear(9216, 128)
        self.fc2 = nn.Linear(128, 10)

    def forward(self, x):
        x = self.conv1(x)
        x = F.relu(x)
        x = self.conv2(x)
        x = F.relu(x)
        x = F.max_pool2d(x, 2)
        x = self.dropout1(x)
        x = torch.flatten(x, 1)
        x = self.fc1(x)
        x = F.relu(x)
        x = self.dropout2(x)
        x = self.fc2(x)
        output = F.log_softmax(x, dim=1)
        return output
    

## Train and validation functions

In [53]:
def train(args, model, device, train_loader, optimizer, epoch):
    model.train()
    train_set_size = len(train_loader.dataset)
    for batch_idx, (data, target) in enumerate(train_loader):
        data, target = data.to(device), target.to(device)
        optimizer.zero_grad()
        output = model(data)
        loss = F.nll_loss(output, target)
        loss.backward()
        optimizer.step()
        if batch_idx % args["log_interval"] == 0:
            # Log batch loss using mlflow
            mlflow.log_metric("train_batch_loss", loss.item())
            batch_size = len(data)
            print(f"Train Epoch: {epoch} [{batch_idx * batch_size}/{train_set_size} "
                  f"({100. * batch_idx / len(train_loader):.0f}%)]\tLoss: {loss.item():.6f}")


def test(model, device, test_loader):
    model.eval()
    test_set_size = len(test_loader.dataset)
    test_loss = 0
    correct = 0
    with torch.no_grad():
        for data, target in test_loader:
            data, target = data.to(device), target.to(device)
            output = model(data)
            test_loss += F.nll_loss(output, target, reduction='sum').item()  # sum up batch loss
            pred = output.argmax(dim=1, keepdim=True)  # get the index of the max log-probability
            correct += pred.eq(target.view_as(pred)).sum().item()

    test_loss /= test_set_size
    mlflow.log_metric("test_loss", test_loss)

    print(f"Test set: Average loss: {test_loss:.4f}, Accuracy: {correct}/{test_set_size} "
          f"({100. * correct / test_set_size:.0f}%)")

## Main

In [54]:
def get_artifact_path(active_run):
    parsed_uri = urlparse(active_run.info.artifact_uri)
    artifact_path = os.path.abspath(os.path.join(parsed_uri.netloc, parsed_uri.path))
    return artifact_path

In [55]:
def suggest_hyperparameters(trial):
    lr = trial.suggest_float("lr", 1e-4, 1e-1, log=True)
    dropout = trial.suggest_float("dropout", 0.0, 0.8, step=0.1)
    batch_size = trial.suggest_int("batch_size", 32, 256, step=4)
    optimizer = trial.suggest_categorical("optim", ["Adam", "Adadelta"])
    print(f"Trial parameters: {pformat([trial.params])}")
    return lr, dropout, optimizer, batch_size

In [56]:
def objective(trial, experiment, params=None):
    # Start mlflow run
    with mlflow.start_run(experiment_id=experiment.experiment_id):
        active_run = mlflow.active_run()
        artifact_path = get_artifact_path(active_run)
        print(f"Artifact path for this run: {artifact_path}")
        
        # Use mlflow to log params
        mlflow.log_params(params)

        # Use CUDA if GPU is available
        use_cuda = params["use_cuda"] and torch.cuda.is_available()
        device = torch.device("cuda" if use_cuda else "cpu")
        # Log mlflow device parameter
        mlflow.log_param("device", device)

        torch.manual_seed(params["seed"])

        # Use hyperparameter suggestions created by Optuna
        lr, dropout, optimizer_name, batch_size = suggest_hyperparameters(trial)
        
        # Load the MNIST train and test dataset and save it to ./data
        mnist_train = datasets.MNIST('./data', train=True, download=True, transform=transforms.Compose([
                                           transforms.ToTensor(),
                                           transforms.Normalize((0.1307,), (0.3081,))
                                       ]))
        train_loader = torch.utils.data.DataLoader(mnist_train,
                                                   batch_size=batch_size,
                                                   shuffle=True)
        mnist_test = datasets.MNIST('./data', train=False, download=True, transform=transforms.Compose([
                                           transforms.ToTensor(),
                                           transforms.Normalize((0.1307,), (0.3081,))
                                       ]))
        test_loader = torch.utils.data.DataLoader(mnist_test,
                                                  batch_size=params["validation_batch_size"],
                                                  shuffle=True)

        model = Net().to(device)
        if optimizer_name == "Adam":
            optimizer = optim.Adam(model.parameters(), lr=lr)
        if optimizer_name == "Adadelta":
            optimizer = optim.Adadelta(model.parameters(), lr=lr)
        scheduler = StepLR(optimizer, step_size=1, gamma=params["gamma"])

        for epoch in range(0, params["epochs"]):
            train(params, model, device, train_loader, optimizer, epoch)
            test(model, device, test_loader)
            scheduler.step()
    
        if params["save_model"]:
            pytorch.save_model(model, f"{artifact_path}/mnist_model")

In [57]:
def main(params=None):
    # create mlflow experiment if it doesn't exist already
    experiment_name = params["experiment_name"]
    experiment = mlflow.get_experiment_by_name(experiment_name)
    if experiment is None:
        mlflow.create_experiment(experiment_name)
        experiment = mlflow.get_experiment_by_name(experiment_name)
    mlflow.set_experiment(experiment_name)

    # Create the optuna study whih has the same name as the experiment
    study = optuna.create_study(study_name=experiment_name, direction="minimize")
    study.optimize(lambda trial: objective(trial, experiment, params), n_trials=5)

    pruned_trials = [t for t in study.trials if t.state == optuna.trial.TrialState.PRUNED]
    complete_trials = [t for t in study.trials if t.state == optuna.trial.TrialState.COMPLETE]

    print("Study statistics: ")
    print("  Number of finished trials: ", len(study.trials))
    print("  Number of pruned trials: ", len(pruned_trials))
    print("  Number of complete trials: ", len(complete_trials))

    print("Best trial:")
    trial = study.best_trial

    print("  Value: ", trial.value)

    print("  Params: ")
    for key, value in trial.params.items():
        print("    {}: {}".format(key, value))


In [None]:
PARAMS = {
    "experiment_name": "pytorch-optuna-mlflow",
    "batch_size": 128,
    "validation_batch_size": 1000,
    "epochs": 3,
    "gamma": 0.7,
    "use_cuda": False,
    "seed": 42,
    "log_interval": 10,
    "save_model": True
}
main(params=PARAMS)

Artifact path for this run: /home/steffi/dev/blog/pytorch-mlflow-optuna/mlruns/0/4578e1800fb0494aaaa90f25bca103f6/artifacts
Trial parameters: [{'batch_size': 232,
  'dropout': 0.2,
  'lr': 0.007334813642845111,
  'optim': 'Adadelta'}]
Test set: Average loss: 0.5984, Accuracy: 8574/10000 (86%)
Test set: Average loss: 0.3996, Accuracy: 8947/10000 (89%)
