# PyTorch: Using MLflow and Optuna for experiment tracking and hyperparameter optimization

## Imports

In [155]:
from __future__ import print_function
import os
import time

import optuna
import mlflow
import torch

import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from torchvision import datasets, transforms
from torch.optim.lr_scheduler import StepLR
from mlflow import pytorch
from pprint import pformat
from urllib.parse import urlparse

## Network 

In [156]:
class Net(nn.Module):
    """Small convolutional classifier producing log-probabilities for 10 classes.

    The architecture is two 3x3 convolutions, a 2x2 max-pool, and two fully
    connected layers. The first linear layer expects 9216 = 64 * 12 * 12
    features, which is what a single-channel 28x28 input yields after the
    two unpadded convolutions and the pooling step.

    Args:
        dropout: Drop probability used by both dropout layers.
    """

    def __init__(self, dropout=0.0):
        super().__init__()
        self.conv1 = nn.Conv2d(1, 32, 3, 1)
        self.conv2 = nn.Conv2d(32, 64, 3, 1)
        # Channel-wise dropout on the 4-D convolutional feature maps.
        self.dropout1 = nn.Dropout2d(dropout)
        # Element-wise dropout on the flattened (N, 128) activations.
        # nn.Dropout2d is deprecated for 2-D inputs; nn.Dropout keeps the
        # same behaviour without the deprecation warning.
        self.dropout2 = nn.Dropout(dropout)
        self.fc1 = nn.Linear(9216, 128)
        self.fc2 = nn.Linear(128, 10)

    def forward(self, x):
        """Return per-class log-probabilities for a batch of images.

        Args:
            x: Input batch; the layer sizes imply shape (N, 1, 28, 28).

        Returns:
            Tensor of shape (N, 10) holding log-softmax scores.
        """
        x = F.relu(self.conv1(x))
        x = F.relu(self.conv2(x))
        x = F.max_pool2d(x, 2)
        x = self.dropout1(x)
        # Keep the batch dimension, flatten everything else.
        x = torch.flatten(x, 1)
        x = F.relu(self.fc1(x))
        x = self.dropout2(x)
        x = self.fc2(x)
        return F.log_softmax(x, dim=1)
    

## Train and test functions

In [157]:
def train(options, model, device, train_loader, optimizer, epoch):
    """Train ``model`` for one epoch and log the batch loss periodically.

    Args:
        options: Mapping that provides ``"log_interval"``, the number of
            batches between two log entries.
        model: Network to optimize; it is switched to training mode.
        device: Device the batches are moved to before the forward pass.
        train_loader: Iterable of ``(data, target)`` batches that also
            exposes ``dataset`` and supports ``len()``.
        optimizer: Optimizer that updates the parameters of ``model``.
        epoch: Integer index of the current epoch; used for the console
            output and to compute the MLflow metric step.
    """
    model.train()
    train_set_size = len(train_loader.dataset)
    num_batches = len(train_loader)
    # Running count of samples processed before the current batch. Unlike
    # ``batch_idx * len(data)`` this stays correct for a short final batch.
    samples_seen = 0
    for batch_idx, (data, target) in enumerate(train_loader):
        data, target = data.to(device), target.to(device)
        optimizer.zero_grad()
        output = model(data)
        loss = F.nll_loss(output, target)
        loss.backward()
        optimizer.step()
        if batch_idx % options["log_interval"] == 0:
            # Read the scalar once and reuse it for MLflow and the console.
            batch_loss = loss.item()
            # Log with a global step so the metric history is ordered
            # across batches and epochs instead of piling up on step 0.
            mlflow.log_metric("train_batch_nll_loss", batch_loss,
                              step=epoch * num_batches + batch_idx)
            print(f"Train Epoch: {epoch} [{samples_seen}/{train_set_size} "
                  f"({100. * batch_idx / num_batches:.0f}%)]\tLoss: {batch_loss:.6f}")
        samples_seen += len(data)


def test(model, device, test_loader):
    """Evaluate ``model`` on ``test_loader`` and return the mean NLL loss.

    The model is put into evaluation mode and run without gradient
    tracking. The mean loss is logged to MLflow as ``test_nll_loss`` and
    printed together with the accuracy.

    Args:
        model: Network to evaluate.
        device: Device the batches are moved to.
        test_loader: Iterable of ``(data, target)`` batches that also
            exposes ``dataset``.

    Returns:
        The summed negative log-likelihood divided by the dataset size.
    """
    model.eval()
    num_samples = len(test_loader.dataset)
    summed_loss = 0
    num_correct = 0
    with torch.no_grad():
        for inputs, labels in test_loader:
            inputs = inputs.to(device)
            labels = labels.to(device)
            log_probs = model(inputs)
            # Accumulate the un-averaged loss so that the final division
            # yields a true per-sample mean.
            summed_loss += F.nll_loss(log_probs, labels, reduction='sum').item()
            # The predicted class is the index of the largest log-probability.
            predictions = log_probs.argmax(dim=1, keepdim=True)
            num_correct += predictions.eq(labels.view_as(predictions)).sum().item()

    mean_loss = summed_loss / num_samples
    # Record the average test set loss with mlflow
    mlflow.log_metric("test_nll_loss", mean_loss)

    accuracy_pct = 100. * num_correct / num_samples
    print(f"Test set: Average loss: {mean_loss:.4f}, Accuracy: {num_correct}/{num_samples} "
          f"({accuracy_pct:.0f}%)\n")
    return mean_loss

## Main

In [158]:
def get_artifact_path(active_run):
    """Return the absolute local filesystem path of a run's artifact directory.

    Args:
        active_run: Object whose ``info.artifact_uri`` holds the artifact
            location, either as a ``file://`` URI or as a plain path.

    Returns:
        Absolute path built from the network location and path components
        of the URI. For ``file`` URIs, percent-escapes such as ``%20`` are
        decoded so the result names the real directory on disk.
    """
    from urllib.request import url2pathname

    parsed_uri = urlparse(active_run.info.artifact_uri)
    uri_path = parsed_uri.path
    if parsed_uri.scheme == "file":
        # Only real URIs are percent-encoded; a scheme-less local path may
        # legitimately contain a literal '%' and must be left untouched.
        uri_path = url2pathname(uri_path)
    return os.path.abspath(os.path.join(parsed_uri.netloc, uri_path))

In [159]:
# Obtain hyperparameters for this trial
def suggest_hyperparameters(trial):
    """Ask the optuna trial for this run's hyperparameters and log them.

    Args:
        trial: Optuna trial used to draw the suggestions.

    Returns:
        Tuple ``(lr, dropout, optimizer_name, batch_size)``.
    """
    # Learning rate, sampled on a logarithmic scale
    learning_rate = trial.suggest_float("lr", 1e-4, 1e-1, log=True)
    # Dropout ratio between 0.0 and 0.9 in increments of 0.1
    dropout_ratio = trial.suggest_float("dropout", 0.0, 0.9, step=0.1)
    # Exponent of the batch size; the batch size itself is 2 ** exponent
    batch_size_exponent = trial.suggest_int("batch_size_power", 5, 8, step=1)
    # Name of the optimizer to use
    optimizer_choice = trial.suggest_categorical("optimizer_name", ["Adam", "Adadelta"])

    suggested = trial.params
    print(f"Suggested hyperparameters: \n{pformat(suggested)}")
    # Record the drawn trial parameters with mlflow
    mlflow.log_params(suggested)
    return learning_rate, dropout_ratio, optimizer_choice, 2 ** batch_size_exponent

In [160]:
def objective(trial, experiment, options=None):
    """Run one optuna trial inside an mlflow run and return its test loss.

    Args:
        trial: Optuna trial that supplies the hyperparameter suggestions.
        experiment: Mlflow experiment; its ``experiment_id`` is used to
            start the run.
        options: Mapping with the keys ``"epochs"``, ``"use_cuda"``,
            ``"log_interval"`` and ``"save_model"``. Required despite the
            ``None`` default.

    Returns:
        The test loss measured after the last epoch.

    Raises:
        ValueError: If ``options`` is missing, ``options["epochs"]`` is
            smaller than 1, or an unknown optimizer name is suggested.
    """
    # Validate before opening a run: without at least one epoch there is
    # no test loss to hand back to optuna.
    if options is None:
        raise ValueError("options must be provided")
    if options["epochs"] < 1:
        raise ValueError(f"options['epochs'] must be at least 1, got {options['epochs']}")

    # Start mlflow run
    with mlflow.start_run(experiment_id=experiment.experiment_id):
        print(f"\n**************************")

        active_run = mlflow.active_run()
        print(f"Starting run {active_run.info.run_id}")

        artifact_path = get_artifact_path(active_run)
        print(f"Artifact path for this run: {artifact_path}")

        # Use mlflow to log params
        mlflow.log_params(options)

        # Use CUDA if GPU is available
        use_cuda = options["use_cuda"] and torch.cuda.is_available()
        device = torch.device("cuda" if use_cuda else "cpu")
        # Log mlflow device parameter
        mlflow.log_param("device", device)

        # Get hyperparameter suggestions created by optuna
        lr, dropout, optimizer_name, batch_size = suggest_hyperparameters(trial)

        # Both splits share one preprocessing pipeline, so build it once.
        mnist_transform = transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize((0.1307,), (0.3081,)),
        ])

        # Load the MNIST train and test datasets and save them to ./data
        mnist_train = datasets.MNIST('./data', train=True, download=True,
                                     transform=mnist_transform)
        train_loader = torch.utils.data.DataLoader(mnist_train,
                                                   batch_size=batch_size,
                                                   shuffle=True)
        mnist_test = datasets.MNIST('./data', train=False, download=True,
                                    transform=mnist_transform)
        # Evaluation sums over the whole test set, so shuffling adds nothing.
        test_loader = torch.utils.data.DataLoader(mnist_test,
                                                  batch_size=1000,
                                                  shuffle=False)
        # Initialize network
        model = Net(dropout=dropout).to(device)

        # Pick an optimizer based on optuna's parameter suggestion
        if optimizer_name == "Adam":
            optimizer = optim.Adam(model.parameters(), lr=lr)
        elif optimizer_name == "Adadelta":
            optimizer = optim.Adadelta(model.parameters(), lr=lr)
        else:
            # Fail loudly instead of hitting an unbound ``optimizer`` below.
            raise ValueError(f"Unsupported optimizer name: {optimizer_name!r}")
        scheduler = StepLR(optimizer, step_size=1, gamma=0.7)

        # Training loop
        for epoch in range(0, options["epochs"]):
            train(options, model, device, train_loader, optimizer, epoch)
            loss = test(model, device, test_loader)
            scheduler.step()

        # Save the model using mlflow
        if options["save_model"]:
            pytorch.save_model(model, f"{artifact_path}/mnist_model")

    # Return the test loss of the last epoch, which optuna minimizes
    return loss

In [161]:
def main(options=None):
    """Create the mlflow experiment, run the optuna study and print a summary.

    Args:
        options: Mapping with at least ``"experiment_name"`` plus the keys
            read by ``objective``. An optional ``"n_trials"`` entry sets
            the number of optuna trials and defaults to 2. Required
            despite the ``None`` default.
    """
    # Create mlflow experiment if it doesn't exist already
    experiment_name = options["experiment_name"]
    experiment = mlflow.get_experiment_by_name(experiment_name)
    if experiment is None:
        mlflow.create_experiment(experiment_name)
        experiment = mlflow.get_experiment_by_name(experiment_name)
    mlflow.set_experiment(experiment_name)

    # Show optuna's log messages from INFO level upwards.
    optuna.logging.set_verbosity(verbosity=optuna.logging.INFO)

    # Number of trials is configurable; 2 keeps the previous behaviour.
    n_trials = options.get("n_trials", 2)

    # Create the optuna study which shares the experiment name
    study = optuna.create_study(study_name=experiment_name, direction="minimize")
    study.optimize(lambda trial: objective(trial, experiment, options), n_trials=n_trials)

    # Filter optuna trials by state
    pruned_trials = [t for t in study.trials if t.state == optuna.trial.TrialState.PRUNED]
    complete_trials = [t for t in study.trials if t.state == optuna.trial.TrialState.COMPLETE]

    print("\n++++++++++++++++++++++++++++++++++\n")
    print("Study statistics: ")
    print("  Number of finished trials: ", len(study.trials))
    print("  Number of pruned trials: ", len(pruned_trials))
    print("  Number of complete trials: ", len(complete_trials))

    print("Best trial:")
    trial = study.best_trial

    print("  Trial number: ", trial.number)
    print("  Loss (trial value): ", trial.value)

    print("  Params: ")
    for key, value in trial.params.items():
        print(f"    {key}: {value}")


In [None]:
# Run configuration passed to main() and, through it, to every trial.
OPTIONS = {
    # Name of the mlflow experiment; also used as the optuna study name
    "experiment_name": "pytorch-optuna-mlflow",
    # Number of training epochs per trial
    "epochs": 1,
    # Train on the GPU when one is available
    "use_cuda": False,
    # Log the training loss every this many batches
    "log_interval": 10,
    # Store the trained model in the run's artifact directory
    "save_model": True,
}

# Guarded so the study only starts when this is executed directly (which
# includes running it as a notebook cell), not when it is imported.
if __name__ == "__main__":
    main(options=OPTIONS)


**************************
Starting run 98ee2469c0a9442495da21a907e433fb
Artifact path for this run: /home/steffi/dev/blog/pytorch-mlflow-optuna/mlruns/0/98ee2469c0a9442495da21a907e433fb/artifacts
Suggested hyperparameters: 
{'batch_size_power': 7,
 'dropout': 0.5,
 'lr': 0.00019711211359145977,
 'optimizer_name': 'Adam'}
Test set: Average loss: 0.1128, Accuracy: 9649/10000 (96%)


**************************
Starting run 81314c513a634e768a41a356c92a7cd6
Artifact path for this run: /home/steffi/dev/blog/pytorch-mlflow-optuna/mlruns/0/81314c513a634e768a41a356c92a7cd6/artifacts
Suggested hyperparameters: 
{'batch_size_power': 8,
 'dropout': 0.7000000000000001,
 'lr': 0.008041911553354405,
 'optimizer_name': 'Adam'}


  "type " + obj.__name__ + ". It won't be checked "
