# PyTorch: Using MLflow and Optuna for experiment tracking and hyperparameter optimization

## Imports

In [17]:
import os

import optuna
import mlflow
import torch

import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from torchvision import datasets, transforms
from torch.optim.lr_scheduler import StepLR
from mlflow import pytorch
from pprint import pformat
from urllib.parse import urlparse

## Network 

In [18]:
class Net(nn.Module):
    def __init__(self, dropout=0.0):
        super(Net, self).__init__()
        self.conv1 = nn.Conv2d(1, 32, 3, 1)
        self.conv2 = nn.Conv2d(32, 64, 3, 1)
        self.dropout1 = nn.Dropout2d(dropout)
        self.dropout2 = nn.Dropout2d(dropout)
        self.fc1 = nn.Linear(9216, 128)
        self.fc2 = nn.Linear(128, 10)

    def forward(self, x):
        x = self.conv1(x)
        x = F.relu(x)
        x = self.conv2(x)
        x = F.relu(x)
        x = F.max_pool2d(x, 2)
        x = self.dropout1(x)
        x = torch.flatten(x, 1)
        x = self.fc1(x)
        x = F.relu(x)
        x = self.dropout2(x)
        x = self.fc2(x)
        output = F.log_softmax(x, dim=1)
        return output
    

## Train and validation functions

In [19]:
# Training loop
def train(options, model, device, train_loader, optimizer, epoch):
    model.train()
    train_set_size = len(train_loader.dataset)
    num_batches = len(train_loader)
    train_loss = 0.0
    for batch_idx, (data, target) in enumerate(train_loader):
        data, target = data.to(device), target.to(device)
        optimizer.zero_grad()
        output = model(data)
        loss = F.nll_loss(output, target)
        train_loss += loss.item()
        loss.backward()
        optimizer.step()
        if batch_idx % options["log_interval"] == 0:
            batch_size = len(data)
            print(f"Train Epoch: {epoch} [{batch_idx * batch_size}/{train_set_size} "
                  f"({100. * batch_idx / num_batches:.0f}%)]\tLoss: {loss.item():.6f}")
    avg_train_loss = train_loss / num_batches
    return avg_train_loss

# Validation loop
def validate(model, device, test_loader):
    model.eval()
    test_set_size = len(test_loader.dataset)
    test_loss = 0
    correct = 0
    with torch.no_grad():
        for data, target in test_loader:
            data, target = data.to(device), target.to(device)
            output = model(data)
            test_loss += F.nll_loss(output, target, reduction='sum').item()  # sum up batch loss
            pred = output.argmax(dim=1, keepdim=True)  # get the index of the max log-probability
            correct += pred.eq(target.view_as(pred)).sum().item()

    test_loss /= test_set_size

    print(f"Test set: Average loss: {test_loss:.4f}, Accuracy: {correct}/{test_set_size} "
          f"({100. * correct / test_set_size:.0f}%)\n")
    return test_loss

## Main

In [20]:
# Get the local path of the active mlflow run to save artifacts to
def get_artifact_path(active_run):
    parsed_uri = urlparse(active_run.info.artifact_uri)
    artifact_path = os.path.abspath(os.path.join(parsed_uri.netloc, parsed_uri.path))
    return artifact_path

In [21]:
# Obtain hyperparameters for this trial
def suggest_hyperparameters(trial):
    # Obtain the learning rate on a logarithmic scale
    lr = trial.suggest_float("lr", 1e-4, 1e-1, log=True)
    # Obtain the dropout ratio in a range from 0.0 to 0.9 with step size 0.1
    dropout = trial.suggest_float("dropout", 0.0, 0.9, step=0.1)
    # Obtain the batch size (as power of 2)
    batch_size = 2 ** trial.suggest_int("batch_size_power", 5, 8, step=1)
    # Obtain the optimizer to use by name
    optimizer_name = trial.suggest_categorical("optimizer_name", ["Adam", "Adadelta"])

    print(f"Suggested hyperparameters: \n{pformat(trial.params)}")
    # Log the obtained trial parameters using mlflow
    mlflow.log_params(trial.params)
    return lr, dropout, optimizer_name, batch_size

In [22]:
def get_mnist_dataloaders(batch_size=8):
    # Load the MNIST train and test datasets and save them to ./data
    mnist_train = datasets.MNIST('./data', train=True, download=True, transform=transforms.Compose([
                                               transforms.ToTensor(),
                                               transforms.Normalize((0.1307,), (0.3081,))
                                           ]))
    train_loader = torch.utils.data.DataLoader(mnist_train,
                                                       batch_size=batch_size,
                                                       shuffle=True)
    mnist_test = datasets.MNIST('./data', train=False, download=True, transform=transforms.Compose([
                                               transforms.ToTensor(),
                                               transforms.Normalize((0.1307,), (0.3081,))
                                           ]))
    val_loader = torch.utils.data.DataLoader(mnist_test,
                                                      batch_size=1000,
                                                      shuffle=True)
    return train_loader, val_loader

In [26]:
def objective(trial, experiment, options=None):
    # Initialize the best validation loss, which is the value to be minimized by the network
    best_val_loss = float('Inf')
    
    # Start mlflow run
    with mlflow.start_run(experiment_id=experiment.experiment_id):
        # Use mlflow to log experiment options
        mlflow.log_params(options)
        
        # Get hyperparameter suggestions created by optuna
        lr, dropout, optimizer_name, batch_size = suggest_hyperparameters(trial)
        
        print(f"\n**************************")

        active_run = mlflow.active_run()
        print(f"Starting run {active_run.info.run_id} and trial {trial.number}")

        # Parse the active mlflow run's artifact_uri and convert it into a system path
        parsed_uri = urlparse(active_run.info.artifact_uri)
        artifact_path = os.path.abspath(os.path.join(parsed_uri.netloc, parsed_uri.path))
        print(f"Artifact path for this run: {artifact_path}")
        
        # Use CUDA if GPU is available, else CPU
        use_cuda = options["use_cuda"] and torch.cuda.is_available()
        device = torch.device("cuda" if use_cuda else "cpu")
        # Log mlflow device parameter
        mlflow.log_param("device", device)

        # Obtain the MNIST train and validation loaders using a helper function
        train_loader, val_loader = get_mnist_dataloaders()
        
        # Initialize network
        model = Net(dropout=dropout).to(device)

        # Pick an optimizer based on optuna's parameter suggestion
        if optimizer_name == "Adam":
            optimizer = optim.Adam(model.parameters(), lr=lr)
        if optimizer_name == "Adadelta":
            optimizer = optim.Adadelta(model.parameters(), lr=lr)
        scheduler = StepLR(optimizer, step_size=1, gamma=0.7)

        # Network training & validation loop
        for epoch in range(0, options["epochs"]):
            avg_train_loss = train(options, model, device, train_loader, optimizer, epoch)
            avg_val_loss = validate(model, device, val_loader)
            
            if avg_val_loss <= best_val_loss:
                best_val_loss = avg_val_loss
            
            # Report intermediate objective value.
            trial.report(avg_val_loss, step=epoch)
            
            # Handle pruning based on the intermediate value.
            if trial.should_prune():
                raise optuna.TrialPruned()

            # Log average train and test set loss for the current epoch using mlflow
            mlflow.log_metric("avg_train_losses", avg_train_loss, step=epoch)
            mlflow.log_metric("avg_val_loss", avg_val_loss, step=epoch)
            scheduler.step()

        # Save the final network model to the current mlflow run's directory 
        if options["save_model"]:
            pytorch.save_model(model, f"{artifact_path}/mnist_model")

    # Return the best validation loss
    return best_val_loss

In [27]:
def main():
    # Experiment options
    options = {
        "experiment_name": "pytorch-optuna-mlflow",
        "epochs": 2,
        "use_cuda": False,
        "log_interval": 10,
        "save_model": True
    }

    # Create mlflow experiment if it doesn't exist already
    experiment_name = options["experiment_name"]
    experiment = mlflow.get_experiment_by_name(experiment_name)
    if experiment is None:
        mlflow.create_experiment(experiment_name)
        experiment = mlflow.get_experiment_by_name(experiment_name)
    mlflow.set_experiment(experiment_name)

    # Propagate logs to the root logger.
    optuna.logging.set_verbosity(verbosity=optuna.logging.INFO)

    # Create the optuna study which shares the experiment name
    study = optuna.create_study(study_name=experiment_name, direction="minimize")
    study.optimize(lambda trial: objective(trial, experiment, options), n_trials=5)

    # Filter optuna trials by state
    pruned_trials = [t for t in study.trials if t.state == optuna.trial.TrialState.PRUNED]
    complete_trials = [t for t in study.trials if t.state == optuna.trial.TrialState.COMPLETE]

    print("\n++++++++++++++++++++++++++++++++++\n")
    print("Study statistics: ")
    print("  Number of finished trials: ", len(study.trials))
    print("  Number of pruned trials: ", len(pruned_trials))
    print("  Number of complete trials: ", len(complete_trials))

    print("Best trial:")
    trial = study.best_trial

    print("  Trial number: ", trial.number)
    print("  Loss (trial value): ", trial.value)

    print("  Params: ")
    for key, value in trial.params.items():
        print("    {}: {}".format(key, value))


In [28]:
main()



[32m[I 2020-11-16 13:27:39,449][0m A new study created in memory with name: pytorch-optuna-mlflow[0m


Suggested hyperparameters: 
{'batch_size_power': 7,
 'dropout': 0.5,
 'lr': 0.00037477516268037106,
 'optimizer_name': 'Adam'}

**************************
Starting run 68725cd95b4a4c1ab1229e9517f49f8c and trial 0
Artifact path for this run: /Users/sstoppel/PycharmProjects/pytorch-mlflow-optuna/mlruns/1/68725cd95b4a4c1ab1229e9517f49f8c/artifacts








Test set: Average loss: 0.0541, Accuracy: 9819/10000 (98%)













[32m[I 2020-11-16 13:36:21,923][0m Trial 0 finished with value: 0.04096794166564941 and parameters: {'lr': 0.00037477516268037106, 'dropout': 0.5, 'batch_size_power': 7, 'optimizer_name': 'Adam'}. Best is trial 0 with value: 0.04096794166564941.[0m


Test set: Average loss: 0.0410, Accuracy: 9871/10000 (99%)

Suggested hyperparameters: 
{'batch_size_power': 6,
 'dropout': 0.30000000000000004,
 'lr': 0.00020249468909917502,
 'optimizer_name': 'Adadelta'}

**************************
Starting run 4896b51e90a6402990f54c4b88574af0 and trial 1
Artifact path for this run: /Users/sstoppel/PycharmProjects/pytorch-mlflow-optuna/mlruns/1/4896b51e90a6402990f54c4b88574af0/artifacts








Test set: Average loss: 1.8707, Accuracy: 6900/10000 (69%)













[32m[I 2020-11-16 13:45:05,371][0m Trial 1 finished with value: 1.344887548828125 and parameters: {'lr': 0.00020249468909917502, 'dropout': 0.30000000000000004, 'batch_size_power': 6, 'optimizer_name': 'Adadelta'}. Best is trial 0 with value: 0.04096794166564941.[0m


Test set: Average loss: 1.3449, Accuracy: 7619/10000 (76%)

Suggested hyperparameters: 
{'batch_size_power': 8,
 'dropout': 0.6000000000000001,
 'lr': 0.03012607761008102,
 'optimizer_name': 'Adadelta'}

**************************
Starting run 8777b101bbf5478287fd21e78f145e27 and trial 2
Artifact path for this run: /Users/sstoppel/PycharmProjects/pytorch-mlflow-optuna/mlruns/1/8777b101bbf5478287fd21e78f145e27/artifacts








Test set: Average loss: 0.1742, Accuracy: 9468/10000 (95%)













[32m[I 2020-11-16 13:53:43,982][0m Trial 2 finished with value: 0.12528440704345703 and parameters: {'lr': 0.03012607761008102, 'dropout': 0.6000000000000001, 'batch_size_power': 8, 'optimizer_name': 'Adadelta'}. Best is trial 0 with value: 0.04096794166564941.[0m


Test set: Average loss: 0.1253, Accuracy: 9605/10000 (96%)

Suggested hyperparameters: 
{'batch_size_power': 6,
 'dropout': 0.1,
 'lr': 0.00010368692432161317,
 'optimizer_name': 'Adadelta'}

**************************
Starting run 0951a4fa56014682881d485b0179c8e2 and trial 3
Artifact path for this run: /Users/sstoppel/PycharmProjects/pytorch-mlflow-optuna/mlruns/1/0951a4fa56014682881d485b0179c8e2/artifacts








Test set: Average loss: 2.0191, Accuracy: 6584/10000 (66%)













[32m[I 2020-11-16 14:01:36,764][0m Trial 3 finished with value: 1.70433662109375 and parameters: {'lr': 0.00010368692432161317, 'dropout': 0.1, 'batch_size_power': 6, 'optimizer_name': 'Adadelta'}. Best is trial 0 with value: 0.04096794166564941.[0m


Test set: Average loss: 1.7043, Accuracy: 7355/10000 (74%)

Suggested hyperparameters: 
{'batch_size_power': 7,
 'dropout': 0.4,
 'lr': 0.00013298862799605266,
 'optimizer_name': 'Adadelta'}

**************************
Starting run a0ed460ec8dd4b2e97ffd653fa1b978b and trial 4
Artifact path for this run: /Users/sstoppel/PycharmProjects/pytorch-mlflow-optuna/mlruns/1/a0ed460ec8dd4b2e97ffd653fa1b978b/artifacts








Test set: Average loss: 2.1142, Accuracy: 5058/10000 (51%)













[32m[I 2020-11-16 14:38:49,003][0m Trial 4 finished with value: 1.8854495361328125 and parameters: {'lr': 0.00013298862799605266, 'dropout': 0.4, 'batch_size_power': 7, 'optimizer_name': 'Adadelta'}. Best is trial 0 with value: 0.04096794166564941.[0m


Test set: Average loss: 1.8854, Accuracy: 6848/10000 (68%)


++++++++++++++++++++++++++++++++++

Study statistics: 
  Number of finished trials:  5
  Number of pruned trials:  0
  Number of complete trials:  5
Best trial:
  Trial number:  0
  Loss (trial value):  0.04096794166564941
  Params: 
    lr: 0.00037477516268037106
    dropout: 0.5
    batch_size_power: 7
    optimizer_name: Adam
