In [None]:
!pip install ray
from functools import partial
import numpy as np
import os
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import random_split
import torchvision
import torchvision.transforms as transforms
from ray import tune
from ray.tune import CLIReporter
from ray.tune.schedulers import ASHAScheduler

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting ray
  Downloading ray-2.2.0-cp38-cp38-manylinux2014_x86_64.whl (57.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m57.4/57.4 MB[0m [31m9.8 MB/s[0m eta [36m0:00:00[0m
Collecting virtualenv>=20.0.24
  Downloading virtualenv-20.17.1-py3-none-any.whl (8.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.8/8.8 MB[0m [31m55.9 MB/s[0m eta [36m0:00:00[0m
Collecting distlib<1,>=0.3.6
  Downloading distlib-0.3.6-py2.py3-none-any.whl (468 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m468.5/468.5 KB[0m [31m15.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: distlib, virtualenv, ray
Successfully installed distlib-0.3.6 ray-2.2.0 virtualenv-20.17.1


In [None]:
# from model_factory import create_model

# MNIST:
from torchvision.datasets import MNIST
from torchvision.transforms import ToTensor, Compose

# PyTorch:
import torch
from torch.utils.data import DataLoader
from torch.optim import Adam
from torch.nn import CrossEntropyLoss

# Other:
from typing import Tuple
from tqdm import tqdm

In [None]:
def _flatten_image(x):
    """Flatten an image tensor into a 1-D vector via ``x.view(-1)``."""
    return x.view(-1)


# Per-sample preprocessing shared by the train and test datasets.
# The flatten step is a named module-level function rather than a lambda:
# lambdas cannot be serialized by the standard `pickle` module, which breaks
# DataLoader worker processes (num_workers > 0) on platforms that start
# workers with "spawn" instead of "fork".
_transform_list = [
    ToTensor(),
    _flatten_image,
]


def load_data(root='./data') -> Tuple["MNIST", "MNIST"]:
    """
    Get the MNIST train and test datasets from torchvision.

    Note that this returns *datasets*, not data loaders: callers wrap the
    results in a ``DataLoader`` themselves (batch size, shuffling and worker
    count are the caller's choice).

    Arguments:
        root: Directory passed to ``MNIST`` as its storage location
            (``download=True`` is always set). ``None`` falls back to
            ``'./data'`` so callers with an optional data directory can
            forward it unchanged.

    Returns:
        train_data (MNIST): The training dataset.
        test_data (MNIST): The test dataset.

    """
    if root is None:
        root = './data'

    # Both splits use the same preprocessing pipeline.
    transform = Compose(_transform_list)

    train_data = MNIST(root=root, train=True, download=True, transform=transform)
    test_data = MNIST(root=root, train=False, download=True, transform=transform)
    return train_data, test_data

In [None]:
import torch
from typing import Callable
import torch.nn as nn


class MLP(nn.Module):
    """
    Multi-layer perceptron.

    The network is ``hidden_count`` repetitions of
    ``Linear -> activation -> BatchNorm1d -> Dropout`` followed by a final
    ``Linear`` layer that produces ``num_classes`` raw scores (logits).
    """

    def __init__(
        self,
        input_size: int,
        hidden_size: int,
        num_classes: int,
        hidden_count: int = 1,
        activation: Callable = torch.nn.ReLU,
        initializer: Callable = torch.nn.init.ones_,
        dropout: float = 0.15,
    ) -> None:
        """
        Initialize the MLP.

        Arguments:
            input_size: The dimension D of the input data.
            hidden_size: The number of neurons H in each hidden layer.
            num_classes: The number of classes C.
            hidden_count: The number of hidden layers.
            activation: Class (or zero-argument factory) of the activation
                module. It is instantiated once and the same instance is
                applied after every hidden Linear layer.
            initializer: In-place initializer (``torch.nn.init.*_`` style)
                applied to the weight matrix of every Linear layer.
            dropout: Dropout probability used after every hidden layer.
        """
        super().__init__()

        self.input_size = input_size
        self.num_classes = num_classes
        self.hidden_size = hidden_size
        self.hidden_count = hidden_count
        self.dropout_p = dropout
        self.activation = activation()
        self.initializer = initializer
        self.layers = nn.ModuleList()

        in_features = input_size
        for _ in range(hidden_count):
            layer = nn.Linear(in_features, hidden_size)
            self._init_linear(layer)
            self.layers.extend(
                [layer, nn.BatchNorm1d(hidden_size), nn.Dropout(dropout)]
            )
            # Every hidden layer after the first consumes the previous one's output.
            in_features = hidden_size

        # Create final layer
        self.out = nn.Linear(in_features, num_classes)
        self._init_linear(self.out)

    def _init_linear(self, layer: nn.Linear) -> None:
        """Apply the configured weight initializer and zero the bias vector."""
        # The initializer works in place, so the existing Parameter is kept.
        self.initializer(layer.weight)
        # Zero the bias in place. Replacing it with a 0-dim Parameter would
        # collapse the per-neuron bias vector into one scalar shared by all
        # neurons of the layer.
        nn.init.zeros_(layer.bias)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        Forward pass of the network.

        Arguments:
            x: The input data. All leading dimensions are merged into the
                batch dimension; the last dimension is the feature dimension.

        Returns:
            The output of the network (one row of ``num_classes`` scores per
            input row).
        """
        # Collapse every dimension except the last into one batch dimension.
        x = x.flatten(end_dim=-2)

        # The activation is applied right after each Linear layer; BatchNorm
        # and Dropout then operate on the activated values.
        for layer in self.layers:
            if isinstance(layer, nn.Linear):
                x = self.activation(layer(x))
            else:
                x = layer(x)

        # Get outputs
        x = self.out(x)

        return x


In [None]:
def train_cifar(config, checkpoint_dir=None, data_dir=None):
    """
    Train one MLP configuration and report validation metrics to Ray Tune.

    Despite the name, the data comes from ``load_data`` and the model is
    built with 784 inputs and 10 classes.

    Arguments:
        config: Dict with the keys ``'hidden_size'``, ``'hidden_count'``,
            ``'actv'`` (activation class) and ``'init'`` (weight initializer),
            forwarded to ``MLP``.
        checkpoint_dir: Optional directory containing a ``"checkpoint"`` file
            with a ``(model_state, optimizer_state)`` tuple to resume from.
        data_dir: Optional dataset directory forwarded to ``load_data``;
            when ``None`` the ``load_data`` default location is used.

    Side effects:
        After every epoch, saves a checkpoint through ``tune.checkpoint_dir``
        and reports ``loss`` and ``accuracy`` through ``tune.report``.
    """
    net = MLP(784, config['hidden_size'], 10, config['hidden_count'], config['actv'], config['init'])

    device = "cpu"
    if torch.cuda.is_available():
        device = "cuda:0"
        if torch.cuda.device_count() > 1:
            net = nn.DataParallel(net)
    net.to(device)

    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(net.parameters(), lr=0.001)

    if checkpoint_dir:
        model_state, optimizer_state = torch.load(
            os.path.join(checkpoint_dir, "checkpoint"))
        net.load_state_dict(model_state)
        optimizer.load_state_dict(optimizer_state)

    # Only forward data_dir when one was given, so the load_data default applies
    # otherwise. The test split is not used during tuning.
    trainset, _ = load_data(data_dir) if data_dir is not None else load_data()

    # 80/20 train/validation split. The generator is seeded so every trial
    # (and a trial resumed from a checkpoint) sees the same split; otherwise
    # validation scores of different trials are not directly comparable.
    n_train = int(len(trainset) * 0.8)
    train_subset, val_subset = random_split(
        trainset,
        [n_train, len(trainset) - n_train],
        generator=torch.Generator().manual_seed(42))

    trainloader = torch.utils.data.DataLoader(
        train_subset,
        batch_size=64,
        shuffle=True,
        num_workers=8)
    # Evaluation does not benefit from shuffling.
    valloader = torch.utils.data.DataLoader(
        val_subset,
        batch_size=64,
        shuffle=False,
        num_workers=8)

    for epoch in range(10):  # loop over the dataset multiple times
        # Training mode: Dropout active, BatchNorm uses batch statistics.
        net.train()
        running_loss = 0.0
        window_steps = 0
        for i, (inputs, labels) in enumerate(trainloader):
            inputs, labels = inputs.to(device), labels.to(device)

            # zero the parameter gradients
            optimizer.zero_grad()

            # forward + backward + optimize
            outputs = net(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            # print statistics
            running_loss += loss.item()
            window_steps += 1
            if i % 2000 == 1999:  # print every 2000 mini-batches
                print("[%d, %5d] loss: %.3f" % (epoch + 1, i + 1,
                                                running_loss / window_steps))
                # Reset the sum and its step count together so the next
                # printed value is the mean over its own window.
                running_loss = 0.0
                window_steps = 0

        # Validation loss.  Evaluation mode disables Dropout and makes
        # BatchNorm use (and stop updating) its running statistics.
        net.eval()
        val_loss = 0.0
        val_steps = 0
        total = 0
        correct = 0
        with torch.no_grad():
            for inputs, labels in valloader:
                inputs, labels = inputs.to(device), labels.to(device)

                outputs = net(inputs)
                predicted = outputs.argmax(dim=1)
                total += labels.size(0)
                correct += (predicted == labels).sum().item()

                val_loss += criterion(outputs, labels).item()
                val_steps += 1

        with tune.checkpoint_dir(epoch) as checkpoint_dir:
            path = os.path.join(checkpoint_dir, "checkpoint")
            torch.save((net.state_dict(), optimizer.state_dict()), path)

        tune.report(loss=(val_loss / val_steps), accuracy=correct / total)
    print("Finished Training")

In [None]:
def test_accuracy(net, device="cpu"):
    """
    Compute the classification accuracy of ``net`` on the test split.

    Arguments:
        net: The model to evaluate; called on batches of test inputs and
            expected to return one row of class scores per sample.
        device: Device the input batches are moved to. The model is expected
            to already live on this device.

    Returns:
        Fraction of test samples whose highest-scoring class equals the label.
    """
    _, testset = load_data()

    testloader = torch.utils.data.DataLoader(
        testset, batch_size=4, shuffle=False, num_workers=2)

    # Evaluate in eval mode (Dropout off, BatchNorm using running statistics)
    # and restore the caller's mode afterwards.
    was_training = net.training
    net.eval()

    correct = 0
    total = 0
    try:
        with torch.no_grad():
            for images, labels in testloader:
                images, labels = images.to(device), labels.to(device)
                predicted = net(images).argmax(dim=1)
                total += labels.size(0)
                correct += (predicted == labels).sum().item()
    finally:
        net.train(was_training)

    return correct / total

In [None]:
def main(num_samples=10, max_num_epochs=10, gpus_per_trial=2):
    """
    Run a Ray Tune hyper-parameter search over MLP configurations.

    Arguments:
        num_samples: Number of configurations sampled from the search space.
        max_num_epochs: Passed to the ASHA scheduler as ``max_t``. Note that
            ``train_cifar`` itself always loops over 10 epochs.
        gpus_per_trial: Number of GPUs requested for each trial.

    Prints the configuration, final validation loss and final validation
    accuracy of the trial with the lowest last-reported loss.
    """
    data_dir = os.path.abspath("./data")
    # The result is discarded: presumably this call only makes sure the
    # dataset is downloaded before any trial starts.
    load_data(data_dir)

    config = {
        "hidden_size": tune.choice([16, 32, 64]),
        "hidden_count": tune.choice([3, 4, 5, 6]),
        "actv": tune.choice([torch.nn.ReLU, torch.nn.Tanh, torch.nn.Sigmoid, torch.nn.PReLU, torch.nn.ELU, torch.nn.Hardswish]),
        "init": tune.choice([torch.nn.init.xavier_normal_, torch.nn.init.kaiming_normal_, torch.nn.init.kaiming_uniform_,
                             torch.nn.init.trunc_normal_, torch.nn.init.orthogonal_])
    }

    # Accuracy must be *maximized*. With mode="min" the scheduler treats the
    # least accurate trials as the best ones and stops the good ones early.
    scheduler = ASHAScheduler(
        metric="accuracy",
        mode="max",
        max_t=max_num_epochs,
        grace_period=1,
        reduction_factor=2)
    reporter = CLIReporter(
        metric_columns=["loss", "accuracy", "training_iteration"])
    result = tune.run(
        partial(train_cifar, data_dir=data_dir),
        resources_per_trial={"cpu": 2, "gpu": gpus_per_trial},
        config=config,
        num_samples=num_samples,
        scheduler=scheduler,
        progress_reporter=reporter)

    best_trial = result.get_best_trial("loss", "min", "last")
    print("Best trial config: {}".format(best_trial.config))
    print("Best trial final validation loss: {}".format(
        best_trial.last_result["loss"]))
    print("Best trial final validation accuracy: {}".format(
        best_trial.last_result["accuracy"]))

    # Test-set evaluation of the best trial is currently disabled.
    # NOTE: when re-enabling, build the model from `best_trial.config` (the
    # sampled values), not from `config` (the search-space definition).
    #
    # best_config = best_trial.config
    # best_trained_model = MLP(784, best_config['hidden_size'], 10, best_config['hidden_count'],
    #                          best_config['actv'], best_config['init'])
    # device = "cpu"
    # if torch.cuda.is_available():
    #     device = "cuda:0"
    #     if gpus_per_trial > 1:
    #         best_trained_model = nn.DataParallel(best_trained_model)
    # best_trained_model.to(device)

    # best_checkpoint_dir = best_trial.checkpoint.value
    # model_state, optimizer_state = torch.load(os.path.join(
    #     best_checkpoint_dir, "checkpoint"))
    # best_trained_model.load_state_dict(model_state)

    # test_acc = test_accuracy(best_trained_model, device)
    # print("Best trial test set accuracy: {}".format(test_acc))


if __name__ == "__main__":
    # You can change the number of GPUs per trial here
    # (0 requests no GPU for the trials):
    main(num_samples=10, max_num_epochs=10, gpus_per_trial=0)



== Status ==
Current time: 2023-02-05 18:24:39 (running for 00:00:00.19)
Memory usage on this node: 2.2/12.7 GiB 
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 8.000: None | Iter 4.000: None | Iter 2.000: None | Iter 1.000: None
Resources requested: 2.0/2 CPUs, 0/1 GPUs, 0.0/7.42 GiB heap, 0.0/3.71 GiB objects
Result logdir: /root/ray_results/train_cifar_2023-02-05_18-24-39
Number of trials: 10/10 (9 PENDING, 1 RUNNING)
+-------------------------+----------+-------------------+----------------------+----------------+---------------+----------------------+
| Trial name              | status   | loc               | actv                 |   hidden_count |   hidden_size | init                 |
|-------------------------+----------+-------------------+----------------------+----------------+---------------+----------------------|
| train_cifar_59dd7_00000 | RUNNING  | 172.28.0.12:18231 | <class 'torch.n_8810 |              3 |            16 | <function kaimi_db80 |
| train_cifar_59dd7_



== Status ==
Current time: 2023-02-05 18:24:46 (running for 00:00:07.76)
Memory usage on this node: 2.5/12.7 GiB 
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 8.000: None | Iter 4.000: None | Iter 2.000: None | Iter 1.000: None
Resources requested: 2.0/2 CPUs, 0/1 GPUs, 0.0/7.42 GiB heap, 0.0/3.71 GiB objects
Result logdir: /root/ray_results/train_cifar_2023-02-05_18-24-39
Number of trials: 10/10 (9 PENDING, 1 RUNNING)
+-------------------------+----------+-------------------+----------------------+----------------+---------------+----------------------+
| Trial name              | status   | loc               | actv                 |   hidden_count |   hidden_size | init                 |
|-------------------------+----------+-------------------+----------------------+----------------+---------------+----------------------|
| train_cifar_59dd7_00000 | RUNNING  | 172.28.0.12:18231 | <class 'torch.n_8810 |              3 |            16 | <function kaimi_db80 |
| train_cifar_59dd7_

Trial name,accuracy,date,done,episodes_total,experiment_id,hostname,iterations_since_restore,loss,node_ip,pid,should_checkpoint,time_since_restore,time_this_iter_s,time_total_s,timestamp,timesteps_since_restore,timesteps_total,training_iteration,trial_id,warmup_time
train_cifar_59dd7_00000,0.89425,2023-02-05_18-26-02,True,,eac50201129445bcbf41ceb4ddf7d059,42da89f9997b,10,0.362707,172.28.0.12,18231,True,80.8953,7.7648,80.8953,1675621562,0,,10,59dd7_00000,0.00348854
train_cifar_59dd7_00001,0.850917,2023-02-05_18-26-19,True,,eac50201129445bcbf41ceb4ddf7d059,42da89f9997b,2,0.529116,172.28.0.12,18231,True,16.8301,8.41784,16.8301,1675621579,0,,2,59dd7_00001,0.00348854
train_cifar_59dd7_00002,0.85825,2023-02-05_18-27-43,True,,eac50201129445bcbf41ceb4ddf7d059,42da89f9997b,10,0.512762,172.28.0.12,18231,True,83.9333,8.44499,83.9333,1675621663,0,,10,59dd7_00002,0.00348854
train_cifar_59dd7_00003,0.879667,2023-02-05_18-28-50,True,,eac50201129445bcbf41ceb4ddf7d059,42da89f9997b,8,0.401547,172.28.0.12,18231,True,67.2373,8.44603,67.2373,1675621730,0,,8,59dd7_00003,0.00348854
train_cifar_59dd7_00004,0.553333,2023-02-05_18-30-18,True,,eac50201129445bcbf41ceb4ddf7d059,42da89f9997b,10,1.31623,172.28.0.12,18231,True,88.0426,9.06865,88.0426,1675621818,0,,10,59dd7_00004,0.00348854
train_cifar_59dd7_00005,0.881,2023-02-05_18-30-27,True,,eac50201129445bcbf41ceb4ddf7d059,42da89f9997b,1,0.417048,172.28.0.12,18231,True,8.73121,8.73121,8.73121,1675621827,0,,1,59dd7_00005,0.00348854
train_cifar_59dd7_00006,0.901417,2023-02-05_18-30-36,True,,eac50201129445bcbf41ceb4ddf7d059,42da89f9997b,1,0.320679,172.28.0.12,18231,True,9.0118,9.0118,9.0118,1675621836,0,,1,59dd7_00006,0.00348854
train_cifar_59dd7_00007,0.63775,2023-02-05_18-32-06,True,,eac50201129445bcbf41ceb4ddf7d059,42da89f9997b,10,1.02535,172.28.0.12,18231,True,90.0812,9.07953,90.0812,1675621926,0,,10,59dd7_00007,0.00348854
train_cifar_59dd7_00008,0.869417,2023-02-05_18-33-14,True,,eac50201129445bcbf41ceb4ddf7d059,42da89f9997b,8,0.438496,172.28.0.12,18231,True,68.2049,8.54494,68.2049,1675621994,0,,8,59dd7_00008,0.00348854
train_cifar_59dd7_00009,0.83525,2023-02-05_18-33-24,True,,eac50201129445bcbf41ceb4ddf7d059,42da89f9997b,1,0.563269,172.28.0.12,18231,True,9.35253,9.35253,9.35253,1675622004,0,,1,59dd7_00009,0.00348854


== Status ==
Current time: 2023-02-05 18:24:55 (running for 00:00:16.60)
Memory usage on this node: 2.5/12.7 GiB 
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 8.000: None | Iter 4.000: None | Iter 2.000: None | Iter 1.000: -0.8178333333333333
Resources requested: 2.0/2 CPUs, 0/1 GPUs, 0.0/7.42 GiB heap, 0.0/3.71 GiB objects
Result logdir: /root/ray_results/train_cifar_2023-02-05_18-24-39
Number of trials: 10/10 (9 PENDING, 1 RUNNING)
+-------------------------+----------+-------------------+----------------------+----------------+---------------+----------------------+----------+------------+----------------------+
| Trial name              | status   | loc               | actv                 |   hidden_count |   hidden_size | init                 |     loss |   accuracy |   training_iteration |
|-------------------------+----------+-------------------+----------------------+----------------+---------------+----------------------+----------+------------+----------------------|
| 

2023-02-05 18:33:24,471	INFO tune.py:762 -- Total run time: 525.45 seconds (525.31 seconds for the tuning loop).


== Status ==
Current time: 2023-02-05 18:33:24 (running for 00:08:45.32)
Memory usage on this node: 2.5/12.7 GiB 
Using AsyncHyperBand: num_stopped=10
Bracket: Iter 8.000: -0.8614583333333333 | Iter 4.000: -0.786625 | Iter 2.000: -0.6600833333333334 | Iter 1.000: -0.7337916666666666
Resources requested: 0/2 CPUs, 0/1 GPUs, 0.0/7.42 GiB heap, 0.0/3.71 GiB objects
Result logdir: /root/ray_results/train_cifar_2023-02-05_18-24-39
Number of trials: 10/10 (10 TERMINATED)
+-------------------------+------------+-------------------+----------------------+----------------+---------------+----------------------+----------+------------+----------------------+
| Trial name              | status     | loc               | actv                 |   hidden_count |   hidden_size | init                 |     loss |   accuracy |   training_iteration |
|-------------------------+------------+-------------------+----------------------+----------------+---------------+----------------------+----------+------