In [1]:
from functools import partial
import numpy as np
import os
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import random_split
import torchvision
import torchvision.transforms as transforms
from ray import tune
from ray.tune import CLIReporter
from ray.tune.schedulers import ASHAScheduler

In [2]:
# исходники: https://pytorch.org/tutorials/beginner/hyperparameter_tuning_tutorial.html

In [3]:
def load_data(data_dir='./data/FashionMNIST'):
    transform = transforms.Compose([
        transforms.ToTensor()
    ])
    
    trainset = torchvision.datasets.FashionMNIST(
        root=data_dir, train=True, download=True, transform=transform)

    testset = torchvision.datasets.FashionMNIST(
        root=data_dir, train=False, download=True, transform=transform)

    return trainset, testset

In [4]:
# Build the neural network, expand on top of nn.Module
class AlexNet(nn.Module):
    # сеточка с прошлого ноутбука, только убрала stride=2
  def __init__(self, config=None):
    super(AlexNet, self).__init__()

    # define layers
    self.conv1 = nn.Conv2d(in_channels=1, out_channels=6, kernel_size=5)
    self.conv2 = nn.Conv2d(in_channels=6, out_channels=12, kernel_size=5)

    self.fc1 = nn.Linear(in_features=12*4*4, out_features=120)
    self.fc2 = nn.Linear(in_features=120, out_features=60)
    self.out = nn.Linear(in_features=60, out_features=10)


  # define forward function
  def forward(self, t):
    # conv 1
    t = self.conv1(t)
    t = F.relu(t)
    t = F.max_pool2d(t, kernel_size=2)

    # conv 2
    t = self.conv2(t)
    t = F.relu(t)
    t = F.max_pool2d(t, kernel_size=2)

    # fc1
    t = t.reshape(-1, 12*4*4) # x.view - оставила коммент на погуглить
    t = self.fc1(t)
    t = F.relu(t)

    # fc2
    t = self.fc2(t)
    t = F.relu(t)

    # output
    t = self.out(t)
    # don't need softmax here since we'll use cross-entropy as activation.

    return t

In [5]:
def train_cifar(config, epoch_num=2, 
                checkpoint_dir=None, data_dir=None): 
    
    net = AlexNet()
    
    device = "cpu"
    if torch.cuda.is_available():
        device = "cuda:0"
        if torch.cuda.device_count() > 1:
            net = nn.DataParallel(net)

    net.to(device)
    # чтобы итерировать по оптимизаторам решила ввести параметр-строку, 
    # т.к. оптимизаторы при инициализации требуют ссылку на паремтры сетки:
    optimizers_dict={
        "SGD": optim.SGD(net.parameters(), lr=config["lr"]),
        "Adam": optim.Adam(net.parameters(), lr=config["lr"])
    }
    
    criterion = config["losses"] # для итерирования разных losses
    optimizer = optimizers_dict[config["optimizers_names"]]

    if checkpoint_dir:
        model_state, optimizer_state = torch.load(
            os.path.join(checkpoint_dir, "checkpoint"))
        net.load_state_dict(model_state)
        optimizer.load_state_dict(optimizer_state)

    trainset, testset = load_data(data_dir)

    test_abs = int(len(trainset) * 0.8)
    train_subset, val_subset = random_split(
        trainset, [test_abs, len(trainset) - test_abs])

    trainloader = torch.utils.data.DataLoader(
        train_subset,
        batch_size=int(config["batch_size"]),
        shuffle=True)
    valloader = torch.utils.data.DataLoader(
        val_subset,
        batch_size=int(config["batch_size"]),
        shuffle=True)

    for epoch in range(epoch_num):  # loop over the dataset multiple times

        for i, data in enumerate(trainloader, 0):
            # get the inputs; data is a list of [inputs, labels]
            inputs, labels = data
            inputs, labels = inputs.to(device), labels.to(device)

            # zero the parameter gradients
            optimizer.zero_grad()

            # forward + backward + optimize
            outputs = net(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

        # Validation loss
        val_loss = 0.0
        val_steps = 0
        total = 0
        correct = 0
        for i, data in enumerate(valloader, 0):
            with torch.no_grad():
                inputs, labels = data
                inputs, labels = inputs.to(device), labels.to(device)

                outputs = net(inputs)
                _, predicted = torch.max(outputs.data, 1)
                total += labels.size(0)
                correct += (predicted == labels).sum().item()

                loss = criterion(outputs, labels)
                val_loss += loss.cpu().numpy()
                val_steps += 1

        with tune.checkpoint_dir(epoch) as checkpoint_dir:
            path = os.path.join(checkpoint_dir, "checkpoint")
            torch.save((net.state_dict(), optimizer.state_dict()), path)

        tune.report(loss=(val_loss / val_steps), accuracy=correct / total)
    print("Finished Training")

In [6]:
def test_accuracy(net, device = "cuda:0"):
    trainset, testset = load_data()

    testloader = torch.utils.data.DataLoader(
        testset, batch_size=4, shuffle=False)


    correct = 0
    total = 0
    with torch.no_grad():
        for data in testloader:
            images, labels = data
            images, labels = images.to(device), labels.to(device)

            outputs = net(images)
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    return correct / total

In [7]:
def main(num_samples=10, max_num_epochs=10, gpus_per_trial=2):
    data_dir = os.path.abspath("./data")
    load_data(data_dir)
    
    # заменила конфиг, чтобы был grid_search, надо пофиксить lr, была ли ошибка из-за lr или из-за памяти - не уверена, поэтому пока оставила так
    config = {        
        "lr":1e-2,   
        "batch_size": tune.grid_search([100, 1000]),
        "optimizers_names":  tune.grid_search(["Adam", "SGD"]), # ключи словаря
        "losses": nn.CrossEntropyLoss() # на попробовать tune.grid_search(["""nn.MultiLabelMarginLoss(), """ nn.CrossEntropyLoss()]) 
    }
    scheduler = ASHAScheduler(
        metric="loss",
        mode="min",
        max_t=max_num_epochs,
        grace_period=1,
        reduction_factor=2)
    reporter = CLIReporter(
        # parameter_columns=["l1", "l2", "lr", "batch_size"],
        metric_columns=["loss", "accuracy", "precision"]) # немного побаловалась, не нашла что еще добавить на посмотреть
    result = tune.run(
        partial(train_cifar, data_dir=data_dir),
        config=config,
        num_samples=num_samples,
        scheduler=scheduler,
        progress_reporter=reporter)
    
    best_trial = result.get_best_trial("loss", "min", "last")
    print("Best trial config: {}".format(best_trial.config))
    print("Best trial final validation loss: {}".format(
        best_trial.last_result["loss"]))
    print("Best trial final validation accuracy: {}".format(
        best_trial.last_result["accuracy"]))
    
    
    best_trained_model = AlexNet(config=best_trial.config)
    device = "cpu"
    if torch.cuda.is_available():
        device = "cuda:0"
        if gpus_per_trial > 1:
            best_trained_model = nn.DataParallel(best_trained_model)
    best_trained_model.to(device=device)

    best_checkpoint_dir = best_trial.checkpoint.value
    model_state, optimizer_state = torch.load(os.path.join(
        best_checkpoint_dir, "checkpoint"))
    best_trained_model.load_state_dict(model_state)
    
    return best_trained_model
   



In [8]:
best_trained_model = main(num_samples=1, max_num_epochs=2, gpus_per_trial=1)

2020-12-28 14:08:54,356	INFO services.py:1173 -- View the Ray dashboard at [1m[32mhttp://127.0.0.1:8265[39m[22m
2020-12-28 14:08:56,417	INFO registry.py:65 -- Detected unknown callable for trainable. Converting to class.
2020-12-28 14:08:56,503	ERROR syncer.py:72 -- Log sync requires rsync to be installed.


== Status ==
Memory usage on this node: 8.3/503.6 GiB
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 2.000: None | Iter 1.000: None
Resources requested: 1/80 CPUs, 0/8 GPUs, 0.0/341.75 GiB heap, 0.0/103.81 GiB objects (0/1.0 accelerator_type:RTX)
Result logdir: /root/ray_results/DEFAULT_2020-12-28_14-08-56
Number of trials: 1/4 (1 RUNNING)
+---------------------+----------+-------+--------------+--------------------+
| Trial name          | status   | loc   |   batch_size | optimizers_names   |
|---------------------+----------+-------+--------------+--------------------|
| DEFAULT_394c3_00000 | RUNNING  |       |          100 | Adam               |
+---------------------+----------+-------+--------------+--------------------+


Result for DEFAULT_394c3_00001:
  accuracy: 0.7121666666666666
  date: 2020-12-28_14-09-10
  done: false
  experiment_id: 6dd0e5d5ab624380be63477185995cb9
  hostname: 2c4060a45fe3
  iterations_since_restore: 1
  loss: 0.7337659895420074
  node_ip: 172.17.0.2

2020-12-28 14:09:31,127	INFO tune.py:448 -- Total run time: 37.31 seconds (34.63 seconds for the tuning loop).


Result for DEFAULT_394c3_00000:
  accuracy: 0.8608333333333333
  date: 2020-12-28_14-09-31
  done: true
  experiment_id: ce8b4f2b85244ee2b91ff82d333d2ac7
  hostname: 2c4060a45fe3
  iterations_since_restore: 2
  loss: 0.37299724568923315
  node_ip: 172.17.0.2
  pid: 7428
  should_checkpoint: true
  time_since_restore: 33.90536332130432
  time_this_iter_s: 16.607629537582397
  time_total_s: 33.90536332130432
  timestamp: 1609164571
  timesteps_since_restore: 0
  training_iteration: 2
  trial_id: 394c3_00000
  
== Status ==
Memory usage on this node: 8.4/503.6 GiB
Using AsyncHyperBand: num_stopped=4
Bracket: Iter 2.000: None | Iter 1.000: -1.5130266120036444
Resources requested: 1/80 CPUs, 0/8 GPUs, 0.0/341.75 GiB heap, 0.0/103.81 GiB objects (0/1.0 accelerator_type:RTX)
Result logdir: /root/ray_results/DEFAULT_2020-12-28_14-08-56
Number of trials: 4/4 (1 RUNNING, 3 TERMINATED)
+---------------------+------------+-----------------+--------------+--------------------+----------+-----------

In [10]:

gpus_per_trial=1
device = "cpu"
if torch.cuda.is_available():
    device = "cuda:0"
    if gpus_per_trial > 1:
        best_trained_model = nn.DataParallel(best_trained_model)

test_acc = test_accuracy(best_trained_model, device=device)
   

  0%|          | 0/26421880 [00:00<?, ?it/s]

Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-images-idx3-ubyte.gz to ./data/FashionMNIST/FashionMNIST/raw/train-images-idx3-ubyte.gz


26427392it [00:00, 33013786.95it/s]                             


Extracting ./data/FashionMNIST/FashionMNIST/raw/train-images-idx3-ubyte.gz to ./data/FashionMNIST/FashionMNIST/raw


32768it [00:00, 247606.52it/s]                           
0it [00:00, ?it/s]

Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-labels-idx1-ubyte.gz to ./data/FashionMNIST/FashionMNIST/raw/train-labels-idx1-ubyte.gz
Extracting ./data/FashionMNIST/FashionMNIST/raw/train-labels-idx1-ubyte.gz to ./data/FashionMNIST/FashionMNIST/raw
Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-images-idx3-ubyte.gz to ./data/FashionMNIST/FashionMNIST/raw/t10k-images-idx3-ubyte.gz


4423680it [00:00, 13408048.56it/s]                           
8192it [00:00, 98936.45it/s]


Extracting ./data/FashionMNIST/FashionMNIST/raw/t10k-images-idx3-ubyte.gz to ./data/FashionMNIST/FashionMNIST/raw
Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-labels-idx1-ubyte.gz to ./data/FashionMNIST/FashionMNIST/raw/t10k-labels-idx1-ubyte.gz
Extracting ./data/FashionMNIST/FashionMNIST/raw/t10k-labels-idx1-ubyte.gz to ./data/FashionMNIST/FashionMNIST/raw
Processing...
Done!


In [11]:
 print("Best trial test set accuracy: {}".format(test_acc))

Best trial test set accuracy: 0.8534
