In [1]:
from functools import partial
import json
import numpy as np
import os
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import random_split
import torchvision
import torchvision.transforms as transforms
from ray import tune
from ray.tune import JupyterNotebookReporter
from ray.tune.schedulers import ASHAScheduler

from my_models import (AlexNet, VGG16, ResNet)

In [2]:
# Channel counts handed to every model constructor: 3 inputs, 10 outputs.
in_ch, out_ch = 3, 10

# Architectures available for hyperparameter tuning.
models_list = [
    AlexNet,
    VGG16,
    ResNet,
]

In [3]:
# Create a separate log/checkpoint folder for each model.
checkpoint_dir = "./data/checkpoints/"

for name in models_list:
    # NOTE(review): `name` is a class, so str(name) is its repr-style string
    # rather than a short model name -- consider name.__name__ everywhere
    # this path is built.
    model_log_dir = checkpoint_dir + str(name)
    os.makedirs(model_log_dir, exist_ok=True)


In [4]:
# Source: https://pytorch.org/tutorials/beginner/hyperparameter_tuning_tutorial.html

In [5]:
def load_data(data_dir="./data/CIFAR"):
    """Build the CIFAR-10 train and test datasets.

    Images are converted to tensors and normalised with mean 0.5 and
    std 0.5 on each of the three channels. Both datasets are requested
    with ``download=True``.

    Args:
        data_dir: Root directory passed to ``torchvision.datasets.CIFAR10``.

    Returns:
        Tuple ``(trainset, testset)``.
    """
    half = (0.5, 0.5, 0.5)
    pipeline = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize(half, half),
    ])

    # Everything except the `train` flag is shared by the two splits.
    shared_kwargs = dict(root=data_dir, download=True, transform=pipeline)
    trainset = torchvision.datasets.CIFAR10(train=True, **shared_kwargs)
    testset = torchvision.datasets.CIFAR10(train=False, **shared_kwargs)

    return trainset, testset

In [6]:
def _evaluate(net, loader, criterion, device):
    """Return ``(mean batch loss, accuracy)`` of ``net`` over ``loader``."""
    loss_sum = 0.0
    steps = 0
    total = 0
    correct = 0

    # Evaluation mode: disables dropout and freezes batch-norm statistics so
    # validation neither adds noise to the metrics nor changes the model.
    net.eval()
    with torch.no_grad():
        for inputs, labels in loader:
            inputs, labels = inputs.to(device), labels.to(device)

            outputs = net(inputs)
            _, predicted = torch.max(outputs, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

            loss_sum += criterion(outputs, labels).item()
            steps += 1

    return loss_sum / steps, correct / total


def train_cifar(config, model_name, epoch_num=2,
                checkpoint_dir=checkpoint_dir, data_dir=None):
    """Train one model on CIFAR-10 and report validation metrics to Ray Tune.

    After every epoch the model/optimizer state is saved through
    ``tune.checkpoint_dir`` and the validation loss and accuracy are sent
    with ``tune.report``.

    Args:
        config: Hyperparameter dict with the keys
            ``"lr"`` (learning rate),
            ``"batch_size"`` (used for both loaders),
            ``"optimizers_names"`` (optimizer class, called as
            ``cls(net.parameters(), lr=...)``) and
            ``"losses"`` (loss class, instantiated without arguments).
        model_name: Model class/factory; called as
            ``model_name(in_ch, out_ch)`` with the notebook-level channel
            settings.
        epoch_num: Number of passes over the training subset.
        checkpoint_dir: Directory that may contain a file named
            ``"checkpoint"`` to resume from. If that file does not exist,
            training starts from scratch. Presumably Ray Tune supplies this
            argument when it restores a trial -- see the PyTorch tuning
            tutorial.
        data_dir: Dataset root forwarded to ``load_data``; when ``None`` the
            default root of ``load_data`` is used.
    """
    net = model_name(in_ch, out_ch)

    device = "cpu"
    if torch.cuda.is_available():
        device = "cuda:0"
        if torch.cuda.device_count() > 1:
            net = nn.DataParallel(net)

    net.to(device)

    # Loss and optimizer are passed as classes so they can be searched over.
    criterion = config["losses"]()
    optimizer = config["optimizers_names"](net.parameters(), lr=config["lr"])

    # Resume from an earlier checkpoint of this trial when one is available.
    # The file is the (model_state, optimizer_state) tuple written at the end
    # of each epoch below; torch.load unpickles, so only load trusted files.
    if checkpoint_dir:
        restore_path = os.path.join(checkpoint_dir, "checkpoint")
        if os.path.isfile(restore_path):
            model_state, optimizer_state = torch.load(restore_path)
            net.load_state_dict(model_state)
            optimizer.load_state_dict(optimizer_state)

    # Passing root=None through to the dataset would fail, so fall back to
    # load_data's own default root in that case.
    if data_dir is None:
        trainset, testset = load_data()
    else:
        trainset, testset = load_data(data_dir)

    # 80 % of the official training set for training, the rest for validation.
    train_size = int(len(trainset) * 0.8)
    train_subset, val_subset = random_split(
        trainset, [train_size, len(trainset) - train_size])

    batch_size = int(config["batch_size"])
    trainloader = torch.utils.data.DataLoader(
        train_subset, batch_size=batch_size, shuffle=True)
    # Order is irrelevant for validation metrics, so no shuffling is needed.
    valloader = torch.utils.data.DataLoader(
        val_subset, batch_size=batch_size, shuffle=False)

    for epoch in range(epoch_num):
        # _evaluate leaves the model in eval mode; switch back every epoch.
        net.train()
        for inputs, labels in trainloader:
            inputs, labels = inputs.to(device), labels.to(device)

            optimizer.zero_grad()

            # forward + backward + optimize
            outputs = net(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

        val_loss, val_accuracy = _evaluate(net, valloader, criterion, device)

        # A separate name keeps the `checkpoint_dir` argument intact.
        with tune.checkpoint_dir(epoch) as epoch_ckpt_dir:
            path = os.path.join(epoch_ckpt_dir, "checkpoint")
            torch.save((net.state_dict(), optimizer.state_dict()), path)

        tune.report(loss=val_loss, accuracy=val_accuracy)
    print("Finished Training")

In [7]:
def main(model_name, num_samples=10, max_num_epochs=10, gpus_per_trial=2):
    """Run a Ray Tune grid search for one model and return the best trial.

    The search space is a grid over learning rate, batch size and optimizer
    class, with cross-entropy as the only loss. Trials are scheduled with
    ASHA on the reported validation loss.

    Args:
        model_name: Model class forwarded to ``train_cifar``.
        num_samples: Passed to ``tune.run``; with grid search every grid
            point is repeated this many times.
        max_num_epochs: Upper bound on epochs, used both as the ASHA
            ``max_t`` and as ``epoch_num`` of ``train_cifar``.
        gpus_per_trial: GPUs requested for each trial. Capped at the number
            of GPUs visible to this process, so a CPU-only machine still
            runs.

    Returns:
        The trial with the lowest last reported validation loss.
    """
    data_dir = os.path.abspath("./data")
    # Fetch the dataset once before any trial starts -- presumably so that
    # parallel trials do not each try to download it.
    load_data(data_dir)

    # Search space: classes (not instances) are passed for optimizer and loss
    # so that train_cifar can construct them per trial.
    config = {
        "lr": tune.grid_search([1e-2, 1e-1]),
        "batch_size": tune.grid_search([100, 1000]),
        "optimizers_names": tune.grid_search([optim.Adam, optim.SGD]),
        "losses": nn.CrossEntropyLoss,
    }
    scheduler = ASHAScheduler(
        metric="loss",
        mode="min",
        max_t=max_num_epochs,
        grace_period=1,
        reduction_factor=2)
    reporter = JupyterNotebookReporter(
        overwrite=True,
        print_intermediate_tables=False,
        # Only metrics the trainable reports (plus Tune's iteration counter);
        # "precision" was listed before but is never reported.
        metric_columns=["loss", "accuracy", "training_iteration"])

    # Actually request the GPUs the caller asked for; previously the
    # gpus_per_trial argument was accepted but never passed to Tune.
    resources = {
        "cpu": 1,
        "gpu": min(gpus_per_trial, torch.cuda.device_count()),
    }

    result = tune.run(
        # epoch_num follows max_num_epochs so the training loop can run as
        # long as the scheduler allows instead of a fixed default.
        partial(train_cifar, data_dir=data_dir, model_name=model_name,
                epoch_num=max_num_epochs, checkpoint_dir=checkpoint_dir),
        resources_per_trial=resources,
        config=config,
        num_samples=num_samples,
        scheduler=scheduler,
        progress_reporter=reporter,
        # NOTE(review): str(model_name) on a class is its repr-style string;
        # kept so the path matches the per-model folders created earlier in
        # the notebook -- consider model_name.__name__ in both places.
        local_dir=checkpoint_dir + str(model_name))

    best_trial = result.get_best_trial("loss", "min", "last")
    print("Best trial config: {}".format(best_trial.config))
    print("Best trial final validation loss: {}".format(
        best_trial.last_result["loss"]))
    print("Best trial final validation accuracy: {}".format(
        best_trial.last_result["accuracy"]))

    return best_trial
   

In [None]:
# Architecture to tune in this run.
model_name = VGG16

# Short search: one repetition of the grid, at most two epochs, one GPU per trial.
best_trial = main(
    model_name,
    num_samples=1,
    max_num_epochs=2,
    gpus_per_trial=1,
)

Result for DEFAULT_e7c4b_00006:
  accuracy: 0.0997
  date: 2020-12-29_13-23-22
  done: false
  experiment_id: c4e4fe68778149199cd1dce6c7a29c70
  hostname: 2c4060a45fe3
  iterations_since_restore: 1
  loss: 2.3025827407836914
  node_ip: 172.17.0.2
  pid: 35033
  should_checkpoint: true
  time_since_restore: 49.90786695480347
  time_this_iter_s: 49.90786695480347
  time_total_s: 49.90786695480347
  timestamp: 1609248202
  timesteps_since_restore: 0
  training_iteration: 1
  trial_id: e7c4b_00006
  


2020-12-29 13:23:22,610	INFO logger.py:721 -- Removed the following hyperparameter values when logging to tensorboard: {'optimizers_names': <class 'torch.optim.adam.Adam'>}


Result for DEFAULT_e7c4b_00003:
  accuracy: 0.1002
  date: 2020-12-29_13-23-22
  done: true
  experiment_id: 616074446e0c4be191b8956968f516a7
  hostname: 2c4060a45fe3
  iterations_since_restore: 1
  loss: 2.339796280860901
  node_ip: 172.17.0.2
  pid: 34978
  should_checkpoint: true
  time_since_restore: 50.39222431182861
  time_this_iter_s: 50.39222431182861
  time_total_s: 50.39222431182861
  timestamp: 1609248202
  timesteps_since_restore: 0
  training_iteration: 1
  trial_id: e7c4b_00003
  


2020-12-29 13:23:23,008	INFO logger.py:721 -- Removed the following hyperparameter values when logging to tensorboard: {'optimizers_names': <class 'torch.optim.adam.Adam'>}


Result for DEFAULT_e7c4b_00001:
  accuracy: 0.0981
  date: 2020-12-29_13-23-22
  done: true
  experiment_id: 574fa66eb21c470cb62ac3db8b14eba2
  hostname: 2c4060a45fe3
  iterations_since_restore: 1
  loss: 2.3025920391082764
  node_ip: 172.17.0.2
  pid: 34946
  should_checkpoint: true
  time_since_restore: 50.79398727416992
  time_this_iter_s: 50.79398727416992
  time_total_s: 50.79398727416992
  timestamp: 1609248202
  timesteps_since_restore: 0
  training_iteration: 1
  trial_id: e7c4b_00001
  


2020-12-29 13:23:23,297	INFO logger.py:721 -- Removed the following hyperparameter values when logging to tensorboard: {'optimizers_names': <class 'torch.optim.sgd.SGD'>}


Result for DEFAULT_e7c4b_00007:
  accuracy: 0.096
  date: 2020-12-29_13-23-23
  done: true
  experiment_id: 30a16e4af9374bb780e26fc3839497ef
  hostname: 2c4060a45fe3
  iterations_since_restore: 1
  loss: 2.3029632806777953
  node_ip: 172.17.0.2
  pid: 35051
  should_checkpoint: true
  time_since_restore: 50.925084590911865
  time_this_iter_s: 50.925084590911865
  time_total_s: 50.925084590911865
  timestamp: 1609248203
  timesteps_since_restore: 0
  training_iteration: 1
  trial_id: e7c4b_00007
  
Result for DEFAULT_e7c4b_00002:
  accuracy: 0.0969
  date: 2020-12-29_13-23-23
  done: false
  experiment_id: 3e6ed5b6c1b64b8dba876be704070dd0
  hostname: 2c4060a45fe3
  iterations_since_restore: 1
  loss: 2.3025827407836914
  node_ip: 172.17.0.2
  pid: 34942
  should_checkpoint: true
  time_since_restore: 51.108625173568726
  time_this_iter_s: 51.108625173568726
  time_total_s: 51.108625173568726
  timestamp: 1609248203
  timesteps_since_restore: 0
  training_iteration: 1
  trial_id: e7c4b_0

2020-12-29 13:23:23,530	INFO logger.py:721 -- Removed the following hyperparameter values when logging to tensorboard: {'optimizers_names': <class 'torch.optim.sgd.SGD'>}
2020-12-29 13:23:23,591	INFO logger.py:721 -- Removed the following hyperparameter values when logging to tensorboard: {'optimizers_names': <class 'torch.optim.sgd.SGD'>}


Result for DEFAULT_e7c4b_00004:
  accuracy: 0.0998
  date: 2020-12-29_13-23-23
  done: true
  experiment_id: bf7c363b65fe4c2bb3b4c32ef75d3578
  hostname: 2c4060a45fe3
  iterations_since_restore: 1
  loss: 2.303200833797455
  node_ip: 172.17.0.2
  pid: 34981
  should_checkpoint: true
  time_since_restore: 51.29124665260315
  time_this_iter_s: 51.29124665260315
  time_total_s: 51.29124665260315
  timestamp: 1609248203
  timesteps_since_restore: 0
  training_iteration: 1
  trial_id: e7c4b_00004
  
Result for DEFAULT_e7c4b_00005:
  accuracy: 0.1033
  date: 2020-12-29_13-23-23
  done: true
  experiment_id: edced6402c2846dd9738ae899ed4d99e
  hostname: 2c4060a45fe3
  iterations_since_restore: 1
  loss: 2.3026355028152468
  node_ip: 172.17.0.2
  pid: 35041
  should_checkpoint: true
  time_since_restore: 51.215715408325195
  time_this_iter_s: 51.215715408325195
  time_total_s: 51.215715408325195
  timestamp: 1609248203
  timesteps_since_restore: 0
  training_iteration: 1
  trial_id: e7c4b_00005

In [None]:
# Work on a copy so that adding the checkpoint path does not silently modify
# the trial's own config dict (which makes re-running this cell safe as well).
best_trial_conf = dict(best_trial.config)
best_trial_conf['path'] = best_trial.checkpoint.value

In [None]:
# Small file recording where the best model's checkpoint lives and which
# hyperparameters it was trained with.
# Values json cannot encode (e.g. the optimizer/loss classes) are written
# as their str() form.
with open("./data/best_trial_dir.txt", "w") as fh:
    json.dump(best_trial_conf, fh, default=str)