# MIT dataset

In [1]:
# Some basic setup:
# Setup detectron2 logger
import detectron2
from detectron2.utils.logger import setup_logger
setup_logger()

# import some common libraries
import numpy as np
import os, json, cv2, random, io
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.utils.data import random_split

# import some common detectron2 utilities
from detectron2 import model_zoo
from detectron2.engine import DefaultPredictor
from detectron2.config import get_cfg
from detectron2.utils.visualizer import Visualizer
from detectron2.data import MetadataCatalog, DatasetCatalog

from PIL import Image
import tqdm
import matplotlib.pyplot as plt

from functools import partial
import numpy as np
import torchvision
import torchvision.transforms as transforms
import ray
from ray import tune
from ray.tune import CLIReporter
from ray.tune.schedulers import ASHAScheduler

# Run on the first CUDA device when one is visible, otherwise on the CPU.
dev = "cuda:0" if torch.cuda.is_available() else "cpu"

# Absolute path on the training host (not referenced by the cells shown here).
ROOT = '/home/ubuntu/workspace/mitstates/'

In [2]:
from symnet.utils import config as symnet_cfg
from symnet.utils import dataset, utils
from symnet.utils.evaluator import CZSL_Evaluator

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [3]:
# Batch layout as noted by the original author (with_image=True):
#   train: [img, attr_id, obj_id, pair_id, img_feature,
#           img, attr_id, obj_id, pair_id, img_feature, aff_mask]
#   test:  [img, attr_id, obj_id, pair_id, img_feature, aff_mask]
# Only the first three entries (img, attr_id, obj_id) are consumed below.
train_dataloader = dataset.get_dataloader(
    'MIT', 'train', batchsize=64, with_image=True, shuffle=True)
test_dataloader = dataset.get_dataloader(
    'MIT', 'test', batchsize=64, with_image=True)

# Number of output classes for each head, taken from the label vocabularies.
train_set = train_dataloader.dataset
obj_class = len(train_set.obj2idx.keys())
attr_class = len(train_set.attr2idx.keys())

print(f"obj_class: {obj_class}, attr_class: {attr_class}")

53753 activations loaded
#images = 34562
53753 activations loaded
#images = 19191
obj_class: 245, attr_class: 115


# ResNet

In [4]:
class Identity(nn.Module):
    """Pass-through module whose ``forward`` returns its input untouched.

    CompoResnet assigns an instance to ``resnet.fc`` so that the backbone
    outputs whatever used to feed its final classification layer.
    """

    def __init__(self):
        super().__init__()

    def forward(self, x):
        """Return ``x`` unchanged."""
        return x

class MLP(nn.Module):
  """Classification head with a single hidden layer as wide as its input.

  Layer order: Linear(in, in) -> BatchNorm1d -> ReLU -> Dropout -> Linear(in, out).
  """

  def __init__(self, in_features, out_features):
    super().__init__()

    # Modules are created in forward order and stored under ``self.mlp``.
    stack = [
        nn.Linear(in_features, in_features),
        nn.BatchNorm1d(in_features),
        nn.ReLU(),
        nn.Dropout(),
        nn.Linear(in_features, out_features),
    ]
    self.mlp = nn.Sequential(*stack)

  def forward(self, x):
    """Map a batch of feature vectors to ``out_features`` logits."""
    logits = self.mlp(x)
    return logits


class MLP2(nn.Module):
  """Classification head with two hidden layers of width in//2 and in//4.

  Each hidden layer is Linear -> BatchNorm1d -> ReLU -> Dropout, followed by
  a final Linear(in//4, out).
  """

  def __init__(self, in_features, out_features):
    super().__init__()

    half, quarter = in_features // 2, in_features // 4
    blocks = []
    # Hidden stages are built in forward order: in -> in//2 -> in//4.
    for fan_in, fan_out in ((in_features, half), (half, quarter)):
      blocks.extend([
          nn.Linear(fan_in, fan_out),
          nn.BatchNorm1d(fan_out),
          nn.ReLU(),
          nn.Dropout(),
      ])
    blocks.append(nn.Linear(quarter, out_features))
    self.mlp = nn.Sequential(*blocks)

  def forward(self, x):
    """Map a batch of feature vectors to ``out_features`` logits."""
    logits = self.mlp(x)
    return logits


class MLP3(nn.Module):
  """Classification head with three fixed-width hidden layers (1400, 800, 400).

  Each hidden layer is Linear -> BatchNorm1d -> ReLU -> Dropout, followed by
  a final Linear(400, out).
  """

  def __init__(self, in_features, out_features):
    super().__init__()

    hidden_widths = (1400, 800, 400)
    modules = []
    fan_in = in_features
    # Build the hidden stages in forward order, chaining each width to the next.
    for fan_out in hidden_widths:
      modules += [
          nn.Linear(fan_in, fan_out),
          nn.BatchNorm1d(fan_out),
          nn.ReLU(),
          nn.Dropout(),
      ]
      fan_in = fan_out
    modules.append(nn.Linear(fan_in, out_features))
    self.mlp = nn.Sequential(*modules)

  def forward(self, x):
    """Map a batch of feature vectors to ``out_features`` logits."""
    logits = self.mlp(x)
    return logits


class HalvingMLP(nn.Module):
  """Classification head whose hidden width halves at every block.

  The head is ``num_layers`` blocks of
  Linear(w, w//2) -> BatchNorm1d -> ReLU -> Dropout (each wrapped in its own
  ``nn.Sequential``), followed by a final Linear to ``out_features``.

  Args:
    in_features: width of the incoming feature vector.
    out_features: number of output logits.
    num_layers: number of halving blocks. Required in practice; the ``None``
      default only exists so the class can be pre-bound with
      ``functools.partial(HalvingMLP, num_layers=...)``.

  Raises:
    TypeError: if ``num_layers`` is left as ``None``.
    ValueError: if halving ``in_features`` ``num_layers`` times would reach a
      zero-width layer.
  """

  def __init__(self, in_features, out_features, num_layers=None):
    super(HalvingMLP, self).__init__()
    if num_layers is None:
      raise TypeError("HalvingMLP needs an integer num_layers; got None")

    # Track the running width in a local rather than mutating the argument.
    width = in_features
    layers = []
    for _ in range(num_layers):
      narrower = width // 2
      if narrower < 1:
        # A zero-width Linear/BatchNorm would yield a degenerate head.
        raise ValueError(
            "num_layers=%d halves in_features=%d down to zero width"
            % (num_layers, in_features))
      layers.append(nn.Sequential(
          nn.Linear(width, narrower),
          nn.BatchNorm1d(narrower),
          nn.ReLU(),
          nn.Dropout()))
      width = narrower
    layers.append(nn.Linear(width, out_features))
    self.mlp = nn.Sequential(*layers)

  def forward(self, x):
    """Map a batch of feature vectors to ``out_features`` logits."""
    return self.mlp(x)

def frozen(model):
    """Turn off gradient tracking for every parameter of ``model``, in place.

    Returns the same ``model`` object so the call can be used inline.
    """
    for weight in model.parameters():
        weight.requires_grad_(False)
    return model

class CompoResnet(nn.Module):
  """ResNet backbone feeding two independent heads: object and attribute.

  The backbone's ``fc`` layer is replaced in place by ``Identity`` (the
  passed-in ``resnet`` object is modified), so it outputs the features that
  previously fed ``fc``. ``MLP`` is a callable ``(in_features, out_features)``
  used to build each head.
  """

  def __init__(self, resnet, obj_class, attr_class, MLP=MLP):
    super().__init__()
    # Read the feature width before swapping out the classifier
    # (2048 for resnet101, per the original author's note).
    feature_dim = resnet.fc.in_features
    resnet.fc = Identity()
    self.resnet = resnet
    self.obj_fc = MLP(feature_dim, obj_class)
    self.attr_fc = MLP(feature_dim, attr_class)

  def forward(self, x):
    """Return ``(obj_pred, attr_pred)`` logits computed from shared features."""
    shared = self.resnet(x)
    return self.obj_fc(shared), self.attr_fc(shared)

In [5]:
#resnet = frozen(torch.hub.load('pytorch/vision:v0.9.0', 'resnet101', pretrained=True))
# compoResnet = CompoResnet(resnet, obj_class, attr_class, MLP3).to(dev)

# obj_loss_history = []
# attr_loss_history = []
# optimizer = torch.optim.Adam(compoResnet.parameters())
# criterion = nn.CrossEntropyLoss()
# curr_epoch = 0

# model_dir = '/content/drive/MyDrive/compoResnet/models/'
# model_name = None
# model_path = None if not model_name else os.path.join(model_dir, model_name)

# if model_path:
#   #checkpoint = torch.load(model_path), map_location=torch.device('cpu'))
#   checkpoint = torch.load(model_path)
#   compoResnet.load_state_dict(checkpoint['model_state_dict'])
#   optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
#   curr_epoch = checkpoint['epoch']
#   obj_loss_history = checkpoint['obj_loss']
#   attr_loss_history = checkpoint['attr_loss']

In [6]:
def train_with_config(config, checkpoint_dir=None, num_epochs=1):
  """Ray Tune trainable: build a CompoResnet from ``config`` and train it.

  Args:
    config: dict with keys ``'lr'`` (Adam learning rate), ``'resnet'``
      (torchvision hub model name) and ``'num_mlp_layers'`` (number of
      halving blocks in each HalvingMLP head).
    checkpoint_dir: directory containing a ``"checkpoint"`` file to resume
      from, or ``None`` to start fresh.
    num_epochs: number of epochs to run in this call.
  """
  lr = config['lr']
  resnet_name = config['resnet']
  num_mlp_layers = config['num_mlp_layers']
  mlp = partial(HalvingMLP, num_layers=num_mlp_layers)
  batch_size = 64

  resnet = frozen(torch.hub.load('pytorch/vision:v0.9.0', resnet_name, pretrained=True))
  compoResnet = CompoResnet(resnet, obj_class, attr_class, mlp).to(dev)
  obj_loss_history = []
  attr_loss_history = []
  optimizer = optim.Adam(compoResnet.parameters(), lr=lr)
  criterion = nn.CrossEntropyLoss()

  if checkpoint_dir:
    # train() saves a dict, so read it by key. Tuple-unpacking the loaded
    # object would only yield the dict's key strings.
    checkpoint = torch.load(
        os.path.join(checkpoint_dir, "checkpoint"), map_location=dev)
    compoResnet.load_state_dict(checkpoint['model_state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    obj_loss_history = checkpoint['obj_loss']
    attr_loss_history = checkpoint['attr_loss']

  train(compoResnet, optimizer, criterion, num_epochs, obj_loss_history, attr_loss_history, batch_size, use_tune=True)

def train(net, optimizer, criterion, num_epochs, obj_loss_history, attr_loss_history, batch_size, curr_epoch=0, use_tune=False, split_seed=0):
  """Train ``net`` on an 80/20 train/validation split of the MIT train set.

  Each epoch runs one optimisation pass, then computes validation loss and
  accuracy. With ``use_tune`` the epoch is checkpointed and reported to Ray
  Tune; otherwise the validation accuracy is printed.

  Args:
    net: model returning ``(obj_pred, attr_pred)`` logits for an image batch.
    optimizer: optimizer over ``net``'s parameters.
    criterion: loss applied separately to the object and attribute logits.
    num_epochs: number of epochs to run in this call.
    obj_loss_history: list extended in place with the running mean object
      loss every 100 batches.
    attr_loss_history: same, for the attribute loss.
    batch_size: batch size for both the train and validation loaders.
    curr_epoch: index of the first epoch (for resumed runs).
    use_tune: when True, silence progress bars and checkpoint/report via Tune.
    split_seed: seed for the train/validation split, so every trial and every
      resume is scored on the same held-out images. ``None`` leaves the split
      unseeded.
  """
  dset = dataset.get_dataloader('MIT', 'train', with_image=True).dataset
  train_size = int(len(dset) * 0.8)
  split_kwargs = {}
  if split_seed is not None:
    split_kwargs['generator'] = torch.Generator().manual_seed(split_seed)
  train_subset, val_subset = random_split(
      dset, [train_size, len(dset) - train_size], **split_kwargs)
  train_dataloader = DataLoader(train_subset, batch_size=batch_size, shuffle=True)
  # Order does not matter for evaluation, so the validation set is not shuffled.
  val_dataloader = DataLoader(val_subset, batch_size=batch_size, shuffle=False)

  last_epoch = curr_epoch + num_epochs
  for epoch in range(curr_epoch, last_epoch):
    # Re-enter train mode every epoch: validation below switches to eval.
    net.train()
    epoch_steps = 0
    obj_running_loss = 0.0
    attr_running_loss = 0.0
    for i, batch in tqdm.tqdm(
        enumerate(train_dataloader),
        total=len(train_dataloader),
        disable=use_tune,
        position=0,
        leave=True,
        postfix='Train: epoch %d/%d' % (epoch + 1, last_epoch)):
      img, attr_id, obj_id = batch[:3]
      if len(img) == 1:
        # Batchnorm doesn't accept batch with size 1
        continue
      optimizer.zero_grad()
      obj_pred, attr_pred = net(img.to(dev))
      obj_loss = criterion(obj_pred, obj_id.to(dev))
      attr_loss = criterion(attr_pred, attr_id.to(dev))
      # One backward pass over the summed loss accumulates the same gradients
      # as two separate passes, without having to retain the graph.
      (obj_loss + attr_loss).backward()
      optimizer.step()

      obj_running_loss += obj_loss.item()
      attr_running_loss += attr_loss.item()
      epoch_steps += 1
      if i % 100 == 99:
        # Running means since the start of the epoch.
        print("[%d, %5d] obj_loss: %.3f, attr_loss: %.3f" % (
            epoch + 1, i + 1,
            obj_running_loss / epoch_steps, attr_running_loss / epoch_steps))
        obj_loss_history.append(obj_running_loss / epoch_steps)
        attr_loss_history.append(attr_running_loss / epoch_steps)

    # Validation loss
    obj_val_loss = 0.0
    attr_val_loss = 0.0
    val_steps = 0
    net.eval()
    with torch.no_grad():
      for batch in tqdm.tqdm(
          val_dataloader,
          total=len(val_dataloader),
          disable=use_tune,
          position=0,
          leave=True):
        img, attr_id, obj_id = batch[:3]
        obj_pred, attr_pred = net(img.to(dev))
        obj_val_loss += criterion(obj_pred, obj_id.to(dev)).item()
        attr_val_loss += criterion(attr_pred, attr_id.to(dev)).item()
        val_steps += 1

    acc = calc_acc(net, val_dataloader, use_tune)
    if use_tune:
      with tune.checkpoint_dir(epoch) as checkpoint_dir:
        path = os.path.join(checkpoint_dir, "checkpoint")
        torch.save({
            'model_state_dict': net.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'obj_loss': obj_loss_history,
            'attr_loss': attr_loss_history,
            'epoch': epoch,
        }, path)

      # Mean of the per-batch (object + attribute) validation losses.
      tune.report(loss=((obj_val_loss + attr_val_loss) / max(val_steps, 1)), accuracy=acc)
    else:
      print('Validation accuracy:', acc)

  print("Finished training.")

In [7]:
def calc_acc(model, test_dataloader, use_tune=False):
  """Compute object, attribute and joint (both correct) accuracy of ``model``.

  The model is evaluated in eval mode without gradients and is switched back
  to train mode before returning. ``use_tune`` disables the progress bar.

  Returns:
    A length-3 numpy array ``[obj_acc, attr_acc, comp_acc]``: match counts
    divided by ``len(test_dataloader.dataset)``.
  """
  obj_hits, attr_hits, pair_hits = 0, 0, 0
  with torch.no_grad():
    model.eval()
    progress = tqdm.tqdm(
        enumerate(test_dataloader),
        total=len(test_dataloader),
        disable=use_tune,
        position=0,
        leave=True)
    for _, batch in progress:
      img, attr_id, obj_id = batch[:3]
      obj_logits, attr_logits = model(img.to(dev))
      # Labels stay on the CPU, so bring the predictions back before comparing.
      obj_guess = torch.argmax(obj_logits.to('cpu'), dim=-1)
      attr_guess = torch.argmax(attr_logits.to('cpu'), dim=-1)
      obj_ok = obj_guess == obj_id
      attr_ok = attr_guess == attr_id
      obj_hits += torch.sum(obj_ok)
      attr_hits += torch.sum(attr_ok)
      # A sample counts as a joint match only when both heads are right.
      pair_hits += torch.sum(obj_ok & attr_ok)
  model.train()
  totals = np.array([obj_hits, attr_hits, pair_hits])
  return totals / len(test_dataloader.dataset)

In [8]:
# Search space sampled by Ray Tune for each trial:
#   lr             - Adam learning rate, log-uniform between 1e-4 and 1e-1
#   resnet         - torchvision backbone name
#   num_mlp_layers - number of halving blocks in each HalvingMLP head
config = dict(
    lr=tune.loguniform(1e-4, 1e-1),
    resnet=tune.choice(['resnet18', 'resnet50', 'resnet101']),
    num_mlp_layers=tune.choice([1, 2, 4, 6]),
)

In [None]:
# Search budget: number of sampled configurations and the epoch cap per trial.
num_samples = 12
num_epochs = 6

# ASHA halts under-performing trials early, judged on the reported "loss".
scheduler = ASHAScheduler(
    metric="loss", mode="min",
    max_t=num_epochs, grace_period=1, reduction_factor=2)

# Columns shown in the periodic status table.
reporter = CLIReporter(
    metric_columns=["loss", "accuracy", "training_iteration"])

# A 0.32 GPU share per trial lets three trials run on one GPU at a time.
result = tune.run(
    partial(train_with_config, num_epochs=num_epochs),
    config=config,
    num_samples=num_samples,
    resources_per_trial={"cpu": 1, "gpu": 0.32},
    scheduler=scheduler,
    progress_reporter=reporter)

[2021-04-02 14:05:31,895] ray.tune.ray_trial_executor: Initializing Ray automatically.For cluster usage or custom Ray initialization, call `ray.init(...)` before `tune.run`.
2021-04-02 14:05:32,449	INFO services.py:1174 -- View the Ray dashboard at [1m[32mhttp://127.0.0.1:8265[39m[22m
2021-04-02 14:05:34,086	INFO registry.py:65 -- Detected unknown callable for trainable. Converting to class.


== Status ==
Memory usage on this node: 1.9/15.3 GiB
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 4.000: None | Iter 2.000: None | Iter 1.000: None
Resources requested: 1/4 CPUs, 0.32/1 GPUs, 0.0/8.5 GiB heap, 0.0/2.93 GiB objects (0/1.0 accelerator_type:T4)
Result logdir: /home/ubuntu/ray_results/DEFAULT_2021-04-02_14-05-34
Number of trials: 1/12 (1 RUNNING)
+---------------------+----------+-------+------------+------------------+----------+
| Trial name          | status   | loc   |         lr |   num_mlp_layers | resnet   |
|---------------------+----------+-------+------------+------------------+----------|
| DEFAULT_7df76_00000 | RUNNING  |       | 0.00104695 |                4 | resnet50 |
+---------------------+----------+-------+------------+------------------+----------+




[2m[36m(pid=13589)[0m Using cache found in /home/ubuntu/.cache/torch/hub/pytorch_vision_v0.9.0
[2m[36m(pid=13590)[0m Using cache found in /home/ubuntu/.cache/torch/hub/pytorch_vision_v0.9.0
[2m[36m(pid=13588)[0m Using cache found in /home/ubuntu/.cache/torch/hub/pytorch_vision_v0.9.0


[2m[36m(pid=13588)[0m 53753 activations loaded
[2m[36m(pid=13589)[0m 53753 activations loaded
[2m[36m(pid=13588)[0m #images = 34562
[2m[36m(pid=13590)[0m 53753 activations loaded
[2m[36m(pid=13589)[0m #images = 34562
[2m[36m(pid=13590)[0m #images = 34562
[2m[36m(pid=13589)[0m [1,   100] obj_loss: 4.470, attr_loss: 3.944
[2m[36m(pid=13590)[0m [1,   100] obj_loss: 5.365, attr_loss: 4.483
[2m[36m(pid=13588)[0m [1,   100] obj_loss: 5.248, attr_loss: 4.830
[2m[36m(pid=13589)[0m [1,   200] obj_loss: 4.112, attr_loss: 3.756
[2m[36m(pid=13588)[0m [1,   200] obj_loss: 4.591, attr_loss: 4.233
[2m[36m(pid=13590)[0m [1,   200] obj_loss: 5.096, attr_loss: 4.300
[2m[36m(pid=13589)[0m [1,   300] obj_loss: 3.923, attr_loss: 3.642
[2m[36m(pid=13588)[0m [1,   300] obj_loss: 4.340, attr_loss: 4.019
[2m[36m(pid=13590)[0m [1,   300] obj_loss: 4.893, attr_loss: 4.157
[2m[36m(pid=13589)[0m [1,   400] obj_loss: 3.809, attr_loss: 3.571
[2m[36m(pid=13588)[0m [1

[2m[36m(pid=13591)[0m Using cache found in /home/ubuntu/.cache/torch/hub/pytorch_vision_v0.9.0


[2m[36m(pid=13591)[0m 53753 activations loaded
[2m[36m(pid=13591)[0m #images = 34562
Result for DEFAULT_7df76_00000:
  accuracy: '[0.12700709 0.15434688 0.02300014]'
  date: 2021-04-02_14-14-10
  done: true
  experiment_id: 84996da4a7464225babda0e6c6f871e3
  hostname: ip-172-31-45-75
  iterations_since_restore: 1
  loss: 7.339719846707966
  node_ip: 172.31.45.75
  pid: 13590
  should_checkpoint: true
  time_since_restore: 514.3741519451141
  time_this_iter_s: 514.3741519451141
  time_total_s: 514.3741519451141
  timestamp: 1617372850
  timesteps_since_restore: 0
  training_iteration: 1
  trial_id: 7df76_00000
  
== Status ==
Memory usage on this node: 11.5/15.3 GiB
Using AsyncHyperBand: num_stopped=2
Bracket: Iter 4.000: None | Iter 2.000: None | Iter 1.000: -6.632352487756572
Resources requested: 3/4 CPUs, 0.96/1 GPUs, 0.0/8.5 GiB heap, 0.0/2.93 GiB objects (0/1.0 accelerator_type:T4)
Result logdir: /home/ubuntu/ray_results/DEFAULT_2021-04-02_14-05-34
Number of trials: 5/12 (1 P

[2m[36m(pid=13800)[0m Using cache found in /home/ubuntu/.cache/torch/hub/pytorch_vision_v0.9.0


[2m[36m(pid=13800)[0m 53753 activations loaded
[2m[36m(pid=13800)[0m #images = 34562
[2m[36m(pid=13589)[0m [2,   200] obj_loss: 3.214, attr_loss: 3.225
[2m[36m(pid=13591)[0m [1,   100] obj_loss: 4.887, attr_loss: 4.635
[2m[36m(pid=13589)[0m [2,   300] obj_loss: 3.240, attr_loss: 3.236
[2m[36m(pid=13800)[0m [1,   100] obj_loss: 4.405, attr_loss: 4.077
[2m[36m(pid=13589)[0m [2,   400] obj_loss: 3.245, attr_loss: 3.224
[2m[36m(pid=13591)[0m [1,   200] obj_loss: 4.288, attr_loss: 4.078
[2m[36m(pid=13800)[0m [1,   200] obj_loss: 3.985, attr_loss: 3.794
[2m[36m(pid=13800)[0m [1,   300] obj_loss: 3.766, attr_loss: 3.643
[2m[36m(pid=13591)[0m [1,   300] obj_loss: 4.066, attr_loss: 3.876
Result for DEFAULT_7df76_00002:
  accuracy: '[0.3053667  0.24215247 0.10183712]'
  date: 2021-04-02_14-19-07
  done: false
  experiment_id: bd6c19eef66348fb8cfd581da5b4bf1f
  hostname: ip-172-31-45-75
  iterations_since_restore: 2
  loss: 5.888648895632237
  node_ip: 172.31.45.7

[2m[36m(pid=14044)[0m Using cache found in /home/ubuntu/.cache/torch/hub/pytorch_vision_v0.9.0


[2m[36m(pid=14044)[0m 53753 activations loaded
[2m[36m(pid=14044)[0m #images = 34562
[2m[36m(pid=13589)[0m [5,   200] obj_loss: 2.951, attr_loss: 3.007
[2m[36m(pid=13800)[0m [3,   200] obj_loss: 2.649, attr_loss: 2.766
[2m[36m(pid=14044)[0m [1,   100] obj_loss: 4.815, attr_loss: 4.107
[2m[36m(pid=13589)[0m [5,   300] obj_loss: 2.958, attr_loss: 3.009
[2m[36m(pid=13800)[0m [3,   300] obj_loss: 2.675, attr_loss: 2.776
[2m[36m(pid=13589)[0m [5,   400] obj_loss: 2.957, attr_loss: 3.018
[2m[36m(pid=14044)[0m [1,   200] obj_loss: 4.389, attr_loss: 3.847
[2m[36m(pid=13800)[0m [3,   400] obj_loss: 2.685, attr_loss: 2.790
[2m[36m(pid=14044)[0m [1,   300] obj_loss: 4.138, attr_loss: 3.695
Result for DEFAULT_7df76_00002:
  accuracy: '[0.3256184  0.26674382 0.12339071]'
  date: 2021-04-02_14-38-52
  done: false
  experiment_id: bd6c19eef66348fb8cfd581da5b4bf1f
  hostname: ip-172-31-45-75
  iterations_since_restore: 5
  loss: 5.685606788057799
  node_ip: 172.31.45.7

[2m[36m(pid=14189)[0m Using cache found in /home/ubuntu/.cache/torch/hub/pytorch_vision_v0.9.0


[2m[36m(pid=14189)[0m 53753 activations loaded
[2m[36m(pid=14189)[0m #images = 34562
[2m[36m(pid=13800)[0m [4,   400] obj_loss: 2.562, attr_loss: 2.709
[2m[36m(pid=14044)[0m [2,   300] obj_loss: 3.142, attr_loss: 3.147
[2m[36m(pid=14189)[0m [1,   100] obj_loss: 5.450, attr_loss: 4.605
[2m[36m(pid=14044)[0m [2,   400] obj_loss: 3.114, attr_loss: 3.134
[2m[36m(pid=14189)[0m [1,   200] obj_loss: 5.250, attr_loss: 4.450
Result for DEFAULT_7df76_00004:
  accuracy: '[0.36597714 0.30420946 0.1664979 ]'
  date: 2021-04-02_14-49-29
  done: false
  experiment_id: b822e2a2fb6a4b778850867d2e3f1831
  hostname: ip-172-31-45-75
  iterations_since_restore: 4
  loss: 5.317572844137839
  node_ip: 172.31.45.75
  pid: 13800
  should_checkpoint: true
  time_since_restore: 2117.2785868644714
  time_this_iter_s: 533.1544060707092
  time_total_s: 2117.2785868644714
  timestamp: 1617374969
  timesteps_since_restore: 0
  training_iteration: 4
  trial_id: 7df76_00004
  
== Status ==
Memory u

[2m[36m(pid=14334)[0m Using cache found in /home/ubuntu/.cache/torch/hub/pytorch_vision_v0.9.0


[2m[36m(pid=14044)[0m [3,   400] obj_loss: 2.921, attr_loss: 2.987
[2m[36m(pid=14334)[0m 53753 activations loaded
[2m[36m(pid=14334)[0m #images = 34562
Result for DEFAULT_7df76_00004:
  accuracy: '[0.38116592 0.31346738 0.1764791 ]'
  date: 2021-04-02_14-58-31
  done: false
  experiment_id: b822e2a2fb6a4b778850867d2e3f1831
  hostname: ip-172-31-45-75
  iterations_since_restore: 5
  loss: 5.28111893130005
  node_ip: 172.31.45.75
  pid: 13800
  should_checkpoint: true
  time_since_restore: 2659.40873336792
  time_this_iter_s: 542.1301465034485
  time_total_s: 2659.40873336792
  timestamp: 1617375511
  timesteps_since_restore: 0
  training_iteration: 5
  trial_id: 7df76_00004
  
== Status ==
Memory usage on this node: 11.6/15.3 GiB
Using AsyncHyperBand: num_stopped=5
Bracket: Iter 4.000: -5.548458982498274 | Iter 2.000: -5.794764433773833 | Iter 1.000: -6.204957559568073
Resources requested: 3/4 CPUs, 0.96/1 GPUs, 0.0/8.5 GiB heap, 0.0/2.93 GiB objects (0/1.0 accelerator_type:T4)

[2m[36m(pid=14445)[0m Using cache found in /home/ubuntu/.cache/torch/hub/pytorch_vision_v0.9.0


[2m[36m(pid=14445)[0m 53753 activations loaded
[2m[36m(pid=14445)[0m #images = 34562
[2m[36m(pid=14445)[0m [1,   100] obj_loss: 5.363, attr_loss: 4.497
Result for DEFAULT_7df76_00007:
  accuracy: '[0.17401996 0.14668017 0.03312599]'
  date: 2021-04-02_15-08-41
  done: true
  experiment_id: 8bbc59a3b18f461d860524d6d0f11025
  hostname: ip-172-31-45-75
  iterations_since_restore: 1
  loss: 7.055398337337949
  node_ip: 172.31.45.75
  pid: 14334
  should_checkpoint: true
  time_since_restore: 674.13511967659
  time_this_iter_s: 674.13511967659
  time_total_s: 674.13511967659
  timestamp: 1617376121
  timesteps_since_restore: 0
  training_iteration: 1
  trial_id: 7df76_00007
  
== Status ==
Memory usage on this node: 11.4/15.3 GiB
Using AsyncHyperBand: num_stopped=7
Bracket: Iter 4.000: -5.548458982498274 | Iter 2.000: -5.794764433773833 | Iter 1.000: -6.418655023662323
Resources requested: 3/4 CPUs, 0.96/1 GPUs, 0.0/8.5 GiB heap, 0.0/2.93 GiB objects (0/1.0 accelerator_type:T4)
Res

[2m[36m(pid=14471)[0m Using cache found in /home/ubuntu/.cache/torch/hub/pytorch_vision_v0.9.0


[2m[36m(pid=14471)[0m 53753 activations loaded
[2m[36m(pid=14471)[0m #images = 34562
[2m[36m(pid=14445)[0m [1,   200] obj_loss: 5.127, attr_loss: 4.337
[2m[36m(pid=14471)[0m [1,   100] obj_loss: 4.617, attr_loss: 4.066
Result for DEFAULT_7df76_00005:
  accuracy: '[0.36337335 0.2979893  0.14971792]'
  date: 2021-04-02_15-10-08
  done: false
  experiment_id: a4b100ff2be246c7a19a24de2dba23dd
  hostname: ip-172-31-45-75
  iterations_since_restore: 4
  loss: 5.370271040758955
  node_ip: 172.31.45.75
  pid: 14044
  should_checkpoint: true
  time_since_restore: 2171.2818093299866
  time_this_iter_s: 545.4055845737457
  time_total_s: 2171.2818093299866
  timestamp: 1617376208
  timesteps_since_restore: 0
  training_iteration: 4
  trial_id: 7df76_00005
  
== Status ==
Memory usage on this node: 11.6/15.3 GiB
Using AsyncHyperBand: num_stopped=7
Bracket: Iter 4.000: -5.370271040758955 | Iter 2.000: -5.794764433773833 | Iter 1.000: -6.418655023662323
Resources requested: 3/4 CPUs, 0.96

[2m[36m(pid=14622)[0m Using cache found in /home/ubuntu/.cache/torch/hub/pytorch_vision_v0.9.0


[2m[36m(pid=14622)[0m 53753 activations loaded
[2m[36m(pid=14622)[0m #images = 34562
[2m[36m(pid=14044)[0m [5,   400] obj_loss: 2.752, attr_loss: 2.859
Result for DEFAULT_7df76_00009:
  accuracy: '[0.24113988 0.20092579 0.06480544]'
  date: 2021-04-02_15-15-14
  done: false
  experiment_id: 29908ef327614799b2ad2830b71ed0f1
  hostname: ip-172-31-45-75
  iterations_since_restore: 1
  loss: 6.559065468814395
  node_ip: 172.31.45.75
  pid: 14471
  should_checkpoint: true
  time_since_restore: 391.8949337005615
  time_this_iter_s: 391.8949337005615
  time_total_s: 391.8949337005615
  timestamp: 1617376514
  timesteps_since_restore: 0
  training_iteration: 1
  trial_id: 7df76_00009
  
== Status ==
Memory usage on this node: 11.4/15.3 GiB
Using AsyncHyperBand: num_stopped=8
Bracket: Iter 4.000: -5.370271040758955 | Iter 2.000: -5.794764433773833 | Iter 1.000: -6.595708978285483
Resources requested: 3/4 CPUs, 0.96/1 GPUs, 0.0/8.5 GiB heap, 0.0/2.93 GiB objects (0/1.0 accelerator_type:

[2m[36m(pid=14954)[0m Using cache found in /home/ubuntu/.cache/torch/hub/pytorch_vision_v0.9.0


[2m[36m(pid=14954)[0m 53753 activations loaded
[2m[36m(pid=14954)[0m #images = 34562
[2m[36m(pid=14044)[0m [6,   300] obj_loss: 2.690, attr_loss: 2.795
[2m[36m(pid=14954)[0m [1,   100] obj_loss: 5.478, attr_loss: 4.570
Result for DEFAULT_7df76_00010:
  accuracy: '[0.3285115  0.26804571 0.12498192]'
  date: 2021-04-02_15-22-43
  done: false
  experiment_id: 6f4438121e1d491a8ae64777d3100d08
  hostname: ip-172-31-45-75
  iterations_since_restore: 1
  loss: 5.707631003966025
  node_ip: 172.31.45.75
  pid: 14622
  should_checkpoint: true
  time_since_restore: 530.6368312835693
  time_this_iter_s: 530.6368312835693
  time_total_s: 530.6368312835693
  timestamp: 1617376963
  timesteps_since_restore: 0
  training_iteration: 1
  trial_id: 7df76_00010
  
== Status ==
Memory usage on this node: 11.4/15.3 GiB
Using AsyncHyperBand: num_stopped=9
Bracket: Iter 4.000: -5.370271040758955 | Iter 2.000: -5.888648895632237 | Iter 1.000: -6.559065468814395
Resources requested: 3/4 CPUs, 0.96/1

In [None]:
# Pick the trial whose last reported validation loss is lowest.
best_trial = result.get_best_trial("loss", "min", "last")
print("Best trial config: {}".format(best_trial.config))
print("Best trial final validation loss: {}".format(
    best_trial.last_result["loss"]))
print("Best trial final validation accuracy: {}".format(
    best_trial.last_result["accuracy"]))

# Rebuild the same architecture the best trial used.
resnet = frozen(torch.hub.load('pytorch/vision:v0.9.0', best_trial.config["resnet"], pretrained=True))
best_mlp = partial(HalvingMLP, num_layers=best_trial.config["num_mlp_layers"])
best_trained_model = CompoResnet(resnet, obj_class, attr_class, best_mlp).to(dev)

# map_location lets a checkpoint written on the GPU be restored on whichever
# device this kernel is using (including a CPU-only host).
best_checkpoint_dir = best_trial.checkpoint.value
model_state = torch.load(
    os.path.join(best_checkpoint_dir, "checkpoint"),
    map_location=dev)['model_state_dict']
best_trained_model.load_state_dict(model_state)

# Accuracy on the held-out test set: [object, attribute, both correct].
test_acc = calc_acc(best_trained_model, test_dataloader)
print("\nBest trial test set accuracy: {}".format(test_acc))

Match accuracy as [object, attribute, composition (both correct)]:

[0.30456985, 0.15528112, 0.02720025] : MLP2, 30 Epochs, Adam