### Installations

In [None]:
%%capture
!pip install torchfile 
!pip install tensorboardX
import os

### Git Operations

In [None]:
# Clone git repository 
!git clone 'https://github.com/XkunW/Image_Translation.git'

Cloning into 'Image_Translation'...
remote: Enumerating objects: 381, done.[K
remote: Counting objects: 100% (89/89), done.[K
remote: Compressing objects: 100% (66/66), done.[K
remote: Total 381 (delta 50), reused 53 (delta 23), pack-reused 292
Receiving objects: 100% (381/381), 382.56 MiB | 25.37 MiB/s, done.
Resolving deltas: 100% (159/159), done.


In [None]:
! git pull
# ! git status
# ! git checkout utils.py

fatal: not a git repository (or any of the parent directories): .git


### Drive mounting and unzipping data 

In [None]:
# Mount Google Drive so the files stored there can be read through /content/drive.
from google.colab import drive

drive.mount('/content/drive', force_remount=True)

# Folder on Drive that holds the dataset archives (same for Tina and Sophie).
data_dir = '/content/drive/MyDrive/CSC2516_Project/Datasets/'

# Full paths of the zipped datasets.
summer2winter = os.path.join(data_dir, 'summer2winter_yosemite_small_dataset.zip')
monet2photo = os.path.join(data_dir, 'monet2photo_small_dataset.zip')

Mounted at /content/drive


In [None]:
# Move into the UNIT folder of the cloned repository so that the project modules
# (`utils`, `trainer`) and the relative `configs/...` path used below resolve.
# An absolute path keeps the cell re-runnable: a relative chdir fails on a second
# run because the working directory is then already inside the repository.
os.chdir('/content/Image_Translation/UNIT')
os.getcwd()

'/content/Image_Translation/UNIT'

In [None]:
# Unzipping datasets to the target folder
%%capture
!unzip "$summer2winter" -d '/content/Image_Translation/UNIT/datasets/'
#!unzip "$monet2photo" -d '/content/Image_Translation/UNIT/datasets/'

In [None]:
# copy vgg16 model weights into the models folder in github repo
!cp "/content/drive/MyDrive/CSC2516_Project/UNIT_colab/VGG_model/vgg16.weight" "/content/Image_Translation/UNIT/models"

### Training Code

In [None]:
from utils import get_all_data_loaders, prepare_sub_folder, write_html, write_loss, get_config, write_2images
import argparse
from trainer import UNIT_Trainer
import torch.backends.cudnn as cudnn
import torch
try:
    from itertools import izip as zip
except ImportError: # will be 3.x series
    pass
import sys
from tensorboardX import SummaryWriter
import shutil
from tqdm import tqdm 

In [None]:
# Choose the configuration file and the output location used for training.
#
# Available configs:
#   configs/unit_summer2winter_yosemite256_folder.yaml
# NOTE(review): the original list repeated the line above twice; presumably the
# second entry was meant to be the monet2photo config -- confirm its file name.
config_file = 'configs/unit_summer2winter_yosemite256_folder.yaml'
output_path = '/content/drive/MyDrive/CSC2516_Project/UNIT_summer2winter_small'

parser = argparse.ArgumentParser()
parser.add_argument('--config', type=str, default=config_file, help='Path to the config file.')
parser.add_argument('--output_path', type=str, default=output_path, help="outputs path")
# parser.add_argument("--resume", action="store_true")
parser.add_argument('--trainer', type=str, default='UNIT', help="UNIT")
# '-f' stays declared so that `opts.f` keeps existing for any code that reads it.
parser.add_argument('-f')
# Parse an explicit empty argument list: in a notebook, sys.argv holds the kernel's
# own launch arguments rather than options meant for this parser, so every option
# simply takes the default given above.
opts = parser.parse_args(args=[])

### Hyperparameter sweep training loop

In [None]:
# Hyperparameter sweep: train one UNIT model from scratch for every combination of
# the values listed below, except the combinations listed in `skip_val`.
cudnn.benchmark = True
resume = False
use_cuda = torch.cuda.is_available()

# Load experiment setting
config = get_config(opts.config)
display_size = config['display_size']
config['vgg_model_path'] = '.'
max_iter = 80000
config['max_iter'] = max_iter
# Snapshots, saved images and displayed images are all produced every max_iter / 10 iterations.
config['snapshot_save_iter'] = int(max_iter / 10)
config['image_save_iter'] = int(max_iter / 10)
config['image_display_iter'] = int(max_iter / 10)

# (recon_kl_w, recon_kl_cyc_w) pairs that are left out of the sweep.
skip_val = [(0.01, 0.01), (0.01, 0.1), (0.1, 0.01)]
batch_sizes = [1] #[1, 4]
learning_rate = [0.0001]#, 0.0005] #initial learning rate
recon_kl_weight = [0.01, 0.1] #since: in paper the authors found setting the weights of the KL terms to 0.1 resulted in consistently good performance
recon_kl_cyc_weight = [0.01, 0.1]

# All combinations, in the same order four nested loops would visit them
# (batch size outermost, cycle-KL weight innermost).
sweep = [
    (batch_size, lr, kl_w, kl_cyc_w)
    for batch_size in batch_sizes
    for lr in learning_rate
    for kl_w in recon_kl_weight
    for kl_cyc_w in recon_kl_cyc_weight
]

for batch_size_val, lr_val, recon_kl_w, recon_kl_cyc_w in sweep:
    if (recon_kl_w, recon_kl_cyc_w) in skip_val:
        print('continue of %.2f, %.2f' % (recon_kl_w, recon_kl_cyc_w))
        continue

    config['batch_size'] = batch_size_val
    config['lr'] = lr_val
    config['recon_kl_w'] = recon_kl_w
    config['recon_kl_cyc_w'] = recon_kl_cyc_w
    trainer = UNIT_Trainer(config)
    # Suffix that identifies this configuration in image and checkpoint file names.
    # Parentheses replace the backslash continuations, which ended in a dangling
    # backslash followed by a blank line.
    param_values = (
        'batch_size_' + str(batch_size_val)
        + '_recon_kl_w_' + str(recon_kl_w)
        + '_recon_kl_clc_' + str(recon_kl_cyc_w)
        + '_lr_value_' + str(lr_val)
    )

    if use_cuda:
        trainer.cuda()

    train_loader_a, train_loader_b, test_loader_a, test_loader_b = get_all_data_loaders(config)

    # Fixed batches (the first `display_size` items of each dataset) used for the sample images.
    display_batches = []
    for loader in (train_loader_a, train_loader_b, test_loader_a, test_loader_b):
        batch = torch.stack([loader.dataset[i] for i in range(display_size)])
        display_batches.append(batch.cuda() if use_cuda else batch)
    (train_display_images_a, train_display_images_b,
     test_display_images_a, test_display_images_b) = display_batches

    # Setup logger and output folders, no need during hp tuning
    # NOTE(review): the param_values suffix is commented out below, so every
    # configuration of the sweep logs to the same directory -- confirm this is intended.
    model_name = os.path.splitext(os.path.basename(opts.config))[0]
    train_writer = SummaryWriter(os.path.join(opts.output_path + "/logs", model_name)) # + '_' + param_values))
    output_directory = os.path.join(opts.output_path + "/outputs", model_name) #+ '_' + param_values)
    checkpoint_directory, image_directory = prepare_sub_folder(output_directory)
    shutil.copy(opts.config, os.path.join(output_directory, 'config.yaml')) # copy config file to output folder

    # Start training
    iterations = trainer.resume(checkpoint_directory, hyperparameters=config) if resume else 0

    training_complete = False
    while not training_complete:
        # zip() ends with the shorter loader; the while loop then starts another
        # pass over the data until max_iter is reached.
        for images_a, images_b in tqdm(zip(train_loader_a, train_loader_b)):
            trainer.update_learning_rate()
            if use_cuda:
                images_a, images_b = images_a.cuda().detach(), images_b.cuda().detach()
            else:
                images_a, images_b = images_a.detach(), images_b.detach()

            # Main training code
            trainer.dis_update(images_a, images_b, config)
            trainer.gen_update(images_a, images_b, config)
            if use_cuda:
                torch.cuda.synchronize()

            # 1-based number of the iteration that has just finished.
            step = iterations + 1

            # Dump training stats in log file
            if step % config['log_iter'] == 0:
                write_loss(iterations, trainer, train_writer)

            # Write images
            if step % config['image_save_iter'] == 0:
                with torch.no_grad():
                    test_image_outputs = trainer.sample(test_display_images_a, test_display_images_b)
                    train_image_outputs = trainer.sample(train_display_images_a, train_display_images_b)

                # The last argument is the file-name postfix; it carries param_values
                # so images from different configurations do not overwrite each other.
                write_2images(test_image_outputs, display_size, image_directory, 'test_%08d_%s' % (step, param_values))
                write_2images(train_image_outputs, display_size, image_directory, 'train_%08d_%s' % (step, param_values))
                # HTML
                write_html(output_directory + "/index.html", step, config['image_save_iter'], 'images', param_values)

            if step % config['image_display_iter'] == 0:
                with torch.no_grad():
                    image_outputs = trainer.sample(train_display_images_a, train_display_images_b)
                write_2images(image_outputs, display_size, image_directory, 'train_current_' + param_values)

            # Save network weights
            if step % config['snapshot_save_iter'] == 0:
                trainer.save(checkpoint_directory, iterations, param_values)

            iterations += 1
            if iterations >= config['max_iter']:
                training_complete = True
                break

    # Flush and close this run's event file before the next configuration opens its own writer.
    train_writer.close()

164it [01:21,  2.01it/s]

### Resume training from a checkpoint

In [None]:
# Resume training of one configuration from its saved generator, discriminator
# and optimizer checkpoints, and continue until max_iter.
cudnn.benchmark = True
use_cuda = torch.cuda.is_available()

# Load experiment setting
config = get_config(opts.config)
display_size = config['display_size']
config['vgg_model_path'] = '.'
max_iter = 80000
config['max_iter'] = max_iter
# Snapshots, saved images and displayed images are all produced every max_iter / 10 iterations.
config['snapshot_save_iter'] = int(max_iter / 10)
config['image_save_iter'] = int(max_iter / 10)
config['image_display_iter'] = int(max_iter / 10)

#skip_val = [(0.01,0.01), (0.01,0.1), (0.1,0.01)]
recon_kl_weight = [0.01, 0.1] #since: in paper the authors found setting the weights of the KL terms to 0.1 resulted in consistently good performance
recon_kl_cyc_weight = [0.01, 0.1]

# Configuration to resume, and the iteration count encoded in its checkpoint file names.
batch_size_val = 1
lr_val = 0.0001
recon_kl_w = 0.1
recon_kl_cyc_w = 0.01
resume = True
iterations = 72000

config['batch_size'] = batch_size_val
config['lr'] = lr_val
config['recon_kl_w'] = recon_kl_w
config['recon_kl_cyc_w'] = recon_kl_cyc_w
trainer = UNIT_Trainer(config)
# Suffix that identifies this configuration in image and checkpoint file names.
# Parentheses replace the backslash continuations, which ended in a dangling
# backslash followed by a blank line.
param_values = (
    'batch_size_' + str(batch_size_val)
    + '_recon_kl_w_' + str(recon_kl_w)
    + '_recon_kl_clc_' + str(recon_kl_cyc_w)
    + '_lr_value_' + str(lr_val)
)

# Checkpoint files to load. The names are derived from `iterations` and
# `param_values`, so they cannot drift out of sync with the settings above.
checkpoint_root = '/content/drive/MyDrive/CSC2516_Project/UNIT_summer2winter_small/outputs/unit_summer2winter_yosemite256_folder/checkpoints'
gen_dir = os.path.join(checkpoint_root, 'gen_%08d_%s.pt' % (iterations, param_values))
dis_dir = os.path.join(checkpoint_root, 'dis_%08d_%s.pt' % (iterations, param_values))
opt_dir = os.path.join(checkpoint_root, 'optimizer_%s.pt' % param_values)

if use_cuda:
    trainer.cuda()

train_loader_a, train_loader_b, test_loader_a, test_loader_b = get_all_data_loaders(config)

# Fixed batches (the first `display_size` items of each dataset) used for the sample images.
display_batches = []
for loader in (train_loader_a, train_loader_b, test_loader_a, test_loader_b):
    batch = torch.stack([loader.dataset[i] for i in range(display_size)])
    display_batches.append(batch.cuda() if use_cuda else batch)
(train_display_images_a, train_display_images_b,
 test_display_images_a, test_display_images_b) = display_batches

# Setup logger and output folders, no need during hp tuning
model_name = os.path.splitext(os.path.basename(opts.config))[0]
train_writer = SummaryWriter(os.path.join(opts.output_path + "/logs", model_name)) # + '_' + param_values))
output_directory = os.path.join(opts.output_path + "/outputs", model_name) #+ '_' + param_values)
checkpoint_directory, image_directory = prepare_sub_folder(output_directory)
shutil.copy(opts.config, os.path.join(output_directory, 'config.yaml')) # copy config file to output folder

# Start training
# code change in resume function in trainer: iterations = int(last_model_name.split('/')[-1][4:11])
# The return value is not used; the iteration counter keeps the value set above.
trainer.resume(gen_dir=gen_dir, dis_dir=dis_dir, opt_dir=opt_dir, iterations=iterations, hyperparameters=config)

training_complete = False
while not training_complete:
    # zip() ends with the shorter loader; the while loop then starts another
    # pass over the data until max_iter is reached.
    for images_a, images_b in tqdm(zip(train_loader_a, train_loader_b)):
        trainer.update_learning_rate()
        if use_cuda:
            images_a, images_b = images_a.cuda().detach(), images_b.cuda().detach()
        else:
            images_a, images_b = images_a.detach(), images_b.detach()

        # Main training code
        trainer.dis_update(images_a, images_b, config)
        trainer.gen_update(images_a, images_b, config)
        if use_cuda:
            torch.cuda.synchronize()

        # 1-based number of the iteration that has just finished.
        step = iterations + 1

        # Dump training stats in log file
        if step % config['log_iter'] == 0:
            write_loss(iterations, trainer, train_writer)

        # Write images
        if step % config['image_save_iter'] == 0:
            with torch.no_grad():
                test_image_outputs = trainer.sample(test_display_images_a, test_display_images_b)
                train_image_outputs = trainer.sample(train_display_images_a, train_display_images_b)

            # The last argument is the file-name postfix; it carries param_values
            # so images from different configurations do not overwrite each other.
            write_2images(test_image_outputs, display_size, image_directory, 'test_%08d_%s' % (step, param_values))
            write_2images(train_image_outputs, display_size, image_directory, 'train_%08d_%s' % (step, param_values))
            # HTML
            write_html(output_directory + "/index.html", step, config['image_save_iter'], 'images', param_values)

        if step % config['image_display_iter'] == 0:
            with torch.no_grad():
                image_outputs = trainer.sample(train_display_images_a, train_display_images_b)
            write_2images(image_outputs, display_size, image_directory, 'train_current_' + param_values)

        # Save network weights
        if step % config['snapshot_save_iter'] == 0:
            trainer.save(checkpoint_directory, iterations, param_values)

        iterations += 1
        if iterations >= config['max_iter']:
            training_complete = True
            break

# Flush and close the event file once training has finished.
train_writer.close()

  cpuset_checked))


hi
Resume from iteration 72000


400it [03:20,  1.99it/s]
400it [03:20,  2.00it/s]
400it [03:20,  2.00it/s]
400it [03:20,  2.00it/s]
400it [03:20,  2.00it/s]
400it [03:20,  2.00it/s]
400it [03:20,  2.00it/s]
400it [03:20,  2.00it/s]
400it [03:20,  2.00it/s]
400it [03:19,  2.00it/s]
400it [03:19,  2.00it/s]
400it [03:19,  2.01it/s]
400it [03:19,  2.01it/s]
400it [03:20,  2.00it/s]
400it [03:19,  2.00it/s]
400it [03:19,  2.00it/s]
400it [03:19,  2.00it/s]
400it [03:19,  2.00it/s]
400it [03:19,  2.00it/s]
399it [03:18,  2.03it/s]

In [None]:
# Inspection only: displays the trainer's `resume` attribute; nothing is called here.
trainer.resume

In [None]:
# Inspection only: displays the checkpoint directory returned by prepare_sub_folder(output_directory).
checkpoint_directory