In [1]:
#!/usr/bin/env python
# coding: utf-8
"""
#### Code adapted from the source code of ArtEmis dataset paper
####################################################################
Training a neural-speaker.

The MIT License (MIT)
Originally created at 6/16/20, for Python 3.x
Copyright (c) 2021 Panos Achlioptas (ai.stanford.edu/~optas) & Stanford Geometric Computing Lab
####################################################################
"""
import pprint
import json
import torch
import time
import numpy as np
import os
import os.path as osp
from torch import nn
from termcolor import colored
from torch.utils.tensorboard import SummaryWriter
from ast import literal_eval
import pandas as pd
from PIL import Image
from torchvision.transforms import Compose, Resize, CenterCrop, ToTensor, Normalize
try:
    from torchvision.transforms import InterpolationMode
    BICUBIC = InterpolationMode.BICUBIC
except ImportError:
    BICUBIC = Image.BICUBIC
    
from model.argument import parse_test_speaker_arguments,set_seed
from model.datasets_v2 import preprocess_dataset
from model.func_train_v2 import load_state_dicts
from model.func_test_v2 import read_saved_args,grounding_dataset_per_image_dummy,versatile_caption_sampler
from model.func_test_v2 import pickle_data



In [2]:
# --- Run configuration -------------------------------------------------------
# `modelname` selects the trained run directory; `BS` is the beam size used
# when sampling and is also embedded in the output file name.
modelname = 'CLIPViTB16_1Gen'
BS = 1

model_dir = f'output/Ours_ArtEmis/{modelname}'
outputfile = osp.join(model_dir, f'fullDB_test_BS{BS}.pkl')

# Command-line style arguments handed to the project's test-argument parser.
# Other checkpoints that may exist in `checkpoints/`:
#   best_model.pt, model_epoch_20.pt, last_model
# NOTE(review): the literal None given to --sampling-config-file is echoed by
# the parser as the string 'None' -- confirm downstream code treats that as
# "no sampling config file".
speaker_cli = [
    '-speaker-saved-args', osp.join(model_dir, 'config.json.txt'),
    '-speaker-checkpoint', osp.join(model_dir, 'checkpoints/best_model.pt'),
    '-out-file', outputfile,
    '-img-dir', '',
    '--sampling-config-file', None,
    '--split', 'test',
    '--gpu', '0',
]
args_val = parse_test_speaker_arguments(speaker_cli)


Parameters Specified:
{'drop_bigrams': True,
 'drop_unk': True,
 'gpu': '0',
 'img_dir': '',
 'max_utterance_len': None,
 'out_file': 'output/Ours_ArtEmis/CLIPViTB16_1Gen/fullDB_test_BS1.pkl',
 'out_file_full': None,
 'random_seed': 2021,
 'sampling_config_file': 'None',
 'speaker_checkpoint': 'output/Ours_ArtEmis/CLIPViTB16_1Gen/checkpoints/best_model.pt',
 'speaker_saved_args': 'output/Ours_ArtEmis/CLIPViTB16_1Gen/config.json.txt',
 'split': 'test'}




In [3]:
# Restore the training-time configuration saved alongside the checkpoint and
# pull out the two switches that drive the rest of this notebook.
args = read_saved_args(args_val.speaker_saved_args)
modeltype, use_vocabFAM = args.modeltype, args.use_vocabFAM
pprint.pprint(vars(args))

{'FreezeCase': 0,
 'accum_iter': 2,
 'backboneVisEnc': 'ViTB16',
 'batch_size': 32,
 'context_length': 65,
 'data_dir': '../Dataset/ArtEmis/ArtEmis',
 'debug': False,
 'droprate': 0.0,
 'gpu': '0',
 'image_resolution': 224,
 'img_dir': '../Dataset/ArtEmis/ArtEmis/../OriginalArtEmis/Images/CLIP_224',
 'lr_others': 0.001,
 'lr_patience': 2,
 'lr_textEnc': 0.001,
 'lr_visEnc': 1e-07,
 'max_epochs': 200,
 'modeltype': '1Gen',
 'no_transform': True,
 'output_dir': 'output/Ours_ArtEmis/CLIPViTB16_1Gen',
 'random_seed': 2021,
 'save_each_epoch': False,
 'train_patience': 5,
 'transformer_heads': 8,
 'transformer_layers': 8,
 'use_timestamp': False,
 'use_vocabFAM': True,
 'vocab_size': 15018}


In [4]:
## Load dataset
# Read the annotation table and replace every missing value with the string
# 'None' so that later per-column parsing sees a uniform type.
file_name = 'ArtEmis.csv'
df = pd.read_csv(osp.join(args.data_dir, file_name))
df = df.where(df.notnull(), 'None')

# Seed before any sampling so the debug subset is reproducible
# (a seed of -1 means "do not seed").
if args.random_seed != -1:
    set_seed(args.random_seed)

# Debug mode works on a small random subset of 50 rows.
if args.debug:
    df = df.sample(50)

print(f'Loaded {len(df)} captions!!!')

  interactivity=interactivity, compiler=compiler, result=result)


Loaded 439135 captions!!!


In [5]:
# Build the image preprocessing pipeline that matches the visual backbone
# encoded in `modelname`.
#   * args.no_transform -> no transform is applied (img_transform is None);
#     presumably the images in args.img_dir are already preprocessed -- confirm.
#   * 'CLIP'     -> resize + center-crop to args.image_resolution, force RGB,
#                   normalize with the constants below (these appear to be the
#                   CLIP preprocessing statistics -- confirm).
#   * 'INRN34'   -> square resize with LANCZOS resampling, ImageNet mean/std.
#   * 'INViTB16' -> resize to 248, center-crop to 224x224, normalize with 0.5/0.5.
# Any other model name raises ValueError.
if args.no_transform:
    img_transform = None
else:
    if 'CLIP' in modelname:
        img_transform = Compose([
            Resize(args.image_resolution, interpolation=BICUBIC),
            CenterCrop(args.image_resolution),
            lambda image: image.convert("RGB"),
            ToTensor(),
            Normalize((0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711)),
        ])
    elif 'INRN34' in modelname:
        image_net_mean = [0.485, 0.456, 0.406]
        image_net_std = [0.229, 0.224, 0.225]
        resample_method = Image.LANCZOS
        normalize = Normalize(mean=image_net_mean, std=image_net_std)
        img_transform = Compose([Resize((args.image_resolution, args.image_resolution), resample_method),ToTensor(),normalize])
    elif 'INViTB16' in modelname:
        img_transform = Compose([
            Resize(size=248, interpolation=BICUBIC, max_size=None, antialias=None),
            CenterCrop(size=(224, 224)),
            ToTensor(),
            Normalize((0.5000, 0.5000, 0.5000), (0.5000, 0.5000, 0.5000))])
    else:
        # Report the offending *model name*; `model` is not defined until the
        # network is built later in the notebook, so referencing it here would
        # raise NameError instead of the intended ValueError.
        raise ValueError(f"Do not support model = {modelname}")

In [6]:
# Bind `idcmodel` to the architecture class that matches the saved `modeltype`.
# Unsupported values are rejected up front, before anything is imported.
if modeltype not in ('full', 'woSG', '1Gen'):
    raise ValueError(f"Do not support modeltype = {modeltype}!!!")

if modeltype == 'full':
    from model.model_v2 import fullArc as idcmodel
elif modeltype == 'woSG':
    from model.model_v2 import woSGArc as idcmodel
else:  # modeltype == '1Gen'
    from model.model_v2 import oneGenArc as idcmodel

In [7]:
# Select the vocabulary / tokenizer and the special-token ids used for decoding.
if not use_vocabFAM:
    # Use the original vocabulary of CLIP.
    from model.clip.simple_tokenizer import SimpleTokenizer as _Tokenizer
    from model.func_test_v2 import get_highest_prob_capt_CLIP as get_highest_prob_capt
    vocab = _Tokenizer()
    sos_token = args.vocab_size - 2  # second-to-last id is the start token
    eos_token = args.vocab_size - 1  # last id is the end token
    and_token = vocab.encode('and')[0]
    unk_token = []
    # In this mode the token column is taken from the CLIP token column.
    df['tokens_encoded'] = df['CLIP_tokens']
else:
    # Use the vocabulary pickled next to the dataset.
    from model.vocabulary import Vocabulary
    from model.func_test_v2 import get_highest_prob_capt
    vocab = Vocabulary.load(osp.join(args.data_dir, 'ArtEmis_Vocab.pkl'))
    sos_token, eos_token = 1, 2
    and_token = vocab('and')
    unk_token = vocab.unk
sos_token,eos_token,and_token, unk_token

(1, 2, 30, 3)

In [8]:
def _parse_encoded(value):
    """Return `value` parsed with `literal_eval` when it is still a string.

    The encoded columns arrive from the CSV as Python-literal strings. Values
    that are no longer strings (e.g. because this cell was already executed
    once) are returned unchanged, so re-running the cell does not raise.
    """
    return literal_eval(value) if isinstance(value, str) else value


# Convert the string-encoded columns into Python objects (idempotent).
for _col in ('tokens_encoded', 'subject_encoded', 'predicate_encoded'):
    df[_col] = df[_col].apply(_parse_encoded)

# Build the per-split data loaders; the second return value is not used here.
data_loaders, _ = preprocess_dataset(df, args,img_transform)
print('Will use {} annotations for training.'.format(len(data_loaders['train'].dataset)))
print('Will use {} annotations for validation.'.format(len(data_loaders['val'].dataset)))
print('Will use {} annotations for testing.'.format(len(data_loaders['test'].dataset)))

img_transforms: {'train': None, 'val': None, 'test': None}
Will use 348197 annotations for training.
Will use 32011 annotations for validation.
Will use 58927 annotations for testing.


In [9]:
# Pick the loader for the requested split and, if no maximum caption length was
# given on the command line, derive it from the longest caption in that split.
working_data_loader = data_loaders[args_val.split]
if args_val.max_utterance_len is None:
    def utterance_len(tokens, eos_token=eos_token):
        """Return the position of the first `eos_token` in `tokens`, minus one.

        The subtraction of one discounts the leading start-of-sentence token.
        Raises IndexError when `tokens` contains no `eos_token`.
        """
        eos_positions = np.nonzero(np.asarray(tokens) == eos_token)[0]
        return eos_positions[0] - 1

    split_tokens = working_data_loader.dataset.tokens_encoded
    args_val.max_utterance_len = split_tokens.apply(utterance_len).max()
    print(args_val.max_utterance_len)
annotate_loader = grounding_dataset_per_image_dummy(working_data_loader)


63
Index(['image_files', 'tokens_encoded', 'subject_encoded',
       'predicate_encoded'],
      dtype='object')


In [10]:
## Describe model
torch.backends.cudnn.benchmark = True
# Run on the GPU requested for *this* evaluation run (the `--gpu` value parsed
# into `args_val`), not the GPU id stored in the training-time config `args`.
device = torch.device("cuda:" + str(args_val.gpu))

# Rebuild the network with the hyper-parameters saved at training time.
model = idcmodel(args.backboneVisEnc,args.image_resolution,
                args.context_length,args.vocab_size,sos_token,eos_token,
                args.transformer_heads,args.transformer_layers,
                args.droprate)

# Load the weights on CPU first, then move the whole model to the target device.
loaded_epoch = load_state_dicts(args_val.speaker_checkpoint, map_location='cpu', model=model)
model.to(device)
# Display the value returned by `load_state_dicts` (bound to `loaded_epoch`).
loaded_epoch

20

In [11]:
# Sampling configuration for the caption sampler.
# The length cap and the two drop_* switches are read from the parsed test
# arguments instead of being hard-coded, so they stay consistent with the
# command-line options and with the max length computed from the split.
config = {
    "sampling_rule": "beam",
    "temperature": 1.0,
    "beam_size": BS,
    # int(...) keeps the pickled/printed config a plain Python int even when
    # the value was derived from a pandas/numpy reduction.
    'max_utterance_len': int(args_val.max_utterance_len),
    'drop_unk': args_val.drop_unk,
    'drop_bigrams': args_val.drop_bigrams,
}
# Accumulates [config, generated-captions] pairs for pickling.
final_results = []

In [12]:
print('Sampling with configuration: ', config)

# Re-seed right before sampling (a seed of -1 means "do not seed").
if args.random_seed != -1:
    set_seed(args.random_seed)

# Positional inputs of the sampler, gathered once for readability; the
# sampling options are passed as keyword arguments from `config`.
sampler_inputs = (
    model, modeltype, annotate_loader, vocab,
    device, sos_token, eos_token, and_token, unk_token,
    args.vocab_size,
)
gen_df = versatile_caption_sampler(*sampler_inputs, **config)

# Store the configuration together with its generated captions, then save.
final_results.append([config, gen_df])
print('Done.')
pickle_data(args_val.out_file, final_results)

Sampling with configuration:  {'sampling_rule': 'beam', 'temperature': 1.0, 'beam_size': 1, 'max_utterance_len': 63, 'drop_unk': True, 'drop_bigrams': True}


100%|██████████| 5497/5497 [24:51<00:00,  3.69it/s]


Done.
