In [1]:
#!/usr/bin/env python
# coding: utf-8
"""
#### Code adapted from the source code of ArtEmis dataset paper
####################################################################
Sampling captions from a trained neural-speaker.

The MIT License (MIT)
Originally created at 6/16/20, for Python 3.x
Copyright (c) 2021 Panos Achlioptas (ai.stanford.edu/~optas) & Stanford Geometric Computing Lab
####################################################################
"""
import pprint
import json
import torch
import time
import numpy as np
import os
import os.path as osp
from torch import nn
from termcolor import colored
from torch.utils.tensorboard import SummaryWriter
from model.argument import parse_test_speaker_arguments,set_seed
from model.datasets import preprocess_dataset_capGen,IdCDataset

from ast import literal_eval
import pandas as pd
from model.func_test import read_saved_args,grounding_dataset_per_image_dummy,versatile_caption_sampler
from model.func_test import pickle_data

from model.func_train import load_state_dicts


In [2]:
# Trained speaker to sample from. Names listed by the original author:
#   'CLIPViTB16_full', 'CLIPViTB16_woSG', 'INRN34_full', 'INRN34_woSG',
#   'INViTB16_full', 'INViTB16_woSG'
modelname = 'CLIPViTB16_full'

# The saved config, the checkpoints and the sampled captions of one model are
# all addressed relative to this directory.
model_dir = f'output/{modelname}'
outputfile = osp.join(model_dir, 'fullDB_test.pkl')

# Test-time arguments, written as a command-line style token list.
# NOTE(review): the Python object None given for --sampling-config-file shows
# up as the *string* 'None' in the printed parameters; confirm nothing
# downstream expects a real None or a file path here.
args_val = parse_test_speaker_arguments([
    '-speaker-saved-args', osp.join(model_dir, 'config.json.txt'),
    # other checkpoint names noted by the author: model_epoch_20.pt, last_model
    '-speaker-checkpoint', osp.join(model_dir, 'checkpoints/best_model.pt'),
    '-out-file', outputfile,
    '-img-dir', '',
    '--sampling-config-file', None,
    '--split', 'test',
    '--gpu', '0',
])


Parameters Specified:
{'drop_bigrams': True,
 'drop_unk': True,
 'gpu': '0',
 'img_dir': '',
 'max_utterance_len': None,
 'out_file': 'output/CLIPViTB16_full/fullDB_test.pkl',
 'out_file_full': None,
 'random_seed': 2021,
 'sampling_config_file': 'None',
 'speaker_checkpoint': 'output/CLIPViTB16_full/checkpoints/best_model.pt',
 'speaker_saved_args': 'output/CLIPViTB16_full/config.json.txt',
 'split': 'test'}




In [3]:
# Restore the arguments stored next to the checkpoint; later cells read the
# data directory, model hyper-parameters and vocabulary size from them.
args = read_saved_args(args_val.speaker_saved_args)
modeltype, use_vocabFAM = args.modeltype, args.use_vocabFAM
pprint.pprint(vars(args))

{'FreezeCase': 0,
 'accum_iter': 2,
 'backboneVisEnc': 'ViTB16',
 'batch_size': 32,
 'context_length': 65,
 'data_dir': '../Dataset/ArtEmis/ArtEmis_IdC',
 'debug': False,
 'droprate': 0.0,
 'gpu': '0',
 'image_resolution': 224,
 'img_dir': '../Dataset/ArtEmis/ArtEmis_IdC/Images/CLIP_224',
 'lr_others': 0.001,
 'lr_patience': 2,
 'lr_textEnc': 0.001,
 'lr_visEnc': 1e-07,
 'max_epochs': 200,
 'modeltype': 'full',
 'output_dir': 'output2/CLIPViTB16_full',
 'random_seed': 2021,
 'save_each_epoch': False,
 'train_patience': 5,
 'transformer_heads': 8,
 'transformer_layers': 8,
 'use_timestamp': False,
 'use_vocabFAM': True,
 'vocab_size': 10506}


In [4]:
## Load dataset
# The annotation table is read from the data directory recorded in the saved
# training arguments.
file_name = 'ArtEmis_IdCI.csv'
df = pd.read_csv(
    osp.join(args.data_dir, file_name)
)
print('Loaded {} captions!!!'.format(len(df)))

Loaded 100393 captions!!!


In [5]:
# Bind `idcmodel` to the architecture class matching the saved `modeltype`.
# Unsupported values are rejected up front.
if modeltype not in ('full', 'woSG'):
    raise ValueError(f"Do not support modeltype = {modeltype}!!!")

if modeltype == 'full':
    from model.model import fullArc as idcmodel
else:  # modeltype == 'woSG'
    from model.model import woSGArc as idcmodel

In [6]:
# Select the vocabulary / tokenizer and the special token ids that the
# sampler needs, depending on how the speaker was trained.
if not use_vocabFAM:
    # Use original vocabulary of CLIP
    from model.clip.simple_tokenizer import SimpleTokenizer as _Tokenizer
    from model.func_test import get_highest_prob_capt_CLIP as get_highest_prob_capt
    vocab = _Tokenizer()
    sos_token = args.vocab_size - 2  # sos_token is the second-to-last id
    eos_token = args.vocab_size - 1  # eos_token is the last id
    and_token = vocab.encode('and')[0]
    unk_token = []
    # Later cells read `tokens_encoded`, so point it at the CLIP token column.
    df['tokens_encoded'] = df['CLIP_tokens']
else:
    from model.vocabulary import Vocabulary
    from model.func_test import get_highest_prob_capt
    vocab = Vocabulary.load(osp.join(args.data_dir, 'ArtEmis_IdCI_Vocab.pkl'))
    # NOTE(review): sos/eos ids are hard-coded; presumably they match the
    # special-symbol ids of the loaded Vocabulary -- confirm.
    sos_token = 1
    eos_token = 2
    and_token = vocab('and')
    unk_token = vocab.unk

# Show the resolved ids as the cell output.
sos_token, eos_token, and_token, unk_token

(1, 2, 30, 3)

In [7]:
# The encoded columns are read from the CSV as strings holding Python
# literals (presumably lists of token ids) and must be parsed before use.
def _parse_encoded(cell):
    """Parse `cell` with `literal_eval` when it is still a string.

    Values that are not strings are returned unchanged, so this cell can be
    re-run on an already-parsed frame without `literal_eval` raising a
    ValueError on non-string input.
    """
    return literal_eval(cell) if isinstance(cell, str) else cell


for _col in ('tokens_encoded', 'subject_encoded', 'predicate_encoded'):
    df[_col] = df[_col].apply(_parse_encoded)

# Build the per-split data loaders from the parsed frame and the saved
# training arguments; the second return value is not used here.
data_loaders, _ = preprocess_dataset_capGen(df, args)
for _split, _purpose in (('train', 'training'), ('val', 'validation'), ('test', 'testing')):
    print('Will use {} annotations for {}.'.format(len(data_loaders[_split].dataset), _purpose))

Will use 75509 annotations for training.
Will use 9000 annotations for validation.
Will use 15884 annotations for testing.


In [8]:
# Loader of the split selected by the test-time arguments.
working_data_loader = data_loaders[args_val.split]

if args_val.max_utterance_len is None:
    # No explicit limit was given: use the maximum length in the underlying split.
    def utterance_len(tokens, eos_token=eos_token):
        """Return the index of the first `eos_token` in `tokens`, minus one.

        The minus one removes the sos token from the count. Raises IndexError
        when `tokens` contains no `eos_token`.
        """
        first_eos = np.nonzero(np.asarray(tokens) == eos_token)[0][0]
        return first_eos - 1  # -1 to remove sos

    args_val.max_utterance_len = (
        working_data_loader.dataset.tokens_encoded
        .apply(utterance_len)
        .max()
    )
    print(args_val.max_utterance_len)

# Loader that the caption sampler iterates over.
# presumably one entry per image rather than per caption -- confirm in
# grounding_dataset_per_image_dummy.
annotate_loader = grounding_dataset_per_image_dummy(working_data_loader)


63
Index(['image_files', 'tokens_encoded', 'subject_encoded',
       'predicate_encoded'],
      dtype='object')


In [9]:
## Describe model
torch.backends.cudnn.benchmark = True
# Use the GPU requested for this evaluation run (`--gpu` of the test-time
# arguments). The id stored in the saved training arguments only describes
# the machine the model was trained on and would make `--gpu` a dead option.
device = torch.device("cuda:" + str(args_val.gpu))

# Rebuild the architecture from the hyper-parameters saved at training time
# so that the checkpoint's state dict fits the module.
model = idcmodel(args.backboneVisEnc, args.image_resolution,
                 args.context_length, args.vocab_size, sos_token, eos_token,
                 args.transformer_heads, args.transformer_layers,
                 args.droprate)

# Weights are mapped to CPU while loading and moved to the target device
# afterwards.
# NOTE(review): the return value is presumably the epoch stored in the
# checkpoint -- confirm in load_state_dicts.
loaded_epoch = load_state_dicts(args_val.speaker_checkpoint, map_location='cpu', model=model)
model.to(device)
loaded_epoch

16

In [10]:
# Decoding configuration handed to the caption sampler.
# The length limit and the two filtering switches are taken from the parsed
# test-time arguments instead of being repeated as literals, so they cannot
# drift apart when a different split or flag is chosen.
config = {
    "sampling_rule": "beam",
    "temperature": 1.0,
    "beam_size": 1,
    # int() so a NumPy integer (as produced by a Series .max()) is stored and
    # printed as a plain Python int.
    'max_utterance_len': int(args_val.max_utterance_len),
    'drop_unk': args_val.drop_unk,
    'drop_bigrams': args_val.drop_bigrams,
}
# Collects [config, sampled-captions] pairs before they are pickled.
final_results = []

In [11]:
print('Sampling with configuration: ', config)

# A seed of -1 means "do not seed".
# NOTE(review): the seed is read from the saved training arguments, not from
# the test-time arguments -- confirm this is intended.
if args.random_seed != -1:
    set_seed(args.random_seed)

# NOTE: from here on `df` no longer refers to the annotation table loaded
# earlier; it is rebound to whatever the sampler returns.
df = versatile_caption_sampler(
    model, modeltype, annotate_loader, vocab,
    device, sos_token, eos_token, and_token, unk_token,
    args.vocab_size,
    **config,
)

# Keep the configuration next to its results and write everything collected
# so far to the output file.
final_results.append([config, df])
print('Done.')
pickle_data(args_val.out_file, final_results)

Sampling with configuration:  {'sampling_rule': 'beam', 'temperature': 1.0, 'beam_size': 1, 'max_utterance_len': 63, 'drop_unk': True, 'drop_bigrams': True}


100%|██████████| 2497/2497 [04:52<00:00,  8.54it/s]


Done.


In [12]:
# Display the frame returned by the caption sampler (rendered by the notebook
# as this cell's last expression).
df

Unnamed: 0,art_style,painting,captions_predicted,prefs_predicted
0,Northern_Renaissance,robert-campin_saint-veronica-displaying-the-su...,the woman looks like a jester since and is a red,the woman
1,Impressionism,william-merritt-chase_portrait-of-harriet-hubb...,the woman looks like she is wearing a lot,the woman
2,Impressionism,willard-metcalf_passing-summer,the colors are bright and remind me of a summer,the colors are bright and
3,Expressionism,salvador-dali_filius-prodigus-1964,the woman looks like she is in pain and the ma...,the woman
4,Impressionism,ilya-mashkov_bakhchisarai-khan-s-palace-1925,the blue sky looks like a beautiful place to t...,the blue sky
...,...,...,...,...
2492,Post_Impressionism,vincent-van-gogh_standing-female-nude-seen-fro...,the woman looks like she is in a mermaid,the woman
2493,Impressionism,gustave-caillebotte_the-garden,the flowers look like they are in a nice,the flowers
2494,Northern_Renaissance,hieronymus-bosch_triptych-of-last-judgement,the man looks like he is about to die,the man
2495,Symbolism,gustave-moreau_the-unicorns,the women look like they are having fun and th...,the women
