In [1]:
import json
import torch
import torch.nn.functional as F
import torch.utils.data
import torchvision.transforms as transforms
from PIL import Image
from accelerate import Accelerator
from torch.utils.data import Dataset
from transformers import AdamW, get_linear_schedule_with_warmup
from transformers import GPT2Tokenizer, AutoConfig
from models.clip import ImageEncoder
from models.gpt import GPT2LMHeadModel
from utils import data_utils

In [3]:
# Sanity check: list the names exposed by utils.data_utils; the printed output
# lets the reader confirm that helpers such as proc_ans / proc_ques are present.
# NOTE(review): "modlule_contents" is misspelled; left unchanged in case other cells read it.
modlule_contents = dir(data_utils)
print(modlule_contents)

['__builtins__', '__cached__', '__doc__', '__file__', '__loader__', '__name__', '__package__', '__spec__', 'articles', 'comma_strip', 'contractions', 'manual_map', 'period_strip', 'prep_ans', 'proc_ans', 'proc_ques', 'process_digit_article', 'process_punctuation', 'punct', 're']


In [25]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")


def change_requires_grad(model, req_grad):
    """Assign ``req_grad`` to the ``requires_grad`` flag of every parameter of ``model``.

    Args:
        model: object exposing ``parameters()`` (e.g. a ``torch.nn.Module``).
        req_grad: value written to each parameter's ``requires_grad`` attribute.
    """
    for param in model.parameters():
        param.requires_grad = req_grad

def load_checkpoint(ckpt_path, epoch):
    """Restore the training state that ``save_checkpoint`` wrote for ``epoch``.

    Reads the module-level ``device`` and ``learning_rate`` and calls the
    module-level ``get_optimizer``.

    Args:
        ckpt_path: path prefix of the checkpoint files; it is concatenated
            directly with the file/folder names, so it must end with a separator.
        epoch: epoch number whose model and optimizer/scheduler stats are loaded.

    Returns:
        Tuple ``(tokenizer, model, optimizer, scheduler_dic, start_epoch)`` where
        ``scheduler_dic`` is the saved scheduler state dict and ``start_epoch``
        is the saved epoch plus one.
    """
    model_name = 'nle_model_{}'.format(str(epoch))
    # save_checkpoint only writes the tokenizer for epoch 0, hence the fixed suffix.
    tokenizer_name = 'nle_gpt2_tokenizer_0'
    filename = 'ckpt_stats_' + str(epoch) + '.tar'

    tokenizer = GPT2Tokenizer.from_pretrained(ckpt_path + tokenizer_name)  # load tokenizer
    model = GPT2LMHeadModel.from_pretrained(ckpt_path + model_name).to(device)  # load pretrained_model with config

    # Deserialize onto the CPU so a checkpoint written on a CUDA machine can be
    # restored on a CPU-only one, and so the saved tensors are not duplicated in
    # GPU memory; optimizer.load_state_dict then places the state alongside the
    # parameters it belongs to.
    opt = torch.load(ckpt_path + filename, map_location='cpu')
    optimizer = get_optimizer(model, learning_rate)
    optimizer.load_state_dict(opt['optimizer_state_dict'])
    start_epoch = opt['epoch'] + 1
    scheduler_dic = opt['scheduler']

    # Drop the raw checkpoint dict and release cached CUDA blocks.
    del opt
    torch.cuda.empty_cache()

    return tokenizer, model, optimizer, scheduler_dic, start_epoch

In [26]:
def load_pretrained():
    """Load the tokenizer and language model stored under ``pretrained_model/``.

    Returns:
        Tuple ``(tokenizer, model)``; the model has been moved to the
        module-level ``device``.
    """
    pretrained_tokenizer = GPT2Tokenizer.from_pretrained('pretrained_model/pretrain_tokenizer_0')
    pretrained_lm = GPT2LMHeadModel.from_pretrained('pretrained_model/pretrain_model').to(device)
    return pretrained_tokenizer, pretrained_lm


def save_checkpoint(epoch, unwrapped_model, optimizer, tokenizer, scheduler, ckpt_path, **kwargs):
    """Write the model, optimizer/scheduler state and (for epoch 0) the tokenizer.

    Uses the module-level ``accelerator`` as the save function for the model.

    Args:
        epoch: epoch number used as suffix of every file/folder name.
        unwrapped_model: model exposing ``save_pretrained``.
        optimizer: optimizer whose ``state_dict()`` is stored.
        tokenizer: tokenizer exposing ``save_pretrained``; written only when ``epoch == 0``.
        scheduler: scheduler whose ``state_dict()`` is stored.
        ckpt_path: path prefix, concatenated directly with the file/folder names.
        **kwargs: extra entries merged into the saved stats dict.
    """
    epoch_tag = str(epoch)
    model_dir = ckpt_path + 'nle_model_{}'.format(epoch_tag)
    tokenizer_dir = ckpt_path + 'nle_gpt2_tokenizer_{}'.format(epoch_tag)
    stats_file = ckpt_path + 'ckpt_stats_' + epoch_tag + '.tar'

    if epoch == 0:
        tokenizer.save_pretrained(tokenizer_dir)  # save tokenizer

    unwrapped_model.save_pretrained(model_dir, save_function=accelerator.save)

    stats = {'epoch': epoch,
             'optimizer_state_dict': optimizer.state_dict(),
             'scheduler': scheduler.state_dict()}
    stats.update(kwargs)  # caller-supplied entries win on key collisions

    torch.save(stats, stats_file)


In [30]:
class MultiVQAXTrainDataset(Dataset):
    """Map-style training dataset with one item per (question, explanation) pair.

    The JSON file at ``path`` maps question ids to records with the keys
    ``image_name``, ``question``, ``answers`` and ``explanation`` (a list).
    A question with several explanations appears once per explanation.

    The explanation served for a given index is fixed at construction time
    (``exp_idx_list``). It therefore does not depend on how often or in which
    order items are fetched, so every explanation is seen in every epoch.

    Args:
        path: path of the JSON annotation file.
        transform: callable applied to the RGB ``PIL.Image`` of each sample.
        tokenizer: tokenizer providing ``tokenize``, ``convert_tokens_to_ids``
            and the ``bos_token`` / ``eos_token`` / ``pad_token`` attributes.
        max_seq_len: fixed length every sequence is truncated or padded to.
        image_dir: prefix concatenated directly with ``image_name``; must end
            with a path separator. Defaults to ``'images/'``.
    """

    def __init__(self, path, transform, tokenizer, max_seq_len, image_dir='images/'):

        self.tokenizer = tokenizer
        self.transform = transform
        self.max_seq_len = max_seq_len  # question + <bos> The answer is <answer> because <explanation> <eos>
        self.image_dir = image_dir

        # Context manager so the file handle is closed right after parsing.
        with open(path, 'r') as f:
            self.data = json.load(f)

        # ids_list[i] / exp_idx_list[i] give the question id and the explanation
        # index served for dataset index i. Every question first gets explanation 0.
        self.ids_list = list(self.data.keys())
        self.exp_idx_list = [0] * len(self.ids_list)

        for k, v in self.data.items():
            extra = len(v['explanation']) - 1
            if extra > 0:  # some questions have more than one explanation
                # duplicate the id once per additional explanation (indices 1..extra)
                self.ids_list += [str(k)] * extra
                self.exp_idx_list += list(range(1, extra + 1))

    def __getitem__(self, i):
        """Return ``(img, qid, input_ids, labels, segment_ids)`` for index ``i``.

        ``input_ids``, ``labels`` and ``segment_ids`` are ``torch.long`` tensors
        of length ``max_seq_len``; ``qid`` is a one-element ``LongTensor``.

        Raises:
            ValueError: if ``answers`` is neither a string nor a non-empty list
                whose first element contains the key ``'answer'``.
        """
        question_id = self.ids_list[i]
        exp_idx = self.exp_idx_list[i]  # which explanation this index stands for
        sample = self.data[question_id]
        img_name = sample['image_name']
        text_a = sample['question']  # question
        answer = sample['answers']
        text_b = sample['explanation'][exp_idx]  # explanation

        # tokenization process
        q_segment_id, a_segment_id, e_segment_id = self.tokenizer.convert_tokens_to_ids(['<question>',
                                                                                         '<answer>',
                                                                                         '<explanation>'])
        tokens = self.tokenizer.tokenize(text_a)
        labels = [-100] * len(tokens)  # we do not want to predict the question, set to -100 to ignore in XE
        segment_ids = [q_segment_id] * len(tokens)

        if isinstance(answer, str):
            answer_text = answer
        elif isinstance(answer, list) and len(answer) > 0 and 'answer' in answer[0]:
            answer_text = answer[0]['answer']
        else:
            # Continuing with an unparsed answer would silently build a corrupt
            # example (or fail later with an unrelated error), so fail here.
            raise ValueError(
                "Unsupported 'answers' structure for question {}: {!r}".format(question_id, answer))
        answer = [self.tokenizer.bos_token] + self.tokenizer.tokenize(" the answer is " + answer_text)

        answer_len = len(answer)
        tokens_b = self.tokenizer.tokenize(" because " + text_b) + [self.tokenizer.eos_token]
        exp_len = len(tokens_b)
        tokens += answer + tokens_b
        # The <bos> position is not a target. Per the original author's note the
        # model shifts the labels itself, so they mirror the tokens here.
        labels += [-100] + answer[1:] + tokens_b
        segment_ids += [a_segment_id] * answer_len
        segment_ids += [e_segment_id] * exp_len

        if len(tokens) > self.max_seq_len:
            tokens = tokens[:self.max_seq_len]
            labels = labels[:self.max_seq_len]
            segment_ids = segment_ids[:self.max_seq_len]

        assert len(tokens) == len(segment_ids)
        assert len(tokens) == len(labels)

        # Right-pad everything to max_seq_len; padded positions are ignored by the loss.
        seq_len = len(tokens)
        padding_len = self.max_seq_len - seq_len
        tokens = tokens + ([self.tokenizer.pad_token] * padding_len)
        labels = labels + ([-100] * padding_len)

        segment_ids += ([e_segment_id] * padding_len)
        input_ids = self.tokenizer.convert_tokens_to_ids(tokens)
        input_ids = torch.tensor(input_ids, dtype=torch.long)

        # labels still hold token strings except for the -100 markers
        labels = [self.tokenizer.convert_tokens_to_ids(t) if t != -100 else t for t in labels]
        labels = torch.tensor(labels, dtype=torch.long)

        segment_ids = torch.tensor(segment_ids, dtype=torch.long)

        img_path = self.image_dir + img_name
        img = Image.open(img_path).convert('RGB')
        img = self.transform(img)
        qid = torch.LongTensor([int(question_id)])

        return (img, qid, input_ids, labels, segment_ids)

    def __len__(self):
        """Number of (question, explanation) pairs."""
        return len(self.ids_list)

In [28]:
def get_optimizer(model, learning_rate):
    """Build an ``AdamW`` optimizer with two parameter groups.

    Parameters whose name contains ``'bias'`` or ``'LayerNorm.weight'`` get a
    weight decay of 0.0; all others use the module-level ``weight_decay``.

    Args:
        model: object exposing ``named_parameters()``.
        learning_rate: learning rate passed to ``AdamW``.

    Returns:
        The constructed ``AdamW`` optimizer.
    """
    # NOTE(review): this model's layer norms appear to be named 'ln_*'
    # (e.g. 'ln_cross_attn.weight'), which 'LayerNorm.weight' does not match —
    # confirm whether they should be exempt from decay.
    no_decay = ['bias', 'LayerNorm.weight']

    decayed, undecayed = [], []
    for name, param in model.named_parameters():
        is_exempt = any(pattern in name for pattern in no_decay)
        (undecayed if is_exempt else decayed).append(param)

    param_groups = [
        {'params': decayed, 'weight_decay': weight_decay},
        {'params': undecayed, 'weight_decay': 0.0},
    ]
    return AdamW(param_groups, lr=learning_rate)

In [32]:
# ---------------------------------------------------------------------------
# Training script: configuration, model/tokenizer setup, data loading and the
# training loop. One checkpoint is written after every epoch.
# ---------------------------------------------------------------------------
accelerator = Accelerator()
device = accelerator.device

finetune_pretrained = False  # if True, finetunes from the image captioning pretrained_model
img_size = 224
ckpt_path = 'ckpts/'
nle_data_train_path = 'nle_data/nle_data_train.json'
max_seq_len = 40
load_from_epoch = None
no_sample = True
top_k = 0
top_p = 0.9
# per GPU; on small-memory GPUs lower this and raise gradient_accumulation_steps
# to keep the same effective batch size
batch_size = 32
num_train_epochs = 30
weight_decay = 0
learning_rate = 2e-5 if not finetune_pretrained else 1e-5
gradient_accumulation_steps = 1
start_epoch = 0
temperature = 1

# The image encoder is used as a frozen feature extractor.
image_encoder = ImageEncoder(device).to(device)
change_requires_grad(image_encoder, False)

if load_from_epoch is not None:
    # Resume: model, optimizer and scheduler state come from the checkpoint.
    tokenizer, model, optimizer, scheduler_dic, start_epoch = load_checkpoint(ckpt_path, load_from_epoch)

else:

    if finetune_pretrained:
        tokenizer, model = load_pretrained()
        optimizer = get_optimizer(model, learning_rate)
    else:
        tokenizer = GPT2Tokenizer.from_pretrained('distilgpt2')
        orig_num_tokens = len(tokenizer.encoder)

        num_new_tokens = tokenizer.add_special_tokens({'pad_token': '<pad>',
                                                       'additional_special_tokens': ['<question>', '<answer>',
                                                                                     '<explanation>']})

        assert len(tokenizer) == orig_num_tokens + num_new_tokens
        config = AutoConfig.from_pretrained('distilgpt2')

        # Add configs
        config.img_size = img_size
        config.max_seq_len = max_seq_len
        config.add_cross_attention = True

        model = GPT2LMHeadModel.from_pretrained('distilgpt2', config=config)
        model.resize_token_embeddings(len(tokenizer))  # make room for the added special tokens
        model = model.to(device)
        optimizer = get_optimizer(model, learning_rate)

print("Model Setup Ready...")

img_transform = transforms.Compose([transforms.Resize((img_size, img_size)),
                                    transforms.ToTensor(),
                                    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])])

train_dataset = MultiVQAXTrainDataset(path=nle_data_train_path,
                                      transform=img_transform,
                                      tokenizer=tokenizer,
                                      max_seq_len=max_seq_len)

train_loader = torch.utils.data.DataLoader(train_dataset,
                                           batch_size=batch_size,
                                           shuffle=True,
                                           pin_memory=True)

# Optimizer updates per epoch: one per full accumulation window plus one for a
# trailing partial window (ceil division), matching the update rule in the loop.
updates_per_epoch = (len(train_loader) + gradient_accumulation_steps - 1) // gradient_accumulation_steps
t_total = updates_per_epoch * num_train_epochs
warmup_steps = 0  # 0.10 * t_total
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=warmup_steps, num_training_steps=t_total)

if load_from_epoch is not None:
    scheduler.load_state_dict(scheduler_dic)

for epoch in range(start_epoch, num_train_epochs):

    model.train()
    accum_loss = 0

    for step, batch in enumerate(train_loader):

        batch = tuple(input_tensor.to(device) for input_tensor in batch)
        img, _, input_ids, labels, segment_ids = batch

        # The encoder is frozen, so no autograd graph is needed for its forward pass.
        with torch.no_grad():
            img_embeddings = image_encoder(img)

        outputs = model(input_ids=input_ids,
                        past_key_values=None,
                        attention_mask=None,
                        token_type_ids=segment_ids,
                        position_ids=None,
                        encoder_hidden_states=img_embeddings,
                        encoder_attention_mask=None,
                        labels=labels,
                        use_cache=False,
                        return_dict=True)

        loss = outputs.loss
        loss = loss / gradient_accumulation_steps  # average over the accumulation window
        accelerator.backward(loss)
        accum_loss += loss.item()

        # Update after every `gradient_accumulation_steps` micro-batches. Counting
        # from step + 1 keeps the first update from firing after a single
        # micro-batch. The last batch of the epoch always triggers an update so
        # no gradients leak into the next epoch.
        is_update_step = ((step + 1) % gradient_accumulation_steps == 0
                          or step == len(train_loader) - 1)
        if is_update_step:
            optimizer.step()
            scheduler.step()
            optimizer.zero_grad()
            accelerator.print("\rEpoch {} / {}, Iter {} / {}, Loss: {:.3f}".format(epoch,
                                                                                   num_train_epochs,
                                                                                   step, len(train_loader),
                                                                                   accum_loss),
                              end='          ')
            accum_loss = 0


    save_checkpoint(epoch, model, optimizer, tokenizer, scheduler, ckpt_path)

Some weights of GPT2LMHeadModel were not initialized from the model checkpoint at distilgpt2 and are newly initialized: ['transformer.h.2.crossattention.c_proj.bias', 'transformer.h.3.crossattention.q_attn.weight', 'transformer.h.3.crossattention.masked_bias', 'transformer.h.2.crossattention.masked_bias', 'transformer.h.4.crossattention.c_proj.bias', 'transformer.h.1.crossattention.c_proj.bias', 'transformer.h.1.crossattention.c_attn.weight', 'transformer.h.5.crossattention.bias', 'transformer.h.5.crossattention.c_proj.weight', 'transformer.h.1.crossattention.q_attn.weight', 'transformer.h.0.crossattention.masked_bias', 'transformer.h.1.ln_cross_attn.weight', 'transformer.h.5.crossattention.q_attn.weight', 'transformer.h.0.crossattention.c_proj.bias', 'transformer.h.2.crossattention.q_attn.weight', 'transformer.h.4.crossattention.q_attn.weight', 'transformer.h.2.ln_cross_attn.weight', 'transformer.h.3.crossattention.c_proj.bias', 'transformer.h.1.crossattention.masked_bias', 'transform

OutOfMemoryError: CUDA out of memory. Tried to allocate 20.00 MiB. GPU 0 has a total capacty of 2.00 GiB of which 0 bytes is free. Of the allocated memory 1.33 GiB is allocated by PyTorch, and 3.82 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF