In [6]:
from transformers import OpenAIGPTDoubleHeadsModel, OpenAIGPTTokenizer
import torch

In [7]:
model = OpenAIGPTDoubleHeadsModel.from_pretrained('openai-gpt')
tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')

In [8]:
SPECIAL_TOKENS = {"bos_token": "<bos>", "eos_token": "<eos>", 
                  "additional_special_tokens": ["<speaker1>", "<speaker2>"],
                  "pad_token": "<pad>"}

num_added_toks = tokenizer.add_special_tokens(SPECIAL_TOKENS)
model.resize_token_embeddings(len(tokenizer))
#model.set_num_special_tokens(len(SPECIAL_TOKENS))

Embedding(40483, 768)

In [14]:
from itertools import chain

persona = [["i", "like", "playing", "football", "."],
           ["i", "am", "from", "NYC", "."]]
history = [["hello", "how", "are", "you", "?"],
           ["i", "am", "fine", "thanks", "."]]
reply = ["great", "to", "hear"]
bos, eos, speaker1, speaker2 = "<bos>", "<eos>", "<speaker1>", "<speaker2>"

def build_inputs(persona, history, reply):
    sequence = [[bos] + list(chain(*persona))] + history + [reply + [eos]]
    sequence = [sequence[0]] + [ [speaker2 if (len(sequence)-i) % 2 else speaker1] + s
                                    for i, s in enumerate(sequence[1:])]

    '''
    [['<bos>',
      'i',
      'like',
      'playing',
      'football',
      '.',
      'i',
      'am',
      'from',
      'NYC',
      '.'],
     ['<speaker1>', 'hello', 'how', 'are', 'you', '?'],
     ['<speaker2>', 'i', 'am', 'fine', 'thanks', '.'],
     ['<speaker1>', 'great', 'to', 'hear', '<eos>']]
    '''
    words = list(chain(*sequence)) #a flat list with special tokens
    segments = [speaker2 if i % 2 else speaker1             # segment tokens
                    for i, s in enumerate(sequence) for _ in s]
    position = list(range(len(words)))
    
    return words, segments, position, sequence

words, segments, position, sequence = build_inputs(persona, history, reply)
# >>> print(sequence)  # Our inputs looks like this:
# [['<bos>', 'i', 'like', 'playing', 'football', '.', 'i', 'am', 'from', 'NYC', '.'],
#  ['<speaker1>', 'hello', 'how', 'are', 'you', '?'],
#  ['<speaker2>', 'i', 'am', 'fine', 'thanks', '.'],
#  ['<speaker1>', 'great', 'to', 'hear', '<eos>']]
#>>> print(words)
#['<bos>', 'i', 'like', 'playing', 'football', '.', 'i', 'am', 'from', 'NYC', '.', '<speaker1>', 'hello', 'how', 'are', 'you', '?', '<speaker2>', 'i', 'am', 'fine', 'thanks', '.', '<speaker1>', 'great', 'to', 'hear', '<eos>']
#>>> print(segments)
#['<speaker1>', '<speaker1>', '<speaker1>', '<speaker1>', '<speaker1>', '<speaker1>', '<speaker1>', '<speaker1>', '<speaker1>', '<speaker1>', '<speaker1>', '<speaker2>', '<speaker2>', '<speaker2>', '<speaker2>', '<speaker2>', '<speaker2>', '<speaker1>', '<speaker1>', '<speaker1>', '<speaker1>', '<speaker1>', '<speaker1>', '<speaker2>', '<speaker2>', '<speaker2>', '<speaker2>', '<speaker2>']
#>>> print(position)
#[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27]

In [15]:
words = tokenizer.convert_tokens_to_ids(words)
segments = tokenizer.convert_tokens_to_ids(segments)

In [21]:
distractor = ["sorry", "to", "hear", "that"]

words_distractor, segments_distractor, _, _ = build_inputs(persona, history, distractor)
words_distractor = tokenizer.convert_tokens_to_ids(words_distractor)
segments_distractor = tokenizer.convert_tokens_to_ids(segments_distractor)

lm_targets = ([-100] * sum(len(s) for s in sequence[:-1])) \
             + [-100] + tokenizer.convert_tokens_to_ids(sequence[-1][1:])
#[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 5201, 571, 863, 40479]
lm_distractor = [-100] * len(words_distractor)

last_token = len(words) - 1 #<eos>
last_token_distractor = len(words_distractor) - 1

padding_length = max(len(words), len(words_distractor))

def pad(x, padding):
    return x + [padding] * (padding_length - len(x))

(words, words_distractor,
 segments, segments_distractor) = [pad(x, tokenizer.convert_tokens_to_ids('<pad>'))
                                   for x in (words, words_distractor,
                                             segments, segments_distractor)]

(lm_targets, lm_distractor) = [pad(x, -100) for x in (lm_targets, lm_distractor)]
 
input_ids = torch.tensor([[words, words_distractor]], dtype=torch.long) #torch.Size([1, 2, 29])

token_type_ids = torch.tensor([[segments, segments_distractor]], dtype=torch.long)

mc_token_ids = torch.tensor([[last_token, last_token_distractor]], dtype=torch.long) #tensor([[28, 28]])
# Language modeling labels
lm_labels = torch.tensor([[lm_targets, lm_distractor]], dtype=torch.long)
# Next-sentence prediction labels
mc_labels = torch.tensor([0], dtype=torch.long)  # Gold reply is 1st (index 0)

lm_logits, mc_logits, *_ = model(input_ids=input_ids, 
                         mc_token_ids=mc_token_ids, 
#                          lm_labels=lm_labels, 
#                          mc_labels=mc_labels, 
                         token_type_ids=token_type_ids)
# mc_token_ids, lm_labels.shape, mc_labels.shape

In [37]:
torch.max(mc_logits, 1)[1], mc_labels

(tensor([0]), tensor([0]))

In [95]:
lm_coef = 2.0
mc_coef = 1.0
total_loss = lm_loss * lm_coef + mc_loss * mc_coef

In [1]:
import os
import math
import logging
from pprint import pformat
from argparse import ArgumentParser
from collections import defaultdict
from itertools import chain

import torch
from torch.nn.parallel import DistributedDataParallel
from torch.utils.data import DataLoader, TensorDataset, RandomSampler, SequentialSampler
from ignite.engine import Engine, Events
from ignite.handlers import ModelCheckpoint
from ignite.metrics import Accuracy, Loss, MetricsLambda, RunningAverage
from ignite.contrib.handlers import ProgressBar, PiecewiseLinear
from ignite.contrib.handlers.tensorboard_logger import TensorboardLogger, OutputHandler, OptimizerParamsHandler
from transformers import (AdamW, OpenAIGPTDoubleHeadsModel, OpenAIGPTTokenizer,
                                  GPT2DoubleHeadsModel, GPT2Tokenizer, WEIGHTS_NAME, CONFIG_NAME)

from transfer_learning_conv_ai.utils import get_dataset, make_logdir
from tqdm import tqdm, trange

In [2]:
class args:
    dataset_path=''
    dataset_cache='./dataset_cache'
    model_checkpoint='openai-gpt'
    output_dir = ''
    num_candidates=2
    max_history=2
    per_gpu_train_batch_size=4
    per_gpu_eval_batch_size=1
    gradient_accumulation_steps=8
    lr=6.25e-5
    lm_coef=1.0
    mc_coef=1.0
    max_norm=1.0
    #n_epochs=3
    personality_permutations=1
    eval_before_start=True
    device="cuda" if torch.cuda.is_available() else "cpu"
    fp16='O1'
    local_rank=-1
    distributed = (local_rank != -1)
    max_step = -1
    num_train_epochs = 3
    evaluate_during_training = True
    logging_steps = 8
    n_gpu=1
    max_steps = -1
    tpu = False
    max_grad_norm = 1.0

args=args 

In [3]:
SPECIAL_TOKENS = ["<bos>", "<eos>", "<speaker1>", "<speaker2>", "<pad>"]
ATTR_TO_SPECIAL_TOKEN = {"bos_token": "<bos>", "eos_token": "<eos>", 
                  "additional_special_tokens": ["<speaker1>", "<speaker2>"],
                  "pad_token": "<pad>"}
MODEL_INPUTS = ["input_ids", "mc_token_ids", "lm_labels", "mc_labels", "token_type_ids"]
PADDED_INPUTS = ["input_ids", "lm_labels", "token_type_ids"]

logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
logger.warning("Running process %d", args.local_rank)  # This is a logger.warning: it will be printed by all distributed processes
logger.info("Arguments: %s", pformat(args))
    
# def average_distributed_scalar(scalar, args):
#     """ Average a scalar over the nodes if we are in distributed training. We use this for distributed evaluation. """
#     if args.local_rank == -1:
#         return scalar
#     scalar_t = torch.tensor(scalar, dtype=torch.float, device=args.device) / torch.distributed.get_world_size()
#     torch.distributed.all_reduce(scalar_t, op=torch.distributed.ReduceOp.SUM)
#     return scalar_t.item()


# num_added_toks = tokenizer.add_special_tokens(SPECIAL_TOKENS)
# model.resize_token_embeddings(len(tokenizer))

INFO:__main__:Arguments: <class '__main__.args'>


In [4]:
logger.info("Prepare tokenizer, pretrained model and optimizer.")
tokenizer_class = GPT2Tokenizer if "gpt2" in args.model_checkpoint else OpenAIGPTTokenizer # cant use Autotokenizer because checkpoint could be a Path
tokenizer = tokenizer_class.from_pretrained(args.model_checkpoint)

model_class = GPT2DoubleHeadsModel if "gpt2" in args.model_checkpoint else OpenAIGPTDoubleHeadsModel
model = model_class.from_pretrained(args.model_checkpoint)
model.to(args.device)

INFO:__main__:Prepare tokenizer, pretrained model and optimizer.
INFO:transformers.tokenization_utils:loading file https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-vocab.json from cache at /home/ubuntu/.cache/torch/transformers/4ab93d0cd78ae80e746c27c9cd34e90b470abdabe0590c9ec742df61625ba310.b9628f6fe5519626534b82ce7ec72b22ce0ae79550325f45c604a25c0ad87fd6
INFO:transformers.tokenization_utils:loading file https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-merges.txt from cache at /home/ubuntu/.cache/torch/transformers/0f8de0dbd6a2bb6bde7d758f4c120dd6dd20b46f2bf0a47bc899c89f46532fde.20808570f9a3169212a577f819c845330da870aeb14c40f7319819fce10c3b76
INFO:transformers.configuration_utils:loading configuration file https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-config.json from cache at /home/ubuntu/.cache/torch/transformers/a27bb7c70e9002d7558d2682d5a95f3c0a8b31034616309459e0b51ef07ade09.bd0797be126548711309ad2174d2afb16e3c37e891707667603d85e35a4ad00

OpenAIGPTDoubleHeadsModel(
  (transformer): OpenAIGPTModel(
    (tokens_embed): Embedding(40478, 768)
    (positions_embed): Embedding(512, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0): Block(
        (attn): Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      )
      (1): Block(
        (attn): Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_1): LayerNorm((768,), eps=1e-05, elementwis

In [5]:
def add_special_tokens_(model, tokenizer):
    """ Add special tokens to the tokenizer and the model if they have not already been added. """
    orig_num_tokens = len(tokenizer.encoder)
    num_added_tokens = tokenizer.add_special_tokens(ATTR_TO_SPECIAL_TOKEN) # doesn't add if they are already there
    if num_added_tokens > 0:
        model.resize_token_embeddings(new_num_tokens=orig_num_tokens + num_added_tokens)
        
add_special_tokens_(model, tokenizer)

INFO:transformers.tokenization_utils:Adding <bos> to the vocabulary
INFO:transformers.tokenization_utils:Assigning <bos> to the bos_token key of the tokenizer
INFO:transformers.tokenization_utils:Adding <eos> to the vocabulary
INFO:transformers.tokenization_utils:Assigning <eos> to the eos_token key of the tokenizer
INFO:transformers.tokenization_utils:Adding <speaker1> to the vocabulary
INFO:transformers.tokenization_utils:Adding <speaker2> to the vocabulary
INFO:transformers.tokenization_utils:Assigning ['<speaker1>', '<speaker2>'] to the additional_special_tokens key of the tokenizer
INFO:transformers.tokenization_utils:Adding <pad> to the vocabulary
INFO:transformers.tokenization_utils:Assigning <pad> to the pad_token key of the tokenizer


In [6]:
# optimizer = AdamW(model.parameters(), lr=args.lr, correct_bias=True)
# if args.fp16:
#     from apex import amp  # Apex is only required if we use fp16 training
#     model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16)

# from transformers import cached_path
# import json

# logger.info("Prepare datasets")
# #train_loader, val_loader, train_sampler, valid_sampler = get_data_loaders(args, tokenizer)

# PERSONACHAT_URL = "https://s3.amazonaws.com/datasets.huggingface.co/personachat/personachat_self_original.json"

# dataset_path = PERSONACHAT_URL

# personachat_file = cached_path(dataset_path)

# with open(personachat_file, "r", encoding="utf-8") as f:
#     dataset = json.loads(f.read())

# def tokenize(obj):
#     if isinstance(obj, str):
#         return tokenizer.convert_tokens_to_ids(tokenizer.tokenize(obj))
#     if isinstance(obj, dict):
#         return dict((n, tokenize(o)) for n, o in obj.items())
#     return list(tokenize(o) for o in obj)

# dataset = tokenize(dataset)

In [6]:
datasets = {"train": defaultdict(list), "valid": defaultdict(list)}

dataset_cache = args.dataset_cache
dataset_cache = dataset_cache + '_' + type(tokenizer).__name__ 

def build_input_from_segments(persona, history, reply, tokenizer, lm_labels=False, with_eos=True):
    """ Build a sequence of input from 3 segments: persona, history and last reply. """
    bos, eos, speaker1, speaker2 = tokenizer.convert_tokens_to_ids(SPECIAL_TOKENS[:-1])
    sequence = [[bos] + list(chain(*persona))] + history + [reply + ([eos] if with_eos else [])]
    sequence = [sequence[0]] + [[speaker2 if (len(sequence)-i) % 2 else speaker1] + s for i, s in enumerate(sequence[1:])]
    #[[bos+persona], [history], [reply+eos]]
    instance = {}
    instance["input_ids"] = list(chain(*sequence))
    instance["token_type_ids"] = [speaker2 if i % 2 else speaker1 for i, s in enumerate(sequence) for _ in s]
    instance["mc_token_ids"] = len(instance["input_ids"]) - 1
    instance["lm_labels"] = [-100] * len(instance["input_ids"])
    if lm_labels: #if the current candidate is lm_labels, [-100]*[len(persona)+len(history)+1(speaker2)]+current candidate
        instance["lm_labels"] = ([-100] * sum(len(s) for s in sequence[:-1])) + [-100] + sequence[-1][1:]
    return instance

def pad_dataset(dataset, padding=0):
    """ Pad the dataset. This could be optimized by defining a Dataset class and padding at the batch level, but this is simpler. """
    max_l = max(len(x) for x in dataset["input_ids"])
    for name in PADDED_INPUTS:
        dataset[name] = [x + [padding if name != "lm_labels" else -100] * (max_l - len(x)) for x in dataset[name]]
    return dataset

def get_data_loaders(args, tokenizer):
    """ Prepare the dataset for training and evaluation """
    personachat = torch.load(dataset_cache)#get_dataset(tokenizer, args.dataset_path, args.dataset_cache)

    logger.info("Build inputs and labels")
    datasets = {"train": defaultdict(list), "valid": defaultdict(list)}
    for dataset_name, dataset in personachat.items():
        num_candidates = len(dataset[0]["utterances"][0]["candidates"]) #n_candidates are same for all 17878 dialogs
        if args.num_candidates > 0 and dataset_name == 'train':
            num_candidates = min(args.num_candidates, num_candidates) #min(2,20)
        for dialog in dataset: #17878 dialogs
            persona = dialog["personality"].copy()
            for _ in range(args.personality_permutations):
                for utterance in dialog["utterances"]: #7个utterances in the first dialog
                    history = utterance["history"][-(2*args.max_history+1):]
                    for j, candidate in enumerate(utterance["candidates"][-num_candidates:]):
                        lm_labels = bool(j == num_candidates-1) #the last sentence in candidate is the correct response
                        instance = build_input_from_segments(persona, history, candidate, tokenizer, lm_labels)
                        #instance["input_ids"] of length of the sequence: bos+persona+history+candiate+eos
                        #, instance["token_type_ids"], instance["mc_token_ids"], instance["lm_labels"] 
                        for input_name, input_array in instance.items():
                            datasets[dataset_name][input_name].append(input_array)
                        #datasets['train']['input_ids'] of [[c1 in u1],[c2 in u1],..,[c2 in u7]] 14 sublists [n_candidate* # of utterances in a dialog] e.g. [2*7]
                        # the first is the sequence with wrong candidate, second is the sequence with correct candidate
                    datasets[dataset_name]["mc_labels"].append(num_candidates - 1) #7
                    datasets[dataset_name]["n_candidates"] = num_candidates #an int =2
                persona = [persona[-1]] + persona[:-1]  # permuted personalities

    logger.info("Pad inputs and convert to Tensor")
    tensor_datasets = {"train": [], "valid": []}
    for dataset_name, dataset in datasets.items():
        dataset = pad_dataset(dataset, padding=tokenizer.convert_tokens_to_ids(SPECIAL_TOKENS[-1]))
        for input_name in MODEL_INPUTS:
            tensor = torch.tensor(dataset[input_name]) #1 dialog->7 or 6 or 8 untterences == 17878 dialogs -> sum(utterances in each dialog) =131438
            #np.sum([len(dialog['utterances']) for dialog in personachat['train']])
            if input_name != "mc_labels":
                tensor = tensor.view((-1, datasets[dataset_name]["n_candidates"]) + tensor.shape[1:])
            tensor_datasets[dataset_name].append(tensor)

    logger.info("Build train and validation dataloaders")
    train_dataset, valid_dataset = TensorDataset(*tensor_datasets["train"]), TensorDataset(*tensor_datasets["valid"])
#     train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset) if args.distributed else None
#     valid_sampler = torch.utils.data.distributed.DistributedSampler(valid_dataset) if args.distributed else None
#     train_loader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size, shuffle=(not args.distributed))
#     valid_loader = DataLoader(valid_dataset, sampler=valid_sampler, batch_size=args.valid_batch_size, shuffle=False)

    logger.info("Train dataset (Batch, Candidates, Seq length): {}".format(train_dataset.tensors[0].shape)) #torch.Size([131438 utterences, 2 candidates, 282 max_lens])
    logger.info("Valid dataset (Batch, Candidates, Seq length): {}".format(valid_dataset.tensors[0].shape))
    return train_dataset, valid_dataset#, train_loader, valid_loader, train_sampler, valid_sampler


In [7]:
train_dataset, valid_dataset = get_data_loaders(args, tokenizer)

INFO:__main__:Build inputs and labels
INFO:__main__:Pad inputs and convert to Tensor
INFO:__main__:Build train and validation dataloaders
INFO:__main__:Train dataset (Batch, Candidates, Seq length): torch.Size([131438, 2, 282])
INFO:__main__:Valid dataset (Batch, Candidates, Seq length): torch.Size([7801, 20, 179])


In [8]:
# args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
# train_sampler = RandomSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset)
# train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size)

# args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
# eval_sampler = SequentialSampler(valid_dataset) if args.local_rank == -1 else DistributedSampler(valid_dataset)
# eval_dataloader = DataLoader(valid_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size)
    
# train_batch = next(iter(train_dataloader))
# eval_batch = next(iter(eval_dataloader))

# batch = tuple(t.to(args.device) for t in eval_batch)
# input_ids, mc_token_ids, lm_labels, mc_labels, token_type_ids = batch

# lm_logits, mc_logits, *_ = model(
#             input_ids, token_type_ids=token_type_ids, mc_token_ids=mc_token_ids,
#         )

# lm_logits_flat_shifted = lm_logits[..., :-1, :].contiguous().view(-1, lm_logits.size(-1))
# lm_labels_flat_shifted = lm_labels[..., 1:].contiguous().view(-1)

# x = ((lm_logits_flat_shifted, mc_logits), (lm_labels_flat_shifted, mc_labels))

# torch.nn.CrossEntropyLoss(ignore_index=-100)(x[0][0], x[1][0]), torch.mean((torch.max(x[0][1], 1)[1] == x[1][1]).float())

In [19]:
def train(args, train_dataset, valid_dataset, model, tokenizer):
    
    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
    train_sampler = RandomSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset)
    train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size)
    
    if args.max_steps > 0:
        t_total = args.max_steps
        args.num_train_epochs = args.max_steps // (len(train_dataloader) // args.gradient_accumulation_steps) + 1
    else:
        t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs
    
    optimizer = AdamW(model.parameters(), lr=args.lr, correct_bias=True)
    scheduler = PiecewiseLinear(optimizer, "lr", [(0, args.lr), (args.num_train_epochs * len(train_dataloader), 0.0)])
    
    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
        model, optimizer = amp.initialize(model, optimizer, opt_level='O1')
    
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)
    
    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank],
                                                          output_device=args.local_rank,
                                                          find_unused_parameters=True)
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_dataset))
    logger.info("  Num Epochs = %d", args.num_train_epochs)
    logger.info("  Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size)
    logger.info("  Total train batch size (w. parallel, distributed & accumulation) = %d",
                   args.train_batch_size * args.gradient_accumulation_steps * (torch.distributed.get_world_size() if args.local_rank != -1 else 1))
    logger.info("  Gradient Accumulation steps = %d", args.gradient_accumulation_steps)
    logger.info("  Total optimization steps = %d", t_total)
    
    global_step = 0
    metrics = {"nll": 10000.0,
           "accuracy": 0.0}
    tr_loss, logging_loss = 0.0, 0.0
    model.zero_grad()
    #train_iterator = trange(int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0])
    
    for _ in range(args.num_train_epochs): #3
        epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0]) #131438
        for step, batch in enumerate(epoch_iterator): #4 utterances
            model.train()
            batch = tuple(t.to(args.device) for t in batch)
            input_ids, mc_token_ids, lm_labels, mc_labels, token_type_ids = batch
            (lm_loss), (mc_loss), *_ = model(
                input_ids, token_type_ids=token_type_ids, mc_token_ids=mc_token_ids,
                mc_labels=mc_labels, lm_labels=lm_labels
            )
            loss = (lm_loss * args.lm_coef + mc_loss * args.mc_coef) / args.gradient_accumulation_steps
            
            if step % 100 == 0:
                logger.info("Loss for {} is {}".format(step, loss))
                
            if args.n_gpu > 1:
                loss = loss.mean()
                
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps
            if args.fp16:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()
            
            tr_loss += loss.item()
            
            if (step + 1) % args.gradient_accumulation_steps == 0 and not args.tpu:
                if args.fp16:
                    torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
                else:
                    torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
                
                optimizer.step()
                model.zero_grad()
                
                if args.local_rank in [-1, 0] and args.logging_steps > 0 and (step+1) % args.logging_steps == 0:
                    
                    if args.local_rank == -1 and args.evaluate_during_training:
                        
                        metrics = evaluate(args, model, valid_dataset, metrics, tokenizer)
            
                global_step+=1
            
            if args.max_steps>0 and global_step > args.max_steps:
                epoch_iterator.close()
                break

        if args.max_steps > 0 and global_step > args.max_steps:
            train_iterator.close()
            break
    
    return tr_loss/global_step, metrics

In [20]:
import numpy as np


def evaluate(args, model, valid_dataset, metrics, tokenizer, prefix=''):
    
    eval_outputs_dirs = make_logdir(args.model_checkpoint)
    
    if not os.path.exists(eval_outputs_dirs) and args.local_rank in [-1, 0]:
        os.makedirs(eval_outputs_dirs)
        
    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
    eval_sampler = SequentialSampler(valid_dataset) if args.local_rank == -1 else DistributedSampler(valid_dataset)
    eval_dataloader = DataLoader(valid_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size)
    
        # Eval!
    logger.info("***** Running evaluation {} *****".format(prefix))
    logger.info("  Num examples = %d", len(valid_dataset))
    logger.info("  Batch size = %d", args.eval_batch_size)
    
    nlls = None
    accs = None
    eval_epoch_iterator = tqdm(eval_dataloader, desc="Evaluating")
    for _, batch in enumerate(eval_epoch_iterator):
        
        model.eval()

        with torch.no_grad():
            batch = tuple(input_tensor.to(args.device) for input_tensor in batch)
            input_ids, mc_token_ids, lm_labels, mc_labels, token_type_ids = batch
            
            lm_logits, mc_logits, *_ = model(
                input_ids, token_type_ids=token_type_ids, mc_token_ids=mc_token_ids,
            )
            lm_logits_flat_shifted = lm_logits[..., :-1, :].contiguous().view(-1, lm_logits.size(-1))
            lm_labels_flat_shifted = lm_labels[..., 1:].contiguous().view(-1)
            
            x = ((lm_logits_flat_shifted, mc_logits), (lm_labels_flat_shifted, mc_labels))
            nll = torch.nn.CrossEntropyLoss(ignore_index=-100)(x[0][0], x[1][0]).detach().cpu().numpy()#Loss(torch.nn.CrossEntropyLoss(ignore_index=-100), output_transform=lambda x: (x[0][0], x[1][0]))
            acc = torch.sum((torch.max(x[0][1], 1)[1] == x[1][1]).int()).detach().cpu().numpy().mean()#Accuracy(output_transform=lambda x: (x[0][1], x[1][1]))
        
        if nlls is None:
            nlls = nll
            accs = acc
        else:
            nlls = np.append(nlls, nll)
            accs = np.append(accs, acc)
    
    nlls_mean = np.mean(nlls)
    accs_mean = np.mean(accs)
    
    if accs_mean>metrics['accuracy'] and nlls_mean<metrics['nll']:
        logger.info("***** New high accuracy and nll! {} {}*****".format(accs_mean, nlls_mean))
        metrics.update({'nll': nlls_mean, 'accuracy': accs_mean})
        output_eval_file = os.path.join(eval_outputs_dirs, prefix, "eval_results.txt")
    
    return metrics

In [23]:
tr_loss, eval_results = train(args, train_dataset, valid_dataset, model, tokenizer)

INFO:__main__:***** Running training *****
INFO:__main__:  Num examples = 131438
INFO:__main__:  Num Epochs = 3
INFO:__main__:  Instantaneous batch size per GPU = 4
INFO:__main__:  Total train batch size (w. parallel, distributed & accumulation) = 32
INFO:__main__:  Gradient Accumulation steps = 8
INFO:__main__:  Total optimization steps = 12321




Iteration:   0%|          | 0/32860 [00:00<?, ?it/s][A[A[A[A

Selected optimization level O1:  Insert automatic casts around Pytorch functions and Tensor methods.

Defaults for this optimization level are:
enabled                : True
opt_level              : O1
cast_model_type        : None
patch_torch_functions  : True
keep_batchnorm_fp32    : None
master_weights         : None
loss_scale             : dynamic
Processing user overrides (additional kwargs that are not None)...
After processing overrides, optimization options are:
enabled                : True
opt_level              : O1
cast_model_type        : None
patch_torch_functions  : True
keep_batchnorm_fp32    : None
master_weights         : None
loss_scale             : dynamic


INFO:__main__:Loss for 0 is 0.5290927886962891




Iteration:   0%|          | 1/32860 [00:02<18:59:26,  2.08s/it][A[A[A[A



Iteration:   0%|          | 2/32860 [00:03<18:28:32,  2.02s/it][A[A[A[A



Iteration:   0%|          | 3/32860 [00:05<18:08:01,  1.99s/it][A[A[A[A



Iteration:   0%|          | 4/32860 [00:07<17:55:25,  1.96s/it][A[A[A[A



Iteration:   0%|          | 5/32860 [00:09<17:45:50,  1.95s/it][A[A[A[A



Iteration:   0%|          | 6/32860 [00:11<17:38:44,  1.93s/it][A[A[A[A



Iteration:   0%|          | 7/32860 [00:13<17:35:14,  1.93s/it][A[A[A[AINFO:__main__:***** Running evaluation  *****
INFO:__main__:  Num examples = 5
INFO:__main__:  Batch size = 1





Evaluating:   0%|          | 0/5 [00:00<?, ?it/s][A[A[A[A[A

ValueError: not enough values to unpack (expected 5, got 1)

In [340]:
def update(engine, batch):
    model.train()
    batch = tuple(input_tensor.to(args.device) for input_tensor in batch)
    input_ids, mc_token_ids, lm_labels, mc_labels, token_type_ids = batch
    (lm_loss), (mc_loss), *_ = model(
        input_ids, token_type_ids=token_type_ids, mc_token_ids=mc_token_ids,
        mc_labels=mc_labels, lm_labels=lm_labels
    )
    loss = (lm_loss * args.lm_coef + mc_loss * args.mc_coef) / args.gradient_accumulation_steps
    if args.fp16:
        with amp.scale_loss(loss, optimizer) as scaled_loss:
            scaled_loss.backward()
        torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_norm)
    else:
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_norm)
    if engine.state.iteration % args.gradient_accumulation_steps == 0:
        optimizer.step()
        optimizer.zero_grad()
    return loss.item()
trainer = Engine(update)

In [342]:
def inference(engine, batch):
    model.eval()
    with torch.no_grad():
        batch = tuple(input_tensor.to(args.device) for input_tensor in batch)
        input_ids, mc_token_ids, lm_labels, mc_labels, token_type_ids = batch
        logger.info(tokenizer.decode(input_ids[0, -1, :].tolist()))
        # if we dont send labels to model, it doesnt return losses
        lm_logits, mc_logits, *_ = model(
            input_ids, token_type_ids=token_type_ids, mc_token_ids=mc_token_ids,
        )
        lm_logits_flat_shifted = lm_logits[..., :-1, :].contiguous().view(-1, lm_logits.size(-1))
        lm_labels_flat_shifted = lm_labels[..., 1:].contiguous().view(-1)
        return (lm_logits_flat_shifted, mc_logits), (lm_labels_flat_shifted, mc_labels)
evaluator = Engine(inference)

In [351]:
trainer.add_event_handler(Events.EPOCH_COMPLETED, lambda _: evaluator.run(val_loader))
if args.n_epochs < 1:
    trainer.add_event_handler(Events.COMPLETED, lambda _: evaluator.run(val_loader))
if args.eval_before_start:
    trainer.add_event_handler(Events.STARTED, lambda _: evaluator.run(val_loader))

if args.distributed:
    trainer.add_event_handler(Events.EPOCH_STARTED, lambda engine: train_sampler.set_epoch(engine.state.epoch))
    evaluator.add_event_handler(Events.EPOCH_STARTED, lambda engine: valid_sampler.set_epoch(engine.state.epoch))


In [352]:
# Linearly decrease the learning rate from lr to zero
scheduler = PiecewiseLinear(optimizer, "lr", [(0, args.lr), (args.n_epochs * len(train_loader), 0.0)])
trainer.add_event_handler(Events.ITERATION_STARTED, scheduler)


<ignite.engine.engine.RemovableEventHandle at 0x7f958048a278>

In [356]:
def average_distributed_scalar(scalar, args):
    """ Average a scalar over the nodes if we are in distributed training. We use this for distributed evaluation. """
    if args.local_rank == -1:
        return scalar
    scalar_t = torch.tensor(scalar, dtype=torch.float, device=args.device) / torch.distributed.get_world_size()
    torch.distributed.all_reduce(scalar_t, op=torch.distributed.ReduceOp.SUM)
    return scalar_t.item()

# Prepare metrics - note how we compute distributed metrics
RunningAverage(output_transform=lambda x: x).attach(trainer, "loss")
metrics = {"nll": Loss(torch.nn.CrossEntropyLoss(ignore_index=-100), output_transform=lambda x: (x[0][0], x[1][0])),
           "accuracy": Accuracy(output_transform=lambda x: (x[0][1], x[1][1]))}
metrics.update({"average_nll": MetricsLambda(average_distributed_scalar, metrics["nll"], args), #applying average_distributed_scalar to metrics["nll"] 
                "average_accuracy": MetricsLambda(average_distributed_scalar, metrics["accuracy"], args)})
metrics["average_ppl"] = MetricsLambda(math.exp, metrics["average_nll"])
for name, metric in metrics.items():
    metric.attach(evaluator, name)

In [361]:
pbar = ProgressBar(persist=True)
pbar.attach(trainer, metric_names=["loss"])
evaluator.add_event_handler(Events.COMPLETED, lambda _: pbar.log_message("Validation: %s" % pformat(evaluator.state.metrics)))

log_dir = make_logdir(args.model_checkpoint)

checkpoint_handler = ModelCheckpoint(log_dir, 'checkpoint', save_interval=1, n_saved=3)
trainer.add_event_handler(Events.EPOCH_COMPLETED, checkpoint_handler, {'mymodel': getattr(model, 'module', model)})  # "getattr" takes care of distributed encapsulation

torch.save(args, log_dir + '/model_training_args.bin')
getattr(model, 'module', model).config.to_json_file(os.path.join(log_dir, CONFIG_NAME))
tokenizer.save_pretrained(log_dir)
        



('runs/May22_23-18-44_ip-172-31-29-0_openai-gpt/vocab.json',
 'runs/May22_23-18-44_ip-172-31-29-0_openai-gpt/merges.txt',
 'runs/May22_23-18-44_ip-172-31-29-0_openai-gpt/special_tokens_map.json',
 'runs/May22_23-18-44_ip-172-31-29-0_openai-gpt/added_tokens.json')

In [363]:
trainer.run(train_loader, max_epochs=args.n_epochs)

INFO:ignite.engine.engine.Engine:Engine run resuming from iteration 0, epoch 0 until 3 epochs
INFO:ignite.engine.engine.Engine:Engine run resuming from iteration 1, epoch 1 until 1 epochs


                                     





[A[A[A[A[A[A                       


[A[A[A                            




[A[A[A[A[A                      







[A[A[A[A[A[A[A[A                 



[A[A[A[A                         
[A                                  






[A[A[A[A[A[A[A                    

  0%|          | 0/2 [124:56:25<?, ?it/s][A[A





  0%|          | 0/18878 [124:13:59<?, ?it/s][A[A[A[A[A[A


  0%|          | 0/2 [124:53:06<?, ?it/s][A[A[A




  0%|          | 0/2 [124:52:56<?, ?it/s][A[A[A[A[A







  0%|          | 0/18878 [124:13:45<?, ?it/s][A[A[A[A[A[A[A[A



  0%|          | 0/2 [124:52:56<?, ?it/s][A[A[A[A
  0%|          | 0/2 [124:56:09<?, ?it/s][A






  0%|          | 0/18878 [124:13:45<?, ?it/s][A[

Validation: {}
Validation: {}


RuntimeError: CUDA error: device-side assert triggered