In [2]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "4"

In [3]:
import torchfly
torchfly.set_random_seed(123)

In [4]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
from torch.nn.utils.rnn import pad_sequence
import numpy as np
import regex as re
import random
import itertools
import tqdm
import time
from torchfly.utils.model_utils import get_pretrained_states

try:
    from torch.utils.tensorboard import SummaryWriter
except:
    from tensorboardX import SummaryWriter
from apex import amp
from allennlp.training.checkpointer import Checkpointer
# from pytorch_transformers import AdamW, WarmupLinearSchedule, GPT2Tokenizer
from transformers import AdamW
from transformers import get_linear_schedule_with_warmup
from torchfly.text.tokenizers import UnifiedBPETokenizer

from torchfly.modules.losses import SequenceFocalLoss, SequenceCrossEntropyLoss
from torchfly.modules.transformers import GPT2SimpleLM, UnifiedGPT2SmallConfig
from cam676_eval.cam676_eval import clean_sentence, entities, entity_dict, success_f1_metric, bleu_metric

In [6]:
# set tokenizer
tokenizer = UnifiedBPETokenizer()
tokenizer.sep_token = "None"
# add speicial tokens in the same order as Roberta
# tokenizer.add_tokens(["<s>", "<pad>", "</s>", "<unk>", "<mask>"])



In [5]:
'''
class GPT2SmallConfig:
    vocab_size = 50257 + len(tokenizer.added_tokens_encoder)
    n_special = len(tokenizer.added_tokens_encoder)
    n_positions = 1024
    n_ctx = 1024
    n_embd = 768
    n_layer = 12
    n_head = 12
    resid_pdrop = 0.1
    embd_pdrop = 0.1
    attn_pdrop = 0.1
    layer_norm_epsilon = 1e-5
    initializer_range = 0.02
    gradient_checkpointing = False
    
class GPT2MediumConfig:
    vocab_size = len(tokenizer.added_tokens_encoder)
    n_special = len(tokenizer.added_tokens_encoder)
    n_positions = 1024
    n_ctx = 1024
    n_embd = 1024
    n_layer = 24
    n_head = 16
    resid_pdrop = 0.1
    embd_pdrop = 0.1
    attn_pdrop = 0.1
    layer_norm_epsilon = 1e-5
    initializer_range = 0.02
    gradient_checkpointing = True
'''

'\nclass GPT2SmallConfig:\n    vocab_size = 50257 + len(tokenizer.added_tokens_encoder)\n    n_special = len(tokenizer.added_tokens_encoder)\n    n_positions = 1024\n    n_ctx = 1024\n    n_embd = 768\n    n_layer = 12\n    n_head = 12\n    resid_pdrop = 0.1\n    embd_pdrop = 0.1\n    attn_pdrop = 0.1\n    layer_norm_epsilon = 1e-5\n    initializer_range = 0.02\n    gradient_checkpointing = False\n    \nclass GPT2MediumConfig:\n    vocab_size = len(tokenizer.added_tokens_encoder)\n    n_special = len(tokenizer.added_tokens_encoder)\n    n_positions = 1024\n    n_ctx = 1024\n    n_embd = 1024\n    n_layer = 24\n    n_head = 16\n    resid_pdrop = 0.1\n    embd_pdrop = 0.1\n    attn_pdrop = 0.1\n    layer_norm_epsilon = 1e-5\n    initializer_range = 0.02\n    gradient_checkpointing = True\n'

In [8]:
model_A = GPT2SimpleLM(UnifiedGPT2SmallConfig)
model_B = GPT2SimpleLM(UnifiedGPT2SmallConfig)
# model_A.load_state_dict(torch.load("../../../Checkpoint/best.th"))
# model_B.load_state_dict(torch.load("../../../Checkpoint/best.th"))
# model_A.load_state_dict(get_pretrained_states("unified-gpt2-small"))
# model_B.load_state_dict(get_pretrained_states("unified-gpt2-small"))
model_A.load_state_dict(torch.load("/data/jinggu/project/Pretraining_GPT/mc_version_torchfly_gpt_small"))
model_B.load_state_dict(torch.load("/data/jinggu/project/Pretraining_GPT/mc_version_torchfly_gpt_small"))


In [14]:
def align_keep_indices(batch_keep_indices):
    prev = batch_keep_indices[1]
    new_batch_keep_indices = [prev]

    for i in range(1, len(batch_keep_indices)):
        curr = batch_keep_indices[i]
        new = []

        for idx in curr:
            new.append(prev.index(idx))

        new_batch_keep_indices.append(new)
        prev = curr
        
    return new_batch_keep_indices


class CamRestDataset(Dataset):
    def __init__(self, data, tokenizer):
        self.data = data
        self.tokenizer = tokenizer
        self.bos = tokenizer.encode("<s>")
        self.user_bos = tokenizer.encode("A:")
        self.system_bos = tokenizer.encode("B:")
        
        self.eos = [628, 198]
        
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, index):
        full_dialog = self.data[index]
        
        full_dialog_tokens = []
        cur_pos = 0
        
        for turn_dialog in full_dialog:
            # user
            user_tokens = self.user_bos + tokenizer.encode(turn_dialog['user']) + self.eos
            user_pos = torch.arange(cur_pos, cur_pos + len(user_tokens))
            cur_pos = user_pos[-1] + 1
            
            # belief span
            belief_tokens = self.bos + \
                            tokenizer.encode(";".join(turn_dialog['bspan_inform'][1:])) + \
                            self.eos
            belief_pos = torch.arange(cur_pos, cur_pos + len(belief_tokens))
            cur_pos = belief_pos[-1]
            
            # system
            if np.random.rand() < 0.04:
                turn_dialog["degree"] = 0 
            database = tokenizer.encode(str(turn_dialog["degree"]))
            # database_pos = torch.LongTensor([1023])
            
            system_tokens = self.system_bos + \
                            tokenizer.encode(turn_dialog['replaced_response']) + \
                            self.eos
            system_pos = torch.arange(cur_pos, cur_pos + len(system_tokens) + 1)
            cur_pos = system_pos[-1] + 1
            
            # concat database and response
            system_tokens = database + system_tokens
            # system_pos = torch.cat([database_pos, system_pos], dim=0)
            
            user_tokens = torch.LongTensor(user_tokens)
            system_tokens = torch.LongTensor(system_tokens)
            belief_tokens = torch.LongTensor(belief_tokens)
            
            full_dialog_tokens.append((user_tokens, 
                                       user_pos, 
                                       system_tokens, 
                                       system_pos, 
                                       belief_tokens, 
                                       belief_pos))

        return full_dialog_tokens
        

class Collate_Function:
    """This function handles batch collate.
    """
    def __init__(self, tokenizer):
        self.tokenizer = tokenizer
        self.pad = self.tokenizer.encode("<pad>")[0]

    def __call__(self, unpacked_data):

        max_turn_len = max([len(item) for item in unpacked_data])
        
        batch_dialogs = []
        batch_keep_indices = []

        for turn_num in range(max_turn_len):

            keep_indices = []

            for batch_idx in range(len(unpacked_data)):
                if turn_num < len(unpacked_data[batch_idx]):
                    keep_indices.append(batch_idx)

            user_tokens = pad_sequence([unpacked_data[idx][turn_num][0] for idx in keep_indices], 
                                        batch_first=True,
                                        padding_value=self.pad)
            user_pos = pad_sequence([unpacked_data[idx][turn_num][1] for idx in keep_indices], 
                                        batch_first=True,
                                        padding_value=0)
            system_tokens = pad_sequence([unpacked_data[idx][turn_num][2] for idx in keep_indices], 
                                        batch_first=True,
                                        padding_value=self.pad)
            system_pos = pad_sequence([unpacked_data[idx][turn_num][3] for idx in keep_indices], 
                                        batch_first=True,
                                        padding_value=0)
            belief_tokens = pad_sequence([unpacked_data[idx][turn_num][4] for idx in keep_indices], 
                                        batch_first=True,
                                        padding_value=self.pad)
            belief_pos = pad_sequence([unpacked_data[idx][turn_num][5] for idx in keep_indices], 
                                        batch_first=True,
                                        padding_value=0)  

            user_mask = (user_tokens != self.pad).byte()
            system_mask = (system_tokens != self.pad).byte()
            belief_mask = (belief_tokens != self.pad).byte()


            batch_dialogs.append((user_tokens, user_pos, user_mask, 
                                  system_tokens, system_pos, system_mask, 
                                  belief_tokens, belief_pos, belief_mask))
            batch_keep_indices.append(keep_indices)
            
        # align keep indices
        # batch_keep_indices = align_keep_indices(batch_keep_indices)
        return batch_dialogs, batch_keep_indices

In [15]:
def calculate_loss(logits, target, mask):
    logits = logits[:, :-1].contiguous()
    target = target[:, 1:].contiguous()
    mask = mask[:, 1:].contiguous().float()
    loss = criterion(logits, target, mask, label_smoothing=0.02, reduce=True)
    return loss

def filter_past(past, keep_indices):
    past = [item[:, keep_indices] for item in past]
    return past

def replace_punc(x):
    x = x.replace("<", "").replace(">", "")
    return x.replace(".", " .").replace(",", " .").replace("?", " ?").replace("?", " ?")

In [16]:
train_data = torch.load("../data/DataProcess/train_data.pkl")
val_data = torch.load("../data/DataProcess/val_data.pkl")
test_data = torch.load("../data/DataProcess/test_data.pkl")

indices = np.arange(len(train_data))
np.random.shuffle(indices)
# use all data
indices = indices[: 200]
train_data = [train_data[idx] for idx in indices]

In [17]:
train_dataset = CamRestDataset(train_data, tokenizer)
val_dataset = CamRestDataset(val_data, tokenizer)
test_dataset = CamRestDataset(test_data, tokenizer)

train_batch_size = 1
collate_func = Collate_Function(tokenizer)

train_dataloader = DataLoader(dataset=train_dataset, 
                              shuffle=True,
                              batch_size=train_batch_size, 
                              collate_fn=collate_func)

eval_batch_size = 16

val_dataloader = DataLoader(dataset=val_dataset, 
                          shuffle=False,
                          batch_size=eval_batch_size, 
                          collate_fn=collate_func)

test_dataloader = DataLoader(dataset=test_dataset, 
                          shuffle=False,
                          batch_size=eval_batch_size, 
                          collate_fn=collate_func)

This tokenizer does not make use of special tokens. Input is returned with no modification.
This tokenizer does not make use of special tokens. Input is returned with no modification.
This tokenizer does not make use of special tokens.
This tokenizer does not make use of special tokens. Input is returned with no modification.
This tokenizer does not make use of special tokens. Input is returned with no modification.
This tokenizer does not make use of special tokens.
This tokenizer does not make use of special tokens. Input is returned with no modification.
This tokenizer does not make use of special tokens. Input is returned with no modification.
This tokenizer does not make use of special tokens.
This tokenizer does not make use of special tokens. Input is returned with no modification.
This tokenizer does not make use of special tokens. Input is returned with no modification.
This tokenizer does not make use of special tokens.
This tokenizer does not make use of special tokens. Inpu

In [18]:
criterion = SequenceFocalLoss(gamma=0.0, beta=0.0)

In [19]:
device = torch.device("cuda")
model_A = model_A.to(device)
model_B = model_A

## Training

In [20]:
if not os.path.isdir("Checkpoint"):
    os.makedirs("Checkpoint")
checkpointer = Checkpointer(serialization_dir="Checkpoint", 
                            keep_serialized_model_every_num_seconds=3600*2, 
                            num_serialized_models_to_keep=10)

In [21]:
# optimizer
num_epochs = 10
num_gradients_accumulation = 1
num_train_optimization_steps = num_train_optimization_steps = len(train_dataset) * num_epochs // train_batch_size // num_gradients_accumulation

param_optimizer = list(model_A.named_parameters()) + list(model_B.named_parameters())
no_decay = ['ln', 'bias', 'LayerNorm']
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
    ]

optimizer = AdamW(optimizer_grouped_parameters,
                  lr=5e-5,
                  correct_bias=False)

scheduler = get_linear_schedule_with_warmup(optimizer,
                                 num_warmup_steps=500,
                                 num_training_steps=num_train_optimization_steps)

In [22]:
# [model_A, model_B], optimizer = amp.initialize([model_A, model_B], optimizer, opt_level="O0")

In [23]:
user_weight = 1.0

def train_one_iter(batch_dialogs, batch_keep_indices, update_count, fp16=False):

    aligned_batch_keep_indices = align_keep_indices(batch_keep_indices)
   
    mask = torch.ByteTensor([]).to(device)
    prev_batch_size = batch_dialogs[0][0].shape[0]
    

    past = None
    all_logits = []
    target = []
    total_loss = 0 
    
    for turn_num in range(len(batch_keep_indices)):

        # data send to gpu
        dialogs = batch_dialogs[turn_num]
        dialogs = [item.to(device) for item in dialogs]

        user_tokens, user_pos, user_mask, \
            system_tokens, system_pos, system_mask, \
            belief_tokens, belief_pos, belief_mask = dialogs

        # filtering algorithm
        keep_indices = aligned_batch_keep_indices[turn_num]

        if len(keep_indices) != prev_batch_size:
            past = filter_past(past, keep_indices)
            mask = mask[keep_indices, :]

        # User Utterance
        mask = torch.cat([mask, user_mask], dim=-1)
        logits, past = model_A(user_tokens, position_ids=user_pos, mask=mask, past=past)
        all_logits.append(logits)
        target.append(user_tokens)
        # A_loss = calculate_loss(logits, user_tokens, user_mask)

        # System Response
        mask = torch.cat([mask, system_mask], dim=-1)
        logits, past = model_B(system_tokens, position_ids=system_pos, mask=mask, past=past)
        all_logits.append(logits)
        target.append(system_tokens)
        # B_loss = calculate_loss(logits, system_tokens, system_mask)

        # tail
        # total_loss = total_loss + user_weight * A_loss + B_loss
        prev_batch_size = user_tokens.shape[0]

#     breakpoint
    all_logits = torch.cat(all_logits, dim=1)
    all_logits = all_logits[:, :-1].contiguous()

    target = torch.cat(target, dim=1)
    target = target[:, 1:].contiguous()
    
    target_mask = torch.ones_like(target).float()
    
    total_loss = criterion(all_logits, target, target_mask, label_smoothing=0.02, reduce=True)

    # gradient accumulation
    total_loss /= len(batch_keep_indices)
    total_loss /= num_gradients_accumulation 
    
    if fp16:
        with amp.scale_loss(total_loss, optimizer) as scaled_loss:
            scaled_loss.backward()
    else:
        total_loss.backward()
        
    record_loss = total_loss.item() * num_gradients_accumulation
    perplexity = np.exp(record_loss)
    
    return record_loss, perplexity

In [24]:
def validate(dataloader, data):

    model_A.eval()
    model_B.eval()

    temperature = 0.5

    all_response = []

    for batch_dialogs, batch_keep_indices in tqdm.notebook.tqdm(dataloader):

        aligned_batch_keep_indices = align_keep_indices(batch_keep_indices)
        past = None
        generated_responses = [[] for i in range(batch_dialogs[0][0].shape[0])]

        mask = torch.ByteTensor([]).to(device)
        prev_batch_size = batch_dialogs[0][0].shape[0]

        with torch.no_grad():
            for turn_num in range(len(batch_keep_indices)):
                # data send to gpu
                dialogs = batch_dialogs[turn_num]
                dialogs = [item.to(device) for item in dialogs]

                user_tokens, user_pos, user_mask, \
                    system_tokens, system_pos, system_mask, \
                    belief_tokens, belief_pos, belief_mask = dialogs

                # batch filtering algorithm
                keep_indices = aligned_batch_keep_indices[turn_num]

                if len(keep_indices) != prev_batch_size:
                    past = filter_past(past, keep_indices)
                    mask = mask[keep_indices, :]

                # define some initials
                cur_batch_size = user_tokens.shape[0]
                flags = np.ones(cur_batch_size)
                generated_tokens = [[] for i in range(cur_batch_size)]

                # feed in user
                mask = torch.cat([mask, user_mask], dim=-1)
                _, past = model_A(user_tokens, position_ids=user_pos, mask=mask, past=past)

                # response generation
                response = []


                # first three tokens
                prev_input = system_tokens[:, :3]
                cur_pos = system_pos[:, :3]
                temp_past = past
                temp_mask = F.pad(mask, pad=(0,3), value=1)

                # feed into B
                logits, temp_past = model_B(prev_input, position_ids=cur_pos, mask=temp_mask, past=temp_past)
                # set current position
                cur_pos = cur_pos[:, -1].unsqueeze(1) + 1

                for i in range(50):
                    logits = logits[:, -1, :] / temperature
                    prev_tokens = torch.argmax(logits, dim=-1)
                    np_prev_tokens = prev_tokens.cpu().numpy()
                    # nucleus sampling
                    # logits = top_filtering(logits, top_k=100, top_p=0.7)
                    # probs = F.softmax(logits, -1)
                    # prev_input = torch.multinomial(probs, num_samples=1)

                    # add to generated tokens list
                    count = 0
                    for idx, value in enumerate(flags):
                        if value != 0:
                            generated_tokens[idx].append(np_prev_tokens[count])
                            count += 1

                    # filtering algorithm
                    if np.any(np_prev_tokens == 628):
                        # set flags 0
                        count = 0
                        for idx, value in enumerate(flags):
                            if value == 1:
                                if np_prev_tokens[count] == 628:
                                    flags[idx] = 0
                                count += 1
                        # compute which one to keep
                        keep_indices = np.argwhere(np_prev_tokens != 628).squeeze(1)
                        # filter
                        prev_tokens = prev_tokens[keep_indices.tolist()]
                        cur_pos = cur_pos[keep_indices.tolist(), :]
                        temp_mask = temp_mask[keep_indices.tolist(), :]
                        temp_past = [item[:, keep_indices.tolist()] for item in temp_past]
                        np_prev_tokens = np_prev_tokens[keep_indices.tolist()]

                    if np.all(flags == 0):
                        break

                    # prepare for the next token        
                    temp_mask = F.pad(temp_mask, pad=(0, 1), value=1)
                    logits, temp_past = model_B(prev_tokens.view(-1, 1), 
                                           position_ids=cur_pos, 
                                           mask=temp_mask, 
                                           past=temp_past)
                    cur_pos = cur_pos + 1

                # real system_tokens feed in
                mask = torch.cat([mask, system_mask], dim=-1)
                _, past = model_B(system_tokens, position_ids=system_pos, mask=mask, past=past)

                # inject into generated_responses_list
                decoded_responses = [tokenizer.decode(item).replace("\n", "") for item in generated_tokens]
                count = 0
                for idx in batch_keep_indices[turn_num]:
                    generated_responses[idx].append(decoded_responses[count])
                    count += 1

            # add to the final responses        
            for item in generated_responses:
                all_response.extend(item)
                
    # Stage 2
    #   prepare for metric eval
    dialog_data = []
    count = 0
    all_results = []

    for i in range(len(data)):
        raw_dialog = data[i]

        for turn_num in range(len(raw_dialog)):

            replaced_response = clean_sentence(
                replace_punc(raw_dialog[turn_num]["replaced_response"].lower().replace("slot", "SLOT")), entity_dict)

            generated_response = clean_sentence(replace_punc(all_response[count].lower().replace("slot", "SLOT")), entity_dict)

            dialog_data.append({"dial_id": raw_dialog[turn_num]["dial_id"],
                                "turn_num": raw_dialog[turn_num]["turn_num"],
                                "response": replaced_response,
                                "generated_response":generated_response 
                              })
            count += 1
            
    sccuess_f1 = success_f1_metric(dialog_data)
    bleu = bleu_metric(dialog_data)

    return {"bleu": bleu,
            "sccuess_f1": sccuess_f1
            }

In [25]:
update_count = 0
progress_bar = tqdm.tqdm_notebook
start = time.time()

for ep in range(num_epochs):

    "Training"
    pbar = progress_bar(train_dataloader)
    model_A.train()
    model_B.train()
    
    for batch_dialogs, batch_keep_indices in pbar:
        
        record_loss, perplexity = train_one_iter(batch_dialogs, batch_keep_indices, update_count, fp16=False)

        update_count += 1

        if update_count % num_gradients_accumulation == num_gradients_accumulation - 1:
            # update for gradient accumulation
            
#             torch.nn.utils.clip_grad_norm_(model_A.parameters(), 5.0)
#             torch.nn.utils.clip_grad_norm_(model_B.parameters(), 5.0)
            optimizer.step()
            scheduler.step()
            optimizer.zero_grad()
            
            # speed measure
            end = time.time()
            speed = train_batch_size * num_gradients_accumulation / (end - start)
            start = end
            
            # show progress
            pbar.set_postfix(loss=record_loss, perplexity=perplexity, speed=speed)
    
    "Evaluation"
    print(f"Epoch {ep} Validation")
    eval_res = validate(val_dataloader, val_data)
    print(eval_res)
    
    print(f"Epoch {ep} Test")
    eval_res = validate(test_dataloader, test_data)
    print(eval_res)
    
    checkpointer.save_checkpoint(ep, 
                                 [model_A.state_dict(), model_A.state_dict()],
                                 {"None": None},
                                 True
                                 )

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  


HBox(children=(IntProgress(value=0, max=200), HTML(value='')))

This tokenizer does not make use of special tokens. Input is returned with no modification.
This tokenizer does not make use of special tokens. Input is returned with no modification.
This tokenizer does not make use of special tokens.
This tokenizer does not make use of special tokens. Input is returned with no modification.
This tokenizer does not make use of special tokens. Input is returned with no modification.
This tokenizer does not make use of special tokens.
This tokenizer does not make use of special tokens. Input is returned with no modification.
This tokenizer does not make use of special tokens. Input is returned with no modification.
This tokenizer does not make use of special tokens.
This tokenizer does not make use of special tokens. Input is returned with no modification.
This tokenizer does not make use of special tokens. Input is returned with no modification.
This tokenizer does not make use of special tokens.
This tokenizer does not make use of special tokens. Inpu

TypeError: forward() got an unexpected keyword argument 'mask'

In [26]:
update_count = 0
progress_bar = tqdm.notebook.tqdm
start = time.time()

for ep in range(num_epochs):

    "Training"
    pbar = progress_bar(train_dataloader)
    model_A.train()
    model_B.train()
    
    for batch_dialogs, batch_keep_indices in pbar:
        
        record_loss, perplexity = train_one_iter(batch_dialogs, batch_keep_indices, update_count, fp16=False)

        update_count += 1

        if update_count % num_gradients_accumulation == num_gradients_accumulation - 1:
            # update for gradient accumulation
            
#             torch.nn.utils.clip_grad_norm_(model_A.parameters(), 5.0)
#             torch.nn.utils.clip_grad_norm_(model_B.parameters(), 5.0)
            optimizer.step()
            scheduler.step()
            optimizer.zero_grad()
            
            # speed measure
            end = time.time()
            speed = train_batch_size * num_gradients_accumulation / (end - start)
            start = end
            
            # show progress
            pbar.set_postfix(loss=record_loss, perplexity=perplexity, speed=speed)
    
    "Evaluation"
    print(f"Epoch {ep} Validation")
    eval_res = validate(val_dataloader, val_data)
    print(eval_res)
    
    print(f"Epoch {ep} Test")
    eval_res = validate(test_dataloader, test_data)
    print(eval_res)
    
    checkpointer.save_checkpoint(ep, 
                                 [model_A.state_dict(), model_A.state_dict()],
                                 {"None": None},
                                 True
                                 )

HBox(children=(IntProgress(value=0, max=200), HTML(value='')))

This tokenizer does not make use of special tokens. Input is returned with no modification.
This tokenizer does not make use of special tokens. Input is returned with no modification.
This tokenizer does not make use of special tokens.
This tokenizer does not make use of special tokens. Input is returned with no modification.
This tokenizer does not make use of special tokens. Input is returned with no modification.
This tokenizer does not make use of special tokens.
This tokenizer does not make use of special tokens. Input is returned with no modification.
This tokenizer does not make use of special tokens. Input is returned with no modification.
This tokenizer does not make use of special tokens.
This tokenizer does not make use of special tokens. Input is returned with no modification.
This tokenizer does not make use of special tokens. Input is returned with no modification.
This tokenizer does not make use of special tokens.
This tokenizer does not make use of special tokens. Inpu

TypeError: forward() got an unexpected keyword argument 'mask'

{'bleu': 0.18528576144485603, 'sccuess_f1': 0.7661691492358308}
Epoch 0 Test
100%
9/9 [00:57<00:00, 6.43s/it]

{'bleu': 0.18891712428744042, 'sccuess_f1': 0.7538619929578371}
100%
200/200 [43:15<00:00, 12.98s/it, loss=0.263, perplexity=1.3, speed=2.11]

Epoch 1 Validation
100%
9/9 [01:19<00:00, 8.81s/it]

{'bleu': 0.15647152396609024, 'sccuess_f1': 0.844407059794514}
Epoch 1 Test
100%
9/9 [00:48<00:00, 5.38s/it]

{'bleu': 0.15899872884357819, 'sccuess_f1': 0.828924157270311}
100%
200/200 [41:01<00:00, 12.31s/it, loss=0.286, perplexity=1.33, speed=2.18]

Epoch 2 Validation
100%
9/9 [00:39<00:00, 4.41s/it]

{'bleu': 0.18037047407794105, 'sccuess_f1': 0.7925998002521975}
Epoch 2 Test
100%
9/9 [00:17<00:00, 2.00s/it]

{'bleu': 0.1848661149382996, 'sccuess_f1': 0.7939999949966201}
100%
200/200 [39:07<00:00, 11.74s/it, loss=0.272, perplexity=1.31, speed=3.53]

Epoch 3 Validation
100%
9/9 [00:17<00:00, 1.98s/it]

{'bleu': 0.13512004199985797, 'sccuess_f1': 0.7270765862609081}
Epoch 3 Test
100%
9/9 [01:03<00:00, 7.10s/it]

{'bleu': 0.13928672975886824, 'sccuess_f1': 0.717149215617259}
100%
200/200 [37:20<00:00, 11.20s/it, loss=0.209, perplexity=1.23, speed=2.71]

Epoch 4 Validation
100%
9/9 [00:55<00:00, 6.20s/it]

{'bleu': 0.20840774459932415, 'sccuess_f1': 0.8462222172182756}
Epoch 4 Test
100%
9/9 [00:30<00:00, 3.42s/it]

{'bleu': 0.21322651574953702, 'sccuess_f1': 0.8427787884113279}
100%
200/200 [35:17<00:00, 10.59s/it, loss=0.217, perplexity=1.24, speed=2.35]

Epoch 5 Validation
100%
9/9 [00:49<00:00, 5.48s/it]

{'bleu': 0.19571594985463295, 'sccuess_f1': 0.8317580290116173}
Epoch 5 Test
100%
9/9 [00:23<00:00, 2.61s/it]

{'bleu': 0.2072375771999586, 'sccuess_f1': 0.8360814692822786}
100%
200/200 [33:13<00:00, 9.97s/it, loss=0.187, perplexity=1.21, speed=2.13]

Epoch 6 Validation
100%
9/9 [00:25<00:00, 2.83s/it]

{'bleu': 0.20805844245958913, 'sccuess_f1': 0.8542056024606866}
Epoch 6 Test
100%
9/9 [00:24<00:00, 2.69s/it]

{'bleu': 0.2053606768192143, 'sccuess_f1': 0.8547008496846618}
100%
200/200 [31:07<00:00, 9.34s/it, loss=0.294, perplexity=1.34, speed=3.93]

Epoch 7 Validation
100%
9/9 [01:06<00:00, 7.36s/it]

{'bleu': 0.20471616971302903, 'sccuess_f1': 0.8595348787049779}
Epoch 7 Test
100%
9/9 [00:42<00:00, 4.71s/it]

{'bleu': 0.20968254546296883, 'sccuess_f1': 0.857414443652919}
100%
200/200 [29:02<00:00, 8.71s/it, loss=0.143, perplexity=1.15, speed=1.71]

Epoch 8 Validation
100%
9/9 [00:57<00:00, 6.43s/it]

{'bleu': 0.20096424029373133, 'sccuess_f1': 0.839622636494233}
Epoch 8 Test
100%
9/9 [00:32<00:00, 3.56s/it]

{'bleu': 0.19602954287333282, 'sccuess_f1': 0.8489483697448351}
100%
200/200 [26:55<00:00, 8.08s/it, loss=0.199, perplexity=1.22, speed=3.24]

Epoch 9 Validation
100%
9/9 [00:49<00:00, 5.51s/it]

{'bleu': 0.19466862379128747, 'sccuess_f1': 0.8205128154988641}
Epoch 9 Test
100%
9/9 [00:24<00:00, 2.67s/it]

{'bleu': 0.1984431006396542, 'sccuess_f1': 0.8428158098350613}
1
​