In [1]:
class CFG:
    model_path = '/root/autodl-tmp/weights/chatglm3-6b'
    data_path = '/root/autodl-tmp/dataset/psychology-dataset/data/train.jsonl'
    output_dir = '/root/autodl-tmp/checkpoints/glm3'
    num_train_epochs = 10
    batch_size = 8
    max_tokens = 192
    max_query = 64
    lr = 1e-5
    warm_up_steps = 200

In [2]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import GPT2Tokenizer, get_linear_schedule_with_warmup
from torch.nn import CrossEntropyLoss

from transformers import AutoTokenizer, AutoConfig, AutoModelForCausalLM, AutoModel
from peft import LoraConfig, get_peft_model, PeftModel, PeftConfig

import sys
import json
import pandas as pd
from tqdm import tqdm

In [3]:
sys.path.append('/root/tuning_space/Components/')
import interact
import model_tools
from Static import prompt_dict, st, si

In [4]:
model_path = '/root/autodl-fs/weights/chatglm3-6b'
data_path = '/root/autodl-tmp/dataset/psychology-dataset/data/train.jsonl'

config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["query_key_value"],  # lora的目标位置，具体有哪些可选项可打印出源码中的key_list 注意不同的模型中的定义名称不同
    lora_dropout=0.1,
    bias="none",
    task_type="CAUSAL_LM",
)

In [5]:
%%time
tokenizer = AutoTokenizer.from_pretrained(CFG.model_path, trust_remote_code=True)
model = AutoModel.from_pretrained(CFG.model_path, trust_remote_code=True).cuda().float()

Loading checkpoint shards:   0%|          | 0/7 [00:00<?, ?it/s]

CPU times: user 6.15 s, sys: 21.9 s, total: 28.1 s
Wall time: 12.8 s


In [6]:
#施加peft lora
model_tools.model_profile(model)
print('conducting peft lora ---------------')
model = get_peft_model(model, config)
model_tools.model_profile(model)

Total Parameters: 6243584000
Trainable Parameters: 6243584000
Percentage of Trainable Parameters: 100.00%
conducting peft lora ---------------
Total Parameters: 6247483392
Trainable Parameters: 3899392
Percentage of Trainable Parameters: 0.06%


In [7]:
class instruction_dataset(Dataset):
    def __init__(self, data_path:'str', tokenizer, truncate_length, max_query_length):
        super().__init__()
        self.tokenizer = tokenizer
        self.examples = []
        
        with open(data_path, 'r') as file:
            for line in file:
                sample=json.loads(line)
                # input_ids的结构应该为：prompt_tokens, src_tokens, [gMASK], <sop>, tgt_tokens, <eop>, [PAD]... 
                # 或者简化一点，即为 query, [gMASK], <sop>, answer, <eop>, [PAD]... 
                # padding的目的是为了对齐各个instance，以组成batch（当然batch_size=1时其实没必要）
                # 总体的input_ids的长度不超过truncate_length，其中query的长度不超过max_query_length，同理可以计算出answer的最大长度
                max_answer_length = truncate_length - max_query_length - 3
                
                # 判断query的长度
                query = sample['question']
                query_ids = tokenizer.encode(query, add_special_tokens=False)
                if len(query_ids) > max_query_length:
                    query_ids = query_ids[:max_query_length]
                
                # 判断answer的长度
                answer = sample['response_j']
                answer_ids = tokenizer.encode(answer, add_special_tokens=False)
                if len(answer) > max_answer_length:
                    answer_ids = answer_ids[:max_answer_length]
                    
                # 合并
                input_ids = query_ids + [si['[gMASK]']] + [si['sop']] + answer_ids + [si['eop']]
                pre_context_length = input_ids.index(si['sop'])
                end_answer_index = input_ids.index(si['eop'])
                
                # padding
                padding_length=truncate_length-len(input_ids)
                input_ids+=padding_length*[tokenizer.pad_token_id]
                
                # 制作labels；其中query部分，pad部分均不参与loss的计算 # 因为需要整体向左移动，所以要少填充一个
                labels = [-100] * (pre_context_length+1) + input_ids[pre_context_length+1: end_answer_index+1]
                labels = labels + [-100] * (truncate_length-len(labels))
                
                # 制作attention_mask
                eop_position = input_ids.index(si['eop'])+1
                attention_mask = [True]*eop_position
                attention_mask += [False]*(truncate_length-len(attention_mask))
                
                self.examples.append({
                    'query' : query,
                    'answer' : answer,
                    'input_ids' : input_ids,
                    'labels' : labels,
                    'attention_mask' : attention_mask,
                })
                
    def __len__(self):
        return len(self.examples)

    def __getitem__(self, item):
        instance = self.examples[item]
        return instance

In [8]:
def coll_fn(batch:list):
    input_labels = []
    labels = []
    attention_mask = []
    for sample in batch:
        # 实际上词表长度只有65024，所以int32就可以了 # attention_mask用bool就行 (我收回我的画，完全是玄学)
        input_labels.append(torch.tensor(sample['input_ids'], dtype=torch.long))
        labels.append(torch.tensor(sample['labels'], dtype=torch.long))
        attention_mask.append(torch.tensor(sample['attention_mask'], dtype=torch.float64)) #, dtype=torch.bool
    batch = {'input_ids':input_labels, 'labels':labels, 'attention_mask': attention_mask}
    batch = {name:torch.stack(item).cuda() for name,item in batch.items()} #对于一个元素不为基本元素的list，需要使用stack方法
    return batch

In [9]:
%%time
finetuning_instruction=instruction_dataset(CFG.data_path, tokenizer, CFG.max_tokens, CFG.max_query)
instruction_loader = DataLoader(finetuning_instruction, batch_size=CFG.batch_size, shuffle=True, collate_fn=coll_fn)

CPU times: user 4.81 s, sys: 57.4 ms, total: 4.87 s
Wall time: 4.87 s


In [10]:
# 定义参数优化器 & 学习率优化器
optimizer = torch.optim.AdamW(model.parameters(), lr=CFG.lr)

lr_scheduler = get_linear_schedule_with_warmup(
        optimizer=optimizer,
        num_warmup_steps=CFG.warm_up_steps,
        num_training_steps=(len(instruction_loader) * CFG.num_train_epochs),
    )

In [11]:
def lora_tuning():
    # 训练阶段
    model.train()
    scaler = torch.cuda.amp.GradScaler()

    for epoch in range(CFG.num_train_epochs):
        for step, batch in tqdm(enumerate(instruction_loader)):
            # 前向传播 & 计算loss (使用fp16)
            with torch.cuda.amp.autocast():
                outputs = model(**batch)
                loss = outputs.loss
                if step%10==0:
                    print(loss)
            # 反向传播
            scaler.scale(loss).backward()
            # 优化器调度
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad()
            lr_scheduler.step()

In [12]:
lora_tuning()

0it [00:00, ?it/s]

tensor(17.0159, device='cuda:0', grad_fn=<NllLossBackward0>)


10it [00:07,  1.66it/s]

tensor(15.6001, device='cuda:0', grad_fn=<NllLossBackward0>)


20it [00:13,  1.70it/s]

tensor(16.1438, device='cuda:0', grad_fn=<NllLossBackward0>)


30it [00:18,  1.70it/s]

tensor(16.5460, device='cuda:0', grad_fn=<NllLossBackward0>)


40it [00:24,  1.70it/s]

tensor(15.6992, device='cuda:0', grad_fn=<NllLossBackward0>)


50it [00:30,  1.70it/s]

tensor(15.9086, device='cuda:0', grad_fn=<NllLossBackward0>)


60it [00:36,  1.70it/s]

tensor(15.9301, device='cuda:0', grad_fn=<NllLossBackward0>)


70it [00:42,  1.70it/s]

tensor(16.1512, device='cuda:0', grad_fn=<NllLossBackward0>)


80it [00:48,  1.70it/s]

tensor(14.6778, device='cuda:0', grad_fn=<NllLossBackward0>)


90it [00:54,  1.69it/s]

tensor(15.6397, device='cuda:0', grad_fn=<NllLossBackward0>)


100it [01:00,  1.69it/s]

tensor(15.7128, device='cuda:0', grad_fn=<NllLossBackward0>)


110it [01:06,  1.69it/s]

tensor(15.7362, device='cuda:0', grad_fn=<NllLossBackward0>)


120it [01:11,  1.69it/s]

tensor(15.3638, device='cuda:0', grad_fn=<NllLossBackward0>)


130it [01:17,  1.69it/s]

tensor(14.8456, device='cuda:0', grad_fn=<NllLossBackward0>)


140it [01:23,  1.69it/s]

tensor(13.9057, device='cuda:0', grad_fn=<NllLossBackward0>)


150it [01:29,  1.69it/s]

tensor(14.1827, device='cuda:0', grad_fn=<NllLossBackward0>)


160it [01:35,  1.69it/s]

tensor(13.5062, device='cuda:0', grad_fn=<NllLossBackward0>)


170it [01:41,  1.69it/s]

tensor(12.6888, device='cuda:0', grad_fn=<NllLossBackward0>)


180it [01:47,  1.69it/s]

tensor(11.6464, device='cuda:0', grad_fn=<NllLossBackward0>)


190it [01:53,  1.69it/s]

tensor(10.9594, device='cuda:0', grad_fn=<NllLossBackward0>)


200it [01:59,  1.69it/s]

tensor(9.9540, device='cuda:0', grad_fn=<NllLossBackward0>)


210it [02:05,  1.69it/s]

tensor(9.0112, device='cuda:0', grad_fn=<NllLossBackward0>)


220it [02:11,  1.69it/s]

tensor(8.6236, device='cuda:0', grad_fn=<NllLossBackward0>)


230it [02:16,  1.69it/s]

tensor(7.9817, device='cuda:0', grad_fn=<NllLossBackward0>)


240it [02:22,  1.69it/s]

tensor(7.4602, device='cuda:0', grad_fn=<NllLossBackward0>)


250it [02:28,  1.69it/s]

tensor(6.7753, device='cuda:0', grad_fn=<NllLossBackward0>)


260it [02:34,  1.69it/s]

tensor(6.6712, device='cuda:0', grad_fn=<NllLossBackward0>)


270it [02:40,  1.69it/s]

tensor(6.7931, device='cuda:0', grad_fn=<NllLossBackward0>)


280it [02:46,  1.69it/s]

tensor(6.3746, device='cuda:0', grad_fn=<NllLossBackward0>)


290it [02:52,  1.69it/s]

tensor(6.3691, device='cuda:0', grad_fn=<NllLossBackward0>)


300it [02:58,  1.69it/s]

tensor(6.3692, device='cuda:0', grad_fn=<NllLossBackward0>)


310it [03:04,  1.69it/s]

tensor(5.8958, device='cuda:0', grad_fn=<NllLossBackward0>)


320it [03:10,  1.69it/s]

tensor(5.9364, device='cuda:0', grad_fn=<NllLossBackward0>)


330it [03:16,  1.69it/s]

tensor(6.0623, device='cuda:0', grad_fn=<NllLossBackward0>)


340it [03:22,  1.69it/s]

tensor(5.4880, device='cuda:0', grad_fn=<NllLossBackward0>)


350it [03:28,  1.69it/s]

tensor(5.5495, device='cuda:0', grad_fn=<NllLossBackward0>)


360it [03:34,  1.69it/s]

tensor(5.7876, device='cuda:0', grad_fn=<NllLossBackward0>)


370it [03:39,  1.69it/s]

tensor(5.7133, device='cuda:0', grad_fn=<NllLossBackward0>)


380it [03:45,  1.69it/s]

tensor(5.2553, device='cuda:0', grad_fn=<NllLossBackward0>)


390it [03:51,  1.69it/s]

tensor(5.8841, device='cuda:0', grad_fn=<NllLossBackward0>)


400it [03:57,  1.68it/s]

tensor(5.4768, device='cuda:0', grad_fn=<NllLossBackward0>)


410it [04:03,  1.69it/s]

tensor(5.2880, device='cuda:0', grad_fn=<NllLossBackward0>)


420it [04:09,  1.69it/s]

tensor(5.0263, device='cuda:0', grad_fn=<NllLossBackward0>)


430it [04:15,  1.69it/s]

tensor(5.1278, device='cuda:0', grad_fn=<NllLossBackward0>)


440it [04:21,  1.68it/s]

tensor(5.2455, device='cuda:0', grad_fn=<NllLossBackward0>)


450it [04:27,  1.69it/s]

tensor(5.1878, device='cuda:0', grad_fn=<NllLossBackward0>)


460it [04:33,  1.69it/s]

tensor(5.0654, device='cuda:0', grad_fn=<NllLossBackward0>)


470it [04:39,  1.69it/s]

tensor(4.7954, device='cuda:0', grad_fn=<NllLossBackward0>)


480it [04:45,  1.69it/s]

tensor(4.7338, device='cuda:0', grad_fn=<NllLossBackward0>)


490it [04:51,  1.69it/s]

tensor(4.7782, device='cuda:0', grad_fn=<NllLossBackward0>)


500it [04:57,  1.69it/s]

tensor(4.6782, device='cuda:0', grad_fn=<NllLossBackward0>)


510it [05:02,  1.68it/s]

tensor(5.0206, device='cuda:0', grad_fn=<NllLossBackward0>)


520it [05:08,  1.69it/s]

tensor(4.6934, device='cuda:0', grad_fn=<NllLossBackward0>)


530it [05:14,  1.69it/s]

tensor(4.5298, device='cuda:0', grad_fn=<NllLossBackward0>)


540it [05:20,  1.68it/s]

tensor(4.9277, device='cuda:0', grad_fn=<NllLossBackward0>)


550it [05:26,  1.69it/s]

tensor(4.3718, device='cuda:0', grad_fn=<NllLossBackward0>)


560it [05:32,  1.68it/s]

tensor(4.5344, device='cuda:0', grad_fn=<NllLossBackward0>)


570it [05:38,  1.69it/s]

tensor(4.3553, device='cuda:0', grad_fn=<NllLossBackward0>)


580it [05:44,  1.69it/s]

tensor(4.8055, device='cuda:0', grad_fn=<NllLossBackward0>)


590it [05:50,  1.69it/s]

tensor(4.4083, device='cuda:0', grad_fn=<NllLossBackward0>)


600it [05:56,  1.69it/s]

tensor(4.1422, device='cuda:0', grad_fn=<NllLossBackward0>)


610it [06:02,  1.68it/s]

tensor(4.4064, device='cuda:0', grad_fn=<NllLossBackward0>)


620it [06:08,  1.69it/s]

tensor(4.7792, device='cuda:0', grad_fn=<NllLossBackward0>)


630it [06:14,  1.69it/s]

tensor(4.5498, device='cuda:0', grad_fn=<NllLossBackward0>)


640it [06:20,  1.69it/s]

tensor(4.3799, device='cuda:0', grad_fn=<NllLossBackward0>)


650it [06:26,  1.69it/s]

tensor(4.6397, device='cuda:0', grad_fn=<NllLossBackward0>)


660it [06:31,  1.68it/s]

tensor(4.4337, device='cuda:0', grad_fn=<NllLossBackward0>)


670it [06:37,  1.69it/s]

tensor(4.5501, device='cuda:0', grad_fn=<NllLossBackward0>)


680it [06:43,  1.69it/s]

tensor(4.1015, device='cuda:0', grad_fn=<NllLossBackward0>)


690it [06:49,  1.68it/s]

tensor(4.3354, device='cuda:0', grad_fn=<NllLossBackward0>)


700it [06:55,  1.69it/s]

tensor(4.4259, device='cuda:0', grad_fn=<NllLossBackward0>)


710it [07:01,  1.68it/s]

tensor(4.3243, device='cuda:0', grad_fn=<NllLossBackward0>)


720it [07:07,  1.69it/s]

tensor(4.3434, device='cuda:0', grad_fn=<NllLossBackward0>)


730it [07:13,  1.69it/s]

tensor(4.3251, device='cuda:0', grad_fn=<NllLossBackward0>)


740it [07:19,  1.69it/s]

tensor(4.4355, device='cuda:0', grad_fn=<NllLossBackward0>)


750it [07:25,  1.69it/s]

tensor(4.4194, device='cuda:0', grad_fn=<NllLossBackward0>)


760it [07:31,  1.69it/s]

tensor(4.1890, device='cuda:0', grad_fn=<NllLossBackward0>)


770it [07:37,  1.69it/s]

tensor(4.2344, device='cuda:0', grad_fn=<NllLossBackward0>)


780it [07:43,  1.69it/s]

tensor(4.3781, device='cuda:0', grad_fn=<NllLossBackward0>)


790it [07:49,  1.69it/s]

tensor(4.3057, device='cuda:0', grad_fn=<NllLossBackward0>)


800it [07:55,  1.68it/s]

tensor(3.8520, device='cuda:0', grad_fn=<NllLossBackward0>)


810it [08:00,  1.68it/s]

tensor(3.9918, device='cuda:0', grad_fn=<NllLossBackward0>)


820it [08:06,  1.69it/s]

tensor(4.0175, device='cuda:0', grad_fn=<NllLossBackward0>)


830it [08:12,  1.69it/s]

tensor(4.1955, device='cuda:0', grad_fn=<NllLossBackward0>)


840it [08:18,  1.69it/s]

tensor(3.9676, device='cuda:0', grad_fn=<NllLossBackward0>)


850it [08:24,  1.69it/s]

tensor(4.1299, device='cuda:0', grad_fn=<NllLossBackward0>)


860it [08:30,  1.68it/s]

tensor(3.9665, device='cuda:0', grad_fn=<NllLossBackward0>)


870it [08:36,  1.69it/s]

tensor(4.0766, device='cuda:0', grad_fn=<NllLossBackward0>)


880it [08:42,  1.69it/s]

tensor(4.1102, device='cuda:0', grad_fn=<NllLossBackward0>)


890it [08:48,  1.69it/s]

tensor(3.7106, device='cuda:0', grad_fn=<NllLossBackward0>)


900it [08:54,  1.68it/s]

tensor(3.7506, device='cuda:0', grad_fn=<NllLossBackward0>)


910it [09:00,  1.69it/s]

tensor(4.0711, device='cuda:0', grad_fn=<NllLossBackward0>)


920it [09:06,  1.69it/s]

tensor(3.8368, device='cuda:0', grad_fn=<NllLossBackward0>)


930it [09:12,  1.69it/s]

tensor(3.5989, device='cuda:0', grad_fn=<NllLossBackward0>)


940it [09:18,  1.69it/s]

tensor(3.4932, device='cuda:0', grad_fn=<NllLossBackward0>)


950it [09:23,  1.69it/s]

tensor(4.0072, device='cuda:0', grad_fn=<NllLossBackward0>)


960it [09:29,  1.69it/s]

tensor(3.8148, device='cuda:0', grad_fn=<NllLossBackward0>)


970it [09:35,  1.69it/s]

tensor(3.8012, device='cuda:0', grad_fn=<NllLossBackward0>)


980it [09:41,  1.69it/s]

tensor(3.9172, device='cuda:0', grad_fn=<NllLossBackward0>)


990it [09:47,  1.68it/s]

tensor(3.6991, device='cuda:0', grad_fn=<NllLossBackward0>)


1000it [09:53,  1.69it/s]

tensor(3.5626, device='cuda:0', grad_fn=<NllLossBackward0>)


1010it [09:59,  1.69it/s]

tensor(3.7952, device='cuda:0', grad_fn=<NllLossBackward0>)


1020it [10:05,  1.68it/s]

tensor(3.6879, device='cuda:0', grad_fn=<NllLossBackward0>)


1030it [10:11,  1.68it/s]

tensor(3.6651, device='cuda:0', grad_fn=<NllLossBackward0>)


1040it [10:17,  1.69it/s]

tensor(3.9912, device='cuda:0', grad_fn=<NllLossBackward0>)


1050it [10:23,  1.69it/s]

tensor(3.4405, device='cuda:0', grad_fn=<NllLossBackward0>)


1060it [10:29,  1.69it/s]

tensor(3.5146, device='cuda:0', grad_fn=<NllLossBackward0>)


1070it [10:35,  1.69it/s]

tensor(3.7528, device='cuda:0', grad_fn=<NllLossBackward0>)


1080it [10:41,  1.69it/s]

tensor(3.8642, device='cuda:0', grad_fn=<NllLossBackward0>)


1090it [10:47,  1.69it/s]

tensor(3.9214, device='cuda:0', grad_fn=<NllLossBackward0>)


1100it [10:52,  1.69it/s]

tensor(3.9785, device='cuda:0', grad_fn=<NllLossBackward0>)


1110it [10:58,  1.69it/s]

tensor(3.7692, device='cuda:0', grad_fn=<NllLossBackward0>)


1120it [11:04,  1.69it/s]

tensor(3.8530, device='cuda:0', grad_fn=<NllLossBackward0>)


1130it [11:10,  1.69it/s]

tensor(3.7070, device='cuda:0', grad_fn=<NllLossBackward0>)


1140it [11:16,  1.69it/s]

tensor(3.5898, device='cuda:0', grad_fn=<NllLossBackward0>)


1150it [11:22,  1.69it/s]

tensor(3.3036, device='cuda:0', grad_fn=<NllLossBackward0>)


1160it [11:28,  1.69it/s]

tensor(3.6382, device='cuda:0', grad_fn=<NllLossBackward0>)


1170it [11:34,  1.68it/s]

tensor(3.9900, device='cuda:0', grad_fn=<NllLossBackward0>)


1180it [11:40,  1.69it/s]

tensor(3.7070, device='cuda:0', grad_fn=<NllLossBackward0>)


1190it [11:46,  1.69it/s]
0it [00:00, ?it/s]

tensor(3.4254, device='cuda:0', grad_fn=<NllLossBackward0>)


10it [00:05,  1.69it/s]

tensor(4.0998, device='cuda:0', grad_fn=<NllLossBackward0>)


20it [00:11,  1.68it/s]

tensor(4.0490, device='cuda:0', grad_fn=<NllLossBackward0>)


30it [00:17,  1.69it/s]

tensor(3.3163, device='cuda:0', grad_fn=<NllLossBackward0>)


40it [00:23,  1.69it/s]

tensor(3.8359, device='cuda:0', grad_fn=<NllLossBackward0>)


50it [00:29,  1.69it/s]

tensor(3.7784, device='cuda:0', grad_fn=<NllLossBackward0>)


60it [00:35,  1.68it/s]

tensor(3.8321, device='cuda:0', grad_fn=<NllLossBackward0>)


70it [00:41,  1.69it/s]

tensor(3.2719, device='cuda:0', grad_fn=<NllLossBackward0>)


80it [00:47,  1.69it/s]

tensor(3.4927, device='cuda:0', grad_fn=<NllLossBackward0>)


90it [00:53,  1.68it/s]

tensor(3.4085, device='cuda:0', grad_fn=<NllLossBackward0>)


100it [00:59,  1.69it/s]

tensor(3.8188, device='cuda:0', grad_fn=<NllLossBackward0>)


110it [01:05,  1.68it/s]

tensor(3.5712, device='cuda:0', grad_fn=<NllLossBackward0>)


120it [01:11,  1.68it/s]

tensor(3.4816, device='cuda:0', grad_fn=<NllLossBackward0>)


130it [01:17,  1.68it/s]

tensor(3.4522, device='cuda:0', grad_fn=<NllLossBackward0>)


140it [01:23,  1.69it/s]

tensor(3.4423, device='cuda:0', grad_fn=<NllLossBackward0>)


150it [01:29,  1.68it/s]

tensor(3.4052, device='cuda:0', grad_fn=<NllLossBackward0>)


160it [01:34,  1.69it/s]

tensor(3.5317, device='cuda:0', grad_fn=<NllLossBackward0>)


170it [01:40,  1.68it/s]

tensor(3.6873, device='cuda:0', grad_fn=<NllLossBackward0>)


180it [01:46,  1.69it/s]

tensor(3.4061, device='cuda:0', grad_fn=<NllLossBackward0>)


190it [01:52,  1.68it/s]

tensor(3.6595, device='cuda:0', grad_fn=<NllLossBackward0>)


200it [01:58,  1.69it/s]

tensor(3.3030, device='cuda:0', grad_fn=<NllLossBackward0>)


210it [02:04,  1.69it/s]

tensor(3.3829, device='cuda:0', grad_fn=<NllLossBackward0>)


220it [02:10,  1.69it/s]

tensor(3.6886, device='cuda:0', grad_fn=<NllLossBackward0>)


230it [02:16,  1.69it/s]

tensor(3.1744, device='cuda:0', grad_fn=<NllLossBackward0>)


240it [02:22,  1.69it/s]

tensor(3.7072, device='cuda:0', grad_fn=<NllLossBackward0>)


250it [02:28,  1.69it/s]

tensor(3.3532, device='cuda:0', grad_fn=<NllLossBackward0>)


260it [02:34,  1.69it/s]

tensor(3.7399, device='cuda:0', grad_fn=<NllLossBackward0>)


270it [02:40,  1.69it/s]

tensor(3.5495, device='cuda:0', grad_fn=<NllLossBackward0>)


280it [02:46,  1.69it/s]

tensor(3.3909, device='cuda:0', grad_fn=<NllLossBackward0>)


290it [02:52,  1.69it/s]

tensor(3.7276, device='cuda:0', grad_fn=<NllLossBackward0>)


300it [02:58,  1.68it/s]

tensor(3.1137, device='cuda:0', grad_fn=<NllLossBackward0>)


310it [03:03,  1.69it/s]

tensor(3.3326, device='cuda:0', grad_fn=<NllLossBackward0>)


320it [03:09,  1.69it/s]

tensor(3.5579, device='cuda:0', grad_fn=<NllLossBackward0>)


330it [03:15,  1.69it/s]

tensor(3.5835, device='cuda:0', grad_fn=<NllLossBackward0>)


340it [03:21,  1.69it/s]

tensor(3.1617, device='cuda:0', grad_fn=<NllLossBackward0>)


350it [03:27,  1.69it/s]

tensor(2.9533, device='cuda:0', grad_fn=<NllLossBackward0>)


360it [03:33,  1.69it/s]

tensor(3.0617, device='cuda:0', grad_fn=<NllLossBackward0>)


370it [03:39,  1.69it/s]

tensor(3.7926, device='cuda:0', grad_fn=<NllLossBackward0>)


380it [03:45,  1.69it/s]

tensor(3.2292, device='cuda:0', grad_fn=<NllLossBackward0>)


390it [03:51,  1.69it/s]

tensor(3.6573, device='cuda:0', grad_fn=<NllLossBackward0>)


400it [03:57,  1.69it/s]

tensor(3.4958, device='cuda:0', grad_fn=<NllLossBackward0>)


410it [04:03,  1.69it/s]

tensor(3.2442, device='cuda:0', grad_fn=<NllLossBackward0>)


420it [04:09,  1.69it/s]

tensor(3.4150, device='cuda:0', grad_fn=<NllLossBackward0>)


430it [04:15,  1.69it/s]

tensor(3.2040, device='cuda:0', grad_fn=<NllLossBackward0>)


440it [04:21,  1.69it/s]

tensor(3.0409, device='cuda:0', grad_fn=<NllLossBackward0>)


450it [04:26,  1.69it/s]

tensor(3.5046, device='cuda:0', grad_fn=<NllLossBackward0>)


460it [04:32,  1.69it/s]

tensor(3.5152, device='cuda:0', grad_fn=<NllLossBackward0>)


470it [04:38,  1.69it/s]

tensor(3.1262, device='cuda:0', grad_fn=<NllLossBackward0>)


480it [04:44,  1.69it/s]

tensor(3.2379, device='cuda:0', grad_fn=<NllLossBackward0>)


490it [04:50,  1.68it/s]

tensor(3.4275, device='cuda:0', grad_fn=<NllLossBackward0>)


500it [04:56,  1.69it/s]

tensor(3.2564, device='cuda:0', grad_fn=<NllLossBackward0>)


510it [05:02,  1.69it/s]

tensor(3.1097, device='cuda:0', grad_fn=<NllLossBackward0>)


520it [05:08,  1.69it/s]

tensor(3.0235, device='cuda:0', grad_fn=<NllLossBackward0>)


530it [05:14,  1.68it/s]

tensor(3.2859, device='cuda:0', grad_fn=<NllLossBackward0>)


540it [05:20,  1.69it/s]

tensor(3.3005, device='cuda:0', grad_fn=<NllLossBackward0>)


550it [05:26,  1.69it/s]

tensor(3.2095, device='cuda:0', grad_fn=<NllLossBackward0>)


560it [05:32,  1.69it/s]

tensor(3.9109, device='cuda:0', grad_fn=<NllLossBackward0>)


570it [05:38,  1.69it/s]

tensor(3.5489, device='cuda:0', grad_fn=<NllLossBackward0>)


580it [05:44,  1.68it/s]

tensor(3.0832, device='cuda:0', grad_fn=<NllLossBackward0>)


590it [05:50,  1.69it/s]

tensor(3.7703, device='cuda:0', grad_fn=<NllLossBackward0>)


600it [05:55,  1.69it/s]

tensor(3.1114, device='cuda:0', grad_fn=<NllLossBackward0>)


610it [06:01,  1.68it/s]

tensor(3.2655, device='cuda:0', grad_fn=<NllLossBackward0>)


620it [06:07,  1.69it/s]

tensor(3.5226, device='cuda:0', grad_fn=<NllLossBackward0>)


630it [06:13,  1.69it/s]

tensor(3.0655, device='cuda:0', grad_fn=<NllLossBackward0>)


640it [06:19,  1.68it/s]

tensor(3.6349, device='cuda:0', grad_fn=<NllLossBackward0>)


650it [06:25,  1.68it/s]

tensor(3.1559, device='cuda:0', grad_fn=<NllLossBackward0>)


660it [06:31,  1.68it/s]

tensor(2.9447, device='cuda:0', grad_fn=<NllLossBackward0>)


670it [06:37,  1.69it/s]

tensor(3.1753, device='cuda:0', grad_fn=<NllLossBackward0>)


680it [06:43,  1.69it/s]

tensor(3.6798, device='cuda:0', grad_fn=<NllLossBackward0>)


690it [06:49,  1.68it/s]

tensor(3.1216, device='cuda:0', grad_fn=<NllLossBackward0>)


700it [06:55,  1.69it/s]

tensor(3.1483, device='cuda:0', grad_fn=<NllLossBackward0>)


710it [07:01,  1.69it/s]

tensor(3.4052, device='cuda:0', grad_fn=<NllLossBackward0>)


720it [07:07,  1.69it/s]

tensor(3.0611, device='cuda:0', grad_fn=<NllLossBackward0>)


730it [07:13,  1.68it/s]

tensor(3.1162, device='cuda:0', grad_fn=<NllLossBackward0>)


740it [07:19,  1.68it/s]

tensor(3.0657, device='cuda:0', grad_fn=<NllLossBackward0>)


750it [07:25,  1.68it/s]

tensor(3.0485, device='cuda:0', grad_fn=<NllLossBackward0>)


760it [07:30,  1.69it/s]

tensor(3.2151, device='cuda:0', grad_fn=<NllLossBackward0>)


770it [07:36,  1.69it/s]

tensor(2.7992, device='cuda:0', grad_fn=<NllLossBackward0>)


780it [07:42,  1.69it/s]

tensor(3.2580, device='cuda:0', grad_fn=<NllLossBackward0>)


790it [07:48,  1.69it/s]

tensor(2.8742, device='cuda:0', grad_fn=<NllLossBackward0>)


800it [07:54,  1.69it/s]

tensor(3.2194, device='cuda:0', grad_fn=<NllLossBackward0>)


810it [08:00,  1.69it/s]

tensor(2.8168, device='cuda:0', grad_fn=<NllLossBackward0>)


820it [08:06,  1.68it/s]

tensor(3.0509, device='cuda:0', grad_fn=<NllLossBackward0>)


830it [08:12,  1.68it/s]

tensor(3.1914, device='cuda:0', grad_fn=<NllLossBackward0>)


840it [08:18,  1.68it/s]

tensor(3.5215, device='cuda:0', grad_fn=<NllLossBackward0>)


850it [08:24,  1.69it/s]

tensor(3.1574, device='cuda:0', grad_fn=<NllLossBackward0>)


860it [08:30,  1.69it/s]

tensor(2.9799, device='cuda:0', grad_fn=<NllLossBackward0>)


870it [08:36,  1.69it/s]

tensor(3.3469, device='cuda:0', grad_fn=<NllLossBackward0>)


880it [08:42,  1.69it/s]

tensor(3.3679, device='cuda:0', grad_fn=<NllLossBackward0>)


890it [08:48,  1.69it/s]

tensor(2.8454, device='cuda:0', grad_fn=<NllLossBackward0>)


900it [08:54,  1.69it/s]

tensor(3.1722, device='cuda:0', grad_fn=<NllLossBackward0>)


910it [08:59,  1.69it/s]

tensor(3.3638, device='cuda:0', grad_fn=<NllLossBackward0>)


920it [09:05,  1.69it/s]

tensor(3.4160, device='cuda:0', grad_fn=<NllLossBackward0>)


930it [09:11,  1.69it/s]

tensor(3.1535, device='cuda:0', grad_fn=<NllLossBackward0>)


940it [09:17,  1.68it/s]

tensor(2.8206, device='cuda:0', grad_fn=<NllLossBackward0>)


950it [09:23,  1.68it/s]

tensor(3.2598, device='cuda:0', grad_fn=<NllLossBackward0>)


960it [09:29,  1.69it/s]

tensor(2.8150, device='cuda:0', grad_fn=<NllLossBackward0>)


970it [09:35,  1.69it/s]

tensor(3.1519, device='cuda:0', grad_fn=<NllLossBackward0>)


980it [09:41,  1.69it/s]

tensor(2.9969, device='cuda:0', grad_fn=<NllLossBackward0>)


990it [09:47,  1.69it/s]

tensor(2.9699, device='cuda:0', grad_fn=<NllLossBackward0>)


1000it [09:53,  1.69it/s]

tensor(3.7006, device='cuda:0', grad_fn=<NllLossBackward0>)


1010it [09:59,  1.69it/s]

tensor(3.2962, device='cuda:0', grad_fn=<NllLossBackward0>)


1020it [10:05,  1.68it/s]

tensor(3.2552, device='cuda:0', grad_fn=<NllLossBackward0>)


1030it [10:11,  1.68it/s]

tensor(3.1462, device='cuda:0', grad_fn=<NllLossBackward0>)


1040it [10:17,  1.69it/s]

tensor(2.9244, device='cuda:0', grad_fn=<NllLossBackward0>)


1050it [10:23,  1.68it/s]

tensor(2.9887, device='cuda:0', grad_fn=<NllLossBackward0>)


1060it [10:29,  1.68it/s]

tensor(2.9117, device='cuda:0', grad_fn=<NllLossBackward0>)


1070it [10:34,  1.69it/s]

tensor(3.4305, device='cuda:0', grad_fn=<NllLossBackward0>)


1080it [10:40,  1.68it/s]

tensor(3.0107, device='cuda:0', grad_fn=<NllLossBackward0>)


1090it [10:46,  1.68it/s]

tensor(2.9192, device='cuda:0', grad_fn=<NllLossBackward0>)


1100it [10:52,  1.69it/s]

tensor(3.3372, device='cuda:0', grad_fn=<NllLossBackward0>)


1110it [10:58,  1.68it/s]

tensor(2.7229, device='cuda:0', grad_fn=<NllLossBackward0>)


1120it [11:04,  1.68it/s]

tensor(3.0830, device='cuda:0', grad_fn=<NllLossBackward0>)


1130it [11:10,  1.69it/s]

tensor(2.7059, device='cuda:0', grad_fn=<NllLossBackward0>)


1140it [11:16,  1.69it/s]

tensor(2.9845, device='cuda:0', grad_fn=<NllLossBackward0>)


1150it [11:22,  1.69it/s]

tensor(3.0657, device='cuda:0', grad_fn=<NllLossBackward0>)


1160it [11:28,  1.69it/s]

tensor(3.1764, device='cuda:0', grad_fn=<NllLossBackward0>)


1170it [11:34,  1.68it/s]

tensor(3.0503, device='cuda:0', grad_fn=<NllLossBackward0>)


1180it [11:40,  1.68it/s]

tensor(2.9381, device='cuda:0', grad_fn=<NllLossBackward0>)


1190it [11:45,  1.69it/s]
0it [00:00, ?it/s]

tensor(3.4527, device='cuda:0', grad_fn=<NllLossBackward0>)


10it [00:05,  1.68it/s]

tensor(2.9640, device='cuda:0', grad_fn=<NllLossBackward0>)


20it [00:11,  1.68it/s]

tensor(3.1839, device='cuda:0', grad_fn=<NllLossBackward0>)


30it [00:17,  1.68it/s]

tensor(3.1166, device='cuda:0', grad_fn=<NllLossBackward0>)


40it [00:23,  1.69it/s]

tensor(3.0873, device='cuda:0', grad_fn=<NllLossBackward0>)


50it [00:29,  1.69it/s]

tensor(3.0579, device='cuda:0', grad_fn=<NllLossBackward0>)


60it [00:35,  1.68it/s]

tensor(3.1161, device='cuda:0', grad_fn=<NllLossBackward0>)


70it [00:41,  1.69it/s]

tensor(3.1095, device='cuda:0', grad_fn=<NllLossBackward0>)


80it [00:47,  1.68it/s]

tensor(3.0732, device='cuda:0', grad_fn=<NllLossBackward0>)


90it [00:53,  1.68it/s]

tensor(2.8857, device='cuda:0', grad_fn=<NllLossBackward0>)


100it [00:59,  1.68it/s]

tensor(3.2178, device='cuda:0', grad_fn=<NllLossBackward0>)


110it [01:05,  1.69it/s]

tensor(3.2514, device='cuda:0', grad_fn=<NllLossBackward0>)


120it [01:11,  1.68it/s]

tensor(3.1447, device='cuda:0', grad_fn=<NllLossBackward0>)


130it [01:17,  1.69it/s]

tensor(3.1949, device='cuda:0', grad_fn=<NllLossBackward0>)


140it [01:23,  1.69it/s]

tensor(2.9821, device='cuda:0', grad_fn=<NllLossBackward0>)


150it [01:29,  1.69it/s]

tensor(3.0712, device='cuda:0', grad_fn=<NllLossBackward0>)


160it [01:35,  1.69it/s]

tensor(2.9400, device='cuda:0', grad_fn=<NllLossBackward0>)


170it [01:40,  1.69it/s]

tensor(2.7851, device='cuda:0', grad_fn=<NllLossBackward0>)


180it [01:46,  1.68it/s]

tensor(2.7627, device='cuda:0', grad_fn=<NllLossBackward0>)


190it [01:52,  1.69it/s]

tensor(3.0811, device='cuda:0', grad_fn=<NllLossBackward0>)


200it [01:58,  1.69it/s]

tensor(2.8768, device='cuda:0', grad_fn=<NllLossBackward0>)


210it [02:04,  1.68it/s]

tensor(3.0104, device='cuda:0', grad_fn=<NllLossBackward0>)


220it [02:10,  1.69it/s]

tensor(3.1368, device='cuda:0', grad_fn=<NllLossBackward0>)


230it [02:16,  1.69it/s]

tensor(2.9672, device='cuda:0', grad_fn=<NllLossBackward0>)


240it [02:22,  1.69it/s]

tensor(2.9049, device='cuda:0', grad_fn=<NllLossBackward0>)


250it [02:28,  1.69it/s]

tensor(3.0959, device='cuda:0', grad_fn=<NllLossBackward0>)


260it [02:34,  1.69it/s]

tensor(2.9921, device='cuda:0', grad_fn=<NllLossBackward0>)


270it [02:40,  1.69it/s]

tensor(3.2954, device='cuda:0', grad_fn=<NllLossBackward0>)


280it [02:46,  1.69it/s]

tensor(3.2196, device='cuda:0', grad_fn=<NllLossBackward0>)


290it [02:52,  1.68it/s]

tensor(2.8517, device='cuda:0', grad_fn=<NllLossBackward0>)


300it [02:58,  1.68it/s]

tensor(3.1403, device='cuda:0', grad_fn=<NllLossBackward0>)


310it [03:04,  1.69it/s]

tensor(2.9584, device='cuda:0', grad_fn=<NllLossBackward0>)


320it [03:10,  1.69it/s]

tensor(2.9986, device='cuda:0', grad_fn=<NllLossBackward0>)


330it [03:15,  1.68it/s]

tensor(2.9684, device='cuda:0', grad_fn=<NllLossBackward0>)


340it [03:21,  1.69it/s]

tensor(3.2615, device='cuda:0', grad_fn=<NllLossBackward0>)


350it [03:27,  1.68it/s]

tensor(3.2668, device='cuda:0', grad_fn=<NllLossBackward0>)


360it [03:33,  1.69it/s]

tensor(3.1078, device='cuda:0', grad_fn=<NllLossBackward0>)


370it [03:39,  1.68it/s]

tensor(3.1513, device='cuda:0', grad_fn=<NllLossBackward0>)


380it [03:45,  1.69it/s]

tensor(3.1552, device='cuda:0', grad_fn=<NllLossBackward0>)


390it [03:51,  1.68it/s]

tensor(3.0205, device='cuda:0', grad_fn=<NllLossBackward0>)


400it [03:57,  1.69it/s]

tensor(2.6790, device='cuda:0', grad_fn=<NllLossBackward0>)


410it [04:03,  1.69it/s]

tensor(2.7172, device='cuda:0', grad_fn=<NllLossBackward0>)


420it [04:09,  1.69it/s]

tensor(2.9913, device='cuda:0', grad_fn=<NllLossBackward0>)


430it [04:15,  1.69it/s]

tensor(3.0273, device='cuda:0', grad_fn=<NllLossBackward0>)


440it [04:21,  1.68it/s]

tensor(2.8862, device='cuda:0', grad_fn=<NllLossBackward0>)


450it [04:27,  1.69it/s]

tensor(2.7203, device='cuda:0', grad_fn=<NllLossBackward0>)


460it [04:33,  1.69it/s]

tensor(3.0530, device='cuda:0', grad_fn=<NllLossBackward0>)


470it [04:38,  1.69it/s]

tensor(2.9088, device='cuda:0', grad_fn=<NllLossBackward0>)


480it [04:44,  1.69it/s]

tensor(2.9553, device='cuda:0', grad_fn=<NllLossBackward0>)


490it [04:50,  1.69it/s]

tensor(3.0774, device='cuda:0', grad_fn=<NllLossBackward0>)


500it [04:56,  1.69it/s]

tensor(2.9156, device='cuda:0', grad_fn=<NllLossBackward0>)


510it [05:02,  1.69it/s]

tensor(3.0213, device='cuda:0', grad_fn=<NllLossBackward0>)


520it [05:08,  1.69it/s]

tensor(3.0490, device='cuda:0', grad_fn=<NllLossBackward0>)


530it [05:14,  1.68it/s]

tensor(2.4228, device='cuda:0', grad_fn=<NllLossBackward0>)


540it [05:20,  1.68it/s]

tensor(2.7931, device='cuda:0', grad_fn=<NllLossBackward0>)


550it [05:26,  1.68it/s]

tensor(2.9796, device='cuda:0', grad_fn=<NllLossBackward0>)


560it [05:32,  1.68it/s]

tensor(3.3829, device='cuda:0', grad_fn=<NllLossBackward0>)


570it [05:38,  1.68it/s]

tensor(2.6103, device='cuda:0', grad_fn=<NllLossBackward0>)


580it [05:44,  1.69it/s]

tensor(2.6497, device='cuda:0', grad_fn=<NllLossBackward0>)


590it [05:50,  1.69it/s]

tensor(2.8378, device='cuda:0', grad_fn=<NllLossBackward0>)


600it [05:56,  1.69it/s]

tensor(3.2285, device='cuda:0', grad_fn=<NllLossBackward0>)


610it [06:02,  1.68it/s]

tensor(3.0909, device='cuda:0', grad_fn=<NllLossBackward0>)


620it [06:07,  1.69it/s]

tensor(2.8880, device='cuda:0', grad_fn=<NllLossBackward0>)


630it [06:13,  1.69it/s]

tensor(2.8217, device='cuda:0', grad_fn=<NllLossBackward0>)


640it [06:19,  1.69it/s]

tensor(2.8629, device='cuda:0', grad_fn=<NllLossBackward0>)


650it [06:25,  1.68it/s]

tensor(2.9002, device='cuda:0', grad_fn=<NllLossBackward0>)


660it [06:31,  1.69it/s]

tensor(2.7532, device='cuda:0', grad_fn=<NllLossBackward0>)


670it [06:37,  1.69it/s]

tensor(2.5765, device='cuda:0', grad_fn=<NllLossBackward0>)


680it [06:43,  1.69it/s]

tensor(3.3816, device='cuda:0', grad_fn=<NllLossBackward0>)


690it [06:49,  1.68it/s]

tensor(2.9456, device='cuda:0', grad_fn=<NllLossBackward0>)


700it [06:55,  1.69it/s]

tensor(3.1358, device='cuda:0', grad_fn=<NllLossBackward0>)


710it [07:01,  1.68it/s]

tensor(3.0776, device='cuda:0', grad_fn=<NllLossBackward0>)


720it [07:07,  1.69it/s]

tensor(2.8555, device='cuda:0', grad_fn=<NllLossBackward0>)


730it [07:13,  1.69it/s]

tensor(2.8838, device='cuda:0', grad_fn=<NllLossBackward0>)


740it [07:19,  1.69it/s]

tensor(2.8686, device='cuda:0', grad_fn=<NllLossBackward0>)


750it [07:25,  1.68it/s]

tensor(2.8878, device='cuda:0', grad_fn=<NllLossBackward0>)


760it [07:31,  1.69it/s]

tensor(2.7640, device='cuda:0', grad_fn=<NllLossBackward0>)


770it [07:36,  1.68it/s]

tensor(2.8537, device='cuda:0', grad_fn=<NllLossBackward0>)


780it [07:42,  1.68it/s]

tensor(2.8418, device='cuda:0', grad_fn=<NllLossBackward0>)


790it [07:48,  1.69it/s]

tensor(2.8719, device='cuda:0', grad_fn=<NllLossBackward0>)


800it [07:54,  1.68it/s]

tensor(3.2900, device='cuda:0', grad_fn=<NllLossBackward0>)


810it [08:00,  1.69it/s]

tensor(2.6843, device='cuda:0', grad_fn=<NllLossBackward0>)


820it [08:06,  1.69it/s]

tensor(2.8620, device='cuda:0', grad_fn=<NllLossBackward0>)


830it [08:12,  1.69it/s]

tensor(2.8802, device='cuda:0', grad_fn=<NllLossBackward0>)


840it [08:18,  1.69it/s]

tensor(2.7734, device='cuda:0', grad_fn=<NllLossBackward0>)


850it [08:24,  1.68it/s]

tensor(2.9269, device='cuda:0', grad_fn=<NllLossBackward0>)


860it [08:30,  1.69it/s]

tensor(3.1282, device='cuda:0', grad_fn=<NllLossBackward0>)


870it [08:36,  1.69it/s]

tensor(2.9207, device='cuda:0', grad_fn=<NllLossBackward0>)


880it [08:42,  1.69it/s]

tensor(2.6011, device='cuda:0', grad_fn=<NllLossBackward0>)


890it [08:48,  1.68it/s]

tensor(2.5560, device='cuda:0', grad_fn=<NllLossBackward0>)


900it [08:54,  1.69it/s]

tensor(2.8418, device='cuda:0', grad_fn=<NllLossBackward0>)


910it [09:00,  1.69it/s]

tensor(2.9334, device='cuda:0', grad_fn=<NllLossBackward0>)


920it [09:05,  1.69it/s]

tensor(2.6421, device='cuda:0', grad_fn=<NllLossBackward0>)


930it [09:11,  1.69it/s]

tensor(2.9897, device='cuda:0', grad_fn=<NllLossBackward0>)


940it [09:17,  1.69it/s]

tensor(3.1534, device='cuda:0', grad_fn=<NllLossBackward0>)


950it [09:23,  1.68it/s]

tensor(2.4028, device='cuda:0', grad_fn=<NllLossBackward0>)


960it [09:29,  1.69it/s]

tensor(2.5117, device='cuda:0', grad_fn=<NllLossBackward0>)


970it [09:35,  1.68it/s]

tensor(2.6934, device='cuda:0', grad_fn=<NllLossBackward0>)


980it [09:41,  1.69it/s]

tensor(2.5265, device='cuda:0', grad_fn=<NllLossBackward0>)


990it [09:47,  1.69it/s]

tensor(2.9990, device='cuda:0', grad_fn=<NllLossBackward0>)


1000it [09:53,  1.68it/s]

tensor(2.7236, device='cuda:0', grad_fn=<NllLossBackward0>)


1010it [09:59,  1.68it/s]

tensor(3.0256, device='cuda:0', grad_fn=<NllLossBackward0>)


1020it [10:05,  1.69it/s]

tensor(2.6930, device='cuda:0', grad_fn=<NllLossBackward0>)


1030it [10:11,  1.68it/s]

tensor(2.5749, device='cuda:0', grad_fn=<NllLossBackward0>)


1040it [10:17,  1.69it/s]

tensor(3.0007, device='cuda:0', grad_fn=<NllLossBackward0>)


1050it [10:23,  1.69it/s]

tensor(2.9212, device='cuda:0', grad_fn=<NllLossBackward0>)


1060it [10:29,  1.68it/s]

tensor(2.7703, device='cuda:0', grad_fn=<NllLossBackward0>)


1070it [10:34,  1.69it/s]

tensor(3.1234, device='cuda:0', grad_fn=<NllLossBackward0>)


1080it [10:40,  1.69it/s]

tensor(3.1506, device='cuda:0', grad_fn=<NllLossBackward0>)


1090it [10:46,  1.69it/s]

tensor(2.5957, device='cuda:0', grad_fn=<NllLossBackward0>)


1100it [10:52,  1.69it/s]

tensor(2.7603, device='cuda:0', grad_fn=<NllLossBackward0>)


1110it [10:58,  1.69it/s]

tensor(3.1229, device='cuda:0', grad_fn=<NllLossBackward0>)


1120it [11:04,  1.68it/s]

tensor(2.6120, device='cuda:0', grad_fn=<NllLossBackward0>)


1130it [11:10,  1.69it/s]

tensor(3.2064, device='cuda:0', grad_fn=<NllLossBackward0>)


1140it [11:16,  1.69it/s]

tensor(2.9678, device='cuda:0', grad_fn=<NllLossBackward0>)


1150it [11:22,  1.69it/s]

tensor(2.9028, device='cuda:0', grad_fn=<NllLossBackward0>)


1160it [11:28,  1.69it/s]

tensor(3.0501, device='cuda:0', grad_fn=<NllLossBackward0>)


1170it [11:34,  1.69it/s]

tensor(3.0232, device='cuda:0', grad_fn=<NllLossBackward0>)


1180it [11:40,  1.69it/s]

tensor(2.8764, device='cuda:0', grad_fn=<NllLossBackward0>)


1190it [11:45,  1.69it/s]
0it [00:00, ?it/s]

tensor(2.5678, device='cuda:0', grad_fn=<NllLossBackward0>)


10it [00:05,  1.69it/s]

tensor(2.4442, device='cuda:0', grad_fn=<NllLossBackward0>)


20it [00:11,  1.68it/s]

tensor(2.6341, device='cuda:0', grad_fn=<NllLossBackward0>)


30it [00:17,  1.68it/s]

tensor(3.0533, device='cuda:0', grad_fn=<NllLossBackward0>)


40it [00:23,  1.69it/s]

tensor(2.6340, device='cuda:0', grad_fn=<NllLossBackward0>)


50it [00:29,  1.69it/s]

tensor(2.7026, device='cuda:0', grad_fn=<NllLossBackward0>)


60it [00:35,  1.69it/s]

tensor(2.6634, device='cuda:0', grad_fn=<NllLossBackward0>)


70it [00:41,  1.68it/s]

tensor(2.8669, device='cuda:0', grad_fn=<NllLossBackward0>)


80it [00:47,  1.69it/s]

tensor(3.0198, device='cuda:0', grad_fn=<NllLossBackward0>)


90it [00:53,  1.68it/s]

tensor(2.6718, device='cuda:0', grad_fn=<NllLossBackward0>)


100it [00:59,  1.69it/s]

tensor(2.9496, device='cuda:0', grad_fn=<NllLossBackward0>)


110it [01:05,  1.68it/s]

tensor(2.6325, device='cuda:0', grad_fn=<NllLossBackward0>)


120it [01:11,  1.68it/s]

tensor(2.5242, device='cuda:0', grad_fn=<NllLossBackward0>)


130it [01:17,  1.69it/s]

tensor(2.7938, device='cuda:0', grad_fn=<NllLossBackward0>)


140it [01:23,  1.69it/s]

tensor(2.9554, device='cuda:0', grad_fn=<NllLossBackward0>)


150it [01:29,  1.68it/s]

tensor(2.7673, device='cuda:0', grad_fn=<NllLossBackward0>)


160it [01:34,  1.68it/s]

tensor(2.5395, device='cuda:0', grad_fn=<NllLossBackward0>)


170it [01:40,  1.69it/s]

tensor(3.1542, device='cuda:0', grad_fn=<NllLossBackward0>)


180it [01:46,  1.69it/s]

tensor(3.0565, device='cuda:0', grad_fn=<NllLossBackward0>)


190it [01:52,  1.68it/s]

tensor(2.7539, device='cuda:0', grad_fn=<NllLossBackward0>)


200it [01:58,  1.68it/s]

tensor(2.8238, device='cuda:0', grad_fn=<NllLossBackward0>)


210it [02:04,  1.69it/s]

tensor(3.0174, device='cuda:0', grad_fn=<NllLossBackward0>)


220it [02:10,  1.69it/s]

tensor(2.8575, device='cuda:0', grad_fn=<NllLossBackward0>)


230it [02:16,  1.69it/s]

tensor(2.5994, device='cuda:0', grad_fn=<NllLossBackward0>)


240it [02:22,  1.69it/s]

tensor(2.7489, device='cuda:0', grad_fn=<NllLossBackward0>)


250it [02:28,  1.69it/s]

tensor(3.2401, device='cuda:0', grad_fn=<NllLossBackward0>)


260it [02:34,  1.68it/s]

tensor(3.3597, device='cuda:0', grad_fn=<NllLossBackward0>)


270it [02:40,  1.68it/s]

tensor(3.1695, device='cuda:0', grad_fn=<NllLossBackward0>)


280it [02:46,  1.69it/s]

tensor(2.6469, device='cuda:0', grad_fn=<NllLossBackward0>)


290it [02:52,  1.69it/s]

tensor(2.8035, device='cuda:0', grad_fn=<NllLossBackward0>)


300it [02:58,  1.68it/s]

tensor(2.5946, device='cuda:0', grad_fn=<NllLossBackward0>)


310it [03:04,  1.69it/s]

tensor(2.9461, device='cuda:0', grad_fn=<NllLossBackward0>)


320it [03:09,  1.68it/s]

tensor(2.9659, device='cuda:0', grad_fn=<NllLossBackward0>)


330it [03:15,  1.69it/s]

tensor(2.8834, device='cuda:0', grad_fn=<NllLossBackward0>)


340it [03:21,  1.69it/s]

tensor(3.5153, device='cuda:0', grad_fn=<NllLossBackward0>)


350it [03:27,  1.69it/s]

tensor(2.9411, device='cuda:0', grad_fn=<NllLossBackward0>)


360it [03:33,  1.69it/s]

tensor(2.2857, device='cuda:0', grad_fn=<NllLossBackward0>)


370it [03:39,  1.69it/s]

tensor(3.1859, device='cuda:0', grad_fn=<NllLossBackward0>)


380it [03:45,  1.69it/s]

tensor(2.9303, device='cuda:0', grad_fn=<NllLossBackward0>)


390it [03:51,  1.69it/s]

tensor(2.7625, device='cuda:0', grad_fn=<NllLossBackward0>)


400it [03:57,  1.68it/s]

tensor(2.6967, device='cuda:0', grad_fn=<NllLossBackward0>)


410it [04:03,  1.69it/s]

tensor(2.6524, device='cuda:0', grad_fn=<NllLossBackward0>)


420it [04:09,  1.68it/s]

tensor(3.2873, device='cuda:0', grad_fn=<NllLossBackward0>)


430it [04:15,  1.68it/s]

tensor(2.9822, device='cuda:0', grad_fn=<NllLossBackward0>)


440it [04:21,  1.68it/s]

tensor(2.8809, device='cuda:0', grad_fn=<NllLossBackward0>)


450it [04:27,  1.69it/s]

tensor(2.6215, device='cuda:0', grad_fn=<NllLossBackward0>)


460it [04:33,  1.69it/s]

tensor(2.6648, device='cuda:0', grad_fn=<NllLossBackward0>)


470it [04:38,  1.69it/s]

tensor(2.8778, device='cuda:0', grad_fn=<NllLossBackward0>)


480it [04:44,  1.69it/s]

tensor(2.7983, device='cuda:0', grad_fn=<NllLossBackward0>)


490it [04:50,  1.69it/s]

tensor(2.8974, device='cuda:0', grad_fn=<NllLossBackward0>)


500it [04:56,  1.69it/s]

tensor(2.8072, device='cuda:0', grad_fn=<NllLossBackward0>)


510it [05:02,  1.69it/s]

tensor(3.0107, device='cuda:0', grad_fn=<NllLossBackward0>)


520it [05:08,  1.69it/s]

tensor(2.9895, device='cuda:0', grad_fn=<NllLossBackward0>)


530it [05:14,  1.69it/s]

tensor(2.6875, device='cuda:0', grad_fn=<NllLossBackward0>)


540it [05:20,  1.69it/s]

tensor(2.7392, device='cuda:0', grad_fn=<NllLossBackward0>)


550it [05:26,  1.69it/s]

tensor(2.7605, device='cuda:0', grad_fn=<NllLossBackward0>)


560it [05:32,  1.69it/s]

tensor(2.5662, device='cuda:0', grad_fn=<NllLossBackward0>)


570it [05:38,  1.69it/s]

tensor(2.4512, device='cuda:0', grad_fn=<NllLossBackward0>)


580it [05:44,  1.69it/s]

tensor(3.0802, device='cuda:0', grad_fn=<NllLossBackward0>)


590it [05:50,  1.69it/s]

tensor(2.8663, device='cuda:0', grad_fn=<NllLossBackward0>)


600it [05:56,  1.69it/s]

tensor(3.0084, device='cuda:0', grad_fn=<NllLossBackward0>)


610it [06:02,  1.68it/s]

tensor(2.9374, device='cuda:0', grad_fn=<NllLossBackward0>)


620it [06:07,  1.69it/s]

tensor(2.4666, device='cuda:0', grad_fn=<NllLossBackward0>)


630it [06:13,  1.68it/s]

tensor(2.6299, device='cuda:0', grad_fn=<NllLossBackward0>)


640it [06:19,  1.69it/s]

tensor(2.7749, device='cuda:0', grad_fn=<NllLossBackward0>)


650it [06:25,  1.69it/s]

tensor(2.8465, device='cuda:0', grad_fn=<NllLossBackward0>)


660it [06:31,  1.69it/s]

tensor(2.8786, device='cuda:0', grad_fn=<NllLossBackward0>)


670it [06:37,  1.69it/s]

tensor(2.6349, device='cuda:0', grad_fn=<NllLossBackward0>)


680it [06:43,  1.69it/s]

tensor(2.6700, device='cuda:0', grad_fn=<NllLossBackward0>)


690it [06:49,  1.69it/s]

tensor(3.0537, device='cuda:0', grad_fn=<NllLossBackward0>)


700it [06:55,  1.68it/s]

tensor(2.3567, device='cuda:0', grad_fn=<NllLossBackward0>)


710it [07:01,  1.69it/s]

tensor(2.6040, device='cuda:0', grad_fn=<NllLossBackward0>)


720it [07:07,  1.68it/s]

tensor(2.5124, device='cuda:0', grad_fn=<NllLossBackward0>)


730it [07:13,  1.68it/s]

tensor(2.6704, device='cuda:0', grad_fn=<NllLossBackward0>)


740it [07:19,  1.68it/s]

tensor(2.7930, device='cuda:0', grad_fn=<NllLossBackward0>)


750it [07:25,  1.68it/s]

tensor(2.4653, device='cuda:0', grad_fn=<NllLossBackward0>)


760it [07:31,  1.68it/s]

tensor(2.6103, device='cuda:0', grad_fn=<NllLossBackward0>)


770it [07:36,  1.68it/s]

tensor(2.8041, device='cuda:0', grad_fn=<NllLossBackward0>)


780it [07:42,  1.69it/s]

tensor(2.5754, device='cuda:0', grad_fn=<NllLossBackward0>)


790it [07:48,  1.69it/s]

tensor(2.9441, device='cuda:0', grad_fn=<NllLossBackward0>)


800it [07:54,  1.69it/s]

tensor(2.5169, device='cuda:0', grad_fn=<NllLossBackward0>)


810it [08:00,  1.68it/s]

tensor(2.9140, device='cuda:0', grad_fn=<NllLossBackward0>)


820it [08:06,  1.69it/s]

tensor(2.5825, device='cuda:0', grad_fn=<NllLossBackward0>)


830it [08:12,  1.68it/s]

tensor(2.6367, device='cuda:0', grad_fn=<NllLossBackward0>)


840it [08:18,  1.68it/s]

tensor(2.6403, device='cuda:0', grad_fn=<NllLossBackward0>)


850it [08:24,  1.69it/s]

tensor(2.5102, device='cuda:0', grad_fn=<NllLossBackward0>)


860it [08:30,  1.69it/s]

tensor(2.6274, device='cuda:0', grad_fn=<NllLossBackward0>)


870it [08:36,  1.69it/s]

tensor(2.6513, device='cuda:0', grad_fn=<NllLossBackward0>)


880it [08:42,  1.69it/s]

tensor(2.7720, device='cuda:0', grad_fn=<NllLossBackward0>)


890it [08:48,  1.69it/s]

tensor(2.5214, device='cuda:0', grad_fn=<NllLossBackward0>)


900it [08:54,  1.69it/s]

tensor(2.5682, device='cuda:0', grad_fn=<NllLossBackward0>)


910it [09:00,  1.68it/s]

tensor(2.7115, device='cuda:0', grad_fn=<NllLossBackward0>)


920it [09:05,  1.69it/s]

tensor(2.7290, device='cuda:0', grad_fn=<NllLossBackward0>)


930it [09:11,  1.69it/s]

tensor(2.4752, device='cuda:0', grad_fn=<NllLossBackward0>)


940it [09:17,  1.69it/s]

tensor(2.6849, device='cuda:0', grad_fn=<NllLossBackward0>)


950it [09:23,  1.69it/s]

tensor(2.9928, device='cuda:0', grad_fn=<NllLossBackward0>)


960it [09:29,  1.69it/s]

tensor(2.7504, device='cuda:0', grad_fn=<NllLossBackward0>)


970it [09:35,  1.69it/s]

tensor(2.5250, device='cuda:0', grad_fn=<NllLossBackward0>)


980it [09:41,  1.69it/s]

tensor(2.8875, device='cuda:0', grad_fn=<NllLossBackward0>)


990it [09:47,  1.68it/s]

tensor(2.9132, device='cuda:0', grad_fn=<NllLossBackward0>)


1000it [09:53,  1.68it/s]

tensor(2.2674, device='cuda:0', grad_fn=<NllLossBackward0>)


1010it [09:59,  1.69it/s]

tensor(2.5389, device='cuda:0', grad_fn=<NllLossBackward0>)


1020it [10:05,  1.68it/s]

tensor(2.5257, device='cuda:0', grad_fn=<NllLossBackward0>)


1030it [10:11,  1.69it/s]

tensor(2.6232, device='cuda:0', grad_fn=<NllLossBackward0>)


1040it [10:17,  1.69it/s]

tensor(2.6978, device='cuda:0', grad_fn=<NllLossBackward0>)


1050it [10:23,  1.69it/s]

tensor(2.8439, device='cuda:0', grad_fn=<NllLossBackward0>)


1060it [10:29,  1.69it/s]

tensor(2.5918, device='cuda:0', grad_fn=<NllLossBackward0>)


1070it [10:34,  1.69it/s]

tensor(2.4900, device='cuda:0', grad_fn=<NllLossBackward0>)


1080it [10:40,  1.69it/s]

tensor(2.9734, device='cuda:0', grad_fn=<NllLossBackward0>)


1090it [10:46,  1.69it/s]

tensor(2.3404, device='cuda:0', grad_fn=<NllLossBackward0>)


1100it [10:52,  1.68it/s]

tensor(2.5124, device='cuda:0', grad_fn=<NllLossBackward0>)


1110it [10:58,  1.69it/s]

tensor(2.7059, device='cuda:0', grad_fn=<NllLossBackward0>)


1120it [11:04,  1.69it/s]

tensor(3.0887, device='cuda:0', grad_fn=<NllLossBackward0>)


1130it [11:10,  1.69it/s]

tensor(2.6586, device='cuda:0', grad_fn=<NllLossBackward0>)


1140it [11:16,  1.69it/s]

tensor(2.8034, device='cuda:0', grad_fn=<NllLossBackward0>)


1150it [11:22,  1.68it/s]

tensor(2.5635, device='cuda:0', grad_fn=<NllLossBackward0>)


1160it [11:28,  1.68it/s]

tensor(2.8581, device='cuda:0', grad_fn=<NllLossBackward0>)


60it [00:35,  1.68it/s]s]

tensor(3.1410, device='cuda:0', grad_fn=<NllLossBackward0>)


70it [00:41,  1.68it/s]

tensor(3.0714, device='cuda:0', grad_fn=<NllLossBackward0>)


80it [00:47,  1.69it/s]

tensor(2.7323, device='cuda:0', grad_fn=<NllLossBackward0>)


90it [00:53,  1.69it/s]

tensor(2.6227, device='cuda:0', grad_fn=<NllLossBackward0>)


100it [00:59,  1.68it/s]

tensor(2.7126, device='cuda:0', grad_fn=<NllLossBackward0>)


110it [01:05,  1.69it/s]

tensor(2.6201, device='cuda:0', grad_fn=<NllLossBackward0>)


120it [01:11,  1.69it/s]

tensor(2.7373, device='cuda:0', grad_fn=<NllLossBackward0>)


130it [01:17,  1.68it/s]

tensor(2.6602, device='cuda:0', grad_fn=<NllLossBackward0>)


140it [01:23,  1.69it/s]

tensor(2.8936, device='cuda:0', grad_fn=<NllLossBackward0>)


150it [01:29,  1.68it/s]

tensor(2.5836, device='cuda:0', grad_fn=<NllLossBackward0>)


160it [01:34,  1.68it/s]

tensor(3.0958, device='cuda:0', grad_fn=<NllLossBackward0>)


170it [01:40,  1.69it/s]

tensor(2.3381, device='cuda:0', grad_fn=<NllLossBackward0>)


180it [01:46,  1.69it/s]

tensor(2.7358, device='cuda:0', grad_fn=<NllLossBackward0>)


190it [01:52,  1.69it/s]

tensor(2.8961, device='cuda:0', grad_fn=<NllLossBackward0>)


200it [01:58,  1.69it/s]

tensor(2.5765, device='cuda:0', grad_fn=<NllLossBackward0>)


210it [02:04,  1.68it/s]

tensor(2.6512, device='cuda:0', grad_fn=<NllLossBackward0>)


220it [02:10,  1.69it/s]

tensor(2.5718, device='cuda:0', grad_fn=<NllLossBackward0>)


230it [02:16,  1.69it/s]

tensor(2.2254, device='cuda:0', grad_fn=<NllLossBackward0>)


240it [02:22,  1.69it/s]

tensor(2.8067, device='cuda:0', grad_fn=<NllLossBackward0>)


250it [02:28,  1.69it/s]

tensor(2.5908, device='cuda:0', grad_fn=<NllLossBackward0>)


260it [02:34,  1.69it/s]

tensor(2.6741, device='cuda:0', grad_fn=<NllLossBackward0>)


270it [02:40,  1.68it/s]

tensor(2.5439, device='cuda:0', grad_fn=<NllLossBackward0>)


280it [02:46,  1.69it/s]

tensor(2.7004, device='cuda:0', grad_fn=<NllLossBackward0>)


290it [02:52,  1.69it/s]

tensor(2.5990, device='cuda:0', grad_fn=<NllLossBackward0>)


300it [02:57,  1.68it/s]

tensor(2.6489, device='cuda:0', grad_fn=<NllLossBackward0>)


310it [03:03,  1.68it/s]

tensor(2.6287, device='cuda:0', grad_fn=<NllLossBackward0>)


320it [03:09,  1.69it/s]

tensor(2.6984, device='cuda:0', grad_fn=<NllLossBackward0>)


330it [03:15,  1.68it/s]

tensor(2.7401, device='cuda:0', grad_fn=<NllLossBackward0>)


340it [03:21,  1.69it/s]

tensor(2.7522, device='cuda:0', grad_fn=<NllLossBackward0>)


350it [03:27,  1.69it/s]

tensor(2.7895, device='cuda:0', grad_fn=<NllLossBackward0>)


360it [03:33,  1.69it/s]

tensor(2.3993, device='cuda:0', grad_fn=<NllLossBackward0>)


370it [03:39,  1.69it/s]

tensor(2.8694, device='cuda:0', grad_fn=<NllLossBackward0>)


380it [03:45,  1.69it/s]

tensor(2.6030, device='cuda:0', grad_fn=<NllLossBackward0>)


390it [03:51,  1.69it/s]

tensor(2.6845, device='cuda:0', grad_fn=<NllLossBackward0>)


400it [03:57,  1.69it/s]

tensor(2.2765, device='cuda:0', grad_fn=<NllLossBackward0>)


410it [04:03,  1.69it/s]

tensor(3.0171, device='cuda:0', grad_fn=<NllLossBackward0>)


420it [04:09,  1.69it/s]

tensor(2.4325, device='cuda:0', grad_fn=<NllLossBackward0>)


430it [04:15,  1.69it/s]

tensor(2.6781, device='cuda:0', grad_fn=<NllLossBackward0>)


440it [04:21,  1.69it/s]

tensor(2.5918, device='cuda:0', grad_fn=<NllLossBackward0>)


450it [04:27,  1.69it/s]

tensor(2.8289, device='cuda:0', grad_fn=<NllLossBackward0>)


460it [04:32,  1.68it/s]

tensor(2.4513, device='cuda:0', grad_fn=<NllLossBackward0>)


470it [04:38,  1.69it/s]

tensor(2.4489, device='cuda:0', grad_fn=<NllLossBackward0>)


480it [04:44,  1.69it/s]

tensor(2.9007, device='cuda:0', grad_fn=<NllLossBackward0>)


490it [04:50,  1.69it/s]

tensor(2.4480, device='cuda:0', grad_fn=<NllLossBackward0>)


500it [04:56,  1.68it/s]

tensor(2.5896, device='cuda:0', grad_fn=<NllLossBackward0>)


510it [05:02,  1.69it/s]

tensor(2.6347, device='cuda:0', grad_fn=<NllLossBackward0>)


520it [05:08,  1.69it/s]

tensor(2.4886, device='cuda:0', grad_fn=<NllLossBackward0>)


530it [05:14,  1.69it/s]

tensor(2.8498, device='cuda:0', grad_fn=<NllLossBackward0>)


540it [05:20,  1.68it/s]

tensor(2.6232, device='cuda:0', grad_fn=<NllLossBackward0>)


550it [05:26,  1.68it/s]

tensor(2.6524, device='cuda:0', grad_fn=<NllLossBackward0>)


560it [05:32,  1.69it/s]

tensor(2.5778, device='cuda:0', grad_fn=<NllLossBackward0>)


570it [05:38,  1.69it/s]

tensor(2.7272, device='cuda:0', grad_fn=<NllLossBackward0>)


580it [05:44,  1.69it/s]

tensor(2.5936, device='cuda:0', grad_fn=<NllLossBackward0>)


590it [05:50,  1.69it/s]

tensor(2.4347, device='cuda:0', grad_fn=<NllLossBackward0>)


670it [06:37,  1.69it/s]

tensor(2.5011, device='cuda:0', grad_fn=<NllLossBackward0>)


680it [06:43,  1.69it/s]

tensor(3.2117, device='cuda:0', grad_fn=<NllLossBackward0>)


690it [06:49,  1.69it/s]

tensor(2.6063, device='cuda:0', grad_fn=<NllLossBackward0>)


700it [06:55,  1.69it/s]

tensor(2.6851, device='cuda:0', grad_fn=<NllLossBackward0>)


710it [07:01,  1.69it/s]

tensor(2.5377, device='cuda:0', grad_fn=<NllLossBackward0>)


720it [07:07,  1.69it/s]

tensor(2.3717, device='cuda:0', grad_fn=<NllLossBackward0>)


730it [07:13,  1.68it/s]

tensor(2.9548, device='cuda:0', grad_fn=<NllLossBackward0>)


740it [07:19,  1.69it/s]

tensor(2.6127, device='cuda:0', grad_fn=<NllLossBackward0>)


750it [07:24,  1.69it/s]

tensor(2.5386, device='cuda:0', grad_fn=<NllLossBackward0>)


760it [07:30,  1.68it/s]

tensor(2.6218, device='cuda:0', grad_fn=<NllLossBackward0>)


770it [07:36,  1.68it/s]

tensor(2.4487, device='cuda:0', grad_fn=<NllLossBackward0>)


780it [07:42,  1.68it/s]

tensor(2.8793, device='cuda:0', grad_fn=<NllLossBackward0>)


790it [07:48,  1.68it/s]

tensor(2.5043, device='cuda:0', grad_fn=<NllLossBackward0>)


800it [07:54,  1.68it/s]

tensor(2.5383, device='cuda:0', grad_fn=<NllLossBackward0>)


810it [08:00,  1.68it/s]

tensor(2.6301, device='cuda:0', grad_fn=<NllLossBackward0>)


820it [08:06,  1.69it/s]

tensor(2.6033, device='cuda:0', grad_fn=<NllLossBackward0>)


830it [08:12,  1.69it/s]

tensor(2.8807, device='cuda:0', grad_fn=<NllLossBackward0>)


840it [08:18,  1.69it/s]

tensor(3.0328, device='cuda:0', grad_fn=<NllLossBackward0>)


850it [08:24,  1.69it/s]

tensor(2.8363, device='cuda:0', grad_fn=<NllLossBackward0>)


860it [08:30,  1.68it/s]

tensor(2.2769, device='cuda:0', grad_fn=<NllLossBackward0>)


870it [08:36,  1.69it/s]

tensor(2.3880, device='cuda:0', grad_fn=<NllLossBackward0>)


880it [08:42,  1.69it/s]

tensor(2.7439, device='cuda:0', grad_fn=<NllLossBackward0>)


890it [08:48,  1.69it/s]

tensor(2.7701, device='cuda:0', grad_fn=<NllLossBackward0>)


900it [08:54,  1.69it/s]

tensor(2.2996, device='cuda:0', grad_fn=<NllLossBackward0>)


910it [08:59,  1.69it/s]

tensor(2.7656, device='cuda:0', grad_fn=<NllLossBackward0>)


920it [09:05,  1.69it/s]

tensor(2.4774, device='cuda:0', grad_fn=<NllLossBackward0>)


930it [09:11,  1.69it/s]

tensor(2.5323, device='cuda:0', grad_fn=<NllLossBackward0>)


940it [09:17,  1.68it/s]

tensor(2.8580, device='cuda:0', grad_fn=<NllLossBackward0>)


950it [09:23,  1.68it/s]

tensor(2.7108, device='cuda:0', grad_fn=<NllLossBackward0>)


960it [09:29,  1.69it/s]

tensor(3.0361, device='cuda:0', grad_fn=<NllLossBackward0>)


970it [09:35,  1.68it/s]

tensor(2.8241, device='cuda:0', grad_fn=<NllLossBackward0>)


980it [09:41,  1.69it/s]

tensor(2.8785, device='cuda:0', grad_fn=<NllLossBackward0>)


990it [09:47,  1.69it/s]

tensor(2.5125, device='cuda:0', grad_fn=<NllLossBackward0>)


1000it [09:53,  1.68it/s]

tensor(2.4439, device='cuda:0', grad_fn=<NllLossBackward0>)


1010it [09:59,  1.68it/s]

tensor(2.5328, device='cuda:0', grad_fn=<NllLossBackward0>)


1020it [10:05,  1.68it/s]

tensor(3.0562, device='cuda:0', grad_fn=<NllLossBackward0>)


1030it [10:11,  1.69it/s]

tensor(2.7505, device='cuda:0', grad_fn=<NllLossBackward0>)


1040it [10:17,  1.69it/s]

tensor(2.3609, device='cuda:0', grad_fn=<NllLossBackward0>)


1050it [10:23,  1.69it/s]

tensor(2.6860, device='cuda:0', grad_fn=<NllLossBackward0>)


1060it [10:28,  1.69it/s]

tensor(2.5328, device='cuda:0', grad_fn=<NllLossBackward0>)


1070it [10:34,  1.68it/s]

tensor(2.6328, device='cuda:0', grad_fn=<NllLossBackward0>)


1080it [10:40,  1.69it/s]

tensor(2.9010, device='cuda:0', grad_fn=<NllLossBackward0>)


1090it [10:46,  1.69it/s]

tensor(2.7446, device='cuda:0', grad_fn=<NllLossBackward0>)


1100it [10:52,  1.69it/s]

tensor(2.6609, device='cuda:0', grad_fn=<NllLossBackward0>)


1110it [10:58,  1.69it/s]

tensor(2.5592, device='cuda:0', grad_fn=<NllLossBackward0>)


1120it [11:04,  1.69it/s]

tensor(2.5756, device='cuda:0', grad_fn=<NllLossBackward0>)


1130it [11:10,  1.69it/s]

tensor(2.5602, device='cuda:0', grad_fn=<NllLossBackward0>)


1140it [11:16,  1.69it/s]

tensor(2.4277, device='cuda:0', grad_fn=<NllLossBackward0>)


1150it [11:22,  1.69it/s]

tensor(3.0208, device='cuda:0', grad_fn=<NllLossBackward0>)


1160it [11:28,  1.69it/s]

tensor(3.1082, device='cuda:0', grad_fn=<NllLossBackward0>)


1170it [11:34,  1.68it/s]

tensor(2.3364, device='cuda:0', grad_fn=<NllLossBackward0>)


1180it [11:40,  1.68it/s]

tensor(2.9597, device='cuda:0', grad_fn=<NllLossBackward0>)


1190it [11:45,  1.69it/s]
0it [00:00, ?it/s]

tensor(2.3816, device='cuda:0', grad_fn=<NllLossBackward0>)


10it [00:05,  1.68it/s]

tensor(2.8788, device='cuda:0', grad_fn=<NllLossBackward0>)


20it [00:11,  1.69it/s]

tensor(2.6040, device='cuda:0', grad_fn=<NllLossBackward0>)


30it [00:17,  1.69it/s]

tensor(2.6202, device='cuda:0', grad_fn=<NllLossBackward0>)


40it [00:23,  1.69it/s]

tensor(2.2679, device='cuda:0', grad_fn=<NllLossBackward0>)


50it [00:29,  1.69it/s]

tensor(2.8888, device='cuda:0', grad_fn=<NllLossBackward0>)


60it [00:35,  1.68it/s]

tensor(2.6235, device='cuda:0', grad_fn=<NllLossBackward0>)


70it [00:41,  1.68it/s]

tensor(2.7291, device='cuda:0', grad_fn=<NllLossBackward0>)


80it [00:47,  1.69it/s]

tensor(2.2472, device='cuda:0', grad_fn=<NllLossBackward0>)


90it [00:53,  1.69it/s]

tensor(2.5448, device='cuda:0', grad_fn=<NllLossBackward0>)


100it [00:59,  1.69it/s]

tensor(2.8088, device='cuda:0', grad_fn=<NllLossBackward0>)


110it [01:05,  1.69it/s]

tensor(2.4875, device='cuda:0', grad_fn=<NllLossBackward0>)


120it [01:11,  1.69it/s]

tensor(2.3795, device='cuda:0', grad_fn=<NllLossBackward0>)


130it [01:17,  1.68it/s]

tensor(2.1919, device='cuda:0', grad_fn=<NllLossBackward0>)


140it [01:23,  1.69it/s]

tensor(2.5398, device='cuda:0', grad_fn=<NllLossBackward0>)


150it [01:28,  1.69it/s]

tensor(2.7966, device='cuda:0', grad_fn=<NllLossBackward0>)


160it [01:34,  1.69it/s]

tensor(2.4460, device='cuda:0', grad_fn=<NllLossBackward0>)


170it [01:40,  1.69it/s]

tensor(2.6132, device='cuda:0', grad_fn=<NllLossBackward0>)


180it [01:46,  1.68it/s]

tensor(2.2696, device='cuda:0', grad_fn=<NllLossBackward0>)


190it [01:52,  1.69it/s]

tensor(2.6373, device='cuda:0', grad_fn=<NllLossBackward0>)


200it [01:58,  1.69it/s]

tensor(2.6691, device='cuda:0', grad_fn=<NllLossBackward0>)


210it [02:04,  1.68it/s]

tensor(2.8644, device='cuda:0', grad_fn=<NllLossBackward0>)


220it [02:10,  1.69it/s]

tensor(2.7218, device='cuda:0', grad_fn=<NllLossBackward0>)


230it [02:16,  1.68it/s]

tensor(2.8559, device='cuda:0', grad_fn=<NllLossBackward0>)


240it [02:22,  1.69it/s]

tensor(2.4175, device='cuda:0', grad_fn=<NllLossBackward0>)


250it [02:28,  1.68it/s]

tensor(2.5483, device='cuda:0', grad_fn=<NllLossBackward0>)


260it [02:34,  1.69it/s]

tensor(2.5992, device='cuda:0', grad_fn=<NllLossBackward0>)


270it [02:40,  1.69it/s]

tensor(2.3568, device='cuda:0', grad_fn=<NllLossBackward0>)


280it [02:46,  1.69it/s]

tensor(2.9477, device='cuda:0', grad_fn=<NllLossBackward0>)


290it [02:52,  1.69it/s]

tensor(2.2474, device='cuda:0', grad_fn=<NllLossBackward0>)


300it [02:57,  1.68it/s]

tensor(2.5574, device='cuda:0', grad_fn=<NllLossBackward0>)


310it [03:03,  1.68it/s]

tensor(2.6860, device='cuda:0', grad_fn=<NllLossBackward0>)


320it [03:09,  1.69it/s]

tensor(2.3492, device='cuda:0', grad_fn=<NllLossBackward0>)


330it [03:15,  1.69it/s]

tensor(2.8351, device='cuda:0', grad_fn=<NllLossBackward0>)


340it [03:21,  1.69it/s]

tensor(2.3611, device='cuda:0', grad_fn=<NllLossBackward0>)


350it [03:27,  1.69it/s]

tensor(2.7770, device='cuda:0', grad_fn=<NllLossBackward0>)


360it [03:33,  1.68it/s]

tensor(2.8732, device='cuda:0', grad_fn=<NllLossBackward0>)


370it [03:39,  1.69it/s]

tensor(2.8261, device='cuda:0', grad_fn=<NllLossBackward0>)


380it [03:45,  1.69it/s]

tensor(2.4153, device='cuda:0', grad_fn=<NllLossBackward0>)


390it [03:51,  1.69it/s]

tensor(2.8087, device='cuda:0', grad_fn=<NllLossBackward0>)


400it [03:57,  1.69it/s]

tensor(2.3991, device='cuda:0', grad_fn=<NllLossBackward0>)


410it [04:03,  1.69it/s]

tensor(2.5539, device='cuda:0', grad_fn=<NllLossBackward0>)


420it [04:09,  1.69it/s]

tensor(2.5267, device='cuda:0', grad_fn=<NllLossBackward0>)


430it [04:15,  1.69it/s]

tensor(2.4863, device='cuda:0', grad_fn=<NllLossBackward0>)


440it [04:21,  1.68it/s]

tensor(2.8217, device='cuda:0', grad_fn=<NllLossBackward0>)


450it [04:26,  1.69it/s]

tensor(2.4658, device='cuda:0', grad_fn=<NllLossBackward0>)


460it [04:32,  1.68it/s]

tensor(2.8583, device='cuda:0', grad_fn=<NllLossBackward0>)


470it [04:38,  1.69it/s]

tensor(2.5898, device='cuda:0', grad_fn=<NllLossBackward0>)


480it [04:44,  1.69it/s]

tensor(2.9252, device='cuda:0', grad_fn=<NllLossBackward0>)


490it [04:50,  1.68it/s]

tensor(2.2786, device='cuda:0', grad_fn=<NllLossBackward0>)


500it [04:56,  1.69it/s]

tensor(2.6218, device='cuda:0', grad_fn=<NllLossBackward0>)


510it [05:02,  1.68it/s]

tensor(2.6782, device='cuda:0', grad_fn=<NllLossBackward0>)


520it [05:08,  1.68it/s]

tensor(2.4966, device='cuda:0', grad_fn=<NllLossBackward0>)


530it [05:14,  1.69it/s]

tensor(2.5904, device='cuda:0', grad_fn=<NllLossBackward0>)


540it [05:20,  1.69it/s]

tensor(2.3576, device='cuda:0', grad_fn=<NllLossBackward0>)


550it [05:26,  1.69it/s]

tensor(2.4635, device='cuda:0', grad_fn=<NllLossBackward0>)


560it [05:32,  1.69it/s]

tensor(2.2457, device='cuda:0', grad_fn=<NllLossBackward0>)


570it [05:38,  1.69it/s]

tensor(2.6581, device='cuda:0', grad_fn=<NllLossBackward0>)


580it [05:44,  1.68it/s]

tensor(2.4520, device='cuda:0', grad_fn=<NllLossBackward0>)


590it [05:49,  1.68it/s]

tensor(2.8602, device='cuda:0', grad_fn=<NllLossBackward0>)


600it [05:55,  1.68it/s]

tensor(2.4144, device='cuda:0', grad_fn=<NllLossBackward0>)


610it [06:01,  1.69it/s]

tensor(2.5231, device='cuda:0', grad_fn=<NllLossBackward0>)


620it [06:07,  1.68it/s]

tensor(2.9014, device='cuda:0', grad_fn=<NllLossBackward0>)


630it [06:13,  1.69it/s]

tensor(2.7068, device='cuda:0', grad_fn=<NllLossBackward0>)


640it [06:19,  1.69it/s]

tensor(2.4322, device='cuda:0', grad_fn=<NllLossBackward0>)


650it [06:25,  1.68it/s]

tensor(2.5247, device='cuda:0', grad_fn=<NllLossBackward0>)


660it [06:31,  1.68it/s]

tensor(2.8087, device='cuda:0', grad_fn=<NllLossBackward0>)


670it [06:37,  1.69it/s]

tensor(2.7167, device='cuda:0', grad_fn=<NllLossBackward0>)


680it [06:43,  1.68it/s]

tensor(2.6019, device='cuda:0', grad_fn=<NllLossBackward0>)


690it [06:49,  1.69it/s]

tensor(2.8574, device='cuda:0', grad_fn=<NllLossBackward0>)


700it [06:55,  1.69it/s]

tensor(2.8083, device='cuda:0', grad_fn=<NllLossBackward0>)


710it [07:01,  1.68it/s]

tensor(2.5917, device='cuda:0', grad_fn=<NllLossBackward0>)


720it [07:07,  1.69it/s]

tensor(2.7473, device='cuda:0', grad_fn=<NllLossBackward0>)


730it [07:13,  1.68it/s]

tensor(3.0118, device='cuda:0', grad_fn=<NllLossBackward0>)


740it [07:19,  1.69it/s]

tensor(3.0023, device='cuda:0', grad_fn=<NllLossBackward0>)


750it [07:24,  1.68it/s]

tensor(2.5790, device='cuda:0', grad_fn=<NllLossBackward0>)


760it [07:30,  1.69it/s]

tensor(2.7313, device='cuda:0', grad_fn=<NllLossBackward0>)


770it [07:36,  1.69it/s]

tensor(2.4379, device='cuda:0', grad_fn=<NllLossBackward0>)


780it [07:42,  1.69it/s]

tensor(2.5693, device='cuda:0', grad_fn=<NllLossBackward0>)


790it [07:48,  1.69it/s]

tensor(2.3715, device='cuda:0', grad_fn=<NllLossBackward0>)


800it [07:54,  1.68it/s]

tensor(2.7259, device='cuda:0', grad_fn=<NllLossBackward0>)


810it [08:00,  1.68it/s]

tensor(2.6066, device='cuda:0', grad_fn=<NllLossBackward0>)


820it [08:06,  1.69it/s]

tensor(2.6987, device='cuda:0', grad_fn=<NllLossBackward0>)


830it [08:12,  1.69it/s]

tensor(2.6559, device='cuda:0', grad_fn=<NllLossBackward0>)


840it [08:18,  1.69it/s]

tensor(2.3871, device='cuda:0', grad_fn=<NllLossBackward0>)


850it [08:24,  1.69it/s]

tensor(2.9669, device='cuda:0', grad_fn=<NllLossBackward0>)


860it [08:30,  1.69it/s]

tensor(2.6324, device='cuda:0', grad_fn=<NllLossBackward0>)


870it [08:36,  1.69it/s]

tensor(2.4330, device='cuda:0', grad_fn=<NllLossBackward0>)


880it [08:42,  1.69it/s]

tensor(2.3663, device='cuda:0', grad_fn=<NllLossBackward0>)


890it [08:47,  1.68it/s]

tensor(2.6465, device='cuda:0', grad_fn=<NllLossBackward0>)


900it [08:53,  1.69it/s]

tensor(2.4440, device='cuda:0', grad_fn=<NllLossBackward0>)


910it [08:59,  1.69it/s]

tensor(2.5467, device='cuda:0', grad_fn=<NllLossBackward0>)


920it [09:05,  1.69it/s]

tensor(2.3447, device='cuda:0', grad_fn=<NllLossBackward0>)


930it [09:11,  1.69it/s]

tensor(2.5132, device='cuda:0', grad_fn=<NllLossBackward0>)


940it [09:17,  1.69it/s]

tensor(2.6208, device='cuda:0', grad_fn=<NllLossBackward0>)


950it [09:23,  1.68it/s]

tensor(2.3410, device='cuda:0', grad_fn=<NllLossBackward0>)


960it [09:29,  1.69it/s]

tensor(2.4321, device='cuda:0', grad_fn=<NllLossBackward0>)


970it [09:35,  1.69it/s]

tensor(2.5734, device='cuda:0', grad_fn=<NllLossBackward0>)


980it [09:41,  1.68it/s]

tensor(2.2458, device='cuda:0', grad_fn=<NllLossBackward0>)


990it [09:47,  1.69it/s]

tensor(2.4278, device='cuda:0', grad_fn=<NllLossBackward0>)


1000it [09:53,  1.69it/s]

tensor(2.2091, device='cuda:0', grad_fn=<NllLossBackward0>)


1010it [09:59,  1.68it/s]

tensor(2.6086, device='cuda:0', grad_fn=<NllLossBackward0>)


1020it [10:05,  1.69it/s]

tensor(2.8273, device='cuda:0', grad_fn=<NllLossBackward0>)


1030it [10:11,  1.68it/s]

tensor(2.7205, device='cuda:0', grad_fn=<NllLossBackward0>)


1040it [10:17,  1.69it/s]

tensor(2.7797, device='cuda:0', grad_fn=<NllLossBackward0>)


1050it [10:22,  1.69it/s]

tensor(2.4479, device='cuda:0', grad_fn=<NllLossBackward0>)


1060it [10:28,  1.69it/s]

tensor(2.4387, device='cuda:0', grad_fn=<NllLossBackward0>)


1070it [10:34,  1.68it/s]

tensor(2.7719, device='cuda:0', grad_fn=<NllLossBackward0>)


1080it [10:40,  1.69it/s]

tensor(2.7833, device='cuda:0', grad_fn=<NllLossBackward0>)


1090it [10:46,  1.68it/s]

tensor(2.9749, device='cuda:0', grad_fn=<NllLossBackward0>)


1100it [10:52,  1.69it/s]

tensor(2.5146, device='cuda:0', grad_fn=<NllLossBackward0>)


1110it [10:58,  1.68it/s]

tensor(2.6187, device='cuda:0', grad_fn=<NllLossBackward0>)


1120it [11:04,  1.69it/s]

tensor(2.7761, device='cuda:0', grad_fn=<NllLossBackward0>)


1130it [11:10,  1.69it/s]

tensor(2.0989, device='cuda:0', grad_fn=<NllLossBackward0>)


1140it [11:16,  1.68it/s]

tensor(2.7433, device='cuda:0', grad_fn=<NllLossBackward0>)


1150it [11:22,  1.69it/s]

tensor(2.7966, device='cuda:0', grad_fn=<NllLossBackward0>)


1160it [11:28,  1.69it/s]

tensor(2.6286, device='cuda:0', grad_fn=<NllLossBackward0>)


1170it [11:34,  1.69it/s]

tensor(2.2980, device='cuda:0', grad_fn=<NllLossBackward0>)


1180it [11:40,  1.69it/s]

tensor(2.4086, device='cuda:0', grad_fn=<NllLossBackward0>)


1190it [11:45,  1.69it/s]
0it [00:00, ?it/s]

tensor(2.4130, device='cuda:0', grad_fn=<NllLossBackward0>)


10it [00:05,  1.69it/s]

tensor(2.6527, device='cuda:0', grad_fn=<NllLossBackward0>)


20it [00:11,  1.69it/s]

tensor(2.4700, device='cuda:0', grad_fn=<NllLossBackward0>)


30it [00:17,  1.69it/s]

tensor(2.7933, device='cuda:0', grad_fn=<NllLossBackward0>)


40it [00:23,  1.69it/s]

tensor(2.5484, device='cuda:0', grad_fn=<NllLossBackward0>)


50it [00:29,  1.69it/s]

tensor(2.7817, device='cuda:0', grad_fn=<NllLossBackward0>)


60it [00:35,  1.69it/s]

tensor(2.4260, device='cuda:0', grad_fn=<NllLossBackward0>)


70it [00:41,  1.69it/s]

tensor(2.5235, device='cuda:0', grad_fn=<NllLossBackward0>)


80it [00:47,  1.69it/s]

tensor(2.6351, device='cuda:0', grad_fn=<NllLossBackward0>)


90it [00:53,  1.69it/s]

tensor(2.5102, device='cuda:0', grad_fn=<NllLossBackward0>)


100it [00:59,  1.69it/s]

tensor(2.4812, device='cuda:0', grad_fn=<NllLossBackward0>)


110it [01:05,  1.69it/s]

tensor(2.4343, device='cuda:0', grad_fn=<NllLossBackward0>)


120it [01:11,  1.68it/s]

tensor(2.4067, device='cuda:0', grad_fn=<NllLossBackward0>)


130it [01:17,  1.69it/s]

tensor(2.6807, device='cuda:0', grad_fn=<NllLossBackward0>)


140it [01:23,  1.69it/s]

tensor(2.6803, device='cuda:0', grad_fn=<NllLossBackward0>)


150it [01:28,  1.69it/s]

tensor(2.5453, device='cuda:0', grad_fn=<NllLossBackward0>)


160it [01:34,  1.69it/s]

tensor(2.6114, device='cuda:0', grad_fn=<NllLossBackward0>)


170it [01:40,  1.69it/s]

tensor(2.3046, device='cuda:0', grad_fn=<NllLossBackward0>)


180it [01:46,  1.69it/s]

tensor(2.3916, device='cuda:0', grad_fn=<NllLossBackward0>)


190it [01:52,  1.69it/s]

tensor(2.5050, device='cuda:0', grad_fn=<NllLossBackward0>)


200it [01:58,  1.69it/s]

tensor(2.6493, device='cuda:0', grad_fn=<NllLossBackward0>)


210it [02:04,  1.69it/s]

tensor(2.3867, device='cuda:0', grad_fn=<NllLossBackward0>)


220it [02:10,  1.68it/s]

tensor(1.9589, device='cuda:0', grad_fn=<NllLossBackward0>)


230it [02:16,  1.69it/s]

tensor(2.8903, device='cuda:0', grad_fn=<NllLossBackward0>)


240it [02:22,  1.69it/s]

tensor(2.5982, device='cuda:0', grad_fn=<NllLossBackward0>)


250it [02:28,  1.68it/s]

tensor(2.5315, device='cuda:0', grad_fn=<NllLossBackward0>)


260it [02:34,  1.68it/s]

tensor(2.3228, device='cuda:0', grad_fn=<NllLossBackward0>)


270it [02:40,  1.69it/s]

tensor(2.5287, device='cuda:0', grad_fn=<NllLossBackward0>)


280it [02:46,  1.69it/s]

tensor(2.6734, device='cuda:0', grad_fn=<NllLossBackward0>)


290it [02:52,  1.68it/s]

tensor(2.6068, device='cuda:0', grad_fn=<NllLossBackward0>)


300it [02:57,  1.69it/s]

tensor(2.7238, device='cuda:0', grad_fn=<NllLossBackward0>)


310it [03:03,  1.69it/s]

tensor(2.7872, device='cuda:0', grad_fn=<NllLossBackward0>)


320it [03:09,  1.69it/s]

tensor(2.7031, device='cuda:0', grad_fn=<NllLossBackward0>)


330it [03:15,  1.69it/s]

tensor(2.9544, device='cuda:0', grad_fn=<NllLossBackward0>)


340it [03:21,  1.69it/s]

tensor(2.8420, device='cuda:0', grad_fn=<NllLossBackward0>)


350it [03:27,  1.68it/s]

tensor(2.2688, device='cuda:0', grad_fn=<NllLossBackward0>)


360it [03:33,  1.68it/s]

tensor(2.1943, device='cuda:0', grad_fn=<NllLossBackward0>)


370it [03:39,  1.68it/s]

tensor(2.8539, device='cuda:0', grad_fn=<NllLossBackward0>)


380it [03:45,  1.68it/s]

tensor(2.4562, device='cuda:0', grad_fn=<NllLossBackward0>)


390it [03:51,  1.68it/s]

tensor(2.4117, device='cuda:0', grad_fn=<NllLossBackward0>)


400it [03:57,  1.69it/s]

tensor(2.5242, device='cuda:0', grad_fn=<NllLossBackward0>)


410it [04:03,  1.69it/s]

tensor(2.5555, device='cuda:0', grad_fn=<NllLossBackward0>)


420it [04:09,  1.68it/s]

tensor(2.5182, device='cuda:0', grad_fn=<NllLossBackward0>)


430it [04:15,  1.69it/s]

tensor(2.7859, device='cuda:0', grad_fn=<NllLossBackward0>)


440it [04:21,  1.69it/s]

tensor(2.4456, device='cuda:0', grad_fn=<NllLossBackward0>)


450it [04:26,  1.69it/s]

tensor(2.5723, device='cuda:0', grad_fn=<NllLossBackward0>)


460it [04:32,  1.69it/s]

tensor(2.5501, device='cuda:0', grad_fn=<NllLossBackward0>)


470it [04:38,  1.69it/s]

tensor(2.5984, device='cuda:0', grad_fn=<NllLossBackward0>)


480it [04:44,  1.69it/s]

tensor(2.4031, device='cuda:0', grad_fn=<NllLossBackward0>)


490it [04:50,  1.69it/s]

tensor(2.7827, device='cuda:0', grad_fn=<NllLossBackward0>)


500it [04:56,  1.69it/s]

tensor(2.9071, device='cuda:0', grad_fn=<NllLossBackward0>)


510it [05:02,  1.69it/s]

tensor(2.1650, device='cuda:0', grad_fn=<NllLossBackward0>)


520it [05:08,  1.69it/s]

tensor(2.5752, device='cuda:0', grad_fn=<NllLossBackward0>)


530it [05:14,  1.68it/s]

tensor(2.7137, device='cuda:0', grad_fn=<NllLossBackward0>)


540it [05:20,  1.69it/s]

tensor(2.3663, device='cuda:0', grad_fn=<NllLossBackward0>)


550it [05:26,  1.69it/s]

tensor(2.6700, device='cuda:0', grad_fn=<NllLossBackward0>)


560it [05:32,  1.69it/s]

tensor(2.6256, device='cuda:0', grad_fn=<NllLossBackward0>)


570it [05:38,  1.69it/s]

tensor(2.2683, device='cuda:0', grad_fn=<NllLossBackward0>)


580it [05:44,  1.69it/s]

tensor(3.2167, device='cuda:0', grad_fn=<NllLossBackward0>)


590it [05:50,  1.69it/s]

tensor(2.1534, device='cuda:0', grad_fn=<NllLossBackward0>)


600it [05:55,  1.69it/s]

tensor(2.8042, device='cuda:0', grad_fn=<NllLossBackward0>)


610it [06:01,  1.69it/s]

tensor(2.4112, device='cuda:0', grad_fn=<NllLossBackward0>)


620it [06:07,  1.68it/s]

tensor(2.6897, device='cuda:0', grad_fn=<NllLossBackward0>)


630it [06:13,  1.69it/s]

tensor(2.7393, device='cuda:0', grad_fn=<NllLossBackward0>)


640it [06:19,  1.69it/s]

tensor(2.5897, device='cuda:0', grad_fn=<NllLossBackward0>)


650it [06:25,  1.69it/s]

tensor(2.6903, device='cuda:0', grad_fn=<NllLossBackward0>)


660it [06:31,  1.69it/s]

tensor(2.2584, device='cuda:0', grad_fn=<NllLossBackward0>)


670it [06:37,  1.69it/s]

tensor(2.6761, device='cuda:0', grad_fn=<NllLossBackward0>)


680it [06:43,  1.68it/s]

tensor(2.7350, device='cuda:0', grad_fn=<NllLossBackward0>)


690it [06:49,  1.69it/s]

tensor(2.3979, device='cuda:0', grad_fn=<NllLossBackward0>)


700it [06:55,  1.69it/s]

tensor(2.3769, device='cuda:0', grad_fn=<NllLossBackward0>)


710it [07:01,  1.69it/s]

tensor(2.5402, device='cuda:0', grad_fn=<NllLossBackward0>)


720it [07:07,  1.69it/s]

tensor(2.4571, device='cuda:0', grad_fn=<NllLossBackward0>)


730it [07:13,  1.69it/s]

tensor(2.7505, device='cuda:0', grad_fn=<NllLossBackward0>)


740it [07:18,  1.69it/s]

tensor(2.6235, device='cuda:0', grad_fn=<NllLossBackward0>)


750it [07:24,  1.69it/s]

tensor(2.5632, device='cuda:0', grad_fn=<NllLossBackward0>)


760it [07:30,  1.69it/s]

tensor(2.1696, device='cuda:0', grad_fn=<NllLossBackward0>)


770it [07:36,  1.69it/s]

tensor(2.4908, device='cuda:0', grad_fn=<NllLossBackward0>)


780it [07:42,  1.69it/s]

tensor(2.3987, device='cuda:0', grad_fn=<NllLossBackward0>)


790it [07:48,  1.68it/s]

tensor(2.1313, device='cuda:0', grad_fn=<NllLossBackward0>)


800it [07:54,  1.69it/s]

tensor(2.5816, device='cuda:0', grad_fn=<NllLossBackward0>)


810it [08:00,  1.69it/s]

tensor(2.2918, device='cuda:0', grad_fn=<NllLossBackward0>)


820it [08:06,  1.68it/s]

tensor(2.5683, device='cuda:0', grad_fn=<NllLossBackward0>)


830it [08:12,  1.69it/s]

tensor(2.5074, device='cuda:0', grad_fn=<NllLossBackward0>)


840it [08:18,  1.69it/s]

tensor(2.8535, device='cuda:0', grad_fn=<NllLossBackward0>)


850it [08:24,  1.69it/s]

tensor(2.7968, device='cuda:0', grad_fn=<NllLossBackward0>)


860it [08:30,  1.68it/s]

tensor(2.2929, device='cuda:0', grad_fn=<NllLossBackward0>)


870it [08:36,  1.69it/s]

tensor(2.3972, device='cuda:0', grad_fn=<NllLossBackward0>)


880it [08:42,  1.69it/s]

tensor(2.5981, device='cuda:0', grad_fn=<NllLossBackward0>)


890it [08:47,  1.69it/s]

tensor(2.5381, device='cuda:0', grad_fn=<NllLossBackward0>)


900it [08:53,  1.69it/s]

tensor(2.8598, device='cuda:0', grad_fn=<NllLossBackward0>)


910it [08:59,  1.69it/s]

tensor(2.0876, device='cuda:0', grad_fn=<NllLossBackward0>)


920it [09:05,  1.69it/s]

tensor(2.3560, device='cuda:0', grad_fn=<NllLossBackward0>)


930it [09:11,  1.69it/s]

tensor(2.7000, device='cuda:0', grad_fn=<NllLossBackward0>)


940it [09:17,  1.69it/s]

tensor(2.5297, device='cuda:0', grad_fn=<NllLossBackward0>)


950it [09:23,  1.69it/s]

tensor(2.3567, device='cuda:0', grad_fn=<NllLossBackward0>)


960it [09:29,  1.69it/s]

tensor(2.8937, device='cuda:0', grad_fn=<NllLossBackward0>)


970it [09:35,  1.69it/s]

tensor(2.6703, device='cuda:0', grad_fn=<NllLossBackward0>)


980it [09:41,  1.68it/s]

tensor(2.6007, device='cuda:0', grad_fn=<NllLossBackward0>)


990it [09:47,  1.69it/s]

tensor(2.3960, device='cuda:0', grad_fn=<NllLossBackward0>)


1000it [09:53,  1.69it/s]

tensor(2.3094, device='cuda:0', grad_fn=<NllLossBackward0>)


1010it [09:59,  1.69it/s]

tensor(2.6694, device='cuda:0', grad_fn=<NllLossBackward0>)


1020it [10:05,  1.69it/s]

tensor(2.4924, device='cuda:0', grad_fn=<NllLossBackward0>)


1030it [10:10,  1.69it/s]

tensor(2.4099, device='cuda:0', grad_fn=<NllLossBackward0>)


1040it [10:16,  1.68it/s]

tensor(2.1581, device='cuda:0', grad_fn=<NllLossBackward0>)


1050it [10:22,  1.69it/s]

tensor(2.6561, device='cuda:0', grad_fn=<NllLossBackward0>)


1060it [10:28,  1.69it/s]

tensor(2.4331, device='cuda:0', grad_fn=<NllLossBackward0>)


1070it [10:34,  1.69it/s]

tensor(2.9140, device='cuda:0', grad_fn=<NllLossBackward0>)


1080it [10:40,  1.69it/s]

tensor(2.6253, device='cuda:0', grad_fn=<NllLossBackward0>)


1090it [10:46,  1.69it/s]

tensor(2.2128, device='cuda:0', grad_fn=<NllLossBackward0>)


1100it [10:52,  1.68it/s]

tensor(2.6043, device='cuda:0', grad_fn=<NllLossBackward0>)


1110it [10:58,  1.68it/s]

tensor(2.4072, device='cuda:0', grad_fn=<NllLossBackward0>)


1120it [11:04,  1.69it/s]

tensor(2.3314, device='cuda:0', grad_fn=<NllLossBackward0>)


1130it [11:10,  1.69it/s]

tensor(2.3655, device='cuda:0', grad_fn=<NllLossBackward0>)


1140it [11:16,  1.69it/s]

tensor(2.6283, device='cuda:0', grad_fn=<NllLossBackward0>)


1150it [11:22,  1.69it/s]

tensor(2.7361, device='cuda:0', grad_fn=<NllLossBackward0>)


1160it [11:28,  1.69it/s]

tensor(2.5434, device='cuda:0', grad_fn=<NllLossBackward0>)


1170it [11:34,  1.69it/s]

tensor(2.3347, device='cuda:0', grad_fn=<NllLossBackward0>)


1180it [11:39,  1.69it/s]

tensor(2.4628, device='cuda:0', grad_fn=<NllLossBackward0>)


1190it [11:45,  1.69it/s]
0it [00:00, ?it/s]

tensor(2.4082, device='cuda:0', grad_fn=<NllLossBackward0>)


10it [00:05,  1.69it/s]

tensor(2.1849, device='cuda:0', grad_fn=<NllLossBackward0>)


20it [00:11,  1.69it/s]

tensor(2.6480, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.4253, device='cuda:0', grad_fn=<NllLossBackward0>)


90it [00:53,  1.69it/s]

tensor(2.1225, device='cuda:0', grad_fn=<NllLossBackward0>)


100it [00:59,  1.69it/s]

tensor(2.0844, device='cuda:0', grad_fn=<NllLossBackward0>)


110it [01:05,  1.69it/s]

tensor(2.4111, device='cuda:0', grad_fn=<NllLossBackward0>)


120it [01:11,  1.69it/s]

tensor(2.5311, device='cuda:0', grad_fn=<NllLossBackward0>)


130it [01:17,  1.69it/s]

tensor(2.6468, device='cuda:0', grad_fn=<NllLossBackward0>)


140it [01:23,  1.69it/s]

tensor(2.4948, device='cuda:0', grad_fn=<NllLossBackward0>)


150it [01:28,  1.69it/s]

tensor(2.2846, device='cuda:0', grad_fn=<NllLossBackward0>)


160it [01:34,  1.69it/s]

tensor(2.6327, device='cuda:0', grad_fn=<NllLossBackward0>)


170it [01:40,  1.69it/s]

tensor(2.5360, device='cuda:0', grad_fn=<NllLossBackward0>)


180it [01:46,  1.69it/s]

tensor(2.5614, device='cuda:0', grad_fn=<NllLossBackward0>)


190it [01:52,  1.69it/s]

tensor(2.3040, device='cuda:0', grad_fn=<NllLossBackward0>)


200it [01:58,  1.69it/s]

tensor(2.4316, device='cuda:0', grad_fn=<NllLossBackward0>)


210it [02:04,  1.69it/s]

tensor(2.5655, device='cuda:0', grad_fn=<NllLossBackward0>)


220it [02:10,  1.69it/s]

tensor(2.6763, device='cuda:0', grad_fn=<NllLossBackward0>)


230it [02:16,  1.69it/s]

tensor(2.5459, device='cuda:0', grad_fn=<NllLossBackward0>)


240it [02:22,  1.68it/s]

tensor(2.3131, device='cuda:0', grad_fn=<NllLossBackward0>)


250it [02:28,  1.69it/s]

tensor(2.6054, device='cuda:0', grad_fn=<NllLossBackward0>)


260it [02:34,  1.69it/s]

tensor(2.1861, device='cuda:0', grad_fn=<NllLossBackward0>)


270it [02:40,  1.69it/s]

tensor(2.4843, device='cuda:0', grad_fn=<NllLossBackward0>)


280it [02:46,  1.69it/s]

tensor(2.6678, device='cuda:0', grad_fn=<NllLossBackward0>)


290it [02:51,  1.69it/s]

tensor(2.2819, device='cuda:0', grad_fn=<NllLossBackward0>)


300it [02:57,  1.69it/s]

tensor(2.8835, device='cuda:0', grad_fn=<NllLossBackward0>)


310it [03:03,  1.69it/s]

tensor(2.5305, device='cuda:0', grad_fn=<NllLossBackward0>)


320it [03:09,  1.69it/s]

tensor(2.5518, device='cuda:0', grad_fn=<NllLossBackward0>)


330it [03:15,  1.68it/s]

tensor(2.1377, device='cuda:0', grad_fn=<NllLossBackward0>)


340it [03:21,  1.69it/s]

tensor(2.3756, device='cuda:0', grad_fn=<NllLossBackward0>)


350it [03:27,  1.69it/s]

tensor(2.3410, device='cuda:0', grad_fn=<NllLossBackward0>)


360it [03:33,  1.68it/s]

tensor(2.1830, device='cuda:0', grad_fn=<NllLossBackward0>)


370it [03:39,  1.69it/s]

tensor(2.3162, device='cuda:0', grad_fn=<NllLossBackward0>)


380it [03:45,  1.69it/s]

tensor(2.4267, device='cuda:0', grad_fn=<NllLossBackward0>)


390it [03:51,  1.69it/s]

tensor(2.2113, device='cuda:0', grad_fn=<NllLossBackward0>)


400it [03:57,  1.68it/s]

tensor(2.5679, device='cuda:0', grad_fn=<NllLossBackward0>)


410it [04:03,  1.69it/s]

tensor(2.9129, device='cuda:0', grad_fn=<NllLossBackward0>)


420it [04:09,  1.69it/s]

tensor(2.4407, device='cuda:0', grad_fn=<NllLossBackward0>)


430it [04:15,  1.69it/s]

tensor(2.9153, device='cuda:0', grad_fn=<NllLossBackward0>)


440it [04:20,  1.69it/s]

tensor(2.2037, device='cuda:0', grad_fn=<NllLossBackward0>)


450it [04:26,  1.69it/s]

tensor(2.5020, device='cuda:0', grad_fn=<NllLossBackward0>)


460it [04:32,  1.68it/s]

tensor(2.5697, device='cuda:0', grad_fn=<NllLossBackward0>)


470it [04:38,  1.69it/s]

tensor(2.2643, device='cuda:0', grad_fn=<NllLossBackward0>)


480it [04:44,  1.69it/s]

tensor(2.5415, device='cuda:0', grad_fn=<NllLossBackward0>)


490it [04:50,  1.69it/s]

tensor(2.7163, device='cuda:0', grad_fn=<NllLossBackward0>)


500it [04:56,  1.69it/s]

tensor(2.7883, device='cuda:0', grad_fn=<NllLossBackward0>)


510it [05:02,  1.69it/s]

tensor(2.5297, device='cuda:0', grad_fn=<NllLossBackward0>)


520it [05:08,  1.69it/s]

tensor(2.4413, device='cuda:0', grad_fn=<NllLossBackward0>)


530it [05:14,  1.69it/s]

tensor(2.4541, device='cuda:0', grad_fn=<NllLossBackward0>)


540it [05:20,  1.69it/s]

tensor(2.6953, device='cuda:0', grad_fn=<NllLossBackward0>)


550it [05:26,  1.68it/s]

tensor(2.6634, device='cuda:0', grad_fn=<NllLossBackward0>)


560it [05:32,  1.69it/s]

tensor(2.3564, device='cuda:0', grad_fn=<NllLossBackward0>)


570it [05:38,  1.68it/s]

tensor(2.6726, device='cuda:0', grad_fn=<NllLossBackward0>)


580it [05:44,  1.69it/s]

tensor(2.5221, device='cuda:0', grad_fn=<NllLossBackward0>)


590it [05:49,  1.69it/s]

tensor(2.6988, device='cuda:0', grad_fn=<NllLossBackward0>)


600it [05:55,  1.69it/s]

tensor(2.4936, device='cuda:0', grad_fn=<NllLossBackward0>)


610it [06:01,  1.69it/s]

tensor(2.7635, device='cuda:0', grad_fn=<NllLossBackward0>)


620it [06:07,  1.68it/s]

tensor(2.4787, device='cuda:0', grad_fn=<NllLossBackward0>)


630it [06:13,  1.69it/s]

tensor(3.1435, device='cuda:0', grad_fn=<NllLossBackward0>)


640it [06:19,  1.69it/s]

tensor(2.6245, device='cuda:0', grad_fn=<NllLossBackward0>)


650it [06:25,  1.69it/s]

tensor(2.4555, device='cuda:0', grad_fn=<NllLossBackward0>)


660it [06:31,  1.68it/s]

tensor(2.5117, device='cuda:0', grad_fn=<NllLossBackward0>)


670it [06:37,  1.69it/s]

tensor(2.2949, device='cuda:0', grad_fn=<NllLossBackward0>)


680it [06:43,  1.69it/s]

tensor(2.6405, device='cuda:0', grad_fn=<NllLossBackward0>)


690it [06:49,  1.69it/s]

tensor(2.2343, device='cuda:0', grad_fn=<NllLossBackward0>)


700it [06:55,  1.69it/s]

tensor(2.2855, device='cuda:0', grad_fn=<NllLossBackward0>)


710it [07:01,  1.69it/s]

tensor(2.3989, device='cuda:0', grad_fn=<NllLossBackward0>)


720it [07:07,  1.69it/s]

tensor(2.4342, device='cuda:0', grad_fn=<NllLossBackward0>)


730it [07:13,  1.69it/s]

tensor(2.8580, device='cuda:0', grad_fn=<NllLossBackward0>)


740it [07:18,  1.69it/s]

tensor(2.3696, device='cuda:0', grad_fn=<NllLossBackward0>)


750it [07:24,  1.68it/s]

tensor(2.4150, device='cuda:0', grad_fn=<NllLossBackward0>)


760it [07:30,  1.69it/s]

tensor(2.7913, device='cuda:0', grad_fn=<NllLossBackward0>)


770it [07:36,  1.69it/s]

tensor(2.3374, device='cuda:0', grad_fn=<NllLossBackward0>)


780it [07:42,  1.69it/s]

tensor(2.5591, device='cuda:0', grad_fn=<NllLossBackward0>)


790it [07:48,  1.69it/s]

tensor(2.9418, device='cuda:0', grad_fn=<NllLossBackward0>)


800it [07:54,  1.69it/s]

tensor(2.2890, device='cuda:0', grad_fn=<NllLossBackward0>)


810it [08:00,  1.69it/s]

tensor(2.6245, device='cuda:0', grad_fn=<NllLossBackward0>)


820it [08:06,  1.69it/s]

tensor(2.8530, device='cuda:0', grad_fn=<NllLossBackward0>)


830it [08:12,  1.69it/s]

tensor(2.7326, device='cuda:0', grad_fn=<NllLossBackward0>)


840it [08:18,  1.69it/s]

tensor(3.0187, device='cuda:0', grad_fn=<NllLossBackward0>)


850it [08:24,  1.69it/s]

tensor(2.4209, device='cuda:0', grad_fn=<NllLossBackward0>)


860it [08:30,  1.69it/s]

tensor(2.4265, device='cuda:0', grad_fn=<NllLossBackward0>)


870it [08:36,  1.69it/s]

tensor(2.6250, device='cuda:0', grad_fn=<NllLossBackward0>)


880it [08:42,  1.68it/s]

tensor(2.3595, device='cuda:0', grad_fn=<NllLossBackward0>)


890it [08:47,  1.68it/s]

tensor(2.5341, device='cuda:0', grad_fn=<NllLossBackward0>)


900it [08:53,  1.69it/s]

tensor(2.7720, device='cuda:0', grad_fn=<NllLossBackward0>)


910it [08:59,  1.69it/s]

tensor(2.6152, device='cuda:0', grad_fn=<NllLossBackward0>)


920it [09:05,  1.69it/s]

tensor(2.5217, device='cuda:0', grad_fn=<NllLossBackward0>)


930it [09:11,  1.69it/s]

tensor(2.2357, device='cuda:0', grad_fn=<NllLossBackward0>)


940it [09:17,  1.69it/s]

tensor(2.2347, device='cuda:0', grad_fn=<NllLossBackward0>)


950it [09:23,  1.69it/s]

tensor(2.4036, device='cuda:0', grad_fn=<NllLossBackward0>)


960it [09:29,  1.69it/s]

tensor(2.3635, device='cuda:0', grad_fn=<NllLossBackward0>)


970it [09:35,  1.69it/s]

tensor(2.2296, device='cuda:0', grad_fn=<NllLossBackward0>)


980it [09:41,  1.69it/s]

tensor(2.5418, device='cuda:0', grad_fn=<NllLossBackward0>)


990it [09:47,  1.69it/s]

tensor(2.7222, device='cuda:0', grad_fn=<NllLossBackward0>)


1000it [09:53,  1.69it/s]

tensor(2.2840, device='cuda:0', grad_fn=<NllLossBackward0>)


1010it [09:59,  1.68it/s]

tensor(2.6124, device='cuda:0', grad_fn=<NllLossBackward0>)


1020it [10:05,  1.69it/s]

tensor(2.2018, device='cuda:0', grad_fn=<NllLossBackward0>)


1030it [10:10,  1.68it/s]

tensor(2.9012, device='cuda:0', grad_fn=<NllLossBackward0>)


1040it [10:16,  1.68it/s]

tensor(2.7485, device='cuda:0', grad_fn=<NllLossBackward0>)


1050it [10:22,  1.68it/s]

tensor(2.7313, device='cuda:0', grad_fn=<NllLossBackward0>)


1060it [10:28,  1.68it/s]

tensor(2.6153, device='cuda:0', grad_fn=<NllLossBackward0>)


1070it [10:34,  1.69it/s]

tensor(2.2786, device='cuda:0', grad_fn=<NllLossBackward0>)


1080it [10:40,  1.69it/s]

tensor(2.5617, device='cuda:0', grad_fn=<NllLossBackward0>)


1090it [10:46,  1.69it/s]

tensor(2.8438, device='cuda:0', grad_fn=<NllLossBackward0>)


1100it [10:52,  1.69it/s]

tensor(2.1564, device='cuda:0', grad_fn=<NllLossBackward0>)


1110it [10:58,  1.69it/s]

tensor(2.5531, device='cuda:0', grad_fn=<NllLossBackward0>)


1120it [11:04,  1.69it/s]

tensor(2.3415, device='cuda:0', grad_fn=<NllLossBackward0>)


1130it [11:10,  1.69it/s]

tensor(2.9004, device='cuda:0', grad_fn=<NllLossBackward0>)


1140it [11:16,  1.68it/s]

tensor(2.3103, device='cuda:0', grad_fn=<NllLossBackward0>)


1150it [11:22,  1.68it/s]

tensor(2.6014, device='cuda:0', grad_fn=<NllLossBackward0>)


1160it [11:28,  1.69it/s]

tensor(2.4832, device='cuda:0', grad_fn=<NllLossBackward0>)


1170it [11:34,  1.69it/s]

tensor(2.7433, device='cuda:0', grad_fn=<NllLossBackward0>)


1180it [11:39,  1.69it/s]

tensor(2.4850, device='cuda:0', grad_fn=<NllLossBackward0>)


1190it [11:45,  1.69it/s]
0it [00:00, ?it/s]

tensor(2.3100, device='cuda:0', grad_fn=<NllLossBackward0>)


10it [00:05,  1.69it/s]

tensor(2.3077, device='cuda:0', grad_fn=<NllLossBackward0>)


20it [00:11,  1.68it/s]

tensor(2.4206, device='cuda:0', grad_fn=<NllLossBackward0>)


30it [00:17,  1.69it/s]

tensor(2.3183, device='cuda:0', grad_fn=<NllLossBackward0>)


40it [00:23,  1.68it/s]

tensor(2.4060, device='cuda:0', grad_fn=<NllLossBackward0>)


50it [00:29,  1.69it/s]

tensor(2.5522, device='cuda:0', grad_fn=<NllLossBackward0>)


60it [00:35,  1.68it/s]

tensor(2.9431, device='cuda:0', grad_fn=<NllLossBackward0>)


70it [00:41,  1.69it/s]

tensor(2.4225, device='cuda:0', grad_fn=<NllLossBackward0>)


80it [00:47,  1.69it/s]

tensor(2.4783, device='cuda:0', grad_fn=<NllLossBackward0>)


90it [00:53,  1.68it/s]

tensor(2.7071, device='cuda:0', grad_fn=<NllLossBackward0>)


100it [00:59,  1.69it/s]

tensor(2.2062, device='cuda:0', grad_fn=<NllLossBackward0>)


110it [01:05,  1.69it/s]

tensor(2.4011, device='cuda:0', grad_fn=<NllLossBackward0>)


120it [01:11,  1.69it/s]

tensor(2.3601, device='cuda:0', grad_fn=<NllLossBackward0>)


130it [01:17,  1.69it/s]

tensor(2.4539, device='cuda:0', grad_fn=<NllLossBackward0>)


140it [01:23,  1.69it/s]

tensor(2.4514, device='cuda:0', grad_fn=<NllLossBackward0>)


150it [01:28,  1.69it/s]

tensor(2.4255, device='cuda:0', grad_fn=<NllLossBackward0>)


160it [01:34,  1.68it/s]

tensor(2.4528, device='cuda:0', grad_fn=<NllLossBackward0>)


170it [01:40,  1.69it/s]

tensor(2.5933, device='cuda:0', grad_fn=<NllLossBackward0>)


180it [01:46,  1.69it/s]

tensor(2.2315, device='cuda:0', grad_fn=<NllLossBackward0>)


190it [01:52,  1.69it/s]

tensor(2.8025, device='cuda:0', grad_fn=<NllLossBackward0>)


200it [01:58,  1.68it/s]

tensor(2.2278, device='cuda:0', grad_fn=<NllLossBackward0>)


210it [02:04,  1.69it/s]

tensor(2.3675, device='cuda:0', grad_fn=<NllLossBackward0>)


220it [02:10,  1.69it/s]

tensor(2.5781, device='cuda:0', grad_fn=<NllLossBackward0>)


230it [02:16,  1.69it/s]

tensor(2.4521, device='cuda:0', grad_fn=<NllLossBackward0>)


240it [02:22,  1.68it/s]

tensor(2.0807, device='cuda:0', grad_fn=<NllLossBackward0>)


250it [02:28,  1.69it/s]

tensor(2.2095, device='cuda:0', grad_fn=<NllLossBackward0>)


260it [02:34,  1.68it/s]

tensor(2.2014, device='cuda:0', grad_fn=<NllLossBackward0>)


270it [02:40,  1.69it/s]

tensor(3.1140, device='cuda:0', grad_fn=<NllLossBackward0>)


280it [02:46,  1.68it/s]

tensor(2.3483, device='cuda:0', grad_fn=<NllLossBackward0>)


290it [02:52,  1.68it/s]

tensor(2.0574, device='cuda:0', grad_fn=<NllLossBackward0>)


300it [02:58,  1.68it/s]

tensor(2.1534, device='cuda:0', grad_fn=<NllLossBackward0>)


310it [03:03,  1.69it/s]

tensor(2.5160, device='cuda:0', grad_fn=<NllLossBackward0>)


320it [03:09,  1.69it/s]

tensor(2.5608, device='cuda:0', grad_fn=<NllLossBackward0>)


330it [03:15,  1.69it/s]

tensor(2.2420, device='cuda:0', grad_fn=<NllLossBackward0>)


340it [03:21,  1.69it/s]

tensor(2.6164, device='cuda:0', grad_fn=<NllLossBackward0>)


350it [03:27,  1.69it/s]

tensor(2.3223, device='cuda:0', grad_fn=<NllLossBackward0>)


360it [03:33,  1.69it/s]

tensor(2.5306, device='cuda:0', grad_fn=<NllLossBackward0>)


370it [03:39,  1.69it/s]

tensor(2.8101, device='cuda:0', grad_fn=<NllLossBackward0>)


380it [03:45,  1.69it/s]

tensor(2.5025, device='cuda:0', grad_fn=<NllLossBackward0>)


390it [03:51,  1.68it/s]

tensor(2.6995, device='cuda:0', grad_fn=<NllLossBackward0>)


400it [03:57,  1.69it/s]

tensor(2.5402, device='cuda:0', grad_fn=<NllLossBackward0>)


410it [04:03,  1.68it/s]

tensor(2.4519, device='cuda:0', grad_fn=<NllLossBackward0>)


420it [04:09,  1.69it/s]

tensor(2.7145, device='cuda:0', grad_fn=<NllLossBackward0>)


430it [04:15,  1.69it/s]

tensor(2.3021, device='cuda:0', grad_fn=<NllLossBackward0>)


440it [04:21,  1.68it/s]

tensor(2.0168, device='cuda:0', grad_fn=<NllLossBackward0>)


450it [04:27,  1.69it/s]

tensor(2.8176, device='cuda:0', grad_fn=<NllLossBackward0>)


460it [04:32,  1.69it/s]

tensor(2.6012, device='cuda:0', grad_fn=<NllLossBackward0>)


470it [04:38,  1.69it/s]

tensor(2.2769, device='cuda:0', grad_fn=<NllLossBackward0>)


480it [04:44,  1.69it/s]

tensor(2.4181, device='cuda:0', grad_fn=<NllLossBackward0>)


490it [04:50,  1.69it/s]

tensor(2.7301, device='cuda:0', grad_fn=<NllLossBackward0>)


500it [04:56,  1.69it/s]

tensor(2.1492, device='cuda:0', grad_fn=<NllLossBackward0>)


510it [05:02,  1.68it/s]

tensor(2.5445, device='cuda:0', grad_fn=<NllLossBackward0>)


520it [05:08,  1.69it/s]

tensor(2.5764, device='cuda:0', grad_fn=<NllLossBackward0>)


530it [05:14,  1.68it/s]

tensor(2.1597, device='cuda:0', grad_fn=<NllLossBackward0>)


540it [05:20,  1.68it/s]

tensor(2.3635, device='cuda:0', grad_fn=<NllLossBackward0>)


550it [05:26,  1.68it/s]

tensor(2.5265, device='cuda:0', grad_fn=<NllLossBackward0>)


560it [05:32,  1.69it/s]

tensor(2.7088, device='cuda:0', grad_fn=<NllLossBackward0>)


570it [05:38,  1.69it/s]

tensor(2.6904, device='cuda:0', grad_fn=<NllLossBackward0>)


580it [05:44,  1.68it/s]

tensor(2.4733, device='cuda:0', grad_fn=<NllLossBackward0>)


590it [05:50,  1.68it/s]

tensor(2.5643, device='cuda:0', grad_fn=<NllLossBackward0>)


600it [05:56,  1.69it/s]

tensor(2.7708, device='cuda:0', grad_fn=<NllLossBackward0>)


610it [06:01,  1.68it/s]

tensor(2.5058, device='cuda:0', grad_fn=<NllLossBackward0>)


620it [06:07,  1.68it/s]

tensor(2.1604, device='cuda:0', grad_fn=<NllLossBackward0>)


630it [06:13,  1.68it/s]

tensor(2.8375, device='cuda:0', grad_fn=<NllLossBackward0>)


640it [06:19,  1.68it/s]

tensor(2.5290, device='cuda:0', grad_fn=<NllLossBackward0>)


650it [06:25,  1.68it/s]

tensor(2.4758, device='cuda:0', grad_fn=<NllLossBackward0>)


660it [06:31,  1.68it/s]

tensor(2.4189, device='cuda:0', grad_fn=<NllLossBackward0>)


670it [06:37,  1.68it/s]

tensor(2.5542, device='cuda:0', grad_fn=<NllLossBackward0>)


680it [06:43,  1.68it/s]

tensor(2.3863, device='cuda:0', grad_fn=<NllLossBackward0>)


690it [06:49,  1.68it/s]

tensor(2.7366, device='cuda:0', grad_fn=<NllLossBackward0>)


700it [06:55,  1.69it/s]

tensor(2.6206, device='cuda:0', grad_fn=<NllLossBackward0>)


710it [07:01,  1.68it/s]

tensor(2.6630, device='cuda:0', grad_fn=<NllLossBackward0>)


720it [07:07,  1.68it/s]

tensor(2.6379, device='cuda:0', grad_fn=<NllLossBackward0>)


730it [07:13,  1.69it/s]

tensor(2.6947, device='cuda:0', grad_fn=<NllLossBackward0>)


740it [07:19,  1.68it/s]

tensor(2.6417, device='cuda:0', grad_fn=<NllLossBackward0>)


750it [07:25,  1.69it/s]

tensor(2.5072, device='cuda:0', grad_fn=<NllLossBackward0>)


760it [07:31,  1.69it/s]

tensor(2.1939, device='cuda:0', grad_fn=<NllLossBackward0>)


770it [07:36,  1.69it/s]

tensor(2.5979, device='cuda:0', grad_fn=<NllLossBackward0>)


780it [07:42,  1.68it/s]

tensor(2.4540, device='cuda:0', grad_fn=<NllLossBackward0>)


790it [07:48,  1.69it/s]

tensor(2.3067, device='cuda:0', grad_fn=<NllLossBackward0>)


800it [07:54,  1.69it/s]

tensor(2.1723, device='cuda:0', grad_fn=<NllLossBackward0>)


810it [08:00,  1.69it/s]

tensor(2.8127, device='cuda:0', grad_fn=<NllLossBackward0>)


820it [08:06,  1.69it/s]

tensor(2.4361, device='cuda:0', grad_fn=<NllLossBackward0>)


830it [08:12,  1.69it/s]

tensor(2.7440, device='cuda:0', grad_fn=<NllLossBackward0>)


840it [08:18,  1.69it/s]

tensor(2.3937, device='cuda:0', grad_fn=<NllLossBackward0>)


850it [08:24,  1.69it/s]

tensor(2.5322, device='cuda:0', grad_fn=<NllLossBackward0>)


860it [08:30,  1.68it/s]

tensor(2.3147, device='cuda:0', grad_fn=<NllLossBackward0>)


870it [08:36,  1.68it/s]

tensor(2.4063, device='cuda:0', grad_fn=<NllLossBackward0>)


880it [08:42,  1.69it/s]

tensor(2.6504, device='cuda:0', grad_fn=<NllLossBackward0>)


890it [08:48,  1.68it/s]

tensor(2.5449, device='cuda:0', grad_fn=<NllLossBackward0>)


900it [08:54,  1.68it/s]

tensor(2.5181, device='cuda:0', grad_fn=<NllLossBackward0>)


910it [09:00,  1.69it/s]

tensor(2.3925, device='cuda:0', grad_fn=<NllLossBackward0>)


920it [09:05,  1.69it/s]

tensor(2.2785, device='cuda:0', grad_fn=<NllLossBackward0>)


930it [09:11,  1.69it/s]

tensor(2.6603, device='cuda:0', grad_fn=<NllLossBackward0>)


940it [09:17,  1.68it/s]

tensor(2.0828, device='cuda:0', grad_fn=<NllLossBackward0>)


950it [09:23,  1.69it/s]

tensor(2.4351, device='cuda:0', grad_fn=<NllLossBackward0>)


960it [09:29,  1.69it/s]

tensor(2.4284, device='cuda:0', grad_fn=<NllLossBackward0>)


970it [09:35,  1.69it/s]

tensor(2.2674, device='cuda:0', grad_fn=<NllLossBackward0>)


980it [09:41,  1.68it/s]

tensor(2.6439, device='cuda:0', grad_fn=<NllLossBackward0>)


990it [09:47,  1.69it/s]

tensor(2.4820, device='cuda:0', grad_fn=<NllLossBackward0>)


1000it [09:53,  1.69it/s]

tensor(2.4876, device='cuda:0', grad_fn=<NllLossBackward0>)


1010it [09:59,  1.69it/s]

tensor(2.7694, device='cuda:0', grad_fn=<NllLossBackward0>)


1020it [10:05,  1.68it/s]

tensor(2.4847, device='cuda:0', grad_fn=<NllLossBackward0>)


1030it [10:11,  1.69it/s]

tensor(2.4297, device='cuda:0', grad_fn=<NllLossBackward0>)


1040it [10:17,  1.69it/s]

tensor(2.4947, device='cuda:0', grad_fn=<NllLossBackward0>)


1050it [10:23,  1.69it/s]

tensor(2.5655, device='cuda:0', grad_fn=<NllLossBackward0>)


1060it [10:29,  1.69it/s]

tensor(2.6462, device='cuda:0', grad_fn=<NllLossBackward0>)


1070it [10:35,  1.68it/s]

tensor(2.8254, device='cuda:0', grad_fn=<NllLossBackward0>)


1080it [10:40,  1.68it/s]

tensor(2.5408, device='cuda:0', grad_fn=<NllLossBackward0>)


1090it [10:46,  1.68it/s]

tensor(2.3296, device='cuda:0', grad_fn=<NllLossBackward0>)


1100it [10:52,  1.69it/s]

tensor(2.6200, device='cuda:0', grad_fn=<NllLossBackward0>)


1110it [10:58,  1.69it/s]

tensor(2.3350, device='cuda:0', grad_fn=<NllLossBackward0>)


1120it [11:04,  1.69it/s]

tensor(2.6167, device='cuda:0', grad_fn=<NllLossBackward0>)


1130it [11:10,  1.69it/s]

tensor(2.1550, device='cuda:0', grad_fn=<NllLossBackward0>)


1140it [11:16,  1.69it/s]

tensor(2.4813, device='cuda:0', grad_fn=<NllLossBackward0>)


1150it [11:22,  1.68it/s]

tensor(2.4282, device='cuda:0', grad_fn=<NllLossBackward0>)


1160it [11:28,  1.68it/s]

tensor(2.3644, device='cuda:0', grad_fn=<NllLossBackward0>)


1170it [11:34,  1.69it/s]

tensor(2.4076, device='cuda:0', grad_fn=<NllLossBackward0>)


1180it [11:40,  1.69it/s]

tensor(3.0417, device='cuda:0', grad_fn=<NllLossBackward0>)


1190it [11:45,  1.69it/s]
0it [00:00, ?it/s]

tensor(2.4904, device='cuda:0', grad_fn=<NllLossBackward0>)


10it [00:05,  1.69it/s]

tensor(2.4810, device='cuda:0', grad_fn=<NllLossBackward0>)


20it [00:11,  1.68it/s]

tensor(2.1795, device='cuda:0', grad_fn=<NllLossBackward0>)


30it [00:17,  1.68it/s]

tensor(2.1135, device='cuda:0', grad_fn=<NllLossBackward0>)


40it [00:23,  1.68it/s]

tensor(2.6092, device='cuda:0', grad_fn=<NllLossBackward0>)


50it [00:29,  1.68it/s]

tensor(2.4970, device='cuda:0', grad_fn=<NllLossBackward0>)


60it [00:35,  1.68it/s]

tensor(2.6592, device='cuda:0', grad_fn=<NllLossBackward0>)


70it [00:41,  1.69it/s]

tensor(2.3790, device='cuda:0', grad_fn=<NllLossBackward0>)


80it [00:47,  1.69it/s]

tensor(2.6547, device='cuda:0', grad_fn=<NllLossBackward0>)


90it [00:53,  1.69it/s]

tensor(2.3593, device='cuda:0', grad_fn=<NllLossBackward0>)


100it [00:59,  1.69it/s]

tensor(2.5579, device='cuda:0', grad_fn=<NllLossBackward0>)


110it [01:05,  1.69it/s]

tensor(2.2365, device='cuda:0', grad_fn=<NllLossBackward0>)


120it [01:11,  1.69it/s]

tensor(2.5199, device='cuda:0', grad_fn=<NllLossBackward0>)


130it [01:17,  1.68it/s]

tensor(2.4230, device='cuda:0', grad_fn=<NllLossBackward0>)


140it [01:23,  1.68it/s]

tensor(2.4017, device='cuda:0', grad_fn=<NllLossBackward0>)


150it [01:29,  1.69it/s]

tensor(2.0557, device='cuda:0', grad_fn=<NllLossBackward0>)


160it [01:34,  1.69it/s]

tensor(2.2542, device='cuda:0', grad_fn=<NllLossBackward0>)


170it [01:40,  1.69it/s]

tensor(1.9567, device='cuda:0', grad_fn=<NllLossBackward0>)


180it [01:46,  1.69it/s]

tensor(2.5573, device='cuda:0', grad_fn=<NllLossBackward0>)


190it [01:52,  1.69it/s]

tensor(2.8372, device='cuda:0', grad_fn=<NllLossBackward0>)


200it [01:58,  1.69it/s]

tensor(2.9693, device='cuda:0', grad_fn=<NllLossBackward0>)


210it [02:04,  1.69it/s]

tensor(2.4325, device='cuda:0', grad_fn=<NllLossBackward0>)


220it [02:10,  1.69it/s]

tensor(3.1496, device='cuda:0', grad_fn=<NllLossBackward0>)


230it [02:16,  1.68it/s]

tensor(2.5332, device='cuda:0', grad_fn=<NllLossBackward0>)


240it [02:22,  1.69it/s]

tensor(2.6498, device='cuda:0', grad_fn=<NllLossBackward0>)


250it [02:28,  1.68it/s]

tensor(2.3203, device='cuda:0', grad_fn=<NllLossBackward0>)


260it [02:34,  1.68it/s]

tensor(2.5757, device='cuda:0', grad_fn=<NllLossBackward0>)


270it [02:40,  1.69it/s]

tensor(2.3732, device='cuda:0', grad_fn=<NllLossBackward0>)


280it [02:46,  1.69it/s]

tensor(2.1600, device='cuda:0', grad_fn=<NllLossBackward0>)


290it [02:52,  1.68it/s]

tensor(1.9675, device='cuda:0', grad_fn=<NllLossBackward0>)


300it [02:57,  1.68it/s]

tensor(2.5689, device='cuda:0', grad_fn=<NllLossBackward0>)


310it [03:03,  1.68it/s]

tensor(2.5083, device='cuda:0', grad_fn=<NllLossBackward0>)


320it [03:09,  1.69it/s]

tensor(2.3223, device='cuda:0', grad_fn=<NllLossBackward0>)


330it [03:15,  1.68it/s]

tensor(2.4709, device='cuda:0', grad_fn=<NllLossBackward0>)


340it [03:21,  1.68it/s]

tensor(2.5957, device='cuda:0', grad_fn=<NllLossBackward0>)


350it [03:27,  1.69it/s]

tensor(2.1834, device='cuda:0', grad_fn=<NllLossBackward0>)


360it [03:33,  1.69it/s]

tensor(2.4224, device='cuda:0', grad_fn=<NllLossBackward0>)


370it [03:39,  1.68it/s]

tensor(2.7232, device='cuda:0', grad_fn=<NllLossBackward0>)


380it [03:45,  1.69it/s]

tensor(2.1737, device='cuda:0', grad_fn=<NllLossBackward0>)


390it [03:51,  1.68it/s]

tensor(2.5131, device='cuda:0', grad_fn=<NllLossBackward0>)


400it [03:57,  1.68it/s]

tensor(2.2378, device='cuda:0', grad_fn=<NllLossBackward0>)


410it [04:03,  1.69it/s]

tensor(2.8249, device='cuda:0', grad_fn=<NllLossBackward0>)


420it [04:09,  1.69it/s]

tensor(2.1161, device='cuda:0', grad_fn=<NllLossBackward0>)


430it [04:15,  1.68it/s]

tensor(2.4059, device='cuda:0', grad_fn=<NllLossBackward0>)


440it [04:21,  1.69it/s]

tensor(2.1804, device='cuda:0', grad_fn=<NllLossBackward0>)


450it [04:27,  1.68it/s]

tensor(2.5224, device='cuda:0', grad_fn=<NllLossBackward0>)


460it [04:32,  1.69it/s]

tensor(2.5587, device='cuda:0', grad_fn=<NllLossBackward0>)


470it [04:38,  1.69it/s]

tensor(2.2707, device='cuda:0', grad_fn=<NllLossBackward0>)


480it [04:44,  1.68it/s]

tensor(2.7391, device='cuda:0', grad_fn=<NllLossBackward0>)


490it [04:50,  1.68it/s]

tensor(2.6356, device='cuda:0', grad_fn=<NllLossBackward0>)


500it [04:56,  1.69it/s]

tensor(2.1368, device='cuda:0', grad_fn=<NllLossBackward0>)


510it [05:02,  1.69it/s]

tensor(2.7009, device='cuda:0', grad_fn=<NllLossBackward0>)


520it [05:08,  1.68it/s]

tensor(2.6413, device='cuda:0', grad_fn=<NllLossBackward0>)


530it [05:14,  1.68it/s]

tensor(2.7166, device='cuda:0', grad_fn=<NllLossBackward0>)


540it [05:20,  1.69it/s]

tensor(2.4654, device='cuda:0', grad_fn=<NllLossBackward0>)


550it [05:26,  1.69it/s]

tensor(2.1221, device='cuda:0', grad_fn=<NllLossBackward0>)


560it [05:32,  1.69it/s]

tensor(2.6474, device='cuda:0', grad_fn=<NllLossBackward0>)


570it [05:38,  1.69it/s]

tensor(2.1043, device='cuda:0', grad_fn=<NllLossBackward0>)


580it [05:44,  1.69it/s]

tensor(2.1313, device='cuda:0', grad_fn=<NllLossBackward0>)


590it [05:50,  1.69it/s]

tensor(3.2087, device='cuda:0', grad_fn=<NllLossBackward0>)


600it [05:56,  1.68it/s]

tensor(2.5351, device='cuda:0', grad_fn=<NllLossBackward0>)


610it [06:01,  1.69it/s]

tensor(2.5641, device='cuda:0', grad_fn=<NllLossBackward0>)


620it [06:07,  1.69it/s]

tensor(2.3944, device='cuda:0', grad_fn=<NllLossBackward0>)


630it [06:13,  1.69it/s]

tensor(2.2030, device='cuda:0', grad_fn=<NllLossBackward0>)


640it [06:19,  1.69it/s]

tensor(2.3318, device='cuda:0', grad_fn=<NllLossBackward0>)


650it [06:25,  1.69it/s]

tensor(2.2296, device='cuda:0', grad_fn=<NllLossBackward0>)


660it [06:31,  1.68it/s]

tensor(2.4255, device='cuda:0', grad_fn=<NllLossBackward0>)


670it [06:37,  1.68it/s]

tensor(2.2320, device='cuda:0', grad_fn=<NllLossBackward0>)


680it [06:43,  1.69it/s]

tensor(2.2673, device='cuda:0', grad_fn=<NllLossBackward0>)


690it [06:49,  1.69it/s]

tensor(2.6231, device='cuda:0', grad_fn=<NllLossBackward0>)


700it [06:55,  1.69it/s]

tensor(2.5499, device='cuda:0', grad_fn=<NllLossBackward0>)


710it [07:01,  1.69it/s]

tensor(2.7306, device='cuda:0', grad_fn=<NllLossBackward0>)


720it [07:07,  1.69it/s]

tensor(2.5770, device='cuda:0', grad_fn=<NllLossBackward0>)


730it [07:13,  1.68it/s]

tensor(2.6019, device='cuda:0', grad_fn=<NllLossBackward0>)


740it [07:19,  1.69it/s]

tensor(1.9075, device='cuda:0', grad_fn=<NllLossBackward0>)


750it [07:25,  1.69it/s]

tensor(2.3964, device='cuda:0', grad_fn=<NllLossBackward0>)


760it [07:30,  1.69it/s]

tensor(2.1927, device='cuda:0', grad_fn=<NllLossBackward0>)


770it [07:36,  1.68it/s]

tensor(2.2918, device='cuda:0', grad_fn=<NllLossBackward0>)


780it [07:42,  1.69it/s]

tensor(2.4696, device='cuda:0', grad_fn=<NllLossBackward0>)


790it [07:48,  1.69it/s]

tensor(2.5554, device='cuda:0', grad_fn=<NllLossBackward0>)


800it [07:54,  1.68it/s]

tensor(2.5632, device='cuda:0', grad_fn=<NllLossBackward0>)


810it [08:00,  1.68it/s]

tensor(2.8985, device='cuda:0', grad_fn=<NllLossBackward0>)


820it [08:06,  1.69it/s]

tensor(2.4783, device='cuda:0', grad_fn=<NllLossBackward0>)


830it [08:12,  1.69it/s]

tensor(2.4082, device='cuda:0', grad_fn=<NllLossBackward0>)


840it [08:18,  1.69it/s]

tensor(2.1648, device='cuda:0', grad_fn=<NllLossBackward0>)


850it [08:24,  1.68it/s]

tensor(2.2291, device='cuda:0', grad_fn=<NllLossBackward0>)


860it [08:30,  1.69it/s]

tensor(2.3950, device='cuda:0', grad_fn=<NllLossBackward0>)


870it [08:36,  1.69it/s]

tensor(2.3421, device='cuda:0', grad_fn=<NllLossBackward0>)


880it [08:42,  1.69it/s]

tensor(2.5320, device='cuda:0', grad_fn=<NllLossBackward0>)


890it [08:48,  1.69it/s]

tensor(2.5235, device='cuda:0', grad_fn=<NllLossBackward0>)


900it [08:54,  1.69it/s]

tensor(2.2237, device='cuda:0', grad_fn=<NllLossBackward0>)


910it [08:59,  1.69it/s]

tensor(2.6872, device='cuda:0', grad_fn=<NllLossBackward0>)


920it [09:05,  1.69it/s]

tensor(2.8519, device='cuda:0', grad_fn=<NllLossBackward0>)


930it [09:11,  1.68it/s]

tensor(2.3707, device='cuda:0', grad_fn=<NllLossBackward0>)


940it [09:17,  1.69it/s]

tensor(2.3828, device='cuda:0', grad_fn=<NllLossBackward0>)


950it [09:23,  1.69it/s]

tensor(2.5136, device='cuda:0', grad_fn=<NllLossBackward0>)


960it [09:29,  1.68it/s]

tensor(2.1185, device='cuda:0', grad_fn=<NllLossBackward0>)


970it [09:35,  1.69it/s]

tensor(2.4025, device='cuda:0', grad_fn=<NllLossBackward0>)


980it [09:41,  1.69it/s]

tensor(2.5003, device='cuda:0', grad_fn=<NllLossBackward0>)


990it [09:47,  1.69it/s]

tensor(2.0655, device='cuda:0', grad_fn=<NllLossBackward0>)


1000it [09:53,  1.69it/s]

tensor(2.1749, device='cuda:0', grad_fn=<NllLossBackward0>)


1010it [09:59,  1.69it/s]

tensor(2.0042, device='cuda:0', grad_fn=<NllLossBackward0>)


1020it [10:05,  1.68it/s]

tensor(2.3381, device='cuda:0', grad_fn=<NllLossBackward0>)


1030it [10:11,  1.69it/s]

tensor(2.3315, device='cuda:0', grad_fn=<NllLossBackward0>)


1040it [10:17,  1.68it/s]

tensor(2.2991, device='cuda:0', grad_fn=<NllLossBackward0>)


1050it [10:23,  1.69it/s]

tensor(2.1329, device='cuda:0', grad_fn=<NllLossBackward0>)


1060it [10:28,  1.69it/s]

tensor(2.5768, device='cuda:0', grad_fn=<NllLossBackward0>)


1070it [10:34,  1.68it/s]

tensor(1.8586, device='cuda:0', grad_fn=<NllLossBackward0>)


1080it [10:40,  1.69it/s]

tensor(2.5879, device='cuda:0', grad_fn=<NllLossBackward0>)


1090it [10:46,  1.68it/s]

tensor(2.5804, device='cuda:0', grad_fn=<NllLossBackward0>)


1100it [10:52,  1.69it/s]

tensor(2.0450, device='cuda:0', grad_fn=<NllLossBackward0>)


1110it [10:58,  1.67it/s]

tensor(2.4495, device='cuda:0', grad_fn=<NllLossBackward0>)


1120it [11:04,  1.68it/s]

tensor(2.2226, device='cuda:0', grad_fn=<NllLossBackward0>)


1130it [11:10,  1.69it/s]

tensor(2.3901, device='cuda:0', grad_fn=<NllLossBackward0>)


1140it [11:16,  1.69it/s]

tensor(2.5906, device='cuda:0', grad_fn=<NllLossBackward0>)


1150it [11:22,  1.69it/s]

tensor(2.5799, device='cuda:0', grad_fn=<NllLossBackward0>)


1160it [11:28,  1.69it/s]

tensor(2.2399, device='cuda:0', grad_fn=<NllLossBackward0>)


1170it [11:34,  1.69it/s]

tensor(2.4366, device='cuda:0', grad_fn=<NllLossBackward0>)


1180it [11:40,  1.68it/s]

tensor(2.4779, device='cuda:0', grad_fn=<NllLossBackward0>)


1190it [11:45,  1.69it/s]


In [13]:
#模型的保存
model.save_pretrained(CFG.output_dir)



In [14]:
!ls $CFG.output_dir

README.md  adapter_config.json	adapter_model.safetensors
