In [3]:
import pandas as pd
import numpy as np
import spacy 
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, matthews_corrcoef
from transformers import BertForSequenceClassification, AutoTokenizer
from transformers import AdamW, get_linear_schedule_with_warmup
from tqdm import tqdm, trange
import random
import os
import torch.nn.functional as F
import torch
from torch.nn import CrossEntropyLoss
from torch.utils.data import DataLoader
import transformers
from tqdm import tqdm, trange
from utils import normalizeTweet, split_into_sentences, bio_tagging, create_training_data





In [6]:
data = pd.read_excel("Causality + hypoglycemia.xlsx", sheet_name=">5000_samples_")
#data = pd.read_excel("/home/adrian/Downloads/Causality + hypoglycemia.xlsx", sheet_name=">5000_samples_")
print("Total count:", data.shape[0])
data = data[data["Causal association"].notnull()]
print("Labeled count:", data.shape[0])

data.head()

  data = pd.read_excel("Causality + hypoglycemia.xlsx", sheet_name=">5000_samples_")


Total count: 5456
Labeled count: 5000


Unnamed: 0,id,text,full_text,Intent,Cause,Effect,Causal association,Charline association0=no;1=yes,Remarks
0,908171203029868545,"tonight , I learned my older girl will back he...","tonight , I learned my older girl will back he...",,,,0.0,,
1,1203645589214367745,USER USER I knew diabetes and fibromyalgia wer...,USER USER I knew diabetes and fibromyalgia wer...,joke,,,0.0,,
2,1310596731063525376,⬇ ️ ⬇ ️ ⬇ ️ THIS ⬇ ️ ⬇ ️ ⬇ ️ My wife has type ...,⬇ ️ ⬇ ️ ⬇ ️ THIS ⬇ ️ ⬇ ️ ⬇ ️ My wife has type ...,mS,,,0.0,,
3,1125198453167022085,USER Cheers ! Have one for this diabetic too !,USER Cheers ! Have one for this diabetic too !,mS,,,0.0,,
4,1248600944138268673,USER Additionally the medicines are being char...,USER Additionally the medicines are being char...,,medicines are being charged at MRP,costing much higher,1.0,,


### Interrater-reliabilty measure

In [7]:
from sklearn.metrics import cohen_kappa_score

charline = data[data["Charline association0=no;1=yes"].notnull()]
coder1 = charline["Causal association"].values
coder2 = charline["Charline association0=no;1=yes"]
score = cohen_kappa_score(coder1,coder2)
#print('Cohen\'s Kappa:',score)

### Data Preprocessing

In [8]:
data["Causal association"].value_counts()

0.0    3720
1.0    1280
Name: Causal association, dtype: int64

In [9]:
trainingData = create_training_data(data, min_words_in_sentences=3)
trainingData.head()

Unnamed: 0,tweet,Causal association,BIOtags
0,"tonight , I learned my older girl will back he...",0.0,"[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
1,⬇ ️ ⬇ ️ ⬇ ️ THIS ⬇ ️ ⬇ ️ ⬇ ️ My wife has type ...,0.0,"[O, O, O, O, O, O, O, O, O, O, O, O, O, O]"
2,I'm a trans woman .,0.0,"[O, O, O, O, O, O]"
3,"Both of us could use a world where "" brave and...",0.0,"[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
4,"Make a world where people can just be , withou...",0.0,"[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."


In [10]:
for i,row in trainingData.sample(n=10).iterrows():
    print("\n")
    print(row["tweet"])
    print(row["BIOtags"])



You don't care about anyone .
['O', 'O', 'O', 'O', 'O', 'O', 'O']


This is someone who has medical aid .
['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']


But the temptation to tinker is ever present and may tend to rock the boat 🤔 HTTPURL
['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']


She was surprised .
['O', 'O', 'O', 'O']


#MentalHealthAwarenessWeek people is dying cause suicide , and in my country they said that is about diabetes .
['O', 'O', 'O', 'B-E', 'O', 'B-C', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']


We have to get the info out there and try to do better .
['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']


USER USER I read he was diabetic and asthmatic as well .
['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']


Frivolous me excited by new insulin pen ; sensible me thinks how lucky I am to have free access to pen & insulin #StayinAlive #Insulin4all HTTPURL
['O', 'O', 'B-E', 'O', 'O',

In [11]:
trainingData["Causal association"].value_counts()

0.0    7607
1.0    1019
Name: Causal association, dtype: int64

### Training

In [12]:
trainingDataSample = trainingData#.sample(n=200)
train = trainingDataSample.sample(frac=0.8, random_state=0)
test = trainingDataSample.drop(train.index)
validate = train.sample(frac=0.2, random_state=0)
train = train.drop(validate.index)
print("Train:", train.shape)
print("Validate:", validate.shape)
print("Test:", test.shape)

Train: (5521, 3)
Validate: (1380, 3)
Test: (1725, 3)


In [13]:

# Transform labels + encodings into Pytorch DataSet object (including __len__, __getitem__)
class TweetDataSet(torch.utils.data.Dataset):
    def __init__(self, text, labels, bio_tags, tokenizer):
        self.text = text
        self.labels = labels
        self.tokenizer = tokenizer
        self.bio_tags = bio_tags
        self.tag2id = {label: idx for idx, label in enumerate(["O", "B-C", "I-C", "B-E", "I-E"])}
        self.tag2id[-100] = -100
        self.id2tag = {id:tag for tag,id in self.tag2id.items()}

    def __getitem__(self, idx):
        inputs = self.tokenizer(self.text, padding=True, truncation=True, return_token_type_ids=True)
        ids = inputs["input_ids"]
        mask = inputs["attention_mask"]
        token_type_ids = inputs["token_type_ids"]
        bio_tags_extended = self.extend_tags(self.text[idx], self.bio_tags[idx], ids[idx])
        assert(len(ids[idx]) == len(bio_tags_extended), "token ids and BIO tags lengths do not match!")
        return {
                "input_ids" : torch.tensor(ids[idx], dtype=torch.long)
              , "attention_mask" : torch.tensor(mask[idx], dtype=torch.long)
              , "token_type_ids" : torch.tensor(token_type_ids[idx], dtype=torch.long)
              , "labels" : torch.tensor(self.labels[idx], dtype=torch.float)
              , "bio_tags" : torch.tensor(list(map(lambda bioTags: self.tag2id[bioTags], bio_tags_extended))
, dtype=torch.long)
        }

    def __len__(self):
        return len(self.labels)

    
    def extend_tags(self, tokens_old, tags_old, ids_tokenized_padded):
        """ 
            Each token has a BIO tag label. 
            However BERT's tokenization splits tokens into subwords. How to label those subwords?
            
            Option 1:
            ---------
            
            add the same label to each subword than the first subword. Only replace "B" by "I"
            Ex. 
            #lowbloodsugar => '#low@@', 'blood@@', 'sugar@@'
               "B-C"       =>   "B-C" ,   "I-C"  ,   "I-C"
            
            Option 2 (implemented):      
            ---------
            
            From : https://huggingface.co/transformers/custom_datasets.html#token-classification-with-w-nut-emerging-entities
            A common obstacle with using pre-trained models for token-level classification: many of the tokens in
            the W-NUT corpus are not in DistilBert’s vocabulary. Bert and many models like it use a method called 
            WordPiece Tokenization, meaning that single words are split into multiple tokens such that each token
            is likely to be in the vocabulary. For example, DistilBert’s tokenizer would split the Twitter 
            handle @huggingface into the tokens ['@', 'hugging', '##face']. This is a problem for us because we 
            have exactly one tag per token. If the tokenizer splits a token into multiple sub-tokens, then we will
            end up with a mismatch between our tokens and our labels.

            One way to handle this is to only train on the tag labels for the first subtoken of a split token. 
            We can do this in 🤗 Transformers by setting the labels we wish to ignore to -100. 
            In the example above, if the label for @HuggingFace is 3 (indexing B-corporation), we would set 
            the labels of ['@', 'hugging', '##face'] to [3, -100, -100].
        """
        tags = [-100] # add for start token <CLS>
        for token_old, tag in zip(tokens_old.split(" "), tags_old):
#            print(F"\ntoken_old: {token_old};    tag: {tag}")
            for i, sub_token in enumerate(self.tokenizer.tokenize(token_old)):
                if (i == 0):
                    tags.append(tag)
                else: 
                    tags.append(-100)
           
        tags.append(-100) # 0 for end of sentence token
    
        # append -100 for all padded elements
        padded_elements = ids_tokenized_padded.count(1) # id 1 is <PAD> ; Alternative: where attention_mask == 0 add -100
        tags.extend([-100]*padded_elements)
        
        return tags
        
        
    
tokenizer = AutoTokenizer.from_pretrained("vinai/bertweet-base")

train_dataset = TweetDataSet(train["tweet"].map(normalizeTweet).values.tolist()
                           , train["Causal association"].values.tolist()
                           , train["BIOtags"].values.tolist()
                           , tokenizer)
val_dataset = TweetDataSet(validate["tweet"].map(normalizeTweet).values.tolist()
                           , validate["Causal association"].values.tolist()
                           , validate["BIOtags"].values.tolist()
                           , tokenizer)
test_dataset = TweetDataSet(test["tweet"].map(normalizeTweet).values.tolist()
                           , test["Causal association"].values.tolist()
                           , test["BIOtags"].values.tolist()
                           , tokenizer)
print(len(train_dataset))
print(len(val_dataset))
print(len(test_dataset))

# put data to batches
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
validation_loader = DataLoader(val_dataset, batch_size=8, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=8, shuffle=True)


  assert(len(ids[idx]) == len(bio_tags_extended), "token ids and BIO tags lengths do not match!")
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


5521
1380
1725


In [15]:
# 1) Trainer 
from transformers import DistilBertForSequenceClassification, Trainer, TrainingArguments
from transformers import AutoModelForSequenceClassification
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def compute_metrics(pred, labels):
    """
        Dataset is unbalanced -> measure weighted metrics
        Calculate metrics for each label, and find their average wieghted by support (Number of true instances for each label)
        This alters 'macro' to account for label imbalance;
        it can result in an F-Score taht is not between precision and recall
    """
    precision, recall, f1, _ = precision_recall_fscore_support(labels, pred, average='weighted') #binary
    acc = accuracy_score(labels, pred)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }



class CausalNER(torch.nn.Module):
    """ Model Bert"""
    def __init__(self):
        super(CausalNER, self).__init__()
        self.num_labels = 5 # B-C, I-C, B-E, I-E, O
        self.bert = transformers.BertModel.from_pretrained("vinai/bertweet-base")
        self.dropout = torch.nn.Dropout(0.3)
        self.linear1 = torch.nn.Linear(768, 256)
        self.linear2 = torch.nn.Linear(256, self.num_labels)
        self.softmax = torch.nn.Softmax(-1)
        
    def forward(self, input_ids, attention_mask, token_type_ids):
#        _, output_1 = self.bert(input_ids, attention_mask = attention_mask, token_type_ids=token_type_ids, return_dict=False) # if output 1 is our cls token
        output_seq, _ = self.bert(input_ids, attention_mask = attention_mask, token_type_ids=token_type_ids, return_dict=False) # if output 1 is our cls token
        output_2 = self.dropout(output_seq)
        output_3 = self.linear1(output_2)
        output_4 = self.dropout(output_3)
        output_5 = self.linear2(output_4)
        logit = self.softmax(output_5)
        return logit


## Model parameters
batchsize_train = 16
lr = 5e-5
adam_eps = 1e-8
epochs = 10
num_warmup_steps = 0
num_training_steps = len(train_loader)*epochs

# Store our loss and learning rate for plotting
train_loss_set = []
learning_rate = []


device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

model = CausalNER()
model.to(device)

# fine-tune only the task-specific parameters -> Vivek? 
for param in model.bert.parameters():
    param.requires_grad = False
    


optim = AdamW(model.parameters(), lr=lr, eps=adam_eps)
scheduler = get_linear_schedule_with_warmup(optim, num_warmup_steps=num_warmup_steps, num_training_steps=num_training_steps)
loss_fn = CrossEntropyLoss(ignore_index=-100) # ignore subwords/tokens with label -100 


You are using a model of type roberta to instantiate a model of type bert. This is not supported for all configurations of models and can yield errors.
Some weights of the model checkpoint at vinai/bertweet-base were not used when initializing BertModel: ['roberta.encoder.layer.3.output.dense.weight', 'lm_head.decoder.weight', 'roberta.encoder.layer.6.intermediate.dense.bias', 'roberta.encoder.layer.8.output.dense.bias', 'roberta.encoder.layer.11.attention.self.value.bias', 'roberta.encoder.layer.1.attention.output.dense.weight', 'roberta.encoder.layer.3.output.LayerNorm.bias', 'roberta.encoder.layer.9.attention.self.key.bias', 'roberta.encoder.layer.0.intermediate.dense.weight', 'roberta.encoder.layer.5.intermediate.dense.bias', 'roberta.encoder.layer.9.attention.output.LayerNorm.weight', 'roberta.encoder.layer.3.attention.self.key.weight', 'roberta.encoder.layer.6.intermediate.dense.weight', 'roberta.encoder.layer.6.attention.self.query.bias', 'roberta.encoder.layer.5.attention.self.

### Training

In [16]:
N_bio_tags = 5 # "O", "B-C", "I-C", "B-E", "I-C"
for epoch in trange(1, epochs+1, desc='Epoch'):
    print("<" + "="*22 + F" Epoch {epoch} "+ "="*22 + ">")

    
    ############ training eval metrics ######################
    nb_tr_steps = 0 # Tracking variables
    train_loss = []
    train_acc = []
    train_prec = []
    train_rec = []
    train_f1 = []
    
    #########################################################
    
    
    for batch in tqdm(train_loader):
        optim.zero_grad() # gradients get accumulated by default -> clear previous accumulated gradients
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        token_type_ids = batch["token_type_ids"].to(device)
        labels = batch['labels'].to(device)
        bio_tags = batch['bio_tags'].to(device)
        
        ################################################
        model.train() # set model to training mode
        logits = model(**{"input_ids":input_ids, "attention_mask":attention_mask, "token_type_ids":token_type_ids}) # forward pass

        ################################################ 
        # similar to the class RobertaForToken classification in transformers: https://github.com/huggingface/transformers/blob/master/src/transformers/models/roberta/modeling_roberta.py
        active_loss = attention_mask.view(-1) == 1  # either based on attention_mask (includes <CLS>, <SEP> token)
        active_logits = logits.view(-1, N_bio_tags)[active_loss] # N_bio_tags=5 
        active_tags = bio_tags.view(-1)[active_loss]
        loss = loss_fn(active_logits, active_tags)             
        print("loss:", loss)       ## TODO VIVEK: check loss function calculation
        loss.backward() # backward pass
        optim.step()    # update parameters and take a steup using the computed gradient
        scheduler.step()# update learning rate scheduler
        train_loss.append(loss.item())
            
            
        ################## Training Performance Measures ##########
        logits = logits.detach().to('cpu').numpy()
        tags_ids = bio_tags.to('cpu').numpy()

        # calculate performance measures only on tokens and not subwords or special tokens
        tags_mask = tags_ids != -100 # only get token labels and not labels from subwords or special tokens
        pred = np.argmax(logits, axis=2)[tags_mask] #.flatten() # convert logits to list of predicted labels
        tags = tags_ids[tags_mask]                      
                
        metrics = compute_metrics(pred, tags)
        train_acc.append(metrics["accuracy"])
        train_prec.append(metrics["precision"])
        train_rec.append(metrics["recall"])
        train_f1.append(metrics["f1"])
                          
        nb_tr_steps += 1
           
    print(F'\n\tTraining Loss: {np.mean(train_loss)}')
    print(F'\n\tTraining acc: {np.mean(train_acc)}')
    print(F'\n\tTraining prec: {np.mean(train_prec)}')
    print(F'\n\tTraining rec: {np.mean(train_rec)}')
    print(F'\n\tTraining f1: {np.mean(train_f1)}')
                          
                          
    # store the current learning rate
    for param_group in optim.param_groups:
        print("\n\tCurrent Learning rate: ", param_group['lr'])
        learning_rate.append(param_group['lr'])
    

    ############# Validation ################
    
    nb_eval_steps = 0 # Tracking variables
    val_accuracy = []
    val_loss = []
    val_acc = []
    val_prec = []
    val_rec = []
    val_f1 = []

    # Evaluate data for one epoch
    for batch in tqdm(validation_loader):
        batch = tuple(batch[t].to(device) for t in batch)      # batch to GPU
        v_input_ids, v_input_mask, v_token_type_ids, v_labels, v_bio_tags = batch  # unpack inputs from dataloader
        
        with torch.no_grad(): # tell model not to compute or store gradients -> saves memory + speeds up validation
            model.eval() # put model in evaluation mode for validation set
            logits = model(**{"input_ids":v_input_ids, "attention_mask":v_input_mask, "token_type_ids":v_token_type_ids}) # forward pass, calculates logit predictions

        ######################################################
        
        # similar to the class RobertaForToken classification in transformers: https://github.com/huggingface/transformers/blob/master/src/transformers/models/roberta/modeling_roberta.py
        v_active_loss = v_input_mask.view(-1) == 1  # either based on attention_mask (includes <CLS>, <SEP> token)
        v_active_logits = logits.view(-1, N_bio_tags)[v_active_loss] # 5 
        v_active_tags = v_bio_tags.view(-1)[v_active_loss]
        v_loss = loss_fn(v_active_logits, v_active_tags)             
        val_loss.append(v_loss.item())
              
        #########################################################
        logits = logits.detach().to('cpu').numpy()
        tags_ids = v_bio_tags.to('cpu').numpy()

        # calculate performance measures only on tokens and not subwords or special tokens
        tags_mask = tags_ids != -100 # only get token labels and not labels from subwords or special tokens
        pred = np.argmax(logits, axis=2)[tags_mask] #.flatten() # convert logits to list of predicted labels
        tags = tags_ids[tags_mask]#.flatten()        
        
        metrics = compute_metrics(pred, tags)
        val_acc.append(metrics["accuracy"])
        val_prec.append(metrics["precision"])
        val_rec.append(metrics["recall"])
        val_f1.append(metrics["f1"])
                              
        nb_eval_steps += 1
        
    print(F'\n\tValidation Loss: {np.mean(val_loss)}')
    print(F'\n\tValidation acc: {np.mean(val_acc)}')
    print(F'\n\tValidation prec: {np.mean(val_prec)}')
    print(F'\n\tValidation rec: {np.mean(val_rec)}')
    print(F'\n\tValidation f1: {np.mean(val_f1)}')
    


Epoch:   0%|          | 0/10 [00:00<?, ?it/s]
  0%|          | 0/346 [00:00<?, ?it/s][A




  0%|          | 1/346 [00:06<35:43,  6.21s/it][A

loss: tensor(1.5494, device='cuda:0', grad_fn=<NllLossBackward>)


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))

  1%|          | 2/346 [00:11<33:06,  5.77s/it][A

loss: tensor(1.5088, device='cuda:0', grad_fn=<NllLossBackward>)



  1%|          | 3/346 [00:17<31:58,  5.59s/it][A

loss: tensor(1.5073, device='cuda:0', grad_fn=<NllLossBackward>)



  1%|          | 4/346 [00:22<31:35,  5.54s/it][A

loss: tensor(1.4385, device='cuda:0', grad_fn=<NllLossBackward>)



  1%|▏         | 5/346 [00:27<31:06,  5.47s/it][A

loss: tensor(1.4134, device='cuda:0', grad_fn=<NllLossBackward>)



  2%|▏         | 6/346 [00:33<30:51,  5.45s/it][A

loss: tensor(1.3899, device='cuda:0', grad_fn=<NllLossBackward>)



  2%|▏         | 7/346 [00:38<30:45,  5.44s/it][A

loss: tensor(1.3247, device='cuda:0', grad_fn=<NllLossBackward>)



  2%|▏         | 8/346 [00:44<30:45,  5.46s/it][A

loss: tensor(1.2737, device='cuda:0', grad_fn=<NllLossBackward>)



  3%|▎         | 9/346 [00:49<30:31,  5.43s/it][A

loss: tensor(1.2527, device='cuda:0', grad_fn=<NllLossBackward>)



  3%|▎         | 10/346 [00:55<30:27,  5.44s/it][A

loss: tensor(1.2267, device='cuda:0', grad_fn=<NllLossBackward>)



  3%|▎         | 11/346 [01:00<30:20,  5.44s/it][A

loss: tensor(1.1708, device='cuda:0', grad_fn=<NllLossBackward>)



  3%|▎         | 12/346 [01:05<30:08,  5.41s/it][A

loss: tensor(1.1748, device='cuda:0', grad_fn=<NllLossBackward>)



  4%|▍         | 13/346 [01:11<30:01,  5.41s/it][A

loss: tensor(1.1389, device='cuda:0', grad_fn=<NllLossBackward>)



  4%|▍         | 14/346 [01:16<30:02,  5.43s/it][A

loss: tensor(1.1290, device='cuda:0', grad_fn=<NllLossBackward>)



  4%|▍         | 15/346 [01:22<29:55,  5.42s/it][A

loss: tensor(1.0869, device='cuda:0', grad_fn=<NllLossBackward>)



  5%|▍         | 16/346 [01:27<29:39,  5.39s/it][A

loss: tensor(1.0481, device='cuda:0', grad_fn=<NllLossBackward>)



  5%|▍         | 17/346 [01:32<29:29,  5.38s/it][A

loss: tensor(1.0478, device='cuda:0', grad_fn=<NllLossBackward>)



  5%|▌         | 18/346 [01:38<29:19,  5.36s/it][A

loss: tensor(1.0334, device='cuda:0', grad_fn=<NllLossBackward>)



  5%|▌         | 19/346 [01:43<29:19,  5.38s/it][A

loss: tensor(1.0514, device='cuda:0', grad_fn=<NllLossBackward>)



  6%|▌         | 20/346 [01:48<29:08,  5.36s/it][A

loss: tensor(1.0173, device='cuda:0', grad_fn=<NllLossBackward>)



  6%|▌         | 21/346 [01:54<28:59,  5.35s/it][A

loss: tensor(1.0031, device='cuda:0', grad_fn=<NllLossBackward>)



  6%|▋         | 22/346 [01:59<28:53,  5.35s/it][A

loss: tensor(0.9704, device='cuda:0', grad_fn=<NllLossBackward>)



  7%|▋         | 23/346 [02:04<28:52,  5.36s/it][A

loss: tensor(0.9731, device='cuda:0', grad_fn=<NllLossBackward>)



  7%|▋         | 24/346 [02:10<28:42,  5.35s/it][A

loss: tensor(0.9703, device='cuda:0', grad_fn=<NllLossBackward>)



  7%|▋         | 25/346 [02:15<28:32,  5.33s/it][A

loss: tensor(0.9788, device='cuda:0', grad_fn=<NllLossBackward>)



  8%|▊         | 26/346 [02:20<28:29,  5.34s/it][A

loss: tensor(1.0114, device='cuda:0', grad_fn=<NllLossBackward>)



  8%|▊         | 27/346 [02:26<28:35,  5.38s/it][A

loss: tensor(0.9682, device='cuda:0', grad_fn=<NllLossBackward>)



  8%|▊         | 28/346 [02:31<28:26,  5.37s/it][A

loss: tensor(0.9531, device='cuda:0', grad_fn=<NllLossBackward>)



  8%|▊         | 29/346 [02:37<28:22,  5.37s/it][A

loss: tensor(0.9351, device='cuda:0', grad_fn=<NllLossBackward>)



  9%|▊         | 30/346 [02:42<28:24,  5.39s/it][A

loss: tensor(0.9411, device='cuda:0', grad_fn=<NllLossBackward>)



  9%|▉         | 31/346 [02:47<28:17,  5.39s/it][A

loss: tensor(0.9435, device='cuda:0', grad_fn=<NllLossBackward>)



  9%|▉         | 32/346 [02:53<28:10,  5.38s/it][A

loss: tensor(0.9428, device='cuda:0', grad_fn=<NllLossBackward>)



 10%|▉         | 33/346 [02:58<28:05,  5.39s/it][A

loss: tensor(0.9539, device='cuda:0', grad_fn=<NllLossBackward>)



 10%|▉         | 34/346 [03:04<28:06,  5.40s/it][A

loss: tensor(0.9522, device='cuda:0', grad_fn=<NllLossBackward>)



 10%|█         | 35/346 [03:09<27:53,  5.38s/it][A

loss: tensor(0.9442, device='cuda:0', grad_fn=<NllLossBackward>)



 10%|█         | 36/346 [03:14<27:44,  5.37s/it][A

loss: tensor(0.9462, device='cuda:0', grad_fn=<NllLossBackward>)



 11%|█         | 37/346 [03:20<27:42,  5.38s/it][A

loss: tensor(1.0074, device='cuda:0', grad_fn=<NllLossBackward>)



 11%|█         | 38/346 [03:25<27:50,  5.42s/it][A

loss: tensor(0.9723, device='cuda:0', grad_fn=<NllLossBackward>)



 11%|█▏        | 39/346 [03:31<27:47,  5.43s/it][A

loss: tensor(0.9343, device='cuda:0', grad_fn=<NllLossBackward>)



 12%|█▏        | 40/346 [03:36<27:16,  5.35s/it][A

loss: tensor(0.9685, device='cuda:0', grad_fn=<NllLossBackward>)



 12%|█▏        | 41/346 [03:41<27:01,  5.32s/it][A

loss: tensor(0.9494, device='cuda:0', grad_fn=<NllLossBackward>)



 12%|█▏        | 42/346 [03:46<26:57,  5.32s/it][A

loss: tensor(0.9691, device='cuda:0', grad_fn=<NllLossBackward>)



 12%|█▏        | 43/346 [03:52<26:46,  5.30s/it][A

loss: tensor(0.9814, device='cuda:0', grad_fn=<NllLossBackward>)



 13%|█▎        | 44/346 [03:57<26:32,  5.27s/it][A

loss: tensor(0.9753, device='cuda:0', grad_fn=<NllLossBackward>)



 13%|█▎        | 45/346 [04:02<26:23,  5.26s/it][A

loss: tensor(0.9761, device='cuda:0', grad_fn=<NllLossBackward>)



 13%|█▎        | 46/346 [04:07<26:22,  5.27s/it][A

loss: tensor(0.9788, device='cuda:0', grad_fn=<NllLossBackward>)



 14%|█▎        | 47/346 [04:13<26:13,  5.26s/it][A

loss: tensor(0.9740, device='cuda:0', grad_fn=<NllLossBackward>)



 14%|█▍        | 48/346 [04:18<26:06,  5.26s/it][A

loss: tensor(0.9514, device='cuda:0', grad_fn=<NllLossBackward>)



 14%|█▍        | 49/346 [04:23<25:59,  5.25s/it][A

loss: tensor(0.9393, device='cuda:0', grad_fn=<NllLossBackward>)



 14%|█▍        | 50/346 [04:28<25:58,  5.27s/it][A

loss: tensor(0.9425, device='cuda:0', grad_fn=<NllLossBackward>)



 15%|█▍        | 51/346 [04:34<25:48,  5.25s/it][A

loss: tensor(0.9703, device='cuda:0', grad_fn=<NllLossBackward>)



 15%|█▌        | 52/346 [04:39<25:42,  5.25s/it][A

loss: tensor(0.9284, device='cuda:0', grad_fn=<NllLossBackward>)



 15%|█▌        | 53/346 [04:44<25:36,  5.24s/it][A

loss: tensor(0.9867, device='cuda:0', grad_fn=<NllLossBackward>)



 16%|█▌        | 54/346 [04:49<25:36,  5.26s/it][A

loss: tensor(0.9582, device='cuda:0', grad_fn=<NllLossBackward>)



 16%|█▌        | 55/346 [04:55<25:28,  5.25s/it][A

loss: tensor(0.9373, device='cuda:0', grad_fn=<NllLossBackward>)



 16%|█▌        | 56/346 [05:00<25:21,  5.25s/it][A

loss: tensor(0.9956, device='cuda:0', grad_fn=<NllLossBackward>)



 16%|█▋        | 57/346 [05:05<25:21,  5.26s/it][A

loss: tensor(0.9993, device='cuda:0', grad_fn=<NllLossBackward>)



 17%|█▋        | 58/346 [05:10<25:12,  5.25s/it][A

loss: tensor(0.9427, device='cuda:0', grad_fn=<NllLossBackward>)



 17%|█▋        | 59/346 [05:16<25:07,  5.25s/it][A

loss: tensor(0.9434, device='cuda:0', grad_fn=<NllLossBackward>)



 17%|█▋        | 60/346 [05:21<25:03,  5.26s/it][A

loss: tensor(0.9457, device='cuda:0', grad_fn=<NllLossBackward>)



 18%|█▊        | 61/346 [05:26<25:03,  5.28s/it][A

loss: tensor(0.9505, device='cuda:0', grad_fn=<NllLossBackward>)



 18%|█▊        | 62/346 [05:32<24:59,  5.28s/it][A

loss: tensor(0.9603, device='cuda:0', grad_fn=<NllLossBackward>)



 18%|█▊        | 63/346 [05:37<24:53,  5.28s/it][A

loss: tensor(0.9118, device='cuda:0', grad_fn=<NllLossBackward>)



 18%|█▊        | 64/346 [05:42<24:48,  5.28s/it][A

loss: tensor(0.9663, device='cuda:0', grad_fn=<NllLossBackward>)



 19%|█▉        | 65/346 [05:47<24:48,  5.30s/it][A

loss: tensor(0.9562, device='cuda:0', grad_fn=<NllLossBackward>)



 19%|█▉        | 66/346 [05:53<24:43,  5.30s/it][A

loss: tensor(0.9361, device='cuda:0', grad_fn=<NllLossBackward>)



 19%|█▉        | 67/346 [05:58<24:37,  5.30s/it][A

loss: tensor(0.9518, device='cuda:0', grad_fn=<NllLossBackward>)



 20%|█▉        | 68/346 [06:03<24:33,  5.30s/it][A

loss: tensor(0.9265, device='cuda:0', grad_fn=<NllLossBackward>)



 20%|█▉        | 69/346 [06:09<24:36,  5.33s/it][A

loss: tensor(0.9106, device='cuda:0', grad_fn=<NllLossBackward>)



 20%|██        | 70/346 [06:14<24:30,  5.33s/it][A

loss: tensor(0.9758, device='cuda:0', grad_fn=<NllLossBackward>)



 21%|██        | 71/346 [06:19<24:29,  5.34s/it][A

loss: tensor(0.9685, device='cuda:0', grad_fn=<NllLossBackward>)



 21%|██        | 72/346 [06:25<24:28,  5.36s/it][A

loss: tensor(0.9348, device='cuda:0', grad_fn=<NllLossBackward>)



 21%|██        | 73/346 [06:30<24:31,  5.39s/it][A

loss: tensor(0.9311, device='cuda:0', grad_fn=<NllLossBackward>)



 21%|██▏       | 74/346 [06:36<24:26,  5.39s/it][A

loss: tensor(0.9269, device='cuda:0', grad_fn=<NllLossBackward>)



 22%|██▏       | 75/346 [06:41<24:09,  5.35s/it][A

loss: tensor(0.9448, device='cuda:0', grad_fn=<NllLossBackward>)



 22%|██▏       | 76/346 [06:46<24:03,  5.35s/it][A

loss: tensor(0.9524, device='cuda:0', grad_fn=<NllLossBackward>)



 22%|██▏       | 77/346 [06:52<23:52,  5.32s/it][A

loss: tensor(0.9242, device='cuda:0', grad_fn=<NllLossBackward>)



 23%|██▎       | 78/346 [06:57<23:40,  5.30s/it][A

loss: tensor(0.9734, device='cuda:0', grad_fn=<NllLossBackward>)



 23%|██▎       | 79/346 [07:02<23:33,  5.29s/it][A

loss: tensor(0.9758, device='cuda:0', grad_fn=<NllLossBackward>)



 23%|██▎       | 80/346 [07:07<23:31,  5.31s/it][A

loss: tensor(0.9718, device='cuda:0', grad_fn=<NllLossBackward>)



 23%|██▎       | 81/346 [07:13<23:21,  5.29s/it][A

loss: tensor(0.9917, device='cuda:0', grad_fn=<NllLossBackward>)



 24%|██▎       | 82/346 [07:18<23:13,  5.28s/it][A

loss: tensor(0.9485, device='cuda:0', grad_fn=<NllLossBackward>)



 24%|██▍       | 83/346 [07:23<23:08,  5.28s/it][A

loss: tensor(0.9216, device='cuda:0', grad_fn=<NllLossBackward>)



 24%|██▍       | 84/346 [07:28<23:06,  5.29s/it][A

loss: tensor(0.9491, device='cuda:0', grad_fn=<NllLossBackward>)



 25%|██▍       | 85/346 [07:34<22:58,  5.28s/it][A

loss: tensor(0.9600, device='cuda:0', grad_fn=<NllLossBackward>)



 25%|██▍       | 86/346 [07:39<22:52,  5.28s/it][A

loss: tensor(0.9305, device='cuda:0', grad_fn=<NllLossBackward>)



 25%|██▌       | 87/346 [07:44<22:43,  5.26s/it][A

loss: tensor(0.9341, device='cuda:0', grad_fn=<NllLossBackward>)



 25%|██▌       | 88/346 [07:50<22:41,  5.28s/it][A

loss: tensor(0.9436, device='cuda:0', grad_fn=<NllLossBackward>)



 26%|██▌       | 89/346 [07:55<22:33,  5.27s/it][A

loss: tensor(0.9641, device='cuda:0', grad_fn=<NllLossBackward>)



 26%|██▌       | 90/346 [08:00<22:25,  5.26s/it][A

loss: tensor(0.9287, device='cuda:0', grad_fn=<NllLossBackward>)



 26%|██▋       | 91/346 [08:05<22:20,  5.26s/it][A

loss: tensor(1.0208, device='cuda:0', grad_fn=<NllLossBackward>)



 27%|██▋       | 92/346 [08:11<22:21,  5.28s/it][A

loss: tensor(0.9097, device='cuda:0', grad_fn=<NllLossBackward>)



 27%|██▋       | 93/346 [08:16<22:12,  5.27s/it][A

loss: tensor(0.9363, device='cuda:0', grad_fn=<NllLossBackward>)



 27%|██▋       | 94/346 [08:21<22:04,  5.25s/it][A

loss: tensor(0.9316, device='cuda:0', grad_fn=<NllLossBackward>)



 27%|██▋       | 95/346 [08:26<21:59,  5.26s/it][A

loss: tensor(1.0050, device='cuda:0', grad_fn=<NllLossBackward>)



 28%|██▊       | 96/346 [08:32<22:02,  5.29s/it][A

loss: tensor(0.9268, device='cuda:0', grad_fn=<NllLossBackward>)



 28%|██▊       | 97/346 [08:37<21:54,  5.28s/it][A

loss: tensor(0.9507, device='cuda:0', grad_fn=<NllLossBackward>)



 28%|██▊       | 98/346 [08:42<21:48,  5.28s/it][A

loss: tensor(0.9898, device='cuda:0', grad_fn=<NllLossBackward>)



 29%|██▊       | 99/346 [08:47<21:40,  5.27s/it][A

loss: tensor(0.9333, device='cuda:0', grad_fn=<NllLossBackward>)



 29%|██▉       | 100/346 [08:53<21:38,  5.28s/it][A

loss: tensor(0.9087, device='cuda:0', grad_fn=<NllLossBackward>)



 29%|██▉       | 101/346 [08:58<21:31,  5.27s/it][A

loss: tensor(0.9172, device='cuda:0', grad_fn=<NllLossBackward>)



 29%|██▉       | 102/346 [09:03<21:24,  5.27s/it][A

loss: tensor(0.9726, device='cuda:0', grad_fn=<NllLossBackward>)



 30%|██▉       | 103/346 [09:09<21:22,  5.28s/it][A

loss: tensor(0.9187, device='cuda:0', grad_fn=<NllLossBackward>)



 30%|███       | 104/346 [09:14<21:14,  5.27s/it][A

loss: tensor(0.9170, device='cuda:0', grad_fn=<NllLossBackward>)



 30%|███       | 105/346 [09:19<21:10,  5.27s/it][A

loss: tensor(0.9721, device='cuda:0', grad_fn=<NllLossBackward>)



 31%|███       | 106/346 [09:24<21:04,  5.27s/it][A

loss: tensor(0.9583, device='cuda:0', grad_fn=<NllLossBackward>)



 31%|███       | 107/346 [09:30<21:03,  5.29s/it][A

loss: tensor(1.0446, device='cuda:0', grad_fn=<NllLossBackward>)



 31%|███       | 108/346 [09:35<20:55,  5.28s/it][A

loss: tensor(0.9465, device='cuda:0', grad_fn=<NllLossBackward>)



 32%|███▏      | 109/346 [09:40<20:47,  5.26s/it][A

loss: tensor(0.9244, device='cuda:0', grad_fn=<NllLossBackward>)



 32%|███▏      | 110/346 [09:45<20:42,  5.26s/it][A

loss: tensor(0.9631, device='cuda:0', grad_fn=<NllLossBackward>)



 32%|███▏      | 111/346 [09:51<20:42,  5.29s/it][A

loss: tensor(0.9598, device='cuda:0', grad_fn=<NllLossBackward>)



 32%|███▏      | 112/346 [09:56<20:34,  5.28s/it][A

loss: tensor(0.9208, device='cuda:0', grad_fn=<NllLossBackward>)



 33%|███▎      | 113/346 [10:01<20:27,  5.27s/it][A

loss: tensor(0.9253, device='cuda:0', grad_fn=<NllLossBackward>)



 33%|███▎      | 114/346 [10:07<20:20,  5.26s/it][A

loss: tensor(0.9082, device='cuda:0', grad_fn=<NllLossBackward>)



 33%|███▎      | 115/346 [10:12<20:20,  5.29s/it][A

loss: tensor(0.9082, device='cuda:0', grad_fn=<NllLossBackward>)



 34%|███▎      | 116/346 [10:17<20:12,  5.27s/it][A

loss: tensor(0.9084, device='cuda:0', grad_fn=<NllLossBackward>)



 34%|███▍      | 117/346 [10:22<20:05,  5.26s/it][A

loss: tensor(0.9862, device='cuda:0', grad_fn=<NllLossBackward>)



 34%|███▍      | 118/346 [10:28<19:59,  5.26s/it][A

loss: tensor(0.9536, device='cuda:0', grad_fn=<NllLossBackward>)



 34%|███▍      | 119/346 [10:33<19:57,  5.27s/it][A

loss: tensor(0.9802, device='cuda:0', grad_fn=<NllLossBackward>)



 35%|███▍      | 120/346 [10:38<19:50,  5.27s/it][A

loss: tensor(0.9081, device='cuda:0', grad_fn=<NllLossBackward>)



 35%|███▍      | 121/346 [10:43<19:42,  5.25s/it][A

loss: tensor(0.9078, device='cuda:0', grad_fn=<NllLossBackward>)



 35%|███▌      | 122/346 [10:49<19:37,  5.26s/it][A

loss: tensor(0.9167, device='cuda:0', grad_fn=<NllLossBackward>)



 36%|███▌      | 123/346 [10:54<19:37,  5.28s/it][A

loss: tensor(0.9482, device='cuda:0', grad_fn=<NllLossBackward>)



 36%|███▌      | 124/346 [10:59<19:30,  5.27s/it][A

loss: tensor(0.9194, device='cuda:0', grad_fn=<NllLossBackward>)



 36%|███▌      | 125/346 [11:05<19:25,  5.27s/it][A

loss: tensor(0.9269, device='cuda:0', grad_fn=<NllLossBackward>)



 36%|███▋      | 126/346 [11:10<19:24,  5.29s/it][A

loss: tensor(0.9584, device='cuda:0', grad_fn=<NllLossBackward>)



 37%|███▋      | 127/346 [11:15<19:14,  5.27s/it][A

loss: tensor(0.9652, device='cuda:0', grad_fn=<NllLossBackward>)



 37%|███▋      | 128/346 [11:20<19:09,  5.27s/it][A

loss: tensor(0.9075, device='cuda:0', grad_fn=<NllLossBackward>)



 37%|███▋      | 129/346 [11:26<19:03,  5.27s/it][A

loss: tensor(0.9297, device='cuda:0', grad_fn=<NllLossBackward>)



 38%|███▊      | 130/346 [11:31<19:00,  5.28s/it][A

loss: tensor(0.9739, device='cuda:0', grad_fn=<NllLossBackward>)



 38%|███▊      | 131/346 [11:36<18:53,  5.27s/it][A

loss: tensor(0.9266, device='cuda:0', grad_fn=<NllLossBackward>)



 38%|███▊      | 132/346 [11:41<18:47,  5.27s/it][A

loss: tensor(0.9647, device='cuda:0', grad_fn=<NllLossBackward>)



 38%|███▊      | 133/346 [11:47<18:39,  5.26s/it][A

loss: tensor(0.9389, device='cuda:0', grad_fn=<NllLossBackward>)



 39%|███▊      | 134/346 [11:52<18:40,  5.29s/it][A

loss: tensor(0.9266, device='cuda:0', grad_fn=<NllLossBackward>)



 39%|███▉      | 135/346 [11:57<18:34,  5.28s/it][A

loss: tensor(0.9075, device='cuda:0', grad_fn=<NllLossBackward>)



 39%|███▉      | 136/346 [12:03<18:26,  5.27s/it][A

loss: tensor(0.9631, device='cuda:0', grad_fn=<NllLossBackward>)



 40%|███▉      | 137/346 [12:08<18:24,  5.28s/it][A

loss: tensor(0.9214, device='cuda:0', grad_fn=<NllLossBackward>)



 40%|███▉      | 138/346 [12:13<18:33,  5.35s/it][A

loss: tensor(1.0029, device='cuda:0', grad_fn=<NllLossBackward>)



 40%|████      | 139/346 [12:19<18:32,  5.37s/it][A

loss: tensor(0.9207, device='cuda:0', grad_fn=<NllLossBackward>)



 40%|████      | 140/346 [12:24<18:34,  5.41s/it][A

loss: tensor(0.9554, device='cuda:0', grad_fn=<NllLossBackward>)



 41%|████      | 141/346 [12:30<18:36,  5.45s/it][A

loss: tensor(0.9404, device='cuda:0', grad_fn=<NllLossBackward>)



 41%|████      | 142/346 [12:36<18:52,  5.55s/it][A

loss: tensor(0.9370, device='cuda:0', grad_fn=<NllLossBackward>)



 41%|████▏     | 143/346 [12:41<18:33,  5.49s/it][A

loss: tensor(0.9227, device='cuda:0', grad_fn=<NllLossBackward>)



 42%|████▏     | 144/346 [12:46<18:16,  5.43s/it][A

loss: tensor(0.9273, device='cuda:0', grad_fn=<NllLossBackward>)



 42%|████▏     | 145/346 [12:52<18:07,  5.41s/it][A

loss: tensor(0.9504, device='cuda:0', grad_fn=<NllLossBackward>)



 42%|████▏     | 146/346 [12:57<17:53,  5.37s/it][A

loss: tensor(0.9178, device='cuda:0', grad_fn=<NllLossBackward>)



 42%|████▏     | 147/346 [13:02<17:42,  5.34s/it][A

loss: tensor(0.9073, device='cuda:0', grad_fn=<NllLossBackward>)



 43%|████▎     | 148/346 [13:07<17:33,  5.32s/it][A

loss: tensor(0.9428, device='cuda:0', grad_fn=<NllLossBackward>)



 43%|████▎     | 149/346 [13:13<17:28,  5.32s/it][A

loss: tensor(0.9736, device='cuda:0', grad_fn=<NllLossBackward>)



 43%|████▎     | 150/346 [13:18<17:19,  5.30s/it][A

loss: tensor(0.9572, device='cuda:0', grad_fn=<NllLossBackward>)



 44%|████▎     | 151/346 [13:23<17:12,  5.30s/it][A

loss: tensor(0.9536, device='cuda:0', grad_fn=<NllLossBackward>)



 44%|████▍     | 152/346 [13:29<17:05,  5.29s/it][A

loss: tensor(0.9477, device='cuda:0', grad_fn=<NllLossBackward>)



 44%|████▍     | 153/346 [13:34<17:02,  5.30s/it][A

loss: tensor(0.9643, device='cuda:0', grad_fn=<NllLossBackward>)



 45%|████▍     | 154/346 [13:39<16:55,  5.29s/it][A

loss: tensor(0.9960, device='cuda:0', grad_fn=<NllLossBackward>)



 45%|████▍     | 155/346 [13:44<16:50,  5.29s/it][A

loss: tensor(0.9493, device='cuda:0', grad_fn=<NllLossBackward>)



 45%|████▌     | 156/346 [13:50<16:44,  5.29s/it][A

loss: tensor(0.9422, device='cuda:0', grad_fn=<NllLossBackward>)



 45%|████▌     | 157/346 [13:55<16:43,  5.31s/it][A

loss: tensor(0.9318, device='cuda:0', grad_fn=<NllLossBackward>)



 46%|████▌     | 158/346 [14:00<16:35,  5.30s/it][A

loss: tensor(0.9483, device='cuda:0', grad_fn=<NllLossBackward>)



 46%|████▌     | 159/346 [14:06<16:27,  5.28s/it][A

loss: tensor(0.9183, device='cuda:0', grad_fn=<NllLossBackward>)



 46%|████▌     | 160/346 [14:11<16:21,  5.28s/it][A

loss: tensor(0.9537, device='cuda:0', grad_fn=<NllLossBackward>)



 47%|████▋     | 161/346 [14:16<16:19,  5.30s/it][A

loss: tensor(0.9477, device='cuda:0', grad_fn=<NllLossBackward>)



 47%|████▋     | 162/346 [14:21<16:11,  5.28s/it][A

loss: tensor(0.9513, device='cuda:0', grad_fn=<NllLossBackward>)



 47%|████▋     | 163/346 [14:27<16:03,  5.27s/it][A

loss: tensor(0.9407, device='cuda:0', grad_fn=<NllLossBackward>)



 47%|████▋     | 164/346 [14:32<15:57,  5.26s/it][A

loss: tensor(0.9454, device='cuda:0', grad_fn=<NllLossBackward>)



 48%|████▊     | 165/346 [14:37<15:55,  5.28s/it][A

loss: tensor(0.9760, device='cuda:0', grad_fn=<NllLossBackward>)



 48%|████▊     | 166/346 [14:43<15:48,  5.27s/it][A

loss: tensor(0.9468, device='cuda:0', grad_fn=<NllLossBackward>)



 48%|████▊     | 167/346 [14:48<15:41,  5.26s/it][A

loss: tensor(1.0103, device='cuda:0', grad_fn=<NllLossBackward>)



 49%|████▊     | 168/346 [14:53<15:35,  5.26s/it][A

loss: tensor(0.9610, device='cuda:0', grad_fn=<NllLossBackward>)



 49%|████▉     | 169/346 [14:58<15:32,  5.27s/it][A

loss: tensor(0.9659, device='cuda:0', grad_fn=<NllLossBackward>)



 49%|████▉     | 170/346 [15:04<15:26,  5.26s/it][A

loss: tensor(1.0347, device='cuda:0', grad_fn=<NllLossBackward>)



 49%|████▉     | 171/346 [15:09<15:20,  5.26s/it][A

loss: tensor(0.9067, device='cuda:0', grad_fn=<NllLossBackward>)



 50%|████▉     | 172/346 [15:14<15:17,  5.28s/it][A

loss: tensor(0.9304, device='cuda:0', grad_fn=<NllLossBackward>)



 50%|█████     | 173/346 [15:19<15:10,  5.27s/it][A

loss: tensor(0.9858, device='cuda:0', grad_fn=<NllLossBackward>)



 50%|█████     | 174/346 [15:25<15:04,  5.26s/it][A

loss: tensor(0.9194, device='cuda:0', grad_fn=<NllLossBackward>)



 51%|█████     | 175/346 [15:30<14:59,  5.26s/it][A

loss: tensor(0.9342, device='cuda:0', grad_fn=<NllLossBackward>)



 51%|█████     | 176/346 [15:35<14:57,  5.28s/it][A

loss: tensor(0.9544, device='cuda:0', grad_fn=<NllLossBackward>)



 51%|█████     | 177/346 [15:41<14:58,  5.32s/it][A

loss: tensor(0.9356, device='cuda:0', grad_fn=<NllLossBackward>)



 51%|█████▏    | 178/346 [15:46<14:52,  5.31s/it][A

loss: tensor(0.9832, device='cuda:0', grad_fn=<NllLossBackward>)



 52%|█████▏    | 179/346 [15:51<14:43,  5.29s/it][A

loss: tensor(0.9281, device='cuda:0', grad_fn=<NllLossBackward>)



 52%|█████▏    | 180/346 [15:57<14:41,  5.31s/it][A

loss: tensor(0.9314, device='cuda:0', grad_fn=<NllLossBackward>)



 52%|█████▏    | 181/346 [16:02<14:34,  5.30s/it][A

loss: tensor(1.0005, device='cuda:0', grad_fn=<NllLossBackward>)



 53%|█████▎    | 182/346 [16:07<14:28,  5.30s/it][A

loss: tensor(1.0234, device='cuda:0', grad_fn=<NllLossBackward>)



 53%|█████▎    | 183/346 [16:12<14:22,  5.29s/it][A

loss: tensor(0.9747, device='cuda:0', grad_fn=<NllLossBackward>)



 53%|█████▎    | 184/346 [16:18<14:19,  5.31s/it][A

loss: tensor(0.9452, device='cuda:0', grad_fn=<NllLossBackward>)



 53%|█████▎    | 185/346 [16:23<14:13,  5.30s/it][A

loss: tensor(0.9066, device='cuda:0', grad_fn=<NllLossBackward>)



 54%|█████▍    | 186/346 [16:28<14:07,  5.30s/it][A

loss: tensor(0.9472, device='cuda:0', grad_fn=<NllLossBackward>)



 54%|█████▍    | 187/346 [16:34<14:02,  5.30s/it][A

loss: tensor(0.9065, device='cuda:0', grad_fn=<NllLossBackward>)



 54%|█████▍    | 188/346 [16:39<14:00,  5.32s/it][A

loss: tensor(0.9194, device='cuda:0', grad_fn=<NllLossBackward>)



 55%|█████▍    | 189/346 [16:44<13:54,  5.31s/it][A

loss: tensor(0.9629, device='cuda:0', grad_fn=<NllLossBackward>)



 55%|█████▍    | 190/346 [16:50<13:47,  5.31s/it][A

loss: tensor(0.9182, device='cuda:0', grad_fn=<NllLossBackward>)



 55%|█████▌    | 191/346 [16:55<13:44,  5.32s/it][A

loss: tensor(0.9906, device='cuda:0', grad_fn=<NllLossBackward>)



 55%|█████▌    | 192/346 [17:00<13:37,  5.31s/it][A

loss: tensor(1.0057, device='cuda:0', grad_fn=<NllLossBackward>)



 56%|█████▌    | 193/346 [17:05<13:31,  5.31s/it][A

loss: tensor(0.9064, device='cuda:0', grad_fn=<NllLossBackward>)



 56%|█████▌    | 194/346 [17:11<13:27,  5.31s/it][A

loss: tensor(0.9397, device='cuda:0', grad_fn=<NllLossBackward>)



 56%|█████▋    | 195/346 [17:16<13:24,  5.33s/it][A

loss: tensor(0.9664, device='cuda:0', grad_fn=<NllLossBackward>)



 57%|█████▋    | 196/346 [17:21<13:17,  5.31s/it][A

loss: tensor(0.9289, device='cuda:0', grad_fn=<NllLossBackward>)



 57%|█████▋    | 197/346 [17:27<13:12,  5.32s/it][A

loss: tensor(0.9388, device='cuda:0', grad_fn=<NllLossBackward>)



 57%|█████▋    | 198/346 [17:32<13:07,  5.32s/it][A

loss: tensor(0.9493, device='cuda:0', grad_fn=<NllLossBackward>)



 58%|█████▊    | 199/346 [17:37<13:05,  5.34s/it][A

loss: tensor(0.9801, device='cuda:0', grad_fn=<NllLossBackward>)



 58%|█████▊    | 200/346 [17:43<13:08,  5.40s/it][A

loss: tensor(0.9372, device='cuda:0', grad_fn=<NllLossBackward>)



 58%|█████▊    | 201/346 [17:48<13:03,  5.40s/it][A

loss: tensor(0.9631, device='cuda:0', grad_fn=<NllLossBackward>)



 58%|█████▊    | 202/346 [17:54<12:58,  5.40s/it][A

loss: tensor(0.9346, device='cuda:0', grad_fn=<NllLossBackward>)



 59%|█████▊    | 203/346 [17:59<12:54,  5.42s/it][A

loss: tensor(0.9657, device='cuda:0', grad_fn=<NllLossBackward>)



 59%|█████▉    | 204/346 [18:04<12:28,  5.27s/it][A

loss: tensor(0.9790, device='cuda:0', grad_fn=<NllLossBackward>)



 59%|█████▉    | 205/346 [18:10<12:29,  5.32s/it][A

loss: tensor(0.9305, device='cuda:0', grad_fn=<NllLossBackward>)



 60%|█████▉    | 206/346 [18:15<12:29,  5.36s/it][A

loss: tensor(0.9336, device='cuda:0', grad_fn=<NllLossBackward>)



 60%|█████▉    | 207/346 [18:21<12:29,  5.39s/it][A

loss: tensor(0.9674, device='cuda:0', grad_fn=<NllLossBackward>)



 60%|██████    | 208/346 [18:26<12:23,  5.39s/it][A

loss: tensor(0.9542, device='cuda:0', grad_fn=<NllLossBackward>)



 60%|██████    | 209/346 [18:31<12:18,  5.39s/it][A

loss: tensor(0.9524, device='cuda:0', grad_fn=<NllLossBackward>)



 61%|██████    | 210/346 [18:37<12:14,  5.40s/it][A

loss: tensor(0.9063, device='cuda:0', grad_fn=<NllLossBackward>)



 61%|██████    | 211/346 [18:42<12:13,  5.43s/it][A

loss: tensor(0.9348, device='cuda:0', grad_fn=<NllLossBackward>)



 61%|██████▏   | 212/346 [18:48<12:06,  5.42s/it][A

loss: tensor(0.9420, device='cuda:0', grad_fn=<NllLossBackward>)



 62%|██████▏   | 213/346 [18:53<11:59,  5.41s/it][A

loss: tensor(0.9622, device='cuda:0', grad_fn=<NllLossBackward>)



 62%|██████▏   | 214/346 [18:59<11:56,  5.43s/it][A

loss: tensor(0.9444, device='cuda:0', grad_fn=<NllLossBackward>)



 62%|██████▏   | 215/346 [19:04<11:52,  5.44s/it][A

loss: tensor(0.9793, device='cuda:0', grad_fn=<NllLossBackward>)



 62%|██████▏   | 216/346 [19:09<11:45,  5.43s/it][A

loss: tensor(0.9673, device='cuda:0', grad_fn=<NllLossBackward>)



 63%|██████▎   | 217/346 [19:15<11:40,  5.43s/it][A

loss: tensor(0.9914, device='cuda:0', grad_fn=<NllLossBackward>)



 63%|██████▎   | 218/346 [19:20<11:36,  5.44s/it][A

loss: tensor(0.9409, device='cuda:0', grad_fn=<NllLossBackward>)



 63%|██████▎   | 219/346 [19:26<11:28,  5.43s/it][A

loss: tensor(0.9365, device='cuda:0', grad_fn=<NllLossBackward>)



 64%|██████▎   | 220/346 [19:31<11:24,  5.43s/it][A

loss: tensor(0.9474, device='cuda:0', grad_fn=<NllLossBackward>)



 64%|██████▍   | 221/346 [19:36<11:17,  5.42s/it][A

loss: tensor(0.9540, device='cuda:0', grad_fn=<NllLossBackward>)



 64%|██████▍   | 222/346 [19:42<11:12,  5.43s/it][A

loss: tensor(0.9535, device='cuda:0', grad_fn=<NllLossBackward>)



 64%|██████▍   | 223/346 [19:47<11:06,  5.42s/it][A

loss: tensor(0.9062, device='cuda:0', grad_fn=<NllLossBackward>)



 65%|██████▍   | 224/346 [19:53<11:00,  5.42s/it][A

loss: tensor(0.9522, device='cuda:0', grad_fn=<NllLossBackward>)



 65%|██████▌   | 225/346 [19:58<10:54,  5.41s/it][A

loss: tensor(0.9484, device='cuda:0', grad_fn=<NllLossBackward>)



 65%|██████▌   | 226/346 [20:04<10:51,  5.43s/it][A

loss: tensor(0.9396, device='cuda:0', grad_fn=<NllLossBackward>)



 66%|██████▌   | 227/346 [20:09<10:45,  5.42s/it][A

loss: tensor(0.9631, device='cuda:0', grad_fn=<NllLossBackward>)



 66%|██████▌   | 228/346 [20:14<10:39,  5.42s/it][A

loss: tensor(0.9513, device='cuda:0', grad_fn=<NllLossBackward>)



 66%|██████▌   | 229/346 [20:20<10:33,  5.41s/it][A

loss: tensor(0.9726, device='cuda:0', grad_fn=<NllLossBackward>)



 66%|██████▋   | 230/346 [20:25<10:30,  5.44s/it][A

loss: tensor(0.9360, device='cuda:0', grad_fn=<NllLossBackward>)



 67%|██████▋   | 231/346 [20:31<10:23,  5.42s/it][A

loss: tensor(0.9062, device='cuda:0', grad_fn=<NllLossBackward>)



 67%|██████▋   | 232/346 [20:36<10:17,  5.41s/it][A

loss: tensor(0.9061, device='cuda:0', grad_fn=<NllLossBackward>)



 67%|██████▋   | 233/346 [20:42<10:10,  5.41s/it][A

loss: tensor(0.9062, device='cuda:0', grad_fn=<NllLossBackward>)



 68%|██████▊   | 234/346 [20:47<10:08,  5.43s/it][A

loss: tensor(0.9271, device='cuda:0', grad_fn=<NllLossBackward>)



 68%|██████▊   | 235/346 [20:52<10:02,  5.43s/it][A

loss: tensor(0.9915, device='cuda:0', grad_fn=<NllLossBackward>)



 68%|██████▊   | 236/346 [20:58<09:56,  5.42s/it][A

loss: tensor(0.9235, device='cuda:0', grad_fn=<NllLossBackward>)



 68%|██████▊   | 237/346 [21:03<09:50,  5.42s/it][A

loss: tensor(0.9384, device='cuda:0', grad_fn=<NllLossBackward>)



 69%|██████▉   | 238/346 [21:08<09:32,  5.30s/it][A

loss: tensor(0.9261, device='cuda:0', grad_fn=<NllLossBackward>)



 69%|██████▉   | 239/346 [21:14<09:30,  5.33s/it][A

loss: tensor(0.9580, device='cuda:0', grad_fn=<NllLossBackward>)



 69%|██████▉   | 240/346 [21:19<09:28,  5.36s/it][A

loss: tensor(0.9285, device='cuda:0', grad_fn=<NllLossBackward>)



 70%|██████▉   | 241/346 [21:24<09:20,  5.34s/it][A

loss: tensor(0.9546, device='cuda:0', grad_fn=<NllLossBackward>)



 70%|██████▉   | 242/346 [21:30<09:11,  5.31s/it][A

loss: tensor(0.9218, device='cuda:0', grad_fn=<NllLossBackward>)



 70%|███████   | 243/346 [21:35<09:05,  5.29s/it][A

loss: tensor(0.9060, device='cuda:0', grad_fn=<NllLossBackward>)



 71%|███████   | 244/346 [21:40<08:58,  5.28s/it][A

loss: tensor(0.9326, device='cuda:0', grad_fn=<NllLossBackward>)



 71%|███████   | 245/346 [21:45<08:54,  5.29s/it][A

loss: tensor(0.9211, device='cuda:0', grad_fn=<NllLossBackward>)



 71%|███████   | 246/346 [21:51<08:47,  5.28s/it][A

loss: tensor(0.9738, device='cuda:0', grad_fn=<NllLossBackward>)



 71%|███████▏  | 247/346 [21:56<08:41,  5.27s/it][A

loss: tensor(0.9265, device='cuda:0', grad_fn=<NllLossBackward>)



 72%|███████▏  | 248/346 [22:01<08:35,  5.26s/it][A

loss: tensor(0.9490, device='cuda:0', grad_fn=<NllLossBackward>)



 72%|███████▏  | 249/346 [22:07<08:32,  5.28s/it][A

loss: tensor(0.9061, device='cuda:0', grad_fn=<NllLossBackward>)



 72%|███████▏  | 250/346 [22:12<08:25,  5.26s/it][A

loss: tensor(0.9252, device='cuda:0', grad_fn=<NllLossBackward>)



 73%|███████▎  | 251/346 [22:17<08:19,  5.25s/it][A

loss: tensor(0.9143, device='cuda:0', grad_fn=<NllLossBackward>)



 73%|███████▎  | 252/346 [22:22<08:14,  5.26s/it][A

loss: tensor(1.0062, device='cuda:0', grad_fn=<NllLossBackward>)



 73%|███████▎  | 253/346 [22:28<08:09,  5.27s/it][A

loss: tensor(0.9126, device='cuda:0', grad_fn=<NllLossBackward>)



 73%|███████▎  | 254/346 [22:33<08:04,  5.26s/it][A

loss: tensor(0.9341, device='cuda:0', grad_fn=<NllLossBackward>)



 74%|███████▎  | 255/346 [22:38<07:58,  5.26s/it][A

loss: tensor(0.9678, device='cuda:0', grad_fn=<NllLossBackward>)



 74%|███████▍  | 256/346 [22:43<07:53,  5.26s/it][A

loss: tensor(0.9356, device='cuda:0', grad_fn=<NllLossBackward>)



 74%|███████▍  | 257/346 [22:49<07:49,  5.27s/it][A

loss: tensor(0.9289, device='cuda:0', grad_fn=<NllLossBackward>)



 75%|███████▍  | 258/346 [22:54<07:43,  5.27s/it][A

loss: tensor(0.9317, device='cuda:0', grad_fn=<NllLossBackward>)



 75%|███████▍  | 259/346 [22:59<07:38,  5.27s/it][A

loss: tensor(0.9220, device='cuda:0', grad_fn=<NllLossBackward>)



 75%|███████▌  | 260/346 [23:04<07:35,  5.29s/it][A

loss: tensor(0.9797, device='cuda:0', grad_fn=<NllLossBackward>)



 75%|███████▌  | 261/346 [23:10<07:28,  5.28s/it][A

loss: tensor(0.9524, device='cuda:0', grad_fn=<NllLossBackward>)



 76%|███████▌  | 262/346 [23:15<07:22,  5.27s/it][A

loss: tensor(0.9649, device='cuda:0', grad_fn=<NllLossBackward>)



 76%|███████▌  | 263/346 [23:20<07:17,  5.27s/it][A

loss: tensor(0.9669, device='cuda:0', grad_fn=<NllLossBackward>)



 76%|███████▋  | 264/346 [23:26<07:13,  5.28s/it][A

loss: tensor(0.9191, device='cuda:0', grad_fn=<NllLossBackward>)



 77%|███████▋  | 265/346 [23:31<07:06,  5.27s/it][A

loss: tensor(0.9058, device='cuda:0', grad_fn=<NllLossBackward>)



 77%|███████▋  | 266/346 [23:36<07:01,  5.27s/it][A

loss: tensor(0.9278, device='cuda:0', grad_fn=<NllLossBackward>)



 77%|███████▋  | 267/346 [23:41<06:56,  5.27s/it][A

loss: tensor(0.9057, device='cuda:0', grad_fn=<NllLossBackward>)



 77%|███████▋  | 268/346 [23:47<06:51,  5.28s/it][A

loss: tensor(0.9566, device='cuda:0', grad_fn=<NllLossBackward>)



 78%|███████▊  | 269/346 [23:52<06:46,  5.28s/it][A

loss: tensor(0.9211, device='cuda:0', grad_fn=<NllLossBackward>)



 78%|███████▊  | 270/346 [23:57<06:40,  5.27s/it][A

loss: tensor(0.9178, device='cuda:0', grad_fn=<NllLossBackward>)



 78%|███████▊  | 271/346 [24:02<06:34,  5.26s/it][A

loss: tensor(0.9216, device='cuda:0', grad_fn=<NllLossBackward>)



 79%|███████▊  | 272/346 [24:08<06:31,  5.29s/it][A

loss: tensor(0.9261, device='cuda:0', grad_fn=<NllLossBackward>)



 79%|███████▉  | 273/346 [24:13<06:26,  5.29s/it][A

loss: tensor(0.9232, device='cuda:0', grad_fn=<NllLossBackward>)



 79%|███████▉  | 274/346 [24:18<06:19,  5.28s/it][A

loss: tensor(0.9349, device='cuda:0', grad_fn=<NllLossBackward>)



 79%|███████▉  | 275/346 [24:24<06:14,  5.27s/it][A

loss: tensor(1.0098, device='cuda:0', grad_fn=<NllLossBackward>)



 80%|███████▉  | 276/346 [24:29<06:10,  5.29s/it][A

loss: tensor(0.9057, device='cuda:0', grad_fn=<NllLossBackward>)



 80%|████████  | 277/346 [24:34<06:03,  5.28s/it][A

loss: tensor(0.9461, device='cuda:0', grad_fn=<NllLossBackward>)



 80%|████████  | 278/346 [24:39<05:58,  5.27s/it][A

loss: tensor(0.9664, device='cuda:0', grad_fn=<NllLossBackward>)



 81%|████████  | 279/346 [24:45<05:52,  5.26s/it][A

loss: tensor(0.9403, device='cuda:0', grad_fn=<NllLossBackward>)



 81%|████████  | 280/346 [24:50<05:48,  5.28s/it][A

loss: tensor(0.9058, device='cuda:0', grad_fn=<NllLossBackward>)



 81%|████████  | 281/346 [24:55<05:43,  5.28s/it][A

loss: tensor(0.9309, device='cuda:0', grad_fn=<NllLossBackward>)



 82%|████████▏ | 282/346 [25:01<05:37,  5.28s/it][A

loss: tensor(0.9057, device='cuda:0', grad_fn=<NllLossBackward>)



 82%|████████▏ | 283/346 [25:06<05:32,  5.27s/it][A

loss: tensor(0.9058, device='cuda:0', grad_fn=<NllLossBackward>)



 82%|████████▏ | 284/346 [25:11<05:28,  5.29s/it][A

loss: tensor(0.9455, device='cuda:0', grad_fn=<NllLossBackward>)



 82%|████████▏ | 285/346 [25:16<05:22,  5.28s/it][A

loss: tensor(0.9252, device='cuda:0', grad_fn=<NllLossBackward>)



 83%|████████▎ | 286/346 [25:22<05:16,  5.27s/it][A

loss: tensor(0.9058, device='cuda:0', grad_fn=<NllLossBackward>)



 83%|████████▎ | 287/346 [25:27<05:11,  5.28s/it][A

loss: tensor(0.9818, device='cuda:0', grad_fn=<NllLossBackward>)



 83%|████████▎ | 288/346 [25:32<05:05,  5.27s/it][A

loss: tensor(0.9252, device='cuda:0', grad_fn=<NllLossBackward>)



 84%|████████▎ | 289/346 [25:37<04:59,  5.26s/it][A

loss: tensor(0.9701, device='cuda:0', grad_fn=<NllLossBackward>)



 84%|████████▍ | 290/346 [25:43<04:54,  5.25s/it][A

loss: tensor(0.9056, device='cuda:0', grad_fn=<NllLossBackward>)



 84%|████████▍ | 291/346 [25:48<04:49,  5.26s/it][A

loss: tensor(0.9139, device='cuda:0', grad_fn=<NllLossBackward>)



 84%|████████▍ | 292/346 [25:53<04:43,  5.25s/it][A

loss: tensor(0.9057, device='cuda:0', grad_fn=<NllLossBackward>)



 85%|████████▍ | 293/346 [25:58<04:37,  5.24s/it][A

loss: tensor(0.9676, device='cuda:0', grad_fn=<NllLossBackward>)



 85%|████████▍ | 294/346 [26:04<04:31,  5.23s/it][A

loss: tensor(0.9154, device='cuda:0', grad_fn=<NllLossBackward>)



 85%|████████▌ | 295/346 [26:09<04:27,  5.25s/it][A

loss: tensor(0.9705, device='cuda:0', grad_fn=<NllLossBackward>)



 86%|████████▌ | 296/346 [26:14<04:21,  5.24s/it][A

loss: tensor(0.9058, device='cuda:0', grad_fn=<NllLossBackward>)



 86%|████████▌ | 297/346 [26:19<04:16,  5.23s/it][A

loss: tensor(1.0191, device='cuda:0', grad_fn=<NllLossBackward>)



 86%|████████▌ | 298/346 [26:25<04:11,  5.23s/it][A

loss: tensor(0.9056, device='cuda:0', grad_fn=<NllLossBackward>)



 86%|████████▋ | 299/346 [26:30<04:06,  5.25s/it][A

loss: tensor(0.9544, device='cuda:0', grad_fn=<NllLossBackward>)



 87%|████████▋ | 300/346 [26:35<04:01,  5.24s/it][A

loss: tensor(0.9229, device='cuda:0', grad_fn=<NllLossBackward>)



 87%|████████▋ | 301/346 [26:40<03:55,  5.23s/it][A

loss: tensor(0.9543, device='cuda:0', grad_fn=<NllLossBackward>)



 87%|████████▋ | 302/346 [26:45<03:49,  5.23s/it][A

loss: tensor(0.9477, device='cuda:0', grad_fn=<NllLossBackward>)



 88%|████████▊ | 303/346 [26:51<03:45,  5.24s/it][A

loss: tensor(0.9844, device='cuda:0', grad_fn=<NllLossBackward>)



 88%|████████▊ | 304/346 [26:56<03:39,  5.23s/it][A

loss: tensor(0.9491, device='cuda:0', grad_fn=<NllLossBackward>)



 88%|████████▊ | 305/346 [27:01<03:34,  5.23s/it][A

loss: tensor(0.9358, device='cuda:0', grad_fn=<NllLossBackward>)



 88%|████████▊ | 306/346 [27:06<03:28,  5.22s/it][A

loss: tensor(1.0247, device='cuda:0', grad_fn=<NllLossBackward>)



 89%|████████▊ | 307/346 [27:12<03:24,  5.24s/it][A

loss: tensor(0.9644, device='cuda:0', grad_fn=<NllLossBackward>)



 89%|████████▉ | 308/346 [27:17<03:18,  5.23s/it][A

loss: tensor(0.9306, device='cuda:0', grad_fn=<NllLossBackward>)



 89%|████████▉ | 309/346 [27:22<03:13,  5.23s/it][A

loss: tensor(0.9202, device='cuda:0', grad_fn=<NllLossBackward>)



 90%|████████▉ | 310/346 [27:27<03:08,  5.25s/it][A

loss: tensor(0.9705, device='cuda:0', grad_fn=<NllLossBackward>)



 90%|████████▉ | 311/346 [27:33<03:03,  5.23s/it][A

loss: tensor(0.9243, device='cuda:0', grad_fn=<NllLossBackward>)



 90%|█████████ | 312/346 [27:38<02:57,  5.22s/it][A

loss: tensor(0.9174, device='cuda:0', grad_fn=<NllLossBackward>)



 90%|█████████ | 313/346 [27:43<02:52,  5.22s/it][A

loss: tensor(0.9270, device='cuda:0', grad_fn=<NllLossBackward>)



 91%|█████████ | 314/346 [27:48<02:47,  5.23s/it][A

loss: tensor(0.9196, device='cuda:0', grad_fn=<NllLossBackward>)



 91%|█████████ | 315/346 [27:53<02:42,  5.23s/it][A

loss: tensor(0.9420, device='cuda:0', grad_fn=<NllLossBackward>)



 91%|█████████▏| 316/346 [27:59<02:36,  5.22s/it][A

loss: tensor(0.9056, device='cuda:0', grad_fn=<NllLossBackward>)



 92%|█████████▏| 317/346 [28:04<02:31,  5.22s/it][A

loss: tensor(0.9496, device='cuda:0', grad_fn=<NllLossBackward>)



 92%|█████████▏| 318/346 [28:09<02:26,  5.24s/it][A

loss: tensor(0.9197, device='cuda:0', grad_fn=<NllLossBackward>)



 92%|█████████▏| 319/346 [28:14<02:21,  5.23s/it][A

loss: tensor(0.9523, device='cuda:0', grad_fn=<NllLossBackward>)



 92%|█████████▏| 320/346 [28:20<02:15,  5.21s/it][A

loss: tensor(0.9056, device='cuda:0', grad_fn=<NllLossBackward>)



 93%|█████████▎| 321/346 [28:25<02:10,  5.21s/it][A

loss: tensor(0.9352, device='cuda:0', grad_fn=<NllLossBackward>)



 93%|█████████▎| 322/346 [28:30<02:05,  5.23s/it][A

loss: tensor(0.9412, device='cuda:0', grad_fn=<NllLossBackward>)



 93%|█████████▎| 323/346 [28:35<02:00,  5.22s/it][A

loss: tensor(0.9607, device='cuda:0', grad_fn=<NllLossBackward>)



 94%|█████████▎| 324/346 [28:40<01:54,  5.22s/it][A

loss: tensor(0.9854, device='cuda:0', grad_fn=<NllLossBackward>)



 94%|█████████▍| 325/346 [28:46<01:49,  5.22s/it][A

loss: tensor(0.9266, device='cuda:0', grad_fn=<NllLossBackward>)



 94%|█████████▍| 326/346 [28:51<01:45,  5.27s/it][A

loss: tensor(0.9233, device='cuda:0', grad_fn=<NllLossBackward>)



 95%|█████████▍| 327/346 [28:56<01:40,  5.30s/it][A

loss: tensor(0.9057, device='cuda:0', grad_fn=<NllLossBackward>)



 95%|█████████▍| 328/346 [29:02<01:35,  5.32s/it][A

loss: tensor(0.9156, device='cuda:0', grad_fn=<NllLossBackward>)



 95%|█████████▌| 329/346 [29:07<01:30,  5.34s/it][A

loss: tensor(0.9164, device='cuda:0', grad_fn=<NllLossBackward>)



 95%|█████████▌| 330/346 [29:12<01:25,  5.32s/it][A

loss: tensor(0.9382, device='cuda:0', grad_fn=<NllLossBackward>)



 96%|█████████▌| 331/346 [29:18<01:19,  5.30s/it][A

loss: tensor(0.9931, device='cuda:0', grad_fn=<NllLossBackward>)



 96%|█████████▌| 332/346 [29:23<01:14,  5.29s/it][A

loss: tensor(0.9056, device='cuda:0', grad_fn=<NllLossBackward>)



 96%|█████████▌| 333/346 [29:28<01:08,  5.29s/it][A

loss: tensor(0.9225, device='cuda:0', grad_fn=<NllLossBackward>)



 97%|█████████▋| 334/346 [29:33<01:03,  5.28s/it][A

loss: tensor(0.9837, device='cuda:0', grad_fn=<NllLossBackward>)



 97%|█████████▋| 335/346 [29:39<00:57,  5.27s/it][A

loss: tensor(0.9056, device='cuda:0', grad_fn=<NllLossBackward>)



 97%|█████████▋| 336/346 [29:44<00:52,  5.26s/it][A

loss: tensor(0.9055, device='cuda:0', grad_fn=<NllLossBackward>)



 97%|█████████▋| 337/346 [29:49<00:47,  5.28s/it][A

loss: tensor(0.9719, device='cuda:0', grad_fn=<NllLossBackward>)



 98%|█████████▊| 338/346 [29:55<00:42,  5.27s/it][A

loss: tensor(0.9143, device='cuda:0', grad_fn=<NllLossBackward>)



 98%|█████████▊| 339/346 [30:00<00:36,  5.27s/it][A

loss: tensor(0.9254, device='cuda:0', grad_fn=<NllLossBackward>)



 98%|█████████▊| 340/346 [30:05<00:31,  5.26s/it][A

loss: tensor(0.9056, device='cuda:0', grad_fn=<NllLossBackward>)



 99%|█████████▊| 341/346 [30:10<00:26,  5.28s/it][A

loss: tensor(0.9683, device='cuda:0', grad_fn=<NllLossBackward>)



 99%|█████████▉| 342/346 [30:16<00:21,  5.27s/it][A

loss: tensor(0.9680, device='cuda:0', grad_fn=<NllLossBackward>)



 99%|█████████▉| 343/346 [30:21<00:15,  5.27s/it][A

loss: tensor(0.9288, device='cuda:0', grad_fn=<NllLossBackward>)



 99%|█████████▉| 344/346 [30:26<00:10,  5.27s/it][A

loss: tensor(0.9860, device='cuda:0', grad_fn=<NllLossBackward>)



100%|█████████▉| 345/346 [30:31<00:05,  5.28s/it][A

loss: tensor(0.9741, device='cuda:0', grad_fn=<NllLossBackward>)



100%|██████████| 346/346 [30:32<00:00,  5.30s/it][A

  0%|          | 0/173 [00:00<?, ?it/s][A

loss: tensor(0.9053, device='cuda:0', grad_fn=<NllLossBackward>)

	Training Loss: 0.962872518280338

	Training acc: 0.9587291581798302

	Training prec: 0.9294759048606471

	Training rec: 0.9587291581798302

	Training f1: 0.9430448224275811

	Current Learning rate:  4.5e-05



  1%|          | 1/173 [00:00<02:09,  1.32it/s][A
  1%|          | 2/173 [00:01<01:55,  1.48it/s][A
  2%|▏         | 3/173 [00:02<01:55,  1.47it/s][A
  2%|▏         | 4/173 [00:02<01:56,  1.45it/s][A
  3%|▎         | 5/173 [00:03<01:51,  1.51it/s][A
  3%|▎         | 6/173 [00:04<01:52,  1.49it/s][A
  4%|▍         | 7/173 [00:04<01:52,  1.48it/s][A
  5%|▍         | 8/173 [00:05<01:48,  1.52it/s][A
  5%|▌         | 9/173 [00:06<01:49,  1.49it/s][A
  6%|▌         | 10/173 [00:06<01:50,  1.48it/s][A
  6%|▋         | 11/173 [00:07<01:46,  1.52it/s][A
  7%|▋         | 12/173 [00:08<01:47,  1.50it/s][A
  8%|▊         | 13/173 [00:08<01:48,  1.48it/s][A
  8%|▊         | 14/173 [00:09<01:44,  1.51it/s][A
  9%|▊         | 15/173 [00:10<01:45,  1.49it/s][A
  9%|▉         | 16/173 [00:10<01:46,  1.48it/s][A
 10%|▉         | 17/173 [00:11<01:43,  1.51it/s][A
 10%|█         | 18/173 [00:12<01:43,  1.49it/s][A
 11%|█         | 19/173 [00:12<01:44,  1.47it/s][A
 12%|█▏        | 20/


	Validation Loss: 0.9437548370030574

	Validation acc: 0.9612642192619004

	Validation prec: 0.925678751757159

	Validation rec: 0.9612642192619004

	Validation f1: 0.9427258909617894



  0%|          | 1/346 [00:05<30:19,  5.27s/it][A

loss: tensor(0.9056, device='cuda:0', grad_fn=<NllLossBackward>)



  1%|          | 2/346 [00:10<30:28,  5.31s/it][A

loss: tensor(0.9056, device='cuda:0', grad_fn=<NllLossBackward>)



  1%|          | 3/346 [00:15<30:12,  5.29s/it][A

loss: tensor(0.9056, device='cuda:0', grad_fn=<NllLossBackward>)



  1%|          | 4/346 [00:21<30:04,  5.28s/it][A

loss: tensor(0.9225, device='cuda:0', grad_fn=<NllLossBackward>)



  1%|▏         | 5/346 [00:26<29:58,  5.27s/it][A

loss: tensor(0.9420, device='cuda:0', grad_fn=<NllLossBackward>)



  2%|▏         | 6/346 [00:31<30:00,  5.30s/it][A

loss: tensor(0.9390, device='cuda:0', grad_fn=<NllLossBackward>)



  2%|▏         | 7/346 [00:37<29:54,  5.29s/it][A

loss: tensor(0.9056, device='cuda:0', grad_fn=<NllLossBackward>)



  2%|▏         | 8/346 [00:42<29:48,  5.29s/it][A

loss: tensor(0.9503, device='cuda:0', grad_fn=<NllLossBackward>)



  3%|▎         | 9/346 [00:47<29:41,  5.29s/it][A

loss: tensor(0.9133, device='cuda:0', grad_fn=<NllLossBackward>)



  3%|▎         | 10/346 [00:52<29:44,  5.31s/it][A

loss: tensor(0.9467, device='cuda:0', grad_fn=<NllLossBackward>)



  3%|▎         | 11/346 [00:58<29:35,  5.30s/it][A

loss: tensor(0.9641, device='cuda:0', grad_fn=<NllLossBackward>)



  3%|▎         | 12/346 [01:03<29:28,  5.29s/it][A

loss: tensor(0.9055, device='cuda:0', grad_fn=<NllLossBackward>)



  4%|▍         | 13/346 [01:08<29:22,  5.29s/it][A

loss: tensor(0.9275, device='cuda:0', grad_fn=<NllLossBackward>)



  4%|▍         | 14/346 [01:14<29:21,  5.31s/it][A

loss: tensor(0.9615, device='cuda:0', grad_fn=<NllLossBackward>)



  4%|▍         | 15/346 [01:19<29:10,  5.29s/it][A

loss: tensor(0.9375, device='cuda:0', grad_fn=<NllLossBackward>)



  5%|▍         | 16/346 [01:24<29:03,  5.28s/it][A

loss: tensor(0.9549, device='cuda:0', grad_fn=<NllLossBackward>)



  5%|▍         | 17/346 [01:29<28:59,  5.29s/it][A

loss: tensor(0.9568, device='cuda:0', grad_fn=<NllLossBackward>)



  5%|▌         | 18/346 [01:35<28:59,  5.30s/it][A

loss: tensor(0.9056, device='cuda:0', grad_fn=<NllLossBackward>)



  5%|▌         | 19/346 [01:40<28:54,  5.30s/it][A

loss: tensor(0.9180, device='cuda:0', grad_fn=<NllLossBackward>)



  6%|▌         | 20/346 [01:45<28:47,  5.30s/it][A

loss: tensor(0.9620, device='cuda:0', grad_fn=<NllLossBackward>)



  6%|▌         | 21/346 [01:51<28:44,  5.31s/it][A

loss: tensor(0.9484, device='cuda:0', grad_fn=<NllLossBackward>)



  6%|▋         | 22/346 [01:56<28:33,  5.29s/it][A

loss: tensor(0.9257, device='cuda:0', grad_fn=<NllLossBackward>)



  7%|▋         | 23/346 [02:01<28:27,  5.29s/it][A

loss: tensor(0.9451, device='cuda:0', grad_fn=<NllLossBackward>)



  7%|▋         | 24/346 [02:07<28:21,  5.28s/it][A

loss: tensor(0.9305, device='cuda:0', grad_fn=<NllLossBackward>)



  7%|▋         | 25/346 [02:12<28:22,  5.30s/it][A

loss: tensor(0.9322, device='cuda:0', grad_fn=<NllLossBackward>)



  8%|▊         | 26/346 [02:17<28:11,  5.29s/it][A

loss: tensor(0.9471, device='cuda:0', grad_fn=<NllLossBackward>)



  8%|▊         | 27/346 [02:22<28:06,  5.29s/it][A

loss: tensor(0.9423, device='cuda:0', grad_fn=<NllLossBackward>)



  8%|▊         | 28/346 [02:28<27:56,  5.27s/it][A

loss: tensor(0.9177, device='cuda:0', grad_fn=<NllLossBackward>)



  8%|▊         | 29/346 [02:33<27:57,  5.29s/it][A

loss: tensor(0.9539, device='cuda:0', grad_fn=<NllLossBackward>)



  9%|▊         | 30/346 [02:38<27:48,  5.28s/it][A

loss: tensor(0.9296, device='cuda:0', grad_fn=<NllLossBackward>)



  9%|▉         | 31/346 [02:44<27:43,  5.28s/it][A

loss: tensor(0.9274, device='cuda:0', grad_fn=<NllLossBackward>)



  9%|▉         | 32/346 [02:49<27:34,  5.27s/it][A

loss: tensor(0.9260, device='cuda:0', grad_fn=<NllLossBackward>)



 10%|▉         | 33/346 [02:54<27:37,  5.30s/it][A

loss: tensor(0.9721, device='cuda:0', grad_fn=<NllLossBackward>)



 10%|▉         | 34/346 [02:59<27:27,  5.28s/it][A

loss: tensor(0.9513, device='cuda:0', grad_fn=<NllLossBackward>)



 10%|█         | 35/346 [03:05<27:21,  5.28s/it][A

loss: tensor(0.9620, device='cuda:0', grad_fn=<NllLossBackward>)



 10%|█         | 36/346 [03:10<27:14,  5.27s/it][A

loss: tensor(0.9373, device='cuda:0', grad_fn=<NllLossBackward>)



 11%|█         | 37/346 [03:15<27:17,  5.30s/it][A

loss: tensor(0.9557, device='cuda:0', grad_fn=<NllLossBackward>)



 11%|█         | 38/346 [03:20<27:07,  5.28s/it][A

loss: tensor(0.9555, device='cuda:0', grad_fn=<NllLossBackward>)



 11%|█▏        | 39/346 [03:26<27:02,  5.28s/it][A

loss: tensor(0.9458, device='cuda:0', grad_fn=<NllLossBackward>)



 12%|█▏        | 40/346 [03:31<27:04,  5.31s/it][A

loss: tensor(0.9533, device='cuda:0', grad_fn=<NllLossBackward>)



 12%|█▏        | 41/346 [03:36<26:56,  5.30s/it][A

loss: tensor(0.9055, device='cuda:0', grad_fn=<NllLossBackward>)



 12%|█▏        | 42/346 [03:42<26:49,  5.29s/it][A

loss: tensor(0.9319, device='cuda:0', grad_fn=<NllLossBackward>)



 12%|█▏        | 43/346 [03:47<26:41,  5.28s/it][A

loss: tensor(0.9054, device='cuda:0', grad_fn=<NllLossBackward>)



 13%|█▎        | 44/346 [03:52<26:40,  5.30s/it][A

loss: tensor(0.9422, device='cuda:0', grad_fn=<NllLossBackward>)



 13%|█▎        | 45/346 [03:58<26:30,  5.28s/it][A

loss: tensor(0.9449, device='cuda:0', grad_fn=<NllLossBackward>)



 13%|█▎        | 46/346 [04:03<26:26,  5.29s/it][A

loss: tensor(0.9582, device='cuda:0', grad_fn=<NllLossBackward>)



 14%|█▎        | 47/346 [04:08<26:19,  5.28s/it][A

loss: tensor(0.9313, device='cuda:0', grad_fn=<NllLossBackward>)



 14%|█▍        | 48/346 [04:13<26:18,  5.30s/it][A

loss: tensor(0.9137, device='cuda:0', grad_fn=<NllLossBackward>)



 14%|█▍        | 49/346 [04:19<26:10,  5.29s/it][A

loss: tensor(0.9054, device='cuda:0', grad_fn=<NllLossBackward>)



 14%|█▍        | 50/346 [04:24<26:02,  5.28s/it][A

loss: tensor(0.9338, device='cuda:0', grad_fn=<NllLossBackward>)



 15%|█▍        | 51/346 [04:29<25:54,  5.27s/it][A

loss: tensor(0.9317, device='cuda:0', grad_fn=<NllLossBackward>)



 15%|█▌        | 52/346 [04:35<25:56,  5.29s/it][A

loss: tensor(0.9134, device='cuda:0', grad_fn=<NllLossBackward>)



 15%|█▌        | 53/346 [04:40<25:48,  5.29s/it][A

loss: tensor(0.9417, device='cuda:0', grad_fn=<NllLossBackward>)



 16%|█▌        | 54/346 [04:45<25:41,  5.28s/it][A

loss: tensor(0.9717, device='cuda:0', grad_fn=<NllLossBackward>)



 16%|█▌        | 55/346 [04:50<25:36,  5.28s/it][A

loss: tensor(0.9624, device='cuda:0', grad_fn=<NllLossBackward>)



 16%|█▌        | 56/346 [04:56<25:37,  5.30s/it][A

loss: tensor(0.9197, device='cuda:0', grad_fn=<NllLossBackward>)



 16%|█▋        | 57/346 [05:01<25:30,  5.29s/it][A

loss: tensor(1.0120, device='cuda:0', grad_fn=<NllLossBackward>)



 17%|█▋        | 58/346 [05:06<25:25,  5.30s/it][A

loss: tensor(0.9537, device='cuda:0', grad_fn=<NllLossBackward>)



 17%|█▋        | 59/346 [05:12<25:14,  5.28s/it][A

loss: tensor(0.9620, device='cuda:0', grad_fn=<NllLossBackward>)



 17%|█▋        | 60/346 [05:17<25:15,  5.30s/it][A

loss: tensor(0.9176, device='cuda:0', grad_fn=<NllLossBackward>)



 18%|█▊        | 61/346 [05:22<25:07,  5.29s/it][A

loss: tensor(0.9730, device='cuda:0', grad_fn=<NllLossBackward>)



 18%|█▊        | 62/346 [05:27<25:01,  5.29s/it][A

loss: tensor(0.9313, device='cuda:0', grad_fn=<NllLossBackward>)



 18%|█▊        | 63/346 [05:33<24:54,  5.28s/it][A

loss: tensor(0.9167, device='cuda:0', grad_fn=<NllLossBackward>)



 18%|█▊        | 64/346 [05:38<24:54,  5.30s/it][A

loss: tensor(0.9339, device='cuda:0', grad_fn=<NllLossBackward>)



 19%|█▉        | 65/346 [05:43<24:46,  5.29s/it][A

loss: tensor(0.9053, device='cuda:0', grad_fn=<NllLossBackward>)



 19%|█▉        | 66/346 [05:49<24:37,  5.28s/it][A

loss: tensor(0.9373, device='cuda:0', grad_fn=<NllLossBackward>)



 19%|█▉        | 67/346 [05:54<24:34,  5.29s/it][A

loss: tensor(0.9670, device='cuda:0', grad_fn=<NllLossBackward>)



 20%|█▉        | 68/346 [05:59<24:26,  5.28s/it][A

loss: tensor(1.0137, device='cuda:0', grad_fn=<NllLossBackward>)



 20%|█▉        | 69/346 [06:04<24:23,  5.28s/it][A

loss: tensor(0.9438, device='cuda:0', grad_fn=<NllLossBackward>)



 20%|██        | 70/346 [06:10<24:16,  5.28s/it][A

loss: tensor(0.9054, device='cuda:0', grad_fn=<NllLossBackward>)



 21%|██        | 71/346 [06:15<24:16,  5.30s/it][A

loss: tensor(0.9488, device='cuda:0', grad_fn=<NllLossBackward>)



 21%|██        | 72/346 [06:20<24:08,  5.29s/it][A

loss: tensor(0.9054, device='cuda:0', grad_fn=<NllLossBackward>)



 21%|██        | 73/346 [06:26<24:00,  5.27s/it][A

loss: tensor(0.9267, device='cuda:0', grad_fn=<NllLossBackward>)



 21%|██▏       | 74/346 [06:31<23:51,  5.26s/it][A

loss: tensor(0.9819, device='cuda:0', grad_fn=<NllLossBackward>)



 22%|██▏       | 75/346 [06:36<23:52,  5.29s/it][A

loss: tensor(0.9431, device='cuda:0', grad_fn=<NllLossBackward>)



 22%|██▏       | 76/346 [06:41<23:44,  5.27s/it][A

loss: tensor(0.9145, device='cuda:0', grad_fn=<NllLossBackward>)



 22%|██▏       | 77/346 [06:47<23:36,  5.26s/it][A

loss: tensor(0.9429, device='cuda:0', grad_fn=<NllLossBackward>)



 23%|██▎       | 78/346 [06:52<23:29,  5.26s/it][A

loss: tensor(0.9053, device='cuda:0', grad_fn=<NllLossBackward>)



 23%|██▎       | 79/346 [06:57<23:36,  5.30s/it][A

loss: tensor(0.9297, device='cuda:0', grad_fn=<NllLossBackward>)



 23%|██▎       | 80/346 [07:03<23:28,  5.30s/it][A

loss: tensor(0.9975, device='cuda:0', grad_fn=<NllLossBackward>)



 23%|██▎       | 81/346 [07:08<23:20,  5.29s/it][A

loss: tensor(0.9252, device='cuda:0', grad_fn=<NllLossBackward>)



 24%|██▎       | 82/346 [07:13<23:13,  5.28s/it][A

loss: tensor(0.9243, device='cuda:0', grad_fn=<NllLossBackward>)



 24%|██▍       | 83/346 [07:18<23:13,  5.30s/it][A

loss: tensor(0.9608, device='cuda:0', grad_fn=<NllLossBackward>)



 24%|██▍       | 84/346 [07:24<23:04,  5.29s/it][A

loss: tensor(0.9812, device='cuda:0', grad_fn=<NllLossBackward>)



 25%|██▍       | 85/346 [07:29<22:56,  5.27s/it][A

loss: tensor(0.9314, device='cuda:0', grad_fn=<NllLossBackward>)



 25%|██▍       | 86/346 [07:34<22:54,  5.29s/it][A

loss: tensor(0.9542, device='cuda:0', grad_fn=<NllLossBackward>)



 25%|██▌       | 87/346 [07:39<22:47,  5.28s/it][A

loss: tensor(0.9639, device='cuda:0', grad_fn=<NllLossBackward>)



 25%|██▌       | 88/346 [07:45<22:40,  5.27s/it][A

loss: tensor(0.9513, device='cuda:0', grad_fn=<NllLossBackward>)



 26%|██▌       | 89/346 [07:50<22:33,  5.27s/it][A

loss: tensor(0.9242, device='cuda:0', grad_fn=<NllLossBackward>)



 26%|██▌       | 90/346 [07:55<22:30,  5.28s/it][A

loss: tensor(0.9195, device='cuda:0', grad_fn=<NllLossBackward>)



 26%|██▋       | 91/346 [08:01<22:25,  5.27s/it][A

loss: tensor(0.9259, device='cuda:0', grad_fn=<NllLossBackward>)



 27%|██▋       | 92/346 [08:06<22:18,  5.27s/it][A

loss: tensor(0.9181, device='cuda:0', grad_fn=<NllLossBackward>)



 27%|██▋       | 93/346 [08:11<22:10,  5.26s/it][A

loss: tensor(0.9971, device='cuda:0', grad_fn=<NllLossBackward>)



 27%|██▋       | 94/346 [08:16<22:07,  5.27s/it][A

loss: tensor(0.9321, device='cuda:0', grad_fn=<NllLossBackward>)



 27%|██▋       | 95/346 [08:22<22:00,  5.26s/it][A

loss: tensor(0.9624, device='cuda:0', grad_fn=<NllLossBackward>)



 28%|██▊       | 96/346 [08:27<21:54,  5.26s/it][A

loss: tensor(0.9408, device='cuda:0', grad_fn=<NllLossBackward>)



 28%|██▊       | 97/346 [08:32<21:48,  5.26s/it][A

loss: tensor(0.9575, device='cuda:0', grad_fn=<NllLossBackward>)



 28%|██▊       | 98/346 [08:37<21:50,  5.28s/it][A

loss: tensor(0.9882, device='cuda:0', grad_fn=<NllLossBackward>)



 29%|██▊       | 99/346 [08:43<21:39,  5.26s/it][A

loss: tensor(0.9297, device='cuda:0', grad_fn=<NllLossBackward>)



 29%|██▉       | 100/346 [08:48<21:33,  5.26s/it][A

loss: tensor(0.9490, device='cuda:0', grad_fn=<NllLossBackward>)



 29%|██▉       | 101/346 [08:53<21:29,  5.26s/it][A

loss: tensor(0.9493, device='cuda:0', grad_fn=<NllLossBackward>)



 29%|██▉       | 102/346 [08:59<21:28,  5.28s/it][A

loss: tensor(0.9125, device='cuda:0', grad_fn=<NllLossBackward>)



 30%|██▉       | 103/346 [09:04<21:20,  5.27s/it][A

loss: tensor(0.9053, device='cuda:0', grad_fn=<NllLossBackward>)



 30%|███       | 104/346 [09:09<21:14,  5.26s/it][A

loss: tensor(0.9503, device='cuda:0', grad_fn=<NllLossBackward>)



 30%|███       | 105/346 [09:14<21:04,  5.25s/it][A

loss: tensor(0.9561, device='cuda:0', grad_fn=<NllLossBackward>)



 31%|███       | 106/346 [09:20<21:04,  5.27s/it][A

loss: tensor(0.9524, device='cuda:0', grad_fn=<NllLossBackward>)



 31%|███       | 107/346 [09:25<20:57,  5.26s/it][A

loss: tensor(0.9606, device='cuda:0', grad_fn=<NllLossBackward>)



 31%|███       | 108/346 [09:30<20:53,  5.27s/it][A

loss: tensor(0.9054, device='cuda:0', grad_fn=<NllLossBackward>)



 32%|███▏      | 109/346 [09:35<20:52,  5.28s/it][A

loss: tensor(0.9053, device='cuda:0', grad_fn=<NllLossBackward>)



 32%|███▏      | 110/346 [09:41<20:53,  5.31s/it][A

loss: tensor(1.0100, device='cuda:0', grad_fn=<NllLossBackward>)



 32%|███▏      | 111/346 [09:46<20:53,  5.33s/it][A

loss: tensor(0.9357, device='cuda:0', grad_fn=<NllLossBackward>)



 32%|███▏      | 112/346 [09:52<20:52,  5.35s/it][A

loss: tensor(0.9053, device='cuda:0', grad_fn=<NllLossBackward>)



 33%|███▎      | 113/346 [09:57<20:54,  5.38s/it][A

loss: tensor(0.9297, device='cuda:0', grad_fn=<NllLossBackward>)



 33%|███▎      | 114/346 [10:02<20:46,  5.37s/it][A

loss: tensor(0.9401, device='cuda:0', grad_fn=<NllLossBackward>)



 33%|███▎      | 115/346 [10:08<20:40,  5.37s/it][A

loss: tensor(0.9418, device='cuda:0', grad_fn=<NllLossBackward>)



 34%|███▎      | 116/346 [10:13<20:34,  5.37s/it][A

loss: tensor(0.9810, device='cuda:0', grad_fn=<NllLossBackward>)



 34%|███▍      | 117/346 [10:18<20:30,  5.38s/it][A

loss: tensor(0.9312, device='cuda:0', grad_fn=<NllLossBackward>)



 34%|███▍      | 118/346 [10:24<20:22,  5.36s/it][A

loss: tensor(0.9778, device='cuda:0', grad_fn=<NllLossBackward>)



 34%|███▍      | 119/346 [10:29<20:17,  5.36s/it][A

loss: tensor(0.9301, device='cuda:0', grad_fn=<NllLossBackward>)



 35%|███▍      | 120/346 [10:34<20:09,  5.35s/it][A

loss: tensor(0.9467, device='cuda:0', grad_fn=<NllLossBackward>)



 35%|███▍      | 121/346 [10:40<20:10,  5.38s/it][A

loss: tensor(0.9312, device='cuda:0', grad_fn=<NllLossBackward>)



 35%|███▌      | 122/346 [10:45<20:01,  5.36s/it][A

loss: tensor(0.9569, device='cuda:0', grad_fn=<NllLossBackward>)



 36%|███▌      | 123/346 [10:51<19:52,  5.35s/it][A

loss: tensor(1.0433, device='cuda:0', grad_fn=<NllLossBackward>)



 36%|███▌      | 124/346 [10:56<19:47,  5.35s/it][A

loss: tensor(0.9421, device='cuda:0', grad_fn=<NllLossBackward>)



 36%|███▌      | 125/346 [11:01<19:46,  5.37s/it][A

loss: tensor(0.9394, device='cuda:0', grad_fn=<NllLossBackward>)



 36%|███▋      | 126/346 [11:07<19:37,  5.35s/it][A

loss: tensor(0.9594, device='cuda:0', grad_fn=<NllLossBackward>)



 37%|███▋      | 127/346 [11:12<19:31,  5.35s/it][A

loss: tensor(0.9241, device='cuda:0', grad_fn=<NllLossBackward>)



 37%|███▋      | 128/346 [11:17<19:25,  5.35s/it][A

loss: tensor(0.9347, device='cuda:0', grad_fn=<NllLossBackward>)



 37%|███▋      | 129/346 [11:23<19:24,  5.37s/it][A

loss: tensor(0.9159, device='cuda:0', grad_fn=<NllLossBackward>)



 38%|███▊      | 130/346 [11:28<19:16,  5.35s/it][A

loss: tensor(0.9053, device='cuda:0', grad_fn=<NllLossBackward>)



 38%|███▊      | 131/346 [11:33<19:09,  5.35s/it][A

loss: tensor(0.9692, device='cuda:0', grad_fn=<NllLossBackward>)



 38%|███▊      | 132/346 [11:39<19:03,  5.34s/it][A

loss: tensor(0.9053, device='cuda:0', grad_fn=<NllLossBackward>)



 38%|███▊      | 133/346 [11:44<19:00,  5.35s/it][A

loss: tensor(0.9053, device='cuda:0', grad_fn=<NllLossBackward>)



 39%|███▊      | 134/346 [11:49<18:52,  5.34s/it][A

loss: tensor(0.9577, device='cuda:0', grad_fn=<NllLossBackward>)



 39%|███▉      | 135/346 [11:55<18:46,  5.34s/it][A

loss: tensor(0.9647, device='cuda:0', grad_fn=<NllLossBackward>)



 39%|███▉      | 136/346 [12:00<18:42,  5.35s/it][A

loss: tensor(0.9233, device='cuda:0', grad_fn=<NllLossBackward>)



 40%|███▉      | 137/346 [12:05<18:34,  5.33s/it][A

loss: tensor(0.9355, device='cuda:0', grad_fn=<NllLossBackward>)



 40%|███▉      | 138/346 [12:11<18:29,  5.34s/it][A

loss: tensor(0.9876, device='cuda:0', grad_fn=<NllLossBackward>)



 40%|████      | 139/346 [12:16<18:24,  5.33s/it][A

loss: tensor(0.9168, device='cuda:0', grad_fn=<NllLossBackward>)



 40%|████      | 140/346 [12:21<18:22,  5.35s/it][A

loss: tensor(0.9537, device='cuda:0', grad_fn=<NllLossBackward>)



 41%|████      | 141/346 [12:27<18:14,  5.34s/it][A

loss: tensor(0.9259, device='cuda:0', grad_fn=<NllLossBackward>)



 41%|████      | 142/346 [12:32<18:08,  5.33s/it][A

loss: tensor(0.9465, device='cuda:0', grad_fn=<NllLossBackward>)



 41%|████▏     | 143/346 [12:37<18:00,  5.32s/it][A

loss: tensor(0.9366, device='cuda:0', grad_fn=<NllLossBackward>)



 42%|████▏     | 144/346 [12:43<18:01,  5.35s/it][A

loss: tensor(0.9403, device='cuda:0', grad_fn=<NllLossBackward>)



 42%|████▏     | 145/346 [12:48<17:55,  5.35s/it][A

loss: tensor(0.9417, device='cuda:0', grad_fn=<NllLossBackward>)



 42%|████▏     | 146/346 [12:53<17:48,  5.34s/it][A

loss: tensor(0.9367, device='cuda:0', grad_fn=<NllLossBackward>)



 42%|████▏     | 147/346 [12:59<17:44,  5.35s/it][A

loss: tensor(0.9515, device='cuda:0', grad_fn=<NllLossBackward>)



 43%|████▎     | 148/346 [13:04<17:41,  5.36s/it][A

loss: tensor(0.9730, device='cuda:0', grad_fn=<NllLossBackward>)



 43%|████▎     | 149/346 [13:10<17:32,  5.34s/it][A

loss: tensor(0.9690, device='cuda:0', grad_fn=<NllLossBackward>)



 43%|████▎     | 150/346 [13:15<17:25,  5.33s/it][A

loss: tensor(0.9712, device='cuda:0', grad_fn=<NllLossBackward>)



 44%|████▎     | 151/346 [13:20<17:18,  5.33s/it][A

loss: tensor(0.9575, device='cuda:0', grad_fn=<NllLossBackward>)



 44%|████▍     | 152/346 [13:26<17:17,  5.35s/it][A

loss: tensor(0.9220, device='cuda:0', grad_fn=<NllLossBackward>)



 44%|████▍     | 153/346 [13:31<17:10,  5.34s/it][A

loss: tensor(0.9199, device='cuda:0', grad_fn=<NllLossBackward>)



 45%|████▍     | 154/346 [13:36<17:04,  5.34s/it][A

loss: tensor(0.9653, device='cuda:0', grad_fn=<NllLossBackward>)



 45%|████▍     | 155/346 [13:42<16:58,  5.33s/it][A

loss: tensor(0.9053, device='cuda:0', grad_fn=<NllLossBackward>)



 45%|████▌     | 156/346 [13:47<16:54,  5.34s/it][A

loss: tensor(0.9052, device='cuda:0', grad_fn=<NllLossBackward>)



 45%|████▌     | 157/346 [13:52<16:45,  5.32s/it][A

loss: tensor(0.9301, device='cuda:0', grad_fn=<NllLossBackward>)



 46%|████▌     | 158/346 [13:57<16:38,  5.31s/it][A

loss: tensor(0.9173, device='cuda:0', grad_fn=<NllLossBackward>)



 46%|████▌     | 159/346 [14:03<16:36,  5.33s/it][A

loss: tensor(0.9905, device='cuda:0', grad_fn=<NllLossBackward>)



 46%|████▌     | 160/346 [14:08<16:28,  5.32s/it][A

loss: tensor(0.9400, device='cuda:0', grad_fn=<NllLossBackward>)



 47%|████▋     | 161/346 [14:13<16:23,  5.32s/it][A

loss: tensor(0.9319, device='cuda:0', grad_fn=<NllLossBackward>)



 47%|████▋     | 162/346 [14:19<16:18,  5.32s/it][A

loss: tensor(0.9870, device='cuda:0', grad_fn=<NllLossBackward>)



 47%|████▋     | 163/346 [14:24<16:16,  5.34s/it][A

loss: tensor(1.0347, device='cuda:0', grad_fn=<NllLossBackward>)



 47%|████▋     | 164/346 [14:29<16:10,  5.33s/it][A

loss: tensor(0.9161, device='cuda:0', grad_fn=<NllLossBackward>)



 48%|████▊     | 165/346 [14:35<16:04,  5.33s/it][A

loss: tensor(0.9396, device='cuda:0', grad_fn=<NllLossBackward>)



 48%|████▊     | 166/346 [14:40<15:58,  5.32s/it][A

loss: tensor(0.9296, device='cuda:0', grad_fn=<NllLossBackward>)



 48%|████▊     | 167/346 [14:45<15:56,  5.34s/it][A

loss: tensor(0.9748, device='cuda:0', grad_fn=<NllLossBackward>)



 49%|████▊     | 168/346 [14:51<15:48,  5.33s/it][A

loss: tensor(0.9320, device='cuda:0', grad_fn=<NllLossBackward>)



 49%|████▉     | 169/346 [14:56<15:43,  5.33s/it][A

loss: tensor(0.9180, device='cuda:0', grad_fn=<NllLossBackward>)



 49%|████▉     | 170/346 [15:01<15:37,  5.33s/it][A

loss: tensor(0.9359, device='cuda:0', grad_fn=<NllLossBackward>)



 49%|████▉     | 171/346 [15:07<15:35,  5.35s/it][A

loss: tensor(0.9406, device='cuda:0', grad_fn=<NllLossBackward>)



 50%|████▉     | 172/346 [15:12<15:28,  5.33s/it][A

loss: tensor(0.9499, device='cuda:0', grad_fn=<NllLossBackward>)



 50%|█████     | 173/346 [15:17<15:20,  5.32s/it][A

loss: tensor(0.9052, device='cuda:0', grad_fn=<NllLossBackward>)



 50%|█████     | 174/346 [15:23<15:14,  5.32s/it][A

loss: tensor(0.9824, device='cuda:0', grad_fn=<NllLossBackward>)



 51%|█████     | 175/346 [15:28<15:12,  5.34s/it][A

loss: tensor(0.9709, device='cuda:0', grad_fn=<NllLossBackward>)



 51%|█████     | 176/346 [15:33<15:04,  5.32s/it][A

loss: tensor(0.9538, device='cuda:0', grad_fn=<NllLossBackward>)



 51%|█████     | 177/346 [15:39<14:58,  5.32s/it][A

loss: tensor(1.0228, device='cuda:0', grad_fn=<NllLossBackward>)



 51%|█████▏    | 178/346 [15:44<14:51,  5.31s/it][A

loss: tensor(0.9164, device='cuda:0', grad_fn=<NllLossBackward>)



 52%|█████▏    | 179/346 [15:49<14:49,  5.33s/it][A

loss: tensor(0.9375, device='cuda:0', grad_fn=<NllLossBackward>)



 52%|█████▏    | 180/346 [15:55<14:43,  5.32s/it][A

loss: tensor(0.9052, device='cuda:0', grad_fn=<NllLossBackward>)



 52%|█████▏    | 181/346 [16:00<14:36,  5.31s/it][A

loss: tensor(0.9360, device='cuda:0', grad_fn=<NllLossBackward>)



 53%|█████▎    | 182/346 [16:05<14:34,  5.33s/it][A

loss: tensor(0.9840, device='cuda:0', grad_fn=<NllLossBackward>)



 53%|█████▎    | 183/346 [16:11<14:26,  5.32s/it][A

loss: tensor(0.9157, device='cuda:0', grad_fn=<NllLossBackward>)



 53%|█████▎    | 184/346 [16:16<14:21,  5.32s/it][A

loss: tensor(0.9494, device='cuda:0', grad_fn=<NllLossBackward>)



 53%|█████▎    | 185/346 [16:21<14:14,  5.31s/it][A

loss: tensor(0.9666, device='cuda:0', grad_fn=<NllLossBackward>)



 54%|█████▍    | 186/346 [16:27<14:11,  5.32s/it][A

loss: tensor(0.9276, device='cuda:0', grad_fn=<NllLossBackward>)



 54%|█████▍    | 187/346 [16:32<14:05,  5.32s/it][A

loss: tensor(0.9195, device='cuda:0', grad_fn=<NllLossBackward>)



 54%|█████▍    | 188/346 [16:37<13:58,  5.30s/it][A

loss: tensor(0.9448, device='cuda:0', grad_fn=<NllLossBackward>)



 55%|█████▍    | 189/346 [16:42<13:52,  5.30s/it][A

loss: tensor(1.0063, device='cuda:0', grad_fn=<NllLossBackward>)



 55%|█████▍    | 190/346 [16:48<13:50,  5.33s/it][A

loss: tensor(0.9868, device='cuda:0', grad_fn=<NllLossBackward>)



 55%|█████▌    | 191/346 [16:53<13:44,  5.32s/it][A

loss: tensor(0.9294, device='cuda:0', grad_fn=<NllLossBackward>)



 55%|█████▌    | 192/346 [16:58<13:38,  5.31s/it][A

loss: tensor(0.9488, device='cuda:0', grad_fn=<NllLossBackward>)



 56%|█████▌    | 193/346 [17:04<13:32,  5.31s/it][A

loss: tensor(0.9593, device='cuda:0', grad_fn=<NllLossBackward>)



 56%|█████▌    | 194/346 [17:09<13:30,  5.33s/it][A

loss: tensor(0.9149, device='cuda:0', grad_fn=<NllLossBackward>)



 56%|█████▋    | 195/346 [17:14<13:23,  5.32s/it][A

loss: tensor(0.9459, device='cuda:0', grad_fn=<NllLossBackward>)



 57%|█████▋    | 196/346 [17:20<13:16,  5.31s/it][A

loss: tensor(0.9452, device='cuda:0', grad_fn=<NllLossBackward>)



 57%|█████▋    | 197/346 [17:25<13:10,  5.31s/it][A

loss: tensor(0.9239, device='cuda:0', grad_fn=<NllLossBackward>)



 57%|█████▋    | 198/346 [17:30<13:08,  5.33s/it][A

loss: tensor(0.9212, device='cuda:0', grad_fn=<NllLossBackward>)



 58%|█████▊    | 199/346 [17:36<13:01,  5.31s/it][A

loss: tensor(0.9389, device='cuda:0', grad_fn=<NllLossBackward>)



 58%|█████▊    | 200/346 [17:41<12:54,  5.30s/it][A

loss: tensor(0.9506, device='cuda:0', grad_fn=<NllLossBackward>)



 58%|█████▊    | 201/346 [17:46<12:49,  5.31s/it][A

loss: tensor(0.9349, device='cuda:0', grad_fn=<NllLossBackward>)



 58%|█████▊    | 202/346 [17:52<12:45,  5.32s/it][A

loss: tensor(0.9270, device='cuda:0', grad_fn=<NllLossBackward>)



 59%|█████▊    | 203/346 [17:57<12:39,  5.31s/it][A

loss: tensor(0.9784, device='cuda:0', grad_fn=<NllLossBackward>)



 59%|█████▉    | 204/346 [18:02<12:37,  5.34s/it][A

loss: tensor(0.9504, device='cuda:0', grad_fn=<NllLossBackward>)



 59%|█████▉    | 205/346 [18:08<12:36,  5.37s/it][A

loss: tensor(0.9052, device='cuda:0', grad_fn=<NllLossBackward>)



 60%|█████▉    | 206/346 [18:13<12:32,  5.38s/it][A

loss: tensor(0.9407, device='cuda:0', grad_fn=<NllLossBackward>)



 60%|█████▉    | 207/346 [18:19<12:27,  5.38s/it][A

loss: tensor(0.9832, device='cuda:0', grad_fn=<NllLossBackward>)



 60%|██████    | 208/346 [18:24<12:22,  5.38s/it][A

loss: tensor(0.9438, device='cuda:0', grad_fn=<NllLossBackward>)



 60%|██████    | 209/346 [18:29<12:20,  5.41s/it][A

loss: tensor(0.9180, device='cuda:0', grad_fn=<NllLossBackward>)



 61%|██████    | 210/346 [18:35<12:14,  5.40s/it][A

loss: tensor(0.9052, device='cuda:0', grad_fn=<NllLossBackward>)



 61%|██████    | 211/346 [18:40<12:08,  5.40s/it][A

loss: tensor(0.9156, device='cuda:0', grad_fn=<NllLossBackward>)



 61%|██████▏   | 212/346 [18:46<12:02,  5.39s/it][A

loss: tensor(0.9446, device='cuda:0', grad_fn=<NllLossBackward>)



 62%|██████▏   | 213/346 [18:51<11:59,  5.41s/it][A

loss: tensor(0.9052, device='cuda:0', grad_fn=<NllLossBackward>)



 62%|██████▏   | 214/346 [18:56<11:53,  5.40s/it][A

loss: tensor(0.9166, device='cuda:0', grad_fn=<NllLossBackward>)



 62%|██████▏   | 215/346 [19:02<11:46,  5.40s/it][A

loss: tensor(0.9284, device='cuda:0', grad_fn=<NllLossBackward>)



 62%|██████▏   | 216/346 [19:07<11:41,  5.40s/it][A

loss: tensor(1.0035, device='cuda:0', grad_fn=<NllLossBackward>)



 63%|██████▎   | 217/346 [19:13<11:38,  5.41s/it][A

loss: tensor(0.9152, device='cuda:0', grad_fn=<NllLossBackward>)



 63%|██████▎   | 218/346 [19:18<11:31,  5.40s/it][A

loss: tensor(0.9357, device='cuda:0', grad_fn=<NllLossBackward>)



 63%|██████▎   | 219/346 [19:23<11:25,  5.40s/it][A

loss: tensor(0.9432, device='cuda:0', grad_fn=<NllLossBackward>)



 64%|██████▎   | 220/346 [19:29<11:19,  5.39s/it][A

loss: tensor(0.9390, device='cuda:0', grad_fn=<NllLossBackward>)



 64%|██████▍   | 221/346 [19:34<11:16,  5.41s/it][A

loss: tensor(0.9248, device='cuda:0', grad_fn=<NllLossBackward>)



 64%|██████▍   | 222/346 [19:40<11:11,  5.42s/it][A

loss: tensor(0.9476, device='cuda:0', grad_fn=<NllLossBackward>)



 64%|██████▍   | 223/346 [19:45<11:05,  5.41s/it][A

loss: tensor(0.9800, device='cuda:0', grad_fn=<NllLossBackward>)



 65%|██████▍   | 224/346 [19:50<11:00,  5.42s/it][A

loss: tensor(0.9420, device='cuda:0', grad_fn=<NllLossBackward>)



 65%|██████▌   | 225/346 [19:56<10:57,  5.44s/it][A

loss: tensor(0.9646, device='cuda:0', grad_fn=<NllLossBackward>)



 65%|██████▌   | 226/346 [20:01<10:51,  5.43s/it][A

loss: tensor(0.9386, device='cuda:0', grad_fn=<NllLossBackward>)



 66%|██████▌   | 227/346 [20:07<10:46,  5.44s/it][A

loss: tensor(0.9330, device='cuda:0', grad_fn=<NllLossBackward>)



 66%|██████▌   | 228/346 [20:12<10:42,  5.45s/it][A

loss: tensor(0.9360, device='cuda:0', grad_fn=<NllLossBackward>)



 66%|██████▌   | 229/346 [20:18<10:35,  5.43s/it][A

loss: tensor(0.9617, device='cuda:0', grad_fn=<NllLossBackward>)



 66%|██████▋   | 230/346 [20:23<10:29,  5.43s/it][A

loss: tensor(0.9488, device='cuda:0', grad_fn=<NllLossBackward>)



 67%|██████▋   | 231/346 [20:28<10:22,  5.41s/it][A

loss: tensor(0.9616, device='cuda:0', grad_fn=<NllLossBackward>)



 67%|██████▋   | 232/346 [20:34<10:18,  5.43s/it][A

loss: tensor(0.9666, device='cuda:0', grad_fn=<NllLossBackward>)



 67%|██████▋   | 233/346 [20:39<10:12,  5.42s/it][A

loss: tensor(0.9168, device='cuda:0', grad_fn=<NllLossBackward>)



 68%|██████▊   | 234/346 [20:45<10:05,  5.40s/it][A

loss: tensor(0.9426, device='cuda:0', grad_fn=<NllLossBackward>)



 68%|██████▊   | 235/346 [20:50<09:57,  5.39s/it][A

loss: tensor(0.9395, device='cuda:0', grad_fn=<NllLossBackward>)



 68%|██████▊   | 236/346 [20:55<09:54,  5.40s/it][A

loss: tensor(0.9466, device='cuda:0', grad_fn=<NllLossBackward>)



 68%|██████▊   | 237/346 [21:01<09:48,  5.40s/it][A

loss: tensor(0.9157, device='cuda:0', grad_fn=<NllLossBackward>)



 69%|██████▉   | 238/346 [21:06<09:41,  5.39s/it][A

loss: tensor(0.9168, device='cuda:0', grad_fn=<NllLossBackward>)



 69%|██████▉   | 239/346 [21:12<09:35,  5.38s/it][A

loss: tensor(0.9685, device='cuda:0', grad_fn=<NllLossBackward>)



 69%|██████▉   | 240/346 [21:17<09:31,  5.40s/it][A

loss: tensor(0.9139, device='cuda:0', grad_fn=<NllLossBackward>)



 70%|██████▉   | 241/346 [21:22<09:26,  5.39s/it][A

loss: tensor(0.9621, device='cuda:0', grad_fn=<NllLossBackward>)



 70%|██████▉   | 242/346 [21:28<09:21,  5.40s/it][A

loss: tensor(0.9299, device='cuda:0', grad_fn=<NllLossBackward>)



 70%|███████   | 243/346 [21:33<09:14,  5.39s/it][A

loss: tensor(0.9173, device='cuda:0', grad_fn=<NllLossBackward>)



 71%|███████   | 244/346 [21:39<09:11,  5.41s/it][A

loss: tensor(1.0188, device='cuda:0', grad_fn=<NllLossBackward>)



 71%|███████   | 245/346 [21:44<09:05,  5.40s/it][A

loss: tensor(0.9378, device='cuda:0', grad_fn=<NllLossBackward>)



 71%|███████   | 246/346 [21:49<08:59,  5.39s/it][A

loss: tensor(0.9912, device='cuda:0', grad_fn=<NllLossBackward>)



 71%|███████▏  | 247/346 [21:55<08:56,  5.42s/it][A

loss: tensor(0.9051, device='cuda:0', grad_fn=<NllLossBackward>)



 72%|███████▏  | 248/346 [22:00<08:50,  5.42s/it][A

loss: tensor(0.9217, device='cuda:0', grad_fn=<NllLossBackward>)



 72%|███████▏  | 249/346 [22:06<08:43,  5.40s/it][A

loss: tensor(0.9329, device='cuda:0', grad_fn=<NllLossBackward>)



 72%|███████▏  | 250/346 [22:11<08:38,  5.40s/it][A

loss: tensor(1.0045, device='cuda:0', grad_fn=<NllLossBackward>)



 73%|███████▎  | 251/346 [22:17<08:35,  5.42s/it][A

loss: tensor(0.9052, device='cuda:0', grad_fn=<NllLossBackward>)



 73%|███████▎  | 252/346 [22:22<08:28,  5.40s/it][A

loss: tensor(0.9952, device='cuda:0', grad_fn=<NllLossBackward>)



 73%|███████▎  | 253/346 [22:27<08:23,  5.42s/it][A

loss: tensor(0.9577, device='cuda:0', grad_fn=<NllLossBackward>)



 73%|███████▎  | 254/346 [22:33<08:17,  5.41s/it][A

loss: tensor(0.9618, device='cuda:0', grad_fn=<NllLossBackward>)



 74%|███████▎  | 255/346 [22:38<08:12,  5.41s/it][A

loss: tensor(0.9565, device='cuda:0', grad_fn=<NllLossBackward>)



 74%|███████▍  | 256/346 [22:43<08:05,  5.40s/it][A

loss: tensor(0.9778, device='cuda:0', grad_fn=<NllLossBackward>)



 74%|███████▍  | 257/346 [22:49<08:00,  5.39s/it][A

loss: tensor(0.9052, device='cuda:0', grad_fn=<NllLossBackward>)



 75%|███████▍  | 258/346 [22:54<07:55,  5.40s/it][A

loss: tensor(0.9609, device='cuda:0', grad_fn=<NllLossBackward>)



 75%|███████▍  | 259/346 [23:00<07:50,  5.40s/it][A

loss: tensor(0.9431, device='cuda:0', grad_fn=<NllLossBackward>)



 75%|███████▌  | 260/346 [23:05<07:43,  5.39s/it][A

loss: tensor(0.9877, device='cuda:0', grad_fn=<NllLossBackward>)



 75%|███████▌  | 261/346 [23:10<07:37,  5.38s/it][A

loss: tensor(0.9925, device='cuda:0', grad_fn=<NllLossBackward>)



 76%|███████▌  | 262/346 [23:16<07:32,  5.38s/it][A

loss: tensor(0.9311, device='cuda:0', grad_fn=<NllLossBackward>)



 76%|███████▌  | 263/346 [23:21<07:28,  5.41s/it][A

loss: tensor(0.9235, device='cuda:0', grad_fn=<NllLossBackward>)



 76%|███████▋  | 264/346 [23:27<07:23,  5.40s/it][A

loss: tensor(0.9429, device='cuda:0', grad_fn=<NllLossBackward>)



 77%|███████▋  | 265/346 [23:32<07:16,  5.39s/it][A

loss: tensor(0.9742, device='cuda:0', grad_fn=<NllLossBackward>)



 77%|███████▋  | 266/346 [23:37<07:11,  5.39s/it][A

loss: tensor(0.9428, device='cuda:0', grad_fn=<NllLossBackward>)



 77%|███████▋  | 267/346 [23:43<07:06,  5.40s/it][A

loss: tensor(0.9359, device='cuda:0', grad_fn=<NllLossBackward>)



 77%|███████▋  | 268/346 [23:48<07:00,  5.39s/it][A

loss: tensor(0.9652, device='cuda:0', grad_fn=<NllLossBackward>)



 78%|███████▊  | 269/346 [23:54<06:54,  5.38s/it][A

loss: tensor(0.9865, device='cuda:0', grad_fn=<NllLossBackward>)



 78%|███████▊  | 270/346 [23:59<06:48,  5.38s/it][A

loss: tensor(0.9303, device='cuda:0', grad_fn=<NllLossBackward>)



 78%|███████▊  | 271/346 [24:04<06:44,  5.39s/it][A

loss: tensor(0.9699, device='cuda:0', grad_fn=<NllLossBackward>)



 79%|███████▊  | 272/346 [24:10<06:38,  5.39s/it][A

loss: tensor(0.9408, device='cuda:0', grad_fn=<NllLossBackward>)



 79%|███████▉  | 273/346 [24:15<06:33,  5.39s/it][A

loss: tensor(0.9171, device='cuda:0', grad_fn=<NllLossBackward>)



 79%|███████▉  | 274/346 [24:21<06:28,  5.40s/it][A

loss: tensor(0.9952, device='cuda:0', grad_fn=<NllLossBackward>)



 79%|███████▉  | 275/346 [24:26<06:22,  5.38s/it][A

loss: tensor(0.9613, device='cuda:0', grad_fn=<NllLossBackward>)



 80%|███████▉  | 276/346 [24:31<06:17,  5.39s/it][A

loss: tensor(0.9051, device='cuda:0', grad_fn=<NllLossBackward>)



 80%|████████  | 277/346 [24:37<06:11,  5.39s/it][A

loss: tensor(0.9343, device='cuda:0', grad_fn=<NllLossBackward>)



 80%|████████  | 278/346 [24:42<06:06,  5.40s/it][A

loss: tensor(0.9191, device='cuda:0', grad_fn=<NllLossBackward>)



 81%|████████  | 279/346 [24:47<06:00,  5.39s/it][A

loss: tensor(0.9139, device='cuda:0', grad_fn=<NllLossBackward>)



 81%|████████  | 280/346 [24:53<05:56,  5.40s/it][A

loss: tensor(0.9052, device='cuda:0', grad_fn=<NllLossBackward>)



 81%|████████  | 281/346 [24:58<05:49,  5.38s/it][A

loss: tensor(0.9051, device='cuda:0', grad_fn=<NllLossBackward>)



 82%|████████▏ | 282/346 [25:04<05:45,  5.40s/it][A

loss: tensor(0.9160, device='cuda:0', grad_fn=<NllLossBackward>)



 82%|████████▏ | 283/346 [25:09<05:39,  5.39s/it][A

loss: tensor(0.9357, device='cuda:0', grad_fn=<NllLossBackward>)



 82%|████████▏ | 284/346 [25:14<05:34,  5.39s/it][A

loss: tensor(0.9051, device='cuda:0', grad_fn=<NllLossBackward>)



 82%|████████▏ | 285/346 [25:20<05:28,  5.39s/it][A

loss: tensor(0.9334, device='cuda:0', grad_fn=<NllLossBackward>)



 83%|████████▎ | 286/346 [25:25<05:24,  5.41s/it][A

loss: tensor(0.9757, device='cuda:0', grad_fn=<NllLossBackward>)



 83%|████████▎ | 287/346 [25:31<05:17,  5.39s/it][A

loss: tensor(0.9051, device='cuda:0', grad_fn=<NllLossBackward>)



 83%|████████▎ | 288/346 [25:36<05:13,  5.40s/it][A

loss: tensor(0.9236, device='cuda:0', grad_fn=<NllLossBackward>)



 84%|████████▎ | 289/346 [25:41<05:06,  5.38s/it][A

loss: tensor(0.9319, device='cuda:0', grad_fn=<NllLossBackward>)



 84%|████████▍ | 290/346 [25:47<05:02,  5.41s/it][A

loss: tensor(0.9695, device='cuda:0', grad_fn=<NllLossBackward>)



 84%|████████▍ | 291/346 [25:52<04:57,  5.41s/it][A

loss: tensor(0.9387, device='cuda:0', grad_fn=<NllLossBackward>)



 84%|████████▍ | 292/346 [25:58<04:51,  5.39s/it][A

loss: tensor(0.9296, device='cuda:0', grad_fn=<NllLossBackward>)



 85%|████████▍ | 293/346 [26:03<04:45,  5.39s/it][A

loss: tensor(0.9344, device='cuda:0', grad_fn=<NllLossBackward>)



 85%|████████▍ | 294/346 [26:08<04:41,  5.40s/it][A

loss: tensor(0.9991, device='cuda:0', grad_fn=<NllLossBackward>)



 85%|████████▌ | 295/346 [26:14<04:35,  5.40s/it][A

loss: tensor(0.9736, device='cuda:0', grad_fn=<NllLossBackward>)



 86%|████████▌ | 296/346 [26:19<04:29,  5.40s/it][A

loss: tensor(0.9408, device='cuda:0', grad_fn=<NllLossBackward>)



 86%|████████▌ | 297/346 [26:25<04:25,  5.41s/it][A

loss: tensor(0.9721, device='cuda:0', grad_fn=<NllLossBackward>)



 86%|████████▌ | 298/346 [26:30<04:19,  5.41s/it][A

loss: tensor(0.9356, device='cuda:0', grad_fn=<NllLossBackward>)



 86%|████████▋ | 299/346 [26:35<04:13,  5.40s/it][A

loss: tensor(0.9327, device='cuda:0', grad_fn=<NllLossBackward>)



 87%|████████▋ | 300/346 [26:41<04:07,  5.39s/it][A

loss: tensor(0.9562, device='cuda:0', grad_fn=<NllLossBackward>)



 87%|████████▋ | 301/346 [26:46<04:02,  5.40s/it][A

loss: tensor(0.9153, device='cuda:0', grad_fn=<NllLossBackward>)



 87%|████████▋ | 302/346 [26:52<03:57,  5.39s/it][A

loss: tensor(0.9878, device='cuda:0', grad_fn=<NllLossBackward>)



 88%|████████▊ | 303/346 [26:57<03:51,  5.38s/it][A

loss: tensor(0.9051, device='cuda:0', grad_fn=<NllLossBackward>)



 88%|████████▊ | 304/346 [27:02<03:45,  5.37s/it][A

loss: tensor(0.9268, device='cuda:0', grad_fn=<NllLossBackward>)



 88%|████████▊ | 305/346 [27:08<03:41,  5.40s/it][A

loss: tensor(0.9051, device='cuda:0', grad_fn=<NllLossBackward>)



 88%|████████▊ | 306/346 [27:13<03:35,  5.39s/it][A

loss: tensor(0.9448, device='cuda:0', grad_fn=<NllLossBackward>)



 89%|████████▊ | 307/346 [27:18<03:29,  5.37s/it][A

loss: tensor(0.9553, device='cuda:0', grad_fn=<NllLossBackward>)



 89%|████████▉ | 308/346 [27:24<03:24,  5.37s/it][A

loss: tensor(0.9051, device='cuda:0', grad_fn=<NllLossBackward>)



 89%|████████▉ | 309/346 [27:29<03:19,  5.40s/it][A

loss: tensor(0.9365, device='cuda:0', grad_fn=<NllLossBackward>)



 90%|████████▉ | 310/346 [27:35<03:13,  5.38s/it][A

loss: tensor(0.9217, device='cuda:0', grad_fn=<NllLossBackward>)



 90%|████████▉ | 311/346 [27:40<03:08,  5.40s/it][A

loss: tensor(0.9365, device='cuda:0', grad_fn=<NllLossBackward>)



 90%|█████████ | 312/346 [27:45<03:03,  5.39s/it][A

loss: tensor(0.9358, device='cuda:0', grad_fn=<NllLossBackward>)



 90%|█████████ | 313/346 [27:51<02:58,  5.42s/it][A

loss: tensor(1.0162, device='cuda:0', grad_fn=<NllLossBackward>)



 91%|█████████ | 314/346 [27:56<02:52,  5.40s/it][A

loss: tensor(0.9051, device='cuda:0', grad_fn=<NllLossBackward>)



 91%|█████████ | 315/346 [28:02<02:47,  5.40s/it][A

loss: tensor(0.9336, device='cuda:0', grad_fn=<NllLossBackward>)



 91%|█████████▏| 316/346 [28:07<02:41,  5.40s/it][A

loss: tensor(0.9051, device='cuda:0', grad_fn=<NllLossBackward>)



 92%|█████████▏| 317/346 [28:13<02:37,  5.42s/it][A

loss: tensor(0.9051, device='cuda:0', grad_fn=<NllLossBackward>)



 92%|█████████▏| 318/346 [28:18<02:31,  5.40s/it][A

loss: tensor(0.9184, device='cuda:0', grad_fn=<NllLossBackward>)



 92%|█████████▏| 319/346 [28:23<02:24,  5.36s/it][A

loss: tensor(0.9372, device='cuda:0', grad_fn=<NllLossBackward>)



 92%|█████████▏| 320/346 [28:29<02:19,  5.36s/it][A

loss: tensor(0.9176, device='cuda:0', grad_fn=<NllLossBackward>)



 93%|█████████▎| 321/346 [28:34<02:13,  5.32s/it][A

loss: tensor(0.9120, device='cuda:0', grad_fn=<NllLossBackward>)



 93%|█████████▎| 322/346 [28:39<02:07,  5.31s/it][A

loss: tensor(0.9726, device='cuda:0', grad_fn=<NllLossBackward>)



 93%|█████████▎| 323/346 [28:44<02:01,  5.30s/it][A

loss: tensor(0.9698, device='cuda:0', grad_fn=<NllLossBackward>)



 94%|█████████▎| 324/346 [28:50<01:56,  5.31s/it][A

loss: tensor(0.9359, device='cuda:0', grad_fn=<NllLossBackward>)



 94%|█████████▍| 325/346 [28:55<01:51,  5.30s/it][A

loss: tensor(0.9575, device='cuda:0', grad_fn=<NllLossBackward>)



 94%|█████████▍| 326/346 [29:00<01:45,  5.29s/it][A

loss: tensor(0.9953, device='cuda:0', grad_fn=<NllLossBackward>)



 95%|█████████▍| 327/346 [29:05<01:40,  5.28s/it][A

loss: tensor(0.9266, device='cuda:0', grad_fn=<NllLossBackward>)



 95%|█████████▍| 328/346 [29:11<01:35,  5.30s/it][A

loss: tensor(0.9870, device='cuda:0', grad_fn=<NllLossBackward>)



 95%|█████████▌| 329/346 [29:16<01:29,  5.29s/it][A

loss: tensor(0.9679, device='cuda:0', grad_fn=<NllLossBackward>)



 95%|█████████▌| 330/346 [29:21<01:24,  5.29s/it][A

loss: tensor(0.9268, device='cuda:0', grad_fn=<NllLossBackward>)



 96%|█████████▌| 331/346 [29:27<01:19,  5.29s/it][A

loss: tensor(0.9390, device='cuda:0', grad_fn=<NllLossBackward>)



 96%|█████████▌| 332/346 [29:32<01:14,  5.31s/it][A

loss: tensor(0.9314, device='cuda:0', grad_fn=<NllLossBackward>)



 96%|█████████▌| 333/346 [29:37<01:08,  5.30s/it][A

loss: tensor(0.9447, device='cuda:0', grad_fn=<NllLossBackward>)



 97%|█████████▋| 334/346 [29:43<01:03,  5.29s/it][A

loss: tensor(0.9235, device='cuda:0', grad_fn=<NllLossBackward>)



 97%|█████████▋| 335/346 [29:48<00:58,  5.29s/it][A

loss: tensor(0.9370, device='cuda:0', grad_fn=<NllLossBackward>)



 97%|█████████▋| 336/346 [29:53<00:53,  5.31s/it][A

loss: tensor(0.9260, device='cuda:0', grad_fn=<NllLossBackward>)



 97%|█████████▋| 337/346 [29:59<00:47,  5.31s/it][A

loss: tensor(0.9288, device='cuda:0', grad_fn=<NllLossBackward>)



 98%|█████████▊| 338/346 [30:04<00:42,  5.30s/it][A

loss: tensor(0.9625, device='cuda:0', grad_fn=<NllLossBackward>)



 98%|█████████▊| 339/346 [30:09<00:37,  5.30s/it][A

loss: tensor(0.9723, device='cuda:0', grad_fn=<NllLossBackward>)



 98%|█████████▊| 340/346 [30:14<00:31,  5.32s/it][A

loss: tensor(0.9133, device='cuda:0', grad_fn=<NllLossBackward>)



 99%|█████████▊| 341/346 [30:20<00:26,  5.31s/it][A

loss: tensor(0.9387, device='cuda:0', grad_fn=<NllLossBackward>)



 99%|█████████▉| 342/346 [30:25<00:21,  5.31s/it][A

loss: tensor(0.9505, device='cuda:0', grad_fn=<NllLossBackward>)



 99%|█████████▉| 343/346 [30:30<00:15,  5.32s/it][A

loss: tensor(1.0212, device='cuda:0', grad_fn=<NllLossBackward>)



 99%|█████████▉| 344/346 [30:36<00:10,  5.30s/it][A

loss: tensor(0.9150, device='cuda:0', grad_fn=<NllLossBackward>)



100%|█████████▉| 345/346 [30:41<00:05,  5.31s/it][A

loss: tensor(0.9803, device='cuda:0', grad_fn=<NllLossBackward>)



100%|██████████| 346/346 [30:41<00:00,  5.32s/it][A

  0%|          | 0/173 [00:00<?, ?it/s][A

loss: tensor(0.9051, device='cuda:0', grad_fn=<NllLossBackward>)

	Training Loss: 0.9424191935558539

	Training acc: 0.9628052421436107

	Training prec: 0.927745250574533

	Training rec: 0.9628052421436107

	Training f1: 0.9447613740104533

	Current Learning rate:  4e-05



  1%|          | 1/173 [00:00<02:02,  1.40it/s][A
  1%|          | 2/173 [00:01<02:01,  1.41it/s][A
  2%|▏         | 3/173 [00:02<01:54,  1.48it/s][A
  2%|▏         | 4/173 [00:02<01:55,  1.46it/s][A
  3%|▎         | 5/173 [00:03<01:56,  1.44it/s][A
  3%|▎         | 6/173 [00:04<01:52,  1.49it/s][A
  4%|▍         | 7/173 [00:04<01:53,  1.46it/s][A
  5%|▍         | 8/173 [00:05<01:53,  1.45it/s][A
  5%|▌         | 9/173 [00:06<01:50,  1.49it/s][A
  6%|▌         | 10/173 [00:06<01:51,  1.46it/s][A
  6%|▋         | 11/173 [00:07<01:52,  1.45it/s][A
  7%|▋         | 12/173 [00:08<01:48,  1.49it/s][A
  8%|▊         | 13/173 [00:08<01:48,  1.47it/s][A
  8%|▊         | 14/173 [00:09<01:49,  1.45it/s][A
  9%|▊         | 15/173 [00:10<01:45,  1.49it/s][A
  9%|▉         | 16/173 [00:10<01:46,  1.47it/s][A
 10%|▉         | 17/173 [00:11<01:47,  1.46it/s][A
 10%|█         | 18/173 [00:12<01:43,  1.49it/s][A
 11%|█         | 19/173 [00:12<01:44,  1.47it/s][A
 12%|█▏        | 20/


	Validation Loss: 0.9449272331474834

	Validation acc: 0.9599589934027578

	Validation prec: 0.9230383841296567

	Validation rec: 0.9599589934027578

	Validation f1: 0.9407596485182091



  0%|          | 1/346 [00:05<31:14,  5.43s/it][A

loss: tensor(0.9372, device='cuda:0', grad_fn=<NllLossBackward>)



  1%|          | 2/346 [00:10<30:50,  5.38s/it][A

loss: tensor(0.9051, device='cuda:0', grad_fn=<NllLossBackward>)



  1%|          | 3/346 [00:16<30:35,  5.35s/it][A

loss: tensor(0.9167, device='cuda:0', grad_fn=<NllLossBackward>)



  1%|          | 4/346 [00:21<30:29,  5.35s/it][A

loss: tensor(0.9369, device='cuda:0', grad_fn=<NllLossBackward>)



  1%|▏         | 5/346 [00:26<30:28,  5.36s/it][A

loss: tensor(0.9544, device='cuda:0', grad_fn=<NllLossBackward>)



  2%|▏         | 6/346 [00:32<30:16,  5.34s/it][A

loss: tensor(0.9051, device='cuda:0', grad_fn=<NllLossBackward>)



  2%|▏         | 7/346 [00:37<30:04,  5.32s/it][A

loss: tensor(0.9255, device='cuda:0', grad_fn=<NllLossBackward>)



  2%|▏         | 8/346 [00:42<29:58,  5.32s/it][A

loss: tensor(0.9051, device='cuda:0', grad_fn=<NllLossBackward>)



  3%|▎         | 9/346 [00:48<29:55,  5.33s/it][A

loss: tensor(0.9186, device='cuda:0', grad_fn=<NllLossBackward>)



  3%|▎         | 10/346 [00:53<29:46,  5.32s/it][A

loss: tensor(0.9322, device='cuda:0', grad_fn=<NllLossBackward>)



  3%|▎         | 11/346 [00:58<29:38,  5.31s/it][A

loss: tensor(0.9505, device='cuda:0', grad_fn=<NllLossBackward>)



  3%|▎         | 12/346 [01:04<29:37,  5.32s/it][A

loss: tensor(0.9358, device='cuda:0', grad_fn=<NllLossBackward>)



  4%|▍         | 13/346 [01:09<29:26,  5.31s/it][A

loss: tensor(0.9938, device='cuda:0', grad_fn=<NllLossBackward>)



  4%|▍         | 14/346 [01:14<29:20,  5.30s/it][A

loss: tensor(0.9208, device='cuda:0', grad_fn=<NllLossBackward>)



  4%|▍         | 15/346 [01:19<29:15,  5.30s/it][A

loss: tensor(0.9538, device='cuda:0', grad_fn=<NllLossBackward>)



  5%|▍         | 16/346 [01:25<29:14,  5.32s/it][A

loss: tensor(0.9731, device='cuda:0', grad_fn=<NllLossBackward>)



  5%|▍         | 17/346 [01:30<29:09,  5.32s/it][A

loss: tensor(0.9222, device='cuda:0', grad_fn=<NllLossBackward>)



  5%|▌         | 18/346 [01:35<29:02,  5.31s/it][A

loss: tensor(0.9321, device='cuda:0', grad_fn=<NllLossBackward>)



  5%|▌         | 19/346 [01:41<28:55,  5.31s/it][A

loss: tensor(0.9051, device='cuda:0', grad_fn=<NllLossBackward>)



  6%|▌         | 20/346 [01:46<28:55,  5.32s/it][A

loss: tensor(0.9286, device='cuda:0', grad_fn=<NllLossBackward>)



  6%|▌         | 21/346 [01:51<28:45,  5.31s/it][A

loss: tensor(0.9315, device='cuda:0', grad_fn=<NllLossBackward>)



  6%|▋         | 22/346 [01:57<28:36,  5.30s/it][A

loss: tensor(0.9414, device='cuda:0', grad_fn=<NllLossBackward>)



  7%|▋         | 23/346 [02:02<28:30,  5.30s/it][A

loss: tensor(1.0055, device='cuda:0', grad_fn=<NllLossBackward>)



  7%|▋         | 24/346 [02:07<28:30,  5.31s/it][A

loss: tensor(0.9750, device='cuda:0', grad_fn=<NllLossBackward>)



  7%|▋         | 25/346 [02:12<28:20,  5.30s/it][A

loss: tensor(0.9662, device='cuda:0', grad_fn=<NllLossBackward>)



  8%|▊         | 26/346 [02:18<28:15,  5.30s/it][A

loss: tensor(0.9419, device='cuda:0', grad_fn=<NllLossBackward>)



  8%|▊         | 27/346 [02:23<28:08,  5.29s/it][A

loss: tensor(0.9487, device='cuda:0', grad_fn=<NllLossBackward>)



  8%|▊         | 28/346 [02:28<28:12,  5.32s/it][A

loss: tensor(1.0032, device='cuda:0', grad_fn=<NllLossBackward>)



  8%|▊         | 29/346 [02:34<28:06,  5.32s/it][A

loss: tensor(0.9312, device='cuda:0', grad_fn=<NllLossBackward>)



  9%|▊         | 30/346 [02:39<27:59,  5.31s/it][A

loss: tensor(0.9151, device='cuda:0', grad_fn=<NllLossBackward>)



  9%|▉         | 31/346 [02:44<28:00,  5.33s/it][A

loss: tensor(0.9316, device='cuda:0', grad_fn=<NllLossBackward>)



  9%|▉         | 32/346 [02:50<27:50,  5.32s/it][A

loss: tensor(0.9187, device='cuda:0', grad_fn=<NllLossBackward>)



 10%|▉         | 33/346 [02:55<27:39,  5.30s/it][A

loss: tensor(0.9363, device='cuda:0', grad_fn=<NllLossBackward>)



 10%|▉         | 34/346 [03:00<27:33,  5.30s/it][A

loss: tensor(0.9495, device='cuda:0', grad_fn=<NllLossBackward>)



 10%|█         | 35/346 [03:06<27:31,  5.31s/it][A

loss: tensor(0.9907, device='cuda:0', grad_fn=<NllLossBackward>)



 10%|█         | 36/346 [03:11<27:23,  5.30s/it][A

loss: tensor(0.9583, device='cuda:0', grad_fn=<NllLossBackward>)



 11%|█         | 37/346 [03:16<27:20,  5.31s/it][A

loss: tensor(1.0204, device='cuda:0', grad_fn=<NllLossBackward>)



 11%|█         | 38/346 [03:22<27:14,  5.31s/it][A

loss: tensor(0.9295, device='cuda:0', grad_fn=<NllLossBackward>)



 11%|█▏        | 39/346 [03:27<27:11,  5.31s/it][A

loss: tensor(0.9051, device='cuda:0', grad_fn=<NllLossBackward>)



 12%|█▏        | 40/346 [03:32<27:03,  5.31s/it][A

loss: tensor(0.9333, device='cuda:0', grad_fn=<NllLossBackward>)



 12%|█▏        | 41/346 [03:37<26:53,  5.29s/it][A

loss: tensor(0.9163, device='cuda:0', grad_fn=<NllLossBackward>)



 12%|█▏        | 42/346 [03:43<26:46,  5.28s/it][A

loss: tensor(0.9051, device='cuda:0', grad_fn=<NllLossBackward>)



 12%|█▏        | 43/346 [03:48<26:47,  5.31s/it][A

loss: tensor(0.9579, device='cuda:0', grad_fn=<NllLossBackward>)



 13%|█▎        | 44/346 [03:53<26:40,  5.30s/it][A

loss: tensor(0.9051, device='cuda:0', grad_fn=<NllLossBackward>)



 13%|█▎        | 45/346 [03:59<26:33,  5.29s/it][A

loss: tensor(0.9186, device='cuda:0', grad_fn=<NllLossBackward>)



 13%|█▎        | 46/346 [04:04<26:28,  5.29s/it][A

loss: tensor(0.9471, device='cuda:0', grad_fn=<NllLossBackward>)



 14%|█▎        | 47/346 [04:09<26:28,  5.31s/it][A

loss: tensor(0.9322, device='cuda:0', grad_fn=<NllLossBackward>)



 14%|█▍        | 48/346 [04:14<26:19,  5.30s/it][A

loss: tensor(0.9375, device='cuda:0', grad_fn=<NllLossBackward>)



 14%|█▍        | 49/346 [04:20<26:14,  5.30s/it][A

loss: tensor(0.9937, device='cuda:0', grad_fn=<NllLossBackward>)



 14%|█▍        | 50/346 [04:25<26:09,  5.30s/it][A

loss: tensor(0.9568, device='cuda:0', grad_fn=<NllLossBackward>)



 15%|█▍        | 51/346 [04:30<26:09,  5.32s/it][A

loss: tensor(0.9606, device='cuda:0', grad_fn=<NllLossBackward>)



 15%|█▌        | 52/346 [04:36<26:03,  5.32s/it][A

loss: tensor(0.9875, device='cuda:0', grad_fn=<NllLossBackward>)



 15%|█▌        | 53/346 [04:41<25:55,  5.31s/it][A

loss: tensor(0.9406, device='cuda:0', grad_fn=<NllLossBackward>)



 16%|█▌        | 54/346 [04:46<25:54,  5.32s/it][A

loss: tensor(0.9403, device='cuda:0', grad_fn=<NllLossBackward>)



 16%|█▌        | 55/346 [04:52<25:46,  5.31s/it][A

loss: tensor(0.9545, device='cuda:0', grad_fn=<NllLossBackward>)



 16%|█▌        | 56/346 [04:57<25:36,  5.30s/it][A

loss: tensor(0.9335, device='cuda:0', grad_fn=<NllLossBackward>)



 16%|█▋        | 57/346 [05:02<25:31,  5.30s/it][A

loss: tensor(0.9871, device='cuda:0', grad_fn=<NllLossBackward>)



 17%|█▋        | 58/346 [05:08<25:30,  5.32s/it][A

loss: tensor(0.9222, device='cuda:0', grad_fn=<NllLossBackward>)



 17%|█▋        | 59/346 [05:13<25:22,  5.30s/it][A

loss: tensor(0.9166, device='cuda:0', grad_fn=<NllLossBackward>)



 17%|█▋        | 60/346 [05:18<25:17,  5.31s/it][A

loss: tensor(0.9756, device='cuda:0', grad_fn=<NllLossBackward>)



 18%|█▊        | 61/346 [05:23<25:10,  5.30s/it][A

loss: tensor(1.0039, device='cuda:0', grad_fn=<NllLossBackward>)



 18%|█▊        | 62/346 [05:29<25:07,  5.31s/it][A

loss: tensor(0.9051, device='cuda:0', grad_fn=<NllLossBackward>)



 18%|█▊        | 63/346 [05:34<25:00,  5.30s/it][A

loss: tensor(0.9349, device='cuda:0', grad_fn=<NllLossBackward>)



 18%|█▊        | 64/346 [05:39<24:54,  5.30s/it][A

loss: tensor(0.9636, device='cuda:0', grad_fn=<NllLossBackward>)



 19%|█▉        | 65/346 [05:45<24:47,  5.29s/it][A

loss: tensor(0.9518, device='cuda:0', grad_fn=<NllLossBackward>)



 19%|█▉        | 66/346 [05:50<24:48,  5.32s/it][A

loss: tensor(0.9309, device='cuda:0', grad_fn=<NllLossBackward>)



 19%|█▉        | 67/346 [05:55<24:38,  5.30s/it][A

loss: tensor(0.9368, device='cuda:0', grad_fn=<NllLossBackward>)



 20%|█▉        | 68/346 [06:01<24:32,  5.30s/it][A

loss: tensor(0.9313, device='cuda:0', grad_fn=<NllLossBackward>)



 20%|█▉        | 69/346 [06:06<24:25,  5.29s/it][A

loss: tensor(0.9693, device='cuda:0', grad_fn=<NllLossBackward>)



 20%|██        | 70/346 [06:11<24:24,  5.31s/it][A

loss: tensor(0.9404, device='cuda:0', grad_fn=<NllLossBackward>)



 21%|██        | 71/346 [06:16<24:15,  5.29s/it][A

loss: tensor(0.9377, device='cuda:0', grad_fn=<NllLossBackward>)



 21%|██        | 72/346 [06:22<24:10,  5.29s/it][A

loss: tensor(0.9622, device='cuda:0', grad_fn=<NllLossBackward>)



 21%|██        | 73/346 [06:27<24:04,  5.29s/it][A

loss: tensor(0.9893, device='cuda:0', grad_fn=<NllLossBackward>)



 21%|██▏       | 74/346 [06:32<24:03,  5.31s/it][A

loss: tensor(0.9751, device='cuda:0', grad_fn=<NllLossBackward>)



 22%|██▏       | 75/346 [06:38<23:55,  5.30s/it][A

loss: tensor(0.9731, device='cuda:0', grad_fn=<NllLossBackward>)



 22%|██▏       | 76/346 [06:43<23:48,  5.29s/it][A

loss: tensor(0.9050, device='cuda:0', grad_fn=<NllLossBackward>)



 22%|██▏       | 77/346 [06:48<23:42,  5.29s/it][A

loss: tensor(0.9050, device='cuda:0', grad_fn=<NllLossBackward>)



 23%|██▎       | 78/346 [06:54<23:41,  5.30s/it][A

loss: tensor(0.9051, device='cuda:0', grad_fn=<NllLossBackward>)



 23%|██▎       | 79/346 [06:59<23:33,  5.30s/it][A

loss: tensor(0.9500, device='cuda:0', grad_fn=<NllLossBackward>)



 23%|██▎       | 80/346 [07:04<23:27,  5.29s/it][A

loss: tensor(0.9050, device='cuda:0', grad_fn=<NllLossBackward>)



 23%|██▎       | 81/346 [07:09<23:25,  5.30s/it][A

loss: tensor(0.9448, device='cuda:0', grad_fn=<NllLossBackward>)



 24%|██▎       | 82/346 [07:15<23:16,  5.29s/it][A

loss: tensor(0.9153, device='cuda:0', grad_fn=<NllLossBackward>)



 24%|██▍       | 83/346 [07:20<23:10,  5.29s/it][A

loss: tensor(0.9302, device='cuda:0', grad_fn=<NllLossBackward>)



 24%|██▍       | 84/346 [07:25<23:01,  5.27s/it][A

loss: tensor(0.9409, device='cuda:0', grad_fn=<NllLossBackward>)



 25%|██▍       | 85/346 [07:31<22:59,  5.28s/it][A

loss: tensor(0.9233, device='cuda:0', grad_fn=<NllLossBackward>)



 25%|██▍       | 86/346 [07:36<22:48,  5.26s/it][A

loss: tensor(0.9567, device='cuda:0', grad_fn=<NllLossBackward>)



 25%|██▌       | 87/346 [07:41<22:41,  5.26s/it][A

loss: tensor(0.9341, device='cuda:0', grad_fn=<NllLossBackward>)



 25%|██▌       | 88/346 [07:46<22:34,  5.25s/it][A

loss: tensor(0.9215, device='cuda:0', grad_fn=<NllLossBackward>)



 26%|██▌       | 89/346 [07:52<22:34,  5.27s/it][A

loss: tensor(0.9618, device='cuda:0', grad_fn=<NllLossBackward>)



 26%|██▌       | 90/346 [07:57<22:26,  5.26s/it][A

loss: tensor(0.9050, device='cuda:0', grad_fn=<NllLossBackward>)



 26%|██▋       | 91/346 [08:02<22:20,  5.26s/it][A

loss: tensor(0.9177, device='cuda:0', grad_fn=<NllLossBackward>)



 27%|██▋       | 92/346 [08:07<22:12,  5.25s/it][A

loss: tensor(0.9635, device='cuda:0', grad_fn=<NllLossBackward>)



 27%|██▋       | 93/346 [08:13<22:12,  5.27s/it][A

loss: tensor(0.9259, device='cuda:0', grad_fn=<NllLossBackward>)



 27%|██▋       | 94/346 [08:18<22:03,  5.25s/it][A

loss: tensor(0.9505, device='cuda:0', grad_fn=<NllLossBackward>)



 27%|██▋       | 95/346 [08:23<21:58,  5.25s/it][A

loss: tensor(0.9513, device='cuda:0', grad_fn=<NllLossBackward>)



 28%|██▊       | 96/346 [08:28<21:50,  5.24s/it][A

loss: tensor(0.9664, device='cuda:0', grad_fn=<NllLossBackward>)



 28%|██▊       | 97/346 [08:34<21:49,  5.26s/it][A

loss: tensor(0.9167, device='cuda:0', grad_fn=<NllLossBackward>)



 28%|██▊       | 98/346 [08:39<21:40,  5.25s/it][A

loss: tensor(0.9273, device='cuda:0', grad_fn=<NllLossBackward>)



 29%|██▊       | 99/346 [08:44<21:33,  5.24s/it][A

loss: tensor(0.9283, device='cuda:0', grad_fn=<NllLossBackward>)



 29%|██▉       | 100/346 [08:49<21:28,  5.24s/it][A

loss: tensor(0.9478, device='cuda:0', grad_fn=<NllLossBackward>)



 29%|██▉       | 101/346 [08:55<21:26,  5.25s/it][A

loss: tensor(0.9150, device='cuda:0', grad_fn=<NllLossBackward>)



 29%|██▉       | 102/346 [09:00<21:17,  5.24s/it][A

loss: tensor(0.9050, device='cuda:0', grad_fn=<NllLossBackward>)



 30%|██▉       | 103/346 [09:05<21:12,  5.24s/it][A

loss: tensor(0.9229, device='cuda:0', grad_fn=<NllLossBackward>)



 30%|███       | 104/346 [09:10<21:11,  5.25s/it][A

loss: tensor(0.9130, device='cuda:0', grad_fn=<NllLossBackward>)



 30%|███       | 105/346 [09:15<21:02,  5.24s/it][A

loss: tensor(0.9326, device='cuda:0', grad_fn=<NllLossBackward>)



 31%|███       | 106/346 [09:21<20:55,  5.23s/it][A

loss: tensor(0.9147, device='cuda:0', grad_fn=<NllLossBackward>)



 31%|███       | 107/346 [09:26<20:51,  5.24s/it][A

loss: tensor(0.9307, device='cuda:0', grad_fn=<NllLossBackward>)



 31%|███       | 108/346 [09:31<20:48,  5.25s/it][A

loss: tensor(0.9398, device='cuda:0', grad_fn=<NllLossBackward>)



 32%|███▏      | 109/346 [09:36<20:39,  5.23s/it][A

loss: tensor(0.9256, device='cuda:0', grad_fn=<NllLossBackward>)



 32%|███▏      | 110/346 [09:42<20:35,  5.23s/it][A

loss: tensor(0.9550, device='cuda:0', grad_fn=<NllLossBackward>)



 32%|███▏      | 111/346 [09:47<20:28,  5.23s/it][A

loss: tensor(0.9651, device='cuda:0', grad_fn=<NllLossBackward>)



 32%|███▏      | 112/346 [09:52<20:28,  5.25s/it][A

loss: tensor(0.9823, device='cuda:0', grad_fn=<NllLossBackward>)



 33%|███▎      | 113/346 [09:57<20:21,  5.24s/it][A

loss: tensor(0.9197, device='cuda:0', grad_fn=<NllLossBackward>)



 33%|███▎      | 114/346 [10:03<20:15,  5.24s/it][A

loss: tensor(0.9362, device='cuda:0', grad_fn=<NllLossBackward>)



 33%|███▎      | 115/346 [10:08<20:08,  5.23s/it][A

loss: tensor(0.9861, device='cuda:0', grad_fn=<NllLossBackward>)



 34%|███▎      | 116/346 [10:13<20:07,  5.25s/it][A

loss: tensor(0.9315, device='cuda:0', grad_fn=<NllLossBackward>)



 34%|███▍      | 117/346 [10:18<20:01,  5.25s/it][A

loss: tensor(0.9164, device='cuda:0', grad_fn=<NllLossBackward>)



 34%|███▍      | 118/346 [10:24<20:00,  5.26s/it][A

loss: tensor(0.9465, device='cuda:0', grad_fn=<NllLossBackward>)



 34%|███▍      | 119/346 [10:29<19:53,  5.26s/it][A

loss: tensor(0.9476, device='cuda:0', grad_fn=<NllLossBackward>)



 35%|███▍      | 120/346 [10:34<19:51,  5.27s/it][A

loss: tensor(0.9232, device='cuda:0', grad_fn=<NllLossBackward>)



 35%|███▍      | 121/346 [10:39<19:42,  5.26s/it][A

loss: tensor(0.9869, device='cuda:0', grad_fn=<NllLossBackward>)



 35%|███▌      | 122/346 [10:45<19:34,  5.25s/it][A

loss: tensor(0.9381, device='cuda:0', grad_fn=<NllLossBackward>)



 36%|███▌      | 123/346 [10:50<19:32,  5.26s/it][A

loss: tensor(0.9490, device='cuda:0', grad_fn=<NllLossBackward>)



 36%|███▌      | 124/346 [10:55<19:25,  5.25s/it][A

loss: tensor(0.9210, device='cuda:0', grad_fn=<NllLossBackward>)



 36%|███▌      | 125/346 [11:00<19:17,  5.24s/it][A

loss: tensor(0.9504, device='cuda:0', grad_fn=<NllLossBackward>)



 36%|███▋      | 126/346 [11:06<19:11,  5.24s/it][A

loss: tensor(0.9267, device='cuda:0', grad_fn=<NllLossBackward>)



 37%|███▋      | 127/346 [11:11<19:09,  5.25s/it][A

loss: tensor(0.9266, device='cuda:0', grad_fn=<NllLossBackward>)



 37%|███▋      | 128/346 [11:16<19:00,  5.23s/it][A

loss: tensor(0.9262, device='cuda:0', grad_fn=<NllLossBackward>)



 37%|███▋      | 129/346 [11:21<18:55,  5.23s/it][A

loss: tensor(0.9140, device='cuda:0', grad_fn=<NllLossBackward>)



 38%|███▊      | 130/346 [11:27<18:49,  5.23s/it][A

loss: tensor(0.9226, device='cuda:0', grad_fn=<NllLossBackward>)



 38%|███▊      | 131/346 [11:32<18:48,  5.25s/it][A

loss: tensor(0.9246, device='cuda:0', grad_fn=<NllLossBackward>)



 38%|███▊      | 132/346 [11:37<18:41,  5.24s/it][A

loss: tensor(0.9360, device='cuda:0', grad_fn=<NllLossBackward>)



 38%|███▊      | 133/346 [11:42<18:36,  5.24s/it][A

loss: tensor(0.9241, device='cuda:0', grad_fn=<NllLossBackward>)



 39%|███▊      | 134/346 [11:48<18:29,  5.23s/it][A

loss: tensor(0.9353, device='cuda:0', grad_fn=<NllLossBackward>)



 39%|███▉      | 135/346 [11:53<18:30,  5.26s/it][A

loss: tensor(0.9430, device='cuda:0', grad_fn=<NllLossBackward>)



 39%|███▉      | 136/346 [11:58<18:35,  5.31s/it][A

loss: tensor(0.9483, device='cuda:0', grad_fn=<NllLossBackward>)



 40%|███▉      | 137/346 [12:04<19:16,  5.53s/it][A

loss: tensor(0.9688, device='cuda:0', grad_fn=<NllLossBackward>)



 40%|███▉      | 138/346 [12:10<19:21,  5.58s/it][A

loss: tensor(0.9187, device='cuda:0', grad_fn=<NllLossBackward>)



 40%|████      | 139/346 [12:16<19:30,  5.65s/it][A

loss: tensor(0.9352, device='cuda:0', grad_fn=<NllLossBackward>)



 40%|████      | 140/346 [12:21<19:20,  5.63s/it][A

loss: tensor(0.9450, device='cuda:0', grad_fn=<NllLossBackward>)



 41%|████      | 141/346 [12:27<19:24,  5.68s/it][A

loss: tensor(0.9378, device='cuda:0', grad_fn=<NllLossBackward>)



 41%|████      | 142/346 [12:33<19:06,  5.62s/it][A

loss: tensor(0.9197, device='cuda:0', grad_fn=<NllLossBackward>)



 41%|████▏     | 143/346 [12:38<18:57,  5.61s/it][A

loss: tensor(0.9250, device='cuda:0', grad_fn=<NllLossBackward>)



 42%|████▏     | 144/346 [12:44<18:47,  5.58s/it][A

loss: tensor(0.9121, device='cuda:0', grad_fn=<NllLossBackward>)



 42%|████▏     | 145/346 [12:49<18:27,  5.51s/it][A

loss: tensor(0.9569, device='cuda:0', grad_fn=<NllLossBackward>)



 42%|████▏     | 146/346 [12:54<18:11,  5.46s/it][A

loss: tensor(0.9178, device='cuda:0', grad_fn=<NllLossBackward>)



 42%|████▏     | 147/346 [13:00<18:02,  5.44s/it][A

loss: tensor(0.9389, device='cuda:0', grad_fn=<NllLossBackward>)



 43%|████▎     | 148/346 [13:05<17:51,  5.41s/it][A

loss: tensor(0.9292, device='cuda:0', grad_fn=<NllLossBackward>)



 43%|████▎     | 149/346 [13:11<17:41,  5.39s/it][A

loss: tensor(0.9472, device='cuda:0', grad_fn=<NllLossBackward>)



 43%|████▎     | 150/346 [13:16<17:40,  5.41s/it][A

loss: tensor(0.9800, device='cuda:0', grad_fn=<NllLossBackward>)



 44%|████▎     | 151/346 [13:21<17:33,  5.40s/it][A

loss: tensor(0.9441, device='cuda:0', grad_fn=<NllLossBackward>)



 44%|████▍     | 152/346 [13:27<17:29,  5.41s/it][A

loss: tensor(0.9050, device='cuda:0', grad_fn=<NllLossBackward>)



 44%|████▍     | 153/346 [13:32<17:25,  5.42s/it][A

loss: tensor(0.9234, device='cuda:0', grad_fn=<NllLossBackward>)



 45%|████▍     | 154/346 [13:38<17:21,  5.42s/it][A

loss: tensor(0.9362, device='cuda:0', grad_fn=<NllLossBackward>)



 45%|████▍     | 155/346 [13:43<17:14,  5.42s/it][A

loss: tensor(0.9386, device='cuda:0', grad_fn=<NllLossBackward>)



 45%|████▌     | 156/346 [13:49<17:16,  5.45s/it][A

loss: tensor(0.9582, device='cuda:0', grad_fn=<NllLossBackward>)



 45%|████▌     | 157/346 [13:54<17:16,  5.48s/it][A

loss: tensor(0.9265, device='cuda:0', grad_fn=<NllLossBackward>)



 46%|████▌     | 158/346 [14:00<17:21,  5.54s/it][A

loss: tensor(0.9599, device='cuda:0', grad_fn=<NllLossBackward>)



 46%|████▌     | 159/346 [14:05<17:18,  5.55s/it][A

loss: tensor(0.9379, device='cuda:0', grad_fn=<NllLossBackward>)



 46%|████▌     | 160/346 [14:11<17:11,  5.55s/it][A

loss: tensor(0.9560, device='cuda:0', grad_fn=<NllLossBackward>)



 47%|████▋     | 161/346 [14:16<17:00,  5.52s/it][A

loss: tensor(0.9195, device='cuda:0', grad_fn=<NllLossBackward>)



 47%|████▋     | 162/346 [14:22<16:51,  5.50s/it][A

loss: tensor(0.9129, device='cuda:0', grad_fn=<NllLossBackward>)



 47%|████▋     | 163/346 [14:27<16:37,  5.45s/it][A

loss: tensor(0.9800, device='cuda:0', grad_fn=<NllLossBackward>)



 47%|████▋     | 164/346 [14:33<16:25,  5.42s/it][A

loss: tensor(0.9240, device='cuda:0', grad_fn=<NllLossBackward>)



 48%|████▊     | 165/346 [14:38<16:15,  5.39s/it][A

loss: tensor(0.9362, device='cuda:0', grad_fn=<NllLossBackward>)



 48%|████▊     | 166/346 [14:43<16:09,  5.38s/it][A

loss: tensor(0.9151, device='cuda:0', grad_fn=<NllLossBackward>)



 48%|████▊     | 167/346 [14:49<15:59,  5.36s/it][A

loss: tensor(0.9411, device='cuda:0', grad_fn=<NllLossBackward>)



 49%|████▊     | 168/346 [14:54<15:52,  5.35s/it][A

loss: tensor(0.9483, device='cuda:0', grad_fn=<NllLossBackward>)



 49%|████▉     | 169/346 [14:59<15:51,  5.38s/it][A

loss: tensor(0.9270, device='cuda:0', grad_fn=<NllLossBackward>)



 49%|████▉     | 170/346 [15:05<15:53,  5.42s/it][A

loss: tensor(0.9731, device='cuda:0', grad_fn=<NllLossBackward>)



 49%|████▉     | 171/346 [15:10<15:58,  5.48s/it][A

loss: tensor(0.9502, device='cuda:0', grad_fn=<NllLossBackward>)



 50%|████▉     | 172/346 [15:16<16:01,  5.52s/it][A

loss: tensor(0.9050, device='cuda:0', grad_fn=<NllLossBackward>)



 50%|█████     | 173/346 [15:22<16:11,  5.61s/it][A

loss: tensor(0.9389, device='cuda:0', grad_fn=<NllLossBackward>)



 50%|█████     | 174/346 [15:28<16:15,  5.67s/it][A

loss: tensor(0.9469, device='cuda:0', grad_fn=<NllLossBackward>)



 51%|█████     | 175/346 [15:33<16:06,  5.65s/it][A

loss: tensor(1.0009, device='cuda:0', grad_fn=<NllLossBackward>)



 51%|█████     | 176/346 [15:39<16:09,  5.70s/it][A

loss: tensor(0.9866, device='cuda:0', grad_fn=<NllLossBackward>)



 51%|█████     | 177/346 [15:45<16:12,  5.76s/it][A

loss: tensor(0.9675, device='cuda:0', grad_fn=<NllLossBackward>)



 51%|█████▏    | 178/346 [15:51<16:25,  5.86s/it][A

loss: tensor(0.9633, device='cuda:0', grad_fn=<NllLossBackward>)



 52%|█████▏    | 179/346 [15:57<16:34,  5.95s/it][A

loss: tensor(0.9290, device='cuda:0', grad_fn=<NllLossBackward>)



 52%|█████▏    | 180/346 [16:03<15:56,  5.76s/it][A

loss: tensor(0.9446, device='cuda:0', grad_fn=<NllLossBackward>)



 52%|█████▏    | 181/346 [16:08<15:30,  5.64s/it][A

loss: tensor(0.9245, device='cuda:0', grad_fn=<NllLossBackward>)



 53%|█████▎    | 182/346 [16:13<15:05,  5.52s/it][A

loss: tensor(0.9783, device='cuda:0', grad_fn=<NllLossBackward>)



 53%|█████▎    | 183/346 [16:18<14:47,  5.44s/it][A

loss: tensor(0.9563, device='cuda:0', grad_fn=<NllLossBackward>)



 53%|█████▎    | 184/346 [16:24<14:34,  5.40s/it][A

loss: tensor(0.9509, device='cuda:0', grad_fn=<NllLossBackward>)



 53%|█████▎    | 185/346 [16:29<14:26,  5.38s/it][A

loss: tensor(0.9390, device='cuda:0', grad_fn=<NllLossBackward>)



 54%|█████▍    | 186/346 [16:34<14:15,  5.35s/it][A

loss: tensor(0.9832, device='cuda:0', grad_fn=<NllLossBackward>)



 54%|█████▍    | 187/346 [16:40<14:06,  5.32s/it][A

loss: tensor(0.9541, device='cuda:0', grad_fn=<NllLossBackward>)



 54%|█████▍    | 188/346 [16:45<13:58,  5.31s/it][A

loss: tensor(1.0050, device='cuda:0', grad_fn=<NllLossBackward>)



 55%|█████▍    | 189/346 [16:50<13:54,  5.32s/it][A

loss: tensor(0.9137, device='cuda:0', grad_fn=<NllLossBackward>)



 55%|█████▍    | 190/346 [16:56<13:47,  5.30s/it][A

loss: tensor(0.9383, device='cuda:0', grad_fn=<NllLossBackward>)



 55%|█████▌    | 191/346 [17:01<13:39,  5.29s/it][A

loss: tensor(0.9157, device='cuda:0', grad_fn=<NllLossBackward>)



 55%|█████▌    | 192/346 [17:06<13:34,  5.29s/it][A

loss: tensor(0.9219, device='cuda:0', grad_fn=<NllLossBackward>)



 56%|█████▌    | 193/346 [17:12<14:08,  5.54s/it][A

loss: tensor(0.9374, device='cuda:0', grad_fn=<NllLossBackward>)



 56%|█████▌    | 194/346 [17:18<14:28,  5.71s/it][A

loss: tensor(0.9633, device='cuda:0', grad_fn=<NllLossBackward>)



 56%|█████▋    | 195/346 [17:25<14:58,  5.95s/it][A

loss: tensor(0.9050, device='cuda:0', grad_fn=<NllLossBackward>)



 57%|█████▋    | 196/346 [17:31<14:48,  5.92s/it][A

loss: tensor(0.9447, device='cuda:0', grad_fn=<NllLossBackward>)



 57%|█████▋    | 197/346 [17:36<14:37,  5.89s/it][A

loss: tensor(0.9407, device='cuda:0', grad_fn=<NllLossBackward>)



 57%|█████▋    | 198/346 [17:42<14:30,  5.88s/it][A

loss: tensor(0.9520, device='cuda:0', grad_fn=<NllLossBackward>)



 58%|█████▊    | 199/346 [17:48<14:15,  5.82s/it][A

loss: tensor(0.9360, device='cuda:0', grad_fn=<NllLossBackward>)



 58%|█████▊    | 200/346 [17:54<14:04,  5.78s/it][A

loss: tensor(0.9050, device='cuda:0', grad_fn=<NllLossBackward>)



 58%|█████▊    | 201/346 [17:59<13:45,  5.69s/it][A

loss: tensor(0.9050, device='cuda:0', grad_fn=<NllLossBackward>)



 58%|█████▊    | 202/346 [18:05<13:37,  5.68s/it][A

loss: tensor(0.9308, device='cuda:0', grad_fn=<NllLossBackward>)



 59%|█████▊    | 203/346 [18:11<13:39,  5.73s/it][A

loss: tensor(0.9351, device='cuda:0', grad_fn=<NllLossBackward>)



 59%|█████▉    | 204/346 [18:16<13:35,  5.75s/it][A

loss: tensor(0.9547, device='cuda:0', grad_fn=<NllLossBackward>)



 59%|█████▉    | 205/346 [18:22<13:37,  5.80s/it][A

loss: tensor(0.9292, device='cuda:0', grad_fn=<NllLossBackward>)



 60%|█████▉    | 206/346 [18:28<13:23,  5.74s/it][A

loss: tensor(0.9050, device='cuda:0', grad_fn=<NllLossBackward>)



 60%|█████▉    | 207/346 [18:34<13:15,  5.72s/it][A

loss: tensor(0.9050, device='cuda:0', grad_fn=<NllLossBackward>)



 60%|██████    | 208/346 [18:40<13:31,  5.88s/it][A

loss: tensor(0.9422, device='cuda:0', grad_fn=<NllLossBackward>)



 60%|██████    | 209/346 [18:46<13:40,  5.99s/it][A

loss: tensor(0.9050, device='cuda:0', grad_fn=<NllLossBackward>)



 61%|██████    | 210/346 [18:52<13:25,  5.92s/it][A

loss: tensor(0.9819, device='cuda:0', grad_fn=<NllLossBackward>)



 61%|██████    | 211/346 [18:58<13:14,  5.89s/it][A

loss: tensor(0.9576, device='cuda:0', grad_fn=<NllLossBackward>)



 61%|██████▏   | 212/346 [19:03<12:59,  5.82s/it][A

loss: tensor(0.9522, device='cuda:0', grad_fn=<NllLossBackward>)



 62%|██████▏   | 213/346 [19:09<12:50,  5.79s/it][A

loss: tensor(0.9050, device='cuda:0', grad_fn=<NllLossBackward>)



 62%|██████▏   | 214/346 [19:15<12:39,  5.75s/it][A

loss: tensor(0.9593, device='cuda:0', grad_fn=<NllLossBackward>)



 62%|██████▏   | 215/346 [19:21<12:33,  5.75s/it][A

loss: tensor(0.9364, device='cuda:0', grad_fn=<NllLossBackward>)



 62%|██████▏   | 216/346 [19:26<12:31,  5.78s/it][A

loss: tensor(0.9188, device='cuda:0', grad_fn=<NllLossBackward>)



 63%|██████▎   | 217/346 [19:32<12:20,  5.74s/it][A

loss: tensor(0.9355, device='cuda:0', grad_fn=<NllLossBackward>)



 63%|██████▎   | 218/346 [19:38<12:16,  5.75s/it][A

loss: tensor(0.9170, device='cuda:0', grad_fn=<NllLossBackward>)



 63%|██████▎   | 219/346 [19:44<12:16,  5.80s/it][A

loss: tensor(0.9470, device='cuda:0', grad_fn=<NllLossBackward>)



 64%|██████▎   | 220/346 [19:49<11:59,  5.71s/it][A

loss: tensor(0.9625, device='cuda:0', grad_fn=<NllLossBackward>)



 64%|██████▍   | 221/346 [19:55<12:12,  5.86s/it][A

loss: tensor(0.9050, device='cuda:0', grad_fn=<NllLossBackward>)



 64%|██████▍   | 222/346 [20:02<12:19,  5.96s/it][A

loss: tensor(0.9982, device='cuda:0', grad_fn=<NllLossBackward>)



 64%|██████▍   | 223/346 [20:08<12:15,  5.98s/it][A

loss: tensor(0.9496, device='cuda:0', grad_fn=<NllLossBackward>)



 65%|██████▍   | 224/346 [20:13<11:50,  5.82s/it][A

loss: tensor(0.9917, device='cuda:0', grad_fn=<NllLossBackward>)



 65%|██████▌   | 225/346 [20:19<11:31,  5.71s/it][A

loss: tensor(0.9591, device='cuda:0', grad_fn=<NllLossBackward>)



 65%|██████▌   | 226/346 [20:24<11:13,  5.61s/it][A

loss: tensor(0.9605, device='cuda:0', grad_fn=<NllLossBackward>)



 66%|██████▌   | 227/346 [20:29<10:56,  5.52s/it][A

loss: tensor(0.9050, device='cuda:0', grad_fn=<NllLossBackward>)



 66%|██████▌   | 228/346 [20:35<10:46,  5.48s/it][A

loss: tensor(0.9276, device='cuda:0', grad_fn=<NllLossBackward>)



 66%|██████▌   | 229/346 [20:40<10:39,  5.47s/it][A

loss: tensor(0.9925, device='cuda:0', grad_fn=<NllLossBackward>)



 66%|██████▋   | 230/346 [20:46<10:32,  5.45s/it][A

loss: tensor(0.9747, device='cuda:0', grad_fn=<NllLossBackward>)



 67%|██████▋   | 231/346 [20:51<10:26,  5.45s/it][A

loss: tensor(0.9517, device='cuda:0', grad_fn=<NllLossBackward>)



 67%|██████▋   | 232/346 [20:56<10:17,  5.42s/it][A

loss: tensor(0.9781, device='cuda:0', grad_fn=<NllLossBackward>)



 67%|██████▋   | 233/346 [21:02<10:11,  5.41s/it][A

loss: tensor(0.9163, device='cuda:0', grad_fn=<NllLossBackward>)



 68%|██████▊   | 234/346 [21:07<10:07,  5.42s/it][A

loss: tensor(0.9576, device='cuda:0', grad_fn=<NllLossBackward>)



 68%|██████▊   | 235/346 [21:13<10:03,  5.44s/it][A

loss: tensor(0.9300, device='cuda:0', grad_fn=<NllLossBackward>)



 68%|██████▊   | 236/346 [21:18<09:56,  5.43s/it][A

loss: tensor(0.9189, device='cuda:0', grad_fn=<NllLossBackward>)



 68%|██████▊   | 237/346 [21:23<09:50,  5.42s/it][A

loss: tensor(0.9280, device='cuda:0', grad_fn=<NllLossBackward>)



 69%|██████▉   | 238/346 [21:29<09:44,  5.41s/it][A

loss: tensor(0.9939, device='cuda:0', grad_fn=<NllLossBackward>)



 69%|██████▉   | 239/346 [21:34<09:39,  5.41s/it][A

loss: tensor(0.9605, device='cuda:0', grad_fn=<NllLossBackward>)



 69%|██████▉   | 240/346 [21:40<09:33,  5.41s/it][A

loss: tensor(0.9562, device='cuda:0', grad_fn=<NllLossBackward>)



 70%|██████▉   | 241/346 [21:45<09:28,  5.41s/it][A

loss: tensor(0.9050, device='cuda:0', grad_fn=<NllLossBackward>)



 70%|██████▉   | 242/346 [21:51<09:24,  5.43s/it][A

loss: tensor(0.9587, device='cuda:0', grad_fn=<NllLossBackward>)



 70%|███████   | 243/346 [21:56<09:17,  5.41s/it][A

loss: tensor(0.9230, device='cuda:0', grad_fn=<NllLossBackward>)



 71%|███████   | 244/346 [22:01<09:12,  5.42s/it][A

loss: tensor(0.9222, device='cuda:0', grad_fn=<NllLossBackward>)



 71%|███████   | 245/346 [22:07<09:07,  5.42s/it][A

loss: tensor(0.9423, device='cuda:0', grad_fn=<NllLossBackward>)



 71%|███████   | 246/346 [22:12<09:02,  5.43s/it][A

loss: tensor(0.9427, device='cuda:0', grad_fn=<NllLossBackward>)



 71%|███████▏  | 247/346 [22:18<08:55,  5.41s/it][A

loss: tensor(0.9235, device='cuda:0', grad_fn=<NllLossBackward>)



 72%|███████▏  | 248/346 [22:23<08:49,  5.41s/it][A

loss: tensor(0.9504, device='cuda:0', grad_fn=<NllLossBackward>)



 72%|███████▏  | 249/346 [22:28<08:43,  5.40s/it][A

loss: tensor(0.9514, device='cuda:0', grad_fn=<NllLossBackward>)



 72%|███████▏  | 250/346 [22:34<08:40,  5.42s/it][A

loss: tensor(0.9512, device='cuda:0', grad_fn=<NllLossBackward>)



 73%|███████▎  | 251/346 [22:39<08:33,  5.41s/it][A

loss: tensor(0.9528, device='cuda:0', grad_fn=<NllLossBackward>)



 73%|███████▎  | 252/346 [22:45<08:28,  5.41s/it][A

loss: tensor(0.9453, device='cuda:0', grad_fn=<NllLossBackward>)



 73%|███████▎  | 253/346 [22:50<08:23,  5.41s/it][A

loss: tensor(0.9758, device='cuda:0', grad_fn=<NllLossBackward>)



 73%|███████▎  | 254/346 [22:55<08:19,  5.43s/it][A

loss: tensor(0.9280, device='cuda:0', grad_fn=<NllLossBackward>)



 74%|███████▎  | 255/346 [23:01<08:13,  5.42s/it][A

loss: tensor(0.9922, device='cuda:0', grad_fn=<NllLossBackward>)



 74%|███████▍  | 256/346 [23:06<08:06,  5.41s/it][A

loss: tensor(0.9747, device='cuda:0', grad_fn=<NllLossBackward>)



 74%|███████▍  | 257/346 [23:12<08:01,  5.41s/it][A

loss: tensor(0.9672, device='cuda:0', grad_fn=<NllLossBackward>)



 75%|███████▍  | 258/346 [23:17<07:58,  5.43s/it][A

loss: tensor(0.9650, device='cuda:0', grad_fn=<NllLossBackward>)



 75%|███████▍  | 259/346 [23:23<07:52,  5.43s/it][A

loss: tensor(0.9522, device='cuda:0', grad_fn=<NllLossBackward>)



 75%|███████▌  | 260/346 [23:28<07:46,  5.42s/it][A

loss: tensor(0.9193, device='cuda:0', grad_fn=<NllLossBackward>)



 75%|███████▌  | 261/346 [23:34<07:42,  5.45s/it][A

loss: tensor(0.9430, device='cuda:0', grad_fn=<NllLossBackward>)



 76%|███████▌  | 262/346 [23:39<07:38,  5.46s/it][A

loss: tensor(0.9351, device='cuda:0', grad_fn=<NllLossBackward>)



 76%|███████▌  | 263/346 [23:44<07:31,  5.44s/it][A

loss: tensor(0.9554, device='cuda:0', grad_fn=<NllLossBackward>)



 76%|███████▋  | 264/346 [23:50<07:25,  5.43s/it][A

loss: tensor(0.9733, device='cuda:0', grad_fn=<NllLossBackward>)



 77%|███████▋  | 265/346 [23:55<07:19,  5.43s/it][A

loss: tensor(0.9506, device='cuda:0', grad_fn=<NllLossBackward>)



 77%|███████▋  | 266/346 [24:01<07:12,  5.41s/it][A

loss: tensor(0.9215, device='cuda:0', grad_fn=<NllLossBackward>)



 77%|███████▋  | 267/346 [24:06<07:07,  5.42s/it][A

loss: tensor(0.9157, device='cuda:0', grad_fn=<NllLossBackward>)



 77%|███████▋  | 268/346 [24:11<07:01,  5.40s/it][A

loss: tensor(0.9458, device='cuda:0', grad_fn=<NllLossBackward>)



 78%|███████▊  | 269/346 [24:17<06:56,  5.41s/it][A

loss: tensor(0.9576, device='cuda:0', grad_fn=<NllLossBackward>)



 78%|███████▊  | 270/346 [24:22<06:49,  5.39s/it][A

loss: tensor(0.9293, device='cuda:0', grad_fn=<NllLossBackward>)



 78%|███████▊  | 271/346 [24:28<06:44,  5.39s/it][A

loss: tensor(0.9429, device='cuda:0', grad_fn=<NllLossBackward>)



 79%|███████▊  | 272/346 [24:33<06:37,  5.38s/it][A

loss: tensor(0.9491, device='cuda:0', grad_fn=<NllLossBackward>)



 79%|███████▉  | 273/346 [24:38<06:34,  5.40s/it][A

loss: tensor(0.9050, device='cuda:0', grad_fn=<NllLossBackward>)



 79%|███████▉  | 274/346 [24:44<06:28,  5.40s/it][A

loss: tensor(0.9463, device='cuda:0', grad_fn=<NllLossBackward>)



 79%|███████▉  | 275/346 [24:49<06:22,  5.39s/it][A

loss: tensor(1.0056, device='cuda:0', grad_fn=<NllLossBackward>)



 80%|███████▉  | 276/346 [24:55<06:17,  5.40s/it][A

loss: tensor(0.9842, device='cuda:0', grad_fn=<NllLossBackward>)



 80%|████████  | 277/346 [25:00<06:14,  5.42s/it][A

loss: tensor(0.9517, device='cuda:0', grad_fn=<NllLossBackward>)



 80%|████████  | 278/346 [25:05<06:09,  5.43s/it][A

loss: tensor(0.9302, device='cuda:0', grad_fn=<NllLossBackward>)



 81%|████████  | 279/346 [25:11<06:03,  5.43s/it][A

loss: tensor(0.9295, device='cuda:0', grad_fn=<NllLossBackward>)



 81%|████████  | 280/346 [25:16<05:57,  5.42s/it][A

loss: tensor(0.9642, device='cuda:0', grad_fn=<NllLossBackward>)



 81%|████████  | 281/346 [25:22<05:53,  5.43s/it][A

loss: tensor(0.9526, device='cuda:0', grad_fn=<NllLossBackward>)



 82%|████████▏ | 282/346 [25:27<05:47,  5.43s/it][A

loss: tensor(0.9569, device='cuda:0', grad_fn=<NllLossBackward>)



 82%|████████▏ | 283/346 [25:33<05:40,  5.41s/it][A

loss: tensor(0.9269, device='cuda:0', grad_fn=<NllLossBackward>)



 82%|████████▏ | 284/346 [25:38<05:34,  5.39s/it][A

loss: tensor(0.9478, device='cuda:0', grad_fn=<NllLossBackward>)



 82%|████████▏ | 285/346 [25:43<05:30,  5.41s/it][A

loss: tensor(0.9375, device='cuda:0', grad_fn=<NllLossBackward>)



 83%|████████▎ | 286/346 [25:49<05:24,  5.41s/it][A

loss: tensor(0.9393, device='cuda:0', grad_fn=<NllLossBackward>)



 83%|████████▎ | 287/346 [25:54<05:19,  5.42s/it][A

loss: tensor(0.9403, device='cuda:0', grad_fn=<NllLossBackward>)



 83%|████████▎ | 288/346 [26:00<05:15,  5.43s/it][A

loss: tensor(0.9177, device='cuda:0', grad_fn=<NllLossBackward>)



 84%|████████▎ | 289/346 [26:05<05:09,  5.42s/it][A

loss: tensor(0.9730, device='cuda:0', grad_fn=<NllLossBackward>)



 84%|████████▍ | 290/346 [26:10<05:03,  5.43s/it][A

loss: tensor(0.9050, device='cuda:0', grad_fn=<NllLossBackward>)



 84%|████████▍ | 291/346 [26:16<04:57,  5.42s/it][A

loss: tensor(0.9136, device='cuda:0', grad_fn=<NllLossBackward>)



 84%|████████▍ | 292/346 [26:21<04:53,  5.43s/it][A

loss: tensor(0.9792, device='cuda:0', grad_fn=<NllLossBackward>)



 85%|████████▍ | 293/346 [26:27<04:47,  5.42s/it][A

loss: tensor(0.9392, device='cuda:0', grad_fn=<NllLossBackward>)



 85%|████████▍ | 294/346 [26:32<04:40,  5.40s/it][A

loss: tensor(0.9436, device='cuda:0', grad_fn=<NllLossBackward>)



 85%|████████▌ | 295/346 [26:37<04:34,  5.39s/it][A

loss: tensor(0.9208, device='cuda:0', grad_fn=<NllLossBackward>)



 86%|████████▌ | 296/346 [26:43<04:30,  5.41s/it][A

loss: tensor(0.9686, device='cuda:0', grad_fn=<NllLossBackward>)



 86%|████████▌ | 297/346 [26:48<04:25,  5.41s/it][A

loss: tensor(0.9605, device='cuda:0', grad_fn=<NllLossBackward>)



 86%|████████▌ | 298/346 [26:54<04:20,  5.42s/it][A

loss: tensor(0.9327, device='cuda:0', grad_fn=<NllLossBackward>)



 86%|████████▋ | 299/346 [26:59<04:14,  5.41s/it][A

loss: tensor(0.9418, device='cuda:0', grad_fn=<NllLossBackward>)



 87%|████████▋ | 300/346 [27:05<04:09,  5.43s/it][A

loss: tensor(0.9428, device='cuda:0', grad_fn=<NllLossBackward>)



 87%|████████▋ | 301/346 [27:10<04:03,  5.41s/it][A

loss: tensor(0.9152, device='cuda:0', grad_fn=<NllLossBackward>)



 87%|████████▋ | 302/346 [27:15<03:57,  5.40s/it][A

loss: tensor(0.9753, device='cuda:0', grad_fn=<NllLossBackward>)



 88%|████████▊ | 303/346 [27:21<03:52,  5.40s/it][A

loss: tensor(0.9050, device='cuda:0', grad_fn=<NllLossBackward>)



 88%|████████▊ | 304/346 [27:26<03:47,  5.41s/it][A

loss: tensor(0.9306, device='cuda:0', grad_fn=<NllLossBackward>)



 88%|████████▊ | 305/346 [27:32<03:41,  5.40s/it][A

loss: tensor(0.9050, device='cuda:0', grad_fn=<NllLossBackward>)



 88%|████████▊ | 306/346 [27:37<03:35,  5.39s/it][A

loss: tensor(0.9324, device='cuda:0', grad_fn=<NllLossBackward>)



 89%|████████▊ | 307/346 [27:42<03:30,  5.38s/it][A

loss: tensor(0.9457, device='cuda:0', grad_fn=<NllLossBackward>)



 89%|████████▉ | 308/346 [27:48<03:25,  5.40s/it][A

loss: tensor(0.9922, device='cuda:0', grad_fn=<NllLossBackward>)



 89%|████████▉ | 309/346 [27:53<03:19,  5.39s/it][A

loss: tensor(0.9050, device='cuda:0', grad_fn=<NllLossBackward>)



 90%|████████▉ | 310/346 [27:59<03:14,  5.40s/it][A

loss: tensor(0.9230, device='cuda:0', grad_fn=<NllLossBackward>)



 90%|████████▉ | 311/346 [28:04<03:09,  5.42s/it][A

loss: tensor(0.9256, device='cuda:0', grad_fn=<NllLossBackward>)



 90%|█████████ | 312/346 [28:09<03:03,  5.40s/it][A

loss: tensor(0.9464, device='cuda:0', grad_fn=<NllLossBackward>)



 90%|█████████ | 313/346 [28:15<02:58,  5.41s/it][A

loss: tensor(0.9050, device='cuda:0', grad_fn=<NllLossBackward>)



 91%|█████████ | 314/346 [28:20<02:52,  5.40s/it][A

loss: tensor(0.9736, device='cuda:0', grad_fn=<NllLossBackward>)



 91%|█████████ | 315/346 [28:26<02:47,  5.42s/it][A

loss: tensor(0.9685, device='cuda:0', grad_fn=<NllLossBackward>)



 91%|█████████▏| 316/346 [28:31<02:42,  5.40s/it][A

loss: tensor(0.9688, device='cuda:0', grad_fn=<NllLossBackward>)



 92%|█████████▏| 317/346 [28:36<02:36,  5.39s/it][A

loss: tensor(0.9050, device='cuda:0', grad_fn=<NllLossBackward>)



 92%|█████████▏| 318/346 [28:42<02:30,  5.39s/it][A

loss: tensor(0.9143, device='cuda:0', grad_fn=<NllLossBackward>)



 92%|█████████▏| 319/346 [28:47<02:26,  5.42s/it][A

loss: tensor(0.9443, device='cuda:0', grad_fn=<NllLossBackward>)



 92%|█████████▏| 320/346 [28:53<02:20,  5.41s/it][A

loss: tensor(0.9385, device='cuda:0', grad_fn=<NllLossBackward>)



 93%|█████████▎| 321/346 [28:58<02:14,  5.39s/it][A

loss: tensor(0.9357, device='cuda:0', grad_fn=<NllLossBackward>)



 93%|█████████▎| 322/346 [29:03<02:09,  5.39s/it][A

loss: tensor(0.9838, device='cuda:0', grad_fn=<NllLossBackward>)



 93%|█████████▎| 323/346 [29:09<02:04,  5.41s/it][A

loss: tensor(0.9266, device='cuda:0', grad_fn=<NllLossBackward>)



 94%|█████████▎| 324/346 [29:14<01:58,  5.40s/it][A

loss: tensor(0.9572, device='cuda:0', grad_fn=<NllLossBackward>)



 94%|█████████▍| 325/346 [29:20<01:53,  5.40s/it][A

loss: tensor(0.9562, device='cuda:0', grad_fn=<NllLossBackward>)



 94%|█████████▍| 326/346 [29:25<01:47,  5.39s/it][A

loss: tensor(0.9472, device='cuda:0', grad_fn=<NllLossBackward>)



 95%|█████████▍| 327/346 [29:30<01:42,  5.40s/it][A

loss: tensor(0.9495, device='cuda:0', grad_fn=<NllLossBackward>)



 95%|█████████▍| 328/346 [29:36<01:37,  5.40s/it][A

loss: tensor(0.9050, device='cuda:0', grad_fn=<NllLossBackward>)



 95%|█████████▌| 329/346 [29:41<01:31,  5.39s/it][A

loss: tensor(0.9543, device='cuda:0', grad_fn=<NllLossBackward>)



 95%|█████████▌| 330/346 [29:46<01:26,  5.38s/it][A

loss: tensor(0.9050, device='cuda:0', grad_fn=<NllLossBackward>)



 96%|█████████▌| 331/346 [29:52<01:21,  5.40s/it][A

loss: tensor(0.9239, device='cuda:0', grad_fn=<NllLossBackward>)



 96%|█████████▌| 332/346 [29:57<01:15,  5.40s/it][A

loss: tensor(0.9386, device='cuda:0', grad_fn=<NllLossBackward>)



 96%|█████████▌| 333/346 [30:03<01:10,  5.39s/it][A

loss: tensor(0.9210, device='cuda:0', grad_fn=<NllLossBackward>)



 97%|█████████▋| 334/346 [30:08<01:04,  5.41s/it][A

loss: tensor(0.9536, device='cuda:0', grad_fn=<NllLossBackward>)



 97%|█████████▋| 335/346 [30:14<00:59,  5.40s/it][A

loss: tensor(0.9446, device='cuda:0', grad_fn=<NllLossBackward>)



 97%|█████████▋| 336/346 [30:19<00:54,  5.40s/it][A

loss: tensor(0.9919, device='cuda:0', grad_fn=<NllLossBackward>)



 97%|█████████▋| 337/346 [30:24<00:48,  5.39s/it][A

loss: tensor(0.9481, device='cuda:0', grad_fn=<NllLossBackward>)



 98%|█████████▊| 338/346 [30:30<00:43,  5.41s/it][A

loss: tensor(1.0049, device='cuda:0', grad_fn=<NllLossBackward>)



 98%|█████████▊| 339/346 [30:35<00:37,  5.40s/it][A

loss: tensor(0.9133, device='cuda:0', grad_fn=<NllLossBackward>)



 98%|█████████▊| 340/346 [30:41<00:32,  5.41s/it][A

loss: tensor(0.9581, device='cuda:0', grad_fn=<NllLossBackward>)



 99%|█████████▊| 341/346 [30:46<00:27,  5.40s/it][A

loss: tensor(0.9138, device='cuda:0', grad_fn=<NllLossBackward>)



 99%|█████████▉| 342/346 [30:51<00:21,  5.41s/it][A

loss: tensor(0.9050, device='cuda:0', grad_fn=<NllLossBackward>)



 99%|█████████▉| 343/346 [30:57<00:16,  5.39s/it][A

loss: tensor(0.9164, device='cuda:0', grad_fn=<NllLossBackward>)



 99%|█████████▉| 344/346 [31:02<00:10,  5.39s/it][A

loss: tensor(0.9207, device='cuda:0', grad_fn=<NllLossBackward>)



100%|█████████▉| 345/346 [31:07<00:05,  5.38s/it][A

loss: tensor(0.9858, device='cuda:0', grad_fn=<NllLossBackward>)



100%|██████████| 346/346 [31:08<00:00,  5.40s/it][A

  0%|          | 0/173 [00:00<?, ?it/s][A

loss: tensor(1.0677, device='cuda:0', grad_fn=<NllLossBackward>)

	Training Loss: 0.9420807290973002

	Training acc: 0.962919338185882

	Training prec: 0.9278816936309219

	Training rec: 0.962919338185882

	Training f1: 0.9449078794582544

	Current Learning rate:  3.5e-05



  1%|          | 1/173 [00:00<01:50,  1.56it/s][A
  1%|          | 2/173 [00:01<01:56,  1.46it/s][A
  2%|▏         | 3/173 [00:02<01:58,  1.44it/s][A
  2%|▏         | 4/173 [00:02<01:54,  1.47it/s][A
  3%|▎         | 5/173 [00:03<01:56,  1.45it/s][A
  3%|▎         | 6/173 [00:04<01:53,  1.48it/s][A
  4%|▍         | 7/173 [00:04<01:48,  1.54it/s][A
  5%|▍         | 8/173 [00:05<01:46,  1.55it/s][A
  5%|▌         | 9/173 [00:05<01:45,  1.55it/s][A
  6%|▌         | 10/173 [00:06<01:42,  1.59it/s][A
  6%|▋         | 11/173 [00:07<01:42,  1.58it/s][A
  7%|▋         | 12/173 [00:07<01:44,  1.54it/s][A
  8%|▊         | 13/173 [00:08<01:43,  1.54it/s][A
  8%|▊         | 14/173 [00:09<01:46,  1.50it/s][A
  9%|▊         | 15/173 [00:09<01:48,  1.46it/s][A
  9%|▉         | 16/173 [00:10<01:45,  1.49it/s][A
 10%|▉         | 17/173 [00:11<01:47,  1.46it/s][A
 10%|█         | 18/173 [00:12<01:47,  1.44it/s][A
 11%|█         | 19/173 [00:12<01:44,  1.47it/s][A
 12%|█▏        | 20/


	Validation Loss: 0.9450329538714679

	Validation acc: 0.9598251164698463

	Validation prec: 0.9227942998089774

	Validation rec: 0.9598251164698463

	Validation f1: 0.940564878113327



  0%|          | 1/346 [00:04<27:50,  4.84s/it][A

loss: tensor(0.9474, device='cuda:0', grad_fn=<NllLossBackward>)



  1%|          | 2/346 [00:09<27:45,  4.84s/it][A

loss: tensor(0.9629, device='cuda:0', grad_fn=<NllLossBackward>)



  1%|          | 3/346 [00:14<27:44,  4.85s/it][A

loss: tensor(0.9153, device='cuda:0', grad_fn=<NllLossBackward>)



  1%|          | 4/346 [00:19<27:38,  4.85s/it][A

loss: tensor(0.9780, device='cuda:0', grad_fn=<NllLossBackward>)



  1%|▏         | 5/346 [00:24<28:38,  5.04s/it][A

loss: tensor(0.9260, device='cuda:0', grad_fn=<NllLossBackward>)



  2%|▏         | 6/346 [00:30<29:11,  5.15s/it][A

loss: tensor(0.9664, device='cuda:0', grad_fn=<NllLossBackward>)



  2%|▏         | 7/346 [00:35<29:33,  5.23s/it][A

loss: tensor(0.9131, device='cuda:0', grad_fn=<NllLossBackward>)



  2%|▏         | 8/346 [00:40<29:43,  5.28s/it][A

loss: tensor(0.9286, device='cuda:0', grad_fn=<NllLossBackward>)



  3%|▎         | 9/346 [00:46<29:51,  5.32s/it][A

loss: tensor(0.9130, device='cuda:0', grad_fn=<NllLossBackward>)



  3%|▎         | 10/346 [00:51<29:50,  5.33s/it][A

loss: tensor(0.9294, device='cuda:0', grad_fn=<NllLossBackward>)



  3%|▎         | 11/346 [00:57<29:56,  5.36s/it][A

loss: tensor(0.9354, device='cuda:0', grad_fn=<NllLossBackward>)



  3%|▎         | 12/346 [01:02<29:47,  5.35s/it][A

loss: tensor(0.9860, device='cuda:0', grad_fn=<NllLossBackward>)



  4%|▍         | 13/346 [01:07<29:39,  5.35s/it][A

loss: tensor(0.9299, device='cuda:0', grad_fn=<NllLossBackward>)



  4%|▍         | 14/346 [01:13<29:34,  5.35s/it][A

loss: tensor(0.9236, device='cuda:0', grad_fn=<NllLossBackward>)



  4%|▍         | 15/346 [01:18<29:39,  5.38s/it][A

loss: tensor(0.9635, device='cuda:0', grad_fn=<NllLossBackward>)



  5%|▍         | 16/346 [01:23<29:35,  5.38s/it][A

loss: tensor(0.9339, device='cuda:0', grad_fn=<NllLossBackward>)



  5%|▍         | 17/346 [01:29<29:31,  5.39s/it][A

loss: tensor(0.9666, device='cuda:0', grad_fn=<NllLossBackward>)



  5%|▌         | 18/346 [01:34<29:23,  5.38s/it][A

loss: tensor(0.9122, device='cuda:0', grad_fn=<NllLossBackward>)



  5%|▌         | 19/346 [01:40<29:23,  5.39s/it][A

loss: tensor(0.9340, device='cuda:0', grad_fn=<NllLossBackward>)



  6%|▌         | 20/346 [01:45<29:21,  5.40s/it][A

loss: tensor(0.9285, device='cuda:0', grad_fn=<NllLossBackward>)



  6%|▌         | 21/346 [01:50<29:14,  5.40s/it][A

loss: tensor(0.9150, device='cuda:0', grad_fn=<NllLossBackward>)



  6%|▋         | 22/346 [01:56<29:09,  5.40s/it][A

loss: tensor(0.9050, device='cuda:0', grad_fn=<NllLossBackward>)



  7%|▋         | 23/346 [02:01<29:09,  5.42s/it][A

loss: tensor(0.9349, device='cuda:0', grad_fn=<NllLossBackward>)



  7%|▋         | 24/346 [02:07<29:01,  5.41s/it][A

loss: tensor(0.9528, device='cuda:0', grad_fn=<NllLossBackward>)



  7%|▋         | 25/346 [02:12<28:51,  5.39s/it][A

loss: tensor(0.9443, device='cuda:0', grad_fn=<NllLossBackward>)



  8%|▊         | 26/346 [02:17<28:49,  5.40s/it][A

loss: tensor(0.9297, device='cuda:0', grad_fn=<NllLossBackward>)



  8%|▊         | 27/346 [02:23<28:42,  5.40s/it][A

loss: tensor(0.9050, device='cuda:0', grad_fn=<NllLossBackward>)



  8%|▊         | 28/346 [02:28<28:40,  5.41s/it][A

loss: tensor(0.9552, device='cuda:0', grad_fn=<NllLossBackward>)



  8%|▊         | 29/346 [02:34<28:30,  5.40s/it][A

loss: tensor(0.9410, device='cuda:0', grad_fn=<NllLossBackward>)



  9%|▊         | 30/346 [02:39<28:28,  5.41s/it][A

loss: tensor(0.9598, device='cuda:0', grad_fn=<NllLossBackward>)



  9%|▉         | 31/346 [02:44<28:18,  5.39s/it][A

loss: tensor(0.9049, device='cuda:0', grad_fn=<NllLossBackward>)



  9%|▉         | 32/346 [02:50<28:13,  5.39s/it][A

loss: tensor(0.9139, device='cuda:0', grad_fn=<NllLossBackward>)



 10%|▉         | 33/346 [02:55<27:54,  5.35s/it][A

loss: tensor(0.9050, device='cuda:0', grad_fn=<NllLossBackward>)



 10%|▉         | 34/346 [03:00<27:45,  5.34s/it][A

loss: tensor(0.9819, device='cuda:0', grad_fn=<NllLossBackward>)



 10%|█         | 35/346 [03:06<27:29,  5.31s/it][A

loss: tensor(0.9412, device='cuda:0', grad_fn=<NllLossBackward>)



 10%|█         | 36/346 [03:11<27:17,  5.28s/it][A

loss: tensor(0.9132, device='cuda:0', grad_fn=<NllLossBackward>)



 11%|█         | 37/346 [03:16<27:08,  5.27s/it][A

loss: tensor(0.9276, device='cuda:0', grad_fn=<NllLossBackward>)



 11%|█         | 38/346 [03:21<27:05,  5.28s/it][A

loss: tensor(0.9264, device='cuda:0', grad_fn=<NllLossBackward>)



 11%|█▏        | 39/346 [03:27<26:58,  5.27s/it][A

loss: tensor(0.9192, device='cuda:0', grad_fn=<NllLossBackward>)



 12%|█▏        | 40/346 [03:32<26:51,  5.27s/it][A

loss: tensor(0.9581, device='cuda:0', grad_fn=<NllLossBackward>)



 12%|█▏        | 41/346 [03:37<26:42,  5.25s/it][A

loss: tensor(0.9279, device='cuda:0', grad_fn=<NllLossBackward>)



 12%|█▏        | 42/346 [03:42<26:45,  5.28s/it][A

loss: tensor(0.9368, device='cuda:0', grad_fn=<NllLossBackward>)



 12%|█▏        | 43/346 [03:48<26:36,  5.27s/it][A

loss: tensor(0.9259, device='cuda:0', grad_fn=<NllLossBackward>)



 13%|█▎        | 44/346 [03:53<26:27,  5.26s/it][A

loss: tensor(0.9050, device='cuda:0', grad_fn=<NllLossBackward>)



 13%|█▎        | 45/346 [03:58<26:24,  5.26s/it][A

loss: tensor(0.9935, device='cuda:0', grad_fn=<NllLossBackward>)



 13%|█▎        | 46/346 [04:04<26:24,  5.28s/it][A

loss: tensor(0.9498, device='cuda:0', grad_fn=<NllLossBackward>)



 14%|█▎        | 47/346 [04:09<26:16,  5.27s/it][A

loss: tensor(0.9968, device='cuda:0', grad_fn=<NllLossBackward>)



 14%|█▍        | 48/346 [04:14<26:10,  5.27s/it][A

loss: tensor(0.9329, device='cuda:0', grad_fn=<NllLossBackward>)



 14%|█▍        | 49/346 [04:19<26:08,  5.28s/it][A

loss: tensor(0.9401, device='cuda:0', grad_fn=<NllLossBackward>)



 14%|█▍        | 50/346 [04:25<26:02,  5.28s/it][A

loss: tensor(0.9551, device='cuda:0', grad_fn=<NllLossBackward>)



 15%|█▍        | 51/346 [04:30<25:59,  5.29s/it][A

loss: tensor(0.9653, device='cuda:0', grad_fn=<NllLossBackward>)



 15%|█▌        | 52/346 [04:35<25:50,  5.28s/it][A

loss: tensor(0.9134, device='cuda:0', grad_fn=<NllLossBackward>)



 15%|█▌        | 53/346 [04:41<25:48,  5.28s/it][A

loss: tensor(0.9291, device='cuda:0', grad_fn=<NllLossBackward>)



 16%|█▌        | 54/346 [04:46<25:38,  5.27s/it][A

loss: tensor(0.9136, device='cuda:0', grad_fn=<NllLossBackward>)



 16%|█▌        | 55/346 [04:51<25:32,  5.27s/it][A

loss: tensor(0.9280, device='cuda:0', grad_fn=<NllLossBackward>)



 16%|█▌        | 56/346 [04:56<25:25,  5.26s/it][A

loss: tensor(0.9721, device='cuda:0', grad_fn=<NllLossBackward>)



 16%|█▋        | 57/346 [05:02<25:24,  5.28s/it][A

loss: tensor(0.9314, device='cuda:0', grad_fn=<NllLossBackward>)



 17%|█▋        | 58/346 [05:07<25:14,  5.26s/it][A

loss: tensor(0.9125, device='cuda:0', grad_fn=<NllLossBackward>)



 17%|█▋        | 59/346 [05:12<25:07,  5.25s/it][A

loss: tensor(0.9466, device='cuda:0', grad_fn=<NllLossBackward>)



 17%|█▋        | 60/346 [05:17<25:04,  5.26s/it][A

loss: tensor(0.9446, device='cuda:0', grad_fn=<NllLossBackward>)



 18%|█▊        | 61/346 [05:23<25:05,  5.28s/it][A

loss: tensor(1.0195, device='cuda:0', grad_fn=<NllLossBackward>)



 18%|█▊        | 62/346 [05:28<24:55,  5.26s/it][A

loss: tensor(0.9293, device='cuda:0', grad_fn=<NllLossBackward>)



 18%|█▊        | 63/346 [05:33<24:49,  5.26s/it][A

loss: tensor(0.9500, device='cuda:0', grad_fn=<NllLossBackward>)



 18%|█▊        | 64/346 [05:38<24:43,  5.26s/it][A

loss: tensor(0.9148, device='cuda:0', grad_fn=<NllLossBackward>)



 19%|█▉        | 65/346 [05:44<24:43,  5.28s/it][A

loss: tensor(0.9410, device='cuda:0', grad_fn=<NllLossBackward>)



 19%|█▉        | 66/346 [05:49<24:35,  5.27s/it][A

loss: tensor(0.9049, device='cuda:0', grad_fn=<NllLossBackward>)



 19%|█▉        | 67/346 [05:54<24:28,  5.26s/it][A

loss: tensor(0.9475, device='cuda:0', grad_fn=<NllLossBackward>)



 20%|█▉        | 68/346 [05:59<24:21,  5.26s/it][A

loss: tensor(0.9991, device='cuda:0', grad_fn=<NllLossBackward>)



 20%|█▉        | 69/346 [06:05<24:21,  5.28s/it][A

loss: tensor(0.9328, device='cuda:0', grad_fn=<NllLossBackward>)



 20%|██        | 70/346 [06:10<24:14,  5.27s/it][A

loss: tensor(0.9379, device='cuda:0', grad_fn=<NllLossBackward>)



 21%|██        | 71/346 [06:15<24:09,  5.27s/it][A

loss: tensor(0.9050, device='cuda:0', grad_fn=<NllLossBackward>)



 21%|██        | 72/346 [06:21<24:09,  5.29s/it][A

loss: tensor(0.9724, device='cuda:0', grad_fn=<NllLossBackward>)



 21%|██        | 73/346 [06:26<23:59,  5.27s/it][A

loss: tensor(0.9218, device='cuda:0', grad_fn=<NllLossBackward>)



 21%|██▏       | 74/346 [06:31<23:53,  5.27s/it][A

loss: tensor(0.9050, device='cuda:0', grad_fn=<NllLossBackward>)



 22%|██▏       | 75/346 [06:36<23:49,  5.27s/it][A

loss: tensor(0.9519, device='cuda:0', grad_fn=<NllLossBackward>)



 22%|██▏       | 76/346 [06:42<23:44,  5.28s/it][A

loss: tensor(1.0002, device='cuda:0', grad_fn=<NllLossBackward>)



 22%|██▏       | 77/346 [06:47<23:35,  5.26s/it][A

loss: tensor(0.9392, device='cuda:0', grad_fn=<NllLossBackward>)



 23%|██▎       | 78/346 [06:52<23:31,  5.27s/it][A

loss: tensor(0.9312, device='cuda:0', grad_fn=<NllLossBackward>)



 23%|██▎       | 79/346 [06:57<23:22,  5.25s/it][A

loss: tensor(0.9317, device='cuda:0', grad_fn=<NllLossBackward>)



 23%|██▎       | 80/346 [07:03<23:23,  5.28s/it][A

loss: tensor(0.9506, device='cuda:0', grad_fn=<NllLossBackward>)



 23%|██▎       | 81/346 [07:08<23:16,  5.27s/it][A

loss: tensor(0.9511, device='cuda:0', grad_fn=<NllLossBackward>)



 24%|██▎       | 82/346 [07:13<23:08,  5.26s/it][A

loss: tensor(0.9932, device='cuda:0', grad_fn=<NllLossBackward>)



 24%|██▍       | 83/346 [07:18<23:02,  5.26s/it][A

loss: tensor(0.9443, device='cuda:0', grad_fn=<NllLossBackward>)



 24%|██▍       | 84/346 [07:24<23:00,  5.27s/it][A

loss: tensor(0.9225, device='cuda:0', grad_fn=<NllLossBackward>)



 25%|██▍       | 85/346 [07:29<22:55,  5.27s/it][A

loss: tensor(0.9480, device='cuda:0', grad_fn=<NllLossBackward>)



 25%|██▍       | 86/346 [07:34<22:46,  5.26s/it][A

loss: tensor(1.0203, device='cuda:0', grad_fn=<NllLossBackward>)



 25%|██▌       | 87/346 [07:40<22:40,  5.25s/it][A

loss: tensor(0.9157, device='cuda:0', grad_fn=<NllLossBackward>)



 25%|██▌       | 88/346 [07:45<22:45,  5.29s/it][A

loss: tensor(0.9543, device='cuda:0', grad_fn=<NllLossBackward>)



 26%|██▌       | 89/346 [07:50<22:36,  5.28s/it][A

loss: tensor(0.9424, device='cuda:0', grad_fn=<NllLossBackward>)



 26%|██▌       | 90/346 [07:55<22:29,  5.27s/it][A

loss: tensor(0.9640, device='cuda:0', grad_fn=<NllLossBackward>)



 26%|██▋       | 91/346 [08:01<22:23,  5.27s/it][A

loss: tensor(0.9700, device='cuda:0', grad_fn=<NllLossBackward>)



 27%|██▋       | 92/346 [08:06<22:18,  5.27s/it][A

loss: tensor(0.9275, device='cuda:0', grad_fn=<NllLossBackward>)



 27%|██▋       | 93/346 [08:11<22:11,  5.26s/it][A

loss: tensor(0.9300, device='cuda:0', grad_fn=<NllLossBackward>)



 27%|██▋       | 94/346 [08:16<22:05,  5.26s/it][A

loss: tensor(0.9926, device='cuda:0', grad_fn=<NllLossBackward>)



 27%|██▋       | 95/346 [08:22<22:04,  5.28s/it][A

loss: tensor(0.9464, device='cuda:0', grad_fn=<NllLossBackward>)



 28%|██▊       | 96/346 [08:27<21:55,  5.26s/it][A

loss: tensor(0.9451, device='cuda:0', grad_fn=<NllLossBackward>)



 28%|██▊       | 97/346 [08:32<21:48,  5.25s/it][A

loss: tensor(0.9306, device='cuda:0', grad_fn=<NllLossBackward>)



 28%|██▊       | 98/346 [08:37<21:40,  5.24s/it][A

loss: tensor(0.9823, device='cuda:0', grad_fn=<NllLossBackward>)



 29%|██▊       | 99/346 [08:43<21:39,  5.26s/it][A

loss: tensor(0.9176, device='cuda:0', grad_fn=<NllLossBackward>)



 29%|██▉       | 100/346 [08:48<21:32,  5.26s/it][A

loss: tensor(0.9504, device='cuda:0', grad_fn=<NllLossBackward>)



 29%|██▉       | 101/346 [08:53<21:26,  5.25s/it][A

loss: tensor(0.9169, device='cuda:0', grad_fn=<NllLossBackward>)



 29%|██▉       | 102/346 [08:58<21:19,  5.24s/it][A

loss: tensor(0.9688, device='cuda:0', grad_fn=<NllLossBackward>)



 30%|██▉       | 103/346 [09:04<21:20,  5.27s/it][A

loss: tensor(0.9236, device='cuda:0', grad_fn=<NllLossBackward>)



 30%|███       | 104/346 [09:09<21:14,  5.27s/it][A

loss: tensor(0.9285, device='cuda:0', grad_fn=<NllLossBackward>)



 30%|███       | 105/346 [09:14<21:07,  5.26s/it][A

loss: tensor(1.0117, device='cuda:0', grad_fn=<NllLossBackward>)



 31%|███       | 106/346 [09:20<21:01,  5.25s/it][A

loss: tensor(0.9561, device='cuda:0', grad_fn=<NllLossBackward>)



 31%|███       | 107/346 [09:25<21:00,  5.28s/it][A

loss: tensor(0.9132, device='cuda:0', grad_fn=<NllLossBackward>)



 31%|███       | 108/346 [09:30<20:52,  5.26s/it][A

loss: tensor(0.9156, device='cuda:0', grad_fn=<NllLossBackward>)



 32%|███▏      | 109/346 [09:35<20:48,  5.27s/it][A

loss: tensor(0.9337, device='cuda:0', grad_fn=<NllLossBackward>)



 32%|███▏      | 110/346 [09:41<20:40,  5.26s/it][A

loss: tensor(0.9113, device='cuda:0', grad_fn=<NllLossBackward>)



 32%|███▏      | 111/346 [09:46<20:39,  5.28s/it][A

loss: tensor(0.9524, device='cuda:0', grad_fn=<NllLossBackward>)



 32%|███▏      | 112/346 [09:51<20:33,  5.27s/it][A

loss: tensor(0.9735, device='cuda:0', grad_fn=<NllLossBackward>)



 33%|███▎      | 113/346 [09:56<20:26,  5.26s/it][A

loss: tensor(0.9814, device='cuda:0', grad_fn=<NllLossBackward>)



 33%|███▎      | 114/346 [10:02<20:26,  5.29s/it][A

loss: tensor(0.9630, device='cuda:0', grad_fn=<NllLossBackward>)



 33%|███▎      | 115/346 [10:07<20:19,  5.28s/it][A

loss: tensor(0.9543, device='cuda:0', grad_fn=<NllLossBackward>)



 34%|███▎      | 116/346 [10:12<20:12,  5.27s/it][A

loss: tensor(0.9376, device='cuda:0', grad_fn=<NllLossBackward>)



 34%|███▍      | 117/346 [10:18<20:06,  5.27s/it][A

loss: tensor(0.9474, device='cuda:0', grad_fn=<NllLossBackward>)



 34%|███▍      | 118/346 [10:23<20:04,  5.28s/it][A

loss: tensor(0.9417, device='cuda:0', grad_fn=<NllLossBackward>)



 34%|███▍      | 119/346 [10:28<19:55,  5.27s/it][A

loss: tensor(0.9482, device='cuda:0', grad_fn=<NllLossBackward>)



 35%|███▍      | 120/346 [10:33<19:49,  5.26s/it][A

loss: tensor(0.9795, device='cuda:0', grad_fn=<NllLossBackward>)



 35%|███▍      | 121/346 [10:39<19:41,  5.25s/it][A

loss: tensor(0.9633, device='cuda:0', grad_fn=<NllLossBackward>)



 35%|███▌      | 122/346 [10:44<19:38,  5.26s/it][A

loss: tensor(0.9269, device='cuda:0', grad_fn=<NllLossBackward>)



 36%|███▌      | 123/346 [10:49<19:31,  5.25s/it][A

loss: tensor(0.9259, device='cuda:0', grad_fn=<NllLossBackward>)



 36%|███▌      | 124/346 [10:54<19:25,  5.25s/it][A

loss: tensor(0.9388, device='cuda:0', grad_fn=<NllLossBackward>)



 36%|███▌      | 125/346 [11:00<19:20,  5.25s/it][A

loss: tensor(1.0338, device='cuda:0', grad_fn=<NllLossBackward>)



 36%|███▋      | 126/346 [11:05<19:20,  5.27s/it][A

loss: tensor(0.9209, device='cuda:0', grad_fn=<NllLossBackward>)



 37%|███▋      | 127/346 [11:10<19:13,  5.27s/it][A

loss: tensor(0.9228, device='cuda:0', grad_fn=<NllLossBackward>)



 37%|███▋      | 128/346 [11:15<19:05,  5.25s/it][A

loss: tensor(0.9902, device='cuda:0', grad_fn=<NllLossBackward>)



 37%|███▋      | 129/346 [11:21<18:59,  5.25s/it][A

loss: tensor(0.9677, device='cuda:0', grad_fn=<NllLossBackward>)



 38%|███▊      | 130/346 [11:26<18:59,  5.28s/it][A

loss: tensor(0.9600, device='cuda:0', grad_fn=<NllLossBackward>)



 38%|███▊      | 131/346 [11:31<18:51,  5.26s/it][A

loss: tensor(0.9445, device='cuda:0', grad_fn=<NllLossBackward>)



 38%|███▊      | 132/346 [11:36<18:46,  5.26s/it][A

loss: tensor(0.9182, device='cuda:0', grad_fn=<NllLossBackward>)



 38%|███▊      | 133/346 [11:42<18:38,  5.25s/it][A

loss: tensor(0.9281, device='cuda:0', grad_fn=<NllLossBackward>)



 39%|███▊      | 134/346 [11:47<18:37,  5.27s/it][A

loss: tensor(0.9285, device='cuda:0', grad_fn=<NllLossBackward>)



 39%|███▉      | 135/346 [11:52<18:31,  5.27s/it][A

loss: tensor(0.9049, device='cuda:0', grad_fn=<NllLossBackward>)



 39%|███▉      | 136/346 [11:57<18:22,  5.25s/it][A

loss: tensor(0.9050, device='cuda:0', grad_fn=<NllLossBackward>)



 40%|███▉      | 137/346 [12:03<18:16,  5.25s/it][A

loss: tensor(0.9049, device='cuda:0', grad_fn=<NllLossBackward>)



 40%|███▉      | 138/346 [12:08<18:15,  5.27s/it][A

loss: tensor(0.9049, device='cuda:0', grad_fn=<NllLossBackward>)



 40%|████      | 139/346 [12:13<18:09,  5.26s/it][A

loss: tensor(0.9157, device='cuda:0', grad_fn=<NllLossBackward>)



 40%|████      | 140/346 [12:19<18:04,  5.27s/it][A

loss: tensor(0.9390, device='cuda:0', grad_fn=<NllLossBackward>)



 41%|████      | 141/346 [12:24<18:01,  5.28s/it][A

loss: tensor(0.9481, device='cuda:0', grad_fn=<NllLossBackward>)



 41%|████      | 142/346 [12:29<17:53,  5.26s/it][A

loss: tensor(0.9311, device='cuda:0', grad_fn=<NllLossBackward>)



 41%|████▏     | 143/346 [12:34<17:47,  5.26s/it][A

loss: tensor(0.9195, device='cuda:0', grad_fn=<NllLossBackward>)



 42%|████▏     | 144/346 [12:40<17:39,  5.25s/it][A

loss: tensor(0.9806, device='cuda:0', grad_fn=<NllLossBackward>)



 42%|████▏     | 145/346 [12:45<17:38,  5.27s/it][A

loss: tensor(0.9890, device='cuda:0', grad_fn=<NllLossBackward>)



 42%|████▏     | 146/346 [12:50<17:31,  5.26s/it][A

loss: tensor(0.9457, device='cuda:0', grad_fn=<NllLossBackward>)



 42%|████▏     | 147/346 [12:55<17:25,  5.25s/it][A

loss: tensor(0.9808, device='cuda:0', grad_fn=<NllLossBackward>)



 43%|████▎     | 148/346 [13:01<17:17,  5.24s/it][A

loss: tensor(0.9648, device='cuda:0', grad_fn=<NllLossBackward>)



 43%|████▎     | 149/346 [13:06<17:17,  5.26s/it][A

loss: tensor(0.9609, device='cuda:0', grad_fn=<NllLossBackward>)



 43%|████▎     | 150/346 [13:11<17:12,  5.27s/it][A

loss: tensor(0.9422, device='cuda:0', grad_fn=<NllLossBackward>)



 44%|████▎     | 151/346 [13:16<17:06,  5.26s/it][A

loss: tensor(0.9629, device='cuda:0', grad_fn=<NllLossBackward>)



 44%|████▍     | 152/346 [13:22<17:01,  5.27s/it][A

loss: tensor(0.9551, device='cuda:0', grad_fn=<NllLossBackward>)



 44%|████▍     | 153/346 [13:27<17:00,  5.29s/it][A

loss: tensor(0.9470, device='cuda:0', grad_fn=<NllLossBackward>)



 45%|████▍     | 154/346 [13:32<16:52,  5.27s/it][A

loss: tensor(0.9278, device='cuda:0', grad_fn=<NllLossBackward>)



 45%|████▍     | 155/346 [13:38<16:46,  5.27s/it][A

loss: tensor(0.9132, device='cuda:0', grad_fn=<NllLossBackward>)



 45%|████▌     | 156/346 [13:43<16:39,  5.26s/it][A

loss: tensor(0.9049, device='cuda:0', grad_fn=<NllLossBackward>)



 45%|████▌     | 157/346 [13:48<16:37,  5.28s/it][A

loss: tensor(0.9154, device='cuda:0', grad_fn=<NllLossBackward>)



 46%|████▌     | 158/346 [13:53<16:31,  5.27s/it][A

loss: tensor(0.9699, device='cuda:0', grad_fn=<NllLossBackward>)



 46%|████▌     | 159/346 [13:59<16:24,  5.26s/it][A

loss: tensor(0.9182, device='cuda:0', grad_fn=<NllLossBackward>)



 46%|████▌     | 160/346 [14:04<16:22,  5.28s/it][A

loss: tensor(1.0363, device='cuda:0', grad_fn=<NllLossBackward>)



 47%|████▋     | 161/346 [14:09<16:14,  5.27s/it][A

loss: tensor(0.9772, device='cuda:0', grad_fn=<NllLossBackward>)



 47%|████▋     | 162/346 [14:14<16:07,  5.26s/it][A

loss: tensor(0.9265, device='cuda:0', grad_fn=<NllLossBackward>)



 47%|████▋     | 163/346 [14:20<16:01,  5.25s/it][A

loss: tensor(0.9454, device='cuda:0', grad_fn=<NllLossBackward>)



 47%|████▋     | 164/346 [14:25<15:56,  5.25s/it][A

loss: tensor(0.9140, device='cuda:0', grad_fn=<NllLossBackward>)



 48%|████▊     | 165/346 [14:30<15:50,  5.25s/it][A

loss: tensor(0.9468, device='cuda:0', grad_fn=<NllLossBackward>)



 48%|████▊     | 166/346 [14:35<15:46,  5.26s/it][A

loss: tensor(0.9050, device='cuda:0', grad_fn=<NllLossBackward>)



 48%|████▊     | 167/346 [14:41<15:40,  5.25s/it][A

loss: tensor(0.9688, device='cuda:0', grad_fn=<NllLossBackward>)



 49%|████▊     | 168/346 [14:46<15:37,  5.27s/it][A

loss: tensor(0.9239, device='cuda:0', grad_fn=<NllLossBackward>)



 49%|████▉     | 169/346 [14:51<15:31,  5.26s/it][A

loss: tensor(0.9575, device='cuda:0', grad_fn=<NllLossBackward>)



 49%|████▉     | 170/346 [14:56<15:26,  5.26s/it][A

loss: tensor(0.9267, device='cuda:0', grad_fn=<NllLossBackward>)



 49%|████▉     | 171/346 [15:02<15:20,  5.26s/it][A

loss: tensor(0.9393, device='cuda:0', grad_fn=<NllLossBackward>)



 50%|████▉     | 172/346 [15:07<15:17,  5.27s/it][A

loss: tensor(0.9115, device='cuda:0', grad_fn=<NllLossBackward>)



 50%|█████     | 173/346 [15:12<15:09,  5.26s/it][A

loss: tensor(0.9049, device='cuda:0', grad_fn=<NllLossBackward>)



 50%|█████     | 174/346 [15:17<15:03,  5.25s/it][A

loss: tensor(0.9594, device='cuda:0', grad_fn=<NllLossBackward>)



 51%|█████     | 175/346 [15:23<14:57,  5.25s/it][A

loss: tensor(0.9378, device='cuda:0', grad_fn=<NllLossBackward>)



 51%|█████     | 176/346 [15:28<14:55,  5.27s/it][A

loss: tensor(0.9049, device='cuda:0', grad_fn=<NllLossBackward>)



 51%|█████     | 177/346 [15:33<14:48,  5.26s/it][A

loss: tensor(0.9876, device='cuda:0', grad_fn=<NllLossBackward>)



 51%|█████▏    | 178/346 [15:38<14:42,  5.25s/it][A

loss: tensor(0.9640, device='cuda:0', grad_fn=<NllLossBackward>)



 52%|█████▏    | 179/346 [15:44<14:35,  5.24s/it][A

loss: tensor(0.9456, device='cuda:0', grad_fn=<NllLossBackward>)



 52%|█████▏    | 180/346 [15:49<14:34,  5.27s/it][A

loss: tensor(0.9050, device='cuda:0', grad_fn=<NllLossBackward>)



 52%|█████▏    | 181/346 [15:54<14:27,  5.26s/it][A

loss: tensor(0.9334, device='cuda:0', grad_fn=<NllLossBackward>)



 53%|█████▎    | 182/346 [16:00<14:21,  5.26s/it][A

loss: tensor(0.9490, device='cuda:0', grad_fn=<NllLossBackward>)



 53%|█████▎    | 183/346 [16:05<14:15,  5.25s/it][A

loss: tensor(0.9540, device='cuda:0', grad_fn=<NllLossBackward>)



 53%|█████▎    | 184/346 [16:10<14:14,  5.28s/it][A

loss: tensor(0.9349, device='cuda:0', grad_fn=<NllLossBackward>)



 53%|█████▎    | 185/346 [16:15<14:07,  5.26s/it][A

loss: tensor(0.9116, device='cuda:0', grad_fn=<NllLossBackward>)



 54%|█████▍    | 186/346 [16:21<14:01,  5.26s/it][A

loss: tensor(0.9384, device='cuda:0', grad_fn=<NllLossBackward>)



 54%|█████▍    | 187/346 [16:26<13:59,  5.28s/it][A

loss: tensor(0.9531, device='cuda:0', grad_fn=<NllLossBackward>)



 54%|█████▍    | 188/346 [16:31<13:52,  5.27s/it][A

loss: tensor(0.9805, device='cuda:0', grad_fn=<NllLossBackward>)



 55%|█████▍    | 189/346 [16:36<13:46,  5.26s/it][A

loss: tensor(0.9284, device='cuda:0', grad_fn=<NllLossBackward>)



 55%|█████▍    | 190/346 [16:42<13:40,  5.26s/it][A

loss: tensor(0.9304, device='cuda:0', grad_fn=<NllLossBackward>)



 55%|█████▌    | 191/346 [16:47<13:36,  5.27s/it][A

loss: tensor(0.9434, device='cuda:0', grad_fn=<NllLossBackward>)



 55%|█████▌    | 192/346 [16:52<13:30,  5.27s/it][A

loss: tensor(0.9696, device='cuda:0', grad_fn=<NllLossBackward>)



 56%|█████▌    | 193/346 [16:57<13:24,  5.26s/it][A

loss: tensor(0.9049, device='cuda:0', grad_fn=<NllLossBackward>)



 56%|█████▌    | 194/346 [17:03<13:18,  5.25s/it][A

loss: tensor(0.9196, device='cuda:0', grad_fn=<NllLossBackward>)



 56%|█████▋    | 195/346 [17:08<13:16,  5.27s/it][A

loss: tensor(0.9296, device='cuda:0', grad_fn=<NllLossBackward>)



 57%|█████▋    | 196/346 [17:13<13:08,  5.26s/it][A

loss: tensor(0.9495, device='cuda:0', grad_fn=<NllLossBackward>)



 57%|█████▋    | 197/346 [17:18<13:03,  5.26s/it][A

loss: tensor(0.9282, device='cuda:0', grad_fn=<NllLossBackward>)



 57%|█████▋    | 198/346 [17:24<12:58,  5.26s/it][A

loss: tensor(0.9317, device='cuda:0', grad_fn=<NllLossBackward>)



 58%|█████▊    | 199/346 [17:29<12:56,  5.28s/it][A

loss: tensor(0.9539, device='cuda:0', grad_fn=<NllLossBackward>)



 58%|█████▊    | 200/346 [17:34<12:49,  5.27s/it][A

loss: tensor(0.9516, device='cuda:0', grad_fn=<NllLossBackward>)



 58%|█████▊    | 201/346 [17:40<12:43,  5.26s/it][A

loss: tensor(0.9150, device='cuda:0', grad_fn=<NllLossBackward>)



 58%|█████▊    | 202/346 [17:45<12:36,  5.26s/it][A

loss: tensor(0.9049, device='cuda:0', grad_fn=<NllLossBackward>)



 59%|█████▊    | 203/346 [17:50<12:34,  5.28s/it][A

loss: tensor(0.9049, device='cuda:0', grad_fn=<NllLossBackward>)



 59%|█████▉    | 204/346 [17:55<12:28,  5.27s/it][A

loss: tensor(0.9365, device='cuda:0', grad_fn=<NllLossBackward>)



 59%|█████▉    | 205/346 [18:01<12:23,  5.27s/it][A

loss: tensor(1.0175, device='cuda:0', grad_fn=<NllLossBackward>)



 60%|█████▉    | 206/346 [18:06<12:19,  5.28s/it][A

loss: tensor(0.9654, device='cuda:0', grad_fn=<NllLossBackward>)



 60%|█████▉    | 207/346 [18:11<12:12,  5.27s/it][A

loss: tensor(0.9366, device='cuda:0', grad_fn=<NllLossBackward>)



 60%|██████    | 208/346 [18:16<12:05,  5.26s/it][A

loss: tensor(0.9283, device='cuda:0', grad_fn=<NllLossBackward>)



 60%|██████    | 209/346 [18:22<12:01,  5.27s/it][A

loss: tensor(0.9936, device='cuda:0', grad_fn=<NllLossBackward>)



 61%|██████    | 210/346 [18:27<11:58,  5.29s/it][A

loss: tensor(0.9240, device='cuda:0', grad_fn=<NllLossBackward>)



 61%|██████    | 211/346 [18:32<11:50,  5.26s/it][A

loss: tensor(0.9049, device='cuda:0', grad_fn=<NllLossBackward>)



 61%|██████▏   | 212/346 [18:38<11:46,  5.27s/it][A

loss: tensor(0.9609, device='cuda:0', grad_fn=<NllLossBackward>)



 62%|██████▏   | 213/346 [18:43<11:40,  5.26s/it][A

loss: tensor(0.9049, device='cuda:0', grad_fn=<NllLossBackward>)



 62%|██████▏   | 214/346 [18:48<11:36,  5.28s/it][A

loss: tensor(0.9640, device='cuda:0', grad_fn=<NllLossBackward>)



 62%|██████▏   | 215/346 [18:53<11:30,  5.27s/it][A

loss: tensor(0.9556, device='cuda:0', grad_fn=<NllLossBackward>)



 62%|██████▏   | 216/346 [18:59<11:24,  5.26s/it][A

loss: tensor(0.9524, device='cuda:0', grad_fn=<NllLossBackward>)



 63%|██████▎   | 217/346 [19:04<11:17,  5.25s/it][A

loss: tensor(0.9520, device='cuda:0', grad_fn=<NllLossBackward>)



 63%|██████▎   | 218/346 [19:09<11:14,  5.27s/it][A

loss: tensor(0.9315, device='cuda:0', grad_fn=<NllLossBackward>)



 63%|██████▎   | 219/346 [19:14<11:09,  5.27s/it][A

loss: tensor(0.9256, device='cuda:0', grad_fn=<NllLossBackward>)



 64%|██████▎   | 220/346 [19:20<11:02,  5.26s/it][A

loss: tensor(0.9583, device='cuda:0', grad_fn=<NllLossBackward>)



 64%|██████▍   | 221/346 [19:25<10:56,  5.25s/it][A

loss: tensor(0.9359, device='cuda:0', grad_fn=<NllLossBackward>)



 64%|██████▍   | 222/346 [19:30<10:53,  5.27s/it][A

loss: tensor(0.9297, device='cuda:0', grad_fn=<NllLossBackward>)



 64%|██████▍   | 223/346 [19:35<10:46,  5.25s/it][A

loss: tensor(0.9049, device='cuda:0', grad_fn=<NllLossBackward>)



 65%|██████▍   | 224/346 [19:41<10:40,  5.25s/it][A

loss: tensor(0.9703, device='cuda:0', grad_fn=<NllLossBackward>)



 65%|██████▌   | 225/346 [19:46<10:35,  5.25s/it][A

loss: tensor(0.9782, device='cuda:0', grad_fn=<NllLossBackward>)



 65%|██████▌   | 226/346 [19:51<10:32,  5.27s/it][A

loss: tensor(0.9049, device='cuda:0', grad_fn=<NllLossBackward>)



 66%|██████▌   | 227/346 [19:57<10:28,  5.28s/it][A

loss: tensor(0.9154, device='cuda:0', grad_fn=<NllLossBackward>)



 66%|██████▌   | 228/346 [20:02<10:22,  5.27s/it][A

loss: tensor(0.9256, device='cuda:0', grad_fn=<NllLossBackward>)



 66%|██████▌   | 229/346 [20:07<10:15,  5.26s/it][A

loss: tensor(0.9674, device='cuda:0', grad_fn=<NllLossBackward>)



 66%|██████▋   | 230/346 [20:12<10:12,  5.28s/it][A

loss: tensor(0.9815, device='cuda:0', grad_fn=<NllLossBackward>)



 67%|██████▋   | 231/346 [20:18<10:05,  5.26s/it][A

loss: tensor(0.9049, device='cuda:0', grad_fn=<NllLossBackward>)



 67%|██████▋   | 232/346 [20:23<09:58,  5.25s/it][A

loss: tensor(0.9967, device='cuda:0', grad_fn=<NllLossBackward>)



 67%|██████▋   | 233/346 [20:28<09:54,  5.26s/it][A

loss: tensor(0.9480, device='cuda:0', grad_fn=<NllLossBackward>)



 68%|██████▊   | 234/346 [20:33<09:47,  5.24s/it][A

loss: tensor(0.9049, device='cuda:0', grad_fn=<NllLossBackward>)



 68%|██████▊   | 235/346 [20:39<09:41,  5.24s/it][A

loss: tensor(0.9150, device='cuda:0', grad_fn=<NllLossBackward>)



 68%|██████▊   | 236/346 [20:44<09:35,  5.23s/it][A

loss: tensor(0.9161, device='cuda:0', grad_fn=<NllLossBackward>)



 68%|██████▊   | 237/346 [20:49<09:33,  5.26s/it][A

loss: tensor(0.9049, device='cuda:0', grad_fn=<NllLossBackward>)



 69%|██████▉   | 238/346 [20:54<09:27,  5.25s/it][A

loss: tensor(0.9472, device='cuda:0', grad_fn=<NllLossBackward>)



 69%|██████▉   | 239/346 [21:00<09:20,  5.24s/it][A

loss: tensor(0.9133, device='cuda:0', grad_fn=<NllLossBackward>)



 69%|██████▉   | 240/346 [21:05<09:14,  5.23s/it][A

loss: tensor(0.9693, device='cuda:0', grad_fn=<NllLossBackward>)



 70%|██████▉   | 241/346 [21:10<09:12,  5.26s/it][A

loss: tensor(0.9632, device='cuda:0', grad_fn=<NllLossBackward>)



 70%|██████▉   | 242/346 [21:15<09:05,  5.25s/it][A

loss: tensor(0.9049, device='cuda:0', grad_fn=<NllLossBackward>)



 70%|███████   | 243/346 [21:21<09:00,  5.25s/it][A

loss: tensor(0.9317, device='cuda:0', grad_fn=<NllLossBackward>)



 71%|███████   | 244/346 [21:26<08:54,  5.24s/it][A

loss: tensor(0.9400, device='cuda:0', grad_fn=<NllLossBackward>)



 71%|███████   | 245/346 [21:31<08:50,  5.25s/it][A

loss: tensor(0.9271, device='cuda:0', grad_fn=<NllLossBackward>)



 71%|███████   | 246/346 [21:36<08:43,  5.23s/it][A

loss: tensor(0.9733, device='cuda:0', grad_fn=<NllLossBackward>)



 71%|███████▏  | 247/346 [21:41<08:37,  5.22s/it][A

loss: tensor(0.9499, device='cuda:0', grad_fn=<NllLossBackward>)



 72%|███████▏  | 248/346 [21:47<08:31,  5.22s/it][A

loss: tensor(0.9611, device='cuda:0', grad_fn=<NllLossBackward>)



 72%|███████▏  | 249/346 [21:52<08:28,  5.24s/it][A

loss: tensor(0.9526, device='cuda:0', grad_fn=<NllLossBackward>)



 72%|███████▏  | 250/346 [21:57<08:21,  5.23s/it][A

loss: tensor(0.9502, device='cuda:0', grad_fn=<NllLossBackward>)



 73%|███████▎  | 251/346 [22:02<08:16,  5.23s/it][A

loss: tensor(0.9049, device='cuda:0', grad_fn=<NllLossBackward>)



 73%|███████▎  | 252/346 [22:08<08:12,  5.24s/it][A

loss: tensor(0.9201, device='cuda:0', grad_fn=<NllLossBackward>)



 73%|███████▎  | 253/346 [22:13<08:07,  5.24s/it][A

loss: tensor(0.9049, device='cuda:0', grad_fn=<NllLossBackward>)



 73%|███████▎  | 254/346 [22:18<08:01,  5.23s/it][A

loss: tensor(0.9335, device='cuda:0', grad_fn=<NllLossBackward>)



 74%|███████▎  | 255/346 [22:23<07:56,  5.23s/it][A

loss: tensor(0.9494, device='cuda:0', grad_fn=<NllLossBackward>)



 74%|███████▍  | 256/346 [22:29<07:51,  5.24s/it][A

loss: tensor(0.9318, device='cuda:0', grad_fn=<NllLossBackward>)



 74%|███████▍  | 257/346 [22:34<07:44,  5.22s/it][A

loss: tensor(0.9472, device='cuda:0', grad_fn=<NllLossBackward>)



 75%|███████▍  | 258/346 [22:39<07:38,  5.21s/it][A

loss: tensor(0.9933, device='cuda:0', grad_fn=<NllLossBackward>)



 75%|███████▍  | 259/346 [22:44<07:32,  5.20s/it][A

loss: tensor(0.9372, device='cuda:0', grad_fn=<NllLossBackward>)



 75%|███████▌  | 260/346 [22:49<07:28,  5.21s/it][A

loss: tensor(0.9621, device='cuda:0', grad_fn=<NllLossBackward>)



 75%|███████▌  | 261/346 [22:55<07:23,  5.21s/it][A

loss: tensor(0.9438, device='cuda:0', grad_fn=<NllLossBackward>)



 76%|███████▌  | 262/346 [23:00<07:17,  5.21s/it][A

loss: tensor(0.9320, device='cuda:0', grad_fn=<NllLossBackward>)



 76%|███████▌  | 263/346 [23:05<07:12,  5.21s/it][A

loss: tensor(0.9508, device='cuda:0', grad_fn=<NllLossBackward>)



 76%|███████▋  | 264/346 [23:10<07:08,  5.23s/it][A

loss: tensor(0.9049, device='cuda:0', grad_fn=<NllLossBackward>)



 77%|███████▋  | 265/346 [23:15<07:03,  5.23s/it][A

loss: tensor(0.9222, device='cuda:0', grad_fn=<NllLossBackward>)



 77%|███████▋  | 266/346 [23:21<06:57,  5.22s/it][A

loss: tensor(0.9425, device='cuda:0', grad_fn=<NllLossBackward>)



 77%|███████▋  | 267/346 [23:26<06:51,  5.21s/it][A

loss: tensor(0.9874, device='cuda:0', grad_fn=<NllLossBackward>)



 77%|███████▋  | 268/346 [23:31<06:48,  5.23s/it][A

loss: tensor(0.9470, device='cuda:0', grad_fn=<NllLossBackward>)



 78%|███████▊  | 269/346 [23:36<06:42,  5.22s/it][A

loss: tensor(0.9049, device='cuda:0', grad_fn=<NllLossBackward>)



 78%|███████▊  | 270/346 [23:42<06:36,  5.22s/it][A

loss: tensor(0.9724, device='cuda:0', grad_fn=<NllLossBackward>)



 78%|███████▊  | 271/346 [23:47<06:31,  5.22s/it][A

loss: tensor(0.9319, device='cuda:0', grad_fn=<NllLossBackward>)



 79%|███████▊  | 272/346 [23:52<06:28,  5.25s/it][A

loss: tensor(0.9757, device='cuda:0', grad_fn=<NllLossBackward>)



 79%|███████▉  | 273/346 [23:57<06:22,  5.24s/it][A

loss: tensor(0.9990, device='cuda:0', grad_fn=<NllLossBackward>)



 79%|███████▉  | 274/346 [24:03<06:17,  5.25s/it][A

loss: tensor(0.9180, device='cuda:0', grad_fn=<NllLossBackward>)



 79%|███████▉  | 275/346 [24:08<06:12,  5.24s/it][A

loss: tensor(0.9462, device='cuda:0', grad_fn=<NllLossBackward>)



 80%|███████▉  | 276/346 [24:13<06:07,  5.26s/it][A

loss: tensor(0.9287, device='cuda:0', grad_fn=<NllLossBackward>)



 80%|████████  | 277/346 [24:18<06:02,  5.25s/it][A

loss: tensor(0.9216, device='cuda:0', grad_fn=<NllLossBackward>)



 80%|████████  | 278/346 [24:24<05:56,  5.24s/it][A

loss: tensor(0.9049, device='cuda:0', grad_fn=<NllLossBackward>)



 81%|████████  | 279/346 [24:29<05:51,  5.25s/it][A

loss: tensor(0.9709, device='cuda:0', grad_fn=<NllLossBackward>)



 81%|████████  | 280/346 [24:34<05:45,  5.24s/it][A

loss: tensor(0.9325, device='cuda:0', grad_fn=<NllLossBackward>)



 81%|████████  | 281/346 [24:39<05:40,  5.23s/it][A

loss: tensor(0.9329, device='cuda:0', grad_fn=<NllLossBackward>)



 82%|████████▏ | 282/346 [24:44<05:35,  5.24s/it][A

loss: tensor(0.9448, device='cuda:0', grad_fn=<NllLossBackward>)



 82%|████████▏ | 283/346 [24:50<05:30,  5.25s/it][A

loss: tensor(0.9449, device='cuda:0', grad_fn=<NllLossBackward>)



 82%|████████▏ | 284/346 [24:55<05:25,  5.25s/it][A

loss: tensor(0.9165, device='cuda:0', grad_fn=<NllLossBackward>)



 82%|████████▏ | 285/346 [25:00<05:19,  5.24s/it][A

loss: tensor(0.9261, device='cuda:0', grad_fn=<NllLossBackward>)



 83%|████████▎ | 286/346 [25:05<05:13,  5.23s/it][A

loss: tensor(0.9757, device='cuda:0', grad_fn=<NllLossBackward>)



 83%|████████▎ | 287/346 [25:11<05:09,  5.25s/it][A

loss: tensor(0.9128, device='cuda:0', grad_fn=<NllLossBackward>)



 83%|████████▎ | 288/346 [25:16<05:03,  5.24s/it][A

loss: tensor(0.9368, device='cuda:0', grad_fn=<NllLossBackward>)



 84%|████████▎ | 289/346 [25:21<04:58,  5.24s/it][A

loss: tensor(0.9428, device='cuda:0', grad_fn=<NllLossBackward>)



 84%|████████▍ | 290/346 [25:26<04:53,  5.23s/it][A

loss: tensor(0.9592, device='cuda:0', grad_fn=<NllLossBackward>)



 84%|████████▍ | 291/346 [25:32<04:48,  5.25s/it][A

loss: tensor(0.9818, device='cuda:0', grad_fn=<NllLossBackward>)



 84%|████████▍ | 292/346 [25:37<04:43,  5.24s/it][A

loss: tensor(0.9143, device='cuda:0', grad_fn=<NllLossBackward>)



 85%|████████▍ | 293/346 [25:42<04:37,  5.24s/it][A

loss: tensor(0.9616, device='cuda:0', grad_fn=<NllLossBackward>)



 85%|████████▍ | 294/346 [25:47<04:32,  5.23s/it][A

loss: tensor(0.9854, device='cuda:0', grad_fn=<NllLossBackward>)



 85%|████████▌ | 295/346 [25:53<04:28,  5.26s/it][A

loss: tensor(0.9049, device='cuda:0', grad_fn=<NllLossBackward>)



 86%|████████▌ | 296/346 [25:58<04:22,  5.25s/it][A

loss: tensor(0.9351, device='cuda:0', grad_fn=<NllLossBackward>)



 86%|████████▌ | 297/346 [26:03<04:16,  5.24s/it][A

loss: tensor(0.9249, device='cuda:0', grad_fn=<NllLossBackward>)



 86%|████████▌ | 298/346 [26:08<04:11,  5.24s/it][A

loss: tensor(0.9470, device='cuda:0', grad_fn=<NllLossBackward>)



 86%|████████▋ | 299/346 [26:14<04:06,  5.25s/it][A

loss: tensor(0.9215, device='cuda:0', grad_fn=<NllLossBackward>)



 87%|████████▋ | 300/346 [26:19<04:01,  5.25s/it][A

loss: tensor(0.9270, device='cuda:0', grad_fn=<NllLossBackward>)



 87%|████████▋ | 301/346 [26:24<03:55,  5.24s/it][A

loss: tensor(0.9049, device='cuda:0', grad_fn=<NllLossBackward>)



 87%|████████▋ | 302/346 [26:29<03:51,  5.26s/it][A

loss: tensor(0.9049, device='cuda:0', grad_fn=<NllLossBackward>)



 88%|████████▊ | 303/346 [26:35<03:45,  5.25s/it][A

loss: tensor(0.9449, device='cuda:0', grad_fn=<NllLossBackward>)



 88%|████████▊ | 304/346 [26:40<03:40,  5.25s/it][A

loss: tensor(0.9811, device='cuda:0', grad_fn=<NllLossBackward>)



 88%|████████▊ | 305/346 [26:45<03:34,  5.24s/it][A

loss: tensor(0.9603, device='cuda:0', grad_fn=<NllLossBackward>)



 88%|████████▊ | 306/346 [26:50<03:30,  5.25s/it][A

loss: tensor(0.9191, device='cuda:0', grad_fn=<NllLossBackward>)



 89%|████████▊ | 307/346 [26:56<03:24,  5.24s/it][A

loss: tensor(0.9346, device='cuda:0', grad_fn=<NllLossBackward>)



 89%|████████▉ | 308/346 [27:01<03:19,  5.24s/it][A

loss: tensor(0.9049, device='cuda:0', grad_fn=<NllLossBackward>)



 89%|████████▉ | 309/346 [27:06<03:13,  5.23s/it][A

loss: tensor(0.9172, device='cuda:0', grad_fn=<NllLossBackward>)



 90%|████████▉ | 310/346 [27:11<03:09,  5.26s/it][A

loss: tensor(0.9159, device='cuda:0', grad_fn=<NllLossBackward>)



 90%|████████▉ | 311/346 [27:17<03:03,  5.25s/it][A

loss: tensor(0.9876, device='cuda:0', grad_fn=<NllLossBackward>)



 90%|█████████ | 312/346 [27:22<02:58,  5.24s/it][A

loss: tensor(0.9243, device='cuda:0', grad_fn=<NllLossBackward>)



 90%|█████████ | 313/346 [27:27<02:52,  5.24s/it][A

loss: tensor(0.9136, device='cuda:0', grad_fn=<NllLossBackward>)



 91%|█████████ | 314/346 [27:32<02:48,  5.26s/it][A

loss: tensor(0.9049, device='cuda:0', grad_fn=<NllLossBackward>)



 91%|█████████ | 315/346 [27:38<02:42,  5.25s/it][A

loss: tensor(0.9833, device='cuda:0', grad_fn=<NllLossBackward>)



 91%|█████████▏| 316/346 [27:43<02:37,  5.25s/it][A

loss: tensor(0.9624, device='cuda:0', grad_fn=<NllLossBackward>)



 92%|█████████▏| 317/346 [27:48<02:32,  5.24s/it][A

loss: tensor(0.9482, device='cuda:0', grad_fn=<NllLossBackward>)



 92%|█████████▏| 318/346 [27:53<02:27,  5.26s/it][A

loss: tensor(0.9355, device='cuda:0', grad_fn=<NllLossBackward>)



 92%|█████████▏| 319/346 [27:59<02:21,  5.25s/it][A

loss: tensor(0.9440, device='cuda:0', grad_fn=<NllLossBackward>)



 92%|█████████▏| 320/346 [28:04<02:16,  5.25s/it][A

loss: tensor(0.9617, device='cuda:0', grad_fn=<NllLossBackward>)



 93%|█████████▎| 321/346 [28:09<02:11,  5.24s/it][A

loss: tensor(0.9713, device='cuda:0', grad_fn=<NllLossBackward>)



 93%|█████████▎| 322/346 [28:14<02:06,  5.27s/it][A

loss: tensor(0.9049, device='cuda:0', grad_fn=<NllLossBackward>)



 93%|█████████▎| 323/346 [28:20<02:01,  5.26s/it][A

loss: tensor(0.9198, device='cuda:0', grad_fn=<NllLossBackward>)



 94%|█████████▎| 324/346 [28:25<01:55,  5.26s/it][A

loss: tensor(0.9536, device='cuda:0', grad_fn=<NllLossBackward>)



 94%|█████████▍| 325/346 [28:30<01:50,  5.27s/it][A

loss: tensor(0.9627, device='cuda:0', grad_fn=<NllLossBackward>)



 94%|█████████▍| 326/346 [28:35<01:45,  5.25s/it][A

loss: tensor(0.9736, device='cuda:0', grad_fn=<NllLossBackward>)



 95%|█████████▍| 327/346 [28:41<01:39,  5.25s/it][A

loss: tensor(0.9049, device='cuda:0', grad_fn=<NllLossBackward>)



 95%|█████████▍| 328/346 [28:46<01:34,  5.24s/it][A

loss: tensor(0.9218, device='cuda:0', grad_fn=<NllLossBackward>)



 95%|█████████▌| 329/346 [28:51<01:29,  5.26s/it][A

loss: tensor(0.9375, device='cuda:0', grad_fn=<NllLossBackward>)



 95%|█████████▌| 330/346 [28:56<01:23,  5.25s/it][A

loss: tensor(0.9327, device='cuda:0', grad_fn=<NllLossBackward>)



 96%|█████████▌| 331/346 [29:02<01:18,  5.24s/it][A

loss: tensor(0.9674, device='cuda:0', grad_fn=<NllLossBackward>)



 96%|█████████▌| 332/346 [29:07<01:13,  5.23s/it][A

loss: tensor(0.9357, device='cuda:0', grad_fn=<NllLossBackward>)



 96%|█████████▌| 333/346 [29:12<01:08,  5.25s/it][A

loss: tensor(0.9267, device='cuda:0', grad_fn=<NllLossBackward>)



 97%|█████████▋| 334/346 [29:17<01:02,  5.24s/it][A

loss: tensor(0.9477, device='cuda:0', grad_fn=<NllLossBackward>)



 97%|█████████▋| 335/346 [29:23<00:57,  5.24s/it][A

loss: tensor(0.9049, device='cuda:0', grad_fn=<NllLossBackward>)



 97%|█████████▋| 336/346 [29:28<00:52,  5.23s/it][A

loss: tensor(0.9804, device='cuda:0', grad_fn=<NllLossBackward>)



 97%|█████████▋| 337/346 [29:33<00:47,  5.24s/it][A

loss: tensor(0.9616, device='cuda:0', grad_fn=<NllLossBackward>)



 98%|█████████▊| 338/346 [29:38<00:41,  5.23s/it][A

loss: tensor(0.9485, device='cuda:0', grad_fn=<NllLossBackward>)



 98%|█████████▊| 339/346 [29:43<00:36,  5.22s/it][A

loss: tensor(0.9546, device='cuda:0', grad_fn=<NllLossBackward>)



 98%|█████████▊| 340/346 [29:49<00:31,  5.23s/it][A

loss: tensor(0.9729, device='cuda:0', grad_fn=<NllLossBackward>)



 99%|█████████▊| 341/346 [29:54<00:26,  5.25s/it][A

loss: tensor(0.9413, device='cuda:0', grad_fn=<NllLossBackward>)



 99%|█████████▉| 342/346 [29:59<00:20,  5.25s/it][A

loss: tensor(0.9257, device='cuda:0', grad_fn=<NllLossBackward>)



 99%|█████████▉| 343/346 [30:04<00:15,  5.24s/it][A

loss: tensor(0.9764, device='cuda:0', grad_fn=<NllLossBackward>)



 99%|█████████▉| 344/346 [30:10<00:10,  5.26s/it][A

loss: tensor(0.9366, device='cuda:0', grad_fn=<NllLossBackward>)



100%|█████████▉| 345/346 [30:15<00:05,  5.25s/it][A

loss: tensor(0.9627, device='cuda:0', grad_fn=<NllLossBackward>)



100%|██████████| 346/346 [30:15<00:00,  5.25s/it][A

  0%|          | 0/173 [00:00<?, ?it/s][A

loss: tensor(0.9049, device='cuda:0', grad_fn=<NllLossBackward>)

	Training Loss: 0.9415410674376294

	Training acc: 0.9633893215801959

	Training prec: 0.9288306294858794

	Training rec: 0.9633893215801959

	Training f1: 0.9456152114184831

	Current Learning rate:  3e-05



  1%|          | 1/173 [00:00<02:00,  1.43it/s][A
  1%|          | 2/173 [00:01<01:51,  1.53it/s][A
  2%|▏         | 3/173 [00:02<01:54,  1.49it/s][A
  2%|▏         | 4/173 [00:02<01:55,  1.47it/s][A
  3%|▎         | 5/173 [00:03<01:50,  1.52it/s][A
  3%|▎         | 6/173 [00:04<01:52,  1.49it/s][A
  4%|▍         | 7/173 [00:04<01:53,  1.47it/s][A
  5%|▍         | 8/173 [00:05<01:49,  1.51it/s][A
  5%|▌         | 9/173 [00:06<01:50,  1.49it/s][A
  6%|▌         | 10/173 [00:06<01:51,  1.47it/s][A
  6%|▋         | 11/173 [00:07<01:47,  1.51it/s][A
  7%|▋         | 12/173 [00:08<01:48,  1.49it/s][A
  8%|▊         | 13/173 [00:08<01:48,  1.47it/s][A
  8%|▊         | 14/173 [00:09<01:44,  1.53it/s][A
  9%|▊         | 15/173 [00:10<01:45,  1.50it/s][A
  9%|▉         | 16/173 [00:10<01:46,  1.48it/s][A
 10%|▉         | 17/173 [00:11<01:42,  1.52it/s][A
 10%|█         | 18/173 [00:12<01:43,  1.49it/s][A
 11%|█         | 19/173 [00:12<01:44,  1.47it/s][A
 12%|█▏        | 20/


	Validation Loss: 0.9462702088273329

	Validation acc: 0.9585774712684025

	Validation prec: 0.9205964394244468

	Validation rec: 0.9585774712684025

	Validation f1: 0.9387723876502765



  0%|          | 1/346 [00:05<30:07,  5.24s/it][A

loss: tensor(0.9049, device='cuda:0', grad_fn=<NllLossBackward>)



  1%|          | 2/346 [00:10<30:12,  5.27s/it][A

loss: tensor(0.9567, device='cuda:0', grad_fn=<NllLossBackward>)



  1%|          | 3/346 [00:15<30:00,  5.25s/it][A

loss: tensor(0.9362, device='cuda:0', grad_fn=<NllLossBackward>)



  1%|          | 4/346 [00:20<29:50,  5.24s/it][A

loss: tensor(0.9645, device='cuda:0', grad_fn=<NllLossBackward>)



  1%|▏         | 5/346 [00:26<29:46,  5.24s/it][A

loss: tensor(0.9049, device='cuda:0', grad_fn=<NllLossBackward>)



  2%|▏         | 6/346 [00:31<29:46,  5.26s/it][A

loss: tensor(0.9655, device='cuda:0', grad_fn=<NllLossBackward>)



  2%|▏         | 7/346 [00:36<29:34,  5.23s/it][A

loss: tensor(0.9379, device='cuda:0', grad_fn=<NllLossBackward>)



  2%|▏         | 8/346 [00:41<29:29,  5.24s/it][A

loss: tensor(0.9979, device='cuda:0', grad_fn=<NllLossBackward>)



  3%|▎         | 9/346 [00:47<29:23,  5.23s/it][A

loss: tensor(0.9348, device='cuda:0', grad_fn=<NllLossBackward>)



  3%|▎         | 10/346 [00:52<29:27,  5.26s/it][A

loss: tensor(0.9529, device='cuda:0', grad_fn=<NllLossBackward>)



  3%|▎         | 11/346 [00:57<29:14,  5.24s/it][A

loss: tensor(0.9049, device='cuda:0', grad_fn=<NllLossBackward>)



  3%|▎         | 12/346 [01:02<29:09,  5.24s/it][A

loss: tensor(0.9049, device='cuda:0', grad_fn=<NllLossBackward>)



  4%|▍         | 13/346 [01:08<28:58,  5.22s/it][A

loss: tensor(0.9413, device='cuda:0', grad_fn=<NllLossBackward>)



  4%|▍         | 14/346 [01:13<28:58,  5.24s/it][A

loss: tensor(0.9569, device='cuda:0', grad_fn=<NllLossBackward>)



  4%|▍         | 15/346 [01:18<28:53,  5.24s/it][A

loss: tensor(0.9222, device='cuda:0', grad_fn=<NllLossBackward>)



  5%|▍         | 16/346 [01:23<28:45,  5.23s/it][A

loss: tensor(0.9424, device='cuda:0', grad_fn=<NllLossBackward>)



  5%|▍         | 17/346 [01:29<28:42,  5.23s/it][A

loss: tensor(0.9791, device='cuda:0', grad_fn=<NllLossBackward>)



  5%|▌         | 18/346 [01:34<28:42,  5.25s/it][A

loss: tensor(0.9221, device='cuda:0', grad_fn=<NllLossBackward>)



  5%|▌         | 19/346 [01:39<28:38,  5.25s/it][A

loss: tensor(0.9536, device='cuda:0', grad_fn=<NllLossBackward>)



  6%|▌         | 20/346 [01:44<28:29,  5.24s/it][A

loss: tensor(0.9263, device='cuda:0', grad_fn=<NllLossBackward>)



  6%|▌         | 21/346 [01:50<28:24,  5.24s/it][A

loss: tensor(0.9365, device='cuda:0', grad_fn=<NllLossBackward>)



  6%|▋         | 22/346 [01:55<28:24,  5.26s/it][A

loss: tensor(0.9145, device='cuda:0', grad_fn=<NllLossBackward>)



  7%|▋         | 23/346 [02:00<28:13,  5.24s/it][A

loss: tensor(0.9049, device='cuda:0', grad_fn=<NllLossBackward>)



  7%|▋         | 24/346 [02:05<28:06,  5.24s/it][A

loss: tensor(0.9734, device='cuda:0', grad_fn=<NllLossBackward>)



  7%|▋         | 25/346 [02:11<27:58,  5.23s/it][A

loss: tensor(0.9257, device='cuda:0', grad_fn=<NllLossBackward>)



  8%|▊         | 26/346 [02:16<27:55,  5.24s/it][A

loss: tensor(0.9230, device='cuda:0', grad_fn=<NllLossBackward>)



  8%|▊         | 27/346 [02:21<27:45,  5.22s/it][A

loss: tensor(0.9466, device='cuda:0', grad_fn=<NllLossBackward>)



  8%|▊         | 28/346 [02:26<27:38,  5.22s/it][A

loss: tensor(0.9241, device='cuda:0', grad_fn=<NllLossBackward>)



  8%|▊         | 29/346 [02:31<27:35,  5.22s/it][A

loss: tensor(0.9289, device='cuda:0', grad_fn=<NllLossBackward>)



  9%|▊         | 30/346 [02:37<27:23,  5.20s/it][A

loss: tensor(0.9511, device='cuda:0', grad_fn=<NllLossBackward>)



  9%|▉         | 31/346 [02:42<27:18,  5.20s/it][A

loss: tensor(0.9274, device='cuda:0', grad_fn=<NllLossBackward>)



  9%|▉         | 32/346 [02:47<27:11,  5.20s/it][A

loss: tensor(0.9249, device='cuda:0', grad_fn=<NllLossBackward>)



 10%|▉         | 33/346 [02:52<27:10,  5.21s/it][A

loss: tensor(1.0064, device='cuda:0', grad_fn=<NllLossBackward>)



 10%|▉         | 34/346 [02:57<27:02,  5.20s/it][A

loss: tensor(0.9507, device='cuda:0', grad_fn=<NllLossBackward>)



 10%|█         | 35/346 [03:03<26:56,  5.20s/it][A

loss: tensor(0.9403, device='cuda:0', grad_fn=<NllLossBackward>)



 10%|█         | 36/346 [03:08<26:47,  5.18s/it][A

loss: tensor(0.9140, device='cuda:0', grad_fn=<NllLossBackward>)



 11%|█         | 37/346 [03:13<26:47,  5.20s/it][A

loss: tensor(0.9271, device='cuda:0', grad_fn=<NllLossBackward>)



 11%|█         | 38/346 [03:18<26:39,  5.19s/it][A

loss: tensor(0.9837, device='cuda:0', grad_fn=<NllLossBackward>)



 11%|█▏        | 39/346 [03:23<26:31,  5.18s/it][A

loss: tensor(0.9317, device='cuda:0', grad_fn=<NllLossBackward>)



 12%|█▏        | 40/346 [03:28<26:25,  5.18s/it][A

loss: tensor(0.9195, device='cuda:0', grad_fn=<NllLossBackward>)



 12%|█▏        | 41/346 [03:34<26:25,  5.20s/it][A

loss: tensor(0.9749, device='cuda:0', grad_fn=<NllLossBackward>)



 12%|█▏        | 42/346 [03:39<26:16,  5.19s/it][A

loss: tensor(0.9246, device='cuda:0', grad_fn=<NllLossBackward>)



 12%|█▏        | 43/346 [03:44<26:12,  5.19s/it][A

loss: tensor(0.9049, device='cuda:0', grad_fn=<NllLossBackward>)



 13%|█▎        | 44/346 [03:49<26:05,  5.18s/it][A

loss: tensor(0.9332, device='cuda:0', grad_fn=<NllLossBackward>)



 13%|█▎        | 45/346 [03:54<26:05,  5.20s/it][A

loss: tensor(0.9530, device='cuda:0', grad_fn=<NllLossBackward>)



 13%|█▎        | 46/346 [04:00<25:58,  5.19s/it][A

loss: tensor(1.0017, device='cuda:0', grad_fn=<NllLossBackward>)



 14%|█▎        | 47/346 [04:05<25:48,  5.18s/it][A

loss: tensor(0.9552, device='cuda:0', grad_fn=<NllLossBackward>)



 14%|█▍        | 48/346 [04:10<25:47,  5.19s/it][A

loss: tensor(0.9725, device='cuda:0', grad_fn=<NllLossBackward>)



 14%|█▍        | 49/346 [04:15<25:38,  5.18s/it][A

loss: tensor(1.0082, device='cuda:0', grad_fn=<NllLossBackward>)



 14%|█▍        | 50/346 [04:20<25:29,  5.17s/it][A

loss: tensor(0.9677, device='cuda:0', grad_fn=<NllLossBackward>)



 15%|█▍        | 51/346 [04:25<25:24,  5.17s/it][A

loss: tensor(1.0521, device='cuda:0', grad_fn=<NllLossBackward>)



 15%|█▌        | 52/346 [04:31<25:25,  5.19s/it][A

loss: tensor(0.9297, device='cuda:0', grad_fn=<NllLossBackward>)



 15%|█▌        | 53/346 [04:36<25:15,  5.17s/it][A

loss: tensor(0.9864, device='cuda:0', grad_fn=<NllLossBackward>)



 16%|█▌        | 54/346 [04:41<25:12,  5.18s/it][A

loss: tensor(0.9403, device='cuda:0', grad_fn=<NllLossBackward>)



 16%|█▌        | 55/346 [04:46<25:02,  5.16s/it][A

loss: tensor(0.9661, device='cuda:0', grad_fn=<NllLossBackward>)



 16%|█▌        | 56/346 [04:51<25:00,  5.17s/it][A

loss: tensor(0.9049, device='cuda:0', grad_fn=<NllLossBackward>)



 16%|█▋        | 57/346 [04:56<24:51,  5.16s/it][A

loss: tensor(1.0012, device='cuda:0', grad_fn=<NllLossBackward>)



 17%|█▋        | 58/346 [05:02<24:42,  5.15s/it][A

loss: tensor(0.9525, device='cuda:0', grad_fn=<NllLossBackward>)



 17%|█▋        | 59/346 [05:07<24:34,  5.14s/it][A

loss: tensor(0.9426, device='cuda:0', grad_fn=<NllLossBackward>)



 17%|█▋        | 60/346 [05:12<24:35,  5.16s/it][A

loss: tensor(0.9119, device='cuda:0', grad_fn=<NllLossBackward>)



 18%|█▊        | 61/346 [05:17<24:25,  5.14s/it][A

loss: tensor(0.9220, device='cuda:0', grad_fn=<NllLossBackward>)



 18%|█▊        | 62/346 [05:22<24:22,  5.15s/it][A

loss: tensor(0.9623, device='cuda:0', grad_fn=<NllLossBackward>)



 18%|█▊        | 63/346 [05:27<24:20,  5.16s/it][A

loss: tensor(0.9290, device='cuda:0', grad_fn=<NllLossBackward>)



 18%|█▊        | 64/346 [05:33<24:19,  5.17s/it][A

loss: tensor(0.9989, device='cuda:0', grad_fn=<NllLossBackward>)



 19%|█▉        | 65/346 [05:38<24:11,  5.17s/it][A

loss: tensor(0.9251, device='cuda:0', grad_fn=<NllLossBackward>)



 19%|█▉        | 66/346 [05:43<24:05,  5.16s/it][A

loss: tensor(0.9466, device='cuda:0', grad_fn=<NllLossBackward>)



 19%|█▉        | 67/346 [05:48<24:00,  5.16s/it][A

loss: tensor(0.9484, device='cuda:0', grad_fn=<NllLossBackward>)



 20%|█▉        | 68/346 [05:53<23:59,  5.18s/it][A

loss: tensor(0.9804, device='cuda:0', grad_fn=<NllLossBackward>)



 20%|█▉        | 69/346 [05:58<23:50,  5.16s/it][A

loss: tensor(0.9466, device='cuda:0', grad_fn=<NllLossBackward>)



 20%|██        | 70/346 [06:04<23:43,  5.16s/it][A

loss: tensor(0.9482, device='cuda:0', grad_fn=<NllLossBackward>)



 21%|██        | 71/346 [06:09<23:37,  5.15s/it][A

loss: tensor(0.9049, device='cuda:0', grad_fn=<NllLossBackward>)



 21%|██        | 72/346 [06:14<23:37,  5.17s/it][A

loss: tensor(0.9612, device='cuda:0', grad_fn=<NllLossBackward>)



 21%|██        | 73/346 [06:19<23:39,  5.20s/it][A

loss: tensor(0.9766, device='cuda:0', grad_fn=<NllLossBackward>)



 21%|██▏       | 74/346 [06:24<23:37,  5.21s/it][A

loss: tensor(0.9318, device='cuda:0', grad_fn=<NllLossBackward>)



 22%|██▏       | 75/346 [06:30<23:38,  5.23s/it][A

loss: tensor(0.9277, device='cuda:0', grad_fn=<NllLossBackward>)



 22%|██▏       | 76/346 [06:35<23:29,  5.22s/it][A

loss: tensor(0.9635, device='cuda:0', grad_fn=<NllLossBackward>)



 22%|██▏       | 77/346 [06:40<23:25,  5.22s/it][A

loss: tensor(0.9451, device='cuda:0', grad_fn=<NllLossBackward>)



 23%|██▎       | 78/346 [06:45<23:19,  5.22s/it][A

loss: tensor(0.9786, device='cuda:0', grad_fn=<NllLossBackward>)



 23%|██▎       | 79/346 [06:51<23:17,  5.23s/it][A

loss: tensor(0.9996, device='cuda:0', grad_fn=<NllLossBackward>)



 23%|██▎       | 80/346 [06:56<23:11,  5.23s/it][A

loss: tensor(0.9212, device='cuda:0', grad_fn=<NllLossBackward>)



 23%|██▎       | 81/346 [07:01<23:06,  5.23s/it][A

loss: tensor(0.9674, device='cuda:0', grad_fn=<NllLossBackward>)



 24%|██▎       | 82/346 [07:06<22:58,  5.22s/it][A

loss: tensor(0.9572, device='cuda:0', grad_fn=<NllLossBackward>)



 24%|██▍       | 83/346 [07:12<22:59,  5.25s/it][A

loss: tensor(0.9049, device='cuda:0', grad_fn=<NllLossBackward>)



 24%|██▍       | 84/346 [07:17<22:50,  5.23s/it][A

loss: tensor(0.9518, device='cuda:0', grad_fn=<NllLossBackward>)



 25%|██▍       | 85/346 [07:22<22:42,  5.22s/it][A

loss: tensor(0.9654, device='cuda:0', grad_fn=<NllLossBackward>)



 25%|██▍       | 86/346 [07:27<22:37,  5.22s/it][A

loss: tensor(0.9743, device='cuda:0', grad_fn=<NllLossBackward>)



 25%|██▌       | 87/346 [07:32<22:35,  5.23s/it][A

loss: tensor(0.9504, device='cuda:0', grad_fn=<NllLossBackward>)



 25%|██▌       | 88/346 [07:38<22:30,  5.24s/it][A

loss: tensor(0.9609, device='cuda:0', grad_fn=<NllLossBackward>)



 26%|██▌       | 89/346 [07:43<22:23,  5.23s/it][A

loss: tensor(0.9449, device='cuda:0', grad_fn=<NllLossBackward>)



 26%|██▌       | 90/346 [07:48<22:17,  5.22s/it][A

loss: tensor(0.9443, device='cuda:0', grad_fn=<NllLossBackward>)



 26%|██▋       | 91/346 [07:53<22:17,  5.25s/it][A

loss: tensor(0.9296, device='cuda:0', grad_fn=<NllLossBackward>)



 27%|██▋       | 92/346 [07:59<22:12,  5.24s/it][A

loss: tensor(0.9495, device='cuda:0', grad_fn=<NllLossBackward>)



 27%|██▋       | 93/346 [08:04<22:02,  5.23s/it][A

loss: tensor(0.9343, device='cuda:0', grad_fn=<NllLossBackward>)



 27%|██▋       | 94/346 [08:09<22:00,  5.24s/it][A

loss: tensor(0.9780, device='cuda:0', grad_fn=<NllLossBackward>)



 27%|██▋       | 95/346 [08:14<21:52,  5.23s/it][A

loss: tensor(0.9582, device='cuda:0', grad_fn=<NllLossBackward>)



 28%|██▊       | 96/346 [08:20<21:46,  5.23s/it][A

loss: tensor(0.9049, device='cuda:0', grad_fn=<NllLossBackward>)



 28%|██▊       | 97/346 [08:25<21:40,  5.22s/it][A

loss: tensor(0.9525, device='cuda:0', grad_fn=<NllLossBackward>)



 28%|██▊       | 98/346 [08:30<21:38,  5.23s/it][A

loss: tensor(0.9154, device='cuda:0', grad_fn=<NllLossBackward>)



 29%|██▊       | 99/346 [08:35<21:32,  5.23s/it][A

loss: tensor(1.0069, device='cuda:0', grad_fn=<NllLossBackward>)



 29%|██▉       | 100/346 [08:40<21:30,  5.25s/it][A

loss: tensor(0.9049, device='cuda:0', grad_fn=<NllLossBackward>)



 29%|██▉       | 101/346 [08:46<21:22,  5.24s/it][A

loss: tensor(0.9413, device='cuda:0', grad_fn=<NllLossBackward>)



 29%|██▉       | 102/346 [08:51<21:19,  5.24s/it][A

loss: tensor(0.9049, device='cuda:0', grad_fn=<NllLossBackward>)



 30%|██▉       | 103/346 [08:56<21:11,  5.23s/it][A

loss: tensor(0.9307, device='cuda:0', grad_fn=<NllLossBackward>)



 30%|███       | 104/346 [09:01<21:05,  5.23s/it][A

loss: tensor(0.9497, device='cuda:0', grad_fn=<NllLossBackward>)



 30%|███       | 105/346 [09:07<20:58,  5.22s/it][A

loss: tensor(0.9049, device='cuda:0', grad_fn=<NllLossBackward>)



 31%|███       | 106/346 [09:12<20:59,  5.25s/it][A

loss: tensor(0.9173, device='cuda:0', grad_fn=<NllLossBackward>)



 31%|███       | 107/346 [09:17<20:52,  5.24s/it][A

loss: tensor(0.9134, device='cuda:0', grad_fn=<NllLossBackward>)



 31%|███       | 108/346 [09:22<20:45,  5.23s/it][A

loss: tensor(0.9261, device='cuda:0', grad_fn=<NllLossBackward>)



 32%|███▏      | 109/346 [09:28<20:39,  5.23s/it][A

loss: tensor(0.9356, device='cuda:0', grad_fn=<NllLossBackward>)



 32%|███▏      | 110/346 [09:33<20:37,  5.24s/it][A

loss: tensor(0.9438, device='cuda:0', grad_fn=<NllLossBackward>)



 32%|███▏      | 111/346 [09:38<20:28,  5.23s/it][A

loss: tensor(0.9196, device='cuda:0', grad_fn=<NllLossBackward>)



 32%|███▏      | 112/346 [09:43<20:23,  5.23s/it][A

loss: tensor(0.9899, device='cuda:0', grad_fn=<NllLossBackward>)



 33%|███▎      | 113/346 [09:48<20:16,  5.22s/it][A

loss: tensor(0.9049, device='cuda:0', grad_fn=<NllLossBackward>)



 33%|███▎      | 114/346 [09:54<20:15,  5.24s/it][A

loss: tensor(0.9287, device='cuda:0', grad_fn=<NllLossBackward>)



 33%|███▎      | 115/346 [09:59<20:08,  5.23s/it][A

loss: tensor(0.9419, device='cuda:0', grad_fn=<NllLossBackward>)



 34%|███▎      | 116/346 [10:04<20:00,  5.22s/it][A

loss: tensor(0.9313, device='cuda:0', grad_fn=<NllLossBackward>)



 34%|███▍      | 117/346 [10:09<19:53,  5.21s/it][A

loss: tensor(0.9151, device='cuda:0', grad_fn=<NllLossBackward>)



 34%|███▍      | 118/346 [10:15<19:51,  5.23s/it][A

loss: tensor(0.9528, device='cuda:0', grad_fn=<NllLossBackward>)



 34%|███▍      | 119/346 [10:20<19:44,  5.22s/it][A

loss: tensor(0.9370, device='cuda:0', grad_fn=<NllLossBackward>)



 35%|███▍      | 120/346 [10:25<19:37,  5.21s/it][A

loss: tensor(0.9303, device='cuda:0', grad_fn=<NllLossBackward>)



 35%|███▍      | 121/346 [10:30<19:35,  5.23s/it][A

loss: tensor(0.9221, device='cuda:0', grad_fn=<NllLossBackward>)



 35%|███▌      | 122/346 [10:35<19:26,  5.21s/it][A

loss: tensor(0.9196, device='cuda:0', grad_fn=<NllLossBackward>)



 36%|███▌      | 123/346 [10:41<19:20,  5.21s/it][A

loss: tensor(0.9647, device='cuda:0', grad_fn=<NllLossBackward>)



 36%|███▌      | 124/346 [10:46<19:14,  5.20s/it][A

loss: tensor(0.9049, device='cuda:0', grad_fn=<NllLossBackward>)



 36%|███▌      | 125/346 [10:51<19:12,  5.21s/it][A

loss: tensor(0.9328, device='cuda:0', grad_fn=<NllLossBackward>)



 36%|███▋      | 126/346 [10:56<19:04,  5.20s/it][A

loss: tensor(0.9255, device='cuda:0', grad_fn=<NllLossBackward>)



 37%|███▋      | 127/346 [11:01<18:58,  5.20s/it][A

loss: tensor(0.9394, device='cuda:0', grad_fn=<NllLossBackward>)



 37%|███▋      | 128/346 [11:07<18:51,  5.19s/it][A

loss: tensor(0.9290, device='cuda:0', grad_fn=<NllLossBackward>)



 37%|███▋      | 129/346 [11:12<18:51,  5.21s/it][A

loss: tensor(0.9561, device='cuda:0', grad_fn=<NllLossBackward>)



 38%|███▊      | 130/346 [11:17<18:45,  5.21s/it][A

loss: tensor(0.9253, device='cuda:0', grad_fn=<NllLossBackward>)



 38%|███▊      | 131/346 [11:22<18:38,  5.20s/it][A

loss: tensor(0.9205, device='cuda:0', grad_fn=<NllLossBackward>)



 38%|███▊      | 132/346 [11:27<18:34,  5.21s/it][A

loss: tensor(0.9254, device='cuda:0', grad_fn=<NllLossBackward>)



 38%|███▊      | 133/346 [11:33<18:33,  5.23s/it][A

loss: tensor(0.9358, device='cuda:0', grad_fn=<NllLossBackward>)



 39%|███▊      | 134/346 [11:38<18:25,  5.22s/it][A

loss: tensor(0.9671, device='cuda:0', grad_fn=<NllLossBackward>)



 39%|███▉      | 135/346 [11:43<18:18,  5.21s/it][A

loss: tensor(0.9602, device='cuda:0', grad_fn=<NllLossBackward>)



 39%|███▉      | 136/346 [11:48<18:12,  5.20s/it][A

loss: tensor(0.9375, device='cuda:0', grad_fn=<NllLossBackward>)



 40%|███▉      | 137/346 [11:54<18:11,  5.22s/it][A

loss: tensor(0.9352, device='cuda:0', grad_fn=<NllLossBackward>)



 40%|███▉      | 138/346 [11:59<18:02,  5.21s/it][A

loss: tensor(0.9365, device='cuda:0', grad_fn=<NllLossBackward>)



 40%|████      | 139/346 [12:04<17:56,  5.20s/it][A

loss: tensor(0.9372, device='cuda:0', grad_fn=<NllLossBackward>)



 40%|████      | 140/346 [12:09<17:50,  5.20s/it][A

loss: tensor(0.9421, device='cuda:0', grad_fn=<NllLossBackward>)



 41%|████      | 141/346 [12:14<17:49,  5.22s/it][A

loss: tensor(0.9524, device='cuda:0', grad_fn=<NllLossBackward>)



 41%|████      | 142/346 [12:20<17:42,  5.21s/it][A

loss: tensor(0.9602, device='cuda:0', grad_fn=<NllLossBackward>)



 41%|████▏     | 143/346 [12:25<17:36,  5.20s/it][A

loss: tensor(0.9241, device='cuda:0', grad_fn=<NllLossBackward>)



 42%|████▏     | 144/346 [12:30<17:34,  5.22s/it][A

loss: tensor(0.9289, device='cuda:0', grad_fn=<NllLossBackward>)



 42%|████▏     | 145/346 [12:35<17:25,  5.20s/it][A

loss: tensor(0.9844, device='cuda:0', grad_fn=<NllLossBackward>)



 42%|████▏     | 146/346 [12:40<17:19,  5.20s/it][A

loss: tensor(0.9763, device='cuda:0', grad_fn=<NllLossBackward>)



 42%|████▏     | 147/346 [12:46<17:14,  5.20s/it][A

loss: tensor(0.9138, device='cuda:0', grad_fn=<NllLossBackward>)



 43%|████▎     | 148/346 [12:51<17:12,  5.21s/it][A

loss: tensor(0.9699, device='cuda:0', grad_fn=<NllLossBackward>)



 43%|████▎     | 149/346 [12:56<17:05,  5.21s/it][A

loss: tensor(0.9886, device='cuda:0', grad_fn=<NllLossBackward>)



 43%|████▎     | 150/346 [13:01<16:59,  5.20s/it][A

loss: tensor(0.9147, device='cuda:0', grad_fn=<NllLossBackward>)



 44%|████▎     | 151/346 [13:06<16:52,  5.19s/it][A

loss: tensor(0.9253, device='cuda:0', grad_fn=<NllLossBackward>)



 44%|████▍     | 152/346 [13:12<16:50,  5.21s/it][A

loss: tensor(0.9271, device='cuda:0', grad_fn=<NllLossBackward>)



 44%|████▍     | 153/346 [13:17<16:44,  5.20s/it][A

loss: tensor(0.9373, device='cuda:0', grad_fn=<NllLossBackward>)



 45%|████▍     | 154/346 [13:22<16:36,  5.19s/it][A

loss: tensor(0.9760, device='cuda:0', grad_fn=<NllLossBackward>)



 45%|████▍     | 155/346 [13:27<16:31,  5.19s/it][A

loss: tensor(0.9525, device='cuda:0', grad_fn=<NllLossBackward>)



 45%|████▌     | 156/346 [13:32<16:30,  5.21s/it][A

loss: tensor(0.9698, device='cuda:0', grad_fn=<NllLossBackward>)



 45%|████▌     | 157/346 [13:38<16:23,  5.20s/it][A

loss: tensor(0.9049, device='cuda:0', grad_fn=<NllLossBackward>)



 46%|████▌     | 158/346 [13:43<16:16,  5.20s/it][A

loss: tensor(0.9328, device='cuda:0', grad_fn=<NllLossBackward>)



 46%|████▌     | 159/346 [13:48<16:11,  5.20s/it][A

loss: tensor(0.9191, device='cuda:0', grad_fn=<NllLossBackward>)



 46%|████▌     | 160/346 [13:53<16:10,  5.22s/it][A

loss: tensor(0.9401, device='cuda:0', grad_fn=<NllLossBackward>)



 47%|████▋     | 161/346 [13:58<16:03,  5.21s/it][A

loss: tensor(0.9495, device='cuda:0', grad_fn=<NllLossBackward>)



 47%|████▋     | 162/346 [14:04<15:56,  5.20s/it][A

loss: tensor(0.9307, device='cuda:0', grad_fn=<NllLossBackward>)



 47%|████▋     | 163/346 [14:09<15:55,  5.22s/it][A

loss: tensor(0.9453, device='cuda:0', grad_fn=<NllLossBackward>)



 47%|████▋     | 164/346 [14:14<15:47,  5.21s/it][A

loss: tensor(0.9049, device='cuda:0', grad_fn=<NllLossBackward>)



 48%|████▊     | 165/346 [14:19<15:41,  5.20s/it][A

loss: tensor(0.9049, device='cuda:0', grad_fn=<NllLossBackward>)



 48%|████▊     | 166/346 [14:24<15:35,  5.20s/it][A

loss: tensor(0.9597, device='cuda:0', grad_fn=<NllLossBackward>)



 48%|████▊     | 167/346 [14:30<15:33,  5.21s/it][A

loss: tensor(0.9725, device='cuda:0', grad_fn=<NllLossBackward>)



 49%|████▊     | 168/346 [14:35<15:25,  5.20s/it][A

loss: tensor(0.9165, device='cuda:0', grad_fn=<NllLossBackward>)



 49%|████▉     | 169/346 [14:40<15:20,  5.20s/it][A

loss: tensor(0.9602, device='cuda:0', grad_fn=<NllLossBackward>)



 49%|████▉     | 170/346 [14:45<15:14,  5.20s/it][A

loss: tensor(0.9502, device='cuda:0', grad_fn=<NllLossBackward>)



 49%|████▉     | 171/346 [14:50<15:12,  5.22s/it][A

loss: tensor(0.9556, device='cuda:0', grad_fn=<NllLossBackward>)



 50%|████▉     | 172/346 [14:56<15:06,  5.21s/it][A

loss: tensor(0.9547, device='cuda:0', grad_fn=<NllLossBackward>)



 50%|█████     | 173/346 [15:01<15:00,  5.20s/it][A

loss: tensor(0.9049, device='cuda:0', grad_fn=<NllLossBackward>)



 50%|█████     | 174/346 [15:06<14:52,  5.19s/it][A

loss: tensor(0.9264, device='cuda:0', grad_fn=<NllLossBackward>)



 51%|█████     | 175/346 [15:11<14:51,  5.21s/it][A

loss: tensor(0.9266, device='cuda:0', grad_fn=<NllLossBackward>)



 51%|█████     | 176/346 [15:16<14:44,  5.20s/it][A

loss: tensor(0.9049, device='cuda:0', grad_fn=<NllLossBackward>)



 51%|█████     | 177/346 [15:22<14:37,  5.19s/it][A

loss: tensor(0.9462, device='cuda:0', grad_fn=<NllLossBackward>)



 51%|█████▏    | 178/346 [15:27<14:32,  5.19s/it][A

loss: tensor(0.9879, device='cuda:0', grad_fn=<NllLossBackward>)



 52%|█████▏    | 179/346 [15:32<14:30,  5.21s/it][A

loss: tensor(0.9324, device='cuda:0', grad_fn=<NllLossBackward>)



 52%|█████▏    | 180/346 [15:37<14:23,  5.20s/it][A

loss: tensor(0.9049, device='cuda:0', grad_fn=<NllLossBackward>)



 52%|█████▏    | 181/346 [15:42<14:18,  5.20s/it][A

loss: tensor(0.9207, device='cuda:0', grad_fn=<NllLossBackward>)



 53%|█████▎    | 182/346 [15:48<14:13,  5.20s/it][A

loss: tensor(0.9801, device='cuda:0', grad_fn=<NllLossBackward>)



 53%|█████▎    | 183/346 [15:53<14:10,  5.22s/it][A

loss: tensor(0.9440, device='cuda:0', grad_fn=<NllLossBackward>)



 53%|█████▎    | 184/346 [15:58<14:04,  5.21s/it][A

loss: tensor(0.9489, device='cuda:0', grad_fn=<NllLossBackward>)



 53%|█████▎    | 185/346 [16:03<13:57,  5.20s/it][A

loss: tensor(0.9449, device='cuda:0', grad_fn=<NllLossBackward>)



 54%|█████▍    | 186/346 [16:09<13:51,  5.20s/it][A

loss: tensor(0.9287, device='cuda:0', grad_fn=<NllLossBackward>)



 54%|█████▍    | 187/346 [16:14<13:50,  5.22s/it][A

loss: tensor(0.9418, device='cuda:0', grad_fn=<NllLossBackward>)



 54%|█████▍    | 188/346 [16:19<13:42,  5.21s/it][A

loss: tensor(0.9583, device='cuda:0', grad_fn=<NllLossBackward>)



 55%|█████▍    | 189/346 [16:24<13:37,  5.21s/it][A

loss: tensor(0.9493, device='cuda:0', grad_fn=<NllLossBackward>)



 55%|█████▍    | 190/346 [16:29<13:34,  5.22s/it][A

loss: tensor(0.9781, device='cuda:0', grad_fn=<NllLossBackward>)



 55%|█████▌    | 191/346 [16:35<13:26,  5.20s/it][A

loss: tensor(0.9438, device='cuda:0', grad_fn=<NllLossBackward>)



 55%|█████▌    | 192/346 [16:40<13:20,  5.20s/it][A

loss: tensor(0.9508, device='cuda:0', grad_fn=<NllLossBackward>)



 56%|█████▌    | 193/346 [16:45<13:14,  5.19s/it][A

loss: tensor(0.9386, device='cuda:0', grad_fn=<NllLossBackward>)



 56%|█████▌    | 194/346 [16:50<13:12,  5.22s/it][A

loss: tensor(0.9910, device='cuda:0', grad_fn=<NllLossBackward>)



 56%|█████▋    | 195/346 [16:55<13:07,  5.21s/it][A

loss: tensor(0.9899, device='cuda:0', grad_fn=<NllLossBackward>)



 57%|█████▋    | 196/346 [17:01<13:00,  5.21s/it][A

loss: tensor(0.9049, device='cuda:0', grad_fn=<NllLossBackward>)



 57%|█████▋    | 197/346 [17:06<12:55,  5.20s/it][A

loss: tensor(0.9332, device='cuda:0', grad_fn=<NllLossBackward>)



 57%|█████▋    | 198/346 [17:11<12:53,  5.23s/it][A

loss: tensor(0.9707, device='cuda:0', grad_fn=<NllLossBackward>)



 58%|█████▊    | 199/346 [17:16<12:46,  5.21s/it][A

loss: tensor(0.9744, device='cuda:0', grad_fn=<NllLossBackward>)



 58%|█████▊    | 200/346 [17:21<12:39,  5.20s/it][A

loss: tensor(0.9469, device='cuda:0', grad_fn=<NllLossBackward>)



 58%|█████▊    | 201/346 [17:27<12:34,  5.20s/it][A

loss: tensor(0.9432, device='cuda:0', grad_fn=<NllLossBackward>)



 58%|█████▊    | 202/346 [17:32<12:31,  5.22s/it][A

loss: tensor(0.9487, device='cuda:0', grad_fn=<NllLossBackward>)



 59%|█████▊    | 203/346 [17:37<12:24,  5.20s/it][A

loss: tensor(0.9049, device='cuda:0', grad_fn=<NllLossBackward>)



 59%|█████▉    | 204/346 [17:42<12:18,  5.20s/it][A

loss: tensor(0.9151, device='cuda:0', grad_fn=<NllLossBackward>)



 59%|█████▉    | 205/346 [17:47<12:12,  5.20s/it][A

loss: tensor(0.9232, device='cuda:0', grad_fn=<NllLossBackward>)



 60%|█████▉    | 206/346 [17:53<12:10,  5.21s/it][A

loss: tensor(0.9594, device='cuda:0', grad_fn=<NllLossBackward>)



 60%|█████▉    | 207/346 [17:58<12:04,  5.21s/it][A

loss: tensor(0.9396, device='cuda:0', grad_fn=<NllLossBackward>)



 60%|██████    | 208/346 [18:03<11:59,  5.21s/it][A

loss: tensor(0.9453, device='cuda:0', grad_fn=<NllLossBackward>)



 60%|██████    | 209/346 [18:08<11:53,  5.21s/it][A

loss: tensor(0.9447, device='cuda:0', grad_fn=<NllLossBackward>)



 61%|██████    | 210/346 [18:14<11:50,  5.22s/it][A

loss: tensor(0.9205, device='cuda:0', grad_fn=<NllLossBackward>)



 61%|██████    | 211/346 [18:19<11:43,  5.21s/it][A

loss: tensor(0.9774, device='cuda:0', grad_fn=<NllLossBackward>)



 61%|██████▏   | 212/346 [18:24<11:38,  5.21s/it][A

loss: tensor(0.9650, device='cuda:0', grad_fn=<NllLossBackward>)



 62%|██████▏   | 213/346 [18:29<11:34,  5.22s/it][A

loss: tensor(0.9130, device='cuda:0', grad_fn=<NllLossBackward>)



 62%|██████▏   | 214/346 [18:34<11:27,  5.21s/it][A

loss: tensor(0.9180, device='cuda:0', grad_fn=<NllLossBackward>)



 62%|██████▏   | 215/346 [18:40<11:21,  5.20s/it][A

loss: tensor(0.9423, device='cuda:0', grad_fn=<NllLossBackward>)



 62%|██████▏   | 216/346 [18:45<11:15,  5.20s/it][A

loss: tensor(0.9965, device='cuda:0', grad_fn=<NllLossBackward>)



 63%|██████▎   | 217/346 [18:50<11:11,  5.20s/it][A

loss: tensor(0.9233, device='cuda:0', grad_fn=<NllLossBackward>)



 63%|██████▎   | 218/346 [18:55<11:05,  5.20s/it][A

loss: tensor(0.9705, device='cuda:0', grad_fn=<NllLossBackward>)



 63%|██████▎   | 219/346 [19:00<10:59,  5.19s/it][A

loss: tensor(0.9449, device='cuda:0', grad_fn=<NllLossBackward>)



 64%|██████▎   | 220/346 [19:06<10:53,  5.19s/it][A

loss: tensor(0.9126, device='cuda:0', grad_fn=<NllLossBackward>)



 64%|██████▍   | 221/346 [19:11<10:51,  5.21s/it][A

loss: tensor(0.9137, device='cuda:0', grad_fn=<NllLossBackward>)



 64%|██████▍   | 222/346 [19:16<10:44,  5.20s/it][A

loss: tensor(0.9293, device='cuda:0', grad_fn=<NllLossBackward>)



 64%|██████▍   | 223/346 [19:21<10:38,  5.19s/it][A

loss: tensor(0.9049, device='cuda:0', grad_fn=<NllLossBackward>)



 65%|██████▍   | 224/346 [19:26<10:33,  5.19s/it][A

loss: tensor(0.9464, device='cuda:0', grad_fn=<NllLossBackward>)



 65%|██████▌   | 225/346 [19:32<10:30,  5.21s/it][A

loss: tensor(0.9185, device='cuda:0', grad_fn=<NllLossBackward>)



 65%|██████▌   | 226/346 [19:37<10:24,  5.20s/it][A

loss: tensor(0.9436, device='cuda:0', grad_fn=<NllLossBackward>)



 66%|██████▌   | 227/346 [19:42<10:17,  5.19s/it][A

loss: tensor(0.9365, device='cuda:0', grad_fn=<NllLossBackward>)



 66%|██████▌   | 228/346 [19:47<10:12,  5.19s/it][A

loss: tensor(0.9712, device='cuda:0', grad_fn=<NllLossBackward>)



 66%|██████▌   | 229/346 [19:52<10:09,  5.21s/it][A

loss: tensor(0.9613, device='cuda:0', grad_fn=<NllLossBackward>)



 66%|██████▋   | 230/346 [19:58<10:03,  5.20s/it][A

loss: tensor(0.9388, device='cuda:0', grad_fn=<NllLossBackward>)



 67%|██████▋   | 231/346 [20:03<09:57,  5.20s/it][A

loss: tensor(0.9363, device='cuda:0', grad_fn=<NllLossBackward>)



 67%|██████▋   | 232/346 [20:08<09:52,  5.20s/it][A

loss: tensor(0.9173, device='cuda:0', grad_fn=<NllLossBackward>)



 67%|██████▋   | 233/346 [20:13<09:48,  5.21s/it][A

loss: tensor(0.9530, device='cuda:0', grad_fn=<NllLossBackward>)



 68%|██████▊   | 234/346 [20:18<09:40,  5.18s/it][A

loss: tensor(0.9726, device='cuda:0', grad_fn=<NllLossBackward>)



 68%|██████▊   | 235/346 [20:23<09:34,  5.17s/it][A

loss: tensor(0.9252, device='cuda:0', grad_fn=<NllLossBackward>)



 68%|██████▊   | 236/346 [20:29<09:30,  5.18s/it][A

loss: tensor(0.9306, device='cuda:0', grad_fn=<NllLossBackward>)



 68%|██████▊   | 237/346 [20:34<09:22,  5.16s/it][A

loss: tensor(0.9049, device='cuda:0', grad_fn=<NllLossBackward>)



 69%|██████▉   | 238/346 [20:39<09:16,  5.15s/it][A

loss: tensor(0.9049, device='cuda:0', grad_fn=<NllLossBackward>)



 69%|██████▉   | 239/346 [20:44<09:10,  5.15s/it][A

loss: tensor(0.9768, device='cuda:0', grad_fn=<NllLossBackward>)



 69%|██████▉   | 240/346 [20:49<09:07,  5.16s/it][A

loss: tensor(0.9279, device='cuda:0', grad_fn=<NllLossBackward>)



 70%|██████▉   | 241/346 [20:54<09:01,  5.16s/it][A

loss: tensor(0.9049, device='cuda:0', grad_fn=<NllLossBackward>)



 70%|██████▉   | 242/346 [21:00<08:55,  5.15s/it][A

loss: tensor(0.9274, device='cuda:0', grad_fn=<NllLossBackward>)



 70%|███████   | 243/346 [21:05<08:49,  5.14s/it][A

loss: tensor(0.9049, device='cuda:0', grad_fn=<NllLossBackward>)



 71%|███████   | 244/346 [21:10<08:46,  5.16s/it][A

loss: tensor(0.9562, device='cuda:0', grad_fn=<NllLossBackward>)



 71%|███████   | 245/346 [21:15<08:39,  5.15s/it][A

loss: tensor(0.9469, device='cuda:0', grad_fn=<NllLossBackward>)



 71%|███████   | 246/346 [21:20<08:33,  5.14s/it][A

loss: tensor(0.9213, device='cuda:0', grad_fn=<NllLossBackward>)



 71%|███████▏  | 247/346 [21:25<08:29,  5.14s/it][A

loss: tensor(0.9390, device='cuda:0', grad_fn=<NllLossBackward>)



 72%|███████▏  | 248/346 [21:30<08:26,  5.17s/it][A

loss: tensor(0.9363, device='cuda:0', grad_fn=<NllLossBackward>)



 72%|███████▏  | 249/346 [21:36<08:19,  5.15s/it][A

loss: tensor(0.9049, device='cuda:0', grad_fn=<NllLossBackward>)



 72%|███████▏  | 250/346 [21:41<08:14,  5.15s/it][A

loss: tensor(0.9324, device='cuda:0', grad_fn=<NllLossBackward>)



 73%|███████▎  | 251/346 [21:46<08:09,  5.15s/it][A

loss: tensor(0.9049, device='cuda:0', grad_fn=<NllLossBackward>)



 73%|███████▎  | 252/346 [21:51<08:07,  5.18s/it][A

loss: tensor(0.9321, device='cuda:0', grad_fn=<NllLossBackward>)



 73%|███████▎  | 253/346 [21:56<08:02,  5.18s/it][A

loss: tensor(0.9737, device='cuda:0', grad_fn=<NllLossBackward>)



 73%|███████▎  | 254/346 [22:02<07:57,  5.19s/it][A

loss: tensor(0.9318, device='cuda:0', grad_fn=<NllLossBackward>)



 74%|███████▎  | 255/346 [22:07<07:53,  5.21s/it][A

loss: tensor(0.9386, device='cuda:0', grad_fn=<NllLossBackward>)



 74%|███████▍  | 256/346 [22:12<07:50,  5.23s/it][A

loss: tensor(0.9049, device='cuda:0', grad_fn=<NllLossBackward>)



 74%|███████▍  | 257/346 [22:17<07:44,  5.22s/it][A

loss: tensor(0.9238, device='cuda:0', grad_fn=<NllLossBackward>)



 75%|███████▍  | 258/346 [22:22<07:38,  5.20s/it][A

loss: tensor(1.0035, device='cuda:0', grad_fn=<NllLossBackward>)



 75%|███████▍  | 259/346 [22:28<07:33,  5.21s/it][A

loss: tensor(0.9516, device='cuda:0', grad_fn=<NllLossBackward>)



 75%|███████▌  | 260/346 [22:33<07:26,  5.19s/it][A

loss: tensor(0.9575, device='cuda:0', grad_fn=<NllLossBackward>)



 75%|███████▌  | 261/346 [22:38<07:20,  5.18s/it][A

loss: tensor(0.9445, device='cuda:0', grad_fn=<NllLossBackward>)



 76%|███████▌  | 262/346 [22:43<07:14,  5.17s/it][A

loss: tensor(0.9049, device='cuda:0', grad_fn=<NllLossBackward>)



 76%|███████▌  | 263/346 [22:48<07:10,  5.18s/it][A

loss: tensor(0.9117, device='cuda:0', grad_fn=<NllLossBackward>)



 76%|███████▋  | 264/346 [22:53<07:04,  5.18s/it][A

loss: tensor(0.9433, device='cuda:0', grad_fn=<NllLossBackward>)



 77%|███████▋  | 265/346 [22:59<06:59,  5.18s/it][A

loss: tensor(0.9049, device='cuda:0', grad_fn=<NllLossBackward>)



 77%|███████▋  | 266/346 [23:04<06:53,  5.16s/it][A

loss: tensor(0.9176, device='cuda:0', grad_fn=<NllLossBackward>)



 77%|███████▋  | 267/346 [23:09<06:48,  5.18s/it][A

loss: tensor(0.9198, device='cuda:0', grad_fn=<NllLossBackward>)



 77%|███████▋  | 268/346 [23:14<06:42,  5.17s/it][A

loss: tensor(0.9312, device='cuda:0', grad_fn=<NllLossBackward>)



 78%|███████▊  | 269/346 [23:19<06:36,  5.16s/it][A

loss: tensor(0.9403, device='cuda:0', grad_fn=<NllLossBackward>)



 78%|███████▊  | 270/346 [23:24<06:31,  5.15s/it][A

loss: tensor(0.9345, device='cuda:0', grad_fn=<NllLossBackward>)



 78%|███████▊  | 271/346 [23:30<06:27,  5.16s/it][A

loss: tensor(0.9529, device='cuda:0', grad_fn=<NllLossBackward>)



 79%|███████▊  | 272/346 [23:35<06:21,  5.16s/it][A

loss: tensor(0.9166, device='cuda:0', grad_fn=<NllLossBackward>)



 79%|███████▉  | 273/346 [23:40<06:15,  5.15s/it][A

loss: tensor(0.9583, device='cuda:0', grad_fn=<NllLossBackward>)



 79%|███████▉  | 274/346 [23:45<06:10,  5.15s/it][A

loss: tensor(0.9797, device='cuda:0', grad_fn=<NllLossBackward>)



 79%|███████▉  | 275/346 [23:50<06:06,  5.16s/it][A

loss: tensor(0.9450, device='cuda:0', grad_fn=<NllLossBackward>)



 80%|███████▉  | 276/346 [23:55<06:01,  5.17s/it][A

loss: tensor(0.9120, device='cuda:0', grad_fn=<NllLossBackward>)



 80%|████████  | 277/346 [24:01<05:56,  5.16s/it][A

loss: tensor(0.9252, device='cuda:0', grad_fn=<NllLossBackward>)



 80%|████████  | 278/346 [24:06<05:50,  5.16s/it][A

loss: tensor(1.0143, device='cuda:0', grad_fn=<NllLossBackward>)



 81%|████████  | 279/346 [24:11<05:46,  5.17s/it][A

loss: tensor(0.9049, device='cuda:0', grad_fn=<NllLossBackward>)



 81%|████████  | 280/346 [24:16<05:40,  5.16s/it][A

loss: tensor(0.9340, device='cuda:0', grad_fn=<NllLossBackward>)



 81%|████████  | 281/346 [24:21<05:35,  5.16s/it][A

loss: tensor(1.0017, device='cuda:0', grad_fn=<NllLossBackward>)



 82%|████████▏ | 282/346 [24:26<05:31,  5.17s/it][A

loss: tensor(0.9423, device='cuda:0', grad_fn=<NllLossBackward>)



 82%|████████▏ | 283/346 [24:31<05:24,  5.15s/it][A

loss: tensor(0.9290, device='cuda:0', grad_fn=<NllLossBackward>)



 82%|████████▏ | 284/346 [24:37<05:18,  5.14s/it][A

loss: tensor(0.9147, device='cuda:0', grad_fn=<NllLossBackward>)



 82%|████████▏ | 285/346 [24:42<05:12,  5.13s/it][A

loss: tensor(0.9545, device='cuda:0', grad_fn=<NllLossBackward>)



 83%|████████▎ | 286/346 [24:47<05:08,  5.14s/it][A

loss: tensor(0.9267, device='cuda:0', grad_fn=<NllLossBackward>)



 83%|████████▎ | 287/346 [24:52<05:03,  5.14s/it][A

loss: tensor(0.9159, device='cuda:0', grad_fn=<NllLossBackward>)



 83%|████████▎ | 288/346 [24:57<04:57,  5.13s/it][A

loss: tensor(0.9742, device='cuda:0', grad_fn=<NllLossBackward>)



 84%|████████▎ | 289/346 [25:02<04:51,  5.12s/it][A

loss: tensor(0.9288, device='cuda:0', grad_fn=<NllLossBackward>)



 84%|████████▍ | 290/346 [25:07<04:48,  5.14s/it][A

loss: tensor(0.9049, device='cuda:0', grad_fn=<NllLossBackward>)



 84%|████████▍ | 291/346 [25:13<04:42,  5.14s/it][A

loss: tensor(0.9375, device='cuda:0', grad_fn=<NllLossBackward>)



 84%|████████▍ | 292/346 [25:18<04:37,  5.14s/it][A

loss: tensor(0.9139, device='cuda:0', grad_fn=<NllLossBackward>)



 85%|████████▍ | 293/346 [25:23<04:32,  5.14s/it][A

loss: tensor(0.9684, device='cuda:0', grad_fn=<NllLossBackward>)



 85%|████████▍ | 294/346 [25:28<04:28,  5.16s/it][A

loss: tensor(0.9284, device='cuda:0', grad_fn=<NllLossBackward>)



 85%|████████▌ | 295/346 [25:33<04:22,  5.15s/it][A

loss: tensor(0.9121, device='cuda:0', grad_fn=<NllLossBackward>)



 86%|████████▌ | 296/346 [25:38<04:17,  5.14s/it][A

loss: tensor(0.9464, device='cuda:0', grad_fn=<NllLossBackward>)



 86%|████████▌ | 297/346 [25:43<04:11,  5.14s/it][A

loss: tensor(0.9470, device='cuda:0', grad_fn=<NllLossBackward>)



 86%|████████▌ | 298/346 [25:49<04:07,  5.16s/it][A

loss: tensor(0.9201, device='cuda:0', grad_fn=<NllLossBackward>)



 86%|████████▋ | 299/346 [25:54<04:02,  5.16s/it][A

loss: tensor(0.9610, device='cuda:0', grad_fn=<NllLossBackward>)



 87%|████████▋ | 300/346 [25:59<03:57,  5.16s/it][A

loss: tensor(0.9049, device='cuda:0', grad_fn=<NllLossBackward>)



 87%|████████▋ | 301/346 [26:04<03:51,  5.16s/it][A

loss: tensor(0.9458, device='cuda:0', grad_fn=<NllLossBackward>)



 87%|████████▋ | 302/346 [26:09<03:47,  5.17s/it][A

loss: tensor(0.9049, device='cuda:0', grad_fn=<NllLossBackward>)



 88%|████████▊ | 303/346 [26:14<03:41,  5.16s/it][A

loss: tensor(0.9275, device='cuda:0', grad_fn=<NllLossBackward>)



 88%|████████▊ | 304/346 [26:20<03:36,  5.15s/it][A

loss: tensor(0.9992, device='cuda:0', grad_fn=<NllLossBackward>)



 88%|████████▊ | 305/346 [26:25<03:31,  5.17s/it][A

loss: tensor(1.0075, device='cuda:0', grad_fn=<NllLossBackward>)



 88%|████████▊ | 306/346 [26:30<03:26,  5.16s/it][A

loss: tensor(0.9200, device='cuda:0', grad_fn=<NllLossBackward>)



 89%|████████▊ | 307/346 [26:35<03:21,  5.15s/it][A

loss: tensor(0.9049, device='cuda:0', grad_fn=<NllLossBackward>)



 89%|████████▉ | 308/346 [26:40<03:16,  5.16s/it][A

loss: tensor(0.9379, device='cuda:0', grad_fn=<NllLossBackward>)



 89%|████████▉ | 309/346 [26:45<03:11,  5.18s/it][A

loss: tensor(0.9049, device='cuda:0', grad_fn=<NllLossBackward>)



 90%|████████▉ | 310/346 [26:51<03:06,  5.17s/it][A

loss: tensor(0.9864, device='cuda:0', grad_fn=<NllLossBackward>)



 90%|████████▉ | 311/346 [26:56<03:00,  5.16s/it][A

loss: tensor(0.9049, device='cuda:0', grad_fn=<NllLossBackward>)



 90%|█████████ | 312/346 [27:01<02:55,  5.15s/it][A

loss: tensor(0.9481, device='cuda:0', grad_fn=<NllLossBackward>)



 90%|█████████ | 313/346 [27:06<02:50,  5.18s/it][A

loss: tensor(0.9766, device='cuda:0', grad_fn=<NllLossBackward>)



 91%|█████████ | 314/346 [27:11<02:45,  5.18s/it][A

loss: tensor(0.9375, device='cuda:0', grad_fn=<NllLossBackward>)



 91%|█████████ | 315/346 [27:16<02:40,  5.17s/it][A

loss: tensor(0.9456, device='cuda:0', grad_fn=<NllLossBackward>)



 91%|█████████▏| 316/346 [27:22<02:35,  5.17s/it][A

loss: tensor(0.9138, device='cuda:0', grad_fn=<NllLossBackward>)



 92%|█████████▏| 317/346 [27:27<02:30,  5.19s/it][A

loss: tensor(0.9164, device='cuda:0', grad_fn=<NllLossBackward>)



 92%|█████████▏| 318/346 [27:32<02:24,  5.17s/it][A

loss: tensor(0.9423, device='cuda:0', grad_fn=<NllLossBackward>)



 92%|█████████▏| 319/346 [27:37<02:19,  5.18s/it][A

loss: tensor(0.9691, device='cuda:0', grad_fn=<NllLossBackward>)



 92%|█████████▏| 320/346 [27:42<02:14,  5.18s/it][A

loss: tensor(0.9805, device='cuda:0', grad_fn=<NllLossBackward>)



 93%|█████████▎| 321/346 [27:48<02:10,  5.21s/it][A

loss: tensor(0.9357, device='cuda:0', grad_fn=<NllLossBackward>)



 93%|█████████▎| 322/346 [27:53<02:04,  5.21s/it][A

loss: tensor(0.9223, device='cuda:0', grad_fn=<NllLossBackward>)



 93%|█████████▎| 323/346 [27:58<01:59,  5.20s/it][A

loss: tensor(0.9371, device='cuda:0', grad_fn=<NllLossBackward>)



 94%|█████████▎| 324/346 [28:03<01:54,  5.20s/it][A

loss: tensor(0.9328, device='cuda:0', grad_fn=<NllLossBackward>)



 94%|█████████▍| 325/346 [28:08<01:49,  5.22s/it][A

loss: tensor(0.9246, device='cuda:0', grad_fn=<NllLossBackward>)



 94%|█████████▍| 326/346 [28:14<01:44,  5.22s/it][A

loss: tensor(0.9049, device='cuda:0', grad_fn=<NllLossBackward>)



 95%|█████████▍| 327/346 [28:19<01:39,  5.22s/it][A

loss: tensor(0.9661, device='cuda:0', grad_fn=<NllLossBackward>)



 95%|█████████▍| 328/346 [28:24<01:34,  5.23s/it][A

loss: tensor(0.9266, device='cuda:0', grad_fn=<NllLossBackward>)



 95%|█████████▌| 329/346 [28:29<01:28,  5.22s/it][A

loss: tensor(0.9404, device='cuda:0', grad_fn=<NllLossBackward>)



 95%|█████████▌| 330/346 [28:35<01:23,  5.22s/it][A

loss: tensor(0.9229, device='cuda:0', grad_fn=<NllLossBackward>)



 96%|█████████▌| 331/346 [28:40<01:18,  5.22s/it][A

loss: tensor(0.9804, device='cuda:0', grad_fn=<NllLossBackward>)



 96%|█████████▌| 332/346 [28:45<01:13,  5.23s/it][A

loss: tensor(0.9874, device='cuda:0', grad_fn=<NllLossBackward>)



 96%|█████████▌| 333/346 [28:50<01:07,  5.23s/it][A

loss: tensor(0.9135, device='cuda:0', grad_fn=<NllLossBackward>)



 97%|█████████▋| 334/346 [28:55<01:02,  5.22s/it][A

loss: tensor(1.0008, device='cuda:0', grad_fn=<NllLossBackward>)



 97%|█████████▋| 335/346 [29:01<00:57,  5.22s/it][A

loss: tensor(0.9354, device='cuda:0', grad_fn=<NllLossBackward>)



 97%|█████████▋| 336/346 [29:06<00:52,  5.24s/it][A

loss: tensor(0.9166, device='cuda:0', grad_fn=<NllLossBackward>)



 97%|█████████▋| 337/346 [29:11<00:47,  5.23s/it][A

loss: tensor(0.9501, device='cuda:0', grad_fn=<NllLossBackward>)



 98%|█████████▊| 338/346 [29:16<00:41,  5.22s/it][A

loss: tensor(0.9736, device='cuda:0', grad_fn=<NllLossBackward>)



 98%|█████████▊| 339/346 [29:22<00:36,  5.22s/it][A

loss: tensor(0.9291, device='cuda:0', grad_fn=<NllLossBackward>)



 98%|█████████▊| 340/346 [29:27<00:31,  5.24s/it][A

loss: tensor(0.9309, device='cuda:0', grad_fn=<NllLossBackward>)



 99%|█████████▊| 341/346 [29:32<00:26,  5.24s/it][A

loss: tensor(0.9289, device='cuda:0', grad_fn=<NllLossBackward>)



 99%|█████████▉| 342/346 [29:37<00:20,  5.23s/it][A

loss: tensor(0.9251, device='cuda:0', grad_fn=<NllLossBackward>)



 99%|█████████▉| 343/346 [29:43<00:15,  5.23s/it][A

loss: tensor(1.0135, device='cuda:0', grad_fn=<NllLossBackward>)



 99%|█████████▉| 344/346 [29:48<00:10,  5.25s/it][A

loss: tensor(0.9233, device='cuda:0', grad_fn=<NllLossBackward>)



100%|█████████▉| 345/346 [29:53<00:05,  5.23s/it][A

loss: tensor(1.0009, device='cuda:0', grad_fn=<NllLossBackward>)



100%|██████████| 346/346 [29:53<00:00,  5.18s/it][A

  0%|          | 0/173 [00:00<?, ?it/s][A

loss: tensor(0.9049, device='cuda:0', grad_fn=<NllLossBackward>)

	Training Loss: 0.9415660600441729

	Training acc: 0.9633328586117332

	Training prec: 0.9287301247724815

	Training rec: 0.9633328586117332

	Training f1: 0.9455340079151707

	Current Learning rate:  2.5e-05



  1%|          | 1/173 [00:00<01:58,  1.45it/s][A
  1%|          | 2/173 [00:01<01:57,  1.45it/s][A
  2%|▏         | 3/173 [00:01<01:50,  1.54it/s][A
  2%|▏         | 4/173 [00:02<01:52,  1.51it/s][A
  3%|▎         | 5/173 [00:03<01:52,  1.49it/s][A
  3%|▎         | 6/173 [00:03<01:48,  1.54it/s][A
  4%|▍         | 7/173 [00:04<01:50,  1.50it/s][A
  5%|▍         | 8/173 [00:05<01:52,  1.47it/s][A
  5%|▌         | 9/173 [00:05<01:47,  1.52it/s][A
  6%|▌         | 10/173 [00:06<01:48,  1.50it/s][A
  6%|▋         | 11/173 [00:07<01:48,  1.49it/s][A
  7%|▋         | 12/173 [00:07<01:45,  1.53it/s][A
  8%|▊         | 13/173 [00:08<01:46,  1.51it/s][A
  8%|▊         | 14/173 [00:09<01:46,  1.49it/s][A
  9%|▊         | 15/173 [00:09<01:43,  1.53it/s][A
  9%|▉         | 16/173 [00:10<01:44,  1.51it/s][A
 10%|▉         | 17/173 [00:11<01:44,  1.49it/s][A
 10%|█         | 18/173 [00:11<01:41,  1.53it/s][A
 11%|█         | 19/173 [00:12<01:42,  1.51it/s][A
 12%|█▏        | 20/


	Validation Loss: 0.944558104683209

	Validation acc: 0.9602846853774322

	Validation prec: 0.9235108865296342

	Validation rec: 0.9602846853774322

	Validation f1: 0.9412008544455014



  0%|          | 1/346 [00:05<30:18,  5.27s/it][A

loss: tensor(0.9298, device='cuda:0', grad_fn=<NllLossBackward>)



  1%|          | 2/346 [00:10<29:55,  5.22s/it][A

loss: tensor(0.9511, device='cuda:0', grad_fn=<NllLossBackward>)



  1%|          | 3/346 [00:15<30:03,  5.26s/it][A

loss: tensor(0.9640, device='cuda:0', grad_fn=<NllLossBackward>)



  1%|          | 4/346 [00:20<29:50,  5.23s/it][A

loss: tensor(0.9579, device='cuda:0', grad_fn=<NllLossBackward>)



  1%|▏         | 5/346 [00:26<29:51,  5.26s/it][A

loss: tensor(0.9597, device='cuda:0', grad_fn=<NllLossBackward>)



  2%|▏         | 6/346 [00:31<29:43,  5.25s/it][A

loss: tensor(0.9151, device='cuda:0', grad_fn=<NllLossBackward>)



  2%|▏         | 7/346 [00:36<29:35,  5.24s/it][A

loss: tensor(0.9320, device='cuda:0', grad_fn=<NllLossBackward>)



  2%|▏         | 8/346 [00:41<29:27,  5.23s/it][A

loss: tensor(0.9575, device='cuda:0', grad_fn=<NllLossBackward>)



  3%|▎         | 9/346 [00:47<29:29,  5.25s/it][A

loss: tensor(1.0113, device='cuda:0', grad_fn=<NllLossBackward>)



  3%|▎         | 10/346 [00:52<29:23,  5.25s/it][A

loss: tensor(0.9624, device='cuda:0', grad_fn=<NllLossBackward>)



  3%|▎         | 11/346 [00:57<29:13,  5.23s/it][A

loss: tensor(0.9495, device='cuda:0', grad_fn=<NllLossBackward>)



  3%|▎         | 12/346 [01:02<29:07,  5.23s/it][A

loss: tensor(0.9676, device='cuda:0', grad_fn=<NllLossBackward>)



  4%|▍         | 13/346 [01:08<29:10,  5.26s/it][A

loss: tensor(0.9049, device='cuda:0', grad_fn=<NllLossBackward>)



  4%|▍         | 14/346 [01:13<29:00,  5.24s/it][A

loss: tensor(0.9427, device='cuda:0', grad_fn=<NllLossBackward>)



  4%|▍         | 15/346 [01:18<28:52,  5.23s/it][A

loss: tensor(1.0022, device='cuda:0', grad_fn=<NllLossBackward>)



  5%|▍         | 16/346 [01:23<28:46,  5.23s/it][A

loss: tensor(0.9196, device='cuda:0', grad_fn=<NllLossBackward>)



  5%|▍         | 17/346 [01:29<28:44,  5.24s/it][A

loss: tensor(0.9592, device='cuda:0', grad_fn=<NllLossBackward>)



  5%|▌         | 18/346 [01:34<28:40,  5.24s/it][A

loss: tensor(0.9577, device='cuda:0', grad_fn=<NllLossBackward>)



  5%|▌         | 19/346 [01:39<28:31,  5.23s/it][A

loss: tensor(0.9246, device='cuda:0', grad_fn=<NllLossBackward>)



  6%|▌         | 20/346 [01:44<28:31,  5.25s/it][A

loss: tensor(0.9239, device='cuda:0', grad_fn=<NllLossBackward>)



  6%|▌         | 21/346 [01:50<28:19,  5.23s/it][A

loss: tensor(0.9454, device='cuda:0', grad_fn=<NllLossBackward>)



  6%|▋         | 22/346 [01:55<28:15,  5.23s/it][A

loss: tensor(0.9181, device='cuda:0', grad_fn=<NllLossBackward>)



  7%|▋         | 23/346 [02:00<28:09,  5.23s/it][A

loss: tensor(0.9637, device='cuda:0', grad_fn=<NllLossBackward>)



  7%|▋         | 24/346 [02:05<28:10,  5.25s/it][A

loss: tensor(0.9049, device='cuda:0', grad_fn=<NllLossBackward>)



  7%|▋         | 25/346 [02:11<28:02,  5.24s/it][A

loss: tensor(0.9049, device='cuda:0', grad_fn=<NllLossBackward>)



  8%|▊         | 26/346 [02:16<27:51,  5.22s/it][A

loss: tensor(0.9478, device='cuda:0', grad_fn=<NllLossBackward>)



  8%|▊         | 27/346 [02:21<27:44,  5.22s/it][A

loss: tensor(0.9568, device='cuda:0', grad_fn=<NllLossBackward>)



  8%|▊         | 28/346 [02:26<27:47,  5.24s/it][A

loss: tensor(0.9571, device='cuda:0', grad_fn=<NllLossBackward>)



  8%|▊         | 29/346 [02:31<27:40,  5.24s/it][A

loss: tensor(0.9138, device='cuda:0', grad_fn=<NllLossBackward>)



  9%|▊         | 30/346 [02:37<27:32,  5.23s/it][A

loss: tensor(0.9205, device='cuda:0', grad_fn=<NllLossBackward>)



  9%|▉         | 31/346 [02:42<27:28,  5.23s/it][A

loss: tensor(0.9283, device='cuda:0', grad_fn=<NllLossBackward>)



  9%|▉         | 32/346 [02:47<27:27,  5.25s/it][A

loss: tensor(0.9049, device='cuda:0', grad_fn=<NllLossBackward>)



 10%|▉         | 33/346 [02:52<27:22,  5.25s/it][A

loss: tensor(0.9954, device='cuda:0', grad_fn=<NllLossBackward>)



 10%|▉         | 34/346 [02:58<27:15,  5.24s/it][A

loss: tensor(0.9445, device='cuda:0', grad_fn=<NllLossBackward>)



 10%|█         | 35/346 [03:03<27:09,  5.24s/it][A

loss: tensor(0.9222, device='cuda:0', grad_fn=<NllLossBackward>)



 10%|█         | 36/346 [03:08<27:11,  5.26s/it][A

loss: tensor(0.9716, device='cuda:0', grad_fn=<NllLossBackward>)



 11%|█         | 37/346 [03:13<27:02,  5.25s/it][A

loss: tensor(1.0059, device='cuda:0', grad_fn=<NllLossBackward>)



 11%|█         | 38/346 [03:19<26:54,  5.24s/it][A

loss: tensor(0.9585, device='cuda:0', grad_fn=<NllLossBackward>)



 11%|█▏        | 39/346 [03:24<26:55,  5.26s/it][A

loss: tensor(0.9432, device='cuda:0', grad_fn=<NllLossBackward>)



 12%|█▏        | 40/346 [03:29<26:47,  5.25s/it][A

loss: tensor(0.9628, device='cuda:0', grad_fn=<NllLossBackward>)



 12%|█▏        | 41/346 [03:34<26:40,  5.25s/it][A

loss: tensor(1.0134, device='cuda:0', grad_fn=<NllLossBackward>)



 12%|█▏        | 42/346 [03:40<26:33,  5.24s/it][A

loss: tensor(0.9161, device='cuda:0', grad_fn=<NllLossBackward>)



 12%|█▏        | 43/346 [03:45<26:33,  5.26s/it][A

loss: tensor(0.9049, device='cuda:0', grad_fn=<NllLossBackward>)



 13%|█▎        | 44/346 [03:50<26:23,  5.24s/it][A

loss: tensor(0.9607, device='cuda:0', grad_fn=<NllLossBackward>)



 13%|█▎        | 45/346 [03:55<26:17,  5.24s/it][A

loss: tensor(0.9303, device='cuda:0', grad_fn=<NllLossBackward>)



 13%|█▎        | 46/346 [04:01<26:09,  5.23s/it][A

loss: tensor(0.9799, device='cuda:0', grad_fn=<NllLossBackward>)



 14%|█▎        | 47/346 [04:06<26:08,  5.25s/it][A

loss: tensor(0.9143, device='cuda:0', grad_fn=<NllLossBackward>)



 14%|█▍        | 48/346 [04:11<26:02,  5.24s/it][A

loss: tensor(0.9474, device='cuda:0', grad_fn=<NllLossBackward>)



 14%|█▍        | 49/346 [04:16<25:59,  5.25s/it][A

loss: tensor(1.0160, device='cuda:0', grad_fn=<NllLossBackward>)



 14%|█▍        | 50/346 [04:22<25:51,  5.24s/it][A

loss: tensor(0.9238, device='cuda:0', grad_fn=<NllLossBackward>)



 15%|█▍        | 51/346 [04:27<25:52,  5.26s/it][A

loss: tensor(0.9049, device='cuda:0', grad_fn=<NllLossBackward>)



 15%|█▌        | 52/346 [04:32<25:43,  5.25s/it][A

loss: tensor(0.9350, device='cuda:0', grad_fn=<NllLossBackward>)



 15%|█▌        | 53/346 [04:37<25:36,  5.24s/it][A

loss: tensor(1.0072, device='cuda:0', grad_fn=<NllLossBackward>)



 16%|█▌        | 54/346 [04:43<25:31,  5.24s/it][A

loss: tensor(0.9333, device='cuda:0', grad_fn=<NllLossBackward>)



 16%|█▌        | 55/346 [04:48<25:30,  5.26s/it][A

loss: tensor(0.9274, device='cuda:0', grad_fn=<NllLossBackward>)



 16%|█▌        | 56/346 [04:53<25:20,  5.24s/it][A

loss: tensor(0.9132, device='cuda:0', grad_fn=<NllLossBackward>)



 16%|█▋        | 57/346 [04:58<25:14,  5.24s/it][A

loss: tensor(0.9331, device='cuda:0', grad_fn=<NllLossBackward>)



 17%|█▋        | 58/346 [05:04<25:09,  5.24s/it][A

loss: tensor(0.9321, device='cuda:0', grad_fn=<NllLossBackward>)



 17%|█▋        | 59/346 [05:09<25:09,  5.26s/it][A

loss: tensor(0.9679, device='cuda:0', grad_fn=<NllLossBackward>)



 17%|█▋        | 60/346 [05:14<25:01,  5.25s/it][A

loss: tensor(0.9366, device='cuda:0', grad_fn=<NllLossBackward>)



 18%|█▊        | 61/346 [05:19<24:57,  5.25s/it][A

loss: tensor(0.9330, device='cuda:0', grad_fn=<NllLossBackward>)



 18%|█▊        | 62/346 [05:25<24:56,  5.27s/it][A

loss: tensor(0.9209, device='cuda:0', grad_fn=<NllLossBackward>)



 18%|█▊        | 63/346 [05:30<24:47,  5.26s/it][A

loss: tensor(0.9486, device='cuda:0', grad_fn=<NllLossBackward>)



 18%|█▊        | 64/346 [05:35<24:37,  5.24s/it][A

loss: tensor(0.9049, device='cuda:0', grad_fn=<NllLossBackward>)



 19%|█▉        | 65/346 [05:40<24:30,  5.23s/it][A

loss: tensor(0.9049, device='cuda:0', grad_fn=<NllLossBackward>)



 19%|█▉        | 66/346 [05:46<24:28,  5.24s/it][A

loss: tensor(0.9400, device='cuda:0', grad_fn=<NllLossBackward>)



 19%|█▉        | 67/346 [05:51<24:20,  5.23s/it][A

loss: tensor(0.9049, device='cuda:0', grad_fn=<NllLossBackward>)



 20%|█▉        | 68/346 [05:56<24:16,  5.24s/it][A

loss: tensor(0.9220, device='cuda:0', grad_fn=<NllLossBackward>)



 20%|█▉        | 69/346 [06:01<24:09,  5.23s/it][A

loss: tensor(0.9669, device='cuda:0', grad_fn=<NllLossBackward>)



 20%|██        | 70/346 [06:07<24:09,  5.25s/it][A

loss: tensor(0.9318, device='cuda:0', grad_fn=<NllLossBackward>)



 21%|██        | 71/346 [06:12<24:01,  5.24s/it][A

loss: tensor(0.9127, device='cuda:0', grad_fn=<NllLossBackward>)



 21%|██        | 72/346 [06:17<23:58,  5.25s/it][A

loss: tensor(0.9395, device='cuda:0', grad_fn=<NllLossBackward>)



 21%|██        | 73/346 [06:22<24:03,  5.29s/it][A

loss: tensor(0.9049, device='cuda:0', grad_fn=<NllLossBackward>)



 21%|██▏       | 74/346 [06:28<24:11,  5.34s/it][A

loss: tensor(1.0027, device='cuda:0', grad_fn=<NllLossBackward>)



 22%|██▏       | 75/346 [06:33<24:09,  5.35s/it][A

loss: tensor(0.9422, device='cuda:0', grad_fn=<NllLossBackward>)



 22%|██▏       | 76/346 [06:39<24:05,  5.35s/it][A

loss: tensor(0.9049, device='cuda:0', grad_fn=<NllLossBackward>)



 22%|██▏       | 77/346 [06:44<24:02,  5.36s/it][A

loss: tensor(0.9459, device='cuda:0', grad_fn=<NllLossBackward>)



 23%|██▎       | 78/346 [06:49<24:06,  5.40s/it][A

loss: tensor(0.9160, device='cuda:0', grad_fn=<NllLossBackward>)



 23%|██▎       | 79/346 [06:55<24:01,  5.40s/it][A

loss: tensor(0.9135, device='cuda:0', grad_fn=<NllLossBackward>)



 23%|██▎       | 80/346 [07:00<23:52,  5.39s/it][A

loss: tensor(0.9409, device='cuda:0', grad_fn=<NllLossBackward>)



 23%|██▎       | 81/346 [07:06<23:46,  5.38s/it][A

loss: tensor(0.9796, device='cuda:0', grad_fn=<NllLossBackward>)



 24%|██▎       | 82/346 [07:11<23:46,  5.40s/it][A

loss: tensor(0.9592, device='cuda:0', grad_fn=<NllLossBackward>)



 24%|██▍       | 83/346 [07:16<23:38,  5.39s/it][A

loss: tensor(0.9587, device='cuda:0', grad_fn=<NllLossBackward>)



 24%|██▍       | 84/346 [07:22<23:31,  5.39s/it][A

loss: tensor(0.9295, device='cuda:0', grad_fn=<NllLossBackward>)



 25%|██▍       | 85/346 [07:27<23:24,  5.38s/it][A

loss: tensor(0.9803, device='cuda:0', grad_fn=<NllLossBackward>)



 25%|██▍       | 86/346 [07:33<23:23,  5.40s/it][A

loss: tensor(0.9733, device='cuda:0', grad_fn=<NllLossBackward>)



 25%|██▌       | 87/346 [07:38<23:17,  5.39s/it][A

loss: tensor(0.9049, device='cuda:0', grad_fn=<NllLossBackward>)



 25%|██▌       | 88/346 [07:43<23:11,  5.39s/it][A

loss: tensor(0.9845, device='cuda:0', grad_fn=<NllLossBackward>)



 26%|██▌       | 89/346 [07:49<23:10,  5.41s/it][A

loss: tensor(0.9179, device='cuda:0', grad_fn=<NllLossBackward>)



 26%|██▌       | 90/346 [07:54<23:00,  5.39s/it][A

loss: tensor(0.9828, device='cuda:0', grad_fn=<NllLossBackward>)



 26%|██▋       | 91/346 [08:00<22:54,  5.39s/it][A

loss: tensor(0.9421, device='cuda:0', grad_fn=<NllLossBackward>)



 27%|██▋       | 92/346 [08:05<22:47,  5.39s/it][A

loss: tensor(0.9747, device='cuda:0', grad_fn=<NllLossBackward>)



 27%|██▋       | 93/346 [08:10<22:46,  5.40s/it][A

loss: tensor(0.9947, device='cuda:0', grad_fn=<NllLossBackward>)



 27%|██▋       | 94/346 [08:16<22:39,  5.39s/it][A

loss: tensor(0.9283, device='cuda:0', grad_fn=<NllLossBackward>)



 27%|██▋       | 95/346 [08:21<22:33,  5.39s/it][A

loss: tensor(0.9199, device='cuda:0', grad_fn=<NllLossBackward>)



 28%|██▊       | 96/346 [08:27<22:28,  5.40s/it][A

loss: tensor(0.9414, device='cuda:0', grad_fn=<NllLossBackward>)



 28%|██▊       | 97/346 [08:32<22:29,  5.42s/it][A

loss: tensor(0.9277, device='cuda:0', grad_fn=<NllLossBackward>)



 28%|██▊       | 98/346 [08:37<22:20,  5.41s/it][A

loss: tensor(0.9565, device='cuda:0', grad_fn=<NllLossBackward>)



 29%|██▊       | 99/346 [08:43<22:13,  5.40s/it][A

loss: tensor(0.9564, device='cuda:0', grad_fn=<NllLossBackward>)



 29%|██▉       | 100/346 [08:48<22:07,  5.40s/it][A

loss: tensor(0.9756, device='cuda:0', grad_fn=<NllLossBackward>)



 29%|██▉       | 101/346 [08:54<22:06,  5.41s/it][A

loss: tensor(0.9267, device='cuda:0', grad_fn=<NllLossBackward>)



 29%|██▉       | 102/346 [08:59<22:01,  5.42s/it][A

loss: tensor(0.9197, device='cuda:0', grad_fn=<NllLossBackward>)



 30%|██▉       | 103/346 [09:05<21:58,  5.43s/it][A

loss: tensor(0.9356, device='cuda:0', grad_fn=<NllLossBackward>)



 30%|███       | 104/346 [09:10<21:50,  5.41s/it][A

loss: tensor(0.9655, device='cuda:0', grad_fn=<NllLossBackward>)



 30%|███       | 105/346 [09:15<21:49,  5.43s/it][A

loss: tensor(0.9303, device='cuda:0', grad_fn=<NllLossBackward>)



 31%|███       | 106/346 [09:21<21:41,  5.42s/it][A

loss: tensor(0.9360, device='cuda:0', grad_fn=<NllLossBackward>)



 31%|███       | 107/346 [09:26<21:34,  5.41s/it][A

loss: tensor(0.9584, device='cuda:0', grad_fn=<NllLossBackward>)



 31%|███       | 108/346 [09:32<21:29,  5.42s/it][A

loss: tensor(0.9130, device='cuda:0', grad_fn=<NllLossBackward>)



 32%|███▏      | 109/346 [09:37<21:24,  5.42s/it][A

loss: tensor(0.9049, device='cuda:0', grad_fn=<NllLossBackward>)



 32%|███▏      | 110/346 [09:42<21:17,  5.42s/it][A

loss: tensor(0.9049, device='cuda:0', grad_fn=<NllLossBackward>)



 32%|███▏      | 111/346 [09:48<21:10,  5.40s/it][A

loss: tensor(0.9501, device='cuda:0', grad_fn=<NllLossBackward>)



 32%|███▏      | 112/346 [09:53<21:10,  5.43s/it][A

loss: tensor(0.9049, device='cuda:0', grad_fn=<NllLossBackward>)



 33%|███▎      | 113/346 [09:59<21:00,  5.41s/it][A

loss: tensor(0.9049, device='cuda:0', grad_fn=<NllLossBackward>)



 33%|███▎      | 114/346 [10:04<20:56,  5.41s/it][A

loss: tensor(0.9049, device='cuda:0', grad_fn=<NllLossBackward>)



 33%|███▎      | 115/346 [10:09<20:50,  5.41s/it][A

loss: tensor(0.9166, device='cuda:0', grad_fn=<NllLossBackward>)



 34%|███▎      | 116/346 [10:15<20:46,  5.42s/it][A

loss: tensor(0.9389, device='cuda:0', grad_fn=<NllLossBackward>)



 34%|███▍      | 117/346 [10:20<20:38,  5.41s/it][A

loss: tensor(0.9485, device='cuda:0', grad_fn=<NllLossBackward>)



 34%|███▍      | 118/346 [10:26<20:32,  5.41s/it][A

loss: tensor(0.9721, device='cuda:0', grad_fn=<NllLossBackward>)



 34%|███▍      | 119/346 [10:31<20:24,  5.40s/it][A

loss: tensor(1.0075, device='cuda:0', grad_fn=<NllLossBackward>)



 35%|███▍      | 120/346 [10:37<20:21,  5.40s/it][A

loss: tensor(1.0115, device='cuda:0', grad_fn=<NllLossBackward>)



 35%|███▍      | 121/346 [10:42<20:13,  5.39s/it][A

loss: tensor(0.9259, device='cuda:0', grad_fn=<NllLossBackward>)



 35%|███▌      | 122/346 [10:47<20:06,  5.38s/it][A

loss: tensor(0.9355, device='cuda:0', grad_fn=<NllLossBackward>)



 36%|███▌      | 123/346 [10:53<20:00,  5.38s/it][A

loss: tensor(0.9378, device='cuda:0', grad_fn=<NllLossBackward>)



 36%|███▌      | 124/346 [10:58<20:00,  5.41s/it][A

loss: tensor(0.9558, device='cuda:0', grad_fn=<NllLossBackward>)



 36%|███▌      | 125/346 [11:03<19:52,  5.40s/it][A

loss: tensor(0.9049, device='cuda:0', grad_fn=<NllLossBackward>)



 36%|███▋      | 126/346 [11:09<19:45,  5.39s/it][A

loss: tensor(0.9342, device='cuda:0', grad_fn=<NllLossBackward>)



 37%|███▋      | 127/346 [11:14<19:38,  5.38s/it][A

loss: tensor(0.9049, device='cuda:0', grad_fn=<NllLossBackward>)



 37%|███▋      | 128/346 [11:20<19:37,  5.40s/it][A

loss: tensor(0.9049, device='cuda:0', grad_fn=<NllLossBackward>)



 37%|███▋      | 129/346 [11:25<19:29,  5.39s/it][A

loss: tensor(0.9711, device='cuda:0', grad_fn=<NllLossBackward>)



 38%|███▊      | 130/346 [11:30<19:24,  5.39s/it][A

loss: tensor(0.9273, device='cuda:0', grad_fn=<NllLossBackward>)



 38%|███▊      | 131/346 [11:36<19:16,  5.38s/it][A

loss: tensor(0.9343, device='cuda:0', grad_fn=<NllLossBackward>)



 38%|███▊      | 132/346 [11:41<19:14,  5.39s/it][A

loss: tensor(0.9302, device='cuda:0', grad_fn=<NllLossBackward>)



 38%|███▊      | 133/346 [11:47<19:07,  5.39s/it][A

loss: tensor(0.9419, device='cuda:0', grad_fn=<NllLossBackward>)



 39%|███▊      | 134/346 [11:52<19:03,  5.40s/it][A

loss: tensor(0.9851, device='cuda:0', grad_fn=<NllLossBackward>)



 39%|███▉      | 135/346 [11:57<19:02,  5.41s/it][A

loss: tensor(0.9049, device='cuda:0', grad_fn=<NllLossBackward>)



 39%|███▉      | 136/346 [12:03<18:54,  5.40s/it][A

loss: tensor(0.9609, device='cuda:0', grad_fn=<NllLossBackward>)



 40%|███▉      | 137/346 [12:08<18:13,  5.23s/it][A

loss: tensor(0.9477, device='cuda:0', grad_fn=<NllLossBackward>)



 40%|███▉      | 138/346 [12:12<17:42,  5.11s/it][A

loss: tensor(0.9365, device='cuda:0', grad_fn=<NllLossBackward>)



 40%|████      | 139/346 [12:18<17:56,  5.20s/it][A

loss: tensor(0.9281, device='cuda:0', grad_fn=<NllLossBackward>)



 40%|████      | 140/346 [12:23<18:00,  5.24s/it][A

loss: tensor(0.9193, device='cuda:0', grad_fn=<NllLossBackward>)



 41%|████      | 141/346 [12:29<18:02,  5.28s/it][A

loss: tensor(0.9049, device='cuda:0', grad_fn=<NllLossBackward>)



 41%|████      | 142/346 [12:34<18:03,  5.31s/it][A

loss: tensor(0.9284, device='cuda:0', grad_fn=<NllLossBackward>)



 41%|████▏     | 143/346 [12:39<18:05,  5.35s/it][A

loss: tensor(0.9486, device='cuda:0', grad_fn=<NllLossBackward>)



 42%|████▏     | 144/346 [12:45<18:01,  5.35s/it][A

loss: tensor(0.9049, device='cuda:0', grad_fn=<NllLossBackward>)



 42%|████▏     | 145/346 [12:50<17:58,  5.36s/it][A

loss: tensor(0.9505, device='cuda:0', grad_fn=<NllLossBackward>)



 42%|████▏     | 146/346 [12:56<17:56,  5.38s/it][A

loss: tensor(0.9049, device='cuda:0', grad_fn=<NllLossBackward>)



 42%|████▏     | 147/346 [13:01<17:52,  5.39s/it][A

loss: tensor(0.9428, device='cuda:0', grad_fn=<NllLossBackward>)



 43%|████▎     | 148/346 [13:06<17:44,  5.38s/it][A

loss: tensor(0.9134, device='cuda:0', grad_fn=<NllLossBackward>)



 43%|████▎     | 149/346 [13:12<17:37,  5.37s/it][A

loss: tensor(0.9626, device='cuda:0', grad_fn=<NllLossBackward>)



 43%|████▎     | 150/346 [13:17<17:29,  5.35s/it][A

loss: tensor(0.9932, device='cuda:0', grad_fn=<NllLossBackward>)



 44%|████▎     | 151/346 [13:22<17:28,  5.37s/it][A

loss: tensor(0.9260, device='cuda:0', grad_fn=<NllLossBackward>)



 44%|████▍     | 152/346 [13:28<17:20,  5.37s/it][A

loss: tensor(0.9999, device='cuda:0', grad_fn=<NllLossBackward>)



 44%|████▍     | 153/346 [13:33<17:15,  5.36s/it][A

loss: tensor(0.9368, device='cuda:0', grad_fn=<NllLossBackward>)



 45%|████▍     | 154/346 [13:38<17:08,  5.35s/it][A

loss: tensor(0.9225, device='cuda:0', grad_fn=<NllLossBackward>)



 45%|████▍     | 155/346 [13:44<17:05,  5.37s/it][A

loss: tensor(0.9243, device='cuda:0', grad_fn=<NllLossBackward>)



 45%|████▌     | 156/346 [13:49<17:01,  5.38s/it][A

loss: tensor(0.9391, device='cuda:0', grad_fn=<NllLossBackward>)



 45%|████▌     | 157/346 [13:55<16:56,  5.38s/it][A

loss: tensor(0.9049, device='cuda:0', grad_fn=<NllLossBackward>)



 46%|████▌     | 158/346 [14:00<16:53,  5.39s/it][A

loss: tensor(0.9508, device='cuda:0', grad_fn=<NllLossBackward>)



 46%|████▌     | 159/346 [14:05<16:45,  5.38s/it][A

loss: tensor(0.9175, device='cuda:0', grad_fn=<NllLossBackward>)



 46%|████▌     | 160/346 [14:11<16:40,  5.38s/it][A

loss: tensor(0.9493, device='cuda:0', grad_fn=<NllLossBackward>)



 47%|████▋     | 161/346 [14:16<16:33,  5.37s/it][A

loss: tensor(0.9521, device='cuda:0', grad_fn=<NllLossBackward>)



 47%|████▋     | 162/346 [14:22<16:32,  5.39s/it][A

loss: tensor(0.9343, device='cuda:0', grad_fn=<NllLossBackward>)



 47%|████▋     | 163/346 [14:27<16:25,  5.39s/it][A

loss: tensor(0.9329, device='cuda:0', grad_fn=<NllLossBackward>)



 47%|████▋     | 164/346 [14:32<16:18,  5.38s/it][A

loss: tensor(0.9166, device='cuda:0', grad_fn=<NllLossBackward>)



 48%|████▊     | 165/346 [14:38<16:10,  5.36s/it][A

loss: tensor(0.9533, device='cuda:0', grad_fn=<NllLossBackward>)



 48%|████▊     | 166/346 [14:43<16:09,  5.38s/it][A

loss: tensor(0.9713, device='cuda:0', grad_fn=<NllLossBackward>)



 48%|████▊     | 167/346 [14:48<16:02,  5.38s/it][A

loss: tensor(0.9115, device='cuda:0', grad_fn=<NllLossBackward>)



 49%|████▊     | 168/346 [14:54<15:54,  5.36s/it][A

loss: tensor(0.9589, device='cuda:0', grad_fn=<NllLossBackward>)



 49%|████▉     | 169/346 [14:59<15:48,  5.36s/it][A

loss: tensor(0.9319, device='cuda:0', grad_fn=<NllLossBackward>)



 49%|████▉     | 170/346 [15:05<15:46,  5.38s/it][A

loss: tensor(0.9316, device='cuda:0', grad_fn=<NllLossBackward>)



 49%|████▉     | 171/346 [15:10<15:42,  5.38s/it][A

loss: tensor(1.0031, device='cuda:0', grad_fn=<NllLossBackward>)



 50%|████▉     | 172/346 [15:15<15:34,  5.37s/it][A

loss: tensor(0.9401, device='cuda:0', grad_fn=<NllLossBackward>)



 50%|█████     | 173/346 [15:21<15:30,  5.38s/it][A

loss: tensor(0.9709, device='cuda:0', grad_fn=<NllLossBackward>)



 50%|█████     | 174/346 [15:26<15:28,  5.40s/it][A

loss: tensor(0.9244, device='cuda:0', grad_fn=<NllLossBackward>)



 51%|█████     | 175/346 [15:31<15:21,  5.39s/it][A

loss: tensor(0.9485, device='cuda:0', grad_fn=<NllLossBackward>)



 51%|█████     | 176/346 [15:37<15:15,  5.38s/it][A

loss: tensor(0.9269, device='cuda:0', grad_fn=<NllLossBackward>)



 51%|█████     | 177/346 [15:42<15:09,  5.38s/it][A

loss: tensor(0.9312, device='cuda:0', grad_fn=<NllLossBackward>)



 51%|█████▏    | 178/346 [15:48<15:04,  5.39s/it][A

loss: tensor(0.9395, device='cuda:0', grad_fn=<NllLossBackward>)



 52%|█████▏    | 179/346 [15:53<14:58,  5.38s/it][A

loss: tensor(0.9855, device='cuda:0', grad_fn=<NllLossBackward>)



 52%|█████▏    | 180/346 [15:58<14:51,  5.37s/it][A

loss: tensor(0.9239, device='cuda:0', grad_fn=<NllLossBackward>)



 52%|█████▏    | 181/346 [16:04<14:50,  5.39s/it][A

loss: tensor(0.9276, device='cuda:0', grad_fn=<NllLossBackward>)



 53%|█████▎    | 182/346 [16:09<14:45,  5.40s/it][A

loss: tensor(0.9685, device='cuda:0', grad_fn=<NllLossBackward>)



 53%|█████▎    | 183/346 [16:15<14:39,  5.39s/it][A

loss: tensor(0.9049, device='cuda:0', grad_fn=<NllLossBackward>)



 53%|█████▎    | 184/346 [16:20<14:31,  5.38s/it][A

loss: tensor(0.9049, device='cuda:0', grad_fn=<NllLossBackward>)



 53%|█████▎    | 185/346 [16:25<14:27,  5.39s/it][A

loss: tensor(0.9145, device='cuda:0', grad_fn=<NllLossBackward>)



 54%|█████▍    | 186/346 [16:31<14:20,  5.38s/it][A

loss: tensor(0.9428, device='cuda:0', grad_fn=<NllLossBackward>)



 54%|█████▍    | 187/346 [16:36<14:14,  5.38s/it][A

loss: tensor(0.9308, device='cuda:0', grad_fn=<NllLossBackward>)



 54%|█████▍    | 188/346 [16:41<14:07,  5.37s/it][A

loss: tensor(0.9205, device='cuda:0', grad_fn=<NllLossBackward>)



 55%|█████▍    | 189/346 [16:47<14:05,  5.38s/it][A

loss: tensor(0.9377, device='cuda:0', grad_fn=<NllLossBackward>)



 55%|█████▍    | 190/346 [16:52<13:59,  5.38s/it][A

loss: tensor(0.9807, device='cuda:0', grad_fn=<NllLossBackward>)



 55%|█████▌    | 191/346 [16:58<13:52,  5.37s/it][A

loss: tensor(0.9263, device='cuda:0', grad_fn=<NllLossBackward>)



 55%|█████▌    | 192/346 [17:03<13:44,  5.36s/it][A

loss: tensor(0.9049, device='cuda:0', grad_fn=<NllLossBackward>)



 56%|█████▌    | 193/346 [17:08<13:40,  5.37s/it][A

loss: tensor(0.9194, device='cuda:0', grad_fn=<NllLossBackward>)



 56%|█████▌    | 194/346 [17:14<13:35,  5.36s/it][A

loss: tensor(0.9469, device='cuda:0', grad_fn=<NllLossBackward>)



 56%|█████▋    | 195/346 [17:19<13:30,  5.37s/it][A

loss: tensor(0.9610, device='cuda:0', grad_fn=<NllLossBackward>)



 57%|█████▋    | 196/346 [17:24<13:25,  5.37s/it][A

loss: tensor(0.9634, device='cuda:0', grad_fn=<NllLossBackward>)



 57%|█████▋    | 197/346 [17:30<13:21,  5.38s/it][A

loss: tensor(0.9141, device='cuda:0', grad_fn=<NllLossBackward>)



 57%|█████▋    | 198/346 [17:35<13:15,  5.38s/it][A

loss: tensor(0.9228, device='cuda:0', grad_fn=<NllLossBackward>)



 58%|█████▊    | 199/346 [17:40<13:08,  5.37s/it][A

loss: tensor(1.0039, device='cuda:0', grad_fn=<NllLossBackward>)



 58%|█████▊    | 200/346 [17:46<13:05,  5.38s/it][A

loss: tensor(0.9825, device='cuda:0', grad_fn=<NllLossBackward>)



 58%|█████▊    | 201/346 [17:51<12:58,  5.37s/it][A

loss: tensor(0.9135, device='cuda:0', grad_fn=<NllLossBackward>)



 58%|█████▊    | 202/346 [17:57<12:52,  5.37s/it][A

loss: tensor(0.9508, device='cuda:0', grad_fn=<NllLossBackward>)



 59%|█████▊    | 203/346 [18:02<12:48,  5.38s/it][A

loss: tensor(0.9553, device='cuda:0', grad_fn=<NllLossBackward>)



 59%|█████▉    | 204/346 [18:07<12:45,  5.39s/it][A

loss: tensor(0.9237, device='cuda:0', grad_fn=<NllLossBackward>)



 59%|█████▉    | 205/346 [18:13<12:37,  5.38s/it][A

loss: tensor(0.9339, device='cuda:0', grad_fn=<NllLossBackward>)



 60%|█████▉    | 206/346 [18:18<12:32,  5.38s/it][A

loss: tensor(0.9294, device='cuda:0', grad_fn=<NllLossBackward>)



 60%|█████▉    | 207/346 [18:23<12:25,  5.36s/it][A

loss: tensor(0.9260, device='cuda:0', grad_fn=<NllLossBackward>)



 60%|██████    | 208/346 [18:29<12:20,  5.36s/it][A

loss: tensor(0.9152, device='cuda:0', grad_fn=<NllLossBackward>)



 60%|██████    | 209/346 [18:34<12:15,  5.37s/it][A

loss: tensor(0.9049, device='cuda:0', grad_fn=<NllLossBackward>)



 61%|██████    | 210/346 [18:40<12:09,  5.36s/it][A

loss: tensor(0.9049, device='cuda:0', grad_fn=<NllLossBackward>)



 61%|██████    | 211/346 [18:45<12:03,  5.36s/it][A

loss: tensor(1.0203, device='cuda:0', grad_fn=<NllLossBackward>)



 61%|██████▏   | 212/346 [18:50<12:01,  5.39s/it][A

loss: tensor(0.9299, device='cuda:0', grad_fn=<NllLossBackward>)



 62%|██████▏   | 213/346 [18:56<11:56,  5.39s/it][A

loss: tensor(0.9495, device='cuda:0', grad_fn=<NllLossBackward>)



 62%|██████▏   | 214/346 [19:01<11:50,  5.38s/it][A

loss: tensor(0.9346, device='cuda:0', grad_fn=<NllLossBackward>)



 62%|██████▏   | 215/346 [19:06<11:44,  5.38s/it][A

loss: tensor(0.9282, device='cuda:0', grad_fn=<NllLossBackward>)



 62%|██████▏   | 216/346 [19:12<11:41,  5.40s/it][A

loss: tensor(0.9766, device='cuda:0', grad_fn=<NllLossBackward>)



 63%|██████▎   | 217/346 [19:17<11:34,  5.39s/it][A

loss: tensor(0.9461, device='cuda:0', grad_fn=<NllLossBackward>)



 63%|██████▎   | 218/346 [19:23<11:27,  5.37s/it][A

loss: tensor(0.9292, device='cuda:0', grad_fn=<NllLossBackward>)



 63%|██████▎   | 219/346 [19:28<11:21,  5.37s/it][A

loss: tensor(0.9222, device='cuda:0', grad_fn=<NllLossBackward>)



 64%|██████▎   | 220/346 [19:33<11:18,  5.38s/it][A

loss: tensor(0.9049, device='cuda:0', grad_fn=<NllLossBackward>)



 64%|██████▍   | 221/346 [19:39<11:12,  5.38s/it][A

loss: tensor(1.0007, device='cuda:0', grad_fn=<NllLossBackward>)



 64%|██████▍   | 222/346 [19:44<11:05,  5.37s/it][A

loss: tensor(0.9466, device='cuda:0', grad_fn=<NllLossBackward>)



 64%|██████▍   | 223/346 [19:49<11:00,  5.37s/it][A

loss: tensor(0.9637, device='cuda:0', grad_fn=<NllLossBackward>)



 65%|██████▍   | 224/346 [19:55<10:56,  5.38s/it][A

loss: tensor(0.9049, device='cuda:0', grad_fn=<NllLossBackward>)



 65%|██████▌   | 225/346 [20:00<10:51,  5.39s/it][A

loss: tensor(0.9240, device='cuda:0', grad_fn=<NllLossBackward>)



 65%|██████▌   | 226/346 [20:06<10:45,  5.38s/it][A

loss: tensor(0.9400, device='cuda:0', grad_fn=<NllLossBackward>)



 66%|██████▌   | 227/346 [20:11<10:42,  5.40s/it][A

loss: tensor(0.9200, device='cuda:0', grad_fn=<NllLossBackward>)



 66%|██████▌   | 228/346 [20:16<10:35,  5.38s/it][A

loss: tensor(0.9280, device='cuda:0', grad_fn=<NllLossBackward>)



 66%|██████▌   | 229/346 [20:22<10:29,  5.38s/it][A

loss: tensor(0.9315, device='cuda:0', grad_fn=<NllLossBackward>)



 66%|██████▋   | 230/346 [20:27<10:23,  5.37s/it][A

loss: tensor(0.9806, device='cuda:0', grad_fn=<NllLossBackward>)



 67%|██████▋   | 231/346 [20:33<10:17,  5.37s/it][A

loss: tensor(0.9049, device='cuda:0', grad_fn=<NllLossBackward>)



 67%|██████▋   | 232/346 [20:38<10:12,  5.37s/it][A

loss: tensor(0.9601, device='cuda:0', grad_fn=<NllLossBackward>)



 67%|██████▋   | 233/346 [20:43<10:06,  5.36s/it][A

loss: tensor(0.9339, device='cuda:0', grad_fn=<NllLossBackward>)



 68%|██████▊   | 234/346 [20:49<09:59,  5.36s/it][A

loss: tensor(0.9361, device='cuda:0', grad_fn=<NllLossBackward>)



 68%|██████▊   | 235/346 [20:54<09:56,  5.38s/it][A

loss: tensor(0.9494, device='cuda:0', grad_fn=<NllLossBackward>)



 68%|██████▊   | 236/346 [20:59<09:50,  5.37s/it][A

loss: tensor(0.9665, device='cuda:0', grad_fn=<NllLossBackward>)



 68%|██████▊   | 237/346 [21:05<09:44,  5.36s/it][A

loss: tensor(0.9522, device='cuda:0', grad_fn=<NllLossBackward>)



 69%|██████▉   | 238/346 [21:10<09:38,  5.36s/it][A

loss: tensor(0.9276, device='cuda:0', grad_fn=<NllLossBackward>)



 69%|██████▉   | 239/346 [21:15<09:35,  5.38s/it][A

loss: tensor(0.9754, device='cuda:0', grad_fn=<NllLossBackward>)



 69%|██████▉   | 240/346 [21:21<09:28,  5.36s/it][A

loss: tensor(0.9049, device='cuda:0', grad_fn=<NllLossBackward>)



 70%|██████▉   | 241/346 [21:26<09:22,  5.36s/it][A

loss: tensor(0.9132, device='cuda:0', grad_fn=<NllLossBackward>)



 70%|██████▉   | 242/346 [21:31<09:17,  5.36s/it][A

loss: tensor(0.9941, device='cuda:0', grad_fn=<NllLossBackward>)



 70%|███████   | 243/346 [21:37<09:13,  5.38s/it][A

loss: tensor(0.9557, device='cuda:0', grad_fn=<NllLossBackward>)



 71%|███████   | 244/346 [21:42<09:08,  5.38s/it][A

loss: tensor(0.9049, device='cuda:0', grad_fn=<NllLossBackward>)



 71%|███████   | 245/346 [21:48<09:02,  5.37s/it][A

loss: tensor(0.9167, device='cuda:0', grad_fn=<NllLossBackward>)



 71%|███████   | 246/346 [21:53<08:58,  5.38s/it][A

loss: tensor(0.9224, device='cuda:0', grad_fn=<NllLossBackward>)



 71%|███████▏  | 247/346 [21:58<08:52,  5.38s/it][A

loss: tensor(0.9801, device='cuda:0', grad_fn=<NllLossBackward>)



 72%|███████▏  | 248/346 [22:04<08:46,  5.37s/it][A

loss: tensor(0.9575, device='cuda:0', grad_fn=<NllLossBackward>)



 72%|███████▏  | 249/346 [22:09<08:41,  5.37s/it][A

loss: tensor(0.9355, device='cuda:0', grad_fn=<NllLossBackward>)



 72%|███████▏  | 250/346 [22:15<08:36,  5.38s/it][A

loss: tensor(0.9401, device='cuda:0', grad_fn=<NllLossBackward>)



 73%|███████▎  | 251/346 [22:20<08:30,  5.38s/it][A

loss: tensor(0.9218, device='cuda:0', grad_fn=<NllLossBackward>)



 73%|███████▎  | 252/346 [22:25<08:25,  5.38s/it][A

loss: tensor(0.9049, device='cuda:0', grad_fn=<NllLossBackward>)



 73%|███████▎  | 253/346 [22:31<08:20,  5.38s/it][A

loss: tensor(0.9389, device='cuda:0', grad_fn=<NllLossBackward>)



 73%|███████▎  | 254/346 [22:36<08:15,  5.39s/it][A

loss: tensor(0.9671, device='cuda:0', grad_fn=<NllLossBackward>)



 74%|███████▎  | 255/346 [22:41<08:09,  5.38s/it][A

loss: tensor(0.9206, device='cuda:0', grad_fn=<NllLossBackward>)



 74%|███████▍  | 256/346 [22:47<08:03,  5.37s/it][A

loss: tensor(0.9217, device='cuda:0', grad_fn=<NllLossBackward>)



 74%|███████▍  | 257/346 [22:52<07:57,  5.36s/it][A

loss: tensor(0.9442, device='cuda:0', grad_fn=<NllLossBackward>)



 75%|███████▍  | 258/346 [22:58<07:53,  5.38s/it][A

loss: tensor(0.9796, device='cuda:0', grad_fn=<NllLossBackward>)



 75%|███████▍  | 259/346 [23:03<07:48,  5.38s/it][A

loss: tensor(0.9393, device='cuda:0', grad_fn=<NllLossBackward>)



 75%|███████▌  | 260/346 [23:08<07:43,  5.39s/it][A

loss: tensor(0.9503, device='cuda:0', grad_fn=<NllLossBackward>)



 75%|███████▌  | 261/346 [23:14<07:37,  5.38s/it][A

loss: tensor(1.0651, device='cuda:0', grad_fn=<NllLossBackward>)



 76%|███████▌  | 262/346 [23:19<07:33,  5.40s/it][A

loss: tensor(0.9632, device='cuda:0', grad_fn=<NllLossBackward>)



 76%|███████▌  | 263/346 [23:25<07:27,  5.39s/it][A

loss: tensor(0.9498, device='cuda:0', grad_fn=<NllLossBackward>)



 76%|███████▋  | 264/346 [23:30<07:20,  5.37s/it][A

loss: tensor(0.9573, device='cuda:0', grad_fn=<NllLossBackward>)



 77%|███████▋  | 265/346 [23:35<07:14,  5.36s/it][A

loss: tensor(0.9727, device='cuda:0', grad_fn=<NllLossBackward>)



 77%|███████▋  | 266/346 [23:41<07:10,  5.38s/it][A

loss: tensor(0.9200, device='cuda:0', grad_fn=<NllLossBackward>)



 77%|███████▋  | 267/346 [23:46<07:04,  5.37s/it][A

loss: tensor(0.9862, device='cuda:0', grad_fn=<NllLossBackward>)



 77%|███████▋  | 268/346 [23:51<06:58,  5.36s/it][A

loss: tensor(0.9135, device='cuda:0', grad_fn=<NllLossBackward>)



 78%|███████▊  | 269/346 [23:57<06:52,  5.36s/it][A

loss: tensor(0.9150, device='cuda:0', grad_fn=<NllLossBackward>)



 78%|███████▊  | 270/346 [24:02<06:49,  5.38s/it][A

loss: tensor(0.9197, device='cuda:0', grad_fn=<NllLossBackward>)



 78%|███████▊  | 271/346 [24:07<06:43,  5.37s/it][A

loss: tensor(0.9219, device='cuda:0', grad_fn=<NllLossBackward>)



 79%|███████▊  | 272/346 [24:13<06:37,  5.37s/it][A

loss: tensor(0.9049, device='cuda:0', grad_fn=<NllLossBackward>)



 79%|███████▉  | 273/346 [24:18<06:33,  5.39s/it][A

loss: tensor(0.9824, device='cuda:0', grad_fn=<NllLossBackward>)



 79%|███████▉  | 274/346 [24:24<06:26,  5.37s/it][A

loss: tensor(0.9355, device='cuda:0', grad_fn=<NllLossBackward>)



 79%|███████▉  | 275/346 [24:29<06:20,  5.36s/it][A

loss: tensor(0.9343, device='cuda:0', grad_fn=<NllLossBackward>)



 80%|███████▉  | 276/346 [24:34<06:14,  5.35s/it][A

loss: tensor(0.9811, device='cuda:0', grad_fn=<NllLossBackward>)



 80%|████████  | 277/346 [24:40<06:09,  5.36s/it][A

loss: tensor(0.9345, device='cuda:0', grad_fn=<NllLossBackward>)



 80%|████████  | 278/346 [24:45<06:04,  5.36s/it][A

loss: tensor(0.9698, device='cuda:0', grad_fn=<NllLossBackward>)



 81%|████████  | 279/346 [24:50<05:59,  5.36s/it][A

loss: tensor(0.9757, device='cuda:0', grad_fn=<NllLossBackward>)



 81%|████████  | 280/346 [24:56<05:53,  5.35s/it][A

loss: tensor(0.9250, device='cuda:0', grad_fn=<NllLossBackward>)



 81%|████████  | 281/346 [25:01<05:49,  5.38s/it][A

loss: tensor(0.9704, device='cuda:0', grad_fn=<NllLossBackward>)



 82%|████████▏ | 282/346 [25:06<05:43,  5.37s/it][A

loss: tensor(0.9049, device='cuda:0', grad_fn=<NllLossBackward>)



 82%|████████▏ | 283/346 [25:12<05:37,  5.36s/it][A

loss: tensor(0.9348, device='cuda:0', grad_fn=<NllLossBackward>)



 82%|████████▏ | 284/346 [25:17<05:32,  5.36s/it][A

loss: tensor(0.9269, device='cuda:0', grad_fn=<NllLossBackward>)



 82%|████████▏ | 285/346 [25:23<05:28,  5.38s/it][A

loss: tensor(0.9272, device='cuda:0', grad_fn=<NllLossBackward>)



 83%|████████▎ | 286/346 [25:28<05:21,  5.36s/it][A

loss: tensor(0.9667, device='cuda:0', grad_fn=<NllLossBackward>)



 83%|████████▎ | 287/346 [25:33<05:16,  5.36s/it][A

loss: tensor(0.9665, device='cuda:0', grad_fn=<NllLossBackward>)



 83%|████████▎ | 288/346 [25:39<05:10,  5.35s/it][A

loss: tensor(0.9169, device='cuda:0', grad_fn=<NllLossBackward>)



 84%|████████▎ | 289/346 [25:44<05:05,  5.36s/it][A

loss: tensor(0.9223, device='cuda:0', grad_fn=<NllLossBackward>)



 84%|████████▍ | 290/346 [25:49<05:00,  5.36s/it][A

loss: tensor(0.9325, device='cuda:0', grad_fn=<NllLossBackward>)



 84%|████████▍ | 291/346 [25:55<04:54,  5.36s/it][A

loss: tensor(0.9131, device='cuda:0', grad_fn=<NllLossBackward>)



 84%|████████▍ | 292/346 [26:00<04:49,  5.36s/it][A

loss: tensor(0.9370, device='cuda:0', grad_fn=<NllLossBackward>)



 85%|████████▍ | 293/346 [26:05<04:45,  5.38s/it][A

loss: tensor(0.9227, device='cuda:0', grad_fn=<NllLossBackward>)



 85%|████████▍ | 294/346 [26:11<04:39,  5.37s/it][A

loss: tensor(0.9404, device='cuda:0', grad_fn=<NllLossBackward>)



 85%|████████▌ | 295/346 [26:16<04:33,  5.37s/it][A

loss: tensor(0.9240, device='cuda:0', grad_fn=<NllLossBackward>)



 86%|████████▌ | 296/346 [26:22<04:29,  5.39s/it][A

loss: tensor(0.9627, device='cuda:0', grad_fn=<NllLossBackward>)



 86%|████████▌ | 297/346 [26:27<04:23,  5.37s/it][A

loss: tensor(0.9756, device='cuda:0', grad_fn=<NllLossBackward>)



 86%|████████▌ | 298/346 [26:32<04:18,  5.38s/it][A

loss: tensor(0.9049, device='cuda:0', grad_fn=<NllLossBackward>)



 86%|████████▋ | 299/346 [26:38<04:12,  5.38s/it][A

loss: tensor(0.9251, device='cuda:0', grad_fn=<NllLossBackward>)



 87%|████████▋ | 300/346 [26:43<04:07,  5.39s/it][A

loss: tensor(0.9361, device='cuda:0', grad_fn=<NllLossBackward>)



 87%|████████▋ | 301/346 [26:49<04:02,  5.38s/it][A

loss: tensor(0.9289, device='cuda:0', grad_fn=<NllLossBackward>)



 87%|████████▋ | 302/346 [26:54<03:56,  5.38s/it][A

loss: tensor(0.9245, device='cuda:0', grad_fn=<NllLossBackward>)



 88%|████████▊ | 303/346 [26:59<03:50,  5.37s/it][A

loss: tensor(0.9561, device='cuda:0', grad_fn=<NllLossBackward>)



 88%|████████▊ | 304/346 [27:05<03:45,  5.38s/it][A

loss: tensor(0.9150, device='cuda:0', grad_fn=<NllLossBackward>)



 88%|████████▊ | 305/346 [27:10<03:39,  5.36s/it][A

loss: tensor(0.9215, device='cuda:0', grad_fn=<NllLossBackward>)



 88%|████████▊ | 306/346 [27:15<03:34,  5.36s/it][A

loss: tensor(0.9543, device='cuda:0', grad_fn=<NllLossBackward>)



 89%|████████▊ | 307/346 [27:21<03:29,  5.37s/it][A

loss: tensor(0.9579, device='cuda:0', grad_fn=<NllLossBackward>)



 89%|████████▉ | 308/346 [27:26<03:24,  5.39s/it][A

loss: tensor(0.9408, device='cuda:0', grad_fn=<NllLossBackward>)



 89%|████████▉ | 309/346 [27:31<03:18,  5.37s/it][A

loss: tensor(0.9813, device='cuda:0', grad_fn=<NllLossBackward>)



 90%|████████▉ | 310/346 [27:37<03:13,  5.37s/it][A

loss: tensor(0.9930, device='cuda:0', grad_fn=<NllLossBackward>)



 90%|████████▉ | 311/346 [27:42<03:07,  5.36s/it][A

loss: tensor(0.9531, device='cuda:0', grad_fn=<NllLossBackward>)



 90%|█████████ | 312/346 [27:48<03:03,  5.39s/it][A

loss: tensor(0.9540, device='cuda:0', grad_fn=<NllLossBackward>)



 90%|█████████ | 313/346 [27:53<02:57,  5.38s/it][A

loss: tensor(0.9049, device='cuda:0', grad_fn=<NllLossBackward>)



 91%|█████████ | 314/346 [27:58<02:51,  5.37s/it][A

loss: tensor(0.9245, device='cuda:0', grad_fn=<NllLossBackward>)



 91%|█████████ | 315/346 [28:04<02:46,  5.39s/it][A

loss: tensor(0.9249, device='cuda:0', grad_fn=<NllLossBackward>)



 91%|█████████▏| 316/346 [28:09<02:41,  5.37s/it][A

loss: tensor(1.0263, device='cuda:0', grad_fn=<NllLossBackward>)



 92%|█████████▏| 317/346 [28:14<02:35,  5.37s/it][A

loss: tensor(0.9135, device='cuda:0', grad_fn=<NllLossBackward>)



 92%|█████████▏| 318/346 [28:20<02:30,  5.37s/it][A

loss: tensor(0.9767, device='cuda:0', grad_fn=<NllLossBackward>)



 92%|█████████▏| 319/346 [28:25<02:25,  5.39s/it][A

loss: tensor(0.9326, device='cuda:0', grad_fn=<NllLossBackward>)



 92%|█████████▏| 320/346 [28:31<02:20,  5.39s/it][A

loss: tensor(0.9492, device='cuda:0', grad_fn=<NllLossBackward>)



 93%|█████████▎| 321/346 [28:36<02:14,  5.38s/it][A

loss: tensor(0.9347, device='cuda:0', grad_fn=<NllLossBackward>)



 93%|█████████▎| 322/346 [28:41<02:08,  5.37s/it][A

loss: tensor(0.9912, device='cuda:0', grad_fn=<NllLossBackward>)



 93%|█████████▎| 323/346 [28:47<02:03,  5.39s/it][A

loss: tensor(0.9462, device='cuda:0', grad_fn=<NllLossBackward>)



 94%|█████████▎| 324/346 [28:52<01:58,  5.37s/it][A

loss: tensor(0.9159, device='cuda:0', grad_fn=<NllLossBackward>)



 94%|█████████▍| 325/346 [28:57<01:52,  5.36s/it][A

loss: tensor(0.9855, device='cuda:0', grad_fn=<NllLossBackward>)



 94%|█████████▍| 326/346 [29:03<01:47,  5.36s/it][A

loss: tensor(0.9184, device='cuda:0', grad_fn=<NllLossBackward>)



 95%|█████████▍| 327/346 [29:08<01:42,  5.39s/it][A

loss: tensor(0.9364, device='cuda:0', grad_fn=<NllLossBackward>)



 95%|█████████▍| 328/346 [29:14<01:36,  5.37s/it][A

loss: tensor(0.9049, device='cuda:0', grad_fn=<NllLossBackward>)



 95%|█████████▌| 329/346 [29:19<01:31,  5.37s/it][A

loss: tensor(0.9122, device='cuda:0', grad_fn=<NllLossBackward>)



 95%|█████████▌| 330/346 [29:24<01:25,  5.36s/it][A

loss: tensor(1.0367, device='cuda:0', grad_fn=<NllLossBackward>)



 96%|█████████▌| 331/346 [29:30<01:20,  5.38s/it][A

loss: tensor(0.9612, device='cuda:0', grad_fn=<NllLossBackward>)



 96%|█████████▌| 332/346 [29:35<01:15,  5.38s/it][A

loss: tensor(0.9295, device='cuda:0', grad_fn=<NllLossBackward>)



 96%|█████████▌| 333/346 [29:40<01:09,  5.37s/it][A

loss: tensor(0.9291, device='cuda:0', grad_fn=<NllLossBackward>)



 97%|█████████▋| 334/346 [29:46<01:04,  5.37s/it][A

loss: tensor(0.9613, device='cuda:0', grad_fn=<NllLossBackward>)



 97%|█████████▋| 335/346 [29:51<00:59,  5.39s/it][A

loss: tensor(0.9503, device='cuda:0', grad_fn=<NllLossBackward>)



 97%|█████████▋| 336/346 [29:57<00:53,  5.38s/it][A

loss: tensor(0.9150, device='cuda:0', grad_fn=<NllLossBackward>)



 97%|█████████▋| 337/346 [30:02<00:48,  5.37s/it][A

loss: tensor(1.0054, device='cuda:0', grad_fn=<NllLossBackward>)



 98%|█████████▊| 338/346 [30:07<00:43,  5.38s/it][A

loss: tensor(0.9497, device='cuda:0', grad_fn=<NllLossBackward>)



 98%|█████████▊| 339/346 [30:13<00:37,  5.37s/it][A

loss: tensor(0.9286, device='cuda:0', grad_fn=<NllLossBackward>)



 98%|█████████▊| 340/346 [30:18<00:32,  5.37s/it][A

loss: tensor(0.9583, device='cuda:0', grad_fn=<NllLossBackward>)



 99%|█████████▊| 341/346 [30:23<00:26,  5.37s/it][A

loss: tensor(0.9237, device='cuda:0', grad_fn=<NllLossBackward>)



 99%|█████████▉| 342/346 [30:29<00:21,  5.38s/it][A

loss: tensor(0.9139, device='cuda:0', grad_fn=<NllLossBackward>)



 99%|█████████▉| 343/346 [30:34<00:16,  5.37s/it][A

loss: tensor(0.9149, device='cuda:0', grad_fn=<NllLossBackward>)



 99%|█████████▉| 344/346 [30:40<00:10,  5.37s/it][A

loss: tensor(0.9399, device='cuda:0', grad_fn=<NllLossBackward>)



100%|█████████▉| 345/346 [30:45<00:05,  5.36s/it][A

loss: tensor(0.9460, device='cuda:0', grad_fn=<NllLossBackward>)



100%|██████████| 346/346 [30:45<00:00,  5.33s/it][A

  0%|          | 0/173 [00:00<?, ?it/s][A

loss: tensor(1.3166, device='cuda:0', grad_fn=<NllLossBackward>)

	Training Loss: 0.9428993813219787

	Training acc: 0.9619825502359478

	Training prec: 0.9266243360304762

	Training rec: 0.9619825502359478

	Training f1: 0.9436914760284201

	Current Learning rate:  2e-05



  1%|          | 1/173 [00:00<01:49,  1.57it/s][A
  1%|          | 2/173 [00:01<01:58,  1.44it/s][A
  2%|▏         | 3/173 [00:02<01:59,  1.42it/s][A
  2%|▏         | 4/173 [00:02<01:54,  1.48it/s][A
  3%|▎         | 5/173 [00:03<01:55,  1.45it/s][A
  3%|▎         | 6/173 [00:04<01:56,  1.44it/s][A
  4%|▍         | 7/173 [00:04<01:52,  1.48it/s][A
  5%|▍         | 8/173 [00:05<01:53,  1.45it/s][A
  5%|▌         | 9/173 [00:06<01:54,  1.43it/s][A
  6%|▌         | 10/173 [00:06<01:51,  1.47it/s][A
  6%|▋         | 11/173 [00:07<01:51,  1.45it/s][A
  7%|▋         | 12/173 [00:08<01:52,  1.43it/s][A
  8%|▊         | 13/173 [00:08<01:48,  1.47it/s][A
  8%|▊         | 14/173 [00:09<01:49,  1.45it/s][A
  9%|▊         | 15/173 [00:10<01:50,  1.43it/s][A
  9%|▉         | 16/173 [00:10<01:46,  1.47it/s][A
 10%|▉         | 17/173 [00:11<01:47,  1.45it/s][A
 10%|█         | 18/173 [00:12<01:47,  1.44it/s][A
 11%|█         | 19/173 [00:13<01:44,  1.47it/s][A
 12%|█▏        | 20/


	Validation Loss: 0.9456173908503759

	Validation acc: 0.9592226552894652

	Validation prec: 0.9217341113627645

	Validation rec: 0.9592226552894652

	Validation f1: 0.9396990480913392



  0%|          | 1/346 [00:05<31:00,  5.39s/it][A

loss: tensor(0.9315, device='cuda:0', grad_fn=<NllLossBackward>)



  1%|          | 2/346 [00:10<30:41,  5.35s/it][A

loss: tensor(0.9628, device='cuda:0', grad_fn=<NllLossBackward>)



  1%|          | 3/346 [00:16<30:37,  5.36s/it][A

loss: tensor(0.9229, device='cuda:0', grad_fn=<NllLossBackward>)



  1%|          | 4/346 [00:21<30:46,  5.40s/it][A

loss: tensor(0.9737, device='cuda:0', grad_fn=<NllLossBackward>)



  1%|▏         | 5/346 [00:26<30:35,  5.38s/it][A

loss: tensor(0.9294, device='cuda:0', grad_fn=<NllLossBackward>)



  2%|▏         | 6/346 [00:32<30:25,  5.37s/it][A

loss: tensor(0.9197, device='cuda:0', grad_fn=<NllLossBackward>)



  2%|▏         | 7/346 [00:37<30:20,  5.37s/it][A

loss: tensor(0.9330, device='cuda:0', grad_fn=<NllLossBackward>)



  2%|▏         | 8/346 [00:43<30:18,  5.38s/it][A

loss: tensor(0.9214, device='cuda:0', grad_fn=<NllLossBackward>)



  3%|▎         | 9/346 [00:48<30:09,  5.37s/it][A

loss: tensor(0.9386, device='cuda:0', grad_fn=<NllLossBackward>)



  3%|▎         | 10/346 [00:53<30:01,  5.36s/it][A

loss: tensor(0.9214, device='cuda:0', grad_fn=<NllLossBackward>)



  3%|▎         | 11/346 [00:59<30:02,  5.38s/it][A

loss: tensor(0.9497, device='cuda:0', grad_fn=<NllLossBackward>)



  3%|▎         | 12/346 [01:04<29:50,  5.36s/it][A

loss: tensor(0.9929, device='cuda:0', grad_fn=<NllLossBackward>)



  4%|▍         | 13/346 [01:09<29:47,  5.37s/it][A

loss: tensor(0.9689, device='cuda:0', grad_fn=<NllLossBackward>)



  4%|▍         | 14/346 [01:15<29:42,  5.37s/it][A

loss: tensor(0.9332, device='cuda:0', grad_fn=<NllLossBackward>)



  4%|▍         | 15/346 [01:20<29:42,  5.39s/it][A

loss: tensor(0.9215, device='cuda:0', grad_fn=<NllLossBackward>)



  5%|▍         | 16/346 [01:25<29:34,  5.38s/it][A

loss: tensor(0.9882, device='cuda:0', grad_fn=<NllLossBackward>)



  5%|▍         | 17/346 [01:31<29:29,  5.38s/it][A

loss: tensor(0.9533, device='cuda:0', grad_fn=<NllLossBackward>)



  5%|▌         | 18/346 [01:36<29:20,  5.37s/it][A

loss: tensor(0.9280, device='cuda:0', grad_fn=<NllLossBackward>)



  5%|▌         | 19/346 [01:42<29:21,  5.39s/it][A

loss: tensor(0.9357, device='cuda:0', grad_fn=<NllLossBackward>)



  6%|▌         | 20/346 [01:47<29:13,  5.38s/it][A

loss: tensor(0.9659, device='cuda:0', grad_fn=<NllLossBackward>)



  6%|▌         | 21/346 [01:52<29:04,  5.37s/it][A

loss: tensor(0.9413, device='cuda:0', grad_fn=<NllLossBackward>)



  6%|▋         | 22/346 [01:58<29:03,  5.38s/it][A

loss: tensor(0.9419, device='cuda:0', grad_fn=<NllLossBackward>)



  7%|▋         | 23/346 [02:03<29:02,  5.39s/it][A

loss: tensor(0.9646, device='cuda:0', grad_fn=<NllLossBackward>)



  7%|▋         | 24/346 [02:09<28:52,  5.38s/it][A

loss: tensor(0.9378, device='cuda:0', grad_fn=<NllLossBackward>)



  7%|▋         | 25/346 [02:14<28:47,  5.38s/it][A

loss: tensor(0.9049, device='cuda:0', grad_fn=<NllLossBackward>)



  8%|▊         | 26/346 [02:19<28:39,  5.37s/it][A

loss: tensor(0.9201, device='cuda:0', grad_fn=<NllLossBackward>)



  8%|▊         | 27/346 [02:25<28:38,  5.39s/it][A

loss: tensor(0.9165, device='cuda:0', grad_fn=<NllLossBackward>)



  8%|▊         | 28/346 [02:30<28:30,  5.38s/it][A

loss: tensor(0.9049, device='cuda:0', grad_fn=<NllLossBackward>)



  8%|▊         | 29/346 [02:35<28:23,  5.37s/it][A

loss: tensor(0.9438, device='cuda:0', grad_fn=<NllLossBackward>)



  9%|▊         | 30/346 [02:41<28:15,  5.37s/it][A

loss: tensor(0.9424, device='cuda:0', grad_fn=<NllLossBackward>)



  9%|▉         | 31/346 [02:46<28:14,  5.38s/it][A

loss: tensor(0.9512, device='cuda:0', grad_fn=<NllLossBackward>)



  9%|▉         | 32/346 [02:52<28:07,  5.37s/it][A

loss: tensor(0.9137, device='cuda:0', grad_fn=<NllLossBackward>)



 10%|▉         | 33/346 [02:57<28:01,  5.37s/it][A

loss: tensor(0.9359, device='cuda:0', grad_fn=<NllLossBackward>)



 10%|▉         | 34/346 [03:02<27:58,  5.38s/it][A

loss: tensor(0.9049, device='cuda:0', grad_fn=<NllLossBackward>)



 10%|█         | 35/346 [03:08<27:51,  5.38s/it][A

loss: tensor(0.9710, device='cuda:0', grad_fn=<NllLossBackward>)



 10%|█         | 36/346 [03:13<27:46,  5.38s/it][A

loss: tensor(0.9290, device='cuda:0', grad_fn=<NllLossBackward>)



 11%|█         | 37/346 [03:18<27:38,  5.37s/it][A

loss: tensor(0.9438, device='cuda:0', grad_fn=<NllLossBackward>)



 11%|█         | 38/346 [03:24<27:35,  5.38s/it][A

loss: tensor(0.9441, device='cuda:0', grad_fn=<NllLossBackward>)



 11%|█▏        | 39/346 [03:29<27:28,  5.37s/it][A

loss: tensor(0.9143, device='cuda:0', grad_fn=<NllLossBackward>)



 12%|█▏        | 40/346 [03:34<27:21,  5.36s/it][A

loss: tensor(0.9562, device='cuda:0', grad_fn=<NllLossBackward>)



 12%|█▏        | 41/346 [03:40<27:15,  5.36s/it][A

loss: tensor(0.9049, device='cuda:0', grad_fn=<NllLossBackward>)



 12%|█▏        | 42/346 [03:45<27:16,  5.38s/it][A

loss: tensor(0.9049, device='cuda:0', grad_fn=<NllLossBackward>)



 12%|█▏        | 43/346 [03:51<27:08,  5.38s/it][A

loss: tensor(0.9330, device='cuda:0', grad_fn=<NllLossBackward>)



 13%|█▎        | 44/346 [03:56<27:02,  5.37s/it][A

loss: tensor(0.9483, device='cuda:0', grad_fn=<NllLossBackward>)



 13%|█▎        | 45/346 [04:01<26:51,  5.35s/it][A

loss: tensor(0.9616, device='cuda:0', grad_fn=<NllLossBackward>)



 13%|█▎        | 46/346 [04:07<26:53,  5.38s/it][A

loss: tensor(0.9632, device='cuda:0', grad_fn=<NllLossBackward>)



 14%|█▎        | 47/346 [04:12<26:44,  5.37s/it][A

loss: tensor(0.9340, device='cuda:0', grad_fn=<NllLossBackward>)



 14%|█▍        | 48/346 [04:17<26:37,  5.36s/it][A

loss: tensor(0.9462, device='cuda:0', grad_fn=<NllLossBackward>)



 14%|█▍        | 49/346 [04:23<26:36,  5.38s/it][A

loss: tensor(0.9452, device='cuda:0', grad_fn=<NllLossBackward>)



 14%|█▍        | 50/346 [04:28<26:35,  5.39s/it][A

loss: tensor(0.9425, device='cuda:0', grad_fn=<NllLossBackward>)



 15%|█▍        | 51/346 [04:34<26:26,  5.38s/it][A

loss: tensor(0.9677, device='cuda:0', grad_fn=<NllLossBackward>)



 15%|█▌        | 52/346 [04:39<26:19,  5.37s/it][A

loss: tensor(0.9246, device='cuda:0', grad_fn=<NllLossBackward>)



 15%|█▌        | 53/346 [04:44<26:17,  5.39s/it][A

loss: tensor(0.9419, device='cuda:0', grad_fn=<NllLossBackward>)



 16%|█▌        | 54/346 [04:50<26:07,  5.37s/it][A

loss: tensor(0.9545, device='cuda:0', grad_fn=<NllLossBackward>)



 16%|█▌        | 55/346 [04:55<26:07,  5.39s/it][A

loss: tensor(0.9319, device='cuda:0', grad_fn=<NllLossBackward>)



 16%|█▌        | 56/346 [05:01<26:00,  5.38s/it][A

loss: tensor(0.9228, device='cuda:0', grad_fn=<NllLossBackward>)



 16%|█▋        | 57/346 [05:06<25:59,  5.40s/it][A

loss: tensor(0.9155, device='cuda:0', grad_fn=<NllLossBackward>)



 17%|█▋        | 58/346 [05:11<25:48,  5.38s/it][A

loss: tensor(0.9540, device='cuda:0', grad_fn=<NllLossBackward>)



 17%|█▋        | 59/346 [05:17<25:40,  5.37s/it][A

loss: tensor(0.9399, device='cuda:0', grad_fn=<NllLossBackward>)



 17%|█▋        | 60/346 [05:22<25:36,  5.37s/it][A

loss: tensor(0.9587, device='cuda:0', grad_fn=<NllLossBackward>)



 18%|█▊        | 61/346 [05:27<25:37,  5.39s/it][A

loss: tensor(0.9188, device='cuda:0', grad_fn=<NllLossBackward>)



 18%|█▊        | 62/346 [05:33<25:28,  5.38s/it][A

loss: tensor(1.0067, device='cuda:0', grad_fn=<NllLossBackward>)



 18%|█▊        | 63/346 [05:38<25:21,  5.38s/it][A

loss: tensor(0.9339, device='cuda:0', grad_fn=<NllLossBackward>)



 18%|█▊        | 64/346 [05:44<25:17,  5.38s/it][A

loss: tensor(0.9838, device='cuda:0', grad_fn=<NllLossBackward>)



 19%|█▉        | 65/346 [05:49<25:16,  5.40s/it][A

loss: tensor(0.9285, device='cuda:0', grad_fn=<NllLossBackward>)



 19%|█▉        | 66/346 [05:54<25:05,  5.38s/it][A

loss: tensor(0.9377, device='cuda:0', grad_fn=<NllLossBackward>)



 19%|█▉        | 67/346 [06:00<24:55,  5.36s/it][A

loss: tensor(0.9332, device='cuda:0', grad_fn=<NllLossBackward>)



 20%|█▉        | 68/346 [06:05<24:48,  5.35s/it][A

loss: tensor(0.9263, device='cuda:0', grad_fn=<NllLossBackward>)



 20%|█▉        | 69/346 [06:10<24:48,  5.37s/it][A

loss: tensor(0.9250, device='cuda:0', grad_fn=<NllLossBackward>)



 20%|██        | 70/346 [06:16<24:39,  5.36s/it][A

loss: tensor(0.9552, device='cuda:0', grad_fn=<NllLossBackward>)



 21%|██        | 71/346 [06:21<24:31,  5.35s/it][A

loss: tensor(0.9208, device='cuda:0', grad_fn=<NllLossBackward>)



 21%|██        | 72/346 [06:26<24:27,  5.35s/it][A

loss: tensor(0.9282, device='cuda:0', grad_fn=<NllLossBackward>)



 21%|██        | 73/346 [06:32<24:26,  5.37s/it][A

loss: tensor(0.9194, device='cuda:0', grad_fn=<NllLossBackward>)



 21%|██▏       | 74/346 [06:37<24:18,  5.36s/it][A

loss: tensor(0.9409, device='cuda:0', grad_fn=<NllLossBackward>)



 22%|██▏       | 75/346 [06:43<24:11,  5.36s/it][A

loss: tensor(0.9592, device='cuda:0', grad_fn=<NllLossBackward>)



 22%|██▏       | 76/346 [06:48<24:04,  5.35s/it][A

loss: tensor(0.9370, device='cuda:0', grad_fn=<NllLossBackward>)



 22%|██▏       | 77/346 [06:53<24:03,  5.36s/it][A

loss: tensor(0.9907, device='cuda:0', grad_fn=<NllLossBackward>)



 23%|██▎       | 78/346 [06:59<23:57,  5.37s/it][A

loss: tensor(0.9537, device='cuda:0', grad_fn=<NllLossBackward>)



 23%|██▎       | 79/346 [07:04<23:51,  5.36s/it][A

loss: tensor(0.9569, device='cuda:0', grad_fn=<NllLossBackward>)



 23%|██▎       | 80/346 [07:09<23:51,  5.38s/it][A

loss: tensor(0.9330, device='cuda:0', grad_fn=<NllLossBackward>)



 23%|██▎       | 81/346 [07:15<23:43,  5.37s/it][A

loss: tensor(0.9540, device='cuda:0', grad_fn=<NllLossBackward>)



 24%|██▎       | 82/346 [07:20<23:38,  5.37s/it][A

loss: tensor(0.9428, device='cuda:0', grad_fn=<NllLossBackward>)



 24%|██▍       | 83/346 [07:26<23:33,  5.37s/it][A

loss: tensor(0.9571, device='cuda:0', grad_fn=<NllLossBackward>)



 24%|██▍       | 84/346 [07:31<23:31,  5.39s/it][A

loss: tensor(0.9171, device='cuda:0', grad_fn=<NllLossBackward>)



 25%|██▍       | 85/346 [07:36<23:20,  5.37s/it][A

loss: tensor(0.9663, device='cuda:0', grad_fn=<NllLossBackward>)



 25%|██▍       | 86/346 [07:42<23:12,  5.35s/it][A

loss: tensor(0.9679, device='cuda:0', grad_fn=<NllLossBackward>)



 25%|██▌       | 87/346 [07:47<23:04,  5.34s/it][A

loss: tensor(0.9525, device='cuda:0', grad_fn=<NllLossBackward>)



 25%|██▌       | 88/346 [07:52<23:06,  5.37s/it][A

loss: tensor(0.9667, device='cuda:0', grad_fn=<NllLossBackward>)



 26%|██▌       | 89/346 [07:58<23:00,  5.37s/it][A

loss: tensor(0.9315, device='cuda:0', grad_fn=<NllLossBackward>)



 26%|██▌       | 90/346 [08:03<22:48,  5.35s/it][A

loss: tensor(1.0155, device='cuda:0', grad_fn=<NllLossBackward>)



 26%|██▋       | 91/346 [08:08<22:44,  5.35s/it][A

loss: tensor(0.9455, device='cuda:0', grad_fn=<NllLossBackward>)



 27%|██▋       | 92/346 [08:14<22:41,  5.36s/it][A

loss: tensor(0.9302, device='cuda:0', grad_fn=<NllLossBackward>)



 27%|██▋       | 93/346 [08:19<22:36,  5.36s/it][A

loss: tensor(0.9520, device='cuda:0', grad_fn=<NllLossBackward>)



 27%|██▋       | 94/346 [08:24<22:30,  5.36s/it][A

loss: tensor(0.9381, device='cuda:0', grad_fn=<NllLossBackward>)



 27%|██▋       | 95/346 [08:30<22:23,  5.35s/it][A

loss: tensor(0.9145, device='cuda:0', grad_fn=<NllLossBackward>)



 28%|██▊       | 96/346 [08:35<22:23,  5.37s/it][A

loss: tensor(0.9693, device='cuda:0', grad_fn=<NllLossBackward>)



 28%|██▊       | 97/346 [08:41<22:15,  5.36s/it][A

loss: tensor(0.9308, device='cuda:0', grad_fn=<NllLossBackward>)



 28%|██▊       | 98/346 [08:46<22:08,  5.36s/it][A

loss: tensor(0.9841, device='cuda:0', grad_fn=<NllLossBackward>)



 29%|██▊       | 99/346 [08:51<22:01,  5.35s/it][A

loss: tensor(0.9122, device='cuda:0', grad_fn=<NllLossBackward>)



 29%|██▉       | 100/346 [08:57<21:59,  5.36s/it][A

loss: tensor(0.9205, device='cuda:0', grad_fn=<NllLossBackward>)



 29%|██▉       | 101/346 [09:02<21:51,  5.35s/it][A

loss: tensor(0.9154, device='cuda:0', grad_fn=<NllLossBackward>)



 29%|██▉       | 102/346 [09:07<21:47,  5.36s/it][A

loss: tensor(0.9459, device='cuda:0', grad_fn=<NllLossBackward>)



 30%|██▉       | 103/346 [09:13<21:46,  5.38s/it][A

loss: tensor(0.9265, device='cuda:0', grad_fn=<NllLossBackward>)



 30%|███       | 104/346 [09:18<21:37,  5.36s/it][A

loss: tensor(0.9138, device='cuda:0', grad_fn=<NllLossBackward>)



 30%|███       | 105/346 [09:23<21:33,  5.37s/it][A

loss: tensor(0.9658, device='cuda:0', grad_fn=<NllLossBackward>)



 31%|███       | 106/346 [09:29<21:26,  5.36s/it][A

loss: tensor(0.9156, device='cuda:0', grad_fn=<NllLossBackward>)



 31%|███       | 107/346 [09:34<21:23,  5.37s/it][A

loss: tensor(0.9049, device='cuda:0', grad_fn=<NllLossBackward>)



 31%|███       | 108/346 [09:40<21:17,  5.37s/it][A

loss: tensor(0.9281, device='cuda:0', grad_fn=<NllLossBackward>)



 32%|███▏      | 109/346 [09:45<21:10,  5.36s/it][A

loss: tensor(0.9520, device='cuda:0', grad_fn=<NllLossBackward>)



 32%|███▏      | 110/346 [09:50<21:03,  5.36s/it][A

loss: tensor(0.9188, device='cuda:0', grad_fn=<NllLossBackward>)



 32%|███▏      | 111/346 [09:56<21:02,  5.37s/it][A

loss: tensor(0.9569, device='cuda:0', grad_fn=<NllLossBackward>)



 32%|███▏      | 112/346 [10:01<20:54,  5.36s/it][A

loss: tensor(0.9314, device='cuda:0', grad_fn=<NllLossBackward>)



 33%|███▎      | 113/346 [10:06<20:48,  5.36s/it][A

loss: tensor(0.9357, device='cuda:0', grad_fn=<NllLossBackward>)



 33%|███▎      | 114/346 [10:12<20:43,  5.36s/it][A

loss: tensor(0.9659, device='cuda:0', grad_fn=<NllLossBackward>)



 33%|███▎      | 115/346 [10:17<20:39,  5.36s/it][A

loss: tensor(0.9305, device='cuda:0', grad_fn=<NllLossBackward>)



 34%|███▎      | 116/346 [10:22<20:15,  5.29s/it][A

loss: tensor(0.9049, device='cuda:0', grad_fn=<NllLossBackward>)



 34%|███▍      | 117/346 [10:28<20:15,  5.31s/it][A

loss: tensor(0.9208, device='cuda:0', grad_fn=<NllLossBackward>)



 34%|███▍      | 118/346 [10:33<20:12,  5.32s/it][A

loss: tensor(0.9527, device='cuda:0', grad_fn=<NllLossBackward>)



 34%|███▍      | 119/346 [10:38<20:15,  5.35s/it][A

loss: tensor(0.9151, device='cuda:0', grad_fn=<NllLossBackward>)



 35%|███▍      | 120/346 [10:44<20:09,  5.35s/it][A

loss: tensor(0.9523, device='cuda:0', grad_fn=<NllLossBackward>)



 35%|███▍      | 121/346 [10:49<20:03,  5.35s/it][A

loss: tensor(0.9521, device='cuda:0', grad_fn=<NllLossBackward>)



 35%|███▌      | 122/346 [10:54<20:00,  5.36s/it][A

loss: tensor(0.9611, device='cuda:0', grad_fn=<NllLossBackward>)



 36%|███▌      | 123/346 [11:00<19:59,  5.38s/it][A

loss: tensor(0.9184, device='cuda:0', grad_fn=<NllLossBackward>)



 36%|███▌      | 124/346 [11:05<19:51,  5.37s/it][A

loss: tensor(0.9240, device='cuda:0', grad_fn=<NllLossBackward>)



 36%|███▌      | 125/346 [11:11<19:45,  5.37s/it][A

loss: tensor(0.9049, device='cuda:0', grad_fn=<NllLossBackward>)



 36%|███▋      | 126/346 [11:16<19:43,  5.38s/it][A

loss: tensor(0.9546, device='cuda:0', grad_fn=<NllLossBackward>)



 37%|███▋      | 127/346 [11:21<19:34,  5.37s/it][A

loss: tensor(0.9352, device='cuda:0', grad_fn=<NllLossBackward>)



 37%|███▋      | 128/346 [11:27<19:29,  5.37s/it][A

loss: tensor(0.9496, device='cuda:0', grad_fn=<NllLossBackward>)



 37%|███▋      | 129/346 [11:32<19:23,  5.36s/it][A

loss: tensor(0.9662, device='cuda:0', grad_fn=<NllLossBackward>)



 38%|███▊      | 130/346 [11:37<19:21,  5.38s/it][A

loss: tensor(0.9049, device='cuda:0', grad_fn=<NllLossBackward>)



 38%|███▊      | 131/346 [11:43<19:15,  5.37s/it][A

loss: tensor(0.9491, device='cuda:0', grad_fn=<NllLossBackward>)



 38%|███▊      | 132/346 [11:48<19:10,  5.37s/it][A

loss: tensor(0.9630, device='cuda:0', grad_fn=<NllLossBackward>)



 38%|███▊      | 133/346 [11:53<19:02,  5.37s/it][A

loss: tensor(0.9148, device='cuda:0', grad_fn=<NllLossBackward>)



 39%|███▊      | 134/346 [11:59<19:01,  5.39s/it][A

loss: tensor(0.9115, device='cuda:0', grad_fn=<NllLossBackward>)



 39%|███▉      | 135/346 [12:04<18:26,  5.25s/it][A

loss: tensor(0.9313, device='cuda:0', grad_fn=<NllLossBackward>)



 39%|███▉      | 136/346 [12:09<18:22,  5.25s/it][A

loss: tensor(0.9527, device='cuda:0', grad_fn=<NllLossBackward>)



 40%|███▉      | 137/346 [12:14<18:24,  5.29s/it][A

loss: tensor(0.9289, device='cuda:0', grad_fn=<NllLossBackward>)



 40%|███▉      | 138/346 [12:20<18:28,  5.33s/it][A

loss: tensor(0.9525, device='cuda:0', grad_fn=<NllLossBackward>)



 40%|████      | 139/346 [12:25<18:26,  5.34s/it][A

loss: tensor(0.9513, device='cuda:0', grad_fn=<NllLossBackward>)



 40%|████      | 140/346 [12:31<18:23,  5.36s/it][A

loss: tensor(0.9049, device='cuda:0', grad_fn=<NllLossBackward>)



 41%|████      | 141/346 [12:36<18:20,  5.37s/it][A

loss: tensor(1.0107, device='cuda:0', grad_fn=<NllLossBackward>)



 41%|████      | 142/346 [12:42<18:22,  5.40s/it][A

loss: tensor(0.9175, device='cuda:0', grad_fn=<NllLossBackward>)



 41%|████▏     | 143/346 [12:46<17:38,  5.21s/it][A

loss: tensor(0.9406, device='cuda:0', grad_fn=<NllLossBackward>)



 42%|████▏     | 144/346 [12:51<17:08,  5.09s/it][A

loss: tensor(0.9049, device='cuda:0', grad_fn=<NllLossBackward>)



 42%|████▏     | 145/346 [12:56<16:46,  5.01s/it][A

loss: tensor(0.9049, device='cuda:0', grad_fn=<NllLossBackward>)



 42%|████▏     | 146/346 [13:01<16:30,  4.95s/it][A

loss: tensor(0.9649, device='cuda:0', grad_fn=<NllLossBackward>)



 42%|████▏     | 147/346 [13:06<16:18,  4.92s/it][A

loss: tensor(0.9049, device='cuda:0', grad_fn=<NllLossBackward>)



 43%|████▎     | 148/346 [13:10<16:08,  4.89s/it][A

loss: tensor(0.9145, device='cuda:0', grad_fn=<NllLossBackward>)



 43%|████▎     | 149/346 [13:16<16:38,  5.07s/it][A

loss: tensor(0.9567, device='cuda:0', grad_fn=<NllLossBackward>)



 43%|████▎     | 150/346 [13:21<16:53,  5.17s/it][A

loss: tensor(0.9349, device='cuda:0', grad_fn=<NllLossBackward>)



 44%|████▎     | 151/346 [13:27<17:00,  5.23s/it][A

loss: tensor(0.9049, device='cuda:0', grad_fn=<NllLossBackward>)



 44%|████▍     | 152/346 [13:31<16:28,  5.10s/it][A

loss: tensor(0.9320, device='cuda:0', grad_fn=<NllLossBackward>)



 44%|████▍     | 153/346 [13:36<16:07,  5.01s/it][A

loss: tensor(0.9562, device='cuda:0', grad_fn=<NllLossBackward>)



 45%|████▍     | 154/346 [13:41<15:50,  4.95s/it][A

loss: tensor(0.9527, device='cuda:0', grad_fn=<NllLossBackward>)



 45%|████▍     | 155/346 [13:46<15:38,  4.92s/it][A

loss: tensor(0.9480, device='cuda:0', grad_fn=<NllLossBackward>)



 45%|████▌     | 156/346 [13:51<15:28,  4.89s/it][A

loss: tensor(0.9344, device='cuda:0', grad_fn=<NllLossBackward>)



 45%|████▌     | 157/346 [13:56<15:22,  4.88s/it][A

loss: tensor(0.9979, device='cuda:0', grad_fn=<NllLossBackward>)



 46%|████▌     | 158/346 [14:00<15:13,  4.86s/it][A

loss: tensor(0.9404, device='cuda:0', grad_fn=<NllLossBackward>)



 46%|████▌     | 159/346 [14:05<15:06,  4.85s/it][A

loss: tensor(0.9221, device='cuda:0', grad_fn=<NllLossBackward>)



 46%|████▌     | 160/346 [14:11<15:26,  4.98s/it][A

loss: tensor(0.9199, device='cuda:0', grad_fn=<NllLossBackward>)



 47%|████▋     | 161/346 [14:16<15:46,  5.12s/it][A

loss: tensor(0.9049, device='cuda:0', grad_fn=<NllLossBackward>)



 47%|████▋     | 162/346 [14:21<15:55,  5.20s/it][A

loss: tensor(0.9049, device='cuda:0', grad_fn=<NllLossBackward>)



 47%|████▋     | 163/346 [14:26<15:29,  5.08s/it][A

loss: tensor(0.9278, device='cuda:0', grad_fn=<NllLossBackward>)



 47%|████▋     | 164/346 [14:31<15:10,  5.00s/it][A

loss: tensor(0.9330, device='cuda:0', grad_fn=<NllLossBackward>)



 48%|████▊     | 165/346 [14:36<14:59,  4.97s/it][A

loss: tensor(0.9737, device='cuda:0', grad_fn=<NllLossBackward>)



 48%|████▊     | 166/346 [14:41<14:46,  4.93s/it][A

loss: tensor(0.9203, device='cuda:0', grad_fn=<NllLossBackward>)



 48%|████▊     | 167/346 [14:46<14:35,  4.89s/it][A

loss: tensor(0.9388, device='cuda:0', grad_fn=<NllLossBackward>)



 49%|████▊     | 168/346 [14:50<14:30,  4.89s/it][A

loss: tensor(0.9296, device='cuda:0', grad_fn=<NllLossBackward>)



 49%|████▉     | 169/346 [14:55<14:21,  4.86s/it][A

loss: tensor(0.9433, device='cuda:0', grad_fn=<NllLossBackward>)



 49%|████▉     | 170/346 [15:00<14:38,  4.99s/it][A

loss: tensor(0.9980, device='cuda:0', grad_fn=<NllLossBackward>)



 49%|████▉     | 171/346 [15:06<14:53,  5.11s/it][A

loss: tensor(0.9049, device='cuda:0', grad_fn=<NllLossBackward>)



 50%|████▉     | 172/346 [15:11<15:05,  5.21s/it][A

loss: tensor(0.9133, device='cuda:0', grad_fn=<NllLossBackward>)



 50%|█████     | 173/346 [15:17<15:09,  5.26s/it][A

loss: tensor(0.9734, device='cuda:0', grad_fn=<NllLossBackward>)



 50%|█████     | 174/346 [15:22<15:11,  5.30s/it][A

loss: tensor(0.9516, device='cuda:0', grad_fn=<NllLossBackward>)



 51%|█████     | 175/346 [15:27<15:08,  5.31s/it][A

loss: tensor(0.9568, device='cuda:0', grad_fn=<NllLossBackward>)



 51%|█████     | 176/346 [15:33<15:09,  5.35s/it][A

loss: tensor(0.9578, device='cuda:0', grad_fn=<NllLossBackward>)



 51%|█████     | 177/346 [15:38<15:05,  5.36s/it][A

loss: tensor(0.9138, device='cuda:0', grad_fn=<NllLossBackward>)



 51%|█████▏    | 178/346 [15:44<14:59,  5.36s/it][A

loss: tensor(0.9049, device='cuda:0', grad_fn=<NllLossBackward>)



 52%|█████▏    | 179/346 [15:49<14:57,  5.37s/it][A

loss: tensor(0.9818, device='cuda:0', grad_fn=<NllLossBackward>)



 52%|█████▏    | 180/346 [15:54<14:55,  5.39s/it][A

loss: tensor(0.9744, device='cuda:0', grad_fn=<NllLossBackward>)



 52%|█████▏    | 181/346 [16:00<14:48,  5.39s/it][A

loss: tensor(0.9279, device='cuda:0', grad_fn=<NllLossBackward>)



 53%|█████▎    | 182/346 [16:05<14:20,  5.25s/it][A

loss: tensor(0.9215, device='cuda:0', grad_fn=<NllLossBackward>)



 53%|█████▎    | 183/346 [16:10<14:21,  5.28s/it][A

loss: tensor(0.9662, device='cuda:0', grad_fn=<NllLossBackward>)



 53%|█████▎    | 184/346 [16:15<14:22,  5.32s/it][A

loss: tensor(0.9049, device='cuda:0', grad_fn=<NllLossBackward>)



 53%|█████▎    | 185/346 [16:20<13:58,  5.21s/it][A

loss: tensor(0.9741, device='cuda:0', grad_fn=<NllLossBackward>)



 54%|█████▍    | 186/346 [16:25<13:41,  5.14s/it][A

loss: tensor(0.9774, device='cuda:0', grad_fn=<NllLossBackward>)



 54%|█████▍    | 187/346 [16:31<13:39,  5.16s/it][A

loss: tensor(0.9338, device='cuda:0', grad_fn=<NllLossBackward>)



 54%|█████▍    | 188/346 [16:36<13:47,  5.24s/it][A

loss: tensor(0.9897, device='cuda:0', grad_fn=<NllLossBackward>)



 55%|█████▍    | 189/346 [16:41<13:47,  5.27s/it][A

loss: tensor(0.9226, device='cuda:0', grad_fn=<NllLossBackward>)



 55%|█████▍    | 190/346 [16:47<13:45,  5.29s/it][A

loss: tensor(0.9291, device='cuda:0', grad_fn=<NllLossBackward>)



 55%|█████▌    | 191/346 [16:52<13:44,  5.32s/it][A

loss: tensor(0.9290, device='cuda:0', grad_fn=<NllLossBackward>)



 55%|█████▌    | 192/346 [16:58<13:44,  5.35s/it][A

loss: tensor(0.9656, device='cuda:0', grad_fn=<NllLossBackward>)



 56%|█████▌    | 193/346 [17:03<13:38,  5.35s/it][A

loss: tensor(0.9049, device='cuda:0', grad_fn=<NllLossBackward>)



 56%|█████▌    | 194/346 [17:08<13:34,  5.36s/it][A

loss: tensor(0.9554, device='cuda:0', grad_fn=<NllLossBackward>)



 56%|█████▋    | 195/346 [17:14<13:31,  5.38s/it][A

loss: tensor(0.9049, device='cuda:0', grad_fn=<NllLossBackward>)



 57%|█████▋    | 196/346 [17:19<13:24,  5.36s/it][A

loss: tensor(0.9761, device='cuda:0', grad_fn=<NllLossBackward>)



 57%|█████▋    | 197/346 [17:24<13:18,  5.36s/it][A

loss: tensor(0.9049, device='cuda:0', grad_fn=<NllLossBackward>)



 57%|█████▋    | 198/346 [17:30<13:14,  5.37s/it][A

loss: tensor(0.9363, device='cuda:0', grad_fn=<NllLossBackward>)



 58%|█████▊    | 199/346 [17:35<13:13,  5.40s/it][A

loss: tensor(1.0030, device='cuda:0', grad_fn=<NllLossBackward>)



 58%|█████▊    | 200/346 [17:41<13:07,  5.39s/it][A

loss: tensor(0.9049, device='cuda:0', grad_fn=<NllLossBackward>)



 58%|█████▊    | 201/346 [17:46<12:59,  5.38s/it][A

loss: tensor(0.9312, device='cuda:0', grad_fn=<NllLossBackward>)



 58%|█████▊    | 202/346 [17:51<12:53,  5.37s/it][A

loss: tensor(0.9656, device='cuda:0', grad_fn=<NllLossBackward>)



 59%|█████▊    | 203/346 [17:57<12:52,  5.40s/it][A

loss: tensor(0.9551, device='cuda:0', grad_fn=<NllLossBackward>)



 59%|█████▉    | 204/346 [18:02<12:46,  5.40s/it][A

loss: tensor(0.9411, device='cuda:0', grad_fn=<NllLossBackward>)



 59%|█████▉    | 205/346 [18:08<12:41,  5.40s/it][A

loss: tensor(0.9920, device='cuda:0', grad_fn=<NllLossBackward>)



 60%|█████▉    | 206/346 [18:13<12:34,  5.39s/it][A

loss: tensor(0.9386, device='cuda:0', grad_fn=<NllLossBackward>)



 60%|█████▉    | 207/346 [18:18<12:31,  5.41s/it][A

loss: tensor(0.9156, device='cuda:0', grad_fn=<NllLossBackward>)



 60%|██████    | 208/346 [18:24<12:25,  5.40s/it][A

loss: tensor(0.9277, device='cuda:0', grad_fn=<NllLossBackward>)



 60%|██████    | 209/346 [18:29<12:17,  5.39s/it][A

loss: tensor(0.9661, device='cuda:0', grad_fn=<NllLossBackward>)



 61%|██████    | 210/346 [18:34<12:12,  5.39s/it][A

loss: tensor(0.9878, device='cuda:0', grad_fn=<NllLossBackward>)



 61%|██████    | 211/346 [18:40<12:09,  5.41s/it][A

loss: tensor(0.9435, device='cuda:0', grad_fn=<NllLossBackward>)



 61%|██████▏   | 212/346 [18:45<12:00,  5.38s/it][A

loss: tensor(0.9190, device='cuda:0', grad_fn=<NllLossBackward>)



 62%|██████▏   | 213/346 [18:51<11:54,  5.37s/it][A

loss: tensor(0.9252, device='cuda:0', grad_fn=<NllLossBackward>)



 62%|██████▏   | 214/346 [18:56<11:52,  5.40s/it][A

loss: tensor(0.9704, device='cuda:0', grad_fn=<NllLossBackward>)



 62%|██████▏   | 215/346 [19:01<11:46,  5.39s/it][A

loss: tensor(0.9665, device='cuda:0', grad_fn=<NllLossBackward>)



 62%|██████▏   | 216/346 [19:07<11:39,  5.38s/it][A

loss: tensor(0.9885, device='cuda:0', grad_fn=<NllLossBackward>)



 63%|██████▎   | 217/346 [19:12<11:32,  5.37s/it][A

loss: tensor(0.9138, device='cuda:0', grad_fn=<NllLossBackward>)



 63%|██████▎   | 218/346 [19:18<11:29,  5.38s/it][A

loss: tensor(0.9682, device='cuda:0', grad_fn=<NllLossBackward>)



 63%|██████▎   | 219/346 [19:23<11:21,  5.37s/it][A

loss: tensor(0.9481, device='cuda:0', grad_fn=<NllLossBackward>)



 64%|██████▎   | 220/346 [19:28<11:17,  5.37s/it][A

loss: tensor(0.9297, device='cuda:0', grad_fn=<NllLossBackward>)



 64%|██████▍   | 221/346 [19:34<11:12,  5.38s/it][A

loss: tensor(0.9209, device='cuda:0', grad_fn=<NllLossBackward>)



 64%|██████▍   | 222/346 [19:39<11:08,  5.39s/it][A

loss: tensor(0.9321, device='cuda:0', grad_fn=<NllLossBackward>)



 64%|██████▍   | 223/346 [19:44<11:01,  5.38s/it][A

loss: tensor(0.9049, device='cuda:0', grad_fn=<NllLossBackward>)



 65%|██████▍   | 224/346 [19:50<10:54,  5.37s/it][A

loss: tensor(0.9849, device='cuda:0', grad_fn=<NllLossBackward>)



 65%|██████▌   | 225/346 [19:55<10:50,  5.38s/it][A

loss: tensor(0.9179, device='cuda:0', grad_fn=<NllLossBackward>)



 65%|██████▌   | 226/346 [20:01<10:47,  5.39s/it][A

loss: tensor(0.9284, device='cuda:0', grad_fn=<NllLossBackward>)



 66%|██████▌   | 227/346 [20:06<10:40,  5.38s/it][A

loss: tensor(0.9049, device='cuda:0', grad_fn=<NllLossBackward>)



 66%|██████▌   | 228/346 [20:11<10:33,  5.37s/it][A

loss: tensor(0.9154, device='cuda:0', grad_fn=<NllLossBackward>)



 66%|██████▌   | 229/346 [20:17<10:26,  5.36s/it][A

loss: tensor(0.9439, device='cuda:0', grad_fn=<NllLossBackward>)



 66%|██████▋   | 230/346 [20:22<10:23,  5.37s/it][A

loss: tensor(0.9319, device='cuda:0', grad_fn=<NllLossBackward>)



 67%|██████▋   | 231/346 [20:27<10:16,  5.36s/it][A

loss: tensor(0.9567, device='cuda:0', grad_fn=<NllLossBackward>)



 67%|██████▋   | 232/346 [20:33<10:11,  5.37s/it][A

loss: tensor(0.9851, device='cuda:0', grad_fn=<NllLossBackward>)



 67%|██████▋   | 233/346 [20:38<10:06,  5.37s/it][A

loss: tensor(0.9392, device='cuda:0', grad_fn=<NllLossBackward>)



 68%|██████▊   | 234/346 [20:44<10:01,  5.37s/it][A

loss: tensor(0.9334, device='cuda:0', grad_fn=<NllLossBackward>)



 68%|██████▊   | 235/346 [20:49<09:57,  5.38s/it][A

loss: tensor(0.9388, device='cuda:0', grad_fn=<NllLossBackward>)



 68%|██████▊   | 236/346 [20:54<09:50,  5.37s/it][A

loss: tensor(0.9223, device='cuda:0', grad_fn=<NllLossBackward>)



 68%|██████▊   | 237/346 [21:00<09:47,  5.39s/it][A

loss: tensor(0.9467, device='cuda:0', grad_fn=<NllLossBackward>)



 69%|██████▉   | 238/346 [21:05<09:41,  5.38s/it][A

loss: tensor(0.9234, device='cuda:0', grad_fn=<NllLossBackward>)



 69%|██████▉   | 239/346 [21:10<09:35,  5.38s/it][A

loss: tensor(0.9224, device='cuda:0', grad_fn=<NllLossBackward>)



 69%|██████▉   | 240/346 [21:16<09:30,  5.39s/it][A

loss: tensor(0.9352, device='cuda:0', grad_fn=<NllLossBackward>)



 70%|██████▉   | 241/346 [21:21<09:25,  5.39s/it][A

loss: tensor(0.9828, device='cuda:0', grad_fn=<NllLossBackward>)



 70%|██████▉   | 242/346 [21:27<09:20,  5.39s/it][A

loss: tensor(0.9657, device='cuda:0', grad_fn=<NllLossBackward>)



 70%|███████   | 243/346 [21:32<09:14,  5.38s/it][A

loss: tensor(0.9476, device='cuda:0', grad_fn=<NllLossBackward>)



 71%|███████   | 244/346 [21:37<09:07,  5.37s/it][A

loss: tensor(0.9570, device='cuda:0', grad_fn=<NllLossBackward>)



 71%|███████   | 245/346 [21:43<09:04,  5.39s/it][A

loss: tensor(0.9049, device='cuda:0', grad_fn=<NllLossBackward>)



 71%|███████   | 246/346 [21:48<08:58,  5.39s/it][A

loss: tensor(0.9943, device='cuda:0', grad_fn=<NllLossBackward>)



 71%|███████▏  | 247/346 [21:54<08:52,  5.38s/it][A

loss: tensor(0.9049, device='cuda:0', grad_fn=<NllLossBackward>)



 72%|███████▏  | 248/346 [21:59<08:46,  5.37s/it][A

loss: tensor(0.9240, device='cuda:0', grad_fn=<NllLossBackward>)



 72%|███████▏  | 249/346 [22:04<08:42,  5.39s/it][A

loss: tensor(0.9391, device='cuda:0', grad_fn=<NllLossBackward>)



 72%|███████▏  | 250/346 [22:10<08:35,  5.37s/it][A

loss: tensor(0.9742, device='cuda:0', grad_fn=<NllLossBackward>)



 73%|███████▎  | 251/346 [22:15<08:28,  5.35s/it][A

loss: tensor(0.9144, device='cuda:0', grad_fn=<NllLossBackward>)



 73%|███████▎  | 252/346 [22:20<08:22,  5.35s/it][A

loss: tensor(0.9319, device='cuda:0', grad_fn=<NllLossBackward>)



 73%|███████▎  | 253/346 [22:26<08:19,  5.37s/it][A

loss: tensor(0.9239, device='cuda:0', grad_fn=<NllLossBackward>)



 73%|███████▎  | 254/346 [22:31<08:13,  5.36s/it][A

loss: tensor(0.9383, device='cuda:0', grad_fn=<NllLossBackward>)



 74%|███████▎  | 255/346 [22:36<08:07,  5.35s/it][A

loss: tensor(0.9226, device='cuda:0', grad_fn=<NllLossBackward>)



 74%|███████▍  | 256/346 [22:42<08:02,  5.36s/it][A

loss: tensor(0.9455, device='cuda:0', grad_fn=<NllLossBackward>)



 74%|███████▍  | 257/346 [22:47<07:58,  5.38s/it][A

loss: tensor(0.9196, device='cuda:0', grad_fn=<NllLossBackward>)



 75%|███████▍  | 258/346 [22:53<07:52,  5.37s/it][A

loss: tensor(0.9554, device='cuda:0', grad_fn=<NllLossBackward>)



 75%|███████▍  | 259/346 [22:58<07:46,  5.36s/it][A

loss: tensor(0.9556, device='cuda:0', grad_fn=<NllLossBackward>)



 75%|███████▌  | 260/346 [23:03<07:40,  5.36s/it][A

loss: tensor(0.9179, device='cuda:0', grad_fn=<NllLossBackward>)



 75%|███████▌  | 261/346 [23:09<07:37,  5.39s/it][A

loss: tensor(0.9049, device='cuda:0', grad_fn=<NllLossBackward>)



 76%|███████▌  | 262/346 [23:14<07:32,  5.38s/it][A

loss: tensor(0.9404, device='cuda:0', grad_fn=<NllLossBackward>)



 76%|███████▌  | 263/346 [23:19<07:26,  5.37s/it][A

loss: tensor(0.9193, device='cuda:0', grad_fn=<NllLossBackward>)



 76%|███████▋  | 264/346 [23:25<07:21,  5.38s/it][A

loss: tensor(0.9702, device='cuda:0', grad_fn=<NllLossBackward>)



 77%|███████▋  | 265/346 [23:30<07:15,  5.37s/it][A

loss: tensor(0.9798, device='cuda:0', grad_fn=<NllLossBackward>)



 77%|███████▋  | 266/346 [23:35<07:09,  5.37s/it][A

loss: tensor(0.9752, device='cuda:0', grad_fn=<NllLossBackward>)



 77%|███████▋  | 267/346 [23:41<07:03,  5.36s/it][A

loss: tensor(0.9464, device='cuda:0', grad_fn=<NllLossBackward>)



 77%|███████▋  | 268/346 [23:46<06:59,  5.38s/it][A

loss: tensor(1.0049, device='cuda:0', grad_fn=<NllLossBackward>)



 78%|███████▊  | 269/346 [23:52<06:53,  5.37s/it][A

loss: tensor(0.9049, device='cuda:0', grad_fn=<NllLossBackward>)



 78%|███████▊  | 270/346 [23:57<06:47,  5.36s/it][A

loss: tensor(0.9894, device='cuda:0', grad_fn=<NllLossBackward>)



 78%|███████▊  | 271/346 [24:02<06:41,  5.35s/it][A

loss: tensor(0.9676, device='cuda:0', grad_fn=<NllLossBackward>)



 79%|███████▊  | 272/346 [24:08<06:37,  5.37s/it][A

loss: tensor(0.9720, device='cuda:0', grad_fn=<NllLossBackward>)



 79%|███████▉  | 273/346 [24:13<06:31,  5.36s/it][A

loss: tensor(0.9289, device='cuda:0', grad_fn=<NllLossBackward>)



 79%|███████▉  | 274/346 [24:18<06:25,  5.35s/it][A

loss: tensor(0.9358, device='cuda:0', grad_fn=<NllLossBackward>)



 79%|███████▉  | 275/346 [24:24<06:19,  5.35s/it][A

loss: tensor(0.9671, device='cuda:0', grad_fn=<NllLossBackward>)



 80%|███████▉  | 276/346 [24:29<06:15,  5.36s/it][A

loss: tensor(0.9483, device='cuda:0', grad_fn=<NllLossBackward>)



 80%|████████  | 277/346 [24:34<06:09,  5.36s/it][A

loss: tensor(0.9417, device='cuda:0', grad_fn=<NllLossBackward>)



 80%|████████  | 278/346 [24:40<06:04,  5.36s/it][A

loss: tensor(0.9197, device='cuda:0', grad_fn=<NllLossBackward>)



 81%|████████  | 279/346 [24:45<05:59,  5.36s/it][A

loss: tensor(1.0274, device='cuda:0', grad_fn=<NllLossBackward>)



 81%|████████  | 280/346 [24:51<05:54,  5.38s/it][A

loss: tensor(0.9987, device='cuda:0', grad_fn=<NllLossBackward>)



 81%|████████  | 281/346 [24:56<05:49,  5.37s/it][A

loss: tensor(0.9049, device='cuda:0', grad_fn=<NllLossBackward>)



 82%|████████▏ | 282/346 [25:01<05:43,  5.37s/it][A

loss: tensor(0.9118, device='cuda:0', grad_fn=<NllLossBackward>)



 82%|████████▏ | 283/346 [25:07<05:39,  5.39s/it][A

loss: tensor(0.9873, device='cuda:0', grad_fn=<NllLossBackward>)



 82%|████████▏ | 284/346 [25:12<05:32,  5.37s/it][A

loss: tensor(0.9201, device='cuda:0', grad_fn=<NllLossBackward>)



 82%|████████▏ | 285/346 [25:17<05:27,  5.37s/it][A

loss: tensor(0.9168, device='cuda:0', grad_fn=<NllLossBackward>)



 83%|████████▎ | 286/346 [25:23<05:21,  5.36s/it][A

loss: tensor(0.9176, device='cuda:0', grad_fn=<NllLossBackward>)



 83%|████████▎ | 287/346 [25:28<05:17,  5.38s/it][A

loss: tensor(0.9499, device='cuda:0', grad_fn=<NllLossBackward>)



 83%|████████▎ | 288/346 [25:34<05:11,  5.37s/it][A

loss: tensor(0.9995, device='cuda:0', grad_fn=<NllLossBackward>)



 84%|████████▎ | 289/346 [25:39<05:06,  5.38s/it][A

loss: tensor(0.9800, device='cuda:0', grad_fn=<NllLossBackward>)



 84%|████████▍ | 290/346 [25:44<05:00,  5.37s/it][A

loss: tensor(0.9049, device='cuda:0', grad_fn=<NllLossBackward>)



 84%|████████▍ | 291/346 [25:50<04:56,  5.39s/it][A

loss: tensor(0.9333, device='cuda:0', grad_fn=<NllLossBackward>)



 84%|████████▍ | 292/346 [25:55<04:50,  5.38s/it][A

loss: tensor(0.9273, device='cuda:0', grad_fn=<NllLossBackward>)



 85%|████████▍ | 293/346 [26:00<04:44,  5.37s/it][A

loss: tensor(0.9438, device='cuda:0', grad_fn=<NllLossBackward>)



 85%|████████▍ | 294/346 [26:06<04:39,  5.37s/it][A

loss: tensor(0.9447, device='cuda:0', grad_fn=<NllLossBackward>)



 85%|████████▌ | 295/346 [26:11<04:34,  5.39s/it][A

loss: tensor(0.9499, device='cuda:0', grad_fn=<NllLossBackward>)



 86%|████████▌ | 296/346 [26:17<04:28,  5.36s/it][A

loss: tensor(0.9361, device='cuda:0', grad_fn=<NllLossBackward>)



 86%|████████▌ | 297/346 [26:22<04:22,  5.35s/it][A

loss: tensor(0.9343, device='cuda:0', grad_fn=<NllLossBackward>)



 86%|████████▌ | 298/346 [26:27<04:16,  5.35s/it][A

loss: tensor(0.9634, device='cuda:0', grad_fn=<NllLossBackward>)



 86%|████████▋ | 299/346 [26:33<04:12,  5.37s/it][A

loss: tensor(0.9172, device='cuda:0', grad_fn=<NllLossBackward>)



 87%|████████▋ | 300/346 [26:38<04:06,  5.36s/it][A

loss: tensor(0.9726, device='cuda:0', grad_fn=<NllLossBackward>)



 87%|████████▋ | 301/346 [26:43<04:00,  5.35s/it][A

loss: tensor(0.9825, device='cuda:0', grad_fn=<NllLossBackward>)



 87%|████████▋ | 302/346 [26:49<03:55,  5.35s/it][A

loss: tensor(0.9258, device='cuda:0', grad_fn=<NllLossBackward>)



 88%|████████▊ | 303/346 [26:54<03:50,  5.37s/it][A

loss: tensor(0.9317, device='cuda:0', grad_fn=<NllLossBackward>)



 88%|████████▊ | 304/346 [26:59<03:45,  5.36s/it][A

loss: tensor(0.9292, device='cuda:0', grad_fn=<NllLossBackward>)



 88%|████████▊ | 305/346 [27:05<03:40,  5.37s/it][A

loss: tensor(0.9049, device='cuda:0', grad_fn=<NllLossBackward>)



 88%|████████▊ | 306/346 [27:10<03:35,  5.38s/it][A

loss: tensor(0.9476, device='cuda:0', grad_fn=<NllLossBackward>)



 89%|████████▊ | 307/346 [27:16<03:30,  5.40s/it][A

loss: tensor(0.9527, device='cuda:0', grad_fn=<NllLossBackward>)



 89%|████████▉ | 308/346 [27:21<03:25,  5.41s/it][A

loss: tensor(0.9391, device='cuda:0', grad_fn=<NllLossBackward>)



 89%|████████▉ | 309/346 [27:26<03:20,  5.41s/it][A

loss: tensor(0.9364, device='cuda:0', grad_fn=<NllLossBackward>)



 90%|████████▉ | 310/346 [27:32<03:15,  5.42s/it][A

loss: tensor(0.9263, device='cuda:0', grad_fn=<NllLossBackward>)



 90%|████████▉ | 311/346 [27:37<03:09,  5.41s/it][A

loss: tensor(0.9244, device='cuda:0', grad_fn=<NllLossBackward>)



 90%|█████████ | 312/346 [27:43<03:03,  5.40s/it][A

loss: tensor(0.9535, device='cuda:0', grad_fn=<NllLossBackward>)



 90%|█████████ | 313/346 [27:48<02:57,  5.39s/it][A

loss: tensor(0.9049, device='cuda:0', grad_fn=<NllLossBackward>)



 91%|█████████ | 314/346 [27:54<02:52,  5.41s/it][A

loss: tensor(0.9049, device='cuda:0', grad_fn=<NllLossBackward>)



 91%|█████████ | 315/346 [27:59<02:47,  5.39s/it][A

loss: tensor(0.9363, device='cuda:0', grad_fn=<NllLossBackward>)



 91%|█████████▏| 316/346 [28:04<02:41,  5.38s/it][A

loss: tensor(0.9355, device='cuda:0', grad_fn=<NllLossBackward>)



 92%|█████████▏| 317/346 [28:10<02:36,  5.38s/it][A

loss: tensor(0.9455, device='cuda:0', grad_fn=<NllLossBackward>)



 92%|█████████▏| 318/346 [28:15<02:31,  5.40s/it][A

loss: tensor(0.9225, device='cuda:0', grad_fn=<NllLossBackward>)



 92%|█████████▏| 319/346 [28:20<02:25,  5.38s/it][A

loss: tensor(0.9195, device='cuda:0', grad_fn=<NllLossBackward>)



 92%|█████████▏| 320/346 [28:26<02:19,  5.37s/it][A

loss: tensor(0.9519, device='cuda:0', grad_fn=<NllLossBackward>)



 93%|█████████▎| 321/346 [28:31<02:14,  5.37s/it][A

loss: tensor(1.0195, device='cuda:0', grad_fn=<NllLossBackward>)



 93%|█████████▎| 322/346 [28:37<02:09,  5.39s/it][A

loss: tensor(0.9517, device='cuda:0', grad_fn=<NllLossBackward>)



 93%|█████████▎| 323/346 [28:42<02:03,  5.38s/it][A

loss: tensor(1.0198, device='cuda:0', grad_fn=<NllLossBackward>)



 94%|█████████▎| 324/346 [28:47<01:58,  5.36s/it][A

loss: tensor(0.9755, device='cuda:0', grad_fn=<NllLossBackward>)



 94%|█████████▍| 325/346 [28:53<01:52,  5.36s/it][A

loss: tensor(0.9674, device='cuda:0', grad_fn=<NllLossBackward>)



 94%|█████████▍| 326/346 [28:58<01:47,  5.40s/it][A

loss: tensor(0.9296, device='cuda:0', grad_fn=<NllLossBackward>)



 95%|█████████▍| 327/346 [29:03<01:42,  5.40s/it][A

loss: tensor(0.9136, device='cuda:0', grad_fn=<NllLossBackward>)



 95%|█████████▍| 328/346 [29:09<01:36,  5.39s/it][A

loss: tensor(0.9349, device='cuda:0', grad_fn=<NllLossBackward>)



 95%|█████████▌| 329/346 [29:14<01:31,  5.40s/it][A

loss: tensor(0.9367, device='cuda:0', grad_fn=<NllLossBackward>)



 95%|█████████▌| 330/346 [29:20<01:26,  5.40s/it][A

loss: tensor(0.9702, device='cuda:0', grad_fn=<NllLossBackward>)



 96%|█████████▌| 331/346 [29:25<01:21,  5.41s/it][A

loss: tensor(0.9191, device='cuda:0', grad_fn=<NllLossBackward>)



 96%|█████████▌| 332/346 [29:30<01:15,  5.40s/it][A

loss: tensor(0.9287, device='cuda:0', grad_fn=<NllLossBackward>)



 96%|█████████▌| 333/346 [29:36<01:10,  5.41s/it][A

loss: tensor(0.9747, device='cuda:0', grad_fn=<NllLossBackward>)



 97%|█████████▋| 334/346 [29:41<01:04,  5.40s/it][A

loss: tensor(0.9763, device='cuda:0', grad_fn=<NllLossBackward>)



 97%|█████████▋| 335/346 [29:47<00:59,  5.39s/it][A

loss: tensor(0.9180, device='cuda:0', grad_fn=<NllLossBackward>)



 97%|█████████▋| 336/346 [29:52<00:53,  5.39s/it][A

loss: tensor(0.9835, device='cuda:0', grad_fn=<NllLossBackward>)



 97%|█████████▋| 337/346 [29:57<00:48,  5.40s/it][A

loss: tensor(0.9557, device='cuda:0', grad_fn=<NllLossBackward>)



 98%|█████████▊| 338/346 [30:03<00:43,  5.39s/it][A

loss: tensor(0.9350, device='cuda:0', grad_fn=<NllLossBackward>)



 98%|█████████▊| 339/346 [30:08<00:37,  5.38s/it][A

loss: tensor(0.9314, device='cuda:0', grad_fn=<NllLossBackward>)



 98%|█████████▊| 340/346 [30:14<00:32,  5.38s/it][A

loss: tensor(0.9942, device='cuda:0', grad_fn=<NllLossBackward>)



 99%|█████████▊| 341/346 [30:19<00:26,  5.39s/it][A

loss: tensor(0.9529, device='cuda:0', grad_fn=<NllLossBackward>)



 99%|█████████▉| 342/346 [30:24<00:21,  5.37s/it][A

loss: tensor(0.9412, device='cuda:0', grad_fn=<NllLossBackward>)



 99%|█████████▉| 343/346 [30:30<00:16,  5.37s/it][A

loss: tensor(0.9358, device='cuda:0', grad_fn=<NllLossBackward>)



 99%|█████████▉| 344/346 [30:35<00:10,  5.38s/it][A

loss: tensor(0.9049, device='cuda:0', grad_fn=<NllLossBackward>)



100%|█████████▉| 345/346 [30:41<00:05,  5.39s/it][A

loss: tensor(0.9268, device='cuda:0', grad_fn=<NllLossBackward>)



100%|██████████| 346/346 [30:41<00:00,  5.32s/it][A

  0%|          | 0/173 [00:00<?, ?it/s][A

loss: tensor(0.9049, device='cuda:0', grad_fn=<NllLossBackward>)

	Training Loss: 0.9417613017765772

	Training acc: 0.9631105379429511

	Training prec: 0.9282566475246357

	Training rec: 0.9631105379429511

	Training f1: 0.945192458201801

	Current Learning rate:  1.5e-05



  1%|          | 1/173 [00:00<02:02,  1.41it/s][A
  1%|          | 2/173 [00:01<01:54,  1.49it/s][A
  2%|▏         | 3/173 [00:02<01:56,  1.45it/s][A
  2%|▏         | 4/173 [00:02<01:57,  1.43it/s][A
  3%|▎         | 5/173 [00:03<01:53,  1.47it/s][A
  3%|▎         | 6/173 [00:04<01:54,  1.45it/s][A
  4%|▍         | 7/173 [00:04<01:55,  1.44it/s][A
  5%|▍         | 8/173 [00:05<01:51,  1.47it/s][A
  5%|▌         | 9/173 [00:06<01:52,  1.46it/s][A
  6%|▌         | 10/173 [00:06<01:53,  1.43it/s][A
  6%|▋         | 11/173 [00:07<01:50,  1.47it/s][A
  7%|▋         | 12/173 [00:08<01:50,  1.45it/s][A
  8%|▊         | 13/173 [00:08<01:51,  1.44it/s][A
  8%|▊         | 14/173 [00:09<01:48,  1.46it/s][A
  9%|▊         | 15/173 [00:10<01:49,  1.44it/s][A
  9%|▉         | 16/173 [00:11<01:50,  1.42it/s][A
 10%|▉         | 17/173 [00:11<01:46,  1.46it/s][A
 10%|█         | 18/173 [00:12<01:47,  1.44it/s][A
 11%|█         | 19/173 [00:13<01:47,  1.43it/s][A
 12%|█▏        | 20/


	Validation Loss: 0.9453922889136166

	Validation acc: 0.9594461994558443

	Validation prec: 0.922434374769588

	Validation rec: 0.9594461994558443

	Validation f1: 0.9401095646572624



  0%|          | 1/346 [00:05<29:49,  5.19s/it][A

loss: tensor(0.9278, device='cuda:0', grad_fn=<NllLossBackward>)



  1%|          | 2/346 [00:10<30:06,  5.25s/it][A

loss: tensor(0.9145, device='cuda:0', grad_fn=<NllLossBackward>)



  1%|          | 3/346 [00:15<29:53,  5.23s/it][A

loss: tensor(0.9181, device='cuda:0', grad_fn=<NllLossBackward>)



  1%|          | 4/346 [00:20<29:44,  5.22s/it][A

loss: tensor(0.9294, device='cuda:0', grad_fn=<NllLossBackward>)



  1%|▏         | 5/346 [00:26<29:36,  5.21s/it][A

loss: tensor(0.9206, device='cuda:0', grad_fn=<NllLossBackward>)



  2%|▏         | 6/346 [00:31<29:42,  5.24s/it][A

loss: tensor(0.9428, device='cuda:0', grad_fn=<NllLossBackward>)



  2%|▏         | 7/346 [00:36<29:30,  5.22s/it][A

loss: tensor(0.9562, device='cuda:0', grad_fn=<NllLossBackward>)



  2%|▏         | 8/346 [00:41<29:24,  5.22s/it][A

loss: tensor(0.9295, device='cuda:0', grad_fn=<NllLossBackward>)



  3%|▎         | 9/346 [00:46<29:17,  5.21s/it][A

loss: tensor(0.9424, device='cuda:0', grad_fn=<NllLossBackward>)



  3%|▎         | 10/346 [00:52<29:18,  5.23s/it][A

loss: tensor(1.0137, device='cuda:0', grad_fn=<NllLossBackward>)



  3%|▎         | 11/346 [00:57<29:09,  5.22s/it][A

loss: tensor(0.9857, device='cuda:0', grad_fn=<NllLossBackward>)



  3%|▎         | 12/346 [01:02<29:03,  5.22s/it][A

loss: tensor(0.9184, device='cuda:0', grad_fn=<NllLossBackward>)



  4%|▍         | 13/346 [01:07<28:57,  5.22s/it][A

loss: tensor(0.9049, device='cuda:0', grad_fn=<NllLossBackward>)



  4%|▍         | 14/346 [01:13<28:58,  5.24s/it][A

loss: tensor(0.9733, device='cuda:0', grad_fn=<NllLossBackward>)



  4%|▍         | 15/346 [01:18<28:53,  5.24s/it][A

loss: tensor(0.9166, device='cuda:0', grad_fn=<NllLossBackward>)



  5%|▍         | 16/346 [01:23<28:47,  5.24s/it][A

loss: tensor(0.9236, device='cuda:0', grad_fn=<NllLossBackward>)



  5%|▍         | 17/346 [01:28<28:40,  5.23s/it][A

loss: tensor(0.9197, device='cuda:0', grad_fn=<NllLossBackward>)



  5%|▌         | 18/346 [01:34<28:40,  5.24s/it][A

loss: tensor(0.9229, device='cuda:0', grad_fn=<NllLossBackward>)



  5%|▌         | 19/346 [01:39<28:35,  5.24s/it][A

loss: tensor(0.9604, device='cuda:0', grad_fn=<NllLossBackward>)



  6%|▌         | 20/346 [01:44<28:26,  5.23s/it][A

loss: tensor(0.9313, device='cuda:0', grad_fn=<NllLossBackward>)



  6%|▌         | 21/346 [01:49<28:25,  5.25s/it][A

loss: tensor(0.9127, device='cuda:0', grad_fn=<NllLossBackward>)



  6%|▋         | 22/346 [01:55<28:17,  5.24s/it][A

loss: tensor(0.9776, device='cuda:0', grad_fn=<NllLossBackward>)



  7%|▋         | 23/346 [02:00<28:10,  5.23s/it][A

loss: tensor(0.9049, device='cuda:0', grad_fn=<NllLossBackward>)



  7%|▋         | 24/346 [02:05<28:02,  5.23s/it][A

loss: tensor(0.9182, device='cuda:0', grad_fn=<NllLossBackward>)



  7%|▋         | 25/346 [02:10<28:02,  5.24s/it][A

loss: tensor(0.9359, device='cuda:0', grad_fn=<NllLossBackward>)



  8%|▊         | 26/346 [02:16<27:53,  5.23s/it][A

loss: tensor(0.9049, device='cuda:0', grad_fn=<NllLossBackward>)



  8%|▊         | 27/346 [02:21<27:46,  5.22s/it][A

loss: tensor(0.9384, device='cuda:0', grad_fn=<NllLossBackward>)



  8%|▊         | 28/346 [02:26<27:39,  5.22s/it][A

loss: tensor(0.9961, device='cuda:0', grad_fn=<NllLossBackward>)



  8%|▊         | 29/346 [02:31<27:41,  5.24s/it][A

loss: tensor(0.9488, device='cuda:0', grad_fn=<NllLossBackward>)



  9%|▊         | 30/346 [02:36<27:33,  5.23s/it][A

loss: tensor(0.9184, device='cuda:0', grad_fn=<NllLossBackward>)



  9%|▉         | 31/346 [02:42<27:25,  5.23s/it][A

loss: tensor(0.9293, device='cuda:0', grad_fn=<NllLossBackward>)



  9%|▉         | 32/346 [02:47<27:19,  5.22s/it][A

loss: tensor(0.9225, device='cuda:0', grad_fn=<NllLossBackward>)



 10%|▉         | 33/346 [02:52<27:18,  5.23s/it][A

loss: tensor(0.9387, device='cuda:0', grad_fn=<NllLossBackward>)



 10%|▉         | 34/346 [02:57<27:08,  5.22s/it][A

loss: tensor(0.9538, device='cuda:0', grad_fn=<NllLossBackward>)



 10%|█         | 35/346 [03:02<27:00,  5.21s/it][A

loss: tensor(0.9519, device='cuda:0', grad_fn=<NllLossBackward>)



 10%|█         | 36/346 [03:08<26:52,  5.20s/it][A

loss: tensor(0.9049, device='cuda:0', grad_fn=<NllLossBackward>)



 11%|█         | 37/346 [03:13<26:54,  5.23s/it][A

loss: tensor(0.9211, device='cuda:0', grad_fn=<NllLossBackward>)



 11%|█         | 38/346 [03:18<26:46,  5.21s/it][A

loss: tensor(0.9379, device='cuda:0', grad_fn=<NllLossBackward>)



 11%|█▏        | 39/346 [03:23<26:39,  5.21s/it][A

loss: tensor(0.9376, device='cuda:0', grad_fn=<NllLossBackward>)



 12%|█▏        | 40/346 [03:29<26:37,  5.22s/it][A

loss: tensor(0.9994, device='cuda:0', grad_fn=<NllLossBackward>)



 12%|█▏        | 41/346 [03:34<26:35,  5.23s/it][A

loss: tensor(0.9368, device='cuda:0', grad_fn=<NllLossBackward>)



 12%|█▏        | 42/346 [03:39<26:28,  5.22s/it][A

loss: tensor(0.9145, device='cuda:0', grad_fn=<NllLossBackward>)



 12%|█▏        | 43/346 [03:44<26:22,  5.22s/it][A

loss: tensor(1.0126, device='cuda:0', grad_fn=<NllLossBackward>)



 13%|█▎        | 44/346 [03:50<26:19,  5.23s/it][A

loss: tensor(0.9692, device='cuda:0', grad_fn=<NllLossBackward>)



 13%|█▎        | 45/346 [03:55<26:10,  5.22s/it][A

loss: tensor(0.9274, device='cuda:0', grad_fn=<NllLossBackward>)



 13%|█▎        | 46/346 [04:00<26:04,  5.21s/it][A

loss: tensor(0.9625, device='cuda:0', grad_fn=<NllLossBackward>)



 14%|█▎        | 47/346 [04:05<25:56,  5.21s/it][A

loss: tensor(0.9145, device='cuda:0', grad_fn=<NllLossBackward>)



 14%|█▍        | 48/346 [04:10<25:55,  5.22s/it][A

loss: tensor(0.9147, device='cuda:0', grad_fn=<NllLossBackward>)



 14%|█▍        | 49/346 [04:16<25:50,  5.22s/it][A

loss: tensor(0.9313, device='cuda:0', grad_fn=<NllLossBackward>)



 14%|█▍        | 50/346 [04:21<25:43,  5.21s/it][A

loss: tensor(0.9259, device='cuda:0', grad_fn=<NllLossBackward>)



 15%|█▍        | 51/346 [04:26<25:35,  5.20s/it][A

loss: tensor(0.9171, device='cuda:0', grad_fn=<NllLossBackward>)



 15%|█▌        | 52/346 [04:31<25:40,  5.24s/it][A

loss: tensor(0.9049, device='cuda:0', grad_fn=<NllLossBackward>)



 15%|█▌        | 53/346 [04:36<25:34,  5.24s/it][A

loss: tensor(0.9340, device='cuda:0', grad_fn=<NllLossBackward>)



 16%|█▌        | 54/346 [04:42<25:25,  5.23s/it][A

loss: tensor(0.9760, device='cuda:0', grad_fn=<NllLossBackward>)



 16%|█▌        | 55/346 [04:47<25:20,  5.22s/it][A

loss: tensor(0.9397, device='cuda:0', grad_fn=<NllLossBackward>)



 16%|█▌        | 56/346 [04:52<25:19,  5.24s/it][A

loss: tensor(0.9710, device='cuda:0', grad_fn=<NllLossBackward>)



 16%|█▋        | 57/346 [04:57<25:11,  5.23s/it][A

loss: tensor(0.9518, device='cuda:0', grad_fn=<NllLossBackward>)



 17%|█▋        | 58/346 [05:03<25:02,  5.22s/it][A

loss: tensor(0.9197, device='cuda:0', grad_fn=<NllLossBackward>)



 17%|█▋        | 59/346 [05:08<24:54,  5.21s/it][A

loss: tensor(0.9289, device='cuda:0', grad_fn=<NllLossBackward>)



 17%|█▋        | 60/346 [05:13<24:55,  5.23s/it][A

loss: tensor(0.9347, device='cuda:0', grad_fn=<NllLossBackward>)



 18%|█▊        | 61/346 [05:18<24:47,  5.22s/it][A

loss: tensor(0.9431, device='cuda:0', grad_fn=<NllLossBackward>)



 18%|█▊        | 62/346 [05:23<24:40,  5.21s/it][A

loss: tensor(0.9573, device='cuda:0', grad_fn=<NllLossBackward>)



 18%|█▊        | 63/346 [05:29<24:34,  5.21s/it][A

loss: tensor(0.9049, device='cuda:0', grad_fn=<NllLossBackward>)



 18%|█▊        | 64/346 [05:34<24:32,  5.22s/it][A

loss: tensor(0.9388, device='cuda:0', grad_fn=<NllLossBackward>)



 19%|█▉        | 65/346 [05:39<24:25,  5.22s/it][A

loss: tensor(0.9374, device='cuda:0', grad_fn=<NllLossBackward>)



 19%|█▉        | 66/346 [05:44<24:20,  5.22s/it][A

loss: tensor(0.9160, device='cuda:0', grad_fn=<NllLossBackward>)



 19%|█▉        | 67/346 [05:50<24:19,  5.23s/it][A

loss: tensor(0.9049, device='cuda:0', grad_fn=<NllLossBackward>)



 20%|█▉        | 68/346 [05:55<24:11,  5.22s/it][A

loss: tensor(0.9124, device='cuda:0', grad_fn=<NllLossBackward>)



 20%|█▉        | 69/346 [06:00<24:06,  5.22s/it][A

loss: tensor(1.0085, device='cuda:0', grad_fn=<NllLossBackward>)



 20%|██        | 70/346 [06:05<24:00,  5.22s/it][A

loss: tensor(0.9345, device='cuda:0', grad_fn=<NllLossBackward>)



 21%|██        | 71/346 [06:10<23:58,  5.23s/it][A

loss: tensor(0.9594, device='cuda:0', grad_fn=<NllLossBackward>)



 21%|██        | 72/346 [06:16<23:50,  5.22s/it][A

loss: tensor(0.9049, device='cuda:0', grad_fn=<NllLossBackward>)



 21%|██        | 73/346 [06:21<23:44,  5.22s/it][A

loss: tensor(0.9136, device='cuda:0', grad_fn=<NllLossBackward>)



 21%|██▏       | 74/346 [06:26<23:38,  5.21s/it][A

loss: tensor(0.9210, device='cuda:0', grad_fn=<NllLossBackward>)



 22%|██▏       | 75/346 [06:31<23:39,  5.24s/it][A

loss: tensor(0.9532, device='cuda:0', grad_fn=<NllLossBackward>)



 22%|██▏       | 76/346 [06:37<23:31,  5.23s/it][A

loss: tensor(0.9855, device='cuda:0', grad_fn=<NllLossBackward>)



 22%|██▏       | 77/346 [06:42<23:23,  5.22s/it][A

loss: tensor(0.9391, device='cuda:0', grad_fn=<NllLossBackward>)



 23%|██▎       | 78/346 [06:47<23:18,  5.22s/it][A

loss: tensor(0.9813, device='cuda:0', grad_fn=<NllLossBackward>)



 23%|██▎       | 79/346 [06:52<23:17,  5.23s/it][A

loss: tensor(0.9515, device='cuda:0', grad_fn=<NllLossBackward>)



 23%|██▎       | 80/346 [06:58<23:11,  5.23s/it][A

loss: tensor(0.9313, device='cuda:0', grad_fn=<NllLossBackward>)



 23%|██▎       | 81/346 [07:03<23:04,  5.23s/it][A

loss: tensor(0.9610, device='cuda:0', grad_fn=<NllLossBackward>)



 24%|██▎       | 82/346 [07:08<22:57,  5.22s/it][A

loss: tensor(0.9184, device='cuda:0', grad_fn=<NllLossBackward>)



 24%|██▍       | 83/346 [07:13<22:57,  5.24s/it][A

loss: tensor(0.9165, device='cuda:0', grad_fn=<NllLossBackward>)



 24%|██▍       | 84/346 [07:18<22:50,  5.23s/it][A

loss: tensor(0.9821, device='cuda:0', grad_fn=<NllLossBackward>)



 25%|██▍       | 85/346 [07:24<22:43,  5.23s/it][A

loss: tensor(1.0103, device='cuda:0', grad_fn=<NllLossBackward>)



 25%|██▍       | 86/346 [07:29<22:36,  5.22s/it][A

loss: tensor(0.9523, device='cuda:0', grad_fn=<NllLossBackward>)



 25%|██▌       | 87/346 [07:34<22:34,  5.23s/it][A

loss: tensor(0.9505, device='cuda:0', grad_fn=<NllLossBackward>)



 25%|██▌       | 88/346 [07:39<22:28,  5.23s/it][A

loss: tensor(0.9261, device='cuda:0', grad_fn=<NllLossBackward>)



 26%|██▌       | 89/346 [07:45<22:22,  5.23s/it][A

loss: tensor(0.9843, device='cuda:0', grad_fn=<NllLossBackward>)



 26%|██▌       | 90/346 [07:50<22:20,  5.24s/it][A

loss: tensor(0.9129, device='cuda:0', grad_fn=<NllLossBackward>)



 26%|██▋       | 91/346 [07:55<22:14,  5.23s/it][A

loss: tensor(0.9268, device='cuda:0', grad_fn=<NllLossBackward>)



 27%|██▋       | 92/346 [08:00<22:08,  5.23s/it][A

loss: tensor(0.9049, device='cuda:0', grad_fn=<NllLossBackward>)



 27%|██▋       | 93/346 [08:05<21:58,  5.21s/it][A

loss: tensor(0.9049, device='cuda:0', grad_fn=<NllLossBackward>)



 27%|██▋       | 94/346 [08:11<21:58,  5.23s/it][A

loss: tensor(0.9361, device='cuda:0', grad_fn=<NllLossBackward>)



 27%|██▋       | 95/346 [08:16<21:53,  5.23s/it][A

loss: tensor(0.9700, device='cuda:0', grad_fn=<NllLossBackward>)



 28%|██▊       | 96/346 [08:21<21:46,  5.23s/it][A

loss: tensor(0.9469, device='cuda:0', grad_fn=<NllLossBackward>)



 28%|██▊       | 97/346 [08:26<21:39,  5.22s/it][A

loss: tensor(0.9313, device='cuda:0', grad_fn=<NllLossBackward>)



 28%|██▊       | 98/346 [08:32<21:39,  5.24s/it][A

loss: tensor(0.9461, device='cuda:0', grad_fn=<NllLossBackward>)



 29%|██▊       | 99/346 [08:37<21:31,  5.23s/it][A

loss: tensor(0.9687, device='cuda:0', grad_fn=<NllLossBackward>)



 29%|██▉       | 100/346 [08:42<21:23,  5.22s/it][A

loss: tensor(0.9371, device='cuda:0', grad_fn=<NllLossBackward>)



 29%|██▉       | 101/346 [08:47<21:18,  5.22s/it][A

loss: tensor(0.9182, device='cuda:0', grad_fn=<NllLossBackward>)



 29%|██▉       | 102/346 [08:53<21:16,  5.23s/it][A

loss: tensor(0.9516, device='cuda:0', grad_fn=<NllLossBackward>)



 30%|██▉       | 103/346 [08:58<21:09,  5.22s/it][A

loss: tensor(1.0156, device='cuda:0', grad_fn=<NllLossBackward>)



 30%|███       | 104/346 [09:03<21:02,  5.22s/it][A

loss: tensor(0.9568, device='cuda:0', grad_fn=<NllLossBackward>)



 30%|███       | 105/346 [09:08<20:54,  5.21s/it][A

loss: tensor(0.9271, device='cuda:0', grad_fn=<NllLossBackward>)



 31%|███       | 106/346 [09:13<20:54,  5.23s/it][A

loss: tensor(0.9983, device='cuda:0', grad_fn=<NllLossBackward>)



 31%|███       | 107/346 [09:19<20:47,  5.22s/it][A

loss: tensor(0.9298, device='cuda:0', grad_fn=<NllLossBackward>)



 31%|███       | 108/346 [09:24<20:40,  5.21s/it][A

loss: tensor(0.9183, device='cuda:0', grad_fn=<NllLossBackward>)



 32%|███▏      | 109/346 [09:29<20:40,  5.24s/it][A

loss: tensor(0.9554, device='cuda:0', grad_fn=<NllLossBackward>)



 32%|███▏      | 110/346 [09:34<20:31,  5.22s/it][A

loss: tensor(0.9559, device='cuda:0', grad_fn=<NllLossBackward>)



 32%|███▏      | 111/346 [09:39<20:26,  5.22s/it][A

loss: tensor(0.9445, device='cuda:0', grad_fn=<NllLossBackward>)



 32%|███▏      | 112/346 [09:45<20:20,  5.22s/it][A

loss: tensor(0.9392, device='cuda:0', grad_fn=<NllLossBackward>)



 33%|███▎      | 113/346 [09:50<20:19,  5.23s/it][A

loss: tensor(0.9253, device='cuda:0', grad_fn=<NllLossBackward>)



 33%|███▎      | 114/346 [09:55<20:11,  5.22s/it][A

loss: tensor(0.9622, device='cuda:0', grad_fn=<NllLossBackward>)



 33%|███▎      | 115/346 [10:00<20:06,  5.22s/it][A

loss: tensor(0.9293, device='cuda:0', grad_fn=<NllLossBackward>)



 34%|███▎      | 116/346 [10:06<19:59,  5.22s/it][A

loss: tensor(0.9268, device='cuda:0', grad_fn=<NllLossBackward>)



 34%|███▍      | 117/346 [10:11<19:57,  5.23s/it][A

loss: tensor(0.9049, device='cuda:0', grad_fn=<NllLossBackward>)



 34%|███▍      | 118/346 [10:16<19:50,  5.22s/it][A

loss: tensor(0.9049, device='cuda:0', grad_fn=<NllLossBackward>)



 34%|███▍      | 119/346 [10:21<19:43,  5.21s/it][A

loss: tensor(0.9838, device='cuda:0', grad_fn=<NllLossBackward>)



 35%|███▍      | 120/346 [10:26<19:36,  5.20s/it][A

loss: tensor(0.9436, device='cuda:0', grad_fn=<NllLossBackward>)



 35%|███▍      | 121/346 [10:32<19:34,  5.22s/it][A

loss: tensor(0.9261, device='cuda:0', grad_fn=<NllLossBackward>)



 35%|███▌      | 122/346 [10:37<19:25,  5.21s/it][A

loss: tensor(0.9049, device='cuda:0', grad_fn=<NllLossBackward>)



 36%|███▌      | 123/346 [10:42<19:17,  5.19s/it][A

loss: tensor(0.9467, device='cuda:0', grad_fn=<NllLossBackward>)



 36%|███▌      | 124/346 [10:47<19:09,  5.18s/it][A

loss: tensor(0.9205, device='cuda:0', grad_fn=<NllLossBackward>)



 36%|███▌      | 125/346 [10:52<19:07,  5.19s/it][A

loss: tensor(0.9704, device='cuda:0', grad_fn=<NllLossBackward>)



 36%|███▋      | 126/346 [10:57<18:59,  5.18s/it][A

loss: tensor(0.9553, device='cuda:0', grad_fn=<NllLossBackward>)



 37%|███▋      | 127/346 [11:03<18:50,  5.16s/it][A

loss: tensor(0.9178, device='cuda:0', grad_fn=<NllLossBackward>)



 37%|███▋      | 128/346 [11:08<18:43,  5.15s/it][A

loss: tensor(0.9323, device='cuda:0', grad_fn=<NllLossBackward>)



 37%|███▋      | 129/346 [11:13<18:42,  5.17s/it][A

loss: tensor(0.9049, device='cuda:0', grad_fn=<NllLossBackward>)



 38%|███▊      | 130/346 [11:18<18:34,  5.16s/it][A

loss: tensor(0.9338, device='cuda:0', grad_fn=<NllLossBackward>)



 38%|███▊      | 131/346 [11:23<18:25,  5.14s/it][A

loss: tensor(0.9573, device='cuda:0', grad_fn=<NllLossBackward>)



 38%|███▊      | 132/346 [11:28<18:25,  5.16s/it][A

loss: tensor(0.9215, device='cuda:0', grad_fn=<NllLossBackward>)



 38%|███▊      | 133/346 [11:34<18:18,  5.16s/it][A

loss: tensor(0.9766, device='cuda:0', grad_fn=<NllLossBackward>)



 39%|███▊      | 134/346 [11:39<18:12,  5.15s/it][A

loss: tensor(0.9359, device='cuda:0', grad_fn=<NllLossBackward>)



 39%|███▉      | 135/346 [11:44<18:06,  5.15s/it][A

loss: tensor(0.9370, device='cuda:0', grad_fn=<NllLossBackward>)



 39%|███▉      | 136/346 [11:49<18:04,  5.16s/it][A

loss: tensor(0.9428, device='cuda:0', grad_fn=<NllLossBackward>)



 40%|███▉      | 137/346 [11:54<17:56,  5.15s/it][A

loss: tensor(0.9049, device='cuda:0', grad_fn=<NllLossBackward>)



 40%|███▉      | 138/346 [11:59<17:53,  5.16s/it][A

loss: tensor(0.9433, device='cuda:0', grad_fn=<NllLossBackward>)



 40%|████      | 139/346 [12:04<17:46,  5.15s/it][A

loss: tensor(0.9756, device='cuda:0', grad_fn=<NllLossBackward>)



 40%|████      | 140/346 [12:10<17:44,  5.17s/it][A

loss: tensor(0.9049, device='cuda:0', grad_fn=<NllLossBackward>)



 41%|████      | 141/346 [12:15<17:36,  5.15s/it][A

loss: tensor(0.9425, device='cuda:0', grad_fn=<NllLossBackward>)



 41%|████      | 142/346 [12:20<17:29,  5.15s/it][A

loss: tensor(0.9207, device='cuda:0', grad_fn=<NllLossBackward>)



 41%|████▏     | 143/346 [12:25<17:23,  5.14s/it][A

loss: tensor(0.9599, device='cuda:0', grad_fn=<NllLossBackward>)



 42%|████▏     | 144/346 [12:30<17:21,  5.16s/it][A

loss: tensor(0.9629, device='cuda:0', grad_fn=<NllLossBackward>)



 42%|████▏     | 145/346 [12:35<17:14,  5.14s/it][A

loss: tensor(0.9406, device='cuda:0', grad_fn=<NllLossBackward>)



 42%|████▏     | 146/346 [12:40<17:07,  5.14s/it][A

loss: tensor(0.9544, device='cuda:0', grad_fn=<NllLossBackward>)



 42%|████▏     | 147/346 [12:46<17:01,  5.13s/it][A

loss: tensor(0.9680, device='cuda:0', grad_fn=<NllLossBackward>)



 43%|████▎     | 148/346 [12:51<17:00,  5.15s/it][A

loss: tensor(0.9473, device='cuda:0', grad_fn=<NllLossBackward>)



 43%|████▎     | 149/346 [12:56<16:54,  5.15s/it][A

loss: tensor(0.9049, device='cuda:0', grad_fn=<NllLossBackward>)



 43%|████▎     | 150/346 [13:01<16:48,  5.14s/it][A

loss: tensor(0.9229, device='cuda:0', grad_fn=<NllLossBackward>)



 44%|████▎     | 151/346 [13:06<16:42,  5.14s/it][A

loss: tensor(0.9049, device='cuda:0', grad_fn=<NllLossBackward>)



 44%|████▍     | 152/346 [13:11<16:41,  5.16s/it][A

loss: tensor(0.9144, device='cuda:0', grad_fn=<NllLossBackward>)



 44%|████▍     | 153/346 [13:17<16:35,  5.16s/it][A

loss: tensor(0.9213, device='cuda:0', grad_fn=<NllLossBackward>)



 45%|████▍     | 154/346 [13:22<16:29,  5.16s/it][A

loss: tensor(0.9049, device='cuda:0', grad_fn=<NllLossBackward>)



 45%|████▍     | 155/346 [13:27<16:24,  5.15s/it][A

loss: tensor(0.9611, device='cuda:0', grad_fn=<NllLossBackward>)



 45%|████▌     | 156/346 [13:32<16:22,  5.17s/it][A

loss: tensor(0.9272, device='cuda:0', grad_fn=<NllLossBackward>)



 45%|████▌     | 157/346 [13:37<16:16,  5.17s/it][A

loss: tensor(0.9049, device='cuda:0', grad_fn=<NllLossBackward>)



 46%|████▌     | 158/346 [13:42<16:09,  5.16s/it][A

loss: tensor(1.0831, device='cuda:0', grad_fn=<NllLossBackward>)



 46%|████▌     | 159/346 [13:48<16:07,  5.17s/it][A

loss: tensor(0.9138, device='cuda:0', grad_fn=<NllLossBackward>)



 46%|████▌     | 160/346 [13:53<15:57,  5.15s/it][A

loss: tensor(0.9223, device='cuda:0', grad_fn=<NllLossBackward>)



 47%|████▋     | 161/346 [13:58<15:54,  5.16s/it][A

loss: tensor(1.0359, device='cuda:0', grad_fn=<NllLossBackward>)



 47%|████▋     | 162/346 [14:03<15:50,  5.16s/it][A

loss: tensor(0.9794, device='cuda:0', grad_fn=<NllLossBackward>)



 47%|████▋     | 163/346 [14:08<15:48,  5.18s/it][A

loss: tensor(0.9299, device='cuda:0', grad_fn=<NllLossBackward>)



 47%|████▋     | 164/346 [14:13<15:43,  5.18s/it][A

loss: tensor(0.9322, device='cuda:0', grad_fn=<NllLossBackward>)



 48%|████▊     | 165/346 [14:19<15:37,  5.18s/it][A

loss: tensor(0.9427, device='cuda:0', grad_fn=<NllLossBackward>)



 48%|████▊     | 166/346 [14:24<15:32,  5.18s/it][A

loss: tensor(0.9607, device='cuda:0', grad_fn=<NllLossBackward>)



 48%|████▊     | 167/346 [14:29<15:32,  5.21s/it][A

loss: tensor(0.9438, device='cuda:0', grad_fn=<NllLossBackward>)



 49%|████▊     | 168/346 [14:34<15:26,  5.20s/it][A

loss: tensor(0.9742, device='cuda:0', grad_fn=<NllLossBackward>)



 49%|████▉     | 169/346 [14:39<15:20,  5.20s/it][A

loss: tensor(0.9123, device='cuda:0', grad_fn=<NllLossBackward>)



 49%|████▉     | 170/346 [14:45<15:14,  5.20s/it][A

loss: tensor(0.9587, device='cuda:0', grad_fn=<NllLossBackward>)



 49%|████▉     | 171/346 [14:50<15:14,  5.23s/it][A

loss: tensor(0.9750, device='cuda:0', grad_fn=<NllLossBackward>)



 50%|████▉     | 172/346 [14:55<15:07,  5.22s/it][A

loss: tensor(0.9461, device='cuda:0', grad_fn=<NllLossBackward>)



 50%|█████     | 173/346 [15:00<15:02,  5.21s/it][A

loss: tensor(0.9523, device='cuda:0', grad_fn=<NllLossBackward>)



 50%|█████     | 174/346 [15:06<14:56,  5.21s/it][A

loss: tensor(0.9249, device='cuda:0', grad_fn=<NllLossBackward>)



 51%|█████     | 175/346 [15:11<14:55,  5.23s/it][A

loss: tensor(0.9281, device='cuda:0', grad_fn=<NllLossBackward>)



 51%|█████     | 176/346 [15:16<14:48,  5.23s/it][A

loss: tensor(0.9941, device='cuda:0', grad_fn=<NllLossBackward>)



 51%|█████     | 177/346 [15:21<14:42,  5.22s/it][A

loss: tensor(0.9674, device='cuda:0', grad_fn=<NllLossBackward>)



 51%|█████▏    | 178/346 [15:27<14:40,  5.24s/it][A

loss: tensor(0.9494, device='cuda:0', grad_fn=<NllLossBackward>)



 52%|█████▏    | 179/346 [15:32<14:33,  5.23s/it][A

loss: tensor(0.9776, device='cuda:0', grad_fn=<NllLossBackward>)



 52%|█████▏    | 180/346 [15:37<14:26,  5.22s/it][A

loss: tensor(0.9428, device='cuda:0', grad_fn=<NllLossBackward>)



 52%|█████▏    | 181/346 [15:42<14:21,  5.22s/it][A

loss: tensor(0.9422, device='cuda:0', grad_fn=<NllLossBackward>)



 53%|█████▎    | 182/346 [15:47<14:18,  5.23s/it][A

loss: tensor(0.9596, device='cuda:0', grad_fn=<NllLossBackward>)



 53%|█████▎    | 183/346 [15:53<14:09,  5.21s/it][A

loss: tensor(0.9508, device='cuda:0', grad_fn=<NllLossBackward>)



 53%|█████▎    | 184/346 [15:58<14:04,  5.22s/it][A

loss: tensor(0.9180, device='cuda:0', grad_fn=<NllLossBackward>)



 53%|█████▎    | 185/346 [16:03<14:00,  5.22s/it][A

loss: tensor(0.9049, device='cuda:0', grad_fn=<NllLossBackward>)



 54%|█████▍    | 186/346 [16:08<13:56,  5.23s/it][A

loss: tensor(0.9049, device='cuda:0', grad_fn=<NllLossBackward>)



 54%|█████▍    | 187/346 [16:13<13:49,  5.21s/it][A

loss: tensor(0.9049, device='cuda:0', grad_fn=<NllLossBackward>)



 54%|█████▍    | 188/346 [16:19<13:43,  5.21s/it][A

loss: tensor(0.9838, device='cuda:0', grad_fn=<NllLossBackward>)



 55%|█████▍    | 189/346 [16:24<13:36,  5.20s/it][A

loss: tensor(0.9049, device='cuda:0', grad_fn=<NllLossBackward>)



 55%|█████▍    | 190/346 [16:29<13:34,  5.22s/it][A

loss: tensor(0.9049, device='cuda:0', grad_fn=<NllLossBackward>)



 55%|█████▌    | 191/346 [16:34<13:28,  5.22s/it][A

loss: tensor(0.9633, device='cuda:0', grad_fn=<NllLossBackward>)



 55%|█████▌    | 192/346 [16:40<13:22,  5.21s/it][A

loss: tensor(0.9512, device='cuda:0', grad_fn=<NllLossBackward>)



 56%|█████▌    | 193/346 [16:45<13:17,  5.21s/it][A

loss: tensor(0.9184, device='cuda:0', grad_fn=<NllLossBackward>)



 56%|█████▌    | 194/346 [16:50<13:14,  5.23s/it][A

loss: tensor(0.9049, device='cuda:0', grad_fn=<NllLossBackward>)



 56%|█████▋    | 195/346 [16:55<13:08,  5.22s/it][A

loss: tensor(0.9298, device='cuda:0', grad_fn=<NllLossBackward>)



 57%|█████▋    | 196/346 [17:00<13:02,  5.22s/it][A

loss: tensor(0.9505, device='cuda:0', grad_fn=<NllLossBackward>)



 57%|█████▋    | 197/346 [17:06<12:56,  5.21s/it][A

loss: tensor(0.9216, device='cuda:0', grad_fn=<NllLossBackward>)



 57%|█████▋    | 198/346 [17:11<12:54,  5.23s/it][A

loss: tensor(0.9332, device='cuda:0', grad_fn=<NllLossBackward>)



 58%|█████▊    | 199/346 [17:16<12:47,  5.22s/it][A

loss: tensor(0.9253, device='cuda:0', grad_fn=<NllLossBackward>)



 58%|█████▊    | 200/346 [17:21<12:41,  5.21s/it][A

loss: tensor(0.9338, device='cuda:0', grad_fn=<NllLossBackward>)



 58%|█████▊    | 201/346 [17:27<12:36,  5.22s/it][A

loss: tensor(0.9530, device='cuda:0', grad_fn=<NllLossBackward>)



 58%|█████▊    | 202/346 [17:32<12:34,  5.24s/it][A

loss: tensor(0.9170, device='cuda:0', grad_fn=<NllLossBackward>)



 59%|█████▊    | 203/346 [17:37<12:27,  5.23s/it][A

loss: tensor(0.9214, device='cuda:0', grad_fn=<NllLossBackward>)



 59%|█████▉    | 204/346 [17:42<12:21,  5.22s/it][A

loss: tensor(0.9354, device='cuda:0', grad_fn=<NllLossBackward>)



 59%|█████▉    | 205/346 [17:48<12:18,  5.24s/it][A

loss: tensor(0.9845, device='cuda:0', grad_fn=<NllLossBackward>)



 60%|█████▉    | 206/346 [17:53<12:10,  5.22s/it][A

loss: tensor(0.9400, device='cuda:0', grad_fn=<NllLossBackward>)



 60%|█████▉    | 207/346 [17:58<12:05,  5.22s/it][A

loss: tensor(0.9445, device='cuda:0', grad_fn=<NllLossBackward>)



 60%|██████    | 208/346 [18:03<11:59,  5.21s/it][A

loss: tensor(0.9340, device='cuda:0', grad_fn=<NllLossBackward>)



 60%|██████    | 209/346 [18:08<11:56,  5.23s/it][A

loss: tensor(0.9049, device='cuda:0', grad_fn=<NllLossBackward>)



 61%|██████    | 210/346 [18:14<11:50,  5.23s/it][A

loss: tensor(0.9157, device='cuda:0', grad_fn=<NllLossBackward>)



 61%|██████    | 211/346 [18:19<11:45,  5.22s/it][A

loss: tensor(0.9670, device='cuda:0', grad_fn=<NllLossBackward>)



 61%|██████▏   | 212/346 [18:24<11:38,  5.22s/it][A

loss: tensor(0.9334, device='cuda:0', grad_fn=<NllLossBackward>)



 62%|██████▏   | 213/346 [18:29<11:36,  5.24s/it][A

loss: tensor(0.9176, device='cuda:0', grad_fn=<NllLossBackward>)



 62%|██████▏   | 214/346 [18:35<11:30,  5.23s/it][A

loss: tensor(0.9718, device='cuda:0', grad_fn=<NllLossBackward>)



 62%|██████▏   | 215/346 [18:40<11:24,  5.23s/it][A

loss: tensor(0.9254, device='cuda:0', grad_fn=<NllLossBackward>)



 62%|██████▏   | 216/346 [18:45<11:18,  5.22s/it][A

loss: tensor(0.9460, device='cuda:0', grad_fn=<NllLossBackward>)



 63%|██████▎   | 217/346 [18:50<11:15,  5.24s/it][A

loss: tensor(0.9119, device='cuda:0', grad_fn=<NllLossBackward>)



 63%|██████▎   | 218/346 [18:55<11:08,  5.22s/it][A

loss: tensor(0.9427, device='cuda:0', grad_fn=<NllLossBackward>)



 63%|██████▎   | 219/346 [19:01<11:03,  5.22s/it][A

loss: tensor(0.9620, device='cuda:0', grad_fn=<NllLossBackward>)



 64%|██████▎   | 220/346 [19:06<10:57,  5.22s/it][A

loss: tensor(0.9383, device='cuda:0', grad_fn=<NllLossBackward>)



 64%|██████▍   | 221/346 [19:11<10:54,  5.24s/it][A

loss: tensor(0.9354, device='cuda:0', grad_fn=<NllLossBackward>)



 64%|██████▍   | 222/346 [19:16<10:48,  5.23s/it][A

loss: tensor(0.9286, device='cuda:0', grad_fn=<NllLossBackward>)



 64%|██████▍   | 223/346 [19:22<10:42,  5.23s/it][A

loss: tensor(0.9049, device='cuda:0', grad_fn=<NllLossBackward>)



 65%|██████▍   | 224/346 [19:27<10:37,  5.22s/it][A

loss: tensor(0.9371, device='cuda:0', grad_fn=<NllLossBackward>)



 65%|██████▌   | 225/346 [19:32<10:33,  5.24s/it][A

loss: tensor(0.9315, device='cuda:0', grad_fn=<NllLossBackward>)



 65%|██████▌   | 226/346 [19:37<10:27,  5.23s/it][A

loss: tensor(0.9334, device='cuda:0', grad_fn=<NllLossBackward>)



 66%|██████▌   | 227/346 [19:42<10:21,  5.23s/it][A

loss: tensor(0.9712, device='cuda:0', grad_fn=<NllLossBackward>)



 66%|██████▌   | 228/346 [19:48<10:18,  5.24s/it][A

loss: tensor(0.9783, device='cuda:0', grad_fn=<NllLossBackward>)



 66%|██████▌   | 229/346 [19:53<10:13,  5.25s/it][A

loss: tensor(0.9132, device='cuda:0', grad_fn=<NllLossBackward>)



 66%|██████▋   | 230/346 [19:58<10:08,  5.24s/it][A

loss: tensor(0.9640, device='cuda:0', grad_fn=<NllLossBackward>)



 67%|██████▋   | 231/346 [20:03<10:02,  5.24s/it][A

loss: tensor(0.9363, device='cuda:0', grad_fn=<NllLossBackward>)



 67%|██████▋   | 232/346 [20:09<09:57,  5.24s/it][A

loss: tensor(0.9233, device='cuda:0', grad_fn=<NllLossBackward>)



 67%|██████▋   | 233/346 [20:14<09:51,  5.24s/it][A

loss: tensor(0.9342, device='cuda:0', grad_fn=<NllLossBackward>)



 68%|██████▊   | 234/346 [20:19<09:46,  5.23s/it][A

loss: tensor(0.9148, device='cuda:0', grad_fn=<NllLossBackward>)



 68%|██████▊   | 235/346 [20:24<09:40,  5.23s/it][A

loss: tensor(0.9580, device='cuda:0', grad_fn=<NllLossBackward>)



 68%|██████▊   | 236/346 [20:30<09:37,  5.25s/it][A

loss: tensor(1.0190, device='cuda:0', grad_fn=<NllLossBackward>)



 68%|██████▊   | 237/346 [20:35<09:30,  5.24s/it][A

loss: tensor(0.9664, device='cuda:0', grad_fn=<NllLossBackward>)



 69%|██████▉   | 238/346 [20:40<09:23,  5.22s/it][A

loss: tensor(0.9828, device='cuda:0', grad_fn=<NllLossBackward>)



 69%|██████▉   | 239/346 [20:45<09:17,  5.21s/it][A

loss: tensor(0.9241, device='cuda:0', grad_fn=<NllLossBackward>)



 69%|██████▉   | 240/346 [20:50<09:13,  5.22s/it][A

loss: tensor(0.9996, device='cuda:0', grad_fn=<NllLossBackward>)



 70%|██████▉   | 241/346 [20:56<09:06,  5.21s/it][A

loss: tensor(0.9218, device='cuda:0', grad_fn=<NllLossBackward>)



 70%|██████▉   | 242/346 [21:01<09:00,  5.19s/it][A

loss: tensor(0.9400, device='cuda:0', grad_fn=<NllLossBackward>)



 70%|███████   | 243/346 [21:06<08:54,  5.19s/it][A

loss: tensor(0.9153, device='cuda:0', grad_fn=<NllLossBackward>)



 71%|███████   | 244/346 [21:11<08:51,  5.21s/it][A

loss: tensor(0.9853, device='cuda:0', grad_fn=<NllLossBackward>)



 71%|███████   | 245/346 [21:16<08:43,  5.19s/it][A

loss: tensor(0.9384, device='cuda:0', grad_fn=<NllLossBackward>)



 71%|███████   | 246/346 [21:22<08:37,  5.18s/it][A

loss: tensor(0.9049, device='cuda:0', grad_fn=<NllLossBackward>)



 71%|███████▏  | 247/346 [21:27<08:32,  5.17s/it][A

loss: tensor(0.9622, device='cuda:0', grad_fn=<NllLossBackward>)



 72%|███████▏  | 248/346 [21:32<08:27,  5.18s/it][A

loss: tensor(0.9794, device='cuda:0', grad_fn=<NllLossBackward>)



 72%|███████▏  | 249/346 [21:37<08:21,  5.17s/it][A

loss: tensor(0.9948, device='cuda:0', grad_fn=<NllLossBackward>)



 72%|███████▏  | 250/346 [21:42<08:15,  5.16s/it][A

loss: tensor(0.9425, device='cuda:0', grad_fn=<NllLossBackward>)



 73%|███████▎  | 251/346 [21:47<08:11,  5.18s/it][A

loss: tensor(0.9380, device='cuda:0', grad_fn=<NllLossBackward>)



 73%|███████▎  | 252/346 [21:53<08:05,  5.16s/it][A

loss: tensor(1.0017, device='cuda:0', grad_fn=<NllLossBackward>)



 73%|███████▎  | 253/346 [21:58<08:00,  5.17s/it][A

loss: tensor(1.0064, device='cuda:0', grad_fn=<NllLossBackward>)



 73%|███████▎  | 254/346 [22:03<07:54,  5.16s/it][A

loss: tensor(0.9320, device='cuda:0', grad_fn=<NllLossBackward>)



 74%|███████▎  | 255/346 [22:08<07:51,  5.18s/it][A

loss: tensor(0.9621, device='cuda:0', grad_fn=<NllLossBackward>)



 74%|███████▍  | 256/346 [22:13<07:45,  5.18s/it][A

loss: tensor(0.9049, device='cuda:0', grad_fn=<NllLossBackward>)



 74%|███████▍  | 257/346 [22:18<07:41,  5.18s/it][A

loss: tensor(0.9897, device='cuda:0', grad_fn=<NllLossBackward>)



 75%|███████▍  | 258/346 [22:24<07:35,  5.17s/it][A

loss: tensor(1.0003, device='cuda:0', grad_fn=<NllLossBackward>)



 75%|███████▍  | 259/346 [22:29<07:31,  5.19s/it][A

loss: tensor(0.9049, device='cuda:0', grad_fn=<NllLossBackward>)



 75%|███████▌  | 260/346 [22:34<07:25,  5.18s/it][A

loss: tensor(0.9127, device='cuda:0', grad_fn=<NllLossBackward>)



 75%|███████▌  | 261/346 [22:39<07:19,  5.17s/it][A

loss: tensor(0.9476, device='cuda:0', grad_fn=<NllLossBackward>)



 76%|███████▌  | 262/346 [22:44<07:14,  5.17s/it][A

loss: tensor(0.9204, device='cuda:0', grad_fn=<NllLossBackward>)



 76%|███████▌  | 263/346 [22:49<07:09,  5.18s/it][A

loss: tensor(0.9365, device='cuda:0', grad_fn=<NllLossBackward>)



 76%|███████▋  | 264/346 [22:55<07:03,  5.16s/it][A

loss: tensor(0.9239, device='cuda:0', grad_fn=<NllLossBackward>)



 77%|███████▋  | 265/346 [23:00<06:57,  5.15s/it][A

loss: tensor(0.9406, device='cuda:0', grad_fn=<NllLossBackward>)



 77%|███████▋  | 266/346 [23:05<06:51,  5.15s/it][A

loss: tensor(0.9615, device='cuda:0', grad_fn=<NllLossBackward>)



 77%|███████▋  | 267/346 [23:10<06:47,  5.16s/it][A

loss: tensor(0.9049, device='cuda:0', grad_fn=<NllLossBackward>)



 77%|███████▋  | 268/346 [23:15<06:41,  5.15s/it][A

loss: tensor(0.9130, device='cuda:0', grad_fn=<NllLossBackward>)



 78%|███████▊  | 269/346 [23:20<06:34,  5.13s/it][A

loss: tensor(0.9481, device='cuda:0', grad_fn=<NllLossBackward>)



 78%|███████▊  | 270/346 [23:26<06:31,  5.15s/it][A

loss: tensor(0.9833, device='cuda:0', grad_fn=<NllLossBackward>)



 78%|███████▊  | 271/346 [23:31<06:26,  5.15s/it][A

loss: tensor(0.9416, device='cuda:0', grad_fn=<NllLossBackward>)



 79%|███████▊  | 272/346 [23:36<06:20,  5.14s/it][A

loss: tensor(0.9356, device='cuda:0', grad_fn=<NllLossBackward>)



 79%|███████▉  | 273/346 [23:41<06:15,  5.14s/it][A

loss: tensor(0.9173, device='cuda:0', grad_fn=<NllLossBackward>)



 79%|███████▉  | 274/346 [23:46<06:11,  5.16s/it][A

loss: tensor(0.9297, device='cuda:0', grad_fn=<NllLossBackward>)



 79%|███████▉  | 275/346 [23:51<06:05,  5.15s/it][A

loss: tensor(0.9891, device='cuda:0', grad_fn=<NllLossBackward>)



 80%|███████▉  | 276/346 [23:56<06:00,  5.15s/it][A

loss: tensor(0.9220, device='cuda:0', grad_fn=<NllLossBackward>)



 80%|████████  | 277/346 [24:02<05:54,  5.14s/it][A

loss: tensor(0.9217, device='cuda:0', grad_fn=<NllLossBackward>)



 80%|████████  | 278/346 [24:07<05:50,  5.15s/it][A

loss: tensor(0.9049, device='cuda:0', grad_fn=<NllLossBackward>)



 81%|████████  | 279/346 [24:12<05:45,  5.16s/it][A

loss: tensor(0.9356, device='cuda:0', grad_fn=<NllLossBackward>)



 81%|████████  | 280/346 [24:17<05:39,  5.15s/it][A

loss: tensor(0.9291, device='cuda:0', grad_fn=<NllLossBackward>)



 81%|████████  | 281/346 [24:22<05:34,  5.15s/it][A

loss: tensor(0.9121, device='cuda:0', grad_fn=<NllLossBackward>)



 82%|████████▏ | 282/346 [24:27<05:30,  5.16s/it][A

loss: tensor(0.9464, device='cuda:0', grad_fn=<NllLossBackward>)



 82%|████████▏ | 283/346 [24:32<05:25,  5.16s/it][A

loss: tensor(0.9049, device='cuda:0', grad_fn=<NllLossBackward>)



 82%|████████▏ | 284/346 [24:38<05:19,  5.15s/it][A

loss: tensor(0.9401, device='cuda:0', grad_fn=<NllLossBackward>)



 82%|████████▏ | 285/346 [24:43<05:14,  5.15s/it][A

loss: tensor(0.9356, device='cuda:0', grad_fn=<NllLossBackward>)



 83%|████████▎ | 286/346 [24:48<05:09,  5.16s/it][A

loss: tensor(0.9386, device='cuda:0', grad_fn=<NllLossBackward>)



 83%|████████▎ | 287/346 [24:53<05:04,  5.16s/it][A

loss: tensor(0.9319, device='cuda:0', grad_fn=<NllLossBackward>)



 83%|████████▎ | 288/346 [24:58<04:58,  5.16s/it][A

loss: tensor(0.9134, device='cuda:0', grad_fn=<NllLossBackward>)



 84%|████████▎ | 289/346 [25:03<04:53,  5.15s/it][A

loss: tensor(0.9637, device='cuda:0', grad_fn=<NllLossBackward>)



 84%|████████▍ | 290/346 [25:09<04:49,  5.16s/it][A

loss: tensor(0.9624, device='cuda:0', grad_fn=<NllLossBackward>)



 84%|████████▍ | 291/346 [25:14<04:43,  5.16s/it][A

loss: tensor(0.9049, device='cuda:0', grad_fn=<NllLossBackward>)



 84%|████████▍ | 292/346 [25:19<04:38,  5.15s/it][A

loss: tensor(0.9336, device='cuda:0', grad_fn=<NllLossBackward>)



 85%|████████▍ | 293/346 [25:24<04:34,  5.17s/it][A

loss: tensor(0.9321, device='cuda:0', grad_fn=<NllLossBackward>)



 85%|████████▍ | 294/346 [25:29<04:28,  5.16s/it][A

loss: tensor(0.9329, device='cuda:0', grad_fn=<NllLossBackward>)



 85%|████████▌ | 295/346 [25:34<04:23,  5.17s/it][A

loss: tensor(0.9726, device='cuda:0', grad_fn=<NllLossBackward>)



 86%|████████▌ | 296/346 [25:40<04:18,  5.17s/it][A

loss: tensor(0.9326, device='cuda:0', grad_fn=<NllLossBackward>)



 86%|████████▌ | 297/346 [25:45<04:14,  5.19s/it][A

loss: tensor(0.9468, device='cuda:0', grad_fn=<NllLossBackward>)



 86%|████████▌ | 298/346 [25:50<04:08,  5.18s/it][A

loss: tensor(0.9462, device='cuda:0', grad_fn=<NllLossBackward>)



 86%|████████▋ | 299/346 [25:55<04:03,  5.18s/it][A

loss: tensor(1.0104, device='cuda:0', grad_fn=<NllLossBackward>)



 87%|████████▋ | 300/346 [26:00<03:58,  5.18s/it][A

loss: tensor(0.9611, device='cuda:0', grad_fn=<NllLossBackward>)



 87%|████████▋ | 301/346 [26:06<03:54,  5.20s/it][A

loss: tensor(0.9273, device='cuda:0', grad_fn=<NllLossBackward>)



 87%|████████▋ | 302/346 [26:11<03:49,  5.21s/it][A

loss: tensor(0.9264, device='cuda:0', grad_fn=<NllLossBackward>)



 88%|████████▊ | 303/346 [26:16<03:43,  5.20s/it][A

loss: tensor(0.9250, device='cuda:0', grad_fn=<NllLossBackward>)



 88%|████████▊ | 304/346 [26:21<03:38,  5.20s/it][A

loss: tensor(0.9890, device='cuda:0', grad_fn=<NllLossBackward>)



 88%|████████▊ | 305/346 [26:26<03:34,  5.22s/it][A

loss: tensor(0.9622, device='cuda:0', grad_fn=<NllLossBackward>)



 88%|████████▊ | 306/346 [26:32<03:28,  5.21s/it][A

loss: tensor(0.9249, device='cuda:0', grad_fn=<NllLossBackward>)



 89%|████████▊ | 307/346 [26:37<03:22,  5.20s/it][A

loss: tensor(0.9271, device='cuda:0', grad_fn=<NllLossBackward>)



 89%|████████▉ | 308/346 [26:42<03:17,  5.20s/it][A

loss: tensor(0.9431, device='cuda:0', grad_fn=<NllLossBackward>)



 89%|████████▉ | 309/346 [26:47<03:13,  5.22s/it][A

loss: tensor(0.9442, device='cuda:0', grad_fn=<NllLossBackward>)



 90%|████████▉ | 310/346 [26:52<03:07,  5.21s/it][A

loss: tensor(0.9845, device='cuda:0', grad_fn=<NllLossBackward>)



 90%|████████▉ | 311/346 [26:58<03:02,  5.21s/it][A

loss: tensor(0.9375, device='cuda:0', grad_fn=<NllLossBackward>)



 90%|█████████ | 312/346 [27:03<02:56,  5.20s/it][A

loss: tensor(0.9409, device='cuda:0', grad_fn=<NllLossBackward>)



 90%|█████████ | 313/346 [27:08<02:52,  5.22s/it][A

loss: tensor(0.9914, device='cuda:0', grad_fn=<NllLossBackward>)



 91%|█████████ | 314/346 [27:13<02:47,  5.24s/it][A

loss: tensor(0.9455, device='cuda:0', grad_fn=<NllLossBackward>)



 91%|█████████ | 315/346 [27:19<02:41,  5.22s/it][A

loss: tensor(0.9373, device='cuda:0', grad_fn=<NllLossBackward>)



 91%|█████████▏| 316/346 [27:24<02:37,  5.23s/it][A

loss: tensor(0.9271, device='cuda:0', grad_fn=<NllLossBackward>)



 92%|█████████▏| 317/346 [27:29<02:31,  5.23s/it][A

loss: tensor(0.9406, device='cuda:0', grad_fn=<NllLossBackward>)



 92%|█████████▏| 318/346 [27:34<02:26,  5.23s/it][A

loss: tensor(1.0031, device='cuda:0', grad_fn=<NllLossBackward>)



 92%|█████████▏| 319/346 [27:40<02:21,  5.23s/it][A

loss: tensor(1.0244, device='cuda:0', grad_fn=<NllLossBackward>)



 92%|█████████▏| 320/346 [27:45<02:16,  5.25s/it][A

loss: tensor(0.9528, device='cuda:0', grad_fn=<NllLossBackward>)



 93%|█████████▎| 321/346 [27:50<02:11,  5.25s/it][A

loss: tensor(0.9613, device='cuda:0', grad_fn=<NllLossBackward>)



 93%|█████████▎| 322/346 [27:55<02:05,  5.25s/it][A

loss: tensor(0.9144, device='cuda:0', grad_fn=<NllLossBackward>)



 93%|█████████▎| 323/346 [28:01<02:00,  5.24s/it][A

loss: tensor(0.9643, device='cuda:0', grad_fn=<NllLossBackward>)



 94%|█████████▎| 324/346 [28:06<01:55,  5.26s/it][A

loss: tensor(0.9143, device='cuda:0', grad_fn=<NllLossBackward>)



 94%|█████████▍| 325/346 [28:11<01:50,  5.25s/it][A

loss: tensor(0.9624, device='cuda:0', grad_fn=<NllLossBackward>)



 94%|█████████▍| 326/346 [28:16<01:44,  5.24s/it][A

loss: tensor(0.9407, device='cuda:0', grad_fn=<NllLossBackward>)



 95%|█████████▍| 327/346 [28:22<01:39,  5.23s/it][A

loss: tensor(0.9424, device='cuda:0', grad_fn=<NllLossBackward>)



 95%|█████████▍| 328/346 [28:27<01:34,  5.25s/it][A

loss: tensor(0.9356, device='cuda:0', grad_fn=<NllLossBackward>)



 95%|█████████▌| 329/346 [28:32<01:29,  5.24s/it][A

loss: tensor(0.9536, device='cuda:0', grad_fn=<NllLossBackward>)



 95%|█████████▌| 330/346 [28:37<01:24,  5.27s/it][A

loss: tensor(0.9123, device='cuda:0', grad_fn=<NllLossBackward>)



 96%|█████████▌| 331/346 [28:43<01:19,  5.31s/it][A

loss: tensor(0.9270, device='cuda:0', grad_fn=<NllLossBackward>)



 96%|█████████▌| 332/346 [28:48<01:14,  5.35s/it][A

loss: tensor(0.9649, device='cuda:0', grad_fn=<NllLossBackward>)



 96%|█████████▌| 333/346 [28:54<01:09,  5.35s/it][A

loss: tensor(0.9838, device='cuda:0', grad_fn=<NllLossBackward>)



 97%|█████████▋| 334/346 [28:59<01:04,  5.35s/it][A

loss: tensor(0.9253, device='cuda:0', grad_fn=<NllLossBackward>)



 97%|█████████▋| 335/346 [29:04<00:58,  5.36s/it][A

loss: tensor(0.9146, device='cuda:0', grad_fn=<NllLossBackward>)



 97%|█████████▋| 336/346 [29:09<00:51,  5.20s/it][A

loss: tensor(0.9447, device='cuda:0', grad_fn=<NllLossBackward>)



 97%|█████████▋| 337/346 [29:14<00:45,  5.08s/it][A

loss: tensor(0.9580, device='cuda:0', grad_fn=<NllLossBackward>)



 98%|█████████▊| 338/346 [29:19<00:39,  5.00s/it][A

loss: tensor(0.9272, device='cuda:0', grad_fn=<NllLossBackward>)



 98%|█████████▊| 339/346 [29:24<00:34,  4.96s/it][A

loss: tensor(0.9277, device='cuda:0', grad_fn=<NllLossBackward>)



 98%|█████████▊| 340/346 [29:28<00:29,  4.92s/it][A

loss: tensor(0.9291, device='cuda:0', grad_fn=<NllLossBackward>)



 99%|█████████▊| 341/346 [29:33<00:24,  4.89s/it][A

loss: tensor(0.9382, device='cuda:0', grad_fn=<NllLossBackward>)



 99%|█████████▉| 342/346 [29:38<00:19,  4.87s/it][A

loss: tensor(0.9632, device='cuda:0', grad_fn=<NllLossBackward>)



 99%|█████████▉| 343/346 [29:43<00:14,  4.98s/it][A

loss: tensor(0.9701, device='cuda:0', grad_fn=<NllLossBackward>)



 99%|█████████▉| 344/346 [29:49<00:10,  5.09s/it][A

loss: tensor(0.9049, device='cuda:0', grad_fn=<NllLossBackward>)



100%|█████████▉| 345/346 [29:54<00:05,  5.18s/it][A

loss: tensor(0.9674, device='cuda:0', grad_fn=<NllLossBackward>)



100%|██████████| 346/346 [29:54<00:00,  5.19s/it][A

  0%|          | 0/173 [00:00<?, ?it/s][A

loss: tensor(0.9049, device='cuda:0', grad_fn=<NllLossBackward>)

	Training Loss: 0.9418942685416668

	Training acc: 0.9629713656230906

	Training prec: 0.9281066387398499

	Training rec: 0.9629713656230906

	Training f1: 0.9450192782259302

	Current Learning rate:  1e-05



  1%|          | 1/173 [00:00<02:02,  1.40it/s][A
  1%|          | 2/173 [00:01<02:02,  1.39it/s][A
  2%|▏         | 3/173 [00:02<01:55,  1.47it/s][A
  2%|▏         | 4/173 [00:02<01:57,  1.44it/s][A
  3%|▎         | 5/173 [00:03<01:58,  1.41it/s][A
  3%|▎         | 6/173 [00:04<01:54,  1.46it/s][A
  4%|▍         | 7/173 [00:04<01:55,  1.44it/s][A
  5%|▍         | 8/173 [00:05<01:55,  1.43it/s][A
  5%|▌         | 9/173 [00:06<01:51,  1.47it/s][A
  6%|▌         | 10/173 [00:06<01:52,  1.44it/s][A
  6%|▋         | 11/173 [00:07<01:53,  1.43it/s][A
  7%|▋         | 12/173 [00:08<01:49,  1.47it/s][A
  8%|▊         | 13/173 [00:09<01:51,  1.44it/s][A
  8%|▊         | 14/173 [00:09<01:51,  1.42it/s][A
  9%|▊         | 15/173 [00:10<01:48,  1.46it/s][A
  9%|▉         | 16/173 [00:11<01:49,  1.44it/s][A
 10%|▉         | 17/173 [00:11<01:49,  1.43it/s][A
 10%|█         | 18/173 [00:12<01:45,  1.46it/s][A
 11%|█         | 19/173 [00:13<01:46,  1.45it/s][A
 12%|█▏        | 20/


	Validation Loss: 0.9440662564569815

	Validation acc: 0.960771271173654

	Validation prec: 0.9245537531732116

	Validation rec: 0.960771271173654

	Validation f1: 0.9419474042643311



  0%|          | 1/346 [00:05<31:14,  5.43s/it][A

loss: tensor(0.9049, device='cuda:0', grad_fn=<NllLossBackward>)



  1%|          | 2/346 [00:10<30:48,  5.37s/it][A

loss: tensor(0.9278, device='cuda:0', grad_fn=<NllLossBackward>)



  1%|          | 3/346 [00:16<30:41,  5.37s/it][A

loss: tensor(0.9049, device='cuda:0', grad_fn=<NllLossBackward>)



  1%|          | 4/346 [00:21<30:34,  5.37s/it][A

loss: tensor(0.9304, device='cuda:0', grad_fn=<NllLossBackward>)



  1%|▏         | 5/346 [00:26<30:20,  5.34s/it][A

loss: tensor(0.9527, device='cuda:0', grad_fn=<NllLossBackward>)



  2%|▏         | 6/346 [00:31<30:01,  5.30s/it][A

loss: tensor(0.9531, device='cuda:0', grad_fn=<NllLossBackward>)



  2%|▏         | 7/346 [00:37<29:49,  5.28s/it][A

loss: tensor(0.9303, device='cuda:0', grad_fn=<NllLossBackward>)



  2%|▏         | 8/346 [00:42<29:36,  5.26s/it][A

loss: tensor(0.9485, device='cuda:0', grad_fn=<NllLossBackward>)



  3%|▎         | 9/346 [00:47<29:33,  5.26s/it][A

loss: tensor(0.9174, device='cuda:0', grad_fn=<NllLossBackward>)



  3%|▎         | 10/346 [00:52<29:24,  5.25s/it][A

loss: tensor(0.9206, device='cuda:0', grad_fn=<NllLossBackward>)



  3%|▎         | 11/346 [00:58<29:14,  5.24s/it][A

loss: tensor(0.9049, device='cuda:0', grad_fn=<NllLossBackward>)



  3%|▎         | 12/346 [01:03<29:10,  5.24s/it][A

loss: tensor(0.9794, device='cuda:0', grad_fn=<NllLossBackward>)



  4%|▍         | 13/346 [01:08<29:04,  5.24s/it][A

loss: tensor(0.9659, device='cuda:0', grad_fn=<NllLossBackward>)



  4%|▍         | 14/346 [01:13<28:56,  5.23s/it][A

loss: tensor(0.9386, device='cuda:0', grad_fn=<NllLossBackward>)



  4%|▍         | 15/346 [01:19<28:50,  5.23s/it][A

loss: tensor(0.9313, device='cuda:0', grad_fn=<NllLossBackward>)



  5%|▍         | 16/346 [01:24<28:48,  5.24s/it][A

loss: tensor(0.9486, device='cuda:0', grad_fn=<NllLossBackward>)



  5%|▍         | 17/346 [01:29<28:40,  5.23s/it][A

loss: tensor(0.9274, device='cuda:0', grad_fn=<NllLossBackward>)



  5%|▌         | 18/346 [01:34<28:33,  5.22s/it][A

loss: tensor(0.9444, device='cuda:0', grad_fn=<NllLossBackward>)



  5%|▌         | 19/346 [01:39<28:28,  5.22s/it][A

loss: tensor(0.9322, device='cuda:0', grad_fn=<NllLossBackward>)



  6%|▌         | 20/346 [01:45<28:28,  5.24s/it][A

loss: tensor(0.9625, device='cuda:0', grad_fn=<NllLossBackward>)



  6%|▌         | 21/346 [01:50<28:20,  5.23s/it][A

loss: tensor(0.9280, device='cuda:0', grad_fn=<NllLossBackward>)



  6%|▋         | 22/346 [01:55<28:10,  5.22s/it][A

loss: tensor(0.9151, device='cuda:0', grad_fn=<NllLossBackward>)



  7%|▋         | 23/346 [02:00<28:06,  5.22s/it][A

loss: tensor(0.9142, device='cuda:0', grad_fn=<NllLossBackward>)



  7%|▋         | 24/346 [02:06<28:10,  5.25s/it][A

loss: tensor(0.9531, device='cuda:0', grad_fn=<NllLossBackward>)



  7%|▋         | 25/346 [02:11<28:02,  5.24s/it][A

loss: tensor(0.9171, device='cuda:0', grad_fn=<NllLossBackward>)



  8%|▊         | 26/346 [02:16<27:57,  5.24s/it][A

loss: tensor(0.9348, device='cuda:0', grad_fn=<NllLossBackward>)



  8%|▊         | 27/346 [02:21<27:50,  5.24s/it][A

loss: tensor(0.9264, device='cuda:0', grad_fn=<NllLossBackward>)



  8%|▊         | 28/346 [02:27<27:50,  5.25s/it][A

loss: tensor(0.9287, device='cuda:0', grad_fn=<NllLossBackward>)



  8%|▊         | 29/346 [02:32<27:42,  5.25s/it][A

loss: tensor(0.9390, device='cuda:0', grad_fn=<NllLossBackward>)



  9%|▊         | 30/346 [02:37<27:34,  5.24s/it][A

loss: tensor(0.9281, device='cuda:0', grad_fn=<NllLossBackward>)



  9%|▉         | 31/346 [02:42<27:28,  5.23s/it][A

loss: tensor(0.9374, device='cuda:0', grad_fn=<NllLossBackward>)



  9%|▉         | 32/346 [02:48<27:27,  5.25s/it][A

loss: tensor(0.9609, device='cuda:0', grad_fn=<NllLossBackward>)



 10%|▉         | 33/346 [02:53<27:21,  5.24s/it][A

loss: tensor(0.9049, device='cuda:0', grad_fn=<NllLossBackward>)



 10%|▉         | 34/346 [02:58<27:15,  5.24s/it][A

loss: tensor(1.0087, device='cuda:0', grad_fn=<NllLossBackward>)



 10%|█         | 35/346 [03:03<27:17,  5.27s/it][A

loss: tensor(0.9364, device='cuda:0', grad_fn=<NllLossBackward>)



 10%|█         | 36/346 [03:09<27:07,  5.25s/it][A

loss: tensor(0.9199, device='cuda:0', grad_fn=<NllLossBackward>)



 11%|█         | 37/346 [03:14<27:00,  5.24s/it][A

loss: tensor(1.0018, device='cuda:0', grad_fn=<NllLossBackward>)



 11%|█         | 38/346 [03:19<26:53,  5.24s/it][A

loss: tensor(0.9229, device='cuda:0', grad_fn=<NllLossBackward>)



 11%|█▏        | 39/346 [03:24<26:51,  5.25s/it][A

loss: tensor(0.9243, device='cuda:0', grad_fn=<NllLossBackward>)



 12%|█▏        | 40/346 [03:30<26:45,  5.25s/it][A

loss: tensor(0.9529, device='cuda:0', grad_fn=<NllLossBackward>)



 12%|█▏        | 41/346 [03:35<26:38,  5.24s/it][A

loss: tensor(0.9419, device='cuda:0', grad_fn=<NllLossBackward>)



 12%|█▏        | 42/346 [03:40<26:29,  5.23s/it][A

loss: tensor(0.9188, device='cuda:0', grad_fn=<NllLossBackward>)



 12%|█▏        | 43/346 [03:45<26:33,  5.26s/it][A

loss: tensor(0.9141, device='cuda:0', grad_fn=<NllLossBackward>)



 13%|█▎        | 44/346 [03:51<26:26,  5.25s/it][A

loss: tensor(0.9273, device='cuda:0', grad_fn=<NllLossBackward>)



 13%|█▎        | 45/346 [03:56<26:19,  5.25s/it][A

loss: tensor(0.9382, device='cuda:0', grad_fn=<NllLossBackward>)



 13%|█▎        | 46/346 [04:01<26:13,  5.24s/it][A

loss: tensor(0.9831, device='cuda:0', grad_fn=<NllLossBackward>)



 14%|█▎        | 47/346 [04:06<26:10,  5.25s/it][A

loss: tensor(0.9696, device='cuda:0', grad_fn=<NllLossBackward>)



 14%|█▍        | 48/346 [04:12<26:01,  5.24s/it][A

loss: tensor(0.9364, device='cuda:0', grad_fn=<NllLossBackward>)



 14%|█▍        | 49/346 [04:17<25:55,  5.24s/it][A

loss: tensor(0.9559, device='cuda:0', grad_fn=<NllLossBackward>)



 14%|█▍        | 50/346 [04:22<25:50,  5.24s/it][A

loss: tensor(0.9628, device='cuda:0', grad_fn=<NllLossBackward>)



 15%|█▍        | 51/346 [04:27<25:52,  5.26s/it][A

loss: tensor(0.9049, device='cuda:0', grad_fn=<NllLossBackward>)



 15%|█▌        | 52/346 [04:33<25:45,  5.26s/it][A

loss: tensor(0.9496, device='cuda:0', grad_fn=<NllLossBackward>)



 15%|█▌        | 53/346 [04:38<25:40,  5.26s/it][A

loss: tensor(0.9049, device='cuda:0', grad_fn=<NllLossBackward>)



 16%|█▌        | 54/346 [04:43<25:39,  5.27s/it][A

loss: tensor(0.9374, device='cuda:0', grad_fn=<NllLossBackward>)



 16%|█▌        | 55/346 [04:48<25:30,  5.26s/it][A

loss: tensor(1.0018, device='cuda:0', grad_fn=<NllLossBackward>)



 16%|█▌        | 56/346 [04:54<25:23,  5.25s/it][A

loss: tensor(0.9049, device='cuda:0', grad_fn=<NllLossBackward>)



 16%|█▋        | 57/346 [04:59<25:13,  5.24s/it][A

loss: tensor(0.9049, device='cuda:0', grad_fn=<NllLossBackward>)



 17%|█▋        | 58/346 [05:04<25:12,  5.25s/it][A

loss: tensor(0.9738, device='cuda:0', grad_fn=<NllLossBackward>)



 17%|█▋        | 59/346 [05:09<25:02,  5.23s/it][A

loss: tensor(0.9204, device='cuda:0', grad_fn=<NllLossBackward>)



 17%|█▋        | 60/346 [05:15<24:57,  5.24s/it][A

loss: tensor(0.9416, device='cuda:0', grad_fn=<NllLossBackward>)



 18%|█▊        | 61/346 [05:20<24:51,  5.23s/it][A

loss: tensor(0.9271, device='cuda:0', grad_fn=<NllLossBackward>)



 18%|█▊        | 62/346 [05:25<24:51,  5.25s/it][A

loss: tensor(0.9428, device='cuda:0', grad_fn=<NllLossBackward>)



 18%|█▊        | 63/346 [05:30<24:47,  5.26s/it][A

loss: tensor(0.9049, device='cuda:0', grad_fn=<NllLossBackward>)



 18%|█▊        | 64/346 [05:36<24:40,  5.25s/it][A

loss: tensor(0.9320, device='cuda:0', grad_fn=<NllLossBackward>)



 19%|█▉        | 65/346 [05:41<24:31,  5.23s/it][A

loss: tensor(0.9563, device='cuda:0', grad_fn=<NllLossBackward>)



 19%|█▉        | 66/346 [05:46<24:31,  5.25s/it][A

loss: tensor(0.9281, device='cuda:0', grad_fn=<NllLossBackward>)



 19%|█▉        | 67/346 [05:51<24:23,  5.25s/it][A

loss: tensor(0.9358, device='cuda:0', grad_fn=<NllLossBackward>)



 20%|█▉        | 68/346 [05:57<24:18,  5.25s/it][A

loss: tensor(0.9049, device='cuda:0', grad_fn=<NllLossBackward>)



 20%|█▉        | 69/346 [06:02<24:11,  5.24s/it][A

loss: tensor(0.9310, device='cuda:0', grad_fn=<NllLossBackward>)



 20%|██        | 70/346 [06:07<24:09,  5.25s/it][A

loss: tensor(0.9267, device='cuda:0', grad_fn=<NllLossBackward>)



 21%|██        | 71/346 [06:12<24:02,  5.25s/it][A

loss: tensor(0.9415, device='cuda:0', grad_fn=<NllLossBackward>)



 21%|██        | 72/346 [06:17<23:53,  5.23s/it][A

loss: tensor(0.9252, device='cuda:0', grad_fn=<NllLossBackward>)



 21%|██        | 73/346 [06:23<23:45,  5.22s/it][A

loss: tensor(0.9449, device='cuda:0', grad_fn=<NllLossBackward>)



 21%|██▏       | 74/346 [06:28<23:47,  5.25s/it][A

loss: tensor(0.9266, device='cuda:0', grad_fn=<NllLossBackward>)



 22%|██▏       | 75/346 [06:33<23:37,  5.23s/it][A

loss: tensor(0.9391, device='cuda:0', grad_fn=<NllLossBackward>)



 22%|██▏       | 76/346 [06:38<23:25,  5.21s/it][A

loss: tensor(0.9201, device='cuda:0', grad_fn=<NllLossBackward>)



 22%|██▏       | 77/346 [06:44<23:26,  5.23s/it][A

loss: tensor(0.9723, device='cuda:0', grad_fn=<NllLossBackward>)



 23%|██▎       | 78/346 [06:49<23:20,  5.23s/it][A

loss: tensor(0.9435, device='cuda:0', grad_fn=<NllLossBackward>)



 23%|██▎       | 79/346 [06:54<23:14,  5.22s/it][A

loss: tensor(0.9380, device='cuda:0', grad_fn=<NllLossBackward>)



 23%|██▎       | 80/346 [06:59<23:08,  5.22s/it][A

loss: tensor(0.9534, device='cuda:0', grad_fn=<NllLossBackward>)



 23%|██▎       | 81/346 [07:05<23:08,  5.24s/it][A

loss: tensor(0.9328, device='cuda:0', grad_fn=<NllLossBackward>)



 24%|██▎       | 82/346 [07:10<22:59,  5.22s/it][A

loss: tensor(0.9343, device='cuda:0', grad_fn=<NllLossBackward>)



 24%|██▍       | 83/346 [07:15<22:53,  5.22s/it][A

loss: tensor(0.9419, device='cuda:0', grad_fn=<NllLossBackward>)



 24%|██▍       | 84/346 [07:20<22:46,  5.22s/it][A

loss: tensor(0.9586, device='cuda:0', grad_fn=<NllLossBackward>)



 25%|██▍       | 85/346 [07:25<22:48,  5.24s/it][A

loss: tensor(0.9984, device='cuda:0', grad_fn=<NllLossBackward>)



 25%|██▍       | 86/346 [07:31<22:40,  5.23s/it][A

loss: tensor(0.9271, device='cuda:0', grad_fn=<NllLossBackward>)



 25%|██▌       | 87/346 [07:36<22:32,  5.22s/it][A

loss: tensor(0.9829, device='cuda:0', grad_fn=<NllLossBackward>)



 25%|██▌       | 88/346 [07:41<22:24,  5.21s/it][A

loss: tensor(0.9547, device='cuda:0', grad_fn=<NllLossBackward>)



 26%|██▌       | 89/346 [07:46<22:23,  5.23s/it][A

loss: tensor(0.9412, device='cuda:0', grad_fn=<NllLossBackward>)



 26%|██▌       | 90/346 [07:52<22:18,  5.23s/it][A

loss: tensor(0.9507, device='cuda:0', grad_fn=<NllLossBackward>)



 26%|██▋       | 91/346 [07:57<22:10,  5.22s/it][A

loss: tensor(0.9393, device='cuda:0', grad_fn=<NllLossBackward>)



 27%|██▋       | 92/346 [08:02<22:00,  5.20s/it][A

loss: tensor(0.9457, device='cuda:0', grad_fn=<NllLossBackward>)



 27%|██▋       | 93/346 [08:07<22:02,  5.23s/it][A

loss: tensor(0.9757, device='cuda:0', grad_fn=<NllLossBackward>)



 27%|██▋       | 94/346 [08:12<21:54,  5.22s/it][A

loss: tensor(0.9406, device='cuda:0', grad_fn=<NllLossBackward>)



 27%|██▋       | 95/346 [08:18<21:48,  5.21s/it][A

loss: tensor(0.9527, device='cuda:0', grad_fn=<NllLossBackward>)



 28%|██▊       | 96/346 [08:23<21:42,  5.21s/it][A

loss: tensor(0.9049, device='cuda:0', grad_fn=<NllLossBackward>)



 28%|██▊       | 97/346 [08:28<21:44,  5.24s/it][A

loss: tensor(0.9049, device='cuda:0', grad_fn=<NllLossBackward>)



 28%|██▊       | 98/346 [08:33<21:37,  5.23s/it][A

loss: tensor(0.9606, device='cuda:0', grad_fn=<NllLossBackward>)



 29%|██▊       | 99/346 [08:39<21:32,  5.23s/it][A

loss: tensor(0.9049, device='cuda:0', grad_fn=<NllLossBackward>)



 29%|██▉       | 100/346 [08:44<21:27,  5.23s/it][A

loss: tensor(0.9751, device='cuda:0', grad_fn=<NllLossBackward>)



 29%|██▉       | 101/346 [08:49<21:26,  5.25s/it][A

loss: tensor(0.9516, device='cuda:0', grad_fn=<NllLossBackward>)



 29%|██▉       | 102/346 [08:54<21:19,  5.24s/it][A

loss: tensor(0.9129, device='cuda:0', grad_fn=<NllLossBackward>)



 30%|██▉       | 103/346 [09:00<21:12,  5.24s/it][A

loss: tensor(0.9483, device='cuda:0', grad_fn=<NllLossBackward>)



 30%|███       | 104/346 [09:05<21:12,  5.26s/it][A

loss: tensor(0.9856, device='cuda:0', grad_fn=<NllLossBackward>)



 30%|███       | 105/346 [09:10<21:03,  5.24s/it][A

loss: tensor(0.9352, device='cuda:0', grad_fn=<NllLossBackward>)



 31%|███       | 106/346 [09:15<20:57,  5.24s/it][A

loss: tensor(0.9242, device='cuda:0', grad_fn=<NllLossBackward>)



 31%|███       | 107/346 [09:20<20:51,  5.24s/it][A

loss: tensor(0.9191, device='cuda:0', grad_fn=<NllLossBackward>)



 31%|███       | 108/346 [09:26<20:50,  5.25s/it][A

loss: tensor(0.9049, device='cuda:0', grad_fn=<NllLossBackward>)



 32%|███▏      | 109/346 [09:31<20:45,  5.25s/it][A

loss: tensor(0.9561, device='cuda:0', grad_fn=<NllLossBackward>)



 32%|███▏      | 110/346 [09:36<20:37,  5.24s/it][A

loss: tensor(0.9491, device='cuda:0', grad_fn=<NllLossBackward>)



 32%|███▏      | 111/346 [09:41<20:29,  5.23s/it][A

loss: tensor(0.9049, device='cuda:0', grad_fn=<NllLossBackward>)



 32%|███▏      | 112/346 [09:47<20:28,  5.25s/it][A

loss: tensor(0.9521, device='cuda:0', grad_fn=<NllLossBackward>)



 33%|███▎      | 113/346 [09:52<20:22,  5.25s/it][A

loss: tensor(0.9460, device='cuda:0', grad_fn=<NllLossBackward>)



 33%|███▎      | 114/346 [09:57<20:15,  5.24s/it][A

loss: tensor(0.9325, device='cuda:0', grad_fn=<NllLossBackward>)



 33%|███▎      | 115/346 [10:02<20:09,  5.23s/it][A

loss: tensor(0.9320, device='cuda:0', grad_fn=<NllLossBackward>)



 34%|███▎      | 116/346 [10:08<20:09,  5.26s/it][A

loss: tensor(0.9620, device='cuda:0', grad_fn=<NllLossBackward>)



 34%|███▍      | 117/346 [10:13<20:02,  5.25s/it][A

loss: tensor(0.9334, device='cuda:0', grad_fn=<NllLossBackward>)



 34%|███▍      | 118/346 [10:18<19:55,  5.24s/it][A

loss: tensor(0.9200, device='cuda:0', grad_fn=<NllLossBackward>)



 34%|███▍      | 119/346 [10:23<19:48,  5.24s/it][A

loss: tensor(0.9326, device='cuda:0', grad_fn=<NllLossBackward>)



 35%|███▍      | 120/346 [10:29<19:46,  5.25s/it][A

loss: tensor(0.9637, device='cuda:0', grad_fn=<NllLossBackward>)



 35%|███▍      | 121/346 [10:34<19:40,  5.25s/it][A

loss: tensor(0.9918, device='cuda:0', grad_fn=<NllLossBackward>)



 35%|███▌      | 122/346 [10:39<19:34,  5.25s/it][A

loss: tensor(0.9309, device='cuda:0', grad_fn=<NllLossBackward>)



 36%|███▌      | 123/346 [10:45<19:35,  5.27s/it][A

loss: tensor(0.9542, device='cuda:0', grad_fn=<NllLossBackward>)



 36%|███▌      | 124/346 [10:50<19:27,  5.26s/it][A

loss: tensor(0.9488, device='cuda:0', grad_fn=<NllLossBackward>)



 36%|███▌      | 125/346 [10:55<19:20,  5.25s/it][A

loss: tensor(0.9624, device='cuda:0', grad_fn=<NllLossBackward>)



 36%|███▋      | 126/346 [11:00<19:14,  5.25s/it][A

loss: tensor(0.9470, device='cuda:0', grad_fn=<NllLossBackward>)



 37%|███▋      | 127/346 [11:06<19:10,  5.26s/it][A

loss: tensor(0.9202, device='cuda:0', grad_fn=<NllLossBackward>)



 37%|███▋      | 128/346 [11:11<19:04,  5.25s/it][A

loss: tensor(0.9289, device='cuda:0', grad_fn=<NllLossBackward>)



 37%|███▋      | 129/346 [11:16<18:57,  5.24s/it][A

loss: tensor(0.9627, device='cuda:0', grad_fn=<NllLossBackward>)



 38%|███▊      | 130/346 [11:21<18:51,  5.24s/it][A

loss: tensor(0.9568, device='cuda:0', grad_fn=<NllLossBackward>)



 38%|███▊      | 131/346 [11:27<18:50,  5.26s/it][A

loss: tensor(0.9644, device='cuda:0', grad_fn=<NllLossBackward>)



 38%|███▊      | 132/346 [11:32<18:44,  5.25s/it][A

loss: tensor(0.9598, device='cuda:0', grad_fn=<NllLossBackward>)



 38%|███▊      | 133/346 [11:37<18:37,  5.24s/it][A

loss: tensor(0.9364, device='cuda:0', grad_fn=<NllLossBackward>)



 39%|███▊      | 134/346 [11:42<18:30,  5.24s/it][A

loss: tensor(0.9569, device='cuda:0', grad_fn=<NllLossBackward>)



 39%|███▉      | 135/346 [11:47<18:28,  5.25s/it][A

loss: tensor(0.9512, device='cuda:0', grad_fn=<NllLossBackward>)



 39%|███▉      | 136/346 [11:53<18:21,  5.25s/it][A

loss: tensor(0.9427, device='cuda:0', grad_fn=<NllLossBackward>)



 40%|███▉      | 137/346 [11:58<18:14,  5.24s/it][A

loss: tensor(0.9281, device='cuda:0', grad_fn=<NllLossBackward>)



 40%|███▉      | 138/346 [12:03<18:09,  5.24s/it][A

loss: tensor(0.9234, device='cuda:0', grad_fn=<NllLossBackward>)



 40%|████      | 139/346 [12:08<18:07,  5.25s/it][A

loss: tensor(0.9299, device='cuda:0', grad_fn=<NllLossBackward>)



 40%|████      | 140/346 [12:14<17:59,  5.24s/it][A

loss: tensor(0.9049, device='cuda:0', grad_fn=<NllLossBackward>)



 41%|████      | 141/346 [12:19<17:52,  5.23s/it][A

loss: tensor(0.9589, device='cuda:0', grad_fn=<NllLossBackward>)



 41%|████      | 142/346 [12:24<17:47,  5.23s/it][A

loss: tensor(0.9292, device='cuda:0', grad_fn=<NllLossBackward>)



 41%|████▏     | 143/346 [12:29<17:47,  5.26s/it][A

loss: tensor(0.9513, device='cuda:0', grad_fn=<NllLossBackward>)



 42%|████▏     | 144/346 [12:35<17:42,  5.26s/it][A

loss: tensor(0.9475, device='cuda:0', grad_fn=<NllLossBackward>)



 42%|████▏     | 145/346 [12:40<17:37,  5.26s/it][A

loss: tensor(0.9245, device='cuda:0', grad_fn=<NllLossBackward>)



 42%|████▏     | 146/346 [12:45<17:32,  5.26s/it][A

loss: tensor(0.9634, device='cuda:0', grad_fn=<NllLossBackward>)



 42%|████▏     | 147/346 [12:51<17:28,  5.27s/it][A

loss: tensor(0.9465, device='cuda:0', grad_fn=<NllLossBackward>)



 43%|████▎     | 148/346 [12:56<17:22,  5.26s/it][A

loss: tensor(0.9049, device='cuda:0', grad_fn=<NllLossBackward>)



 43%|████▎     | 149/346 [13:01<17:16,  5.26s/it][A

loss: tensor(0.9189, device='cuda:0', grad_fn=<NllLossBackward>)



 43%|████▎     | 150/346 [13:06<17:10,  5.26s/it][A

loss: tensor(0.9176, device='cuda:0', grad_fn=<NllLossBackward>)



 44%|████▎     | 151/346 [13:11<17:03,  5.25s/it][A

loss: tensor(0.9386, device='cuda:0', grad_fn=<NllLossBackward>)



 44%|████▍     | 152/346 [13:17<16:58,  5.25s/it][A

loss: tensor(0.9241, device='cuda:0', grad_fn=<NllLossBackward>)



 44%|████▍     | 153/346 [13:22<16:51,  5.24s/it][A

loss: tensor(0.9049, device='cuda:0', grad_fn=<NllLossBackward>)



 45%|████▍     | 154/346 [13:27<16:49,  5.26s/it][A

loss: tensor(0.9577, device='cuda:0', grad_fn=<NllLossBackward>)



 45%|████▍     | 155/346 [13:32<16:42,  5.25s/it][A

loss: tensor(0.9273, device='cuda:0', grad_fn=<NllLossBackward>)



 45%|████▌     | 156/346 [13:38<16:36,  5.24s/it][A

loss: tensor(0.9392, device='cuda:0', grad_fn=<NllLossBackward>)



 45%|████▌     | 157/346 [13:43<16:30,  5.24s/it][A

loss: tensor(0.9415, device='cuda:0', grad_fn=<NllLossBackward>)



 46%|████▌     | 158/346 [13:48<16:28,  5.26s/it][A

loss: tensor(0.9795, device='cuda:0', grad_fn=<NllLossBackward>)



 46%|████▌     | 159/346 [13:53<16:21,  5.25s/it][A

loss: tensor(0.9400, device='cuda:0', grad_fn=<NllLossBackward>)



 46%|████▌     | 160/346 [13:59<16:14,  5.24s/it][A

loss: tensor(0.9336, device='cuda:0', grad_fn=<NllLossBackward>)



 47%|████▋     | 161/346 [14:04<16:08,  5.24s/it][A

loss: tensor(0.9245, device='cuda:0', grad_fn=<NllLossBackward>)



 47%|████▋     | 162/346 [14:09<16:07,  5.26s/it][A

loss: tensor(0.9049, device='cuda:0', grad_fn=<NllLossBackward>)



 47%|████▋     | 163/346 [14:14<16:01,  5.25s/it][A

loss: tensor(0.9689, device='cuda:0', grad_fn=<NllLossBackward>)



 47%|████▋     | 164/346 [14:20<15:53,  5.24s/it][A

loss: tensor(0.9844, device='cuda:0', grad_fn=<NllLossBackward>)



 48%|████▊     | 165/346 [14:25<15:47,  5.24s/it][A

loss: tensor(1.0016, device='cuda:0', grad_fn=<NllLossBackward>)



 48%|████▊     | 166/346 [14:30<15:45,  5.25s/it][A

loss: tensor(0.9049, device='cuda:0', grad_fn=<NllLossBackward>)



 48%|████▊     | 167/346 [14:35<15:38,  5.24s/it][A

loss: tensor(0.9349, device='cuda:0', grad_fn=<NllLossBackward>)



 49%|████▊     | 168/346 [14:41<15:33,  5.24s/it][A

loss: tensor(0.9661, device='cuda:0', grad_fn=<NllLossBackward>)



 49%|████▉     | 169/346 [14:46<15:27,  5.24s/it][A

loss: tensor(0.9646, device='cuda:0', grad_fn=<NllLossBackward>)



 49%|████▉     | 170/346 [14:51<15:25,  5.26s/it][A

loss: tensor(0.9915, device='cuda:0', grad_fn=<NllLossBackward>)



 49%|████▉     | 171/346 [14:56<15:18,  5.25s/it][A

loss: tensor(0.9525, device='cuda:0', grad_fn=<NllLossBackward>)



 50%|████▉     | 172/346 [15:02<15:14,  5.25s/it][A

loss: tensor(0.9352, device='cuda:0', grad_fn=<NllLossBackward>)



 50%|█████     | 173/346 [15:07<15:11,  5.27s/it][A

loss: tensor(0.9406, device='cuda:0', grad_fn=<NllLossBackward>)



 50%|█████     | 174/346 [15:12<15:03,  5.25s/it][A

loss: tensor(0.9470, device='cuda:0', grad_fn=<NllLossBackward>)



 51%|█████     | 175/346 [15:17<14:56,  5.24s/it][A

loss: tensor(0.9421, device='cuda:0', grad_fn=<NllLossBackward>)



 51%|█████     | 176/346 [15:23<14:49,  5.23s/it][A

loss: tensor(0.9431, device='cuda:0', grad_fn=<NllLossBackward>)



 51%|█████     | 177/346 [15:28<14:47,  5.25s/it][A

loss: tensor(0.9589, device='cuda:0', grad_fn=<NllLossBackward>)



 51%|█████▏    | 178/346 [15:33<14:41,  5.24s/it][A

loss: tensor(0.9270, device='cuda:0', grad_fn=<NllLossBackward>)



 52%|█████▏    | 179/346 [15:38<14:36,  5.25s/it][A

loss: tensor(0.9049, device='cuda:0', grad_fn=<NllLossBackward>)



 52%|█████▏    | 180/346 [15:44<14:29,  5.24s/it][A

loss: tensor(0.9240, device='cuda:0', grad_fn=<NllLossBackward>)



 52%|█████▏    | 181/346 [15:49<14:30,  5.27s/it][A

loss: tensor(0.9698, device='cuda:0', grad_fn=<NllLossBackward>)



 53%|█████▎    | 182/346 [15:54<14:20,  5.25s/it][A

loss: tensor(0.9049, device='cuda:0', grad_fn=<NllLossBackward>)



 53%|█████▎    | 183/346 [15:59<14:15,  5.25s/it][A

loss: tensor(0.9188, device='cuda:0', grad_fn=<NllLossBackward>)



 53%|█████▎    | 184/346 [16:05<14:09,  5.24s/it][A

loss: tensor(0.9663, device='cuda:0', grad_fn=<NllLossBackward>)



 53%|█████▎    | 185/346 [16:10<14:08,  5.27s/it][A

loss: tensor(0.9224, device='cuda:0', grad_fn=<NllLossBackward>)



 54%|█████▍    | 186/346 [16:15<14:00,  5.25s/it][A

loss: tensor(0.9262, device='cuda:0', grad_fn=<NllLossBackward>)



 54%|█████▍    | 187/346 [16:20<13:54,  5.25s/it][A

loss: tensor(0.9204, device='cuda:0', grad_fn=<NllLossBackward>)



 54%|█████▍    | 188/346 [16:26<13:48,  5.25s/it][A

loss: tensor(0.9223, device='cuda:0', grad_fn=<NllLossBackward>)



 55%|█████▍    | 189/346 [16:31<13:47,  5.27s/it][A

loss: tensor(0.9701, device='cuda:0', grad_fn=<NllLossBackward>)



 55%|█████▍    | 190/346 [16:36<13:40,  5.26s/it][A

loss: tensor(0.9356, device='cuda:0', grad_fn=<NllLossBackward>)



 55%|█████▌    | 191/346 [16:41<13:33,  5.25s/it][A

loss: tensor(0.9429, device='cuda:0', grad_fn=<NllLossBackward>)



 55%|█████▌    | 192/346 [16:47<13:31,  5.27s/it][A

loss: tensor(0.9264, device='cuda:0', grad_fn=<NllLossBackward>)



 56%|█████▌    | 193/346 [16:52<13:24,  5.26s/it][A

loss: tensor(0.9049, device='cuda:0', grad_fn=<NllLossBackward>)



 56%|█████▌    | 194/346 [16:57<13:18,  5.25s/it][A

loss: tensor(0.9049, device='cuda:0', grad_fn=<NllLossBackward>)



 56%|█████▋    | 195/346 [17:02<13:12,  5.25s/it][A

loss: tensor(0.9724, device='cuda:0', grad_fn=<NllLossBackward>)



 57%|█████▋    | 196/346 [17:08<13:08,  5.26s/it][A

loss: tensor(0.9186, device='cuda:0', grad_fn=<NllLossBackward>)



 57%|█████▋    | 197/346 [17:13<13:01,  5.25s/it][A

loss: tensor(0.9581, device='cuda:0', grad_fn=<NllLossBackward>)



 57%|█████▋    | 198/346 [17:18<12:56,  5.24s/it][A

loss: tensor(0.9227, device='cuda:0', grad_fn=<NllLossBackward>)



 58%|█████▊    | 199/346 [17:23<12:50,  5.24s/it][A

loss: tensor(0.9525, device='cuda:0', grad_fn=<NllLossBackward>)



 58%|█████▊    | 200/346 [17:29<12:48,  5.26s/it][A

loss: tensor(0.9754, device='cuda:0', grad_fn=<NllLossBackward>)



 58%|█████▊    | 201/346 [17:34<12:41,  5.25s/it][A

loss: tensor(0.9415, device='cuda:0', grad_fn=<NllLossBackward>)



 58%|█████▊    | 202/346 [17:39<12:37,  5.26s/it][A

loss: tensor(0.9653, device='cuda:0', grad_fn=<NllLossBackward>)



 59%|█████▊    | 203/346 [17:45<12:30,  5.25s/it][A

loss: tensor(0.9878, device='cuda:0', grad_fn=<NllLossBackward>)



 59%|█████▉    | 204/346 [17:50<12:27,  5.27s/it][A

loss: tensor(0.9736, device='cuda:0', grad_fn=<NllLossBackward>)



 59%|█████▉    | 205/346 [17:55<12:21,  5.26s/it][A

loss: tensor(0.9391, device='cuda:0', grad_fn=<NllLossBackward>)



 60%|█████▉    | 206/346 [18:00<12:14,  5.25s/it][A

loss: tensor(0.9320, device='cuda:0', grad_fn=<NllLossBackward>)



 60%|█████▉    | 207/346 [18:06<12:09,  5.25s/it][A

loss: tensor(0.9202, device='cuda:0', grad_fn=<NllLossBackward>)



 60%|██████    | 208/346 [18:11<12:06,  5.27s/it][A

loss: tensor(0.9507, device='cuda:0', grad_fn=<NllLossBackward>)



 60%|██████    | 209/346 [18:16<12:01,  5.27s/it][A

loss: tensor(0.9684, device='cuda:0', grad_fn=<NllLossBackward>)



 61%|██████    | 210/346 [18:21<11:54,  5.25s/it][A

loss: tensor(0.9403, device='cuda:0', grad_fn=<NllLossBackward>)



 61%|██████    | 211/346 [18:27<11:49,  5.25s/it][A

loss: tensor(0.9152, device='cuda:0', grad_fn=<NllLossBackward>)



 61%|██████▏   | 212/346 [18:32<11:46,  5.27s/it][A

loss: tensor(0.9410, device='cuda:0', grad_fn=<NllLossBackward>)



 62%|██████▏   | 213/346 [18:37<11:39,  5.26s/it][A

loss: tensor(0.9433, device='cuda:0', grad_fn=<NllLossBackward>)



 62%|██████▏   | 214/346 [18:42<11:33,  5.25s/it][A

loss: tensor(0.9238, device='cuda:0', grad_fn=<NllLossBackward>)



 62%|██████▏   | 215/346 [18:48<11:30,  5.27s/it][A

loss: tensor(1.0056, device='cuda:0', grad_fn=<NllLossBackward>)



 62%|██████▏   | 216/346 [18:53<11:23,  5.25s/it][A

loss: tensor(0.9156, device='cuda:0', grad_fn=<NllLossBackward>)



 63%|██████▎   | 217/346 [18:58<11:17,  5.25s/it][A

loss: tensor(0.9715, device='cuda:0', grad_fn=<NllLossBackward>)



 63%|██████▎   | 218/346 [19:03<11:11,  5.24s/it][A

loss: tensor(0.9706, device='cuda:0', grad_fn=<NllLossBackward>)



 63%|██████▎   | 219/346 [19:09<11:06,  5.25s/it][A

loss: tensor(0.9157, device='cuda:0', grad_fn=<NllLossBackward>)



 64%|██████▎   | 220/346 [19:14<10:59,  5.24s/it][A

loss: tensor(0.9324, device='cuda:0', grad_fn=<NllLossBackward>)



 64%|██████▍   | 221/346 [19:19<10:54,  5.24s/it][A

loss: tensor(0.9453, device='cuda:0', grad_fn=<NllLossBackward>)



 64%|██████▍   | 222/346 [19:24<10:49,  5.24s/it][A

loss: tensor(1.0277, device='cuda:0', grad_fn=<NllLossBackward>)



 64%|██████▍   | 223/346 [19:30<10:45,  5.25s/it][A

loss: tensor(0.9137, device='cuda:0', grad_fn=<NllLossBackward>)



 65%|██████▍   | 224/346 [19:35<10:39,  5.25s/it][A

loss: tensor(0.9049, device='cuda:0', grad_fn=<NllLossBackward>)



 65%|██████▌   | 225/346 [19:40<10:33,  5.24s/it][A

loss: tensor(0.9243, device='cuda:0', grad_fn=<NllLossBackward>)



 65%|██████▌   | 226/346 [19:45<10:27,  5.23s/it][A

loss: tensor(0.9342, device='cuda:0', grad_fn=<NllLossBackward>)



 66%|██████▌   | 227/346 [19:51<10:25,  5.25s/it][A

loss: tensor(0.9197, device='cuda:0', grad_fn=<NllLossBackward>)



 66%|██████▌   | 228/346 [19:56<10:20,  5.26s/it][A

loss: tensor(0.9572, device='cuda:0', grad_fn=<NllLossBackward>)



 66%|██████▌   | 229/346 [20:01<10:13,  5.25s/it][A

loss: tensor(0.9906, device='cuda:0', grad_fn=<NllLossBackward>)



 66%|██████▋   | 230/346 [20:06<10:08,  5.25s/it][A

loss: tensor(0.9682, device='cuda:0', grad_fn=<NllLossBackward>)



 67%|██████▋   | 231/346 [20:12<10:05,  5.26s/it][A

loss: tensor(0.9424, device='cuda:0', grad_fn=<NllLossBackward>)



 67%|██████▋   | 232/346 [20:17<09:58,  5.25s/it][A

loss: tensor(0.9649, device='cuda:0', grad_fn=<NllLossBackward>)



 67%|██████▋   | 233/346 [20:22<09:52,  5.25s/it][A

loss: tensor(0.9295, device='cuda:0', grad_fn=<NllLossBackward>)



 68%|██████▊   | 234/346 [20:27<09:47,  5.25s/it][A

loss: tensor(1.0086, device='cuda:0', grad_fn=<NllLossBackward>)



 68%|██████▊   | 235/346 [20:33<09:44,  5.27s/it][A

loss: tensor(1.0625, device='cuda:0', grad_fn=<NllLossBackward>)



 68%|██████▊   | 236/346 [20:38<09:38,  5.26s/it][A

loss: tensor(0.9390, device='cuda:0', grad_fn=<NllLossBackward>)



 68%|██████▊   | 237/346 [20:43<09:31,  5.24s/it][A

loss: tensor(0.9049, device='cuda:0', grad_fn=<NllLossBackward>)



 69%|██████▉   | 238/346 [20:48<09:28,  5.26s/it][A

loss: tensor(0.9549, device='cuda:0', grad_fn=<NllLossBackward>)



 69%|██████▉   | 239/346 [20:54<09:21,  5.25s/it][A

loss: tensor(0.9170, device='cuda:0', grad_fn=<NllLossBackward>)



 69%|██████▉   | 240/346 [20:59<09:16,  5.25s/it][A

loss: tensor(0.9471, device='cuda:0', grad_fn=<NllLossBackward>)



 70%|██████▉   | 241/346 [21:04<09:11,  5.25s/it][A

loss: tensor(1.0011, device='cuda:0', grad_fn=<NllLossBackward>)



 70%|██████▉   | 242/346 [21:09<09:07,  5.26s/it][A

loss: tensor(0.9197, device='cuda:0', grad_fn=<NllLossBackward>)



 70%|███████   | 243/346 [21:15<08:59,  5.24s/it][A

loss: tensor(0.9709, device='cuda:0', grad_fn=<NllLossBackward>)



 71%|███████   | 244/346 [21:20<08:54,  5.24s/it][A

loss: tensor(0.9049, device='cuda:0', grad_fn=<NllLossBackward>)



 71%|███████   | 245/346 [21:25<08:49,  5.24s/it][A

loss: tensor(0.9813, device='cuda:0', grad_fn=<NllLossBackward>)



 71%|███████   | 246/346 [21:30<08:46,  5.26s/it][A

loss: tensor(0.9821, device='cuda:0', grad_fn=<NllLossBackward>)



 71%|███████▏  | 247/346 [21:36<08:39,  5.25s/it][A

loss: tensor(0.9520, device='cuda:0', grad_fn=<NllLossBackward>)



 72%|███████▏  | 248/346 [21:41<08:33,  5.24s/it][A

loss: tensor(0.9751, device='cuda:0', grad_fn=<NllLossBackward>)



 72%|███████▏  | 249/346 [21:46<08:27,  5.23s/it][A

loss: tensor(0.9536, device='cuda:0', grad_fn=<NllLossBackward>)



 72%|███████▏  | 250/346 [21:51<08:23,  5.25s/it][A

loss: tensor(1.0236, device='cuda:0', grad_fn=<NllLossBackward>)



 73%|███████▎  | 251/346 [21:57<08:18,  5.24s/it][A

loss: tensor(0.9306, device='cuda:0', grad_fn=<NllLossBackward>)



 73%|███████▎  | 252/346 [22:02<08:12,  5.24s/it][A

loss: tensor(0.9532, device='cuda:0', grad_fn=<NllLossBackward>)



 73%|███████▎  | 253/346 [22:07<08:06,  5.24s/it][A

loss: tensor(0.9425, device='cuda:0', grad_fn=<NllLossBackward>)



 73%|███████▎  | 254/346 [22:12<08:03,  5.25s/it][A

loss: tensor(0.9263, device='cuda:0', grad_fn=<NllLossBackward>)



 74%|███████▎  | 255/346 [22:17<07:57,  5.25s/it][A

loss: tensor(0.9592, device='cuda:0', grad_fn=<NllLossBackward>)



 74%|███████▍  | 256/346 [22:23<07:51,  5.24s/it][A

loss: tensor(0.9262, device='cuda:0', grad_fn=<NllLossBackward>)



 74%|███████▍  | 257/346 [22:28<07:46,  5.24s/it][A

loss: tensor(0.9398, device='cuda:0', grad_fn=<NllLossBackward>)



 75%|███████▍  | 258/346 [22:33<07:43,  5.26s/it][A

loss: tensor(0.9181, device='cuda:0', grad_fn=<NllLossBackward>)



 75%|███████▍  | 259/346 [22:39<07:37,  5.26s/it][A

loss: tensor(0.9745, device='cuda:0', grad_fn=<NllLossBackward>)



 75%|███████▌  | 260/346 [22:44<07:31,  5.25s/it][A

loss: tensor(0.9285, device='cuda:0', grad_fn=<NllLossBackward>)



 75%|███████▌  | 261/346 [22:49<07:27,  5.26s/it][A

loss: tensor(0.9652, device='cuda:0', grad_fn=<NllLossBackward>)



 76%|███████▌  | 262/346 [22:54<07:22,  5.27s/it][A

loss: tensor(0.9049, device='cuda:0', grad_fn=<NllLossBackward>)



 76%|███████▌  | 263/346 [23:00<07:16,  5.26s/it][A

loss: tensor(0.9477, device='cuda:0', grad_fn=<NllLossBackward>)



 76%|███████▋  | 264/346 [23:05<07:11,  5.26s/it][A

loss: tensor(0.9148, device='cuda:0', grad_fn=<NllLossBackward>)



 77%|███████▋  | 265/346 [23:10<07:06,  5.27s/it][A

loss: tensor(0.9154, device='cuda:0', grad_fn=<NllLossBackward>)



 77%|███████▋  | 266/346 [23:15<06:59,  5.25s/it][A

loss: tensor(0.9127, device='cuda:0', grad_fn=<NllLossBackward>)



 77%|███████▋  | 267/346 [23:21<06:54,  5.25s/it][A

loss: tensor(0.9364, device='cuda:0', grad_fn=<NllLossBackward>)



 77%|███████▋  | 268/346 [23:26<06:48,  5.24s/it][A

loss: tensor(0.9140, device='cuda:0', grad_fn=<NllLossBackward>)



 78%|███████▊  | 269/346 [23:31<06:44,  5.25s/it][A

loss: tensor(0.9506, device='cuda:0', grad_fn=<NllLossBackward>)



 78%|███████▊  | 270/346 [23:36<06:38,  5.25s/it][A

loss: tensor(0.9273, device='cuda:0', grad_fn=<NllLossBackward>)



 78%|███████▊  | 271/346 [23:42<06:33,  5.24s/it][A

loss: tensor(1.0012, device='cuda:0', grad_fn=<NllLossBackward>)



 79%|███████▊  | 272/346 [23:47<06:28,  5.25s/it][A

loss: tensor(0.9143, device='cuda:0', grad_fn=<NllLossBackward>)



 79%|███████▉  | 273/346 [23:52<06:23,  5.26s/it][A

loss: tensor(0.9178, device='cuda:0', grad_fn=<NllLossBackward>)



 79%|███████▉  | 274/346 [23:57<06:17,  5.24s/it][A

loss: tensor(0.9455, device='cuda:0', grad_fn=<NllLossBackward>)



 79%|███████▉  | 275/346 [24:02<06:11,  5.23s/it][A

loss: tensor(0.9152, device='cuda:0', grad_fn=<NllLossBackward>)



 80%|███████▉  | 276/346 [24:08<06:05,  5.22s/it][A

loss: tensor(0.9197, device='cuda:0', grad_fn=<NllLossBackward>)



 80%|████████  | 277/346 [24:13<06:01,  5.23s/it][A

loss: tensor(0.9226, device='cuda:0', grad_fn=<NllLossBackward>)



 80%|████████  | 278/346 [24:18<05:55,  5.22s/it][A

loss: tensor(0.9619, device='cuda:0', grad_fn=<NllLossBackward>)



 81%|████████  | 279/346 [24:23<05:49,  5.22s/it][A

loss: tensor(0.9662, device='cuda:0', grad_fn=<NllLossBackward>)



 81%|████████  | 280/346 [24:29<05:44,  5.22s/it][A

loss: tensor(0.9172, device='cuda:0', grad_fn=<NllLossBackward>)



 81%|████████  | 281/346 [24:34<05:40,  5.24s/it][A

loss: tensor(0.9842, device='cuda:0', grad_fn=<NllLossBackward>)



 82%|████████▏ | 282/346 [24:39<05:34,  5.23s/it][A

loss: tensor(0.9324, device='cuda:0', grad_fn=<NllLossBackward>)



 82%|████████▏ | 283/346 [24:44<05:29,  5.23s/it][A

loss: tensor(0.9379, device='cuda:0', grad_fn=<NllLossBackward>)



 82%|████████▏ | 284/346 [24:50<05:24,  5.23s/it][A

loss: tensor(0.9049, device='cuda:0', grad_fn=<NllLossBackward>)



 82%|████████▏ | 285/346 [24:55<05:20,  5.25s/it][A

loss: tensor(0.9586, device='cuda:0', grad_fn=<NllLossBackward>)



 83%|████████▎ | 286/346 [25:00<05:14,  5.24s/it][A

loss: tensor(0.9049, device='cuda:0', grad_fn=<NllLossBackward>)



 83%|████████▎ | 287/346 [25:05<05:09,  5.24s/it][A

loss: tensor(0.9411, device='cuda:0', grad_fn=<NllLossBackward>)



 83%|████████▎ | 288/346 [25:11<05:04,  5.26s/it][A

loss: tensor(0.9510, device='cuda:0', grad_fn=<NllLossBackward>)



 84%|████████▎ | 289/346 [25:16<04:58,  5.24s/it][A

loss: tensor(0.9390, device='cuda:0', grad_fn=<NllLossBackward>)



 84%|████████▍ | 290/346 [25:21<04:52,  5.23s/it][A

loss: tensor(0.9049, device='cuda:0', grad_fn=<NllLossBackward>)



 84%|████████▍ | 291/346 [25:26<04:47,  5.22s/it][A

loss: tensor(0.9145, device='cuda:0', grad_fn=<NllLossBackward>)



 84%|████████▍ | 292/346 [25:31<04:42,  5.24s/it][A

loss: tensor(0.9547, device='cuda:0', grad_fn=<NllLossBackward>)



 85%|████████▍ | 293/346 [25:37<04:37,  5.23s/it][A

loss: tensor(0.9344, device='cuda:0', grad_fn=<NllLossBackward>)



 85%|████████▍ | 294/346 [25:42<04:32,  5.23s/it][A

loss: tensor(0.9358, device='cuda:0', grad_fn=<NllLossBackward>)



 85%|████████▌ | 295/346 [25:47<04:26,  5.22s/it][A

loss: tensor(0.9258, device='cuda:0', grad_fn=<NllLossBackward>)



 86%|████████▌ | 296/346 [25:52<04:21,  5.24s/it][A

loss: tensor(0.9920, device='cuda:0', grad_fn=<NllLossBackward>)



 86%|████████▌ | 297/346 [25:58<04:16,  5.23s/it][A

loss: tensor(0.9049, device='cuda:0', grad_fn=<NllLossBackward>)



 86%|████████▌ | 298/346 [26:03<04:10,  5.22s/it][A

loss: tensor(0.9640, device='cuda:0', grad_fn=<NllLossBackward>)



 86%|████████▋ | 299/346 [26:08<04:05,  5.22s/it][A

loss: tensor(0.9220, device='cuda:0', grad_fn=<NllLossBackward>)



 87%|████████▋ | 300/346 [26:13<04:01,  5.24s/it][A

loss: tensor(0.9705, device='cuda:0', grad_fn=<NllLossBackward>)



 87%|████████▋ | 301/346 [26:19<03:55,  5.23s/it][A

loss: tensor(0.9391, device='cuda:0', grad_fn=<NllLossBackward>)



 87%|████████▋ | 302/346 [26:24<03:49,  5.22s/it][A

loss: tensor(0.9049, device='cuda:0', grad_fn=<NllLossBackward>)



 88%|████████▊ | 303/346 [26:29<03:44,  5.21s/it][A

loss: tensor(0.9402, device='cuda:0', grad_fn=<NllLossBackward>)



 88%|████████▊ | 304/346 [26:34<03:39,  5.24s/it][A

loss: tensor(0.9283, device='cuda:0', grad_fn=<NllLossBackward>)



 88%|████████▊ | 305/346 [26:39<03:34,  5.23s/it][A

loss: tensor(1.0543, device='cuda:0', grad_fn=<NllLossBackward>)



 88%|████████▊ | 306/346 [26:45<03:29,  5.23s/it][A

loss: tensor(0.9796, device='cuda:0', grad_fn=<NllLossBackward>)



 89%|████████▊ | 307/346 [26:50<03:23,  5.22s/it][A

loss: tensor(0.9237, device='cuda:0', grad_fn=<NllLossBackward>)



 89%|████████▉ | 308/346 [26:55<03:18,  5.24s/it][A

loss: tensor(0.9191, device='cuda:0', grad_fn=<NllLossBackward>)



 89%|████████▉ | 309/346 [27:00<03:13,  5.22s/it][A

loss: tensor(0.9629, device='cuda:0', grad_fn=<NllLossBackward>)



 90%|████████▉ | 310/346 [27:06<03:08,  5.22s/it][A

loss: tensor(0.9483, device='cuda:0', grad_fn=<NllLossBackward>)



 90%|████████▉ | 311/346 [27:11<03:03,  5.23s/it][A

loss: tensor(0.9049, device='cuda:0', grad_fn=<NllLossBackward>)



 90%|█████████ | 312/346 [27:16<02:57,  5.22s/it][A

loss: tensor(0.9745, device='cuda:0', grad_fn=<NllLossBackward>)



 90%|█████████ | 313/346 [27:21<02:51,  5.21s/it][A

loss: tensor(0.9258, device='cuda:0', grad_fn=<NllLossBackward>)



 91%|█████████ | 314/346 [27:26<02:46,  5.21s/it][A

loss: tensor(0.9049, device='cuda:0', grad_fn=<NllLossBackward>)



 91%|█████████ | 315/346 [27:32<02:41,  5.22s/it][A

loss: tensor(0.9049, device='cuda:0', grad_fn=<NllLossBackward>)



 91%|█████████▏| 316/346 [27:37<02:36,  5.22s/it][A

loss: tensor(0.9157, device='cuda:0', grad_fn=<NllLossBackward>)



 92%|█████████▏| 317/346 [27:42<02:31,  5.22s/it][A

loss: tensor(1.0209, device='cuda:0', grad_fn=<NllLossBackward>)



 92%|█████████▏| 318/346 [27:47<02:25,  5.21s/it][A

loss: tensor(0.9049, device='cuda:0', grad_fn=<NllLossBackward>)



 92%|█████████▏| 319/346 [27:53<02:21,  5.23s/it][A

loss: tensor(0.9375, device='cuda:0', grad_fn=<NllLossBackward>)



 92%|█████████▏| 320/346 [27:58<02:15,  5.22s/it][A

loss: tensor(0.9049, device='cuda:0', grad_fn=<NllLossBackward>)



 93%|█████████▎| 321/346 [28:03<02:10,  5.21s/it][A

loss: tensor(0.9655, device='cuda:0', grad_fn=<NllLossBackward>)



 93%|█████████▎| 322/346 [28:08<02:05,  5.21s/it][A

loss: tensor(0.9049, device='cuda:0', grad_fn=<NllLossBackward>)



 93%|█████████▎| 323/346 [28:13<02:00,  5.24s/it][A

loss: tensor(0.9258, device='cuda:0', grad_fn=<NllLossBackward>)



 94%|█████████▎| 324/346 [28:19<01:54,  5.23s/it][A

loss: tensor(0.9325, device='cuda:0', grad_fn=<NllLossBackward>)



 94%|█████████▍| 325/346 [28:24<01:49,  5.22s/it][A

loss: tensor(1.0117, device='cuda:0', grad_fn=<NllLossBackward>)



 94%|█████████▍| 326/346 [28:29<01:44,  5.22s/it][A

loss: tensor(0.9358, device='cuda:0', grad_fn=<NllLossBackward>)



 95%|█████████▍| 327/346 [28:34<01:39,  5.24s/it][A

loss: tensor(0.9888, device='cuda:0', grad_fn=<NllLossBackward>)



 95%|█████████▍| 328/346 [28:40<01:34,  5.23s/it][A

loss: tensor(0.9629, device='cuda:0', grad_fn=<NllLossBackward>)



 95%|█████████▌| 329/346 [28:45<01:28,  5.22s/it][A

loss: tensor(0.9049, device='cuda:0', grad_fn=<NllLossBackward>)



 95%|█████████▌| 330/346 [28:50<01:23,  5.24s/it][A

loss: tensor(0.9738, device='cuda:0', grad_fn=<NllLossBackward>)



 96%|█████████▌| 331/346 [28:55<01:18,  5.23s/it][A

loss: tensor(0.9319, device='cuda:0', grad_fn=<NllLossBackward>)



 96%|█████████▌| 332/346 [29:00<01:13,  5.23s/it][A

loss: tensor(0.9199, device='cuda:0', grad_fn=<NllLossBackward>)



 96%|█████████▌| 333/346 [29:06<01:07,  5.23s/it][A

loss: tensor(0.9242, device='cuda:0', grad_fn=<NllLossBackward>)



 97%|█████████▋| 334/346 [29:11<01:02,  5.24s/it][A

loss: tensor(0.9958, device='cuda:0', grad_fn=<NllLossBackward>)



 97%|█████████▋| 335/346 [29:16<00:57,  5.23s/it][A

loss: tensor(0.9254, device='cuda:0', grad_fn=<NllLossBackward>)



 97%|█████████▋| 336/346 [29:21<00:52,  5.24s/it][A

loss: tensor(0.9049, device='cuda:0', grad_fn=<NllLossBackward>)



 97%|█████████▋| 337/346 [29:27<00:47,  5.23s/it][A

loss: tensor(0.9984, device='cuda:0', grad_fn=<NllLossBackward>)



 98%|█████████▊| 338/346 [29:32<00:41,  5.24s/it][A

loss: tensor(0.9446, device='cuda:0', grad_fn=<NllLossBackward>)



 98%|█████████▊| 339/346 [29:37<00:36,  5.24s/it][A

loss: tensor(0.9392, device='cuda:0', grad_fn=<NllLossBackward>)



 98%|█████████▊| 340/346 [29:42<00:31,  5.23s/it][A

loss: tensor(0.9598, device='cuda:0', grad_fn=<NllLossBackward>)



 99%|█████████▊| 341/346 [29:48<00:26,  5.23s/it][A

loss: tensor(0.9176, device='cuda:0', grad_fn=<NllLossBackward>)



 99%|█████████▉| 342/346 [29:53<00:20,  5.24s/it][A

loss: tensor(0.9252, device='cuda:0', grad_fn=<NllLossBackward>)



 99%|█████████▉| 343/346 [29:58<00:15,  5.23s/it][A

loss: tensor(0.9657, device='cuda:0', grad_fn=<NllLossBackward>)



 99%|█████████▉| 344/346 [30:03<00:10,  5.22s/it][A

loss: tensor(0.9868, device='cuda:0', grad_fn=<NllLossBackward>)



100%|█████████▉| 345/346 [30:08<00:05,  5.22s/it][A

loss: tensor(0.9549, device='cuda:0', grad_fn=<NllLossBackward>)



100%|██████████| 346/346 [30:09<00:00,  5.23s/it][A

  0%|          | 0/173 [00:00<?, ?it/s][A

loss: tensor(0.9049, device='cuda:0', grad_fn=<NllLossBackward>)

	Training Loss: 0.9418866419034198

	Training acc: 0.9629750121891255

	Training prec: 0.9280773647800264

	Training rec: 0.9629750121891255

	Training f1: 0.9450146272070254

	Current Learning rate:  5e-06



  1%|          | 1/173 [00:00<01:45,  1.63it/s][A
  1%|          | 2/173 [00:01<01:52,  1.52it/s][A
  2%|▏         | 3/173 [00:01<01:53,  1.50it/s][A
  2%|▏         | 4/173 [00:02<01:49,  1.54it/s][A
  3%|▎         | 5/173 [00:03<01:51,  1.51it/s][A
  3%|▎         | 6/173 [00:03<01:52,  1.49it/s][A
  4%|▍         | 7/173 [00:04<01:48,  1.54it/s][A
  5%|▍         | 8/173 [00:05<01:49,  1.51it/s][A
  5%|▌         | 9/173 [00:05<01:50,  1.49it/s][A
  6%|▌         | 10/173 [00:06<01:46,  1.53it/s][A
  6%|▋         | 11/173 [00:07<01:47,  1.51it/s][A
  7%|▋         | 12/173 [00:07<01:48,  1.49it/s][A
  8%|▊         | 13/173 [00:08<01:44,  1.53it/s][A
  8%|▊         | 14/173 [00:09<01:45,  1.51it/s][A
  9%|▊         | 15/173 [00:09<01:46,  1.49it/s][A
  9%|▉         | 16/173 [00:10<01:42,  1.53it/s][A
 10%|▉         | 17/173 [00:11<01:43,  1.51it/s][A
 10%|█         | 18/173 [00:11<01:44,  1.49it/s][A
 11%|█         | 19/173 [00:12<01:40,  1.53it/s][A
 12%|█▏        | 20/


	Validation Loss: 0.9445773924706299

	Validation acc: 0.9602596023386318

	Validation prec: 0.9236050074678877

	Validation rec: 0.9602596023386318

	Validation f1: 0.9411998224994242



  0%|          | 1/346 [00:05<30:01,  5.22s/it][A

loss: tensor(0.9527, device='cuda:0', grad_fn=<NllLossBackward>)



  1%|          | 2/346 [00:10<30:04,  5.25s/it][A

loss: tensor(0.9833, device='cuda:0', grad_fn=<NllLossBackward>)



  1%|          | 3/346 [00:15<30:08,  5.27s/it][A

loss: tensor(0.9582, device='cuda:0', grad_fn=<NllLossBackward>)



  1%|          | 4/346 [00:20<29:54,  5.25s/it][A

loss: tensor(0.9138, device='cuda:0', grad_fn=<NllLossBackward>)



  1%|▏         | 5/346 [00:26<29:51,  5.25s/it][A

loss: tensor(0.9158, device='cuda:0', grad_fn=<NllLossBackward>)



  2%|▏         | 6/346 [00:31<29:43,  5.25s/it][A

loss: tensor(0.9380, device='cuda:0', grad_fn=<NllLossBackward>)



  2%|▏         | 7/346 [00:36<29:42,  5.26s/it][A

loss: tensor(0.9235, device='cuda:0', grad_fn=<NllLossBackward>)



  2%|▏         | 8/346 [00:41<29:32,  5.24s/it][A

loss: tensor(0.9334, device='cuda:0', grad_fn=<NllLossBackward>)



  3%|▎         | 9/346 [00:47<29:26,  5.24s/it][A

loss: tensor(0.9173, device='cuda:0', grad_fn=<NllLossBackward>)



  3%|▎         | 10/346 [00:52<29:18,  5.23s/it][A

loss: tensor(0.9789, device='cuda:0', grad_fn=<NllLossBackward>)



  3%|▎         | 11/346 [00:57<29:19,  5.25s/it][A

loss: tensor(0.9219, device='cuda:0', grad_fn=<NllLossBackward>)



  3%|▎         | 12/346 [01:02<29:10,  5.24s/it][A

loss: tensor(0.9226, device='cuda:0', grad_fn=<NllLossBackward>)



  4%|▍         | 13/346 [01:08<28:59,  5.22s/it][A

loss: tensor(0.9049, device='cuda:0', grad_fn=<NllLossBackward>)



  4%|▍         | 14/346 [01:13<28:52,  5.22s/it][A

loss: tensor(0.9243, device='cuda:0', grad_fn=<NllLossBackward>)



  4%|▍         | 15/346 [01:18<28:57,  5.25s/it][A

loss: tensor(0.9315, device='cuda:0', grad_fn=<NllLossBackward>)



  5%|▍         | 16/346 [01:23<28:47,  5.24s/it][A

loss: tensor(0.9049, device='cuda:0', grad_fn=<NllLossBackward>)



  5%|▍         | 17/346 [01:29<28:38,  5.22s/it][A

loss: tensor(0.9598, device='cuda:0', grad_fn=<NllLossBackward>)



  5%|▌         | 18/346 [01:34<28:31,  5.22s/it][A

loss: tensor(0.9931, device='cuda:0', grad_fn=<NllLossBackward>)



  5%|▌         | 19/346 [01:39<28:34,  5.24s/it][A

loss: tensor(0.9637, device='cuda:0', grad_fn=<NllLossBackward>)



  6%|▌         | 20/346 [01:44<28:25,  5.23s/it][A

loss: tensor(0.9708, device='cuda:0', grad_fn=<NllLossBackward>)



  6%|▌         | 21/346 [01:49<28:17,  5.22s/it][A

loss: tensor(0.9497, device='cuda:0', grad_fn=<NllLossBackward>)



  6%|▋         | 22/346 [01:55<28:16,  5.24s/it][A

loss: tensor(0.9325, device='cuda:0', grad_fn=<NllLossBackward>)



  7%|▋         | 23/346 [02:00<28:09,  5.23s/it][A

loss: tensor(0.9049, device='cuda:0', grad_fn=<NllLossBackward>)



  7%|▋         | 24/346 [02:05<28:02,  5.23s/it][A

loss: tensor(0.9049, device='cuda:0', grad_fn=<NllLossBackward>)



  7%|▋         | 25/346 [02:10<27:58,  5.23s/it][A

loss: tensor(0.9381, device='cuda:0', grad_fn=<NllLossBackward>)



  8%|▊         | 26/346 [02:16<27:58,  5.25s/it][A

loss: tensor(0.9249, device='cuda:0', grad_fn=<NllLossBackward>)



  8%|▊         | 27/346 [02:21<27:51,  5.24s/it][A

loss: tensor(0.9371, device='cuda:0', grad_fn=<NllLossBackward>)



  8%|▊         | 28/346 [02:26<27:44,  5.23s/it][A

loss: tensor(0.9197, device='cuda:0', grad_fn=<NllLossBackward>)



  8%|▊         | 29/346 [02:31<27:38,  5.23s/it][A

loss: tensor(0.9049, device='cuda:0', grad_fn=<NllLossBackward>)



  9%|▊         | 30/346 [02:37<27:38,  5.25s/it][A

loss: tensor(0.9800, device='cuda:0', grad_fn=<NllLossBackward>)



  9%|▉         | 31/346 [02:42<27:27,  5.23s/it][A

loss: tensor(0.9582, device='cuda:0', grad_fn=<NllLossBackward>)



  9%|▉         | 32/346 [02:47<27:21,  5.23s/it][A

loss: tensor(0.9412, device='cuda:0', grad_fn=<NllLossBackward>)



 10%|▉         | 33/346 [02:52<27:14,  5.22s/it][A

loss: tensor(0.9575, device='cuda:0', grad_fn=<NllLossBackward>)



 10%|▉         | 34/346 [02:58<27:16,  5.24s/it][A

loss: tensor(0.9186, device='cuda:0', grad_fn=<NllLossBackward>)



 10%|█         | 35/346 [03:03<27:09,  5.24s/it][A

loss: tensor(0.9251, device='cuda:0', grad_fn=<NllLossBackward>)



 10%|█         | 36/346 [03:08<27:02,  5.23s/it][A

loss: tensor(0.9510, device='cuda:0', grad_fn=<NllLossBackward>)



 11%|█         | 37/346 [03:13<26:57,  5.23s/it][A

loss: tensor(0.9251, device='cuda:0', grad_fn=<NllLossBackward>)



 11%|█         | 38/346 [03:19<26:58,  5.25s/it][A

loss: tensor(0.9561, device='cuda:0', grad_fn=<NllLossBackward>)



 11%|█▏        | 39/346 [03:24<26:50,  5.25s/it][A

loss: tensor(0.9145, device='cuda:0', grad_fn=<NllLossBackward>)



 12%|█▏        | 40/346 [03:29<26:41,  5.23s/it][A

loss: tensor(0.9049, device='cuda:0', grad_fn=<NllLossBackward>)



 12%|█▏        | 41/346 [03:34<26:34,  5.23s/it][A

loss: tensor(0.9545, device='cuda:0', grad_fn=<NllLossBackward>)



 12%|█▏        | 42/346 [03:39<26:34,  5.24s/it][A

loss: tensor(0.9909, device='cuda:0', grad_fn=<NllLossBackward>)



 12%|█▏        | 43/346 [03:45<26:25,  5.23s/it][A

loss: tensor(1.0015, device='cuda:0', grad_fn=<NllLossBackward>)



 13%|█▎        | 44/346 [03:50<26:18,  5.23s/it][A

loss: tensor(0.9310, device='cuda:0', grad_fn=<NllLossBackward>)



 13%|█▎        | 45/346 [03:55<26:13,  5.23s/it][A

loss: tensor(0.9458, device='cuda:0', grad_fn=<NllLossBackward>)



 13%|█▎        | 46/346 [04:00<26:14,  5.25s/it][A

loss: tensor(0.9265, device='cuda:0', grad_fn=<NllLossBackward>)



 14%|█▎        | 47/346 [04:06<26:08,  5.24s/it][A

loss: tensor(0.9407, device='cuda:0', grad_fn=<NllLossBackward>)



 14%|█▍        | 48/346 [04:11<26:01,  5.24s/it][A

loss: tensor(0.9379, device='cuda:0', grad_fn=<NllLossBackward>)



 14%|█▍        | 49/346 [04:16<25:58,  5.25s/it][A

loss: tensor(0.9244, device='cuda:0', grad_fn=<NllLossBackward>)



 14%|█▍        | 50/346 [04:21<25:49,  5.24s/it][A

loss: tensor(0.9049, device='cuda:0', grad_fn=<NllLossBackward>)



 15%|█▍        | 51/346 [04:27<25:43,  5.23s/it][A

loss: tensor(1.0139, device='cuda:0', grad_fn=<NllLossBackward>)



 15%|█▌        | 52/346 [04:32<25:37,  5.23s/it][A

loss: tensor(0.9241, device='cuda:0', grad_fn=<NllLossBackward>)



 15%|█▌        | 53/346 [04:37<25:36,  5.25s/it][A

loss: tensor(0.9344, device='cuda:0', grad_fn=<NllLossBackward>)



 16%|█▌        | 54/346 [04:42<25:29,  5.24s/it][A

loss: tensor(0.9404, device='cuda:0', grad_fn=<NllLossBackward>)



 16%|█▌        | 55/346 [04:48<25:22,  5.23s/it][A

loss: tensor(0.9288, device='cuda:0', grad_fn=<NllLossBackward>)



 16%|█▌        | 56/346 [04:53<25:15,  5.23s/it][A

loss: tensor(0.9352, device='cuda:0', grad_fn=<NllLossBackward>)



 16%|█▋        | 57/346 [04:58<25:16,  5.25s/it][A

loss: tensor(1.0240, device='cuda:0', grad_fn=<NllLossBackward>)



 17%|█▋        | 58/346 [05:03<25:14,  5.26s/it][A

loss: tensor(0.9049, device='cuda:0', grad_fn=<NllLossBackward>)



 17%|█▋        | 59/346 [05:09<25:17,  5.29s/it][A

loss: tensor(0.9049, device='cuda:0', grad_fn=<NllLossBackward>)



 17%|█▋        | 60/346 [05:14<25:20,  5.31s/it][A

loss: tensor(0.9516, device='cuda:0', grad_fn=<NllLossBackward>)



 18%|█▊        | 61/346 [05:19<25:25,  5.35s/it][A

loss: tensor(0.9264, device='cuda:0', grad_fn=<NllLossBackward>)



 18%|█▊        | 62/346 [05:25<25:24,  5.37s/it][A

loss: tensor(0.9192, device='cuda:0', grad_fn=<NllLossBackward>)



 18%|█▊        | 63/346 [05:30<25:20,  5.37s/it][A

loss: tensor(0.9540, device='cuda:0', grad_fn=<NllLossBackward>)



 18%|█▊        | 64/346 [05:36<25:19,  5.39s/it][A

loss: tensor(0.9218, device='cuda:0', grad_fn=<NllLossBackward>)



 19%|█▉        | 65/346 [05:41<25:20,  5.41s/it][A

loss: tensor(0.9127, device='cuda:0', grad_fn=<NllLossBackward>)



 19%|█▉        | 66/346 [05:47<25:14,  5.41s/it][A

loss: tensor(0.9289, device='cuda:0', grad_fn=<NllLossBackward>)



 19%|█▉        | 67/346 [05:52<25:10,  5.41s/it][A

loss: tensor(0.9629, device='cuda:0', grad_fn=<NllLossBackward>)



 20%|█▉        | 68/346 [05:57<25:04,  5.41s/it][A

loss: tensor(0.9319, device='cuda:0', grad_fn=<NllLossBackward>)



 20%|█▉        | 69/346 [06:03<25:02,  5.42s/it][A

loss: tensor(0.9049, device='cuda:0', grad_fn=<NllLossBackward>)



 20%|██        | 70/346 [06:08<24:53,  5.41s/it][A

loss: tensor(0.9707, device='cuda:0', grad_fn=<NllLossBackward>)



 21%|██        | 71/346 [06:14<24:43,  5.40s/it][A

loss: tensor(0.9186, device='cuda:0', grad_fn=<NllLossBackward>)



 21%|██        | 72/346 [06:19<24:42,  5.41s/it][A

loss: tensor(0.9459, device='cuda:0', grad_fn=<NllLossBackward>)



 21%|██        | 73/346 [06:24<24:34,  5.40s/it][A

loss: tensor(0.9541, device='cuda:0', grad_fn=<NllLossBackward>)



 21%|██▏       | 74/346 [06:30<24:28,  5.40s/it][A

loss: tensor(0.9699, device='cuda:0', grad_fn=<NllLossBackward>)



 22%|██▏       | 75/346 [06:35<24:20,  5.39s/it][A

loss: tensor(0.9625, device='cuda:0', grad_fn=<NllLossBackward>)



 22%|██▏       | 76/346 [06:41<24:17,  5.40s/it][A

loss: tensor(0.9364, device='cuda:0', grad_fn=<NllLossBackward>)



 22%|██▏       | 77/346 [06:46<24:11,  5.40s/it][A

loss: tensor(0.9406, device='cuda:0', grad_fn=<NllLossBackward>)



 23%|██▎       | 78/346 [06:51<24:04,  5.39s/it][A

loss: tensor(0.9347, device='cuda:0', grad_fn=<NllLossBackward>)



 23%|██▎       | 79/346 [06:57<23:56,  5.38s/it][A

loss: tensor(0.9595, device='cuda:0', grad_fn=<NllLossBackward>)



 23%|██▎       | 80/346 [07:02<23:57,  5.40s/it][A

loss: tensor(0.9965, device='cuda:0', grad_fn=<NllLossBackward>)



 23%|██▎       | 81/346 [07:08<23:48,  5.39s/it][A

loss: tensor(0.9465, device='cuda:0', grad_fn=<NllLossBackward>)



 24%|██▎       | 82/346 [07:13<23:41,  5.39s/it][A

loss: tensor(0.9404, device='cuda:0', grad_fn=<NllLossBackward>)



 24%|██▍       | 83/346 [07:18<23:36,  5.39s/it][A

loss: tensor(0.9392, device='cuda:0', grad_fn=<NllLossBackward>)



 24%|██▍       | 84/346 [07:24<23:36,  5.41s/it][A

loss: tensor(0.9368, device='cuda:0', grad_fn=<NllLossBackward>)



 25%|██▍       | 85/346 [07:29<23:27,  5.39s/it][A

loss: tensor(0.9912, device='cuda:0', grad_fn=<NllLossBackward>)



 25%|██▍       | 86/346 [07:35<23:21,  5.39s/it][A

loss: tensor(0.9687, device='cuda:0', grad_fn=<NllLossBackward>)



 25%|██▌       | 87/346 [07:40<23:14,  5.38s/it][A

loss: tensor(0.9305, device='cuda:0', grad_fn=<NllLossBackward>)



 25%|██▌       | 88/346 [07:45<23:13,  5.40s/it][A

loss: tensor(0.9626, device='cuda:0', grad_fn=<NllLossBackward>)



 26%|██▌       | 89/346 [07:51<23:05,  5.39s/it][A

loss: tensor(0.9748, device='cuda:0', grad_fn=<NllLossBackward>)



 26%|██▌       | 90/346 [07:56<22:27,  5.26s/it][A

loss: tensor(0.9049, device='cuda:0', grad_fn=<NllLossBackward>)



 26%|██▋       | 91/346 [08:01<22:42,  5.34s/it][A

loss: tensor(0.9422, device='cuda:0', grad_fn=<NllLossBackward>)



 27%|██▋       | 92/346 [08:07<22:39,  5.35s/it][A

loss: tensor(0.9464, device='cuda:0', grad_fn=<NllLossBackward>)



 27%|██▋       | 93/346 [08:12<22:40,  5.38s/it][A

loss: tensor(0.9152, device='cuda:0', grad_fn=<NllLossBackward>)



 27%|██▋       | 94/346 [08:17<22:37,  5.39s/it][A

loss: tensor(0.9424, device='cuda:0', grad_fn=<NllLossBackward>)



 27%|██▋       | 95/346 [08:23<22:34,  5.40s/it][A

loss: tensor(0.9554, device='cuda:0', grad_fn=<NllLossBackward>)



 28%|██▊       | 96/346 [08:28<22:28,  5.39s/it][A

loss: tensor(0.9569, device='cuda:0', grad_fn=<NllLossBackward>)



 28%|██▊       | 97/346 [08:34<22:26,  5.41s/it][A

loss: tensor(0.9275, device='cuda:0', grad_fn=<NllLossBackward>)



 28%|██▊       | 98/346 [08:39<22:19,  5.40s/it][A

loss: tensor(0.9049, device='cuda:0', grad_fn=<NllLossBackward>)



 29%|██▊       | 99/346 [08:44<22:18,  5.42s/it][A

loss: tensor(0.9281, device='cuda:0', grad_fn=<NllLossBackward>)



 29%|██▉       | 100/346 [08:50<22:14,  5.43s/it][A

loss: tensor(0.9300, device='cuda:0', grad_fn=<NllLossBackward>)



 29%|██▉       | 101/346 [08:55<22:09,  5.43s/it][A

loss: tensor(0.9563, device='cuda:0', grad_fn=<NllLossBackward>)



 29%|██▉       | 102/346 [09:01<22:01,  5.42s/it][A

loss: tensor(0.9132, device='cuda:0', grad_fn=<NllLossBackward>)



 30%|██▉       | 103/346 [09:06<21:59,  5.43s/it][A

loss: tensor(0.9421, device='cuda:0', grad_fn=<NllLossBackward>)



 30%|███       | 104/346 [09:12<21:49,  5.41s/it][A

loss: tensor(0.9201, device='cuda:0', grad_fn=<NllLossBackward>)



 30%|███       | 105/346 [09:17<21:43,  5.41s/it][A

loss: tensor(0.9454, device='cuda:0', grad_fn=<NllLossBackward>)



 31%|███       | 106/346 [09:22<21:37,  5.41s/it][A

loss: tensor(0.9597, device='cuda:0', grad_fn=<NllLossBackward>)



 31%|███       | 107/346 [09:28<21:34,  5.42s/it][A

loss: tensor(0.9718, device='cuda:0', grad_fn=<NllLossBackward>)



 31%|███       | 108/346 [09:33<21:24,  5.40s/it][A

loss: tensor(0.9749, device='cuda:0', grad_fn=<NllLossBackward>)



 32%|███▏      | 109/346 [09:39<21:17,  5.39s/it][A

loss: tensor(0.9283, device='cuda:0', grad_fn=<NllLossBackward>)



 32%|███▏      | 110/346 [09:44<21:10,  5.38s/it][A

loss: tensor(0.9327, device='cuda:0', grad_fn=<NllLossBackward>)



 32%|███▏      | 111/346 [09:49<21:11,  5.41s/it][A

loss: tensor(0.9199, device='cuda:0', grad_fn=<NllLossBackward>)



 32%|███▏      | 112/346 [09:55<21:05,  5.41s/it][A

loss: tensor(0.9334, device='cuda:0', grad_fn=<NllLossBackward>)



 33%|███▎      | 113/346 [10:00<20:56,  5.39s/it][A

loss: tensor(1.0053, device='cuda:0', grad_fn=<NllLossBackward>)



 33%|███▎      | 114/346 [10:06<20:50,  5.39s/it][A

loss: tensor(0.9695, device='cuda:0', grad_fn=<NllLossBackward>)



 33%|███▎      | 115/346 [10:11<20:47,  5.40s/it][A

loss: tensor(0.9616, device='cuda:0', grad_fn=<NllLossBackward>)



 34%|███▎      | 116/346 [10:16<20:39,  5.39s/it][A

loss: tensor(0.9349, device='cuda:0', grad_fn=<NllLossBackward>)



 34%|███▍      | 117/346 [10:22<20:35,  5.39s/it][A

loss: tensor(0.9600, device='cuda:0', grad_fn=<NllLossBackward>)



 34%|███▍      | 118/346 [10:27<20:32,  5.41s/it][A

loss: tensor(0.9891, device='cuda:0', grad_fn=<NllLossBackward>)



 34%|███▍      | 119/346 [10:33<20:26,  5.40s/it][A

loss: tensor(0.9297, device='cuda:0', grad_fn=<NllLossBackward>)



 35%|███▍      | 120/346 [10:38<20:21,  5.40s/it][A

loss: tensor(0.9414, device='cuda:0', grad_fn=<NllLossBackward>)



 35%|███▍      | 121/346 [10:43<20:14,  5.40s/it][A

loss: tensor(0.9428, device='cuda:0', grad_fn=<NllLossBackward>)



 35%|███▌      | 122/346 [10:49<20:10,  5.40s/it][A

loss: tensor(0.9347, device='cuda:0', grad_fn=<NllLossBackward>)



 36%|███▌      | 123/346 [10:54<20:02,  5.39s/it][A

loss: tensor(0.9499, device='cuda:0', grad_fn=<NllLossBackward>)



 36%|███▌      | 124/346 [11:00<19:57,  5.39s/it][A

loss: tensor(0.9175, device='cuda:0', grad_fn=<NllLossBackward>)



 36%|███▌      | 125/346 [11:05<19:51,  5.39s/it][A

loss: tensor(0.9295, device='cuda:0', grad_fn=<NllLossBackward>)



 36%|███▋      | 126/346 [11:10<19:48,  5.40s/it][A

loss: tensor(0.9804, device='cuda:0', grad_fn=<NllLossBackward>)



 37%|███▋      | 127/346 [11:16<19:41,  5.40s/it][A

loss: tensor(0.9913, device='cuda:0', grad_fn=<NllLossBackward>)



 37%|███▋      | 128/346 [11:21<19:33,  5.38s/it][A

loss: tensor(0.9805, device='cuda:0', grad_fn=<NllLossBackward>)



 37%|███▋      | 129/346 [11:27<19:31,  5.40s/it][A

loss: tensor(0.9488, device='cuda:0', grad_fn=<NllLossBackward>)



 38%|███▊      | 130/346 [11:32<19:28,  5.41s/it][A

loss: tensor(0.9321, device='cuda:0', grad_fn=<NllLossBackward>)



 38%|███▊      | 131/346 [11:37<19:22,  5.41s/it][A

loss: tensor(0.9650, device='cuda:0', grad_fn=<NllLossBackward>)



 38%|███▊      | 132/346 [11:43<19:15,  5.40s/it][A

loss: tensor(0.9377, device='cuda:0', grad_fn=<NllLossBackward>)



 38%|███▊      | 133/346 [11:48<19:10,  5.40s/it][A

loss: tensor(0.9333, device='cuda:0', grad_fn=<NllLossBackward>)



 39%|███▊      | 134/346 [11:54<19:09,  5.42s/it][A

loss: tensor(0.9049, device='cuda:0', grad_fn=<NllLossBackward>)



 39%|███▉      | 135/346 [11:59<19:02,  5.41s/it][A

loss: tensor(0.9384, device='cuda:0', grad_fn=<NllLossBackward>)



 39%|███▉      | 136/346 [12:04<18:54,  5.40s/it][A

loss: tensor(0.9229, device='cuda:0', grad_fn=<NllLossBackward>)



 40%|███▉      | 137/346 [12:10<18:46,  5.39s/it][A

loss: tensor(0.9432, device='cuda:0', grad_fn=<NllLossBackward>)



 40%|███▉      | 138/346 [12:15<18:45,  5.41s/it][A

loss: tensor(0.9049, device='cuda:0', grad_fn=<NllLossBackward>)



 40%|████      | 139/346 [12:21<18:39,  5.41s/it][A

loss: tensor(0.9195, device='cuda:0', grad_fn=<NllLossBackward>)



 40%|████      | 140/346 [12:25<17:57,  5.23s/it][A

loss: tensor(0.9290, device='cuda:0', grad_fn=<NllLossBackward>)



 41%|████      | 141/346 [12:30<17:29,  5.12s/it][A

loss: tensor(0.9381, device='cuda:0', grad_fn=<NllLossBackward>)



 41%|████      | 142/346 [12:35<17:05,  5.03s/it][A

loss: tensor(0.9698, device='cuda:0', grad_fn=<NllLossBackward>)



 41%|████▏     | 143/346 [12:40<17:14,  5.09s/it][A

loss: tensor(0.9049, device='cuda:0', grad_fn=<NllLossBackward>)



 42%|████▏     | 144/346 [12:46<17:25,  5.18s/it][A

loss: tensor(0.9432, device='cuda:0', grad_fn=<NllLossBackward>)



 42%|████▏     | 145/346 [12:51<17:35,  5.25s/it][A

loss: tensor(0.9155, device='cuda:0', grad_fn=<NllLossBackward>)



 42%|████▏     | 146/346 [12:56<17:36,  5.28s/it][A

loss: tensor(0.9241, device='cuda:0', grad_fn=<NllLossBackward>)



 42%|████▏     | 147/346 [13:02<17:37,  5.32s/it][A

loss: tensor(0.9049, device='cuda:0', grad_fn=<NllLossBackward>)



 43%|████▎     | 148/346 [13:07<17:03,  5.17s/it][A

loss: tensor(0.9584, device='cuda:0', grad_fn=<NllLossBackward>)



 43%|████▎     | 149/346 [13:12<16:41,  5.08s/it][A

loss: tensor(0.9342, device='cuda:0', grad_fn=<NllLossBackward>)



 43%|████▎     | 150/346 [13:17<16:45,  5.13s/it][A

loss: tensor(0.9569, device='cuda:0', grad_fn=<NllLossBackward>)



 44%|████▎     | 151/346 [13:22<16:53,  5.20s/it][A

loss: tensor(0.9376, device='cuda:0', grad_fn=<NllLossBackward>)



 44%|████▍     | 152/346 [13:28<16:57,  5.25s/it][A

loss: tensor(0.9527, device='cuda:0', grad_fn=<NllLossBackward>)



 44%|████▍     | 153/346 [13:33<17:04,  5.31s/it][A

loss: tensor(0.9704, device='cuda:0', grad_fn=<NllLossBackward>)



 45%|████▍     | 154/346 [13:38<17:01,  5.32s/it][A

loss: tensor(0.9345, device='cuda:0', grad_fn=<NllLossBackward>)



 45%|████▍     | 155/346 [13:44<16:56,  5.32s/it][A

loss: tensor(0.9215, device='cuda:0', grad_fn=<NllLossBackward>)



 45%|████▌     | 156/346 [13:49<16:53,  5.33s/it][A

loss: tensor(0.9154, device='cuda:0', grad_fn=<NllLossBackward>)



 45%|████▌     | 157/346 [13:54<16:54,  5.37s/it][A

loss: tensor(0.9049, device='cuda:0', grad_fn=<NllLossBackward>)



 46%|████▌     | 158/346 [14:00<16:49,  5.37s/it][A

loss: tensor(0.9200, device='cuda:0', grad_fn=<NllLossBackward>)



 46%|████▌     | 159/346 [14:05<16:41,  5.35s/it][A

loss: tensor(0.9623, device='cuda:0', grad_fn=<NllLossBackward>)



 46%|████▌     | 160/346 [14:11<16:39,  5.37s/it][A

loss: tensor(0.9245, device='cuda:0', grad_fn=<NllLossBackward>)



 47%|████▋     | 161/346 [14:16<16:33,  5.37s/it][A

loss: tensor(1.0208, device='cuda:0', grad_fn=<NllLossBackward>)



 47%|████▋     | 162/346 [14:21<16:28,  5.37s/it][A

loss: tensor(0.9810, device='cuda:0', grad_fn=<NllLossBackward>)



 47%|████▋     | 163/346 [14:27<16:22,  5.37s/it][A

loss: tensor(0.9937, device='cuda:0', grad_fn=<NllLossBackward>)



 47%|████▋     | 164/346 [14:32<16:20,  5.39s/it][A

loss: tensor(0.9667, device='cuda:0', grad_fn=<NllLossBackward>)



 48%|████▊     | 165/346 [14:37<16:12,  5.37s/it][A

loss: tensor(0.9426, device='cuda:0', grad_fn=<NllLossBackward>)



 48%|████▊     | 166/346 [14:43<16:05,  5.37s/it][A

loss: tensor(0.9769, device='cuda:0', grad_fn=<NllLossBackward>)



 48%|████▊     | 167/346 [14:48<16:00,  5.37s/it][A

loss: tensor(1.0059, device='cuda:0', grad_fn=<NllLossBackward>)



 49%|████▊     | 168/346 [14:54<15:58,  5.38s/it][A

loss: tensor(0.9216, device='cuda:0', grad_fn=<NllLossBackward>)



 49%|████▉     | 169/346 [14:59<15:50,  5.37s/it][A

loss: tensor(0.9276, device='cuda:0', grad_fn=<NllLossBackward>)



 49%|████▉     | 170/346 [15:04<15:35,  5.31s/it][A

loss: tensor(0.9175, device='cuda:0', grad_fn=<NllLossBackward>)



 49%|████▉     | 171/346 [15:09<15:23,  5.28s/it][A

loss: tensor(0.9924, device='cuda:0', grad_fn=<NllLossBackward>)



 50%|████▉     | 172/346 [15:15<15:18,  5.28s/it][A

loss: tensor(0.9680, device='cuda:0', grad_fn=<NllLossBackward>)



 50%|█████     | 173/346 [15:20<15:09,  5.26s/it][A

loss: tensor(0.9049, device='cuda:0', grad_fn=<NllLossBackward>)



 50%|█████     | 174/346 [15:25<15:02,  5.25s/it][A

loss: tensor(0.9457, device='cuda:0', grad_fn=<NllLossBackward>)



 51%|█████     | 175/346 [15:30<14:56,  5.24s/it][A

loss: tensor(0.9049, device='cuda:0', grad_fn=<NllLossBackward>)



 51%|█████     | 176/346 [15:36<14:53,  5.25s/it][A

loss: tensor(0.9477, device='cuda:0', grad_fn=<NllLossBackward>)



 51%|█████     | 177/346 [15:41<14:46,  5.24s/it][A

loss: tensor(0.9188, device='cuda:0', grad_fn=<NllLossBackward>)



 51%|█████▏    | 178/346 [15:46<14:41,  5.25s/it][A

loss: tensor(0.9316, device='cuda:0', grad_fn=<NllLossBackward>)



 52%|█████▏    | 179/346 [15:51<14:35,  5.24s/it][A

loss: tensor(0.9520, device='cuda:0', grad_fn=<NllLossBackward>)



 52%|█████▏    | 180/346 [15:57<14:33,  5.26s/it][A

loss: tensor(0.9890, device='cuda:0', grad_fn=<NllLossBackward>)



 52%|█████▏    | 181/346 [16:02<14:29,  5.27s/it][A

loss: tensor(0.9599, device='cuda:0', grad_fn=<NllLossBackward>)



 53%|█████▎    | 182/346 [16:07<14:21,  5.25s/it][A

loss: tensor(0.9449, device='cuda:0', grad_fn=<NllLossBackward>)



 53%|█████▎    | 183/346 [16:12<14:18,  5.27s/it][A

loss: tensor(0.9261, device='cuda:0', grad_fn=<NllLossBackward>)



 53%|█████▎    | 184/346 [16:18<14:11,  5.25s/it][A

loss: tensor(0.9049, device='cuda:0', grad_fn=<NllLossBackward>)



 53%|█████▎    | 185/346 [16:23<14:04,  5.24s/it][A

loss: tensor(0.9049, device='cuda:0', grad_fn=<NllLossBackward>)



 54%|█████▍    | 186/346 [16:28<13:56,  5.23s/it][A

loss: tensor(0.9294, device='cuda:0', grad_fn=<NllLossBackward>)



 54%|█████▍    | 187/346 [16:33<13:54,  5.25s/it][A

loss: tensor(0.9049, device='cuda:0', grad_fn=<NllLossBackward>)



 54%|█████▍    | 188/346 [16:39<13:48,  5.24s/it][A

loss: tensor(0.9237, device='cuda:0', grad_fn=<NllLossBackward>)



 55%|█████▍    | 189/346 [16:44<13:42,  5.24s/it][A

loss: tensor(0.9177, device='cuda:0', grad_fn=<NllLossBackward>)



 55%|█████▍    | 190/346 [16:49<13:36,  5.24s/it][A

loss: tensor(0.9734, device='cuda:0', grad_fn=<NllLossBackward>)



 55%|█████▌    | 191/346 [16:54<13:33,  5.25s/it][A

loss: tensor(0.9490, device='cuda:0', grad_fn=<NllLossBackward>)



 55%|█████▌    | 192/346 [17:00<13:28,  5.25s/it][A

loss: tensor(0.9648, device='cuda:0', grad_fn=<NllLossBackward>)



 56%|█████▌    | 193/346 [17:05<13:22,  5.25s/it][A

loss: tensor(0.9049, device='cuda:0', grad_fn=<NllLossBackward>)



 56%|█████▌    | 194/346 [17:10<13:17,  5.24s/it][A

loss: tensor(0.9878, device='cuda:0', grad_fn=<NllLossBackward>)



 56%|█████▋    | 195/346 [17:15<13:14,  5.26s/it][A

loss: tensor(0.9187, device='cuda:0', grad_fn=<NllLossBackward>)



 57%|█████▋    | 196/346 [17:21<13:08,  5.25s/it][A

loss: tensor(0.9400, device='cuda:0', grad_fn=<NllLossBackward>)



 57%|█████▋    | 197/346 [17:26<13:02,  5.25s/it][A

loss: tensor(0.9382, device='cuda:0', grad_fn=<NllLossBackward>)



 57%|█████▋    | 198/346 [17:31<12:56,  5.25s/it][A

loss: tensor(0.9240, device='cuda:0', grad_fn=<NllLossBackward>)



 58%|█████▊    | 199/346 [17:36<12:53,  5.26s/it][A

loss: tensor(0.9225, device='cuda:0', grad_fn=<NllLossBackward>)



 58%|█████▊    | 200/346 [17:42<12:46,  5.25s/it][A

loss: tensor(0.9404, device='cuda:0', grad_fn=<NllLossBackward>)



 58%|█████▊    | 201/346 [17:47<12:39,  5.24s/it][A

loss: tensor(0.9149, device='cuda:0', grad_fn=<NllLossBackward>)



 58%|█████▊    | 202/346 [17:52<12:34,  5.24s/it][A

loss: tensor(0.9146, device='cuda:0', grad_fn=<NllLossBackward>)



 59%|█████▊    | 203/346 [17:57<12:29,  5.24s/it][A

loss: tensor(0.9856, device='cuda:0', grad_fn=<NllLossBackward>)



 59%|█████▉    | 204/346 [18:02<12:23,  5.24s/it][A

loss: tensor(0.9505, device='cuda:0', grad_fn=<NllLossBackward>)



 59%|█████▉    | 205/346 [18:08<12:17,  5.23s/it][A

loss: tensor(0.9309, device='cuda:0', grad_fn=<NllLossBackward>)



 60%|█████▉    | 206/346 [18:13<12:14,  5.25s/it][A

loss: tensor(0.9148, device='cuda:0', grad_fn=<NllLossBackward>)



 60%|█████▉    | 207/346 [18:18<12:10,  5.26s/it][A

loss: tensor(0.9049, device='cuda:0', grad_fn=<NllLossBackward>)



 60%|██████    | 208/346 [18:23<12:03,  5.24s/it][A

loss: tensor(0.9428, device='cuda:0', grad_fn=<NllLossBackward>)



 60%|██████    | 209/346 [18:29<11:58,  5.24s/it][A

loss: tensor(0.9452, device='cuda:0', grad_fn=<NllLossBackward>)



 61%|██████    | 210/346 [18:34<11:55,  5.26s/it][A

loss: tensor(0.9210, device='cuda:0', grad_fn=<NllLossBackward>)



 61%|██████    | 211/346 [18:39<11:47,  5.24s/it][A

loss: tensor(0.9606, device='cuda:0', grad_fn=<NllLossBackward>)



 61%|██████▏   | 212/346 [18:44<11:42,  5.24s/it][A

loss: tensor(0.9230, device='cuda:0', grad_fn=<NllLossBackward>)



 62%|██████▏   | 213/346 [18:50<11:37,  5.24s/it][A

loss: tensor(0.9728, device='cuda:0', grad_fn=<NllLossBackward>)



 62%|██████▏   | 214/346 [18:55<11:33,  5.26s/it][A

loss: tensor(0.9828, device='cuda:0', grad_fn=<NllLossBackward>)



 62%|██████▏   | 215/346 [19:00<11:27,  5.25s/it][A

loss: tensor(0.9330, device='cuda:0', grad_fn=<NllLossBackward>)



 62%|██████▏   | 216/346 [19:05<11:21,  5.24s/it][A

loss: tensor(0.9409, device='cuda:0', grad_fn=<NllLossBackward>)



 63%|██████▎   | 217/346 [19:11<11:14,  5.23s/it][A

loss: tensor(0.9528, device='cuda:0', grad_fn=<NllLossBackward>)



 63%|██████▎   | 218/346 [19:16<11:11,  5.24s/it][A

loss: tensor(0.9423, device='cuda:0', grad_fn=<NllLossBackward>)



 63%|██████▎   | 219/346 [19:21<11:05,  5.24s/it][A

loss: tensor(0.9164, device='cuda:0', grad_fn=<NllLossBackward>)



 64%|██████▎   | 220/346 [19:26<10:59,  5.23s/it][A

loss: tensor(0.9220, device='cuda:0', grad_fn=<NllLossBackward>)



 64%|██████▍   | 221/346 [19:32<10:53,  5.23s/it][A

loss: tensor(0.9671, device='cuda:0', grad_fn=<NllLossBackward>)



 64%|██████▍   | 222/346 [19:37<10:49,  5.24s/it][A

loss: tensor(0.9506, device='cuda:0', grad_fn=<NllLossBackward>)



 64%|██████▍   | 223/346 [19:42<10:43,  5.23s/it][A

loss: tensor(0.9330, device='cuda:0', grad_fn=<NllLossBackward>)



 65%|██████▍   | 224/346 [19:47<10:36,  5.22s/it][A

loss: tensor(0.9182, device='cuda:0', grad_fn=<NllLossBackward>)



 65%|██████▌   | 225/346 [19:52<10:30,  5.21s/it][A

loss: tensor(0.9166, device='cuda:0', grad_fn=<NllLossBackward>)



 65%|██████▌   | 226/346 [19:58<10:27,  5.23s/it][A

loss: tensor(0.9744, device='cuda:0', grad_fn=<NllLossBackward>)



 66%|██████▌   | 227/346 [20:03<10:21,  5.22s/it][A

loss: tensor(0.9245, device='cuda:0', grad_fn=<NllLossBackward>)



 66%|██████▌   | 228/346 [20:08<10:15,  5.22s/it][A

loss: tensor(0.9049, device='cuda:0', grad_fn=<NllLossBackward>)



 66%|██████▌   | 229/346 [20:13<10:09,  5.21s/it][A

loss: tensor(0.9049, device='cuda:0', grad_fn=<NllLossBackward>)



 66%|██████▋   | 230/346 [20:19<10:06,  5.23s/it][A

loss: tensor(0.9718, device='cuda:0', grad_fn=<NllLossBackward>)



 67%|██████▋   | 231/346 [20:24<10:00,  5.22s/it][A

loss: tensor(0.9367, device='cuda:0', grad_fn=<NllLossBackward>)



 67%|██████▋   | 232/346 [20:29<09:53,  5.21s/it][A

loss: tensor(0.9866, device='cuda:0', grad_fn=<NllLossBackward>)



 67%|██████▋   | 233/346 [20:34<09:50,  5.22s/it][A

loss: tensor(0.9773, device='cuda:0', grad_fn=<NllLossBackward>)



 68%|██████▊   | 234/346 [20:39<09:43,  5.21s/it][A

loss: tensor(0.9143, device='cuda:0', grad_fn=<NllLossBackward>)



 68%|██████▊   | 235/346 [20:45<09:37,  5.20s/it][A

loss: tensor(0.9366, device='cuda:0', grad_fn=<NllLossBackward>)



 68%|██████▊   | 236/346 [20:50<09:31,  5.20s/it][A

loss: tensor(0.9207, device='cuda:0', grad_fn=<NllLossBackward>)



 68%|██████▊   | 237/346 [20:55<09:28,  5.21s/it][A

loss: tensor(0.9049, device='cuda:0', grad_fn=<NllLossBackward>)



 69%|██████▉   | 238/346 [21:00<09:22,  5.21s/it][A

loss: tensor(0.9314, device='cuda:0', grad_fn=<NllLossBackward>)



 69%|██████▉   | 239/346 [21:05<09:16,  5.20s/it][A

loss: tensor(0.9914, device='cuda:0', grad_fn=<NllLossBackward>)



 69%|██████▉   | 240/346 [21:11<09:10,  5.19s/it][A

loss: tensor(0.9159, device='cuda:0', grad_fn=<NllLossBackward>)



 70%|██████▉   | 241/346 [21:16<09:07,  5.21s/it][A

loss: tensor(0.9319, device='cuda:0', grad_fn=<NllLossBackward>)



 70%|██████▉   | 242/346 [21:21<09:01,  5.21s/it][A

loss: tensor(0.9415, device='cuda:0', grad_fn=<NllLossBackward>)



 70%|███████   | 243/346 [21:26<08:57,  5.22s/it][A

loss: tensor(0.9562, device='cuda:0', grad_fn=<NllLossBackward>)



 71%|███████   | 244/346 [21:31<08:51,  5.22s/it][A

loss: tensor(0.9286, device='cuda:0', grad_fn=<NllLossBackward>)



 71%|███████   | 245/346 [21:37<08:48,  5.23s/it][A

loss: tensor(0.9419, device='cuda:0', grad_fn=<NllLossBackward>)



 71%|███████   | 246/346 [21:42<08:42,  5.22s/it][A

loss: tensor(0.9049, device='cuda:0', grad_fn=<NllLossBackward>)



 71%|███████▏  | 247/346 [21:47<08:36,  5.21s/it][A

loss: tensor(0.9148, device='cuda:0', grad_fn=<NllLossBackward>)



 72%|███████▏  | 248/346 [21:52<08:30,  5.21s/it][A

loss: tensor(0.9167, device='cuda:0', grad_fn=<NllLossBackward>)



 72%|███████▏  | 249/346 [21:58<08:25,  5.21s/it][A

loss: tensor(0.9627, device='cuda:0', grad_fn=<NllLossBackward>)



 72%|███████▏  | 250/346 [22:03<08:19,  5.21s/it][A

loss: tensor(0.9499, device='cuda:0', grad_fn=<NllLossBackward>)



 73%|███████▎  | 251/346 [22:08<08:14,  5.20s/it][A

loss: tensor(0.9202, device='cuda:0', grad_fn=<NllLossBackward>)



 73%|███████▎  | 252/346 [22:13<08:08,  5.20s/it][A

loss: tensor(0.9704, device='cuda:0', grad_fn=<NllLossBackward>)



 73%|███████▎  | 253/346 [22:18<08:04,  5.21s/it][A

loss: tensor(0.9272, device='cuda:0', grad_fn=<NllLossBackward>)



 73%|███████▎  | 254/346 [22:24<07:59,  5.21s/it][A

loss: tensor(0.9049, device='cuda:0', grad_fn=<NllLossBackward>)



 74%|███████▎  | 255/346 [22:29<07:53,  5.20s/it][A

loss: tensor(0.9831, device='cuda:0', grad_fn=<NllLossBackward>)



 74%|███████▍  | 256/346 [22:34<07:49,  5.22s/it][A

loss: tensor(0.9521, device='cuda:0', grad_fn=<NllLossBackward>)



 74%|███████▍  | 257/346 [22:39<07:43,  5.20s/it][A

loss: tensor(0.9878, device='cuda:0', grad_fn=<NllLossBackward>)



 75%|███████▍  | 258/346 [22:44<07:37,  5.20s/it][A

loss: tensor(0.9179, device='cuda:0', grad_fn=<NllLossBackward>)



 75%|███████▍  | 259/346 [22:50<07:32,  5.20s/it][A

loss: tensor(0.9706, device='cuda:0', grad_fn=<NllLossBackward>)



 75%|███████▌  | 260/346 [22:55<07:28,  5.22s/it][A

loss: tensor(0.9049, device='cuda:0', grad_fn=<NllLossBackward>)



 75%|███████▌  | 261/346 [23:00<07:22,  5.21s/it][A

loss: tensor(0.9497, device='cuda:0', grad_fn=<NllLossBackward>)



 76%|███████▌  | 262/346 [23:05<07:17,  5.21s/it][A

loss: tensor(0.9197, device='cuda:0', grad_fn=<NllLossBackward>)



 76%|███████▌  | 263/346 [23:10<07:11,  5.20s/it][A

loss: tensor(0.9463, device='cuda:0', grad_fn=<NllLossBackward>)



 76%|███████▋  | 264/346 [23:16<07:07,  5.22s/it][A

loss: tensor(0.9187, device='cuda:0', grad_fn=<NllLossBackward>)



 77%|███████▋  | 265/346 [23:21<07:01,  5.20s/it][A

loss: tensor(1.0182, device='cuda:0', grad_fn=<NllLossBackward>)



 77%|███████▋  | 266/346 [23:26<06:55,  5.20s/it][A

loss: tensor(0.9632, device='cuda:0', grad_fn=<NllLossBackward>)



 77%|███████▋  | 267/346 [23:31<06:55,  5.26s/it][A

loss: tensor(0.9229, device='cuda:0', grad_fn=<NllLossBackward>)



 77%|███████▋  | 268/346 [23:37<06:54,  5.32s/it][A

loss: tensor(0.9403, device='cuda:0', grad_fn=<NllLossBackward>)



 78%|███████▊  | 269/346 [23:42<06:50,  5.34s/it][A

loss: tensor(1.0208, device='cuda:0', grad_fn=<NllLossBackward>)



 78%|███████▊  | 270/346 [23:48<06:46,  5.34s/it][A

loss: tensor(0.9522, device='cuda:0', grad_fn=<NllLossBackward>)



 78%|███████▊  | 271/346 [23:53<06:41,  5.35s/it][A

loss: tensor(0.9049, device='cuda:0', grad_fn=<NllLossBackward>)



 79%|███████▊  | 272/346 [23:58<06:38,  5.38s/it][A

loss: tensor(0.9140, device='cuda:0', grad_fn=<NllLossBackward>)



 79%|███████▉  | 273/346 [24:04<06:32,  5.38s/it][A

loss: tensor(0.9406, device='cuda:0', grad_fn=<NllLossBackward>)



 79%|███████▉  | 274/346 [24:09<06:27,  5.38s/it][A

loss: tensor(0.9810, device='cuda:0', grad_fn=<NllLossBackward>)



 79%|███████▉  | 275/346 [24:15<06:23,  5.40s/it][A

loss: tensor(1.0164, device='cuda:0', grad_fn=<NllLossBackward>)



 80%|███████▉  | 276/346 [24:20<06:17,  5.40s/it][A

loss: tensor(0.9857, device='cuda:0', grad_fn=<NllLossBackward>)



 80%|████████  | 277/346 [24:25<06:11,  5.39s/it][A

loss: tensor(0.9718, device='cuda:0', grad_fn=<NllLossBackward>)



 80%|████████  | 278/346 [24:31<06:05,  5.38s/it][A

loss: tensor(0.9632, device='cuda:0', grad_fn=<NllLossBackward>)



 81%|████████  | 279/346 [24:36<06:01,  5.40s/it][A

loss: tensor(0.9738, device='cuda:0', grad_fn=<NllLossBackward>)



 81%|████████  | 280/346 [24:42<05:55,  5.39s/it][A

loss: tensor(0.9243, device='cuda:0', grad_fn=<NllLossBackward>)



 81%|████████  | 281/346 [24:47<05:50,  5.39s/it][A

loss: tensor(0.9841, device='cuda:0', grad_fn=<NllLossBackward>)



 82%|████████▏ | 282/346 [24:52<05:44,  5.38s/it][A

loss: tensor(0.9643, device='cuda:0', grad_fn=<NllLossBackward>)



 82%|████████▏ | 283/346 [24:58<05:40,  5.40s/it][A

loss: tensor(0.9575, device='cuda:0', grad_fn=<NllLossBackward>)



 82%|████████▏ | 284/346 [25:03<05:34,  5.39s/it][A

loss: tensor(0.9599, device='cuda:0', grad_fn=<NllLossBackward>)



 82%|████████▏ | 285/346 [25:09<05:28,  5.39s/it][A

loss: tensor(0.9416, device='cuda:0', grad_fn=<NllLossBackward>)



 83%|████████▎ | 286/346 [25:14<05:23,  5.39s/it][A

loss: tensor(0.9426, device='cuda:0', grad_fn=<NllLossBackward>)



 83%|████████▎ | 287/346 [25:19<05:18,  5.41s/it][A

loss: tensor(0.9049, device='cuda:0', grad_fn=<NllLossBackward>)



 83%|████████▎ | 288/346 [25:25<05:12,  5.39s/it][A

loss: tensor(0.9198, device='cuda:0', grad_fn=<NllLossBackward>)



 84%|████████▎ | 289/346 [25:30<05:06,  5.38s/it][A

loss: tensor(0.9427, device='cuda:0', grad_fn=<NllLossBackward>)



 84%|████████▍ | 290/346 [25:35<05:01,  5.39s/it][A

loss: tensor(0.9291, device='cuda:0', grad_fn=<NllLossBackward>)



 84%|████████▍ | 291/346 [25:41<04:55,  5.37s/it][A

loss: tensor(0.9222, device='cuda:0', grad_fn=<NllLossBackward>)



 84%|████████▍ | 292/346 [25:46<04:47,  5.32s/it][A

loss: tensor(0.9904, device='cuda:0', grad_fn=<NllLossBackward>)



 85%|████████▍ | 293/346 [25:51<04:40,  5.29s/it][A

loss: tensor(0.9380, device='cuda:0', grad_fn=<NllLossBackward>)



 85%|████████▍ | 294/346 [25:56<04:33,  5.26s/it][A

loss: tensor(0.9612, device='cuda:0', grad_fn=<NllLossBackward>)



 85%|████████▌ | 295/346 [26:02<04:28,  5.27s/it][A

loss: tensor(0.9049, device='cuda:0', grad_fn=<NllLossBackward>)



 86%|████████▌ | 296/346 [26:07<04:22,  5.24s/it][A

loss: tensor(0.9527, device='cuda:0', grad_fn=<NllLossBackward>)



 86%|████████▌ | 297/346 [26:12<04:16,  5.23s/it][A

loss: tensor(0.9648, device='cuda:0', grad_fn=<NllLossBackward>)



 86%|████████▌ | 298/346 [26:17<04:11,  5.25s/it][A

loss: tensor(0.9293, device='cuda:0', grad_fn=<NllLossBackward>)



 86%|████████▋ | 299/346 [26:23<04:06,  5.24s/it][A

loss: tensor(0.9126, device='cuda:0', grad_fn=<NllLossBackward>)



 87%|████████▋ | 300/346 [26:28<04:00,  5.22s/it][A

loss: tensor(0.9564, device='cuda:0', grad_fn=<NllLossBackward>)



 87%|████████▋ | 301/346 [26:33<03:54,  5.22s/it][A

loss: tensor(0.9300, device='cuda:0', grad_fn=<NllLossBackward>)



 87%|████████▋ | 302/346 [26:38<03:50,  5.24s/it][A

loss: tensor(0.9341, device='cuda:0', grad_fn=<NllLossBackward>)



 88%|████████▊ | 303/346 [26:43<03:44,  5.23s/it][A

loss: tensor(0.9355, device='cuda:0', grad_fn=<NllLossBackward>)



 88%|████████▊ | 304/346 [26:49<03:39,  5.22s/it][A

loss: tensor(1.0091, device='cuda:0', grad_fn=<NllLossBackward>)



 88%|████████▊ | 305/346 [26:54<03:33,  5.21s/it][A

loss: tensor(0.9494, device='cuda:0', grad_fn=<NllLossBackward>)



 88%|████████▊ | 306/346 [26:59<03:28,  5.22s/it][A

loss: tensor(0.9882, device='cuda:0', grad_fn=<NllLossBackward>)



 89%|████████▊ | 307/346 [27:04<03:23,  5.21s/it][A

loss: tensor(0.9708, device='cuda:0', grad_fn=<NllLossBackward>)



 89%|████████▉ | 308/346 [27:09<03:18,  5.21s/it][A

loss: tensor(0.9687, device='cuda:0', grad_fn=<NllLossBackward>)



 89%|████████▉ | 309/346 [27:15<03:12,  5.20s/it][A

loss: tensor(0.9422, device='cuda:0', grad_fn=<NllLossBackward>)



 90%|████████▉ | 310/346 [27:20<03:08,  5.23s/it][A

loss: tensor(0.9903, device='cuda:0', grad_fn=<NllLossBackward>)



 90%|████████▉ | 311/346 [27:25<03:02,  5.23s/it][A

loss: tensor(0.9330, device='cuda:0', grad_fn=<NllLossBackward>)



 90%|█████████ | 312/346 [27:30<02:57,  5.22s/it][A

loss: tensor(0.9285, device='cuda:0', grad_fn=<NllLossBackward>)



 90%|█████████ | 313/346 [27:36<02:55,  5.31s/it][A

loss: tensor(0.9315, device='cuda:0', grad_fn=<NllLossBackward>)



 91%|█████████ | 314/346 [27:41<02:50,  5.32s/it][A

loss: tensor(0.9309, device='cuda:0', grad_fn=<NllLossBackward>)



 91%|█████████ | 315/346 [27:46<02:43,  5.29s/it][A

loss: tensor(0.9387, device='cuda:0', grad_fn=<NllLossBackward>)



 91%|█████████▏| 316/346 [27:52<02:37,  5.26s/it][A

loss: tensor(0.9278, device='cuda:0', grad_fn=<NllLossBackward>)



 92%|█████████▏| 317/346 [27:57<02:32,  5.25s/it][A

loss: tensor(0.9049, device='cuda:0', grad_fn=<NllLossBackward>)



 92%|█████████▏| 318/346 [28:02<02:27,  5.26s/it][A

loss: tensor(0.9126, device='cuda:0', grad_fn=<NllLossBackward>)



 92%|█████████▏| 319/346 [28:07<02:21,  5.25s/it][A

loss: tensor(0.9428, device='cuda:0', grad_fn=<NllLossBackward>)



 92%|█████████▏| 320/346 [28:13<02:16,  5.24s/it][A

loss: tensor(0.9456, device='cuda:0', grad_fn=<NllLossBackward>)



 93%|█████████▎| 321/346 [28:18<02:10,  5.22s/it][A

loss: tensor(0.9049, device='cuda:0', grad_fn=<NllLossBackward>)



 93%|█████████▎| 322/346 [28:23<02:05,  5.24s/it][A

loss: tensor(0.9317, device='cuda:0', grad_fn=<NllLossBackward>)



 93%|█████████▎| 323/346 [28:28<02:00,  5.23s/it][A

loss: tensor(0.9049, device='cuda:0', grad_fn=<NllLossBackward>)



 94%|█████████▎| 324/346 [28:33<01:54,  5.22s/it][A

loss: tensor(0.9385, device='cuda:0', grad_fn=<NllLossBackward>)



 94%|█████████▍| 325/346 [28:39<01:49,  5.23s/it][A

loss: tensor(0.9589, device='cuda:0', grad_fn=<NllLossBackward>)



 94%|█████████▍| 326/346 [28:44<01:44,  5.22s/it][A

loss: tensor(0.9527, device='cuda:0', grad_fn=<NllLossBackward>)



 95%|█████████▍| 327/346 [28:49<01:39,  5.22s/it][A

loss: tensor(0.9513, device='cuda:0', grad_fn=<NllLossBackward>)



 95%|█████████▍| 328/346 [28:54<01:33,  5.21s/it][A

loss: tensor(0.9198, device='cuda:0', grad_fn=<NllLossBackward>)



 95%|█████████▌| 329/346 [29:00<01:28,  5.23s/it][A

loss: tensor(0.9550, device='cuda:0', grad_fn=<NllLossBackward>)



 95%|█████████▌| 330/346 [29:05<01:23,  5.22s/it][A

loss: tensor(0.9200, device='cuda:0', grad_fn=<NllLossBackward>)



 96%|█████████▌| 331/346 [29:10<01:18,  5.22s/it][A

loss: tensor(0.9493, device='cuda:0', grad_fn=<NllLossBackward>)



 96%|█████████▌| 332/346 [29:15<01:12,  5.21s/it][A

loss: tensor(0.9628, device='cuda:0', grad_fn=<NllLossBackward>)



 96%|█████████▌| 333/346 [29:20<01:08,  5.23s/it][A

loss: tensor(0.9128, device='cuda:0', grad_fn=<NllLossBackward>)



 97%|█████████▋| 334/346 [29:26<01:02,  5.22s/it][A

loss: tensor(0.9049, device='cuda:0', grad_fn=<NllLossBackward>)



 97%|█████████▋| 335/346 [29:31<00:57,  5.25s/it][A

loss: tensor(0.9293, device='cuda:0', grad_fn=<NllLossBackward>)



 97%|█████████▋| 336/346 [29:36<00:52,  5.25s/it][A

loss: tensor(0.9143, device='cuda:0', grad_fn=<NllLossBackward>)



 97%|█████████▋| 337/346 [29:42<00:47,  5.26s/it][A

loss: tensor(0.9189, device='cuda:0', grad_fn=<NllLossBackward>)



 98%|█████████▊| 338/346 [29:47<00:41,  5.24s/it][A

loss: tensor(0.9725, device='cuda:0', grad_fn=<NllLossBackward>)



 98%|█████████▊| 339/346 [29:52<00:36,  5.24s/it][A

loss: tensor(1.0231, device='cuda:0', grad_fn=<NllLossBackward>)



 98%|█████████▊| 340/346 [29:57<00:31,  5.24s/it][A

loss: tensor(0.9435, device='cuda:0', grad_fn=<NllLossBackward>)



 99%|█████████▊| 341/346 [30:03<00:26,  5.27s/it][A

loss: tensor(0.9157, device='cuda:0', grad_fn=<NllLossBackward>)



 99%|█████████▉| 342/346 [30:08<00:21,  5.27s/it][A

loss: tensor(0.9364, device='cuda:0', grad_fn=<NllLossBackward>)



 99%|█████████▉| 343/346 [30:13<00:15,  5.26s/it][A

loss: tensor(0.9291, device='cuda:0', grad_fn=<NllLossBackward>)



 99%|█████████▉| 344/346 [30:18<00:10,  5.26s/it][A

loss: tensor(0.9533, device='cuda:0', grad_fn=<NllLossBackward>)



100%|█████████▉| 345/346 [30:24<00:05,  5.27s/it][A

loss: tensor(0.9343, device='cuda:0', grad_fn=<NllLossBackward>)



100%|██████████| 346/346 [30:24<00:00,  5.27s/it][A

  0%|          | 0/173 [00:00<?, ?it/s][A

loss: tensor(0.9049, device='cuda:0', grad_fn=<NllLossBackward>)

	Training Loss: 0.9420827516930641

	Training acc: 0.962777088865799

	Training prec: 0.9276663779050563

	Training rec: 0.962777088865799

	Training f1: 0.944712797120034

	Current Learning rate:  0.0



  1%|          | 1/173 [00:00<02:01,  1.42it/s][A
  1%|          | 2/173 [00:01<01:57,  1.46it/s][A
  2%|▏         | 3/173 [00:01<01:51,  1.53it/s][A
  2%|▏         | 4/173 [00:02<01:53,  1.49it/s][A
  3%|▎         | 5/173 [00:03<01:52,  1.49it/s][A
  3%|▎         | 6/173 [00:03<01:49,  1.53it/s][A
  4%|▍         | 7/173 [00:04<01:50,  1.50it/s][A
  5%|▍         | 8/173 [00:05<01:50,  1.50it/s][A
  5%|▌         | 9/173 [00:05<01:46,  1.54it/s][A
  6%|▌         | 10/173 [00:06<01:48,  1.51it/s][A
  6%|▋         | 11/173 [00:07<01:48,  1.50it/s][A
  7%|▋         | 12/173 [00:07<01:44,  1.54it/s][A
  8%|▊         | 13/173 [00:08<01:46,  1.51it/s][A
  8%|▊         | 14/173 [00:09<01:46,  1.50it/s][A
  9%|▊         | 15/173 [00:09<01:43,  1.53it/s][A
  9%|▉         | 16/173 [00:10<01:44,  1.50it/s][A
 10%|▉         | 17/173 [00:11<01:44,  1.50it/s][A
 10%|█         | 18/173 [00:11<01:41,  1.53it/s][A
 11%|█         | 19/173 [00:12<01:43,  1.49it/s][A
 12%|█▏        | 20/


	Validation Loss: 0.9447498338759979

	Validation acc: 0.9600869765316626

	Validation prec: 0.9233928177922559

	Validation rec: 0.9600869765316626

	Validation f1: 0.9409790707898106





### Evaluation on the test dataset

In [17]:

############ test eval metrics ######################
nb_test_steps = 0 # Tracking variables
test_loss = []
test_acc = []
test_prec = []
test_rec = []
test_f1 = []

########################################################
for batch in tqdm(test_loader):
    batch = tuple(batch[t].to(device) for t in batch)      # batch to GPU
    t_input_ids, t_input_mask, t_token_type_ids, t_labels, t_bio_tags = batch     # unpack inputs from dataloader

    with torch.no_grad(): # tell model not to compute or store gradients -> saves memory + speeds up validation
        model.eval() # put model in evaluation mode for validation set
        logits = model(**{"input_ids":t_input_ids, "attention_mask":t_input_mask, "token_type_ids":t_token_type_ids}) # forward pass, calculates logit predictions

    ######################################################

    # similar to the class RobertaForToken classification in transformers: https://github.com/huggingface/transformers/blob/master/src/transformers/models/roberta/modeling_roberta.py
    t_active_loss = t_input_mask.view(-1) == 1  # either based on attention_mask (includes <CLS>, <SEP> token)
    t_active_logits = logits.view(-1, N_bio_tags)[t_active_loss] # 5 
    t_active_tags = t_bio_tags.view(-1)[t_active_loss]
    t_loss = loss_fn(t_active_logits, t_active_tags)             
    test_loss.append(t_loss.item())

    #########################################################
    logits = logits.detach().to('cpu').numpy()
    tags_ids = t_bio_tags.to('cpu').numpy()

    # calculate performance measures only on tokens and not subwords or special tokens
    tags_mask = tags_ids != -100 # only get token labels and not labels from subwords or special tokens
    pred = np.argmax(logits, axis=2)[tags_mask] #.flatten() # convert logits to list of predicted labels
    tags = tags_ids[tags_mask]#.flatten()                          

    metrics = compute_metrics(pred, tags)
    test_acc.append(metrics["accuracy"])
    test_prec.append(metrics["precision"])
    test_rec.append(metrics["recall"])
    test_f1.append(metrics["f1"])

    nb_test_steps += 1

print(F'\n\tTest Loss: {np.mean(test_loss)}')
print(F'\n\tTest acc: {np.mean(test_acc)}')
print(F'\n\tTest prec: {np.mean(test_prec)}')
print(F'\n\tTest rec: {np.mean(test_rec)}')
print(F'\n\tTest f1: {np.mean(test_f1)}')


100%|██████████| 216/216 [02:56<00:00,  1.23it/s]


	Test Loss: 0.9381278394548981

	Test acc: 0.9667090029174668

	Test prec: 0.9359379798546343

	Test rec: 0.9667090029174668

	Test f1: 0.9507271474514754





### Save model

In [18]:
torch.save(model.state_dict(), "finetuned-NER-10-epochs.pth")

### Load model locally

In [21]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = CausalNER()
model.load_state_dict(torch.load("finetuned-NER-10-epochs.pth"))
model.to(device)
model.eval()

You are using a model of type roberta to instantiate a model of type bert. This is not supported for all configurations of models and can yield errors.
Some weights of the model checkpoint at vinai/bertweet-base were not used when initializing BertModel: ['roberta.encoder.layer.3.output.dense.weight', 'lm_head.decoder.weight', 'roberta.encoder.layer.6.intermediate.dense.bias', 'roberta.encoder.layer.8.output.dense.bias', 'roberta.encoder.layer.11.attention.self.value.bias', 'roberta.encoder.layer.1.attention.output.dense.weight', 'roberta.encoder.layer.3.output.LayerNorm.bias', 'roberta.encoder.layer.9.attention.self.key.bias', 'roberta.encoder.layer.0.intermediate.dense.weight', 'roberta.encoder.layer.5.intermediate.dense.bias', 'roberta.encoder.layer.9.attention.output.LayerNorm.weight', 'roberta.encoder.layer.3.attention.self.key.weight', 'roberta.encoder.layer.6.intermediate.dense.weight', 'roberta.encoder.layer.6.attention.self.query.bias', 'roberta.encoder.layer.5.attention.self.

CausalNER(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(64001, 768, padding_idx=1)
      (position_embeddings): Embedding(130, 768)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  

# Questions to Vivek?

### If there is only one cause and no effect ; or only one effect and no cause => ignore ? -> YES


### Small example

In [22]:
# Small steps
sample = trainingData.sample(n=5, random_state=11)[3:]
sample.head()

Unnamed: 0,tweet,Causal association,BIOtags
447,I've been light headed and shakey for the last...,1.0,"[O, O, O, B-E, I-E, O, B-E, O, O, O, O, O, O, ..."
7584,2 before to 0.,0.0,"[O, O, O, O, O]"


In [23]:
N_bio_tags = 5 
train_dataset = TweetDataSet(sample["tweet"].map(normalizeTweet).values.tolist()
                           , sample["Causal association"].values.tolist()
                           , sample["BIOtags"].values.tolist()
                           , tokenizer)

train_loader = DataLoader(train_dataset, batch_size=2, shuffle=True)

print("Tweet:")
print(sample.iloc[0]["tweet"])
print("BIO tags:")
print(sample.iloc[0]["BIOtags"])
print("\ntokenized:")
print(tokenizer.convert_ids_to_tokens(train_dataset[1]["input_ids"]))
print("BIO tags extended:")
print(train_dataset[0]["bio_tags"])
print("\nids:")
print(train_dataset[0]["input_ids"])
print("BIO tags extended:")
print(train_dataset[0]["bio_tags"])
print("attention mask:")
print(train_dataset[0]["attention_mask"])


Tweet:
I've been light headed and shakey for the last 4 hours due to low blood sugar and it's uncomfortable and debilitating !
BIO tags:
['O', 'O', 'O', 'B-E', 'I-E', 'O', 'B-E', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-C', 'I-C', 'I-C', 'O', 'O', 'O', 'O', 'O', 'O', 'O']

tokenized:
['<s>', '2', 'before', 'to', '0', '.', '</s>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>']
BIO tags extended:
tensor([-100,    0,    0,    0,    3,    4,    0,    3, -100,    0,    0,    0,
           0,    0,    0,    0,    1,    2,    2,    0,    0,    0,    0,    0,
           0, -100, -100,    0, -100])

ids:
tensor([    0,     8,   120,   108,   937,  4432,    13,  2258,  1499,    19,
            6,   175,   204,   493,  1006,     9,  1101,  1945,  4057,    13,
           18,    20,  6976,    13, 13084, 41480,  1526,    12,     2])
BIO tags extended:
te

In [24]:
for batch in tqdm(train_loader):
    optim.zero_grad() # gradients get accumulated by default -> clear previous accumulated gradients
    input_ids = batch['input_ids'].to(device)
    attention_mask = batch['attention_mask'].to(device)
    token_type_ids = batch["token_type_ids"].to(device)
    bio_tags = batch['bio_tags'].to(device)
    print("BATCH:")
    print("tweet A:", tokenizer.convert_ids_to_tokens(input_ids[0]))
    print("tweet B:", tokenizer.convert_ids_to_tokens(input_ids[1]))
    print("tweet A shape:", len(tokenizer.convert_ids_to_tokens(input_ids[0])))
    print("tweet B shape:", len(tokenizer.convert_ids_to_tokens(input_ids[1])))    
    print("============\n")
    
    ################################################
    model.train() # set model to training mode
    logits = model(**{"input_ids":input_ids, "attention_mask":attention_mask, "token_type_ids":token_type_ids}) # forward pass

    print("logits.shape:", logits.shape)
    print("bio_tags.shape:", bio_tags.shape)
    print("============\n")

100%|██████████| 1/1 [00:00<00:00, 25.72it/s]

BATCH:
tweet A: ['<s>', 'I', "'ve", 'been', 'light', 'headed', 'and', 'sha@@', 'key', 'for', 'the', 'last', '4', 'hours', 'due', 'to', 'low', 'blood', 'sugar', 'and', 'it', "'s", 'uncomfortable', 'and', 'deb@@', 'ilit@@', 'ating', '!', '</s>']
tweet B: ['<s>', '2', 'before', 'to', '0', '.', '</s>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>']
tweet A shape: 29
tweet B shape: 29

logits.shape: torch.Size([2, 29, 5])
bio_tags.shape: torch.Size([2, 29])






In [25]:


#################################################
# similar to the class RobertaForToken classification in transformers: https://github.com/huggingface/transformers/blob/master/src/transformers/models/roberta/modeling_roberta.py
active_loss = attention_mask.view(-1) == 1  # either based on attention_mask (includes <CLS>, <SEP> token)
print("active_loss.shape:", active_loss.shape)
print("active_loss:", active_loss)

#active_loss2 = bio_tags.view(-1) != -100   # excludes all special tokens including <CLS>, <SEP>
active_logits = logits.view(-1, N_bio_tags)[active_loss] # 5 
active_tags = bio_tags.view(-1)[active_loss]
loss = loss_fn(active_logits, active_tags)
print("active_logits:", active_logits.shape)
print("active_tags:", active_tags.shape)
print("loss:", loss)
print("============\n")


active_loss.shape: torch.Size([58])
active_loss: tensor([ True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True, False, False, False, False,
        False, False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False],
       device='cuda:0')
active_logits: torch.Size([36, 5])
active_tags: torch.Size([36])
loss: tensor(1.1117, device='cuda:0', grad_fn=<NllLossBackward>)



In [26]:

logits = logits.detach().to('cpu').numpy()
tags_ids = bio_tags.to('cpu').numpy()

# calculate performance measures only on tokens and not subwords or special tokens
tags_mask = tags_ids != -100 # only get token labels and not labels from subwords or special tokens
pred = np.argmax(logits, axis=2)[tags_mask] #.flatten() # convert logits to list of predicted labels
print("pred.shape:", pred.shape)
print("pred:", pred)    
tags = tags_ids[tags_mask]#.flatten()
print("tags.shape", tags.shape)
print("tags:", tags)

print("acc:", accuracy_score(tags, pred))

pred.shape: (29,)
pred: [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
tags.shape (29,)
tags: [0 0 0 3 4 0 3 0 0 0 0 0 0 0 1 2 2 0 0 0 0 0 0 0 0 0 0 0 0]
acc: 0.7931034482758621


In [None]:

TODO: 
    - write annotation guidelines
    - check model predictions, where does it fail?
    - 
