## A model build using TweetBERT to classify tweet as causal or non-causal 

The causal sentence prediction model will be trained in several steps using an active learning approach, where in each step the training dataset will be augmented.
In each step the causal sentence classifier is trained and applied on a subsample of unlabeled tweets to identify tweets with causal elements. Those tweets are then manually labeled for the two tasks: causal sentence prediction and cause-effect identification (NER). The newly labeled data will be added to the training dataset and the causal sentence classifier will be retrained with the augmented dataset to increase performance

In [1]:
import pandas as pd
import numpy as np
import spacy 
from sklearn.model_selection import train_test_split
from transformers import BertForSequenceClassification, AutoTokenizer
from transformers import AdamW, get_linear_schedule_with_warmup
from tqdm import tqdm, trange
import random
import os
import torch.nn.functional as F
import torch
from torch.nn import CrossEntropyLoss
from torch.utils.data import DataLoader, SubsetRandomSampler
import transformers
from tqdm import tqdm, trange
import io
from utils import normalizeTweet, split_into_sentences, EarlyStopping
import matplotlib.pyplot as plt

########################### Check if cuda available ############################
# print("Cuda available: ", torch.cuda.is_available())
# device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')



########################### DATA FILE ###################################
# dataPath = "/home/adrian/workspace/causality/Causal-associations-diabetes-twitter/data/Causality_tweets_data.xlsx"
dataPath = "../data/Causality_tweets_data.xlsx"


########################### MODEL PARAMETERS ############################
# lr = 1e-3    
# adam_eps = 1e-8
# epochs = 35
# num_warmup_steps = 0
# early_patience = 5 # how long to wait after last time validation loss improved

# train_batch_size = 16
# val_batch_size = 16
# test_batch_size = 32
# test_to_train_ratio = 0.1 # 10% test and 90% train
# val_to_train_ratio = 0.2

lr = 1e-5  
adam_eps = 1e-8
epochs = 50
num_warmup_steps = 0
early_patience = 10# how long to wait after last time validation loss improved

train_batch_size = 16
val_batch_size = 16
test_batch_size = 16
test_to_train_ratio = 0.1 # 10% test and 90% train
val_to_train_ratio = 0.2



#metrics_average = "binary" # this will give measure for class_1,i.e., causal class


### naming the model 
active_learning_round = 1
n_trained_epochs = 0
saveModelName = "../model_causal-sentences/new_model_{}_finetuned-{}-epochs-lr_{}.pth".format(active_learning_round,n_trained_epochs-early_patience, lr) 



#### Checking if thec cuda is available and then select the `gpu`

In [2]:
########################### Check if cuda available ############################
print("Cuda available: ", torch.cuda.is_available())
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print("Selected {} for this notebook".format(device))

Cuda available:  True
Selected cuda:0 for this notebook


In [3]:
##### DATA TO LOAD ######

data_sheet1 = pd.read_excel(dataPath, sheet_name="round0")
data_sheet1 = data_sheet1[data_sheet1["Causal association"].notnull()] # some tweets at the end are not labeled yet
data_sheet1 = data_sheet1[["full_text", "Intent", "Cause", "Effect", "Causal association"]]
print("Data sheet 1 (tweets!):")
print(data_sheet1["Causal association"].value_counts())
print()


##### additional data labeled through active learning strategy - round 1 ########
data_sheet2 = pd.read_excel(dataPath, sheet_name="round1")
data_sheet2 = data_sheet2[data_sheet2["Causal association"].notnull()]
data_sheet2 = data_sheet2[["sentence", "Intent", "Cause", "Effect", "Causal association"]]
data_sheet2.rename(columns ={"sentence":"full_text"}, inplace=True) # rename for merge
print("New labeled data after active learning (sentences!)")
print(data_sheet2["Causal association"].value_counts())


#### merge both datasets ######
data = data_sheet1.append(data_sheet2)
print("\nAfter merge:")
print(data["Causal association"].value_counts())
data.head()

Data sheet 1 (tweets!):
0.0    3710
1.0    1290
Name: Causal association, dtype: int64

New labeled data after active learning (sentences!)
0.0    1763
1.0     429
Name: Causal association, dtype: int64

After merge:
0.0    5473
1.0    1719
Name: Causal association, dtype: int64


Unnamed: 0,full_text,Intent,Cause,Effect,Causal association
0,"tonight , I learned my older girl will back he...",,,,0.0
1,USER USER I knew diabetes and fibromyalgia wer...,joke,,,0.0
2,⬇ ️ ⬇ ️ ⬇ ️ THIS ⬇ ️ ⬇ ️ ⬇ ️ My wife has type ...,mS,,,0.0
3,USER Cheers ! Have one for this diabetic too !,mS,,,0.0
4,USER Additionally the medicines are being char...,,medicines are being charged at MRP,costing much higher,1.0


### Preprocessing

In [4]:
def get_start_end_index_of_sentence_in_tweet(tweet, sentence):
    """ 
    The sentence tokens are included in the tweet tokens.
    Return the start end end indices of the sentence tokens in the tweet tokens

    """

    sentence_start_word = sentence[0]
    start_indices = [i for i, x in enumerate(tweet) if x == sentence_start_word] # find all indices of the start word of the sentence 
    try:
        for start_index in start_indices:
            isTrueStartIndex = all([tweet[start_index+i] == sentence[i] for i in range(len(sentence))])
            #print("start_index:", start_index, "isTrueStartIndex:", isTrueStartIndex)
            if isTrueStartIndex:
                return start_index, start_index + len(sentence) 
    except:
        print("ERROR: StartIndex should have been found for sentence:")
        print("tweet:")
        print(tweet)
        print("sentence:")
        print(sentence)
    return -1, -2 # should not be returned


def split_tweets_to_sentences(data):
    """ 
        Splits tweets into sentences and associates the appropriate intent, causes, effects and causal association
        to each sentence.
        
        Parameters:
        - min_words_in_sentences: Minimal number of words in a sentence such that the sentence is kept. 
                                  Assumption: A sentence with too few words does not have enough information
                              
                              
                              
        Ex.:
        full_text                              | Intent | Cause | Effect | Causal association | ...
        --------------------------------------------------------------------------------------------
        what? type 1 causes insulin dependence | q;msS  | type 1|insulin dependence | 1       | ...  
        
        New dataframe returned: 
        full_text                              | Intent | Cause | Effect | Causal association | ...
        --------------------------------------------------------------------------------------------
        what?                                  |   q    |       |        |       0            | ...
        type 1 causes insulin dependence       |        | type 1| insulin dependence | 1       | ...  
    """

    newDF = pd.DataFrame(columns=["sentence", "Intent", "Cause", "Effect", "Causal association", "tokenized"])
    
    for i,row in data.iterrows():
        causes = row["Cause"]
        effects = row["Effect"]
        sentences = split_into_sentences(normalizeTweet(row["full_text"]))
        
        # single sentence in tweet
        if len(sentences) == 1:
            singleSentenceIntent = ""
            if isinstance(row["Intent"], str):
                if len(row["Intent"].split(";")) > 1:
                    singleSentenceIntent = row["Intent"].strip().replace(";msS", "").replace("msS;", "").replace(";mS", "").replace("mS;", "")
                else:
                    if row["Intent"] == "mS" or row["Intent"] == "msS":
                        singleSentenceIntent = ""
                    else:
                        singleSentenceIntent = row["Intent"].strip()
                    
            newDF=newDF.append(pd.Series({"sentence": sentences[0] # only one sentence
                         , "Intent": singleSentenceIntent
                         , "Cause" : row["Cause"]
                         , "Effect": row["Effect"]
                         , "Causal association" : row["Causal association"]
                         , "tokenized": row["tokenized"]}), ignore_index=True)
        
        # tweet has several sentences
        else: 
            intents = str(row["Intent"]).strip().split(";")
            for sentence in sentences:
                sent_tokenized = sentence.split(" ")
                causeInSentence = np.nan if not isinstance(causes, str) or not any([cause in sentence for cause in causes.split(";")]) else ";".join([cause for cause in causes.split(";") if cause in sentence])
                effectInSentence = np.nan if not isinstance(effects, str) or not any([effect in sentence for effect in effects.split(";")]) else ";".join([effect for effect in effects.split(";") if effect in sentence])
                causalAssociationInSentence = 1 if isinstance(causeInSentence, str) and isinstance(effectInSentence, str) else 0
                startIndex, endIndex = get_start_end_index_of_sentence_in_tweet(row["tokenized"], sent_tokenized)
                sentence_tokenized = row["tokenized"][startIndex:endIndex]
                
                if "q" in intents and sentence[-1] == "?": # if current sentence is question
                    newDF=newDF.append(pd.Series({"sentence": sentence, "Intent": "q", "Cause" : causeInSentence
                                                , "Effect": effectInSentence, "Causal association" : causalAssociationInSentence
                                                , "tokenized": sentence_tokenized}), ignore_index=True)                    
                elif "joke" in intents: # all sentences with "joke" in tweet keep the intent "joke"
                    newDF=newDF.append(pd.Series({"sentence": sentence, "Intent": "joke", "Cause" : causeInSentence
                                                , "Effect": effectInSentence, "Causal association" : causalAssociationInSentence
                                                , "tokenized": sentence_tokenized}), ignore_index=True)   
                elif "neg" in intents: # all sentences with "neg" in tweet keep intent "neg"
                    newDF=newDF.append(pd.Series({"sentence": sentence, "Intent": "neg", "Cause" : causeInSentence
                                                , "Effect": effectInSentence, "Causal association" : causalAssociationInSentence
                                                , "tokenized": sentence_tokenized}), ignore_index=True)               
                elif isinstance(causeInSentence, str) and isinstance(effectInSentence, str): # cause effect sentence
                    causalIntent = ""
                    if len(causeInSentence.split(";")) > 1:
                        causalIntent = "mC"
                        if len(effectInSentence.split(";")) > 1:
                            causalIntent = "mC;mE"
                    elif len(effectInSentence.split(";")) > 1:
                        causalIntent = "mE"
                    newDF=newDF.append(pd.Series({"sentence": sentence, "Intent": causalIntent, "Cause" : causeInSentence
                                                , "Effect": effectInSentence, "Causal association" : causalAssociationInSentence
                                                , "tokenized": sentence_tokenized}), ignore_index=True)                                  
                else:
                    nonCausalIntent = ""
                    if isinstance(causeInSentence, str): # only cause is given
                        if len(causeInSentence.split(";")) > 1:
                            nonCausalIntent = "mC"
                    elif isinstance(effectInSentence, str): # only effect is given
                        if len(effectInSentence.split(";")) > 1:
                            nonCausalIntent = "mE"
                    newDF=newDF.append(pd.Series({"sentence": sentence, "Intent": nonCausalIntent, "Cause" : causeInSentence
                                                , "Effect": effectInSentence, "Causal association" : causalAssociationInSentence
                                                , "tokenized": sentence_tokenized}), ignore_index=True)

    return newDF

In [5]:
### Split tweets into sentences (train classifier on sentence level) ####

print("N tweets:", data.shape[0])

data["tokenized"] = data["full_text"].map(lambda tweet: normalizeTweet(tweet).split(" "))
dataSentences = split_tweets_to_sentences(data)
print("N sentences:", dataSentences.shape[0])
dataSentences.head()

N tweets: 7192
N sentences: 13948


Unnamed: 0,sentence,Intent,Cause,Effect,Causal association,tokenized
0,"tonight , I learned my older girl will back he...",,,,0,"[tonight, ,, I, learned, my, older, girl, will..."
1,Fiercely .,,,,0,"[Fiercely, .]"
2,#impressive #bigsister #type1 #type1times2,,,,0,"[#impressive, #bigsister, #type1, #type1times2]"
3,USER USER I knew diabetes and fibromyalgia wer...,joke,,,0,"[USER, USER, I, knew, diabetes, and, fibromyal..."
4,:face_with_rolling_eyes:,joke,,,0,[:face_with_rolling_eyes:]


In [6]:
########## Remove sentences with joke, question, negation and keep only sentences with more than 3 tokens #####

print("N sentences before filtering: ", dataSentences.shape[0])
dataSentFiltered = dataSentences[~dataSentences["Intent"].str.contains("neg|joke|q")] 
dataSentFiltered = dataSentFiltered[dataSentFiltered["tokenized"].map(len) > 3] 
print("N sentences after filtering: ", dataSentFiltered.shape[0])
print("Distribution:")
print(dataSentFiltered["Causal association"].value_counts())
dataSentFiltered.head()

N sentences before filtering:  13948
N sentences after filtering:  10468
Distribution:
0.0    9007
1.0    1461
Name: Causal association, dtype: int64


Unnamed: 0,sentence,Intent,Cause,Effect,Causal association,tokenized
0,"tonight , I learned my older girl will back he...",,,,0,"[tonight, ,, I, learned, my, older, girl, will..."
2,#impressive #bigsister #type1 #type1times2,,,,0,"[#impressive, #bigsister, #type1, #type1times2]"
5,:down_arrow: :down_arrow: :down_arrow: THIS :d...,,,,0,"[:down_arrow:, :down_arrow:, :down_arrow:, THI..."
6,I 'm a trans woman .,,,,0,"[I, 'm, a, trans, woman, .]"
7,"Both of us could use a world where "" brave and...",,,,0,"[Both, of, us, could, use, a, world, where, "",..."


### Training

In [7]:
####################### Stratified splits ####################


## ONLY FOR TESTING ---------------
#dataSentFiltered = dataSentFiltered[0:500] # for testing

text = dataSentFiltered["sentence"].map(normalizeTweet).values.tolist()
labels = dataSentFiltered["Causal association"].values.tolist()
# first split the data into training and testing label in the ratio of 90:10
train_texts, test_texts, train_labels, test_labels = train_test_split(text, labels, test_size=test_to_train_ratio, stratify=labels, random_state=99)
train_texts, val_texts, train_labels, val_labels = train_test_split(train_texts, train_labels, test_size=val_to_train_ratio, stratify=train_labels, random_state=99)



data_count_info = pd.Series(labels).value_counts(normalize=True)
train_count_info = pd.Series(train_labels).value_counts(normalize=True)
val_count_info = pd.Series(val_labels).value_counts(normalize=True)
test_count_info = pd.Series(test_labels).value_counts(normalize=True)

# for class-imbalanced dataset, the class weight for a ith class
# to be specified for balancing in the loss function is given by:
# weight[i] = num_samples / (num_classes * num_samples[i])
# since train_count_info obtained above has fraction of samples
# for ith class, hence the corresponding weight calculation is:
class_weight = (1/train_count_info)/len(train_count_info)

print("All: Count = {}, % of 0 = {}, % of 1 = {}".format(
    len(labels), *data_count_info.round(4).to_list()))
print("Train: Count = {}, % of 0 = {}, % of 1 = {}".format(
    len(train_labels), *train_count_info.round(4).to_list()))
print("Val: Count = {}, % of 0 = {}, % of 1 = {}".format(
    len(val_labels), *val_count_info.round(4).to_list()))
print("Test: Count = {}, % of 0 = {}, % of 1 = {}".format(
    len(test_labels), *test_count_info.round(4).to_list()))
print("Balancing class wts: for 0 = {}, for 1 = {}".format(
    *class_weight.round(4).to_list()))

All: Count = 10468, % of 0 = 0.8604, % of 1 = 0.1396
Train: Count = 7536, % of 0 = 0.8604, % of 1 = 0.1396
Val: Count = 1885, % of 0 = 0.8605, % of 1 = 0.1395
Test: Count = 1047, % of 0 = 0.8606, % of 1 = 0.1394
Balancing class wts: for 0 = 0.5811, for 1 = 3.5817


In [8]:
class TweetDataSet(torch.utils.data.Dataset):
    def __init__(self, text, labels, tokenizer):
        self.text = text
        self.labels = labels
        self.tokenizer = tokenizer

    def __getitem__(self, idx):
        inputs = self.tokenizer(self.text, padding=True, truncation=True, return_token_type_ids=True)
        ids = inputs["input_ids"]
        mask = inputs["attention_mask"]
        token_type_ids = inputs["token_type_ids"]
        return {
                "input_ids" : torch.tensor(ids[idx], dtype=torch.long)
              , "attention_mask" : torch.tensor(mask[idx], dtype=torch.long)
              , "token_type_ids" : torch.tensor(token_type_ids[idx], dtype=torch.long)
              , "labels" : torch.tensor(self.labels[idx], dtype=torch.long)
        }      

    def __len__(self):
        return len(self.labels)

    
tokenizer = AutoTokenizer.from_pretrained("vinai/bertweet-base")

train_dataset = TweetDataSet(train_texts, train_labels, tokenizer)
val_dataset = TweetDataSet(val_texts, val_labels, tokenizer)
test_dataset = TweetDataSet(test_texts, test_labels, tokenizer)
print(len(train_dataset))
print(len(val_dataset))
print(len(test_dataset))

# During training: In each epoch one part of the training data will be used as validation set
train_loader = DataLoader(train_dataset, batch_size=train_batch_size, shuffle=True)
validation_loader = DataLoader(val_dataset, batch_size=val_batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=test_batch_size, shuffle=True)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


7536
1885
1047


### Evaluation Metrics

In [9]:
## we are measuring weighted metrics - as our dataset is imbalanced 
# Calculate metrics for each label, and find their average weighted by support
# (the number of true instances for each label). 
# This alters ‘macro’ to account for label imbalance; 
# it can result in an F-score that is not between precision and recall.


from sklearn.metrics import accuracy_score, precision_recall_fscore_support, matthews_corrcoef

def compute_metrics(pred, labels, average="macro"):
#     precision, recall, f1, _ = precision_recall_fscore_support(labels, pred, average='weighted')
    precision, recall, f1, _ = precision_recall_fscore_support(labels,pred, average=average)
    acc = accuracy_score(labels, pred)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }



### Model defintion

In [10]:


class CausalityBERT(torch.nn.Module):
    """ Model Bert"""
    def __init__(self):
        super(CausalityBERT, self).__init__()
        self.num_labels = 2
        self.bert = transformers.BertModel.from_pretrained("vinai/bertweet-base")
        self.dropout = torch.nn.Dropout(0.3)
        self.linear1 = torch.nn.Linear(768, 256)
        self.linear2 = torch.nn.Linear(256, self.num_labels)
        
    def forward(self, input_ids, attention_mask, token_type_ids):
        _, output_1 = self.bert(input_ids, attention_mask = attention_mask, token_type_ids=token_type_ids, return_dict=False) # if output 1 is our cls token        
        output_2 = self.dropout(output_1)
        output_3 = self.linear1(output_2)  
        output_4 = self.dropout(output_3)
        output_5 = self.linear2(output_4)
        return output_5

In [11]:
# device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model = CausalityBERT()
model.to(device)

# fine-tune only the task-specific parameters
for param in model.bert.parameters():
    param.requires_grad = False

num_training_steps = np.ceil(len(train_dataset)/train_batch_size)*epochs
optim = AdamW(model.parameters(), lr=lr, eps=adam_eps)
scheduler = get_linear_schedule_with_warmup(optim, num_warmup_steps=num_warmup_steps, num_training_steps=num_training_steps) # scheduler with a linearly decreasing learning rate from the initial lr set in the optimizer to 0; after a warmup period durnig which it increases linearly from 0 to the initial lr set in the optimizer

## penalising more for class with less number of exaplmes 
loss_fn = CrossEntropyLoss(torch.tensor(class_weight.to_list()).to(device))

You are using a model of type roberta to instantiate a model of type bert. This is not supported for all configurations of models and can yield errors.
Some weights of the model checkpoint at vinai/bertweet-base were not used when initializing BertModel: ['roberta.encoder.layer.6.attention.self.query.weight', 'roberta.encoder.layer.2.intermediate.dense.bias', 'roberta.encoder.layer.5.attention.self.query.bias', 'roberta.encoder.layer.6.attention.self.value.weight', 'roberta.encoder.layer.5.output.dense.bias', 'roberta.encoder.layer.7.intermediate.dense.weight', 'roberta.encoder.layer.4.output.dense.bias', 'roberta.encoder.layer.10.attention.self.key.bias', 'roberta.encoder.layer.11.attention.self.query.weight', 'roberta.encoder.layer.7.attention.self.value.weight', 'roberta.encoder.layer.10.intermediate.dense.weight', 'roberta.encoder.layer.6.attention.self.key.bias', 'roberta.encoder.layer.6.attention.self.query.bias', 'roberta.encoder.layer.10.attention.output.LayerNorm.bias', 'rober

Some weights of BertModel were not initialized from the model checkpoint at vinai/bertweet-base and are newly initialized: ['encoder.layer.3.attention.self.query.bias', 'encoder.layer.5.attention.self.value.bias', 'encoder.layer.9.attention.output.dense.weight', 'encoder.layer.7.attention.self.value.weight', 'encoder.layer.5.attention.output.LayerNorm.bias', 'encoder.layer.4.attention.self.value.weight', 'encoder.layer.8.intermediate.dense.weight', 'encoder.layer.5.attention.output.LayerNorm.weight', 'encoder.layer.7.output.dense.weight', 'encoder.layer.5.attention.self.key.bias', 'encoder.layer.0.intermediate.dense.bias', 'encoder.layer.2.output.LayerNorm.bias', 'encoder.layer.5.output.LayerNorm.bias', 'pooler.dense.weight', 'encoder.layer.2.attention.self.value.bias', 'encoder.layer.0.output.LayerNorm.weight', 'encoder.layer.9.output.LayerNorm.weight', 'encoder.layer.0.attention.output.LayerNorm.weight', 'encoder.layer.8.attention.self.key.weight', 'encoder.layer.3.output.dense.weigh

In [None]:
############ TRAINING #############
# saveModelName = "./model-causal-model/new_model_{}_finetuned-{}-epochs-lr_{}.pth".format(active_learning_round,n_trained_epochs-early_patience, lr) 

# initialise the early_stopping object
early_stopping = EarlyStopping(patience=early_patience, path=saveModelName, verbose=True)

train_avg_loss = [] # avg training loss per epoch
val_avg_loss = [] # avg validation loss per epoch
train_avg_acc = [] # avg training accuracy per epoch
val_avg_acc = [] # avg val accuracy per epoch
n_trained_epochs = 0


for epoch in trange(1, epochs+1, desc='Epoch'):
    print("<" + "="*22 + F" Epoch {epoch} "+ "="*22 + ">")
    
    
    ########### training eval metrics #############################
    train_accuracy = []
    train_loss = []
    train_acc = []
    train_prec = []
    train_rec = []
    train_f1 = []
    
    ###################################################
    
    for batch in tqdm(train_loader):
        optim.zero_grad() # gradients get accumulated by default -> clear previous accumulated gradients
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        token_type_ids = batch["token_type_ids"].to(device)
        labels = batch['labels'].to(device)
        ###########################################################################
        model.train()
        logits = model(**{"input_ids":input_ids, "attention_mask":attention_mask, "token_type_ids":token_type_ids}) # forward pass
        #############################################################################
        loss = loss_fn(logits, labels)
        print("loss:", loss)
        loss.backward() # backward pass
        optim.step()    # update parameters and take a step up using the computed gradient
        scheduler.step()# update learning rate scheduler
        train_loss.append(loss.item())
        
    
        ############# Training Accuracy Measure ###################################

        # move logits and labels to CPU
        logits = logits.detach().to('cpu').numpy()
        label_ids = labels.to('cpu').numpy()

        pred_flat = np.argmax(logits, axis=1).flatten()
        labels_flat = label_ids.flatten()

        metrics = compute_metrics(pred_flat, labels_flat)        
        
        train_acc.append(metrics["accuracy"])
        train_prec.append(metrics["precision"])
        train_rec.append(metrics["recall"])
        train_f1.append(metrics["f1"])
        
    train_avg_loss.append(np.mean(train_loss))
    train_avg_acc.append(np.mean(train_acc))
    print(F'\n\tTrain loss: {np.mean(train_loss)}')
    print(F'\n\ttrain acc: {np.mean(train_acc)}')
    print(F'\n\ttraining prec: {np.mean(train_prec)}')
    print(F'\n\ttraining rec: {np.mean(train_rec)}')
    print(F'\n\ttraining f1: {np.mean(train_f1)}')
    
    n_trained_epochs += 1
    
    ###################################################################################

    
    ## ---- Validation ------
    val_accuracy = []
    val_loss = []
    val_acc = []
    val_prec = []
    val_rec = []
    val_f1 = []
    
    
    # Evaluate data for one epoch
    for batch in tqdm(validation_loader):
        batch = tuple(batch[t].to(device) for t in batch)      # batch to GPU
        b_input_ids, b_input_mask, b_token_type_ids, b_labels = batch     # unpack inputs from dataloader
        
        with torch.no_grad(): # tell model not to compute or store gradients -> saves memory + speeds up validation
            ##################################################################################
            model.eval()
            logits = model(**{"input_ids":b_input_ids, "attention_mask":b_input_mask, "token_type_ids":b_token_type_ids}) # forward pass, calculates logit predictions 

                        
        loss = loss_fn(logits, b_labels)
        val_loss.append(loss.item())
        
        # move logits and labels to CPU
        logits = logits.detach().to('cpu').numpy()
        label_ids = b_labels.to('cpu').numpy()
        
        pred_flat = np.argmax(logits, axis=1).flatten()
        labels_flat = label_ids.flatten()
        
        metrics = compute_metrics(pred_flat, labels_flat)
        val_acc.append(metrics["accuracy"])
        val_prec.append(metrics["precision"])
        val_rec.append(metrics["recall"])
        val_f1.append(metrics["f1"])

    val_avg_loss.append(np.mean(val_loss))
    val_avg_acc.append(np.mean(val_acc))
    print(F'\n\tValidation loss: {np.mean(val_loss)}')
    print(F'\n\tValidation acc: {np.mean(val_acc)}')
    print(F'\n\tValidation prec: {np.mean(val_prec)}')
    print(F'\n\tValidation rec: {np.mean(val_rec)}')
    print(F'\n\tValidation f1: {np.mean(val_f1)}')

    # early_stopping needs the validation loss to check if it has decreased,
    # and if it has, it will make a checkpoint of the current model
    saveModelName = "./model-causal-model/new_model_{}_finetuned-{}-epochs-lr_{}.pth".format(active_learning_round,n_trained_epochs-early_patience, lr) 
    early_stopping.path = saveModelName
    early_stopping(np.average(val_loss), model)
    
    if early_stopping.early_stop:
        print("Early stopping")
        break

Epoch:   0%|          | 0/50 [00:00<?, ?it/s]




  , "labels" : torch.tensor(self.labels[idx], dtype=torch.long)

  0%|          | 1/471 [00:08<1:03:58,  8.17s/it][A

loss: tensor(0.7404, device='cuda:0', grad_fn=<NllLossBackward>)



  0%|          | 2/471 [00:15<1:00:31,  7.74s/it][A

loss: tensor(0.6609, device='cuda:0', grad_fn=<NllLossBackward>)



  1%|          | 3/471 [00:23<59:43,  7.66s/it]  [A

loss: tensor(0.7318, device='cuda:0', grad_fn=<NllLossBackward>)



  1%|          | 4/471 [00:30<59:10,  7.60s/it][A

loss: tensor(0.7052, device='cuda:0', grad_fn=<NllLossBackward>)



  1%|          | 5/471 [00:38<58:50,  7.58s/it][A

loss: tensor(0.6922, device='cuda:0', grad_fn=<NllLossBackward>)



  1%|▏         | 6/471 [00:45<58:21,  7.53s/it][A

loss: tensor(0.7030, device='cuda:0', grad_fn=<NllLossBackward>)



  1%|▏         | 7/471 [00:53<58:13,  7.53s/it][A

loss: tensor(0.7391, device='cuda:0', grad_fn=<NllLossBackward>)



  2%|▏         | 8/471 [01:00<58:05,  7.53s/it][A

loss: tensor(0.7012, device='cuda:0', grad_fn=<NllLossBackward>)



  2%|▏         | 9/471 [01:08<57:55,  7.52s/it][A

loss: tensor(0.6681, device='cuda:0', grad_fn=<NllLossBackward>)



  2%|▏         | 10/471 [01:15<57:34,  7.49s/it][A

loss: tensor(0.6105, device='cuda:0', grad_fn=<NllLossBackward>)



  2%|▏         | 11/471 [01:23<57:29,  7.50s/it][A

loss: tensor(0.6913, device='cuda:0', grad_fn=<NllLossBackward>)



  3%|▎         | 12/471 [01:30<57:25,  7.51s/it][A

loss: tensor(0.6559, device='cuda:0', grad_fn=<NllLossBackward>)



  3%|▎         | 13/471 [01:38<57:05,  7.48s/it][A

loss: tensor(0.7123, device='cuda:0', grad_fn=<NllLossBackward>)



  3%|▎         | 14/471 [01:45<57:04,  7.49s/it][A

loss: tensor(0.6920, device='cuda:0', grad_fn=<NllLossBackward>)



  3%|▎         | 15/471 [01:53<56:59,  7.50s/it][A

loss: tensor(0.7756, device='cuda:0', grad_fn=<NllLossBackward>)


  _warn_prf(average, modifier, msg_start, len(result))

  3%|▎         | 16/471 [02:00<56:53,  7.50s/it][A

loss: tensor(0.6636, device='cuda:0', grad_fn=<NllLossBackward>)



  4%|▎         | 17/471 [02:08<56:34,  7.48s/it][A

loss: tensor(0.5948, device='cuda:0', grad_fn=<NllLossBackward>)



  4%|▍         | 18/471 [02:15<56:39,  7.50s/it][A

loss: tensor(0.6415, device='cuda:0', grad_fn=<NllLossBackward>)



  4%|▍         | 19/471 [02:23<56:42,  7.53s/it][A

loss: tensor(0.6730, device='cuda:0', grad_fn=<NllLossBackward>)



  4%|▍         | 20/471 [02:30<56:38,  7.53s/it][A

loss: tensor(0.6335, device='cuda:0', grad_fn=<NllLossBackward>)



  4%|▍         | 21/471 [02:38<56:21,  7.51s/it][A

loss: tensor(0.6718, device='cuda:0', grad_fn=<NllLossBackward>)



  5%|▍         | 22/471 [02:45<56:21,  7.53s/it][A

loss: tensor(0.7193, device='cuda:0', grad_fn=<NllLossBackward>)



  5%|▍         | 23/471 [02:53<56:18,  7.54s/it][A

loss: tensor(0.7023, device='cuda:0', grad_fn=<NllLossBackward>)



  5%|▌         | 24/471 [03:00<56:14,  7.55s/it][A

loss: tensor(0.6727, device='cuda:0', grad_fn=<NllLossBackward>)



  5%|▌         | 25/471 [03:08<55:56,  7.53s/it][A

loss: tensor(0.7408, device='cuda:0', grad_fn=<NllLossBackward>)



  6%|▌         | 26/471 [03:15<55:50,  7.53s/it][A

loss: tensor(0.7827, device='cuda:0', grad_fn=<NllLossBackward>)



  6%|▌         | 27/471 [03:23<55:47,  7.54s/it][A

loss: tensor(0.7271, device='cuda:0', grad_fn=<NllLossBackward>)



  6%|▌         | 28/471 [03:30<55:27,  7.51s/it][A

loss: tensor(0.7455, device='cuda:0', grad_fn=<NllLossBackward>)



  6%|▌         | 29/471 [03:38<55:25,  7.52s/it][A

loss: tensor(0.7362, device='cuda:0', grad_fn=<NllLossBackward>)



  6%|▋         | 30/471 [03:46<55:19,  7.53s/it][A

loss: tensor(0.7128, device='cuda:0', grad_fn=<NllLossBackward>)



  7%|▋         | 31/471 [03:53<55:20,  7.55s/it][A

loss: tensor(0.6622, device='cuda:0', grad_fn=<NllLossBackward>)



  7%|▋         | 32/471 [04:01<55:07,  7.53s/it][A

loss: tensor(0.6867, device='cuda:0', grad_fn=<NllLossBackward>)



  7%|▋         | 33/471 [04:08<55:01,  7.54s/it][A

loss: tensor(0.6372, device='cuda:0', grad_fn=<NllLossBackward>)



  7%|▋         | 34/471 [04:16<54:54,  7.54s/it][A

loss: tensor(0.6728, device='cuda:0', grad_fn=<NllLossBackward>)



  7%|▋         | 35/471 [04:23<54:46,  7.54s/it][A

loss: tensor(0.6726, device='cuda:0', grad_fn=<NllLossBackward>)



  8%|▊         | 36/471 [04:31<54:26,  7.51s/it][A

loss: tensor(0.6307, device='cuda:0', grad_fn=<NllLossBackward>)



  8%|▊         | 37/471 [04:38<54:19,  7.51s/it][A

loss: tensor(0.6592, device='cuda:0', grad_fn=<NllLossBackward>)



  8%|▊         | 38/471 [04:46<54:15,  7.52s/it][A

loss: tensor(0.7022, device='cuda:0', grad_fn=<NllLossBackward>)



  8%|▊         | 39/471 [04:53<53:59,  7.50s/it][A

loss: tensor(0.7024, device='cuda:0', grad_fn=<NllLossBackward>)



  8%|▊         | 40/471 [05:01<53:59,  7.52s/it][A

loss: tensor(0.7428, device='cuda:0', grad_fn=<NllLossBackward>)



  9%|▊         | 41/471 [05:08<53:57,  7.53s/it][A

loss: tensor(0.6793, device='cuda:0', grad_fn=<NllLossBackward>)



  9%|▉         | 42/471 [05:16<53:58,  7.55s/it][A

loss: tensor(0.6283, device='cuda:0', grad_fn=<NllLossBackward>)



  9%|▉         | 43/471 [05:23<53:39,  7.52s/it][A

loss: tensor(0.6871, device='cuda:0', grad_fn=<NllLossBackward>)



  9%|▉         | 44/471 [05:31<53:36,  7.53s/it][A

loss: tensor(0.6800, device='cuda:0', grad_fn=<NllLossBackward>)



 10%|▉         | 45/471 [05:39<53:35,  7.55s/it][A

loss: tensor(0.6229, device='cuda:0', grad_fn=<NllLossBackward>)



 10%|▉         | 46/471 [05:46<53:33,  7.56s/it][A

loss: tensor(0.7748, device='cuda:0', grad_fn=<NllLossBackward>)



 10%|▉         | 47/471 [05:54<53:11,  7.53s/it][A

loss: tensor(0.7717, device='cuda:0', grad_fn=<NllLossBackward>)



 10%|█         | 48/471 [06:01<53:08,  7.54s/it][A

loss: tensor(0.7480, device='cuda:0', grad_fn=<NllLossBackward>)



 10%|█         | 49/471 [06:09<53:13,  7.57s/it][A

loss: tensor(0.6523, device='cuda:0', grad_fn=<NllLossBackward>)



 11%|█         | 50/471 [06:16<53:13,  7.59s/it][A

loss: tensor(0.6408, device='cuda:0', grad_fn=<NllLossBackward>)



 11%|█         | 51/471 [06:24<52:58,  7.57s/it][A

loss: tensor(0.7481, device='cuda:0', grad_fn=<NllLossBackward>)



 11%|█         | 52/471 [06:32<52:57,  7.58s/it][A

loss: tensor(0.8216, device='cuda:0', grad_fn=<NllLossBackward>)



 11%|█▏        | 53/471 [06:39<52:57,  7.60s/it][A

loss: tensor(0.6410, device='cuda:0', grad_fn=<NllLossBackward>)



 11%|█▏        | 54/471 [06:47<52:38,  7.57s/it][A

loss: tensor(0.7017, device='cuda:0', grad_fn=<NllLossBackward>)



 12%|█▏        | 55/471 [06:54<52:35,  7.58s/it][A

loss: tensor(0.6354, device='cuda:0', grad_fn=<NllLossBackward>)



 12%|█▏        | 56/471 [07:02<52:29,  7.59s/it][A

loss: tensor(0.7423, device='cuda:0', grad_fn=<NllLossBackward>)



 12%|█▏        | 57/471 [07:10<52:26,  7.60s/it][A

loss: tensor(0.7131, device='cuda:0', grad_fn=<NllLossBackward>)



 12%|█▏        | 58/471 [07:17<52:08,  7.57s/it][A

loss: tensor(0.7128, device='cuda:0', grad_fn=<NllLossBackward>)



 13%|█▎        | 59/471 [07:25<52:04,  7.58s/it][A

loss: tensor(0.7704, device='cuda:0', grad_fn=<NllLossBackward>)



 13%|█▎        | 60/471 [07:32<51:59,  7.59s/it][A

loss: tensor(0.7374, device='cuda:0', grad_fn=<NllLossBackward>)



 13%|█▎        | 61/471 [07:40<51:57,  7.60s/it][A

loss: tensor(0.6777, device='cuda:0', grad_fn=<NllLossBackward>)



 13%|█▎        | 62/471 [07:47<51:42,  7.59s/it][A

loss: tensor(0.6246, device='cuda:0', grad_fn=<NllLossBackward>)



 13%|█▎        | 63/471 [07:55<51:35,  7.59s/it][A

loss: tensor(0.7220, device='cuda:0', grad_fn=<NllLossBackward>)



 14%|█▎        | 64/471 [08:03<51:26,  7.58s/it][A

loss: tensor(0.7186, device='cuda:0', grad_fn=<NllLossBackward>)



 14%|█▍        | 65/471 [08:10<51:12,  7.57s/it][A

loss: tensor(0.7020, device='cuda:0', grad_fn=<NllLossBackward>)



 14%|█▍        | 66/471 [08:18<50:52,  7.54s/it][A

loss: tensor(0.7430, device='cuda:0', grad_fn=<NllLossBackward>)



 14%|█▍        | 67/471 [08:25<50:46,  7.54s/it][A

loss: tensor(0.6182, device='cuda:0', grad_fn=<NllLossBackward>)



 14%|█▍        | 68/471 [08:33<50:50,  7.57s/it][A

loss: tensor(0.6896, device='cuda:0', grad_fn=<NllLossBackward>)



 15%|█▍        | 69/471 [08:40<50:39,  7.56s/it][A

loss: tensor(0.6878, device='cuda:0', grad_fn=<NllLossBackward>)



 15%|█▍        | 70/471 [08:48<50:39,  7.58s/it][A

loss: tensor(0.7019, device='cuda:0', grad_fn=<NllLossBackward>)



 15%|█▌        | 71/471 [08:56<50:34,  7.59s/it][A

loss: tensor(0.6664, device='cuda:0', grad_fn=<NllLossBackward>)



 15%|█▌        | 72/471 [09:03<50:30,  7.60s/it][A

loss: tensor(0.8289, device='cuda:0', grad_fn=<NllLossBackward>)



 15%|█▌        | 73/471 [09:11<50:14,  7.57s/it][A

loss: tensor(0.6854, device='cuda:0', grad_fn=<NllLossBackward>)



 16%|█▌        | 74/471 [09:18<50:01,  7.56s/it][A

loss: tensor(0.7614, device='cuda:0', grad_fn=<NllLossBackward>)



 16%|█▌        | 75/471 [09:26<49:55,  7.57s/it][A

loss: tensor(0.7107, device='cuda:0', grad_fn=<NllLossBackward>)



 16%|█▌        | 76/471 [09:33<49:53,  7.58s/it][A

loss: tensor(0.7504, device='cuda:0', grad_fn=<NllLossBackward>)



 16%|█▋        | 77/471 [09:41<49:38,  7.56s/it][A

loss: tensor(0.6437, device='cuda:0', grad_fn=<NllLossBackward>)



 17%|█▋        | 78/471 [09:49<49:33,  7.57s/it][A

loss: tensor(0.6836, device='cuda:0', grad_fn=<NllLossBackward>)



 17%|█▋        | 79/471 [09:56<49:28,  7.57s/it][A

loss: tensor(0.7096, device='cuda:0', grad_fn=<NllLossBackward>)



 17%|█▋        | 80/471 [10:04<49:13,  7.55s/it][A

loss: tensor(0.7436, device='cuda:0', grad_fn=<NllLossBackward>)



 17%|█▋        | 81/471 [10:11<49:13,  7.57s/it][A

loss: tensor(0.7022, device='cuda:0', grad_fn=<NllLossBackward>)



 17%|█▋        | 82/471 [10:19<49:02,  7.56s/it][A

loss: tensor(0.6604, device='cuda:0', grad_fn=<NllLossBackward>)



 18%|█▊        | 83/471 [10:26<48:55,  7.57s/it][A

loss: tensor(0.6413, device='cuda:0', grad_fn=<NllLossBackward>)



 18%|█▊        | 84/471 [10:34<48:33,  7.53s/it][A

loss: tensor(0.6881, device='cuda:0', grad_fn=<NllLossBackward>)



 18%|█▊        | 85/471 [10:41<48:29,  7.54s/it][A

loss: tensor(0.6140, device='cuda:0', grad_fn=<NllLossBackward>)



 18%|█▊        | 86/471 [10:49<48:22,  7.54s/it][A

loss: tensor(0.6859, device='cuda:0', grad_fn=<NllLossBackward>)



 18%|█▊        | 87/471 [10:56<48:16,  7.54s/it][A

loss: tensor(0.6777, device='cuda:0', grad_fn=<NllLossBackward>)



 19%|█▊        | 88/471 [11:04<47:57,  7.51s/it][A

loss: tensor(0.7107, device='cuda:0', grad_fn=<NllLossBackward>)



 19%|█▉        | 89/471 [11:11<47:52,  7.52s/it][A

loss: tensor(0.6431, device='cuda:0', grad_fn=<NllLossBackward>)



 19%|█▉        | 90/471 [11:19<47:53,  7.54s/it][A

loss: tensor(0.7383, device='cuda:0', grad_fn=<NllLossBackward>)



 19%|█▉        | 91/471 [11:27<47:55,  7.57s/it][A

loss: tensor(0.6605, device='cuda:0', grad_fn=<NllLossBackward>)



 20%|█▉        | 92/471 [11:34<47:44,  7.56s/it][A

loss: tensor(0.7240, device='cuda:0', grad_fn=<NllLossBackward>)



 20%|█▉        | 93/471 [11:42<47:39,  7.56s/it][A

loss: tensor(0.6302, device='cuda:0', grad_fn=<NllLossBackward>)



 20%|█▉        | 94/471 [11:49<47:38,  7.58s/it][A

loss: tensor(0.6483, device='cuda:0', grad_fn=<NllLossBackward>)



 20%|██        | 95/471 [11:57<47:22,  7.56s/it][A

loss: tensor(0.6455, device='cuda:0', grad_fn=<NllLossBackward>)



 20%|██        | 96/471 [12:04<47:21,  7.58s/it][A

loss: tensor(0.6407, device='cuda:0', grad_fn=<NllLossBackward>)



 21%|██        | 97/471 [12:12<47:19,  7.59s/it][A

loss: tensor(0.6402, device='cuda:0', grad_fn=<NllLossBackward>)



 21%|██        | 98/471 [12:20<47:03,  7.57s/it][A

loss: tensor(0.7351, device='cuda:0', grad_fn=<NllLossBackward>)



 21%|██        | 99/471 [12:27<46:39,  7.53s/it][A

loss: tensor(0.6381, device='cuda:0', grad_fn=<NllLossBackward>)



 21%|██        | 100/471 [12:35<46:30,  7.52s/it][A

loss: tensor(0.6791, device='cuda:0', grad_fn=<NllLossBackward>)



 21%|██▏       | 101/471 [12:42<46:22,  7.52s/it][A

loss: tensor(0.7197, device='cuda:0', grad_fn=<NllLossBackward>)



 22%|██▏       | 102/471 [12:50<46:16,  7.53s/it][A

loss: tensor(0.7247, device='cuda:0', grad_fn=<NllLossBackward>)



 22%|██▏       | 103/471 [12:57<46:02,  7.51s/it][A

loss: tensor(0.7036, device='cuda:0', grad_fn=<NllLossBackward>)



 22%|██▏       | 104/471 [13:05<46:02,  7.53s/it][A

loss: tensor(0.6902, device='cuda:0', grad_fn=<NllLossBackward>)



 22%|██▏       | 105/471 [13:12<46:07,  7.56s/it][A

loss: tensor(0.6693, device='cuda:0', grad_fn=<NllLossBackward>)



 23%|██▎       | 106/471 [13:20<46:07,  7.58s/it][A

loss: tensor(0.6306, device='cuda:0', grad_fn=<NllLossBackward>)



 23%|██▎       | 107/471 [13:27<45:54,  7.57s/it][A

loss: tensor(0.6635, device='cuda:0', grad_fn=<NllLossBackward>)



 23%|██▎       | 108/471 [13:35<45:50,  7.58s/it][A

loss: tensor(0.7268, device='cuda:0', grad_fn=<NllLossBackward>)



 23%|██▎       | 109/471 [13:43<45:42,  7.58s/it][A

loss: tensor(0.7359, device='cuda:0', grad_fn=<NllLossBackward>)



 23%|██▎       | 110/471 [13:50<45:25,  7.55s/it][A

loss: tensor(0.7001, device='cuda:0', grad_fn=<NllLossBackward>)



 24%|██▎       | 111/471 [13:58<45:19,  7.55s/it][A

loss: tensor(0.7014, device='cuda:0', grad_fn=<NllLossBackward>)



 24%|██▍       | 112/471 [14:05<45:09,  7.55s/it][A

loss: tensor(0.6180, device='cuda:0', grad_fn=<NllLossBackward>)



 24%|██▍       | 113/471 [14:13<45:01,  7.55s/it][A

loss: tensor(0.7419, device='cuda:0', grad_fn=<NllLossBackward>)



 24%|██▍       | 114/471 [14:20<44:40,  7.51s/it][A

loss: tensor(0.7360, device='cuda:0', grad_fn=<NllLossBackward>)



 24%|██▍       | 115/471 [14:28<44:34,  7.51s/it][A

loss: tensor(0.6415, device='cuda:0', grad_fn=<NllLossBackward>)



 25%|██▍       | 116/471 [14:35<44:26,  7.51s/it][A

loss: tensor(0.7404, device='cuda:0', grad_fn=<NllLossBackward>)



 25%|██▍       | 117/471 [14:43<44:17,  7.51s/it][A

loss: tensor(0.7339, device='cuda:0', grad_fn=<NllLossBackward>)



 25%|██▌       | 118/471 [14:50<44:01,  7.48s/it][A

loss: tensor(0.6809, device='cuda:0', grad_fn=<NllLossBackward>)



 25%|██▌       | 119/471 [14:58<43:53,  7.48s/it][A

loss: tensor(0.6988, device='cuda:0', grad_fn=<NllLossBackward>)



 25%|██▌       | 120/471 [15:05<43:48,  7.49s/it][A

loss: tensor(0.6385, device='cuda:0', grad_fn=<NllLossBackward>)



 26%|██▌       | 121/471 [15:13<43:43,  7.50s/it][A

loss: tensor(0.6319, device='cuda:0', grad_fn=<NllLossBackward>)


  _warn_prf(average, modifier, msg_start, len(result))

 26%|██▌       | 122/471 [15:20<43:51,  7.54s/it][A

loss: tensor(0.5984, device='cuda:0', grad_fn=<NllLossBackward>)



 26%|██▌       | 123/471 [15:28<43:45,  7.54s/it][A

loss: tensor(0.7525, device='cuda:0', grad_fn=<NllLossBackward>)



 26%|██▋       | 124/471 [15:35<43:41,  7.56s/it][A

loss: tensor(0.6693, device='cuda:0', grad_fn=<NllLossBackward>)



 27%|██▋       | 125/471 [15:43<43:27,  7.54s/it][A

loss: tensor(0.6950, device='cuda:0', grad_fn=<NllLossBackward>)



 27%|██▋       | 126/471 [15:51<43:27,  7.56s/it][A

loss: tensor(0.6271, device='cuda:0', grad_fn=<NllLossBackward>)



 27%|██▋       | 127/471 [15:58<43:26,  7.58s/it][A

loss: tensor(0.8321, device='cuda:0', grad_fn=<NllLossBackward>)



 27%|██▋       | 128/471 [16:06<43:22,  7.59s/it][A

loss: tensor(0.6844, device='cuda:0', grad_fn=<NllLossBackward>)



 27%|██▋       | 129/471 [16:13<43:09,  7.57s/it][A

loss: tensor(0.6921, device='cuda:0', grad_fn=<NllLossBackward>)



 28%|██▊       | 130/471 [16:21<43:05,  7.58s/it][A

loss: tensor(0.7009, device='cuda:0', grad_fn=<NllLossBackward>)



 28%|██▊       | 131/471 [16:29<43:00,  7.59s/it][A

loss: tensor(0.6393, device='cuda:0', grad_fn=<NllLossBackward>)



 28%|██▊       | 132/471 [16:36<42:54,  7.59s/it][A

loss: tensor(0.7710, device='cuda:0', grad_fn=<NllLossBackward>)



 28%|██▊       | 133/471 [16:44<42:32,  7.55s/it][A

loss: tensor(0.7660, device='cuda:0', grad_fn=<NllLossBackward>)



 28%|██▊       | 134/471 [16:51<42:37,  7.59s/it][A

loss: tensor(0.8100, device='cuda:0', grad_fn=<NllLossBackward>)



 29%|██▊       | 135/471 [16:59<42:35,  7.61s/it][A

loss: tensor(0.7551, device='cuda:0', grad_fn=<NllLossBackward>)



 29%|██▉       | 136/471 [17:06<42:12,  7.56s/it][A

loss: tensor(0.5759, device='cuda:0', grad_fn=<NllLossBackward>)



 29%|██▉       | 137/471 [17:14<41:56,  7.53s/it][A

loss: tensor(0.5969, device='cuda:0', grad_fn=<NllLossBackward>)



 29%|██▉       | 138/471 [17:21<41:42,  7.52s/it][A

loss: tensor(0.7243, device='cuda:0', grad_fn=<NllLossBackward>)



 30%|██▉       | 139/471 [17:29<41:33,  7.51s/it][A

loss: tensor(0.7923, device='cuda:0', grad_fn=<NllLossBackward>)



 30%|██▉       | 140/471 [17:36<41:11,  7.47s/it][A

loss: tensor(0.6321, device='cuda:0', grad_fn=<NllLossBackward>)



 30%|██▉       | 141/471 [17:44<41:04,  7.47s/it][A

loss: tensor(0.8255, device='cuda:0', grad_fn=<NllLossBackward>)



 30%|███       | 142/471 [17:51<40:55,  7.46s/it][A

loss: tensor(0.6829, device='cuda:0', grad_fn=<NllLossBackward>)



 30%|███       | 143/471 [17:59<40:50,  7.47s/it][A

loss: tensor(0.6608, device='cuda:0', grad_fn=<NllLossBackward>)



 31%|███       | 144/471 [18:06<40:37,  7.45s/it][A

loss: tensor(0.7154, device='cuda:0', grad_fn=<NllLossBackward>)



 31%|███       | 145/471 [18:13<40:34,  7.47s/it][A

loss: tensor(0.6284, device='cuda:0', grad_fn=<NllLossBackward>)



 31%|███       | 146/471 [18:21<40:30,  7.48s/it][A

loss: tensor(0.5711, device='cuda:0', grad_fn=<NllLossBackward>)



 31%|███       | 147/471 [18:28<40:23,  7.48s/it][A

loss: tensor(0.6312, device='cuda:0', grad_fn=<NllLossBackward>)



 31%|███▏      | 148/471 [18:36<40:08,  7.46s/it][A

loss: tensor(0.6234, device='cuda:0', grad_fn=<NllLossBackward>)



 32%|███▏      | 149/471 [18:43<40:12,  7.49s/it][A

loss: tensor(0.7339, device='cuda:0', grad_fn=<NllLossBackward>)



 32%|███▏      | 150/471 [18:51<40:22,  7.55s/it][A

loss: tensor(0.6254, device='cuda:0', grad_fn=<NllLossBackward>)



 32%|███▏      | 151/471 [18:59<40:31,  7.60s/it][A

loss: tensor(0.6018, device='cuda:0', grad_fn=<NllLossBackward>)



 32%|███▏      | 152/471 [19:07<40:43,  7.66s/it][A

loss: tensor(0.6100, device='cuda:0', grad_fn=<NllLossBackward>)



 32%|███▏      | 153/471 [19:14<40:26,  7.63s/it][A

loss: tensor(0.7010, device='cuda:0', grad_fn=<NllLossBackward>)



 33%|███▎      | 154/471 [19:22<40:12,  7.61s/it][A

loss: tensor(0.6191, device='cuda:0', grad_fn=<NllLossBackward>)



 33%|███▎      | 155/471 [19:29<39:49,  7.56s/it][A

loss: tensor(0.6573, device='cuda:0', grad_fn=<NllLossBackward>)



 33%|███▎      | 156/471 [19:37<39:36,  7.55s/it][A

loss: tensor(0.6253, device='cuda:0', grad_fn=<NllLossBackward>)



 33%|███▎      | 157/471 [19:44<39:22,  7.52s/it][A

loss: tensor(0.6954, device='cuda:0', grad_fn=<NllLossBackward>)



 34%|███▎      | 158/471 [19:52<39:09,  7.51s/it][A

loss: tensor(0.7261, device='cuda:0', grad_fn=<NllLossBackward>)



 34%|███▍      | 159/471 [19:59<38:49,  7.47s/it][A

loss: tensor(0.6845, device='cuda:0', grad_fn=<NllLossBackward>)



 34%|███▍      | 160/471 [20:06<38:41,  7.46s/it][A

loss: tensor(0.6226, device='cuda:0', grad_fn=<NllLossBackward>)



 34%|███▍      | 161/471 [20:14<38:48,  7.51s/it][A

loss: tensor(0.7160, device='cuda:0', grad_fn=<NllLossBackward>)



 34%|███▍      | 162/471 [20:22<38:44,  7.52s/it][A

loss: tensor(0.7390, device='cuda:0', grad_fn=<NllLossBackward>)



 35%|███▍      | 163/471 [20:29<38:48,  7.56s/it][A

loss: tensor(0.5631, device='cuda:0', grad_fn=<NllLossBackward>)



 35%|███▍      | 164/471 [20:37<38:49,  7.59s/it][A

loss: tensor(0.6867, device='cuda:0', grad_fn=<NllLossBackward>)



 35%|███▌      | 165/471 [20:45<38:50,  7.61s/it][A

loss: tensor(0.7102, device='cuda:0', grad_fn=<NllLossBackward>)



 35%|███▌      | 166/471 [20:52<38:35,  7.59s/it][A

loss: tensor(0.6793, device='cuda:0', grad_fn=<NllLossBackward>)



 35%|███▌      | 167/471 [21:00<38:30,  7.60s/it][A

loss: tensor(0.7017, device='cuda:0', grad_fn=<NllLossBackward>)



 36%|███▌      | 168/471 [21:07<38:23,  7.60s/it][A

loss: tensor(0.7142, device='cuda:0', grad_fn=<NllLossBackward>)



 36%|███▌      | 169/471 [21:15<38:18,  7.61s/it][A

loss: tensor(0.7438, device='cuda:0', grad_fn=<NllLossBackward>)



 36%|███▌      | 170/471 [21:23<38:02,  7.58s/it][A

loss: tensor(0.7363, device='cuda:0', grad_fn=<NllLossBackward>)



 36%|███▋      | 171/471 [21:30<37:57,  7.59s/it][A

loss: tensor(0.6708, device='cuda:0', grad_fn=<NllLossBackward>)



 37%|███▋      | 172/471 [21:38<37:48,  7.59s/it][A

loss: tensor(0.5803, device='cuda:0', grad_fn=<NllLossBackward>)



 37%|███▋      | 173/471 [21:45<37:42,  7.59s/it][A

loss: tensor(0.7083, device='cuda:0', grad_fn=<NllLossBackward>)



 37%|███▋      | 174/471 [21:53<37:28,  7.57s/it][A

loss: tensor(0.6548, device='cuda:0', grad_fn=<NllLossBackward>)



 37%|███▋      | 175/471 [22:00<37:23,  7.58s/it][A

loss: tensor(0.6744, device='cuda:0', grad_fn=<NllLossBackward>)



 37%|███▋      | 176/471 [22:08<37:18,  7.59s/it][A

loss: tensor(0.8548, device='cuda:0', grad_fn=<NllLossBackward>)



 38%|███▊      | 177/471 [22:16<37:04,  7.57s/it][A

loss: tensor(0.5852, device='cuda:0', grad_fn=<NllLossBackward>)



 38%|███▊      | 178/471 [22:23<36:52,  7.55s/it][A

loss: tensor(0.6081, device='cuda:0', grad_fn=<NllLossBackward>)



 38%|███▊      | 179/471 [22:31<36:40,  7.53s/it][A

loss: tensor(0.7230, device='cuda:0', grad_fn=<NllLossBackward>)



 38%|███▊      | 180/471 [22:38<36:30,  7.53s/it][A

loss: tensor(0.6971, device='cuda:0', grad_fn=<NllLossBackward>)



 38%|███▊      | 181/471 [22:45<36:10,  7.48s/it][A

loss: tensor(0.7706, device='cuda:0', grad_fn=<NllLossBackward>)



 39%|███▊      | 182/471 [22:53<36:01,  7.48s/it][A

loss: tensor(0.7897, device='cuda:0', grad_fn=<NllLossBackward>)



 39%|███▉      | 183/471 [23:00<35:51,  7.47s/it][A

loss: tensor(0.7954, device='cuda:0', grad_fn=<NllLossBackward>)



 39%|███▉      | 184/471 [23:08<35:46,  7.48s/it][A

loss: tensor(0.7212, device='cuda:0', grad_fn=<NllLossBackward>)



 39%|███▉      | 185/471 [23:15<35:34,  7.46s/it][A

loss: tensor(0.7944, device='cuda:0', grad_fn=<NllLossBackward>)



 39%|███▉      | 186/471 [23:23<35:36,  7.50s/it][A

loss: tensor(0.6246, device='cuda:0', grad_fn=<NllLossBackward>)



 40%|███▉      | 187/471 [23:31<35:40,  7.54s/it][A

loss: tensor(0.8277, device='cuda:0', grad_fn=<NllLossBackward>)



 40%|███▉      | 188/471 [23:38<35:42,  7.57s/it][A

loss: tensor(0.7549, device='cuda:0', grad_fn=<NllLossBackward>)



 40%|████      | 189/471 [23:46<35:34,  7.57s/it][A

loss: tensor(0.7765, device='cuda:0', grad_fn=<NllLossBackward>)



 40%|████      | 190/471 [23:53<35:31,  7.58s/it][A

loss: tensor(0.6520, device='cuda:0', grad_fn=<NllLossBackward>)



 41%|████      | 191/471 [24:01<35:16,  7.56s/it][A

loss: tensor(0.7974, device='cuda:0', grad_fn=<NllLossBackward>)



 41%|████      | 192/471 [24:08<34:54,  7.51s/it][A

loss: tensor(0.6068, device='cuda:0', grad_fn=<NllLossBackward>)



 41%|████      | 193/471 [24:16<34:48,  7.51s/it][A

loss: tensor(0.6629, device='cuda:0', grad_fn=<NllLossBackward>)



 41%|████      | 194/471 [24:23<34:39,  7.51s/it][A

loss: tensor(0.6614, device='cuda:0', grad_fn=<NllLossBackward>)



 41%|████▏     | 195/471 [24:31<34:35,  7.52s/it][A

loss: tensor(0.6655, device='cuda:0', grad_fn=<NllLossBackward>)



 42%|████▏     | 196/471 [24:38<34:21,  7.49s/it][A

loss: tensor(0.6469, device='cuda:0', grad_fn=<NllLossBackward>)



 42%|████▏     | 197/471 [24:46<34:16,  7.51s/it][A

loss: tensor(0.7642, device='cuda:0', grad_fn=<NllLossBackward>)



 42%|████▏     | 198/471 [24:53<34:11,  7.51s/it][A

loss: tensor(0.6429, device='cuda:0', grad_fn=<NllLossBackward>)



 42%|████▏     | 199/471 [25:01<34:08,  7.53s/it][A

loss: tensor(0.7312, device='cuda:0', grad_fn=<NllLossBackward>)



 42%|████▏     | 200/471 [25:08<33:57,  7.52s/it][A

loss: tensor(0.6319, device='cuda:0', grad_fn=<NllLossBackward>)



 43%|████▎     | 201/471 [25:16<33:52,  7.53s/it][A

loss: tensor(0.5818, device='cuda:0', grad_fn=<NllLossBackward>)



 43%|████▎     | 202/471 [25:24<33:49,  7.55s/it][A

loss: tensor(0.6682, device='cuda:0', grad_fn=<NllLossBackward>)



 43%|████▎     | 203/471 [25:31<33:40,  7.54s/it][A

loss: tensor(0.7066, device='cuda:0', grad_fn=<NllLossBackward>)



 43%|████▎     | 204/471 [25:39<33:39,  7.56s/it][A

loss: tensor(0.7575, device='cuda:0', grad_fn=<NllLossBackward>)



 44%|████▎     | 205/471 [25:46<33:36,  7.58s/it][A

loss: tensor(0.7416, device='cuda:0', grad_fn=<NllLossBackward>)



 44%|████▎     | 206/471 [25:54<33:34,  7.60s/it][A

loss: tensor(0.7265, device='cuda:0', grad_fn=<NllLossBackward>)



 44%|████▍     | 207/471 [26:01<33:18,  7.57s/it][A

loss: tensor(0.6729, device='cuda:0', grad_fn=<NllLossBackward>)



 44%|████▍     | 208/471 [26:09<33:10,  7.57s/it][A

loss: tensor(0.6242, device='cuda:0', grad_fn=<NllLossBackward>)



 44%|████▍     | 209/471 [26:17<33:02,  7.57s/it][A

loss: tensor(0.5583, device='cuda:0', grad_fn=<NllLossBackward>)



 45%|████▍     | 210/471 [26:24<32:55,  7.57s/it][A

loss: tensor(0.5541, device='cuda:0', grad_fn=<NllLossBackward>)



 45%|████▍     | 211/471 [26:32<32:41,  7.54s/it][A

loss: tensor(0.5927, device='cuda:0', grad_fn=<NllLossBackward>)



 45%|████▌     | 212/471 [26:39<32:36,  7.56s/it][A

loss: tensor(0.8011, device='cuda:0', grad_fn=<NllLossBackward>)



 45%|████▌     | 213/471 [26:47<32:32,  7.57s/it][A

loss: tensor(0.7700, device='cuda:0', grad_fn=<NllLossBackward>)



 45%|████▌     | 214/471 [26:54<32:24,  7.57s/it][A

loss: tensor(0.7635, device='cuda:0', grad_fn=<NllLossBackward>)



 46%|████▌     | 215/471 [27:02<32:05,  7.52s/it][A

loss: tensor(0.6326, device='cuda:0', grad_fn=<NllLossBackward>)



 46%|████▌     | 216/471 [27:09<31:56,  7.51s/it][A

loss: tensor(0.5845, device='cuda:0', grad_fn=<NllLossBackward>)



 46%|████▌     | 217/471 [27:17<31:50,  7.52s/it][A

loss: tensor(0.6882, device='cuda:0', grad_fn=<NllLossBackward>)



 46%|████▋     | 218/471 [27:24<31:34,  7.49s/it][A

loss: tensor(0.6407, device='cuda:0', grad_fn=<NllLossBackward>)



 46%|████▋     | 219/471 [27:32<31:28,  7.49s/it][A

loss: tensor(0.5382, device='cuda:0', grad_fn=<NllLossBackward>)



 47%|████▋     | 220/471 [27:39<31:20,  7.49s/it][A

loss: tensor(0.7429, device='cuda:0', grad_fn=<NllLossBackward>)



 47%|████▋     | 221/471 [27:47<31:13,  7.49s/it][A

loss: tensor(0.7109, device='cuda:0', grad_fn=<NllLossBackward>)



 47%|████▋     | 222/471 [27:54<30:58,  7.46s/it][A

loss: tensor(0.5841, device='cuda:0', grad_fn=<NllLossBackward>)



 47%|████▋     | 223/471 [28:02<30:51,  7.47s/it][A

loss: tensor(0.6119, device='cuda:0', grad_fn=<NllLossBackward>)



 48%|████▊     | 224/471 [28:09<30:43,  7.47s/it][A

loss: tensor(0.7105, device='cuda:0', grad_fn=<NllLossBackward>)



 48%|████▊     | 225/471 [28:17<30:38,  7.47s/it][A

loss: tensor(0.6919, device='cuda:0', grad_fn=<NllLossBackward>)



 48%|████▊     | 226/471 [28:24<30:33,  7.48s/it][A

loss: tensor(0.7630, device='cuda:0', grad_fn=<NllLossBackward>)



 48%|████▊     | 227/471 [28:32<30:34,  7.52s/it][A

loss: tensor(0.5291, device='cuda:0', grad_fn=<NllLossBackward>)



 48%|████▊     | 228/471 [28:39<30:27,  7.52s/it][A

loss: tensor(0.8057, device='cuda:0', grad_fn=<NllLossBackward>)



 49%|████▊     | 229/471 [28:47<30:19,  7.52s/it][A

loss: tensor(0.7032, device='cuda:0', grad_fn=<NllLossBackward>)



 49%|████▉     | 230/471 [28:54<30:03,  7.48s/it][A

loss: tensor(0.8077, device='cuda:0', grad_fn=<NllLossBackward>)



 49%|████▉     | 231/471 [29:02<29:54,  7.48s/it][A

loss: tensor(0.5415, device='cuda:0', grad_fn=<NllLossBackward>)



 49%|████▉     | 232/471 [29:09<29:49,  7.49s/it][A

loss: tensor(0.7652, device='cuda:0', grad_fn=<NllLossBackward>)



 49%|████▉     | 233/471 [29:16<29:34,  7.46s/it][A

loss: tensor(0.7315, device='cuda:0', grad_fn=<NllLossBackward>)



 50%|████▉     | 234/471 [29:24<29:29,  7.47s/it][A

loss: tensor(0.7853, device='cuda:0', grad_fn=<NllLossBackward>)



 50%|████▉     | 235/471 [29:31<29:22,  7.47s/it][A

loss: tensor(0.6950, device='cuda:0', grad_fn=<NllLossBackward>)



 50%|█████     | 236/471 [29:39<29:16,  7.47s/it][A

loss: tensor(0.7757, device='cuda:0', grad_fn=<NllLossBackward>)



 50%|█████     | 237/471 [29:46<29:02,  7.45s/it][A

loss: tensor(0.5929, device='cuda:0', grad_fn=<NllLossBackward>)



 51%|█████     | 238/471 [29:54<28:55,  7.45s/it][A

loss: tensor(0.6741, device='cuda:0', grad_fn=<NllLossBackward>)



 51%|█████     | 239/471 [30:01<28:49,  7.45s/it][A

loss: tensor(0.7545, device='cuda:0', grad_fn=<NllLossBackward>)



 51%|█████     | 240/471 [30:09<28:41,  7.45s/it][A

loss: tensor(0.6736, device='cuda:0', grad_fn=<NllLossBackward>)



 51%|█████     | 241/471 [30:16<28:28,  7.43s/it][A

loss: tensor(0.6759, device='cuda:0', grad_fn=<NllLossBackward>)



 51%|█████▏    | 242/471 [30:23<28:22,  7.43s/it][A

loss: tensor(0.5814, device='cuda:0', grad_fn=<NllLossBackward>)



 52%|█████▏    | 243/471 [30:31<28:17,  7.44s/it][A

loss: tensor(0.7363, device='cuda:0', grad_fn=<NllLossBackward>)



 52%|█████▏    | 244/471 [30:38<28:06,  7.43s/it][A

loss: tensor(0.6460, device='cuda:0', grad_fn=<NllLossBackward>)



 52%|█████▏    | 245/471 [30:46<28:05,  7.46s/it][A

loss: tensor(0.5592, device='cuda:0', grad_fn=<NllLossBackward>)



 52%|█████▏    | 246/471 [30:53<28:01,  7.47s/it][A

loss: tensor(0.7056, device='cuda:0', grad_fn=<NllLossBackward>)



 52%|█████▏    | 247/471 [31:01<27:58,  7.49s/it][A

loss: tensor(0.6526, device='cuda:0', grad_fn=<NllLossBackward>)



 53%|█████▎    | 248/471 [31:08<27:45,  7.47s/it][A

loss: tensor(0.7433, device='cuda:0', grad_fn=<NllLossBackward>)



 53%|█████▎    | 249/471 [31:16<27:43,  7.49s/it][A

loss: tensor(0.7588, device='cuda:0', grad_fn=<NllLossBackward>)



 53%|█████▎    | 250/471 [31:23<27:35,  7.49s/it][A

loss: tensor(0.5790, device='cuda:0', grad_fn=<NllLossBackward>)



 53%|█████▎    | 251/471 [31:31<27:29,  7.50s/it][A

loss: tensor(0.7347, device='cuda:0', grad_fn=<NllLossBackward>)



 54%|█████▎    | 252/471 [31:38<27:16,  7.47s/it][A

loss: tensor(0.6464, device='cuda:0', grad_fn=<NllLossBackward>)



 54%|█████▎    | 253/471 [31:46<27:10,  7.48s/it][A

loss: tensor(0.6564, device='cuda:0', grad_fn=<NllLossBackward>)



 54%|█████▍    | 254/471 [31:53<27:05,  7.49s/it][A

loss: tensor(0.6991, device='cuda:0', grad_fn=<NllLossBackward>)



 54%|█████▍    | 255/471 [32:01<26:58,  7.49s/it][A

loss: tensor(0.5878, device='cuda:0', grad_fn=<NllLossBackward>)



 54%|█████▍    | 256/471 [32:08<26:45,  7.47s/it][A

loss: tensor(0.7572, device='cuda:0', grad_fn=<NllLossBackward>)



 55%|█████▍    | 257/471 [32:16<26:38,  7.47s/it][A

loss: tensor(0.7271, device='cuda:0', grad_fn=<NllLossBackward>)



 55%|█████▍    | 258/471 [32:23<26:33,  7.48s/it][A

loss: tensor(0.5161, device='cuda:0', grad_fn=<NllLossBackward>)



 55%|█████▍    | 259/471 [32:31<26:20,  7.46s/it][A

loss: tensor(0.5700, device='cuda:0', grad_fn=<NllLossBackward>)



 55%|█████▌    | 260/471 [32:38<26:15,  7.47s/it][A

loss: tensor(0.5409, device='cuda:0', grad_fn=<NllLossBackward>)



 55%|█████▌    | 261/471 [32:46<26:09,  7.47s/it][A

loss: tensor(0.7864, device='cuda:0', grad_fn=<NllLossBackward>)



 56%|█████▌    | 262/471 [32:53<26:02,  7.48s/it][A

loss: tensor(0.7465, device='cuda:0', grad_fn=<NllLossBackward>)



 56%|█████▌    | 263/471 [33:00<25:51,  7.46s/it][A

loss: tensor(0.7158, device='cuda:0', grad_fn=<NllLossBackward>)



 56%|█████▌    | 264/471 [33:08<25:46,  7.47s/it][A

loss: tensor(0.7543, device='cuda:0', grad_fn=<NllLossBackward>)



 56%|█████▋    | 265/471 [33:15<25:41,  7.48s/it][A

loss: tensor(0.5310, device='cuda:0', grad_fn=<NllLossBackward>)



 56%|█████▋    | 266/471 [33:23<25:36,  7.49s/it][A

loss: tensor(0.7810, device='cuda:0', grad_fn=<NllLossBackward>)



 57%|█████▋    | 267/471 [33:30<25:25,  7.48s/it][A

loss: tensor(0.5322, device='cuda:0', grad_fn=<NllLossBackward>)



 57%|█████▋    | 268/471 [33:38<25:19,  7.49s/it][A

loss: tensor(0.6510, device='cuda:0', grad_fn=<NllLossBackward>)



 57%|█████▋    | 269/471 [33:45<25:13,  7.49s/it][A

loss: tensor(0.6609, device='cuda:0', grad_fn=<NllLossBackward>)



 57%|█████▋    | 270/471 [33:53<25:03,  7.48s/it][A

loss: tensor(0.7464, device='cuda:0', grad_fn=<NllLossBackward>)



 58%|█████▊    | 271/471 [34:00<24:53,  7.47s/it][A

loss: tensor(0.6383, device='cuda:0', grad_fn=<NllLossBackward>)



 58%|█████▊    | 272/471 [34:08<24:48,  7.48s/it][A

loss: tensor(0.7431, device='cuda:0', grad_fn=<NllLossBackward>)



 58%|█████▊    | 273/471 [34:15<24:42,  7.49s/it][A

loss: tensor(0.7429, device='cuda:0', grad_fn=<NllLossBackward>)



 58%|█████▊    | 274/471 [34:23<24:30,  7.46s/it][A

loss: tensor(0.9520, device='cuda:0', grad_fn=<NllLossBackward>)



 58%|█████▊    | 275/471 [34:30<24:25,  7.48s/it][A

loss: tensor(0.7124, device='cuda:0', grad_fn=<NllLossBackward>)



 59%|█████▊    | 276/471 [34:38<24:19,  7.48s/it][A

loss: tensor(0.6109, device='cuda:0', grad_fn=<NllLossBackward>)



 59%|█████▉    | 277/471 [34:45<24:13,  7.49s/it][A

loss: tensor(0.7041, device='cuda:0', grad_fn=<NllLossBackward>)



 59%|█████▉    | 278/471 [34:53<24:00,  7.46s/it][A

loss: tensor(0.6441, device='cuda:0', grad_fn=<NllLossBackward>)



 59%|█████▉    | 279/471 [35:00<23:54,  7.47s/it][A

loss: tensor(0.8166, device='cuda:0', grad_fn=<NllLossBackward>)



 59%|█████▉    | 280/471 [35:08<23:48,  7.48s/it][A

loss: tensor(0.7788, device='cuda:0', grad_fn=<NllLossBackward>)



 60%|█████▉    | 281/471 [35:15<23:42,  7.49s/it][A

loss: tensor(0.5568, device='cuda:0', grad_fn=<NllLossBackward>)



 60%|█████▉    | 282/471 [35:23<23:32,  7.47s/it][A

loss: tensor(0.7194, device='cuda:0', grad_fn=<NllLossBackward>)



 60%|██████    | 283/471 [35:30<23:28,  7.49s/it][A

loss: tensor(0.5450, device='cuda:0', grad_fn=<NllLossBackward>)



 60%|██████    | 284/471 [35:38<23:25,  7.52s/it][A

loss: tensor(0.7069, device='cuda:0', grad_fn=<NllLossBackward>)



 61%|██████    | 285/471 [35:45<23:17,  7.51s/it][A

loss: tensor(0.5416, device='cuda:0', grad_fn=<NllLossBackward>)



 61%|██████    | 286/471 [35:53<23:15,  7.54s/it][A

loss: tensor(0.5743, device='cuda:0', grad_fn=<NllLossBackward>)



 61%|██████    | 287/471 [36:00<23:09,  7.55s/it][A

loss: tensor(0.6521, device='cuda:0', grad_fn=<NllLossBackward>)



 61%|██████    | 288/471 [36:08<23:02,  7.55s/it][A

loss: tensor(0.6341, device='cuda:0', grad_fn=<NllLossBackward>)



 61%|██████▏   | 289/471 [36:15<22:46,  7.51s/it][A

loss: tensor(0.6596, device='cuda:0', grad_fn=<NllLossBackward>)



 62%|██████▏   | 290/471 [36:23<22:40,  7.52s/it][A

loss: tensor(0.6577, device='cuda:0', grad_fn=<NllLossBackward>)



 62%|██████▏   | 291/471 [36:30<22:32,  7.51s/it][A

loss: tensor(0.6899, device='cuda:0', grad_fn=<NllLossBackward>)



 62%|██████▏   | 292/471 [36:38<22:24,  7.51s/it][A

loss: tensor(0.5628, device='cuda:0', grad_fn=<NllLossBackward>)



 62%|██████▏   | 293/471 [36:45<22:10,  7.48s/it][A

loss: tensor(0.7949, device='cuda:0', grad_fn=<NllLossBackward>)



 62%|██████▏   | 294/471 [36:53<22:05,  7.49s/it][A

loss: tensor(0.6739, device='cuda:0', grad_fn=<NllLossBackward>)



 63%|██████▎   | 295/471 [37:00<21:59,  7.49s/it][A

loss: tensor(0.7964, device='cuda:0', grad_fn=<NllLossBackward>)



 63%|██████▎   | 296/471 [37:08<21:50,  7.49s/it][A

loss: tensor(0.6659, device='cuda:0', grad_fn=<NllLossBackward>)



 63%|██████▎   | 297/471 [37:15<21:37,  7.46s/it][A

loss: tensor(0.8029, device='cuda:0', grad_fn=<NllLossBackward>)



 63%|██████▎   | 298/471 [37:23<21:31,  7.47s/it][A

loss: tensor(0.7762, device='cuda:0', grad_fn=<NllLossBackward>)



 63%|██████▎   | 299/471 [37:30<21:29,  7.50s/it][A

loss: tensor(0.5932, device='cuda:0', grad_fn=<NllLossBackward>)



 64%|██████▎   | 300/471 [37:38<21:22,  7.50s/it][A

loss: tensor(0.7463, device='cuda:0', grad_fn=<NllLossBackward>)



 64%|██████▍   | 301/471 [37:45<21:21,  7.54s/it][A

loss: tensor(0.6245, device='cuda:0', grad_fn=<NllLossBackward>)



 64%|██████▍   | 302/471 [37:53<21:17,  7.56s/it][A

loss: tensor(0.6564, device='cuda:0', grad_fn=<NllLossBackward>)



 64%|██████▍   | 303/471 [38:01<21:12,  7.58s/it][A

loss: tensor(0.6913, device='cuda:0', grad_fn=<NllLossBackward>)



 65%|██████▍   | 304/471 [38:08<21:01,  7.56s/it][A

loss: tensor(0.7806, device='cuda:0', grad_fn=<NllLossBackward>)



 65%|██████▍   | 305/471 [38:16<20:51,  7.54s/it][A

loss: tensor(0.8132, device='cuda:0', grad_fn=<NllLossBackward>)



 65%|██████▍   | 306/471 [38:23<20:41,  7.52s/it][A

loss: tensor(0.6518, device='cuda:0', grad_fn=<NllLossBackward>)



 65%|██████▌   | 307/471 [38:31<20:33,  7.52s/it][A

loss: tensor(0.7488, device='cuda:0', grad_fn=<NllLossBackward>)



 65%|██████▌   | 308/471 [38:38<20:19,  7.48s/it][A

loss: tensor(0.7538, device='cuda:0', grad_fn=<NllLossBackward>)



 66%|██████▌   | 309/471 [38:45<20:12,  7.48s/it][A

loss: tensor(0.7083, device='cuda:0', grad_fn=<NllLossBackward>)



 66%|██████▌   | 310/471 [38:53<20:07,  7.50s/it][A

loss: tensor(0.7425, device='cuda:0', grad_fn=<NllLossBackward>)



 66%|██████▌   | 311/471 [39:01<20:01,  7.51s/it][A

loss: tensor(0.7070, device='cuda:0', grad_fn=<NllLossBackward>)



 66%|██████▌   | 312/471 [39:08<19:49,  7.48s/it][A

loss: tensor(0.7062, device='cuda:0', grad_fn=<NllLossBackward>)



 66%|██████▋   | 313/471 [39:15<19:43,  7.49s/it][A

loss: tensor(0.7575, device='cuda:0', grad_fn=<NllLossBackward>)



 67%|██████▋   | 314/471 [39:23<19:36,  7.49s/it][A

loss: tensor(0.6980, device='cuda:0', grad_fn=<NllLossBackward>)



 67%|██████▋   | 315/471 [39:30<19:25,  7.47s/it][A

loss: tensor(0.7392, device='cuda:0', grad_fn=<NllLossBackward>)



 67%|██████▋   | 316/471 [39:38<19:18,  7.48s/it][A

loss: tensor(0.7067, device='cuda:0', grad_fn=<NllLossBackward>)



 67%|██████▋   | 317/471 [39:45<19:12,  7.48s/it][A

loss: tensor(0.7295, device='cuda:0', grad_fn=<NllLossBackward>)



 68%|██████▊   | 318/471 [39:53<19:06,  7.49s/it][A

loss: tensor(0.7523, device='cuda:0', grad_fn=<NllLossBackward>)



 68%|██████▊   | 319/471 [40:00<18:54,  7.46s/it][A

loss: tensor(0.7408, device='cuda:0', grad_fn=<NllLossBackward>)



 68%|██████▊   | 320/471 [40:08<18:48,  7.47s/it][A

loss: tensor(0.7757, device='cuda:0', grad_fn=<NllLossBackward>)



 68%|██████▊   | 321/471 [40:15<18:42,  7.48s/it][A

loss: tensor(0.5942, device='cuda:0', grad_fn=<NllLossBackward>)



 68%|██████▊   | 322/471 [40:23<18:36,  7.49s/it][A

loss: tensor(0.7029, device='cuda:0', grad_fn=<NllLossBackward>)



 69%|██████▊   | 323/471 [40:30<18:24,  7.46s/it][A

loss: tensor(0.6963, device='cuda:0', grad_fn=<NllLossBackward>)



 69%|██████▉   | 324/471 [40:38<18:17,  7.47s/it][A

loss: tensor(0.6213, device='cuda:0', grad_fn=<NllLossBackward>)



 69%|██████▉   | 325/471 [40:45<18:09,  7.46s/it][A

loss: tensor(0.7212, device='cuda:0', grad_fn=<NllLossBackward>)



 69%|██████▉   | 326/471 [40:53<17:58,  7.44s/it][A

loss: tensor(0.7418, device='cuda:0', grad_fn=<NllLossBackward>)



 69%|██████▉   | 327/471 [41:00<17:53,  7.46s/it][A

loss: tensor(0.7201, device='cuda:0', grad_fn=<NllLossBackward>)



 70%|██████▉   | 328/471 [41:07<17:47,  7.46s/it][A

loss: tensor(0.6899, device='cuda:0', grad_fn=<NllLossBackward>)



 70%|██████▉   | 329/471 [41:15<17:51,  7.55s/it][A

loss: tensor(0.6977, device='cuda:0', grad_fn=<NllLossBackward>)



 70%|███████   | 330/471 [41:23<17:46,  7.57s/it][A

loss: tensor(0.7478, device='cuda:0', grad_fn=<NllLossBackward>)



 70%|███████   | 331/471 [41:31<17:44,  7.60s/it][A

loss: tensor(0.7430, device='cuda:0', grad_fn=<NllLossBackward>)



 70%|███████   | 332/471 [41:38<17:38,  7.62s/it][A

loss: tensor(0.6814, device='cuda:0', grad_fn=<NllLossBackward>)



 71%|███████   | 333/471 [41:46<17:32,  7.62s/it][A

loss: tensor(0.6622, device='cuda:0', grad_fn=<NllLossBackward>)



 71%|███████   | 334/471 [41:53<17:15,  7.56s/it][A

loss: tensor(0.6123, device='cuda:0', grad_fn=<NllLossBackward>)



 71%|███████   | 335/471 [42:01<17:07,  7.56s/it][A

loss: tensor(0.7357, device='cuda:0', grad_fn=<NllLossBackward>)



 71%|███████▏  | 336/471 [42:08<17:04,  7.59s/it][A

loss: tensor(0.7378, device='cuda:0', grad_fn=<NllLossBackward>)



 72%|███████▏  | 337/471 [42:16<16:59,  7.61s/it][A

loss: tensor(0.6735, device='cuda:0', grad_fn=<NllLossBackward>)



 72%|███████▏  | 338/471 [42:24<16:49,  7.59s/it][A

loss: tensor(0.6449, device='cuda:0', grad_fn=<NllLossBackward>)



 72%|███████▏  | 339/471 [42:31<16:44,  7.61s/it][A

loss: tensor(0.6580, device='cuda:0', grad_fn=<NllLossBackward>)



 72%|███████▏  | 340/471 [42:39<16:38,  7.62s/it][A

loss: tensor(0.6291, device='cuda:0', grad_fn=<NllLossBackward>)



 72%|███████▏  | 341/471 [42:46<16:27,  7.60s/it][A

loss: tensor(0.6438, device='cuda:0', grad_fn=<NllLossBackward>)



 73%|███████▎  | 342/471 [42:54<16:17,  7.58s/it][A

loss: tensor(0.6187, device='cuda:0', grad_fn=<NllLossBackward>)



 73%|███████▎  | 343/471 [43:02<16:06,  7.55s/it][A

loss: tensor(0.6732, device='cuda:0', grad_fn=<NllLossBackward>)



 73%|███████▎  | 344/471 [43:09<15:56,  7.53s/it][A

loss: tensor(0.7128, device='cuda:0', grad_fn=<NllLossBackward>)



 73%|███████▎  | 345/471 [43:16<15:43,  7.48s/it][A

loss: tensor(0.6706, device='cuda:0', grad_fn=<NllLossBackward>)



 73%|███████▎  | 346/471 [43:24<15:34,  7.48s/it][A

loss: tensor(0.7653, device='cuda:0', grad_fn=<NllLossBackward>)



 74%|███████▎  | 347/471 [43:31<15:26,  7.47s/it][A

loss: tensor(0.7114, device='cuda:0', grad_fn=<NllLossBackward>)



 74%|███████▍  | 348/471 [43:39<15:19,  7.48s/it][A

loss: tensor(0.7161, device='cuda:0', grad_fn=<NllLossBackward>)



 74%|███████▍  | 349/471 [43:46<15:10,  7.46s/it][A

loss: tensor(0.6746, device='cuda:0', grad_fn=<NllLossBackward>)



 74%|███████▍  | 350/471 [43:54<15:04,  7.48s/it][A

loss: tensor(0.6249, device='cuda:0', grad_fn=<NllLossBackward>)



 75%|███████▍  | 351/471 [44:01<15:02,  7.52s/it][A

loss: tensor(0.6565, device='cuda:0', grad_fn=<NllLossBackward>)



 75%|███████▍  | 352/471 [44:09<14:55,  7.52s/it][A

loss: tensor(0.6540, device='cuda:0', grad_fn=<NllLossBackward>)



 75%|███████▍  | 353/471 [44:16<14:44,  7.50s/it][A

loss: tensor(0.7317, device='cuda:0', grad_fn=<NllLossBackward>)



 75%|███████▌  | 354/471 [44:24<14:37,  7.50s/it][A

loss: tensor(0.7045, device='cuda:0', grad_fn=<NllLossBackward>)



 75%|███████▌  | 355/471 [44:31<14:30,  7.50s/it][A

loss: tensor(0.6559, device='cuda:0', grad_fn=<NllLossBackward>)



 76%|███████▌  | 356/471 [44:39<14:23,  7.51s/it][A

loss: tensor(0.6429, device='cuda:0', grad_fn=<NllLossBackward>)



 76%|███████▌  | 357/471 [44:46<14:16,  7.52s/it][A

loss: tensor(0.7299, device='cuda:0', grad_fn=<NllLossBackward>)



 76%|███████▌  | 358/471 [44:54<14:09,  7.52s/it][A

loss: tensor(0.6775, device='cuda:0', grad_fn=<NllLossBackward>)



 76%|███████▌  | 359/471 [45:01<14:02,  7.53s/it][A

loss: tensor(0.6329, device='cuda:0', grad_fn=<NllLossBackward>)



 76%|███████▋  | 360/471 [45:09<13:54,  7.52s/it][A

loss: tensor(0.6442, device='cuda:0', grad_fn=<NllLossBackward>)



 77%|███████▋  | 361/471 [45:17<13:50,  7.55s/it][A

loss: tensor(0.6434, device='cuda:0', grad_fn=<NllLossBackward>)



 77%|███████▋  | 362/471 [45:24<13:42,  7.54s/it][A

loss: tensor(0.7462, device='cuda:0', grad_fn=<NllLossBackward>)



 77%|███████▋  | 363/471 [45:32<13:40,  7.60s/it][A

loss: tensor(0.6153, device='cuda:0', grad_fn=<NllLossBackward>)



 77%|███████▋  | 364/471 [45:40<13:35,  7.62s/it][A

loss: tensor(0.7500, device='cuda:0', grad_fn=<NllLossBackward>)



 77%|███████▋  | 365/471 [45:47<13:30,  7.65s/it][A

loss: tensor(0.6919, device='cuda:0', grad_fn=<NllLossBackward>)



 78%|███████▊  | 366/471 [45:55<13:24,  7.67s/it][A

loss: tensor(0.7335, device='cuda:0', grad_fn=<NllLossBackward>)



 78%|███████▊  | 367/471 [46:03<13:16,  7.66s/it][A

loss: tensor(0.6659, device='cuda:0', grad_fn=<NllLossBackward>)



 78%|███████▊  | 368/471 [46:10<13:11,  7.69s/it][A

loss: tensor(0.6993, device='cuda:0', grad_fn=<NllLossBackward>)



 78%|███████▊  | 369/471 [46:18<13:05,  7.70s/it][A

loss: tensor(0.7035, device='cuda:0', grad_fn=<NllLossBackward>)



 79%|███████▊  | 370/471 [46:26<13:00,  7.73s/it][A

loss: tensor(0.6646, device='cuda:0', grad_fn=<NllLossBackward>)



 79%|███████▉  | 371/471 [46:33<12:46,  7.67s/it][A

loss: tensor(0.6443, device='cuda:0', grad_fn=<NllLossBackward>)



 79%|███████▉  | 372/471 [46:41<12:34,  7.62s/it][A

loss: tensor(0.7800, device='cuda:0', grad_fn=<NllLossBackward>)



 79%|███████▉  | 373/471 [46:48<12:23,  7.59s/it][A

loss: tensor(0.6330, device='cuda:0', grad_fn=<NllLossBackward>)



 79%|███████▉  | 374/471 [46:56<12:14,  7.57s/it][A

loss: tensor(0.6216, device='cuda:0', grad_fn=<NllLossBackward>)



 80%|███████▉  | 375/471 [47:03<12:04,  7.54s/it][A

loss: tensor(0.7184, device='cuda:0', grad_fn=<NllLossBackward>)



 80%|███████▉  | 376/471 [47:11<11:59,  7.57s/it][A

loss: tensor(0.6868, device='cuda:0', grad_fn=<NllLossBackward>)



 80%|████████  | 377/471 [47:19<11:53,  7.59s/it][A

loss: tensor(0.7397, device='cuda:0', grad_fn=<NllLossBackward>)



 80%|████████  | 378/471 [47:26<11:46,  7.59s/it][A

loss: tensor(0.6457, device='cuda:0', grad_fn=<NllLossBackward>)



 80%|████████  | 379/471 [47:34<11:39,  7.61s/it][A

loss: tensor(0.6915, device='cuda:0', grad_fn=<NllLossBackward>)



 81%|████████  | 380/471 [47:42<11:36,  7.65s/it][A

loss: tensor(0.7354, device='cuda:0', grad_fn=<NllLossBackward>)



 81%|████████  | 381/471 [47:49<11:32,  7.70s/it][A

loss: tensor(0.6839, device='cuda:0', grad_fn=<NllLossBackward>)



 81%|████████  | 382/471 [47:57<11:21,  7.65s/it][A

loss: tensor(0.5884, device='cuda:0', grad_fn=<NllLossBackward>)



 81%|████████▏ | 383/471 [48:05<11:11,  7.63s/it][A

loss: tensor(0.7146, device='cuda:0', grad_fn=<NllLossBackward>)



 82%|████████▏ | 384/471 [48:12<11:00,  7.59s/it][A

loss: tensor(0.6611, device='cuda:0', grad_fn=<NllLossBackward>)



 82%|████████▏ | 385/471 [48:20<10:50,  7.57s/it][A

loss: tensor(0.7118, device='cuda:0', grad_fn=<NllLossBackward>)



 82%|████████▏ | 386/471 [48:27<10:39,  7.52s/it][A

loss: tensor(0.7057, device='cuda:0', grad_fn=<NllLossBackward>)



 82%|████████▏ | 387/471 [48:35<10:31,  7.51s/it][A

loss: tensor(0.6661, device='cuda:0', grad_fn=<NllLossBackward>)



 82%|████████▏ | 388/471 [48:42<10:23,  7.51s/it][A

loss: tensor(0.7377, device='cuda:0', grad_fn=<NllLossBackward>)



 83%|████████▎ | 389/471 [48:50<10:15,  7.50s/it][A

loss: tensor(0.6668, device='cuda:0', grad_fn=<NllLossBackward>)



 83%|████████▎ | 390/471 [48:57<10:06,  7.48s/it][A

loss: tensor(0.6913, device='cuda:0', grad_fn=<NllLossBackward>)



 83%|████████▎ | 391/471 [49:04<09:59,  7.50s/it][A

loss: tensor(0.7641, device='cuda:0', grad_fn=<NllLossBackward>)



 83%|████████▎ | 392/471 [49:12<09:52,  7.50s/it][A

loss: tensor(0.6494, device='cuda:0', grad_fn=<NllLossBackward>)



 83%|████████▎ | 393/471 [49:20<09:45,  7.51s/it][A

loss: tensor(0.6226, device='cuda:0', grad_fn=<NllLossBackward>)



 84%|████████▎ | 394/471 [49:27<09:36,  7.49s/it][A

loss: tensor(0.6706, device='cuda:0', grad_fn=<NllLossBackward>)



 84%|████████▍ | 395/471 [49:34<09:29,  7.49s/it][A

loss: tensor(0.6815, device='cuda:0', grad_fn=<NllLossBackward>)



 84%|████████▍ | 396/471 [49:42<09:22,  7.51s/it][A

loss: tensor(0.7317, device='cuda:0', grad_fn=<NllLossBackward>)



 84%|████████▍ | 397/471 [49:49<09:13,  7.48s/it][A

loss: tensor(0.7494, device='cuda:0', grad_fn=<NllLossBackward>)



 85%|████████▍ | 398/471 [49:57<09:06,  7.48s/it][A

loss: tensor(0.7153, device='cuda:0', grad_fn=<NllLossBackward>)



 85%|████████▍ | 399/471 [50:04<08:59,  7.49s/it][A

loss: tensor(0.7473, device='cuda:0', grad_fn=<NllLossBackward>)



 85%|████████▍ | 400/471 [50:12<08:52,  7.50s/it][A

loss: tensor(0.6651, device='cuda:0', grad_fn=<NllLossBackward>)



 85%|████████▌ | 401/471 [50:19<08:42,  7.47s/it][A

loss: tensor(0.6752, device='cuda:0', grad_fn=<NllLossBackward>)



 85%|████████▌ | 402/471 [50:27<08:36,  7.48s/it][A

loss: tensor(0.7114, device='cuda:0', grad_fn=<NllLossBackward>)



 86%|████████▌ | 403/471 [50:34<08:29,  7.49s/it][A

loss: tensor(0.7048, device='cuda:0', grad_fn=<NllLossBackward>)



 86%|████████▌ | 404/471 [50:42<08:21,  7.49s/it][A

loss: tensor(0.6515, device='cuda:0', grad_fn=<NllLossBackward>)



 86%|████████▌ | 405/471 [50:49<08:12,  7.46s/it][A

loss: tensor(0.7038, device='cuda:0', grad_fn=<NllLossBackward>)



 86%|████████▌ | 406/471 [50:57<08:04,  7.46s/it][A

loss: tensor(0.7114, device='cuda:0', grad_fn=<NllLossBackward>)



 86%|████████▋ | 407/471 [51:04<07:58,  7.47s/it][A

loss: tensor(0.5815, device='cuda:0', grad_fn=<NllLossBackward>)



 87%|████████▋ | 408/471 [51:12<07:49,  7.45s/it][A

loss: tensor(0.6846, device='cuda:0', grad_fn=<NllLossBackward>)



 87%|████████▋ | 409/471 [51:19<07:43,  7.47s/it][A

loss: tensor(0.7006, device='cuda:0', grad_fn=<NllLossBackward>)



 87%|████████▋ | 410/471 [51:27<07:36,  7.48s/it][A

loss: tensor(0.6192, device='cuda:0', grad_fn=<NllLossBackward>)



 87%|████████▋ | 411/471 [51:34<07:30,  7.51s/it][A

loss: tensor(0.6745, device='cuda:0', grad_fn=<NllLossBackward>)



 87%|████████▋ | 412/471 [51:42<07:22,  7.51s/it][A

loss: tensor(0.7735, device='cuda:0', grad_fn=<NllLossBackward>)



 88%|████████▊ | 413/471 [51:49<07:16,  7.52s/it][A

loss: tensor(0.7016, device='cuda:0', grad_fn=<NllLossBackward>)



 88%|████████▊ | 414/471 [51:57<07:08,  7.52s/it][A

loss: tensor(0.7090, device='cuda:0', grad_fn=<NllLossBackward>)



 88%|████████▊ | 415/471 [52:04<07:02,  7.54s/it][A

loss: tensor(0.6716, device='cuda:0', grad_fn=<NllLossBackward>)



 88%|████████▊ | 416/471 [52:12<06:52,  7.51s/it][A

loss: tensor(0.7174, device='cuda:0', grad_fn=<NllLossBackward>)



 89%|████████▊ | 417/471 [52:19<06:46,  7.52s/it][A

loss: tensor(0.6652, device='cuda:0', grad_fn=<NllLossBackward>)



 89%|████████▊ | 418/471 [52:27<06:39,  7.55s/it][A

loss: tensor(0.7565, device='cuda:0', grad_fn=<NllLossBackward>)



 89%|████████▉ | 419/471 [52:35<06:33,  7.57s/it][A

loss: tensor(0.7199, device='cuda:0', grad_fn=<NllLossBackward>)



 89%|████████▉ | 420/471 [52:42<06:25,  7.55s/it][A

loss: tensor(0.6643, device='cuda:0', grad_fn=<NllLossBackward>)



 89%|████████▉ | 421/471 [52:50<06:18,  7.57s/it][A

loss: tensor(0.6097, device='cuda:0', grad_fn=<NllLossBackward>)



 90%|████████▉ | 422/471 [52:57<06:11,  7.59s/it][A

loss: tensor(0.6068, device='cuda:0', grad_fn=<NllLossBackward>)



 90%|████████▉ | 423/471 [53:05<06:03,  7.57s/it][A

loss: tensor(0.6067, device='cuda:0', grad_fn=<NllLossBackward>)



 90%|█████████ | 424/471 [53:12<05:55,  7.57s/it][A

loss: tensor(0.6686, device='cuda:0', grad_fn=<NllLossBackward>)



 90%|█████████ | 425/471 [53:20<05:48,  7.57s/it][A

loss: tensor(0.6498, device='cuda:0', grad_fn=<NllLossBackward>)



 90%|█████████ | 426/471 [53:27<05:39,  7.56s/it][A

loss: tensor(0.6242, device='cuda:0', grad_fn=<NllLossBackward>)



 91%|█████████ | 427/471 [53:35<05:30,  7.52s/it][A

loss: tensor(0.6466, device='cuda:0', grad_fn=<NllLossBackward>)



 91%|█████████ | 428/471 [53:42<05:23,  7.52s/it][A

loss: tensor(0.6211, device='cuda:0', grad_fn=<NllLossBackward>)



 91%|█████████ | 429/471 [53:50<05:16,  7.54s/it][A

loss: tensor(0.6322, device='cuda:0', grad_fn=<NllLossBackward>)



 91%|█████████▏| 430/471 [53:58<05:09,  7.56s/it][A

loss: tensor(0.6546, device='cuda:0', grad_fn=<NllLossBackward>)



 92%|█████████▏| 431/471 [54:05<05:02,  7.55s/it][A

loss: tensor(0.7101, device='cuda:0', grad_fn=<NllLossBackward>)



 92%|█████████▏| 432/471 [54:13<04:55,  7.57s/it][A

loss: tensor(0.6399, device='cuda:0', grad_fn=<NllLossBackward>)



 92%|█████████▏| 433/471 [54:20<04:47,  7.56s/it][A

loss: tensor(0.6803, device='cuda:0', grad_fn=<NllLossBackward>)



 92%|█████████▏| 434/471 [54:28<04:39,  7.56s/it][A

loss: tensor(0.6411, device='cuda:0', grad_fn=<NllLossBackward>)



 92%|█████████▏| 435/471 [54:35<04:30,  7.53s/it][A

loss: tensor(0.7190, device='cuda:0', grad_fn=<NllLossBackward>)



 93%|█████████▎| 436/471 [54:43<04:23,  7.53s/it][A

loss: tensor(0.6390, device='cuda:0', grad_fn=<NllLossBackward>)



 93%|█████████▎| 437/471 [54:50<04:16,  7.53s/it][A

loss: tensor(0.7187, device='cuda:0', grad_fn=<NllLossBackward>)



 93%|█████████▎| 438/471 [54:58<04:07,  7.50s/it][A

loss: tensor(0.7173, device='cuda:0', grad_fn=<NllLossBackward>)



 93%|█████████▎| 439/471 [55:05<04:00,  7.51s/it][A

loss: tensor(0.6599, device='cuda:0', grad_fn=<NllLossBackward>)



 93%|█████████▎| 440/471 [55:13<03:52,  7.51s/it][A

loss: tensor(0.6680, device='cuda:0', grad_fn=<NllLossBackward>)



 94%|█████████▎| 441/471 [55:20<03:45,  7.52s/it][A

loss: tensor(0.6612, device='cuda:0', grad_fn=<NllLossBackward>)



 94%|█████████▍| 442/471 [55:28<03:37,  7.49s/it][A

loss: tensor(0.6881, device='cuda:0', grad_fn=<NllLossBackward>)



 94%|█████████▍| 443/471 [55:35<03:29,  7.50s/it][A

loss: tensor(0.6838, device='cuda:0', grad_fn=<NllLossBackward>)



 94%|█████████▍| 444/471 [55:43<03:22,  7.50s/it][A

loss: tensor(0.6039, device='cuda:0', grad_fn=<NllLossBackward>)



 94%|█████████▍| 445/471 [55:50<03:14,  7.50s/it][A

loss: tensor(0.7117, device='cuda:0', grad_fn=<NllLossBackward>)



 95%|█████████▍| 446/471 [55:58<03:06,  7.47s/it][A

loss: tensor(0.6078, device='cuda:0', grad_fn=<NllLossBackward>)



 95%|█████████▍| 447/471 [56:05<02:59,  7.47s/it][A

loss: tensor(0.7299, device='cuda:0', grad_fn=<NllLossBackward>)



 95%|█████████▌| 448/471 [56:13<02:51,  7.48s/it][A

loss: tensor(0.7247, device='cuda:0', grad_fn=<NllLossBackward>)



 95%|█████████▌| 449/471 [56:20<02:44,  7.46s/it][A

loss: tensor(0.7374, device='cuda:0', grad_fn=<NllLossBackward>)



 96%|█████████▌| 450/471 [56:28<02:37,  7.48s/it][A

loss: tensor(0.6098, device='cuda:0', grad_fn=<NllLossBackward>)



 96%|█████████▌| 451/471 [56:35<02:29,  7.49s/it][A

loss: tensor(0.6005, device='cuda:0', grad_fn=<NllLossBackward>)



 96%|█████████▌| 452/471 [56:43<02:22,  7.51s/it][A

loss: tensor(0.6320, device='cuda:0', grad_fn=<NllLossBackward>)



 96%|█████████▌| 453/471 [56:50<02:14,  7.49s/it][A

loss: tensor(0.6854, device='cuda:0', grad_fn=<NllLossBackward>)



 96%|█████████▋| 454/471 [56:58<02:07,  7.51s/it][A

loss: tensor(0.7071, device='cuda:0', grad_fn=<NllLossBackward>)



 97%|█████████▋| 455/471 [57:05<02:00,  7.51s/it][A

loss: tensor(0.7366, device='cuda:0', grad_fn=<NllLossBackward>)



 97%|█████████▋| 456/471 [57:13<01:52,  7.52s/it][A

loss: tensor(0.7100, device='cuda:0', grad_fn=<NllLossBackward>)



 97%|█████████▋| 457/471 [57:20<01:44,  7.48s/it][A

loss: tensor(0.7281, device='cuda:0', grad_fn=<NllLossBackward>)



 97%|█████████▋| 458/471 [57:28<01:37,  7.49s/it][A

loss: tensor(0.6942, device='cuda:0', grad_fn=<NllLossBackward>)



 97%|█████████▋| 459/471 [57:35<01:30,  7.50s/it][A

loss: tensor(0.8125, device='cuda:0', grad_fn=<NllLossBackward>)



 98%|█████████▊| 460/471 [57:43<01:22,  7.51s/it][A

loss: tensor(0.6988, device='cuda:0', grad_fn=<NllLossBackward>)



 98%|█████████▊| 461/471 [57:50<01:14,  7.49s/it][A

loss: tensor(0.6003, device='cuda:0', grad_fn=<NllLossBackward>)



 98%|█████████▊| 462/471 [57:58<01:07,  7.49s/it][A

loss: tensor(0.6951, device='cuda:0', grad_fn=<NllLossBackward>)



 98%|█████████▊| 463/471 [58:05<01:00,  7.50s/it][A

loss: tensor(0.6952, device='cuda:0', grad_fn=<NllLossBackward>)



 99%|█████████▊| 464/471 [58:13<00:52,  7.47s/it][A

loss: tensor(0.7092, device='cuda:0', grad_fn=<NllLossBackward>)



 99%|█████████▊| 465/471 [58:20<00:44,  7.48s/it][A

loss: tensor(0.7395, device='cuda:0', grad_fn=<NllLossBackward>)



 99%|█████████▉| 466/471 [58:28<00:37,  7.48s/it][A

loss: tensor(0.4948, device='cuda:0', grad_fn=<NllLossBackward>)



 99%|█████████▉| 467/471 [58:35<00:29,  7.48s/it][A

loss: tensor(0.6136, device='cuda:0', grad_fn=<NllLossBackward>)



 99%|█████████▉| 468/471 [58:42<00:22,  7.45s/it][A

loss: tensor(0.6246, device='cuda:0', grad_fn=<NllLossBackward>)



100%|█████████▉| 469/471 [58:50<00:14,  7.46s/it][A

loss: tensor(0.6937, device='cuda:0', grad_fn=<NllLossBackward>)



100%|█████████▉| 470/471 [58:57<00:07,  7.47s/it][A

loss: tensor(0.6994, device='cuda:0', grad_fn=<NllLossBackward>)



100%|██████████| 471/471 [59:05<00:00,  7.53s/it][A


loss: tensor(0.6082, device='cuda:0', grad_fn=<NllLossBackward>)

	Train loss: 0.6869234562299813

	train acc: 0.6985138004246284

	training prec: 0.5168402118640972

	training rec: 0.516981037363203

	training f1: 0.4914371251610223



  0%|          | 0/118 [00:00<?, ?it/s][A
  1%|          | 1/118 [00:01<03:25,  1.76s/it][A
  2%|▏         | 2/118 [00:03<03:25,  1.77s/it][A
  3%|▎         | 3/118 [00:05<03:24,  1.78s/it][A
  3%|▎         | 4/118 [00:07<03:23,  1.78s/it][A
  4%|▍         | 5/118 [00:08<03:21,  1.78s/it][A
  5%|▌         | 6/118 [00:10<03:19,  1.78s/it][A
  6%|▌         | 7/118 [00:12<03:18,  1.79s/it][A
  7%|▋         | 8/118 [00:14<03:16,  1.79s/it][A
  8%|▊         | 9/118 [00:15<03:12,  1.77s/it][A
  8%|▊         | 10/118 [00:17<03:11,  1.78s/it][A
  9%|▉         | 11/118 [00:19<03:10,  1.78s/it][A
 10%|█         | 12/118 [00:21<03:09,  1.78s/it][A
 11%|█         | 13/118 [00:23<03:07,  1.79s/it][A
 12%|█▏        | 14/118 [00:24<03:05,  1.79s/it][A
 13%|█▎        | 15/118 [00:26<03:04,  1.79s/it][A
 14%|█▎        | 16/118 [00:28<03:02,  1.79s/it][A
 14%|█▍        | 17/118 [00:30<02:58,  1.77s/it][A
 15%|█▌        | 18/118 [00:32<02:57,  1.78s/it][A
 16%|█▌        | 19/118 [00:3


	Validation loss: 0.6703488841905432

	Validation acc: 0.8604546936114732

	Validation prec: 0.4683629400260756

	Validation rec: 0.538135593220339

	Validation f1: 0.49962659634823636
Validation loss decreased (inf --> 0.670349).  Saving model ...


Epoch:   2%|▏         | 1/50 [1:02:35<51:06:59, 3755.49s/it]




  0%|          | 0/471 [00:00<?, ?it/s][A
  0%|          | 1/471 [00:07<59:07,  7.55s/it][A

loss: tensor(0.7633, device='cuda:0', grad_fn=<NllLossBackward>)



  0%|          | 2/471 [00:15<59:06,  7.56s/it][A

loss: tensor(0.7178, device='cuda:0', grad_fn=<NllLossBackward>)



  1%|          | 3/471 [00:22<58:33,  7.51s/it][A

loss: tensor(0.6837, device='cuda:0', grad_fn=<NllLossBackward>)



  1%|          | 4/471 [00:30<58:37,  7.53s/it][A

loss: tensor(0.6902, device='cuda:0', grad_fn=<NllLossBackward>)



  1%|          | 5/471 [00:37<58:28,  7.53s/it][A

loss: tensor(0.6861, device='cuda:0', grad_fn=<NllLossBackward>)



  1%|▏         | 6/471 [00:45<58:24,  7.54s/it][A

loss: tensor(0.7049, device='cuda:0', grad_fn=<NllLossBackward>)



  1%|▏         | 7/471 [00:52<57:58,  7.50s/it][A

loss: tensor(0.7550, device='cuda:0', grad_fn=<NllLossBackward>)



  2%|▏         | 8/471 [01:00<57:53,  7.50s/it][A

loss: tensor(0.6029, device='cuda:0', grad_fn=<NllLossBackward>)



  2%|▏         | 9/471 [01:07<57:48,  7.51s/it][A

loss: tensor(0.6799, device='cuda:0', grad_fn=<NllLossBackward>)



  2%|▏         | 10/471 [01:15<57:45,  7.52s/it][A

loss: tensor(0.6300, device='cuda:0', grad_fn=<NllLossBackward>)



  2%|▏         | 11/471 [01:22<57:27,  7.49s/it][A

loss: tensor(0.6411, device='cuda:0', grad_fn=<NllLossBackward>)



  3%|▎         | 12/471 [01:30<57:24,  7.50s/it][A

loss: tensor(0.6773, device='cuda:0', grad_fn=<NllLossBackward>)



  3%|▎         | 13/471 [01:37<57:21,  7.51s/it][A

loss: tensor(0.6564, device='cuda:0', grad_fn=<NllLossBackward>)



  3%|▎         | 14/471 [01:45<57:00,  7.49s/it][A

loss: tensor(0.7009, device='cuda:0', grad_fn=<NllLossBackward>)



  3%|▎         | 15/471 [01:52<56:57,  7.49s/it][A

loss: tensor(0.6131, device='cuda:0', grad_fn=<NllLossBackward>)



  3%|▎         | 16/471 [02:00<56:50,  7.50s/it][A

loss: tensor(0.5539, device='cuda:0', grad_fn=<NllLossBackward>)



  4%|▎         | 17/471 [02:07<56:43,  7.50s/it][A

loss: tensor(0.7578, device='cuda:0', grad_fn=<NllLossBackward>)



  4%|▍         | 18/471 [02:15<56:25,  7.47s/it][A

loss: tensor(0.7103, device='cuda:0', grad_fn=<NllLossBackward>)



  4%|▍         | 19/471 [02:22<56:34,  7.51s/it][A

loss: tensor(0.7038, device='cuda:0', grad_fn=<NllLossBackward>)



  4%|▍         | 20/471 [02:30<56:46,  7.55s/it][A

loss: tensor(0.7175, device='cuda:0', grad_fn=<NllLossBackward>)



  4%|▍         | 21/471 [02:37<56:54,  7.59s/it][A

loss: tensor(0.6255, device='cuda:0', grad_fn=<NllLossBackward>)



  5%|▍         | 22/471 [02:45<56:44,  7.58s/it][A

loss: tensor(0.6381, device='cuda:0', grad_fn=<NllLossBackward>)



  5%|▍         | 23/471 [02:53<56:46,  7.60s/it][A

loss: tensor(0.7568, device='cuda:0', grad_fn=<NllLossBackward>)



  5%|▌         | 24/471 [03:00<56:36,  7.60s/it][A

loss: tensor(0.7817, device='cuda:0', grad_fn=<NllLossBackward>)



  5%|▌         | 25/471 [03:08<56:24,  7.59s/it][A

loss: tensor(0.6909, device='cuda:0', grad_fn=<NllLossBackward>)



  6%|▌         | 26/471 [03:15<55:56,  7.54s/it][A

loss: tensor(0.5887, device='cuda:0', grad_fn=<NllLossBackward>)



  6%|▌         | 27/471 [03:23<55:45,  7.54s/it][A

loss: tensor(0.6085, device='cuda:0', grad_fn=<NllLossBackward>)



  6%|▌         | 28/471 [03:30<55:39,  7.54s/it][A

loss: tensor(0.6328, device='cuda:0', grad_fn=<NllLossBackward>)



  6%|▌         | 29/471 [03:38<55:17,  7.51s/it][A

loss: tensor(0.7448, device='cuda:0', grad_fn=<NllLossBackward>)



  6%|▋         | 30/471 [03:45<55:14,  7.52s/it][A

loss: tensor(0.6872, device='cuda:0', grad_fn=<NllLossBackward>)



  7%|▋         | 31/471 [03:53<55:07,  7.52s/it][A

loss: tensor(0.6079, device='cuda:0', grad_fn=<NllLossBackward>)



  7%|▋         | 32/471 [04:00<55:04,  7.53s/it][A

loss: tensor(0.6441, device='cuda:0', grad_fn=<NllLossBackward>)



  7%|▋         | 33/471 [04:08<54:50,  7.51s/it][A

loss: tensor(0.6655, device='cuda:0', grad_fn=<NllLossBackward>)



  7%|▋         | 34/471 [04:15<54:55,  7.54s/it][A

loss: tensor(0.5935, device='cuda:0', grad_fn=<NllLossBackward>)



  7%|▋         | 35/471 [04:23<54:53,  7.55s/it][A

loss: tensor(0.7247, device='cuda:0', grad_fn=<NllLossBackward>)



  8%|▊         | 36/471 [04:31<54:49,  7.56s/it][A

loss: tensor(0.7072, device='cuda:0', grad_fn=<NllLossBackward>)



  8%|▊         | 37/471 [04:38<54:33,  7.54s/it][A

loss: tensor(0.6463, device='cuda:0', grad_fn=<NllLossBackward>)



  8%|▊         | 38/471 [04:46<54:37,  7.57s/it][A

loss: tensor(0.7849, device='cuda:0', grad_fn=<NllLossBackward>)



  8%|▊         | 39/471 [04:53<54:34,  7.58s/it][A

loss: tensor(0.6879, device='cuda:0', grad_fn=<NllLossBackward>)



  8%|▊         | 40/471 [05:01<54:32,  7.59s/it][A

loss: tensor(0.6573, device='cuda:0', grad_fn=<NllLossBackward>)



  9%|▊         | 41/471 [05:08<54:11,  7.56s/it][A

loss: tensor(0.6991, device='cuda:0', grad_fn=<NllLossBackward>)



  9%|▉         | 42/471 [05:16<54:04,  7.56s/it][A

loss: tensor(0.5858, device='cuda:0', grad_fn=<NllLossBackward>)



  9%|▉         | 43/471 [05:24<53:56,  7.56s/it][A

loss: tensor(0.6798, device='cuda:0', grad_fn=<NllLossBackward>)



  9%|▉         | 44/471 [05:31<53:31,  7.52s/it][A

loss: tensor(0.6499, device='cuda:0', grad_fn=<NllLossBackward>)



 10%|▉         | 45/471 [05:39<53:27,  7.53s/it][A

loss: tensor(0.6657, device='cuda:0', grad_fn=<NllLossBackward>)



 10%|▉         | 46/471 [05:46<53:20,  7.53s/it][A

loss: tensor(0.6445, device='cuda:0', grad_fn=<NllLossBackward>)



 10%|▉         | 47/471 [05:54<53:14,  7.53s/it][A

loss: tensor(0.6423, device='cuda:0', grad_fn=<NllLossBackward>)



 10%|█         | 48/471 [06:01<52:53,  7.50s/it][A

loss: tensor(0.7797, device='cuda:0', grad_fn=<NllLossBackward>)



 10%|█         | 49/471 [06:09<52:49,  7.51s/it][A

loss: tensor(0.5767, device='cuda:0', grad_fn=<NllLossBackward>)



 11%|█         | 50/471 [06:16<52:43,  7.51s/it][A

loss: tensor(0.6844, device='cuda:0', grad_fn=<NllLossBackward>)



 11%|█         | 51/471 [06:24<52:39,  7.52s/it][A

loss: tensor(0.6637, device='cuda:0', grad_fn=<NllLossBackward>)



 11%|█         | 52/471 [06:31<52:20,  7.50s/it][A

loss: tensor(0.6483, device='cuda:0', grad_fn=<NllLossBackward>)



 11%|█▏        | 53/471 [06:39<52:15,  7.50s/it][A

loss: tensor(0.6552, device='cuda:0', grad_fn=<NllLossBackward>)



 11%|█▏        | 54/471 [06:46<52:13,  7.51s/it][A

loss: tensor(0.7200, device='cuda:0', grad_fn=<NllLossBackward>)



 12%|█▏        | 55/471 [06:54<51:53,  7.48s/it][A

loss: tensor(0.6282, device='cuda:0', grad_fn=<NllLossBackward>)



 12%|█▏        | 56/471 [07:01<51:51,  7.50s/it][A

loss: tensor(0.7173, device='cuda:0', grad_fn=<NllLossBackward>)



 12%|█▏        | 57/471 [07:09<51:44,  7.50s/it][A

loss: tensor(0.7162, device='cuda:0', grad_fn=<NllLossBackward>)



 12%|█▏        | 58/471 [07:16<51:41,  7.51s/it][A

loss: tensor(0.6782, device='cuda:0', grad_fn=<NllLossBackward>)



 13%|█▎        | 59/471 [07:24<51:25,  7.49s/it][A

loss: tensor(0.6851, device='cuda:0', grad_fn=<NllLossBackward>)



 13%|█▎        | 60/471 [07:31<51:24,  7.51s/it][A

loss: tensor(0.7676, device='cuda:0', grad_fn=<NllLossBackward>)



 13%|█▎        | 61/471 [07:39<51:20,  7.51s/it][A

loss: tensor(0.7368, device='cuda:0', grad_fn=<NllLossBackward>)



 13%|█▎        | 62/471 [07:46<51:17,  7.52s/it][A

loss: tensor(0.6948, device='cuda:0', grad_fn=<NllLossBackward>)



 13%|█▎        | 63/471 [07:54<50:59,  7.50s/it][A

loss: tensor(0.6944, device='cuda:0', grad_fn=<NllLossBackward>)



 14%|█▎        | 64/471 [08:01<50:54,  7.50s/it][A

loss: tensor(0.6122, device='cuda:0', grad_fn=<NllLossBackward>)



 14%|█▍        | 65/471 [08:09<50:49,  7.51s/it][A

loss: tensor(0.7012, device='cuda:0', grad_fn=<NllLossBackward>)



 14%|█▍        | 66/471 [08:16<50:44,  7.52s/it][A

loss: tensor(0.6809, device='cuda:0', grad_fn=<NllLossBackward>)



 14%|█▍        | 67/471 [08:24<50:26,  7.49s/it][A

loss: tensor(0.7021, device='cuda:0', grad_fn=<NllLossBackward>)



 14%|█▍        | 68/471 [08:31<50:24,  7.50s/it][A

loss: tensor(0.7042, device='cuda:0', grad_fn=<NllLossBackward>)



 15%|█▍        | 69/471 [08:39<50:22,  7.52s/it][A

loss: tensor(0.6875, device='cuda:0', grad_fn=<NllLossBackward>)



 15%|█▍        | 70/471 [08:46<50:05,  7.49s/it][A

loss: tensor(0.6529, device='cuda:0', grad_fn=<NllLossBackward>)



 15%|█▌        | 71/471 [08:54<50:02,  7.51s/it][A

loss: tensor(0.6891, device='cuda:0', grad_fn=<NllLossBackward>)



 15%|█▌        | 72/471 [09:01<49:57,  7.51s/it][A

loss: tensor(0.6446, device='cuda:0', grad_fn=<NllLossBackward>)



 15%|█▌        | 73/471 [09:09<49:51,  7.52s/it][A

loss: tensor(0.6560, device='cuda:0', grad_fn=<NllLossBackward>)



 16%|█▌        | 74/471 [09:16<49:32,  7.49s/it][A

loss: tensor(0.7196, device='cuda:0', grad_fn=<NllLossBackward>)



 16%|█▌        | 75/471 [09:24<49:26,  7.49s/it][A

loss: tensor(0.6674, device='cuda:0', grad_fn=<NllLossBackward>)



 16%|█▌        | 76/471 [09:31<49:19,  7.49s/it][A

loss: tensor(0.6371, device='cuda:0', grad_fn=<NllLossBackward>)



 16%|█▋        | 77/471 [09:39<49:14,  7.50s/it][A

loss: tensor(0.6867, device='cuda:0', grad_fn=<NllLossBackward>)



 17%|█▋        | 78/471 [09:46<48:59,  7.48s/it][A

loss: tensor(0.6657, device='cuda:0', grad_fn=<NllLossBackward>)



 17%|█▋        | 79/471 [09:54<49:00,  7.50s/it][A

loss: tensor(0.6559, device='cuda:0', grad_fn=<NllLossBackward>)



 17%|█▋        | 80/471 [10:01<49:00,  7.52s/it][A

loss: tensor(0.6276, device='cuda:0', grad_fn=<NllLossBackward>)



 17%|█▋        | 81/471 [10:09<48:56,  7.53s/it][A

loss: tensor(0.7683, device='cuda:0', grad_fn=<NllLossBackward>)



 17%|█▋        | 82/471 [10:16<48:48,  7.53s/it][A

loss: tensor(0.6585, device='cuda:0', grad_fn=<NllLossBackward>)



 18%|█▊        | 83/471 [10:24<48:52,  7.56s/it][A

loss: tensor(0.7282, device='cuda:0', grad_fn=<NllLossBackward>)



 18%|█▊        | 84/471 [10:32<48:55,  7.59s/it][A

loss: tensor(0.7104, device='cuda:0', grad_fn=<NllLossBackward>)



 18%|█▊        | 85/471 [10:39<48:34,  7.55s/it][A

loss: tensor(0.6813, device='cuda:0', grad_fn=<NllLossBackward>)



 18%|█▊        | 86/471 [10:47<48:33,  7.57s/it][A

loss: tensor(0.6802, device='cuda:0', grad_fn=<NllLossBackward>)



 18%|█▊        | 87/471 [10:54<48:32,  7.59s/it][A

loss: tensor(0.6646, device='cuda:0', grad_fn=<NllLossBackward>)



 19%|█▊        | 88/471 [11:02<48:32,  7.60s/it][A

loss: tensor(0.6297, device='cuda:0', grad_fn=<NllLossBackward>)



 19%|█▉        | 89/471 [11:10<48:18,  7.59s/it][A

loss: tensor(0.6878, device='cuda:0', grad_fn=<NllLossBackward>)



 19%|█▉        | 90/471 [11:17<48:18,  7.61s/it][A

loss: tensor(0.6505, device='cuda:0', grad_fn=<NllLossBackward>)



 19%|█▉        | 91/471 [11:25<48:04,  7.59s/it][A

loss: tensor(0.6718, device='cuda:0', grad_fn=<NllLossBackward>)



 20%|█▉        | 92/471 [11:32<47:50,  7.57s/it][A

loss: tensor(0.6035, device='cuda:0', grad_fn=<NllLossBackward>)



 20%|█▉        | 93/471 [11:40<47:28,  7.53s/it][A

loss: tensor(0.6328, device='cuda:0', grad_fn=<NllLossBackward>)



 20%|█▉        | 94/471 [11:47<47:16,  7.52s/it][A

loss: tensor(0.7106, device='cuda:0', grad_fn=<NllLossBackward>)



 20%|██        | 95/471 [11:55<47:08,  7.52s/it][A

loss: tensor(0.6064, device='cuda:0', grad_fn=<NllLossBackward>)



 20%|██        | 96/471 [12:02<46:50,  7.49s/it][A

loss: tensor(0.6809, device='cuda:0', grad_fn=<NllLossBackward>)



 21%|██        | 97/471 [12:10<46:47,  7.51s/it][A

loss: tensor(0.6865, device='cuda:0', grad_fn=<NllLossBackward>)



 21%|██        | 98/471 [12:17<46:42,  7.51s/it][A

loss: tensor(0.7003, device='cuda:0', grad_fn=<NllLossBackward>)



 21%|██        | 99/471 [12:25<46:41,  7.53s/it][A

loss: tensor(0.6294, device='cuda:0', grad_fn=<NllLossBackward>)



 21%|██        | 100/471 [12:32<46:36,  7.54s/it][A

loss: tensor(0.6738, device='cuda:0', grad_fn=<NllLossBackward>)



 21%|██▏       | 101/471 [12:40<46:42,  7.57s/it][A

loss: tensor(0.5583, device='cuda:0', grad_fn=<NllLossBackward>)



 22%|██▏       | 102/471 [12:48<46:37,  7.58s/it][A

loss: tensor(0.6356, device='cuda:0', grad_fn=<NllLossBackward>)



 22%|██▏       | 103/471 [12:55<46:29,  7.58s/it][A

loss: tensor(0.6502, device='cuda:0', grad_fn=<NllLossBackward>)



 22%|██▏       | 104/471 [13:03<46:07,  7.54s/it][A

loss: tensor(0.7022, device='cuda:0', grad_fn=<NllLossBackward>)



 22%|██▏       | 105/471 [13:10<46:00,  7.54s/it][A

loss: tensor(0.7291, device='cuda:0', grad_fn=<NllLossBackward>)



 23%|██▎       | 106/471 [13:18<46:01,  7.57s/it][A

loss: tensor(0.7110, device='cuda:0', grad_fn=<NllLossBackward>)



 23%|██▎       | 107/471 [13:25<45:53,  7.56s/it][A

loss: tensor(0.6106, device='cuda:0', grad_fn=<NllLossBackward>)



 23%|██▎       | 108/471 [13:33<45:34,  7.53s/it][A

loss: tensor(0.6049, device='cuda:0', grad_fn=<NllLossBackward>)



 23%|██▎       | 109/471 [13:40<45:26,  7.53s/it][A

loss: tensor(0.5795, device='cuda:0', grad_fn=<NllLossBackward>)



 23%|██▎       | 110/471 [13:48<45:21,  7.54s/it][A

loss: tensor(0.7111, device='cuda:0', grad_fn=<NllLossBackward>)



 24%|██▎       | 111/471 [13:55<45:03,  7.51s/it][A

loss: tensor(0.6314, device='cuda:0', grad_fn=<NllLossBackward>)



 24%|██▍       | 112/471 [14:03<44:58,  7.52s/it][A

loss: tensor(0.7101, device='cuda:0', grad_fn=<NllLossBackward>)



 24%|██▍       | 113/471 [14:10<45:00,  7.54s/it][A

loss: tensor(0.6687, device='cuda:0', grad_fn=<NllLossBackward>)



 24%|██▍       | 114/471 [14:18<45:01,  7.57s/it][A

loss: tensor(0.6449, device='cuda:0', grad_fn=<NllLossBackward>)



 24%|██▍       | 115/471 [14:26<44:38,  7.52s/it][A

loss: tensor(0.5916, device='cuda:0', grad_fn=<NllLossBackward>)



 25%|██▍       | 116/471 [14:33<44:30,  7.52s/it][A

loss: tensor(0.7482, device='cuda:0', grad_fn=<NllLossBackward>)



 25%|██▍       | 117/471 [14:41<44:21,  7.52s/it][A

loss: tensor(0.6180, device='cuda:0', grad_fn=<NllLossBackward>)



 25%|██▌       | 118/471 [14:48<44:18,  7.53s/it][A

loss: tensor(0.6704, device='cuda:0', grad_fn=<NllLossBackward>)



 25%|██▌       | 119/471 [14:56<44:05,  7.52s/it][A

loss: tensor(0.7020, device='cuda:0', grad_fn=<NllLossBackward>)



 25%|██▌       | 120/471 [15:03<44:03,  7.53s/it][A

loss: tensor(0.7469, device='cuda:0', grad_fn=<NllLossBackward>)



 26%|██▌       | 121/471 [15:11<43:56,  7.53s/it][A

loss: tensor(0.5233, device='cuda:0', grad_fn=<NllLossBackward>)



 26%|██▌       | 122/471 [15:18<43:49,  7.53s/it][A

loss: tensor(0.6475, device='cuda:0', grad_fn=<NllLossBackward>)



 26%|██▌       | 123/471 [15:26<43:32,  7.51s/it][A

loss: tensor(0.6945, device='cuda:0', grad_fn=<NllLossBackward>)



 26%|██▋       | 124/471 [15:33<43:26,  7.51s/it][A

loss: tensor(0.6610, device='cuda:0', grad_fn=<NllLossBackward>)



 27%|██▋       | 125/471 [15:41<43:23,  7.52s/it][A

loss: tensor(0.7090, device='cuda:0', grad_fn=<NllLossBackward>)



 27%|██▋       | 126/471 [15:48<43:08,  7.50s/it][A

loss: tensor(0.6732, device='cuda:0', grad_fn=<NllLossBackward>)



 27%|██▋       | 127/471 [15:56<43:09,  7.53s/it][A

loss: tensor(0.6273, device='cuda:0', grad_fn=<NllLossBackward>)



 27%|██▋       | 128/471 [16:03<43:05,  7.54s/it][A

loss: tensor(0.6974, device='cuda:0', grad_fn=<NllLossBackward>)



 27%|██▋       | 129/471 [16:11<43:06,  7.56s/it][A

loss: tensor(0.7063, device='cuda:0', grad_fn=<NllLossBackward>)



 28%|██▊       | 130/471 [16:18<42:54,  7.55s/it][A

loss: tensor(0.7206, device='cuda:0', grad_fn=<NllLossBackward>)



 28%|██▊       | 131/471 [16:26<42:54,  7.57s/it][A

loss: tensor(0.6717, device='cuda:0', grad_fn=<NllLossBackward>)



 28%|██▊       | 132/471 [16:34<42:45,  7.57s/it][A

loss: tensor(0.6418, device='cuda:0', grad_fn=<NllLossBackward>)



 28%|██▊       | 133/471 [16:41<42:34,  7.56s/it][A

loss: tensor(0.6436, device='cuda:0', grad_fn=<NllLossBackward>)



 28%|██▊       | 134/471 [16:49<42:13,  7.52s/it][A

loss: tensor(0.6237, device='cuda:0', grad_fn=<NllLossBackward>)



 29%|██▊       | 135/471 [16:56<42:03,  7.51s/it][A

loss: tensor(0.6538, device='cuda:0', grad_fn=<NllLossBackward>)



 29%|██▉       | 136/471 [17:04<42:01,  7.53s/it][A

loss: tensor(0.6685, device='cuda:0', grad_fn=<NllLossBackward>)



 29%|██▉       | 137/471 [17:11<41:46,  7.51s/it][A

loss: tensor(0.6427, device='cuda:0', grad_fn=<NllLossBackward>)



 29%|██▉       | 138/471 [17:19<41:43,  7.52s/it][A

loss: tensor(0.6817, device='cuda:0', grad_fn=<NllLossBackward>)



 30%|██▉       | 139/471 [17:26<41:43,  7.54s/it][A

loss: tensor(0.6483, device='cuda:0', grad_fn=<NllLossBackward>)



 30%|██▉       | 140/471 [17:34<41:39,  7.55s/it][A

loss: tensor(0.6919, device='cuda:0', grad_fn=<NllLossBackward>)



 30%|██▉       | 141/471 [17:41<41:24,  7.53s/it][A

loss: tensor(0.7736, device='cuda:0', grad_fn=<NllLossBackward>)



 30%|███       | 142/471 [17:49<41:25,  7.56s/it][A

loss: tensor(0.6247, device='cuda:0', grad_fn=<NllLossBackward>)



 30%|███       | 143/471 [17:57<41:22,  7.57s/it][A

loss: tensor(0.5707, device='cuda:0', grad_fn=<NllLossBackward>)



 31%|███       | 144/471 [18:04<41:27,  7.61s/it][A

loss: tensor(0.5522, device='cuda:0', grad_fn=<NllLossBackward>)



 31%|███       | 145/471 [18:12<41:14,  7.59s/it][A

loss: tensor(0.5672, device='cuda:0', grad_fn=<NllLossBackward>)



 31%|███       | 146/471 [18:19<41:03,  7.58s/it][A

loss: tensor(0.8341, device='cuda:0', grad_fn=<NllLossBackward>)



 31%|███       | 147/471 [18:27<40:52,  7.57s/it][A

loss: tensor(0.7230, device='cuda:0', grad_fn=<NllLossBackward>)



 31%|███▏      | 148/471 [18:34<40:41,  7.56s/it][A

loss: tensor(0.6968, device='cuda:0', grad_fn=<NllLossBackward>)



 32%|███▏      | 149/471 [18:42<40:22,  7.52s/it][A

loss: tensor(0.7734, device='cuda:0', grad_fn=<NllLossBackward>)



 32%|███▏      | 150/471 [18:49<40:15,  7.52s/it][A

loss: tensor(0.7062, device='cuda:0', grad_fn=<NllLossBackward>)



 32%|███▏      | 151/471 [18:57<40:11,  7.54s/it][A

loss: tensor(0.7776, device='cuda:0', grad_fn=<NllLossBackward>)



 32%|███▏      | 152/471 [19:04<39:51,  7.50s/it][A

loss: tensor(0.6613, device='cuda:0', grad_fn=<NllLossBackward>)



 32%|███▏      | 153/471 [19:12<39:48,  7.51s/it][A

loss: tensor(0.5552, device='cuda:0', grad_fn=<NllLossBackward>)



 33%|███▎      | 154/471 [19:19<39:42,  7.51s/it][A

loss: tensor(0.7214, device='cuda:0', grad_fn=<NllLossBackward>)



 33%|███▎      | 155/471 [19:27<39:35,  7.52s/it][A

loss: tensor(0.6736, device='cuda:0', grad_fn=<NllLossBackward>)



 33%|███▎      | 156/471 [19:34<39:18,  7.49s/it][A

loss: tensor(0.5894, device='cuda:0', grad_fn=<NllLossBackward>)



 33%|███▎      | 157/471 [19:42<39:13,  7.49s/it][A

loss: tensor(0.6309, device='cuda:0', grad_fn=<NllLossBackward>)



 34%|███▎      | 158/471 [19:49<39:07,  7.50s/it][A

loss: tensor(0.7195, device='cuda:0', grad_fn=<NllLossBackward>)



 34%|███▍      | 159/471 [19:57<39:02,  7.51s/it][A

loss: tensor(0.6596, device='cuda:0', grad_fn=<NllLossBackward>)



 34%|███▍      | 160/471 [20:04<38:49,  7.49s/it][A

loss: tensor(0.7646, device='cuda:0', grad_fn=<NllLossBackward>)



 34%|███▍      | 161/471 [20:12<38:50,  7.52s/it][A

loss: tensor(0.6737, device='cuda:0', grad_fn=<NllLossBackward>)



 34%|███▍      | 162/471 [20:20<38:48,  7.54s/it][A

loss: tensor(0.7150, device='cuda:0', grad_fn=<NllLossBackward>)



 35%|███▍      | 163/471 [20:27<38:40,  7.54s/it][A

loss: tensor(0.7500, device='cuda:0', grad_fn=<NllLossBackward>)



 35%|███▍      | 164/471 [20:35<38:28,  7.52s/it][A

loss: tensor(0.6044, device='cuda:0', grad_fn=<NllLossBackward>)



 35%|███▌      | 165/471 [20:42<38:24,  7.53s/it][A

loss: tensor(0.5487, device='cuda:0', grad_fn=<NllLossBackward>)



 35%|███▌      | 166/471 [20:50<38:20,  7.54s/it][A

loss: tensor(0.6640, device='cuda:0', grad_fn=<NllLossBackward>)



 35%|███▌      | 167/471 [20:57<38:05,  7.52s/it][A

loss: tensor(0.5488, device='cuda:0', grad_fn=<NllLossBackward>)



 36%|███▌      | 168/471 [21:05<38:00,  7.53s/it][A

loss: tensor(0.5814, device='cuda:0', grad_fn=<NllLossBackward>)



 36%|███▌      | 169/471 [21:12<37:52,  7.53s/it][A

loss: tensor(0.6627, device='cuda:0', grad_fn=<NllLossBackward>)



 36%|███▌      | 170/471 [21:20<37:47,  7.53s/it][A

loss: tensor(0.6812, device='cuda:0', grad_fn=<NllLossBackward>)



 36%|███▋      | 171/471 [21:27<37:31,  7.50s/it][A

loss: tensor(0.5490, device='cuda:0', grad_fn=<NllLossBackward>)



 37%|███▋      | 172/471 [21:35<37:29,  7.52s/it][A

loss: tensor(0.5672, device='cuda:0', grad_fn=<NllLossBackward>)



 37%|███▋      | 173/471 [21:42<37:25,  7.54s/it][A

loss: tensor(0.6975, device='cuda:0', grad_fn=<NllLossBackward>)



 37%|███▋      | 174/471 [21:50<37:22,  7.55s/it][A

loss: tensor(0.7314, device='cuda:0', grad_fn=<NllLossBackward>)



 37%|███▋      | 175/471 [21:57<37:07,  7.53s/it][A

loss: tensor(0.7097, device='cuda:0', grad_fn=<NllLossBackward>)



 37%|███▋      | 176/471 [22:05<36:58,  7.52s/it][A

loss: tensor(0.6483, device='cuda:0', grad_fn=<NllLossBackward>)



 38%|███▊      | 177/471 [22:12<36:51,  7.52s/it][A

loss: tensor(0.5935, device='cuda:0', grad_fn=<NllLossBackward>)



 38%|███▊      | 178/471 [22:20<36:38,  7.50s/it][A

loss: tensor(0.6965, device='cuda:0', grad_fn=<NllLossBackward>)



 38%|███▊      | 179/471 [22:27<36:37,  7.52s/it][A

loss: tensor(0.6672, device='cuda:0', grad_fn=<NllLossBackward>)



 38%|███▊      | 180/471 [22:35<36:33,  7.54s/it][A

loss: tensor(0.6320, device='cuda:0', grad_fn=<NllLossBackward>)



 38%|███▊      | 181/471 [22:43<36:28,  7.55s/it][A

loss: tensor(0.7281, device='cuda:0', grad_fn=<NllLossBackward>)



 39%|███▊      | 182/471 [22:50<36:12,  7.52s/it][A

loss: tensor(0.7705, device='cuda:0', grad_fn=<NllLossBackward>)



 39%|███▉      | 183/471 [22:58<36:09,  7.53s/it][A

loss: tensor(0.7069, device='cuda:0', grad_fn=<NllLossBackward>)



 39%|███▉      | 184/471 [23:05<36:02,  7.54s/it][A

loss: tensor(0.7672, device='cuda:0', grad_fn=<NllLossBackward>)



 39%|███▉      | 185/471 [23:13<36:00,  7.56s/it][A

loss: tensor(0.5716, device='cuda:0', grad_fn=<NllLossBackward>)



 39%|███▉      | 186/471 [23:20<35:46,  7.53s/it][A

loss: tensor(0.6697, device='cuda:0', grad_fn=<NllLossBackward>)



 40%|███▉      | 187/471 [23:28<35:45,  7.55s/it][A

loss: tensor(0.7619, device='cuda:0', grad_fn=<NllLossBackward>)



 40%|███▉      | 188/471 [23:35<35:45,  7.58s/it][A

loss: tensor(0.6110, device='cuda:0', grad_fn=<NllLossBackward>)



 40%|████      | 189/471 [23:43<35:44,  7.60s/it][A

loss: tensor(0.6687, device='cuda:0', grad_fn=<NllLossBackward>)



 40%|████      | 190/471 [23:51<35:30,  7.58s/it][A

loss: tensor(0.6243, device='cuda:0', grad_fn=<NllLossBackward>)



 41%|████      | 191/471 [23:58<35:25,  7.59s/it][A

loss: tensor(0.8694, device='cuda:0', grad_fn=<NllLossBackward>)



 41%|████      | 192/471 [24:06<35:16,  7.59s/it][A

loss: tensor(0.7369, device='cuda:0', grad_fn=<NllLossBackward>)



 41%|████      | 193/471 [24:13<34:55,  7.54s/it][A

loss: tensor(0.5816, device='cuda:0', grad_fn=<NllLossBackward>)



 41%|████      | 194/471 [24:21<34:46,  7.53s/it][A

loss: tensor(0.6938, device='cuda:0', grad_fn=<NllLossBackward>)



 41%|████▏     | 195/471 [24:28<34:39,  7.54s/it][A

loss: tensor(0.6581, device='cuda:0', grad_fn=<NllLossBackward>)



 42%|████▏     | 196/471 [24:36<34:34,  7.54s/it][A

loss: tensor(0.7586, device='cuda:0', grad_fn=<NllLossBackward>)



 42%|████▏     | 197/471 [24:43<34:21,  7.52s/it][A

loss: tensor(0.6153, device='cuda:0', grad_fn=<NllLossBackward>)



 42%|████▏     | 198/471 [24:51<34:19,  7.54s/it][A

loss: tensor(0.7927, device='cuda:0', grad_fn=<NllLossBackward>)



 42%|████▏     | 199/471 [24:59<34:13,  7.55s/it][A

loss: tensor(0.7571, device='cuda:0', grad_fn=<NllLossBackward>)



 42%|████▏     | 200/471 [25:06<34:08,  7.56s/it][A

loss: tensor(0.7579, device='cuda:0', grad_fn=<NllLossBackward>)



 43%|████▎     | 201/471 [25:14<33:52,  7.53s/it][A

loss: tensor(0.7326, device='cuda:0', grad_fn=<NllLossBackward>)



 43%|████▎     | 202/471 [25:21<33:47,  7.54s/it][A

loss: tensor(0.7090, device='cuda:0', grad_fn=<NllLossBackward>)



 43%|████▎     | 203/471 [25:29<33:45,  7.56s/it][A

loss: tensor(0.7082, device='cuda:0', grad_fn=<NllLossBackward>)



 43%|████▎     | 204/471 [25:36<33:38,  7.56s/it][A

loss: tensor(0.7444, device='cuda:0', grad_fn=<NllLossBackward>)



 44%|████▎     | 205/471 [25:44<33:29,  7.55s/it][A

loss: tensor(0.5650, device='cuda:0', grad_fn=<NllLossBackward>)



 44%|████▎     | 206/471 [25:51<33:27,  7.58s/it][A

loss: tensor(0.6515, device='cuda:0', grad_fn=<NllLossBackward>)



 44%|████▍     | 207/471 [25:59<33:22,  7.59s/it][A

loss: tensor(0.6042, device='cuda:0', grad_fn=<NllLossBackward>)



 44%|████▍     | 208/471 [26:06<33:03,  7.54s/it][A

loss: tensor(0.6356, device='cuda:0', grad_fn=<NllLossBackward>)



 44%|████▍     | 209/471 [26:14<32:56,  7.54s/it][A

loss: tensor(0.6959, device='cuda:0', grad_fn=<NllLossBackward>)



 45%|████▍     | 210/471 [26:22<32:54,  7.56s/it][A

loss: tensor(0.7059, device='cuda:0', grad_fn=<NllLossBackward>)



 45%|████▍     | 211/471 [26:29<32:45,  7.56s/it][A

loss: tensor(0.5217, device='cuda:0', grad_fn=<NllLossBackward>)



 45%|████▌     | 212/471 [26:37<32:27,  7.52s/it][A

loss: tensor(0.6324, device='cuda:0', grad_fn=<NllLossBackward>)



 45%|████▌     | 213/471 [26:44<32:21,  7.53s/it][A

loss: tensor(0.5545, device='cuda:0', grad_fn=<NllLossBackward>)



 45%|████▌     | 214/471 [26:52<32:14,  7.53s/it][A

loss: tensor(0.6519, device='cuda:0', grad_fn=<NllLossBackward>)



 46%|████▌     | 215/471 [26:59<32:06,  7.53s/it][A

loss: tensor(0.6312, device='cuda:0', grad_fn=<NllLossBackward>)



 46%|████▌     | 216/471 [27:07<31:51,  7.50s/it][A

loss: tensor(0.6856, device='cuda:0', grad_fn=<NllLossBackward>)



 46%|████▌     | 217/471 [27:14<31:43,  7.50s/it][A

loss: tensor(0.6746, device='cuda:0', grad_fn=<NllLossBackward>)



 46%|████▋     | 218/471 [27:22<31:43,  7.52s/it][A

loss: tensor(0.6815, device='cuda:0', grad_fn=<NllLossBackward>)



 46%|████▋     | 219/471 [27:29<31:34,  7.52s/it][A

loss: tensor(0.6052, device='cuda:0', grad_fn=<NllLossBackward>)



 47%|████▋     | 220/471 [27:37<31:34,  7.55s/it][A

loss: tensor(0.5373, device='cuda:0', grad_fn=<NllLossBackward>)



 47%|████▋     | 221/471 [27:44<31:26,  7.55s/it][A

loss: tensor(0.7591, device='cuda:0', grad_fn=<NllLossBackward>)



 47%|████▋     | 222/471 [27:52<31:21,  7.56s/it][A

loss: tensor(0.6452, device='cuda:0', grad_fn=<NllLossBackward>)



 47%|████▋     | 223/471 [27:59<31:08,  7.53s/it][A

loss: tensor(0.7817, device='cuda:0', grad_fn=<NllLossBackward>)



 48%|████▊     | 224/471 [28:07<31:00,  7.53s/it][A

loss: tensor(0.6148, device='cuda:0', grad_fn=<NllLossBackward>)



 48%|████▊     | 225/471 [28:15<30:53,  7.53s/it][A

loss: tensor(0.6125, device='cuda:0', grad_fn=<NllLossBackward>)



 48%|████▊     | 226/471 [28:22<30:52,  7.56s/it][A

loss: tensor(0.7287, device='cuda:0', grad_fn=<NllLossBackward>)



 48%|████▊     | 227/471 [28:30<30:37,  7.53s/it][A

loss: tensor(0.5617, device='cuda:0', grad_fn=<NllLossBackward>)



 48%|████▊     | 228/471 [28:37<30:30,  7.53s/it][A

loss: tensor(0.8009, device='cuda:0', grad_fn=<NllLossBackward>)



 49%|████▊     | 229/471 [28:45<30:25,  7.54s/it][A

loss: tensor(0.6780, device='cuda:0', grad_fn=<NllLossBackward>)



 49%|████▉     | 230/471 [28:52<30:19,  7.55s/it][A

loss: tensor(0.7411, device='cuda:0', grad_fn=<NllLossBackward>)



 49%|████▉     | 231/471 [29:00<30:03,  7.51s/it][A

loss: tensor(0.6918, device='cuda:0', grad_fn=<NllLossBackward>)



 49%|████▉     | 232/471 [29:07<29:56,  7.52s/it][A

loss: tensor(0.7217, device='cuda:0', grad_fn=<NllLossBackward>)



 49%|████▉     | 233/471 [29:15<29:50,  7.52s/it][A

loss: tensor(0.6708, device='cuda:0', grad_fn=<NllLossBackward>)



 50%|████▉     | 234/471 [29:22<29:38,  7.50s/it][A

loss: tensor(0.6429, device='cuda:0', grad_fn=<NllLossBackward>)



 50%|████▉     | 235/471 [29:30<29:34,  7.52s/it][A

loss: tensor(0.6315, device='cuda:0', grad_fn=<NllLossBackward>)



 50%|█████     | 236/471 [29:37<29:27,  7.52s/it][A

loss: tensor(0.6463, device='cuda:0', grad_fn=<NllLossBackward>)



 50%|█████     | 237/471 [29:45<29:21,  7.53s/it][A

loss: tensor(0.7011, device='cuda:0', grad_fn=<NllLossBackward>)



 51%|█████     | 238/471 [29:52<29:05,  7.49s/it][A

loss: tensor(0.7132, device='cuda:0', grad_fn=<NllLossBackward>)



 51%|█████     | 239/471 [30:00<28:58,  7.50s/it][A

loss: tensor(0.6889, device='cuda:0', grad_fn=<NllLossBackward>)



 51%|█████     | 240/471 [30:07<28:56,  7.52s/it][A

loss: tensor(0.6738, device='cuda:0', grad_fn=<NllLossBackward>)



 51%|█████     | 241/471 [30:15<28:56,  7.55s/it][A

loss: tensor(0.7260, device='cuda:0', grad_fn=<NllLossBackward>)



 51%|█████▏    | 242/471 [30:23<28:49,  7.55s/it][A

loss: tensor(0.6684, device='cuda:0', grad_fn=<NllLossBackward>)



 52%|█████▏    | 243/471 [30:30<28:43,  7.56s/it][A

loss: tensor(0.6887, device='cuda:0', grad_fn=<NllLossBackward>)



 52%|█████▏    | 244/471 [30:38<28:36,  7.56s/it][A

loss: tensor(0.6340, device='cuda:0', grad_fn=<NllLossBackward>)



 52%|█████▏    | 245/471 [30:45<28:28,  7.56s/it][A

loss: tensor(0.6958, device='cuda:0', grad_fn=<NllLossBackward>)



 52%|█████▏    | 246/471 [30:53<28:12,  7.52s/it][A

loss: tensor(0.6361, device='cuda:0', grad_fn=<NllLossBackward>)



 52%|█████▏    | 247/471 [31:00<28:03,  7.52s/it][A

loss: tensor(0.5535, device='cuda:0', grad_fn=<NllLossBackward>)



 53%|█████▎    | 248/471 [31:08<28:02,  7.54s/it][A

loss: tensor(0.6787, device='cuda:0', grad_fn=<NllLossBackward>)



 53%|█████▎    | 249/471 [31:15<27:53,  7.54s/it][A

loss: tensor(0.6459, device='cuda:0', grad_fn=<NllLossBackward>)



 53%|█████▎    | 250/471 [31:23<27:52,  7.57s/it][A

loss: tensor(0.6923, device='cuda:0', grad_fn=<NllLossBackward>)



 53%|█████▎    | 251/471 [31:30<27:42,  7.56s/it][A

loss: tensor(0.6621, device='cuda:0', grad_fn=<NllLossBackward>)



 54%|█████▎    | 252/471 [31:38<27:33,  7.55s/it][A

loss: tensor(0.6509, device='cuda:0', grad_fn=<NllLossBackward>)



 54%|█████▎    | 253/471 [31:45<27:18,  7.52s/it][A

loss: tensor(0.6031, device='cuda:0', grad_fn=<NllLossBackward>)



 54%|█████▍    | 254/471 [31:53<27:12,  7.52s/it][A

loss: tensor(0.5679, device='cuda:0', grad_fn=<NllLossBackward>)



 54%|█████▍    | 255/471 [32:01<27:05,  7.53s/it][A

loss: tensor(0.6260, device='cuda:0', grad_fn=<NllLossBackward>)



 54%|█████▍    | 256/471 [32:08<26:59,  7.53s/it][A

loss: tensor(0.6532, device='cuda:0', grad_fn=<NllLossBackward>)



 55%|█████▍    | 257/471 [32:15<26:45,  7.50s/it][A

loss: tensor(0.6484, device='cuda:0', grad_fn=<NllLossBackward>)



 55%|█████▍    | 258/471 [32:23<26:38,  7.51s/it][A

loss: tensor(0.6284, device='cuda:0', grad_fn=<NllLossBackward>)



 55%|█████▍    | 259/471 [32:31<26:32,  7.51s/it][A

loss: tensor(0.7218, device='cuda:0', grad_fn=<NllLossBackward>)



 55%|█████▌    | 260/471 [32:38<26:19,  7.49s/it][A

loss: tensor(0.6921, device='cuda:0', grad_fn=<NllLossBackward>)



 55%|█████▌    | 261/471 [32:45<26:16,  7.51s/it][A

loss: tensor(0.6872, device='cuda:0', grad_fn=<NllLossBackward>)



 56%|█████▌    | 262/471 [32:53<26:11,  7.52s/it][A

loss: tensor(0.6796, device='cuda:0', grad_fn=<NllLossBackward>)



 56%|█████▌    | 263/471 [33:01<26:08,  7.54s/it][A

loss: tensor(0.5824, device='cuda:0', grad_fn=<NllLossBackward>)



 56%|█████▌    | 264/471 [33:08<25:57,  7.52s/it][A

loss: tensor(0.6327, device='cuda:0', grad_fn=<NllLossBackward>)



 56%|█████▋    | 265/471 [33:16<25:54,  7.55s/it][A

loss: tensor(0.7257, device='cuda:0', grad_fn=<NllLossBackward>)



 56%|█████▋    | 266/471 [33:23<25:48,  7.55s/it][A

loss: tensor(0.6232, device='cuda:0', grad_fn=<NllLossBackward>)



 57%|█████▋    | 267/471 [33:31<25:43,  7.57s/it][A

loss: tensor(0.5435, device='cuda:0', grad_fn=<NllLossBackward>)



 57%|█████▋    | 268/471 [33:38<25:28,  7.53s/it][A

loss: tensor(0.5969, device='cuda:0', grad_fn=<NllLossBackward>)



 57%|█████▋    | 269/471 [33:46<25:21,  7.53s/it][A

loss: tensor(0.6638, device='cuda:0', grad_fn=<NllLossBackward>)



 57%|█████▋    | 270/471 [33:53<25:15,  7.54s/it][A

loss: tensor(0.6109, device='cuda:0', grad_fn=<NllLossBackward>)



 58%|█████▊    | 271/471 [34:01<25:06,  7.53s/it][A

loss: tensor(0.7367, device='cuda:0', grad_fn=<NllLossBackward>)



 58%|█████▊    | 272/471 [34:08<24:57,  7.53s/it][A

loss: tensor(0.5834, device='cuda:0', grad_fn=<NllLossBackward>)



 58%|█████▊    | 273/471 [34:16<24:50,  7.53s/it][A

loss: tensor(0.5680, device='cuda:0', grad_fn=<NllLossBackward>)



 58%|█████▊    | 274/471 [34:24<24:43,  7.53s/it][A

loss: tensor(0.6496, device='cuda:0', grad_fn=<NllLossBackward>)



 58%|█████▊    | 275/471 [34:31<24:28,  7.49s/it][A

loss: tensor(0.7392, device='cuda:0', grad_fn=<NllLossBackward>)



 59%|█████▊    | 276/471 [34:39<24:25,  7.52s/it][A

loss: tensor(0.6198, device='cuda:0', grad_fn=<NllLossBackward>)



 59%|█████▉    | 277/471 [34:46<24:16,  7.51s/it][A

loss: tensor(0.6462, device='cuda:0', grad_fn=<NllLossBackward>)



 59%|█████▉    | 278/471 [34:54<24:09,  7.51s/it][A

loss: tensor(0.7282, device='cuda:0', grad_fn=<NllLossBackward>)



 59%|█████▉    | 279/471 [35:01<23:57,  7.49s/it][A

loss: tensor(0.7219, device='cuda:0', grad_fn=<NllLossBackward>)



 59%|█████▉    | 280/471 [35:09<23:56,  7.52s/it][A

loss: tensor(0.6023, device='cuda:0', grad_fn=<NllLossBackward>)



 60%|█████▉    | 281/471 [35:16<23:48,  7.52s/it][A

loss: tensor(0.6217, device='cuda:0', grad_fn=<NllLossBackward>)



 60%|█████▉    | 282/471 [35:24<23:41,  7.52s/it][A

loss: tensor(0.6493, device='cuda:0', grad_fn=<NllLossBackward>)



 60%|██████    | 283/471 [35:31<23:35,  7.53s/it][A

loss: tensor(0.5776, device='cuda:0', grad_fn=<NllLossBackward>)



 60%|██████    | 284/471 [35:39<23:35,  7.57s/it][A

loss: tensor(0.8006, device='cuda:0', grad_fn=<NllLossBackward>)



 61%|██████    | 285/471 [35:46<23:33,  7.60s/it][A

loss: tensor(0.7173, device='cuda:0', grad_fn=<NllLossBackward>)



 61%|██████    | 286/471 [35:54<23:26,  7.60s/it][A

loss: tensor(0.7608, device='cuda:0', grad_fn=<NllLossBackward>)



 61%|██████    | 287/471 [36:02<23:11,  7.56s/it][A

loss: tensor(0.6426, device='cuda:0', grad_fn=<NllLossBackward>)



 61%|██████    | 288/471 [36:09<23:01,  7.55s/it][A

loss: tensor(0.7063, device='cuda:0', grad_fn=<NllLossBackward>)



 61%|██████▏   | 289/471 [36:17<22:55,  7.56s/it][A

loss: tensor(0.7450, device='cuda:0', grad_fn=<NllLossBackward>)



 62%|██████▏   | 290/471 [36:24<22:42,  7.53s/it][A

loss: tensor(0.6191, device='cuda:0', grad_fn=<NllLossBackward>)



 62%|██████▏   | 291/471 [36:32<22:36,  7.53s/it][A

loss: tensor(0.7737, device='cuda:0', grad_fn=<NllLossBackward>)



 62%|██████▏   | 292/471 [36:39<22:27,  7.53s/it][A

loss: tensor(0.7468, device='cuda:0', grad_fn=<NllLossBackward>)



 62%|██████▏   | 293/471 [36:47<22:21,  7.53s/it][A

loss: tensor(0.6133, device='cuda:0', grad_fn=<NllLossBackward>)



 62%|██████▏   | 294/471 [36:54<22:07,  7.50s/it][A

loss: tensor(0.6518, device='cuda:0', grad_fn=<NllLossBackward>)



 63%|██████▎   | 295/471 [37:02<22:04,  7.52s/it][A

loss: tensor(0.6678, device='cuda:0', grad_fn=<NllLossBackward>)



 63%|██████▎   | 296/471 [37:09<21:57,  7.53s/it][A

loss: tensor(0.5858, device='cuda:0', grad_fn=<NllLossBackward>)



 63%|██████▎   | 297/471 [37:17<21:50,  7.53s/it][A

loss: tensor(0.7900, device='cuda:0', grad_fn=<NllLossBackward>)



 63%|██████▎   | 298/471 [37:24<21:38,  7.51s/it][A

loss: tensor(0.7513, device='cuda:0', grad_fn=<NllLossBackward>)



 63%|██████▎   | 299/471 [37:32<21:31,  7.51s/it][A

loss: tensor(0.6491, device='cuda:0', grad_fn=<NllLossBackward>)



 64%|██████▎   | 300/471 [37:39<21:25,  7.52s/it][A

loss: tensor(0.6952, device='cuda:0', grad_fn=<NllLossBackward>)



 64%|██████▍   | 301/471 [37:47<21:14,  7.50s/it][A

loss: tensor(0.6959, device='cuda:0', grad_fn=<NllLossBackward>)



 64%|██████▍   | 302/471 [37:54<21:16,  7.55s/it][A

loss: tensor(0.6670, device='cuda:0', grad_fn=<NllLossBackward>)



 64%|██████▍   | 303/471 [38:02<21:13,  7.58s/it][A

loss: tensor(0.7152, device='cuda:0', grad_fn=<NllLossBackward>)



 65%|██████▍   | 304/471 [38:10<21:06,  7.58s/it][A

loss: tensor(0.6426, device='cuda:0', grad_fn=<NllLossBackward>)



 65%|██████▍   | 305/471 [38:17<20:54,  7.56s/it][A

loss: tensor(0.7021, device='cuda:0', grad_fn=<NllLossBackward>)



 65%|██████▍   | 306/471 [38:25<20:50,  7.58s/it][A

loss: tensor(0.7402, device='cuda:0', grad_fn=<NllLossBackward>)



 65%|██████▌   | 307/471 [38:32<20:44,  7.59s/it][A

loss: tensor(0.6571, device='cuda:0', grad_fn=<NllLossBackward>)



 65%|██████▌   | 308/471 [38:40<20:34,  7.58s/it][A

loss: tensor(0.7786, device='cuda:0', grad_fn=<NllLossBackward>)



 66%|██████▌   | 309/471 [38:47<20:20,  7.53s/it][A

loss: tensor(0.6893, device='cuda:0', grad_fn=<NllLossBackward>)



 66%|██████▌   | 310/471 [38:55<20:14,  7.54s/it][A

loss: tensor(0.7120, device='cuda:0', grad_fn=<NllLossBackward>)



 66%|██████▌   | 311/471 [39:03<20:13,  7.58s/it][A

loss: tensor(0.5818, device='cuda:0', grad_fn=<NllLossBackward>)



 66%|██████▌   | 312/471 [39:10<20:04,  7.58s/it][A

loss: tensor(0.6113, device='cuda:0', grad_fn=<NllLossBackward>)



 66%|██████▋   | 313/471 [39:18<19:56,  7.57s/it][A

loss: tensor(0.6648, device='cuda:0', grad_fn=<NllLossBackward>)



 67%|██████▋   | 314/471 [39:25<19:47,  7.56s/it][A

loss: tensor(0.6598, device='cuda:0', grad_fn=<NllLossBackward>)



 67%|██████▋   | 315/471 [39:33<19:40,  7.57s/it][A

loss: tensor(0.6822, device='cuda:0', grad_fn=<NllLossBackward>)



 67%|██████▋   | 316/471 [39:40<19:30,  7.55s/it][A

loss: tensor(0.6855, device='cuda:0', grad_fn=<NllLossBackward>)



 67%|██████▋   | 317/471 [39:48<19:24,  7.56s/it][A

loss: tensor(0.6957, device='cuda:0', grad_fn=<NllLossBackward>)



 68%|██████▊   | 318/471 [39:56<19:16,  7.56s/it][A

loss: tensor(0.5883, device='cuda:0', grad_fn=<NllLossBackward>)



 68%|██████▊   | 319/471 [40:03<19:11,  7.57s/it][A

loss: tensor(0.7199, device='cuda:0', grad_fn=<NllLossBackward>)



 68%|██████▊   | 320/471 [40:11<19:01,  7.56s/it][A

loss: tensor(0.6298, device='cuda:0', grad_fn=<NllLossBackward>)



 68%|██████▊   | 321/471 [40:18<18:53,  7.56s/it][A

loss: tensor(0.7458, device='cuda:0', grad_fn=<NllLossBackward>)



 68%|██████▊   | 322/471 [40:26<18:45,  7.56s/it][A

loss: tensor(0.6335, device='cuda:0', grad_fn=<NllLossBackward>)



 69%|██████▊   | 323/471 [40:33<18:40,  7.57s/it][A

loss: tensor(0.6592, device='cuda:0', grad_fn=<NllLossBackward>)



 69%|██████▉   | 324/471 [40:41<18:28,  7.54s/it][A

loss: tensor(0.6736, device='cuda:0', grad_fn=<NllLossBackward>)



 69%|██████▉   | 325/471 [40:48<18:20,  7.54s/it][A

loss: tensor(0.5815, device='cuda:0', grad_fn=<NllLossBackward>)



 69%|██████▉   | 326/471 [40:56<18:12,  7.53s/it][A

loss: tensor(0.5886, device='cuda:0', grad_fn=<NllLossBackward>)



 69%|██████▉   | 327/471 [41:03<18:03,  7.52s/it][A

loss: tensor(0.6833, device='cuda:0', grad_fn=<NllLossBackward>)



 70%|██████▉   | 328/471 [41:11<17:52,  7.50s/it][A

loss: tensor(0.6482, device='cuda:0', grad_fn=<NllLossBackward>)



 70%|██████▉   | 329/471 [41:18<17:48,  7.52s/it][A

loss: tensor(0.6862, device='cuda:0', grad_fn=<NllLossBackward>)



 70%|███████   | 330/471 [41:26<17:44,  7.55s/it][A

loss: tensor(0.6962, device='cuda:0', grad_fn=<NllLossBackward>)



 70%|███████   | 331/471 [41:33<17:33,  7.52s/it][A

loss: tensor(0.6316, device='cuda:0', grad_fn=<NllLossBackward>)



 70%|███████   | 332/471 [41:41<17:26,  7.53s/it][A

loss: tensor(0.7046, device='cuda:0', grad_fn=<NllLossBackward>)



 71%|███████   | 333/471 [41:49<17:19,  7.53s/it][A

loss: tensor(0.6927, device='cuda:0', grad_fn=<NllLossBackward>)



 71%|███████   | 334/471 [41:56<17:13,  7.54s/it][A

loss: tensor(0.6523, device='cuda:0', grad_fn=<NllLossBackward>)



 71%|███████   | 335/471 [42:04<17:01,  7.51s/it][A

loss: tensor(0.6683, device='cuda:0', grad_fn=<NllLossBackward>)



 71%|███████▏  | 336/471 [42:11<16:54,  7.52s/it][A

loss: tensor(0.5973, device='cuda:0', grad_fn=<NllLossBackward>)



 72%|███████▏  | 337/471 [42:19<16:48,  7.52s/it][A

loss: tensor(0.5883, device='cuda:0', grad_fn=<NllLossBackward>)



 72%|███████▏  | 338/471 [42:26<16:41,  7.53s/it][A

loss: tensor(0.6927, device='cuda:0', grad_fn=<NllLossBackward>)



 72%|███████▏  | 339/471 [42:34<16:30,  7.50s/it][A

loss: tensor(0.5410, device='cuda:0', grad_fn=<NllLossBackward>)



 72%|███████▏  | 340/471 [42:41<16:22,  7.50s/it][A

loss: tensor(0.5798, device='cuda:0', grad_fn=<NllLossBackward>)



 72%|███████▏  | 341/471 [42:49<16:15,  7.51s/it][A

loss: tensor(0.6133, device='cuda:0', grad_fn=<NllLossBackward>)



 73%|███████▎  | 342/471 [42:56<16:08,  7.51s/it][A

loss: tensor(0.6340, device='cuda:0', grad_fn=<NllLossBackward>)



 73%|███████▎  | 343/471 [43:04<16:02,  7.52s/it][A

loss: tensor(0.6392, device='cuda:0', grad_fn=<NllLossBackward>)



 73%|███████▎  | 344/471 [43:11<15:54,  7.52s/it][A

loss: tensor(0.7000, device='cuda:0', grad_fn=<NllLossBackward>)



 73%|███████▎  | 345/471 [43:19<15:51,  7.55s/it][A

loss: tensor(0.6920, device='cuda:0', grad_fn=<NllLossBackward>)



 73%|███████▎  | 346/471 [43:26<15:42,  7.54s/it][A

loss: tensor(0.6424, device='cuda:0', grad_fn=<NllLossBackward>)



 74%|███████▎  | 347/471 [43:34<15:37,  7.56s/it][A

loss: tensor(0.6857, device='cuda:0', grad_fn=<NllLossBackward>)



 74%|███████▍  | 348/471 [43:42<15:31,  7.57s/it][A

loss: tensor(0.7081, device='cuda:0', grad_fn=<NllLossBackward>)



 74%|███████▍  | 349/471 [43:49<15:26,  7.59s/it][A

loss: tensor(0.6160, device='cuda:0', grad_fn=<NllLossBackward>)



 74%|███████▍  | 350/471 [43:57<15:16,  7.57s/it][A

loss: tensor(0.5726, device='cuda:0', grad_fn=<NllLossBackward>)



 75%|███████▍  | 351/471 [44:04<15:07,  7.56s/it][A

loss: tensor(0.7010, device='cuda:0', grad_fn=<NllLossBackward>)



 75%|███████▍  | 352/471 [44:12<14:58,  7.55s/it][A

loss: tensor(0.5673, device='cuda:0', grad_fn=<NllLossBackward>)



 75%|███████▍  | 353/471 [44:19<14:50,  7.55s/it][A

loss: tensor(0.6480, device='cuda:0', grad_fn=<NllLossBackward>)



 75%|███████▌  | 354/471 [44:27<14:38,  7.51s/it][A

loss: tensor(0.7040, device='cuda:0', grad_fn=<NllLossBackward>)



 75%|███████▌  | 355/471 [44:34<14:30,  7.51s/it][A

loss: tensor(0.5557, device='cuda:0', grad_fn=<NllLossBackward>)



 76%|███████▌  | 356/471 [44:42<14:24,  7.51s/it][A

loss: tensor(0.5858, device='cuda:0', grad_fn=<NllLossBackward>)



 76%|███████▌  | 357/471 [44:49<14:13,  7.49s/it][A

loss: tensor(0.7637, device='cuda:0', grad_fn=<NllLossBackward>)



 76%|███████▌  | 358/471 [44:57<14:08,  7.51s/it][A

loss: tensor(0.6815, device='cuda:0', grad_fn=<NllLossBackward>)



 76%|███████▌  | 359/471 [45:04<14:00,  7.51s/it][A

loss: tensor(0.6849, device='cuda:0', grad_fn=<NllLossBackward>)



 76%|███████▋  | 360/471 [45:12<13:54,  7.52s/it][A

loss: tensor(0.6592, device='cuda:0', grad_fn=<NllLossBackward>)



 77%|███████▋  | 361/471 [45:19<13:42,  7.48s/it][A

loss: tensor(0.6755, device='cuda:0', grad_fn=<NllLossBackward>)



 77%|███████▋  | 362/471 [45:27<13:36,  7.49s/it][A

loss: tensor(0.7078, device='cuda:0', grad_fn=<NllLossBackward>)



 77%|███████▋  | 363/471 [45:34<13:29,  7.49s/it][A

loss: tensor(0.7215, device='cuda:0', grad_fn=<NllLossBackward>)



 77%|███████▋  | 364/471 [45:42<13:22,  7.50s/it][A

loss: tensor(0.6623, device='cuda:0', grad_fn=<NllLossBackward>)



 77%|███████▋  | 365/471 [45:49<13:14,  7.50s/it][A

loss: tensor(0.6661, device='cuda:0', grad_fn=<NllLossBackward>)



 78%|███████▊  | 366/471 [45:57<13:08,  7.51s/it][A

loss: tensor(0.6986, device='cuda:0', grad_fn=<NllLossBackward>)



 78%|███████▊  | 367/471 [46:04<13:02,  7.52s/it][A

loss: tensor(0.5415, device='cuda:0', grad_fn=<NllLossBackward>)



 78%|███████▊  | 368/471 [46:12<12:56,  7.54s/it][A

loss: tensor(0.6978, device='cuda:0', grad_fn=<NllLossBackward>)



 78%|███████▊  | 369/471 [46:19<12:50,  7.55s/it][A

loss: tensor(0.6661, device='cuda:0', grad_fn=<NllLossBackward>)



 79%|███████▊  | 370/471 [46:27<12:44,  7.56s/it][A

loss: tensor(0.5751, device='cuda:0', grad_fn=<NllLossBackward>)



 79%|███████▉  | 371/471 [46:35<12:39,  7.59s/it][A

loss: tensor(0.6130, device='cuda:0', grad_fn=<NllLossBackward>)



 79%|███████▉  | 372/471 [46:42<12:28,  7.56s/it][A

loss: tensor(0.7015, device='cuda:0', grad_fn=<NllLossBackward>)



 79%|███████▉  | 373/471 [46:50<12:22,  7.57s/it][A

loss: tensor(0.6597, device='cuda:0', grad_fn=<NllLossBackward>)



 79%|███████▉  | 374/471 [46:57<12:14,  7.57s/it][A

loss: tensor(0.7168, device='cuda:0', grad_fn=<NllLossBackward>)



 80%|███████▉  | 375/471 [47:05<12:06,  7.57s/it][A

loss: tensor(0.5908, device='cuda:0', grad_fn=<NllLossBackward>)



 80%|███████▉  | 376/471 [47:12<11:55,  7.53s/it][A

loss: tensor(0.7188, device='cuda:0', grad_fn=<NllLossBackward>)



 80%|████████  | 377/471 [47:20<11:48,  7.53s/it][A

loss: tensor(0.8831, device='cuda:0', grad_fn=<NllLossBackward>)



 80%|████████  | 378/471 [47:27<11:39,  7.53s/it][A

loss: tensor(0.6397, device='cuda:0', grad_fn=<NllLossBackward>)



 80%|████████  | 379/471 [47:35<11:32,  7.52s/it][A

loss: tensor(0.7613, device='cuda:0', grad_fn=<NllLossBackward>)



 81%|████████  | 380/471 [47:42<11:21,  7.49s/it][A

loss: tensor(0.6732, device='cuda:0', grad_fn=<NllLossBackward>)



 81%|████████  | 381/471 [47:50<11:14,  7.49s/it][A

loss: tensor(0.6733, device='cuda:0', grad_fn=<NllLossBackward>)



 81%|████████  | 382/471 [47:57<11:07,  7.50s/it][A

loss: tensor(0.7980, device='cuda:0', grad_fn=<NllLossBackward>)



 81%|████████▏ | 383/471 [48:05<10:58,  7.48s/it][A

loss: tensor(0.7113, device='cuda:0', grad_fn=<NllLossBackward>)



 82%|████████▏ | 384/471 [48:12<10:52,  7.50s/it][A

loss: tensor(0.7654, device='cuda:0', grad_fn=<NllLossBackward>)



 82%|████████▏ | 385/471 [48:20<10:46,  7.52s/it][A

loss: tensor(0.7372, device='cuda:0', grad_fn=<NllLossBackward>)



 82%|████████▏ | 386/471 [48:28<10:40,  7.53s/it][A

loss: tensor(0.6426, device='cuda:0', grad_fn=<NllLossBackward>)



 82%|████████▏ | 387/471 [48:35<10:30,  7.50s/it][A

loss: tensor(0.6703, device='cuda:0', grad_fn=<NllLossBackward>)



 82%|████████▏ | 388/471 [48:43<10:26,  7.55s/it][A

loss: tensor(0.5768, device='cuda:0', grad_fn=<NllLossBackward>)



 83%|████████▎ | 389/471 [48:50<10:18,  7.55s/it][A

loss: tensor(0.6111, device='cuda:0', grad_fn=<NllLossBackward>)



 83%|████████▎ | 390/471 [48:58<10:11,  7.55s/it][A

loss: tensor(0.7429, device='cuda:0', grad_fn=<NllLossBackward>)



 83%|████████▎ | 391/471 [49:05<10:01,  7.52s/it][A

loss: tensor(0.7440, device='cuda:0', grad_fn=<NllLossBackward>)



 83%|████████▎ | 392/471 [49:13<09:54,  7.53s/it][A

loss: tensor(0.7580, device='cuda:0', grad_fn=<NllLossBackward>)



 83%|████████▎ | 393/471 [49:20<09:47,  7.53s/it][A

loss: tensor(0.7594, device='cuda:0', grad_fn=<NllLossBackward>)



 84%|████████▎ | 394/471 [49:28<09:40,  7.54s/it][A

loss: tensor(0.7511, device='cuda:0', grad_fn=<NllLossBackward>)



 84%|████████▍ | 395/471 [49:35<09:30,  7.50s/it][A

loss: tensor(0.6972, device='cuda:0', grad_fn=<NllLossBackward>)



 84%|████████▍ | 396/471 [49:43<09:25,  7.54s/it][A

loss: tensor(0.6493, device='cuda:0', grad_fn=<NllLossBackward>)



 84%|████████▍ | 397/471 [49:50<09:19,  7.56s/it][A

loss: tensor(0.6881, device='cuda:0', grad_fn=<NllLossBackward>)



 85%|████████▍ | 398/471 [49:58<09:09,  7.53s/it][A

loss: tensor(0.6893, device='cuda:0', grad_fn=<NllLossBackward>)



 85%|████████▍ | 399/471 [50:05<09:02,  7.53s/it][A

loss: tensor(0.6738, device='cuda:0', grad_fn=<NllLossBackward>)



 85%|████████▍ | 400/471 [50:13<08:54,  7.53s/it][A

loss: tensor(0.6389, device='cuda:0', grad_fn=<NllLossBackward>)



 85%|████████▌ | 401/471 [50:21<08:47,  7.53s/it][A

loss: tensor(0.8099, device='cuda:0', grad_fn=<NllLossBackward>)



 85%|████████▌ | 402/471 [50:28<08:37,  7.50s/it][A

loss: tensor(0.6835, device='cuda:0', grad_fn=<NllLossBackward>)



 86%|████████▌ | 403/471 [50:35<08:30,  7.50s/it][A

loss: tensor(0.6426, device='cuda:0', grad_fn=<NllLossBackward>)



 86%|████████▌ | 404/471 [50:43<08:23,  7.51s/it][A

loss: tensor(0.6634, device='cuda:0', grad_fn=<NllLossBackward>)



 86%|████████▌ | 405/471 [50:51<08:16,  7.53s/it][A

loss: tensor(0.6297, device='cuda:0', grad_fn=<NllLossBackward>)



 86%|████████▌ | 406/471 [50:58<08:08,  7.52s/it][A

loss: tensor(0.7896, device='cuda:0', grad_fn=<NllLossBackward>)



 86%|████████▋ | 407/471 [51:06<08:01,  7.52s/it][A

loss: tensor(0.6632, device='cuda:0', grad_fn=<NllLossBackward>)



 87%|████████▋ | 408/471 [51:13<07:54,  7.53s/it][A

loss: tensor(0.6955, device='cuda:0', grad_fn=<NllLossBackward>)



 87%|████████▋ | 409/471 [51:21<07:46,  7.53s/it][A

loss: tensor(0.6598, device='cuda:0', grad_fn=<NllLossBackward>)



 87%|████████▋ | 410/471 [51:28<07:38,  7.51s/it][A

loss: tensor(0.6770, device='cuda:0', grad_fn=<NllLossBackward>)



 87%|████████▋ | 411/471 [51:36<07:30,  7.51s/it][A

loss: tensor(0.6837, device='cuda:0', grad_fn=<NllLossBackward>)



 87%|████████▋ | 412/471 [51:43<07:23,  7.52s/it][A

loss: tensor(0.7528, device='cuda:0', grad_fn=<NllLossBackward>)



 88%|████████▊ | 413/471 [51:51<07:14,  7.50s/it][A

loss: tensor(0.6754, device='cuda:0', grad_fn=<NllLossBackward>)



 88%|████████▊ | 414/471 [51:58<07:09,  7.53s/it][A

loss: tensor(0.7451, device='cuda:0', grad_fn=<NllLossBackward>)



 88%|████████▊ | 415/471 [52:06<07:03,  7.56s/it][A

loss: tensor(0.7196, device='cuda:0', grad_fn=<NllLossBackward>)



 88%|████████▊ | 416/471 [52:13<06:56,  7.57s/it][A

loss: tensor(0.6193, device='cuda:0', grad_fn=<NllLossBackward>)



 89%|████████▊ | 417/471 [52:21<06:46,  7.52s/it][A

loss: tensor(0.6748, device='cuda:0', grad_fn=<NllLossBackward>)



 89%|████████▊ | 418/471 [52:28<06:39,  7.53s/it][A

loss: tensor(0.6080, device='cuda:0', grad_fn=<NllLossBackward>)



 89%|████████▉ | 419/471 [52:36<06:32,  7.54s/it][A

loss: tensor(0.6548, device='cuda:0', grad_fn=<NllLossBackward>)



 89%|████████▉ | 420/471 [52:43<06:24,  7.53s/it][A

loss: tensor(0.6734, device='cuda:0', grad_fn=<NllLossBackward>)



 89%|████████▉ | 421/471 [52:51<06:14,  7.50s/it][A

loss: tensor(0.6703, device='cuda:0', grad_fn=<NllLossBackward>)



 90%|████████▉ | 422/471 [52:58<06:08,  7.52s/it][A

loss: tensor(0.6461, device='cuda:0', grad_fn=<NllLossBackward>)



 90%|████████▉ | 423/471 [53:06<06:02,  7.54s/it][A

loss: tensor(0.6454, device='cuda:0', grad_fn=<NllLossBackward>)



 90%|█████████ | 424/471 [53:14<05:53,  7.52s/it][A

loss: tensor(0.8067, device='cuda:0', grad_fn=<NllLossBackward>)



 90%|█████████ | 425/471 [53:21<05:46,  7.53s/it][A

loss: tensor(0.6138, device='cuda:0', grad_fn=<NllLossBackward>)



 90%|█████████ | 426/471 [53:29<05:39,  7.53s/it][A

loss: tensor(0.6166, device='cuda:0', grad_fn=<NllLossBackward>)



 91%|█████████ | 427/471 [53:36<05:32,  7.55s/it][A

loss: tensor(0.6476, device='cuda:0', grad_fn=<NllLossBackward>)



 91%|█████████ | 428/471 [53:44<05:23,  7.51s/it][A

loss: tensor(0.6152, device='cuda:0', grad_fn=<NllLossBackward>)



 91%|█████████ | 429/471 [53:51<05:15,  7.52s/it][A

loss: tensor(0.5268, device='cuda:0', grad_fn=<NllLossBackward>)



 91%|█████████▏| 430/471 [53:59<05:08,  7.53s/it][A

loss: tensor(0.6791, device='cuda:0', grad_fn=<NllLossBackward>)



 92%|█████████▏| 431/471 [54:06<05:01,  7.54s/it][A

loss: tensor(0.6630, device='cuda:0', grad_fn=<NllLossBackward>)



 92%|█████████▏| 432/471 [54:14<04:52,  7.51s/it][A

loss: tensor(0.6021, device='cuda:0', grad_fn=<NllLossBackward>)



 92%|█████████▏| 433/471 [54:21<04:46,  7.53s/it][A

loss: tensor(0.8154, device='cuda:0', grad_fn=<NllLossBackward>)



 92%|█████████▏| 434/471 [54:29<04:38,  7.52s/it][A

loss: tensor(0.6130, device='cuda:0', grad_fn=<NllLossBackward>)



 92%|█████████▏| 435/471 [54:36<04:30,  7.52s/it][A

loss: tensor(0.6400, device='cuda:0', grad_fn=<NllLossBackward>)



 93%|█████████▎| 436/471 [54:44<04:23,  7.52s/it][A

loss: tensor(0.6075, device='cuda:0', grad_fn=<NllLossBackward>)



 93%|█████████▎| 437/471 [54:51<04:16,  7.54s/it][A

loss: tensor(0.7498, device='cuda:0', grad_fn=<NllLossBackward>)



 93%|█████████▎| 438/471 [54:59<04:09,  7.56s/it][A

loss: tensor(0.5920, device='cuda:0', grad_fn=<NllLossBackward>)



 93%|█████████▎| 439/471 [55:06<04:00,  7.52s/it][A

loss: tensor(0.7314, device='cuda:0', grad_fn=<NllLossBackward>)



 93%|█████████▎| 440/471 [55:14<03:53,  7.52s/it][A

loss: tensor(0.6988, device='cuda:0', grad_fn=<NllLossBackward>)



 94%|█████████▎| 441/471 [55:21<03:45,  7.51s/it][A

loss: tensor(0.6845, device='cuda:0', grad_fn=<NllLossBackward>)



 94%|█████████▍| 442/471 [55:29<03:37,  7.52s/it][A

loss: tensor(0.6926, device='cuda:0', grad_fn=<NllLossBackward>)



 94%|█████████▍| 443/471 [55:37<03:30,  7.51s/it][A

loss: tensor(0.7559, device='cuda:0', grad_fn=<NllLossBackward>)



 94%|█████████▍| 444/471 [55:44<03:25,  7.61s/it][A

loss: tensor(0.6452, device='cuda:0', grad_fn=<NllLossBackward>)



 94%|█████████▍| 445/471 [55:52<03:19,  7.67s/it][A

loss: tensor(0.6165, device='cuda:0', grad_fn=<NllLossBackward>)



 95%|█████████▍| 446/471 [56:00<03:12,  7.69s/it][A

loss: tensor(0.6401, device='cuda:0', grad_fn=<NllLossBackward>)



 95%|█████████▍| 447/471 [56:07<03:03,  7.64s/it][A

loss: tensor(0.6507, device='cuda:0', grad_fn=<NllLossBackward>)



 95%|█████████▌| 448/471 [56:15<02:55,  7.61s/it][A

loss: tensor(0.6034, device='cuda:0', grad_fn=<NllLossBackward>)



 95%|█████████▌| 449/471 [56:23<02:47,  7.59s/it][A

loss: tensor(0.6101, device='cuda:0', grad_fn=<NllLossBackward>)



 96%|█████████▌| 450/471 [56:30<02:39,  7.58s/it][A

loss: tensor(0.6611, device='cuda:0', grad_fn=<NllLossBackward>)



 96%|█████████▌| 451/471 [56:38<02:30,  7.54s/it][A

loss: tensor(0.6621, device='cuda:0', grad_fn=<NllLossBackward>)



 96%|█████████▌| 452/471 [56:45<02:23,  7.54s/it][A

loss: tensor(0.6528, device='cuda:0', grad_fn=<NllLossBackward>)



 96%|█████████▌| 453/471 [56:53<02:15,  7.54s/it][A

loss: tensor(0.5904, device='cuda:0', grad_fn=<NllLossBackward>)



 96%|█████████▋| 454/471 [57:00<02:07,  7.50s/it][A

loss: tensor(0.5814, device='cuda:0', grad_fn=<NllLossBackward>)



 97%|█████████▋| 455/471 [57:08<02:00,  7.51s/it][A

loss: tensor(0.5753, device='cuda:0', grad_fn=<NllLossBackward>)



 97%|█████████▋| 456/471 [57:15<01:52,  7.53s/it][A

loss: tensor(0.7860, device='cuda:0', grad_fn=<NllLossBackward>)



 97%|█████████▋| 457/471 [57:23<01:45,  7.56s/it][A

loss: tensor(0.7069, device='cuda:0', grad_fn=<NllLossBackward>)



 97%|█████████▋| 458/471 [57:30<01:37,  7.54s/it][A

loss: tensor(0.7291, device='cuda:0', grad_fn=<NllLossBackward>)



 97%|█████████▋| 459/471 [57:38<01:30,  7.55s/it][A

loss: tensor(0.6466, device='cuda:0', grad_fn=<NllLossBackward>)



 98%|█████████▊| 460/471 [57:45<01:23,  7.56s/it][A

loss: tensor(0.7392, device='cuda:0', grad_fn=<NllLossBackward>)



 98%|█████████▊| 461/471 [57:53<01:15,  7.56s/it][A

loss: tensor(0.7650, device='cuda:0', grad_fn=<NllLossBackward>)



 98%|█████████▊| 462/471 [58:00<01:07,  7.54s/it][A

loss: tensor(0.6509, device='cuda:0', grad_fn=<NllLossBackward>)



 98%|█████████▊| 463/471 [58:08<01:00,  7.55s/it][A

loss: tensor(0.8079, device='cuda:0', grad_fn=<NllLossBackward>)



 99%|█████████▊| 464/471 [58:16<00:52,  7.56s/it][A

loss: tensor(0.5591, device='cuda:0', grad_fn=<NllLossBackward>)



 99%|█████████▊| 465/471 [58:23<00:45,  7.54s/it][A

loss: tensor(0.5509, device='cuda:0', grad_fn=<NllLossBackward>)



 99%|█████████▉| 466/471 [58:31<00:37,  7.54s/it][A

loss: tensor(0.6943, device='cuda:0', grad_fn=<NllLossBackward>)



 99%|█████████▉| 467/471 [58:38<00:30,  7.54s/it][A

loss: tensor(0.6573, device='cuda:0', grad_fn=<NllLossBackward>)



 99%|█████████▉| 468/471 [58:46<00:22,  7.55s/it][A

loss: tensor(0.7460, device='cuda:0', grad_fn=<NllLossBackward>)



100%|█████████▉| 469/471 [58:53<00:15,  7.52s/it][A

loss: tensor(0.6560, device='cuda:0', grad_fn=<NllLossBackward>)



100%|█████████▉| 470/471 [59:01<00:07,  7.54s/it][A

loss: tensor(0.7783, device='cuda:0', grad_fn=<NllLossBackward>)



100%|██████████| 471/471 [59:08<00:00,  7.53s/it][A


loss: tensor(0.7420, device='cuda:0', grad_fn=<NllLossBackward>)

	Train loss: 0.6705948170076763

	train acc: 0.741640127388535

	training prec: 0.5288431210803822

	training rec: 0.540269327570283

	training f1: 0.5143905697829476



  0%|          | 0/118 [00:00<?, ?it/s][A
  1%|          | 1/118 [00:01<03:30,  1.80s/it][A
  2%|▏         | 2/118 [00:03<03:25,  1.78s/it][A
  3%|▎         | 3/118 [00:05<03:18,  1.73s/it][A
  3%|▎         | 4/118 [00:06<03:18,  1.74s/it][A
  4%|▍         | 5/118 [00:08<03:17,  1.75s/it][A
  5%|▌         | 6/118 [00:10<03:16,  1.75s/it][A
  6%|▌         | 7/118 [00:12<03:14,  1.76s/it][A
  7%|▋         | 8/118 [00:14<03:14,  1.76s/it][A
  8%|▊         | 9/118 [00:15<03:12,  1.77s/it][A
  8%|▊         | 10/118 [00:17<03:10,  1.77s/it][A
  9%|▉         | 11/118 [00:19<03:06,  1.74s/it][A
 10%|█         | 12/118 [00:21<03:05,  1.75s/it][A
 11%|█         | 13/118 [00:22<03:04,  1.76s/it][A
 12%|█▏        | 14/118 [00:24<03:03,  1.76s/it][A
 13%|█▎        | 15/118 [00:26<03:01,  1.76s/it][A
 14%|█▎        | 16/118 [00:28<03:00,  1.77s/it][A
 14%|█▍        | 17/118 [00:29<02:58,  1.77s/it][A
 15%|█▌        | 18/118 [00:31<02:57,  1.77s/it][A
 16%|█▌        | 19/118 [00:3


	Validation loss: 0.6544705619246273

	Validation acc: 0.8492095827900913

	Validation prec: 0.5906418793071337

	Validation rec: 0.5749035039818938

	Validation f1: 0.5638042851659473
Validation loss decreased (0.670349 --> 0.654471).  Saving model ...


Epoch:   4%|▍         | 2/50 [2:05:14<50:05:50, 3757.30s/it]




  0%|          | 0/471 [00:00<?, ?it/s][A
  0%|          | 1/471 [00:07<58:48,  7.51s/it][A

loss: tensor(0.6813, device='cuda:0', grad_fn=<NllLossBackward>)



  0%|          | 2/471 [00:15<59:05,  7.56s/it][A

loss: tensor(0.6677, device='cuda:0', grad_fn=<NllLossBackward>)



  1%|          | 3/471 [00:22<59:06,  7.58s/it][A

loss: tensor(0.6638, device='cuda:0', grad_fn=<NllLossBackward>)



  1%|          | 4/471 [00:30<58:44,  7.55s/it][A

loss: tensor(0.6259, device='cuda:0', grad_fn=<NllLossBackward>)



  1%|          | 5/471 [00:37<58:46,  7.57s/it][A

loss: tensor(0.6276, device='cuda:0', grad_fn=<NllLossBackward>)



  1%|▏         | 6/471 [00:45<58:42,  7.57s/it][A

loss: tensor(0.6491, device='cuda:0', grad_fn=<NllLossBackward>)



  1%|▏         | 7/471 [00:53<58:39,  7.58s/it][A

loss: tensor(0.7054, device='cuda:0', grad_fn=<NllLossBackward>)



  2%|▏         | 8/471 [01:00<58:18,  7.56s/it][A

loss: tensor(0.6355, device='cuda:0', grad_fn=<NllLossBackward>)



  2%|▏         | 9/471 [01:08<58:14,  7.56s/it][A

loss: tensor(0.6885, device='cuda:0', grad_fn=<NllLossBackward>)



  2%|▏         | 10/471 [01:15<58:09,  7.57s/it][A

loss: tensor(0.6573, device='cuda:0', grad_fn=<NllLossBackward>)



  2%|▏         | 11/471 [01:23<58:03,  7.57s/it][A

loss: tensor(0.6330, device='cuda:0', grad_fn=<NllLossBackward>)



  3%|▎         | 12/471 [01:30<57:43,  7.55s/it][A

loss: tensor(0.8068, device='cuda:0', grad_fn=<NllLossBackward>)



  3%|▎         | 13/471 [01:38<57:36,  7.55s/it][A

loss: tensor(0.6186, device='cuda:0', grad_fn=<NllLossBackward>)



  3%|▎         | 14/471 [01:45<57:34,  7.56s/it][A

loss: tensor(0.6080, device='cuda:0', grad_fn=<NllLossBackward>)



  3%|▎         | 15/471 [01:53<57:13,  7.53s/it][A

loss: tensor(0.6760, device='cuda:0', grad_fn=<NllLossBackward>)



  3%|▎         | 16/471 [02:00<57:11,  7.54s/it][A

loss: tensor(0.7293, device='cuda:0', grad_fn=<NllLossBackward>)



  4%|▎         | 17/471 [02:08<57:05,  7.54s/it][A

loss: tensor(0.6264, device='cuda:0', grad_fn=<NllLossBackward>)



  4%|▍         | 18/471 [02:16<57:02,  7.56s/it][A

loss: tensor(0.6624, device='cuda:0', grad_fn=<NllLossBackward>)



  4%|▍         | 19/471 [02:23<56:47,  7.54s/it][A

loss: tensor(0.6119, device='cuda:0', grad_fn=<NllLossBackward>)



  4%|▍         | 20/471 [02:31<56:48,  7.56s/it][A

loss: tensor(0.5924, device='cuda:0', grad_fn=<NllLossBackward>)



  4%|▍         | 21/471 [02:38<56:43,  7.56s/it][A

loss: tensor(0.6372, device='cuda:0', grad_fn=<NllLossBackward>)



  5%|▍         | 22/471 [02:46<56:37,  7.57s/it][A

loss: tensor(0.7669, device='cuda:0', grad_fn=<NllLossBackward>)



  5%|▍         | 23/471 [02:53<56:20,  7.55s/it][A

loss: tensor(0.5830, device='cuda:0', grad_fn=<NllLossBackward>)



  5%|▌         | 24/471 [03:01<56:16,  7.55s/it][A

loss: tensor(0.7133, device='cuda:0', grad_fn=<NllLossBackward>)



  5%|▌         | 25/471 [03:08<56:14,  7.57s/it][A

loss: tensor(0.6019, device='cuda:0', grad_fn=<NllLossBackward>)



  6%|▌         | 26/471 [03:16<56:07,  7.57s/it][A

loss: tensor(0.6305, device='cuda:0', grad_fn=<NllLossBackward>)



  6%|▌         | 27/471 [03:23<55:50,  7.55s/it][A

loss: tensor(0.6852, device='cuda:0', grad_fn=<NllLossBackward>)



  6%|▌         | 28/471 [03:31<55:41,  7.54s/it][A

loss: tensor(0.6586, device='cuda:0', grad_fn=<NllLossBackward>)



  6%|▌         | 29/471 [03:39<55:38,  7.55s/it][A

loss: tensor(0.5887, device='cuda:0', grad_fn=<NllLossBackward>)



  6%|▋         | 30/471 [03:46<55:20,  7.53s/it][A

loss: tensor(0.6757, device='cuda:0', grad_fn=<NllLossBackward>)



  7%|▋         | 31/471 [03:54<55:16,  7.54s/it][A

loss: tensor(0.6407, device='cuda:0', grad_fn=<NllLossBackward>)



  7%|▋         | 32/471 [04:01<55:10,  7.54s/it][A

loss: tensor(0.6735, device='cuda:0', grad_fn=<NllLossBackward>)



  7%|▋         | 33/471 [04:09<55:05,  7.55s/it][A

loss: tensor(0.5997, device='cuda:0', grad_fn=<NllLossBackward>)



  7%|▋         | 34/471 [04:16<54:44,  7.52s/it][A

loss: tensor(0.6648, device='cuda:0', grad_fn=<NllLossBackward>)



  7%|▋         | 35/471 [04:24<54:40,  7.52s/it][A

loss: tensor(0.7050, device='cuda:0', grad_fn=<NllLossBackward>)



  8%|▊         | 36/471 [04:31<54:36,  7.53s/it][A

loss: tensor(0.5615, device='cuda:0', grad_fn=<NllLossBackward>)



  8%|▊         | 37/471 [04:39<54:34,  7.54s/it][A

loss: tensor(0.6395, device='cuda:0', grad_fn=<NllLossBackward>)



  8%|▊         | 38/471 [04:46<54:21,  7.53s/it][A

loss: tensor(0.6262, device='cuda:0', grad_fn=<NllLossBackward>)



  8%|▊         | 39/471 [04:54<54:21,  7.55s/it][A

loss: tensor(0.6822, device='cuda:0', grad_fn=<NllLossBackward>)



  8%|▊         | 40/471 [05:02<54:18,  7.56s/it][A

loss: tensor(0.6814, device='cuda:0', grad_fn=<NllLossBackward>)



  9%|▊         | 41/471 [05:09<54:15,  7.57s/it][A

loss: tensor(0.6837, device='cuda:0', grad_fn=<NllLossBackward>)



  9%|▉         | 42/471 [05:17<53:59,  7.55s/it][A

loss: tensor(0.6971, device='cuda:0', grad_fn=<NllLossBackward>)



  9%|▉         | 43/471 [05:24<53:54,  7.56s/it][A

loss: tensor(0.6785, device='cuda:0', grad_fn=<NllLossBackward>)



  9%|▉         | 44/471 [05:32<53:53,  7.57s/it][A

loss: tensor(0.6587, device='cuda:0', grad_fn=<NllLossBackward>)



 10%|▉         | 45/471 [05:39<53:33,  7.54s/it][A

loss: tensor(0.6271, device='cuda:0', grad_fn=<NllLossBackward>)



 10%|▉         | 46/471 [05:47<53:31,  7.56s/it][A

loss: tensor(0.7818, device='cuda:0', grad_fn=<NllLossBackward>)



 10%|▉         | 47/471 [05:54<53:21,  7.55s/it][A

loss: tensor(0.6713, device='cuda:0', grad_fn=<NllLossBackward>)



 10%|█         | 48/471 [06:02<53:14,  7.55s/it][A

loss: tensor(0.6039, device='cuda:0', grad_fn=<NllLossBackward>)



 10%|█         | 49/471 [06:09<52:53,  7.52s/it][A

loss: tensor(0.6495, device='cuda:0', grad_fn=<NllLossBackward>)



 11%|█         | 50/471 [06:17<52:48,  7.53s/it][A

loss: tensor(0.6920, device='cuda:0', grad_fn=<NllLossBackward>)



 11%|█         | 51/471 [06:24<52:41,  7.53s/it][A

loss: tensor(0.6153, device='cuda:0', grad_fn=<NllLossBackward>)



 11%|█         | 52/471 [06:32<52:32,  7.52s/it][A

loss: tensor(0.7237, device='cuda:0', grad_fn=<NllLossBackward>)



 11%|█▏        | 53/471 [06:39<52:15,  7.50s/it][A

loss: tensor(0.6518, device='cuda:0', grad_fn=<NllLossBackward>)



 11%|█▏        | 54/471 [06:47<52:09,  7.50s/it][A

loss: tensor(0.7555, device='cuda:0', grad_fn=<NllLossBackward>)



 12%|█▏        | 55/471 [06:55<52:04,  7.51s/it][A

loss: tensor(0.7133, device='cuda:0', grad_fn=<NllLossBackward>)



 12%|█▏        | 56/471 [07:02<51:50,  7.49s/it][A

loss: tensor(0.6469, device='cuda:0', grad_fn=<NllLossBackward>)



 12%|█▏        | 57/471 [07:10<51:50,  7.51s/it][A

loss: tensor(0.6440, device='cuda:0', grad_fn=<NllLossBackward>)



 12%|█▏        | 58/471 [07:17<51:47,  7.52s/it][A

loss: tensor(0.6640, device='cuda:0', grad_fn=<NllLossBackward>)



 13%|█▎        | 59/471 [07:25<51:47,  7.54s/it][A

loss: tensor(0.6825, device='cuda:0', grad_fn=<NllLossBackward>)



 13%|█▎        | 60/471 [07:32<51:28,  7.52s/it][A

loss: tensor(0.6107, device='cuda:0', grad_fn=<NllLossBackward>)



 13%|█▎        | 61/471 [07:40<51:27,  7.53s/it][A

loss: tensor(0.6587, device='cuda:0', grad_fn=<NllLossBackward>)



 13%|█▎        | 62/471 [07:47<51:21,  7.53s/it][A

loss: tensor(0.7055, device='cuda:0', grad_fn=<NllLossBackward>)



 13%|█▎        | 63/471 [07:55<51:17,  7.54s/it][A

loss: tensor(0.7376, device='cuda:0', grad_fn=<NllLossBackward>)



 14%|█▎        | 64/471 [08:02<50:58,  7.51s/it][A

loss: tensor(0.6528, device='cuda:0', grad_fn=<NllLossBackward>)



 14%|█▍        | 65/471 [08:10<50:56,  7.53s/it][A

loss: tensor(0.6310, device='cuda:0', grad_fn=<NllLossBackward>)



 14%|█▍        | 66/471 [08:17<50:52,  7.54s/it][A

loss: tensor(0.6622, device='cuda:0', grad_fn=<NllLossBackward>)



 14%|█▍        | 67/471 [08:25<50:46,  7.54s/it][A

loss: tensor(0.5786, device='cuda:0', grad_fn=<NllLossBackward>)



 14%|█▍        | 68/471 [08:32<50:28,  7.51s/it][A

loss: tensor(0.6548, device='cuda:0', grad_fn=<NllLossBackward>)



 15%|█▍        | 69/471 [08:40<50:23,  7.52s/it][A

loss: tensor(0.6133, device='cuda:0', grad_fn=<NllLossBackward>)



 15%|█▍        | 70/471 [08:47<50:20,  7.53s/it][A

loss: tensor(0.7058, device='cuda:0', grad_fn=<NllLossBackward>)



 15%|█▌        | 71/471 [08:55<50:04,  7.51s/it][A

loss: tensor(0.6006, device='cuda:0', grad_fn=<NllLossBackward>)



 15%|█▌        | 72/471 [09:02<49:58,  7.52s/it][A

loss: tensor(0.6883, device='cuda:0', grad_fn=<NllLossBackward>)



 15%|█▌        | 73/471 [09:10<49:52,  7.52s/it][A

loss: tensor(0.6476, device='cuda:0', grad_fn=<NllLossBackward>)



 16%|█▌        | 74/471 [09:17<49:46,  7.52s/it][A

loss: tensor(0.6377, device='cuda:0', grad_fn=<NllLossBackward>)



 16%|█▌        | 75/471 [09:25<49:26,  7.49s/it][A

loss: tensor(0.6455, device='cuda:0', grad_fn=<NllLossBackward>)



 16%|█▌        | 76/471 [09:32<49:21,  7.50s/it][A

loss: tensor(0.6425, device='cuda:0', grad_fn=<NllLossBackward>)



 16%|█▋        | 77/471 [09:40<49:15,  7.50s/it][A

loss: tensor(0.5927, device='cuda:0', grad_fn=<NllLossBackward>)



 17%|█▋        | 78/471 [09:47<49:12,  7.51s/it][A

loss: tensor(0.6860, device='cuda:0', grad_fn=<NllLossBackward>)



 17%|█▋        | 79/471 [09:55<49:00,  7.50s/it][A

loss: tensor(0.6863, device='cuda:0', grad_fn=<NllLossBackward>)



 17%|█▋        | 80/471 [10:03<49:01,  7.52s/it][A

loss: tensor(0.6405, device='cuda:0', grad_fn=<NllLossBackward>)



 17%|█▋        | 81/471 [10:10<48:56,  7.53s/it][A

loss: tensor(0.7350, device='cuda:0', grad_fn=<NllLossBackward>)



 17%|█▋        | 82/471 [10:18<48:50,  7.53s/it][A

loss: tensor(0.6974, device='cuda:0', grad_fn=<NllLossBackward>)



 18%|█▊        | 83/471 [10:25<48:34,  7.51s/it][A

loss: tensor(0.6508, device='cuda:0', grad_fn=<NllLossBackward>)



 18%|█▊        | 84/471 [10:33<48:30,  7.52s/it][A

loss: tensor(0.6531, device='cuda:0', grad_fn=<NllLossBackward>)



 18%|█▊        | 85/471 [10:40<48:30,  7.54s/it][A

loss: tensor(0.7352, device='cuda:0', grad_fn=<NllLossBackward>)



 18%|█▊        | 86/471 [10:48<48:12,  7.51s/it][A

loss: tensor(0.6439, device='cuda:0', grad_fn=<NllLossBackward>)



 18%|█▊        | 87/471 [10:55<48:14,  7.54s/it][A

loss: tensor(0.6933, device='cuda:0', grad_fn=<NllLossBackward>)



 19%|█▊        | 88/471 [11:03<48:09,  7.54s/it][A

loss: tensor(0.5823, device='cuda:0', grad_fn=<NllLossBackward>)



 19%|█▉        | 89/471 [11:10<48:04,  7.55s/it][A

loss: tensor(0.6887, device='cuda:0', grad_fn=<NllLossBackward>)



 19%|█▉        | 90/471 [11:18<47:46,  7.52s/it][A

loss: tensor(0.6694, device='cuda:0', grad_fn=<NllLossBackward>)



 19%|█▉        | 91/471 [11:25<47:42,  7.53s/it][A

loss: tensor(0.6918, device='cuda:0', grad_fn=<NllLossBackward>)



 20%|█▉        | 92/471 [11:33<47:36,  7.54s/it][A

loss: tensor(0.6480, device='cuda:0', grad_fn=<NllLossBackward>)



 20%|█▉        | 93/471 [11:40<47:26,  7.53s/it][A

loss: tensor(0.5992, device='cuda:0', grad_fn=<NllLossBackward>)



 20%|█▉        | 94/471 [11:48<47:05,  7.49s/it][A

loss: tensor(0.5983, device='cuda:0', grad_fn=<NllLossBackward>)



 20%|██        | 95/471 [11:55<46:56,  7.49s/it][A

loss: tensor(0.6472, device='cuda:0', grad_fn=<NllLossBackward>)



 20%|██        | 96/471 [12:03<46:51,  7.50s/it][A

loss: tensor(0.6676, device='cuda:0', grad_fn=<NllLossBackward>)



 21%|██        | 97/471 [12:10<46:35,  7.48s/it][A

loss: tensor(0.6450, device='cuda:0', grad_fn=<NllLossBackward>)



 21%|██        | 98/471 [12:18<46:35,  7.50s/it][A

loss: tensor(0.7375, device='cuda:0', grad_fn=<NllLossBackward>)



 21%|██        | 99/471 [12:25<46:32,  7.51s/it][A

loss: tensor(0.6875, device='cuda:0', grad_fn=<NllLossBackward>)



 21%|██        | 100/471 [12:33<46:35,  7.54s/it][A

loss: tensor(0.6603, device='cuda:0', grad_fn=<NllLossBackward>)



 21%|██▏       | 101/471 [12:40<46:23,  7.52s/it][A

loss: tensor(0.6430, device='cuda:0', grad_fn=<NllLossBackward>)



 22%|██▏       | 102/471 [12:48<46:25,  7.55s/it][A

loss: tensor(0.7669, device='cuda:0', grad_fn=<NllLossBackward>)



 22%|██▏       | 103/471 [12:56<46:21,  7.56s/it][A

loss: tensor(0.6420, device='cuda:0', grad_fn=<NllLossBackward>)



 22%|██▏       | 104/471 [13:03<46:18,  7.57s/it][A

loss: tensor(0.6160, device='cuda:0', grad_fn=<NllLossBackward>)


In [None]:
# saveModelName = "./model-causal-model/model_1_finetuned-{}-epochs-lr_{}.pth".format(epochs, lr) # it should be epoch so that the name shows at what epoch teh mdel ws saved

In [None]:
print(n_trained_epochs)
print(train_avg_loss)
print(val_avg_loss)

In [None]:
plt.plot(range(1, n_trained_epochs+1), train_avg_loss, label="train loss")
plt.plot(range(1, n_trained_epochs+1), val_avg_loss, label="val loss")
plt.title("Training Curve (lr={})".format(lr))
plt.xlabel("epochs")
plt.ylabel("Train Loss")
plt.legend(loc="best")
plt.show()

In [None]:
plt.plot(range(1, n_trained_epochs+1), train_avg_acc, label="train acc")
plt.plot(range(1, n_trained_epochs+1), val_avg_acc, label="val acc")
plt.title("Training Curve (lr={})".format(lr))
plt.xlabel("epochs")
plt.ylabel("Train/val Accuracy")
plt.legend(loc="best")
plt.show()

## Save Model

In [None]:
## saving the model 
#torch.save(model.state_dict(), "finetuned-35-epochs-1e3-lr-with-weighted-loss.pth") # early stopping saves model

### Load the model 
- if train and wants to evaluate: jsut use the `sameModelName`
- if want to laod a specific model - use next cell and write in double quote

In [None]:
model_name = saveModelName

In [None]:
# loading the locally saved model

# device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model = CausalityBERT()
model.load_state_dict(torch.load(model_name))
## Move the model to the GPU 
model.to(device)
model.eval() # gettign in the eval mode 



## evaluation on the test dataset 



In [None]:
#loss_fn = CrossEntropyLoss()
test_loss = []
test_acc = []
test_prec = []
test_rec = []
test_f1 = []


for batch in tqdm(test_loader):
    batch = tuple(batch[t].to(device) for t in batch)      # batch to GPU
    b_input_ids, b_input_mask, b_token_type_ids, b_labels = batch     # unpack inputs from dataloader

    with torch.no_grad():
        model.eval()
        logits = model(**{"input_ids":b_input_ids, "attention_mask":b_input_mask, "token_type_ids":b_token_type_ids}) # forward pass, calculates logit predictions 
    
    # move logits and labels to CPU
    logits = logits.detach().to('cpu').numpy()
    label_ids = b_labels.to('cpu').numpy()

    pred_flat = np.argmax(logits, axis=1).flatten()
    labels_flat = label_ids.flatten()
    
    metrics = compute_metrics(pred_flat, labels_flat, "macro")
    test_acc.append(metrics["accuracy"])
    test_prec.append(metrics["precision"])
    test_rec.append(metrics["recall"])
    test_f1.append(metrics["f1"])

    metrics = compute_metrics(pred_flat, labels_flat, "weighted")
    test_acc_w.append(metrics["accuracy"])
    test_prec_w.append(metrics["precision"])
    test_rec_w.append(metrics["recall"])
    test_f1_w.append(metrics["f1"])

    metrics = compute_metrics(pred_flat, labels_flat, "binary")
    test_acc_b.append(metrics["accuracy"])
    test_prec_b.append(metrics["precision"])
    test_rec_b.append(metrics["recall"])
    test_f1_b.append(metrics["f1"])

print(F'\n\ttest loss: {np.mean(test_loss)}')
print(F'\n\ttest acc macro: {np.mean(test_acc)}')
print(F'\n\ttest prec macro: {np.mean(test_prec)}')
print(F'\n\ttest rec macro: {np.mean(test_rec)}')
print(F'\n\ttest f1 macro: {np.mean(test_f1)}')
print()
print(F'\n\ttest acc weighted: {np.mean(test_acc_w)}')
print(F'\n\ttest prec weighted: {np.mean(test_prec_w)}')
print(F'\n\ttest rec weighted: {np.mean(test_rec_w)}')
print(F'\n\ttest f1 weighted: {np.mean(test_f1_w)}')
print()
print(F'\n\ttest acc binary: {np.mean(test_acc_b)}')
print(F'\n\ttest prec binary: {np.mean(test_prec_b)}')
print(F'\n\ttest rec binary: {np.mean(test_rec_b)}')
print(F'\n\ttest f1 binary: {np.mean(test_f1_b)}')


### Print predictions of last test set batch:

In [None]:
# take last batch of test set:

for i in range(len(batch)):
    tokens = tokenizer.convert_ids_to_tokens(b_input_ids[i])
    print("\nPadded Sentence:")
    print(tokens)
    print("prediction:", pred_flat[i])
    