## A model build using TweetBERT to classify tweet as causal or non-causal 

The causal sentence prediction model will be trained in several steps using an active learning approach, where in each step the training dataset will be augmented.
In each step the causal sentence classifier is trained and applied on a subsample of unlabeled tweets to identify tweets with causal elements. Those tweets are then manually labeled for the two tasks: causal sentence prediction and cause-effect identification (NER). The newly labeled data will be added to the training dataset and the causal sentence classifier will be retrained with the augmented dataset to increase performance

In [1]:
import pandas as pd
import numpy as np
import spacy 
from sklearn.model_selection import train_test_split
from transformers import BertForSequenceClassification, AutoTokenizer
from transformers import AdamW, get_linear_schedule_with_warmup
from tqdm import tqdm, trange
import random
import os
import torch.nn.functional as F
import torch
from torch.nn import CrossEntropyLoss
from torch.utils.data import DataLoader, SubsetRandomSampler
import transformers
from tqdm import tqdm, trange
import io
from utils import normalizeTweet, split_into_sentences, EarlyStopping
import matplotlib.pyplot as plt

########################### Check if cuda available ############################
# print("Cuda available: ", torch.cuda.is_available())
# device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')



########################### DATA FILE ###################################
# dataPath = "/home/adrian/workspace/causality/Causal-associations-diabetes-twitter/data/Causality_tweets_data.xlsx"
dataPath = "data/Causality_tweets_data.xlsx"
#dataPath = "Causality_tweets_data.xlsx"


########################### MODEL PARAMETERS ############################
active_learning_round = 4 # will change the saved model name 
lr = 1e-5  
adam_eps = 1e-8
epochs = 50
num_warmup_steps = 0
early_patience = 10# how long to wait after last time validation loss improved

train_batch_size = 16
val_batch_size = 16
test_batch_size = 16
test_to_train_ratio = 0.1 # 10% test and 90% train
val_to_train_ratio = 0.2

#metrics_average = "macro" # this will give measure for class_1,i.e., causal class
#
# saveModelName = "./model-causal-model/new_model_1_finetuned-{}-epochs-lr_{}.pth".format(epochs, lr) # it should be epoch so that the name shows at what epoch teh mdel ws saved
### naming the model 
active_learning_round = 4
n_trained_epochs = 0
saveModelName = "./model-causal-model/new_model_{}_finetuned-{}-epochs-lr_{}.pth".format(active_learning_round,n_trained_epochs-early_patience, lr) 


#### Checking if thec cuda is available and then select the `gpu`

In [2]:
########################### Check if cuda available ############################
print("Cuda available: ", torch.cuda.is_available())
device = torch.device('cuda:1' if torch.cuda.is_available() else 'cpu')
print("Selected {} for this notebook".format(device))

Cuda available:  True
Selected cuda:1 for this notebook


In [3]:
##### DATA TO LOAD ######

data_round0 = pd.read_excel(dataPath, sheet_name="round0")
data_round0 = data_round0[data_round0["Causal association"].notnull()] # some tweets at the end are not labeled yet
data_round0 = data_round0[["full_text", "Intent", "Cause", "Effect", "Causal association"]]
print("Data round 0 (tweets!):")
print(data_round0["Causal association"].value_counts())
print("-----"*5)


##### additional data labeled through active learning strategy - round 1 ########
data_round1 = pd.read_excel(dataPath, sheet_name="round1")
data_round1 = data_round1[data_round1["Causal association"].notnull()]
data_round1 = data_round1[["sentence", "Intent", "Cause", "Effect", "Causal association"]]
data_round1.rename(columns ={"sentence":"full_text"}, inplace=True) # rename for merge
print("Sentences round 1:")
print(data_round1["Causal association"].value_counts())
print("-----"*5)

##### additional data labeled through active learning strategy - round 2 ########
data_round2 = pd.read_excel(dataPath, sheet_name="round2")
data_round2 = data_round2[data_round2["Causal association"].notnull()]
data_round2 = data_round2[["sentence", "Intent", "Cause", "Effect", "Causal association"]]
data_round2.rename(columns ={"sentence":"full_text"}, inplace=True) # rename for merge
print("sentences round 2:")
print(data_round2["Causal association"].value_counts())
print("-----"*5)

##### additional data labeled through active learning strategy - round 3 ########
data_round3 = pd.read_excel(dataPath, sheet_name="round3")
data_round3 = data_round3[data_round3["Causal association"].notnull()]
data_round3 = data_round3[["sentence", "Intent", "Cause", "Effect", "Causal association"]]
data_round3.rename(columns ={"sentence":"full_text"}, inplace=True) # rename for merge
print("sentences round 3:")
print(data_round3["Causal association"].value_counts())
print("-----"*5)

##### new additional data labeled through active learning strategy - round 4 (model is only retrained with this data) #####################
data_round4 = pd.read_excel(dataPath, sheet_name="round4")
data_round4 = data_round4[data_round4["Causal association"].notnull()]
data_round4 = data_round4[["sentence", "Intent", "Cause", "Effect", "Causal association"]]
data_round4.rename(columns ={"sentence":"full_text"}, inplace=True) # rename for merge
print("sentences round 4:")
print(data_round4["Causal association"].value_counts())

#### merge datasets ######
data = data_round0.append(data_round1).append(data_round2).append(data_round3).append(data_round4)
print("\nAfter merge old data:")
print(data["Causal association"].value_counts())
data.head()
print("-----"*5)


print("----"*5)
print("\n 0:non causal tweet \n 1: causal tweet.\n \n each tweet may have more than one sentence and we are splitting them and labelling by checking if cause or effect occur in them or not")

Data round 0 (tweets!):
0.0    3710
1.0    1290
Name: Causal association, dtype: int64
-------------------------
Sentences round 1:
0.0    1763
1.0     429
Name: Causal association, dtype: int64
-------------------------
sentences round 2:
0    1658
1     150
Name: Causal association, dtype: int64
-------------------------
sentences round 3:
0    1886
1     215
Name: Causal association, dtype: int64
-------------------------
sentences round 4:
0    1895
1     313
Name: Causal association, dtype: int64

After merge old data:
0.0    10912
1.0     2397
Name: Causal association, dtype: int64
-------------------------
--------------------

 0:non causal tweet 
 1: causal tweet.
 
 each tweet may have more than one sentence and we are splitting them and labelling by checking if cause or effect occur in them or not


### Preprocessing

In [4]:
def get_start_end_index_of_sentence_in_tweet(tweet, sentence):
    """ 
    The sentence tokens are included in the tweet tokens.
    Return the start end end indices of the sentence tokens in the tweet tokens

    """

    sentence_start_word = sentence[0]
    start_indices = [i for i, x in enumerate(tweet) if x == sentence_start_word] # find all indices of the start word of the sentence 
    try:
        for start_index in start_indices:
            isTrueStartIndex = all([tweet[start_index+i] == sentence[i] for i in range(len(sentence))])
            #print("start_index:", start_index, "isTrueStartIndex:", isTrueStartIndex)
            if isTrueStartIndex:
                return start_index, start_index + len(sentence) 
    except:
        print("ERROR: StartIndex should have been found for sentence:")
        print("tweet:")
        print(tweet)
        print("sentence:")
        print(sentence)
    return -1, -2 # should not be returned


def split_tweets_to_sentences(data):
    """ 
        Splits tweets into sentences and associates the appropriate intent, causes, effects and causal association
        to each sentence.
        
        Parameters:
        - min_words_in_sentences: Minimal number of words in a sentence such that the sentence is kept. 
                                  Assumption: A sentence with too few words does not have enough information
                              
                              
                              
        Ex.:
        full_text                              | Intent | Cause | Effect | Causal association | ...
        --------------------------------------------------------------------------------------------
        what? type 1 causes insulin dependence | q;msS  | type 1|insulin dependence | 1       | ...  
        
        New dataframe returned: 
        full_text                              | Intent | Cause | Effect | Causal association | ...
        --------------------------------------------------------------------------------------------
        what?                                  |   q    |       |        |       0            | ...
        type 1 causes insulin dependence       |        | type 1| insulin dependence | 1       | ...  
    """

    newDF = pd.DataFrame(columns=["sentence", "Intent", "Cause", "Effect", "Causal association", "tokenized"])
    
    for i,row in data.iterrows():
        causes = row["Cause"]
        effects = row["Effect"]
        sentences = split_into_sentences(normalizeTweet(row["full_text"]))
        
        # single sentence in tweet
        if len(sentences) == 1:
            singleSentenceIntent = ""
            if isinstance(row["Intent"], str):
                if len(row["Intent"].split(";")) > 1:
                    singleSentenceIntent = row["Intent"].strip().replace(";msS", "").replace("msS;", "").replace(";mS", "").replace("mS;", "")
                else:
                    if row["Intent"] == "mS" or row["Intent"] == "msS":
                        singleSentenceIntent = ""
                    else:
                        singleSentenceIntent = row["Intent"].strip()
                    
            newDF=newDF.append(pd.Series({"sentence": sentences[0] # only one sentence
                         , "Intent": singleSentenceIntent
                         , "Cause" : row["Cause"]
                         , "Effect": row["Effect"]
                         , "Causal association" : row["Causal association"]
                         , "tokenized": row["tokenized"]}), ignore_index=True)
        
        # tweet has several sentences
        else: 
            intents = str(row["Intent"]).strip().split(";")
            for sentence in sentences:
                sent_tokenized = sentence.split(" ")
                causeInSentence = np.nan if not isinstance(causes, str) or not any([cause in sentence for cause in causes.split(";")]) else ";".join([cause for cause in causes.split(";") if cause in sentence])
                effectInSentence = np.nan if not isinstance(effects, str) or not any([effect in sentence for effect in effects.split(";")]) else ";".join([effect for effect in effects.split(";") if effect in sentence])
                causalAssociationInSentence = 1 if isinstance(causeInSentence, str) and isinstance(effectInSentence, str) else 0
                startIndex, endIndex = get_start_end_index_of_sentence_in_tweet(row["tokenized"], sent_tokenized)
                sentence_tokenized = row["tokenized"][startIndex:endIndex]
                
                if "q" in intents and sentence[-1] == "?": # if current sentence is question
                    newDF=newDF.append(pd.Series({"sentence": sentence, "Intent": "q", "Cause" : causeInSentence
                                                , "Effect": effectInSentence, "Causal association" : causalAssociationInSentence
                                                , "tokenized": sentence_tokenized}), ignore_index=True)                    
                elif "joke" in intents: # all sentences with "joke" in tweet keep the intent "joke"
                    newDF=newDF.append(pd.Series({"sentence": sentence, "Intent": "joke", "Cause" : causeInSentence
                                                , "Effect": effectInSentence, "Causal association" : causalAssociationInSentence
                                                , "tokenized": sentence_tokenized}), ignore_index=True)   
                elif "neg" in intents: # all sentences with "neg" in tweet keep intent "neg"
                    newDF=newDF.append(pd.Series({"sentence": sentence, "Intent": "neg", "Cause" : causeInSentence
                                                , "Effect": effectInSentence, "Causal association" : causalAssociationInSentence
                                                , "tokenized": sentence_tokenized}), ignore_index=True)               
                elif isinstance(causeInSentence, str) and isinstance(effectInSentence, str): # cause effect sentence
                    causalIntent = ""
                    if len(causeInSentence.split(";")) > 1:
                        causalIntent = "mC"
                        if len(effectInSentence.split(";")) > 1:
                            causalIntent = "mC;mE"
                    elif len(effectInSentence.split(";")) > 1:
                        causalIntent = "mE"
                    newDF=newDF.append(pd.Series({"sentence": sentence, "Intent": causalIntent, "Cause" : causeInSentence
                                                , "Effect": effectInSentence, "Causal association" : causalAssociationInSentence
                                                , "tokenized": sentence_tokenized}), ignore_index=True)                                  
                else:
                    nonCausalIntent = ""
                    if isinstance(causeInSentence, str): # only cause is given
                        if len(causeInSentence.split(";")) > 1:
                            nonCausalIntent = "mC"
                    elif isinstance(effectInSentence, str): # only effect is given
                        if len(effectInSentence.split(";")) > 1:
                            nonCausalIntent = "mE"
                    newDF=newDF.append(pd.Series({"sentence": sentence, "Intent": nonCausalIntent, "Cause" : causeInSentence
                                                , "Effect": effectInSentence, "Causal association" : causalAssociationInSentence
                                                , "tokenized": sentence_tokenized}), ignore_index=True)

    return newDF

In [5]:
### Split tweets into sentences (train classifier on sentence level) ####

print("Count of tweets:", data.shape[0])

data["tokenized"] = data["full_text"].map(lambda tweet: normalizeTweet(tweet).split(" "))
dataSentences = split_tweets_to_sentences(data)
print("Count of sentences:", dataSentences.shape[0])

dataSentences.head()

Count of tweets: 13309
Count of sentences: 20065


Unnamed: 0,sentence,Intent,Cause,Effect,Causal association,tokenized
0,"tonight , I learned my older girl will back he...",,,,0,"[tonight, ,, I, learned, my, older, girl, will..."
1,Fiercely .,,,,0,"[Fiercely, .]"
2,#impressive #bigsister #type1 #type1times2,,,,0,"[#impressive, #bigsister, #type1, #type1times2]"
3,USER USER I knew diabetes and fibromyalgia wer...,joke,,,0,"[USER, USER, I, knew, diabetes, and, fibromyal..."
4,:face_with_rolling_eyes:,joke,,,0,[:face_with_rolling_eyes:]


In [6]:
########## Remove sentences with joke, question, negation and keep only sentences with more than 3 tokens #####

print("Count of sentences before filtering: ", dataSentences.shape[0])
dataSentFiltered = dataSentences[~dataSentences["Intent"].str.contains("neg|joke|q")] 
dataSentFiltered = dataSentFiltered[dataSentFiltered["tokenized"].map(len) > 3] 
print("Count of sentences after filtering: ", dataSentFiltered.shape[0])
print("Distribution:")
print("\n")
print(dataSentFiltered["Causal association"].value_counts())
dataSentFiltered.head()



Count of sentences before filtering:  20065
Count of sentences after filtering:  16475
Distribution:


0.0    14364
1.0     2111
Name: Causal association, dtype: int64


Unnamed: 0,sentence,Intent,Cause,Effect,Causal association,tokenized
0,"tonight , I learned my older girl will back he...",,,,0,"[tonight, ,, I, learned, my, older, girl, will..."
2,#impressive #bigsister #type1 #type1times2,,,,0,"[#impressive, #bigsister, #type1, #type1times2]"
5,:down_arrow: :down_arrow: :down_arrow: THIS :d...,,,,0,"[:down_arrow:, :down_arrow:, :down_arrow:, THI..."
6,I 'm a trans woman .,,,,0,"[I, 'm, a, trans, woman, .]"
7,"Both of us could use a world where "" brave and...",,,,0,"[Both, of, us, could, use, a, world, where, "",..."


### Data split and calculate class weight

In [7]:
####################### Stratified splits ####################


## ONLY FOR TESTING ---------------
dataSentFiltered = dataSentFiltered#[0:500] # for testing

text = dataSentFiltered["sentence"].map(normalizeTweet).values.tolist()
labels = dataSentFiltered["Causal association"].values.tolist()

# first split the data into training and testing label in the ratio of 90:10
train_texts, test_texts, train_labels, test_labels = train_test_split(text, labels, test_size=test_to_train_ratio, stratify=labels, random_state=99)
train_texts, val_texts, train_labels, val_labels = train_test_split(train_texts, train_labels, test_size=val_to_train_ratio, stratify=train_labels, random_state=99)

# Redefine training set:Take only new labeled tweets from round 3 for training; test and val set come from old data


data_count_info = pd.Series(labels).value_counts(normalize=True)
train_count_info = pd.Series(train_labels).value_counts(normalize=True)
val_count_info = pd.Series(val_labels).value_counts(normalize=True)
test_count_info = pd.Series(test_labels).value_counts(normalize=True)

# for class-imbalanced dataset, the class weight for a ith class
# to be specified for balancing in the loss function is given by:
# weight[i] = num_samples / (num_classes * num_samples[i])
# since train_count_info obtained above has fraction of samples
# for ith class, hence the corresponding weight calculation is:
class_weight = (1/train_count_info)/len(train_count_info)

print("All: Count = {}, % of 0 = {}, % of 1 = {}".format(len(labels), *data_count_info.round(4).to_list()))
print("\n")
print("Train: Count = {}, % of 0 = {}, % of 1 = {}".format(len(train_labels), *train_count_info.round(4).to_list()))
print("\n")
print("Val: Count = {}, % of 0 = {}, % of 1 = {}".format(len(val_labels), *val_count_info.round(4).to_list()))
print("\n")
print("Test: Count = {}, % of 0 = {}, % of 1 = {}".format(len(test_labels), *test_count_info.round(4).to_list()))
print("\n")
print("Balancing class wts: for 0 = {}, for 1 = {}".format(*class_weight.round(4).to_list()))
print("\n")


All: Count = 16475, % of 0 = 0.8719, % of 1 = 0.1281


Train: Count = 11861, % of 0 = 0.8718, % of 1 = 0.1282


Val: Count = 2966, % of 0 = 0.8719, % of 1 = 0.1281


Test: Count = 1648, % of 0 = 0.872, % of 1 = 0.128


Balancing class wts: for 0 = 0.5735, for 1 = 3.9016




#### Defining our DataLoader 

In [8]:
class TweetDataSet(torch.utils.data.Dataset):
    def __init__(self, text, labels, tokenizer):
        self.text = text
        self.labels = labels
        self.tokenizer = tokenizer

    def __getitem__(self, idx):
        inputs = self.tokenizer(self.text, padding=True, truncation=True, return_token_type_ids=True)
        ids = inputs["input_ids"]
        mask = inputs["attention_mask"]
        token_type_ids = inputs["token_type_ids"]
        return {
                "input_ids" : torch.tensor(ids[idx], dtype=torch.long)
              , "attention_mask" : torch.tensor(mask[idx], dtype=torch.long)
              , "token_type_ids" : torch.tensor(token_type_ids[idx], dtype=torch.long)
              , "labels" : torch.tensor(self.labels[idx], dtype=torch.long)
        }      

    def __len__(self):
        return len(self.labels)

    
tokenizer = AutoTokenizer.from_pretrained("vinai/bertweet-base")

train_dataset = TweetDataSet(train_texts, train_labels, tokenizer)
val_dataset = TweetDataSet(val_texts, val_labels, tokenizer)
test_dataset = TweetDataSet(test_texts, test_labels, tokenizer)
print(len(train_dataset))
print(len(val_dataset))
print(len(test_dataset))

# During training: In each epoch one part of the training data will be used as validation set
train_loader = DataLoader(train_dataset, batch_size=train_batch_size, shuffle=True)
validation_loader = DataLoader(val_dataset, batch_size=val_batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=test_batch_size, shuffle=True)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


11861
2966
1648


### Evaluation Metrics

In [9]:
## we are measuring weighted metrics - as our dataset is imbalanced 
# Calculate metrics for each label, and find their average weighted by support
# (the number of true instances for each label). 
# This alters ‘macro’ to account for label imbalance; 
# it can result in an F-score that is not between precision and recall.


from sklearn.metrics import accuracy_score, precision_recall_fscore_support, matthews_corrcoef

def compute_metrics(pred, labels, average="macro"):
#     precision, recall, f1, _ = precision_recall_fscore_support(labels, pred, average='weighted')
    precision, recall, f1, _ = precision_recall_fscore_support(labels,pred, average=average)
    acc = accuracy_score(labels, pred)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }



### Model defintion

In [10]:


class CausalityBERT(torch.nn.Module):
    """ Model Bert"""
    def __init__(self):
        super(CausalityBERT, self).__init__()
        self.num_labels = 2
        self.bert = transformers.BertModel.from_pretrained("vinai/bertweet-base")
        self.dropout = torch.nn.Dropout(0.3)
        self.linear1 = torch.nn.Linear(768, 256)
        self.linear2 = torch.nn.Linear(256, self.num_labels)
        
    def forward(self, input_ids, attention_mask, token_type_ids):
        _, output_1 = self.bert(input_ids, attention_mask = attention_mask, token_type_ids=token_type_ids, return_dict=False) # if output 1 is our cls token        
        output_2 = self.dropout(output_1)
        output_3 = self.linear1(output_2)  
        output_4 = self.dropout(output_3)
        output_5 = self.linear2(output_4)
        return output_5

### Moving the model to  GPU and defining training parameters: 
    * num_training_steps 
    * optimizers 
    * scheduler 
    * loss function (weighted) 

In [11]:
# device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model = CausalityBERT() ## just load the model trained in previous round here 
#model.load_state_dict(torch.load(finetuned_model, map_location='cpu')) # load model trained in previous round
model.to(device)

# fine-tune only the task-specific parameters
for param in model.bert.parameters():
    param.requires_grad = False

num_training_steps = np.ceil(len(train_dataset)/train_batch_size)*epochs
optim = AdamW(model.parameters(), lr=lr, eps=adam_eps)
scheduler = get_linear_schedule_with_warmup(optim, num_warmup_steps=num_warmup_steps, num_training_steps=num_training_steps) # scheduler with a linearly decreasing learning rate from the initial lr set in the optimizer to 0; after a warmup period durnig which it increases linearly from 0 to the initial lr set in the optimizer

## penalising more for class with less number of exaplmes 
loss_fn = CrossEntropyLoss(torch.tensor(class_weight.to_list()).to(device))

You are using a model of type roberta to instantiate a model of type bert. This is not supported for all configurations of models and can yield errors.
Some weights of the model checkpoint at vinai/bertweet-base were not used when initializing BertModel: ['roberta.encoder.layer.8.attention.self.query.bias', 'roberta.encoder.layer.6.output.LayerNorm.weight', 'roberta.encoder.layer.2.attention.self.value.bias', 'roberta.encoder.layer.2.output.LayerNorm.bias', 'roberta.encoder.layer.0.attention.self.query.weight', 'roberta.encoder.layer.10.attention.self.query.weight', 'roberta.encoder.layer.10.attention.self.value.bias', 'roberta.encoder.layer.11.output.LayerNorm.weight', 'roberta.encoder.layer.0.attention.self.value.bias', 'roberta.encoder.layer.7.output.LayerNorm.bias', 'roberta.encoder.layer.1.attention.self.key.bias', 'roberta.encoder.layer.3.attention.output.LayerNorm.weight', 'roberta.encoder.layer.5.attention.self.query.weight', 'roberta.encoder.layer.3.attention.self.query.bias',

Some weights of BertModel were not initialized from the model checkpoint at vinai/bertweet-base and are newly initialized: ['encoder.layer.1.output.dense.bias', 'encoder.layer.4.attention.output.dense.weight', 'encoder.layer.9.output.dense.weight', 'encoder.layer.4.attention.output.dense.bias', 'encoder.layer.7.attention.self.key.weight', 'encoder.layer.11.attention.self.value.bias', 'encoder.layer.5.attention.self.key.weight', 'encoder.layer.11.intermediate.dense.weight', 'encoder.layer.3.attention.self.key.weight', 'encoder.layer.8.attention.output.LayerNorm.bias', 'encoder.layer.10.attention.self.key.weight', 'encoder.layer.5.attention.output.dense.bias', 'encoder.layer.5.output.dense.bias', 'encoder.layer.11.attention.self.query.bias', 'encoder.layer.8.intermediate.dense.bias', 'encoder.layer.5.output.LayerNorm.bias', 'encoder.layer.4.attention.self.value.weight', 'pooler.dense.bias', 'encoder.layer.5.intermediate.dense.bias', 'embeddings.word_embeddings.weight', 'encoder.layer.10.

### Training (fine-tuning) with Validation and early stopping 

In [None]:
############ TRAINING #############

# initialise the early_stopping object
# early_stopping = EarlyStopping(patience=early_patience, path=saveModelName, verbose=True)
early_stopping = EarlyStopping(patience=early_patience,verbose=True)


train_avg_loss = [] # avg training loss per epoch
val_avg_loss = [] # avg validation loss per epoch
train_avg_acc = [] # avg training accuracy per epoch
val_avg_acc = [] # avg val accuracy per epoch
n_trained_epochs = 0


for epoch in trange(1, epochs+1, desc='Epoch'):
    print("<" + "="*22 + F" Epoch {epoch} "+ "="*22 + ">")
    
    
    ########### training eval metrics #############################
    train_accuracy = []
    train_loss = []
    train_acc = []
    train_prec = []
    train_rec = []
    train_f1 = []
    
    ###################################################
    
    for batch in tqdm(train_loader):
        optim.zero_grad() # gradients get accumulated by default -> clear previous accumulated gradients
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        token_type_ids = batch["token_type_ids"].to(device)
        labels = batch['labels'].to(device)
        ###########################################################################
        model.train()
        logits = model(**{"input_ids":input_ids, "attention_mask":attention_mask, "token_type_ids":token_type_ids}) # forward pass
        #############################################################################
        loss = loss_fn(logits, labels)
        print("loss:", loss)
        loss.backward() # backward pass
        optim.step()    # update parameters and take a step up using the computed gradient
        scheduler.step()# update learning rate scheduler
        train_loss.append(loss.item())
        
    
        ############# Training Accuracy Measure ###################################

        # move logits and labels to CPU
        logits = logits.detach().to('cpu').numpy()
        label_ids = labels.to('cpu').numpy()

        pred_flat = np.argmax(logits, axis=1).flatten()
        labels_flat = label_ids.flatten()

        metrics = compute_metrics(pred_flat, labels_flat)        
        
        train_acc.append(metrics["accuracy"])
        train_prec.append(metrics["precision"])
        train_rec.append(metrics["recall"])
        train_f1.append(metrics["f1"])
        
    train_avg_loss.append(np.mean(train_loss))
    train_avg_acc.append(np.mean(train_acc))
    print(F'\n\tTrain loss: {np.mean(train_loss)}')
    print(F'\n\ttrain acc: {np.mean(train_acc)}')
    print(F'\n\ttraining prec: {np.mean(train_prec)}')
    print(F'\n\ttraining rec: {np.mean(train_rec)}')
    print(F'\n\ttraining f1: {np.mean(train_f1)}')
    
    n_trained_epochs += 1
    
    ###################################################################################

    
    ## ---- Validation ------
    val_accuracy = []
    val_loss = []
    val_acc = []
    val_prec = []
    val_rec = []
    val_f1 = []
    
    
    # Evaluate data for one epoch
    for batch in tqdm(validation_loader):
        batch = tuple(batch[t].to(device) for t in batch)      # batch to GPU
        b_input_ids, b_input_mask, b_token_type_ids, b_labels = batch     # unpack inputs from dataloader
        
        with torch.no_grad(): # tell model not to compute or store gradients -> saves memory + speeds up validation
            ##################################################################################
            model.eval()
            logits = model(**{"input_ids":b_input_ids, "attention_mask":b_input_mask, "token_type_ids":b_token_type_ids}) # forward pass, calculates logit predictions 

                        
        loss = loss_fn(logits, b_labels)
        val_loss.append(loss.item())
        
        # move logits and labels to CPU
        logits = logits.detach().to('cpu').numpy()
        label_ids = b_labels.to('cpu').numpy()
        
        pred_flat = np.argmax(logits, axis=1).flatten()
        labels_flat = label_ids.flatten()
        
        metrics = compute_metrics(pred_flat, labels_flat)
        val_acc.append(metrics["accuracy"])
        val_prec.append(metrics["precision"])
        val_rec.append(metrics["recall"])
        val_f1.append(metrics["f1"])

    val_avg_loss.append(np.mean(val_loss))
    val_avg_acc.append(np.mean(val_acc))
    print(F'\n\tValidation loss: {np.mean(val_loss)}')
    print(F'\n\tValidation acc: {np.mean(val_acc)}')
    print(F'\n\tValidation prec: {np.mean(val_prec)}')
    print(F'\n\tValidation rec: {np.mean(val_rec)}')
    print(F'\n\tValidation f1: {np.mean(val_f1)}')

    # early_stopping needs the validation loss to check if it has decreased,
    # and if it has, it will make a checkpoint of the current model
    saveModelName = "./model-causal-model/new_model_{}_finetuned-{}-epochs-lr_{}.pth".format(active_learning_round,n_trained_epochs-early_patience, lr) 
    early_stopping.path = saveModelName
    early_stopping(np.average(val_loss), model)
    
    if early_stopping.early_stop:
        print("Early stopping")
        break

Epoch:   0%|          | 0/50 [00:00<?, ?it/s]




  , "labels" : torch.tensor(self.labels[idx], dtype=torch.long)
  _warn_prf(average, modifier, msg_start, len(result))

  0%|          | 1/742 [00:12<2:35:17, 12.57s/it][A

loss: tensor(0.7280, device='cuda:1', grad_fn=<NllLossBackward>)



  0%|          | 2/742 [00:24<2:29:24, 12.11s/it][A

loss: tensor(0.6180, device='cuda:1', grad_fn=<NllLossBackward>)



  0%|          | 3/742 [00:36<2:28:11, 12.03s/it][A

loss: tensor(0.7027, device='cuda:1', grad_fn=<NllLossBackward>)



  1%|          | 4/742 [00:48<2:27:01, 11.95s/it][A

loss: tensor(0.7618, device='cuda:1', grad_fn=<NllLossBackward>)



  1%|          | 5/742 [01:00<2:26:52, 11.96s/it][A

loss: tensor(0.7064, device='cuda:1', grad_fn=<NllLossBackward>)



  1%|          | 6/742 [01:12<2:26:40, 11.96s/it][A

loss: tensor(0.6737, device='cuda:1', grad_fn=<NllLossBackward>)



  1%|          | 7/742 [01:24<2:26:45, 11.98s/it][A

loss: tensor(0.6380, device='cuda:1', grad_fn=<NllLossBackward>)



  1%|          | 8/742 [01:36<2:26:23, 11.97s/it][A

loss: tensor(0.6865, device='cuda:1', grad_fn=<NllLossBackward>)



  1%|          | 9/742 [01:47<2:26:12, 11.97s/it][A

loss: tensor(0.7018, device='cuda:1', grad_fn=<NllLossBackward>)



  1%|▏         | 10/742 [02:00<2:26:10, 11.98s/it][A

loss: tensor(0.6871, device='cuda:1', grad_fn=<NllLossBackward>)



  1%|▏         | 11/742 [02:12<2:26:07, 11.99s/it][A

loss: tensor(0.6617, device='cuda:1', grad_fn=<NllLossBackward>)



  2%|▏         | 12/742 [02:24<2:26:01, 12.00s/it][A

loss: tensor(0.7937, device='cuda:1', grad_fn=<NllLossBackward>)



  2%|▏         | 13/742 [02:35<2:25:24, 11.97s/it][A

loss: tensor(0.6528, device='cuda:1', grad_fn=<NllLossBackward>)



  2%|▏         | 14/742 [02:47<2:25:10, 11.97s/it][A

loss: tensor(0.6795, device='cuda:1', grad_fn=<NllLossBackward>)



  2%|▏         | 15/742 [03:00<2:25:30, 12.01s/it][A

loss: tensor(0.6636, device='cuda:1', grad_fn=<NllLossBackward>)



  2%|▏         | 16/742 [03:12<2:25:58, 12.06s/it][A

loss: tensor(0.7728, device='cuda:1', grad_fn=<NllLossBackward>)



  2%|▏         | 17/742 [03:24<2:25:15, 12.02s/it][A

loss: tensor(0.7115, device='cuda:1', grad_fn=<NllLossBackward>)



  2%|▏         | 18/742 [03:36<2:25:01, 12.02s/it][A

loss: tensor(0.6850, device='cuda:1', grad_fn=<NllLossBackward>)



  3%|▎         | 19/742 [03:48<2:24:48, 12.02s/it][A

loss: tensor(0.7126, device='cuda:1', grad_fn=<NllLossBackward>)



  3%|▎         | 20/742 [04:00<2:24:38, 12.02s/it][A

loss: tensor(0.7380, device='cuda:1', grad_fn=<NllLossBackward>)



  3%|▎         | 21/742 [04:12<2:24:08, 12.00s/it][A

loss: tensor(0.7530, device='cuda:1', grad_fn=<NllLossBackward>)



  3%|▎         | 22/742 [04:24<2:23:36, 11.97s/it][A

loss: tensor(0.7350, device='cuda:1', grad_fn=<NllLossBackward>)



  3%|▎         | 23/742 [04:36<2:23:59, 12.02s/it][A

loss: tensor(0.6546, device='cuda:1', grad_fn=<NllLossBackward>)



  3%|▎         | 24/742 [04:48<2:24:00, 12.03s/it][A

loss: tensor(0.7391, device='cuda:1', grad_fn=<NllLossBackward>)



  3%|▎         | 25/742 [05:00<2:23:12, 11.98s/it][A

loss: tensor(0.6780, device='cuda:1', grad_fn=<NllLossBackward>)



  4%|▎         | 26/742 [05:11<2:22:17, 11.92s/it][A

loss: tensor(0.7376, device='cuda:1', grad_fn=<NllLossBackward>)



  4%|▎         | 27/742 [05:23<2:22:05, 11.92s/it][A

loss: tensor(0.6483, device='cuda:1', grad_fn=<NllLossBackward>)



  4%|▍         | 28/742 [05:35<2:22:14, 11.95s/it][A

loss: tensor(0.6630, device='cuda:1', grad_fn=<NllLossBackward>)



  4%|▍         | 29/742 [05:47<2:22:04, 11.96s/it][A

loss: tensor(0.6953, device='cuda:1', grad_fn=<NllLossBackward>)



  4%|▍         | 30/742 [05:59<2:22:05, 11.97s/it][A

loss: tensor(0.7070, device='cuda:1', grad_fn=<NllLossBackward>)



  4%|▍         | 31/742 [06:11<2:22:14, 12.00s/it][A

loss: tensor(0.6715, device='cuda:1', grad_fn=<NllLossBackward>)



  4%|▍         | 32/742 [06:24<2:22:42, 12.06s/it][A

loss: tensor(0.6464, device='cuda:1', grad_fn=<NllLossBackward>)



  4%|▍         | 33/742 [06:36<2:22:41, 12.08s/it][A

loss: tensor(0.6489, device='cuda:1', grad_fn=<NllLossBackward>)



  5%|▍         | 34/742 [06:48<2:22:49, 12.10s/it][A

loss: tensor(0.6213, device='cuda:1', grad_fn=<NllLossBackward>)



  5%|▍         | 35/742 [07:00<2:22:29, 12.09s/it][A

loss: tensor(0.6359, device='cuda:1', grad_fn=<NllLossBackward>)



  5%|▍         | 36/742 [07:12<2:21:50, 12.06s/it][A

loss: tensor(0.6281, device='cuda:1', grad_fn=<NllLossBackward>)



  5%|▍         | 37/742 [07:24<2:21:14, 12.02s/it][A

loss: tensor(0.6295, device='cuda:1', grad_fn=<NllLossBackward>)



  5%|▌         | 38/742 [07:36<2:20:36, 11.98s/it][A

loss: tensor(0.7368, device='cuda:1', grad_fn=<NllLossBackward>)



  5%|▌         | 39/742 [07:48<2:19:47, 11.93s/it][A

loss: tensor(0.7524, device='cuda:1', grad_fn=<NllLossBackward>)



  5%|▌         | 40/742 [07:59<2:19:29, 11.92s/it][A

loss: tensor(0.7138, device='cuda:1', grad_fn=<NllLossBackward>)



  6%|▌         | 41/742 [08:11<2:19:06, 11.91s/it][A

loss: tensor(0.6362, device='cuda:1', grad_fn=<NllLossBackward>)



  6%|▌         | 42/742 [08:23<2:19:33, 11.96s/it][A

loss: tensor(0.7030, device='cuda:1', grad_fn=<NllLossBackward>)



  6%|▌         | 43/742 [08:35<2:19:16, 11.95s/it][A

loss: tensor(0.5385, device='cuda:1', grad_fn=<NllLossBackward>)



  6%|▌         | 44/742 [08:47<2:19:08, 11.96s/it][A

loss: tensor(0.6649, device='cuda:1', grad_fn=<NllLossBackward>)



  6%|▌         | 45/742 [08:59<2:19:12, 11.98s/it][A

loss: tensor(0.6823, device='cuda:1', grad_fn=<NllLossBackward>)



  6%|▌         | 46/742 [09:11<2:19:10, 12.00s/it][A

loss: tensor(0.7694, device='cuda:1', grad_fn=<NllLossBackward>)



  6%|▋         | 47/742 [09:23<2:18:55, 11.99s/it][A

loss: tensor(0.6590, device='cuda:1', grad_fn=<NllLossBackward>)



  6%|▋         | 48/742 [09:35<2:17:58, 11.93s/it][A

loss: tensor(0.6314, device='cuda:1', grad_fn=<NllLossBackward>)



  7%|▋         | 49/742 [09:47<2:17:53, 11.94s/it][A

loss: tensor(0.6951, device='cuda:1', grad_fn=<NllLossBackward>)



  7%|▋         | 50/742 [09:59<2:17:40, 11.94s/it][A

loss: tensor(0.6864, device='cuda:1', grad_fn=<NllLossBackward>)



  7%|▋         | 51/742 [10:11<2:17:23, 11.93s/it][A

loss: tensor(0.7067, device='cuda:1', grad_fn=<NllLossBackward>)



  7%|▋         | 52/742 [10:23<2:16:57, 11.91s/it][A

loss: tensor(0.6624, device='cuda:1', grad_fn=<NllLossBackward>)



  7%|▋         | 53/742 [10:35<2:16:46, 11.91s/it][A

loss: tensor(0.7602, device='cuda:1', grad_fn=<NllLossBackward>)



  7%|▋         | 54/742 [10:47<2:16:32, 11.91s/it][A

loss: tensor(0.6570, device='cuda:1', grad_fn=<NllLossBackward>)



  7%|▋         | 55/742 [10:59<2:16:39, 11.94s/it][A

loss: tensor(0.6347, device='cuda:1', grad_fn=<NllLossBackward>)



  8%|▊         | 56/742 [11:11<2:16:47, 11.96s/it][A

loss: tensor(0.6627, device='cuda:1', grad_fn=<NllLossBackward>)



  8%|▊         | 57/742 [11:23<2:16:31, 11.96s/it][A

loss: tensor(0.6540, device='cuda:1', grad_fn=<NllLossBackward>)



  8%|▊         | 58/742 [11:35<2:16:33, 11.98s/it][A

loss: tensor(0.6484, device='cuda:1', grad_fn=<NllLossBackward>)



  8%|▊         | 59/742 [11:47<2:16:59, 12.03s/it][A

loss: tensor(0.5875, device='cuda:1', grad_fn=<NllLossBackward>)



  8%|▊         | 60/742 [11:59<2:16:49, 12.04s/it][A

loss: tensor(0.6573, device='cuda:1', grad_fn=<NllLossBackward>)



  8%|▊         | 61/742 [12:11<2:16:04, 11.99s/it][A

loss: tensor(0.6716, device='cuda:1', grad_fn=<NllLossBackward>)



  8%|▊         | 62/742 [12:23<2:15:40, 11.97s/it][A

loss: tensor(0.7032, device='cuda:1', grad_fn=<NllLossBackward>)



  8%|▊         | 63/742 [12:35<2:15:16, 11.95s/it][A

loss: tensor(0.6365, device='cuda:1', grad_fn=<NllLossBackward>)



  9%|▊         | 64/742 [12:47<2:15:23, 11.98s/it][A

loss: tensor(0.6929, device='cuda:1', grad_fn=<NllLossBackward>)



  9%|▉         | 65/742 [12:59<2:15:25, 12.00s/it][A

loss: tensor(0.6605, device='cuda:1', grad_fn=<NllLossBackward>)



  9%|▉         | 66/742 [13:10<2:14:34, 11.94s/it][A

loss: tensor(0.6764, device='cuda:1', grad_fn=<NllLossBackward>)



  9%|▉         | 67/742 [13:23<2:14:58, 12.00s/it][A

loss: tensor(0.6363, device='cuda:1', grad_fn=<NllLossBackward>)



  9%|▉         | 68/742 [13:34<2:14:30, 11.97s/it][A

loss: tensor(0.6887, device='cuda:1', grad_fn=<NllLossBackward>)



  9%|▉         | 69/742 [13:46<2:14:14, 11.97s/it][A

loss: tensor(0.7107, device='cuda:1', grad_fn=<NllLossBackward>)



  9%|▉         | 70/742 [13:58<2:13:51, 11.95s/it][A

loss: tensor(0.7053, device='cuda:1', grad_fn=<NllLossBackward>)



 10%|▉         | 71/742 [14:10<2:13:40, 11.95s/it][A

loss: tensor(0.6344, device='cuda:1', grad_fn=<NllLossBackward>)



 10%|▉         | 72/742 [14:22<2:13:38, 11.97s/it][A

loss: tensor(0.6710, device='cuda:1', grad_fn=<NllLossBackward>)



 10%|▉         | 73/742 [14:34<2:13:34, 11.98s/it][A

loss: tensor(0.7265, device='cuda:1', grad_fn=<NllLossBackward>)



 10%|▉         | 74/742 [14:46<2:13:19, 11.97s/it][A

loss: tensor(0.7722, device='cuda:1', grad_fn=<NllLossBackward>)



 10%|█         | 75/742 [14:58<2:13:10, 11.98s/it][A

loss: tensor(0.6337, device='cuda:1', grad_fn=<NllLossBackward>)



 10%|█         | 76/742 [15:10<2:12:49, 11.97s/it][A

loss: tensor(0.6114, device='cuda:1', grad_fn=<NllLossBackward>)



 10%|█         | 77/742 [15:22<2:12:41, 11.97s/it][A

loss: tensor(0.6608, device='cuda:1', grad_fn=<NllLossBackward>)



 11%|█         | 78/742 [15:34<2:13:01, 12.02s/it][A

loss: tensor(0.7209, device='cuda:1', grad_fn=<NllLossBackward>)



 11%|█         | 79/742 [15:46<2:12:43, 12.01s/it][A

loss: tensor(0.6082, device='cuda:1', grad_fn=<NllLossBackward>)



 11%|█         | 80/742 [15:58<2:12:24, 12.00s/it][A

loss: tensor(0.7141, device='cuda:1', grad_fn=<NllLossBackward>)



 11%|█         | 81/742 [16:10<2:12:04, 11.99s/it][A

loss: tensor(0.6078, device='cuda:1', grad_fn=<NllLossBackward>)



 11%|█         | 82/742 [16:22<2:12:13, 12.02s/it][A

loss: tensor(0.6034, device='cuda:1', grad_fn=<NllLossBackward>)



 11%|█         | 83/742 [16:34<2:12:22, 12.05s/it][A

loss: tensor(0.7678, device='cuda:1', grad_fn=<NllLossBackward>)



 11%|█▏        | 84/742 [16:47<2:12:31, 12.08s/it][A

loss: tensor(0.7413, device='cuda:1', grad_fn=<NllLossBackward>)



 11%|█▏        | 85/742 [16:59<2:12:43, 12.12s/it][A

loss: tensor(0.6832, device='cuda:1', grad_fn=<NllLossBackward>)



 12%|█▏        | 86/742 [17:11<2:12:26, 12.11s/it][A

loss: tensor(0.6710, device='cuda:1', grad_fn=<NllLossBackward>)



 12%|█▏        | 87/742 [17:23<2:11:34, 12.05s/it][A

loss: tensor(0.7583, device='cuda:1', grad_fn=<NllLossBackward>)



 12%|█▏        | 88/742 [17:35<2:10:58, 12.02s/it][A

loss: tensor(0.6506, device='cuda:1', grad_fn=<NllLossBackward>)



 12%|█▏        | 89/742 [17:47<2:10:24, 11.98s/it][A

loss: tensor(0.6256, device='cuda:1', grad_fn=<NllLossBackward>)



 12%|█▏        | 90/742 [17:59<2:10:06, 11.97s/it][A

loss: tensor(0.7678, device='cuda:1', grad_fn=<NllLossBackward>)



 12%|█▏        | 91/742 [18:11<2:09:54, 11.97s/it][A

loss: tensor(0.5391, device='cuda:1', grad_fn=<NllLossBackward>)



 12%|█▏        | 92/742 [18:22<2:09:06, 11.92s/it][A

loss: tensor(0.7687, device='cuda:1', grad_fn=<NllLossBackward>)



 13%|█▎        | 93/742 [18:34<2:08:59, 11.93s/it][A

loss: tensor(0.6604, device='cuda:1', grad_fn=<NllLossBackward>)


  _warn_prf(average, modifier, msg_start, len(result))

 13%|█▎        | 94/742 [18:46<2:09:20, 11.98s/it][A

loss: tensor(0.6127, device='cuda:1', grad_fn=<NllLossBackward>)



 13%|█▎        | 95/742 [18:59<2:10:00, 12.06s/it][A

loss: tensor(0.6942, device='cuda:1', grad_fn=<NllLossBackward>)



 13%|█▎        | 96/742 [19:11<2:09:13, 12.00s/it][A

loss: tensor(0.5657, device='cuda:1', grad_fn=<NllLossBackward>)



 13%|█▎        | 97/742 [19:22<2:08:50, 11.98s/it][A

loss: tensor(0.5600, device='cuda:1', grad_fn=<NllLossBackward>)



 13%|█▎        | 98/742 [19:34<2:08:40, 11.99s/it][A

loss: tensor(0.7989, device='cuda:1', grad_fn=<NllLossBackward>)



 13%|█▎        | 99/742 [19:46<2:08:28, 11.99s/it][A

loss: tensor(0.6411, device='cuda:1', grad_fn=<NllLossBackward>)



 13%|█▎        | 100/742 [19:58<2:08:05, 11.97s/it][A

loss: tensor(0.7155, device='cuda:1', grad_fn=<NllLossBackward>)



 14%|█▎        | 101/742 [20:10<2:07:59, 11.98s/it][A

loss: tensor(0.6434, device='cuda:1', grad_fn=<NllLossBackward>)



 14%|█▎        | 102/742 [20:22<2:07:41, 11.97s/it][A

loss: tensor(0.6601, device='cuda:1', grad_fn=<NllLossBackward>)



 14%|█▍        | 103/742 [20:34<2:07:21, 11.96s/it][A

loss: tensor(0.6206, device='cuda:1', grad_fn=<NllLossBackward>)



 14%|█▍        | 104/742 [20:46<2:07:14, 11.97s/it][A

loss: tensor(0.6278, device='cuda:1', grad_fn=<NllLossBackward>)



 14%|█▍        | 105/742 [20:58<2:06:44, 11.94s/it][A

loss: tensor(0.7135, device='cuda:1', grad_fn=<NllLossBackward>)



 14%|█▍        | 106/742 [21:10<2:06:38, 11.95s/it][A

loss: tensor(0.6589, device='cuda:1', grad_fn=<NllLossBackward>)



 14%|█▍        | 107/742 [21:22<2:06:23, 11.94s/it][A

loss: tensor(0.6951, device='cuda:1', grad_fn=<NllLossBackward>)



 15%|█▍        | 108/742 [21:34<2:06:56, 12.01s/it][A

loss: tensor(0.7670, device='cuda:1', grad_fn=<NllLossBackward>)



 15%|█▍        | 109/742 [21:46<2:06:57, 12.03s/it][A

loss: tensor(0.7880, device='cuda:1', grad_fn=<NllLossBackward>)



 15%|█▍        | 110/742 [21:58<2:06:04, 11.97s/it][A

loss: tensor(0.7192, device='cuda:1', grad_fn=<NllLossBackward>)



 15%|█▍        | 111/742 [22:10<2:06:22, 12.02s/it][A

loss: tensor(0.7260, device='cuda:1', grad_fn=<NllLossBackward>)



 15%|█▌        | 112/742 [22:22<2:06:19, 12.03s/it][A

loss: tensor(0.5668, device='cuda:1', grad_fn=<NllLossBackward>)



 15%|█▌        | 113/742 [22:34<2:05:48, 12.00s/it][A

loss: tensor(0.6213, device='cuda:1', grad_fn=<NllLossBackward>)



 15%|█▌        | 114/742 [22:46<2:05:15, 11.97s/it][A

loss: tensor(0.5324, device='cuda:1', grad_fn=<NllLossBackward>)



 15%|█▌        | 115/742 [22:58<2:05:14, 11.98s/it][A

loss: tensor(0.8599, device='cuda:1', grad_fn=<NllLossBackward>)



 16%|█▌        | 116/742 [23:10<2:05:39, 12.04s/it][A

loss: tensor(0.7923, device='cuda:1', grad_fn=<NllLossBackward>)



 16%|█▌        | 117/742 [23:22<2:05:21, 12.03s/it][A

loss: tensor(0.7447, device='cuda:1', grad_fn=<NllLossBackward>)



 16%|█▌        | 118/742 [23:34<2:04:45, 12.00s/it][A

loss: tensor(0.7039, device='cuda:1', grad_fn=<NllLossBackward>)



 16%|█▌        | 119/742 [23:46<2:04:59, 12.04s/it][A

loss: tensor(0.7127, device='cuda:1', grad_fn=<NllLossBackward>)



 16%|█▌        | 120/742 [23:58<2:04:32, 12.01s/it][A

loss: tensor(0.6994, device='cuda:1', grad_fn=<NllLossBackward>)



 16%|█▋        | 121/742 [24:10<2:04:17, 12.01s/it][A

loss: tensor(0.7264, device='cuda:1', grad_fn=<NllLossBackward>)



 16%|█▋        | 122/742 [24:22<2:03:49, 11.98s/it][A

loss: tensor(0.7415, device='cuda:1', grad_fn=<NllLossBackward>)



 17%|█▋        | 123/742 [24:34<2:03:41, 11.99s/it][A

loss: tensor(0.6257, device='cuda:1', grad_fn=<NllLossBackward>)



 17%|█▋        | 124/742 [24:46<2:03:59, 12.04s/it][A

loss: tensor(0.6817, device='cuda:1', grad_fn=<NllLossBackward>)



 17%|█▋        | 125/742 [24:58<2:03:35, 12.02s/it][A

loss: tensor(0.7672, device='cuda:1', grad_fn=<NllLossBackward>)



 17%|█▋        | 126/742 [25:11<2:03:48, 12.06s/it][A

loss: tensor(0.6695, device='cuda:1', grad_fn=<NllLossBackward>)



 17%|█▋        | 127/742 [25:23<2:03:21, 12.03s/it][A

loss: tensor(0.5186, device='cuda:1', grad_fn=<NllLossBackward>)



 17%|█▋        | 128/742 [25:35<2:03:29, 12.07s/it][A

loss: tensor(0.6971, device='cuda:1', grad_fn=<NllLossBackward>)



 17%|█▋        | 129/742 [25:47<2:03:03, 12.04s/it][A

loss: tensor(0.6860, device='cuda:1', grad_fn=<NllLossBackward>)



 18%|█▊        | 130/742 [25:59<2:02:16, 11.99s/it][A

loss: tensor(0.7666, device='cuda:1', grad_fn=<NllLossBackward>)



 18%|█▊        | 131/742 [26:10<2:01:39, 11.95s/it][A

loss: tensor(0.7191, device='cuda:1', grad_fn=<NllLossBackward>)



 18%|█▊        | 132/742 [26:22<2:00:54, 11.89s/it][A

loss: tensor(0.6133, device='cuda:1', grad_fn=<NllLossBackward>)



 18%|█▊        | 133/742 [26:34<2:00:36, 11.88s/it][A

loss: tensor(0.7995, device='cuda:1', grad_fn=<NllLossBackward>)



 18%|█▊        | 134/742 [26:46<2:00:22, 11.88s/it][A

loss: tensor(0.7045, device='cuda:1', grad_fn=<NllLossBackward>)



 18%|█▊        | 135/742 [26:58<2:00:06, 11.87s/it][A

loss: tensor(0.7475, device='cuda:1', grad_fn=<NllLossBackward>)



 18%|█▊        | 136/742 [27:10<1:59:42, 11.85s/it][A

loss: tensor(0.8070, device='cuda:1', grad_fn=<NllLossBackward>)



 18%|█▊        | 137/742 [27:21<1:59:37, 11.86s/it][A

loss: tensor(0.7901, device='cuda:1', grad_fn=<NllLossBackward>)



 19%|█▊        | 138/742 [27:33<1:59:33, 11.88s/it][A

loss: tensor(0.6268, device='cuda:1', grad_fn=<NllLossBackward>)



 19%|█▊        | 139/742 [27:45<1:59:17, 11.87s/it][A

loss: tensor(0.6639, device='cuda:1', grad_fn=<NllLossBackward>)



 19%|█▉        | 140/742 [27:57<1:58:48, 11.84s/it][A

loss: tensor(0.6558, device='cuda:1', grad_fn=<NllLossBackward>)



 19%|█▉        | 141/742 [28:09<1:59:07, 11.89s/it][A

loss: tensor(0.7102, device='cuda:1', grad_fn=<NllLossBackward>)



 19%|█▉        | 142/742 [28:21<1:59:14, 11.92s/it][A

loss: tensor(0.7584, device='cuda:1', grad_fn=<NllLossBackward>)



 19%|█▉        | 143/742 [28:33<1:59:14, 11.94s/it][A

loss: tensor(0.6127, device='cuda:1', grad_fn=<NllLossBackward>)



 19%|█▉        | 144/742 [28:45<1:59:07, 11.95s/it][A

loss: tensor(0.5750, device='cuda:1', grad_fn=<NllLossBackward>)



 20%|█▉        | 145/742 [28:57<1:58:21, 11.89s/it][A

loss: tensor(0.6546, device='cuda:1', grad_fn=<NllLossBackward>)



 20%|█▉        | 146/742 [29:09<1:58:16, 11.91s/it][A

loss: tensor(0.6282, device='cuda:1', grad_fn=<NllLossBackward>)



 20%|█▉        | 147/742 [29:21<1:58:09, 11.92s/it][A

loss: tensor(0.7198, device='cuda:1', grad_fn=<NllLossBackward>)



 20%|█▉        | 148/742 [29:33<1:58:06, 11.93s/it][A

loss: tensor(0.6829, device='cuda:1', grad_fn=<NllLossBackward>)



 20%|██        | 149/742 [29:44<1:57:42, 11.91s/it][A

loss: tensor(0.6632, device='cuda:1', grad_fn=<NllLossBackward>)



 20%|██        | 150/742 [29:56<1:57:17, 11.89s/it][A

loss: tensor(0.7346, device='cuda:1', grad_fn=<NllLossBackward>)



 20%|██        | 151/742 [30:08<1:56:56, 11.87s/it][A

loss: tensor(0.6325, device='cuda:1', grad_fn=<NllLossBackward>)



 20%|██        | 152/742 [30:20<1:56:48, 11.88s/it][A

loss: tensor(0.6893, device='cuda:1', grad_fn=<NllLossBackward>)



 21%|██        | 153/742 [30:32<1:56:43, 11.89s/it][A

loss: tensor(0.6702, device='cuda:1', grad_fn=<NllLossBackward>)



 21%|██        | 154/742 [30:44<1:56:51, 11.92s/it][A

loss: tensor(0.6506, device='cuda:1', grad_fn=<NllLossBackward>)



 21%|██        | 155/742 [30:56<1:56:58, 11.96s/it][A

loss: tensor(0.8484, device='cuda:1', grad_fn=<NllLossBackward>)



 21%|██        | 156/742 [31:08<1:57:18, 12.01s/it][A

loss: tensor(0.6660, device='cuda:1', grad_fn=<NllLossBackward>)



 21%|██        | 157/742 [31:20<1:56:37, 11.96s/it][A

loss: tensor(0.5911, device='cuda:1', grad_fn=<NllLossBackward>)



 21%|██▏       | 158/742 [31:32<1:56:24, 11.96s/it][A

loss: tensor(0.5864, device='cuda:1', grad_fn=<NllLossBackward>)



 21%|██▏       | 159/742 [31:44<1:56:07, 11.95s/it][A

loss: tensor(0.6443, device='cuda:1', grad_fn=<NllLossBackward>)



 22%|██▏       | 160/742 [31:56<1:55:28, 11.90s/it][A

loss: tensor(0.7046, device='cuda:1', grad_fn=<NllLossBackward>)



 22%|██▏       | 161/742 [32:08<1:55:33, 11.93s/it][A

loss: tensor(0.6083, device='cuda:1', grad_fn=<NllLossBackward>)



 22%|██▏       | 162/742 [32:19<1:55:09, 11.91s/it][A

loss: tensor(0.6803, device='cuda:1', grad_fn=<NllLossBackward>)



 22%|██▏       | 163/742 [32:31<1:55:10, 11.94s/it][A

loss: tensor(0.7358, device='cuda:1', grad_fn=<NllLossBackward>)



 22%|██▏       | 164/742 [32:43<1:55:02, 11.94s/it][A

loss: tensor(0.6509, device='cuda:1', grad_fn=<NllLossBackward>)



 22%|██▏       | 165/742 [32:55<1:55:04, 11.97s/it][A

loss: tensor(0.6924, device='cuda:1', grad_fn=<NllLossBackward>)



 22%|██▏       | 166/742 [33:07<1:54:43, 11.95s/it][A

loss: tensor(0.7608, device='cuda:1', grad_fn=<NllLossBackward>)



 23%|██▎       | 167/742 [33:19<1:54:42, 11.97s/it][A

loss: tensor(0.6263, device='cuda:1', grad_fn=<NllLossBackward>)



 23%|██▎       | 168/742 [33:31<1:54:35, 11.98s/it][A

loss: tensor(0.5939, device='cuda:1', grad_fn=<NllLossBackward>)



 23%|██▎       | 169/742 [33:43<1:54:45, 12.02s/it][A

loss: tensor(0.7113, device='cuda:1', grad_fn=<NllLossBackward>)



 23%|██▎       | 170/742 [33:55<1:54:20, 11.99s/it][A

loss: tensor(0.7614, device='cuda:1', grad_fn=<NllLossBackward>)



 23%|██▎       | 171/742 [34:07<1:53:34, 11.93s/it][A

loss: tensor(0.5889, device='cuda:1', grad_fn=<NllLossBackward>)



 23%|██▎       | 172/742 [34:19<1:53:20, 11.93s/it][A

loss: tensor(0.7178, device='cuda:1', grad_fn=<NllLossBackward>)



 23%|██▎       | 173/742 [34:31<1:53:04, 11.92s/it][A

loss: tensor(0.6352, device='cuda:1', grad_fn=<NllLossBackward>)



 23%|██▎       | 174/742 [34:43<1:52:59, 11.94s/it][A

loss: tensor(0.7709, device='cuda:1', grad_fn=<NllLossBackward>)



 24%|██▎       | 175/742 [34:55<1:52:46, 11.93s/it][A

loss: tensor(0.6346, device='cuda:1', grad_fn=<NllLossBackward>)



 24%|██▎       | 176/742 [35:07<1:52:56, 11.97s/it][A

loss: tensor(0.7755, device='cuda:1', grad_fn=<NllLossBackward>)



 24%|██▍       | 177/742 [35:19<1:53:27, 12.05s/it][A

loss: tensor(0.7254, device='cuda:1', grad_fn=<NllLossBackward>)



 24%|██▍       | 178/742 [35:31<1:53:47, 12.10s/it][A

loss: tensor(0.7125, device='cuda:1', grad_fn=<NllLossBackward>)



 24%|██▍       | 179/742 [35:44<1:53:32, 12.10s/it][A

loss: tensor(0.6532, device='cuda:1', grad_fn=<NllLossBackward>)



 24%|██▍       | 180/742 [35:56<1:53:26, 12.11s/it][A

loss: tensor(0.7887, device='cuda:1', grad_fn=<NllLossBackward>)



 24%|██▍       | 181/742 [36:08<1:52:33, 12.04s/it][A

loss: tensor(0.7165, device='cuda:1', grad_fn=<NllLossBackward>)



 25%|██▍       | 182/742 [36:19<1:52:06, 12.01s/it][A

loss: tensor(0.8044, device='cuda:1', grad_fn=<NllLossBackward>)



 25%|██▍       | 183/742 [36:31<1:51:35, 11.98s/it][A

loss: tensor(0.8270, device='cuda:1', grad_fn=<NllLossBackward>)



 25%|██▍       | 184/742 [36:43<1:50:58, 11.93s/it][A

loss: tensor(0.5752, device='cuda:1', grad_fn=<NllLossBackward>)



 25%|██▍       | 185/742 [36:55<1:50:37, 11.92s/it][A

loss: tensor(0.6102, device='cuda:1', grad_fn=<NllLossBackward>)



 25%|██▌       | 186/742 [37:07<1:50:18, 11.90s/it][A

loss: tensor(0.7291, device='cuda:1', grad_fn=<NllLossBackward>)



 25%|██▌       | 187/742 [37:19<1:50:00, 11.89s/it][A

loss: tensor(0.5809, device='cuda:1', grad_fn=<NllLossBackward>)



 25%|██▌       | 188/742 [37:31<1:49:51, 11.90s/it][A

loss: tensor(0.6702, device='cuda:1', grad_fn=<NllLossBackward>)



 25%|██▌       | 189/742 [37:43<1:49:48, 11.91s/it][A

loss: tensor(0.7386, device='cuda:1', grad_fn=<NllLossBackward>)



 26%|██▌       | 190/742 [37:55<1:49:51, 11.94s/it][A

loss: tensor(0.6393, device='cuda:1', grad_fn=<NllLossBackward>)



 26%|██▌       | 191/742 [38:07<1:49:53, 11.97s/it][A

loss: tensor(0.7116, device='cuda:1', grad_fn=<NllLossBackward>)



 26%|██▌       | 192/742 [38:19<1:49:42, 11.97s/it][A

loss: tensor(0.5623, device='cuda:1', grad_fn=<NllLossBackward>)



 26%|██▌       | 193/742 [38:31<1:49:14, 11.94s/it][A

loss: tensor(0.6723, device='cuda:1', grad_fn=<NllLossBackward>)



 26%|██▌       | 194/742 [38:42<1:48:57, 11.93s/it][A

loss: tensor(0.6732, device='cuda:1', grad_fn=<NllLossBackward>)



 26%|██▋       | 195/742 [38:54<1:48:55, 11.95s/it][A

loss: tensor(0.6149, device='cuda:1', grad_fn=<NllLossBackward>)



 26%|██▋       | 196/742 [39:07<1:49:30, 12.03s/it][A

loss: tensor(0.6725, device='cuda:1', grad_fn=<NllLossBackward>)



 27%|██▋       | 197/742 [39:19<1:49:25, 12.05s/it][A

loss: tensor(0.6298, device='cuda:1', grad_fn=<NllLossBackward>)



 27%|██▋       | 198/742 [39:31<1:49:41, 12.10s/it][A

loss: tensor(0.7321, device='cuda:1', grad_fn=<NllLossBackward>)



 27%|██▋       | 199/742 [39:43<1:49:01, 12.05s/it][A

loss: tensor(0.6843, device='cuda:1', grad_fn=<NllLossBackward>)



 27%|██▋       | 200/742 [39:55<1:48:30, 12.01s/it][A

loss: tensor(0.6774, device='cuda:1', grad_fn=<NllLossBackward>)



 27%|██▋       | 201/742 [40:07<1:47:39, 11.94s/it][A

loss: tensor(0.6544, device='cuda:1', grad_fn=<NllLossBackward>)



 27%|██▋       | 202/742 [40:18<1:47:21, 11.93s/it][A

loss: tensor(0.6704, device='cuda:1', grad_fn=<NllLossBackward>)



 27%|██▋       | 203/742 [40:30<1:47:02, 11.92s/it][A

loss: tensor(0.6661, device='cuda:1', grad_fn=<NllLossBackward>)



 27%|██▋       | 204/742 [40:42<1:46:57, 11.93s/it][A

loss: tensor(0.6401, device='cuda:1', grad_fn=<NllLossBackward>)



 28%|██▊       | 205/742 [40:54<1:46:50, 11.94s/it][A

loss: tensor(0.7134, device='cuda:1', grad_fn=<NllLossBackward>)



 28%|██▊       | 206/742 [41:06<1:46:16, 11.90s/it][A

loss: tensor(0.7145, device='cuda:1', grad_fn=<NllLossBackward>)



 28%|██▊       | 207/742 [41:18<1:46:44, 11.97s/it][A

loss: tensor(0.6570, device='cuda:1', grad_fn=<NllLossBackward>)



 28%|██▊       | 208/742 [41:30<1:46:59, 12.02s/it][A

loss: tensor(0.7513, device='cuda:1', grad_fn=<NllLossBackward>)



 28%|██▊       | 209/742 [41:43<1:47:04, 12.05s/it][A

loss: tensor(0.6164, device='cuda:1', grad_fn=<NllLossBackward>)



 28%|██▊       | 210/742 [41:54<1:46:38, 12.03s/it][A

loss: tensor(0.7453, device='cuda:1', grad_fn=<NllLossBackward>)



 28%|██▊       | 211/742 [42:07<1:46:37, 12.05s/it][A

loss: tensor(0.6516, device='cuda:1', grad_fn=<NllLossBackward>)



 29%|██▊       | 212/742 [42:19<1:46:57, 12.11s/it][A

loss: tensor(0.7893, device='cuda:1', grad_fn=<NllLossBackward>)



 29%|██▊       | 213/742 [42:31<1:46:38, 12.09s/it][A

loss: tensor(0.6263, device='cuda:1', grad_fn=<NllLossBackward>)



 29%|██▉       | 214/742 [42:43<1:46:20, 12.08s/it][A

loss: tensor(0.6789, device='cuda:1', grad_fn=<NllLossBackward>)



 29%|██▉       | 215/742 [42:55<1:45:19, 11.99s/it][A

loss: tensor(0.6292, device='cuda:1', grad_fn=<NllLossBackward>)



 29%|██▉       | 216/742 [43:07<1:44:55, 11.97s/it][A

loss: tensor(0.6133, device='cuda:1', grad_fn=<NllLossBackward>)



 29%|██▉       | 217/742 [43:19<1:44:47, 11.98s/it][A

loss: tensor(0.6873, device='cuda:1', grad_fn=<NllLossBackward>)



 29%|██▉       | 218/742 [43:30<1:44:08, 11.93s/it][A

loss: tensor(0.7301, device='cuda:1', grad_fn=<NllLossBackward>)



 30%|██▉       | 219/742 [43:42<1:43:19, 11.85s/it][A

loss: tensor(0.7352, device='cuda:1', grad_fn=<NllLossBackward>)



 30%|██▉       | 220/742 [43:54<1:42:49, 11.82s/it][A

loss: tensor(0.6040, device='cuda:1', grad_fn=<NllLossBackward>)



 30%|██▉       | 221/742 [44:06<1:42:29, 11.80s/it][A

loss: tensor(0.6583, device='cuda:1', grad_fn=<NllLossBackward>)



 30%|██▉       | 222/742 [44:18<1:42:33, 11.83s/it][A

loss: tensor(0.6164, device='cuda:1', grad_fn=<NllLossBackward>)



 30%|███       | 223/742 [44:29<1:42:25, 11.84s/it][A

loss: tensor(0.6677, device='cuda:1', grad_fn=<NllLossBackward>)



 30%|███       | 224/742 [44:41<1:41:57, 11.81s/it][A

loss: tensor(0.6568, device='cuda:1', grad_fn=<NllLossBackward>)



 30%|███       | 225/742 [44:53<1:41:53, 11.82s/it][A

loss: tensor(0.7085, device='cuda:1', grad_fn=<NllLossBackward>)



 30%|███       | 226/742 [45:05<1:41:47, 11.84s/it][A

loss: tensor(0.6979, device='cuda:1', grad_fn=<NllLossBackward>)



 31%|███       | 227/742 [45:17<1:41:48, 11.86s/it][A

loss: tensor(0.6959, device='cuda:1', grad_fn=<NllLossBackward>)



 31%|███       | 228/742 [45:28<1:41:16, 11.82s/it][A

loss: tensor(0.7106, device='cuda:1', grad_fn=<NllLossBackward>)



 31%|███       | 229/742 [45:40<1:40:56, 11.81s/it][A

loss: tensor(0.6303, device='cuda:1', grad_fn=<NllLossBackward>)



 31%|███       | 230/742 [45:52<1:40:49, 11.82s/it][A

loss: tensor(0.7896, device='cuda:1', grad_fn=<NllLossBackward>)



 31%|███       | 231/742 [46:04<1:40:44, 11.83s/it][A

loss: tensor(0.7786, device='cuda:1', grad_fn=<NllLossBackward>)



 31%|███▏      | 232/742 [46:16<1:40:35, 11.84s/it][A

loss: tensor(0.6691, device='cuda:1', grad_fn=<NllLossBackward>)



 31%|███▏      | 233/742 [46:28<1:40:47, 11.88s/it][A

loss: tensor(0.7261, device='cuda:1', grad_fn=<NllLossBackward>)



 32%|███▏      | 234/742 [46:40<1:40:50, 11.91s/it][A

loss: tensor(0.6864, device='cuda:1', grad_fn=<NllLossBackward>)



 32%|███▏      | 235/742 [46:52<1:40:48, 11.93s/it][A

loss: tensor(0.7604, device='cuda:1', grad_fn=<NllLossBackward>)



 32%|███▏      | 236/742 [47:04<1:41:00, 11.98s/it][A

loss: tensor(0.6779, device='cuda:1', grad_fn=<NllLossBackward>)



 32%|███▏      | 237/742 [47:16<1:40:50, 11.98s/it][A

loss: tensor(0.6867, device='cuda:1', grad_fn=<NllLossBackward>)



 32%|███▏      | 238/742 [47:28<1:40:54, 12.01s/it][A

loss: tensor(0.6778, device='cuda:1', grad_fn=<NllLossBackward>)



 32%|███▏      | 239/742 [47:40<1:40:49, 12.03s/it][A

loss: tensor(0.7116, device='cuda:1', grad_fn=<NllLossBackward>)



 32%|███▏      | 240/742 [47:52<1:40:50, 12.05s/it][A

loss: tensor(0.7225, device='cuda:1', grad_fn=<NllLossBackward>)



 32%|███▏      | 241/742 [48:04<1:40:26, 12.03s/it][A

loss: tensor(0.6792, device='cuda:1', grad_fn=<NllLossBackward>)



 33%|███▎      | 242/742 [48:16<1:40:26, 12.05s/it][A

loss: tensor(0.7221, device='cuda:1', grad_fn=<NllLossBackward>)



 33%|███▎      | 243/742 [48:28<1:40:23, 12.07s/it][A

loss: tensor(0.7772, device='cuda:1', grad_fn=<NllLossBackward>)



 33%|███▎      | 244/742 [48:40<1:40:12, 12.07s/it][A

loss: tensor(0.6954, device='cuda:1', grad_fn=<NllLossBackward>)



 33%|███▎      | 245/742 [48:52<1:39:47, 12.05s/it][A

loss: tensor(0.6647, device='cuda:1', grad_fn=<NllLossBackward>)



 33%|███▎      | 246/742 [49:04<1:39:37, 12.05s/it][A

loss: tensor(0.7146, device='cuda:1', grad_fn=<NllLossBackward>)



 33%|███▎      | 247/742 [49:16<1:39:29, 12.06s/it][A

loss: tensor(0.7316, device='cuda:1', grad_fn=<NllLossBackward>)



 33%|███▎      | 248/742 [49:29<1:39:36, 12.10s/it][A

loss: tensor(0.7364, device='cuda:1', grad_fn=<NllLossBackward>)



 34%|███▎      | 249/742 [49:41<1:39:35, 12.12s/it][A

loss: tensor(0.7241, device='cuda:1', grad_fn=<NllLossBackward>)



 34%|███▎      | 250/742 [49:53<1:39:14, 12.10s/it][A

loss: tensor(0.7395, device='cuda:1', grad_fn=<NllLossBackward>)



 34%|███▍      | 251/742 [50:05<1:39:17, 12.13s/it][A

loss: tensor(0.6920, device='cuda:1', grad_fn=<NllLossBackward>)



 34%|███▍      | 252/742 [50:17<1:39:24, 12.17s/it][A

loss: tensor(0.6898, device='cuda:1', grad_fn=<NllLossBackward>)



 34%|███▍      | 253/742 [50:30<1:39:34, 12.22s/it][A

loss: tensor(0.6088, device='cuda:1', grad_fn=<NllLossBackward>)



 34%|███▍      | 254/742 [50:42<1:39:01, 12.18s/it][A

loss: tensor(0.7277, device='cuda:1', grad_fn=<NllLossBackward>)



 34%|███▍      | 255/742 [50:54<1:38:46, 12.17s/it][A

loss: tensor(0.7124, device='cuda:1', grad_fn=<NllLossBackward>)



 35%|███▍      | 256/742 [51:06<1:38:20, 12.14s/it][A

loss: tensor(0.7598, device='cuda:1', grad_fn=<NllLossBackward>)



 35%|███▍      | 257/742 [51:18<1:37:49, 12.10s/it][A

loss: tensor(0.7128, device='cuda:1', grad_fn=<NllLossBackward>)



 35%|███▍      | 258/742 [51:30<1:37:11, 12.05s/it][A

loss: tensor(0.6167, device='cuda:1', grad_fn=<NllLossBackward>)



 35%|███▍      | 259/742 [51:42<1:36:20, 11.97s/it][A

loss: tensor(0.6623, device='cuda:1', grad_fn=<NllLossBackward>)



 35%|███▌      | 260/742 [51:53<1:35:38, 11.91s/it][A

loss: tensor(0.6932, device='cuda:1', grad_fn=<NllLossBackward>)



 35%|███▌      | 261/742 [52:05<1:35:11, 11.87s/it][A

loss: tensor(0.7156, device='cuda:1', grad_fn=<NllLossBackward>)



 35%|███▌      | 262/742 [52:17<1:34:53, 11.86s/it][A

loss: tensor(0.7962, device='cuda:1', grad_fn=<NllLossBackward>)



 35%|███▌      | 263/742 [52:29<1:34:25, 11.83s/it][A

loss: tensor(0.6260, device='cuda:1', grad_fn=<NllLossBackward>)



 36%|███▌      | 264/742 [52:41<1:34:37, 11.88s/it][A

loss: tensor(0.6875, device='cuda:1', grad_fn=<NllLossBackward>)



 36%|███▌      | 265/742 [52:53<1:34:42, 11.91s/it][A

loss: tensor(0.7178, device='cuda:1', grad_fn=<NllLossBackward>)



 36%|███▌      | 266/742 [53:05<1:34:25, 11.90s/it][A

loss: tensor(0.7789, device='cuda:1', grad_fn=<NllLossBackward>)



 36%|███▌      | 267/742 [53:17<1:34:36, 11.95s/it][A

loss: tensor(0.7031, device='cuda:1', grad_fn=<NllLossBackward>)



 36%|███▌      | 268/742 [53:29<1:34:41, 11.99s/it][A

loss: tensor(0.6271, device='cuda:1', grad_fn=<NllLossBackward>)



 36%|███▋      | 269/742 [53:41<1:34:50, 12.03s/it][A

loss: tensor(0.7617, device='cuda:1', grad_fn=<NllLossBackward>)



 36%|███▋      | 270/742 [53:53<1:35:18, 12.12s/it][A

loss: tensor(0.6381, device='cuda:1', grad_fn=<NllLossBackward>)



 37%|███▋      | 271/742 [54:06<1:35:20, 12.15s/it][A

loss: tensor(0.6797, device='cuda:1', grad_fn=<NllLossBackward>)



 37%|███▋      | 272/742 [54:17<1:34:27, 12.06s/it][A

loss: tensor(0.7445, device='cuda:1', grad_fn=<NllLossBackward>)



 37%|███▋      | 273/742 [54:29<1:33:53, 12.01s/it][A

loss: tensor(0.6189, device='cuda:1', grad_fn=<NllLossBackward>)



 37%|███▋      | 274/742 [54:41<1:33:42, 12.01s/it][A

loss: tensor(0.7031, device='cuda:1', grad_fn=<NllLossBackward>)



 37%|███▋      | 275/742 [54:53<1:33:51, 12.06s/it][A

loss: tensor(0.6723, device='cuda:1', grad_fn=<NllLossBackward>)



 37%|███▋      | 276/742 [55:06<1:34:12, 12.13s/it][A

loss: tensor(0.6481, device='cuda:1', grad_fn=<NllLossBackward>)



 37%|███▋      | 277/742 [55:18<1:33:27, 12.06s/it][A

loss: tensor(0.6746, device='cuda:1', grad_fn=<NllLossBackward>)



 37%|███▋      | 278/742 [55:30<1:32:52, 12.01s/it][A

loss: tensor(0.6632, device='cuda:1', grad_fn=<NllLossBackward>)



 38%|███▊      | 279/742 [55:42<1:32:40, 12.01s/it][A

loss: tensor(0.6974, device='cuda:1', grad_fn=<NllLossBackward>)



 38%|███▊      | 280/742 [55:53<1:31:57, 11.94s/it][A

loss: tensor(0.6030, device='cuda:1', grad_fn=<NllLossBackward>)



 38%|███▊      | 281/742 [56:05<1:31:49, 11.95s/it][A

loss: tensor(0.7404, device='cuda:1', grad_fn=<NllLossBackward>)



 38%|███▊      | 282/742 [56:17<1:31:48, 11.97s/it][A

loss: tensor(0.7114, device='cuda:1', grad_fn=<NllLossBackward>)



 38%|███▊      | 283/742 [56:30<1:32:28, 12.09s/it][A

loss: tensor(0.7218, device='cuda:1', grad_fn=<NllLossBackward>)



 38%|███▊      | 284/742 [56:42<1:32:40, 12.14s/it][A

loss: tensor(0.7852, device='cuda:1', grad_fn=<NllLossBackward>)



 38%|███▊      | 285/742 [56:54<1:32:48, 12.18s/it][A

loss: tensor(0.6570, device='cuda:1', grad_fn=<NllLossBackward>)



 39%|███▊      | 286/742 [57:07<1:32:46, 12.21s/it][A

loss: tensor(0.6927, device='cuda:1', grad_fn=<NllLossBackward>)



 39%|███▊      | 287/742 [57:18<1:31:55, 12.12s/it][A

loss: tensor(0.6398, device='cuda:1', grad_fn=<NllLossBackward>)



 39%|███▉      | 288/742 [57:30<1:31:19, 12.07s/it][A

loss: tensor(0.7144, device='cuda:1', grad_fn=<NllLossBackward>)



 39%|███▉      | 289/742 [57:42<1:30:39, 12.01s/it][A

loss: tensor(0.7414, device='cuda:1', grad_fn=<NllLossBackward>)



 39%|███▉      | 290/742 [57:54<1:30:10, 11.97s/it][A

loss: tensor(0.6226, device='cuda:1', grad_fn=<NllLossBackward>)



 39%|███▉      | 291/742 [58:06<1:30:06, 11.99s/it][A

loss: tensor(0.6799, device='cuda:1', grad_fn=<NllLossBackward>)



 39%|███▉      | 292/742 [58:18<1:29:58, 12.00s/it][A

loss: tensor(0.7875, device='cuda:1', grad_fn=<NllLossBackward>)



 39%|███▉      | 293/742 [58:30<1:29:48, 12.00s/it][A

loss: tensor(0.7154, device='cuda:1', grad_fn=<NllLossBackward>)



 40%|███▉      | 294/742 [58:42<1:29:27, 11.98s/it][A

loss: tensor(0.6176, device='cuda:1', grad_fn=<NllLossBackward>)



 40%|███▉      | 295/742 [58:54<1:29:25, 12.00s/it][A

loss: tensor(0.6868, device='cuda:1', grad_fn=<NllLossBackward>)



 40%|███▉      | 296/742 [59:07<1:30:00, 12.11s/it][A

loss: tensor(0.7320, device='cuda:1', grad_fn=<NllLossBackward>)



 40%|████      | 297/742 [59:19<1:30:09, 12.16s/it][A

loss: tensor(0.5528, device='cuda:1', grad_fn=<NllLossBackward>)



 40%|████      | 298/742 [59:31<1:29:06, 12.04s/it][A

loss: tensor(0.5724, device='cuda:1', grad_fn=<NllLossBackward>)



 40%|████      | 299/742 [59:42<1:28:35, 12.00s/it][A

loss: tensor(0.7030, device='cuda:1', grad_fn=<NllLossBackward>)



 40%|████      | 300/742 [59:54<1:28:11, 11.97s/it][A

loss: tensor(0.7378, device='cuda:1', grad_fn=<NllLossBackward>)



 41%|████      | 301/742 [1:00:06<1:27:57, 11.97s/it][A

loss: tensor(0.7216, device='cuda:1', grad_fn=<NllLossBackward>)



 41%|████      | 302/742 [1:00:18<1:27:43, 11.96s/it][A

loss: tensor(0.5905, device='cuda:1', grad_fn=<NllLossBackward>)



 41%|████      | 303/742 [1:00:30<1:27:35, 11.97s/it][A

loss: tensor(0.7706, device='cuda:1', grad_fn=<NllLossBackward>)



 41%|████      | 304/742 [1:00:42<1:27:29, 11.99s/it][A

loss: tensor(0.6674, device='cuda:1', grad_fn=<NllLossBackward>)



 41%|████      | 305/742 [1:00:54<1:27:04, 11.96s/it][A

loss: tensor(0.5506, device='cuda:1', grad_fn=<NllLossBackward>)



 41%|████      | 306/742 [1:01:06<1:27:05, 11.98s/it][A

loss: tensor(0.7453, device='cuda:1', grad_fn=<NllLossBackward>)



 41%|████▏     | 307/742 [1:01:18<1:26:45, 11.97s/it][A

loss: tensor(0.6566, device='cuda:1', grad_fn=<NllLossBackward>)



 42%|████▏     | 308/742 [1:01:30<1:27:07, 12.05s/it][A

loss: tensor(0.6453, device='cuda:1', grad_fn=<NllLossBackward>)



 42%|████▏     | 309/742 [1:01:43<1:27:24, 12.11s/it][A

loss: tensor(0.6641, device='cuda:1', grad_fn=<NllLossBackward>)



 42%|████▏     | 310/742 [1:01:55<1:26:51, 12.06s/it][A

loss: tensor(0.6715, device='cuda:1', grad_fn=<NllLossBackward>)



 42%|████▏     | 311/742 [1:02:07<1:26:17, 12.01s/it][A

loss: tensor(0.7059, device='cuda:1', grad_fn=<NllLossBackward>)



 42%|████▏     | 312/742 [1:02:18<1:25:55, 11.99s/it][A

loss: tensor(0.7849, device='cuda:1', grad_fn=<NllLossBackward>)



 42%|████▏     | 313/742 [1:02:30<1:25:28, 11.95s/it][A

loss: tensor(0.7195, device='cuda:1', grad_fn=<NllLossBackward>)



 42%|████▏     | 314/742 [1:02:42<1:25:22, 11.97s/it][A

loss: tensor(0.5578, device='cuda:1', grad_fn=<NllLossBackward>)



 42%|████▏     | 315/742 [1:02:55<1:25:51, 12.07s/it][A

loss: tensor(0.6354, device='cuda:1', grad_fn=<NllLossBackward>)



 43%|████▎     | 316/742 [1:03:07<1:25:59, 12.11s/it][A

loss: tensor(0.6505, device='cuda:1', grad_fn=<NllLossBackward>)



 43%|████▎     | 317/742 [1:03:19<1:26:16, 12.18s/it][A

loss: tensor(0.6996, device='cuda:1', grad_fn=<NllLossBackward>)



 43%|████▎     | 318/742 [1:03:31<1:26:12, 12.20s/it][A

loss: tensor(0.6539, device='cuda:1', grad_fn=<NllLossBackward>)



 43%|████▎     | 319/742 [1:03:44<1:26:09, 12.22s/it][A

loss: tensor(0.7146, device='cuda:1', grad_fn=<NllLossBackward>)



 43%|████▎     | 320/742 [1:03:56<1:25:30, 12.16s/it][A

loss: tensor(0.7837, device='cuda:1', grad_fn=<NllLossBackward>)



 43%|████▎     | 321/742 [1:04:08<1:24:56, 12.11s/it][A

loss: tensor(0.7282, device='cuda:1', grad_fn=<NllLossBackward>)



 43%|████▎     | 322/742 [1:04:20<1:24:28, 12.07s/it][A

loss: tensor(0.7108, device='cuda:1', grad_fn=<NllLossBackward>)



 44%|████▎     | 323/742 [1:04:32<1:23:58, 12.02s/it][A

loss: tensor(0.7096, device='cuda:1', grad_fn=<NllLossBackward>)



 44%|████▎     | 324/742 [1:04:43<1:23:29, 11.98s/it][A

loss: tensor(0.7028, device='cuda:1', grad_fn=<NllLossBackward>)



 44%|████▍     | 325/742 [1:04:55<1:23:19, 11.99s/it][A

loss: tensor(0.6367, device='cuda:1', grad_fn=<NllLossBackward>)



 44%|████▍     | 326/742 [1:05:08<1:23:30, 12.04s/it][A

loss: tensor(0.7057, device='cuda:1', grad_fn=<NllLossBackward>)



 44%|████▍     | 327/742 [1:05:20<1:23:29, 12.07s/it][A

loss: tensor(0.6977, device='cuda:1', grad_fn=<NllLossBackward>)



 44%|████▍     | 328/742 [1:05:32<1:23:03, 12.04s/it][A

loss: tensor(0.6929, device='cuda:1', grad_fn=<NllLossBackward>)



 44%|████▍     | 329/742 [1:05:44<1:22:22, 11.97s/it][A

loss: tensor(0.7245, device='cuda:1', grad_fn=<NllLossBackward>)



 44%|████▍     | 330/742 [1:05:56<1:22:16, 11.98s/it][A

loss: tensor(0.6523, device='cuda:1', grad_fn=<NllLossBackward>)



 45%|████▍     | 331/742 [1:06:08<1:22:13, 12.00s/it][A

loss: tensor(0.5943, device='cuda:1', grad_fn=<NllLossBackward>)



 45%|████▍     | 332/742 [1:06:20<1:22:11, 12.03s/it][A

loss: tensor(0.7253, device='cuda:1', grad_fn=<NllLossBackward>)



 45%|████▍     | 333/742 [1:06:32<1:21:49, 12.00s/it][A

loss: tensor(0.5738, device='cuda:1', grad_fn=<NllLossBackward>)



 45%|████▌     | 334/742 [1:06:44<1:21:48, 12.03s/it][A

loss: tensor(0.5505, device='cuda:1', grad_fn=<NllLossBackward>)



 45%|████▌     | 335/742 [1:06:56<1:21:50, 12.07s/it][A

loss: tensor(0.6811, device='cuda:1', grad_fn=<NllLossBackward>)



 45%|████▌     | 336/742 [1:07:08<1:21:43, 12.08s/it][A

loss: tensor(0.6268, device='cuda:1', grad_fn=<NllLossBackward>)



 45%|████▌     | 337/742 [1:07:20<1:21:10, 12.03s/it][A

loss: tensor(0.6474, device='cuda:1', grad_fn=<NllLossBackward>)



 46%|████▌     | 338/742 [1:07:32<1:20:42, 11.99s/it][A

loss: tensor(0.7049, device='cuda:1', grad_fn=<NllLossBackward>)



 46%|████▌     | 339/742 [1:07:44<1:20:34, 12.00s/it][A

loss: tensor(0.6469, device='cuda:1', grad_fn=<NllLossBackward>)



 46%|████▌     | 340/742 [1:07:56<1:20:22, 12.00s/it][A

loss: tensor(0.6504, device='cuda:1', grad_fn=<NllLossBackward>)



 46%|████▌     | 341/742 [1:08:08<1:20:11, 12.00s/it][A

loss: tensor(0.6939, device='cuda:1', grad_fn=<NllLossBackward>)



 46%|████▌     | 342/742 [1:08:20<1:19:58, 12.00s/it][A

loss: tensor(0.6598, device='cuda:1', grad_fn=<NllLossBackward>)



 46%|████▌     | 343/742 [1:08:32<1:19:52, 12.01s/it][A

loss: tensor(0.7015, device='cuda:1', grad_fn=<NllLossBackward>)



 46%|████▋     | 344/742 [1:08:44<1:19:39, 12.01s/it][A

loss: tensor(0.7367, device='cuda:1', grad_fn=<NllLossBackward>)



 46%|████▋     | 345/742 [1:08:56<1:19:31, 12.02s/it][A

loss: tensor(0.5600, device='cuda:1', grad_fn=<NllLossBackward>)



 47%|████▋     | 346/742 [1:09:08<1:19:43, 12.08s/it][A

loss: tensor(0.6516, device='cuda:1', grad_fn=<NllLossBackward>)



 47%|████▋     | 347/742 [1:09:20<1:19:26, 12.07s/it][A

loss: tensor(0.5879, device='cuda:1', grad_fn=<NllLossBackward>)



 47%|████▋     | 348/742 [1:09:32<1:19:16, 12.07s/it][A

loss: tensor(0.6403, device='cuda:1', grad_fn=<NllLossBackward>)



 47%|████▋     | 349/742 [1:09:44<1:18:49, 12.03s/it][A

loss: tensor(0.8184, device='cuda:1', grad_fn=<NllLossBackward>)



 47%|████▋     | 350/742 [1:09:56<1:18:34, 12.03s/it][A

loss: tensor(0.7359, device='cuda:1', grad_fn=<NllLossBackward>)



 47%|████▋     | 351/742 [1:10:08<1:18:03, 11.98s/it][A

loss: tensor(0.6677, device='cuda:1', grad_fn=<NllLossBackward>)



 47%|████▋     | 352/742 [1:10:20<1:17:56, 11.99s/it][A

loss: tensor(0.6450, device='cuda:1', grad_fn=<NllLossBackward>)



 48%|████▊     | 353/742 [1:10:32<1:17:47, 12.00s/it][A

loss: tensor(0.8019, device='cuda:1', grad_fn=<NllLossBackward>)



 48%|████▊     | 354/742 [1:10:44<1:17:36, 12.00s/it][A

loss: tensor(0.6679, device='cuda:1', grad_fn=<NllLossBackward>)



 48%|████▊     | 355/742 [1:10:56<1:17:14, 11.97s/it][A

loss: tensor(0.7055, device='cuda:1', grad_fn=<NllLossBackward>)



 48%|████▊     | 356/742 [1:11:08<1:17:08, 11.99s/it][A

loss: tensor(0.6593, device='cuda:1', grad_fn=<NllLossBackward>)



 48%|████▊     | 357/742 [1:11:20<1:16:56, 11.99s/it][A

loss: tensor(0.8233, device='cuda:1', grad_fn=<NllLossBackward>)



 48%|████▊     | 358/742 [1:11:32<1:16:41, 11.98s/it][A

loss: tensor(0.7714, device='cuda:1', grad_fn=<NllLossBackward>)



 48%|████▊     | 359/742 [1:11:44<1:16:13, 11.94s/it][A

loss: tensor(0.6214, device='cuda:1', grad_fn=<NllLossBackward>)



 49%|████▊     | 360/742 [1:11:56<1:16:22, 12.00s/it][A

loss: tensor(0.6967, device='cuda:1', grad_fn=<NllLossBackward>)



 49%|████▊     | 361/742 [1:12:08<1:16:14, 12.01s/it][A

loss: tensor(0.7171, device='cuda:1', grad_fn=<NllLossBackward>)



 49%|████▉     | 362/742 [1:12:20<1:16:00, 12.00s/it][A

loss: tensor(0.5906, device='cuda:1', grad_fn=<NllLossBackward>)



 49%|████▉     | 363/742 [1:12:32<1:15:53, 12.01s/it][A

loss: tensor(0.7729, device='cuda:1', grad_fn=<NllLossBackward>)



 49%|████▉     | 364/742 [1:12:44<1:15:23, 11.97s/it][A

loss: tensor(0.7513, device='cuda:1', grad_fn=<NllLossBackward>)



 49%|████▉     | 365/742 [1:12:56<1:15:13, 11.97s/it][A

loss: tensor(0.6680, device='cuda:1', grad_fn=<NllLossBackward>)



 49%|████▉     | 366/742 [1:13:08<1:15:08, 11.99s/it][A

loss: tensor(0.6108, device='cuda:1', grad_fn=<NllLossBackward>)



 49%|████▉     | 367/742 [1:13:20<1:14:59, 12.00s/it][A

loss: tensor(0.7299, device='cuda:1', grad_fn=<NllLossBackward>)



 50%|████▉     | 368/742 [1:13:32<1:14:40, 11.98s/it][A

loss: tensor(0.7160, device='cuda:1', grad_fn=<NllLossBackward>)



 50%|████▉     | 369/742 [1:13:44<1:14:28, 11.98s/it][A

loss: tensor(0.7587, device='cuda:1', grad_fn=<NllLossBackward>)



 50%|████▉     | 370/742 [1:13:56<1:14:19, 11.99s/it][A

loss: tensor(0.6722, device='cuda:1', grad_fn=<NllLossBackward>)



 50%|█████     | 371/742 [1:14:08<1:14:22, 12.03s/it][A

loss: tensor(0.8192, device='cuda:1', grad_fn=<NllLossBackward>)



 50%|█████     | 372/742 [1:14:20<1:14:22, 12.06s/it][A

loss: tensor(0.6510, device='cuda:1', grad_fn=<NllLossBackward>)



 50%|█████     | 373/742 [1:14:32<1:14:09, 12.06s/it][A

loss: tensor(0.5891, device='cuda:1', grad_fn=<NllLossBackward>)



 50%|█████     | 374/742 [1:14:44<1:14:08, 12.09s/it][A

loss: tensor(0.5983, device='cuda:1', grad_fn=<NllLossBackward>)



 51%|█████     | 375/742 [1:14:56<1:14:04, 12.11s/it][A

loss: tensor(0.7704, device='cuda:1', grad_fn=<NllLossBackward>)



 51%|█████     | 376/742 [1:15:09<1:13:43, 12.09s/it][A

loss: tensor(0.7499, device='cuda:1', grad_fn=<NllLossBackward>)



 51%|█████     | 377/742 [1:15:20<1:13:07, 12.02s/it][A

loss: tensor(0.7314, device='cuda:1', grad_fn=<NllLossBackward>)



 51%|█████     | 378/742 [1:15:32<1:12:49, 12.00s/it][A

loss: tensor(0.6616, device='cuda:1', grad_fn=<NllLossBackward>)



 51%|█████     | 379/742 [1:15:44<1:12:29, 11.98s/it][A

loss: tensor(0.7184, device='cuda:1', grad_fn=<NllLossBackward>)



 51%|█████     | 380/742 [1:15:56<1:12:29, 12.01s/it][A

loss: tensor(0.6747, device='cuda:1', grad_fn=<NllLossBackward>)



 51%|█████▏    | 381/742 [1:16:08<1:12:24, 12.04s/it][A

loss: tensor(0.5481, device='cuda:1', grad_fn=<NllLossBackward>)



 51%|█████▏    | 382/742 [1:16:20<1:11:54, 11.99s/it][A

loss: tensor(0.6066, device='cuda:1', grad_fn=<NllLossBackward>)



 52%|█████▏    | 383/742 [1:16:32<1:11:56, 12.02s/it][A

loss: tensor(0.6463, device='cuda:1', grad_fn=<NllLossBackward>)



 52%|█████▏    | 384/742 [1:16:44<1:11:41, 12.01s/it][A

loss: tensor(0.6279, device='cuda:1', grad_fn=<NllLossBackward>)



 52%|█████▏    | 385/742 [1:16:57<1:11:36, 12.04s/it][A

loss: tensor(0.7836, device='cuda:1', grad_fn=<NllLossBackward>)



 52%|█████▏    | 386/742 [1:17:08<1:11:10, 12.00s/it][A

loss: tensor(0.7943, device='cuda:1', grad_fn=<NllLossBackward>)



 52%|█████▏    | 387/742 [1:17:20<1:10:53, 11.98s/it][A

loss: tensor(0.7471, device='cuda:1', grad_fn=<NllLossBackward>)



 52%|█████▏    | 388/742 [1:17:32<1:10:47, 12.00s/it][A

loss: tensor(0.6097, device='cuda:1', grad_fn=<NllLossBackward>)



 52%|█████▏    | 389/742 [1:17:44<1:10:33, 11.99s/it][A

loss: tensor(0.7841, device='cuda:1', grad_fn=<NllLossBackward>)



 53%|█████▎    | 390/742 [1:17:56<1:10:14, 11.97s/it][A

loss: tensor(0.7073, device='cuda:1', grad_fn=<NllLossBackward>)



 53%|█████▎    | 391/742 [1:18:08<1:10:07, 11.99s/it][A

loss: tensor(0.6988, device='cuda:1', grad_fn=<NllLossBackward>)



 53%|█████▎    | 392/742 [1:18:20<1:09:55, 11.99s/it][A

loss: tensor(0.6615, device='cuda:1', grad_fn=<NllLossBackward>)



 53%|█████▎    | 393/742 [1:18:32<1:09:42, 11.99s/it][A

loss: tensor(0.7104, device='cuda:1', grad_fn=<NllLossBackward>)



 53%|█████▎    | 394/742 [1:18:44<1:09:24, 11.97s/it][A

loss: tensor(0.6874, device='cuda:1', grad_fn=<NllLossBackward>)



 53%|█████▎    | 395/742 [1:18:56<1:08:59, 11.93s/it][A

loss: tensor(0.6874, device='cuda:1', grad_fn=<NllLossBackward>)



 53%|█████▎    | 396/742 [1:19:08<1:08:47, 11.93s/it][A

loss: tensor(0.7857, device='cuda:1', grad_fn=<NllLossBackward>)



 54%|█████▎    | 397/742 [1:19:20<1:08:57, 11.99s/it][A

loss: tensor(0.7096, device='cuda:1', grad_fn=<NllLossBackward>)



 54%|█████▎    | 398/742 [1:19:32<1:09:08, 12.06s/it][A

loss: tensor(0.6335, device='cuda:1', grad_fn=<NllLossBackward>)



 54%|█████▍    | 399/742 [1:19:44<1:08:54, 12.05s/it][A

loss: tensor(0.5797, device='cuda:1', grad_fn=<NllLossBackward>)



 54%|█████▍    | 400/742 [1:19:57<1:08:57, 12.10s/it][A

loss: tensor(0.5974, device='cuda:1', grad_fn=<NllLossBackward>)



 54%|█████▍    | 401/742 [1:20:09<1:08:54, 12.13s/it][A

loss: tensor(0.7868, device='cuda:1', grad_fn=<NllLossBackward>)



 54%|█████▍    | 402/742 [1:20:21<1:08:26, 12.08s/it][A

loss: tensor(0.5732, device='cuda:1', grad_fn=<NllLossBackward>)



 54%|█████▍    | 403/742 [1:20:33<1:07:54, 12.02s/it][A

loss: tensor(0.6180, device='cuda:1', grad_fn=<NllLossBackward>)



 54%|█████▍    | 404/742 [1:20:45<1:07:33, 11.99s/it][A

loss: tensor(0.6449, device='cuda:1', grad_fn=<NllLossBackward>)



 55%|█████▍    | 405/742 [1:20:57<1:07:28, 12.01s/it][A

loss: tensor(0.7066, device='cuda:1', grad_fn=<NllLossBackward>)



 55%|█████▍    | 406/742 [1:21:09<1:07:41, 12.09s/it][A

loss: tensor(0.5813, device='cuda:1', grad_fn=<NllLossBackward>)



 55%|█████▍    | 407/742 [1:21:21<1:07:33, 12.10s/it][A

loss: tensor(0.6543, device='cuda:1', grad_fn=<NllLossBackward>)



 55%|█████▍    | 408/742 [1:21:33<1:07:00, 12.04s/it][A

loss: tensor(0.6576, device='cuda:1', grad_fn=<NllLossBackward>)



 55%|█████▌    | 409/742 [1:21:45<1:06:55, 12.06s/it][A

loss: tensor(0.6759, device='cuda:1', grad_fn=<NllLossBackward>)



 55%|█████▌    | 410/742 [1:21:57<1:07:03, 12.12s/it][A

loss: tensor(0.6993, device='cuda:1', grad_fn=<NllLossBackward>)



 55%|█████▌    | 411/742 [1:22:10<1:07:08, 12.17s/it][A

loss: tensor(0.7487, device='cuda:1', grad_fn=<NllLossBackward>)



 56%|█████▌    | 412/742 [1:22:22<1:06:48, 12.15s/it][A

loss: tensor(0.5553, device='cuda:1', grad_fn=<NllLossBackward>)



 56%|█████▌    | 413/742 [1:22:34<1:06:29, 12.13s/it][A

loss: tensor(0.6369, device='cuda:1', grad_fn=<NllLossBackward>)



 56%|█████▌    | 414/742 [1:22:46<1:06:03, 12.08s/it][A

loss: tensor(0.7169, device='cuda:1', grad_fn=<NllLossBackward>)



 56%|█████▌    | 415/742 [1:22:58<1:05:39, 12.05s/it][A

loss: tensor(0.7094, device='cuda:1', grad_fn=<NllLossBackward>)



 56%|█████▌    | 416/742 [1:23:10<1:05:09, 11.99s/it][A

loss: tensor(0.5820, device='cuda:1', grad_fn=<NllLossBackward>)



 56%|█████▌    | 417/742 [1:23:22<1:04:55, 11.98s/it][A

loss: tensor(0.7242, device='cuda:1', grad_fn=<NllLossBackward>)



 56%|█████▋    | 418/742 [1:23:34<1:04:55, 12.02s/it][A

loss: tensor(0.6408, device='cuda:1', grad_fn=<NllLossBackward>)



 56%|█████▋    | 419/742 [1:23:46<1:05:03, 12.09s/it][A

loss: tensor(0.4990, device='cuda:1', grad_fn=<NllLossBackward>)



 57%|█████▋    | 420/742 [1:23:58<1:05:02, 12.12s/it][A

loss: tensor(0.5443, device='cuda:1', grad_fn=<NllLossBackward>)



 57%|█████▋    | 421/742 [1:24:10<1:04:28, 12.05s/it][A

loss: tensor(0.7546, device='cuda:1', grad_fn=<NllLossBackward>)



 57%|█████▋    | 422/742 [1:24:22<1:04:20, 12.06s/it][A

loss: tensor(0.6054, device='cuda:1', grad_fn=<NllLossBackward>)



 57%|█████▋    | 423/742 [1:24:34<1:04:12, 12.08s/it][A

loss: tensor(0.6493, device='cuda:1', grad_fn=<NllLossBackward>)



 57%|█████▋    | 424/742 [1:24:46<1:04:05, 12.09s/it][A

loss: tensor(0.8753, device='cuda:1', grad_fn=<NllLossBackward>)



 57%|█████▋    | 425/742 [1:24:58<1:04:00, 12.11s/it][A

loss: tensor(0.7236, device='cuda:1', grad_fn=<NllLossBackward>)



 57%|█████▋    | 426/742 [1:25:11<1:03:46, 12.11s/it][A

loss: tensor(0.5904, device='cuda:1', grad_fn=<NllLossBackward>)



 58%|█████▊    | 427/742 [1:25:23<1:03:25, 12.08s/it][A

loss: tensor(0.7488, device='cuda:1', grad_fn=<NllLossBackward>)



 58%|█████▊    | 428/742 [1:25:35<1:03:04, 12.05s/it][A

loss: tensor(0.6071, device='cuda:1', grad_fn=<NllLossBackward>)



 58%|█████▊    | 429/742 [1:25:47<1:02:51, 12.05s/it][A

loss: tensor(0.8178, device='cuda:1', grad_fn=<NllLossBackward>)



 58%|█████▊    | 430/742 [1:25:58<1:02:24, 12.00s/it][A

loss: tensor(0.7686, device='cuda:1', grad_fn=<NllLossBackward>)



 58%|█████▊    | 431/742 [1:26:11<1:02:14, 12.01s/it][A

loss: tensor(0.8050, device='cuda:1', grad_fn=<NllLossBackward>)



 58%|█████▊    | 432/742 [1:26:22<1:01:51, 11.97s/it][A

loss: tensor(0.6748, device='cuda:1', grad_fn=<NllLossBackward>)



 58%|█████▊    | 433/742 [1:26:34<1:01:37, 11.97s/it][A

loss: tensor(0.7856, device='cuda:1', grad_fn=<NllLossBackward>)



 58%|█████▊    | 434/742 [1:26:46<1:01:29, 11.98s/it][A

loss: tensor(0.7297, device='cuda:1', grad_fn=<NllLossBackward>)



 59%|█████▊    | 435/742 [1:26:58<1:01:26, 12.01s/it][A

loss: tensor(0.7601, device='cuda:1', grad_fn=<NllLossBackward>)



 59%|█████▉    | 436/742 [1:27:10<1:01:16, 12.01s/it][A

loss: tensor(0.6478, device='cuda:1', grad_fn=<NllLossBackward>)



 59%|█████▉    | 437/742 [1:27:22<1:01:04, 12.02s/it][A

loss: tensor(0.6857, device='cuda:1', grad_fn=<NllLossBackward>)



 59%|█████▉    | 438/742 [1:27:34<1:00:41, 11.98s/it][A

loss: tensor(0.7059, device='cuda:1', grad_fn=<NllLossBackward>)



 59%|█████▉    | 439/742 [1:27:46<1:00:33, 11.99s/it][A

loss: tensor(0.7111, device='cuda:1', grad_fn=<NllLossBackward>)



 59%|█████▉    | 440/742 [1:27:58<1:00:20, 11.99s/it][A

loss: tensor(0.6377, device='cuda:1', grad_fn=<NllLossBackward>)



 59%|█████▉    | 441/742 [1:28:10<1:00:18, 12.02s/it][A

loss: tensor(0.6278, device='cuda:1', grad_fn=<NllLossBackward>)



 60%|█████▉    | 442/742 [1:28:23<1:00:23, 12.08s/it][A

loss: tensor(0.7201, device='cuda:1', grad_fn=<NllLossBackward>)



 60%|█████▉    | 443/742 [1:28:35<1:00:14, 12.09s/it][A

loss: tensor(0.6393, device='cuda:1', grad_fn=<NllLossBackward>)



 60%|█████▉    | 444/742 [1:28:47<1:00:17, 12.14s/it][A

loss: tensor(0.8617, device='cuda:1', grad_fn=<NllLossBackward>)



 60%|█████▉    | 445/742 [1:28:59<1:00:10, 12.16s/it][A

loss: tensor(0.7728, device='cuda:1', grad_fn=<NllLossBackward>)



 60%|██████    | 446/742 [1:29:11<59:43, 12.11s/it]  [A

loss: tensor(0.7058, device='cuda:1', grad_fn=<NllLossBackward>)



 60%|██████    | 447/742 [1:29:23<59:20, 12.07s/it][A

loss: tensor(0.6673, device='cuda:1', grad_fn=<NllLossBackward>)



 60%|██████    | 448/742 [1:29:35<59:01, 12.04s/it][A

loss: tensor(0.6100, device='cuda:1', grad_fn=<NllLossBackward>)



 61%|██████    | 449/742 [1:29:47<58:51, 12.05s/it][A

loss: tensor(0.6919, device='cuda:1', grad_fn=<NllLossBackward>)



 61%|██████    | 450/742 [1:29:59<58:40, 12.06s/it][A

loss: tensor(0.6584, device='cuda:1', grad_fn=<NllLossBackward>)



 61%|██████    | 451/742 [1:30:11<58:26, 12.05s/it][A

loss: tensor(0.7379, device='cuda:1', grad_fn=<NllLossBackward>)



 61%|██████    | 452/742 [1:30:23<58:01, 12.01s/it][A

loss: tensor(0.7202, device='cuda:1', grad_fn=<NllLossBackward>)



 61%|██████    | 453/742 [1:30:35<57:49, 12.01s/it][A

loss: tensor(0.6561, device='cuda:1', grad_fn=<NllLossBackward>)



 61%|██████    | 454/742 [1:30:47<57:49, 12.05s/it][A

loss: tensor(0.6624, device='cuda:1', grad_fn=<NllLossBackward>)



 61%|██████▏   | 455/742 [1:31:00<57:51, 12.10s/it][A

loss: tensor(0.6970, device='cuda:1', grad_fn=<NllLossBackward>)



 61%|██████▏   | 456/742 [1:31:12<57:41, 12.10s/it][A

loss: tensor(0.7211, device='cuda:1', grad_fn=<NllLossBackward>)



 62%|██████▏   | 457/742 [1:31:24<57:26, 12.09s/it][A

loss: tensor(0.7173, device='cuda:1', grad_fn=<NllLossBackward>)



 62%|██████▏   | 458/742 [1:31:36<57:08, 12.07s/it][A

loss: tensor(0.7723, device='cuda:1', grad_fn=<NllLossBackward>)



 62%|██████▏   | 459/742 [1:31:48<56:53, 12.06s/it][A

loss: tensor(0.6638, device='cuda:1', grad_fn=<NllLossBackward>)



 62%|██████▏   | 460/742 [1:32:00<56:43, 12.07s/it][A

loss: tensor(0.6968, device='cuda:1', grad_fn=<NllLossBackward>)



 62%|██████▏   | 461/742 [1:32:12<56:17, 12.02s/it][A

loss: tensor(0.7309, device='cuda:1', grad_fn=<NllLossBackward>)



 62%|██████▏   | 462/742 [1:32:24<56:06, 12.02s/it][A

loss: tensor(0.7459, device='cuda:1', grad_fn=<NllLossBackward>)



 62%|██████▏   | 463/742 [1:32:36<55:58, 12.04s/it][A

loss: tensor(0.7535, device='cuda:1', grad_fn=<NllLossBackward>)



 63%|██████▎   | 464/742 [1:32:48<56:07, 12.11s/it][A

loss: tensor(0.7305, device='cuda:1', grad_fn=<NllLossBackward>)



 63%|██████▎   | 465/742 [1:33:00<55:38, 12.05s/it][A

loss: tensor(0.6807, device='cuda:1', grad_fn=<NllLossBackward>)



 63%|██████▎   | 466/742 [1:33:12<55:18, 12.02s/it][A

loss: tensor(0.6840, device='cuda:1', grad_fn=<NllLossBackward>)



 63%|██████▎   | 467/742 [1:33:24<55:01, 12.01s/it][A

loss: tensor(0.6871, device='cuda:1', grad_fn=<NllLossBackward>)



 63%|██████▎   | 468/742 [1:33:36<54:51, 12.01s/it][A

loss: tensor(0.6865, device='cuda:1', grad_fn=<NllLossBackward>)



 63%|██████▎   | 469/742 [1:33:48<54:45, 12.03s/it][A

loss: tensor(0.6507, device='cuda:1', grad_fn=<NllLossBackward>)



 63%|██████▎   | 470/742 [1:34:00<54:46, 12.08s/it][A

loss: tensor(0.7654, device='cuda:1', grad_fn=<NllLossBackward>)



 63%|██████▎   | 471/742 [1:34:12<54:22, 12.04s/it][A

loss: tensor(0.6029, device='cuda:1', grad_fn=<NllLossBackward>)



 64%|██████▎   | 472/742 [1:34:24<54:09, 12.03s/it][A

loss: tensor(0.6284, device='cuda:1', grad_fn=<NllLossBackward>)



 64%|██████▎   | 473/742 [1:34:36<54:02, 12.05s/it][A

loss: tensor(0.7630, device='cuda:1', grad_fn=<NllLossBackward>)



 64%|██████▍   | 474/742 [1:34:49<53:49, 12.05s/it][A

loss: tensor(0.6949, device='cuda:1', grad_fn=<NllLossBackward>)



 64%|██████▍   | 475/742 [1:35:01<53:43, 12.07s/it][A

loss: tensor(0.6641, device='cuda:1', grad_fn=<NllLossBackward>)



 64%|██████▍   | 476/742 [1:35:13<53:25, 12.05s/it][A

loss: tensor(0.6992, device='cuda:1', grad_fn=<NllLossBackward>)



 64%|██████▍   | 477/742 [1:35:25<53:07, 12.03s/it][A

loss: tensor(0.6769, device='cuda:1', grad_fn=<NllLossBackward>)



 64%|██████▍   | 478/742 [1:35:36<52:42, 11.98s/it][A

loss: tensor(0.7015, device='cuda:1', grad_fn=<NllLossBackward>)



 65%|██████▍   | 479/742 [1:35:48<52:29, 11.98s/it][A

loss: tensor(0.6887, device='cuda:1', grad_fn=<NllLossBackward>)



 65%|██████▍   | 480/742 [1:36:00<52:22, 11.99s/it][A

loss: tensor(0.6438, device='cuda:1', grad_fn=<NllLossBackward>)



 65%|██████▍   | 481/742 [1:36:13<52:20, 12.03s/it][A

loss: tensor(0.8012, device='cuda:1', grad_fn=<NllLossBackward>)



 65%|██████▍   | 482/742 [1:36:24<51:53, 11.97s/it][A

loss: tensor(0.6594, device='cuda:1', grad_fn=<NllLossBackward>)



 65%|██████▌   | 483/742 [1:36:36<51:36, 11.95s/it][A

loss: tensor(0.7217, device='cuda:1', grad_fn=<NllLossBackward>)



 65%|██████▌   | 484/742 [1:36:48<51:22, 11.95s/it][A

loss: tensor(0.7813, device='cuda:1', grad_fn=<NllLossBackward>)



 65%|██████▌   | 485/742 [1:37:00<51:18, 11.98s/it][A

loss: tensor(0.6695, device='cuda:1', grad_fn=<NllLossBackward>)



 65%|██████▌   | 486/742 [1:37:12<51:07, 11.98s/it][A

loss: tensor(0.6420, device='cuda:1', grad_fn=<NllLossBackward>)



 66%|██████▌   | 487/742 [1:37:24<50:44, 11.94s/it][A

loss: tensor(0.6963, device='cuda:1', grad_fn=<NllLossBackward>)



 66%|██████▌   | 488/742 [1:37:36<50:36, 11.96s/it][A

loss: tensor(0.6711, device='cuda:1', grad_fn=<NllLossBackward>)



 66%|██████▌   | 489/742 [1:37:48<50:32, 11.99s/it][A

loss: tensor(0.7371, device='cuda:1', grad_fn=<NllLossBackward>)



 66%|██████▌   | 490/742 [1:38:00<50:18, 11.98s/it][A

loss: tensor(0.7117, device='cuda:1', grad_fn=<NllLossBackward>)



 66%|██████▌   | 491/742 [1:38:12<49:55, 11.93s/it][A

loss: tensor(0.7391, device='cuda:1', grad_fn=<NllLossBackward>)



 66%|██████▋   | 492/742 [1:38:24<49:39, 11.92s/it][A

loss: tensor(0.6587, device='cuda:1', grad_fn=<NllLossBackward>)



 66%|██████▋   | 493/742 [1:38:36<49:27, 11.92s/it][A

loss: tensor(0.6621, device='cuda:1', grad_fn=<NllLossBackward>)



 67%|██████▋   | 494/742 [1:38:48<49:19, 11.93s/it][A

loss: tensor(0.6430, device='cuda:1', grad_fn=<NllLossBackward>)



 67%|██████▋   | 495/742 [1:39:00<49:17, 11.97s/it][A

loss: tensor(0.7280, device='cuda:1', grad_fn=<NllLossBackward>)



 67%|██████▋   | 496/742 [1:39:12<49:19, 12.03s/it][A

loss: tensor(0.6642, device='cuda:1', grad_fn=<NllLossBackward>)



 67%|██████▋   | 497/742 [1:39:24<49:07, 12.03s/it][A

loss: tensor(0.6484, device='cuda:1', grad_fn=<NllLossBackward>)



 67%|██████▋   | 498/742 [1:39:36<48:45, 11.99s/it][A

loss: tensor(0.7159, device='cuda:1', grad_fn=<NllLossBackward>)



 67%|██████▋   | 499/742 [1:39:48<48:41, 12.02s/it][A

loss: tensor(0.6443, device='cuda:1', grad_fn=<NllLossBackward>)



 67%|██████▋   | 500/742 [1:40:00<48:12, 11.95s/it][A

loss: tensor(0.6373, device='cuda:1', grad_fn=<NllLossBackward>)



 68%|██████▊   | 501/742 [1:40:12<47:59, 11.95s/it][A

loss: tensor(0.6387, device='cuda:1', grad_fn=<NllLossBackward>)



 68%|██████▊   | 502/742 [1:40:24<47:44, 11.94s/it][A

loss: tensor(0.6655, device='cuda:1', grad_fn=<NllLossBackward>)



 68%|██████▊   | 503/742 [1:40:36<47:36, 11.95s/it][A

loss: tensor(0.7330, device='cuda:1', grad_fn=<NllLossBackward>)



 68%|██████▊   | 504/742 [1:40:48<47:33, 11.99s/it][A

loss: tensor(0.6263, device='cuda:1', grad_fn=<NllLossBackward>)



 68%|██████▊   | 505/742 [1:41:00<47:15, 11.96s/it][A

loss: tensor(0.6478, device='cuda:1', grad_fn=<NllLossBackward>)



 68%|██████▊   | 506/742 [1:41:12<47:10, 11.99s/it][A

loss: tensor(0.6063, device='cuda:1', grad_fn=<NllLossBackward>)



 68%|██████▊   | 507/742 [1:41:24<46:59, 12.00s/it][A

loss: tensor(0.6381, device='cuda:1', grad_fn=<NllLossBackward>)



 68%|██████▊   | 508/742 [1:41:36<46:48, 12.00s/it][A

loss: tensor(0.7344, device='cuda:1', grad_fn=<NllLossBackward>)



 69%|██████▊   | 509/742 [1:41:48<46:30, 11.98s/it][A

loss: tensor(0.5803, device='cuda:1', grad_fn=<NllLossBackward>)



 69%|██████▊   | 510/742 [1:42:00<46:17, 11.97s/it][A

loss: tensor(0.6974, device='cuda:1', grad_fn=<NllLossBackward>)



 69%|██████▉   | 511/742 [1:42:12<46:08, 11.99s/it][A

loss: tensor(0.6881, device='cuda:1', grad_fn=<NllLossBackward>)



 69%|██████▉   | 512/742 [1:42:24<46:03, 12.01s/it][A

loss: tensor(0.6659, device='cuda:1', grad_fn=<NllLossBackward>)



 69%|██████▉   | 513/742 [1:42:36<45:46, 11.99s/it][A

loss: tensor(0.5941, device='cuda:1', grad_fn=<NllLossBackward>)



 69%|██████▉   | 514/742 [1:42:48<45:38, 12.01s/it][A

loss: tensor(0.5273, device='cuda:1', grad_fn=<NllLossBackward>)



 69%|██████▉   | 515/742 [1:43:00<45:24, 12.00s/it][A

loss: tensor(0.5592, device='cuda:1', grad_fn=<NllLossBackward>)



 70%|██████▉   | 516/742 [1:43:12<45:22, 12.04s/it][A

loss: tensor(0.6615, device='cuda:1', grad_fn=<NllLossBackward>)



 70%|██████▉   | 517/742 [1:43:24<45:11, 12.05s/it][A

loss: tensor(0.6325, device='cuda:1', grad_fn=<NllLossBackward>)



 70%|██████▉   | 518/742 [1:43:36<45:06, 12.08s/it][A

loss: tensor(0.7128, device='cuda:1', grad_fn=<NllLossBackward>)



 70%|██████▉   | 519/742 [1:43:48<45:03, 12.12s/it][A

loss: tensor(0.7294, device='cuda:1', grad_fn=<NllLossBackward>)



 70%|███████   | 520/742 [1:44:00<44:53, 12.13s/it][A

loss: tensor(0.6605, device='cuda:1', grad_fn=<NllLossBackward>)



 70%|███████   | 521/742 [1:44:13<44:41, 12.13s/it][A

loss: tensor(0.5834, device='cuda:1', grad_fn=<NllLossBackward>)



 70%|███████   | 522/742 [1:44:24<44:10, 12.05s/it][A

loss: tensor(0.6654, device='cuda:1', grad_fn=<NllLossBackward>)



 70%|███████   | 523/742 [1:44:36<43:55, 12.03s/it][A

loss: tensor(0.5984, device='cuda:1', grad_fn=<NllLossBackward>)



 71%|███████   | 524/742 [1:44:48<43:38, 12.01s/it][A

loss: tensor(0.5914, device='cuda:1', grad_fn=<NllLossBackward>)



 71%|███████   | 525/742 [1:45:00<43:23, 12.00s/it][A

loss: tensor(0.7728, device='cuda:1', grad_fn=<NllLossBackward>)



 71%|███████   | 526/742 [1:45:12<43:04, 11.97s/it][A

loss: tensor(0.7875, device='cuda:1', grad_fn=<NllLossBackward>)



 71%|███████   | 527/742 [1:45:24<43:01, 12.01s/it][A

loss: tensor(0.4807, device='cuda:1', grad_fn=<NllLossBackward>)



 71%|███████   | 528/742 [1:45:36<42:46, 11.99s/it][A

loss: tensor(0.7584, device='cuda:1', grad_fn=<NllLossBackward>)



 71%|███████▏  | 529/742 [1:45:48<42:46, 12.05s/it][A

loss: tensor(0.6002, device='cuda:1', grad_fn=<NllLossBackward>)



 71%|███████▏  | 530/742 [1:46:00<42:32, 12.04s/it][A

loss: tensor(0.7603, device='cuda:1', grad_fn=<NllLossBackward>)



 72%|███████▏  | 531/742 [1:46:12<42:10, 11.99s/it][A

loss: tensor(0.6831, device='cuda:1', grad_fn=<NllLossBackward>)



 72%|███████▏  | 532/742 [1:46:24<41:57, 11.99s/it][A

loss: tensor(0.7459, device='cuda:1', grad_fn=<NllLossBackward>)



 72%|███████▏  | 533/742 [1:46:36<41:50, 12.01s/it][A

loss: tensor(0.6704, device='cuda:1', grad_fn=<NllLossBackward>)



 72%|███████▏  | 534/742 [1:46:48<41:41, 12.03s/it][A

loss: tensor(0.6773, device='cuda:1', grad_fn=<NllLossBackward>)



 72%|███████▏  | 535/742 [1:47:00<41:22, 11.99s/it][A

loss: tensor(0.7630, device='cuda:1', grad_fn=<NllLossBackward>)



 72%|███████▏  | 536/742 [1:47:12<41:10, 11.99s/it][A

loss: tensor(0.6925, device='cuda:1', grad_fn=<NllLossBackward>)



 72%|███████▏  | 537/742 [1:47:24<40:58, 11.99s/it][A

loss: tensor(0.8087, device='cuda:1', grad_fn=<NllLossBackward>)



 73%|███████▎  | 538/742 [1:47:36<40:47, 12.00s/it][A

loss: tensor(0.7190, device='cuda:1', grad_fn=<NllLossBackward>)



 73%|███████▎  | 539/742 [1:47:48<40:35, 12.00s/it][A

loss: tensor(0.6158, device='cuda:1', grad_fn=<NllLossBackward>)



 73%|███████▎  | 540/742 [1:48:00<40:12, 11.94s/it][A

loss: tensor(0.7209, device='cuda:1', grad_fn=<NllLossBackward>)



 73%|███████▎  | 541/742 [1:48:12<39:55, 11.92s/it][A

loss: tensor(0.7692, device='cuda:1', grad_fn=<NllLossBackward>)



 73%|███████▎  | 542/742 [1:48:24<39:41, 11.91s/it][A

loss: tensor(0.6530, device='cuda:1', grad_fn=<NllLossBackward>)



 73%|███████▎  | 543/742 [1:48:36<39:33, 11.93s/it][A

loss: tensor(0.6543, device='cuda:1', grad_fn=<NllLossBackward>)



 73%|███████▎  | 544/742 [1:48:48<39:22, 11.93s/it][A

loss: tensor(0.7171, device='cuda:1', grad_fn=<NllLossBackward>)



 73%|███████▎  | 545/742 [1:49:00<39:19, 11.98s/it][A

loss: tensor(0.5425, device='cuda:1', grad_fn=<NllLossBackward>)



 74%|███████▎  | 546/742 [1:49:12<39:10, 11.99s/it][A

loss: tensor(0.7812, device='cuda:1', grad_fn=<NllLossBackward>)



 74%|███████▎  | 547/742 [1:49:24<38:56, 11.98s/it][A

loss: tensor(0.5963, device='cuda:1', grad_fn=<NllLossBackward>)



 74%|███████▍  | 548/742 [1:49:36<38:42, 11.97s/it][A

loss: tensor(0.7646, device='cuda:1', grad_fn=<NllLossBackward>)



 74%|███████▍  | 549/742 [1:49:48<38:35, 12.00s/it][A

loss: tensor(0.7608, device='cuda:1', grad_fn=<NllLossBackward>)



 74%|███████▍  | 550/742 [1:50:00<38:25, 12.01s/it][A

loss: tensor(0.8472, device='cuda:1', grad_fn=<NllLossBackward>)



 74%|███████▍  | 551/742 [1:50:12<38:15, 12.02s/it][A

loss: tensor(0.6075, device='cuda:1', grad_fn=<NllLossBackward>)



 74%|███████▍  | 552/742 [1:50:24<38:06, 12.03s/it][A

loss: tensor(0.6638, device='cuda:1', grad_fn=<NllLossBackward>)



 75%|███████▍  | 553/742 [1:50:36<37:49, 12.01s/it][A

loss: tensor(0.5282, device='cuda:1', grad_fn=<NllLossBackward>)



 75%|███████▍  | 554/742 [1:50:48<37:39, 12.02s/it][A

loss: tensor(0.6618, device='cuda:1', grad_fn=<NllLossBackward>)



 75%|███████▍  | 555/742 [1:51:00<37:25, 12.01s/it][A

loss: tensor(0.6711, device='cuda:1', grad_fn=<NllLossBackward>)



 75%|███████▍  | 556/742 [1:51:12<37:14, 12.02s/it][A

loss: tensor(0.6269, device='cuda:1', grad_fn=<NllLossBackward>)



 75%|███████▌  | 557/742 [1:51:24<36:57, 11.99s/it][A

loss: tensor(0.6318, device='cuda:1', grad_fn=<NllLossBackward>)



 75%|███████▌  | 558/742 [1:51:36<36:48, 12.00s/it][A

loss: tensor(0.6021, device='cuda:1', grad_fn=<NllLossBackward>)



 75%|███████▌  | 559/742 [1:51:48<36:36, 12.01s/it][A

loss: tensor(0.6086, device='cuda:1', grad_fn=<NllLossBackward>)



 75%|███████▌  | 560/742 [1:52:00<36:28, 12.03s/it][A

loss: tensor(0.6539, device='cuda:1', grad_fn=<NllLossBackward>)



 76%|███████▌  | 561/742 [1:52:12<36:12, 12.00s/it][A

loss: tensor(0.6969, device='cuda:1', grad_fn=<NllLossBackward>)



 76%|███████▌  | 562/742 [1:52:24<36:04, 12.02s/it][A

loss: tensor(0.6625, device='cuda:1', grad_fn=<NllLossBackward>)



 76%|███████▌  | 563/742 [1:52:36<35:49, 12.01s/it][A

loss: tensor(0.6875, device='cuda:1', grad_fn=<NllLossBackward>)



 76%|███████▌  | 564/742 [1:52:48<35:42, 12.04s/it][A

loss: tensor(0.7059, device='cuda:1', grad_fn=<NllLossBackward>)



 76%|███████▌  | 565/742 [1:53:00<35:33, 12.05s/it][A

loss: tensor(0.6454, device='cuda:1', grad_fn=<NllLossBackward>)



 76%|███████▋  | 566/742 [1:53:12<35:18, 12.04s/it][A

loss: tensor(0.7467, device='cuda:1', grad_fn=<NllLossBackward>)



 76%|███████▋  | 567/742 [1:53:24<35:10, 12.06s/it][A

loss: tensor(0.6860, device='cuda:1', grad_fn=<NllLossBackward>)



 77%|███████▋  | 568/742 [1:53:36<34:55, 12.04s/it][A

loss: tensor(0.5964, device='cuda:1', grad_fn=<NllLossBackward>)



 77%|███████▋  | 569/742 [1:53:48<34:40, 12.03s/it][A

loss: tensor(0.6997, device='cuda:1', grad_fn=<NllLossBackward>)



 77%|███████▋  | 570/742 [1:54:00<34:18, 11.97s/it][A

loss: tensor(0.6930, device='cuda:1', grad_fn=<NllLossBackward>)



 77%|███████▋  | 571/742 [1:54:12<34:07, 11.97s/it][A

loss: tensor(0.5443, device='cuda:1', grad_fn=<NllLossBackward>)



 77%|███████▋  | 572/742 [1:54:24<33:59, 12.00s/it][A

loss: tensor(0.7269, device='cuda:1', grad_fn=<NllLossBackward>)



 77%|███████▋  | 573/742 [1:54:36<33:50, 12.01s/it][A

loss: tensor(0.7257, device='cuda:1', grad_fn=<NllLossBackward>)



 77%|███████▋  | 574/742 [1:54:48<33:36, 12.00s/it][A

loss: tensor(0.6384, device='cuda:1', grad_fn=<NllLossBackward>)



 77%|███████▋  | 575/742 [1:55:00<33:26, 12.01s/it][A

loss: tensor(0.7290, device='cuda:1', grad_fn=<NllLossBackward>)



 78%|███████▊  | 576/742 [1:55:12<33:14, 12.01s/it][A

loss: tensor(0.7024, device='cuda:1', grad_fn=<NllLossBackward>)



 78%|███████▊  | 577/742 [1:55:24<33:01, 12.01s/it][A

loss: tensor(0.5659, device='cuda:1', grad_fn=<NllLossBackward>)



 78%|███████▊  | 578/742 [1:55:36<32:48, 12.00s/it][A

loss: tensor(0.7002, device='cuda:1', grad_fn=<NllLossBackward>)



 78%|███████▊  | 579/742 [1:55:48<32:31, 11.97s/it][A

loss: tensor(0.7768, device='cuda:1', grad_fn=<NllLossBackward>)



 78%|███████▊  | 580/742 [1:56:00<32:17, 11.96s/it][A

loss: tensor(0.5957, device='cuda:1', grad_fn=<NllLossBackward>)



 78%|███████▊  | 581/742 [1:56:12<32:02, 11.94s/it][A

loss: tensor(0.7769, device='cuda:1', grad_fn=<NllLossBackward>)



 78%|███████▊  | 582/742 [1:56:24<31:54, 11.97s/it][A

loss: tensor(0.6020, device='cuda:1', grad_fn=<NllLossBackward>)



 79%|███████▊  | 583/742 [1:56:36<31:48, 12.00s/it][A

loss: tensor(0.7153, device='cuda:1', grad_fn=<NllLossBackward>)



 79%|███████▊  | 584/742 [1:56:48<31:30, 11.97s/it][A

loss: tensor(0.7109, device='cuda:1', grad_fn=<NllLossBackward>)



 79%|███████▉  | 585/742 [1:57:00<31:22, 11.99s/it][A

loss: tensor(0.6496, device='cuda:1', grad_fn=<NllLossBackward>)



 79%|███████▉  | 586/742 [1:57:12<31:13, 12.01s/it][A

loss: tensor(0.6064, device='cuda:1', grad_fn=<NllLossBackward>)



 79%|███████▉  | 587/742 [1:57:24<31:05, 12.04s/it][A

loss: tensor(0.7510, device='cuda:1', grad_fn=<NllLossBackward>)



 79%|███████▉  | 588/742 [1:57:36<30:50, 12.01s/it][A

loss: tensor(0.6608, device='cuda:1', grad_fn=<NllLossBackward>)



 79%|███████▉  | 589/742 [1:57:48<30:31, 11.97s/it][A

loss: tensor(0.6802, device='cuda:1', grad_fn=<NllLossBackward>)



 80%|███████▉  | 590/742 [1:58:00<30:19, 11.97s/it][A

loss: tensor(0.5363, device='cuda:1', grad_fn=<NllLossBackward>)



 80%|███████▉  | 591/742 [1:58:12<30:06, 11.97s/it][A

loss: tensor(0.7211, device='cuda:1', grad_fn=<NllLossBackward>)



 80%|███████▉  | 592/742 [1:58:24<29:52, 11.95s/it][A

loss: tensor(0.6859, device='cuda:1', grad_fn=<NllLossBackward>)



 80%|███████▉  | 593/742 [1:58:36<29:47, 12.00s/it][A

loss: tensor(0.6247, device='cuda:1', grad_fn=<NllLossBackward>)



 80%|████████  | 594/742 [1:58:48<29:35, 12.00s/it][A

loss: tensor(0.5617, device='cuda:1', grad_fn=<NllLossBackward>)



 80%|████████  | 595/742 [1:59:00<29:24, 12.00s/it][A

loss: tensor(0.6453, device='cuda:1', grad_fn=<NllLossBackward>)



 80%|████████  | 596/742 [1:59:12<29:13, 12.01s/it][A

loss: tensor(0.6632, device='cuda:1', grad_fn=<NllLossBackward>)



 80%|████████  | 597/742 [1:59:24<28:58, 11.99s/it][A

loss: tensor(0.6641, device='cuda:1', grad_fn=<NllLossBackward>)



 81%|████████  | 598/742 [1:59:36<28:45, 11.98s/it][A

loss: tensor(0.8211, device='cuda:1', grad_fn=<NllLossBackward>)



 81%|████████  | 599/742 [1:59:48<28:33, 11.99s/it][A

loss: tensor(0.6430, device='cuda:1', grad_fn=<NllLossBackward>)



 81%|████████  | 600/742 [2:00:00<28:23, 11.99s/it][A

loss: tensor(0.8351, device='cuda:1', grad_fn=<NllLossBackward>)



 81%|████████  | 601/742 [2:00:12<28:04, 11.95s/it][A

loss: tensor(0.8325, device='cuda:1', grad_fn=<NllLossBackward>)



 81%|████████  | 602/742 [2:00:24<27:52, 11.95s/it][A

loss: tensor(0.7340, device='cuda:1', grad_fn=<NllLossBackward>)



 81%|████████▏ | 603/742 [2:00:36<27:40, 11.95s/it][A

loss: tensor(0.6662, device='cuda:1', grad_fn=<NllLossBackward>)



 81%|████████▏ | 604/742 [2:00:48<27:29, 11.95s/it][A

loss: tensor(0.6353, device='cuda:1', grad_fn=<NllLossBackward>)



 82%|████████▏ | 605/742 [2:01:00<27:17, 11.95s/it][A

loss: tensor(0.7804, device='cuda:1', grad_fn=<NllLossBackward>)



 82%|████████▏ | 606/742 [2:01:12<27:02, 11.93s/it][A

loss: tensor(0.7700, device='cuda:1', grad_fn=<NllLossBackward>)



 82%|████████▏ | 607/742 [2:01:23<26:51, 11.94s/it][A

loss: tensor(0.7961, device='cuda:1', grad_fn=<NllLossBackward>)



 82%|████████▏ | 608/742 [2:01:35<26:40, 11.94s/it][A

loss: tensor(0.6639, device='cuda:1', grad_fn=<NllLossBackward>)



 82%|████████▏ | 609/742 [2:01:47<26:29, 11.95s/it][A

loss: tensor(0.7788, device='cuda:1', grad_fn=<NllLossBackward>)



 82%|████████▏ | 610/742 [2:01:59<26:12, 11.91s/it][A

loss: tensor(0.5842, device='cuda:1', grad_fn=<NllLossBackward>)



 82%|████████▏ | 611/742 [2:02:11<25:59, 11.90s/it][A

loss: tensor(0.5523, device='cuda:1', grad_fn=<NllLossBackward>)



 82%|████████▏ | 612/742 [2:02:23<25:48, 11.91s/it][A

loss: tensor(0.6980, device='cuda:1', grad_fn=<NllLossBackward>)



 83%|████████▎ | 613/742 [2:02:35<25:38, 11.92s/it][A

loss: tensor(0.7422, device='cuda:1', grad_fn=<NllLossBackward>)



 83%|████████▎ | 614/742 [2:02:47<25:23, 11.90s/it][A

loss: tensor(0.8062, device='cuda:1', grad_fn=<NllLossBackward>)



 83%|████████▎ | 615/742 [2:02:59<25:13, 11.92s/it][A

loss: tensor(0.7535, device='cuda:1', grad_fn=<NllLossBackward>)



 83%|████████▎ | 616/742 [2:03:11<25:01, 11.92s/it][A

loss: tensor(0.7275, device='cuda:1', grad_fn=<NllLossBackward>)



 83%|████████▎ | 617/742 [2:03:23<24:51, 11.93s/it][A

loss: tensor(0.7006, device='cuda:1', grad_fn=<NllLossBackward>)



 83%|████████▎ | 618/742 [2:03:35<24:40, 11.94s/it][A

loss: tensor(0.5652, device='cuda:1', grad_fn=<NllLossBackward>)



 83%|████████▎ | 619/742 [2:03:46<24:23, 11.89s/it][A

loss: tensor(0.5233, device='cuda:1', grad_fn=<NllLossBackward>)



 84%|████████▎ | 620/742 [2:03:58<24:11, 11.90s/it][A

loss: tensor(0.5461, device='cuda:1', grad_fn=<NllLossBackward>)



 84%|████████▎ | 621/742 [2:04:10<23:59, 11.90s/it][A

loss: tensor(0.8072, device='cuda:1', grad_fn=<NllLossBackward>)



 84%|████████▍ | 622/742 [2:04:22<23:50, 11.92s/it][A

loss: tensor(0.5090, device='cuda:1', grad_fn=<NllLossBackward>)



 84%|████████▍ | 623/742 [2:04:34<23:34, 11.89s/it][A

loss: tensor(0.7414, device='cuda:1', grad_fn=<NllLossBackward>)



 84%|████████▍ | 624/742 [2:04:46<23:23, 11.90s/it][A

loss: tensor(0.8840, device='cuda:1', grad_fn=<NllLossBackward>)



 84%|████████▍ | 625/742 [2:04:58<23:12, 11.90s/it][A

loss: tensor(0.7026, device='cuda:1', grad_fn=<NllLossBackward>)



 84%|████████▍ | 626/742 [2:05:10<23:01, 11.91s/it][A

loss: tensor(0.4947, device='cuda:1', grad_fn=<NllLossBackward>)



 85%|████████▍ | 627/742 [2:05:22<22:48, 11.90s/it][A

loss: tensor(0.7682, device='cuda:1', grad_fn=<NllLossBackward>)



 85%|████████▍ | 628/742 [2:05:34<22:36, 11.90s/it][A

loss: tensor(0.6575, device='cuda:1', grad_fn=<NllLossBackward>)



 85%|████████▍ | 629/742 [2:05:45<22:24, 11.89s/it][A

loss: tensor(0.7269, device='cuda:1', grad_fn=<NllLossBackward>)



 85%|████████▍ | 630/742 [2:05:57<22:12, 11.90s/it][A

loss: tensor(0.6230, device='cuda:1', grad_fn=<NllLossBackward>)



 85%|████████▌ | 631/742 [2:06:09<22:02, 11.91s/it][A

loss: tensor(0.7912, device='cuda:1', grad_fn=<NllLossBackward>)



 85%|████████▌ | 632/742 [2:06:21<21:48, 11.89s/it][A

loss: tensor(0.6073, device='cuda:1', grad_fn=<NllLossBackward>)



 85%|████████▌ | 633/742 [2:06:33<21:37, 11.90s/it][A

loss: tensor(0.7579, device='cuda:1', grad_fn=<NllLossBackward>)



 85%|████████▌ | 634/742 [2:06:45<21:27, 11.92s/it][A

loss: tensor(0.7058, device='cuda:1', grad_fn=<NllLossBackward>)



 86%|████████▌ | 635/742 [2:06:57<21:14, 11.91s/it][A

loss: tensor(0.7229, device='cuda:1', grad_fn=<NllLossBackward>)



 86%|████████▌ | 636/742 [2:07:09<20:58, 11.88s/it][A

loss: tensor(0.5288, device='cuda:1', grad_fn=<NllLossBackward>)



 86%|████████▌ | 637/742 [2:07:21<20:48, 11.89s/it][A

loss: tensor(0.6360, device='cuda:1', grad_fn=<NllLossBackward>)



 86%|████████▌ | 638/742 [2:07:33<20:37, 11.90s/it][A

loss: tensor(0.6866, device='cuda:1', grad_fn=<NllLossBackward>)



 86%|████████▌ | 639/742 [2:07:45<20:28, 11.93s/it][A

loss: tensor(0.6683, device='cuda:1', grad_fn=<NllLossBackward>)



 86%|████████▋ | 640/742 [2:07:56<20:12, 11.89s/it][A

loss: tensor(0.7687, device='cuda:1', grad_fn=<NllLossBackward>)



 86%|████████▋ | 641/742 [2:08:08<20:01, 11.89s/it][A

loss: tensor(0.6709, device='cuda:1', grad_fn=<NllLossBackward>)



 87%|████████▋ | 642/742 [2:08:20<19:50, 11.91s/it][A

loss: tensor(0.6701, device='cuda:1', grad_fn=<NllLossBackward>)



 87%|████████▋ | 643/742 [2:08:32<19:39, 11.92s/it][A

loss: tensor(0.7634, device='cuda:1', grad_fn=<NllLossBackward>)



 87%|████████▋ | 644/742 [2:08:44<19:28, 11.92s/it][A

loss: tensor(0.6855, device='cuda:1', grad_fn=<NllLossBackward>)



 87%|████████▋ | 645/742 [2:08:56<19:13, 11.89s/it][A

loss: tensor(0.6498, device='cuda:1', grad_fn=<NllLossBackward>)



 87%|████████▋ | 646/742 [2:09:08<19:02, 11.90s/it][A

loss: tensor(0.5975, device='cuda:1', grad_fn=<NllLossBackward>)



 87%|████████▋ | 647/742 [2:09:20<18:52, 11.92s/it][A

loss: tensor(0.5694, device='cuda:1', grad_fn=<NllLossBackward>)



 87%|████████▋ | 648/742 [2:09:32<18:41, 11.93s/it][A

loss: tensor(0.6279, device='cuda:1', grad_fn=<NllLossBackward>)



 87%|████████▋ | 649/742 [2:09:44<18:25, 11.89s/it][A

loss: tensor(0.6835, device='cuda:1', grad_fn=<NllLossBackward>)



 88%|████████▊ | 650/742 [2:09:55<18:13, 11.88s/it][A

loss: tensor(0.7008, device='cuda:1', grad_fn=<NllLossBackward>)



 88%|████████▊ | 651/742 [2:10:07<18:02, 11.89s/it][A

loss: tensor(0.7399, device='cuda:1', grad_fn=<NllLossBackward>)



 88%|████████▊ | 652/742 [2:10:19<17:50, 11.90s/it][A

loss: tensor(0.7001, device='cuda:1', grad_fn=<NllLossBackward>)



 88%|████████▊ | 653/742 [2:10:31<17:37, 11.88s/it][A

loss: tensor(0.7169, device='cuda:1', grad_fn=<NllLossBackward>)



 88%|████████▊ | 654/742 [2:10:43<17:25, 11.88s/it][A

loss: tensor(0.6005, device='cuda:1', grad_fn=<NllLossBackward>)



 88%|████████▊ | 655/742 [2:10:55<17:14, 11.89s/it][A

loss: tensor(0.7023, device='cuda:1', grad_fn=<NllLossBackward>)



 88%|████████▊ | 656/742 [2:11:07<17:02, 11.89s/it][A

loss: tensor(0.7159, device='cuda:1', grad_fn=<NllLossBackward>)



 89%|████████▊ | 657/742 [2:11:19<16:53, 11.92s/it][A

loss: tensor(0.6431, device='cuda:1', grad_fn=<NllLossBackward>)



 89%|████████▊ | 658/742 [2:11:31<16:40, 11.91s/it][A

loss: tensor(0.5768, device='cuda:1', grad_fn=<NllLossBackward>)



 89%|████████▉ | 659/742 [2:11:43<16:28, 11.91s/it][A

loss: tensor(0.7419, device='cuda:1', grad_fn=<NllLossBackward>)



 89%|████████▉ | 660/742 [2:11:54<16:16, 11.90s/it][A

loss: tensor(0.7386, device='cuda:1', grad_fn=<NllLossBackward>)



 89%|████████▉ | 661/742 [2:12:06<16:05, 11.92s/it][A

loss: tensor(0.8090, device='cuda:1', grad_fn=<NllLossBackward>)



 89%|████████▉ | 662/742 [2:12:18<15:53, 11.92s/it][A

loss: tensor(0.6263, device='cuda:1', grad_fn=<NllLossBackward>)



 89%|████████▉ | 663/742 [2:12:30<15:38, 11.88s/it][A

loss: tensor(0.7043, device='cuda:1', grad_fn=<NllLossBackward>)



 89%|████████▉ | 664/742 [2:12:42<15:27, 11.89s/it][A

loss: tensor(0.7385, device='cuda:1', grad_fn=<NllLossBackward>)



 90%|████████▉ | 665/742 [2:12:54<15:16, 11.91s/it][A

loss: tensor(0.6919, device='cuda:1', grad_fn=<NllLossBackward>)



 90%|████████▉ | 666/742 [2:13:06<15:05, 11.91s/it][A

loss: tensor(0.7236, device='cuda:1', grad_fn=<NllLossBackward>)



 90%|████████▉ | 667/742 [2:13:18<14:52, 11.91s/it][A

loss: tensor(0.6735, device='cuda:1', grad_fn=<NllLossBackward>)



 90%|█████████ | 668/742 [2:13:30<14:40, 11.90s/it][A

loss: tensor(0.6177, device='cuda:1', grad_fn=<NllLossBackward>)



 90%|█████████ | 669/742 [2:13:42<14:28, 11.90s/it][A

loss: tensor(0.7062, device='cuda:1', grad_fn=<NllLossBackward>)



 90%|█████████ | 670/742 [2:13:53<14:16, 11.89s/it][A

loss: tensor(0.7755, device='cuda:1', grad_fn=<NllLossBackward>)



 90%|█████████ | 671/742 [2:14:05<14:04, 11.90s/it][A

loss: tensor(0.7664, device='cuda:1', grad_fn=<NllLossBackward>)



 91%|█████████ | 672/742 [2:14:17<13:53, 11.90s/it][A

loss: tensor(0.6493, device='cuda:1', grad_fn=<NllLossBackward>)



 91%|█████████ | 673/742 [2:14:29<13:39, 11.88s/it][A

loss: tensor(0.6828, device='cuda:1', grad_fn=<NllLossBackward>)



 91%|█████████ | 674/742 [2:14:41<13:28, 11.89s/it][A

loss: tensor(0.6493, device='cuda:1', grad_fn=<NllLossBackward>)



 91%|█████████ | 675/742 [2:14:53<13:15, 11.87s/it][A

loss: tensor(0.5768, device='cuda:1', grad_fn=<NllLossBackward>)



 91%|█████████ | 676/742 [2:15:05<13:06, 11.92s/it][A

loss: tensor(0.6796, device='cuda:1', grad_fn=<NllLossBackward>)



 91%|█████████ | 677/742 [2:15:17<12:56, 11.94s/it][A

loss: tensor(0.7458, device='cuda:1', grad_fn=<NllLossBackward>)



 91%|█████████▏| 678/742 [2:15:29<12:45, 11.95s/it][A

loss: tensor(0.6510, device='cuda:1', grad_fn=<NllLossBackward>)



 92%|█████████▏| 679/742 [2:15:41<12:34, 11.97s/it][A

loss: tensor(0.5787, device='cuda:1', grad_fn=<NllLossBackward>)



 92%|█████████▏| 680/742 [2:15:53<12:20, 11.94s/it][A

loss: tensor(0.7032, device='cuda:1', grad_fn=<NllLossBackward>)



 92%|█████████▏| 681/742 [2:16:05<12:08, 11.95s/it][A

loss: tensor(0.6287, device='cuda:1', grad_fn=<NllLossBackward>)



 92%|█████████▏| 682/742 [2:16:17<11:56, 11.94s/it][A

loss: tensor(0.8172, device='cuda:1', grad_fn=<NllLossBackward>)



 92%|█████████▏| 683/742 [2:16:29<11:44, 11.94s/it][A

loss: tensor(0.6011, device='cuda:1', grad_fn=<NllLossBackward>)



 92%|█████████▏| 684/742 [2:16:40<11:31, 11.92s/it][A

loss: tensor(0.6073, device='cuda:1', grad_fn=<NllLossBackward>)



 92%|█████████▏| 685/742 [2:16:52<11:19, 11.92s/it][A

loss: tensor(0.6345, device='cuda:1', grad_fn=<NllLossBackward>)



 92%|█████████▏| 686/742 [2:17:04<11:07, 11.92s/it][A

loss: tensor(0.7220, device='cuda:1', grad_fn=<NllLossBackward>)



 93%|█████████▎| 687/742 [2:17:16<10:55, 11.93s/it][A

loss: tensor(0.5853, device='cuda:1', grad_fn=<NllLossBackward>)



 93%|█████████▎| 688/742 [2:17:28<10:44, 11.93s/it][A

loss: tensor(0.5934, device='cuda:1', grad_fn=<NllLossBackward>)



 93%|█████████▎| 689/742 [2:17:40<10:30, 11.89s/it][A

loss: tensor(0.6610, device='cuda:1', grad_fn=<NllLossBackward>)



 93%|█████████▎| 690/742 [2:17:52<10:19, 11.92s/it][A

loss: tensor(0.6890, device='cuda:1', grad_fn=<NllLossBackward>)



 93%|█████████▎| 691/742 [2:18:04<10:08, 11.93s/it][A

loss: tensor(0.6871, device='cuda:1', grad_fn=<NllLossBackward>)



 93%|█████████▎| 692/742 [2:18:16<09:56, 11.92s/it][A

loss: tensor(0.6238, device='cuda:1', grad_fn=<NllLossBackward>)



 93%|█████████▎| 693/742 [2:18:28<09:42, 11.88s/it][A

loss: tensor(0.6400, device='cuda:1', grad_fn=<NllLossBackward>)



 94%|█████████▎| 694/742 [2:18:40<09:31, 11.91s/it][A

loss: tensor(0.6663, device='cuda:1', grad_fn=<NllLossBackward>)



 94%|█████████▎| 695/742 [2:18:51<09:19, 11.90s/it][A

loss: tensor(0.5606, device='cuda:1', grad_fn=<NllLossBackward>)



 94%|█████████▍| 696/742 [2:19:03<09:08, 11.92s/it][A

loss: tensor(0.7462, device='cuda:1', grad_fn=<NllLossBackward>)



 94%|█████████▍| 697/742 [2:19:15<08:57, 11.94s/it][A

loss: tensor(0.7711, device='cuda:1', grad_fn=<NllLossBackward>)



 94%|█████████▍| 698/742 [2:19:27<08:44, 11.92s/it][A

loss: tensor(0.7929, device='cuda:1', grad_fn=<NllLossBackward>)



 94%|█████████▍| 699/742 [2:19:39<08:32, 11.92s/it][A

loss: tensor(0.8439, device='cuda:1', grad_fn=<NllLossBackward>)



 94%|█████████▍| 700/742 [2:19:51<08:20, 11.93s/it][A

loss: tensor(0.6346, device='cuda:1', grad_fn=<NllLossBackward>)



 94%|█████████▍| 701/742 [2:20:03<08:09, 11.95s/it][A

loss: tensor(0.5875, device='cuda:1', grad_fn=<NllLossBackward>)



 95%|█████████▍| 702/742 [2:20:15<07:57, 11.94s/it][A

loss: tensor(0.6485, device='cuda:1', grad_fn=<NllLossBackward>)



 95%|█████████▍| 703/742 [2:20:27<07:45, 11.95s/it][A

loss: tensor(0.6544, device='cuda:1', grad_fn=<NllLossBackward>)



 95%|█████████▍| 704/742 [2:20:39<07:33, 11.93s/it][A

loss: tensor(0.7173, device='cuda:1', grad_fn=<NllLossBackward>)



 95%|█████████▌| 705/742 [2:20:51<07:20, 11.91s/it][A

loss: tensor(0.6821, device='cuda:1', grad_fn=<NllLossBackward>)



 95%|█████████▌| 706/742 [2:21:03<07:08, 11.90s/it][A

loss: tensor(0.6547, device='cuda:1', grad_fn=<NllLossBackward>)



 95%|█████████▌| 707/742 [2:21:15<06:57, 11.91s/it][A

loss: tensor(0.6507, device='cuda:1', grad_fn=<NllLossBackward>)



 95%|█████████▌| 708/742 [2:21:26<06:44, 11.90s/it][A

loss: tensor(0.6364, device='cuda:1', grad_fn=<NllLossBackward>)



 96%|█████████▌| 709/742 [2:21:38<06:32, 11.90s/it][A

loss: tensor(0.6454, device='cuda:1', grad_fn=<NllLossBackward>)



 96%|█████████▌| 710/742 [2:21:50<06:21, 11.91s/it][A

loss: tensor(0.6798, device='cuda:1', grad_fn=<NllLossBackward>)



 96%|█████████▌| 711/742 [2:22:02<06:08, 11.89s/it][A

loss: tensor(0.6089, device='cuda:1', grad_fn=<NllLossBackward>)



 96%|█████████▌| 712/742 [2:22:14<05:57, 11.91s/it][A

loss: tensor(0.6401, device='cuda:1', grad_fn=<NllLossBackward>)



 96%|█████████▌| 713/742 [2:22:26<05:46, 11.93s/it][A

loss: tensor(0.7065, device='cuda:1', grad_fn=<NllLossBackward>)



 96%|█████████▌| 714/742 [2:22:38<05:34, 11.95s/it][A

loss: tensor(0.6751, device='cuda:1', grad_fn=<NllLossBackward>)



 96%|█████████▋| 715/742 [2:22:50<05:21, 11.91s/it][A

loss: tensor(0.6788, device='cuda:1', grad_fn=<NllLossBackward>)



 96%|█████████▋| 716/742 [2:23:02<05:09, 11.91s/it][A

loss: tensor(0.6435, device='cuda:1', grad_fn=<NllLossBackward>)



 97%|█████████▋| 717/742 [2:23:14<04:57, 11.92s/it][A

loss: tensor(0.7365, device='cuda:1', grad_fn=<NllLossBackward>)



 97%|█████████▋| 718/742 [2:23:26<04:45, 11.92s/it][A

loss: tensor(0.5731, device='cuda:1', grad_fn=<NllLossBackward>)



 97%|█████████▋| 719/742 [2:23:38<04:34, 11.92s/it][A

loss: tensor(0.7345, device='cuda:1', grad_fn=<NllLossBackward>)



 97%|█████████▋| 720/742 [2:23:49<04:22, 11.93s/it][A

loss: tensor(0.5684, device='cuda:1', grad_fn=<NllLossBackward>)



 97%|█████████▋| 721/742 [2:24:01<04:10, 11.94s/it][A

loss: tensor(0.7077, device='cuda:1', grad_fn=<NllLossBackward>)



 97%|█████████▋| 722/742 [2:24:13<03:59, 11.96s/it][A

loss: tensor(0.6300, device='cuda:1', grad_fn=<NllLossBackward>)



 97%|█████████▋| 723/742 [2:24:25<03:47, 11.96s/it][A

loss: tensor(0.6921, device='cuda:1', grad_fn=<NllLossBackward>)



 98%|█████████▊| 724/742 [2:24:37<03:34, 11.93s/it][A

loss: tensor(0.7300, device='cuda:1', grad_fn=<NllLossBackward>)



 98%|█████████▊| 725/742 [2:24:49<03:22, 11.91s/it][A

loss: tensor(0.6482, device='cuda:1', grad_fn=<NllLossBackward>)



 98%|█████████▊| 726/742 [2:25:01<03:10, 11.92s/it][A

loss: tensor(0.6771, device='cuda:1', grad_fn=<NllLossBackward>)



 98%|█████████▊| 727/742 [2:25:13<02:59, 11.96s/it][A

loss: tensor(0.7370, device='cuda:1', grad_fn=<NllLossBackward>)



 98%|█████████▊| 728/742 [2:25:25<02:46, 11.90s/it][A

loss: tensor(0.5601, device='cuda:1', grad_fn=<NllLossBackward>)



 98%|█████████▊| 729/742 [2:25:37<02:34, 11.91s/it][A

loss: tensor(0.7666, device='cuda:1', grad_fn=<NllLossBackward>)



 98%|█████████▊| 730/742 [2:25:49<02:23, 11.92s/it][A

loss: tensor(0.6991, device='cuda:1', grad_fn=<NllLossBackward>)



 99%|█████████▊| 731/742 [2:26:01<02:11, 11.94s/it][A

loss: tensor(0.7355, device='cuda:1', grad_fn=<NllLossBackward>)



 99%|█████████▊| 732/742 [2:26:13<01:59, 11.93s/it][A

loss: tensor(0.6964, device='cuda:1', grad_fn=<NllLossBackward>)



 99%|█████████▉| 733/742 [2:26:25<01:47, 11.94s/it][A

loss: tensor(0.5682, device='cuda:1', grad_fn=<NllLossBackward>)



 99%|█████████▉| 734/742 [2:26:37<01:35, 11.93s/it][A

loss: tensor(0.6450, device='cuda:1', grad_fn=<NllLossBackward>)



 99%|█████████▉| 735/742 [2:26:49<01:23, 11.95s/it][A

loss: tensor(0.7226, device='cuda:1', grad_fn=<NllLossBackward>)



 99%|█████████▉| 736/742 [2:27:01<01:11, 11.96s/it][A

loss: tensor(0.6316, device='cuda:1', grad_fn=<NllLossBackward>)



 99%|█████████▉| 737/742 [2:27:12<00:59, 11.92s/it][A

loss: tensor(0.6625, device='cuda:1', grad_fn=<NllLossBackward>)



 99%|█████████▉| 738/742 [2:27:24<00:47, 11.90s/it][A

loss: tensor(0.6539, device='cuda:1', grad_fn=<NllLossBackward>)



100%|█████████▉| 739/742 [2:27:36<00:35, 11.90s/it][A

loss: tensor(0.5849, device='cuda:1', grad_fn=<NllLossBackward>)



100%|█████████▉| 740/742 [2:27:48<00:23, 11.92s/it][A

loss: tensor(0.7296, device='cuda:1', grad_fn=<NllLossBackward>)



100%|█████████▉| 741/742 [2:28:00<00:11, 11.94s/it][A

loss: tensor(0.6653, device='cuda:1', grad_fn=<NllLossBackward>)



100%|██████████| 742/742 [2:28:04<00:00, 11.97s/it][A


loss: tensor(0.5128, device='cuda:1', grad_fn=<NllLossBackward>)

	Train loss: 0.6823367201334062

	train acc: 0.7317890835579515

	training prec: 0.5237040166706609

	training rec: 0.5205625489739372

	training f1: 0.4986650304493786



  0%|          | 0/186 [00:00<?, ?it/s][A
  1%|          | 1/186 [00:02<08:48,  2.86s/it][A
  1%|          | 2/186 [00:05<08:40,  2.83s/it][A
  2%|▏         | 3/186 [00:08<08:34,  2.81s/it][A
  2%|▏         | 4/186 [00:11<08:29,  2.80s/it][A
  3%|▎         | 5/186 [00:14<08:32,  2.83s/it][A
  3%|▎         | 6/186 [00:16<08:27,  2.82s/it][A
  4%|▍         | 7/186 [00:19<08:22,  2.81s/it][A
  4%|▍         | 8/186 [00:22<08:20,  2.81s/it][A
  5%|▍         | 9/186 [00:25<08:16,  2.81s/it][A
  5%|▌         | 10/186 [00:28<08:13,  2.80s/it][A
  6%|▌         | 11/186 [00:30<08:09,  2.80s/it][A
  6%|▋         | 12/186 [00:33<08:11,  2.82s/it][A
  7%|▋         | 13/186 [00:36<08:07,  2.82s/it][A
  8%|▊         | 14/186 [00:39<08:03,  2.81s/it][A
  8%|▊         | 15/186 [00:42<07:59,  2.81s/it][A
  9%|▊         | 16/186 [00:44<07:56,  2.80s/it][A
  9%|▉         | 17/186 [00:47<07:52,  2.80s/it][A
 10%|▉         | 18/186 [00:50<07:49,  2.79s/it][A
 10%|█         | 19/186 [00:5


	Validation loss: 0.6650938878777206

	Validation acc: 0.8717517921146952

	Validation prec: 0.4761984767025089

	Validation rec: 0.5403225806451613

	Validation f1: 0.5052491324119298
Validation loss decreased (inf --> 0.665094).  Saving model ...


Epoch:   2%|▏         | 1/50 [2:36:43<127:59:50, 9403.88s/it]




  0%|          | 0/742 [00:00<?, ?it/s][A
  0%|          | 1/742 [00:11<2:26:30, 11.86s/it][A

loss: tensor(0.6259, device='cuda:1', grad_fn=<NllLossBackward>)



  0%|          | 2/742 [00:23<2:26:27, 11.87s/it][A

loss: tensor(0.6347, device='cuda:1', grad_fn=<NllLossBackward>)



  0%|          | 3/742 [00:35<2:26:29, 11.89s/it][A

loss: tensor(0.6950, device='cuda:1', grad_fn=<NllLossBackward>)



  1%|          | 4/742 [00:47<2:26:28, 11.91s/it][A

loss: tensor(0.7517, device='cuda:1', grad_fn=<NllLossBackward>)



  1%|          | 5/742 [00:59<2:25:58, 11.88s/it][A

loss: tensor(0.6422, device='cuda:1', grad_fn=<NllLossBackward>)



  1%|          | 6/742 [01:11<2:26:11, 11.92s/it][A

loss: tensor(0.6916, device='cuda:1', grad_fn=<NllLossBackward>)



  1%|          | 7/742 [01:23<2:25:51, 11.91s/it][A

loss: tensor(0.7100, device='cuda:1', grad_fn=<NllLossBackward>)



  1%|          | 8/742 [01:35<2:25:35, 11.90s/it][A

loss: tensor(0.7043, device='cuda:1', grad_fn=<NllLossBackward>)



  1%|          | 9/742 [01:47<2:25:25, 11.90s/it][A

loss: tensor(0.6369, device='cuda:1', grad_fn=<NllLossBackward>)



  1%|▏         | 10/742 [01:59<2:25:23, 11.92s/it][A

loss: tensor(0.7444, device='cuda:1', grad_fn=<NllLossBackward>)



  1%|▏         | 11/742 [02:11<2:25:48, 11.97s/it][A

loss: tensor(0.6926, device='cuda:1', grad_fn=<NllLossBackward>)



  2%|▏         | 12/742 [02:23<2:26:19, 12.03s/it][A

loss: tensor(0.6492, device='cuda:1', grad_fn=<NllLossBackward>)



  2%|▏         | 13/742 [02:35<2:26:43, 12.08s/it][A

loss: tensor(0.7453, device='cuda:1', grad_fn=<NllLossBackward>)



  2%|▏         | 14/742 [02:47<2:25:46, 12.01s/it][A

loss: tensor(0.5435, device='cuda:1', grad_fn=<NllLossBackward>)



  2%|▏         | 15/742 [02:59<2:25:27, 12.00s/it][A

loss: tensor(0.7034, device='cuda:1', grad_fn=<NllLossBackward>)



  2%|▏         | 16/742 [03:11<2:25:14, 12.00s/it][A

loss: tensor(0.6272, device='cuda:1', grad_fn=<NllLossBackward>)



  2%|▏         | 17/742 [03:23<2:24:56, 11.99s/it][A

loss: tensor(0.6961, device='cuda:1', grad_fn=<NllLossBackward>)



  2%|▏         | 18/742 [03:35<2:24:22, 11.96s/it][A

loss: tensor(0.5907, device='cuda:1', grad_fn=<NllLossBackward>)



  3%|▎         | 19/742 [03:47<2:23:57, 11.95s/it][A

loss: tensor(0.5924, device='cuda:1', grad_fn=<NllLossBackward>)



  3%|▎         | 20/742 [03:59<2:23:32, 11.93s/it][A

loss: tensor(0.7143, device='cuda:1', grad_fn=<NllLossBackward>)



  3%|▎         | 21/742 [04:10<2:23:17, 11.92s/it][A

loss: tensor(0.6138, device='cuda:1', grad_fn=<NllLossBackward>)



  3%|▎         | 22/742 [04:22<2:23:04, 11.92s/it][A

loss: tensor(0.6799, device='cuda:1', grad_fn=<NllLossBackward>)



  3%|▎         | 23/742 [04:34<2:22:19, 11.88s/it][A

loss: tensor(0.5709, device='cuda:1', grad_fn=<NllLossBackward>)



  3%|▎         | 24/742 [04:46<2:22:25, 11.90s/it][A

loss: tensor(0.5569, device='cuda:1', grad_fn=<NllLossBackward>)



  3%|▎         | 25/742 [04:58<2:22:55, 11.96s/it][A

loss: tensor(0.6344, device='cuda:1', grad_fn=<NllLossBackward>)



  4%|▎         | 26/742 [05:10<2:22:33, 11.95s/it][A

loss: tensor(0.5534, device='cuda:1', grad_fn=<NllLossBackward>)



  4%|▎         | 27/742 [05:22<2:21:38, 11.89s/it][A

loss: tensor(0.6313, device='cuda:1', grad_fn=<NllLossBackward>)



  4%|▍         | 28/742 [05:34<2:21:36, 11.90s/it][A

loss: tensor(0.7574, device='cuda:1', grad_fn=<NllLossBackward>)



  4%|▍         | 29/742 [05:46<2:21:16, 11.89s/it][A

loss: tensor(0.6926, device='cuda:1', grad_fn=<NllLossBackward>)



  4%|▍         | 30/742 [05:58<2:21:09, 11.90s/it][A

loss: tensor(0.7599, device='cuda:1', grad_fn=<NllLossBackward>)



  4%|▍         | 31/742 [06:09<2:20:36, 11.87s/it][A

loss: tensor(0.6868, device='cuda:1', grad_fn=<NllLossBackward>)



  4%|▍         | 32/742 [06:21<2:20:24, 11.87s/it][A

loss: tensor(0.7097, device='cuda:1', grad_fn=<NllLossBackward>)



  4%|▍         | 33/742 [06:33<2:20:05, 11.86s/it][A

loss: tensor(0.5444, device='cuda:1', grad_fn=<NllLossBackward>)



  5%|▍         | 34/742 [06:45<2:20:01, 11.87s/it][A

loss: tensor(0.6804, device='cuda:1', grad_fn=<NllLossBackward>)



  5%|▍         | 35/742 [06:57<2:20:02, 11.88s/it][A

loss: tensor(0.6977, device='cuda:1', grad_fn=<NllLossBackward>)



  5%|▍         | 36/742 [07:09<2:19:38, 11.87s/it][A

loss: tensor(0.6065, device='cuda:1', grad_fn=<NllLossBackward>)



  5%|▍         | 37/742 [07:21<2:19:31, 11.88s/it][A

loss: tensor(0.6884, device='cuda:1', grad_fn=<NllLossBackward>)



  5%|▌         | 38/742 [07:32<2:19:24, 11.88s/it][A

loss: tensor(0.7825, device='cuda:1', grad_fn=<NllLossBackward>)



  5%|▌         | 39/742 [07:44<2:19:06, 11.87s/it][A

loss: tensor(0.6642, device='cuda:1', grad_fn=<NllLossBackward>)



  5%|▌         | 40/742 [07:56<2:18:55, 11.87s/it][A

loss: tensor(0.8063, device='cuda:1', grad_fn=<NllLossBackward>)



  6%|▌         | 41/742 [08:08<2:18:08, 11.82s/it][A

loss: tensor(0.6610, device='cuda:1', grad_fn=<NllLossBackward>)



  6%|▌         | 42/742 [08:20<2:17:59, 11.83s/it][A

loss: tensor(0.6490, device='cuda:1', grad_fn=<NllLossBackward>)



  6%|▌         | 43/742 [08:32<2:17:40, 11.82s/it][A

loss: tensor(0.6778, device='cuda:1', grad_fn=<NllLossBackward>)



  6%|▌         | 44/742 [08:43<2:17:40, 11.84s/it][A

loss: tensor(0.7608, device='cuda:1', grad_fn=<NllLossBackward>)



  6%|▌         | 45/742 [08:55<2:17:04, 11.80s/it][A

loss: tensor(0.7192, device='cuda:1', grad_fn=<NllLossBackward>)



  6%|▌         | 46/742 [09:07<2:16:53, 11.80s/it][A

loss: tensor(0.6526, device='cuda:1', grad_fn=<NllLossBackward>)



  6%|▋         | 47/742 [09:19<2:17:00, 11.83s/it][A

loss: tensor(0.6907, device='cuda:1', grad_fn=<NllLossBackward>)



  6%|▋         | 48/742 [09:31<2:16:51, 11.83s/it][A

loss: tensor(0.8044, device='cuda:1', grad_fn=<NllLossBackward>)



  7%|▋         | 49/742 [09:42<2:16:39, 11.83s/it][A

loss: tensor(0.8051, device='cuda:1', grad_fn=<NllLossBackward>)



  7%|▋         | 50/742 [09:54<2:16:40, 11.85s/it][A

loss: tensor(0.7672, device='cuda:1', grad_fn=<NllLossBackward>)



  7%|▋         | 51/742 [10:06<2:16:30, 11.85s/it][A

loss: tensor(0.6322, device='cuda:1', grad_fn=<NllLossBackward>)



  7%|▋         | 52/742 [10:18<2:16:14, 11.85s/it][A

loss: tensor(0.6212, device='cuda:1', grad_fn=<NllLossBackward>)



  7%|▋         | 53/742 [10:30<2:15:47, 11.82s/it][A

loss: tensor(0.7254, device='cuda:1', grad_fn=<NllLossBackward>)



  7%|▋         | 54/742 [10:42<2:15:54, 11.85s/it][A

loss: tensor(0.6036, device='cuda:1', grad_fn=<NllLossBackward>)



  7%|▋         | 55/742 [10:54<2:15:44, 11.86s/it][A

loss: tensor(0.7262, device='cuda:1', grad_fn=<NllLossBackward>)



  8%|▊         | 56/742 [11:05<2:15:28, 11.85s/it][A

loss: tensor(0.6941, device='cuda:1', grad_fn=<NllLossBackward>)



  8%|▊         | 57/742 [11:17<2:15:20, 11.85s/it][A

loss: tensor(0.6317, device='cuda:1', grad_fn=<NllLossBackward>)



  8%|▊         | 58/742 [11:29<2:14:47, 11.82s/it][A

loss: tensor(0.7490, device='cuda:1', grad_fn=<NllLossBackward>)



  8%|▊         | 59/742 [11:41<2:14:44, 11.84s/it][A

loss: tensor(0.7740, device='cuda:1', grad_fn=<NllLossBackward>)



  8%|▊         | 60/742 [11:53<2:14:37, 11.84s/it][A

loss: tensor(0.6586, device='cuda:1', grad_fn=<NllLossBackward>)



  8%|▊         | 61/742 [12:05<2:14:20, 11.84s/it][A

loss: tensor(0.6672, device='cuda:1', grad_fn=<NllLossBackward>)



  8%|▊         | 62/742 [12:16<2:13:55, 11.82s/it][A

loss: tensor(0.5741, device='cuda:1', grad_fn=<NllLossBackward>)



  8%|▊         | 63/742 [12:28<2:13:50, 11.83s/it][A

loss: tensor(0.7857, device='cuda:1', grad_fn=<NllLossBackward>)



  9%|▊         | 64/742 [12:40<2:13:59, 11.86s/it][A

loss: tensor(0.6397, device='cuda:1', grad_fn=<NllLossBackward>)



  9%|▉         | 65/742 [12:52<2:13:50, 11.86s/it][A

loss: tensor(0.6851, device='cuda:1', grad_fn=<NllLossBackward>)



  9%|▉         | 66/742 [13:04<2:13:44, 11.87s/it][A

loss: tensor(0.6492, device='cuda:1', grad_fn=<NllLossBackward>)



  9%|▉         | 67/742 [13:16<2:13:11, 11.84s/it][A

loss: tensor(0.7087, device='cuda:1', grad_fn=<NllLossBackward>)



  9%|▉         | 68/742 [13:28<2:12:54, 11.83s/it][A

loss: tensor(0.6172, device='cuda:1', grad_fn=<NllLossBackward>)



  9%|▉         | 69/742 [13:39<2:12:36, 11.82s/it][A

loss: tensor(0.7080, device='cuda:1', grad_fn=<NllLossBackward>)



  9%|▉         | 70/742 [13:51<2:12:32, 11.83s/it][A

loss: tensor(0.6995, device='cuda:1', grad_fn=<NllLossBackward>)



 10%|▉         | 71/742 [14:03<2:11:52, 11.79s/it][A

loss: tensor(0.7695, device='cuda:1', grad_fn=<NllLossBackward>)



 10%|▉         | 72/742 [14:15<2:11:55, 11.81s/it][A

loss: tensor(0.6926, device='cuda:1', grad_fn=<NllLossBackward>)



 10%|▉         | 73/742 [14:27<2:11:50, 11.82s/it][A

loss: tensor(0.6976, device='cuda:1', grad_fn=<NllLossBackward>)



 10%|▉         | 74/742 [14:39<2:11:58, 11.85s/it][A

loss: tensor(0.6596, device='cuda:1', grad_fn=<NllLossBackward>)



 10%|█         | 75/742 [14:50<2:11:52, 11.86s/it][A

loss: tensor(0.6101, device='cuda:1', grad_fn=<NllLossBackward>)



 10%|█         | 76/742 [15:02<2:11:18, 11.83s/it][A

loss: tensor(0.7041, device='cuda:1', grad_fn=<NllLossBackward>)



 10%|█         | 77/742 [15:14<2:10:52, 11.81s/it][A

loss: tensor(0.6015, device='cuda:1', grad_fn=<NllLossBackward>)



 11%|█         | 78/742 [15:26<2:10:34, 11.80s/it][A

loss: tensor(0.6720, device='cuda:1', grad_fn=<NllLossBackward>)



 11%|█         | 79/742 [15:38<2:10:37, 11.82s/it][A

loss: tensor(0.6831, device='cuda:1', grad_fn=<NllLossBackward>)



 11%|█         | 80/742 [15:49<2:10:13, 11.80s/it][A

loss: tensor(0.6348, device='cuda:1', grad_fn=<NllLossBackward>)



 11%|█         | 81/742 [16:01<2:09:57, 11.80s/it][A

loss: tensor(0.6407, device='cuda:1', grad_fn=<NllLossBackward>)



 11%|█         | 82/742 [16:13<2:10:03, 11.82s/it][A

loss: tensor(0.6863, device='cuda:1', grad_fn=<NllLossBackward>)



 11%|█         | 83/742 [16:25<2:10:13, 11.86s/it][A

loss: tensor(0.7731, device='cuda:1', grad_fn=<NllLossBackward>)



 11%|█▏        | 84/742 [16:37<2:10:09, 11.87s/it][A

loss: tensor(0.6554, device='cuda:1', grad_fn=<NllLossBackward>)



 11%|█▏        | 85/742 [16:49<2:10:17, 11.90s/it][A

loss: tensor(0.7776, device='cuda:1', grad_fn=<NllLossBackward>)



 12%|█▏        | 86/742 [17:01<2:09:57, 11.89s/it][A

loss: tensor(0.7167, device='cuda:1', grad_fn=<NllLossBackward>)



 12%|█▏        | 87/742 [17:12<2:09:35, 11.87s/it][A

loss: tensor(0.7180, device='cuda:1', grad_fn=<NllLossBackward>)



 12%|█▏        | 88/742 [17:24<2:09:22, 11.87s/it][A

loss: tensor(0.6331, device='cuda:1', grad_fn=<NllLossBackward>)



 12%|█▏        | 89/742 [17:36<2:08:45, 11.83s/it][A

loss: tensor(0.6319, device='cuda:1', grad_fn=<NllLossBackward>)



 12%|█▏        | 90/742 [17:48<2:08:37, 11.84s/it][A

loss: tensor(0.5818, device='cuda:1', grad_fn=<NllLossBackward>)



 12%|█▏        | 91/742 [18:00<2:08:14, 11.82s/it][A

loss: tensor(0.7050, device='cuda:1', grad_fn=<NllLossBackward>)



 12%|█▏        | 92/742 [18:12<2:08:13, 11.84s/it][A

loss: tensor(0.7879, device='cuda:1', grad_fn=<NllLossBackward>)



 13%|█▎        | 93/742 [18:23<2:07:55, 11.83s/it][A

loss: tensor(0.6528, device='cuda:1', grad_fn=<NllLossBackward>)



 13%|█▎        | 94/742 [18:35<2:07:59, 11.85s/it][A

loss: tensor(0.7043, device='cuda:1', grad_fn=<NllLossBackward>)



 13%|█▎        | 95/742 [18:47<2:07:53, 11.86s/it][A

loss: tensor(0.6314, device='cuda:1', grad_fn=<NllLossBackward>)



 13%|█▎        | 96/742 [18:59<2:07:36, 11.85s/it][A

loss: tensor(0.6499, device='cuda:1', grad_fn=<NllLossBackward>)



 13%|█▎        | 97/742 [19:11<2:06:55, 11.81s/it][A

loss: tensor(0.7261, device='cuda:1', grad_fn=<NllLossBackward>)



 13%|█▎        | 98/742 [19:23<2:06:59, 11.83s/it][A

loss: tensor(0.6237, device='cuda:1', grad_fn=<NllLossBackward>)



 13%|█▎        | 99/742 [19:34<2:06:57, 11.85s/it][A

loss: tensor(0.6028, device='cuda:1', grad_fn=<NllLossBackward>)



 13%|█▎        | 100/742 [19:46<2:06:38, 11.84s/it][A

loss: tensor(0.7267, device='cuda:1', grad_fn=<NllLossBackward>)



 14%|█▎        | 101/742 [19:58<2:06:20, 11.83s/it][A

loss: tensor(0.6345, device='cuda:1', grad_fn=<NllLossBackward>)



 14%|█▎        | 102/742 [20:10<2:05:37, 11.78s/it][A

loss: tensor(0.6337, device='cuda:1', grad_fn=<NllLossBackward>)



 14%|█▍        | 103/742 [20:22<2:05:25, 11.78s/it][A

loss: tensor(0.7002, device='cuda:1', grad_fn=<NllLossBackward>)



 14%|█▍        | 104/742 [20:33<2:05:22, 11.79s/it][A

loss: tensor(0.6419, device='cuda:1', grad_fn=<NllLossBackward>)



 14%|█▍        | 105/742 [20:45<2:05:24, 11.81s/it][A

loss: tensor(0.6360, device='cuda:1', grad_fn=<NllLossBackward>)



 14%|█▍        | 106/742 [20:57<2:05:00, 11.79s/it][A

loss: tensor(0.6959, device='cuda:1', grad_fn=<NllLossBackward>)



 14%|█▍        | 107/742 [21:09<2:06:08, 11.92s/it][A

loss: tensor(0.7341, device='cuda:1', grad_fn=<NllLossBackward>)



 15%|█▍        | 108/742 [21:21<2:06:54, 12.01s/it][A

loss: tensor(0.6236, device='cuda:1', grad_fn=<NllLossBackward>)



 15%|█▍        | 109/742 [21:34<2:07:34, 12.09s/it][A

loss: tensor(0.6112, device='cuda:1', grad_fn=<NllLossBackward>)



 15%|█▍        | 110/742 [21:46<2:07:29, 12.10s/it][A

loss: tensor(0.7182, device='cuda:1', grad_fn=<NllLossBackward>)



 15%|█▍        | 111/742 [21:58<2:07:09, 12.09s/it][A

loss: tensor(0.6261, device='cuda:1', grad_fn=<NllLossBackward>)



 15%|█▌        | 112/742 [22:10<2:06:52, 12.08s/it][A

loss: tensor(0.6799, device='cuda:1', grad_fn=<NllLossBackward>)



 15%|█▌        | 113/742 [22:22<2:06:43, 12.09s/it][A

loss: tensor(0.6115, device='cuda:1', grad_fn=<NllLossBackward>)



 15%|█▌        | 114/742 [22:34<2:06:37, 12.10s/it][A

loss: tensor(0.6842, device='cuda:1', grad_fn=<NllLossBackward>)



 15%|█▌        | 115/742 [22:46<2:05:51, 12.04s/it][A

loss: tensor(0.7324, device='cuda:1', grad_fn=<NllLossBackward>)



 16%|█▌        | 116/742 [22:58<2:05:39, 12.04s/it][A

loss: tensor(0.6762, device='cuda:1', grad_fn=<NllLossBackward>)



 16%|█▌        | 117/742 [23:10<2:05:46, 12.07s/it][A

loss: tensor(0.6429, device='cuda:1', grad_fn=<NllLossBackward>)



 16%|█▌        | 118/742 [23:22<2:05:34, 12.07s/it][A

loss: tensor(0.7130, device='cuda:1', grad_fn=<NllLossBackward>)



 16%|█▌        | 119/742 [23:34<2:05:04, 12.05s/it][A

loss: tensor(0.6864, device='cuda:1', grad_fn=<NllLossBackward>)



 16%|█▌        | 120/742 [23:46<2:04:18, 11.99s/it][A

loss: tensor(0.5840, device='cuda:1', grad_fn=<NllLossBackward>)



 16%|█▋        | 121/742 [23:58<2:03:53, 11.97s/it][A

loss: tensor(0.7371, device='cuda:1', grad_fn=<NllLossBackward>)



 16%|█▋        | 122/742 [24:10<2:03:41, 11.97s/it][A

loss: tensor(0.6005, device='cuda:1', grad_fn=<NllLossBackward>)



 17%|█▋        | 123/742 [24:22<2:03:36, 11.98s/it][A

loss: tensor(0.6866, device='cuda:1', grad_fn=<NllLossBackward>)



 17%|█▋        | 124/742 [24:34<2:03:12, 11.96s/it][A

loss: tensor(0.5890, device='cuda:1', grad_fn=<NllLossBackward>)



 17%|█▋        | 125/742 [24:46<2:02:57, 11.96s/it][A

loss: tensor(0.7090, device='cuda:1', grad_fn=<NllLossBackward>)



 17%|█▋        | 126/742 [24:58<2:02:39, 11.95s/it][A

loss: tensor(0.6576, device='cuda:1', grad_fn=<NllLossBackward>)



 17%|█▋        | 127/742 [25:10<2:02:24, 11.94s/it][A

loss: tensor(0.6867, device='cuda:1', grad_fn=<NllLossBackward>)



 17%|█▋        | 128/742 [25:22<2:01:51, 11.91s/it][A

loss: tensor(0.6757, device='cuda:1', grad_fn=<NllLossBackward>)



 17%|█▋        | 129/742 [25:34<2:01:49, 11.92s/it][A

loss: tensor(0.6330, device='cuda:1', grad_fn=<NllLossBackward>)



 18%|█▊        | 130/742 [25:45<2:01:32, 11.92s/it][A

loss: tensor(0.6022, device='cuda:1', grad_fn=<NllLossBackward>)



 18%|█▊        | 131/742 [25:57<2:01:31, 11.93s/it][A

loss: tensor(0.7901, device='cuda:1', grad_fn=<NllLossBackward>)



 18%|█▊        | 132/742 [26:09<2:01:00, 11.90s/it][A

loss: tensor(0.7402, device='cuda:1', grad_fn=<NllLossBackward>)



 18%|█▊        | 133/742 [26:21<2:00:55, 11.91s/it][A

loss: tensor(0.6918, device='cuda:1', grad_fn=<NllLossBackward>)



 18%|█▊        | 134/742 [26:33<2:00:56, 11.94s/it][A

loss: tensor(0.5548, device='cuda:1', grad_fn=<NllLossBackward>)



 18%|█▊        | 135/742 [26:45<2:00:41, 11.93s/it][A

loss: tensor(0.7520, device='cuda:1', grad_fn=<NllLossBackward>)



 18%|█▊        | 136/742 [26:57<2:00:37, 11.94s/it][A

loss: tensor(0.6519, device='cuda:1', grad_fn=<NllLossBackward>)



 18%|█▊        | 137/742 [27:09<2:00:18, 11.93s/it][A

loss: tensor(0.5894, device='cuda:1', grad_fn=<NllLossBackward>)



 19%|█▊        | 138/742 [27:21<2:00:11, 11.94s/it][A

loss: tensor(0.6932, device='cuda:1', grad_fn=<NllLossBackward>)



 19%|█▊        | 139/742 [27:33<1:59:58, 11.94s/it][A

loss: tensor(0.6364, device='cuda:1', grad_fn=<NllLossBackward>)



 19%|█▉        | 140/742 [27:45<1:59:42, 11.93s/it][A

loss: tensor(0.7219, device='cuda:1', grad_fn=<NllLossBackward>)



 19%|█▉        | 141/742 [27:57<1:59:13, 11.90s/it][A

loss: tensor(0.6955, device='cuda:1', grad_fn=<NllLossBackward>)



 19%|█▉        | 142/742 [28:09<1:59:06, 11.91s/it][A

loss: tensor(0.7412, device='cuda:1', grad_fn=<NllLossBackward>)



 19%|█▉        | 143/742 [28:21<1:59:07, 11.93s/it][A

loss: tensor(0.6439, device='cuda:1', grad_fn=<NllLossBackward>)



 19%|█▉        | 144/742 [28:33<1:59:10, 11.96s/it][A

loss: tensor(0.6448, device='cuda:1', grad_fn=<NllLossBackward>)



 20%|█▉        | 145/742 [28:45<1:59:01, 11.96s/it][A

loss: tensor(0.6475, device='cuda:1', grad_fn=<NllLossBackward>)



 20%|█▉        | 146/742 [28:57<1:58:47, 11.96s/it][A

loss: tensor(0.6634, device='cuda:1', grad_fn=<NllLossBackward>)



 20%|█▉        | 147/742 [29:09<1:58:42, 11.97s/it][A

loss: tensor(0.6037, device='cuda:1', grad_fn=<NllLossBackward>)



 20%|█▉        | 148/742 [29:20<1:58:31, 11.97s/it][A

loss: tensor(0.7366, device='cuda:1', grad_fn=<NllLossBackward>)



 20%|██        | 149/742 [29:32<1:58:15, 11.97s/it][A

loss: tensor(0.6819, device='cuda:1', grad_fn=<NllLossBackward>)



 20%|██        | 150/742 [29:44<1:57:43, 11.93s/it][A

loss: tensor(0.7686, device='cuda:1', grad_fn=<NllLossBackward>)



 20%|██        | 151/742 [29:56<1:57:33, 11.94s/it][A

loss: tensor(0.6181, device='cuda:1', grad_fn=<NllLossBackward>)



 20%|██        | 152/742 [30:08<1:57:15, 11.92s/it][A

loss: tensor(0.6432, device='cuda:1', grad_fn=<NllLossBackward>)



 21%|██        | 153/742 [30:20<1:57:31, 11.97s/it][A

loss: tensor(0.6936, device='cuda:1', grad_fn=<NllLossBackward>)



 21%|██        | 154/742 [30:32<1:57:34, 12.00s/it][A

loss: tensor(0.5757, device='cuda:1', grad_fn=<NllLossBackward>)



 21%|██        | 155/742 [30:44<1:57:10, 11.98s/it][A

loss: tensor(0.7099, device='cuda:1', grad_fn=<NllLossBackward>)



 21%|██        | 156/742 [30:56<1:57:13, 12.00s/it][A

loss: tensor(0.5549, device='cuda:1', grad_fn=<NllLossBackward>)



 21%|██        | 157/742 [31:08<1:57:07, 12.01s/it][A

loss: tensor(0.6158, device='cuda:1', grad_fn=<NllLossBackward>)



 21%|██▏       | 158/742 [31:20<1:57:04, 12.03s/it][A

loss: tensor(0.6603, device='cuda:1', grad_fn=<NllLossBackward>)



 21%|██▏       | 159/742 [31:32<1:56:32, 11.99s/it][A

loss: tensor(0.6094, device='cuda:1', grad_fn=<NllLossBackward>)



 22%|██▏       | 160/742 [31:44<1:56:20, 11.99s/it][A

loss: tensor(0.5488, device='cuda:1', grad_fn=<NllLossBackward>)



 22%|██▏       | 161/742 [31:56<1:56:10, 12.00s/it][A

loss: tensor(0.7434, device='cuda:1', grad_fn=<NllLossBackward>)



 22%|██▏       | 162/742 [32:08<1:56:03, 12.01s/it][A

loss: tensor(0.6260, device='cuda:1', grad_fn=<NllLossBackward>)



 22%|██▏       | 163/742 [32:20<1:55:35, 11.98s/it][A

loss: tensor(0.7856, device='cuda:1', grad_fn=<NllLossBackward>)



 22%|██▏       | 164/742 [32:32<1:55:31, 11.99s/it][A

loss: tensor(0.7916, device='cuda:1', grad_fn=<NllLossBackward>)



 22%|██▏       | 165/742 [32:44<1:55:18, 11.99s/it][A

loss: tensor(0.7055, device='cuda:1', grad_fn=<NllLossBackward>)



 22%|██▏       | 166/742 [32:56<1:55:15, 12.01s/it][A

loss: tensor(0.6851, device='cuda:1', grad_fn=<NllLossBackward>)



 23%|██▎       | 167/742 [33:08<1:55:10, 12.02s/it][A

loss: tensor(0.6815, device='cuda:1', grad_fn=<NllLossBackward>)



 23%|██▎       | 168/742 [33:20<1:54:45, 12.00s/it][A

loss: tensor(0.7335, device='cuda:1', grad_fn=<NllLossBackward>)



 23%|██▎       | 169/742 [33:32<1:54:38, 12.00s/it][A

loss: tensor(0.7546, device='cuda:1', grad_fn=<NllLossBackward>)



 23%|██▎       | 170/742 [33:44<1:54:30, 12.01s/it][A

loss: tensor(0.6638, device='cuda:1', grad_fn=<NllLossBackward>)



 23%|██▎       | 171/742 [33:56<1:54:26, 12.03s/it][A

loss: tensor(0.6609, device='cuda:1', grad_fn=<NllLossBackward>)



 23%|██▎       | 172/742 [34:08<1:54:01, 12.00s/it][A

loss: tensor(0.7199, device='cuda:1', grad_fn=<NllLossBackward>)



 23%|██▎       | 173/742 [34:20<1:53:44, 11.99s/it][A

loss: tensor(0.5935, device='cuda:1', grad_fn=<NllLossBackward>)



 23%|██▎       | 174/742 [34:32<1:53:26, 11.98s/it][A

loss: tensor(0.7298, device='cuda:1', grad_fn=<NllLossBackward>)



 24%|██▎       | 175/742 [34:44<1:53:03, 11.96s/it][A

loss: tensor(0.6106, device='cuda:1', grad_fn=<NllLossBackward>)



 24%|██▎       | 176/742 [34:56<1:52:25, 11.92s/it][A

loss: tensor(0.5817, device='cuda:1', grad_fn=<NllLossBackward>)



 24%|██▍       | 177/742 [35:08<1:52:18, 11.93s/it][A

loss: tensor(0.5611, device='cuda:1', grad_fn=<NllLossBackward>)



 24%|██▍       | 178/742 [35:20<1:52:27, 11.96s/it][A

loss: tensor(0.6404, device='cuda:1', grad_fn=<NllLossBackward>)



 24%|██▍       | 179/742 [35:32<1:52:08, 11.95s/it][A

loss: tensor(0.6253, device='cuda:1', grad_fn=<NllLossBackward>)



 24%|██▍       | 180/742 [35:44<1:51:58, 11.96s/it][A

loss: tensor(0.6220, device='cuda:1', grad_fn=<NllLossBackward>)



 24%|██▍       | 181/742 [35:56<1:51:26, 11.92s/it][A

loss: tensor(0.6850, device='cuda:1', grad_fn=<NllLossBackward>)



 25%|██▍       | 182/742 [36:08<1:51:11, 11.91s/it][A

loss: tensor(0.7379, device='cuda:1', grad_fn=<NllLossBackward>)



 25%|██▍       | 183/742 [36:20<1:51:08, 11.93s/it][A

loss: tensor(0.6422, device='cuda:1', grad_fn=<NllLossBackward>)



 25%|██▍       | 184/742 [36:32<1:51:08, 11.95s/it][A

loss: tensor(0.5821, device='cuda:1', grad_fn=<NllLossBackward>)



 25%|██▍       | 185/742 [36:43<1:50:41, 11.92s/it][A

loss: tensor(0.5778, device='cuda:1', grad_fn=<NllLossBackward>)



 25%|██▌       | 186/742 [36:55<1:50:27, 11.92s/it][A

loss: tensor(0.6551, device='cuda:1', grad_fn=<NllLossBackward>)



 25%|██▌       | 187/742 [37:07<1:50:27, 11.94s/it][A

loss: tensor(0.6878, device='cuda:1', grad_fn=<NllLossBackward>)



 25%|██▌       | 188/742 [37:19<1:50:28, 11.96s/it][A

loss: tensor(0.6423, device='cuda:1', grad_fn=<NllLossBackward>)



 25%|██▌       | 189/742 [37:31<1:50:00, 11.94s/it][A

loss: tensor(0.6917, device='cuda:1', grad_fn=<NllLossBackward>)



 26%|██▌       | 190/742 [37:43<1:49:51, 11.94s/it][A

loss: tensor(0.7595, device='cuda:1', grad_fn=<NllLossBackward>)



 26%|██▌       | 191/742 [37:55<1:49:31, 11.93s/it][A

loss: tensor(0.6082, device='cuda:1', grad_fn=<NllLossBackward>)



 26%|██▌       | 192/742 [38:07<1:49:24, 11.94s/it][A

loss: tensor(0.7096, device='cuda:1', grad_fn=<NllLossBackward>)



 26%|██▌       | 193/742 [38:19<1:49:21, 11.95s/it][A

loss: tensor(0.7433, device='cuda:1', grad_fn=<NllLossBackward>)



 26%|██▌       | 194/742 [38:31<1:48:56, 11.93s/it][A

loss: tensor(0.5952, device='cuda:1', grad_fn=<NllLossBackward>)



 26%|██▋       | 195/742 [38:43<1:48:51, 11.94s/it][A

loss: tensor(0.6804, device='cuda:1', grad_fn=<NllLossBackward>)



 26%|██▋       | 196/742 [38:55<1:48:39, 11.94s/it][A

loss: tensor(0.7273, device='cuda:1', grad_fn=<NllLossBackward>)



 27%|██▋       | 197/742 [39:07<1:48:34, 11.95s/it][A

loss: tensor(0.6013, device='cuda:1', grad_fn=<NllLossBackward>)



 27%|██▋       | 198/742 [39:19<1:48:28, 11.96s/it][A

loss: tensor(0.6187, device='cuda:1', grad_fn=<NllLossBackward>)



 27%|██▋       | 199/742 [39:31<1:47:52, 11.92s/it][A

loss: tensor(0.6816, device='cuda:1', grad_fn=<NllLossBackward>)



 27%|██▋       | 200/742 [39:42<1:47:38, 11.92s/it][A

loss: tensor(0.7511, device='cuda:1', grad_fn=<NllLossBackward>)



 27%|██▋       | 201/742 [39:54<1:47:25, 11.91s/it][A

loss: tensor(0.6145, device='cuda:1', grad_fn=<NllLossBackward>)



 27%|██▋       | 202/742 [40:06<1:47:21, 11.93s/it][A

loss: tensor(0.7066, device='cuda:1', grad_fn=<NllLossBackward>)



 27%|██▋       | 203/742 [40:18<1:47:01, 11.91s/it][A

loss: tensor(0.6008, device='cuda:1', grad_fn=<NllLossBackward>)



 27%|██▋       | 204/742 [40:30<1:46:47, 11.91s/it][A

loss: tensor(0.6348, device='cuda:1', grad_fn=<NllLossBackward>)



 28%|██▊       | 205/742 [40:42<1:46:37, 11.91s/it][A

loss: tensor(0.7448, device='cuda:1', grad_fn=<NllLossBackward>)



 28%|██▊       | 206/742 [40:54<1:46:36, 11.93s/it][A

loss: tensor(0.6861, device='cuda:1', grad_fn=<NllLossBackward>)



 28%|██▊       | 207/742 [41:06<1:46:32, 11.95s/it][A

loss: tensor(0.6333, device='cuda:1', grad_fn=<NllLossBackward>)



 28%|██▊       | 208/742 [41:18<1:46:35, 11.98s/it][A

loss: tensor(0.6330, device='cuda:1', grad_fn=<NllLossBackward>)



 28%|██▊       | 209/742 [41:30<1:46:11, 11.95s/it][A

loss: tensor(0.6573, device='cuda:1', grad_fn=<NllLossBackward>)



 28%|██▊       | 210/742 [41:42<1:45:57, 11.95s/it][A

loss: tensor(0.6829, device='cuda:1', grad_fn=<NllLossBackward>)



 28%|██▊       | 211/742 [41:54<1:45:27, 11.92s/it][A

loss: tensor(0.6028, device='cuda:1', grad_fn=<NllLossBackward>)



 29%|██▊       | 212/742 [42:06<1:45:21, 11.93s/it][A

loss: tensor(0.6537, device='cuda:1', grad_fn=<NllLossBackward>)



 29%|██▊       | 213/742 [42:18<1:45:15, 11.94s/it][A

loss: tensor(0.6527, device='cuda:1', grad_fn=<NllLossBackward>)



 29%|██▉       | 214/742 [42:30<1:44:58, 11.93s/it][A

loss: tensor(0.7833, device='cuda:1', grad_fn=<NllLossBackward>)



 29%|██▉       | 215/742 [42:42<1:44:50, 11.94s/it][A

loss: tensor(0.5819, device='cuda:1', grad_fn=<NllLossBackward>)



 29%|██▉       | 216/742 [42:53<1:44:20, 11.90s/it][A

loss: tensor(0.6546, device='cuda:1', grad_fn=<NllLossBackward>)



 29%|██▉       | 217/742 [43:05<1:44:17, 11.92s/it][A

loss: tensor(0.5878, device='cuda:1', grad_fn=<NllLossBackward>)



 29%|██▉       | 218/742 [43:17<1:44:08, 11.92s/it][A

loss: tensor(0.7302, device='cuda:1', grad_fn=<NllLossBackward>)



 30%|██▉       | 219/742 [43:29<1:44:00, 11.93s/it][A

loss: tensor(0.6726, device='cuda:1', grad_fn=<NllLossBackward>)



 30%|██▉       | 220/742 [43:41<1:43:37, 11.91s/it][A

loss: tensor(0.5621, device='cuda:1', grad_fn=<NllLossBackward>)



 30%|██▉       | 221/742 [43:53<1:43:18, 11.90s/it][A

loss: tensor(0.7816, device='cuda:1', grad_fn=<NllLossBackward>)



 30%|██▉       | 222/742 [44:05<1:43:15, 11.92s/it][A

loss: tensor(0.5580, device='cuda:1', grad_fn=<NllLossBackward>)



 30%|███       | 223/742 [44:17<1:43:15, 11.94s/it][A

loss: tensor(0.7158, device='cuda:1', grad_fn=<NllLossBackward>)



 30%|███       | 224/742 [44:29<1:43:16, 11.96s/it][A

loss: tensor(0.6233, device='cuda:1', grad_fn=<NllLossBackward>)



 30%|███       | 225/742 [44:41<1:42:54, 11.94s/it][A

loss: tensor(0.7378, device='cuda:1', grad_fn=<NllLossBackward>)



 30%|███       | 226/742 [44:53<1:42:53, 11.96s/it][A

loss: tensor(0.6814, device='cuda:1', grad_fn=<NllLossBackward>)



 31%|███       | 227/742 [45:05<1:43:34, 12.07s/it][A

loss: tensor(0.7521, device='cuda:1', grad_fn=<NllLossBackward>)



 31%|███       | 228/742 [45:17<1:43:45, 12.11s/it][A

loss: tensor(0.7125, device='cuda:1', grad_fn=<NllLossBackward>)



 31%|███       | 229/742 [45:29<1:43:30, 12.11s/it][A

loss: tensor(0.6440, device='cuda:1', grad_fn=<NllLossBackward>)



 31%|███       | 230/742 [45:42<1:43:31, 12.13s/it][A

loss: tensor(0.5745, device='cuda:1', grad_fn=<NllLossBackward>)



 31%|███       | 231/742 [45:54<1:43:21, 12.14s/it][A

loss: tensor(0.7782, device='cuda:1', grad_fn=<NllLossBackward>)



 31%|███▏      | 232/742 [46:06<1:43:27, 12.17s/it][A

loss: tensor(0.7665, device='cuda:1', grad_fn=<NllLossBackward>)



 31%|███▏      | 233/742 [46:18<1:43:20, 12.18s/it][A

loss: tensor(0.7177, device='cuda:1', grad_fn=<NllLossBackward>)



 32%|███▏      | 234/742 [46:30<1:42:39, 12.12s/it][A

loss: tensor(0.5878, device='cuda:1', grad_fn=<NllLossBackward>)



 32%|███▏      | 235/742 [46:42<1:42:30, 12.13s/it][A

loss: tensor(0.6013, device='cuda:1', grad_fn=<NllLossBackward>)



 32%|███▏      | 236/742 [46:54<1:42:20, 12.14s/it][A

loss: tensor(0.6235, device='cuda:1', grad_fn=<NllLossBackward>)



 32%|███▏      | 237/742 [47:07<1:42:20, 12.16s/it][A

loss: tensor(0.6348, device='cuda:1', grad_fn=<NllLossBackward>)



 32%|███▏      | 238/742 [47:19<1:41:58, 12.14s/it][A

loss: tensor(0.5788, device='cuda:1', grad_fn=<NllLossBackward>)



 32%|███▏      | 239/742 [47:31<1:41:42, 12.13s/it][A

loss: tensor(0.6390, device='cuda:1', grad_fn=<NllLossBackward>)



 32%|███▏      | 240/742 [47:43<1:41:30, 12.13s/it][A

loss: tensor(0.6538, device='cuda:1', grad_fn=<NllLossBackward>)



 32%|███▏      | 241/742 [47:55<1:41:14, 12.12s/it][A

loss: tensor(0.6762, device='cuda:1', grad_fn=<NllLossBackward>)



 33%|███▎      | 242/742 [48:07<1:40:55, 12.11s/it][A

loss: tensor(0.6592, device='cuda:1', grad_fn=<NllLossBackward>)



 33%|███▎      | 243/742 [48:19<1:40:49, 12.12s/it][A

loss: tensor(0.7105, device='cuda:1', grad_fn=<NllLossBackward>)



 33%|███▎      | 244/742 [48:31<1:40:31, 12.11s/it][A

loss: tensor(0.7046, device='cuda:1', grad_fn=<NllLossBackward>)



 33%|███▎      | 245/742 [48:44<1:40:25, 12.12s/it][A

loss: tensor(0.7292, device='cuda:1', grad_fn=<NllLossBackward>)



 33%|███▎      | 246/742 [48:56<1:40:23, 12.14s/it][A

loss: tensor(0.6815, device='cuda:1', grad_fn=<NllLossBackward>)



 33%|███▎      | 247/742 [49:08<1:40:07, 12.14s/it][A

loss: tensor(0.6273, device='cuda:1', grad_fn=<NllLossBackward>)



 33%|███▎      | 248/742 [49:20<1:39:58, 12.14s/it][A

loss: tensor(0.6136, device='cuda:1', grad_fn=<NllLossBackward>)



 34%|███▎      | 249/742 [49:32<1:39:53, 12.16s/it][A

loss: tensor(0.6445, device='cuda:1', grad_fn=<NllLossBackward>)



 34%|███▎      | 250/742 [49:45<1:40:05, 12.21s/it][A

loss: tensor(0.6811, device='cuda:1', grad_fn=<NllLossBackward>)



 34%|███▍      | 251/742 [49:57<1:40:07, 12.23s/it][A

loss: tensor(0.6978, device='cuda:1', grad_fn=<NllLossBackward>)



 34%|███▍      | 252/742 [50:09<1:40:12, 12.27s/it][A

loss: tensor(0.6978, device='cuda:1', grad_fn=<NllLossBackward>)



 34%|███▍      | 253/742 [50:21<1:39:28, 12.21s/it][A

loss: tensor(0.6218, device='cuda:1', grad_fn=<NllLossBackward>)



 34%|███▍      | 254/742 [50:33<1:38:33, 12.12s/it][A

loss: tensor(0.6649, device='cuda:1', grad_fn=<NllLossBackward>)



 34%|███▍      | 255/742 [50:45<1:37:40, 12.03s/it][A

loss: tensor(0.6961, device='cuda:1', grad_fn=<NllLossBackward>)



 35%|███▍      | 256/742 [50:57<1:37:13, 12.00s/it][A

loss: tensor(0.6637, device='cuda:1', grad_fn=<NllLossBackward>)



 35%|███▍      | 257/742 [51:09<1:36:59, 12.00s/it][A

loss: tensor(0.6581, device='cuda:1', grad_fn=<NllLossBackward>)



 35%|███▍      | 258/742 [51:21<1:36:42, 11.99s/it][A

loss: tensor(0.6171, device='cuda:1', grad_fn=<NllLossBackward>)



 35%|███▍      | 259/742 [51:33<1:36:27, 11.98s/it][A

loss: tensor(0.5491, device='cuda:1', grad_fn=<NllLossBackward>)



 35%|███▌      | 260/742 [51:45<1:35:54, 11.94s/it][A

loss: tensor(0.5990, device='cuda:1', grad_fn=<NllLossBackward>)



 35%|███▌      | 261/742 [51:57<1:35:37, 11.93s/it][A

loss: tensor(0.6726, device='cuda:1', grad_fn=<NllLossBackward>)



 35%|███▌      | 262/742 [52:09<1:35:29, 11.94s/it][A

loss: tensor(0.5913, device='cuda:1', grad_fn=<NllLossBackward>)



 35%|███▌      | 263/742 [52:21<1:35:28, 11.96s/it][A

loss: tensor(0.6407, device='cuda:1', grad_fn=<NllLossBackward>)



 36%|███▌      | 264/742 [52:32<1:35:01, 11.93s/it][A

loss: tensor(0.6307, device='cuda:1', grad_fn=<NllLossBackward>)



 36%|███▌      | 265/742 [52:44<1:34:47, 11.92s/it][A

loss: tensor(0.5847, device='cuda:1', grad_fn=<NllLossBackward>)



 36%|███▌      | 266/742 [52:56<1:34:36, 11.93s/it][A

loss: tensor(0.6408, device='cuda:1', grad_fn=<NllLossBackward>)



 36%|███▌      | 267/742 [53:08<1:34:29, 11.94s/it][A

loss: tensor(0.7198, device='cuda:1', grad_fn=<NllLossBackward>)



 36%|███▌      | 268/742 [53:20<1:34:13, 11.93s/it][A

loss: tensor(0.6487, device='cuda:1', grad_fn=<NllLossBackward>)



 36%|███▋      | 269/742 [53:32<1:34:13, 11.95s/it][A

loss: tensor(0.5687, device='cuda:1', grad_fn=<NllLossBackward>)



 36%|███▋      | 270/742 [53:44<1:34:04, 11.96s/it][A

loss: tensor(0.7093, device='cuda:1', grad_fn=<NllLossBackward>)



 37%|███▋      | 271/742 [53:56<1:33:49, 11.95s/it][A

loss: tensor(0.7728, device='cuda:1', grad_fn=<NllLossBackward>)



 37%|███▋      | 272/742 [54:08<1:33:52, 11.98s/it][A

loss: tensor(0.7901, device='cuda:1', grad_fn=<NllLossBackward>)



 37%|███▋      | 273/742 [54:20<1:33:35, 11.97s/it][A

loss: tensor(0.7588, device='cuda:1', grad_fn=<NllLossBackward>)



 37%|███▋      | 274/742 [54:32<1:33:28, 11.98s/it][A

loss: tensor(0.6683, device='cuda:1', grad_fn=<NllLossBackward>)



 37%|███▋      | 275/742 [54:44<1:33:22, 12.00s/it][A

loss: tensor(0.5885, device='cuda:1', grad_fn=<NllLossBackward>)



 37%|███▋      | 276/742 [54:56<1:33:09, 12.00s/it][A

loss: tensor(0.7082, device='cuda:1', grad_fn=<NllLossBackward>)



 37%|███▋      | 277/742 [55:08<1:32:57, 11.99s/it][A

loss: tensor(0.6690, device='cuda:1', grad_fn=<NllLossBackward>)



 37%|███▋      | 278/742 [55:20<1:32:17, 11.93s/it][A

loss: tensor(0.7073, device='cuda:1', grad_fn=<NllLossBackward>)



 38%|███▊      | 279/742 [55:32<1:32:11, 11.95s/it][A

loss: tensor(0.7083, device='cuda:1', grad_fn=<NllLossBackward>)



 38%|███▊      | 280/742 [55:44<1:31:55, 11.94s/it][A

loss: tensor(0.7287, device='cuda:1', grad_fn=<NllLossBackward>)



 38%|███▊      | 281/742 [55:56<1:31:47, 11.95s/it][A

loss: tensor(0.7910, device='cuda:1', grad_fn=<NllLossBackward>)



 38%|███▊      | 282/742 [56:08<1:31:25, 11.93s/it][A

loss: tensor(0.6775, device='cuda:1', grad_fn=<NllLossBackward>)



 38%|███▊      | 283/742 [56:20<1:31:13, 11.93s/it][A

loss: tensor(0.6492, device='cuda:1', grad_fn=<NllLossBackward>)



 38%|███▊      | 284/742 [56:32<1:31:05, 11.93s/it][A

loss: tensor(0.5867, device='cuda:1', grad_fn=<NllLossBackward>)



 38%|███▊      | 285/742 [56:44<1:30:56, 11.94s/it][A

loss: tensor(0.5924, device='cuda:1', grad_fn=<NllLossBackward>)



 39%|███▊      | 286/742 [56:55<1:30:33, 11.92s/it][A

loss: tensor(0.6249, device='cuda:1', grad_fn=<NllLossBackward>)



 39%|███▊      | 287/742 [57:07<1:30:25, 11.92s/it][A

loss: tensor(0.7411, device='cuda:1', grad_fn=<NllLossBackward>)



 39%|███▉      | 288/742 [57:19<1:30:12, 11.92s/it][A

loss: tensor(0.7165, device='cuda:1', grad_fn=<NllLossBackward>)



 39%|███▉      | 289/742 [57:31<1:30:13, 11.95s/it][A

loss: tensor(0.7739, device='cuda:1', grad_fn=<NllLossBackward>)



 39%|███▉      | 290/742 [57:43<1:29:58, 11.94s/it][A

loss: tensor(0.8220, device='cuda:1', grad_fn=<NllLossBackward>)



 39%|███▉      | 291/742 [57:55<1:29:50, 11.95s/it][A

loss: tensor(0.7136, device='cuda:1', grad_fn=<NllLossBackward>)



 39%|███▉      | 292/742 [58:07<1:29:44, 11.97s/it][A

loss: tensor(0.6493, device='cuda:1', grad_fn=<NllLossBackward>)



 39%|███▉      | 293/742 [58:19<1:29:30, 11.96s/it][A

loss: tensor(0.6084, device='cuda:1', grad_fn=<NllLossBackward>)



 40%|███▉      | 294/742 [58:31<1:29:16, 11.96s/it][A

loss: tensor(0.7485, device='cuda:1', grad_fn=<NllLossBackward>)



 40%|███▉      | 295/742 [58:43<1:28:52, 11.93s/it][A

loss: tensor(0.6798, device='cuda:1', grad_fn=<NllLossBackward>)



 40%|███▉      | 296/742 [58:55<1:28:48, 11.95s/it][A

loss: tensor(0.7660, device='cuda:1', grad_fn=<NllLossBackward>)



 40%|████      | 297/742 [59:07<1:28:46, 11.97s/it][A

loss: tensor(0.7615, device='cuda:1', grad_fn=<NllLossBackward>)



 40%|████      | 298/742 [59:19<1:28:31, 11.96s/it][A

loss: tensor(0.7787, device='cuda:1', grad_fn=<NllLossBackward>)



 40%|████      | 299/742 [59:31<1:28:01, 11.92s/it][A

loss: tensor(0.6848, device='cuda:1', grad_fn=<NllLossBackward>)



 40%|████      | 300/742 [59:43<1:27:57, 11.94s/it][A

loss: tensor(0.6552, device='cuda:1', grad_fn=<NllLossBackward>)



 41%|████      | 301/742 [59:55<1:28:00, 11.97s/it][A

loss: tensor(0.6641, device='cuda:1', grad_fn=<NllLossBackward>)



 41%|████      | 302/742 [1:00:07<1:27:49, 11.98s/it][A

loss: tensor(0.6103, device='cuda:1', grad_fn=<NllLossBackward>)



 41%|████      | 303/742 [1:00:19<1:27:34, 11.97s/it][A

loss: tensor(0.7013, device='cuda:1', grad_fn=<NllLossBackward>)



 41%|████      | 304/742 [1:00:30<1:27:03, 11.93s/it][A

loss: tensor(0.6545, device='cuda:1', grad_fn=<NllLossBackward>)



 41%|████      | 305/742 [1:00:42<1:26:56, 11.94s/it][A

loss: tensor(0.7405, device='cuda:1', grad_fn=<NllLossBackward>)



 41%|████      | 306/742 [1:00:55<1:27:04, 11.98s/it][A

loss: tensor(0.6391, device='cuda:1', grad_fn=<NllLossBackward>)



 41%|████▏     | 307/742 [1:01:07<1:27:06, 12.01s/it][A

loss: tensor(0.6902, device='cuda:1', grad_fn=<NllLossBackward>)



 42%|████▏     | 308/742 [1:01:19<1:26:36, 11.97s/it][A

loss: tensor(0.6869, device='cuda:1', grad_fn=<NllLossBackward>)



 42%|████▏     | 309/742 [1:01:30<1:26:18, 11.96s/it][A

loss: tensor(0.7105, device='cuda:1', grad_fn=<NllLossBackward>)



 42%|████▏     | 310/742 [1:01:42<1:26:01, 11.95s/it][A

loss: tensor(0.7269, device='cuda:1', grad_fn=<NllLossBackward>)



 42%|████▏     | 311/742 [1:01:54<1:26:09, 11.99s/it][A

loss: tensor(0.7066, device='cuda:1', grad_fn=<NllLossBackward>)



 42%|████▏     | 312/742 [1:02:06<1:26:01, 12.00s/it][A

loss: tensor(0.6867, device='cuda:1', grad_fn=<NllLossBackward>)



 42%|████▏     | 313/742 [1:02:18<1:25:24, 11.95s/it][A

loss: tensor(0.7198, device='cuda:1', grad_fn=<NllLossBackward>)



 42%|████▏     | 314/742 [1:02:30<1:25:09, 11.94s/it][A

loss: tensor(0.6863, device='cuda:1', grad_fn=<NllLossBackward>)



 42%|████▏     | 315/742 [1:02:42<1:24:57, 11.94s/it][A

loss: tensor(0.6250, device='cuda:1', grad_fn=<NllLossBackward>)



 43%|████▎     | 316/742 [1:02:54<1:24:52, 11.95s/it][A

loss: tensor(0.6972, device='cuda:1', grad_fn=<NllLossBackward>)



 43%|████▎     | 317/742 [1:03:06<1:24:31, 11.93s/it][A

loss: tensor(0.7294, device='cuda:1', grad_fn=<NllLossBackward>)



 43%|████▎     | 318/742 [1:03:18<1:24:16, 11.93s/it][A

loss: tensor(0.6748, device='cuda:1', grad_fn=<NllLossBackward>)



 43%|████▎     | 319/742 [1:03:30<1:24:11, 11.94s/it][A

loss: tensor(0.6479, device='cuda:1', grad_fn=<NllLossBackward>)



 43%|████▎     | 320/742 [1:03:42<1:23:58, 11.94s/it][A

loss: tensor(0.6813, device='cuda:1', grad_fn=<NllLossBackward>)



 43%|████▎     | 321/742 [1:03:54<1:23:40, 11.92s/it][A

loss: tensor(0.5804, device='cuda:1', grad_fn=<NllLossBackward>)



 43%|████▎     | 322/742 [1:04:06<1:23:33, 11.94s/it][A

loss: tensor(0.7154, device='cuda:1', grad_fn=<NllLossBackward>)



 44%|████▎     | 323/742 [1:04:18<1:23:19, 11.93s/it][A

loss: tensor(0.6865, device='cuda:1', grad_fn=<NllLossBackward>)



 44%|████▎     | 324/742 [1:04:30<1:23:05, 11.93s/it][A

loss: tensor(0.7041, device='cuda:1', grad_fn=<NllLossBackward>)



 44%|████▍     | 325/742 [1:04:41<1:22:55, 11.93s/it][A

loss: tensor(0.6886, device='cuda:1', grad_fn=<NllLossBackward>)



 44%|████▍     | 326/742 [1:04:53<1:22:50, 11.95s/it][A

loss: tensor(0.7185, device='cuda:1', grad_fn=<NllLossBackward>)



 44%|████▍     | 327/742 [1:05:06<1:22:52, 11.98s/it][A

loss: tensor(0.6833, device='cuda:1', grad_fn=<NllLossBackward>)



 44%|████▍     | 328/742 [1:05:18<1:22:44, 11.99s/it][A

loss: tensor(0.6923, device='cuda:1', grad_fn=<NllLossBackward>)



 44%|████▍     | 329/742 [1:05:30<1:22:36, 12.00s/it][A

loss: tensor(0.6563, device='cuda:1', grad_fn=<NllLossBackward>)



 44%|████▍     | 330/742 [1:05:42<1:22:16, 11.98s/it][A

loss: tensor(0.6268, device='cuda:1', grad_fn=<NllLossBackward>)



 45%|████▍     | 331/742 [1:05:53<1:22:02, 11.98s/it][A

loss: tensor(0.6818, device='cuda:1', grad_fn=<NllLossBackward>)



 45%|████▍     | 332/742 [1:06:05<1:21:47, 11.97s/it][A

loss: tensor(0.6907, device='cuda:1', grad_fn=<NllLossBackward>)



 45%|████▍     | 333/742 [1:06:17<1:21:35, 11.97s/it][A

loss: tensor(0.6488, device='cuda:1', grad_fn=<NllLossBackward>)



 45%|████▌     | 334/742 [1:06:29<1:21:10, 11.94s/it][A

loss: tensor(0.6068, device='cuda:1', grad_fn=<NllLossBackward>)



 45%|████▌     | 335/742 [1:06:41<1:20:57, 11.94s/it][A

loss: tensor(0.6654, device='cuda:1', grad_fn=<NllLossBackward>)



 45%|████▌     | 336/742 [1:06:53<1:20:51, 11.95s/it][A

loss: tensor(0.6059, device='cuda:1', grad_fn=<NllLossBackward>)



 45%|████▌     | 337/742 [1:07:05<1:20:44, 11.96s/it][A

loss: tensor(0.6808, device='cuda:1', grad_fn=<NllLossBackward>)



 46%|████▌     | 338/742 [1:07:17<1:20:40, 11.98s/it][A

loss: tensor(0.7206, device='cuda:1', grad_fn=<NllLossBackward>)



 46%|████▌     | 339/742 [1:07:29<1:20:11, 11.94s/it][A

loss: tensor(0.7409, device='cuda:1', grad_fn=<NllLossBackward>)



 46%|████▌     | 340/742 [1:07:41<1:20:02, 11.95s/it][A

loss: tensor(0.6152, device='cuda:1', grad_fn=<NllLossBackward>)



 46%|████▌     | 341/742 [1:07:53<1:19:55, 11.96s/it][A

loss: tensor(0.6647, device='cuda:1', grad_fn=<NllLossBackward>)



 46%|████▌     | 342/742 [1:08:05<1:19:47, 11.97s/it][A

loss: tensor(0.6342, device='cuda:1', grad_fn=<NllLossBackward>)



 46%|████▌     | 343/742 [1:08:17<1:19:21, 11.93s/it][A

loss: tensor(0.6053, device='cuda:1', grad_fn=<NllLossBackward>)



 46%|████▋     | 344/742 [1:08:29<1:19:16, 11.95s/it][A

loss: tensor(0.6963, device='cuda:1', grad_fn=<NllLossBackward>)



 46%|████▋     | 345/742 [1:08:41<1:19:00, 11.94s/it][A

loss: tensor(0.6222, device='cuda:1', grad_fn=<NllLossBackward>)



 47%|████▋     | 346/742 [1:08:53<1:18:50, 11.95s/it][A

loss: tensor(0.6429, device='cuda:1', grad_fn=<NllLossBackward>)



 47%|████▋     | 347/742 [1:09:05<1:18:31, 11.93s/it][A

loss: tensor(0.6264, device='cuda:1', grad_fn=<NllLossBackward>)



 47%|████▋     | 348/742 [1:09:16<1:18:19, 11.93s/it][A

loss: tensor(0.6186, device='cuda:1', grad_fn=<NllLossBackward>)



 47%|████▋     | 349/742 [1:09:28<1:18:06, 11.93s/it][A

loss: tensor(0.6102, device='cuda:1', grad_fn=<NllLossBackward>)



 47%|████▋     | 350/742 [1:09:40<1:17:59, 11.94s/it][A

loss: tensor(0.6534, device='cuda:1', grad_fn=<NllLossBackward>)



 47%|████▋     | 351/742 [1:09:52<1:17:48, 11.94s/it][A

loss: tensor(0.6659, device='cuda:1', grad_fn=<NllLossBackward>)



 47%|████▋     | 352/742 [1:10:04<1:17:29, 11.92s/it][A

loss: tensor(0.7220, device='cuda:1', grad_fn=<NllLossBackward>)



 48%|████▊     | 353/742 [1:10:16<1:17:24, 11.94s/it][A

loss: tensor(0.6390, device='cuda:1', grad_fn=<NllLossBackward>)



 48%|████▊     | 354/742 [1:10:28<1:17:23, 11.97s/it][A

loss: tensor(0.6650, device='cuda:1', grad_fn=<NllLossBackward>)



 48%|████▊     | 355/742 [1:10:40<1:17:18, 11.98s/it][A

loss: tensor(0.6002, device='cuda:1', grad_fn=<NllLossBackward>)



 48%|████▊     | 356/742 [1:10:52<1:17:07, 11.99s/it][A

loss: tensor(0.7944, device='cuda:1', grad_fn=<NllLossBackward>)



 48%|████▊     | 357/742 [1:11:04<1:16:39, 11.95s/it][A

loss: tensor(0.6059, device='cuda:1', grad_fn=<NllLossBackward>)



 48%|████▊     | 358/742 [1:11:16<1:16:18, 11.92s/it][A

loss: tensor(0.6423, device='cuda:1', grad_fn=<NllLossBackward>)



 48%|████▊     | 359/742 [1:11:28<1:16:05, 11.92s/it][A

loss: tensor(0.6487, device='cuda:1', grad_fn=<NllLossBackward>)



 49%|████▊     | 360/742 [1:11:40<1:15:56, 11.93s/it][A

loss: tensor(0.5719, device='cuda:1', grad_fn=<NllLossBackward>)



 49%|████▊     | 361/742 [1:11:52<1:15:35, 11.91s/it][A

loss: tensor(0.6684, device='cuda:1', grad_fn=<NllLossBackward>)



 49%|████▉     | 362/742 [1:12:04<1:15:27, 11.92s/it][A

loss: tensor(0.6723, device='cuda:1', grad_fn=<NllLossBackward>)



 49%|████▉     | 363/742 [1:12:16<1:15:21, 11.93s/it][A

loss: tensor(0.6435, device='cuda:1', grad_fn=<NllLossBackward>)



 49%|████▉     | 364/742 [1:12:28<1:15:10, 11.93s/it][A

loss: tensor(0.6915, device='cuda:1', grad_fn=<NllLossBackward>)



 49%|████▉     | 365/742 [1:12:39<1:14:57, 11.93s/it][A

loss: tensor(0.6657, device='cuda:1', grad_fn=<NllLossBackward>)



 49%|████▉     | 366/742 [1:12:51<1:14:47, 11.93s/it][A

loss: tensor(0.6175, device='cuda:1', grad_fn=<NllLossBackward>)



 49%|████▉     | 367/742 [1:13:03<1:14:29, 11.92s/it][A

loss: tensor(0.6576, device='cuda:1', grad_fn=<NllLossBackward>)



 50%|████▉     | 368/742 [1:13:15<1:14:19, 11.92s/it][A

loss: tensor(0.7219, device='cuda:1', grad_fn=<NllLossBackward>)



 50%|████▉     | 369/742 [1:13:27<1:14:00, 11.90s/it][A

loss: tensor(0.6587, device='cuda:1', grad_fn=<NllLossBackward>)



 50%|████▉     | 370/742 [1:13:39<1:14:01, 11.94s/it][A

loss: tensor(0.6762, device='cuda:1', grad_fn=<NllLossBackward>)



 50%|█████     | 371/742 [1:13:51<1:14:02, 11.97s/it][A

loss: tensor(0.7146, device='cuda:1', grad_fn=<NllLossBackward>)



 50%|█████     | 372/742 [1:14:03<1:13:46, 11.96s/it][A

loss: tensor(0.6715, device='cuda:1', grad_fn=<NllLossBackward>)



 50%|█████     | 373/742 [1:14:15<1:13:43, 11.99s/it][A

loss: tensor(0.7715, device='cuda:1', grad_fn=<NllLossBackward>)



 50%|█████     | 374/742 [1:14:27<1:13:16, 11.95s/it][A

loss: tensor(0.7007, device='cuda:1', grad_fn=<NllLossBackward>)



 51%|█████     | 375/742 [1:14:39<1:13:09, 11.96s/it][A

loss: tensor(0.6009, device='cuda:1', grad_fn=<NllLossBackward>)



 51%|█████     | 376/742 [1:14:51<1:12:58, 11.96s/it][A

loss: tensor(0.5715, device='cuda:1', grad_fn=<NllLossBackward>)



 51%|█████     | 377/742 [1:15:03<1:12:48, 11.97s/it][A

loss: tensor(0.6090, device='cuda:1', grad_fn=<NllLossBackward>)



 51%|█████     | 378/742 [1:15:15<1:12:29, 11.95s/it][A

loss: tensor(0.6491, device='cuda:1', grad_fn=<NllLossBackward>)



 51%|█████     | 379/742 [1:15:27<1:12:15, 11.94s/it][A

loss: tensor(0.7095, device='cuda:1', grad_fn=<NllLossBackward>)



 51%|█████     | 380/742 [1:15:39<1:12:09, 11.96s/it][A

loss: tensor(0.6520, device='cuda:1', grad_fn=<NllLossBackward>)



 51%|█████▏    | 381/742 [1:15:51<1:11:57, 11.96s/it][A

loss: tensor(0.6874, device='cuda:1', grad_fn=<NllLossBackward>)



 51%|█████▏    | 382/742 [1:16:03<1:11:47, 11.96s/it][A

loss: tensor(0.7984, device='cuda:1', grad_fn=<NllLossBackward>)



 52%|█████▏    | 383/742 [1:16:14<1:11:19, 11.92s/it][A

loss: tensor(0.5858, device='cuda:1', grad_fn=<NllLossBackward>)



 52%|█████▏    | 384/742 [1:16:26<1:11:05, 11.92s/it][A

loss: tensor(0.6520, device='cuda:1', grad_fn=<NllLossBackward>)



 52%|█████▏    | 385/742 [1:16:38<1:11:08, 11.96s/it][A

loss: tensor(0.6343, device='cuda:1', grad_fn=<NllLossBackward>)



 52%|█████▏    | 386/742 [1:16:50<1:10:58, 11.96s/it][A

loss: tensor(0.6430, device='cuda:1', grad_fn=<NllLossBackward>)



 52%|█████▏    | 387/742 [1:17:02<1:10:33, 11.93s/it][A

loss: tensor(0.6492, device='cuda:1', grad_fn=<NllLossBackward>)



 52%|█████▏    | 388/742 [1:17:14<1:10:22, 11.93s/it][A

loss: tensor(0.6309, device='cuda:1', grad_fn=<NllLossBackward>)



 52%|█████▏    | 389/742 [1:17:26<1:10:08, 11.92s/it][A

loss: tensor(0.6736, device='cuda:1', grad_fn=<NllLossBackward>)



 53%|█████▎    | 390/742 [1:17:38<1:10:03, 11.94s/it][A

loss: tensor(0.6717, device='cuda:1', grad_fn=<NllLossBackward>)



 53%|█████▎    | 391/742 [1:17:50<1:09:56, 11.96s/it][A

loss: tensor(0.5880, device='cuda:1', grad_fn=<NllLossBackward>)



 53%|█████▎    | 392/742 [1:18:02<1:09:31, 11.92s/it][A

loss: tensor(0.6199, device='cuda:1', grad_fn=<NllLossBackward>)



 53%|█████▎    | 393/742 [1:18:14<1:09:21, 11.92s/it][A

loss: tensor(0.6497, device='cuda:1', grad_fn=<NllLossBackward>)



 53%|█████▎    | 394/742 [1:18:26<1:09:12, 11.93s/it][A

loss: tensor(0.5646, device='cuda:1', grad_fn=<NllLossBackward>)



 53%|█████▎    | 395/742 [1:18:38<1:09:09, 11.96s/it][A

loss: tensor(0.6435, device='cuda:1', grad_fn=<NllLossBackward>)



 53%|█████▎    | 396/742 [1:18:50<1:08:49, 11.93s/it][A

loss: tensor(0.7112, device='cuda:1', grad_fn=<NllLossBackward>)



 54%|█████▎    | 397/742 [1:19:02<1:08:37, 11.93s/it][A

loss: tensor(0.7361, device='cuda:1', grad_fn=<NllLossBackward>)



 54%|█████▎    | 398/742 [1:19:14<1:08:26, 11.94s/it][A

loss: tensor(0.6486, device='cuda:1', grad_fn=<NllLossBackward>)



 54%|█████▍    | 399/742 [1:19:26<1:08:17, 11.95s/it][A

loss: tensor(0.6955, device='cuda:1', grad_fn=<NllLossBackward>)



 54%|█████▍    | 400/742 [1:19:37<1:08:00, 11.93s/it][A

loss: tensor(0.7439, device='cuda:1', grad_fn=<NllLossBackward>)



 54%|█████▍    | 401/742 [1:19:49<1:07:48, 11.93s/it][A

loss: tensor(0.6383, device='cuda:1', grad_fn=<NllLossBackward>)



 54%|█████▍    | 402/742 [1:20:01<1:07:36, 11.93s/it][A

loss: tensor(0.6679, device='cuda:1', grad_fn=<NllLossBackward>)



 54%|█████▍    | 403/742 [1:20:13<1:07:26, 11.94s/it][A

loss: tensor(0.6089, device='cuda:1', grad_fn=<NllLossBackward>)



 54%|█████▍    | 404/742 [1:20:25<1:07:14, 11.94s/it][A

loss: tensor(0.6682, device='cuda:1', grad_fn=<NllLossBackward>)



 55%|█████▍    | 405/742 [1:20:37<1:06:55, 11.91s/it][A

loss: tensor(0.5374, device='cuda:1', grad_fn=<NllLossBackward>)



 55%|█████▍    | 406/742 [1:20:49<1:06:49, 11.93s/it][A

loss: tensor(0.7602, device='cuda:1', grad_fn=<NllLossBackward>)



 55%|█████▍    | 407/742 [1:21:01<1:06:34, 11.92s/it][A

loss: tensor(0.7234, device='cuda:1', grad_fn=<NllLossBackward>)



 55%|█████▍    | 408/742 [1:21:13<1:06:27, 11.94s/it][A

loss: tensor(0.6092, device='cuda:1', grad_fn=<NllLossBackward>)



 55%|█████▌    | 409/742 [1:21:25<1:06:12, 11.93s/it][A

loss: tensor(0.6970, device='cuda:1', grad_fn=<NllLossBackward>)



 55%|█████▌    | 410/742 [1:21:37<1:06:08, 11.95s/it][A

loss: tensor(0.7072, device='cuda:1', grad_fn=<NllLossBackward>)



 55%|█████▌    | 411/742 [1:21:49<1:06:04, 11.98s/it][A

loss: tensor(0.6165, device='cuda:1', grad_fn=<NllLossBackward>)



 56%|█████▌    | 412/742 [1:22:01<1:05:47, 11.96s/it][A

loss: tensor(0.5645, device='cuda:1', grad_fn=<NllLossBackward>)



 56%|█████▌    | 413/742 [1:22:13<1:05:29, 11.94s/it][A

loss: tensor(0.6669, device='cuda:1', grad_fn=<NllLossBackward>)



 56%|█████▌    | 414/742 [1:22:25<1:05:19, 11.95s/it][A

loss: tensor(0.6596, device='cuda:1', grad_fn=<NllLossBackward>)



 56%|█████▌    | 415/742 [1:22:37<1:05:13, 11.97s/it][A

loss: tensor(0.7016, device='cuda:1', grad_fn=<NllLossBackward>)



 56%|█████▌    | 416/742 [1:22:49<1:05:00, 11.97s/it][A

loss: tensor(0.6613, device='cuda:1', grad_fn=<NllLossBackward>)



 56%|█████▌    | 417/742 [1:23:01<1:04:52, 11.98s/it][A

loss: tensor(0.7438, device='cuda:1', grad_fn=<NllLossBackward>)



 56%|█████▋    | 418/742 [1:23:12<1:04:27, 11.94s/it][A

loss: tensor(0.6747, device='cuda:1', grad_fn=<NllLossBackward>)



 56%|█████▋    | 419/742 [1:23:24<1:04:21, 11.95s/it][A

loss: tensor(0.7628, device='cuda:1', grad_fn=<NllLossBackward>)



 57%|█████▋    | 420/742 [1:23:36<1:04:17, 11.98s/it][A

loss: tensor(0.6001, device='cuda:1', grad_fn=<NllLossBackward>)



 57%|█████▋    | 421/742 [1:23:49<1:04:09, 11.99s/it][A

loss: tensor(0.7524, device='cuda:1', grad_fn=<NllLossBackward>)



 57%|█████▋    | 422/742 [1:24:00<1:03:47, 11.96s/it][A

loss: tensor(0.6308, device='cuda:1', grad_fn=<NllLossBackward>)



 57%|█████▋    | 423/742 [1:24:12<1:03:35, 11.96s/it][A

loss: tensor(0.6687, device='cuda:1', grad_fn=<NllLossBackward>)



 57%|█████▋    | 424/742 [1:24:24<1:03:26, 11.97s/it][A

loss: tensor(0.7033, device='cuda:1', grad_fn=<NllLossBackward>)



 57%|█████▋    | 425/742 [1:24:36<1:03:15, 11.97s/it][A

loss: tensor(0.7355, device='cuda:1', grad_fn=<NllLossBackward>)



 57%|█████▋    | 426/742 [1:24:48<1:02:55, 11.95s/it][A

loss: tensor(0.6956, device='cuda:1', grad_fn=<NllLossBackward>)



 58%|█████▊    | 427/742 [1:25:00<1:02:43, 11.95s/it][A

loss: tensor(0.7026, device='cuda:1', grad_fn=<NllLossBackward>)



 58%|█████▊    | 428/742 [1:25:12<1:02:32, 11.95s/it][A

loss: tensor(0.6484, device='cuda:1', grad_fn=<NllLossBackward>)



 58%|█████▊    | 429/742 [1:25:24<1:02:25, 11.97s/it][A

loss: tensor(0.7276, device='cuda:1', grad_fn=<NllLossBackward>)



 58%|█████▊    | 430/742 [1:25:36<1:02:21, 11.99s/it][A

loss: tensor(0.6035, device='cuda:1', grad_fn=<NllLossBackward>)



 58%|█████▊    | 431/742 [1:25:48<1:01:59, 11.96s/it][A

loss: tensor(0.7085, device='cuda:1', grad_fn=<NllLossBackward>)



 58%|█████▊    | 432/742 [1:26:00<1:01:49, 11.96s/it][A

loss: tensor(0.6180, device='cuda:1', grad_fn=<NllLossBackward>)



 58%|█████▊    | 433/742 [1:26:12<1:01:32, 11.95s/it][A

loss: tensor(0.6301, device='cuda:1', grad_fn=<NllLossBackward>)



 58%|█████▊    | 434/742 [1:26:24<1:01:28, 11.97s/it][A

loss: tensor(0.6094, device='cuda:1', grad_fn=<NllLossBackward>)



 59%|█████▊    | 435/742 [1:26:36<1:01:22, 12.00s/it][A

loss: tensor(0.6854, device='cuda:1', grad_fn=<NllLossBackward>)



 59%|█████▉    | 436/742 [1:26:48<1:00:59, 11.96s/it][A

loss: tensor(0.6025, device='cuda:1', grad_fn=<NllLossBackward>)



 59%|█████▉    | 437/742 [1:27:00<1:00:51, 11.97s/it][A

loss: tensor(0.6904, device='cuda:1', grad_fn=<NllLossBackward>)



 59%|█████▉    | 438/742 [1:27:12<1:00:35, 11.96s/it][A

loss: tensor(0.6749, device='cuda:1', grad_fn=<NllLossBackward>)



 59%|█████▉    | 439/742 [1:27:24<1:00:23, 11.96s/it][A

loss: tensor(0.6548, device='cuda:1', grad_fn=<NllLossBackward>)



 59%|█████▉    | 440/742 [1:27:36<1:00:01, 11.93s/it][A

loss: tensor(0.6319, device='cuda:1', grad_fn=<NllLossBackward>)



 59%|█████▉    | 441/742 [1:27:48<59:51, 11.93s/it]  [A

loss: tensor(0.6173, device='cuda:1', grad_fn=<NllLossBackward>)



 60%|█████▉    | 442/742 [1:28:00<59:41, 11.94s/it][A

loss: tensor(0.7243, device='cuda:1', grad_fn=<NllLossBackward>)



 60%|█████▉    | 443/742 [1:28:12<59:33, 11.95s/it][A

loss: tensor(0.6768, device='cuda:1', grad_fn=<NllLossBackward>)



 60%|█████▉    | 444/742 [1:28:23<59:18, 11.94s/it][A

loss: tensor(0.6667, device='cuda:1', grad_fn=<NllLossBackward>)



 60%|█████▉    | 445/742 [1:28:35<59:13, 11.97s/it][A

loss: tensor(0.6549, device='cuda:1', grad_fn=<NllLossBackward>)



 60%|██████    | 446/742 [1:28:47<59:01, 11.96s/it][A

loss: tensor(0.6252, device='cuda:1', grad_fn=<NllLossBackward>)



 60%|██████    | 447/742 [1:28:59<58:52, 11.97s/it][A

loss: tensor(0.6070, device='cuda:1', grad_fn=<NllLossBackward>)



 60%|██████    | 448/742 [1:29:11<58:32, 11.95s/it][A

loss: tensor(0.6516, device='cuda:1', grad_fn=<NllLossBackward>)



 61%|██████    | 449/742 [1:29:23<58:25, 11.96s/it][A

loss: tensor(0.6817, device='cuda:1', grad_fn=<NllLossBackward>)



 61%|██████    | 450/742 [1:29:35<58:15, 11.97s/it][A

loss: tensor(0.7557, device='cuda:1', grad_fn=<NllLossBackward>)



 61%|██████    | 451/742 [1:29:47<58:02, 11.97s/it][A

loss: tensor(0.6887, device='cuda:1', grad_fn=<NllLossBackward>)



 61%|██████    | 452/742 [1:29:59<57:52, 11.97s/it][A

loss: tensor(0.5691, device='cuda:1', grad_fn=<NllLossBackward>)



 61%|██████    | 453/742 [1:30:11<57:28, 11.93s/it][A

loss: tensor(0.6474, device='cuda:1', grad_fn=<NllLossBackward>)



 61%|██████    | 454/742 [1:30:23<57:19, 11.94s/it][A

loss: tensor(0.6706, device='cuda:1', grad_fn=<NllLossBackward>)



 61%|██████▏   | 455/742 [1:30:35<57:11, 11.96s/it][A

loss: tensor(0.5833, device='cuda:1', grad_fn=<NllLossBackward>)



 61%|██████▏   | 456/742 [1:30:47<57:03, 11.97s/it][A

loss: tensor(0.6519, device='cuda:1', grad_fn=<NllLossBackward>)



 62%|██████▏   | 457/742 [1:30:59<56:50, 11.97s/it][A

loss: tensor(0.5767, device='cuda:1', grad_fn=<NllLossBackward>)



 62%|██████▏   | 458/742 [1:31:11<56:41, 11.98s/it][A

loss: tensor(0.6880, device='cuda:1', grad_fn=<NllLossBackward>)



 62%|██████▏   | 459/742 [1:31:23<56:38, 12.01s/it][A

loss: tensor(0.6148, device='cuda:1', grad_fn=<NllLossBackward>)



 62%|██████▏   | 460/742 [1:31:35<56:23, 12.00s/it][A

loss: tensor(0.6050, device='cuda:1', grad_fn=<NllLossBackward>)



 62%|██████▏   | 461/742 [1:31:47<56:06, 11.98s/it][A

loss: tensor(0.5213, device='cuda:1', grad_fn=<NllLossBackward>)



 62%|██████▏   | 462/742 [1:31:59<55:44, 11.94s/it][A

loss: tensor(0.6242, device='cuda:1', grad_fn=<NllLossBackward>)



 62%|██████▏   | 463/742 [1:32:11<55:30, 11.94s/it][A

loss: tensor(0.5856, device='cuda:1', grad_fn=<NllLossBackward>)



 63%|██████▎   | 464/742 [1:32:23<55:21, 11.95s/it][A

loss: tensor(0.7665, device='cuda:1', grad_fn=<NllLossBackward>)



 63%|██████▎   | 465/742 [1:32:35<55:09, 11.95s/it][A

loss: tensor(0.6692, device='cuda:1', grad_fn=<NllLossBackward>)



 63%|██████▎   | 466/742 [1:32:47<54:47, 11.91s/it][A

loss: tensor(0.7486, device='cuda:1', grad_fn=<NllLossBackward>)



 63%|██████▎   | 467/742 [1:32:58<54:39, 11.92s/it][A

loss: tensor(0.6596, device='cuda:1', grad_fn=<NllLossBackward>)



 63%|██████▎   | 468/742 [1:33:10<54:25, 11.92s/it][A

loss: tensor(0.7053, device='cuda:1', grad_fn=<NllLossBackward>)



 63%|██████▎   | 469/742 [1:33:22<54:24, 11.96s/it][A

loss: tensor(0.5630, device='cuda:1', grad_fn=<NllLossBackward>)



 63%|██████▎   | 470/742 [1:33:34<54:18, 11.98s/it][A

loss: tensor(0.6322, device='cuda:1', grad_fn=<NllLossBackward>)



 63%|██████▎   | 471/742 [1:33:46<53:55, 11.94s/it][A

loss: tensor(0.6194, device='cuda:1', grad_fn=<NllLossBackward>)



 64%|██████▎   | 472/742 [1:33:58<53:45, 11.95s/it][A

loss: tensor(0.6613, device='cuda:1', grad_fn=<NllLossBackward>)



 64%|██████▎   | 473/742 [1:34:10<53:34, 11.95s/it][A

loss: tensor(0.6068, device='cuda:1', grad_fn=<NllLossBackward>)



 64%|██████▍   | 474/742 [1:34:22<53:29, 11.98s/it][A

loss: tensor(0.7082, device='cuda:1', grad_fn=<NllLossBackward>)



 64%|██████▍   | 475/742 [1:34:34<53:13, 11.96s/it][A

loss: tensor(0.6883, device='cuda:1', grad_fn=<NllLossBackward>)



 64%|██████▍   | 476/742 [1:34:46<53:00, 11.96s/it][A

loss: tensor(0.6309, device='cuda:1', grad_fn=<NllLossBackward>)



 64%|██████▍   | 477/742 [1:34:58<52:48, 11.96s/it][A

loss: tensor(0.7410, device='cuda:1', grad_fn=<NllLossBackward>)



 64%|██████▍   | 478/742 [1:35:10<52:39, 11.97s/it][A

loss: tensor(0.8668, device='cuda:1', grad_fn=<NllLossBackward>)



 65%|██████▍   | 479/742 [1:35:22<52:25, 11.96s/it][A

loss: tensor(0.7136, device='cuda:1', grad_fn=<NllLossBackward>)



 65%|██████▍   | 480/742 [1:35:34<52:17, 11.97s/it][A

loss: tensor(0.6473, device='cuda:1', grad_fn=<NllLossBackward>)



 65%|██████▍   | 481/742 [1:35:46<52:04, 11.97s/it][A

loss: tensor(0.6323, device='cuda:1', grad_fn=<NllLossBackward>)



 65%|██████▍   | 482/742 [1:35:58<51:52, 11.97s/it][A

loss: tensor(0.4793, device='cuda:1', grad_fn=<NllLossBackward>)



 65%|██████▌   | 483/742 [1:36:10<51:40, 11.97s/it][A

loss: tensor(0.5687, device='cuda:1', grad_fn=<NllLossBackward>)



 65%|██████▌   | 484/742 [1:36:22<51:24, 11.96s/it][A

loss: tensor(0.6621, device='cuda:1', grad_fn=<NllLossBackward>)



 65%|██████▌   | 485/742 [1:36:34<51:13, 11.96s/it][A

loss: tensor(0.5259, device='cuda:1', grad_fn=<NllLossBackward>)



 65%|██████▌   | 486/742 [1:36:46<51:02, 11.96s/it][A

loss: tensor(0.6357, device='cuda:1', grad_fn=<NllLossBackward>)



 66%|██████▌   | 487/742 [1:36:58<50:53, 11.97s/it][A

loss: tensor(0.5847, device='cuda:1', grad_fn=<NllLossBackward>)



 66%|██████▌   | 488/742 [1:37:10<50:36, 11.95s/it][A

loss: tensor(0.6206, device='cuda:1', grad_fn=<NllLossBackward>)



 66%|██████▌   | 489/742 [1:37:22<50:26, 11.96s/it][A

loss: tensor(0.7057, device='cuda:1', grad_fn=<NllLossBackward>)



 66%|██████▌   | 490/742 [1:37:34<50:16, 11.97s/it][A

loss: tensor(0.6983, device='cuda:1', grad_fn=<NllLossBackward>)



 66%|██████▌   | 491/742 [1:37:46<50:03, 11.97s/it][A

loss: tensor(0.6663, device='cuda:1', grad_fn=<NllLossBackward>)



 66%|██████▋   | 492/742 [1:37:58<49:45, 11.94s/it][A

loss: tensor(0.6918, device='cuda:1', grad_fn=<NllLossBackward>)



 66%|██████▋   | 493/742 [1:38:09<49:35, 11.95s/it][A

loss: tensor(0.6720, device='cuda:1', grad_fn=<NllLossBackward>)



 67%|██████▋   | 494/742 [1:38:22<49:28, 11.97s/it][A

loss: tensor(0.7385, device='cuda:1', grad_fn=<NllLossBackward>)



 67%|██████▋   | 495/742 [1:38:34<49:19, 11.98s/it][A

loss: tensor(0.4922, device='cuda:1', grad_fn=<NllLossBackward>)



 67%|██████▋   | 496/742 [1:38:45<49:05, 11.98s/it][A

loss: tensor(0.6356, device='cuda:1', grad_fn=<NllLossBackward>)



 67%|██████▋   | 497/742 [1:38:57<48:46, 11.95s/it][A

loss: tensor(0.5569, device='cuda:1', grad_fn=<NllLossBackward>)



 67%|██████▋   | 498/742 [1:39:09<48:37, 11.96s/it][A

loss: tensor(0.6940, device='cuda:1', grad_fn=<NllLossBackward>)



 67%|██████▋   | 499/742 [1:39:21<48:30, 11.98s/it][A

loss: tensor(0.8116, device='cuda:1', grad_fn=<NllLossBackward>)



 67%|██████▋   | 500/742 [1:39:33<48:23, 12.00s/it][A

loss: tensor(0.7129, device='cuda:1', grad_fn=<NllLossBackward>)



 68%|██████▊   | 501/742 [1:39:45<48:05, 11.97s/it][A

loss: tensor(0.6435, device='cuda:1', grad_fn=<NllLossBackward>)



 68%|██████▊   | 502/742 [1:39:57<47:53, 11.97s/it][A

loss: tensor(0.6246, device='cuda:1', grad_fn=<NllLossBackward>)



 68%|██████▊   | 503/742 [1:40:09<47:43, 11.98s/it][A

loss: tensor(0.7292, device='cuda:1', grad_fn=<NllLossBackward>)



 68%|██████▊   | 504/742 [1:40:21<47:33, 11.99s/it][A

loss: tensor(0.4881, device='cuda:1', grad_fn=<NllLossBackward>)



 68%|██████▊   | 505/742 [1:40:33<47:15, 11.96s/it][A

loss: tensor(0.6634, device='cuda:1', grad_fn=<NllLossBackward>)



 68%|██████▊   | 506/742 [1:40:45<47:05, 11.97s/it][A

loss: tensor(0.7260, device='cuda:1', grad_fn=<NllLossBackward>)



 68%|██████▊   | 507/742 [1:40:57<46:55, 11.98s/it][A

loss: tensor(0.5394, device='cuda:1', grad_fn=<NllLossBackward>)



 68%|██████▊   | 508/742 [1:41:09<46:42, 11.98s/it][A

loss: tensor(0.5970, device='cuda:1', grad_fn=<NllLossBackward>)



 69%|██████▊   | 509/742 [1:41:21<46:34, 12.00s/it][A

loss: tensor(0.7319, device='cuda:1', grad_fn=<NllLossBackward>)



 69%|██████▊   | 510/742 [1:41:33<46:15, 11.96s/it][A

loss: tensor(0.6711, device='cuda:1', grad_fn=<NllLossBackward>)



 69%|██████▉   | 511/742 [1:41:45<46:03, 11.96s/it][A

loss: tensor(0.7943, device='cuda:1', grad_fn=<NllLossBackward>)



 69%|██████▉   | 512/742 [1:41:57<45:46, 11.94s/it][A

loss: tensor(0.5277, device='cuda:1', grad_fn=<NllLossBackward>)



 69%|██████▉   | 513/742 [1:42:09<45:38, 11.96s/it][A

loss: tensor(0.6935, device='cuda:1', grad_fn=<NllLossBackward>)



 69%|██████▉   | 514/742 [1:42:21<45:28, 11.97s/it][A

loss: tensor(0.6186, device='cuda:1', grad_fn=<NllLossBackward>)



 69%|██████▉   | 515/742 [1:42:33<45:11, 11.95s/it][A

loss: tensor(0.6387, device='cuda:1', grad_fn=<NllLossBackward>)



 70%|██████▉   | 516/742 [1:42:45<45:00, 11.95s/it][A

loss: tensor(0.6780, device='cuda:1', grad_fn=<NllLossBackward>)



 70%|██████▉   | 517/742 [1:42:57<44:48, 11.95s/it][A

loss: tensor(0.6868, device='cuda:1', grad_fn=<NllLossBackward>)



 70%|██████▉   | 518/742 [1:43:09<44:42, 11.98s/it][A

loss: tensor(0.6555, device='cuda:1', grad_fn=<NllLossBackward>)



 70%|██████▉   | 519/742 [1:43:21<44:25, 11.95s/it][A

loss: tensor(0.7220, device='cuda:1', grad_fn=<NllLossBackward>)



 70%|███████   | 520/742 [1:43:33<44:14, 11.96s/it][A

loss: tensor(0.7135, device='cuda:1', grad_fn=<NllLossBackward>)



 70%|███████   | 521/742 [1:43:45<44:03, 11.96s/it][A

loss: tensor(0.6968, device='cuda:1', grad_fn=<NllLossBackward>)



 70%|███████   | 522/742 [1:43:57<43:54, 11.97s/it][A

loss: tensor(0.5912, device='cuda:1', grad_fn=<NllLossBackward>)



 70%|███████   | 523/742 [1:44:09<43:38, 11.95s/it][A

loss: tensor(0.8357, device='cuda:1', grad_fn=<NllLossBackward>)



 71%|███████   | 524/742 [1:44:21<43:27, 11.96s/it][A

loss: tensor(0.6529, device='cuda:1', grad_fn=<NllLossBackward>)



 71%|███████   | 525/742 [1:44:32<43:13, 11.95s/it][A

loss: tensor(0.5390, device='cuda:1', grad_fn=<NllLossBackward>)



 71%|███████   | 526/742 [1:44:44<43:06, 11.97s/it][A

loss: tensor(0.5969, device='cuda:1', grad_fn=<NllLossBackward>)



 71%|███████   | 527/742 [1:44:56<42:52, 11.96s/it][A

loss: tensor(0.5673, device='cuda:1', grad_fn=<NllLossBackward>)



 71%|███████   | 528/742 [1:45:08<42:46, 11.99s/it][A

loss: tensor(0.6109, device='cuda:1', grad_fn=<NllLossBackward>)



 71%|███████▏  | 529/742 [1:45:21<42:36, 12.00s/it][A

loss: tensor(0.5450, device='cuda:1', grad_fn=<NllLossBackward>)



 71%|███████▏  | 530/742 [1:45:32<42:22, 11.99s/it][A

loss: tensor(0.5334, device='cuda:1', grad_fn=<NllLossBackward>)



 72%|███████▏  | 531/742 [1:45:45<42:13, 12.01s/it][A

loss: tensor(0.6077, device='cuda:1', grad_fn=<NllLossBackward>)



 72%|███████▏  | 532/742 [1:45:56<41:52, 11.96s/it][A

loss: tensor(0.6302, device='cuda:1', grad_fn=<NllLossBackward>)



 72%|███████▏  | 533/742 [1:46:08<41:42, 11.97s/it][A

loss: tensor(0.7684, device='cuda:1', grad_fn=<NllLossBackward>)



 72%|███████▏  | 534/742 [1:46:20<41:33, 11.99s/it][A

loss: tensor(0.5028, device='cuda:1', grad_fn=<NllLossBackward>)



 72%|███████▏  | 535/742 [1:46:32<41:19, 11.98s/it][A

loss: tensor(0.6909, device='cuda:1', grad_fn=<NllLossBackward>)



 72%|███████▏  | 536/742 [1:46:44<41:03, 11.96s/it][A

loss: tensor(0.5871, device='cuda:1', grad_fn=<NllLossBackward>)



 72%|███████▏  | 537/742 [1:46:56<40:51, 11.96s/it][A

loss: tensor(0.6836, device='cuda:1', grad_fn=<NllLossBackward>)



 73%|███████▎  | 538/742 [1:47:08<40:42, 11.97s/it][A

loss: tensor(0.6789, device='cuda:1', grad_fn=<NllLossBackward>)



 73%|███████▎  | 539/742 [1:47:20<40:31, 11.98s/it][A

loss: tensor(0.7774, device='cuda:1', grad_fn=<NllLossBackward>)



 73%|███████▎  | 540/742 [1:47:32<40:20, 11.98s/it][A

loss: tensor(0.7358, device='cuda:1', grad_fn=<NllLossBackward>)



 73%|███████▎  | 541/742 [1:47:44<40:01, 11.95s/it][A

loss: tensor(0.7402, device='cuda:1', grad_fn=<NllLossBackward>)



 73%|███████▎  | 542/742 [1:47:56<39:48, 11.94s/it][A

loss: tensor(0.6803, device='cuda:1', grad_fn=<NllLossBackward>)



 73%|███████▎  | 543/742 [1:48:08<39:42, 11.97s/it][A

loss: tensor(0.7272, device='cuda:1', grad_fn=<NllLossBackward>)



 73%|███████▎  | 544/742 [1:48:20<39:33, 11.99s/it][A

loss: tensor(0.4895, device='cuda:1', grad_fn=<NllLossBackward>)



 73%|███████▎  | 545/742 [1:48:32<39:16, 11.96s/it][A

loss: tensor(0.6550, device='cuda:1', grad_fn=<NllLossBackward>)



 74%|███████▎  | 546/742 [1:48:44<39:03, 11.96s/it][A

loss: tensor(0.6907, device='cuda:1', grad_fn=<NllLossBackward>)



 74%|███████▎  | 547/742 [1:48:56<38:54, 11.97s/it][A

loss: tensor(0.6502, device='cuda:1', grad_fn=<NllLossBackward>)



 74%|███████▍  | 548/742 [1:49:08<38:44, 11.98s/it][A

loss: tensor(0.6728, device='cuda:1', grad_fn=<NllLossBackward>)



 74%|███████▍  | 549/742 [1:49:20<38:35, 12.00s/it][A

loss: tensor(0.7220, device='cuda:1', grad_fn=<NllLossBackward>)



 74%|███████▍  | 550/742 [1:49:32<38:19, 11.97s/it][A

loss: tensor(0.5196, device='cuda:1', grad_fn=<NllLossBackward>)



 74%|███████▍  | 551/742 [1:49:44<38:08, 11.98s/it][A

loss: tensor(0.6600, device='cuda:1', grad_fn=<NllLossBackward>)



 74%|███████▍  | 552/742 [1:49:56<37:54, 11.97s/it][A

loss: tensor(0.6752, device='cuda:1', grad_fn=<NllLossBackward>)



 75%|███████▍  | 553/742 [1:50:08<37:47, 12.00s/it][A

loss: tensor(0.6824, device='cuda:1', grad_fn=<NllLossBackward>)



 75%|███████▍  | 554/742 [1:50:20<37:29, 11.96s/it][A

loss: tensor(0.5439, device='cuda:1', grad_fn=<NllLossBackward>)



 75%|███████▍  | 555/742 [1:50:32<37:15, 11.96s/it][A

loss: tensor(0.8073, device='cuda:1', grad_fn=<NllLossBackward>)



 75%|███████▍  | 556/742 [1:50:44<37:05, 11.97s/it][A

loss: tensor(0.6779, device='cuda:1', grad_fn=<NllLossBackward>)



 75%|███████▌  | 557/742 [1:50:56<36:55, 11.97s/it][A

loss: tensor(0.6084, device='cuda:1', grad_fn=<NllLossBackward>)



 75%|███████▌  | 558/742 [1:51:08<36:41, 11.96s/it][A

loss: tensor(0.7629, device='cuda:1', grad_fn=<NllLossBackward>)



 75%|███████▌  | 559/742 [1:51:20<36:26, 11.95s/it][A

loss: tensor(0.7472, device='cuda:1', grad_fn=<NllLossBackward>)



 75%|███████▌  | 560/742 [1:51:31<36:13, 11.94s/it][A

loss: tensor(0.6533, device='cuda:1', grad_fn=<NllLossBackward>)



 76%|███████▌  | 561/742 [1:51:43<36:03, 11.95s/it][A

loss: tensor(0.6803, device='cuda:1', grad_fn=<NllLossBackward>)



 76%|███████▌  | 562/742 [1:51:55<35:52, 11.96s/it][A

loss: tensor(0.6690, device='cuda:1', grad_fn=<NllLossBackward>)



 76%|███████▌  | 563/742 [1:52:07<35:34, 11.93s/it][A

loss: tensor(0.7324, device='cuda:1', grad_fn=<NllLossBackward>)



 76%|███████▌  | 564/742 [1:52:19<35:27, 11.95s/it][A

loss: tensor(0.5521, device='cuda:1', grad_fn=<NllLossBackward>)



 76%|███████▌  | 565/742 [1:52:31<35:13, 11.94s/it][A

loss: tensor(0.6110, device='cuda:1', grad_fn=<NllLossBackward>)



 76%|███████▋  | 566/742 [1:52:43<35:02, 11.95s/it][A

loss: tensor(0.6284, device='cuda:1', grad_fn=<NllLossBackward>)



 76%|███████▋  | 567/742 [1:52:55<34:50, 11.95s/it][A

loss: tensor(0.5397, device='cuda:1', grad_fn=<NllLossBackward>)



 77%|███████▋  | 568/742 [1:53:07<34:39, 11.95s/it][A

loss: tensor(0.5972, device='cuda:1', grad_fn=<NllLossBackward>)



 77%|███████▋  | 569/742 [1:53:19<34:31, 11.97s/it][A

loss: tensor(0.6857, device='cuda:1', grad_fn=<NllLossBackward>)



 77%|███████▋  | 570/742 [1:53:31<34:19, 11.97s/it][A

loss: tensor(0.6948, device='cuda:1', grad_fn=<NllLossBackward>)



 77%|███████▋  | 571/742 [1:53:43<34:04, 11.95s/it][A

loss: tensor(0.7075, device='cuda:1', grad_fn=<NllLossBackward>)



 77%|███████▋  | 572/742 [1:53:55<33:54, 11.97s/it][A

loss: tensor(0.7430, device='cuda:1', grad_fn=<NllLossBackward>)



 77%|███████▋  | 573/742 [1:54:07<33:43, 11.98s/it][A

loss: tensor(0.6247, device='cuda:1', grad_fn=<NllLossBackward>)



 77%|███████▋  | 574/742 [1:54:19<33:36, 12.00s/it][A

loss: tensor(0.5292, device='cuda:1', grad_fn=<NllLossBackward>)



 77%|███████▋  | 575/742 [1:54:31<33:23, 12.00s/it][A

loss: tensor(0.7597, device='cuda:1', grad_fn=<NllLossBackward>)



 78%|███████▊  | 576/742 [1:54:43<33:03, 11.95s/it][A

loss: tensor(0.5592, device='cuda:1', grad_fn=<NllLossBackward>)



 78%|███████▊  | 577/742 [1:54:55<32:51, 11.95s/it][A

loss: tensor(0.6227, device='cuda:1', grad_fn=<NllLossBackward>)



 78%|███████▊  | 578/742 [1:55:07<32:41, 11.96s/it][A

loss: tensor(0.4847, device='cuda:1', grad_fn=<NllLossBackward>)



 78%|███████▊  | 579/742 [1:55:19<32:31, 11.97s/it][A

loss: tensor(0.5780, device='cuda:1', grad_fn=<NllLossBackward>)



 78%|███████▊  | 580/742 [1:55:31<32:15, 11.95s/it][A

loss: tensor(0.6569, device='cuda:1', grad_fn=<NllLossBackward>)



 78%|███████▊  | 581/742 [1:55:43<32:04, 11.96s/it][A

loss: tensor(0.6620, device='cuda:1', grad_fn=<NllLossBackward>)



 78%|███████▊  | 582/742 [1:55:55<31:52, 11.95s/it][A

loss: tensor(0.7513, device='cuda:1', grad_fn=<NllLossBackward>)



 79%|███████▊  | 583/742 [1:56:07<31:39, 11.95s/it][A

loss: tensor(0.5247, device='cuda:1', grad_fn=<NllLossBackward>)



 79%|███████▊  | 584/742 [1:56:18<31:23, 11.92s/it][A

loss: tensor(0.6510, device='cuda:1', grad_fn=<NllLossBackward>)



 79%|███████▉  | 585/742 [1:56:30<31:12, 11.93s/it][A

loss: tensor(0.6665, device='cuda:1', grad_fn=<NllLossBackward>)



 79%|███████▉  | 586/742 [1:56:42<31:02, 11.94s/it][A

loss: tensor(0.6990, device='cuda:1', grad_fn=<NllLossBackward>)



 79%|███████▉  | 587/742 [1:56:54<30:51, 11.95s/it][A

loss: tensor(0.6861, device='cuda:1', grad_fn=<NllLossBackward>)



 79%|███████▉  | 588/742 [1:57:06<30:43, 11.97s/it][A

loss: tensor(0.6809, device='cuda:1', grad_fn=<NllLossBackward>)



 79%|███████▉  | 589/742 [1:57:18<30:29, 11.96s/it][A

loss: tensor(0.6865, device='cuda:1', grad_fn=<NllLossBackward>)



 80%|███████▉  | 590/742 [1:57:30<30:18, 11.96s/it][A

loss: tensor(0.7459, device='cuda:1', grad_fn=<NllLossBackward>)



 80%|███████▉  | 591/742 [1:57:42<30:08, 11.98s/it][A

loss: tensor(0.6930, device='cuda:1', grad_fn=<NllLossBackward>)



 80%|███████▉  | 592/742 [1:57:54<29:57, 11.98s/it][A

loss: tensor(0.6684, device='cuda:1', grad_fn=<NllLossBackward>)



 80%|███████▉  | 593/742 [1:58:06<29:48, 12.01s/it][A

loss: tensor(0.4576, device='cuda:1', grad_fn=<NllLossBackward>)



 80%|████████  | 594/742 [1:58:18<29:32, 11.97s/it][A

loss: tensor(0.5198, device='cuda:1', grad_fn=<NllLossBackward>)



 80%|████████  | 595/742 [1:58:30<29:19, 11.97s/it][A

loss: tensor(0.5985, device='cuda:1', grad_fn=<NllLossBackward>)



 80%|████████  | 596/742 [1:58:42<29:08, 11.97s/it][A

loss: tensor(0.6066, device='cuda:1', grad_fn=<NllLossBackward>)



 80%|████████  | 597/742 [1:58:54<28:57, 11.98s/it][A

loss: tensor(0.5677, device='cuda:1', grad_fn=<NllLossBackward>)



 81%|████████  | 598/742 [1:59:06<28:42, 11.96s/it][A

loss: tensor(0.6157, device='cuda:1', grad_fn=<NllLossBackward>)



 81%|████████  | 599/742 [1:59:18<28:29, 11.96s/it][A

loss: tensor(0.5965, device='cuda:1', grad_fn=<NllLossBackward>)



 81%|████████  | 600/742 [1:59:30<28:18, 11.96s/it][A

loss: tensor(0.6585, device='cuda:1', grad_fn=<NllLossBackward>)



 81%|████████  | 601/742 [1:59:42<28:05, 11.95s/it][A

loss: tensor(0.8130, device='cuda:1', grad_fn=<NllLossBackward>)



 81%|████████  | 602/742 [1:59:54<27:51, 11.94s/it][A

loss: tensor(0.5040, device='cuda:1', grad_fn=<NllLossBackward>)



 81%|████████▏ | 603/742 [2:00:06<27:44, 11.97s/it][A

loss: tensor(0.7430, device='cuda:1', grad_fn=<NllLossBackward>)



 81%|████████▏ | 604/742 [2:00:18<27:32, 11.97s/it][A

loss: tensor(0.6998, device='cuda:1', grad_fn=<NllLossBackward>)



 82%|████████▏ | 605/742 [2:00:30<27:21, 11.98s/it][A

loss: tensor(0.6128, device='cuda:1', grad_fn=<NllLossBackward>)



 82%|████████▏ | 606/742 [2:00:42<27:05, 11.95s/it][A

loss: tensor(0.7018, device='cuda:1', grad_fn=<NllLossBackward>)



 82%|████████▏ | 607/742 [2:00:54<26:55, 11.97s/it][A

loss: tensor(0.8417, device='cuda:1', grad_fn=<NllLossBackward>)



 82%|████████▏ | 608/742 [2:01:06<26:44, 11.97s/it][A

loss: tensor(0.7212, device='cuda:1', grad_fn=<NllLossBackward>)



 82%|████████▏ | 609/742 [2:01:18<26:32, 11.97s/it][A

loss: tensor(0.6421, device='cuda:1', grad_fn=<NllLossBackward>)



 82%|████████▏ | 610/742 [2:01:30<26:20, 11.97s/it][A

loss: tensor(0.8819, device='cuda:1', grad_fn=<NllLossBackward>)



 82%|████████▏ | 611/742 [2:01:41<26:04, 11.94s/it][A

loss: tensor(0.8384, device='cuda:1', grad_fn=<NllLossBackward>)



 82%|████████▏ | 612/742 [2:01:53<25:55, 11.96s/it][A

loss: tensor(0.7694, device='cuda:1', grad_fn=<NllLossBackward>)



 83%|████████▎ | 613/742 [2:02:06<25:47, 12.00s/it][A

loss: tensor(0.5876, device='cuda:1', grad_fn=<NllLossBackward>)



 83%|████████▎ | 614/742 [2:02:18<25:33, 11.98s/it][A

loss: tensor(0.7311, device='cuda:1', grad_fn=<NllLossBackward>)



 83%|████████▎ | 615/742 [2:02:29<25:21, 11.98s/it][A

loss: tensor(0.6944, device='cuda:1', grad_fn=<NllLossBackward>)



 83%|████████▎ | 616/742 [2:02:41<25:09, 11.98s/it][A

loss: tensor(0.4795, device='cuda:1', grad_fn=<NllLossBackward>)



 83%|████████▎ | 617/742 [2:02:53<24:58, 11.99s/it][A

loss: tensor(0.7698, device='cuda:1', grad_fn=<NllLossBackward>)



 83%|████████▎ | 618/742 [2:03:05<24:46, 11.99s/it][A

loss: tensor(0.8775, device='cuda:1', grad_fn=<NllLossBackward>)



 83%|████████▎ | 619/742 [2:03:18<24:36, 12.00s/it][A

loss: tensor(0.4949, device='cuda:1', grad_fn=<NllLossBackward>)



 84%|████████▎ | 620/742 [2:03:29<24:21, 11.98s/it][A

loss: tensor(0.5733, device='cuda:1', grad_fn=<NllLossBackward>)



 84%|████████▎ | 621/742 [2:03:41<24:09, 11.98s/it][A

loss: tensor(0.5957, device='cuda:1', grad_fn=<NllLossBackward>)



 84%|████████▍ | 622/742 [2:03:53<24:00, 12.00s/it][A

loss: tensor(0.5152, device='cuda:1', grad_fn=<NllLossBackward>)



 84%|████████▍ | 623/742 [2:04:05<23:48, 12.00s/it][A

loss: tensor(0.6271, device='cuda:1', grad_fn=<NllLossBackward>)



 84%|████████▍ | 624/742 [2:04:17<23:34, 11.98s/it][A

loss: tensor(0.6773, device='cuda:1', grad_fn=<NllLossBackward>)



 84%|████████▍ | 625/742 [2:04:29<23:22, 11.99s/it][A

loss: tensor(0.5069, device='cuda:1', grad_fn=<NllLossBackward>)



 84%|████████▍ | 626/742 [2:04:41<23:12, 12.00s/it][A

loss: tensor(0.6800, device='cuda:1', grad_fn=<NllLossBackward>)



 85%|████████▍ | 627/742 [2:04:54<23:03, 12.03s/it][A

loss: tensor(0.7052, device='cuda:1', grad_fn=<NllLossBackward>)



 85%|████████▍ | 628/742 [2:05:05<22:48, 12.01s/it][A

loss: tensor(0.7169, device='cuda:1', grad_fn=<NllLossBackward>)



 85%|████████▍ | 629/742 [2:05:17<22:25, 11.91s/it][A

loss: tensor(0.6016, device='cuda:1', grad_fn=<NllLossBackward>)



 85%|████████▍ | 630/742 [2:05:29<22:16, 11.94s/it][A

loss: tensor(0.5613, device='cuda:1', grad_fn=<NllLossBackward>)



 85%|████████▌ | 631/742 [2:05:41<22:12, 12.00s/it][A

loss: tensor(0.6531, device='cuda:1', grad_fn=<NllLossBackward>)



 85%|████████▌ | 632/742 [2:05:53<22:05, 12.05s/it][A

loss: tensor(0.5719, device='cuda:1', grad_fn=<NllLossBackward>)



 85%|████████▌ | 633/742 [2:06:06<21:53, 12.05s/it][A

loss: tensor(0.7097, device='cuda:1', grad_fn=<NllLossBackward>)



 85%|████████▌ | 634/742 [2:06:18<21:43, 12.07s/it][A

loss: tensor(0.8630, device='cuda:1', grad_fn=<NllLossBackward>)



 86%|████████▌ | 635/742 [2:06:30<21:34, 12.10s/it][A

loss: tensor(0.5879, device='cuda:1', grad_fn=<NllLossBackward>)



 86%|████████▌ | 636/742 [2:06:42<21:24, 12.12s/it][A

loss: tensor(0.7348, device='cuda:1', grad_fn=<NllLossBackward>)



 86%|████████▌ | 637/742 [2:06:54<21:14, 12.13s/it][A

loss: tensor(0.7770, device='cuda:1', grad_fn=<NllLossBackward>)



 86%|████████▌ | 638/742 [2:07:06<21:03, 12.15s/it][A

loss: tensor(0.6739, device='cuda:1', grad_fn=<NllLossBackward>)



 86%|████████▌ | 639/742 [2:07:19<20:51, 12.15s/it][A

loss: tensor(0.6861, device='cuda:1', grad_fn=<NllLossBackward>)



 86%|████████▋ | 640/742 [2:07:31<20:40, 12.16s/it][A

loss: tensor(0.5756, device='cuda:1', grad_fn=<NllLossBackward>)



 86%|████████▋ | 641/742 [2:07:43<20:28, 12.17s/it][A

loss: tensor(0.7198, device='cuda:1', grad_fn=<NllLossBackward>)



 87%|████████▋ | 642/742 [2:07:55<20:10, 12.11s/it][A

loss: tensor(0.7393, device='cuda:1', grad_fn=<NllLossBackward>)



 87%|████████▋ | 643/742 [2:08:07<19:57, 12.10s/it][A

loss: tensor(0.8028, device='cuda:1', grad_fn=<NllLossBackward>)



 87%|████████▋ | 644/742 [2:08:19<19:43, 12.07s/it][A

loss: tensor(0.6686, device='cuda:1', grad_fn=<NllLossBackward>)



 87%|████████▋ | 645/742 [2:08:31<19:30, 12.07s/it][A

loss: tensor(0.6566, device='cuda:1', grad_fn=<NllLossBackward>)



 87%|████████▋ | 646/742 [2:08:43<19:11, 11.99s/it][A

loss: tensor(0.5852, device='cuda:1', grad_fn=<NllLossBackward>)



 87%|████████▋ | 647/742 [2:08:55<18:58, 11.98s/it][A

loss: tensor(0.5953, device='cuda:1', grad_fn=<NllLossBackward>)



 87%|████████▋ | 648/742 [2:09:07<18:47, 11.99s/it][A

loss: tensor(0.6006, device='cuda:1', grad_fn=<NllLossBackward>)



 87%|████████▋ | 649/742 [2:09:19<18:34, 11.98s/it][A

loss: tensor(0.5690, device='cuda:1', grad_fn=<NllLossBackward>)



 88%|████████▊ | 650/742 [2:09:31<18:19, 11.95s/it][A

loss: tensor(0.5854, device='cuda:1', grad_fn=<NllLossBackward>)



 88%|████████▊ | 651/742 [2:09:43<18:06, 11.94s/it][A

loss: tensor(0.6501, device='cuda:1', grad_fn=<NllLossBackward>)



 88%|████████▊ | 652/742 [2:09:54<17:55, 11.94s/it][A

loss: tensor(0.7733, device='cuda:1', grad_fn=<NllLossBackward>)



 88%|████████▊ | 653/742 [2:10:06<17:43, 11.95s/it][A

loss: tensor(0.6292, device='cuda:1', grad_fn=<NllLossBackward>)



 88%|████████▊ | 654/742 [2:10:18<17:31, 11.95s/it][A

loss: tensor(0.8194, device='cuda:1', grad_fn=<NllLossBackward>)



 88%|████████▊ | 655/742 [2:10:30<17:16, 11.91s/it][A

loss: tensor(0.7254, device='cuda:1', grad_fn=<NllLossBackward>)



 88%|████████▊ | 656/742 [2:10:42<17:04, 11.91s/it][A

loss: tensor(0.7174, device='cuda:1', grad_fn=<NllLossBackward>)



 89%|████████▊ | 657/742 [2:10:54<16:52, 11.91s/it][A

loss: tensor(0.7764, device='cuda:1', grad_fn=<NllLossBackward>)



 89%|████████▊ | 658/742 [2:11:06<16:42, 11.94s/it][A

loss: tensor(0.6494, device='cuda:1', grad_fn=<NllLossBackward>)



 89%|████████▉ | 659/742 [2:11:18<16:28, 11.91s/it][A

loss: tensor(0.6058, device='cuda:1', grad_fn=<NllLossBackward>)



 89%|████████▉ | 660/742 [2:11:30<16:16, 11.91s/it][A

loss: tensor(0.7241, device='cuda:1', grad_fn=<NllLossBackward>)



 89%|████████▉ | 661/742 [2:11:42<16:04, 11.91s/it][A

loss: tensor(0.6901, device='cuda:1', grad_fn=<NllLossBackward>)



 89%|████████▉ | 662/742 [2:11:54<15:53, 11.92s/it][A

loss: tensor(0.5838, device='cuda:1', grad_fn=<NllLossBackward>)



 89%|████████▉ | 663/742 [2:12:06<15:40, 11.90s/it][A

loss: tensor(0.7057, device='cuda:1', grad_fn=<NllLossBackward>)



 89%|████████▉ | 664/742 [2:12:18<15:31, 11.95s/it][A

loss: tensor(0.8567, device='cuda:1', grad_fn=<NllLossBackward>)



 90%|████████▉ | 665/742 [2:12:30<15:21, 11.97s/it][A

loss: tensor(0.7063, device='cuda:1', grad_fn=<NllLossBackward>)



 90%|████████▉ | 666/742 [2:12:42<15:11, 12.00s/it][A

loss: tensor(0.6502, device='cuda:1', grad_fn=<NllLossBackward>)



 90%|████████▉ | 667/742 [2:12:54<15:02, 12.03s/it][A

loss: tensor(0.5780, device='cuda:1', grad_fn=<NllLossBackward>)



 90%|█████████ | 668/742 [2:13:06<14:47, 12.00s/it][A

loss: tensor(0.7377, device='cuda:1', grad_fn=<NllLossBackward>)



 90%|█████████ | 669/742 [2:13:18<14:34, 11.98s/it][A

loss: tensor(0.7206, device='cuda:1', grad_fn=<NllLossBackward>)



 90%|█████████ | 670/742 [2:13:30<14:21, 11.96s/it][A

loss: tensor(0.7062, device='cuda:1', grad_fn=<NllLossBackward>)



 90%|█████████ | 671/742 [2:13:42<14:09, 11.97s/it][A

loss: tensor(0.5883, device='cuda:1', grad_fn=<NllLossBackward>)



 91%|█████████ | 672/742 [2:13:53<13:57, 11.97s/it][A

loss: tensor(0.6584, device='cuda:1', grad_fn=<NllLossBackward>)



 91%|█████████ | 673/742 [2:14:05<13:43, 11.93s/it][A

loss: tensor(0.6223, device='cuda:1', grad_fn=<NllLossBackward>)



 91%|█████████ | 674/742 [2:14:17<13:31, 11.94s/it][A

loss: tensor(0.6217, device='cuda:1', grad_fn=<NllLossBackward>)



 91%|█████████ | 675/742 [2:14:29<13:20, 11.95s/it][A

loss: tensor(0.6857, device='cuda:1', grad_fn=<NllLossBackward>)



 91%|█████████ | 676/742 [2:14:41<13:08, 11.95s/it][A

loss: tensor(0.8010, device='cuda:1', grad_fn=<NllLossBackward>)



 91%|█████████ | 677/742 [2:14:53<12:54, 11.92s/it][A

loss: tensor(0.7329, device='cuda:1', grad_fn=<NllLossBackward>)



 91%|█████████▏| 678/742 [2:15:05<12:43, 11.93s/it][A

loss: tensor(0.6157, device='cuda:1', grad_fn=<NllLossBackward>)



 92%|█████████▏| 679/742 [2:15:17<12:34, 11.98s/it][A

loss: tensor(0.6712, device='cuda:1', grad_fn=<NllLossBackward>)



 92%|█████████▏| 680/742 [2:15:29<12:25, 12.03s/it][A

loss: tensor(0.6166, device='cuda:1', grad_fn=<NllLossBackward>)



 92%|█████████▏| 681/742 [2:15:41<12:14, 12.04s/it][A

loss: tensor(0.6491, device='cuda:1', grad_fn=<NllLossBackward>)



 92%|█████████▏| 682/742 [2:15:53<12:04, 12.07s/it][A

loss: tensor(0.6593, device='cuda:1', grad_fn=<NllLossBackward>)



 92%|█████████▏| 683/742 [2:16:06<11:53, 12.10s/it][A

loss: tensor(0.5939, device='cuda:1', grad_fn=<NllLossBackward>)



 92%|█████████▏| 684/742 [2:16:18<11:42, 12.11s/it][A

loss: tensor(0.7439, device='cuda:1', grad_fn=<NllLossBackward>)



 92%|█████████▏| 685/742 [2:16:30<11:29, 12.09s/it][A

loss: tensor(0.6039, device='cuda:1', grad_fn=<NllLossBackward>)



 92%|█████████▏| 686/742 [2:16:42<11:17, 12.11s/it][A

loss: tensor(0.6420, device='cuda:1', grad_fn=<NllLossBackward>)



 93%|█████████▎| 687/742 [2:16:54<11:04, 12.08s/it][A

loss: tensor(0.6553, device='cuda:1', grad_fn=<NllLossBackward>)



 93%|█████████▎| 688/742 [2:17:06<10:50, 12.05s/it][A

loss: tensor(0.8333, device='cuda:1', grad_fn=<NllLossBackward>)



 93%|█████████▎| 689/742 [2:17:18<10:38, 12.04s/it][A

loss: tensor(0.6695, device='cuda:1', grad_fn=<NllLossBackward>)



 93%|█████████▎| 690/742 [2:17:30<10:23, 11.99s/it][A

loss: tensor(0.6073, device='cuda:1', grad_fn=<NllLossBackward>)



 93%|█████████▎| 691/742 [2:17:42<10:12, 12.00s/it][A

loss: tensor(0.6310, device='cuda:1', grad_fn=<NllLossBackward>)



 93%|█████████▎| 692/742 [2:17:54<10:01, 12.03s/it][A

loss: tensor(0.6505, device='cuda:1', grad_fn=<NllLossBackward>)



 93%|█████████▎| 693/742 [2:18:06<09:51, 12.06s/it][A

loss: tensor(0.6476, device='cuda:1', grad_fn=<NllLossBackward>)



 94%|█████████▎| 694/742 [2:18:18<09:38, 12.06s/it][A

loss: tensor(0.7252, device='cuda:1', grad_fn=<NllLossBackward>)



 94%|█████████▎| 695/742 [2:18:30<09:27, 12.07s/it][A

loss: tensor(0.6676, device='cuda:1', grad_fn=<NllLossBackward>)



 94%|█████████▍| 696/742 [2:18:42<09:16, 12.10s/it][A

loss: tensor(0.5914, device='cuda:1', grad_fn=<NllLossBackward>)



 94%|█████████▍| 697/742 [2:18:55<09:05, 12.11s/it][A

loss: tensor(0.6279, device='cuda:1', grad_fn=<NllLossBackward>)



 94%|█████████▍| 698/742 [2:19:07<08:53, 12.12s/it][A

loss: tensor(0.5798, device='cuda:1', grad_fn=<NllLossBackward>)



 94%|█████████▍| 699/742 [2:19:19<08:38, 12.05s/it][A

loss: tensor(0.6717, device='cuda:1', grad_fn=<NllLossBackward>)



 94%|█████████▍| 700/742 [2:19:30<08:24, 12.01s/it][A

loss: tensor(0.6134, device='cuda:1', grad_fn=<NllLossBackward>)



 94%|█████████▍| 701/742 [2:19:42<08:12, 12.00s/it][A

loss: tensor(0.7031, device='cuda:1', grad_fn=<NllLossBackward>)



 95%|█████████▍| 702/742 [2:19:54<07:59, 11.99s/it][A

loss: tensor(0.5867, device='cuda:1', grad_fn=<NllLossBackward>)



 95%|█████████▍| 703/742 [2:20:06<07:46, 11.95s/it][A

loss: tensor(0.5995, device='cuda:1', grad_fn=<NllLossBackward>)



 95%|█████████▍| 704/742 [2:20:18<07:34, 11.96s/it][A

loss: tensor(0.7872, device='cuda:1', grad_fn=<NllLossBackward>)



 95%|█████████▌| 705/742 [2:20:30<07:23, 11.99s/it][A

loss: tensor(0.5278, device='cuda:1', grad_fn=<NllLossBackward>)



 95%|█████████▌| 706/742 [2:20:43<07:13, 12.06s/it][A

loss: tensor(0.5952, device='cuda:1', grad_fn=<NllLossBackward>)



 95%|█████████▌| 707/742 [2:20:55<07:03, 12.09s/it][A

loss: tensor(0.6291, device='cuda:1', grad_fn=<NllLossBackward>)



 95%|█████████▌| 708/742 [2:21:07<06:50, 12.07s/it][A

loss: tensor(0.6655, device='cuda:1', grad_fn=<NllLossBackward>)



 96%|█████████▌| 709/742 [2:21:19<06:38, 12.08s/it][A

loss: tensor(0.5519, device='cuda:1', grad_fn=<NllLossBackward>)



 96%|█████████▌| 710/742 [2:21:31<06:25, 12.05s/it][A

loss: tensor(0.6154, device='cuda:1', grad_fn=<NllLossBackward>)



 96%|█████████▌| 711/742 [2:21:43<06:14, 12.08s/it][A

loss: tensor(0.7291, device='cuda:1', grad_fn=<NllLossBackward>)



 96%|█████████▌| 712/742 [2:21:55<06:01, 12.06s/it][A

loss: tensor(0.8189, device='cuda:1', grad_fn=<NllLossBackward>)



 96%|█████████▌| 713/742 [2:22:07<05:49, 12.05s/it][A

loss: tensor(0.6887, device='cuda:1', grad_fn=<NllLossBackward>)



 96%|█████████▌| 714/742 [2:22:19<05:37, 12.05s/it][A

loss: tensor(0.6340, device='cuda:1', grad_fn=<NllLossBackward>)



 96%|█████████▋| 715/742 [2:22:31<05:25, 12.04s/it][A

loss: tensor(0.6783, device='cuda:1', grad_fn=<NllLossBackward>)



 96%|█████████▋| 716/742 [2:22:43<05:12, 12.03s/it][A

loss: tensor(0.6915, device='cuda:1', grad_fn=<NllLossBackward>)



 97%|█████████▋| 717/742 [2:22:55<05:01, 12.05s/it][A

loss: tensor(0.7316, device='cuda:1', grad_fn=<NllLossBackward>)



 97%|█████████▋| 718/742 [2:23:07<04:49, 12.06s/it][A

loss: tensor(0.6273, device='cuda:1', grad_fn=<NllLossBackward>)


In [None]:
# model

In [None]:
# early_patience = 5
# n_trained_epochs

In [None]:
print("total training epoch: ", n_trained_epochs)
print("\n", "train loss: ", train_avg_loss)
print("\n", "validation loss: ", val_avg_loss)

In [None]:
print("Plot of loss with epochs")
plt.plot(range(1, n_trained_epochs+1), train_avg_loss, label="train loss")
plt.plot(range(1, n_trained_epochs+1), val_avg_loss, label="val loss")
plt.title("Training Curve (lr={})".format(lr))
plt.xlabel("epochs")
plt.ylabel("Train Loss")
plt.legend(loc="best")
plt.show()

In [None]:
plt.plot(range(1, n_trained_epochs+1), train_avg_acc, label="train acc")
plt.plot(range(1, n_trained_epochs+1), val_avg_acc, label="val acc")
plt.title("Training Curve (lr={})".format(lr))
plt.xlabel("epochs")
plt.ylabel("Train/val Accuracy")
plt.legend(loc="best")
plt.show()

In [None]:
!ls model-causal-model/

In [None]:
#ls ./model-causal-model/model_3_finetuned-5-epochs-lr_0.001.pth

## SAVE MODEL (!!)

In [None]:
## saving the model 
#torch.save(model.state_dict(), "./model-causal-model/new_finetuned-{}-epochs-{}-lr-with-weighted-loss.pth".format(give epochs from early stopping, lr)) # early stopping saves model

### Load the model 
- if train and wants to evaluate: jsut use the `sameModelName`
- if want to laod a specific model - use next cell and write in double quote

In [None]:
# model_name = str(saveModelName)
# print(model_name)

In [None]:
# model_name = early
# model_name = "./model-causal-model/model_4_finetuned--6-epochs-lr_0.001.pth"


In [None]:
# loading the locally saved model

# device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# model = CausalityBERT()
# model.load_state_dict(torch.load(model_name))
## Move the model to the GPU 
model.to(device)
model.eval() # gettign in the eval mode 



## evaluation on the test dataset 



In [None]:
#loss_fn = CrossEntropyLoss()
test_loss = []
test_acc = []
test_prec = []
test_rec = []
test_f1 = []

test_acc_w = [] # weighted
test_prec_w = []
test_rec_w = []
test_f1_w = []

test_acc_b = [] # binary
test_prec_b = []
test_rec_b = []
test_f1_b = []

for batch in tqdm(test_loader):
    batch = tuple(batch[t].to(device) for t in batch)      # batch to GPU
    b_input_ids, b_input_mask, b_token_type_ids, b_labels = batch     # unpack inputs from dataloader

    with torch.no_grad():
        model.eval()
        logits = model(**{"input_ids":b_input_ids, "attention_mask":b_input_mask, "token_type_ids":b_token_type_ids}) # forward pass, calculates logit predictions 
    
    # move logits and labels to CPU
    logits = logits.detach().to('cpu').numpy()
    label_ids = b_labels.to('cpu').numpy()

    pred_flat = np.argmax(logits, axis=1).flatten()
    labels_flat = label_ids.flatten()
    
    metrics = compute_metrics(pred_flat, labels_flat, "macro")
    test_acc.append(metrics["accuracy"])
    test_prec.append(metrics["precision"])
    test_rec.append(metrics["recall"])
    test_f1.append(metrics["f1"])

    metrics = compute_metrics(pred_flat, labels_flat, "weighted")
    test_acc_w.append(metrics["accuracy"])
    test_prec_w.append(metrics["precision"])
    test_rec_w.append(metrics["recall"])
    test_f1_w.append(metrics["f1"])

    metrics = compute_metrics(pred_flat, labels_flat, "binary")
    test_acc_b.append(metrics["accuracy"])
    test_prec_b.append(metrics["precision"])
    test_rec_b.append(metrics["recall"])
    test_f1_b.append(metrics["f1"])

print(F'\n\ttest loss: {np.mean(test_loss)}')
print(F'\n\ttest acc macro: {np.mean(test_acc)}')
print(F'\n\ttest prec macro: {np.mean(test_prec)}')
print(F'\n\ttest rec macro: {np.mean(test_rec)}')
print(F'\n\ttest f1 macro: {np.mean(test_f1)}')
print()
print(F'\n\ttest acc weighted: {np.mean(test_acc_w)}')
print(F'\n\ttest prec weighted: {np.mean(test_prec_w)}')
print(F'\n\ttest rec weighted: {np.mean(test_rec_w)}')
print(F'\n\ttest f1 weighted: {np.mean(test_f1_w)}')
print()
print(F'\n\ttest acc binary: {np.mean(test_acc_b)}')
print(F'\n\ttest prec binary: {np.mean(test_prec_b)}')
print(F'\n\ttest rec binary: {np.mean(test_rec_b)}')
print(F'\n\ttest f1 binary: {np.mean(test_f1_b)}')


### Print predictions of last test set batch:

In [None]:
# take last batch of test set:

for i in range(len(batch)):
    tokens = tokenizer.convert_ids_to_tokens(b_input_ids[i])
    print("\nPadded Sentence:")
    print(tokens)
    print("prediction:", pred_flat[i])
    