In [1]:
import pandas as pd
import numpy as np
import spacy 
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, matthews_corrcoef
from transformers import BertForSequenceClassification, AutoTokenizer
from transformers import AdamW, get_linear_schedule_with_warmup
from tqdm import tqdm, trange
import random
import os
import torch.nn.functional as F
import torch
from torch.nn import CrossEntropyLoss
from torch.utils.data import DataLoader
import transformers
from tqdm import tqdm, trange
from utils import normalizeTweet, split_into_sentences, bio_tagging, create_training_data



# Workbook holding the manually annotated tweets. The path is relative to the notebook's
# working directory so the notebook does not depend on one machine's folder layout.
DATA_FILE = "Causality + hypoglycemia.xlsx"
DATA_SHEET = ">5000_samples_"

data_raw = pd.read_excel(DATA_FILE, sheet_name=DATA_SHEET)
print("Total count:", data_raw.shape[0])

# Keep only the rows that carry a "Causal association" label.
# `.copy()` makes `data` an independent frame: later cells add columns to it, and doing that
# on a filtered slice of the raw table is what triggers pandas' SettingWithCopyWarning.
data = data_raw[data_raw["Causal association"].notnull()].copy()
print("Labeled count:", data.shape[0])

data.head()

  data = pd.read_excel("Causality + hypoglycemia.xlsx", sheet_name=">5000_samples_")


Total count: 5434
Labeled count: 5000


Unnamed: 0,id,text,full_text,Intent,Cause,Effect,Causal association,Charline association0=no;1=yes,Remarks
0,908171203029868545,"tonight , I learned my older girl will back he...","tonight , I learned my older girl will back he...",,,,0.0,,
1,1203645589214367745,USER USER I knew diabetes and fibromyalgia wer...,USER USER I knew diabetes and fibromyalgia wer...,joke,,,0.0,,
2,1310596731063525376,⬇ ️ ⬇ ️ ⬇ ️ THIS ⬇ ️ ⬇ ️ ⬇ ️ My wife has type ...,⬇ ️ ⬇ ️ ⬇ ️ THIS ⬇ ️ ⬇ ️ ⬇ ️ My wife has type ...,mS,,,0.0,,
3,1125198453167022085,USER Cheers ! Have one for this diabetic too !,USER Cheers ! Have one for this diabetic too !,mS,,,0.0,,
4,1248600944138268673,USER Additionally the medicines are being char...,USER Additionally the medicines are being char...,,medicines are being charged at MRP,costing much higher,1.0,,


## Add BIO tags

In [2]:
# Space-separated tokens of the normalised tweet text, one list per row.
data["tokenized"] = [normalizeTweet(tweet).split(" ") for tweet in data["full_text"]]

# Tag list produced by bio_tagging from the tweet text and its annotated cause / effect.
data["bio_tags"] = [
    bio_tagging(full_text, cause, effect)
    for full_text, cause, effect in zip(data["full_text"], data["Cause"], data["Effect"])
]

data.head(20)

Unnamed: 0,id,text,full_text,Intent,Cause,Effect,Causal association,Charline association0=no;1=yes,Remarks,tokenized,bio_tags
0,908171203029868545,"tonight , I learned my older girl will back he...","tonight , I learned my older girl will back he...",,,,0.0,,,"[tonight, ,, I, learned, my, older, girl, will...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
1,1203645589214367745,USER USER I knew diabetes and fibromyalgia wer...,USER USER I knew diabetes and fibromyalgia wer...,joke,,,0.0,,,"[USER, USER, I, knew, diabetes, and, fibromyal...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
2,1310596731063525376,⬇ ️ ⬇ ️ ⬇ ️ THIS ⬇ ️ ⬇ ️ ⬇ ️ My wife has type ...,⬇ ️ ⬇ ️ ⬇ ️ THIS ⬇ ️ ⬇ ️ ⬇ ️ My wife has type ...,mS,,,0.0,,,"[:down_arrow:, :down_arrow:, :down_arrow:, THI...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
3,1125198453167022085,USER Cheers ! Have one for this diabetic too !,USER Cheers ! Have one for this diabetic too !,mS,,,0.0,,,"[USER, Cheers, !, Have, one, for, this, diabet...","[O, O, O, O, O, O, O, O, O, O]"
4,1248600944138268673,USER Additionally the medicines are being char...,USER Additionally the medicines are being char...,,medicines are being charged at MRP,costing much higher,1.0,,,"[USER, Additionally, the, medicines, are, bein...","[O, O, O, B-C, I-C, I-C, I-C, I-C, I-C, O, O, ..."
5,885136447774363649,USER USER We have those days Esp . if it inter...,USER USER We have those days Esp . if it inter...,msS,diabetic,hate,1.0,,,"[USER, USER, We, have, those, days, Esp, ., if...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
6,1320879577640652807,Why all of a sudden are people hungry and vuln...,Why all of a sudden are people hungry and vuln...,q,,,0.0,,,"[Why, all, of, a, sudden, are, people, hungry,...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
7,1187771905090211842,"i got lime for my glucose test , wasn't that b...","i got lime for my glucose test , wasn't that b...",,glucose test,nauseous,1.0,,,"[i, got, lime, for, my, glucose, test, ,, was,...","[O, O, O, O, O, B-C, I-C, O, O, O, O, O, O, O,..."
8,1174332944493817858,This stickur of Unkel Funny iz ware i am shave...,This stickur of Unkel Funny iz ware i am shave...,,,,0.0,,,"[This, stickur, of, Unkel, Funny, iz, ware, i,...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
9,1202836680501121024,For the second time in my life I gave myself i...,For the second time in my life I gave myself i...,mS,,,0.0,,,"[For, the, second, time, in, my, life, I, gave...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."


### split tweets into sentences => new dataframe with more rows

In [3]:
def get_start_end_index_of_sentence_in_tweet(tweet, sentence):
    """
    Locate the token sequence `sentence` inside the token sequence `tweet`.

    Parameters:
    - tweet: list of tokens of the whole tweet
    - sentence: list of tokens of one sentence, expected to occur contiguously in `tweet`

    Returns:
    (start, end) of the first occurrence, such that tweet[start:end] == sentence.
    If the sentence is empty or does not occur in the tweet, a diagnostic is printed and the
    sentinel (-1, -2) is returned; slicing a list with that sentinel yields an empty list.
    """
    sentence = list(sentence)
    n_tokens = len(sentence)

    if n_tokens > 0:
        # a window starting after this index would run past the end of the tweet
        last_possible_start = len(tweet) - n_tokens
        for start_index in range(last_possible_start + 1):
            end_index = start_index + n_tokens
            if list(tweet[start_index:end_index]) == sentence:
                return start_index, end_index

    # Reaching this point means the caller's assumption (sentence is part of the tweet) is broken.
    print("ERROR: StartIndex should have been found for sentence:")
    print("tweet:")
    print(tweet)
    print("sentence:")
    print(sentence)
    return -1, -2 # should not be returned


def _annotations_in_sentence(annotations, sentence):
    """Return the ';'-joined annotated spans that occur in `sentence`, or NaN if there are none."""
    if not isinstance(annotations, str): # missing annotation (NaN)
        return np.nan
    found = [span for span in annotations.split(";") if span in sentence]
    return ";".join(found) if found else np.nan


def _multiplicity_intent(causeInSentence, effectInSentence):
    """Return "mC", "mE" or "mC;mE" when the sentence holds several causes and/or effects, else ""."""
    markers = []
    if isinstance(causeInSentence, str) and len(causeInSentence.split(";")) > 1:
        markers.append("mC")
    if isinstance(effectInSentence, str) and len(effectInSentence.split(";")) > 1:
        markers.append("mE")
    return ";".join(markers)


def split_tweets_to_sentences(data):
    """
        Splits tweets into sentences and associates the appropriate intent, causes, effects and causal association
        to each sentence.

        Parameters:
        - data: dataframe with one tweet per row and the columns "full_text", "Intent", "Cause", "Effect",
                "Causal association", "tokenized" and "bio_tags"

        Returns:
        A new dataframe with one row per sentence and the columns "sentence", "Intent", "Cause", "Effect",
        "Causal association", "tokenized" and "bio_tags".

        Rules:
        - A tweet with a single sentence keeps its annotations; only the multi-sentence markers
          ("mS", "msS") are removed from its intent.
        - In a tweet with several sentences, each sentence only keeps the causes / effects whose text occurs
          in it, and is causal (1) only if it holds at least one cause and one effect.
          Its intent is "q" (question tweet and sentence ends with "?"), else "joke", else "neg" if the tweet
          has that intent, otherwise "mC" / "mE" / "mC;mE" for several causes / effects, otherwise "".

        Ex.:
        full_text                              | Intent | Cause | Effect | Causal association | ...
        --------------------------------------------------------------------------------------------
        what? type 1 causes insulin dependence | q;msS  | type 1|insulin dependence | 1       | ...

        New dataframe returned:
        sentence                               | Intent | Cause | Effect | Causal association | ...
        --------------------------------------------------------------------------------------------
        what?                                  |   q    |       |        |       0            | ...
        type 1 causes insulin dependence       |        | type 1| insulin dependence | 1       | ...
    """

    columns = ["sentence", "Intent", "Cause", "Effect", "Causal association", "tokenized", "bio_tags"]
    # Rows are collected in a plain list and turned into a dataframe once at the end:
    # DataFrame.append copied the whole frame on every call and no longer exists in pandas >= 2.0.
    records = []

    for _, row in data.iterrows():
        causes = row["Cause"]
        effects = row["Effect"]
        sentences = split_into_sentences(normalizeTweet(row["full_text"]))

        # single sentence in tweet
        if len(sentences) == 1:
            tweetIntent = row["Intent"]
            singleSentenceIntent = ""
            if isinstance(tweetIntent, str):
                if len(tweetIntent.split(";")) > 1:
                    # drop the multi-sentence markers, keep the remaining intents
                    singleSentenceIntent = tweetIntent.strip().replace(";msS", "").replace("msS;", "").replace(";mS", "").replace("mS;", "")
                elif tweetIntent not in ("mS", "msS"):
                    singleSentenceIntent = tweetIntent.strip()

            records.append({"sentence": sentences[0] # only one sentence
                          , "Intent": singleSentenceIntent
                          , "Cause" : causes
                          , "Effect": effects
                          , "Causal association" : row["Causal association"]
                          , "tokenized": row["tokenized"]
                          , "bio_tags": row["bio_tags"]})
            continue

        # tweet has several sentences
        intents = str(row["Intent"]).strip().split(";")

        for sentence in sentences:
            sent_tokenized = sentence.split(" ")

            causeInSentence = _annotations_in_sentence(causes, sentence)
            effectInSentence = _annotations_in_sentence(effects, sentence)
            causalAssociationInSentence = 1 if isinstance(causeInSentence, str) and isinstance(effectInSentence, str) else 0

            # cut the tweet-level tokens and BIO tags down to the span of this sentence
            startIndex, endIndex = get_start_end_index_of_sentence_in_tweet(row["tokenized"], sent_tokenized)
            sentence_tokenized = row["tokenized"][startIndex:endIndex]
            sentence_bio_tags = row["bio_tags"][startIndex:endIndex]

            if "q" in intents and sentence.endswith("?"): # current sentence is a question (safe for empty sentences)
                sentenceIntent = "q"
            elif "joke" in intents: # all sentences of a "joke" tweet keep the intent "joke"
                sentenceIntent = "joke"
            elif "neg" in intents: # all sentences of a "neg" tweet keep the intent "neg"
                sentenceIntent = "neg"
            else: # only the multiplicity of causes / effects is left to report
                sentenceIntent = _multiplicity_intent(causeInSentence, effectInSentence)

            records.append({"sentence": sentence
                          , "Intent": sentenceIntent
                          , "Cause" : causeInSentence
                          , "Effect": effectInSentence
                          , "Causal association" : causalAssociationInSentence
                          , "tokenized": sentence_tokenized
                          , "bio_tags": sentence_bio_tags})

    # dtype=object keeps the column dtypes the row-wise append used to produce
    return pd.DataFrame(records, columns=columns, dtype=object)
       
# sample: has one example for each possible "Intent" value
#allIntents = data["Intent"].value_counts().keys().tolist()
#sample = data[data["Intent"] == "mS"][0:1]
#for intent in allIntents:
#    sample = sample.append(data[data["Intent"] == intent][1:2])
#print(sample.shape)

#i = 19
#test = sample[i:i+1]
#dataSentences = split_tweets_to_sentences(test)
#dataSentences.head(30)
#test.head()

# Split every tweet of `data` into sentences and compare the row counts before and after.
n_tweets = len(data)
print("N tweets:", n_tweets)

dataSentences = split_tweets_to_sentences(data)
n_sentences = len(dataSentences)
print("N sentences:", n_sentences)

dataSentences.head()

N tweets: 5000
N sentences: 11784


Unnamed: 0,sentence,Intent,Cause,Effect,Causal association,tokenized,bio_tags
0,"tonight , I learned my older girl will back he...",,,,0,"[tonight, ,, I, learned, my, older, girl, will...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O]"
1,Fiercely .,,,,0,"[Fiercely, .]","[O, O]"
2,#impressive #bigsister #type1 #type1times2,,,,0,"[#impressive, #bigsister, #type1, #type1times2]","[O, O, O, O]"
3,USER USER I knew diabetes and fibromyalgia wer...,joke,,,0,"[USER, USER, I, knew, diabetes, and, fibromyal...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O]"
4,:face_with_rolling_eyes:,joke,,,0,[:face_with_rolling_eyes:],[O]


### Filter out negations, jokes and questions, and keep only sentences with at least 3 tokens

In [4]:
n_before = len(dataSentences)
print("N sentences before filtering: ", n_before)

# sentences whose intent contains "neg", "joke" or "q" (regex alternation on the Intent string)
has_excluded_intent = dataSentences["Intent"].str.contains("neg|joke|q")
# sentences whose token list has at least 3 entries
is_long_enough = dataSentences["tokenized"].map(len) >= 3

dataSentFiltered = dataSentences[~has_excluded_intent & is_long_enough]
n_after = len(dataSentFiltered)
print("N sentences after filtering: ", n_after)

dataSentFiltered.head()


N sentences before filtering:  11784
N sentences after filtering:  8835


Unnamed: 0,sentence,Intent,Cause,Effect,Causal association,tokenized,bio_tags
0,"tonight , I learned my older girl will back he...",,,,0,"[tonight, ,, I, learned, my, older, girl, will...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O]"
2,#impressive #bigsister #type1 #type1times2,,,,0,"[#impressive, #bigsister, #type1, #type1times2]","[O, O, O, O]"
5,:down_arrow: :down_arrow: :down_arrow: THIS :d...,,,,0,"[:down_arrow:, :down_arrow:, :down_arrow:, THI...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O]"
6,I 'm a trans woman .,,,,0,"[I, 'm, a, trans, woman, .]","[O, O, O, O, O, O]"
7,"Both of us could use a world where "" brave and...",,,,0,"[Both, of, us, could, use, a, world, where, "",...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."


### only work on cause-effect tweets?

In [5]:
# only take sentences with cause and effect
#trainingData = dataSentFiltered[dataSentFiltered["Causal association"] == 1]
#trainingData.shape

# not in the multitask setting

### Create training, validation, test sets

In [6]:
####################### Stratified splits ####################
LABEL_COLUMN = "Causal association"

# The full filtered set is used; subsample here (e.g. .sample(n=300, random_state=0)) only for quick experiments.
trainingData = dataSentFiltered
# 80 % / 20 % train+val / test, then 80 % / 20 % train / val; both splits keep the class ratio.
train, test = train_test_split(trainingData, test_size=0.2, stratify=trainingData[[LABEL_COLUMN]], random_state=0)
train, val = train_test_split(train, test_size=0.2, stratify=train[[LABEL_COLUMN]], random_state=0)


def _class_fractions(frame):
    """
    Fraction of rows per class label, ordered by label (class 0 first, class 1 second).

    value_counts() orders by frequency, so without sort_index() the positions would only
    mean (class 0, class 1) as long as class 0 happens to be the majority class.
    """
    return frame[LABEL_COLUMN].value_counts(normalize=True).sort_index()


data_count_info = _class_fractions(trainingData)
train_count_info = _class_fractions(train)
val_count_info = _class_fractions(val)
test_count_info = _class_fractions(test)

# for class-imbalanced dataset, the class weight for a ith class
# to be specified for balancing in the loss function is given by:
# weight[i] = num_samples / (num_classes * num_samples[i])
# since train_count_info obtained above has fraction of samples
# for ith class, hence the corresponding weight calculation is:
class_weight = (1/train_count_info)/len(train_count_info)

for split_name, split_frame, split_info in (("All", trainingData, data_count_info)
                                          , ("Train", train, train_count_info)
                                          , ("Val", val, val_count_info)
                                          , ("Test", test, test_count_info)):
    print("{}: \tCount = {}, % of 0 = {}, % of 1 = {}".format(
        split_name, len(split_frame[LABEL_COLUMN]), *split_info.round(4).to_list()))
print("Balancing class wts: for 0 = {}, for 1 = {}".format(
    *class_weight.round(4).to_list()))

All: 	Count = 8835, % of 0 = 0.8827, % of 1 = 0.1173
Train: 	Count = 5654, % of 0 = 0.8827, % of 1 = 0.1173
Val: 	Count = 1414, % of 0 = 0.8826, % of 1 = 0.1174
Test: 	Count = 1767, % of 0 = 0.8829, % of 1 = 0.1171
Balancing class wts: for 0 = 0.5664, for 1 = 4.264


In [7]:
# Absolute number of sentences per class in the full set that was split into train / val / test.
trainingData["Causal association"].value_counts()

0.0    7799
1.0    1036
Name: Causal association, dtype: int64

In [8]:
# Transform labels + encodings into Pytorch DataSet object (including __len__, __getitem__)
class TweetDataSet(torch.utils.data.Dataset):
    """
        Dataset that serves, per sentence, the tokenizer encodings together with the
        sentence-level label and the token-level BIO tags.

        Parameters:
        - text: list of sentences; tokens are separated by a single space
        - labels: list with one sentence-level label per sentence
        - bio_tags: list with one list of BIO tags per sentence (one tag per space-separated token)
        - tokenizer: callable returning "input_ids", "attention_mask" and "token_type_ids" for a list
                     of sentences, and offering tokenize(word) for a single token
    """
    def __init__(self, text, labels, bio_tags, tokenizer):
        self.text = text
        self.labels = labels
        self.tokenizer = tokenizer
        self.bio_tags = bio_tags
        self.tag2id = {label: idx for idx, label in enumerate(["O", "B-C", "I-C", "B-E", "I-E"])}
        self.tag2id[-100] = -100 # -100 marks positions without a tag; it maps to itself
        self.id2tag = {tag_id: tag for tag, tag_id in self.tag2id.items()}
        # Encode the whole corpus once. Doing this inside __getitem__ re-tokenized every
        # sentence of the dataset for each single item that was requested.
        # padding=True pads every sentence to the longest one, so all items have the same length.
        self.encodings = self.tokenizer(self.text, padding=True, truncation=True, return_token_type_ids=True)

    def __getitem__(self, idx):
        """Return the tensors of sentence `idx`: input_ids, attention_mask, token_type_ids, labels, bio_tags."""
        ids = self.encodings["input_ids"][idx]
        mask = self.encodings["attention_mask"][idx]
        token_type_ids = self.encodings["token_type_ids"][idx]
        bio_tags_extended = self.extend_tags(self.text[idx], self.bio_tags[idx], ids)
        # No parentheses around condition and message: `assert(cond, msg)` tests a non-empty
        # tuple, which is always true, so the check could never fail.
        assert len(ids) == len(bio_tags_extended), "token ids and BIO tags lengths do not match!"
        return {
                "input_ids" : torch.tensor(ids, dtype=torch.long)
              , "attention_mask" : torch.tensor(mask, dtype=torch.long)
              , "token_type_ids" : torch.tensor(token_type_ids, dtype=torch.long)
              , "labels" : torch.tensor(self.labels[idx], dtype=torch.long)
              , "bio_tags" : torch.tensor([self.tag2id[bio_tag] for bio_tag in bio_tags_extended], dtype=torch.long)
        }

    def __len__(self):
        return len(self.labels)


    def extend_tags(self, tokens_old, tags_old, ids_tokenized_padded):
        """
            Expand the per-token BIO tags of one sentence to the length of its padded token ids.

            Each token has a BIO tag label.
            However the tokenizer splits tokens into subwords. How to label those subwords?

            Option 1:
            ---------

            Give each subword the label of the first subword, only replacing "B" by "I".
            Ex.
            #lowbloodsugar => '#low@@', 'blood@@', 'sugar@@'
               "B-C"       =>   "B-C" ,   "I-C"  ,   "I-C"

            Option 2 (implemented):
            ---------

            From : https://huggingface.co/transformers/custom_datasets.html#token-classification-with-w-nut-emerging-entities
            Only the first subword of a token keeps the tag of the token, all further subwords get the
            label -100. Ex.: if the label of @HuggingFace is 3 and the token is split into
            ['@', 'hugging', '##face'], the labels become [3, -100, -100].

            Parameters:
            - tokens_old: the sentence as a string with space-separated tokens
            - tags_old: list of BIO tags, one per space-separated token
            - ids_tokenized_padded: list of token ids of the sentence, including special and padding ids

            Returns:
            List of tags: -100 for the start token, the expanded tags, -100 for the end token and
            one -100 per padding id.
        """
        tags = [-100] # add for start token <CLS>
        for token_old, tag in zip(tokens_old.split(" "), tags_old):
            n_sub_tokens = len(self.tokenizer.tokenize(token_old))
            if n_sub_tokens > 0:
                tags.append(tag) # the first subword carries the tag
                tags.extend([-100] * (n_sub_tokens - 1)) # remaining subwords carry no tag

        tags.append(-100) # end of sentence token

        # append -100 for all padded elements; the pad id is read from the tokenizer and
        # falls back to 1 (the id that was hard-coded here) if the tokenizer does not expose one
        pad_token_id = getattr(self.tokenizer, "pad_token_id", None)
        if pad_token_id is None:
            pad_token_id = 1
        padded_elements = ids_tokenized_padded.count(pad_token_id)
        tags.extend([-100]*padded_elements)

        return tags
        
        
    
tokenizer = AutoTokenizer.from_pretrained("vinai/bertweet-base")


def _to_dataset(frame):
    """Wrap the sentence, label and BIO-tag columns of one split in a TweetDataSet."""
    return TweetDataSet(frame["sentence"].tolist()
                      , frame["Causal association"].tolist()
                      , frame["bio_tags"].tolist()
                      , tokenizer)


train_dataset = _to_dataset(train)
val_dataset = _to_dataset(val)
test_dataset = _to_dataset(test)
for split_dataset in (train_dataset, val_dataset, test_dataset):
    print(len(split_dataset))

# put data to batches
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
validation_loader = DataLoader(val_dataset, batch_size=8, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=8, shuffle=True)


  assert(len(ids[idx]) == len(bio_tags_extended), "token ids and BIO tags lengths do not match!")
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


5654
1414
1767


In [9]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def compute_metrics(pred, labels):
    """
        Score predictions against the true labels.

        Precision, recall and F1 are macro-averaged: they are calculated for each label and then
        averaged without weighting by support, so on this unbalanced dataset the minority class
        counts as much as the majority class. Accuracy is computed over all samples.

        Returns a dict with the keys 'accuracy', 'f1', 'precision' and 'recall'.
    """
    macro_precision, macro_recall, macro_f1, _support = precision_recall_fscore_support(
        labels, pred, average='macro')
    scores = dict(accuracy=accuracy_score(labels, pred))
    scores.update(f1=macro_f1, precision=macro_precision, recall=macro_recall)
    return scores



class CausalMultiTask(torch.nn.Module):
    """Multi-task model on top of one shared pretrained encoder.

    Two heads share the encoder and the first projection layer (`linear1`):
      * CLS: sentence-level classification (2 classes) from the encoder's
             pooled output
      * NER: token-level BIO tagging (5 tags) from the encoder's per-token
             output

    `forward` returns raw logits for both heads; no softmax is applied.
    """
    def __init__(self, pretrained_name="vinai/bertweet-base"):
        """
        Args:
            pretrained_name: Hugging Face checkpoint to load as the encoder.
        """
        super(CausalMultiTask, self).__init__()
        self.num_labels_NER = 5 # B-C, I-C, B-E, I-E, O
        self.num_labels_CLS = 2 # 0, 1
        # vinai/bertweet-base is a RoBERTa checkpoint. Loading it through
        # BertModel leaves every "roberta.*" weight unused (see the loader
        # warning), i.e. the encoder ends up randomly initialised. AutoModel
        # selects the architecture from the checkpoint's own config instead.
        # The attribute keeps the name `bert` because other cells refer to
        # `model.bert`.
        # NOTE(review): RoBERTa offsets position ids by the padding index, so
        # the usable sequence length is slightly below
        # config.max_position_embeddings - confirm the dataset's max_length.
        # NOTE(review): the pooling layer may not be part of the checkpoint; in
        # that case it is newly initialised - check the loader warnings.
        self.bert = transformers.AutoModel.from_pretrained(pretrained_name)
        hidden_size = self.bert.config.hidden_size  # 768 for the base model
        self.dropout = torch.nn.Dropout(0.3)
        self.linear1 = torch.nn.Linear(hidden_size, 256)
        self.linear_NER = torch.nn.Linear(256, self.num_labels_NER)
        self.linear_CLS = torch.nn.Linear(256, self.num_labels_CLS)

    def forward(self, input_ids, attention_mask, token_type_ids):
        """Return `(cls_logits, ner_logits)` for one batch.

        The encoder is called with return_dict=False; element 0 of its output
        is used as the per-token representation and element 1 as the pooled
        sentence representation.
        """
        encoder_out = self.bert(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, return_dict=False)
        output_seq, output_cls = encoder_out[0], encoder_out[1]

        # classification head: dropout -> shared projection -> dropout -> 2 logits
        cls_hidden = self.linear1(self.dropout(output_cls))
        logits_cls = self.linear_CLS(self.dropout(cls_hidden))

        # named entity recognition head: same pipeline applied to every token
        ner_hidden = self.linear1(self.dropout(output_seq))
        logits_ner = self.linear_NER(self.dropout(ner_hidden))

        return logits_cls, logits_ner



### Model parameters

In [10]:
## Model parameters
batchsize_train = 16  # NOTE(review): not referenced in the visible cells; train_loader is built with a literal batch_size=16 - keep the two in sync
lr = 1e-3             # initial learning rate handed to the optimizer
adam_eps = 1e-8       # epsilon handed to the optimizer
epochs = 35           # number of passes over train_loader
num_warmup_steps = 0  # warm-up steps of the linear schedule (0 = no warm-up)
num_training_steps = len(train_loader)*epochs  # batches per epoch times epochs; the scheduler is stepped once per batch

In [11]:

# Run on the GPU when one is visible, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

model = CausalMultiTask()
model.to(device)

# The shared encoder is frozen: only the layers outside `model.bert` receive
# gradient updates.
# TODO: Check in Multi-task setting, if underlying BERT parameters shall
# be updated too to benefit from common training
for param in model.bert.parameters():
    param.requires_grad = False

optim = AdamW(model.parameters(), lr=lr, eps=adam_eps)
scheduler = get_linear_schedule_with_warmup(
    optim,
    num_warmup_steps=num_warmup_steps,
    num_training_steps=num_training_steps,
)

# Token-level loss; positions labelled -100 (subwords / special tokens) are skipped.
loss_fn_ner = CrossEntropyLoss(ignore_index=-100)

# Sentence-level loss, re-weighted per class so that the class with fewer
# examples is penalised more.
loss_fn_cls = CrossEntropyLoss(
    torch.tensor(class_weight.to_list()).to(device)
)


You are using a model of type roberta to instantiate a model of type bert. This is not supported for all configurations of models and can yield errors.
Some weights of the model checkpoint at vinai/bertweet-base were not used when initializing BertModel: ['roberta.encoder.layer.5.output.dense.weight', 'roberta.encoder.layer.8.attention.self.value.bias', 'roberta.encoder.layer.1.output.dense.weight', 'roberta.encoder.layer.5.intermediate.dense.weight', 'roberta.embeddings.position_embeddings.weight', 'roberta.encoder.layer.1.attention.output.dense.weight', 'roberta.encoder.layer.7.attention.self.query.bias', 'roberta.encoder.layer.0.attention.self.value.bias', 'roberta.encoder.layer.1.attention.output.dense.bias', 'roberta.encoder.layer.0.intermediate.dense.bias', 'roberta.encoder.layer.4.attention.self.value.weight', 'roberta.encoder.layer.10.intermediate.dense.weight', 'roberta.encoder.layer.8.attention.output.LayerNorm.bias', 'lm_head.bias', 'roberta.encoder.layer.3.output.LayerNorm.

### Training

In [12]:
# Learning rate after every epoch, kept for plotting
learning_rate = []

N_bio_tags = 5 # "O", "B-C", "I-C", "B-E", "I-E"


def _run_batch(batch):
    """Forward one DataLoader batch through `model` and score it.

    Tensors are read from the batch by key, so the result does not depend on
    the order in which the dataset builds its dict.

    Returns:
        (loss, loss_cls, loss_ner, cls_pred, cls_gold, ner_pred, ner_gold)
        The three losses are tensors (loss = loss_cls + loss_ner); the
        pred / gold entries are 1-D numpy arrays of label ids.
    """
    input_ids = batch['input_ids'].to(device)
    attention_mask = batch['attention_mask'].to(device)
    token_type_ids = batch['token_type_ids'].to(device)
    labels = batch['labels'].to(device)
    bio_tags = batch['bio_tags'].to(device)

    logits_cls, logits_ner = model(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)

    ### CLS loss
    loss_cls = loss_fn_cls(logits_cls, labels)

    ### NER loss
    # similar to the class RobertaForToken classification in transformers: https://github.com/huggingface/transformers/blob/master/src/transformers/models/roberta/modeling_roberta.py
    # Padding positions are dropped via the attention mask; tags equal to -100
    # among the remaining positions are skipped by the loss function itself.
    active_loss = attention_mask.view(-1) == 1
    active_logits = logits_ner.view(-1, N_bio_tags)[active_loss]
    active_tags = bio_tags.view(-1)[active_loss]
    loss_ner = loss_fn_ner(active_logits, active_tags)

    # combine binary classification loss and named entity recognition loss
    loss = loss_cls + loss_ner

    ### CLS predictions
    cls_pred = np.argmax(logits_cls.detach().to('cpu').numpy(), axis=1).flatten()
    cls_gold = labels.to('cpu').numpy().flatten()

    ### NER predictions: only positions whose tag is not -100 are scored
    tags_ids = bio_tags.to('cpu').numpy()
    tags_mask = tags_ids != -100
    ner_pred = np.argmax(logits_ner.detach().to('cpu').numpy(), axis=2)[tags_mask]
    ner_gold = tags_ids[tags_mask]

    return loss, loss_cls, loss_ner, cls_pred, cls_gold, ner_pred, ner_gold


def _report_epoch(split, losses, cls_pred, cls_gold, ner_pred, ner_gold):
    """Print the mean loss and the CLS / NER metrics of one split for one epoch.

    The pred / gold arguments are lists of per-batch arrays. They are
    concatenated and scored once, because macro precision / recall / F1
    computed per mini-batch and then averaged is distorted by batches in
    which some class does not occur.

    Returns:
        (metrics_cls, metrics_ner) as produced by `compute_metrics`.
    """
    metrics_cls = compute_metrics(np.concatenate(cls_pred), np.concatenate(cls_gold))
    metrics_ner = compute_metrics(np.concatenate(ner_pred), np.concatenate(ner_gold))

    print(F'\n\t{split} Loss: {np.mean(losses)}')
    print(F'\n\t{split} cls acc: {metrics_cls["accuracy"]}')
    print(F'\n\t{split} cls prec: {metrics_cls["precision"]}')
    print(F'\n\t{split} cls rec: {metrics_cls["recall"]}')
    print(F'\n\t{split} cls f1: {metrics_cls["f1"]}')
    print(F'\n--\n\t{split} ner acc: {metrics_ner["accuracy"]}')
    print(F'\n\t{split} ner prec: {metrics_ner["precision"]}')
    print(F'\n\t{split} ner rec: {metrics_ner["recall"]}')
    print(F'\n\t{split} ner f1: {metrics_ner["f1"]}')
    return metrics_cls, metrics_ner


for epoch in trange(1, epochs+1, desc='Epoch'):
    print("<" + "="*22 + F" Epoch {epoch} "+ "="*22 + ">")

    ############ Training ######################
    train_loss = []
    train_cls_pred, train_cls_gold = [], []
    train_ner_pred, train_ner_gold = [], []

    model.train() # set model to training mode (validation below switches it to eval)
    train_bar = tqdm(train_loader)
    for batch in train_bar:
        optim.zero_grad() # gradients get accumulated by default -> clear previous accumulated gradients
        loss, loss_cls, loss_ner, cls_pred, cls_gold, ner_pred, ner_gold = _run_batch(batch)
        loss.backward()  # backward pass
        optim.step()     # update parameters using the computed gradient
        scheduler.step() # update learning rate scheduler

        train_loss.append(loss.item())
        train_cls_pred.append(cls_pred)
        train_cls_gold.append(cls_gold)
        train_ner_pred.append(ner_pred)
        train_ner_gold.append(ner_gold)

        # Show the running losses on the progress bar instead of printing
        # three lines per step.
        train_bar.set_postfix(loss=loss.item(), loss_cls=loss_cls.item(), loss_ner=loss_ner.item())

    _report_epoch('Training', train_loss, train_cls_pred, train_cls_gold, train_ner_pred, train_ner_gold)

    # store the current learning rate
    for param_group in optim.param_groups:
        print("\n\tCurrent Learning rate: ", param_group['lr'])
        learning_rate.append(param_group['lr'])

    ############# Validation ################
    val_loss = []
    val_cls_pred, val_cls_gold = [], []
    val_ner_pred, val_ner_gold = [], []

    model.eval() # put model in evaluation mode for validation set
    with torch.no_grad(): # do not compute or store gradients -> saves memory + speeds up validation
        for batch in tqdm(validation_loader):
            v_loss, v_loss_cls, v_loss_ner, cls_pred, cls_gold, ner_pred, ner_gold = _run_batch(batch)
            val_loss.append(v_loss.item())
            val_cls_pred.append(cls_pred)
            val_cls_gold.append(cls_gold)
            val_ner_pred.append(ner_pred)
            val_ner_gold.append(ner_gold)

    _report_epoch('Validation', val_loss, val_cls_pred, val_cls_gold, val_ner_pred, val_ner_gold)


Epoch:   0%|          | 0/35 [00:00<?, ?it/s]
  0%|          | 0/354 [00:00<?, ?it/s][A



  , "labels" : torch.tensor(self.labels[idx], dtype=torch.long)
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))

  0%|          | 1/354 [00:06<36:12,  6.15s/it][A

	loss_cls: tensor(0.7495, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(1.4751, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(2.2245, device='cuda:0', grad_fn=<AddBackward0>)



  1%|          | 2/354 [00:11<33:22,  5.69s/it][A

	loss_cls: tensor(0.8276, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2575, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0851, device='cuda:0', grad_fn=<AddBackward0>)



  1%|          | 3/354 [00:16<32:33,  5.57s/it][A

	loss_cls: tensor(0.6129, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2296, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8425, device='cuda:0', grad_fn=<AddBackward0>)



  1%|          | 4/354 [00:22<31:52,  5.47s/it][A

	loss_cls: tensor(0.7421, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0007, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7428, device='cuda:0', grad_fn=<AddBackward0>)



  1%|▏         | 5/354 [00:27<31:40,  5.44s/it][A

	loss_cls: tensor(0.6373, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2101, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8474, device='cuda:0', grad_fn=<AddBackward0>)



  2%|▏         | 6/354 [00:32<31:20,  5.41s/it][A

	loss_cls: tensor(0.9098, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4578, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.3676, device='cuda:0', grad_fn=<AddBackward0>)



  2%|▏         | 7/354 [00:38<31:06,  5.38s/it][A

	loss_cls: tensor(0.5808, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2566, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8374, device='cuda:0', grad_fn=<AddBackward0>)



  2%|▏         | 8/354 [00:43<31:06,  5.40s/it][A

	loss_cls: tensor(0.9024, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5688, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.4712, device='cuda:0', grad_fn=<AddBackward0>)



  3%|▎         | 9/354 [00:49<30:57,  5.38s/it][A

	loss_cls: tensor(0.7934, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4565, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2498, device='cuda:0', grad_fn=<AddBackward0>)



  3%|▎         | 10/354 [00:54<30:58,  5.40s/it][A

	loss_cls: tensor(0.8859, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.6427, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.5286, device='cuda:0', grad_fn=<AddBackward0>)



  3%|▎         | 11/354 [00:59<30:46,  5.38s/it][A

	loss_cls: tensor(0.6786, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.9874, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.6660, device='cuda:0', grad_fn=<AddBackward0>)



  3%|▎         | 12/354 [01:05<30:46,  5.40s/it][A

	loss_cls: tensor(0.5987, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(1.2956, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.8942, device='cuda:0', grad_fn=<AddBackward0>)



  4%|▎         | 13/354 [01:10<30:38,  5.39s/it][A

	loss_cls: tensor(0.9054, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(5.3948e-05, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9054, device='cuda:0', grad_fn=<AddBackward0>)



  4%|▍         | 14/354 [01:16<30:30,  5.38s/it][A

	loss_cls: tensor(1.0003, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0001, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0004, device='cuda:0', grad_fn=<AddBackward0>)



  4%|▍         | 15/354 [01:21<30:31,  5.40s/it][A

	loss_cls: tensor(0.6628, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.6162, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2790, device='cuda:0', grad_fn=<AddBackward0>)



  5%|▍         | 16/354 [01:26<30:27,  5.41s/it][A

	loss_cls: tensor(0.7077, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2015, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9092, device='cuda:0', grad_fn=<AddBackward0>)



  5%|▍         | 17/354 [01:32<30:31,  5.43s/it][A

	loss_cls: tensor(0.7144, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.6383, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.3527, device='cuda:0', grad_fn=<AddBackward0>)



  5%|▌         | 18/354 [01:37<30:21,  5.42s/it][A

	loss_cls: tensor(0.8054, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.9815, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.7868, device='cuda:0', grad_fn=<AddBackward0>)



  5%|▌         | 19/354 [01:43<30:22,  5.44s/it][A

	loss_cls: tensor(0.3018, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0021, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.3038, device='cuda:0', grad_fn=<AddBackward0>)



  6%|▌         | 20/354 [01:48<30:10,  5.42s/it][A

	loss_cls: tensor(0.8126, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3070, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1197, device='cuda:0', grad_fn=<AddBackward0>)



  6%|▌         | 21/354 [01:54<30:00,  5.41s/it][A

	loss_cls: tensor(1.0046, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.7650, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.7696, device='cuda:0', grad_fn=<AddBackward0>)



  6%|▌         | 22/354 [01:59<30:03,  5.43s/it][A

	loss_cls: tensor(0.9448, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3865, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.3313, device='cuda:0', grad_fn=<AddBackward0>)



  6%|▋         | 23/354 [02:04<29:55,  5.42s/it][A

	loss_cls: tensor(0.3370, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1036, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4406, device='cuda:0', grad_fn=<AddBackward0>)



  7%|▋         | 24/354 [02:10<29:55,  5.44s/it][A

	loss_cls: tensor(0.5037, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1387, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6424, device='cuda:0', grad_fn=<AddBackward0>)



  7%|▋         | 25/354 [02:15<29:44,  5.42s/it][A

	loss_cls: tensor(0.6475, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3452, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9928, device='cuda:0', grad_fn=<AddBackward0>)



  7%|▋         | 26/354 [02:21<29:39,  5.43s/it][A

	loss_cls: tensor(0.6672, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2068, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8740, device='cuda:0', grad_fn=<AddBackward0>)



  8%|▊         | 27/354 [02:26<29:29,  5.41s/it][A

	loss_cls: tensor(0.8163, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3172, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1335, device='cuda:0', grad_fn=<AddBackward0>)



  8%|▊         | 28/354 [02:32<29:23,  5.41s/it][A

	loss_cls: tensor(0.7725, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2147, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9871, device='cuda:0', grad_fn=<AddBackward0>)



  8%|▊         | 29/354 [02:37<29:13,  5.40s/it][A

	loss_cls: tensor(0.6682, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3788, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0470, device='cuda:0', grad_fn=<AddBackward0>)



  8%|▊         | 30/354 [02:42<29:04,  5.38s/it][A

	loss_cls: tensor(0.6800, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3026, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9826, device='cuda:0', grad_fn=<AddBackward0>)



  9%|▉         | 31/354 [02:48<29:01,  5.39s/it][A

	loss_cls: tensor(0.8859, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1449, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0307, device='cuda:0', grad_fn=<AddBackward0>)



  9%|▉         | 32/354 [02:53<28:46,  5.36s/it][A

	loss_cls: tensor(0.6708, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2121, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8829, device='cuda:0', grad_fn=<AddBackward0>)



  9%|▉         | 33/354 [02:58<28:44,  5.37s/it][A

	loss_cls: tensor(0.7120, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4142, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1262, device='cuda:0', grad_fn=<AddBackward0>)



 10%|▉         | 34/354 [03:04<28:34,  5.36s/it][A

	loss_cls: tensor(0.9866, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0813, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0679, device='cuda:0', grad_fn=<AddBackward0>)



 10%|▉         | 35/354 [03:09<28:36,  5.38s/it][A

	loss_cls: tensor(0.5005, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5760, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0765, device='cuda:0', grad_fn=<AddBackward0>)



 10%|█         | 36/354 [03:14<28:29,  5.38s/it][A

	loss_cls: tensor(0.6093, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3437, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9530, device='cuda:0', grad_fn=<AddBackward0>)



 10%|█         | 37/354 [03:20<28:19,  5.36s/it][A

	loss_cls: tensor(0.7113, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3006, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0119, device='cuda:0', grad_fn=<AddBackward0>)



 11%|█         | 38/354 [03:25<28:18,  5.37s/it][A

	loss_cls: tensor(0.6358, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4253, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0611, device='cuda:0', grad_fn=<AddBackward0>)



 11%|█         | 39/354 [03:30<28:05,  5.35s/it][A

	loss_cls: tensor(0.6621, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1904, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8525, device='cuda:0', grad_fn=<AddBackward0>)



 11%|█▏        | 40/354 [03:36<28:02,  5.36s/it][A

	loss_cls: tensor(0.7260, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0591, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7852, device='cuda:0', grad_fn=<AddBackward0>)



 12%|█▏        | 41/354 [03:41<27:49,  5.34s/it][A

	loss_cls: tensor(0.7129, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2750, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9879, device='cuda:0', grad_fn=<AddBackward0>)



 12%|█▏        | 42/354 [03:47<27:51,  5.36s/it][A

	loss_cls: tensor(0.6026, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2674, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8699, device='cuda:0', grad_fn=<AddBackward0>)



 12%|█▏        | 43/354 [03:52<27:42,  5.35s/it][A

	loss_cls: tensor(0.6770, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5040, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1811, device='cuda:0', grad_fn=<AddBackward0>)



 12%|█▏        | 44/354 [03:57<27:33,  5.33s/it][A

	loss_cls: tensor(0.6334, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3205, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9539, device='cuda:0', grad_fn=<AddBackward0>)



 13%|█▎        | 45/354 [04:03<27:32,  5.35s/it][A

	loss_cls: tensor(0.8556, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2979, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1536, device='cuda:0', grad_fn=<AddBackward0>)



 13%|█▎        | 46/354 [04:08<27:22,  5.33s/it][A

	loss_cls: tensor(0.5437, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1256, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6694, device='cuda:0', grad_fn=<AddBackward0>)



 13%|█▎        | 47/354 [04:13<27:19,  5.34s/it][A

	loss_cls: tensor(0.5989, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2785, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8773, device='cuda:0', grad_fn=<AddBackward0>)



 14%|█▎        | 48/354 [04:19<27:11,  5.33s/it][A

	loss_cls: tensor(0.7211, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2794, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0005, device='cuda:0', grad_fn=<AddBackward0>)



 14%|█▍        | 49/354 [04:24<27:13,  5.36s/it][A

	loss_cls: tensor(0.4965, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5244, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0210, device='cuda:0', grad_fn=<AddBackward0>)



 14%|█▍        | 50/354 [04:29<27:03,  5.34s/it][A

	loss_cls: tensor(0.5864, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2984, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8848, device='cuda:0', grad_fn=<AddBackward0>)



 14%|█▍        | 51/354 [04:35<26:55,  5.33s/it][A

	loss_cls: tensor(0.6303, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2711, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9014, device='cuda:0', grad_fn=<AddBackward0>)



 15%|█▍        | 52/354 [04:40<26:58,  5.36s/it][A

	loss_cls: tensor(0.4752, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2382, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7134, device='cuda:0', grad_fn=<AddBackward0>)



 15%|█▍        | 53/354 [04:45<26:47,  5.34s/it][A

	loss_cls: tensor(0.5666, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2431, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8097, device='cuda:0', grad_fn=<AddBackward0>)



 15%|█▌        | 54/354 [04:51<26:42,  5.34s/it][A

	loss_cls: tensor(0.8538, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1464, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0002, device='cuda:0', grad_fn=<AddBackward0>)



 16%|█▌        | 55/354 [04:56<26:32,  5.33s/it][A

	loss_cls: tensor(0.5981, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2865, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8846, device='cuda:0', grad_fn=<AddBackward0>)



 16%|█▌        | 56/354 [05:01<26:31,  5.34s/it][A

	loss_cls: tensor(0.6996, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5486, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2482, device='cuda:0', grad_fn=<AddBackward0>)



 16%|█▌        | 57/354 [05:07<26:22,  5.33s/it][A

	loss_cls: tensor(0.7614, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2054, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9668, device='cuda:0', grad_fn=<AddBackward0>)



 16%|█▋        | 58/354 [05:12<26:13,  5.31s/it][A

	loss_cls: tensor(0.5776, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1184, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6960, device='cuda:0', grad_fn=<AddBackward0>)



 17%|█▋        | 59/354 [05:17<26:13,  5.33s/it][A

	loss_cls: tensor(0.7999, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0836, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8835, device='cuda:0', grad_fn=<AddBackward0>)



 17%|█▋        | 60/354 [05:22<26:00,  5.31s/it][A

	loss_cls: tensor(0.7664, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1199, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8864, device='cuda:0', grad_fn=<AddBackward0>)



 17%|█▋        | 61/354 [05:28<25:58,  5.32s/it][A

	loss_cls: tensor(0.7180, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3173, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0353, device='cuda:0', grad_fn=<AddBackward0>)



 18%|█▊        | 62/354 [05:33<25:48,  5.30s/it][A

	loss_cls: tensor(0.5074, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1935, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7009, device='cuda:0', grad_fn=<AddBackward0>)



 18%|█▊        | 63/354 [05:38<25:48,  5.32s/it][A

	loss_cls: tensor(0.6553, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4545, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1098, device='cuda:0', grad_fn=<AddBackward0>)



 18%|█▊        | 64/354 [05:44<25:39,  5.31s/it][A

	loss_cls: tensor(0.7447, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2980, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0426, device='cuda:0', grad_fn=<AddBackward0>)



 18%|█▊        | 65/354 [05:49<25:33,  5.31s/it][A

	loss_cls: tensor(0.5667, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1754, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7422, device='cuda:0', grad_fn=<AddBackward0>)



 19%|█▊        | 66/354 [05:54<25:32,  5.32s/it][A

	loss_cls: tensor(0.5953, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2113, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8066, device='cuda:0', grad_fn=<AddBackward0>)



 19%|█▉        | 67/354 [06:00<25:23,  5.31s/it][A

	loss_cls: tensor(0.6271, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1530, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7801, device='cuda:0', grad_fn=<AddBackward0>)



 19%|█▉        | 68/354 [06:05<25:23,  5.33s/it][A

	loss_cls: tensor(0.6412, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2226, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8637, device='cuda:0', grad_fn=<AddBackward0>)



 19%|█▉        | 69/354 [06:10<25:16,  5.32s/it][A

	loss_cls: tensor(0.6264, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3413, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9677, device='cuda:0', grad_fn=<AddBackward0>)



 20%|█▉        | 70/354 [06:16<25:13,  5.33s/it][A

	loss_cls: tensor(0.6547, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3065, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9612, device='cuda:0', grad_fn=<AddBackward0>)



 20%|██        | 71/354 [06:21<25:04,  5.32s/it][A

	loss_cls: tensor(0.4625, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3056, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7681, device='cuda:0', grad_fn=<AddBackward0>)



 20%|██        | 72/354 [06:26<24:55,  5.30s/it][A

	loss_cls: tensor(0.6428, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1501, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7929, device='cuda:0', grad_fn=<AddBackward0>)



 21%|██        | 73/354 [06:32<25:07,  5.37s/it][A

	loss_cls: tensor(0.5255, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.6966, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2220, device='cuda:0', grad_fn=<AddBackward0>)



 21%|██        | 74/354 [06:37<25:07,  5.38s/it][A

	loss_cls: tensor(0.6676, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3530, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0206, device='cuda:0', grad_fn=<AddBackward0>)



 21%|██        | 75/354 [06:43<25:13,  5.42s/it][A

	loss_cls: tensor(0.6339, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1501, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7840, device='cuda:0', grad_fn=<AddBackward0>)



 21%|██▏       | 76/354 [06:48<25:09,  5.43s/it][A

	loss_cls: tensor(0.7481, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2316, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9797, device='cuda:0', grad_fn=<AddBackward0>)



 22%|██▏       | 77/354 [06:54<25:09,  5.45s/it][A

	loss_cls: tensor(0.7655, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3341, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0996, device='cuda:0', grad_fn=<AddBackward0>)



 22%|██▏       | 78/354 [06:59<25:01,  5.44s/it][A

	loss_cls: tensor(0.7225, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5368, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2593, device='cuda:0', grad_fn=<AddBackward0>)



 22%|██▏       | 79/354 [07:04<24:53,  5.43s/it][A

	loss_cls: tensor(0.7234, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1741, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8975, device='cuda:0', grad_fn=<AddBackward0>)



 23%|██▎       | 80/354 [07:10<24:53,  5.45s/it][A

	loss_cls: tensor(0.6592, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2651, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9243, device='cuda:0', grad_fn=<AddBackward0>)



 23%|██▎       | 81/354 [07:15<24:44,  5.44s/it][A

	loss_cls: tensor(0.6196, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1760, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7957, device='cuda:0', grad_fn=<AddBackward0>)



 23%|██▎       | 82/354 [07:21<24:47,  5.47s/it][A

	loss_cls: tensor(0.5744, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4531, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0274, device='cuda:0', grad_fn=<AddBackward0>)



 23%|██▎       | 83/354 [07:26<24:40,  5.46s/it][A

	loss_cls: tensor(0.8798, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1544, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0343, device='cuda:0', grad_fn=<AddBackward0>)



 24%|██▎       | 84/354 [07:32<24:38,  5.48s/it][A

	loss_cls: tensor(0.4206, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2531, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6738, device='cuda:0', grad_fn=<AddBackward0>)



 24%|██▍       | 85/354 [07:37<24:28,  5.46s/it][A

	loss_cls: tensor(0.6867, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2614, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9481, device='cuda:0', grad_fn=<AddBackward0>)



 24%|██▍       | 86/354 [07:43<24:20,  5.45s/it][A

	loss_cls: tensor(0.5943, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3548, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9491, device='cuda:0', grad_fn=<AddBackward0>)



 25%|██▍       | 87/354 [07:48<24:18,  5.46s/it][A

	loss_cls: tensor(0.5787, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4111, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9898, device='cuda:0', grad_fn=<AddBackward0>)



 25%|██▍       | 88/354 [07:54<24:10,  5.45s/it][A

	loss_cls: tensor(0.6777, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2032, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8809, device='cuda:0', grad_fn=<AddBackward0>)



 25%|██▌       | 89/354 [07:59<24:10,  5.47s/it][A

	loss_cls: tensor(0.7235, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1106, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8341, device='cuda:0', grad_fn=<AddBackward0>)



 25%|██▌       | 90/354 [08:05<24:02,  5.46s/it][A

	loss_cls: tensor(0.4190, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3531, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7722, device='cuda:0', grad_fn=<AddBackward0>)



 26%|██▌       | 91/354 [08:10<24:00,  5.48s/it][A

	loss_cls: tensor(0.5668, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3235, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8903, device='cuda:0', grad_fn=<AddBackward0>)



 26%|██▌       | 92/354 [08:16<23:52,  5.47s/it][A

	loss_cls: tensor(0.6628, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3970, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0598, device='cuda:0', grad_fn=<AddBackward0>)



 26%|██▋       | 93/354 [08:21<23:42,  5.45s/it][A

	loss_cls: tensor(0.4292, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2360, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6652, device='cuda:0', grad_fn=<AddBackward0>)



 27%|██▋       | 94/354 [08:26<23:41,  5.47s/it][A

	loss_cls: tensor(0.4461, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0507, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4968, device='cuda:0', grad_fn=<AddBackward0>)



 27%|██▋       | 95/354 [08:32<23:32,  5.46s/it][A

	loss_cls: tensor(0.9345, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3839, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.3184, device='cuda:0', grad_fn=<AddBackward0>)



 27%|██▋       | 96/354 [08:37<23:35,  5.49s/it][A

	loss_cls: tensor(0.2139, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0527, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.2666, device='cuda:0', grad_fn=<AddBackward0>)



 27%|██▋       | 97/354 [08:43<23:25,  5.47s/it][A

	loss_cls: tensor(0.5455, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3849, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9304, device='cuda:0', grad_fn=<AddBackward0>)



 28%|██▊       | 98/354 [08:48<23:23,  5.48s/it][A

	loss_cls: tensor(0.5573, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1817, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7389, device='cuda:0', grad_fn=<AddBackward0>)



 28%|██▊       | 99/354 [08:53<22:46,  5.36s/it][A

	loss_cls: tensor(0.4033, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1177, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5210, device='cuda:0', grad_fn=<AddBackward0>)



 28%|██▊       | 100/354 [08:59<22:35,  5.34s/it][A

	loss_cls: tensor(1.2933, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1876, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.4809, device='cuda:0', grad_fn=<AddBackward0>)



 29%|██▊       | 101/354 [09:04<22:42,  5.39s/it][A

	loss_cls: tensor(0.7098, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1585, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8683, device='cuda:0', grad_fn=<AddBackward0>)



 29%|██▉       | 102/354 [09:10<22:40,  5.40s/it][A

	loss_cls: tensor(0.5907, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1277, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7184, device='cuda:0', grad_fn=<AddBackward0>)



 29%|██▉       | 103/354 [09:15<22:44,  5.44s/it][A

	loss_cls: tensor(0.8230, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2786, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1017, device='cuda:0', grad_fn=<AddBackward0>)



 29%|██▉       | 104/354 [09:21<22:37,  5.43s/it][A

	loss_cls: tensor(0.4902, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4735, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9637, device='cuda:0', grad_fn=<AddBackward0>)



 30%|██▉       | 105/354 [09:26<22:37,  5.45s/it][A

	loss_cls: tensor(0.5306, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2514, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7820, device='cuda:0', grad_fn=<AddBackward0>)



 30%|██▉       | 106/354 [09:32<22:29,  5.44s/it][A

	loss_cls: tensor(0.7881, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2624, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0505, device='cuda:0', grad_fn=<AddBackward0>)



 30%|███       | 107/354 [09:37<22:21,  5.43s/it][A

	loss_cls: tensor(0.8821, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3491, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2312, device='cuda:0', grad_fn=<AddBackward0>)



 31%|███       | 108/354 [09:42<22:21,  5.45s/it][A

	loss_cls: tensor(0.6886, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2777, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9663, device='cuda:0', grad_fn=<AddBackward0>)



 31%|███       | 109/354 [09:48<22:13,  5.44s/it][A

	loss_cls: tensor(0.6213, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2269, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8482, device='cuda:0', grad_fn=<AddBackward0>)



 31%|███       | 110/354 [09:53<22:14,  5.47s/it][A

	loss_cls: tensor(0.8521, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1850, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0372, device='cuda:0', grad_fn=<AddBackward0>)



 31%|███▏      | 111/354 [09:59<22:05,  5.46s/it][A

	loss_cls: tensor(1.0204, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1342, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1546, device='cuda:0', grad_fn=<AddBackward0>)



 32%|███▏      | 112/354 [10:04<22:03,  5.47s/it][A

	loss_cls: tensor(0.6692, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5590, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2282, device='cuda:0', grad_fn=<AddBackward0>)



 32%|███▏      | 113/354 [10:10<21:54,  5.45s/it][A

	loss_cls: tensor(0.8000, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.6365, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.4365, device='cuda:0', grad_fn=<AddBackward0>)



 32%|███▏      | 114/354 [10:15<21:45,  5.44s/it][A

	loss_cls: tensor(0.6029, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1998, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8027, device='cuda:0', grad_fn=<AddBackward0>)



 32%|███▏      | 115/354 [10:21<21:46,  5.47s/it][A

	loss_cls: tensor(0.4850, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2365, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7215, device='cuda:0', grad_fn=<AddBackward0>)



 33%|███▎      | 116/354 [10:26<21:38,  5.45s/it][A

	loss_cls: tensor(1.1330, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2548, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.3878, device='cuda:0', grad_fn=<AddBackward0>)



 33%|███▎      | 117/354 [10:32<21:36,  5.47s/it][A

	loss_cls: tensor(0.6958, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1859, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8817, device='cuda:0', grad_fn=<AddBackward0>)



 33%|███▎      | 118/354 [10:37<21:27,  5.46s/it][A

	loss_cls: tensor(0.8990, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1520, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0510, device='cuda:0', grad_fn=<AddBackward0>)



 34%|███▎      | 119/354 [10:43<21:25,  5.47s/it][A

	loss_cls: tensor(0.8086, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1397, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9483, device='cuda:0', grad_fn=<AddBackward0>)



 34%|███▍      | 120/354 [10:48<21:16,  5.45s/it][A

	loss_cls: tensor(0.6415, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3068, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9483, device='cuda:0', grad_fn=<AddBackward0>)



 34%|███▍      | 121/354 [10:53<21:07,  5.44s/it][A

	loss_cls: tensor(0.5439, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4549, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9988, device='cuda:0', grad_fn=<AddBackward0>)



 34%|███▍      | 122/354 [10:59<21:08,  5.47s/it][A

	loss_cls: tensor(0.5634, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1987, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7621, device='cuda:0', grad_fn=<AddBackward0>)



 35%|███▍      | 123/354 [11:04<21:00,  5.46s/it][A

	loss_cls: tensor(0.8658, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0363, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9021, device='cuda:0', grad_fn=<AddBackward0>)



 35%|███▌      | 124/354 [11:10<20:59,  5.48s/it][A

	loss_cls: tensor(0.7281, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2269, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9550, device='cuda:0', grad_fn=<AddBackward0>)



 35%|███▌      | 125/354 [11:15<20:51,  5.47s/it][A

	loss_cls: tensor(0.5126, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4034, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9160, device='cuda:0', grad_fn=<AddBackward0>)



 36%|███▌      | 126/354 [11:21<20:47,  5.47s/it][A

	loss_cls: tensor(0.4284, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1504, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5788, device='cuda:0', grad_fn=<AddBackward0>)



 36%|███▌      | 127/354 [11:26<20:38,  5.45s/it][A

	loss_cls: tensor(0.4405, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2790, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7195, device='cuda:0', grad_fn=<AddBackward0>)



 36%|███▌      | 128/354 [11:32<20:33,  5.46s/it][A

	loss_cls: tensor(0.5848, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2426, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8274, device='cuda:0', grad_fn=<AddBackward0>)



 36%|███▋      | 129/354 [11:37<20:26,  5.45s/it][A

	loss_cls: tensor(0.9646, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5663, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.5308, device='cuda:0', grad_fn=<AddBackward0>)



 37%|███▋      | 130/354 [11:43<20:18,  5.44s/it][A

	loss_cls: tensor(0.3461, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3799, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7260, device='cuda:0', grad_fn=<AddBackward0>)



 37%|███▋      | 131/354 [11:48<20:16,  5.46s/it][A

	loss_cls: tensor(0.7241, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2977, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0218, device='cuda:0', grad_fn=<AddBackward0>)



 37%|███▋      | 132/354 [11:53<20:08,  5.45s/it][A

	loss_cls: tensor(0.5693, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0552, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6245, device='cuda:0', grad_fn=<AddBackward0>)



 38%|███▊      | 133/354 [11:59<19:59,  5.43s/it][A

	loss_cls: tensor(0.5975, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0582, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6557, device='cuda:0', grad_fn=<AddBackward0>)



 38%|███▊      | 134/354 [12:04<19:44,  5.38s/it][A

	loss_cls: tensor(0.9657, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4062, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.3720, device='cuda:0', grad_fn=<AddBackward0>)



 38%|███▊      | 135/354 [12:10<19:40,  5.39s/it][A

	loss_cls: tensor(0.6424, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0904, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7328, device='cuda:0', grad_fn=<AddBackward0>)



 38%|███▊      | 136/354 [12:15<19:29,  5.36s/it][A

	loss_cls: tensor(0.9509, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3772, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.3281, device='cuda:0', grad_fn=<AddBackward0>)



 39%|███▊      | 137/354 [12:20<19:21,  5.35s/it][A

	loss_cls: tensor(0.7123, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2092, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9215, device='cuda:0', grad_fn=<AddBackward0>)



 39%|███▉      | 138/354 [12:26<19:18,  5.36s/it][A

	loss_cls: tensor(0.4607, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1003, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5610, device='cuda:0', grad_fn=<AddBackward0>)



 39%|███▉      | 139/354 [12:31<19:09,  5.35s/it][A

	loss_cls: tensor(0.5750, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4191, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9941, device='cuda:0', grad_fn=<AddBackward0>)



 40%|███▉      | 140/354 [12:36<19:05,  5.35s/it][A

	loss_cls: tensor(0.8655, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4229, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2884, device='cuda:0', grad_fn=<AddBackward0>)



 40%|███▉      | 141/354 [12:41<18:55,  5.33s/it][A

	loss_cls: tensor(0.7034, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5125, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2159, device='cuda:0', grad_fn=<AddBackward0>)



 40%|████      | 142/354 [12:47<18:54,  5.35s/it][A

	loss_cls: tensor(0.6650, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2462, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9112, device='cuda:0', grad_fn=<AddBackward0>)



 40%|████      | 143/354 [12:52<18:46,  5.34s/it][A

	loss_cls: tensor(0.6918, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2281, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9199, device='cuda:0', grad_fn=<AddBackward0>)



 41%|████      | 144/354 [12:57<18:38,  5.33s/it][A

	loss_cls: tensor(1.1414, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0637, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2051, device='cuda:0', grad_fn=<AddBackward0>)



 41%|████      | 145/354 [13:03<18:38,  5.35s/it][A

	loss_cls: tensor(0.7944, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1249, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9193, device='cuda:0', grad_fn=<AddBackward0>)



 41%|████      | 146/354 [13:08<18:28,  5.33s/it][A

	loss_cls: tensor(0.6388, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1557, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7944, device='cuda:0', grad_fn=<AddBackward0>)



 42%|████▏     | 147/354 [13:14<18:27,  5.35s/it][A

	loss_cls: tensor(0.8525, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0831, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9356, device='cuda:0', grad_fn=<AddBackward0>)



 42%|████▏     | 148/354 [13:19<18:17,  5.33s/it][A

	loss_cls: tensor(0.4027, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0383, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4410, device='cuda:0', grad_fn=<AddBackward0>)



 42%|████▏     | 149/354 [13:24<18:15,  5.35s/it][A

	loss_cls: tensor(0.7272, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3332, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0604, device='cuda:0', grad_fn=<AddBackward0>)



 42%|████▏     | 150/354 [13:30<18:08,  5.33s/it][A

	loss_cls: tensor(0.6445, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2981, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9426, device='cuda:0', grad_fn=<AddBackward0>)



 43%|████▎     | 151/354 [13:35<18:01,  5.33s/it][A

	loss_cls: tensor(0.6587, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1119, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7706, device='cuda:0', grad_fn=<AddBackward0>)



 43%|████▎     | 152/354 [13:40<17:59,  5.35s/it][A

	loss_cls: tensor(1.1568, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5538, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.7106, device='cuda:0', grad_fn=<AddBackward0>)



 43%|████▎     | 153/354 [13:46<17:51,  5.33s/it][A

	loss_cls: tensor(1.0110, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3971, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.4081, device='cuda:0', grad_fn=<AddBackward0>)



 44%|████▎     | 154/354 [13:51<17:49,  5.35s/it][A

	loss_cls: tensor(0.6758, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0710, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7467, device='cuda:0', grad_fn=<AddBackward0>)



 44%|████▍     | 155/354 [13:56<17:40,  5.33s/it][A

	loss_cls: tensor(0.4865, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0473, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5338, device='cuda:0', grad_fn=<AddBackward0>)



 44%|████▍     | 156/354 [14:02<17:37,  5.34s/it][A

	loss_cls: tensor(0.6161, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2679, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8840, device='cuda:0', grad_fn=<AddBackward0>)



 44%|████▍     | 157/354 [14:07<17:30,  5.33s/it][A

	loss_cls: tensor(0.7606, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1076, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8682, device='cuda:0', grad_fn=<AddBackward0>)



 45%|████▍     | 158/354 [14:12<17:23,  5.33s/it][A

	loss_cls: tensor(0.5634, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3293, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8927, device='cuda:0', grad_fn=<AddBackward0>)



 45%|████▍     | 159/354 [14:18<17:25,  5.36s/it][A

	loss_cls: tensor(0.4643, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2199, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6842, device='cuda:0', grad_fn=<AddBackward0>)



 45%|████▌     | 160/354 [14:23<17:17,  5.35s/it][A

	loss_cls: tensor(0.7197, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1219, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8416, device='cuda:0', grad_fn=<AddBackward0>)



 45%|████▌     | 161/354 [14:28<17:13,  5.35s/it][A

	loss_cls: tensor(0.9830, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0910, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0740, device='cuda:0', grad_fn=<AddBackward0>)



 46%|████▌     | 162/354 [14:34<17:07,  5.35s/it][A

	loss_cls: tensor(0.6829, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1545, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8374, device='cuda:0', grad_fn=<AddBackward0>)



 46%|████▌     | 163/354 [14:39<17:06,  5.37s/it][A

	loss_cls: tensor(0.8042, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3304, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1346, device='cuda:0', grad_fn=<AddBackward0>)



 46%|████▋     | 164/354 [14:44<16:56,  5.35s/it][A

	loss_cls: tensor(0.7779, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2637, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0416, device='cuda:0', grad_fn=<AddBackward0>)



 47%|████▋     | 165/354 [14:50<16:49,  5.34s/it][A

	loss_cls: tensor(0.7648, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1219, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8866, device='cuda:0', grad_fn=<AddBackward0>)



 47%|████▋     | 166/354 [14:55<16:47,  5.36s/it][A

	loss_cls: tensor(0.4204, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0661, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4865, device='cuda:0', grad_fn=<AddBackward0>)



 47%|████▋     | 167/354 [15:00<16:38,  5.34s/it][A

	loss_cls: tensor(0.6032, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3641, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9673, device='cuda:0', grad_fn=<AddBackward0>)



 47%|████▋     | 168/354 [15:06<16:35,  5.35s/it][A

	loss_cls: tensor(0.6394, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0608, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7002, device='cuda:0', grad_fn=<AddBackward0>)



 48%|████▊     | 169/354 [15:11<16:29,  5.35s/it][A

	loss_cls: tensor(0.3749, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0331, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4079, device='cuda:0', grad_fn=<AddBackward0>)



 48%|████▊     | 170/354 [15:17<16:27,  5.36s/it][A

	loss_cls: tensor(0.4845, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0443, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5288, device='cuda:0', grad_fn=<AddBackward0>)



 48%|████▊     | 171/354 [15:22<16:19,  5.35s/it][A

	loss_cls: tensor(0.3435, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1971, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5406, device='cuda:0', grad_fn=<AddBackward0>)



 49%|████▊     | 172/354 [15:27<16:12,  5.34s/it][A

	loss_cls: tensor(0.6454, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1440, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7894, device='cuda:0', grad_fn=<AddBackward0>)



 49%|████▉     | 173/354 [15:33<16:09,  5.36s/it][A

	loss_cls: tensor(0.9093, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1785, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0877, device='cuda:0', grad_fn=<AddBackward0>)



 49%|████▉     | 174/354 [15:38<16:01,  5.34s/it][A

	loss_cls: tensor(1.0407, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2269, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2676, device='cuda:0', grad_fn=<AddBackward0>)



 49%|████▉     | 175/354 [15:43<16:01,  5.37s/it][A

	loss_cls: tensor(1.2247, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2216, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.4463, device='cuda:0', grad_fn=<AddBackward0>)



 50%|████▉     | 176/354 [15:49<15:53,  5.36s/it][A

	loss_cls: tensor(0.8050, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1456, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9506, device='cuda:0', grad_fn=<AddBackward0>)



 50%|█████     | 177/354 [15:54<15:50,  5.37s/it][A

	loss_cls: tensor(0.9061, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5428, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.4490, device='cuda:0', grad_fn=<AddBackward0>)



 50%|█████     | 178/354 [15:59<15:41,  5.35s/it][A

	loss_cls: tensor(0.6169, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0896, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7065, device='cuda:0', grad_fn=<AddBackward0>)



 51%|█████     | 179/354 [16:05<15:33,  5.33s/it][A

	loss_cls: tensor(0.7306, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3574, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0880, device='cuda:0', grad_fn=<AddBackward0>)



 51%|█████     | 180/354 [16:10<15:30,  5.35s/it][A

	loss_cls: tensor(0.6318, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0330, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6648, device='cuda:0', grad_fn=<AddBackward0>)



 51%|█████     | 181/354 [16:15<15:22,  5.33s/it][A

	loss_cls: tensor(0.4340, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1687, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6026, device='cuda:0', grad_fn=<AddBackward0>)



 51%|█████▏    | 182/354 [16:21<15:21,  5.36s/it][A

	loss_cls: tensor(0.8020, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2133, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0153, device='cuda:0', grad_fn=<AddBackward0>)



 52%|█████▏    | 183/354 [16:26<15:15,  5.36s/it][A

	loss_cls: tensor(0.6301, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1299, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7601, device='cuda:0', grad_fn=<AddBackward0>)



 52%|█████▏    | 184/354 [16:32<15:13,  5.37s/it][A

	loss_cls: tensor(0.6710, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1511, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8221, device='cuda:0', grad_fn=<AddBackward0>)



 52%|█████▏    | 185/354 [16:37<15:05,  5.36s/it][A

	loss_cls: tensor(0.8126, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3162, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1289, device='cuda:0', grad_fn=<AddBackward0>)



 53%|█████▎    | 186/354 [16:42<14:57,  5.34s/it][A

	loss_cls: tensor(0.6064, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3660, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9724, device='cuda:0', grad_fn=<AddBackward0>)



 53%|█████▎    | 187/354 [16:48<14:54,  5.35s/it][A

	loss_cls: tensor(0.5790, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1308, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7098, device='cuda:0', grad_fn=<AddBackward0>)



 53%|█████▎    | 188/354 [16:53<14:46,  5.34s/it][A

	loss_cls: tensor(0.5387, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1571, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6958, device='cuda:0', grad_fn=<AddBackward0>)



 53%|█████▎    | 189/354 [16:58<14:44,  5.36s/it][A

	loss_cls: tensor(0.7641, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1469, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9110, device='cuda:0', grad_fn=<AddBackward0>)



 54%|█████▎    | 190/354 [17:04<14:36,  5.34s/it][A

	loss_cls: tensor(0.4860, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0487, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5346, device='cuda:0', grad_fn=<AddBackward0>)



 54%|█████▍    | 191/354 [17:09<14:34,  5.36s/it][A

	loss_cls: tensor(0.6595, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2729, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9324, device='cuda:0', grad_fn=<AddBackward0>)



 54%|█████▍    | 192/354 [17:14<14:26,  5.35s/it][A

	loss_cls: tensor(0.5800, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5484, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1284, device='cuda:0', grad_fn=<AddBackward0>)



 55%|█████▍    | 193/354 [17:20<14:20,  5.34s/it][A

	loss_cls: tensor(0.3827, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2129, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5956, device='cuda:0', grad_fn=<AddBackward0>)



 55%|█████▍    | 194/354 [17:25<14:18,  5.36s/it][A

	loss_cls: tensor(0.8984, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1933, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0917, device='cuda:0', grad_fn=<AddBackward0>)



 55%|█████▌    | 195/354 [17:30<14:11,  5.36s/it][A

	loss_cls: tensor(0.7987, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0845, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8832, device='cuda:0', grad_fn=<AddBackward0>)



 55%|█████▌    | 196/354 [17:36<14:08,  5.37s/it][A

	loss_cls: tensor(0.7558, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3889, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1448, device='cuda:0', grad_fn=<AddBackward0>)



 56%|█████▌    | 197/354 [17:41<14:00,  5.36s/it][A

	loss_cls: tensor(0.5215, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2042, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7257, device='cuda:0', grad_fn=<AddBackward0>)



 56%|█████▌    | 198/354 [17:46<13:58,  5.37s/it][A

	loss_cls: tensor(0.4744, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1654, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6398, device='cuda:0', grad_fn=<AddBackward0>)



 56%|█████▌    | 199/354 [17:52<13:50,  5.36s/it][A

	loss_cls: tensor(0.7014, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2714, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9729, device='cuda:0', grad_fn=<AddBackward0>)



 56%|█████▋    | 200/354 [17:57<13:42,  5.34s/it][A

	loss_cls: tensor(0.7194, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0971, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8165, device='cuda:0', grad_fn=<AddBackward0>)



 57%|█████▋    | 201/354 [18:02<13:39,  5.36s/it][A

	loss_cls: tensor(0.5121, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1923, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7045, device='cuda:0', grad_fn=<AddBackward0>)



 57%|█████▋    | 202/354 [18:08<13:32,  5.34s/it][A

	loss_cls: tensor(0.5173, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0401, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5574, device='cuda:0', grad_fn=<AddBackward0>)



 57%|█████▋    | 203/354 [18:13<13:30,  5.37s/it][A

	loss_cls: tensor(0.5490, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1875, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7365, device='cuda:0', grad_fn=<AddBackward0>)



 58%|█████▊    | 204/354 [18:19<13:24,  5.36s/it][A

	loss_cls: tensor(0.7249, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1090, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8338, device='cuda:0', grad_fn=<AddBackward0>)



 58%|█████▊    | 205/354 [18:24<13:21,  5.38s/it][A

	loss_cls: tensor(0.6853, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5195, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2049, device='cuda:0', grad_fn=<AddBackward0>)



 58%|█████▊    | 206/354 [18:29<13:12,  5.36s/it][A

	loss_cls: tensor(0.5563, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0805, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6369, device='cuda:0', grad_fn=<AddBackward0>)



 58%|█████▊    | 207/354 [18:35<13:05,  5.34s/it][A

	loss_cls: tensor(0.8466, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3853, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2319, device='cuda:0', grad_fn=<AddBackward0>)



 59%|█████▉    | 208/354 [18:40<13:03,  5.37s/it][A

	loss_cls: tensor(0.7351, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2965, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0316, device='cuda:0', grad_fn=<AddBackward0>)



 59%|█████▉    | 209/354 [18:45<12:56,  5.36s/it][A

	loss_cls: tensor(0.7262, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.6413, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.3675, device='cuda:0', grad_fn=<AddBackward0>)



 59%|█████▉    | 210/354 [18:51<12:52,  5.37s/it][A

	loss_cls: tensor(0.5461, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1468, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6929, device='cuda:0', grad_fn=<AddBackward0>)



 60%|█████▉    | 211/354 [18:56<12:45,  5.36s/it][A

	loss_cls: tensor(0.6872, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1344, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8217, device='cuda:0', grad_fn=<AddBackward0>)



 60%|█████▉    | 212/354 [19:01<12:39,  5.35s/it][A

	loss_cls: tensor(0.4710, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2694, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7404, device='cuda:0', grad_fn=<AddBackward0>)



 60%|██████    | 213/354 [19:07<12:32,  5.34s/it][A

	loss_cls: tensor(0.6591, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1780, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8371, device='cuda:0', grad_fn=<AddBackward0>)



 60%|██████    | 214/354 [19:12<12:25,  5.32s/it][A

	loss_cls: tensor(0.7113, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5902, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.3015, device='cuda:0', grad_fn=<AddBackward0>)



 61%|██████    | 215/354 [19:17<12:23,  5.35s/it][A

	loss_cls: tensor(0.6999, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2763, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9762, device='cuda:0', grad_fn=<AddBackward0>)



 61%|██████    | 216/354 [19:23<12:17,  5.34s/it][A

	loss_cls: tensor(0.5444, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1533, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6976, device='cuda:0', grad_fn=<AddBackward0>)



 61%|██████▏   | 217/354 [19:28<12:13,  5.35s/it][A

	loss_cls: tensor(0.5447, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3687, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9135, device='cuda:0', grad_fn=<AddBackward0>)



 62%|██████▏   | 218/354 [19:33<12:05,  5.33s/it][A

	loss_cls: tensor(0.8025, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0703, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8728, device='cuda:0', grad_fn=<AddBackward0>)



 62%|██████▏   | 219/354 [19:39<12:01,  5.34s/it][A

	loss_cls: tensor(0.6650, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2273, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8923, device='cuda:0', grad_fn=<AddBackward0>)



 62%|██████▏   | 220/354 [19:44<11:53,  5.33s/it][A

	loss_cls: tensor(0.4742, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1638, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6380, device='cuda:0', grad_fn=<AddBackward0>)



 62%|██████▏   | 221/354 [19:49<11:47,  5.32s/it][A

	loss_cls: tensor(0.7426, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1284, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8709, device='cuda:0', grad_fn=<AddBackward0>)



 63%|██████▎   | 222/354 [19:55<11:46,  5.35s/it][A

	loss_cls: tensor(0.6744, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2997, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9741, device='cuda:0', grad_fn=<AddBackward0>)



 63%|██████▎   | 223/354 [20:00<11:40,  5.35s/it][A

	loss_cls: tensor(1.0970, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1486, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2457, device='cuda:0', grad_fn=<AddBackward0>)



 63%|██████▎   | 224/354 [20:06<11:37,  5.37s/it][A

	loss_cls: tensor(0.6902, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4745, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1647, device='cuda:0', grad_fn=<AddBackward0>)



 64%|██████▎   | 225/354 [20:11<11:30,  5.35s/it][A

	loss_cls: tensor(0.6514, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2620, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9134, device='cuda:0', grad_fn=<AddBackward0>)



 64%|██████▍   | 226/354 [20:16<11:26,  5.36s/it][A

	loss_cls: tensor(0.5751, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0594, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6344, device='cuda:0', grad_fn=<AddBackward0>)



 64%|██████▍   | 227/354 [20:22<11:19,  5.35s/it][A

	loss_cls: tensor(0.5711, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0980, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6691, device='cuda:0', grad_fn=<AddBackward0>)



 64%|██████▍   | 228/354 [20:27<11:17,  5.38s/it][A

	loss_cls: tensor(0.9138, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2431, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1568, device='cuda:0', grad_fn=<AddBackward0>)



 65%|██████▍   | 229/354 [20:32<11:10,  5.37s/it][A

	loss_cls: tensor(0.3855, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3716, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7571, device='cuda:0', grad_fn=<AddBackward0>)



 65%|██████▍   | 230/354 [20:38<11:03,  5.35s/it][A

	loss_cls: tensor(1.0880, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3052, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.3932, device='cuda:0', grad_fn=<AddBackward0>)



 65%|██████▌   | 231/354 [20:43<10:59,  5.36s/it][A

	loss_cls: tensor(0.8479, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2017, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0495, device='cuda:0', grad_fn=<AddBackward0>)



 66%|██████▌   | 232/354 [20:48<10:52,  5.35s/it][A

	loss_cls: tensor(0.4314, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1181, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5495, device='cuda:0', grad_fn=<AddBackward0>)



 66%|██████▌   | 233/354 [20:54<10:48,  5.36s/it][A

	loss_cls: tensor(0.7727, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5945, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.3671, device='cuda:0', grad_fn=<AddBackward0>)



 66%|██████▌   | 234/354 [20:59<10:40,  5.34s/it][A

	loss_cls: tensor(0.6736, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3347, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0083, device='cuda:0', grad_fn=<AddBackward0>)



 66%|██████▋   | 235/354 [21:04<10:37,  5.35s/it][A

	loss_cls: tensor(0.7653, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2060, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9713, device='cuda:0', grad_fn=<AddBackward0>)



 67%|██████▋   | 236/354 [21:10<10:30,  5.35s/it][A

	loss_cls: tensor(0.8104, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3027, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1132, device='cuda:0', grad_fn=<AddBackward0>)



 67%|██████▋   | 237/354 [21:15<10:24,  5.34s/it][A

	loss_cls: tensor(0.5001, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1797, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6798, device='cuda:0', grad_fn=<AddBackward0>)



 67%|██████▋   | 238/354 [21:20<10:21,  5.36s/it][A

	loss_cls: tensor(0.8808, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3231, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2039, device='cuda:0', grad_fn=<AddBackward0>)



 68%|██████▊   | 239/354 [21:26<10:14,  5.34s/it][A

	loss_cls: tensor(0.7212, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3919, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1131, device='cuda:0', grad_fn=<AddBackward0>)



 68%|██████▊   | 240/354 [21:31<10:11,  5.36s/it][A

	loss_cls: tensor(0.6796, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2985, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9781, device='cuda:0', grad_fn=<AddBackward0>)



 68%|██████▊   | 241/354 [21:37<10:04,  5.35s/it][A

	loss_cls: tensor(0.6292, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2658, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8950, device='cuda:0', grad_fn=<AddBackward0>)



 68%|██████▊   | 242/354 [21:42<10:00,  5.36s/it][A

	loss_cls: tensor(1.0306, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2230, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2537, device='cuda:0', grad_fn=<AddBackward0>)



 69%|██████▊   | 243/354 [21:47<09:54,  5.35s/it][A

	loss_cls: tensor(0.6816, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3556, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0372, device='cuda:0', grad_fn=<AddBackward0>)



 69%|██████▉   | 244/354 [21:53<09:48,  5.35s/it][A

	loss_cls: tensor(0.8585, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0543, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9128, device='cuda:0', grad_fn=<AddBackward0>)



 69%|██████▉   | 245/354 [21:58<09:45,  5.37s/it][A

	loss_cls: tensor(0.4727, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2555, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7282, device='cuda:0', grad_fn=<AddBackward0>)



 69%|██████▉   | 246/354 [22:03<09:37,  5.35s/it][A

	loss_cls: tensor(0.4863, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3704, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8567, device='cuda:0', grad_fn=<AddBackward0>)



 70%|██████▉   | 247/354 [22:09<09:33,  5.36s/it][A

	loss_cls: tensor(0.7231, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5058, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2288, device='cuda:0', grad_fn=<AddBackward0>)



 70%|███████   | 248/354 [22:14<09:26,  5.35s/it][A

	loss_cls: tensor(0.8758, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4236, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2994, device='cuda:0', grad_fn=<AddBackward0>)



 70%|███████   | 249/354 [22:19<09:22,  5.36s/it][A

	loss_cls: tensor(0.7967, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3173, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1139, device='cuda:0', grad_fn=<AddBackward0>)



 71%|███████   | 250/354 [22:25<09:16,  5.36s/it][A

	loss_cls: tensor(0.6650, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1485, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8135, device='cuda:0', grad_fn=<AddBackward0>)



 71%|███████   | 251/354 [22:30<09:10,  5.35s/it][A

	loss_cls: tensor(0.5411, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2849, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8260, device='cuda:0', grad_fn=<AddBackward0>)



 71%|███████   | 252/354 [22:35<09:07,  5.36s/it][A

	loss_cls: tensor(0.3846, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0719, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4566, device='cuda:0', grad_fn=<AddBackward0>)



 71%|███████▏  | 253/354 [22:41<09:02,  5.37s/it][A

	loss_cls: tensor(0.3735, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0676, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4412, device='cuda:0', grad_fn=<AddBackward0>)



 72%|███████▏  | 254/354 [22:46<08:57,  5.37s/it][A

	loss_cls: tensor(0.4833, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0952, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5784, device='cuda:0', grad_fn=<AddBackward0>)



 72%|███████▏  | 255/354 [22:52<08:50,  5.36s/it][A

	loss_cls: tensor(0.4909, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1372, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6281, device='cuda:0', grad_fn=<AddBackward0>)



 72%|███████▏  | 256/354 [22:57<08:46,  5.38s/it][A

	loss_cls: tensor(0.8965, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1784, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0749, device='cuda:0', grad_fn=<AddBackward0>)



 73%|███████▎  | 257/354 [23:02<08:40,  5.37s/it][A

	loss_cls: tensor(0.3635, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0695, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4330, device='cuda:0', grad_fn=<AddBackward0>)



 73%|███████▎  | 258/354 [23:08<08:33,  5.35s/it][A

	loss_cls: tensor(0.9259, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5625, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.4884, device='cuda:0', grad_fn=<AddBackward0>)



 73%|███████▎  | 259/354 [23:13<08:29,  5.36s/it][A

	loss_cls: tensor(0.6876, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2475, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9351, device='cuda:0', grad_fn=<AddBackward0>)



 73%|███████▎  | 260/354 [23:18<08:22,  5.34s/it][A

	loss_cls: tensor(0.9452, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2105, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1558, device='cuda:0', grad_fn=<AddBackward0>)



 74%|███████▎  | 261/354 [23:24<08:17,  5.35s/it][A

	loss_cls: tensor(0.6427, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2216, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8644, device='cuda:0', grad_fn=<AddBackward0>)



 74%|███████▍  | 262/354 [23:29<08:11,  5.34s/it][A

	loss_cls: tensor(0.4442, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0504, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4946, device='cuda:0', grad_fn=<AddBackward0>)



 74%|███████▍  | 263/354 [23:34<08:07,  5.36s/it][A

	loss_cls: tensor(0.5184, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3244, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8427, device='cuda:0', grad_fn=<AddBackward0>)



 75%|███████▍  | 264/354 [23:40<08:01,  5.35s/it][A

	loss_cls: tensor(0.7677, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2015, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9692, device='cuda:0', grad_fn=<AddBackward0>)



 75%|███████▍  | 265/354 [23:45<07:55,  5.34s/it][A

	loss_cls: tensor(0.6020, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1169, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7189, device='cuda:0', grad_fn=<AddBackward0>)



 75%|███████▌  | 266/354 [23:50<07:50,  5.35s/it][A

	loss_cls: tensor(0.7412, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2236, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9648, device='cuda:0', grad_fn=<AddBackward0>)



 75%|███████▌  | 267/354 [23:56<07:43,  5.33s/it][A

	loss_cls: tensor(0.5295, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0399, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5694, device='cuda:0', grad_fn=<AddBackward0>)



 76%|███████▌  | 268/354 [24:01<07:39,  5.34s/it][A

	loss_cls: tensor(0.5637, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1573, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7210, device='cuda:0', grad_fn=<AddBackward0>)



 76%|███████▌  | 269/354 [24:06<07:33,  5.33s/it][A

	loss_cls: tensor(0.7389, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5763, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.3152, device='cuda:0', grad_fn=<AddBackward0>)



 76%|███████▋  | 270/354 [24:12<07:30,  5.36s/it][A

	loss_cls: tensor(0.5365, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0447, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5812, device='cuda:0', grad_fn=<AddBackward0>)



 77%|███████▋  | 271/354 [24:17<07:23,  5.34s/it][A

	loss_cls: tensor(0.6148, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1499, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7646, device='cuda:0', grad_fn=<AddBackward0>)



 77%|███████▋  | 272/354 [24:22<07:17,  5.34s/it][A

	loss_cls: tensor(0.6997, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4413, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1409, device='cuda:0', grad_fn=<AddBackward0>)



 77%|███████▋  | 273/354 [24:28<07:13,  5.35s/it][A

	loss_cls: tensor(0.6167, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1633, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7801, device='cuda:0', grad_fn=<AddBackward0>)



 77%|███████▋  | 274/354 [24:33<07:06,  5.34s/it][A

	loss_cls: tensor(0.6111, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4084, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0195, device='cuda:0', grad_fn=<AddBackward0>)



 78%|███████▊  | 275/354 [24:39<07:03,  5.36s/it][A

	loss_cls: tensor(0.6360, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1720, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8081, device='cuda:0', grad_fn=<AddBackward0>)



 78%|███████▊  | 276/354 [24:44<06:57,  5.35s/it][A

	loss_cls: tensor(0.5750, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1272, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7022, device='cuda:0', grad_fn=<AddBackward0>)



 78%|███████▊  | 277/354 [24:49<06:53,  5.37s/it][A

	loss_cls: tensor(0.5619, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1643, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7263, device='cuda:0', grad_fn=<AddBackward0>)



 79%|███████▊  | 278/354 [24:55<06:46,  5.35s/it][A

	loss_cls: tensor(0.5149, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0950, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6100, device='cuda:0', grad_fn=<AddBackward0>)



 79%|███████▉  | 279/354 [25:00<06:40,  5.34s/it][A

	loss_cls: tensor(0.3890, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4637, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8526, device='cuda:0', grad_fn=<AddBackward0>)



 79%|███████▉  | 280/354 [25:05<06:36,  5.35s/it][A

	loss_cls: tensor(0.5463, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1966, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7429, device='cuda:0', grad_fn=<AddBackward0>)



 79%|███████▉  | 281/354 [25:11<06:30,  5.34s/it][A

	loss_cls: tensor(1.0470, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2991, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.3461, device='cuda:0', grad_fn=<AddBackward0>)



 80%|███████▉  | 282/354 [25:16<06:26,  5.37s/it][A

	loss_cls: tensor(0.8338, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1024, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9362, device='cuda:0', grad_fn=<AddBackward0>)



 80%|███████▉  | 283/354 [25:21<06:20,  5.36s/it][A

	loss_cls: tensor(0.5136, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1386, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6522, device='cuda:0', grad_fn=<AddBackward0>)



 80%|████████  | 284/354 [25:27<06:15,  5.37s/it][A

	loss_cls: tensor(0.2440, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0360, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.2800, device='cuda:0', grad_fn=<AddBackward0>)



 81%|████████  | 285/354 [25:32<06:08,  5.34s/it][A

	loss_cls: tensor(0.6409, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1414, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7823, device='cuda:0', grad_fn=<AddBackward0>)



 81%|████████  | 286/354 [25:37<06:02,  5.33s/it][A

	loss_cls: tensor(0.5210, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1723, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6933, device='cuda:0', grad_fn=<AddBackward0>)



 81%|████████  | 287/354 [25:43<05:58,  5.35s/it][A

	loss_cls: tensor(0.5199, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1281, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6480, device='cuda:0', grad_fn=<AddBackward0>)



 81%|████████▏ | 288/354 [25:48<05:52,  5.34s/it][A

	loss_cls: tensor(0.5213, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1936, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7149, device='cuda:0', grad_fn=<AddBackward0>)



 82%|████████▏ | 289/354 [25:53<05:48,  5.36s/it][A

	loss_cls: tensor(0.9332, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2116, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1447, device='cuda:0', grad_fn=<AddBackward0>)



 82%|████████▏ | 290/354 [25:59<05:42,  5.35s/it][A

	loss_cls: tensor(0.5461, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1180, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6641, device='cuda:0', grad_fn=<AddBackward0>)



 82%|████████▏ | 291/354 [26:04<05:38,  5.37s/it][A

	loss_cls: tensor(0.6813, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3532, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0345, device='cuda:0', grad_fn=<AddBackward0>)



 82%|████████▏ | 292/354 [26:10<05:31,  5.35s/it][A

	loss_cls: tensor(0.4419, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1376, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5795, device='cuda:0', grad_fn=<AddBackward0>)



 83%|████████▎ | 293/354 [26:15<05:25,  5.34s/it][A

	loss_cls: tensor(0.7364, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2216, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9580, device='cuda:0', grad_fn=<AddBackward0>)



 83%|████████▎ | 294/354 [26:20<05:21,  5.36s/it][A

	loss_cls: tensor(0.5974, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3352, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9326, device='cuda:0', grad_fn=<AddBackward0>)



 83%|████████▎ | 295/354 [26:26<05:15,  5.35s/it][A

	loss_cls: tensor(0.8459, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3611, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2070, device='cuda:0', grad_fn=<AddBackward0>)



 84%|████████▎ | 296/354 [26:31<05:11,  5.37s/it][A

	loss_cls: tensor(0.6254, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0903, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7157, device='cuda:0', grad_fn=<AddBackward0>)



 84%|████████▍ | 297/354 [26:36<05:05,  5.36s/it][A

	loss_cls: tensor(0.6667, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1550, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8217, device='cuda:0', grad_fn=<AddBackward0>)



 84%|████████▍ | 298/354 [26:42<05:01,  5.38s/it][A

	loss_cls: tensor(0.6945, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1400, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8345, device='cuda:0', grad_fn=<AddBackward0>)



 84%|████████▍ | 299/354 [26:47<04:55,  5.36s/it][A

	loss_cls: tensor(0.7303, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0565, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7868, device='cuda:0', grad_fn=<AddBackward0>)



 85%|████████▍ | 300/354 [26:52<04:49,  5.36s/it][A

	loss_cls: tensor(0.7920, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1527, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9447, device='cuda:0', grad_fn=<AddBackward0>)



 85%|████████▌ | 301/354 [26:58<04:45,  5.38s/it][A

	loss_cls: tensor(0.6541, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3770, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0311, device='cuda:0', grad_fn=<AddBackward0>)



 85%|████████▌ | 302/354 [27:03<04:39,  5.37s/it][A

	loss_cls: tensor(0.6422, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4384, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0807, device='cuda:0', grad_fn=<AddBackward0>)



 86%|████████▌ | 303/354 [27:09<04:35,  5.41s/it][A

	loss_cls: tensor(0.4293, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3392, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7685, device='cuda:0', grad_fn=<AddBackward0>)



 86%|████████▌ | 304/354 [27:14<04:30,  5.40s/it][A

	loss_cls: tensor(0.6681, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3569, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0249, device='cuda:0', grad_fn=<AddBackward0>)



 86%|████████▌ | 305/354 [27:19<04:24,  5.41s/it][A

	loss_cls: tensor(0.5580, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3122, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8702, device='cuda:0', grad_fn=<AddBackward0>)



 86%|████████▋ | 306/354 [27:25<04:18,  5.39s/it][A

	loss_cls: tensor(0.5206, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1147, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6353, device='cuda:0', grad_fn=<AddBackward0>)



 87%|████████▋ | 307/354 [27:30<04:13,  5.38s/it][A

	loss_cls: tensor(0.7087, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2770, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9857, device='cuda:0', grad_fn=<AddBackward0>)



 87%|████████▋ | 308/354 [27:36<04:08,  5.41s/it][A

	loss_cls: tensor(0.4351, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1983, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6334, device='cuda:0', grad_fn=<AddBackward0>)



 87%|████████▋ | 309/354 [27:41<04:03,  5.40s/it][A

	loss_cls: tensor(0.5619, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2652, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8271, device='cuda:0', grad_fn=<AddBackward0>)



 88%|████████▊ | 310/354 [27:47<03:58,  5.42s/it][A

	loss_cls: tensor(0.6535, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2968, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9503, device='cuda:0', grad_fn=<AddBackward0>)



 88%|████████▊ | 311/354 [27:52<03:52,  5.40s/it][A

	loss_cls: tensor(0.6389, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1628, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8018, device='cuda:0', grad_fn=<AddBackward0>)



 88%|████████▊ | 312/354 [27:57<03:47,  5.41s/it][A

	loss_cls: tensor(0.4754, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1268, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6022, device='cuda:0', grad_fn=<AddBackward0>)



 88%|████████▊ | 313/354 [28:03<03:41,  5.39s/it][A

	loss_cls: tensor(0.5895, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1072, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6967, device='cuda:0', grad_fn=<AddBackward0>)



 89%|████████▊ | 314/354 [28:08<03:35,  5.38s/it][A

	loss_cls: tensor(0.5605, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2284, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7889, device='cuda:0', grad_fn=<AddBackward0>)



 89%|████████▉ | 315/354 [28:13<03:30,  5.40s/it][A

	loss_cls: tensor(0.6077, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5899, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1976, device='cuda:0', grad_fn=<AddBackward0>)



 89%|████████▉ | 316/354 [28:19<03:24,  5.39s/it][A

	loss_cls: tensor(0.4374, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3733, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8107, device='cuda:0', grad_fn=<AddBackward0>)



 90%|████████▉ | 317/354 [28:24<03:19,  5.40s/it][A

	loss_cls: tensor(0.6738, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1238, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7977, device='cuda:0', grad_fn=<AddBackward0>)



 90%|████████▉ | 318/354 [28:30<03:14,  5.39s/it][A

	loss_cls: tensor(0.7358, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2505, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9863, device='cuda:0', grad_fn=<AddBackward0>)



 90%|█████████ | 319/354 [28:35<03:09,  5.40s/it][A

	loss_cls: tensor(0.5387, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1993, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7380, device='cuda:0', grad_fn=<AddBackward0>)



 90%|█████████ | 320/354 [28:40<03:03,  5.39s/it][A

	loss_cls: tensor(0.8280, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1349, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9629, device='cuda:0', grad_fn=<AddBackward0>)



 91%|█████████ | 321/354 [28:46<02:57,  5.38s/it][A

	loss_cls: tensor(0.4449, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2676, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7125, device='cuda:0', grad_fn=<AddBackward0>)



 91%|█████████ | 322/354 [28:51<02:53,  5.41s/it][A

	loss_cls: tensor(0.7479, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3971, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1450, device='cuda:0', grad_fn=<AddBackward0>)



 91%|█████████ | 323/354 [28:57<02:47,  5.40s/it][A

	loss_cls: tensor(0.6780, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2144, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8924, device='cuda:0', grad_fn=<AddBackward0>)



 92%|█████████▏| 324/354 [29:02<02:42,  5.42s/it][A

	loss_cls: tensor(0.5123, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3881, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9004, device='cuda:0', grad_fn=<AddBackward0>)



 92%|█████████▏| 325/354 [29:07<02:36,  5.41s/it][A

	loss_cls: tensor(0.5174, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0612, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5786, device='cuda:0', grad_fn=<AddBackward0>)



 92%|█████████▏| 326/354 [29:13<02:31,  5.43s/it][A

	loss_cls: tensor(0.4068, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1632, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5700, device='cuda:0', grad_fn=<AddBackward0>)



 92%|█████████▏| 327/354 [29:18<02:26,  5.41s/it][A

	loss_cls: tensor(0.4316, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0644, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4961, device='cuda:0', grad_fn=<AddBackward0>)



 93%|█████████▎| 328/354 [29:24<02:21,  5.43s/it][A

	loss_cls: tensor(0.5248, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5973, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1221, device='cuda:0', grad_fn=<AddBackward0>)



 93%|█████████▎| 329/354 [29:29<02:15,  5.42s/it][A

	loss_cls: tensor(1.1309, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3921, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.5229, device='cuda:0', grad_fn=<AddBackward0>)



 93%|█████████▎| 330/354 [29:35<02:09,  5.40s/it][A

	loss_cls: tensor(0.6617, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2759, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9376, device='cuda:0', grad_fn=<AddBackward0>)



 94%|█████████▎| 331/354 [29:40<02:04,  5.42s/it][A

	loss_cls: tensor(0.6983, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2869, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9851, device='cuda:0', grad_fn=<AddBackward0>)



 94%|█████████▍| 332/354 [29:45<01:58,  5.40s/it][A

	loss_cls: tensor(0.5215, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2119, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7333, device='cuda:0', grad_fn=<AddBackward0>)



 94%|█████████▍| 333/354 [29:51<01:53,  5.42s/it][A

	loss_cls: tensor(0.5999, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2733, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8732, device='cuda:0', grad_fn=<AddBackward0>)



 94%|█████████▍| 334/354 [29:56<01:48,  5.41s/it][A

	loss_cls: tensor(1.0051, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3505, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.3555, device='cuda:0', grad_fn=<AddBackward0>)



 95%|█████████▍| 335/354 [30:02<01:42,  5.42s/it][A

	loss_cls: tensor(0.4516, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2032, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6548, device='cuda:0', grad_fn=<AddBackward0>)



 95%|█████████▍| 336/354 [30:07<01:37,  5.41s/it][A

	loss_cls: tensor(0.4570, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1803, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6373, device='cuda:0', grad_fn=<AddBackward0>)



 95%|█████████▌| 337/354 [30:12<01:31,  5.40s/it][A

	loss_cls: tensor(0.6422, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1828, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8250, device='cuda:0', grad_fn=<AddBackward0>)



 95%|█████████▌| 338/354 [30:18<01:26,  5.42s/it][A

	loss_cls: tensor(0.4211, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1951, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6163, device='cuda:0', grad_fn=<AddBackward0>)



 96%|█████████▌| 339/354 [30:23<01:21,  5.41s/it][A

	loss_cls: tensor(0.5168, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3922, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9090, device='cuda:0', grad_fn=<AddBackward0>)



 96%|█████████▌| 340/354 [30:29<01:15,  5.42s/it][A

	loss_cls: tensor(0.5382, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1111, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6493, device='cuda:0', grad_fn=<AddBackward0>)



 96%|█████████▋| 341/354 [30:34<01:10,  5.40s/it][A

	loss_cls: tensor(0.6053, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4220, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0273, device='cuda:0', grad_fn=<AddBackward0>)



 97%|█████████▋| 342/354 [30:40<01:05,  5.42s/it][A

	loss_cls: tensor(0.4661, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2085, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6746, device='cuda:0', grad_fn=<AddBackward0>)



 97%|█████████▋| 343/354 [30:45<00:59,  5.41s/it][A

	loss_cls: tensor(0.6158, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3372, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9531, device='cuda:0', grad_fn=<AddBackward0>)



 97%|█████████▋| 344/354 [30:50<00:53,  5.40s/it][A

	loss_cls: tensor(0.4997, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5309, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0306, device='cuda:0', grad_fn=<AddBackward0>)



 97%|█████████▋| 345/354 [30:56<00:48,  5.42s/it][A

	loss_cls: tensor(0.6163, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0707, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6870, device='cuda:0', grad_fn=<AddBackward0>)



 98%|█████████▊| 346/354 [31:01<00:43,  5.40s/it][A

	loss_cls: tensor(0.6507, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1791, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8298, device='cuda:0', grad_fn=<AddBackward0>)



 98%|█████████▊| 347/354 [31:07<00:37,  5.41s/it][A

	loss_cls: tensor(0.6154, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1129, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7283, device='cuda:0', grad_fn=<AddBackward0>)



 98%|█████████▊| 348/354 [31:12<00:32,  5.41s/it][A

	loss_cls: tensor(0.6396, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1672, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8068, device='cuda:0', grad_fn=<AddBackward0>)



 99%|█████████▊| 349/354 [31:17<00:27,  5.43s/it][A

	loss_cls: tensor(0.3746, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.6370, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0116, device='cuda:0', grad_fn=<AddBackward0>)



 99%|█████████▉| 350/354 [31:23<00:21,  5.42s/it][A

	loss_cls: tensor(0.5850, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3123, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8973, device='cuda:0', grad_fn=<AddBackward0>)



 99%|█████████▉| 351/354 [31:28<00:16,  5.41s/it][A

	loss_cls: tensor(0.6313, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1647, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7960, device='cuda:0', grad_fn=<AddBackward0>)



 99%|█████████▉| 352/354 [31:34<00:10,  5.42s/it][A

	loss_cls: tensor(0.6781, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1892, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8672, device='cuda:0', grad_fn=<AddBackward0>)



100%|█████████▉| 353/354 [31:39<00:05,  5.41s/it][A

	loss_cls: tensor(0.3849, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1859, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5708, device='cuda:0', grad_fn=<AddBackward0>)



100%|██████████| 354/354 [31:41<00:00,  5.37s/it][A

  0%|          | 0/177 [00:00<?, ?it/s][A

	loss_cls: tensor(0.5082, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2521, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7604, device='cuda:0', grad_fn=<AddBackward0>)

	Training Loss: 0.9206956624142868

	Training cls acc: 0.6710216572504709

	Training cls prec: 0.5562021292423834

	Training cls rec: 0.5766956913143354

	Training cls f1: 0.4987859471758733

--
	Training ner acc: 0.9536479205653522

	Training ner prec: 0.2758251929387814

	Training ner rec: 0.28460068355142215

	Training ner f1: 0.2797634649456594

	Current Learning rate:  0.0009714285714285714



  1%|          | 1/177 [00:00<01:59,  1.48it/s][A
  1%|          | 2/177 [00:01<01:52,  1.55it/s][A
  2%|▏         | 3/177 [00:02<01:56,  1.49it/s][A
  2%|▏         | 4/177 [00:02<01:52,  1.53it/s][A
  3%|▎         | 5/177 [00:03<01:55,  1.49it/s][A
  3%|▎         | 6/177 [00:03<01:52,  1.52it/s][A
  4%|▍         | 7/177 [00:04<01:54,  1.48it/s][A
  5%|▍         | 8/177 [00:05<01:51,  1.52it/s][A
  5%|▌         | 9/177 [00:06<01:53,  1.48it/s][A
  6%|▌         | 10/177 [00:06<01:53,  1.47it/s][A
  6%|▌         | 11/177 [00:07<01:51,  1.49it/s][A
  7%|▋         | 12/177 [00:08<01:53,  1.46it/s][A
  7%|▋         | 13/177 [00:08<01:53,  1.44it/s][A
  8%|▊         | 14/177 [00:09<01:49,  1.48it/s][A
  8%|▊         | 15/177 [00:10<01:51,  1.45it/s][A
  9%|▉         | 16/177 [00:10<01:52,  1.43it/s][A
 10%|▉         | 17/177 [00:11<01:51,  1.43it/s][A
 10%|█         | 18/177 [00:12<01:47,  1.48it/s][A
 11%|█         | 19/177 [00:12<01:48,  1.45it/s][A
 11%|█▏        | 20/


	Validation Loss: 0.7835228587947997

	Validation cls acc: 0.6812617702448212

	Validation cls prec: 0.608440947000269

	Validation cls rec: 0.585774818401937

	Validation cls f1: 0.543976738891993

--
	Validation ner acc: 0.9554677495730383

	Validation ner prec: 0.4286903235150377

	Validation ner rec: 0.43898305084745765

	Validation ner f1: 0.4336314221745952



  0%|          | 1/354 [00:05<31:43,  5.39s/it][A

	loss_cls: tensor(0.5409, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1869, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7278, device='cuda:0', grad_fn=<AddBackward0>)



  1%|          | 2/354 [00:10<31:52,  5.43s/it][A

	loss_cls: tensor(0.6733, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3060, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9792, device='cuda:0', grad_fn=<AddBackward0>)



  1%|          | 3/354 [00:16<31:38,  5.41s/it][A

	loss_cls: tensor(0.7942, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1349, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9292, device='cuda:0', grad_fn=<AddBackward0>)



  1%|          | 4/354 [00:21<31:25,  5.39s/it][A

	loss_cls: tensor(0.6535, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5256, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1791, device='cuda:0', grad_fn=<AddBackward0>)



  1%|▏         | 5/354 [00:27<31:36,  5.43s/it][A

	loss_cls: tensor(0.4474, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2079, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6553, device='cuda:0', grad_fn=<AddBackward0>)



  2%|▏         | 6/354 [00:32<31:25,  5.42s/it][A

	loss_cls: tensor(0.7152, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1235, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8387, device='cuda:0', grad_fn=<AddBackward0>)



  2%|▏         | 7/354 [00:37<31:26,  5.44s/it][A

	loss_cls: tensor(0.4890, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2513, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7403, device='cuda:0', grad_fn=<AddBackward0>)



  2%|▏         | 8/354 [00:43<31:13,  5.41s/it][A

	loss_cls: tensor(0.5433, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1968, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7400, device='cuda:0', grad_fn=<AddBackward0>)



  3%|▎         | 9/354 [00:48<31:12,  5.43s/it][A

	loss_cls: tensor(0.4388, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2963, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7352, device='cuda:0', grad_fn=<AddBackward0>)



  3%|▎         | 10/354 [00:54<31:00,  5.41s/it][A

	loss_cls: tensor(0.7857, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3857, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1714, device='cuda:0', grad_fn=<AddBackward0>)



  3%|▎         | 11/354 [00:59<30:58,  5.42s/it][A

	loss_cls: tensor(0.7504, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4389, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1894, device='cuda:0', grad_fn=<AddBackward0>)



  3%|▎         | 12/354 [01:05<30:53,  5.42s/it][A

	loss_cls: tensor(1.0915, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2734, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.3649, device='cuda:0', grad_fn=<AddBackward0>)



  4%|▎         | 13/354 [01:10<30:45,  5.41s/it][A

	loss_cls: tensor(0.6192, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3091, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9283, device='cuda:0', grad_fn=<AddBackward0>)



  4%|▍         | 14/354 [01:15<30:46,  5.43s/it][A

	loss_cls: tensor(1.0295, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1144, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1440, device='cuda:0', grad_fn=<AddBackward0>)



  4%|▍         | 15/354 [01:21<30:35,  5.41s/it][A

	loss_cls: tensor(0.5080, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5151, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0231, device='cuda:0', grad_fn=<AddBackward0>)



  5%|▍         | 16/354 [01:26<30:36,  5.43s/it][A

	loss_cls: tensor(0.7278, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1351, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8628, device='cuda:0', grad_fn=<AddBackward0>)



  5%|▍         | 17/354 [01:32<30:23,  5.41s/it][A

	loss_cls: tensor(0.7300, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1224, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8523, device='cuda:0', grad_fn=<AddBackward0>)



  5%|▌         | 18/354 [01:37<30:24,  5.43s/it][A

	loss_cls: tensor(0.4087, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5028, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9115, device='cuda:0', grad_fn=<AddBackward0>)



  5%|▌         | 19/354 [01:42<30:16,  5.42s/it][A

	loss_cls: tensor(0.5327, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2832, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8159, device='cuda:0', grad_fn=<AddBackward0>)



  6%|▌         | 20/354 [01:48<30:06,  5.41s/it][A

	loss_cls: tensor(0.5179, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3119, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8298, device='cuda:0', grad_fn=<AddBackward0>)



  6%|▌         | 21/354 [01:53<30:05,  5.42s/it][A

	loss_cls: tensor(0.7750, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4341, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2092, device='cuda:0', grad_fn=<AddBackward0>)



  6%|▌         | 22/354 [01:59<29:55,  5.41s/it][A

	loss_cls: tensor(0.7988, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2566, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0554, device='cuda:0', grad_fn=<AddBackward0>)



  6%|▋         | 23/354 [02:04<29:53,  5.42s/it][A

	loss_cls: tensor(0.6822, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3872, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0694, device='cuda:0', grad_fn=<AddBackward0>)



  7%|▋         | 24/354 [02:10<29:43,  5.41s/it][A

	loss_cls: tensor(0.6310, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1856, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8166, device='cuda:0', grad_fn=<AddBackward0>)



  7%|▋         | 25/354 [02:15<29:44,  5.43s/it][A

	loss_cls: tensor(0.5045, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2068, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7113, device='cuda:0', grad_fn=<AddBackward0>)



  7%|▋         | 26/354 [02:20<29:36,  5.42s/it][A

	loss_cls: tensor(0.5270, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1206, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6476, device='cuda:0', grad_fn=<AddBackward0>)



  8%|▊         | 27/354 [02:26<29:28,  5.41s/it][A

	loss_cls: tensor(1.0904, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3445, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.4348, device='cuda:0', grad_fn=<AddBackward0>)



  8%|▊         | 28/354 [02:31<29:29,  5.43s/it][A

	loss_cls: tensor(0.6884, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1445, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8330, device='cuda:0', grad_fn=<AddBackward0>)



  8%|▊         | 29/354 [02:37<29:17,  5.41s/it][A

	loss_cls: tensor(0.4900, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2461, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7361, device='cuda:0', grad_fn=<AddBackward0>)



  8%|▊         | 30/354 [02:42<29:14,  5.42s/it][A

	loss_cls: tensor(0.6315, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3632, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9947, device='cuda:0', grad_fn=<AddBackward0>)



  9%|▉         | 31/354 [02:47<29:02,  5.40s/it][A

	loss_cls: tensor(0.6018, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4905, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0923, device='cuda:0', grad_fn=<AddBackward0>)



  9%|▉         | 32/354 [02:53<29:05,  5.42s/it][A

	loss_cls: tensor(1.1093, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4033, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.5127, device='cuda:0', grad_fn=<AddBackward0>)



  9%|▉         | 33/354 [02:58<28:55,  5.41s/it][A

	loss_cls: tensor(0.4478, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3897, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8375, device='cuda:0', grad_fn=<AddBackward0>)



 10%|▉         | 34/354 [03:04<28:47,  5.40s/it][A

	loss_cls: tensor(0.7370, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2627, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9997, device='cuda:0', grad_fn=<AddBackward0>)



 10%|▉         | 35/354 [03:09<28:47,  5.42s/it][A

	loss_cls: tensor(0.7186, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1200, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8386, device='cuda:0', grad_fn=<AddBackward0>)



 10%|█         | 36/354 [03:14<28:38,  5.40s/it][A

	loss_cls: tensor(0.7281, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3062, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0342, device='cuda:0', grad_fn=<AddBackward0>)



 10%|█         | 37/354 [03:20<28:36,  5.42s/it][A

	loss_cls: tensor(0.6222, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2224, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8446, device='cuda:0', grad_fn=<AddBackward0>)



 11%|█         | 38/354 [03:25<28:31,  5.42s/it][A

	loss_cls: tensor(1.2174, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1055, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.3229, device='cuda:0', grad_fn=<AddBackward0>)



 11%|█         | 39/354 [03:31<28:31,  5.43s/it][A

	loss_cls: tensor(0.7274, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3471, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0745, device='cuda:0', grad_fn=<AddBackward0>)



 11%|█▏        | 40/354 [03:36<28:25,  5.43s/it][A

	loss_cls: tensor(0.5857, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1932, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7789, device='cuda:0', grad_fn=<AddBackward0>)



 12%|█▏        | 41/354 [03:42<28:14,  5.42s/it][A

	loss_cls: tensor(0.6600, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3679, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0278, device='cuda:0', grad_fn=<AddBackward0>)



 12%|█▏        | 42/354 [03:47<28:13,  5.43s/it][A

	loss_cls: tensor(0.5839, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2868, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8707, device='cuda:0', grad_fn=<AddBackward0>)



 12%|█▏        | 43/354 [03:52<28:06,  5.42s/it][A

	loss_cls: tensor(0.5619, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1529, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7148, device='cuda:0', grad_fn=<AddBackward0>)



 12%|█▏        | 44/354 [03:58<28:04,  5.43s/it][A

	loss_cls: tensor(0.5654, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4143, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9796, device='cuda:0', grad_fn=<AddBackward0>)



 13%|█▎        | 45/354 [04:03<27:53,  5.41s/it][A

	loss_cls: tensor(0.4851, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2853, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7705, device='cuda:0', grad_fn=<AddBackward0>)



 13%|█▎        | 46/354 [04:09<27:51,  5.43s/it][A

	loss_cls: tensor(0.4082, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0942, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5025, device='cuda:0', grad_fn=<AddBackward0>)



 13%|█▎        | 47/354 [04:14<27:45,  5.42s/it][A

	loss_cls: tensor(0.5429, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2543, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7972, device='cuda:0', grad_fn=<AddBackward0>)



 14%|█▎        | 48/354 [04:20<27:37,  5.42s/it][A

	loss_cls: tensor(0.6872, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2090, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8962, device='cuda:0', grad_fn=<AddBackward0>)



 14%|█▍        | 49/354 [04:25<27:43,  5.46s/it][A

	loss_cls: tensor(0.4920, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2346, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7265, device='cuda:0', grad_fn=<AddBackward0>)



 14%|█▍        | 50/354 [04:31<27:35,  5.44s/it][A

	loss_cls: tensor(0.6096, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3089, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9185, device='cuda:0', grad_fn=<AddBackward0>)



 14%|█▍        | 51/354 [04:36<27:36,  5.47s/it][A

	loss_cls: tensor(0.8131, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1268, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9400, device='cuda:0', grad_fn=<AddBackward0>)



 15%|█▍        | 52/354 [04:41<27:30,  5.47s/it][A

	loss_cls: tensor(0.6593, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2964, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9558, device='cuda:0', grad_fn=<AddBackward0>)



 15%|█▍        | 53/354 [04:47<27:32,  5.49s/it][A

	loss_cls: tensor(0.5172, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3550, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8722, device='cuda:0', grad_fn=<AddBackward0>)



 15%|█▌        | 54/354 [04:52<27:21,  5.47s/it][A

	loss_cls: tensor(0.6148, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1570, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7719, device='cuda:0', grad_fn=<AddBackward0>)



 16%|█▌        | 55/354 [04:58<27:11,  5.46s/it][A

	loss_cls: tensor(0.5319, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3017, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8336, device='cuda:0', grad_fn=<AddBackward0>)



 16%|█▌        | 56/354 [05:03<27:09,  5.47s/it][A

	loss_cls: tensor(0.7669, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1834, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9503, device='cuda:0', grad_fn=<AddBackward0>)



 16%|█▌        | 57/354 [05:09<26:58,  5.45s/it][A

	loss_cls: tensor(0.6851, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2083, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8934, device='cuda:0', grad_fn=<AddBackward0>)



 16%|█▋        | 58/354 [05:14<27:02,  5.48s/it][A

	loss_cls: tensor(0.6940, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4541, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1481, device='cuda:0', grad_fn=<AddBackward0>)



 17%|█▋        | 59/354 [05:20<26:54,  5.47s/it][A

	loss_cls: tensor(0.4666, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3651, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8317, device='cuda:0', grad_fn=<AddBackward0>)



 17%|█▋        | 60/354 [05:25<26:53,  5.49s/it][A

	loss_cls: tensor(0.7118, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1773, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8891, device='cuda:0', grad_fn=<AddBackward0>)



 17%|█▋        | 61/354 [05:31<26:45,  5.48s/it][A

	loss_cls: tensor(0.8542, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0942, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9484, device='cuda:0', grad_fn=<AddBackward0>)



 18%|█▊        | 62/354 [05:36<26:36,  5.47s/it][A

	loss_cls: tensor(0.5874, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0743, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6616, device='cuda:0', grad_fn=<AddBackward0>)



 18%|█▊        | 63/354 [05:42<26:34,  5.48s/it][A

	loss_cls: tensor(0.6432, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2491, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8924, device='cuda:0', grad_fn=<AddBackward0>)



 18%|█▊        | 64/354 [05:47<26:24,  5.46s/it][A

	loss_cls: tensor(0.4777, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1531, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6308, device='cuda:0', grad_fn=<AddBackward0>)



 18%|█▊        | 65/354 [05:53<26:25,  5.49s/it][A

	loss_cls: tensor(0.7539, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2851, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0390, device='cuda:0', grad_fn=<AddBackward0>)



 19%|█▊        | 66/354 [05:58<26:14,  5.47s/it][A

	loss_cls: tensor(0.5312, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2682, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7994, device='cuda:0', grad_fn=<AddBackward0>)



 19%|█▉        | 67/354 [06:04<26:11,  5.48s/it][A

	loss_cls: tensor(0.5440, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1645, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7085, device='cuda:0', grad_fn=<AddBackward0>)



 19%|█▉        | 68/354 [06:09<26:01,  5.46s/it][A

	loss_cls: tensor(0.8342, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1592, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9933, device='cuda:0', grad_fn=<AddBackward0>)



 19%|█▉        | 69/354 [06:14<25:51,  5.44s/it][A

	loss_cls: tensor(0.3214, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3910, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7124, device='cuda:0', grad_fn=<AddBackward0>)



 20%|█▉        | 70/354 [06:20<25:49,  5.46s/it][A

	loss_cls: tensor(0.3586, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0341, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.3927, device='cuda:0', grad_fn=<AddBackward0>)



 20%|██        | 71/354 [06:25<25:41,  5.45s/it][A

	loss_cls: tensor(0.7250, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3520, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0770, device='cuda:0', grad_fn=<AddBackward0>)



 20%|██        | 72/354 [06:31<25:41,  5.47s/it][A

	loss_cls: tensor(0.2377, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0298, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.2674, device='cuda:0', grad_fn=<AddBackward0>)



 21%|██        | 73/354 [06:36<25:34,  5.46s/it][A

	loss_cls: tensor(1.0146, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1349, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1494, device='cuda:0', grad_fn=<AddBackward0>)



 21%|██        | 74/354 [06:42<25:33,  5.48s/it][A

	loss_cls: tensor(0.3482, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0413, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.3895, device='cuda:0', grad_fn=<AddBackward0>)



 21%|██        | 75/354 [06:47<25:23,  5.46s/it][A

	loss_cls: tensor(1.2307, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2441, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.4748, device='cuda:0', grad_fn=<AddBackward0>)



 21%|██▏       | 76/354 [06:53<25:15,  5.45s/it][A

	loss_cls: tensor(0.2694, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0360, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.3054, device='cuda:0', grad_fn=<AddBackward0>)



 22%|██▏       | 77/354 [06:58<25:14,  5.47s/it][A

	loss_cls: tensor(0.9407, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2506, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1913, device='cuda:0', grad_fn=<AddBackward0>)



 22%|██▏       | 78/354 [07:04<25:05,  5.46s/it][A

	loss_cls: tensor(0.8113, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2430, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0542, device='cuda:0', grad_fn=<AddBackward0>)



 22%|██▏       | 79/354 [07:09<25:05,  5.48s/it][A

	loss_cls: tensor(0.9569, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1971, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1540, device='cuda:0', grad_fn=<AddBackward0>)



 23%|██▎       | 80/354 [07:15<24:56,  5.46s/it][A

	loss_cls: tensor(0.3985, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1431, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5416, device='cuda:0', grad_fn=<AddBackward0>)



 23%|██▎       | 81/354 [07:20<24:54,  5.47s/it][A

	loss_cls: tensor(0.4060, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0590, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4650, device='cuda:0', grad_fn=<AddBackward0>)



 23%|██▎       | 82/354 [07:26<24:45,  5.46s/it][A

	loss_cls: tensor(0.6526, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2748, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9274, device='cuda:0', grad_fn=<AddBackward0>)



 23%|██▎       | 83/354 [07:31<24:38,  5.45s/it][A

	loss_cls: tensor(0.4539, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2374, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6913, device='cuda:0', grad_fn=<AddBackward0>)



 24%|██▎       | 84/354 [07:36<24:35,  5.47s/it][A

	loss_cls: tensor(0.6504, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0983, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7487, device='cuda:0', grad_fn=<AddBackward0>)



 24%|██▍       | 85/354 [07:42<24:28,  5.46s/it][A

	loss_cls: tensor(0.6306, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2506, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8812, device='cuda:0', grad_fn=<AddBackward0>)



 24%|██▍       | 86/354 [07:47<24:27,  5.48s/it][A

	loss_cls: tensor(0.5590, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2174, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7765, device='cuda:0', grad_fn=<AddBackward0>)



 25%|██▍       | 87/354 [07:53<24:18,  5.46s/it][A

	loss_cls: tensor(0.5718, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4214, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9933, device='cuda:0', grad_fn=<AddBackward0>)



 25%|██▍       | 88/354 [07:58<24:15,  5.47s/it][A

	loss_cls: tensor(0.7875, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1934, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9809, device='cuda:0', grad_fn=<AddBackward0>)



 25%|██▌       | 89/354 [08:03<23:43,  5.37s/it][A

	loss_cls: tensor(0.6420, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1775, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8195, device='cuda:0', grad_fn=<AddBackward0>)



 25%|██▌       | 90/354 [08:09<23:37,  5.37s/it][A

	loss_cls: tensor(0.8863, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2511, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1374, device='cuda:0', grad_fn=<AddBackward0>)



 26%|██▌       | 91/354 [08:14<23:36,  5.39s/it][A

	loss_cls: tensor(0.5625, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2141, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7766, device='cuda:0', grad_fn=<AddBackward0>)



 26%|██▌       | 92/354 [08:20<23:31,  5.39s/it][A

	loss_cls: tensor(0.4889, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3627, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8515, device='cuda:0', grad_fn=<AddBackward0>)



 26%|██▋       | 93/354 [08:25<23:32,  5.41s/it][A

	loss_cls: tensor(0.7358, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.6809, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.4168, device='cuda:0', grad_fn=<AddBackward0>)



 27%|██▋       | 94/354 [08:30<23:23,  5.40s/it][A

	loss_cls: tensor(0.5908, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2450, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8358, device='cuda:0', grad_fn=<AddBackward0>)



 27%|██▋       | 95/354 [08:36<23:21,  5.41s/it][A

	loss_cls: tensor(0.6747, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3686, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0432, device='cuda:0', grad_fn=<AddBackward0>)



 27%|██▋       | 96/354 [08:41<23:10,  5.39s/it][A

	loss_cls: tensor(1.0740, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3131, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.3871, device='cuda:0', grad_fn=<AddBackward0>)



 27%|██▋       | 97/354 [08:47<23:00,  5.37s/it][A

	loss_cls: tensor(0.6547, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2700, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9247, device='cuda:0', grad_fn=<AddBackward0>)



 28%|██▊       | 98/354 [08:52<22:56,  5.38s/it][A

	loss_cls: tensor(0.6723, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2094, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8817, device='cuda:0', grad_fn=<AddBackward0>)



 28%|██▊       | 99/354 [08:57<22:46,  5.36s/it][A

	loss_cls: tensor(0.6481, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1770, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8251, device='cuda:0', grad_fn=<AddBackward0>)



 28%|██▊       | 100/354 [09:03<22:42,  5.37s/it][A

	loss_cls: tensor(0.4567, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4159, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8726, device='cuda:0', grad_fn=<AddBackward0>)



 29%|██▊       | 101/354 [09:08<22:31,  5.34s/it][A

	loss_cls: tensor(0.4110, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3824, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7933, device='cuda:0', grad_fn=<AddBackward0>)



 29%|██▉       | 102/354 [09:13<22:28,  5.35s/it][A

	loss_cls: tensor(0.7046, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0877, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7923, device='cuda:0', grad_fn=<AddBackward0>)



 29%|██▉       | 103/354 [09:19<22:17,  5.33s/it][A

	loss_cls: tensor(0.5168, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4536, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9704, device='cuda:0', grad_fn=<AddBackward0>)



 29%|██▉       | 104/354 [09:24<22:10,  5.32s/it][A

	loss_cls: tensor(0.5939, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2993, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8933, device='cuda:0', grad_fn=<AddBackward0>)



 30%|██▉       | 105/354 [09:29<22:10,  5.34s/it][A

	loss_cls: tensor(0.7216, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1051, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8267, device='cuda:0', grad_fn=<AddBackward0>)



 30%|██▉       | 106/354 [09:35<22:01,  5.33s/it][A

	loss_cls: tensor(0.6637, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1605, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8242, device='cuda:0', grad_fn=<AddBackward0>)



 30%|███       | 107/354 [09:40<22:00,  5.35s/it][A

	loss_cls: tensor(0.5043, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1125, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6169, device='cuda:0', grad_fn=<AddBackward0>)



 31%|███       | 108/354 [09:45<21:50,  5.33s/it][A

	loss_cls: tensor(0.5320, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2485, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7804, device='cuda:0', grad_fn=<AddBackward0>)



 31%|███       | 109/354 [09:51<21:49,  5.34s/it][A

	loss_cls: tensor(0.7184, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2359, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9543, device='cuda:0', grad_fn=<AddBackward0>)



 31%|███       | 110/354 [09:56<21:39,  5.33s/it][A

	loss_cls: tensor(0.5711, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5615, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1325, device='cuda:0', grad_fn=<AddBackward0>)



 31%|███▏      | 111/354 [10:01<21:34,  5.33s/it][A

	loss_cls: tensor(0.8140, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2076, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0216, device='cuda:0', grad_fn=<AddBackward0>)



 32%|███▏      | 112/354 [10:07<21:26,  5.32s/it][A

	loss_cls: tensor(0.8106, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1312, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9418, device='cuda:0', grad_fn=<AddBackward0>)



 32%|███▏      | 113/354 [10:12<21:21,  5.32s/it][A

	loss_cls: tensor(0.5073, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3835, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8907, device='cuda:0', grad_fn=<AddBackward0>)



 32%|███▏      | 114/354 [10:17<21:21,  5.34s/it][A

	loss_cls: tensor(0.7255, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.7103, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.4358, device='cuda:0', grad_fn=<AddBackward0>)



 32%|███▏      | 115/354 [10:23<21:13,  5.33s/it][A

	loss_cls: tensor(0.5577, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2811, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8388, device='cuda:0', grad_fn=<AddBackward0>)



 33%|███▎      | 116/354 [10:28<21:10,  5.34s/it][A

	loss_cls: tensor(0.6294, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1430, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7724, device='cuda:0', grad_fn=<AddBackward0>)



 33%|███▎      | 117/354 [10:33<21:01,  5.32s/it][A

	loss_cls: tensor(0.8460, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3035, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1495, device='cuda:0', grad_fn=<AddBackward0>)



 33%|███▎      | 118/354 [10:39<20:59,  5.34s/it][A

	loss_cls: tensor(0.6586, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1088, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7675, device='cuda:0', grad_fn=<AddBackward0>)



 34%|███▎      | 119/354 [10:44<20:52,  5.33s/it][A

	loss_cls: tensor(0.7024, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2540, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9564, device='cuda:0', grad_fn=<AddBackward0>)



 34%|███▍      | 120/354 [10:49<20:45,  5.32s/it][A

	loss_cls: tensor(0.7451, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2099, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9550, device='cuda:0', grad_fn=<AddBackward0>)



 34%|███▍      | 121/354 [10:55<20:46,  5.35s/it][A

	loss_cls: tensor(0.6781, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1583, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8364, device='cuda:0', grad_fn=<AddBackward0>)



 34%|███▍      | 122/354 [11:00<20:37,  5.33s/it][A

	loss_cls: tensor(0.9024, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2910, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1934, device='cuda:0', grad_fn=<AddBackward0>)



 35%|███▍      | 123/354 [11:05<20:35,  5.35s/it][A

	loss_cls: tensor(0.6918, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1101, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8020, device='cuda:0', grad_fn=<AddBackward0>)



 35%|███▌      | 124/354 [11:11<20:24,  5.32s/it][A

	loss_cls: tensor(0.6717, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3253, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9970, device='cuda:0', grad_fn=<AddBackward0>)



 35%|███▌      | 125/354 [11:16<20:24,  5.35s/it][A

	loss_cls: tensor(0.6954, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2674, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9628, device='cuda:0', grad_fn=<AddBackward0>)



 36%|███▌      | 126/354 [11:21<20:16,  5.33s/it][A

	loss_cls: tensor(0.4509, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2852, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7362, device='cuda:0', grad_fn=<AddBackward0>)



 36%|███▌      | 127/354 [11:27<20:09,  5.33s/it][A

	loss_cls: tensor(0.6675, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1776, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8451, device='cuda:0', grad_fn=<AddBackward0>)



 36%|███▌      | 128/354 [11:32<20:06,  5.34s/it][A

	loss_cls: tensor(0.6678, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5117, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1795, device='cuda:0', grad_fn=<AddBackward0>)



 36%|███▋      | 129/354 [11:37<19:57,  5.32s/it][A

	loss_cls: tensor(0.6972, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0268, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7240, device='cuda:0', grad_fn=<AddBackward0>)



 37%|███▋      | 130/354 [11:43<19:55,  5.34s/it][A

	loss_cls: tensor(0.5175, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3346, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8521, device='cuda:0', grad_fn=<AddBackward0>)



 37%|███▋      | 131/354 [11:48<19:45,  5.32s/it][A

	loss_cls: tensor(0.8555, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0450, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9005, device='cuda:0', grad_fn=<AddBackward0>)



 37%|███▋      | 132/354 [11:53<19:46,  5.35s/it][A

	loss_cls: tensor(0.4799, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2268, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7067, device='cuda:0', grad_fn=<AddBackward0>)



 38%|███▊      | 133/354 [11:59<19:37,  5.33s/it][A

	loss_cls: tensor(0.5223, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1168, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6391, device='cuda:0', grad_fn=<AddBackward0>)



 38%|███▊      | 134/354 [12:04<19:30,  5.32s/it][A

	loss_cls: tensor(0.5850, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2674, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8524, device='cuda:0', grad_fn=<AddBackward0>)



 38%|███▊      | 135/354 [12:09<19:28,  5.33s/it][A

	loss_cls: tensor(0.4991, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1740, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6731, device='cuda:0', grad_fn=<AddBackward0>)



 38%|███▊      | 136/354 [12:15<19:19,  5.32s/it][A

	loss_cls: tensor(0.6783, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2140, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8923, device='cuda:0', grad_fn=<AddBackward0>)



 39%|███▊      | 137/354 [12:20<19:16,  5.33s/it][A

	loss_cls: tensor(0.6653, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1230, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7883, device='cuda:0', grad_fn=<AddBackward0>)



 39%|███▉      | 138/354 [12:25<19:08,  5.32s/it][A

	loss_cls: tensor(0.5713, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1551, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7264, device='cuda:0', grad_fn=<AddBackward0>)



 39%|███▉      | 139/354 [12:31<19:07,  5.34s/it][A

	loss_cls: tensor(0.7593, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2584, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0177, device='cuda:0', grad_fn=<AddBackward0>)



 40%|███▉      | 140/354 [12:36<19:00,  5.33s/it][A

	loss_cls: tensor(0.7464, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.7024, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.4488, device='cuda:0', grad_fn=<AddBackward0>)



 40%|███▉      | 141/354 [12:41<18:52,  5.32s/it][A

	loss_cls: tensor(0.5941, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1021, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6962, device='cuda:0', grad_fn=<AddBackward0>)



 40%|████      | 142/354 [12:47<18:50,  5.33s/it][A

	loss_cls: tensor(0.6147, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3017, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9164, device='cuda:0', grad_fn=<AddBackward0>)



 40%|████      | 143/354 [12:52<18:42,  5.32s/it][A

	loss_cls: tensor(0.5044, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0944, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5988, device='cuda:0', grad_fn=<AddBackward0>)



 41%|████      | 144/354 [12:57<18:39,  5.33s/it][A

	loss_cls: tensor(0.5172, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1443, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6615, device='cuda:0', grad_fn=<AddBackward0>)



 41%|████      | 145/354 [13:02<18:31,  5.32s/it][A

	loss_cls: tensor(0.4832, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4043, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8875, device='cuda:0', grad_fn=<AddBackward0>)



 41%|████      | 146/354 [13:08<18:29,  5.33s/it][A

	loss_cls: tensor(0.6601, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5498, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2099, device='cuda:0', grad_fn=<AddBackward0>)



 42%|████▏     | 147/354 [13:13<18:23,  5.33s/it][A

	loss_cls: tensor(0.5439, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1578, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7018, device='cuda:0', grad_fn=<AddBackward0>)



 42%|████▏     | 148/354 [13:18<18:15,  5.32s/it][A

	loss_cls: tensor(0.6857, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2326, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9183, device='cuda:0', grad_fn=<AddBackward0>)



 42%|████▏     | 149/354 [13:24<18:14,  5.34s/it][A

	loss_cls: tensor(0.6203, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3396, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9599, device='cuda:0', grad_fn=<AddBackward0>)



 42%|████▏     | 150/354 [13:29<18:05,  5.32s/it][A

	loss_cls: tensor(0.5458, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1834, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7292, device='cuda:0', grad_fn=<AddBackward0>)



 43%|████▎     | 151/354 [13:34<18:03,  5.34s/it][A

	loss_cls: tensor(0.6016, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2711, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8728, device='cuda:0', grad_fn=<AddBackward0>)



 43%|████▎     | 152/354 [13:40<17:56,  5.33s/it][A

	loss_cls: tensor(0.7621, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4491, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2111, device='cuda:0', grad_fn=<AddBackward0>)



 43%|████▎     | 153/354 [13:45<17:54,  5.35s/it][A

	loss_cls: tensor(0.6212, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4136, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0347, device='cuda:0', grad_fn=<AddBackward0>)



 44%|████▎     | 154/354 [13:50<17:47,  5.34s/it][A

	loss_cls: tensor(0.6906, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4422, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1328, device='cuda:0', grad_fn=<AddBackward0>)



 44%|████▍     | 155/354 [13:56<17:39,  5.33s/it][A

	loss_cls: tensor(0.7215, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4048, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1263, device='cuda:0', grad_fn=<AddBackward0>)



 44%|████▍     | 156/354 [14:01<17:39,  5.35s/it][A

	loss_cls: tensor(0.4026, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3737, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7763, device='cuda:0', grad_fn=<AddBackward0>)



 44%|████▍     | 157/354 [14:06<17:29,  5.32s/it][A

	loss_cls: tensor(0.5480, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1417, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6897, device='cuda:0', grad_fn=<AddBackward0>)



 45%|████▍     | 158/354 [14:12<17:26,  5.34s/it][A

	loss_cls: tensor(1.1298, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1445, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2742, device='cuda:0', grad_fn=<AddBackward0>)



 45%|████▍     | 159/354 [14:17<17:18,  5.33s/it][A

	loss_cls: tensor(0.7842, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1637, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9479, device='cuda:0', grad_fn=<AddBackward0>)



 45%|████▌     | 160/354 [14:23<17:16,  5.35s/it][A

	loss_cls: tensor(0.9023, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0983, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0006, device='cuda:0', grad_fn=<AddBackward0>)



 45%|████▌     | 161/354 [14:28<17:09,  5.34s/it][A

	loss_cls: tensor(0.7460, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1349, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8809, device='cuda:0', grad_fn=<AddBackward0>)



 46%|████▌     | 162/354 [14:33<16:59,  5.31s/it][A

	loss_cls: tensor(0.5427, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1332, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6759, device='cuda:0', grad_fn=<AddBackward0>)



 46%|████▌     | 163/354 [14:38<16:57,  5.33s/it][A

	loss_cls: tensor(0.4865, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1993, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6858, device='cuda:0', grad_fn=<AddBackward0>)



 46%|████▋     | 164/354 [14:44<16:48,  5.31s/it][A

	loss_cls: tensor(0.7425, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3683, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1108, device='cuda:0', grad_fn=<AddBackward0>)



 47%|████▋     | 165/354 [14:49<16:48,  5.34s/it][A

	loss_cls: tensor(0.7082, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2240, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9322, device='cuda:0', grad_fn=<AddBackward0>)



 47%|████▋     | 166/354 [14:54<16:40,  5.32s/it][A

	loss_cls: tensor(0.7360, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3072, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0431, device='cuda:0', grad_fn=<AddBackward0>)



 47%|████▋     | 167/354 [15:00<16:38,  5.34s/it][A

	loss_cls: tensor(0.7355, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3399, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0754, device='cuda:0', grad_fn=<AddBackward0>)



 47%|████▋     | 168/354 [15:05<16:30,  5.33s/it][A

	loss_cls: tensor(0.7397, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4565, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1962, device='cuda:0', grad_fn=<AddBackward0>)



 48%|████▊     | 169/354 [15:10<16:22,  5.31s/it][A

	loss_cls: tensor(0.3701, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0335, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4037, device='cuda:0', grad_fn=<AddBackward0>)



 48%|████▊     | 170/354 [15:16<16:21,  5.34s/it][A

	loss_cls: tensor(0.9267, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3503, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2770, device='cuda:0', grad_fn=<AddBackward0>)



 48%|████▊     | 171/354 [15:21<16:14,  5.33s/it][A

	loss_cls: tensor(0.4797, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3668, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8465, device='cuda:0', grad_fn=<AddBackward0>)



 49%|████▊     | 172/354 [15:26<16:15,  5.36s/it][A

	loss_cls: tensor(0.4740, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2219, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6959, device='cuda:0', grad_fn=<AddBackward0>)



 49%|████▉     | 173/354 [15:32<16:07,  5.34s/it][A

	loss_cls: tensor(0.4674, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1866, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6540, device='cuda:0', grad_fn=<AddBackward0>)



 49%|████▉     | 174/354 [15:37<16:04,  5.36s/it][A

	loss_cls: tensor(0.6014, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2361, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8375, device='cuda:0', grad_fn=<AddBackward0>)



 49%|████▉     | 175/354 [15:42<15:55,  5.34s/it][A

	loss_cls: tensor(0.6026, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2342, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8368, device='cuda:0', grad_fn=<AddBackward0>)



 50%|████▉     | 176/354 [15:48<15:46,  5.32s/it][A

	loss_cls: tensor(0.6500, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2864, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9364, device='cuda:0', grad_fn=<AddBackward0>)



 50%|█████     | 177/354 [15:53<15:43,  5.33s/it][A

	loss_cls: tensor(0.4899, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2449, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7349, device='cuda:0', grad_fn=<AddBackward0>)



 50%|█████     | 178/354 [15:58<15:36,  5.32s/it][A

	loss_cls: tensor(0.7289, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2238, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9527, device='cuda:0', grad_fn=<AddBackward0>)



 51%|█████     | 179/354 [16:04<15:35,  5.35s/it][A

	loss_cls: tensor(0.4794, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2104, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6897, device='cuda:0', grad_fn=<AddBackward0>)



 51%|█████     | 180/354 [16:09<15:28,  5.33s/it][A

	loss_cls: tensor(0.6001, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1715, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7716, device='cuda:0', grad_fn=<AddBackward0>)



 51%|█████     | 181/354 [16:15<15:32,  5.39s/it][A

	loss_cls: tensor(0.6079, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0356, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6435, device='cuda:0', grad_fn=<AddBackward0>)



 51%|█████▏    | 182/354 [16:20<15:31,  5.42s/it][A

	loss_cls: tensor(0.5906, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0290, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6196, device='cuda:0', grad_fn=<AddBackward0>)



 52%|█████▏    | 183/354 [16:26<15:28,  5.43s/it][A

	loss_cls: tensor(0.5841, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0239, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6080, device='cuda:0', grad_fn=<AddBackward0>)



 52%|█████▏    | 184/354 [16:31<15:27,  5.46s/it][A

	loss_cls: tensor(0.6792, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1581, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8373, device='cuda:0', grad_fn=<AddBackward0>)



 52%|█████▏    | 185/354 [16:36<14:56,  5.30s/it][A

	loss_cls: tensor(0.5720, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3954, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9675, device='cuda:0', grad_fn=<AddBackward0>)



 53%|█████▎    | 186/354 [16:41<14:54,  5.33s/it][A

	loss_cls: tensor(0.9618, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2332, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1949, device='cuda:0', grad_fn=<AddBackward0>)



 53%|█████▎    | 187/354 [16:46<14:31,  5.22s/it][A

	loss_cls: tensor(0.7154, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1867, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9020, device='cuda:0', grad_fn=<AddBackward0>)



 53%|█████▎    | 188/354 [16:52<14:34,  5.27s/it][A

	loss_cls: tensor(0.7092, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1729, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8821, device='cuda:0', grad_fn=<AddBackward0>)



 53%|█████▎    | 189/354 [16:57<14:37,  5.32s/it][A

	loss_cls: tensor(0.1925, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0128, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.2053, device='cuda:0', grad_fn=<AddBackward0>)



 54%|█████▎    | 190/354 [17:03<14:39,  5.36s/it][A

	loss_cls: tensor(0.3669, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0175, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.3844, device='cuda:0', grad_fn=<AddBackward0>)



 54%|█████▍    | 191/354 [17:08<14:42,  5.41s/it][A

	loss_cls: tensor(0.5129, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2470, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7599, device='cuda:0', grad_fn=<AddBackward0>)



 54%|█████▍    | 192/354 [17:14<14:40,  5.43s/it][A

	loss_cls: tensor(0.6626, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2354, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8980, device='cuda:0', grad_fn=<AddBackward0>)



 55%|█████▍    | 193/354 [17:19<14:42,  5.48s/it][A

	loss_cls: tensor(0.8771, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2220, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0991, device='cuda:0', grad_fn=<AddBackward0>)



 55%|█████▍    | 194/354 [17:25<14:35,  5.47s/it][A

	loss_cls: tensor(0.6441, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1452, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7893, device='cuda:0', grad_fn=<AddBackward0>)



 55%|█████▌    | 195/354 [17:30<14:32,  5.49s/it][A

	loss_cls: tensor(1.2856, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2060, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.4916, device='cuda:0', grad_fn=<AddBackward0>)



 55%|█████▌    | 196/354 [17:36<14:25,  5.48s/it][A

	loss_cls: tensor(0.3278, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0357, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.3635, device='cuda:0', grad_fn=<AddBackward0>)



 56%|█████▌    | 197/354 [17:41<14:18,  5.47s/it][A

	loss_cls: tensor(0.7975, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2132, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0107, device='cuda:0', grad_fn=<AddBackward0>)



 56%|█████▌    | 198/354 [17:47<14:18,  5.50s/it][A

	loss_cls: tensor(1.0781, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1504, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2285, device='cuda:0', grad_fn=<AddBackward0>)



 56%|█████▌    | 199/354 [17:52<14:10,  5.49s/it][A

	loss_cls: tensor(0.5000, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0388, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5388, device='cuda:0', grad_fn=<AddBackward0>)



 56%|█████▋    | 200/354 [17:58<14:07,  5.51s/it][A

	loss_cls: tensor(0.4579, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1775, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6354, device='cuda:0', grad_fn=<AddBackward0>)



 57%|█████▋    | 201/354 [18:03<13:59,  5.49s/it][A

	loss_cls: tensor(0.4738, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1584, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6322, device='cuda:0', grad_fn=<AddBackward0>)



 57%|█████▋    | 202/354 [18:09<13:56,  5.50s/it][A

	loss_cls: tensor(0.6779, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2084, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8863, device='cuda:0', grad_fn=<AddBackward0>)



 57%|█████▋    | 203/354 [18:14<13:48,  5.48s/it][A

	loss_cls: tensor(0.6810, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3260, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0070, device='cuda:0', grad_fn=<AddBackward0>)



 58%|█████▊    | 204/354 [18:20<13:39,  5.47s/it][A

	loss_cls: tensor(0.6285, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0937, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7222, device='cuda:0', grad_fn=<AddBackward0>)



 58%|█████▊    | 205/354 [18:25<13:39,  5.50s/it][A

	loss_cls: tensor(0.7492, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2616, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0108, device='cuda:0', grad_fn=<AddBackward0>)



 58%|█████▊    | 206/354 [18:31<13:32,  5.49s/it][A

	loss_cls: tensor(0.6437, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2877, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9314, device='cuda:0', grad_fn=<AddBackward0>)



 58%|█████▊    | 207/354 [18:36<13:28,  5.50s/it][A

	loss_cls: tensor(0.7194, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0362, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7555, device='cuda:0', grad_fn=<AddBackward0>)



 59%|█████▉    | 208/354 [18:42<13:20,  5.48s/it][A

	loss_cls: tensor(0.5797, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3647, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9444, device='cuda:0', grad_fn=<AddBackward0>)



 59%|█████▉    | 209/354 [18:47<13:17,  5.50s/it][A

	loss_cls: tensor(0.6600, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1865, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8465, device='cuda:0', grad_fn=<AddBackward0>)



 59%|█████▉    | 210/354 [18:53<13:08,  5.48s/it][A

	loss_cls: tensor(0.5357, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3869, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9226, device='cuda:0', grad_fn=<AddBackward0>)



 60%|█████▉    | 211/354 [18:58<13:06,  5.50s/it][A

	loss_cls: tensor(0.7234, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2728, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9962, device='cuda:0', grad_fn=<AddBackward0>)



 60%|█████▉    | 212/354 [19:04<12:58,  5.48s/it][A

	loss_cls: tensor(0.5795, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1738, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7534, device='cuda:0', grad_fn=<AddBackward0>)



 60%|██████    | 213/354 [19:09<12:52,  5.48s/it][A

	loss_cls: tensor(0.4062, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2370, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6431, device='cuda:0', grad_fn=<AddBackward0>)



 60%|██████    | 214/354 [19:15<12:50,  5.51s/it][A

	loss_cls: tensor(0.6083, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3046, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9129, device='cuda:0', grad_fn=<AddBackward0>)



 61%|██████    | 215/354 [19:20<12:42,  5.49s/it][A

	loss_cls: tensor(0.5904, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2626, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8530, device='cuda:0', grad_fn=<AddBackward0>)



 61%|██████    | 216/354 [19:26<12:38,  5.50s/it][A

	loss_cls: tensor(0.6294, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1851, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8145, device='cuda:0', grad_fn=<AddBackward0>)



 61%|██████▏   | 217/354 [19:31<12:31,  5.48s/it][A

	loss_cls: tensor(0.5577, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2441, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8019, device='cuda:0', grad_fn=<AddBackward0>)



 62%|██████▏   | 218/354 [19:37<12:27,  5.50s/it][A

	loss_cls: tensor(0.4244, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2130, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6374, device='cuda:0', grad_fn=<AddBackward0>)



 62%|██████▏   | 219/354 [19:42<12:20,  5.48s/it][A

	loss_cls: tensor(0.6758, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1549, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8306, device='cuda:0', grad_fn=<AddBackward0>)



 62%|██████▏   | 220/354 [19:47<12:13,  5.47s/it][A

	loss_cls: tensor(0.6085, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0379, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6464, device='cuda:0', grad_fn=<AddBackward0>)



 62%|██████▏   | 221/354 [19:53<12:10,  5.49s/it][A

	loss_cls: tensor(0.5986, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1475, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7461, device='cuda:0', grad_fn=<AddBackward0>)



 63%|██████▎   | 222/354 [19:58<12:02,  5.47s/it][A

	loss_cls: tensor(0.7397, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2820, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0218, device='cuda:0', grad_fn=<AddBackward0>)



 63%|██████▎   | 223/354 [20:04<11:59,  5.49s/it][A

	loss_cls: tensor(0.6069, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2751, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8820, device='cuda:0', grad_fn=<AddBackward0>)



 63%|██████▎   | 224/354 [20:09<11:52,  5.48s/it][A

	loss_cls: tensor(0.7093, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3034, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0127, device='cuda:0', grad_fn=<AddBackward0>)



 64%|██████▎   | 225/354 [20:15<11:50,  5.51s/it][A

	loss_cls: tensor(0.5830, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2955, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8785, device='cuda:0', grad_fn=<AddBackward0>)



 64%|██████▍   | 226/354 [20:20<11:44,  5.50s/it][A

	loss_cls: tensor(0.9661, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2725, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2386, device='cuda:0', grad_fn=<AddBackward0>)



 64%|██████▍   | 227/354 [20:26<11:38,  5.50s/it][A

	loss_cls: tensor(0.5834, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3865, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9699, device='cuda:0', grad_fn=<AddBackward0>)



 64%|██████▍   | 228/354 [20:32<11:35,  5.52s/it][A

	loss_cls: tensor(0.5319, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1880, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7199, device='cuda:0', grad_fn=<AddBackward0>)



 65%|██████▍   | 229/354 [20:37<11:27,  5.50s/it][A

	loss_cls: tensor(0.6307, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3848, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0156, device='cuda:0', grad_fn=<AddBackward0>)



 65%|██████▍   | 230/354 [20:43<11:24,  5.52s/it][A

	loss_cls: tensor(0.6333, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0842, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7175, device='cuda:0', grad_fn=<AddBackward0>)



 65%|██████▌   | 231/354 [20:48<11:15,  5.49s/it][A

	loss_cls: tensor(0.5561, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5160, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0721, device='cuda:0', grad_fn=<AddBackward0>)



 66%|██████▌   | 232/354 [20:54<11:12,  5.51s/it][A

	loss_cls: tensor(0.7251, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1039, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8290, device='cuda:0', grad_fn=<AddBackward0>)



 66%|██████▌   | 233/354 [20:59<11:04,  5.50s/it][A

	loss_cls: tensor(0.6725, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1730, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8456, device='cuda:0', grad_fn=<AddBackward0>)



 66%|██████▌   | 234/354 [21:04<10:57,  5.48s/it][A

	loss_cls: tensor(0.7951, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1497, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9447, device='cuda:0', grad_fn=<AddBackward0>)



 66%|██████▋   | 235/354 [21:10<10:54,  5.50s/it][A

	loss_cls: tensor(0.5135, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2728, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7862, device='cuda:0', grad_fn=<AddBackward0>)



 67%|██████▋   | 236/354 [21:15<10:47,  5.49s/it][A

	loss_cls: tensor(0.5085, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3889, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8975, device='cuda:0', grad_fn=<AddBackward0>)



 67%|██████▋   | 237/354 [21:21<10:44,  5.51s/it][A

	loss_cls: tensor(0.5251, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1137, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6388, device='cuda:0', grad_fn=<AddBackward0>)



 67%|██████▋   | 238/354 [21:26<10:37,  5.50s/it][A

	loss_cls: tensor(0.6884, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2321, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9205, device='cuda:0', grad_fn=<AddBackward0>)



 68%|██████▊   | 239/354 [21:32<10:33,  5.51s/it][A

	loss_cls: tensor(0.5848, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1955, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7803, device='cuda:0', grad_fn=<AddBackward0>)



 68%|██████▊   | 240/354 [21:37<10:26,  5.49s/it][A

	loss_cls: tensor(0.6308, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2701, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9009, device='cuda:0', grad_fn=<AddBackward0>)



 68%|██████▊   | 241/354 [21:43<10:18,  5.47s/it][A

	loss_cls: tensor(0.6405, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2617, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9022, device='cuda:0', grad_fn=<AddBackward0>)



 68%|██████▊   | 242/354 [21:48<10:13,  5.48s/it][A

	loss_cls: tensor(0.6717, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0753, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7471, device='cuda:0', grad_fn=<AddBackward0>)



 69%|██████▊   | 243/354 [21:54<10:06,  5.46s/it][A

	loss_cls: tensor(0.6306, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3443, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9749, device='cuda:0', grad_fn=<AddBackward0>)



 69%|██████▉   | 244/354 [21:59<10:02,  5.48s/it][A

	loss_cls: tensor(0.9262, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2171, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1433, device='cuda:0', grad_fn=<AddBackward0>)



 69%|██████▉   | 245/354 [22:05<09:56,  5.47s/it][A

	loss_cls: tensor(0.7288, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2881, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0169, device='cuda:0', grad_fn=<AddBackward0>)



 69%|██████▉   | 246/354 [22:10<09:52,  5.49s/it][A

	loss_cls: tensor(0.3482, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0305, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.3787, device='cuda:0', grad_fn=<AddBackward0>)



 70%|██████▉   | 247/354 [22:16<09:45,  5.48s/it][A

	loss_cls: tensor(0.4584, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0893, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5477, device='cuda:0', grad_fn=<AddBackward0>)



 70%|███████   | 248/354 [22:21<09:38,  5.46s/it][A

	loss_cls: tensor(0.4550, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0599, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5149, device='cuda:0', grad_fn=<AddBackward0>)



 70%|███████   | 249/354 [22:27<09:34,  5.47s/it][A

	loss_cls: tensor(0.8994, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1065, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0059, device='cuda:0', grad_fn=<AddBackward0>)



 71%|███████   | 250/354 [22:32<09:24,  5.42s/it][A

	loss_cls: tensor(0.7150, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3040, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0190, device='cuda:0', grad_fn=<AddBackward0>)



 71%|███████   | 251/354 [22:37<09:18,  5.42s/it][A

	loss_cls: tensor(0.6384, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4230, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0613, device='cuda:0', grad_fn=<AddBackward0>)



 71%|███████   | 252/354 [22:43<09:10,  5.40s/it][A

	loss_cls: tensor(0.3346, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0874, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4220, device='cuda:0', grad_fn=<AddBackward0>)



 71%|███████▏  | 253/354 [22:48<09:05,  5.40s/it][A

	loss_cls: tensor(0.8492, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2891, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1383, device='cuda:0', grad_fn=<AddBackward0>)



 72%|███████▏  | 254/354 [22:53<08:58,  5.38s/it][A

	loss_cls: tensor(0.5477, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3435, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8912, device='cuda:0', grad_fn=<AddBackward0>)



 72%|███████▏  | 255/354 [22:59<08:50,  5.36s/it][A

	loss_cls: tensor(1.0083, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2433, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2516, device='cuda:0', grad_fn=<AddBackward0>)



 72%|███████▏  | 256/354 [23:04<08:46,  5.38s/it][A

	loss_cls: tensor(0.3358, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0499, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.3857, device='cuda:0', grad_fn=<AddBackward0>)



 73%|███████▎  | 257/354 [23:10<08:40,  5.36s/it][A

	loss_cls: tensor(0.6535, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1724, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8259, device='cuda:0', grad_fn=<AddBackward0>)



 73%|███████▎  | 258/354 [23:15<08:36,  5.38s/it][A

	loss_cls: tensor(0.5822, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3780, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9602, device='cuda:0', grad_fn=<AddBackward0>)



 73%|███████▎  | 259/354 [23:20<08:30,  5.38s/it][A

	loss_cls: tensor(0.4250, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0783, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5033, device='cuda:0', grad_fn=<AddBackward0>)



 73%|███████▎  | 260/354 [23:26<08:27,  5.40s/it][A

	loss_cls: tensor(0.5684, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2161, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7845, device='cuda:0', grad_fn=<AddBackward0>)



 74%|███████▎  | 261/354 [23:31<08:19,  5.38s/it][A

	loss_cls: tensor(0.6267, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0961, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7228, device='cuda:0', grad_fn=<AddBackward0>)



 74%|███████▍  | 262/354 [23:36<08:13,  5.36s/it][A

	loss_cls: tensor(0.4895, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1003, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5898, device='cuda:0', grad_fn=<AddBackward0>)



 74%|███████▍  | 263/354 [23:42<08:09,  5.37s/it][A

	loss_cls: tensor(0.5588, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1766, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7355, device='cuda:0', grad_fn=<AddBackward0>)



 75%|███████▍  | 264/354 [23:47<08:02,  5.36s/it][A

	loss_cls: tensor(0.5729, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2813, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8542, device='cuda:0', grad_fn=<AddBackward0>)



 75%|███████▍  | 265/354 [23:53<07:59,  5.39s/it][A

	loss_cls: tensor(0.5917, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0877, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6794, device='cuda:0', grad_fn=<AddBackward0>)



 75%|███████▌  | 266/354 [23:58<07:53,  5.38s/it][A

	loss_cls: tensor(0.5888, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4015, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9903, device='cuda:0', grad_fn=<AddBackward0>)



 75%|███████▌  | 267/354 [24:03<07:49,  5.40s/it][A

	loss_cls: tensor(0.6329, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1714, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8043, device='cuda:0', grad_fn=<AddBackward0>)



 76%|███████▌  | 268/354 [24:09<07:42,  5.38s/it][A

	loss_cls: tensor(0.5061, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0263, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5324, device='cuda:0', grad_fn=<AddBackward0>)



 76%|███████▌  | 269/354 [24:14<07:36,  5.37s/it][A

	loss_cls: tensor(0.5555, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1095, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6650, device='cuda:0', grad_fn=<AddBackward0>)



 76%|███████▋  | 270/354 [24:20<07:32,  5.39s/it][A

	loss_cls: tensor(0.8192, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3598, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1790, device='cuda:0', grad_fn=<AddBackward0>)



 77%|███████▋  | 271/354 [24:25<07:26,  5.38s/it][A

	loss_cls: tensor(1.1730, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5336, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.7066, device='cuda:0', grad_fn=<AddBackward0>)



 77%|███████▋  | 272/354 [24:30<07:24,  5.41s/it][A

	loss_cls: tensor(0.6954, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4535, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1489, device='cuda:0', grad_fn=<AddBackward0>)



 77%|███████▋  | 273/354 [24:36<07:17,  5.40s/it][A

	loss_cls: tensor(0.7605, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2676, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0281, device='cuda:0', grad_fn=<AddBackward0>)



 77%|███████▋  | 274/354 [24:41<07:12,  5.41s/it][A

	loss_cls: tensor(0.5719, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3308, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9027, device='cuda:0', grad_fn=<AddBackward0>)



 78%|███████▊  | 275/354 [24:46<07:05,  5.38s/it][A

	loss_cls: tensor(0.5757, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1507, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7265, device='cuda:0', grad_fn=<AddBackward0>)



 78%|███████▊  | 276/354 [24:52<06:58,  5.37s/it][A

	loss_cls: tensor(0.4998, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0889, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5887, device='cuda:0', grad_fn=<AddBackward0>)



 78%|███████▊  | 277/354 [24:57<06:54,  5.39s/it][A

	loss_cls: tensor(0.4724, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1056, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5780, device='cuda:0', grad_fn=<AddBackward0>)



 79%|███████▊  | 278/354 [25:03<06:48,  5.37s/it][A

	loss_cls: tensor(0.5558, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2476, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8035, device='cuda:0', grad_fn=<AddBackward0>)



 79%|███████▉  | 279/354 [25:08<06:45,  5.40s/it][A

	loss_cls: tensor(0.5316, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1758, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7075, device='cuda:0', grad_fn=<AddBackward0>)



 79%|███████▉  | 280/354 [25:13<06:38,  5.39s/it][A

	loss_cls: tensor(0.5929, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1567, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7497, device='cuda:0', grad_fn=<AddBackward0>)



 79%|███████▉  | 281/354 [25:19<06:33,  5.40s/it][A

	loss_cls: tensor(0.5073, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1434, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6507, device='cuda:0', grad_fn=<AddBackward0>)



 80%|███████▉  | 282/354 [25:24<06:27,  5.38s/it][A

	loss_cls: tensor(0.5594, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1270, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6864, device='cuda:0', grad_fn=<AddBackward0>)



 80%|███████▉  | 283/354 [25:30<06:21,  5.37s/it][A

	loss_cls: tensor(0.4909, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0183, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5092, device='cuda:0', grad_fn=<AddBackward0>)



 80%|████████  | 284/354 [25:35<06:16,  5.38s/it][A

	loss_cls: tensor(0.6322, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0799, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7121, device='cuda:0', grad_fn=<AddBackward0>)



 81%|████████  | 285/354 [25:40<06:10,  5.37s/it][A

	loss_cls: tensor(0.8788, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.6070, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.4857, device='cuda:0', grad_fn=<AddBackward0>)



 81%|████████  | 286/354 [25:46<06:05,  5.38s/it][A

	loss_cls: tensor(0.6083, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2649, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8732, device='cuda:0', grad_fn=<AddBackward0>)



 81%|████████  | 287/354 [25:51<05:59,  5.37s/it][A

	loss_cls: tensor(0.5868, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1109, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6977, device='cuda:0', grad_fn=<AddBackward0>)



 81%|████████▏ | 288/354 [25:56<05:55,  5.38s/it][A

	loss_cls: tensor(0.4748, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0132, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4881, device='cuda:0', grad_fn=<AddBackward0>)



 82%|████████▏ | 289/354 [26:02<05:48,  5.36s/it][A

	loss_cls: tensor(0.6897, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1765, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8661, device='cuda:0', grad_fn=<AddBackward0>)



 82%|████████▏ | 290/354 [26:07<05:43,  5.36s/it][A

	loss_cls: tensor(0.7709, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3194, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0903, device='cuda:0', grad_fn=<AddBackward0>)



 82%|████████▏ | 291/354 [26:13<05:38,  5.38s/it][A

	loss_cls: tensor(0.7357, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2869, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0226, device='cuda:0', grad_fn=<AddBackward0>)



 82%|████████▏ | 292/354 [26:18<05:32,  5.37s/it][A

	loss_cls: tensor(0.7797, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0853, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8649, device='cuda:0', grad_fn=<AddBackward0>)



 83%|████████▎ | 293/354 [26:23<05:28,  5.39s/it][A

	loss_cls: tensor(0.7898, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0904, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8802, device='cuda:0', grad_fn=<AddBackward0>)



 83%|████████▎ | 294/354 [26:29<05:22,  5.38s/it][A

	loss_cls: tensor(0.5126, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4201, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9327, device='cuda:0', grad_fn=<AddBackward0>)



 83%|████████▎ | 295/354 [26:34<05:17,  5.38s/it][A

	loss_cls: tensor(0.5213, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3400, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8612, device='cuda:0', grad_fn=<AddBackward0>)



 84%|████████▎ | 296/354 [26:39<05:11,  5.37s/it][A

	loss_cls: tensor(0.6584, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2374, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8959, device='cuda:0', grad_fn=<AddBackward0>)



 84%|████████▍ | 297/354 [26:45<05:05,  5.36s/it][A

	loss_cls: tensor(0.5339, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1395, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6734, device='cuda:0', grad_fn=<AddBackward0>)



 84%|████████▍ | 298/354 [26:50<05:01,  5.38s/it][A

	loss_cls: tensor(0.6582, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2121, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8704, device='cuda:0', grad_fn=<AddBackward0>)



 84%|████████▍ | 299/354 [26:56<04:55,  5.37s/it][A

	loss_cls: tensor(0.6350, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2429, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8780, device='cuda:0', grad_fn=<AddBackward0>)



 85%|████████▍ | 300/354 [27:01<04:51,  5.39s/it][A

	loss_cls: tensor(0.4918, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3607, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8525, device='cuda:0', grad_fn=<AddBackward0>)



 85%|████████▌ | 301/354 [27:06<04:45,  5.38s/it][A

	loss_cls: tensor(0.7949, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1270, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9218, device='cuda:0', grad_fn=<AddBackward0>)



 85%|████████▌ | 302/354 [27:12<04:40,  5.39s/it][A

	loss_cls: tensor(0.5147, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3011, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8158, device='cuda:0', grad_fn=<AddBackward0>)



 86%|████████▌ | 303/354 [27:17<04:34,  5.37s/it][A

	loss_cls: tensor(0.6151, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3715, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9865, device='cuda:0', grad_fn=<AddBackward0>)



 86%|████████▌ | 304/354 [27:22<04:27,  5.36s/it][A

	loss_cls: tensor(0.5336, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4655, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9991, device='cuda:0', grad_fn=<AddBackward0>)



 86%|████████▌ | 305/354 [27:28<04:24,  5.39s/it][A

	loss_cls: tensor(0.7171, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3574, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0746, device='cuda:0', grad_fn=<AddBackward0>)



 86%|████████▋ | 306/354 [27:33<04:18,  5.38s/it][A

	loss_cls: tensor(0.6161, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4291, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0452, device='cuda:0', grad_fn=<AddBackward0>)



 87%|████████▋ | 307/354 [27:39<04:14,  5.41s/it][A

	loss_cls: tensor(0.7914, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3228, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1142, device='cuda:0', grad_fn=<AddBackward0>)



 87%|████████▋ | 308/354 [27:44<04:07,  5.39s/it][A

	loss_cls: tensor(0.4856, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1195, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6051, device='cuda:0', grad_fn=<AddBackward0>)



 87%|████████▋ | 309/354 [27:49<04:02,  5.40s/it][A

	loss_cls: tensor(0.5590, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3611, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9201, device='cuda:0', grad_fn=<AddBackward0>)



 88%|████████▊ | 310/354 [27:55<03:56,  5.38s/it][A

	loss_cls: tensor(0.5764, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0830, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6594, device='cuda:0', grad_fn=<AddBackward0>)



 88%|████████▊ | 311/354 [28:00<03:51,  5.39s/it][A

	loss_cls: tensor(0.6138, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1658, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7796, device='cuda:0', grad_fn=<AddBackward0>)



 88%|████████▊ | 312/354 [28:06<03:46,  5.39s/it][A

	loss_cls: tensor(0.4432, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1029, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5461, device='cuda:0', grad_fn=<AddBackward0>)



 88%|████████▊ | 313/354 [28:11<03:40,  5.39s/it][A

	loss_cls: tensor(0.4219, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0699, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4918, device='cuda:0', grad_fn=<AddBackward0>)



 89%|████████▊ | 314/354 [28:16<03:36,  5.40s/it][A

	loss_cls: tensor(0.7080, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1060, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8140, device='cuda:0', grad_fn=<AddBackward0>)



 89%|████████▉ | 315/354 [28:22<03:30,  5.39s/it][A

	loss_cls: tensor(0.8970, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3678, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2648, device='cuda:0', grad_fn=<AddBackward0>)



 89%|████████▉ | 316/354 [28:27<03:25,  5.41s/it][A

	loss_cls: tensor(0.6370, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2315, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8685, device='cuda:0', grad_fn=<AddBackward0>)



 90%|████████▉ | 317/354 [28:33<03:19,  5.39s/it][A

	loss_cls: tensor(0.8048, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2297, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0345, device='cuda:0', grad_fn=<AddBackward0>)



 90%|████████▉ | 318/354 [28:38<03:14,  5.41s/it][A

	loss_cls: tensor(0.9700, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5074, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.4775, device='cuda:0', grad_fn=<AddBackward0>)



 90%|█████████ | 319/354 [28:43<03:08,  5.38s/it][A

	loss_cls: tensor(0.6956, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3208, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0164, device='cuda:0', grad_fn=<AddBackward0>)



 90%|█████████ | 320/354 [28:49<03:02,  5.38s/it][A

	loss_cls: tensor(0.4051, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0262, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4313, device='cuda:0', grad_fn=<AddBackward0>)



 91%|█████████ | 321/354 [28:54<02:57,  5.39s/it][A

	loss_cls: tensor(0.4447, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0954, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5401, device='cuda:0', grad_fn=<AddBackward0>)



 91%|█████████ | 322/354 [28:59<02:52,  5.38s/it][A

	loss_cls: tensor(0.3580, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1870, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5449, device='cuda:0', grad_fn=<AddBackward0>)



 91%|█████████ | 323/354 [29:05<02:46,  5.38s/it][A

	loss_cls: tensor(0.4488, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1533, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6021, device='cuda:0', grad_fn=<AddBackward0>)



 92%|█████████▏| 324/354 [29:10<02:40,  5.36s/it][A

	loss_cls: tensor(0.5495, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1966, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7461, device='cuda:0', grad_fn=<AddBackward0>)



 92%|█████████▏| 325/354 [29:16<02:36,  5.38s/it][A

	loss_cls: tensor(0.6272, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1475, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7747, device='cuda:0', grad_fn=<AddBackward0>)



 92%|█████████▏| 326/354 [29:21<02:30,  5.38s/it][A

	loss_cls: tensor(0.3168, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2143, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5311, device='cuda:0', grad_fn=<AddBackward0>)



 92%|█████████▏| 327/354 [29:26<02:24,  5.36s/it][A

	loss_cls: tensor(0.5478, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1038, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6516, device='cuda:0', grad_fn=<AddBackward0>)



 93%|█████████▎| 328/354 [29:32<02:19,  5.37s/it][A

	loss_cls: tensor(0.3988, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5343, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9331, device='cuda:0', grad_fn=<AddBackward0>)



 93%|█████████▎| 329/354 [29:37<02:14,  5.36s/it][A

	loss_cls: tensor(0.6241, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4214, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0455, device='cuda:0', grad_fn=<AddBackward0>)



 93%|█████████▎| 330/354 [29:42<02:08,  5.37s/it][A

	loss_cls: tensor(0.5553, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2867, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8421, device='cuda:0', grad_fn=<AddBackward0>)



 94%|█████████▎| 331/354 [29:48<02:03,  5.35s/it][A

	loss_cls: tensor(0.7049, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1282, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8332, device='cuda:0', grad_fn=<AddBackward0>)



 94%|█████████▍| 332/354 [29:53<01:58,  5.37s/it][A

	loss_cls: tensor(0.6331, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1012, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7342, device='cuda:0', grad_fn=<AddBackward0>)



 94%|█████████▍| 333/354 [29:59<01:52,  5.36s/it][A

	loss_cls: tensor(0.4552, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0864, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5416, device='cuda:0', grad_fn=<AddBackward0>)



 94%|█████████▍| 334/354 [30:04<01:47,  5.37s/it][A

	loss_cls: tensor(0.7875, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5777, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.3652, device='cuda:0', grad_fn=<AddBackward0>)



 95%|█████████▍| 335/354 [30:09<01:42,  5.39s/it][A

	loss_cls: tensor(0.4494, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2144, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6637, device='cuda:0', grad_fn=<AddBackward0>)



 95%|█████████▍| 336/354 [30:15<01:36,  5.37s/it][A

	loss_cls: tensor(0.4279, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0927, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5206, device='cuda:0', grad_fn=<AddBackward0>)



 95%|█████████▌| 337/354 [30:20<01:31,  5.39s/it][A

	loss_cls: tensor(0.4945, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1386, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6331, device='cuda:0', grad_fn=<AddBackward0>)



 95%|█████████▌| 338/354 [30:25<01:26,  5.38s/it][A

	loss_cls: tensor(0.7113, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2618, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9732, device='cuda:0', grad_fn=<AddBackward0>)



 96%|█████████▌| 339/354 [30:31<01:21,  5.40s/it][A

	loss_cls: tensor(0.5437, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2684, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8121, device='cuda:0', grad_fn=<AddBackward0>)



 96%|█████████▌| 340/354 [30:36<01:15,  5.39s/it][A

	loss_cls: tensor(0.7681, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0671, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8351, device='cuda:0', grad_fn=<AddBackward0>)



 96%|█████████▋| 341/354 [30:42<01:10,  5.39s/it][A

	loss_cls: tensor(0.3790, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1401, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5190, device='cuda:0', grad_fn=<AddBackward0>)



 97%|█████████▋| 342/354 [30:47<01:04,  5.40s/it][A

	loss_cls: tensor(0.8624, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3342, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1966, device='cuda:0', grad_fn=<AddBackward0>)



 97%|█████████▋| 343/354 [30:52<00:59,  5.39s/it][A

	loss_cls: tensor(1.0230, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3394, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.3625, device='cuda:0', grad_fn=<AddBackward0>)



 97%|█████████▋| 344/354 [30:58<00:54,  5.40s/it][A

	loss_cls: tensor(0.5386, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4562, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9948, device='cuda:0', grad_fn=<AddBackward0>)



 97%|█████████▋| 345/354 [31:03<00:48,  5.39s/it][A

	loss_cls: tensor(0.5351, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2512, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7863, device='cuda:0', grad_fn=<AddBackward0>)



 98%|█████████▊| 346/354 [31:09<00:43,  5.40s/it][A

	loss_cls: tensor(0.6272, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2526, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8798, device='cuda:0', grad_fn=<AddBackward0>)



 98%|█████████▊| 347/354 [31:14<00:37,  5.38s/it][A

	loss_cls: tensor(0.5048, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1601, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6648, device='cuda:0', grad_fn=<AddBackward0>)



 98%|█████████▊| 348/354 [31:19<00:32,  5.38s/it][A

	loss_cls: tensor(0.6167, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5625, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1793, device='cuda:0', grad_fn=<AddBackward0>)



 99%|█████████▊| 349/354 [31:25<00:26,  5.40s/it][A

	loss_cls: tensor(0.7676, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0999, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8675, device='cuda:0', grad_fn=<AddBackward0>)



 99%|█████████▉| 350/354 [31:30<00:21,  5.38s/it][A

	loss_cls: tensor(0.6915, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2610, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9524, device='cuda:0', grad_fn=<AddBackward0>)



 99%|█████████▉| 351/354 [31:36<00:16,  5.40s/it][A

	loss_cls: tensor(0.8937, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1042, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9978, device='cuda:0', grad_fn=<AddBackward0>)



 99%|█████████▉| 352/354 [31:41<00:10,  5.39s/it][A

	loss_cls: tensor(0.6089, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1853, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7942, device='cuda:0', grad_fn=<AddBackward0>)



100%|█████████▉| 353/354 [31:46<00:05,  5.41s/it][A

	loss_cls: tensor(0.5083, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2293, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7376, device='cuda:0', grad_fn=<AddBackward0>)



100%|██████████| 354/354 [31:48<00:00,  5.39s/it][A

  0%|          | 0/177 [00:00<?, ?it/s][A

	loss_cls: tensor(0.4181, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4599, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8780, device='cuda:0', grad_fn=<AddBackward0>)

	Training Loss: 0.8706995413801765

	Training cls acc: 0.6834981167608286

	Training cls prec: 0.5611095393934377

	Training cls rec: 0.58856847601551

	Training cls f1: 0.5140352370147747

--
	Training ner acc: 0.9549943951208619

	Training ner prec: 0.27947614049719205

	Training ner rec: 0.2887445931987775

	Training ner f1: 0.283905126143247

	Current Learning rate:  0.0009428571428571429



  1%|          | 1/177 [00:00<02:06,  1.39it/s][A
  1%|          | 2/177 [00:01<01:56,  1.50it/s][A
  2%|▏         | 3/177 [00:02<01:59,  1.45it/s][A
  2%|▏         | 4/177 [00:02<02:01,  1.43it/s][A
  3%|▎         | 5/177 [00:03<02:01,  1.41it/s][A
  3%|▎         | 6/177 [00:04<01:56,  1.47it/s][A
  4%|▍         | 7/177 [00:04<01:57,  1.45it/s][A
  5%|▍         | 8/177 [00:05<01:58,  1.43it/s][A
  5%|▌         | 9/177 [00:06<01:54,  1.47it/s][A
  6%|▌         | 10/177 [00:06<01:55,  1.45it/s][A
  6%|▌         | 11/177 [00:07<01:55,  1.43it/s][A
  7%|▋         | 12/177 [00:08<01:55,  1.42it/s][A
  7%|▋         | 13/177 [00:08<01:51,  1.46it/s][A
  8%|▊         | 14/177 [00:09<01:53,  1.44it/s][A
  8%|▊         | 15/177 [00:10<01:53,  1.43it/s][A
  9%|▉         | 16/177 [00:11<01:53,  1.42it/s][A
 10%|▉         | 17/177 [00:11<01:49,  1.46it/s][A
 10%|█         | 18/177 [00:12<01:50,  1.44it/s][A
 11%|█         | 19/177 [00:13<01:50,  1.43it/s][A
 11%|█▏        | 20/


	Validation Loss: 0.7908061899371066

	Validation cls acc: 0.701271186440678

	Validation cls prec: 0.6146993543179984

	Validation cls rec: 0.6094969061070756

	Validation cls f1: 0.5631861233556148

--
	Validation ner acc: 0.9539447800403682

	Validation ner prec: 0.41848529487536956

	Validation ner rec: 0.42909604519774014

	Validation ner f1: 0.4235663514228999



  0%|          | 1/354 [00:05<31:52,  5.42s/it][A

	loss_cls: tensor(0.6404, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2963, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9367, device='cuda:0', grad_fn=<AddBackward0>)



  1%|          | 2/354 [00:10<31:23,  5.35s/it][A

	loss_cls: tensor(0.7741, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2423, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0164, device='cuda:0', grad_fn=<AddBackward0>)



  1%|          | 3/354 [00:16<31:26,  5.37s/it][A

	loss_cls: tensor(0.6691, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2456, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9146, device='cuda:0', grad_fn=<AddBackward0>)



  1%|          | 4/354 [00:21<31:11,  5.35s/it][A

	loss_cls: tensor(0.4576, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0468, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5045, device='cuda:0', grad_fn=<AddBackward0>)



  1%|▏         | 5/354 [00:26<31:09,  5.36s/it][A

	loss_cls: tensor(0.7000, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1269, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8268, device='cuda:0', grad_fn=<AddBackward0>)



  2%|▏         | 6/354 [00:32<30:57,  5.34s/it][A

	loss_cls: tensor(0.5125, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2949, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8074, device='cuda:0', grad_fn=<AddBackward0>)



  2%|▏         | 7/354 [00:37<31:01,  5.37s/it][A

	loss_cls: tensor(0.5256, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2775, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8031, device='cuda:0', grad_fn=<AddBackward0>)



  2%|▏         | 8/354 [00:42<30:54,  5.36s/it][A

	loss_cls: tensor(0.8947, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4382, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.3329, device='cuda:0', grad_fn=<AddBackward0>)



  3%|▎         | 9/354 [00:48<30:44,  5.35s/it][A

	loss_cls: tensor(0.3483, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0987, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4470, device='cuda:0', grad_fn=<AddBackward0>)



  3%|▎         | 10/354 [00:53<30:46,  5.37s/it][A

	loss_cls: tensor(0.5846, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1274, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7120, device='cuda:0', grad_fn=<AddBackward0>)



  3%|▎         | 11/354 [00:58<30:33,  5.35s/it][A

	loss_cls: tensor(0.4451, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0891, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5342, device='cuda:0', grad_fn=<AddBackward0>)



  3%|▎         | 12/354 [01:04<30:33,  5.36s/it][A

	loss_cls: tensor(0.4941, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1997, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6937, device='cuda:0', grad_fn=<AddBackward0>)



  4%|▎         | 13/354 [01:09<30:20,  5.34s/it][A

	loss_cls: tensor(0.7690, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4817, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2508, device='cuda:0', grad_fn=<AddBackward0>)



  4%|▍         | 14/354 [01:14<30:19,  5.35s/it][A

	loss_cls: tensor(0.5661, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1973, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7634, device='cuda:0', grad_fn=<AddBackward0>)



  4%|▍         | 15/354 [01:20<30:09,  5.34s/it][A

	loss_cls: tensor(0.3626, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1775, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5401, device='cuda:0', grad_fn=<AddBackward0>)



  5%|▍         | 16/354 [01:25<30:00,  5.33s/it][A

	loss_cls: tensor(0.5602, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1841, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7443, device='cuda:0', grad_fn=<AddBackward0>)



  5%|▍         | 17/354 [01:30<30:01,  5.34s/it][A

	loss_cls: tensor(0.4893, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1530, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6423, device='cuda:0', grad_fn=<AddBackward0>)



  5%|▌         | 18/354 [01:36<29:50,  5.33s/it][A

	loss_cls: tensor(0.5981, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3455, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9436, device='cuda:0', grad_fn=<AddBackward0>)



  5%|▌         | 19/354 [01:41<29:56,  5.36s/it][A

	loss_cls: tensor(0.4973, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0952, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5926, device='cuda:0', grad_fn=<AddBackward0>)



  6%|▌         | 20/354 [01:47<29:46,  5.35s/it][A

	loss_cls: tensor(0.6562, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1623, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8185, device='cuda:0', grad_fn=<AddBackward0>)



  6%|▌         | 21/354 [01:52<29:48,  5.37s/it][A

	loss_cls: tensor(1.1699, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5306, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.7005, device='cuda:0', grad_fn=<AddBackward0>)



  6%|▌         | 22/354 [01:57<29:40,  5.36s/it][A

	loss_cls: tensor(0.7006, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3507, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0513, device='cuda:0', grad_fn=<AddBackward0>)



  6%|▋         | 23/354 [02:03<29:34,  5.36s/it][A

	loss_cls: tensor(0.4723, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2287, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7010, device='cuda:0', grad_fn=<AddBackward0>)



  7%|▋         | 24/354 [02:08<29:34,  5.38s/it][A

	loss_cls: tensor(0.6874, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1256, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8130, device='cuda:0', grad_fn=<AddBackward0>)



  7%|▋         | 25/354 [02:13<29:23,  5.36s/it][A

	loss_cls: tensor(0.6733, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3049, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9783, device='cuda:0', grad_fn=<AddBackward0>)



  7%|▋         | 26/354 [02:19<29:23,  5.38s/it][A

	loss_cls: tensor(0.5076, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2452, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7528, device='cuda:0', grad_fn=<AddBackward0>)



  8%|▊         | 27/354 [02:24<29:14,  5.36s/it][A

	loss_cls: tensor(0.8860, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1431, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0291, device='cuda:0', grad_fn=<AddBackward0>)



  8%|▊         | 28/354 [02:30<29:16,  5.39s/it][A

	loss_cls: tensor(0.6718, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4486, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1205, device='cuda:0', grad_fn=<AddBackward0>)



  8%|▊         | 29/354 [02:35<29:07,  5.38s/it][A

	loss_cls: tensor(1.0689, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2608, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.3296, device='cuda:0', grad_fn=<AddBackward0>)



  8%|▊         | 30/354 [02:40<28:56,  5.36s/it][A

	loss_cls: tensor(0.4935, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3265, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8200, device='cuda:0', grad_fn=<AddBackward0>)



  9%|▉         | 31/354 [02:46<28:56,  5.38s/it][A

	loss_cls: tensor(0.4266, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2048, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6315, device='cuda:0', grad_fn=<AddBackward0>)



  9%|▉         | 32/354 [02:51<28:44,  5.36s/it][A

	loss_cls: tensor(0.5703, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3371, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9074, device='cuda:0', grad_fn=<AddBackward0>)



  9%|▉         | 33/354 [02:56<28:43,  5.37s/it][A

	loss_cls: tensor(0.5713, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3286, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8999, device='cuda:0', grad_fn=<AddBackward0>)



 10%|▉         | 34/354 [03:02<28:36,  5.36s/it][A

	loss_cls: tensor(0.5057, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2322, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7379, device='cuda:0', grad_fn=<AddBackward0>)



 10%|▉         | 35/354 [03:07<28:38,  5.39s/it][A

	loss_cls: tensor(0.4039, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3223, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7263, device='cuda:0', grad_fn=<AddBackward0>)



 10%|█         | 36/354 [03:13<28:29,  5.38s/it][A

	loss_cls: tensor(0.6592, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2560, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9152, device='cuda:0', grad_fn=<AddBackward0>)



 10%|█         | 37/354 [03:18<28:22,  5.37s/it][A

	loss_cls: tensor(0.6293, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2043, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8336, device='cuda:0', grad_fn=<AddBackward0>)



 11%|█         | 38/354 [03:23<28:20,  5.38s/it][A

	loss_cls: tensor(0.7696, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0722, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8418, device='cuda:0', grad_fn=<AddBackward0>)



 11%|█         | 39/354 [03:29<28:10,  5.37s/it][A

	loss_cls: tensor(0.4314, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4463, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8777, device='cuda:0', grad_fn=<AddBackward0>)



 11%|█▏        | 40/354 [03:34<28:08,  5.38s/it][A

	loss_cls: tensor(0.5243, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3165, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8408, device='cuda:0', grad_fn=<AddBackward0>)



 12%|█▏        | 41/354 [03:39<28:03,  5.38s/it][A

	loss_cls: tensor(0.6187, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3024, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9210, device='cuda:0', grad_fn=<AddBackward0>)



 12%|█▏        | 42/354 [03:45<28:02,  5.39s/it][A

	loss_cls: tensor(0.9348, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0587, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9935, device='cuda:0', grad_fn=<AddBackward0>)



 12%|█▏        | 43/354 [03:50<27:54,  5.38s/it][A

	loss_cls: tensor(0.5599, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3145, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8743, device='cuda:0', grad_fn=<AddBackward0>)



 12%|█▏        | 44/354 [03:56<27:45,  5.37s/it][A

	loss_cls: tensor(0.7365, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0824, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8190, device='cuda:0', grad_fn=<AddBackward0>)



 13%|█▎        | 45/354 [04:01<27:44,  5.39s/it][A

	loss_cls: tensor(0.3127, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1283, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4410, device='cuda:0', grad_fn=<AddBackward0>)



 13%|█▎        | 46/354 [04:06<27:34,  5.37s/it][A

	loss_cls: tensor(0.4149, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1312, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5461, device='cuda:0', grad_fn=<AddBackward0>)



 13%|█▎        | 47/354 [04:12<27:35,  5.39s/it][A

	loss_cls: tensor(0.6403, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3778, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0181, device='cuda:0', grad_fn=<AddBackward0>)



 14%|█▎        | 48/354 [04:17<27:26,  5.38s/it][A

	loss_cls: tensor(0.4380, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3579, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7960, device='cuda:0', grad_fn=<AddBackward0>)



 14%|█▍        | 49/354 [04:22<27:25,  5.40s/it][A

	loss_cls: tensor(1.0556, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2541, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.3097, device='cuda:0', grad_fn=<AddBackward0>)



 14%|█▍        | 50/354 [04:28<27:15,  5.38s/it][A

	loss_cls: tensor(0.6799, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3781, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0579, device='cuda:0', grad_fn=<AddBackward0>)



 14%|█▍        | 51/354 [04:33<27:07,  5.37s/it][A

	loss_cls: tensor(0.4463, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1135, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5598, device='cuda:0', grad_fn=<AddBackward0>)



 15%|█▍        | 52/354 [04:39<27:08,  5.39s/it][A

	loss_cls: tensor(0.3473, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1683, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5156, device='cuda:0', grad_fn=<AddBackward0>)



 15%|█▍        | 53/354 [04:44<26:57,  5.37s/it][A

	loss_cls: tensor(1.0523, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2559, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.3082, device='cuda:0', grad_fn=<AddBackward0>)



 15%|█▌        | 54/354 [04:50<27:09,  5.43s/it][A

	loss_cls: tensor(1.0367, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5430, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.5797, device='cuda:0', grad_fn=<AddBackward0>)



 16%|█▌        | 55/354 [04:55<27:36,  5.54s/it][A

	loss_cls: tensor(0.7595, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1622, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9218, device='cuda:0', grad_fn=<AddBackward0>)



 16%|█▌        | 56/354 [05:01<28:01,  5.64s/it][A

	loss_cls: tensor(0.5012, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3261, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8272, device='cuda:0', grad_fn=<AddBackward0>)



 16%|█▌        | 57/354 [05:07<27:37,  5.58s/it][A

	loss_cls: tensor(0.7219, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2666, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9884, device='cuda:0', grad_fn=<AddBackward0>)



 16%|█▋        | 58/354 [05:12<27:18,  5.54s/it][A

	loss_cls: tensor(0.7487, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0834, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8321, device='cuda:0', grad_fn=<AddBackward0>)



 17%|█▋        | 59/354 [05:18<27:04,  5.51s/it][A

	loss_cls: tensor(0.6578, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0684, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7262, device='cuda:0', grad_fn=<AddBackward0>)



 17%|█▋        | 60/354 [05:23<26:47,  5.47s/it][A

	loss_cls: tensor(0.8017, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2858, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0875, device='cuda:0', grad_fn=<AddBackward0>)



 17%|█▋        | 61/354 [05:28<26:41,  5.47s/it][A

	loss_cls: tensor(0.6017, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1562, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7579, device='cuda:0', grad_fn=<AddBackward0>)



 18%|█▊        | 62/354 [05:34<26:27,  5.44s/it][A

	loss_cls: tensor(0.6386, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0887, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7273, device='cuda:0', grad_fn=<AddBackward0>)



 18%|█▊        | 63/354 [05:39<26:22,  5.44s/it][A

	loss_cls: tensor(0.5983, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2075, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8058, device='cuda:0', grad_fn=<AddBackward0>)



 18%|█▊        | 64/354 [05:45<26:10,  5.42s/it][A

	loss_cls: tensor(0.4488, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2458, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6946, device='cuda:0', grad_fn=<AddBackward0>)



 18%|█▊        | 65/354 [05:50<25:58,  5.39s/it][A

	loss_cls: tensor(0.4701, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1865, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6567, device='cuda:0', grad_fn=<AddBackward0>)



 19%|█▊        | 66/354 [05:55<26:01,  5.42s/it][A

	loss_cls: tensor(0.8918, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1191, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0109, device='cuda:0', grad_fn=<AddBackward0>)



 19%|█▉        | 67/354 [06:01<25:56,  5.42s/it][A

	loss_cls: tensor(0.4925, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3943, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8868, device='cuda:0', grad_fn=<AddBackward0>)



 19%|█▉        | 68/354 [06:06<25:57,  5.44s/it][A

	loss_cls: tensor(0.8675, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3411, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2086, device='cuda:0', grad_fn=<AddBackward0>)



 19%|█▉        | 69/354 [06:12<25:47,  5.43s/it][A

	loss_cls: tensor(0.4287, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2848, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7134, device='cuda:0', grad_fn=<AddBackward0>)



 20%|█▉        | 70/354 [06:17<25:45,  5.44s/it][A

	loss_cls: tensor(0.4905, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3025, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7930, device='cuda:0', grad_fn=<AddBackward0>)



 20%|██        | 71/354 [06:23<25:33,  5.42s/it][A

	loss_cls: tensor(0.6601, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3267, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9868, device='cuda:0', grad_fn=<AddBackward0>)



 20%|██        | 72/354 [06:28<25:26,  5.41s/it][A

	loss_cls: tensor(0.4602, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1149, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5751, device='cuda:0', grad_fn=<AddBackward0>)



 21%|██        | 73/354 [06:33<25:24,  5.42s/it][A

	loss_cls: tensor(0.6342, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4139, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0481, device='cuda:0', grad_fn=<AddBackward0>)



 21%|██        | 74/354 [06:39<25:15,  5.41s/it][A

	loss_cls: tensor(0.7235, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3004, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0239, device='cuda:0', grad_fn=<AddBackward0>)



 21%|██        | 75/354 [06:44<25:13,  5.43s/it][A

	loss_cls: tensor(0.4054, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2246, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6300, device='cuda:0', grad_fn=<AddBackward0>)



 21%|██▏       | 76/354 [06:50<25:04,  5.41s/it][A

	loss_cls: tensor(0.4376, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4399, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8774, device='cuda:0', grad_fn=<AddBackward0>)



 22%|██▏       | 77/354 [06:55<25:03,  5.43s/it][A

	loss_cls: tensor(0.6919, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2472, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9391, device='cuda:0', grad_fn=<AddBackward0>)



 22%|██▏       | 78/354 [07:00<24:52,  5.41s/it][A

	loss_cls: tensor(0.5592, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2034, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7626, device='cuda:0', grad_fn=<AddBackward0>)



 22%|██▏       | 79/354 [07:06<24:45,  5.40s/it][A

	loss_cls: tensor(1.0133, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1597, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1731, device='cuda:0', grad_fn=<AddBackward0>)



 23%|██▎       | 80/354 [07:11<24:44,  5.42s/it][A

	loss_cls: tensor(0.5166, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1850, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7017, device='cuda:0', grad_fn=<AddBackward0>)



 23%|██▎       | 81/354 [07:17<24:36,  5.41s/it][A

	loss_cls: tensor(0.6374, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1256, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7630, device='cuda:0', grad_fn=<AddBackward0>)



 23%|██▎       | 82/354 [07:22<24:33,  5.42s/it][A

	loss_cls: tensor(0.6739, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0411, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7150, device='cuda:0', grad_fn=<AddBackward0>)



 23%|██▎       | 83/354 [07:27<24:24,  5.40s/it][A

	loss_cls: tensor(0.6719, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0545, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7264, device='cuda:0', grad_fn=<AddBackward0>)



 24%|██▎       | 84/354 [07:33<24:22,  5.42s/it][A

	loss_cls: tensor(0.5627, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1627, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7254, device='cuda:0', grad_fn=<AddBackward0>)



 24%|██▍       | 85/354 [07:38<24:14,  5.41s/it][A

	loss_cls: tensor(0.4246, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0543, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4789, device='cuda:0', grad_fn=<AddBackward0>)



 24%|██▍       | 86/354 [07:44<24:06,  5.40s/it][A

	loss_cls: tensor(0.4517, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0831, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5348, device='cuda:0', grad_fn=<AddBackward0>)



 25%|██▍       | 87/354 [07:49<24:08,  5.42s/it][A

	loss_cls: tensor(0.3936, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1516, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5453, device='cuda:0', grad_fn=<AddBackward0>)



 25%|██▍       | 88/354 [07:55<23:59,  5.41s/it][A

	loss_cls: tensor(0.2647, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0062, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.2709, device='cuda:0', grad_fn=<AddBackward0>)



 25%|██▌       | 89/354 [08:00<23:59,  5.43s/it][A

	loss_cls: tensor(1.0125, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3707, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.3832, device='cuda:0', grad_fn=<AddBackward0>)



 25%|██▌       | 90/354 [08:05<23:49,  5.41s/it][A

	loss_cls: tensor(0.1300, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1615, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.2915, device='cuda:0', grad_fn=<AddBackward0>)



 26%|██▌       | 91/354 [08:11<23:47,  5.43s/it][A

	loss_cls: tensor(1.6088, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4997, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(2.1086, device='cuda:0', grad_fn=<AddBackward0>)



 26%|██▌       | 92/354 [08:16<23:37,  5.41s/it][A

	loss_cls: tensor(0.6093, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5041, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1134, device='cuda:0', grad_fn=<AddBackward0>)



 26%|██▋       | 93/354 [08:22<23:28,  5.40s/it][A

	loss_cls: tensor(1.0168, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1273, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1441, device='cuda:0', grad_fn=<AddBackward0>)



 27%|██▋       | 94/354 [08:27<23:27,  5.41s/it][A

	loss_cls: tensor(0.1309, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0145, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.1454, device='cuda:0', grad_fn=<AddBackward0>)



 27%|██▋       | 95/354 [08:32<23:18,  5.40s/it][A

	loss_cls: tensor(1.1091, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2758, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.3849, device='cuda:0', grad_fn=<AddBackward0>)



 27%|██▋       | 96/354 [08:38<23:17,  5.41s/it][A

	loss_cls: tensor(0.8957, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1593, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0550, device='cuda:0', grad_fn=<AddBackward0>)



 27%|██▋       | 97/354 [08:43<23:08,  5.40s/it][A

	loss_cls: tensor(0.5010, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1602, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6612, device='cuda:0', grad_fn=<AddBackward0>)



 28%|██▊       | 98/354 [08:49<23:09,  5.43s/it][A

	loss_cls: tensor(1.0387, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4544, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.4931, device='cuda:0', grad_fn=<AddBackward0>)



 28%|██▊       | 99/354 [08:54<22:58,  5.41s/it][A

	loss_cls: tensor(0.5303, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2127, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7430, device='cuda:0', grad_fn=<AddBackward0>)



 28%|██▊       | 100/354 [08:59<22:53,  5.41s/it][A

	loss_cls: tensor(0.7042, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2821, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9863, device='cuda:0', grad_fn=<AddBackward0>)



 29%|██▊       | 101/354 [09:05<22:48,  5.41s/it][A

	loss_cls: tensor(0.5523, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2566, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8090, device='cuda:0', grad_fn=<AddBackward0>)



 29%|██▉       | 102/354 [09:10<22:41,  5.40s/it][A

	loss_cls: tensor(0.5305, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1896, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7201, device='cuda:0', grad_fn=<AddBackward0>)



 29%|██▉       | 103/354 [09:16<22:40,  5.42s/it][A

	loss_cls: tensor(0.6220, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1167, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7387, device='cuda:0', grad_fn=<AddBackward0>)



 29%|██▉       | 104/354 [09:21<22:32,  5.41s/it][A

	loss_cls: tensor(0.6872, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2599, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9472, device='cuda:0', grad_fn=<AddBackward0>)



 30%|██▉       | 105/354 [09:27<22:29,  5.42s/it][A

	loss_cls: tensor(0.9014, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0662, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9676, device='cuda:0', grad_fn=<AddBackward0>)



 30%|██▉       | 106/354 [09:32<22:19,  5.40s/it][A

	loss_cls: tensor(0.8275, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1845, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0120, device='cuda:0', grad_fn=<AddBackward0>)



 30%|███       | 107/354 [09:37<22:17,  5.41s/it][A

	loss_cls: tensor(0.5533, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0760, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6293, device='cuda:0', grad_fn=<AddBackward0>)



 31%|███       | 108/354 [09:43<22:09,  5.40s/it][A

	loss_cls: tensor(0.6094, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0665, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6759, device='cuda:0', grad_fn=<AddBackward0>)



 31%|███       | 109/354 [09:48<22:00,  5.39s/it][A

	loss_cls: tensor(0.6221, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5292, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1513, device='cuda:0', grad_fn=<AddBackward0>)



 31%|███       | 110/354 [09:54<21:58,  5.41s/it][A

	loss_cls: tensor(0.7406, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5373, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2779, device='cuda:0', grad_fn=<AddBackward0>)



 31%|███▏      | 111/354 [09:59<21:51,  5.40s/it][A

	loss_cls: tensor(0.6108, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1984, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8092, device='cuda:0', grad_fn=<AddBackward0>)



 32%|███▏      | 112/354 [10:04<21:45,  5.39s/it][A

	loss_cls: tensor(0.5764, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.7070, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2835, device='cuda:0', grad_fn=<AddBackward0>)



 32%|███▏      | 113/354 [10:10<21:38,  5.39s/it][A

	loss_cls: tensor(0.6594, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0594, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7188, device='cuda:0', grad_fn=<AddBackward0>)



 32%|███▏      | 114/354 [10:15<21:39,  5.41s/it][A

	loss_cls: tensor(0.6652, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3122, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9774, device='cuda:0', grad_fn=<AddBackward0>)



 32%|███▏      | 115/354 [10:21<21:33,  5.41s/it][A

	loss_cls: tensor(0.6716, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1252, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7968, device='cuda:0', grad_fn=<AddBackward0>)



 33%|███▎      | 116/354 [10:26<21:22,  5.39s/it][A

	loss_cls: tensor(0.5731, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2792, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8523, device='cuda:0', grad_fn=<AddBackward0>)



 33%|███▎      | 117/354 [10:31<21:20,  5.40s/it][A

	loss_cls: tensor(0.7660, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3062, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0722, device='cuda:0', grad_fn=<AddBackward0>)



 33%|███▎      | 118/354 [10:37<21:12,  5.39s/it][A

	loss_cls: tensor(0.9222, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3225, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2447, device='cuda:0', grad_fn=<AddBackward0>)



 34%|███▎      | 119/354 [10:42<21:09,  5.40s/it][A

	loss_cls: tensor(0.5028, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2070, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7099, device='cuda:0', grad_fn=<AddBackward0>)



 34%|███▍      | 120/354 [10:47<20:59,  5.38s/it][A

	loss_cls: tensor(0.5504, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1705, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7209, device='cuda:0', grad_fn=<AddBackward0>)



 34%|███▍      | 121/354 [10:53<20:58,  5.40s/it][A

	loss_cls: tensor(0.4346, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3101, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7446, device='cuda:0', grad_fn=<AddBackward0>)



 34%|███▍      | 122/354 [10:58<20:50,  5.39s/it][A

	loss_cls: tensor(0.6029, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1647, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7677, device='cuda:0', grad_fn=<AddBackward0>)



 35%|███▍      | 123/354 [11:04<20:40,  5.37s/it][A

	loss_cls: tensor(0.4642, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1978, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6619, device='cuda:0', grad_fn=<AddBackward0>)



 35%|███▌      | 124/354 [11:09<20:40,  5.39s/it][A

	loss_cls: tensor(0.5353, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2434, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7787, device='cuda:0', grad_fn=<AddBackward0>)



 35%|███▌      | 125/354 [11:14<20:32,  5.38s/it][A

	loss_cls: tensor(0.6247, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3700, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9947, device='cuda:0', grad_fn=<AddBackward0>)



 36%|███▌      | 126/354 [11:20<20:30,  5.40s/it][A

	loss_cls: tensor(0.5915, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1447, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7362, device='cuda:0', grad_fn=<AddBackward0>)



 36%|███▌      | 127/354 [11:25<20:24,  5.40s/it][A

	loss_cls: tensor(0.7081, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5897, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2978, device='cuda:0', grad_fn=<AddBackward0>)



 36%|███▌      | 128/354 [11:31<20:24,  5.42s/it][A

	loss_cls: tensor(0.8644, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2563, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1207, device='cuda:0', grad_fn=<AddBackward0>)



 36%|███▋      | 129/354 [11:36<20:17,  5.41s/it][A

	loss_cls: tensor(0.6945, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0869, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7814, device='cuda:0', grad_fn=<AddBackward0>)



 37%|███▋      | 130/354 [11:41<20:09,  5.40s/it][A

	loss_cls: tensor(0.5952, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4110, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0062, device='cuda:0', grad_fn=<AddBackward0>)



 37%|███▋      | 131/354 [11:47<20:05,  5.40s/it][A

	loss_cls: tensor(0.6332, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2174, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8506, device='cuda:0', grad_fn=<AddBackward0>)



 37%|███▋      | 132/354 [11:52<19:54,  5.38s/it][A

	loss_cls: tensor(0.4969, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3851, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8820, device='cuda:0', grad_fn=<AddBackward0>)



 38%|███▊      | 133/354 [11:58<19:53,  5.40s/it][A

	loss_cls: tensor(0.5622, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3559, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9182, device='cuda:0', grad_fn=<AddBackward0>)



 38%|███▊      | 134/354 [12:03<19:44,  5.39s/it][A

	loss_cls: tensor(0.7817, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2906, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0723, device='cuda:0', grad_fn=<AddBackward0>)



 38%|███▊      | 135/354 [12:08<19:42,  5.40s/it][A

	loss_cls: tensor(0.4691, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2089, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6779, device='cuda:0', grad_fn=<AddBackward0>)



 38%|███▊      | 136/354 [12:14<19:34,  5.39s/it][A

	loss_cls: tensor(0.4909, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1658, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6566, device='cuda:0', grad_fn=<AddBackward0>)



 39%|███▊      | 137/354 [12:19<19:26,  5.38s/it][A

	loss_cls: tensor(0.8394, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2864, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1258, device='cuda:0', grad_fn=<AddBackward0>)



 39%|███▉      | 138/354 [12:25<19:22,  5.38s/it][A

	loss_cls: tensor(0.4239, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3435, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7674, device='cuda:0', grad_fn=<AddBackward0>)



 39%|███▉      | 139/354 [12:30<19:15,  5.37s/it][A

	loss_cls: tensor(0.6129, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1880, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8009, device='cuda:0', grad_fn=<AddBackward0>)



 40%|███▉      | 140/354 [12:35<19:13,  5.39s/it][A

	loss_cls: tensor(0.5839, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2662, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8501, device='cuda:0', grad_fn=<AddBackward0>)



 40%|███▉      | 141/354 [12:41<19:08,  5.39s/it][A

	loss_cls: tensor(0.5003, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2467, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7470, device='cuda:0', grad_fn=<AddBackward0>)



 40%|████      | 142/354 [12:46<19:05,  5.40s/it][A

	loss_cls: tensor(0.5793, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2516, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8309, device='cuda:0', grad_fn=<AddBackward0>)



 40%|████      | 143/354 [12:51<18:56,  5.38s/it][A

	loss_cls: tensor(0.5041, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2039, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7080, device='cuda:0', grad_fn=<AddBackward0>)



 41%|████      | 144/354 [12:57<18:50,  5.38s/it][A

	loss_cls: tensor(0.6246, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2135, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8381, device='cuda:0', grad_fn=<AddBackward0>)



 41%|████      | 145/354 [13:02<18:48,  5.40s/it][A

	loss_cls: tensor(0.4866, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3389, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8256, device='cuda:0', grad_fn=<AddBackward0>)



 41%|████      | 146/354 [13:08<18:39,  5.38s/it][A

	loss_cls: tensor(0.9035, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1632, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0667, device='cuda:0', grad_fn=<AddBackward0>)



 42%|████▏     | 147/354 [13:13<18:38,  5.40s/it][A

	loss_cls: tensor(0.5752, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3752, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9503, device='cuda:0', grad_fn=<AddBackward0>)



 42%|████▏     | 148/354 [13:18<18:30,  5.39s/it][A

	loss_cls: tensor(0.6683, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0378, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7062, device='cuda:0', grad_fn=<AddBackward0>)



 42%|████▏     | 149/354 [13:24<18:27,  5.40s/it][A

	loss_cls: tensor(0.5612, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2098, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7710, device='cuda:0', grad_fn=<AddBackward0>)



 42%|████▏     | 150/354 [13:29<18:17,  5.38s/it][A

	loss_cls: tensor(0.4255, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2003, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6258, device='cuda:0', grad_fn=<AddBackward0>)



 43%|████▎     | 151/354 [13:35<18:09,  5.37s/it][A

	loss_cls: tensor(0.5510, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3799, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9308, device='cuda:0', grad_fn=<AddBackward0>)



 43%|████▎     | 152/354 [13:40<18:08,  5.39s/it][A

	loss_cls: tensor(0.6692, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4486, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1179, device='cuda:0', grad_fn=<AddBackward0>)



 43%|████▎     | 153/354 [13:45<18:00,  5.38s/it][A

	loss_cls: tensor(0.7431, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1957, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9388, device='cuda:0', grad_fn=<AddBackward0>)



 44%|████▎     | 154/354 [13:51<17:59,  5.40s/it][A

	loss_cls: tensor(0.6885, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4174, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1059, device='cuda:0', grad_fn=<AddBackward0>)



 44%|████▍     | 155/354 [13:56<17:50,  5.38s/it][A

	loss_cls: tensor(0.4046, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0819, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4865, device='cuda:0', grad_fn=<AddBackward0>)



 44%|████▍     | 156/354 [14:02<17:47,  5.39s/it][A

	loss_cls: tensor(0.6096, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2248, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8344, device='cuda:0', grad_fn=<AddBackward0>)



 44%|████▍     | 157/354 [14:07<17:39,  5.38s/it][A

	loss_cls: tensor(0.9278, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2251, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1529, device='cuda:0', grad_fn=<AddBackward0>)



 45%|████▍     | 158/354 [14:12<17:33,  5.37s/it][A

	loss_cls: tensor(0.6217, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0766, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6983, device='cuda:0', grad_fn=<AddBackward0>)



 45%|████▍     | 159/354 [14:18<17:29,  5.38s/it][A

	loss_cls: tensor(0.8759, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5745, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.4503, device='cuda:0', grad_fn=<AddBackward0>)



 45%|████▌     | 160/354 [14:23<17:20,  5.37s/it][A

	loss_cls: tensor(0.5983, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3178, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9161, device='cuda:0', grad_fn=<AddBackward0>)



 45%|████▌     | 161/354 [14:28<17:20,  5.39s/it][A

	loss_cls: tensor(0.6434, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1441, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7875, device='cuda:0', grad_fn=<AddBackward0>)



 46%|████▌     | 162/354 [14:34<17:12,  5.38s/it][A

	loss_cls: tensor(0.4045, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0534, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4578, device='cuda:0', grad_fn=<AddBackward0>)



 46%|████▌     | 163/354 [14:39<17:10,  5.39s/it][A

	loss_cls: tensor(0.6963, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4307, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1270, device='cuda:0', grad_fn=<AddBackward0>)



 46%|████▋     | 164/354 [14:45<17:00,  5.37s/it][A

	loss_cls: tensor(0.8089, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0999, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9088, device='cuda:0', grad_fn=<AddBackward0>)



 47%|████▋     | 165/354 [14:50<16:53,  5.36s/it][A

	loss_cls: tensor(0.3746, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2343, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6089, device='cuda:0', grad_fn=<AddBackward0>)



 47%|████▋     | 166/354 [14:55<16:50,  5.38s/it][A

	loss_cls: tensor(0.4531, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1690, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6221, device='cuda:0', grad_fn=<AddBackward0>)



 47%|████▋     | 167/354 [15:01<16:43,  5.36s/it][A

	loss_cls: tensor(1.1656, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3905, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.5561, device='cuda:0', grad_fn=<AddBackward0>)



 47%|████▋     | 168/354 [15:06<16:41,  5.38s/it][A

	loss_cls: tensor(0.8876, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3284, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2160, device='cuda:0', grad_fn=<AddBackward0>)



 48%|████▊     | 169/354 [15:11<16:34,  5.38s/it][A

	loss_cls: tensor(0.5464, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2562, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8026, device='cuda:0', grad_fn=<AddBackward0>)



 48%|████▊     | 170/354 [15:17<16:31,  5.39s/it][A

	loss_cls: tensor(0.6963, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4884, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1848, device='cuda:0', grad_fn=<AddBackward0>)



 48%|████▊     | 171/354 [15:22<16:22,  5.37s/it][A

	loss_cls: tensor(0.5263, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2807, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8069, device='cuda:0', grad_fn=<AddBackward0>)



 49%|████▊     | 172/354 [15:27<16:13,  5.35s/it][A

	loss_cls: tensor(0.4554, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1424, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5978, device='cuda:0', grad_fn=<AddBackward0>)



 49%|████▉     | 173/354 [15:33<16:12,  5.37s/it][A

	loss_cls: tensor(0.5888, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0584, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6471, device='cuda:0', grad_fn=<AddBackward0>)



 49%|████▉     | 174/354 [15:38<16:06,  5.37s/it][A

	loss_cls: tensor(0.7479, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2091, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9570, device='cuda:0', grad_fn=<AddBackward0>)



 49%|████▉     | 175/354 [15:44<16:04,  5.39s/it][A

	loss_cls: tensor(0.6949, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2760, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9709, device='cuda:0', grad_fn=<AddBackward0>)



 50%|████▉     | 176/354 [15:49<15:57,  5.38s/it][A

	loss_cls: tensor(0.7859, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0470, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8328, device='cuda:0', grad_fn=<AddBackward0>)



 50%|█████     | 177/354 [15:54<15:55,  5.40s/it][A

	loss_cls: tensor(0.8031, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2820, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0851, device='cuda:0', grad_fn=<AddBackward0>)



 50%|█████     | 178/354 [16:00<15:47,  5.38s/it][A

	loss_cls: tensor(0.7284, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0762, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8046, device='cuda:0', grad_fn=<AddBackward0>)



 51%|█████     | 179/354 [16:05<15:39,  5.37s/it][A

	loss_cls: tensor(0.6102, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0373, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6474, device='cuda:0', grad_fn=<AddBackward0>)



 51%|█████     | 180/354 [16:11<15:37,  5.39s/it][A

	loss_cls: tensor(0.8775, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0929, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9704, device='cuda:0', grad_fn=<AddBackward0>)



 51%|█████     | 181/354 [16:16<15:31,  5.39s/it][A

	loss_cls: tensor(0.4917, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1092, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6009, device='cuda:0', grad_fn=<AddBackward0>)



 51%|█████▏    | 182/354 [16:21<15:30,  5.41s/it][A

	loss_cls: tensor(0.6411, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4327, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0738, device='cuda:0', grad_fn=<AddBackward0>)



 52%|█████▏    | 183/354 [16:27<15:23,  5.40s/it][A

	loss_cls: tensor(0.8558, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3706, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2264, device='cuda:0', grad_fn=<AddBackward0>)



 52%|█████▏    | 184/354 [16:32<15:20,  5.41s/it][A

	loss_cls: tensor(0.3796, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0189, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.3985, device='cuda:0', grad_fn=<AddBackward0>)



 52%|█████▏    | 185/354 [16:38<15:11,  5.39s/it][A

	loss_cls: tensor(0.4934, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2323, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7257, device='cuda:0', grad_fn=<AddBackward0>)



 53%|█████▎    | 186/354 [16:43<15:02,  5.37s/it][A

	loss_cls: tensor(0.4578, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1871, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6449, device='cuda:0', grad_fn=<AddBackward0>)



 53%|█████▎    | 187/354 [16:48<15:00,  5.39s/it][A

	loss_cls: tensor(0.5349, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4266, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9615, device='cuda:0', grad_fn=<AddBackward0>)



 53%|█████▎    | 188/354 [16:54<14:54,  5.39s/it][A

	loss_cls: tensor(0.5654, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1884, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7538, device='cuda:0', grad_fn=<AddBackward0>)



 53%|█████▎    | 189/354 [16:59<14:53,  5.41s/it][A

	loss_cls: tensor(0.5294, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1758, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7052, device='cuda:0', grad_fn=<AddBackward0>)



 54%|█████▎    | 190/354 [17:05<14:45,  5.40s/it][A

	loss_cls: tensor(0.6637, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0691, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7328, device='cuda:0', grad_fn=<AddBackward0>)



 54%|█████▍    | 191/354 [17:10<14:43,  5.42s/it][A

	loss_cls: tensor(0.5548, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1206, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6754, device='cuda:0', grad_fn=<AddBackward0>)



 54%|█████▍    | 192/354 [17:15<14:34,  5.40s/it][A

	loss_cls: tensor(0.6606, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5022, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1628, device='cuda:0', grad_fn=<AddBackward0>)



 55%|█████▍    | 193/354 [17:21<14:26,  5.38s/it][A

	loss_cls: tensor(0.4692, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1284, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5976, device='cuda:0', grad_fn=<AddBackward0>)



 55%|█████▍    | 194/354 [17:26<14:24,  5.40s/it][A

	loss_cls: tensor(0.6651, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2288, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8939, device='cuda:0', grad_fn=<AddBackward0>)



 55%|█████▌    | 195/354 [17:32<14:16,  5.39s/it][A

	loss_cls: tensor(0.3682, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0646, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4327, device='cuda:0', grad_fn=<AddBackward0>)



 55%|█████▌    | 196/354 [17:37<14:12,  5.40s/it][A

	loss_cls: tensor(0.8883, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2489, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1373, device='cuda:0', grad_fn=<AddBackward0>)



 56%|█████▌    | 197/354 [17:42<14:05,  5.38s/it][A

	loss_cls: tensor(0.9551, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1724, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1275, device='cuda:0', grad_fn=<AddBackward0>)



 56%|█████▌    | 198/354 [17:48<14:02,  5.40s/it][A

	loss_cls: tensor(0.6716, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1897, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8613, device='cuda:0', grad_fn=<AddBackward0>)



 56%|█████▌    | 199/354 [17:53<13:55,  5.39s/it][A

	loss_cls: tensor(0.7058, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2380, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9438, device='cuda:0', grad_fn=<AddBackward0>)



 56%|█████▋    | 200/354 [17:59<13:51,  5.40s/it][A

	loss_cls: tensor(0.7466, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2266, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9732, device='cuda:0', grad_fn=<AddBackward0>)



 57%|█████▋    | 201/354 [18:04<13:46,  5.40s/it][A

	loss_cls: tensor(0.7396, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2035, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9431, device='cuda:0', grad_fn=<AddBackward0>)



 57%|█████▋    | 202/354 [18:09<13:40,  5.40s/it][A

	loss_cls: tensor(0.5259, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2415, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7674, device='cuda:0', grad_fn=<AddBackward0>)



 57%|█████▋    | 203/354 [18:15<13:39,  5.43s/it][A

	loss_cls: tensor(0.6304, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4774, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1077, device='cuda:0', grad_fn=<AddBackward0>)



 58%|█████▊    | 204/354 [18:20<13:33,  5.42s/it][A

	loss_cls: tensor(0.6639, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2809, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9448, device='cuda:0', grad_fn=<AddBackward0>)



 58%|█████▊    | 205/354 [18:26<13:33,  5.46s/it][A

	loss_cls: tensor(0.6744, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1288, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8033, device='cuda:0', grad_fn=<AddBackward0>)



 58%|█████▊    | 206/354 [18:31<13:26,  5.45s/it][A

	loss_cls: tensor(0.6013, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1745, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7757, device='cuda:0', grad_fn=<AddBackward0>)



 58%|█████▊    | 207/354 [18:37<13:19,  5.44s/it][A

	loss_cls: tensor(0.6814, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3181, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9995, device='cuda:0', grad_fn=<AddBackward0>)



 59%|█████▉    | 208/354 [18:42<13:15,  5.45s/it][A

	loss_cls: tensor(0.6741, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0494, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7235, device='cuda:0', grad_fn=<AddBackward0>)



 59%|█████▉    | 209/354 [18:48<13:10,  5.45s/it][A

	loss_cls: tensor(0.5637, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5544, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1181, device='cuda:0', grad_fn=<AddBackward0>)



 59%|█████▉    | 210/354 [18:53<13:07,  5.47s/it][A

	loss_cls: tensor(0.6178, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2045, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8222, device='cuda:0', grad_fn=<AddBackward0>)



 60%|█████▉    | 211/354 [18:58<13:00,  5.46s/it][A

	loss_cls: tensor(0.7187, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0961, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8149, device='cuda:0', grad_fn=<AddBackward0>)



 60%|█████▉    | 212/354 [19:04<12:58,  5.48s/it][A

	loss_cls: tensor(0.5638, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3336, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8974, device='cuda:0', grad_fn=<AddBackward0>)



 60%|██████    | 213/354 [19:09<12:51,  5.47s/it][A

	loss_cls: tensor(0.5164, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2479, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7643, device='cuda:0', grad_fn=<AddBackward0>)



 60%|██████    | 214/354 [19:15<12:49,  5.49s/it][A

	loss_cls: tensor(0.4841, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1413, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6254, device='cuda:0', grad_fn=<AddBackward0>)



 61%|██████    | 215/354 [19:20<12:42,  5.48s/it][A

	loss_cls: tensor(0.7160, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0980, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8140, device='cuda:0', grad_fn=<AddBackward0>)



 61%|██████    | 216/354 [19:26<12:36,  5.48s/it][A

	loss_cls: tensor(0.8429, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4718, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.3147, device='cuda:0', grad_fn=<AddBackward0>)



 61%|██████▏   | 217/354 [19:31<12:32,  5.50s/it][A

	loss_cls: tensor(0.5228, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2007, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7236, device='cuda:0', grad_fn=<AddBackward0>)



 62%|██████▏   | 218/354 [19:37<12:25,  5.48s/it][A

	loss_cls: tensor(0.7113, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4364, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1476, device='cuda:0', grad_fn=<AddBackward0>)



 62%|██████▏   | 219/354 [19:42<12:21,  5.49s/it][A

	loss_cls: tensor(0.6183, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2860, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9043, device='cuda:0', grad_fn=<AddBackward0>)



 62%|██████▏   | 220/354 [19:48<12:14,  5.48s/it][A

	loss_cls: tensor(0.5798, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0472, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6270, device='cuda:0', grad_fn=<AddBackward0>)



 62%|██████▏   | 221/354 [19:53<12:11,  5.50s/it][A

	loss_cls: tensor(0.4627, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2853, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7480, device='cuda:0', grad_fn=<AddBackward0>)



 63%|██████▎   | 222/354 [19:59<12:04,  5.49s/it][A

	loss_cls: tensor(0.7337, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1847, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9184, device='cuda:0', grad_fn=<AddBackward0>)



 63%|██████▎   | 223/354 [20:04<11:58,  5.49s/it][A

	loss_cls: tensor(0.6555, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2451, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9006, device='cuda:0', grad_fn=<AddBackward0>)



 63%|██████▎   | 224/354 [20:10<11:55,  5.50s/it][A

	loss_cls: tensor(0.5148, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1290, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6439, device='cuda:0', grad_fn=<AddBackward0>)



 64%|██████▎   | 225/354 [20:15<11:47,  5.49s/it][A

	loss_cls: tensor(0.4320, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5553, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9873, device='cuda:0', grad_fn=<AddBackward0>)



 64%|██████▍   | 226/354 [20:21<11:42,  5.49s/it][A

	loss_cls: tensor(0.4388, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2931, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7319, device='cuda:0', grad_fn=<AddBackward0>)



 64%|██████▍   | 227/354 [20:26<11:36,  5.48s/it][A

	loss_cls: tensor(0.5477, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2882, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8359, device='cuda:0', grad_fn=<AddBackward0>)



 64%|██████▍   | 228/354 [20:32<11:33,  5.50s/it][A

	loss_cls: tensor(0.5224, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1260, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6485, device='cuda:0', grad_fn=<AddBackward0>)



 65%|██████▍   | 229/354 [20:37<11:26,  5.49s/it][A

	loss_cls: tensor(0.5389, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0603, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5992, device='cuda:0', grad_fn=<AddBackward0>)



 65%|██████▍   | 230/354 [20:43<11:19,  5.48s/it][A

	loss_cls: tensor(0.4836, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1139, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5975, device='cuda:0', grad_fn=<AddBackward0>)



 65%|██████▌   | 231/354 [20:48<11:16,  5.50s/it][A

	loss_cls: tensor(0.5667, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1373, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7039, device='cuda:0', grad_fn=<AddBackward0>)



 66%|██████▌   | 232/354 [20:54<11:09,  5.49s/it][A

	loss_cls: tensor(0.4676, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1236, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5912, device='cuda:0', grad_fn=<AddBackward0>)



 66%|██████▌   | 233/354 [20:59<11:04,  5.49s/it][A

	loss_cls: tensor(0.3899, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1595, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5495, device='cuda:0', grad_fn=<AddBackward0>)



 66%|██████▌   | 234/354 [21:05<10:57,  5.48s/it][A

	loss_cls: tensor(0.8719, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3204, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1924, device='cuda:0', grad_fn=<AddBackward0>)



 66%|██████▋   | 235/354 [21:10<10:54,  5.50s/it][A

	loss_cls: tensor(0.4576, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1254, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5830, device='cuda:0', grad_fn=<AddBackward0>)



 67%|██████▋   | 236/354 [21:16<10:46,  5.48s/it][A

	loss_cls: tensor(0.7325, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2146, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9471, device='cuda:0', grad_fn=<AddBackward0>)



 67%|██████▋   | 237/354 [21:21<10:39,  5.46s/it][A

	loss_cls: tensor(0.5622, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2633, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8256, device='cuda:0', grad_fn=<AddBackward0>)



 67%|██████▋   | 238/354 [21:26<10:18,  5.33s/it][A

	loss_cls: tensor(0.9831, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.7234, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.7065, device='cuda:0', grad_fn=<AddBackward0>)



 68%|██████▊   | 239/354 [21:31<10:08,  5.29s/it][A

	loss_cls: tensor(0.2816, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1014, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.3830, device='cuda:0', grad_fn=<AddBackward0>)



 68%|██████▊   | 240/354 [21:37<10:12,  5.37s/it][A

	loss_cls: tensor(0.6038, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2771, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8808, device='cuda:0', grad_fn=<AddBackward0>)



 68%|██████▊   | 241/354 [21:42<10:10,  5.40s/it][A

	loss_cls: tensor(0.5412, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1644, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7056, device='cuda:0', grad_fn=<AddBackward0>)



 68%|██████▊   | 242/354 [21:47<09:53,  5.30s/it][A

	loss_cls: tensor(0.5983, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1406, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7390, device='cuda:0', grad_fn=<AddBackward0>)



 69%|██████▊   | 243/354 [21:52<09:38,  5.21s/it][A

	loss_cls: tensor(0.5313, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2137, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7450, device='cuda:0', grad_fn=<AddBackward0>)



 69%|██████▉   | 244/354 [21:57<09:26,  5.15s/it][A

	loss_cls: tensor(0.5235, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0771, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6005, device='cuda:0', grad_fn=<AddBackward0>)



 69%|██████▉   | 245/354 [22:03<09:18,  5.12s/it][A

	loss_cls: tensor(0.6323, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3161, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9484, device='cuda:0', grad_fn=<AddBackward0>)



 69%|██████▉   | 246/354 [22:08<09:09,  5.09s/it][A

	loss_cls: tensor(0.9862, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1290, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1152, device='cuda:0', grad_fn=<AddBackward0>)



 70%|██████▉   | 247/354 [22:13<09:03,  5.08s/it][A

	loss_cls: tensor(0.6665, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3580, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0244, device='cuda:0', grad_fn=<AddBackward0>)



 70%|███████   | 248/354 [22:18<08:56,  5.06s/it][A

	loss_cls: tensor(0.4089, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0803, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4893, device='cuda:0', grad_fn=<AddBackward0>)



 70%|███████   | 249/354 [22:23<08:52,  5.07s/it][A

	loss_cls: tensor(0.3915, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2202, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6117, device='cuda:0', grad_fn=<AddBackward0>)



 71%|███████   | 250/354 [22:28<08:45,  5.05s/it][A

	loss_cls: tensor(0.4969, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1352, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6321, device='cuda:0', grad_fn=<AddBackward0>)



 71%|███████   | 251/354 [22:33<08:39,  5.04s/it][A

	loss_cls: tensor(0.8020, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4898, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2918, device='cuda:0', grad_fn=<AddBackward0>)



 71%|███████   | 252/354 [22:38<08:35,  5.05s/it][A

	loss_cls: tensor(0.7515, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3964, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1479, device='cuda:0', grad_fn=<AddBackward0>)



 71%|███████▏  | 253/354 [22:43<08:28,  5.04s/it][A

	loss_cls: tensor(0.4346, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3098, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7444, device='cuda:0', grad_fn=<AddBackward0>)



 72%|███████▏  | 254/354 [22:48<08:24,  5.05s/it][A

	loss_cls: tensor(0.6100, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3807, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9907, device='cuda:0', grad_fn=<AddBackward0>)



 72%|███████▏  | 255/354 [22:53<08:20,  5.05s/it][A

	loss_cls: tensor(0.5734, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3290, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9024, device='cuda:0', grad_fn=<AddBackward0>)



 72%|███████▏  | 256/354 [22:58<08:24,  5.15s/it][A

	loss_cls: tensor(0.7573, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0673, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8245, device='cuda:0', grad_fn=<AddBackward0>)



 73%|███████▎  | 257/354 [23:04<08:28,  5.24s/it][A

	loss_cls: tensor(0.3588, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2044, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5632, device='cuda:0', grad_fn=<AddBackward0>)



 73%|███████▎  | 258/354 [23:09<08:28,  5.30s/it][A

	loss_cls: tensor(0.7761, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1205, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8966, device='cuda:0', grad_fn=<AddBackward0>)



 73%|███████▎  | 259/354 [23:15<08:30,  5.37s/it][A

	loss_cls: tensor(0.4290, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.7455, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1745, device='cuda:0', grad_fn=<AddBackward0>)



 73%|███████▎  | 260/354 [23:20<08:26,  5.39s/it][A

	loss_cls: tensor(0.4561, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2062, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6623, device='cuda:0', grad_fn=<AddBackward0>)



 74%|███████▎  | 261/354 [23:26<08:25,  5.43s/it][A

	loss_cls: tensor(0.4543, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2041, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6584, device='cuda:0', grad_fn=<AddBackward0>)



 74%|███████▍  | 262/354 [23:31<08:20,  5.44s/it][A

	loss_cls: tensor(0.4675, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1227, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5902, device='cuda:0', grad_fn=<AddBackward0>)



 74%|███████▍  | 263/354 [23:37<08:17,  5.47s/it][A

	loss_cls: tensor(0.6859, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1888, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8747, device='cuda:0', grad_fn=<AddBackward0>)



 75%|███████▍  | 264/354 [23:42<08:11,  5.46s/it][A

	loss_cls: tensor(0.8020, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1119, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9139, device='cuda:0', grad_fn=<AddBackward0>)



 75%|███████▍  | 265/354 [23:48<08:05,  5.46s/it][A

	loss_cls: tensor(0.6922, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1905, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8827, device='cuda:0', grad_fn=<AddBackward0>)



 75%|███████▌  | 266/354 [23:53<08:02,  5.49s/it][A

	loss_cls: tensor(0.4959, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3631, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8590, device='cuda:0', grad_fn=<AddBackward0>)



 75%|███████▌  | 267/354 [23:59<07:56,  5.48s/it][A

	loss_cls: tensor(0.4901, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0896, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5796, device='cuda:0', grad_fn=<AddBackward0>)



 76%|███████▌  | 268/354 [24:04<07:53,  5.50s/it][A

	loss_cls: tensor(0.5776, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2378, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8154, device='cuda:0', grad_fn=<AddBackward0>)



 76%|███████▌  | 269/354 [24:10<07:46,  5.48s/it][A

	loss_cls: tensor(0.3027, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4045, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7072, device='cuda:0', grad_fn=<AddBackward0>)



 76%|███████▋  | 270/354 [24:15<07:41,  5.50s/it][A

	loss_cls: tensor(0.4391, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1946, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6337, device='cuda:0', grad_fn=<AddBackward0>)



 77%|███████▋  | 271/354 [24:21<07:34,  5.47s/it][A

	loss_cls: tensor(0.6763, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3543, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0306, device='cuda:0', grad_fn=<AddBackward0>)



 77%|███████▋  | 272/354 [24:26<07:27,  5.46s/it][A

	loss_cls: tensor(0.5508, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1978, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7486, device='cuda:0', grad_fn=<AddBackward0>)



 77%|███████▋  | 273/354 [24:31<07:22,  5.47s/it][A

	loss_cls: tensor(0.5664, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1761, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7425, device='cuda:0', grad_fn=<AddBackward0>)



 77%|███████▋  | 274/354 [24:37<07:17,  5.47s/it][A

	loss_cls: tensor(0.3728, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0709, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4437, device='cuda:0', grad_fn=<AddBackward0>)



 78%|███████▊  | 275/354 [24:42<07:13,  5.48s/it][A

	loss_cls: tensor(0.4600, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3660, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8260, device='cuda:0', grad_fn=<AddBackward0>)



 78%|███████▊  | 276/354 [24:48<07:06,  5.47s/it][A

	loss_cls: tensor(0.7421, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2163, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9585, device='cuda:0', grad_fn=<AddBackward0>)



 78%|███████▊  | 277/354 [24:53<07:02,  5.49s/it][A

	loss_cls: tensor(0.4106, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4105, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8210, device='cuda:0', grad_fn=<AddBackward0>)



 79%|███████▊  | 278/354 [24:59<06:55,  5.47s/it][A

	loss_cls: tensor(0.4990, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2023, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7012, device='cuda:0', grad_fn=<AddBackward0>)



 79%|███████▉  | 279/354 [25:04<06:49,  5.46s/it][A

	loss_cls: tensor(0.3745, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0856, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4602, device='cuda:0', grad_fn=<AddBackward0>)



 79%|███████▉  | 280/354 [25:10<06:45,  5.48s/it][A

	loss_cls: tensor(1.0950, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1659, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2610, device='cuda:0', grad_fn=<AddBackward0>)



 79%|███████▉  | 281/354 [25:15<06:39,  5.47s/it][A

	loss_cls: tensor(0.6510, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1996, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8506, device='cuda:0', grad_fn=<AddBackward0>)



 80%|███████▉  | 282/354 [25:21<06:35,  5.49s/it][A

	loss_cls: tensor(0.5316, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2130, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7447, device='cuda:0', grad_fn=<AddBackward0>)



 80%|███████▉  | 283/354 [25:26<06:28,  5.47s/it][A

	loss_cls: tensor(0.6404, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1017, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7421, device='cuda:0', grad_fn=<AddBackward0>)



 80%|████████  | 284/354 [25:32<06:23,  5.48s/it][A

	loss_cls: tensor(0.5496, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1613, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7108, device='cuda:0', grad_fn=<AddBackward0>)



 81%|████████  | 285/354 [25:37<06:17,  5.47s/it][A

	loss_cls: tensor(0.2491, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2796, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5287, device='cuda:0', grad_fn=<AddBackward0>)



 81%|████████  | 286/354 [25:43<06:10,  5.45s/it][A

	loss_cls: tensor(0.6982, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0272, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7254, device='cuda:0', grad_fn=<AddBackward0>)



 81%|████████  | 287/354 [25:48<06:06,  5.47s/it][A

	loss_cls: tensor(0.6312, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3894, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0206, device='cuda:0', grad_fn=<AddBackward0>)



 81%|████████▏ | 288/354 [25:54<06:00,  5.46s/it][A

	loss_cls: tensor(0.4521, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5129, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9651, device='cuda:0', grad_fn=<AddBackward0>)



 82%|████████▏ | 289/354 [25:59<05:55,  5.47s/it][A

	loss_cls: tensor(0.4426, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0284, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4710, device='cuda:0', grad_fn=<AddBackward0>)



 82%|████████▏ | 290/354 [26:04<05:49,  5.46s/it][A

	loss_cls: tensor(0.8530, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2637, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1167, device='cuda:0', grad_fn=<AddBackward0>)



 82%|████████▏ | 291/354 [26:10<05:44,  5.47s/it][A

	loss_cls: tensor(1.4021, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1518, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.5539, device='cuda:0', grad_fn=<AddBackward0>)



 82%|████████▏ | 292/354 [26:15<05:38,  5.45s/it][A

	loss_cls: tensor(0.6780, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2338, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9118, device='cuda:0', grad_fn=<AddBackward0>)



 83%|████████▎ | 293/354 [26:21<05:32,  5.44s/it][A

	loss_cls: tensor(0.9912, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2804, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2716, device='cuda:0', grad_fn=<AddBackward0>)



 83%|████████▎ | 294/354 [26:26<05:28,  5.47s/it][A

	loss_cls: tensor(0.7427, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1983, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9410, device='cuda:0', grad_fn=<AddBackward0>)



 83%|████████▎ | 295/354 [26:32<05:22,  5.47s/it][A

	loss_cls: tensor(0.7981, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3183, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1164, device='cuda:0', grad_fn=<AddBackward0>)



 84%|████████▎ | 296/354 [26:37<05:17,  5.48s/it][A

	loss_cls: tensor(0.3712, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1619, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5331, device='cuda:0', grad_fn=<AddBackward0>)



 84%|████████▍ | 297/354 [26:43<05:11,  5.47s/it][A

	loss_cls: tensor(0.4601, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3853, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8454, device='cuda:0', grad_fn=<AddBackward0>)



 84%|████████▍ | 298/354 [26:48<05:06,  5.48s/it][A

	loss_cls: tensor(0.5881, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4065, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9946, device='cuda:0', grad_fn=<AddBackward0>)



 84%|████████▍ | 299/354 [26:54<05:00,  5.46s/it][A

	loss_cls: tensor(0.6159, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4201, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0360, device='cuda:0', grad_fn=<AddBackward0>)



 85%|████████▍ | 300/354 [26:59<04:55,  5.47s/it][A

	loss_cls: tensor(0.8399, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2060, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0459, device='cuda:0', grad_fn=<AddBackward0>)



 85%|████████▌ | 301/354 [27:05<04:49,  5.47s/it][A

	loss_cls: tensor(0.6612, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2751, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9363, device='cuda:0', grad_fn=<AddBackward0>)



 85%|████████▌ | 302/354 [27:10<04:43,  5.45s/it][A

	loss_cls: tensor(0.7576, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0803, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8379, device='cuda:0', grad_fn=<AddBackward0>)



 86%|████████▌ | 303/354 [27:16<04:38,  5.47s/it][A

	loss_cls: tensor(0.7341, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1455, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8796, device='cuda:0', grad_fn=<AddBackward0>)



 86%|████████▌ | 304/354 [27:21<04:33,  5.46s/it][A

	loss_cls: tensor(0.6191, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1894, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8086, device='cuda:0', grad_fn=<AddBackward0>)



 86%|████████▌ | 305/354 [27:27<04:29,  5.49s/it][A

	loss_cls: tensor(0.5923, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1987, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7910, device='cuda:0', grad_fn=<AddBackward0>)



 86%|████████▋ | 306/354 [27:32<04:22,  5.47s/it][A

	loss_cls: tensor(0.3951, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1323, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5274, device='cuda:0', grad_fn=<AddBackward0>)



 87%|████████▋ | 307/354 [27:37<04:17,  5.48s/it][A

	loss_cls: tensor(0.6378, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1233, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7611, device='cuda:0', grad_fn=<AddBackward0>)



 87%|████████▋ | 308/354 [27:43<04:11,  5.47s/it][A

	loss_cls: tensor(0.4318, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1753, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6071, device='cuda:0', grad_fn=<AddBackward0>)



 87%|████████▋ | 309/354 [27:48<04:05,  5.46s/it][A

	loss_cls: tensor(0.5736, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2906, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8642, device='cuda:0', grad_fn=<AddBackward0>)



 88%|████████▊ | 310/354 [27:54<04:01,  5.48s/it][A

	loss_cls: tensor(0.5568, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0735, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6303, device='cuda:0', grad_fn=<AddBackward0>)



 88%|████████▊ | 311/354 [27:59<03:54,  5.46s/it][A

	loss_cls: tensor(0.2790, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0446, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.3236, device='cuda:0', grad_fn=<AddBackward0>)



 88%|████████▊ | 312/354 [28:05<03:49,  5.47s/it][A

	loss_cls: tensor(0.7695, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1829, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9524, device='cuda:0', grad_fn=<AddBackward0>)



 88%|████████▊ | 313/354 [28:10<03:43,  5.45s/it][A

	loss_cls: tensor(1.0205, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2475, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2680, device='cuda:0', grad_fn=<AddBackward0>)



 89%|████████▊ | 314/354 [28:16<03:38,  5.47s/it][A

	loss_cls: tensor(0.7496, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2636, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0132, device='cuda:0', grad_fn=<AddBackward0>)



 89%|████████▉ | 315/354 [28:21<03:32,  5.46s/it][A

	loss_cls: tensor(0.8539, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2941, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1480, device='cuda:0', grad_fn=<AddBackward0>)



 89%|████████▉ | 316/354 [28:27<03:26,  5.44s/it][A

	loss_cls: tensor(0.7031, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2605, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9636, device='cuda:0', grad_fn=<AddBackward0>)



 90%|████████▉ | 317/354 [28:32<03:22,  5.47s/it][A

	loss_cls: tensor(0.3359, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0744, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4103, device='cuda:0', grad_fn=<AddBackward0>)



 90%|████████▉ | 318/354 [28:38<03:16,  5.45s/it][A

	loss_cls: tensor(0.7163, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2384, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9547, device='cuda:0', grad_fn=<AddBackward0>)



 90%|█████████ | 319/354 [28:43<03:11,  5.47s/it][A

	loss_cls: tensor(0.6997, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4697, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1694, device='cuda:0', grad_fn=<AddBackward0>)



 90%|█████████ | 320/354 [28:48<03:05,  5.45s/it][A

	loss_cls: tensor(0.6462, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0465, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6927, device='cuda:0', grad_fn=<AddBackward0>)



 91%|█████████ | 321/354 [28:54<03:00,  5.47s/it][A

	loss_cls: tensor(0.6161, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2665, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8826, device='cuda:0', grad_fn=<AddBackward0>)



 91%|█████████ | 322/354 [28:59<02:54,  5.45s/it][A

	loss_cls: tensor(0.4668, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4067, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8734, device='cuda:0', grad_fn=<AddBackward0>)



 91%|█████████ | 323/354 [29:05<02:49,  5.46s/it][A

	loss_cls: tensor(0.4037, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2645, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6683, device='cuda:0', grad_fn=<AddBackward0>)



 92%|█████████▏| 324/354 [29:10<02:44,  5.47s/it][A

	loss_cls: tensor(0.6070, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2507, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8576, device='cuda:0', grad_fn=<AddBackward0>)



 92%|█████████▏| 325/354 [29:16<02:38,  5.46s/it][A

	loss_cls: tensor(0.7746, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1212, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8958, device='cuda:0', grad_fn=<AddBackward0>)



 92%|█████████▏| 326/354 [29:21<02:33,  5.48s/it][A

	loss_cls: tensor(0.6485, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2226, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8710, device='cuda:0', grad_fn=<AddBackward0>)



 92%|█████████▏| 327/354 [29:27<02:27,  5.47s/it][A

	loss_cls: tensor(0.7532, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2498, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0031, device='cuda:0', grad_fn=<AddBackward0>)



 93%|█████████▎| 328/354 [29:32<02:22,  5.48s/it][A

	loss_cls: tensor(0.5374, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3184, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8558, device='cuda:0', grad_fn=<AddBackward0>)



 93%|█████████▎| 329/354 [29:38<02:16,  5.48s/it][A

	loss_cls: tensor(0.4638, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2279, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6917, device='cuda:0', grad_fn=<AddBackward0>)



 93%|█████████▎| 330/354 [29:43<02:10,  5.45s/it][A

	loss_cls: tensor(0.5742, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2108, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7850, device='cuda:0', grad_fn=<AddBackward0>)



 94%|█████████▎| 331/354 [29:49<02:05,  5.47s/it][A

	loss_cls: tensor(0.5917, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2941, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8857, device='cuda:0', grad_fn=<AddBackward0>)



 94%|█████████▍| 332/354 [29:54<01:59,  5.45s/it][A

	loss_cls: tensor(0.5041, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3918, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8959, device='cuda:0', grad_fn=<AddBackward0>)



 94%|█████████▍| 333/354 [29:59<01:54,  5.46s/it][A

	loss_cls: tensor(0.4537, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1714, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6250, device='cuda:0', grad_fn=<AddBackward0>)



 94%|█████████▍| 334/354 [30:05<01:49,  5.45s/it][A

	loss_cls: tensor(0.5986, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0380, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6365, device='cuda:0', grad_fn=<AddBackward0>)



 95%|█████████▍| 335/354 [30:10<01:43,  5.47s/it][A

	loss_cls: tensor(0.6469, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2360, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8828, device='cuda:0', grad_fn=<AddBackward0>)



 95%|█████████▍| 336/354 [30:16<01:38,  5.47s/it][A

	loss_cls: tensor(0.4322, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2950, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7272, device='cuda:0', grad_fn=<AddBackward0>)



 95%|█████████▌| 337/354 [30:21<01:32,  5.46s/it][A

	loss_cls: tensor(0.5858, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0812, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6670, device='cuda:0', grad_fn=<AddBackward0>)



 95%|█████████▌| 338/354 [30:27<01:27,  5.47s/it][A

	loss_cls: tensor(0.6552, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1843, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8395, device='cuda:0', grad_fn=<AddBackward0>)



 96%|█████████▌| 339/354 [30:32<01:21,  5.46s/it][A

	loss_cls: tensor(0.4233, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1373, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5605, device='cuda:0', grad_fn=<AddBackward0>)



 96%|█████████▌| 340/354 [30:38<01:16,  5.47s/it][A

	loss_cls: tensor(0.3514, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0902, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4416, device='cuda:0', grad_fn=<AddBackward0>)



 96%|█████████▋| 341/354 [30:43<01:10,  5.46s/it][A

	loss_cls: tensor(0.7302, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1229, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8531, device='cuda:0', grad_fn=<AddBackward0>)



 97%|█████████▋| 342/354 [30:49<01:05,  5.47s/it][A

	loss_cls: tensor(0.5649, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2591, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8239, device='cuda:0', grad_fn=<AddBackward0>)



 97%|█████████▋| 343/354 [30:54<01:00,  5.46s/it][A

	loss_cls: tensor(0.8671, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2573, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1244, device='cuda:0', grad_fn=<AddBackward0>)



 97%|█████████▋| 344/354 [31:00<00:54,  5.45s/it][A

	loss_cls: tensor(0.4249, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2282, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6531, device='cuda:0', grad_fn=<AddBackward0>)



 97%|█████████▋| 345/354 [31:05<00:49,  5.47s/it][A

	loss_cls: tensor(1.0699, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.6136, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.6835, device='cuda:0', grad_fn=<AddBackward0>)



 98%|█████████▊| 346/354 [31:10<00:43,  5.45s/it][A

	loss_cls: tensor(0.7115, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5232, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2347, device='cuda:0', grad_fn=<AddBackward0>)



 98%|█████████▊| 347/354 [31:16<00:38,  5.48s/it][A

	loss_cls: tensor(1.0394, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2790, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.3184, device='cuda:0', grad_fn=<AddBackward0>)



 98%|█████████▊| 348/354 [31:21<00:32,  5.46s/it][A

	loss_cls: tensor(0.7177, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2667, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9844, device='cuda:0', grad_fn=<AddBackward0>)



 99%|█████████▊| 349/354 [31:27<00:27,  5.49s/it][A

	loss_cls: tensor(0.6013, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2353, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8366, device='cuda:0', grad_fn=<AddBackward0>)



 99%|█████████▉| 350/354 [31:32<00:21,  5.46s/it][A

	loss_cls: tensor(0.5784, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2442, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8227, device='cuda:0', grad_fn=<AddBackward0>)



 99%|█████████▉| 351/354 [31:38<00:16,  5.45s/it][A

	loss_cls: tensor(0.5323, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3059, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8382, device='cuda:0', grad_fn=<AddBackward0>)



 99%|█████████▉| 352/354 [31:43<00:10,  5.46s/it][A

	loss_cls: tensor(0.6494, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2727, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9221, device='cuda:0', grad_fn=<AddBackward0>)



100%|█████████▉| 353/354 [31:49<00:05,  5.44s/it][A

	loss_cls: tensor(0.7039, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3988, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1026, device='cuda:0', grad_fn=<AddBackward0>)



100%|██████████| 354/354 [31:51<00:00,  5.40s/it][A

  0%|          | 0/177 [00:00<?, ?it/s][A

	loss_cls: tensor(0.6979, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2346, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9325, device='cuda:0', grad_fn=<AddBackward0>)

	Training Loss: 0.8601649711276852

	Training cls acc: 0.6994467984934087

	Training cls prec: 0.567159368801318

	Training cls rec: 0.5964514475637357

	Training cls f1: 0.5256915536648303

--
	Training ner acc: 0.9550240123981322

	Training ner prec: 0.2837606716154671

	Training ner rec: 0.2922158128186439

	Training ner f1: 0.2875947208915619

	Current Learning rate:  0.0009142857142857143



  1%|          | 1/177 [00:00<01:53,  1.55it/s][A
  1%|          | 2/177 [00:01<02:01,  1.44it/s][A
  2%|▏         | 3/177 [00:02<02:03,  1.41it/s][A
  2%|▏         | 4/177 [00:02<02:04,  1.39it/s][A
  3%|▎         | 5/177 [00:03<01:59,  1.44it/s][A
  3%|▎         | 6/177 [00:04<02:00,  1.42it/s][A
  4%|▍         | 7/177 [00:04<02:01,  1.40it/s][A
  5%|▍         | 8/177 [00:05<01:57,  1.44it/s][A
  5%|▌         | 9/177 [00:06<01:58,  1.42it/s][A
  6%|▌         | 10/177 [00:07<01:58,  1.41it/s][A
  6%|▌         | 11/177 [00:07<01:59,  1.39it/s][A
  7%|▋         | 12/177 [00:08<01:54,  1.44it/s][A
  7%|▋         | 13/177 [00:09<01:55,  1.42it/s][A
  8%|▊         | 14/177 [00:09<01:56,  1.40it/s][A
  8%|▊         | 15/177 [00:10<01:56,  1.39it/s][A
  9%|▉         | 16/177 [00:11<01:52,  1.43it/s][A
 10%|▉         | 17/177 [00:11<01:53,  1.42it/s][A
 10%|█         | 18/177 [00:12<01:53,  1.40it/s][A
 11%|█         | 19/177 [00:13<01:49,  1.44it/s][A
 11%|█▏        | 20/


	Validation Loss: 1.0228819385760248

	Validation cls acc: 0.39030131826742004

	Validation cls prec: 0.5367803336023674

	Validation cls rec: 0.46834140435835353

	Validation cls f1: 0.3465296787330686

--
	Validation ner acc: 0.9538499994177064

	Validation ner prec: 0.3950823643386446

	Validation ner rec: 0.4056497175141244

	Validation ner f1: 0.40015375449419277



  0%|          | 1/354 [00:05<32:22,  5.50s/it][A

	loss_cls: tensor(0.4128, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3674, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7802, device='cuda:0', grad_fn=<AddBackward0>)



  1%|          | 2/354 [00:10<31:59,  5.45s/it][A

	loss_cls: tensor(0.6989, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1579, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8568, device='cuda:0', grad_fn=<AddBackward0>)



  1%|          | 3/354 [00:16<32:03,  5.48s/it][A

	loss_cls: tensor(0.6758, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2465, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9223, device='cuda:0', grad_fn=<AddBackward0>)



  1%|          | 4/354 [00:21<31:55,  5.47s/it][A

	loss_cls: tensor(0.6492, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2255, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8747, device='cuda:0', grad_fn=<AddBackward0>)



  1%|▏         | 5/354 [00:27<31:42,  5.45s/it][A

	loss_cls: tensor(0.6314, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2773, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9088, device='cuda:0', grad_fn=<AddBackward0>)



  2%|▏         | 6/354 [00:32<31:45,  5.48s/it][A

	loss_cls: tensor(1.1118, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0684, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1802, device='cuda:0', grad_fn=<AddBackward0>)



  2%|▏         | 7/354 [00:38<31:32,  5.45s/it][A

	loss_cls: tensor(1.1321, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1208, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2529, device='cuda:0', grad_fn=<AddBackward0>)



  2%|▏         | 8/354 [00:43<31:33,  5.47s/it][A

	loss_cls: tensor(0.6525, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3349, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9874, device='cuda:0', grad_fn=<AddBackward0>)



  3%|▎         | 9/354 [00:49<31:24,  5.46s/it][A

	loss_cls: tensor(0.5663, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2865, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8528, device='cuda:0', grad_fn=<AddBackward0>)



  3%|▎         | 10/354 [00:54<31:24,  5.48s/it][A

	loss_cls: tensor(0.5757, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3580, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9338, device='cuda:0', grad_fn=<AddBackward0>)



  3%|▎         | 11/354 [01:00<31:14,  5.46s/it][A

	loss_cls: tensor(0.5546, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3620, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9166, device='cuda:0', grad_fn=<AddBackward0>)



  3%|▎         | 12/354 [01:05<31:05,  5.45s/it][A

	loss_cls: tensor(0.6908, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2746, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9654, device='cuda:0', grad_fn=<AddBackward0>)



  4%|▎         | 13/354 [01:11<31:06,  5.47s/it][A

	loss_cls: tensor(0.5594, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2608, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8202, device='cuda:0', grad_fn=<AddBackward0>)



  4%|▍         | 14/354 [01:16<30:54,  5.46s/it][A

	loss_cls: tensor(0.7910, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3780, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1689, device='cuda:0', grad_fn=<AddBackward0>)



  4%|▍         | 15/354 [01:21<30:53,  5.47s/it][A

	loss_cls: tensor(0.5968, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3292, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9261, device='cuda:0', grad_fn=<AddBackward0>)



  5%|▍         | 16/354 [01:27<30:46,  5.46s/it][A

	loss_cls: tensor(0.6428, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3132, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9560, device='cuda:0', grad_fn=<AddBackward0>)



  5%|▍         | 17/354 [01:32<30:47,  5.48s/it][A

	loss_cls: tensor(0.6712, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1702, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8414, device='cuda:0', grad_fn=<AddBackward0>)



  5%|▌         | 18/354 [01:38<30:38,  5.47s/it][A

	loss_cls: tensor(0.6284, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1680, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7964, device='cuda:0', grad_fn=<AddBackward0>)



  5%|▌         | 19/354 [01:43<30:26,  5.45s/it][A

	loss_cls: tensor(0.4900, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0802, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5702, device='cuda:0', grad_fn=<AddBackward0>)



  6%|▌         | 20/354 [01:49<30:26,  5.47s/it][A

	loss_cls: tensor(0.5203, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2817, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8021, device='cuda:0', grad_fn=<AddBackward0>)



  6%|▌         | 21/354 [01:54<30:16,  5.45s/it][A

	loss_cls: tensor(0.5525, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3334, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8858, device='cuda:0', grad_fn=<AddBackward0>)



  6%|▌         | 22/354 [02:00<30:15,  5.47s/it][A

	loss_cls: tensor(0.5771, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0369, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6140, device='cuda:0', grad_fn=<AddBackward0>)



  6%|▋         | 23/354 [02:05<30:09,  5.47s/it][A

	loss_cls: tensor(0.5785, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2494, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8279, device='cuda:0', grad_fn=<AddBackward0>)



  7%|▋         | 24/354 [02:11<30:09,  5.48s/it][A

	loss_cls: tensor(0.6375, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0603, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6977, device='cuda:0', grad_fn=<AddBackward0>)



  7%|▋         | 25/354 [02:16<29:58,  5.47s/it][A

	loss_cls: tensor(0.4607, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0199, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4806, device='cuda:0', grad_fn=<AddBackward0>)



  7%|▋         | 26/354 [02:22<29:51,  5.46s/it][A

	loss_cls: tensor(0.8165, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1172, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9337, device='cuda:0', grad_fn=<AddBackward0>)



  8%|▊         | 27/354 [02:27<29:51,  5.48s/it][A

	loss_cls: tensor(0.7087, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2676, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9763, device='cuda:0', grad_fn=<AddBackward0>)



  8%|▊         | 28/354 [02:33<29:40,  5.46s/it][A

	loss_cls: tensor(0.5470, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4770, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0240, device='cuda:0', grad_fn=<AddBackward0>)



  8%|▊         | 29/354 [02:38<29:40,  5.48s/it][A

	loss_cls: tensor(0.5159, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.7093, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2252, device='cuda:0', grad_fn=<AddBackward0>)



  8%|▊         | 30/354 [02:44<29:33,  5.47s/it][A

	loss_cls: tensor(0.5787, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2366, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8153, device='cuda:0', grad_fn=<AddBackward0>)



  9%|▉         | 31/354 [02:49<29:30,  5.48s/it][A

	loss_cls: tensor(0.4497, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0250, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4747, device='cuda:0', grad_fn=<AddBackward0>)



  9%|▉         | 32/354 [02:54<29:19,  5.46s/it][A

	loss_cls: tensor(0.5181, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2626, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7807, device='cuda:0', grad_fn=<AddBackward0>)



  9%|▉         | 33/354 [03:00<29:08,  5.45s/it][A

	loss_cls: tensor(0.5762, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2665, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8427, device='cuda:0', grad_fn=<AddBackward0>)



 10%|▉         | 34/354 [03:05<29:09,  5.47s/it][A

	loss_cls: tensor(0.6068, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3623, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9692, device='cuda:0', grad_fn=<AddBackward0>)



 10%|▉         | 35/354 [03:11<28:58,  5.45s/it][A

	loss_cls: tensor(0.5059, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4337, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9396, device='cuda:0', grad_fn=<AddBackward0>)



 10%|█         | 36/354 [03:16<29:01,  5.48s/it][A

	loss_cls: tensor(0.5015, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2425, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7440, device='cuda:0', grad_fn=<AddBackward0>)



 10%|█         | 37/354 [03:22<28:54,  5.47s/it][A

	loss_cls: tensor(0.5778, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1296, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7073, device='cuda:0', grad_fn=<AddBackward0>)



 11%|█         | 38/354 [03:27<28:51,  5.48s/it][A

	loss_cls: tensor(0.7310, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2655, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9965, device='cuda:0', grad_fn=<AddBackward0>)



 11%|█         | 39/354 [03:33<28:41,  5.46s/it][A

	loss_cls: tensor(0.5711, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2353, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8064, device='cuda:0', grad_fn=<AddBackward0>)



 11%|█▏        | 40/354 [03:38<28:34,  5.46s/it][A

	loss_cls: tensor(0.5825, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4391, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0216, device='cuda:0', grad_fn=<AddBackward0>)



 12%|█▏        | 41/354 [03:44<28:32,  5.47s/it][A

	loss_cls: tensor(0.6983, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2273, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9256, device='cuda:0', grad_fn=<AddBackward0>)



 12%|█▏        | 42/354 [03:49<28:23,  5.46s/it][A

	loss_cls: tensor(0.7350, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0998, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8348, device='cuda:0', grad_fn=<AddBackward0>)



 12%|█▏        | 43/354 [03:55<28:25,  5.48s/it][A

	loss_cls: tensor(0.5674, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0931, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6605, device='cuda:0', grad_fn=<AddBackward0>)



 12%|█▏        | 44/354 [04:00<28:16,  5.47s/it][A

	loss_cls: tensor(0.4637, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5948, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0585, device='cuda:0', grad_fn=<AddBackward0>)



 13%|█▎        | 45/354 [04:06<28:14,  5.48s/it][A

	loss_cls: tensor(0.5929, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0594, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6523, device='cuda:0', grad_fn=<AddBackward0>)



 13%|█▎        | 46/354 [04:11<28:03,  5.47s/it][A

	loss_cls: tensor(0.7729, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3583, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1312, device='cuda:0', grad_fn=<AddBackward0>)



 13%|█▎        | 47/354 [04:16<27:54,  5.45s/it][A

	loss_cls: tensor(0.4483, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1725, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6208, device='cuda:0', grad_fn=<AddBackward0>)



 14%|█▎        | 48/354 [04:22<27:54,  5.47s/it][A

	loss_cls: tensor(0.6227, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1899, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8126, device='cuda:0', grad_fn=<AddBackward0>)



 14%|█▍        | 49/354 [04:27<27:46,  5.46s/it][A

	loss_cls: tensor(0.5912, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1925, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7837, device='cuda:0', grad_fn=<AddBackward0>)



 14%|█▍        | 50/354 [04:33<27:48,  5.49s/it][A

	loss_cls: tensor(0.7139, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2260, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9399, device='cuda:0', grad_fn=<AddBackward0>)



 14%|█▍        | 51/354 [04:38<27:38,  5.47s/it][A

	loss_cls: tensor(0.6412, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2191, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8603, device='cuda:0', grad_fn=<AddBackward0>)



 15%|█▍        | 52/354 [04:44<27:37,  5.49s/it][A

	loss_cls: tensor(0.6527, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3291, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9817, device='cuda:0', grad_fn=<AddBackward0>)



 15%|█▍        | 53/354 [04:49<27:26,  5.47s/it][A

	loss_cls: tensor(1.2666, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0843, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.3509, device='cuda:0', grad_fn=<AddBackward0>)



 15%|█▌        | 54/354 [04:55<27:15,  5.45s/it][A

	loss_cls: tensor(0.6416, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2909, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9325, device='cuda:0', grad_fn=<AddBackward0>)



 16%|█▌        | 55/354 [05:00<27:20,  5.49s/it][A

	loss_cls: tensor(0.8591, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1622, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0213, device='cuda:0', grad_fn=<AddBackward0>)



 16%|█▌        | 56/354 [05:06<27:09,  5.47s/it][A

	loss_cls: tensor(0.6837, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3553, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0391, device='cuda:0', grad_fn=<AddBackward0>)



 16%|█▌        | 57/354 [05:11<27:11,  5.49s/it][A

	loss_cls: tensor(0.7109, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2479, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9589, device='cuda:0', grad_fn=<AddBackward0>)



 16%|█▋        | 58/354 [05:17<27:00,  5.47s/it][A

	loss_cls: tensor(0.5273, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4986, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0260, device='cuda:0', grad_fn=<AddBackward0>)



 17%|█▋        | 59/354 [05:22<26:57,  5.48s/it][A

	loss_cls: tensor(0.5494, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1093, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6587, device='cuda:0', grad_fn=<AddBackward0>)



 17%|█▋        | 60/354 [05:28<26:47,  5.47s/it][A

	loss_cls: tensor(0.5037, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3098, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8135, device='cuda:0', grad_fn=<AddBackward0>)



 17%|█▋        | 61/354 [05:33<26:37,  5.45s/it][A

	loss_cls: tensor(0.6668, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2391, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9059, device='cuda:0', grad_fn=<AddBackward0>)



 18%|█▊        | 62/354 [05:39<26:40,  5.48s/it][A

	loss_cls: tensor(0.9063, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2304, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1367, device='cuda:0', grad_fn=<AddBackward0>)



 18%|█▊        | 63/354 [05:44<26:30,  5.46s/it][A

	loss_cls: tensor(0.8317, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1106, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9423, device='cuda:0', grad_fn=<AddBackward0>)



 18%|█▊        | 64/354 [05:50<26:29,  5.48s/it][A

	loss_cls: tensor(0.6672, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2296, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8969, device='cuda:0', grad_fn=<AddBackward0>)



 18%|█▊        | 65/354 [05:55<26:19,  5.46s/it][A

	loss_cls: tensor(0.7368, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1914, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9282, device='cuda:0', grad_fn=<AddBackward0>)



 19%|█▊        | 66/354 [06:01<26:18,  5.48s/it][A

	loss_cls: tensor(0.4206, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2545, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6751, device='cuda:0', grad_fn=<AddBackward0>)



 19%|█▉        | 67/354 [06:06<26:09,  5.47s/it][A

	loss_cls: tensor(0.4827, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1548, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6375, device='cuda:0', grad_fn=<AddBackward0>)



 19%|█▉        | 68/354 [06:11<26:00,  5.46s/it][A

	loss_cls: tensor(0.8061, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2795, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0856, device='cuda:0', grad_fn=<AddBackward0>)



 19%|█▉        | 69/354 [06:17<26:01,  5.48s/it][A

	loss_cls: tensor(0.5103, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0857, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5960, device='cuda:0', grad_fn=<AddBackward0>)



 20%|█▉        | 70/354 [06:22<25:52,  5.47s/it][A

	loss_cls: tensor(0.5121, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4128, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9249, device='cuda:0', grad_fn=<AddBackward0>)



 20%|██        | 71/354 [06:28<25:55,  5.50s/it][A

	loss_cls: tensor(0.7685, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2331, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0017, device='cuda:0', grad_fn=<AddBackward0>)



 20%|██        | 72/354 [06:33<25:43,  5.47s/it][A

	loss_cls: tensor(0.7371, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2736, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0107, device='cuda:0', grad_fn=<AddBackward0>)



 21%|██        | 73/354 [06:39<25:39,  5.48s/it][A

	loss_cls: tensor(0.7108, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4283, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1391, device='cuda:0', grad_fn=<AddBackward0>)



 21%|██        | 74/354 [06:44<25:28,  5.46s/it][A

	loss_cls: tensor(0.6831, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0991, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7823, device='cuda:0', grad_fn=<AddBackward0>)



 21%|██        | 75/354 [06:50<25:20,  5.45s/it][A

	loss_cls: tensor(0.7188, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2556, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9744, device='cuda:0', grad_fn=<AddBackward0>)



 21%|██▏       | 76/354 [06:55<25:20,  5.47s/it][A

	loss_cls: tensor(0.5358, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1933, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7291, device='cuda:0', grad_fn=<AddBackward0>)



 22%|██▏       | 77/354 [07:01<25:13,  5.46s/it][A

	loss_cls: tensor(0.6823, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1147, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7970, device='cuda:0', grad_fn=<AddBackward0>)



 22%|██▏       | 78/354 [07:06<25:11,  5.48s/it][A

	loss_cls: tensor(0.5933, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2424, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8357, device='cuda:0', grad_fn=<AddBackward0>)



 22%|██▏       | 79/354 [07:12<25:02,  5.46s/it][A

	loss_cls: tensor(0.7171, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2118, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9288, device='cuda:0', grad_fn=<AddBackward0>)



 23%|██▎       | 80/354 [07:17<25:00,  5.48s/it][A

	loss_cls: tensor(0.5183, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1968, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7151, device='cuda:0', grad_fn=<AddBackward0>)



 23%|██▎       | 81/354 [07:22<24:49,  5.46s/it][A

	loss_cls: tensor(0.5253, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1428, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6682, device='cuda:0', grad_fn=<AddBackward0>)



 23%|██▎       | 82/354 [07:28<24:48,  5.47s/it][A

	loss_cls: tensor(0.6008, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0554, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6562, device='cuda:0', grad_fn=<AddBackward0>)



 23%|██▎       | 83/354 [07:33<24:40,  5.46s/it][A

	loss_cls: tensor(0.6567, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3002, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9570, device='cuda:0', grad_fn=<AddBackward0>)



 24%|██▎       | 84/354 [07:39<24:30,  5.45s/it][A

	loss_cls: tensor(0.5821, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1663, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7484, device='cuda:0', grad_fn=<AddBackward0>)



 24%|██▍       | 85/354 [07:44<24:29,  5.46s/it][A

	loss_cls: tensor(0.5311, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2083, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7394, device='cuda:0', grad_fn=<AddBackward0>)



 24%|██▍       | 86/354 [07:50<24:19,  5.45s/it][A

	loss_cls: tensor(0.6789, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2411, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9200, device='cuda:0', grad_fn=<AddBackward0>)



 25%|██▍       | 87/354 [07:55<24:20,  5.47s/it][A

	loss_cls: tensor(0.8944, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3817, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2761, device='cuda:0', grad_fn=<AddBackward0>)



 25%|██▍       | 88/354 [08:01<24:11,  5.46s/it][A

	loss_cls: tensor(0.7321, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2423, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9744, device='cuda:0', grad_fn=<AddBackward0>)



 25%|██▌       | 89/354 [08:06<24:10,  5.47s/it][A

	loss_cls: tensor(0.4228, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3230, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7458, device='cuda:0', grad_fn=<AddBackward0>)



 25%|██▌       | 90/354 [08:12<24:02,  5.47s/it][A

	loss_cls: tensor(0.5470, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4486, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9956, device='cuda:0', grad_fn=<AddBackward0>)



 26%|██▌       | 91/354 [08:17<23:55,  5.46s/it][A

	loss_cls: tensor(0.6112, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1709, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7820, device='cuda:0', grad_fn=<AddBackward0>)



 26%|██▌       | 92/354 [08:23<23:54,  5.48s/it][A

	loss_cls: tensor(0.5117, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2960, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8077, device='cuda:0', grad_fn=<AddBackward0>)



 26%|██▋       | 93/354 [08:28<23:46,  5.47s/it][A

	loss_cls: tensor(0.5402, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1197, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6600, device='cuda:0', grad_fn=<AddBackward0>)



 27%|██▋       | 94/354 [08:34<23:44,  5.48s/it][A

	loss_cls: tensor(0.6364, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1724, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8088, device='cuda:0', grad_fn=<AddBackward0>)



 27%|██▋       | 95/354 [08:39<23:32,  5.45s/it][A

	loss_cls: tensor(0.7331, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1135, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8466, device='cuda:0', grad_fn=<AddBackward0>)



 27%|██▋       | 96/354 [08:45<23:34,  5.48s/it][A

	loss_cls: tensor(0.5447, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1686, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7132, device='cuda:0', grad_fn=<AddBackward0>)



 27%|██▋       | 97/354 [08:50<23:26,  5.47s/it][A

	loss_cls: tensor(0.6739, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1394, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8134, device='cuda:0', grad_fn=<AddBackward0>)



 28%|██▊       | 98/354 [08:55<23:16,  5.46s/it][A

	loss_cls: tensor(0.4632, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1628, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6260, device='cuda:0', grad_fn=<AddBackward0>)



 28%|██▊       | 99/354 [09:01<23:16,  5.48s/it][A

	loss_cls: tensor(0.3889, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1012, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4901, device='cuda:0', grad_fn=<AddBackward0>)



 28%|██▊       | 100/354 [09:06<23:07,  5.46s/it][A

	loss_cls: tensor(0.6156, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0961, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7117, device='cuda:0', grad_fn=<AddBackward0>)



 29%|██▊       | 101/354 [09:12<23:03,  5.47s/it][A

	loss_cls: tensor(0.6138, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1948, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8086, device='cuda:0', grad_fn=<AddBackward0>)



 29%|██▉       | 102/354 [09:17<22:55,  5.46s/it][A

	loss_cls: tensor(0.9499, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2876, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2375, device='cuda:0', grad_fn=<AddBackward0>)



 29%|██▉       | 103/354 [09:23<22:57,  5.49s/it][A

	loss_cls: tensor(0.5562, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2020, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7582, device='cuda:0', grad_fn=<AddBackward0>)



 29%|██▉       | 104/354 [09:28<22:46,  5.47s/it][A

	loss_cls: tensor(0.4135, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2276, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6412, device='cuda:0', grad_fn=<AddBackward0>)



 30%|██▉       | 105/354 [09:34<22:38,  5.45s/it][A

	loss_cls: tensor(0.7864, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5751, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.3615, device='cuda:0', grad_fn=<AddBackward0>)



 30%|██▉       | 106/354 [09:39<22:37,  5.47s/it][A

	loss_cls: tensor(0.6867, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.6239, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.3106, device='cuda:0', grad_fn=<AddBackward0>)



 30%|███       | 107/354 [09:45<22:27,  5.46s/it][A

	loss_cls: tensor(0.7259, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0830, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8090, device='cuda:0', grad_fn=<AddBackward0>)



 31%|███       | 108/354 [09:50<22:26,  5.47s/it][A

	loss_cls: tensor(0.3956, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0741, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4698, device='cuda:0', grad_fn=<AddBackward0>)



 31%|███       | 109/354 [09:56<22:17,  5.46s/it][A

	loss_cls: tensor(0.7197, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3088, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0285, device='cuda:0', grad_fn=<AddBackward0>)



 31%|███       | 110/354 [10:01<22:15,  5.47s/it][A

	loss_cls: tensor(0.5831, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3231, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9062, device='cuda:0', grad_fn=<AddBackward0>)



 31%|███▏      | 111/354 [10:06<22:06,  5.46s/it][A

	loss_cls: tensor(0.6015, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1463, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7478, device='cuda:0', grad_fn=<AddBackward0>)



 32%|███▏      | 112/354 [10:12<21:58,  5.45s/it][A

	loss_cls: tensor(0.6049, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2575, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8624, device='cuda:0', grad_fn=<AddBackward0>)



 32%|███▏      | 113/354 [10:17<21:55,  5.46s/it][A

	loss_cls: tensor(0.5238, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4478, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9716, device='cuda:0', grad_fn=<AddBackward0>)



 32%|███▏      | 114/354 [10:23<21:46,  5.44s/it][A

	loss_cls: tensor(0.6346, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2686, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9031, device='cuda:0', grad_fn=<AddBackward0>)



 32%|███▏      | 115/354 [10:28<21:46,  5.47s/it][A

	loss_cls: tensor(0.6266, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1622, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7888, device='cuda:0', grad_fn=<AddBackward0>)



 33%|███▎      | 116/354 [10:34<21:41,  5.47s/it][A

	loss_cls: tensor(0.6110, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2581, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8692, device='cuda:0', grad_fn=<AddBackward0>)



 33%|███▎      | 117/354 [10:39<21:38,  5.48s/it][A

	loss_cls: tensor(1.0001, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2728, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2729, device='cuda:0', grad_fn=<AddBackward0>)



 33%|███▎      | 118/354 [10:45<21:29,  5.47s/it][A

	loss_cls: tensor(0.6212, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5421, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1633, device='cuda:0', grad_fn=<AddBackward0>)



 34%|███▎      | 119/354 [10:50<21:20,  5.45s/it][A

	loss_cls: tensor(0.6262, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1192, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7454, device='cuda:0', grad_fn=<AddBackward0>)



 34%|███▍      | 120/354 [10:56<21:17,  5.46s/it][A

	loss_cls: tensor(0.6843, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4102, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0945, device='cuda:0', grad_fn=<AddBackward0>)



 34%|███▍      | 121/354 [11:01<21:07,  5.44s/it][A

	loss_cls: tensor(0.7449, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0461, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7910, device='cuda:0', grad_fn=<AddBackward0>)



 34%|███▍      | 122/354 [11:07<21:06,  5.46s/it][A

	loss_cls: tensor(0.4593, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2343, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6936, device='cuda:0', grad_fn=<AddBackward0>)



 35%|███▍      | 123/354 [11:12<20:59,  5.45s/it][A

	loss_cls: tensor(0.4488, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3527, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8015, device='cuda:0', grad_fn=<AddBackward0>)



 35%|███▌      | 124/354 [11:17<21:01,  5.48s/it][A

	loss_cls: tensor(0.7077, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2338, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9415, device='cuda:0', grad_fn=<AddBackward0>)



 35%|███▌      | 125/354 [11:23<20:51,  5.47s/it][A

	loss_cls: tensor(0.5963, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1247, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7210, device='cuda:0', grad_fn=<AddBackward0>)



 36%|███▌      | 126/354 [11:28<20:43,  5.46s/it][A

	loss_cls: tensor(0.8423, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4968, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.3391, device='cuda:0', grad_fn=<AddBackward0>)



 36%|███▌      | 127/354 [11:34<20:45,  5.48s/it][A

	loss_cls: tensor(0.8271, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1113, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9384, device='cuda:0', grad_fn=<AddBackward0>)



 36%|███▌      | 128/354 [11:39<20:35,  5.46s/it][A

	loss_cls: tensor(0.4038, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0959, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4997, device='cuda:0', grad_fn=<AddBackward0>)



 36%|███▋      | 129/354 [11:45<20:33,  5.48s/it][A

	loss_cls: tensor(0.4867, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1054, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5921, device='cuda:0', grad_fn=<AddBackward0>)



 37%|███▋      | 130/354 [11:50<20:24,  5.47s/it][A

	loss_cls: tensor(0.4844, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1750, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6594, device='cuda:0', grad_fn=<AddBackward0>)



 37%|███▋      | 131/354 [11:56<20:20,  5.47s/it][A

	loss_cls: tensor(0.3895, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1391, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5286, device='cuda:0', grad_fn=<AddBackward0>)



 37%|███▋      | 132/354 [12:01<20:11,  5.46s/it][A

	loss_cls: tensor(0.6405, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4612, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1016, device='cuda:0', grad_fn=<AddBackward0>)



 38%|███▊      | 133/354 [12:07<20:04,  5.45s/it][A

	loss_cls: tensor(0.8287, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1286, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9573, device='cuda:0', grad_fn=<AddBackward0>)



 38%|███▊      | 134/354 [12:12<20:02,  5.47s/it][A

	loss_cls: tensor(0.6863, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1239, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8102, device='cuda:0', grad_fn=<AddBackward0>)



 38%|███▊      | 135/354 [12:18<19:53,  5.45s/it][A

	loss_cls: tensor(0.4575, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2934, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7509, device='cuda:0', grad_fn=<AddBackward0>)



 38%|███▊      | 136/354 [12:23<19:53,  5.48s/it][A

	loss_cls: tensor(0.6976, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1879, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8855, device='cuda:0', grad_fn=<AddBackward0>)



 39%|███▊      | 137/354 [12:29<19:47,  5.47s/it][A

	loss_cls: tensor(0.4218, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0381, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4599, device='cuda:0', grad_fn=<AddBackward0>)



 39%|███▉      | 138/354 [12:34<19:45,  5.49s/it][A

	loss_cls: tensor(0.4720, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1398, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6119, device='cuda:0', grad_fn=<AddBackward0>)



 39%|███▉      | 139/354 [12:40<19:36,  5.47s/it][A

	loss_cls: tensor(0.7447, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3053, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0501, device='cuda:0', grad_fn=<AddBackward0>)



 40%|███▉      | 140/354 [12:45<19:28,  5.46s/it][A

	loss_cls: tensor(0.5327, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3325, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8652, device='cuda:0', grad_fn=<AddBackward0>)



 40%|███▉      | 141/354 [12:50<19:25,  5.47s/it][A

	loss_cls: tensor(0.5053, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0921, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5974, device='cuda:0', grad_fn=<AddBackward0>)



 40%|████      | 142/354 [12:56<19:18,  5.46s/it][A

	loss_cls: tensor(0.8089, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2034, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0123, device='cuda:0', grad_fn=<AddBackward0>)



 40%|████      | 143/354 [13:01<19:16,  5.48s/it][A

	loss_cls: tensor(0.5385, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0506, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5891, device='cuda:0', grad_fn=<AddBackward0>)



 41%|████      | 144/354 [13:07<19:08,  5.47s/it][A

	loss_cls: tensor(0.4010, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2948, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6958, device='cuda:0', grad_fn=<AddBackward0>)



 41%|████      | 145/354 [13:12<19:05,  5.48s/it][A

	loss_cls: tensor(0.6185, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1865, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8051, device='cuda:0', grad_fn=<AddBackward0>)



 41%|████      | 146/354 [13:18<18:56,  5.46s/it][A

	loss_cls: tensor(0.5372, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2386, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7758, device='cuda:0', grad_fn=<AddBackward0>)



 42%|████▏     | 147/354 [13:23<18:49,  5.46s/it][A

	loss_cls: tensor(0.4428, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1582, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6010, device='cuda:0', grad_fn=<AddBackward0>)



 42%|████▏     | 148/354 [13:29<18:46,  5.47s/it][A

	loss_cls: tensor(0.5021, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0647, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5668, device='cuda:0', grad_fn=<AddBackward0>)



 42%|████▏     | 149/354 [13:34<18:38,  5.46s/it][A

	loss_cls: tensor(0.5369, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0953, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6322, device='cuda:0', grad_fn=<AddBackward0>)



 42%|████▏     | 150/354 [13:40<18:39,  5.49s/it][A

	loss_cls: tensor(0.5414, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4430, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9845, device='cuda:0', grad_fn=<AddBackward0>)



 43%|████▎     | 151/354 [13:45<18:30,  5.47s/it][A

	loss_cls: tensor(0.6499, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3185, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9684, device='cuda:0', grad_fn=<AddBackward0>)



 43%|████▎     | 152/354 [13:51<18:26,  5.48s/it][A

	loss_cls: tensor(0.5038, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1140, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6178, device='cuda:0', grad_fn=<AddBackward0>)



 43%|████▎     | 153/354 [13:56<18:16,  5.46s/it][A

	loss_cls: tensor(0.3954, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0463, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4417, device='cuda:0', grad_fn=<AddBackward0>)



 44%|████▎     | 154/354 [14:01<18:07,  5.44s/it][A

	loss_cls: tensor(0.6889, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1246, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8135, device='cuda:0', grad_fn=<AddBackward0>)



 44%|████▍     | 155/354 [14:07<18:07,  5.47s/it][A

	loss_cls: tensor(0.4296, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1097, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5393, device='cuda:0', grad_fn=<AddBackward0>)



 44%|████▍     | 156/354 [14:12<18:00,  5.46s/it][A

	loss_cls: tensor(0.5411, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1198, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6609, device='cuda:0', grad_fn=<AddBackward0>)



 44%|████▍     | 157/354 [14:18<17:59,  5.48s/it][A

	loss_cls: tensor(0.5204, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5891, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1095, device='cuda:0', grad_fn=<AddBackward0>)



 45%|████▍     | 158/354 [14:23<17:53,  5.48s/it][A

	loss_cls: tensor(0.7048, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3437, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0485, device='cuda:0', grad_fn=<AddBackward0>)



 45%|████▍     | 159/354 [14:29<17:50,  5.49s/it][A

	loss_cls: tensor(0.8337, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1389, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9726, device='cuda:0', grad_fn=<AddBackward0>)



 45%|████▌     | 160/354 [14:34<17:43,  5.48s/it][A

	loss_cls: tensor(0.8293, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5080, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.3373, device='cuda:0', grad_fn=<AddBackward0>)



 45%|████▌     | 161/354 [14:40<17:35,  5.47s/it][A

	loss_cls: tensor(0.3217, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1399, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4616, device='cuda:0', grad_fn=<AddBackward0>)



 46%|████▌     | 162/354 [14:45<17:32,  5.48s/it][A

	loss_cls: tensor(1.0267, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2940, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.3207, device='cuda:0', grad_fn=<AddBackward0>)



 46%|████▌     | 163/354 [14:51<17:26,  5.48s/it][A

	loss_cls: tensor(0.6653, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2038, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8691, device='cuda:0', grad_fn=<AddBackward0>)



 46%|████▋     | 164/354 [14:56<17:23,  5.49s/it][A

	loss_cls: tensor(0.5923, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1560, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7483, device='cuda:0', grad_fn=<AddBackward0>)



 47%|████▋     | 165/354 [15:02<17:16,  5.48s/it][A

	loss_cls: tensor(0.7082, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1444, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8526, device='cuda:0', grad_fn=<AddBackward0>)



 47%|████▋     | 166/354 [15:07<17:13,  5.50s/it][A

	loss_cls: tensor(0.5567, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3633, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9200, device='cuda:0', grad_fn=<AddBackward0>)



 47%|████▋     | 167/354 [15:13<17:05,  5.48s/it][A

	loss_cls: tensor(0.6791, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2313, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9104, device='cuda:0', grad_fn=<AddBackward0>)



 47%|████▋     | 168/354 [15:18<16:58,  5.47s/it][A

	loss_cls: tensor(0.8047, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1009, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9056, device='cuda:0', grad_fn=<AddBackward0>)



 48%|████▊     | 169/354 [15:24<16:57,  5.50s/it][A

	loss_cls: tensor(0.6681, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2906, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9587, device='cuda:0', grad_fn=<AddBackward0>)



 48%|████▊     | 170/354 [15:29<16:49,  5.49s/it][A

	loss_cls: tensor(0.5407, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2516, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7923, device='cuda:0', grad_fn=<AddBackward0>)



 48%|████▊     | 171/354 [15:35<16:46,  5.50s/it][A

	loss_cls: tensor(0.5992, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1507, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7499, device='cuda:0', grad_fn=<AddBackward0>)



 49%|████▊     | 172/354 [15:40<16:38,  5.49s/it][A

	loss_cls: tensor(0.6131, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1042, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7174, device='cuda:0', grad_fn=<AddBackward0>)



 49%|████▉     | 173/354 [15:46<16:35,  5.50s/it][A

	loss_cls: tensor(0.9507, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4243, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.3750, device='cuda:0', grad_fn=<AddBackward0>)



 49%|████▉     | 174/354 [15:51<16:26,  5.48s/it][A

	loss_cls: tensor(0.6355, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0898, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7254, device='cuda:0', grad_fn=<AddBackward0>)



 49%|████▉     | 175/354 [15:57<16:18,  5.47s/it][A

	loss_cls: tensor(0.5892, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1260, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7152, device='cuda:0', grad_fn=<AddBackward0>)



 50%|████▉     | 176/354 [16:02<16:17,  5.49s/it][A

	loss_cls: tensor(0.5148, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0581, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5729, device='cuda:0', grad_fn=<AddBackward0>)



 50%|█████     | 177/354 [16:08<16:08,  5.47s/it][A

	loss_cls: tensor(0.5391, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2431, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7822, device='cuda:0', grad_fn=<AddBackward0>)



 50%|█████     | 178/354 [16:13<16:01,  5.46s/it][A

	loss_cls: tensor(0.4698, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0384, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5082, device='cuda:0', grad_fn=<AddBackward0>)



 51%|█████     | 179/354 [16:18<15:50,  5.43s/it][A

	loss_cls: tensor(0.5923, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1283, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7206, device='cuda:0', grad_fn=<AddBackward0>)



 51%|█████     | 180/354 [16:24<15:45,  5.43s/it][A

	loss_cls: tensor(0.8505, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5135, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.3641, device='cuda:0', grad_fn=<AddBackward0>)



 51%|█████     | 181/354 [16:29<15:37,  5.42s/it][A

	loss_cls: tensor(0.9919, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5744, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.5663, device='cuda:0', grad_fn=<AddBackward0>)



 51%|█████▏    | 182/354 [16:35<15:30,  5.41s/it][A

	loss_cls: tensor(0.9487, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2747, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2234, device='cuda:0', grad_fn=<AddBackward0>)



 52%|█████▏    | 183/354 [16:40<15:24,  5.40s/it][A

	loss_cls: tensor(0.8077, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2470, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0547, device='cuda:0', grad_fn=<AddBackward0>)



 52%|█████▏    | 184/354 [16:45<15:17,  5.40s/it][A

	loss_cls: tensor(0.5426, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1868, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7293, device='cuda:0', grad_fn=<AddBackward0>)



 52%|█████▏    | 185/354 [16:51<15:15,  5.41s/it][A

	loss_cls: tensor(0.5173, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1884, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7057, device='cuda:0', grad_fn=<AddBackward0>)



 53%|█████▎    | 186/354 [16:56<15:07,  5.40s/it][A

	loss_cls: tensor(0.5562, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2145, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7707, device='cuda:0', grad_fn=<AddBackward0>)



 53%|█████▎    | 187/354 [17:02<15:04,  5.42s/it][A

	loss_cls: tensor(0.6759, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3371, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0130, device='cuda:0', grad_fn=<AddBackward0>)



 53%|█████▎    | 188/354 [17:07<14:56,  5.40s/it][A

	loss_cls: tensor(0.7576, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3188, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0764, device='cuda:0', grad_fn=<AddBackward0>)



 53%|█████▎    | 189/354 [17:13<14:53,  5.42s/it][A

	loss_cls: tensor(0.4557, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1241, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5799, device='cuda:0', grad_fn=<AddBackward0>)



 54%|█████▎    | 190/354 [17:18<14:47,  5.41s/it][A

	loss_cls: tensor(0.6712, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0748, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7460, device='cuda:0', grad_fn=<AddBackward0>)



 54%|█████▍    | 191/354 [17:23<14:38,  5.39s/it][A

	loss_cls: tensor(0.2486, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2160, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4646, device='cuda:0', grad_fn=<AddBackward0>)



 54%|█████▍    | 192/354 [17:29<14:36,  5.41s/it][A

	loss_cls: tensor(0.6113, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4273, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0386, device='cuda:0', grad_fn=<AddBackward0>)



 55%|█████▍    | 193/354 [17:34<14:28,  5.39s/it][A

	loss_cls: tensor(0.4387, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3031, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7418, device='cuda:0', grad_fn=<AddBackward0>)



 55%|█████▍    | 194/354 [17:40<14:25,  5.41s/it][A

	loss_cls: tensor(0.4978, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0672, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5650, device='cuda:0', grad_fn=<AddBackward0>)



 55%|█████▌    | 195/354 [17:45<14:17,  5.39s/it][A

	loss_cls: tensor(0.6366, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3355, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9721, device='cuda:0', grad_fn=<AddBackward0>)



 55%|█████▌    | 196/354 [17:50<14:16,  5.42s/it][A

	loss_cls: tensor(0.4946, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2961, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7907, device='cuda:0', grad_fn=<AddBackward0>)



 56%|█████▌    | 197/354 [17:56<14:08,  5.40s/it][A

	loss_cls: tensor(0.5505, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1604, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7108, device='cuda:0', grad_fn=<AddBackward0>)



 56%|█████▌    | 198/354 [18:01<14:00,  5.39s/it][A

	loss_cls: tensor(0.6283, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1542, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7825, device='cuda:0', grad_fn=<AddBackward0>)



 56%|█████▌    | 199/354 [18:07<13:58,  5.41s/it][A

	loss_cls: tensor(0.7143, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2139, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9283, device='cuda:0', grad_fn=<AddBackward0>)



 56%|█████▋    | 200/354 [18:12<13:51,  5.40s/it][A

	loss_cls: tensor(0.6977, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1829, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8805, device='cuda:0', grad_fn=<AddBackward0>)



 57%|█████▋    | 201/354 [18:17<13:47,  5.41s/it][A

	loss_cls: tensor(0.4820, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0359, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5179, device='cuda:0', grad_fn=<AddBackward0>)



 57%|█████▋    | 202/354 [18:23<13:39,  5.39s/it][A

	loss_cls: tensor(0.4638, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2176, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6814, device='cuda:0', grad_fn=<AddBackward0>)



 57%|█████▋    | 203/354 [18:28<13:36,  5.41s/it][A

	loss_cls: tensor(0.4391, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4207, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8597, device='cuda:0', grad_fn=<AddBackward0>)



 58%|█████▊    | 204/354 [18:33<13:29,  5.40s/it][A

	loss_cls: tensor(0.6705, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2822, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9527, device='cuda:0', grad_fn=<AddBackward0>)



 58%|█████▊    | 205/354 [18:39<13:22,  5.39s/it][A

	loss_cls: tensor(0.4153, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1141, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5294, device='cuda:0', grad_fn=<AddBackward0>)



 58%|█████▊    | 206/354 [18:44<13:19,  5.40s/it][A

	loss_cls: tensor(0.6134, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2721, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8855, device='cuda:0', grad_fn=<AddBackward0>)



 58%|█████▊    | 207/354 [18:50<13:12,  5.39s/it][A

	loss_cls: tensor(0.5043, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3121, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8164, device='cuda:0', grad_fn=<AddBackward0>)



 59%|█████▉    | 208/354 [18:55<13:08,  5.40s/it][A

	loss_cls: tensor(0.6599, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5324, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1923, device='cuda:0', grad_fn=<AddBackward0>)



 59%|█████▉    | 209/354 [19:00<13:01,  5.39s/it][A

	loss_cls: tensor(0.5588, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2821, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8409, device='cuda:0', grad_fn=<AddBackward0>)



 59%|█████▉    | 210/354 [19:06<12:59,  5.41s/it][A

	loss_cls: tensor(0.5848, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2389, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8237, device='cuda:0', grad_fn=<AddBackward0>)



 60%|█████▉    | 211/354 [19:11<12:52,  5.41s/it][A

	loss_cls: tensor(0.5416, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2701, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8117, device='cuda:0', grad_fn=<AddBackward0>)



 60%|█████▉    | 212/354 [19:17<12:47,  5.40s/it][A

	loss_cls: tensor(0.3699, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2355, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6054, device='cuda:0', grad_fn=<AddBackward0>)



 60%|██████    | 213/354 [19:22<12:42,  5.41s/it][A

	loss_cls: tensor(0.7746, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1449, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9195, device='cuda:0', grad_fn=<AddBackward0>)



 60%|██████    | 214/354 [19:27<12:35,  5.40s/it][A

	loss_cls: tensor(0.6418, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0831, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7249, device='cuda:0', grad_fn=<AddBackward0>)



 61%|██████    | 215/354 [19:33<12:32,  5.41s/it][A

	loss_cls: tensor(0.9753, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2235, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1987, device='cuda:0', grad_fn=<AddBackward0>)



 61%|██████    | 216/354 [19:38<12:25,  5.40s/it][A

	loss_cls: tensor(0.5271, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2661, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7932, device='cuda:0', grad_fn=<AddBackward0>)



 61%|██████▏   | 217/354 [19:44<12:22,  5.42s/it][A

	loss_cls: tensor(0.4172, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0336, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4508, device='cuda:0', grad_fn=<AddBackward0>)



 62%|██████▏   | 218/354 [19:49<12:16,  5.41s/it][A

	loss_cls: tensor(0.9196, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5003, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.4198, device='cuda:0', grad_fn=<AddBackward0>)



 62%|██████▏   | 219/354 [19:55<12:09,  5.41s/it][A

	loss_cls: tensor(0.6863, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0548, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7410, device='cuda:0', grad_fn=<AddBackward0>)



 62%|██████▏   | 220/354 [20:00<12:06,  5.42s/it][A

	loss_cls: tensor(0.8260, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2151, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0411, device='cuda:0', grad_fn=<AddBackward0>)



 62%|██████▏   | 221/354 [20:05<11:58,  5.40s/it][A

	loss_cls: tensor(0.5484, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3583, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9067, device='cuda:0', grad_fn=<AddBackward0>)



 63%|██████▎   | 222/354 [20:11<11:55,  5.42s/it][A

	loss_cls: tensor(0.4325, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1712, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6036, device='cuda:0', grad_fn=<AddBackward0>)



 63%|██████▎   | 223/354 [20:16<11:48,  5.41s/it][A

	loss_cls: tensor(0.4021, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3185, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7207, device='cuda:0', grad_fn=<AddBackward0>)



 63%|██████▎   | 224/354 [20:22<11:45,  5.43s/it][A

	loss_cls: tensor(0.6516, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3865, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0381, device='cuda:0', grad_fn=<AddBackward0>)



 64%|██████▎   | 225/354 [20:27<11:38,  5.42s/it][A

	loss_cls: tensor(0.4320, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2488, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6808, device='cuda:0', grad_fn=<AddBackward0>)



 64%|██████▍   | 226/354 [20:32<11:31,  5.40s/it][A

	loss_cls: tensor(0.5185, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3777, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8962, device='cuda:0', grad_fn=<AddBackward0>)



 64%|██████▍   | 227/354 [20:38<11:28,  5.42s/it][A

	loss_cls: tensor(0.6356, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0889, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7246, device='cuda:0', grad_fn=<AddBackward0>)



 64%|██████▍   | 228/354 [20:43<11:23,  5.42s/it][A

	loss_cls: tensor(0.5078, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4945, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0023, device='cuda:0', grad_fn=<AddBackward0>)



 65%|██████▍   | 229/354 [20:49<11:22,  5.46s/it][A

	loss_cls: tensor(0.5270, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4792, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0061, device='cuda:0', grad_fn=<AddBackward0>)



 65%|██████▍   | 230/354 [20:54<11:16,  5.45s/it][A

	loss_cls: tensor(0.7211, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1887, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9098, device='cuda:0', grad_fn=<AddBackward0>)



 65%|██████▌   | 231/354 [21:00<11:14,  5.49s/it][A

	loss_cls: tensor(0.5653, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2654, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8307, device='cuda:0', grad_fn=<AddBackward0>)



 66%|██████▌   | 232/354 [21:05<11:08,  5.48s/it][A

	loss_cls: tensor(0.8396, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1514, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9910, device='cuda:0', grad_fn=<AddBackward0>)



 66%|██████▌   | 233/354 [21:11<11:01,  5.47s/it][A

	loss_cls: tensor(0.6439, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2257, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8696, device='cuda:0', grad_fn=<AddBackward0>)



 66%|██████▌   | 234/354 [21:16<10:58,  5.48s/it][A

	loss_cls: tensor(0.9625, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1208, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0833, device='cuda:0', grad_fn=<AddBackward0>)



 66%|██████▋   | 235/354 [21:22<10:50,  5.47s/it][A

	loss_cls: tensor(0.7938, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2919, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0857, device='cuda:0', grad_fn=<AddBackward0>)



 67%|██████▋   | 236/354 [21:27<10:47,  5.49s/it][A

	loss_cls: tensor(0.4879, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3811, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8689, device='cuda:0', grad_fn=<AddBackward0>)



 67%|██████▋   | 237/354 [21:33<10:40,  5.48s/it][A

	loss_cls: tensor(0.5911, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2133, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8044, device='cuda:0', grad_fn=<AddBackward0>)



 67%|██████▋   | 238/354 [21:38<10:38,  5.51s/it][A

	loss_cls: tensor(0.6210, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3175, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9385, device='cuda:0', grad_fn=<AddBackward0>)



 68%|██████▊   | 239/354 [21:44<10:31,  5.49s/it][A

	loss_cls: tensor(0.6886, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4613, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1500, device='cuda:0', grad_fn=<AddBackward0>)



 68%|██████▊   | 240/354 [21:49<10:24,  5.47s/it][A

	loss_cls: tensor(0.5973, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2002, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7974, device='cuda:0', grad_fn=<AddBackward0>)



 68%|██████▊   | 241/354 [21:55<10:20,  5.49s/it][A

	loss_cls: tensor(0.6038, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2453, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8490, device='cuda:0', grad_fn=<AddBackward0>)



 68%|██████▊   | 242/354 [22:00<10:13,  5.48s/it][A

	loss_cls: tensor(0.5330, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1654, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6984, device='cuda:0', grad_fn=<AddBackward0>)



 69%|██████▊   | 243/354 [22:06<10:10,  5.50s/it][A

	loss_cls: tensor(0.4628, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2702, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7330, device='cuda:0', grad_fn=<AddBackward0>)



 69%|██████▉   | 244/354 [22:11<10:02,  5.48s/it][A

	loss_cls: tensor(0.7116, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2405, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9521, device='cuda:0', grad_fn=<AddBackward0>)



 69%|██████▉   | 245/354 [22:17<09:59,  5.50s/it][A

	loss_cls: tensor(0.5954, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1866, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7820, device='cuda:0', grad_fn=<AddBackward0>)



 69%|██████▉   | 246/354 [22:22<09:51,  5.48s/it][A

	loss_cls: tensor(0.6259, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4190, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0449, device='cuda:0', grad_fn=<AddBackward0>)



 70%|██████▉   | 247/354 [22:28<09:43,  5.45s/it][A

	loss_cls: tensor(0.5455, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4726, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0181, device='cuda:0', grad_fn=<AddBackward0>)



 70%|███████   | 248/354 [22:33<09:39,  5.46s/it][A

	loss_cls: tensor(0.5436, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2472, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7908, device='cuda:0', grad_fn=<AddBackward0>)



 70%|███████   | 249/354 [22:38<09:32,  5.45s/it][A

	loss_cls: tensor(0.6821, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1189, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8010, device='cuda:0', grad_fn=<AddBackward0>)



 71%|███████   | 250/354 [22:44<09:29,  5.48s/it][A

	loss_cls: tensor(0.8317, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4471, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2788, device='cuda:0', grad_fn=<AddBackward0>)



 71%|███████   | 251/354 [22:49<09:22,  5.46s/it][A

	loss_cls: tensor(1.0684, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5306, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.5990, device='cuda:0', grad_fn=<AddBackward0>)



 71%|███████   | 252/354 [22:55<09:19,  5.48s/it][A

	loss_cls: tensor(0.8425, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2146, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0572, device='cuda:0', grad_fn=<AddBackward0>)



 71%|███████▏  | 253/354 [23:00<09:11,  5.46s/it][A

	loss_cls: tensor(0.7716, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0908, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8624, device='cuda:0', grad_fn=<AddBackward0>)



 72%|███████▏  | 254/354 [23:06<09:04,  5.45s/it][A

	loss_cls: tensor(0.5191, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0873, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6064, device='cuda:0', grad_fn=<AddBackward0>)



 72%|███████▏  | 255/354 [23:11<09:00,  5.46s/it][A

	loss_cls: tensor(0.5729, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1559, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7288, device='cuda:0', grad_fn=<AddBackward0>)



 72%|███████▏  | 256/354 [23:17<08:54,  5.46s/it][A

	loss_cls: tensor(0.5208, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2757, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7965, device='cuda:0', grad_fn=<AddBackward0>)



 73%|███████▎  | 257/354 [23:22<08:51,  5.48s/it][A

	loss_cls: tensor(0.5719, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2770, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8489, device='cuda:0', grad_fn=<AddBackward0>)



 73%|███████▎  | 258/354 [23:28<08:45,  5.48s/it][A

	loss_cls: tensor(0.8119, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3314, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1433, device='cuda:0', grad_fn=<AddBackward0>)



 73%|███████▎  | 259/354 [23:33<08:41,  5.49s/it][A

	loss_cls: tensor(0.4944, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1576, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6520, device='cuda:0', grad_fn=<AddBackward0>)



 73%|███████▎  | 260/354 [23:39<08:35,  5.48s/it][A

	loss_cls: tensor(0.6729, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1676, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8405, device='cuda:0', grad_fn=<AddBackward0>)



 74%|███████▎  | 261/354 [23:44<08:27,  5.46s/it][A

	loss_cls: tensor(0.6170, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2697, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8867, device='cuda:0', grad_fn=<AddBackward0>)



 74%|███████▍  | 262/354 [23:50<08:23,  5.47s/it][A

	loss_cls: tensor(0.6868, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0238, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7106, device='cuda:0', grad_fn=<AddBackward0>)



 74%|███████▍  | 263/354 [23:55<08:14,  5.44s/it][A

	loss_cls: tensor(0.7869, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3391, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1260, device='cuda:0', grad_fn=<AddBackward0>)



 75%|███████▍  | 264/354 [24:00<08:11,  5.46s/it][A

	loss_cls: tensor(0.5031, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4487, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9518, device='cuda:0', grad_fn=<AddBackward0>)



 75%|███████▍  | 265/354 [24:06<08:03,  5.43s/it][A

	loss_cls: tensor(0.5208, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1965, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7174, device='cuda:0', grad_fn=<AddBackward0>)



 75%|███████▌  | 266/354 [24:11<07:58,  5.43s/it][A

	loss_cls: tensor(0.3978, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1864, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5843, device='cuda:0', grad_fn=<AddBackward0>)



 75%|███████▌  | 267/354 [24:17<07:50,  5.41s/it][A

	loss_cls: tensor(0.8433, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1894, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0327, device='cuda:0', grad_fn=<AddBackward0>)



 76%|███████▌  | 268/354 [24:22<07:43,  5.39s/it][A

	loss_cls: tensor(0.4733, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0465, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5198, device='cuda:0', grad_fn=<AddBackward0>)



 76%|███████▌  | 269/354 [24:27<07:38,  5.40s/it][A

	loss_cls: tensor(0.6121, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1766, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7887, device='cuda:0', grad_fn=<AddBackward0>)



 76%|███████▋  | 270/354 [24:33<07:32,  5.39s/it][A

	loss_cls: tensor(0.7158, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1002, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8160, device='cuda:0', grad_fn=<AddBackward0>)



 77%|███████▋  | 271/354 [24:38<07:28,  5.40s/it][A

	loss_cls: tensor(0.7185, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2060, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9244, device='cuda:0', grad_fn=<AddBackward0>)



 77%|███████▋  | 272/354 [24:44<07:22,  5.39s/it][A

	loss_cls: tensor(0.8894, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2362, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1256, device='cuda:0', grad_fn=<AddBackward0>)



 77%|███████▋  | 273/354 [24:49<07:17,  5.40s/it][A

	loss_cls: tensor(0.7791, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3061, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0853, device='cuda:0', grad_fn=<AddBackward0>)



 77%|███████▋  | 274/354 [24:54<07:11,  5.39s/it][A

	loss_cls: tensor(0.8656, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4831, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.3486, device='cuda:0', grad_fn=<AddBackward0>)



 78%|███████▊  | 275/354 [25:00<07:05,  5.39s/it][A

	loss_cls: tensor(0.7397, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4625, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2022, device='cuda:0', grad_fn=<AddBackward0>)



 78%|███████▊  | 276/354 [25:05<07:02,  5.42s/it][A

	loss_cls: tensor(0.6980, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3531, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0511, device='cuda:0', grad_fn=<AddBackward0>)



 78%|███████▊  | 277/354 [25:11<06:56,  5.41s/it][A

	loss_cls: tensor(0.7572, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2746, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0318, device='cuda:0', grad_fn=<AddBackward0>)



 79%|███████▊  | 278/354 [25:16<06:52,  5.42s/it][A

	loss_cls: tensor(0.5454, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2698, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8152, device='cuda:0', grad_fn=<AddBackward0>)



 79%|███████▉  | 279/354 [25:21<06:45,  5.40s/it][A

	loss_cls: tensor(0.5462, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2579, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8041, device='cuda:0', grad_fn=<AddBackward0>)



 79%|███████▉  | 280/354 [25:27<06:40,  5.41s/it][A

	loss_cls: tensor(1.0451, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2383, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2834, device='cuda:0', grad_fn=<AddBackward0>)



 79%|███████▉  | 281/354 [25:32<06:33,  5.39s/it][A

	loss_cls: tensor(0.5917, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2371, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8288, device='cuda:0', grad_fn=<AddBackward0>)



 80%|███████▉  | 282/354 [25:38<06:28,  5.39s/it][A

	loss_cls: tensor(0.5977, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2137, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8115, device='cuda:0', grad_fn=<AddBackward0>)



 80%|███████▉  | 283/354 [25:43<06:23,  5.40s/it][A

	loss_cls: tensor(0.5252, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2283, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7535, device='cuda:0', grad_fn=<AddBackward0>)



 80%|████████  | 284/354 [25:48<06:16,  5.38s/it][A

	loss_cls: tensor(0.5637, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2687, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8324, device='cuda:0', grad_fn=<AddBackward0>)



 81%|████████  | 285/354 [25:54<06:12,  5.40s/it][A

	loss_cls: tensor(0.5757, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2532, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8289, device='cuda:0', grad_fn=<AddBackward0>)



 81%|████████  | 286/354 [25:59<06:06,  5.39s/it][A

	loss_cls: tensor(0.5609, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1248, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6857, device='cuda:0', grad_fn=<AddBackward0>)



 81%|████████  | 287/354 [26:05<06:01,  5.40s/it][A

	loss_cls: tensor(0.5827, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2732, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8559, device='cuda:0', grad_fn=<AddBackward0>)



 81%|████████▏ | 288/354 [26:10<05:54,  5.38s/it][A

	loss_cls: tensor(0.5696, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2137, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7833, device='cuda:0', grad_fn=<AddBackward0>)



 82%|████████▏ | 289/354 [26:15<05:50,  5.40s/it][A

	loss_cls: tensor(0.6322, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1105, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7427, device='cuda:0', grad_fn=<AddBackward0>)



 82%|████████▏ | 290/354 [26:21<05:44,  5.39s/it][A

	loss_cls: tensor(0.8210, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5150, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.3360, device='cuda:0', grad_fn=<AddBackward0>)



 82%|████████▏ | 291/354 [26:26<05:38,  5.38s/it][A

	loss_cls: tensor(0.5367, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3302, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8669, device='cuda:0', grad_fn=<AddBackward0>)



 82%|████████▏ | 292/354 [26:31<05:34,  5.40s/it][A

	loss_cls: tensor(0.7068, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1514, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8582, device='cuda:0', grad_fn=<AddBackward0>)



 83%|████████▎ | 293/354 [26:37<05:28,  5.38s/it][A

	loss_cls: tensor(0.5216, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1317, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6533, device='cuda:0', grad_fn=<AddBackward0>)



 83%|████████▎ | 294/354 [26:42<05:23,  5.40s/it][A

	loss_cls: tensor(0.6208, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1622, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7830, device='cuda:0', grad_fn=<AddBackward0>)



 83%|████████▎ | 295/354 [26:48<05:17,  5.38s/it][A

	loss_cls: tensor(0.6151, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2977, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9127, device='cuda:0', grad_fn=<AddBackward0>)



 84%|████████▎ | 296/354 [26:53<05:13,  5.40s/it][A

	loss_cls: tensor(0.5980, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0461, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6441, device='cuda:0', grad_fn=<AddBackward0>)



 84%|████████▍ | 297/354 [26:58<05:07,  5.39s/it][A

	loss_cls: tensor(0.7051, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2129, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9180, device='cuda:0', grad_fn=<AddBackward0>)



 84%|████████▍ | 298/354 [27:04<05:01,  5.38s/it][A

	loss_cls: tensor(0.6530, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1260, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7790, device='cuda:0', grad_fn=<AddBackward0>)



 84%|████████▍ | 299/354 [27:09<04:57,  5.41s/it][A

	loss_cls: tensor(0.7991, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3275, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1266, device='cuda:0', grad_fn=<AddBackward0>)



 85%|████████▍ | 300/354 [27:15<04:51,  5.40s/it][A

	loss_cls: tensor(0.2710, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0846, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.3556, device='cuda:0', grad_fn=<AddBackward0>)



 85%|████████▌ | 301/354 [27:20<04:46,  5.41s/it][A

	loss_cls: tensor(0.4858, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1508, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6366, device='cuda:0', grad_fn=<AddBackward0>)



 85%|████████▌ | 302/354 [27:25<04:40,  5.39s/it][A

	loss_cls: tensor(0.8917, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2037, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0955, device='cuda:0', grad_fn=<AddBackward0>)



 86%|████████▌ | 303/354 [27:31<04:36,  5.42s/it][A

	loss_cls: tensor(0.4301, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0729, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5030, device='cuda:0', grad_fn=<AddBackward0>)



 86%|████████▌ | 304/354 [27:36<04:30,  5.41s/it][A

	loss_cls: tensor(0.2739, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0572, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.3311, device='cuda:0', grad_fn=<AddBackward0>)



 86%|████████▌ | 305/354 [27:42<04:25,  5.41s/it][A

	loss_cls: tensor(0.7616, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1272, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8889, device='cuda:0', grad_fn=<AddBackward0>)



 86%|████████▋ | 306/354 [27:47<04:20,  5.42s/it][A

	loss_cls: tensor(0.8993, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1747, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0740, device='cuda:0', grad_fn=<AddBackward0>)



 87%|████████▋ | 307/354 [27:53<04:13,  5.40s/it][A

	loss_cls: tensor(0.5484, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1385, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6868, device='cuda:0', grad_fn=<AddBackward0>)



 87%|████████▋ | 308/354 [27:58<04:08,  5.41s/it][A

	loss_cls: tensor(0.6852, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1701, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8553, device='cuda:0', grad_fn=<AddBackward0>)



 87%|████████▋ | 309/354 [28:03<04:02,  5.40s/it][A

	loss_cls: tensor(0.5695, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1821, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7516, device='cuda:0', grad_fn=<AddBackward0>)



 88%|████████▊ | 310/354 [28:09<03:57,  5.41s/it][A

	loss_cls: tensor(0.6768, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1869, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8637, device='cuda:0', grad_fn=<AddBackward0>)



 88%|████████▊ | 311/354 [28:14<03:52,  5.40s/it][A

	loss_cls: tensor(0.5849, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0172, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6021, device='cuda:0', grad_fn=<AddBackward0>)



 88%|████████▊ | 312/354 [28:20<03:46,  5.40s/it][A

	loss_cls: tensor(0.4507, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0162, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4669, device='cuda:0', grad_fn=<AddBackward0>)



 88%|████████▊ | 313/354 [28:25<03:41,  5.41s/it][A

	loss_cls: tensor(0.8693, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0773, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9466, device='cuda:0', grad_fn=<AddBackward0>)



 89%|████████▊ | 314/354 [28:30<03:35,  5.40s/it][A

	loss_cls: tensor(0.6285, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2407, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8693, device='cuda:0', grad_fn=<AddBackward0>)



 89%|████████▉ | 315/354 [28:36<03:31,  5.41s/it][A

	loss_cls: tensor(0.5156, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3247, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8403, device='cuda:0', grad_fn=<AddBackward0>)



 89%|████████▉ | 316/354 [28:41<03:25,  5.40s/it][A

	loss_cls: tensor(0.5942, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2448, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8390, device='cuda:0', grad_fn=<AddBackward0>)



 90%|████████▉ | 317/354 [28:47<03:20,  5.42s/it][A

	loss_cls: tensor(0.7108, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3859, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0967, device='cuda:0', grad_fn=<AddBackward0>)



 90%|████████▉ | 318/354 [28:52<03:14,  5.41s/it][A

	loss_cls: tensor(0.7629, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3183, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0812, device='cuda:0', grad_fn=<AddBackward0>)



 90%|█████████ | 319/354 [28:57<03:09,  5.40s/it][A

	loss_cls: tensor(0.7461, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2551, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0012, device='cuda:0', grad_fn=<AddBackward0>)



 90%|█████████ | 320/354 [29:03<03:04,  5.42s/it][A

	loss_cls: tensor(0.6579, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1390, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7969, device='cuda:0', grad_fn=<AddBackward0>)



 91%|█████████ | 321/354 [29:08<02:58,  5.40s/it][A

	loss_cls: tensor(0.4787, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3407, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8194, device='cuda:0', grad_fn=<AddBackward0>)



 91%|█████████ | 322/354 [29:14<02:53,  5.42s/it][A

	loss_cls: tensor(0.4762, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2562, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7324, device='cuda:0', grad_fn=<AddBackward0>)



 91%|█████████ | 323/354 [29:19<02:47,  5.41s/it][A

	loss_cls: tensor(0.7964, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1093, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9057, device='cuda:0', grad_fn=<AddBackward0>)



 92%|█████████▏| 324/354 [29:25<02:42,  5.43s/it][A

	loss_cls: tensor(0.4976, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3183, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8159, device='cuda:0', grad_fn=<AddBackward0>)



 92%|█████████▏| 325/354 [29:30<02:37,  5.41s/it][A

	loss_cls: tensor(0.6284, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3037, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9321, device='cuda:0', grad_fn=<AddBackward0>)



 92%|█████████▏| 326/354 [29:35<02:31,  5.40s/it][A

	loss_cls: tensor(0.5548, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2686, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8235, device='cuda:0', grad_fn=<AddBackward0>)



 92%|█████████▏| 327/354 [29:41<02:26,  5.41s/it][A

	loss_cls: tensor(0.7413, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1433, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8846, device='cuda:0', grad_fn=<AddBackward0>)



 93%|█████████▎| 328/354 [29:46<02:20,  5.40s/it][A

	loss_cls: tensor(0.8108, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2852, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0960, device='cuda:0', grad_fn=<AddBackward0>)



 93%|█████████▎| 329/354 [29:52<02:15,  5.42s/it][A

	loss_cls: tensor(0.5295, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1269, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6564, device='cuda:0', grad_fn=<AddBackward0>)



 93%|█████████▎| 330/354 [29:57<02:09,  5.40s/it][A

	loss_cls: tensor(0.8433, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0813, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9246, device='cuda:0', grad_fn=<AddBackward0>)



 94%|█████████▎| 331/354 [30:02<02:04,  5.42s/it][A

	loss_cls: tensor(0.5344, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3477, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8821, device='cuda:0', grad_fn=<AddBackward0>)



 94%|█████████▍| 332/354 [30:08<01:59,  5.41s/it][A

	loss_cls: tensor(0.5251, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1957, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7207, device='cuda:0', grad_fn=<AddBackward0>)



 94%|█████████▍| 333/354 [30:13<01:53,  5.40s/it][A

	loss_cls: tensor(0.5627, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5308, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0936, device='cuda:0', grad_fn=<AddBackward0>)



 94%|█████████▍| 334/354 [30:19<01:48,  5.42s/it][A

	loss_cls: tensor(0.7923, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2990, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0914, device='cuda:0', grad_fn=<AddBackward0>)



 95%|█████████▍| 335/354 [30:24<01:42,  5.40s/it][A

	loss_cls: tensor(0.5912, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1822, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7734, device='cuda:0', grad_fn=<AddBackward0>)



 95%|█████████▍| 336/354 [30:29<01:37,  5.43s/it][A

	loss_cls: tensor(0.6073, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1154, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7226, device='cuda:0', grad_fn=<AddBackward0>)



 95%|█████████▌| 337/354 [30:35<01:32,  5.43s/it][A

	loss_cls: tensor(0.8308, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1380, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9688, device='cuda:0', grad_fn=<AddBackward0>)



 95%|█████████▌| 338/354 [30:40<01:26,  5.43s/it][A

	loss_cls: tensor(0.4552, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1015, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5568, device='cuda:0', grad_fn=<AddBackward0>)



 96%|█████████▌| 339/354 [30:46<01:21,  5.42s/it][A

	loss_cls: tensor(0.4845, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2790, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7635, device='cuda:0', grad_fn=<AddBackward0>)



 96%|█████████▌| 340/354 [30:51<01:15,  5.40s/it][A

	loss_cls: tensor(0.5338, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2718, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8057, device='cuda:0', grad_fn=<AddBackward0>)



 96%|█████████▋| 341/354 [30:57<01:10,  5.42s/it][A

	loss_cls: tensor(0.9079, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1429, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0508, device='cuda:0', grad_fn=<AddBackward0>)



 97%|█████████▋| 342/354 [31:02<01:04,  5.41s/it][A

	loss_cls: tensor(0.5405, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2058, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7463, device='cuda:0', grad_fn=<AddBackward0>)



 97%|█████████▋| 343/354 [31:07<00:59,  5.43s/it][A

	loss_cls: tensor(0.8144, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2077, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0221, device='cuda:0', grad_fn=<AddBackward0>)



 97%|█████████▋| 344/354 [31:13<00:54,  5.42s/it][A

	loss_cls: tensor(0.6726, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2366, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9092, device='cuda:0', grad_fn=<AddBackward0>)



 97%|█████████▋| 345/354 [31:18<00:48,  5.43s/it][A

	loss_cls: tensor(0.8852, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3106, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1957, device='cuda:0', grad_fn=<AddBackward0>)



 98%|█████████▊| 346/354 [31:24<00:43,  5.41s/it][A

	loss_cls: tensor(0.5983, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2513, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8496, device='cuda:0', grad_fn=<AddBackward0>)



 98%|█████████▊| 347/354 [31:29<00:37,  5.41s/it][A

	loss_cls: tensor(0.7019, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2469, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9488, device='cuda:0', grad_fn=<AddBackward0>)



 98%|█████████▊| 348/354 [31:34<00:32,  5.42s/it][A

	loss_cls: tensor(0.8506, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3449, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1955, device='cuda:0', grad_fn=<AddBackward0>)



 99%|█████████▊| 349/354 [31:40<00:27,  5.41s/it][A

	loss_cls: tensor(0.7153, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2789, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9943, device='cuda:0', grad_fn=<AddBackward0>)



 99%|█████████▉| 350/354 [31:45<00:21,  5.43s/it][A

	loss_cls: tensor(0.6576, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1429, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8004, device='cuda:0', grad_fn=<AddBackward0>)



 99%|█████████▉| 351/354 [31:51<00:16,  5.41s/it][A

	loss_cls: tensor(0.4828, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1693, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6521, device='cuda:0', grad_fn=<AddBackward0>)



 99%|█████████▉| 352/354 [31:56<00:10,  5.43s/it][A

	loss_cls: tensor(0.5798, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1714, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7512, device='cuda:0', grad_fn=<AddBackward0>)



100%|█████████▉| 353/354 [32:02<00:05,  5.42s/it][A

	loss_cls: tensor(0.4867, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1290, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6157, device='cuda:0', grad_fn=<AddBackward0>)



100%|██████████| 354/354 [32:04<00:00,  5.44s/it][A

  0%|          | 0/177 [00:00<?, ?it/s][A

	loss_cls: tensor(0.6568, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2781, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9349, device='cuda:0', grad_fn=<AddBackward0>)

	Training Loss: 0.8626345995455812

	Training cls acc: 0.6803790018832392

	Training cls prec: 0.5670065202374525

	Training cls rec: 0.5958320457790797

	Training cls f1: 0.5179479077577911

--
	Training ner acc: 0.9553383877273349

	Training ner prec: 0.26861811129604346

	Training ner rec: 0.2754919102166797

	Training ner f1: 0.2713494728190745

	Current Learning rate:  0.0008857142857142857



  1%|          | 1/177 [00:00<02:08,  1.37it/s][A
  1%|          | 2/177 [00:01<02:06,  1.38it/s][A
  2%|▏         | 3/177 [00:02<02:06,  1.37it/s][A
  2%|▏         | 4/177 [00:02<01:59,  1.44it/s][A
  3%|▎         | 5/177 [00:03<02:01,  1.42it/s][A
  3%|▎         | 6/177 [00:04<02:01,  1.40it/s][A
  4%|▍         | 7/177 [00:04<01:57,  1.45it/s][A
  5%|▍         | 8/177 [00:05<01:58,  1.43it/s][A
  5%|▌         | 9/177 [00:06<01:58,  1.41it/s][A
  6%|▌         | 10/177 [00:07<01:59,  1.40it/s][A
  6%|▌         | 11/177 [00:07<01:55,  1.44it/s][A
  7%|▋         | 12/177 [00:08<01:56,  1.42it/s][A
  7%|▋         | 13/177 [00:09<01:57,  1.40it/s][A
  8%|▊         | 14/177 [00:09<01:56,  1.39it/s][A
  8%|▊         | 15/177 [00:10<01:52,  1.44it/s][A
  9%|▉         | 16/177 [00:11<01:53,  1.42it/s][A
 10%|▉         | 17/177 [00:12<01:53,  1.41it/s][A
 10%|█         | 18/177 [00:12<01:49,  1.45it/s][A
 11%|█         | 19/177 [00:13<01:50,  1.42it/s][A
 11%|█▏        | 20/


	Validation Loss: 0.8368406885087827

	Validation cls acc: 0.5948681732580037

	Validation cls prec: 0.5802596179714823

	Validation cls rec: 0.540624159268227

	Validation cls f1: 0.4859113893012198

--
	Validation ner acc: 0.9537783302749114

	Validation ner prec: 0.4317942257688138

	Validation ner rec: 0.44227871939736346

	Validation ner f1: 0.43680666007323915



  0%|          | 1/354 [00:05<32:05,  5.45s/it][A

	loss_cls: tensor(0.6738, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0362, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7100, device='cuda:0', grad_fn=<AddBackward0>)



  1%|          | 2/354 [00:11<32:20,  5.51s/it][A

	loss_cls: tensor(0.5721, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2811, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8532, device='cuda:0', grad_fn=<AddBackward0>)



  1%|          | 3/354 [00:16<32:02,  5.48s/it][A

	loss_cls: tensor(0.7385, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2409, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9795, device='cuda:0', grad_fn=<AddBackward0>)



  1%|          | 4/354 [00:21<32:04,  5.50s/it][A

	loss_cls: tensor(0.6176, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1876, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8053, device='cuda:0', grad_fn=<AddBackward0>)



  1%|▏         | 5/354 [00:27<31:51,  5.48s/it][A

	loss_cls: tensor(0.4547, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0302, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4849, device='cuda:0', grad_fn=<AddBackward0>)



  2%|▏         | 6/354 [00:32<31:52,  5.49s/it][A

	loss_cls: tensor(0.6199, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2699, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8898, device='cuda:0', grad_fn=<AddBackward0>)



  2%|▏         | 7/354 [00:38<31:44,  5.49s/it][A

	loss_cls: tensor(0.4827, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2455, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7282, device='cuda:0', grad_fn=<AddBackward0>)



  2%|▏         | 8/354 [00:43<31:38,  5.49s/it][A

	loss_cls: tensor(0.5614, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2108, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7722, device='cuda:0', grad_fn=<AddBackward0>)



  3%|▎         | 9/354 [00:49<31:38,  5.50s/it][A

	loss_cls: tensor(0.6786, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4074, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0861, device='cuda:0', grad_fn=<AddBackward0>)



  3%|▎         | 10/354 [00:54<31:27,  5.49s/it][A

	loss_cls: tensor(0.4247, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0335, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4582, device='cuda:0', grad_fn=<AddBackward0>)



  3%|▎         | 11/354 [01:00<31:27,  5.50s/it][A

	loss_cls: tensor(0.4552, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0831, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5383, device='cuda:0', grad_fn=<AddBackward0>)



  3%|▎         | 12/354 [01:05<31:15,  5.48s/it][A

	loss_cls: tensor(0.6163, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2817, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8980, device='cuda:0', grad_fn=<AddBackward0>)



  4%|▎         | 13/354 [01:11<31:17,  5.51s/it][A

	loss_cls: tensor(0.2741, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1383, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4124, device='cuda:0', grad_fn=<AddBackward0>)



  4%|▍         | 14/354 [01:16<31:07,  5.49s/it][A

	loss_cls: tensor(0.7949, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2489, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0438, device='cuda:0', grad_fn=<AddBackward0>)



  4%|▍         | 15/354 [01:22<30:56,  5.48s/it][A

	loss_cls: tensor(0.5906, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1840, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7746, device='cuda:0', grad_fn=<AddBackward0>)



  5%|▍         | 16/354 [01:27<30:56,  5.49s/it][A

	loss_cls: tensor(0.2807, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2136, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4943, device='cuda:0', grad_fn=<AddBackward0>)



  5%|▍         | 17/354 [01:33<30:44,  5.47s/it][A

	loss_cls: tensor(1.1597, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3982, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.5579, device='cuda:0', grad_fn=<AddBackward0>)



  5%|▌         | 18/354 [01:38<30:43,  5.49s/it][A

	loss_cls: tensor(0.3729, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0851, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4580, device='cuda:0', grad_fn=<AddBackward0>)



  5%|▌         | 19/354 [01:44<30:35,  5.48s/it][A

	loss_cls: tensor(0.5343, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1198, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6540, device='cuda:0', grad_fn=<AddBackward0>)



  6%|▌         | 20/354 [01:49<30:38,  5.50s/it][A

	loss_cls: tensor(0.7114, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2905, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0019, device='cuda:0', grad_fn=<AddBackward0>)



  6%|▌         | 21/354 [01:55<30:28,  5.49s/it][A

	loss_cls: tensor(0.9719, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5200, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.4919, device='cuda:0', grad_fn=<AddBackward0>)



  6%|▌         | 22/354 [02:00<30:19,  5.48s/it][A

	loss_cls: tensor(0.5890, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2355, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8245, device='cuda:0', grad_fn=<AddBackward0>)



  6%|▋         | 23/354 [02:06<30:22,  5.50s/it][A

	loss_cls: tensor(1.1834, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3483, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.5317, device='cuda:0', grad_fn=<AddBackward0>)



  7%|▋         | 24/354 [02:11<30:10,  5.49s/it][A

	loss_cls: tensor(0.4855, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1034, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5890, device='cuda:0', grad_fn=<AddBackward0>)



  7%|▋         | 25/354 [02:17<30:11,  5.51s/it][A

	loss_cls: tensor(0.6202, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3567, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9769, device='cuda:0', grad_fn=<AddBackward0>)



  7%|▋         | 26/354 [02:22<30:04,  5.50s/it][A

	loss_cls: tensor(0.5953, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4382, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0336, device='cuda:0', grad_fn=<AddBackward0>)



  8%|▊         | 27/354 [02:28<30:04,  5.52s/it][A

	loss_cls: tensor(0.8166, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2043, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0209, device='cuda:0', grad_fn=<AddBackward0>)



  8%|▊         | 28/354 [02:33<29:54,  5.50s/it][A

	loss_cls: tensor(0.6255, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1056, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7311, device='cuda:0', grad_fn=<AddBackward0>)



  8%|▊         | 29/354 [02:39<29:47,  5.50s/it][A

	loss_cls: tensor(0.7638, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0852, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8490, device='cuda:0', grad_fn=<AddBackward0>)



  8%|▊         | 30/354 [02:44<29:44,  5.51s/it][A

	loss_cls: tensor(0.5417, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0984, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6401, device='cuda:0', grad_fn=<AddBackward0>)



  9%|▉         | 31/354 [02:50<29:34,  5.50s/it][A

	loss_cls: tensor(0.4993, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1834, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6826, device='cuda:0', grad_fn=<AddBackward0>)



  9%|▉         | 32/354 [02:55<29:33,  5.51s/it][A

	loss_cls: tensor(0.7629, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0912, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8541, device='cuda:0', grad_fn=<AddBackward0>)



  9%|▉         | 33/354 [03:01<29:25,  5.50s/it][A

	loss_cls: tensor(0.3589, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.6789, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0377, device='cuda:0', grad_fn=<AddBackward0>)



 10%|▉         | 34/354 [03:06<28:35,  5.36s/it][A

	loss_cls: tensor(0.4875, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1019, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5894, device='cuda:0', grad_fn=<AddBackward0>)



 10%|▉         | 35/354 [03:11<27:57,  5.26s/it][A

	loss_cls: tensor(0.6159, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3596, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9754, device='cuda:0', grad_fn=<AddBackward0>)



 10%|█         | 36/354 [03:16<27:27,  5.18s/it][A

	loss_cls: tensor(0.5202, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2942, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8144, device='cuda:0', grad_fn=<AddBackward0>)



 10%|█         | 37/354 [03:21<27:37,  5.23s/it][A

	loss_cls: tensor(0.5426, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3958, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9384, device='cuda:0', grad_fn=<AddBackward0>)



 11%|█         | 38/354 [03:27<27:51,  5.29s/it][A

	loss_cls: tensor(0.5631, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2575, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8205, device='cuda:0', grad_fn=<AddBackward0>)



 11%|█         | 39/354 [03:32<28:10,  5.37s/it][A

	loss_cls: tensor(1.1188, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3215, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.4403, device='cuda:0', grad_fn=<AddBackward0>)



 11%|█▏        | 40/354 [03:38<28:13,  5.39s/it][A

	loss_cls: tensor(0.6330, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3007, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9338, device='cuda:0', grad_fn=<AddBackward0>)



 12%|█▏        | 41/354 [03:43<28:20,  5.43s/it][A

	loss_cls: tensor(0.6741, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4278, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1019, device='cuda:0', grad_fn=<AddBackward0>)



 12%|█▏        | 42/354 [03:49<28:16,  5.44s/it][A

	loss_cls: tensor(0.5836, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0917, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6753, device='cuda:0', grad_fn=<AddBackward0>)



 12%|█▏        | 43/354 [03:54<28:12,  5.44s/it][A

	loss_cls: tensor(0.6009, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1719, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7728, device='cuda:0', grad_fn=<AddBackward0>)



 12%|█▏        | 44/354 [04:00<28:14,  5.47s/it][A

	loss_cls: tensor(0.6017, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2223, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8240, device='cuda:0', grad_fn=<AddBackward0>)



 13%|█▎        | 45/354 [04:05<28:04,  5.45s/it][A

	loss_cls: tensor(0.7219, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1889, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9107, device='cuda:0', grad_fn=<AddBackward0>)



 13%|█▎        | 46/354 [04:11<28:04,  5.47s/it][A

	loss_cls: tensor(0.5928, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2794, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8722, device='cuda:0', grad_fn=<AddBackward0>)



 13%|█▎        | 47/354 [04:16<27:58,  5.47s/it][A

	loss_cls: tensor(0.9004, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1248, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0252, device='cuda:0', grad_fn=<AddBackward0>)



 14%|█▎        | 48/354 [04:22<27:58,  5.48s/it][A

	loss_cls: tensor(0.6515, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2329, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8844, device='cuda:0', grad_fn=<AddBackward0>)



 14%|█▍        | 49/354 [04:27<27:48,  5.47s/it][A

	loss_cls: tensor(0.9290, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.6498, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.5787, device='cuda:0', grad_fn=<AddBackward0>)



 14%|█▍        | 50/354 [04:32<27:38,  5.46s/it][A

	loss_cls: tensor(0.5934, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1105, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7038, device='cuda:0', grad_fn=<AddBackward0>)



 14%|█▍        | 51/354 [04:38<27:38,  5.47s/it][A

	loss_cls: tensor(0.6096, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1503, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7599, device='cuda:0', grad_fn=<AddBackward0>)



 15%|█▍        | 52/354 [04:43<27:28,  5.46s/it][A

	loss_cls: tensor(0.4055, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2511, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6566, device='cuda:0', grad_fn=<AddBackward0>)



 15%|█▍        | 53/354 [04:49<27:29,  5.48s/it][A

	loss_cls: tensor(0.4587, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1832, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6419, device='cuda:0', grad_fn=<AddBackward0>)



 15%|█▌        | 54/354 [04:54<27:21,  5.47s/it][A

	loss_cls: tensor(0.7718, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4513, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2230, device='cuda:0', grad_fn=<AddBackward0>)



 16%|█▌        | 55/354 [05:00<27:24,  5.50s/it][A

	loss_cls: tensor(0.4880, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2959, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7839, device='cuda:0', grad_fn=<AddBackward0>)



 16%|█▌        | 56/354 [05:05<27:13,  5.48s/it][A

	loss_cls: tensor(0.6022, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1751, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7772, device='cuda:0', grad_fn=<AddBackward0>)



 16%|█▌        | 57/354 [05:11<27:01,  5.46s/it][A

	loss_cls: tensor(0.5827, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2588, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8415, device='cuda:0', grad_fn=<AddBackward0>)



 16%|█▋        | 58/354 [05:16<27:00,  5.47s/it][A

	loss_cls: tensor(0.4974, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3495, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8469, device='cuda:0', grad_fn=<AddBackward0>)



 17%|█▋        | 59/354 [05:22<26:51,  5.46s/it][A

	loss_cls: tensor(0.7034, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1675, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8709, device='cuda:0', grad_fn=<AddBackward0>)



 17%|█▋        | 60/354 [05:27<26:52,  5.49s/it][A

	loss_cls: tensor(0.5656, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2866, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8522, device='cuda:0', grad_fn=<AddBackward0>)



 17%|█▋        | 61/354 [05:33<26:43,  5.47s/it][A

	loss_cls: tensor(0.6665, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3094, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9759, device='cuda:0', grad_fn=<AddBackward0>)



 18%|█▊        | 62/354 [05:38<26:45,  5.50s/it][A

	loss_cls: tensor(0.6274, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3621, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9894, device='cuda:0', grad_fn=<AddBackward0>)



 18%|█▊        | 63/354 [05:44<26:34,  5.48s/it][A

	loss_cls: tensor(0.7089, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2451, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9540, device='cuda:0', grad_fn=<AddBackward0>)



 18%|█▊        | 64/354 [05:49<26:24,  5.46s/it][A

	loss_cls: tensor(0.6365, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2928, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9293, device='cuda:0', grad_fn=<AddBackward0>)



 18%|█▊        | 65/354 [05:55<26:21,  5.47s/it][A

	loss_cls: tensor(0.7261, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1517, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8778, device='cuda:0', grad_fn=<AddBackward0>)



 19%|█▊        | 66/354 [06:00<26:14,  5.47s/it][A

	loss_cls: tensor(0.5822, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0577, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6399, device='cuda:0', grad_fn=<AddBackward0>)



 19%|█▉        | 67/354 [06:06<26:16,  5.49s/it][A

	loss_cls: tensor(0.6330, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1101, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7431, device='cuda:0', grad_fn=<AddBackward0>)



 19%|█▉        | 68/354 [06:11<26:09,  5.49s/it][A

	loss_cls: tensor(0.6399, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3449, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9848, device='cuda:0', grad_fn=<AddBackward0>)



 19%|█▉        | 69/354 [06:17<26:05,  5.49s/it][A

	loss_cls: tensor(0.6117, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2765, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8883, device='cuda:0', grad_fn=<AddBackward0>)



 20%|█▉        | 70/354 [06:22<25:53,  5.47s/it][A

	loss_cls: tensor(0.7525, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2990, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0515, device='cuda:0', grad_fn=<AddBackward0>)



 20%|██        | 71/354 [06:27<25:44,  5.46s/it][A

	loss_cls: tensor(0.7609, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4085, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1694, device='cuda:0', grad_fn=<AddBackward0>)



 20%|██        | 72/354 [06:33<25:42,  5.47s/it][A

	loss_cls: tensor(0.8874, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1638, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0512, device='cuda:0', grad_fn=<AddBackward0>)



 21%|██        | 73/354 [06:38<25:33,  5.46s/it][A

	loss_cls: tensor(0.4210, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1292, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5502, device='cuda:0', grad_fn=<AddBackward0>)



 21%|██        | 74/354 [06:44<25:35,  5.48s/it][A

	loss_cls: tensor(0.5502, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1457, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6958, device='cuda:0', grad_fn=<AddBackward0>)



 21%|██        | 75/354 [06:49<25:25,  5.47s/it][A

	loss_cls: tensor(0.5027, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2217, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7244, device='cuda:0', grad_fn=<AddBackward0>)



 21%|██▏       | 76/354 [06:55<25:23,  5.48s/it][A

	loss_cls: tensor(0.5062, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2823, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7885, device='cuda:0', grad_fn=<AddBackward0>)



 22%|██▏       | 77/354 [07:00<25:14,  5.47s/it][A

	loss_cls: tensor(0.7601, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0900, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8502, device='cuda:0', grad_fn=<AddBackward0>)



 22%|██▏       | 78/354 [07:06<25:04,  5.45s/it][A

	loss_cls: tensor(0.5555, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2324, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7879, device='cuda:0', grad_fn=<AddBackward0>)



 22%|██▏       | 79/354 [07:11<25:04,  5.47s/it][A

	loss_cls: tensor(0.4924, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1171, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6095, device='cuda:0', grad_fn=<AddBackward0>)



 23%|██▎       | 80/354 [07:17<24:57,  5.47s/it][A

	loss_cls: tensor(0.5811, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0480, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6291, device='cuda:0', grad_fn=<AddBackward0>)



 23%|██▎       | 81/354 [07:22<24:49,  5.46s/it][A

	loss_cls: tensor(0.8491, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2776, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1267, device='cuda:0', grad_fn=<AddBackward0>)



 23%|██▎       | 82/354 [07:27<24:36,  5.43s/it][A

	loss_cls: tensor(0.6402, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2111, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8513, device='cuda:0', grad_fn=<AddBackward0>)



 23%|██▎       | 83/354 [07:33<24:33,  5.44s/it][A

	loss_cls: tensor(0.8663, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3070, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1734, device='cuda:0', grad_fn=<AddBackward0>)



 24%|██▎       | 84/354 [07:38<24:17,  5.40s/it][A

	loss_cls: tensor(0.9737, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1942, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1680, device='cuda:0', grad_fn=<AddBackward0>)



 24%|██▍       | 85/354 [07:44<24:07,  5.38s/it][A

	loss_cls: tensor(0.5597, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1802, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7399, device='cuda:0', grad_fn=<AddBackward0>)



 24%|██▍       | 86/354 [07:49<24:05,  5.40s/it][A

	loss_cls: tensor(0.9537, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1601, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1138, device='cuda:0', grad_fn=<AddBackward0>)



 25%|██▍       | 87/354 [07:54<23:58,  5.39s/it][A

	loss_cls: tensor(0.5872, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5123, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0995, device='cuda:0', grad_fn=<AddBackward0>)



 25%|██▍       | 88/354 [08:00<23:55,  5.40s/it][A

	loss_cls: tensor(0.6251, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2654, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8906, device='cuda:0', grad_fn=<AddBackward0>)



 25%|██▌       | 89/354 [08:05<23:45,  5.38s/it][A

	loss_cls: tensor(0.5120, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0418, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5538, device='cuda:0', grad_fn=<AddBackward0>)



 25%|██▌       | 90/354 [08:11<23:45,  5.40s/it][A

	loss_cls: tensor(0.5759, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2860, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8619, device='cuda:0', grad_fn=<AddBackward0>)



 26%|██▌       | 91/354 [08:16<23:32,  5.37s/it][A

	loss_cls: tensor(0.5251, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2207, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7458, device='cuda:0', grad_fn=<AddBackward0>)



 26%|██▌       | 92/354 [08:21<23:30,  5.38s/it][A

	loss_cls: tensor(0.7084, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0607, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7691, device='cuda:0', grad_fn=<AddBackward0>)



 26%|██▋       | 93/354 [08:27<23:25,  5.39s/it][A

	loss_cls: tensor(0.5025, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2386, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7411, device='cuda:0', grad_fn=<AddBackward0>)



 27%|██▋       | 94/354 [08:32<23:19,  5.38s/it][A

	loss_cls: tensor(0.6000, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3769, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9769, device='cuda:0', grad_fn=<AddBackward0>)



 27%|██▋       | 95/354 [08:37<23:18,  5.40s/it][A

	loss_cls: tensor(0.5556, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1672, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7228, device='cuda:0', grad_fn=<AddBackward0>)



 27%|██▋       | 96/354 [08:43<23:08,  5.38s/it][A

	loss_cls: tensor(0.7037, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1933, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8970, device='cuda:0', grad_fn=<AddBackward0>)



 27%|██▋       | 97/354 [08:48<23:08,  5.40s/it][A

	loss_cls: tensor(0.7743, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1965, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9707, device='cuda:0', grad_fn=<AddBackward0>)



 28%|██▊       | 98/354 [08:54<22:58,  5.38s/it][A

	loss_cls: tensor(0.5734, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1641, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7375, device='cuda:0', grad_fn=<AddBackward0>)



 28%|██▊       | 99/354 [08:59<22:55,  5.39s/it][A

	loss_cls: tensor(0.7448, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1294, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8742, device='cuda:0', grad_fn=<AddBackward0>)



 28%|██▊       | 100/354 [09:04<22:48,  5.39s/it][A

	loss_cls: tensor(0.4688, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2834, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7522, device='cuda:0', grad_fn=<AddBackward0>)



 29%|██▊       | 101/354 [09:10<22:40,  5.38s/it][A

	loss_cls: tensor(0.5642, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1162, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6804, device='cuda:0', grad_fn=<AddBackward0>)



 29%|██▉       | 102/354 [09:15<22:39,  5.39s/it][A

	loss_cls: tensor(0.6533, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3678, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0211, device='cuda:0', grad_fn=<AddBackward0>)



 29%|██▉       | 103/354 [09:20<22:30,  5.38s/it][A

	loss_cls: tensor(0.4498, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1055, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5553, device='cuda:0', grad_fn=<AddBackward0>)



 29%|██▉       | 104/354 [09:26<22:27,  5.39s/it][A

	loss_cls: tensor(0.6116, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1074, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7190, device='cuda:0', grad_fn=<AddBackward0>)



 30%|██▉       | 105/354 [09:31<22:18,  5.38s/it][A

	loss_cls: tensor(0.7790, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5414, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.3204, device='cuda:0', grad_fn=<AddBackward0>)



 30%|██▉       | 106/354 [09:37<22:18,  5.40s/it][A

	loss_cls: tensor(0.5432, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3213, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8645, device='cuda:0', grad_fn=<AddBackward0>)



 30%|███       | 107/354 [09:42<22:08,  5.38s/it][A

	loss_cls: tensor(0.5370, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1274, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6643, device='cuda:0', grad_fn=<AddBackward0>)



 31%|███       | 108/354 [09:47<22:01,  5.37s/it][A

	loss_cls: tensor(0.6779, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3227, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0005, device='cuda:0', grad_fn=<AddBackward0>)



 31%|███       | 109/354 [09:53<22:00,  5.39s/it][A

	loss_cls: tensor(0.4916, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2798, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7713, device='cuda:0', grad_fn=<AddBackward0>)



 31%|███       | 110/354 [09:58<21:51,  5.38s/it][A

	loss_cls: tensor(0.6470, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4476, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0946, device='cuda:0', grad_fn=<AddBackward0>)



 31%|███▏      | 111/354 [10:04<21:51,  5.40s/it][A

	loss_cls: tensor(0.5982, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1959, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7941, device='cuda:0', grad_fn=<AddBackward0>)



 32%|███▏      | 112/354 [10:09<21:40,  5.37s/it][A

	loss_cls: tensor(0.6869, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4288, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1157, device='cuda:0', grad_fn=<AddBackward0>)



 32%|███▏      | 113/354 [10:14<21:39,  5.39s/it][A

	loss_cls: tensor(0.7192, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2104, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9296, device='cuda:0', grad_fn=<AddBackward0>)



 32%|███▏      | 114/354 [10:20<21:32,  5.39s/it][A

	loss_cls: tensor(0.7424, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2352, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9776, device='cuda:0', grad_fn=<AddBackward0>)



 32%|███▏      | 115/354 [10:25<21:25,  5.38s/it][A

	loss_cls: tensor(0.5803, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2733, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8536, device='cuda:0', grad_fn=<AddBackward0>)



 33%|███▎      | 116/354 [10:31<21:23,  5.39s/it][A

	loss_cls: tensor(0.5205, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2317, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7522, device='cuda:0', grad_fn=<AddBackward0>)



 33%|███▎      | 117/354 [10:36<21:16,  5.38s/it][A

	loss_cls: tensor(0.5659, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1904, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7564, device='cuda:0', grad_fn=<AddBackward0>)



 33%|███▎      | 118/354 [10:41<21:12,  5.39s/it][A

	loss_cls: tensor(0.5643, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2219, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7862, device='cuda:0', grad_fn=<AddBackward0>)



 34%|███▎      | 119/354 [10:47<21:05,  5.39s/it][A

	loss_cls: tensor(0.5342, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3088, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8430, device='cuda:0', grad_fn=<AddBackward0>)



 34%|███▍      | 120/354 [10:52<21:04,  5.41s/it][A

	loss_cls: tensor(0.7071, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2634, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9705, device='cuda:0', grad_fn=<AddBackward0>)



 34%|███▍      | 121/354 [10:57<20:55,  5.39s/it][A

	loss_cls: tensor(0.9207, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3933, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.3139, device='cuda:0', grad_fn=<AddBackward0>)



 34%|███▍      | 122/354 [11:03<20:48,  5.38s/it][A

	loss_cls: tensor(0.9084, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3269, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2353, device='cuda:0', grad_fn=<AddBackward0>)



 35%|███▍      | 123/354 [11:08<20:45,  5.39s/it][A

	loss_cls: tensor(0.6479, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3265, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9744, device='cuda:0', grad_fn=<AddBackward0>)



 35%|███▌      | 124/354 [11:14<20:37,  5.38s/it][A

	loss_cls: tensor(0.6206, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0451, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6656, device='cuda:0', grad_fn=<AddBackward0>)



 35%|███▌      | 125/354 [11:19<20:35,  5.39s/it][A

	loss_cls: tensor(0.4082, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1621, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5704, device='cuda:0', grad_fn=<AddBackward0>)



 36%|███▌      | 126/354 [11:24<20:26,  5.38s/it][A

	loss_cls: tensor(0.5598, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2760, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8357, device='cuda:0', grad_fn=<AddBackward0>)



 36%|███▌      | 127/354 [11:30<20:26,  5.40s/it][A

	loss_cls: tensor(0.6644, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4796, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1440, device='cuda:0', grad_fn=<AddBackward0>)



 36%|███▌      | 128/354 [11:35<20:17,  5.39s/it][A

	loss_cls: tensor(0.6662, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1900, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8562, device='cuda:0', grad_fn=<AddBackward0>)



 36%|███▋      | 129/354 [11:41<20:09,  5.38s/it][A

	loss_cls: tensor(0.4888, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1907, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6796, device='cuda:0', grad_fn=<AddBackward0>)



 37%|███▋      | 130/354 [11:46<20:08,  5.40s/it][A

	loss_cls: tensor(0.5009, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3494, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8503, device='cuda:0', grad_fn=<AddBackward0>)



 37%|███▋      | 131/354 [11:51<20:00,  5.39s/it][A

	loss_cls: tensor(0.5361, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1418, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6780, device='cuda:0', grad_fn=<AddBackward0>)



 37%|███▋      | 132/354 [11:57<19:58,  5.40s/it][A

	loss_cls: tensor(0.5990, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3205, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9195, device='cuda:0', grad_fn=<AddBackward0>)



 38%|███▊      | 133/354 [12:02<19:51,  5.39s/it][A

	loss_cls: tensor(0.6140, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0902, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7042, device='cuda:0', grad_fn=<AddBackward0>)



 38%|███▊      | 134/354 [12:08<19:48,  5.40s/it][A

	loss_cls: tensor(0.6301, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1985, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8286, device='cuda:0', grad_fn=<AddBackward0>)



 38%|███▊      | 135/354 [12:13<19:40,  5.39s/it][A

	loss_cls: tensor(0.5815, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0507, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6322, device='cuda:0', grad_fn=<AddBackward0>)



 38%|███▊      | 136/354 [12:18<19:30,  5.37s/it][A

	loss_cls: tensor(0.4217, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1143, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5360, device='cuda:0', grad_fn=<AddBackward0>)



 39%|███▊      | 137/354 [12:24<19:29,  5.39s/it][A

	loss_cls: tensor(0.8009, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2296, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0305, device='cuda:0', grad_fn=<AddBackward0>)



 39%|███▉      | 138/354 [12:29<19:20,  5.37s/it][A

	loss_cls: tensor(0.8618, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2920, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1538, device='cuda:0', grad_fn=<AddBackward0>)



 39%|███▉      | 139/354 [12:34<19:18,  5.39s/it][A

	loss_cls: tensor(0.6063, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5015, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1079, device='cuda:0', grad_fn=<AddBackward0>)



 40%|███▉      | 140/354 [12:40<19:11,  5.38s/it][A

	loss_cls: tensor(0.8752, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3588, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2340, device='cuda:0', grad_fn=<AddBackward0>)



 40%|███▉      | 141/354 [12:45<19:10,  5.40s/it][A

	loss_cls: tensor(0.3948, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0668, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4617, device='cuda:0', grad_fn=<AddBackward0>)



 40%|████      | 142/354 [12:51<19:01,  5.39s/it][A

	loss_cls: tensor(0.7145, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3038, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0183, device='cuda:0', grad_fn=<AddBackward0>)



 40%|████      | 143/354 [12:56<18:55,  5.38s/it][A

	loss_cls: tensor(0.5904, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0929, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6833, device='cuda:0', grad_fn=<AddBackward0>)



 41%|████      | 144/354 [13:01<18:54,  5.40s/it][A

	loss_cls: tensor(0.3545, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0628, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4173, device='cuda:0', grad_fn=<AddBackward0>)



 41%|████      | 145/354 [13:07<18:45,  5.39s/it][A

	loss_cls: tensor(0.3684, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2174, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5859, device='cuda:0', grad_fn=<AddBackward0>)



 41%|████      | 146/354 [13:12<18:44,  5.40s/it][A

	loss_cls: tensor(0.5051, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0956, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6007, device='cuda:0', grad_fn=<AddBackward0>)



 42%|████▏     | 147/354 [13:18<18:35,  5.39s/it][A

	loss_cls: tensor(0.3855, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1972, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5826, device='cuda:0', grad_fn=<AddBackward0>)



 42%|████▏     | 148/354 [13:23<18:35,  5.41s/it][A

	loss_cls: tensor(0.7956, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3195, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1152, device='cuda:0', grad_fn=<AddBackward0>)



 42%|████▏     | 149/354 [13:28<18:27,  5.40s/it][A

	loss_cls: tensor(0.3148, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0366, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.3515, device='cuda:0', grad_fn=<AddBackward0>)



 42%|████▏     | 150/354 [13:34<18:19,  5.39s/it][A

	loss_cls: tensor(0.7657, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1601, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9258, device='cuda:0', grad_fn=<AddBackward0>)



 43%|████▎     | 151/354 [13:39<18:16,  5.40s/it][A

	loss_cls: tensor(0.5182, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4613, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9795, device='cuda:0', grad_fn=<AddBackward0>)



 43%|████▎     | 152/354 [13:45<18:04,  5.37s/it][A

	loss_cls: tensor(0.6703, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2463, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9166, device='cuda:0', grad_fn=<AddBackward0>)



 43%|████▎     | 153/354 [13:50<18:03,  5.39s/it][A

	loss_cls: tensor(0.9837, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1780, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1617, device='cuda:0', grad_fn=<AddBackward0>)



 44%|████▎     | 154/354 [13:55<17:53,  5.37s/it][A

	loss_cls: tensor(0.3666, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1594, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5260, device='cuda:0', grad_fn=<AddBackward0>)



 44%|████▍     | 155/354 [14:01<17:51,  5.38s/it][A

	loss_cls: tensor(0.4935, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1392, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6327, device='cuda:0', grad_fn=<AddBackward0>)



 44%|████▍     | 156/354 [14:06<17:44,  5.38s/it][A

	loss_cls: tensor(0.6977, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1686, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8664, device='cuda:0', grad_fn=<AddBackward0>)



 44%|████▍     | 157/354 [14:11<17:37,  5.37s/it][A

	loss_cls: tensor(0.6219, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4572, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0791, device='cuda:0', grad_fn=<AddBackward0>)



 45%|████▍     | 158/354 [14:17<17:34,  5.38s/it][A

	loss_cls: tensor(0.8770, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0957, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9727, device='cuda:0', grad_fn=<AddBackward0>)



 45%|████▍     | 159/354 [14:22<17:28,  5.38s/it][A

	loss_cls: tensor(0.5271, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1138, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6409, device='cuda:0', grad_fn=<AddBackward0>)



 45%|████▌     | 160/354 [14:28<17:27,  5.40s/it][A

	loss_cls: tensor(0.4745, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1647, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6393, device='cuda:0', grad_fn=<AddBackward0>)



 45%|████▌     | 161/354 [14:33<17:20,  5.39s/it][A

	loss_cls: tensor(0.7131, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3524, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0655, device='cuda:0', grad_fn=<AddBackward0>)



 46%|████▌     | 162/354 [14:38<17:18,  5.41s/it][A

	loss_cls: tensor(0.5705, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1672, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7377, device='cuda:0', grad_fn=<AddBackward0>)



 46%|████▌     | 163/354 [14:44<17:09,  5.39s/it][A

	loss_cls: tensor(0.6277, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4649, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0926, device='cuda:0', grad_fn=<AddBackward0>)



 46%|████▋     | 164/354 [14:49<17:00,  5.37s/it][A

	loss_cls: tensor(0.5123, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0431, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5554, device='cuda:0', grad_fn=<AddBackward0>)



 47%|████▋     | 165/354 [14:55<16:58,  5.39s/it][A

	loss_cls: tensor(0.5554, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1039, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6593, device='cuda:0', grad_fn=<AddBackward0>)



 47%|████▋     | 166/354 [15:00<16:49,  5.37s/it][A

	loss_cls: tensor(0.6584, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0459, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7043, device='cuda:0', grad_fn=<AddBackward0>)



 47%|████▋     | 167/354 [15:05<16:49,  5.40s/it][A

	loss_cls: tensor(0.5143, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1745, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6888, device='cuda:0', grad_fn=<AddBackward0>)



 47%|████▋     | 168/354 [15:11<16:40,  5.38s/it][A

	loss_cls: tensor(0.4942, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2295, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7237, device='cuda:0', grad_fn=<AddBackward0>)



 48%|████▊     | 169/354 [15:16<16:37,  5.39s/it][A

	loss_cls: tensor(0.7449, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0872, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8321, device='cuda:0', grad_fn=<AddBackward0>)



 48%|████▊     | 170/354 [15:21<16:29,  5.38s/it][A

	loss_cls: tensor(0.7452, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1645, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9097, device='cuda:0', grad_fn=<AddBackward0>)



 48%|████▊     | 171/354 [15:27<16:22,  5.37s/it][A

	loss_cls: tensor(0.7932, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1303, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9235, device='cuda:0', grad_fn=<AddBackward0>)



 49%|████▊     | 172/354 [15:32<16:20,  5.39s/it][A

	loss_cls: tensor(0.6143, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2016, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8160, device='cuda:0', grad_fn=<AddBackward0>)



 49%|████▉     | 173/354 [15:38<16:13,  5.38s/it][A

	loss_cls: tensor(0.3452, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0543, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.3995, device='cuda:0', grad_fn=<AddBackward0>)



 49%|████▉     | 174/354 [15:43<16:09,  5.39s/it][A

	loss_cls: tensor(0.4016, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0838, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4854, device='cuda:0', grad_fn=<AddBackward0>)



 49%|████▉     | 175/354 [15:48<16:03,  5.38s/it][A

	loss_cls: tensor(0.5450, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1476, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6926, device='cuda:0', grad_fn=<AddBackward0>)



 50%|████▉     | 176/354 [15:54<15:58,  5.38s/it][A

	loss_cls: tensor(0.4618, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1182, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5800, device='cuda:0', grad_fn=<AddBackward0>)



 50%|█████     | 177/354 [15:59<15:50,  5.37s/it][A

	loss_cls: tensor(0.7420, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4459, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1879, device='cuda:0', grad_fn=<AddBackward0>)



 50%|█████     | 178/354 [16:04<15:43,  5.36s/it][A

	loss_cls: tensor(0.2052, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0512, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.2564, device='cuda:0', grad_fn=<AddBackward0>)



 51%|█████     | 179/354 [16:10<15:41,  5.38s/it][A

	loss_cls: tensor(0.2903, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0574, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.3477, device='cuda:0', grad_fn=<AddBackward0>)



 51%|█████     | 180/354 [16:15<15:35,  5.38s/it][A

	loss_cls: tensor(0.6178, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1986, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8165, device='cuda:0', grad_fn=<AddBackward0>)



 51%|█████     | 181/354 [16:21<15:34,  5.40s/it][A

	loss_cls: tensor(0.9478, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3599, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.3076, device='cuda:0', grad_fn=<AddBackward0>)



 51%|█████▏    | 182/354 [16:26<15:26,  5.39s/it][A

	loss_cls: tensor(0.7114, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2982, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0096, device='cuda:0', grad_fn=<AddBackward0>)



 52%|█████▏    | 183/354 [16:31<15:24,  5.40s/it][A

	loss_cls: tensor(0.5278, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2325, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7603, device='cuda:0', grad_fn=<AddBackward0>)



 52%|█████▏    | 184/354 [16:37<15:16,  5.39s/it][A

	loss_cls: tensor(0.4686, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0490, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5176, device='cuda:0', grad_fn=<AddBackward0>)



 52%|█████▏    | 185/354 [16:42<15:08,  5.37s/it][A

	loss_cls: tensor(0.8434, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3940, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2374, device='cuda:0', grad_fn=<AddBackward0>)



 53%|█████▎    | 186/354 [16:48<15:06,  5.39s/it][A

	loss_cls: tensor(0.5245, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1892, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7137, device='cuda:0', grad_fn=<AddBackward0>)



 53%|█████▎    | 187/354 [16:53<14:58,  5.38s/it][A

	loss_cls: tensor(0.8964, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1995, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0959, device='cuda:0', grad_fn=<AddBackward0>)



 53%|█████▎    | 188/354 [16:58<14:55,  5.40s/it][A

	loss_cls: tensor(0.6862, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3255, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0117, device='cuda:0', grad_fn=<AddBackward0>)



 53%|█████▎    | 189/354 [17:04<14:48,  5.38s/it][A

	loss_cls: tensor(0.5318, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2275, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7593, device='cuda:0', grad_fn=<AddBackward0>)



 54%|█████▎    | 190/354 [17:09<14:45,  5.40s/it][A

	loss_cls: tensor(0.3746, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2670, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6416, device='cuda:0', grad_fn=<AddBackward0>)



 54%|█████▍    | 191/354 [17:15<14:37,  5.38s/it][A

	loss_cls: tensor(0.6728, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1226, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7954, device='cuda:0', grad_fn=<AddBackward0>)



 54%|█████▍    | 192/354 [17:20<14:33,  5.39s/it][A

	loss_cls: tensor(0.6796, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1850, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8647, device='cuda:0', grad_fn=<AddBackward0>)



 55%|█████▍    | 193/354 [17:25<14:27,  5.39s/it][A

	loss_cls: tensor(0.6546, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2214, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8760, device='cuda:0', grad_fn=<AddBackward0>)



 55%|█████▍    | 194/354 [17:31<14:20,  5.38s/it][A

	loss_cls: tensor(0.8875, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1476, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0351, device='cuda:0', grad_fn=<AddBackward0>)



 55%|█████▌    | 195/354 [17:36<14:18,  5.40s/it][A

	loss_cls: tensor(0.6298, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1323, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7621, device='cuda:0', grad_fn=<AddBackward0>)



 55%|█████▌    | 196/354 [17:41<14:10,  5.39s/it][A

	loss_cls: tensor(0.5710, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0321, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6031, device='cuda:0', grad_fn=<AddBackward0>)



 56%|█████▌    | 197/354 [17:47<14:08,  5.41s/it][A

	loss_cls: tensor(0.6645, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1274, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7919, device='cuda:0', grad_fn=<AddBackward0>)



 56%|█████▌    | 198/354 [17:52<13:59,  5.38s/it][A

	loss_cls: tensor(0.7668, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1811, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9479, device='cuda:0', grad_fn=<AddBackward0>)



 56%|█████▌    | 199/354 [17:58<13:55,  5.39s/it][A

	loss_cls: tensor(0.6167, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3926, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0093, device='cuda:0', grad_fn=<AddBackward0>)



 56%|█████▋    | 200/354 [18:03<13:48,  5.38s/it][A

	loss_cls: tensor(0.5973, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2223, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8196, device='cuda:0', grad_fn=<AddBackward0>)



 57%|█████▋    | 201/354 [18:08<13:42,  5.38s/it][A

	loss_cls: tensor(0.8544, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1053, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9598, device='cuda:0', grad_fn=<AddBackward0>)



 57%|█████▋    | 202/354 [18:14<13:40,  5.40s/it][A

	loss_cls: tensor(0.5243, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0667, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5910, device='cuda:0', grad_fn=<AddBackward0>)



 57%|█████▋    | 203/354 [18:19<13:33,  5.38s/it][A

	loss_cls: tensor(0.6810, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2626, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9435, device='cuda:0', grad_fn=<AddBackward0>)



 58%|█████▊    | 204/354 [18:25<13:30,  5.40s/it][A

	loss_cls: tensor(0.9300, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1871, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1171, device='cuda:0', grad_fn=<AddBackward0>)



 58%|█████▊    | 205/354 [18:30<13:22,  5.39s/it][A

	loss_cls: tensor(0.4489, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1722, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6211, device='cuda:0', grad_fn=<AddBackward0>)



 58%|█████▊    | 206/354 [18:35<13:20,  5.41s/it][A

	loss_cls: tensor(0.4941, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4124, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9066, device='cuda:0', grad_fn=<AddBackward0>)



 58%|█████▊    | 207/354 [18:41<13:14,  5.40s/it][A

	loss_cls: tensor(0.4470, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0413, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4883, device='cuda:0', grad_fn=<AddBackward0>)



 59%|█████▉    | 208/354 [18:46<13:07,  5.39s/it][A

	loss_cls: tensor(0.8397, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5007, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.3404, device='cuda:0', grad_fn=<AddBackward0>)



 59%|█████▉    | 209/354 [18:52<13:04,  5.41s/it][A

	loss_cls: tensor(0.5128, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2463, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7591, device='cuda:0', grad_fn=<AddBackward0>)



 59%|█████▉    | 210/354 [18:57<12:57,  5.40s/it][A

	loss_cls: tensor(0.6602, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3405, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0006, device='cuda:0', grad_fn=<AddBackward0>)



 60%|█████▉    | 211/354 [19:03<12:54,  5.42s/it][A

	loss_cls: tensor(0.3654, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1338, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4992, device='cuda:0', grad_fn=<AddBackward0>)



 60%|█████▉    | 212/354 [19:08<12:46,  5.40s/it][A

	loss_cls: tensor(0.5300, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2476, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7777, device='cuda:0', grad_fn=<AddBackward0>)



 60%|██████    | 213/354 [19:13<12:43,  5.42s/it][A

	loss_cls: tensor(0.5898, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3364, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9262, device='cuda:0', grad_fn=<AddBackward0>)



 60%|██████    | 214/354 [19:19<12:36,  5.41s/it][A

	loss_cls: tensor(0.8504, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3029, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1533, device='cuda:0', grad_fn=<AddBackward0>)



 61%|██████    | 215/354 [19:24<12:30,  5.40s/it][A

	loss_cls: tensor(0.7229, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2689, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9918, device='cuda:0', grad_fn=<AddBackward0>)



 61%|██████    | 216/354 [19:30<12:28,  5.42s/it][A

	loss_cls: tensor(0.6366, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3861, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0227, device='cuda:0', grad_fn=<AddBackward0>)



 61%|██████▏   | 217/354 [19:35<12:20,  5.41s/it][A

	loss_cls: tensor(0.6898, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2645, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9543, device='cuda:0', grad_fn=<AddBackward0>)



 62%|██████▏   | 218/354 [19:40<12:16,  5.42s/it][A

	loss_cls: tensor(0.7572, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1152, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8724, device='cuda:0', grad_fn=<AddBackward0>)



 62%|██████▏   | 219/354 [19:46<12:09,  5.41s/it][A

	loss_cls: tensor(0.5886, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4663, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0549, device='cuda:0', grad_fn=<AddBackward0>)



 62%|██████▏   | 220/354 [19:51<12:06,  5.42s/it][A

	loss_cls: tensor(0.5139, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3357, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8496, device='cuda:0', grad_fn=<AddBackward0>)



 62%|██████▏   | 221/354 [19:57<12:00,  5.42s/it][A

	loss_cls: tensor(0.3818, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2347, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6165, device='cuda:0', grad_fn=<AddBackward0>)



 63%|██████▎   | 222/354 [20:02<11:53,  5.41s/it][A

	loss_cls: tensor(0.5836, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3957, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9793, device='cuda:0', grad_fn=<AddBackward0>)



 63%|██████▎   | 223/354 [20:07<11:50,  5.42s/it][A

	loss_cls: tensor(0.5819, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2173, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7992, device='cuda:0', grad_fn=<AddBackward0>)



 63%|██████▎   | 224/354 [20:13<11:46,  5.43s/it][A

	loss_cls: tensor(0.5819, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1115, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6934, device='cuda:0', grad_fn=<AddBackward0>)



 64%|██████▎   | 225/354 [20:18<11:44,  5.46s/it][A

	loss_cls: tensor(0.7904, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3847, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1751, device='cuda:0', grad_fn=<AddBackward0>)



 64%|██████▍   | 226/354 [20:24<11:38,  5.46s/it][A

	loss_cls: tensor(0.7924, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1319, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9244, device='cuda:0', grad_fn=<AddBackward0>)



 64%|██████▍   | 227/354 [20:29<11:35,  5.48s/it][A

	loss_cls: tensor(0.6204, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2738, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8942, device='cuda:0', grad_fn=<AddBackward0>)



 64%|██████▍   | 228/354 [20:35<11:30,  5.48s/it][A

	loss_cls: tensor(0.6236, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2533, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8769, device='cuda:0', grad_fn=<AddBackward0>)



 65%|██████▍   | 229/354 [20:40<11:23,  5.47s/it][A

	loss_cls: tensor(0.8612, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2730, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1342, device='cuda:0', grad_fn=<AddBackward0>)



 65%|██████▍   | 230/354 [20:46<11:21,  5.49s/it][A

	loss_cls: tensor(0.7455, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3056, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0510, device='cuda:0', grad_fn=<AddBackward0>)



 65%|██████▌   | 231/354 [20:51<11:14,  5.48s/it][A

	loss_cls: tensor(0.4628, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4679, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9307, device='cuda:0', grad_fn=<AddBackward0>)



 66%|██████▌   | 232/354 [20:57<11:10,  5.50s/it][A

	loss_cls: tensor(0.6830, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2144, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8974, device='cuda:0', grad_fn=<AddBackward0>)



 66%|██████▌   | 233/354 [21:02<11:03,  5.48s/it][A

	loss_cls: tensor(0.4810, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4000, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8810, device='cuda:0', grad_fn=<AddBackward0>)



 66%|██████▌   | 234/354 [21:08<10:59,  5.50s/it][A

	loss_cls: tensor(0.4464, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1723, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6187, device='cuda:0', grad_fn=<AddBackward0>)



 66%|██████▋   | 235/354 [21:13<10:53,  5.49s/it][A

	loss_cls: tensor(0.4177, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1589, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5767, device='cuda:0', grad_fn=<AddBackward0>)



 67%|██████▋   | 236/354 [21:19<10:45,  5.47s/it][A

	loss_cls: tensor(0.3857, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2204, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6060, device='cuda:0', grad_fn=<AddBackward0>)



 67%|██████▋   | 237/354 [21:24<10:43,  5.50s/it][A

	loss_cls: tensor(0.5445, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2865, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8310, device='cuda:0', grad_fn=<AddBackward0>)



 67%|██████▋   | 238/354 [21:30<10:35,  5.48s/it][A

	loss_cls: tensor(0.7448, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3387, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0835, device='cuda:0', grad_fn=<AddBackward0>)



 68%|██████▊   | 239/354 [21:35<10:32,  5.50s/it][A

	loss_cls: tensor(0.9093, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1319, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0413, device='cuda:0', grad_fn=<AddBackward0>)



 68%|██████▊   | 240/354 [21:41<10:26,  5.49s/it][A

	loss_cls: tensor(0.6618, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1551, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8169, device='cuda:0', grad_fn=<AddBackward0>)



 68%|██████▊   | 241/354 [21:46<10:21,  5.50s/it][A

	loss_cls: tensor(0.7707, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2855, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0562, device='cuda:0', grad_fn=<AddBackward0>)



 68%|██████▊   | 242/354 [21:52<10:05,  5.40s/it][A

	loss_cls: tensor(1.0018, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3236, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.3254, device='cuda:0', grad_fn=<AddBackward0>)



 69%|██████▊   | 243/354 [21:57<10:01,  5.42s/it][A

	loss_cls: tensor(0.7467, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3252, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0719, device='cuda:0', grad_fn=<AddBackward0>)



 69%|██████▉   | 244/354 [22:02<09:59,  5.45s/it][A

	loss_cls: tensor(0.5994, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3327, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9321, device='cuda:0', grad_fn=<AddBackward0>)



 69%|██████▉   | 245/354 [22:08<09:53,  5.44s/it][A

	loss_cls: tensor(0.5340, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1518, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6859, device='cuda:0', grad_fn=<AddBackward0>)



 69%|██████▉   | 246/354 [22:13<09:52,  5.48s/it][A

	loss_cls: tensor(0.7189, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2167, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9355, device='cuda:0', grad_fn=<AddBackward0>)



 70%|██████▉   | 247/354 [22:18<09:30,  5.34s/it][A

	loss_cls: tensor(0.5360, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3072, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8432, device='cuda:0', grad_fn=<AddBackward0>)



 70%|███████   | 248/354 [22:24<09:16,  5.25s/it][A

	loss_cls: tensor(0.4773, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2479, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7252, device='cuda:0', grad_fn=<AddBackward0>)



 70%|███████   | 249/354 [22:29<09:16,  5.30s/it][A

	loss_cls: tensor(0.5090, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2470, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7560, device='cuda:0', grad_fn=<AddBackward0>)



 71%|███████   | 250/354 [22:34<09:15,  5.35s/it][A

	loss_cls: tensor(0.6318, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3580, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9899, device='cuda:0', grad_fn=<AddBackward0>)



 71%|███████   | 251/354 [22:40<09:04,  5.29s/it][A

	loss_cls: tensor(0.7560, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1048, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8608, device='cuda:0', grad_fn=<AddBackward0>)



 71%|███████   | 252/354 [22:45<09:04,  5.34s/it][A

	loss_cls: tensor(0.6367, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1290, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7658, device='cuda:0', grad_fn=<AddBackward0>)



 71%|███████▏  | 253/354 [22:51<09:05,  5.40s/it][A

	loss_cls: tensor(0.6946, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1146, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8091, device='cuda:0', grad_fn=<AddBackward0>)



 72%|███████▏  | 254/354 [22:56<09:00,  5.41s/it][A

	loss_cls: tensor(0.4804, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3813, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8617, device='cuda:0', grad_fn=<AddBackward0>)



 72%|███████▏  | 255/354 [23:02<08:59,  5.45s/it][A

	loss_cls: tensor(0.5844, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2886, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8729, device='cuda:0', grad_fn=<AddBackward0>)



 72%|███████▏  | 256/354 [23:06<08:39,  5.30s/it][A

	loss_cls: tensor(0.5828, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2113, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7941, device='cuda:0', grad_fn=<AddBackward0>)



 73%|███████▎  | 257/354 [23:12<08:35,  5.32s/it][A

	loss_cls: tensor(0.6149, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2053, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8201, device='cuda:0', grad_fn=<AddBackward0>)



 73%|███████▎  | 258/354 [23:17<08:36,  5.38s/it][A

	loss_cls: tensor(0.3910, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2282, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6192, device='cuda:0', grad_fn=<AddBackward0>)



 73%|███████▎  | 259/354 [23:23<08:33,  5.41s/it][A

	loss_cls: tensor(0.6176, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1735, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7911, device='cuda:0', grad_fn=<AddBackward0>)



 73%|███████▎  | 260/354 [23:28<08:32,  5.45s/it][A

	loss_cls: tensor(0.6855, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2508, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9363, device='cuda:0', grad_fn=<AddBackward0>)



 74%|███████▎  | 261/354 [23:34<08:27,  5.46s/it][A

	loss_cls: tensor(0.7445, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2326, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9772, device='cuda:0', grad_fn=<AddBackward0>)



 74%|███████▍  | 262/354 [23:39<08:24,  5.48s/it][A

	loss_cls: tensor(0.4693, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1625, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6318, device='cuda:0', grad_fn=<AddBackward0>)



 74%|███████▍  | 263/354 [23:45<08:18,  5.48s/it][A

	loss_cls: tensor(0.6084, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1832, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7915, device='cuda:0', grad_fn=<AddBackward0>)



 75%|███████▍  | 264/354 [23:50<08:12,  5.47s/it][A

	loss_cls: tensor(0.4834, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1287, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6120, device='cuda:0', grad_fn=<AddBackward0>)



 75%|███████▍  | 265/354 [23:56<08:08,  5.48s/it][A

	loss_cls: tensor(0.7057, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3850, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0907, device='cuda:0', grad_fn=<AddBackward0>)



 75%|███████▌  | 266/354 [24:01<08:02,  5.48s/it][A

	loss_cls: tensor(0.3895, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1494, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5389, device='cuda:0', grad_fn=<AddBackward0>)



 75%|███████▌  | 267/354 [24:07<07:58,  5.50s/it][A

	loss_cls: tensor(0.8873, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1660, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0533, device='cuda:0', grad_fn=<AddBackward0>)



 76%|███████▌  | 268/354 [24:12<07:52,  5.49s/it][A

	loss_cls: tensor(0.6223, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.6839, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.3062, device='cuda:0', grad_fn=<AddBackward0>)



 76%|███████▌  | 269/354 [24:18<07:48,  5.51s/it][A

	loss_cls: tensor(0.2890, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0536, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.3426, device='cuda:0', grad_fn=<AddBackward0>)



 76%|███████▋  | 270/354 [24:23<07:41,  5.49s/it][A

	loss_cls: tensor(0.9104, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2549, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1653, device='cuda:0', grad_fn=<AddBackward0>)



 77%|███████▋  | 271/354 [24:29<07:34,  5.47s/it][A

	loss_cls: tensor(0.6599, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4783, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1381, device='cuda:0', grad_fn=<AddBackward0>)



 77%|███████▋  | 272/354 [24:34<07:30,  5.49s/it][A

	loss_cls: tensor(0.5975, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1267, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7243, device='cuda:0', grad_fn=<AddBackward0>)



 77%|███████▋  | 273/354 [24:40<07:23,  5.48s/it][A

	loss_cls: tensor(0.5673, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0781, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6454, device='cuda:0', grad_fn=<AddBackward0>)



 77%|███████▋  | 274/354 [24:45<07:20,  5.51s/it][A

	loss_cls: tensor(0.6073, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1233, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7307, device='cuda:0', grad_fn=<AddBackward0>)



 78%|███████▊  | 275/354 [24:51<07:14,  5.51s/it][A

	loss_cls: tensor(0.6226, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1928, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8154, device='cuda:0', grad_fn=<AddBackward0>)



 78%|███████▊  | 276/354 [24:56<07:10,  5.52s/it][A

	loss_cls: tensor(0.6607, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0521, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7128, device='cuda:0', grad_fn=<AddBackward0>)



 78%|███████▊  | 277/354 [25:02<07:04,  5.51s/it][A

	loss_cls: tensor(0.7418, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1760, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9178, device='cuda:0', grad_fn=<AddBackward0>)



 79%|███████▊  | 278/354 [25:07<06:57,  5.49s/it][A

	loss_cls: tensor(0.8204, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2284, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0488, device='cuda:0', grad_fn=<AddBackward0>)



 79%|███████▉  | 279/354 [25:13<06:52,  5.51s/it][A

	loss_cls: tensor(0.6643, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1293, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7936, device='cuda:0', grad_fn=<AddBackward0>)



 79%|███████▉  | 280/354 [25:18<06:46,  5.50s/it][A

	loss_cls: tensor(0.8244, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5251, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.3495, device='cuda:0', grad_fn=<AddBackward0>)



 79%|███████▉  | 281/354 [25:24<06:42,  5.51s/it][A

	loss_cls: tensor(0.4525, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0995, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5520, device='cuda:0', grad_fn=<AddBackward0>)



 80%|███████▉  | 282/354 [25:29<06:29,  5.42s/it][A

	loss_cls: tensor(0.7959, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1935, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9894, device='cuda:0', grad_fn=<AddBackward0>)



 80%|███████▉  | 283/354 [25:35<06:27,  5.45s/it][A

	loss_cls: tensor(0.6794, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4604, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1398, device='cuda:0', grad_fn=<AddBackward0>)



 80%|████████  | 284/354 [25:40<06:21,  5.45s/it][A

	loss_cls: tensor(0.4583, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1050, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5633, device='cuda:0', grad_fn=<AddBackward0>)



 81%|████████  | 285/354 [25:45<06:15,  5.45s/it][A

	loss_cls: tensor(0.6497, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1631, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8128, device='cuda:0', grad_fn=<AddBackward0>)



 81%|████████  | 286/354 [25:51<06:01,  5.32s/it][A

	loss_cls: tensor(0.5845, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2602, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8448, device='cuda:0', grad_fn=<AddBackward0>)



 81%|████████  | 287/354 [25:56<05:50,  5.23s/it][A

	loss_cls: tensor(0.5854, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4110, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9964, device='cuda:0', grad_fn=<AddBackward0>)



 81%|████████▏ | 288/354 [26:01<05:41,  5.18s/it][A

	loss_cls: tensor(0.7095, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4531, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1626, device='cuda:0', grad_fn=<AddBackward0>)



 82%|████████▏ | 289/354 [26:06<05:33,  5.13s/it][A

	loss_cls: tensor(0.6223, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2223, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8446, device='cuda:0', grad_fn=<AddBackward0>)



 82%|████████▏ | 290/354 [26:11<05:28,  5.13s/it][A

	loss_cls: tensor(0.7800, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1500, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9300, device='cuda:0', grad_fn=<AddBackward0>)



 82%|████████▏ | 291/354 [26:16<05:20,  5.09s/it][A

	loss_cls: tensor(0.5749, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2068, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7816, device='cuda:0', grad_fn=<AddBackward0>)



 82%|████████▏ | 292/354 [26:21<05:18,  5.13s/it][A

	loss_cls: tensor(0.5311, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2472, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7783, device='cuda:0', grad_fn=<AddBackward0>)



 83%|████████▎ | 293/354 [26:26<05:19,  5.23s/it][A

	loss_cls: tensor(0.7178, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0929, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8107, device='cuda:0', grad_fn=<AddBackward0>)



 83%|████████▎ | 294/354 [26:32<05:18,  5.31s/it][A

	loss_cls: tensor(0.7567, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0987, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8554, device='cuda:0', grad_fn=<AddBackward0>)



 83%|████████▎ | 295/354 [26:37<05:17,  5.38s/it][A

	loss_cls: tensor(0.5549, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2143, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7692, device='cuda:0', grad_fn=<AddBackward0>)



 84%|████████▎ | 296/354 [26:43<05:13,  5.40s/it][A

	loss_cls: tensor(0.5909, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4524, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0433, device='cuda:0', grad_fn=<AddBackward0>)



 84%|████████▍ | 297/354 [26:48<05:09,  5.44s/it][A

	loss_cls: tensor(0.5310, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4234, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9545, device='cuda:0', grad_fn=<AddBackward0>)



 84%|████████▍ | 298/354 [26:54<05:04,  5.44s/it][A

	loss_cls: tensor(0.5001, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2097, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7098, device='cuda:0', grad_fn=<AddBackward0>)



 84%|████████▍ | 299/354 [26:59<05:00,  5.46s/it][A

	loss_cls: tensor(0.4908, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4668, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9576, device='cuda:0', grad_fn=<AddBackward0>)



 85%|████████▍ | 300/354 [27:05<04:55,  5.47s/it][A

	loss_cls: tensor(0.5190, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0377, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5567, device='cuda:0', grad_fn=<AddBackward0>)



 85%|████████▌ | 301/354 [27:10<04:49,  5.47s/it][A

	loss_cls: tensor(0.6144, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2008, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8153, device='cuda:0', grad_fn=<AddBackward0>)



 85%|████████▌ | 302/354 [27:16<04:46,  5.50s/it][A

	loss_cls: tensor(0.6376, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3592, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9968, device='cuda:0', grad_fn=<AddBackward0>)



 86%|████████▌ | 303/354 [27:21<04:39,  5.49s/it][A

	loss_cls: tensor(0.5330, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1358, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6689, device='cuda:0', grad_fn=<AddBackward0>)



 86%|████████▌ | 304/354 [27:27<04:34,  5.50s/it][A

	loss_cls: tensor(0.4800, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3750, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8550, device='cuda:0', grad_fn=<AddBackward0>)



 86%|████████▌ | 305/354 [27:32<04:28,  5.49s/it][A

	loss_cls: tensor(0.5147, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4203, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9349, device='cuda:0', grad_fn=<AddBackward0>)



 86%|████████▋ | 306/354 [27:38<04:22,  5.48s/it][A

	loss_cls: tensor(0.6606, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3844, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0450, device='cuda:0', grad_fn=<AddBackward0>)



 87%|████████▋ | 307/354 [27:43<04:15,  5.45s/it][A

	loss_cls: tensor(0.4976, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2437, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7413, device='cuda:0', grad_fn=<AddBackward0>)



 87%|████████▋ | 308/354 [27:49<04:09,  5.43s/it][A

	loss_cls: tensor(0.7893, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3125, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1018, device='cuda:0', grad_fn=<AddBackward0>)



 87%|████████▋ | 309/354 [27:54<04:04,  5.43s/it][A

	loss_cls: tensor(0.6773, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2206, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8979, device='cuda:0', grad_fn=<AddBackward0>)



 88%|████████▊ | 310/354 [27:59<03:58,  5.41s/it][A

	loss_cls: tensor(0.8160, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1007, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9167, device='cuda:0', grad_fn=<AddBackward0>)



 88%|████████▊ | 311/354 [28:05<03:52,  5.42s/it][A

	loss_cls: tensor(0.7212, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0846, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8058, device='cuda:0', grad_fn=<AddBackward0>)



 88%|████████▊ | 312/354 [28:10<03:47,  5.41s/it][A

	loss_cls: tensor(0.5551, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2397, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7948, device='cuda:0', grad_fn=<AddBackward0>)



 88%|████████▊ | 313/354 [28:16<03:42,  5.42s/it][A

	loss_cls: tensor(0.4505, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1506, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6011, device='cuda:0', grad_fn=<AddBackward0>)



 89%|████████▊ | 314/354 [28:21<03:36,  5.41s/it][A

	loss_cls: tensor(0.4614, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1851, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6465, device='cuda:0', grad_fn=<AddBackward0>)



 89%|████████▉ | 315/354 [28:26<03:30,  5.40s/it][A

	loss_cls: tensor(0.5196, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0666, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5862, device='cuda:0', grad_fn=<AddBackward0>)



 89%|████████▉ | 316/354 [28:32<03:25,  5.41s/it][A

	loss_cls: tensor(0.5591, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1687, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7278, device='cuda:0', grad_fn=<AddBackward0>)



 90%|████████▉ | 317/354 [28:37<03:19,  5.40s/it][A

	loss_cls: tensor(0.4834, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2215, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7049, device='cuda:0', grad_fn=<AddBackward0>)



 90%|████████▉ | 318/354 [28:43<03:14,  5.41s/it][A

	loss_cls: tensor(0.8395, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4201, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2596, device='cuda:0', grad_fn=<AddBackward0>)



 90%|█████████ | 319/354 [28:48<03:08,  5.39s/it][A

	loss_cls: tensor(0.5595, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2270, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7866, device='cuda:0', grad_fn=<AddBackward0>)



 90%|█████████ | 320/354 [28:53<03:03,  5.41s/it][A

	loss_cls: tensor(0.7569, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3862, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1432, device='cuda:0', grad_fn=<AddBackward0>)



 91%|█████████ | 321/354 [28:59<02:58,  5.39s/it][A

	loss_cls: tensor(0.5510, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2587, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8097, device='cuda:0', grad_fn=<AddBackward0>)



 91%|█████████ | 322/354 [29:04<02:52,  5.38s/it][A

	loss_cls: tensor(0.5251, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1671, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6921, device='cuda:0', grad_fn=<AddBackward0>)



 91%|█████████ | 323/354 [29:10<02:47,  5.40s/it][A

	loss_cls: tensor(0.7261, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4933, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2194, device='cuda:0', grad_fn=<AddBackward0>)



 92%|█████████▏| 324/354 [29:15<02:41,  5.39s/it][A

	loss_cls: tensor(0.7564, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0907, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8471, device='cuda:0', grad_fn=<AddBackward0>)



 92%|█████████▏| 325/354 [29:20<02:36,  5.41s/it][A

	loss_cls: tensor(0.5124, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2894, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8018, device='cuda:0', grad_fn=<AddBackward0>)



 92%|█████████▏| 326/354 [29:26<02:31,  5.40s/it][A

	loss_cls: tensor(0.7965, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1736, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9701, device='cuda:0', grad_fn=<AddBackward0>)



 92%|█████████▏| 327/354 [29:31<02:26,  5.41s/it][A

	loss_cls: tensor(0.4588, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2067, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6655, device='cuda:0', grad_fn=<AddBackward0>)



 93%|█████████▎| 328/354 [29:37<02:20,  5.39s/it][A

	loss_cls: tensor(0.4156, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5220, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9376, device='cuda:0', grad_fn=<AddBackward0>)



 93%|█████████▎| 329/354 [29:42<02:14,  5.38s/it][A

	loss_cls: tensor(0.4982, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2069, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7050, device='cuda:0', grad_fn=<AddBackward0>)



 93%|█████████▎| 330/354 [29:47<02:09,  5.40s/it][A

	loss_cls: tensor(0.7592, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1838, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9430, device='cuda:0', grad_fn=<AddBackward0>)



 94%|█████████▎| 331/354 [29:53<02:03,  5.38s/it][A

	loss_cls: tensor(0.5859, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1548, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7407, device='cuda:0', grad_fn=<AddBackward0>)



 94%|█████████▍| 332/354 [29:58<01:58,  5.40s/it][A

	loss_cls: tensor(0.8169, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1940, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0109, device='cuda:0', grad_fn=<AddBackward0>)



 94%|█████████▍| 333/354 [30:04<01:53,  5.38s/it][A

	loss_cls: tensor(0.5905, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1503, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7407, device='cuda:0', grad_fn=<AddBackward0>)



 94%|█████████▍| 334/354 [30:09<01:47,  5.39s/it][A

	loss_cls: tensor(0.5510, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3707, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9217, device='cuda:0', grad_fn=<AddBackward0>)



 95%|█████████▍| 335/354 [30:14<01:42,  5.37s/it][A

	loss_cls: tensor(0.5309, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2743, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8052, device='cuda:0', grad_fn=<AddBackward0>)



 95%|█████████▍| 336/354 [30:20<01:36,  5.37s/it][A

	loss_cls: tensor(0.6044, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2666, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8710, device='cuda:0', grad_fn=<AddBackward0>)



 95%|█████████▌| 337/354 [30:25<01:31,  5.39s/it][A

	loss_cls: tensor(0.6837, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2785, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9623, device='cuda:0', grad_fn=<AddBackward0>)



 95%|█████████▌| 338/354 [30:30<01:26,  5.38s/it][A

	loss_cls: tensor(0.7284, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0385, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7670, device='cuda:0', grad_fn=<AddBackward0>)



 96%|█████████▌| 339/354 [30:36<01:21,  5.40s/it][A

	loss_cls: tensor(0.7139, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3982, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1120, device='cuda:0', grad_fn=<AddBackward0>)



 96%|█████████▌| 340/354 [30:41<01:15,  5.40s/it][A

	loss_cls: tensor(0.7231, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1296, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8527, device='cuda:0', grad_fn=<AddBackward0>)



 96%|█████████▋| 341/354 [30:47<01:10,  5.41s/it][A

	loss_cls: tensor(0.4992, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2747, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7739, device='cuda:0', grad_fn=<AddBackward0>)



 97%|█████████▋| 342/354 [30:52<01:04,  5.40s/it][A

	loss_cls: tensor(0.7968, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2733, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0701, device='cuda:0', grad_fn=<AddBackward0>)



 97%|█████████▋| 343/354 [30:57<00:59,  5.39s/it][A

	loss_cls: tensor(0.5908, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0882, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6790, device='cuda:0', grad_fn=<AddBackward0>)



 97%|█████████▋| 344/354 [31:03<00:54,  5.41s/it][A

	loss_cls: tensor(0.6893, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2478, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9371, device='cuda:0', grad_fn=<AddBackward0>)



 97%|█████████▋| 345/354 [31:08<00:48,  5.40s/it][A

	loss_cls: tensor(0.8931, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.6577, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.5508, device='cuda:0', grad_fn=<AddBackward0>)



 98%|█████████▊| 346/354 [31:14<00:43,  5.42s/it][A

	loss_cls: tensor(0.5947, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3309, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9256, device='cuda:0', grad_fn=<AddBackward0>)



 98%|█████████▊| 347/354 [31:19<00:37,  5.41s/it][A

	loss_cls: tensor(0.8392, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2912, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1303, device='cuda:0', grad_fn=<AddBackward0>)



 98%|█████████▊| 348/354 [31:25<00:32,  5.43s/it][A

	loss_cls: tensor(0.8524, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1293, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9818, device='cuda:0', grad_fn=<AddBackward0>)



 99%|█████████▊| 349/354 [31:30<00:27,  5.41s/it][A

	loss_cls: tensor(0.7542, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3015, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0558, device='cuda:0', grad_fn=<AddBackward0>)



 99%|█████████▉| 350/354 [31:35<00:21,  5.39s/it][A

	loss_cls: tensor(0.6817, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2598, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9415, device='cuda:0', grad_fn=<AddBackward0>)



 99%|█████████▉| 351/354 [31:41<00:16,  5.41s/it][A

	loss_cls: tensor(0.6439, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1650, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8089, device='cuda:0', grad_fn=<AddBackward0>)



 99%|█████████▉| 352/354 [31:46<00:10,  5.40s/it][A

	loss_cls: tensor(0.6693, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2207, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8900, device='cuda:0', grad_fn=<AddBackward0>)



100%|█████████▉| 353/354 [31:52<00:05,  5.42s/it][A

	loss_cls: tensor(0.8547, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0563, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9109, device='cuda:0', grad_fn=<AddBackward0>)



100%|██████████| 354/354 [31:54<00:00,  5.41s/it][A

  0%|          | 0/177 [00:00<?, ?it/s][A

	loss_cls: tensor(0.8317, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1449, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9766, device='cuda:0', grad_fn=<AddBackward0>)

	Training Loss: 0.8599780903192563

	Training cls acc: 0.6827919020715632

	Training cls prec: 0.5723941077754637

	Training cls rec: 0.5929820238612611

	Training cls f1: 0.5201999209378942

--
	Training ner acc: 0.9551588160747773

	Training ner prec: 0.27550153642730074

	Training ner rec: 0.28110224922004096

	Training ner f1: 0.27707644137574666

	Current Learning rate:  0.0008571428571428571



  1%|          | 1/177 [00:00<02:07,  1.38it/s][A
  1%|          | 2/177 [00:01<02:07,  1.37it/s][A
  2%|▏         | 3/177 [00:02<01:59,  1.46it/s][A
  2%|▏         | 4/177 [00:02<02:01,  1.43it/s][A
  3%|▎         | 5/177 [00:03<02:02,  1.41it/s][A
  3%|▎         | 6/177 [00:04<01:57,  1.46it/s][A
  4%|▍         | 7/177 [00:04<01:58,  1.44it/s][A
  5%|▍         | 8/177 [00:05<01:59,  1.42it/s][A
  5%|▌         | 9/177 [00:06<01:59,  1.41it/s][A
  6%|▌         | 10/177 [00:06<01:54,  1.45it/s][A
  6%|▌         | 11/177 [00:07<01:55,  1.43it/s][A
  7%|▋         | 12/177 [00:08<01:56,  1.42it/s][A
  7%|▋         | 13/177 [00:09<01:56,  1.41it/s][A
  8%|▊         | 14/177 [00:09<01:52,  1.45it/s][A
  8%|▊         | 15/177 [00:10<01:53,  1.43it/s][A
  9%|▉         | 16/177 [00:11<01:53,  1.41it/s][A
 10%|▉         | 17/177 [00:11<01:50,  1.45it/s][A
 10%|█         | 18/177 [00:12<01:51,  1.43it/s][A
 11%|█         | 19/177 [00:13<01:51,  1.42it/s][A
 11%|█▏        | 20/


	Validation Loss: 0.8461357680080974

	Validation cls acc: 0.53954802259887

	Validation cls prec: 0.5662530266343826

	Validation cls rec: 0.5293751681463545

	Validation cls f1: 0.4490750931428898

--
	Validation ner acc: 0.9549104432991464

	Validation ner prec: 0.40341800092838553

	Validation ner rec: 0.4137476459510358

	Validation ner f1: 0.4083756680480803



  0%|          | 1/354 [00:05<31:38,  5.38s/it][A

	loss_cls: tensor(0.5309, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3491, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8800, device='cuda:0', grad_fn=<AddBackward0>)



  1%|          | 2/354 [00:10<31:50,  5.43s/it][A

	loss_cls: tensor(0.7913, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0926, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8839, device='cuda:0', grad_fn=<AddBackward0>)



  1%|          | 3/354 [00:16<31:35,  5.40s/it][A

	loss_cls: tensor(0.6464, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3193, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9657, device='cuda:0', grad_fn=<AddBackward0>)



  1%|          | 4/354 [00:21<31:23,  5.38s/it][A

	loss_cls: tensor(0.6580, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0177, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6756, device='cuda:0', grad_fn=<AddBackward0>)



  1%|▏         | 5/354 [00:27<31:26,  5.41s/it][A

	loss_cls: tensor(0.7357, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0194, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7550, device='cuda:0', grad_fn=<AddBackward0>)



  2%|▏         | 6/354 [00:32<31:14,  5.39s/it][A

	loss_cls: tensor(0.5096, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2424, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7520, device='cuda:0', grad_fn=<AddBackward0>)



  2%|▏         | 7/354 [00:37<31:13,  5.40s/it][A

	loss_cls: tensor(0.5912, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2523, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8435, device='cuda:0', grad_fn=<AddBackward0>)



  2%|▏         | 8/354 [00:43<31:04,  5.39s/it][A

	loss_cls: tensor(0.7767, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4825, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2592, device='cuda:0', grad_fn=<AddBackward0>)



  3%|▎         | 9/354 [00:48<31:06,  5.41s/it][A

	loss_cls: tensor(0.7129, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3795, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0924, device='cuda:0', grad_fn=<AddBackward0>)



  3%|▎         | 10/354 [00:53<30:58,  5.40s/it][A

	loss_cls: tensor(0.6392, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2089, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8481, device='cuda:0', grad_fn=<AddBackward0>)



  3%|▎         | 11/354 [00:59<30:48,  5.39s/it][A

	loss_cls: tensor(0.5255, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5761, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1016, device='cuda:0', grad_fn=<AddBackward0>)



  3%|▎         | 12/354 [01:04<30:49,  5.41s/it][A

	loss_cls: tensor(0.5257, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0847, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6104, device='cuda:0', grad_fn=<AddBackward0>)



  4%|▎         | 13/354 [01:10<30:41,  5.40s/it][A

	loss_cls: tensor(0.3874, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2069, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5943, device='cuda:0', grad_fn=<AddBackward0>)



  4%|▍         | 14/354 [01:15<30:44,  5.42s/it][A

	loss_cls: tensor(0.6514, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1396, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7910, device='cuda:0', grad_fn=<AddBackward0>)



  4%|▍         | 15/354 [01:21<30:35,  5.41s/it][A

	loss_cls: tensor(0.3836, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0804, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4640, device='cuda:0', grad_fn=<AddBackward0>)



  5%|▍         | 16/354 [01:26<30:35,  5.43s/it][A

	loss_cls: tensor(0.4299, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1719, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6017, device='cuda:0', grad_fn=<AddBackward0>)



  5%|▍         | 17/354 [01:31<30:24,  5.41s/it][A

	loss_cls: tensor(0.4820, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2657, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7477, device='cuda:0', grad_fn=<AddBackward0>)



  5%|▌         | 18/354 [01:37<30:14,  5.40s/it][A

	loss_cls: tensor(0.6625, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1169, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7793, device='cuda:0', grad_fn=<AddBackward0>)



  5%|▌         | 19/354 [01:42<30:13,  5.41s/it][A

	loss_cls: tensor(0.5008, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1379, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6387, device='cuda:0', grad_fn=<AddBackward0>)



  6%|▌         | 20/354 [01:48<30:00,  5.39s/it][A

	loss_cls: tensor(0.5687, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3166, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8852, device='cuda:0', grad_fn=<AddBackward0>)



  6%|▌         | 21/354 [01:53<30:03,  5.42s/it][A

	loss_cls: tensor(0.8447, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1505, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9952, device='cuda:0', grad_fn=<AddBackward0>)



  6%|▌         | 22/354 [01:58<29:56,  5.41s/it][A

	loss_cls: tensor(0.6360, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2332, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8692, device='cuda:0', grad_fn=<AddBackward0>)



  6%|▋         | 23/354 [02:04<29:56,  5.43s/it][A

	loss_cls: tensor(0.3166, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2413, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5578, device='cuda:0', grad_fn=<AddBackward0>)



  7%|▋         | 24/354 [02:09<29:44,  5.41s/it][A

	loss_cls: tensor(0.3243, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1377, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4619, device='cuda:0', grad_fn=<AddBackward0>)



  7%|▋         | 25/354 [02:15<29:32,  5.39s/it][A

	loss_cls: tensor(0.6746, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1576, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8322, device='cuda:0', grad_fn=<AddBackward0>)



  7%|▋         | 26/354 [02:20<29:30,  5.40s/it][A

	loss_cls: tensor(0.4900, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1487, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6387, device='cuda:0', grad_fn=<AddBackward0>)



  8%|▊         | 27/354 [02:25<29:18,  5.38s/it][A

	loss_cls: tensor(1.0581, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3662, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.4243, device='cuda:0', grad_fn=<AddBackward0>)



  8%|▊         | 28/354 [02:31<29:21,  5.40s/it][A

	loss_cls: tensor(0.6716, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3209, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9925, device='cuda:0', grad_fn=<AddBackward0>)



  8%|▊         | 29/354 [02:36<29:11,  5.39s/it][A

	loss_cls: tensor(0.3574, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1921, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5495, device='cuda:0', grad_fn=<AddBackward0>)



  8%|▊         | 30/354 [02:42<29:12,  5.41s/it][A

	loss_cls: tensor(0.5888, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1177, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7065, device='cuda:0', grad_fn=<AddBackward0>)



  9%|▉         | 31/354 [02:47<29:01,  5.39s/it][A

	loss_cls: tensor(0.6233, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2839, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9071, device='cuda:0', grad_fn=<AddBackward0>)



  9%|▉         | 32/354 [02:52<28:52,  5.38s/it][A

	loss_cls: tensor(0.5266, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0365, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5631, device='cuda:0', grad_fn=<AddBackward0>)



  9%|▉         | 33/354 [02:58<28:53,  5.40s/it][A

	loss_cls: tensor(0.2605, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1305, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.3910, device='cuda:0', grad_fn=<AddBackward0>)



 10%|▉         | 34/354 [03:03<28:41,  5.38s/it][A

	loss_cls: tensor(0.4357, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2738, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7095, device='cuda:0', grad_fn=<AddBackward0>)



 10%|▉         | 35/354 [03:09<28:44,  5.40s/it][A

	loss_cls: tensor(0.4918, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0934, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5852, device='cuda:0', grad_fn=<AddBackward0>)



 10%|█         | 36/354 [03:14<28:34,  5.39s/it][A

	loss_cls: tensor(0.5561, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1736, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7297, device='cuda:0', grad_fn=<AddBackward0>)



 10%|█         | 37/354 [03:19<28:34,  5.41s/it][A

	loss_cls: tensor(0.5361, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3299, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8660, device='cuda:0', grad_fn=<AddBackward0>)



 11%|█         | 38/354 [03:25<28:25,  5.40s/it][A

	loss_cls: tensor(0.6742, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1883, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8625, device='cuda:0', grad_fn=<AddBackward0>)



 11%|█         | 39/354 [03:30<28:15,  5.38s/it][A

	loss_cls: tensor(0.5548, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1985, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7533, device='cuda:0', grad_fn=<AddBackward0>)



 11%|█▏        | 40/354 [03:36<28:13,  5.39s/it][A

	loss_cls: tensor(0.6666, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1024, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7689, device='cuda:0', grad_fn=<AddBackward0>)



 12%|█▏        | 41/354 [03:41<28:05,  5.39s/it][A

	loss_cls: tensor(0.4992, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2589, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7580, device='cuda:0', grad_fn=<AddBackward0>)



 12%|█▏        | 42/354 [03:46<28:07,  5.41s/it][A

	loss_cls: tensor(0.4859, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1438, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6297, device='cuda:0', grad_fn=<AddBackward0>)



 12%|█▏        | 43/354 [03:52<27:56,  5.39s/it][A

	loss_cls: tensor(0.5147, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1903, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7049, device='cuda:0', grad_fn=<AddBackward0>)



 12%|█▏        | 44/354 [03:57<27:56,  5.41s/it][A

	loss_cls: tensor(0.4888, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1673, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6561, device='cuda:0', grad_fn=<AddBackward0>)



 13%|█▎        | 45/354 [04:02<27:46,  5.39s/it][A

	loss_cls: tensor(0.6623, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1309, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7932, device='cuda:0', grad_fn=<AddBackward0>)



 13%|█▎        | 46/354 [04:08<27:35,  5.38s/it][A

	loss_cls: tensor(0.6311, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1189, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7500, device='cuda:0', grad_fn=<AddBackward0>)



 13%|█▎        | 47/354 [04:13<27:33,  5.39s/it][A

	loss_cls: tensor(0.6118, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1501, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7619, device='cuda:0', grad_fn=<AddBackward0>)



 14%|█▎        | 48/354 [04:19<27:23,  5.37s/it][A

	loss_cls: tensor(0.9547, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2511, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2058, device='cuda:0', grad_fn=<AddBackward0>)



 14%|█▍        | 49/354 [04:24<27:23,  5.39s/it][A

	loss_cls: tensor(0.5858, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1482, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7341, device='cuda:0', grad_fn=<AddBackward0>)



 14%|█▍        | 50/354 [04:29<27:15,  5.38s/it][A

	loss_cls: tensor(0.9042, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4376, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.3418, device='cuda:0', grad_fn=<AddBackward0>)



 14%|█▍        | 51/354 [04:35<27:15,  5.40s/it][A

	loss_cls: tensor(0.6924, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.6416, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.3341, device='cuda:0', grad_fn=<AddBackward0>)



 15%|█▍        | 52/354 [04:40<27:03,  5.38s/it][A

	loss_cls: tensor(0.5060, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3304, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8364, device='cuda:0', grad_fn=<AddBackward0>)



 15%|█▍        | 53/354 [04:45<26:53,  5.36s/it][A

	loss_cls: tensor(0.6950, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3580, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0530, device='cuda:0', grad_fn=<AddBackward0>)



 15%|█▌        | 54/354 [04:51<26:53,  5.38s/it][A

	loss_cls: tensor(0.5685, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2788, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8473, device='cuda:0', grad_fn=<AddBackward0>)



 16%|█▌        | 55/354 [04:56<26:46,  5.37s/it][A

	loss_cls: tensor(0.3558, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2515, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6073, device='cuda:0', grad_fn=<AddBackward0>)



 16%|█▌        | 56/354 [05:02<26:48,  5.40s/it][A

	loss_cls: tensor(1.0532, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2565, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.3097, device='cuda:0', grad_fn=<AddBackward0>)



 16%|█▌        | 57/354 [05:07<26:38,  5.38s/it][A

	loss_cls: tensor(0.4360, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3082, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7442, device='cuda:0', grad_fn=<AddBackward0>)



 16%|█▋        | 58/354 [05:12<26:37,  5.40s/it][A

	loss_cls: tensor(0.4787, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3603, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8390, device='cuda:0', grad_fn=<AddBackward0>)



 17%|█▋        | 59/354 [05:18<26:27,  5.38s/it][A

	loss_cls: tensor(0.6623, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1032, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7655, device='cuda:0', grad_fn=<AddBackward0>)



 17%|█▋        | 60/354 [05:23<26:17,  5.37s/it][A

	loss_cls: tensor(0.7886, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4759, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2645, device='cuda:0', grad_fn=<AddBackward0>)



 17%|█▋        | 61/354 [05:29<26:18,  5.39s/it][A

	loss_cls: tensor(0.8833, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2337, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1170, device='cuda:0', grad_fn=<AddBackward0>)



 18%|█▊        | 62/354 [05:34<26:11,  5.38s/it][A

	loss_cls: tensor(0.5708, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2550, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8258, device='cuda:0', grad_fn=<AddBackward0>)



 18%|█▊        | 63/354 [05:39<26:12,  5.40s/it][A

	loss_cls: tensor(0.6088, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5379, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1466, device='cuda:0', grad_fn=<AddBackward0>)



 18%|█▊        | 64/354 [05:45<26:02,  5.39s/it][A

	loss_cls: tensor(0.5699, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1040, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6739, device='cuda:0', grad_fn=<AddBackward0>)



 18%|█▊        | 65/354 [05:50<26:03,  5.41s/it][A

	loss_cls: tensor(0.5303, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2897, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8200, device='cuda:0', grad_fn=<AddBackward0>)



 19%|█▊        | 66/354 [05:56<25:51,  5.39s/it][A

	loss_cls: tensor(0.8418, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5365, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.3783, device='cuda:0', grad_fn=<AddBackward0>)



 19%|█▉        | 67/354 [06:01<25:41,  5.37s/it][A

	loss_cls: tensor(0.5663, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2506, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8170, device='cuda:0', grad_fn=<AddBackward0>)



 19%|█▉        | 68/354 [06:06<25:42,  5.40s/it][A

	loss_cls: tensor(0.6924, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2521, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9445, device='cuda:0', grad_fn=<AddBackward0>)



 19%|█▉        | 69/354 [06:12<25:35,  5.39s/it][A

	loss_cls: tensor(1.0341, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1045, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1386, device='cuda:0', grad_fn=<AddBackward0>)



 20%|█▉        | 70/354 [06:17<25:36,  5.41s/it][A

	loss_cls: tensor(0.4414, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2219, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6633, device='cuda:0', grad_fn=<AddBackward0>)



 20%|██        | 71/354 [06:23<25:28,  5.40s/it][A

	loss_cls: tensor(0.4030, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1400, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5430, device='cuda:0', grad_fn=<AddBackward0>)



 20%|██        | 72/354 [06:28<25:24,  5.41s/it][A

	loss_cls: tensor(0.5915, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3629, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9544, device='cuda:0', grad_fn=<AddBackward0>)



 21%|██        | 73/354 [06:33<25:14,  5.39s/it][A

	loss_cls: tensor(0.5161, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1539, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6700, device='cuda:0', grad_fn=<AddBackward0>)



 21%|██        | 74/354 [06:39<25:10,  5.40s/it][A

	loss_cls: tensor(0.6437, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1955, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8392, device='cuda:0', grad_fn=<AddBackward0>)



 21%|██        | 75/354 [06:44<25:03,  5.39s/it][A

	loss_cls: tensor(0.3326, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2486, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5812, device='cuda:0', grad_fn=<AddBackward0>)



 21%|██▏       | 76/354 [06:49<24:54,  5.38s/it][A

	loss_cls: tensor(0.4383, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1563, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5946, device='cuda:0', grad_fn=<AddBackward0>)



 22%|██▏       | 77/354 [06:55<24:56,  5.40s/it][A

	loss_cls: tensor(0.6808, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4023, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0831, device='cuda:0', grad_fn=<AddBackward0>)



 22%|██▏       | 78/354 [07:00<24:49,  5.40s/it][A

	loss_cls: tensor(0.4145, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1322, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5466, device='cuda:0', grad_fn=<AddBackward0>)



 22%|██▏       | 79/354 [07:06<24:49,  5.42s/it][A

	loss_cls: tensor(0.7802, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2150, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9952, device='cuda:0', grad_fn=<AddBackward0>)



 23%|██▎       | 80/354 [07:11<24:38,  5.40s/it][A

	loss_cls: tensor(0.6880, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3668, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0548, device='cuda:0', grad_fn=<AddBackward0>)



 23%|██▎       | 81/354 [07:17<24:39,  5.42s/it][A

	loss_cls: tensor(0.8512, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1568, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0080, device='cuda:0', grad_fn=<AddBackward0>)



 23%|██▎       | 82/354 [07:22<24:32,  5.41s/it][A

	loss_cls: tensor(0.8697, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2373, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1070, device='cuda:0', grad_fn=<AddBackward0>)



 23%|██▎       | 83/354 [07:27<24:23,  5.40s/it][A

	loss_cls: tensor(0.8223, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2538, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0761, device='cuda:0', grad_fn=<AddBackward0>)



 24%|██▎       | 84/354 [07:33<24:23,  5.42s/it][A

	loss_cls: tensor(0.4665, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0846, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5510, device='cuda:0', grad_fn=<AddBackward0>)



 24%|██▍       | 85/354 [07:38<24:14,  5.41s/it][A

	loss_cls: tensor(0.4221, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0685, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4907, device='cuda:0', grad_fn=<AddBackward0>)



 24%|██▍       | 86/354 [07:44<24:11,  5.42s/it][A

	loss_cls: tensor(0.4771, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1627, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6398, device='cuda:0', grad_fn=<AddBackward0>)



 25%|██▍       | 87/354 [07:49<24:00,  5.39s/it][A

	loss_cls: tensor(0.6219, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1848, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8067, device='cuda:0', grad_fn=<AddBackward0>)



 25%|██▍       | 88/354 [07:54<24:01,  5.42s/it][A

	loss_cls: tensor(0.5594, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2221, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7816, device='cuda:0', grad_fn=<AddBackward0>)



 25%|██▌       | 89/354 [08:00<23:53,  5.41s/it][A

	loss_cls: tensor(0.6762, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1126, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7888, device='cuda:0', grad_fn=<AddBackward0>)



 25%|██▌       | 90/354 [08:05<23:44,  5.40s/it][A

	loss_cls: tensor(0.8367, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2676, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1042, device='cuda:0', grad_fn=<AddBackward0>)



 26%|██▌       | 91/354 [08:11<23:46,  5.42s/it][A

	loss_cls: tensor(0.7255, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0863, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8117, device='cuda:0', grad_fn=<AddBackward0>)



 26%|██▌       | 92/354 [08:16<23:37,  5.41s/it][A

	loss_cls: tensor(0.5790, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1228, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7018, device='cuda:0', grad_fn=<AddBackward0>)



 26%|██▋       | 93/354 [08:22<23:36,  5.43s/it][A

	loss_cls: tensor(0.6336, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1259, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7595, device='cuda:0', grad_fn=<AddBackward0>)



 27%|██▋       | 94/354 [08:27<23:26,  5.41s/it][A

	loss_cls: tensor(0.4809, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2254, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7063, device='cuda:0', grad_fn=<AddBackward0>)



 27%|██▋       | 95/354 [08:32<23:26,  5.43s/it][A

	loss_cls: tensor(0.7186, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4304, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1490, device='cuda:0', grad_fn=<AddBackward0>)



 27%|██▋       | 96/354 [08:38<23:16,  5.41s/it][A

	loss_cls: tensor(0.6007, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2483, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8491, device='cuda:0', grad_fn=<AddBackward0>)



 27%|██▋       | 97/354 [08:43<23:07,  5.40s/it][A

	loss_cls: tensor(0.4337, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1563, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5900, device='cuda:0', grad_fn=<AddBackward0>)



 28%|██▊       | 98/354 [08:49<23:05,  5.41s/it][A

	loss_cls: tensor(0.4851, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1203, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6053, device='cuda:0', grad_fn=<AddBackward0>)



 28%|██▊       | 99/354 [08:54<22:56,  5.40s/it][A

	loss_cls: tensor(0.5281, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0871, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6152, device='cuda:0', grad_fn=<AddBackward0>)



 28%|██▊       | 100/354 [08:59<22:53,  5.41s/it][A

	loss_cls: tensor(0.4947, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0843, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5791, device='cuda:0', grad_fn=<AddBackward0>)



 29%|██▊       | 101/354 [09:05<22:46,  5.40s/it][A

	loss_cls: tensor(0.5854, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2911, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8765, device='cuda:0', grad_fn=<AddBackward0>)



 29%|██▉       | 102/354 [09:10<22:46,  5.42s/it][A

	loss_cls: tensor(0.4823, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2246, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7069, device='cuda:0', grad_fn=<AddBackward0>)



 29%|██▉       | 103/354 [09:16<22:35,  5.40s/it][A

	loss_cls: tensor(0.7553, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2141, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9694, device='cuda:0', grad_fn=<AddBackward0>)



 29%|██▉       | 104/354 [09:21<22:28,  5.39s/it][A

	loss_cls: tensor(0.6979, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1321, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8300, device='cuda:0', grad_fn=<AddBackward0>)



 30%|██▉       | 105/354 [09:26<22:27,  5.41s/it][A

	loss_cls: tensor(0.3939, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4248, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8187, device='cuda:0', grad_fn=<AddBackward0>)



 30%|██▉       | 106/354 [09:32<22:19,  5.40s/it][A

	loss_cls: tensor(0.5022, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1724, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6746, device='cuda:0', grad_fn=<AddBackward0>)



 30%|███       | 107/354 [09:37<22:16,  5.41s/it][A

	loss_cls: tensor(0.4656, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1554, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6211, device='cuda:0', grad_fn=<AddBackward0>)



 31%|███       | 108/354 [09:43<22:08,  5.40s/it][A

	loss_cls: tensor(0.6215, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0954, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7169, device='cuda:0', grad_fn=<AddBackward0>)



 31%|███       | 109/354 [09:48<22:08,  5.42s/it][A

	loss_cls: tensor(0.4997, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2589, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7587, device='cuda:0', grad_fn=<AddBackward0>)



 31%|███       | 110/354 [09:53<22:00,  5.41s/it][A

	loss_cls: tensor(0.3431, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1088, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4519, device='cuda:0', grad_fn=<AddBackward0>)



 31%|███▏      | 111/354 [09:59<21:54,  5.41s/it][A

	loss_cls: tensor(0.4559, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3916, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8475, device='cuda:0', grad_fn=<AddBackward0>)



 32%|███▏      | 112/354 [10:04<21:54,  5.43s/it][A

	loss_cls: tensor(0.4512, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1899, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6411, device='cuda:0', grad_fn=<AddBackward0>)



 32%|███▏      | 113/354 [10:10<21:44,  5.41s/it][A

	loss_cls: tensor(0.5310, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1800, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7110, device='cuda:0', grad_fn=<AddBackward0>)



 32%|███▏      | 114/354 [10:15<21:42,  5.43s/it][A

	loss_cls: tensor(0.5615, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2278, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7892, device='cuda:0', grad_fn=<AddBackward0>)



 32%|███▏      | 115/354 [10:21<21:33,  5.41s/it][A

	loss_cls: tensor(0.4915, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1855, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6770, device='cuda:0', grad_fn=<AddBackward0>)



 33%|███▎      | 116/354 [10:26<21:32,  5.43s/it][A

	loss_cls: tensor(0.5991, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1332, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7323, device='cuda:0', grad_fn=<AddBackward0>)



 33%|███▎      | 117/354 [10:31<21:23,  5.42s/it][A

	loss_cls: tensor(0.6381, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1812, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8192, device='cuda:0', grad_fn=<AddBackward0>)



 33%|███▎      | 118/354 [10:37<21:16,  5.41s/it][A

	loss_cls: tensor(0.2473, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0347, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.2820, device='cuda:0', grad_fn=<AddBackward0>)



 34%|███▎      | 119/354 [10:42<21:13,  5.42s/it][A

	loss_cls: tensor(0.4568, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1555, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6123, device='cuda:0', grad_fn=<AddBackward0>)



 34%|███▍      | 120/354 [10:48<21:01,  5.39s/it][A

	loss_cls: tensor(0.4965, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2088, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7054, device='cuda:0', grad_fn=<AddBackward0>)



 34%|███▍      | 121/354 [10:53<21:01,  5.41s/it][A

	loss_cls: tensor(0.8839, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1040, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9878, device='cuda:0', grad_fn=<AddBackward0>)



 34%|███▍      | 122/354 [10:58<20:57,  5.42s/it][A

	loss_cls: tensor(0.4669, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2142, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6810, device='cuda:0', grad_fn=<AddBackward0>)



 35%|███▍      | 123/354 [11:04<20:55,  5.43s/it][A

	loss_cls: tensor(0.9152, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3221, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2372, device='cuda:0', grad_fn=<AddBackward0>)



 35%|███▌      | 124/354 [11:09<20:47,  5.42s/it][A

	loss_cls: tensor(0.5531, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2095, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7627, device='cuda:0', grad_fn=<AddBackward0>)



 35%|███▌      | 125/354 [11:15<20:49,  5.46s/it][A

	loss_cls: tensor(0.6617, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1787, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8404, device='cuda:0', grad_fn=<AddBackward0>)



 36%|███▌      | 126/354 [11:21<21:12,  5.58s/it][A

	loss_cls: tensor(0.8660, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1352, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0012, device='cuda:0', grad_fn=<AddBackward0>)



 36%|███▌      | 127/354 [11:26<21:08,  5.59s/it][A

	loss_cls: tensor(0.8639, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1491, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0130, device='cuda:0', grad_fn=<AddBackward0>)



 36%|███▌      | 128/354 [11:32<21:18,  5.66s/it][A

	loss_cls: tensor(0.6252, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1547, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7799, device='cuda:0', grad_fn=<AddBackward0>)



 36%|███▋      | 129/354 [11:38<21:05,  5.63s/it][A

	loss_cls: tensor(0.9557, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1312, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0868, device='cuda:0', grad_fn=<AddBackward0>)



 37%|███▋      | 130/354 [11:43<20:52,  5.59s/it][A

	loss_cls: tensor(1.1103, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0412, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1515, device='cuda:0', grad_fn=<AddBackward0>)



 37%|███▋      | 131/354 [11:49<20:34,  5.53s/it][A

	loss_cls: tensor(0.7187, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1462, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8649, device='cuda:0', grad_fn=<AddBackward0>)



 37%|███▋      | 132/354 [11:54<20:18,  5.49s/it][A

	loss_cls: tensor(0.6279, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2548, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8827, device='cuda:0', grad_fn=<AddBackward0>)



 38%|███▊      | 133/354 [11:59<20:08,  5.47s/it][A

	loss_cls: tensor(0.7341, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3983, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1324, device='cuda:0', grad_fn=<AddBackward0>)



 38%|███▊      | 134/354 [12:05<19:55,  5.43s/it][A

	loss_cls: tensor(0.5849, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0918, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6767, device='cuda:0', grad_fn=<AddBackward0>)



 38%|███▊      | 135/354 [12:10<19:54,  5.45s/it][A

	loss_cls: tensor(0.5019, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1093, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6112, device='cuda:0', grad_fn=<AddBackward0>)



 38%|███▊      | 136/354 [12:16<19:45,  5.44s/it][A

	loss_cls: tensor(0.8626, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3399, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2025, device='cuda:0', grad_fn=<AddBackward0>)



 39%|███▊      | 137/354 [12:21<19:41,  5.45s/it][A

	loss_cls: tensor(0.5737, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3067, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8805, device='cuda:0', grad_fn=<AddBackward0>)



 39%|███▉      | 138/354 [12:27<19:32,  5.43s/it][A

	loss_cls: tensor(1.1376, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1675, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.3051, device='cuda:0', grad_fn=<AddBackward0>)



 39%|███▉      | 139/354 [12:32<19:26,  5.43s/it][A

	loss_cls: tensor(0.5310, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2848, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8158, device='cuda:0', grad_fn=<AddBackward0>)



 40%|███▉      | 140/354 [12:37<19:23,  5.44s/it][A

	loss_cls: tensor(0.9050, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1763, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0813, device='cuda:0', grad_fn=<AddBackward0>)



 40%|███▉      | 141/354 [12:43<19:16,  5.43s/it][A

	loss_cls: tensor(0.4757, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2594, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7351, device='cuda:0', grad_fn=<AddBackward0>)



 40%|████      | 142/354 [12:48<19:15,  5.45s/it][A

	loss_cls: tensor(0.4994, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4031, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9025, device='cuda:0', grad_fn=<AddBackward0>)



 40%|████      | 143/354 [12:54<19:07,  5.44s/it][A

	loss_cls: tensor(0.5295, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2249, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7544, device='cuda:0', grad_fn=<AddBackward0>)



 41%|████      | 144/354 [12:59<19:05,  5.45s/it][A

	loss_cls: tensor(0.4606, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4073, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8679, device='cuda:0', grad_fn=<AddBackward0>)



 41%|████      | 145/354 [13:05<18:56,  5.44s/it][A

	loss_cls: tensor(0.6177, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3853, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0029, device='cuda:0', grad_fn=<AddBackward0>)



 41%|████      | 146/354 [13:10<18:47,  5.42s/it][A

	loss_cls: tensor(0.5129, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2071, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7200, device='cuda:0', grad_fn=<AddBackward0>)



 42%|████▏     | 147/354 [13:15<18:44,  5.43s/it][A

	loss_cls: tensor(0.5470, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3223, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8693, device='cuda:0', grad_fn=<AddBackward0>)



 42%|████▏     | 148/354 [13:21<18:36,  5.42s/it][A

	loss_cls: tensor(0.8207, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1635, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9842, device='cuda:0', grad_fn=<AddBackward0>)



 42%|████▏     | 149/354 [13:26<18:33,  5.43s/it][A

	loss_cls: tensor(0.9250, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2615, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1865, device='cuda:0', grad_fn=<AddBackward0>)



 42%|████▏     | 150/354 [13:32<18:26,  5.42s/it][A

	loss_cls: tensor(0.5323, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3075, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8398, device='cuda:0', grad_fn=<AddBackward0>)



 43%|████▎     | 151/354 [13:37<18:24,  5.44s/it][A

	loss_cls: tensor(0.4467, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2670, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7137, device='cuda:0', grad_fn=<AddBackward0>)



 43%|████▎     | 152/354 [13:43<18:16,  5.43s/it][A

	loss_cls: tensor(0.8783, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3598, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2381, device='cuda:0', grad_fn=<AddBackward0>)



 43%|████▎     | 153/354 [13:48<18:08,  5.42s/it][A

	loss_cls: tensor(0.6102, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5882, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1984, device='cuda:0', grad_fn=<AddBackward0>)



 44%|████▎     | 154/354 [13:53<18:06,  5.43s/it][A

	loss_cls: tensor(0.6839, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3011, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9850, device='cuda:0', grad_fn=<AddBackward0>)



 44%|████▍     | 155/354 [13:59<17:59,  5.43s/it][A

	loss_cls: tensor(0.5468, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1552, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7020, device='cuda:0', grad_fn=<AddBackward0>)



 44%|████▍     | 156/354 [14:04<17:57,  5.44s/it][A

	loss_cls: tensor(0.5167, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2830, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7997, device='cuda:0', grad_fn=<AddBackward0>)



 44%|████▍     | 157/354 [14:10<17:46,  5.42s/it][A

	loss_cls: tensor(0.7291, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1687, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8978, device='cuda:0', grad_fn=<AddBackward0>)



 45%|████▍     | 158/354 [14:15<17:45,  5.44s/it][A

	loss_cls: tensor(0.6738, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2143, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8881, device='cuda:0', grad_fn=<AddBackward0>)



 45%|████▍     | 159/354 [14:21<17:34,  5.41s/it][A

	loss_cls: tensor(0.5466, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2196, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7662, device='cuda:0', grad_fn=<AddBackward0>)



 45%|████▌     | 160/354 [14:26<17:30,  5.41s/it][A

	loss_cls: tensor(0.5658, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3808, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9466, device='cuda:0', grad_fn=<AddBackward0>)



 45%|████▌     | 161/354 [14:32<17:34,  5.46s/it][A

	loss_cls: tensor(0.7681, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2956, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0637, device='cuda:0', grad_fn=<AddBackward0>)



 46%|████▌     | 162/354 [14:37<17:30,  5.47s/it][A

	loss_cls: tensor(0.7729, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1707, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9436, device='cuda:0', grad_fn=<AddBackward0>)



 46%|████▌     | 163/354 [14:43<17:30,  5.50s/it][A

	loss_cls: tensor(0.7413, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1592, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9005, device='cuda:0', grad_fn=<AddBackward0>)



 46%|████▋     | 164/354 [14:48<17:23,  5.49s/it][A

	loss_cls: tensor(0.3889, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2764, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6653, device='cuda:0', grad_fn=<AddBackward0>)



 47%|████▋     | 165/354 [14:54<17:15,  5.48s/it][A

	loss_cls: tensor(0.4545, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1844, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6389, device='cuda:0', grad_fn=<AddBackward0>)



 47%|████▋     | 166/354 [14:59<17:09,  5.47s/it][A

	loss_cls: tensor(0.5940, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0859, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6798, device='cuda:0', grad_fn=<AddBackward0>)



 47%|████▋     | 167/354 [15:04<17:01,  5.46s/it][A

	loss_cls: tensor(0.5479, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2505, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7984, device='cuda:0', grad_fn=<AddBackward0>)



 47%|████▋     | 168/354 [15:10<17:01,  5.49s/it][A

	loss_cls: tensor(0.5488, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4898, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0387, device='cuda:0', grad_fn=<AddBackward0>)



 48%|████▊     | 169/354 [15:15<16:56,  5.50s/it][A

	loss_cls: tensor(0.5409, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1946, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7355, device='cuda:0', grad_fn=<AddBackward0>)



 48%|████▊     | 170/354 [15:21<16:27,  5.37s/it][A

	loss_cls: tensor(0.5471, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0808, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6280, device='cuda:0', grad_fn=<AddBackward0>)



 48%|████▊     | 171/354 [15:26<16:03,  5.27s/it][A

	loss_cls: tensor(0.7863, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4916, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2779, device='cuda:0', grad_fn=<AddBackward0>)



 49%|████▊     | 172/354 [15:31<15:48,  5.21s/it][A

	loss_cls: tensor(0.8369, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2185, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0554, device='cuda:0', grad_fn=<AddBackward0>)



 49%|████▉     | 173/354 [15:36<15:34,  5.16s/it][A

	loss_cls: tensor(0.6317, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2744, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9061, device='cuda:0', grad_fn=<AddBackward0>)



 49%|████▉     | 174/354 [15:41<15:48,  5.27s/it][A

	loss_cls: tensor(0.8489, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1885, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0374, device='cuda:0', grad_fn=<AddBackward0>)



 49%|████▉     | 175/354 [15:47<15:55,  5.34s/it][A

	loss_cls: tensor(0.6085, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1346, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7431, device='cuda:0', grad_fn=<AddBackward0>)



 50%|████▉     | 176/354 [15:52<15:59,  5.39s/it][A

	loss_cls: tensor(0.4433, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0731, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5165, device='cuda:0', grad_fn=<AddBackward0>)



 50%|█████     | 177/354 [15:58<16:03,  5.44s/it][A

	loss_cls: tensor(0.7595, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2102, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9697, device='cuda:0', grad_fn=<AddBackward0>)



 50%|█████     | 178/354 [16:03<15:59,  5.45s/it][A

	loss_cls: tensor(0.4827, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1513, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6340, device='cuda:0', grad_fn=<AddBackward0>)



 51%|█████     | 179/354 [16:09<15:58,  5.48s/it][A

	loss_cls: tensor(0.6027, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4791, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0818, device='cuda:0', grad_fn=<AddBackward0>)



 51%|█████     | 180/354 [16:14<15:54,  5.48s/it][A

	loss_cls: tensor(0.7725, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1505, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9230, device='cuda:0', grad_fn=<AddBackward0>)



 51%|█████     | 181/354 [16:20<15:51,  5.50s/it][A

	loss_cls: tensor(0.6764, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1700, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8464, device='cuda:0', grad_fn=<AddBackward0>)



 51%|█████▏    | 182/354 [16:25<15:43,  5.48s/it][A

	loss_cls: tensor(0.5445, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1352, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6797, device='cuda:0', grad_fn=<AddBackward0>)



 52%|█████▏    | 183/354 [16:31<15:38,  5.49s/it][A

	loss_cls: tensor(0.6882, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2017, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8898, device='cuda:0', grad_fn=<AddBackward0>)



 52%|█████▏    | 184/354 [16:36<15:35,  5.50s/it][A

	loss_cls: tensor(0.5467, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4081, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9548, device='cuda:0', grad_fn=<AddBackward0>)



 52%|█████▏    | 185/354 [16:42<15:29,  5.50s/it][A

	loss_cls: tensor(0.5624, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1351, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6975, device='cuda:0', grad_fn=<AddBackward0>)



 53%|█████▎    | 186/354 [16:47<15:23,  5.50s/it][A

	loss_cls: tensor(0.6016, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3455, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9471, device='cuda:0', grad_fn=<AddBackward0>)



 53%|█████▎    | 187/354 [16:53<15:09,  5.45s/it][A

	loss_cls: tensor(0.3790, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1687, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5476, device='cuda:0', grad_fn=<AddBackward0>)



 53%|█████▎    | 188/354 [16:58<15:05,  5.46s/it][A

	loss_cls: tensor(0.5310, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2781, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8090, device='cuda:0', grad_fn=<AddBackward0>)



 53%|█████▎    | 189/354 [17:03<14:56,  5.43s/it][A

	loss_cls: tensor(0.6811, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2794, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9605, device='cuda:0', grad_fn=<AddBackward0>)



 54%|█████▎    | 190/354 [17:09<14:48,  5.42s/it][A

	loss_cls: tensor(0.4778, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1032, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5810, device='cuda:0', grad_fn=<AddBackward0>)



 54%|█████▍    | 191/354 [17:14<14:45,  5.43s/it][A

	loss_cls: tensor(0.6960, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2891, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9851, device='cuda:0', grad_fn=<AddBackward0>)



 54%|█████▍    | 192/354 [17:20<14:37,  5.42s/it][A

	loss_cls: tensor(0.4732, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0642, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5374, device='cuda:0', grad_fn=<AddBackward0>)



 55%|█████▍    | 193/354 [17:25<14:32,  5.42s/it][A

	loss_cls: tensor(0.6190, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0980, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7170, device='cuda:0', grad_fn=<AddBackward0>)



 55%|█████▍    | 194/354 [17:31<14:24,  5.40s/it][A

	loss_cls: tensor(1.0514, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1370, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1883, device='cuda:0', grad_fn=<AddBackward0>)



 55%|█████▌    | 195/354 [17:36<14:22,  5.42s/it][A

	loss_cls: tensor(0.6737, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2343, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9080, device='cuda:0', grad_fn=<AddBackward0>)



 55%|█████▌    | 196/354 [17:41<14:17,  5.43s/it][A

	loss_cls: tensor(0.5786, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1690, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7476, device='cuda:0', grad_fn=<AddBackward0>)



 56%|█████▌    | 197/354 [17:47<14:11,  5.42s/it][A

	loss_cls: tensor(0.5855, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1726, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7581, device='cuda:0', grad_fn=<AddBackward0>)



 56%|█████▌    | 198/354 [17:52<14:08,  5.44s/it][A

	loss_cls: tensor(0.7280, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1466, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8746, device='cuda:0', grad_fn=<AddBackward0>)



 56%|█████▌    | 199/354 [17:58<13:59,  5.42s/it][A

	loss_cls: tensor(0.6313, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2140, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8453, device='cuda:0', grad_fn=<AddBackward0>)



 56%|█████▋    | 200/354 [18:03<13:55,  5.43s/it][A

	loss_cls: tensor(0.6694, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5077, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1771, device='cuda:0', grad_fn=<AddBackward0>)



 57%|█████▋    | 201/354 [18:09<13:48,  5.42s/it][A

	loss_cls: tensor(0.5639, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1049, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6688, device='cuda:0', grad_fn=<AddBackward0>)



 57%|█████▋    | 202/354 [18:14<13:46,  5.44s/it][A

	loss_cls: tensor(0.5343, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0285, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5628, device='cuda:0', grad_fn=<AddBackward0>)



 57%|█████▋    | 203/354 [18:19<13:38,  5.42s/it][A

	loss_cls: tensor(0.8665, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4060, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2725, device='cuda:0', grad_fn=<AddBackward0>)



 58%|█████▊    | 204/354 [18:25<13:30,  5.41s/it][A

	loss_cls: tensor(0.8724, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1928, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0652, device='cuda:0', grad_fn=<AddBackward0>)



 58%|█████▊    | 205/354 [18:30<13:26,  5.41s/it][A

	loss_cls: tensor(0.3878, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4612, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8491, device='cuda:0', grad_fn=<AddBackward0>)



 58%|█████▊    | 206/354 [18:36<13:18,  5.39s/it][A

	loss_cls: tensor(0.6022, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1629, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7651, device='cuda:0', grad_fn=<AddBackward0>)



 58%|█████▊    | 207/354 [18:41<13:14,  5.41s/it][A

	loss_cls: tensor(0.3951, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1888, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5839, device='cuda:0', grad_fn=<AddBackward0>)



 59%|█████▉    | 208/354 [18:46<13:08,  5.40s/it][A

	loss_cls: tensor(0.7323, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3277, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0599, device='cuda:0', grad_fn=<AddBackward0>)



 59%|█████▉    | 209/354 [18:52<13:05,  5.42s/it][A

	loss_cls: tensor(0.6465, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3043, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9508, device='cuda:0', grad_fn=<AddBackward0>)



 59%|█████▉    | 210/354 [18:57<12:56,  5.40s/it][A

	loss_cls: tensor(0.5629, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2704, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8333, device='cuda:0', grad_fn=<AddBackward0>)



 60%|█████▉    | 211/354 [19:03<12:49,  5.38s/it][A

	loss_cls: tensor(0.5483, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2769, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8252, device='cuda:0', grad_fn=<AddBackward0>)



 60%|█████▉    | 212/354 [19:08<12:45,  5.39s/it][A

	loss_cls: tensor(0.7664, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4568, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2232, device='cuda:0', grad_fn=<AddBackward0>)



 60%|██████    | 213/354 [19:13<12:37,  5.37s/it][A

	loss_cls: tensor(0.6237, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2470, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8707, device='cuda:0', grad_fn=<AddBackward0>)



 60%|██████    | 214/354 [19:19<12:34,  5.39s/it][A

	loss_cls: tensor(0.5977, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3948, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9925, device='cuda:0', grad_fn=<AddBackward0>)



 61%|██████    | 215/354 [19:24<12:27,  5.38s/it][A

	loss_cls: tensor(0.4998, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2240, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7238, device='cuda:0', grad_fn=<AddBackward0>)



 61%|██████    | 216/354 [19:29<12:24,  5.39s/it][A

	loss_cls: tensor(0.4931, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3960, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8892, device='cuda:0', grad_fn=<AddBackward0>)



 61%|██████▏   | 217/354 [19:35<12:17,  5.38s/it][A

	loss_cls: tensor(0.5823, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4795, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0619, device='cuda:0', grad_fn=<AddBackward0>)



 62%|██████▏   | 218/354 [19:40<12:10,  5.37s/it][A

	loss_cls: tensor(1.0280, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2200, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2480, device='cuda:0', grad_fn=<AddBackward0>)



 62%|██████▏   | 219/354 [19:46<12:07,  5.39s/it][A

	loss_cls: tensor(0.7371, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2172, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9543, device='cuda:0', grad_fn=<AddBackward0>)



 62%|██████▏   | 220/354 [19:51<11:59,  5.37s/it][A

	loss_cls: tensor(0.6181, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1752, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7932, device='cuda:0', grad_fn=<AddBackward0>)



 62%|██████▏   | 221/354 [19:56<11:56,  5.39s/it][A

	loss_cls: tensor(0.7057, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2794, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9851, device='cuda:0', grad_fn=<AddBackward0>)



 63%|██████▎   | 222/354 [20:02<11:50,  5.38s/it][A

	loss_cls: tensor(0.5317, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1547, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6864, device='cuda:0', grad_fn=<AddBackward0>)



 63%|██████▎   | 223/354 [20:07<11:47,  5.40s/it][A

	loss_cls: tensor(0.6309, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3032, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9341, device='cuda:0', grad_fn=<AddBackward0>)



 63%|██████▎   | 224/354 [20:13<11:39,  5.38s/it][A

	loss_cls: tensor(0.5952, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0501, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6453, device='cuda:0', grad_fn=<AddBackward0>)



 64%|██████▎   | 225/354 [20:18<11:32,  5.37s/it][A

	loss_cls: tensor(0.6987, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1117, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8104, device='cuda:0', grad_fn=<AddBackward0>)



 64%|██████▍   | 226/354 [20:23<11:29,  5.39s/it][A

	loss_cls: tensor(0.5779, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3408, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9187, device='cuda:0', grad_fn=<AddBackward0>)



 64%|██████▍   | 227/354 [20:29<11:22,  5.38s/it][A

	loss_cls: tensor(0.3537, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1164, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4701, device='cuda:0', grad_fn=<AddBackward0>)



 64%|██████▍   | 228/354 [20:34<11:19,  5.39s/it][A

	loss_cls: tensor(0.5683, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1195, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6877, device='cuda:0', grad_fn=<AddBackward0>)



 65%|██████▍   | 229/354 [20:39<11:12,  5.38s/it][A

	loss_cls: tensor(0.8298, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3158, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1456, device='cuda:0', grad_fn=<AddBackward0>)



 65%|██████▍   | 230/354 [20:45<11:09,  5.40s/it][A

	loss_cls: tensor(0.2168, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3887, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6055, device='cuda:0', grad_fn=<AddBackward0>)



 65%|██████▌   | 231/354 [20:50<11:01,  5.38s/it][A

	loss_cls: tensor(0.7317, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5735, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.3052, device='cuda:0', grad_fn=<AddBackward0>)



 66%|██████▌   | 232/354 [20:56<10:54,  5.37s/it][A

	loss_cls: tensor(0.6714, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1401, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8115, device='cuda:0', grad_fn=<AddBackward0>)



 66%|██████▌   | 233/354 [21:01<10:50,  5.38s/it][A

	loss_cls: tensor(0.5154, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3471, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8625, device='cuda:0', grad_fn=<AddBackward0>)



 66%|██████▌   | 234/354 [21:06<10:42,  5.36s/it][A

	loss_cls: tensor(0.5625, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2090, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7716, device='cuda:0', grad_fn=<AddBackward0>)



 66%|██████▋   | 235/354 [21:12<10:41,  5.39s/it][A

	loss_cls: tensor(0.9409, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3419, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2828, device='cuda:0', grad_fn=<AddBackward0>)



 67%|██████▋   | 236/354 [21:17<10:34,  5.37s/it][A

	loss_cls: tensor(0.6345, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4176, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0522, device='cuda:0', grad_fn=<AddBackward0>)



 67%|██████▋   | 237/354 [21:22<10:30,  5.39s/it][A

	loss_cls: tensor(0.6443, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3403, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9846, device='cuda:0', grad_fn=<AddBackward0>)



 67%|██████▋   | 238/354 [21:28<10:23,  5.38s/it][A

	loss_cls: tensor(0.7707, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3117, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0824, device='cuda:0', grad_fn=<AddBackward0>)



 68%|██████▊   | 239/354 [21:33<10:16,  5.36s/it][A

	loss_cls: tensor(0.4462, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2057, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6519, device='cuda:0', grad_fn=<AddBackward0>)



 68%|██████▊   | 240/354 [21:39<10:13,  5.38s/it][A

	loss_cls: tensor(0.7160, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1129, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8289, device='cuda:0', grad_fn=<AddBackward0>)



 68%|██████▊   | 241/354 [21:44<10:06,  5.37s/it][A

	loss_cls: tensor(1.0545, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2133, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2678, device='cuda:0', grad_fn=<AddBackward0>)



 68%|██████▊   | 242/354 [21:49<10:03,  5.39s/it][A

	loss_cls: tensor(0.4560, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3649, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8209, device='cuda:0', grad_fn=<AddBackward0>)



 69%|██████▊   | 243/354 [21:55<09:56,  5.37s/it][A

	loss_cls: tensor(0.4312, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3276, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7588, device='cuda:0', grad_fn=<AddBackward0>)



 69%|██████▉   | 244/354 [22:00<09:52,  5.39s/it][A

	loss_cls: tensor(0.6095, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3161, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9256, device='cuda:0', grad_fn=<AddBackward0>)



 69%|██████▉   | 245/354 [22:05<09:45,  5.37s/it][A

	loss_cls: tensor(0.9629, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3835, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.3464, device='cuda:0', grad_fn=<AddBackward0>)



 69%|██████▉   | 246/354 [22:11<09:39,  5.37s/it][A

	loss_cls: tensor(0.7255, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2484, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9739, device='cuda:0', grad_fn=<AddBackward0>)



 70%|██████▉   | 247/354 [22:16<09:35,  5.38s/it][A

	loss_cls: tensor(0.5492, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3767, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9259, device='cuda:0', grad_fn=<AddBackward0>)



 70%|███████   | 248/354 [22:22<09:29,  5.37s/it][A

	loss_cls: tensor(1.0958, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2624, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.3582, device='cuda:0', grad_fn=<AddBackward0>)



 70%|███████   | 249/354 [22:27<09:25,  5.39s/it][A

	loss_cls: tensor(0.4868, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2605, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7473, device='cuda:0', grad_fn=<AddBackward0>)



 71%|███████   | 250/354 [22:32<09:20,  5.39s/it][A

	loss_cls: tensor(0.5482, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0493, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5975, device='cuda:0', grad_fn=<AddBackward0>)



 71%|███████   | 251/354 [22:38<09:15,  5.40s/it][A

	loss_cls: tensor(0.5681, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0714, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6395, device='cuda:0', grad_fn=<AddBackward0>)



 71%|███████   | 252/354 [22:43<09:09,  5.39s/it][A

	loss_cls: tensor(0.6105, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3486, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9591, device='cuda:0', grad_fn=<AddBackward0>)



 71%|███████▏  | 253/354 [22:48<09:02,  5.37s/it][A

	loss_cls: tensor(0.5100, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2179, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7279, device='cuda:0', grad_fn=<AddBackward0>)



 72%|███████▏  | 254/354 [22:54<08:58,  5.38s/it][A

	loss_cls: tensor(0.3915, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2093, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6008, device='cuda:0', grad_fn=<AddBackward0>)



 72%|███████▏  | 255/354 [22:59<08:52,  5.38s/it][A

	loss_cls: tensor(0.5043, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1305, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6347, device='cuda:0', grad_fn=<AddBackward0>)



 72%|███████▏  | 256/354 [23:05<08:48,  5.40s/it][A

	loss_cls: tensor(0.5228, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2286, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7514, device='cuda:0', grad_fn=<AddBackward0>)



 73%|███████▎  | 257/354 [23:10<08:42,  5.38s/it][A

	loss_cls: tensor(0.3505, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2354, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5859, device='cuda:0', grad_fn=<AddBackward0>)



 73%|███████▎  | 258/354 [23:15<08:38,  5.40s/it][A

	loss_cls: tensor(0.4868, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2391, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7259, device='cuda:0', grad_fn=<AddBackward0>)



 73%|███████▎  | 259/354 [23:21<08:31,  5.38s/it][A

	loss_cls: tensor(0.7773, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2118, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9892, device='cuda:0', grad_fn=<AddBackward0>)



 73%|███████▎  | 260/354 [23:26<08:24,  5.37s/it][A

	loss_cls: tensor(1.0867, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5357, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.6225, device='cuda:0', grad_fn=<AddBackward0>)



 74%|███████▎  | 261/354 [23:32<08:21,  5.39s/it][A

	loss_cls: tensor(0.4236, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3120, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7356, device='cuda:0', grad_fn=<AddBackward0>)



 74%|███████▍  | 262/354 [23:37<08:14,  5.38s/it][A

	loss_cls: tensor(0.4649, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2854, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7503, device='cuda:0', grad_fn=<AddBackward0>)



 74%|███████▍  | 263/354 [23:42<08:10,  5.39s/it][A

	loss_cls: tensor(0.4838, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0753, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5591, device='cuda:0', grad_fn=<AddBackward0>)



 75%|███████▍  | 264/354 [23:48<08:04,  5.38s/it][A

	loss_cls: tensor(0.6815, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2487, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9303, device='cuda:0', grad_fn=<AddBackward0>)



 75%|███████▍  | 265/354 [23:53<08:00,  5.40s/it][A

	loss_cls: tensor(0.3844, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1623, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5466, device='cuda:0', grad_fn=<AddBackward0>)



 75%|███████▌  | 266/354 [23:59<07:53,  5.38s/it][A

	loss_cls: tensor(0.5328, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0618, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5946, device='cuda:0', grad_fn=<AddBackward0>)



 75%|███████▌  | 267/354 [24:04<07:46,  5.37s/it][A

	loss_cls: tensor(0.3189, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1926, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5116, device='cuda:0', grad_fn=<AddBackward0>)



 76%|███████▌  | 268/354 [24:09<07:43,  5.39s/it][A

	loss_cls: tensor(0.7239, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1257, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8497, device='cuda:0', grad_fn=<AddBackward0>)



 76%|███████▌  | 269/354 [24:15<07:37,  5.38s/it][A

	loss_cls: tensor(0.8511, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1862, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0373, device='cuda:0', grad_fn=<AddBackward0>)



 76%|███████▋  | 270/354 [24:20<07:33,  5.39s/it][A

	loss_cls: tensor(0.6367, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2255, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8622, device='cuda:0', grad_fn=<AddBackward0>)



 77%|███████▋  | 271/354 [24:25<07:26,  5.38s/it][A

	loss_cls: tensor(0.4507, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0515, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5021, device='cuda:0', grad_fn=<AddBackward0>)



 77%|███████▋  | 272/354 [24:31<07:22,  5.39s/it][A

	loss_cls: tensor(0.3997, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2009, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6007, device='cuda:0', grad_fn=<AddBackward0>)



 77%|███████▋  | 273/354 [24:36<07:15,  5.38s/it][A

	loss_cls: tensor(0.5191, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0935, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6126, device='cuda:0', grad_fn=<AddBackward0>)



 77%|███████▋  | 274/354 [24:42<07:11,  5.39s/it][A

	loss_cls: tensor(0.5095, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1164, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6259, device='cuda:0', grad_fn=<AddBackward0>)



 78%|███████▊  | 275/354 [24:47<07:05,  5.39s/it][A

	loss_cls: tensor(0.4587, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1478, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6066, device='cuda:0', grad_fn=<AddBackward0>)



 78%|███████▊  | 276/354 [24:52<06:59,  5.38s/it][A

	loss_cls: tensor(0.8300, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1054, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9354, device='cuda:0', grad_fn=<AddBackward0>)



 78%|███████▊  | 277/354 [24:58<06:55,  5.40s/it][A

	loss_cls: tensor(0.6266, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2394, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8660, device='cuda:0', grad_fn=<AddBackward0>)



 79%|███████▊  | 278/354 [25:03<06:49,  5.38s/it][A

	loss_cls: tensor(0.4677, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0328, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5005, device='cuda:0', grad_fn=<AddBackward0>)



 79%|███████▉  | 279/354 [25:09<06:44,  5.39s/it][A

	loss_cls: tensor(0.6912, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4415, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1327, device='cuda:0', grad_fn=<AddBackward0>)



 79%|███████▉  | 280/354 [25:14<06:37,  5.37s/it][A

	loss_cls: tensor(0.3400, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0627, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4027, device='cuda:0', grad_fn=<AddBackward0>)



 79%|███████▉  | 281/354 [25:19<06:33,  5.39s/it][A

	loss_cls: tensor(0.6002, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5992, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1994, device='cuda:0', grad_fn=<AddBackward0>)



 80%|███████▉  | 282/354 [25:25<06:27,  5.38s/it][A

	loss_cls: tensor(0.4832, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3045, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7877, device='cuda:0', grad_fn=<AddBackward0>)



 80%|███████▉  | 283/354 [25:30<06:21,  5.37s/it][A

	loss_cls: tensor(0.5297, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3633, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8930, device='cuda:0', grad_fn=<AddBackward0>)



 80%|████████  | 284/354 [25:35<06:16,  5.39s/it][A

	loss_cls: tensor(0.4361, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1561, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5922, device='cuda:0', grad_fn=<AddBackward0>)



 81%|████████  | 285/354 [25:41<06:10,  5.37s/it][A

	loss_cls: tensor(0.7186, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1623, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8809, device='cuda:0', grad_fn=<AddBackward0>)



 81%|████████  | 286/354 [25:46<06:06,  5.39s/it][A

	loss_cls: tensor(0.4704, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3647, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8351, device='cuda:0', grad_fn=<AddBackward0>)



 81%|████████  | 287/354 [25:51<05:59,  5.37s/it][A

	loss_cls: tensor(0.5001, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1803, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6805, device='cuda:0', grad_fn=<AddBackward0>)



 81%|████████▏ | 288/354 [25:57<05:55,  5.39s/it][A

	loss_cls: tensor(0.5877, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3267, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9144, device='cuda:0', grad_fn=<AddBackward0>)



 82%|████████▏ | 289/354 [26:02<05:49,  5.38s/it][A

	loss_cls: tensor(0.5786, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4078, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9864, device='cuda:0', grad_fn=<AddBackward0>)



 82%|████████▏ | 290/354 [26:08<05:43,  5.37s/it][A

	loss_cls: tensor(0.6502, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2960, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9462, device='cuda:0', grad_fn=<AddBackward0>)



 82%|████████▏ | 291/354 [26:13<05:38,  5.38s/it][A

	loss_cls: tensor(0.6331, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1584, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7915, device='cuda:0', grad_fn=<AddBackward0>)



 82%|████████▏ | 292/354 [26:18<05:32,  5.37s/it][A

	loss_cls: tensor(0.3753, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3082, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6835, device='cuda:0', grad_fn=<AddBackward0>)



 83%|████████▎ | 293/354 [26:24<05:28,  5.38s/it][A

	loss_cls: tensor(1.0303, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0635, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0937, device='cuda:0', grad_fn=<AddBackward0>)



 83%|████████▎ | 294/354 [26:29<05:21,  5.36s/it][A

	loss_cls: tensor(0.4867, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3638, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8505, device='cuda:0', grad_fn=<AddBackward0>)



 83%|████████▎ | 295/354 [26:35<05:18,  5.39s/it][A

	loss_cls: tensor(0.7096, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2791, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9887, device='cuda:0', grad_fn=<AddBackward0>)



 84%|████████▎ | 296/354 [26:40<05:12,  5.38s/it][A

	loss_cls: tensor(0.7222, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1487, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8710, device='cuda:0', grad_fn=<AddBackward0>)



 84%|████████▍ | 297/354 [26:45<05:06,  5.37s/it][A

	loss_cls: tensor(0.6967, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1236, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8203, device='cuda:0', grad_fn=<AddBackward0>)



 84%|████████▍ | 298/354 [26:51<05:01,  5.39s/it][A

	loss_cls: tensor(0.6445, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2619, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9064, device='cuda:0', grad_fn=<AddBackward0>)



 84%|████████▍ | 299/354 [26:56<04:55,  5.38s/it][A

	loss_cls: tensor(0.7564, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3211, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0775, device='cuda:0', grad_fn=<AddBackward0>)



 85%|████████▍ | 300/354 [27:01<04:51,  5.39s/it][A

	loss_cls: tensor(0.7752, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1626, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9378, device='cuda:0', grad_fn=<AddBackward0>)



 85%|████████▌ | 301/354 [27:07<04:45,  5.38s/it][A

	loss_cls: tensor(0.6458, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1444, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7902, device='cuda:0', grad_fn=<AddBackward0>)



 85%|████████▌ | 302/354 [27:12<04:41,  5.40s/it][A

	loss_cls: tensor(0.4317, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1643, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5961, device='cuda:0', grad_fn=<AddBackward0>)



 86%|████████▌ | 303/354 [27:18<04:34,  5.39s/it][A

	loss_cls: tensor(0.4928, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2277, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7205, device='cuda:0', grad_fn=<AddBackward0>)



 86%|████████▌ | 304/354 [27:23<04:28,  5.38s/it][A

	loss_cls: tensor(0.4943, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2652, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7595, device='cuda:0', grad_fn=<AddBackward0>)



 86%|████████▌ | 305/354 [27:28<04:24,  5.39s/it][A

	loss_cls: tensor(0.6576, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1263, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7839, device='cuda:0', grad_fn=<AddBackward0>)



 86%|████████▋ | 306/354 [27:34<04:17,  5.37s/it][A

	loss_cls: tensor(1.3769, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1452, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.5221, device='cuda:0', grad_fn=<AddBackward0>)



 87%|████████▋ | 307/354 [27:39<04:12,  5.38s/it][A

	loss_cls: tensor(0.4688, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3740, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8428, device='cuda:0', grad_fn=<AddBackward0>)



 87%|████████▋ | 308/354 [27:45<04:07,  5.37s/it][A

	loss_cls: tensor(0.5255, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3714, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8969, device='cuda:0', grad_fn=<AddBackward0>)



 87%|████████▋ | 309/354 [27:50<04:02,  5.39s/it][A

	loss_cls: tensor(0.5463, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4754, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0217, device='cuda:0', grad_fn=<AddBackward0>)



 88%|████████▊ | 310/354 [27:55<03:56,  5.38s/it][A

	loss_cls: tensor(0.7514, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3024, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0538, device='cuda:0', grad_fn=<AddBackward0>)



 88%|████████▊ | 311/354 [28:01<03:50,  5.36s/it][A

	loss_cls: tensor(0.3865, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1458, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5323, device='cuda:0', grad_fn=<AddBackward0>)



 88%|████████▊ | 312/354 [28:06<03:46,  5.38s/it][A

	loss_cls: tensor(0.6175, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1789, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7964, device='cuda:0', grad_fn=<AddBackward0>)



 88%|████████▊ | 313/354 [28:11<03:40,  5.37s/it][A

	loss_cls: tensor(0.6099, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2079, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8178, device='cuda:0', grad_fn=<AddBackward0>)



 89%|████████▊ | 314/354 [28:17<03:35,  5.39s/it][A

	loss_cls: tensor(0.5861, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5247, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1108, device='cuda:0', grad_fn=<AddBackward0>)



 89%|████████▉ | 315/354 [28:22<03:29,  5.38s/it][A

	loss_cls: tensor(0.6864, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2696, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9560, device='cuda:0', grad_fn=<AddBackward0>)



 89%|████████▉ | 316/354 [28:28<03:24,  5.39s/it][A

	loss_cls: tensor(0.7222, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2889, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0112, device='cuda:0', grad_fn=<AddBackward0>)



 90%|████████▉ | 317/354 [28:33<03:18,  5.38s/it][A

	loss_cls: tensor(0.4609, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4869, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9478, device='cuda:0', grad_fn=<AddBackward0>)



 90%|████████▉ | 318/354 [28:38<03:13,  5.37s/it][A

	loss_cls: tensor(0.7816, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1214, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9030, device='cuda:0', grad_fn=<AddBackward0>)



 90%|█████████ | 319/354 [28:44<03:08,  5.39s/it][A

	loss_cls: tensor(0.4477, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2088, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6564, device='cuda:0', grad_fn=<AddBackward0>)



 90%|█████████ | 320/354 [28:49<03:02,  5.37s/it][A

	loss_cls: tensor(0.7668, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1048, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8715, device='cuda:0', grad_fn=<AddBackward0>)



 91%|█████████ | 321/354 [28:54<02:57,  5.39s/it][A

	loss_cls: tensor(0.5060, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1910, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6970, device='cuda:0', grad_fn=<AddBackward0>)



 91%|█████████ | 322/354 [29:00<02:52,  5.38s/it][A

	loss_cls: tensor(0.6172, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2326, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8498, device='cuda:0', grad_fn=<AddBackward0>)



 91%|█████████ | 323/354 [29:05<02:47,  5.40s/it][A

	loss_cls: tensor(0.5520, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2130, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7651, device='cuda:0', grad_fn=<AddBackward0>)



 92%|█████████▏| 324/354 [29:11<02:41,  5.38s/it][A

	loss_cls: tensor(0.5296, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0858, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6153, device='cuda:0', grad_fn=<AddBackward0>)



 92%|█████████▏| 325/354 [29:16<02:35,  5.36s/it][A

	loss_cls: tensor(0.4574, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2422, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6996, device='cuda:0', grad_fn=<AddBackward0>)



 92%|█████████▏| 326/354 [29:21<02:30,  5.38s/it][A

	loss_cls: tensor(0.6987, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3064, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0051, device='cuda:0', grad_fn=<AddBackward0>)



 92%|█████████▏| 327/354 [29:27<02:24,  5.36s/it][A

	loss_cls: tensor(0.9416, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1387, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0803, device='cuda:0', grad_fn=<AddBackward0>)



 93%|█████████▎| 328/354 [29:32<02:19,  5.38s/it][A

	loss_cls: tensor(0.4642, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2057, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6698, device='cuda:0', grad_fn=<AddBackward0>)



 93%|█████████▎| 329/354 [29:37<02:14,  5.37s/it][A

	loss_cls: tensor(0.6216, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2649, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8865, device='cuda:0', grad_fn=<AddBackward0>)



 93%|█████████▎| 330/354 [29:43<02:09,  5.39s/it][A

	loss_cls: tensor(0.5055, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2291, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7346, device='cuda:0', grad_fn=<AddBackward0>)



 94%|█████████▎| 331/354 [29:48<02:03,  5.38s/it][A

	loss_cls: tensor(0.5063, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2065, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7128, device='cuda:0', grad_fn=<AddBackward0>)



 94%|█████████▍| 332/354 [29:54<01:57,  5.36s/it][A

	loss_cls: tensor(0.4536, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5688, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0224, device='cuda:0', grad_fn=<AddBackward0>)



 94%|█████████▍| 333/354 [29:59<01:52,  5.38s/it][A

	loss_cls: tensor(0.6044, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1646, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7690, device='cuda:0', grad_fn=<AddBackward0>)



 94%|█████████▍| 334/354 [30:04<01:47,  5.37s/it][A

	loss_cls: tensor(0.5506, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1620, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7126, device='cuda:0', grad_fn=<AddBackward0>)



 95%|█████████▍| 335/354 [30:10<01:42,  5.39s/it][A

	loss_cls: tensor(0.5630, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0951, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6581, device='cuda:0', grad_fn=<AddBackward0>)



 95%|█████████▍| 336/354 [30:15<01:36,  5.38s/it][A

	loss_cls: tensor(0.4989, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1461, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6449, device='cuda:0', grad_fn=<AddBackward0>)



 95%|█████████▌| 337/354 [30:21<01:31,  5.40s/it][A

	loss_cls: tensor(0.5468, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4093, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9562, device='cuda:0', grad_fn=<AddBackward0>)



 95%|█████████▌| 338/354 [30:26<01:26,  5.38s/it][A

	loss_cls: tensor(0.6112, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1757, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7870, device='cuda:0', grad_fn=<AddBackward0>)



 96%|█████████▌| 339/354 [30:31<01:20,  5.37s/it][A

	loss_cls: tensor(0.5976, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1580, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7556, device='cuda:0', grad_fn=<AddBackward0>)



 96%|█████████▌| 340/354 [30:37<01:15,  5.39s/it][A

	loss_cls: tensor(0.5789, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2124, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7913, device='cuda:0', grad_fn=<AddBackward0>)



 96%|█████████▋| 341/354 [30:42<01:09,  5.38s/it][A

	loss_cls: tensor(0.4043, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1250, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5293, device='cuda:0', grad_fn=<AddBackward0>)



 97%|█████████▋| 342/354 [30:47<01:04,  5.40s/it][A

	loss_cls: tensor(0.6243, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2331, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8574, device='cuda:0', grad_fn=<AddBackward0>)



 97%|█████████▋| 343/354 [30:53<00:59,  5.39s/it][A

	loss_cls: tensor(0.6842, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4879, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1721, device='cuda:0', grad_fn=<AddBackward0>)



 97%|█████████▋| 344/354 [30:58<00:54,  5.41s/it][A

	loss_cls: tensor(0.2999, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1131, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4130, device='cuda:0', grad_fn=<AddBackward0>)



 97%|█████████▋| 345/354 [31:04<00:48,  5.39s/it][A

	loss_cls: tensor(0.6197, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2355, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8551, device='cuda:0', grad_fn=<AddBackward0>)



 98%|█████████▊| 346/354 [31:09<00:43,  5.38s/it][A

	loss_cls: tensor(0.6356, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1304, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7660, device='cuda:0', grad_fn=<AddBackward0>)



 98%|█████████▊| 347/354 [31:14<00:37,  5.39s/it][A

	loss_cls: tensor(0.6034, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2478, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8512, device='cuda:0', grad_fn=<AddBackward0>)



 98%|█████████▊| 348/354 [31:20<00:32,  5.38s/it][A

	loss_cls: tensor(0.5700, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1419, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7119, device='cuda:0', grad_fn=<AddBackward0>)



 99%|█████████▊| 349/354 [31:25<00:26,  5.40s/it][A

	loss_cls: tensor(0.5721, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3317, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9038, device='cuda:0', grad_fn=<AddBackward0>)



 99%|█████████▉| 350/354 [31:31<00:21,  5.38s/it][A

	loss_cls: tensor(0.5324, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4859, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0183, device='cuda:0', grad_fn=<AddBackward0>)



 99%|█████████▉| 351/354 [31:36<00:16,  5.40s/it][A

	loss_cls: tensor(0.3199, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0403, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.3601, device='cuda:0', grad_fn=<AddBackward0>)



 99%|█████████▉| 352/354 [31:41<00:10,  5.38s/it][A

	loss_cls: tensor(0.6554, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1797, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8351, device='cuda:0', grad_fn=<AddBackward0>)



100%|█████████▉| 353/354 [31:47<00:05,  5.36s/it][A

	loss_cls: tensor(0.6320, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1120, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7441, device='cuda:0', grad_fn=<AddBackward0>)



100%|██████████| 354/354 [31:49<00:00,  5.39s/it][A

  0%|          | 0/177 [00:00<?, ?it/s][A

	loss_cls: tensor(0.4671, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3106, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7777, device='cuda:0', grad_fn=<AddBackward0>)

	Training Loss: 0.8397592051049411

	Training cls acc: 0.6883239171374764

	Training cls prec: 0.5743465789546298

	Training cls rec: 0.6178023130353638

	Training cls f1: 0.5330464886305865

--
	Training ner acc: 0.9553025349117198

	Training ner prec: 0.2520663273036473

	Training ner rec: 0.25992878782840917

	Training ner f1: 0.2553963542585461

	Current Learning rate:  0.0008285714285714286



  1%|          | 1/177 [00:00<02:08,  1.37it/s][A
  1%|          | 2/177 [00:01<01:58,  1.48it/s][A
  2%|▏         | 3/177 [00:02<02:01,  1.43it/s][A
  2%|▏         | 4/177 [00:02<02:02,  1.41it/s][A
  3%|▎         | 5/177 [00:03<01:57,  1.46it/s][A
  3%|▎         | 6/177 [00:04<01:59,  1.43it/s][A
  4%|▍         | 7/177 [00:04<02:00,  1.41it/s][A
  5%|▍         | 8/177 [00:05<02:00,  1.40it/s][A
  5%|▌         | 9/177 [00:06<01:56,  1.45it/s][A
  6%|▌         | 10/177 [00:07<01:57,  1.42it/s][A
  6%|▌         | 11/177 [00:07<01:57,  1.41it/s][A
  7%|▋         | 12/177 [00:08<01:57,  1.40it/s][A
  7%|▋         | 13/177 [00:09<01:53,  1.44it/s][A
  8%|▊         | 14/177 [00:09<01:54,  1.42it/s][A
  8%|▊         | 15/177 [00:10<01:55,  1.41it/s][A
  9%|▉         | 16/177 [00:11<01:51,  1.45it/s][A
 10%|▉         | 17/177 [00:11<01:52,  1.43it/s][A
 10%|█         | 18/177 [00:12<01:52,  1.41it/s][A
 11%|█         | 19/177 [00:13<01:52,  1.40it/s][A
 11%|█▏        | 20/


	Validation Loss: 0.7876062076644036

	Validation cls acc: 0.678201506591337

	Validation cls prec: 0.597807371536185

	Validation cls rec: 0.5749663707290825

	Validation cls f1: 0.5320481307769444

--
	Validation ner acc: 0.9540424666218263

	Validation ner prec: 0.43561141257521757

	Validation ner rec: 0.4463276836158192

	Validation ner f1: 0.44073336979305683



  0%|          | 1/354 [00:05<32:28,  5.52s/it][A

	loss_cls: tensor(0.4730, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1093, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5823, device='cuda:0', grad_fn=<AddBackward0>)



  1%|          | 2/354 [00:10<32:00,  5.46s/it][A

	loss_cls: tensor(0.5412, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1595, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7007, device='cuda:0', grad_fn=<AddBackward0>)



  1%|          | 3/354 [00:16<31:59,  5.47s/it][A

	loss_cls: tensor(0.7203, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0950, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8153, device='cuda:0', grad_fn=<AddBackward0>)



  1%|          | 4/354 [00:21<31:49,  5.46s/it][A

	loss_cls: tensor(0.3844, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0885, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4728, device='cuda:0', grad_fn=<AddBackward0>)



  1%|▏         | 5/354 [00:27<31:47,  5.47s/it][A

	loss_cls: tensor(0.5581, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0513, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6094, device='cuda:0', grad_fn=<AddBackward0>)



  2%|▏         | 6/354 [00:32<31:37,  5.45s/it][A

	loss_cls: tensor(0.7258, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4065, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1323, device='cuda:0', grad_fn=<AddBackward0>)



  2%|▏         | 7/354 [00:38<31:28,  5.44s/it][A

	loss_cls: tensor(0.3838, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4282, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8120, device='cuda:0', grad_fn=<AddBackward0>)



  2%|▏         | 8/354 [00:43<31:26,  5.45s/it][A

	loss_cls: tensor(0.3777, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5003, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8780, device='cuda:0', grad_fn=<AddBackward0>)



  3%|▎         | 9/354 [00:49<31:14,  5.43s/it][A

	loss_cls: tensor(0.4065, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2033, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6098, device='cuda:0', grad_fn=<AddBackward0>)



  3%|▎         | 10/354 [00:54<31:13,  5.45s/it][A

	loss_cls: tensor(0.5260, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2427, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7687, device='cuda:0', grad_fn=<AddBackward0>)



  3%|▎         | 11/354 [00:59<31:05,  5.44s/it][A

	loss_cls: tensor(0.7215, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2651, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9866, device='cuda:0', grad_fn=<AddBackward0>)



  3%|▎         | 12/354 [01:05<31:08,  5.46s/it][A

	loss_cls: tensor(0.6190, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1382, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7573, device='cuda:0', grad_fn=<AddBackward0>)



  4%|▎         | 13/354 [01:10<30:59,  5.45s/it][A

	loss_cls: tensor(0.9042, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3452, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2493, device='cuda:0', grad_fn=<AddBackward0>)



  4%|▍         | 14/354 [01:16<30:50,  5.44s/it][A

	loss_cls: tensor(0.4945, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2706, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7651, device='cuda:0', grad_fn=<AddBackward0>)



  4%|▍         | 15/354 [01:21<30:50,  5.46s/it][A

	loss_cls: tensor(0.4259, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1633, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5892, device='cuda:0', grad_fn=<AddBackward0>)



  5%|▍         | 16/354 [01:27<30:34,  5.43s/it][A

	loss_cls: tensor(0.4837, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3932, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8770, device='cuda:0', grad_fn=<AddBackward0>)



  5%|▍         | 17/354 [01:32<30:36,  5.45s/it][A

	loss_cls: tensor(0.8270, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2366, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0636, device='cuda:0', grad_fn=<AddBackward0>)



  5%|▌         | 18/354 [01:38<30:29,  5.45s/it][A

	loss_cls: tensor(0.4101, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4007, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8109, device='cuda:0', grad_fn=<AddBackward0>)



  5%|▌         | 19/354 [01:43<30:26,  5.45s/it][A

	loss_cls: tensor(0.3803, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3203, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7006, device='cuda:0', grad_fn=<AddBackward0>)



  6%|▌         | 20/354 [01:48<30:15,  5.44s/it][A

	loss_cls: tensor(0.5024, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1343, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6367, device='cuda:0', grad_fn=<AddBackward0>)



  6%|▌         | 21/354 [01:54<30:02,  5.41s/it][A

	loss_cls: tensor(0.9289, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3434, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2723, device='cuda:0', grad_fn=<AddBackward0>)



  6%|▌         | 22/354 [01:59<30:03,  5.43s/it][A

	loss_cls: tensor(0.7334, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2375, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9709, device='cuda:0', grad_fn=<AddBackward0>)



  6%|▋         | 23/354 [02:05<29:52,  5.42s/it][A

	loss_cls: tensor(0.6249, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3732, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9980, device='cuda:0', grad_fn=<AddBackward0>)



  7%|▋         | 24/354 [02:10<29:53,  5.43s/it][A

	loss_cls: tensor(0.5857, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3524, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9381, device='cuda:0', grad_fn=<AddBackward0>)



  7%|▋         | 25/354 [02:16<29:44,  5.42s/it][A

	loss_cls: tensor(0.3802, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2010, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5811, device='cuda:0', grad_fn=<AddBackward0>)



  7%|▋         | 26/354 [02:21<29:43,  5.44s/it][A

	loss_cls: tensor(0.4274, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4041, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8316, device='cuda:0', grad_fn=<AddBackward0>)



  8%|▊         | 27/354 [02:26<29:33,  5.42s/it][A

	loss_cls: tensor(0.8007, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1923, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9930, device='cuda:0', grad_fn=<AddBackward0>)



  8%|▊         | 28/354 [02:32<29:23,  5.41s/it][A

	loss_cls: tensor(0.5463, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3262, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8725, device='cuda:0', grad_fn=<AddBackward0>)



  8%|▊         | 29/354 [02:37<29:25,  5.43s/it][A

	loss_cls: tensor(0.4859, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0802, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5661, device='cuda:0', grad_fn=<AddBackward0>)



  8%|▊         | 30/354 [02:43<29:15,  5.42s/it][A

	loss_cls: tensor(0.4581, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3433, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8014, device='cuda:0', grad_fn=<AddBackward0>)



  9%|▉         | 31/354 [02:48<29:18,  5.45s/it][A

	loss_cls: tensor(1.1066, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1300, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2367, device='cuda:0', grad_fn=<AddBackward0>)



  9%|▉         | 32/354 [02:54<29:09,  5.43s/it][A

	loss_cls: tensor(0.7492, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2518, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0010, device='cuda:0', grad_fn=<AddBackward0>)



  9%|▉         | 33/354 [02:59<29:07,  5.44s/it][A

	loss_cls: tensor(0.8800, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1749, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0550, device='cuda:0', grad_fn=<AddBackward0>)



 10%|▉         | 34/354 [03:04<28:56,  5.43s/it][A

	loss_cls: tensor(0.4403, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0974, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5378, device='cuda:0', grad_fn=<AddBackward0>)



 10%|▉         | 35/354 [03:10<28:47,  5.42s/it][A

	loss_cls: tensor(0.5444, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1139, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6583, device='cuda:0', grad_fn=<AddBackward0>)



 10%|█         | 36/354 [03:15<28:45,  5.43s/it][A

	loss_cls: tensor(0.4637, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2018, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6654, device='cuda:0', grad_fn=<AddBackward0>)



 10%|█         | 37/354 [03:21<28:36,  5.42s/it][A

	loss_cls: tensor(0.2755, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1578, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4333, device='cuda:0', grad_fn=<AddBackward0>)



 11%|█         | 38/354 [03:26<28:37,  5.44s/it][A

	loss_cls: tensor(0.3754, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0613, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4367, device='cuda:0', grad_fn=<AddBackward0>)



 11%|█         | 39/354 [03:32<28:29,  5.43s/it][A

	loss_cls: tensor(0.6915, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3289, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0204, device='cuda:0', grad_fn=<AddBackward0>)



 11%|█▏        | 40/354 [03:37<28:28,  5.44s/it][A

	loss_cls: tensor(0.3193, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2089, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5282, device='cuda:0', grad_fn=<AddBackward0>)



 12%|█▏        | 41/354 [03:42<28:16,  5.42s/it][A

	loss_cls: tensor(0.3372, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1560, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4932, device='cuda:0', grad_fn=<AddBackward0>)



 12%|█▏        | 42/354 [03:48<28:03,  5.40s/it][A

	loss_cls: tensor(0.6810, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4242, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1052, device='cuda:0', grad_fn=<AddBackward0>)



 12%|█▏        | 43/354 [03:53<28:01,  5.41s/it][A

	loss_cls: tensor(0.7922, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3470, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1392, device='cuda:0', grad_fn=<AddBackward0>)



 12%|█▏        | 44/354 [03:59<27:55,  5.40s/it][A

	loss_cls: tensor(0.4100, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0935, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5035, device='cuda:0', grad_fn=<AddBackward0>)



 13%|█▎        | 45/354 [04:04<27:54,  5.42s/it][A

	loss_cls: tensor(0.5296, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0418, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5714, device='cuda:0', grad_fn=<AddBackward0>)



 13%|█▎        | 46/354 [04:09<27:47,  5.41s/it][A

	loss_cls: tensor(0.4509, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0858, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5367, device='cuda:0', grad_fn=<AddBackward0>)



 13%|█▎        | 47/354 [04:15<27:46,  5.43s/it][A

	loss_cls: tensor(0.8710, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3908, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2618, device='cuda:0', grad_fn=<AddBackward0>)



 14%|█▎        | 48/354 [04:20<27:38,  5.42s/it][A

	loss_cls: tensor(0.4595, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2263, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6858, device='cuda:0', grad_fn=<AddBackward0>)



 14%|█▍        | 49/354 [04:26<27:27,  5.40s/it][A

	loss_cls: tensor(0.6397, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1951, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8348, device='cuda:0', grad_fn=<AddBackward0>)



 14%|█▍        | 50/354 [04:31<27:28,  5.42s/it][A

	loss_cls: tensor(0.9135, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5581, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.4715, device='cuda:0', grad_fn=<AddBackward0>)



 14%|█▍        | 51/354 [04:37<27:20,  5.41s/it][A

	loss_cls: tensor(0.5820, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1382, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7202, device='cuda:0', grad_fn=<AddBackward0>)



 15%|█▍        | 52/354 [04:42<27:21,  5.43s/it][A

	loss_cls: tensor(0.4512, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1352, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5864, device='cuda:0', grad_fn=<AddBackward0>)



 15%|█▍        | 53/354 [04:47<27:13,  5.43s/it][A

	loss_cls: tensor(0.9153, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1958, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1110, device='cuda:0', grad_fn=<AddBackward0>)



 15%|█▌        | 54/354 [04:53<27:11,  5.44s/it][A

	loss_cls: tensor(0.6441, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2046, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8487, device='cuda:0', grad_fn=<AddBackward0>)



 16%|█▌        | 55/354 [04:58<27:02,  5.43s/it][A

	loss_cls: tensor(0.6397, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1700, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8097, device='cuda:0', grad_fn=<AddBackward0>)



 16%|█▌        | 56/354 [05:04<26:55,  5.42s/it][A

	loss_cls: tensor(0.6712, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3797, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0509, device='cuda:0', grad_fn=<AddBackward0>)



 16%|█▌        | 57/354 [05:09<26:57,  5.45s/it][A

	loss_cls: tensor(0.4201, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1976, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6177, device='cuda:0', grad_fn=<AddBackward0>)



 16%|█▋        | 58/354 [05:15<26:48,  5.43s/it][A

	loss_cls: tensor(0.6244, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3329, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9573, device='cuda:0', grad_fn=<AddBackward0>)



 17%|█▋        | 59/354 [05:20<26:49,  5.46s/it][A

	loss_cls: tensor(0.4863, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2774, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7637, device='cuda:0', grad_fn=<AddBackward0>)



 17%|█▋        | 60/354 [05:25<26:38,  5.44s/it][A

	loss_cls: tensor(0.8564, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2781, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1345, device='cuda:0', grad_fn=<AddBackward0>)



 17%|█▋        | 61/354 [05:31<26:38,  5.45s/it][A

	loss_cls: tensor(0.8554, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0934, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9488, device='cuda:0', grad_fn=<AddBackward0>)



 18%|█▊        | 62/354 [05:36<26:25,  5.43s/it][A

	loss_cls: tensor(0.6862, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2315, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9177, device='cuda:0', grad_fn=<AddBackward0>)



 18%|█▊        | 63/354 [05:42<26:14,  5.41s/it][A

	loss_cls: tensor(0.9596, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1546, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1142, device='cuda:0', grad_fn=<AddBackward0>)



 18%|█▊        | 64/354 [05:47<26:16,  5.43s/it][A

	loss_cls: tensor(0.5302, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4284, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9586, device='cuda:0', grad_fn=<AddBackward0>)



 18%|█▊        | 65/354 [05:53<26:08,  5.43s/it][A

	loss_cls: tensor(0.5484, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2693, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8177, device='cuda:0', grad_fn=<AddBackward0>)



 19%|█▊        | 66/354 [05:58<26:06,  5.44s/it][A

	loss_cls: tensor(0.7729, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4683, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2413, device='cuda:0', grad_fn=<AddBackward0>)



 19%|█▉        | 67/354 [06:03<25:56,  5.42s/it][A

	loss_cls: tensor(0.6280, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2438, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8719, device='cuda:0', grad_fn=<AddBackward0>)



 19%|█▉        | 68/354 [06:09<25:54,  5.43s/it][A

	loss_cls: tensor(0.7696, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1930, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9626, device='cuda:0', grad_fn=<AddBackward0>)



 19%|█▉        | 69/354 [06:14<25:43,  5.42s/it][A

	loss_cls: tensor(0.5429, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2302, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7731, device='cuda:0', grad_fn=<AddBackward0>)



 20%|█▉        | 70/354 [06:20<25:41,  5.43s/it][A

	loss_cls: tensor(0.6304, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1618, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7923, device='cuda:0', grad_fn=<AddBackward0>)



 20%|██        | 71/354 [06:25<25:32,  5.41s/it][A

	loss_cls: tensor(0.6596, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1159, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7755, device='cuda:0', grad_fn=<AddBackward0>)



 20%|██        | 72/354 [06:31<25:24,  5.40s/it][A

	loss_cls: tensor(0.5099, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1296, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6395, device='cuda:0', grad_fn=<AddBackward0>)



 21%|██        | 73/354 [06:36<25:24,  5.43s/it][A

	loss_cls: tensor(0.5407, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5215, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0622, device='cuda:0', grad_fn=<AddBackward0>)



 21%|██        | 74/354 [06:41<25:14,  5.41s/it][A

	loss_cls: tensor(0.6918, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5602, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2519, device='cuda:0', grad_fn=<AddBackward0>)



 21%|██        | 75/354 [06:47<25:13,  5.42s/it][A

	loss_cls: tensor(0.9201, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3383, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2584, device='cuda:0', grad_fn=<AddBackward0>)



 21%|██▏       | 76/354 [06:52<25:08,  5.42s/it][A

	loss_cls: tensor(0.6507, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2533, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9039, device='cuda:0', grad_fn=<AddBackward0>)



 22%|██▏       | 77/354 [06:58<25:04,  5.43s/it][A

	loss_cls: tensor(0.5645, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1896, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7541, device='cuda:0', grad_fn=<AddBackward0>)



 22%|██▏       | 78/354 [07:03<24:57,  5.43s/it][A

	loss_cls: tensor(0.3418, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3837, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7255, device='cuda:0', grad_fn=<AddBackward0>)



 22%|██▏       | 79/354 [07:08<24:48,  5.41s/it][A

	loss_cls: tensor(0.9000, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1981, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0982, device='cuda:0', grad_fn=<AddBackward0>)



 23%|██▎       | 80/354 [07:14<24:47,  5.43s/it][A

	loss_cls: tensor(0.6878, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2437, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9314, device='cuda:0', grad_fn=<AddBackward0>)



 23%|██▎       | 81/354 [07:19<24:40,  5.42s/it][A

	loss_cls: tensor(0.8235, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3240, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1475, device='cuda:0', grad_fn=<AddBackward0>)



 23%|██▎       | 82/354 [07:25<24:38,  5.44s/it][A

	loss_cls: tensor(0.8987, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3650, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2638, device='cuda:0', grad_fn=<AddBackward0>)



 23%|██▎       | 83/354 [07:30<24:27,  5.41s/it][A

	loss_cls: tensor(0.8057, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0837, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8894, device='cuda:0', grad_fn=<AddBackward0>)



 24%|██▎       | 84/354 [07:36<24:27,  5.44s/it][A

	loss_cls: tensor(0.6353, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2806, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9159, device='cuda:0', grad_fn=<AddBackward0>)



 24%|██▍       | 85/354 [07:41<24:18,  5.42s/it][A

	loss_cls: tensor(0.6164, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2048, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8213, device='cuda:0', grad_fn=<AddBackward0>)



 24%|██▍       | 86/354 [07:46<24:09,  5.41s/it][A

	loss_cls: tensor(0.5772, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0945, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6718, device='cuda:0', grad_fn=<AddBackward0>)



 25%|██▍       | 87/354 [07:52<24:08,  5.42s/it][A

	loss_cls: tensor(0.6907, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1450, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8357, device='cuda:0', grad_fn=<AddBackward0>)



 25%|██▍       | 88/354 [07:57<24:00,  5.41s/it][A

	loss_cls: tensor(0.5174, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0902, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6075, device='cuda:0', grad_fn=<AddBackward0>)



 25%|██▌       | 89/354 [08:03<23:56,  5.42s/it][A

	loss_cls: tensor(0.5972, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1843, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7815, device='cuda:0', grad_fn=<AddBackward0>)



 25%|██▌       | 90/354 [08:08<23:49,  5.41s/it][A

	loss_cls: tensor(0.6724, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2823, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9547, device='cuda:0', grad_fn=<AddBackward0>)



 26%|██▌       | 91/354 [08:14<23:56,  5.46s/it][A

	loss_cls: tensor(0.9674, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5355, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.5029, device='cuda:0', grad_fn=<AddBackward0>)



 26%|██▌       | 92/354 [08:19<23:50,  5.46s/it][A

	loss_cls: tensor(0.4353, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0943, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5296, device='cuda:0', grad_fn=<AddBackward0>)



 26%|██▋       | 93/354 [08:25<23:43,  5.45s/it][A

	loss_cls: tensor(0.3489, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0880, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4370, device='cuda:0', grad_fn=<AddBackward0>)



 27%|██▋       | 94/354 [08:30<23:45,  5.48s/it][A

	loss_cls: tensor(0.6241, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4250, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0491, device='cuda:0', grad_fn=<AddBackward0>)



 27%|██▋       | 95/354 [08:36<23:36,  5.47s/it][A

	loss_cls: tensor(0.5047, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1392, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6438, device='cuda:0', grad_fn=<AddBackward0>)



 27%|██▋       | 96/354 [08:41<23:36,  5.49s/it][A

	loss_cls: tensor(0.6873, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1532, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8404, device='cuda:0', grad_fn=<AddBackward0>)



 27%|██▋       | 97/354 [08:46<22:54,  5.35s/it][A

	loss_cls: tensor(0.7316, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1152, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8468, device='cuda:0', grad_fn=<AddBackward0>)



 28%|██▊       | 98/354 [08:52<22:52,  5.36s/it][A

	loss_cls: tensor(0.8987, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2064, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1051, device='cuda:0', grad_fn=<AddBackward0>)



 28%|██▊       | 99/354 [08:57<22:54,  5.39s/it][A

	loss_cls: tensor(0.6789, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1909, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8698, device='cuda:0', grad_fn=<AddBackward0>)



 28%|██▊       | 100/354 [09:02<22:54,  5.41s/it][A

	loss_cls: tensor(0.9476, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2391, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1866, device='cuda:0', grad_fn=<AddBackward0>)



 29%|██▊       | 101/354 [09:08<22:59,  5.45s/it][A

	loss_cls: tensor(0.5412, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1380, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6792, device='cuda:0', grad_fn=<AddBackward0>)



 29%|██▉       | 102/354 [09:13<22:54,  5.45s/it][A

	loss_cls: tensor(0.6082, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1991, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8073, device='cuda:0', grad_fn=<AddBackward0>)



 29%|██▉       | 103/354 [09:19<22:56,  5.48s/it][A

	loss_cls: tensor(0.6579, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1662, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8241, device='cuda:0', grad_fn=<AddBackward0>)



 29%|██▉       | 104/354 [09:24<22:49,  5.48s/it][A

	loss_cls: tensor(0.5527, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1208, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6734, device='cuda:0', grad_fn=<AddBackward0>)



 30%|██▉       | 105/354 [09:30<22:47,  5.49s/it][A

	loss_cls: tensor(0.5964, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5643, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1606, device='cuda:0', grad_fn=<AddBackward0>)



 30%|██▉       | 106/354 [09:35<22:39,  5.48s/it][A

	loss_cls: tensor(0.7028, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3984, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1012, device='cuda:0', grad_fn=<AddBackward0>)



 30%|███       | 107/354 [09:41<22:33,  5.48s/it][A

	loss_cls: tensor(0.4825, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0989, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5815, device='cuda:0', grad_fn=<AddBackward0>)



 31%|███       | 108/354 [09:46<22:31,  5.49s/it][A

	loss_cls: tensor(0.5567, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3787, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9354, device='cuda:0', grad_fn=<AddBackward0>)



 31%|███       | 109/354 [09:52<22:23,  5.48s/it][A

	loss_cls: tensor(0.6412, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2581, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8994, device='cuda:0', grad_fn=<AddBackward0>)



 31%|███       | 110/354 [09:57<22:22,  5.50s/it][A

	loss_cls: tensor(0.8869, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1155, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0024, device='cuda:0', grad_fn=<AddBackward0>)



 31%|███▏      | 111/354 [10:03<22:15,  5.49s/it][A

	loss_cls: tensor(0.7697, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2852, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0549, device='cuda:0', grad_fn=<AddBackward0>)



 32%|███▏      | 112/354 [10:08<22:13,  5.51s/it][A

	loss_cls: tensor(0.6083, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3885, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9968, device='cuda:0', grad_fn=<AddBackward0>)



 32%|███▏      | 113/354 [10:14<22:02,  5.49s/it][A

	loss_cls: tensor(0.8757, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2254, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1011, device='cuda:0', grad_fn=<AddBackward0>)



 32%|███▏      | 114/354 [10:19<21:55,  5.48s/it][A

	loss_cls: tensor(0.7699, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1444, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9143, device='cuda:0', grad_fn=<AddBackward0>)



 32%|███▏      | 115/354 [10:25<21:54,  5.50s/it][A

	loss_cls: tensor(0.6935, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2148, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9083, device='cuda:0', grad_fn=<AddBackward0>)



 33%|███▎      | 116/354 [10:30<21:44,  5.48s/it][A

	loss_cls: tensor(0.7312, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1771, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9083, device='cuda:0', grad_fn=<AddBackward0>)



 33%|███▎      | 117/354 [10:36<21:43,  5.50s/it][A

	loss_cls: tensor(0.7517, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2208, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9725, device='cuda:0', grad_fn=<AddBackward0>)



 33%|███▎      | 118/354 [10:41<21:35,  5.49s/it][A

	loss_cls: tensor(0.7303, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1395, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8697, device='cuda:0', grad_fn=<AddBackward0>)



 34%|███▎      | 119/354 [10:47<21:32,  5.50s/it][A

	loss_cls: tensor(0.5801, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2903, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8704, device='cuda:0', grad_fn=<AddBackward0>)



 34%|███▍      | 120/354 [10:52<21:25,  5.49s/it][A

	loss_cls: tensor(0.7519, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1241, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8760, device='cuda:0', grad_fn=<AddBackward0>)



 34%|███▍      | 121/354 [10:58<21:18,  5.49s/it][A

	loss_cls: tensor(0.7167, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0524, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7691, device='cuda:0', grad_fn=<AddBackward0>)



 34%|███▍      | 122/354 [11:03<21:16,  5.50s/it][A

	loss_cls: tensor(0.5705, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1684, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7389, device='cuda:0', grad_fn=<AddBackward0>)



 35%|███▍      | 123/354 [11:09<21:06,  5.48s/it][A

	loss_cls: tensor(0.6715, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.7894, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.4609, device='cuda:0', grad_fn=<AddBackward0>)



 35%|███▌      | 124/354 [11:14<21:08,  5.51s/it][A

	loss_cls: tensor(0.6788, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2331, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9120, device='cuda:0', grad_fn=<AddBackward0>)



 35%|███▌      | 125/354 [11:20<21:00,  5.50s/it][A

	loss_cls: tensor(0.5117, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0997, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6114, device='cuda:0', grad_fn=<AddBackward0>)



 36%|███▌      | 126/354 [11:25<20:57,  5.51s/it][A

	loss_cls: tensor(0.7360, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3456, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0817, device='cuda:0', grad_fn=<AddBackward0>)



 36%|███▌      | 127/354 [11:31<20:44,  5.48s/it][A

	loss_cls: tensor(0.7405, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2939, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0344, device='cuda:0', grad_fn=<AddBackward0>)



 36%|███▌      | 128/354 [11:36<20:36,  5.47s/it][A

	loss_cls: tensor(0.6436, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1542, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7977, device='cuda:0', grad_fn=<AddBackward0>)



 36%|███▋      | 129/354 [11:42<20:36,  5.49s/it][A

	loss_cls: tensor(0.7859, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2432, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0291, device='cuda:0', grad_fn=<AddBackward0>)



 37%|███▋      | 130/354 [11:47<20:29,  5.49s/it][A

	loss_cls: tensor(0.5071, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1235, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6306, device='cuda:0', grad_fn=<AddBackward0>)



 37%|███▋      | 131/354 [11:53<20:28,  5.51s/it][A

	loss_cls: tensor(0.7006, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1964, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8971, device='cuda:0', grad_fn=<AddBackward0>)



 37%|███▋      | 132/354 [11:58<20:19,  5.50s/it][A

	loss_cls: tensor(0.7767, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3455, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1222, device='cuda:0', grad_fn=<AddBackward0>)



 38%|███▊      | 133/354 [12:04<20:17,  5.51s/it][A

	loss_cls: tensor(0.6558, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1665, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8223, device='cuda:0', grad_fn=<AddBackward0>)



 38%|███▊      | 134/354 [12:09<20:09,  5.50s/it][A

	loss_cls: tensor(0.5305, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1259, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6564, device='cuda:0', grad_fn=<AddBackward0>)



 38%|███▊      | 135/354 [12:15<20:02,  5.49s/it][A

	loss_cls: tensor(0.6198, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3843, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0040, device='cuda:0', grad_fn=<AddBackward0>)



 38%|███▊      | 136/354 [12:20<20:00,  5.51s/it][A

	loss_cls: tensor(0.5510, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1055, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6564, device='cuda:0', grad_fn=<AddBackward0>)



 39%|███▊      | 137/354 [12:26<19:50,  5.49s/it][A

	loss_cls: tensor(0.7594, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1433, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9027, device='cuda:0', grad_fn=<AddBackward0>)



 39%|███▉      | 138/354 [12:31<19:49,  5.51s/it][A

	loss_cls: tensor(0.5711, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0922, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6632, device='cuda:0', grad_fn=<AddBackward0>)



 39%|███▉      | 139/354 [12:37<19:45,  5.51s/it][A

	loss_cls: tensor(0.6694, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3669, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0364, device='cuda:0', grad_fn=<AddBackward0>)



 40%|███▉      | 140/354 [12:42<19:42,  5.53s/it][A

	loss_cls: tensor(0.5827, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5086, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0912, device='cuda:0', grad_fn=<AddBackward0>)



 40%|███▉      | 141/354 [12:47<19:03,  5.37s/it][A

	loss_cls: tensor(0.5053, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2057, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7110, device='cuda:0', grad_fn=<AddBackward0>)



 40%|████      | 142/354 [12:53<18:46,  5.32s/it][A

	loss_cls: tensor(0.4561, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4240, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8801, device='cuda:0', grad_fn=<AddBackward0>)



 40%|████      | 143/354 [12:58<18:55,  5.38s/it][A

	loss_cls: tensor(0.5963, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0794, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6757, device='cuda:0', grad_fn=<AddBackward0>)



 41%|████      | 144/354 [13:04<18:55,  5.41s/it][A

	loss_cls: tensor(0.6132, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2119, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8250, device='cuda:0', grad_fn=<AddBackward0>)



 41%|████      | 145/354 [13:09<18:57,  5.44s/it][A

	loss_cls: tensor(0.6415, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2270, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8685, device='cuda:0', grad_fn=<AddBackward0>)



 41%|████      | 146/354 [13:15<18:53,  5.45s/it][A

	loss_cls: tensor(0.5108, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2166, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7274, device='cuda:0', grad_fn=<AddBackward0>)



 42%|████▏     | 147/354 [13:20<18:53,  5.47s/it][A

	loss_cls: tensor(0.7241, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2824, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0064, device='cuda:0', grad_fn=<AddBackward0>)



 42%|████▏     | 148/354 [13:26<18:46,  5.47s/it][A

	loss_cls: tensor(0.5311, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2593, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7904, device='cuda:0', grad_fn=<AddBackward0>)



 42%|████▏     | 149/354 [13:31<18:38,  5.46s/it][A

	loss_cls: tensor(0.6958, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2339, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9297, device='cuda:0', grad_fn=<AddBackward0>)



 42%|████▏     | 150/354 [13:37<18:37,  5.48s/it][A

	loss_cls: tensor(0.8467, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2658, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1126, device='cuda:0', grad_fn=<AddBackward0>)



 43%|████▎     | 151/354 [13:42<18:31,  5.47s/it][A

	loss_cls: tensor(0.6863, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1899, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8763, device='cuda:0', grad_fn=<AddBackward0>)



 43%|████▎     | 152/354 [13:48<18:29,  5.49s/it][A

	loss_cls: tensor(0.6226, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2533, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8759, device='cuda:0', grad_fn=<AddBackward0>)



 43%|████▎     | 153/354 [13:53<18:21,  5.48s/it][A

	loss_cls: tensor(0.5429, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1277, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6706, device='cuda:0', grad_fn=<AddBackward0>)



 44%|████▎     | 154/354 [13:59<18:19,  5.50s/it][A

	loss_cls: tensor(0.6212, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0745, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6957, device='cuda:0', grad_fn=<AddBackward0>)



 44%|████▍     | 155/354 [14:04<18:10,  5.48s/it][A

	loss_cls: tensor(0.4251, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3382, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7632, device='cuda:0', grad_fn=<AddBackward0>)



 44%|████▍     | 156/354 [14:09<18:04,  5.48s/it][A

	loss_cls: tensor(0.6407, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1846, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8253, device='cuda:0', grad_fn=<AddBackward0>)



 44%|████▍     | 157/354 [14:15<18:04,  5.51s/it][A

	loss_cls: tensor(0.6151, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2125, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8276, device='cuda:0', grad_fn=<AddBackward0>)



 45%|████▍     | 158/354 [14:21<17:57,  5.50s/it][A

	loss_cls: tensor(0.5706, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3293, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8998, device='cuda:0', grad_fn=<AddBackward0>)



 45%|████▍     | 159/354 [14:26<17:55,  5.52s/it][A

	loss_cls: tensor(0.4583, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1235, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5818, device='cuda:0', grad_fn=<AddBackward0>)



 45%|████▌     | 160/354 [14:32<17:48,  5.51s/it][A

	loss_cls: tensor(0.4419, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0251, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4671, device='cuda:0', grad_fn=<AddBackward0>)



 45%|████▌     | 161/354 [14:37<17:44,  5.51s/it][A

	loss_cls: tensor(1.0272, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2475, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2747, device='cuda:0', grad_fn=<AddBackward0>)



 46%|████▌     | 162/354 [14:43<17:35,  5.50s/it][A

	loss_cls: tensor(0.4054, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1270, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5325, device='cuda:0', grad_fn=<AddBackward0>)



 46%|████▌     | 163/354 [14:48<17:25,  5.47s/it][A

	loss_cls: tensor(0.7900, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5084, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2984, device='cuda:0', grad_fn=<AddBackward0>)



 46%|████▋     | 164/354 [14:54<17:24,  5.50s/it][A

	loss_cls: tensor(0.7020, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2306, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9326, device='cuda:0', grad_fn=<AddBackward0>)



 47%|████▋     | 165/354 [14:59<17:19,  5.50s/it][A

	loss_cls: tensor(0.6499, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1205, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7703, device='cuda:0', grad_fn=<AddBackward0>)



 47%|████▋     | 166/354 [15:05<17:17,  5.52s/it][A

	loss_cls: tensor(0.4428, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1699, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6128, device='cuda:0', grad_fn=<AddBackward0>)



 47%|████▋     | 167/354 [15:10<17:07,  5.50s/it][A

	loss_cls: tensor(0.7972, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2016, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9988, device='cuda:0', grad_fn=<AddBackward0>)



 47%|████▋     | 168/354 [15:16<17:03,  5.50s/it][A

	loss_cls: tensor(0.4719, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1912, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6631, device='cuda:0', grad_fn=<AddBackward0>)



 48%|████▊     | 169/354 [15:21<16:55,  5.49s/it][A

	loss_cls: tensor(0.4091, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2427, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6518, device='cuda:0', grad_fn=<AddBackward0>)



 48%|████▊     | 170/354 [15:27<16:50,  5.49s/it][A

	loss_cls: tensor(0.4761, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0981, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5742, device='cuda:0', grad_fn=<AddBackward0>)



 48%|████▊     | 171/354 [15:32<16:44,  5.49s/it][A

	loss_cls: tensor(0.6593, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3729, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0322, device='cuda:0', grad_fn=<AddBackward0>)



 49%|████▊     | 172/354 [15:37<16:38,  5.48s/it][A

	loss_cls: tensor(0.6818, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2187, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9005, device='cuda:0', grad_fn=<AddBackward0>)



 49%|████▉     | 173/354 [15:43<16:37,  5.51s/it][A

	loss_cls: tensor(0.3253, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2132, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5385, device='cuda:0', grad_fn=<AddBackward0>)



 49%|████▉     | 174/354 [15:48<16:29,  5.50s/it][A

	loss_cls: tensor(0.4975, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1215, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6190, device='cuda:0', grad_fn=<AddBackward0>)



 49%|████▉     | 175/354 [15:54<16:26,  5.51s/it][A

	loss_cls: tensor(0.5442, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4978, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0420, device='cuda:0', grad_fn=<AddBackward0>)



 50%|████▉     | 176/354 [15:59<15:59,  5.39s/it][A

	loss_cls: tensor(0.7308, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0771, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8079, device='cuda:0', grad_fn=<AddBackward0>)



 50%|█████     | 177/354 [16:05<16:02,  5.44s/it][A

	loss_cls: tensor(0.7537, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0560, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8097, device='cuda:0', grad_fn=<AddBackward0>)



 50%|█████     | 178/354 [16:10<15:58,  5.44s/it][A

	loss_cls: tensor(1.0468, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1242, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1710, device='cuda:0', grad_fn=<AddBackward0>)



 51%|█████     | 179/354 [16:16<15:53,  5.45s/it][A

	loss_cls: tensor(0.5688, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2306, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7994, device='cuda:0', grad_fn=<AddBackward0>)



 51%|█████     | 180/354 [16:21<15:52,  5.48s/it][A

	loss_cls: tensor(0.5012, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4960, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9972, device='cuda:0', grad_fn=<AddBackward0>)



 51%|█████     | 181/354 [16:27<15:45,  5.47s/it][A

	loss_cls: tensor(0.4844, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1332, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6175, device='cuda:0', grad_fn=<AddBackward0>)



 51%|█████▏    | 182/354 [16:32<15:43,  5.48s/it][A

	loss_cls: tensor(0.5974, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2512, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8486, device='cuda:0', grad_fn=<AddBackward0>)



 52%|█████▏    | 183/354 [16:38<15:35,  5.47s/it][A

	loss_cls: tensor(0.8943, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0714, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9657, device='cuda:0', grad_fn=<AddBackward0>)



 52%|█████▏    | 184/354 [16:43<15:32,  5.49s/it][A

	loss_cls: tensor(0.4468, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1630, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6098, device='cuda:0', grad_fn=<AddBackward0>)



 52%|█████▏    | 185/354 [16:49<15:27,  5.49s/it][A

	loss_cls: tensor(0.4979, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5766, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0746, device='cuda:0', grad_fn=<AddBackward0>)



 53%|█████▎    | 186/354 [16:54<15:20,  5.48s/it][A

	loss_cls: tensor(0.4361, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3162, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7523, device='cuda:0', grad_fn=<AddBackward0>)



 53%|█████▎    | 187/354 [17:00<15:16,  5.49s/it][A

	loss_cls: tensor(0.6863, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1515, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8379, device='cuda:0', grad_fn=<AddBackward0>)



 53%|█████▎    | 188/354 [17:05<15:11,  5.49s/it][A

	loss_cls: tensor(0.4598, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2335, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6933, device='cuda:0', grad_fn=<AddBackward0>)



 53%|█████▎    | 189/354 [17:11<15:07,  5.50s/it][A

	loss_cls: tensor(0.6519, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3831, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0349, device='cuda:0', grad_fn=<AddBackward0>)



 54%|█████▎    | 190/354 [17:16<15:02,  5.51s/it][A

	loss_cls: tensor(0.5954, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3248, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9202, device='cuda:0', grad_fn=<AddBackward0>)



 54%|█████▍    | 191/354 [17:22<14:58,  5.51s/it][A

	loss_cls: tensor(0.7181, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1679, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8859, device='cuda:0', grad_fn=<AddBackward0>)



 54%|█████▍    | 192/354 [17:27<14:52,  5.51s/it][A

	loss_cls: tensor(0.9220, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2278, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1498, device='cuda:0', grad_fn=<AddBackward0>)



 55%|█████▍    | 193/354 [17:33<14:45,  5.50s/it][A

	loss_cls: tensor(0.5098, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0997, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6096, device='cuda:0', grad_fn=<AddBackward0>)



 55%|█████▍    | 194/354 [17:38<14:42,  5.52s/it][A

	loss_cls: tensor(0.7050, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2975, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0025, device='cuda:0', grad_fn=<AddBackward0>)



 55%|█████▌    | 195/354 [17:44<14:34,  5.50s/it][A

	loss_cls: tensor(0.5105, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3343, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8448, device='cuda:0', grad_fn=<AddBackward0>)



 55%|█████▌    | 196/354 [17:49<14:30,  5.51s/it][A

	loss_cls: tensor(0.6457, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0985, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7441, device='cuda:0', grad_fn=<AddBackward0>)



 56%|█████▌    | 197/354 [17:55<14:26,  5.52s/it][A

	loss_cls: tensor(0.5347, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1870, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7218, device='cuda:0', grad_fn=<AddBackward0>)



 56%|█████▌    | 198/354 [18:00<14:22,  5.53s/it][A

	loss_cls: tensor(0.4000, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0674, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4674, device='cuda:0', grad_fn=<AddBackward0>)



 56%|█████▌    | 199/354 [18:06<14:14,  5.51s/it][A

	loss_cls: tensor(0.4414, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4406, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8821, device='cuda:0', grad_fn=<AddBackward0>)



 56%|█████▋    | 200/354 [18:11<14:06,  5.50s/it][A

	loss_cls: tensor(0.5858, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2730, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8588, device='cuda:0', grad_fn=<AddBackward0>)



 57%|█████▋    | 201/354 [18:17<14:03,  5.51s/it][A

	loss_cls: tensor(1.3140, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0679, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.3819, device='cuda:0', grad_fn=<AddBackward0>)



 57%|█████▋    | 202/354 [18:22<13:55,  5.50s/it][A

	loss_cls: tensor(0.8473, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1611, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0084, device='cuda:0', grad_fn=<AddBackward0>)



 57%|█████▋    | 203/354 [18:28<13:50,  5.50s/it][A

	loss_cls: tensor(0.6103, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0843, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6946, device='cuda:0', grad_fn=<AddBackward0>)



 58%|█████▊    | 204/354 [18:33<13:46,  5.51s/it][A

	loss_cls: tensor(0.6045, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1472, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7516, device='cuda:0', grad_fn=<AddBackward0>)



 58%|█████▊    | 205/354 [18:39<13:44,  5.53s/it][A

	loss_cls: tensor(0.5627, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0645, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6272, device='cuda:0', grad_fn=<AddBackward0>)



 58%|█████▊    | 206/354 [18:44<13:35,  5.51s/it][A

	loss_cls: tensor(0.4841, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1828, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6669, device='cuda:0', grad_fn=<AddBackward0>)



 58%|█████▊    | 207/354 [18:50<13:23,  5.47s/it][A

	loss_cls: tensor(0.5788, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2474, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8262, device='cuda:0', grad_fn=<AddBackward0>)



 59%|█████▉    | 208/354 [18:55<13:20,  5.49s/it][A

	loss_cls: tensor(0.5346, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3183, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8529, device='cuda:0', grad_fn=<AddBackward0>)



 59%|█████▉    | 209/354 [19:01<13:12,  5.47s/it][A

	loss_cls: tensor(0.4653, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2081, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6734, device='cuda:0', grad_fn=<AddBackward0>)



 59%|█████▉    | 210/354 [19:06<13:10,  5.49s/it][A

	loss_cls: tensor(0.7393, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4482, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1875, device='cuda:0', grad_fn=<AddBackward0>)



 60%|█████▉    | 211/354 [19:12<13:04,  5.48s/it][A

	loss_cls: tensor(0.9017, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2862, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1880, device='cuda:0', grad_fn=<AddBackward0>)



 60%|█████▉    | 212/354 [19:17<13:01,  5.50s/it][A

	loss_cls: tensor(0.6241, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2063, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8304, device='cuda:0', grad_fn=<AddBackward0>)



 60%|██████    | 213/354 [19:23<12:54,  5.49s/it][A

	loss_cls: tensor(0.7528, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4996, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2524, device='cuda:0', grad_fn=<AddBackward0>)



 60%|██████    | 214/354 [19:28<12:47,  5.48s/it][A

	loss_cls: tensor(0.5222, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2267, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7488, device='cuda:0', grad_fn=<AddBackward0>)



 61%|██████    | 215/354 [19:34<12:44,  5.50s/it][A

	loss_cls: tensor(0.6471, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0913, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7384, device='cuda:0', grad_fn=<AddBackward0>)



 61%|██████    | 216/354 [19:39<12:37,  5.49s/it][A

	loss_cls: tensor(0.4582, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3280, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7862, device='cuda:0', grad_fn=<AddBackward0>)



 61%|██████▏   | 217/354 [19:45<12:34,  5.51s/it][A

	loss_cls: tensor(0.5582, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1682, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7264, device='cuda:0', grad_fn=<AddBackward0>)



 62%|██████▏   | 218/354 [19:50<12:27,  5.50s/it][A

	loss_cls: tensor(0.5276, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2848, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8124, device='cuda:0', grad_fn=<AddBackward0>)



 62%|██████▏   | 219/354 [19:56<12:23,  5.51s/it][A

	loss_cls: tensor(0.4157, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3798, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7955, device='cuda:0', grad_fn=<AddBackward0>)



 62%|██████▏   | 220/354 [20:01<12:16,  5.49s/it][A

	loss_cls: tensor(0.6723, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4938, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1661, device='cuda:0', grad_fn=<AddBackward0>)



 62%|██████▏   | 221/354 [20:07<12:09,  5.48s/it][A

	loss_cls: tensor(0.5657, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1122, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6779, device='cuda:0', grad_fn=<AddBackward0>)



 63%|██████▎   | 222/354 [20:12<12:06,  5.50s/it][A

	loss_cls: tensor(0.6085, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2215, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8300, device='cuda:0', grad_fn=<AddBackward0>)



 63%|██████▎   | 223/354 [20:18<11:58,  5.48s/it][A

	loss_cls: tensor(0.7899, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1464, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9363, device='cuda:0', grad_fn=<AddBackward0>)



 63%|██████▎   | 224/354 [20:23<11:56,  5.51s/it][A

	loss_cls: tensor(0.4307, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2214, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6521, device='cuda:0', grad_fn=<AddBackward0>)



 64%|██████▎   | 225/354 [20:29<11:49,  5.50s/it][A

	loss_cls: tensor(0.7282, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1816, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9098, device='cuda:0', grad_fn=<AddBackward0>)



 64%|██████▍   | 226/354 [20:34<11:26,  5.36s/it][A

	loss_cls: tensor(0.7958, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2948, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0906, device='cuda:0', grad_fn=<AddBackward0>)



 64%|██████▍   | 227/354 [20:39<11:08,  5.26s/it][A

	loss_cls: tensor(0.5269, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3322, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8590, device='cuda:0', grad_fn=<AddBackward0>)



 64%|██████▍   | 228/354 [20:44<10:52,  5.18s/it][A

	loss_cls: tensor(0.4771, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4521, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9293, device='cuda:0', grad_fn=<AddBackward0>)



 65%|██████▍   | 229/354 [20:49<10:42,  5.14s/it][A

	loss_cls: tensor(0.5026, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2117, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7143, device='cuda:0', grad_fn=<AddBackward0>)



 65%|██████▍   | 230/354 [20:54<10:34,  5.12s/it][A

	loss_cls: tensor(0.9830, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1015, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0845, device='cuda:0', grad_fn=<AddBackward0>)



 65%|██████▌   | 231/354 [20:59<10:44,  5.24s/it][A

	loss_cls: tensor(0.6757, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1035, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7792, device='cuda:0', grad_fn=<AddBackward0>)



 66%|██████▌   | 232/354 [21:05<10:49,  5.32s/it][A

	loss_cls: tensor(0.6269, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2192, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8461, device='cuda:0', grad_fn=<AddBackward0>)



 66%|██████▌   | 233/354 [21:10<10:53,  5.40s/it][A

	loss_cls: tensor(0.4840, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1314, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6155, device='cuda:0', grad_fn=<AddBackward0>)



 66%|██████▌   | 234/354 [21:16<10:52,  5.44s/it][A

	loss_cls: tensor(0.6171, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0925, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7096, device='cuda:0', grad_fn=<AddBackward0>)



 66%|██████▋   | 235/354 [21:21<10:48,  5.45s/it][A

	loss_cls: tensor(0.5604, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3186, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8790, device='cuda:0', grad_fn=<AddBackward0>)



 67%|██████▋   | 236/354 [21:27<10:45,  5.47s/it][A

	loss_cls: tensor(0.8069, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3134, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1203, device='cuda:0', grad_fn=<AddBackward0>)



 67%|██████▋   | 237/354 [21:32<10:40,  5.48s/it][A

	loss_cls: tensor(0.4350, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2838, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7188, device='cuda:0', grad_fn=<AddBackward0>)



 67%|██████▋   | 238/354 [21:38<10:41,  5.53s/it][A

	loss_cls: tensor(0.8659, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2974, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1634, device='cuda:0', grad_fn=<AddBackward0>)



 68%|██████▊   | 239/354 [21:44<10:33,  5.51s/it][A

	loss_cls: tensor(0.6963, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1378, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8341, device='cuda:0', grad_fn=<AddBackward0>)



 68%|██████▊   | 240/354 [21:49<10:30,  5.53s/it][A

	loss_cls: tensor(0.5161, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2077, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7238, device='cuda:0', grad_fn=<AddBackward0>)



 68%|██████▊   | 241/354 [21:55<10:22,  5.51s/it][A

	loss_cls: tensor(0.4656, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1985, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6641, device='cuda:0', grad_fn=<AddBackward0>)



 68%|██████▊   | 242/354 [22:00<10:15,  5.49s/it][A

	loss_cls: tensor(0.5615, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0833, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6447, device='cuda:0', grad_fn=<AddBackward0>)



 69%|██████▊   | 243/354 [22:05<09:54,  5.36s/it][A

	loss_cls: tensor(0.4039, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2100, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6139, device='cuda:0', grad_fn=<AddBackward0>)



 69%|██████▉   | 244/354 [22:10<09:42,  5.29s/it][A

	loss_cls: tensor(0.4680, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1042, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5721, device='cuda:0', grad_fn=<AddBackward0>)



 69%|██████▉   | 245/354 [22:16<09:40,  5.32s/it][A

	loss_cls: tensor(0.5035, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1709, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6744, device='cuda:0', grad_fn=<AddBackward0>)



 69%|██████▉   | 246/354 [22:21<09:38,  5.36s/it][A

	loss_cls: tensor(0.7785, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3428, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1213, device='cuda:0', grad_fn=<AddBackward0>)



 70%|██████▉   | 247/354 [22:27<09:38,  5.40s/it][A

	loss_cls: tensor(0.4279, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1565, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5845, device='cuda:0', grad_fn=<AddBackward0>)



 70%|███████   | 248/354 [22:32<09:33,  5.41s/it][A

	loss_cls: tensor(0.5682, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2097, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7779, device='cuda:0', grad_fn=<AddBackward0>)



 70%|███████   | 249/354 [22:37<09:28,  5.42s/it][A

	loss_cls: tensor(0.4988, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2840, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7828, device='cuda:0', grad_fn=<AddBackward0>)



 71%|███████   | 250/354 [22:43<09:29,  5.47s/it][A

	loss_cls: tensor(1.1948, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4490, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.6438, device='cuda:0', grad_fn=<AddBackward0>)



 71%|███████   | 251/354 [22:48<09:23,  5.47s/it][A

	loss_cls: tensor(0.7049, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5803, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2852, device='cuda:0', grad_fn=<AddBackward0>)



 71%|███████   | 252/354 [22:54<09:20,  5.50s/it][A

	loss_cls: tensor(0.6942, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3618, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0559, device='cuda:0', grad_fn=<AddBackward0>)



 71%|███████▏  | 253/354 [22:59<09:12,  5.47s/it][A

	loss_cls: tensor(0.6470, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2936, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9405, device='cuda:0', grad_fn=<AddBackward0>)



 72%|███████▏  | 254/354 [23:05<09:05,  5.46s/it][A

	loss_cls: tensor(0.5777, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1481, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7257, device='cuda:0', grad_fn=<AddBackward0>)



 72%|███████▏  | 255/354 [23:10<08:58,  5.43s/it][A

	loss_cls: tensor(0.8388, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1192, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9580, device='cuda:0', grad_fn=<AddBackward0>)



 72%|███████▏  | 256/354 [23:16<08:50,  5.41s/it][A

	loss_cls: tensor(0.5528, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2648, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8176, device='cuda:0', grad_fn=<AddBackward0>)



 73%|███████▎  | 257/354 [23:21<08:46,  5.43s/it][A

	loss_cls: tensor(0.5173, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2588, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7762, device='cuda:0', grad_fn=<AddBackward0>)



 73%|███████▎  | 258/354 [23:26<08:39,  5.41s/it][A

	loss_cls: tensor(0.8252, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0740, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8992, device='cuda:0', grad_fn=<AddBackward0>)



 73%|███████▎  | 259/354 [23:32<08:35,  5.42s/it][A

	loss_cls: tensor(0.6912, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0630, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7542, device='cuda:0', grad_fn=<AddBackward0>)



 73%|███████▎  | 260/354 [23:37<08:28,  5.41s/it][A

	loss_cls: tensor(0.5872, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1445, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7317, device='cuda:0', grad_fn=<AddBackward0>)



 74%|███████▎  | 261/354 [23:43<08:23,  5.41s/it][A

	loss_cls: tensor(0.4605, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.6350, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0955, device='cuda:0', grad_fn=<AddBackward0>)



 74%|███████▍  | 262/354 [23:48<08:16,  5.39s/it][A

	loss_cls: tensor(0.6787, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0254, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7041, device='cuda:0', grad_fn=<AddBackward0>)



 74%|███████▍  | 263/354 [23:53<08:10,  5.39s/it][A

	loss_cls: tensor(0.7621, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4914, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2535, device='cuda:0', grad_fn=<AddBackward0>)



 75%|███████▍  | 264/354 [23:59<08:06,  5.41s/it][A

	loss_cls: tensor(0.6264, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3370, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9634, device='cuda:0', grad_fn=<AddBackward0>)



 75%|███████▍  | 265/354 [24:04<08:00,  5.40s/it][A

	loss_cls: tensor(0.7872, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2318, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0189, device='cuda:0', grad_fn=<AddBackward0>)



 75%|███████▌  | 266/354 [24:10<07:56,  5.41s/it][A

	loss_cls: tensor(0.7185, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1900, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9085, device='cuda:0', grad_fn=<AddBackward0>)



 75%|███████▌  | 267/354 [24:15<07:49,  5.40s/it][A

	loss_cls: tensor(0.5131, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1474, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6605, device='cuda:0', grad_fn=<AddBackward0>)



 76%|███████▌  | 268/354 [24:20<07:45,  5.41s/it][A

	loss_cls: tensor(0.6788, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3441, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0229, device='cuda:0', grad_fn=<AddBackward0>)



 76%|███████▌  | 269/354 [24:26<07:38,  5.39s/it][A

	loss_cls: tensor(0.5874, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2027, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7901, device='cuda:0', grad_fn=<AddBackward0>)



 76%|███████▋  | 270/354 [24:31<07:33,  5.40s/it][A

	loss_cls: tensor(0.3568, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1733, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5302, device='cuda:0', grad_fn=<AddBackward0>)



 77%|███████▋  | 271/354 [24:37<07:27,  5.40s/it][A

	loss_cls: tensor(0.6198, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3859, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0057, device='cuda:0', grad_fn=<AddBackward0>)



 77%|███████▋  | 272/354 [24:42<07:21,  5.39s/it][A

	loss_cls: tensor(0.7217, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3029, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0246, device='cuda:0', grad_fn=<AddBackward0>)



 77%|███████▋  | 273/354 [24:47<07:17,  5.40s/it][A

	loss_cls: tensor(0.5484, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0839, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6323, device='cuda:0', grad_fn=<AddBackward0>)



 77%|███████▋  | 274/354 [24:53<07:11,  5.39s/it][A

	loss_cls: tensor(0.8449, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1136, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9585, device='cuda:0', grad_fn=<AddBackward0>)



 78%|███████▊  | 275/354 [24:58<07:07,  5.41s/it][A

	loss_cls: tensor(0.5368, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3912, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9280, device='cuda:0', grad_fn=<AddBackward0>)



 78%|███████▊  | 276/354 [25:04<07:01,  5.40s/it][A

	loss_cls: tensor(0.6371, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1388, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7759, device='cuda:0', grad_fn=<AddBackward0>)



 78%|███████▊  | 277/354 [25:09<06:56,  5.41s/it][A

	loss_cls: tensor(0.6891, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1241, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8132, device='cuda:0', grad_fn=<AddBackward0>)



 79%|███████▊  | 278/354 [25:14<06:50,  5.40s/it][A

	loss_cls: tensor(0.6080, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4155, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0235, device='cuda:0', grad_fn=<AddBackward0>)



 79%|███████▉  | 279/354 [25:20<06:44,  5.39s/it][A

	loss_cls: tensor(0.7021, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4601, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1622, device='cuda:0', grad_fn=<AddBackward0>)



 79%|███████▉  | 280/354 [25:25<06:40,  5.41s/it][A

	loss_cls: tensor(0.6090, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2401, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8491, device='cuda:0', grad_fn=<AddBackward0>)



 79%|███████▉  | 281/354 [25:31<06:33,  5.39s/it][A

	loss_cls: tensor(0.4686, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0798, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5484, device='cuda:0', grad_fn=<AddBackward0>)



 80%|███████▉  | 282/354 [25:36<06:29,  5.41s/it][A

	loss_cls: tensor(0.5722, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0983, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6705, device='cuda:0', grad_fn=<AddBackward0>)



 80%|███████▉  | 283/354 [25:41<06:22,  5.39s/it][A

	loss_cls: tensor(0.7761, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2184, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9945, device='cuda:0', grad_fn=<AddBackward0>)



 80%|████████  | 284/354 [25:47<06:19,  5.42s/it][A

	loss_cls: tensor(0.4452, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0429, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4881, device='cuda:0', grad_fn=<AddBackward0>)



 81%|████████  | 285/354 [25:52<06:12,  5.40s/it][A

	loss_cls: tensor(1.0491, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2601, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.3092, device='cuda:0', grad_fn=<AddBackward0>)



 81%|████████  | 286/354 [25:58<06:05,  5.38s/it][A

	loss_cls: tensor(0.5047, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2644, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7691, device='cuda:0', grad_fn=<AddBackward0>)



 81%|████████  | 287/354 [26:03<06:01,  5.39s/it][A

	loss_cls: tensor(0.9509, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3067, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2576, device='cuda:0', grad_fn=<AddBackward0>)



 81%|████████▏ | 288/354 [26:08<05:55,  5.38s/it][A

	loss_cls: tensor(0.6825, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3266, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0091, device='cuda:0', grad_fn=<AddBackward0>)



 82%|████████▏ | 289/354 [26:14<05:50,  5.40s/it][A

	loss_cls: tensor(0.7880, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3747, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1627, device='cuda:0', grad_fn=<AddBackward0>)



 82%|████████▏ | 290/354 [26:19<05:44,  5.38s/it][A

	loss_cls: tensor(0.6304, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1820, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8123, device='cuda:0', grad_fn=<AddBackward0>)



 82%|████████▏ | 291/354 [26:25<05:40,  5.40s/it][A

	loss_cls: tensor(0.4022, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2486, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6509, device='cuda:0', grad_fn=<AddBackward0>)



 82%|████████▏ | 292/354 [26:30<05:34,  5.39s/it][A

	loss_cls: tensor(0.6796, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4123, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0920, device='cuda:0', grad_fn=<AddBackward0>)



 83%|████████▎ | 293/354 [26:35<05:27,  5.38s/it][A

	loss_cls: tensor(0.5847, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4436, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0283, device='cuda:0', grad_fn=<AddBackward0>)



 83%|████████▎ | 294/354 [26:41<05:23,  5.39s/it][A

	loss_cls: tensor(0.4162, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2637, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6798, device='cuda:0', grad_fn=<AddBackward0>)



 83%|████████▎ | 295/354 [26:46<05:17,  5.38s/it][A

	loss_cls: tensor(0.4843, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0804, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5646, device='cuda:0', grad_fn=<AddBackward0>)



 84%|████████▎ | 296/354 [26:52<05:13,  5.40s/it][A

	loss_cls: tensor(0.9591, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1172, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0763, device='cuda:0', grad_fn=<AddBackward0>)



 84%|████████▍ | 297/354 [26:57<05:07,  5.39s/it][A

	loss_cls: tensor(0.5709, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2709, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8418, device='cuda:0', grad_fn=<AddBackward0>)



 84%|████████▍ | 298/354 [27:02<05:02,  5.40s/it][A

	loss_cls: tensor(0.5058, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2572, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7629, device='cuda:0', grad_fn=<AddBackward0>)



 84%|████████▍ | 299/354 [27:08<04:56,  5.39s/it][A

	loss_cls: tensor(0.4710, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3409, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8119, device='cuda:0', grad_fn=<AddBackward0>)



 85%|████████▍ | 300/354 [27:13<04:50,  5.39s/it][A

	loss_cls: tensor(0.5864, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0533, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6397, device='cuda:0', grad_fn=<AddBackward0>)



 85%|████████▌ | 301/354 [27:18<04:45,  5.40s/it][A

	loss_cls: tensor(0.5475, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1596, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7071, device='cuda:0', grad_fn=<AddBackward0>)



 85%|████████▌ | 302/354 [27:24<04:39,  5.38s/it][A

	loss_cls: tensor(0.7439, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3246, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0685, device='cuda:0', grad_fn=<AddBackward0>)



 86%|████████▌ | 303/354 [27:29<04:34,  5.39s/it][A

	loss_cls: tensor(0.6590, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4197, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0788, device='cuda:0', grad_fn=<AddBackward0>)



 86%|████████▌ | 304/354 [27:35<04:29,  5.38s/it][A

	loss_cls: tensor(0.5505, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2181, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7686, device='cuda:0', grad_fn=<AddBackward0>)



 86%|████████▌ | 305/354 [27:40<04:24,  5.40s/it][A

	loss_cls: tensor(0.5352, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1161, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6513, device='cuda:0', grad_fn=<AddBackward0>)



 86%|████████▋ | 306/354 [27:45<04:18,  5.39s/it][A

	loss_cls: tensor(0.5885, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1824, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7709, device='cuda:0', grad_fn=<AddBackward0>)



 87%|████████▋ | 307/354 [27:51<04:12,  5.37s/it][A

	loss_cls: tensor(0.6626, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2491, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9117, device='cuda:0', grad_fn=<AddBackward0>)



 87%|████████▋ | 308/354 [27:56<04:08,  5.39s/it][A

	loss_cls: tensor(0.6098, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0850, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6948, device='cuda:0', grad_fn=<AddBackward0>)



 87%|████████▋ | 309/354 [28:02<04:02,  5.38s/it][A

	loss_cls: tensor(0.6549, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2187, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8736, device='cuda:0', grad_fn=<AddBackward0>)



 88%|████████▊ | 310/354 [28:07<03:57,  5.39s/it][A

	loss_cls: tensor(0.4230, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2286, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6516, device='cuda:0', grad_fn=<AddBackward0>)



 88%|████████▊ | 311/354 [28:12<03:51,  5.38s/it][A

	loss_cls: tensor(0.5359, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2737, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8096, device='cuda:0', grad_fn=<AddBackward0>)



 88%|████████▊ | 312/354 [28:18<03:46,  5.40s/it][A

	loss_cls: tensor(0.4371, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0540, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4912, device='cuda:0', grad_fn=<AddBackward0>)



 88%|████████▊ | 313/354 [28:23<03:40,  5.39s/it][A

	loss_cls: tensor(0.5008, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1166, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6174, device='cuda:0', grad_fn=<AddBackward0>)



 89%|████████▊ | 314/354 [28:28<03:35,  5.38s/it][A

	loss_cls: tensor(0.4731, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1119, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5850, device='cuda:0', grad_fn=<AddBackward0>)



 89%|████████▉ | 315/354 [28:34<03:30,  5.40s/it][A

	loss_cls: tensor(0.6549, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5759, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2308, device='cuda:0', grad_fn=<AddBackward0>)



 89%|████████▉ | 316/354 [28:39<03:24,  5.38s/it][A

	loss_cls: tensor(1.3192, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2560, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.5751, device='cuda:0', grad_fn=<AddBackward0>)



 90%|████████▉ | 317/354 [28:45<03:19,  5.40s/it][A

	loss_cls: tensor(0.8866, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1593, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0459, device='cuda:0', grad_fn=<AddBackward0>)



 90%|████████▉ | 318/354 [28:50<03:14,  5.40s/it][A

	loss_cls: tensor(0.4799, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2687, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7486, device='cuda:0', grad_fn=<AddBackward0>)



 90%|█████████ | 319/354 [28:56<03:09,  5.42s/it][A

	loss_cls: tensor(0.6165, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1064, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7229, device='cuda:0', grad_fn=<AddBackward0>)



 90%|█████████ | 320/354 [29:01<03:03,  5.40s/it][A

	loss_cls: tensor(0.6790, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1791, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8581, device='cuda:0', grad_fn=<AddBackward0>)



 91%|█████████ | 321/354 [29:06<02:57,  5.38s/it][A

	loss_cls: tensor(0.4212, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1817, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6029, device='cuda:0', grad_fn=<AddBackward0>)



 91%|█████████ | 322/354 [29:12<02:52,  5.40s/it][A

	loss_cls: tensor(0.6827, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2398, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9225, device='cuda:0', grad_fn=<AddBackward0>)



 91%|█████████ | 323/354 [29:17<02:46,  5.38s/it][A

	loss_cls: tensor(0.9265, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3547, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2812, device='cuda:0', grad_fn=<AddBackward0>)



 92%|█████████▏| 324/354 [29:22<02:42,  5.41s/it][A

	loss_cls: tensor(0.6560, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3271, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9831, device='cuda:0', grad_fn=<AddBackward0>)



 92%|█████████▏| 325/354 [29:28<02:36,  5.40s/it][A

	loss_cls: tensor(0.5951, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0872, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6823, device='cuda:0', grad_fn=<AddBackward0>)



 92%|█████████▏| 326/354 [29:33<02:31,  5.40s/it][A

	loss_cls: tensor(0.8111, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3331, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1442, device='cuda:0', grad_fn=<AddBackward0>)



 92%|█████████▏| 327/354 [29:39<02:25,  5.39s/it][A

	loss_cls: tensor(0.5397, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3464, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8861, device='cuda:0', grad_fn=<AddBackward0>)



 93%|█████████▎| 328/354 [29:44<02:19,  5.38s/it][A

	loss_cls: tensor(0.5060, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3420, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8480, device='cuda:0', grad_fn=<AddBackward0>)



 93%|█████████▎| 329/354 [29:49<02:14,  5.39s/it][A

	loss_cls: tensor(0.8396, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2522, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0919, device='cuda:0', grad_fn=<AddBackward0>)



 93%|█████████▎| 330/354 [29:55<02:09,  5.38s/it][A

	loss_cls: tensor(0.5662, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1752, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7414, device='cuda:0', grad_fn=<AddBackward0>)



 94%|█████████▎| 331/354 [30:00<02:04,  5.39s/it][A

	loss_cls: tensor(0.4354, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3419, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7773, device='cuda:0', grad_fn=<AddBackward0>)



 94%|█████████▍| 332/354 [30:06<01:58,  5.38s/it][A

	loss_cls: tensor(0.5546, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0899, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6445, device='cuda:0', grad_fn=<AddBackward0>)



 94%|█████████▍| 333/354 [30:11<01:53,  5.39s/it][A

	loss_cls: tensor(0.6880, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2004, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8884, device='cuda:0', grad_fn=<AddBackward0>)



 94%|█████████▍| 334/354 [30:16<01:47,  5.38s/it][A

	loss_cls: tensor(0.4912, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3194, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8106, device='cuda:0', grad_fn=<AddBackward0>)



 95%|█████████▍| 335/354 [30:22<01:41,  5.36s/it][A

	loss_cls: tensor(0.5651, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2430, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8081, device='cuda:0', grad_fn=<AddBackward0>)



 95%|█████████▍| 336/354 [30:27<01:36,  5.38s/it][A

	loss_cls: tensor(0.7138, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2290, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9428, device='cuda:0', grad_fn=<AddBackward0>)



 95%|█████████▌| 337/354 [30:32<01:31,  5.37s/it][A

	loss_cls: tensor(0.7251, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1745, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8996, device='cuda:0', grad_fn=<AddBackward0>)



 95%|█████████▌| 338/354 [30:38<01:26,  5.40s/it][A

	loss_cls: tensor(0.5529, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1398, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6927, device='cuda:0', grad_fn=<AddBackward0>)



 96%|█████████▌| 339/354 [30:43<01:20,  5.39s/it][A

	loss_cls: tensor(0.6233, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1812, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8045, device='cuda:0', grad_fn=<AddBackward0>)



 96%|█████████▌| 340/354 [30:49<01:15,  5.40s/it][A

	loss_cls: tensor(0.5377, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1422, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6799, device='cuda:0', grad_fn=<AddBackward0>)



 96%|█████████▋| 341/354 [30:54<01:09,  5.38s/it][A

	loss_cls: tensor(0.4741, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0272, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5013, device='cuda:0', grad_fn=<AddBackward0>)



 97%|█████████▋| 342/354 [30:59<01:04,  5.37s/it][A

	loss_cls: tensor(0.5227, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3708, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8935, device='cuda:0', grad_fn=<AddBackward0>)



 97%|█████████▋| 343/354 [31:05<00:59,  5.39s/it][A

	loss_cls: tensor(0.4170, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1151, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5321, device='cuda:0', grad_fn=<AddBackward0>)



 97%|█████████▋| 344/354 [31:10<00:53,  5.40s/it][A

	loss_cls: tensor(0.6680, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1119, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7800, device='cuda:0', grad_fn=<AddBackward0>)



 97%|█████████▋| 345/354 [31:16<00:48,  5.41s/it][A

	loss_cls: tensor(0.3287, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0385, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.3672, device='cuda:0', grad_fn=<AddBackward0>)



 98%|█████████▊| 346/354 [31:21<00:43,  5.40s/it][A

	loss_cls: tensor(0.7589, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2855, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0444, device='cuda:0', grad_fn=<AddBackward0>)



 98%|█████████▊| 347/354 [31:26<00:37,  5.41s/it][A

	loss_cls: tensor(0.3128, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0196, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.3324, device='cuda:0', grad_fn=<AddBackward0>)



 98%|█████████▊| 348/354 [31:32<00:32,  5.40s/it][A

	loss_cls: tensor(0.4607, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2165, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6772, device='cuda:0', grad_fn=<AddBackward0>)



 99%|█████████▊| 349/354 [31:37<00:26,  5.38s/it][A

	loss_cls: tensor(1.0220, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2676, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2896, device='cuda:0', grad_fn=<AddBackward0>)



 99%|█████████▉| 350/354 [31:43<00:21,  5.40s/it][A

	loss_cls: tensor(0.2217, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0203, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.2420, device='cuda:0', grad_fn=<AddBackward0>)



 99%|█████████▉| 351/354 [31:48<00:16,  5.40s/it][A

	loss_cls: tensor(0.7817, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2669, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0485, device='cuda:0', grad_fn=<AddBackward0>)



 99%|█████████▉| 352/354 [31:53<00:10,  5.41s/it][A

	loss_cls: tensor(1.0425, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4505, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.4930, device='cuda:0', grad_fn=<AddBackward0>)



100%|█████████▉| 353/354 [31:59<00:05,  5.39s/it][A

	loss_cls: tensor(0.5244, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1183, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6428, device='cuda:0', grad_fn=<AddBackward0>)



100%|██████████| 354/354 [32:01<00:00,  5.43s/it][A

  0%|          | 0/177 [00:00<?, ?it/s][A

	loss_cls: tensor(0.3953, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1394, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5347, device='cuda:0', grad_fn=<AddBackward0>)

	Training Loss: 0.8565129417989213

	Training cls acc: 0.6832627118644068

	Training cls prec: 0.5663101807169604

	Training cls rec: 0.6018906117423067

	Training cls f1: 0.5204856622679088

--
	Training ner acc: 0.9549518744662763

	Training ner prec: 0.25329581853890326

	Training ner rec: 0.26246935292260604

	Training ner f1: 0.25745458265289395

	Current Learning rate:  0.0008



  1%|          | 1/177 [00:00<01:53,  1.55it/s][A
  1%|          | 2/177 [00:01<02:00,  1.45it/s][A
  2%|▏         | 3/177 [00:02<02:02,  1.42it/s][A
  2%|▏         | 4/177 [00:02<01:57,  1.47it/s][A
  3%|▎         | 5/177 [00:03<01:59,  1.44it/s][A
  3%|▎         | 6/177 [00:04<02:00,  1.42it/s][A
  4%|▍         | 7/177 [00:04<02:00,  1.41it/s][A
  5%|▍         | 8/177 [00:05<01:55,  1.46it/s][A
  5%|▌         | 9/177 [00:06<01:57,  1.43it/s][A
  6%|▌         | 10/177 [00:06<01:57,  1.42it/s][A
  6%|▌         | 11/177 [00:07<01:57,  1.41it/s][A
  7%|▋         | 12/177 [00:08<01:53,  1.45it/s][A
  7%|▋         | 13/177 [00:09<01:54,  1.43it/s][A
  8%|▊         | 14/177 [00:09<01:55,  1.42it/s][A
  8%|▊         | 15/177 [00:10<01:50,  1.46it/s][A
  9%|▉         | 16/177 [00:11<01:51,  1.44it/s][A
 10%|▉         | 17/177 [00:11<01:52,  1.42it/s][A
 10%|█         | 18/177 [00:12<01:52,  1.41it/s][A
 11%|█         | 19/177 [00:13<01:48,  1.45it/s][A
 11%|█▏        | 20/


	Validation Loss: 0.8021858380844364

	Validation cls acc: 0.8681732580037665

	Validation cls prec: 0.6751244283023945

	Validation cls rec: 0.6763989776701641

	Validation cls f1: 0.6668262058092567

--
	Validation ner acc: 0.9532380402497234

	Validation ner prec: 0.4127392622934857

	Validation ner rec: 0.42354048964218455

	Validation ner f1: 0.417915029589961



  0%|          | 1/354 [00:05<32:31,  5.53s/it][A

	loss_cls: tensor(0.5525, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2502, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8027, device='cuda:0', grad_fn=<AddBackward0>)



  1%|          | 2/354 [00:10<32:11,  5.49s/it][A

	loss_cls: tensor(0.7304, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4128, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1432, device='cuda:0', grad_fn=<AddBackward0>)



  1%|          | 3/354 [00:16<31:56,  5.46s/it][A

	loss_cls: tensor(0.7001, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1663, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8664, device='cuda:0', grad_fn=<AddBackward0>)



  1%|          | 4/354 [00:21<31:59,  5.48s/it][A

	loss_cls: tensor(1.0172, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1974, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2147, device='cuda:0', grad_fn=<AddBackward0>)



  1%|▏         | 5/354 [00:27<31:42,  5.45s/it][A

	loss_cls: tensor(0.8489, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1905, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0394, device='cuda:0', grad_fn=<AddBackward0>)



  2%|▏         | 6/354 [00:32<31:45,  5.47s/it][A

	loss_cls: tensor(0.4660, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2596, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7256, device='cuda:0', grad_fn=<AddBackward0>)



  2%|▏         | 7/354 [00:38<31:34,  5.46s/it][A

	loss_cls: tensor(0.5119, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2367, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7486, device='cuda:0', grad_fn=<AddBackward0>)



  2%|▏         | 8/354 [00:43<31:33,  5.47s/it][A

	loss_cls: tensor(0.4922, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4009, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8931, device='cuda:0', grad_fn=<AddBackward0>)



  3%|▎         | 9/354 [00:49<31:23,  5.46s/it][A

	loss_cls: tensor(0.5954, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5141, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1095, device='cuda:0', grad_fn=<AddBackward0>)



  3%|▎         | 10/354 [00:54<31:12,  5.44s/it][A

	loss_cls: tensor(0.7107, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1297, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8404, device='cuda:0', grad_fn=<AddBackward0>)



  3%|▎         | 11/354 [01:00<31:15,  5.47s/it][A

	loss_cls: tensor(0.8920, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0644, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9564, device='cuda:0', grad_fn=<AddBackward0>)



  3%|▎         | 12/354 [01:05<31:02,  5.45s/it][A

	loss_cls: tensor(0.5353, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3293, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8646, device='cuda:0', grad_fn=<AddBackward0>)



  4%|▎         | 13/354 [01:11<31:13,  5.49s/it][A

	loss_cls: tensor(0.6757, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2933, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9690, device='cuda:0', grad_fn=<AddBackward0>)



  4%|▍         | 14/354 [01:16<31:35,  5.57s/it][A

	loss_cls: tensor(0.5952, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0519, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6470, device='cuda:0', grad_fn=<AddBackward0>)



  4%|▍         | 15/354 [01:22<31:30,  5.58s/it][A

	loss_cls: tensor(0.5355, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.6306, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1661, device='cuda:0', grad_fn=<AddBackward0>)



  5%|▍         | 16/354 [01:27<31:07,  5.53s/it][A

	loss_cls: tensor(0.6494, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1045, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7539, device='cuda:0', grad_fn=<AddBackward0>)



  5%|▍         | 17/354 [01:33<30:49,  5.49s/it][A

	loss_cls: tensor(0.7681, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1477, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9159, device='cuda:0', grad_fn=<AddBackward0>)



  5%|▌         | 18/354 [01:38<30:40,  5.48s/it][A

	loss_cls: tensor(0.6134, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2748, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8882, device='cuda:0', grad_fn=<AddBackward0>)



  5%|▌         | 19/354 [01:44<30:23,  5.44s/it][A

	loss_cls: tensor(0.6872, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3066, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9938, device='cuda:0', grad_fn=<AddBackward0>)



  6%|▌         | 20/354 [01:49<30:23,  5.46s/it][A

	loss_cls: tensor(0.4496, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0482, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4977, device='cuda:0', grad_fn=<AddBackward0>)



  6%|▌         | 21/354 [01:54<30:10,  5.44s/it][A

	loss_cls: tensor(0.4634, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2033, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6667, device='cuda:0', grad_fn=<AddBackward0>)



  6%|▌         | 22/354 [02:00<30:07,  5.44s/it][A

	loss_cls: tensor(0.7036, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1213, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8249, device='cuda:0', grad_fn=<AddBackward0>)



  6%|▋         | 23/354 [02:05<29:54,  5.42s/it][A

	loss_cls: tensor(0.9124, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1131, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0255, device='cuda:0', grad_fn=<AddBackward0>)



  7%|▋         | 24/354 [02:11<29:42,  5.40s/it][A

	loss_cls: tensor(0.9171, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.7449, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.6620, device='cuda:0', grad_fn=<AddBackward0>)



  7%|▋         | 25/354 [02:16<29:39,  5.41s/it][A

	loss_cls: tensor(0.5942, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2537, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8479, device='cuda:0', grad_fn=<AddBackward0>)



  7%|▋         | 26/354 [02:21<29:31,  5.40s/it][A

	loss_cls: tensor(0.7371, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3515, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0885, device='cuda:0', grad_fn=<AddBackward0>)



  8%|▊         | 27/354 [02:27<29:33,  5.42s/it][A

	loss_cls: tensor(0.9071, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2158, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1230, device='cuda:0', grad_fn=<AddBackward0>)



  8%|▊         | 28/354 [02:32<29:24,  5.41s/it][A

	loss_cls: tensor(0.5134, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1256, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6389, device='cuda:0', grad_fn=<AddBackward0>)



  8%|▊         | 29/354 [02:38<29:24,  5.43s/it][A

	loss_cls: tensor(0.6517, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1430, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7947, device='cuda:0', grad_fn=<AddBackward0>)



  8%|▊         | 30/354 [02:43<29:12,  5.41s/it][A

	loss_cls: tensor(0.6971, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2934, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9905, device='cuda:0', grad_fn=<AddBackward0>)



  9%|▉         | 31/354 [02:49<29:03,  5.40s/it][A

	loss_cls: tensor(0.5869, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2554, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8423, device='cuda:0', grad_fn=<AddBackward0>)



  9%|▉         | 32/354 [02:54<29:02,  5.41s/it][A

	loss_cls: tensor(0.7239, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2593, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9832, device='cuda:0', grad_fn=<AddBackward0>)



  9%|▉         | 33/354 [02:59<28:54,  5.40s/it][A

	loss_cls: tensor(0.5454, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2173, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7628, device='cuda:0', grad_fn=<AddBackward0>)



 10%|▉         | 34/354 [03:05<28:59,  5.44s/it][A

	loss_cls: tensor(0.5096, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1600, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6697, device='cuda:0', grad_fn=<AddBackward0>)



 10%|▉         | 35/354 [03:10<28:50,  5.42s/it][A

	loss_cls: tensor(0.5149, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2448, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7597, device='cuda:0', grad_fn=<AddBackward0>)



 10%|█         | 36/354 [03:16<28:50,  5.44s/it][A

	loss_cls: tensor(0.6098, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2634, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8732, device='cuda:0', grad_fn=<AddBackward0>)



 10%|█         | 37/354 [03:21<28:37,  5.42s/it][A

	loss_cls: tensor(0.5511, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4775, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0286, device='cuda:0', grad_fn=<AddBackward0>)



 11%|█         | 38/354 [03:26<28:27,  5.40s/it][A

	loss_cls: tensor(0.5347, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2066, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7413, device='cuda:0', grad_fn=<AddBackward0>)



 11%|█         | 39/354 [03:32<28:26,  5.42s/it][A

	loss_cls: tensor(0.5767, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3348, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9115, device='cuda:0', grad_fn=<AddBackward0>)



 11%|█▏        | 40/354 [03:37<28:18,  5.41s/it][A

	loss_cls: tensor(0.7127, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0516, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7643, device='cuda:0', grad_fn=<AddBackward0>)



 12%|█▏        | 41/354 [03:43<28:17,  5.42s/it][A

	loss_cls: tensor(0.5202, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3407, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8608, device='cuda:0', grad_fn=<AddBackward0>)



 12%|█▏        | 42/354 [03:48<28:06,  5.41s/it][A

	loss_cls: tensor(0.8422, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1367, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9789, device='cuda:0', grad_fn=<AddBackward0>)



 12%|█▏        | 43/354 [03:54<28:05,  5.42s/it][A

	loss_cls: tensor(0.6383, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1972, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8354, device='cuda:0', grad_fn=<AddBackward0>)



 12%|█▏        | 44/354 [03:59<27:54,  5.40s/it][A

	loss_cls: tensor(0.8442, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0608, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9049, device='cuda:0', grad_fn=<AddBackward0>)



 13%|█▎        | 45/354 [04:04<27:45,  5.39s/it][A

	loss_cls: tensor(0.5251, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2283, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7535, device='cuda:0', grad_fn=<AddBackward0>)



 13%|█▎        | 46/354 [04:10<27:47,  5.41s/it][A

	loss_cls: tensor(0.5305, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3897, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9202, device='cuda:0', grad_fn=<AddBackward0>)



 13%|█▎        | 47/354 [04:15<27:41,  5.41s/it][A

	loss_cls: tensor(0.6165, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2186, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8351, device='cuda:0', grad_fn=<AddBackward0>)



 14%|█▎        | 48/354 [04:21<27:39,  5.42s/it][A

	loss_cls: tensor(0.6683, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0364, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7047, device='cuda:0', grad_fn=<AddBackward0>)



 14%|█▍        | 49/354 [04:26<27:28,  5.41s/it][A

	loss_cls: tensor(0.8505, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2862, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1367, device='cuda:0', grad_fn=<AddBackward0>)



 14%|█▍        | 50/354 [04:31<27:25,  5.41s/it][A

	loss_cls: tensor(0.6427, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4666, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1094, device='cuda:0', grad_fn=<AddBackward0>)



 14%|█▍        | 51/354 [04:37<27:13,  5.39s/it][A

	loss_cls: tensor(0.5358, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1331, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6689, device='cuda:0', grad_fn=<AddBackward0>)



 15%|█▍        | 52/354 [04:42<27:05,  5.38s/it][A

	loss_cls: tensor(0.7438, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2293, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9731, device='cuda:0', grad_fn=<AddBackward0>)



 15%|█▍        | 53/354 [04:48<27:09,  5.41s/it][A

	loss_cls: tensor(0.7514, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5336, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2851, device='cuda:0', grad_fn=<AddBackward0>)



 15%|█▌        | 54/354 [04:53<26:59,  5.40s/it][A

	loss_cls: tensor(0.4857, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1650, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6507, device='cuda:0', grad_fn=<AddBackward0>)



 16%|█▌        | 55/354 [04:58<26:58,  5.41s/it][A

	loss_cls: tensor(0.5076, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1075, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6151, device='cuda:0', grad_fn=<AddBackward0>)



 16%|█▌        | 56/354 [05:04<26:47,  5.40s/it][A

	loss_cls: tensor(0.3985, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1628, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5613, device='cuda:0', grad_fn=<AddBackward0>)



 16%|█▌        | 57/354 [05:09<26:47,  5.41s/it][A

	loss_cls: tensor(0.4485, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1133, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5618, device='cuda:0', grad_fn=<AddBackward0>)



 16%|█▋        | 58/354 [05:15<26:35,  5.39s/it][A

	loss_cls: tensor(0.5154, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2239, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7393, device='cuda:0', grad_fn=<AddBackward0>)



 17%|█▋        | 59/354 [05:20<26:32,  5.40s/it][A

	loss_cls: tensor(0.3196, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2425, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5622, device='cuda:0', grad_fn=<AddBackward0>)



 17%|█▋        | 60/354 [05:25<26:26,  5.40s/it][A

	loss_cls: tensor(0.6965, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4021, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0986, device='cuda:0', grad_fn=<AddBackward0>)



 17%|█▋        | 61/354 [05:31<26:20,  5.39s/it][A

	loss_cls: tensor(0.3491, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0543, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4033, device='cuda:0', grad_fn=<AddBackward0>)



 18%|█▊        | 62/354 [05:36<26:21,  5.42s/it][A

	loss_cls: tensor(0.4409, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2408, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6817, device='cuda:0', grad_fn=<AddBackward0>)



 18%|█▊        | 63/354 [05:42<26:12,  5.40s/it][A

	loss_cls: tensor(0.7913, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2722, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0635, device='cuda:0', grad_fn=<AddBackward0>)



 18%|█▊        | 64/354 [05:47<26:10,  5.41s/it][A

	loss_cls: tensor(0.4524, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4301, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8826, device='cuda:0', grad_fn=<AddBackward0>)



 18%|█▊        | 65/354 [05:52<26:00,  5.40s/it][A

	loss_cls: tensor(0.4492, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1230, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5722, device='cuda:0', grad_fn=<AddBackward0>)



 19%|█▊        | 66/354 [05:58<25:59,  5.41s/it][A

	loss_cls: tensor(0.6759, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2716, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9475, device='cuda:0', grad_fn=<AddBackward0>)



 19%|█▉        | 67/354 [06:03<25:50,  5.40s/it][A

	loss_cls: tensor(0.5710, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1122, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6831, device='cuda:0', grad_fn=<AddBackward0>)



 19%|█▉        | 68/354 [06:09<25:42,  5.39s/it][A

	loss_cls: tensor(0.4301, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1972, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6274, device='cuda:0', grad_fn=<AddBackward0>)



 19%|█▉        | 69/354 [06:14<25:42,  5.41s/it][A

	loss_cls: tensor(0.9841, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3467, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.3309, device='cuda:0', grad_fn=<AddBackward0>)



 20%|█▉        | 70/354 [06:19<25:33,  5.40s/it][A

	loss_cls: tensor(0.7959, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2119, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0078, device='cuda:0', grad_fn=<AddBackward0>)



 20%|██        | 71/354 [06:25<25:34,  5.42s/it][A

	loss_cls: tensor(0.6861, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1297, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8158, device='cuda:0', grad_fn=<AddBackward0>)



 20%|██        | 72/354 [06:30<25:26,  5.41s/it][A

	loss_cls: tensor(0.8411, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1391, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9802, device='cuda:0', grad_fn=<AddBackward0>)



 21%|██        | 73/354 [06:36<25:23,  5.42s/it][A

	loss_cls: tensor(0.5754, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3290, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9044, device='cuda:0', grad_fn=<AddBackward0>)



 21%|██        | 74/354 [06:41<25:13,  5.41s/it][A

	loss_cls: tensor(0.7221, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3024, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0245, device='cuda:0', grad_fn=<AddBackward0>)



 21%|██        | 75/354 [06:46<25:03,  5.39s/it][A

	loss_cls: tensor(0.5100, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4303, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9403, device='cuda:0', grad_fn=<AddBackward0>)



 21%|██▏       | 76/354 [06:52<25:02,  5.41s/it][A

	loss_cls: tensor(0.5751, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3130, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8881, device='cuda:0', grad_fn=<AddBackward0>)



 22%|██▏       | 77/354 [06:57<24:53,  5.39s/it][A

	loss_cls: tensor(0.5707, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2735, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8442, device='cuda:0', grad_fn=<AddBackward0>)



 22%|██▏       | 78/354 [07:03<24:50,  5.40s/it][A

	loss_cls: tensor(0.3987, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2743, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6730, device='cuda:0', grad_fn=<AddBackward0>)



 22%|██▏       | 79/354 [07:08<24:44,  5.40s/it][A

	loss_cls: tensor(0.6119, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3854, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9972, device='cuda:0', grad_fn=<AddBackward0>)



 23%|██▎       | 80/354 [07:14<24:48,  5.43s/it][A

	loss_cls: tensor(0.6707, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2597, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9305, device='cuda:0', grad_fn=<AddBackward0>)



 23%|██▎       | 81/354 [07:19<24:39,  5.42s/it][A

	loss_cls: tensor(0.4874, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3421, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8295, device='cuda:0', grad_fn=<AddBackward0>)



 23%|██▎       | 82/354 [07:24<24:31,  5.41s/it][A

	loss_cls: tensor(0.4910, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1497, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6406, device='cuda:0', grad_fn=<AddBackward0>)



 23%|██▎       | 83/354 [07:30<24:30,  5.42s/it][A

	loss_cls: tensor(0.4766, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2542, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7307, device='cuda:0', grad_fn=<AddBackward0>)



 24%|██▎       | 84/354 [07:35<24:20,  5.41s/it][A

	loss_cls: tensor(0.7544, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1786, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9330, device='cuda:0', grad_fn=<AddBackward0>)



 24%|██▍       | 85/354 [07:41<24:22,  5.44s/it][A

	loss_cls: tensor(0.5979, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1401, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7380, device='cuda:0', grad_fn=<AddBackward0>)



 24%|██▍       | 86/354 [07:46<24:19,  5.45s/it][A

	loss_cls: tensor(0.3535, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2283, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5818, device='cuda:0', grad_fn=<AddBackward0>)



 25%|██▍       | 87/354 [07:52<24:24,  5.49s/it][A

	loss_cls: tensor(0.5819, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3690, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9509, device='cuda:0', grad_fn=<AddBackward0>)



 25%|██▍       | 88/354 [07:57<24:18,  5.48s/it][A

	loss_cls: tensor(0.7862, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1690, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9552, device='cuda:0', grad_fn=<AddBackward0>)



 25%|██▌       | 89/354 [08:03<24:10,  5.47s/it][A

	loss_cls: tensor(0.5720, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1216, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6936, device='cuda:0', grad_fn=<AddBackward0>)



 25%|██▌       | 90/354 [08:08<24:08,  5.49s/it][A

	loss_cls: tensor(0.5963, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0268, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6231, device='cuda:0', grad_fn=<AddBackward0>)



 26%|██▌       | 91/354 [08:14<23:57,  5.47s/it][A

	loss_cls: tensor(0.4876, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4349, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9225, device='cuda:0', grad_fn=<AddBackward0>)



 26%|██▌       | 92/354 [08:19<23:54,  5.48s/it][A

	loss_cls: tensor(0.3941, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0249, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4190, device='cuda:0', grad_fn=<AddBackward0>)



 26%|██▋       | 93/354 [08:25<23:45,  5.46s/it][A

	loss_cls: tensor(0.4900, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4171, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9070, device='cuda:0', grad_fn=<AddBackward0>)



 27%|██▋       | 94/354 [08:30<23:45,  5.48s/it][A

	loss_cls: tensor(0.2929, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0704, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.3632, device='cuda:0', grad_fn=<AddBackward0>)



 27%|██▋       | 95/354 [08:36<23:35,  5.46s/it][A

	loss_cls: tensor(1.4591, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3230, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.7822, device='cuda:0', grad_fn=<AddBackward0>)



 27%|██▋       | 96/354 [08:41<23:27,  5.45s/it][A

	loss_cls: tensor(0.6783, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3034, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9817, device='cuda:0', grad_fn=<AddBackward0>)



 27%|██▋       | 97/354 [08:46<23:27,  5.48s/it][A

	loss_cls: tensor(0.8347, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2288, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0635, device='cuda:0', grad_fn=<AddBackward0>)



 28%|██▊       | 98/354 [08:52<23:17,  5.46s/it][A

	loss_cls: tensor(0.9967, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4489, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.4456, device='cuda:0', grad_fn=<AddBackward0>)



 28%|██▊       | 99/354 [08:57<23:15,  5.47s/it][A

	loss_cls: tensor(0.4017, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2132, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6149, device='cuda:0', grad_fn=<AddBackward0>)



 28%|██▊       | 100/354 [09:03<23:07,  5.46s/it][A

	loss_cls: tensor(0.5517, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2051, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7568, device='cuda:0', grad_fn=<AddBackward0>)



 29%|██▊       | 101/354 [09:08<23:06,  5.48s/it][A

	loss_cls: tensor(0.4815, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3095, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7910, device='cuda:0', grad_fn=<AddBackward0>)



 29%|██▉       | 102/354 [09:14<22:56,  5.46s/it][A

	loss_cls: tensor(0.8639, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3367, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2006, device='cuda:0', grad_fn=<AddBackward0>)



 29%|██▉       | 103/354 [09:19<22:47,  5.45s/it][A

	loss_cls: tensor(0.6569, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4158, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0727, device='cuda:0', grad_fn=<AddBackward0>)



 29%|██▉       | 104/354 [09:25<22:45,  5.46s/it][A

	loss_cls: tensor(0.6010, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2013, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8024, device='cuda:0', grad_fn=<AddBackward0>)



 30%|██▉       | 105/354 [09:30<22:35,  5.44s/it][A

	loss_cls: tensor(0.7054, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2835, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9889, device='cuda:0', grad_fn=<AddBackward0>)



 30%|██▉       | 106/354 [09:36<22:35,  5.47s/it][A

	loss_cls: tensor(0.6007, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1022, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7029, device='cuda:0', grad_fn=<AddBackward0>)



 30%|███       | 107/354 [09:41<22:27,  5.45s/it][A

	loss_cls: tensor(0.6335, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3569, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9904, device='cuda:0', grad_fn=<AddBackward0>)



 31%|███       | 108/354 [09:47<22:27,  5.48s/it][A

	loss_cls: tensor(0.5571, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1225, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6796, device='cuda:0', grad_fn=<AddBackward0>)



 31%|███       | 109/354 [09:52<22:19,  5.47s/it][A

	loss_cls: tensor(0.8973, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1488, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0461, device='cuda:0', grad_fn=<AddBackward0>)



 31%|███       | 110/354 [09:57<22:09,  5.45s/it][A

	loss_cls: tensor(0.5970, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2899, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8869, device='cuda:0', grad_fn=<AddBackward0>)



 31%|███▏      | 111/354 [10:03<22:09,  5.47s/it][A

	loss_cls: tensor(0.6617, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2447, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9064, device='cuda:0', grad_fn=<AddBackward0>)



 32%|███▏      | 112/354 [10:08<21:59,  5.45s/it][A

	loss_cls: tensor(0.4953, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2566, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7519, device='cuda:0', grad_fn=<AddBackward0>)



 32%|███▏      | 113/354 [10:14<21:59,  5.48s/it][A

	loss_cls: tensor(0.6672, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3009, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9681, device='cuda:0', grad_fn=<AddBackward0>)



 32%|███▏      | 114/354 [10:19<21:53,  5.47s/it][A

	loss_cls: tensor(0.4761, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1468, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6229, device='cuda:0', grad_fn=<AddBackward0>)



 32%|███▏      | 115/354 [10:25<21:52,  5.49s/it][A

	loss_cls: tensor(0.8148, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0290, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8438, device='cuda:0', grad_fn=<AddBackward0>)



 33%|███▎      | 116/354 [10:30<21:43,  5.48s/it][A

	loss_cls: tensor(0.5225, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2665, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7889, device='cuda:0', grad_fn=<AddBackward0>)



 33%|███▎      | 117/354 [10:36<21:33,  5.46s/it][A

	loss_cls: tensor(0.5245, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2741, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7986, device='cuda:0', grad_fn=<AddBackward0>)



 33%|███▎      | 118/354 [10:41<21:33,  5.48s/it][A

	loss_cls: tensor(0.4709, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1110, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5819, device='cuda:0', grad_fn=<AddBackward0>)



 34%|███▎      | 119/354 [10:47<21:23,  5.46s/it][A

	loss_cls: tensor(0.6452, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1563, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8014, device='cuda:0', grad_fn=<AddBackward0>)



 34%|███▍      | 120/354 [10:52<21:23,  5.48s/it][A

	loss_cls: tensor(0.6869, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1945, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8814, device='cuda:0', grad_fn=<AddBackward0>)



 34%|███▍      | 121/354 [10:58<21:14,  5.47s/it][A

	loss_cls: tensor(0.6043, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3247, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9290, device='cuda:0', grad_fn=<AddBackward0>)



 34%|███▍      | 122/354 [11:03<21:12,  5.48s/it][A

	loss_cls: tensor(0.6033, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2929, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8962, device='cuda:0', grad_fn=<AddBackward0>)



 35%|███▍      | 123/354 [11:09<21:01,  5.46s/it][A

	loss_cls: tensor(0.5352, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4444, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9796, device='cuda:0', grad_fn=<AddBackward0>)



 35%|███▌      | 124/354 [11:14<20:51,  5.44s/it][A

	loss_cls: tensor(0.5470, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1854, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7325, device='cuda:0', grad_fn=<AddBackward0>)



 35%|███▌      | 125/354 [11:19<20:50,  5.46s/it][A

	loss_cls: tensor(0.4963, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1689, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6652, device='cuda:0', grad_fn=<AddBackward0>)



 36%|███▌      | 126/354 [11:25<20:43,  5.46s/it][A

	loss_cls: tensor(0.6251, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1150, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7401, device='cuda:0', grad_fn=<AddBackward0>)



 36%|███▌      | 127/354 [11:30<20:45,  5.48s/it][A

	loss_cls: tensor(0.4140, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1737, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5877, device='cuda:0', grad_fn=<AddBackward0>)



 36%|███▌      | 128/354 [11:36<20:36,  5.47s/it][A

	loss_cls: tensor(0.8894, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3362, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2256, device='cuda:0', grad_fn=<AddBackward0>)



 36%|███▋      | 129/354 [11:41<20:33,  5.48s/it][A

	loss_cls: tensor(0.5274, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1409, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6683, device='cuda:0', grad_fn=<AddBackward0>)



 37%|███▋      | 130/354 [11:47<20:26,  5.48s/it][A

	loss_cls: tensor(0.3957, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1641, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5599, device='cuda:0', grad_fn=<AddBackward0>)



 37%|███▋      | 131/354 [11:52<20:12,  5.44s/it][A

	loss_cls: tensor(0.4444, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0556, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5000, device='cuda:0', grad_fn=<AddBackward0>)



 37%|███▋      | 132/354 [11:58<20:06,  5.44s/it][A

	loss_cls: tensor(0.7364, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1148, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8512, device='cuda:0', grad_fn=<AddBackward0>)



 38%|███▊      | 133/354 [12:03<19:56,  5.41s/it][A

	loss_cls: tensor(0.5124, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2838, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7962, device='cuda:0', grad_fn=<AddBackward0>)



 38%|███▊      | 134/354 [12:09<19:54,  5.43s/it][A

	loss_cls: tensor(0.7902, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1832, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9734, device='cuda:0', grad_fn=<AddBackward0>)



 38%|███▊      | 135/354 [12:14<19:45,  5.42s/it][A

	loss_cls: tensor(0.6761, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4891, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1651, device='cuda:0', grad_fn=<AddBackward0>)



 38%|███▊      | 136/354 [12:19<19:43,  5.43s/it][A

	loss_cls: tensor(0.4769, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1585, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6354, device='cuda:0', grad_fn=<AddBackward0>)



 39%|███▊      | 137/354 [12:25<19:32,  5.40s/it][A

	loss_cls: tensor(0.6786, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1281, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8066, device='cuda:0', grad_fn=<AddBackward0>)



 39%|███▉      | 138/354 [12:30<19:22,  5.38s/it][A

	loss_cls: tensor(0.3732, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2510, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6242, device='cuda:0', grad_fn=<AddBackward0>)



 39%|███▉      | 139/354 [12:35<19:21,  5.40s/it][A

	loss_cls: tensor(0.4757, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1489, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6246, device='cuda:0', grad_fn=<AddBackward0>)



 40%|███▉      | 140/354 [12:41<19:14,  5.39s/it][A

	loss_cls: tensor(0.7148, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2231, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9379, device='cuda:0', grad_fn=<AddBackward0>)



 40%|███▉      | 141/354 [12:46<19:11,  5.41s/it][A

	loss_cls: tensor(0.4917, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0438, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5355, device='cuda:0', grad_fn=<AddBackward0>)



 40%|████      | 142/354 [12:52<19:03,  5.39s/it][A

	loss_cls: tensor(0.5120, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2610, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7731, device='cuda:0', grad_fn=<AddBackward0>)



 40%|████      | 143/354 [12:57<19:01,  5.41s/it][A

	loss_cls: tensor(0.6136, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4132, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0267, device='cuda:0', grad_fn=<AddBackward0>)



 41%|████      | 144/354 [13:02<18:52,  5.39s/it][A

	loss_cls: tensor(0.4464, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2404, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6868, device='cuda:0', grad_fn=<AddBackward0>)



 41%|████      | 145/354 [13:08<18:43,  5.38s/it][A

	loss_cls: tensor(0.7021, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3263, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0283, device='cuda:0', grad_fn=<AddBackward0>)



 41%|████      | 146/354 [13:13<18:44,  5.40s/it][A

	loss_cls: tensor(0.5929, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2381, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8310, device='cuda:0', grad_fn=<AddBackward0>)



 42%|████▏     | 147/354 [13:19<18:37,  5.40s/it][A

	loss_cls: tensor(0.4137, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4089, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8226, device='cuda:0', grad_fn=<AddBackward0>)



 42%|████▏     | 148/354 [13:24<18:34,  5.41s/it][A

	loss_cls: tensor(0.5602, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1820, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7422, device='cuda:0', grad_fn=<AddBackward0>)



 42%|████▏     | 149/354 [13:29<18:26,  5.40s/it][A

	loss_cls: tensor(0.5768, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2905, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8673, device='cuda:0', grad_fn=<AddBackward0>)



 42%|████▏     | 150/354 [13:35<18:23,  5.41s/it][A

	loss_cls: tensor(0.8328, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4900, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.3228, device='cuda:0', grad_fn=<AddBackward0>)



 43%|████▎     | 151/354 [13:40<18:14,  5.39s/it][A

	loss_cls: tensor(0.4537, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1249, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5786, device='cuda:0', grad_fn=<AddBackward0>)



 43%|████▎     | 152/354 [13:46<18:07,  5.38s/it][A

	loss_cls: tensor(0.5347, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3298, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8645, device='cuda:0', grad_fn=<AddBackward0>)



 43%|████▎     | 153/354 [13:51<18:06,  5.41s/it][A

	loss_cls: tensor(0.4923, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2695, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7618, device='cuda:0', grad_fn=<AddBackward0>)



 44%|████▎     | 154/354 [13:56<18:00,  5.40s/it][A

	loss_cls: tensor(0.6342, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2647, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8990, device='cuda:0', grad_fn=<AddBackward0>)



 44%|████▍     | 155/354 [14:02<17:56,  5.41s/it][A

	loss_cls: tensor(0.6089, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2511, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8600, device='cuda:0', grad_fn=<AddBackward0>)



 44%|████▍     | 156/354 [14:07<17:48,  5.40s/it][A

	loss_cls: tensor(0.6311, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1300, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7611, device='cuda:0', grad_fn=<AddBackward0>)



 44%|████▍     | 157/354 [14:13<17:46,  5.41s/it][A

	loss_cls: tensor(0.5166, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1727, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6893, device='cuda:0', grad_fn=<AddBackward0>)



 45%|████▍     | 158/354 [14:18<17:36,  5.39s/it][A

	loss_cls: tensor(0.9161, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4069, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.3229, device='cuda:0', grad_fn=<AddBackward0>)



 45%|████▍     | 159/354 [14:23<17:32,  5.40s/it][A

	loss_cls: tensor(0.5427, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0755, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6182, device='cuda:0', grad_fn=<AddBackward0>)



 45%|████▌     | 160/354 [14:29<17:26,  5.39s/it][A

	loss_cls: tensor(0.5268, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2059, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7327, device='cuda:0', grad_fn=<AddBackward0>)



 45%|████▌     | 161/354 [14:34<17:20,  5.39s/it][A

	loss_cls: tensor(0.5352, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.6025, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1377, device='cuda:0', grad_fn=<AddBackward0>)



 46%|████▌     | 162/354 [14:40<17:19,  5.41s/it][A

	loss_cls: tensor(0.3822, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0987, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4810, device='cuda:0', grad_fn=<AddBackward0>)



 46%|████▌     | 163/354 [14:45<17:10,  5.40s/it][A

	loss_cls: tensor(0.4973, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2027, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7000, device='cuda:0', grad_fn=<AddBackward0>)



 46%|████▋     | 164/354 [14:50<17:06,  5.40s/it][A

	loss_cls: tensor(0.5423, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1213, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6637, device='cuda:0', grad_fn=<AddBackward0>)



 47%|████▋     | 165/354 [14:56<16:58,  5.39s/it][A

	loss_cls: tensor(0.6366, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1457, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7823, device='cuda:0', grad_fn=<AddBackward0>)



 47%|████▋     | 166/354 [15:01<16:55,  5.40s/it][A

	loss_cls: tensor(0.8860, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2007, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0867, device='cuda:0', grad_fn=<AddBackward0>)



 47%|████▋     | 167/354 [15:07<16:49,  5.40s/it][A

	loss_cls: tensor(0.2981, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1119, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4100, device='cuda:0', grad_fn=<AddBackward0>)



 47%|████▋     | 168/354 [15:12<16:42,  5.39s/it][A

	loss_cls: tensor(1.0927, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1893, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2820, device='cuda:0', grad_fn=<AddBackward0>)



 48%|████▊     | 169/354 [15:17<16:39,  5.40s/it][A

	loss_cls: tensor(0.6093, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1644, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7737, device='cuda:0', grad_fn=<AddBackward0>)



 48%|████▊     | 170/354 [15:23<16:30,  5.38s/it][A

	loss_cls: tensor(0.6682, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3699, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0381, device='cuda:0', grad_fn=<AddBackward0>)



 48%|████▊     | 171/354 [15:28<16:27,  5.40s/it][A

	loss_cls: tensor(0.5139, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0749, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5888, device='cuda:0', grad_fn=<AddBackward0>)



 49%|████▊     | 172/354 [15:34<16:19,  5.38s/it][A

	loss_cls: tensor(0.5084, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2641, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7725, device='cuda:0', grad_fn=<AddBackward0>)



 49%|████▉     | 173/354 [15:39<16:18,  5.41s/it][A

	loss_cls: tensor(0.4905, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1803, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6708, device='cuda:0', grad_fn=<AddBackward0>)



 49%|████▉     | 174/354 [15:44<16:12,  5.40s/it][A

	loss_cls: tensor(0.5614, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2242, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7856, device='cuda:0', grad_fn=<AddBackward0>)



 49%|████▉     | 175/354 [15:50<16:03,  5.38s/it][A

	loss_cls: tensor(0.7194, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0502, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7697, device='cuda:0', grad_fn=<AddBackward0>)



 50%|████▉     | 176/354 [15:55<16:02,  5.41s/it][A

	loss_cls: tensor(0.6930, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3891, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0821, device='cuda:0', grad_fn=<AddBackward0>)



 50%|█████     | 177/354 [16:01<15:55,  5.40s/it][A

	loss_cls: tensor(0.6507, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0874, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7381, device='cuda:0', grad_fn=<AddBackward0>)



 50%|█████     | 178/354 [16:06<15:52,  5.41s/it][A

	loss_cls: tensor(0.4738, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2787, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7525, device='cuda:0', grad_fn=<AddBackward0>)



 51%|█████     | 179/354 [16:11<15:45,  5.40s/it][A

	loss_cls: tensor(0.5104, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2438, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7541, device='cuda:0', grad_fn=<AddBackward0>)



 51%|█████     | 180/354 [16:17<15:43,  5.42s/it][A

	loss_cls: tensor(0.6058, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1682, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7740, device='cuda:0', grad_fn=<AddBackward0>)



 51%|█████     | 181/354 [16:22<15:36,  5.41s/it][A

	loss_cls: tensor(0.3221, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2284, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5505, device='cuda:0', grad_fn=<AddBackward0>)



 51%|█████▏    | 182/354 [16:28<15:28,  5.40s/it][A

	loss_cls: tensor(1.1545, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1967, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.3512, device='cuda:0', grad_fn=<AddBackward0>)



 52%|█████▏    | 183/354 [16:33<15:26,  5.42s/it][A

	loss_cls: tensor(0.5075, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4528, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9603, device='cuda:0', grad_fn=<AddBackward0>)



 52%|█████▏    | 184/354 [16:38<15:17,  5.40s/it][A

	loss_cls: tensor(0.6221, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1308, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7528, device='cuda:0', grad_fn=<AddBackward0>)



 52%|█████▏    | 185/354 [16:44<15:15,  5.42s/it][A

	loss_cls: tensor(0.5103, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1697, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6800, device='cuda:0', grad_fn=<AddBackward0>)



 53%|█████▎    | 186/354 [16:49<15:06,  5.40s/it][A

	loss_cls: tensor(0.6751, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2992, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9743, device='cuda:0', grad_fn=<AddBackward0>)



 53%|█████▎    | 187/354 [16:55<15:03,  5.41s/it][A

	loss_cls: tensor(0.5687, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3497, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9184, device='cuda:0', grad_fn=<AddBackward0>)



 53%|█████▎    | 188/354 [17:00<14:56,  5.40s/it][A

	loss_cls: tensor(0.6235, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1331, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7566, device='cuda:0', grad_fn=<AddBackward0>)



 53%|█████▎    | 189/354 [17:05<14:49,  5.39s/it][A

	loss_cls: tensor(0.5909, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4132, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0041, device='cuda:0', grad_fn=<AddBackward0>)



 54%|█████▎    | 190/354 [17:11<14:46,  5.40s/it][A

	loss_cls: tensor(0.4706, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3004, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7709, device='cuda:0', grad_fn=<AddBackward0>)



 54%|█████▍    | 191/354 [17:16<14:37,  5.38s/it][A

	loss_cls: tensor(0.3107, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5807, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8914, device='cuda:0', grad_fn=<AddBackward0>)



 54%|█████▍    | 192/354 [17:22<14:35,  5.40s/it][A

	loss_cls: tensor(0.7201, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3207, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0408, device='cuda:0', grad_fn=<AddBackward0>)



 55%|█████▍    | 193/354 [17:27<14:29,  5.40s/it][A

	loss_cls: tensor(0.6243, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2100, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8343, device='cuda:0', grad_fn=<AddBackward0>)



 55%|█████▍    | 194/354 [17:33<14:26,  5.42s/it][A

	loss_cls: tensor(0.3908, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3558, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7466, device='cuda:0', grad_fn=<AddBackward0>)



 55%|█████▌    | 195/354 [17:38<14:20,  5.41s/it][A

	loss_cls: tensor(0.4891, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3430, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8321, device='cuda:0', grad_fn=<AddBackward0>)



 55%|█████▌    | 196/354 [17:43<14:12,  5.40s/it][A

	loss_cls: tensor(0.5416, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4713, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0129, device='cuda:0', grad_fn=<AddBackward0>)



 56%|█████▌    | 197/354 [17:49<14:09,  5.41s/it][A

	loss_cls: tensor(0.7452, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1142, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8594, device='cuda:0', grad_fn=<AddBackward0>)



 56%|█████▌    | 198/354 [17:54<14:00,  5.39s/it][A

	loss_cls: tensor(0.6584, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3754, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0339, device='cuda:0', grad_fn=<AddBackward0>)



 56%|█████▌    | 199/354 [17:59<13:57,  5.40s/it][A

	loss_cls: tensor(0.6669, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2064, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8733, device='cuda:0', grad_fn=<AddBackward0>)



 56%|█████▋    | 200/354 [18:05<13:50,  5.39s/it][A

	loss_cls: tensor(0.6849, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2378, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9227, device='cuda:0', grad_fn=<AddBackward0>)



 57%|█████▋    | 201/354 [18:10<13:48,  5.41s/it][A

	loss_cls: tensor(0.4385, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2374, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6759, device='cuda:0', grad_fn=<AddBackward0>)



 57%|█████▋    | 202/354 [18:16<13:40,  5.40s/it][A

	loss_cls: tensor(0.5923, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2742, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8665, device='cuda:0', grad_fn=<AddBackward0>)



 57%|█████▋    | 203/354 [18:21<13:34,  5.39s/it][A

	loss_cls: tensor(0.4611, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3613, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8225, device='cuda:0', grad_fn=<AddBackward0>)



 58%|█████▊    | 204/354 [18:26<13:30,  5.41s/it][A

	loss_cls: tensor(0.6513, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1460, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7974, device='cuda:0', grad_fn=<AddBackward0>)



 58%|█████▊    | 205/354 [18:32<13:22,  5.39s/it][A

	loss_cls: tensor(0.4332, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3132, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7464, device='cuda:0', grad_fn=<AddBackward0>)



 58%|█████▊    | 206/354 [18:37<13:20,  5.41s/it][A

	loss_cls: tensor(0.5152, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2064, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7217, device='cuda:0', grad_fn=<AddBackward0>)



 58%|█████▊    | 207/354 [18:43<13:13,  5.40s/it][A

	loss_cls: tensor(0.5669, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3020, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8689, device='cuda:0', grad_fn=<AddBackward0>)



 59%|█████▉    | 208/354 [18:48<13:10,  5.42s/it][A

	loss_cls: tensor(0.5732, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0305, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6037, device='cuda:0', grad_fn=<AddBackward0>)



 59%|█████▉    | 209/354 [18:54<13:03,  5.40s/it][A

	loss_cls: tensor(0.6362, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2166, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8529, device='cuda:0', grad_fn=<AddBackward0>)



 59%|█████▉    | 210/354 [18:59<12:55,  5.38s/it][A

	loss_cls: tensor(0.5000, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1598, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6598, device='cuda:0', grad_fn=<AddBackward0>)



 60%|█████▉    | 211/354 [19:04<12:51,  5.40s/it][A

	loss_cls: tensor(0.6239, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3683, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9922, device='cuda:0', grad_fn=<AddBackward0>)



 60%|█████▉    | 212/354 [19:10<12:44,  5.39s/it][A

	loss_cls: tensor(0.8369, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1510, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9879, device='cuda:0', grad_fn=<AddBackward0>)



 60%|██████    | 213/354 [19:15<12:41,  5.40s/it][A

	loss_cls: tensor(0.5702, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4230, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9933, device='cuda:0', grad_fn=<AddBackward0>)



 60%|██████    | 214/354 [19:20<12:35,  5.39s/it][A

	loss_cls: tensor(0.4633, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3458, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8091, device='cuda:0', grad_fn=<AddBackward0>)



 61%|██████    | 215/354 [19:26<12:32,  5.41s/it][A

	loss_cls: tensor(0.4831, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1814, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6644, device='cuda:0', grad_fn=<AddBackward0>)



 61%|██████    | 216/354 [19:31<12:25,  5.40s/it][A

	loss_cls: tensor(0.6031, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1657, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7688, device='cuda:0', grad_fn=<AddBackward0>)



 61%|██████▏   | 217/354 [19:37<12:17,  5.38s/it][A

	loss_cls: tensor(0.5218, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1977, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7195, device='cuda:0', grad_fn=<AddBackward0>)



 62%|██████▏   | 218/354 [19:42<12:14,  5.40s/it][A

	loss_cls: tensor(0.3703, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4308, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8011, device='cuda:0', grad_fn=<AddBackward0>)



 62%|██████▏   | 219/354 [19:47<12:06,  5.38s/it][A

	loss_cls: tensor(0.6609, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2386, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8994, device='cuda:0', grad_fn=<AddBackward0>)



 62%|██████▏   | 220/354 [19:53<12:04,  5.40s/it][A

	loss_cls: tensor(0.7894, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2927, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0821, device='cuda:0', grad_fn=<AddBackward0>)



 62%|██████▏   | 221/354 [19:58<11:56,  5.39s/it][A

	loss_cls: tensor(0.7547, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1353, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8900, device='cuda:0', grad_fn=<AddBackward0>)



 63%|██████▎   | 222/354 [20:04<11:52,  5.40s/it][A

	loss_cls: tensor(0.7994, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.6406, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.4401, device='cuda:0', grad_fn=<AddBackward0>)



 63%|██████▎   | 223/354 [20:09<11:44,  5.38s/it][A

	loss_cls: tensor(0.6017, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3375, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9393, device='cuda:0', grad_fn=<AddBackward0>)



 63%|██████▎   | 224/354 [20:14<11:38,  5.37s/it][A

	loss_cls: tensor(0.5891, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1113, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7003, device='cuda:0', grad_fn=<AddBackward0>)



 64%|██████▎   | 225/354 [20:20<11:34,  5.38s/it][A

	loss_cls: tensor(0.6264, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2104, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8368, device='cuda:0', grad_fn=<AddBackward0>)



 64%|██████▍   | 226/354 [20:25<11:28,  5.38s/it][A

	loss_cls: tensor(0.3816, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1559, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5375, device='cuda:0', grad_fn=<AddBackward0>)



 64%|██████▍   | 227/354 [20:31<11:25,  5.40s/it][A

	loss_cls: tensor(0.6026, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3942, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9968, device='cuda:0', grad_fn=<AddBackward0>)



 64%|██████▍   | 228/354 [20:36<11:18,  5.39s/it][A

	loss_cls: tensor(0.6781, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1823, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8604, device='cuda:0', grad_fn=<AddBackward0>)



 65%|██████▍   | 229/354 [20:41<11:15,  5.40s/it][A

	loss_cls: tensor(0.6065, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2053, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8118, device='cuda:0', grad_fn=<AddBackward0>)



 65%|██████▍   | 230/354 [20:47<11:08,  5.39s/it][A

	loss_cls: tensor(0.7591, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3074, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0665, device='cuda:0', grad_fn=<AddBackward0>)



 65%|██████▌   | 231/354 [20:52<11:01,  5.37s/it][A

	loss_cls: tensor(0.5411, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2480, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7891, device='cuda:0', grad_fn=<AddBackward0>)



 66%|██████▌   | 232/354 [20:57<10:57,  5.39s/it][A

	loss_cls: tensor(0.4131, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5592, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9724, device='cuda:0', grad_fn=<AddBackward0>)



 66%|██████▌   | 233/354 [21:03<10:50,  5.38s/it][A

	loss_cls: tensor(0.5840, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1841, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7681, device='cuda:0', grad_fn=<AddBackward0>)



 66%|██████▌   | 234/354 [21:08<10:48,  5.40s/it][A

	loss_cls: tensor(0.7179, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1145, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8324, device='cuda:0', grad_fn=<AddBackward0>)



 66%|██████▋   | 235/354 [21:14<10:41,  5.39s/it][A

	loss_cls: tensor(0.2971, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2035, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5006, device='cuda:0', grad_fn=<AddBackward0>)



 67%|██████▋   | 236/354 [21:19<10:37,  5.40s/it][A

	loss_cls: tensor(0.7093, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2019, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9112, device='cuda:0', grad_fn=<AddBackward0>)



 67%|██████▋   | 237/354 [21:24<10:31,  5.39s/it][A

	loss_cls: tensor(0.5452, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4376, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9828, device='cuda:0', grad_fn=<AddBackward0>)



 67%|██████▋   | 238/354 [21:30<10:24,  5.38s/it][A

	loss_cls: tensor(0.6986, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4902, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1887, device='cuda:0', grad_fn=<AddBackward0>)



 68%|██████▊   | 239/354 [21:35<10:20,  5.40s/it][A

	loss_cls: tensor(0.6474, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0963, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7437, device='cuda:0', grad_fn=<AddBackward0>)



 68%|██████▊   | 240/354 [21:41<10:15,  5.39s/it][A

	loss_cls: tensor(0.7402, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2056, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9459, device='cuda:0', grad_fn=<AddBackward0>)



 68%|██████▊   | 241/354 [21:46<10:12,  5.42s/it][A

	loss_cls: tensor(0.5944, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0934, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6878, device='cuda:0', grad_fn=<AddBackward0>)



 68%|██████▊   | 242/354 [21:51<10:05,  5.41s/it][A

	loss_cls: tensor(1.6106, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4451, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(2.0557, device='cuda:0', grad_fn=<AddBackward0>)



 69%|██████▊   | 243/354 [21:57<10:02,  5.42s/it][A

	loss_cls: tensor(0.7275, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1601, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8875, device='cuda:0', grad_fn=<AddBackward0>)



 69%|██████▉   | 244/354 [22:02<09:54,  5.40s/it][A

	loss_cls: tensor(0.3464, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1086, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4550, device='cuda:0', grad_fn=<AddBackward0>)



 69%|██████▉   | 245/354 [22:08<09:47,  5.39s/it][A

	loss_cls: tensor(0.6598, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1091, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7689, device='cuda:0', grad_fn=<AddBackward0>)



 69%|██████▉   | 246/354 [22:13<09:43,  5.40s/it][A

	loss_cls: tensor(0.7719, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5456, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.3175, device='cuda:0', grad_fn=<AddBackward0>)



 70%|██████▉   | 247/354 [22:18<09:37,  5.40s/it][A

	loss_cls: tensor(0.5337, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0560, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5897, device='cuda:0', grad_fn=<AddBackward0>)



 70%|███████   | 248/354 [22:24<09:33,  5.41s/it][A

	loss_cls: tensor(0.5971, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0452, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6423, device='cuda:0', grad_fn=<AddBackward0>)



 70%|███████   | 249/354 [22:29<09:26,  5.40s/it][A

	loss_cls: tensor(0.3943, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0932, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4876, device='cuda:0', grad_fn=<AddBackward0>)



 71%|███████   | 250/354 [22:35<09:22,  5.41s/it][A

	loss_cls: tensor(0.3401, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0987, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4387, device='cuda:0', grad_fn=<AddBackward0>)



 71%|███████   | 251/354 [22:40<09:15,  5.39s/it][A

	loss_cls: tensor(0.3409, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0276, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.3685, device='cuda:0', grad_fn=<AddBackward0>)



 71%|███████   | 252/354 [22:45<09:09,  5.39s/it][A

	loss_cls: tensor(0.6043, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1627, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7671, device='cuda:0', grad_fn=<AddBackward0>)



 71%|███████▏  | 253/354 [22:51<09:07,  5.42s/it][A

	loss_cls: tensor(0.8832, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3361, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2192, device='cuda:0', grad_fn=<AddBackward0>)



 72%|███████▏  | 254/354 [22:56<09:00,  5.41s/it][A

	loss_cls: tensor(0.8030, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1981, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0011, device='cuda:0', grad_fn=<AddBackward0>)



 72%|███████▏  | 255/354 [23:02<08:56,  5.42s/it][A

	loss_cls: tensor(0.3781, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1237, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5018, device='cuda:0', grad_fn=<AddBackward0>)



 72%|███████▏  | 256/354 [23:07<08:50,  5.41s/it][A

	loss_cls: tensor(0.7765, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1202, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8968, device='cuda:0', grad_fn=<AddBackward0>)



 73%|███████▎  | 257/354 [23:13<08:46,  5.42s/it][A

	loss_cls: tensor(0.7739, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2928, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0667, device='cuda:0', grad_fn=<AddBackward0>)



 73%|███████▎  | 258/354 [23:18<08:39,  5.41s/it][A

	loss_cls: tensor(0.7698, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1883, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9580, device='cuda:0', grad_fn=<AddBackward0>)



 73%|███████▎  | 259/354 [23:23<08:34,  5.42s/it][A

	loss_cls: tensor(0.9326, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1596, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0922, device='cuda:0', grad_fn=<AddBackward0>)



 73%|███████▎  | 260/354 [23:29<08:28,  5.41s/it][A

	loss_cls: tensor(0.6900, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1131, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8030, device='cuda:0', grad_fn=<AddBackward0>)



 74%|███████▎  | 261/354 [23:34<08:22,  5.40s/it][A

	loss_cls: tensor(0.4344, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0344, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4688, device='cuda:0', grad_fn=<AddBackward0>)



 74%|███████▍  | 262/354 [23:40<08:18,  5.41s/it][A

	loss_cls: tensor(0.5834, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1825, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7659, device='cuda:0', grad_fn=<AddBackward0>)



 74%|███████▍  | 263/354 [23:45<08:10,  5.39s/it][A

	loss_cls: tensor(0.5196, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1538, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6734, device='cuda:0', grad_fn=<AddBackward0>)



 75%|███████▍  | 264/354 [23:50<08:05,  5.40s/it][A

	loss_cls: tensor(0.8894, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2826, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1720, device='cuda:0', grad_fn=<AddBackward0>)



 75%|███████▍  | 265/354 [23:56<07:59,  5.39s/it][A

	loss_cls: tensor(0.6878, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1064, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7941, device='cuda:0', grad_fn=<AddBackward0>)



 75%|███████▌  | 266/354 [24:01<07:56,  5.42s/it][A

	loss_cls: tensor(0.5735, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3719, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9454, device='cuda:0', grad_fn=<AddBackward0>)



 75%|███████▌  | 267/354 [24:07<07:50,  5.41s/it][A

	loss_cls: tensor(0.6902, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3006, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9909, device='cuda:0', grad_fn=<AddBackward0>)



 76%|███████▌  | 268/354 [24:12<07:44,  5.40s/it][A

	loss_cls: tensor(0.6875, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0459, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7334, device='cuda:0', grad_fn=<AddBackward0>)



 76%|███████▌  | 269/354 [24:17<07:40,  5.42s/it][A

	loss_cls: tensor(0.4948, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2164, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7113, device='cuda:0', grad_fn=<AddBackward0>)



 76%|███████▋  | 270/354 [24:23<07:33,  5.40s/it][A

	loss_cls: tensor(0.4904, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2063, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6967, device='cuda:0', grad_fn=<AddBackward0>)



 77%|███████▋  | 271/354 [24:28<07:29,  5.41s/it][A

	loss_cls: tensor(0.4770, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2075, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6845, device='cuda:0', grad_fn=<AddBackward0>)



 77%|███████▋  | 272/354 [24:34<07:22,  5.39s/it][A

	loss_cls: tensor(0.7073, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2010, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9083, device='cuda:0', grad_fn=<AddBackward0>)



 77%|███████▋  | 273/354 [24:39<07:18,  5.41s/it][A

	loss_cls: tensor(0.6402, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4009, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0411, device='cuda:0', grad_fn=<AddBackward0>)



 77%|███████▋  | 274/354 [24:44<07:11,  5.40s/it][A

	loss_cls: tensor(1.0463, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1183, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1647, device='cuda:0', grad_fn=<AddBackward0>)



 78%|███████▊  | 275/354 [24:50<07:05,  5.39s/it][A

	loss_cls: tensor(0.6385, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2422, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8807, device='cuda:0', grad_fn=<AddBackward0>)



 78%|███████▊  | 276/354 [24:55<07:01,  5.41s/it][A

	loss_cls: tensor(0.7816, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2252, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0068, device='cuda:0', grad_fn=<AddBackward0>)



 78%|███████▊  | 277/354 [25:01<06:55,  5.39s/it][A

	loss_cls: tensor(0.6058, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1561, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7619, device='cuda:0', grad_fn=<AddBackward0>)



 79%|███████▊  | 278/354 [25:06<06:51,  5.41s/it][A

	loss_cls: tensor(0.5209, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0850, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6058, device='cuda:0', grad_fn=<AddBackward0>)



 79%|███████▉  | 279/354 [25:11<06:44,  5.39s/it][A

	loss_cls: tensor(0.7317, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3282, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0599, device='cuda:0', grad_fn=<AddBackward0>)



 79%|███████▉  | 280/354 [25:17<06:40,  5.41s/it][A

	loss_cls: tensor(0.6015, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2582, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8597, device='cuda:0', grad_fn=<AddBackward0>)



 79%|███████▉  | 281/354 [25:22<06:34,  5.40s/it][A

	loss_cls: tensor(0.5407, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0720, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6127, device='cuda:0', grad_fn=<AddBackward0>)



 80%|███████▉  | 282/354 [25:28<06:28,  5.40s/it][A

	loss_cls: tensor(0.4927, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2570, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7497, device='cuda:0', grad_fn=<AddBackward0>)



 80%|███████▉  | 283/354 [25:33<06:24,  5.42s/it][A

	loss_cls: tensor(0.7100, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4179, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1279, device='cuda:0', grad_fn=<AddBackward0>)



 80%|████████  | 284/354 [25:38<06:17,  5.40s/it][A

	loss_cls: tensor(0.6414, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2460, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8874, device='cuda:0', grad_fn=<AddBackward0>)



 81%|████████  | 285/354 [25:44<06:13,  5.41s/it][A

	loss_cls: tensor(0.6135, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2879, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9014, device='cuda:0', grad_fn=<AddBackward0>)



 81%|████████  | 286/354 [25:49<06:07,  5.40s/it][A

	loss_cls: tensor(0.5463, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2742, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8205, device='cuda:0', grad_fn=<AddBackward0>)



 81%|████████  | 287/354 [25:55<06:02,  5.42s/it][A

	loss_cls: tensor(0.5813, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4009, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9822, device='cuda:0', grad_fn=<AddBackward0>)



 81%|████████▏ | 288/354 [26:00<05:56,  5.41s/it][A

	loss_cls: tensor(0.7481, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0619, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8100, device='cuda:0', grad_fn=<AddBackward0>)



 82%|████████▏ | 289/354 [26:05<05:50,  5.40s/it][A

	loss_cls: tensor(0.6411, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2460, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8871, device='cuda:0', grad_fn=<AddBackward0>)



 82%|████████▏ | 290/354 [26:11<05:46,  5.41s/it][A

	loss_cls: tensor(0.5791, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1430, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7221, device='cuda:0', grad_fn=<AddBackward0>)



 82%|████████▏ | 291/354 [26:16<05:39,  5.39s/it][A

	loss_cls: tensor(0.5710, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2109, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7819, device='cuda:0', grad_fn=<AddBackward0>)



 82%|████████▏ | 292/354 [26:22<05:35,  5.41s/it][A

	loss_cls: tensor(0.4779, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0699, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5478, device='cuda:0', grad_fn=<AddBackward0>)



 83%|████████▎ | 293/354 [26:27<05:29,  5.40s/it][A

	loss_cls: tensor(0.6293, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2603, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8896, device='cuda:0', grad_fn=<AddBackward0>)



 83%|████████▎ | 294/354 [26:33<05:25,  5.42s/it][A

	loss_cls: tensor(0.6943, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3308, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0251, device='cuda:0', grad_fn=<AddBackward0>)



 83%|████████▎ | 295/354 [26:38<05:18,  5.40s/it][A

	loss_cls: tensor(0.6247, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1429, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7676, device='cuda:0', grad_fn=<AddBackward0>)



 84%|████████▎ | 296/354 [26:43<05:12,  5.39s/it][A

	loss_cls: tensor(0.5480, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0382, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5862, device='cuda:0', grad_fn=<AddBackward0>)



 84%|████████▍ | 297/354 [26:49<05:08,  5.41s/it][A

	loss_cls: tensor(0.7150, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1735, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8885, device='cuda:0', grad_fn=<AddBackward0>)



 84%|████████▍ | 298/354 [26:54<05:02,  5.39s/it][A

	loss_cls: tensor(0.5471, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1573, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7044, device='cuda:0', grad_fn=<AddBackward0>)



 84%|████████▍ | 299/354 [27:00<04:57,  5.40s/it][A

	loss_cls: tensor(0.6108, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3624, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9732, device='cuda:0', grad_fn=<AddBackward0>)



 85%|████████▍ | 300/354 [27:05<04:51,  5.40s/it][A

	loss_cls: tensor(0.6166, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1829, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7995, device='cuda:0', grad_fn=<AddBackward0>)



 85%|████████▌ | 301/354 [27:10<04:47,  5.42s/it][A

	loss_cls: tensor(0.5764, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1371, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7135, device='cuda:0', grad_fn=<AddBackward0>)



 85%|████████▌ | 302/354 [27:16<04:41,  5.40s/it][A

	loss_cls: tensor(0.3768, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0725, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4493, device='cuda:0', grad_fn=<AddBackward0>)



 86%|████████▌ | 303/354 [27:21<04:34,  5.38s/it][A

	loss_cls: tensor(0.6224, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1723, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7947, device='cuda:0', grad_fn=<AddBackward0>)



 86%|████████▌ | 304/354 [27:26<04:29,  5.40s/it][A

	loss_cls: tensor(0.7317, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2393, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9710, device='cuda:0', grad_fn=<AddBackward0>)



 86%|████████▌ | 305/354 [27:32<04:23,  5.38s/it][A

	loss_cls: tensor(0.7660, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0850, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8510, device='cuda:0', grad_fn=<AddBackward0>)



 86%|████████▋ | 306/354 [27:37<04:19,  5.41s/it][A

	loss_cls: tensor(0.5425, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2953, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8378, device='cuda:0', grad_fn=<AddBackward0>)



 87%|████████▋ | 307/354 [27:43<04:13,  5.40s/it][A

	loss_cls: tensor(0.4263, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0285, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4548, device='cuda:0', grad_fn=<AddBackward0>)



 87%|████████▋ | 308/354 [27:48<04:09,  5.41s/it][A

	loss_cls: tensor(0.5838, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0955, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6793, device='cuda:0', grad_fn=<AddBackward0>)



 87%|████████▋ | 309/354 [27:53<04:03,  5.40s/it][A

	loss_cls: tensor(0.9395, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3044, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2439, device='cuda:0', grad_fn=<AddBackward0>)



 88%|████████▊ | 310/354 [27:59<03:57,  5.39s/it][A

	loss_cls: tensor(0.4165, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1654, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5819, device='cuda:0', grad_fn=<AddBackward0>)



 88%|████████▊ | 311/354 [28:04<03:52,  5.40s/it][A

	loss_cls: tensor(0.5731, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1908, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7639, device='cuda:0', grad_fn=<AddBackward0>)



 88%|████████▊ | 312/354 [28:10<03:46,  5.38s/it][A

	loss_cls: tensor(0.7304, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2124, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9427, device='cuda:0', grad_fn=<AddBackward0>)



 88%|████████▊ | 313/354 [28:15<03:41,  5.41s/it][A

	loss_cls: tensor(0.6866, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2617, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9483, device='cuda:0', grad_fn=<AddBackward0>)



 89%|████████▊ | 314/354 [28:20<03:35,  5.40s/it][A

	loss_cls: tensor(1.0131, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0804, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0935, device='cuda:0', grad_fn=<AddBackward0>)



 89%|████████▉ | 315/354 [28:26<03:30,  5.41s/it][A

	loss_cls: tensor(0.4690, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1859, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6549, device='cuda:0', grad_fn=<AddBackward0>)



 89%|████████▉ | 316/354 [28:31<03:25,  5.40s/it][A

	loss_cls: tensor(0.5426, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3387, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8814, device='cuda:0', grad_fn=<AddBackward0>)



 90%|████████▉ | 317/354 [28:37<03:19,  5.40s/it][A

	loss_cls: tensor(0.5196, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2261, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7457, device='cuda:0', grad_fn=<AddBackward0>)



 90%|████████▉ | 318/354 [28:42<03:14,  5.41s/it][A

	loss_cls: tensor(0.6202, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1341, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7542, device='cuda:0', grad_fn=<AddBackward0>)



 90%|█████████ | 319/354 [28:48<03:09,  5.40s/it][A

	loss_cls: tensor(0.5101, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1829, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6930, device='cuda:0', grad_fn=<AddBackward0>)



 90%|█████████ | 320/354 [28:53<03:04,  5.43s/it][A

	loss_cls: tensor(0.5855, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0423, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6278, device='cuda:0', grad_fn=<AddBackward0>)



 91%|█████████ | 321/354 [28:58<02:58,  5.42s/it][A

	loss_cls: tensor(0.5640, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2102, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7743, device='cuda:0', grad_fn=<AddBackward0>)



 91%|█████████ | 322/354 [29:04<02:54,  5.44s/it][A

	loss_cls: tensor(0.7353, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3372, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0725, device='cuda:0', grad_fn=<AddBackward0>)



 91%|█████████ | 323/354 [29:09<02:47,  5.41s/it][A

	loss_cls: tensor(0.5289, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2765, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8054, device='cuda:0', grad_fn=<AddBackward0>)



 92%|█████████▏| 324/354 [29:15<02:41,  5.39s/it][A

	loss_cls: tensor(0.6193, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0819, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7012, device='cuda:0', grad_fn=<AddBackward0>)



 92%|█████████▏| 325/354 [29:20<02:36,  5.41s/it][A

	loss_cls: tensor(0.6934, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1999, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8933, device='cuda:0', grad_fn=<AddBackward0>)



 92%|█████████▏| 326/354 [29:25<02:31,  5.39s/it][A

	loss_cls: tensor(0.4303, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1751, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6054, device='cuda:0', grad_fn=<AddBackward0>)



 92%|█████████▏| 327/354 [29:31<02:26,  5.42s/it][A

	loss_cls: tensor(0.4735, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0690, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5424, device='cuda:0', grad_fn=<AddBackward0>)



 93%|█████████▎| 328/354 [29:36<02:20,  5.40s/it][A

	loss_cls: tensor(0.9986, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3043, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.3029, device='cuda:0', grad_fn=<AddBackward0>)



 93%|█████████▎| 329/354 [29:42<02:15,  5.42s/it][A

	loss_cls: tensor(0.6590, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4284, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0874, device='cuda:0', grad_fn=<AddBackward0>)



 93%|█████████▎| 330/354 [29:47<02:09,  5.40s/it][A

	loss_cls: tensor(0.5203, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2078, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7280, device='cuda:0', grad_fn=<AddBackward0>)



 94%|█████████▎| 331/354 [29:52<02:03,  5.39s/it][A

	loss_cls: tensor(0.5226, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0426, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5652, device='cuda:0', grad_fn=<AddBackward0>)



 94%|█████████▍| 332/354 [29:58<01:58,  5.41s/it][A

	loss_cls: tensor(0.5100, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1188, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6288, device='cuda:0', grad_fn=<AddBackward0>)



 94%|█████████▍| 333/354 [30:03<01:53,  5.39s/it][A

	loss_cls: tensor(0.3447, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0697, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4144, device='cuda:0', grad_fn=<AddBackward0>)



 94%|█████████▍| 334/354 [30:09<01:48,  5.41s/it][A

	loss_cls: tensor(0.4926, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4243, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9168, device='cuda:0', grad_fn=<AddBackward0>)



 95%|█████████▍| 335/354 [30:14<01:42,  5.39s/it][A

	loss_cls: tensor(0.5050, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0459, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5508, device='cuda:0', grad_fn=<AddBackward0>)



 95%|█████████▍| 336/354 [30:19<01:37,  5.41s/it][A

	loss_cls: tensor(1.0300, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3115, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.3414, device='cuda:0', grad_fn=<AddBackward0>)



 95%|█████████▌| 337/354 [30:25<01:31,  5.39s/it][A

	loss_cls: tensor(0.5431, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2104, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7535, device='cuda:0', grad_fn=<AddBackward0>)



 95%|█████████▌| 338/354 [30:30<01:25,  5.37s/it][A

	loss_cls: tensor(0.7216, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1715, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8931, device='cuda:0', grad_fn=<AddBackward0>)



 96%|█████████▌| 339/354 [30:36<01:20,  5.39s/it][A

	loss_cls: tensor(0.7686, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2268, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9954, device='cuda:0', grad_fn=<AddBackward0>)



 96%|█████████▌| 340/354 [30:41<01:15,  5.39s/it][A

	loss_cls: tensor(0.5036, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1536, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6572, device='cuda:0', grad_fn=<AddBackward0>)



 96%|█████████▋| 341/354 [30:46<01:10,  5.40s/it][A

	loss_cls: tensor(0.6879, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0968, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7847, device='cuda:0', grad_fn=<AddBackward0>)



 97%|█████████▋| 342/354 [30:52<01:04,  5.39s/it][A

	loss_cls: tensor(0.5316, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1219, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6535, device='cuda:0', grad_fn=<AddBackward0>)



 97%|█████████▋| 343/354 [30:57<00:59,  5.40s/it][A

	loss_cls: tensor(0.4832, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2612, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7444, device='cuda:0', grad_fn=<AddBackward0>)



 97%|█████████▋| 344/354 [31:03<00:53,  5.39s/it][A

	loss_cls: tensor(0.5856, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0350, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6206, device='cuda:0', grad_fn=<AddBackward0>)



 97%|█████████▋| 345/354 [31:08<00:48,  5.38s/it][A

	loss_cls: tensor(0.5362, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1155, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6517, device='cuda:0', grad_fn=<AddBackward0>)



 98%|█████████▊| 346/354 [31:13<00:43,  5.40s/it][A

	loss_cls: tensor(0.5902, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1803, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7705, device='cuda:0', grad_fn=<AddBackward0>)



 98%|█████████▊| 347/354 [31:19<00:37,  5.39s/it][A

	loss_cls: tensor(0.6295, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0877, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7173, device='cuda:0', grad_fn=<AddBackward0>)



 98%|█████████▊| 348/354 [31:24<00:32,  5.41s/it][A

	loss_cls: tensor(0.5031, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2705, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7736, device='cuda:0', grad_fn=<AddBackward0>)



 99%|█████████▊| 349/354 [31:30<00:26,  5.40s/it][A

	loss_cls: tensor(1.1149, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4494, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.5643, device='cuda:0', grad_fn=<AddBackward0>)



 99%|█████████▉| 350/354 [31:35<00:21,  5.41s/it][A

	loss_cls: tensor(0.5823, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2719, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8542, device='cuda:0', grad_fn=<AddBackward0>)



 99%|█████████▉| 351/354 [31:40<00:16,  5.39s/it][A

	loss_cls: tensor(1.0657, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1272, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1929, device='cuda:0', grad_fn=<AddBackward0>)



 99%|█████████▉| 352/354 [31:46<00:10,  5.38s/it][A

	loss_cls: tensor(0.4475, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1811, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6287, device='cuda:0', grad_fn=<AddBackward0>)



100%|█████████▉| 353/354 [31:51<00:05,  5.41s/it][A

	loss_cls: tensor(0.6439, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2655, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9094, device='cuda:0', grad_fn=<AddBackward0>)



100%|██████████| 354/354 [31:53<00:00,  5.41s/it][A

  0%|          | 0/177 [00:00<?, ?it/s][A

	loss_cls: tensor(0.5625, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3698, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9323, device='cuda:0', grad_fn=<AddBackward0>)

	Training Loss: 0.8435438060659474

	Training cls acc: 0.698858286252354

	Training cls prec: 0.5757473595926985

	Training cls rec: 0.6123081367784757

	Training cls f1: 0.534499391926655

--
	Training ner acc: 0.9552227652717331

	Training ner prec: 0.2764937331153249

	Training ner rec: 0.28273720052765694

	Training ner f1: 0.2785084143631785

	Current Learning rate:  0.0007714285714285715



  1%|          | 1/177 [00:00<02:08,  1.37it/s][A
  1%|          | 2/177 [00:01<02:07,  1.38it/s][A
  2%|▏         | 3/177 [00:02<02:04,  1.39it/s][A
  2%|▏         | 4/177 [00:02<01:59,  1.45it/s][A
  3%|▎         | 5/177 [00:03<02:01,  1.42it/s][A
  3%|▎         | 6/177 [00:04<02:01,  1.40it/s][A
  4%|▍         | 7/177 [00:04<01:56,  1.45it/s][A
  5%|▍         | 8/177 [00:05<01:57,  1.43it/s][A
  5%|▌         | 9/177 [00:06<01:58,  1.41it/s][A
  6%|▌         | 10/177 [00:07<01:58,  1.41it/s][A
  6%|▌         | 11/177 [00:07<01:54,  1.46it/s][A
  7%|▋         | 12/177 [00:08<01:55,  1.43it/s][A
  7%|▋         | 13/177 [00:09<01:56,  1.41it/s][A
  8%|▊         | 14/177 [00:09<01:52,  1.46it/s][A
  8%|▊         | 15/177 [00:10<01:53,  1.43it/s][A
  9%|▉         | 16/177 [00:11<01:54,  1.41it/s][A
 10%|▉         | 17/177 [00:11<01:54,  1.40it/s][A
 10%|█         | 18/177 [00:12<01:50,  1.44it/s][A
 11%|█         | 19/177 [00:13<01:50,  1.43it/s][A
 11%|█▏        | 20/


	Validation Loss: 0.7849979589214433

	Validation cls acc: 0.7003295668549906

	Validation cls prec: 0.6129573580844768

	Validation cls rec: 0.6036622276029056

	Validation cls f1: 0.558284402352199

--
	Validation ner acc: 0.9540077321792133

	Validation ner prec: 0.42118407154100446

	Validation ner rec: 0.4315442561205273

	Validation ner f1: 0.4261458679386315



  0%|          | 1/354 [00:05<31:28,  5.35s/it][A

	loss_cls: tensor(0.6010, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3783, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9793, device='cuda:0', grad_fn=<AddBackward0>)



  1%|          | 2/354 [00:10<31:48,  5.42s/it][A

	loss_cls: tensor(0.6194, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4328, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0522, device='cuda:0', grad_fn=<AddBackward0>)



  1%|          | 3/354 [00:16<31:38,  5.41s/it][A

	loss_cls: tensor(0.5728, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1853, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7581, device='cuda:0', grad_fn=<AddBackward0>)



  1%|          | 4/354 [00:21<31:41,  5.43s/it][A

	loss_cls: tensor(0.6029, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3192, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9220, device='cuda:0', grad_fn=<AddBackward0>)



  1%|▏         | 5/354 [00:27<31:30,  5.42s/it][A

	loss_cls: tensor(0.5115, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4212, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9327, device='cuda:0', grad_fn=<AddBackward0>)



  2%|▏         | 6/354 [00:32<31:21,  5.41s/it][A

	loss_cls: tensor(0.5732, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2652, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8384, device='cuda:0', grad_fn=<AddBackward0>)



  2%|▏         | 7/354 [00:37<31:22,  5.43s/it][A

	loss_cls: tensor(0.6474, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3100, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9574, device='cuda:0', grad_fn=<AddBackward0>)



  2%|▏         | 8/354 [00:43<31:13,  5.42s/it][A

	loss_cls: tensor(0.6444, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1384, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7828, device='cuda:0', grad_fn=<AddBackward0>)



  3%|▎         | 9/354 [00:48<31:13,  5.43s/it][A

	loss_cls: tensor(0.5928, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2117, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8045, device='cuda:0', grad_fn=<AddBackward0>)



  3%|▎         | 10/354 [00:54<31:06,  5.42s/it][A

	loss_cls: tensor(0.4303, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3859, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8162, device='cuda:0', grad_fn=<AddBackward0>)



  3%|▎         | 11/354 [00:59<31:04,  5.43s/it][A

	loss_cls: tensor(0.4802, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3758, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8560, device='cuda:0', grad_fn=<AddBackward0>)



  3%|▎         | 12/354 [01:05<30:52,  5.42s/it][A

	loss_cls: tensor(0.9261, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0932, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0193, device='cuda:0', grad_fn=<AddBackward0>)



  4%|▎         | 13/354 [01:10<30:42,  5.40s/it][A

	loss_cls: tensor(0.7410, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0363, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7773, device='cuda:0', grad_fn=<AddBackward0>)



  4%|▍         | 14/354 [01:15<30:43,  5.42s/it][A

	loss_cls: tensor(0.3954, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2137, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6091, device='cuda:0', grad_fn=<AddBackward0>)



  4%|▍         | 15/354 [01:21<30:32,  5.41s/it][A

	loss_cls: tensor(0.5437, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1731, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7169, device='cuda:0', grad_fn=<AddBackward0>)



  5%|▍         | 16/354 [01:26<30:36,  5.43s/it][A

	loss_cls: tensor(0.4218, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2645, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6863, device='cuda:0', grad_fn=<AddBackward0>)



  5%|▍         | 17/354 [01:32<30:26,  5.42s/it][A

	loss_cls: tensor(0.5735, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3746, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9480, device='cuda:0', grad_fn=<AddBackward0>)



  5%|▌         | 18/354 [01:37<30:23,  5.43s/it][A

	loss_cls: tensor(0.7055, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1249, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8304, device='cuda:0', grad_fn=<AddBackward0>)



  5%|▌         | 19/354 [01:42<30:12,  5.41s/it][A

	loss_cls: tensor(0.4577, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1305, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5882, device='cuda:0', grad_fn=<AddBackward0>)



  6%|▌         | 20/354 [01:48<30:03,  5.40s/it][A

	loss_cls: tensor(0.7978, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3111, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1089, device='cuda:0', grad_fn=<AddBackward0>)



  6%|▌         | 21/354 [01:53<30:05,  5.42s/it][A

	loss_cls: tensor(0.2871, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1212, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4083, device='cuda:0', grad_fn=<AddBackward0>)



  6%|▌         | 22/354 [01:59<29:56,  5.41s/it][A

	loss_cls: tensor(0.4216, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1286, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5502, device='cuda:0', grad_fn=<AddBackward0>)



  6%|▋         | 23/354 [02:04<29:56,  5.43s/it][A

	loss_cls: tensor(0.3892, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1834, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5726, device='cuda:0', grad_fn=<AddBackward0>)



  7%|▋         | 24/354 [02:10<29:47,  5.42s/it][A

	loss_cls: tensor(0.2821, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1718, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4538, device='cuda:0', grad_fn=<AddBackward0>)



  7%|▋         | 25/354 [02:15<29:47,  5.43s/it][A

	loss_cls: tensor(0.8503, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1978, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0481, device='cuda:0', grad_fn=<AddBackward0>)



  7%|▋         | 26/354 [02:20<29:35,  5.41s/it][A

	loss_cls: tensor(0.7481, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4588, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2069, device='cuda:0', grad_fn=<AddBackward0>)



  8%|▊         | 27/354 [02:26<29:27,  5.40s/it][A

	loss_cls: tensor(0.1851, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0498, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.2349, device='cuda:0', grad_fn=<AddBackward0>)



  8%|▊         | 28/354 [02:31<29:27,  5.42s/it][A

	loss_cls: tensor(0.5197, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2629, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7825, device='cuda:0', grad_fn=<AddBackward0>)



  8%|▊         | 29/354 [02:37<29:20,  5.42s/it][A

	loss_cls: tensor(0.5805, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2411, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8215, device='cuda:0', grad_fn=<AddBackward0>)



  8%|▊         | 30/354 [02:42<29:22,  5.44s/it][A

	loss_cls: tensor(0.3195, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1187, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4382, device='cuda:0', grad_fn=<AddBackward0>)



  9%|▉         | 31/354 [02:48<29:12,  5.43s/it][A

	loss_cls: tensor(0.9904, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2011, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1915, device='cuda:0', grad_fn=<AddBackward0>)



  9%|▉         | 32/354 [02:53<29:10,  5.44s/it][A

	loss_cls: tensor(0.5420, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1252, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6672, device='cuda:0', grad_fn=<AddBackward0>)



  9%|▉         | 33/354 [02:58<29:00,  5.42s/it][A

	loss_cls: tensor(0.7470, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2391, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9861, device='cuda:0', grad_fn=<AddBackward0>)



 10%|▉         | 34/354 [03:04<28:51,  5.41s/it][A

	loss_cls: tensor(0.5970, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2159, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8129, device='cuda:0', grad_fn=<AddBackward0>)



 10%|▉         | 35/354 [03:09<28:54,  5.44s/it][A

	loss_cls: tensor(0.4451, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1283, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5734, device='cuda:0', grad_fn=<AddBackward0>)



 10%|█         | 36/354 [03:15<28:46,  5.43s/it][A

	loss_cls: tensor(0.5384, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2307, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7691, device='cuda:0', grad_fn=<AddBackward0>)



 10%|█         | 37/354 [03:20<28:44,  5.44s/it][A

	loss_cls: tensor(0.4348, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2394, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6742, device='cuda:0', grad_fn=<AddBackward0>)



 11%|█         | 38/354 [03:25<28:33,  5.42s/it][A

	loss_cls: tensor(0.4499, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0690, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5189, device='cuda:0', grad_fn=<AddBackward0>)



 11%|█         | 39/354 [03:31<28:31,  5.43s/it][A

	loss_cls: tensor(0.6225, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1419, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7644, device='cuda:0', grad_fn=<AddBackward0>)



 11%|█▏        | 40/354 [03:36<28:20,  5.41s/it][A

	loss_cls: tensor(0.6174, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0517, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6691, device='cuda:0', grad_fn=<AddBackward0>)



 12%|█▏        | 41/354 [03:42<28:11,  5.41s/it][A

	loss_cls: tensor(0.3938, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2691, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6629, device='cuda:0', grad_fn=<AddBackward0>)



 12%|█▏        | 42/354 [03:47<28:14,  5.43s/it][A

	loss_cls: tensor(0.6418, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2934, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9352, device='cuda:0', grad_fn=<AddBackward0>)



 12%|█▏        | 43/354 [03:53<28:04,  5.42s/it][A

	loss_cls: tensor(0.6130, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1013, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7143, device='cuda:0', grad_fn=<AddBackward0>)



 12%|█▏        | 44/354 [03:58<28:05,  5.44s/it][A

	loss_cls: tensor(0.4749, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0267, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5016, device='cuda:0', grad_fn=<AddBackward0>)



 13%|█▎        | 45/354 [04:03<27:53,  5.41s/it][A

	loss_cls: tensor(0.5333, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1091, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6424, device='cuda:0', grad_fn=<AddBackward0>)



 13%|█▎        | 46/354 [04:09<27:52,  5.43s/it][A

	loss_cls: tensor(0.4199, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1477, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5677, device='cuda:0', grad_fn=<AddBackward0>)



 13%|█▎        | 47/354 [04:14<27:42,  5.41s/it][A

	loss_cls: tensor(0.5288, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2010, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7298, device='cuda:0', grad_fn=<AddBackward0>)



 14%|█▎        | 48/354 [04:20<27:39,  5.42s/it][A

	loss_cls: tensor(0.9601, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2697, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2298, device='cuda:0', grad_fn=<AddBackward0>)



 14%|█▍        | 49/354 [04:25<27:34,  5.42s/it][A

	loss_cls: tensor(0.6733, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4657, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1389, device='cuda:0', grad_fn=<AddBackward0>)



 14%|█▍        | 50/354 [04:31<27:26,  5.42s/it][A

	loss_cls: tensor(0.7036, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3164, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0201, device='cuda:0', grad_fn=<AddBackward0>)



 14%|█▍        | 51/354 [04:36<27:27,  5.44s/it][A

	loss_cls: tensor(0.5115, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5320, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0435, device='cuda:0', grad_fn=<AddBackward0>)



 15%|█▍        | 52/354 [04:41<27:16,  5.42s/it][A

	loss_cls: tensor(0.6537, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2955, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9491, device='cuda:0', grad_fn=<AddBackward0>)



 15%|█▍        | 53/354 [04:47<27:16,  5.44s/it][A

	loss_cls: tensor(0.3599, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1189, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4788, device='cuda:0', grad_fn=<AddBackward0>)



 15%|█▌        | 54/354 [04:52<27:03,  5.41s/it][A

	loss_cls: tensor(0.7748, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1589, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9337, device='cuda:0', grad_fn=<AddBackward0>)



 16%|█▌        | 55/354 [04:58<27:08,  5.45s/it][A

	loss_cls: tensor(0.4951, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1041, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5992, device='cuda:0', grad_fn=<AddBackward0>)



 16%|█▌        | 56/354 [05:03<27:00,  5.44s/it][A

	loss_cls: tensor(1.2910, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2189, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.5100, device='cuda:0', grad_fn=<AddBackward0>)



 16%|█▌        | 57/354 [05:09<26:50,  5.42s/it][A

	loss_cls: tensor(0.3911, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0711, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4622, device='cuda:0', grad_fn=<AddBackward0>)



 16%|█▋        | 58/354 [05:14<26:49,  5.44s/it][A

	loss_cls: tensor(0.5636, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1641, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7277, device='cuda:0', grad_fn=<AddBackward0>)



 17%|█▋        | 59/354 [05:19<26:38,  5.42s/it][A

	loss_cls: tensor(0.6397, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1129, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7526, device='cuda:0', grad_fn=<AddBackward0>)



 17%|█▋        | 60/354 [05:25<26:37,  5.43s/it][A

	loss_cls: tensor(0.4937, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2401, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7338, device='cuda:0', grad_fn=<AddBackward0>)



 17%|█▋        | 61/354 [05:30<26:26,  5.42s/it][A

	loss_cls: tensor(0.6843, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1986, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8829, device='cuda:0', grad_fn=<AddBackward0>)



 18%|█▊        | 62/354 [05:36<26:26,  5.43s/it][A

	loss_cls: tensor(0.6709, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4501, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1210, device='cuda:0', grad_fn=<AddBackward0>)



 18%|█▊        | 63/354 [05:41<26:18,  5.42s/it][A

	loss_cls: tensor(0.8649, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3354, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2002, device='cuda:0', grad_fn=<AddBackward0>)



 18%|█▊        | 64/354 [05:46<26:08,  5.41s/it][A

	loss_cls: tensor(0.5598, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0378, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5976, device='cuda:0', grad_fn=<AddBackward0>)



 18%|█▊        | 65/354 [05:52<26:06,  5.42s/it][A

	loss_cls: tensor(0.4573, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3768, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8340, device='cuda:0', grad_fn=<AddBackward0>)



 19%|█▊        | 66/354 [05:57<25:55,  5.40s/it][A

	loss_cls: tensor(0.5733, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2214, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7947, device='cuda:0', grad_fn=<AddBackward0>)



 19%|█▉        | 67/354 [06:03<25:53,  5.41s/it][A

	loss_cls: tensor(0.6239, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3079, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9318, device='cuda:0', grad_fn=<AddBackward0>)



 19%|█▉        | 68/354 [06:08<25:44,  5.40s/it][A

	loss_cls: tensor(0.4056, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0590, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4646, device='cuda:0', grad_fn=<AddBackward0>)



 19%|█▉        | 69/354 [06:14<25:45,  5.42s/it][A

	loss_cls: tensor(0.5419, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3388, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8807, device='cuda:0', grad_fn=<AddBackward0>)



 20%|█▉        | 70/354 [06:19<25:38,  5.42s/it][A

	loss_cls: tensor(0.5085, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1075, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6159, device='cuda:0', grad_fn=<AddBackward0>)



 20%|██        | 71/354 [06:24<25:29,  5.41s/it][A

	loss_cls: tensor(0.5158, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2541, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7699, device='cuda:0', grad_fn=<AddBackward0>)



 20%|██        | 72/354 [06:30<25:29,  5.42s/it][A

	loss_cls: tensor(0.5806, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1448, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7254, device='cuda:0', grad_fn=<AddBackward0>)



 21%|██        | 73/354 [06:35<25:20,  5.41s/it][A

	loss_cls: tensor(0.6923, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2158, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9082, device='cuda:0', grad_fn=<AddBackward0>)



 21%|██        | 74/354 [06:41<25:17,  5.42s/it][A

	loss_cls: tensor(0.3931, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0568, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4499, device='cuda:0', grad_fn=<AddBackward0>)



 21%|██        | 75/354 [06:46<25:08,  5.41s/it][A

	loss_cls: tensor(0.5293, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1853, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7147, device='cuda:0', grad_fn=<AddBackward0>)



 21%|██▏       | 76/354 [06:52<25:11,  5.44s/it][A

	loss_cls: tensor(0.5795, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3389, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9184, device='cuda:0', grad_fn=<AddBackward0>)



 22%|██▏       | 77/354 [06:57<25:07,  5.44s/it][A

	loss_cls: tensor(0.5363, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1467, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6830, device='cuda:0', grad_fn=<AddBackward0>)



 22%|██▏       | 78/354 [07:03<25:14,  5.49s/it][A

	loss_cls: tensor(1.0318, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3545, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.3864, device='cuda:0', grad_fn=<AddBackward0>)



 22%|██▏       | 79/354 [07:08<25:09,  5.49s/it][A

	loss_cls: tensor(0.5718, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1870, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7588, device='cuda:0', grad_fn=<AddBackward0>)



 23%|██▎       | 80/354 [07:13<24:54,  5.45s/it][A

	loss_cls: tensor(0.4962, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2582, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7543, device='cuda:0', grad_fn=<AddBackward0>)



 23%|██▎       | 81/354 [07:19<24:48,  5.45s/it][A

	loss_cls: tensor(0.6550, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2505, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9055, device='cuda:0', grad_fn=<AddBackward0>)



 23%|██▎       | 82/354 [07:24<24:38,  5.43s/it][A

	loss_cls: tensor(0.7474, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0440, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7914, device='cuda:0', grad_fn=<AddBackward0>)



 23%|██▎       | 83/354 [07:30<24:37,  5.45s/it][A

	loss_cls: tensor(0.5336, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0588, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5924, device='cuda:0', grad_fn=<AddBackward0>)



 24%|██▎       | 84/354 [07:35<24:35,  5.46s/it][A

	loss_cls: tensor(0.5541, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1998, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7538, device='cuda:0', grad_fn=<AddBackward0>)



 24%|██▍       | 85/354 [07:41<24:43,  5.51s/it][A

	loss_cls: tensor(0.3940, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0729, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4669, device='cuda:0', grad_fn=<AddBackward0>)



 24%|██▍       | 86/354 [07:47<24:54,  5.58s/it][A

	loss_cls: tensor(0.7884, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1179, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9063, device='cuda:0', grad_fn=<AddBackward0>)



 25%|██▍       | 87/354 [07:52<24:37,  5.53s/it][A

	loss_cls: tensor(0.5419, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1184, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6603, device='cuda:0', grad_fn=<AddBackward0>)



 25%|██▍       | 88/354 [07:58<24:28,  5.52s/it][A

	loss_cls: tensor(0.7463, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1800, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9263, device='cuda:0', grad_fn=<AddBackward0>)
	loss_cls: 


 25%|██▌       | 89/354 [08:06<28:37,  6.48s/it][A

tensor(0.6293, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3770, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0063, device='cuda:0', grad_fn=<AddBackward0>)
	loss_cls: 


 25%|██▌       | 90/354 [08:14<29:55,  6.80s/it][A

tensor(0.7746, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4698, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2444, device='cuda:0', grad_fn=<AddBackward0>)
	loss_cls: 


 26%|██▌       | 91/354 [08:21<30:22,  6.93s/it][A

tensor(0.5241, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0624, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5865, device='cuda:0', grad_fn=<AddBackward0>)



 26%|██▌       | 92/354 [08:27<29:17,  6.71s/it][A

	loss_cls: tensor(0.7132, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2557, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9689, device='cuda:0', grad_fn=<AddBackward0>)



 26%|██▋       | 93/354 [08:33<27:31,  6.33s/it][A

	loss_cls: tensor(0.5552, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1224, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6776, device='cuda:0', grad_fn=<AddBackward0>)



 27%|██▋       | 94/354 [08:38<26:26,  6.10s/it][A

	loss_cls: tensor(0.4536, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1559, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6095, device='cuda:0', grad_fn=<AddBackward0>)



 27%|██▋       | 95/354 [08:44<25:29,  5.91s/it][A

	loss_cls: tensor(0.3820, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3332, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7151, device='cuda:0', grad_fn=<AddBackward0>)



 27%|██▋       | 96/354 [08:49<24:40,  5.74s/it][A

	loss_cls: tensor(0.6895, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4043, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0938, device='cuda:0', grad_fn=<AddBackward0>)



 27%|██▋       | 97/354 [08:55<24:12,  5.65s/it][A

	loss_cls: tensor(0.4997, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1749, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6746, device='cuda:0', grad_fn=<AddBackward0>)



 28%|██▊       | 98/354 [09:00<23:44,  5.56s/it][A

	loss_cls: tensor(0.7652, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5673, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.3325, device='cuda:0', grad_fn=<AddBackward0>)



 28%|██▊       | 99/354 [09:05<23:22,  5.50s/it][A

	loss_cls: tensor(0.4208, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2305, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6513, device='cuda:0', grad_fn=<AddBackward0>)



 28%|██▊       | 100/354 [09:11<23:13,  5.48s/it][A

	loss_cls: tensor(0.4393, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2050, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6443, device='cuda:0', grad_fn=<AddBackward0>)



 29%|██▊       | 101/354 [09:16<22:58,  5.45s/it][A

	loss_cls: tensor(0.6020, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1820, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7840, device='cuda:0', grad_fn=<AddBackward0>)



 29%|██▉       | 102/354 [09:22<22:54,  5.45s/it][A

	loss_cls: tensor(0.6439, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2103, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8541, device='cuda:0', grad_fn=<AddBackward0>)



 29%|██▉       | 103/354 [09:27<22:43,  5.43s/it][A

	loss_cls: tensor(0.5303, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3218, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8522, device='cuda:0', grad_fn=<AddBackward0>)



 29%|██▉       | 104/354 [09:32<22:39,  5.44s/it][A

	loss_cls: tensor(0.3246, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0612, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.3858, device='cuda:0', grad_fn=<AddBackward0>)



 30%|██▉       | 105/354 [09:38<22:28,  5.41s/it][A

	loss_cls: tensor(0.6204, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2784, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8988, device='cuda:0', grad_fn=<AddBackward0>)



 30%|██▉       | 106/354 [09:43<22:22,  5.41s/it][A

	loss_cls: tensor(0.6231, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3033, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9264, device='cuda:0', grad_fn=<AddBackward0>)



 30%|███       | 107/354 [09:49<22:20,  5.43s/it][A

	loss_cls: tensor(0.8397, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3464, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1861, device='cuda:0', grad_fn=<AddBackward0>)



 31%|███       | 108/354 [09:54<22:13,  5.42s/it][A

	loss_cls: tensor(0.5113, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1074, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6187, device='cuda:0', grad_fn=<AddBackward0>)



 31%|███       | 109/354 [09:59<22:11,  5.44s/it][A

	loss_cls: tensor(0.4021, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3625, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7646, device='cuda:0', grad_fn=<AddBackward0>)



 31%|███       | 110/354 [10:05<22:04,  5.43s/it][A

	loss_cls: tensor(0.4849, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5980, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0830, device='cuda:0', grad_fn=<AddBackward0>)



 31%|███▏      | 111/354 [10:10<22:01,  5.44s/it][A

	loss_cls: tensor(0.4640, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0467, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5107, device='cuda:0', grad_fn=<AddBackward0>)



 32%|███▏      | 112/354 [10:16<21:52,  5.42s/it][A

	loss_cls: tensor(0.5182, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0738, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5920, device='cuda:0', grad_fn=<AddBackward0>)



 32%|███▏      | 113/354 [10:21<21:43,  5.41s/it][A

	loss_cls: tensor(0.9561, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2059, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1620, device='cuda:0', grad_fn=<AddBackward0>)



 32%|███▏      | 114/354 [10:27<21:40,  5.42s/it][A

	loss_cls: tensor(0.6581, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5278, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1859, device='cuda:0', grad_fn=<AddBackward0>)



 32%|███▏      | 115/354 [10:32<21:33,  5.41s/it][A

	loss_cls: tensor(0.5445, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2864, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8309, device='cuda:0', grad_fn=<AddBackward0>)



 33%|███▎      | 116/354 [10:37<21:33,  5.43s/it][A

	loss_cls: tensor(0.4366, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3211, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7577, device='cuda:0', grad_fn=<AddBackward0>)



 33%|███▎      | 117/354 [10:43<21:24,  5.42s/it][A

	loss_cls: tensor(0.7188, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1563, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8751, device='cuda:0', grad_fn=<AddBackward0>)



 33%|███▎      | 118/354 [10:48<21:19,  5.42s/it][A

	loss_cls: tensor(0.6829, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0994, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7823, device='cuda:0', grad_fn=<AddBackward0>)



 34%|███▎      | 119/354 [10:54<21:09,  5.40s/it][A

	loss_cls: tensor(0.4477, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2507, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6984, device='cuda:0', grad_fn=<AddBackward0>)



 34%|███▍      | 120/354 [10:59<21:01,  5.39s/it][A

	loss_cls: tensor(0.4472, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1286, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5758, device='cuda:0', grad_fn=<AddBackward0>)



 34%|███▍      | 121/354 [11:04<21:01,  5.41s/it][A

	loss_cls: tensor(0.4635, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1559, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6194, device='cuda:0', grad_fn=<AddBackward0>)



 34%|███▍      | 122/354 [11:10<20:54,  5.41s/it][A

	loss_cls: tensor(0.7059, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4949, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2008, device='cuda:0', grad_fn=<AddBackward0>)



 35%|███▍      | 123/354 [11:15<20:53,  5.42s/it][A

	loss_cls: tensor(0.6763, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0745, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7507, device='cuda:0', grad_fn=<AddBackward0>)



 35%|███▌      | 124/354 [11:21<20:45,  5.42s/it][A

	loss_cls: tensor(0.4240, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1364, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5604, device='cuda:0', grad_fn=<AddBackward0>)



 35%|███▌      | 125/354 [11:26<20:47,  5.45s/it][A

	loss_cls: tensor(0.6263, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1263, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7526, device='cuda:0', grad_fn=<AddBackward0>)



 36%|███▌      | 126/354 [11:32<20:38,  5.43s/it][A

	loss_cls: tensor(0.7699, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0936, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8635, device='cuda:0', grad_fn=<AddBackward0>)



 36%|███▌      | 127/354 [11:37<20:28,  5.41s/it][A

	loss_cls: tensor(0.9524, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2012, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1536, device='cuda:0', grad_fn=<AddBackward0>)



 36%|███▌      | 128/354 [11:42<20:27,  5.43s/it][A

	loss_cls: tensor(1.0489, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2846, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.3335, device='cuda:0', grad_fn=<AddBackward0>)



 36%|███▋      | 129/354 [11:48<20:21,  5.43s/it][A

	loss_cls: tensor(0.3941, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4203, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8144, device='cuda:0', grad_fn=<AddBackward0>)



 37%|███▋      | 130/354 [11:53<20:19,  5.45s/it][A

	loss_cls: tensor(0.5068, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1181, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6248, device='cuda:0', grad_fn=<AddBackward0>)



 37%|███▋      | 131/354 [11:59<20:10,  5.43s/it][A

	loss_cls: tensor(0.4439, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0618, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5057, device='cuda:0', grad_fn=<AddBackward0>)



 37%|███▋      | 132/354 [12:04<20:08,  5.44s/it][A

	loss_cls: tensor(0.5668, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0473, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6141, device='cuda:0', grad_fn=<AddBackward0>)



 38%|███▊      | 133/354 [12:10<19:58,  5.42s/it][A

	loss_cls: tensor(0.3684, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0408, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4092, device='cuda:0', grad_fn=<AddBackward0>)



 38%|███▊      | 134/354 [12:15<19:50,  5.41s/it][A

	loss_cls: tensor(0.9951, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2312, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2264, device='cuda:0', grad_fn=<AddBackward0>)



 38%|███▊      | 135/354 [12:20<19:49,  5.43s/it][A

	loss_cls: tensor(0.6600, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0987, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7587, device='cuda:0', grad_fn=<AddBackward0>)



 38%|███▊      | 136/354 [12:26<19:42,  5.42s/it][A

	loss_cls: tensor(0.5187, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4068, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9256, device='cuda:0', grad_fn=<AddBackward0>)



 39%|███▊      | 137/354 [12:31<19:41,  5.44s/it][A

	loss_cls: tensor(0.5414, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1028, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6442, device='cuda:0', grad_fn=<AddBackward0>)



 39%|███▉      | 138/354 [12:37<19:32,  5.43s/it][A

	loss_cls: tensor(0.3504, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3394, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6898, device='cuda:0', grad_fn=<AddBackward0>)



 39%|███▉      | 139/354 [12:42<19:28,  5.44s/it][A

	loss_cls: tensor(0.6021, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2106, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8126, device='cuda:0', grad_fn=<AddBackward0>)



 40%|███▉      | 140/354 [12:48<19:20,  5.42s/it][A

	loss_cls: tensor(0.6922, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1465, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8387, device='cuda:0', grad_fn=<AddBackward0>)



 40%|███▉      | 141/354 [12:53<19:12,  5.41s/it][A

	loss_cls: tensor(0.7279, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0813, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8092, device='cuda:0', grad_fn=<AddBackward0>)



 40%|████      | 142/354 [12:58<19:13,  5.44s/it][A

	loss_cls: tensor(0.7607, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2126, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9733, device='cuda:0', grad_fn=<AddBackward0>)



 40%|████      | 143/354 [13:04<19:06,  5.43s/it][A

	loss_cls: tensor(0.6401, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1845, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8246, device='cuda:0', grad_fn=<AddBackward0>)



 41%|████      | 144/354 [13:09<19:03,  5.44s/it][A

	loss_cls: tensor(0.6255, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4637, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0892, device='cuda:0', grad_fn=<AddBackward0>)



 41%|████      | 145/354 [13:15<18:54,  5.43s/it][A

	loss_cls: tensor(0.5168, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1801, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6969, device='cuda:0', grad_fn=<AddBackward0>)



 41%|████      | 146/354 [13:20<18:51,  5.44s/it][A

	loss_cls: tensor(0.6448, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2366, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8814, device='cuda:0', grad_fn=<AddBackward0>)



 42%|████▏     | 147/354 [13:26<18:42,  5.42s/it][A

	loss_cls: tensor(0.5248, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3911, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9160, device='cuda:0', grad_fn=<AddBackward0>)



 42%|████▏     | 148/354 [13:31<18:36,  5.42s/it][A

	loss_cls: tensor(0.5771, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2199, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7970, device='cuda:0', grad_fn=<AddBackward0>)



 42%|████▏     | 149/354 [13:36<18:31,  5.42s/it][A

	loss_cls: tensor(0.8301, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2851, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1151, device='cuda:0', grad_fn=<AddBackward0>)



 42%|████▏     | 150/354 [13:42<18:26,  5.42s/it][A

	loss_cls: tensor(0.4101, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2955, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7056, device='cuda:0', grad_fn=<AddBackward0>)



 43%|████▎     | 151/354 [13:47<18:28,  5.46s/it][A

	loss_cls: tensor(0.7204, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3317, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0521, device='cuda:0', grad_fn=<AddBackward0>)



 43%|████▎     | 152/354 [13:53<18:17,  5.43s/it][A

	loss_cls: tensor(0.4609, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4086, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8696, device='cuda:0', grad_fn=<AddBackward0>)



 43%|████▎     | 153/354 [13:58<18:13,  5.44s/it][A

	loss_cls: tensor(0.4961, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2716, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7677, device='cuda:0', grad_fn=<AddBackward0>)



 44%|████▎     | 154/354 [14:04<18:01,  5.41s/it][A

	loss_cls: tensor(0.6639, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1604, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8243, device='cuda:0', grad_fn=<AddBackward0>)



 44%|████▍     | 155/354 [14:09<18:00,  5.43s/it][A

	loss_cls: tensor(0.8262, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2002, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0263, device='cuda:0', grad_fn=<AddBackward0>)



 44%|████▍     | 156/354 [14:14<17:51,  5.41s/it][A

	loss_cls: tensor(0.5683, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2208, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7891, device='cuda:0', grad_fn=<AddBackward0>)



 44%|████▍     | 157/354 [14:20<17:43,  5.40s/it][A

	loss_cls: tensor(0.7125, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3155, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0280, device='cuda:0', grad_fn=<AddBackward0>)



 45%|████▍     | 158/354 [14:25<17:40,  5.41s/it][A

	loss_cls: tensor(0.3527, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1897, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5424, device='cuda:0', grad_fn=<AddBackward0>)



 45%|████▍     | 159/354 [14:31<17:32,  5.40s/it][A

	loss_cls: tensor(0.7770, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1606, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9376, device='cuda:0', grad_fn=<AddBackward0>)



 45%|████▌     | 160/354 [14:36<17:30,  5.42s/it][A

	loss_cls: tensor(0.6661, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2905, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9566, device='cuda:0', grad_fn=<AddBackward0>)



 45%|████▌     | 161/354 [14:41<17:22,  5.40s/it][A

	loss_cls: tensor(0.6260, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1753, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8013, device='cuda:0', grad_fn=<AddBackward0>)



 46%|████▌     | 162/354 [14:47<17:21,  5.42s/it][A

	loss_cls: tensor(0.4976, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1376, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6352, device='cuda:0', grad_fn=<AddBackward0>)



 46%|████▌     | 163/354 [14:52<17:13,  5.41s/it][A

	loss_cls: tensor(0.7447, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3991, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1438, device='cuda:0', grad_fn=<AddBackward0>)



 46%|████▋     | 164/354 [14:58<17:07,  5.41s/it][A

	loss_cls: tensor(0.2935, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1357, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4293, device='cuda:0', grad_fn=<AddBackward0>)



 47%|████▋     | 165/354 [15:03<17:04,  5.42s/it][A

	loss_cls: tensor(0.6571, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.6758, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.3329, device='cuda:0', grad_fn=<AddBackward0>)



 47%|████▋     | 166/354 [15:08<16:55,  5.40s/it][A

	loss_cls: tensor(0.7805, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0986, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8791, device='cuda:0', grad_fn=<AddBackward0>)



 47%|████▋     | 167/354 [15:14<16:52,  5.41s/it][A

	loss_cls: tensor(0.5154, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0651, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5805, device='cuda:0', grad_fn=<AddBackward0>)



 47%|████▋     | 168/354 [15:19<16:44,  5.40s/it][A

	loss_cls: tensor(0.5348, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2342, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7691, device='cuda:0', grad_fn=<AddBackward0>)



 48%|████▊     | 169/354 [15:25<16:42,  5.42s/it][A

	loss_cls: tensor(0.5500, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5306, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0806, device='cuda:0', grad_fn=<AddBackward0>)



 48%|████▊     | 170/354 [15:30<16:34,  5.41s/it][A

	loss_cls: tensor(0.8327, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2665, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0992, device='cuda:0', grad_fn=<AddBackward0>)



 48%|████▊     | 171/354 [15:35<16:25,  5.39s/it][A

	loss_cls: tensor(0.6173, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1100, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7272, device='cuda:0', grad_fn=<AddBackward0>)



 49%|████▊     | 172/354 [15:41<16:21,  5.39s/it][A

	loss_cls: tensor(0.6281, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0846, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7127, device='cuda:0', grad_fn=<AddBackward0>)



 49%|████▉     | 173/354 [15:46<16:14,  5.38s/it][A

	loss_cls: tensor(0.5588, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1425, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7013, device='cuda:0', grad_fn=<AddBackward0>)



 49%|████▉     | 174/354 [15:52<16:11,  5.40s/it][A

	loss_cls: tensor(0.7136, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1347, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8483, device='cuda:0', grad_fn=<AddBackward0>)



 49%|████▉     | 175/354 [15:57<16:03,  5.38s/it][A

	loss_cls: tensor(0.4728, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3724, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8452, device='cuda:0', grad_fn=<AddBackward0>)



 50%|████▉     | 176/354 [16:02<16:00,  5.40s/it][A

	loss_cls: tensor(0.5643, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0843, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6486, device='cuda:0', grad_fn=<AddBackward0>)



 50%|█████     | 177/354 [16:08<15:52,  5.38s/it][A

	loss_cls: tensor(0.5019, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2364, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7383, device='cuda:0', grad_fn=<AddBackward0>)



 50%|█████     | 178/354 [16:13<15:44,  5.37s/it][A

	loss_cls: tensor(0.4695, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2981, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7676, device='cuda:0', grad_fn=<AddBackward0>)



 51%|█████     | 179/354 [16:19<15:43,  5.39s/it][A

	loss_cls: tensor(0.4333, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2589, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6922, device='cuda:0', grad_fn=<AddBackward0>)



 51%|█████     | 180/354 [16:24<15:35,  5.38s/it][A

	loss_cls: tensor(0.6531, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3291, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9822, device='cuda:0', grad_fn=<AddBackward0>)



 51%|█████     | 181/354 [16:29<15:31,  5.38s/it][A

	loss_cls: tensor(0.8271, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2151, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0421, device='cuda:0', grad_fn=<AddBackward0>)



 51%|█████▏    | 182/354 [16:35<15:23,  5.37s/it][A

	loss_cls: tensor(0.6641, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2857, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9498, device='cuda:0', grad_fn=<AddBackward0>)



 52%|█████▏    | 183/354 [16:40<15:21,  5.39s/it][A

	loss_cls: tensor(0.6721, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3488, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0209, device='cuda:0', grad_fn=<AddBackward0>)



 52%|█████▏    | 184/354 [16:45<15:14,  5.38s/it][A

	loss_cls: tensor(0.6285, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2419, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8704, device='cuda:0', grad_fn=<AddBackward0>)



 52%|█████▏    | 185/354 [16:51<15:07,  5.37s/it][A

	loss_cls: tensor(1.1492, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2113, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.3605, device='cuda:0', grad_fn=<AddBackward0>)



 53%|█████▎    | 186/354 [16:56<15:04,  5.39s/it][A

	loss_cls: tensor(0.4091, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2254, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6346, device='cuda:0', grad_fn=<AddBackward0>)



 53%|█████▎    | 187/354 [17:02<14:57,  5.37s/it][A

	loss_cls: tensor(0.3848, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2506, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6354, device='cuda:0', grad_fn=<AddBackward0>)



 53%|█████▎    | 188/354 [17:07<14:55,  5.39s/it][A

	loss_cls: tensor(0.9657, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3943, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.3600, device='cuda:0', grad_fn=<AddBackward0>)



 53%|█████▎    | 189/354 [17:12<14:47,  5.38s/it][A

	loss_cls: tensor(0.6017, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4470, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0487, device='cuda:0', grad_fn=<AddBackward0>)



 54%|█████▎    | 190/354 [17:18<14:44,  5.39s/it][A

	loss_cls: tensor(0.7387, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2450, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9837, device='cuda:0', grad_fn=<AddBackward0>)



 54%|█████▍    | 191/354 [17:23<14:36,  5.38s/it][A

	loss_cls: tensor(0.5531, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1950, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7481, device='cuda:0', grad_fn=<AddBackward0>)



 54%|█████▍    | 192/354 [17:28<14:29,  5.37s/it][A

	loss_cls: tensor(0.8105, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0958, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9063, device='cuda:0', grad_fn=<AddBackward0>)



 55%|█████▍    | 193/354 [17:34<14:27,  5.39s/it][A

	loss_cls: tensor(0.4757, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2771, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7528, device='cuda:0', grad_fn=<AddBackward0>)



 55%|█████▍    | 194/354 [17:39<14:18,  5.37s/it][A

	loss_cls: tensor(0.5244, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3939, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9183, device='cuda:0', grad_fn=<AddBackward0>)



 55%|█████▌    | 195/354 [17:45<14:16,  5.39s/it][A

	loss_cls: tensor(0.3931, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1994, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5925, device='cuda:0', grad_fn=<AddBackward0>)



 55%|█████▌    | 196/354 [17:50<14:10,  5.38s/it][A

	loss_cls: tensor(0.8578, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3177, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1755, device='cuda:0', grad_fn=<AddBackward0>)



 56%|█████▌    | 197/354 [17:55<14:06,  5.39s/it][A

	loss_cls: tensor(0.4833, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3065, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7898, device='cuda:0', grad_fn=<AddBackward0>)



 56%|█████▌    | 198/354 [18:01<13:59,  5.38s/it][A

	loss_cls: tensor(0.4195, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0457, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4652, device='cuda:0', grad_fn=<AddBackward0>)



 56%|█████▌    | 199/354 [18:06<13:57,  5.40s/it][A

	loss_cls: tensor(0.5027, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1134, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6161, device='cuda:0', grad_fn=<AddBackward0>)



 56%|█████▋    | 200/354 [18:12<13:59,  5.45s/it][A

	loss_cls: tensor(0.3210, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2361, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5571, device='cuda:0', grad_fn=<AddBackward0>)



 57%|█████▋    | 201/354 [18:17<13:53,  5.45s/it][A

	loss_cls: tensor(0.4583, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0306, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4889, device='cuda:0', grad_fn=<AddBackward0>)



 57%|█████▋    | 202/354 [18:23<13:52,  5.48s/it][A

	loss_cls: tensor(0.7130, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0862, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7992, device='cuda:0', grad_fn=<AddBackward0>)



 57%|█████▋    | 203/354 [18:28<13:47,  5.48s/it][A

	loss_cls: tensor(1.0062, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1480, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1542, device='cuda:0', grad_fn=<AddBackward0>)



 58%|█████▊    | 204/354 [18:34<13:44,  5.50s/it][A

	loss_cls: tensor(0.7194, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.6888, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.4081, device='cuda:0', grad_fn=<AddBackward0>)



 58%|█████▊    | 205/354 [18:39<13:37,  5.49s/it][A

	loss_cls: tensor(1.0856, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3533, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.4388, device='cuda:0', grad_fn=<AddBackward0>)



 58%|█████▊    | 206/354 [18:45<13:29,  5.47s/it][A

	loss_cls: tensor(0.7246, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2439, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9685, device='cuda:0', grad_fn=<AddBackward0>)



 58%|█████▊    | 207/354 [18:50<13:26,  5.49s/it][A

	loss_cls: tensor(0.6628, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2762, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9390, device='cuda:0', grad_fn=<AddBackward0>)



 59%|█████▉    | 208/354 [18:56<13:19,  5.48s/it][A

	loss_cls: tensor(0.6355, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0476, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6831, device='cuda:0', grad_fn=<AddBackward0>)



 59%|█████▉    | 209/354 [19:01<13:17,  5.50s/it][A

	loss_cls: tensor(0.5960, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1097, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7056, device='cuda:0', grad_fn=<AddBackward0>)



 59%|█████▉    | 210/354 [19:07<13:09,  5.48s/it][A

	loss_cls: tensor(0.5496, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2153, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7649, device='cuda:0', grad_fn=<AddBackward0>)



 60%|█████▉    | 211/354 [19:12<13:05,  5.50s/it][A

	loss_cls: tensor(0.5385, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3301, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8686, device='cuda:0', grad_fn=<AddBackward0>)



 60%|█████▉    | 212/354 [19:18<12:59,  5.49s/it][A

	loss_cls: tensor(0.5357, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3653, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9010, device='cuda:0', grad_fn=<AddBackward0>)



 60%|██████    | 213/354 [19:23<12:51,  5.47s/it][A

	loss_cls: tensor(0.6455, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2723, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9178, device='cuda:0', grad_fn=<AddBackward0>)



 60%|██████    | 214/354 [19:29<12:48,  5.49s/it][A

	loss_cls: tensor(0.6954, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4334, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1288, device='cuda:0', grad_fn=<AddBackward0>)



 61%|██████    | 215/354 [19:34<12:41,  5.48s/it][A

	loss_cls: tensor(0.6762, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1919, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8681, device='cuda:0', grad_fn=<AddBackward0>)



 61%|██████    | 216/354 [19:40<12:38,  5.50s/it][A

	loss_cls: tensor(0.6190, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1176, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7366, device='cuda:0', grad_fn=<AddBackward0>)



 61%|██████▏   | 217/354 [19:45<12:30,  5.48s/it][A

	loss_cls: tensor(0.4614, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2530, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7143, device='cuda:0', grad_fn=<AddBackward0>)



 62%|██████▏   | 218/354 [19:51<12:26,  5.49s/it][A

	loss_cls: tensor(0.5612, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2361, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7973, device='cuda:0', grad_fn=<AddBackward0>)



 62%|██████▏   | 219/354 [19:56<12:19,  5.48s/it][A

	loss_cls: tensor(0.7914, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0590, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8504, device='cuda:0', grad_fn=<AddBackward0>)



 62%|██████▏   | 220/354 [20:01<12:11,  5.46s/it][A

	loss_cls: tensor(1.0540, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1354, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1894, device='cuda:0', grad_fn=<AddBackward0>)



 62%|██████▏   | 221/354 [20:07<12:10,  5.49s/it][A

	loss_cls: tensor(0.7412, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1875, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9287, device='cuda:0', grad_fn=<AddBackward0>)



 63%|██████▎   | 222/354 [20:12<12:03,  5.48s/it][A

	loss_cls: tensor(0.6234, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2504, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8738, device='cuda:0', grad_fn=<AddBackward0>)



 63%|██████▎   | 223/354 [20:18<12:01,  5.50s/it][A

	loss_cls: tensor(0.6628, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2309, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8938, device='cuda:0', grad_fn=<AddBackward0>)



 63%|██████▎   | 224/354 [20:23<11:53,  5.49s/it][A

	loss_cls: tensor(0.6508, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0800, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7308, device='cuda:0', grad_fn=<AddBackward0>)



 64%|██████▎   | 225/354 [20:29<11:50,  5.50s/it][A

	loss_cls: tensor(0.5256, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5090, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0346, device='cuda:0', grad_fn=<AddBackward0>)



 64%|██████▍   | 226/354 [20:34<11:42,  5.49s/it][A

	loss_cls: tensor(0.4668, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1856, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6524, device='cuda:0', grad_fn=<AddBackward0>)



 64%|██████▍   | 227/354 [20:40<11:34,  5.47s/it][A

	loss_cls: tensor(0.5176, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0160, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5336, device='cuda:0', grad_fn=<AddBackward0>)



 64%|██████▍   | 228/354 [20:45<11:31,  5.49s/it][A

	loss_cls: tensor(0.5009, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0743, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5752, device='cuda:0', grad_fn=<AddBackward0>)



 65%|██████▍   | 229/354 [20:51<11:25,  5.48s/it][A

	loss_cls: tensor(0.5234, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2834, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8068, device='cuda:0', grad_fn=<AddBackward0>)



 65%|██████▍   | 230/354 [20:56<11:21,  5.50s/it][A

	loss_cls: tensor(0.5495, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2821, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8316, device='cuda:0', grad_fn=<AddBackward0>)



 65%|██████▌   | 231/354 [21:02<11:15,  5.49s/it][A

	loss_cls: tensor(0.5731, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1374, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7106, device='cuda:0', grad_fn=<AddBackward0>)



 66%|██████▌   | 232/354 [21:07<11:11,  5.51s/it][A

	loss_cls: tensor(0.7364, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2134, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9498, device='cuda:0', grad_fn=<AddBackward0>)



 66%|██████▌   | 233/354 [21:13<11:03,  5.49s/it][A

	loss_cls: tensor(0.6332, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1069, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7402, device='cuda:0', grad_fn=<AddBackward0>)



 66%|██████▌   | 234/354 [21:18<10:56,  5.47s/it][A

	loss_cls: tensor(0.6791, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4004, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0795, device='cuda:0', grad_fn=<AddBackward0>)



 66%|██████▋   | 235/354 [21:24<10:54,  5.50s/it][A

	loss_cls: tensor(0.5868, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1245, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7113, device='cuda:0', grad_fn=<AddBackward0>)



 67%|██████▋   | 236/354 [21:29<10:47,  5.48s/it][A

	loss_cls: tensor(0.3817, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2060, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5878, device='cuda:0', grad_fn=<AddBackward0>)



 67%|██████▋   | 237/354 [21:35<10:40,  5.47s/it][A

	loss_cls: tensor(0.4366, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1525, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5891, device='cuda:0', grad_fn=<AddBackward0>)



 67%|██████▋   | 238/354 [21:40<10:29,  5.42s/it][A

	loss_cls: tensor(0.8634, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2645, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1279, device='cuda:0', grad_fn=<AddBackward0>)



 68%|██████▊   | 239/354 [21:46<10:23,  5.42s/it][A

	loss_cls: tensor(0.6502, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1220, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7722, device='cuda:0', grad_fn=<AddBackward0>)



 68%|██████▊   | 240/354 [21:51<10:14,  5.39s/it][A

	loss_cls: tensor(0.4673, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1087, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5760, device='cuda:0', grad_fn=<AddBackward0>)



 68%|██████▊   | 241/354 [21:56<10:07,  5.38s/it][A

	loss_cls: tensor(0.5232, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2264, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7496, device='cuda:0', grad_fn=<AddBackward0>)



 68%|██████▊   | 242/354 [22:02<10:03,  5.39s/it][A

	loss_cls: tensor(0.3922, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2769, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6692, device='cuda:0', grad_fn=<AddBackward0>)



 69%|██████▊   | 243/354 [22:07<09:56,  5.38s/it][A

	loss_cls: tensor(0.4863, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0318, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5181, device='cuda:0', grad_fn=<AddBackward0>)



 69%|██████▉   | 244/354 [22:12<09:53,  5.39s/it][A

	loss_cls: tensor(0.5818, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0801, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6619, device='cuda:0', grad_fn=<AddBackward0>)



 69%|██████▉   | 245/354 [22:18<09:45,  5.38s/it][A

	loss_cls: tensor(0.5425, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5546, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0970, device='cuda:0', grad_fn=<AddBackward0>)



 69%|██████▉   | 246/354 [22:23<09:41,  5.38s/it][A

	loss_cls: tensor(0.7667, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2943, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0610, device='cuda:0', grad_fn=<AddBackward0>)



 70%|██████▉   | 247/354 [22:28<09:34,  5.37s/it][A

	loss_cls: tensor(0.8908, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1968, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0876, device='cuda:0', grad_fn=<AddBackward0>)



 70%|███████   | 248/354 [22:34<09:30,  5.38s/it][A

	loss_cls: tensor(0.7178, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1508, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8686, device='cuda:0', grad_fn=<AddBackward0>)



 70%|███████   | 249/354 [22:39<09:24,  5.38s/it][A

	loss_cls: tensor(0.6297, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3508, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9804, device='cuda:0', grad_fn=<AddBackward0>)



 71%|███████   | 250/354 [22:45<09:17,  5.36s/it][A

	loss_cls: tensor(0.5788, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3367, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9155, device='cuda:0', grad_fn=<AddBackward0>)



 71%|███████   | 251/354 [22:50<09:14,  5.38s/it][A

	loss_cls: tensor(0.7088, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0673, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7761, device='cuda:0', grad_fn=<AddBackward0>)



 71%|███████   | 252/354 [22:55<09:08,  5.37s/it][A

	loss_cls: tensor(0.6500, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2141, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8641, device='cuda:0', grad_fn=<AddBackward0>)



 71%|███████▏  | 253/354 [23:01<09:04,  5.39s/it][A

	loss_cls: tensor(0.7657, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0777, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8433, device='cuda:0', grad_fn=<AddBackward0>)



 72%|███████▏  | 254/354 [23:06<08:57,  5.37s/it][A

	loss_cls: tensor(0.5559, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2757, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8316, device='cuda:0', grad_fn=<AddBackward0>)



 72%|███████▏  | 255/354 [23:12<08:53,  5.39s/it][A

	loss_cls: tensor(0.3519, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3498, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7017, device='cuda:0', grad_fn=<AddBackward0>)



 72%|███████▏  | 256/354 [23:17<08:47,  5.38s/it][A

	loss_cls: tensor(0.6325, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0896, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7221, device='cuda:0', grad_fn=<AddBackward0>)



 73%|███████▎  | 257/354 [23:22<08:41,  5.38s/it][A

	loss_cls: tensor(0.5559, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2435, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7995, device='cuda:0', grad_fn=<AddBackward0>)



 73%|███████▎  | 258/354 [23:28<08:37,  5.39s/it][A

	loss_cls: tensor(0.4317, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0960, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5278, device='cuda:0', grad_fn=<AddBackward0>)



 73%|███████▎  | 259/354 [23:33<08:31,  5.38s/it][A

	loss_cls: tensor(0.6335, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1175, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7510, device='cuda:0', grad_fn=<AddBackward0>)



 73%|███████▎  | 260/354 [23:38<08:27,  5.40s/it][A

	loss_cls: tensor(0.4286, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1469, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5755, device='cuda:0', grad_fn=<AddBackward0>)



 74%|███████▎  | 261/354 [23:44<08:19,  5.37s/it][A

	loss_cls: tensor(0.4733, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4071, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8804, device='cuda:0', grad_fn=<AddBackward0>)



 74%|███████▍  | 262/354 [23:49<08:16,  5.39s/it][A

	loss_cls: tensor(0.7358, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2160, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9518, device='cuda:0', grad_fn=<AddBackward0>)



 74%|███████▍  | 263/354 [23:55<08:09,  5.38s/it][A

	loss_cls: tensor(1.0548, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1568, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2116, device='cuda:0', grad_fn=<AddBackward0>)



 75%|███████▍  | 264/354 [24:00<08:03,  5.37s/it][A

	loss_cls: tensor(0.8092, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3680, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1772, device='cuda:0', grad_fn=<AddBackward0>)



 75%|███████▍  | 265/354 [24:05<08:00,  5.39s/it][A

	loss_cls: tensor(0.8812, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2367, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1179, device='cuda:0', grad_fn=<AddBackward0>)



 75%|███████▌  | 266/354 [24:11<07:53,  5.38s/it][A

	loss_cls: tensor(0.8492, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2937, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1429, device='cuda:0', grad_fn=<AddBackward0>)



 75%|███████▌  | 267/354 [24:16<07:47,  5.37s/it][A

	loss_cls: tensor(0.3961, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1683, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5644, device='cuda:0', grad_fn=<AddBackward0>)



 76%|███████▌  | 268/354 [24:21<07:40,  5.35s/it][A

	loss_cls: tensor(0.5916, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1878, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7794, device='cuda:0', grad_fn=<AddBackward0>)



 76%|███████▌  | 269/354 [24:27<07:36,  5.37s/it][A

	loss_cls: tensor(0.5023, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1827, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6850, device='cuda:0', grad_fn=<AddBackward0>)



 76%|███████▋  | 270/354 [24:32<07:29,  5.36s/it][A

	loss_cls: tensor(0.4944, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1438, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6382, device='cuda:0', grad_fn=<AddBackward0>)



 77%|███████▋  | 271/354 [24:37<07:23,  5.34s/it][A

	loss_cls: tensor(0.6784, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1748, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8532, device='cuda:0', grad_fn=<AddBackward0>)



 77%|███████▋  | 272/354 [24:43<07:18,  5.35s/it][A

	loss_cls: tensor(0.9012, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3829, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2841, device='cuda:0', grad_fn=<AddBackward0>)



 77%|███████▋  | 273/354 [24:48<07:12,  5.34s/it][A

	loss_cls: tensor(0.5194, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1224, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6418, device='cuda:0', grad_fn=<AddBackward0>)



 77%|███████▋  | 274/354 [24:53<07:07,  5.34s/it][A

	loss_cls: tensor(0.5676, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3745, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9422, device='cuda:0', grad_fn=<AddBackward0>)



 78%|███████▊  | 275/354 [24:59<07:01,  5.34s/it][A

	loss_cls: tensor(0.5866, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0828, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6695, device='cuda:0', grad_fn=<AddBackward0>)



 78%|███████▊  | 276/354 [25:04<06:58,  5.36s/it][A

	loss_cls: tensor(0.6858, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3698, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0556, device='cuda:0', grad_fn=<AddBackward0>)



 78%|███████▊  | 277/354 [25:10<06:52,  5.36s/it][A

	loss_cls: tensor(0.5451, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3484, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8936, device='cuda:0', grad_fn=<AddBackward0>)



 79%|███████▊  | 278/354 [25:15<06:46,  5.35s/it][A

	loss_cls: tensor(0.7077, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1845, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8922, device='cuda:0', grad_fn=<AddBackward0>)



 79%|███████▉  | 279/354 [25:20<06:42,  5.36s/it][A

	loss_cls: tensor(0.4296, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3831, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8127, device='cuda:0', grad_fn=<AddBackward0>)



 79%|███████▉  | 280/354 [25:26<06:35,  5.35s/it][A

	loss_cls: tensor(0.3704, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1302, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5006, device='cuda:0', grad_fn=<AddBackward0>)



 79%|███████▉  | 281/354 [25:31<06:31,  5.36s/it][A

	loss_cls: tensor(0.3740, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2374, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6114, device='cuda:0', grad_fn=<AddBackward0>)



 80%|███████▉  | 282/354 [25:36<06:25,  5.35s/it][A

	loss_cls: tensor(0.4392, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2241, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6634, device='cuda:0', grad_fn=<AddBackward0>)



 80%|███████▉  | 283/354 [25:42<06:21,  5.37s/it][A

	loss_cls: tensor(1.1494, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0849, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2343, device='cuda:0', grad_fn=<AddBackward0>)



 80%|████████  | 284/354 [25:47<06:14,  5.35s/it][A

	loss_cls: tensor(0.4508, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0983, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5492, device='cuda:0', grad_fn=<AddBackward0>)



 81%|████████  | 285/354 [25:52<06:08,  5.34s/it][A

	loss_cls: tensor(0.4928, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1719, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6646, device='cuda:0', grad_fn=<AddBackward0>)



 81%|████████  | 286/354 [25:58<06:04,  5.36s/it][A

	loss_cls: tensor(0.7647, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2255, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9902, device='cuda:0', grad_fn=<AddBackward0>)



 81%|████████  | 287/354 [26:03<05:58,  5.35s/it][A

	loss_cls: tensor(0.4538, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1165, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5703, device='cuda:0', grad_fn=<AddBackward0>)



 81%|████████▏ | 288/354 [26:09<05:54,  5.37s/it][A

	loss_cls: tensor(0.5053, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2327, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7380, device='cuda:0', grad_fn=<AddBackward0>)



 82%|████████▏ | 289/354 [26:14<05:49,  5.38s/it][A

	loss_cls: tensor(0.6676, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3590, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0266, device='cuda:0', grad_fn=<AddBackward0>)



 82%|████████▏ | 290/354 [26:19<05:45,  5.40s/it][A

	loss_cls: tensor(0.5856, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5666, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1522, device='cuda:0', grad_fn=<AddBackward0>)



 82%|████████▏ | 291/354 [26:25<05:39,  5.40s/it][A

	loss_cls: tensor(0.7624, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2412, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0036, device='cuda:0', grad_fn=<AddBackward0>)



 82%|████████▏ | 292/354 [26:30<05:34,  5.39s/it][A

	loss_cls: tensor(0.5786, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2645, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8430, device='cuda:0', grad_fn=<AddBackward0>)



 83%|████████▎ | 293/354 [26:36<05:30,  5.42s/it][A

	loss_cls: tensor(0.5451, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1290, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6741, device='cuda:0', grad_fn=<AddBackward0>)



 83%|████████▎ | 294/354 [26:41<05:23,  5.40s/it][A

	loss_cls: tensor(0.5659, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3098, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8757, device='cuda:0', grad_fn=<AddBackward0>)



 83%|████████▎ | 295/354 [26:46<05:19,  5.42s/it][A

	loss_cls: tensor(0.7362, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2323, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9685, device='cuda:0', grad_fn=<AddBackward0>)



 84%|████████▎ | 296/354 [26:52<05:13,  5.40s/it][A

	loss_cls: tensor(0.8810, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4732, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.3542, device='cuda:0', grad_fn=<AddBackward0>)



 84%|████████▍ | 297/354 [26:57<05:08,  5.42s/it][A

	loss_cls: tensor(0.5049, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0760, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5809, device='cuda:0', grad_fn=<AddBackward0>)



 84%|████████▍ | 298/354 [27:03<05:02,  5.40s/it][A

	loss_cls: tensor(0.8333, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2954, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1287, device='cuda:0', grad_fn=<AddBackward0>)



 84%|████████▍ | 299/354 [27:08<04:56,  5.39s/it][A

	loss_cls: tensor(0.5724, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3609, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9333, device='cuda:0', grad_fn=<AddBackward0>)



 85%|████████▍ | 300/354 [27:13<04:51,  5.40s/it][A

	loss_cls: tensor(0.5595, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4215, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9810, device='cuda:0', grad_fn=<AddBackward0>)



 85%|████████▌ | 301/354 [27:19<04:45,  5.38s/it][A

	loss_cls: tensor(0.6650, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2456, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9106, device='cuda:0', grad_fn=<AddBackward0>)



 85%|████████▌ | 302/354 [27:24<04:41,  5.40s/it][A

	loss_cls: tensor(0.9245, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2025, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1270, device='cuda:0', grad_fn=<AddBackward0>)



 86%|████████▌ | 303/354 [27:30<04:35,  5.40s/it][A

	loss_cls: tensor(0.7069, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1274, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8343, device='cuda:0', grad_fn=<AddBackward0>)



 86%|████████▌ | 304/354 [27:35<04:30,  5.41s/it][A

	loss_cls: tensor(0.6892, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1318, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8210, device='cuda:0', grad_fn=<AddBackward0>)



 86%|████████▌ | 305/354 [27:40<04:24,  5.40s/it][A

	loss_cls: tensor(0.4378, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2997, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7375, device='cuda:0', grad_fn=<AddBackward0>)



 86%|████████▋ | 306/354 [27:46<04:18,  5.38s/it][A

	loss_cls: tensor(0.7591, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3276, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0867, device='cuda:0', grad_fn=<AddBackward0>)



 87%|████████▋ | 307/354 [27:51<04:13,  5.39s/it][A

	loss_cls: tensor(0.7878, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0921, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8799, device='cuda:0', grad_fn=<AddBackward0>)



 87%|████████▋ | 308/354 [27:57<04:07,  5.38s/it][A

	loss_cls: tensor(0.5020, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3627, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8648, device='cuda:0', grad_fn=<AddBackward0>)



 87%|████████▋ | 309/354 [28:02<04:03,  5.41s/it][A

	loss_cls: tensor(0.6887, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1005, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7892, device='cuda:0', grad_fn=<AddBackward0>)



 88%|████████▊ | 310/354 [28:07<03:57,  5.40s/it][A

	loss_cls: tensor(0.9344, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0420, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9764, device='cuda:0', grad_fn=<AddBackward0>)



 88%|████████▊ | 311/354 [28:13<03:53,  5.42s/it][A

	loss_cls: tensor(0.5067, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1371, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6438, device='cuda:0', grad_fn=<AddBackward0>)



 88%|████████▊ | 312/354 [28:18<03:47,  5.41s/it][A

	loss_cls: tensor(0.6955, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3437, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0392, device='cuda:0', grad_fn=<AddBackward0>)



 88%|████████▊ | 313/354 [28:24<03:41,  5.40s/it][A

	loss_cls: tensor(0.4600, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4751, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9351, device='cuda:0', grad_fn=<AddBackward0>)



 89%|████████▊ | 314/354 [28:29<03:36,  5.42s/it][A

	loss_cls: tensor(0.5379, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1090, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6470, device='cuda:0', grad_fn=<AddBackward0>)



 89%|████████▉ | 315/354 [28:34<03:30,  5.41s/it][A

	loss_cls: tensor(0.6776, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1867, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8643, device='cuda:0', grad_fn=<AddBackward0>)



 89%|████████▉ | 316/354 [28:40<03:26,  5.43s/it][A

	loss_cls: tensor(0.7703, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2059, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9762, device='cuda:0', grad_fn=<AddBackward0>)



 90%|████████▉ | 317/354 [28:45<03:20,  5.41s/it][A

	loss_cls: tensor(0.6971, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.6572, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.3543, device='cuda:0', grad_fn=<AddBackward0>)



 90%|████████▉ | 318/354 [28:51<03:15,  5.43s/it][A

	loss_cls: tensor(0.4157, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1958, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6115, device='cuda:0', grad_fn=<AddBackward0>)



 90%|█████████ | 319/354 [28:56<03:09,  5.42s/it][A

	loss_cls: tensor(0.3774, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2064, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5838, device='cuda:0', grad_fn=<AddBackward0>)



 90%|█████████ | 320/354 [29:02<03:03,  5.41s/it][A

	loss_cls: tensor(0.5022, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1621, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6643, device='cuda:0', grad_fn=<AddBackward0>)



 91%|█████████ | 321/354 [29:07<02:58,  5.42s/it][A

	loss_cls: tensor(0.3780, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1625, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5405, device='cuda:0', grad_fn=<AddBackward0>)



 91%|█████████ | 322/354 [29:12<02:53,  5.41s/it][A

	loss_cls: tensor(0.6626, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1501, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8128, device='cuda:0', grad_fn=<AddBackward0>)



 91%|█████████ | 323/354 [29:18<02:48,  5.42s/it][A

	loss_cls: tensor(0.9179, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2821, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2000, device='cuda:0', grad_fn=<AddBackward0>)



 92%|█████████▏| 324/354 [29:23<02:42,  5.42s/it][A

	loss_cls: tensor(0.9361, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4387, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.3749, device='cuda:0', grad_fn=<AddBackward0>)



 92%|█████████▏| 325/354 [29:29<02:37,  5.43s/it][A

	loss_cls: tensor(0.5279, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1484, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6764, device='cuda:0', grad_fn=<AddBackward0>)



 92%|█████████▏| 326/354 [29:34<02:31,  5.42s/it][A

	loss_cls: tensor(0.2913, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0491, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.3403, device='cuda:0', grad_fn=<AddBackward0>)



 92%|█████████▏| 327/354 [29:39<02:25,  5.40s/it][A

	loss_cls: tensor(0.4170, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0563, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4733, device='cuda:0', grad_fn=<AddBackward0>)



 93%|█████████▎| 328/354 [29:45<02:21,  5.43s/it][A

	loss_cls: tensor(0.6498, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1003, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7501, device='cuda:0', grad_fn=<AddBackward0>)



 93%|█████████▎| 329/354 [29:50<02:15,  5.42s/it][A

	loss_cls: tensor(0.3566, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1350, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4916, device='cuda:0', grad_fn=<AddBackward0>)



 93%|█████████▎| 330/354 [29:56<02:10,  5.44s/it][A

	loss_cls: tensor(0.7113, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1375, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8488, device='cuda:0', grad_fn=<AddBackward0>)



 94%|█████████▎| 331/354 [30:01<02:04,  5.42s/it][A

	loss_cls: tensor(0.6427, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4576, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1003, device='cuda:0', grad_fn=<AddBackward0>)



 94%|█████████▍| 332/354 [30:07<01:59,  5.44s/it][A

	loss_cls: tensor(0.5971, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1483, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7454, device='cuda:0', grad_fn=<AddBackward0>)



 94%|█████████▍| 333/354 [30:12<01:54,  5.44s/it][A

	loss_cls: tensor(0.5455, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1933, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7389, device='cuda:0', grad_fn=<AddBackward0>)



 94%|█████████▍| 334/354 [30:17<01:48,  5.42s/it][A

	loss_cls: tensor(0.3392, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2001, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5393, device='cuda:0', grad_fn=<AddBackward0>)



 95%|█████████▍| 335/354 [30:23<01:43,  5.44s/it][A

	loss_cls: tensor(0.7553, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1114, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8667, device='cuda:0', grad_fn=<AddBackward0>)



 95%|█████████▍| 336/354 [30:28<01:37,  5.43s/it][A

	loss_cls: tensor(1.0543, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3669, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.4212, device='cuda:0', grad_fn=<AddBackward0>)



 95%|█████████▌| 337/354 [30:34<01:32,  5.45s/it][A

	loss_cls: tensor(0.7352, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0900, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8252, device='cuda:0', grad_fn=<AddBackward0>)



 95%|█████████▌| 338/354 [30:39<01:26,  5.43s/it][A

	loss_cls: tensor(0.6237, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4109, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0346, device='cuda:0', grad_fn=<AddBackward0>)



 96%|█████████▌| 339/354 [30:45<01:21,  5.44s/it][A

	loss_cls: tensor(0.6011, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2835, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8846, device='cuda:0', grad_fn=<AddBackward0>)



 96%|█████████▌| 340/354 [30:50<01:15,  5.43s/it][A

	loss_cls: tensor(0.6961, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1785, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8745, device='cuda:0', grad_fn=<AddBackward0>)



 96%|█████████▋| 341/354 [30:55<01:10,  5.41s/it][A

	loss_cls: tensor(0.6537, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1630, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8167, device='cuda:0', grad_fn=<AddBackward0>)



 97%|█████████▋| 342/354 [31:01<01:05,  5.43s/it][A

	loss_cls: tensor(0.5175, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3221, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8396, device='cuda:0', grad_fn=<AddBackward0>)



 97%|█████████▋| 343/354 [31:06<00:59,  5.42s/it][A

	loss_cls: tensor(0.6690, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4749, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1439, device='cuda:0', grad_fn=<AddBackward0>)



 97%|█████████▋| 344/354 [31:12<00:54,  5.43s/it][A

	loss_cls: tensor(0.4127, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4443, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8571, device='cuda:0', grad_fn=<AddBackward0>)



 97%|█████████▋| 345/354 [31:17<00:48,  5.41s/it][A

	loss_cls: tensor(0.7311, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2300, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9611, device='cuda:0', grad_fn=<AddBackward0>)



 98%|█████████▊| 346/354 [31:23<00:43,  5.42s/it][A

	loss_cls: tensor(0.5344, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3702, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9046, device='cuda:0', grad_fn=<AddBackward0>)



 98%|█████████▊| 347/354 [31:28<00:37,  5.40s/it][A

	loss_cls: tensor(0.5394, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2336, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7730, device='cuda:0', grad_fn=<AddBackward0>)



 98%|█████████▊| 348/354 [31:33<00:32,  5.42s/it][A

	loss_cls: tensor(0.5519, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3239, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8757, device='cuda:0', grad_fn=<AddBackward0>)



 99%|█████████▊| 349/354 [31:39<00:27,  5.41s/it][A

	loss_cls: tensor(0.5015, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4095, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9109, device='cuda:0', grad_fn=<AddBackward0>)



 99%|█████████▉| 350/354 [31:44<00:21,  5.41s/it][A

	loss_cls: tensor(0.3989, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4146, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8135, device='cuda:0', grad_fn=<AddBackward0>)



 99%|█████████▉| 351/354 [31:50<00:16,  5.42s/it][A

	loss_cls: tensor(0.6049, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2042, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8091, device='cuda:0', grad_fn=<AddBackward0>)



 99%|█████████▉| 352/354 [31:55<00:10,  5.41s/it][A

	loss_cls: tensor(0.8155, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1418, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9572, device='cuda:0', grad_fn=<AddBackward0>)



100%|█████████▉| 353/354 [32:00<00:05,  5.41s/it][A

	loss_cls: tensor(0.8610, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1903, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0513, device='cuda:0', grad_fn=<AddBackward0>)



100%|██████████| 354/354 [32:02<00:00,  5.43s/it][A

  0%|          | 0/177 [00:00<?, ?it/s][A

	loss_cls: tensor(0.5919, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3383, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9302, device='cuda:0', grad_fn=<AddBackward0>)

	Training Loss: 0.8353839999033232

	Training cls acc: 0.6972104519774012

	Training cls prec: 0.5729179503014249

	Training cls rec: 0.5989213470145673

	Training cls f1: 0.5292429822394373

--
	Training ner acc: 0.9554887470692321

	Training ner prec: 0.2706492270446447

	Training ner rec: 0.2787828940540228

	Training ner f1: 0.2742869180345584

	Current Learning rate:  0.0007428571428571429



  1%|          | 1/177 [00:00<02:09,  1.36it/s][A
  1%|          | 2/177 [00:01<02:06,  1.39it/s][A
  2%|▏         | 3/177 [00:02<02:03,  1.41it/s][A
  2%|▏         | 4/177 [00:02<02:04,  1.39it/s][A
  3%|▎         | 5/177 [00:03<02:03,  1.39it/s][A
  3%|▎         | 6/177 [00:04<01:59,  1.44it/s][A
  4%|▍         | 7/177 [00:04<01:59,  1.42it/s][A
  5%|▍         | 8/177 [00:05<02:00,  1.40it/s][A
  5%|▌         | 9/177 [00:06<02:00,  1.40it/s][A
  6%|▌         | 10/177 [00:07<01:56,  1.44it/s][A
  6%|▌         | 11/177 [00:07<01:57,  1.42it/s][A
  7%|▋         | 12/177 [00:08<01:57,  1.41it/s][A
  7%|▋         | 13/177 [00:09<01:54,  1.44it/s][A
  8%|▊         | 14/177 [00:09<01:54,  1.42it/s][A
  8%|▊         | 15/177 [00:10<01:55,  1.41it/s][A
  9%|▉         | 16/177 [00:11<01:54,  1.40it/s][A
 10%|▉         | 17/177 [00:11<01:50,  1.45it/s][A
 10%|█         | 18/177 [00:12<01:51,  1.42it/s][A
 11%|█         | 19/177 [00:13<01:51,  1.41it/s][A
 11%|█▏        | 20/


	Validation Loss: 0.9912236380038288

	Validation cls acc: 0.4103107344632768

	Validation cls prec: 0.5471045197740113

	Validation cls rec: 0.46404358353510894

	Validation cls f1: 0.3562123375682698

--
	Validation ner acc: 0.9550533639124246

	Validation ner prec: 0.4258143828527232

	Validation ner rec: 0.43596986817325806

	Validation ner f1: 0.4306745596630856



  0%|          | 1/354 [00:05<31:31,  5.36s/it][A

	loss_cls: tensor(1.1161, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0363, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1524, device='cuda:0', grad_fn=<AddBackward0>)



  1%|          | 2/354 [00:10<31:28,  5.36s/it][A

	loss_cls: tensor(0.6544, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2066, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8610, device='cuda:0', grad_fn=<AddBackward0>)



  1%|          | 3/354 [00:16<31:36,  5.40s/it][A

	loss_cls: tensor(0.5953, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2950, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8903, device='cuda:0', grad_fn=<AddBackward0>)



  1%|          | 4/354 [00:21<31:28,  5.39s/it][A

	loss_cls: tensor(0.8325, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0267, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8593, device='cuda:0', grad_fn=<AddBackward0>)



  1%|▏         | 5/354 [00:27<31:31,  5.42s/it][A

	loss_cls: tensor(0.4811, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2711, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7522, device='cuda:0', grad_fn=<AddBackward0>)



  2%|▏         | 6/354 [00:32<31:19,  5.40s/it][A

	loss_cls: tensor(0.3749, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2532, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6281, device='cuda:0', grad_fn=<AddBackward0>)



  2%|▏         | 7/354 [00:37<31:19,  5.42s/it][A

	loss_cls: tensor(0.5535, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4459, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9994, device='cuda:0', grad_fn=<AddBackward0>)



  2%|▏         | 8/354 [00:43<31:05,  5.39s/it][A

	loss_cls: tensor(0.9440, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3548, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2988, device='cuda:0', grad_fn=<AddBackward0>)



  3%|▎         | 9/354 [00:48<30:54,  5.38s/it][A

	loss_cls: tensor(0.5220, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2430, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7650, device='cuda:0', grad_fn=<AddBackward0>)



  3%|▎         | 10/354 [00:53<30:54,  5.39s/it][A

	loss_cls: tensor(0.4429, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0989, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5418, device='cuda:0', grad_fn=<AddBackward0>)



  3%|▎         | 11/354 [00:59<30:45,  5.38s/it][A

	loss_cls: tensor(0.4248, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2945, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7193, device='cuda:0', grad_fn=<AddBackward0>)



  3%|▎         | 12/354 [01:04<30:43,  5.39s/it][A

	loss_cls: tensor(0.5190, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1425, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6615, device='cuda:0', grad_fn=<AddBackward0>)



  4%|▎         | 13/354 [01:10<30:34,  5.38s/it][A

	loss_cls: tensor(0.5543, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2470, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8013, device='cuda:0', grad_fn=<AddBackward0>)



  4%|▍         | 14/354 [01:15<30:30,  5.38s/it][A

	loss_cls: tensor(1.0234, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3744, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.3979, device='cuda:0', grad_fn=<AddBackward0>)



  4%|▍         | 15/354 [01:20<30:20,  5.37s/it][A

	loss_cls: tensor(0.8465, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2705, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1170, device='cuda:0', grad_fn=<AddBackward0>)



  5%|▍         | 16/354 [01:26<30:15,  5.37s/it][A

	loss_cls: tensor(0.6408, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3379, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9787, device='cuda:0', grad_fn=<AddBackward0>)



  5%|▍         | 17/354 [01:31<30:27,  5.42s/it][A

	loss_cls: tensor(0.4403, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2489, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6892, device='cuda:0', grad_fn=<AddBackward0>)



  5%|▌         | 18/354 [01:37<30:27,  5.44s/it][A

	loss_cls: tensor(0.4143, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0845, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4988, device='cuda:0', grad_fn=<AddBackward0>)



  5%|▌         | 19/354 [01:42<29:41,  5.32s/it][A

	loss_cls: tensor(0.5250, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1739, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6990, device='cuda:0', grad_fn=<AddBackward0>)



  6%|▌         | 20/354 [01:47<29:06,  5.23s/it][A

	loss_cls: tensor(0.3601, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1658, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5259, device='cuda:0', grad_fn=<AddBackward0>)



  6%|▌         | 21/354 [01:52<28:45,  5.18s/it][A

	loss_cls: tensor(0.6750, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2583, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9332, device='cuda:0', grad_fn=<AddBackward0>)



  6%|▌         | 22/354 [01:57<28:27,  5.14s/it][A

	loss_cls: tensor(0.3627, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2179, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5806, device='cuda:0', grad_fn=<AddBackward0>)



  6%|▋         | 23/354 [02:02<28:43,  5.21s/it][A

	loss_cls: tensor(0.5422, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2343, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7765, device='cuda:0', grad_fn=<AddBackward0>)



  7%|▋         | 24/354 [02:08<29:06,  5.29s/it][A

	loss_cls: tensor(0.5401, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1859, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7260, device='cuda:0', grad_fn=<AddBackward0>)



  7%|▋         | 25/354 [02:13<29:12,  5.33s/it][A

	loss_cls: tensor(0.6840, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3131, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9972, device='cuda:0', grad_fn=<AddBackward0>)



  7%|▋         | 26/354 [02:19<29:23,  5.38s/it][A

	loss_cls: tensor(0.7703, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1742, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9446, device='cuda:0', grad_fn=<AddBackward0>)



  8%|▊         | 27/354 [02:24<29:20,  5.38s/it][A

	loss_cls: tensor(0.4971, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1709, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6681, device='cuda:0', grad_fn=<AddBackward0>)



  8%|▊         | 28/354 [02:30<29:27,  5.42s/it][A

	loss_cls: tensor(0.7412, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0600, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8012, device='cuda:0', grad_fn=<AddBackward0>)



  8%|▊         | 29/354 [02:35<29:18,  5.41s/it][A

	loss_cls: tensor(0.6939, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1754, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8693, device='cuda:0', grad_fn=<AddBackward0>)



  8%|▊         | 30/354 [02:40<29:08,  5.40s/it][A

	loss_cls: tensor(0.4856, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0418, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5275, device='cuda:0', grad_fn=<AddBackward0>)



  9%|▉         | 31/354 [02:46<29:09,  5.42s/it][A

	loss_cls: tensor(0.6262, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0247, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6509, device='cuda:0', grad_fn=<AddBackward0>)



  9%|▉         | 32/354 [02:51<29:01,  5.41s/it][A

	loss_cls: tensor(0.6560, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1938, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8497, device='cuda:0', grad_fn=<AddBackward0>)



  9%|▉         | 33/354 [02:57<29:01,  5.43s/it][A

	loss_cls: tensor(0.3880, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1501, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5381, device='cuda:0', grad_fn=<AddBackward0>)



 10%|▉         | 34/354 [03:02<28:55,  5.42s/it][A

	loss_cls: tensor(0.6330, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1863, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8194, device='cuda:0', grad_fn=<AddBackward0>)



 10%|▉         | 35/354 [03:07<28:53,  5.44s/it][A

	loss_cls: tensor(0.7556, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3110, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0665, device='cuda:0', grad_fn=<AddBackward0>)



 10%|█         | 36/354 [03:13<28:41,  5.41s/it][A

	loss_cls: tensor(0.6300, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3334, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9634, device='cuda:0', grad_fn=<AddBackward0>)



 10%|█         | 37/354 [03:18<28:40,  5.43s/it][A

	loss_cls: tensor(0.9264, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5991, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.5255, device='cuda:0', grad_fn=<AddBackward0>)



 11%|█         | 38/354 [03:24<28:31,  5.42s/it][A

	loss_cls: tensor(0.2965, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0481, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.3446, device='cuda:0', grad_fn=<AddBackward0>)



 11%|█         | 39/354 [03:29<28:24,  5.41s/it][A

	loss_cls: tensor(0.3892, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4334, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8226, device='cuda:0', grad_fn=<AddBackward0>)



 11%|█▏        | 40/354 [03:35<28:24,  5.43s/it][A

	loss_cls: tensor(0.4215, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2035, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6251, device='cuda:0', grad_fn=<AddBackward0>)



 12%|█▏        | 41/354 [03:40<28:13,  5.41s/it][A

	loss_cls: tensor(1.0200, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3654, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.3854, device='cuda:0', grad_fn=<AddBackward0>)



 12%|█▏        | 42/354 [03:45<28:10,  5.42s/it][A

	loss_cls: tensor(0.6761, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1897, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8658, device='cuda:0', grad_fn=<AddBackward0>)



 12%|█▏        | 43/354 [03:51<27:59,  5.40s/it][A

	loss_cls: tensor(0.7803, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3081, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0884, device='cuda:0', grad_fn=<AddBackward0>)



 12%|█▏        | 44/354 [03:56<27:59,  5.42s/it][A

	loss_cls: tensor(0.7191, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1140, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8331, device='cuda:0', grad_fn=<AddBackward0>)



 13%|█▎        | 45/354 [04:02<27:51,  5.41s/it][A

	loss_cls: tensor(0.6709, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2420, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9128, device='cuda:0', grad_fn=<AddBackward0>)



 13%|█▎        | 46/354 [04:07<27:41,  5.39s/it][A

	loss_cls: tensor(0.4006, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2816, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6822, device='cuda:0', grad_fn=<AddBackward0>)



 13%|█▎        | 47/354 [04:12<27:40,  5.41s/it][A

	loss_cls: tensor(0.6259, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1415, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7675, device='cuda:0', grad_fn=<AddBackward0>)



 14%|█▎        | 48/354 [04:18<27:29,  5.39s/it][A

	loss_cls: tensor(0.6573, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1885, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8457, device='cuda:0', grad_fn=<AddBackward0>)



 14%|█▍        | 49/354 [04:23<27:28,  5.41s/it][A

	loss_cls: tensor(0.6138, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0391, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6529, device='cuda:0', grad_fn=<AddBackward0>)



 14%|█▍        | 50/354 [04:29<27:17,  5.39s/it][A

	loss_cls: tensor(0.5072, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0276, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5348, device='cuda:0', grad_fn=<AddBackward0>)



 14%|█▍        | 51/354 [04:34<27:17,  5.41s/it][A

	loss_cls: tensor(0.6650, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3956, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0606, device='cuda:0', grad_fn=<AddBackward0>)



 15%|█▍        | 52/354 [04:39<27:12,  5.41s/it][A

	loss_cls: tensor(0.8916, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4326, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.3243, device='cuda:0', grad_fn=<AddBackward0>)



 15%|█▍        | 53/354 [04:45<27:03,  5.39s/it][A

	loss_cls: tensor(0.4769, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3884, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8652, device='cuda:0', grad_fn=<AddBackward0>)



 15%|█▌        | 54/354 [04:50<27:03,  5.41s/it][A

	loss_cls: tensor(0.5285, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0152, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5437, device='cuda:0', grad_fn=<AddBackward0>)



 16%|█▌        | 55/354 [04:56<26:53,  5.40s/it][A

	loss_cls: tensor(0.6341, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4531, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0872, device='cuda:0', grad_fn=<AddBackward0>)



 16%|█▌        | 56/354 [05:01<26:53,  5.41s/it][A

	loss_cls: tensor(0.5526, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2920, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8446, device='cuda:0', grad_fn=<AddBackward0>)



 16%|█▌        | 57/354 [05:06<26:42,  5.40s/it][A

	loss_cls: tensor(0.6780, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2041, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8822, device='cuda:0', grad_fn=<AddBackward0>)



 16%|█▋        | 58/354 [05:12<26:44,  5.42s/it][A

	loss_cls: tensor(0.5861, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1820, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7681, device='cuda:0', grad_fn=<AddBackward0>)



 17%|█▋        | 59/354 [05:17<26:34,  5.40s/it][A

	loss_cls: tensor(0.6056, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3156, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9212, device='cuda:0', grad_fn=<AddBackward0>)



 17%|█▋        | 60/354 [05:23<26:27,  5.40s/it][A

	loss_cls: tensor(0.6958, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2040, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8998, device='cuda:0', grad_fn=<AddBackward0>)



 17%|█▋        | 61/354 [05:28<26:26,  5.41s/it][A

	loss_cls: tensor(0.5245, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2384, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7629, device='cuda:0', grad_fn=<AddBackward0>)



 18%|█▊        | 62/354 [05:33<26:16,  5.40s/it][A

	loss_cls: tensor(0.6207, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2552, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8760, device='cuda:0', grad_fn=<AddBackward0>)



 18%|█▊        | 63/354 [05:39<26:14,  5.41s/it][A

	loss_cls: tensor(0.5314, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1545, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6859, device='cuda:0', grad_fn=<AddBackward0>)



 18%|█▊        | 64/354 [05:44<26:08,  5.41s/it][A

	loss_cls: tensor(0.9149, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3615, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2764, device='cuda:0', grad_fn=<AddBackward0>)



 18%|█▊        | 65/354 [05:50<26:07,  5.43s/it][A

	loss_cls: tensor(0.7620, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3040, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0660, device='cuda:0', grad_fn=<AddBackward0>)



 19%|█▊        | 66/354 [05:55<25:59,  5.42s/it][A

	loss_cls: tensor(0.6272, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2305, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8577, device='cuda:0', grad_fn=<AddBackward0>)



 19%|█▉        | 67/354 [06:01<26:03,  5.45s/it][A

	loss_cls: tensor(0.6886, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1179, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8065, device='cuda:0', grad_fn=<AddBackward0>)



 19%|█▉        | 68/354 [06:06<26:02,  5.46s/it][A

	loss_cls: tensor(0.6602, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4745, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1347, device='cuda:0', grad_fn=<AddBackward0>)



 19%|█▉        | 69/354 [06:12<25:50,  5.44s/it][A

	loss_cls: tensor(0.6054, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1906, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7960, device='cuda:0', grad_fn=<AddBackward0>)



 20%|█▉        | 70/354 [06:17<25:45,  5.44s/it][A

	loss_cls: tensor(0.7869, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1097, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8966, device='cuda:0', grad_fn=<AddBackward0>)



 20%|██        | 71/354 [06:22<25:35,  5.43s/it][A

	loss_cls: tensor(0.4210, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2680, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6890, device='cuda:0', grad_fn=<AddBackward0>)



 20%|██        | 72/354 [06:28<25:33,  5.44s/it][A

	loss_cls: tensor(0.6203, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1195, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7398, device='cuda:0', grad_fn=<AddBackward0>)



 21%|██        | 73/354 [06:33<25:24,  5.43s/it][A

	loss_cls: tensor(0.6489, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2970, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9459, device='cuda:0', grad_fn=<AddBackward0>)



 21%|██        | 74/354 [06:39<25:14,  5.41s/it][A

	loss_cls: tensor(0.4957, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2631, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7588, device='cuda:0', grad_fn=<AddBackward0>)



 21%|██        | 75/354 [06:44<25:17,  5.44s/it][A

	loss_cls: tensor(0.5506, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2887, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8393, device='cuda:0', grad_fn=<AddBackward0>)



 21%|██▏       | 76/354 [06:49<25:09,  5.43s/it][A

	loss_cls: tensor(0.8467, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2775, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1242, device='cuda:0', grad_fn=<AddBackward0>)



 22%|██▏       | 77/354 [06:55<25:10,  5.45s/it][A

	loss_cls: tensor(0.4436, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3364, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7801, device='cuda:0', grad_fn=<AddBackward0>)



 22%|██▏       | 78/354 [07:00<25:00,  5.44s/it][A

	loss_cls: tensor(0.5371, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.6153, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1524, device='cuda:0', grad_fn=<AddBackward0>)



 22%|██▏       | 79/354 [07:06<24:58,  5.45s/it][A

	loss_cls: tensor(0.4756, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2321, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7077, device='cuda:0', grad_fn=<AddBackward0>)



 23%|██▎       | 80/354 [07:11<24:48,  5.43s/it][A

	loss_cls: tensor(0.6440, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0877, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7317, device='cuda:0', grad_fn=<AddBackward0>)



 23%|██▎       | 81/354 [07:17<24:39,  5.42s/it][A

	loss_cls: tensor(0.5033, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3470, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8503, device='cuda:0', grad_fn=<AddBackward0>)



 23%|██▎       | 82/354 [07:22<24:41,  5.45s/it][A

	loss_cls: tensor(0.4811, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4004, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8815, device='cuda:0', grad_fn=<AddBackward0>)



 23%|██▎       | 83/354 [07:28<24:31,  5.43s/it][A

	loss_cls: tensor(0.6192, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1511, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7704, device='cuda:0', grad_fn=<AddBackward0>)



 24%|██▎       | 84/354 [07:33<24:29,  5.44s/it][A

	loss_cls: tensor(0.4588, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1450, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6039, device='cuda:0', grad_fn=<AddBackward0>)



 24%|██▍       | 85/354 [07:38<24:21,  5.43s/it][A

	loss_cls: tensor(0.7054, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2980, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0034, device='cuda:0', grad_fn=<AddBackward0>)



 24%|██▍       | 86/354 [07:44<24:19,  5.45s/it][A

	loss_cls: tensor(0.5705, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4512, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0217, device='cuda:0', grad_fn=<AddBackward0>)



 25%|██▍       | 87/354 [07:49<24:12,  5.44s/it][A

	loss_cls: tensor(0.5124, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1650, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6773, device='cuda:0', grad_fn=<AddBackward0>)



 25%|██▍       | 88/354 [07:55<24:04,  5.43s/it][A

	loss_cls: tensor(0.4597, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2283, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6880, device='cuda:0', grad_fn=<AddBackward0>)



 25%|██▌       | 89/354 [08:00<24:03,  5.45s/it][A

	loss_cls: tensor(0.6648, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3224, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9872, device='cuda:0', grad_fn=<AddBackward0>)



 25%|██▌       | 90/354 [08:06<23:50,  5.42s/it][A

	loss_cls: tensor(0.5222, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0548, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5771, device='cuda:0', grad_fn=<AddBackward0>)



 26%|██▌       | 91/354 [08:11<23:52,  5.45s/it][A

	loss_cls: tensor(0.4795, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1903, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6698, device='cuda:0', grad_fn=<AddBackward0>)



 26%|██▌       | 92/354 [08:17<23:43,  5.43s/it][A

	loss_cls: tensor(0.8552, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1221, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9773, device='cuda:0', grad_fn=<AddBackward0>)



 26%|██▋       | 93/354 [08:22<23:43,  5.45s/it][A

	loss_cls: tensor(0.6041, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1706, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7747, device='cuda:0', grad_fn=<AddBackward0>)



 27%|██▋       | 94/354 [08:27<23:33,  5.44s/it][A

	loss_cls: tensor(0.6130, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1065, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7195, device='cuda:0', grad_fn=<AddBackward0>)



 27%|██▋       | 95/354 [08:33<23:26,  5.43s/it][A

	loss_cls: tensor(0.3776, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1845, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5620, device='cuda:0', grad_fn=<AddBackward0>)



 27%|██▋       | 96/354 [08:38<23:25,  5.45s/it][A

	loss_cls: tensor(0.5280, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5224, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0504, device='cuda:0', grad_fn=<AddBackward0>)



 27%|██▋       | 97/354 [08:44<23:16,  5.43s/it][A

	loss_cls: tensor(0.5831, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0782, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6613, device='cuda:0', grad_fn=<AddBackward0>)



 28%|██▊       | 98/354 [08:49<23:17,  5.46s/it][A

	loss_cls: tensor(0.3926, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0411, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4337, device='cuda:0', grad_fn=<AddBackward0>)



 28%|██▊       | 99/354 [08:55<23:09,  5.45s/it][A

	loss_cls: tensor(0.5637, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2496, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8132, device='cuda:0', grad_fn=<AddBackward0>)



 28%|██▊       | 100/354 [09:00<23:07,  5.46s/it][A

	loss_cls: tensor(0.4309, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4172, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8480, device='cuda:0', grad_fn=<AddBackward0>)



 29%|██▊       | 101/354 [09:06<22:57,  5.45s/it][A

	loss_cls: tensor(0.5885, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1098, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6983, device='cuda:0', grad_fn=<AddBackward0>)



 29%|██▉       | 102/354 [09:11<22:46,  5.42s/it][A

	loss_cls: tensor(1.0545, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4057, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.4601, device='cuda:0', grad_fn=<AddBackward0>)



 29%|██▉       | 103/354 [09:16<22:47,  5.45s/it][A

	loss_cls: tensor(0.4168, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1237, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5404, device='cuda:0', grad_fn=<AddBackward0>)



 29%|██▉       | 104/354 [09:22<22:39,  5.44s/it][A

	loss_cls: tensor(0.8172, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3341, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1513, device='cuda:0', grad_fn=<AddBackward0>)



 30%|██▉       | 105/354 [09:27<22:41,  5.47s/it][A

	loss_cls: tensor(1.0317, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2831, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.3149, device='cuda:0', grad_fn=<AddBackward0>)



 30%|██▉       | 106/354 [09:33<22:32,  5.45s/it][A

	loss_cls: tensor(0.6712, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0701, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7413, device='cuda:0', grad_fn=<AddBackward0>)



 30%|███       | 107/354 [09:38<22:29,  5.46s/it][A

	loss_cls: tensor(0.6033, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5121, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1154, device='cuda:0', grad_fn=<AddBackward0>)



 31%|███       | 108/354 [09:44<22:19,  5.45s/it][A

	loss_cls: tensor(0.6236, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1918, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8153, device='cuda:0', grad_fn=<AddBackward0>)



 31%|███       | 109/354 [09:49<22:11,  5.43s/it][A

	loss_cls: tensor(0.5268, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1656, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6924, device='cuda:0', grad_fn=<AddBackward0>)



 31%|███       | 110/354 [09:55<22:12,  5.46s/it][A

	loss_cls: tensor(0.6899, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3231, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0130, device='cuda:0', grad_fn=<AddBackward0>)



 31%|███▏      | 111/354 [10:00<22:03,  5.45s/it][A

	loss_cls: tensor(0.6221, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2134, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8355, device='cuda:0', grad_fn=<AddBackward0>)



 32%|███▏      | 112/354 [10:05<21:59,  5.45s/it][A

	loss_cls: tensor(1.0403, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2222, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2625, device='cuda:0', grad_fn=<AddBackward0>)



 32%|███▏      | 113/354 [10:11<21:49,  5.44s/it][A

	loss_cls: tensor(0.5157, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1458, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6616, device='cuda:0', grad_fn=<AddBackward0>)



 32%|███▏      | 114/354 [10:16<21:48,  5.45s/it][A

	loss_cls: tensor(0.5211, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4074, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9285, device='cuda:0', grad_fn=<AddBackward0>)



 32%|███▏      | 115/354 [10:22<21:38,  5.43s/it][A

	loss_cls: tensor(0.6566, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1684, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8250, device='cuda:0', grad_fn=<AddBackward0>)



 33%|███▎      | 116/354 [10:27<21:32,  5.43s/it][A

	loss_cls: tensor(0.9823, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4741, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.4564, device='cuda:0', grad_fn=<AddBackward0>)



 33%|███▎      | 117/354 [10:33<21:30,  5.44s/it][A

	loss_cls: tensor(0.7054, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0697, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7751, device='cuda:0', grad_fn=<AddBackward0>)



 33%|███▎      | 118/354 [10:38<21:22,  5.44s/it][A

	loss_cls: tensor(0.9276, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1973, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1249, device='cuda:0', grad_fn=<AddBackward0>)



 34%|███▎      | 119/354 [10:44<21:20,  5.45s/it][A

	loss_cls: tensor(0.7331, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0663, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7993, device='cuda:0', grad_fn=<AddBackward0>)



 34%|███▍      | 120/354 [10:49<21:11,  5.43s/it][A

	loss_cls: tensor(0.6531, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5077, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1608, device='cuda:0', grad_fn=<AddBackward0>)



 34%|███▍      | 121/354 [10:54<21:09,  5.45s/it][A

	loss_cls: tensor(0.5377, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3459, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8836, device='cuda:0', grad_fn=<AddBackward0>)



 34%|███▍      | 122/354 [11:00<20:59,  5.43s/it][A

	loss_cls: tensor(0.6299, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0784, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7082, device='cuda:0', grad_fn=<AddBackward0>)



 35%|███▍      | 123/354 [11:05<20:53,  5.42s/it][A

	loss_cls: tensor(0.4947, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3926, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8873, device='cuda:0', grad_fn=<AddBackward0>)



 35%|███▌      | 124/354 [11:11<20:55,  5.46s/it][A

	loss_cls: tensor(0.5018, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4896, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9914, device='cuda:0', grad_fn=<AddBackward0>)



 35%|███▌      | 125/354 [11:16<20:46,  5.44s/it][A

	loss_cls: tensor(0.9122, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1278, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0400, device='cuda:0', grad_fn=<AddBackward0>)



 36%|███▌      | 126/354 [11:22<20:45,  5.46s/it][A

	loss_cls: tensor(0.6500, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3202, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9702, device='cuda:0', grad_fn=<AddBackward0>)



 36%|███▌      | 127/354 [11:27<20:34,  5.44s/it][A

	loss_cls: tensor(0.8199, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1282, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9481, device='cuda:0', grad_fn=<AddBackward0>)



 36%|███▌      | 128/354 [11:33<20:31,  5.45s/it][A

	loss_cls: tensor(0.6324, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1730, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8054, device='cuda:0', grad_fn=<AddBackward0>)



 36%|███▋      | 129/354 [11:38<20:20,  5.42s/it][A

	loss_cls: tensor(0.5267, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3923, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9190, device='cuda:0', grad_fn=<AddBackward0>)



 37%|███▋      | 130/354 [11:43<20:10,  5.40s/it][A

	loss_cls: tensor(0.5624, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2889, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8513, device='cuda:0', grad_fn=<AddBackward0>)



 37%|███▋      | 131/354 [11:49<20:09,  5.43s/it][A

	loss_cls: tensor(0.6153, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1444, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7596, device='cuda:0', grad_fn=<AddBackward0>)



 37%|███▋      | 132/354 [11:54<20:02,  5.42s/it][A

	loss_cls: tensor(0.6932, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1964, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8896, device='cuda:0', grad_fn=<AddBackward0>)



 38%|███▊      | 133/354 [12:00<20:00,  5.43s/it][A

	loss_cls: tensor(0.6450, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2578, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9028, device='cuda:0', grad_fn=<AddBackward0>)



 38%|███▊      | 134/354 [12:05<19:50,  5.41s/it][A

	loss_cls: tensor(0.5900, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2118, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8018, device='cuda:0', grad_fn=<AddBackward0>)



 38%|███▊      | 135/354 [12:10<19:50,  5.44s/it][A

	loss_cls: tensor(0.7360, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0540, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7900, device='cuda:0', grad_fn=<AddBackward0>)



 38%|███▊      | 136/354 [12:16<19:39,  5.41s/it][A

	loss_cls: tensor(0.7101, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2587, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9689, device='cuda:0', grad_fn=<AddBackward0>)



 39%|███▊      | 137/354 [12:21<19:37,  5.42s/it][A

	loss_cls: tensor(0.5930, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2324, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8254, device='cuda:0', grad_fn=<AddBackward0>)



 39%|███▉      | 138/354 [12:27<19:29,  5.41s/it][A

	loss_cls: tensor(0.5934, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0787, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6721, device='cuda:0', grad_fn=<AddBackward0>)



 39%|███▉      | 139/354 [12:32<19:20,  5.40s/it][A

	loss_cls: tensor(0.5180, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1655, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6835, device='cuda:0', grad_fn=<AddBackward0>)



 40%|███▉      | 140/354 [12:37<19:18,  5.41s/it][A

	loss_cls: tensor(0.8686, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2694, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1380, device='cuda:0', grad_fn=<AddBackward0>)



 40%|███▉      | 141/354 [12:43<19:09,  5.40s/it][A

	loss_cls: tensor(0.8044, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5544, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.3589, device='cuda:0', grad_fn=<AddBackward0>)



 40%|████      | 142/354 [12:48<19:07,  5.41s/it][A

	loss_cls: tensor(1.0655, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1485, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2140, device='cuda:0', grad_fn=<AddBackward0>)



 40%|████      | 143/354 [12:54<18:59,  5.40s/it][A

	loss_cls: tensor(0.5980, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4820, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0800, device='cuda:0', grad_fn=<AddBackward0>)



 41%|████      | 144/354 [12:59<18:54,  5.40s/it][A

	loss_cls: tensor(0.4769, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3061, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7830, device='cuda:0', grad_fn=<AddBackward0>)



 41%|████      | 145/354 [13:04<18:47,  5.39s/it][A

	loss_cls: tensor(0.5374, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1697, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7071, device='cuda:0', grad_fn=<AddBackward0>)



 41%|████      | 146/354 [13:10<18:39,  5.38s/it][A

	loss_cls: tensor(0.6989, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2537, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9525, device='cuda:0', grad_fn=<AddBackward0>)



 42%|████▏     | 147/354 [13:15<18:37,  5.40s/it][A

	loss_cls: tensor(0.6280, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2278, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8558, device='cuda:0', grad_fn=<AddBackward0>)



 42%|████▏     | 148/354 [13:21<18:29,  5.39s/it][A

	loss_cls: tensor(0.5044, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2396, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7439, device='cuda:0', grad_fn=<AddBackward0>)



 42%|████▏     | 149/354 [13:26<18:27,  5.40s/it][A

	loss_cls: tensor(0.5178, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3061, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8239, device='cuda:0', grad_fn=<AddBackward0>)



 42%|████▏     | 150/354 [13:31<18:18,  5.39s/it][A

	loss_cls: tensor(0.5674, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3259, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8933, device='cuda:0', grad_fn=<AddBackward0>)



 43%|████▎     | 151/354 [13:37<18:16,  5.40s/it][A

	loss_cls: tensor(0.4440, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3220, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7660, device='cuda:0', grad_fn=<AddBackward0>)



 43%|████▎     | 152/354 [13:42<18:09,  5.39s/it][A

	loss_cls: tensor(0.5266, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2147, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7413, device='cuda:0', grad_fn=<AddBackward0>)



 43%|████▎     | 153/354 [13:48<18:01,  5.38s/it][A

	loss_cls: tensor(0.6102, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1259, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7360, device='cuda:0', grad_fn=<AddBackward0>)



 44%|████▎     | 154/354 [13:53<18:01,  5.41s/it][A

	loss_cls: tensor(0.4165, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2209, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6374, device='cuda:0', grad_fn=<AddBackward0>)



 44%|████▍     | 155/354 [13:58<17:52,  5.39s/it][A

	loss_cls: tensor(0.4122, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1089, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5210, device='cuda:0', grad_fn=<AddBackward0>)



 44%|████▍     | 156/354 [14:04<17:50,  5.41s/it][A

	loss_cls: tensor(0.5709, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1842, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7551, device='cuda:0', grad_fn=<AddBackward0>)



 44%|████▍     | 157/354 [14:09<17:42,  5.39s/it][A

	loss_cls: tensor(0.5561, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4372, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9932, device='cuda:0', grad_fn=<AddBackward0>)



 45%|████▍     | 158/354 [14:15<17:41,  5.42s/it][A

	loss_cls: tensor(0.5925, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4896, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0821, device='cuda:0', grad_fn=<AddBackward0>)



 45%|████▍     | 159/354 [14:20<17:34,  5.41s/it][A

	loss_cls: tensor(0.7922, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1416, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9338, device='cuda:0', grad_fn=<AddBackward0>)



 45%|████▌     | 160/354 [14:25<17:27,  5.40s/it][A

	loss_cls: tensor(0.5139, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0309, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5448, device='cuda:0', grad_fn=<AddBackward0>)



 45%|████▌     | 161/354 [14:31<17:23,  5.41s/it][A

	loss_cls: tensor(0.6922, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3265, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0187, device='cuda:0', grad_fn=<AddBackward0>)



 46%|████▌     | 162/354 [14:36<17:14,  5.39s/it][A

	loss_cls: tensor(0.5126, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3531, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8657, device='cuda:0', grad_fn=<AddBackward0>)



 46%|████▌     | 163/354 [14:42<17:11,  5.40s/it][A

	loss_cls: tensor(0.6350, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1221, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7571, device='cuda:0', grad_fn=<AddBackward0>)



 46%|████▋     | 164/354 [14:47<17:03,  5.38s/it][A

	loss_cls: tensor(0.4527, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0711, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5238, device='cuda:0', grad_fn=<AddBackward0>)



 47%|████▋     | 165/354 [14:52<17:00,  5.40s/it][A

	loss_cls: tensor(0.5927, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0329, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6256, device='cuda:0', grad_fn=<AddBackward0>)



 47%|████▋     | 166/354 [14:58<16:52,  5.38s/it][A

	loss_cls: tensor(1.0575, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4476, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.5051, device='cuda:0', grad_fn=<AddBackward0>)



 47%|████▋     | 167/354 [15:03<16:44,  5.37s/it][A

	loss_cls: tensor(0.8688, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2444, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1132, device='cuda:0', grad_fn=<AddBackward0>)



 47%|████▋     | 168/354 [15:08<16:41,  5.38s/it][A

	loss_cls: tensor(0.6004, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1863, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7867, device='cuda:0', grad_fn=<AddBackward0>)



 48%|████▊     | 169/354 [15:14<16:32,  5.36s/it][A

	loss_cls: tensor(0.4532, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2765, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7297, device='cuda:0', grad_fn=<AddBackward0>)



 48%|████▊     | 170/354 [15:19<16:29,  5.38s/it][A

	loss_cls: tensor(0.7997, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2417, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0414, device='cuda:0', grad_fn=<AddBackward0>)



 48%|████▊     | 171/354 [15:25<16:23,  5.37s/it][A

	loss_cls: tensor(0.3138, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0936, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4074, device='cuda:0', grad_fn=<AddBackward0>)



 49%|████▊     | 172/354 [15:30<16:22,  5.40s/it][A

	loss_cls: tensor(0.4336, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2098, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6434, device='cuda:0', grad_fn=<AddBackward0>)



 49%|████▉     | 173/354 [15:35<16:14,  5.39s/it][A

	loss_cls: tensor(0.4420, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1967, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6387, device='cuda:0', grad_fn=<AddBackward0>)



 49%|████▉     | 174/354 [15:41<16:08,  5.38s/it][A

	loss_cls: tensor(0.7107, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2681, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9788, device='cuda:0', grad_fn=<AddBackward0>)



 49%|████▉     | 175/354 [15:46<16:07,  5.40s/it][A

	loss_cls: tensor(0.7579, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1460, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9039, device='cuda:0', grad_fn=<AddBackward0>)



 50%|████▉     | 176/354 [15:52<15:58,  5.39s/it][A

	loss_cls: tensor(0.5056, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1110, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6166, device='cuda:0', grad_fn=<AddBackward0>)



 50%|█████     | 177/354 [15:57<15:55,  5.40s/it][A

	loss_cls: tensor(1.1740, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3031, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.4771, device='cuda:0', grad_fn=<AddBackward0>)



 50%|█████     | 178/354 [16:02<15:49,  5.39s/it][A

	loss_cls: tensor(0.5173, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1251, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6425, device='cuda:0', grad_fn=<AddBackward0>)



 51%|█████     | 179/354 [16:08<15:48,  5.42s/it][A

	loss_cls: tensor(0.9088, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1591, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0679, device='cuda:0', grad_fn=<AddBackward0>)



 51%|█████     | 180/354 [16:13<15:40,  5.40s/it][A

	loss_cls: tensor(0.7687, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1015, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8702, device='cuda:0', grad_fn=<AddBackward0>)



 51%|█████     | 181/354 [16:19<15:31,  5.39s/it][A

	loss_cls: tensor(0.4194, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1636, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5830, device='cuda:0', grad_fn=<AddBackward0>)



 51%|█████▏    | 182/354 [16:24<15:30,  5.41s/it][A

	loss_cls: tensor(0.5130, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4009, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9138, device='cuda:0', grad_fn=<AddBackward0>)



 52%|█████▏    | 183/354 [16:29<15:22,  5.39s/it][A

	loss_cls: tensor(0.5895, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2841, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8737, device='cuda:0', grad_fn=<AddBackward0>)



 52%|█████▏    | 184/354 [16:35<15:18,  5.40s/it][A

	loss_cls: tensor(0.8481, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4247, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2728, device='cuda:0', grad_fn=<AddBackward0>)



 52%|█████▏    | 185/354 [16:40<15:09,  5.38s/it][A

	loss_cls: tensor(0.6923, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2210, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9132, device='cuda:0', grad_fn=<AddBackward0>)



 53%|█████▎    | 186/354 [16:46<15:07,  5.40s/it][A

	loss_cls: tensor(0.8291, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3757, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2048, device='cuda:0', grad_fn=<AddBackward0>)



 53%|█████▎    | 187/354 [16:51<14:59,  5.39s/it][A

	loss_cls: tensor(0.5430, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1363, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6792, device='cuda:0', grad_fn=<AddBackward0>)



 53%|█████▎    | 188/354 [16:56<14:53,  5.38s/it][A

	loss_cls: tensor(0.5727, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1784, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7510, device='cuda:0', grad_fn=<AddBackward0>)



 53%|█████▎    | 189/354 [17:02<14:48,  5.39s/it][A

	loss_cls: tensor(0.6543, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2691, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9234, device='cuda:0', grad_fn=<AddBackward0>)



 54%|█████▎    | 190/354 [17:07<14:40,  5.37s/it][A

	loss_cls: tensor(0.6441, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3696, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0137, device='cuda:0', grad_fn=<AddBackward0>)



 54%|█████▍    | 191/354 [17:12<14:38,  5.39s/it][A

	loss_cls: tensor(0.7332, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3778, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1110, device='cuda:0', grad_fn=<AddBackward0>)



 54%|█████▍    | 192/354 [17:18<14:31,  5.38s/it][A

	loss_cls: tensor(0.7329, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1979, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9308, device='cuda:0', grad_fn=<AddBackward0>)



 55%|█████▍    | 193/354 [17:23<14:30,  5.41s/it][A

	loss_cls: tensor(0.7540, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1660, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9200, device='cuda:0', grad_fn=<AddBackward0>)



 55%|█████▍    | 194/354 [17:29<14:22,  5.39s/it][A

	loss_cls: tensor(0.4379, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2431, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6810, device='cuda:0', grad_fn=<AddBackward0>)



 55%|█████▌    | 195/354 [17:34<14:16,  5.39s/it][A

	loss_cls: tensor(0.5734, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2903, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8638, device='cuda:0', grad_fn=<AddBackward0>)



 55%|█████▌    | 196/354 [17:39<14:13,  5.40s/it][A

	loss_cls: tensor(0.6528, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3167, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9694, device='cuda:0', grad_fn=<AddBackward0>)



 56%|█████▌    | 197/354 [17:45<14:05,  5.39s/it][A

	loss_cls: tensor(0.5670, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1800, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7470, device='cuda:0', grad_fn=<AddBackward0>)



 56%|█████▌    | 198/354 [17:50<14:02,  5.40s/it][A

	loss_cls: tensor(0.7249, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2977, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0226, device='cuda:0', grad_fn=<AddBackward0>)



 56%|█████▌    | 199/354 [17:56<13:56,  5.40s/it][A

	loss_cls: tensor(0.7656, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2324, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9980, device='cuda:0', grad_fn=<AddBackward0>)



 56%|█████▋    | 200/354 [18:01<13:52,  5.40s/it][A

	loss_cls: tensor(0.5285, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3598, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8883, device='cuda:0', grad_fn=<AddBackward0>)



 57%|█████▋    | 201/354 [18:06<13:44,  5.39s/it][A

	loss_cls: tensor(0.4953, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0540, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5494, device='cuda:0', grad_fn=<AddBackward0>)



 57%|█████▋    | 202/354 [18:12<13:35,  5.36s/it][A

	loss_cls: tensor(0.6106, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0929, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7035, device='cuda:0', grad_fn=<AddBackward0>)



 57%|█████▋    | 203/354 [18:17<13:32,  5.38s/it][A

	loss_cls: tensor(0.6772, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.6941, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.3712, device='cuda:0', grad_fn=<AddBackward0>)



 58%|█████▊    | 204/354 [18:23<13:26,  5.38s/it][A

	loss_cls: tensor(0.4306, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0984, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5290, device='cuda:0', grad_fn=<AddBackward0>)



 58%|█████▊    | 205/354 [18:28<13:25,  5.40s/it][A

	loss_cls: tensor(0.5986, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3433, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9419, device='cuda:0', grad_fn=<AddBackward0>)



 58%|█████▊    | 206/354 [18:33<13:17,  5.39s/it][A

	loss_cls: tensor(0.6600, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.6753, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.3353, device='cuda:0', grad_fn=<AddBackward0>)



 58%|█████▊    | 207/354 [18:39<13:13,  5.40s/it][A

	loss_cls: tensor(0.4815, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1379, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6194, device='cuda:0', grad_fn=<AddBackward0>)



 59%|█████▉    | 208/354 [18:44<13:06,  5.38s/it][A

	loss_cls: tensor(0.4348, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1579, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5927, device='cuda:0', grad_fn=<AddBackward0>)



 59%|█████▉    | 209/354 [18:49<12:59,  5.37s/it][A

	loss_cls: tensor(0.7087, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2879, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9966, device='cuda:0', grad_fn=<AddBackward0>)



 59%|█████▉    | 210/354 [18:55<12:56,  5.39s/it][A

	loss_cls: tensor(0.4625, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2358, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6984, device='cuda:0', grad_fn=<AddBackward0>)



 60%|█████▉    | 211/354 [19:00<12:49,  5.38s/it][A

	loss_cls: tensor(0.6281, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3348, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9629, device='cuda:0', grad_fn=<AddBackward0>)



 60%|█████▉    | 212/354 [19:06<12:46,  5.40s/it][A

	loss_cls: tensor(0.6366, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1275, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7641, device='cuda:0', grad_fn=<AddBackward0>)



 60%|██████    | 213/354 [19:11<12:37,  5.38s/it][A

	loss_cls: tensor(0.6474, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1864, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8338, device='cuda:0', grad_fn=<AddBackward0>)



 60%|██████    | 214/354 [19:16<12:33,  5.38s/it][A

	loss_cls: tensor(0.7338, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3433, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0771, device='cuda:0', grad_fn=<AddBackward0>)



 61%|██████    | 215/354 [19:22<12:25,  5.36s/it][A

	loss_cls: tensor(0.3789, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1620, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5409, device='cuda:0', grad_fn=<AddBackward0>)



 61%|██████    | 216/354 [19:27<12:18,  5.35s/it][A

	loss_cls: tensor(0.4350, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1797, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6146, device='cuda:0', grad_fn=<AddBackward0>)



 61%|██████▏   | 217/354 [19:32<12:17,  5.38s/it][A

	loss_cls: tensor(0.6140, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2874, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9014, device='cuda:0', grad_fn=<AddBackward0>)



 62%|██████▏   | 218/354 [19:38<12:11,  5.38s/it][A

	loss_cls: tensor(0.8054, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2863, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0918, device='cuda:0', grad_fn=<AddBackward0>)



 62%|██████▏   | 219/354 [19:43<12:07,  5.39s/it][A

	loss_cls: tensor(0.4647, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2542, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7189, device='cuda:0', grad_fn=<AddBackward0>)



 62%|██████▏   | 220/354 [19:49<12:00,  5.38s/it][A

	loss_cls: tensor(0.8724, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1788, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0512, device='cuda:0', grad_fn=<AddBackward0>)



 62%|██████▏   | 221/354 [19:54<11:57,  5.39s/it][A

	loss_cls: tensor(0.6160, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2627, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8787, device='cuda:0', grad_fn=<AddBackward0>)



 63%|██████▎   | 222/354 [19:59<11:49,  5.38s/it][A

	loss_cls: tensor(0.5297, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2657, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7953, device='cuda:0', grad_fn=<AddBackward0>)



 63%|██████▎   | 223/354 [20:05<11:42,  5.36s/it][A

	loss_cls: tensor(0.9999, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0832, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0831, device='cuda:0', grad_fn=<AddBackward0>)



 63%|██████▎   | 224/354 [20:10<11:40,  5.39s/it][A

	loss_cls: tensor(0.5548, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2181, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7729, device='cuda:0', grad_fn=<AddBackward0>)



 64%|██████▎   | 225/354 [20:16<11:34,  5.39s/it][A

	loss_cls: tensor(0.5825, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1233, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7058, device='cuda:0', grad_fn=<AddBackward0>)



 64%|██████▍   | 226/354 [20:21<11:32,  5.41s/it][A

	loss_cls: tensor(0.3904, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3912, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7817, device='cuda:0', grad_fn=<AddBackward0>)



 64%|██████▍   | 227/354 [20:26<11:26,  5.40s/it][A

	loss_cls: tensor(0.5841, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2492, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8333, device='cuda:0', grad_fn=<AddBackward0>)



 64%|██████▍   | 228/354 [20:32<11:22,  5.42s/it][A

	loss_cls: tensor(0.4697, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3216, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7913, device='cuda:0', grad_fn=<AddBackward0>)



 65%|██████▍   | 229/354 [20:37<11:15,  5.40s/it][A

	loss_cls: tensor(0.4336, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0906, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5242, device='cuda:0', grad_fn=<AddBackward0>)



 65%|██████▍   | 230/354 [20:43<11:06,  5.38s/it][A

	loss_cls: tensor(0.5744, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1329, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7073, device='cuda:0', grad_fn=<AddBackward0>)



 65%|██████▌   | 231/354 [20:48<11:05,  5.41s/it][A

	loss_cls: tensor(0.4176, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2719, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6895, device='cuda:0', grad_fn=<AddBackward0>)



 66%|██████▌   | 232/354 [20:53<10:58,  5.40s/it][A

	loss_cls: tensor(0.7990, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2615, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0605, device='cuda:0', grad_fn=<AddBackward0>)



 66%|██████▌   | 233/354 [20:59<10:55,  5.42s/it][A

	loss_cls: tensor(0.4285, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2440, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6725, device='cuda:0', grad_fn=<AddBackward0>)



 66%|██████▌   | 234/354 [21:04<10:48,  5.40s/it][A

	loss_cls: tensor(0.4704, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2321, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7025, device='cuda:0', grad_fn=<AddBackward0>)



 66%|██████▋   | 235/354 [21:10<10:45,  5.42s/it][A

	loss_cls: tensor(0.6660, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2662, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9322, device='cuda:0', grad_fn=<AddBackward0>)



 67%|██████▋   | 236/354 [21:15<10:37,  5.40s/it][A

	loss_cls: tensor(0.6440, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1088, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7528, device='cuda:0', grad_fn=<AddBackward0>)



 67%|██████▋   | 237/354 [21:20<10:33,  5.41s/it][A

	loss_cls: tensor(0.7901, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0592, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8493, device='cuda:0', grad_fn=<AddBackward0>)



 67%|██████▋   | 238/354 [21:26<10:26,  5.40s/it][A

	loss_cls: tensor(0.6639, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1771, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8410, device='cuda:0', grad_fn=<AddBackward0>)



 68%|██████▊   | 239/354 [21:31<10:20,  5.39s/it][A

	loss_cls: tensor(0.5895, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1718, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7613, device='cuda:0', grad_fn=<AddBackward0>)



 68%|██████▊   | 240/354 [21:37<10:16,  5.41s/it][A

	loss_cls: tensor(0.4359, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1885, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6244, device='cuda:0', grad_fn=<AddBackward0>)



 68%|██████▊   | 241/354 [21:42<10:09,  5.39s/it][A

	loss_cls: tensor(0.6247, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2985, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9232, device='cuda:0', grad_fn=<AddBackward0>)



 68%|██████▊   | 242/354 [21:47<10:05,  5.41s/it][A

	loss_cls: tensor(0.3392, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2104, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5496, device='cuda:0', grad_fn=<AddBackward0>)



 69%|██████▊   | 243/354 [21:53<09:57,  5.38s/it][A

	loss_cls: tensor(0.5133, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0880, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6013, device='cuda:0', grad_fn=<AddBackward0>)



 69%|██████▉   | 244/354 [21:58<09:54,  5.40s/it][A

	loss_cls: tensor(0.8423, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0802, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9225, device='cuda:0', grad_fn=<AddBackward0>)



 69%|██████▉   | 245/354 [22:04<09:48,  5.40s/it][A

	loss_cls: tensor(0.3272, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1337, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4609, device='cuda:0', grad_fn=<AddBackward0>)



 69%|██████▉   | 246/354 [22:09<09:41,  5.39s/it][A

	loss_cls: tensor(0.6507, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0751, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7258, device='cuda:0', grad_fn=<AddBackward0>)



 70%|██████▉   | 247/354 [22:14<09:38,  5.41s/it][A

	loss_cls: tensor(0.8469, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1424, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9893, device='cuda:0', grad_fn=<AddBackward0>)



 70%|███████   | 248/354 [22:20<09:33,  5.41s/it][A

	loss_cls: tensor(0.6015, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4498, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0513, device='cuda:0', grad_fn=<AddBackward0>)



 70%|███████   | 249/354 [22:25<09:31,  5.45s/it][A

	loss_cls: tensor(1.1001, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1958, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2958, device='cuda:0', grad_fn=<AddBackward0>)



 71%|███████   | 250/354 [22:31<09:25,  5.44s/it][A

	loss_cls: tensor(1.0238, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1986, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2223, device='cuda:0', grad_fn=<AddBackward0>)



 71%|███████   | 251/354 [22:36<09:24,  5.48s/it][A

	loss_cls: tensor(0.3232, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0287, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.3519, device='cuda:0', grad_fn=<AddBackward0>)



 71%|███████   | 252/354 [22:42<09:18,  5.47s/it][A

	loss_cls: tensor(0.3335, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0873, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4208, device='cuda:0', grad_fn=<AddBackward0>)



 71%|███████▏  | 253/354 [22:47<09:12,  5.47s/it][A

	loss_cls: tensor(0.6932, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1212, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8144, device='cuda:0', grad_fn=<AddBackward0>)



 72%|███████▏  | 254/354 [22:53<09:09,  5.50s/it][A

	loss_cls: tensor(0.4140, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2636, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6776, device='cuda:0', grad_fn=<AddBackward0>)



 72%|███████▏  | 255/354 [22:58<08:48,  5.34s/it][A

	loss_cls: tensor(0.4012, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0294, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4306, device='cuda:0', grad_fn=<AddBackward0>)



 72%|███████▏  | 256/354 [23:03<08:41,  5.32s/it][A

	loss_cls: tensor(0.8834, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1810, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0644, device='cuda:0', grad_fn=<AddBackward0>)



 73%|███████▎  | 257/354 [23:09<08:40,  5.37s/it][A

	loss_cls: tensor(0.4231, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3393, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7624, device='cuda:0', grad_fn=<AddBackward0>)



 73%|███████▎  | 258/354 [23:14<08:42,  5.44s/it][A

	loss_cls: tensor(0.5335, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3250, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8585, device='cuda:0', grad_fn=<AddBackward0>)



 73%|███████▎  | 259/354 [23:20<08:38,  5.46s/it][A

	loss_cls: tensor(0.5532, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1632, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7164, device='cuda:0', grad_fn=<AddBackward0>)



 73%|███████▎  | 260/354 [23:25<08:33,  5.46s/it][A

	loss_cls: tensor(0.7413, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1756, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9170, device='cuda:0', grad_fn=<AddBackward0>)



 74%|███████▎  | 261/354 [23:31<08:30,  5.49s/it][A

	loss_cls: tensor(0.6395, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2720, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9115, device='cuda:0', grad_fn=<AddBackward0>)



 74%|███████▍  | 262/354 [23:36<08:23,  5.48s/it][A

	loss_cls: tensor(0.4532, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1148, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5680, device='cuda:0', grad_fn=<AddBackward0>)



 74%|███████▍  | 263/354 [23:42<08:19,  5.49s/it][A

	loss_cls: tensor(0.6249, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1251, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7500, device='cuda:0', grad_fn=<AddBackward0>)



 75%|███████▍  | 264/354 [23:47<08:14,  5.49s/it][A

	loss_cls: tensor(0.6213, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1675, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7888, device='cuda:0', grad_fn=<AddBackward0>)



 75%|███████▍  | 265/354 [23:53<08:10,  5.51s/it][A

	loss_cls: tensor(0.5817, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2445, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8262, device='cuda:0', grad_fn=<AddBackward0>)



 75%|███████▌  | 266/354 [23:58<08:04,  5.50s/it][A

	loss_cls: tensor(0.6331, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1270, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7601, device='cuda:0', grad_fn=<AddBackward0>)



 75%|███████▌  | 267/354 [24:04<07:58,  5.50s/it][A

	loss_cls: tensor(0.5812, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0467, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6279, device='cuda:0', grad_fn=<AddBackward0>)



 76%|███████▌  | 268/354 [24:09<07:53,  5.50s/it][A

	loss_cls: tensor(0.7637, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4546, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2184, device='cuda:0', grad_fn=<AddBackward0>)



 76%|███████▌  | 269/354 [24:15<07:46,  5.49s/it][A

	loss_cls: tensor(0.8507, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4138, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2645, device='cuda:0', grad_fn=<AddBackward0>)



 76%|███████▋  | 270/354 [24:20<07:42,  5.50s/it][A

	loss_cls: tensor(0.7333, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3321, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0654, device='cuda:0', grad_fn=<AddBackward0>)



 77%|███████▋  | 271/354 [24:26<07:35,  5.49s/it][A

	loss_cls: tensor(0.4222, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1253, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5474, device='cuda:0', grad_fn=<AddBackward0>)



 77%|███████▋  | 272/354 [24:31<07:32,  5.52s/it][A

	loss_cls: tensor(0.4911, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2938, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7849, device='cuda:0', grad_fn=<AddBackward0>)



 77%|███████▋  | 273/354 [24:37<07:26,  5.51s/it][A

	loss_cls: tensor(0.4566, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2039, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6605, device='cuda:0', grad_fn=<AddBackward0>)



 77%|███████▋  | 274/354 [24:42<07:19,  5.49s/it][A

	loss_cls: tensor(0.6835, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1552, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8387, device='cuda:0', grad_fn=<AddBackward0>)



 78%|███████▊  | 275/354 [24:48<07:14,  5.50s/it][A

	loss_cls: tensor(0.6382, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2727, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9110, device='cuda:0', grad_fn=<AddBackward0>)



 78%|███████▊  | 276/354 [24:53<07:08,  5.49s/it][A

	loss_cls: tensor(0.5227, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5395, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0622, device='cuda:0', grad_fn=<AddBackward0>)



 78%|███████▊  | 277/354 [24:59<07:04,  5.51s/it][A

	loss_cls: tensor(0.7641, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1875, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9516, device='cuda:0', grad_fn=<AddBackward0>)



 79%|███████▊  | 278/354 [25:04<06:58,  5.50s/it][A

	loss_cls: tensor(0.5959, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2102, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8061, device='cuda:0', grad_fn=<AddBackward0>)



 79%|███████▉  | 279/354 [25:10<06:53,  5.51s/it][A

	loss_cls: tensor(0.3828, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.6270, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0098, device='cuda:0', grad_fn=<AddBackward0>)



 79%|███████▉  | 280/354 [25:15<06:46,  5.50s/it][A

	loss_cls: tensor(0.7522, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1921, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9444, device='cuda:0', grad_fn=<AddBackward0>)



 79%|███████▉  | 281/354 [25:21<06:40,  5.48s/it][A

	loss_cls: tensor(0.3901, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2696, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6597, device='cuda:0', grad_fn=<AddBackward0>)



 80%|███████▉  | 282/354 [25:26<06:36,  5.50s/it][A

	loss_cls: tensor(0.5068, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4434, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9502, device='cuda:0', grad_fn=<AddBackward0>)



 80%|███████▉  | 283/354 [25:32<06:28,  5.47s/it][A

	loss_cls: tensor(0.6400, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1272, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7672, device='cuda:0', grad_fn=<AddBackward0>)



 80%|████████  | 284/354 [25:37<06:24,  5.50s/it][A

	loss_cls: tensor(0.8933, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3664, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2597, device='cuda:0', grad_fn=<AddBackward0>)



 81%|████████  | 285/354 [25:43<06:18,  5.49s/it][A

	loss_cls: tensor(0.8293, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2154, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0447, device='cuda:0', grad_fn=<AddBackward0>)



 81%|████████  | 286/354 [25:48<06:14,  5.51s/it][A

	loss_cls: tensor(0.4829, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2024, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6853, device='cuda:0', grad_fn=<AddBackward0>)



 81%|████████  | 287/354 [25:54<06:08,  5.50s/it][A

	loss_cls: tensor(0.6323, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2858, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9181, device='cuda:0', grad_fn=<AddBackward0>)



 81%|████████▏ | 288/354 [25:59<06:02,  5.49s/it][A

	loss_cls: tensor(0.5234, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2296, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7530, device='cuda:0', grad_fn=<AddBackward0>)



 82%|████████▏ | 289/354 [26:05<05:58,  5.51s/it][A

	loss_cls: tensor(0.4192, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1479, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5671, device='cuda:0', grad_fn=<AddBackward0>)



 82%|████████▏ | 290/354 [26:10<05:51,  5.49s/it][A

	loss_cls: tensor(0.5045, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0381, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5425, device='cuda:0', grad_fn=<AddBackward0>)



 82%|████████▏ | 291/354 [26:16<05:46,  5.50s/it][A

	loss_cls: tensor(0.9548, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3518, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.3066, device='cuda:0', grad_fn=<AddBackward0>)



 82%|████████▏ | 292/354 [26:21<05:33,  5.37s/it][A

	loss_cls: tensor(0.6857, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2521, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9378, device='cuda:0', grad_fn=<AddBackward0>)



 83%|████████▎ | 293/354 [26:26<05:29,  5.41s/it][A

	loss_cls: tensor(0.6404, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2992, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9395, device='cuda:0', grad_fn=<AddBackward0>)



 83%|████████▎ | 294/354 [26:32<05:25,  5.42s/it][A

	loss_cls: tensor(0.6768, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1505, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8273, device='cuda:0', grad_fn=<AddBackward0>)



 83%|████████▎ | 295/354 [26:37<05:20,  5.43s/it][A

	loss_cls: tensor(0.4581, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0505, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5086, device='cuda:0', grad_fn=<AddBackward0>)



 84%|████████▎ | 296/354 [26:43<05:16,  5.46s/it][A

	loss_cls: tensor(0.6751, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4502, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1252, device='cuda:0', grad_fn=<AddBackward0>)



 84%|████████▍ | 297/354 [26:48<05:11,  5.47s/it][A

	loss_cls: tensor(0.3886, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0287, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4173, device='cuda:0', grad_fn=<AddBackward0>)



 84%|████████▍ | 298/354 [26:54<05:08,  5.51s/it][A

	loss_cls: tensor(0.4795, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2044, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6838, device='cuda:0', grad_fn=<AddBackward0>)



 84%|████████▍ | 299/354 [26:59<05:03,  5.51s/it][A

	loss_cls: tensor(0.5716, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1749, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7465, device='cuda:0', grad_fn=<AddBackward0>)



 85%|████████▍ | 300/354 [27:05<04:58,  5.52s/it][A

	loss_cls: tensor(0.7142, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1398, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8541, device='cuda:0', grad_fn=<AddBackward0>)



 85%|████████▌ | 301/354 [27:10<04:51,  5.50s/it][A

	loss_cls: tensor(0.5945, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3909, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9854, device='cuda:0', grad_fn=<AddBackward0>)



 85%|████████▌ | 302/354 [27:16<04:45,  5.49s/it][A

	loss_cls: tensor(0.5720, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1492, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7212, device='cuda:0', grad_fn=<AddBackward0>)



 86%|████████▌ | 303/354 [27:21<04:40,  5.50s/it][A

	loss_cls: tensor(0.7468, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2205, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9673, device='cuda:0', grad_fn=<AddBackward0>)



 86%|████████▌ | 304/354 [27:27<04:34,  5.49s/it][A

	loss_cls: tensor(0.5934, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1025, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6959, device='cuda:0', grad_fn=<AddBackward0>)



 86%|████████▌ | 305/354 [27:32<04:29,  5.51s/it][A

	loss_cls: tensor(0.4893, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3003, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7896, device='cuda:0', grad_fn=<AddBackward0>)



 86%|████████▋ | 306/354 [27:38<04:23,  5.49s/it][A

	loss_cls: tensor(0.4502, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1715, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6217, device='cuda:0', grad_fn=<AddBackward0>)



 87%|████████▋ | 307/354 [27:43<04:19,  5.53s/it][A

	loss_cls: tensor(0.6487, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1999, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8485, device='cuda:0', grad_fn=<AddBackward0>)



 87%|████████▋ | 308/354 [27:49<04:13,  5.51s/it][A

	loss_cls: tensor(0.6117, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2178, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8295, device='cuda:0', grad_fn=<AddBackward0>)



 87%|████████▋ | 309/354 [27:54<04:06,  5.48s/it][A

	loss_cls: tensor(0.5027, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0469, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5496, device='cuda:0', grad_fn=<AddBackward0>)



 88%|████████▊ | 310/354 [28:00<04:02,  5.51s/it][A

	loss_cls: tensor(0.5443, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0508, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5952, device='cuda:0', grad_fn=<AddBackward0>)



 88%|████████▊ | 311/354 [28:05<03:56,  5.49s/it][A

	loss_cls: tensor(0.4445, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1868, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6312, device='cuda:0', grad_fn=<AddBackward0>)



 88%|████████▊ | 312/354 [28:11<03:51,  5.51s/it][A

	loss_cls: tensor(0.3775, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1495, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5270, device='cuda:0', grad_fn=<AddBackward0>)



 88%|████████▊ | 313/354 [28:16<03:45,  5.49s/it][A

	loss_cls: tensor(0.6564, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4040, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0605, device='cuda:0', grad_fn=<AddBackward0>)



 89%|████████▊ | 314/354 [28:22<03:40,  5.51s/it][A

	loss_cls: tensor(0.3783, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1071, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4854, device='cuda:0', grad_fn=<AddBackward0>)



 89%|████████▉ | 315/354 [28:27<03:33,  5.49s/it][A

	loss_cls: tensor(0.5034, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3240, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8274, device='cuda:0', grad_fn=<AddBackward0>)



 89%|████████▉ | 316/354 [28:33<03:27,  5.47s/it][A

	loss_cls: tensor(0.9499, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1334, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0833, device='cuda:0', grad_fn=<AddBackward0>)



 90%|████████▉ | 317/354 [28:38<03:23,  5.49s/it][A

	loss_cls: tensor(0.3412, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0290, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.3703, device='cuda:0', grad_fn=<AddBackward0>)



 90%|████████▉ | 318/354 [28:44<03:17,  5.49s/it][A

	loss_cls: tensor(0.8127, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0914, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9042, device='cuda:0', grad_fn=<AddBackward0>)



 90%|█████████ | 319/354 [28:49<03:12,  5.51s/it][A

	loss_cls: tensor(1.1942, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1717, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.3659, device='cuda:0', grad_fn=<AddBackward0>)



 90%|█████████ | 320/354 [28:55<03:06,  5.49s/it][A

	loss_cls: tensor(1.4931, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4188, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.9119, device='cuda:0', grad_fn=<AddBackward0>)



 91%|█████████ | 321/354 [29:00<03:02,  5.52s/it][A

	loss_cls: tensor(0.6604, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3495, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0099, device='cuda:0', grad_fn=<AddBackward0>)



 91%|█████████ | 322/354 [29:06<02:56,  5.50s/it][A

	loss_cls: tensor(0.5817, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1644, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7461, device='cuda:0', grad_fn=<AddBackward0>)



 91%|█████████ | 323/354 [29:11<02:50,  5.48s/it][A

	loss_cls: tensor(0.5601, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1130, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6731, device='cuda:0', grad_fn=<AddBackward0>)



 92%|█████████▏| 324/354 [29:17<02:45,  5.51s/it][A

	loss_cls: tensor(0.5489, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4939, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0428, device='cuda:0', grad_fn=<AddBackward0>)



 92%|█████████▏| 325/354 [29:22<02:39,  5.50s/it][A

	loss_cls: tensor(0.4360, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2163, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6523, device='cuda:0', grad_fn=<AddBackward0>)



 92%|█████████▏| 326/354 [29:28<02:34,  5.52s/it][A

	loss_cls: tensor(0.5881, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0479, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6360, device='cuda:0', grad_fn=<AddBackward0>)



 92%|█████████▏| 327/354 [29:33<02:28,  5.50s/it][A

	loss_cls: tensor(0.5575, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1560, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7135, device='cuda:0', grad_fn=<AddBackward0>)



 93%|█████████▎| 328/354 [29:39<02:23,  5.51s/it][A

	loss_cls: tensor(0.4792, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1810, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6603, device='cuda:0', grad_fn=<AddBackward0>)



 93%|█████████▎| 329/354 [29:44<02:17,  5.50s/it][A

	loss_cls: tensor(0.5555, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2454, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8008, device='cuda:0', grad_fn=<AddBackward0>)



 93%|█████████▎| 330/354 [29:50<02:11,  5.48s/it][A

	loss_cls: tensor(0.7165, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4521, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1685, device='cuda:0', grad_fn=<AddBackward0>)



 94%|█████████▎| 331/354 [29:55<02:06,  5.50s/it][A

	loss_cls: tensor(0.6514, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2287, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8802, device='cuda:0', grad_fn=<AddBackward0>)



 94%|█████████▍| 332/354 [30:01<02:00,  5.49s/it][A

	loss_cls: tensor(0.4615, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2099, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6714, device='cuda:0', grad_fn=<AddBackward0>)



 94%|█████████▍| 333/354 [30:06<01:55,  5.51s/it][A

	loss_cls: tensor(0.8180, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2096, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0276, device='cuda:0', grad_fn=<AddBackward0>)



 94%|█████████▍| 334/354 [30:12<01:49,  5.49s/it][A

	loss_cls: tensor(0.6015, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1963, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7978, device='cuda:0', grad_fn=<AddBackward0>)



 95%|█████████▍| 335/354 [30:17<01:43,  5.47s/it][A

	loss_cls: tensor(0.6149, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4760, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0909, device='cuda:0', grad_fn=<AddBackward0>)



 95%|█████████▍| 336/354 [30:23<01:38,  5.47s/it][A

	loss_cls: tensor(0.6900, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2912, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9812, device='cuda:0', grad_fn=<AddBackward0>)



 95%|█████████▌| 337/354 [30:28<01:32,  5.47s/it][A

	loss_cls: tensor(0.6394, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3139, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9533, device='cuda:0', grad_fn=<AddBackward0>)



 95%|█████████▌| 338/354 [30:34<01:27,  5.47s/it][A

	loss_cls: tensor(0.5830, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4674, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0504, device='cuda:0', grad_fn=<AddBackward0>)



 96%|█████████▌| 339/354 [30:39<01:22,  5.47s/it][A

	loss_cls: tensor(0.4098, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2911, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7009, device='cuda:0', grad_fn=<AddBackward0>)



 96%|█████████▌| 340/354 [30:45<01:16,  5.50s/it][A

	loss_cls: tensor(0.7271, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2861, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0133, device='cuda:0', grad_fn=<AddBackward0>)



 96%|█████████▋| 341/354 [30:50<01:11,  5.48s/it][A

	loss_cls: tensor(0.6850, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1989, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8838, device='cuda:0', grad_fn=<AddBackward0>)



 97%|█████████▋| 342/354 [30:56<01:06,  5.51s/it][A

	loss_cls: tensor(0.5455, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3629, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9084, device='cuda:0', grad_fn=<AddBackward0>)



 97%|█████████▋| 343/354 [31:01<01:01,  5.55s/it][A

	loss_cls: tensor(0.6210, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1752, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7962, device='cuda:0', grad_fn=<AddBackward0>)



 97%|█████████▋| 344/354 [31:07<00:55,  5.59s/it][A

	loss_cls: tensor(0.4703, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1628, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6331, device='cuda:0', grad_fn=<AddBackward0>)



 97%|█████████▋| 345/354 [31:13<00:50,  5.59s/it][A

	loss_cls: tensor(0.6553, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2038, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8591, device='cuda:0', grad_fn=<AddBackward0>)



 98%|█████████▊| 346/354 [31:18<00:44,  5.54s/it][A

	loss_cls: tensor(0.7454, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2021, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9476, device='cuda:0', grad_fn=<AddBackward0>)



 98%|█████████▊| 347/354 [31:23<00:38,  5.51s/it][A

	loss_cls: tensor(0.7784, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0959, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8743, device='cuda:0', grad_fn=<AddBackward0>)



 98%|█████████▊| 348/354 [31:29<00:32,  5.46s/it][A

	loss_cls: tensor(0.6086, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2429, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8515, device='cuda:0', grad_fn=<AddBackward0>)



 99%|█████████▊| 349/354 [31:34<00:27,  5.44s/it][A

	loss_cls: tensor(0.4597, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1743, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6341, device='cuda:0', grad_fn=<AddBackward0>)



 99%|█████████▉| 350/354 [31:40<00:21,  5.41s/it][A

	loss_cls: tensor(0.4716, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0261, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4977, device='cuda:0', grad_fn=<AddBackward0>)



 99%|█████████▉| 351/354 [31:45<00:16,  5.42s/it][A

	loss_cls: tensor(0.6601, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2611, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9213, device='cuda:0', grad_fn=<AddBackward0>)



 99%|█████████▉| 352/354 [31:50<00:10,  5.40s/it][A

	loss_cls: tensor(0.4805, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2896, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7701, device='cuda:0', grad_fn=<AddBackward0>)



100%|█████████▉| 353/354 [31:56<00:05,  5.38s/it][A

	loss_cls: tensor(1.0105, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1307, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1411, device='cuda:0', grad_fn=<AddBackward0>)



100%|██████████| 354/354 [31:58<00:00,  5.42s/it][A

  0%|          | 0/177 [00:00<?, ?it/s][A

	loss_cls: tensor(0.3128, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0163, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.3291, device='cuda:0', grad_fn=<AddBackward0>)

	Training Loss: 0.8489431734644087

	Training cls acc: 0.6854402071563088

	Training cls prec: 0.578783400764333

	Training cls rec: 0.6160912690573708

	Training cls f1: 0.5318005179898385

--
	Training ner acc: 0.9551878304758328

	Training ner prec: 0.2729954470220202

	Training ner rec: 0.2804383575678841

	Training ner f1: 0.2759131902074863

	Current Learning rate:  0.0007142857142857143



  1%|          | 1/177 [00:00<02:04,  1.41it/s][A
  1%|          | 2/177 [00:01<01:57,  1.49it/s][A
  2%|▏         | 3/177 [00:02<02:00,  1.44it/s][A
  2%|▏         | 4/177 [00:02<02:01,  1.42it/s][A
  3%|▎         | 5/177 [00:03<01:57,  1.47it/s][A
  3%|▎         | 6/177 [00:04<01:59,  1.43it/s][A
  4%|▍         | 7/177 [00:04<01:59,  1.42it/s][A
  5%|▍         | 8/177 [00:05<02:00,  1.41it/s][A
  5%|▌         | 9/177 [00:06<01:56,  1.45it/s][A
  6%|▌         | 10/177 [00:06<01:56,  1.43it/s][A
  6%|▌         | 11/177 [00:07<01:57,  1.41it/s][A
  7%|▋         | 12/177 [00:08<01:53,  1.45it/s][A
  7%|▋         | 13/177 [00:09<01:54,  1.43it/s][A
  8%|▊         | 14/177 [00:09<01:55,  1.41it/s][A
  8%|▊         | 15/177 [00:10<01:56,  1.39it/s][A
  9%|▉         | 16/177 [00:11<01:51,  1.44it/s][A
 10%|▉         | 17/177 [00:11<01:52,  1.42it/s][A
 10%|█         | 18/177 [00:12<01:53,  1.41it/s][A
 11%|█         | 19/177 [00:13<01:52,  1.40it/s][A
 11%|█▏        | 20/


	Validation Loss: 0.8598872804540699

	Validation cls acc: 0.8811205273069679

	Validation cls prec: 0.6627488566047888

	Validation cls rec: 0.6929412160344364

	Validation cls f1: 0.6715185630439868

--
	Validation ner acc: 0.9537075141699931

	Validation ner prec: 0.3961699329389849

	Validation ner rec: 0.4067796610169492

	Validation ner f1: 0.40126792642461295



  0%|          | 1/354 [00:05<32:09,  5.47s/it][A

	loss_cls: tensor(0.6578, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3292, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9869, device='cuda:0', grad_fn=<AddBackward0>)



  1%|          | 2/354 [00:10<31:50,  5.43s/it][A

	loss_cls: tensor(0.5274, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2406, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7680, device='cuda:0', grad_fn=<AddBackward0>)



  1%|          | 3/354 [00:16<31:50,  5.44s/it][A

	loss_cls: tensor(0.7477, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1845, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9322, device='cuda:0', grad_fn=<AddBackward0>)



  1%|          | 4/354 [00:21<31:36,  5.42s/it][A

	loss_cls: tensor(0.7729, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1438, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9166, device='cuda:0', grad_fn=<AddBackward0>)



  1%|▏         | 5/354 [00:27<31:24,  5.40s/it][A

	loss_cls: tensor(0.3263, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1451, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4713, device='cuda:0', grad_fn=<AddBackward0>)



  2%|▏         | 6/354 [00:32<31:24,  5.42s/it][A

	loss_cls: tensor(0.5646, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2845, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8491, device='cuda:0', grad_fn=<AddBackward0>)



  2%|▏         | 7/354 [00:37<31:15,  5.41s/it][A

	loss_cls: tensor(0.3485, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0278, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.3763, device='cuda:0', grad_fn=<AddBackward0>)



  2%|▏         | 8/354 [00:43<31:20,  5.44s/it][A

	loss_cls: tensor(0.8143, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5376, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.3519, device='cuda:0', grad_fn=<AddBackward0>)



  3%|▎         | 9/354 [00:48<31:16,  5.44s/it][A

	loss_cls: tensor(0.4107, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1347, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5454, device='cuda:0', grad_fn=<AddBackward0>)



  3%|▎         | 10/354 [00:54<31:16,  5.46s/it][A

	loss_cls: tensor(0.7165, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4354, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1519, device='cuda:0', grad_fn=<AddBackward0>)



  3%|▎         | 11/354 [00:59<31:03,  5.43s/it][A

	loss_cls: tensor(0.4155, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0961, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5116, device='cuda:0', grad_fn=<AddBackward0>)



  3%|▎         | 12/354 [01:05<30:54,  5.42s/it][A

	loss_cls: tensor(0.5064, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1144, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6207, device='cuda:0', grad_fn=<AddBackward0>)



  4%|▎         | 13/354 [01:10<30:54,  5.44s/it][A

	loss_cls: tensor(0.5016, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0826, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5842, device='cuda:0', grad_fn=<AddBackward0>)



  4%|▍         | 14/354 [01:16<30:46,  5.43s/it][A

	loss_cls: tensor(0.6042, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2580, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8621, device='cuda:0', grad_fn=<AddBackward0>)



  4%|▍         | 15/354 [01:21<30:45,  5.45s/it][A

	loss_cls: tensor(0.4888, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1619, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6507, device='cuda:0', grad_fn=<AddBackward0>)



  5%|▍         | 16/354 [01:26<30:33,  5.42s/it][A

	loss_cls: tensor(0.4573, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0987, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5560, device='cuda:0', grad_fn=<AddBackward0>)



  5%|▍         | 17/354 [01:32<30:33,  5.44s/it][A

	loss_cls: tensor(0.4489, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0508, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4997, device='cuda:0', grad_fn=<AddBackward0>)



  5%|▌         | 18/354 [01:37<30:22,  5.42s/it][A

	loss_cls: tensor(0.4760, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1572, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6332, device='cuda:0', grad_fn=<AddBackward0>)



  5%|▌         | 19/354 [01:43<30:13,  5.41s/it][A

	loss_cls: tensor(0.5112, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0964, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6076, device='cuda:0', grad_fn=<AddBackward0>)



  6%|▌         | 20/354 [01:48<30:15,  5.43s/it][A

	loss_cls: tensor(0.7932, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4270, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2202, device='cuda:0', grad_fn=<AddBackward0>)



  6%|▌         | 21/354 [01:54<30:08,  5.43s/it][A

	loss_cls: tensor(0.4760, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2618, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7378, device='cuda:0', grad_fn=<AddBackward0>)



  6%|▌         | 22/354 [01:59<30:09,  5.45s/it][A

	loss_cls: tensor(0.3842, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0891, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4733, device='cuda:0', grad_fn=<AddBackward0>)



  6%|▋         | 23/354 [02:04<29:58,  5.43s/it][A

	loss_cls: tensor(0.5839, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1326, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7165, device='cuda:0', grad_fn=<AddBackward0>)



  7%|▋         | 24/354 [02:10<29:59,  5.45s/it][A

	loss_cls: tensor(0.4204, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0704, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4909, device='cuda:0', grad_fn=<AddBackward0>)



  7%|▋         | 25/354 [02:15<29:45,  5.43s/it][A

	loss_cls: tensor(0.7349, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1013, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8362, device='cuda:0', grad_fn=<AddBackward0>)



  7%|▋         | 26/354 [02:21<29:34,  5.41s/it][A

	loss_cls: tensor(0.7760, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2657, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0416, device='cuda:0', grad_fn=<AddBackward0>)



  8%|▊         | 27/354 [02:26<29:36,  5.43s/it][A

	loss_cls: tensor(0.5016, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0618, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5634, device='cuda:0', grad_fn=<AddBackward0>)



  8%|▊         | 28/354 [02:32<29:25,  5.42s/it][A

	loss_cls: tensor(0.4656, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1892, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6547, device='cuda:0', grad_fn=<AddBackward0>)



  8%|▊         | 29/354 [02:37<29:27,  5.44s/it][A

	loss_cls: tensor(0.4748, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3953, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8701, device='cuda:0', grad_fn=<AddBackward0>)



  8%|▊         | 30/354 [02:42<29:16,  5.42s/it][A

	loss_cls: tensor(0.6223, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5750, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1973, device='cuda:0', grad_fn=<AddBackward0>)



  9%|▉         | 31/354 [02:48<29:16,  5.44s/it][A

	loss_cls: tensor(0.2790, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1610, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4400, device='cuda:0', grad_fn=<AddBackward0>)



  9%|▉         | 32/354 [02:53<29:05,  5.42s/it][A

	loss_cls: tensor(1.1480, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1384, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2864, device='cuda:0', grad_fn=<AddBackward0>)



  9%|▉         | 33/354 [02:59<29:05,  5.44s/it][A

	loss_cls: tensor(0.4368, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0463, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4832, device='cuda:0', grad_fn=<AddBackward0>)



 10%|▉         | 34/354 [03:04<28:56,  5.43s/it][A

	loss_cls: tensor(1.0496, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2705, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.3201, device='cuda:0', grad_fn=<AddBackward0>)



 10%|▉         | 35/354 [03:10<28:47,  5.42s/it][A

	loss_cls: tensor(0.4026, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4281, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8308, device='cuda:0', grad_fn=<AddBackward0>)



 10%|█         | 36/354 [03:15<28:52,  5.45s/it][A

	loss_cls: tensor(0.5565, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2921, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8486, device='cuda:0', grad_fn=<AddBackward0>)



 10%|█         | 37/354 [03:20<28:40,  5.43s/it][A

	loss_cls: tensor(0.6954, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3684, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0638, device='cuda:0', grad_fn=<AddBackward0>)



 11%|█         | 38/354 [03:26<28:38,  5.44s/it][A

	loss_cls: tensor(0.4621, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1441, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6062, device='cuda:0', grad_fn=<AddBackward0>)



 11%|█         | 39/354 [03:31<28:26,  5.42s/it][A

	loss_cls: tensor(0.5856, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2136, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7992, device='cuda:0', grad_fn=<AddBackward0>)



 11%|█▏        | 40/354 [03:37<28:25,  5.43s/it][A

	loss_cls: tensor(0.6055, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1820, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7875, device='cuda:0', grad_fn=<AddBackward0>)



 12%|█▏        | 41/354 [03:42<28:17,  5.42s/it][A

	loss_cls: tensor(0.5692, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3466, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9158, device='cuda:0', grad_fn=<AddBackward0>)



 12%|█▏        | 42/354 [03:47<28:05,  5.40s/it][A

	loss_cls: tensor(0.6729, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1402, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8131, device='cuda:0', grad_fn=<AddBackward0>)



 12%|█▏        | 43/354 [03:53<28:07,  5.43s/it][A

	loss_cls: tensor(0.6494, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2092, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8586, device='cuda:0', grad_fn=<AddBackward0>)



 12%|█▏        | 44/354 [03:58<28:01,  5.42s/it][A

	loss_cls: tensor(0.5943, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5177, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1120, device='cuda:0', grad_fn=<AddBackward0>)



 13%|█▎        | 45/354 [04:04<28:01,  5.44s/it][A

	loss_cls: tensor(0.6629, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1887, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8516, device='cuda:0', grad_fn=<AddBackward0>)



 13%|█▎        | 46/354 [04:09<27:50,  5.42s/it][A

	loss_cls: tensor(0.5102, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2413, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7515, device='cuda:0', grad_fn=<AddBackward0>)



 13%|█▎        | 47/354 [04:15<27:53,  5.45s/it][A

	loss_cls: tensor(0.5155, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4750, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9905, device='cuda:0', grad_fn=<AddBackward0>)



 14%|█▎        | 48/354 [04:20<27:43,  5.44s/it][A

	loss_cls: tensor(0.7480, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0848, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8328, device='cuda:0', grad_fn=<AddBackward0>)



 14%|█▍        | 49/354 [04:26<27:32,  5.42s/it][A

	loss_cls: tensor(0.5873, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0757, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6630, device='cuda:0', grad_fn=<AddBackward0>)



 14%|█▍        | 50/354 [04:31<27:28,  5.42s/it][A

	loss_cls: tensor(0.4739, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2883, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7622, device='cuda:0', grad_fn=<AddBackward0>)



 14%|█▍        | 51/354 [04:36<27:19,  5.41s/it][A

	loss_cls: tensor(0.4688, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4527, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9215, device='cuda:0', grad_fn=<AddBackward0>)



 15%|█▍        | 52/354 [04:42<27:18,  5.42s/it][A

	loss_cls: tensor(0.4474, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1166, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5640, device='cuda:0', grad_fn=<AddBackward0>)



 15%|█▍        | 53/354 [04:47<27:10,  5.42s/it][A

	loss_cls: tensor(0.6361, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0477, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6837, device='cuda:0', grad_fn=<AddBackward0>)



 15%|█▌        | 54/354 [04:53<27:11,  5.44s/it][A

	loss_cls: tensor(0.6264, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2754, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9018, device='cuda:0', grad_fn=<AddBackward0>)



 16%|█▌        | 55/354 [04:58<27:02,  5.43s/it][A

	loss_cls: tensor(0.3521, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1445, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4966, device='cuda:0', grad_fn=<AddBackward0>)



 16%|█▌        | 56/354 [05:03<26:53,  5.41s/it][A

	loss_cls: tensor(0.9639, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3109, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2748, device='cuda:0', grad_fn=<AddBackward0>)



 16%|█▌        | 57/354 [05:09<26:51,  5.43s/it][A

	loss_cls: tensor(1.2371, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1893, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.4265, device='cuda:0', grad_fn=<AddBackward0>)



 16%|█▋        | 58/354 [05:14<26:43,  5.42s/it][A

	loss_cls: tensor(0.7412, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3052, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0463, device='cuda:0', grad_fn=<AddBackward0>)



 17%|█▋        | 59/354 [05:20<26:42,  5.43s/it][A

	loss_cls: tensor(0.7340, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3639, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0979, device='cuda:0', grad_fn=<AddBackward0>)



 17%|█▋        | 60/354 [05:25<26:34,  5.43s/it][A

	loss_cls: tensor(0.6940, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3593, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0533, device='cuda:0', grad_fn=<AddBackward0>)



 17%|█▋        | 61/354 [05:31<26:35,  5.44s/it][A

	loss_cls: tensor(0.5096, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2784, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7880, device='cuda:0', grad_fn=<AddBackward0>)



 18%|█▊        | 62/354 [05:36<26:25,  5.43s/it][A

	loss_cls: tensor(0.8687, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2757, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1445, device='cuda:0', grad_fn=<AddBackward0>)



 18%|█▊        | 63/354 [05:41<26:15,  5.42s/it][A

	loss_cls: tensor(0.6225, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1715, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7939, device='cuda:0', grad_fn=<AddBackward0>)



 18%|█▊        | 64/354 [05:47<26:13,  5.43s/it][A

	loss_cls: tensor(0.6786, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0646, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7432, device='cuda:0', grad_fn=<AddBackward0>)



 18%|█▊        | 65/354 [05:52<26:02,  5.41s/it][A

	loss_cls: tensor(0.6269, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0595, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6863, device='cuda:0', grad_fn=<AddBackward0>)



 19%|█▊        | 66/354 [05:58<26:01,  5.42s/it][A

	loss_cls: tensor(0.6444, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3267, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9711, device='cuda:0', grad_fn=<AddBackward0>)



 19%|█▉        | 67/354 [06:03<25:52,  5.41s/it][A

	loss_cls: tensor(0.6212, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3507, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9718, device='cuda:0', grad_fn=<AddBackward0>)



 19%|█▉        | 68/354 [06:09<25:51,  5.42s/it][A

	loss_cls: tensor(0.5754, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2752, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8505, device='cuda:0', grad_fn=<AddBackward0>)



 19%|█▉        | 69/354 [06:14<25:42,  5.41s/it][A

	loss_cls: tensor(0.6770, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1105, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7876, device='cuda:0', grad_fn=<AddBackward0>)



 20%|█▉        | 70/354 [06:19<25:33,  5.40s/it][A

	loss_cls: tensor(0.5021, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4554, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9575, device='cuda:0', grad_fn=<AddBackward0>)



 20%|██        | 71/354 [06:25<25:32,  5.41s/it][A

	loss_cls: tensor(0.5430, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1910, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7340, device='cuda:0', grad_fn=<AddBackward0>)



 20%|██        | 72/354 [06:30<25:23,  5.40s/it][A

	loss_cls: tensor(0.5553, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1427, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6979, device='cuda:0', grad_fn=<AddBackward0>)



 21%|██        | 73/354 [06:36<25:22,  5.42s/it][A

	loss_cls: tensor(0.5464, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4723, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0187, device='cuda:0', grad_fn=<AddBackward0>)



 21%|██        | 74/354 [06:41<25:15,  5.41s/it][A

	loss_cls: tensor(0.4986, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2253, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7239, device='cuda:0', grad_fn=<AddBackward0>)



 21%|██        | 75/354 [06:46<25:16,  5.43s/it][A

	loss_cls: tensor(0.6662, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2058, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8720, device='cuda:0', grad_fn=<AddBackward0>)



 21%|██▏       | 76/354 [06:52<25:07,  5.42s/it][A

	loss_cls: tensor(0.6406, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3231, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9637, device='cuda:0', grad_fn=<AddBackward0>)



 22%|██▏       | 77/354 [06:57<24:58,  5.41s/it][A

	loss_cls: tensor(0.5435, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2874, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8309, device='cuda:0', grad_fn=<AddBackward0>)



 22%|██▏       | 78/354 [07:03<24:55,  5.42s/it][A

	loss_cls: tensor(0.7500, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2227, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9726, device='cuda:0', grad_fn=<AddBackward0>)



 22%|██▏       | 79/354 [07:08<24:45,  5.40s/it][A

	loss_cls: tensor(0.5075, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2226, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7301, device='cuda:0', grad_fn=<AddBackward0>)



 23%|██▎       | 80/354 [07:14<24:43,  5.42s/it][A

	loss_cls: tensor(0.3332, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3067, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6399, device='cuda:0', grad_fn=<AddBackward0>)



 23%|██▎       | 81/354 [07:19<24:35,  5.41s/it][A

	loss_cls: tensor(0.9602, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1479, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1081, device='cuda:0', grad_fn=<AddBackward0>)



 23%|██▎       | 82/354 [07:24<24:32,  5.41s/it][A

	loss_cls: tensor(0.8121, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1517, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9638, device='cuda:0', grad_fn=<AddBackward0>)



 23%|██▎       | 83/354 [07:30<24:22,  5.40s/it][A

	loss_cls: tensor(0.4305, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2008, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6312, device='cuda:0', grad_fn=<AddBackward0>)



 24%|██▎       | 84/354 [07:35<24:14,  5.39s/it][A

	loss_cls: tensor(0.5738, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1317, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7055, device='cuda:0', grad_fn=<AddBackward0>)



 24%|██▍       | 85/354 [07:41<24:17,  5.42s/it][A

	loss_cls: tensor(0.6025, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1527, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7552, device='cuda:0', grad_fn=<AddBackward0>)



 24%|██▍       | 86/354 [07:46<24:10,  5.41s/it][A

	loss_cls: tensor(0.6627, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1938, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8565, device='cuda:0', grad_fn=<AddBackward0>)



 25%|██▍       | 87/354 [07:51<24:13,  5.44s/it][A

	loss_cls: tensor(0.5050, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5105, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0154, device='cuda:0', grad_fn=<AddBackward0>)



 25%|██▍       | 88/354 [07:57<24:06,  5.44s/it][A

	loss_cls: tensor(0.8592, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.6498, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.5090, device='cuda:0', grad_fn=<AddBackward0>)



 25%|██▌       | 89/354 [08:02<24:05,  5.45s/it][A

	loss_cls: tensor(0.5829, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.6584, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2413, device='cuda:0', grad_fn=<AddBackward0>)



 25%|██▌       | 90/354 [08:08<23:56,  5.44s/it][A

	loss_cls: tensor(0.5524, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3501, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9025, device='cuda:0', grad_fn=<AddBackward0>)



 26%|██▌       | 91/354 [08:13<23:46,  5.42s/it][A

	loss_cls: tensor(0.7900, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1697, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9597, device='cuda:0', grad_fn=<AddBackward0>)



 26%|██▌       | 92/354 [08:19<23:45,  5.44s/it][A

	loss_cls: tensor(0.6261, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2296, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8557, device='cuda:0', grad_fn=<AddBackward0>)



 26%|██▋       | 93/354 [08:24<23:37,  5.43s/it][A

	loss_cls: tensor(0.4925, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2606, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7531, device='cuda:0', grad_fn=<AddBackward0>)



 27%|██▋       | 94/354 [08:30<23:37,  5.45s/it][A

	loss_cls: tensor(0.5829, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1573, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7402, device='cuda:0', grad_fn=<AddBackward0>)



 27%|██▋       | 95/354 [08:35<23:30,  5.44s/it][A

	loss_cls: tensor(0.7363, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1469, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8832, device='cuda:0', grad_fn=<AddBackward0>)



 27%|██▋       | 96/354 [08:40<23:27,  5.45s/it][A

	loss_cls: tensor(0.6446, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3370, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9816, device='cuda:0', grad_fn=<AddBackward0>)



 27%|██▋       | 97/354 [08:46<23:15,  5.43s/it][A

	loss_cls: tensor(0.5350, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2128, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7478, device='cuda:0', grad_fn=<AddBackward0>)



 28%|██▊       | 98/354 [08:51<23:07,  5.42s/it][A

	loss_cls: tensor(0.7876, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1951, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9827, device='cuda:0', grad_fn=<AddBackward0>)



 28%|██▊       | 99/354 [08:57<23:05,  5.43s/it][A

	loss_cls: tensor(0.5326, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2911, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8237, device='cuda:0', grad_fn=<AddBackward0>)



 28%|██▊       | 100/354 [09:02<22:56,  5.42s/it][A

	loss_cls: tensor(0.6594, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1113, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7707, device='cuda:0', grad_fn=<AddBackward0>)



 29%|██▊       | 101/354 [09:08<22:55,  5.44s/it][A

	loss_cls: tensor(0.6827, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1891, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8718, device='cuda:0', grad_fn=<AddBackward0>)



 29%|██▉       | 102/354 [09:13<22:46,  5.42s/it][A

	loss_cls: tensor(0.6083, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2579, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8662, device='cuda:0', grad_fn=<AddBackward0>)



 29%|██▉       | 103/354 [09:18<22:45,  5.44s/it][A

	loss_cls: tensor(0.5688, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0250, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5939, device='cuda:0', grad_fn=<AddBackward0>)



 29%|██▉       | 104/354 [09:24<22:35,  5.42s/it][A

	loss_cls: tensor(0.4820, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3216, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8036, device='cuda:0', grad_fn=<AddBackward0>)



 30%|██▉       | 105/354 [09:29<22:27,  5.41s/it][A

	loss_cls: tensor(0.5083, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1831, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6915, device='cuda:0', grad_fn=<AddBackward0>)



 30%|██▉       | 106/354 [09:35<22:25,  5.43s/it][A

	loss_cls: tensor(0.7717, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2641, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0358, device='cuda:0', grad_fn=<AddBackward0>)



 30%|███       | 107/354 [09:40<22:18,  5.42s/it][A

	loss_cls: tensor(0.4976, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2940, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7915, device='cuda:0', grad_fn=<AddBackward0>)



 31%|███       | 108/354 [09:46<22:19,  5.45s/it][A

	loss_cls: tensor(0.3967, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1609, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5576, device='cuda:0', grad_fn=<AddBackward0>)



 31%|███       | 109/354 [09:51<22:11,  5.44s/it][A

	loss_cls: tensor(0.3245, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1217, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4462, device='cuda:0', grad_fn=<AddBackward0>)



 31%|███       | 110/354 [09:56<22:09,  5.45s/it][A

	loss_cls: tensor(0.5729, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.7674, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.3402, device='cuda:0', grad_fn=<AddBackward0>)



 31%|███▏      | 111/354 [10:02<21:59,  5.43s/it][A

	loss_cls: tensor(0.7634, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3952, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1586, device='cuda:0', grad_fn=<AddBackward0>)



 32%|███▏      | 112/354 [10:07<21:50,  5.42s/it][A

	loss_cls: tensor(0.4737, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0534, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5270, device='cuda:0', grad_fn=<AddBackward0>)



 32%|███▏      | 113/354 [10:13<21:48,  5.43s/it][A

	loss_cls: tensor(0.5981, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.6231, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2213, device='cuda:0', grad_fn=<AddBackward0>)



 32%|███▏      | 114/354 [10:18<21:41,  5.42s/it][A

	loss_cls: tensor(0.4485, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1681, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6166, device='cuda:0', grad_fn=<AddBackward0>)



 32%|███▏      | 115/354 [10:24<21:41,  5.45s/it][A

	loss_cls: tensor(0.3993, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3709, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7702, device='cuda:0', grad_fn=<AddBackward0>)



 33%|███▎      | 116/354 [10:29<21:32,  5.43s/it][A

	loss_cls: tensor(0.9342, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2122, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1464, device='cuda:0', grad_fn=<AddBackward0>)



 33%|███▎      | 117/354 [10:34<21:30,  5.44s/it][A

	loss_cls: tensor(0.7789, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1702, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9490, device='cuda:0', grad_fn=<AddBackward0>)



 33%|███▎      | 118/354 [10:40<21:21,  5.43s/it][A

	loss_cls: tensor(0.6680, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1986, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8666, device='cuda:0', grad_fn=<AddBackward0>)



 34%|███▎      | 119/354 [10:45<21:14,  5.42s/it][A

	loss_cls: tensor(0.3192, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2892, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6084, device='cuda:0', grad_fn=<AddBackward0>)



 34%|███▍      | 120/354 [10:51<21:12,  5.44s/it][A

	loss_cls: tensor(0.7100, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0636, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7736, device='cuda:0', grad_fn=<AddBackward0>)



 34%|███▍      | 121/354 [10:56<21:03,  5.42s/it][A

	loss_cls: tensor(0.6601, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1680, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8281, device='cuda:0', grad_fn=<AddBackward0>)



 34%|███▍      | 122/354 [11:02<21:01,  5.44s/it][A

	loss_cls: tensor(0.5747, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1271, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7018, device='cuda:0', grad_fn=<AddBackward0>)



 35%|███▍      | 123/354 [11:07<20:49,  5.41s/it][A

	loss_cls: tensor(0.5579, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1209, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6788, device='cuda:0', grad_fn=<AddBackward0>)



 35%|███▌      | 124/354 [11:12<20:47,  5.42s/it][A

	loss_cls: tensor(0.6559, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2718, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9277, device='cuda:0', grad_fn=<AddBackward0>)



 35%|███▌      | 125/354 [11:18<20:38,  5.41s/it][A

	loss_cls: tensor(0.4957, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2170, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7127, device='cuda:0', grad_fn=<AddBackward0>)



 36%|███▌      | 126/354 [11:23<20:32,  5.40s/it][A

	loss_cls: tensor(0.8106, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2409, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0516, device='cuda:0', grad_fn=<AddBackward0>)



 36%|███▌      | 127/354 [11:29<20:32,  5.43s/it][A

	loss_cls: tensor(0.8626, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1755, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0381, device='cuda:0', grad_fn=<AddBackward0>)



 36%|███▌      | 128/354 [11:34<20:24,  5.42s/it][A

	loss_cls: tensor(0.8087, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3508, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1595, device='cuda:0', grad_fn=<AddBackward0>)



 36%|███▋      | 129/354 [11:39<20:21,  5.43s/it][A

	loss_cls: tensor(0.1990, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0384, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.2374, device='cuda:0', grad_fn=<AddBackward0>)



 37%|███▋      | 130/354 [11:45<20:10,  5.40s/it][A

	loss_cls: tensor(0.7440, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3009, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0449, device='cuda:0', grad_fn=<AddBackward0>)



 37%|███▋      | 131/354 [11:50<20:09,  5.42s/it][A

	loss_cls: tensor(0.2494, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0155, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.2649, device='cuda:0', grad_fn=<AddBackward0>)



 37%|███▋      | 132/354 [11:56<20:00,  5.41s/it][A

	loss_cls: tensor(0.3718, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1500, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5217, device='cuda:0', grad_fn=<AddBackward0>)



 38%|███▊      | 133/354 [12:01<19:58,  5.42s/it][A

	loss_cls: tensor(0.6615, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4693, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1308, device='cuda:0', grad_fn=<AddBackward0>)



 38%|███▊      | 134/354 [12:07<19:50,  5.41s/it][A

	loss_cls: tensor(0.4366, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2241, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6608, device='cuda:0', grad_fn=<AddBackward0>)



 38%|███▊      | 135/354 [12:12<19:44,  5.41s/it][A

	loss_cls: tensor(0.8622, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3687, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2309, device='cuda:0', grad_fn=<AddBackward0>)



 38%|███▊      | 136/354 [12:17<19:42,  5.42s/it][A

	loss_cls: tensor(0.6394, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2848, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9242, device='cuda:0', grad_fn=<AddBackward0>)



 39%|███▊      | 137/354 [12:23<19:34,  5.41s/it][A

	loss_cls: tensor(0.3600, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1137, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4736, device='cuda:0', grad_fn=<AddBackward0>)



 39%|███▉      | 138/354 [12:28<19:31,  5.43s/it][A

	loss_cls: tensor(0.5489, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5462, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0951, device='cuda:0', grad_fn=<AddBackward0>)



 39%|███▉      | 139/354 [12:34<19:23,  5.41s/it][A

	loss_cls: tensor(0.6255, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2055, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8310, device='cuda:0', grad_fn=<AddBackward0>)



 40%|███▉      | 140/354 [12:39<19:18,  5.41s/it][A

	loss_cls: tensor(0.4627, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1591, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6218, device='cuda:0', grad_fn=<AddBackward0>)



 40%|███▉      | 141/354 [12:44<19:13,  5.41s/it][A

	loss_cls: tensor(0.8090, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1925, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0014, device='cuda:0', grad_fn=<AddBackward0>)



 40%|████      | 142/354 [12:50<19:05,  5.40s/it][A

	loss_cls: tensor(0.5010, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1587, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6597, device='cuda:0', grad_fn=<AddBackward0>)



 40%|████      | 143/354 [12:55<19:02,  5.41s/it][A

	loss_cls: tensor(0.3725, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1705, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5429, device='cuda:0', grad_fn=<AddBackward0>)



 41%|████      | 144/354 [13:01<18:52,  5.39s/it][A

	loss_cls: tensor(0.6651, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2787, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9438, device='cuda:0', grad_fn=<AddBackward0>)



 41%|████      | 145/354 [13:06<18:49,  5.40s/it][A

	loss_cls: tensor(0.9490, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1562, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1052, device='cuda:0', grad_fn=<AddBackward0>)



 41%|████      | 146/354 [13:11<18:39,  5.38s/it][A

	loss_cls: tensor(0.3676, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1607, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5283, device='cuda:0', grad_fn=<AddBackward0>)



 42%|████▏     | 147/354 [13:17<18:40,  5.41s/it][A

	loss_cls: tensor(0.4367, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2899, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7266, device='cuda:0', grad_fn=<AddBackward0>)



 42%|████▏     | 148/354 [13:22<18:33,  5.41s/it][A

	loss_cls: tensor(0.7954, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0884, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8838, device='cuda:0', grad_fn=<AddBackward0>)



 42%|████▏     | 149/354 [13:28<18:26,  5.40s/it][A

	loss_cls: tensor(0.6671, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4353, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1024, device='cuda:0', grad_fn=<AddBackward0>)



 42%|████▏     | 150/354 [13:33<18:25,  5.42s/it][A

	loss_cls: tensor(0.5004, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0321, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5325, device='cuda:0', grad_fn=<AddBackward0>)



 43%|████▎     | 151/354 [13:38<18:16,  5.40s/it][A

	loss_cls: tensor(0.8085, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1276, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9361, device='cuda:0', grad_fn=<AddBackward0>)



 43%|████▎     | 152/354 [13:44<18:13,  5.41s/it][A

	loss_cls: tensor(0.6323, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1885, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8208, device='cuda:0', grad_fn=<AddBackward0>)



 43%|████▎     | 153/354 [13:49<18:04,  5.40s/it][A

	loss_cls: tensor(0.3338, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0429, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.3766, device='cuda:0', grad_fn=<AddBackward0>)



 44%|████▎     | 154/354 [13:55<18:04,  5.42s/it][A

	loss_cls: tensor(0.5313, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2379, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7693, device='cuda:0', grad_fn=<AddBackward0>)



 44%|████▍     | 155/354 [14:00<17:54,  5.40s/it][A

	loss_cls: tensor(0.7137, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4439, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1576, device='cuda:0', grad_fn=<AddBackward0>)



 44%|████▍     | 156/354 [14:05<17:47,  5.39s/it][A

	loss_cls: tensor(0.8423, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2766, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1189, device='cuda:0', grad_fn=<AddBackward0>)



 44%|████▍     | 157/354 [14:11<17:44,  5.40s/it][A

	loss_cls: tensor(0.3296, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0297, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.3593, device='cuda:0', grad_fn=<AddBackward0>)



 45%|████▍     | 158/354 [14:16<17:36,  5.39s/it][A

	loss_cls: tensor(0.7764, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2825, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0589, device='cuda:0', grad_fn=<AddBackward0>)



 45%|████▍     | 159/354 [14:22<17:34,  5.41s/it][A

	loss_cls: tensor(0.3573, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1560, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5133, device='cuda:0', grad_fn=<AddBackward0>)



 45%|████▌     | 160/354 [14:27<17:26,  5.40s/it][A

	loss_cls: tensor(0.5566, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3716, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9282, device='cuda:0', grad_fn=<AddBackward0>)



 45%|████▌     | 161/354 [14:33<17:25,  5.42s/it][A

	loss_cls: tensor(0.7605, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1801, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9405, device='cuda:0', grad_fn=<AddBackward0>)



 46%|████▌     | 162/354 [14:38<17:17,  5.40s/it][A

	loss_cls: tensor(0.7541, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2694, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0235, device='cuda:0', grad_fn=<AddBackward0>)



 46%|████▌     | 163/354 [14:43<17:09,  5.39s/it][A

	loss_cls: tensor(0.6845, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2466, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9311, device='cuda:0', grad_fn=<AddBackward0>)



 46%|████▋     | 164/354 [14:49<17:06,  5.40s/it][A

	loss_cls: tensor(0.7019, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3696, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0714, device='cuda:0', grad_fn=<AddBackward0>)



 47%|████▋     | 165/354 [14:54<16:59,  5.39s/it][A

	loss_cls: tensor(0.5570, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0967, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6537, device='cuda:0', grad_fn=<AddBackward0>)



 47%|████▋     | 166/354 [14:59<16:56,  5.41s/it][A

	loss_cls: tensor(0.5473, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3442, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8914, device='cuda:0', grad_fn=<AddBackward0>)



 47%|████▋     | 167/354 [15:05<16:49,  5.40s/it][A

	loss_cls: tensor(0.6341, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2012, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8353, device='cuda:0', grad_fn=<AddBackward0>)



 47%|████▋     | 168/354 [15:10<16:45,  5.41s/it][A

	loss_cls: tensor(0.5167, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2502, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7669, device='cuda:0', grad_fn=<AddBackward0>)



 48%|████▊     | 169/354 [15:16<16:38,  5.39s/it][A

	loss_cls: tensor(0.5735, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4004, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9739, device='cuda:0', grad_fn=<AddBackward0>)



 48%|████▊     | 170/354 [15:21<16:30,  5.38s/it][A

	loss_cls: tensor(0.9179, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1960, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1139, device='cuda:0', grad_fn=<AddBackward0>)



 48%|████▊     | 171/354 [15:26<16:28,  5.40s/it][A

	loss_cls: tensor(0.5962, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1223, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7185, device='cuda:0', grad_fn=<AddBackward0>)



 49%|████▊     | 172/354 [15:32<16:19,  5.38s/it][A

	loss_cls: tensor(0.5590, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4454, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0044, device='cuda:0', grad_fn=<AddBackward0>)



 49%|████▉     | 173/354 [15:37<16:17,  5.40s/it][A

	loss_cls: tensor(0.6884, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0452, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7336, device='cuda:0', grad_fn=<AddBackward0>)



 49%|████▉     | 174/354 [15:43<16:10,  5.39s/it][A

	loss_cls: tensor(0.5798, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1906, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7704, device='cuda:0', grad_fn=<AddBackward0>)



 49%|████▉     | 175/354 [15:48<16:08,  5.41s/it][A

	loss_cls: tensor(0.8420, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1278, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9698, device='cuda:0', grad_fn=<AddBackward0>)



 50%|████▉     | 176/354 [15:53<16:00,  5.40s/it][A

	loss_cls: tensor(0.5657, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2887, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8543, device='cuda:0', grad_fn=<AddBackward0>)



 50%|█████     | 177/354 [15:59<15:54,  5.39s/it][A

	loss_cls: tensor(0.4538, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2012, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6550, device='cuda:0', grad_fn=<AddBackward0>)



 50%|█████     | 178/354 [16:04<15:51,  5.41s/it][A

	loss_cls: tensor(0.5453, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1355, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6807, device='cuda:0', grad_fn=<AddBackward0>)



 51%|█████     | 179/354 [16:10<15:42,  5.38s/it][A

	loss_cls: tensor(0.4934, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0971, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5905, device='cuda:0', grad_fn=<AddBackward0>)



 51%|█████     | 180/354 [16:15<15:40,  5.41s/it][A

	loss_cls: tensor(0.5516, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4495, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0010, device='cuda:0', grad_fn=<AddBackward0>)



 51%|█████     | 181/354 [16:20<15:33,  5.40s/it][A

	loss_cls: tensor(0.4401, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3643, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8044, device='cuda:0', grad_fn=<AddBackward0>)



 51%|█████▏    | 182/354 [16:26<15:31,  5.41s/it][A

	loss_cls: tensor(0.2574, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0621, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.3195, device='cuda:0', grad_fn=<AddBackward0>)



 52%|█████▏    | 183/354 [16:31<15:22,  5.40s/it][A

	loss_cls: tensor(0.5912, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2252, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8164, device='cuda:0', grad_fn=<AddBackward0>)



 52%|█████▏    | 184/354 [16:37<15:15,  5.38s/it][A

	loss_cls: tensor(0.4017, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0503, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4519, device='cuda:0', grad_fn=<AddBackward0>)



 52%|█████▏    | 185/354 [16:42<15:12,  5.40s/it][A

	loss_cls: tensor(0.7121, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3597, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0718, device='cuda:0', grad_fn=<AddBackward0>)



 53%|█████▎    | 186/354 [16:47<15:03,  5.38s/it][A

	loss_cls: tensor(0.4461, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0601, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5062, device='cuda:0', grad_fn=<AddBackward0>)



 53%|█████▎    | 187/354 [16:53<15:02,  5.40s/it][A

	loss_cls: tensor(0.6995, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3187, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0182, device='cuda:0', grad_fn=<AddBackward0>)



 53%|█████▎    | 188/354 [16:58<14:54,  5.39s/it][A

	loss_cls: tensor(0.5304, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1573, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6877, device='cuda:0', grad_fn=<AddBackward0>)



 53%|█████▎    | 189/354 [17:04<14:51,  5.40s/it][A

	loss_cls: tensor(0.7587, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1996, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9583, device='cuda:0', grad_fn=<AddBackward0>)



 54%|█████▎    | 190/354 [17:09<14:44,  5.39s/it][A

	loss_cls: tensor(0.9499, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1166, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0664, device='cuda:0', grad_fn=<AddBackward0>)



 54%|█████▍    | 191/354 [17:14<14:36,  5.38s/it][A

	loss_cls: tensor(0.7279, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4863, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2143, device='cuda:0', grad_fn=<AddBackward0>)



 54%|█████▍    | 192/354 [17:20<14:33,  5.39s/it][A

	loss_cls: tensor(0.3180, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2415, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5595, device='cuda:0', grad_fn=<AddBackward0>)



 55%|█████▍    | 193/354 [17:25<14:26,  5.38s/it][A

	loss_cls: tensor(0.6790, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1247, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8038, device='cuda:0', grad_fn=<AddBackward0>)



 55%|█████▍    | 194/354 [17:31<14:25,  5.41s/it][A

	loss_cls: tensor(0.7544, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5161, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2705, device='cuda:0', grad_fn=<AddBackward0>)



 55%|█████▌    | 195/354 [17:36<14:17,  5.40s/it][A

	loss_cls: tensor(0.4308, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1204, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5512, device='cuda:0', grad_fn=<AddBackward0>)



 55%|█████▌    | 196/354 [17:41<14:15,  5.42s/it][A

	loss_cls: tensor(0.4686, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1284, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5969, device='cuda:0', grad_fn=<AddBackward0>)



 56%|█████▌    | 197/354 [17:47<14:07,  5.40s/it][A

	loss_cls: tensor(0.4575, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0809, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5384, device='cuda:0', grad_fn=<AddBackward0>)



 56%|█████▌    | 198/354 [17:52<14:00,  5.39s/it][A

	loss_cls: tensor(1.1364, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1556, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2920, device='cuda:0', grad_fn=<AddBackward0>)



 56%|█████▌    | 199/354 [17:58<13:56,  5.40s/it][A

	loss_cls: tensor(0.5335, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0544, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5879, device='cuda:0', grad_fn=<AddBackward0>)



 56%|█████▋    | 200/354 [18:03<13:49,  5.39s/it][A

	loss_cls: tensor(0.4765, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1628, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6394, device='cuda:0', grad_fn=<AddBackward0>)



 57%|█████▋    | 201/354 [18:08<13:47,  5.41s/it][A

	loss_cls: tensor(0.4556, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1951, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6507, device='cuda:0', grad_fn=<AddBackward0>)



 57%|█████▋    | 202/354 [18:14<13:40,  5.39s/it][A

	loss_cls: tensor(0.7674, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3380, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1054, device='cuda:0', grad_fn=<AddBackward0>)



 57%|█████▋    | 203/354 [18:19<13:36,  5.41s/it][A

	loss_cls: tensor(0.5675, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0245, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5920, device='cuda:0', grad_fn=<AddBackward0>)



 58%|█████▊    | 204/354 [18:25<13:29,  5.40s/it][A

	loss_cls: tensor(0.4594, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0256, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4850, device='cuda:0', grad_fn=<AddBackward0>)



 58%|█████▊    | 205/354 [18:30<13:22,  5.38s/it][A

	loss_cls: tensor(0.6173, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1741, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7914, device='cuda:0', grad_fn=<AddBackward0>)



 58%|█████▊    | 206/354 [18:35<13:18,  5.39s/it][A

	loss_cls: tensor(0.7366, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1129, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8496, device='cuda:0', grad_fn=<AddBackward0>)



 58%|█████▊    | 207/354 [18:41<13:11,  5.38s/it][A

	loss_cls: tensor(0.8201, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1900, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0101, device='cuda:0', grad_fn=<AddBackward0>)



 59%|█████▉    | 208/354 [18:46<13:08,  5.40s/it][A

	loss_cls: tensor(0.3306, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3063, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6369, device='cuda:0', grad_fn=<AddBackward0>)



 59%|█████▉    | 209/354 [18:51<13:02,  5.40s/it][A

	loss_cls: tensor(0.2474, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0193, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.2666, device='cuda:0', grad_fn=<AddBackward0>)



 59%|█████▉    | 210/354 [18:57<12:58,  5.40s/it][A

	loss_cls: tensor(1.0439, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5791, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.6229, device='cuda:0', grad_fn=<AddBackward0>)



 60%|█████▉    | 211/354 [19:02<12:51,  5.39s/it][A

	loss_cls: tensor(0.5227, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0888, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6116, device='cuda:0', grad_fn=<AddBackward0>)



 60%|█████▉    | 212/354 [19:08<12:44,  5.39s/it][A

	loss_cls: tensor(0.2122, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0359, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.2482, device='cuda:0', grad_fn=<AddBackward0>)



 60%|██████    | 213/354 [19:13<12:41,  5.40s/it][A

	loss_cls: tensor(0.4769, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1065, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5834, device='cuda:0', grad_fn=<AddBackward0>)



 60%|██████    | 214/354 [19:18<12:36,  5.40s/it][A

	loss_cls: tensor(1.1009, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0759, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1768, device='cuda:0', grad_fn=<AddBackward0>)



 61%|██████    | 215/354 [19:24<12:32,  5.42s/it][A

	loss_cls: tensor(0.9879, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4767, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.4646, device='cuda:0', grad_fn=<AddBackward0>)



 61%|██████    | 216/354 [19:29<12:24,  5.40s/it][A

	loss_cls: tensor(0.7770, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1996, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9766, device='cuda:0', grad_fn=<AddBackward0>)



 61%|██████▏   | 217/354 [19:35<12:20,  5.41s/it][A

	loss_cls: tensor(0.4312, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3229, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7541, device='cuda:0', grad_fn=<AddBackward0>)



 62%|██████▏   | 218/354 [19:40<12:12,  5.39s/it][A

	loss_cls: tensor(0.7245, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3575, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0820, device='cuda:0', grad_fn=<AddBackward0>)



 62%|██████▏   | 219/354 [19:45<12:04,  5.37s/it][A

	loss_cls: tensor(0.6053, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1172, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7225, device='cuda:0', grad_fn=<AddBackward0>)



 62%|██████▏   | 220/354 [19:51<12:02,  5.39s/it][A

	loss_cls: tensor(0.7075, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2190, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9265, device='cuda:0', grad_fn=<AddBackward0>)



 62%|██████▏   | 221/354 [19:56<11:55,  5.38s/it][A

	loss_cls: tensor(0.7550, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1898, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9447, device='cuda:0', grad_fn=<AddBackward0>)



 63%|██████▎   | 222/354 [20:02<11:53,  5.40s/it][A

	loss_cls: tensor(0.5767, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0874, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6640, device='cuda:0', grad_fn=<AddBackward0>)



 63%|██████▎   | 223/354 [20:07<11:45,  5.39s/it][A

	loss_cls: tensor(0.6411, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3642, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0053, device='cuda:0', grad_fn=<AddBackward0>)



 63%|██████▎   | 224/354 [20:12<11:42,  5.41s/it][A

	loss_cls: tensor(0.5644, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3983, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9627, device='cuda:0', grad_fn=<AddBackward0>)



 64%|██████▎   | 225/354 [20:18<11:35,  5.39s/it][A

	loss_cls: tensor(0.5211, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2216, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7427, device='cuda:0', grad_fn=<AddBackward0>)



 64%|██████▍   | 226/354 [20:23<11:28,  5.38s/it][A

	loss_cls: tensor(0.5262, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1982, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7245, device='cuda:0', grad_fn=<AddBackward0>)



 64%|██████▍   | 227/354 [20:29<11:26,  5.41s/it][A

	loss_cls: tensor(0.6016, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1509, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7524, device='cuda:0', grad_fn=<AddBackward0>)



 64%|██████▍   | 228/354 [20:34<11:21,  5.40s/it][A

	loss_cls: tensor(0.5643, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0756, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6399, device='cuda:0', grad_fn=<AddBackward0>)



 65%|██████▍   | 229/354 [20:40<11:18,  5.43s/it][A

	loss_cls: tensor(0.5109, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1138, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6247, device='cuda:0', grad_fn=<AddBackward0>)



 65%|██████▍   | 230/354 [20:45<11:10,  5.41s/it][A

	loss_cls: tensor(0.6778, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3341, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0120, device='cuda:0', grad_fn=<AddBackward0>)



 65%|██████▌   | 231/354 [20:50<11:07,  5.42s/it][A

	loss_cls: tensor(0.5527, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3842, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9369, device='cuda:0', grad_fn=<AddBackward0>)



 66%|██████▌   | 232/354 [20:56<10:59,  5.41s/it][A

	loss_cls: tensor(0.6155, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3440, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9594, device='cuda:0', grad_fn=<AddBackward0>)



 66%|██████▌   | 233/354 [21:01<10:55,  5.42s/it][A

	loss_cls: tensor(0.6818, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3293, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0112, device='cuda:0', grad_fn=<AddBackward0>)



 66%|██████▌   | 234/354 [21:07<10:48,  5.41s/it][A

	loss_cls: tensor(0.4429, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2120, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6549, device='cuda:0', grad_fn=<AddBackward0>)



 66%|██████▋   | 235/354 [21:12<10:43,  5.41s/it][A

	loss_cls: tensor(0.5405, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2150, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7555, device='cuda:0', grad_fn=<AddBackward0>)



 67%|██████▋   | 236/354 [21:17<10:40,  5.43s/it][A

	loss_cls: tensor(0.7661, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2000, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9661, device='cuda:0', grad_fn=<AddBackward0>)



 67%|██████▋   | 237/354 [21:23<10:32,  5.41s/it][A

	loss_cls: tensor(0.8789, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1976, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0765, device='cuda:0', grad_fn=<AddBackward0>)



 67%|██████▋   | 238/354 [21:28<10:29,  5.42s/it][A

	loss_cls: tensor(0.4515, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4835, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9350, device='cuda:0', grad_fn=<AddBackward0>)



 68%|██████▊   | 239/354 [21:34<10:23,  5.42s/it][A

	loss_cls: tensor(0.6680, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2862, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9541, device='cuda:0', grad_fn=<AddBackward0>)



 68%|██████▊   | 240/354 [21:39<10:21,  5.45s/it][A

	loss_cls: tensor(0.5696, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1160, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6855, device='cuda:0', grad_fn=<AddBackward0>)



 68%|██████▊   | 241/354 [21:45<10:16,  5.46s/it][A

	loss_cls: tensor(0.5429, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2330, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7759, device='cuda:0', grad_fn=<AddBackward0>)



 68%|██████▊   | 242/354 [21:50<10:11,  5.46s/it][A

	loss_cls: tensor(0.7587, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3389, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0975, device='cuda:0', grad_fn=<AddBackward0>)



 69%|██████▊   | 243/354 [21:56<10:08,  5.49s/it][A

	loss_cls: tensor(0.5397, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2502, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7898, device='cuda:0', grad_fn=<AddBackward0>)



 69%|██████▉   | 244/354 [22:01<10:02,  5.48s/it][A

	loss_cls: tensor(0.5576, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4707, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0283, device='cuda:0', grad_fn=<AddBackward0>)



 69%|██████▉   | 245/354 [22:07<09:57,  5.48s/it][A

	loss_cls: tensor(0.5468, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1582, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7050, device='cuda:0', grad_fn=<AddBackward0>)



 69%|██████▉   | 246/354 [22:12<09:51,  5.47s/it][A

	loss_cls: tensor(0.7061, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0705, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7766, device='cuda:0', grad_fn=<AddBackward0>)



 70%|██████▉   | 247/354 [22:18<09:48,  5.50s/it][A

	loss_cls: tensor(0.5514, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2608, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8122, device='cuda:0', grad_fn=<AddBackward0>)



 70%|███████   | 248/354 [22:23<09:42,  5.50s/it][A

	loss_cls: tensor(0.5697, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2326, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8024, device='cuda:0', grad_fn=<AddBackward0>)



 70%|███████   | 249/354 [22:29<09:36,  5.49s/it][A

	loss_cls: tensor(0.7015, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5066, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2081, device='cuda:0', grad_fn=<AddBackward0>)



 71%|███████   | 250/354 [22:34<09:33,  5.51s/it][A

	loss_cls: tensor(0.6204, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2284, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8488, device='cuda:0', grad_fn=<AddBackward0>)



 71%|███████   | 251/354 [22:40<09:25,  5.49s/it][A

	loss_cls: tensor(0.5903, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2085, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7988, device='cuda:0', grad_fn=<AddBackward0>)



 71%|███████   | 252/354 [22:45<09:21,  5.51s/it][A

	loss_cls: tensor(0.5856, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1290, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7146, device='cuda:0', grad_fn=<AddBackward0>)



 71%|███████▏  | 253/354 [22:51<09:14,  5.49s/it][A

	loss_cls: tensor(0.7239, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3207, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0445, device='cuda:0', grad_fn=<AddBackward0>)



 72%|███████▏  | 254/354 [22:56<09:09,  5.50s/it][A

	loss_cls: tensor(0.6606, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1317, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7922, device='cuda:0', grad_fn=<AddBackward0>)



 72%|███████▏  | 255/354 [23:02<09:03,  5.49s/it][A

	loss_cls: tensor(0.9421, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0885, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0306, device='cuda:0', grad_fn=<AddBackward0>)



 72%|███████▏  | 256/354 [23:07<08:57,  5.48s/it][A

	loss_cls: tensor(0.8035, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1481, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9516, device='cuda:0', grad_fn=<AddBackward0>)



 73%|███████▎  | 257/354 [23:13<08:53,  5.50s/it][A

	loss_cls: tensor(0.8103, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0955, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9057, device='cuda:0', grad_fn=<AddBackward0>)



 73%|███████▎  | 258/354 [23:18<08:47,  5.49s/it][A

	loss_cls: tensor(0.5448, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2976, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8424, device='cuda:0', grad_fn=<AddBackward0>)



 73%|███████▎  | 259/354 [23:24<08:42,  5.50s/it][A

	loss_cls: tensor(0.9613, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1275, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0888, device='cuda:0', grad_fn=<AddBackward0>)



 73%|███████▎  | 260/354 [23:29<08:36,  5.49s/it][A

	loss_cls: tensor(0.9316, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1790, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1106, device='cuda:0', grad_fn=<AddBackward0>)



 74%|███████▎  | 261/354 [23:35<08:32,  5.51s/it][A

	loss_cls: tensor(0.5188, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1272, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6460, device='cuda:0', grad_fn=<AddBackward0>)



 74%|███████▍  | 262/354 [23:40<08:25,  5.49s/it][A

	loss_cls: tensor(0.5793, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2560, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8353, device='cuda:0', grad_fn=<AddBackward0>)



 74%|███████▍  | 263/354 [23:46<08:18,  5.48s/it][A

	loss_cls: tensor(0.5095, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0614, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5708, device='cuda:0', grad_fn=<AddBackward0>)



 75%|███████▍  | 264/354 [23:51<08:15,  5.50s/it][A

	loss_cls: tensor(0.5247, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1597, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6844, device='cuda:0', grad_fn=<AddBackward0>)



 75%|███████▍  | 265/354 [23:57<08:08,  5.49s/it][A

	loss_cls: tensor(0.3936, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0879, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4815, device='cuda:0', grad_fn=<AddBackward0>)



 75%|███████▌  | 266/354 [24:02<08:04,  5.50s/it][A

	loss_cls: tensor(0.7206, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3053, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0259, device='cuda:0', grad_fn=<AddBackward0>)



 75%|███████▌  | 267/354 [24:08<07:57,  5.49s/it][A

	loss_cls: tensor(0.5302, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2431, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7733, device='cuda:0', grad_fn=<AddBackward0>)



 76%|███████▌  | 268/354 [24:13<07:52,  5.50s/it][A

	loss_cls: tensor(0.5297, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1706, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7003, device='cuda:0', grad_fn=<AddBackward0>)



 76%|███████▌  | 269/354 [24:19<07:46,  5.49s/it][A

	loss_cls: tensor(0.4150, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2030, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6180, device='cuda:0', grad_fn=<AddBackward0>)



 76%|███████▋  | 270/354 [24:24<07:40,  5.48s/it][A

	loss_cls: tensor(0.5821, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2674, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8495, device='cuda:0', grad_fn=<AddBackward0>)



 77%|███████▋  | 271/354 [24:30<07:37,  5.51s/it][A

	loss_cls: tensor(0.7896, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4242, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2138, device='cuda:0', grad_fn=<AddBackward0>)



 77%|███████▋  | 272/354 [24:35<07:30,  5.49s/it][A

	loss_cls: tensor(0.5525, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1836, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7362, device='cuda:0', grad_fn=<AddBackward0>)



 77%|███████▋  | 273/354 [24:41<07:26,  5.52s/it][A

	loss_cls: tensor(0.7474, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3489, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0964, device='cuda:0', grad_fn=<AddBackward0>)



 77%|███████▋  | 274/354 [24:46<07:21,  5.51s/it][A

	loss_cls: tensor(0.5728, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2866, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8595, device='cuda:0', grad_fn=<AddBackward0>)



 78%|███████▊  | 275/354 [24:52<07:15,  5.52s/it][A

	loss_cls: tensor(0.3923, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2269, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6192, device='cuda:0', grad_fn=<AddBackward0>)



 78%|███████▊  | 276/354 [24:57<07:08,  5.49s/it][A

	loss_cls: tensor(0.5735, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1036, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6771, device='cuda:0', grad_fn=<AddBackward0>)



 78%|███████▊  | 277/354 [25:02<07:01,  5.48s/it][A

	loss_cls: tensor(0.7692, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4023, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1715, device='cuda:0', grad_fn=<AddBackward0>)



 79%|███████▊  | 278/354 [25:08<06:56,  5.49s/it][A

	loss_cls: tensor(0.7575, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4367, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1942, device='cuda:0', grad_fn=<AddBackward0>)



 79%|███████▉  | 279/354 [25:13<06:50,  5.47s/it][A

	loss_cls: tensor(0.6095, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2890, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8985, device='cuda:0', grad_fn=<AddBackward0>)



 79%|███████▉  | 280/354 [25:19<06:47,  5.50s/it][A

	loss_cls: tensor(0.6921, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3274, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0195, device='cuda:0', grad_fn=<AddBackward0>)



 79%|███████▉  | 281/354 [25:24<06:40,  5.48s/it][A

	loss_cls: tensor(0.6132, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0943, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7075, device='cuda:0', grad_fn=<AddBackward0>)



 80%|███████▉  | 282/354 [25:30<06:36,  5.50s/it][A

	loss_cls: tensor(0.5955, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1179, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7134, device='cuda:0', grad_fn=<AddBackward0>)



 80%|███████▉  | 283/354 [25:35<06:29,  5.49s/it][A

	loss_cls: tensor(0.4878, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3855, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8733, device='cuda:0', grad_fn=<AddBackward0>)



 80%|████████  | 284/354 [25:41<06:23,  5.48s/it][A

	loss_cls: tensor(0.6517, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1557, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8074, device='cuda:0', grad_fn=<AddBackward0>)



 81%|████████  | 285/354 [25:46<06:18,  5.49s/it][A

	loss_cls: tensor(0.5778, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1566, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7344, device='cuda:0', grad_fn=<AddBackward0>)



 81%|████████  | 286/354 [25:52<06:12,  5.48s/it][A

	loss_cls: tensor(0.6516, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4146, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0662, device='cuda:0', grad_fn=<AddBackward0>)



 81%|████████  | 287/354 [25:57<06:08,  5.50s/it][A

	loss_cls: tensor(0.4668, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2768, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7436, device='cuda:0', grad_fn=<AddBackward0>)



 81%|████████▏ | 288/354 [26:03<06:02,  5.49s/it][A

	loss_cls: tensor(0.5946, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2528, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8474, device='cuda:0', grad_fn=<AddBackward0>)



 82%|████████▏ | 289/354 [26:08<05:57,  5.49s/it][A

	loss_cls: tensor(0.6447, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1610, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8057, device='cuda:0', grad_fn=<AddBackward0>)



 82%|████████▏ | 290/354 [26:14<05:50,  5.47s/it][A

	loss_cls: tensor(0.5075, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3540, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8615, device='cuda:0', grad_fn=<AddBackward0>)



 82%|████████▏ | 291/354 [26:19<05:44,  5.47s/it][A

	loss_cls: tensor(1.1219, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0822, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2041, device='cuda:0', grad_fn=<AddBackward0>)



 82%|████████▏ | 292/354 [26:25<05:40,  5.49s/it][A

	loss_cls: tensor(0.8760, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3768, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2528, device='cuda:0', grad_fn=<AddBackward0>)



 83%|████████▎ | 293/354 [26:30<05:33,  5.47s/it][A

	loss_cls: tensor(0.5908, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3855, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9764, device='cuda:0', grad_fn=<AddBackward0>)



 83%|████████▎ | 294/354 [26:36<05:30,  5.50s/it][A

	loss_cls: tensor(0.6080, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4155, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0235, device='cuda:0', grad_fn=<AddBackward0>)



 83%|████████▎ | 295/354 [26:41<05:23,  5.49s/it][A

	loss_cls: tensor(0.6469, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2576, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9045, device='cuda:0', grad_fn=<AddBackward0>)



 84%|████████▎ | 296/354 [26:47<05:18,  5.50s/it][A

	loss_cls: tensor(0.7967, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5430, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.3397, device='cuda:0', grad_fn=<AddBackward0>)



 84%|████████▍ | 297/354 [26:52<05:12,  5.48s/it][A

	loss_cls: tensor(0.6511, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2308, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8819, device='cuda:0', grad_fn=<AddBackward0>)



 84%|████████▍ | 298/354 [26:58<05:05,  5.46s/it][A

	loss_cls: tensor(0.4497, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4576, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9072, device='cuda:0', grad_fn=<AddBackward0>)



 84%|████████▍ | 299/354 [27:03<05:01,  5.47s/it][A

	loss_cls: tensor(0.8221, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1832, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0053, device='cuda:0', grad_fn=<AddBackward0>)



 85%|████████▍ | 300/354 [27:09<04:55,  5.47s/it][A

	loss_cls: tensor(0.6071, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2724, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8795, device='cuda:0', grad_fn=<AddBackward0>)



 85%|████████▌ | 301/354 [27:14<04:43,  5.35s/it][A

	loss_cls: tensor(0.6166, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3960, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0126, device='cuda:0', grad_fn=<AddBackward0>)



 85%|████████▌ | 302/354 [27:19<04:32,  5.25s/it][A

	loss_cls: tensor(0.6092, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2555, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8647, device='cuda:0', grad_fn=<AddBackward0>)



 86%|████████▌ | 303/354 [27:24<04:24,  5.19s/it][A

	loss_cls: tensor(0.5609, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2358, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7967, device='cuda:0', grad_fn=<AddBackward0>)



 86%|████████▌ | 304/354 [27:29<04:17,  5.14s/it][A

	loss_cls: tensor(0.5497, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1290, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6787, device='cuda:0', grad_fn=<AddBackward0>)



 86%|████████▌ | 305/354 [27:34<04:10,  5.10s/it][A

	loss_cls: tensor(0.6497, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3033, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9530, device='cuda:0', grad_fn=<AddBackward0>)



 86%|████████▋ | 306/354 [27:39<04:04,  5.09s/it][A

	loss_cls: tensor(0.4514, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3640, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8155, device='cuda:0', grad_fn=<AddBackward0>)



 87%|████████▋ | 307/354 [27:44<03:58,  5.07s/it][A

	loss_cls: tensor(0.8678, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1392, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0070, device='cuda:0', grad_fn=<AddBackward0>)



 87%|████████▋ | 308/354 [27:49<03:54,  5.10s/it][A

	loss_cls: tensor(0.5133, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2830, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7963, device='cuda:0', grad_fn=<AddBackward0>)



 87%|████████▋ | 309/354 [27:55<03:54,  5.22s/it][A

	loss_cls: tensor(0.3582, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0360, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.3943, device='cuda:0', grad_fn=<AddBackward0>)



 88%|████████▊ | 310/354 [28:00<03:54,  5.33s/it][A

	loss_cls: tensor(0.4775, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2019, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6794, device='cuda:0', grad_fn=<AddBackward0>)



 88%|████████▊ | 311/354 [28:06<03:51,  5.38s/it][A

	loss_cls: tensor(0.7605, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2498, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0102, device='cuda:0', grad_fn=<AddBackward0>)



 88%|████████▊ | 312/354 [28:11<03:46,  5.40s/it][A

	loss_cls: tensor(0.4067, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0769, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4835, device='cuda:0', grad_fn=<AddBackward0>)



 88%|████████▊ | 313/354 [28:17<03:43,  5.44s/it][A

	loss_cls: tensor(0.6393, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3395, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9788, device='cuda:0', grad_fn=<AddBackward0>)



 89%|████████▊ | 314/354 [28:22<03:38,  5.45s/it][A

	loss_cls: tensor(0.5956, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3914, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9869, device='cuda:0', grad_fn=<AddBackward0>)



 89%|████████▉ | 315/354 [28:28<03:33,  5.48s/it][A

	loss_cls: tensor(0.8050, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4126, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2176, device='cuda:0', grad_fn=<AddBackward0>)



 89%|████████▉ | 316/354 [28:33<03:28,  5.47s/it][A

	loss_cls: tensor(1.1421, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1886, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.3307, device='cuda:0', grad_fn=<AddBackward0>)



 90%|████████▉ | 317/354 [28:39<03:23,  5.50s/it][A

	loss_cls: tensor(0.3446, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1499, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4945, device='cuda:0', grad_fn=<AddBackward0>)



 90%|████████▉ | 318/354 [28:44<03:17,  5.49s/it][A

	loss_cls: tensor(0.5853, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2748, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8601, device='cuda:0', grad_fn=<AddBackward0>)



 90%|█████████ | 319/354 [28:50<03:11,  5.48s/it][A

	loss_cls: tensor(0.4307, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0880, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5187, device='cuda:0', grad_fn=<AddBackward0>)



 90%|█████████ | 320/354 [28:55<03:06,  5.47s/it][A

	loss_cls: tensor(0.4337, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0552, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4888, device='cuda:0', grad_fn=<AddBackward0>)



 91%|█████████ | 321/354 [29:00<02:59,  5.45s/it][A

	loss_cls: tensor(0.7069, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5283, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2352, device='cuda:0', grad_fn=<AddBackward0>)



 91%|█████████ | 322/354 [29:06<02:54,  5.45s/it][A

	loss_cls: tensor(0.8481, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2305, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0786, device='cuda:0', grad_fn=<AddBackward0>)



 91%|█████████ | 323/354 [29:11<02:48,  5.43s/it][A

	loss_cls: tensor(0.6678, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2534, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9212, device='cuda:0', grad_fn=<AddBackward0>)



 92%|█████████▏| 324/354 [29:17<02:42,  5.43s/it][A

	loss_cls: tensor(0.5912, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1184, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7096, device='cuda:0', grad_fn=<AddBackward0>)



 92%|█████████▏| 325/354 [29:22<02:36,  5.41s/it][A

	loss_cls: tensor(0.5696, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2676, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8372, device='cuda:0', grad_fn=<AddBackward0>)



 92%|█████████▏| 326/354 [29:27<02:30,  5.39s/it][A

	loss_cls: tensor(0.5431, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3435, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8866, device='cuda:0', grad_fn=<AddBackward0>)



 92%|█████████▏| 327/354 [29:33<02:26,  5.42s/it][A

	loss_cls: tensor(0.6518, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2527, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9045, device='cuda:0', grad_fn=<AddBackward0>)



 93%|█████████▎| 328/354 [29:38<02:20,  5.41s/it][A

	loss_cls: tensor(0.6769, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2351, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9120, device='cuda:0', grad_fn=<AddBackward0>)



 93%|█████████▎| 329/354 [29:44<02:15,  5.42s/it][A

	loss_cls: tensor(1.0012, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1334, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1346, device='cuda:0', grad_fn=<AddBackward0>)



 93%|█████████▎| 330/354 [29:49<02:09,  5.41s/it][A

	loss_cls: tensor(0.3557, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2892, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6449, device='cuda:0', grad_fn=<AddBackward0>)



 94%|█████████▎| 331/354 [29:55<02:04,  5.42s/it][A

	loss_cls: tensor(0.6048, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2659, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8707, device='cuda:0', grad_fn=<AddBackward0>)



 94%|█████████▍| 332/354 [30:00<01:58,  5.40s/it][A

	loss_cls: tensor(0.9025, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1411, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0436, device='cuda:0', grad_fn=<AddBackward0>)



 94%|█████████▍| 333/354 [30:05<01:53,  5.41s/it][A

	loss_cls: tensor(0.4981, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1167, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6148, device='cuda:0', grad_fn=<AddBackward0>)



 94%|█████████▍| 334/354 [30:11<01:48,  5.40s/it][A

	loss_cls: tensor(0.5779, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0821, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6600, device='cuda:0', grad_fn=<AddBackward0>)



 95%|█████████▍| 335/354 [30:16<01:42,  5.40s/it][A

	loss_cls: tensor(0.4956, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2505, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7461, device='cuda:0', grad_fn=<AddBackward0>)



 95%|█████████▍| 336/354 [30:22<01:37,  5.42s/it][A

	loss_cls: tensor(0.5685, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2786, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8471, device='cuda:0', grad_fn=<AddBackward0>)



 95%|█████████▌| 337/354 [30:27<01:31,  5.41s/it][A

	loss_cls: tensor(0.6284, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1301, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7585, device='cuda:0', grad_fn=<AddBackward0>)



 95%|█████████▌| 338/354 [30:32<01:26,  5.41s/it][A

	loss_cls: tensor(0.6548, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0992, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7540, device='cuda:0', grad_fn=<AddBackward0>)



 96%|█████████▌| 339/354 [30:38<01:20,  5.40s/it][A

	loss_cls: tensor(0.7717, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2097, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9813, device='cuda:0', grad_fn=<AddBackward0>)



 96%|█████████▌| 340/354 [30:43<01:15,  5.41s/it][A

	loss_cls: tensor(0.6354, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1324, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7678, device='cuda:0', grad_fn=<AddBackward0>)



 96%|█████████▋| 341/354 [30:49<01:10,  5.40s/it][A

	loss_cls: tensor(0.4922, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3117, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8039, device='cuda:0', grad_fn=<AddBackward0>)



 97%|█████████▋| 342/354 [30:54<01:04,  5.40s/it][A

	loss_cls: tensor(0.7060, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1115, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8175, device='cuda:0', grad_fn=<AddBackward0>)



 97%|█████████▋| 343/354 [30:59<00:59,  5.41s/it][A

	loss_cls: tensor(0.7248, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2973, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0221, device='cuda:0', grad_fn=<AddBackward0>)



 97%|█████████▋| 344/354 [31:05<00:53,  5.39s/it][A

	loss_cls: tensor(0.5891, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2539, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8430, device='cuda:0', grad_fn=<AddBackward0>)



 97%|█████████▋| 345/354 [31:10<00:48,  5.40s/it][A

	loss_cls: tensor(0.6841, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3472, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0313, device='cuda:0', grad_fn=<AddBackward0>)



 98%|█████████▊| 346/354 [31:15<00:43,  5.39s/it][A

	loss_cls: tensor(0.7878, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2159, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0037, device='cuda:0', grad_fn=<AddBackward0>)



 98%|█████████▊| 347/354 [31:21<00:37,  5.41s/it][A

	loss_cls: tensor(0.5316, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1352, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6668, device='cuda:0', grad_fn=<AddBackward0>)



 98%|█████████▊| 348/354 [31:26<00:32,  5.40s/it][A

	loss_cls: tensor(0.5531, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2958, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8489, device='cuda:0', grad_fn=<AddBackward0>)



 99%|█████████▊| 349/354 [31:32<00:26,  5.38s/it][A

	loss_cls: tensor(0.4730, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3509, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8239, device='cuda:0', grad_fn=<AddBackward0>)



 99%|█████████▉| 350/354 [31:37<00:21,  5.39s/it][A

	loss_cls: tensor(0.6981, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1199, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8180, device='cuda:0', grad_fn=<AddBackward0>)



 99%|█████████▉| 351/354 [31:42<00:16,  5.39s/it][A

	loss_cls: tensor(0.5178, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2523, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7701, device='cuda:0', grad_fn=<AddBackward0>)



 99%|█████████▉| 352/354 [31:48<00:10,  5.41s/it][A

	loss_cls: tensor(0.5795, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0828, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6623, device='cuda:0', grad_fn=<AddBackward0>)



100%|█████████▉| 353/354 [31:53<00:05,  5.40s/it][A

	loss_cls: tensor(0.5922, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2635, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8556, device='cuda:0', grad_fn=<AddBackward0>)



100%|██████████| 354/354 [31:55<00:00,  5.41s/it][A

  0%|          | 0/177 [00:00<?, ?it/s][A

	loss_cls: tensor(0.6016, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0553, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6569, device='cuda:0', grad_fn=<AddBackward0>)

	Training Loss: 0.8423573528811082

	Training cls acc: 0.6965630885122411

	Training cls prec: 0.5814838331787484

	Training cls rec: 0.615523347885636

	Training cls f1: 0.5377746529947522

--
	Training ner acc: 0.9553105296624583

	Training ner prec: 0.2738617751903701

	Training ner rec: 0.2822257492826109

	Training ner f1: 0.2776197703996951

	Current Learning rate:  0.0006857142857142857



  1%|          | 1/177 [00:00<01:55,  1.53it/s][A
  1%|          | 2/177 [00:01<02:02,  1.42it/s][A
  2%|▏         | 3/177 [00:02<02:04,  1.40it/s][A
  2%|▏         | 4/177 [00:02<01:59,  1.45it/s][A
  3%|▎         | 5/177 [00:03<02:01,  1.42it/s][A
  3%|▎         | 6/177 [00:04<02:02,  1.40it/s][A
  4%|▍         | 7/177 [00:04<02:02,  1.39it/s][A
  5%|▍         | 8/177 [00:05<01:57,  1.44it/s][A
  5%|▌         | 9/177 [00:06<01:58,  1.41it/s][A
  6%|▌         | 10/177 [00:07<01:59,  1.40it/s][A
  6%|▌         | 11/177 [00:07<01:55,  1.44it/s][A
  7%|▋         | 12/177 [00:08<01:55,  1.42it/s][A
  7%|▋         | 13/177 [00:09<01:56,  1.41it/s][A
  8%|▊         | 14/177 [00:09<01:58,  1.37it/s][A
  8%|▊         | 15/177 [00:10<01:53,  1.42it/s][A
  9%|▉         | 16/177 [00:11<01:54,  1.41it/s][A
 10%|▉         | 17/177 [00:12<01:54,  1.39it/s][A
 10%|█         | 18/177 [00:12<01:54,  1.39it/s][A
 11%|█         | 19/177 [00:13<01:50,  1.43it/s][A
 11%|█▏        | 20/


	Validation Loss: 0.7774243999672474

	Validation cls acc: 0.7172787193973634

	Validation cls prec: 0.6077145547484532

	Validation cls rec: 0.6031981436642453

	Validation cls f1: 0.5614926191197378

--
	Validation ner acc: 0.9543417866882946

	Validation ner prec: 0.40022165897415213

	Validation ner rec: 0.41064030131826745

	Validation ner f1: 0.4052331347634596



  0%|          | 1/354 [00:05<31:32,  5.36s/it][A

	loss_cls: tensor(0.5060, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1942, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7002, device='cuda:0', grad_fn=<AddBackward0>)



  1%|          | 2/354 [00:10<31:47,  5.42s/it][A

	loss_cls: tensor(0.9067, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4150, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.3217, device='cuda:0', grad_fn=<AddBackward0>)



  1%|          | 3/354 [00:16<31:36,  5.40s/it][A

	loss_cls: tensor(0.4639, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5594, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0233, device='cuda:0', grad_fn=<AddBackward0>)



  1%|          | 4/354 [00:21<31:38,  5.42s/it][A

	loss_cls: tensor(0.6765, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1074, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7839, device='cuda:0', grad_fn=<AddBackward0>)



  1%|▏         | 5/354 [00:27<31:25,  5.40s/it][A

	loss_cls: tensor(0.4906, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1586, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6492, device='cuda:0', grad_fn=<AddBackward0>)



  2%|▏         | 6/354 [00:32<31:27,  5.42s/it][A

	loss_cls: tensor(0.4712, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2088, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6800, device='cuda:0', grad_fn=<AddBackward0>)



  2%|▏         | 7/354 [00:37<31:14,  5.40s/it][A

	loss_cls: tensor(0.5190, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2259, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7449, device='cuda:0', grad_fn=<AddBackward0>)



  2%|▏         | 8/354 [00:43<31:03,  5.38s/it][A

	loss_cls: tensor(0.5104, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1189, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6293, device='cuda:0', grad_fn=<AddBackward0>)



  3%|▎         | 9/354 [00:48<31:04,  5.40s/it][A

	loss_cls: tensor(1.0013, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3039, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.3052, device='cuda:0', grad_fn=<AddBackward0>)



  3%|▎         | 10/354 [00:54<30:55,  5.40s/it][A

	loss_cls: tensor(0.4633, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1104, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5737, device='cuda:0', grad_fn=<AddBackward0>)



  3%|▎         | 11/354 [00:59<30:56,  5.41s/it][A

	loss_cls: tensor(0.6298, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2879, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9177, device='cuda:0', grad_fn=<AddBackward0>)



  3%|▎         | 12/354 [01:04<30:46,  5.40s/it][A

	loss_cls: tensor(0.4713, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1469, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6183, device='cuda:0', grad_fn=<AddBackward0>)



  4%|▎         | 13/354 [01:10<30:45,  5.41s/it][A

	loss_cls: tensor(0.6672, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2011, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8683, device='cuda:0', grad_fn=<AddBackward0>)



  4%|▍         | 14/354 [01:15<30:35,  5.40s/it][A

	loss_cls: tensor(0.7297, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3737, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1034, device='cuda:0', grad_fn=<AddBackward0>)



  4%|▍         | 15/354 [01:21<30:26,  5.39s/it][A

	loss_cls: tensor(0.6176, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1703, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7879, device='cuda:0', grad_fn=<AddBackward0>)



  5%|▍         | 16/354 [01:26<30:29,  5.41s/it][A

	loss_cls: tensor(0.7116, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1976, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9092, device='cuda:0', grad_fn=<AddBackward0>)



  5%|▍         | 17/354 [01:31<30:20,  5.40s/it][A

	loss_cls: tensor(0.4754, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1531, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6285, device='cuda:0', grad_fn=<AddBackward0>)



  5%|▌         | 18/354 [01:37<30:20,  5.42s/it][A

	loss_cls: tensor(0.6704, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1517, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8221, device='cuda:0', grad_fn=<AddBackward0>)



  5%|▌         | 19/354 [01:42<30:11,  5.41s/it][A

	loss_cls: tensor(0.6778, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4006, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0784, device='cuda:0', grad_fn=<AddBackward0>)



  6%|▌         | 20/354 [01:48<30:11,  5.42s/it][A

	loss_cls: tensor(0.4968, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0412, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5380, device='cuda:0', grad_fn=<AddBackward0>)



  6%|▌         | 21/354 [01:53<30:01,  5.41s/it][A

	loss_cls: tensor(0.4948, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1362, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6309, device='cuda:0', grad_fn=<AddBackward0>)



  6%|▌         | 22/354 [01:58<30:00,  5.42s/it][A

	loss_cls: tensor(0.6652, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5278, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1930, device='cuda:0', grad_fn=<AddBackward0>)



  6%|▋         | 23/354 [02:04<29:52,  5.42s/it][A

	loss_cls: tensor(0.5293, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1923, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7216, device='cuda:0', grad_fn=<AddBackward0>)



  7%|▋         | 24/354 [02:09<29:42,  5.40s/it][A

	loss_cls: tensor(0.7880, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1738, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9618, device='cuda:0', grad_fn=<AddBackward0>)



  7%|▋         | 25/354 [02:15<29:43,  5.42s/it][A

	loss_cls: tensor(0.5146, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1733, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6879, device='cuda:0', grad_fn=<AddBackward0>)



  7%|▋         | 26/354 [02:20<29:36,  5.41s/it][A

	loss_cls: tensor(0.5338, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1372, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6710, device='cuda:0', grad_fn=<AddBackward0>)



  8%|▊         | 27/354 [02:26<29:34,  5.43s/it][A

	loss_cls: tensor(0.4279, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2770, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7050, device='cuda:0', grad_fn=<AddBackward0>)



  8%|▊         | 28/354 [02:31<29:24,  5.41s/it][A

	loss_cls: tensor(0.4902, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2577, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7479, device='cuda:0', grad_fn=<AddBackward0>)



  8%|▊         | 29/354 [02:36<29:23,  5.43s/it][A

	loss_cls: tensor(0.5129, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2443, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7572, device='cuda:0', grad_fn=<AddBackward0>)



  8%|▊         | 30/354 [02:42<29:15,  5.42s/it][A

	loss_cls: tensor(0.4920, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2207, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7127, device='cuda:0', grad_fn=<AddBackward0>)



  9%|▉         | 31/354 [02:47<29:05,  5.40s/it][A

	loss_cls: tensor(0.5429, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3970, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9399, device='cuda:0', grad_fn=<AddBackward0>)



  9%|▉         | 32/354 [02:53<29:09,  5.43s/it][A

	loss_cls: tensor(0.6480, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1740, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8220, device='cuda:0', grad_fn=<AddBackward0>)



  9%|▉         | 33/354 [02:58<28:59,  5.42s/it][A

	loss_cls: tensor(0.3083, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2978, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6061, device='cuda:0', grad_fn=<AddBackward0>)



 10%|▉         | 34/354 [03:03<28:52,  5.41s/it][A

	loss_cls: tensor(0.8469, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0576, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9045, device='cuda:0', grad_fn=<AddBackward0>)



 10%|▉         | 35/354 [03:09<28:38,  5.39s/it][A

	loss_cls: tensor(0.9811, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1608, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1419, device='cuda:0', grad_fn=<AddBackward0>)



 10%|█         | 36/354 [03:14<28:41,  5.41s/it][A

	loss_cls: tensor(0.6042, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3967, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0009, device='cuda:0', grad_fn=<AddBackward0>)



 10%|█         | 37/354 [03:20<28:36,  5.42s/it][A

	loss_cls: tensor(0.4022, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1185, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5207, device='cuda:0', grad_fn=<AddBackward0>)



 11%|█         | 38/354 [03:25<28:27,  5.40s/it][A

	loss_cls: tensor(0.8241, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2654, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0895, device='cuda:0', grad_fn=<AddBackward0>)



 11%|█         | 39/354 [03:31<28:27,  5.42s/it][A

	loss_cls: tensor(0.3824, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2722, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6546, device='cuda:0', grad_fn=<AddBackward0>)



 11%|█▏        | 40/354 [03:36<28:14,  5.40s/it][A

	loss_cls: tensor(0.6218, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4600, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0818, device='cuda:0', grad_fn=<AddBackward0>)



 12%|█▏        | 41/354 [03:41<28:13,  5.41s/it][A

	loss_cls: tensor(0.6460, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2023, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8483, device='cuda:0', grad_fn=<AddBackward0>)



 12%|█▏        | 42/354 [03:47<28:04,  5.40s/it][A

	loss_cls: tensor(0.4844, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0777, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5622, device='cuda:0', grad_fn=<AddBackward0>)



 12%|█▏        | 43/354 [03:52<28:07,  5.43s/it][A

	loss_cls: tensor(0.5591, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2340, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7931, device='cuda:0', grad_fn=<AddBackward0>)



 12%|█▏        | 44/354 [03:58<28:00,  5.42s/it][A

	loss_cls: tensor(0.4496, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1579, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6075, device='cuda:0', grad_fn=<AddBackward0>)



 13%|█▎        | 45/354 [04:03<27:52,  5.41s/it][A

	loss_cls: tensor(0.6899, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1616, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8515, device='cuda:0', grad_fn=<AddBackward0>)



 13%|█▎        | 46/354 [04:08<27:50,  5.42s/it][A

	loss_cls: tensor(0.4769, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2286, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7055, device='cuda:0', grad_fn=<AddBackward0>)



 13%|█▎        | 47/354 [04:14<27:38,  5.40s/it][A

	loss_cls: tensor(0.2721, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1792, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4513, device='cuda:0', grad_fn=<AddBackward0>)



 14%|█▎        | 48/354 [04:19<27:37,  5.42s/it][A

	loss_cls: tensor(0.6012, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3524, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9536, device='cuda:0', grad_fn=<AddBackward0>)



 14%|█▍        | 49/354 [04:25<27:29,  5.41s/it][A

	loss_cls: tensor(0.8347, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3257, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1604, device='cuda:0', grad_fn=<AddBackward0>)



 14%|█▍        | 50/354 [04:30<27:31,  5.43s/it][A

	loss_cls: tensor(0.4834, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1528, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6361, device='cuda:0', grad_fn=<AddBackward0>)



 14%|█▍        | 51/354 [04:36<27:23,  5.43s/it][A

	loss_cls: tensor(0.6635, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4043, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0678, device='cuda:0', grad_fn=<AddBackward0>)



 15%|█▍        | 52/354 [04:41<27:13,  5.41s/it][A

	loss_cls: tensor(0.5534, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5008, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0542, device='cuda:0', grad_fn=<AddBackward0>)



 15%|█▍        | 53/354 [04:46<27:12,  5.42s/it][A

	loss_cls: tensor(0.6837, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2460, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9298, device='cuda:0', grad_fn=<AddBackward0>)



 15%|█▌        | 54/354 [04:52<27:03,  5.41s/it][A

	loss_cls: tensor(1.0627, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3334, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.3961, device='cuda:0', grad_fn=<AddBackward0>)



 16%|█▌        | 55/354 [04:57<27:03,  5.43s/it][A

	loss_cls: tensor(1.0532, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3575, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.4106, device='cuda:0', grad_fn=<AddBackward0>)



 16%|█▌        | 56/354 [05:03<26:56,  5.42s/it][A

	loss_cls: tensor(0.5167, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1787, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6954, device='cuda:0', grad_fn=<AddBackward0>)



 16%|█▌        | 57/354 [05:08<26:56,  5.44s/it][A

	loss_cls: tensor(0.5421, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2524, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7945, device='cuda:0', grad_fn=<AddBackward0>)



 16%|█▋        | 58/354 [05:14<26:47,  5.43s/it][A

	loss_cls: tensor(0.5399, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3059, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8458, device='cuda:0', grad_fn=<AddBackward0>)



 17%|█▋        | 59/354 [05:19<26:35,  5.41s/it][A

	loss_cls: tensor(0.8026, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3224, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1251, device='cuda:0', grad_fn=<AddBackward0>)



 17%|█▋        | 60/354 [05:24<26:35,  5.43s/it][A

	loss_cls: tensor(0.4527, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2366, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6893, device='cuda:0', grad_fn=<AddBackward0>)



 17%|█▋        | 61/354 [05:30<26:26,  5.41s/it][A

	loss_cls: tensor(0.7632, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0937, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8569, device='cuda:0', grad_fn=<AddBackward0>)



 18%|█▊        | 62/354 [05:35<26:26,  5.43s/it][A

	loss_cls: tensor(0.6662, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1393, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8055, device='cuda:0', grad_fn=<AddBackward0>)



 18%|█▊        | 63/354 [05:41<26:17,  5.42s/it][A

	loss_cls: tensor(0.9459, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0462, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9921, device='cuda:0', grad_fn=<AddBackward0>)



 18%|█▊        | 64/354 [05:46<26:16,  5.44s/it][A

	loss_cls: tensor(0.4693, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1169, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5863, device='cuda:0', grad_fn=<AddBackward0>)



 18%|█▊        | 65/354 [05:51<26:08,  5.43s/it][A

	loss_cls: tensor(0.6731, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1337, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8069, device='cuda:0', grad_fn=<AddBackward0>)



 19%|█▊        | 66/354 [05:57<25:59,  5.41s/it][A

	loss_cls: tensor(0.5516, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3385, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8901, device='cuda:0', grad_fn=<AddBackward0>)



 19%|█▉        | 67/354 [06:02<25:58,  5.43s/it][A

	loss_cls: tensor(0.5459, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0383, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5841, device='cuda:0', grad_fn=<AddBackward0>)



 19%|█▉        | 68/354 [06:08<25:46,  5.41s/it][A

	loss_cls: tensor(0.4464, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3767, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8231, device='cuda:0', grad_fn=<AddBackward0>)



 19%|█▉        | 69/354 [06:13<25:47,  5.43s/it][A

	loss_cls: tensor(0.6218, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3451, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9669, device='cuda:0', grad_fn=<AddBackward0>)



 20%|█▉        | 70/354 [06:19<25:41,  5.43s/it][A

	loss_cls: tensor(0.6705, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2478, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9183, device='cuda:0', grad_fn=<AddBackward0>)



 20%|██        | 71/354 [06:24<25:40,  5.44s/it][A

	loss_cls: tensor(0.6833, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2774, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9607, device='cuda:0', grad_fn=<AddBackward0>)



 20%|██        | 72/354 [06:29<25:31,  5.43s/it][A

	loss_cls: tensor(0.3238, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1894, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5132, device='cuda:0', grad_fn=<AddBackward0>)



 21%|██        | 73/354 [06:35<25:20,  5.41s/it][A

	loss_cls: tensor(0.6346, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1566, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7912, device='cuda:0', grad_fn=<AddBackward0>)



 21%|██        | 74/354 [06:40<25:19,  5.43s/it][A

	loss_cls: tensor(0.3446, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0640, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4085, device='cuda:0', grad_fn=<AddBackward0>)



 21%|██        | 75/354 [06:46<25:08,  5.41s/it][A

	loss_cls: tensor(0.4172, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3856, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8028, device='cuda:0', grad_fn=<AddBackward0>)



 21%|██▏       | 76/354 [06:51<25:11,  5.44s/it][A

	loss_cls: tensor(0.3734, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0786, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4520, device='cuda:0', grad_fn=<AddBackward0>)



 22%|██▏       | 77/354 [06:57<25:02,  5.42s/it][A

	loss_cls: tensor(0.7419, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3083, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0502, device='cuda:0', grad_fn=<AddBackward0>)



 22%|██▏       | 78/354 [07:02<24:59,  5.43s/it][A

	loss_cls: tensor(0.5474, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2130, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7604, device='cuda:0', grad_fn=<AddBackward0>)



 22%|██▏       | 79/354 [07:07<24:49,  5.42s/it][A

	loss_cls: tensor(0.6992, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1346, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8337, device='cuda:0', grad_fn=<AddBackward0>)



 23%|██▎       | 80/354 [07:13<24:39,  5.40s/it][A

	loss_cls: tensor(1.0990, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1884, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2874, device='cuda:0', grad_fn=<AddBackward0>)



 23%|██▎       | 81/354 [07:18<24:34,  5.40s/it][A

	loss_cls: tensor(0.7259, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1555, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8814, device='cuda:0', grad_fn=<AddBackward0>)



 23%|██▎       | 82/354 [07:24<24:26,  5.39s/it][A

	loss_cls: tensor(0.8938, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2287, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1224, device='cuda:0', grad_fn=<AddBackward0>)



 23%|██▎       | 83/354 [07:29<24:28,  5.42s/it][A

	loss_cls: tensor(0.5635, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3036, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8672, device='cuda:0', grad_fn=<AddBackward0>)



 24%|██▎       | 84/354 [07:34<24:18,  5.40s/it][A

	loss_cls: tensor(0.3356, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1823, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5179, device='cuda:0', grad_fn=<AddBackward0>)



 24%|██▍       | 85/354 [07:40<24:17,  5.42s/it][A

	loss_cls: tensor(0.6415, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2537, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8952, device='cuda:0', grad_fn=<AddBackward0>)



 24%|██▍       | 86/354 [07:45<24:09,  5.41s/it][A

	loss_cls: tensor(0.4748, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3934, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8682, device='cuda:0', grad_fn=<AddBackward0>)



 25%|██▍       | 87/354 [07:51<24:03,  5.41s/it][A

	loss_cls: tensor(0.4781, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1140, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5921, device='cuda:0', grad_fn=<AddBackward0>)



 25%|██▍       | 88/354 [07:56<24:03,  5.43s/it][A

	loss_cls: tensor(0.6234, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3068, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9302, device='cuda:0', grad_fn=<AddBackward0>)



 25%|██▌       | 89/354 [08:01<23:54,  5.41s/it][A

	loss_cls: tensor(0.9815, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1297, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1112, device='cuda:0', grad_fn=<AddBackward0>)



 25%|██▌       | 90/354 [08:07<23:55,  5.44s/it][A

	loss_cls: tensor(0.8965, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2613, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1578, device='cuda:0', grad_fn=<AddBackward0>)



 26%|██▌       | 91/354 [08:12<23:46,  5.42s/it][A

	loss_cls: tensor(0.7655, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0683, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8337, device='cuda:0', grad_fn=<AddBackward0>)



 26%|██▌       | 92/354 [08:18<23:43,  5.43s/it][A

	loss_cls: tensor(0.5653, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1253, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6905, device='cuda:0', grad_fn=<AddBackward0>)



 26%|██▋       | 93/354 [08:23<23:33,  5.42s/it][A

	loss_cls: tensor(0.6573, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0827, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7400, device='cuda:0', grad_fn=<AddBackward0>)



 27%|██▋       | 94/354 [08:29<23:24,  5.40s/it][A

	loss_cls: tensor(0.6438, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1328, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7766, device='cuda:0', grad_fn=<AddBackward0>)



 27%|██▋       | 95/354 [08:34<23:22,  5.41s/it][A

	loss_cls: tensor(0.3796, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5556, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9352, device='cuda:0', grad_fn=<AddBackward0>)



 27%|██▋       | 96/354 [08:39<23:15,  5.41s/it][A

	loss_cls: tensor(0.3785, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4067, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7852, device='cuda:0', grad_fn=<AddBackward0>)



 27%|██▋       | 97/354 [08:45<23:17,  5.44s/it][A

	loss_cls: tensor(0.7427, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0307, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7734, device='cuda:0', grad_fn=<AddBackward0>)



 28%|██▊       | 98/354 [08:50<23:09,  5.43s/it][A

	loss_cls: tensor(0.6516, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0290, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6806, device='cuda:0', grad_fn=<AddBackward0>)



 28%|██▊       | 99/354 [08:56<23:07,  5.44s/it][A

	loss_cls: tensor(0.7909, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1366, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9275, device='cuda:0', grad_fn=<AddBackward0>)



 28%|██▊       | 100/354 [09:01<22:57,  5.42s/it][A

	loss_cls: tensor(0.6228, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0339, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6567, device='cuda:0', grad_fn=<AddBackward0>)



 29%|██▊       | 101/354 [09:07<22:48,  5.41s/it][A

	loss_cls: tensor(0.5275, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1020, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6295, device='cuda:0', grad_fn=<AddBackward0>)



 29%|██▉       | 102/354 [09:12<22:47,  5.43s/it][A

	loss_cls: tensor(0.8579, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3570, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2149, device='cuda:0', grad_fn=<AddBackward0>)



 29%|██▉       | 103/354 [09:17<22:43,  5.43s/it][A

	loss_cls: tensor(0.4639, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1183, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5822, device='cuda:0', grad_fn=<AddBackward0>)



 29%|██▉       | 104/354 [09:23<22:41,  5.45s/it][A

	loss_cls: tensor(0.7063, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4266, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1329, device='cuda:0', grad_fn=<AddBackward0>)



 30%|██▉       | 105/354 [09:28<22:32,  5.43s/it][A

	loss_cls: tensor(0.7131, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2314, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9446, device='cuda:0', grad_fn=<AddBackward0>)



 30%|██▉       | 106/354 [09:34<22:30,  5.45s/it][A

	loss_cls: tensor(0.5373, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1613, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6986, device='cuda:0', grad_fn=<AddBackward0>)



 30%|███       | 107/354 [09:39<22:20,  5.43s/it][A

	loss_cls: tensor(0.7272, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3074, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0346, device='cuda:0', grad_fn=<AddBackward0>)



 31%|███       | 108/354 [09:45<22:10,  5.41s/it][A

	loss_cls: tensor(0.4892, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1338, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6230, device='cuda:0', grad_fn=<AddBackward0>)



 31%|███       | 109/354 [09:50<22:10,  5.43s/it][A

	loss_cls: tensor(0.7516, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2124, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9640, device='cuda:0', grad_fn=<AddBackward0>)



 31%|███       | 110/354 [09:55<22:03,  5.42s/it][A

	loss_cls: tensor(0.7973, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2014, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9987, device='cuda:0', grad_fn=<AddBackward0>)



 31%|███▏      | 111/354 [10:01<22:01,  5.44s/it][A

	loss_cls: tensor(0.7991, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2599, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0590, device='cuda:0', grad_fn=<AddBackward0>)



 32%|███▏      | 112/354 [10:06<21:53,  5.43s/it][A

	loss_cls: tensor(0.5670, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2006, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7676, device='cuda:0', grad_fn=<AddBackward0>)



 32%|███▏      | 113/354 [10:12<21:51,  5.44s/it][A

	loss_cls: tensor(0.6783, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5143, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1925, device='cuda:0', grad_fn=<AddBackward0>)



 32%|███▏      | 114/354 [10:17<21:42,  5.43s/it][A

	loss_cls: tensor(0.5160, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2047, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7208, device='cuda:0', grad_fn=<AddBackward0>)



 32%|███▏      | 115/354 [10:23<21:32,  5.41s/it][A

	loss_cls: tensor(0.5877, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1509, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7385, device='cuda:0', grad_fn=<AddBackward0>)



 33%|███▎      | 116/354 [10:28<21:31,  5.43s/it][A

	loss_cls: tensor(0.5530, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1631, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7161, device='cuda:0', grad_fn=<AddBackward0>)



 33%|███▎      | 117/354 [10:33<21:23,  5.42s/it][A

	loss_cls: tensor(0.5905, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0682, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6587, device='cuda:0', grad_fn=<AddBackward0>)



 33%|███▎      | 118/354 [10:39<21:20,  5.43s/it][A

	loss_cls: tensor(0.6051, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0863, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6914, device='cuda:0', grad_fn=<AddBackward0>)



 34%|███▎      | 119/354 [10:44<21:12,  5.41s/it][A

	loss_cls: tensor(0.4222, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2521, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6743, device='cuda:0', grad_fn=<AddBackward0>)



 34%|███▍      | 120/354 [10:50<21:10,  5.43s/it][A

	loss_cls: tensor(0.6967, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1972, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8939, device='cuda:0', grad_fn=<AddBackward0>)



 34%|███▍      | 121/354 [10:55<21:02,  5.42s/it][A

	loss_cls: tensor(0.5372, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1750, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7122, device='cuda:0', grad_fn=<AddBackward0>)



 34%|███▍      | 122/354 [11:01<20:59,  5.43s/it][A

	loss_cls: tensor(0.5553, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2011, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7563, device='cuda:0', grad_fn=<AddBackward0>)



 35%|███▍      | 123/354 [11:06<20:52,  5.42s/it][A

	loss_cls: tensor(0.5821, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4834, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0655, device='cuda:0', grad_fn=<AddBackward0>)



 35%|███▌      | 124/354 [11:11<20:44,  5.41s/it][A

	loss_cls: tensor(0.4612, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4431, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9043, device='cuda:0', grad_fn=<AddBackward0>)



 35%|███▌      | 125/354 [11:17<20:43,  5.43s/it][A

	loss_cls: tensor(0.5097, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1993, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7090, device='cuda:0', grad_fn=<AddBackward0>)



 36%|███▌      | 126/354 [11:22<20:35,  5.42s/it][A

	loss_cls: tensor(1.0400, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1158, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1558, device='cuda:0', grad_fn=<AddBackward0>)



 36%|███▌      | 127/354 [11:28<20:35,  5.44s/it][A

	loss_cls: tensor(0.6146, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1647, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7793, device='cuda:0', grad_fn=<AddBackward0>)



 36%|███▌      | 128/354 [11:33<20:25,  5.42s/it][A

	loss_cls: tensor(0.8017, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0850, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8867, device='cuda:0', grad_fn=<AddBackward0>)



 36%|███▋      | 129/354 [11:39<20:22,  5.43s/it][A

	loss_cls: tensor(0.5310, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1063, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6374, device='cuda:0', grad_fn=<AddBackward0>)



 37%|███▋      | 130/354 [11:44<20:16,  5.43s/it][A

	loss_cls: tensor(0.6411, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1172, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7584, device='cuda:0', grad_fn=<AddBackward0>)



 37%|███▋      | 131/354 [11:49<20:09,  5.42s/it][A

	loss_cls: tensor(0.6538, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0810, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7348, device='cuda:0', grad_fn=<AddBackward0>)



 37%|███▋      | 132/354 [11:55<20:06,  5.43s/it][A

	loss_cls: tensor(0.3896, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1041, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4937, device='cuda:0', grad_fn=<AddBackward0>)



 38%|███▊      | 133/354 [12:00<19:56,  5.42s/it][A

	loss_cls: tensor(0.3953, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0959, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4911, device='cuda:0', grad_fn=<AddBackward0>)



 38%|███▊      | 134/354 [12:06<19:54,  5.43s/it][A

	loss_cls: tensor(0.6793, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1691, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8484, device='cuda:0', grad_fn=<AddBackward0>)



 38%|███▊      | 135/354 [12:11<19:45,  5.41s/it][A

	loss_cls: tensor(0.4066, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3534, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7599, device='cuda:0', grad_fn=<AddBackward0>)



 38%|███▊      | 136/354 [12:16<19:42,  5.42s/it][A

	loss_cls: tensor(0.3123, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1261, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4383, device='cuda:0', grad_fn=<AddBackward0>)



 39%|███▊      | 137/354 [12:22<19:35,  5.42s/it][A

	loss_cls: tensor(0.7695, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2200, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9894, device='cuda:0', grad_fn=<AddBackward0>)



 39%|███▉      | 138/354 [12:27<19:28,  5.41s/it][A

	loss_cls: tensor(0.4919, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2243, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7162, device='cuda:0', grad_fn=<AddBackward0>)



 39%|███▉      | 139/354 [12:33<19:26,  5.43s/it][A

	loss_cls: tensor(1.0865, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4287, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.5152, device='cuda:0', grad_fn=<AddBackward0>)



 40%|███▉      | 140/354 [12:38<19:17,  5.41s/it][A

	loss_cls: tensor(0.6337, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1921, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8258, device='cuda:0', grad_fn=<AddBackward0>)



 40%|███▉      | 141/354 [12:44<19:16,  5.43s/it][A

	loss_cls: tensor(0.5298, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1652, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6950, device='cuda:0', grad_fn=<AddBackward0>)



 40%|████      | 142/354 [12:49<19:07,  5.41s/it][A

	loss_cls: tensor(0.8340, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5157, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.3497, device='cuda:0', grad_fn=<AddBackward0>)



 40%|████      | 143/354 [12:54<19:07,  5.44s/it][A

	loss_cls: tensor(0.7699, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1893, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9592, device='cuda:0', grad_fn=<AddBackward0>)



 41%|████      | 144/354 [13:00<18:59,  5.42s/it][A

	loss_cls: tensor(0.5811, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2935, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8746, device='cuda:0', grad_fn=<AddBackward0>)



 41%|████      | 145/354 [13:05<18:51,  5.41s/it][A

	loss_cls: tensor(0.6243, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2601, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8844, device='cuda:0', grad_fn=<AddBackward0>)



 41%|████      | 146/354 [13:11<18:50,  5.43s/it][A

	loss_cls: tensor(0.6650, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3439, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0089, device='cuda:0', grad_fn=<AddBackward0>)



 42%|████▏     | 147/354 [13:16<18:41,  5.42s/it][A

	loss_cls: tensor(0.5349, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1165, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6514, device='cuda:0', grad_fn=<AddBackward0>)



 42%|████▏     | 148/354 [13:22<18:36,  5.42s/it][A

	loss_cls: tensor(0.5910, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2457, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8367, device='cuda:0', grad_fn=<AddBackward0>)



 42%|████▏     | 149/354 [13:27<18:30,  5.42s/it][A

	loss_cls: tensor(0.5801, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2014, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7815, device='cuda:0', grad_fn=<AddBackward0>)



 42%|████▏     | 150/354 [13:32<18:28,  5.43s/it][A

	loss_cls: tensor(0.7876, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1497, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9373, device='cuda:0', grad_fn=<AddBackward0>)



 43%|████▎     | 151/354 [13:38<18:20,  5.42s/it][A

	loss_cls: tensor(0.5507, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1536, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7043, device='cuda:0', grad_fn=<AddBackward0>)



 43%|████▎     | 152/354 [13:43<18:13,  5.41s/it][A

	loss_cls: tensor(0.7233, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3741, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0975, device='cuda:0', grad_fn=<AddBackward0>)



 43%|████▎     | 153/354 [13:49<18:13,  5.44s/it][A

	loss_cls: tensor(0.6567, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1413, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7980, device='cuda:0', grad_fn=<AddBackward0>)



 44%|████▎     | 154/354 [13:54<18:05,  5.43s/it][A

	loss_cls: tensor(0.5036, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3099, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8135, device='cuda:0', grad_fn=<AddBackward0>)



 44%|████▍     | 155/354 [14:00<17:59,  5.43s/it][A

	loss_cls: tensor(0.6564, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0807, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7370, device='cuda:0', grad_fn=<AddBackward0>)



 44%|████▍     | 156/354 [14:05<17:52,  5.42s/it][A

	loss_cls: tensor(0.5057, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1106, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6164, device='cuda:0', grad_fn=<AddBackward0>)



 44%|████▍     | 157/354 [14:10<17:50,  5.43s/it][A

	loss_cls: tensor(0.4906, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0241, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5147, device='cuda:0', grad_fn=<AddBackward0>)



 45%|████▍     | 158/354 [14:16<17:43,  5.42s/it][A

	loss_cls: tensor(0.5204, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4839, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0042, device='cuda:0', grad_fn=<AddBackward0>)



 45%|████▍     | 159/354 [14:21<17:34,  5.41s/it][A

	loss_cls: tensor(0.6996, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4772, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1768, device='cuda:0', grad_fn=<AddBackward0>)



 45%|████▌     | 160/354 [14:27<17:32,  5.43s/it][A

	loss_cls: tensor(0.6388, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0992, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7380, device='cuda:0', grad_fn=<AddBackward0>)



 45%|████▌     | 161/354 [14:32<17:24,  5.41s/it][A

	loss_cls: tensor(0.6218, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0934, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7153, device='cuda:0', grad_fn=<AddBackward0>)



 46%|████▌     | 162/354 [14:37<17:23,  5.43s/it][A

	loss_cls: tensor(0.6614, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1730, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8344, device='cuda:0', grad_fn=<AddBackward0>)



 46%|████▌     | 163/354 [14:43<17:16,  5.42s/it][A

	loss_cls: tensor(0.6483, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1193, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7675, device='cuda:0', grad_fn=<AddBackward0>)



 46%|████▋     | 164/354 [14:48<17:12,  5.43s/it][A

	loss_cls: tensor(0.4982, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2448, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7430, device='cuda:0', grad_fn=<AddBackward0>)



 47%|████▋     | 165/354 [14:54<17:06,  5.43s/it][A

	loss_cls: tensor(0.6162, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3819, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9981, device='cuda:0', grad_fn=<AddBackward0>)



 47%|████▋     | 166/354 [14:59<16:58,  5.42s/it][A

	loss_cls: tensor(0.6747, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5011, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1758, device='cuda:0', grad_fn=<AddBackward0>)



 47%|████▋     | 167/354 [15:05<16:55,  5.43s/it][A

	loss_cls: tensor(0.4002, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1433, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5435, device='cuda:0', grad_fn=<AddBackward0>)



 47%|████▋     | 168/354 [15:10<16:46,  5.41s/it][A

	loss_cls: tensor(0.5121, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1127, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6248, device='cuda:0', grad_fn=<AddBackward0>)



 48%|████▊     | 169/354 [15:15<16:45,  5.43s/it][A

	loss_cls: tensor(0.3971, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3292, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7263, device='cuda:0', grad_fn=<AddBackward0>)



 48%|████▊     | 170/354 [15:21<16:37,  5.42s/it][A

	loss_cls: tensor(0.6203, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1818, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8021, device='cuda:0', grad_fn=<AddBackward0>)



 48%|████▊     | 171/354 [15:26<16:34,  5.44s/it][A

	loss_cls: tensor(0.4583, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1587, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6170, device='cuda:0', grad_fn=<AddBackward0>)



 49%|████▊     | 172/354 [15:32<16:26,  5.42s/it][A

	loss_cls: tensor(0.5319, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0982, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6301, device='cuda:0', grad_fn=<AddBackward0>)



 49%|████▉     | 173/354 [15:37<16:18,  5.41s/it][A

	loss_cls: tensor(0.6826, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2190, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9016, device='cuda:0', grad_fn=<AddBackward0>)



 49%|████▉     | 174/354 [15:43<16:18,  5.43s/it][A

	loss_cls: tensor(0.8201, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1511, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9712, device='cuda:0', grad_fn=<AddBackward0>)



 49%|████▉     | 175/354 [15:48<16:08,  5.41s/it][A

	loss_cls: tensor(0.6042, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1498, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7540, device='cuda:0', grad_fn=<AddBackward0>)



 50%|████▉     | 176/354 [15:53<16:07,  5.44s/it][A

	loss_cls: tensor(0.6312, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4346, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0658, device='cuda:0', grad_fn=<AddBackward0>)



 50%|█████     | 177/354 [15:59<16:00,  5.43s/it][A

	loss_cls: tensor(0.4867, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1827, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6694, device='cuda:0', grad_fn=<AddBackward0>)



 50%|█████     | 178/354 [16:04<15:58,  5.45s/it][A

	loss_cls: tensor(0.4681, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0699, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5380, device='cuda:0', grad_fn=<AddBackward0>)



 51%|█████     | 179/354 [16:10<15:50,  5.43s/it][A

	loss_cls: tensor(0.6348, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3635, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9983, device='cuda:0', grad_fn=<AddBackward0>)



 51%|█████     | 180/354 [16:15<15:42,  5.42s/it][A

	loss_cls: tensor(0.4911, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2706, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7616, device='cuda:0', grad_fn=<AddBackward0>)



 51%|█████     | 181/354 [16:21<15:39,  5.43s/it][A

	loss_cls: tensor(0.5798, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0588, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6386, device='cuda:0', grad_fn=<AddBackward0>)



 51%|█████▏    | 182/354 [16:26<15:31,  5.42s/it][A

	loss_cls: tensor(0.6239, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1383, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7621, device='cuda:0', grad_fn=<AddBackward0>)



 52%|█████▏    | 183/354 [16:31<15:29,  5.43s/it][A

	loss_cls: tensor(0.6040, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2460, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8500, device='cuda:0', grad_fn=<AddBackward0>)



 52%|█████▏    | 184/354 [16:37<15:21,  5.42s/it][A

	loss_cls: tensor(0.6520, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2979, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9500, device='cuda:0', grad_fn=<AddBackward0>)



 52%|█████▏    | 185/354 [16:42<15:19,  5.44s/it][A

	loss_cls: tensor(0.4669, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1806, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6475, device='cuda:0', grad_fn=<AddBackward0>)



 53%|█████▎    | 186/354 [16:48<15:10,  5.42s/it][A

	loss_cls: tensor(0.4633, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1467, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6100, device='cuda:0', grad_fn=<AddBackward0>)



 53%|█████▎    | 187/354 [16:53<15:02,  5.41s/it][A

	loss_cls: tensor(0.4296, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3505, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7801, device='cuda:0', grad_fn=<AddBackward0>)



 53%|█████▎    | 188/354 [16:59<14:58,  5.41s/it][A

	loss_cls: tensor(0.8724, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3168, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1892, device='cuda:0', grad_fn=<AddBackward0>)



 53%|█████▎    | 189/354 [17:04<14:52,  5.41s/it][A

	loss_cls: tensor(0.7533, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0885, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8418, device='cuda:0', grad_fn=<AddBackward0>)



 54%|█████▎    | 190/354 [17:09<14:50,  5.43s/it][A

	loss_cls: tensor(0.6497, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1460, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7958, device='cuda:0', grad_fn=<AddBackward0>)



 54%|█████▍    | 191/354 [17:15<14:43,  5.42s/it][A

	loss_cls: tensor(0.5326, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2000, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7326, device='cuda:0', grad_fn=<AddBackward0>)



 54%|█████▍    | 192/354 [17:20<14:39,  5.43s/it][A

	loss_cls: tensor(0.4747, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2658, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7405, device='cuda:0', grad_fn=<AddBackward0>)



 55%|█████▍    | 193/354 [17:26<14:32,  5.42s/it][A

	loss_cls: tensor(0.6188, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1324, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7513, device='cuda:0', grad_fn=<AddBackward0>)



 55%|█████▍    | 194/354 [17:31<14:23,  5.40s/it][A

	loss_cls: tensor(0.7644, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3660, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1304, device='cuda:0', grad_fn=<AddBackward0>)



 55%|█████▌    | 195/354 [17:36<14:21,  5.42s/it][A

	loss_cls: tensor(0.4216, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1895, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6111, device='cuda:0', grad_fn=<AddBackward0>)



 55%|█████▌    | 196/354 [17:42<14:14,  5.41s/it][A

	loss_cls: tensor(1.2084, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1768, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.3852, device='cuda:0', grad_fn=<AddBackward0>)



 56%|█████▌    | 197/354 [17:47<14:12,  5.43s/it][A

	loss_cls: tensor(0.5306, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1110, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6417, device='cuda:0', grad_fn=<AddBackward0>)



 56%|█████▌    | 198/354 [17:53<14:05,  5.42s/it][A

	loss_cls: tensor(0.5572, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2872, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8444, device='cuda:0', grad_fn=<AddBackward0>)



 56%|█████▌    | 199/354 [17:58<14:02,  5.44s/it][A

	loss_cls: tensor(0.6004, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2296, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8299, device='cuda:0', grad_fn=<AddBackward0>)



 56%|█████▋    | 200/354 [18:04<13:53,  5.41s/it][A

	loss_cls: tensor(0.5095, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1122, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6217, device='cuda:0', grad_fn=<AddBackward0>)



 57%|█████▋    | 201/354 [18:09<13:45,  5.40s/it][A

	loss_cls: tensor(0.4657, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0408, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5066, device='cuda:0', grad_fn=<AddBackward0>)



 57%|█████▋    | 202/354 [18:14<13:43,  5.42s/it][A

	loss_cls: tensor(0.5880, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1435, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7314, device='cuda:0', grad_fn=<AddBackward0>)



 57%|█████▋    | 203/354 [18:20<13:38,  5.42s/it][A

	loss_cls: tensor(0.3178, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1270, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4448, device='cuda:0', grad_fn=<AddBackward0>)



 58%|█████▊    | 204/354 [18:25<13:35,  5.44s/it][A

	loss_cls: tensor(0.5043, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1179, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6222, device='cuda:0', grad_fn=<AddBackward0>)



 58%|█████▊    | 205/354 [18:31<13:27,  5.42s/it][A

	loss_cls: tensor(0.8630, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5948, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.4578, device='cuda:0', grad_fn=<AddBackward0>)



 58%|█████▊    | 206/354 [18:36<13:23,  5.43s/it][A

	loss_cls: tensor(0.3962, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3874, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7837, device='cuda:0', grad_fn=<AddBackward0>)



 58%|█████▊    | 207/354 [18:41<13:16,  5.42s/it][A

	loss_cls: tensor(0.9705, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3538, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.3242, device='cuda:0', grad_fn=<AddBackward0>)



 59%|█████▉    | 208/354 [18:47<13:08,  5.40s/it][A

	loss_cls: tensor(0.4714, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2401, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7115, device='cuda:0', grad_fn=<AddBackward0>)



 59%|█████▉    | 209/354 [18:52<13:07,  5.43s/it][A

	loss_cls: tensor(0.3661, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0432, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4093, device='cuda:0', grad_fn=<AddBackward0>)



 59%|█████▉    | 210/354 [18:58<13:02,  5.43s/it][A

	loss_cls: tensor(0.7652, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1117, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8769, device='cuda:0', grad_fn=<AddBackward0>)



 60%|█████▉    | 211/354 [19:03<13:00,  5.46s/it][A

	loss_cls: tensor(1.0036, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3559, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.3595, device='cuda:0', grad_fn=<AddBackward0>)



 60%|█████▉    | 212/354 [19:09<12:53,  5.44s/it][A

	loss_cls: tensor(0.7391, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3443, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0834, device='cuda:0', grad_fn=<AddBackward0>)



 60%|██████    | 213/354 [19:14<12:49,  5.46s/it][A

	loss_cls: tensor(0.7628, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4101, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1728, device='cuda:0', grad_fn=<AddBackward0>)



 60%|██████    | 214/354 [19:20<12:45,  5.47s/it][A

	loss_cls: tensor(0.5698, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2265, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7963, device='cuda:0', grad_fn=<AddBackward0>)



 61%|██████    | 215/354 [19:25<12:37,  5.45s/it][A

	loss_cls: tensor(0.4706, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2073, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6779, device='cuda:0', grad_fn=<AddBackward0>)



 61%|██████    | 216/354 [19:31<12:35,  5.47s/it][A

	loss_cls: tensor(0.5175, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2695, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7870, device='cuda:0', grad_fn=<AddBackward0>)



 61%|██████▏   | 217/354 [19:36<12:29,  5.47s/it][A

	loss_cls: tensor(0.4211, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2129, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6341, device='cuda:0', grad_fn=<AddBackward0>)



 62%|██████▏   | 218/354 [19:42<12:26,  5.49s/it][A

	loss_cls: tensor(0.7558, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1597, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9155, device='cuda:0', grad_fn=<AddBackward0>)



 62%|██████▏   | 219/354 [19:47<12:17,  5.46s/it][A

	loss_cls: tensor(0.5775, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3586, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9362, device='cuda:0', grad_fn=<AddBackward0>)



 62%|██████▏   | 220/354 [19:53<12:12,  5.46s/it][A

	loss_cls: tensor(0.6868, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2177, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9045, device='cuda:0', grad_fn=<AddBackward0>)



 62%|██████▏   | 221/354 [19:58<12:02,  5.43s/it][A

	loss_cls: tensor(0.6565, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2301, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8866, device='cuda:0', grad_fn=<AddBackward0>)



 63%|██████▎   | 222/354 [20:03<11:57,  5.43s/it][A

	loss_cls: tensor(0.8378, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0793, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9171, device='cuda:0', grad_fn=<AddBackward0>)



 63%|██████▎   | 223/354 [20:09<11:50,  5.42s/it][A

	loss_cls: tensor(0.4950, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2063, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7013, device='cuda:0', grad_fn=<AddBackward0>)



 63%|██████▎   | 224/354 [20:14<11:43,  5.41s/it][A

	loss_cls: tensor(0.6494, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3315, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9809, device='cuda:0', grad_fn=<AddBackward0>)



 64%|██████▎   | 225/354 [20:20<11:40,  5.43s/it][A

	loss_cls: tensor(0.6706, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3602, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0309, device='cuda:0', grad_fn=<AddBackward0>)



 64%|██████▍   | 226/354 [20:25<11:33,  5.42s/it][A

	loss_cls: tensor(0.5421, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1016, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6437, device='cuda:0', grad_fn=<AddBackward0>)



 64%|██████▍   | 227/354 [20:30<11:30,  5.43s/it][A

	loss_cls: tensor(0.6636, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1044, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7680, device='cuda:0', grad_fn=<AddBackward0>)



 64%|██████▍   | 228/354 [20:36<11:21,  5.41s/it][A

	loss_cls: tensor(0.5155, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4173, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9328, device='cuda:0', grad_fn=<AddBackward0>)



 65%|██████▍   | 229/354 [20:41<11:18,  5.43s/it][A

	loss_cls: tensor(0.4739, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1369, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6108, device='cuda:0', grad_fn=<AddBackward0>)



 65%|██████▍   | 230/354 [20:47<11:11,  5.42s/it][A

	loss_cls: tensor(0.7342, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1747, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9088, device='cuda:0', grad_fn=<AddBackward0>)



 65%|██████▌   | 231/354 [20:52<11:05,  5.41s/it][A

	loss_cls: tensor(0.7645, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0827, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8472, device='cuda:0', grad_fn=<AddBackward0>)



 66%|██████▌   | 232/354 [20:57<11:02,  5.43s/it][A

	loss_cls: tensor(0.7144, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2408, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9552, device='cuda:0', grad_fn=<AddBackward0>)



 66%|██████▌   | 233/354 [21:03<10:56,  5.42s/it][A

	loss_cls: tensor(0.7493, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5282, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2775, device='cuda:0', grad_fn=<AddBackward0>)



 66%|██████▌   | 234/354 [21:08<10:51,  5.43s/it][A

	loss_cls: tensor(0.4634, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3257, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7892, device='cuda:0', grad_fn=<AddBackward0>)



 66%|██████▋   | 235/354 [21:14<10:43,  5.41s/it][A

	loss_cls: tensor(0.5649, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1121, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6770, device='cuda:0', grad_fn=<AddBackward0>)



 67%|██████▋   | 236/354 [21:19<10:40,  5.43s/it][A

	loss_cls: tensor(0.7751, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1282, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9033, device='cuda:0', grad_fn=<AddBackward0>)



 67%|██████▋   | 237/354 [21:25<10:34,  5.42s/it][A

	loss_cls: tensor(0.5930, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2461, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8391, device='cuda:0', grad_fn=<AddBackward0>)



 67%|██████▋   | 238/354 [21:30<10:27,  5.41s/it][A

	loss_cls: tensor(0.4063, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0828, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4892, device='cuda:0', grad_fn=<AddBackward0>)



 68%|██████▊   | 239/354 [21:35<10:24,  5.43s/it][A

	loss_cls: tensor(0.6618, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3286, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9903, device='cuda:0', grad_fn=<AddBackward0>)



 68%|██████▊   | 240/354 [21:41<10:17,  5.42s/it][A

	loss_cls: tensor(0.4172, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0515, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4687, device='cuda:0', grad_fn=<AddBackward0>)



 68%|██████▊   | 241/354 [21:46<10:14,  5.43s/it][A

	loss_cls: tensor(0.6340, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4040, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0380, device='cuda:0', grad_fn=<AddBackward0>)



 68%|██████▊   | 242/354 [21:52<10:07,  5.42s/it][A

	loss_cls: tensor(0.7595, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2624, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0219, device='cuda:0', grad_fn=<AddBackward0>)



 69%|██████▊   | 243/354 [21:57<10:03,  5.44s/it][A

	loss_cls: tensor(0.5702, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1678, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7380, device='cuda:0', grad_fn=<AddBackward0>)



 69%|██████▉   | 244/354 [22:03<09:55,  5.41s/it][A

	loss_cls: tensor(0.4671, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1385, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6056, device='cuda:0', grad_fn=<AddBackward0>)



 69%|██████▉   | 245/354 [22:08<09:48,  5.40s/it][A

	loss_cls: tensor(0.4999, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2322, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7321, device='cuda:0', grad_fn=<AddBackward0>)



 69%|██████▉   | 246/354 [22:13<09:45,  5.42s/it][A

	loss_cls: tensor(0.4030, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0628, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4658, device='cuda:0', grad_fn=<AddBackward0>)



 70%|██████▉   | 247/354 [22:19<09:38,  5.41s/it][A

	loss_cls: tensor(0.9041, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3588, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2629, device='cuda:0', grad_fn=<AddBackward0>)



 70%|███████   | 248/354 [22:24<09:34,  5.42s/it][A

	loss_cls: tensor(0.5272, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2247, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7519, device='cuda:0', grad_fn=<AddBackward0>)



 70%|███████   | 249/354 [22:30<09:27,  5.41s/it][A

	loss_cls: tensor(0.6764, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2790, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9554, device='cuda:0', grad_fn=<AddBackward0>)



 71%|███████   | 250/354 [22:35<09:24,  5.43s/it][A

	loss_cls: tensor(0.4739, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2487, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7226, device='cuda:0', grad_fn=<AddBackward0>)



 71%|███████   | 251/354 [22:40<09:18,  5.42s/it][A

	loss_cls: tensor(0.5302, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1931, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7233, device='cuda:0', grad_fn=<AddBackward0>)



 71%|███████   | 252/354 [22:46<09:12,  5.41s/it][A

	loss_cls: tensor(0.8923, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2200, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1123, device='cuda:0', grad_fn=<AddBackward0>)



 71%|███████▏  | 253/354 [22:51<09:07,  5.43s/it][A

	loss_cls: tensor(0.5436, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2927, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8363, device='cuda:0', grad_fn=<AddBackward0>)



 72%|███████▏  | 254/354 [22:57<09:01,  5.41s/it][A

	loss_cls: tensor(0.3569, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2409, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5978, device='cuda:0', grad_fn=<AddBackward0>)



 72%|███████▏  | 255/354 [23:02<08:56,  5.42s/it][A

	loss_cls: tensor(0.6406, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2479, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8884, device='cuda:0', grad_fn=<AddBackward0>)



 72%|███████▏  | 256/354 [23:07<08:49,  5.40s/it][A

	loss_cls: tensor(0.4286, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0458, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4744, device='cuda:0', grad_fn=<AddBackward0>)



 73%|███████▎  | 257/354 [23:13<08:45,  5.42s/it][A

	loss_cls: tensor(1.0475, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3926, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.4401, device='cuda:0', grad_fn=<AddBackward0>)



 73%|███████▎  | 258/354 [23:18<08:39,  5.41s/it][A

	loss_cls: tensor(0.4846, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4847, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9693, device='cuda:0', grad_fn=<AddBackward0>)



 73%|███████▎  | 259/354 [23:24<08:33,  5.40s/it][A

	loss_cls: tensor(0.4575, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2123, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6698, device='cuda:0', grad_fn=<AddBackward0>)



 73%|███████▎  | 260/354 [23:29<08:29,  5.42s/it][A

	loss_cls: tensor(0.7305, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3173, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0478, device='cuda:0', grad_fn=<AddBackward0>)



 74%|███████▎  | 261/354 [23:35<08:21,  5.39s/it][A

	loss_cls: tensor(0.5242, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1214, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6456, device='cuda:0', grad_fn=<AddBackward0>)



 74%|███████▍  | 262/354 [23:40<08:18,  5.41s/it][A

	loss_cls: tensor(0.7300, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0653, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7953, device='cuda:0', grad_fn=<AddBackward0>)



 74%|███████▍  | 263/354 [23:45<08:11,  5.40s/it][A

	loss_cls: tensor(0.8788, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3071, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1859, device='cuda:0', grad_fn=<AddBackward0>)



 75%|███████▍  | 264/354 [23:51<08:08,  5.42s/it][A

	loss_cls: tensor(0.6976, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0819, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7795, device='cuda:0', grad_fn=<AddBackward0>)



 75%|███████▍  | 265/354 [23:56<08:01,  5.41s/it][A

	loss_cls: tensor(0.5607, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0960, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6567, device='cuda:0', grad_fn=<AddBackward0>)



 75%|███████▌  | 266/354 [24:02<07:55,  5.40s/it][A

	loss_cls: tensor(0.5411, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0429, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5840, device='cuda:0', grad_fn=<AddBackward0>)



 75%|███████▌  | 267/354 [24:07<07:52,  5.43s/it][A

	loss_cls: tensor(0.5834, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2460, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8294, device='cuda:0', grad_fn=<AddBackward0>)



 76%|███████▌  | 268/354 [24:12<07:44,  5.40s/it][A

	loss_cls: tensor(0.5752, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3825, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9576, device='cuda:0', grad_fn=<AddBackward0>)



 76%|███████▌  | 269/354 [24:18<07:41,  5.42s/it][A

	loss_cls: tensor(0.5482, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4184, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9666, device='cuda:0', grad_fn=<AddBackward0>)



 76%|███████▋  | 270/354 [24:23<07:34,  5.41s/it][A

	loss_cls: tensor(0.5167, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1005, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6171, device='cuda:0', grad_fn=<AddBackward0>)



 77%|███████▋  | 271/354 [24:29<07:31,  5.43s/it][A

	loss_cls: tensor(0.5391, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0734, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6125, device='cuda:0', grad_fn=<AddBackward0>)



 77%|███████▋  | 272/354 [24:34<07:24,  5.42s/it][A

	loss_cls: tensor(0.7249, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1518, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8767, device='cuda:0', grad_fn=<AddBackward0>)



 77%|███████▋  | 273/354 [24:40<07:17,  5.41s/it][A

	loss_cls: tensor(0.5958, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4726, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0684, device='cuda:0', grad_fn=<AddBackward0>)



 77%|███████▋  | 274/354 [24:45<07:13,  5.41s/it][A

	loss_cls: tensor(0.2535, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0256, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.2791, device='cuda:0', grad_fn=<AddBackward0>)



 78%|███████▊  | 275/354 [24:50<07:06,  5.40s/it][A

	loss_cls: tensor(0.3999, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1839, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5838, device='cuda:0', grad_fn=<AddBackward0>)



 78%|███████▊  | 276/354 [24:56<07:02,  5.42s/it][A

	loss_cls: tensor(0.5261, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2977, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8238, device='cuda:0', grad_fn=<AddBackward0>)



 78%|███████▊  | 277/354 [25:01<06:56,  5.42s/it][A

	loss_cls: tensor(0.5891, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2725, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8617, device='cuda:0', grad_fn=<AddBackward0>)



 79%|███████▊  | 278/354 [25:07<06:52,  5.42s/it][A

	loss_cls: tensor(0.2661, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0435, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.3096, device='cuda:0', grad_fn=<AddBackward0>)



 79%|███████▉  | 279/354 [25:12<06:45,  5.41s/it][A

	loss_cls: tensor(0.9059, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5095, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.4154, device='cuda:0', grad_fn=<AddBackward0>)



 79%|███████▉  | 280/354 [25:17<06:39,  5.40s/it][A

	loss_cls: tensor(0.5880, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1820, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7700, device='cuda:0', grad_fn=<AddBackward0>)



 79%|███████▉  | 281/354 [25:23<06:34,  5.41s/it][A

	loss_cls: tensor(0.3270, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0546, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.3816, device='cuda:0', grad_fn=<AddBackward0>)



 80%|███████▉  | 282/354 [25:28<06:30,  5.42s/it][A

	loss_cls: tensor(0.6458, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3178, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9636, device='cuda:0', grad_fn=<AddBackward0>)



 80%|███████▉  | 283/354 [25:34<06:27,  5.46s/it][A

	loss_cls: tensor(0.4698, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1616, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6314, device='cuda:0', grad_fn=<AddBackward0>)



 80%|████████  | 284/354 [25:39<06:21,  5.45s/it][A

	loss_cls: tensor(0.5324, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2360, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7684, device='cuda:0', grad_fn=<AddBackward0>)



 81%|████████  | 285/354 [25:45<06:18,  5.49s/it][A

	loss_cls: tensor(1.1309, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1918, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.3227, device='cuda:0', grad_fn=<AddBackward0>)



 81%|████████  | 286/354 [25:50<06:12,  5.48s/it][A

	loss_cls: tensor(0.4418, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1783, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6201, device='cuda:0', grad_fn=<AddBackward0>)



 81%|████████  | 287/354 [25:56<06:06,  5.47s/it][A

	loss_cls: tensor(0.5338, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1741, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7079, device='cuda:0', grad_fn=<AddBackward0>)



 81%|████████▏ | 288/354 [26:01<06:02,  5.49s/it][A

	loss_cls: tensor(0.8352, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1244, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9596, device='cuda:0', grad_fn=<AddBackward0>)



 82%|████████▏ | 289/354 [26:07<05:56,  5.49s/it][A

	loss_cls: tensor(0.7615, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2489, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0104, device='cuda:0', grad_fn=<AddBackward0>)



 82%|████████▏ | 290/354 [26:12<05:52,  5.51s/it][A

	loss_cls: tensor(0.4964, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2785, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7750, device='cuda:0', grad_fn=<AddBackward0>)



 82%|████████▏ | 291/354 [26:18<05:47,  5.52s/it][A

	loss_cls: tensor(0.4104, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5129, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9233, device='cuda:0', grad_fn=<AddBackward0>)



 82%|████████▏ | 292/354 [26:23<05:42,  5.52s/it][A

	loss_cls: tensor(0.5139, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2143, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7282, device='cuda:0', grad_fn=<AddBackward0>)



 83%|████████▎ | 293/354 [26:29<05:35,  5.50s/it][A

	loss_cls: tensor(0.5140, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1698, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6838, device='cuda:0', grad_fn=<AddBackward0>)



 83%|████████▎ | 294/354 [26:34<05:28,  5.48s/it][A

	loss_cls: tensor(0.4665, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5401, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0065, device='cuda:0', grad_fn=<AddBackward0>)



 83%|████████▎ | 295/354 [26:40<05:24,  5.50s/it][A

	loss_cls: tensor(0.7203, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1233, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8436, device='cuda:0', grad_fn=<AddBackward0>)



 84%|████████▎ | 296/354 [26:45<05:18,  5.50s/it][A

	loss_cls: tensor(0.6297, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1936, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8233, device='cuda:0', grad_fn=<AddBackward0>)



 84%|████████▍ | 297/354 [26:51<05:14,  5.52s/it][A

	loss_cls: tensor(0.7889, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1823, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9712, device='cuda:0', grad_fn=<AddBackward0>)



 84%|████████▍ | 298/354 [26:56<05:08,  5.50s/it][A

	loss_cls: tensor(0.5514, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2137, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7652, device='cuda:0', grad_fn=<AddBackward0>)



 84%|████████▍ | 299/354 [27:02<05:03,  5.52s/it][A

	loss_cls: tensor(0.6477, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2938, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9415, device='cuda:0', grad_fn=<AddBackward0>)



 85%|████████▍ | 300/354 [27:07<04:57,  5.50s/it][A

	loss_cls: tensor(0.4525, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4440, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8965, device='cuda:0', grad_fn=<AddBackward0>)



 85%|████████▌ | 301/354 [27:13<04:50,  5.48s/it][A

	loss_cls: tensor(0.5841, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2836, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8677, device='cuda:0', grad_fn=<AddBackward0>)



 85%|████████▌ | 302/354 [27:18<04:46,  5.51s/it][A

	loss_cls: tensor(0.7437, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0829, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8266, device='cuda:0', grad_fn=<AddBackward0>)



 86%|████████▌ | 303/354 [27:24<04:40,  5.50s/it][A

	loss_cls: tensor(0.4534, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1497, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6030, device='cuda:0', grad_fn=<AddBackward0>)



 86%|████████▌ | 304/354 [27:29<04:28,  5.37s/it][A

	loss_cls: tensor(0.7681, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1854, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9534, device='cuda:0', grad_fn=<AddBackward0>)



 86%|████████▌ | 305/354 [27:34<04:18,  5.27s/it][A

	loss_cls: tensor(0.6879, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3486, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0365, device='cuda:0', grad_fn=<AddBackward0>)



 86%|████████▋ | 306/354 [27:39<04:13,  5.27s/it][A

	loss_cls: tensor(0.6299, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1546, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7845, device='cuda:0', grad_fn=<AddBackward0>)



 87%|████████▋ | 307/354 [27:45<04:10,  5.33s/it][A

	loss_cls: tensor(0.7525, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1211, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8735, device='cuda:0', grad_fn=<AddBackward0>)



 87%|████████▋ | 308/354 [27:50<04:06,  5.37s/it][A

	loss_cls: tensor(0.4031, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1863, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5894, device='cuda:0', grad_fn=<AddBackward0>)



 87%|████████▋ | 309/354 [27:56<04:04,  5.43s/it][A

	loss_cls: tensor(1.0442, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4307, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.4749, device='cuda:0', grad_fn=<AddBackward0>)



 88%|████████▊ | 310/354 [28:01<03:59,  5.44s/it][A

	loss_cls: tensor(0.5610, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2238, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7848, device='cuda:0', grad_fn=<AddBackward0>)



 88%|████████▊ | 311/354 [28:07<03:55,  5.48s/it][A

	loss_cls: tensor(0.8507, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4953, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.3461, device='cuda:0', grad_fn=<AddBackward0>)



 88%|████████▊ | 312/354 [28:12<03:50,  5.50s/it][A

	loss_cls: tensor(0.6820, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2374, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9193, device='cuda:0', grad_fn=<AddBackward0>)



 88%|████████▊ | 313/354 [28:18<03:44,  5.48s/it][A

	loss_cls: tensor(0.6226, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2076, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8301, device='cuda:0', grad_fn=<AddBackward0>)



 89%|████████▊ | 314/354 [28:23<03:37,  5.44s/it][A

	loss_cls: tensor(0.4874, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2042, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6916, device='cuda:0', grad_fn=<AddBackward0>)



 89%|████████▉ | 315/354 [28:28<03:32,  5.44s/it][A

	loss_cls: tensor(0.3719, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0744, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4462, device='cuda:0', grad_fn=<AddBackward0>)



 89%|████████▉ | 316/354 [28:34<03:28,  5.48s/it][A

	loss_cls: tensor(0.5702, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1931, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7633, device='cuda:0', grad_fn=<AddBackward0>)



 90%|████████▉ | 317/354 [28:39<03:18,  5.36s/it][A

	loss_cls: tensor(0.6691, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.8585, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.5276, device='cuda:0', grad_fn=<AddBackward0>)



 90%|████████▉ | 318/354 [28:44<03:11,  5.32s/it][A

	loss_cls: tensor(0.4308, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1684, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5992, device='cuda:0', grad_fn=<AddBackward0>)



 90%|█████████ | 319/354 [28:50<03:07,  5.37s/it][A

	loss_cls: tensor(0.6017, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1985, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8003, device='cuda:0', grad_fn=<AddBackward0>)



 90%|█████████ | 320/354 [28:55<03:04,  5.42s/it][A

	loss_cls: tensor(0.5740, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2782, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8521, device='cuda:0', grad_fn=<AddBackward0>)



 91%|█████████ | 321/354 [29:01<02:59,  5.43s/it][A

	loss_cls: tensor(0.5497, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1776, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7273, device='cuda:0', grad_fn=<AddBackward0>)



 91%|█████████ | 322/354 [29:06<02:54,  5.46s/it][A

	loss_cls: tensor(0.8726, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2920, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1645, device='cuda:0', grad_fn=<AddBackward0>)



 91%|█████████ | 323/354 [29:12<02:49,  5.47s/it][A

	loss_cls: tensor(0.6346, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3849, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0195, device='cuda:0', grad_fn=<AddBackward0>)



 92%|█████████▏| 324/354 [29:17<02:44,  5.47s/it][A

	loss_cls: tensor(0.3392, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4024, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7416, device='cuda:0', grad_fn=<AddBackward0>)



 92%|█████████▏| 325/354 [29:23<02:39,  5.50s/it][A

	loss_cls: tensor(0.8428, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1326, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9754, device='cuda:0', grad_fn=<AddBackward0>)



 92%|█████████▏| 326/354 [29:28<02:33,  5.48s/it][A

	loss_cls: tensor(0.6789, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3560, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0349, device='cuda:0', grad_fn=<AddBackward0>)



 92%|█████████▏| 327/354 [29:34<02:28,  5.50s/it][A

	loss_cls: tensor(0.5885, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2486, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8371, device='cuda:0', grad_fn=<AddBackward0>)



 93%|█████████▎| 328/354 [29:39<02:22,  5.50s/it][A

	loss_cls: tensor(0.4504, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4060, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8564, device='cuda:0', grad_fn=<AddBackward0>)



 93%|█████████▎| 329/354 [29:45<02:17,  5.50s/it][A

	loss_cls: tensor(0.6452, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3016, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9467, device='cuda:0', grad_fn=<AddBackward0>)



 93%|█████████▎| 330/354 [29:50<02:11,  5.50s/it][A

	loss_cls: tensor(0.6577, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3894, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0471, device='cuda:0', grad_fn=<AddBackward0>)



 94%|█████████▎| 331/354 [29:56<02:06,  5.49s/it][A

	loss_cls: tensor(0.5989, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1620, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7609, device='cuda:0', grad_fn=<AddBackward0>)



 94%|█████████▍| 332/354 [30:01<02:01,  5.51s/it][A

	loss_cls: tensor(0.7586, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2156, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9742, device='cuda:0', grad_fn=<AddBackward0>)



 94%|█████████▍| 333/354 [30:07<01:55,  5.50s/it][A

	loss_cls: tensor(0.4793, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3709, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8502, device='cuda:0', grad_fn=<AddBackward0>)



 94%|█████████▍| 334/354 [30:12<01:50,  5.52s/it][A

	loss_cls: tensor(0.8610, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1056, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9666, device='cuda:0', grad_fn=<AddBackward0>)



 95%|█████████▍| 335/354 [30:18<01:44,  5.51s/it][A

	loss_cls: tensor(0.6904, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1608, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8513, device='cuda:0', grad_fn=<AddBackward0>)



 95%|█████████▍| 336/354 [30:23<01:39,  5.52s/it][A

	loss_cls: tensor(0.6437, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.6130, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2568, device='cuda:0', grad_fn=<AddBackward0>)



 95%|█████████▌| 337/354 [30:29<01:33,  5.50s/it][A

	loss_cls: tensor(0.6098, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3130, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9228, device='cuda:0', grad_fn=<AddBackward0>)



 95%|█████████▌| 338/354 [30:34<01:27,  5.49s/it][A

	loss_cls: tensor(0.6175, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1743, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7918, device='cuda:0', grad_fn=<AddBackward0>)



 96%|█████████▌| 339/354 [30:40<01:22,  5.51s/it][A

	loss_cls: tensor(0.8246, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0608, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8854, device='cuda:0', grad_fn=<AddBackward0>)



 96%|█████████▌| 340/354 [30:45<01:17,  5.50s/it][A

	loss_cls: tensor(0.5004, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4087, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9091, device='cuda:0', grad_fn=<AddBackward0>)



 96%|█████████▋| 341/354 [30:51<01:11,  5.51s/it][A

	loss_cls: tensor(0.5720, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4409, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0129, device='cuda:0', grad_fn=<AddBackward0>)



 97%|█████████▋| 342/354 [30:56<01:05,  5.50s/it][A

	loss_cls: tensor(0.6134, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3139, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9273, device='cuda:0', grad_fn=<AddBackward0>)



 97%|█████████▋| 343/354 [31:02<01:00,  5.52s/it][A

	loss_cls: tensor(0.9754, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3093, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2847, device='cuda:0', grad_fn=<AddBackward0>)



 97%|█████████▋| 344/354 [31:08<00:55,  5.51s/it][A

	loss_cls: tensor(0.7384, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2206, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9590, device='cuda:0', grad_fn=<AddBackward0>)



 97%|█████████▋| 345/354 [31:13<00:49,  5.50s/it][A

	loss_cls: tensor(0.5619, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3013, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8633, device='cuda:0', grad_fn=<AddBackward0>)



 98%|█████████▊| 346/354 [31:19<00:44,  5.51s/it][A

	loss_cls: tensor(0.6970, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3162, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0132, device='cuda:0', grad_fn=<AddBackward0>)



 98%|█████████▊| 347/354 [31:24<00:38,  5.50s/it][A

	loss_cls: tensor(0.7451, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2045, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9496, device='cuda:0', grad_fn=<AddBackward0>)



 98%|█████████▊| 348/354 [31:30<00:33,  5.53s/it][A

	loss_cls: tensor(0.5320, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2869, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8188, device='cuda:0', grad_fn=<AddBackward0>)



 99%|█████████▊| 349/354 [31:35<00:27,  5.51s/it][A

	loss_cls: tensor(0.6203, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1815, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8019, device='cuda:0', grad_fn=<AddBackward0>)



 99%|█████████▉| 350/354 [31:41<00:22,  5.53s/it][A

	loss_cls: tensor(0.5127, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1742, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6869, device='cuda:0', grad_fn=<AddBackward0>)



 99%|█████████▉| 351/354 [31:46<00:16,  5.51s/it][A

	loss_cls: tensor(0.6883, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1532, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8415, device='cuda:0', grad_fn=<AddBackward0>)



 99%|█████████▉| 352/354 [31:52<00:10,  5.49s/it][A

	loss_cls: tensor(0.6085, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2564, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8649, device='cuda:0', grad_fn=<AddBackward0>)



100%|█████████▉| 353/354 [31:57<00:05,  5.51s/it][A

	loss_cls: tensor(0.5273, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1081, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6353, device='cuda:0', grad_fn=<AddBackward0>)



100%|██████████| 354/354 [31:59<00:00,  5.42s/it][A

  0%|          | 0/177 [00:00<?, ?it/s][A

	loss_cls: tensor(0.5182, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2405, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7586, device='cuda:0', grad_fn=<AddBackward0>)

	Training Loss: 0.8412091942493525

	Training cls acc: 0.6938559322033898

	Training cls prec: 0.5815313942962248

	Training cls rec: 0.6297915714229274

	Training cls f1: 0.5395128026344723

--
	Training ner acc: 0.9553328069992448

	Training ner prec: 0.2628327388188645

	Training ner rec: 0.27142676736609783

	Training ner f1: 0.26674576430924646

	Current Learning rate:  0.0006571428571428571



  1%|          | 1/177 [00:00<02:12,  1.33it/s][A
  1%|          | 2/177 [00:01<02:10,  1.34it/s][A
  2%|▏         | 3/177 [00:02<02:02,  1.42it/s][A
  2%|▏         | 4/177 [00:02<02:04,  1.38it/s][A
  3%|▎         | 5/177 [00:03<02:05,  1.37it/s][A
  3%|▎         | 6/177 [00:04<02:05,  1.37it/s][A
  4%|▍         | 7/177 [00:05<02:00,  1.41it/s][A
  5%|▍         | 8/177 [00:05<02:02,  1.38it/s][A
  5%|▌         | 9/177 [00:06<02:02,  1.37it/s][A
  6%|▌         | 10/177 [00:07<01:58,  1.41it/s][A
  6%|▌         | 11/177 [00:07<01:58,  1.40it/s][A
  7%|▋         | 12/177 [00:08<01:59,  1.38it/s][A
  7%|▋         | 13/177 [00:09<01:55,  1.42it/s][A
  8%|▊         | 14/177 [00:09<01:51,  1.47it/s][A
  8%|▊         | 15/177 [00:10<01:49,  1.48it/s][A
  9%|▉         | 16/177 [00:11<01:48,  1.48it/s][A
 10%|▉         | 17/177 [00:11<01:47,  1.49it/s][A
 10%|█         | 18/177 [00:12<01:44,  1.52it/s][A
 11%|█         | 19/177 [00:13<01:44,  1.51it/s][A
 11%|█▏        | 20/


	Validation Loss: 0.8390363345375169

	Validation cls acc: 0.5826271186440678

	Validation cls prec: 0.5804681194511703

	Validation cls rec: 0.5440207156308852

	Validation cls f1: 0.4786450837298295

--
	Validation ner acc: 0.9549760609734016

	Validation ner prec: 0.40427251737763215

	Validation ner rec: 0.4146892655367232

	Validation ner f1: 0.4092846031067912



  0%|          | 1/354 [00:05<31:39,  5.38s/it][A

	loss_cls: tensor(0.5765, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3405, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9170, device='cuda:0', grad_fn=<AddBackward0>)



  1%|          | 2/354 [00:10<31:49,  5.43s/it][A

	loss_cls: tensor(0.5851, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1087, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6938, device='cuda:0', grad_fn=<AddBackward0>)



  1%|          | 3/354 [00:16<31:40,  5.42s/it][A

	loss_cls: tensor(0.5704, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4877, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0581, device='cuda:0', grad_fn=<AddBackward0>)



  1%|          | 4/354 [00:21<31:28,  5.39s/it][A

	loss_cls: tensor(0.8596, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1984, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0580, device='cuda:0', grad_fn=<AddBackward0>)



  1%|▏         | 5/354 [00:27<31:33,  5.43s/it][A

	loss_cls: tensor(0.5915, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4932, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0847, device='cuda:0', grad_fn=<AddBackward0>)



  2%|▏         | 6/354 [00:32<31:23,  5.41s/it][A

	loss_cls: tensor(0.4261, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3395, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7657, device='cuda:0', grad_fn=<AddBackward0>)



  2%|▏         | 7/354 [00:37<31:29,  5.45s/it][A

	loss_cls: tensor(0.6421, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3642, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0063, device='cuda:0', grad_fn=<AddBackward0>)



  2%|▏         | 8/354 [00:43<31:21,  5.44s/it][A

	loss_cls: tensor(0.6818, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4066, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0884, device='cuda:0', grad_fn=<AddBackward0>)



  3%|▎         | 9/354 [00:48<31:17,  5.44s/it][A

	loss_cls: tensor(0.7523, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1738, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9261, device='cuda:0', grad_fn=<AddBackward0>)



  3%|▎         | 10/354 [00:54<31:02,  5.41s/it][A

	loss_cls: tensor(0.6732, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1986, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8717, device='cuda:0', grad_fn=<AddBackward0>)



  3%|▎         | 11/354 [00:59<30:58,  5.42s/it][A

	loss_cls: tensor(0.5867, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2385, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8252, device='cuda:0', grad_fn=<AddBackward0>)



  3%|▎         | 12/354 [01:05<30:51,  5.41s/it][A

	loss_cls: tensor(0.7657, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2677, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0335, device='cuda:0', grad_fn=<AddBackward0>)



  4%|▎         | 13/354 [01:10<30:44,  5.41s/it][A

	loss_cls: tensor(0.5881, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1546, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7427, device='cuda:0', grad_fn=<AddBackward0>)



  4%|▍         | 14/354 [01:15<30:46,  5.43s/it][A

	loss_cls: tensor(0.4373, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1685, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6058, device='cuda:0', grad_fn=<AddBackward0>)



  4%|▍         | 15/354 [01:21<30:33,  5.41s/it][A

	loss_cls: tensor(0.5654, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1939, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7593, device='cuda:0', grad_fn=<AddBackward0>)



  5%|▍         | 16/354 [01:26<30:34,  5.43s/it][A

	loss_cls: tensor(0.5579, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2756, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8335, device='cuda:0', grad_fn=<AddBackward0>)



  5%|▍         | 17/354 [01:32<30:25,  5.42s/it][A

	loss_cls: tensor(0.5943, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3104, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9048, device='cuda:0', grad_fn=<AddBackward0>)



  5%|▌         | 18/354 [01:37<30:24,  5.43s/it][A

	loss_cls: tensor(0.5429, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2677, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8107, device='cuda:0', grad_fn=<AddBackward0>)



  5%|▌         | 19/354 [01:43<30:16,  5.42s/it][A

	loss_cls: tensor(0.4763, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1473, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6236, device='cuda:0', grad_fn=<AddBackward0>)



  6%|▌         | 20/354 [01:48<30:04,  5.40s/it][A

	loss_cls: tensor(0.4319, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1467, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5786, device='cuda:0', grad_fn=<AddBackward0>)



  6%|▌         | 21/354 [01:53<30:04,  5.42s/it][A

	loss_cls: tensor(0.5874, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2697, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8571, device='cuda:0', grad_fn=<AddBackward0>)



  6%|▌         | 22/354 [01:59<29:56,  5.41s/it][A

	loss_cls: tensor(0.5514, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4503, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0017, device='cuda:0', grad_fn=<AddBackward0>)



  6%|▋         | 23/354 [02:04<29:55,  5.43s/it][A

	loss_cls: tensor(0.3186, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0453, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.3639, device='cuda:0', grad_fn=<AddBackward0>)



  7%|▋         | 24/354 [02:10<29:43,  5.40s/it][A

	loss_cls: tensor(0.6745, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3567, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0313, device='cuda:0', grad_fn=<AddBackward0>)



  7%|▋         | 25/354 [02:15<29:45,  5.43s/it][A

	loss_cls: tensor(0.5464, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0982, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6445, device='cuda:0', grad_fn=<AddBackward0>)



  7%|▋         | 26/354 [02:20<29:36,  5.42s/it][A

	loss_cls: tensor(0.4673, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0937, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5611, device='cuda:0', grad_fn=<AddBackward0>)



  8%|▊         | 27/354 [02:26<29:28,  5.41s/it][A

	loss_cls: tensor(0.7730, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2764, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0494, device='cuda:0', grad_fn=<AddBackward0>)



  8%|▊         | 28/354 [02:31<29:28,  5.43s/it][A

	loss_cls: tensor(0.7616, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2507, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0123, device='cuda:0', grad_fn=<AddBackward0>)



  8%|▊         | 29/354 [02:37<29:20,  5.42s/it][A

	loss_cls: tensor(0.6384, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5464, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1848, device='cuda:0', grad_fn=<AddBackward0>)



  8%|▊         | 30/354 [02:42<29:18,  5.43s/it][A

	loss_cls: tensor(0.3731, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2287, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6017, device='cuda:0', grad_fn=<AddBackward0>)



  9%|▉         | 31/354 [02:47<29:06,  5.41s/it][A

	loss_cls: tensor(0.6914, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4310, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1224, device='cuda:0', grad_fn=<AddBackward0>)



  9%|▉         | 32/354 [02:53<29:08,  5.43s/it][A

	loss_cls: tensor(0.3592, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1623, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5215, device='cuda:0', grad_fn=<AddBackward0>)



  9%|▉         | 33/354 [02:58<28:59,  5.42s/it][A

	loss_cls: tensor(0.8432, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1513, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9944, device='cuda:0', grad_fn=<AddBackward0>)



 10%|▉         | 34/354 [03:04<28:51,  5.41s/it][A

	loss_cls: tensor(0.4968, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0768, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5736, device='cuda:0', grad_fn=<AddBackward0>)



 10%|▉         | 35/354 [03:09<28:51,  5.43s/it][A

	loss_cls: tensor(0.9996, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1554, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1550, device='cuda:0', grad_fn=<AddBackward0>)



 10%|█         | 36/354 [03:15<28:40,  5.41s/it][A

	loss_cls: tensor(0.7491, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2850, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0341, device='cuda:0', grad_fn=<AddBackward0>)



 10%|█         | 37/354 [03:20<28:42,  5.43s/it][A

	loss_cls: tensor(0.4491, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0784, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5274, device='cuda:0', grad_fn=<AddBackward0>)



 11%|█         | 38/354 [03:25<28:32,  5.42s/it][A

	loss_cls: tensor(0.8003, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3341, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1344, device='cuda:0', grad_fn=<AddBackward0>)



 11%|█         | 39/354 [03:31<28:32,  5.44s/it][A

	loss_cls: tensor(0.4876, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2378, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7254, device='cuda:0', grad_fn=<AddBackward0>)



 11%|█▏        | 40/354 [03:36<28:23,  5.43s/it][A

	loss_cls: tensor(0.5896, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3387, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9283, device='cuda:0', grad_fn=<AddBackward0>)



 12%|█▏        | 41/354 [03:42<28:13,  5.41s/it][A

	loss_cls: tensor(0.7371, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2432, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9803, device='cuda:0', grad_fn=<AddBackward0>)



 12%|█▏        | 42/354 [03:47<28:11,  5.42s/it][A

	loss_cls: tensor(0.5587, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1721, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7308, device='cuda:0', grad_fn=<AddBackward0>)



 12%|█▏        | 43/354 [03:53<28:03,  5.41s/it][A

	loss_cls: tensor(0.5244, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1960, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7205, device='cuda:0', grad_fn=<AddBackward0>)



 12%|█▏        | 44/354 [03:58<28:00,  5.42s/it][A

	loss_cls: tensor(0.4808, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3006, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7814, device='cuda:0', grad_fn=<AddBackward0>)



 13%|█▎        | 45/354 [04:03<27:50,  5.41s/it][A

	loss_cls: tensor(0.5212, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4320, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9532, device='cuda:0', grad_fn=<AddBackward0>)



 13%|█▎        | 46/354 [04:09<27:50,  5.42s/it][A

	loss_cls: tensor(0.4808, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2828, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7636, device='cuda:0', grad_fn=<AddBackward0>)



 13%|█▎        | 47/354 [04:14<27:42,  5.41s/it][A

	loss_cls: tensor(0.8193, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1611, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9804, device='cuda:0', grad_fn=<AddBackward0>)



 14%|█▎        | 48/354 [04:20<27:35,  5.41s/it][A

	loss_cls: tensor(0.6742, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2913, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9655, device='cuda:0', grad_fn=<AddBackward0>)



 14%|█▍        | 49/354 [04:25<27:32,  5.42s/it][A

	loss_cls: tensor(0.6676, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1641, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8317, device='cuda:0', grad_fn=<AddBackward0>)



 14%|█▍        | 50/354 [04:30<27:22,  5.40s/it][A

	loss_cls: tensor(0.8499, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1202, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9702, device='cuda:0', grad_fn=<AddBackward0>)



 14%|█▍        | 51/354 [04:36<27:22,  5.42s/it][A

	loss_cls: tensor(0.5510, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2648, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8158, device='cuda:0', grad_fn=<AddBackward0>)



 15%|█▍        | 52/354 [04:41<27:16,  5.42s/it][A

	loss_cls: tensor(0.6085, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1473, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7558, device='cuda:0', grad_fn=<AddBackward0>)



 15%|█▍        | 53/354 [04:47<27:17,  5.44s/it][A

	loss_cls: tensor(0.5558, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2455, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8013, device='cuda:0', grad_fn=<AddBackward0>)



 15%|█▌        | 54/354 [04:52<27:07,  5.42s/it][A

	loss_cls: tensor(0.6092, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2107, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8199, device='cuda:0', grad_fn=<AddBackward0>)



 16%|█▌        | 55/354 [04:58<26:57,  5.41s/it][A

	loss_cls: tensor(1.1324, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1008, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2332, device='cuda:0', grad_fn=<AddBackward0>)



 16%|█▌        | 56/354 [05:03<26:57,  5.43s/it][A

	loss_cls: tensor(0.5924, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2100, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8023, device='cuda:0', grad_fn=<AddBackward0>)



 16%|█▌        | 57/354 [05:08<26:45,  5.41s/it][A

	loss_cls: tensor(0.5854, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1494, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7347, device='cuda:0', grad_fn=<AddBackward0>)



 16%|█▋        | 58/354 [05:14<26:46,  5.43s/it][A

	loss_cls: tensor(0.4417, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5346, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9763, device='cuda:0', grad_fn=<AddBackward0>)



 17%|█▋        | 59/354 [05:19<26:40,  5.42s/it][A

	loss_cls: tensor(0.5042, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1989, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7031, device='cuda:0', grad_fn=<AddBackward0>)



 17%|█▋        | 60/354 [05:25<26:39,  5.44s/it][A

	loss_cls: tensor(0.5889, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1831, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7720, device='cuda:0', grad_fn=<AddBackward0>)



 17%|█▋        | 61/354 [05:30<26:28,  5.42s/it][A

	loss_cls: tensor(0.4526, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1412, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5937, device='cuda:0', grad_fn=<AddBackward0>)



 18%|█▊        | 62/354 [05:36<26:21,  5.42s/it][A

	loss_cls: tensor(0.3481, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1258, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4738, device='cuda:0', grad_fn=<AddBackward0>)



 18%|█▊        | 63/354 [05:41<26:19,  5.43s/it][A

	loss_cls: tensor(1.0991, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2613, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.3604, device='cuda:0', grad_fn=<AddBackward0>)



 18%|█▊        | 64/354 [05:46<26:09,  5.41s/it][A

	loss_cls: tensor(0.7616, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2216, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9832, device='cuda:0', grad_fn=<AddBackward0>)



 18%|█▊        | 65/354 [05:52<26:08,  5.43s/it][A

	loss_cls: tensor(1.0746, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3176, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.3922, device='cuda:0', grad_fn=<AddBackward0>)



 19%|█▊        | 66/354 [05:57<26:01,  5.42s/it][A

	loss_cls: tensor(0.5699, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1899, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7598, device='cuda:0', grad_fn=<AddBackward0>)



 19%|█▉        | 67/354 [06:03<26:00,  5.44s/it][A

	loss_cls: tensor(0.4413, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0347, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4761, device='cuda:0', grad_fn=<AddBackward0>)



 19%|█▉        | 68/354 [06:08<25:49,  5.42s/it][A

	loss_cls: tensor(0.5096, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1452, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6548, device='cuda:0', grad_fn=<AddBackward0>)



 19%|█▉        | 69/354 [06:13<25:41,  5.41s/it][A

	loss_cls: tensor(0.5735, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2644, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8379, device='cuda:0', grad_fn=<AddBackward0>)



 20%|█▉        | 70/354 [06:19<25:42,  5.43s/it][A

	loss_cls: tensor(0.5509, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1230, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6739, device='cuda:0', grad_fn=<AddBackward0>)



 20%|██        | 71/354 [06:24<25:34,  5.42s/it][A

	loss_cls: tensor(0.4778, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4800, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9578, device='cuda:0', grad_fn=<AddBackward0>)



 20%|██        | 72/354 [06:30<25:34,  5.44s/it][A

	loss_cls: tensor(0.5029, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0491, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5520, device='cuda:0', grad_fn=<AddBackward0>)



 21%|██        | 73/354 [06:35<25:28,  5.44s/it][A

	loss_cls: tensor(0.4824, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0660, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5484, device='cuda:0', grad_fn=<AddBackward0>)



 21%|██        | 74/354 [06:41<25:26,  5.45s/it][A

	loss_cls: tensor(0.4028, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0385, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4412, device='cuda:0', grad_fn=<AddBackward0>)



 21%|██        | 75/354 [06:46<25:17,  5.44s/it][A

	loss_cls: tensor(0.5274, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2181, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7455, device='cuda:0', grad_fn=<AddBackward0>)



 21%|██▏       | 76/354 [06:52<25:08,  5.43s/it][A

	loss_cls: tensor(0.3966, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2536, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6503, device='cuda:0', grad_fn=<AddBackward0>)



 22%|██▏       | 77/354 [06:57<25:07,  5.44s/it][A

	loss_cls: tensor(0.4476, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2052, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6528, device='cuda:0', grad_fn=<AddBackward0>)



 22%|██▏       | 78/354 [07:02<24:59,  5.43s/it][A

	loss_cls: tensor(0.8372, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1279, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9650, device='cuda:0', grad_fn=<AddBackward0>)



 22%|██▏       | 79/354 [07:08<24:59,  5.45s/it][A

	loss_cls: tensor(0.5633, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1691, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7324, device='cuda:0', grad_fn=<AddBackward0>)



 23%|██▎       | 80/354 [07:13<24:50,  5.44s/it][A

	loss_cls: tensor(0.7545, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2684, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0229, device='cuda:0', grad_fn=<AddBackward0>)



 23%|██▎       | 81/354 [07:19<24:48,  5.45s/it][A

	loss_cls: tensor(0.7344, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2270, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9614, device='cuda:0', grad_fn=<AddBackward0>)



 23%|██▎       | 82/354 [07:24<24:40,  5.44s/it][A

	loss_cls: tensor(0.5276, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3480, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8756, device='cuda:0', grad_fn=<AddBackward0>)



 23%|██▎       | 83/354 [07:30<24:31,  5.43s/it][A

	loss_cls: tensor(0.7361, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2727, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0088, device='cuda:0', grad_fn=<AddBackward0>)



 24%|██▎       | 84/354 [07:35<24:28,  5.44s/it][A

	loss_cls: tensor(0.6731, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2233, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8964, device='cuda:0', grad_fn=<AddBackward0>)



 24%|██▍       | 85/354 [07:41<24:20,  5.43s/it][A

	loss_cls: tensor(0.6727, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3255, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9982, device='cuda:0', grad_fn=<AddBackward0>)



 24%|██▍       | 86/354 [07:46<24:22,  5.46s/it][A

	loss_cls: tensor(0.4410, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2904, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7314, device='cuda:0', grad_fn=<AddBackward0>)



 25%|██▍       | 87/354 [07:51<24:12,  5.44s/it][A

	loss_cls: tensor(0.6577, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1427, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8003, device='cuda:0', grad_fn=<AddBackward0>)



 25%|██▍       | 88/354 [07:57<24:13,  5.47s/it][A

	loss_cls: tensor(0.3933, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0529, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4463, device='cuda:0', grad_fn=<AddBackward0>)



 25%|██▌       | 89/354 [08:02<24:03,  5.45s/it][A

	loss_cls: tensor(0.5911, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1471, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7382, device='cuda:0', grad_fn=<AddBackward0>)



 25%|██▌       | 90/354 [08:08<23:51,  5.42s/it][A

	loss_cls: tensor(0.4247, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1390, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5638, device='cuda:0', grad_fn=<AddBackward0>)



 26%|██▌       | 91/354 [08:13<23:49,  5.44s/it][A

	loss_cls: tensor(0.6306, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0507, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6812, device='cuda:0', grad_fn=<AddBackward0>)



 26%|██▌       | 92/354 [08:19<23:43,  5.43s/it][A

	loss_cls: tensor(0.4883, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4496, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9380, device='cuda:0', grad_fn=<AddBackward0>)



 26%|██▋       | 93/354 [08:24<23:43,  5.45s/it][A

	loss_cls: tensor(0.6366, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1113, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7479, device='cuda:0', grad_fn=<AddBackward0>)



 27%|██▋       | 94/354 [08:30<23:32,  5.43s/it][A

	loss_cls: tensor(0.3988, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1278, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5266, device='cuda:0', grad_fn=<AddBackward0>)



 27%|██▋       | 95/354 [08:35<23:29,  5.44s/it][A

	loss_cls: tensor(0.5340, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2971, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8311, device='cuda:0', grad_fn=<AddBackward0>)



 27%|██▋       | 96/354 [08:40<23:17,  5.42s/it][A

	loss_cls: tensor(0.4445, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1314, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5759, device='cuda:0', grad_fn=<AddBackward0>)



 27%|██▋       | 97/354 [08:46<23:07,  5.40s/it][A

	loss_cls: tensor(0.3927, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0306, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4233, device='cuda:0', grad_fn=<AddBackward0>)



 28%|██▊       | 98/354 [08:51<23:07,  5.42s/it][A

	loss_cls: tensor(0.8310, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4686, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2996, device='cuda:0', grad_fn=<AddBackward0>)



 28%|██▊       | 99/354 [08:57<23:02,  5.42s/it][A

	loss_cls: tensor(0.4275, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2508, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6783, device='cuda:0', grad_fn=<AddBackward0>)



 28%|██▊       | 100/354 [09:02<23:00,  5.44s/it][A

	loss_cls: tensor(0.3930, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0320, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4250, device='cuda:0', grad_fn=<AddBackward0>)



 29%|██▊       | 101/354 [09:07<22:48,  5.41s/it][A

	loss_cls: tensor(0.2184, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1024, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.3207, device='cuda:0', grad_fn=<AddBackward0>)



 29%|██▉       | 102/354 [09:13<22:47,  5.43s/it][A

	loss_cls: tensor(0.7327, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1306, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8633, device='cuda:0', grad_fn=<AddBackward0>)



 29%|██▉       | 103/354 [09:18<22:37,  5.41s/it][A

	loss_cls: tensor(1.1314, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3194, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.4508, device='cuda:0', grad_fn=<AddBackward0>)



 29%|██▉       | 104/354 [09:24<22:29,  5.40s/it][A

	loss_cls: tensor(0.9099, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5002, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.4101, device='cuda:0', grad_fn=<AddBackward0>)



 30%|██▉       | 105/354 [09:29<22:37,  5.45s/it][A

	loss_cls: tensor(0.6542, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3969, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0510, device='cuda:0', grad_fn=<AddBackward0>)



 30%|██▉       | 106/354 [09:35<22:34,  5.46s/it][A

	loss_cls: tensor(0.5973, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4123, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0095, device='cuda:0', grad_fn=<AddBackward0>)



 30%|███       | 107/354 [09:40<22:36,  5.49s/it][A

	loss_cls: tensor(0.6140, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2180, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8319, device='cuda:0', grad_fn=<AddBackward0>)



 31%|███       | 108/354 [09:46<22:30,  5.49s/it][A

	loss_cls: tensor(0.4028, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0961, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4989, device='cuda:0', grad_fn=<AddBackward0>)



 31%|███       | 109/354 [09:51<22:28,  5.50s/it][A

	loss_cls: tensor(0.5583, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3134, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8717, device='cuda:0', grad_fn=<AddBackward0>)



 31%|███       | 110/354 [09:57<22:16,  5.48s/it][A

	loss_cls: tensor(0.6127, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0765, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6892, device='cuda:0', grad_fn=<AddBackward0>)



 31%|███▏      | 111/354 [10:02<22:14,  5.49s/it][A

	loss_cls: tensor(0.7989, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2938, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0928, device='cuda:0', grad_fn=<AddBackward0>)



 32%|███▏      | 112/354 [10:08<22:08,  5.49s/it][A

	loss_cls: tensor(0.6726, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2841, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9568, device='cuda:0', grad_fn=<AddBackward0>)



 32%|███▏      | 113/354 [10:13<21:59,  5.47s/it][A

	loss_cls: tensor(0.6166, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3152, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9318, device='cuda:0', grad_fn=<AddBackward0>)



 32%|███▏      | 114/354 [10:19<21:59,  5.50s/it][A

	loss_cls: tensor(0.4840, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0828, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5668, device='cuda:0', grad_fn=<AddBackward0>)



 32%|███▏      | 115/354 [10:24<21:49,  5.48s/it][A

	loss_cls: tensor(0.6180, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3109, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9289, device='cuda:0', grad_fn=<AddBackward0>)



 33%|███▎      | 116/354 [10:30<21:47,  5.49s/it][A

	loss_cls: tensor(0.3869, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1281, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5150, device='cuda:0', grad_fn=<AddBackward0>)



 33%|███▎      | 117/354 [10:35<21:37,  5.48s/it][A

	loss_cls: tensor(0.3444, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5457, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8902, device='cuda:0', grad_fn=<AddBackward0>)



 33%|███▎      | 118/354 [10:41<21:36,  5.49s/it][A

	loss_cls: tensor(0.5326, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1811, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7137, device='cuda:0', grad_fn=<AddBackward0>)



 34%|███▎      | 119/354 [10:46<21:28,  5.48s/it][A

	loss_cls: tensor(0.6390, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1347, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7737, device='cuda:0', grad_fn=<AddBackward0>)



 34%|███▍      | 120/354 [10:52<21:22,  5.48s/it][A

	loss_cls: tensor(0.4909, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3601, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8510, device='cuda:0', grad_fn=<AddBackward0>)



 34%|███▍      | 121/354 [10:57<21:20,  5.49s/it][A

	loss_cls: tensor(0.7162, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2814, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9977, device='cuda:0', grad_fn=<AddBackward0>)



 34%|███▍      | 122/354 [11:03<21:13,  5.49s/it][A

	loss_cls: tensor(0.4756, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3947, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8702, device='cuda:0', grad_fn=<AddBackward0>)



 35%|███▍      | 123/354 [11:08<21:10,  5.50s/it][A

	loss_cls: tensor(0.5019, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2022, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7041, device='cuda:0', grad_fn=<AddBackward0>)



 35%|███▌      | 124/354 [11:13<20:59,  5.48s/it][A

	loss_cls: tensor(0.5489, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1200, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6689, device='cuda:0', grad_fn=<AddBackward0>)



 35%|███▌      | 125/354 [11:19<20:59,  5.50s/it][A

	loss_cls: tensor(1.0139, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3214, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.3353, device='cuda:0', grad_fn=<AddBackward0>)



 36%|███▌      | 126/354 [11:24<20:51,  5.49s/it][A

	loss_cls: tensor(0.5160, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2894, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8055, device='cuda:0', grad_fn=<AddBackward0>)



 36%|███▌      | 127/354 [11:30<20:43,  5.48s/it][A

	loss_cls: tensor(0.3479, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3748, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7226, device='cuda:0', grad_fn=<AddBackward0>)



 36%|███▌      | 128/354 [11:35<20:41,  5.49s/it][A

	loss_cls: tensor(0.6922, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4580, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1502, device='cuda:0', grad_fn=<AddBackward0>)



 36%|███▋      | 129/354 [11:41<20:33,  5.48s/it][A

	loss_cls: tensor(0.4376, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2402, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6778, device='cuda:0', grad_fn=<AddBackward0>)



 37%|███▋      | 130/354 [11:46<20:29,  5.49s/it][A

	loss_cls: tensor(0.8545, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3652, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2197, device='cuda:0', grad_fn=<AddBackward0>)



 37%|███▋      | 131/354 [11:52<20:20,  5.47s/it][A

	loss_cls: tensor(0.3486, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1843, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5329, device='cuda:0', grad_fn=<AddBackward0>)



 37%|███▋      | 132/354 [11:57<20:16,  5.48s/it][A

	loss_cls: tensor(0.4335, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4071, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8406, device='cuda:0', grad_fn=<AddBackward0>)



 38%|███▊      | 133/354 [12:03<20:10,  5.48s/it][A

	loss_cls: tensor(0.4795, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2362, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7157, device='cuda:0', grad_fn=<AddBackward0>)



 38%|███▊      | 134/354 [12:08<20:03,  5.47s/it][A

	loss_cls: tensor(0.8614, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1722, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0336, device='cuda:0', grad_fn=<AddBackward0>)



 38%|███▊      | 135/354 [12:14<20:01,  5.48s/it][A

	loss_cls: tensor(0.3111, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0870, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.3981, device='cuda:0', grad_fn=<AddBackward0>)



 38%|███▊      | 136/354 [12:19<19:49,  5.46s/it][A

	loss_cls: tensor(0.6552, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2405, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8957, device='cuda:0', grad_fn=<AddBackward0>)



 39%|███▊      | 137/354 [12:25<19:43,  5.45s/it][A

	loss_cls: tensor(0.4934, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1212, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6146, device='cuda:0', grad_fn=<AddBackward0>)



 39%|███▉      | 138/354 [12:30<19:33,  5.43s/it][A

	loss_cls: tensor(0.5280, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0400, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5680, device='cuda:0', grad_fn=<AddBackward0>)



 39%|███▉      | 139/354 [12:36<19:33,  5.46s/it][A

	loss_cls: tensor(0.5080, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1577, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6657, device='cuda:0', grad_fn=<AddBackward0>)



 40%|███▉      | 140/354 [12:41<19:24,  5.44s/it][A

	loss_cls: tensor(0.7510, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2369, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9879, device='cuda:0', grad_fn=<AddBackward0>)



 40%|███▉      | 141/354 [12:46<19:15,  5.42s/it][A

	loss_cls: tensor(0.4117, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1644, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5761, device='cuda:0', grad_fn=<AddBackward0>)



 40%|████      | 142/354 [12:52<19:13,  5.44s/it][A

	loss_cls: tensor(0.5077, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1811, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6888, device='cuda:0', grad_fn=<AddBackward0>)



 40%|████      | 143/354 [12:57<19:03,  5.42s/it][A

	loss_cls: tensor(0.3241, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2947, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6189, device='cuda:0', grad_fn=<AddBackward0>)



 41%|████      | 144/354 [13:03<18:58,  5.42s/it][A

	loss_cls: tensor(0.3496, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1437, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4932, device='cuda:0', grad_fn=<AddBackward0>)



 41%|████      | 145/354 [13:08<18:50,  5.41s/it][A

	loss_cls: tensor(0.6970, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3318, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0288, device='cuda:0', grad_fn=<AddBackward0>)



 41%|████      | 146/354 [13:13<18:48,  5.42s/it][A

	loss_cls: tensor(0.6239, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3226, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9465, device='cuda:0', grad_fn=<AddBackward0>)



 42%|████▏     | 147/354 [13:19<18:39,  5.41s/it][A

	loss_cls: tensor(1.2250, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3354, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.5604, device='cuda:0', grad_fn=<AddBackward0>)



 42%|████▏     | 148/354 [13:24<18:33,  5.40s/it][A

	loss_cls: tensor(0.8440, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3236, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1676, device='cuda:0', grad_fn=<AddBackward0>)



 42%|████▏     | 149/354 [13:30<18:31,  5.42s/it][A

	loss_cls: tensor(0.3817, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0312, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4130, device='cuda:0', grad_fn=<AddBackward0>)



 42%|████▏     | 150/354 [13:35<18:24,  5.41s/it][A

	loss_cls: tensor(0.8641, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1436, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0077, device='cuda:0', grad_fn=<AddBackward0>)



 43%|████▎     | 151/354 [13:41<18:22,  5.43s/it][A

	loss_cls: tensor(0.4781, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2179, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6959, device='cuda:0', grad_fn=<AddBackward0>)



 43%|████▎     | 152/354 [13:46<18:13,  5.41s/it][A

	loss_cls: tensor(0.9690, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2073, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1763, device='cuda:0', grad_fn=<AddBackward0>)



 43%|████▎     | 153/354 [13:51<18:10,  5.43s/it][A

	loss_cls: tensor(0.5892, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1488, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7380, device='cuda:0', grad_fn=<AddBackward0>)



 44%|████▎     | 154/354 [13:57<18:01,  5.41s/it][A

	loss_cls: tensor(0.3961, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3416, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7377, device='cuda:0', grad_fn=<AddBackward0>)



 44%|████▍     | 155/354 [14:02<17:53,  5.39s/it][A

	loss_cls: tensor(0.5676, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0585, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6261, device='cuda:0', grad_fn=<AddBackward0>)



 44%|████▍     | 156/354 [14:08<17:51,  5.41s/it][A

	loss_cls: tensor(0.5290, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2818, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8109, device='cuda:0', grad_fn=<AddBackward0>)



 44%|████▍     | 157/354 [14:13<17:44,  5.40s/it][A

	loss_cls: tensor(0.5150, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1819, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6969, device='cuda:0', grad_fn=<AddBackward0>)



 45%|████▍     | 158/354 [14:18<17:42,  5.42s/it][A

	loss_cls: tensor(0.8758, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5096, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.3854, device='cuda:0', grad_fn=<AddBackward0>)



 45%|████▍     | 159/354 [14:24<17:34,  5.41s/it][A

	loss_cls: tensor(0.6503, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2762, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9264, device='cuda:0', grad_fn=<AddBackward0>)



 45%|████▌     | 160/354 [14:29<17:31,  5.42s/it][A

	loss_cls: tensor(1.1708, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2383, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.4091, device='cuda:0', grad_fn=<AddBackward0>)



 45%|████▌     | 161/354 [14:35<17:23,  5.41s/it][A

	loss_cls: tensor(0.5062, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0526, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5587, device='cuda:0', grad_fn=<AddBackward0>)



 46%|████▌     | 162/354 [14:40<17:17,  5.40s/it][A

	loss_cls: tensor(0.5322, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0558, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5880, device='cuda:0', grad_fn=<AddBackward0>)



 46%|████▌     | 163/354 [14:45<17:16,  5.43s/it][A

	loss_cls: tensor(0.4835, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2768, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7603, device='cuda:0', grad_fn=<AddBackward0>)



 46%|████▋     | 164/354 [14:51<17:07,  5.41s/it][A

	loss_cls: tensor(0.5397, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2335, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7732, device='cuda:0', grad_fn=<AddBackward0>)



 47%|████▋     | 165/354 [14:56<17:06,  5.43s/it][A

	loss_cls: tensor(0.5665, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3284, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8949, device='cuda:0', grad_fn=<AddBackward0>)



 47%|████▋     | 166/354 [15:02<16:58,  5.42s/it][A

	loss_cls: tensor(0.5882, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2458, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8340, device='cuda:0', grad_fn=<AddBackward0>)



 47%|████▋     | 167/354 [15:07<16:55,  5.43s/it][A

	loss_cls: tensor(0.6333, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1120, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7453, device='cuda:0', grad_fn=<AddBackward0>)



 47%|████▋     | 168/354 [15:13<16:49,  5.43s/it][A

	loss_cls: tensor(0.6572, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4132, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0704, device='cuda:0', grad_fn=<AddBackward0>)



 48%|████▊     | 169/354 [15:18<16:41,  5.41s/it][A

	loss_cls: tensor(0.5851, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1262, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7114, device='cuda:0', grad_fn=<AddBackward0>)



 48%|████▊     | 170/354 [15:23<16:38,  5.43s/it][A

	loss_cls: tensor(0.7732, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.6584, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.4317, device='cuda:0', grad_fn=<AddBackward0>)



 48%|████▊     | 171/354 [15:29<16:30,  5.41s/it][A

	loss_cls: tensor(0.5863, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1430, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7293, device='cuda:0', grad_fn=<AddBackward0>)



 49%|████▊     | 172/354 [15:34<16:30,  5.44s/it][A

	loss_cls: tensor(0.5101, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2142, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7243, device='cuda:0', grad_fn=<AddBackward0>)



 49%|████▉     | 173/354 [15:40<16:21,  5.42s/it][A

	loss_cls: tensor(0.4840, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2373, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7214, device='cuda:0', grad_fn=<AddBackward0>)



 49%|████▉     | 174/354 [15:45<16:17,  5.43s/it][A

	loss_cls: tensor(0.3766, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2776, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6542, device='cuda:0', grad_fn=<AddBackward0>)



 49%|████▉     | 175/354 [15:51<16:09,  5.42s/it][A

	loss_cls: tensor(0.5916, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2890, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8806, device='cuda:0', grad_fn=<AddBackward0>)



 50%|████▉     | 176/354 [15:56<16:02,  5.41s/it][A

	loss_cls: tensor(0.3358, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0628, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.3987, device='cuda:0', grad_fn=<AddBackward0>)



 50%|█████     | 177/354 [16:01<15:59,  5.42s/it][A

	loss_cls: tensor(0.9428, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2909, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2336, device='cuda:0', grad_fn=<AddBackward0>)



 50%|█████     | 178/354 [16:07<15:52,  5.41s/it][A

	loss_cls: tensor(0.5151, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1592, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6743, device='cuda:0', grad_fn=<AddBackward0>)



 51%|█████     | 179/354 [16:12<15:51,  5.44s/it][A

	loss_cls: tensor(0.6550, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1423, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7973, device='cuda:0', grad_fn=<AddBackward0>)



 51%|█████     | 180/354 [16:18<15:45,  5.43s/it][A

	loss_cls: tensor(0.7816, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1510, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9326, device='cuda:0', grad_fn=<AddBackward0>)



 51%|█████     | 181/354 [16:23<15:42,  5.45s/it][A

	loss_cls: tensor(0.6759, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3982, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0741, device='cuda:0', grad_fn=<AddBackward0>)



 51%|█████▏    | 182/354 [16:29<15:34,  5.44s/it][A

	loss_cls: tensor(0.7893, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1480, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9373, device='cuda:0', grad_fn=<AddBackward0>)



 52%|█████▏    | 183/354 [16:34<15:25,  5.41s/it][A

	loss_cls: tensor(0.5454, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1572, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7026, device='cuda:0', grad_fn=<AddBackward0>)



 52%|█████▏    | 184/354 [16:39<15:21,  5.42s/it][A

	loss_cls: tensor(0.7605, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4463, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2067, device='cuda:0', grad_fn=<AddBackward0>)



 52%|█████▏    | 185/354 [16:45<15:14,  5.41s/it][A

	loss_cls: tensor(0.4573, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2907, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7480, device='cuda:0', grad_fn=<AddBackward0>)



 53%|█████▎    | 186/354 [16:50<15:13,  5.44s/it][A

	loss_cls: tensor(0.4353, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0949, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5302, device='cuda:0', grad_fn=<AddBackward0>)



 53%|█████▎    | 187/354 [16:56<15:05,  5.42s/it][A

	loss_cls: tensor(0.6985, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2370, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9354, device='cuda:0', grad_fn=<AddBackward0>)



 53%|█████▎    | 188/354 [17:01<15:02,  5.44s/it][A

	loss_cls: tensor(0.4583, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3790, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8373, device='cuda:0', grad_fn=<AddBackward0>)



 53%|█████▎    | 189/354 [17:06<14:54,  5.42s/it][A

	loss_cls: tensor(0.8624, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3970, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2593, device='cuda:0', grad_fn=<AddBackward0>)



 54%|█████▎    | 190/354 [17:12<14:45,  5.40s/it][A

	loss_cls: tensor(0.4745, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0971, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5716, device='cuda:0', grad_fn=<AddBackward0>)



 54%|█████▍    | 191/354 [17:17<14:42,  5.41s/it][A

	loss_cls: tensor(0.4976, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1322, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6298, device='cuda:0', grad_fn=<AddBackward0>)



 54%|█████▍    | 192/354 [17:23<14:35,  5.40s/it][A

	loss_cls: tensor(0.7641, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2305, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9946, device='cuda:0', grad_fn=<AddBackward0>)



 55%|█████▍    | 193/354 [17:28<14:32,  5.42s/it][A

	loss_cls: tensor(0.4298, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2082, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6380, device='cuda:0', grad_fn=<AddBackward0>)



 55%|█████▍    | 194/354 [17:33<14:24,  5.40s/it][A

	loss_cls: tensor(0.5907, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0570, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6477, device='cuda:0', grad_fn=<AddBackward0>)



 55%|█████▌    | 195/354 [17:39<14:21,  5.42s/it][A

	loss_cls: tensor(0.4553, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1180, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5733, device='cuda:0', grad_fn=<AddBackward0>)



 55%|█████▌    | 196/354 [17:44<14:12,  5.39s/it][A

	loss_cls: tensor(0.6235, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4831, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1066, device='cuda:0', grad_fn=<AddBackward0>)



 56%|█████▌    | 197/354 [17:50<14:05,  5.38s/it][A

	loss_cls: tensor(0.6622, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2888, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9510, device='cuda:0', grad_fn=<AddBackward0>)



 56%|█████▌    | 198/354 [17:55<14:03,  5.41s/it][A

	loss_cls: tensor(0.7458, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2143, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9601, device='cuda:0', grad_fn=<AddBackward0>)



 56%|█████▌    | 199/354 [18:00<13:57,  5.40s/it][A

	loss_cls: tensor(0.4788, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2023, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6811, device='cuda:0', grad_fn=<AddBackward0>)



 56%|█████▋    | 200/354 [18:06<13:53,  5.41s/it][A

	loss_cls: tensor(0.3349, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1892, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5241, device='cuda:0', grad_fn=<AddBackward0>)



 57%|█████▋    | 201/354 [18:11<13:47,  5.41s/it][A

	loss_cls: tensor(0.6233, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1364, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7597, device='cuda:0', grad_fn=<AddBackward0>)



 57%|█████▋    | 202/354 [18:17<13:43,  5.42s/it][A

	loss_cls: tensor(0.6427, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1428, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7855, device='cuda:0', grad_fn=<AddBackward0>)



 57%|█████▋    | 203/354 [18:22<13:35,  5.40s/it][A

	loss_cls: tensor(1.0623, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2940, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.3563, device='cuda:0', grad_fn=<AddBackward0>)



 58%|█████▊    | 204/354 [18:27<13:28,  5.39s/it][A

	loss_cls: tensor(0.5612, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3000, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8612, device='cuda:0', grad_fn=<AddBackward0>)



 58%|█████▊    | 205/354 [18:33<13:27,  5.42s/it][A

	loss_cls: tensor(0.4210, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0717, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4926, device='cuda:0', grad_fn=<AddBackward0>)



 58%|█████▊    | 206/354 [18:38<13:20,  5.41s/it][A

	loss_cls: tensor(0.7759, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2190, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9949, device='cuda:0', grad_fn=<AddBackward0>)



 58%|█████▊    | 207/354 [18:44<13:16,  5.42s/it][A

	loss_cls: tensor(0.5916, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1637, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7553, device='cuda:0', grad_fn=<AddBackward0>)



 59%|█████▉    | 208/354 [18:49<13:09,  5.41s/it][A

	loss_cls: tensor(0.6244, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2345, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8589, device='cuda:0', grad_fn=<AddBackward0>)



 59%|█████▉    | 209/354 [18:55<13:06,  5.43s/it][A

	loss_cls: tensor(0.4225, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0399, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4624, device='cuda:0', grad_fn=<AddBackward0>)



 59%|█████▉    | 210/354 [19:00<12:59,  5.41s/it][A

	loss_cls: tensor(0.5177, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1678, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6855, device='cuda:0', grad_fn=<AddBackward0>)



 60%|█████▉    | 211/354 [19:05<12:55,  5.42s/it][A

	loss_cls: tensor(0.5043, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1571, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6614, device='cuda:0', grad_fn=<AddBackward0>)



 60%|█████▉    | 212/354 [19:11<12:48,  5.41s/it][A

	loss_cls: tensor(0.6704, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5959, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2663, device='cuda:0', grad_fn=<AddBackward0>)



 60%|██████    | 213/354 [19:16<12:41,  5.40s/it][A

	loss_cls: tensor(0.5142, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1208, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6350, device='cuda:0', grad_fn=<AddBackward0>)



 60%|██████    | 214/354 [19:22<12:39,  5.42s/it][A

	loss_cls: tensor(0.9180, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4175, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.3356, device='cuda:0', grad_fn=<AddBackward0>)



 61%|██████    | 215/354 [19:27<12:31,  5.41s/it][A

	loss_cls: tensor(0.5154, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0996, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6151, device='cuda:0', grad_fn=<AddBackward0>)



 61%|██████    | 216/354 [19:33<12:28,  5.42s/it][A

	loss_cls: tensor(0.4124, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0522, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4646, device='cuda:0', grad_fn=<AddBackward0>)



 61%|██████▏   | 217/354 [19:38<12:20,  5.41s/it][A

	loss_cls: tensor(0.6999, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0547, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7546, device='cuda:0', grad_fn=<AddBackward0>)



 62%|██████▏   | 218/354 [19:43<12:17,  5.42s/it][A

	loss_cls: tensor(0.5498, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4026, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9524, device='cuda:0', grad_fn=<AddBackward0>)



 62%|██████▏   | 219/354 [19:49<12:11,  5.42s/it][A

	loss_cls: tensor(0.8327, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1244, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9572, device='cuda:0', grad_fn=<AddBackward0>)



 62%|██████▏   | 220/354 [19:54<12:03,  5.40s/it][A

	loss_cls: tensor(0.7292, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2088, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9381, device='cuda:0', grad_fn=<AddBackward0>)



 62%|██████▏   | 221/354 [20:00<12:01,  5.42s/it][A

	loss_cls: tensor(0.4392, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0858, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5251, device='cuda:0', grad_fn=<AddBackward0>)



 63%|██████▎   | 222/354 [20:05<11:53,  5.41s/it][A

	loss_cls: tensor(0.4823, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0490, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5313, device='cuda:0', grad_fn=<AddBackward0>)



 63%|██████▎   | 223/354 [20:10<11:49,  5.42s/it][A

	loss_cls: tensor(0.4975, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3042, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8017, device='cuda:0', grad_fn=<AddBackward0>)



 63%|██████▎   | 224/354 [20:16<11:41,  5.40s/it][A

	loss_cls: tensor(0.5721, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3365, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9086, device='cuda:0', grad_fn=<AddBackward0>)



 64%|██████▎   | 225/354 [20:21<11:38,  5.42s/it][A

	loss_cls: tensor(0.6729, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2123, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8851, device='cuda:0', grad_fn=<AddBackward0>)



 64%|██████▍   | 226/354 [20:27<11:30,  5.40s/it][A

	loss_cls: tensor(0.5676, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2877, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8553, device='cuda:0', grad_fn=<AddBackward0>)



 64%|██████▍   | 227/354 [20:32<11:26,  5.40s/it][A

	loss_cls: tensor(0.4239, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1965, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6204, device='cuda:0', grad_fn=<AddBackward0>)



 64%|██████▍   | 228/354 [20:37<11:22,  5.42s/it][A

	loss_cls: tensor(1.0218, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1982, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2201, device='cuda:0', grad_fn=<AddBackward0>)



 65%|██████▍   | 229/354 [20:43<11:16,  5.41s/it][A

	loss_cls: tensor(0.5667, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2162, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7829, device='cuda:0', grad_fn=<AddBackward0>)



 65%|██████▍   | 230/354 [20:48<11:11,  5.42s/it][A

	loss_cls: tensor(0.7831, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3459, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1290, device='cuda:0', grad_fn=<AddBackward0>)



 65%|██████▌   | 231/354 [20:54<11:04,  5.40s/it][A

	loss_cls: tensor(0.4325, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1815, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6140, device='cuda:0', grad_fn=<AddBackward0>)



 66%|██████▌   | 232/354 [20:59<11:01,  5.42s/it][A

	loss_cls: tensor(0.4156, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1569, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5725, device='cuda:0', grad_fn=<AddBackward0>)



 66%|██████▌   | 233/354 [21:05<10:55,  5.42s/it][A

	loss_cls: tensor(0.8775, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0976, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9751, device='cuda:0', grad_fn=<AddBackward0>)



 66%|██████▌   | 234/354 [21:10<10:48,  5.40s/it][A

	loss_cls: tensor(0.4071, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0355, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4426, device='cuda:0', grad_fn=<AddBackward0>)



 66%|██████▋   | 235/354 [21:15<10:44,  5.42s/it][A

	loss_cls: tensor(0.9396, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1172, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0568, device='cuda:0', grad_fn=<AddBackward0>)



 67%|██████▋   | 236/354 [21:21<10:37,  5.41s/it][A

	loss_cls: tensor(0.4964, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0813, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5777, device='cuda:0', grad_fn=<AddBackward0>)



 67%|██████▋   | 237/354 [21:26<10:33,  5.42s/it][A

	loss_cls: tensor(0.4696, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2613, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7309, device='cuda:0', grad_fn=<AddBackward0>)



 67%|██████▋   | 238/354 [21:32<10:27,  5.41s/it][A

	loss_cls: tensor(0.4246, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1840, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6086, device='cuda:0', grad_fn=<AddBackward0>)



 68%|██████▊   | 239/354 [21:37<10:23,  5.42s/it][A

	loss_cls: tensor(0.7225, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3205, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0429, device='cuda:0', grad_fn=<AddBackward0>)



 68%|██████▊   | 240/354 [21:42<10:16,  5.41s/it][A

	loss_cls: tensor(0.5715, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1044, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6760, device='cuda:0', grad_fn=<AddBackward0>)



 68%|██████▊   | 241/354 [21:48<10:09,  5.39s/it][A

	loss_cls: tensor(0.4066, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0721, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4787, device='cuda:0', grad_fn=<AddBackward0>)



 68%|██████▊   | 242/354 [21:53<10:06,  5.41s/it][A

	loss_cls: tensor(0.6853, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2970, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9823, device='cuda:0', grad_fn=<AddBackward0>)



 69%|██████▊   | 243/354 [21:59<09:59,  5.40s/it][A

	loss_cls: tensor(0.5758, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1060, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6818, device='cuda:0', grad_fn=<AddBackward0>)



 69%|██████▉   | 244/354 [22:04<09:55,  5.41s/it][A

	loss_cls: tensor(0.3927, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0274, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4201, device='cuda:0', grad_fn=<AddBackward0>)



 69%|██████▉   | 245/354 [22:09<09:48,  5.40s/it][A

	loss_cls: tensor(0.6774, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3741, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0514, device='cuda:0', grad_fn=<AddBackward0>)



 69%|██████▉   | 246/354 [22:15<09:47,  5.44s/it][A

	loss_cls: tensor(0.5406, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1848, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7253, device='cuda:0', grad_fn=<AddBackward0>)



 70%|██████▉   | 247/354 [22:20<09:40,  5.43s/it][A

	loss_cls: tensor(0.6434, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1488, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7922, device='cuda:0', grad_fn=<AddBackward0>)



 70%|███████   | 248/354 [22:26<09:34,  5.42s/it][A

	loss_cls: tensor(0.4609, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0298, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4906, device='cuda:0', grad_fn=<AddBackward0>)



 70%|███████   | 249/354 [22:31<09:30,  5.43s/it][A

	loss_cls: tensor(0.3585, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0409, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.3994, device='cuda:0', grad_fn=<AddBackward0>)



 71%|███████   | 250/354 [22:37<09:23,  5.42s/it][A

	loss_cls: tensor(0.4805, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1592, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6397, device='cuda:0', grad_fn=<AddBackward0>)



 71%|███████   | 251/354 [22:42<09:18,  5.43s/it][A

	loss_cls: tensor(0.5043, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0311, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5354, device='cuda:0', grad_fn=<AddBackward0>)



 71%|███████   | 252/354 [22:47<09:12,  5.41s/it][A

	loss_cls: tensor(0.8456, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5281, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.3738, device='cuda:0', grad_fn=<AddBackward0>)



 71%|███████▏  | 253/354 [22:53<09:08,  5.43s/it][A

	loss_cls: tensor(0.9721, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4526, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.4247, device='cuda:0', grad_fn=<AddBackward0>)



 72%|███████▏  | 254/354 [22:58<09:01,  5.42s/it][A

	loss_cls: tensor(0.5439, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1626, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7065, device='cuda:0', grad_fn=<AddBackward0>)



 72%|███████▏  | 255/354 [23:04<08:55,  5.41s/it][A

	loss_cls: tensor(0.6801, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1902, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8703, device='cuda:0', grad_fn=<AddBackward0>)



 72%|███████▏  | 256/354 [23:09<08:51,  5.42s/it][A

	loss_cls: tensor(0.4917, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1207, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6124, device='cuda:0', grad_fn=<AddBackward0>)



 73%|███████▎  | 257/354 [23:14<08:43,  5.40s/it][A

	loss_cls: tensor(0.6963, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1606, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8569, device='cuda:0', grad_fn=<AddBackward0>)



 73%|███████▎  | 258/354 [23:20<08:39,  5.41s/it][A

	loss_cls: tensor(0.7439, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2621, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0060, device='cuda:0', grad_fn=<AddBackward0>)



 73%|███████▎  | 259/354 [23:25<08:33,  5.40s/it][A

	loss_cls: tensor(0.5852, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3300, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9152, device='cuda:0', grad_fn=<AddBackward0>)



 73%|███████▎  | 260/354 [23:31<08:28,  5.41s/it][A

	loss_cls: tensor(0.7302, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2259, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9560, device='cuda:0', grad_fn=<AddBackward0>)



 74%|███████▎  | 261/354 [23:36<08:22,  5.40s/it][A

	loss_cls: tensor(0.5472, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3182, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8654, device='cuda:0', grad_fn=<AddBackward0>)



 74%|███████▍  | 262/354 [23:41<08:15,  5.39s/it][A

	loss_cls: tensor(0.5494, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1354, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6848, device='cuda:0', grad_fn=<AddBackward0>)



 74%|███████▍  | 263/354 [23:47<08:12,  5.41s/it][A

	loss_cls: tensor(0.6797, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2053, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8850, device='cuda:0', grad_fn=<AddBackward0>)



 75%|███████▍  | 264/354 [23:52<08:05,  5.40s/it][A

	loss_cls: tensor(0.6168, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1133, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7301, device='cuda:0', grad_fn=<AddBackward0>)



 75%|███████▍  | 265/354 [23:58<08:02,  5.43s/it][A

	loss_cls: tensor(0.5458, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5468, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0925, device='cuda:0', grad_fn=<AddBackward0>)



 75%|███████▌  | 266/354 [24:03<07:56,  5.41s/it][A

	loss_cls: tensor(0.6351, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0545, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6896, device='cuda:0', grad_fn=<AddBackward0>)



 75%|███████▌  | 267/354 [24:09<07:51,  5.42s/it][A

	loss_cls: tensor(0.6846, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1675, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8521, device='cuda:0', grad_fn=<AddBackward0>)



 76%|███████▌  | 268/354 [24:14<07:44,  5.41s/it][A

	loss_cls: tensor(0.3328, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4574, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7902, device='cuda:0', grad_fn=<AddBackward0>)



 76%|███████▌  | 269/354 [24:19<07:39,  5.40s/it][A

	loss_cls: tensor(0.5187, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1624, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6811, device='cuda:0', grad_fn=<AddBackward0>)



 76%|███████▋  | 270/354 [24:25<07:35,  5.42s/it][A

	loss_cls: tensor(0.7159, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2434, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9594, device='cuda:0', grad_fn=<AddBackward0>)



 77%|███████▋  | 271/354 [24:30<07:28,  5.41s/it][A

	loss_cls: tensor(0.5696, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2597, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8292, device='cuda:0', grad_fn=<AddBackward0>)



 77%|███████▋  | 272/354 [24:36<07:25,  5.43s/it][A

	loss_cls: tensor(0.6309, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0755, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7064, device='cuda:0', grad_fn=<AddBackward0>)



 77%|███████▋  | 273/354 [24:41<07:18,  5.41s/it][A

	loss_cls: tensor(0.5585, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1211, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6795, device='cuda:0', grad_fn=<AddBackward0>)



 77%|███████▋  | 274/354 [24:46<07:13,  5.42s/it][A

	loss_cls: tensor(0.5093, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0666, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5760, device='cuda:0', grad_fn=<AddBackward0>)



 78%|███████▊  | 275/354 [24:52<07:07,  5.41s/it][A

	loss_cls: tensor(0.5453, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3941, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9394, device='cuda:0', grad_fn=<AddBackward0>)



 78%|███████▊  | 276/354 [24:57<07:00,  5.40s/it][A

	loss_cls: tensor(0.7516, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0729, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8244, device='cuda:0', grad_fn=<AddBackward0>)



 78%|███████▊  | 277/354 [25:03<06:56,  5.41s/it][A

	loss_cls: tensor(0.4744, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1407, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6151, device='cuda:0', grad_fn=<AddBackward0>)



 79%|███████▊  | 278/354 [25:08<06:50,  5.40s/it][A

	loss_cls: tensor(0.4453, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1827, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6279, device='cuda:0', grad_fn=<AddBackward0>)



 79%|███████▉  | 279/354 [25:14<06:47,  5.43s/it][A

	loss_cls: tensor(0.5412, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2734, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8146, device='cuda:0', grad_fn=<AddBackward0>)



 79%|███████▉  | 280/354 [25:19<06:40,  5.42s/it][A

	loss_cls: tensor(0.6435, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2095, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8531, device='cuda:0', grad_fn=<AddBackward0>)



 79%|███████▉  | 281/354 [25:24<06:36,  5.43s/it][A

	loss_cls: tensor(0.4035, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3463, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7498, device='cuda:0', grad_fn=<AddBackward0>)



 80%|███████▉  | 282/354 [25:30<06:29,  5.41s/it][A

	loss_cls: tensor(0.6085, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1633, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7718, device='cuda:0', grad_fn=<AddBackward0>)



 80%|███████▉  | 283/354 [25:35<06:23,  5.40s/it][A

	loss_cls: tensor(0.3336, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2999, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6335, device='cuda:0', grad_fn=<AddBackward0>)



 80%|████████  | 284/354 [25:41<06:18,  5.41s/it][A

	loss_cls: tensor(0.3913, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1525, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5439, device='cuda:0', grad_fn=<AddBackward0>)



 81%|████████  | 285/354 [25:46<06:12,  5.40s/it][A

	loss_cls: tensor(0.5446, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.7603, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.3049, device='cuda:0', grad_fn=<AddBackward0>)



 81%|████████  | 286/354 [25:51<06:08,  5.42s/it][A

	loss_cls: tensor(0.5759, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3880, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9638, device='cuda:0', grad_fn=<AddBackward0>)



 81%|████████  | 287/354 [25:57<06:01,  5.40s/it][A

	loss_cls: tensor(0.5764, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1960, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7724, device='cuda:0', grad_fn=<AddBackward0>)



 81%|████████▏ | 288/354 [26:02<05:57,  5.42s/it][A

	loss_cls: tensor(0.3208, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3369, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6577, device='cuda:0', grad_fn=<AddBackward0>)



 82%|████████▏ | 289/354 [26:08<05:51,  5.40s/it][A

	loss_cls: tensor(0.5271, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3061, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8332, device='cuda:0', grad_fn=<AddBackward0>)



 82%|████████▏ | 290/354 [26:13<05:44,  5.38s/it][A

	loss_cls: tensor(0.3613, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0833, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4446, device='cuda:0', grad_fn=<AddBackward0>)



 82%|████████▏ | 291/354 [26:18<05:40,  5.40s/it][A

	loss_cls: tensor(0.6443, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3328, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9771, device='cuda:0', grad_fn=<AddBackward0>)



 82%|████████▏ | 292/354 [26:24<05:34,  5.39s/it][A

	loss_cls: tensor(0.5228, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2639, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7868, device='cuda:0', grad_fn=<AddBackward0>)



 83%|████████▎ | 293/354 [26:29<05:30,  5.42s/it][A

	loss_cls: tensor(0.7563, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2388, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9950, device='cuda:0', grad_fn=<AddBackward0>)



 83%|████████▎ | 294/354 [26:35<05:24,  5.40s/it][A

	loss_cls: tensor(0.4926, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2218, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7145, device='cuda:0', grad_fn=<AddBackward0>)



 83%|████████▎ | 295/354 [26:40<05:19,  5.41s/it][A

	loss_cls: tensor(0.5343, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2559, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7902, device='cuda:0', grad_fn=<AddBackward0>)



 84%|████████▎ | 296/354 [26:45<05:12,  5.39s/it][A

	loss_cls: tensor(1.1606, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1463, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.3069, device='cuda:0', grad_fn=<AddBackward0>)



 84%|████████▍ | 297/354 [26:51<05:06,  5.38s/it][A

	loss_cls: tensor(0.7154, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2983, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0137, device='cuda:0', grad_fn=<AddBackward0>)



 84%|████████▍ | 298/354 [26:56<05:02,  5.40s/it][A

	loss_cls: tensor(0.4504, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3465, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7969, device='cuda:0', grad_fn=<AddBackward0>)



 84%|████████▍ | 299/354 [27:02<04:56,  5.40s/it][A

	loss_cls: tensor(0.2811, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1525, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4336, device='cuda:0', grad_fn=<AddBackward0>)



 85%|████████▍ | 300/354 [27:07<04:52,  5.41s/it][A

	loss_cls: tensor(0.5494, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0721, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6215, device='cuda:0', grad_fn=<AddBackward0>)



 85%|████████▌ | 301/354 [27:12<04:46,  5.40s/it][A

	loss_cls: tensor(0.4998, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4303, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9302, device='cuda:0', grad_fn=<AddBackward0>)



 85%|████████▌ | 302/354 [27:18<04:41,  5.41s/it][A

	loss_cls: tensor(0.6297, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1969, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8265, device='cuda:0', grad_fn=<AddBackward0>)



 86%|████████▌ | 303/354 [27:23<04:35,  5.40s/it][A

	loss_cls: tensor(0.5752, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.7086, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2838, device='cuda:0', grad_fn=<AddBackward0>)



 86%|████████▌ | 304/354 [27:29<04:29,  5.38s/it][A

	loss_cls: tensor(0.4555, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1039, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5593, device='cuda:0', grad_fn=<AddBackward0>)



 86%|████████▌ | 305/354 [27:34<04:25,  5.41s/it][A

	loss_cls: tensor(0.9810, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3218, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.3028, device='cuda:0', grad_fn=<AddBackward0>)



 86%|████████▋ | 306/354 [27:39<04:19,  5.40s/it][A

	loss_cls: tensor(0.9520, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4225, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.3744, device='cuda:0', grad_fn=<AddBackward0>)



 87%|████████▋ | 307/354 [27:45<04:14,  5.42s/it][A

	loss_cls: tensor(0.4277, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1799, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6076, device='cuda:0', grad_fn=<AddBackward0>)



 87%|████████▋ | 308/354 [27:50<04:08,  5.40s/it][A

	loss_cls: tensor(0.4868, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2081, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6949, device='cuda:0', grad_fn=<AddBackward0>)



 87%|████████▋ | 309/354 [27:56<04:03,  5.41s/it][A

	loss_cls: tensor(0.3804, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1950, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5754, device='cuda:0', grad_fn=<AddBackward0>)



 88%|████████▊ | 310/354 [28:01<03:57,  5.39s/it][A

	loss_cls: tensor(0.6898, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2322, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9220, device='cuda:0', grad_fn=<AddBackward0>)



 88%|████████▊ | 311/354 [28:06<03:52,  5.40s/it][A

	loss_cls: tensor(1.2392, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2277, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.4669, device='cuda:0', grad_fn=<AddBackward0>)



 88%|████████▊ | 312/354 [28:12<03:46,  5.39s/it][A

	loss_cls: tensor(0.5317, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1708, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7025, device='cuda:0', grad_fn=<AddBackward0>)



 88%|████████▊ | 313/354 [28:17<03:40,  5.38s/it][A

	loss_cls: tensor(0.6477, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1680, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8158, device='cuda:0', grad_fn=<AddBackward0>)



 89%|████████▊ | 314/354 [28:23<03:36,  5.40s/it][A

	loss_cls: tensor(1.1147, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0794, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1941, device='cuda:0', grad_fn=<AddBackward0>)



 89%|████████▉ | 315/354 [28:28<03:30,  5.39s/it][A

	loss_cls: tensor(1.0401, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1698, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2099, device='cuda:0', grad_fn=<AddBackward0>)



 89%|████████▉ | 316/354 [28:33<03:25,  5.41s/it][A

	loss_cls: tensor(0.5437, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1756, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7193, device='cuda:0', grad_fn=<AddBackward0>)



 90%|████████▉ | 317/354 [28:39<03:19,  5.39s/it][A

	loss_cls: tensor(0.8911, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1743, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0654, device='cuda:0', grad_fn=<AddBackward0>)



 90%|████████▉ | 318/354 [28:44<03:14,  5.41s/it][A

	loss_cls: tensor(0.5173, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2683, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7856, device='cuda:0', grad_fn=<AddBackward0>)



 90%|█████████ | 319/354 [28:50<03:09,  5.41s/it][A

	loss_cls: tensor(0.4396, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3717, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8113, device='cuda:0', grad_fn=<AddBackward0>)



 90%|█████████ | 320/354 [28:55<03:03,  5.40s/it][A

	loss_cls: tensor(0.3975, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2056, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6031, device='cuda:0', grad_fn=<AddBackward0>)



 91%|█████████ | 321/354 [29:00<02:58,  5.42s/it][A

	loss_cls: tensor(0.6444, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2641, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9085, device='cuda:0', grad_fn=<AddBackward0>)



 91%|█████████ | 322/354 [29:06<02:52,  5.40s/it][A

	loss_cls: tensor(0.5432, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.7948, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.3381, device='cuda:0', grad_fn=<AddBackward0>)



 91%|█████████ | 323/354 [29:11<02:47,  5.41s/it][A

	loss_cls: tensor(0.4923, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3829, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8751, device='cuda:0', grad_fn=<AddBackward0>)



 92%|█████████▏| 324/354 [29:17<02:41,  5.39s/it][A

	loss_cls: tensor(0.4538, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1857, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6395, device='cuda:0', grad_fn=<AddBackward0>)



 92%|█████████▏| 325/354 [29:22<02:37,  5.41s/it][A

	loss_cls: tensor(0.8660, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2724, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1383, device='cuda:0', grad_fn=<AddBackward0>)



 92%|█████████▏| 326/354 [29:27<02:31,  5.41s/it][A

	loss_cls: tensor(0.5090, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1669, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6759, device='cuda:0', grad_fn=<AddBackward0>)



 92%|█████████▏| 327/354 [29:33<02:25,  5.40s/it][A

	loss_cls: tensor(0.7567, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2439, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0005, device='cuda:0', grad_fn=<AddBackward0>)



 93%|█████████▎| 328/354 [29:38<02:20,  5.41s/it][A

	loss_cls: tensor(0.4671, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1636, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6307, device='cuda:0', grad_fn=<AddBackward0>)



 93%|█████████▎| 329/354 [29:44<02:14,  5.39s/it][A

	loss_cls: tensor(0.4340, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1868, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6208, device='cuda:0', grad_fn=<AddBackward0>)



 93%|█████████▎| 330/354 [29:49<02:09,  5.40s/it][A

	loss_cls: tensor(0.4941, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3949, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8890, device='cuda:0', grad_fn=<AddBackward0>)



 94%|█████████▎| 331/354 [29:54<02:03,  5.39s/it][A

	loss_cls: tensor(0.8904, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2333, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1237, device='cuda:0', grad_fn=<AddBackward0>)



 94%|█████████▍| 332/354 [30:00<01:59,  5.41s/it][A

	loss_cls: tensor(0.8086, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2955, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1041, device='cuda:0', grad_fn=<AddBackward0>)



 94%|█████████▍| 333/354 [30:05<01:53,  5.41s/it][A

	loss_cls: tensor(0.3897, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4540, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8437, device='cuda:0', grad_fn=<AddBackward0>)



 94%|█████████▍| 334/354 [30:11<01:47,  5.40s/it][A

	loss_cls: tensor(0.4488, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4201, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8690, device='cuda:0', grad_fn=<AddBackward0>)



 95%|█████████▍| 335/354 [30:16<01:42,  5.42s/it][A

	loss_cls: tensor(0.5488, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2480, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7968, device='cuda:0', grad_fn=<AddBackward0>)



 95%|█████████▍| 336/354 [30:21<01:37,  5.40s/it][A

	loss_cls: tensor(0.7920, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1559, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9479, device='cuda:0', grad_fn=<AddBackward0>)



 95%|█████████▌| 337/354 [30:27<01:32,  5.42s/it][A

	loss_cls: tensor(0.4617, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2240, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6857, device='cuda:0', grad_fn=<AddBackward0>)



 95%|█████████▌| 338/354 [30:32<01:26,  5.41s/it][A

	loss_cls: tensor(0.5660, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1893, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7554, device='cuda:0', grad_fn=<AddBackward0>)



 96%|█████████▌| 339/354 [30:38<01:21,  5.43s/it][A

	loss_cls: tensor(0.7816, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1035, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8852, device='cuda:0', grad_fn=<AddBackward0>)



 96%|█████████▌| 340/354 [30:43<01:15,  5.42s/it][A

	loss_cls: tensor(0.6193, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3000, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9193, device='cuda:0', grad_fn=<AddBackward0>)



 96%|█████████▋| 341/354 [30:48<01:10,  5.40s/it][A

	loss_cls: tensor(0.6180, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2478, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8658, device='cuda:0', grad_fn=<AddBackward0>)



 97%|█████████▋| 342/354 [30:54<01:04,  5.41s/it][A

	loss_cls: tensor(0.6648, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2299, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8947, device='cuda:0', grad_fn=<AddBackward0>)



 97%|█████████▋| 343/354 [30:59<00:59,  5.40s/it][A

	loss_cls: tensor(0.6534, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1854, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8387, device='cuda:0', grad_fn=<AddBackward0>)



 97%|█████████▋| 344/354 [31:05<00:54,  5.41s/it][A

	loss_cls: tensor(0.5659, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1655, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7314, device='cuda:0', grad_fn=<AddBackward0>)



 97%|█████████▋| 345/354 [31:10<00:48,  5.39s/it][A

	loss_cls: tensor(0.4598, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3236, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7834, device='cuda:0', grad_fn=<AddBackward0>)



 98%|█████████▊| 346/354 [31:16<00:43,  5.42s/it][A

	loss_cls: tensor(0.7407, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0329, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7735, device='cuda:0', grad_fn=<AddBackward0>)



 98%|█████████▊| 347/354 [31:21<00:37,  5.40s/it][A

	loss_cls: tensor(0.6224, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2861, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9085, device='cuda:0', grad_fn=<AddBackward0>)



 98%|█████████▊| 348/354 [31:26<00:32,  5.39s/it][A

	loss_cls: tensor(0.4462, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2084, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6546, device='cuda:0', grad_fn=<AddBackward0>)



 99%|█████████▊| 349/354 [31:32<00:27,  5.41s/it][A

	loss_cls: tensor(0.6656, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2989, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9645, device='cuda:0', grad_fn=<AddBackward0>)



 99%|█████████▉| 350/354 [31:37<00:21,  5.40s/it][A

	loss_cls: tensor(0.5330, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2711, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8041, device='cuda:0', grad_fn=<AddBackward0>)



 99%|█████████▉| 351/354 [31:43<00:16,  5.41s/it][A

	loss_cls: tensor(0.6883, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3881, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0764, device='cuda:0', grad_fn=<AddBackward0>)



 99%|█████████▉| 352/354 [31:48<00:10,  5.40s/it][A

	loss_cls: tensor(0.7339, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1759, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9098, device='cuda:0', grad_fn=<AddBackward0>)



100%|█████████▉| 353/354 [31:53<00:05,  5.41s/it][A

	loss_cls: tensor(0.6225, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2103, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8328, device='cuda:0', grad_fn=<AddBackward0>)



100%|██████████| 354/354 [31:55<00:00,  5.41s/it][A

  0%|          | 0/177 [00:00<?, ?it/s][A

	loss_cls: tensor(0.8634, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0670, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9304, device='cuda:0', grad_fn=<AddBackward0>)

	Training Loss: 0.8299905685550075

	Training cls acc: 0.71109934086629

	Training cls prec: 0.5857513229864925

	Training cls rec: 0.6321501513238801

	Training cls f1: 0.551164779257619

--
	Training ner acc: 0.9553735524529472

	Training ner prec: 0.278622022761442

	Training ner rec: 0.28799818455862103

	Training ner f1: 0.28315741778148984

	Current Learning rate:  0.0006285714285714285



  1%|          | 1/177 [00:00<02:09,  1.36it/s][A
  1%|          | 2/177 [00:01<01:59,  1.46it/s][A
  2%|▏         | 3/177 [00:02<02:02,  1.42it/s][A
  2%|▏         | 4/177 [00:02<02:03,  1.40it/s][A
  3%|▎         | 5/177 [00:03<02:04,  1.39it/s][A
  3%|▎         | 6/177 [00:04<01:59,  1.43it/s][A
  4%|▍         | 7/177 [00:04<02:00,  1.41it/s][A
  5%|▍         | 8/177 [00:05<02:00,  1.40it/s][A
  5%|▌         | 9/177 [00:06<01:56,  1.44it/s][A
  6%|▌         | 10/177 [00:07<01:57,  1.42it/s][A
  6%|▌         | 11/177 [00:07<01:58,  1.40it/s][A
  7%|▋         | 12/177 [00:08<01:58,  1.39it/s][A
  7%|▋         | 13/177 [00:09<01:54,  1.43it/s][A
  8%|▊         | 14/177 [00:09<01:55,  1.41it/s][A
  8%|▊         | 15/177 [00:10<01:55,  1.40it/s][A
  9%|▉         | 16/177 [00:11<01:56,  1.39it/s][A
 10%|▉         | 17/177 [00:12<01:52,  1.43it/s][A
 10%|█         | 18/177 [00:12<01:53,  1.40it/s][A
 11%|█         | 19/177 [00:13<01:53,  1.39it/s][A
 11%|█▏        | 20/


	Validation Loss: 0.755827599326096

	Validation cls acc: 0.7464689265536724

	Validation cls prec: 0.6075531342480495

	Validation cls rec: 0.6014763249932741

	Validation cls f1: 0.5728005829700745

--
	Validation ner acc: 0.9556193954945874

	Validation ner prec: 0.4172909780767549

	Validation ner rec: 0.42758945386064034

	Validation ner f1: 0.4222270616798367



  0%|          | 1/354 [00:05<32:06,  5.46s/it][A

	loss_cls: tensor(0.5590, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2591, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8181, device='cuda:0', grad_fn=<AddBackward0>)



  1%|          | 2/354 [00:10<31:47,  5.42s/it][A

	loss_cls: tensor(0.3991, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0581, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4572, device='cuda:0', grad_fn=<AddBackward0>)



  1%|          | 3/354 [00:16<31:52,  5.45s/it][A

	loss_cls: tensor(0.7060, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2625, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9684, device='cuda:0', grad_fn=<AddBackward0>)



  1%|          | 4/354 [00:21<31:38,  5.42s/it][A

	loss_cls: tensor(0.9347, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3113, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2460, device='cuda:0', grad_fn=<AddBackward0>)



  1%|▏         | 5/354 [00:27<31:40,  5.45s/it][A

	loss_cls: tensor(0.5695, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2293, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7988, device='cuda:0', grad_fn=<AddBackward0>)



  2%|▏         | 6/354 [00:32<31:25,  5.42s/it][A

	loss_cls: tensor(0.5247, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2794, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8042, device='cuda:0', grad_fn=<AddBackward0>)



  2%|▏         | 7/354 [00:38<31:25,  5.43s/it][A

	loss_cls: tensor(0.4700, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0929, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5629, device='cuda:0', grad_fn=<AddBackward0>)



  2%|▏         | 8/354 [00:43<31:17,  5.43s/it][A

	loss_cls: tensor(0.3275, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1987, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5261, device='cuda:0', grad_fn=<AddBackward0>)



  3%|▎         | 9/354 [00:48<31:04,  5.41s/it][A

	loss_cls: tensor(0.5872, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2939, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8811, device='cuda:0', grad_fn=<AddBackward0>)



  3%|▎         | 10/354 [00:54<31:06,  5.43s/it][A

	loss_cls: tensor(0.5208, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0667, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5875, device='cuda:0', grad_fn=<AddBackward0>)



  3%|▎         | 11/354 [00:59<30:54,  5.41s/it][A

	loss_cls: tensor(0.6659, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2704, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9363, device='cuda:0', grad_fn=<AddBackward0>)



  3%|▎         | 12/354 [01:05<30:51,  5.41s/it][A

	loss_cls: tensor(0.3490, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3306, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6795, device='cuda:0', grad_fn=<AddBackward0>)



  4%|▎         | 13/354 [01:10<30:36,  5.39s/it][A

	loss_cls: tensor(0.7190, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2969, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0160, device='cuda:0', grad_fn=<AddBackward0>)



  4%|▍         | 14/354 [01:15<30:37,  5.40s/it][A

	loss_cls: tensor(0.4410, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2660, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7070, device='cuda:0', grad_fn=<AddBackward0>)



  4%|▍         | 15/354 [01:21<30:27,  5.39s/it][A

	loss_cls: tensor(0.5744, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0600, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6345, device='cuda:0', grad_fn=<AddBackward0>)



  5%|▍         | 16/354 [01:26<30:18,  5.38s/it][A

	loss_cls: tensor(0.6851, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1011, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7862, device='cuda:0', grad_fn=<AddBackward0>)



  5%|▍         | 17/354 [01:31<30:19,  5.40s/it][A

	loss_cls: tensor(0.6806, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4220, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1026, device='cuda:0', grad_fn=<AddBackward0>)



  5%|▌         | 18/354 [01:37<30:11,  5.39s/it][A

	loss_cls: tensor(0.4396, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1017, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5413, device='cuda:0', grad_fn=<AddBackward0>)



  5%|▌         | 19/354 [01:42<30:12,  5.41s/it][A

	loss_cls: tensor(0.4752, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1550, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6302, device='cuda:0', grad_fn=<AddBackward0>)



  6%|▌         | 20/354 [01:48<30:02,  5.40s/it][A

	loss_cls: tensor(1.0429, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1519, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1948, device='cuda:0', grad_fn=<AddBackward0>)



  6%|▌         | 21/354 [01:53<30:00,  5.41s/it][A

	loss_cls: tensor(0.6608, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2323, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8930, device='cuda:0', grad_fn=<AddBackward0>)



  6%|▌         | 22/354 [01:58<29:50,  5.39s/it][A

	loss_cls: tensor(0.7042, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1290, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8333, device='cuda:0', grad_fn=<AddBackward0>)



  6%|▋         | 23/354 [02:04<29:39,  5.37s/it][A

	loss_cls: tensor(0.3453, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1467, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4921, device='cuda:0', grad_fn=<AddBackward0>)



  7%|▋         | 24/354 [02:09<29:40,  5.40s/it][A

	loss_cls: tensor(0.4679, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2075, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6754, device='cuda:0', grad_fn=<AddBackward0>)



  7%|▋         | 25/354 [02:15<29:30,  5.38s/it][A

	loss_cls: tensor(0.5492, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2281, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7773, device='cuda:0', grad_fn=<AddBackward0>)



  7%|▋         | 26/354 [02:20<29:34,  5.41s/it][A

	loss_cls: tensor(0.9064, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3360, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2424, device='cuda:0', grad_fn=<AddBackward0>)



  8%|▊         | 27/354 [02:25<29:28,  5.41s/it][A

	loss_cls: tensor(0.3491, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4056, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7546, device='cuda:0', grad_fn=<AddBackward0>)



  8%|▊         | 28/354 [02:31<29:28,  5.42s/it][A

	loss_cls: tensor(0.5150, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4989, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0139, device='cuda:0', grad_fn=<AddBackward0>)



  8%|▊         | 29/354 [02:36<29:19,  5.41s/it][A

	loss_cls: tensor(0.7797, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2282, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0079, device='cuda:0', grad_fn=<AddBackward0>)



  8%|▊         | 30/354 [02:42<29:12,  5.41s/it][A

	loss_cls: tensor(0.5871, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2416, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8287, device='cuda:0', grad_fn=<AddBackward0>)



  9%|▉         | 31/354 [02:47<29:14,  5.43s/it][A

	loss_cls: tensor(1.0627, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2358, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2985, device='cuda:0', grad_fn=<AddBackward0>)



  9%|▉         | 32/354 [02:53<29:04,  5.42s/it][A

	loss_cls: tensor(0.5225, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3212, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8437, device='cuda:0', grad_fn=<AddBackward0>)



  9%|▉         | 33/354 [02:58<29:02,  5.43s/it][A

	loss_cls: tensor(0.8238, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1655, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9893, device='cuda:0', grad_fn=<AddBackward0>)



 10%|▉         | 34/354 [03:03<28:53,  5.42s/it][A

	loss_cls: tensor(0.7426, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2100, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9526, device='cuda:0', grad_fn=<AddBackward0>)



 10%|▉         | 35/354 [03:09<28:53,  5.43s/it][A

	loss_cls: tensor(0.7596, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2188, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9784, device='cuda:0', grad_fn=<AddBackward0>)



 10%|█         | 36/354 [03:14<28:44,  5.42s/it][A

	loss_cls: tensor(0.7861, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2715, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0577, device='cuda:0', grad_fn=<AddBackward0>)



 10%|█         | 37/354 [03:20<28:36,  5.41s/it][A

	loss_cls: tensor(0.6528, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1916, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8443, device='cuda:0', grad_fn=<AddBackward0>)



 11%|█         | 38/354 [03:25<28:37,  5.44s/it][A

	loss_cls: tensor(0.5014, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5657, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0671, device='cuda:0', grad_fn=<AddBackward0>)



 11%|█         | 39/354 [03:31<28:27,  5.42s/it][A

	loss_cls: tensor(0.6983, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1329, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8312, device='cuda:0', grad_fn=<AddBackward0>)



 11%|█▏        | 40/354 [03:36<28:26,  5.44s/it][A

	loss_cls: tensor(0.6834, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2073, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8907, device='cuda:0', grad_fn=<AddBackward0>)



 12%|█▏        | 41/354 [03:41<28:18,  5.43s/it][A

	loss_cls: tensor(0.5151, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3317, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8468, device='cuda:0', grad_fn=<AddBackward0>)



 12%|█▏        | 42/354 [03:47<28:15,  5.43s/it][A

	loss_cls: tensor(0.6649, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3241, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9890, device='cuda:0', grad_fn=<AddBackward0>)



 12%|█▏        | 43/354 [03:52<28:05,  5.42s/it][A

	loss_cls: tensor(0.5478, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2653, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8131, device='cuda:0', grad_fn=<AddBackward0>)



 12%|█▏        | 44/354 [03:58<27:57,  5.41s/it][A

	loss_cls: tensor(0.5458, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3303, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8762, device='cuda:0', grad_fn=<AddBackward0>)



 13%|█▎        | 45/354 [04:03<27:58,  5.43s/it][A

	loss_cls: tensor(0.5566, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2234, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7800, device='cuda:0', grad_fn=<AddBackward0>)



 13%|█▎        | 46/354 [04:09<27:46,  5.41s/it][A

	loss_cls: tensor(0.6294, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2246, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8540, device='cuda:0', grad_fn=<AddBackward0>)



 13%|█▎        | 47/354 [04:14<27:47,  5.43s/it][A

	loss_cls: tensor(0.5024, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3232, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8256, device='cuda:0', grad_fn=<AddBackward0>)



 14%|█▎        | 48/354 [04:19<27:38,  5.42s/it][A

	loss_cls: tensor(0.5093, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1856, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6949, device='cuda:0', grad_fn=<AddBackward0>)



 14%|█▍        | 49/354 [04:25<27:39,  5.44s/it][A

	loss_cls: tensor(1.1033, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2486, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.3520, device='cuda:0', grad_fn=<AddBackward0>)



 14%|█▍        | 50/354 [04:30<27:27,  5.42s/it][A

	loss_cls: tensor(0.5840, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2194, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8034, device='cuda:0', grad_fn=<AddBackward0>)



 14%|█▍        | 51/354 [04:36<27:19,  5.41s/it][A

	loss_cls: tensor(0.7402, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3647, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1049, device='cuda:0', grad_fn=<AddBackward0>)



 15%|█▍        | 52/354 [04:41<27:18,  5.43s/it][A

	loss_cls: tensor(0.5209, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1519, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6728, device='cuda:0', grad_fn=<AddBackward0>)



 15%|█▍        | 53/354 [04:46<27:09,  5.41s/it][A

	loss_cls: tensor(0.7885, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4386, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2270, device='cuda:0', grad_fn=<AddBackward0>)



 15%|█▌        | 54/354 [04:52<27:09,  5.43s/it][A

	loss_cls: tensor(0.7482, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1540, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9022, device='cuda:0', grad_fn=<AddBackward0>)



 16%|█▌        | 55/354 [04:57<27:02,  5.43s/it][A

	loss_cls: tensor(0.6631, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4510, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1141, device='cuda:0', grad_fn=<AddBackward0>)



 16%|█▌        | 56/354 [05:03<27:01,  5.44s/it][A

	loss_cls: tensor(0.5273, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3968, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9240, device='cuda:0', grad_fn=<AddBackward0>)



 16%|█▌        | 57/354 [05:08<26:54,  5.43s/it][A

	loss_cls: tensor(0.5211, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2427, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7638, device='cuda:0', grad_fn=<AddBackward0>)



 16%|█▋        | 58/354 [05:14<26:44,  5.42s/it][A

	loss_cls: tensor(0.6140, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3944, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0083, device='cuda:0', grad_fn=<AddBackward0>)



 17%|█▋        | 59/354 [05:19<26:44,  5.44s/it][A

	loss_cls: tensor(0.6548, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1825, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8373, device='cuda:0', grad_fn=<AddBackward0>)



 17%|█▋        | 60/354 [05:25<26:32,  5.42s/it][A

	loss_cls: tensor(0.5767, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0704, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6471, device='cuda:0', grad_fn=<AddBackward0>)



 17%|█▋        | 61/354 [05:30<26:40,  5.46s/it][A

	loss_cls: tensor(0.4942, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3687, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8629, device='cuda:0', grad_fn=<AddBackward0>)



 18%|█▊        | 62/354 [05:36<26:38,  5.47s/it][A

	loss_cls: tensor(0.4960, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2802, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7763, device='cuda:0', grad_fn=<AddBackward0>)



 18%|█▊        | 63/354 [05:41<26:39,  5.50s/it][A

	loss_cls: tensor(0.6586, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4213, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0798, device='cuda:0', grad_fn=<AddBackward0>)



 18%|█▊        | 64/354 [05:47<26:34,  5.50s/it][A

	loss_cls: tensor(0.5409, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4954, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0363, device='cuda:0', grad_fn=<AddBackward0>)



 18%|█▊        | 65/354 [05:52<26:26,  5.49s/it][A

	loss_cls: tensor(0.8272, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1568, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9840, device='cuda:0', grad_fn=<AddBackward0>)



 19%|█▊        | 66/354 [05:58<26:24,  5.50s/it][A

	loss_cls: tensor(0.4580, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1810, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6390, device='cuda:0', grad_fn=<AddBackward0>)



 19%|█▉        | 67/354 [06:03<25:33,  5.34s/it][A

	loss_cls: tensor(0.6735, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0846, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7581, device='cuda:0', grad_fn=<AddBackward0>)



 19%|█▉        | 68/354 [06:08<25:39,  5.38s/it][A

	loss_cls: tensor(0.6690, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0651, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7341, device='cuda:0', grad_fn=<AddBackward0>)



 19%|█▉        | 69/354 [06:14<25:41,  5.41s/it][A

	loss_cls: tensor(0.5478, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1221, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6699, device='cuda:0', grad_fn=<AddBackward0>)



 20%|█▉        | 70/354 [06:19<25:49,  5.46s/it][A

	loss_cls: tensor(0.7385, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2528, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9913, device='cuda:0', grad_fn=<AddBackward0>)



 20%|██        | 71/354 [06:25<25:48,  5.47s/it][A

	loss_cls: tensor(0.5513, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2207, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7719, device='cuda:0', grad_fn=<AddBackward0>)



 20%|██        | 72/354 [06:30<25:44,  5.48s/it][A

	loss_cls: tensor(0.6194, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2998, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9192, device='cuda:0', grad_fn=<AddBackward0>)



 21%|██        | 73/354 [06:36<25:43,  5.49s/it][A

	loss_cls: tensor(0.7242, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3678, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0920, device='cuda:0', grad_fn=<AddBackward0>)



 21%|██        | 74/354 [06:41<25:35,  5.49s/it][A

	loss_cls: tensor(0.5450, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1841, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7291, device='cuda:0', grad_fn=<AddBackward0>)



 21%|██        | 75/354 [06:47<25:37,  5.51s/it][A

	loss_cls: tensor(0.3887, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1939, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5825, device='cuda:0', grad_fn=<AddBackward0>)



 21%|██▏       | 76/354 [06:52<24:47,  5.35s/it][A

	loss_cls: tensor(0.6917, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2974, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9892, device='cuda:0', grad_fn=<AddBackward0>)



 22%|██▏       | 77/354 [06:57<24:16,  5.26s/it][A

	loss_cls: tensor(0.5658, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0319, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5977, device='cuda:0', grad_fn=<AddBackward0>)



 22%|██▏       | 78/354 [07:02<23:50,  5.18s/it][A

	loss_cls: tensor(0.5378, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3778, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9156, device='cuda:0', grad_fn=<AddBackward0>)



 22%|██▏       | 79/354 [07:07<23:29,  5.13s/it][A

	loss_cls: tensor(0.8795, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2781, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1576, device='cuda:0', grad_fn=<AddBackward0>)



 23%|██▎       | 80/354 [07:12<23:46,  5.21s/it][A

	loss_cls: tensor(0.7357, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3400, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0757, device='cuda:0', grad_fn=<AddBackward0>)



 23%|██▎       | 81/354 [07:18<24:04,  5.29s/it][A

	loss_cls: tensor(0.6058, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0486, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6544, device='cuda:0', grad_fn=<AddBackward0>)



 23%|██▎       | 82/354 [07:23<24:21,  5.37s/it][A

	loss_cls: tensor(0.4536, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0448, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4984, device='cuda:0', grad_fn=<AddBackward0>)



 23%|██▎       | 83/354 [07:29<24:23,  5.40s/it][A

	loss_cls: tensor(0.6973, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2009, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8982, device='cuda:0', grad_fn=<AddBackward0>)



 24%|██▎       | 84/354 [07:34<23:48,  5.29s/it][A

	loss_cls: tensor(0.6440, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3775, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0215, device='cuda:0', grad_fn=<AddBackward0>)



 24%|██▍       | 85/354 [07:39<23:40,  5.28s/it][A

	loss_cls: tensor(0.7445, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1809, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9255, device='cuda:0', grad_fn=<AddBackward0>)



 24%|██▍       | 86/354 [07:44<23:50,  5.34s/it][A

	loss_cls: tensor(0.4498, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1365, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5863, device='cuda:0', grad_fn=<AddBackward0>)



 25%|██▍       | 87/354 [07:50<24:03,  5.41s/it][A

	loss_cls: tensor(0.4857, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2892, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7749, device='cuda:0', grad_fn=<AddBackward0>)



 25%|██▍       | 88/354 [07:55<24:04,  5.43s/it][A

	loss_cls: tensor(0.4602, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1049, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5651, device='cuda:0', grad_fn=<AddBackward0>)



 25%|██▌       | 89/354 [08:01<24:10,  5.47s/it][A

	loss_cls: tensor(0.5357, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1281, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6638, device='cuda:0', grad_fn=<AddBackward0>)



 25%|██▌       | 90/354 [08:06<24:04,  5.47s/it][A

	loss_cls: tensor(0.6490, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2315, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8804, device='cuda:0', grad_fn=<AddBackward0>)



 26%|██▌       | 91/354 [08:12<24:05,  5.50s/it][A

	loss_cls: tensor(0.7151, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0905, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8057, device='cuda:0', grad_fn=<AddBackward0>)



 26%|██▌       | 92/354 [08:17<23:17,  5.34s/it][A

	loss_cls: tensor(0.7424, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0995, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8420, device='cuda:0', grad_fn=<AddBackward0>)



 26%|██▋       | 93/354 [08:22<22:43,  5.22s/it][A

	loss_cls: tensor(0.6668, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2664, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9331, device='cuda:0', grad_fn=<AddBackward0>)



 27%|██▋       | 94/354 [08:27<22:25,  5.18s/it][A

	loss_cls: tensor(0.6635, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4767, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1402, device='cuda:0', grad_fn=<AddBackward0>)



 27%|██▋       | 95/354 [08:32<22:07,  5.13s/it][A

	loss_cls: tensor(0.5930, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4068, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9998, device='cuda:0', grad_fn=<AddBackward0>)



 27%|██▋       | 96/354 [08:37<21:59,  5.11s/it][A

	loss_cls: tensor(0.4358, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1880, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6237, device='cuda:0', grad_fn=<AddBackward0>)



 27%|██▋       | 97/354 [08:43<22:18,  5.21s/it][A

	loss_cls: tensor(0.4553, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2255, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6807, device='cuda:0', grad_fn=<AddBackward0>)



 28%|██▊       | 98/354 [08:48<22:38,  5.31s/it][A

	loss_cls: tensor(0.6105, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1556, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7661, device='cuda:0', grad_fn=<AddBackward0>)



 28%|██▊       | 99/354 [08:54<22:45,  5.36s/it][A

	loss_cls: tensor(0.5908, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3527, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9435, device='cuda:0', grad_fn=<AddBackward0>)



 28%|██▊       | 100/354 [08:59<22:53,  5.41s/it][A

	loss_cls: tensor(0.5731, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2588, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8320, device='cuda:0', grad_fn=<AddBackward0>)



 29%|██▊       | 101/354 [09:05<22:53,  5.43s/it][A

	loss_cls: tensor(0.6649, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2656, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9305, device='cuda:0', grad_fn=<AddBackward0>)



 29%|██▉       | 102/354 [09:10<22:51,  5.44s/it][A

	loss_cls: tensor(0.7600, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2183, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9784, device='cuda:0', grad_fn=<AddBackward0>)



 29%|██▉       | 103/354 [09:16<22:55,  5.48s/it][A

	loss_cls: tensor(0.6316, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3032, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9348, device='cuda:0', grad_fn=<AddBackward0>)



 29%|██▉       | 104/354 [09:21<22:50,  5.48s/it][A

	loss_cls: tensor(0.5367, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5781, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1147, device='cuda:0', grad_fn=<AddBackward0>)



 30%|██▉       | 105/354 [09:27<22:50,  5.50s/it][A

	loss_cls: tensor(0.5942, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1173, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7115, device='cuda:0', grad_fn=<AddBackward0>)



 30%|██▉       | 106/354 [09:32<22:42,  5.49s/it][A

	loss_cls: tensor(0.7698, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1723, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9421, device='cuda:0', grad_fn=<AddBackward0>)



 30%|███       | 107/354 [09:38<22:41,  5.51s/it][A

	loss_cls: tensor(0.5363, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1924, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7287, device='cuda:0', grad_fn=<AddBackward0>)



 31%|███       | 108/354 [09:43<22:33,  5.50s/it][A

	loss_cls: tensor(0.9670, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3294, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2964, device='cuda:0', grad_fn=<AddBackward0>)



 31%|███       | 109/354 [09:49<22:26,  5.50s/it][A

	loss_cls: tensor(0.5725, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3519, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9244, device='cuda:0', grad_fn=<AddBackward0>)



 31%|███       | 110/354 [09:54<22:25,  5.51s/it][A

	loss_cls: tensor(0.8804, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1766, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0570, device='cuda:0', grad_fn=<AddBackward0>)



 31%|███▏      | 111/354 [10:00<22:15,  5.50s/it][A

	loss_cls: tensor(0.5976, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1760, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7736, device='cuda:0', grad_fn=<AddBackward0>)



 32%|███▏      | 112/354 [10:05<21:36,  5.36s/it][A

	loss_cls: tensor(0.7671, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0872, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8543, device='cuda:0', grad_fn=<AddBackward0>)



 32%|███▏      | 113/354 [10:10<21:38,  5.39s/it][A

	loss_cls: tensor(0.7555, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.6322, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.3877, device='cuda:0', grad_fn=<AddBackward0>)



 32%|███▏      | 114/354 [10:15<21:07,  5.28s/it][A

	loss_cls: tensor(0.5399, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0772, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6171, device='cuda:0', grad_fn=<AddBackward0>)



 32%|███▏      | 115/354 [10:20<20:41,  5.19s/it][A

	loss_cls: tensor(0.4683, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0402, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5085, device='cuda:0', grad_fn=<AddBackward0>)



 33%|███▎      | 116/354 [10:25<20:22,  5.14s/it][A

	loss_cls: tensor(0.8368, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2378, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0746, device='cuda:0', grad_fn=<AddBackward0>)



 33%|███▎      | 117/354 [10:30<20:12,  5.11s/it][A

	loss_cls: tensor(0.6584, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1778, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8363, device='cuda:0', grad_fn=<AddBackward0>)



 33%|███▎      | 118/354 [10:35<19:59,  5.08s/it][A

	loss_cls: tensor(0.7864, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2045, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9909, device='cuda:0', grad_fn=<AddBackward0>)



 34%|███▎      | 119/354 [10:40<19:55,  5.09s/it][A

	loss_cls: tensor(0.8797, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2195, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0992, device='cuda:0', grad_fn=<AddBackward0>)



 34%|███▍      | 120/354 [10:46<20:14,  5.19s/it][A

	loss_cls: tensor(0.6913, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3611, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0524, device='cuda:0', grad_fn=<AddBackward0>)



 34%|███▍      | 121/354 [10:51<20:35,  5.30s/it][A

	loss_cls: tensor(0.5711, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0338, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6049, device='cuda:0', grad_fn=<AddBackward0>)



 34%|███▍      | 122/354 [10:57<20:44,  5.36s/it][A

	loss_cls: tensor(0.6006, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1041, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7047, device='cuda:0', grad_fn=<AddBackward0>)



 35%|███▍      | 123/354 [11:02<20:45,  5.39s/it][A

	loss_cls: tensor(0.6089, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0868, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6957, device='cuda:0', grad_fn=<AddBackward0>)



 35%|███▌      | 124/354 [11:08<20:53,  5.45s/it][A

	loss_cls: tensor(0.6181, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0310, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6491, device='cuda:0', grad_fn=<AddBackward0>)



 35%|███▌      | 125/354 [11:13<20:52,  5.47s/it][A

	loss_cls: tensor(0.5692, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3286, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8978, device='cuda:0', grad_fn=<AddBackward0>)



 36%|███▌      | 126/354 [11:19<20:52,  5.49s/it][A

	loss_cls: tensor(0.5079, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2610, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7689, device='cuda:0', grad_fn=<AddBackward0>)



 36%|███▌      | 127/354 [11:24<20:46,  5.49s/it][A

	loss_cls: tensor(0.6419, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2169, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8588, device='cuda:0', grad_fn=<AddBackward0>)



 36%|███▌      | 128/354 [11:30<20:45,  5.51s/it][A

	loss_cls: tensor(0.6460, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1202, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7662, device='cuda:0', grad_fn=<AddBackward0>)



 36%|███▋      | 129/354 [11:35<20:39,  5.51s/it][A

	loss_cls: tensor(0.4641, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1887, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6529, device='cuda:0', grad_fn=<AddBackward0>)



 37%|███▋      | 130/354 [11:41<20:31,  5.50s/it][A

	loss_cls: tensor(0.6838, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4198, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1036, device='cuda:0', grad_fn=<AddBackward0>)



 37%|███▋      | 131/354 [11:46<20:29,  5.51s/it][A

	loss_cls: tensor(0.5847, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1979, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7826, device='cuda:0', grad_fn=<AddBackward0>)



 37%|███▋      | 132/354 [11:52<20:21,  5.50s/it][A

	loss_cls: tensor(0.6018, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3673, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9691, device='cuda:0', grad_fn=<AddBackward0>)



 38%|███▊      | 133/354 [11:57<20:10,  5.48s/it][A

	loss_cls: tensor(0.5851, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1952, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7803, device='cuda:0', grad_fn=<AddBackward0>)



 38%|███▊      | 134/354 [12:03<20:04,  5.47s/it][A

	loss_cls: tensor(0.7075, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0856, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7931, device='cuda:0', grad_fn=<AddBackward0>)



 38%|███▊      | 135/354 [12:08<19:29,  5.34s/it][A

	loss_cls: tensor(0.5796, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2833, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8630, device='cuda:0', grad_fn=<AddBackward0>)



 38%|███▊      | 136/354 [12:13<19:32,  5.38s/it][A

	loss_cls: tensor(0.4820, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0966, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5786, device='cuda:0', grad_fn=<AddBackward0>)



 39%|███▊      | 137/354 [12:19<19:33,  5.41s/it][A

	loss_cls: tensor(0.5525, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1982, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7507, device='cuda:0', grad_fn=<AddBackward0>)



 39%|███▉      | 138/354 [12:24<19:10,  5.32s/it][A

	loss_cls: tensor(0.6126, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1586, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7712, device='cuda:0', grad_fn=<AddBackward0>)



 39%|███▉      | 139/354 [12:29<19:12,  5.36s/it][A

	loss_cls: tensor(0.6352, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3009, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9361, device='cuda:0', grad_fn=<AddBackward0>)



 40%|███▉      | 140/354 [12:35<19:14,  5.39s/it][A

	loss_cls: tensor(0.6019, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1657, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7677, device='cuda:0', grad_fn=<AddBackward0>)



 40%|███▉      | 141/354 [12:40<19:13,  5.42s/it][A

	loss_cls: tensor(0.5836, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0483, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6319, device='cuda:0', grad_fn=<AddBackward0>)



 40%|████      | 142/354 [12:46<19:15,  5.45s/it][A

	loss_cls: tensor(0.6322, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0487, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6809, device='cuda:0', grad_fn=<AddBackward0>)



 40%|████      | 143/354 [12:51<19:12,  5.46s/it][A

	loss_cls: tensor(0.4264, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2093, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6357, device='cuda:0', grad_fn=<AddBackward0>)



 41%|████      | 144/354 [12:57<19:07,  5.47s/it][A

	loss_cls: tensor(0.6067, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2706, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8773, device='cuda:0', grad_fn=<AddBackward0>)



 41%|████      | 145/354 [13:02<19:07,  5.49s/it][A

	loss_cls: tensor(0.3371, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0254, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.3625, device='cuda:0', grad_fn=<AddBackward0>)



 41%|████      | 146/354 [13:08<19:00,  5.48s/it][A

	loss_cls: tensor(0.4813, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4120, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8933, device='cuda:0', grad_fn=<AddBackward0>)



 42%|████▏     | 147/354 [13:13<18:59,  5.51s/it][A

	loss_cls: tensor(0.7351, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2569, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9920, device='cuda:0', grad_fn=<AddBackward0>)



 42%|████▏     | 148/354 [13:19<18:53,  5.50s/it][A

	loss_cls: tensor(0.8320, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4094, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2414, device='cuda:0', grad_fn=<AddBackward0>)



 42%|████▏     | 149/354 [13:24<18:50,  5.51s/it][A

	loss_cls: tensor(0.5666, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1266, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6931, device='cuda:0', grad_fn=<AddBackward0>)



 42%|████▏     | 150/354 [13:30<18:41,  5.50s/it][A

	loss_cls: tensor(0.7144, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4513, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1657, device='cuda:0', grad_fn=<AddBackward0>)



 43%|████▎     | 151/354 [13:35<18:33,  5.49s/it][A

	loss_cls: tensor(0.3640, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2170, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5810, device='cuda:0', grad_fn=<AddBackward0>)



 43%|████▎     | 152/354 [13:41<18:32,  5.51s/it][A

	loss_cls: tensor(0.4040, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1756, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5796, device='cuda:0', grad_fn=<AddBackward0>)



 43%|████▎     | 153/354 [13:46<18:25,  5.50s/it][A

	loss_cls: tensor(0.3647, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0430, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4077, device='cuda:0', grad_fn=<AddBackward0>)



 44%|████▎     | 154/354 [13:52<18:25,  5.53s/it][A

	loss_cls: tensor(0.4043, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0654, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4697, device='cuda:0', grad_fn=<AddBackward0>)



 44%|████▍     | 155/354 [13:57<17:56,  5.41s/it][A

	loss_cls: tensor(0.4993, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2409, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7402, device='cuda:0', grad_fn=<AddBackward0>)



 44%|████▍     | 156/354 [14:03<17:59,  5.45s/it][A

	loss_cls: tensor(0.2373, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0457, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.2831, device='cuda:0', grad_fn=<AddBackward0>)



 44%|████▍     | 157/354 [14:08<17:55,  5.46s/it][A

	loss_cls: tensor(0.8576, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3960, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2536, device='cuda:0', grad_fn=<AddBackward0>)



 45%|████▍     | 158/354 [14:13<17:20,  5.31s/it][A

	loss_cls: tensor(0.8508, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2208, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0716, device='cuda:0', grad_fn=<AddBackward0>)



 45%|████▍     | 159/354 [14:19<17:29,  5.38s/it][A

	loss_cls: tensor(1.0529, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2755, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.3284, device='cuda:0', grad_fn=<AddBackward0>)



 45%|████▌     | 160/354 [14:24<17:30,  5.41s/it][A

	loss_cls: tensor(0.8551, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3362, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1914, device='cuda:0', grad_fn=<AddBackward0>)



 45%|████▌     | 161/354 [14:30<17:33,  5.46s/it][A

	loss_cls: tensor(0.5946, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3524, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9470, device='cuda:0', grad_fn=<AddBackward0>)



 46%|████▌     | 162/354 [14:35<17:29,  5.47s/it][A

	loss_cls: tensor(0.5550, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0580, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6130, device='cuda:0', grad_fn=<AddBackward0>)



 46%|████▌     | 163/354 [14:41<17:30,  5.50s/it][A

	loss_cls: tensor(0.3975, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1291, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5266, device='cuda:0', grad_fn=<AddBackward0>)



 46%|████▋     | 164/354 [14:46<17:23,  5.49s/it][A

	loss_cls: tensor(0.6024, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2120, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8144, device='cuda:0', grad_fn=<AddBackward0>)



 47%|████▋     | 165/354 [14:52<17:13,  5.47s/it][A

	loss_cls: tensor(0.5825, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1315, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7140, device='cuda:0', grad_fn=<AddBackward0>)



 47%|████▋     | 166/354 [14:57<17:13,  5.50s/it][A

	loss_cls: tensor(0.6848, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1924, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8772, device='cuda:0', grad_fn=<AddBackward0>)



 47%|████▋     | 167/354 [15:02<16:37,  5.34s/it][A

	loss_cls: tensor(0.4712, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0835, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5548, device='cuda:0', grad_fn=<AddBackward0>)



 47%|████▋     | 168/354 [15:07<16:31,  5.33s/it][A

	loss_cls: tensor(0.4642, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2423, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7065, device='cuda:0', grad_fn=<AddBackward0>)



 48%|████▊     | 169/354 [15:13<16:35,  5.38s/it][A

	loss_cls: tensor(0.6186, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4360, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0545, device='cuda:0', grad_fn=<AddBackward0>)



 48%|████▊     | 170/354 [15:19<16:38,  5.43s/it][A

	loss_cls: tensor(0.4264, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0617, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4881, device='cuda:0', grad_fn=<AddBackward0>)



 48%|████▊     | 171/354 [15:24<16:36,  5.44s/it][A

	loss_cls: tensor(0.5495, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0716, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6210, device='cuda:0', grad_fn=<AddBackward0>)



 49%|████▊     | 172/354 [15:30<16:33,  5.46s/it][A

	loss_cls: tensor(0.4326, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1591, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5916, device='cuda:0', grad_fn=<AddBackward0>)



 49%|████▉     | 173/354 [15:35<16:34,  5.49s/it][A

	loss_cls: tensor(0.4158, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0534, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4693, device='cuda:0', grad_fn=<AddBackward0>)



 49%|████▉     | 174/354 [15:41<16:30,  5.50s/it][A

	loss_cls: tensor(0.6340, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2825, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9165, device='cuda:0', grad_fn=<AddBackward0>)



 49%|████▉     | 175/354 [15:46<16:32,  5.54s/it][A

	loss_cls: tensor(0.4766, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2234, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7000, device='cuda:0', grad_fn=<AddBackward0>)



 50%|████▉     | 176/354 [15:52<16:22,  5.52s/it][A

	loss_cls: tensor(0.4860, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4527, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9387, device='cuda:0', grad_fn=<AddBackward0>)



 50%|█████     | 177/354 [15:57<16:20,  5.54s/it][A

	loss_cls: tensor(1.0392, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1671, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2063, device='cuda:0', grad_fn=<AddBackward0>)



 50%|█████     | 178/354 [16:03<16:13,  5.53s/it][A

	loss_cls: tensor(0.9573, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4930, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.4503, device='cuda:0', grad_fn=<AddBackward0>)



 51%|█████     | 179/354 [16:08<16:03,  5.51s/it][A

	loss_cls: tensor(0.7428, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3041, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0469, device='cuda:0', grad_fn=<AddBackward0>)



 51%|█████     | 180/354 [16:14<16:00,  5.52s/it][A

	loss_cls: tensor(0.8475, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1861, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0336, device='cuda:0', grad_fn=<AddBackward0>)



 51%|█████     | 181/354 [16:19<15:54,  5.52s/it][A

	loss_cls: tensor(0.6440, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3509, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9950, device='cuda:0', grad_fn=<AddBackward0>)



 51%|█████▏    | 182/354 [16:25<15:53,  5.54s/it][A

	loss_cls: tensor(0.4852, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1402, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6254, device='cuda:0', grad_fn=<AddBackward0>)



 52%|█████▏    | 183/354 [16:30<15:19,  5.38s/it][A

	loss_cls: tensor(0.2893, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2255, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5148, device='cuda:0', grad_fn=<AddBackward0>)



 52%|█████▏    | 184/354 [16:35<15:16,  5.39s/it][A

	loss_cls: tensor(0.5929, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1785, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7714, device='cuda:0', grad_fn=<AddBackward0>)



 52%|█████▏    | 185/354 [16:41<15:14,  5.41s/it][A

	loss_cls: tensor(0.6740, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2165, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8905, device='cuda:0', grad_fn=<AddBackward0>)



 53%|█████▎    | 186/354 [16:46<15:13,  5.44s/it][A

	loss_cls: tensor(0.6948, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2157, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9104, device='cuda:0', grad_fn=<AddBackward0>)



 53%|█████▎    | 187/354 [16:52<15:10,  5.45s/it][A

	loss_cls: tensor(0.6957, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0663, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7621, device='cuda:0', grad_fn=<AddBackward0>)



 53%|█████▎    | 188/354 [16:57<15:06,  5.46s/it][A

	loss_cls: tensor(0.5372, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3930, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9303, device='cuda:0', grad_fn=<AddBackward0>)



 53%|█████▎    | 189/354 [17:03<15:05,  5.49s/it][A

	loss_cls: tensor(0.6253, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3496, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9749, device='cuda:0', grad_fn=<AddBackward0>)



 54%|█████▎    | 190/354 [17:08<14:59,  5.49s/it][A

	loss_cls: tensor(0.5689, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1330, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7020, device='cuda:0', grad_fn=<AddBackward0>)



 54%|█████▍    | 191/354 [17:14<14:57,  5.51s/it][A

	loss_cls: tensor(1.0204, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0886, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1090, device='cuda:0', grad_fn=<AddBackward0>)



 54%|█████▍    | 192/354 [17:19<14:49,  5.49s/it][A

	loss_cls: tensor(0.4564, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1478, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6042, device='cuda:0', grad_fn=<AddBackward0>)



 55%|█████▍    | 193/354 [17:25<14:42,  5.48s/it][A

	loss_cls: tensor(0.4286, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0900, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5185, device='cuda:0', grad_fn=<AddBackward0>)



 55%|█████▍    | 194/354 [17:30<14:41,  5.51s/it][A

	loss_cls: tensor(0.5808, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1708, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7517, device='cuda:0', grad_fn=<AddBackward0>)



 55%|█████▌    | 195/354 [17:35<14:10,  5.35s/it][A

	loss_cls: tensor(0.6450, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0792, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7241, device='cuda:0', grad_fn=<AddBackward0>)



 55%|█████▌    | 196/354 [17:40<13:50,  5.25s/it][A

	loss_cls: tensor(0.4611, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1075, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5686, device='cuda:0', grad_fn=<AddBackward0>)



 56%|█████▌    | 197/354 [17:45<13:39,  5.22s/it][A

	loss_cls: tensor(0.5677, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2623, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8300, device='cuda:0', grad_fn=<AddBackward0>)



 56%|█████▌    | 198/354 [17:51<13:49,  5.31s/it][A

	loss_cls: tensor(0.6776, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1403, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8180, device='cuda:0', grad_fn=<AddBackward0>)



 56%|█████▌    | 199/354 [17:56<13:50,  5.36s/it][A

	loss_cls: tensor(0.6805, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3787, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0592, device='cuda:0', grad_fn=<AddBackward0>)



 56%|█████▋    | 200/354 [18:02<13:52,  5.41s/it][A

	loss_cls: tensor(0.5251, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2529, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7780, device='cuda:0', grad_fn=<AddBackward0>)



 57%|█████▋    | 201/354 [18:08<13:51,  5.44s/it][A

	loss_cls: tensor(0.6930, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2685, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9615, device='cuda:0', grad_fn=<AddBackward0>)



 57%|█████▋    | 202/354 [18:13<13:47,  5.45s/it][A

	loss_cls: tensor(0.6802, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3870, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0673, device='cuda:0', grad_fn=<AddBackward0>)



 57%|█████▋    | 203/354 [18:19<13:47,  5.48s/it][A

	loss_cls: tensor(0.9342, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1547, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0889, device='cuda:0', grad_fn=<AddBackward0>)



 58%|█████▊    | 204/354 [18:24<13:41,  5.48s/it][A

	loss_cls: tensor(0.7513, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2335, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9848, device='cuda:0', grad_fn=<AddBackward0>)



 58%|█████▊    | 205/354 [18:30<13:39,  5.50s/it][A

	loss_cls: tensor(0.7119, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2952, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0071, device='cuda:0', grad_fn=<AddBackward0>)



 58%|█████▊    | 206/354 [18:35<13:25,  5.44s/it][A

	loss_cls: tensor(0.3971, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1508, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5479, device='cuda:0', grad_fn=<AddBackward0>)



 58%|█████▊    | 207/354 [18:40<13:21,  5.45s/it][A

	loss_cls: tensor(0.4896, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2641, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7537, device='cuda:0', grad_fn=<AddBackward0>)



 59%|█████▉    | 208/354 [18:46<13:13,  5.43s/it][A

	loss_cls: tensor(0.4885, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0888, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5773, device='cuda:0', grad_fn=<AddBackward0>)



 59%|█████▉    | 209/354 [18:51<13:05,  5.41s/it][A

	loss_cls: tensor(0.5793, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3563, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9355, device='cuda:0', grad_fn=<AddBackward0>)



 59%|█████▉    | 210/354 [18:57<13:01,  5.43s/it][A

	loss_cls: tensor(0.7760, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3831, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1591, device='cuda:0', grad_fn=<AddBackward0>)



 60%|█████▉    | 211/354 [19:02<12:54,  5.42s/it][A

	loss_cls: tensor(0.5693, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1269, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6962, device='cuda:0', grad_fn=<AddBackward0>)



 60%|█████▉    | 212/354 [19:07<12:49,  5.42s/it][A

	loss_cls: tensor(0.7274, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1815, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9089, device='cuda:0', grad_fn=<AddBackward0>)



 60%|██████    | 213/354 [19:13<12:41,  5.40s/it][A

	loss_cls: tensor(0.6049, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1193, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7242, device='cuda:0', grad_fn=<AddBackward0>)



 60%|██████    | 214/354 [19:18<12:37,  5.41s/it][A

	loss_cls: tensor(0.5525, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4770, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0295, device='cuda:0', grad_fn=<AddBackward0>)



 61%|██████    | 215/354 [19:24<12:29,  5.39s/it][A

	loss_cls: tensor(0.8665, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2802, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1467, device='cuda:0', grad_fn=<AddBackward0>)



 61%|██████    | 216/354 [19:29<12:24,  5.39s/it][A

	loss_cls: tensor(0.6104, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1855, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7959, device='cuda:0', grad_fn=<AddBackward0>)



 61%|██████▏   | 217/354 [19:34<12:21,  5.41s/it][A

	loss_cls: tensor(0.6478, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2804, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9281, device='cuda:0', grad_fn=<AddBackward0>)



 62%|██████▏   | 218/354 [19:40<12:13,  5.40s/it][A

	loss_cls: tensor(0.4998, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0525, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5523, device='cuda:0', grad_fn=<AddBackward0>)



 62%|██████▏   | 219/354 [19:45<12:10,  5.41s/it][A

	loss_cls: tensor(0.8527, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3555, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2082, device='cuda:0', grad_fn=<AddBackward0>)



 62%|██████▏   | 220/354 [19:51<12:02,  5.39s/it][A

	loss_cls: tensor(0.5729, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3139, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8868, device='cuda:0', grad_fn=<AddBackward0>)



 62%|██████▏   | 221/354 [19:56<12:00,  5.41s/it][A

	loss_cls: tensor(0.6012, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2472, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8484, device='cuda:0', grad_fn=<AddBackward0>)



 63%|██████▎   | 222/354 [20:01<11:53,  5.40s/it][A

	loss_cls: tensor(0.6752, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0843, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7595, device='cuda:0', grad_fn=<AddBackward0>)



 63%|██████▎   | 223/354 [20:07<11:45,  5.39s/it][A

	loss_cls: tensor(0.4968, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4380, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9348, device='cuda:0', grad_fn=<AddBackward0>)



 63%|██████▎   | 224/354 [20:12<11:43,  5.41s/it][A

	loss_cls: tensor(0.6513, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2527, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9040, device='cuda:0', grad_fn=<AddBackward0>)



 64%|██████▎   | 225/354 [20:18<11:34,  5.38s/it][A

	loss_cls: tensor(0.5434, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1141, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6575, device='cuda:0', grad_fn=<AddBackward0>)



 64%|██████▍   | 226/354 [20:23<11:31,  5.40s/it][A

	loss_cls: tensor(0.5257, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2712, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7969, device='cuda:0', grad_fn=<AddBackward0>)



 64%|██████▍   | 227/354 [20:28<11:24,  5.39s/it][A

	loss_cls: tensor(0.5912, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1201, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7113, device='cuda:0', grad_fn=<AddBackward0>)



 64%|██████▍   | 228/354 [20:34<11:22,  5.42s/it][A

	loss_cls: tensor(0.6752, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0534, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7287, device='cuda:0', grad_fn=<AddBackward0>)



 65%|██████▍   | 229/354 [20:39<11:15,  5.40s/it][A

	loss_cls: tensor(0.6446, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1681, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8127, device='cuda:0', grad_fn=<AddBackward0>)



 65%|██████▍   | 230/354 [20:45<11:08,  5.39s/it][A

	loss_cls: tensor(0.8200, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1329, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9529, device='cuda:0', grad_fn=<AddBackward0>)



 65%|██████▌   | 231/354 [20:50<11:04,  5.40s/it][A

	loss_cls: tensor(0.7698, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2462, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0160, device='cuda:0', grad_fn=<AddBackward0>)



 66%|██████▌   | 232/354 [20:55<10:56,  5.38s/it][A

	loss_cls: tensor(0.7160, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1461, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8622, device='cuda:0', grad_fn=<AddBackward0>)



 66%|██████▌   | 233/354 [21:01<10:52,  5.39s/it][A

	loss_cls: tensor(0.7248, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2384, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9632, device='cuda:0', grad_fn=<AddBackward0>)



 66%|██████▌   | 234/354 [21:06<10:46,  5.39s/it][A

	loss_cls: tensor(0.6490, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2846, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9336, device='cuda:0', grad_fn=<AddBackward0>)



 66%|██████▋   | 235/354 [21:12<10:42,  5.40s/it][A

	loss_cls: tensor(0.4855, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2000, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6854, device='cuda:0', grad_fn=<AddBackward0>)



 67%|██████▋   | 236/354 [21:17<10:36,  5.39s/it][A

	loss_cls: tensor(0.5397, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3173, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8571, device='cuda:0', grad_fn=<AddBackward0>)



 67%|██████▋   | 237/354 [21:22<10:30,  5.39s/it][A

	loss_cls: tensor(0.6238, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1398, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7636, device='cuda:0', grad_fn=<AddBackward0>)



 67%|██████▋   | 238/354 [21:28<10:26,  5.40s/it][A

	loss_cls: tensor(0.5172, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1620, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6792, device='cuda:0', grad_fn=<AddBackward0>)



 68%|██████▊   | 239/354 [21:33<10:19,  5.39s/it][A

	loss_cls: tensor(0.3775, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0328, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4102, device='cuda:0', grad_fn=<AddBackward0>)



 68%|██████▊   | 240/354 [21:39<10:16,  5.41s/it][A

	loss_cls: tensor(0.7851, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1971, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9822, device='cuda:0', grad_fn=<AddBackward0>)



 68%|██████▊   | 241/354 [21:44<10:10,  5.40s/it][A

	loss_cls: tensor(0.7216, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1910, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9125, device='cuda:0', grad_fn=<AddBackward0>)



 68%|██████▊   | 242/354 [21:49<10:05,  5.41s/it][A

	loss_cls: tensor(0.4905, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1231, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6136, device='cuda:0', grad_fn=<AddBackward0>)



 69%|██████▊   | 243/354 [21:55<09:59,  5.40s/it][A

	loss_cls: tensor(0.4656, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2456, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7112, device='cuda:0', grad_fn=<AddBackward0>)



 69%|██████▉   | 244/354 [22:00<09:53,  5.39s/it][A

	loss_cls: tensor(0.5218, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0386, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5604, device='cuda:0', grad_fn=<AddBackward0>)



 69%|██████▉   | 245/354 [22:06<09:49,  5.41s/it][A

	loss_cls: tensor(0.4506, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1259, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5765, device='cuda:0', grad_fn=<AddBackward0>)



 69%|██████▉   | 246/354 [22:11<09:42,  5.39s/it][A

	loss_cls: tensor(0.4167, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2061, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6228, device='cuda:0', grad_fn=<AddBackward0>)



 70%|██████▉   | 247/354 [22:16<09:38,  5.41s/it][A

	loss_cls: tensor(0.3364, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0259, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.3623, device='cuda:0', grad_fn=<AddBackward0>)



 70%|███████   | 248/354 [22:22<09:31,  5.40s/it][A

	loss_cls: tensor(0.9069, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2458, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1527, device='cuda:0', grad_fn=<AddBackward0>)



 70%|███████   | 249/354 [22:27<09:27,  5.41s/it][A

	loss_cls: tensor(0.3541, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1695, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5237, device='cuda:0', grad_fn=<AddBackward0>)



 71%|███████   | 250/354 [22:32<09:20,  5.39s/it][A

	loss_cls: tensor(0.4121, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2114, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6235, device='cuda:0', grad_fn=<AddBackward0>)



 71%|███████   | 251/354 [22:38<09:13,  5.37s/it][A

	loss_cls: tensor(0.8229, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5078, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.3307, device='cuda:0', grad_fn=<AddBackward0>)



 71%|███████   | 252/354 [22:43<09:09,  5.39s/it][A

	loss_cls: tensor(0.4990, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0854, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5844, device='cuda:0', grad_fn=<AddBackward0>)



 71%|███████▏  | 253/354 [22:49<09:03,  5.38s/it][A

	loss_cls: tensor(0.6530, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1742, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8272, device='cuda:0', grad_fn=<AddBackward0>)



 72%|███████▏  | 254/354 [22:54<08:59,  5.40s/it][A

	loss_cls: tensor(0.3366, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4518, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7884, device='cuda:0', grad_fn=<AddBackward0>)



 72%|███████▏  | 255/354 [22:59<08:53,  5.38s/it][A

	loss_cls: tensor(0.6779, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1141, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7919, device='cuda:0', grad_fn=<AddBackward0>)



 72%|███████▏  | 256/354 [23:05<08:49,  5.40s/it][A

	loss_cls: tensor(0.4453, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0925, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5378, device='cuda:0', grad_fn=<AddBackward0>)



 73%|███████▎  | 257/354 [23:10<08:41,  5.38s/it][A

	loss_cls: tensor(0.6545, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1682, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8227, device='cuda:0', grad_fn=<AddBackward0>)



 73%|███████▎  | 258/354 [23:16<08:35,  5.37s/it][A

	loss_cls: tensor(0.6292, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1337, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7628, device='cuda:0', grad_fn=<AddBackward0>)



 73%|███████▎  | 259/354 [23:21<08:32,  5.39s/it][A

	loss_cls: tensor(0.3734, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0590, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4324, device='cuda:0', grad_fn=<AddBackward0>)



 73%|███████▎  | 260/354 [23:26<08:26,  5.39s/it][A

	loss_cls: tensor(0.7268, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1425, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8693, device='cuda:0', grad_fn=<AddBackward0>)



 74%|███████▎  | 261/354 [23:32<08:22,  5.41s/it][A

	loss_cls: tensor(0.3851, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3316, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7167, device='cuda:0', grad_fn=<AddBackward0>)



 74%|███████▍  | 262/354 [23:37<08:16,  5.40s/it][A

	loss_cls: tensor(0.3604, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1414, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5018, device='cuda:0', grad_fn=<AddBackward0>)



 74%|███████▍  | 263/354 [23:43<08:12,  5.41s/it][A

	loss_cls: tensor(0.7709, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2140, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9849, device='cuda:0', grad_fn=<AddBackward0>)



 75%|███████▍  | 264/354 [23:48<08:06,  5.40s/it][A

	loss_cls: tensor(0.8022, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2780, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0802, device='cuda:0', grad_fn=<AddBackward0>)



 75%|███████▍  | 265/354 [23:53<07:59,  5.39s/it][A

	loss_cls: tensor(0.6232, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2829, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9061, device='cuda:0', grad_fn=<AddBackward0>)



 75%|███████▌  | 266/354 [23:59<07:56,  5.41s/it][A

	loss_cls: tensor(0.8597, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2385, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0982, device='cuda:0', grad_fn=<AddBackward0>)



 75%|███████▌  | 267/354 [24:04<07:49,  5.40s/it][A

	loss_cls: tensor(0.7084, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3380, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0464, device='cuda:0', grad_fn=<AddBackward0>)



 76%|███████▌  | 268/354 [24:10<07:46,  5.42s/it][A

	loss_cls: tensor(0.3281, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0863, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4144, device='cuda:0', grad_fn=<AddBackward0>)



 76%|███████▌  | 269/354 [24:15<07:39,  5.41s/it][A

	loss_cls: tensor(0.4438, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0388, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4826, device='cuda:0', grad_fn=<AddBackward0>)



 76%|███████▋  | 270/354 [24:20<07:35,  5.42s/it][A

	loss_cls: tensor(0.6984, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1941, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8925, device='cuda:0', grad_fn=<AddBackward0>)



 77%|███████▋  | 271/354 [24:26<07:28,  5.40s/it][A

	loss_cls: tensor(0.7958, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1970, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9928, device='cuda:0', grad_fn=<AddBackward0>)



 77%|███████▋  | 272/354 [24:31<07:21,  5.39s/it][A

	loss_cls: tensor(0.7118, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2405, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9523, device='cuda:0', grad_fn=<AddBackward0>)



 77%|███████▋  | 273/354 [24:37<07:17,  5.40s/it][A

	loss_cls: tensor(0.5237, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1406, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6643, device='cuda:0', grad_fn=<AddBackward0>)



 77%|███████▋  | 274/354 [24:42<07:10,  5.38s/it][A

	loss_cls: tensor(0.6820, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3939, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0758, device='cuda:0', grad_fn=<AddBackward0>)



 78%|███████▊  | 275/354 [24:47<07:07,  5.41s/it][A

	loss_cls: tensor(0.4825, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2462, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7287, device='cuda:0', grad_fn=<AddBackward0>)



 78%|███████▊  | 276/354 [24:53<07:00,  5.39s/it][A

	loss_cls: tensor(0.3779, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1685, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5464, device='cuda:0', grad_fn=<AddBackward0>)



 78%|███████▊  | 277/354 [24:58<06:56,  5.41s/it][A

	loss_cls: tensor(0.7128, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1155, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8283, device='cuda:0', grad_fn=<AddBackward0>)



 79%|███████▊  | 278/354 [25:04<06:49,  5.39s/it][A

	loss_cls: tensor(0.4190, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2536, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6726, device='cuda:0', grad_fn=<AddBackward0>)



 79%|███████▉  | 279/354 [25:09<06:43,  5.38s/it][A

	loss_cls: tensor(0.5753, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1595, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7348, device='cuda:0', grad_fn=<AddBackward0>)



 79%|███████▉  | 280/354 [25:14<06:39,  5.41s/it][A

	loss_cls: tensor(0.5415, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3337, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8752, device='cuda:0', grad_fn=<AddBackward0>)



 79%|███████▉  | 281/354 [25:20<06:34,  5.40s/it][A

	loss_cls: tensor(0.6799, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2162, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8961, device='cuda:0', grad_fn=<AddBackward0>)



 80%|███████▉  | 282/354 [25:25<06:30,  5.42s/it][A

	loss_cls: tensor(0.4573, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2042, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6615, device='cuda:0', grad_fn=<AddBackward0>)



 80%|███████▉  | 283/354 [25:31<06:23,  5.41s/it][A

	loss_cls: tensor(0.3973, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3917, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7890, device='cuda:0', grad_fn=<AddBackward0>)



 80%|████████  | 284/354 [25:36<06:20,  5.43s/it][A

	loss_cls: tensor(0.6226, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3478, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9704, device='cuda:0', grad_fn=<AddBackward0>)



 81%|████████  | 285/354 [25:42<06:13,  5.42s/it][A

	loss_cls: tensor(0.4870, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0581, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5451, device='cuda:0', grad_fn=<AddBackward0>)



 81%|████████  | 286/354 [25:47<06:07,  5.40s/it][A

	loss_cls: tensor(0.8814, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2998, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1812, device='cuda:0', grad_fn=<AddBackward0>)



 81%|████████  | 287/354 [25:52<06:03,  5.42s/it][A

	loss_cls: tensor(0.7211, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1969, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9180, device='cuda:0', grad_fn=<AddBackward0>)



 81%|████████▏ | 288/354 [25:58<05:56,  5.41s/it][A

	loss_cls: tensor(0.5505, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4791, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0296, device='cuda:0', grad_fn=<AddBackward0>)



 82%|████████▏ | 289/354 [26:03<05:51,  5.42s/it][A

	loss_cls: tensor(0.6494, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1431, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7925, device='cuda:0', grad_fn=<AddBackward0>)



 82%|████████▏ | 290/354 [26:09<05:45,  5.40s/it][A

	loss_cls: tensor(0.3524, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1144, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4668, device='cuda:0', grad_fn=<AddBackward0>)



 82%|████████▏ | 291/354 [26:14<05:41,  5.41s/it][A

	loss_cls: tensor(0.9157, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5984, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.5141, device='cuda:0', grad_fn=<AddBackward0>)



 82%|████████▏ | 292/354 [26:19<05:34,  5.40s/it][A

	loss_cls: tensor(0.7123, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1473, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8595, device='cuda:0', grad_fn=<AddBackward0>)



 83%|████████▎ | 293/354 [26:25<05:28,  5.39s/it][A

	loss_cls: tensor(0.5030, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2660, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7690, device='cuda:0', grad_fn=<AddBackward0>)



 83%|████████▎ | 294/354 [26:30<05:24,  5.42s/it][A

	loss_cls: tensor(0.5145, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2601, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7745, device='cuda:0', grad_fn=<AddBackward0>)



 83%|████████▎ | 295/354 [26:36<05:19,  5.41s/it][A

	loss_cls: tensor(0.3755, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1043, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4798, device='cuda:0', grad_fn=<AddBackward0>)



 84%|████████▎ | 296/354 [26:41<05:14,  5.42s/it][A

	loss_cls: tensor(0.4214, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1995, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6210, device='cuda:0', grad_fn=<AddBackward0>)



 84%|████████▍ | 297/354 [26:46<05:07,  5.40s/it][A

	loss_cls: tensor(0.4742, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1509, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6251, device='cuda:0', grad_fn=<AddBackward0>)



 84%|████████▍ | 298/354 [26:52<05:03,  5.41s/it][A

	loss_cls: tensor(0.6108, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2241, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8349, device='cuda:0', grad_fn=<AddBackward0>)



 84%|████████▍ | 299/354 [26:57<04:56,  5.40s/it][A

	loss_cls: tensor(0.5999, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3665, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9664, device='cuda:0', grad_fn=<AddBackward0>)



 85%|████████▍ | 300/354 [27:03<04:51,  5.40s/it][A

	loss_cls: tensor(0.4719, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3730, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8449, device='cuda:0', grad_fn=<AddBackward0>)



 85%|████████▌ | 301/354 [27:08<04:46,  5.40s/it][A

	loss_cls: tensor(1.0569, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1376, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1946, device='cuda:0', grad_fn=<AddBackward0>)



 85%|████████▌ | 302/354 [27:13<04:39,  5.38s/it][A

	loss_cls: tensor(0.8099, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2325, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0423, device='cuda:0', grad_fn=<AddBackward0>)



 86%|████████▌ | 303/354 [27:19<04:35,  5.40s/it][A

	loss_cls: tensor(0.5127, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2879, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8006, device='cuda:0', grad_fn=<AddBackward0>)



 86%|████████▌ | 304/354 [27:24<04:29,  5.39s/it][A

	loss_cls: tensor(0.6097, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5022, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1119, device='cuda:0', grad_fn=<AddBackward0>)



 86%|████████▌ | 305/354 [27:30<04:24,  5.40s/it][A

	loss_cls: tensor(0.5475, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3429, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8904, device='cuda:0', grad_fn=<AddBackward0>)



 86%|████████▋ | 306/354 [27:35<04:18,  5.39s/it][A

	loss_cls: tensor(0.7024, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1442, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8466, device='cuda:0', grad_fn=<AddBackward0>)



 87%|████████▋ | 307/354 [27:40<04:14,  5.41s/it][A

	loss_cls: tensor(0.7297, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1105, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8402, device='cuda:0', grad_fn=<AddBackward0>)



 87%|████████▋ | 308/354 [27:46<04:07,  5.39s/it][A

	loss_cls: tensor(0.6255, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1373, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7628, device='cuda:0', grad_fn=<AddBackward0>)



 87%|████████▋ | 309/354 [27:51<04:02,  5.38s/it][A

	loss_cls: tensor(0.4553, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2821, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7374, device='cuda:0', grad_fn=<AddBackward0>)



 88%|████████▊ | 310/354 [27:57<03:57,  5.40s/it][A

	loss_cls: tensor(0.6312, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2358, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8670, device='cuda:0', grad_fn=<AddBackward0>)



 88%|████████▊ | 311/354 [28:02<03:51,  5.39s/it][A

	loss_cls: tensor(0.4485, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5147, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9632, device='cuda:0', grad_fn=<AddBackward0>)



 88%|████████▊ | 312/354 [28:07<03:47,  5.41s/it][A

	loss_cls: tensor(0.4931, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2200, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7131, device='cuda:0', grad_fn=<AddBackward0>)



 88%|████████▊ | 313/354 [28:13<03:40,  5.38s/it][A

	loss_cls: tensor(0.6996, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0585, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7581, device='cuda:0', grad_fn=<AddBackward0>)



 89%|████████▊ | 314/354 [28:18<03:36,  5.41s/it][A

	loss_cls: tensor(0.6570, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2931, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9501, device='cuda:0', grad_fn=<AddBackward0>)



 89%|████████▉ | 315/354 [28:24<03:30,  5.41s/it][A

	loss_cls: tensor(0.3504, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2007, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5511, device='cuda:0', grad_fn=<AddBackward0>)



 89%|████████▉ | 316/354 [28:29<03:25,  5.40s/it][A

	loss_cls: tensor(0.7830, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1375, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9206, device='cuda:0', grad_fn=<AddBackward0>)



 90%|████████▉ | 317/354 [28:34<03:20,  5.41s/it][A

	loss_cls: tensor(0.4955, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1657, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6613, device='cuda:0', grad_fn=<AddBackward0>)



 90%|████████▉ | 318/354 [28:40<03:14,  5.41s/it][A

	loss_cls: tensor(0.8319, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1154, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9474, device='cuda:0', grad_fn=<AddBackward0>)



 90%|█████████ | 319/354 [28:45<03:09,  5.42s/it][A

	loss_cls: tensor(0.4494, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0617, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5111, device='cuda:0', grad_fn=<AddBackward0>)



 90%|█████████ | 320/354 [28:51<03:03,  5.40s/it][A

	loss_cls: tensor(0.6053, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4758, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0810, device='cuda:0', grad_fn=<AddBackward0>)



 91%|█████████ | 321/354 [28:56<02:58,  5.42s/it][A

	loss_cls: tensor(0.4351, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2825, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7176, device='cuda:0', grad_fn=<AddBackward0>)



 91%|█████████ | 322/354 [29:01<02:52,  5.40s/it][A

	loss_cls: tensor(0.6646, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2520, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9166, device='cuda:0', grad_fn=<AddBackward0>)



 91%|█████████ | 323/354 [29:07<02:46,  5.38s/it][A

	loss_cls: tensor(0.4613, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2429, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7042, device='cuda:0', grad_fn=<AddBackward0>)



 92%|█████████▏| 324/354 [29:12<02:41,  5.39s/it][A

	loss_cls: tensor(0.4107, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2541, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6649, device='cuda:0', grad_fn=<AddBackward0>)



 92%|█████████▏| 325/354 [29:17<02:36,  5.38s/it][A

	loss_cls: tensor(0.9072, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4580, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.3651, device='cuda:0', grad_fn=<AddBackward0>)



 92%|█████████▏| 326/354 [29:23<02:30,  5.39s/it][A

	loss_cls: tensor(0.3982, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1378, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5359, device='cuda:0', grad_fn=<AddBackward0>)



 92%|█████████▏| 327/354 [29:28<02:25,  5.38s/it][A

	loss_cls: tensor(0.7544, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3207, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0751, device='cuda:0', grad_fn=<AddBackward0>)



 93%|█████████▎| 328/354 [29:34<02:20,  5.40s/it][A

	loss_cls: tensor(0.6730, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4965, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1695, device='cuda:0', grad_fn=<AddBackward0>)



 93%|█████████▎| 329/354 [29:39<02:14,  5.40s/it][A

	loss_cls: tensor(0.8603, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3432, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2035, device='cuda:0', grad_fn=<AddBackward0>)



 93%|█████████▎| 330/354 [29:44<02:09,  5.39s/it][A

	loss_cls: tensor(0.4790, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4036, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8826, device='cuda:0', grad_fn=<AddBackward0>)



 94%|█████████▎| 331/354 [29:50<02:04,  5.40s/it][A

	loss_cls: tensor(0.5421, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3497, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8918, device='cuda:0', grad_fn=<AddBackward0>)



 94%|█████████▍| 332/354 [29:55<01:58,  5.39s/it][A

	loss_cls: tensor(0.3403, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1696, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5099, device='cuda:0', grad_fn=<AddBackward0>)



 94%|█████████▍| 333/354 [30:01<01:53,  5.39s/it][A

	loss_cls: tensor(0.5356, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2051, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7408, device='cuda:0', grad_fn=<AddBackward0>)



 94%|█████████▍| 334/354 [30:06<01:47,  5.38s/it][A

	loss_cls: tensor(0.7675, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4677, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2353, device='cuda:0', grad_fn=<AddBackward0>)



 95%|█████████▍| 335/354 [30:11<01:42,  5.41s/it][A

	loss_cls: tensor(0.8399, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1687, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0085, device='cuda:0', grad_fn=<AddBackward0>)



 95%|█████████▍| 336/354 [30:17<01:37,  5.40s/it][A

	loss_cls: tensor(0.4736, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4798, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9534, device='cuda:0', grad_fn=<AddBackward0>)



 95%|█████████▌| 337/354 [30:22<01:31,  5.39s/it][A

	loss_cls: tensor(0.7165, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0608, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7774, device='cuda:0', grad_fn=<AddBackward0>)



 95%|█████████▌| 338/354 [30:28<01:26,  5.41s/it][A

	loss_cls: tensor(0.6161, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2321, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8482, device='cuda:0', grad_fn=<AddBackward0>)



 96%|█████████▌| 339/354 [30:33<01:20,  5.40s/it][A

	loss_cls: tensor(0.7541, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1227, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8768, device='cuda:0', grad_fn=<AddBackward0>)



 96%|█████████▌| 340/354 [30:39<01:15,  5.42s/it][A

	loss_cls: tensor(0.8078, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0414, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8492, device='cuda:0', grad_fn=<AddBackward0>)



 96%|█████████▋| 341/354 [30:44<01:10,  5.41s/it][A

	loss_cls: tensor(0.4602, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2557, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7158, device='cuda:0', grad_fn=<AddBackward0>)



 97%|█████████▋| 342/354 [30:49<01:05,  5.42s/it][A

	loss_cls: tensor(0.4765, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1979, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6744, device='cuda:0', grad_fn=<AddBackward0>)



 97%|█████████▋| 343/354 [30:55<00:59,  5.41s/it][A

	loss_cls: tensor(0.3870, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3700, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7570, device='cuda:0', grad_fn=<AddBackward0>)



 97%|█████████▋| 344/354 [31:00<00:54,  5.41s/it][A

	loss_cls: tensor(0.5084, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0411, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5495, device='cuda:0', grad_fn=<AddBackward0>)



 97%|█████████▋| 345/354 [31:06<00:48,  5.43s/it][A

	loss_cls: tensor(0.6328, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.7083, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.3412, device='cuda:0', grad_fn=<AddBackward0>)



 98%|█████████▊| 346/354 [31:11<00:43,  5.41s/it][A

	loss_cls: tensor(0.4558, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1462, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6020, device='cuda:0', grad_fn=<AddBackward0>)



 98%|█████████▊| 347/354 [31:16<00:38,  5.43s/it][A

	loss_cls: tensor(0.6183, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4064, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0247, device='cuda:0', grad_fn=<AddBackward0>)



 98%|█████████▊| 348/354 [31:22<00:32,  5.43s/it][A

	loss_cls: tensor(0.6727, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3522, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0249, device='cuda:0', grad_fn=<AddBackward0>)



 99%|█████████▊| 349/354 [31:27<00:27,  5.44s/it][A

	loss_cls: tensor(0.3949, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4555, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8504, device='cuda:0', grad_fn=<AddBackward0>)



 99%|█████████▉| 350/354 [31:33<00:21,  5.42s/it][A

	loss_cls: tensor(0.4921, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2586, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7507, device='cuda:0', grad_fn=<AddBackward0>)



 99%|█████████▉| 351/354 [31:38<00:16,  5.41s/it][A

	loss_cls: tensor(0.7719, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2105, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9824, device='cuda:0', grad_fn=<AddBackward0>)



 99%|█████████▉| 352/354 [31:44<00:10,  5.42s/it][A

	loss_cls: tensor(0.5448, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1668, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7116, device='cuda:0', grad_fn=<AddBackward0>)



100%|█████████▉| 353/354 [31:49<00:05,  5.41s/it][A

	loss_cls: tensor(0.5425, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2353, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7778, device='cuda:0', grad_fn=<AddBackward0>)



100%|██████████| 354/354 [31:51<00:00,  5.40s/it][A

  0%|          | 0/177 [00:00<?, ?it/s][A

	loss_cls: tensor(0.4742, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0757, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5499, device='cuda:0', grad_fn=<AddBackward0>)

	Training Loss: 0.8362456015442724

	Training cls acc: 0.701035781544256

	Training cls prec: 0.5836229389725153

	Training cls rec: 0.6161219665456954

	Training cls f1: 0.5438235643104546

--
	Training ner acc: 0.9556146997310916

	Training ner prec: 0.2807716986239054

	Training ner rec: 0.2880014893384227

	Training ner f1: 0.28363360603759885

	Current Learning rate:  0.0006



  1%|          | 1/177 [00:00<01:54,  1.53it/s][A
  1%|          | 2/177 [00:01<02:01,  1.44it/s][A
  2%|▏         | 3/177 [00:02<02:04,  1.40it/s][A
  2%|▏         | 4/177 [00:02<02:04,  1.39it/s][A
  3%|▎         | 5/177 [00:03<01:59,  1.44it/s][A
  3%|▎         | 6/177 [00:04<02:01,  1.41it/s][A
  4%|▍         | 7/177 [00:04<02:02,  1.39it/s][A
  5%|▍         | 8/177 [00:05<01:57,  1.44it/s][A
  5%|▌         | 9/177 [00:06<01:58,  1.42it/s][A
  6%|▌         | 10/177 [00:07<01:58,  1.40it/s][A
  6%|▌         | 11/177 [00:07<01:59,  1.39it/s][A
  7%|▋         | 12/177 [00:08<01:55,  1.43it/s][A
  7%|▋         | 13/177 [00:09<01:56,  1.41it/s][A
  8%|▊         | 14/177 [00:09<01:56,  1.39it/s][A
  8%|▊         | 15/177 [00:10<01:56,  1.38it/s][A
  9%|▉         | 16/177 [00:11<01:52,  1.43it/s][A
 10%|▉         | 17/177 [00:12<01:53,  1.41it/s][A
 10%|█         | 18/177 [00:12<01:54,  1.39it/s][A
 11%|█         | 19/177 [00:13<01:49,  1.44it/s][A
 11%|█▏        | 20/


	Validation Loss: 0.7666530545148472

	Validation cls acc: 0.7158662900188324

	Validation cls prec: 0.6103208232445521

	Validation cls rec: 0.5977703793381759

	Validation cls f1: 0.5626513473971101

--
	Validation ner acc: 0.9540846315319776

	Validation ner prec: 0.41580786459860053

	Validation ner rec: 0.42627118644067796

	Validation ner f1: 0.4208195824306344



  0%|          | 1/354 [00:05<32:02,  5.45s/it][A

	loss_cls: tensor(0.3746, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1360, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5106, device='cuda:0', grad_fn=<AddBackward0>)



  1%|          | 2/354 [00:10<31:33,  5.38s/it][A

	loss_cls: tensor(1.1207, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2130, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.3338, device='cuda:0', grad_fn=<AddBackward0>)



  1%|          | 3/354 [00:16<31:38,  5.41s/it][A

	loss_cls: tensor(0.7033, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2690, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9724, device='cuda:0', grad_fn=<AddBackward0>)



  1%|          | 4/354 [00:21<31:27,  5.39s/it][A

	loss_cls: tensor(0.9478, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1861, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1339, device='cuda:0', grad_fn=<AddBackward0>)



  1%|▏         | 5/354 [00:26<31:21,  5.39s/it][A

	loss_cls: tensor(0.5631, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2765, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8396, device='cuda:0', grad_fn=<AddBackward0>)



  2%|▏         | 6/354 [00:32<31:22,  5.41s/it][A

	loss_cls: tensor(0.4781, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1354, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6135, device='cuda:0', grad_fn=<AddBackward0>)



  2%|▏         | 7/354 [00:37<31:14,  5.40s/it][A

	loss_cls: tensor(0.5193, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1016, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6209, device='cuda:0', grad_fn=<AddBackward0>)



  2%|▏         | 8/354 [00:43<31:12,  5.41s/it][A

	loss_cls: tensor(0.8915, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3674, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2589, device='cuda:0', grad_fn=<AddBackward0>)



  3%|▎         | 9/354 [00:48<31:01,  5.40s/it][A

	loss_cls: tensor(0.5014, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4827, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9840, device='cuda:0', grad_fn=<AddBackward0>)



  3%|▎         | 10/354 [00:54<31:02,  5.41s/it][A

	loss_cls: tensor(0.3790, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1509, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5299, device='cuda:0', grad_fn=<AddBackward0>)



  3%|▎         | 11/354 [00:59<30:51,  5.40s/it][A

	loss_cls: tensor(0.6549, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2241, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8790, device='cuda:0', grad_fn=<AddBackward0>)



  3%|▎         | 12/354 [01:04<30:40,  5.38s/it][A

	loss_cls: tensor(0.5964, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1653, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7617, device='cuda:0', grad_fn=<AddBackward0>)



  4%|▎         | 13/354 [01:10<30:39,  5.40s/it][A

	loss_cls: tensor(0.6599, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2898, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9497, device='cuda:0', grad_fn=<AddBackward0>)



  4%|▍         | 14/354 [01:15<30:30,  5.38s/it][A

	loss_cls: tensor(0.4677, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3767, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8444, device='cuda:0', grad_fn=<AddBackward0>)



  4%|▍         | 15/354 [01:21<30:33,  5.41s/it][A

	loss_cls: tensor(0.4729, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1655, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6384, device='cuda:0', grad_fn=<AddBackward0>)



  5%|▍         | 16/354 [01:26<30:23,  5.39s/it][A

	loss_cls: tensor(0.6517, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1551, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8068, device='cuda:0', grad_fn=<AddBackward0>)



  5%|▍         | 17/354 [01:31<30:23,  5.41s/it][A

	loss_cls: tensor(0.4945, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1887, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6832, device='cuda:0', grad_fn=<AddBackward0>)



  5%|▌         | 18/354 [01:37<30:14,  5.40s/it][A

	loss_cls: tensor(0.6952, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0553, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7506, device='cuda:0', grad_fn=<AddBackward0>)



  5%|▌         | 19/354 [01:42<30:06,  5.39s/it][A

	loss_cls: tensor(0.5544, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3318, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8862, device='cuda:0', grad_fn=<AddBackward0>)



  6%|▌         | 20/354 [01:48<30:05,  5.41s/it][A

	loss_cls: tensor(0.9628, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1647, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1275, device='cuda:0', grad_fn=<AddBackward0>)



  6%|▌         | 21/354 [01:53<29:54,  5.39s/it][A

	loss_cls: tensor(0.3552, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1590, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5142, device='cuda:0', grad_fn=<AddBackward0>)



  6%|▌         | 22/354 [01:58<29:53,  5.40s/it][A

	loss_cls: tensor(0.8026, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0517, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8543, device='cuda:0', grad_fn=<AddBackward0>)



  6%|▋         | 23/354 [02:04<29:46,  5.40s/it][A

	loss_cls: tensor(0.5272, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0805, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6077, device='cuda:0', grad_fn=<AddBackward0>)



  7%|▋         | 24/354 [02:09<29:45,  5.41s/it][A

	loss_cls: tensor(0.6134, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0922, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7056, device='cuda:0', grad_fn=<AddBackward0>)



  7%|▋         | 25/354 [02:14<29:33,  5.39s/it][A

	loss_cls: tensor(0.8124, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3497, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1620, device='cuda:0', grad_fn=<AddBackward0>)



  7%|▋         | 26/354 [02:20<29:25,  5.38s/it][A

	loss_cls: tensor(0.6916, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3224, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0141, device='cuda:0', grad_fn=<AddBackward0>)



  8%|▊         | 27/354 [02:25<29:25,  5.40s/it][A

	loss_cls: tensor(0.5486, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1661, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7147, device='cuda:0', grad_fn=<AddBackward0>)



  8%|▊         | 28/354 [02:31<29:13,  5.38s/it][A

	loss_cls: tensor(0.7413, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.8364, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.5777, device='cuda:0', grad_fn=<AddBackward0>)



  8%|▊         | 29/354 [02:36<29:13,  5.40s/it][A

	loss_cls: tensor(0.4410, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2706, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7116, device='cuda:0', grad_fn=<AddBackward0>)



  8%|▊         | 30/354 [02:41<29:05,  5.39s/it][A

	loss_cls: tensor(0.7925, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1080, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9005, device='cuda:0', grad_fn=<AddBackward0>)



  9%|▉         | 31/354 [02:47<29:06,  5.41s/it][A

	loss_cls: tensor(0.6005, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4946, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0951, device='cuda:0', grad_fn=<AddBackward0>)



  9%|▉         | 32/354 [02:52<28:57,  5.40s/it][A

	loss_cls: tensor(0.8359, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1745, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0103, device='cuda:0', grad_fn=<AddBackward0>)



  9%|▉         | 33/354 [02:58<28:47,  5.38s/it][A

	loss_cls: tensor(0.5656, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4535, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0191, device='cuda:0', grad_fn=<AddBackward0>)



 10%|▉         | 34/354 [03:03<28:48,  5.40s/it][A

	loss_cls: tensor(0.4449, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4281, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8730, device='cuda:0', grad_fn=<AddBackward0>)



 10%|▉         | 35/354 [03:08<28:38,  5.39s/it][A

	loss_cls: tensor(0.4662, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3376, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8038, device='cuda:0', grad_fn=<AddBackward0>)



 10%|█         | 36/354 [03:14<28:39,  5.41s/it][A

	loss_cls: tensor(0.5344, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3280, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8624, device='cuda:0', grad_fn=<AddBackward0>)



 10%|█         | 37/354 [03:19<28:30,  5.40s/it][A

	loss_cls: tensor(0.7750, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1675, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9425, device='cuda:0', grad_fn=<AddBackward0>)



 11%|█         | 38/354 [03:25<28:29,  5.41s/it][A

	loss_cls: tensor(0.6655, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1135, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7790, device='cuda:0', grad_fn=<AddBackward0>)



 11%|█         | 39/354 [03:30<28:18,  5.39s/it][A

	loss_cls: tensor(0.5967, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1912, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7879, device='cuda:0', grad_fn=<AddBackward0>)



 11%|█▏        | 40/354 [03:35<28:10,  5.39s/it][A

	loss_cls: tensor(0.5593, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1299, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6892, device='cuda:0', grad_fn=<AddBackward0>)



 12%|█▏        | 41/354 [03:41<28:09,  5.40s/it][A

	loss_cls: tensor(0.4732, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3899, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8631, device='cuda:0', grad_fn=<AddBackward0>)



 12%|█▏        | 42/354 [03:46<27:59,  5.38s/it][A

	loss_cls: tensor(0.8377, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1036, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9413, device='cuda:0', grad_fn=<AddBackward0>)



 12%|█▏        | 43/354 [03:52<28:00,  5.40s/it][A

	loss_cls: tensor(0.6318, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2487, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8805, device='cuda:0', grad_fn=<AddBackward0>)



 12%|█▏        | 44/354 [03:57<27:52,  5.39s/it][A

	loss_cls: tensor(0.5565, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0989, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6555, device='cuda:0', grad_fn=<AddBackward0>)



 13%|█▎        | 45/354 [04:02<27:51,  5.41s/it][A

	loss_cls: tensor(0.6305, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0622, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6927, device='cuda:0', grad_fn=<AddBackward0>)



 13%|█▎        | 46/354 [04:08<27:43,  5.40s/it][A

	loss_cls: tensor(0.6001, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1573, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7574, device='cuda:0', grad_fn=<AddBackward0>)



 13%|█▎        | 47/354 [04:13<27:32,  5.38s/it][A

	loss_cls: tensor(0.4189, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4302, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8491, device='cuda:0', grad_fn=<AddBackward0>)



 14%|█▎        | 48/354 [04:19<27:31,  5.40s/it][A

	loss_cls: tensor(0.6066, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0844, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6910, device='cuda:0', grad_fn=<AddBackward0>)



 14%|█▍        | 49/354 [04:24<27:21,  5.38s/it][A

	loss_cls: tensor(0.6688, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2706, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9394, device='cuda:0', grad_fn=<AddBackward0>)



 14%|█▍        | 50/354 [04:29<27:23,  5.40s/it][A

	loss_cls: tensor(0.5255, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4134, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9389, device='cuda:0', grad_fn=<AddBackward0>)



 14%|█▍        | 51/354 [04:35<27:12,  5.39s/it][A

	loss_cls: tensor(0.6665, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1237, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7902, device='cuda:0', grad_fn=<AddBackward0>)



 15%|█▍        | 52/354 [04:40<27:09,  5.40s/it][A

	loss_cls: tensor(0.5522, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0312, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5834, device='cuda:0', grad_fn=<AddBackward0>)



 15%|█▍        | 53/354 [04:45<26:59,  5.38s/it][A

	loss_cls: tensor(0.5661, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1279, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6940, device='cuda:0', grad_fn=<AddBackward0>)



 15%|█▌        | 54/354 [04:51<26:51,  5.37s/it][A

	loss_cls: tensor(0.2843, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0213, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.3056, device='cuda:0', grad_fn=<AddBackward0>)



 16%|█▌        | 55/354 [04:56<26:53,  5.40s/it][A

	loss_cls: tensor(0.8551, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3412, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1963, device='cuda:0', grad_fn=<AddBackward0>)



 16%|█▌        | 56/354 [05:02<26:44,  5.38s/it][A

	loss_cls: tensor(0.6198, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2446, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8644, device='cuda:0', grad_fn=<AddBackward0>)



 16%|█▌        | 57/354 [05:07<26:44,  5.40s/it][A

	loss_cls: tensor(0.8330, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5676, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.4006, device='cuda:0', grad_fn=<AddBackward0>)



 16%|█▋        | 58/354 [05:12<26:36,  5.39s/it][A

	loss_cls: tensor(0.6051, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1467, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7517, device='cuda:0', grad_fn=<AddBackward0>)



 17%|█▋        | 59/354 [05:18<26:36,  5.41s/it][A

	loss_cls: tensor(0.8478, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0730, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9208, device='cuda:0', grad_fn=<AddBackward0>)



 17%|█▋        | 60/354 [05:23<26:26,  5.40s/it][A

	loss_cls: tensor(0.8661, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3599, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2260, device='cuda:0', grad_fn=<AddBackward0>)



 17%|█▋        | 61/354 [05:29<26:25,  5.41s/it][A

	loss_cls: tensor(0.4035, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1874, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5909, device='cuda:0', grad_fn=<AddBackward0>)



 18%|█▊        | 62/354 [05:34<26:22,  5.42s/it][A

	loss_cls: tensor(0.3815, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2709, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6524, device='cuda:0', grad_fn=<AddBackward0>)



 18%|█▊        | 63/354 [05:40<26:12,  5.41s/it][A

	loss_cls: tensor(0.8084, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2325, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0409, device='cuda:0', grad_fn=<AddBackward0>)



 18%|█▊        | 64/354 [05:45<26:09,  5.41s/it][A

	loss_cls: tensor(0.7334, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3834, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1168, device='cuda:0', grad_fn=<AddBackward0>)



 18%|█▊        | 65/354 [05:50<25:56,  5.39s/it][A

	loss_cls: tensor(0.4631, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1491, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6122, device='cuda:0', grad_fn=<AddBackward0>)



 19%|█▊        | 66/354 [05:56<25:58,  5.41s/it][A

	loss_cls: tensor(0.5728, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3113, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8841, device='cuda:0', grad_fn=<AddBackward0>)



 19%|█▉        | 67/354 [06:01<25:47,  5.39s/it][A

	loss_cls: tensor(0.5059, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4969, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0028, device='cuda:0', grad_fn=<AddBackward0>)



 19%|█▉        | 68/354 [06:06<25:36,  5.37s/it][A

	loss_cls: tensor(0.6591, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1877, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8468, device='cuda:0', grad_fn=<AddBackward0>)



 19%|█▉        | 69/354 [06:12<25:36,  5.39s/it][A

	loss_cls: tensor(0.4297, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3032, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7330, device='cuda:0', grad_fn=<AddBackward0>)



 20%|█▉        | 70/354 [06:17<25:29,  5.39s/it][A

	loss_cls: tensor(0.6463, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1781, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8245, device='cuda:0', grad_fn=<AddBackward0>)



 20%|██        | 71/354 [06:23<25:28,  5.40s/it][A

	loss_cls: tensor(0.4626, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4256, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8882, device='cuda:0', grad_fn=<AddBackward0>)



 20%|██        | 72/354 [06:28<25:19,  5.39s/it][A

	loss_cls: tensor(0.7538, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0603, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8141, device='cuda:0', grad_fn=<AddBackward0>)



 21%|██        | 73/354 [06:33<25:17,  5.40s/it][A

	loss_cls: tensor(0.7154, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1440, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8595, device='cuda:0', grad_fn=<AddBackward0>)



 21%|██        | 74/354 [06:39<25:08,  5.39s/it][A

	loss_cls: tensor(0.5549, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1580, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7129, device='cuda:0', grad_fn=<AddBackward0>)



 21%|██        | 75/354 [06:44<25:00,  5.38s/it][A

	loss_cls: tensor(0.9315, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1794, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1109, device='cuda:0', grad_fn=<AddBackward0>)



 21%|██▏       | 76/354 [06:50<25:03,  5.41s/it][A

	loss_cls: tensor(0.5353, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1443, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6796, device='cuda:0', grad_fn=<AddBackward0>)



 22%|██▏       | 77/354 [06:55<24:53,  5.39s/it][A

	loss_cls: tensor(0.8504, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3591, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2095, device='cuda:0', grad_fn=<AddBackward0>)



 22%|██▏       | 78/354 [07:00<24:52,  5.41s/it][A

	loss_cls: tensor(0.5014, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2531, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7545, device='cuda:0', grad_fn=<AddBackward0>)



 22%|██▏       | 79/354 [07:06<24:42,  5.39s/it][A

	loss_cls: tensor(0.6154, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3916, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0070, device='cuda:0', grad_fn=<AddBackward0>)



 23%|██▎       | 80/354 [07:11<24:41,  5.41s/it][A

	loss_cls: tensor(0.3852, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2086, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5938, device='cuda:0', grad_fn=<AddBackward0>)



 23%|██▎       | 81/354 [07:17<24:32,  5.39s/it][A

	loss_cls: tensor(0.5200, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2566, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7766, device='cuda:0', grad_fn=<AddBackward0>)



 23%|██▎       | 82/354 [07:22<24:29,  5.40s/it][A

	loss_cls: tensor(0.7278, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3081, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0359, device='cuda:0', grad_fn=<AddBackward0>)



 23%|██▎       | 83/354 [07:27<24:21,  5.39s/it][A

	loss_cls: tensor(0.4588, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2654, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7242, device='cuda:0', grad_fn=<AddBackward0>)



 24%|██▎       | 84/354 [07:33<24:20,  5.41s/it][A

	loss_cls: tensor(0.5895, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2276, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8171, device='cuda:0', grad_fn=<AddBackward0>)



 24%|██▍       | 85/354 [07:38<24:18,  5.42s/it][A

	loss_cls: tensor(0.6302, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2673, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8975, device='cuda:0', grad_fn=<AddBackward0>)



 24%|██▍       | 86/354 [07:44<24:08,  5.40s/it][A

	loss_cls: tensor(0.7879, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2417, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0296, device='cuda:0', grad_fn=<AddBackward0>)



 25%|██▍       | 87/354 [07:49<24:04,  5.41s/it][A

	loss_cls: tensor(0.8127, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3753, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1880, device='cuda:0', grad_fn=<AddBackward0>)



 25%|██▍       | 88/354 [07:54<23:56,  5.40s/it][A

	loss_cls: tensor(0.7196, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2481, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9677, device='cuda:0', grad_fn=<AddBackward0>)



 25%|██▌       | 89/354 [08:00<23:54,  5.41s/it][A

	loss_cls: tensor(0.6919, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2928, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9847, device='cuda:0', grad_fn=<AddBackward0>)



 25%|██▌       | 90/354 [08:05<23:46,  5.40s/it][A

	loss_cls: tensor(0.6235, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2813, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9048, device='cuda:0', grad_fn=<AddBackward0>)



 26%|██▌       | 91/354 [08:11<23:37,  5.39s/it][A

	loss_cls: tensor(0.5486, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1601, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7087, device='cuda:0', grad_fn=<AddBackward0>)



 26%|██▌       | 92/354 [08:16<23:36,  5.40s/it][A

	loss_cls: tensor(0.5382, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3496, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8878, device='cuda:0', grad_fn=<AddBackward0>)



 26%|██▋       | 93/354 [08:21<23:25,  5.39s/it][A

	loss_cls: tensor(0.5404, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2145, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7549, device='cuda:0', grad_fn=<AddBackward0>)



 27%|██▋       | 94/354 [08:27<23:22,  5.39s/it][A

	loss_cls: tensor(0.6682, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3148, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9830, device='cuda:0', grad_fn=<AddBackward0>)



 27%|██▋       | 95/354 [08:32<23:12,  5.38s/it][A

	loss_cls: tensor(0.5135, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2237, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7372, device='cuda:0', grad_fn=<AddBackward0>)



 27%|██▋       | 96/354 [08:38<23:12,  5.40s/it][A

	loss_cls: tensor(0.6164, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1166, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7330, device='cuda:0', grad_fn=<AddBackward0>)



 27%|██▋       | 97/354 [08:43<23:04,  5.39s/it][A

	loss_cls: tensor(0.6024, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2484, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8508, device='cuda:0', grad_fn=<AddBackward0>)



 28%|██▊       | 98/354 [08:48<22:57,  5.38s/it][A

	loss_cls: tensor(0.4467, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4141, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8608, device='cuda:0', grad_fn=<AddBackward0>)



 28%|██▊       | 99/354 [08:54<22:54,  5.39s/it][A

	loss_cls: tensor(0.6792, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1662, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8453, device='cuda:0', grad_fn=<AddBackward0>)



 28%|██▊       | 100/354 [08:59<22:43,  5.37s/it][A

	loss_cls: tensor(0.7682, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4897, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2578, device='cuda:0', grad_fn=<AddBackward0>)



 29%|██▊       | 101/354 [09:05<22:43,  5.39s/it][A

	loss_cls: tensor(0.8374, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1815, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0189, device='cuda:0', grad_fn=<AddBackward0>)



 29%|██▉       | 102/354 [09:10<22:33,  5.37s/it][A

	loss_cls: tensor(0.3906, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.6479, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0385, device='cuda:0', grad_fn=<AddBackward0>)



 29%|██▉       | 103/354 [09:15<22:32,  5.39s/it][A

	loss_cls: tensor(0.4608, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3217, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7824, device='cuda:0', grad_fn=<AddBackward0>)



 29%|██▉       | 104/354 [09:21<22:22,  5.37s/it][A

	loss_cls: tensor(0.8535, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1310, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9845, device='cuda:0', grad_fn=<AddBackward0>)



 30%|██▉       | 105/354 [09:26<22:15,  5.36s/it][A

	loss_cls: tensor(0.7198, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2216, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9414, device='cuda:0', grad_fn=<AddBackward0>)



 30%|██▉       | 106/354 [09:31<22:14,  5.38s/it][A

	loss_cls: tensor(0.6817, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1647, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8464, device='cuda:0', grad_fn=<AddBackward0>)



 30%|███       | 107/354 [09:37<22:06,  5.37s/it][A

	loss_cls: tensor(0.5469, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5062, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0531, device='cuda:0', grad_fn=<AddBackward0>)



 31%|███       | 108/354 [09:42<22:06,  5.39s/it][A

	loss_cls: tensor(0.4503, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2671, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7174, device='cuda:0', grad_fn=<AddBackward0>)



 31%|███       | 109/354 [09:48<21:59,  5.39s/it][A

	loss_cls: tensor(0.4902, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1511, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6413, device='cuda:0', grad_fn=<AddBackward0>)



 31%|███       | 110/354 [09:53<22:01,  5.42s/it][A

	loss_cls: tensor(0.4584, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1561, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6145, device='cuda:0', grad_fn=<AddBackward0>)



 31%|███▏      | 111/354 [09:58<21:53,  5.40s/it][A

	loss_cls: tensor(0.5198, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0646, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5844, device='cuda:0', grad_fn=<AddBackward0>)



 32%|███▏      | 112/354 [10:04<21:43,  5.38s/it][A

	loss_cls: tensor(0.5474, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2812, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8286, device='cuda:0', grad_fn=<AddBackward0>)



 32%|███▏      | 113/354 [10:09<21:43,  5.41s/it][A

	loss_cls: tensor(0.5569, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3401, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8970, device='cuda:0', grad_fn=<AddBackward0>)



 32%|███▏      | 114/354 [10:15<21:34,  5.39s/it][A

	loss_cls: tensor(0.5760, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2629, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8389, device='cuda:0', grad_fn=<AddBackward0>)



 32%|███▏      | 115/354 [10:20<21:32,  5.41s/it][A

	loss_cls: tensor(0.5843, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2266, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8108, device='cuda:0', grad_fn=<AddBackward0>)



 33%|███▎      | 116/354 [10:25<21:27,  5.41s/it][A

	loss_cls: tensor(0.8580, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3871, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2451, device='cuda:0', grad_fn=<AddBackward0>)



 33%|███▎      | 117/354 [10:31<21:26,  5.43s/it][A

	loss_cls: tensor(0.6767, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2687, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9454, device='cuda:0', grad_fn=<AddBackward0>)



 33%|███▎      | 118/354 [10:36<21:17,  5.41s/it][A

	loss_cls: tensor(0.5090, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3042, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8132, device='cuda:0', grad_fn=<AddBackward0>)



 34%|███▎      | 119/354 [10:42<21:10,  5.41s/it][A

	loss_cls: tensor(0.6748, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3534, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0282, device='cuda:0', grad_fn=<AddBackward0>)



 34%|███▍      | 120/354 [10:47<21:09,  5.43s/it][A

	loss_cls: tensor(0.8791, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1845, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0635, device='cuda:0', grad_fn=<AddBackward0>)



 34%|███▍      | 121/354 [10:52<21:00,  5.41s/it][A

	loss_cls: tensor(0.4239, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2003, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6242, device='cuda:0', grad_fn=<AddBackward0>)



 34%|███▍      | 122/354 [10:58<21:00,  5.43s/it][A

	loss_cls: tensor(0.5409, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2312, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7721, device='cuda:0', grad_fn=<AddBackward0>)



 35%|███▍      | 123/354 [11:03<20:52,  5.42s/it][A

	loss_cls: tensor(0.8537, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2480, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1017, device='cuda:0', grad_fn=<AddBackward0>)



 35%|███▌      | 124/354 [11:09<20:51,  5.44s/it][A

	loss_cls: tensor(0.7756, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1157, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8913, device='cuda:0', grad_fn=<AddBackward0>)



 35%|███▌      | 125/354 [11:14<20:42,  5.43s/it][A

	loss_cls: tensor(0.7714, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3778, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1492, device='cuda:0', grad_fn=<AddBackward0>)



 36%|███▌      | 126/354 [11:20<20:33,  5.41s/it][A

	loss_cls: tensor(0.6784, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0589, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7373, device='cuda:0', grad_fn=<AddBackward0>)



 36%|███▌      | 127/354 [11:25<20:32,  5.43s/it][A

	loss_cls: tensor(0.5124, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0650, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5774, device='cuda:0', grad_fn=<AddBackward0>)



 36%|███▌      | 128/354 [11:30<20:21,  5.41s/it][A

	loss_cls: tensor(0.5906, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2342, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8248, device='cuda:0', grad_fn=<AddBackward0>)



 36%|███▋      | 129/354 [11:36<20:22,  5.43s/it][A

	loss_cls: tensor(0.4822, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1870, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6692, device='cuda:0', grad_fn=<AddBackward0>)



 37%|███▋      | 130/354 [11:41<20:18,  5.44s/it][A

	loss_cls: tensor(0.4336, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0851, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5187, device='cuda:0', grad_fn=<AddBackward0>)



 37%|███▋      | 131/354 [11:47<20:18,  5.46s/it][A

	loss_cls: tensor(0.6330, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1477, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7808, device='cuda:0', grad_fn=<AddBackward0>)



 37%|███▋      | 132/354 [11:52<20:08,  5.44s/it][A

	loss_cls: tensor(0.7180, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4115, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1294, device='cuda:0', grad_fn=<AddBackward0>)



 38%|███▊      | 133/354 [11:58<19:58,  5.42s/it][A

	loss_cls: tensor(0.4689, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2835, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7524, device='cuda:0', grad_fn=<AddBackward0>)



 38%|███▊      | 134/354 [12:03<19:51,  5.42s/it][A

	loss_cls: tensor(0.6314, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2286, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8600, device='cuda:0', grad_fn=<AddBackward0>)



 38%|███▊      | 135/354 [12:08<19:43,  5.40s/it][A

	loss_cls: tensor(0.5304, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1983, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7287, device='cuda:0', grad_fn=<AddBackward0>)



 38%|███▊      | 136/354 [12:14<19:41,  5.42s/it][A

	loss_cls: tensor(0.6094, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2029, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8123, device='cuda:0', grad_fn=<AddBackward0>)



 39%|███▊      | 137/354 [12:19<19:32,  5.40s/it][A

	loss_cls: tensor(0.4265, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1911, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6176, device='cuda:0', grad_fn=<AddBackward0>)



 39%|███▉      | 138/354 [12:25<19:30,  5.42s/it][A

	loss_cls: tensor(0.4723, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3534, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8257, device='cuda:0', grad_fn=<AddBackward0>)



 39%|███▉      | 139/354 [12:30<19:23,  5.41s/it][A

	loss_cls: tensor(0.3819, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1542, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5361, device='cuda:0', grad_fn=<AddBackward0>)



 40%|███▉      | 140/354 [12:35<19:13,  5.39s/it][A

	loss_cls: tensor(0.6536, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2058, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8594, device='cuda:0', grad_fn=<AddBackward0>)



 40%|███▉      | 141/354 [12:41<19:12,  5.41s/it][A

	loss_cls: tensor(0.5682, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2858, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8540, device='cuda:0', grad_fn=<AddBackward0>)



 40%|████      | 142/354 [12:46<19:04,  5.40s/it][A

	loss_cls: tensor(0.4356, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0921, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5278, device='cuda:0', grad_fn=<AddBackward0>)



 40%|████      | 143/354 [12:52<19:04,  5.43s/it][A

	loss_cls: tensor(0.5624, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2562, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8186, device='cuda:0', grad_fn=<AddBackward0>)



 41%|████      | 144/354 [12:57<18:57,  5.42s/it][A

	loss_cls: tensor(0.4205, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1234, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5439, device='cuda:0', grad_fn=<AddBackward0>)



 41%|████      | 145/354 [13:03<18:54,  5.43s/it][A

	loss_cls: tensor(1.0587, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3752, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.4339, device='cuda:0', grad_fn=<AddBackward0>)



 41%|████      | 146/354 [13:08<18:45,  5.41s/it][A

	loss_cls: tensor(0.8851, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1535, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0386, device='cuda:0', grad_fn=<AddBackward0>)



 42%|████▏     | 147/354 [13:13<18:36,  5.39s/it][A

	loss_cls: tensor(0.4528, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1555, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6083, device='cuda:0', grad_fn=<AddBackward0>)



 42%|████▏     | 148/354 [13:19<18:33,  5.41s/it][A

	loss_cls: tensor(0.4598, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2343, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6942, device='cuda:0', grad_fn=<AddBackward0>)



 42%|████▏     | 149/354 [13:24<18:26,  5.40s/it][A

	loss_cls: tensor(0.4309, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1935, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6245, device='cuda:0', grad_fn=<AddBackward0>)



 42%|████▏     | 150/354 [13:30<18:24,  5.41s/it][A

	loss_cls: tensor(1.2656, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3288, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.5944, device='cuda:0', grad_fn=<AddBackward0>)



 43%|████▎     | 151/354 [13:35<18:17,  5.41s/it][A

	loss_cls: tensor(0.4911, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2707, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7617, device='cuda:0', grad_fn=<AddBackward0>)



 43%|████▎     | 152/354 [13:40<18:15,  5.42s/it][A

	loss_cls: tensor(0.4414, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3237, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7651, device='cuda:0', grad_fn=<AddBackward0>)



 43%|████▎     | 153/354 [13:46<18:07,  5.41s/it][A

	loss_cls: tensor(0.6059, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2077, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8136, device='cuda:0', grad_fn=<AddBackward0>)



 44%|████▎     | 154/354 [13:51<17:58,  5.39s/it][A

	loss_cls: tensor(0.7280, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2413, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9693, device='cuda:0', grad_fn=<AddBackward0>)



 44%|████▍     | 155/354 [13:57<17:56,  5.41s/it][A

	loss_cls: tensor(0.4097, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3555, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7653, device='cuda:0', grad_fn=<AddBackward0>)



 44%|████▍     | 156/354 [14:02<17:50,  5.41s/it][A

	loss_cls: tensor(0.6981, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1357, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8338, device='cuda:0', grad_fn=<AddBackward0>)



 44%|████▍     | 157/354 [14:08<17:48,  5.42s/it][A

	loss_cls: tensor(0.4401, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2021, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6422, device='cuda:0', grad_fn=<AddBackward0>)



 45%|████▍     | 158/354 [14:13<17:40,  5.41s/it][A

	loss_cls: tensor(0.4212, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2776, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6988, device='cuda:0', grad_fn=<AddBackward0>)



 45%|████▍     | 159/354 [14:18<17:36,  5.42s/it][A

	loss_cls: tensor(0.4822, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3222, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8044, device='cuda:0', grad_fn=<AddBackward0>)



 45%|████▌     | 160/354 [14:24<17:28,  5.40s/it][A

	loss_cls: tensor(0.8130, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1688, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9818, device='cuda:0', grad_fn=<AddBackward0>)



 45%|████▌     | 161/354 [14:29<17:19,  5.38s/it][A

	loss_cls: tensor(0.5710, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1168, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6879, device='cuda:0', grad_fn=<AddBackward0>)



 46%|████▌     | 162/354 [14:34<17:16,  5.40s/it][A

	loss_cls: tensor(0.6395, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1054, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7449, device='cuda:0', grad_fn=<AddBackward0>)



 46%|████▌     | 163/354 [14:40<17:10,  5.40s/it][A

	loss_cls: tensor(0.8368, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1733, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0101, device='cuda:0', grad_fn=<AddBackward0>)



 46%|████▋     | 164/354 [14:45<17:08,  5.42s/it][A

	loss_cls: tensor(0.5184, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1982, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7167, device='cuda:0', grad_fn=<AddBackward0>)



 47%|████▋     | 165/354 [14:51<17:00,  5.40s/it][A

	loss_cls: tensor(0.5386, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1074, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6460, device='cuda:0', grad_fn=<AddBackward0>)



 47%|████▋     | 166/354 [14:56<16:58,  5.42s/it][A

	loss_cls: tensor(0.6139, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2733, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8873, device='cuda:0', grad_fn=<AddBackward0>)



 47%|████▋     | 167/354 [15:01<16:49,  5.40s/it][A

	loss_cls: tensor(0.5602, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1774, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7376, device='cuda:0', grad_fn=<AddBackward0>)



 47%|████▋     | 168/354 [15:07<16:41,  5.38s/it][A

	loss_cls: tensor(0.8458, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0994, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9453, device='cuda:0', grad_fn=<AddBackward0>)



 48%|████▊     | 169/354 [15:12<16:39,  5.40s/it][A

	loss_cls: tensor(0.7150, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2061, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9211, device='cuda:0', grad_fn=<AddBackward0>)



 48%|████▊     | 170/354 [15:18<16:31,  5.39s/it][A

	loss_cls: tensor(0.5765, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2533, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8298, device='cuda:0', grad_fn=<AddBackward0>)



 48%|████▊     | 171/354 [15:23<16:29,  5.41s/it][A

	loss_cls: tensor(0.8630, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0839, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9469, device='cuda:0', grad_fn=<AddBackward0>)



 49%|████▊     | 172/354 [15:28<16:22,  5.40s/it][A

	loss_cls: tensor(0.5224, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2501, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7725, device='cuda:0', grad_fn=<AddBackward0>)



 49%|████▉     | 173/354 [15:34<16:19,  5.41s/it][A

	loss_cls: tensor(0.5504, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.6070, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1574, device='cuda:0', grad_fn=<AddBackward0>)



 49%|████▉     | 174/354 [15:39<16:12,  5.40s/it][A

	loss_cls: tensor(0.4841, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0918, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5759, device='cuda:0', grad_fn=<AddBackward0>)



 49%|████▉     | 175/354 [15:45<16:03,  5.38s/it][A

	loss_cls: tensor(0.3867, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1123, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4990, device='cuda:0', grad_fn=<AddBackward0>)



 50%|████▉     | 176/354 [15:50<16:03,  5.41s/it][A

	loss_cls: tensor(0.5309, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2689, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7999, device='cuda:0', grad_fn=<AddBackward0>)



 50%|█████     | 177/354 [15:56<15:57,  5.41s/it][A

	loss_cls: tensor(0.5333, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1221, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6553, device='cuda:0', grad_fn=<AddBackward0>)



 50%|█████     | 178/354 [16:01<15:54,  5.42s/it][A

	loss_cls: tensor(0.3645, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1584, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5229, device='cuda:0', grad_fn=<AddBackward0>)



 51%|█████     | 179/354 [16:06<15:45,  5.40s/it][A

	loss_cls: tensor(0.6527, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1865, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8392, device='cuda:0', grad_fn=<AddBackward0>)



 51%|█████     | 180/354 [16:12<15:42,  5.41s/it][A

	loss_cls: tensor(0.2986, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0645, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.3631, device='cuda:0', grad_fn=<AddBackward0>)



 51%|█████     | 181/354 [16:17<15:34,  5.40s/it][A

	loss_cls: tensor(0.4010, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2189, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6200, device='cuda:0', grad_fn=<AddBackward0>)



 51%|█████▏    | 182/354 [16:23<15:31,  5.42s/it][A

	loss_cls: tensor(0.7264, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0986, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8250, device='cuda:0', grad_fn=<AddBackward0>)



 52%|█████▏    | 183/354 [16:28<15:25,  5.41s/it][A

	loss_cls: tensor(0.9179, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1157, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0336, device='cuda:0', grad_fn=<AddBackward0>)



 52%|█████▏    | 184/354 [16:33<15:17,  5.40s/it][A

	loss_cls: tensor(1.1824, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2836, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.4660, device='cuda:0', grad_fn=<AddBackward0>)



 52%|█████▏    | 185/354 [16:39<15:14,  5.41s/it][A

	loss_cls: tensor(0.7957, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.6264, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.4221, device='cuda:0', grad_fn=<AddBackward0>)



 53%|█████▎    | 186/354 [16:44<15:05,  5.39s/it][A

	loss_cls: tensor(0.7177, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3381, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0558, device='cuda:0', grad_fn=<AddBackward0>)



 53%|█████▎    | 187/354 [16:50<15:02,  5.40s/it][A

	loss_cls: tensor(0.5101, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3176, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8277, device='cuda:0', grad_fn=<AddBackward0>)



 53%|█████▎    | 188/354 [16:55<14:54,  5.39s/it][A

	loss_cls: tensor(0.5026, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2686, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7713, device='cuda:0', grad_fn=<AddBackward0>)



 53%|█████▎    | 189/354 [17:00<14:50,  5.40s/it][A

	loss_cls: tensor(0.9039, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4372, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.3411, device='cuda:0', grad_fn=<AddBackward0>)



 54%|█████▎    | 190/354 [17:06<14:43,  5.39s/it][A

	loss_cls: tensor(0.3697, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2171, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5869, device='cuda:0', grad_fn=<AddBackward0>)



 54%|█████▍    | 191/354 [17:11<14:37,  5.38s/it][A

	loss_cls: tensor(0.5194, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2484, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7678, device='cuda:0', grad_fn=<AddBackward0>)



 54%|█████▍    | 192/354 [17:17<14:34,  5.40s/it][A

	loss_cls: tensor(0.3941, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2378, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6319, device='cuda:0', grad_fn=<AddBackward0>)



 55%|█████▍    | 193/354 [17:22<14:26,  5.38s/it][A

	loss_cls: tensor(0.6941, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2047, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8988, device='cuda:0', grad_fn=<AddBackward0>)



 55%|█████▍    | 194/354 [17:27<14:22,  5.39s/it][A

	loss_cls: tensor(0.5175, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1605, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6780, device='cuda:0', grad_fn=<AddBackward0>)



 55%|█████▌    | 195/354 [17:33<14:13,  5.37s/it][A

	loss_cls: tensor(0.4860, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1729, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6589, device='cuda:0', grad_fn=<AddBackward0>)



 55%|█████▌    | 196/354 [17:38<14:12,  5.39s/it][A

	loss_cls: tensor(0.8605, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1080, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9685, device='cuda:0', grad_fn=<AddBackward0>)



 56%|█████▌    | 197/354 [17:43<14:05,  5.38s/it][A

	loss_cls: tensor(0.4856, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1917, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6774, device='cuda:0', grad_fn=<AddBackward0>)



 56%|█████▌    | 198/354 [17:49<13:58,  5.38s/it][A

	loss_cls: tensor(0.7747, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0429, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8176, device='cuda:0', grad_fn=<AddBackward0>)



 56%|█████▌    | 199/354 [17:54<13:58,  5.41s/it][A

	loss_cls: tensor(0.7260, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2231, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9492, device='cuda:0', grad_fn=<AddBackward0>)



 56%|█████▋    | 200/354 [18:00<13:51,  5.40s/it][A

	loss_cls: tensor(0.6157, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2657, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8814, device='cuda:0', grad_fn=<AddBackward0>)



 57%|█████▋    | 201/354 [18:05<13:48,  5.42s/it][A

	loss_cls: tensor(0.6111, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4278, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0389, device='cuda:0', grad_fn=<AddBackward0>)



 57%|█████▋    | 202/354 [18:10<13:39,  5.39s/it][A

	loss_cls: tensor(0.4774, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3763, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8537, device='cuda:0', grad_fn=<AddBackward0>)



 57%|█████▋    | 203/354 [18:16<13:35,  5.40s/it][A

	loss_cls: tensor(0.9310, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3017, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2328, device='cuda:0', grad_fn=<AddBackward0>)



 58%|█████▊    | 204/354 [18:21<13:28,  5.39s/it][A

	loss_cls: tensor(0.7128, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3121, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0249, device='cuda:0', grad_fn=<AddBackward0>)



 58%|█████▊    | 205/354 [18:27<13:21,  5.38s/it][A

	loss_cls: tensor(0.5621, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1508, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7130, device='cuda:0', grad_fn=<AddBackward0>)



 58%|█████▊    | 206/354 [18:32<13:19,  5.40s/it][A

	loss_cls: tensor(0.4947, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3902, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8849, device='cuda:0', grad_fn=<AddBackward0>)



 58%|█████▊    | 207/354 [18:37<13:12,  5.39s/it][A

	loss_cls: tensor(0.7539, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0457, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7995, device='cuda:0', grad_fn=<AddBackward0>)



 59%|█████▉    | 208/354 [18:43<13:07,  5.40s/it][A

	loss_cls: tensor(0.6066, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1518, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7583, device='cuda:0', grad_fn=<AddBackward0>)



 59%|█████▉    | 209/354 [18:48<13:01,  5.39s/it][A

	loss_cls: tensor(0.8499, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3181, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1680, device='cuda:0', grad_fn=<AddBackward0>)



 59%|█████▉    | 210/354 [18:54<12:59,  5.41s/it][A

	loss_cls: tensor(0.4021, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2391, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6411, device='cuda:0', grad_fn=<AddBackward0>)



 60%|█████▉    | 211/354 [18:59<12:52,  5.40s/it][A

	loss_cls: tensor(0.8233, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0602, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8835, device='cuda:0', grad_fn=<AddBackward0>)



 60%|█████▉    | 212/354 [19:04<12:44,  5.39s/it][A

	loss_cls: tensor(0.5387, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1214, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6601, device='cuda:0', grad_fn=<AddBackward0>)



 60%|██████    | 213/354 [19:10<12:41,  5.40s/it][A

	loss_cls: tensor(0.5032, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2094, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7127, device='cuda:0', grad_fn=<AddBackward0>)



 60%|██████    | 214/354 [19:15<12:34,  5.39s/it][A

	loss_cls: tensor(0.6199, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0752, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6951, device='cuda:0', grad_fn=<AddBackward0>)



 61%|██████    | 215/354 [19:21<12:30,  5.40s/it][A

	loss_cls: tensor(0.7795, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3420, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1214, device='cuda:0', grad_fn=<AddBackward0>)



 61%|██████    | 216/354 [19:26<12:23,  5.39s/it][A

	loss_cls: tensor(0.6319, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3470, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9789, device='cuda:0', grad_fn=<AddBackward0>)



 61%|██████▏   | 217/354 [19:31<12:21,  5.41s/it][A

	loss_cls: tensor(0.5474, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1807, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7281, device='cuda:0', grad_fn=<AddBackward0>)



 62%|██████▏   | 218/354 [19:37<12:13,  5.40s/it][A

	loss_cls: tensor(0.5180, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0422, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5602, device='cuda:0', grad_fn=<AddBackward0>)



 62%|██████▏   | 219/354 [19:42<12:07,  5.39s/it][A

	loss_cls: tensor(1.1132, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1892, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.3024, device='cuda:0', grad_fn=<AddBackward0>)



 62%|██████▏   | 220/354 [19:48<12:05,  5.42s/it][A

	loss_cls: tensor(0.7392, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2170, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9562, device='cuda:0', grad_fn=<AddBackward0>)



 62%|██████▏   | 221/354 [19:53<11:58,  5.40s/it][A

	loss_cls: tensor(0.5035, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0722, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5757, device='cuda:0', grad_fn=<AddBackward0>)



 63%|██████▎   | 222/354 [19:58<11:55,  5.42s/it][A

	loss_cls: tensor(0.7174, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1634, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8808, device='cuda:0', grad_fn=<AddBackward0>)



 63%|██████▎   | 223/354 [20:04<11:48,  5.41s/it][A

	loss_cls: tensor(0.6017, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0299, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6316, device='cuda:0', grad_fn=<AddBackward0>)



 63%|██████▎   | 224/354 [20:09<11:45,  5.43s/it][A

	loss_cls: tensor(0.4493, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1489, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5982, device='cuda:0', grad_fn=<AddBackward0>)



 64%|██████▎   | 225/354 [20:15<11:39,  5.42s/it][A

	loss_cls: tensor(0.7643, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1346, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8989, device='cuda:0', grad_fn=<AddBackward0>)



 64%|██████▍   | 226/354 [20:20<11:32,  5.41s/it][A

	loss_cls: tensor(0.5299, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2184, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7482, device='cuda:0', grad_fn=<AddBackward0>)



 64%|██████▍   | 227/354 [20:26<11:28,  5.43s/it][A

	loss_cls: tensor(0.3942, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0205, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4147, device='cuda:0', grad_fn=<AddBackward0>)



 64%|██████▍   | 228/354 [20:31<11:21,  5.41s/it][A

	loss_cls: tensor(0.7983, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0538, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8521, device='cuda:0', grad_fn=<AddBackward0>)



 65%|██████▍   | 229/354 [20:36<11:18,  5.43s/it][A

	loss_cls: tensor(0.2648, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0366, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.3014, device='cuda:0', grad_fn=<AddBackward0>)



 65%|██████▍   | 230/354 [20:42<11:10,  5.41s/it][A

	loss_cls: tensor(0.7773, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3507, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1281, device='cuda:0', grad_fn=<AddBackward0>)



 65%|██████▌   | 231/354 [20:47<11:07,  5.42s/it][A

	loss_cls: tensor(0.9396, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.6756, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.6152, device='cuda:0', grad_fn=<AddBackward0>)



 66%|██████▌   | 232/354 [20:53<11:00,  5.41s/it][A

	loss_cls: tensor(0.5111, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2040, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7152, device='cuda:0', grad_fn=<AddBackward0>)



 66%|██████▌   | 233/354 [20:58<10:53,  5.40s/it][A

	loss_cls: tensor(0.7052, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1276, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8329, device='cuda:0', grad_fn=<AddBackward0>)



 66%|██████▌   | 234/354 [21:03<10:49,  5.41s/it][A

	loss_cls: tensor(0.9177, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.6570, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.5747, device='cuda:0', grad_fn=<AddBackward0>)



 66%|██████▋   | 235/354 [21:09<10:42,  5.40s/it][A

	loss_cls: tensor(0.8529, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3799, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2328, device='cuda:0', grad_fn=<AddBackward0>)



 67%|██████▋   | 236/354 [21:14<10:38,  5.41s/it][A

	loss_cls: tensor(1.0874, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2385, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.3260, device='cuda:0', grad_fn=<AddBackward0>)



 67%|██████▋   | 237/354 [21:20<10:31,  5.40s/it][A

	loss_cls: tensor(0.3974, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0788, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4762, device='cuda:0', grad_fn=<AddBackward0>)



 67%|██████▋   | 238/354 [21:25<10:28,  5.42s/it][A

	loss_cls: tensor(0.8768, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1910, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0678, device='cuda:0', grad_fn=<AddBackward0>)



 68%|██████▊   | 239/354 [21:30<10:21,  5.40s/it][A

	loss_cls: tensor(0.4981, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1610, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6591, device='cuda:0', grad_fn=<AddBackward0>)



 68%|██████▊   | 240/354 [21:36<10:13,  5.38s/it][A

	loss_cls: tensor(0.4882, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0944, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5826, device='cuda:0', grad_fn=<AddBackward0>)



 68%|██████▊   | 241/354 [21:41<10:10,  5.40s/it][A

	loss_cls: tensor(0.5372, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4043, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9415, device='cuda:0', grad_fn=<AddBackward0>)



 68%|██████▊   | 242/354 [21:47<10:03,  5.39s/it][A

	loss_cls: tensor(0.8053, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3414, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1467, device='cuda:0', grad_fn=<AddBackward0>)



 69%|██████▊   | 243/354 [21:52<09:59,  5.40s/it][A

	loss_cls: tensor(0.6474, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1859, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8333, device='cuda:0', grad_fn=<AddBackward0>)



 69%|██████▉   | 244/354 [21:57<09:53,  5.39s/it][A

	loss_cls: tensor(0.5528, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1324, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6852, device='cuda:0', grad_fn=<AddBackward0>)



 69%|██████▉   | 245/354 [22:03<09:49,  5.41s/it][A

	loss_cls: tensor(0.7043, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4009, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1052, device='cuda:0', grad_fn=<AddBackward0>)



 69%|██████▉   | 246/354 [22:08<09:43,  5.40s/it][A

	loss_cls: tensor(0.7566, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4257, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1823, device='cuda:0', grad_fn=<AddBackward0>)



 70%|██████▉   | 247/354 [22:14<09:36,  5.39s/it][A

	loss_cls: tensor(0.6613, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5349, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1962, device='cuda:0', grad_fn=<AddBackward0>)



 70%|███████   | 248/354 [22:19<09:33,  5.41s/it][A

	loss_cls: tensor(0.6311, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3112, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9422, device='cuda:0', grad_fn=<AddBackward0>)



 70%|███████   | 249/354 [22:24<09:26,  5.40s/it][A

	loss_cls: tensor(0.6002, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2544, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8546, device='cuda:0', grad_fn=<AddBackward0>)



 71%|███████   | 250/354 [22:30<09:23,  5.42s/it][A

	loss_cls: tensor(0.6009, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0528, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6537, device='cuda:0', grad_fn=<AddBackward0>)



 71%|███████   | 251/354 [22:35<09:17,  5.41s/it][A

	loss_cls: tensor(0.5715, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3245, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8960, device='cuda:0', grad_fn=<AddBackward0>)



 71%|███████   | 252/354 [22:41<09:12,  5.42s/it][A

	loss_cls: tensor(0.5371, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3436, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8807, device='cuda:0', grad_fn=<AddBackward0>)



 71%|███████▏  | 253/354 [22:46<09:06,  5.41s/it][A

	loss_cls: tensor(0.6166, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2128, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8294, device='cuda:0', grad_fn=<AddBackward0>)



 72%|███████▏  | 254/354 [22:51<08:59,  5.40s/it][A

	loss_cls: tensor(0.6675, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1661, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8336, device='cuda:0', grad_fn=<AddBackward0>)



 72%|███████▏  | 255/354 [22:57<08:55,  5.41s/it][A

	loss_cls: tensor(0.6294, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2014, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8309, device='cuda:0', grad_fn=<AddBackward0>)



 72%|███████▏  | 256/354 [23:02<08:48,  5.39s/it][A

	loss_cls: tensor(0.6193, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1097, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7290, device='cuda:0', grad_fn=<AddBackward0>)



 73%|███████▎  | 257/354 [23:08<08:45,  5.42s/it][A

	loss_cls: tensor(0.6567, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0649, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7216, device='cuda:0', grad_fn=<AddBackward0>)



 73%|███████▎  | 258/354 [23:13<08:38,  5.41s/it][A

	loss_cls: tensor(0.5944, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1623, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7566, device='cuda:0', grad_fn=<AddBackward0>)



 73%|███████▎  | 259/354 [23:19<08:34,  5.42s/it][A

	loss_cls: tensor(0.9084, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1669, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0753, device='cuda:0', grad_fn=<AddBackward0>)



 73%|███████▎  | 260/354 [23:24<08:27,  5.40s/it][A

	loss_cls: tensor(0.5849, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1568, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7416, device='cuda:0', grad_fn=<AddBackward0>)



 74%|███████▎  | 261/354 [23:29<08:20,  5.38s/it][A

	loss_cls: tensor(0.6750, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2831, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9582, device='cuda:0', grad_fn=<AddBackward0>)



 74%|███████▍  | 262/354 [23:35<08:17,  5.41s/it][A

	loss_cls: tensor(0.5488, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1677, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7165, device='cuda:0', grad_fn=<AddBackward0>)



 74%|███████▍  | 263/354 [23:40<08:11,  5.40s/it][A

	loss_cls: tensor(0.6642, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0638, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7280, device='cuda:0', grad_fn=<AddBackward0>)



 75%|███████▍  | 264/354 [23:46<08:08,  5.43s/it][A

	loss_cls: tensor(0.6293, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4034, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0327, device='cuda:0', grad_fn=<AddBackward0>)



 75%|███████▍  | 265/354 [23:51<08:01,  5.41s/it][A

	loss_cls: tensor(0.5645, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0945, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6590, device='cuda:0', grad_fn=<AddBackward0>)



 75%|███████▌  | 266/354 [23:56<07:57,  5.43s/it][A

	loss_cls: tensor(0.5547, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2284, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7831, device='cuda:0', grad_fn=<AddBackward0>)



 75%|███████▌  | 267/354 [24:02<07:50,  5.41s/it][A

	loss_cls: tensor(0.5257, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1341, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6597, device='cuda:0', grad_fn=<AddBackward0>)



 76%|███████▌  | 268/354 [24:07<07:44,  5.40s/it][A

	loss_cls: tensor(0.6899, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4079, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0979, device='cuda:0', grad_fn=<AddBackward0>)



 76%|███████▌  | 269/354 [24:13<07:39,  5.41s/it][A

	loss_cls: tensor(0.5155, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1166, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6320, device='cuda:0', grad_fn=<AddBackward0>)



 76%|███████▋  | 270/354 [24:18<07:33,  5.40s/it][A

	loss_cls: tensor(0.9269, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3790, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.3059, device='cuda:0', grad_fn=<AddBackward0>)



 77%|███████▋  | 271/354 [24:23<07:29,  5.41s/it][A

	loss_cls: tensor(0.5624, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2965, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8589, device='cuda:0', grad_fn=<AddBackward0>)



 77%|███████▋  | 272/354 [24:29<07:22,  5.40s/it][A

	loss_cls: tensor(0.5138, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1036, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6174, device='cuda:0', grad_fn=<AddBackward0>)



 77%|███████▋  | 273/354 [24:34<07:18,  5.42s/it][A

	loss_cls: tensor(0.5329, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2157, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7486, device='cuda:0', grad_fn=<AddBackward0>)



 77%|███████▋  | 274/354 [24:40<07:12,  5.40s/it][A

	loss_cls: tensor(0.8216, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1106, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9321, device='cuda:0', grad_fn=<AddBackward0>)



 78%|███████▊  | 275/354 [24:45<07:06,  5.39s/it][A

	loss_cls: tensor(0.5556, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3408, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8964, device='cuda:0', grad_fn=<AddBackward0>)



 78%|███████▊  | 276/354 [24:50<07:02,  5.42s/it][A

	loss_cls: tensor(0.5678, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1411, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7089, device='cuda:0', grad_fn=<AddBackward0>)



 78%|███████▊  | 277/354 [24:56<06:56,  5.41s/it][A

	loss_cls: tensor(0.6193, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2239, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8432, device='cuda:0', grad_fn=<AddBackward0>)



 79%|███████▊  | 278/354 [25:01<06:51,  5.42s/it][A

	loss_cls: tensor(0.4431, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2515, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6947, device='cuda:0', grad_fn=<AddBackward0>)



 79%|███████▉  | 279/354 [25:07<06:47,  5.44s/it][A

	loss_cls: tensor(0.7076, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1490, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8566, device='cuda:0', grad_fn=<AddBackward0>)



 79%|███████▉  | 280/354 [25:13<06:50,  5.54s/it][A

	loss_cls: tensor(0.6753, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0584, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7337, device='cuda:0', grad_fn=<AddBackward0>)



 79%|███████▉  | 281/354 [25:18<06:47,  5.59s/it][A

	loss_cls: tensor(0.8961, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1395, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0356, device='cuda:0', grad_fn=<AddBackward0>)



 80%|███████▉  | 282/354 [25:24<06:49,  5.68s/it][A

	loss_cls: tensor(0.8319, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0547, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8865, device='cuda:0', grad_fn=<AddBackward0>)



 80%|███████▉  | 283/354 [25:30<06:49,  5.77s/it][A

	loss_cls: tensor(0.6238, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1447, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7685, device='cuda:0', grad_fn=<AddBackward0>)



 80%|████████  | 284/354 [25:36<06:47,  5.83s/it][A

	loss_cls: tensor(0.5297, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1969, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7266, device='cuda:0', grad_fn=<AddBackward0>)



 81%|████████  | 285/354 [25:42<06:47,  5.91s/it][A

	loss_cls: tensor(0.4685, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3214, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7899, device='cuda:0', grad_fn=<AddBackward0>)



 81%|████████  | 286/354 [25:48<06:40,  5.89s/it][A

	loss_cls: tensor(0.4485, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2263, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6748, device='cuda:0', grad_fn=<AddBackward0>)



 81%|████████  | 287/354 [25:54<06:31,  5.84s/it][A

	loss_cls: tensor(0.3725, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0472, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4196, device='cuda:0', grad_fn=<AddBackward0>)



 81%|████████▏ | 288/354 [25:59<06:21,  5.78s/it][A

	loss_cls: tensor(0.6555, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3451, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0006, device='cuda:0', grad_fn=<AddBackward0>)



 82%|████████▏ | 289/354 [26:05<06:09,  5.69s/it][A

	loss_cls: tensor(0.3831, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0244, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4074, device='cuda:0', grad_fn=<AddBackward0>)



 82%|████████▏ | 290/354 [26:10<05:58,  5.60s/it][A

	loss_cls: tensor(0.4903, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1526, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6429, device='cuda:0', grad_fn=<AddBackward0>)



 82%|████████▏ | 291/354 [26:16<05:48,  5.54s/it][A

	loss_cls: tensor(0.7921, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1236, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9158, device='cuda:0', grad_fn=<AddBackward0>)



 82%|████████▏ | 292/354 [26:21<05:41,  5.51s/it][A

	loss_cls: tensor(0.8749, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0610, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9360, device='cuda:0', grad_fn=<AddBackward0>)



 83%|████████▎ | 293/354 [26:26<05:33,  5.47s/it][A

	loss_cls: tensor(0.8291, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4583, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2874, device='cuda:0', grad_fn=<AddBackward0>)



 83%|████████▎ | 294/354 [26:32<05:27,  5.46s/it][A

	loss_cls: tensor(0.4630, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2704, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7334, device='cuda:0', grad_fn=<AddBackward0>)



 83%|████████▎ | 295/354 [26:37<05:19,  5.42s/it][A

	loss_cls: tensor(0.7755, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2451, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0206, device='cuda:0', grad_fn=<AddBackward0>)



 84%|████████▎ | 296/354 [26:43<05:14,  5.43s/it][A

	loss_cls: tensor(0.3639, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0316, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.3955, device='cuda:0', grad_fn=<AddBackward0>)



 84%|████████▍ | 297/354 [26:48<05:08,  5.41s/it][A

	loss_cls: tensor(0.3521, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2623, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6144, device='cuda:0', grad_fn=<AddBackward0>)



 84%|████████▍ | 298/354 [26:53<05:02,  5.40s/it][A

	loss_cls: tensor(0.4047, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0827, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4874, device='cuda:0', grad_fn=<AddBackward0>)



 84%|████████▍ | 299/354 [26:59<04:58,  5.42s/it][A

	loss_cls: tensor(0.9591, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3467, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.3058, device='cuda:0', grad_fn=<AddBackward0>)



 85%|████████▍ | 300/354 [27:04<04:51,  5.40s/it][A

	loss_cls: tensor(0.5865, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2492, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8357, device='cuda:0', grad_fn=<AddBackward0>)



 85%|████████▌ | 301/354 [27:10<04:46,  5.41s/it][A

	loss_cls: tensor(0.9349, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3012, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2360, device='cuda:0', grad_fn=<AddBackward0>)



 85%|████████▌ | 302/354 [27:15<04:40,  5.40s/it][A

	loss_cls: tensor(0.5528, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2701, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8229, device='cuda:0', grad_fn=<AddBackward0>)



 86%|████████▌ | 303/354 [27:21<04:35,  5.41s/it][A

	loss_cls: tensor(0.5364, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1361, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6726, device='cuda:0', grad_fn=<AddBackward0>)



 86%|████████▌ | 304/354 [27:26<04:29,  5.39s/it][A

	loss_cls: tensor(0.5807, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4503, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0310, device='cuda:0', grad_fn=<AddBackward0>)



 86%|████████▌ | 305/354 [27:31<04:23,  5.38s/it][A

	loss_cls: tensor(0.5284, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3252, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8536, device='cuda:0', grad_fn=<AddBackward0>)



 86%|████████▋ | 306/354 [27:37<04:18,  5.39s/it][A

	loss_cls: tensor(0.6331, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3196, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9527, device='cuda:0', grad_fn=<AddBackward0>)



 87%|████████▋ | 307/354 [27:42<04:12,  5.38s/it][A

	loss_cls: tensor(0.4877, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2210, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7087, device='cuda:0', grad_fn=<AddBackward0>)



 87%|████████▋ | 308/354 [27:47<04:08,  5.39s/it][A

	loss_cls: tensor(0.6546, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1133, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7679, device='cuda:0', grad_fn=<AddBackward0>)



 87%|████████▋ | 309/354 [27:53<04:02,  5.39s/it][A

	loss_cls: tensor(0.4585, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4095, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8680, device='cuda:0', grad_fn=<AddBackward0>)



 88%|████████▊ | 310/354 [27:58<03:57,  5.40s/it][A

	loss_cls: tensor(0.4453, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3545, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7997, device='cuda:0', grad_fn=<AddBackward0>)



 88%|████████▊ | 311/354 [28:04<03:51,  5.38s/it][A

	loss_cls: tensor(0.6432, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2198, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8630, device='cuda:0', grad_fn=<AddBackward0>)



 88%|████████▊ | 312/354 [28:09<03:45,  5.37s/it][A

	loss_cls: tensor(0.3701, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2102, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5803, device='cuda:0', grad_fn=<AddBackward0>)



 88%|████████▊ | 313/354 [28:14<03:40,  5.39s/it][A

	loss_cls: tensor(0.7104, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0633, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7737, device='cuda:0', grad_fn=<AddBackward0>)



 89%|████████▊ | 314/354 [28:20<03:35,  5.38s/it][A

	loss_cls: tensor(0.4475, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3356, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7831, device='cuda:0', grad_fn=<AddBackward0>)



 89%|████████▉ | 315/354 [28:25<03:30,  5.39s/it][A

	loss_cls: tensor(0.5280, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0499, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5779, device='cuda:0', grad_fn=<AddBackward0>)



 89%|████████▉ | 316/354 [28:31<03:24,  5.39s/it][A

	loss_cls: tensor(0.4363, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2020, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6383, device='cuda:0', grad_fn=<AddBackward0>)



 90%|████████▉ | 317/354 [28:36<03:20,  5.41s/it][A

	loss_cls: tensor(0.5356, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1568, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6924, device='cuda:0', grad_fn=<AddBackward0>)



 90%|████████▉ | 318/354 [28:41<03:14,  5.40s/it][A

	loss_cls: tensor(0.4554, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2134, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6688, device='cuda:0', grad_fn=<AddBackward0>)



 90%|█████████ | 319/354 [28:47<03:08,  5.39s/it][A

	loss_cls: tensor(0.4052, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2922, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6975, device='cuda:0', grad_fn=<AddBackward0>)



 90%|█████████ | 320/354 [28:52<03:03,  5.41s/it][A

	loss_cls: tensor(0.6039, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1829, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7868, device='cuda:0', grad_fn=<AddBackward0>)



 91%|█████████ | 321/354 [28:58<02:57,  5.39s/it][A

	loss_cls: tensor(0.5568, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2270, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7838, device='cuda:0', grad_fn=<AddBackward0>)



 91%|█████████ | 322/354 [29:03<02:52,  5.41s/it][A

	loss_cls: tensor(0.4950, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2815, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7766, device='cuda:0', grad_fn=<AddBackward0>)



 91%|█████████ | 323/354 [29:08<02:47,  5.39s/it][A

	loss_cls: tensor(0.4514, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2601, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7115, device='cuda:0', grad_fn=<AddBackward0>)



 92%|█████████▏| 324/354 [29:14<02:42,  5.41s/it][A

	loss_cls: tensor(0.3011, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1269, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4279, device='cuda:0', grad_fn=<AddBackward0>)



 92%|█████████▏| 325/354 [29:19<02:36,  5.39s/it][A

	loss_cls: tensor(0.5352, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0639, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5991, device='cuda:0', grad_fn=<AddBackward0>)



 92%|█████████▏| 326/354 [29:24<02:30,  5.38s/it][A

	loss_cls: tensor(0.7184, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4833, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2018, device='cuda:0', grad_fn=<AddBackward0>)



 92%|█████████▏| 327/354 [29:30<02:25,  5.40s/it][A

	loss_cls: tensor(0.6236, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2561, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8797, device='cuda:0', grad_fn=<AddBackward0>)



 93%|█████████▎| 328/354 [29:35<02:20,  5.39s/it][A

	loss_cls: tensor(0.2611, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1024, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.3635, device='cuda:0', grad_fn=<AddBackward0>)



 93%|█████████▎| 329/354 [29:41<02:15,  5.40s/it][A

	loss_cls: tensor(0.2948, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0461, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.3409, device='cuda:0', grad_fn=<AddBackward0>)



 93%|█████████▎| 330/354 [29:46<02:09,  5.39s/it][A

	loss_cls: tensor(0.8128, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2583, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0711, device='cuda:0', grad_fn=<AddBackward0>)



 94%|█████████▎| 331/354 [29:52<02:04,  5.41s/it][A

	loss_cls: tensor(0.2810, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3073, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5883, device='cuda:0', grad_fn=<AddBackward0>)



 94%|█████████▍| 332/354 [29:57<01:58,  5.39s/it][A

	loss_cls: tensor(0.2760, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0929, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.3689, device='cuda:0', grad_fn=<AddBackward0>)



 94%|█████████▍| 333/354 [30:02<01:52,  5.38s/it][A

	loss_cls: tensor(0.8333, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2243, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0576, device='cuda:0', grad_fn=<AddBackward0>)



 94%|█████████▍| 334/354 [30:08<01:48,  5.40s/it][A

	loss_cls: tensor(0.7565, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1438, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9003, device='cuda:0', grad_fn=<AddBackward0>)



 95%|█████████▍| 335/354 [30:13<01:42,  5.38s/it][A

	loss_cls: tensor(0.4119, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1188, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5307, device='cuda:0', grad_fn=<AddBackward0>)



 95%|█████████▍| 336/354 [30:18<01:37,  5.41s/it][A

	loss_cls: tensor(0.6812, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2188, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9000, device='cuda:0', grad_fn=<AddBackward0>)



 95%|█████████▌| 337/354 [30:24<01:31,  5.39s/it][A

	loss_cls: tensor(0.5912, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1155, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7066, device='cuda:0', grad_fn=<AddBackward0>)



 95%|█████████▌| 338/354 [30:29<01:26,  5.41s/it][A

	loss_cls: tensor(0.3267, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2765, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6033, device='cuda:0', grad_fn=<AddBackward0>)



 96%|█████████▌| 339/354 [30:35<01:20,  5.40s/it][A

	loss_cls: tensor(0.3609, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1224, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4833, device='cuda:0', grad_fn=<AddBackward0>)



 96%|█████████▌| 340/354 [30:40<01:15,  5.38s/it][A

	loss_cls: tensor(0.2623, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2121, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4745, device='cuda:0', grad_fn=<AddBackward0>)



 96%|█████████▋| 341/354 [30:45<01:10,  5.40s/it][A

	loss_cls: tensor(0.7223, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4946, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2168, device='cuda:0', grad_fn=<AddBackward0>)



 97%|█████████▋| 342/354 [30:51<01:04,  5.39s/it][A

	loss_cls: tensor(0.2862, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0366, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.3228, device='cuda:0', grad_fn=<AddBackward0>)



 97%|█████████▋| 343/354 [30:56<00:59,  5.41s/it][A

	loss_cls: tensor(0.6888, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1945, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8833, device='cuda:0', grad_fn=<AddBackward0>)



 97%|█████████▋| 344/354 [31:02<00:54,  5.42s/it][A

	loss_cls: tensor(0.6026, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1056, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7082, device='cuda:0', grad_fn=<AddBackward0>)



 97%|█████████▋| 345/354 [31:07<00:49,  5.44s/it][A

	loss_cls: tensor(1.4470, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2638, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.7108, device='cuda:0', grad_fn=<AddBackward0>)



 98%|█████████▊| 346/354 [31:13<00:43,  5.42s/it][A

	loss_cls: tensor(0.6783, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1014, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7797, device='cuda:0', grad_fn=<AddBackward0>)



 98%|█████████▊| 347/354 [31:18<00:37,  5.40s/it][A

	loss_cls: tensor(0.7423, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3522, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0945, device='cuda:0', grad_fn=<AddBackward0>)



 98%|█████████▊| 348/354 [31:23<00:32,  5.41s/it][A

	loss_cls: tensor(0.5817, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2767, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8583, device='cuda:0', grad_fn=<AddBackward0>)



 99%|█████████▊| 349/354 [31:29<00:26,  5.39s/it][A

	loss_cls: tensor(0.6339, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2512, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8851, device='cuda:0', grad_fn=<AddBackward0>)



 99%|█████████▉| 350/354 [31:34<00:21,  5.41s/it][A

	loss_cls: tensor(0.4622, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3206, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7828, device='cuda:0', grad_fn=<AddBackward0>)



 99%|█████████▉| 351/354 [31:40<00:16,  5.39s/it][A

	loss_cls: tensor(0.6723, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1547, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8269, device='cuda:0', grad_fn=<AddBackward0>)



 99%|█████████▉| 352/354 [31:45<00:10,  5.40s/it][A

	loss_cls: tensor(0.5805, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5205, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1009, device='cuda:0', grad_fn=<AddBackward0>)



100%|█████████▉| 353/354 [31:50<00:05,  5.38s/it][A

	loss_cls: tensor(0.5028, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3877, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8905, device='cuda:0', grad_fn=<AddBackward0>)



100%|██████████| 354/354 [31:52<00:00,  5.40s/it][A

  0%|          | 0/177 [00:00<?, ?it/s][A

	loss_cls: tensor(0.6362, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1001, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7363, device='cuda:0', grad_fn=<AddBackward0>)

	Training Loss: 0.8398194638854366

	Training cls acc: 0.7020951035781545

	Training cls prec: 0.5815318136822374

	Training cls rec: 0.6175043765086139

	Training cls f1: 0.542065962379213

--
	Training ner acc: 0.9555145937485032

	Training ner prec: 0.27807249841853454

	Training ner rec: 0.28617838198924006

	Training ner f1: 0.28171543165579427

	Current Learning rate:  0.0005714285714285714



  1%|          | 1/177 [00:00<02:10,  1.35it/s][A
  1%|          | 2/177 [00:01<02:08,  1.36it/s][A
  2%|▏         | 3/177 [00:02<02:07,  1.37it/s][A
  2%|▏         | 4/177 [00:02<02:00,  1.44it/s][A
  3%|▎         | 5/177 [00:03<02:01,  1.41it/s][A
  3%|▎         | 6/177 [00:04<02:03,  1.39it/s][A
  4%|▍         | 7/177 [00:04<01:58,  1.44it/s][A
  5%|▍         | 8/177 [00:05<01:58,  1.42it/s][A
  5%|▌         | 9/177 [00:06<01:59,  1.41it/s][A
  6%|▌         | 10/177 [00:07<01:59,  1.40it/s][A
  6%|▌         | 11/177 [00:07<01:55,  1.43it/s][A
  7%|▋         | 12/177 [00:08<01:56,  1.42it/s][A
  7%|▋         | 13/177 [00:09<01:57,  1.40it/s][A
  8%|▊         | 14/177 [00:09<01:56,  1.39it/s][A
  8%|▊         | 15/177 [00:10<01:52,  1.44it/s][A
  9%|▉         | 16/177 [00:11<01:53,  1.42it/s][A
 10%|▉         | 17/177 [00:12<01:53,  1.40it/s][A
 10%|█         | 18/177 [00:12<01:49,  1.45it/s][A
 11%|█         | 19/177 [00:13<01:50,  1.43it/s][A
 11%|█▏        | 20/


	Validation Loss: 0.829831623424918

	Validation cls acc: 0.5736817325800376

	Validation cls prec: 0.5837570621468927

	Validation cls rec: 0.5502757600215227

	Validation cls f1: 0.47908688925638077

--
	Validation ner acc: 0.9540805564110914

	Validation ner prec: 0.4020534886428477

	Validation ner rec: 0.412617702448211

	Validation ner f1: 0.40712175459224803



  0%|          | 1/354 [00:05<31:23,  5.33s/it][A

	loss_cls: tensor(1.3008, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2076, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.5084, device='cuda:0', grad_fn=<AddBackward0>)



  1%|          | 2/354 [00:10<31:41,  5.40s/it][A

	loss_cls: tensor(0.6778, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2484, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9262, device='cuda:0', grad_fn=<AddBackward0>)



  1%|          | 3/354 [00:16<31:28,  5.38s/it][A

	loss_cls: tensor(0.5009, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3305, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8314, device='cuda:0', grad_fn=<AddBackward0>)



  1%|          | 4/354 [00:21<31:28,  5.40s/it][A

	loss_cls: tensor(0.6185, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1594, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7778, device='cuda:0', grad_fn=<AddBackward0>)



  1%|▏         | 5/354 [00:26<31:12,  5.37s/it][A

	loss_cls: tensor(0.5834, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2744, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8578, device='cuda:0', grad_fn=<AddBackward0>)



  2%|▏         | 6/354 [00:32<31:13,  5.38s/it][A

	loss_cls: tensor(0.9615, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1621, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1236, device='cuda:0', grad_fn=<AddBackward0>)



  2%|▏         | 7/354 [00:37<31:02,  5.37s/it][A

	loss_cls: tensor(0.7450, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1201, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8651, device='cuda:0', grad_fn=<AddBackward0>)



  2%|▏         | 8/354 [00:42<30:52,  5.36s/it][A

	loss_cls: tensor(0.5707, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2753, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8460, device='cuda:0', grad_fn=<AddBackward0>)



  3%|▎         | 9/354 [00:48<30:54,  5.38s/it][A

	loss_cls: tensor(0.6253, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2315, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8568, device='cuda:0', grad_fn=<AddBackward0>)



  3%|▎         | 10/354 [00:53<30:44,  5.36s/it][A

	loss_cls: tensor(0.6315, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2616, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8931, device='cuda:0', grad_fn=<AddBackward0>)



  3%|▎         | 11/354 [00:59<30:43,  5.37s/it][A

	loss_cls: tensor(0.9146, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1893, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1039, device='cuda:0', grad_fn=<AddBackward0>)



  3%|▎         | 12/354 [01:04<30:31,  5.36s/it][A

	loss_cls: tensor(0.5882, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3023, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8905, device='cuda:0', grad_fn=<AddBackward0>)



  4%|▎         | 13/354 [01:09<30:33,  5.38s/it][A

	loss_cls: tensor(0.4316, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3357, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7673, device='cuda:0', grad_fn=<AddBackward0>)



  4%|▍         | 14/354 [01:15<30:23,  5.36s/it][A

	loss_cls: tensor(0.6093, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4404, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0497, device='cuda:0', grad_fn=<AddBackward0>)



  4%|▍         | 15/354 [01:20<30:14,  5.35s/it][A

	loss_cls: tensor(0.5333, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3899, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9232, device='cuda:0', grad_fn=<AddBackward0>)



  5%|▍         | 16/354 [01:25<30:13,  5.37s/it][A

	loss_cls: tensor(0.6936, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1746, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8682, device='cuda:0', grad_fn=<AddBackward0>)



  5%|▍         | 17/354 [01:31<30:02,  5.35s/it][A

	loss_cls: tensor(0.4670, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1837, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6508, device='cuda:0', grad_fn=<AddBackward0>)



  5%|▌         | 18/354 [01:36<30:02,  5.36s/it][A

	loss_cls: tensor(0.7620, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1236, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8856, device='cuda:0', grad_fn=<AddBackward0>)



  5%|▌         | 19/354 [01:41<29:51,  5.35s/it][A

	loss_cls: tensor(0.8315, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2640, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0955, device='cuda:0', grad_fn=<AddBackward0>)



  6%|▌         | 20/354 [01:47<29:55,  5.37s/it][A

	loss_cls: tensor(0.8284, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1247, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9531, device='cuda:0', grad_fn=<AddBackward0>)



  6%|▌         | 21/354 [01:52<29:46,  5.37s/it][A

	loss_cls: tensor(0.4219, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2869, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7088, device='cuda:0', grad_fn=<AddBackward0>)



  6%|▌         | 22/354 [01:58<29:37,  5.35s/it][A

	loss_cls: tensor(0.7111, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2651, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9763, device='cuda:0', grad_fn=<AddBackward0>)



  6%|▋         | 23/354 [02:03<29:35,  5.36s/it][A

	loss_cls: tensor(0.7171, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0982, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8153, device='cuda:0', grad_fn=<AddBackward0>)



  7%|▋         | 24/354 [02:08<29:25,  5.35s/it][A

	loss_cls: tensor(0.8381, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1375, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9756, device='cuda:0', grad_fn=<AddBackward0>)



  7%|▋         | 25/354 [02:14<29:24,  5.36s/it][A

	loss_cls: tensor(0.6241, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3945, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0186, device='cuda:0', grad_fn=<AddBackward0>)



  7%|▋         | 26/354 [02:19<29:17,  5.36s/it][A

	loss_cls: tensor(0.7093, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1122, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8215, device='cuda:0', grad_fn=<AddBackward0>)



  8%|▊         | 27/354 [02:24<29:18,  5.38s/it][A

	loss_cls: tensor(0.5987, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.6805, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2792, device='cuda:0', grad_fn=<AddBackward0>)



  8%|▊         | 28/354 [02:30<29:06,  5.36s/it][A

	loss_cls: tensor(0.5511, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1289, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6800, device='cuda:0', grad_fn=<AddBackward0>)



  8%|▊         | 29/354 [02:35<28:59,  5.35s/it][A

	loss_cls: tensor(0.6084, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2149, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8232, device='cuda:0', grad_fn=<AddBackward0>)



  8%|▊         | 30/354 [02:40<28:58,  5.37s/it][A

	loss_cls: tensor(0.3398, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1099, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4496, device='cuda:0', grad_fn=<AddBackward0>)



  9%|▉         | 31/354 [02:46<28:46,  5.34s/it][A

	loss_cls: tensor(0.8946, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2971, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1917, device='cuda:0', grad_fn=<AddBackward0>)



  9%|▉         | 32/354 [02:51<28:44,  5.36s/it][A

	loss_cls: tensor(0.5827, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2632, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8458, device='cuda:0', grad_fn=<AddBackward0>)



  9%|▉         | 33/354 [02:56<28:36,  5.35s/it][A

	loss_cls: tensor(0.4378, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1341, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5719, device='cuda:0', grad_fn=<AddBackward0>)



 10%|▉         | 34/354 [03:02<28:36,  5.36s/it][A

	loss_cls: tensor(0.6256, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4266, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0522, device='cuda:0', grad_fn=<AddBackward0>)



 10%|▉         | 35/354 [03:07<28:28,  5.36s/it][A

	loss_cls: tensor(0.9190, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2069, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1259, device='cuda:0', grad_fn=<AddBackward0>)



 10%|█         | 36/354 [03:13<28:20,  5.35s/it][A

	loss_cls: tensor(0.8834, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3316, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2150, device='cuda:0', grad_fn=<AddBackward0>)



 10%|█         | 37/354 [03:18<28:21,  5.37s/it][A

	loss_cls: tensor(0.5389, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2171, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7560, device='cuda:0', grad_fn=<AddBackward0>)



 11%|█         | 38/354 [03:23<28:09,  5.35s/it][A

	loss_cls: tensor(0.6107, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3270, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9377, device='cuda:0', grad_fn=<AddBackward0>)



 11%|█         | 39/354 [03:29<28:11,  5.37s/it][A

	loss_cls: tensor(0.5361, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2081, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7442, device='cuda:0', grad_fn=<AddBackward0>)



 11%|█▏        | 40/354 [03:34<28:00,  5.35s/it][A

	loss_cls: tensor(0.4626, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1389, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6014, device='cuda:0', grad_fn=<AddBackward0>)



 12%|█▏        | 41/354 [03:39<28:01,  5.37s/it][A

	loss_cls: tensor(0.7900, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2990, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0889, device='cuda:0', grad_fn=<AddBackward0>)



 12%|█▏        | 42/354 [03:45<27:54,  5.37s/it][A

	loss_cls: tensor(0.5876, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1174, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7049, device='cuda:0', grad_fn=<AddBackward0>)



 12%|█▏        | 43/354 [03:50<27:43,  5.35s/it][A

	loss_cls: tensor(0.4418, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1639, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6058, device='cuda:0', grad_fn=<AddBackward0>)



 12%|█▏        | 44/354 [03:55<27:43,  5.37s/it][A

	loss_cls: tensor(0.5451, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1190, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6641, device='cuda:0', grad_fn=<AddBackward0>)



 13%|█▎        | 45/354 [04:01<27:31,  5.35s/it][A

	loss_cls: tensor(0.7684, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3738, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1422, device='cuda:0', grad_fn=<AddBackward0>)



 13%|█▎        | 46/354 [04:06<27:33,  5.37s/it][A

	loss_cls: tensor(0.5193, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0800, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5993, device='cuda:0', grad_fn=<AddBackward0>)



 13%|█▎        | 47/354 [04:12<27:25,  5.36s/it][A

	loss_cls: tensor(0.6128, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1429, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7558, device='cuda:0', grad_fn=<AddBackward0>)



 14%|█▎        | 48/354 [04:17<27:24,  5.37s/it][A

	loss_cls: tensor(0.5511, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0598, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6110, device='cuda:0', grad_fn=<AddBackward0>)



 14%|█▍        | 49/354 [04:22<27:16,  5.37s/it][A

	loss_cls: tensor(0.5638, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1404, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7042, device='cuda:0', grad_fn=<AddBackward0>)



 14%|█▍        | 50/354 [04:28<27:07,  5.35s/it][A

	loss_cls: tensor(0.5928, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3219, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9148, device='cuda:0', grad_fn=<AddBackward0>)



 14%|█▍        | 51/354 [04:33<27:07,  5.37s/it][A

	loss_cls: tensor(0.5566, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1154, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6720, device='cuda:0', grad_fn=<AddBackward0>)



 15%|█▍        | 52/354 [04:38<26:55,  5.35s/it][A

	loss_cls: tensor(0.8443, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1616, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0059, device='cuda:0', grad_fn=<AddBackward0>)



 15%|█▍        | 53/354 [04:44<26:56,  5.37s/it][A

	loss_cls: tensor(0.5749, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1322, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7071, device='cuda:0', grad_fn=<AddBackward0>)



 15%|█▌        | 54/354 [04:49<26:48,  5.36s/it][A

	loss_cls: tensor(0.7367, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5093, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2461, device='cuda:0', grad_fn=<AddBackward0>)



 16%|█▌        | 55/354 [04:54<26:48,  5.38s/it][A

	loss_cls: tensor(0.4767, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2485, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7252, device='cuda:0', grad_fn=<AddBackward0>)



 16%|█▌        | 56/354 [05:00<26:39,  5.37s/it][A

	loss_cls: tensor(0.4684, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0322, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5006, device='cuda:0', grad_fn=<AddBackward0>)



 16%|█▌        | 57/354 [05:05<26:29,  5.35s/it][A

	loss_cls: tensor(0.5458, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3138, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8596, device='cuda:0', grad_fn=<AddBackward0>)



 16%|█▋        | 58/354 [05:11<26:28,  5.37s/it][A

	loss_cls: tensor(0.7048, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1241, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8289, device='cuda:0', grad_fn=<AddBackward0>)



 17%|█▋        | 59/354 [05:16<26:21,  5.36s/it][A

	loss_cls: tensor(0.4994, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0534, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5529, device='cuda:0', grad_fn=<AddBackward0>)



 17%|█▋        | 60/354 [05:21<26:23,  5.39s/it][A

	loss_cls: tensor(0.5352, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1735, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7087, device='cuda:0', grad_fn=<AddBackward0>)



 17%|█▋        | 61/354 [05:27<26:15,  5.38s/it][A

	loss_cls: tensor(0.4353, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3146, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7499, device='cuda:0', grad_fn=<AddBackward0>)



 18%|█▊        | 62/354 [05:32<26:15,  5.40s/it][A

	loss_cls: tensor(0.5128, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1785, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6913, device='cuda:0', grad_fn=<AddBackward0>)



 18%|█▊        | 63/354 [05:37<26:04,  5.38s/it][A

	loss_cls: tensor(0.3614, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1623, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5237, device='cuda:0', grad_fn=<AddBackward0>)



 18%|█▊        | 64/354 [05:43<25:54,  5.36s/it][A

	loss_cls: tensor(1.2240, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4477, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.6717, device='cuda:0', grad_fn=<AddBackward0>)



 18%|█▊        | 65/354 [05:48<25:56,  5.39s/it][A

	loss_cls: tensor(0.9403, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1930, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1333, device='cuda:0', grad_fn=<AddBackward0>)



 19%|█▊        | 66/354 [05:54<25:48,  5.38s/it][A

	loss_cls: tensor(0.4118, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1428, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5546, device='cuda:0', grad_fn=<AddBackward0>)



 19%|█▉        | 67/354 [05:59<25:48,  5.40s/it][A

	loss_cls: tensor(0.6275, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1745, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8020, device='cuda:0', grad_fn=<AddBackward0>)



 19%|█▉        | 68/354 [06:04<25:39,  5.38s/it][A

	loss_cls: tensor(0.7532, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1507, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9039, device='cuda:0', grad_fn=<AddBackward0>)



 19%|█▉        | 69/354 [06:10<25:36,  5.39s/it][A

	loss_cls: tensor(0.5204, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3251, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8456, device='cuda:0', grad_fn=<AddBackward0>)



 20%|█▉        | 70/354 [06:15<25:27,  5.38s/it][A

	loss_cls: tensor(0.5780, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2777, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8557, device='cuda:0', grad_fn=<AddBackward0>)



 20%|██        | 71/354 [06:20<25:19,  5.37s/it][A

	loss_cls: tensor(0.5271, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1184, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6455, device='cuda:0', grad_fn=<AddBackward0>)



 20%|██        | 72/354 [06:26<25:19,  5.39s/it][A

	loss_cls: tensor(0.7426, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2865, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0291, device='cuda:0', grad_fn=<AddBackward0>)



 21%|██        | 73/354 [06:31<25:15,  5.39s/it][A

	loss_cls: tensor(0.4237, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2475, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6712, device='cuda:0', grad_fn=<AddBackward0>)



 21%|██        | 74/354 [06:37<25:17,  5.42s/it][A

	loss_cls: tensor(0.5331, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1830, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7161, device='cuda:0', grad_fn=<AddBackward0>)



 21%|██        | 75/354 [06:42<25:07,  5.40s/it][A

	loss_cls: tensor(0.5822, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1694, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7516, device='cuda:0', grad_fn=<AddBackward0>)



 21%|██▏       | 76/354 [06:48<25:07,  5.42s/it][A

	loss_cls: tensor(0.5480, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2880, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8360, device='cuda:0', grad_fn=<AddBackward0>)



 22%|██▏       | 77/354 [06:53<24:57,  5.41s/it][A

	loss_cls: tensor(0.5100, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1074, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6174, device='cuda:0', grad_fn=<AddBackward0>)



 22%|██▏       | 78/354 [06:58<24:49,  5.40s/it][A

	loss_cls: tensor(0.5559, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4065, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9624, device='cuda:0', grad_fn=<AddBackward0>)



 22%|██▏       | 79/354 [07:04<24:51,  5.42s/it][A

	loss_cls: tensor(0.6695, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0677, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7372, device='cuda:0', grad_fn=<AddBackward0>)



 23%|██▎       | 80/354 [07:09<24:43,  5.42s/it][A

	loss_cls: tensor(0.5523, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2940, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8463, device='cuda:0', grad_fn=<AddBackward0>)



 23%|██▎       | 81/354 [07:15<24:43,  5.43s/it][A

	loss_cls: tensor(0.5612, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2141, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7753, device='cuda:0', grad_fn=<AddBackward0>)



 23%|██▎       | 82/354 [07:20<24:34,  5.42s/it][A

	loss_cls: tensor(0.5362, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1866, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7228, device='cuda:0', grad_fn=<AddBackward0>)



 23%|██▎       | 83/354 [07:26<24:33,  5.44s/it][A

	loss_cls: tensor(0.5851, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2982, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8833, device='cuda:0', grad_fn=<AddBackward0>)



 24%|██▎       | 84/354 [07:31<24:23,  5.42s/it][A

	loss_cls: tensor(0.6851, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0444, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7295, device='cuda:0', grad_fn=<AddBackward0>)



 24%|██▍       | 85/354 [07:36<24:14,  5.41s/it][A

	loss_cls: tensor(0.9328, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.8036, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.7364, device='cuda:0', grad_fn=<AddBackward0>)



 24%|██▍       | 86/354 [07:42<24:17,  5.44s/it][A

	loss_cls: tensor(0.5711, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0413, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6125, device='cuda:0', grad_fn=<AddBackward0>)



 25%|██▍       | 87/354 [07:47<24:08,  5.43s/it][A

	loss_cls: tensor(0.4140, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0425, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4565, device='cuda:0', grad_fn=<AddBackward0>)



 25%|██▍       | 88/354 [07:53<24:08,  5.45s/it][A

	loss_cls: tensor(0.5855, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1570, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7426, device='cuda:0', grad_fn=<AddBackward0>)



 25%|██▌       | 89/354 [07:58<23:58,  5.43s/it][A

	loss_cls: tensor(0.5801, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3373, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9174, device='cuda:0', grad_fn=<AddBackward0>)



 25%|██▌       | 90/354 [08:04<23:55,  5.44s/it][A

	loss_cls: tensor(0.7433, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3306, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0739, device='cuda:0', grad_fn=<AddBackward0>)



 26%|██▌       | 91/354 [08:09<23:43,  5.41s/it][A

	loss_cls: tensor(0.6327, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1564, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7891, device='cuda:0', grad_fn=<AddBackward0>)



 26%|██▌       | 92/354 [08:14<23:36,  5.41s/it][A

	loss_cls: tensor(0.3583, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2450, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6032, device='cuda:0', grad_fn=<AddBackward0>)



 26%|██▋       | 93/354 [08:20<23:31,  5.41s/it][A

	loss_cls: tensor(0.4122, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1966, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6087, device='cuda:0', grad_fn=<AddBackward0>)



 27%|██▋       | 94/354 [08:25<23:25,  5.41s/it][A

	loss_cls: tensor(0.7011, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.8569, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.5580, device='cuda:0', grad_fn=<AddBackward0>)



 27%|██▋       | 95/354 [08:31<23:22,  5.41s/it][A

	loss_cls: tensor(0.6990, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1210, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8200, device='cuda:0', grad_fn=<AddBackward0>)



 27%|██▋       | 96/354 [08:36<23:14,  5.41s/it][A

	loss_cls: tensor(0.5533, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4631, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0163, device='cuda:0', grad_fn=<AddBackward0>)



 27%|██▋       | 97/354 [08:41<23:15,  5.43s/it][A

	loss_cls: tensor(0.3728, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3235, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6962, device='cuda:0', grad_fn=<AddBackward0>)



 28%|██▊       | 98/354 [08:47<23:05,  5.41s/it][A

	loss_cls: tensor(1.0714, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1762, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2476, device='cuda:0', grad_fn=<AddBackward0>)



 28%|██▊       | 99/354 [08:52<23:03,  5.43s/it][A

	loss_cls: tensor(0.5829, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3164, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8993, device='cuda:0', grad_fn=<AddBackward0>)



 28%|██▊       | 100/354 [08:58<22:53,  5.41s/it][A

	loss_cls: tensor(0.4388, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1277, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5664, device='cuda:0', grad_fn=<AddBackward0>)



 29%|██▊       | 101/354 [09:03<22:44,  5.39s/it][A

	loss_cls: tensor(0.5257, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2478, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7735, device='cuda:0', grad_fn=<AddBackward0>)



 29%|██▉       | 102/354 [09:08<22:42,  5.41s/it][A

	loss_cls: tensor(0.7420, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1038, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8458, device='cuda:0', grad_fn=<AddBackward0>)



 29%|██▉       | 103/354 [09:14<22:32,  5.39s/it][A

	loss_cls: tensor(0.7877, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2062, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9939, device='cuda:0', grad_fn=<AddBackward0>)



 29%|██▉       | 104/354 [09:19<22:31,  5.41s/it][A

	loss_cls: tensor(0.6571, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5286, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1857, device='cuda:0', grad_fn=<AddBackward0>)



 30%|██▉       | 105/354 [09:25<22:20,  5.38s/it][A

	loss_cls: tensor(0.4926, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1059, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5985, device='cuda:0', grad_fn=<AddBackward0>)



 30%|██▉       | 106/354 [09:30<22:17,  5.39s/it][A

	loss_cls: tensor(0.6845, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0768, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7613, device='cuda:0', grad_fn=<AddBackward0>)



 30%|███       | 107/354 [09:35<22:09,  5.38s/it][A

	loss_cls: tensor(0.4419, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3138, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7557, device='cuda:0', grad_fn=<AddBackward0>)



 31%|███       | 108/354 [09:41<22:01,  5.37s/it][A

	loss_cls: tensor(0.6864, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1158, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8022, device='cuda:0', grad_fn=<AddBackward0>)



 31%|███       | 109/354 [09:46<21:59,  5.38s/it][A

	loss_cls: tensor(0.7370, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4507, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1877, device='cuda:0', grad_fn=<AddBackward0>)



 31%|███       | 110/354 [09:51<21:49,  5.37s/it][A

	loss_cls: tensor(0.4598, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0252, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4850, device='cuda:0', grad_fn=<AddBackward0>)



 31%|███▏      | 111/354 [09:57<21:47,  5.38s/it][A

	loss_cls: tensor(0.4793, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3057, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7851, device='cuda:0', grad_fn=<AddBackward0>)



 32%|███▏      | 112/354 [10:02<21:39,  5.37s/it][A

	loss_cls: tensor(0.5757, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3051, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8808, device='cuda:0', grad_fn=<AddBackward0>)



 32%|███▏      | 113/354 [10:08<21:38,  5.39s/it][A

	loss_cls: tensor(0.4661, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3797, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8458, device='cuda:0', grad_fn=<AddBackward0>)



 32%|███▏      | 114/354 [10:13<21:31,  5.38s/it][A

	loss_cls: tensor(0.4452, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4635, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9087, device='cuda:0', grad_fn=<AddBackward0>)



 32%|███▏      | 115/354 [10:18<21:22,  5.37s/it][A

	loss_cls: tensor(0.7019, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2022, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9041, device='cuda:0', grad_fn=<AddBackward0>)



 33%|███▎      | 116/354 [10:24<21:21,  5.38s/it][A

	loss_cls: tensor(0.5499, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1713, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7212, device='cuda:0', grad_fn=<AddBackward0>)



 33%|███▎      | 117/354 [10:29<21:15,  5.38s/it][A

	loss_cls: tensor(0.7088, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2836, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9924, device='cuda:0', grad_fn=<AddBackward0>)



 33%|███▎      | 118/354 [10:35<21:12,  5.39s/it][A

	loss_cls: tensor(0.5380, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0587, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5967, device='cuda:0', grad_fn=<AddBackward0>)



 34%|███▎      | 119/354 [10:40<21:04,  5.38s/it][A

	loss_cls: tensor(0.5528, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1738, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7266, device='cuda:0', grad_fn=<AddBackward0>)



 34%|███▍      | 120/354 [10:45<21:02,  5.40s/it][A

	loss_cls: tensor(0.7224, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3455, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0679, device='cuda:0', grad_fn=<AddBackward0>)



 34%|███▍      | 121/354 [10:51<20:53,  5.38s/it][A

	loss_cls: tensor(0.6286, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1618, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7904, device='cuda:0', grad_fn=<AddBackward0>)



 34%|███▍      | 122/354 [10:56<20:45,  5.37s/it][A

	loss_cls: tensor(0.3873, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1052, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4925, device='cuda:0', grad_fn=<AddBackward0>)



 35%|███▍      | 123/354 [11:01<20:44,  5.39s/it][A

	loss_cls: tensor(0.8449, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2755, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1204, device='cuda:0', grad_fn=<AddBackward0>)



 35%|███▌      | 124/354 [11:07<20:36,  5.38s/it][A

	loss_cls: tensor(0.5882, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1627, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7509, device='cuda:0', grad_fn=<AddBackward0>)



 35%|███▌      | 125/354 [11:12<20:33,  5.39s/it][A

	loss_cls: tensor(0.5672, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3473, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9145, device='cuda:0', grad_fn=<AddBackward0>)



 36%|███▌      | 126/354 [11:18<20:26,  5.38s/it][A

	loss_cls: tensor(0.3197, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1009, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4206, device='cuda:0', grad_fn=<AddBackward0>)



 36%|███▌      | 127/354 [11:23<20:25,  5.40s/it][A

	loss_cls: tensor(0.7687, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3513, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1199, device='cuda:0', grad_fn=<AddBackward0>)



 36%|███▌      | 128/354 [11:28<20:15,  5.38s/it][A

	loss_cls: tensor(0.6161, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2388, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8549, device='cuda:0', grad_fn=<AddBackward0>)



 36%|███▋      | 129/354 [11:34<20:06,  5.36s/it][A

	loss_cls: tensor(0.4020, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1394, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5414, device='cuda:0', grad_fn=<AddBackward0>)



 37%|███▋      | 130/354 [11:39<20:04,  5.38s/it][A

	loss_cls: tensor(1.0581, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1549, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2130, device='cuda:0', grad_fn=<AddBackward0>)



 37%|███▋      | 131/354 [11:44<19:55,  5.36s/it][A

	loss_cls: tensor(0.7471, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2199, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9670, device='cuda:0', grad_fn=<AddBackward0>)



 37%|███▋      | 132/354 [11:50<19:54,  5.38s/it][A

	loss_cls: tensor(0.5548, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4751, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0300, device='cuda:0', grad_fn=<AddBackward0>)



 38%|███▊      | 133/354 [11:55<19:48,  5.38s/it][A

	loss_cls: tensor(0.7029, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3272, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0301, device='cuda:0', grad_fn=<AddBackward0>)



 38%|███▊      | 134/354 [12:01<19:47,  5.40s/it][A

	loss_cls: tensor(0.4764, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2616, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7381, device='cuda:0', grad_fn=<AddBackward0>)



 38%|███▊      | 135/354 [12:06<19:39,  5.39s/it][A

	loss_cls: tensor(0.7754, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2425, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0179, device='cuda:0', grad_fn=<AddBackward0>)



 38%|███▊      | 136/354 [12:11<19:40,  5.41s/it][A

	loss_cls: tensor(0.4092, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1583, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5675, device='cuda:0', grad_fn=<AddBackward0>)



 39%|███▊      | 137/354 [12:17<19:43,  5.46s/it][A

	loss_cls: tensor(0.4216, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2290, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6507, device='cuda:0', grad_fn=<AddBackward0>)



 39%|███▉      | 138/354 [12:23<19:38,  5.46s/it][A

	loss_cls: tensor(0.6246, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3618, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9865, device='cuda:0', grad_fn=<AddBackward0>)



 39%|███▉      | 139/354 [12:28<19:42,  5.50s/it][A

	loss_cls: tensor(0.8853, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2681, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1534, device='cuda:0', grad_fn=<AddBackward0>)



 40%|███▉      | 140/354 [12:34<19:35,  5.49s/it][A

	loss_cls: tensor(0.6262, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2816, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9078, device='cuda:0', grad_fn=<AddBackward0>)



 40%|███▉      | 141/354 [12:39<19:30,  5.50s/it][A

	loss_cls: tensor(0.7931, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1031, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8961, device='cuda:0', grad_fn=<AddBackward0>)



 40%|████      | 142/354 [12:45<19:24,  5.49s/it][A

	loss_cls: tensor(0.6853, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1959, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8811, device='cuda:0', grad_fn=<AddBackward0>)



 40%|████      | 143/354 [12:50<19:18,  5.49s/it][A

	loss_cls: tensor(0.4182, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2966, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7148, device='cuda:0', grad_fn=<AddBackward0>)



 41%|████      | 144/354 [12:56<19:16,  5.51s/it][A

	loss_cls: tensor(0.4425, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3472, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7897, device='cuda:0', grad_fn=<AddBackward0>)



 41%|████      | 145/354 [13:01<19:03,  5.47s/it][A

	loss_cls: tensor(0.7143, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2015, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9158, device='cuda:0', grad_fn=<AddBackward0>)



 41%|████      | 146/354 [13:07<19:02,  5.49s/it][A

	loss_cls: tensor(0.7730, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1315, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9045, device='cuda:0', grad_fn=<AddBackward0>)



 42%|████▏     | 147/354 [13:12<18:54,  5.48s/it][A

	loss_cls: tensor(0.8170, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0602, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8772, device='cuda:0', grad_fn=<AddBackward0>)



 42%|████▏     | 148/354 [13:18<18:52,  5.50s/it][A

	loss_cls: tensor(0.8420, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0926, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9346, device='cuda:0', grad_fn=<AddBackward0>)



 42%|████▏     | 149/354 [13:23<18:42,  5.48s/it][A

	loss_cls: tensor(0.4843, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3714, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8557, device='cuda:0', grad_fn=<AddBackward0>)



 42%|████▏     | 150/354 [13:28<18:34,  5.46s/it][A

	loss_cls: tensor(0.6683, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2370, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9053, device='cuda:0', grad_fn=<AddBackward0>)



 43%|████▎     | 151/354 [13:34<18:31,  5.47s/it][A

	loss_cls: tensor(0.6017, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2538, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8555, device='cuda:0', grad_fn=<AddBackward0>)



 43%|████▎     | 152/354 [13:39<18:22,  5.46s/it][A

	loss_cls: tensor(0.7279, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0935, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8214, device='cuda:0', grad_fn=<AddBackward0>)



 43%|████▎     | 153/354 [13:45<18:20,  5.48s/it][A

	loss_cls: tensor(0.7176, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4608, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1784, device='cuda:0', grad_fn=<AddBackward0>)



 44%|████▎     | 154/354 [13:50<18:13,  5.47s/it][A

	loss_cls: tensor(0.5237, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3242, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8479, device='cuda:0', grad_fn=<AddBackward0>)



 44%|████▍     | 155/354 [13:56<18:09,  5.48s/it][A

	loss_cls: tensor(0.5283, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1736, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7019, device='cuda:0', grad_fn=<AddBackward0>)



 44%|████▍     | 156/354 [14:01<18:02,  5.47s/it][A

	loss_cls: tensor(0.6457, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3340, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9797, device='cuda:0', grad_fn=<AddBackward0>)



 44%|████▍     | 157/354 [14:07<17:55,  5.46s/it][A

	loss_cls: tensor(0.6628, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.6492, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.3121, device='cuda:0', grad_fn=<AddBackward0>)



 45%|████▍     | 158/354 [14:12<17:53,  5.48s/it][A

	loss_cls: tensor(0.4462, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1739, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6201, device='cuda:0', grad_fn=<AddBackward0>)



 45%|████▍     | 159/354 [14:18<17:45,  5.47s/it][A

	loss_cls: tensor(0.7560, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2323, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9883, device='cuda:0', grad_fn=<AddBackward0>)



 45%|████▌     | 160/354 [14:23<17:43,  5.48s/it][A

	loss_cls: tensor(0.5691, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2302, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7992, device='cuda:0', grad_fn=<AddBackward0>)



 45%|████▌     | 161/354 [14:29<17:36,  5.48s/it][A

	loss_cls: tensor(0.6515, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4826, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1341, device='cuda:0', grad_fn=<AddBackward0>)



 46%|████▌     | 162/354 [14:34<17:34,  5.49s/it][A

	loss_cls: tensor(0.4923, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1784, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6707, device='cuda:0', grad_fn=<AddBackward0>)



 46%|████▌     | 163/354 [14:40<17:24,  5.47s/it][A

	loss_cls: tensor(0.4279, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1310, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5590, device='cuda:0', grad_fn=<AddBackward0>)



 46%|████▋     | 164/354 [14:45<17:16,  5.45s/it][A

	loss_cls: tensor(0.6353, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1753, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8106, device='cuda:0', grad_fn=<AddBackward0>)



 47%|████▋     | 165/354 [14:50<17:13,  5.47s/it][A

	loss_cls: tensor(0.4975, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1714, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6689, device='cuda:0', grad_fn=<AddBackward0>)



 47%|████▋     | 166/354 [14:56<17:05,  5.46s/it][A

	loss_cls: tensor(0.4957, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1462, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6419, device='cuda:0', grad_fn=<AddBackward0>)



 47%|████▋     | 167/354 [15:01<17:04,  5.48s/it][A

	loss_cls: tensor(0.4877, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1868, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6745, device='cuda:0', grad_fn=<AddBackward0>)



 47%|████▋     | 168/354 [15:07<16:56,  5.47s/it][A

	loss_cls: tensor(0.3281, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0977, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4258, device='cuda:0', grad_fn=<AddBackward0>)



 48%|████▊     | 169/354 [15:12<16:55,  5.49s/it][A

	loss_cls: tensor(0.8141, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2179, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0319, device='cuda:0', grad_fn=<AddBackward0>)



 48%|████▊     | 170/354 [15:18<16:47,  5.48s/it][A

	loss_cls: tensor(0.5306, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0335, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5641, device='cuda:0', grad_fn=<AddBackward0>)



 48%|████▊     | 171/354 [15:23<16:38,  5.46s/it][A

	loss_cls: tensor(0.6338, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2619, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8957, device='cuda:0', grad_fn=<AddBackward0>)



 49%|████▊     | 172/354 [15:29<16:37,  5.48s/it][A

	loss_cls: tensor(0.5140, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1005, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6145, device='cuda:0', grad_fn=<AddBackward0>)



 49%|████▉     | 173/354 [15:34<16:29,  5.47s/it][A

	loss_cls: tensor(0.4499, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0171, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4670, device='cuda:0', grad_fn=<AddBackward0>)



 49%|████▉     | 174/354 [15:40<16:28,  5.49s/it][A

	loss_cls: tensor(1.0402, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1344, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1746, device='cuda:0', grad_fn=<AddBackward0>)



 49%|████▉     | 175/354 [15:45<16:18,  5.47s/it][A

	loss_cls: tensor(0.5156, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2202, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7357, device='cuda:0', grad_fn=<AddBackward0>)



 50%|████▉     | 176/354 [15:51<16:18,  5.50s/it][A

	loss_cls: tensor(0.5538, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5346, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0885, device='cuda:0', grad_fn=<AddBackward0>)



 50%|█████     | 177/354 [15:56<16:09,  5.48s/it][A

	loss_cls: tensor(0.5436, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2253, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7689, device='cuda:0', grad_fn=<AddBackward0>)



 50%|█████     | 178/354 [16:02<16:01,  5.46s/it][A

	loss_cls: tensor(0.4114, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2689, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6803, device='cuda:0', grad_fn=<AddBackward0>)



 51%|█████     | 179/354 [16:07<15:58,  5.48s/it][A

	loss_cls: tensor(0.5448, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2100, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7549, device='cuda:0', grad_fn=<AddBackward0>)



 51%|█████     | 180/354 [16:13<15:52,  5.47s/it][A

	loss_cls: tensor(0.5224, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2913, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8137, device='cuda:0', grad_fn=<AddBackward0>)



 51%|█████     | 181/354 [16:18<15:50,  5.49s/it][A

	loss_cls: tensor(0.7839, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0792, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8631, device='cuda:0', grad_fn=<AddBackward0>)



 51%|█████▏    | 182/354 [16:24<15:41,  5.48s/it][A

	loss_cls: tensor(0.2957, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0486, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.3442, device='cuda:0', grad_fn=<AddBackward0>)



 52%|█████▏    | 183/354 [16:29<15:37,  5.48s/it][A

	loss_cls: tensor(0.8052, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3084, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1136, device='cuda:0', grad_fn=<AddBackward0>)



 52%|█████▏    | 184/354 [16:34<15:27,  5.46s/it][A

	loss_cls: tensor(0.3576, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1664, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5240, device='cuda:0', grad_fn=<AddBackward0>)



 52%|█████▏    | 185/354 [16:40<15:19,  5.44s/it][A

	loss_cls: tensor(0.5100, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1359, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6458, device='cuda:0', grad_fn=<AddBackward0>)



 53%|█████▎    | 186/354 [16:45<15:15,  5.45s/it][A

	loss_cls: tensor(0.6155, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3373, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9528, device='cuda:0', grad_fn=<AddBackward0>)



 53%|█████▎    | 187/354 [16:51<15:10,  5.45s/it][A

	loss_cls: tensor(0.5202, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2704, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7906, device='cuda:0', grad_fn=<AddBackward0>)



 53%|█████▎    | 188/354 [16:56<15:10,  5.48s/it][A

	loss_cls: tensor(0.5152, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0620, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5772, device='cuda:0', grad_fn=<AddBackward0>)



 53%|█████▎    | 189/354 [17:02<15:02,  5.47s/it][A

	loss_cls: tensor(0.7505, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2058, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9563, device='cuda:0', grad_fn=<AddBackward0>)



 54%|█████▎    | 190/354 [17:07<15:01,  5.50s/it][A

	loss_cls: tensor(0.6089, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1279, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7368, device='cuda:0', grad_fn=<AddBackward0>)



 54%|█████▍    | 191/354 [17:13<14:53,  5.48s/it][A

	loss_cls: tensor(0.5481, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3155, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8636, device='cuda:0', grad_fn=<AddBackward0>)



 54%|█████▍    | 192/354 [17:18<14:48,  5.48s/it][A

	loss_cls: tensor(0.8577, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3013, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1589, device='cuda:0', grad_fn=<AddBackward0>)



 55%|█████▍    | 193/354 [17:24<14:42,  5.48s/it][A

	loss_cls: tensor(0.6883, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2795, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9678, device='cuda:0', grad_fn=<AddBackward0>)



 55%|█████▍    | 194/354 [17:29<14:35,  5.47s/it][A

	loss_cls: tensor(0.4818, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1737, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6556, device='cuda:0', grad_fn=<AddBackward0>)



 55%|█████▌    | 195/354 [17:35<14:32,  5.49s/it][A

	loss_cls: tensor(0.9608, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1534, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1141, device='cuda:0', grad_fn=<AddBackward0>)



 55%|█████▌    | 196/354 [17:40<14:24,  5.47s/it][A

	loss_cls: tensor(0.6504, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1086, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7590, device='cuda:0', grad_fn=<AddBackward0>)



 56%|█████▌    | 197/354 [17:46<14:21,  5.49s/it][A

	loss_cls: tensor(0.6784, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3081, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9865, device='cuda:0', grad_fn=<AddBackward0>)



 56%|█████▌    | 198/354 [17:51<14:13,  5.47s/it][A

	loss_cls: tensor(0.4030, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2517, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6547, device='cuda:0', grad_fn=<AddBackward0>)



 56%|█████▌    | 199/354 [17:57<14:09,  5.48s/it][A

	loss_cls: tensor(0.8140, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3477, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1617, device='cuda:0', grad_fn=<AddBackward0>)



 56%|█████▋    | 200/354 [18:02<14:02,  5.47s/it][A

	loss_cls: tensor(0.5377, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0782, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6160, device='cuda:0', grad_fn=<AddBackward0>)



 57%|█████▋    | 201/354 [18:08<13:56,  5.47s/it][A

	loss_cls: tensor(0.6474, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2644, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9118, device='cuda:0', grad_fn=<AddBackward0>)



 57%|█████▋    | 202/354 [18:13<13:54,  5.49s/it][A

	loss_cls: tensor(0.5408, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2517, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7925, device='cuda:0', grad_fn=<AddBackward0>)



 57%|█████▋    | 203/354 [18:19<13:46,  5.48s/it][A

	loss_cls: tensor(0.6325, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1552, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7877, device='cuda:0', grad_fn=<AddBackward0>)



 58%|█████▊    | 204/354 [18:24<13:43,  5.49s/it][A

	loss_cls: tensor(0.6876, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5014, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1890, device='cuda:0', grad_fn=<AddBackward0>)



 58%|█████▊    | 205/354 [18:29<13:35,  5.47s/it][A

	loss_cls: tensor(0.5560, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4397, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9957, device='cuda:0', grad_fn=<AddBackward0>)



 58%|█████▊    | 206/354 [18:35<13:32,  5.49s/it][A

	loss_cls: tensor(0.3243, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1453, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4696, device='cuda:0', grad_fn=<AddBackward0>)



 58%|█████▊    | 207/354 [18:40<13:26,  5.49s/it][A

	loss_cls: tensor(0.4606, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3104, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7710, device='cuda:0', grad_fn=<AddBackward0>)



 59%|█████▉    | 208/354 [18:46<13:18,  5.47s/it][A

	loss_cls: tensor(0.6073, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1502, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7576, device='cuda:0', grad_fn=<AddBackward0>)



 59%|█████▉    | 209/354 [18:51<13:16,  5.49s/it][A

	loss_cls: tensor(0.6658, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1265, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7924, device='cuda:0', grad_fn=<AddBackward0>)



 59%|█████▉    | 210/354 [18:57<13:07,  5.47s/it][A

	loss_cls: tensor(0.4889, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2079, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6969, device='cuda:0', grad_fn=<AddBackward0>)



 60%|█████▉    | 211/354 [19:02<13:03,  5.48s/it][A

	loss_cls: tensor(0.6830, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2143, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8973, device='cuda:0', grad_fn=<AddBackward0>)



 60%|█████▉    | 212/354 [19:08<12:57,  5.47s/it][A

	loss_cls: tensor(0.5107, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2168, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7276, device='cuda:0', grad_fn=<AddBackward0>)



 60%|██████    | 213/354 [19:13<12:52,  5.48s/it][A

	loss_cls: tensor(0.6531, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.6046, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2577, device='cuda:0', grad_fn=<AddBackward0>)



 60%|██████    | 214/354 [19:19<12:45,  5.47s/it][A

	loss_cls: tensor(0.5035, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2332, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7367, device='cuda:0', grad_fn=<AddBackward0>)



 61%|██████    | 215/354 [19:24<12:39,  5.47s/it][A

	loss_cls: tensor(0.3743, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1378, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5121, device='cuda:0', grad_fn=<AddBackward0>)



 61%|██████    | 216/354 [19:30<12:35,  5.48s/it][A

	loss_cls: tensor(0.6638, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3662, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0300, device='cuda:0', grad_fn=<AddBackward0>)



 61%|██████▏   | 217/354 [19:35<12:28,  5.46s/it][A

	loss_cls: tensor(0.6903, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3193, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0096, device='cuda:0', grad_fn=<AddBackward0>)



 62%|██████▏   | 218/354 [19:41<12:25,  5.48s/it][A

	loss_cls: tensor(0.4280, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4815, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9095, device='cuda:0', grad_fn=<AddBackward0>)



 62%|██████▏   | 219/354 [19:46<12:18,  5.47s/it][A

	loss_cls: tensor(0.3581, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3177, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6758, device='cuda:0', grad_fn=<AddBackward0>)



 62%|██████▏   | 220/354 [19:52<12:16,  5.50s/it][A

	loss_cls: tensor(0.5096, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1516, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6612, device='cuda:0', grad_fn=<AddBackward0>)



 62%|██████▏   | 221/354 [19:57<12:09,  5.49s/it][A

	loss_cls: tensor(0.7948, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1636, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9583, device='cuda:0', grad_fn=<AddBackward0>)



 63%|██████▎   | 222/354 [20:03<12:02,  5.47s/it][A

	loss_cls: tensor(0.4402, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2906, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7308, device='cuda:0', grad_fn=<AddBackward0>)



 63%|██████▎   | 223/354 [20:08<11:59,  5.49s/it][A

	loss_cls: tensor(0.8302, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2596, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0897, device='cuda:0', grad_fn=<AddBackward0>)



 63%|██████▎   | 224/354 [20:14<11:51,  5.47s/it][A

	loss_cls: tensor(0.4471, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1765, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6237, device='cuda:0', grad_fn=<AddBackward0>)



 64%|██████▎   | 225/354 [20:19<11:48,  5.49s/it][A

	loss_cls: tensor(0.7528, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1410, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8937, device='cuda:0', grad_fn=<AddBackward0>)



 64%|██████▍   | 226/354 [20:25<11:40,  5.47s/it][A

	loss_cls: tensor(0.5636, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2635, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8271, device='cuda:0', grad_fn=<AddBackward0>)



 64%|██████▍   | 227/354 [20:30<11:37,  5.49s/it][A

	loss_cls: tensor(0.4619, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4376, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8995, device='cuda:0', grad_fn=<AddBackward0>)



 64%|██████▍   | 228/354 [20:36<11:30,  5.48s/it][A

	loss_cls: tensor(0.5784, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1541, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7325, device='cuda:0', grad_fn=<AddBackward0>)



 65%|██████▍   | 229/354 [20:41<11:23,  5.47s/it][A

	loss_cls: tensor(0.7482, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2173, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9655, device='cuda:0', grad_fn=<AddBackward0>)



 65%|██████▍   | 230/354 [20:46<11:19,  5.48s/it][A

	loss_cls: tensor(0.6279, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0459, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6738, device='cuda:0', grad_fn=<AddBackward0>)



 65%|██████▌   | 231/354 [20:52<11:10,  5.45s/it][A

	loss_cls: tensor(0.6422, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1440, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7862, device='cuda:0', grad_fn=<AddBackward0>)



 66%|██████▌   | 232/354 [20:57<11:07,  5.47s/it][A

	loss_cls: tensor(0.5525, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3480, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9006, device='cuda:0', grad_fn=<AddBackward0>)



 66%|██████▌   | 233/354 [21:03<11:01,  5.47s/it][A

	loss_cls: tensor(0.6285, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1001, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7286, device='cuda:0', grad_fn=<AddBackward0>)



 66%|██████▌   | 234/354 [21:08<10:56,  5.47s/it][A

	loss_cls: tensor(0.4186, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2108, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6294, device='cuda:0', grad_fn=<AddBackward0>)



 66%|██████▋   | 235/354 [21:14<10:49,  5.46s/it][A

	loss_cls: tensor(0.4537, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0586, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5124, device='cuda:0', grad_fn=<AddBackward0>)



 67%|██████▋   | 236/354 [21:19<10:44,  5.46s/it][A

	loss_cls: tensor(0.5542, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0297, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5839, device='cuda:0', grad_fn=<AddBackward0>)



 67%|██████▋   | 237/354 [21:25<10:40,  5.48s/it][A

	loss_cls: tensor(1.1393, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3146, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.4539, device='cuda:0', grad_fn=<AddBackward0>)



 67%|██████▋   | 238/354 [21:30<10:34,  5.47s/it][A

	loss_cls: tensor(0.5300, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1904, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7204, device='cuda:0', grad_fn=<AddBackward0>)



 68%|██████▊   | 239/354 [21:36<10:31,  5.49s/it][A

	loss_cls: tensor(0.7112, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4196, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1308, device='cuda:0', grad_fn=<AddBackward0>)



 68%|██████▊   | 240/354 [21:41<10:24,  5.48s/it][A

	loss_cls: tensor(0.8562, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2329, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0891, device='cuda:0', grad_fn=<AddBackward0>)



 68%|██████▊   | 241/354 [21:47<10:21,  5.50s/it][A

	loss_cls: tensor(0.7130, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2259, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9388, device='cuda:0', grad_fn=<AddBackward0>)



 68%|██████▊   | 242/354 [21:52<10:14,  5.49s/it][A

	loss_cls: tensor(0.4803, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0983, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5786, device='cuda:0', grad_fn=<AddBackward0>)



 69%|██████▊   | 243/354 [21:58<10:08,  5.48s/it][A

	loss_cls: tensor(0.9162, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.6079, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.5241, device='cuda:0', grad_fn=<AddBackward0>)



 69%|██████▉   | 244/354 [22:03<10:04,  5.49s/it][A

	loss_cls: tensor(0.4106, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3067, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7173, device='cuda:0', grad_fn=<AddBackward0>)



 69%|██████▉   | 245/354 [22:09<09:57,  5.48s/it][A

	loss_cls: tensor(0.8399, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3613, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2012, device='cuda:0', grad_fn=<AddBackward0>)



 69%|██████▉   | 246/354 [22:14<09:54,  5.50s/it][A

	loss_cls: tensor(0.4665, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1174, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5840, device='cuda:0', grad_fn=<AddBackward0>)



 70%|██████▉   | 247/354 [22:20<09:47,  5.49s/it][A

	loss_cls: tensor(0.9269, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1504, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0773, device='cuda:0', grad_fn=<AddBackward0>)



 70%|███████   | 248/354 [22:25<09:42,  5.50s/it][A

	loss_cls: tensor(0.9839, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3140, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2980, device='cuda:0', grad_fn=<AddBackward0>)



 70%|███████   | 249/354 [22:31<09:35,  5.48s/it][A

	loss_cls: tensor(0.6265, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1779, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8044, device='cuda:0', grad_fn=<AddBackward0>)



 71%|███████   | 250/354 [22:36<09:28,  5.47s/it][A

	loss_cls: tensor(0.5169, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0844, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6013, device='cuda:0', grad_fn=<AddBackward0>)



 71%|███████   | 251/354 [22:42<09:25,  5.49s/it][A

	loss_cls: tensor(0.4889, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0703, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5592, device='cuda:0', grad_fn=<AddBackward0>)



 71%|███████   | 252/354 [22:47<09:18,  5.48s/it][A

	loss_cls: tensor(0.4990, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1273, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6262, device='cuda:0', grad_fn=<AddBackward0>)



 71%|███████▏  | 253/354 [22:52<09:13,  5.48s/it][A

	loss_cls: tensor(0.4645, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1137, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5783, device='cuda:0', grad_fn=<AddBackward0>)



 72%|███████▏  | 254/354 [22:58<09:07,  5.47s/it][A

	loss_cls: tensor(0.5278, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4694, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9972, device='cuda:0', grad_fn=<AddBackward0>)



 72%|███████▏  | 255/354 [23:04<09:04,  5.50s/it][A

	loss_cls: tensor(0.5034, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1325, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6359, device='cuda:0', grad_fn=<AddBackward0>)



 72%|███████▏  | 256/354 [23:09<08:57,  5.49s/it][A

	loss_cls: tensor(0.5877, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4392, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0269, device='cuda:0', grad_fn=<AddBackward0>)



 73%|███████▎  | 257/354 [23:14<08:51,  5.48s/it][A

	loss_cls: tensor(0.4876, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1050, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5926, device='cuda:0', grad_fn=<AddBackward0>)



 73%|███████▎  | 258/354 [23:20<08:46,  5.49s/it][A

	loss_cls: tensor(0.6517, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3902, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0419, device='cuda:0', grad_fn=<AddBackward0>)



 73%|███████▎  | 259/354 [23:25<08:40,  5.48s/it][A

	loss_cls: tensor(0.7823, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1968, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9791, device='cuda:0', grad_fn=<AddBackward0>)



 73%|███████▎  | 260/354 [23:31<08:38,  5.51s/it][A

	loss_cls: tensor(0.8359, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3705, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2065, device='cuda:0', grad_fn=<AddBackward0>)



 74%|███████▎  | 261/354 [23:36<08:31,  5.50s/it][A

	loss_cls: tensor(0.6233, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5604, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1837, device='cuda:0', grad_fn=<AddBackward0>)



 74%|███████▍  | 262/354 [23:42<08:27,  5.52s/it][A

	loss_cls: tensor(0.4303, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2679, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6982, device='cuda:0', grad_fn=<AddBackward0>)



 74%|███████▍  | 263/354 [23:47<08:20,  5.50s/it][A

	loss_cls: tensor(0.6151, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4457, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0608, device='cuda:0', grad_fn=<AddBackward0>)



 75%|███████▍  | 264/354 [23:53<08:12,  5.48s/it][A

	loss_cls: tensor(0.6498, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1597, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8095, device='cuda:0', grad_fn=<AddBackward0>)



 75%|███████▍  | 265/354 [23:58<08:08,  5.48s/it][A

	loss_cls: tensor(0.6337, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3174, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9511, device='cuda:0', grad_fn=<AddBackward0>)



 75%|███████▌  | 266/354 [24:04<08:02,  5.48s/it][A

	loss_cls: tensor(0.6948, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2510, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9458, device='cuda:0', grad_fn=<AddBackward0>)



 75%|███████▌  | 267/354 [24:09<07:58,  5.50s/it][A

	loss_cls: tensor(0.6731, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0818, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7549, device='cuda:0', grad_fn=<AddBackward0>)



 76%|███████▌  | 268/354 [24:15<07:52,  5.49s/it][A

	loss_cls: tensor(0.7452, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1752, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9204, device='cuda:0', grad_fn=<AddBackward0>)



 76%|███████▌  | 269/354 [24:20<07:48,  5.51s/it][A

	loss_cls: tensor(0.5152, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2442, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7594, device='cuda:0', grad_fn=<AddBackward0>)



 76%|███████▋  | 270/354 [24:26<07:41,  5.49s/it][A

	loss_cls: tensor(0.5239, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1723, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6963, device='cuda:0', grad_fn=<AddBackward0>)



 77%|███████▋  | 271/354 [24:31<07:34,  5.48s/it][A

	loss_cls: tensor(0.4725, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2101, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6826, device='cuda:0', grad_fn=<AddBackward0>)



 77%|███████▋  | 272/354 [24:37<07:30,  5.49s/it][A

	loss_cls: tensor(0.5099, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4770, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9869, device='cuda:0', grad_fn=<AddBackward0>)



 77%|███████▋  | 273/354 [24:42<07:24,  5.48s/it][A

	loss_cls: tensor(0.6735, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1071, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7807, device='cuda:0', grad_fn=<AddBackward0>)



 77%|███████▋  | 274/354 [24:48<07:19,  5.50s/it][A

	loss_cls: tensor(1.0346, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1119, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1465, device='cuda:0', grad_fn=<AddBackward0>)



 78%|███████▊  | 275/354 [24:53<07:12,  5.48s/it][A

	loss_cls: tensor(0.4006, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0359, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4365, device='cuda:0', grad_fn=<AddBackward0>)



 78%|███████▊  | 276/354 [24:59<07:07,  5.49s/it][A

	loss_cls: tensor(0.5380, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4956, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0336, device='cuda:0', grad_fn=<AddBackward0>)



 78%|███████▊  | 277/354 [25:04<07:00,  5.47s/it][A

	loss_cls: tensor(0.4302, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1312, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5614, device='cuda:0', grad_fn=<AddBackward0>)



 79%|███████▊  | 278/354 [25:10<06:54,  5.45s/it][A

	loss_cls: tensor(0.6253, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2509, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8762, device='cuda:0', grad_fn=<AddBackward0>)



 79%|███████▉  | 279/354 [25:15<06:50,  5.48s/it][A

	loss_cls: tensor(0.7407, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1428, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8834, device='cuda:0', grad_fn=<AddBackward0>)



 79%|███████▉  | 280/354 [25:21<06:45,  5.48s/it][A

	loss_cls: tensor(0.6841, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0309, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7150, device='cuda:0', grad_fn=<AddBackward0>)



 79%|███████▉  | 281/354 [25:26<06:40,  5.49s/it][A

	loss_cls: tensor(0.4958, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1706, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6664, device='cuda:0', grad_fn=<AddBackward0>)



 80%|███████▉  | 282/354 [25:32<06:34,  5.48s/it][A

	loss_cls: tensor(0.4258, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0269, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4527, device='cuda:0', grad_fn=<AddBackward0>)



 80%|███████▉  | 283/354 [25:37<06:30,  5.50s/it][A

	loss_cls: tensor(0.6916, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2933, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9849, device='cuda:0', grad_fn=<AddBackward0>)



 80%|████████  | 284/354 [25:43<06:24,  5.49s/it][A

	loss_cls: tensor(0.8037, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2180, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0217, device='cuda:0', grad_fn=<AddBackward0>)



 81%|████████  | 285/354 [25:48<06:17,  5.47s/it][A

	loss_cls: tensor(0.9349, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1790, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1139, device='cuda:0', grad_fn=<AddBackward0>)



 81%|████████  | 286/354 [25:54<06:14,  5.50s/it][A

	loss_cls: tensor(0.4421, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1342, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5764, device='cuda:0', grad_fn=<AddBackward0>)



 81%|████████  | 287/354 [25:59<06:07,  5.48s/it][A

	loss_cls: tensor(0.4268, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2586, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6854, device='cuda:0', grad_fn=<AddBackward0>)



 81%|████████▏ | 288/354 [26:05<06:02,  5.49s/it][A

	loss_cls: tensor(0.4007, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1818, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5826, device='cuda:0', grad_fn=<AddBackward0>)



 82%|████████▏ | 289/354 [26:10<05:56,  5.48s/it][A

	loss_cls: tensor(0.6832, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1780, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8612, device='cuda:0', grad_fn=<AddBackward0>)



 82%|████████▏ | 290/354 [26:16<05:51,  5.50s/it][A

	loss_cls: tensor(0.6476, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1237, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7713, device='cuda:0', grad_fn=<AddBackward0>)



 82%|████████▏ | 291/354 [26:21<05:45,  5.48s/it][A

	loss_cls: tensor(0.8249, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3757, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2005, device='cuda:0', grad_fn=<AddBackward0>)



 82%|████████▏ | 292/354 [26:27<05:40,  5.49s/it][A

	loss_cls: tensor(1.2341, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3681, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.6021, device='cuda:0', grad_fn=<AddBackward0>)



 83%|████████▎ | 293/354 [26:32<05:34,  5.49s/it][A

	loss_cls: tensor(0.6595, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4714, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1309, device='cuda:0', grad_fn=<AddBackward0>)



 83%|████████▎ | 294/354 [26:37<05:28,  5.47s/it][A

	loss_cls: tensor(0.6590, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1942, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8532, device='cuda:0', grad_fn=<AddBackward0>)



 83%|████████▎ | 295/354 [26:43<05:23,  5.49s/it][A

	loss_cls: tensor(0.7147, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1648, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8794, device='cuda:0', grad_fn=<AddBackward0>)



 84%|████████▎ | 296/354 [26:48<05:17,  5.48s/it][A

	loss_cls: tensor(0.7007, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2392, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9399, device='cuda:0', grad_fn=<AddBackward0>)



 84%|████████▍ | 297/354 [26:54<05:13,  5.49s/it][A

	loss_cls: tensor(0.3149, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1363, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4512, device='cuda:0', grad_fn=<AddBackward0>)



 84%|████████▍ | 298/354 [26:59<05:05,  5.46s/it][A

	loss_cls: tensor(0.5838, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3278, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9116, device='cuda:0', grad_fn=<AddBackward0>)



 84%|████████▍ | 299/354 [27:05<05:01,  5.48s/it][A

	loss_cls: tensor(0.6295, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2067, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8362, device='cuda:0', grad_fn=<AddBackward0>)



 85%|████████▍ | 300/354 [27:10<04:55,  5.48s/it][A

	loss_cls: tensor(0.5194, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2108, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7302, device='cuda:0', grad_fn=<AddBackward0>)



 85%|████████▌ | 301/354 [27:16<04:49,  5.47s/it][A

	loss_cls: tensor(0.4345, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1915, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6260, device='cuda:0', grad_fn=<AddBackward0>)



 85%|████████▌ | 302/354 [27:21<04:45,  5.49s/it][A

	loss_cls: tensor(0.5999, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1191, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7190, device='cuda:0', grad_fn=<AddBackward0>)



 86%|████████▌ | 303/354 [27:27<04:39,  5.48s/it][A

	loss_cls: tensor(0.6362, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1108, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7469, device='cuda:0', grad_fn=<AddBackward0>)



 86%|████████▌ | 304/354 [27:32<04:34,  5.49s/it][A

	loss_cls: tensor(0.5080, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3898, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8978, device='cuda:0', grad_fn=<AddBackward0>)



 86%|████████▌ | 305/354 [27:38<04:27,  5.47s/it][A

	loss_cls: tensor(0.5516, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2396, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7912, device='cuda:0', grad_fn=<AddBackward0>)



 86%|████████▋ | 306/354 [27:43<04:23,  5.49s/it][A

	loss_cls: tensor(0.4308, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2674, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6983, device='cuda:0', grad_fn=<AddBackward0>)



 87%|████████▋ | 307/354 [27:49<04:17,  5.48s/it][A

	loss_cls: tensor(0.5886, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1020, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6907, device='cuda:0', grad_fn=<AddBackward0>)



 87%|████████▋ | 308/354 [27:54<04:11,  5.47s/it][A

	loss_cls: tensor(0.6358, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3331, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9689, device='cuda:0', grad_fn=<AddBackward0>)



 87%|████████▋ | 309/354 [28:00<04:06,  5.49s/it][A

	loss_cls: tensor(0.6045, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1761, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7806, device='cuda:0', grad_fn=<AddBackward0>)



 88%|████████▊ | 310/354 [28:05<04:00,  5.47s/it][A

	loss_cls: tensor(0.6079, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2164, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8244, device='cuda:0', grad_fn=<AddBackward0>)



 88%|████████▊ | 311/354 [28:11<03:55,  5.49s/it][A

	loss_cls: tensor(0.5086, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2049, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7135, device='cuda:0', grad_fn=<AddBackward0>)



 88%|████████▊ | 312/354 [28:16<03:50,  5.48s/it][A

	loss_cls: tensor(0.7417, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2795, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0212, device='cuda:0', grad_fn=<AddBackward0>)



 88%|████████▊ | 313/354 [28:22<03:45,  5.50s/it][A

	loss_cls: tensor(0.8283, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1138, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9421, device='cuda:0', grad_fn=<AddBackward0>)



 89%|████████▊ | 314/354 [28:27<03:39,  5.49s/it][A

	loss_cls: tensor(0.7397, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0560, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7958, device='cuda:0', grad_fn=<AddBackward0>)



 89%|████████▉ | 315/354 [28:33<03:33,  5.48s/it][A

	loss_cls: tensor(0.7043, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1240, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8283, device='cuda:0', grad_fn=<AddBackward0>)



 89%|████████▉ | 316/354 [28:38<03:28,  5.49s/it][A

	loss_cls: tensor(0.5431, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1838, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7268, device='cuda:0', grad_fn=<AddBackward0>)



 90%|████████▉ | 317/354 [28:44<03:22,  5.48s/it][A

	loss_cls: tensor(0.5942, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1712, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7654, device='cuda:0', grad_fn=<AddBackward0>)



 90%|████████▉ | 318/354 [28:49<03:17,  5.49s/it][A

	loss_cls: tensor(0.7506, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1034, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8540, device='cuda:0', grad_fn=<AddBackward0>)



 90%|█████████ | 319/354 [28:55<03:11,  5.48s/it][A

	loss_cls: tensor(0.8664, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1536, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0200, device='cuda:0', grad_fn=<AddBackward0>)



 90%|█████████ | 320/354 [29:00<03:06,  5.49s/it][A

	loss_cls: tensor(0.6489, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1590, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8080, device='cuda:0', grad_fn=<AddBackward0>)



 91%|█████████ | 321/354 [29:05<03:00,  5.47s/it][A

	loss_cls: tensor(0.5622, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1804, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7426, device='cuda:0', grad_fn=<AddBackward0>)



 91%|█████████ | 322/354 [29:11<02:54,  5.46s/it][A

	loss_cls: tensor(0.7150, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0280, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7430, device='cuda:0', grad_fn=<AddBackward0>)



 91%|█████████ | 323/354 [29:16<02:49,  5.48s/it][A

	loss_cls: tensor(0.4593, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2240, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6834, device='cuda:0', grad_fn=<AddBackward0>)



 92%|█████████▏| 324/354 [29:22<02:44,  5.47s/it][A

	loss_cls: tensor(0.4164, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0281, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4445, device='cuda:0', grad_fn=<AddBackward0>)



 92%|█████████▏| 325/354 [29:27<02:39,  5.49s/it][A

	loss_cls: tensor(0.5845, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1039, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6884, device='cuda:0', grad_fn=<AddBackward0>)



 92%|█████████▏| 326/354 [29:33<02:33,  5.47s/it][A

	loss_cls: tensor(0.6335, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.9349, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.5684, device='cuda:0', grad_fn=<AddBackward0>)



 92%|█████████▏| 327/354 [29:38<02:28,  5.50s/it][A

	loss_cls: tensor(0.7011, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4060, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1072, device='cuda:0', grad_fn=<AddBackward0>)



 93%|█████████▎| 328/354 [29:44<02:22,  5.49s/it][A

	loss_cls: tensor(0.5281, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0682, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5963, device='cuda:0', grad_fn=<AddBackward0>)



 93%|█████████▎| 329/354 [29:49<02:17,  5.48s/it][A

	loss_cls: tensor(0.5903, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3279, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9182, device='cuda:0', grad_fn=<AddBackward0>)



 93%|█████████▎| 330/354 [29:55<02:11,  5.50s/it][A

	loss_cls: tensor(0.4638, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1689, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6327, device='cuda:0', grad_fn=<AddBackward0>)



 94%|█████████▎| 331/354 [30:00<02:06,  5.48s/it][A

	loss_cls: tensor(0.3380, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1351, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4731, device='cuda:0', grad_fn=<AddBackward0>)



 94%|█████████▍| 332/354 [30:06<02:00,  5.48s/it][A

	loss_cls: tensor(0.7558, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1310, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8868, device='cuda:0', grad_fn=<AddBackward0>)



 94%|█████████▍| 333/354 [30:11<01:54,  5.47s/it][A

	loss_cls: tensor(0.8336, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2375, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0711, device='cuda:0', grad_fn=<AddBackward0>)



 94%|█████████▍| 334/354 [30:17<01:49,  5.49s/it][A

	loss_cls: tensor(0.4263, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2896, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7160, device='cuda:0', grad_fn=<AddBackward0>)



 95%|█████████▍| 335/354 [30:22<01:44,  5.48s/it][A

	loss_cls: tensor(0.7201, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2519, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9719, device='cuda:0', grad_fn=<AddBackward0>)



 95%|█████████▍| 336/354 [30:28<01:38,  5.46s/it][A

	loss_cls: tensor(0.4064, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1926, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5990, device='cuda:0', grad_fn=<AddBackward0>)



 95%|█████████▌| 337/354 [30:33<01:33,  5.48s/it][A

	loss_cls: tensor(0.7250, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3831, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1081, device='cuda:0', grad_fn=<AddBackward0>)



 95%|█████████▌| 338/354 [30:39<01:27,  5.47s/it][A

	loss_cls: tensor(0.4464, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0877, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5341, device='cuda:0', grad_fn=<AddBackward0>)



 96%|█████████▌| 339/354 [30:44<01:22,  5.49s/it][A

	loss_cls: tensor(0.5820, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1875, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7695, device='cuda:0', grad_fn=<AddBackward0>)



 96%|█████████▌| 340/354 [30:50<01:16,  5.49s/it][A

	loss_cls: tensor(0.9930, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5116, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.5046, device='cuda:0', grad_fn=<AddBackward0>)



 96%|█████████▋| 341/354 [30:55<01:11,  5.51s/it][A

	loss_cls: tensor(0.8992, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3227, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2219, device='cuda:0', grad_fn=<AddBackward0>)



 97%|█████████▋| 342/354 [31:01<01:05,  5.49s/it][A

	loss_cls: tensor(0.8408, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1076, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9484, device='cuda:0', grad_fn=<AddBackward0>)



 97%|█████████▋| 343/354 [31:06<00:59,  5.45s/it][A

	loss_cls: tensor(1.0336, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1273, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1609, device='cuda:0', grad_fn=<AddBackward0>)



 97%|█████████▋| 344/354 [31:11<00:54,  5.46s/it][A

	loss_cls: tensor(0.4529, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1603, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6131, device='cuda:0', grad_fn=<AddBackward0>)



 97%|█████████▋| 345/354 [31:17<00:49,  5.45s/it][A

	loss_cls: tensor(0.5073, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1542, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6615, device='cuda:0', grad_fn=<AddBackward0>)



 98%|█████████▊| 346/354 [31:22<00:43,  5.48s/it][A

	loss_cls: tensor(0.4605, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1054, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5658, device='cuda:0', grad_fn=<AddBackward0>)



 98%|█████████▊| 347/354 [31:28<00:38,  5.46s/it][A

	loss_cls: tensor(0.6055, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3667, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9721, device='cuda:0', grad_fn=<AddBackward0>)



 98%|█████████▊| 348/354 [31:33<00:32,  5.48s/it][A

	loss_cls: tensor(0.7064, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2538, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9602, device='cuda:0', grad_fn=<AddBackward0>)



 99%|█████████▊| 349/354 [31:39<00:27,  5.47s/it][A

	loss_cls: tensor(0.6841, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4081, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0923, device='cuda:0', grad_fn=<AddBackward0>)



 99%|█████████▉| 350/354 [31:44<00:21,  5.45s/it][A

	loss_cls: tensor(0.7853, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2901, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0753, device='cuda:0', grad_fn=<AddBackward0>)



 99%|█████████▉| 351/354 [31:50<00:16,  5.47s/it][A

	loss_cls: tensor(0.5093, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0560, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5653, device='cuda:0', grad_fn=<AddBackward0>)



 99%|█████████▉| 352/354 [31:55<00:10,  5.46s/it][A

	loss_cls: tensor(0.5456, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0807, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6264, device='cuda:0', grad_fn=<AddBackward0>)



100%|█████████▉| 353/354 [32:01<00:05,  5.48s/it][A

	loss_cls: tensor(0.6390, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1279, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7669, device='cuda:0', grad_fn=<AddBackward0>)



100%|██████████| 354/354 [32:03<00:00,  5.43s/it][A

  0%|          | 0/177 [00:00<?, ?it/s][A

	loss_cls: tensor(0.5889, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3736, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9625, device='cuda:0', grad_fn=<AddBackward0>)

	Training Loss: 0.8466921390618308

	Training cls acc: 0.6992113935969868

	Training cls prec: 0.5786805081296607

	Training cls rec: 0.6120016518321604

	Training cls f1: 0.5353885574786873

--
	Training ner acc: 0.9550503840845547

	Training ner prec: 0.26570046181616214

	Training ner rec: 0.27298259268215436

	Training ner f1: 0.26852134381192516

	Current Learning rate:  0.0005428571428571428



  1%|          | 1/177 [00:00<02:12,  1.33it/s][A
  1%|          | 2/177 [00:01<02:10,  1.34it/s][A
  2%|▏         | 3/177 [00:02<02:02,  1.42it/s][A
  2%|▏         | 4/177 [00:02<02:04,  1.39it/s][A
  3%|▎         | 5/177 [00:03<02:04,  1.38it/s][A
  3%|▎         | 6/177 [00:04<02:00,  1.42it/s][A
  4%|▍         | 7/177 [00:05<02:01,  1.40it/s][A
  5%|▍         | 8/177 [00:05<02:01,  1.39it/s][A
  5%|▌         | 9/177 [00:06<02:02,  1.38it/s][A
  6%|▌         | 10/177 [00:07<01:57,  1.42it/s][A
  6%|▌         | 11/177 [00:07<01:58,  1.40it/s][A
  7%|▋         | 12/177 [00:08<01:59,  1.38it/s][A
  7%|▋         | 13/177 [00:09<01:59,  1.37it/s][A
  8%|▊         | 14/177 [00:10<01:55,  1.41it/s][A
  8%|▊         | 15/177 [00:10<01:56,  1.39it/s][A
  9%|▉         | 16/177 [00:11<01:56,  1.38it/s][A
 10%|▉         | 17/177 [00:12<01:52,  1.42it/s][A
 10%|█         | 18/177 [00:12<01:53,  1.40it/s][A
 11%|█         | 19/177 [00:13<01:54,  1.39it/s][A
 11%|█▏        | 20/


	Validation Loss: 0.8075433524988466

	Validation cls acc: 0.7285781544256119

	Validation cls prec: 0.6081584611245628

	Validation cls rec: 0.5870762711864407

	Validation cls f1: 0.5634078413739432

--
	Validation ner acc: 0.9542896144501299

	Validation ner prec: 0.42398044191971107

	Validation ner rec: 0.43455743879472697

	Validation ner f1: 0.42904452468074644



  0%|          | 1/354 [00:05<32:06,  5.46s/it][A

	loss_cls: tensor(0.5462, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0491, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5953, device='cuda:0', grad_fn=<AddBackward0>)



  1%|          | 2/354 [00:10<32:17,  5.50s/it][A

	loss_cls: tensor(0.5835, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1043, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6878, device='cuda:0', grad_fn=<AddBackward0>)



  1%|          | 3/354 [00:16<32:04,  5.48s/it][A

	loss_cls: tensor(0.6115, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4328, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0443, device='cuda:0', grad_fn=<AddBackward0>)



  1%|          | 4/354 [00:21<31:53,  5.47s/it][A

	loss_cls: tensor(0.6530, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1088, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7617, device='cuda:0', grad_fn=<AddBackward0>)



  1%|▏         | 5/354 [00:27<31:54,  5.49s/it][A

	loss_cls: tensor(0.6303, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2399, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8701, device='cuda:0', grad_fn=<AddBackward0>)



  2%|▏         | 6/354 [00:32<31:41,  5.46s/it][A

	loss_cls: tensor(0.4303, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2157, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6460, device='cuda:0', grad_fn=<AddBackward0>)



  2%|▏         | 7/354 [00:38<31:39,  5.47s/it][A

	loss_cls: tensor(0.3696, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0313, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4009, device='cuda:0', grad_fn=<AddBackward0>)



  2%|▏         | 8/354 [00:43<31:34,  5.47s/it][A

	loss_cls: tensor(0.7651, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1295, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8946, device='cuda:0', grad_fn=<AddBackward0>)



  3%|▎         | 9/354 [00:49<31:36,  5.50s/it][A

	loss_cls: tensor(0.5079, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2256, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7336, device='cuda:0', grad_fn=<AddBackward0>)



  3%|▎         | 10/354 [00:54<31:24,  5.48s/it][A

	loss_cls: tensor(0.9293, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0871, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0164, device='cuda:0', grad_fn=<AddBackward0>)



  3%|▎         | 11/354 [01:00<31:14,  5.46s/it][A

	loss_cls: tensor(0.7651, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4214, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1865, device='cuda:0', grad_fn=<AddBackward0>)



  3%|▎         | 12/354 [01:05<31:06,  5.46s/it][A

	loss_cls: tensor(0.9821, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1240, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1061, device='cuda:0', grad_fn=<AddBackward0>)



  4%|▎         | 13/354 [01:11<30:56,  5.45s/it][A

	loss_cls: tensor(0.6033, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3372, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9405, device='cuda:0', grad_fn=<AddBackward0>)



  4%|▍         | 14/354 [01:16<30:58,  5.47s/it][A

	loss_cls: tensor(0.5461, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2916, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8376, device='cuda:0', grad_fn=<AddBackward0>)



  4%|▍         | 15/354 [01:22<30:50,  5.46s/it][A

	loss_cls: tensor(0.8269, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4181, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2450, device='cuda:0', grad_fn=<AddBackward0>)



  5%|▍         | 16/354 [01:27<30:51,  5.48s/it][A

	loss_cls: tensor(1.0798, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1638, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2436, device='cuda:0', grad_fn=<AddBackward0>)



  5%|▍         | 17/354 [01:32<30:41,  5.46s/it][A

	loss_cls: tensor(0.5924, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3529, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9453, device='cuda:0', grad_fn=<AddBackward0>)



  5%|▌         | 18/354 [01:38<30:31,  5.45s/it][A

	loss_cls: tensor(0.9732, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1460, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1192, device='cuda:0', grad_fn=<AddBackward0>)



  5%|▌         | 19/354 [01:43<30:31,  5.47s/it][A

	loss_cls: tensor(0.6006, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5338, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1344, device='cuda:0', grad_fn=<AddBackward0>)



  6%|▌         | 20/354 [01:49<30:09,  5.42s/it][A

	loss_cls: tensor(0.6230, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1073, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7304, device='cuda:0', grad_fn=<AddBackward0>)



  6%|▌         | 21/354 [01:54<30:03,  5.41s/it][A

	loss_cls: tensor(0.7237, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1804, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9041, device='cuda:0', grad_fn=<AddBackward0>)



  6%|▌         | 22/354 [01:59<29:48,  5.39s/it][A

	loss_cls: tensor(0.6118, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3077, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9195, device='cuda:0', grad_fn=<AddBackward0>)



  6%|▋         | 23/354 [02:05<29:41,  5.38s/it][A

	loss_cls: tensor(0.6137, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4722, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0859, device='cuda:0', grad_fn=<AddBackward0>)



  7%|▋         | 24/354 [02:10<29:31,  5.37s/it][A

	loss_cls: tensor(0.6294, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1158, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7452, device='cuda:0', grad_fn=<AddBackward0>)



  7%|▋         | 25/354 [02:15<29:22,  5.36s/it][A

	loss_cls: tensor(0.7513, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1121, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8634, device='cuda:0', grad_fn=<AddBackward0>)



  7%|▋         | 26/354 [02:21<29:20,  5.37s/it][A

	loss_cls: tensor(0.5393, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2927, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8320, device='cuda:0', grad_fn=<AddBackward0>)



  8%|▊         | 27/354 [02:26<29:06,  5.34s/it][A

	loss_cls: tensor(0.6597, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1674, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8271, device='cuda:0', grad_fn=<AddBackward0>)



  8%|▊         | 28/354 [02:32<29:09,  5.37s/it][A

	loss_cls: tensor(0.6493, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2645, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9138, device='cuda:0', grad_fn=<AddBackward0>)



  8%|▊         | 29/354 [02:37<29:02,  5.36s/it][A

	loss_cls: tensor(0.7508, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3160, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0668, device='cuda:0', grad_fn=<AddBackward0>)



  8%|▊         | 30/354 [02:42<29:00,  5.37s/it][A

	loss_cls: tensor(0.6575, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0399, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6974, device='cuda:0', grad_fn=<AddBackward0>)



  9%|▉         | 31/354 [02:48<28:50,  5.36s/it][A

	loss_cls: tensor(0.6030, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2136, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8166, device='cuda:0', grad_fn=<AddBackward0>)



  9%|▉         | 32/354 [02:53<28:41,  5.35s/it][A

	loss_cls: tensor(0.5108, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3925, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9033, device='cuda:0', grad_fn=<AddBackward0>)



  9%|▉         | 33/354 [02:58<28:40,  5.36s/it][A

	loss_cls: tensor(0.6278, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1579, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7857, device='cuda:0', grad_fn=<AddBackward0>)



 10%|▉         | 34/354 [03:04<28:31,  5.35s/it][A

	loss_cls: tensor(0.6948, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4786, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1734, device='cuda:0', grad_fn=<AddBackward0>)



 10%|▉         | 35/354 [03:09<28:32,  5.37s/it][A

	loss_cls: tensor(0.5677, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0929, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6606, device='cuda:0', grad_fn=<AddBackward0>)



 10%|█         | 36/354 [03:14<28:28,  5.37s/it][A

	loss_cls: tensor(0.7626, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1352, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8978, device='cuda:0', grad_fn=<AddBackward0>)



 10%|█         | 37/354 [03:20<28:27,  5.39s/it][A

	loss_cls: tensor(0.6208, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0974, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7182, device='cuda:0', grad_fn=<AddBackward0>)



 11%|█         | 38/354 [03:25<28:17,  5.37s/it][A

	loss_cls: tensor(0.5937, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2836, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8773, device='cuda:0', grad_fn=<AddBackward0>)



 11%|█         | 39/354 [03:31<28:07,  5.36s/it][A

	loss_cls: tensor(0.5651, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4607, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0258, device='cuda:0', grad_fn=<AddBackward0>)



 11%|█▏        | 40/354 [03:36<28:06,  5.37s/it][A

	loss_cls: tensor(0.4936, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1353, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6289, device='cuda:0', grad_fn=<AddBackward0>)



 12%|█▏        | 41/354 [03:41<27:55,  5.35s/it][A

	loss_cls: tensor(0.5110, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0426, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5537, device='cuda:0', grad_fn=<AddBackward0>)



 12%|█▏        | 42/354 [03:47<27:56,  5.37s/it][A

	loss_cls: tensor(0.5975, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2500, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8475, device='cuda:0', grad_fn=<AddBackward0>)



 12%|█▏        | 43/354 [03:52<27:44,  5.35s/it][A

	loss_cls: tensor(0.5127, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3922, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9049, device='cuda:0', grad_fn=<AddBackward0>)



 12%|█▏        | 44/354 [03:57<27:45,  5.37s/it][A

	loss_cls: tensor(0.5919, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2879, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8798, device='cuda:0', grad_fn=<AddBackward0>)



 13%|█▎        | 45/354 [04:03<27:36,  5.36s/it][A

	loss_cls: tensor(0.6966, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2923, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9889, device='cuda:0', grad_fn=<AddBackward0>)



 13%|█▎        | 46/354 [04:08<27:26,  5.34s/it][A

	loss_cls: tensor(0.6449, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1576, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8025, device='cuda:0', grad_fn=<AddBackward0>)



 13%|█▎        | 47/354 [04:13<27:26,  5.36s/it][A

	loss_cls: tensor(0.5594, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1229, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6823, device='cuda:0', grad_fn=<AddBackward0>)



 14%|█▎        | 48/354 [04:19<27:21,  5.36s/it][A

	loss_cls: tensor(0.5683, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1597, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7281, device='cuda:0', grad_fn=<AddBackward0>)



 14%|█▍        | 49/354 [04:24<27:22,  5.38s/it][A

	loss_cls: tensor(0.6832, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1641, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8472, device='cuda:0', grad_fn=<AddBackward0>)



 14%|█▍        | 50/354 [04:30<27:12,  5.37s/it][A

	loss_cls: tensor(0.5372, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2136, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7508, device='cuda:0', grad_fn=<AddBackward0>)



 14%|█▍        | 51/354 [04:35<27:12,  5.39s/it][A

	loss_cls: tensor(0.5573, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2994, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8567, device='cuda:0', grad_fn=<AddBackward0>)



 15%|█▍        | 52/354 [04:40<27:00,  5.37s/it][A

	loss_cls: tensor(0.4072, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3666, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7739, device='cuda:0', grad_fn=<AddBackward0>)



 15%|█▍        | 53/354 [04:46<26:48,  5.34s/it][A

	loss_cls: tensor(0.6668, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3310, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9978, device='cuda:0', grad_fn=<AddBackward0>)



 15%|█▌        | 54/354 [04:51<26:48,  5.36s/it][A

	loss_cls: tensor(0.6266, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2258, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8524, device='cuda:0', grad_fn=<AddBackward0>)



 16%|█▌        | 55/354 [04:56<26:45,  5.37s/it][A

	loss_cls: tensor(0.3596, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0943, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4539, device='cuda:0', grad_fn=<AddBackward0>)



 16%|█▌        | 56/354 [05:02<26:46,  5.39s/it][A

	loss_cls: tensor(0.5722, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0690, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6412, device='cuda:0', grad_fn=<AddBackward0>)



 16%|█▌        | 57/354 [05:07<26:38,  5.38s/it][A

	loss_cls: tensor(0.3450, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3513, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6963, device='cuda:0', grad_fn=<AddBackward0>)



 16%|█▋        | 58/354 [05:13<26:40,  5.41s/it][A

	loss_cls: tensor(0.3873, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5634, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9507, device='cuda:0', grad_fn=<AddBackward0>)



 17%|█▋        | 59/354 [05:18<26:31,  5.40s/it][A

	loss_cls: tensor(0.4696, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1416, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6112, device='cuda:0', grad_fn=<AddBackward0>)



 17%|█▋        | 60/354 [05:23<26:20,  5.38s/it][A

	loss_cls: tensor(0.4782, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1169, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5951, device='cuda:0', grad_fn=<AddBackward0>)



 17%|█▋        | 61/354 [05:29<26:19,  5.39s/it][A

	loss_cls: tensor(0.7299, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1620, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8919, device='cuda:0', grad_fn=<AddBackward0>)



 18%|█▊        | 62/354 [05:34<26:11,  5.38s/it][A

	loss_cls: tensor(0.6926, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1194, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8121, device='cuda:0', grad_fn=<AddBackward0>)



 18%|█▊        | 63/354 [05:40<26:10,  5.40s/it][A

	loss_cls: tensor(0.7791, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3155, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0947, device='cuda:0', grad_fn=<AddBackward0>)



 18%|█▊        | 64/354 [05:45<26:03,  5.39s/it][A

	loss_cls: tensor(0.5782, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3043, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8824, device='cuda:0', grad_fn=<AddBackward0>)



 18%|█▊        | 65/354 [05:50<26:03,  5.41s/it][A

	loss_cls: tensor(0.4653, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0982, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5635, device='cuda:0', grad_fn=<AddBackward0>)



 19%|█▊        | 66/354 [05:56<25:54,  5.40s/it][A

	loss_cls: tensor(0.5627, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1458, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7085, device='cuda:0', grad_fn=<AddBackward0>)



 19%|█▉        | 67/354 [06:01<25:46,  5.39s/it][A

	loss_cls: tensor(0.6567, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2315, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8882, device='cuda:0', grad_fn=<AddBackward0>)



 19%|█▉        | 68/354 [06:07<25:45,  5.41s/it][A

	loss_cls: tensor(0.6465, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1288, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7753, device='cuda:0', grad_fn=<AddBackward0>)



 19%|█▉        | 69/354 [06:12<25:40,  5.40s/it][A

	loss_cls: tensor(0.4622, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0892, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5515, device='cuda:0', grad_fn=<AddBackward0>)



 20%|█▉        | 70/354 [06:17<25:37,  5.41s/it][A

	loss_cls: tensor(0.4563, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0348, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4912, device='cuda:0', grad_fn=<AddBackward0>)



 20%|██        | 71/354 [06:23<25:28,  5.40s/it][A

	loss_cls: tensor(0.5465, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2533, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7998, device='cuda:0', grad_fn=<AddBackward0>)



 20%|██        | 72/354 [06:28<25:24,  5.41s/it][A

	loss_cls: tensor(0.4268, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1553, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5821, device='cuda:0', grad_fn=<AddBackward0>)



 21%|██        | 73/354 [06:34<25:14,  5.39s/it][A

	loss_cls: tensor(0.6681, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4632, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1313, device='cuda:0', grad_fn=<AddBackward0>)



 21%|██        | 74/354 [06:39<25:13,  5.41s/it][A

	loss_cls: tensor(0.6483, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2290, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8774, device='cuda:0', grad_fn=<AddBackward0>)



 21%|██        | 75/354 [06:44<25:06,  5.40s/it][A

	loss_cls: tensor(0.7462, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4083, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1544, device='cuda:0', grad_fn=<AddBackward0>)



 21%|██▏       | 76/354 [06:50<24:58,  5.39s/it][A

	loss_cls: tensor(0.3946, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1831, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5777, device='cuda:0', grad_fn=<AddBackward0>)



 22%|██▏       | 77/354 [06:55<25:00,  5.42s/it][A

	loss_cls: tensor(0.7398, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2910, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0308, device='cuda:0', grad_fn=<AddBackward0>)



 22%|██▏       | 78/354 [07:01<24:53,  5.41s/it][A

	loss_cls: tensor(0.8106, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3565, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1670, device='cuda:0', grad_fn=<AddBackward0>)



 22%|██▏       | 79/354 [07:06<24:52,  5.43s/it][A

	loss_cls: tensor(0.4362, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0687, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5049, device='cuda:0', grad_fn=<AddBackward0>)



 23%|██▎       | 80/354 [07:12<24:41,  5.41s/it][A

	loss_cls: tensor(1.1332, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0820, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2152, device='cuda:0', grad_fn=<AddBackward0>)



 23%|██▎       | 81/354 [07:17<24:42,  5.43s/it][A

	loss_cls: tensor(0.6057, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3293, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9350, device='cuda:0', grad_fn=<AddBackward0>)



 23%|██▎       | 82/354 [07:22<24:36,  5.43s/it][A

	loss_cls: tensor(0.4742, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4334, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9076, device='cuda:0', grad_fn=<AddBackward0>)



 23%|██▎       | 83/354 [07:28<24:26,  5.41s/it][A

	loss_cls: tensor(0.5958, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1466, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7424, device='cuda:0', grad_fn=<AddBackward0>)



 24%|██▎       | 84/354 [07:33<24:25,  5.43s/it][A

	loss_cls: tensor(0.3331, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2994, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6325, device='cuda:0', grad_fn=<AddBackward0>)



 24%|██▍       | 85/354 [07:39<24:18,  5.42s/it][A

	loss_cls: tensor(0.4853, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1249, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6102, device='cuda:0', grad_fn=<AddBackward0>)



 24%|██▍       | 86/354 [07:44<24:16,  5.43s/it][A

	loss_cls: tensor(0.4347, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3041, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7388, device='cuda:0', grad_fn=<AddBackward0>)



 25%|██▍       | 87/354 [07:50<24:07,  5.42s/it][A

	loss_cls: tensor(0.6391, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2431, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8822, device='cuda:0', grad_fn=<AddBackward0>)



 25%|██▍       | 88/354 [07:55<24:06,  5.44s/it][A

	loss_cls: tensor(0.4877, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1831, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6709, device='cuda:0', grad_fn=<AddBackward0>)



 25%|██▌       | 89/354 [08:00<23:58,  5.43s/it][A

	loss_cls: tensor(0.4550, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5862, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0413, device='cuda:0', grad_fn=<AddBackward0>)



 25%|██▌       | 90/354 [08:06<23:49,  5.42s/it][A

	loss_cls: tensor(0.5714, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1148, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6862, device='cuda:0', grad_fn=<AddBackward0>)



 26%|██▌       | 91/354 [08:11<23:54,  5.45s/it][A

	loss_cls: tensor(0.6089, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4440, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0529, device='cuda:0', grad_fn=<AddBackward0>)



 26%|██▌       | 92/354 [08:17<23:42,  5.43s/it][A

	loss_cls: tensor(0.4756, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3352, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8107, device='cuda:0', grad_fn=<AddBackward0>)



 26%|██▋       | 93/354 [08:22<23:41,  5.45s/it][A

	loss_cls: tensor(0.4067, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3819, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7886, device='cuda:0', grad_fn=<AddBackward0>)



 27%|██▋       | 94/354 [08:28<23:29,  5.42s/it][A

	loss_cls: tensor(0.6316, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1357, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7673, device='cuda:0', grad_fn=<AddBackward0>)



 27%|██▋       | 95/354 [08:33<23:25,  5.43s/it][A

	loss_cls: tensor(0.6449, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1208, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7657, device='cuda:0', grad_fn=<AddBackward0>)



 27%|██▋       | 96/354 [08:38<23:20,  5.43s/it][A

	loss_cls: tensor(0.6287, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3692, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9979, device='cuda:0', grad_fn=<AddBackward0>)



 27%|██▋       | 97/354 [08:44<23:14,  5.43s/it][A

	loss_cls: tensor(0.5360, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2558, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7918, device='cuda:0', grad_fn=<AddBackward0>)



 28%|██▊       | 98/354 [08:49<23:09,  5.43s/it][A

	loss_cls: tensor(0.4906, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2759, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7665, device='cuda:0', grad_fn=<AddBackward0>)



 28%|██▊       | 99/354 [08:55<22:57,  5.40s/it][A

	loss_cls: tensor(0.5975, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2743, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8718, device='cuda:0', grad_fn=<AddBackward0>)



 28%|██▊       | 100/354 [09:00<22:52,  5.40s/it][A

	loss_cls: tensor(0.8290, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1313, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9603, device='cuda:0', grad_fn=<AddBackward0>)



 29%|██▊       | 101/354 [09:05<22:43,  5.39s/it][A

	loss_cls: tensor(0.4764, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3547, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8311, device='cuda:0', grad_fn=<AddBackward0>)



 29%|██▉       | 102/354 [09:11<22:41,  5.40s/it][A

	loss_cls: tensor(0.5699, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0520, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6219, device='cuda:0', grad_fn=<AddBackward0>)



 29%|██▉       | 103/354 [09:16<22:30,  5.38s/it][A

	loss_cls: tensor(0.6921, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1057, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7978, device='cuda:0', grad_fn=<AddBackward0>)



 29%|██▉       | 104/354 [09:22<22:23,  5.37s/it][A

	loss_cls: tensor(0.6176, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.6440, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2615, device='cuda:0', grad_fn=<AddBackward0>)



 30%|██▉       | 105/354 [09:27<22:23,  5.39s/it][A

	loss_cls: tensor(0.4030, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3149, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7179, device='cuda:0', grad_fn=<AddBackward0>)



 30%|██▉       | 106/354 [09:32<22:13,  5.38s/it][A

	loss_cls: tensor(0.6082, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1921, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8003, device='cuda:0', grad_fn=<AddBackward0>)



 30%|███       | 107/354 [09:38<22:11,  5.39s/it][A

	loss_cls: tensor(0.5093, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1873, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6967, device='cuda:0', grad_fn=<AddBackward0>)



 31%|███       | 108/354 [09:43<22:03,  5.38s/it][A

	loss_cls: tensor(0.5216, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2026, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7242, device='cuda:0', grad_fn=<AddBackward0>)



 31%|███       | 109/354 [09:49<22:02,  5.40s/it][A

	loss_cls: tensor(0.4210, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0409, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4619, device='cuda:0', grad_fn=<AddBackward0>)



 31%|███       | 110/354 [09:54<21:53,  5.39s/it][A

	loss_cls: tensor(0.5810, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2564, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8374, device='cuda:0', grad_fn=<AddBackward0>)



 31%|███▏      | 111/354 [09:59<21:45,  5.37s/it][A

	loss_cls: tensor(0.8997, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1224, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0221, device='cuda:0', grad_fn=<AddBackward0>)



 32%|███▏      | 112/354 [10:05<21:43,  5.39s/it][A

	loss_cls: tensor(0.5608, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2047, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7655, device='cuda:0', grad_fn=<AddBackward0>)



 32%|███▏      | 113/354 [10:10<21:34,  5.37s/it][A

	loss_cls: tensor(1.1042, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3078, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.4120, device='cuda:0', grad_fn=<AddBackward0>)



 32%|███▏      | 114/354 [10:15<21:31,  5.38s/it][A

	loss_cls: tensor(0.4327, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2749, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7076, device='cuda:0', grad_fn=<AddBackward0>)



 32%|███▏      | 115/354 [10:21<21:23,  5.37s/it][A

	loss_cls: tensor(0.5084, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3039, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8123, device='cuda:0', grad_fn=<AddBackward0>)



 33%|███▎      | 116/354 [10:26<21:20,  5.38s/it][A

	loss_cls: tensor(0.4936, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3335, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8271, device='cuda:0', grad_fn=<AddBackward0>)



 33%|███▎      | 117/354 [10:31<21:11,  5.36s/it][A

	loss_cls: tensor(0.8964, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2145, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1108, device='cuda:0', grad_fn=<AddBackward0>)



 33%|███▎      | 118/354 [10:37<21:04,  5.36s/it][A

	loss_cls: tensor(0.6690, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1162, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7852, device='cuda:0', grad_fn=<AddBackward0>)



 34%|███▎      | 119/354 [10:42<21:11,  5.41s/it][A

	loss_cls: tensor(0.4679, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2400, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7079, device='cuda:0', grad_fn=<AddBackward0>)



 34%|███▍      | 120/354 [10:48<21:08,  5.42s/it][A

	loss_cls: tensor(0.8155, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1707, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9862, device='cuda:0', grad_fn=<AddBackward0>)



 34%|███▍      | 121/354 [10:53<21:13,  5.47s/it][A

	loss_cls: tensor(0.5036, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1643, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6679, device='cuda:0', grad_fn=<AddBackward0>)



 34%|███▍      | 122/354 [10:59<21:09,  5.47s/it][A

	loss_cls: tensor(0.5515, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3880, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9395, device='cuda:0', grad_fn=<AddBackward0>)



 35%|███▍      | 123/354 [11:04<21:10,  5.50s/it][A

	loss_cls: tensor(0.7932, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0517, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8449, device='cuda:0', grad_fn=<AddBackward0>)



 35%|███▌      | 124/354 [11:10<21:02,  5.49s/it][A

	loss_cls: tensor(0.7933, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1277, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9211, device='cuda:0', grad_fn=<AddBackward0>)



 35%|███▌      | 125/354 [11:15<20:56,  5.48s/it][A

	loss_cls: tensor(0.5352, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2627, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7979, device='cuda:0', grad_fn=<AddBackward0>)



 36%|███▌      | 126/354 [11:21<20:51,  5.49s/it][A

	loss_cls: tensor(0.4640, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1562, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6202, device='cuda:0', grad_fn=<AddBackward0>)



 36%|███▌      | 127/354 [11:26<20:43,  5.48s/it][A

	loss_cls: tensor(0.6026, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3203, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9229, device='cuda:0', grad_fn=<AddBackward0>)



 36%|███▌      | 128/354 [11:32<20:44,  5.51s/it][A

	loss_cls: tensor(1.0050, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0728, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0778, device='cuda:0', grad_fn=<AddBackward0>)



 36%|███▋      | 129/354 [11:37<20:36,  5.50s/it][A

	loss_cls: tensor(0.4848, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2075, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6922, device='cuda:0', grad_fn=<AddBackward0>)



 37%|███▋      | 130/354 [11:43<20:35,  5.52s/it][A

	loss_cls: tensor(0.6339, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2250, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8590, device='cuda:0', grad_fn=<AddBackward0>)



 37%|███▋      | 131/354 [11:48<20:28,  5.51s/it][A

	loss_cls: tensor(0.5368, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1409, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6777, device='cuda:0', grad_fn=<AddBackward0>)



 37%|███▋      | 132/354 [11:54<20:20,  5.50s/it][A

	loss_cls: tensor(0.8139, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0938, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9076, device='cuda:0', grad_fn=<AddBackward0>)



 38%|███▊      | 133/354 [11:59<20:18,  5.51s/it][A

	loss_cls: tensor(0.6113, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3968, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0081, device='cuda:0', grad_fn=<AddBackward0>)



 38%|███▊      | 134/354 [12:05<20:10,  5.50s/it][A

	loss_cls: tensor(0.6148, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4843, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0991, device='cuda:0', grad_fn=<AddBackward0>)



 38%|███▊      | 135/354 [12:10<20:10,  5.53s/it][A

	loss_cls: tensor(0.3782, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0370, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4151, device='cuda:0', grad_fn=<AddBackward0>)



 38%|███▊      | 136/354 [12:16<20:00,  5.51s/it][A

	loss_cls: tensor(0.7134, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2243, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9377, device='cuda:0', grad_fn=<AddBackward0>)



 39%|███▊      | 137/354 [12:21<19:58,  5.52s/it][A

	loss_cls: tensor(0.6442, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2841, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9283, device='cuda:0', grad_fn=<AddBackward0>)



 39%|███▉      | 138/354 [12:27<19:53,  5.53s/it][A

	loss_cls: tensor(0.4932, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0482, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5413, device='cuda:0', grad_fn=<AddBackward0>)



 39%|███▉      | 139/354 [12:32<19:44,  5.51s/it][A

	loss_cls: tensor(0.5757, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1035, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6793, device='cuda:0', grad_fn=<AddBackward0>)



 40%|███▉      | 140/354 [12:38<19:41,  5.52s/it][A

	loss_cls: tensor(0.5854, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2158, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8012, device='cuda:0', grad_fn=<AddBackward0>)



 40%|███▉      | 141/354 [12:43<19:33,  5.51s/it][A

	loss_cls: tensor(0.7928, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2359, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0287, device='cuda:0', grad_fn=<AddBackward0>)



 40%|████      | 142/354 [12:49<19:32,  5.53s/it][A

	loss_cls: tensor(0.4580, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1393, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5974, device='cuda:0', grad_fn=<AddBackward0>)



 40%|████      | 143/354 [12:55<19:23,  5.51s/it][A

	loss_cls: tensor(0.7639, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1420, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9058, device='cuda:0', grad_fn=<AddBackward0>)



 41%|████      | 144/354 [13:00<19:22,  5.54s/it][A

	loss_cls: tensor(0.6060, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3422, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9482, device='cuda:0', grad_fn=<AddBackward0>)



 41%|████      | 145/354 [13:06<19:13,  5.52s/it][A

	loss_cls: tensor(0.3809, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0625, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4434, device='cuda:0', grad_fn=<AddBackward0>)



 41%|████      | 146/354 [13:11<19:05,  5.51s/it][A

	loss_cls: tensor(0.4629, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1723, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6352, device='cuda:0', grad_fn=<AddBackward0>)



 42%|████▏     | 147/354 [13:16<18:24,  5.33s/it][A

	loss_cls: tensor(1.0310, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4142, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.4452, device='cuda:0', grad_fn=<AddBackward0>)



 42%|████▏     | 148/354 [13:21<17:57,  5.23s/it][A

	loss_cls: tensor(0.7773, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2110, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9882, device='cuda:0', grad_fn=<AddBackward0>)



 42%|████▏     | 149/354 [13:27<18:11,  5.32s/it][A

	loss_cls: tensor(0.7844, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1388, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9232, device='cuda:0', grad_fn=<AddBackward0>)



 42%|████▏     | 150/354 [13:32<18:14,  5.37s/it][A

	loss_cls: tensor(0.4176, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1422, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5598, device='cuda:0', grad_fn=<AddBackward0>)



 43%|████▎     | 151/354 [13:38<18:19,  5.42s/it][A

	loss_cls: tensor(0.6355, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3626, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9980, device='cuda:0', grad_fn=<AddBackward0>)



 43%|████▎     | 152/354 [13:43<18:17,  5.43s/it][A

	loss_cls: tensor(0.6556, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3961, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0516, device='cuda:0', grad_fn=<AddBackward0>)



 43%|████▎     | 153/354 [13:49<18:14,  5.45s/it][A

	loss_cls: tensor(0.6168, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2337, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8506, device='cuda:0', grad_fn=<AddBackward0>)



 44%|████▎     | 154/354 [13:54<18:17,  5.49s/it][A

	loss_cls: tensor(0.6439, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2489, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8929, device='cuda:0', grad_fn=<AddBackward0>)



 44%|████▍     | 155/354 [14:00<18:10,  5.48s/it][A

	loss_cls: tensor(0.4199, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2049, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6249, device='cuda:0', grad_fn=<AddBackward0>)



 44%|████▍     | 156/354 [14:05<18:09,  5.50s/it][A

	loss_cls: tensor(0.4527, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0742, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5269, device='cuda:0', grad_fn=<AddBackward0>)



 44%|████▍     | 157/354 [14:11<18:01,  5.49s/it][A

	loss_cls: tensor(0.6884, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4235, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1119, device='cuda:0', grad_fn=<AddBackward0>)



 45%|████▍     | 158/354 [14:16<17:58,  5.50s/it][A

	loss_cls: tensor(0.5644, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1634, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7278, device='cuda:0', grad_fn=<AddBackward0>)



 45%|████▍     | 159/354 [14:22<17:48,  5.48s/it][A

	loss_cls: tensor(0.5915, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3066, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8981, device='cuda:0', grad_fn=<AddBackward0>)



 45%|████▌     | 160/354 [14:27<17:40,  5.47s/it][A

	loss_cls: tensor(0.6051, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3208, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9259, device='cuda:0', grad_fn=<AddBackward0>)



 45%|████▌     | 161/354 [14:32<17:39,  5.49s/it][A

	loss_cls: tensor(0.6258, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1618, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7876, device='cuda:0', grad_fn=<AddBackward0>)



 46%|████▌     | 162/354 [14:38<17:30,  5.47s/it][A

	loss_cls: tensor(0.5290, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1207, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6497, device='cuda:0', grad_fn=<AddBackward0>)



 46%|████▌     | 163/354 [14:43<17:27,  5.48s/it][A

	loss_cls: tensor(0.4487, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0521, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5008, device='cuda:0', grad_fn=<AddBackward0>)



 46%|████▋     | 164/354 [14:49<17:18,  5.47s/it][A

	loss_cls: tensor(0.5292, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2578, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7871, device='cuda:0', grad_fn=<AddBackward0>)



 47%|████▋     | 165/354 [14:54<17:16,  5.49s/it][A

	loss_cls: tensor(0.4155, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2449, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6604, device='cuda:0', grad_fn=<AddBackward0>)



 47%|████▋     | 166/354 [15:00<17:07,  5.47s/it][A

	loss_cls: tensor(0.4401, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0338, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4739, device='cuda:0', grad_fn=<AddBackward0>)



 47%|████▋     | 167/354 [15:05<17:02,  5.47s/it][A

	loss_cls: tensor(0.6464, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0731, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7195, device='cuda:0', grad_fn=<AddBackward0>)



 47%|████▋     | 168/354 [15:11<17:02,  5.50s/it][A

	loss_cls: tensor(0.7259, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2816, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0075, device='cuda:0', grad_fn=<AddBackward0>)



 48%|████▊     | 169/354 [15:16<16:55,  5.49s/it][A

	loss_cls: tensor(0.6068, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3203, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9271, device='cuda:0', grad_fn=<AddBackward0>)



 48%|████▊     | 170/354 [15:22<16:52,  5.50s/it][A

	loss_cls: tensor(0.5750, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0584, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6334, device='cuda:0', grad_fn=<AddBackward0>)



 48%|████▊     | 171/354 [15:27<16:40,  5.47s/it][A

	loss_cls: tensor(0.4282, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4188, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8469, device='cuda:0', grad_fn=<AddBackward0>)



 49%|████▊     | 172/354 [15:33<16:38,  5.48s/it][A

	loss_cls: tensor(0.8352, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3297, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1650, device='cuda:0', grad_fn=<AddBackward0>)



 49%|████▉     | 173/354 [15:38<16:29,  5.47s/it][A

	loss_cls: tensor(0.5804, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2162, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7966, device='cuda:0', grad_fn=<AddBackward0>)



 49%|████▉     | 174/354 [15:44<16:26,  5.48s/it][A

	loss_cls: tensor(0.4317, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3159, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7476, device='cuda:0', grad_fn=<AddBackward0>)



 49%|████▉     | 175/354 [15:49<16:22,  5.49s/it][A

	loss_cls: tensor(0.5177, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0646, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5823, device='cuda:0', grad_fn=<AddBackward0>)



 50%|████▉     | 176/354 [15:55<16:15,  5.48s/it][A

	loss_cls: tensor(0.4958, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1605, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6563, device='cuda:0', grad_fn=<AddBackward0>)



 50%|█████     | 177/354 [16:00<16:11,  5.49s/it][A

	loss_cls: tensor(0.6095, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3973, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0068, device='cuda:0', grad_fn=<AddBackward0>)



 50%|█████     | 178/354 [16:06<16:04,  5.48s/it][A

	loss_cls: tensor(0.4362, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1178, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5539, device='cuda:0', grad_fn=<AddBackward0>)



 51%|█████     | 179/354 [16:11<16:01,  5.49s/it][A

	loss_cls: tensor(0.4958, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1292, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6250, device='cuda:0', grad_fn=<AddBackward0>)



 51%|█████     | 180/354 [16:17<15:50,  5.47s/it][A

	loss_cls: tensor(0.5947, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3384, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9332, device='cuda:0', grad_fn=<AddBackward0>)



 51%|█████     | 181/354 [16:22<15:49,  5.49s/it][A

	loss_cls: tensor(0.5828, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1846, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7674, device='cuda:0', grad_fn=<AddBackward0>)



 51%|█████▏    | 182/354 [16:28<15:42,  5.48s/it][A

	loss_cls: tensor(0.8000, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3745, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1745, device='cuda:0', grad_fn=<AddBackward0>)



 52%|█████▏    | 183/354 [16:33<15:34,  5.46s/it][A

	loss_cls: tensor(0.7996, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2396, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0392, device='cuda:0', grad_fn=<AddBackward0>)



 52%|█████▏    | 184/354 [16:39<15:32,  5.48s/it][A

	loss_cls: tensor(0.7505, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4005, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1510, device='cuda:0', grad_fn=<AddBackward0>)



 52%|█████▏    | 185/354 [16:44<15:25,  5.48s/it][A

	loss_cls: tensor(0.4494, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1848, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6342, device='cuda:0', grad_fn=<AddBackward0>)



 53%|█████▎    | 186/354 [16:50<15:22,  5.49s/it][A

	loss_cls: tensor(0.6981, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2366, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9347, device='cuda:0', grad_fn=<AddBackward0>)



 53%|█████▎    | 187/354 [16:55<15:14,  5.48s/it][A

	loss_cls: tensor(0.5060, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2822, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7881, device='cuda:0', grad_fn=<AddBackward0>)



 53%|█████▎    | 188/354 [17:01<15:14,  5.51s/it][A

	loss_cls: tensor(0.5030, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4229, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9260, device='cuda:0', grad_fn=<AddBackward0>)



 53%|█████▎    | 189/354 [17:06<15:06,  5.49s/it][A

	loss_cls: tensor(0.5528, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1753, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7280, device='cuda:0', grad_fn=<AddBackward0>)



 54%|█████▎    | 190/354 [17:11<14:59,  5.49s/it][A

	loss_cls: tensor(0.5484, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1389, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6873, device='cuda:0', grad_fn=<AddBackward0>)



 54%|█████▍    | 191/354 [17:17<14:56,  5.50s/it][A

	loss_cls: tensor(0.5748, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2410, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8158, device='cuda:0', grad_fn=<AddBackward0>)



 54%|█████▍    | 192/354 [17:22<14:48,  5.48s/it][A

	loss_cls: tensor(0.5097, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3235, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8331, device='cuda:0', grad_fn=<AddBackward0>)



 55%|█████▍    | 193/354 [17:28<14:44,  5.49s/it][A

	loss_cls: tensor(0.7820, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1472, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9292, device='cuda:0', grad_fn=<AddBackward0>)



 55%|█████▍    | 194/354 [17:33<14:35,  5.47s/it][A

	loss_cls: tensor(0.5540, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2728, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8268, device='cuda:0', grad_fn=<AddBackward0>)



 55%|█████▌    | 195/354 [17:39<14:33,  5.49s/it][A

	loss_cls: tensor(0.6564, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2351, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8916, device='cuda:0', grad_fn=<AddBackward0>)



 55%|█████▌    | 196/354 [17:44<14:25,  5.48s/it][A

	loss_cls: tensor(0.4007, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3211, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7218, device='cuda:0', grad_fn=<AddBackward0>)



 56%|█████▌    | 197/354 [17:50<14:18,  5.47s/it][A

	loss_cls: tensor(0.6629, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1576, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8204, device='cuda:0', grad_fn=<AddBackward0>)



 56%|█████▌    | 198/354 [17:55<14:15,  5.48s/it][A

	loss_cls: tensor(0.7664, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0738, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8402, device='cuda:0', grad_fn=<AddBackward0>)



 56%|█████▌    | 199/354 [18:01<14:07,  5.47s/it][A

	loss_cls: tensor(0.5526, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2417, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7943, device='cuda:0', grad_fn=<AddBackward0>)



 56%|█████▋    | 200/354 [18:06<14:03,  5.48s/it][A

	loss_cls: tensor(0.4630, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1513, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6142, device='cuda:0', grad_fn=<AddBackward0>)



 57%|█████▋    | 201/354 [18:12<13:57,  5.48s/it][A

	loss_cls: tensor(0.3691, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2345, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6037, device='cuda:0', grad_fn=<AddBackward0>)



 57%|█████▋    | 202/354 [18:17<13:55,  5.49s/it][A

	loss_cls: tensor(0.3506, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1755, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5262, device='cuda:0', grad_fn=<AddBackward0>)



 57%|█████▋    | 203/354 [18:23<13:48,  5.49s/it][A

	loss_cls: tensor(0.5181, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0737, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5918, device='cuda:0', grad_fn=<AddBackward0>)



 58%|█████▊    | 204/354 [18:28<13:40,  5.47s/it][A

	loss_cls: tensor(0.6668, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1903, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8571, device='cuda:0', grad_fn=<AddBackward0>)



 58%|█████▊    | 205/354 [18:34<13:36,  5.48s/it][A

	loss_cls: tensor(0.8633, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1898, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0531, device='cuda:0', grad_fn=<AddBackward0>)



 58%|█████▊    | 206/354 [18:39<13:29,  5.47s/it][A

	loss_cls: tensor(0.3965, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1168, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5133, device='cuda:0', grad_fn=<AddBackward0>)



 58%|█████▊    | 207/354 [18:45<13:26,  5.49s/it][A

	loss_cls: tensor(0.9227, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1131, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0358, device='cuda:0', grad_fn=<AddBackward0>)



 59%|█████▉    | 208/354 [18:50<13:20,  5.49s/it][A

	loss_cls: tensor(0.4979, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2374, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7353, device='cuda:0', grad_fn=<AddBackward0>)



 59%|█████▉    | 209/354 [18:56<13:16,  5.49s/it][A

	loss_cls: tensor(0.4603, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1745, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6348, device='cuda:0', grad_fn=<AddBackward0>)



 59%|█████▉    | 210/354 [19:01<13:09,  5.48s/it][A

	loss_cls: tensor(0.5588, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2088, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7677, device='cuda:0', grad_fn=<AddBackward0>)



 60%|█████▉    | 211/354 [19:07<13:01,  5.46s/it][A

	loss_cls: tensor(1.5302, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1513, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.6815, device='cuda:0', grad_fn=<AddBackward0>)



 60%|█████▉    | 212/354 [19:12<12:58,  5.48s/it][A

	loss_cls: tensor(0.6052, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2954, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9006, device='cuda:0', grad_fn=<AddBackward0>)



 60%|██████    | 213/354 [19:17<12:49,  5.46s/it][A

	loss_cls: tensor(0.8000, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2092, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0092, device='cuda:0', grad_fn=<AddBackward0>)



 60%|██████    | 214/354 [19:23<12:46,  5.47s/it][A

	loss_cls: tensor(0.5487, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3111, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8598, device='cuda:0', grad_fn=<AddBackward0>)



 61%|██████    | 215/354 [19:28<12:39,  5.47s/it][A

	loss_cls: tensor(0.6021, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2421, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8442, device='cuda:0', grad_fn=<AddBackward0>)



 61%|██████    | 216/354 [19:34<12:31,  5.45s/it][A

	loss_cls: tensor(0.5972, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1856, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7829, device='cuda:0', grad_fn=<AddBackward0>)



 61%|██████▏   | 217/354 [19:39<12:21,  5.41s/it][A

	loss_cls: tensor(0.5112, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3288, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8400, device='cuda:0', grad_fn=<AddBackward0>)



 62%|██████▏   | 218/354 [19:44<12:10,  5.37s/it][A

	loss_cls: tensor(0.8478, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3204, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1682, device='cuda:0', grad_fn=<AddBackward0>)



 62%|██████▏   | 219/354 [19:50<12:07,  5.39s/it][A

	loss_cls: tensor(0.6976, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1688, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8664, device='cuda:0', grad_fn=<AddBackward0>)



 62%|██████▏   | 220/354 [19:55<12:00,  5.38s/it][A

	loss_cls: tensor(0.5629, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1733, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7362, device='cuda:0', grad_fn=<AddBackward0>)



 62%|██████▏   | 221/354 [20:01<11:56,  5.39s/it][A

	loss_cls: tensor(0.5563, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1056, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6619, device='cuda:0', grad_fn=<AddBackward0>)



 63%|██████▎   | 222/354 [20:06<11:49,  5.38s/it][A

	loss_cls: tensor(0.6625, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1922, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8547, device='cuda:0', grad_fn=<AddBackward0>)



 63%|██████▎   | 223/354 [20:11<11:45,  5.39s/it][A

	loss_cls: tensor(0.5137, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0599, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5736, device='cuda:0', grad_fn=<AddBackward0>)



 63%|██████▎   | 224/354 [20:17<11:38,  5.37s/it][A

	loss_cls: tensor(0.5142, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4540, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9682, device='cuda:0', grad_fn=<AddBackward0>)



 64%|██████▎   | 225/354 [20:22<11:30,  5.36s/it][A

	loss_cls: tensor(0.6505, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2117, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8622, device='cuda:0', grad_fn=<AddBackward0>)



 64%|██████▍   | 226/354 [20:27<11:28,  5.38s/it][A

	loss_cls: tensor(0.6257, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1906, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8163, device='cuda:0', grad_fn=<AddBackward0>)



 64%|██████▍   | 227/354 [20:33<11:20,  5.36s/it][A

	loss_cls: tensor(0.4625, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2781, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7407, device='cuda:0', grad_fn=<AddBackward0>)



 64%|██████▍   | 228/354 [20:38<11:17,  5.38s/it][A

	loss_cls: tensor(0.6687, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0519, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7207, device='cuda:0', grad_fn=<AddBackward0>)



 65%|██████▍   | 229/354 [20:44<11:10,  5.36s/it][A

	loss_cls: tensor(0.6040, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3128, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9168, device='cuda:0', grad_fn=<AddBackward0>)



 65%|██████▍   | 230/354 [20:49<11:06,  5.38s/it][A

	loss_cls: tensor(0.4223, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4193, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8416, device='cuda:0', grad_fn=<AddBackward0>)



 65%|██████▌   | 231/354 [20:54<10:59,  5.37s/it][A

	loss_cls: tensor(0.7018, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3990, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1008, device='cuda:0', grad_fn=<AddBackward0>)



 66%|██████▌   | 232/354 [21:00<10:53,  5.35s/it][A

	loss_cls: tensor(0.6106, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1386, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7492, device='cuda:0', grad_fn=<AddBackward0>)



 66%|██████▌   | 233/354 [21:05<10:49,  5.37s/it][A

	loss_cls: tensor(0.6118, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2866, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8984, device='cuda:0', grad_fn=<AddBackward0>)



 66%|██████▌   | 234/354 [21:10<10:42,  5.35s/it][A

	loss_cls: tensor(0.6979, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0916, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7895, device='cuda:0', grad_fn=<AddBackward0>)



 66%|██████▋   | 235/354 [21:16<10:39,  5.37s/it][A

	loss_cls: tensor(0.6636, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2451, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9088, device='cuda:0', grad_fn=<AddBackward0>)



 67%|██████▋   | 236/354 [21:21<10:33,  5.37s/it][A

	loss_cls: tensor(0.7181, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1118, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8299, device='cuda:0', grad_fn=<AddBackward0>)



 67%|██████▋   | 237/354 [21:27<10:31,  5.39s/it][A

	loss_cls: tensor(0.8510, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3951, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2462, device='cuda:0', grad_fn=<AddBackward0>)



 67%|██████▋   | 238/354 [21:32<10:25,  5.39s/it][A

	loss_cls: tensor(0.7798, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1930, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9727, device='cuda:0', grad_fn=<AddBackward0>)



 68%|██████▊   | 239/354 [21:37<10:17,  5.37s/it][A

	loss_cls: tensor(0.4613, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4541, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9154, device='cuda:0', grad_fn=<AddBackward0>)



 68%|██████▊   | 240/354 [21:43<10:12,  5.37s/it][A

	loss_cls: tensor(0.5427, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1567, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6994, device='cuda:0', grad_fn=<AddBackward0>)



 68%|██████▊   | 241/354 [21:48<10:06,  5.37s/it][A

	loss_cls: tensor(0.5286, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2104, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7390, device='cuda:0', grad_fn=<AddBackward0>)



 68%|██████▊   | 242/354 [21:53<10:03,  5.39s/it][A

	loss_cls: tensor(0.5911, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2918, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8829, device='cuda:0', grad_fn=<AddBackward0>)



 69%|██████▊   | 243/354 [21:59<09:56,  5.37s/it][A

	loss_cls: tensor(0.4390, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3242, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7632, device='cuda:0', grad_fn=<AddBackward0>)



 69%|██████▉   | 244/354 [22:04<09:53,  5.39s/it][A

	loss_cls: tensor(0.8105, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1024, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9129, device='cuda:0', grad_fn=<AddBackward0>)



 69%|██████▉   | 245/354 [22:10<09:46,  5.38s/it][A

	loss_cls: tensor(0.4716, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1781, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6497, device='cuda:0', grad_fn=<AddBackward0>)



 69%|██████▉   | 246/354 [22:15<09:39,  5.36s/it][A

	loss_cls: tensor(0.4209, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1970, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6179, device='cuda:0', grad_fn=<AddBackward0>)



 70%|██████▉   | 247/354 [22:20<09:35,  5.38s/it][A

	loss_cls: tensor(0.4720, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4776, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9495, device='cuda:0', grad_fn=<AddBackward0>)



 70%|███████   | 248/354 [22:26<09:29,  5.37s/it][A

	loss_cls: tensor(0.4843, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2868, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7711, device='cuda:0', grad_fn=<AddBackward0>)



 70%|███████   | 249/354 [22:31<09:26,  5.39s/it][A

	loss_cls: tensor(0.7124, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1878, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9002, device='cuda:0', grad_fn=<AddBackward0>)



 71%|███████   | 250/354 [22:36<09:19,  5.38s/it][A

	loss_cls: tensor(0.5115, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1146, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6261, device='cuda:0', grad_fn=<AddBackward0>)



 71%|███████   | 251/354 [22:42<09:16,  5.40s/it][A

	loss_cls: tensor(0.4882, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0520, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5401, device='cuda:0', grad_fn=<AddBackward0>)



 71%|███████   | 252/354 [22:47<09:09,  5.39s/it][A

	loss_cls: tensor(0.4634, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2762, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7396, device='cuda:0', grad_fn=<AddBackward0>)



 71%|███████▏  | 253/354 [22:53<09:02,  5.37s/it][A

	loss_cls: tensor(0.4242, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1570, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5812, device='cuda:0', grad_fn=<AddBackward0>)



 72%|███████▏  | 254/354 [22:58<08:58,  5.39s/it][A

	loss_cls: tensor(0.5208, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1419, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6627, device='cuda:0', grad_fn=<AddBackward0>)



 72%|███████▏  | 255/354 [23:03<08:52,  5.38s/it][A

	loss_cls: tensor(0.6258, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1611, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7869, device='cuda:0', grad_fn=<AddBackward0>)



 72%|███████▏  | 256/354 [23:09<08:49,  5.41s/it][A

	loss_cls: tensor(0.6511, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2218, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8729, device='cuda:0', grad_fn=<AddBackward0>)



 73%|███████▎  | 257/354 [23:14<08:43,  5.40s/it][A

	loss_cls: tensor(0.4681, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1992, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6673, device='cuda:0', grad_fn=<AddBackward0>)



 73%|███████▎  | 258/354 [23:20<08:39,  5.41s/it][A

	loss_cls: tensor(0.7649, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1857, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9506, device='cuda:0', grad_fn=<AddBackward0>)



 73%|███████▎  | 259/354 [23:25<08:32,  5.40s/it][A

	loss_cls: tensor(0.8526, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4237, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2763, device='cuda:0', grad_fn=<AddBackward0>)



 73%|███████▎  | 260/354 [23:30<08:26,  5.38s/it][A

	loss_cls: tensor(0.2898, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0278, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.3176, device='cuda:0', grad_fn=<AddBackward0>)



 74%|███████▎  | 261/354 [23:36<08:22,  5.40s/it][A

	loss_cls: tensor(0.5955, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1304, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7259, device='cuda:0', grad_fn=<AddBackward0>)



 74%|███████▍  | 262/354 [23:41<08:16,  5.40s/it][A

	loss_cls: tensor(0.8150, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2716, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0866, device='cuda:0', grad_fn=<AddBackward0>)



 74%|███████▍  | 263/354 [23:47<08:14,  5.44s/it][A

	loss_cls: tensor(0.2975, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1131, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4105, device='cuda:0', grad_fn=<AddBackward0>)



 75%|███████▍  | 264/354 [23:52<08:11,  5.46s/it][A

	loss_cls: tensor(0.5794, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3182, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8976, device='cuda:0', grad_fn=<AddBackward0>)



 75%|███████▍  | 265/354 [23:57<08:00,  5.40s/it][A

	loss_cls: tensor(0.2230, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1400, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.3630, device='cuda:0', grad_fn=<AddBackward0>)



 75%|███████▌  | 266/354 [24:03<07:57,  5.42s/it][A

	loss_cls: tensor(0.8529, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3109, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1638, device='cuda:0', grad_fn=<AddBackward0>)



 75%|███████▌  | 267/354 [24:08<07:53,  5.45s/it][A

	loss_cls: tensor(0.3479, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0939, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4418, device='cuda:0', grad_fn=<AddBackward0>)



 76%|███████▌  | 268/354 [24:14<07:51,  5.49s/it][A

	loss_cls: tensor(0.5837, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1268, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7105, device='cuda:0', grad_fn=<AddBackward0>)



 76%|███████▌  | 269/354 [24:20<07:46,  5.49s/it][A

	loss_cls: tensor(0.5927, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2370, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8296, device='cuda:0', grad_fn=<AddBackward0>)



 76%|███████▋  | 270/354 [24:25<07:42,  5.51s/it][A

	loss_cls: tensor(0.4681, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2635, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7315, device='cuda:0', grad_fn=<AddBackward0>)



 77%|███████▋  | 271/354 [24:31<07:36,  5.49s/it][A

	loss_cls: tensor(1.0095, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3466, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.3561, device='cuda:0', grad_fn=<AddBackward0>)



 77%|███████▋  | 272/354 [24:36<07:32,  5.51s/it][A

	loss_cls: tensor(0.4921, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0962, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5883, device='cuda:0', grad_fn=<AddBackward0>)



 77%|███████▋  | 273/354 [24:42<07:25,  5.50s/it][A

	loss_cls: tensor(0.2766, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0488, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.3254, device='cuda:0', grad_fn=<AddBackward0>)



 77%|███████▋  | 274/354 [24:47<07:21,  5.51s/it][A

	loss_cls: tensor(0.3932, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0892, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4824, device='cuda:0', grad_fn=<AddBackward0>)



 78%|███████▊  | 275/354 [24:53<07:15,  5.51s/it][A

	loss_cls: tensor(0.6566, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1320, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7886, device='cuda:0', grad_fn=<AddBackward0>)



 78%|███████▊  | 276/354 [24:58<07:09,  5.50s/it][A

	loss_cls: tensor(0.4950, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1668, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6618, device='cuda:0', grad_fn=<AddBackward0>)



 78%|███████▊  | 277/354 [25:04<07:05,  5.53s/it][A

	loss_cls: tensor(0.7118, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2443, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9561, device='cuda:0', grad_fn=<AddBackward0>)



 79%|███████▊  | 278/354 [25:09<06:59,  5.52s/it][A

	loss_cls: tensor(0.7067, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2290, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9356, device='cuda:0', grad_fn=<AddBackward0>)



 79%|███████▉  | 279/354 [25:15<06:55,  5.53s/it][A

	loss_cls: tensor(0.7263, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0341, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7603, device='cuda:0', grad_fn=<AddBackward0>)



 79%|███████▉  | 280/354 [25:20<06:35,  5.34s/it][A

	loss_cls: tensor(0.6891, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2302, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9193, device='cuda:0', grad_fn=<AddBackward0>)



 79%|███████▉  | 281/354 [25:25<06:21,  5.23s/it][A

	loss_cls: tensor(0.7633, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3655, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1288, device='cuda:0', grad_fn=<AddBackward0>)



 80%|███████▉  | 282/354 [25:30<06:10,  5.14s/it][A

	loss_cls: tensor(0.6416, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3764, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0179, device='cuda:0', grad_fn=<AddBackward0>)



 80%|███████▉  | 283/354 [25:35<06:01,  5.09s/it][A

	loss_cls: tensor(0.4821, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4250, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9071, device='cuda:0', grad_fn=<AddBackward0>)



 80%|████████  | 284/354 [25:40<05:54,  5.07s/it][A

	loss_cls: tensor(0.6461, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2985, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9446, device='cuda:0', grad_fn=<AddBackward0>)



 81%|████████  | 285/354 [25:45<05:51,  5.10s/it][A

	loss_cls: tensor(0.6379, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.7330, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.3708, device='cuda:0', grad_fn=<AddBackward0>)



 81%|████████  | 286/354 [25:50<05:55,  5.22s/it][A

	loss_cls: tensor(0.6189, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3592, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9781, device='cuda:0', grad_fn=<AddBackward0>)



 81%|████████  | 287/354 [25:56<05:54,  5.30s/it][A

	loss_cls: tensor(0.7257, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0974, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8231, device='cuda:0', grad_fn=<AddBackward0>)



 81%|████████▏ | 288/354 [26:01<05:55,  5.38s/it][A

	loss_cls: tensor(0.5857, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1929, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7786, device='cuda:0', grad_fn=<AddBackward0>)



 82%|████████▏ | 289/354 [26:07<05:51,  5.41s/it][A

	loss_cls: tensor(0.5596, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1510, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7106, device='cuda:0', grad_fn=<AddBackward0>)



 82%|████████▏ | 290/354 [26:12<05:47,  5.43s/it][A

	loss_cls: tensor(0.6190, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4040, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0229, device='cuda:0', grad_fn=<AddBackward0>)



 82%|████████▏ | 291/354 [26:18<05:44,  5.46s/it][A

	loss_cls: tensor(0.6333, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1897, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8230, device='cuda:0', grad_fn=<AddBackward0>)



 82%|████████▏ | 292/354 [26:23<05:38,  5.47s/it][A

	loss_cls: tensor(0.6718, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1344, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8062, device='cuda:0', grad_fn=<AddBackward0>)



 83%|████████▎ | 293/354 [26:29<05:34,  5.49s/it][A

	loss_cls: tensor(0.4327, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2164, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6492, device='cuda:0', grad_fn=<AddBackward0>)



 83%|████████▎ | 294/354 [26:34<05:29,  5.49s/it][A

	loss_cls: tensor(0.6100, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3182, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9282, device='cuda:0', grad_fn=<AddBackward0>)



 83%|████████▎ | 295/354 [26:40<05:25,  5.51s/it][A

	loss_cls: tensor(0.5605, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1239, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6844, device='cuda:0', grad_fn=<AddBackward0>)



 84%|████████▎ | 296/354 [26:45<05:18,  5.49s/it][A

	loss_cls: tensor(0.5423, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2268, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7691, device='cuda:0', grad_fn=<AddBackward0>)



 84%|████████▍ | 297/354 [26:51<05:13,  5.50s/it][A

	loss_cls: tensor(0.5375, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2221, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7596, device='cuda:0', grad_fn=<AddBackward0>)



 84%|████████▍ | 298/354 [26:56<05:09,  5.52s/it][A

	loss_cls: tensor(0.5483, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0878, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6360, device='cuda:0', grad_fn=<AddBackward0>)



 84%|████████▍ | 299/354 [27:02<05:02,  5.50s/it][A

	loss_cls: tensor(0.4631, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1399, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6030, device='cuda:0', grad_fn=<AddBackward0>)



 85%|████████▍ | 300/354 [27:07<04:58,  5.52s/it][A

	loss_cls: tensor(0.4367, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4907, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9274, device='cuda:0', grad_fn=<AddBackward0>)



 85%|████████▌ | 301/354 [27:13<04:51,  5.51s/it][A

	loss_cls: tensor(0.5967, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2765, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8732, device='cuda:0', grad_fn=<AddBackward0>)



 85%|████████▌ | 302/354 [27:18<04:46,  5.51s/it][A

	loss_cls: tensor(0.7569, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4232, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1801, device='cuda:0', grad_fn=<AddBackward0>)



 86%|████████▌ | 303/354 [27:24<04:40,  5.51s/it][A

	loss_cls: tensor(0.6278, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2707, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8985, device='cuda:0', grad_fn=<AddBackward0>)



 86%|████████▌ | 304/354 [27:29<04:26,  5.33s/it][A

	loss_cls: tensor(0.4228, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0921, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5149, device='cuda:0', grad_fn=<AddBackward0>)



 86%|████████▌ | 305/354 [27:34<04:15,  5.22s/it][A

	loss_cls: tensor(0.5886, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3053, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8938, device='cuda:0', grad_fn=<AddBackward0>)



 86%|████████▋ | 306/354 [27:39<04:10,  5.23s/it][A

	loss_cls: tensor(0.7206, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2747, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9953, device='cuda:0', grad_fn=<AddBackward0>)



 87%|████████▋ | 307/354 [27:44<04:01,  5.14s/it][A

	loss_cls: tensor(1.0030, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2890, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2919, device='cuda:0', grad_fn=<AddBackward0>)



 87%|████████▋ | 308/354 [27:49<03:57,  5.17s/it][A

	loss_cls: tensor(0.5076, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0599, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5675, device='cuda:0', grad_fn=<AddBackward0>)



 87%|████████▋ | 309/354 [27:55<03:57,  5.28s/it][A

	loss_cls: tensor(0.5436, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3328, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8765, device='cuda:0', grad_fn=<AddBackward0>)



 88%|████████▊ | 310/354 [28:00<03:55,  5.34s/it][A

	loss_cls: tensor(0.8342, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1430, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9772, device='cuda:0', grad_fn=<AddBackward0>)



 88%|████████▊ | 311/354 [28:06<03:51,  5.38s/it][A

	loss_cls: tensor(0.5876, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2565, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8441, device='cuda:0', grad_fn=<AddBackward0>)



 88%|████████▊ | 312/354 [28:11<03:48,  5.44s/it][A

	loss_cls: tensor(0.9765, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2038, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1803, device='cuda:0', grad_fn=<AddBackward0>)



 88%|████████▊ | 313/354 [28:17<03:43,  5.45s/it][A

	loss_cls: tensor(0.3953, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3409, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7362, device='cuda:0', grad_fn=<AddBackward0>)



 89%|████████▊ | 314/354 [28:22<03:38,  5.47s/it][A

	loss_cls: tensor(0.4560, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5674, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0235, device='cuda:0', grad_fn=<AddBackward0>)



 89%|████████▉ | 315/354 [28:28<03:33,  5.47s/it][A

	loss_cls: tensor(0.5849, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3041, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8890, device='cuda:0', grad_fn=<AddBackward0>)



 89%|████████▉ | 316/354 [28:33<03:29,  5.50s/it][A

	loss_cls: tensor(0.8941, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3327, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2268, device='cuda:0', grad_fn=<AddBackward0>)



 90%|████████▉ | 317/354 [28:39<03:23,  5.50s/it][A

	loss_cls: tensor(0.5560, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2875, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8435, device='cuda:0', grad_fn=<AddBackward0>)



 90%|████████▉ | 318/354 [28:44<03:17,  5.48s/it][A

	loss_cls: tensor(0.8170, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1956, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0125, device='cuda:0', grad_fn=<AddBackward0>)



 90%|█████████ | 319/354 [28:50<03:12,  5.51s/it][A

	loss_cls: tensor(0.7963, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2208, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0171, device='cuda:0', grad_fn=<AddBackward0>)



 90%|█████████ | 320/354 [28:55<03:06,  5.49s/it][A

	loss_cls: tensor(0.5063, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2003, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7066, device='cuda:0', grad_fn=<AddBackward0>)



 91%|█████████ | 321/354 [29:01<03:02,  5.52s/it][A

	loss_cls: tensor(0.7069, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1372, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8441, device='cuda:0', grad_fn=<AddBackward0>)



 91%|█████████ | 322/354 [29:06<02:56,  5.51s/it][A

	loss_cls: tensor(0.6506, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1644, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8151, device='cuda:0', grad_fn=<AddBackward0>)



 91%|█████████ | 323/354 [29:12<02:51,  5.53s/it][A

	loss_cls: tensor(0.6384, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1455, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7839, device='cuda:0', grad_fn=<AddBackward0>)



 92%|█████████▏| 324/354 [29:17<02:45,  5.51s/it][A

	loss_cls: tensor(0.4465, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3434, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7899, device='cuda:0', grad_fn=<AddBackward0>)



 92%|█████████▏| 325/354 [29:23<02:39,  5.50s/it][A

	loss_cls: tensor(0.6903, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3164, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0068, device='cuda:0', grad_fn=<AddBackward0>)



 92%|█████████▏| 326/354 [29:28<02:34,  5.51s/it][A

	loss_cls: tensor(0.5788, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1089, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6877, device='cuda:0', grad_fn=<AddBackward0>)



 92%|█████████▏| 327/354 [29:34<02:28,  5.49s/it][A

	loss_cls: tensor(0.5172, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3995, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9167, device='cuda:0', grad_fn=<AddBackward0>)



 93%|█████████▎| 328/354 [29:39<02:23,  5.52s/it][A

	loss_cls: tensor(0.7124, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0722, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7846, device='cuda:0', grad_fn=<AddBackward0>)



 93%|█████████▎| 329/354 [29:45<02:17,  5.51s/it][A

	loss_cls: tensor(0.6035, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1647, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7682, device='cuda:0', grad_fn=<AddBackward0>)



 93%|█████████▎| 330/354 [29:50<02:12,  5.51s/it][A

	loss_cls: tensor(0.6055, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5540, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1594, device='cuda:0', grad_fn=<AddBackward0>)



 94%|█████████▎| 331/354 [29:56<02:06,  5.49s/it][A

	loss_cls: tensor(0.8494, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2864, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1358, device='cuda:0', grad_fn=<AddBackward0>)



 94%|█████████▍| 332/354 [30:01<02:00,  5.49s/it][A

	loss_cls: tensor(0.5885, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3080, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8965, device='cuda:0', grad_fn=<AddBackward0>)



 94%|█████████▍| 333/354 [30:07<01:55,  5.50s/it][A

	loss_cls: tensor(0.6320, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1249, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7569, device='cuda:0', grad_fn=<AddBackward0>)



 94%|█████████▍| 334/354 [30:12<01:46,  5.32s/it][A

	loss_cls: tensor(0.6565, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0883, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7447, device='cuda:0', grad_fn=<AddBackward0>)



 95%|█████████▍| 335/354 [30:17<01:41,  5.34s/it][A

	loss_cls: tensor(0.5855, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3189, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9044, device='cuda:0', grad_fn=<AddBackward0>)



 95%|█████████▍| 336/354 [30:23<01:36,  5.39s/it][A

	loss_cls: tensor(0.6786, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1880, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8666, device='cuda:0', grad_fn=<AddBackward0>)



 95%|█████████▌| 337/354 [30:28<01:32,  5.45s/it][A

	loss_cls: tensor(0.6723, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0569, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7292, device='cuda:0', grad_fn=<AddBackward0>)



 95%|█████████▌| 338/354 [30:33<01:24,  5.29s/it][A

	loss_cls: tensor(0.6658, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0613, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7271, device='cuda:0', grad_fn=<AddBackward0>)



 96%|█████████▌| 339/354 [30:39<01:19,  5.33s/it][A

	loss_cls: tensor(0.6984, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4271, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1255, device='cuda:0', grad_fn=<AddBackward0>)



 96%|█████████▌| 340/354 [30:44<01:15,  5.39s/it][A

	loss_cls: tensor(0.5499, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1314, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6812, device='cuda:0', grad_fn=<AddBackward0>)



 96%|█████████▋| 341/354 [30:50<01:10,  5.43s/it][A

	loss_cls: tensor(0.3910, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2407, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6317, device='cuda:0', grad_fn=<AddBackward0>)



 97%|█████████▋| 342/354 [30:55<01:05,  5.47s/it][A

	loss_cls: tensor(0.7073, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1567, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8640, device='cuda:0', grad_fn=<AddBackward0>)



 97%|█████████▋| 343/354 [31:01<01:00,  5.47s/it][A

	loss_cls: tensor(0.5267, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2945, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8212, device='cuda:0', grad_fn=<AddBackward0>)



 97%|█████████▋| 344/354 [31:06<00:54,  5.49s/it][A

	loss_cls: tensor(0.6527, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1478, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8004, device='cuda:0', grad_fn=<AddBackward0>)



 97%|█████████▋| 345/354 [31:12<00:49,  5.48s/it][A

	loss_cls: tensor(0.8338, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2913, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1251, device='cuda:0', grad_fn=<AddBackward0>)



 98%|█████████▊| 346/354 [31:17<00:43,  5.46s/it][A

	loss_cls: tensor(0.7019, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5802, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2820, device='cuda:0', grad_fn=<AddBackward0>)



 98%|█████████▊| 347/354 [31:23<00:38,  5.48s/it][A

	loss_cls: tensor(0.5601, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2082, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7683, device='cuda:0', grad_fn=<AddBackward0>)



 98%|█████████▊| 348/354 [31:28<00:32,  5.48s/it][A

	loss_cls: tensor(0.3615, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0432, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4047, device='cuda:0', grad_fn=<AddBackward0>)



 99%|█████████▊| 349/354 [31:34<00:27,  5.51s/it][A

	loss_cls: tensor(0.5695, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3858, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9553, device='cuda:0', grad_fn=<AddBackward0>)



 99%|█████████▉| 350/354 [31:39<00:21,  5.50s/it][A

	loss_cls: tensor(0.4227, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1528, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5755, device='cuda:0', grad_fn=<AddBackward0>)



 99%|█████████▉| 351/354 [31:45<00:16,  5.52s/it][A

	loss_cls: tensor(0.5926, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3072, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8999, device='cuda:0', grad_fn=<AddBackward0>)



 99%|█████████▉| 352/354 [31:50<00:10,  5.48s/it][A

	loss_cls: tensor(0.6711, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2881, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9592, device='cuda:0', grad_fn=<AddBackward0>)



100%|█████████▉| 353/354 [31:56<00:05,  5.46s/it][A

	loss_cls: tensor(0.3306, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1890, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5196, device='cuda:0', grad_fn=<AddBackward0>)



100%|██████████| 354/354 [31:58<00:00,  5.42s/it][A

  0%|          | 0/177 [00:00<?, ?it/s][A

	loss_cls: tensor(0.8121, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0700, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8821, device='cuda:0', grad_fn=<AddBackward0>)

	Training Loss: 0.8337691394285968

	Training cls acc: 0.7002118644067796

	Training cls prec: 0.5787365510352798

	Training cls rec: 0.6106393665186037

	Training cls f1: 0.5383583532321501

--
	Training ner acc: 0.9552632714939859

	Training ner prec: 0.27325038211464703

	Training ner rec: 0.28085979717807835

	Training ner f1: 0.276448095358598

	Current Learning rate:  0.0005142857142857142



  1%|          | 1/177 [00:00<02:10,  1.35it/s][A
  1%|          | 2/177 [00:01<01:59,  1.47it/s][A
  2%|▏         | 3/177 [00:02<02:01,  1.43it/s][A
  2%|▏         | 4/177 [00:02<02:03,  1.41it/s][A
  3%|▎         | 5/177 [00:03<01:57,  1.46it/s][A
  3%|▎         | 6/177 [00:04<01:58,  1.44it/s][A
  4%|▍         | 7/177 [00:04<01:59,  1.42it/s][A
  5%|▍         | 8/177 [00:05<02:00,  1.40it/s][A
  5%|▌         | 9/177 [00:06<01:55,  1.45it/s][A
  6%|▌         | 10/177 [00:07<01:57,  1.43it/s][A
  6%|▌         | 11/177 [00:07<01:57,  1.41it/s][A
  7%|▋         | 12/177 [00:08<01:57,  1.40it/s][A
  7%|▋         | 13/177 [00:09<01:53,  1.44it/s][A
  8%|▊         | 14/177 [00:09<01:54,  1.42it/s][A
  8%|▊         | 15/177 [00:10<01:54,  1.41it/s][A
  9%|▉         | 16/177 [00:11<01:50,  1.45it/s][A
 10%|▉         | 17/177 [00:11<01:51,  1.44it/s][A
 10%|█         | 18/177 [00:12<01:51,  1.42it/s][A
 11%|█         | 19/177 [00:13<01:52,  1.41it/s][A
 11%|█▏        | 20/


	Validation Loss: 0.7771722736668452

	Validation cls acc: 0.7547080979284368

	Validation cls prec: 0.6298829701372075

	Validation cls rec: 0.6263451708366963

	Validation cls f1: 0.5997437904217565

--
	Validation ner acc: 0.9543580904167602

	Validation ner prec: 0.39797652574689873

	Validation ner rec: 0.40856873822975526

	Validation ner f1: 0.4030594936934512



  0%|          | 1/354 [00:05<32:07,  5.46s/it][A

	loss_cls: tensor(0.4832, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2087, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6920, device='cuda:0', grad_fn=<AddBackward0>)



  1%|          | 2/354 [00:10<31:29,  5.37s/it][A

	loss_cls: tensor(0.6914, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2157, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9071, device='cuda:0', grad_fn=<AddBackward0>)



  1%|          | 3/354 [00:16<31:23,  5.37s/it][A

	loss_cls: tensor(0.3998, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1323, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5321, device='cuda:0', grad_fn=<AddBackward0>)



  1%|          | 4/354 [00:21<31:15,  5.36s/it][A

	loss_cls: tensor(0.7280, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2436, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9715, device='cuda:0', grad_fn=<AddBackward0>)



  1%|▏         | 5/354 [00:26<31:13,  5.37s/it][A

	loss_cls: tensor(0.3733, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3467, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7200, device='cuda:0', grad_fn=<AddBackward0>)



  2%|▏         | 6/354 [00:32<31:04,  5.36s/it][A

	loss_cls: tensor(0.5431, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2463, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7894, device='cuda:0', grad_fn=<AddBackward0>)



  2%|▏         | 7/354 [00:37<30:55,  5.35s/it][A

	loss_cls: tensor(0.3485, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0703, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4188, device='cuda:0', grad_fn=<AddBackward0>)



  2%|▏         | 8/354 [00:42<30:57,  5.37s/it][A

	loss_cls: tensor(0.4852, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4216, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9068, device='cuda:0', grad_fn=<AddBackward0>)



  3%|▎         | 9/354 [00:48<30:43,  5.34s/it][A

	loss_cls: tensor(0.4422, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1758, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6180, device='cuda:0', grad_fn=<AddBackward0>)



  3%|▎         | 10/354 [00:53<30:44,  5.36s/it][A

	loss_cls: tensor(0.6442, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0828, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7270, device='cuda:0', grad_fn=<AddBackward0>)



  3%|▎         | 11/354 [00:58<30:34,  5.35s/it][A

	loss_cls: tensor(0.4213, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2757, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6970, device='cuda:0', grad_fn=<AddBackward0>)



  3%|▎         | 12/354 [01:04<30:33,  5.36s/it][A

	loss_cls: tensor(0.8748, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1310, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0058, device='cuda:0', grad_fn=<AddBackward0>)



  4%|▎         | 13/354 [01:09<30:25,  5.35s/it][A

	loss_cls: tensor(0.3085, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0924, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4009, device='cuda:0', grad_fn=<AddBackward0>)



  4%|▍         | 14/354 [01:14<30:13,  5.33s/it][A

	loss_cls: tensor(0.4222, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1900, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6121, device='cuda:0', grad_fn=<AddBackward0>)



  4%|▍         | 15/354 [01:20<30:13,  5.35s/it][A

	loss_cls: tensor(1.0427, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2371, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2797, device='cuda:0', grad_fn=<AddBackward0>)



  5%|▍         | 16/354 [01:25<30:01,  5.33s/it][A

	loss_cls: tensor(0.9646, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3542, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.3188, device='cuda:0', grad_fn=<AddBackward0>)



  5%|▍         | 17/354 [01:31<30:04,  5.36s/it][A

	loss_cls: tensor(0.5283, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1615, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6898, device='cuda:0', grad_fn=<AddBackward0>)



  5%|▌         | 18/354 [01:36<29:55,  5.34s/it][A

	loss_cls: tensor(0.5442, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2333, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7774, device='cuda:0', grad_fn=<AddBackward0>)



  5%|▌         | 19/354 [01:41<29:56,  5.36s/it][A

	loss_cls: tensor(0.7165, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4574, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1739, device='cuda:0', grad_fn=<AddBackward0>)



  6%|▌         | 20/354 [01:47<29:45,  5.35s/it][A

	loss_cls: tensor(0.4903, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0722, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5625, device='cuda:0', grad_fn=<AddBackward0>)



  6%|▌         | 21/354 [01:52<29:36,  5.34s/it][A

	loss_cls: tensor(0.6435, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2335, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8770, device='cuda:0', grad_fn=<AddBackward0>)



  6%|▌         | 22/354 [01:57<29:37,  5.35s/it][A

	loss_cls: tensor(0.7862, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2933, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0795, device='cuda:0', grad_fn=<AddBackward0>)



  6%|▋         | 23/354 [02:03<29:37,  5.37s/it][A

	loss_cls: tensor(0.3626, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1870, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5496, device='cuda:0', grad_fn=<AddBackward0>)



  7%|▋         | 24/354 [02:08<29:38,  5.39s/it][A

	loss_cls: tensor(0.6463, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2905, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9367, device='cuda:0', grad_fn=<AddBackward0>)



  7%|▋         | 25/354 [02:13<29:26,  5.37s/it][A

	loss_cls: tensor(0.4881, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2611, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7493, device='cuda:0', grad_fn=<AddBackward0>)



  7%|▋         | 26/354 [02:19<29:22,  5.37s/it][A

	loss_cls: tensor(0.6215, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1294, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7508, device='cuda:0', grad_fn=<AddBackward0>)



  8%|▊         | 27/354 [02:24<29:10,  5.35s/it][A

	loss_cls: tensor(0.5948, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1544, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7492, device='cuda:0', grad_fn=<AddBackward0>)



  8%|▊         | 28/354 [02:29<29:00,  5.34s/it][A

	loss_cls: tensor(0.7652, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0718, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8370, device='cuda:0', grad_fn=<AddBackward0>)



  8%|▊         | 29/354 [02:35<28:59,  5.35s/it][A

	loss_cls: tensor(0.4718, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4412, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9130, device='cuda:0', grad_fn=<AddBackward0>)



  8%|▊         | 30/354 [02:40<28:49,  5.34s/it][A

	loss_cls: tensor(0.7363, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0525, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7888, device='cuda:0', grad_fn=<AddBackward0>)



  9%|▉         | 31/354 [02:46<28:50,  5.36s/it][A

	loss_cls: tensor(0.8110, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1310, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9420, device='cuda:0', grad_fn=<AddBackward0>)



  9%|▉         | 32/354 [02:51<28:41,  5.35s/it][A

	loss_cls: tensor(0.7111, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0455, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7566, device='cuda:0', grad_fn=<AddBackward0>)



  9%|▉         | 33/354 [02:56<28:41,  5.36s/it][A

	loss_cls: tensor(0.6438, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0405, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6843, device='cuda:0', grad_fn=<AddBackward0>)



 10%|▉         | 34/354 [03:02<28:30,  5.35s/it][A

	loss_cls: tensor(0.7942, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1451, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9393, device='cuda:0', grad_fn=<AddBackward0>)



 10%|▉         | 35/354 [03:07<28:21,  5.33s/it][A

	loss_cls: tensor(0.4459, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2831, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7290, device='cuda:0', grad_fn=<AddBackward0>)



 10%|█         | 36/354 [03:12<28:19,  5.34s/it][A

	loss_cls: tensor(0.7020, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3841, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0862, device='cuda:0', grad_fn=<AddBackward0>)



 10%|█         | 37/354 [03:18<28:10,  5.33s/it][A

	loss_cls: tensor(0.4948, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2632, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7579, device='cuda:0', grad_fn=<AddBackward0>)



 11%|█         | 38/354 [03:23<28:12,  5.35s/it][A

	loss_cls: tensor(0.5672, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4265, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9937, device='cuda:0', grad_fn=<AddBackward0>)



 11%|█         | 39/354 [03:28<28:02,  5.34s/it][A

	loss_cls: tensor(0.8348, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5811, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.4158, device='cuda:0', grad_fn=<AddBackward0>)



 11%|█▏        | 40/354 [03:34<28:00,  5.35s/it][A

	loss_cls: tensor(0.8657, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4460, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.3118, device='cuda:0', grad_fn=<AddBackward0>)



 12%|█▏        | 41/354 [03:39<27:50,  5.34s/it][A

	loss_cls: tensor(0.7703, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1588, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9292, device='cuda:0', grad_fn=<AddBackward0>)



 12%|█▏        | 42/354 [03:44<27:39,  5.32s/it][A

	loss_cls: tensor(0.8920, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2575, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1496, device='cuda:0', grad_fn=<AddBackward0>)



 12%|█▏        | 43/354 [03:50<27:40,  5.34s/it][A

	loss_cls: tensor(0.5426, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2141, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7567, device='cuda:0', grad_fn=<AddBackward0>)



 12%|█▏        | 44/354 [03:55<27:32,  5.33s/it][A

	loss_cls: tensor(0.4267, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1408, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5675, device='cuda:0', grad_fn=<AddBackward0>)



 13%|█▎        | 45/354 [04:00<27:34,  5.35s/it][A

	loss_cls: tensor(0.6107, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4337, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0444, device='cuda:0', grad_fn=<AddBackward0>)



 13%|█▎        | 46/354 [04:06<27:29,  5.36s/it][A

	loss_cls: tensor(0.4831, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2519, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7350, device='cuda:0', grad_fn=<AddBackward0>)



 13%|█▎        | 47/354 [04:11<27:31,  5.38s/it][A

	loss_cls: tensor(0.5958, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1896, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7854, device='cuda:0', grad_fn=<AddBackward0>)



 14%|█▎        | 48/354 [04:16<27:18,  5.36s/it][A

	loss_cls: tensor(1.0895, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1523, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2417, device='cuda:0', grad_fn=<AddBackward0>)



 14%|█▍        | 49/354 [04:22<27:08,  5.34s/it][A

	loss_cls: tensor(0.6057, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2507, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8564, device='cuda:0', grad_fn=<AddBackward0>)



 14%|█▍        | 50/354 [04:27<27:09,  5.36s/it][A

	loss_cls: tensor(0.5069, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0716, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5785, device='cuda:0', grad_fn=<AddBackward0>)



 14%|█▍        | 51/354 [04:32<27:02,  5.35s/it][A

	loss_cls: tensor(0.5614, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3930, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9544, device='cuda:0', grad_fn=<AddBackward0>)



 15%|█▍        | 52/354 [04:38<27:04,  5.38s/it][A

	loss_cls: tensor(0.6220, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3319, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9539, device='cuda:0', grad_fn=<AddBackward0>)



 15%|█▍        | 53/354 [04:43<26:53,  5.36s/it][A

	loss_cls: tensor(0.5080, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1232, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6311, device='cuda:0', grad_fn=<AddBackward0>)



 15%|█▌        | 54/354 [04:49<26:51,  5.37s/it][A

	loss_cls: tensor(0.6154, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1051, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7205, device='cuda:0', grad_fn=<AddBackward0>)



 16%|█▌        | 55/354 [04:54<26:39,  5.35s/it][A

	loss_cls: tensor(0.6370, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1574, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7944, device='cuda:0', grad_fn=<AddBackward0>)



 16%|█▌        | 56/354 [04:59<26:30,  5.34s/it][A

	loss_cls: tensor(0.4026, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4301, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8327, device='cuda:0', grad_fn=<AddBackward0>)



 16%|█▌        | 57/354 [05:05<26:30,  5.35s/it][A

	loss_cls: tensor(0.7467, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3998, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1465, device='cuda:0', grad_fn=<AddBackward0>)



 16%|█▋        | 58/354 [05:10<26:21,  5.34s/it][A

	loss_cls: tensor(0.5324, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3093, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8418, device='cuda:0', grad_fn=<AddBackward0>)



 17%|█▋        | 59/354 [05:15<26:21,  5.36s/it][A

	loss_cls: tensor(0.6055, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2193, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8248, device='cuda:0', grad_fn=<AddBackward0>)



 17%|█▋        | 60/354 [05:21<26:11,  5.35s/it][A

	loss_cls: tensor(0.4273, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1200, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5473, device='cuda:0', grad_fn=<AddBackward0>)



 17%|█▋        | 61/354 [05:26<26:09,  5.36s/it][A

	loss_cls: tensor(0.7479, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2960, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0439, device='cuda:0', grad_fn=<AddBackward0>)



 18%|█▊        | 62/354 [05:31<25:59,  5.34s/it][A

	loss_cls: tensor(0.5012, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1451, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6462, device='cuda:0', grad_fn=<AddBackward0>)



 18%|█▊        | 63/354 [05:37<25:49,  5.32s/it][A

	loss_cls: tensor(0.5234, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1575, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6809, device='cuda:0', grad_fn=<AddBackward0>)



 18%|█▊        | 64/354 [05:42<25:49,  5.34s/it][A

	loss_cls: tensor(0.5431, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4565, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9996, device='cuda:0', grad_fn=<AddBackward0>)



 18%|█▊        | 65/354 [05:47<25:40,  5.33s/it][A

	loss_cls: tensor(0.5108, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3543, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8651, device='cuda:0', grad_fn=<AddBackward0>)



 19%|█▊        | 66/354 [05:53<25:42,  5.35s/it][A

	loss_cls: tensor(0.6491, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3071, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9561, device='cuda:0', grad_fn=<AddBackward0>)



 19%|█▉        | 67/354 [05:58<25:33,  5.34s/it][A

	loss_cls: tensor(0.8608, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4797, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.3405, device='cuda:0', grad_fn=<AddBackward0>)



 19%|█▉        | 68/354 [06:03<25:30,  5.35s/it][A

	loss_cls: tensor(0.4203, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3428, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7631, device='cuda:0', grad_fn=<AddBackward0>)



 19%|█▉        | 69/354 [06:09<25:20,  5.34s/it][A

	loss_cls: tensor(0.4461, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2755, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7216, device='cuda:0', grad_fn=<AddBackward0>)



 20%|█▉        | 70/354 [06:14<25:18,  5.35s/it][A

	loss_cls: tensor(0.6132, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4222, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0354, device='cuda:0', grad_fn=<AddBackward0>)



 20%|██        | 71/354 [06:19<25:13,  5.35s/it][A

	loss_cls: tensor(0.5852, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2080, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7932, device='cuda:0', grad_fn=<AddBackward0>)



 20%|██        | 72/354 [06:25<25:06,  5.34s/it][A

	loss_cls: tensor(0.6119, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1885, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8003, device='cuda:0', grad_fn=<AddBackward0>)



 21%|██        | 73/354 [06:30<25:06,  5.36s/it][A

	loss_cls: tensor(0.6336, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2621, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8957, device='cuda:0', grad_fn=<AddBackward0>)



 21%|██        | 74/354 [06:35<24:57,  5.35s/it][A

	loss_cls: tensor(0.7337, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3079, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0416, device='cuda:0', grad_fn=<AddBackward0>)



 21%|██        | 75/354 [06:41<24:56,  5.36s/it][A

	loss_cls: tensor(0.5307, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2346, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7652, device='cuda:0', grad_fn=<AddBackward0>)



 21%|██▏       | 76/354 [06:46<24:45,  5.34s/it][A

	loss_cls: tensor(0.7556, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1865, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9421, device='cuda:0', grad_fn=<AddBackward0>)



 22%|██▏       | 77/354 [06:52<24:44,  5.36s/it][A

	loss_cls: tensor(0.6792, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0689, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7481, device='cuda:0', grad_fn=<AddBackward0>)



 22%|██▏       | 78/354 [06:57<24:34,  5.34s/it][A

	loss_cls: tensor(0.5739, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.6043, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1783, device='cuda:0', grad_fn=<AddBackward0>)



 22%|██▏       | 79/354 [07:02<24:25,  5.33s/it][A

	loss_cls: tensor(0.7573, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3086, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0658, device='cuda:0', grad_fn=<AddBackward0>)



 23%|██▎       | 80/354 [07:08<24:25,  5.35s/it][A

	loss_cls: tensor(0.6179, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0550, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6729, device='cuda:0', grad_fn=<AddBackward0>)



 23%|██▎       | 81/354 [07:13<24:16,  5.34s/it][A

	loss_cls: tensor(0.5568, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2304, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7872, device='cuda:0', grad_fn=<AddBackward0>)



 23%|██▎       | 82/354 [07:18<24:14,  5.35s/it][A

	loss_cls: tensor(0.6033, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1818, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7851, device='cuda:0', grad_fn=<AddBackward0>)



 23%|██▎       | 83/354 [07:24<24:04,  5.33s/it][A

	loss_cls: tensor(0.4736, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3545, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8281, device='cuda:0', grad_fn=<AddBackward0>)



 24%|██▎       | 84/354 [07:29<24:04,  5.35s/it][A

	loss_cls: tensor(0.5238, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1816, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7054, device='cuda:0', grad_fn=<AddBackward0>)



 24%|██▍       | 85/354 [07:34<23:56,  5.34s/it][A

	loss_cls: tensor(0.8422, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4126, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2547, device='cuda:0', grad_fn=<AddBackward0>)



 24%|██▍       | 86/354 [07:40<23:48,  5.33s/it][A

	loss_cls: tensor(0.4537, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3996, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8533, device='cuda:0', grad_fn=<AddBackward0>)



 25%|██▍       | 87/354 [07:45<23:47,  5.35s/it][A

	loss_cls: tensor(0.4799, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1780, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6579, device='cuda:0', grad_fn=<AddBackward0>)



 25%|██▍       | 88/354 [07:50<23:37,  5.33s/it][A

	loss_cls: tensor(0.8486, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1601, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0086, device='cuda:0', grad_fn=<AddBackward0>)



 25%|██▌       | 89/354 [07:56<23:37,  5.35s/it][A

	loss_cls: tensor(0.6563, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2987, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9550, device='cuda:0', grad_fn=<AddBackward0>)



 25%|██▌       | 90/354 [08:01<23:28,  5.34s/it][A

	loss_cls: tensor(0.4674, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2091, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6765, device='cuda:0', grad_fn=<AddBackward0>)



 26%|██▌       | 91/354 [08:06<23:28,  5.36s/it][A

	loss_cls: tensor(0.3660, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2216, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5876, device='cuda:0', grad_fn=<AddBackward0>)



 26%|██▌       | 92/354 [08:12<23:23,  5.36s/it][A

	loss_cls: tensor(0.4509, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1039, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5548, device='cuda:0', grad_fn=<AddBackward0>)



 26%|██▋       | 93/354 [08:17<23:17,  5.35s/it][A

	loss_cls: tensor(0.9944, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2085, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2028, device='cuda:0', grad_fn=<AddBackward0>)



 27%|██▋       | 94/354 [08:22<23:14,  5.36s/it][A

	loss_cls: tensor(0.7409, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2046, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9455, device='cuda:0', grad_fn=<AddBackward0>)



 27%|██▋       | 95/354 [08:28<23:05,  5.35s/it][A

	loss_cls: tensor(0.5977, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3695, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9672, device='cuda:0', grad_fn=<AddBackward0>)



 27%|██▋       | 96/354 [08:33<23:02,  5.36s/it][A

	loss_cls: tensor(0.8718, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2517, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1235, device='cuda:0', grad_fn=<AddBackward0>)



 27%|██▋       | 97/354 [08:38<22:53,  5.35s/it][A

	loss_cls: tensor(0.5372, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2978, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8350, device='cuda:0', grad_fn=<AddBackward0>)



 28%|██▊       | 98/354 [08:44<22:53,  5.37s/it][A

	loss_cls: tensor(0.5439, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0820, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6259, device='cuda:0', grad_fn=<AddBackward0>)



 28%|██▊       | 99/354 [08:49<22:43,  5.35s/it][A

	loss_cls: tensor(0.3981, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3495, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7475, device='cuda:0', grad_fn=<AddBackward0>)



 28%|██▊       | 100/354 [08:54<22:35,  5.34s/it][A

	loss_cls: tensor(0.6621, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1147, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7767, device='cuda:0', grad_fn=<AddBackward0>)



 29%|██▊       | 101/354 [09:00<22:33,  5.35s/it][A

	loss_cls: tensor(0.7403, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1993, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9397, device='cuda:0', grad_fn=<AddBackward0>)



 29%|██▉       | 102/354 [09:05<22:25,  5.34s/it][A

	loss_cls: tensor(0.5594, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1876, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7471, device='cuda:0', grad_fn=<AddBackward0>)



 29%|██▉       | 103/354 [09:11<22:23,  5.35s/it][A

	loss_cls: tensor(0.4264, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3224, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7488, device='cuda:0', grad_fn=<AddBackward0>)



 29%|██▉       | 104/354 [09:16<22:16,  5.35s/it][A

	loss_cls: tensor(0.6950, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0475, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7425, device='cuda:0', grad_fn=<AddBackward0>)



 30%|██▉       | 105/354 [09:21<22:15,  5.36s/it][A

	loss_cls: tensor(0.5382, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2653, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8034, device='cuda:0', grad_fn=<AddBackward0>)



 30%|██▉       | 106/354 [09:27<22:06,  5.35s/it][A

	loss_cls: tensor(0.5758, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0723, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6481, device='cuda:0', grad_fn=<AddBackward0>)



 30%|███       | 107/354 [09:32<21:57,  5.34s/it][A

	loss_cls: tensor(0.5279, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0421, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5700, device='cuda:0', grad_fn=<AddBackward0>)



 31%|███       | 108/354 [09:37<21:57,  5.35s/it][A

	loss_cls: tensor(0.5926, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.6784, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2710, device='cuda:0', grad_fn=<AddBackward0>)



 31%|███       | 109/354 [09:43<21:48,  5.34s/it][A

	loss_cls: tensor(0.8009, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3423, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1432, device='cuda:0', grad_fn=<AddBackward0>)



 31%|███       | 110/354 [09:48<21:46,  5.36s/it][A

	loss_cls: tensor(0.4976, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1056, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6033, device='cuda:0', grad_fn=<AddBackward0>)



 31%|███▏      | 111/354 [09:53<21:38,  5.34s/it][A

	loss_cls: tensor(0.4774, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0414, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5188, device='cuda:0', grad_fn=<AddBackward0>)



 32%|███▏      | 112/354 [09:59<21:36,  5.36s/it][A

	loss_cls: tensor(0.5473, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3677, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9150, device='cuda:0', grad_fn=<AddBackward0>)



 32%|███▏      | 113/354 [10:04<21:28,  5.34s/it][A

	loss_cls: tensor(0.6183, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1708, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7891, device='cuda:0', grad_fn=<AddBackward0>)



 32%|███▏      | 114/354 [10:09<21:19,  5.33s/it][A

	loss_cls: tensor(0.8408, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0757, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9165, device='cuda:0', grad_fn=<AddBackward0>)



 32%|███▏      | 115/354 [10:15<21:16,  5.34s/it][A

	loss_cls: tensor(0.5385, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1054, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6439, device='cuda:0', grad_fn=<AddBackward0>)



 33%|███▎      | 116/354 [10:20<21:08,  5.33s/it][A

	loss_cls: tensor(0.5349, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1482, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6831, device='cuda:0', grad_fn=<AddBackward0>)



 33%|███▎      | 117/354 [10:25<21:08,  5.35s/it][A

	loss_cls: tensor(0.7218, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1702, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8920, device='cuda:0', grad_fn=<AddBackward0>)



 33%|███▎      | 118/354 [10:31<20:59,  5.34s/it][A

	loss_cls: tensor(0.5760, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2204, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7964, device='cuda:0', grad_fn=<AddBackward0>)



 34%|███▎      | 119/354 [10:36<20:57,  5.35s/it][A

	loss_cls: tensor(0.5135, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0852, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5987, device='cuda:0', grad_fn=<AddBackward0>)



 34%|███▍      | 120/354 [10:41<20:49,  5.34s/it][A

	loss_cls: tensor(0.8552, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2410, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0962, device='cuda:0', grad_fn=<AddBackward0>)



 34%|███▍      | 121/354 [10:47<20:41,  5.33s/it][A

	loss_cls: tensor(0.6460, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4345, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0805, device='cuda:0', grad_fn=<AddBackward0>)



 34%|███▍      | 122/354 [10:52<20:39,  5.34s/it][A

	loss_cls: tensor(0.6253, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3235, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9487, device='cuda:0', grad_fn=<AddBackward0>)



 35%|███▍      | 123/354 [10:57<20:30,  5.33s/it][A

	loss_cls: tensor(0.7622, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3476, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1097, device='cuda:0', grad_fn=<AddBackward0>)



 35%|███▌      | 124/354 [11:03<20:30,  5.35s/it][A

	loss_cls: tensor(0.5707, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2278, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7985, device='cuda:0', grad_fn=<AddBackward0>)



 35%|███▌      | 125/354 [11:08<20:22,  5.34s/it][A

	loss_cls: tensor(0.6581, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1937, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8518, device='cuda:0', grad_fn=<AddBackward0>)



 36%|███▌      | 126/354 [11:13<20:20,  5.35s/it][A

	loss_cls: tensor(0.4661, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1543, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6204, device='cuda:0', grad_fn=<AddBackward0>)



 36%|███▌      | 127/354 [11:19<20:12,  5.34s/it][A

	loss_cls: tensor(0.6235, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1205, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7440, device='cuda:0', grad_fn=<AddBackward0>)



 36%|███▌      | 128/354 [11:24<20:05,  5.33s/it][A

	loss_cls: tensor(0.5303, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3734, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9037, device='cuda:0', grad_fn=<AddBackward0>)



 36%|███▋      | 129/354 [11:29<20:02,  5.34s/it][A

	loss_cls: tensor(0.5239, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1233, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6471, device='cuda:0', grad_fn=<AddBackward0>)



 37%|███▋      | 130/354 [11:35<19:53,  5.33s/it][A

	loss_cls: tensor(0.6481, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0939, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7420, device='cuda:0', grad_fn=<AddBackward0>)



 37%|███▋      | 131/354 [11:40<19:54,  5.36s/it][A

	loss_cls: tensor(0.6280, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1541, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7821, device='cuda:0', grad_fn=<AddBackward0>)



 37%|███▋      | 132/354 [11:45<19:44,  5.34s/it][A

	loss_cls: tensor(0.8295, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1190, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9485, device='cuda:0', grad_fn=<AddBackward0>)



 38%|███▊      | 133/354 [11:51<19:44,  5.36s/it][A

	loss_cls: tensor(0.5520, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1824, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7344, device='cuda:0', grad_fn=<AddBackward0>)



 38%|███▊      | 134/354 [11:56<19:33,  5.33s/it][A

	loss_cls: tensor(0.5255, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3302, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8557, device='cuda:0', grad_fn=<AddBackward0>)



 38%|███▊      | 135/354 [12:01<19:25,  5.32s/it][A

	loss_cls: tensor(0.8180, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2408, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0588, device='cuda:0', grad_fn=<AddBackward0>)



 38%|███▊      | 136/354 [12:07<19:24,  5.34s/it][A

	loss_cls: tensor(0.7758, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2967, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0725, device='cuda:0', grad_fn=<AddBackward0>)



 39%|███▊      | 137/354 [12:12<19:17,  5.33s/it][A

	loss_cls: tensor(0.5817, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1664, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7481, device='cuda:0', grad_fn=<AddBackward0>)



 39%|███▉      | 138/354 [12:18<19:15,  5.35s/it][A

	loss_cls: tensor(0.5303, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1345, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6647, device='cuda:0', grad_fn=<AddBackward0>)



 39%|███▉      | 139/354 [12:23<19:07,  5.34s/it][A

	loss_cls: tensor(0.5475, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1952, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7427, device='cuda:0', grad_fn=<AddBackward0>)



 40%|███▉      | 140/354 [12:28<19:05,  5.35s/it][A

	loss_cls: tensor(0.4072, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1013, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5085, device='cuda:0', grad_fn=<AddBackward0>)



 40%|███▉      | 141/354 [12:34<18:57,  5.34s/it][A

	loss_cls: tensor(0.6459, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0363, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6821, device='cuda:0', grad_fn=<AddBackward0>)



 40%|████      | 142/354 [12:39<18:49,  5.33s/it][A

	loss_cls: tensor(0.5889, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2389, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8278, device='cuda:0', grad_fn=<AddBackward0>)



 40%|████      | 143/354 [12:44<18:47,  5.34s/it][A

	loss_cls: tensor(0.8393, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3143, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1536, device='cuda:0', grad_fn=<AddBackward0>)



 41%|████      | 144/354 [12:50<18:39,  5.33s/it][A

	loss_cls: tensor(0.4110, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5272, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9383, device='cuda:0', grad_fn=<AddBackward0>)



 41%|████      | 145/354 [12:55<18:38,  5.35s/it][A

	loss_cls: tensor(0.8577, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1207, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9784, device='cuda:0', grad_fn=<AddBackward0>)



 41%|████      | 146/354 [13:00<18:29,  5.34s/it][A

	loss_cls: tensor(0.6596, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2387, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8983, device='cuda:0', grad_fn=<AddBackward0>)



 42%|████▏     | 147/354 [13:06<18:28,  5.35s/it][A

	loss_cls: tensor(0.5339, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3208, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8547, device='cuda:0', grad_fn=<AddBackward0>)



 42%|████▏     | 148/354 [13:11<18:18,  5.33s/it][A

	loss_cls: tensor(0.3646, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3231, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6877, device='cuda:0', grad_fn=<AddBackward0>)



 42%|████▏     | 149/354 [13:16<18:08,  5.31s/it][A

	loss_cls: tensor(0.6056, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2977, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9033, device='cuda:0', grad_fn=<AddBackward0>)



 42%|████▏     | 150/354 [13:22<18:06,  5.32s/it][A

	loss_cls: tensor(0.6422, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3922, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0344, device='cuda:0', grad_fn=<AddBackward0>)



 43%|████▎     | 151/354 [13:27<17:59,  5.32s/it][A

	loss_cls: tensor(0.6379, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1913, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8292, device='cuda:0', grad_fn=<AddBackward0>)



 43%|████▎     | 152/354 [13:32<17:59,  5.34s/it][A

	loss_cls: tensor(0.4276, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3738, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8014, device='cuda:0', grad_fn=<AddBackward0>)



 43%|████▎     | 153/354 [13:38<17:51,  5.33s/it][A

	loss_cls: tensor(0.5691, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4172, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9864, device='cuda:0', grad_fn=<AddBackward0>)



 44%|████▎     | 154/354 [13:43<17:48,  5.34s/it][A

	loss_cls: tensor(0.5719, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1458, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7177, device='cuda:0', grad_fn=<AddBackward0>)



 44%|████▍     | 155/354 [13:48<17:40,  5.33s/it][A

	loss_cls: tensor(0.4900, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3221, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8121, device='cuda:0', grad_fn=<AddBackward0>)



 44%|████▍     | 156/354 [13:53<17:32,  5.32s/it][A

	loss_cls: tensor(0.4535, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1388, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5924, device='cuda:0', grad_fn=<AddBackward0>)



 44%|████▍     | 157/354 [13:59<17:31,  5.34s/it][A

	loss_cls: tensor(0.5390, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3182, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8573, device='cuda:0', grad_fn=<AddBackward0>)



 45%|████▍     | 158/354 [14:04<17:24,  5.33s/it][A

	loss_cls: tensor(0.8436, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1804, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0240, device='cuda:0', grad_fn=<AddBackward0>)



 45%|████▍     | 159/354 [14:10<17:22,  5.35s/it][A

	loss_cls: tensor(0.4899, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2911, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7810, device='cuda:0', grad_fn=<AddBackward0>)



 45%|████▌     | 160/354 [14:15<17:14,  5.33s/it][A

	loss_cls: tensor(0.7771, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4287, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2058, device='cuda:0', grad_fn=<AddBackward0>)



 45%|████▌     | 161/354 [14:20<17:10,  5.34s/it][A

	loss_cls: tensor(0.5236, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1645, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6881, device='cuda:0', grad_fn=<AddBackward0>)



 46%|████▌     | 162/354 [14:26<17:04,  5.33s/it][A

	loss_cls: tensor(0.6309, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1801, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8110, device='cuda:0', grad_fn=<AddBackward0>)



 46%|████▌     | 163/354 [14:31<16:56,  5.32s/it][A

	loss_cls: tensor(0.6756, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1397, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8153, device='cuda:0', grad_fn=<AddBackward0>)



 46%|████▋     | 164/354 [14:36<16:56,  5.35s/it][A

	loss_cls: tensor(0.5567, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2795, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8362, device='cuda:0', grad_fn=<AddBackward0>)



 47%|████▋     | 165/354 [14:42<16:49,  5.34s/it][A

	loss_cls: tensor(0.5512, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2110, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7622, device='cuda:0', grad_fn=<AddBackward0>)



 47%|████▋     | 166/354 [14:47<16:47,  5.36s/it][A

	loss_cls: tensor(0.6419, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1377, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7796, device='cuda:0', grad_fn=<AddBackward0>)



 47%|████▋     | 167/354 [14:52<16:38,  5.34s/it][A

	loss_cls: tensor(0.8077, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0725, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8802, device='cuda:0', grad_fn=<AddBackward0>)



 47%|████▋     | 168/354 [14:58<16:35,  5.35s/it][A

	loss_cls: tensor(0.6404, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3899, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0304, device='cuda:0', grad_fn=<AddBackward0>)



 48%|████▊     | 169/354 [15:03<16:26,  5.33s/it][A

	loss_cls: tensor(0.7265, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1020, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8285, device='cuda:0', grad_fn=<AddBackward0>)



 48%|████▊     | 170/354 [15:08<16:24,  5.35s/it][A

	loss_cls: tensor(0.5159, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1001, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6160, device='cuda:0', grad_fn=<AddBackward0>)



 48%|████▊     | 171/354 [15:14<16:17,  5.34s/it][A

	loss_cls: tensor(0.7060, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2933, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9994, device='cuda:0', grad_fn=<AddBackward0>)



 49%|████▊     | 172/354 [15:19<16:08,  5.32s/it][A

	loss_cls: tensor(0.6955, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1348, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8304, device='cuda:0', grad_fn=<AddBackward0>)



 49%|████▉     | 173/354 [15:24<16:07,  5.35s/it][A

	loss_cls: tensor(0.7082, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1813, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8895, device='cuda:0', grad_fn=<AddBackward0>)



 49%|████▉     | 174/354 [15:30<16:00,  5.33s/it][A

	loss_cls: tensor(0.7941, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3734, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1675, device='cuda:0', grad_fn=<AddBackward0>)



 49%|████▉     | 175/354 [15:35<15:57,  5.35s/it][A

	loss_cls: tensor(0.5305, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3630, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8935, device='cuda:0', grad_fn=<AddBackward0>)



 50%|████▉     | 176/354 [15:40<15:48,  5.33s/it][A

	loss_cls: tensor(0.4986, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4922, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9908, device='cuda:0', grad_fn=<AddBackward0>)



 50%|█████     | 177/354 [15:46<15:46,  5.35s/it][A

	loss_cls: tensor(0.8178, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2227, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0406, device='cuda:0', grad_fn=<AddBackward0>)



 50%|█████     | 178/354 [15:51<15:39,  5.34s/it][A

	loss_cls: tensor(0.5462, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1436, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6898, device='cuda:0', grad_fn=<AddBackward0>)



 51%|█████     | 179/354 [15:56<15:32,  5.33s/it][A

	loss_cls: tensor(0.5216, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0908, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6125, device='cuda:0', grad_fn=<AddBackward0>)



 51%|█████     | 180/354 [16:02<15:30,  5.35s/it][A

	loss_cls: tensor(0.7369, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2925, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0294, device='cuda:0', grad_fn=<AddBackward0>)



 51%|█████     | 181/354 [16:07<15:22,  5.33s/it][A

	loss_cls: tensor(0.5865, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2774, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8639, device='cuda:0', grad_fn=<AddBackward0>)



 51%|█████▏    | 182/354 [16:12<15:19,  5.34s/it][A

	loss_cls: tensor(0.5906, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2144, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8050, device='cuda:0', grad_fn=<AddBackward0>)



 52%|█████▏    | 183/354 [16:18<15:10,  5.32s/it][A

	loss_cls: tensor(0.5256, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1560, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6816, device='cuda:0', grad_fn=<AddBackward0>)



 52%|█████▏    | 184/354 [16:23<15:09,  5.35s/it][A

	loss_cls: tensor(0.3949, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0617, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4565, device='cuda:0', grad_fn=<AddBackward0>)



 52%|█████▏    | 185/354 [16:28<15:03,  5.34s/it][A

	loss_cls: tensor(0.4477, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1559, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6036, device='cuda:0', grad_fn=<AddBackward0>)



 53%|█████▎    | 186/354 [16:34<14:56,  5.34s/it][A

	loss_cls: tensor(0.4447, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1332, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5779, device='cuda:0', grad_fn=<AddBackward0>)



 53%|█████▎    | 187/354 [16:39<14:54,  5.36s/it][A

	loss_cls: tensor(0.6036, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1087, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7123, device='cuda:0', grad_fn=<AddBackward0>)



 53%|█████▎    | 188/354 [16:44<14:46,  5.34s/it][A

	loss_cls: tensor(0.3287, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0409, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.3696, device='cuda:0', grad_fn=<AddBackward0>)



 53%|█████▎    | 189/354 [16:50<14:43,  5.35s/it][A

	loss_cls: tensor(0.8762, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3294, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2056, device='cuda:0', grad_fn=<AddBackward0>)



 54%|█████▎    | 190/354 [16:55<14:35,  5.34s/it][A

	loss_cls: tensor(0.2867, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0282, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.3149, device='cuda:0', grad_fn=<AddBackward0>)



 54%|█████▍    | 191/354 [17:01<14:34,  5.36s/it][A

	loss_cls: tensor(0.7159, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2315, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9475, device='cuda:0', grad_fn=<AddBackward0>)



 54%|█████▍    | 192/354 [17:06<14:26,  5.35s/it][A

	loss_cls: tensor(0.3126, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0524, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.3650, device='cuda:0', grad_fn=<AddBackward0>)



 55%|█████▍    | 193/354 [17:11<14:18,  5.33s/it][A

	loss_cls: tensor(0.7901, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0923, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8824, device='cuda:0', grad_fn=<AddBackward0>)



 55%|█████▍    | 194/354 [17:16<14:14,  5.34s/it][A

	loss_cls: tensor(0.8935, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1639, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0574, device='cuda:0', grad_fn=<AddBackward0>)



 55%|█████▌    | 195/354 [17:22<14:06,  5.33s/it][A

	loss_cls: tensor(1.1216, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1914, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.3130, device='cuda:0', grad_fn=<AddBackward0>)



 55%|█████▌    | 196/354 [17:27<14:02,  5.33s/it][A

	loss_cls: tensor(0.4253, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0207, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4460, device='cuda:0', grad_fn=<AddBackward0>)



 56%|█████▌    | 197/354 [17:32<13:56,  5.33s/it][A

	loss_cls: tensor(0.9651, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5965, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.5616, device='cuda:0', grad_fn=<AddBackward0>)



 56%|█████▌    | 198/354 [17:38<13:52,  5.34s/it][A

	loss_cls: tensor(0.3461, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1827, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5288, device='cuda:0', grad_fn=<AddBackward0>)



 56%|█████▌    | 199/354 [17:43<13:46,  5.33s/it][A

	loss_cls: tensor(0.3250, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0712, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.3961, device='cuda:0', grad_fn=<AddBackward0>)



 56%|█████▋    | 200/354 [17:48<13:39,  5.32s/it][A

	loss_cls: tensor(0.4015, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0361, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4376, device='cuda:0', grad_fn=<AddBackward0>)



 57%|█████▋    | 201/354 [17:54<13:36,  5.34s/it][A

	loss_cls: tensor(0.9751, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.8626, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.8376, device='cuda:0', grad_fn=<AddBackward0>)



 57%|█████▋    | 202/354 [17:59<13:28,  5.32s/it][A

	loss_cls: tensor(0.5167, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2382, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7549, device='cuda:0', grad_fn=<AddBackward0>)



 57%|█████▋    | 203/354 [18:04<13:27,  5.35s/it][A

	loss_cls: tensor(0.5850, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0999, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6849, device='cuda:0', grad_fn=<AddBackward0>)



 58%|█████▊    | 204/354 [18:10<13:20,  5.34s/it][A

	loss_cls: tensor(0.6308, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2528, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8836, device='cuda:0', grad_fn=<AddBackward0>)



 58%|█████▊    | 205/354 [18:15<13:16,  5.35s/it][A

	loss_cls: tensor(0.6757, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2383, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9139, device='cuda:0', grad_fn=<AddBackward0>)



 58%|█████▊    | 206/354 [18:21<13:10,  5.34s/it][A

	loss_cls: tensor(0.7196, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4347, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1543, device='cuda:0', grad_fn=<AddBackward0>)



 58%|█████▊    | 207/354 [18:26<13:04,  5.34s/it][A

	loss_cls: tensor(0.6409, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2002, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8410, device='cuda:0', grad_fn=<AddBackward0>)



 59%|█████▉    | 208/354 [18:31<13:05,  5.38s/it][A

	loss_cls: tensor(0.5295, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1265, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6559, device='cuda:0', grad_fn=<AddBackward0>)



 59%|█████▉    | 209/354 [18:37<12:58,  5.37s/it][A

	loss_cls: tensor(0.5683, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2277, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7960, device='cuda:0', grad_fn=<AddBackward0>)



 59%|█████▉    | 210/354 [18:42<12:53,  5.37s/it][A

	loss_cls: tensor(0.5632, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2069, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7702, device='cuda:0', grad_fn=<AddBackward0>)



 60%|█████▉    | 211/354 [18:47<12:46,  5.36s/it][A

	loss_cls: tensor(0.5542, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1858, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7400, device='cuda:0', grad_fn=<AddBackward0>)



 60%|█████▉    | 212/354 [18:53<12:43,  5.37s/it][A

	loss_cls: tensor(0.5726, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1360, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7085, device='cuda:0', grad_fn=<AddBackward0>)



 60%|██████    | 213/354 [18:58<12:35,  5.36s/it][A

	loss_cls: tensor(0.7520, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1707, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9227, device='cuda:0', grad_fn=<AddBackward0>)



 60%|██████    | 214/354 [19:03<12:29,  5.35s/it][A

	loss_cls: tensor(0.6809, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1074, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7883, device='cuda:0', grad_fn=<AddBackward0>)



 61%|██████    | 215/354 [19:09<12:25,  5.36s/it][A

	loss_cls: tensor(0.5776, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2306, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8082, device='cuda:0', grad_fn=<AddBackward0>)



 61%|██████    | 216/354 [19:14<12:17,  5.35s/it][A

	loss_cls: tensor(0.7835, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2176, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0010, device='cuda:0', grad_fn=<AddBackward0>)



 61%|██████▏   | 217/354 [19:20<12:14,  5.36s/it][A

	loss_cls: tensor(0.6098, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3150, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9248, device='cuda:0', grad_fn=<AddBackward0>)



 62%|██████▏   | 218/354 [19:25<12:07,  5.35s/it][A

	loss_cls: tensor(0.4645, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3271, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7916, device='cuda:0', grad_fn=<AddBackward0>)



 62%|██████▏   | 219/354 [19:30<12:03,  5.36s/it][A

	loss_cls: tensor(0.8315, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1621, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9936, device='cuda:0', grad_fn=<AddBackward0>)



 62%|██████▏   | 220/354 [19:36<11:57,  5.35s/it][A

	loss_cls: tensor(0.5110, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0873, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5983, device='cuda:0', grad_fn=<AddBackward0>)



 62%|██████▏   | 221/354 [19:41<11:49,  5.34s/it][A

	loss_cls: tensor(0.5045, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1931, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6976, device='cuda:0', grad_fn=<AddBackward0>)



 63%|██████▎   | 222/354 [19:46<11:46,  5.35s/it][A

	loss_cls: tensor(0.4540, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1372, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5912, device='cuda:0', grad_fn=<AddBackward0>)



 63%|██████▎   | 223/354 [19:52<11:38,  5.34s/it][A

	loss_cls: tensor(0.4373, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1804, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6177, device='cuda:0', grad_fn=<AddBackward0>)



 63%|██████▎   | 224/354 [19:57<11:35,  5.35s/it][A

	loss_cls: tensor(0.5325, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0758, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6083, device='cuda:0', grad_fn=<AddBackward0>)



 64%|██████▎   | 225/354 [20:02<11:27,  5.33s/it][A

	loss_cls: tensor(0.6421, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1872, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8293, device='cuda:0', grad_fn=<AddBackward0>)



 64%|██████▍   | 226/354 [20:08<11:25,  5.35s/it][A

	loss_cls: tensor(0.6931, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0906, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7837, device='cuda:0', grad_fn=<AddBackward0>)



 64%|██████▍   | 227/354 [20:13<11:18,  5.35s/it][A

	loss_cls: tensor(0.5083, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1341, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6424, device='cuda:0', grad_fn=<AddBackward0>)



 64%|██████▍   | 228/354 [20:18<11:12,  5.34s/it][A

	loss_cls: tensor(0.8429, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2040, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0469, device='cuda:0', grad_fn=<AddBackward0>)



 65%|██████▍   | 229/354 [20:24<11:09,  5.35s/it][A

	loss_cls: tensor(0.6667, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2384, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9051, device='cuda:0', grad_fn=<AddBackward0>)



 65%|██████▍   | 230/354 [20:29<11:02,  5.34s/it][A

	loss_cls: tensor(0.7440, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3764, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1204, device='cuda:0', grad_fn=<AddBackward0>)



 65%|██████▌   | 231/354 [20:34<10:59,  5.36s/it][A

	loss_cls: tensor(0.7649, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3134, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0783, device='cuda:0', grad_fn=<AddBackward0>)



 66%|██████▌   | 232/354 [20:40<10:53,  5.36s/it][A

	loss_cls: tensor(0.3512, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1253, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4765, device='cuda:0', grad_fn=<AddBackward0>)



 66%|██████▌   | 233/354 [20:45<10:49,  5.37s/it][A

	loss_cls: tensor(0.7206, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0877, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8083, device='cuda:0', grad_fn=<AddBackward0>)



 66%|██████▌   | 234/354 [20:50<10:42,  5.35s/it][A

	loss_cls: tensor(0.7077, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3495, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0572, device='cuda:0', grad_fn=<AddBackward0>)



 66%|██████▋   | 235/354 [20:56<10:35,  5.34s/it][A

	loss_cls: tensor(0.4353, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5255, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9608, device='cuda:0', grad_fn=<AddBackward0>)



 67%|██████▋   | 236/354 [21:01<10:33,  5.37s/it][A

	loss_cls: tensor(0.3721, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1170, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4891, device='cuda:0', grad_fn=<AddBackward0>)



 67%|██████▋   | 237/354 [21:07<10:28,  5.37s/it][A

	loss_cls: tensor(0.8710, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2065, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0775, device='cuda:0', grad_fn=<AddBackward0>)



 67%|██████▋   | 238/354 [21:12<10:25,  5.39s/it][A

	loss_cls: tensor(0.8673, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2029, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0702, device='cuda:0', grad_fn=<AddBackward0>)



 68%|██████▊   | 239/354 [21:17<10:18,  5.38s/it][A

	loss_cls: tensor(0.7104, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1990, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9093, device='cuda:0', grad_fn=<AddBackward0>)



 68%|██████▊   | 240/354 [21:23<10:15,  5.40s/it][A

	loss_cls: tensor(0.4582, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2951, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7533, device='cuda:0', grad_fn=<AddBackward0>)



 68%|██████▊   | 241/354 [21:28<10:08,  5.38s/it][A

	loss_cls: tensor(0.3806, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0753, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4559, device='cuda:0', grad_fn=<AddBackward0>)



 68%|██████▊   | 242/354 [21:33<10:00,  5.36s/it][A

	loss_cls: tensor(0.8532, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1190, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9722, device='cuda:0', grad_fn=<AddBackward0>)



 69%|██████▊   | 243/354 [21:39<09:56,  5.38s/it][A

	loss_cls: tensor(0.5526, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1204, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6730, device='cuda:0', grad_fn=<AddBackward0>)



 69%|██████▉   | 244/354 [21:44<09:50,  5.37s/it][A

	loss_cls: tensor(0.4709, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3119, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7828, device='cuda:0', grad_fn=<AddBackward0>)



 69%|██████▉   | 245/354 [21:50<09:46,  5.38s/it][A

	loss_cls: tensor(0.5998, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3363, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9361, device='cuda:0', grad_fn=<AddBackward0>)



 69%|██████▉   | 246/354 [21:55<09:39,  5.37s/it][A

	loss_cls: tensor(0.7832, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3703, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1536, device='cuda:0', grad_fn=<AddBackward0>)



 70%|██████▉   | 247/354 [22:00<09:36,  5.39s/it][A

	loss_cls: tensor(0.6793, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3557, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0349, device='cuda:0', grad_fn=<AddBackward0>)



 70%|███████   | 248/354 [22:06<09:29,  5.38s/it][A

	loss_cls: tensor(0.5060, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.6442, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1502, device='cuda:0', grad_fn=<AddBackward0>)



 70%|███████   | 249/354 [22:11<09:23,  5.36s/it][A

	loss_cls: tensor(0.5499, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2400, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7899, device='cuda:0', grad_fn=<AddBackward0>)



 71%|███████   | 250/354 [22:17<09:20,  5.39s/it][A

	loss_cls: tensor(0.6489, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2184, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8673, device='cuda:0', grad_fn=<AddBackward0>)



 71%|███████   | 251/354 [22:22<09:13,  5.38s/it][A

	loss_cls: tensor(0.5719, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1865, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7585, device='cuda:0', grad_fn=<AddBackward0>)



 71%|███████   | 252/354 [22:27<09:10,  5.39s/it][A

	loss_cls: tensor(0.6794, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0717, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7511, device='cuda:0', grad_fn=<AddBackward0>)



 71%|███████▏  | 253/354 [22:33<09:02,  5.37s/it][A

	loss_cls: tensor(0.4139, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2439, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6578, device='cuda:0', grad_fn=<AddBackward0>)



 72%|███████▏  | 254/354 [22:38<08:58,  5.39s/it][A

	loss_cls: tensor(0.6368, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2248, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8617, device='cuda:0', grad_fn=<AddBackward0>)



 72%|███████▏  | 255/354 [22:43<08:52,  5.38s/it][A

	loss_cls: tensor(0.5352, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3135, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8488, device='cuda:0', grad_fn=<AddBackward0>)



 72%|███████▏  | 256/354 [22:49<08:46,  5.38s/it][A

	loss_cls: tensor(0.5388, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3714, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9102, device='cuda:0', grad_fn=<AddBackward0>)



 73%|███████▎  | 257/354 [22:54<08:43,  5.39s/it][A

	loss_cls: tensor(0.5709, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2728, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8436, device='cuda:0', grad_fn=<AddBackward0>)



 73%|███████▎  | 258/354 [23:00<08:36,  5.38s/it][A

	loss_cls: tensor(0.4872, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1035, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5906, device='cuda:0', grad_fn=<AddBackward0>)



 73%|███████▎  | 259/354 [23:05<08:33,  5.41s/it][A

	loss_cls: tensor(0.5057, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2729, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7786, device='cuda:0', grad_fn=<AddBackward0>)



 73%|███████▎  | 260/354 [23:10<08:26,  5.39s/it][A

	loss_cls: tensor(0.5777, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2117, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7894, device='cuda:0', grad_fn=<AddBackward0>)



 74%|███████▎  | 261/354 [23:16<08:21,  5.40s/it][A

	loss_cls: tensor(0.6995, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1518, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8513, device='cuda:0', grad_fn=<AddBackward0>)



 74%|███████▍  | 262/354 [23:21<08:15,  5.38s/it][A

	loss_cls: tensor(0.7044, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2246, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9290, device='cuda:0', grad_fn=<AddBackward0>)



 74%|███████▍  | 263/354 [23:26<08:07,  5.36s/it][A

	loss_cls: tensor(0.5811, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1923, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7734, device='cuda:0', grad_fn=<AddBackward0>)



 75%|███████▍  | 264/354 [23:32<08:04,  5.38s/it][A

	loss_cls: tensor(0.5181, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2475, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7656, device='cuda:0', grad_fn=<AddBackward0>)



 75%|███████▍  | 265/354 [23:37<07:57,  5.37s/it][A

	loss_cls: tensor(0.4955, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1193, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6148, device='cuda:0', grad_fn=<AddBackward0>)



 75%|███████▌  | 266/354 [23:43<07:54,  5.39s/it][A

	loss_cls: tensor(0.8657, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2110, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0767, device='cuda:0', grad_fn=<AddBackward0>)



 75%|███████▌  | 267/354 [23:48<07:46,  5.37s/it][A

	loss_cls: tensor(0.4637, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3046, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7682, device='cuda:0', grad_fn=<AddBackward0>)



 76%|███████▌  | 268/354 [23:53<07:42,  5.38s/it][A

	loss_cls: tensor(0.5018, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1081, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6099, device='cuda:0', grad_fn=<AddBackward0>)



 76%|███████▌  | 269/354 [23:59<07:35,  5.36s/it][A

	loss_cls: tensor(0.6650, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0435, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7085, device='cuda:0', grad_fn=<AddBackward0>)



 76%|███████▋  | 270/354 [24:04<07:30,  5.37s/it][A

	loss_cls: tensor(0.6370, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3678, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0048, device='cuda:0', grad_fn=<AddBackward0>)



 77%|███████▋  | 271/354 [24:09<07:25,  5.36s/it][A

	loss_cls: tensor(0.5086, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2683, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7769, device='cuda:0', grad_fn=<AddBackward0>)



 77%|███████▋  | 272/354 [24:15<07:19,  5.35s/it][A

	loss_cls: tensor(0.6320, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1838, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8158, device='cuda:0', grad_fn=<AddBackward0>)



 77%|███████▋  | 273/354 [24:20<07:15,  5.38s/it][A

	loss_cls: tensor(0.6011, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2031, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8041, device='cuda:0', grad_fn=<AddBackward0>)



 77%|███████▋  | 274/354 [24:26<07:08,  5.36s/it][A

	loss_cls: tensor(0.5529, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2738, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8267, device='cuda:0', grad_fn=<AddBackward0>)



 78%|███████▊  | 275/354 [24:31<07:05,  5.38s/it][A

	loss_cls: tensor(0.4914, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0877, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5790, device='cuda:0', grad_fn=<AddBackward0>)



 78%|███████▊  | 276/354 [24:36<06:59,  5.37s/it][A

	loss_cls: tensor(0.7393, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1742, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9135, device='cuda:0', grad_fn=<AddBackward0>)



 78%|███████▊  | 277/354 [24:42<06:55,  5.40s/it][A

	loss_cls: tensor(0.5117, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0835, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5952, device='cuda:0', grad_fn=<AddBackward0>)



 79%|███████▊  | 278/354 [24:47<06:49,  5.39s/it][A

	loss_cls: tensor(0.4169, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4741, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8910, device='cuda:0', grad_fn=<AddBackward0>)



 79%|███████▉  | 279/354 [24:52<06:43,  5.38s/it][A

	loss_cls: tensor(0.5002, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0664, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5666, device='cuda:0', grad_fn=<AddBackward0>)



 79%|███████▉  | 280/354 [24:58<06:39,  5.39s/it][A

	loss_cls: tensor(0.4825, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5870, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0695, device='cuda:0', grad_fn=<AddBackward0>)



 79%|███████▉  | 281/354 [25:03<06:32,  5.38s/it][A

	loss_cls: tensor(0.5184, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2073, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7257, device='cuda:0', grad_fn=<AddBackward0>)



 80%|███████▉  | 282/354 [25:09<06:28,  5.40s/it][A

	loss_cls: tensor(0.6351, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3094, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9445, device='cuda:0', grad_fn=<AddBackward0>)



 80%|███████▉  | 283/354 [25:14<06:22,  5.39s/it][A

	loss_cls: tensor(0.5641, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2915, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8555, device='cuda:0', grad_fn=<AddBackward0>)



 80%|████████  | 284/354 [25:20<06:18,  5.40s/it][A

	loss_cls: tensor(0.4281, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2825, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7106, device='cuda:0', grad_fn=<AddBackward0>)



 81%|████████  | 285/354 [25:25<06:11,  5.39s/it][A

	loss_cls: tensor(0.6998, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2009, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9007, device='cuda:0', grad_fn=<AddBackward0>)



 81%|████████  | 286/354 [25:30<06:04,  5.37s/it][A

	loss_cls: tensor(0.7302, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4791, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2093, device='cuda:0', grad_fn=<AddBackward0>)



 81%|████████  | 287/354 [25:36<06:00,  5.39s/it][A

	loss_cls: tensor(0.4136, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1228, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5365, device='cuda:0', grad_fn=<AddBackward0>)



 81%|████████▏ | 288/354 [25:41<05:54,  5.37s/it][A

	loss_cls: tensor(0.5858, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2037, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7895, device='cuda:0', grad_fn=<AddBackward0>)



 82%|████████▏ | 289/354 [25:46<05:49,  5.38s/it][A

	loss_cls: tensor(0.5295, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2931, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8227, device='cuda:0', grad_fn=<AddBackward0>)



 82%|████████▏ | 290/354 [25:52<05:43,  5.37s/it][A

	loss_cls: tensor(0.4238, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2387, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6625, device='cuda:0', grad_fn=<AddBackward0>)



 82%|████████▏ | 291/354 [25:57<05:39,  5.39s/it][A

	loss_cls: tensor(0.4220, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1293, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5513, device='cuda:0', grad_fn=<AddBackward0>)



 82%|████████▏ | 292/354 [26:02<05:33,  5.38s/it][A

	loss_cls: tensor(0.5236, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1181, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6417, device='cuda:0', grad_fn=<AddBackward0>)



 83%|████████▎ | 293/354 [26:08<05:27,  5.37s/it][A

	loss_cls: tensor(0.8569, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1950, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0519, device='cuda:0', grad_fn=<AddBackward0>)



 83%|████████▎ | 294/354 [26:13<05:23,  5.38s/it][A

	loss_cls: tensor(0.5033, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2805, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7838, device='cuda:0', grad_fn=<AddBackward0>)



 83%|████████▎ | 295/354 [26:19<05:17,  5.38s/it][A

	loss_cls: tensor(0.6492, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1364, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7857, device='cuda:0', grad_fn=<AddBackward0>)



 84%|████████▎ | 296/354 [26:24<05:12,  5.39s/it][A

	loss_cls: tensor(0.8331, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3125, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1455, device='cuda:0', grad_fn=<AddBackward0>)



 84%|████████▍ | 297/354 [26:29<05:06,  5.37s/it][A

	loss_cls: tensor(0.4371, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1045, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5416, device='cuda:0', grad_fn=<AddBackward0>)



 84%|████████▍ | 298/354 [26:35<05:02,  5.40s/it][A

	loss_cls: tensor(0.5248, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0461, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5709, device='cuda:0', grad_fn=<AddBackward0>)



 84%|████████▍ | 299/354 [26:40<04:56,  5.38s/it][A

	loss_cls: tensor(0.4738, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0733, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5470, device='cuda:0', grad_fn=<AddBackward0>)



 85%|████████▍ | 300/354 [26:46<04:50,  5.37s/it][A

	loss_cls: tensor(0.5431, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0956, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6387, device='cuda:0', grad_fn=<AddBackward0>)



 85%|████████▌ | 301/354 [26:51<04:45,  5.39s/it][A

	loss_cls: tensor(0.9731, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5292, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.5024, device='cuda:0', grad_fn=<AddBackward0>)



 85%|████████▌ | 302/354 [26:56<04:39,  5.37s/it][A

	loss_cls: tensor(0.4768, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1031, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5799, device='cuda:0', grad_fn=<AddBackward0>)



 86%|████████▌ | 303/354 [27:02<04:35,  5.40s/it][A

	loss_cls: tensor(0.3822, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0715, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4537, device='cuda:0', grad_fn=<AddBackward0>)



 86%|████████▌ | 304/354 [27:07<04:29,  5.39s/it][A

	loss_cls: tensor(0.4488, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1631, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6119, device='cuda:0', grad_fn=<AddBackward0>)



 86%|████████▌ | 305/354 [27:13<04:24,  5.40s/it][A

	loss_cls: tensor(1.0798, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.6696, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.7493, device='cuda:0', grad_fn=<AddBackward0>)



 86%|████████▋ | 306/354 [27:18<04:18,  5.39s/it][A

	loss_cls: tensor(0.9051, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3101, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2152, device='cuda:0', grad_fn=<AddBackward0>)



 87%|████████▋ | 307/354 [27:23<04:12,  5.38s/it][A

	loss_cls: tensor(0.5277, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1626, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6903, device='cuda:0', grad_fn=<AddBackward0>)



 87%|████████▋ | 308/354 [27:29<04:07,  5.39s/it][A

	loss_cls: tensor(0.4744, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1350, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6095, device='cuda:0', grad_fn=<AddBackward0>)



 87%|████████▋ | 309/354 [27:34<04:01,  5.37s/it][A

	loss_cls: tensor(0.6635, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3129, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9764, device='cuda:0', grad_fn=<AddBackward0>)



 88%|████████▊ | 310/354 [27:39<03:57,  5.39s/it][A

	loss_cls: tensor(0.5096, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1391, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6486, device='cuda:0', grad_fn=<AddBackward0>)



 88%|████████▊ | 311/354 [27:45<03:51,  5.37s/it][A

	loss_cls: tensor(0.4020, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0930, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4950, device='cuda:0', grad_fn=<AddBackward0>)



 88%|████████▊ | 312/354 [27:50<03:46,  5.39s/it][A

	loss_cls: tensor(0.8882, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2206, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1088, device='cuda:0', grad_fn=<AddBackward0>)



 88%|████████▊ | 313/354 [27:56<03:40,  5.38s/it][A

	loss_cls: tensor(0.9774, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4123, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.3897, device='cuda:0', grad_fn=<AddBackward0>)



 89%|████████▊ | 314/354 [28:01<03:34,  5.37s/it][A

	loss_cls: tensor(0.5957, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2468, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8425, device='cuda:0', grad_fn=<AddBackward0>)



 89%|████████▉ | 315/354 [28:06<03:29,  5.38s/it][A

	loss_cls: tensor(0.4493, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2646, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7139, device='cuda:0', grad_fn=<AddBackward0>)



 89%|████████▉ | 316/354 [28:12<03:23,  5.36s/it][A

	loss_cls: tensor(0.8295, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1301, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9596, device='cuda:0', grad_fn=<AddBackward0>)



 90%|████████▉ | 317/354 [28:17<03:19,  5.39s/it][A

	loss_cls: tensor(0.6648, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0579, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7227, device='cuda:0', grad_fn=<AddBackward0>)



 90%|████████▉ | 318/354 [28:22<03:13,  5.37s/it][A

	loss_cls: tensor(0.9689, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1720, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1409, device='cuda:0', grad_fn=<AddBackward0>)



 90%|█████████ | 319/354 [28:28<03:08,  5.38s/it][A

	loss_cls: tensor(0.5064, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1893, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6957, device='cuda:0', grad_fn=<AddBackward0>)



 90%|█████████ | 320/354 [28:33<03:02,  5.36s/it][A

	loss_cls: tensor(0.5518, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4838, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0356, device='cuda:0', grad_fn=<AddBackward0>)



 91%|█████████ | 321/354 [28:38<02:56,  5.35s/it][A

	loss_cls: tensor(0.5805, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1361, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7166, device='cuda:0', grad_fn=<AddBackward0>)



 91%|█████████ | 322/354 [28:44<02:51,  5.37s/it][A

	loss_cls: tensor(0.3956, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3651, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7607, device='cuda:0', grad_fn=<AddBackward0>)



 91%|█████████ | 323/354 [28:49<02:46,  5.36s/it][A

	loss_cls: tensor(0.5213, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1806, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7020, device='cuda:0', grad_fn=<AddBackward0>)



 92%|█████████▏| 324/354 [28:55<02:41,  5.39s/it][A

	loss_cls: tensor(0.6733, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1267, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8000, device='cuda:0', grad_fn=<AddBackward0>)



 92%|█████████▏| 325/354 [29:00<02:36,  5.38s/it][A

	loss_cls: tensor(0.5468, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2166, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7633, device='cuda:0', grad_fn=<AddBackward0>)



 92%|█████████▏| 326/354 [29:05<02:31,  5.40s/it][A

	loss_cls: tensor(0.7246, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0672, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7918, device='cuda:0', grad_fn=<AddBackward0>)



 92%|█████████▏| 327/354 [29:11<02:25,  5.38s/it][A

	loss_cls: tensor(0.5654, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0845, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6499, device='cuda:0', grad_fn=<AddBackward0>)



 93%|█████████▎| 328/354 [29:16<02:19,  5.37s/it][A

	loss_cls: tensor(0.6739, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0613, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7352, device='cuda:0', grad_fn=<AddBackward0>)



 93%|█████████▎| 329/354 [29:22<02:14,  5.38s/it][A

	loss_cls: tensor(0.5393, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2579, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7972, device='cuda:0', grad_fn=<AddBackward0>)



 93%|█████████▎| 330/354 [29:27<02:08,  5.37s/it][A

	loss_cls: tensor(0.4654, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0873, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5527, device='cuda:0', grad_fn=<AddBackward0>)



 94%|█████████▎| 331/354 [29:32<02:03,  5.39s/it][A

	loss_cls: tensor(0.8100, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3284, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1384, device='cuda:0', grad_fn=<AddBackward0>)



 94%|█████████▍| 332/354 [29:38<01:58,  5.37s/it][A

	loss_cls: tensor(0.4146, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2073, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6219, device='cuda:0', grad_fn=<AddBackward0>)



 94%|█████████▍| 333/354 [29:43<01:53,  5.39s/it][A

	loss_cls: tensor(0.6608, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0796, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7403, device='cuda:0', grad_fn=<AddBackward0>)



 94%|█████████▍| 334/354 [29:48<01:47,  5.37s/it][A

	loss_cls: tensor(0.7010, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1994, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9004, device='cuda:0', grad_fn=<AddBackward0>)



 95%|█████████▍| 335/354 [29:54<01:41,  5.35s/it][A

	loss_cls: tensor(0.7823, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3000, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0823, device='cuda:0', grad_fn=<AddBackward0>)



 95%|█████████▍| 336/354 [29:59<01:36,  5.37s/it][A

	loss_cls: tensor(0.6340, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1255, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7594, device='cuda:0', grad_fn=<AddBackward0>)



 95%|█████████▌| 337/354 [30:04<01:31,  5.36s/it][A

	loss_cls: tensor(0.6445, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3347, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9792, device='cuda:0', grad_fn=<AddBackward0>)



 95%|█████████▌| 338/354 [30:10<01:26,  5.38s/it][A

	loss_cls: tensor(0.7064, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4103, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1168, device='cuda:0', grad_fn=<AddBackward0>)



 96%|█████████▌| 339/354 [30:15<01:20,  5.36s/it][A

	loss_cls: tensor(0.7122, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5414, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2537, device='cuda:0', grad_fn=<AddBackward0>)



 96%|█████████▌| 340/354 [30:21<01:15,  5.37s/it][A

	loss_cls: tensor(0.5961, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2884, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8844, device='cuda:0', grad_fn=<AddBackward0>)



 96%|█████████▋| 341/354 [30:26<01:09,  5.35s/it][A

	loss_cls: tensor(0.5264, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0534, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5798, device='cuda:0', grad_fn=<AddBackward0>)



 97%|█████████▋| 342/354 [30:31<01:04,  5.34s/it][A

	loss_cls: tensor(0.4856, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2115, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6971, device='cuda:0', grad_fn=<AddBackward0>)



 97%|█████████▋| 343/354 [30:37<00:58,  5.36s/it][A

	loss_cls: tensor(0.6341, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1543, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7884, device='cuda:0', grad_fn=<AddBackward0>)



 97%|█████████▋| 344/354 [30:42<00:53,  5.34s/it][A

	loss_cls: tensor(0.6549, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4024, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0573, device='cuda:0', grad_fn=<AddBackward0>)



 97%|█████████▋| 345/354 [30:47<00:48,  5.36s/it][A

	loss_cls: tensor(0.5685, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2671, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8356, device='cuda:0', grad_fn=<AddBackward0>)



 98%|█████████▊| 346/354 [30:53<00:42,  5.36s/it][A

	loss_cls: tensor(0.7076, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1075, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8151, device='cuda:0', grad_fn=<AddBackward0>)



 98%|█████████▊| 347/354 [30:58<00:37,  5.39s/it][A

	loss_cls: tensor(0.6677, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3839, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0516, device='cuda:0', grad_fn=<AddBackward0>)



 98%|█████████▊| 348/354 [31:03<00:32,  5.36s/it][A

	loss_cls: tensor(0.4575, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2253, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6829, device='cuda:0', grad_fn=<AddBackward0>)



 99%|█████████▊| 349/354 [31:09<00:26,  5.35s/it][A

	loss_cls: tensor(0.8564, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1280, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9844, device='cuda:0', grad_fn=<AddBackward0>)



 99%|█████████▉| 350/354 [31:14<00:21,  5.37s/it][A

	loss_cls: tensor(0.4966, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2928, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7894, device='cuda:0', grad_fn=<AddBackward0>)



 99%|█████████▉| 351/354 [31:19<00:16,  5.36s/it][A

	loss_cls: tensor(0.5851, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2755, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8606, device='cuda:0', grad_fn=<AddBackward0>)



 99%|█████████▉| 352/354 [31:25<00:10,  5.38s/it][A

	loss_cls: tensor(0.7205, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2305, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9510, device='cuda:0', grad_fn=<AddBackward0>)



100%|█████████▉| 353/354 [31:30<00:05,  5.35s/it][A

	loss_cls: tensor(0.6707, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2193, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8900, device='cuda:0', grad_fn=<AddBackward0>)



100%|██████████| 354/354 [31:32<00:00,  5.35s/it][A

  0%|          | 0/177 [00:00<?, ?it/s][A

	loss_cls: tensor(0.3670, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4894, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8564, device='cuda:0', grad_fn=<AddBackward0>)

	Training Loss: 0.8363526279138307

	Training cls acc: 0.7022128060263654

	Training cls prec: 0.5735595599472718

	Training cls rec: 0.6128739316239316

	Training cls f1: 0.535541985976173

--
	Training ner acc: 0.955173799925294

	Training ner prec: 0.2617554104405658

	Training ner rec: 0.27050863706545747

	Training ner f1: 0.26584953907761744

	Current Learning rate:  0.0004857142857142857



  1%|          | 1/177 [00:00<01:55,  1.52it/s][A
  1%|          | 2/177 [00:01<02:02,  1.43it/s][A
  2%|▏         | 3/177 [00:02<02:04,  1.40it/s][A
  2%|▏         | 4/177 [00:02<01:58,  1.46it/s][A
  3%|▎         | 5/177 [00:03<01:59,  1.44it/s][A
  3%|▎         | 6/177 [00:04<02:01,  1.41it/s][A
  4%|▍         | 7/177 [00:04<02:01,  1.40it/s][A
  5%|▍         | 8/177 [00:05<01:56,  1.45it/s][A
  5%|▌         | 9/177 [00:06<01:57,  1.43it/s][A
  6%|▌         | 10/177 [00:07<01:58,  1.41it/s][A
  6%|▌         | 11/177 [00:07<01:58,  1.41it/s][A
  7%|▋         | 12/177 [00:08<01:53,  1.45it/s][A
  7%|▋         | 13/177 [00:09<01:54,  1.43it/s][A
  8%|▊         | 14/177 [00:09<01:55,  1.41it/s][A
  8%|▊         | 15/177 [00:10<01:50,  1.46it/s][A
  9%|▉         | 16/177 [00:11<01:51,  1.44it/s][A
 10%|▉         | 17/177 [00:11<01:52,  1.42it/s][A
 10%|█         | 18/177 [00:12<01:52,  1.41it/s][A
 11%|█         | 19/177 [00:13<01:48,  1.45it/s][A
 11%|█▏        | 20/


	Validation Loss: 0.8057208419856379

	Validation cls acc: 0.6431261770244822

	Validation cls prec: 0.6020312079634114

	Validation cls rec: 0.5751479687920366

	Validation cls f1: 0.5239669307465917

--
	Validation ner acc: 0.9525324744968209

	Validation ner prec: 0.41327917111162654

	Validation ner rec: 0.42401129943502824

	Validation ner f1: 0.4184026391998538



  0%|          | 1/354 [00:05<32:05,  5.46s/it][A

	loss_cls: tensor(0.3904, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1297, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5200, device='cuda:0', grad_fn=<AddBackward0>)



  1%|          | 2/354 [00:10<31:39,  5.40s/it][A

	loss_cls: tensor(0.6030, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3082, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9113, device='cuda:0', grad_fn=<AddBackward0>)



  1%|          | 3/354 [00:16<31:30,  5.39s/it][A

	loss_cls: tensor(0.4355, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2580, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6935, device='cuda:0', grad_fn=<AddBackward0>)



  1%|          | 4/354 [00:21<31:31,  5.40s/it][A

	loss_cls: tensor(0.5673, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5126, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0798, device='cuda:0', grad_fn=<AddBackward0>)



  1%|▏         | 5/354 [00:26<31:17,  5.38s/it][A

	loss_cls: tensor(0.5362, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3332, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8694, device='cuda:0', grad_fn=<AddBackward0>)



  2%|▏         | 6/354 [00:32<31:22,  5.41s/it][A

	loss_cls: tensor(0.5627, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1215, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6842, device='cuda:0', grad_fn=<AddBackward0>)



  2%|▏         | 7/354 [00:37<31:13,  5.40s/it][A

	loss_cls: tensor(0.5981, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3813, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9794, device='cuda:0', grad_fn=<AddBackward0>)



  2%|▏         | 8/354 [00:43<31:12,  5.41s/it][A

	loss_cls: tensor(0.4833, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2235, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7068, device='cuda:0', grad_fn=<AddBackward0>)



  3%|▎         | 9/354 [00:48<31:02,  5.40s/it][A

	loss_cls: tensor(0.4877, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0596, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5472, device='cuda:0', grad_fn=<AddBackward0>)



  3%|▎         | 10/354 [00:53<30:55,  5.39s/it][A

	loss_cls: tensor(0.3502, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1290, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4791, device='cuda:0', grad_fn=<AddBackward0>)



  3%|▎         | 11/354 [00:59<30:56,  5.41s/it][A

	loss_cls: tensor(0.5043, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0805, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5848, device='cuda:0', grad_fn=<AddBackward0>)



  3%|▎         | 12/354 [01:04<30:43,  5.39s/it][A

	loss_cls: tensor(0.4900, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2126, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7026, device='cuda:0', grad_fn=<AddBackward0>)



  4%|▎         | 13/354 [01:10<30:46,  5.42s/it][A

	loss_cls: tensor(0.5646, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2579, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8225, device='cuda:0', grad_fn=<AddBackward0>)



  4%|▍         | 14/354 [01:15<30:36,  5.40s/it][A

	loss_cls: tensor(0.4176, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2375, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6552, device='cuda:0', grad_fn=<AddBackward0>)



  4%|▍         | 15/354 [01:21<30:36,  5.42s/it][A

	loss_cls: tensor(1.1383, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1596, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2979, device='cuda:0', grad_fn=<AddBackward0>)



  5%|▍         | 16/354 [01:26<30:27,  5.41s/it][A

	loss_cls: tensor(0.7253, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1062, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8315, device='cuda:0', grad_fn=<AddBackward0>)



  5%|▍         | 17/354 [01:31<30:20,  5.40s/it][A

	loss_cls: tensor(0.9739, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3544, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.3283, device='cuda:0', grad_fn=<AddBackward0>)



  5%|▌         | 18/354 [01:37<30:19,  5.41s/it][A

	loss_cls: tensor(0.6933, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2276, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9209, device='cuda:0', grad_fn=<AddBackward0>)



  5%|▌         | 19/354 [01:42<30:07,  5.40s/it][A

	loss_cls: tensor(0.5282, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2272, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7554, device='cuda:0', grad_fn=<AddBackward0>)



  6%|▌         | 20/354 [01:48<30:06,  5.41s/it][A

	loss_cls: tensor(0.5812, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2728, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8540, device='cuda:0', grad_fn=<AddBackward0>)



  6%|▌         | 21/354 [01:53<29:57,  5.40s/it][A

	loss_cls: tensor(0.6193, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3176, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9369, device='cuda:0', grad_fn=<AddBackward0>)



  6%|▌         | 22/354 [01:58<29:57,  5.42s/it][A

	loss_cls: tensor(0.4840, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1902, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6742, device='cuda:0', grad_fn=<AddBackward0>)



  6%|▋         | 23/354 [02:04<29:48,  5.40s/it][A

	loss_cls: tensor(0.4845, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1843, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6688, device='cuda:0', grad_fn=<AddBackward0>)



  7%|▋         | 24/354 [02:09<29:37,  5.39s/it][A

	loss_cls: tensor(0.4512, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1519, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6032, device='cuda:0', grad_fn=<AddBackward0>)



  7%|▋         | 25/354 [02:15<29:36,  5.40s/it][A

	loss_cls: tensor(0.6561, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1340, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7901, device='cuda:0', grad_fn=<AddBackward0>)



  7%|▋         | 26/354 [02:20<29:27,  5.39s/it][A

	loss_cls: tensor(0.4613, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1470, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6082, device='cuda:0', grad_fn=<AddBackward0>)



  8%|▊         | 27/354 [02:25<29:27,  5.40s/it][A

	loss_cls: tensor(0.6587, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1827, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8414, device='cuda:0', grad_fn=<AddBackward0>)



  8%|▊         | 28/354 [02:31<29:19,  5.40s/it][A

	loss_cls: tensor(0.5897, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0791, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6688, device='cuda:0', grad_fn=<AddBackward0>)



  8%|▊         | 29/354 [02:36<29:19,  5.41s/it][A

	loss_cls: tensor(0.4522, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0870, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5392, device='cuda:0', grad_fn=<AddBackward0>)



  8%|▊         | 30/354 [02:42<29:09,  5.40s/it][A

	loss_cls: tensor(0.6048, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5098, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1146, device='cuda:0', grad_fn=<AddBackward0>)



  9%|▉         | 31/354 [02:47<29:00,  5.39s/it][A

	loss_cls: tensor(0.5090, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1714, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6804, device='cuda:0', grad_fn=<AddBackward0>)



  9%|▉         | 32/354 [02:52<29:00,  5.40s/it][A

	loss_cls: tensor(0.4689, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3972, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8661, device='cuda:0', grad_fn=<AddBackward0>)



  9%|▉         | 33/354 [02:58<29:02,  5.43s/it][A

	loss_cls: tensor(0.5595, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1907, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7502, device='cuda:0', grad_fn=<AddBackward0>)



 10%|▉         | 34/354 [03:03<29:10,  5.47s/it][A

	loss_cls: tensor(0.4569, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1264, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5833, device='cuda:0', grad_fn=<AddBackward0>)



 10%|▉         | 35/354 [03:09<29:05,  5.47s/it][A

	loss_cls: tensor(0.5533, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1591, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7125, device='cuda:0', grad_fn=<AddBackward0>)



 10%|█         | 36/354 [03:14<29:08,  5.50s/it][A

	loss_cls: tensor(0.4283, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4190, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8472, device='cuda:0', grad_fn=<AddBackward0>)



 10%|█         | 37/354 [03:20<28:57,  5.48s/it][A

	loss_cls: tensor(0.4322, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3586, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7908, device='cuda:0', grad_fn=<AddBackward0>)



 11%|█         | 38/354 [03:25<28:56,  5.50s/it][A

	loss_cls: tensor(0.5420, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2534, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7954, device='cuda:0', grad_fn=<AddBackward0>)



 11%|█         | 39/354 [03:31<28:53,  5.50s/it][A

	loss_cls: tensor(0.4683, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3771, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8454, device='cuda:0', grad_fn=<AddBackward0>)



 11%|█▏        | 40/354 [03:36<28:35,  5.46s/it][A

	loss_cls: tensor(0.9121, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3227, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2348, device='cuda:0', grad_fn=<AddBackward0>)



 12%|█▏        | 41/354 [03:42<28:28,  5.46s/it][A

	loss_cls: tensor(0.6709, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2679, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9389, device='cuda:0', grad_fn=<AddBackward0>)



 12%|█▏        | 42/354 [03:47<28:11,  5.42s/it][A

	loss_cls: tensor(0.6149, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3481, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9630, device='cuda:0', grad_fn=<AddBackward0>)



 12%|█▏        | 43/354 [03:53<28:05,  5.42s/it][A

	loss_cls: tensor(0.6781, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3395, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0176, device='cuda:0', grad_fn=<AddBackward0>)



 12%|█▏        | 44/354 [03:58<27:54,  5.40s/it][A

	loss_cls: tensor(0.4885, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1511, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6397, device='cuda:0', grad_fn=<AddBackward0>)



 13%|█▎        | 45/354 [04:03<27:42,  5.38s/it][A

	loss_cls: tensor(0.6160, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3160, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9320, device='cuda:0', grad_fn=<AddBackward0>)



 13%|█▎        | 46/354 [04:09<27:39,  5.39s/it][A

	loss_cls: tensor(0.6847, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1310, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8157, device='cuda:0', grad_fn=<AddBackward0>)



 13%|█▎        | 47/354 [04:14<27:29,  5.37s/it][A

	loss_cls: tensor(0.6726, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1468, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8194, device='cuda:0', grad_fn=<AddBackward0>)



 14%|█▎        | 48/354 [04:19<27:29,  5.39s/it][A

	loss_cls: tensor(0.7615, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2323, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9938, device='cuda:0', grad_fn=<AddBackward0>)



 14%|█▍        | 49/354 [04:25<27:19,  5.37s/it][A

	loss_cls: tensor(1.1859, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1059, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2918, device='cuda:0', grad_fn=<AddBackward0>)



 14%|█▍        | 50/354 [04:30<27:17,  5.39s/it][A

	loss_cls: tensor(0.3522, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2603, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6125, device='cuda:0', grad_fn=<AddBackward0>)



 14%|█▍        | 51/354 [04:35<27:06,  5.37s/it][A

	loss_cls: tensor(0.7843, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1345, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9187, device='cuda:0', grad_fn=<AddBackward0>)



 15%|█▍        | 52/354 [04:41<26:57,  5.36s/it][A

	loss_cls: tensor(0.7468, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1213, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8681, device='cuda:0', grad_fn=<AddBackward0>)



 15%|█▍        | 53/354 [04:46<26:57,  5.37s/it][A

	loss_cls: tensor(0.7261, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0887, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8148, device='cuda:0', grad_fn=<AddBackward0>)



 15%|█▌        | 54/354 [04:52<26:48,  5.36s/it][A

	loss_cls: tensor(0.8339, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3238, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1577, device='cuda:0', grad_fn=<AddBackward0>)



 16%|█▌        | 55/354 [04:57<26:47,  5.38s/it][A

	loss_cls: tensor(0.6050, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1098, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7147, device='cuda:0', grad_fn=<AddBackward0>)



 16%|█▌        | 56/354 [05:02<26:39,  5.37s/it][A

	loss_cls: tensor(0.5605, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1582, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7187, device='cuda:0', grad_fn=<AddBackward0>)



 16%|█▌        | 57/354 [05:08<26:38,  5.38s/it][A

	loss_cls: tensor(0.6845, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4078, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0923, device='cuda:0', grad_fn=<AddBackward0>)



 16%|█▋        | 58/354 [05:13<26:31,  5.38s/it][A

	loss_cls: tensor(0.8249, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4214, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2463, device='cuda:0', grad_fn=<AddBackward0>)



 17%|█▋        | 59/354 [05:19<26:32,  5.40s/it][A

	loss_cls: tensor(0.7557, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2948, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0505, device='cuda:0', grad_fn=<AddBackward0>)



 17%|█▋        | 60/354 [05:24<26:23,  5.39s/it][A

	loss_cls: tensor(0.4989, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3814, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8803, device='cuda:0', grad_fn=<AddBackward0>)



 17%|█▋        | 61/354 [05:29<26:13,  5.37s/it][A

	loss_cls: tensor(0.5883, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2779, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8662, device='cuda:0', grad_fn=<AddBackward0>)



 18%|█▊        | 62/354 [05:35<26:16,  5.40s/it][A

	loss_cls: tensor(0.4258, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4585, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8844, device='cuda:0', grad_fn=<AddBackward0>)



 18%|█▊        | 63/354 [05:40<26:05,  5.38s/it][A

	loss_cls: tensor(0.6996, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3870, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0866, device='cuda:0', grad_fn=<AddBackward0>)



 18%|█▊        | 64/354 [05:45<26:03,  5.39s/it][A

	loss_cls: tensor(0.6519, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0657, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7176, device='cuda:0', grad_fn=<AddBackward0>)



 18%|█▊        | 65/354 [05:51<25:51,  5.37s/it][A

	loss_cls: tensor(0.5480, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0586, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6066, device='cuda:0', grad_fn=<AddBackward0>)



 19%|█▊        | 66/354 [05:56<25:52,  5.39s/it][A

	loss_cls: tensor(0.5318, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1652, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6970, device='cuda:0', grad_fn=<AddBackward0>)



 19%|█▉        | 67/354 [06:02<25:42,  5.38s/it][A

	loss_cls: tensor(0.5412, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3069, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8481, device='cuda:0', grad_fn=<AddBackward0>)



 19%|█▉        | 68/354 [06:07<25:36,  5.37s/it][A

	loss_cls: tensor(0.6648, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3384, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0033, device='cuda:0', grad_fn=<AddBackward0>)



 19%|█▉        | 69/354 [06:12<25:34,  5.38s/it][A

	loss_cls: tensor(0.9688, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1985, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1673, device='cuda:0', grad_fn=<AddBackward0>)



 20%|█▉        | 70/354 [06:18<25:25,  5.37s/it][A

	loss_cls: tensor(0.5550, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1156, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6706, device='cuda:0', grad_fn=<AddBackward0>)



 20%|██        | 71/354 [06:23<25:20,  5.37s/it][A

	loss_cls: tensor(0.6152, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1343, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7495, device='cuda:0', grad_fn=<AddBackward0>)



 20%|██        | 72/354 [06:28<25:10,  5.36s/it][A

	loss_cls: tensor(0.4766, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0940, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5706, device='cuda:0', grad_fn=<AddBackward0>)



 21%|██        | 73/354 [06:34<25:13,  5.39s/it][A

	loss_cls: tensor(0.5445, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2008, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7452, device='cuda:0', grad_fn=<AddBackward0>)



 21%|██        | 74/354 [06:39<25:02,  5.37s/it][A

	loss_cls: tensor(0.5317, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2058, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7375, device='cuda:0', grad_fn=<AddBackward0>)



 21%|██        | 75/354 [06:44<24:54,  5.36s/it][A

	loss_cls: tensor(0.7298, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2489, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9787, device='cuda:0', grad_fn=<AddBackward0>)



 21%|██▏       | 76/354 [06:50<24:50,  5.36s/it][A

	loss_cls: tensor(0.6858, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3010, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9868, device='cuda:0', grad_fn=<AddBackward0>)



 22%|██▏       | 77/354 [06:55<24:42,  5.35s/it][A

	loss_cls: tensor(0.5209, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3115, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8325, device='cuda:0', grad_fn=<AddBackward0>)



 22%|██▏       | 78/354 [07:01<24:40,  5.36s/it][A

	loss_cls: tensor(0.5830, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0333, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6163, device='cuda:0', grad_fn=<AddBackward0>)



 22%|██▏       | 79/354 [07:06<24:31,  5.35s/it][A

	loss_cls: tensor(0.6624, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1971, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8595, device='cuda:0', grad_fn=<AddBackward0>)



 23%|██▎       | 80/354 [07:11<24:32,  5.37s/it][A

	loss_cls: tensor(0.6965, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4334, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1298, device='cuda:0', grad_fn=<AddBackward0>)



 23%|██▎       | 81/354 [07:17<24:24,  5.36s/it][A

	loss_cls: tensor(0.7934, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1789, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9723, device='cuda:0', grad_fn=<AddBackward0>)



 23%|██▎       | 82/354 [07:22<24:14,  5.35s/it][A

	loss_cls: tensor(0.5371, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2821, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8192, device='cuda:0', grad_fn=<AddBackward0>)



 23%|██▎       | 83/354 [07:27<24:14,  5.37s/it][A

	loss_cls: tensor(0.9196, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1786, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0982, device='cuda:0', grad_fn=<AddBackward0>)



 24%|██▎       | 84/354 [07:33<24:06,  5.36s/it][A

	loss_cls: tensor(0.7138, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1034, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8172, device='cuda:0', grad_fn=<AddBackward0>)



 24%|██▍       | 85/354 [07:38<24:08,  5.38s/it][A

	loss_cls: tensor(0.7108, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1185, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8293, device='cuda:0', grad_fn=<AddBackward0>)



 24%|██▍       | 86/354 [07:43<23:59,  5.37s/it][A

	loss_cls: tensor(0.8705, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0675, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9380, device='cuda:0', grad_fn=<AddBackward0>)



 25%|██▍       | 87/354 [07:49<23:56,  5.38s/it][A

	loss_cls: tensor(0.4735, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3385, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8120, device='cuda:0', grad_fn=<AddBackward0>)



 25%|██▍       | 88/354 [07:54<23:48,  5.37s/it][A

	loss_cls: tensor(0.6793, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1679, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8472, device='cuda:0', grad_fn=<AddBackward0>)



 25%|██▌       | 89/354 [08:00<23:39,  5.36s/it][A

	loss_cls: tensor(0.4044, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2065, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6108, device='cuda:0', grad_fn=<AddBackward0>)



 25%|██▌       | 90/354 [08:05<23:36,  5.37s/it][A

	loss_cls: tensor(0.6639, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0755, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7394, device='cuda:0', grad_fn=<AddBackward0>)



 26%|██▌       | 91/354 [08:10<23:25,  5.34s/it][A

	loss_cls: tensor(0.6791, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2091, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8881, device='cuda:0', grad_fn=<AddBackward0>)



 26%|██▌       | 92/354 [08:16<23:22,  5.35s/it][A

	loss_cls: tensor(0.6561, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0376, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6937, device='cuda:0', grad_fn=<AddBackward0>)



 26%|██▋       | 93/354 [08:21<23:16,  5.35s/it][A

	loss_cls: tensor(0.5262, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0903, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6166, device='cuda:0', grad_fn=<AddBackward0>)



 27%|██▋       | 94/354 [08:26<23:16,  5.37s/it][A

	loss_cls: tensor(0.6068, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1006, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7074, device='cuda:0', grad_fn=<AddBackward0>)



 27%|██▋       | 95/354 [08:32<23:09,  5.37s/it][A

	loss_cls: tensor(0.7082, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0923, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8005, device='cuda:0', grad_fn=<AddBackward0>)



 27%|██▋       | 96/354 [08:37<23:01,  5.35s/it][A

	loss_cls: tensor(0.4248, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0422, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4670, device='cuda:0', grad_fn=<AddBackward0>)



 27%|██▋       | 97/354 [08:42<23:00,  5.37s/it][A

	loss_cls: tensor(0.7647, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3263, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0910, device='cuda:0', grad_fn=<AddBackward0>)



 28%|██▊       | 98/354 [08:48<22:52,  5.36s/it][A

	loss_cls: tensor(0.7631, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5317, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2948, device='cuda:0', grad_fn=<AddBackward0>)



 28%|██▊       | 99/354 [08:53<22:49,  5.37s/it][A

	loss_cls: tensor(0.4953, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2794, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7747, device='cuda:0', grad_fn=<AddBackward0>)



 28%|██▊       | 100/354 [08:59<22:43,  5.37s/it][A

	loss_cls: tensor(0.7206, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2586, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9792, device='cuda:0', grad_fn=<AddBackward0>)



 29%|██▊       | 101/354 [09:04<22:43,  5.39s/it][A

	loss_cls: tensor(0.6352, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3312, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9664, device='cuda:0', grad_fn=<AddBackward0>)



 29%|██▉       | 102/354 [09:09<22:33,  5.37s/it][A

	loss_cls: tensor(0.4188, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1461, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5648, device='cuda:0', grad_fn=<AddBackward0>)



 29%|██▉       | 103/354 [09:15<22:25,  5.36s/it][A

	loss_cls: tensor(0.4895, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3856, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8751, device='cuda:0', grad_fn=<AddBackward0>)



 29%|██▉       | 104/354 [09:20<22:22,  5.37s/it][A

	loss_cls: tensor(0.4964, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2124, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7087, device='cuda:0', grad_fn=<AddBackward0>)



 30%|██▉       | 105/354 [09:25<22:12,  5.35s/it][A

	loss_cls: tensor(0.4401, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0785, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5186, device='cuda:0', grad_fn=<AddBackward0>)



 30%|██▉       | 106/354 [09:31<22:12,  5.37s/it][A

	loss_cls: tensor(0.5055, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1843, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6899, device='cuda:0', grad_fn=<AddBackward0>)



 30%|███       | 107/354 [09:36<22:03,  5.36s/it][A

	loss_cls: tensor(0.5389, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3782, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9171, device='cuda:0', grad_fn=<AddBackward0>)



 31%|███       | 108/354 [09:42<22:05,  5.39s/it][A

	loss_cls: tensor(0.5203, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2847, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8051, device='cuda:0', grad_fn=<AddBackward0>)



 31%|███       | 109/354 [09:47<21:57,  5.38s/it][A

	loss_cls: tensor(0.5316, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1098, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6414, device='cuda:0', grad_fn=<AddBackward0>)



 31%|███       | 110/354 [09:52<21:50,  5.37s/it][A

	loss_cls: tensor(0.6105, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2534, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8639, device='cuda:0', grad_fn=<AddBackward0>)



 31%|███▏      | 111/354 [09:58<21:47,  5.38s/it][A

	loss_cls: tensor(0.7243, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2943, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0187, device='cuda:0', grad_fn=<AddBackward0>)



 32%|███▏      | 112/354 [10:03<21:35,  5.35s/it][A

	loss_cls: tensor(0.6516, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2254, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8770, device='cuda:0', grad_fn=<AddBackward0>)



 32%|███▏      | 113/354 [10:08<21:34,  5.37s/it][A

	loss_cls: tensor(0.5877, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1702, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7579, device='cuda:0', grad_fn=<AddBackward0>)



 32%|███▏      | 114/354 [10:14<21:23,  5.35s/it][A

	loss_cls: tensor(1.0045, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2302, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2347, device='cuda:0', grad_fn=<AddBackward0>)



 32%|███▏      | 115/354 [10:19<21:20,  5.36s/it][A

	loss_cls: tensor(0.5661, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0589, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6249, device='cuda:0', grad_fn=<AddBackward0>)



 33%|███▎      | 116/354 [10:24<21:12,  5.35s/it][A

	loss_cls: tensor(0.4880, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3005, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7886, device='cuda:0', grad_fn=<AddBackward0>)



 33%|███▎      | 117/354 [10:30<21:05,  5.34s/it][A

	loss_cls: tensor(0.5111, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2534, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7645, device='cuda:0', grad_fn=<AddBackward0>)



 33%|███▎      | 118/354 [10:35<21:03,  5.35s/it][A

	loss_cls: tensor(0.5878, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1678, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7556, device='cuda:0', grad_fn=<AddBackward0>)



 34%|███▎      | 119/354 [10:40<20:55,  5.34s/it][A

	loss_cls: tensor(0.3908, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2565, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6473, device='cuda:0', grad_fn=<AddBackward0>)



 34%|███▍      | 120/354 [10:46<20:55,  5.36s/it][A

	loss_cls: tensor(0.4689, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1119, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5808, device='cuda:0', grad_fn=<AddBackward0>)



 34%|███▍      | 121/354 [10:51<20:48,  5.36s/it][A

	loss_cls: tensor(0.5366, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2165, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7531, device='cuda:0', grad_fn=<AddBackward0>)



 34%|███▍      | 122/354 [10:57<20:46,  5.37s/it][A

	loss_cls: tensor(0.6411, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3071, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9483, device='cuda:0', grad_fn=<AddBackward0>)



 35%|███▍      | 123/354 [11:02<20:37,  5.36s/it][A

	loss_cls: tensor(0.6120, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3554, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9674, device='cuda:0', grad_fn=<AddBackward0>)



 35%|███▌      | 124/354 [11:07<20:30,  5.35s/it][A

	loss_cls: tensor(0.5637, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2387, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8024, device='cuda:0', grad_fn=<AddBackward0>)



 35%|███▌      | 125/354 [11:13<20:29,  5.37s/it][A

	loss_cls: tensor(0.4402, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2366, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6768, device='cuda:0', grad_fn=<AddBackward0>)



 36%|███▌      | 126/354 [11:18<20:20,  5.35s/it][A

	loss_cls: tensor(0.6246, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4737, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0983, device='cuda:0', grad_fn=<AddBackward0>)



 36%|███▌      | 127/354 [11:23<20:20,  5.38s/it][A

	loss_cls: tensor(0.4068, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3076, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7144, device='cuda:0', grad_fn=<AddBackward0>)



 36%|███▌      | 128/354 [11:29<20:12,  5.36s/it][A

	loss_cls: tensor(0.7825, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2856, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0681, device='cuda:0', grad_fn=<AddBackward0>)



 36%|███▋      | 129/354 [11:34<20:09,  5.38s/it][A

	loss_cls: tensor(0.4558, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1378, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5936, device='cuda:0', grad_fn=<AddBackward0>)



 37%|███▋      | 130/354 [11:39<20:00,  5.36s/it][A

	loss_cls: tensor(0.6325, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3042, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9367, device='cuda:0', grad_fn=<AddBackward0>)



 37%|███▋      | 131/354 [11:45<19:55,  5.36s/it][A

	loss_cls: tensor(0.8201, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2020, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0221, device='cuda:0', grad_fn=<AddBackward0>)



 37%|███▋      | 132/354 [11:50<19:52,  5.37s/it][A

	loss_cls: tensor(0.5999, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2001, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8000, device='cuda:0', grad_fn=<AddBackward0>)



 38%|███▊      | 133/354 [11:56<19:44,  5.36s/it][A

	loss_cls: tensor(0.6585, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2453, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9038, device='cuda:0', grad_fn=<AddBackward0>)



 38%|███▊      | 134/354 [12:01<19:44,  5.39s/it][A

	loss_cls: tensor(0.5534, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1331, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6865, device='cuda:0', grad_fn=<AddBackward0>)



 38%|███▊      | 135/354 [12:06<19:37,  5.38s/it][A

	loss_cls: tensor(0.5701, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2529, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8230, device='cuda:0', grad_fn=<AddBackward0>)



 38%|███▊      | 136/354 [12:12<19:34,  5.39s/it][A

	loss_cls: tensor(0.6383, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3571, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9954, device='cuda:0', grad_fn=<AddBackward0>)



 39%|███▊      | 137/354 [12:17<19:26,  5.37s/it][A

	loss_cls: tensor(0.3961, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2687, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6648, device='cuda:0', grad_fn=<AddBackward0>)



 39%|███▉      | 138/354 [12:22<19:17,  5.36s/it][A

	loss_cls: tensor(0.4688, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1673, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6361, device='cuda:0', grad_fn=<AddBackward0>)



 39%|███▉      | 139/354 [12:28<19:16,  5.38s/it][A

	loss_cls: tensor(0.4519, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2791, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7310, device='cuda:0', grad_fn=<AddBackward0>)



 40%|███▉      | 140/354 [12:33<19:10,  5.38s/it][A

	loss_cls: tensor(0.7577, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0520, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8097, device='cuda:0', grad_fn=<AddBackward0>)



 40%|███▉      | 141/354 [12:39<19:08,  5.39s/it][A

	loss_cls: tensor(0.5734, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0440, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6174, device='cuda:0', grad_fn=<AddBackward0>)



 40%|████      | 142/354 [12:44<18:58,  5.37s/it][A

	loss_cls: tensor(0.7343, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0420, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7764, device='cuda:0', grad_fn=<AddBackward0>)



 40%|████      | 143/354 [12:49<18:57,  5.39s/it][A

	loss_cls: tensor(0.5974, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0385, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6359, device='cuda:0', grad_fn=<AddBackward0>)



 41%|████      | 144/354 [12:55<18:47,  5.37s/it][A

	loss_cls: tensor(0.5599, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1873, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7472, device='cuda:0', grad_fn=<AddBackward0>)



 41%|████      | 145/354 [13:00<18:39,  5.36s/it][A

	loss_cls: tensor(0.6208, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2076, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8284, device='cuda:0', grad_fn=<AddBackward0>)



 41%|████      | 146/354 [13:05<18:39,  5.38s/it][A

	loss_cls: tensor(0.8228, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2069, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0297, device='cuda:0', grad_fn=<AddBackward0>)



 42%|████▏     | 147/354 [13:11<18:31,  5.37s/it][A

	loss_cls: tensor(1.0523, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4914, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.5437, device='cuda:0', grad_fn=<AddBackward0>)



 42%|████▏     | 148/354 [13:16<18:30,  5.39s/it][A

	loss_cls: tensor(0.4654, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2114, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6768, device='cuda:0', grad_fn=<AddBackward0>)



 42%|████▏     | 149/354 [13:22<18:20,  5.37s/it][A

	loss_cls: tensor(0.4459, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4987, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9446, device='cuda:0', grad_fn=<AddBackward0>)



 42%|████▏     | 150/354 [13:27<18:17,  5.38s/it][A

	loss_cls: tensor(0.7409, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3783, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1192, device='cuda:0', grad_fn=<AddBackward0>)



 43%|████▎     | 151/354 [13:32<18:08,  5.36s/it][A

	loss_cls: tensor(0.3862, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2492, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6354, device='cuda:0', grad_fn=<AddBackward0>)



 43%|████▎     | 152/354 [13:38<18:01,  5.35s/it][A

	loss_cls: tensor(0.5204, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2712, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7916, device='cuda:0', grad_fn=<AddBackward0>)



 43%|████▎     | 153/354 [13:43<18:00,  5.38s/it][A

	loss_cls: tensor(1.0783, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2257, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.3040, device='cuda:0', grad_fn=<AddBackward0>)



 44%|████▎     | 154/354 [13:48<17:56,  5.38s/it][A

	loss_cls: tensor(0.8181, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1397, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9578, device='cuda:0', grad_fn=<AddBackward0>)



 44%|████▍     | 155/354 [13:54<17:54,  5.40s/it][A

	loss_cls: tensor(0.5794, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3037, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8831, device='cuda:0', grad_fn=<AddBackward0>)



 44%|████▍     | 156/354 [13:59<17:47,  5.39s/it][A

	loss_cls: tensor(0.8162, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1673, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9835, device='cuda:0', grad_fn=<AddBackward0>)



 44%|████▍     | 157/354 [14:05<17:44,  5.40s/it][A

	loss_cls: tensor(1.5484, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1680, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.7163, device='cuda:0', grad_fn=<AddBackward0>)



 45%|████▍     | 158/354 [14:10<17:36,  5.39s/it][A

	loss_cls: tensor(0.5548, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4178, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9726, device='cuda:0', grad_fn=<AddBackward0>)



 45%|████▍     | 159/354 [14:15<17:32,  5.39s/it][A

	loss_cls: tensor(0.4215, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1933, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6148, device='cuda:0', grad_fn=<AddBackward0>)



 45%|████▌     | 160/354 [14:21<17:24,  5.38s/it][A

	loss_cls: tensor(0.5858, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3271, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9128, device='cuda:0', grad_fn=<AddBackward0>)



 45%|████▌     | 161/354 [14:26<17:16,  5.37s/it][A

	loss_cls: tensor(0.5436, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1986, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7421, device='cuda:0', grad_fn=<AddBackward0>)



 46%|████▌     | 162/354 [14:32<17:16,  5.40s/it][A

	loss_cls: tensor(0.4583, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1124, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5707, device='cuda:0', grad_fn=<AddBackward0>)



 46%|████▌     | 163/354 [14:37<17:07,  5.38s/it][A

	loss_cls: tensor(0.5680, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0636, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6316, device='cuda:0', grad_fn=<AddBackward0>)



 46%|████▋     | 164/354 [14:42<17:04,  5.39s/it][A

	loss_cls: tensor(0.6062, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5611, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1673, device='cuda:0', grad_fn=<AddBackward0>)



 47%|████▋     | 165/354 [14:48<16:54,  5.37s/it][A

	loss_cls: tensor(0.5275, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0582, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5858, device='cuda:0', grad_fn=<AddBackward0>)



 47%|████▋     | 166/354 [14:53<16:53,  5.39s/it][A

	loss_cls: tensor(0.6381, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0447, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6828, device='cuda:0', grad_fn=<AddBackward0>)



 47%|████▋     | 167/354 [14:59<16:46,  5.38s/it][A

	loss_cls: tensor(0.5666, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1975, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7641, device='cuda:0', grad_fn=<AddBackward0>)



 47%|████▋     | 168/354 [15:04<16:38,  5.37s/it][A

	loss_cls: tensor(0.7531, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1227, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8758, device='cuda:0', grad_fn=<AddBackward0>)



 48%|████▊     | 169/354 [15:09<16:36,  5.39s/it][A

	loss_cls: tensor(0.4822, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2819, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7641, device='cuda:0', grad_fn=<AddBackward0>)



 48%|████▊     | 170/354 [15:15<16:27,  5.37s/it][A

	loss_cls: tensor(0.4432, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0634, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5066, device='cuda:0', grad_fn=<AddBackward0>)



 48%|████▊     | 171/354 [15:20<16:25,  5.38s/it][A

	loss_cls: tensor(0.6772, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1553, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8325, device='cuda:0', grad_fn=<AddBackward0>)



 49%|████▊     | 172/354 [15:25<16:14,  5.36s/it][A

	loss_cls: tensor(0.4802, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0956, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5759, device='cuda:0', grad_fn=<AddBackward0>)



 49%|████▉     | 173/354 [15:31<16:13,  5.38s/it][A

	loss_cls: tensor(0.5164, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4088, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9252, device='cuda:0', grad_fn=<AddBackward0>)



 49%|████▉     | 174/354 [15:36<16:03,  5.35s/it][A

	loss_cls: tensor(0.5632, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5698, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1330, device='cuda:0', grad_fn=<AddBackward0>)



 49%|████▉     | 175/354 [15:41<15:56,  5.34s/it][A

	loss_cls: tensor(0.4781, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4192, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8973, device='cuda:0', grad_fn=<AddBackward0>)



 50%|████▉     | 176/354 [15:47<15:54,  5.36s/it][A

	loss_cls: tensor(0.4582, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3199, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7781, device='cuda:0', grad_fn=<AddBackward0>)



 50%|█████     | 177/354 [15:52<15:49,  5.36s/it][A

	loss_cls: tensor(0.5717, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0785, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6502, device='cuda:0', grad_fn=<AddBackward0>)



 50%|█████     | 178/354 [15:58<15:48,  5.39s/it][A

	loss_cls: tensor(1.0241, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2832, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.3073, device='cuda:0', grad_fn=<AddBackward0>)



 51%|█████     | 179/354 [16:03<15:40,  5.38s/it][A

	loss_cls: tensor(0.8189, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3659, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1847, device='cuda:0', grad_fn=<AddBackward0>)



 51%|█████     | 180/354 [16:08<15:38,  5.40s/it][A

	loss_cls: tensor(0.6566, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1300, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7866, device='cuda:0', grad_fn=<AddBackward0>)



 51%|█████     | 181/354 [16:14<15:31,  5.39s/it][A

	loss_cls: tensor(0.5992, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2479, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8471, device='cuda:0', grad_fn=<AddBackward0>)



 51%|█████▏    | 182/354 [16:19<15:23,  5.37s/it][A

	loss_cls: tensor(0.7082, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2523, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9605, device='cuda:0', grad_fn=<AddBackward0>)



 52%|█████▏    | 183/354 [16:24<15:20,  5.38s/it][A

	loss_cls: tensor(0.4472, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0677, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5148, device='cuda:0', grad_fn=<AddBackward0>)



 52%|█████▏    | 184/354 [16:30<15:12,  5.37s/it][A

	loss_cls: tensor(0.5670, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0957, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6628, device='cuda:0', grad_fn=<AddBackward0>)



 52%|█████▏    | 185/354 [16:35<15:09,  5.38s/it][A

	loss_cls: tensor(0.7462, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1868, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9330, device='cuda:0', grad_fn=<AddBackward0>)



 53%|█████▎    | 186/354 [16:41<15:00,  5.36s/it][A

	loss_cls: tensor(0.6534, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4224, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0759, device='cuda:0', grad_fn=<AddBackward0>)



 53%|█████▎    | 187/354 [16:46<14:58,  5.38s/it][A

	loss_cls: tensor(0.7109, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2519, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9628, device='cuda:0', grad_fn=<AddBackward0>)



 53%|█████▎    | 188/354 [16:51<14:51,  5.37s/it][A

	loss_cls: tensor(0.4677, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1965, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6643, device='cuda:0', grad_fn=<AddBackward0>)



 53%|█████▎    | 189/354 [16:57<14:45,  5.37s/it][A

	loss_cls: tensor(0.6224, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2160, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8384, device='cuda:0', grad_fn=<AddBackward0>)



 54%|█████▎    | 190/354 [17:02<14:42,  5.38s/it][A

	loss_cls: tensor(0.4235, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0943, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5177, device='cuda:0', grad_fn=<AddBackward0>)



 54%|█████▍    | 191/354 [17:07<14:35,  5.37s/it][A

	loss_cls: tensor(0.4245, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1219, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5463, device='cuda:0', grad_fn=<AddBackward0>)



 54%|█████▍    | 192/354 [17:13<14:31,  5.38s/it][A

	loss_cls: tensor(0.7151, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2502, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9653, device='cuda:0', grad_fn=<AddBackward0>)



 55%|█████▍    | 193/354 [17:18<14:23,  5.37s/it][A

	loss_cls: tensor(0.4781, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0407, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5188, device='cuda:0', grad_fn=<AddBackward0>)



 55%|█████▍    | 194/354 [17:24<14:21,  5.38s/it][A

	loss_cls: tensor(0.5536, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1921, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7457, device='cuda:0', grad_fn=<AddBackward0>)



 55%|█████▌    | 195/354 [17:29<14:14,  5.38s/it][A

	loss_cls: tensor(0.4938, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0371, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5308, device='cuda:0', grad_fn=<AddBackward0>)



 55%|█████▌    | 196/354 [17:34<14:08,  5.37s/it][A

	loss_cls: tensor(0.6536, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2312, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8848, device='cuda:0', grad_fn=<AddBackward0>)



 56%|█████▌    | 197/354 [17:40<14:06,  5.39s/it][A

	loss_cls: tensor(0.5662, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3250, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8912, device='cuda:0', grad_fn=<AddBackward0>)



 56%|█████▌    | 198/354 [17:45<13:59,  5.38s/it][A

	loss_cls: tensor(0.5173, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1695, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6868, device='cuda:0', grad_fn=<AddBackward0>)



 56%|█████▌    | 199/354 [17:51<13:57,  5.40s/it][A

	loss_cls: tensor(0.8838, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2878, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1716, device='cuda:0', grad_fn=<AddBackward0>)



 56%|█████▋    | 200/354 [17:56<13:51,  5.40s/it][A

	loss_cls: tensor(0.5343, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3299, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8642, device='cuda:0', grad_fn=<AddBackward0>)



 57%|█████▋    | 201/354 [18:01<13:48,  5.42s/it][A

	loss_cls: tensor(0.5700, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0244, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5944, device='cuda:0', grad_fn=<AddBackward0>)



 57%|█████▋    | 202/354 [18:07<13:41,  5.40s/it][A

	loss_cls: tensor(0.6609, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0997, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7606, device='cuda:0', grad_fn=<AddBackward0>)



 57%|█████▋    | 203/354 [18:12<13:34,  5.39s/it][A

	loss_cls: tensor(0.5328, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2070, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7398, device='cuda:0', grad_fn=<AddBackward0>)



 58%|█████▊    | 204/354 [18:18<13:31,  5.41s/it][A

	loss_cls: tensor(0.5502, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2431, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7934, device='cuda:0', grad_fn=<AddBackward0>)



 58%|█████▊    | 205/354 [18:23<13:22,  5.39s/it][A

	loss_cls: tensor(0.4718, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2626, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7345, device='cuda:0', grad_fn=<AddBackward0>)



 58%|█████▊    | 206/354 [18:28<13:19,  5.40s/it][A

	loss_cls: tensor(0.5500, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1849, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7349, device='cuda:0', grad_fn=<AddBackward0>)



 58%|█████▊    | 207/354 [18:34<13:12,  5.39s/it][A

	loss_cls: tensor(0.7670, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2267, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9936, device='cuda:0', grad_fn=<AddBackward0>)



 59%|█████▉    | 208/354 [18:39<13:08,  5.40s/it][A

	loss_cls: tensor(0.8911, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4041, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2952, device='cuda:0', grad_fn=<AddBackward0>)



 59%|█████▉    | 209/354 [18:44<13:00,  5.38s/it][A

	loss_cls: tensor(0.5566, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1831, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7397, device='cuda:0', grad_fn=<AddBackward0>)



 59%|█████▉    | 210/354 [18:50<12:53,  5.37s/it][A

	loss_cls: tensor(1.2057, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2359, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.4416, device='cuda:0', grad_fn=<AddBackward0>)



 60%|█████▉    | 211/354 [18:55<12:49,  5.38s/it][A

	loss_cls: tensor(0.8221, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3657, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1878, device='cuda:0', grad_fn=<AddBackward0>)



 60%|█████▉    | 212/354 [19:01<12:42,  5.37s/it][A

	loss_cls: tensor(0.4243, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4523, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8766, device='cuda:0', grad_fn=<AddBackward0>)



 60%|██████    | 213/354 [19:06<12:39,  5.39s/it][A

	loss_cls: tensor(0.7234, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2438, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9672, device='cuda:0', grad_fn=<AddBackward0>)



 60%|██████    | 214/354 [19:11<12:31,  5.37s/it][A

	loss_cls: tensor(0.7640, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2041, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9681, device='cuda:0', grad_fn=<AddBackward0>)



 61%|██████    | 215/354 [19:17<12:27,  5.38s/it][A

	loss_cls: tensor(0.6176, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1029, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7205, device='cuda:0', grad_fn=<AddBackward0>)



 61%|██████    | 216/354 [19:22<12:20,  5.37s/it][A

	loss_cls: tensor(0.4642, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2185, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6827, device='cuda:0', grad_fn=<AddBackward0>)



 61%|██████▏   | 217/354 [19:27<12:14,  5.36s/it][A

	loss_cls: tensor(0.4723, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0789, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5512, device='cuda:0', grad_fn=<AddBackward0>)



 62%|██████▏   | 218/354 [19:33<12:10,  5.37s/it][A

	loss_cls: tensor(0.6195, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1855, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8050, device='cuda:0', grad_fn=<AddBackward0>)



 62%|██████▏   | 219/354 [19:38<12:03,  5.36s/it][A

	loss_cls: tensor(0.5012, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3601, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8613, device='cuda:0', grad_fn=<AddBackward0>)



 62%|██████▏   | 220/354 [19:44<12:00,  5.37s/it][A

	loss_cls: tensor(0.4170, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1369, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5538, device='cuda:0', grad_fn=<AddBackward0>)



 62%|██████▏   | 221/354 [19:49<11:53,  5.36s/it][A

	loss_cls: tensor(0.5485, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3378, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8863, device='cuda:0', grad_fn=<AddBackward0>)



 63%|██████▎   | 222/354 [19:54<11:49,  5.38s/it][A

	loss_cls: tensor(0.4709, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0744, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5453, device='cuda:0', grad_fn=<AddBackward0>)



 63%|██████▎   | 223/354 [20:00<11:44,  5.38s/it][A

	loss_cls: tensor(0.5725, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3605, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9329, device='cuda:0', grad_fn=<AddBackward0>)



 63%|██████▎   | 224/354 [20:05<11:39,  5.38s/it][A

	loss_cls: tensor(0.4671, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1207, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5878, device='cuda:0', grad_fn=<AddBackward0>)



 64%|██████▎   | 225/354 [20:11<11:35,  5.39s/it][A

	loss_cls: tensor(0.4953, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1925, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6878, device='cuda:0', grad_fn=<AddBackward0>)



 64%|██████▍   | 226/354 [20:16<11:29,  5.39s/it][A

	loss_cls: tensor(0.4412, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3115, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7527, device='cuda:0', grad_fn=<AddBackward0>)



 64%|██████▍   | 227/354 [20:21<11:25,  5.40s/it][A

	loss_cls: tensor(0.6103, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5163, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1266, device='cuda:0', grad_fn=<AddBackward0>)



 64%|██████▍   | 228/354 [20:27<11:17,  5.38s/it][A

	loss_cls: tensor(0.6136, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4269, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0405, device='cuda:0', grad_fn=<AddBackward0>)



 65%|██████▍   | 229/354 [20:32<11:15,  5.40s/it][A

	loss_cls: tensor(0.6328, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0402, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6730, device='cuda:0', grad_fn=<AddBackward0>)



 65%|██████▍   | 230/354 [20:37<11:07,  5.38s/it][A

	loss_cls: tensor(0.5548, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1539, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7087, device='cuda:0', grad_fn=<AddBackward0>)



 65%|██████▌   | 231/354 [20:43<10:59,  5.36s/it][A

	loss_cls: tensor(0.5316, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3396, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8712, device='cuda:0', grad_fn=<AddBackward0>)



 66%|██████▌   | 232/354 [20:48<10:57,  5.39s/it][A

	loss_cls: tensor(0.8548, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2875, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1423, device='cuda:0', grad_fn=<AddBackward0>)



 66%|██████▌   | 233/354 [20:54<10:50,  5.38s/it][A

	loss_cls: tensor(0.5948, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1763, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7710, device='cuda:0', grad_fn=<AddBackward0>)



 66%|██████▌   | 234/354 [20:59<10:47,  5.39s/it][A

	loss_cls: tensor(0.4486, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3176, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7662, device='cuda:0', grad_fn=<AddBackward0>)



 66%|██████▋   | 235/354 [21:04<10:39,  5.38s/it][A

	loss_cls: tensor(0.3784, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3856, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7641, device='cuda:0', grad_fn=<AddBackward0>)



 67%|██████▋   | 236/354 [21:10<10:35,  5.38s/it][A

	loss_cls: tensor(0.7409, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1652, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9061, device='cuda:0', grad_fn=<AddBackward0>)



 67%|██████▋   | 237/354 [21:15<10:28,  5.37s/it][A

	loss_cls: tensor(0.9274, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2663, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1937, device='cuda:0', grad_fn=<AddBackward0>)



 67%|██████▋   | 238/354 [21:20<10:21,  5.35s/it][A

	loss_cls: tensor(0.5532, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3451, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8982, device='cuda:0', grad_fn=<AddBackward0>)



 68%|██████▊   | 239/354 [21:26<10:17,  5.37s/it][A

	loss_cls: tensor(0.5095, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1043, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6138, device='cuda:0', grad_fn=<AddBackward0>)



 68%|██████▊   | 240/354 [21:31<10:11,  5.37s/it][A

	loss_cls: tensor(0.5455, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2069, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7524, device='cuda:0', grad_fn=<AddBackward0>)



 68%|██████▊   | 241/354 [21:37<10:09,  5.39s/it][A

	loss_cls: tensor(0.6423, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1152, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7574, device='cuda:0', grad_fn=<AddBackward0>)



 68%|██████▊   | 242/354 [21:42<10:03,  5.39s/it][A

	loss_cls: tensor(1.2313, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2954, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.5266, device='cuda:0', grad_fn=<AddBackward0>)



 69%|██████▊   | 243/354 [21:47<09:58,  5.39s/it][A

	loss_cls: tensor(0.5627, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1783, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7409, device='cuda:0', grad_fn=<AddBackward0>)



 69%|██████▉   | 244/354 [21:53<09:50,  5.37s/it][A

	loss_cls: tensor(0.7105, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1820, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8925, device='cuda:0', grad_fn=<AddBackward0>)



 69%|██████▉   | 245/354 [21:58<09:43,  5.35s/it][A

	loss_cls: tensor(0.5826, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2137, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7963, device='cuda:0', grad_fn=<AddBackward0>)



 69%|██████▉   | 246/354 [22:03<09:40,  5.38s/it][A

	loss_cls: tensor(0.6181, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2748, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8929, device='cuda:0', grad_fn=<AddBackward0>)



 70%|██████▉   | 247/354 [22:09<09:35,  5.38s/it][A

	loss_cls: tensor(0.3795, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2600, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6395, device='cuda:0', grad_fn=<AddBackward0>)



 70%|███████   | 248/354 [22:14<09:32,  5.40s/it][A

	loss_cls: tensor(0.4465, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1626, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6091, device='cuda:0', grad_fn=<AddBackward0>)



 70%|███████   | 249/354 [22:20<09:25,  5.38s/it][A

	loss_cls: tensor(0.7038, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4034, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1072, device='cuda:0', grad_fn=<AddBackward0>)



 71%|███████   | 250/354 [22:25<09:20,  5.39s/it][A

	loss_cls: tensor(0.4780, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3754, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8534, device='cuda:0', grad_fn=<AddBackward0>)



 71%|███████   | 251/354 [22:30<09:14,  5.38s/it][A

	loss_cls: tensor(0.3993, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3545, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7538, device='cuda:0', grad_fn=<AddBackward0>)



 71%|███████   | 252/354 [22:36<09:07,  5.37s/it][A

	loss_cls: tensor(0.5156, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1825, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6981, device='cuda:0', grad_fn=<AddBackward0>)



 71%|███████▏  | 253/354 [22:41<09:05,  5.40s/it][A

	loss_cls: tensor(0.5128, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1842, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6971, device='cuda:0', grad_fn=<AddBackward0>)



 72%|███████▏  | 254/354 [22:47<08:59,  5.39s/it][A

	loss_cls: tensor(0.6114, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0534, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6648, device='cuda:0', grad_fn=<AddBackward0>)



 72%|███████▏  | 255/354 [22:52<08:55,  5.41s/it][A

	loss_cls: tensor(0.4980, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2698, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7677, device='cuda:0', grad_fn=<AddBackward0>)



 72%|███████▏  | 256/354 [22:57<08:47,  5.39s/it][A

	loss_cls: tensor(0.4737, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1156, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5893, device='cuda:0', grad_fn=<AddBackward0>)



 73%|███████▎  | 257/354 [23:03<08:43,  5.39s/it][A

	loss_cls: tensor(0.6987, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2367, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9355, device='cuda:0', grad_fn=<AddBackward0>)



 73%|███████▎  | 258/354 [23:08<08:36,  5.38s/it][A

	loss_cls: tensor(0.5440, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1444, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6884, device='cuda:0', grad_fn=<AddBackward0>)



 73%|███████▎  | 259/354 [23:14<08:31,  5.39s/it][A

	loss_cls: tensor(0.7535, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1689, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9224, device='cuda:0', grad_fn=<AddBackward0>)



 73%|███████▎  | 260/354 [23:19<08:25,  5.38s/it][A

	loss_cls: tensor(0.5216, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2598, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7813, device='cuda:0', grad_fn=<AddBackward0>)



 74%|███████▎  | 261/354 [23:24<08:18,  5.36s/it][A

	loss_cls: tensor(0.6078, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0511, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6590, device='cuda:0', grad_fn=<AddBackward0>)



 74%|███████▍  | 262/354 [23:30<08:15,  5.38s/it][A

	loss_cls: tensor(0.7436, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2166, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9602, device='cuda:0', grad_fn=<AddBackward0>)



 74%|███████▍  | 263/354 [23:35<08:07,  5.36s/it][A

	loss_cls: tensor(0.4622, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2820, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7443, device='cuda:0', grad_fn=<AddBackward0>)



 75%|███████▍  | 264/354 [23:40<08:03,  5.38s/it][A

	loss_cls: tensor(0.8874, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1312, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0185, device='cuda:0', grad_fn=<AddBackward0>)



 75%|███████▍  | 265/354 [23:46<07:57,  5.36s/it][A

	loss_cls: tensor(0.8739, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1926, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0665, device='cuda:0', grad_fn=<AddBackward0>)



 75%|███████▌  | 266/354 [23:51<07:53,  5.39s/it][A

	loss_cls: tensor(0.5266, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2089, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7356, device='cuda:0', grad_fn=<AddBackward0>)



 75%|███████▌  | 267/354 [23:56<07:47,  5.37s/it][A

	loss_cls: tensor(0.6066, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1446, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7512, device='cuda:0', grad_fn=<AddBackward0>)



 76%|███████▌  | 268/354 [24:02<07:41,  5.36s/it][A

	loss_cls: tensor(1.0352, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1076, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1428, device='cuda:0', grad_fn=<AddBackward0>)



 76%|███████▌  | 269/354 [24:07<07:37,  5.38s/it][A

	loss_cls: tensor(0.6938, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2660, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9597, device='cuda:0', grad_fn=<AddBackward0>)



 76%|███████▋  | 270/354 [24:13<07:31,  5.38s/it][A

	loss_cls: tensor(0.9278, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5003, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.4281, device='cuda:0', grad_fn=<AddBackward0>)



 77%|███████▋  | 271/354 [24:18<07:27,  5.39s/it][A

	loss_cls: tensor(0.6268, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3369, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9637, device='cuda:0', grad_fn=<AddBackward0>)



 77%|███████▋  | 272/354 [24:23<07:20,  5.38s/it][A

	loss_cls: tensor(0.8182, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2109, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0291, device='cuda:0', grad_fn=<AddBackward0>)



 77%|███████▋  | 273/354 [24:29<07:16,  5.39s/it][A

	loss_cls: tensor(0.6089, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4520, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0609, device='cuda:0', grad_fn=<AddBackward0>)



 77%|███████▋  | 274/354 [24:34<07:10,  5.38s/it][A

	loss_cls: tensor(0.5208, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3778, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8986, device='cuda:0', grad_fn=<AddBackward0>)



 78%|███████▊  | 275/354 [24:39<07:04,  5.37s/it][A

	loss_cls: tensor(0.9274, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3171, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2445, device='cuda:0', grad_fn=<AddBackward0>)



 78%|███████▊  | 276/354 [24:45<07:00,  5.39s/it][A

	loss_cls: tensor(0.5393, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4038, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9431, device='cuda:0', grad_fn=<AddBackward0>)



 78%|███████▊  | 277/354 [24:50<06:54,  5.38s/it][A

	loss_cls: tensor(0.7059, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1097, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8156, device='cuda:0', grad_fn=<AddBackward0>)



 79%|███████▊  | 278/354 [24:56<06:49,  5.39s/it][A

	loss_cls: tensor(0.7524, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1239, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8763, device='cuda:0', grad_fn=<AddBackward0>)



 79%|███████▉  | 279/354 [25:01<06:42,  5.37s/it][A

	loss_cls: tensor(0.5652, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3828, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9479, device='cuda:0', grad_fn=<AddBackward0>)



 79%|███████▉  | 280/354 [25:06<06:38,  5.39s/it][A

	loss_cls: tensor(0.6612, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0919, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7531, device='cuda:0', grad_fn=<AddBackward0>)



 79%|███████▉  | 281/354 [25:12<06:32,  5.38s/it][A

	loss_cls: tensor(0.5382, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2560, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7942, device='cuda:0', grad_fn=<AddBackward0>)



 80%|███████▉  | 282/354 [25:17<06:26,  5.37s/it][A

	loss_cls: tensor(0.4129, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3361, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7491, device='cuda:0', grad_fn=<AddBackward0>)



 80%|███████▉  | 283/354 [25:23<06:22,  5.38s/it][A

	loss_cls: tensor(0.6302, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1788, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8090, device='cuda:0', grad_fn=<AddBackward0>)



 80%|████████  | 284/354 [25:28<06:15,  5.36s/it][A

	loss_cls: tensor(0.6885, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2479, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9364, device='cuda:0', grad_fn=<AddBackward0>)



 81%|████████  | 285/354 [25:33<06:11,  5.38s/it][A

	loss_cls: tensor(0.7333, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1613, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8947, device='cuda:0', grad_fn=<AddBackward0>)



 81%|████████  | 286/354 [25:39<06:05,  5.37s/it][A

	loss_cls: tensor(0.5203, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2563, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7766, device='cuda:0', grad_fn=<AddBackward0>)



 81%|████████  | 287/354 [25:44<06:00,  5.39s/it][A

	loss_cls: tensor(0.5648, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3504, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9153, device='cuda:0', grad_fn=<AddBackward0>)



 81%|████████▏ | 288/354 [25:49<05:54,  5.38s/it][A

	loss_cls: tensor(0.6373, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0922, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7294, device='cuda:0', grad_fn=<AddBackward0>)



 82%|████████▏ | 289/354 [25:55<05:48,  5.36s/it][A

	loss_cls: tensor(0.3550, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2479, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6029, device='cuda:0', grad_fn=<AddBackward0>)



 82%|████████▏ | 290/354 [26:00<05:44,  5.38s/it][A

	loss_cls: tensor(0.5654, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1440, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7094, device='cuda:0', grad_fn=<AddBackward0>)



 82%|████████▏ | 291/354 [26:06<05:38,  5.37s/it][A

	loss_cls: tensor(0.6924, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2529, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9454, device='cuda:0', grad_fn=<AddBackward0>)



 82%|████████▏ | 292/354 [26:11<05:33,  5.38s/it][A

	loss_cls: tensor(0.6181, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1252, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7433, device='cuda:0', grad_fn=<AddBackward0>)



 83%|████████▎ | 293/354 [26:16<05:28,  5.39s/it][A

	loss_cls: tensor(0.5266, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2743, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8008, device='cuda:0', grad_fn=<AddBackward0>)



 83%|████████▎ | 294/354 [26:22<05:24,  5.40s/it][A

	loss_cls: tensor(0.7675, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2491, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0166, device='cuda:0', grad_fn=<AddBackward0>)



 83%|████████▎ | 295/354 [26:27<05:18,  5.39s/it][A

	loss_cls: tensor(0.3990, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3296, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7286, device='cuda:0', grad_fn=<AddBackward0>)



 84%|████████▎ | 296/354 [26:33<05:12,  5.39s/it][A

	loss_cls: tensor(0.5883, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1880, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7763, device='cuda:0', grad_fn=<AddBackward0>)



 84%|████████▍ | 297/354 [26:38<05:07,  5.40s/it][A

	loss_cls: tensor(0.6823, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3386, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0209, device='cuda:0', grad_fn=<AddBackward0>)



 84%|████████▍ | 298/354 [26:43<05:01,  5.38s/it][A

	loss_cls: tensor(0.5683, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2385, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8069, device='cuda:0', grad_fn=<AddBackward0>)



 84%|████████▍ | 299/354 [26:49<04:56,  5.39s/it][A

	loss_cls: tensor(0.5589, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1472, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7061, device='cuda:0', grad_fn=<AddBackward0>)



 85%|████████▍ | 300/354 [26:54<04:50,  5.38s/it][A

	loss_cls: tensor(0.5759, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0600, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6358, device='cuda:0', grad_fn=<AddBackward0>)



 85%|████████▌ | 301/354 [26:59<04:45,  5.39s/it][A

	loss_cls: tensor(0.7812, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1183, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8996, device='cuda:0', grad_fn=<AddBackward0>)



 85%|████████▌ | 302/354 [27:05<04:39,  5.38s/it][A

	loss_cls: tensor(0.3745, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1645, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5390, device='cuda:0', grad_fn=<AddBackward0>)



 86%|████████▌ | 303/354 [27:10<04:33,  5.36s/it][A

	loss_cls: tensor(0.4697, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1570, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6267, device='cuda:0', grad_fn=<AddBackward0>)



 86%|████████▌ | 304/354 [27:16<04:29,  5.39s/it][A

	loss_cls: tensor(0.6744, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2968, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9712, device='cuda:0', grad_fn=<AddBackward0>)



 86%|████████▌ | 305/354 [27:21<04:22,  5.37s/it][A

	loss_cls: tensor(0.4737, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2885, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7622, device='cuda:0', grad_fn=<AddBackward0>)



 86%|████████▋ | 306/354 [27:26<04:18,  5.39s/it][A

	loss_cls: tensor(0.5378, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0577, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5955, device='cuda:0', grad_fn=<AddBackward0>)



 87%|████████▋ | 307/354 [27:32<04:12,  5.38s/it][A

	loss_cls: tensor(0.5799, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1581, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7380, device='cuda:0', grad_fn=<AddBackward0>)



 87%|████████▋ | 308/354 [27:37<04:08,  5.40s/it][A

	loss_cls: tensor(0.6016, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5109, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1125, device='cuda:0', grad_fn=<AddBackward0>)



 87%|████████▋ | 309/354 [27:42<04:02,  5.39s/it][A

	loss_cls: tensor(0.5041, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2375, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7416, device='cuda:0', grad_fn=<AddBackward0>)



 88%|████████▊ | 310/354 [27:48<03:56,  5.37s/it][A

	loss_cls: tensor(0.5752, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1806, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7557, device='cuda:0', grad_fn=<AddBackward0>)



 88%|████████▊ | 311/354 [27:53<03:51,  5.38s/it][A

	loss_cls: tensor(0.6374, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1706, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8080, device='cuda:0', grad_fn=<AddBackward0>)



 88%|████████▊ | 312/354 [27:59<03:45,  5.36s/it][A

	loss_cls: tensor(0.8506, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0572, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9078, device='cuda:0', grad_fn=<AddBackward0>)



 88%|████████▊ | 313/354 [28:04<03:40,  5.38s/it][A

	loss_cls: tensor(0.6213, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1239, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7452, device='cuda:0', grad_fn=<AddBackward0>)



 89%|████████▊ | 314/354 [28:09<03:34,  5.37s/it][A

	loss_cls: tensor(0.5203, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1572, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6774, device='cuda:0', grad_fn=<AddBackward0>)



 89%|████████▉ | 315/354 [28:15<03:30,  5.39s/it][A

	loss_cls: tensor(0.5002, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4014, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9015, device='cuda:0', grad_fn=<AddBackward0>)



 89%|████████▉ | 316/354 [28:20<03:25,  5.40s/it][A

	loss_cls: tensor(0.5306, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1835, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7141, device='cuda:0', grad_fn=<AddBackward0>)



 90%|████████▉ | 317/354 [28:26<03:20,  5.43s/it][A

	loss_cls: tensor(0.5535, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3019, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8554, device='cuda:0', grad_fn=<AddBackward0>)



 90%|████████▉ | 318/354 [28:31<03:15,  5.42s/it][A

	loss_cls: tensor(0.6202, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1014, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7216, device='cuda:0', grad_fn=<AddBackward0>)



 90%|█████████ | 319/354 [28:36<03:08,  5.39s/it][A

	loss_cls: tensor(0.5873, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2168, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8041, device='cuda:0', grad_fn=<AddBackward0>)



 90%|█████████ | 320/354 [28:42<03:03,  5.39s/it][A

	loss_cls: tensor(0.5579, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4194, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9773, device='cuda:0', grad_fn=<AddBackward0>)



 91%|█████████ | 321/354 [28:47<02:57,  5.36s/it][A

	loss_cls: tensor(0.4929, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2230, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7159, device='cuda:0', grad_fn=<AddBackward0>)



 91%|█████████ | 322/354 [28:52<02:51,  5.37s/it][A

	loss_cls: tensor(0.5839, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2838, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8677, device='cuda:0', grad_fn=<AddBackward0>)



 91%|█████████ | 323/354 [28:58<02:46,  5.36s/it][A

	loss_cls: tensor(0.5234, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2340, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7575, device='cuda:0', grad_fn=<AddBackward0>)



 92%|█████████▏| 324/354 [29:03<02:40,  5.35s/it][A

	loss_cls: tensor(0.5248, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3414, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8662, device='cuda:0', grad_fn=<AddBackward0>)



 92%|█████████▏| 325/354 [29:09<02:35,  5.36s/it][A

	loss_cls: tensor(0.5298, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3079, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8377, device='cuda:0', grad_fn=<AddBackward0>)



 92%|█████████▏| 326/354 [29:14<02:29,  5.34s/it][A

	loss_cls: tensor(0.5097, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0531, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5628, device='cuda:0', grad_fn=<AddBackward0>)



 92%|█████████▏| 327/354 [29:19<02:24,  5.37s/it][A

	loss_cls: tensor(0.5374, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3225, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8599, device='cuda:0', grad_fn=<AddBackward0>)



 93%|█████████▎| 328/354 [29:25<02:19,  5.37s/it][A

	loss_cls: tensor(0.6111, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2008, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8118, device='cuda:0', grad_fn=<AddBackward0>)



 93%|█████████▎| 329/354 [29:30<02:14,  5.38s/it][A

	loss_cls: tensor(0.6693, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1295, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7988, device='cuda:0', grad_fn=<AddBackward0>)



 93%|█████████▎| 330/354 [29:35<02:08,  5.36s/it][A

	loss_cls: tensor(0.4548, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3454, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8003, device='cuda:0', grad_fn=<AddBackward0>)



 94%|█████████▎| 331/354 [29:41<02:02,  5.34s/it][A

	loss_cls: tensor(0.5746, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3275, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9021, device='cuda:0', grad_fn=<AddBackward0>)



 94%|█████████▍| 332/354 [29:46<01:57,  5.35s/it][A

	loss_cls: tensor(0.4435, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2418, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6853, device='cuda:0', grad_fn=<AddBackward0>)



 94%|█████████▍| 333/354 [29:51<01:52,  5.34s/it][A

	loss_cls: tensor(0.6072, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4236, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0308, device='cuda:0', grad_fn=<AddBackward0>)



 94%|█████████▍| 334/354 [29:57<01:47,  5.36s/it][A

	loss_cls: tensor(0.7882, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5021, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2903, device='cuda:0', grad_fn=<AddBackward0>)



 95%|█████████▍| 335/354 [30:02<01:41,  5.34s/it][A

	loss_cls: tensor(0.5145, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0550, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5695, device='cuda:0', grad_fn=<AddBackward0>)



 95%|█████████▍| 336/354 [30:07<01:36,  5.35s/it][A

	loss_cls: tensor(0.5845, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4095, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9940, device='cuda:0', grad_fn=<AddBackward0>)



 95%|█████████▌| 337/354 [30:13<01:30,  5.34s/it][A

	loss_cls: tensor(0.5130, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3397, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8527, device='cuda:0', grad_fn=<AddBackward0>)



 95%|█████████▌| 338/354 [30:18<01:25,  5.33s/it][A

	loss_cls: tensor(0.6306, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3716, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0022, device='cuda:0', grad_fn=<AddBackward0>)



 96%|█████████▌| 339/354 [30:23<01:20,  5.36s/it][A

	loss_cls: tensor(0.5613, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1787, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7400, device='cuda:0', grad_fn=<AddBackward0>)



 96%|█████████▌| 340/354 [30:29<01:15,  5.36s/it][A

	loss_cls: tensor(0.6554, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3265, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9819, device='cuda:0', grad_fn=<AddBackward0>)



 96%|█████████▋| 341/354 [30:34<01:10,  5.39s/it][A

	loss_cls: tensor(0.8630, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1766, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0397, device='cuda:0', grad_fn=<AddBackward0>)



 97%|█████████▋| 342/354 [30:40<01:04,  5.36s/it][A

	loss_cls: tensor(0.3555, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1909, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5464, device='cuda:0', grad_fn=<AddBackward0>)



 97%|█████████▋| 343/354 [30:45<00:59,  5.38s/it][A

	loss_cls: tensor(0.6243, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2605, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8848, device='cuda:0', grad_fn=<AddBackward0>)



 97%|█████████▋| 344/354 [30:50<00:53,  5.36s/it][A

	loss_cls: tensor(0.5087, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2604, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7691, device='cuda:0', grad_fn=<AddBackward0>)



 97%|█████████▋| 345/354 [30:56<00:48,  5.34s/it][A

	loss_cls: tensor(0.7170, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0557, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7727, device='cuda:0', grad_fn=<AddBackward0>)



 98%|█████████▊| 346/354 [31:01<00:42,  5.36s/it][A

	loss_cls: tensor(0.4795, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1730, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6525, device='cuda:0', grad_fn=<AddBackward0>)



 98%|█████████▊| 347/354 [31:06<00:37,  5.35s/it][A

	loss_cls: tensor(0.4870, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2605, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7475, device='cuda:0', grad_fn=<AddBackward0>)



 98%|█████████▊| 348/354 [31:12<00:32,  5.37s/it][A

	loss_cls: tensor(0.3436, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2800, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6237, device='cuda:0', grad_fn=<AddBackward0>)



 99%|█████████▊| 349/354 [31:17<00:26,  5.36s/it][A

	loss_cls: tensor(0.8155, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.6048, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.4203, device='cuda:0', grad_fn=<AddBackward0>)



 99%|█████████▉| 350/354 [31:22<00:21,  5.37s/it][A

	loss_cls: tensor(0.6052, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2108, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8159, device='cuda:0', grad_fn=<AddBackward0>)



 99%|█████████▉| 351/354 [31:28<00:16,  5.35s/it][A

	loss_cls: tensor(0.6406, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2393, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8799, device='cuda:0', grad_fn=<AddBackward0>)



 99%|█████████▉| 352/354 [31:33<00:10,  5.34s/it][A

	loss_cls: tensor(0.4820, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0395, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5215, device='cuda:0', grad_fn=<AddBackward0>)



100%|█████████▉| 353/354 [31:38<00:05,  5.36s/it][A

	loss_cls: tensor(0.4188, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0660, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4848, device='cuda:0', grad_fn=<AddBackward0>)



100%|██████████| 354/354 [31:41<00:00,  5.37s/it][A

  0%|          | 0/177 [00:00<?, ?it/s][A

	loss_cls: tensor(0.5886, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0321, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6207, device='cuda:0', grad_fn=<AddBackward0>)

	Training Loss: 0.8346751093190942

	Training cls acc: 0.6854990583804143

	Training cls prec: 0.5782282061943079

	Training cls rec: 0.622842776338539

	Training cls f1: 0.5369167690687878

--
	Training ner acc: 0.9551532371573729

	Training ner prec: 0.2786664879823223

	Training ner rec: 0.28674179659294985

	Training ner f1: 0.28219404579113977

	Current Learning rate:  0.00045714285714285713



  1%|          | 1/177 [00:00<02:10,  1.34it/s][A
  1%|          | 2/177 [00:01<02:09,  1.35it/s][A
  2%|▏         | 3/177 [00:02<02:06,  1.38it/s][A
  2%|▏         | 4/177 [00:02<02:00,  1.44it/s][A
  3%|▎         | 5/177 [00:03<02:01,  1.41it/s][A
  3%|▎         | 6/177 [00:04<02:02,  1.40it/s][A
  4%|▍         | 7/177 [00:04<01:57,  1.45it/s][A
  5%|▍         | 8/177 [00:05<01:58,  1.43it/s][A
  5%|▌         | 9/177 [00:06<01:59,  1.41it/s][A
  6%|▌         | 10/177 [00:07<01:59,  1.40it/s][A
  6%|▌         | 11/177 [00:07<01:54,  1.45it/s][A
  7%|▋         | 12/177 [00:08<01:56,  1.42it/s][A
  7%|▋         | 13/177 [00:09<01:56,  1.41it/s][A
  8%|▊         | 14/177 [00:09<01:52,  1.45it/s][A
  8%|▊         | 15/177 [00:10<01:52,  1.44it/s][A
  9%|▉         | 16/177 [00:11<01:53,  1.42it/s][A
 10%|▉         | 17/177 [00:12<01:54,  1.40it/s][A
 10%|█         | 18/177 [00:12<01:50,  1.44it/s][A
 11%|█         | 19/177 [00:13<01:51,  1.42it/s][A
 11%|█▏        | 20/


	Validation Loss: 0.7776200255263324

	Validation cls acc: 0.7551789077212807

	Validation cls prec: 0.6027845036319613

	Validation cls rec: 0.5852535647027173

	Validation cls f1: 0.5687952035409662

--
	Validation ner acc: 0.954235035275717

	Validation ner prec: 0.4380624339872068

	Validation ner rec: 0.4486817325800377

	Validation ner f1: 0.4431392818482719



  0%|          | 1/354 [00:05<31:19,  5.32s/it][A

	loss_cls: tensor(0.3951, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0575, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4526, device='cuda:0', grad_fn=<AddBackward0>)



  1%|          | 2/354 [00:10<31:37,  5.39s/it][A

	loss_cls: tensor(0.6507, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1810, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8317, device='cuda:0', grad_fn=<AddBackward0>)



  1%|          | 3/354 [00:16<31:26,  5.37s/it][A

	loss_cls: tensor(0.8505, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2312, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0816, device='cuda:0', grad_fn=<AddBackward0>)



  1%|          | 4/354 [00:21<31:28,  5.40s/it][A

	loss_cls: tensor(0.7626, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1790, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9416, device='cuda:0', grad_fn=<AddBackward0>)



  1%|▏         | 5/354 [00:26<31:14,  5.37s/it][A

	loss_cls: tensor(0.8681, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2247, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0928, device='cuda:0', grad_fn=<AddBackward0>)



  2%|▏         | 6/354 [00:32<31:04,  5.36s/it][A

	loss_cls: tensor(0.5999, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2989, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8988, device='cuda:0', grad_fn=<AddBackward0>)



  2%|▏         | 7/354 [00:37<31:03,  5.37s/it][A

	loss_cls: tensor(0.4757, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2355, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7112, device='cuda:0', grad_fn=<AddBackward0>)



  2%|▏         | 8/354 [00:42<30:57,  5.37s/it][A

	loss_cls: tensor(0.6240, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3143, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9383, device='cuda:0', grad_fn=<AddBackward0>)



  3%|▎         | 9/354 [00:48<31:01,  5.40s/it][A

	loss_cls: tensor(0.9639, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2603, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2241, device='cuda:0', grad_fn=<AddBackward0>)



  3%|▎         | 10/354 [00:53<30:48,  5.37s/it][A

	loss_cls: tensor(0.9194, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1771, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0965, device='cuda:0', grad_fn=<AddBackward0>)



  3%|▎         | 11/354 [00:59<30:48,  5.39s/it][A

	loss_cls: tensor(0.6886, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2316, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9201, device='cuda:0', grad_fn=<AddBackward0>)



  3%|▎         | 12/354 [01:04<30:36,  5.37s/it][A

	loss_cls: tensor(0.3408, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1505, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4913, device='cuda:0', grad_fn=<AddBackward0>)



  4%|▎         | 13/354 [01:09<30:26,  5.36s/it][A

	loss_cls: tensor(0.4278, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0399, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4677, device='cuda:0', grad_fn=<AddBackward0>)



  4%|▍         | 14/354 [01:15<30:27,  5.37s/it][A

	loss_cls: tensor(0.3712, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2991, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6703, device='cuda:0', grad_fn=<AddBackward0>)



  4%|▍         | 15/354 [01:20<30:15,  5.35s/it][A

	loss_cls: tensor(0.7016, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2091, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9107, device='cuda:0', grad_fn=<AddBackward0>)



  5%|▍         | 16/354 [01:25<30:15,  5.37s/it][A

	loss_cls: tensor(0.7275, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2989, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0264, device='cuda:0', grad_fn=<AddBackward0>)



  5%|▍         | 17/354 [01:31<30:03,  5.35s/it][A

	loss_cls: tensor(0.7163, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3556, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0719, device='cuda:0', grad_fn=<AddBackward0>)



  5%|▌         | 18/354 [01:36<30:01,  5.36s/it][A

	loss_cls: tensor(0.5714, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1627, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7341, device='cuda:0', grad_fn=<AddBackward0>)



  5%|▌         | 19/354 [01:41<29:49,  5.34s/it][A

	loss_cls: tensor(0.4024, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1944, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5968, device='cuda:0', grad_fn=<AddBackward0>)



  6%|▌         | 20/354 [01:47<29:41,  5.33s/it][A

	loss_cls: tensor(0.3777, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3151, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6928, device='cuda:0', grad_fn=<AddBackward0>)



  6%|▌         | 21/354 [01:52<29:40,  5.35s/it][A

	loss_cls: tensor(0.5391, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2187, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7578, device='cuda:0', grad_fn=<AddBackward0>)



  6%|▌         | 22/354 [01:57<29:32,  5.34s/it][A

	loss_cls: tensor(0.5132, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2728, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7860, device='cuda:0', grad_fn=<AddBackward0>)



  6%|▋         | 23/354 [02:03<29:30,  5.35s/it][A

	loss_cls: tensor(0.8361, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2038, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0399, device='cuda:0', grad_fn=<AddBackward0>)



  7%|▋         | 24/354 [02:08<29:21,  5.34s/it][A

	loss_cls: tensor(0.7016, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2901, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9917, device='cuda:0', grad_fn=<AddBackward0>)



  7%|▋         | 25/354 [02:14<29:22,  5.36s/it][A

	loss_cls: tensor(0.5863, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1254, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7117, device='cuda:0', grad_fn=<AddBackward0>)



  7%|▋         | 26/354 [02:19<29:12,  5.34s/it][A

	loss_cls: tensor(0.3824, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2719, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6543, device='cuda:0', grad_fn=<AddBackward0>)



  8%|▊         | 27/354 [02:24<29:02,  5.33s/it][A

	loss_cls: tensor(0.4754, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3559, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8313, device='cuda:0', grad_fn=<AddBackward0>)



  8%|▊         | 28/354 [02:30<29:02,  5.35s/it][A

	loss_cls: tensor(0.5588, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1597, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7185, device='cuda:0', grad_fn=<AddBackward0>)



  8%|▊         | 29/354 [02:35<28:53,  5.33s/it][A

	loss_cls: tensor(0.4584, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3525, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8109, device='cuda:0', grad_fn=<AddBackward0>)



  8%|▊         | 30/354 [02:40<28:55,  5.36s/it][A

	loss_cls: tensor(0.4790, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2860, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7650, device='cuda:0', grad_fn=<AddBackward0>)



  9%|▉         | 31/354 [02:46<28:51,  5.36s/it][A

	loss_cls: tensor(0.6676, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1490, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8166, device='cuda:0', grad_fn=<AddBackward0>)



  9%|▉         | 32/354 [02:51<28:51,  5.38s/it][A

	loss_cls: tensor(0.4560, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3126, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7687, device='cuda:0', grad_fn=<AddBackward0>)



  9%|▉         | 33/354 [02:56<28:40,  5.36s/it][A

	loss_cls: tensor(0.4451, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3203, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7654, device='cuda:0', grad_fn=<AddBackward0>)



 10%|▉         | 34/354 [03:02<28:30,  5.34s/it][A

	loss_cls: tensor(0.4131, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1489, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5620, device='cuda:0', grad_fn=<AddBackward0>)



 10%|▉         | 35/354 [03:07<28:30,  5.36s/it][A

	loss_cls: tensor(0.5223, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2301, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7525, device='cuda:0', grad_fn=<AddBackward0>)



 10%|█         | 36/354 [03:12<28:22,  5.35s/it][A

	loss_cls: tensor(0.4985, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1427, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6413, device='cuda:0', grad_fn=<AddBackward0>)



 10%|█         | 37/354 [03:18<28:21,  5.37s/it][A

	loss_cls: tensor(0.7033, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1424, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8458, device='cuda:0', grad_fn=<AddBackward0>)



 11%|█         | 38/354 [03:23<28:10,  5.35s/it][A

	loss_cls: tensor(0.7307, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1845, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9152, device='cuda:0', grad_fn=<AddBackward0>)



 11%|█         | 39/354 [03:28<28:08,  5.36s/it][A

	loss_cls: tensor(0.4029, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2721, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6750, device='cuda:0', grad_fn=<AddBackward0>)



 11%|█▏        | 40/354 [03:34<27:57,  5.34s/it][A

	loss_cls: tensor(1.2324, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1582, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.3906, device='cuda:0', grad_fn=<AddBackward0>)



 12%|█▏        | 41/354 [03:39<27:47,  5.33s/it][A

	loss_cls: tensor(0.5394, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3224, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8618, device='cuda:0', grad_fn=<AddBackward0>)



 12%|█▏        | 42/354 [03:44<27:50,  5.35s/it][A

	loss_cls: tensor(0.7894, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2242, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0136, device='cuda:0', grad_fn=<AddBackward0>)



 12%|█▏        | 43/354 [03:50<27:43,  5.35s/it][A

	loss_cls: tensor(0.5869, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2008, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7877, device='cuda:0', grad_fn=<AddBackward0>)



 12%|█▏        | 44/354 [03:55<27:41,  5.36s/it][A

	loss_cls: tensor(0.5693, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3380, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9073, device='cuda:0', grad_fn=<AddBackward0>)



 13%|█▎        | 45/354 [04:01<27:30,  5.34s/it][A

	loss_cls: tensor(0.3662, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2659, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6321, device='cuda:0', grad_fn=<AddBackward0>)



 13%|█▎        | 46/354 [04:06<27:28,  5.35s/it][A

	loss_cls: tensor(1.2332, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1902, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.4234, device='cuda:0', grad_fn=<AddBackward0>)



 13%|█▎        | 47/354 [04:11<27:17,  5.33s/it][A

	loss_cls: tensor(0.5205, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0428, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5633, device='cuda:0', grad_fn=<AddBackward0>)



 14%|█▎        | 48/354 [04:17<27:15,  5.34s/it][A

	loss_cls: tensor(0.6168, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1907, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8076, device='cuda:0', grad_fn=<AddBackward0>)



 14%|█▍        | 49/354 [04:22<27:09,  5.34s/it][A

	loss_cls: tensor(0.6552, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1053, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7605, device='cuda:0', grad_fn=<AddBackward0>)



 14%|█▍        | 50/354 [04:27<27:00,  5.33s/it][A

	loss_cls: tensor(0.6050, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5313, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1363, device='cuda:0', grad_fn=<AddBackward0>)



 14%|█▍        | 51/354 [04:33<27:00,  5.35s/it][A

	loss_cls: tensor(0.4648, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0495, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5142, device='cuda:0', grad_fn=<AddBackward0>)



 15%|█▍        | 52/354 [04:38<26:50,  5.33s/it][A

	loss_cls: tensor(0.8795, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1763, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0558, device='cuda:0', grad_fn=<AddBackward0>)



 15%|█▍        | 53/354 [04:43<26:50,  5.35s/it][A

	loss_cls: tensor(0.5158, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2593, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7751, device='cuda:0', grad_fn=<AddBackward0>)



 15%|█▌        | 54/354 [04:49<26:39,  5.33s/it][A

	loss_cls: tensor(0.4842, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1792, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6633, device='cuda:0', grad_fn=<AddBackward0>)



 16%|█▌        | 55/354 [04:54<26:42,  5.36s/it][A

	loss_cls: tensor(0.7179, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2615, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9795, device='cuda:0', grad_fn=<AddBackward0>)



 16%|█▌        | 56/354 [04:59<26:32,  5.34s/it][A

	loss_cls: tensor(0.5694, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1065, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6759, device='cuda:0', grad_fn=<AddBackward0>)



 16%|█▌        | 57/354 [05:05<26:23,  5.33s/it][A

	loss_cls: tensor(0.4439, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0430, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4869, device='cuda:0', grad_fn=<AddBackward0>)



 16%|█▋        | 58/354 [05:10<26:22,  5.35s/it][A

	loss_cls: tensor(0.4865, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2266, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7131, device='cuda:0', grad_fn=<AddBackward0>)



 17%|█▋        | 59/354 [05:15<26:13,  5.33s/it][A

	loss_cls: tensor(0.5439, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2659, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8098, device='cuda:0', grad_fn=<AddBackward0>)



 17%|█▋        | 60/354 [05:21<26:11,  5.35s/it][A

	loss_cls: tensor(0.9448, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3079, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2527, device='cuda:0', grad_fn=<AddBackward0>)



 17%|█▋        | 61/354 [05:26<26:01,  5.33s/it][A

	loss_cls: tensor(0.4271, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3989, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8260, device='cuda:0', grad_fn=<AddBackward0>)



 18%|█▊        | 62/354 [05:31<26:02,  5.35s/it][A

	loss_cls: tensor(0.6897, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2081, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8978, device='cuda:0', grad_fn=<AddBackward0>)



 18%|█▊        | 63/354 [05:37<25:53,  5.34s/it][A

	loss_cls: tensor(0.7867, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3959, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1826, device='cuda:0', grad_fn=<AddBackward0>)



 18%|█▊        | 64/354 [05:42<25:45,  5.33s/it][A

	loss_cls: tensor(0.6227, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0843, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7070, device='cuda:0', grad_fn=<AddBackward0>)



 18%|█▊        | 65/354 [05:47<25:45,  5.35s/it][A

	loss_cls: tensor(0.5037, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2585, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7622, device='cuda:0', grad_fn=<AddBackward0>)



 19%|█▊        | 66/354 [05:53<25:36,  5.33s/it][A

	loss_cls: tensor(0.3881, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3194, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7076, device='cuda:0', grad_fn=<AddBackward0>)



 19%|█▉        | 67/354 [05:58<25:33,  5.34s/it][A

	loss_cls: tensor(0.4349, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3300, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7649, device='cuda:0', grad_fn=<AddBackward0>)



 19%|█▉        | 68/354 [06:03<25:23,  5.33s/it][A

	loss_cls: tensor(0.7096, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4846, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1942, device='cuda:0', grad_fn=<AddBackward0>)



 19%|█▉        | 69/354 [06:09<25:23,  5.34s/it][A

	loss_cls: tensor(0.9077, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0785, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9862, device='cuda:0', grad_fn=<AddBackward0>)



 20%|█▉        | 70/354 [06:14<25:14,  5.33s/it][A

	loss_cls: tensor(0.5109, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1672, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6781, device='cuda:0', grad_fn=<AddBackward0>)



 20%|██        | 71/354 [06:19<25:06,  5.32s/it][A

	loss_cls: tensor(0.5339, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1959, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7298, device='cuda:0', grad_fn=<AddBackward0>)



 20%|██        | 72/354 [06:25<25:06,  5.34s/it][A

	loss_cls: tensor(0.6594, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2536, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9130, device='cuda:0', grad_fn=<AddBackward0>)



 21%|██        | 73/354 [06:30<24:57,  5.33s/it][A

	loss_cls: tensor(0.5191, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1701, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6892, device='cuda:0', grad_fn=<AddBackward0>)



 21%|██        | 74/354 [06:35<24:55,  5.34s/it][A

	loss_cls: tensor(0.6590, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3651, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0241, device='cuda:0', grad_fn=<AddBackward0>)



 21%|██        | 75/354 [06:41<24:46,  5.33s/it][A

	loss_cls: tensor(0.5339, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3215, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8554, device='cuda:0', grad_fn=<AddBackward0>)



 21%|██▏       | 76/354 [06:46<24:48,  5.35s/it][A

	loss_cls: tensor(0.5567, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1479, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7046, device='cuda:0', grad_fn=<AddBackward0>)



 22%|██▏       | 77/354 [06:51<24:38,  5.34s/it][A

	loss_cls: tensor(0.5158, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1301, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6460, device='cuda:0', grad_fn=<AddBackward0>)



 22%|██▏       | 78/354 [06:57<24:41,  5.37s/it][A

	loss_cls: tensor(0.6237, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2766, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9003, device='cuda:0', grad_fn=<AddBackward0>)



 22%|██▏       | 79/354 [07:02<24:37,  5.37s/it][A

	loss_cls: tensor(0.3943, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2635, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6577, device='cuda:0', grad_fn=<AddBackward0>)



 23%|██▎       | 80/354 [07:07<24:25,  5.35s/it][A

	loss_cls: tensor(0.7046, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1292, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8339, device='cuda:0', grad_fn=<AddBackward0>)



 23%|██▎       | 81/354 [07:13<24:22,  5.36s/it][A

	loss_cls: tensor(0.5720, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0925, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6645, device='cuda:0', grad_fn=<AddBackward0>)



 23%|██▎       | 82/354 [07:18<24:14,  5.35s/it][A

	loss_cls: tensor(0.5928, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0995, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6922, device='cuda:0', grad_fn=<AddBackward0>)



 23%|██▎       | 83/354 [07:24<24:11,  5.36s/it][A

	loss_cls: tensor(0.7130, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3071, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0201, device='cuda:0', grad_fn=<AddBackward0>)



 24%|██▎       | 84/354 [07:29<24:03,  5.35s/it][A

	loss_cls: tensor(0.5264, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0310, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5574, device='cuda:0', grad_fn=<AddBackward0>)



 24%|██▍       | 85/354 [07:34<23:54,  5.33s/it][A

	loss_cls: tensor(0.4209, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1143, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5352, device='cuda:0', grad_fn=<AddBackward0>)



 24%|██▍       | 86/354 [07:40<23:54,  5.35s/it][A

	loss_cls: tensor(0.5635, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2163, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7799, device='cuda:0', grad_fn=<AddBackward0>)



 25%|██▍       | 87/354 [07:45<23:45,  5.34s/it][A

	loss_cls: tensor(0.5052, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2206, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7258, device='cuda:0', grad_fn=<AddBackward0>)



 25%|██▍       | 88/354 [07:50<23:43,  5.35s/it][A

	loss_cls: tensor(0.4481, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1046, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5527, device='cuda:0', grad_fn=<AddBackward0>)



 25%|██▌       | 89/354 [07:56<23:33,  5.33s/it][A

	loss_cls: tensor(0.9101, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.9459, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.8561, device='cuda:0', grad_fn=<AddBackward0>)



 25%|██▌       | 90/354 [08:01<23:34,  5.36s/it][A

	loss_cls: tensor(0.6839, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4079, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0917, device='cuda:0', grad_fn=<AddBackward0>)



 26%|██▌       | 91/354 [08:06<23:24,  5.34s/it][A

	loss_cls: tensor(0.5192, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2784, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7976, device='cuda:0', grad_fn=<AddBackward0>)



 26%|██▌       | 92/354 [08:12<23:15,  5.33s/it][A

	loss_cls: tensor(0.5051, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0597, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5649, device='cuda:0', grad_fn=<AddBackward0>)



 26%|██▋       | 93/354 [08:17<23:14,  5.34s/it][A

	loss_cls: tensor(0.5689, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3131, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8820, device='cuda:0', grad_fn=<AddBackward0>)



 27%|██▋       | 94/354 [08:22<23:04,  5.32s/it][A

	loss_cls: tensor(0.6630, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0622, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7252, device='cuda:0', grad_fn=<AddBackward0>)



 27%|██▋       | 95/354 [08:28<23:05,  5.35s/it][A

	loss_cls: tensor(1.3209, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2237, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.5446, device='cuda:0', grad_fn=<AddBackward0>)



 27%|██▋       | 96/354 [08:33<22:56,  5.33s/it][A

	loss_cls: tensor(0.4976, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1786, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6762, device='cuda:0', grad_fn=<AddBackward0>)



 27%|██▋       | 97/354 [08:38<22:54,  5.35s/it][A

	loss_cls: tensor(0.5367, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4170, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9537, device='cuda:0', grad_fn=<AddBackward0>)



 28%|██▊       | 98/354 [08:44<22:47,  5.34s/it][A

	loss_cls: tensor(0.3851, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1983, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5834, device='cuda:0', grad_fn=<AddBackward0>)



 28%|██▊       | 99/354 [08:49<22:39,  5.33s/it][A

	loss_cls: tensor(0.4759, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0698, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5457, device='cuda:0', grad_fn=<AddBackward0>)



 28%|██▊       | 100/354 [08:54<22:38,  5.35s/it][A

	loss_cls: tensor(0.6539, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2539, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9078, device='cuda:0', grad_fn=<AddBackward0>)



 29%|██▊       | 101/354 [09:00<22:33,  5.35s/it][A

	loss_cls: tensor(0.4646, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1209, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5855, device='cuda:0', grad_fn=<AddBackward0>)



 29%|██▉       | 102/354 [09:05<22:34,  5.37s/it][A

	loss_cls: tensor(0.3818, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1261, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5079, device='cuda:0', grad_fn=<AddBackward0>)



 29%|██▉       | 103/354 [09:10<22:24,  5.36s/it][A

	loss_cls: tensor(0.5568, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1855, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7423, device='cuda:0', grad_fn=<AddBackward0>)



 29%|██▉       | 104/354 [09:16<22:22,  5.37s/it][A

	loss_cls: tensor(0.6026, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4529, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0554, device='cuda:0', grad_fn=<AddBackward0>)



 30%|██▉       | 105/354 [09:21<22:12,  5.35s/it][A

	loss_cls: tensor(0.4043, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1058, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5101, device='cuda:0', grad_fn=<AddBackward0>)



 30%|██▉       | 106/354 [09:26<22:03,  5.34s/it][A

	loss_cls: tensor(0.8247, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2028, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0275, device='cuda:0', grad_fn=<AddBackward0>)



 30%|███       | 107/354 [09:32<22:01,  5.35s/it][A

	loss_cls: tensor(0.8538, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1780, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0317, device='cuda:0', grad_fn=<AddBackward0>)



 31%|███       | 108/354 [09:37<21:53,  5.34s/it][A

	loss_cls: tensor(0.7123, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2328, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9451, device='cuda:0', grad_fn=<AddBackward0>)



 31%|███       | 109/354 [09:43<21:53,  5.36s/it][A

	loss_cls: tensor(0.4452, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1367, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5819, device='cuda:0', grad_fn=<AddBackward0>)



 31%|███       | 110/354 [09:48<21:44,  5.35s/it][A

	loss_cls: tensor(0.5499, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2184, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7683, device='cuda:0', grad_fn=<AddBackward0>)



 31%|███▏      | 111/354 [09:53<21:41,  5.36s/it][A

	loss_cls: tensor(0.6488, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4622, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1110, device='cuda:0', grad_fn=<AddBackward0>)



 32%|███▏      | 112/354 [09:59<21:31,  5.34s/it][A

	loss_cls: tensor(0.5463, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0645, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6108, device='cuda:0', grad_fn=<AddBackward0>)



 32%|███▏      | 113/354 [10:04<21:23,  5.33s/it][A

	loss_cls: tensor(0.5294, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2658, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7952, device='cuda:0', grad_fn=<AddBackward0>)



 32%|███▏      | 114/354 [10:09<21:21,  5.34s/it][A

	loss_cls: tensor(0.5285, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1760, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7045, device='cuda:0', grad_fn=<AddBackward0>)



 32%|███▏      | 115/354 [10:15<21:13,  5.33s/it][A

	loss_cls: tensor(0.5821, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2060, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7880, device='cuda:0', grad_fn=<AddBackward0>)



 33%|███▎      | 116/354 [10:20<21:12,  5.35s/it][A

	loss_cls: tensor(0.5283, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0849, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6133, device='cuda:0', grad_fn=<AddBackward0>)



 33%|███▎      | 117/354 [10:25<21:04,  5.34s/it][A

	loss_cls: tensor(0.5641, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1365, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7006, device='cuda:0', grad_fn=<AddBackward0>)



 33%|███▎      | 118/354 [10:31<21:01,  5.34s/it][A

	loss_cls: tensor(0.7368, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3496, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0864, device='cuda:0', grad_fn=<AddBackward0>)



 34%|███▎      | 119/354 [10:36<20:53,  5.33s/it][A

	loss_cls: tensor(0.8413, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3085, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1498, device='cuda:0', grad_fn=<AddBackward0>)



 34%|███▍      | 120/354 [10:41<20:44,  5.32s/it][A

	loss_cls: tensor(0.5970, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1769, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7739, device='cuda:0', grad_fn=<AddBackward0>)



 34%|███▍      | 121/354 [10:47<20:43,  5.34s/it][A

	loss_cls: tensor(0.7037, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1502, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8538, device='cuda:0', grad_fn=<AddBackward0>)



 34%|███▍      | 122/354 [10:52<20:34,  5.32s/it][A

	loss_cls: tensor(0.6002, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1175, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7177, device='cuda:0', grad_fn=<AddBackward0>)



 35%|███▍      | 123/354 [10:57<20:34,  5.35s/it][A

	loss_cls: tensor(0.6282, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1810, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8092, device='cuda:0', grad_fn=<AddBackward0>)



 35%|███▌      | 124/354 [11:03<20:27,  5.34s/it][A

	loss_cls: tensor(0.6911, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2806, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9717, device='cuda:0', grad_fn=<AddBackward0>)



 35%|███▌      | 125/354 [11:08<20:31,  5.38s/it][A

	loss_cls: tensor(0.3804, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1042, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4846, device='cuda:0', grad_fn=<AddBackward0>)



 36%|███▌      | 126/354 [11:13<20:22,  5.36s/it][A

	loss_cls: tensor(0.3742, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1054, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4796, device='cuda:0', grad_fn=<AddBackward0>)



 36%|███▌      | 127/354 [11:19<20:12,  5.34s/it][A

	loss_cls: tensor(0.4983, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2539, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7522, device='cuda:0', grad_fn=<AddBackward0>)



 36%|███▌      | 128/354 [11:24<20:10,  5.35s/it][A

	loss_cls: tensor(0.5106, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2737, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7843, device='cuda:0', grad_fn=<AddBackward0>)



 36%|███▋      | 129/354 [11:29<20:01,  5.34s/it][A

	loss_cls: tensor(0.4261, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0427, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4689, device='cuda:0', grad_fn=<AddBackward0>)



 37%|███▋      | 130/354 [11:35<19:59,  5.35s/it][A

	loss_cls: tensor(0.4609, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3834, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8444, device='cuda:0', grad_fn=<AddBackward0>)



 37%|███▋      | 131/354 [11:40<19:51,  5.34s/it][A

	loss_cls: tensor(0.4441, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1145, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5585, device='cuda:0', grad_fn=<AddBackward0>)



 37%|███▋      | 132/354 [11:45<19:48,  5.36s/it][A

	loss_cls: tensor(0.5211, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1928, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7139, device='cuda:0', grad_fn=<AddBackward0>)



 38%|███▊      | 133/354 [11:51<19:39,  5.34s/it][A

	loss_cls: tensor(0.3668, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0353, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4021, device='cuda:0', grad_fn=<AddBackward0>)



 38%|███▊      | 134/354 [11:56<19:31,  5.33s/it][A

	loss_cls: tensor(0.6499, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2186, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8685, device='cuda:0', grad_fn=<AddBackward0>)



 38%|███▊      | 135/354 [12:01<19:30,  5.34s/it][A

	loss_cls: tensor(0.6787, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3242, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0029, device='cuda:0', grad_fn=<AddBackward0>)



 38%|███▊      | 136/354 [12:07<19:23,  5.34s/it][A

	loss_cls: tensor(0.7472, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3640, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1111, device='cuda:0', grad_fn=<AddBackward0>)



 39%|███▊      | 137/354 [12:12<19:21,  5.35s/it][A

	loss_cls: tensor(0.3669, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2200, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5870, device='cuda:0', grad_fn=<AddBackward0>)



 39%|███▉      | 138/354 [12:17<19:12,  5.34s/it][A

	loss_cls: tensor(0.5523, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1080, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6603, device='cuda:0', grad_fn=<AddBackward0>)



 39%|███▉      | 139/354 [12:23<19:09,  5.35s/it][A

	loss_cls: tensor(0.6875, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1047, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7922, device='cuda:0', grad_fn=<AddBackward0>)



 40%|███▉      | 140/354 [12:28<19:01,  5.34s/it][A

	loss_cls: tensor(0.3721, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3066, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6787, device='cuda:0', grad_fn=<AddBackward0>)



 40%|███▉      | 141/354 [12:33<18:53,  5.32s/it][A

	loss_cls: tensor(0.3037, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1848, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4885, device='cuda:0', grad_fn=<AddBackward0>)



 40%|████      | 142/354 [12:39<18:52,  5.34s/it][A

	loss_cls: tensor(0.4054, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1551, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5605, device='cuda:0', grad_fn=<AddBackward0>)



 40%|████      | 143/354 [12:44<18:45,  5.33s/it][A

	loss_cls: tensor(0.8021, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1984, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0004, device='cuda:0', grad_fn=<AddBackward0>)



 41%|████      | 144/354 [12:49<18:43,  5.35s/it][A

	loss_cls: tensor(0.6750, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2796, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9546, device='cuda:0', grad_fn=<AddBackward0>)



 41%|████      | 145/354 [12:55<18:38,  5.35s/it][A

	loss_cls: tensor(0.7613, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2559, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0173, device='cuda:0', grad_fn=<AddBackward0>)



 41%|████      | 146/354 [13:00<18:37,  5.37s/it][A

	loss_cls: tensor(0.5082, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2216, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7297, device='cuda:0', grad_fn=<AddBackward0>)



 42%|████▏     | 147/354 [13:06<18:28,  5.35s/it][A

	loss_cls: tensor(0.5239, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1756, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6995, device='cuda:0', grad_fn=<AddBackward0>)



 42%|████▏     | 148/354 [13:11<18:30,  5.39s/it][A

	loss_cls: tensor(1.1905, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3895, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.5799, device='cuda:0', grad_fn=<AddBackward0>)



 42%|████▏     | 149/354 [13:16<18:21,  5.37s/it][A

	loss_cls: tensor(0.6168, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2570, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8738, device='cuda:0', grad_fn=<AddBackward0>)



 42%|████▏     | 150/354 [13:22<18:15,  5.37s/it][A

	loss_cls: tensor(0.5039, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2384, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7423, device='cuda:0', grad_fn=<AddBackward0>)



 43%|████▎     | 151/354 [13:27<18:13,  5.39s/it][A

	loss_cls: tensor(0.4852, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2105, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6957, device='cuda:0', grad_fn=<AddBackward0>)



 43%|████▎     | 152/354 [13:32<18:03,  5.37s/it][A

	loss_cls: tensor(0.5805, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1184, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6988, device='cuda:0', grad_fn=<AddBackward0>)



 43%|████▎     | 153/354 [13:38<18:01,  5.38s/it][A

	loss_cls: tensor(0.6228, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1709, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7936, device='cuda:0', grad_fn=<AddBackward0>)



 44%|████▎     | 154/354 [13:43<17:53,  5.37s/it][A

	loss_cls: tensor(0.7191, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0715, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7906, device='cuda:0', grad_fn=<AddBackward0>)



 44%|████▍     | 155/354 [13:49<17:50,  5.38s/it][A

	loss_cls: tensor(0.6360, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1055, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7415, device='cuda:0', grad_fn=<AddBackward0>)



 44%|████▍     | 156/354 [13:54<17:43,  5.37s/it][A

	loss_cls: tensor(0.6977, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0725, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7702, device='cuda:0', grad_fn=<AddBackward0>)



 44%|████▍     | 157/354 [13:59<17:35,  5.36s/it][A

	loss_cls: tensor(0.4783, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3075, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7858, device='cuda:0', grad_fn=<AddBackward0>)



 45%|████▍     | 158/354 [14:05<17:33,  5.37s/it][A

	loss_cls: tensor(0.8115, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0768, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8882, device='cuda:0', grad_fn=<AddBackward0>)



 45%|████▍     | 159/354 [14:10<17:26,  5.37s/it][A

	loss_cls: tensor(0.6135, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2368, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8503, device='cuda:0', grad_fn=<AddBackward0>)



 45%|████▌     | 160/354 [14:15<17:22,  5.38s/it][A

	loss_cls: tensor(0.8192, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2052, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0244, device='cuda:0', grad_fn=<AddBackward0>)



 45%|████▌     | 161/354 [14:21<17:13,  5.36s/it][A

	loss_cls: tensor(0.5484, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1056, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6540, device='cuda:0', grad_fn=<AddBackward0>)



 46%|████▌     | 162/354 [14:26<17:11,  5.37s/it][A

	loss_cls: tensor(0.4414, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0378, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4792, device='cuda:0', grad_fn=<AddBackward0>)



 46%|████▌     | 163/354 [14:32<17:02,  5.35s/it][A

	loss_cls: tensor(0.6320, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5787, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2107, device='cuda:0', grad_fn=<AddBackward0>)



 46%|████▋     | 164/354 [14:37<16:55,  5.35s/it][A

	loss_cls: tensor(0.5552, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2674, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8226, device='cuda:0', grad_fn=<AddBackward0>)



 47%|████▋     | 165/354 [14:42<16:53,  5.36s/it][A

	loss_cls: tensor(0.4799, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2219, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7017, device='cuda:0', grad_fn=<AddBackward0>)



 47%|████▋     | 166/354 [14:48<16:46,  5.35s/it][A

	loss_cls: tensor(0.6816, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0786, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7602, device='cuda:0', grad_fn=<AddBackward0>)



 47%|████▋     | 167/354 [14:53<16:42,  5.36s/it][A

	loss_cls: tensor(0.2324, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0240, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.2564, device='cuda:0', grad_fn=<AddBackward0>)



 47%|████▋     | 168/354 [14:58<16:34,  5.35s/it][A

	loss_cls: tensor(0.6663, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2135, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8798, device='cuda:0', grad_fn=<AddBackward0>)



 48%|████▊     | 169/354 [15:04<16:33,  5.37s/it][A

	loss_cls: tensor(0.4575, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3347, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7922, device='cuda:0', grad_fn=<AddBackward0>)



 48%|████▊     | 170/354 [15:09<16:24,  5.35s/it][A

	loss_cls: tensor(0.7697, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.8888, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.6585, device='cuda:0', grad_fn=<AddBackward0>)



 48%|████▊     | 171/354 [15:14<16:23,  5.37s/it][A

	loss_cls: tensor(0.5688, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2168, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7855, device='cuda:0', grad_fn=<AddBackward0>)



 49%|████▊     | 172/354 [15:20<16:19,  5.38s/it][A

	loss_cls: tensor(0.3884, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2641, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6524, device='cuda:0', grad_fn=<AddBackward0>)



 49%|████▉     | 173/354 [15:25<16:10,  5.36s/it][A

	loss_cls: tensor(0.4938, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3228, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8166, device='cuda:0', grad_fn=<AddBackward0>)



 49%|████▉     | 174/354 [15:31<16:07,  5.37s/it][A

	loss_cls: tensor(0.4115, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2263, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6378, device='cuda:0', grad_fn=<AddBackward0>)



 49%|████▉     | 175/354 [15:36<15:59,  5.36s/it][A

	loss_cls: tensor(0.5273, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3969, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9242, device='cuda:0', grad_fn=<AddBackward0>)



 50%|████▉     | 176/354 [15:41<15:56,  5.38s/it][A

	loss_cls: tensor(0.6706, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2116, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8823, device='cuda:0', grad_fn=<AddBackward0>)



 50%|█████     | 177/354 [15:47<15:49,  5.36s/it][A

	loss_cls: tensor(0.5984, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2400, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8384, device='cuda:0', grad_fn=<AddBackward0>)



 50%|█████     | 178/354 [15:52<15:41,  5.35s/it][A

	loss_cls: tensor(0.6314, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2821, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9135, device='cuda:0', grad_fn=<AddBackward0>)



 51%|█████     | 179/354 [15:57<15:39,  5.37s/it][A

	loss_cls: tensor(0.6911, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2763, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9674, device='cuda:0', grad_fn=<AddBackward0>)



 51%|█████     | 180/354 [16:03<15:30,  5.35s/it][A

	loss_cls: tensor(0.5581, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2675, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8256, device='cuda:0', grad_fn=<AddBackward0>)



 51%|█████     | 181/354 [16:08<15:27,  5.36s/it][A

	loss_cls: tensor(0.5254, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2890, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8144, device='cuda:0', grad_fn=<AddBackward0>)



 51%|█████▏    | 182/354 [16:13<15:20,  5.35s/it][A

	loss_cls: tensor(0.7461, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3223, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0684, device='cuda:0', grad_fn=<AddBackward0>)



 52%|█████▏    | 183/354 [16:19<15:18,  5.37s/it][A

	loss_cls: tensor(0.5200, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0973, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6173, device='cuda:0', grad_fn=<AddBackward0>)



 52%|█████▏    | 184/354 [16:24<15:10,  5.36s/it][A

	loss_cls: tensor(0.4808, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1512, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6320, device='cuda:0', grad_fn=<AddBackward0>)



 52%|█████▏    | 185/354 [16:29<15:03,  5.34s/it][A

	loss_cls: tensor(0.7187, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2572, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9759, device='cuda:0', grad_fn=<AddBackward0>)



 53%|█████▎    | 186/354 [16:35<14:59,  5.35s/it][A

	loss_cls: tensor(0.5534, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1753, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7287, device='cuda:0', grad_fn=<AddBackward0>)



 53%|█████▎    | 187/354 [16:40<14:51,  5.34s/it][A

	loss_cls: tensor(0.8609, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2066, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0675, device='cuda:0', grad_fn=<AddBackward0>)



 53%|█████▎    | 188/354 [16:46<14:49,  5.36s/it][A

	loss_cls: tensor(0.6228, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5169, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1397, device='cuda:0', grad_fn=<AddBackward0>)



 53%|█████▎    | 189/354 [16:51<14:42,  5.35s/it][A

	loss_cls: tensor(0.8518, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1099, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9616, device='cuda:0', grad_fn=<AddBackward0>)



 54%|█████▎    | 190/354 [16:56<14:41,  5.38s/it][A

	loss_cls: tensor(0.5541, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2853, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8394, device='cuda:0', grad_fn=<AddBackward0>)



 54%|█████▍    | 191/354 [17:02<14:32,  5.36s/it][A

	loss_cls: tensor(0.8236, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0574, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8810, device='cuda:0', grad_fn=<AddBackward0>)



 54%|█████▍    | 192/354 [17:07<14:26,  5.35s/it][A

	loss_cls: tensor(0.9249, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1952, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1201, device='cuda:0', grad_fn=<AddBackward0>)



 55%|█████▍    | 193/354 [17:12<14:24,  5.37s/it][A

	loss_cls: tensor(0.4730, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1875, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6606, device='cuda:0', grad_fn=<AddBackward0>)



 55%|█████▍    | 194/354 [17:18<14:21,  5.38s/it][A

	loss_cls: tensor(0.4365, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2546, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6911, device='cuda:0', grad_fn=<AddBackward0>)



 55%|█████▌    | 195/354 [17:23<14:18,  5.40s/it][A

	loss_cls: tensor(0.6383, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2379, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8762, device='cuda:0', grad_fn=<AddBackward0>)



 55%|█████▌    | 196/354 [17:29<14:10,  5.39s/it][A

	loss_cls: tensor(0.9240, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1009, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0248, device='cuda:0', grad_fn=<AddBackward0>)



 56%|█████▌    | 197/354 [17:34<14:08,  5.40s/it][A

	loss_cls: tensor(0.6270, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0875, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7145, device='cuda:0', grad_fn=<AddBackward0>)



 56%|█████▌    | 198/354 [17:39<14:00,  5.39s/it][A

	loss_cls: tensor(0.4618, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3502, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8120, device='cuda:0', grad_fn=<AddBackward0>)



 56%|█████▌    | 199/354 [17:45<13:52,  5.37s/it][A

	loss_cls: tensor(0.5583, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3020, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8603, device='cuda:0', grad_fn=<AddBackward0>)



 56%|█████▋    | 200/354 [17:50<13:49,  5.38s/it][A

	loss_cls: tensor(0.6057, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3583, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9640, device='cuda:0', grad_fn=<AddBackward0>)



 57%|█████▋    | 201/354 [17:55<13:39,  5.36s/it][A

	loss_cls: tensor(0.4597, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1306, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5903, device='cuda:0', grad_fn=<AddBackward0>)



 57%|█████▋    | 202/354 [18:01<13:37,  5.38s/it][A

	loss_cls: tensor(0.4801, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1458, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6259, device='cuda:0', grad_fn=<AddBackward0>)



 57%|█████▋    | 203/354 [18:06<13:29,  5.36s/it][A

	loss_cls: tensor(0.8268, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4241, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2509, device='cuda:0', grad_fn=<AddBackward0>)



 58%|█████▊    | 204/354 [18:12<13:26,  5.38s/it][A

	loss_cls: tensor(0.5034, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0869, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5903, device='cuda:0', grad_fn=<AddBackward0>)



 58%|█████▊    | 205/354 [18:17<13:19,  5.36s/it][A

	loss_cls: tensor(0.7029, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4027, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1055, device='cuda:0', grad_fn=<AddBackward0>)



 58%|█████▊    | 206/354 [18:22<13:11,  5.35s/it][A

	loss_cls: tensor(0.7152, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2840, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9992, device='cuda:0', grad_fn=<AddBackward0>)



 58%|█████▊    | 207/354 [18:28<13:08,  5.36s/it][A

	loss_cls: tensor(0.6922, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1802, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8724, device='cuda:0', grad_fn=<AddBackward0>)



 59%|█████▉    | 208/354 [18:33<13:00,  5.35s/it][A

	loss_cls: tensor(0.3002, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3025, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6027, device='cuda:0', grad_fn=<AddBackward0>)



 59%|█████▉    | 209/354 [18:38<12:57,  5.36s/it][A

	loss_cls: tensor(0.4689, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0640, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5329, device='cuda:0', grad_fn=<AddBackward0>)



 59%|█████▉    | 210/354 [18:44<12:50,  5.35s/it][A

	loss_cls: tensor(0.5624, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2947, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8570, device='cuda:0', grad_fn=<AddBackward0>)



 60%|█████▉    | 211/354 [18:49<12:47,  5.37s/it][A

	loss_cls: tensor(0.5512, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2378, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7890, device='cuda:0', grad_fn=<AddBackward0>)



 60%|█████▉    | 212/354 [18:54<12:39,  5.35s/it][A

	loss_cls: tensor(0.4861, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1154, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6015, device='cuda:0', grad_fn=<AddBackward0>)



 60%|██████    | 213/354 [19:00<12:33,  5.34s/it][A

	loss_cls: tensor(0.6578, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2323, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8901, device='cuda:0', grad_fn=<AddBackward0>)



 60%|██████    | 214/354 [19:05<12:31,  5.37s/it][A

	loss_cls: tensor(0.6845, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2316, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9161, device='cuda:0', grad_fn=<AddBackward0>)



 61%|██████    | 215/354 [19:10<12:25,  5.36s/it][A

	loss_cls: tensor(0.7381, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1306, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8687, device='cuda:0', grad_fn=<AddBackward0>)



 61%|██████    | 216/354 [19:16<12:21,  5.37s/it][A

	loss_cls: tensor(0.6551, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3881, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0432, device='cuda:0', grad_fn=<AddBackward0>)



 61%|██████▏   | 217/354 [19:21<12:15,  5.37s/it][A

	loss_cls: tensor(0.4596, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1995, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6590, device='cuda:0', grad_fn=<AddBackward0>)



 62%|██████▏   | 218/354 [19:27<12:12,  5.39s/it][A

	loss_cls: tensor(0.6539, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3421, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9959, device='cuda:0', grad_fn=<AddBackward0>)



 62%|██████▏   | 219/354 [19:32<12:04,  5.37s/it][A

	loss_cls: tensor(0.7544, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2978, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0522, device='cuda:0', grad_fn=<AddBackward0>)



 62%|██████▏   | 220/354 [19:37<11:57,  5.36s/it][A

	loss_cls: tensor(0.5168, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1729, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6897, device='cuda:0', grad_fn=<AddBackward0>)



 62%|██████▏   | 221/354 [19:43<11:54,  5.38s/it][A

	loss_cls: tensor(0.6075, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2433, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8507, device='cuda:0', grad_fn=<AddBackward0>)



 63%|██████▎   | 222/354 [19:48<11:48,  5.36s/it][A

	loss_cls: tensor(0.4214, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1174, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5389, device='cuda:0', grad_fn=<AddBackward0>)



 63%|██████▎   | 223/354 [19:53<11:43,  5.37s/it][A

	loss_cls: tensor(0.5867, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1995, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7862, device='cuda:0', grad_fn=<AddBackward0>)



 63%|██████▎   | 224/354 [19:59<11:35,  5.35s/it][A

	loss_cls: tensor(0.5066, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1823, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6889, device='cuda:0', grad_fn=<AddBackward0>)



 64%|██████▎   | 225/354 [20:04<11:32,  5.37s/it][A

	loss_cls: tensor(0.5387, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2428, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7815, device='cuda:0', grad_fn=<AddBackward0>)



 64%|██████▍   | 226/354 [20:09<11:25,  5.35s/it][A

	loss_cls: tensor(0.5673, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1085, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6759, device='cuda:0', grad_fn=<AddBackward0>)



 64%|██████▍   | 227/354 [20:15<11:18,  5.34s/it][A

	loss_cls: tensor(0.7216, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1878, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9094, device='cuda:0', grad_fn=<AddBackward0>)



 64%|██████▍   | 228/354 [20:20<11:15,  5.36s/it][A

	loss_cls: tensor(0.5624, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3107, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8731, device='cuda:0', grad_fn=<AddBackward0>)



 65%|██████▍   | 229/354 [20:25<11:09,  5.35s/it][A

	loss_cls: tensor(0.4249, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3204, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7453, device='cuda:0', grad_fn=<AddBackward0>)



 65%|██████▍   | 230/354 [20:31<11:05,  5.37s/it][A

	loss_cls: tensor(0.5918, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1891, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7809, device='cuda:0', grad_fn=<AddBackward0>)



 65%|██████▌   | 231/354 [20:36<10:59,  5.36s/it][A

	loss_cls: tensor(0.8502, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1792, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0294, device='cuda:0', grad_fn=<AddBackward0>)



 66%|██████▌   | 232/354 [20:42<10:55,  5.37s/it][A

	loss_cls: tensor(0.5731, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3391, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9122, device='cuda:0', grad_fn=<AddBackward0>)



 66%|██████▌   | 233/354 [20:47<10:47,  5.35s/it][A

	loss_cls: tensor(0.5637, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2621, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8258, device='cuda:0', grad_fn=<AddBackward0>)



 66%|██████▌   | 234/354 [20:52<10:40,  5.34s/it][A

	loss_cls: tensor(0.4966, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1846, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6812, device='cuda:0', grad_fn=<AddBackward0>)



 66%|██████▋   | 235/354 [20:58<10:37,  5.35s/it][A

	loss_cls: tensor(0.5162, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1023, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6185, device='cuda:0', grad_fn=<AddBackward0>)



 67%|██████▋   | 236/354 [21:03<10:31,  5.35s/it][A

	loss_cls: tensor(0.7033, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2099, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9132, device='cuda:0', grad_fn=<AddBackward0>)



 67%|██████▋   | 237/354 [21:08<10:28,  5.37s/it][A

	loss_cls: tensor(0.4575, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2113, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6688, device='cuda:0', grad_fn=<AddBackward0>)



 67%|██████▋   | 238/354 [21:14<10:21,  5.36s/it][A

	loss_cls: tensor(0.5576, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1538, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7114, device='cuda:0', grad_fn=<AddBackward0>)



 68%|██████▊   | 239/354 [21:19<10:17,  5.37s/it][A

	loss_cls: tensor(0.6883, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1809, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8692, device='cuda:0', grad_fn=<AddBackward0>)



 68%|██████▊   | 240/354 [21:24<10:10,  5.36s/it][A

	loss_cls: tensor(0.4276, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1499, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5775, device='cuda:0', grad_fn=<AddBackward0>)



 68%|██████▊   | 241/354 [21:30<10:04,  5.35s/it][A

	loss_cls: tensor(0.5169, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1797, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6967, device='cuda:0', grad_fn=<AddBackward0>)



 68%|██████▊   | 242/354 [21:35<10:01,  5.37s/it][A

	loss_cls: tensor(0.4837, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0460, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5298, device='cuda:0', grad_fn=<AddBackward0>)



 69%|██████▊   | 243/354 [21:41<09:54,  5.36s/it][A

	loss_cls: tensor(0.6782, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1250, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8033, device='cuda:0', grad_fn=<AddBackward0>)



 69%|██████▉   | 244/354 [21:46<09:50,  5.37s/it][A

	loss_cls: tensor(0.4075, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2016, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6091, device='cuda:0', grad_fn=<AddBackward0>)



 69%|██████▉   | 245/354 [21:51<09:44,  5.36s/it][A

	loss_cls: tensor(0.4270, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2058, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6329, device='cuda:0', grad_fn=<AddBackward0>)



 69%|██████▉   | 246/354 [21:57<09:40,  5.38s/it][A

	loss_cls: tensor(0.3535, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0310, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.3845, device='cuda:0', grad_fn=<AddBackward0>)



 70%|██████▉   | 247/354 [22:02<09:33,  5.36s/it][A

	loss_cls: tensor(0.6043, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0991, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7034, device='cuda:0', grad_fn=<AddBackward0>)



 70%|███████   | 248/354 [22:07<09:28,  5.37s/it][A

	loss_cls: tensor(0.6090, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2748, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8838, device='cuda:0', grad_fn=<AddBackward0>)



 70%|███████   | 249/354 [22:13<09:23,  5.37s/it][A

	loss_cls: tensor(0.7123, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1354, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8477, device='cuda:0', grad_fn=<AddBackward0>)



 71%|███████   | 250/354 [22:18<09:18,  5.37s/it][A

	loss_cls: tensor(0.7630, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3862, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1492, device='cuda:0', grad_fn=<AddBackward0>)



 71%|███████   | 251/354 [22:23<09:13,  5.38s/it][A

	loss_cls: tensor(0.4652, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3970, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8622, device='cuda:0', grad_fn=<AddBackward0>)



 71%|███████   | 252/354 [22:29<09:07,  5.37s/it][A

	loss_cls: tensor(0.6990, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4935, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1925, device='cuda:0', grad_fn=<AddBackward0>)



 71%|███████▏  | 253/354 [22:34<09:03,  5.38s/it][A

	loss_cls: tensor(0.4848, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0344, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5191, device='cuda:0', grad_fn=<AddBackward0>)



 72%|███████▏  | 254/354 [22:40<08:56,  5.36s/it][A

	loss_cls: tensor(0.4902, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0332, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5234, device='cuda:0', grad_fn=<AddBackward0>)



 72%|███████▏  | 255/354 [22:45<08:52,  5.38s/it][A

	loss_cls: tensor(0.3085, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0659, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.3744, device='cuda:0', grad_fn=<AddBackward0>)



 72%|███████▏  | 256/354 [22:50<08:46,  5.37s/it][A

	loss_cls: tensor(0.5110, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1019, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6129, device='cuda:0', grad_fn=<AddBackward0>)



 73%|███████▎  | 257/354 [22:56<08:39,  5.36s/it][A

	loss_cls: tensor(0.5245, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2944, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8189, device='cuda:0', grad_fn=<AddBackward0>)



 73%|███████▎  | 258/354 [23:01<08:35,  5.37s/it][A

	loss_cls: tensor(0.4235, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0369, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4604, device='cuda:0', grad_fn=<AddBackward0>)



 73%|███████▎  | 259/354 [23:06<08:28,  5.35s/it][A

	loss_cls: tensor(0.5054, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2007, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7061, device='cuda:0', grad_fn=<AddBackward0>)



 73%|███████▎  | 260/354 [23:12<08:25,  5.37s/it][A

	loss_cls: tensor(0.4742, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5763, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0504, device='cuda:0', grad_fn=<AddBackward0>)



 74%|███████▎  | 261/354 [23:17<08:18,  5.36s/it][A

	loss_cls: tensor(0.7239, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2165, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9404, device='cuda:0', grad_fn=<AddBackward0>)



 74%|███████▍  | 262/354 [23:23<08:14,  5.38s/it][A

	loss_cls: tensor(0.4178, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1891, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6068, device='cuda:0', grad_fn=<AddBackward0>)



 74%|███████▍  | 263/354 [23:28<08:08,  5.37s/it][A

	loss_cls: tensor(0.9366, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0911, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0277, device='cuda:0', grad_fn=<AddBackward0>)



 75%|███████▍  | 264/354 [23:33<08:03,  5.37s/it][A

	loss_cls: tensor(1.0279, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2843, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.3121, device='cuda:0', grad_fn=<AddBackward0>)



 75%|███████▍  | 265/354 [23:39<07:59,  5.39s/it][A

	loss_cls: tensor(0.6346, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3312, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9659, device='cuda:0', grad_fn=<AddBackward0>)



 75%|███████▌  | 266/354 [23:44<07:53,  5.38s/it][A

	loss_cls: tensor(1.3113, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3774, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.6887, device='cuda:0', grad_fn=<AddBackward0>)



 75%|███████▌  | 267/354 [23:49<07:48,  5.38s/it][A

	loss_cls: tensor(0.2791, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1603, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4394, device='cuda:0', grad_fn=<AddBackward0>)



 76%|███████▌  | 268/354 [23:55<07:41,  5.36s/it][A

	loss_cls: tensor(0.8365, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1370, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9735, device='cuda:0', grad_fn=<AddBackward0>)



 76%|███████▌  | 269/354 [24:00<07:37,  5.38s/it][A

	loss_cls: tensor(0.4067, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0700, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4768, device='cuda:0', grad_fn=<AddBackward0>)



 76%|███████▋  | 270/354 [24:06<07:30,  5.36s/it][A

	loss_cls: tensor(0.5155, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0980, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6135, device='cuda:0', grad_fn=<AddBackward0>)



 77%|███████▋  | 271/354 [24:11<07:24,  5.36s/it][A

	loss_cls: tensor(0.3300, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1393, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4693, device='cuda:0', grad_fn=<AddBackward0>)



 77%|███████▋  | 272/354 [24:16<07:20,  5.37s/it][A

	loss_cls: tensor(0.3633, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3032, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6665, device='cuda:0', grad_fn=<AddBackward0>)



 77%|███████▋  | 273/354 [24:22<07:13,  5.35s/it][A

	loss_cls: tensor(0.4222, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1943, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6166, device='cuda:0', grad_fn=<AddBackward0>)



 77%|███████▋  | 274/354 [24:27<07:09,  5.37s/it][A

	loss_cls: tensor(0.7151, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1144, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8295, device='cuda:0', grad_fn=<AddBackward0>)



 78%|███████▊  | 275/354 [24:32<07:03,  5.36s/it][A

	loss_cls: tensor(0.5317, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1131, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6448, device='cuda:0', grad_fn=<AddBackward0>)



 78%|███████▊  | 276/354 [24:38<06:58,  5.37s/it][A

	loss_cls: tensor(0.5910, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3935, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9845, device='cuda:0', grad_fn=<AddBackward0>)



 78%|███████▊  | 277/354 [24:43<06:52,  5.35s/it][A

	loss_cls: tensor(0.5049, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2856, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7905, device='cuda:0', grad_fn=<AddBackward0>)



 79%|███████▊  | 278/354 [24:48<06:46,  5.34s/it][A

	loss_cls: tensor(0.4021, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2703, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6723, device='cuda:0', grad_fn=<AddBackward0>)



 79%|███████▉  | 279/354 [24:54<06:41,  5.36s/it][A

	loss_cls: tensor(0.6142, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3674, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9816, device='cuda:0', grad_fn=<AddBackward0>)



 79%|███████▉  | 280/354 [24:59<06:35,  5.35s/it][A

	loss_cls: tensor(0.5229, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2061, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7290, device='cuda:0', grad_fn=<AddBackward0>)



 79%|███████▉  | 281/354 [25:04<06:31,  5.36s/it][A

	loss_cls: tensor(0.6822, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2152, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8974, device='cuda:0', grad_fn=<AddBackward0>)



 80%|███████▉  | 282/354 [25:10<06:25,  5.35s/it][A

	loss_cls: tensor(0.3922, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3438, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7360, device='cuda:0', grad_fn=<AddBackward0>)



 80%|███████▉  | 283/354 [25:15<06:21,  5.37s/it][A

	loss_cls: tensor(0.7608, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4935, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2543, device='cuda:0', grad_fn=<AddBackward0>)



 80%|████████  | 284/354 [25:21<06:15,  5.36s/it][A

	loss_cls: tensor(0.5514, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0592, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6107, device='cuda:0', grad_fn=<AddBackward0>)



 81%|████████  | 285/354 [25:26<06:09,  5.36s/it][A

	loss_cls: tensor(0.5610, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4069, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9679, device='cuda:0', grad_fn=<AddBackward0>)



 81%|████████  | 286/354 [25:31<06:05,  5.38s/it][A

	loss_cls: tensor(0.6838, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4634, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1472, device='cuda:0', grad_fn=<AddBackward0>)



 81%|████████  | 287/354 [25:37<06:00,  5.38s/it][A

	loss_cls: tensor(0.6076, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2720, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8796, device='cuda:0', grad_fn=<AddBackward0>)



 81%|████████▏ | 288/354 [25:42<05:55,  5.39s/it][A

	loss_cls: tensor(0.4055, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1874, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5929, device='cuda:0', grad_fn=<AddBackward0>)



 82%|████████▏ | 289/354 [25:47<05:49,  5.37s/it][A

	loss_cls: tensor(0.5177, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2788, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7965, device='cuda:0', grad_fn=<AddBackward0>)



 82%|████████▏ | 290/354 [25:53<05:44,  5.38s/it][A

	loss_cls: tensor(0.6279, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0911, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7189, device='cuda:0', grad_fn=<AddBackward0>)



 82%|████████▏ | 291/354 [25:58<05:37,  5.36s/it][A

	loss_cls: tensor(0.4891, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3152, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8043, device='cuda:0', grad_fn=<AddBackward0>)



 82%|████████▏ | 292/354 [26:03<05:31,  5.35s/it][A

	loss_cls: tensor(0.6654, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2711, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9365, device='cuda:0', grad_fn=<AddBackward0>)



 83%|████████▎ | 293/354 [26:09<05:27,  5.37s/it][A

	loss_cls: tensor(0.5611, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1267, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6878, device='cuda:0', grad_fn=<AddBackward0>)



 83%|████████▎ | 294/354 [26:14<05:21,  5.35s/it][A

	loss_cls: tensor(0.6383, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2026, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8409, device='cuda:0', grad_fn=<AddBackward0>)



 83%|████████▎ | 295/354 [26:20<05:16,  5.37s/it][A

	loss_cls: tensor(0.8909, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1091, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0000, device='cuda:0', grad_fn=<AddBackward0>)



 84%|████████▎ | 296/354 [26:25<05:11,  5.36s/it][A

	loss_cls: tensor(0.5578, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2110, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7688, device='cuda:0', grad_fn=<AddBackward0>)



 84%|████████▍ | 297/354 [26:30<05:06,  5.38s/it][A

	loss_cls: tensor(0.4221, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0904, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5124, device='cuda:0', grad_fn=<AddBackward0>)



 84%|████████▍ | 298/354 [26:36<05:00,  5.37s/it][A

	loss_cls: tensor(0.7535, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3480, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1015, device='cuda:0', grad_fn=<AddBackward0>)



 84%|████████▍ | 299/354 [26:41<04:57,  5.40s/it][A

	loss_cls: tensor(1.0548, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2795, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.3344, device='cuda:0', grad_fn=<AddBackward0>)



 85%|████████▍ | 300/354 [26:46<04:49,  5.36s/it][A

	loss_cls: tensor(0.3551, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1299, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4850, device='cuda:0', grad_fn=<AddBackward0>)



 85%|████████▌ | 301/354 [26:51<04:36,  5.22s/it][A

	loss_cls: tensor(0.6064, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2249, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8314, device='cuda:0', grad_fn=<AddBackward0>)



 85%|████████▌ | 302/354 [26:56<04:27,  5.15s/it][A

	loss_cls: tensor(0.4544, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2403, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6946, device='cuda:0', grad_fn=<AddBackward0>)



 86%|████████▌ | 303/354 [27:01<04:19,  5.08s/it][A

	loss_cls: tensor(0.7568, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3261, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0829, device='cuda:0', grad_fn=<AddBackward0>)



 86%|████████▌ | 304/354 [27:06<04:12,  5.05s/it][A

	loss_cls: tensor(0.4927, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3802, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8729, device='cuda:0', grad_fn=<AddBackward0>)



 86%|████████▌ | 305/354 [27:11<04:05,  5.01s/it][A

	loss_cls: tensor(0.4283, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1513, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5796, device='cuda:0', grad_fn=<AddBackward0>)



 86%|████████▋ | 306/354 [27:16<03:59,  4.98s/it][A

	loss_cls: tensor(0.8468, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0763, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9231, device='cuda:0', grad_fn=<AddBackward0>)



 87%|████████▋ | 307/354 [27:21<03:59,  5.09s/it][A

	loss_cls: tensor(0.9271, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5324, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.4595, device='cuda:0', grad_fn=<AddBackward0>)



 87%|████████▋ | 308/354 [27:27<03:59,  5.20s/it][A

	loss_cls: tensor(1.1257, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3497, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.4755, device='cuda:0', grad_fn=<AddBackward0>)



 87%|████████▋ | 309/354 [27:32<03:59,  5.32s/it][A

	loss_cls: tensor(1.1045, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3885, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.4930, device='cuda:0', grad_fn=<AddBackward0>)



 88%|████████▊ | 310/354 [27:38<03:55,  5.36s/it][A

	loss_cls: tensor(0.6800, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.6280, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.3080, device='cuda:0', grad_fn=<AddBackward0>)



 88%|████████▊ | 311/354 [27:43<03:52,  5.40s/it][A

	loss_cls: tensor(0.6006, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0607, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6613, device='cuda:0', grad_fn=<AddBackward0>)



 88%|████████▊ | 312/354 [27:49<03:47,  5.43s/it][A

	loss_cls: tensor(0.5909, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0737, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6646, device='cuda:0', grad_fn=<AddBackward0>)



 88%|████████▊ | 313/354 [27:54<03:43,  5.44s/it][A

	loss_cls: tensor(0.8187, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0857, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9044, device='cuda:0', grad_fn=<AddBackward0>)



 89%|████████▊ | 314/354 [28:00<03:38,  5.47s/it][A

	loss_cls: tensor(0.6782, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1118, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7900, device='cuda:0', grad_fn=<AddBackward0>)



 89%|████████▉ | 315/354 [28:05<03:34,  5.49s/it][A

	loss_cls: tensor(0.6833, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4833, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1665, device='cuda:0', grad_fn=<AddBackward0>)



 89%|████████▉ | 316/354 [28:11<03:29,  5.51s/it][A

	loss_cls: tensor(0.5600, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1978, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7578, device='cuda:0', grad_fn=<AddBackward0>)



 90%|████████▉ | 317/354 [28:17<03:23,  5.51s/it][A

	loss_cls: tensor(0.5140, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1587, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6726, device='cuda:0', grad_fn=<AddBackward0>)



 90%|████████▉ | 318/354 [28:22<03:19,  5.53s/it][A

	loss_cls: tensor(0.5690, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3245, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8936, device='cuda:0', grad_fn=<AddBackward0>)



 90%|█████████ | 319/354 [28:28<03:13,  5.52s/it][A

	loss_cls: tensor(0.4760, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3557, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8317, device='cuda:0', grad_fn=<AddBackward0>)



 90%|█████████ | 320/354 [28:33<03:07,  5.51s/it][A

	loss_cls: tensor(0.5694, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1389, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7083, device='cuda:0', grad_fn=<AddBackward0>)



 91%|█████████ | 321/354 [28:39<03:02,  5.52s/it][A

	loss_cls: tensor(0.4892, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1414, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6306, device='cuda:0', grad_fn=<AddBackward0>)



 91%|█████████ | 322/354 [28:44<02:56,  5.51s/it][A

	loss_cls: tensor(0.5854, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0730, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6584, device='cuda:0', grad_fn=<AddBackward0>)



 91%|█████████ | 323/354 [28:50<02:51,  5.53s/it][A

	loss_cls: tensor(0.5280, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3454, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8734, device='cuda:0', grad_fn=<AddBackward0>)



 92%|█████████▏| 324/354 [28:55<02:45,  5.52s/it][A

	loss_cls: tensor(0.4869, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0857, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5726, device='cuda:0', grad_fn=<AddBackward0>)



 92%|█████████▏| 325/354 [29:01<02:40,  5.54s/it][A

	loss_cls: tensor(0.5476, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5489, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0965, device='cuda:0', grad_fn=<AddBackward0>)



 92%|█████████▏| 326/354 [29:06<02:34,  5.53s/it][A

	loss_cls: tensor(0.6444, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2690, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9134, device='cuda:0', grad_fn=<AddBackward0>)



 92%|█████████▏| 327/354 [29:12<02:28,  5.51s/it][A

	loss_cls: tensor(0.5877, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2816, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8694, device='cuda:0', grad_fn=<AddBackward0>)



 93%|█████████▎| 328/354 [29:17<02:23,  5.52s/it][A

	loss_cls: tensor(0.5262, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5505, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0768, device='cuda:0', grad_fn=<AddBackward0>)



 93%|█████████▎| 329/354 [29:23<02:17,  5.51s/it][A

	loss_cls: tensor(0.4915, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1428, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6343, device='cuda:0', grad_fn=<AddBackward0>)



 93%|█████████▎| 330/354 [29:28<02:12,  5.53s/it][A

	loss_cls: tensor(0.7673, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3087, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0759, device='cuda:0', grad_fn=<AddBackward0>)



 94%|█████████▎| 331/354 [29:34<02:06,  5.51s/it][A

	loss_cls: tensor(0.5785, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0543, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6328, device='cuda:0', grad_fn=<AddBackward0>)



 94%|█████████▍| 332/354 [29:39<02:01,  5.53s/it][A

	loss_cls: tensor(0.5491, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0809, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6300, device='cuda:0', grad_fn=<AddBackward0>)



 94%|█████████▍| 333/354 [29:45<01:55,  5.51s/it][A

	loss_cls: tensor(0.5562, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1815, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7377, device='cuda:0', grad_fn=<AddBackward0>)



 94%|█████████▍| 334/354 [29:50<01:49,  5.50s/it][A

	loss_cls: tensor(0.4953, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1425, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6378, device='cuda:0', grad_fn=<AddBackward0>)



 95%|█████████▍| 335/354 [29:56<01:44,  5.52s/it][A

	loss_cls: tensor(0.4998, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2873, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7870, device='cuda:0', grad_fn=<AddBackward0>)



 95%|█████████▍| 336/354 [30:01<01:39,  5.51s/it][A

	loss_cls: tensor(0.4607, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4329, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8936, device='cuda:0', grad_fn=<AddBackward0>)



 95%|█████████▌| 337/354 [30:07<01:34,  5.53s/it][A

	loss_cls: tensor(0.4524, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3364, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7888, device='cuda:0', grad_fn=<AddBackward0>)



 95%|█████████▌| 338/354 [30:12<01:28,  5.52s/it][A

	loss_cls: tensor(0.7349, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3081, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0430, device='cuda:0', grad_fn=<AddBackward0>)



 96%|█████████▌| 339/354 [30:18<01:23,  5.53s/it][A

	loss_cls: tensor(0.5326, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2106, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7431, device='cuda:0', grad_fn=<AddBackward0>)



 96%|█████████▌| 340/354 [30:23<01:17,  5.51s/it][A

	loss_cls: tensor(0.4998, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1563, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6561, device='cuda:0', grad_fn=<AddBackward0>)



 96%|█████████▋| 341/354 [30:29<01:11,  5.50s/it][A

	loss_cls: tensor(0.2868, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3659, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6526, device='cuda:0', grad_fn=<AddBackward0>)



 97%|█████████▋| 342/354 [30:35<01:06,  5.51s/it][A

	loss_cls: tensor(0.4925, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2087, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7012, device='cuda:0', grad_fn=<AddBackward0>)



 97%|█████████▋| 343/354 [30:40<01:00,  5.51s/it][A

	loss_cls: tensor(0.4601, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1784, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6386, device='cuda:0', grad_fn=<AddBackward0>)



 97%|█████████▋| 344/354 [30:46<00:55,  5.53s/it][A

	loss_cls: tensor(0.5501, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1996, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7497, device='cuda:0', grad_fn=<AddBackward0>)



 97%|█████████▋| 345/354 [30:51<00:49,  5.52s/it][A

	loss_cls: tensor(0.6200, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0868, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7068, device='cuda:0', grad_fn=<AddBackward0>)



 98%|█████████▊| 346/354 [30:57<00:44,  5.53s/it][A

	loss_cls: tensor(0.5734, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0824, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6558, device='cuda:0', grad_fn=<AddBackward0>)



 98%|█████████▊| 347/354 [31:02<00:38,  5.51s/it][A

	loss_cls: tensor(0.8133, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1726, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9859, device='cuda:0', grad_fn=<AddBackward0>)



 98%|█████████▊| 348/354 [31:08<00:33,  5.52s/it][A

	loss_cls: tensor(0.6520, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2762, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9282, device='cuda:0', grad_fn=<AddBackward0>)



 99%|█████████▊| 349/354 [31:13<00:27,  5.51s/it][A

	loss_cls: tensor(0.5375, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0927, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6302, device='cuda:0', grad_fn=<AddBackward0>)



 99%|█████████▉| 350/354 [31:19<00:22,  5.50s/it][A

	loss_cls: tensor(0.6292, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4051, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0343, device='cuda:0', grad_fn=<AddBackward0>)



 99%|█████████▉| 351/354 [31:24<00:16,  5.53s/it][A

	loss_cls: tensor(0.4179, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0961, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5139, device='cuda:0', grad_fn=<AddBackward0>)



 99%|█████████▉| 352/354 [31:30<00:11,  5.51s/it][A

	loss_cls: tensor(0.4320, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0347, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4667, device='cuda:0', grad_fn=<AddBackward0>)



100%|█████████▉| 353/354 [31:35<00:05,  5.53s/it][A

	loss_cls: tensor(0.6562, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2221, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8783, device='cuda:0', grad_fn=<AddBackward0>)



100%|██████████| 354/354 [31:37<00:00,  5.36s/it][A

  0%|          | 0/177 [00:00<?, ?it/s][A

	loss_cls: tensor(0.2605, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0200, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.2805, device='cuda:0', grad_fn=<AddBackward0>)

	Training Loss: 0.8157978105848118

	Training cls acc: 0.7063912429378532

	Training cls prec: 0.5878757894647725

	Training cls rec: 0.6333898798941172

	Training cls f1: 0.5521775753062086

--
	Training ner acc: 0.9559163330637964

	Training ner prec: 0.2777465098611161

	Training ner rec: 0.2855009946148508

	Training ner f1: 0.2810542446672259

	Current Learning rate:  0.00042857142857142855



  1%|          | 1/177 [00:00<02:14,  1.31it/s][A
  1%|          | 2/177 [00:01<02:11,  1.34it/s][A
  2%|▏         | 3/177 [00:02<02:04,  1.40it/s][A
  2%|▏         | 4/177 [00:02<02:06,  1.37it/s][A
  3%|▎         | 5/177 [00:03<02:06,  1.35it/s][A
  3%|▎         | 6/177 [00:04<02:02,  1.40it/s][A
  4%|▍         | 7/177 [00:05<02:03,  1.37it/s][A
  5%|▍         | 8/177 [00:05<02:04,  1.36it/s][A
  5%|▌         | 9/177 [00:06<02:04,  1.35it/s][A
  6%|▌         | 10/177 [00:07<02:01,  1.38it/s][A
  6%|▌         | 11/177 [00:08<02:00,  1.38it/s][A
  7%|▋         | 12/177 [00:08<01:58,  1.39it/s][A
  7%|▋         | 13/177 [00:09<01:55,  1.41it/s][A
  8%|▊         | 14/177 [00:10<01:55,  1.41it/s][A
  8%|▊         | 15/177 [00:10<01:55,  1.40it/s][A
  9%|▉         | 16/177 [00:11<01:58,  1.36it/s][A
 10%|▉         | 17/177 [00:12<01:54,  1.40it/s][A
 10%|█         | 18/177 [00:13<01:55,  1.37it/s][A
 11%|█         | 19/177 [00:13<01:56,  1.36it/s][A
 11%|█▏        | 20/


	Validation Loss: 0.790059017680459

	Validation cls acc: 0.7645951035781543

	Validation cls prec: 0.6113666935700834

	Validation cls rec: 0.6204028786655905

	Validation cls f1: 0.588031271929577

--
	Validation ner acc: 0.9535199030053423

	Validation ner prec: 0.38435777599852533

	Validation ner rec: 0.39510357815442565

	Validation ner f1: 0.3895310437196512



  0%|          | 1/354 [00:05<31:52,  5.42s/it][A

	loss_cls: tensor(0.4871, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0997, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5868, device='cuda:0', grad_fn=<AddBackward0>)



  1%|          | 2/354 [00:10<31:47,  5.42s/it][A

	loss_cls: tensor(0.4719, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2523, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7243, device='cuda:0', grad_fn=<AddBackward0>)



  1%|          | 3/354 [00:16<31:59,  5.47s/it][A

	loss_cls: tensor(0.5247, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2995, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8242, device='cuda:0', grad_fn=<AddBackward0>)



  1%|          | 4/354 [00:21<31:51,  5.46s/it][A

	loss_cls: tensor(0.6375, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2665, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9041, device='cuda:0', grad_fn=<AddBackward0>)



  1%|▏         | 5/354 [00:27<31:58,  5.50s/it][A

	loss_cls: tensor(0.3007, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3176, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6183, device='cuda:0', grad_fn=<AddBackward0>)



  2%|▏         | 6/354 [00:32<31:51,  5.49s/it][A

	loss_cls: tensor(0.7974, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4784, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2758, device='cuda:0', grad_fn=<AddBackward0>)



  2%|▏         | 7/354 [00:38<31:54,  5.52s/it][A

	loss_cls: tensor(0.7000, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0892, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7892, device='cuda:0', grad_fn=<AddBackward0>)



  2%|▏         | 8/354 [00:43<31:42,  5.50s/it][A

	loss_cls: tensor(0.2941, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0730, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.3671, device='cuda:0', grad_fn=<AddBackward0>)



  3%|▎         | 9/354 [00:49<31:30,  5.48s/it][A

	loss_cls: tensor(0.7445, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.6198, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.3644, device='cuda:0', grad_fn=<AddBackward0>)



  3%|▎         | 10/354 [00:54<31:27,  5.49s/it][A

	loss_cls: tensor(0.3983, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0453, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4436, device='cuda:0', grad_fn=<AddBackward0>)



  3%|▎         | 11/354 [01:00<31:20,  5.48s/it][A

	loss_cls: tensor(0.5495, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0630, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6125, device='cuda:0', grad_fn=<AddBackward0>)



  3%|▎         | 12/354 [01:05<31:21,  5.50s/it][A

	loss_cls: tensor(0.5788, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2158, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7946, device='cuda:0', grad_fn=<AddBackward0>)



  4%|▎         | 13/354 [01:11<31:14,  5.50s/it][A

	loss_cls: tensor(0.3042, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2264, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5305, device='cuda:0', grad_fn=<AddBackward0>)



  4%|▍         | 14/354 [01:16<31:13,  5.51s/it][A

	loss_cls: tensor(0.4860, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3056, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7915, device='cuda:0', grad_fn=<AddBackward0>)



  4%|▍         | 15/354 [01:22<31:01,  5.49s/it][A

	loss_cls: tensor(0.7305, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4586, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1891, device='cuda:0', grad_fn=<AddBackward0>)



  5%|▍         | 16/354 [01:27<30:51,  5.48s/it][A

	loss_cls: tensor(0.4207, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3392, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7599, device='cuda:0', grad_fn=<AddBackward0>)



  5%|▍         | 17/354 [01:33<30:50,  5.49s/it][A

	loss_cls: tensor(0.5104, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1341, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6445, device='cuda:0', grad_fn=<AddBackward0>)



  5%|▌         | 18/354 [01:38<30:43,  5.49s/it][A

	loss_cls: tensor(0.6652, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3102, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9754, device='cuda:0', grad_fn=<AddBackward0>)



  5%|▌         | 19/354 [01:44<30:44,  5.51s/it][A

	loss_cls: tensor(0.7041, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1917, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8957, device='cuda:0', grad_fn=<AddBackward0>)



  6%|▌         | 20/354 [01:49<30:34,  5.49s/it][A

	loss_cls: tensor(0.9299, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3060, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2359, device='cuda:0', grad_fn=<AddBackward0>)



  6%|▌         | 21/354 [01:55<30:35,  5.51s/it][A

	loss_cls: tensor(0.4945, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2351, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7296, device='cuda:0', grad_fn=<AddBackward0>)



  6%|▌         | 22/354 [02:00<30:24,  5.49s/it][A

	loss_cls: tensor(1.3427, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1592, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.5018, device='cuda:0', grad_fn=<AddBackward0>)



  6%|▋         | 23/354 [02:06<30:00,  5.44s/it][A

	loss_cls: tensor(0.4303, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1336, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5639, device='cuda:0', grad_fn=<AddBackward0>)



  7%|▋         | 24/354 [02:11<29:52,  5.43s/it][A

	loss_cls: tensor(0.5742, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0877, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6619, device='cuda:0', grad_fn=<AddBackward0>)



  7%|▋         | 25/354 [02:16<29:37,  5.40s/it][A

	loss_cls: tensor(0.4539, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2023, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6562, device='cuda:0', grad_fn=<AddBackward0>)



  7%|▋         | 26/354 [02:22<29:33,  5.41s/it][A

	loss_cls: tensor(0.4886, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0884, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5771, device='cuda:0', grad_fn=<AddBackward0>)



  8%|▊         | 27/354 [02:27<29:21,  5.39s/it][A

	loss_cls: tensor(0.3684, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0480, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4164, device='cuda:0', grad_fn=<AddBackward0>)



  8%|▊         | 28/354 [02:33<29:19,  5.40s/it][A

	loss_cls: tensor(0.6764, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3777, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0541, device='cuda:0', grad_fn=<AddBackward0>)



  8%|▊         | 29/354 [02:38<29:09,  5.38s/it][A

	loss_cls: tensor(0.6871, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3476, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0347, device='cuda:0', grad_fn=<AddBackward0>)



  8%|▊         | 30/354 [02:43<28:57,  5.36s/it][A

	loss_cls: tensor(0.7633, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2816, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0449, device='cuda:0', grad_fn=<AddBackward0>)



  9%|▉         | 31/354 [02:49<28:59,  5.39s/it][A

	loss_cls: tensor(0.5783, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2061, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7844, device='cuda:0', grad_fn=<AddBackward0>)



  9%|▉         | 32/354 [02:54<28:50,  5.37s/it][A

	loss_cls: tensor(0.5016, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0760, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5776, device='cuda:0', grad_fn=<AddBackward0>)



  9%|▉         | 33/354 [02:59<28:50,  5.39s/it][A

	loss_cls: tensor(0.5149, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2565, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7714, device='cuda:0', grad_fn=<AddBackward0>)



 10%|▉         | 34/354 [03:05<28:39,  5.37s/it][A

	loss_cls: tensor(0.6835, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2820, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9655, device='cuda:0', grad_fn=<AddBackward0>)



 10%|▉         | 35/354 [03:10<28:38,  5.39s/it][A

	loss_cls: tensor(0.4788, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3282, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8070, device='cuda:0', grad_fn=<AddBackward0>)



 10%|█         | 36/354 [03:15<28:28,  5.37s/it][A

	loss_cls: tensor(0.4481, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0639, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5120, device='cuda:0', grad_fn=<AddBackward0>)



 10%|█         | 37/354 [03:21<28:25,  5.38s/it][A

	loss_cls: tensor(0.7736, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2280, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0016, device='cuda:0', grad_fn=<AddBackward0>)



 11%|█         | 38/354 [03:26<28:19,  5.38s/it][A

	loss_cls: tensor(0.3856, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1709, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5565, device='cuda:0', grad_fn=<AddBackward0>)



 11%|█         | 39/354 [03:32<28:12,  5.37s/it][A

	loss_cls: tensor(0.5566, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0424, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5990, device='cuda:0', grad_fn=<AddBackward0>)



 11%|█▏        | 40/354 [03:37<28:11,  5.39s/it][A

	loss_cls: tensor(0.5614, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4571, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0185, device='cuda:0', grad_fn=<AddBackward0>)



 12%|█▏        | 41/354 [03:42<28:06,  5.39s/it][A

	loss_cls: tensor(0.6213, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3578, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9790, device='cuda:0', grad_fn=<AddBackward0>)



 12%|█▏        | 42/354 [03:48<28:02,  5.39s/it][A

	loss_cls: tensor(1.0354, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1810, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2164, device='cuda:0', grad_fn=<AddBackward0>)



 12%|█▏        | 43/354 [03:53<27:52,  5.38s/it][A

	loss_cls: tensor(0.3103, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0404, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.3507, device='cuda:0', grad_fn=<AddBackward0>)



 12%|█▏        | 44/354 [03:59<27:52,  5.40s/it][A

	loss_cls: tensor(0.4795, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3020, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7815, device='cuda:0', grad_fn=<AddBackward0>)



 13%|█▎        | 45/354 [04:04<27:41,  5.38s/it][A

	loss_cls: tensor(1.0091, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2706, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2797, device='cuda:0', grad_fn=<AddBackward0>)



 13%|█▎        | 46/354 [04:09<27:31,  5.36s/it][A

	loss_cls: tensor(0.6266, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1409, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7675, device='cuda:0', grad_fn=<AddBackward0>)



 13%|█▎        | 47/354 [04:15<27:32,  5.38s/it][A

	loss_cls: tensor(0.5035, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3590, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8625, device='cuda:0', grad_fn=<AddBackward0>)



 14%|█▎        | 48/354 [04:20<27:23,  5.37s/it][A

	loss_cls: tensor(0.6551, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2629, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9181, device='cuda:0', grad_fn=<AddBackward0>)



 14%|█▍        | 49/354 [04:25<27:19,  5.37s/it][A

	loss_cls: tensor(0.5276, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2288, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7565, device='cuda:0', grad_fn=<AddBackward0>)



 14%|█▍        | 50/354 [04:31<27:08,  5.36s/it][A

	loss_cls: tensor(0.9104, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1961, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1066, device='cuda:0', grad_fn=<AddBackward0>)



 14%|█▍        | 51/354 [04:36<27:11,  5.39s/it][A

	loss_cls: tensor(0.8219, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1804, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0023, device='cuda:0', grad_fn=<AddBackward0>)



 15%|█▍        | 52/354 [04:42<27:02,  5.37s/it][A

	loss_cls: tensor(0.5433, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2854, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8287, device='cuda:0', grad_fn=<AddBackward0>)



 15%|█▍        | 53/354 [04:47<26:56,  5.37s/it][A

	loss_cls: tensor(1.0685, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3884, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.4569, device='cuda:0', grad_fn=<AddBackward0>)



 15%|█▌        | 54/354 [04:52<26:56,  5.39s/it][A

	loss_cls: tensor(0.6351, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3929, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0280, device='cuda:0', grad_fn=<AddBackward0>)



 16%|█▌        | 55/354 [04:58<26:46,  5.37s/it][A

	loss_cls: tensor(0.4475, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2001, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6476, device='cuda:0', grad_fn=<AddBackward0>)



 16%|█▌        | 56/354 [05:03<26:42,  5.38s/it][A

	loss_cls: tensor(0.5170, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2341, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7511, device='cuda:0', grad_fn=<AddBackward0>)



 16%|█▌        | 57/354 [05:08<26:33,  5.36s/it][A

	loss_cls: tensor(0.3196, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2532, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5728, device='cuda:0', grad_fn=<AddBackward0>)



 16%|█▋        | 58/354 [05:14<26:32,  5.38s/it][A

	loss_cls: tensor(0.4772, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4986, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9758, device='cuda:0', grad_fn=<AddBackward0>)



 17%|█▋        | 59/354 [05:19<26:26,  5.38s/it][A

	loss_cls: tensor(0.6273, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3895, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0168, device='cuda:0', grad_fn=<AddBackward0>)



 17%|█▋        | 60/354 [05:25<26:16,  5.36s/it][A

	loss_cls: tensor(0.7781, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2238, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0019, device='cuda:0', grad_fn=<AddBackward0>)



 17%|█▋        | 61/354 [05:30<26:15,  5.38s/it][A

	loss_cls: tensor(0.5646, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5255, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0901, device='cuda:0', grad_fn=<AddBackward0>)



 18%|█▊        | 62/354 [05:35<26:07,  5.37s/it][A

	loss_cls: tensor(0.6277, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1536, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7812, device='cuda:0', grad_fn=<AddBackward0>)



 18%|█▊        | 63/354 [05:41<26:05,  5.38s/it][A

	loss_cls: tensor(0.7920, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1317, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9237, device='cuda:0', grad_fn=<AddBackward0>)



 18%|█▊        | 64/354 [05:46<25:56,  5.37s/it][A

	loss_cls: tensor(0.3865, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2603, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6468, device='cuda:0', grad_fn=<AddBackward0>)



 18%|█▊        | 65/354 [05:51<25:56,  5.39s/it][A

	loss_cls: tensor(0.7600, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1177, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8778, device='cuda:0', grad_fn=<AddBackward0>)



 19%|█▊        | 66/354 [05:57<25:46,  5.37s/it][A

	loss_cls: tensor(0.6862, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1506, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8368, device='cuda:0', grad_fn=<AddBackward0>)



 19%|█▉        | 67/354 [06:02<25:38,  5.36s/it][A

	loss_cls: tensor(0.6270, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2821, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9091, device='cuda:0', grad_fn=<AddBackward0>)



 19%|█▉        | 68/354 [06:08<25:35,  5.37s/it][A

	loss_cls: tensor(0.6389, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1688, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8078, device='cuda:0', grad_fn=<AddBackward0>)



 19%|█▉        | 69/354 [06:13<25:25,  5.35s/it][A

	loss_cls: tensor(0.4816, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1415, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6231, device='cuda:0', grad_fn=<AddBackward0>)



 20%|█▉        | 70/354 [06:18<25:25,  5.37s/it][A

	loss_cls: tensor(0.6094, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0836, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6930, device='cuda:0', grad_fn=<AddBackward0>)



 20%|██        | 71/354 [06:24<25:17,  5.36s/it][A

	loss_cls: tensor(0.6088, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2678, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8767, device='cuda:0', grad_fn=<AddBackward0>)



 20%|██        | 72/354 [06:29<25:17,  5.38s/it][A

	loss_cls: tensor(0.8485, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3342, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1827, device='cuda:0', grad_fn=<AddBackward0>)



 21%|██        | 73/354 [06:34<25:08,  5.37s/it][A

	loss_cls: tensor(0.5423, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1357, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6781, device='cuda:0', grad_fn=<AddBackward0>)



 21%|██        | 74/354 [06:40<25:02,  5.36s/it][A

	loss_cls: tensor(0.6970, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4160, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1130, device='cuda:0', grad_fn=<AddBackward0>)



 21%|██        | 75/354 [06:45<25:00,  5.38s/it][A

	loss_cls: tensor(0.6094, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1252, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7345, device='cuda:0', grad_fn=<AddBackward0>)



 21%|██▏       | 76/354 [06:50<24:50,  5.36s/it][A

	loss_cls: tensor(0.4666, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0965, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5632, device='cuda:0', grad_fn=<AddBackward0>)



 22%|██▏       | 77/354 [06:56<24:50,  5.38s/it][A

	loss_cls: tensor(0.5889, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1859, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7748, device='cuda:0', grad_fn=<AddBackward0>)



 22%|██▏       | 78/354 [07:01<24:41,  5.37s/it][A

	loss_cls: tensor(0.6064, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1839, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7903, device='cuda:0', grad_fn=<AddBackward0>)



 22%|██▏       | 79/354 [07:07<24:41,  5.39s/it][A

	loss_cls: tensor(0.7641, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3537, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1178, device='cuda:0', grad_fn=<AddBackward0>)



 23%|██▎       | 80/354 [07:12<24:30,  5.37s/it][A

	loss_cls: tensor(0.5614, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1755, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7369, device='cuda:0', grad_fn=<AddBackward0>)



 23%|██▎       | 81/354 [07:17<24:21,  5.35s/it][A

	loss_cls: tensor(0.4603, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1872, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6475, device='cuda:0', grad_fn=<AddBackward0>)



 23%|██▎       | 82/354 [07:23<24:21,  5.37s/it][A

	loss_cls: tensor(0.4729, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1079, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5809, device='cuda:0', grad_fn=<AddBackward0>)



 23%|██▎       | 83/354 [07:28<24:11,  5.36s/it][A

	loss_cls: tensor(0.7360, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3676, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1036, device='cuda:0', grad_fn=<AddBackward0>)



 24%|██▎       | 84/354 [07:33<24:14,  5.39s/it][A

	loss_cls: tensor(0.7236, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1972, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9209, device='cuda:0', grad_fn=<AddBackward0>)



 24%|██▍       | 85/354 [07:39<24:05,  5.37s/it][A

	loss_cls: tensor(0.8262, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3366, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1628, device='cuda:0', grad_fn=<AddBackward0>)



 24%|██▍       | 86/354 [07:44<24:06,  5.40s/it][A

	loss_cls: tensor(0.4166, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2392, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6558, device='cuda:0', grad_fn=<AddBackward0>)



 25%|██▍       | 87/354 [07:50<23:57,  5.38s/it][A

	loss_cls: tensor(0.5357, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2529, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7886, device='cuda:0', grad_fn=<AddBackward0>)



 25%|██▍       | 88/354 [07:55<23:47,  5.37s/it][A

	loss_cls: tensor(0.5585, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0598, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6183, device='cuda:0', grad_fn=<AddBackward0>)



 25%|██▌       | 89/354 [08:00<23:46,  5.38s/it][A

	loss_cls: tensor(0.4803, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3530, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8333, device='cuda:0', grad_fn=<AddBackward0>)



 25%|██▌       | 90/354 [08:06<23:35,  5.36s/it][A

	loss_cls: tensor(0.6395, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2490, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8885, device='cuda:0', grad_fn=<AddBackward0>)



 26%|██▌       | 91/354 [08:11<23:36,  5.38s/it][A

	loss_cls: tensor(0.4643, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2282, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6925, device='cuda:0', grad_fn=<AddBackward0>)



 26%|██▌       | 92/354 [08:16<23:26,  5.37s/it][A

	loss_cls: tensor(0.4286, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0949, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5235, device='cuda:0', grad_fn=<AddBackward0>)



 26%|██▋       | 93/354 [08:22<23:23,  5.38s/it][A

	loss_cls: tensor(0.6709, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1846, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8555, device='cuda:0', grad_fn=<AddBackward0>)



 27%|██▋       | 94/354 [08:27<23:17,  5.37s/it][A

	loss_cls: tensor(0.8892, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2074, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0966, device='cuda:0', grad_fn=<AddBackward0>)



 27%|██▋       | 95/354 [08:33<23:11,  5.37s/it][A

	loss_cls: tensor(0.5386, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1383, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6769, device='cuda:0', grad_fn=<AddBackward0>)



 27%|██▋       | 96/354 [08:38<23:08,  5.38s/it][A

	loss_cls: tensor(0.5687, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2723, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8410, device='cuda:0', grad_fn=<AddBackward0>)



 27%|██▋       | 97/354 [08:43<23:01,  5.37s/it][A

	loss_cls: tensor(0.6377, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3403, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9780, device='cuda:0', grad_fn=<AddBackward0>)



 28%|██▊       | 98/354 [08:49<22:57,  5.38s/it][A

	loss_cls: tensor(0.5007, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2647, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7655, device='cuda:0', grad_fn=<AddBackward0>)



 28%|██▊       | 99/354 [08:54<22:47,  5.36s/it][A

	loss_cls: tensor(0.5167, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1199, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6366, device='cuda:0', grad_fn=<AddBackward0>)



 28%|██▊       | 100/354 [08:59<22:45,  5.38s/it][A

	loss_cls: tensor(0.5807, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1463, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7270, device='cuda:0', grad_fn=<AddBackward0>)



 29%|██▊       | 101/354 [09:05<22:37,  5.37s/it][A

	loss_cls: tensor(0.6333, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1502, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7835, device='cuda:0', grad_fn=<AddBackward0>)



 29%|██▉       | 102/354 [09:10<22:26,  5.34s/it][A

	loss_cls: tensor(0.4380, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4653, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9033, device='cuda:0', grad_fn=<AddBackward0>)



 29%|██▉       | 103/354 [09:16<22:26,  5.37s/it][A

	loss_cls: tensor(0.7428, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0846, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8274, device='cuda:0', grad_fn=<AddBackward0>)



 29%|██▉       | 104/354 [09:21<22:17,  5.35s/it][A

	loss_cls: tensor(0.5836, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1482, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7317, device='cuda:0', grad_fn=<AddBackward0>)



 30%|██▉       | 105/354 [09:26<22:18,  5.38s/it][A

	loss_cls: tensor(0.5063, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3701, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8764, device='cuda:0', grad_fn=<AddBackward0>)



 30%|██▉       | 106/354 [09:32<22:08,  5.36s/it][A

	loss_cls: tensor(0.4380, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0410, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4790, device='cuda:0', grad_fn=<AddBackward0>)



 30%|███       | 107/354 [09:37<22:07,  5.37s/it][A

	loss_cls: tensor(0.6256, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2002, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8258, device='cuda:0', grad_fn=<AddBackward0>)



 31%|███       | 108/354 [09:42<21:57,  5.36s/it][A

	loss_cls: tensor(0.4620, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0341, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4961, device='cuda:0', grad_fn=<AddBackward0>)



 31%|███       | 109/354 [09:48<21:49,  5.35s/it][A

	loss_cls: tensor(0.6680, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3138, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9818, device='cuda:0', grad_fn=<AddBackward0>)



 31%|███       | 110/354 [09:53<21:46,  5.35s/it][A

	loss_cls: tensor(0.4591, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1316, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5906, device='cuda:0', grad_fn=<AddBackward0>)



 31%|███▏      | 111/354 [09:58<21:38,  5.35s/it][A

	loss_cls: tensor(0.7242, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3215, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0457, device='cuda:0', grad_fn=<AddBackward0>)



 32%|███▏      | 112/354 [10:04<21:36,  5.36s/it][A

	loss_cls: tensor(0.3331, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0328, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.3659, device='cuda:0', grad_fn=<AddBackward0>)



 32%|███▏      | 113/354 [10:09<21:28,  5.35s/it][A

	loss_cls: tensor(0.5924, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1899, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7822, device='cuda:0', grad_fn=<AddBackward0>)



 32%|███▏      | 114/354 [10:14<21:27,  5.36s/it][A

	loss_cls: tensor(0.4863, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0798, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5661, device='cuda:0', grad_fn=<AddBackward0>)



 32%|███▏      | 115/354 [10:20<21:20,  5.36s/it][A

	loss_cls: tensor(0.5456, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1432, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6888, device='cuda:0', grad_fn=<AddBackward0>)



 33%|███▎      | 116/354 [10:25<21:13,  5.35s/it][A

	loss_cls: tensor(0.2856, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2989, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5845, device='cuda:0', grad_fn=<AddBackward0>)



 33%|███▎      | 117/354 [10:31<21:16,  5.39s/it][A

	loss_cls: tensor(0.4459, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2512, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6971, device='cuda:0', grad_fn=<AddBackward0>)



 33%|███▎      | 118/354 [10:36<21:15,  5.41s/it][A

	loss_cls: tensor(0.5365, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1321, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6685, device='cuda:0', grad_fn=<AddBackward0>)



 34%|███▎      | 119/354 [10:42<21:21,  5.45s/it][A

	loss_cls: tensor(0.4310, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2298, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6608, device='cuda:0', grad_fn=<AddBackward0>)



 34%|███▍      | 120/354 [10:47<21:01,  5.39s/it][A

	loss_cls: tensor(0.4603, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2573, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7176, device='cuda:0', grad_fn=<AddBackward0>)



 34%|███▍      | 121/354 [10:52<20:52,  5.38s/it][A

	loss_cls: tensor(0.5178, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2515, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7693, device='cuda:0', grad_fn=<AddBackward0>)



 34%|███▍      | 122/354 [10:58<20:53,  5.40s/it][A

	loss_cls: tensor(0.5189, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1239, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6428, device='cuda:0', grad_fn=<AddBackward0>)



 35%|███▍      | 123/354 [11:03<20:50,  5.41s/it][A

	loss_cls: tensor(0.5822, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2239, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8061, device='cuda:0', grad_fn=<AddBackward0>)



 35%|███▌      | 124/354 [11:09<20:53,  5.45s/it][A

	loss_cls: tensor(0.6497, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1500, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7997, device='cuda:0', grad_fn=<AddBackward0>)



 35%|███▌      | 125/354 [11:14<20:49,  5.45s/it][A

	loss_cls: tensor(0.6505, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5143, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1648, device='cuda:0', grad_fn=<AddBackward0>)



 36%|███▌      | 126/354 [11:20<20:49,  5.48s/it][A

	loss_cls: tensor(0.6494, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2827, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9321, device='cuda:0', grad_fn=<AddBackward0>)



 36%|███▌      | 127/354 [11:25<20:42,  5.47s/it][A

	loss_cls: tensor(0.7615, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1907, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9521, device='cuda:0', grad_fn=<AddBackward0>)



 36%|███▌      | 128/354 [11:31<20:41,  5.49s/it][A

	loss_cls: tensor(0.3077, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0464, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.3541, device='cuda:0', grad_fn=<AddBackward0>)



 36%|███▋      | 129/354 [11:36<20:33,  5.48s/it][A

	loss_cls: tensor(0.6756, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2802, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9558, device='cuda:0', grad_fn=<AddBackward0>)



 37%|███▋      | 130/354 [11:42<20:24,  5.47s/it][A

	loss_cls: tensor(0.4282, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2296, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6578, device='cuda:0', grad_fn=<AddBackward0>)



 37%|███▋      | 131/354 [11:47<20:25,  5.50s/it][A

	loss_cls: tensor(0.7734, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3430, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1164, device='cuda:0', grad_fn=<AddBackward0>)



 37%|███▋      | 132/354 [11:53<20:17,  5.48s/it][A

	loss_cls: tensor(0.5221, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2393, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7613, device='cuda:0', grad_fn=<AddBackward0>)



 38%|███▊      | 133/354 [11:58<20:15,  5.50s/it][A

	loss_cls: tensor(1.0915, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1697, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2612, device='cuda:0', grad_fn=<AddBackward0>)



 38%|███▊      | 134/354 [12:04<20:07,  5.49s/it][A

	loss_cls: tensor(0.4860, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3672, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8533, device='cuda:0', grad_fn=<AddBackward0>)



 38%|███▊      | 135/354 [12:09<20:06,  5.51s/it][A

	loss_cls: tensor(0.3467, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1781, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5248, device='cuda:0', grad_fn=<AddBackward0>)



 38%|███▊      | 136/354 [12:15<19:57,  5.49s/it][A

	loss_cls: tensor(0.5062, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1189, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6251, device='cuda:0', grad_fn=<AddBackward0>)



 39%|███▊      | 137/354 [12:20<19:52,  5.49s/it][A

	loss_cls: tensor(0.6856, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0730, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7586, device='cuda:0', grad_fn=<AddBackward0>)



 39%|███▉      | 138/354 [12:26<19:45,  5.49s/it][A

	loss_cls: tensor(0.4186, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4471, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8657, device='cuda:0', grad_fn=<AddBackward0>)



 39%|███▉      | 139/354 [12:31<19:36,  5.47s/it][A

	loss_cls: tensor(0.5488, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2870, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8357, device='cuda:0', grad_fn=<AddBackward0>)



 40%|███▉      | 140/354 [12:36<19:32,  5.48s/it][A

	loss_cls: tensor(0.4312, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1840, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6151, device='cuda:0', grad_fn=<AddBackward0>)



 40%|███▉      | 141/354 [12:42<19:28,  5.48s/it][A

	loss_cls: tensor(0.4775, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0912, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5687, device='cuda:0', grad_fn=<AddBackward0>)



 40%|████      | 142/354 [12:47<19:25,  5.50s/it][A

	loss_cls: tensor(0.7755, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3053, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0808, device='cuda:0', grad_fn=<AddBackward0>)



 40%|████      | 143/354 [12:53<19:18,  5.49s/it][A

	loss_cls: tensor(0.5274, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2996, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8270, device='cuda:0', grad_fn=<AddBackward0>)



 41%|████      | 144/354 [12:58<19:15,  5.50s/it][A

	loss_cls: tensor(0.6467, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1805, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8272, device='cuda:0', grad_fn=<AddBackward0>)



 41%|████      | 145/354 [13:04<19:07,  5.49s/it][A

	loss_cls: tensor(0.5131, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2670, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7801, device='cuda:0', grad_fn=<AddBackward0>)



 41%|████      | 146/354 [13:09<19:01,  5.49s/it][A

	loss_cls: tensor(0.7658, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1225, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8883, device='cuda:0', grad_fn=<AddBackward0>)



 42%|████▏     | 147/354 [13:15<18:59,  5.50s/it][A

	loss_cls: tensor(0.5925, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3269, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9194, device='cuda:0', grad_fn=<AddBackward0>)



 42%|████▏     | 148/354 [13:20<18:50,  5.49s/it][A

	loss_cls: tensor(0.6618, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4816, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1433, device='cuda:0', grad_fn=<AddBackward0>)



 42%|████▏     | 149/354 [13:26<18:47,  5.50s/it][A

	loss_cls: tensor(0.6381, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3204, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9584, device='cuda:0', grad_fn=<AddBackward0>)



 42%|████▏     | 150/354 [13:31<18:37,  5.48s/it][A

	loss_cls: tensor(0.3715, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0938, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4653, device='cuda:0', grad_fn=<AddBackward0>)



 43%|████▎     | 151/354 [13:37<18:36,  5.50s/it][A

	loss_cls: tensor(0.6519, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2521, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9040, device='cuda:0', grad_fn=<AddBackward0>)



 43%|████▎     | 152/354 [13:42<18:28,  5.49s/it][A

	loss_cls: tensor(0.4274, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2916, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7190, device='cuda:0', grad_fn=<AddBackward0>)



 43%|████▎     | 153/354 [13:48<18:21,  5.48s/it][A

	loss_cls: tensor(0.7131, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0588, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7720, device='cuda:0', grad_fn=<AddBackward0>)



 44%|████▎     | 154/354 [13:53<18:18,  5.49s/it][A

	loss_cls: tensor(0.5357, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1430, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6786, device='cuda:0', grad_fn=<AddBackward0>)



 44%|████▍     | 155/354 [13:59<18:10,  5.48s/it][A

	loss_cls: tensor(0.4490, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1728, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6218, device='cuda:0', grad_fn=<AddBackward0>)



 44%|████▍     | 156/354 [14:04<18:09,  5.50s/it][A

	loss_cls: tensor(1.4779, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3450, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.8229, device='cuda:0', grad_fn=<AddBackward0>)



 44%|████▍     | 157/354 [14:10<18:01,  5.49s/it][A

	loss_cls: tensor(0.5918, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1901, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7819, device='cuda:0', grad_fn=<AddBackward0>)



 45%|████▍     | 158/354 [14:15<17:58,  5.50s/it][A

	loss_cls: tensor(0.5221, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1467, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6688, device='cuda:0', grad_fn=<AddBackward0>)



 45%|████▍     | 159/354 [14:21<17:53,  5.51s/it][A

	loss_cls: tensor(0.3797, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3300, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7097, device='cuda:0', grad_fn=<AddBackward0>)



 45%|████▌     | 160/354 [14:26<17:45,  5.49s/it][A

	loss_cls: tensor(0.5852, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2129, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7981, device='cuda:0', grad_fn=<AddBackward0>)



 45%|████▌     | 161/354 [14:32<17:42,  5.51s/it][A

	loss_cls: tensor(0.4859, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0918, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5777, device='cuda:0', grad_fn=<AddBackward0>)



 46%|████▌     | 162/354 [14:37<17:31,  5.47s/it][A

	loss_cls: tensor(1.6924, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0578, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.7503, device='cuda:0', grad_fn=<AddBackward0>)



 46%|████▌     | 163/354 [14:43<17:28,  5.49s/it][A

	loss_cls: tensor(0.8890, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0876, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9766, device='cuda:0', grad_fn=<AddBackward0>)



 46%|████▋     | 164/354 [14:48<17:20,  5.48s/it][A

	loss_cls: tensor(0.6217, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2369, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8586, device='cuda:0', grad_fn=<AddBackward0>)



 47%|████▋     | 165/354 [14:54<17:19,  5.50s/it][A

	loss_cls: tensor(0.7355, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2730, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0085, device='cuda:0', grad_fn=<AddBackward0>)



 47%|████▋     | 166/354 [14:59<17:11,  5.49s/it][A

	loss_cls: tensor(0.4469, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1580, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6050, device='cuda:0', grad_fn=<AddBackward0>)



 47%|████▋     | 167/354 [15:05<17:05,  5.48s/it][A

	loss_cls: tensor(0.5941, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0306, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6247, device='cuda:0', grad_fn=<AddBackward0>)



 47%|████▋     | 168/354 [15:10<17:03,  5.50s/it][A

	loss_cls: tensor(0.3322, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4495, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7816, device='cuda:0', grad_fn=<AddBackward0>)



 48%|████▊     | 169/354 [15:16<16:47,  5.44s/it][A

	loss_cls: tensor(0.8412, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1506, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9918, device='cuda:0', grad_fn=<AddBackward0>)



 48%|████▊     | 170/354 [15:21<16:38,  5.43s/it][A

	loss_cls: tensor(0.5523, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2862, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8386, device='cuda:0', grad_fn=<AddBackward0>)



 48%|████▊     | 171/354 [15:26<16:27,  5.40s/it][A

	loss_cls: tensor(0.6122, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2205, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8328, device='cuda:0', grad_fn=<AddBackward0>)



 49%|████▊     | 172/354 [15:32<16:23,  5.41s/it][A

	loss_cls: tensor(0.6703, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2001, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8704, device='cuda:0', grad_fn=<AddBackward0>)



 49%|████▉     | 173/354 [15:37<16:14,  5.39s/it][A

	loss_cls: tensor(0.3265, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2318, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5582, device='cuda:0', grad_fn=<AddBackward0>)



 49%|████▉     | 174/354 [15:42<16:07,  5.38s/it][A

	loss_cls: tensor(0.4941, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2146, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7087, device='cuda:0', grad_fn=<AddBackward0>)



 49%|████▉     | 175/354 [15:48<16:03,  5.39s/it][A

	loss_cls: tensor(0.4792, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0367, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5159, device='cuda:0', grad_fn=<AddBackward0>)



 50%|████▉     | 176/354 [15:53<15:55,  5.37s/it][A

	loss_cls: tensor(0.6595, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1447, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8042, device='cuda:0', grad_fn=<AddBackward0>)



 50%|█████     | 177/354 [15:59<15:54,  5.39s/it][A

	loss_cls: tensor(0.6463, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3893, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0356, device='cuda:0', grad_fn=<AddBackward0>)



 50%|█████     | 178/354 [16:04<15:47,  5.39s/it][A

	loss_cls: tensor(0.8733, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5000, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.3733, device='cuda:0', grad_fn=<AddBackward0>)



 51%|█████     | 179/354 [16:09<15:43,  5.39s/it][A

	loss_cls: tensor(0.9913, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2893, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2806, device='cuda:0', grad_fn=<AddBackward0>)



 51%|█████     | 180/354 [16:15<15:36,  5.38s/it][A

	loss_cls: tensor(0.8415, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2251, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0666, device='cuda:0', grad_fn=<AddBackward0>)



 51%|█████     | 181/354 [16:20<15:29,  5.37s/it][A

	loss_cls: tensor(0.4820, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2562, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7382, device='cuda:0', grad_fn=<AddBackward0>)



 51%|█████▏    | 182/354 [16:26<15:25,  5.38s/it][A

	loss_cls: tensor(0.4594, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0746, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5341, device='cuda:0', grad_fn=<AddBackward0>)



 52%|█████▏    | 183/354 [16:31<15:17,  5.37s/it][A

	loss_cls: tensor(0.5798, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1699, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7497, device='cuda:0', grad_fn=<AddBackward0>)



 52%|█████▏    | 184/354 [16:36<15:14,  5.38s/it][A

	loss_cls: tensor(0.5614, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1367, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6981, device='cuda:0', grad_fn=<AddBackward0>)



 52%|█████▏    | 185/354 [16:42<15:08,  5.38s/it][A

	loss_cls: tensor(0.6192, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2382, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8573, device='cuda:0', grad_fn=<AddBackward0>)



 53%|█████▎    | 186/354 [16:47<15:05,  5.39s/it][A

	loss_cls: tensor(0.6082, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1625, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7707, device='cuda:0', grad_fn=<AddBackward0>)



 53%|█████▎    | 187/354 [16:52<14:55,  5.36s/it][A

	loss_cls: tensor(0.5124, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1912, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7036, device='cuda:0', grad_fn=<AddBackward0>)



 53%|█████▎    | 188/354 [16:58<14:49,  5.36s/it][A

	loss_cls: tensor(0.6755, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2519, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9273, device='cuda:0', grad_fn=<AddBackward0>)



 53%|█████▎    | 189/354 [17:03<14:48,  5.38s/it][A

	loss_cls: tensor(0.7488, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2230, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9718, device='cuda:0', grad_fn=<AddBackward0>)



 54%|█████▎    | 190/354 [17:08<14:38,  5.36s/it][A

	loss_cls: tensor(0.4303, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1281, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5584, device='cuda:0', grad_fn=<AddBackward0>)



 54%|█████▍    | 191/354 [17:14<14:37,  5.38s/it][A

	loss_cls: tensor(0.6254, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3415, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9669, device='cuda:0', grad_fn=<AddBackward0>)



 54%|█████▍    | 192/354 [17:19<14:30,  5.38s/it][A

	loss_cls: tensor(0.5159, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2825, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7984, device='cuda:0', grad_fn=<AddBackward0>)



 55%|█████▍    | 193/354 [17:25<14:27,  5.39s/it][A

	loss_cls: tensor(0.7109, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0825, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7934, device='cuda:0', grad_fn=<AddBackward0>)



 55%|█████▍    | 194/354 [17:30<14:20,  5.38s/it][A

	loss_cls: tensor(0.5072, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0669, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5741, device='cuda:0', grad_fn=<AddBackward0>)



 55%|█████▌    | 195/354 [17:35<14:14,  5.37s/it][A

	loss_cls: tensor(0.6299, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.6491, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2790, device='cuda:0', grad_fn=<AddBackward0>)



 55%|█████▌    | 196/354 [17:41<14:09,  5.38s/it][A

	loss_cls: tensor(0.6452, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3233, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9685, device='cuda:0', grad_fn=<AddBackward0>)



 56%|█████▌    | 197/354 [17:46<14:02,  5.36s/it][A

	loss_cls: tensor(0.5113, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1968, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7081, device='cuda:0', grad_fn=<AddBackward0>)



 56%|█████▌    | 198/354 [17:52<14:00,  5.39s/it][A

	loss_cls: tensor(0.4994, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4482, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9475, device='cuda:0', grad_fn=<AddBackward0>)



 56%|█████▌    | 199/354 [17:57<13:53,  5.38s/it][A

	loss_cls: tensor(0.7395, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0754, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8149, device='cuda:0', grad_fn=<AddBackward0>)



 56%|█████▋    | 200/354 [18:02<13:51,  5.40s/it][A

	loss_cls: tensor(0.4909, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2979, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7888, device='cuda:0', grad_fn=<AddBackward0>)



 57%|█████▋    | 201/354 [18:08<13:44,  5.39s/it][A

	loss_cls: tensor(0.4347, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3796, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8144, device='cuda:0', grad_fn=<AddBackward0>)



 57%|█████▋    | 202/354 [18:13<13:37,  5.38s/it][A

	loss_cls: tensor(0.3733, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2602, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6334, device='cuda:0', grad_fn=<AddBackward0>)



 57%|█████▋    | 203/354 [18:18<13:34,  5.39s/it][A

	loss_cls: tensor(0.7437, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0921, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8358, device='cuda:0', grad_fn=<AddBackward0>)



 58%|█████▊    | 204/354 [18:24<13:27,  5.39s/it][A

	loss_cls: tensor(0.4983, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1243, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6227, device='cuda:0', grad_fn=<AddBackward0>)



 58%|█████▊    | 205/354 [18:29<13:27,  5.42s/it][A

	loss_cls: tensor(0.5267, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3620, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8888, device='cuda:0', grad_fn=<AddBackward0>)



 58%|█████▊    | 206/354 [18:35<13:19,  5.40s/it][A

	loss_cls: tensor(0.5266, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1840, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7106, device='cuda:0', grad_fn=<AddBackward0>)



 58%|█████▊    | 207/354 [18:40<13:16,  5.42s/it][A

	loss_cls: tensor(0.6763, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0828, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7591, device='cuda:0', grad_fn=<AddBackward0>)



 59%|█████▉    | 208/354 [18:45<13:07,  5.40s/it][A

	loss_cls: tensor(0.5374, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1761, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7135, device='cuda:0', grad_fn=<AddBackward0>)



 59%|█████▉    | 209/354 [18:51<13:00,  5.38s/it][A

	loss_cls: tensor(0.4942, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0463, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5404, device='cuda:0', grad_fn=<AddBackward0>)



 59%|█████▉    | 210/354 [18:56<12:58,  5.40s/it][A

	loss_cls: tensor(0.8497, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3591, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2088, device='cuda:0', grad_fn=<AddBackward0>)



 60%|█████▉    | 211/354 [19:02<12:51,  5.39s/it][A

	loss_cls: tensor(0.4399, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3482, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7881, device='cuda:0', grad_fn=<AddBackward0>)



 60%|█████▉    | 212/354 [19:07<12:47,  5.40s/it][A

	loss_cls: tensor(0.5781, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2330, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8112, device='cuda:0', grad_fn=<AddBackward0>)



 60%|██████    | 213/354 [19:12<12:39,  5.39s/it][A

	loss_cls: tensor(0.5109, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0635, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5743, device='cuda:0', grad_fn=<AddBackward0>)



 60%|██████    | 214/354 [19:18<12:36,  5.40s/it][A

	loss_cls: tensor(1.0197, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4241, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.4438, device='cuda:0', grad_fn=<AddBackward0>)



 61%|██████    | 215/354 [19:23<12:29,  5.39s/it][A

	loss_cls: tensor(0.6611, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3960, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0571, device='cuda:0', grad_fn=<AddBackward0>)



 61%|██████    | 216/354 [19:29<12:23,  5.39s/it][A

	loss_cls: tensor(0.6366, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0462, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6827, device='cuda:0', grad_fn=<AddBackward0>)



 61%|██████▏   | 217/354 [19:34<12:20,  5.40s/it][A

	loss_cls: tensor(0.7373, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1147, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8520, device='cuda:0', grad_fn=<AddBackward0>)



 62%|██████▏   | 218/354 [19:39<12:14,  5.40s/it][A

	loss_cls: tensor(0.5761, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3283, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9044, device='cuda:0', grad_fn=<AddBackward0>)



 62%|██████▏   | 219/354 [19:45<12:10,  5.41s/it][A

	loss_cls: tensor(0.5780, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3937, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9718, device='cuda:0', grad_fn=<AddBackward0>)



 62%|██████▏   | 220/354 [19:50<12:04,  5.40s/it][A

	loss_cls: tensor(0.4112, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0889, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5001, device='cuda:0', grad_fn=<AddBackward0>)



 62%|██████▏   | 221/354 [19:56<12:00,  5.42s/it][A

	loss_cls: tensor(0.3285, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0432, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.3718, device='cuda:0', grad_fn=<AddBackward0>)



 63%|██████▎   | 222/354 [20:01<11:51,  5.39s/it][A

	loss_cls: tensor(0.4866, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2544, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7410, device='cuda:0', grad_fn=<AddBackward0>)



 63%|██████▎   | 223/354 [20:06<11:44,  5.38s/it][A

	loss_cls: tensor(0.5278, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0945, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6223, device='cuda:0', grad_fn=<AddBackward0>)



 63%|██████▎   | 224/354 [20:12<11:42,  5.40s/it][A

	loss_cls: tensor(0.8428, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1548, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9976, device='cuda:0', grad_fn=<AddBackward0>)



 64%|██████▎   | 225/354 [20:17<11:35,  5.39s/it][A

	loss_cls: tensor(0.3842, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0758, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4600, device='cuda:0', grad_fn=<AddBackward0>)



 64%|██████▍   | 226/354 [20:23<11:32,  5.41s/it][A

	loss_cls: tensor(0.3755, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2536, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6290, device='cuda:0', grad_fn=<AddBackward0>)



 64%|██████▍   | 227/354 [20:28<11:25,  5.40s/it][A

	loss_cls: tensor(0.4674, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3188, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7863, device='cuda:0', grad_fn=<AddBackward0>)



 64%|██████▍   | 228/354 [20:34<11:22,  5.42s/it][A

	loss_cls: tensor(0.4410, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1810, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6221, device='cuda:0', grad_fn=<AddBackward0>)



 65%|██████▍   | 229/354 [20:39<11:15,  5.40s/it][A

	loss_cls: tensor(1.2674, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2463, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.5136, device='cuda:0', grad_fn=<AddBackward0>)



 65%|██████▍   | 230/354 [20:44<11:08,  5.39s/it][A

	loss_cls: tensor(0.8190, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4916, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.3107, device='cuda:0', grad_fn=<AddBackward0>)



 65%|██████▌   | 231/354 [20:50<11:06,  5.42s/it][A

	loss_cls: tensor(0.4461, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3010, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7471, device='cuda:0', grad_fn=<AddBackward0>)



 66%|██████▌   | 232/354 [20:55<11:00,  5.41s/it][A

	loss_cls: tensor(1.4176, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2809, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.6985, device='cuda:0', grad_fn=<AddBackward0>)



 66%|██████▌   | 233/354 [21:01<10:57,  5.44s/it][A

	loss_cls: tensor(0.8440, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2019, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0460, device='cuda:0', grad_fn=<AddBackward0>)



 66%|██████▌   | 234/354 [21:06<10:50,  5.42s/it][A

	loss_cls: tensor(0.6010, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1967, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7977, device='cuda:0', grad_fn=<AddBackward0>)



 66%|██████▋   | 235/354 [21:11<10:46,  5.43s/it][A

	loss_cls: tensor(0.2505, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2494, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4998, device='cuda:0', grad_fn=<AddBackward0>)



 67%|██████▋   | 236/354 [21:17<10:38,  5.41s/it][A

	loss_cls: tensor(0.5375, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5828, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1203, device='cuda:0', grad_fn=<AddBackward0>)



 67%|██████▋   | 237/354 [21:22<10:34,  5.42s/it][A

	loss_cls: tensor(0.7692, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2135, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9827, device='cuda:0', grad_fn=<AddBackward0>)



 67%|██████▋   | 238/354 [21:28<10:28,  5.42s/it][A

	loss_cls: tensor(0.5170, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0952, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6122, device='cuda:0', grad_fn=<AddBackward0>)



 68%|██████▊   | 239/354 [21:33<10:22,  5.41s/it][A

	loss_cls: tensor(0.5703, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2370, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8074, device='cuda:0', grad_fn=<AddBackward0>)



 68%|██████▊   | 240/354 [21:39<10:18,  5.43s/it][A

	loss_cls: tensor(0.5436, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2648, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8084, device='cuda:0', grad_fn=<AddBackward0>)



 68%|██████▊   | 241/354 [21:44<10:11,  5.41s/it][A

	loss_cls: tensor(0.5756, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2142, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7898, device='cuda:0', grad_fn=<AddBackward0>)



 68%|██████▊   | 242/354 [21:49<10:07,  5.42s/it][A

	loss_cls: tensor(0.4492, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3718, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8210, device='cuda:0', grad_fn=<AddBackward0>)



 69%|██████▊   | 243/354 [21:55<10:00,  5.41s/it][A

	loss_cls: tensor(0.6392, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4369, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0761, device='cuda:0', grad_fn=<AddBackward0>)



 69%|██████▉   | 244/354 [22:00<09:56,  5.42s/it][A

	loss_cls: tensor(0.4483, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4258, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8740, device='cuda:0', grad_fn=<AddBackward0>)



 69%|██████▉   | 245/354 [22:06<09:49,  5.40s/it][A

	loss_cls: tensor(0.7096, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2870, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9966, device='cuda:0', grad_fn=<AddBackward0>)



 69%|██████▉   | 246/354 [22:11<09:42,  5.39s/it][A

	loss_cls: tensor(0.8238, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2257, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0495, device='cuda:0', grad_fn=<AddBackward0>)



 70%|██████▉   | 247/354 [22:16<09:38,  5.41s/it][A

	loss_cls: tensor(0.5079, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2508, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7586, device='cuda:0', grad_fn=<AddBackward0>)



 70%|███████   | 248/354 [22:22<09:31,  5.40s/it][A

	loss_cls: tensor(0.9050, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0740, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9791, device='cuda:0', grad_fn=<AddBackward0>)



 70%|███████   | 249/354 [22:27<09:27,  5.41s/it][A

	loss_cls: tensor(0.8004, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1302, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9306, device='cuda:0', grad_fn=<AddBackward0>)



 71%|███████   | 250/354 [22:33<09:20,  5.39s/it][A

	loss_cls: tensor(0.5663, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1500, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7162, device='cuda:0', grad_fn=<AddBackward0>)



 71%|███████   | 251/354 [22:38<09:17,  5.42s/it][A

	loss_cls: tensor(0.6230, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2551, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8782, device='cuda:0', grad_fn=<AddBackward0>)



 71%|███████   | 252/354 [22:43<09:11,  5.41s/it][A

	loss_cls: tensor(0.6373, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3139, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9512, device='cuda:0', grad_fn=<AddBackward0>)



 71%|███████▏  | 253/354 [22:49<09:05,  5.40s/it][A

	loss_cls: tensor(0.7243, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3510, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0754, device='cuda:0', grad_fn=<AddBackward0>)



 72%|███████▏  | 254/354 [22:54<09:00,  5.41s/it][A

	loss_cls: tensor(0.5711, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1054, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6764, device='cuda:0', grad_fn=<AddBackward0>)



 72%|███████▏  | 255/354 [23:00<08:54,  5.40s/it][A

	loss_cls: tensor(0.5578, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1208, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6786, device='cuda:0', grad_fn=<AddBackward0>)



 72%|███████▏  | 256/354 [23:05<08:51,  5.42s/it][A

	loss_cls: tensor(0.6142, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2628, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8770, device='cuda:0', grad_fn=<AddBackward0>)



 73%|███████▎  | 257/354 [23:10<08:45,  5.41s/it][A

	loss_cls: tensor(0.6100, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1973, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8073, device='cuda:0', grad_fn=<AddBackward0>)



 73%|███████▎  | 258/354 [23:16<08:42,  5.44s/it][A

	loss_cls: tensor(0.6001, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0388, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6389, device='cuda:0', grad_fn=<AddBackward0>)



 73%|███████▎  | 259/354 [23:21<08:35,  5.43s/it][A

	loss_cls: tensor(0.6710, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2831, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9541, device='cuda:0', grad_fn=<AddBackward0>)



 73%|███████▎  | 260/354 [23:27<08:29,  5.42s/it][A

	loss_cls: tensor(0.6118, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4390, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0508, device='cuda:0', grad_fn=<AddBackward0>)



 74%|███████▎  | 261/354 [23:32<08:25,  5.43s/it][A

	loss_cls: tensor(0.5573, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0732, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6305, device='cuda:0', grad_fn=<AddBackward0>)



 74%|███████▍  | 262/354 [23:38<08:17,  5.41s/it][A

	loss_cls: tensor(0.8778, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2906, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1684, device='cuda:0', grad_fn=<AddBackward0>)



 74%|███████▍  | 263/354 [23:43<08:14,  5.43s/it][A

	loss_cls: tensor(0.4887, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1466, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6353, device='cuda:0', grad_fn=<AddBackward0>)



 75%|███████▍  | 264/354 [23:48<08:07,  5.42s/it][A

	loss_cls: tensor(0.5812, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2044, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7856, device='cuda:0', grad_fn=<AddBackward0>)



 75%|███████▍  | 265/354 [23:54<08:04,  5.44s/it][A

	loss_cls: tensor(0.8973, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2679, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1652, device='cuda:0', grad_fn=<AddBackward0>)



 75%|███████▌  | 266/354 [23:59<07:58,  5.44s/it][A

	loss_cls: tensor(0.7441, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1594, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9035, device='cuda:0', grad_fn=<AddBackward0>)



 75%|███████▌  | 267/354 [24:05<07:51,  5.42s/it][A

	loss_cls: tensor(0.6207, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1513, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7720, device='cuda:0', grad_fn=<AddBackward0>)



 76%|███████▌  | 268/354 [24:10<07:47,  5.43s/it][A

	loss_cls: tensor(0.6193, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3863, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0056, device='cuda:0', grad_fn=<AddBackward0>)



 76%|███████▌  | 269/354 [24:16<07:40,  5.42s/it][A

	loss_cls: tensor(0.9067, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2567, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1634, device='cuda:0', grad_fn=<AddBackward0>)



 76%|███████▋  | 270/354 [24:21<07:35,  5.42s/it][A

	loss_cls: tensor(0.5816, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0660, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6477, device='cuda:0', grad_fn=<AddBackward0>)



 77%|███████▋  | 271/354 [24:26<07:28,  5.41s/it][A

	loss_cls: tensor(0.5727, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5055, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0783, device='cuda:0', grad_fn=<AddBackward0>)



 77%|███████▋  | 272/354 [24:32<07:25,  5.43s/it][A

	loss_cls: tensor(0.5266, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3881, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9147, device='cuda:0', grad_fn=<AddBackward0>)



 77%|███████▋  | 273/354 [24:37<07:18,  5.42s/it][A

	loss_cls: tensor(0.4632, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1494, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6126, device='cuda:0', grad_fn=<AddBackward0>)



 77%|███████▋  | 274/354 [24:43<07:12,  5.41s/it][A

	loss_cls: tensor(0.4656, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4650, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9307, device='cuda:0', grad_fn=<AddBackward0>)



 78%|███████▊  | 275/354 [24:48<07:09,  5.43s/it][A

	loss_cls: tensor(0.5765, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1845, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7611, device='cuda:0', grad_fn=<AddBackward0>)



 78%|███████▊  | 276/354 [24:53<07:02,  5.41s/it][A

	loss_cls: tensor(0.4562, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1308, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5870, device='cuda:0', grad_fn=<AddBackward0>)



 78%|███████▊  | 277/354 [24:59<06:57,  5.42s/it][A

	loss_cls: tensor(0.6232, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4511, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0742, device='cuda:0', grad_fn=<AddBackward0>)



 79%|███████▊  | 278/354 [25:04<06:51,  5.42s/it][A

	loss_cls: tensor(0.6107, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1674, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7781, device='cuda:0', grad_fn=<AddBackward0>)



 79%|███████▉  | 279/354 [25:10<06:47,  5.44s/it][A

	loss_cls: tensor(0.5585, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2316, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7901, device='cuda:0', grad_fn=<AddBackward0>)



 79%|███████▉  | 280/354 [25:15<06:41,  5.43s/it][A

	loss_cls: tensor(0.4289, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2036, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6324, device='cuda:0', grad_fn=<AddBackward0>)



 79%|███████▉  | 281/354 [25:21<06:35,  5.42s/it][A

	loss_cls: tensor(0.5426, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0669, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6095, device='cuda:0', grad_fn=<AddBackward0>)



 80%|███████▉  | 282/354 [25:26<06:30,  5.43s/it][A

	loss_cls: tensor(0.7101, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2461, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9562, device='cuda:0', grad_fn=<AddBackward0>)



 80%|███████▉  | 283/354 [25:31<06:24,  5.41s/it][A

	loss_cls: tensor(0.4369, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1897, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6266, device='cuda:0', grad_fn=<AddBackward0>)



 80%|████████  | 284/354 [25:37<06:20,  5.43s/it][A

	loss_cls: tensor(0.7065, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1729, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8794, device='cuda:0', grad_fn=<AddBackward0>)



 81%|████████  | 285/354 [25:42<06:13,  5.41s/it][A

	loss_cls: tensor(0.6416, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5066, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1482, device='cuda:0', grad_fn=<AddBackward0>)



 81%|████████  | 286/354 [25:48<06:08,  5.42s/it][A

	loss_cls: tensor(0.5651, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1136, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6787, device='cuda:0', grad_fn=<AddBackward0>)



 81%|████████  | 287/354 [25:53<06:02,  5.40s/it][A

	loss_cls: tensor(0.7560, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1903, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9464, device='cuda:0', grad_fn=<AddBackward0>)



 81%|████████▏ | 288/354 [25:58<05:55,  5.39s/it][A

	loss_cls: tensor(0.7798, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4848, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2646, device='cuda:0', grad_fn=<AddBackward0>)



 82%|████████▏ | 289/354 [26:04<05:51,  5.40s/it][A

	loss_cls: tensor(0.6843, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1347, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8190, device='cuda:0', grad_fn=<AddBackward0>)



 82%|████████▏ | 290/354 [26:09<05:44,  5.38s/it][A

	loss_cls: tensor(0.4321, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2064, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6385, device='cuda:0', grad_fn=<AddBackward0>)



 82%|████████▏ | 291/354 [26:15<05:41,  5.42s/it][A

	loss_cls: tensor(0.8350, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2383, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0733, device='cuda:0', grad_fn=<AddBackward0>)



 82%|████████▏ | 292/354 [26:20<05:34,  5.39s/it][A

	loss_cls: tensor(0.5141, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1009, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6149, device='cuda:0', grad_fn=<AddBackward0>)



 83%|████████▎ | 293/354 [26:26<05:30,  5.42s/it][A

	loss_cls: tensor(0.5375, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3628, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9003, device='cuda:0', grad_fn=<AddBackward0>)



 83%|████████▎ | 294/354 [26:31<05:24,  5.40s/it][A

	loss_cls: tensor(0.5345, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3187, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8532, device='cuda:0', grad_fn=<AddBackward0>)



 83%|████████▎ | 295/354 [26:36<05:17,  5.38s/it][A

	loss_cls: tensor(0.6361, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2088, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8449, device='cuda:0', grad_fn=<AddBackward0>)



 84%|████████▎ | 296/354 [26:42<05:12,  5.39s/it][A

	loss_cls: tensor(0.5600, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3052, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8652, device='cuda:0', grad_fn=<AddBackward0>)



 84%|████████▍ | 297/354 [26:47<05:06,  5.37s/it][A

	loss_cls: tensor(0.5920, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3721, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9641, device='cuda:0', grad_fn=<AddBackward0>)



 84%|████████▍ | 298/354 [26:52<05:01,  5.39s/it][A

	loss_cls: tensor(0.5085, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1407, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6492, device='cuda:0', grad_fn=<AddBackward0>)



 84%|████████▍ | 299/354 [26:58<04:56,  5.38s/it][A

	loss_cls: tensor(0.6876, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1364, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8240, device='cuda:0', grad_fn=<AddBackward0>)



 85%|████████▍ | 300/354 [27:03<04:51,  5.39s/it][A

	loss_cls: tensor(0.6218, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1903, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8120, device='cuda:0', grad_fn=<AddBackward0>)



 85%|████████▌ | 301/354 [27:09<04:45,  5.38s/it][A

	loss_cls: tensor(0.5344, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3862, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9206, device='cuda:0', grad_fn=<AddBackward0>)



 85%|████████▌ | 302/354 [27:14<04:39,  5.37s/it][A

	loss_cls: tensor(0.6321, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1862, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8183, device='cuda:0', grad_fn=<AddBackward0>)



 86%|████████▌ | 303/354 [27:19<04:34,  5.38s/it][A

	loss_cls: tensor(0.5863, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1126, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6989, device='cuda:0', grad_fn=<AddBackward0>)



 86%|████████▌ | 304/354 [27:25<04:28,  5.37s/it][A

	loss_cls: tensor(1.1379, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2463, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.3842, device='cuda:0', grad_fn=<AddBackward0>)



 86%|████████▌ | 305/354 [27:30<04:24,  5.39s/it][A

	loss_cls: tensor(0.6852, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2414, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9266, device='cuda:0', grad_fn=<AddBackward0>)



 86%|████████▋ | 306/354 [27:35<04:18,  5.39s/it][A

	loss_cls: tensor(0.5785, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3104, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8889, device='cuda:0', grad_fn=<AddBackward0>)



 87%|████████▋ | 307/354 [27:41<04:13,  5.40s/it][A

	loss_cls: tensor(0.5238, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3275, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8514, device='cuda:0', grad_fn=<AddBackward0>)



 87%|████████▋ | 308/354 [27:46<04:07,  5.38s/it][A

	loss_cls: tensor(0.6535, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3494, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0029, device='cuda:0', grad_fn=<AddBackward0>)



 87%|████████▋ | 309/354 [27:52<04:01,  5.37s/it][A

	loss_cls: tensor(1.0486, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2007, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2493, device='cuda:0', grad_fn=<AddBackward0>)



 88%|████████▊ | 310/354 [27:57<03:57,  5.39s/it][A

	loss_cls: tensor(0.6197, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0998, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7195, device='cuda:0', grad_fn=<AddBackward0>)



 88%|████████▊ | 311/354 [28:02<03:51,  5.37s/it][A

	loss_cls: tensor(0.6871, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0549, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7420, device='cuda:0', grad_fn=<AddBackward0>)



 88%|████████▊ | 312/354 [28:08<03:46,  5.39s/it][A

	loss_cls: tensor(0.5376, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2413, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7790, device='cuda:0', grad_fn=<AddBackward0>)



 88%|████████▊ | 313/354 [28:13<03:40,  5.37s/it][A

	loss_cls: tensor(0.5334, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1094, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6428, device='cuda:0', grad_fn=<AddBackward0>)



 89%|████████▊ | 314/354 [28:19<03:35,  5.39s/it][A

	loss_cls: tensor(0.4335, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3814, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8149, device='cuda:0', grad_fn=<AddBackward0>)



 89%|████████▉ | 315/354 [28:24<03:29,  5.38s/it][A

	loss_cls: tensor(0.4995, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1520, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6514, device='cuda:0', grad_fn=<AddBackward0>)



 89%|████████▉ | 316/354 [28:29<03:23,  5.36s/it][A

	loss_cls: tensor(0.7416, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2998, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0414, device='cuda:0', grad_fn=<AddBackward0>)



 90%|████████▉ | 317/354 [28:35<03:18,  5.38s/it][A

	loss_cls: tensor(0.7692, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2221, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9914, device='cuda:0', grad_fn=<AddBackward0>)



 90%|████████▉ | 318/354 [28:40<03:13,  5.37s/it][A

	loss_cls: tensor(0.5306, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1711, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7018, device='cuda:0', grad_fn=<AddBackward0>)



 90%|█████████ | 319/354 [28:45<03:08,  5.39s/it][A

	loss_cls: tensor(0.7490, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3913, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1403, device='cuda:0', grad_fn=<AddBackward0>)



 90%|█████████ | 320/354 [28:51<03:02,  5.37s/it][A

	loss_cls: tensor(0.5803, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5127, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0929, device='cuda:0', grad_fn=<AddBackward0>)



 91%|█████████ | 321/354 [28:56<02:58,  5.40s/it][A

	loss_cls: tensor(0.5391, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2419, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7810, device='cuda:0', grad_fn=<AddBackward0>)



 91%|█████████ | 322/354 [29:02<02:52,  5.38s/it][A

	loss_cls: tensor(0.6120, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1808, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7928, device='cuda:0', grad_fn=<AddBackward0>)



 91%|█████████ | 323/354 [29:07<02:46,  5.37s/it][A

	loss_cls: tensor(0.6080, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0536, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6616, device='cuda:0', grad_fn=<AddBackward0>)



 92%|█████████▏| 324/354 [29:12<02:41,  5.38s/it][A

	loss_cls: tensor(0.3734, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1799, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5533, device='cuda:0', grad_fn=<AddBackward0>)



 92%|█████████▏| 325/354 [29:18<02:36,  5.38s/it][A

	loss_cls: tensor(0.5461, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1263, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6725, device='cuda:0', grad_fn=<AddBackward0>)



 92%|█████████▏| 326/354 [29:23<02:31,  5.41s/it][A

	loss_cls: tensor(0.6498, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1395, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7893, device='cuda:0', grad_fn=<AddBackward0>)



 92%|█████████▏| 327/354 [29:29<02:25,  5.39s/it][A

	loss_cls: tensor(0.4787, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2095, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6883, device='cuda:0', grad_fn=<AddBackward0>)



 93%|█████████▎| 328/354 [29:34<02:20,  5.42s/it][A

	loss_cls: tensor(0.9006, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1309, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0315, device='cuda:0', grad_fn=<AddBackward0>)



 93%|█████████▎| 329/354 [29:39<02:15,  5.40s/it][A

	loss_cls: tensor(0.4629, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2187, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6816, device='cuda:0', grad_fn=<AddBackward0>)



 93%|█████████▎| 330/354 [29:45<02:09,  5.39s/it][A

	loss_cls: tensor(0.4700, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0364, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5063, device='cuda:0', grad_fn=<AddBackward0>)



 94%|█████████▎| 331/354 [29:50<02:04,  5.42s/it][A

	loss_cls: tensor(0.5020, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1804, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6824, device='cuda:0', grad_fn=<AddBackward0>)



 94%|█████████▍| 332/354 [29:56<01:58,  5.41s/it][A

	loss_cls: tensor(0.6932, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2689, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9621, device='cuda:0', grad_fn=<AddBackward0>)



 94%|█████████▍| 333/354 [30:01<01:53,  5.43s/it][A

	loss_cls: tensor(0.4611, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1515, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6126, device='cuda:0', grad_fn=<AddBackward0>)



 94%|█████████▍| 334/354 [30:06<01:48,  5.41s/it][A

	loss_cls: tensor(0.8623, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0582, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9204, device='cuda:0', grad_fn=<AddBackward0>)



 95%|█████████▍| 335/354 [30:12<01:43,  5.42s/it][A

	loss_cls: tensor(0.5002, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2271, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7273, device='cuda:0', grad_fn=<AddBackward0>)



 95%|█████████▍| 336/354 [30:17<01:37,  5.41s/it][A

	loss_cls: tensor(0.6712, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0484, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7196, device='cuda:0', grad_fn=<AddBackward0>)



 95%|█████████▌| 337/354 [30:23<01:32,  5.43s/it][A

	loss_cls: tensor(0.5283, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1849, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7132, device='cuda:0', grad_fn=<AddBackward0>)



 95%|█████████▌| 338/354 [30:28<01:26,  5.43s/it][A

	loss_cls: tensor(0.7873, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1967, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9840, device='cuda:0', grad_fn=<AddBackward0>)



 96%|█████████▌| 339/354 [30:34<01:21,  5.42s/it][A

	loss_cls: tensor(0.6252, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3701, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9953, device='cuda:0', grad_fn=<AddBackward0>)



 96%|█████████▌| 340/354 [30:39<01:16,  5.43s/it][A

	loss_cls: tensor(0.7191, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3039, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0230, device='cuda:0', grad_fn=<AddBackward0>)



 96%|█████████▋| 341/354 [30:44<01:10,  5.42s/it][A

	loss_cls: tensor(0.7816, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2258, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0074, device='cuda:0', grad_fn=<AddBackward0>)



 97%|█████████▋| 342/354 [30:50<01:05,  5.43s/it][A

	loss_cls: tensor(0.5123, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1928, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7051, device='cuda:0', grad_fn=<AddBackward0>)



 97%|█████████▋| 343/354 [30:55<00:59,  5.42s/it][A

	loss_cls: tensor(0.4782, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0290, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5072, device='cuda:0', grad_fn=<AddBackward0>)



 97%|█████████▋| 344/354 [31:01<00:54,  5.44s/it][A

	loss_cls: tensor(0.4879, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0589, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5469, device='cuda:0', grad_fn=<AddBackward0>)



 97%|█████████▋| 345/354 [31:06<00:48,  5.43s/it][A

	loss_cls: tensor(0.7058, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1651, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8709, device='cuda:0', grad_fn=<AddBackward0>)



 98%|█████████▊| 346/354 [31:12<00:43,  5.41s/it][A

	loss_cls: tensor(0.6703, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2292, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8995, device='cuda:0', grad_fn=<AddBackward0>)



 98%|█████████▊| 347/354 [31:17<00:37,  5.43s/it][A

	loss_cls: tensor(0.5037, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4649, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9686, device='cuda:0', grad_fn=<AddBackward0>)



 98%|█████████▊| 348/354 [31:22<00:32,  5.42s/it][A

	loss_cls: tensor(0.8491, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3210, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1701, device='cuda:0', grad_fn=<AddBackward0>)



 99%|█████████▊| 349/354 [31:28<00:27,  5.44s/it][A

	loss_cls: tensor(0.7006, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1476, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8482, device='cuda:0', grad_fn=<AddBackward0>)



 99%|█████████▉| 350/354 [31:33<00:21,  5.42s/it][A

	loss_cls: tensor(0.6907, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2536, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9443, device='cuda:0', grad_fn=<AddBackward0>)



 99%|█████████▉| 351/354 [31:39<00:16,  5.44s/it][A

	loss_cls: tensor(0.4260, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2220, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6479, device='cuda:0', grad_fn=<AddBackward0>)



 99%|█████████▉| 352/354 [31:44<00:10,  5.43s/it][A

	loss_cls: tensor(0.4353, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1468, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5821, device='cuda:0', grad_fn=<AddBackward0>)



100%|█████████▉| 353/354 [31:50<00:05,  5.42s/it][A

	loss_cls: tensor(0.6120, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1940, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8060, device='cuda:0', grad_fn=<AddBackward0>)



100%|██████████| 354/354 [31:52<00:00,  5.40s/it][A

  0%|          | 0/177 [00:00<?, ?it/s][A

	loss_cls: tensor(0.3392, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0447, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.3840, device='cuda:0', grad_fn=<AddBackward0>)

	Training Loss: 0.8316294728531002

	Training cls acc: 0.6995645009416196

	Training cls prec: 0.5769023192752006

	Training cls rec: 0.6125087541930763

	Training cls f1: 0.5412407788512581

--
	Training ner acc: 0.9555432506174818

	Training ner prec: 0.27186580714574626

	Training ner rec: 0.28050388305657464

	Training ner f1: 0.27575615964763595

	Current Learning rate:  0.0004



  1%|          | 1/177 [00:00<02:08,  1.37it/s][A
  1%|          | 2/177 [00:01<01:59,  1.46it/s][A
  2%|▏         | 3/177 [00:02<02:04,  1.39it/s][A
  2%|▏         | 4/177 [00:02<02:05,  1.38it/s][A
  3%|▎         | 5/177 [00:03<02:00,  1.43it/s][A
  3%|▎         | 6/177 [00:04<02:01,  1.41it/s][A
  4%|▍         | 7/177 [00:05<02:02,  1.39it/s][A
  5%|▍         | 8/177 [00:05<02:02,  1.38it/s][A
  5%|▌         | 9/177 [00:06<01:57,  1.43it/s][A
  6%|▌         | 10/177 [00:07<01:58,  1.41it/s][A
  6%|▌         | 11/177 [00:07<01:59,  1.39it/s][A
  7%|▋         | 12/177 [00:08<01:55,  1.43it/s][A
  7%|▋         | 13/177 [00:09<01:55,  1.42it/s][A
  8%|▊         | 14/177 [00:09<01:56,  1.40it/s][A
  8%|▊         | 15/177 [00:10<01:56,  1.39it/s][A
  9%|▉         | 16/177 [00:11<01:53,  1.42it/s][A
 10%|▉         | 17/177 [00:12<01:53,  1.40it/s][A
 10%|█         | 18/177 [00:12<01:54,  1.39it/s][A
 11%|█         | 19/177 [00:13<01:54,  1.38it/s][A
 11%|█▏        | 20/


	Validation Loss: 0.7781760056140059

	Validation cls acc: 0.7798964218455745

	Validation cls prec: 0.6544626042507399

	Validation cls rec: 0.646835485606672

	Validation cls f1: 0.6266206016206016

--
	Validation ner acc: 0.9531655142008643

	Validation ner prec: 0.4117582576129999

	Validation ner rec: 0.42250470809792845

	Validation ner f1: 0.4169051436634744



  0%|          | 1/354 [00:05<32:05,  5.46s/it][A

	loss_cls: tensor(0.4526, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0539, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5064, device='cuda:0', grad_fn=<AddBackward0>)



  1%|          | 2/354 [00:10<31:35,  5.38s/it][A

	loss_cls: tensor(1.0615, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3013, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.3628, device='cuda:0', grad_fn=<AddBackward0>)



  1%|          | 3/354 [00:16<31:37,  5.41s/it][A

	loss_cls: tensor(0.4487, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2693, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7180, device='cuda:0', grad_fn=<AddBackward0>)



  1%|          | 4/354 [00:21<31:23,  5.38s/it][A

	loss_cls: tensor(0.8945, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2260, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1205, device='cuda:0', grad_fn=<AddBackward0>)



  1%|▏         | 5/354 [00:26<31:12,  5.36s/it][A

	loss_cls: tensor(0.3329, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0621, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.3950, device='cuda:0', grad_fn=<AddBackward0>)



  2%|▏         | 6/354 [00:32<31:11,  5.38s/it][A

	loss_cls: tensor(0.5192, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1452, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6644, device='cuda:0', grad_fn=<AddBackward0>)



  2%|▏         | 7/354 [00:37<31:05,  5.38s/it][A

	loss_cls: tensor(0.4523, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0394, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4917, device='cuda:0', grad_fn=<AddBackward0>)



  2%|▏         | 8/354 [00:43<31:05,  5.39s/it][A

	loss_cls: tensor(0.4811, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2283, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7093, device='cuda:0', grad_fn=<AddBackward0>)



  3%|▎         | 9/354 [00:48<30:53,  5.37s/it][A

	loss_cls: tensor(0.5155, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1554, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6709, device='cuda:0', grad_fn=<AddBackward0>)



  3%|▎         | 10/354 [00:53<30:52,  5.39s/it][A

	loss_cls: tensor(0.7640, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1137, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8777, device='cuda:0', grad_fn=<AddBackward0>)



  3%|▎         | 11/354 [00:59<30:41,  5.37s/it][A

	loss_cls: tensor(0.5531, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0809, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6340, device='cuda:0', grad_fn=<AddBackward0>)



  3%|▎         | 12/354 [01:04<30:30,  5.35s/it][A

	loss_cls: tensor(0.3910, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0774, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4683, device='cuda:0', grad_fn=<AddBackward0>)



  4%|▎         | 13/354 [01:09<30:28,  5.36s/it][A

	loss_cls: tensor(0.8245, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2261, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0506, device='cuda:0', grad_fn=<AddBackward0>)



  4%|▍         | 14/354 [01:15<30:19,  5.35s/it][A

	loss_cls: tensor(0.3541, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0259, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.3800, device='cuda:0', grad_fn=<AddBackward0>)



  4%|▍         | 15/354 [01:20<30:20,  5.37s/it][A

	loss_cls: tensor(0.3110, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0183, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.3292, device='cuda:0', grad_fn=<AddBackward0>)



  5%|▍         | 16/354 [01:25<30:10,  5.36s/it][A

	loss_cls: tensor(0.6234, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2502, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8736, device='cuda:0', grad_fn=<AddBackward0>)



  5%|▍         | 17/354 [01:31<30:12,  5.38s/it][A

	loss_cls: tensor(0.7198, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1685, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8883, device='cuda:0', grad_fn=<AddBackward0>)



  5%|▌         | 18/354 [01:36<30:02,  5.36s/it][A

	loss_cls: tensor(0.7019, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4604, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1623, device='cuda:0', grad_fn=<AddBackward0>)



  5%|▌         | 19/354 [01:42<29:51,  5.35s/it][A

	loss_cls: tensor(0.8009, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2873, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0882, device='cuda:0', grad_fn=<AddBackward0>)



  6%|▌         | 20/354 [01:47<29:52,  5.37s/it][A

	loss_cls: tensor(0.2946, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0575, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.3521, device='cuda:0', grad_fn=<AddBackward0>)



  6%|▌         | 21/354 [01:52<29:44,  5.36s/it][A

	loss_cls: tensor(0.7671, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2082, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9753, device='cuda:0', grad_fn=<AddBackward0>)



  6%|▌         | 22/354 [01:58<29:46,  5.38s/it][A

	loss_cls: tensor(0.7497, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3416, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0913, device='cuda:0', grad_fn=<AddBackward0>)



  6%|▋         | 23/354 [02:03<29:34,  5.36s/it][A

	loss_cls: tensor(0.9538, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2257, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1795, device='cuda:0', grad_fn=<AddBackward0>)



  7%|▋         | 24/354 [02:08<29:32,  5.37s/it][A

	loss_cls: tensor(0.5230, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3361, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8591, device='cuda:0', grad_fn=<AddBackward0>)



  7%|▋         | 25/354 [02:14<29:20,  5.35s/it][A

	loss_cls: tensor(0.8522, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2279, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0801, device='cuda:0', grad_fn=<AddBackward0>)



  7%|▋         | 26/354 [02:19<29:10,  5.34s/it][A

	loss_cls: tensor(0.7539, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.7113, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.4652, device='cuda:0', grad_fn=<AddBackward0>)



  8%|▊         | 27/354 [02:24<29:12,  5.36s/it][A

	loss_cls: tensor(0.7544, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2439, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9984, device='cuda:0', grad_fn=<AddBackward0>)



  8%|▊         | 28/354 [02:30<29:03,  5.35s/it][A

	loss_cls: tensor(0.6323, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1677, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8000, device='cuda:0', grad_fn=<AddBackward0>)



  8%|▊         | 29/354 [02:35<29:03,  5.37s/it][A

	loss_cls: tensor(0.5017, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0730, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5746, device='cuda:0', grad_fn=<AddBackward0>)



  8%|▊         | 30/354 [02:40<28:54,  5.35s/it][A

	loss_cls: tensor(0.5524, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1958, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7482, device='cuda:0', grad_fn=<AddBackward0>)



  9%|▉         | 31/354 [02:46<28:52,  5.36s/it][A

	loss_cls: tensor(0.7868, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1135, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9003, device='cuda:0', grad_fn=<AddBackward0>)



  9%|▉         | 32/354 [02:51<28:41,  5.35s/it][A

	loss_cls: tensor(0.6675, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1692, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8367, device='cuda:0', grad_fn=<AddBackward0>)



  9%|▉         | 33/354 [02:57<28:40,  5.36s/it][A

	loss_cls: tensor(0.6216, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2085, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8301, device='cuda:0', grad_fn=<AddBackward0>)



 10%|▉         | 34/354 [03:02<28:33,  5.36s/it][A

	loss_cls: tensor(0.8554, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3866, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2420, device='cuda:0', grad_fn=<AddBackward0>)



 10%|▉         | 35/354 [03:07<28:26,  5.35s/it][A

	loss_cls: tensor(0.6189, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3273, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9461, device='cuda:0', grad_fn=<AddBackward0>)



 10%|█         | 36/354 [03:13<28:26,  5.37s/it][A

	loss_cls: tensor(0.6224, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3882, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0106, device='cuda:0', grad_fn=<AddBackward0>)



 10%|█         | 37/354 [03:18<28:17,  5.35s/it][A

	loss_cls: tensor(0.5878, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1696, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7574, device='cuda:0', grad_fn=<AddBackward0>)



 11%|█         | 38/354 [03:23<28:14,  5.36s/it][A

	loss_cls: tensor(0.5985, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1689, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7674, device='cuda:0', grad_fn=<AddBackward0>)



 11%|█         | 39/354 [03:29<28:07,  5.36s/it][A

	loss_cls: tensor(0.5237, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0831, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6068, device='cuda:0', grad_fn=<AddBackward0>)



 11%|█▏        | 40/354 [03:34<28:05,  5.37s/it][A

	loss_cls: tensor(0.5901, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3357, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9258, device='cuda:0', grad_fn=<AddBackward0>)



 12%|█▏        | 41/354 [03:39<27:55,  5.35s/it][A

	loss_cls: tensor(0.6015, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2896, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8911, device='cuda:0', grad_fn=<AddBackward0>)



 12%|█▏        | 42/354 [03:45<27:46,  5.34s/it][A

	loss_cls: tensor(0.6784, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2223, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9007, device='cuda:0', grad_fn=<AddBackward0>)



 12%|█▏        | 43/354 [03:50<27:47,  5.36s/it][A

	loss_cls: tensor(0.6783, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1574, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8358, device='cuda:0', grad_fn=<AddBackward0>)



 12%|█▏        | 44/354 [03:55<27:36,  5.35s/it][A

	loss_cls: tensor(0.6736, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2175, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8911, device='cuda:0', grad_fn=<AddBackward0>)



 13%|█▎        | 45/354 [04:01<27:35,  5.36s/it][A

	loss_cls: tensor(0.6282, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0369, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6650, device='cuda:0', grad_fn=<AddBackward0>)



 13%|█▎        | 46/354 [04:06<27:25,  5.34s/it][A

	loss_cls: tensor(0.6063, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2602, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8666, device='cuda:0', grad_fn=<AddBackward0>)



 13%|█▎        | 47/354 [04:12<27:26,  5.36s/it][A

	loss_cls: tensor(0.5619, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3554, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9173, device='cuda:0', grad_fn=<AddBackward0>)



 14%|█▎        | 48/354 [04:17<27:18,  5.35s/it][A

	loss_cls: tensor(0.4726, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2555, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7281, device='cuda:0', grad_fn=<AddBackward0>)



 14%|█▍        | 49/354 [04:22<27:09,  5.34s/it][A

	loss_cls: tensor(0.4848, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2442, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7290, device='cuda:0', grad_fn=<AddBackward0>)



 14%|█▍        | 50/354 [04:28<27:07,  5.35s/it][A

	loss_cls: tensor(0.2934, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3136, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6070, device='cuda:0', grad_fn=<AddBackward0>)



 14%|█▍        | 51/354 [04:33<27:00,  5.35s/it][A

	loss_cls: tensor(0.4463, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1029, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5492, device='cuda:0', grad_fn=<AddBackward0>)



 15%|█▍        | 52/354 [04:38<26:59,  5.36s/it][A

	loss_cls: tensor(0.5444, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0684, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6128, device='cuda:0', grad_fn=<AddBackward0>)



 15%|█▍        | 53/354 [04:44<26:47,  5.34s/it][A

	loss_cls: tensor(0.6804, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2368, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9172, device='cuda:0', grad_fn=<AddBackward0>)



 15%|█▌        | 54/354 [04:49<26:46,  5.35s/it][A

	loss_cls: tensor(0.4099, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2905, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7004, device='cuda:0', grad_fn=<AddBackward0>)



 16%|█▌        | 55/354 [04:54<26:37,  5.34s/it][A

	loss_cls: tensor(0.5465, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2994, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8459, device='cuda:0', grad_fn=<AddBackward0>)



 16%|█▌        | 56/354 [05:00<26:28,  5.33s/it][A

	loss_cls: tensor(0.6381, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2308, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8689, device='cuda:0', grad_fn=<AddBackward0>)



 16%|█▌        | 57/354 [05:05<26:28,  5.35s/it][A

	loss_cls: tensor(0.4063, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1141, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5204, device='cuda:0', grad_fn=<AddBackward0>)



 16%|█▋        | 58/354 [05:10<26:16,  5.33s/it][A

	loss_cls: tensor(0.5223, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2280, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7503, device='cuda:0', grad_fn=<AddBackward0>)



 17%|█▋        | 59/354 [05:16<26:16,  5.35s/it][A

	loss_cls: tensor(0.4796, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.6728, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1524, device='cuda:0', grad_fn=<AddBackward0>)



 17%|█▋        | 60/354 [05:21<26:06,  5.33s/it][A

	loss_cls: tensor(0.5623, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1915, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7539, device='cuda:0', grad_fn=<AddBackward0>)



 17%|█▋        | 61/354 [05:26<26:07,  5.35s/it][A

	loss_cls: tensor(0.7245, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1402, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8647, device='cuda:0', grad_fn=<AddBackward0>)



 18%|█▊        | 62/354 [05:32<26:00,  5.35s/it][A

	loss_cls: tensor(0.9007, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0972, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9979, device='cuda:0', grad_fn=<AddBackward0>)



 18%|█▊        | 63/354 [05:37<26:02,  5.37s/it][A

	loss_cls: tensor(0.4708, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2425, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7132, device='cuda:0', grad_fn=<AddBackward0>)



 18%|█▊        | 64/354 [05:43<26:12,  5.42s/it][A

	loss_cls: tensor(0.4859, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0916, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5776, device='cuda:0', grad_fn=<AddBackward0>)



 18%|█▊        | 65/354 [05:48<26:11,  5.44s/it][A

	loss_cls: tensor(0.3615, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2104, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5719, device='cuda:0', grad_fn=<AddBackward0>)



 19%|█▊        | 66/354 [05:54<26:15,  5.47s/it][A

	loss_cls: tensor(0.5551, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2693, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8244, device='cuda:0', grad_fn=<AddBackward0>)



 19%|█▉        | 67/354 [05:59<26:12,  5.48s/it][A

	loss_cls: tensor(0.8728, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3136, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1864, device='cuda:0', grad_fn=<AddBackward0>)



 19%|█▉        | 68/354 [06:05<26:15,  5.51s/it][A

	loss_cls: tensor(0.5344, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2106, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7449, device='cuda:0', grad_fn=<AddBackward0>)



 19%|█▉        | 69/354 [06:10<26:06,  5.50s/it][A

	loss_cls: tensor(0.4925, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1556, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6481, device='cuda:0', grad_fn=<AddBackward0>)



 20%|█▉        | 70/354 [06:16<25:58,  5.49s/it][A

	loss_cls: tensor(0.4404, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0722, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5126, device='cuda:0', grad_fn=<AddBackward0>)



 20%|██        | 71/354 [06:21<25:57,  5.50s/it][A

	loss_cls: tensor(0.4776, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1924, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6700, device='cuda:0', grad_fn=<AddBackward0>)



 20%|██        | 72/354 [06:27<25:48,  5.49s/it][A

	loss_cls: tensor(0.3564, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3976, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7540, device='cuda:0', grad_fn=<AddBackward0>)



 21%|██        | 73/354 [06:32<25:48,  5.51s/it][A

	loss_cls: tensor(0.5400, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1044, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6445, device='cuda:0', grad_fn=<AddBackward0>)



 21%|██        | 74/354 [06:38<25:41,  5.50s/it][A

	loss_cls: tensor(0.5304, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1307, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6612, device='cuda:0', grad_fn=<AddBackward0>)



 21%|██        | 75/354 [06:43<25:38,  5.52s/it][A

	loss_cls: tensor(0.5883, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1195, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7078, device='cuda:0', grad_fn=<AddBackward0>)



 21%|██▏       | 76/354 [06:49<25:29,  5.50s/it][A

	loss_cls: tensor(0.6607, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4363, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0970, device='cuda:0', grad_fn=<AddBackward0>)



 22%|██▏       | 77/354 [06:54<25:20,  5.49s/it][A

	loss_cls: tensor(0.4551, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4870, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9421, device='cuda:0', grad_fn=<AddBackward0>)



 22%|██▏       | 78/354 [07:00<25:20,  5.51s/it][A

	loss_cls: tensor(0.2926, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1621, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4547, device='cuda:0', grad_fn=<AddBackward0>)



 22%|██▏       | 79/354 [07:05<25:11,  5.50s/it][A

	loss_cls: tensor(0.6471, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1758, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8230, device='cuda:0', grad_fn=<AddBackward0>)



 23%|██▎       | 80/354 [07:11<25:11,  5.52s/it][A

	loss_cls: tensor(0.4207, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0414, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4621, device='cuda:0', grad_fn=<AddBackward0>)



 23%|██▎       | 81/354 [07:16<25:02,  5.50s/it][A

	loss_cls: tensor(0.5282, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2675, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7958, device='cuda:0', grad_fn=<AddBackward0>)



 23%|██▎       | 82/354 [07:22<25:02,  5.52s/it][A

	loss_cls: tensor(0.5050, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2861, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7911, device='cuda:0', grad_fn=<AddBackward0>)



 23%|██▎       | 83/354 [07:27<24:52,  5.51s/it][A

	loss_cls: tensor(0.5127, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2253, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7380, device='cuda:0', grad_fn=<AddBackward0>)



 24%|██▎       | 84/354 [07:33<24:45,  5.50s/it][A

	loss_cls: tensor(0.3202, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0441, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.3643, device='cuda:0', grad_fn=<AddBackward0>)



 24%|██▍       | 85/354 [07:38<24:38,  5.50s/it][A

	loss_cls: tensor(0.5688, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1183, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6870, device='cuda:0', grad_fn=<AddBackward0>)



 24%|██▍       | 86/354 [07:44<24:31,  5.49s/it][A

	loss_cls: tensor(0.8583, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3640, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2223, device='cuda:0', grad_fn=<AddBackward0>)



 25%|██▍       | 87/354 [07:49<24:32,  5.51s/it][A

	loss_cls: tensor(0.3624, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1930, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5554, device='cuda:0', grad_fn=<AddBackward0>)



 25%|██▍       | 88/354 [07:55<24:24,  5.50s/it][A

	loss_cls: tensor(0.4176, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1780, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5956, device='cuda:0', grad_fn=<AddBackward0>)



 25%|██▌       | 89/354 [08:00<24:22,  5.52s/it][A

	loss_cls: tensor(0.6588, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2624, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9212, device='cuda:0', grad_fn=<AddBackward0>)



 25%|██▌       | 90/354 [08:06<24:13,  5.51s/it][A

	loss_cls: tensor(0.6049, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4619, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0668, device='cuda:0', grad_fn=<AddBackward0>)



 26%|██▌       | 91/354 [08:11<23:21,  5.33s/it][A

	loss_cls: tensor(0.3482, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5158, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8640, device='cuda:0', grad_fn=<AddBackward0>)



 26%|██▌       | 92/354 [08:16<22:47,  5.22s/it][A

	loss_cls: tensor(0.6022, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1768, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7791, device='cuda:0', grad_fn=<AddBackward0>)



 26%|██▋       | 93/354 [08:21<22:20,  5.14s/it][A

	loss_cls: tensor(0.4587, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2045, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6632, device='cuda:0', grad_fn=<AddBackward0>)



 27%|██▋       | 94/354 [08:26<22:04,  5.10s/it][A

	loss_cls: tensor(0.8017, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3356, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1372, device='cuda:0', grad_fn=<AddBackward0>)



 27%|██▋       | 95/354 [08:31<21:49,  5.05s/it][A

	loss_cls: tensor(0.6120, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2156, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8276, device='cuda:0', grad_fn=<AddBackward0>)



 27%|██▋       | 96/354 [08:36<22:14,  5.17s/it][A

	loss_cls: tensor(0.6154, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3067, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9221, device='cuda:0', grad_fn=<AddBackward0>)



 27%|██▋       | 97/354 [08:42<22:31,  5.26s/it][A

	loss_cls: tensor(0.4025, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0607, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4632, device='cuda:0', grad_fn=<AddBackward0>)



 28%|██▊       | 98/354 [08:47<22:42,  5.32s/it][A

	loss_cls: tensor(0.5957, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2542, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8499, device='cuda:0', grad_fn=<AddBackward0>)



 28%|██▊       | 99/354 [08:53<22:54,  5.39s/it][A

	loss_cls: tensor(0.5357, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1553, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6909, device='cuda:0', grad_fn=<AddBackward0>)



 28%|██▊       | 100/354 [08:58<22:56,  5.42s/it][A

	loss_cls: tensor(0.8423, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3135, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1559, device='cuda:0', grad_fn=<AddBackward0>)



 29%|██▊       | 101/354 [09:04<23:02,  5.46s/it][A

	loss_cls: tensor(0.6644, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3950, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0594, device='cuda:0', grad_fn=<AddBackward0>)



 29%|██▉       | 102/354 [09:09<22:58,  5.47s/it][A

	loss_cls: tensor(0.4518, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0568, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5086, device='cuda:0', grad_fn=<AddBackward0>)



 29%|██▉       | 103/354 [09:14<22:20,  5.34s/it][A

	loss_cls: tensor(0.8027, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2828, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0855, device='cuda:0', grad_fn=<AddBackward0>)



 29%|██▉       | 104/354 [09:20<22:25,  5.38s/it][A

	loss_cls: tensor(0.5920, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1884, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7804, device='cuda:0', grad_fn=<AddBackward0>)



 30%|██▉       | 105/354 [09:25<22:31,  5.43s/it][A

	loss_cls: tensor(0.3989, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1730, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5718, device='cuda:0', grad_fn=<AddBackward0>)



 30%|██▉       | 106/354 [09:31<22:36,  5.47s/it][A

	loss_cls: tensor(0.4145, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2190, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6335, device='cuda:0', grad_fn=<AddBackward0>)



 30%|███       | 107/354 [09:36<22:32,  5.48s/it][A

	loss_cls: tensor(0.4029, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3884, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7913, device='cuda:0', grad_fn=<AddBackward0>)



 31%|███       | 108/354 [09:42<22:28,  5.48s/it][A

	loss_cls: tensor(1.0200, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4507, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.4708, device='cuda:0', grad_fn=<AddBackward0>)



 31%|███       | 109/354 [09:47<22:23,  5.48s/it][A

	loss_cls: tensor(0.3321, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3309, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6630, device='cuda:0', grad_fn=<AddBackward0>)



 31%|███       | 110/354 [09:53<22:26,  5.52s/it][A

	loss_cls: tensor(0.5821, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1865, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7686, device='cuda:0', grad_fn=<AddBackward0>)



 31%|███▏      | 111/354 [09:58<22:18,  5.51s/it][A

	loss_cls: tensor(1.1024, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0555, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1579, device='cuda:0', grad_fn=<AddBackward0>)



 32%|███▏      | 112/354 [10:04<22:14,  5.52s/it][A

	loss_cls: tensor(0.5167, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2292, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7459, device='cuda:0', grad_fn=<AddBackward0>)



 32%|███▏      | 113/354 [10:09<21:32,  5.36s/it][A

	loss_cls: tensor(0.6063, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2330, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8393, device='cuda:0', grad_fn=<AddBackward0>)



 32%|███▏      | 114/354 [10:14<21:19,  5.33s/it][A

	loss_cls: tensor(0.6549, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2019, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8568, device='cuda:0', grad_fn=<AddBackward0>)



 32%|███▏      | 115/354 [10:20<21:33,  5.41s/it][A

	loss_cls: tensor(0.5011, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0807, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5818, device='cuda:0', grad_fn=<AddBackward0>)



 33%|███▎      | 116/354 [10:25<21:34,  5.44s/it][A

	loss_cls: tensor(0.7738, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2717, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0454, device='cuda:0', grad_fn=<AddBackward0>)



 33%|███▎      | 117/354 [10:31<21:38,  5.48s/it][A

	loss_cls: tensor(0.6036, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2003, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8039, device='cuda:0', grad_fn=<AddBackward0>)



 33%|███▎      | 118/354 [10:36<21:33,  5.48s/it][A

	loss_cls: tensor(0.5114, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1946, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7060, device='cuda:0', grad_fn=<AddBackward0>)



 34%|███▎      | 119/354 [10:42<21:26,  5.48s/it][A

	loss_cls: tensor(0.3381, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2186, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5568, device='cuda:0', grad_fn=<AddBackward0>)



 34%|███▍      | 120/354 [10:47<21:28,  5.51s/it][A

	loss_cls: tensor(0.5200, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.8914, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.4114, device='cuda:0', grad_fn=<AddBackward0>)



 34%|███▍      | 121/354 [10:53<21:20,  5.50s/it][A

	loss_cls: tensor(0.5795, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0979, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6774, device='cuda:0', grad_fn=<AddBackward0>)



 34%|███▍      | 122/354 [10:58<21:21,  5.52s/it][A

	loss_cls: tensor(0.4832, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3035, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7866, device='cuda:0', grad_fn=<AddBackward0>)



 35%|███▍      | 123/354 [11:04<21:12,  5.51s/it][A

	loss_cls: tensor(0.4945, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1364, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6309, device='cuda:0', grad_fn=<AddBackward0>)



 35%|███▌      | 124/354 [11:09<21:08,  5.52s/it][A

	loss_cls: tensor(0.8863, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2458, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1321, device='cuda:0', grad_fn=<AddBackward0>)



 35%|███▌      | 125/354 [11:15<20:58,  5.50s/it][A

	loss_cls: tensor(0.5650, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2379, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8029, device='cuda:0', grad_fn=<AddBackward0>)



 36%|███▌      | 126/354 [11:20<20:41,  5.44s/it][A

	loss_cls: tensor(0.3681, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2239, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5920, device='cuda:0', grad_fn=<AddBackward0>)



 36%|███▌      | 127/354 [11:26<20:44,  5.48s/it][A

	loss_cls: tensor(0.5191, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3355, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8546, device='cuda:0', grad_fn=<AddBackward0>)



 36%|███▌      | 128/354 [11:31<20:39,  5.49s/it][A

	loss_cls: tensor(0.6557, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2410, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8967, device='cuda:0', grad_fn=<AddBackward0>)



 36%|███▋      | 129/354 [11:36<19:56,  5.32s/it][A

	loss_cls: tensor(0.4674, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2761, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7435, device='cuda:0', grad_fn=<AddBackward0>)



 37%|███▋      | 130/354 [11:41<19:23,  5.19s/it][A

	loss_cls: tensor(0.5639, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0692, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6331, device='cuda:0', grad_fn=<AddBackward0>)



 37%|███▋      | 131/354 [11:46<19:31,  5.26s/it][A

	loss_cls: tensor(0.3929, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3444, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7373, device='cuda:0', grad_fn=<AddBackward0>)



 37%|███▋      | 132/354 [11:52<19:39,  5.31s/it][A

	loss_cls: tensor(0.5394, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3230, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8624, device='cuda:0', grad_fn=<AddBackward0>)



 38%|███▊      | 133/354 [11:57<19:50,  5.39s/it][A

	loss_cls: tensor(1.0160, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3503, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.3663, device='cuda:0', grad_fn=<AddBackward0>)



 38%|███▊      | 134/354 [12:03<19:53,  5.42s/it][A

	loss_cls: tensor(0.6932, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3045, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9977, device='cuda:0', grad_fn=<AddBackward0>)



 38%|███▊      | 135/354 [12:08<19:52,  5.45s/it][A

	loss_cls: tensor(0.7897, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0623, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8520, device='cuda:0', grad_fn=<AddBackward0>)



 38%|███▊      | 136/354 [12:14<19:54,  5.48s/it][A

	loss_cls: tensor(0.5522, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2982, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8504, device='cuda:0', grad_fn=<AddBackward0>)



 39%|███▊      | 137/354 [12:19<19:47,  5.47s/it][A

	loss_cls: tensor(0.6710, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1011, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7721, device='cuda:0', grad_fn=<AddBackward0>)



 39%|███▉      | 138/354 [12:25<19:31,  5.43s/it][A

	loss_cls: tensor(0.3752, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2533, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6284, device='cuda:0', grad_fn=<AddBackward0>)



 39%|███▉      | 139/354 [12:30<19:28,  5.44s/it][A

	loss_cls: tensor(0.5809, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2470, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8279, device='cuda:0', grad_fn=<AddBackward0>)



 40%|███▉      | 140/354 [12:36<19:31,  5.48s/it][A

	loss_cls: tensor(0.5124, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3019, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8143, device='cuda:0', grad_fn=<AddBackward0>)



 40%|███▉      | 141/354 [12:41<19:27,  5.48s/it][A

	loss_cls: tensor(0.6067, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0453, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6520, device='cuda:0', grad_fn=<AddBackward0>)



 40%|████      | 142/354 [12:47<19:22,  5.49s/it][A

	loss_cls: tensor(0.7408, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4484, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1892, device='cuda:0', grad_fn=<AddBackward0>)



 40%|████      | 143/354 [12:52<19:21,  5.50s/it][A

	loss_cls: tensor(0.6947, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2196, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9143, device='cuda:0', grad_fn=<AddBackward0>)



 41%|████      | 144/354 [12:58<19:13,  5.49s/it][A

	loss_cls: tensor(0.7336, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1477, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8813, device='cuda:0', grad_fn=<AddBackward0>)



 41%|████      | 145/354 [13:03<19:11,  5.51s/it][A

	loss_cls: tensor(0.4810, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1188, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5998, device='cuda:0', grad_fn=<AddBackward0>)



 41%|████      | 146/354 [13:09<19:02,  5.49s/it][A

	loss_cls: tensor(0.6422, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3057, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9479, device='cuda:0', grad_fn=<AddBackward0>)



 42%|████▏     | 147/354 [13:14<19:01,  5.52s/it][A

	loss_cls: tensor(0.5221, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1067, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6288, device='cuda:0', grad_fn=<AddBackward0>)



 42%|████▏     | 148/354 [13:20<18:55,  5.51s/it][A

	loss_cls: tensor(0.6715, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2897, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9612, device='cuda:0', grad_fn=<AddBackward0>)



 42%|████▏     | 149/354 [13:25<18:47,  5.50s/it][A

	loss_cls: tensor(0.7607, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2748, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0355, device='cuda:0', grad_fn=<AddBackward0>)



 42%|████▏     | 150/354 [13:31<18:46,  5.52s/it][A

	loss_cls: tensor(0.5621, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0737, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6358, device='cuda:0', grad_fn=<AddBackward0>)



 43%|████▎     | 151/354 [13:36<18:37,  5.51s/it][A

	loss_cls: tensor(0.3912, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2586, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6499, device='cuda:0', grad_fn=<AddBackward0>)



 43%|████▎     | 152/354 [13:42<18:35,  5.52s/it][A

	loss_cls: tensor(0.6324, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2262, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8585, device='cuda:0', grad_fn=<AddBackward0>)



 43%|████▎     | 153/354 [13:47<18:25,  5.50s/it][A

	loss_cls: tensor(1.2000, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2597, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.4597, device='cuda:0', grad_fn=<AddBackward0>)



 44%|████▎     | 154/354 [13:53<18:12,  5.46s/it][A

	loss_cls: tensor(0.3385, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0801, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4185, device='cuda:0', grad_fn=<AddBackward0>)



 44%|████▍     | 155/354 [13:58<18:08,  5.47s/it][A

	loss_cls: tensor(0.4575, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0368, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4942, device='cuda:0', grad_fn=<AddBackward0>)



 44%|████▍     | 156/354 [14:04<18:03,  5.47s/it][A

	loss_cls: tensor(0.3848, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0900, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4747, device='cuda:0', grad_fn=<AddBackward0>)



 44%|████▍     | 157/354 [14:09<18:02,  5.50s/it][A

	loss_cls: tensor(0.3191, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0334, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.3526, device='cuda:0', grad_fn=<AddBackward0>)



 45%|████▍     | 158/354 [14:15<17:55,  5.49s/it][A

	loss_cls: tensor(0.5114, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.6749, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1863, device='cuda:0', grad_fn=<AddBackward0>)



 45%|████▍     | 159/354 [14:20<17:54,  5.51s/it][A

	loss_cls: tensor(0.2933, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1885, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4818, device='cuda:0', grad_fn=<AddBackward0>)



 45%|████▌     | 160/354 [14:26<17:49,  5.52s/it][A

	loss_cls: tensor(0.7588, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4440, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2028, device='cuda:0', grad_fn=<AddBackward0>)



 45%|████▌     | 161/354 [14:31<17:47,  5.53s/it][A

	loss_cls: tensor(0.5315, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1039, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6354, device='cuda:0', grad_fn=<AddBackward0>)



 46%|████▌     | 162/354 [14:37<17:39,  5.52s/it][A

	loss_cls: tensor(0.3624, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2491, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6115, device='cuda:0', grad_fn=<AddBackward0>)



 46%|████▌     | 163/354 [14:42<17:32,  5.51s/it][A

	loss_cls: tensor(0.3835, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2420, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6255, device='cuda:0', grad_fn=<AddBackward0>)



 46%|████▋     | 164/354 [14:48<17:32,  5.54s/it][A

	loss_cls: tensor(0.5942, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3334, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9276, device='cuda:0', grad_fn=<AddBackward0>)



 47%|████▋     | 165/354 [14:53<17:23,  5.52s/it][A

	loss_cls: tensor(0.4313, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1426, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5739, device='cuda:0', grad_fn=<AddBackward0>)



 47%|████▋     | 166/354 [14:59<17:19,  5.53s/it][A

	loss_cls: tensor(0.4701, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2887, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7587, device='cuda:0', grad_fn=<AddBackward0>)



 47%|████▋     | 167/354 [15:04<17:11,  5.51s/it][A

	loss_cls: tensor(0.4815, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1197, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6012, device='cuda:0', grad_fn=<AddBackward0>)



 47%|████▋     | 168/354 [15:10<17:09,  5.53s/it][A

	loss_cls: tensor(0.7797, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2930, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0727, device='cuda:0', grad_fn=<AddBackward0>)



 48%|████▊     | 169/354 [15:16<17:01,  5.52s/it][A

	loss_cls: tensor(0.8028, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4360, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2388, device='cuda:0', grad_fn=<AddBackward0>)



 48%|████▊     | 170/354 [15:21<16:53,  5.51s/it][A

	loss_cls: tensor(1.0312, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2632, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2945, device='cuda:0', grad_fn=<AddBackward0>)



 48%|████▊     | 171/354 [15:26<16:16,  5.34s/it][A

	loss_cls: tensor(0.4562, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3534, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8096, device='cuda:0', grad_fn=<AddBackward0>)



 49%|████▊     | 172/354 [15:31<15:47,  5.20s/it][A

	loss_cls: tensor(0.4433, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2255, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6687, device='cuda:0', grad_fn=<AddBackward0>)



 49%|████▉     | 173/354 [15:36<15:28,  5.13s/it][A

	loss_cls: tensor(0.6725, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2758, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9483, device='cuda:0', grad_fn=<AddBackward0>)



 49%|████▉     | 174/354 [15:41<15:12,  5.07s/it][A

	loss_cls: tensor(0.5173, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2572, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7745, device='cuda:0', grad_fn=<AddBackward0>)



 49%|████▉     | 175/354 [15:46<15:22,  5.15s/it][A

	loss_cls: tensor(0.6269, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1440, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7710, device='cuda:0', grad_fn=<AddBackward0>)



 50%|████▉     | 176/354 [15:52<15:35,  5.25s/it][A

	loss_cls: tensor(0.4980, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1472, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6452, device='cuda:0', grad_fn=<AddBackward0>)



 50%|█████     | 177/354 [15:57<15:37,  5.30s/it][A

	loss_cls: tensor(0.6623, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1273, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7896, device='cuda:0', grad_fn=<AddBackward0>)



 50%|█████     | 178/354 [16:03<15:45,  5.37s/it][A

	loss_cls: tensor(0.8700, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1911, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0611, device='cuda:0', grad_fn=<AddBackward0>)



 51%|█████     | 179/354 [16:08<15:45,  5.40s/it][A

	loss_cls: tensor(0.7451, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2252, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9703, device='cuda:0', grad_fn=<AddBackward0>)



 51%|█████     | 180/354 [16:14<15:48,  5.45s/it][A

	loss_cls: tensor(1.0647, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1749, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2396, device='cuda:0', grad_fn=<AddBackward0>)



 51%|█████     | 181/354 [16:19<15:43,  5.45s/it][A

	loss_cls: tensor(0.7043, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1395, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8438, device='cuda:0', grad_fn=<AddBackward0>)



 51%|█████▏    | 182/354 [16:25<15:42,  5.48s/it][A

	loss_cls: tensor(0.6195, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2174, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8369, device='cuda:0', grad_fn=<AddBackward0>)



 52%|█████▏    | 183/354 [16:30<15:36,  5.48s/it][A

	loss_cls: tensor(0.6252, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2187, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8440, device='cuda:0', grad_fn=<AddBackward0>)



 52%|█████▏    | 184/354 [16:35<15:30,  5.47s/it][A

	loss_cls: tensor(0.7923, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3901, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1825, device='cuda:0', grad_fn=<AddBackward0>)



 52%|█████▏    | 185/354 [16:41<15:28,  5.49s/it][A

	loss_cls: tensor(0.5336, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2111, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7448, device='cuda:0', grad_fn=<AddBackward0>)



 53%|█████▎    | 186/354 [16:46<15:19,  5.48s/it][A

	loss_cls: tensor(0.6240, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0843, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7083, device='cuda:0', grad_fn=<AddBackward0>)



 53%|█████▎    | 187/354 [16:52<15:18,  5.50s/it][A

	loss_cls: tensor(0.5789, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2031, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7821, device='cuda:0', grad_fn=<AddBackward0>)



 53%|█████▎    | 188/354 [16:57<15:11,  5.49s/it][A

	loss_cls: tensor(0.5696, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1638, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7334, device='cuda:0', grad_fn=<AddBackward0>)



 53%|█████▎    | 189/354 [17:03<15:08,  5.50s/it][A

	loss_cls: tensor(0.3766, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3579, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7345, device='cuda:0', grad_fn=<AddBackward0>)



 54%|█████▎    | 190/354 [17:08<14:58,  5.48s/it][A

	loss_cls: tensor(0.8409, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1784, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0193, device='cuda:0', grad_fn=<AddBackward0>)



 54%|█████▍    | 191/354 [17:14<14:52,  5.48s/it][A

	loss_cls: tensor(0.6197, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0968, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7165, device='cuda:0', grad_fn=<AddBackward0>)



 54%|█████▍    | 192/354 [17:19<14:50,  5.50s/it][A

	loss_cls: tensor(0.4107, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4878, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8985, device='cuda:0', grad_fn=<AddBackward0>)



 55%|█████▍    | 193/354 [17:25<14:43,  5.49s/it][A

	loss_cls: tensor(0.5499, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3887, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9386, device='cuda:0', grad_fn=<AddBackward0>)



 55%|█████▍    | 194/354 [17:30<14:41,  5.51s/it][A

	loss_cls: tensor(0.5208, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0953, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6161, device='cuda:0', grad_fn=<AddBackward0>)



 55%|█████▌    | 195/354 [17:36<14:34,  5.50s/it][A

	loss_cls: tensor(0.5785, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2094, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7880, device='cuda:0', grad_fn=<AddBackward0>)



 55%|█████▌    | 196/354 [17:42<14:31,  5.52s/it][A

	loss_cls: tensor(0.4580, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2983, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7563, device='cuda:0', grad_fn=<AddBackward0>)



 56%|█████▌    | 197/354 [17:47<14:23,  5.50s/it][A

	loss_cls: tensor(0.6650, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1614, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8264, device='cuda:0', grad_fn=<AddBackward0>)



 56%|█████▌    | 198/354 [17:52<14:16,  5.49s/it][A

	loss_cls: tensor(0.6653, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2166, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8819, device='cuda:0', grad_fn=<AddBackward0>)



 56%|█████▌    | 199/354 [17:58<14:12,  5.50s/it][A

	loss_cls: tensor(0.5336, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1454, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6790, device='cuda:0', grad_fn=<AddBackward0>)



 56%|█████▋    | 200/354 [18:03<14:01,  5.47s/it][A

	loss_cls: tensor(0.4207, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1810, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6016, device='cuda:0', grad_fn=<AddBackward0>)



 57%|█████▋    | 201/354 [18:09<14:01,  5.50s/it][A

	loss_cls: tensor(0.4893, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2784, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7677, device='cuda:0', grad_fn=<AddBackward0>)



 57%|█████▋    | 202/354 [18:14<13:55,  5.50s/it][A

	loss_cls: tensor(0.8137, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1462, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9599, device='cuda:0', grad_fn=<AddBackward0>)



 57%|█████▋    | 203/354 [18:20<13:52,  5.51s/it][A

	loss_cls: tensor(0.7077, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1901, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8978, device='cuda:0', grad_fn=<AddBackward0>)



 58%|█████▊    | 204/354 [18:25<13:44,  5.50s/it][A

	loss_cls: tensor(0.5308, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2314, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7622, device='cuda:0', grad_fn=<AddBackward0>)



 58%|█████▊    | 205/354 [18:31<13:36,  5.48s/it][A

	loss_cls: tensor(0.4163, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2479, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6642, device='cuda:0', grad_fn=<AddBackward0>)



 58%|█████▊    | 206/354 [18:36<13:35,  5.51s/it][A

	loss_cls: tensor(0.4957, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0549, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5506, device='cuda:0', grad_fn=<AddBackward0>)



 58%|█████▊    | 207/354 [18:42<13:28,  5.50s/it][A

	loss_cls: tensor(0.8049, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3921, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1970, device='cuda:0', grad_fn=<AddBackward0>)



 59%|█████▉    | 208/354 [18:48<13:26,  5.53s/it][A

	loss_cls: tensor(0.5621, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2923, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8544, device='cuda:0', grad_fn=<AddBackward0>)



 59%|█████▉    | 209/354 [18:53<13:17,  5.50s/it][A

	loss_cls: tensor(0.5002, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.6577, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1579, device='cuda:0', grad_fn=<AddBackward0>)



 59%|█████▉    | 210/354 [18:59<13:13,  5.51s/it][A

	loss_cls: tensor(0.6267, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1910, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8177, device='cuda:0', grad_fn=<AddBackward0>)



 60%|█████▉    | 211/354 [19:04<13:06,  5.50s/it][A

	loss_cls: tensor(0.3739, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2285, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6025, device='cuda:0', grad_fn=<AddBackward0>)



 60%|█████▉    | 212/354 [19:09<12:58,  5.48s/it][A

	loss_cls: tensor(0.5769, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2092, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7861, device='cuda:0', grad_fn=<AddBackward0>)



 60%|██████    | 213/354 [19:15<12:54,  5.50s/it][A

	loss_cls: tensor(0.6879, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2605, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9484, device='cuda:0', grad_fn=<AddBackward0>)



 60%|██████    | 214/354 [19:20<12:47,  5.48s/it][A

	loss_cls: tensor(0.4803, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0491, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5294, device='cuda:0', grad_fn=<AddBackward0>)



 61%|██████    | 215/354 [19:26<12:44,  5.50s/it][A

	loss_cls: tensor(1.0472, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2370, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2842, device='cuda:0', grad_fn=<AddBackward0>)



 61%|██████    | 216/354 [19:31<12:37,  5.49s/it][A

	loss_cls: tensor(0.5283, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5009, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0292, device='cuda:0', grad_fn=<AddBackward0>)



 61%|██████▏   | 217/354 [19:37<12:33,  5.50s/it][A

	loss_cls: tensor(0.5286, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2725, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8010, device='cuda:0', grad_fn=<AddBackward0>)



 62%|██████▏   | 218/354 [19:42<12:25,  5.49s/it][A

	loss_cls: tensor(0.3534, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1932, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5465, device='cuda:0', grad_fn=<AddBackward0>)



 62%|██████▏   | 219/354 [19:48<12:18,  5.47s/it][A

	loss_cls: tensor(0.6745, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2099, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8844, device='cuda:0', grad_fn=<AddBackward0>)



 62%|██████▏   | 220/354 [19:53<12:16,  5.50s/it][A

	loss_cls: tensor(0.3984, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1550, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5534, device='cuda:0', grad_fn=<AddBackward0>)



 62%|██████▏   | 221/354 [19:59<12:10,  5.49s/it][A

	loss_cls: tensor(0.2458, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1282, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.3740, device='cuda:0', grad_fn=<AddBackward0>)



 63%|██████▎   | 222/354 [20:04<12:05,  5.50s/it][A

	loss_cls: tensor(0.3714, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3525, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7239, device='cuda:0', grad_fn=<AddBackward0>)



 63%|██████▎   | 223/354 [20:10<11:56,  5.47s/it][A

	loss_cls: tensor(0.4582, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1332, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5915, device='cuda:0', grad_fn=<AddBackward0>)



 63%|██████▎   | 224/354 [20:15<11:52,  5.48s/it][A

	loss_cls: tensor(0.5317, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1215, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6532, device='cuda:0', grad_fn=<AddBackward0>)



 64%|██████▎   | 225/354 [20:21<11:45,  5.47s/it][A

	loss_cls: tensor(0.4564, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1687, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6251, device='cuda:0', grad_fn=<AddBackward0>)



 64%|██████▍   | 226/354 [20:26<11:38,  5.45s/it][A

	loss_cls: tensor(0.9385, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2969, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2354, device='cuda:0', grad_fn=<AddBackward0>)



 64%|██████▍   | 227/354 [20:32<11:36,  5.49s/it][A

	loss_cls: tensor(0.6675, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3510, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0185, device='cuda:0', grad_fn=<AddBackward0>)



 64%|██████▍   | 228/354 [20:37<11:30,  5.48s/it][A

	loss_cls: tensor(0.6672, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3117, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9789, device='cuda:0', grad_fn=<AddBackward0>)



 65%|██████▍   | 229/354 [20:43<11:27,  5.50s/it][A

	loss_cls: tensor(0.5581, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1990, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7571, device='cuda:0', grad_fn=<AddBackward0>)



 65%|██████▍   | 230/354 [20:48<11:20,  5.49s/it][A

	loss_cls: tensor(0.7617, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3004, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0621, device='cuda:0', grad_fn=<AddBackward0>)



 65%|██████▌   | 231/354 [20:54<11:15,  5.49s/it][A

	loss_cls: tensor(0.6208, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1808, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8016, device='cuda:0', grad_fn=<AddBackward0>)



 66%|██████▌   | 232/354 [20:59<11:07,  5.47s/it][A

	loss_cls: tensor(1.1421, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1507, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2928, device='cuda:0', grad_fn=<AddBackward0>)



 66%|██████▌   | 233/354 [21:05<11:03,  5.48s/it][A

	loss_cls: tensor(0.4541, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0901, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5442, device='cuda:0', grad_fn=<AddBackward0>)



 66%|██████▌   | 234/354 [21:10<10:57,  5.48s/it][A

	loss_cls: tensor(0.4472, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3303, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7775, device='cuda:0', grad_fn=<AddBackward0>)



 66%|██████▋   | 235/354 [21:16<10:51,  5.48s/it][A

	loss_cls: tensor(0.6862, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4756, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1618, device='cuda:0', grad_fn=<AddBackward0>)



 67%|██████▋   | 236/354 [21:21<10:48,  5.50s/it][A

	loss_cls: tensor(0.5819, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2039, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7858, device='cuda:0', grad_fn=<AddBackward0>)



 67%|██████▋   | 237/354 [21:27<10:42,  5.49s/it][A

	loss_cls: tensor(0.5883, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5121, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1004, device='cuda:0', grad_fn=<AddBackward0>)



 67%|██████▋   | 238/354 [21:32<10:39,  5.51s/it][A

	loss_cls: tensor(0.6121, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3285, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9406, device='cuda:0', grad_fn=<AddBackward0>)



 68%|██████▊   | 239/354 [21:38<10:31,  5.50s/it][A

	loss_cls: tensor(1.1266, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2429, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.3695, device='cuda:0', grad_fn=<AddBackward0>)



 68%|██████▊   | 240/354 [21:43<10:27,  5.51s/it][A

	loss_cls: tensor(0.7052, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4545, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1597, device='cuda:0', grad_fn=<AddBackward0>)



 68%|██████▊   | 241/354 [21:49<10:21,  5.50s/it][A

	loss_cls: tensor(0.8609, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0861, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9470, device='cuda:0', grad_fn=<AddBackward0>)



 68%|██████▊   | 242/354 [21:54<10:15,  5.49s/it][A

	loss_cls: tensor(0.7836, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1053, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8889, device='cuda:0', grad_fn=<AddBackward0>)



 69%|██████▊   | 243/354 [22:00<10:11,  5.51s/it][A

	loss_cls: tensor(0.4880, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1925, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6805, device='cuda:0', grad_fn=<AddBackward0>)



 69%|██████▉   | 244/354 [22:05<10:04,  5.50s/it][A

	loss_cls: tensor(0.4852, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1130, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5983, device='cuda:0', grad_fn=<AddBackward0>)



 69%|██████▉   | 245/354 [22:11<09:57,  5.49s/it][A

	loss_cls: tensor(0.7774, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1959, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9732, device='cuda:0', grad_fn=<AddBackward0>)



 69%|██████▉   | 246/354 [22:16<09:50,  5.47s/it][A

	loss_cls: tensor(0.5851, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2074, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7925, device='cuda:0', grad_fn=<AddBackward0>)



 70%|██████▉   | 247/354 [22:22<09:48,  5.50s/it][A

	loss_cls: tensor(0.7621, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3496, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1117, device='cuda:0', grad_fn=<AddBackward0>)



 70%|███████   | 248/354 [22:27<09:41,  5.49s/it][A

	loss_cls: tensor(0.6673, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3554, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0227, device='cuda:0', grad_fn=<AddBackward0>)



 70%|███████   | 249/354 [22:32<09:35,  5.48s/it][A

	loss_cls: tensor(0.4822, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3394, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8216, device='cuda:0', grad_fn=<AddBackward0>)



 71%|███████   | 250/354 [22:38<09:31,  5.50s/it][A

	loss_cls: tensor(0.6019, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1701, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7720, device='cuda:0', grad_fn=<AddBackward0>)



 71%|███████   | 251/354 [22:43<09:25,  5.49s/it][A

	loss_cls: tensor(0.4196, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3415, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7611, device='cuda:0', grad_fn=<AddBackward0>)



 71%|███████   | 252/354 [22:49<09:21,  5.50s/it][A

	loss_cls: tensor(0.5034, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1714, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6748, device='cuda:0', grad_fn=<AddBackward0>)



 71%|███████▏  | 253/354 [22:54<09:13,  5.48s/it][A

	loss_cls: tensor(0.3660, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1583, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5243, device='cuda:0', grad_fn=<AddBackward0>)



 72%|███████▏  | 254/354 [23:00<09:10,  5.50s/it][A

	loss_cls: tensor(0.4405, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0534, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4940, device='cuda:0', grad_fn=<AddBackward0>)



 72%|███████▏  | 255/354 [23:05<09:03,  5.49s/it][A

	loss_cls: tensor(0.4276, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1776, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6052, device='cuda:0', grad_fn=<AddBackward0>)



 72%|███████▏  | 256/354 [23:11<08:56,  5.48s/it][A

	loss_cls: tensor(0.6018, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1888, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7906, device='cuda:0', grad_fn=<AddBackward0>)



 73%|███████▎  | 257/354 [23:16<08:52,  5.49s/it][A

	loss_cls: tensor(0.4672, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1703, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6376, device='cuda:0', grad_fn=<AddBackward0>)



 73%|███████▎  | 258/354 [23:22<08:46,  5.48s/it][A

	loss_cls: tensor(0.9421, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2015, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1437, device='cuda:0', grad_fn=<AddBackward0>)



 73%|███████▎  | 259/354 [23:27<08:42,  5.50s/it][A

	loss_cls: tensor(0.3500, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2900, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6400, device='cuda:0', grad_fn=<AddBackward0>)



 73%|███████▎  | 260/354 [23:32<08:23,  5.36s/it][A

	loss_cls: tensor(0.7012, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2644, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9656, device='cuda:0', grad_fn=<AddBackward0>)



 74%|███████▎  | 261/354 [23:38<08:11,  5.29s/it][A

	loss_cls: tensor(0.5680, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4995, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0676, device='cuda:0', grad_fn=<AddBackward0>)



 74%|███████▍  | 262/354 [23:43<08:06,  5.29s/it][A

	loss_cls: tensor(0.5262, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3790, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9051, device='cuda:0', grad_fn=<AddBackward0>)



 74%|███████▍  | 263/354 [23:48<08:05,  5.34s/it][A

	loss_cls: tensor(0.7217, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1393, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8610, device='cuda:0', grad_fn=<AddBackward0>)



 75%|███████▍  | 264/354 [23:54<08:05,  5.40s/it][A

	loss_cls: tensor(0.5657, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3671, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9329, device='cuda:0', grad_fn=<AddBackward0>)



 75%|███████▍  | 265/354 [23:59<08:02,  5.42s/it][A

	loss_cls: tensor(0.7819, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4064, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1882, device='cuda:0', grad_fn=<AddBackward0>)



 75%|███████▌  | 266/354 [24:05<08:00,  5.46s/it][A

	loss_cls: tensor(0.5989, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1226, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7215, device='cuda:0', grad_fn=<AddBackward0>)



 75%|███████▌  | 267/354 [24:10<07:55,  5.46s/it][A

	loss_cls: tensor(0.5964, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1025, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6990, device='cuda:0', grad_fn=<AddBackward0>)



 76%|███████▌  | 268/354 [24:16<07:49,  5.45s/it][A

	loss_cls: tensor(0.5893, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1807, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7700, device='cuda:0', grad_fn=<AddBackward0>)



 76%|███████▌  | 269/354 [24:21<07:43,  5.46s/it][A

	loss_cls: tensor(0.4469, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3925, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8395, device='cuda:0', grad_fn=<AddBackward0>)



 76%|███████▋  | 270/354 [24:27<07:38,  5.46s/it][A

	loss_cls: tensor(0.8806, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2050, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0857, device='cuda:0', grad_fn=<AddBackward0>)



 77%|███████▋  | 271/354 [24:32<07:34,  5.48s/it][A

	loss_cls: tensor(0.5222, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2373, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7595, device='cuda:0', grad_fn=<AddBackward0>)



 77%|███████▋  | 272/354 [24:38<07:28,  5.47s/it][A

	loss_cls: tensor(0.6074, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1546, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7621, device='cuda:0', grad_fn=<AddBackward0>)



 77%|███████▋  | 273/354 [24:43<07:24,  5.49s/it][A

	loss_cls: tensor(0.5738, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0734, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6472, device='cuda:0', grad_fn=<AddBackward0>)



 77%|███████▋  | 274/354 [24:49<07:18,  5.48s/it][A

	loss_cls: tensor(0.4153, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3200, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7353, device='cuda:0', grad_fn=<AddBackward0>)



 78%|███████▊  | 275/354 [24:54<07:14,  5.50s/it][A

	loss_cls: tensor(0.6145, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1850, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7995, device='cuda:0', grad_fn=<AddBackward0>)



 78%|███████▊  | 276/354 [25:00<07:09,  5.50s/it][A

	loss_cls: tensor(0.8416, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0855, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9271, device='cuda:0', grad_fn=<AddBackward0>)



 78%|███████▊  | 277/354 [25:05<07:02,  5.49s/it][A

	loss_cls: tensor(0.5446, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0911, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6356, device='cuda:0', grad_fn=<AddBackward0>)



 79%|███████▊  | 278/354 [25:11<06:58,  5.51s/it][A

	loss_cls: tensor(0.6066, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0391, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6457, device='cuda:0', grad_fn=<AddBackward0>)



 79%|███████▉  | 279/354 [25:16<06:51,  5.49s/it][A

	loss_cls: tensor(0.7446, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2415, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9862, device='cuda:0', grad_fn=<AddBackward0>)



 79%|███████▉  | 280/354 [25:22<06:47,  5.51s/it][A

	loss_cls: tensor(0.9019, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0983, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0002, device='cuda:0', grad_fn=<AddBackward0>)



 79%|███████▉  | 281/354 [25:27<06:41,  5.50s/it][A

	loss_cls: tensor(0.8704, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2108, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0812, device='cuda:0', grad_fn=<AddBackward0>)



 80%|███████▉  | 282/354 [25:33<06:37,  5.51s/it][A

	loss_cls: tensor(0.6821, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3664, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0484, device='cuda:0', grad_fn=<AddBackward0>)



 80%|███████▉  | 283/354 [25:38<06:30,  5.50s/it][A

	loss_cls: tensor(0.5914, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0392, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6306, device='cuda:0', grad_fn=<AddBackward0>)



 80%|████████  | 284/354 [25:44<06:23,  5.48s/it][A

	loss_cls: tensor(0.4579, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0286, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4865, device='cuda:0', grad_fn=<AddBackward0>)



 81%|████████  | 285/354 [25:49<06:19,  5.51s/it][A

	loss_cls: tensor(0.6312, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3607, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9919, device='cuda:0', grad_fn=<AddBackward0>)



 81%|████████  | 286/354 [25:55<06:13,  5.49s/it][A

	loss_cls: tensor(0.6315, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1447, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7762, device='cuda:0', grad_fn=<AddBackward0>)



 81%|████████  | 287/354 [26:00<06:09,  5.51s/it][A

	loss_cls: tensor(0.5911, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3625, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9536, device='cuda:0', grad_fn=<AddBackward0>)



 81%|████████▏ | 288/354 [26:06<06:03,  5.51s/it][A

	loss_cls: tensor(0.5775, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2360, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8135, device='cuda:0', grad_fn=<AddBackward0>)



 82%|████████▏ | 289/354 [26:11<05:58,  5.52s/it][A

	loss_cls: tensor(0.4557, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2110, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6667, device='cuda:0', grad_fn=<AddBackward0>)



 82%|████████▏ | 290/354 [26:17<05:52,  5.50s/it][A

	loss_cls: tensor(0.5191, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1887, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7078, device='cuda:0', grad_fn=<AddBackward0>)



 82%|████████▏ | 291/354 [26:22<05:44,  5.46s/it][A

	loss_cls: tensor(0.4601, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0854, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5455, device='cuda:0', grad_fn=<AddBackward0>)



 82%|████████▏ | 292/354 [26:28<05:39,  5.48s/it][A

	loss_cls: tensor(0.5590, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2291, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7881, device='cuda:0', grad_fn=<AddBackward0>)



 83%|████████▎ | 293/354 [26:33<05:34,  5.48s/it][A

	loss_cls: tensor(0.5288, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1553, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6841, device='cuda:0', grad_fn=<AddBackward0>)



 83%|████████▎ | 294/354 [26:39<05:30,  5.50s/it][A

	loss_cls: tensor(0.6655, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1697, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8352, device='cuda:0', grad_fn=<AddBackward0>)



 83%|████████▎ | 295/354 [26:44<05:24,  5.49s/it][A

	loss_cls: tensor(0.7694, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3984, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1677, device='cuda:0', grad_fn=<AddBackward0>)



 84%|████████▎ | 296/354 [26:50<05:19,  5.51s/it][A

	loss_cls: tensor(0.6200, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0871, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7072, device='cuda:0', grad_fn=<AddBackward0>)



 84%|████████▍ | 297/354 [26:55<05:13,  5.49s/it][A

	loss_cls: tensor(0.9429, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1998, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1427, device='cuda:0', grad_fn=<AddBackward0>)



 84%|████████▍ | 298/354 [27:01<05:07,  5.49s/it][A

	loss_cls: tensor(0.2546, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1164, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.3710, device='cuda:0', grad_fn=<AddBackward0>)



 84%|████████▍ | 299/354 [27:06<05:02,  5.50s/it][A

	loss_cls: tensor(0.4260, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0471, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4731, device='cuda:0', grad_fn=<AddBackward0>)



 85%|████████▍ | 300/354 [27:12<04:56,  5.48s/it][A

	loss_cls: tensor(1.1306, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3015, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.4322, device='cuda:0', grad_fn=<AddBackward0>)



 85%|████████▌ | 301/354 [27:17<04:51,  5.51s/it][A

	loss_cls: tensor(0.6235, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1896, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8131, device='cuda:0', grad_fn=<AddBackward0>)



 85%|████████▌ | 302/354 [27:23<04:45,  5.49s/it][A

	loss_cls: tensor(0.8963, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1568, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0532, device='cuda:0', grad_fn=<AddBackward0>)



 86%|████████▌ | 303/354 [27:28<04:41,  5.52s/it][A

	loss_cls: tensor(0.4168, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1942, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6110, device='cuda:0', grad_fn=<AddBackward0>)



 86%|████████▌ | 304/354 [27:34<04:32,  5.46s/it][A

	loss_cls: tensor(0.6692, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2065, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8757, device='cuda:0', grad_fn=<AddBackward0>)



 86%|████████▌ | 305/354 [27:39<04:25,  5.42s/it][A

	loss_cls: tensor(0.8535, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3391, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1926, device='cuda:0', grad_fn=<AddBackward0>)



 86%|████████▋ | 306/354 [27:44<04:19,  5.40s/it][A

	loss_cls: tensor(0.6271, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1130, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7402, device='cuda:0', grad_fn=<AddBackward0>)



 87%|████████▋ | 307/354 [27:50<04:12,  5.38s/it][A

	loss_cls: tensor(0.8054, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3847, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1901, device='cuda:0', grad_fn=<AddBackward0>)



 87%|████████▋ | 308/354 [27:55<04:08,  5.40s/it][A

	loss_cls: tensor(0.5922, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1144, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7066, device='cuda:0', grad_fn=<AddBackward0>)



 87%|████████▋ | 309/354 [28:00<04:02,  5.38s/it][A

	loss_cls: tensor(0.5053, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3554, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8606, device='cuda:0', grad_fn=<AddBackward0>)



 88%|████████▊ | 310/354 [28:06<03:57,  5.39s/it][A

	loss_cls: tensor(0.7220, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3543, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0763, device='cuda:0', grad_fn=<AddBackward0>)



 88%|████████▊ | 311/354 [28:11<03:50,  5.37s/it][A

	loss_cls: tensor(0.5424, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1337, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6761, device='cuda:0', grad_fn=<AddBackward0>)



 88%|████████▊ | 312/354 [28:16<03:44,  5.35s/it][A

	loss_cls: tensor(0.5971, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2211, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8182, device='cuda:0', grad_fn=<AddBackward0>)



 88%|████████▊ | 313/354 [28:22<03:40,  5.37s/it][A

	loss_cls: tensor(0.4801, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1760, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6561, device='cuda:0', grad_fn=<AddBackward0>)



 89%|████████▊ | 314/354 [28:27<03:34,  5.36s/it][A

	loss_cls: tensor(0.6986, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1177, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8162, device='cuda:0', grad_fn=<AddBackward0>)



 89%|████████▉ | 315/354 [28:33<03:29,  5.38s/it][A

	loss_cls: tensor(0.7497, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2105, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9602, device='cuda:0', grad_fn=<AddBackward0>)



 89%|████████▉ | 316/354 [28:38<03:23,  5.36s/it][A

	loss_cls: tensor(0.5129, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2868, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7997, device='cuda:0', grad_fn=<AddBackward0>)



 90%|████████▉ | 317/354 [28:43<03:18,  5.36s/it][A

	loss_cls: tensor(0.5491, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2681, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8172, device='cuda:0', grad_fn=<AddBackward0>)



 90%|████████▉ | 318/354 [28:49<03:12,  5.35s/it][A

	loss_cls: tensor(0.6166, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1473, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7639, device='cuda:0', grad_fn=<AddBackward0>)



 90%|█████████ | 319/354 [28:54<03:06,  5.34s/it][A

	loss_cls: tensor(0.5239, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1700, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6939, device='cuda:0', grad_fn=<AddBackward0>)



 90%|█████████ | 320/354 [28:59<03:02,  5.37s/it][A

	loss_cls: tensor(0.6902, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1946, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8849, device='cuda:0', grad_fn=<AddBackward0>)



 91%|█████████ | 321/354 [29:05<02:56,  5.36s/it][A

	loss_cls: tensor(0.7495, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1580, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9075, device='cuda:0', grad_fn=<AddBackward0>)



 91%|█████████ | 322/354 [29:10<02:52,  5.39s/it][A

	loss_cls: tensor(0.8010, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2009, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0020, device='cuda:0', grad_fn=<AddBackward0>)



 91%|█████████ | 323/354 [29:15<02:46,  5.37s/it][A

	loss_cls: tensor(0.6196, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1694, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7890, device='cuda:0', grad_fn=<AddBackward0>)



 92%|█████████▏| 324/354 [29:21<02:41,  5.39s/it][A

	loss_cls: tensor(0.4946, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1073, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6018, device='cuda:0', grad_fn=<AddBackward0>)



 92%|█████████▏| 325/354 [29:26<02:35,  5.37s/it][A

	loss_cls: tensor(0.5890, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1508, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7398, device='cuda:0', grad_fn=<AddBackward0>)



 92%|█████████▏| 326/354 [29:32<02:29,  5.35s/it][A

	loss_cls: tensor(0.5756, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2120, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7877, device='cuda:0', grad_fn=<AddBackward0>)



 92%|█████████▏| 327/354 [29:37<02:25,  5.38s/it][A

	loss_cls: tensor(0.6572, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2576, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9148, device='cuda:0', grad_fn=<AddBackward0>)



 93%|█████████▎| 328/354 [29:42<02:19,  5.36s/it][A

	loss_cls: tensor(0.4964, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2169, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7133, device='cuda:0', grad_fn=<AddBackward0>)



 93%|█████████▎| 329/354 [29:48<02:14,  5.38s/it][A

	loss_cls: tensor(0.5787, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0812, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6600, device='cuda:0', grad_fn=<AddBackward0>)



 93%|█████████▎| 330/354 [29:53<02:08,  5.37s/it][A

	loss_cls: tensor(0.5917, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1459, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7376, device='cuda:0', grad_fn=<AddBackward0>)



 94%|█████████▎| 331/354 [29:58<02:03,  5.38s/it][A

	loss_cls: tensor(0.7532, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2058, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9590, device='cuda:0', grad_fn=<AddBackward0>)



 94%|█████████▍| 332/354 [30:04<01:58,  5.37s/it][A

	loss_cls: tensor(0.4674, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2232, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6906, device='cuda:0', grad_fn=<AddBackward0>)



 94%|█████████▍| 333/354 [30:09<01:52,  5.38s/it][A

	loss_cls: tensor(0.3770, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1120, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4890, device='cuda:0', grad_fn=<AddBackward0>)



 94%|█████████▍| 334/354 [30:15<01:47,  5.37s/it][A

	loss_cls: tensor(0.5764, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3455, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9219, device='cuda:0', grad_fn=<AddBackward0>)



 95%|█████████▍| 335/354 [30:20<01:41,  5.36s/it][A

	loss_cls: tensor(0.6323, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2909, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9232, device='cuda:0', grad_fn=<AddBackward0>)



 95%|█████████▍| 336/354 [30:25<01:36,  5.39s/it][A

	loss_cls: tensor(0.6527, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2324, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8851, device='cuda:0', grad_fn=<AddBackward0>)



 95%|█████████▌| 337/354 [30:31<01:31,  5.37s/it][A

	loss_cls: tensor(0.4851, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1906, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6757, device='cuda:0', grad_fn=<AddBackward0>)



 95%|█████████▌| 338/354 [30:36<01:26,  5.39s/it][A

	loss_cls: tensor(0.4663, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1260, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5923, device='cuda:0', grad_fn=<AddBackward0>)



 96%|█████████▌| 339/354 [30:41<01:20,  5.38s/it][A

	loss_cls: tensor(0.6763, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0415, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7178, device='cuda:0', grad_fn=<AddBackward0>)



 96%|█████████▌| 340/354 [30:47<01:15,  5.38s/it][A

	loss_cls: tensor(0.5301, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0934, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6235, device='cuda:0', grad_fn=<AddBackward0>)



 96%|█████████▋| 341/354 [30:52<01:10,  5.39s/it][A

	loss_cls: tensor(0.4886, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.7971, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2858, device='cuda:0', grad_fn=<AddBackward0>)



 97%|█████████▋| 342/354 [30:58<01:04,  5.37s/it][A

	loss_cls: tensor(0.7915, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2108, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0022, device='cuda:0', grad_fn=<AddBackward0>)



 97%|█████████▋| 343/354 [31:03<00:59,  5.39s/it][A

	loss_cls: tensor(0.9856, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3798, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.3654, device='cuda:0', grad_fn=<AddBackward0>)



 97%|█████████▋| 344/354 [31:08<00:53,  5.37s/it][A

	loss_cls: tensor(0.6722, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5067, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1789, device='cuda:0', grad_fn=<AddBackward0>)



 97%|█████████▋| 345/354 [31:14<00:48,  5.39s/it][A

	loss_cls: tensor(1.0108, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1892, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2000, device='cuda:0', grad_fn=<AddBackward0>)



 98%|█████████▊| 346/354 [31:19<00:42,  5.36s/it][A

	loss_cls: tensor(0.6348, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3389, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9737, device='cuda:0', grad_fn=<AddBackward0>)



 98%|█████████▊| 347/354 [31:24<00:37,  5.38s/it][A

	loss_cls: tensor(0.6342, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1045, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7387, device='cuda:0', grad_fn=<AddBackward0>)



 98%|█████████▊| 348/354 [31:30<00:32,  5.37s/it][A

	loss_cls: tensor(0.5246, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2499, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7745, device='cuda:0', grad_fn=<AddBackward0>)



 99%|█████████▊| 349/354 [31:35<00:26,  5.36s/it][A

	loss_cls: tensor(0.6090, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1538, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7628, device='cuda:0', grad_fn=<AddBackward0>)



 99%|█████████▉| 350/354 [31:41<00:21,  5.38s/it][A

	loss_cls: tensor(0.4753, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2646, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7399, device='cuda:0', grad_fn=<AddBackward0>)



 99%|█████████▉| 351/354 [31:46<00:16,  5.37s/it][A

	loss_cls: tensor(0.4414, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2911, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7326, device='cuda:0', grad_fn=<AddBackward0>)



 99%|█████████▉| 352/354 [31:51<00:10,  5.37s/it][A

	loss_cls: tensor(0.5993, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1926, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7920, device='cuda:0', grad_fn=<AddBackward0>)



100%|█████████▉| 353/354 [31:57<00:05,  5.37s/it][A

	loss_cls: tensor(0.6067, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3964, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0031, device='cuda:0', grad_fn=<AddBackward0>)



100%|██████████| 354/354 [31:59<00:00,  5.42s/it][A

  0%|          | 0/177 [00:00<?, ?it/s][A

	loss_cls: tensor(0.4750, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5623, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0372, device='cuda:0', grad_fn=<AddBackward0>)

	Training Loss: 0.8255670296270295

	Training cls acc: 0.7069209039548022

	Training cls prec: 0.5790637799112376

	Training cls rec: 0.618858501432654

	Training cls f1: 0.5460198265709799

--
	Training ner acc: 0.955119274241577

	Training ner prec: 0.2758498077142352

	Training ner rec: 0.28342345906919664

	Training ner f1: 0.2789011369642465

	Current Learning rate:  0.00037142857142857143



  1%|          | 1/177 [00:00<02:01,  1.45it/s][A
  1%|          | 2/177 [00:01<02:05,  1.39it/s][A
  2%|▏         | 3/177 [00:02<02:06,  1.38it/s][A
  2%|▏         | 4/177 [00:02<02:00,  1.44it/s][A
  3%|▎         | 5/177 [00:03<02:02,  1.41it/s][A
  3%|▎         | 6/177 [00:04<02:02,  1.39it/s][A
  4%|▍         | 7/177 [00:05<02:02,  1.38it/s][A
  5%|▍         | 8/177 [00:05<01:58,  1.43it/s][A
  5%|▌         | 9/177 [00:06<01:59,  1.41it/s][A
  6%|▌         | 10/177 [00:07<02:00,  1.38it/s][A
  6%|▌         | 11/177 [00:07<01:56,  1.42it/s][A
  7%|▋         | 12/177 [00:08<01:57,  1.41it/s][A
  7%|▋         | 13/177 [00:09<01:57,  1.39it/s][A
  8%|▊         | 14/177 [00:10<01:58,  1.38it/s][A
  8%|▊         | 15/177 [00:10<01:53,  1.42it/s][A
  9%|▉         | 16/177 [00:11<01:54,  1.40it/s][A
 10%|▉         | 17/177 [00:12<01:55,  1.39it/s][A
 10%|█         | 18/177 [00:12<01:55,  1.38it/s][A
 11%|█         | 19/177 [00:13<01:51,  1.42it/s][A
 11%|█▏        | 20/


	Validation Loss: 0.7889634151916719

	Validation cls acc: 0.6313559322033898

	Validation cls prec: 0.5870056497175141

	Validation cls rec: 0.5620191014258811

	Validation cls f1: 0.5092499151821186

--
	Validation ner acc: 0.9549056097304583

	Validation ner prec: 0.42641242436443016

	Validation ner rec: 0.43681732580037663

	Validation ner f1: 0.4314057963319963



  0%|          | 1/354 [00:05<31:43,  5.39s/it][A

	loss_cls: tensor(0.4126, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2854, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6980, device='cuda:0', grad_fn=<AddBackward0>)



  1%|          | 2/354 [00:10<31:54,  5.44s/it][A

	loss_cls: tensor(0.6512, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1682, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8193, device='cuda:0', grad_fn=<AddBackward0>)



  1%|          | 3/354 [00:16<31:43,  5.42s/it][A

	loss_cls: tensor(0.7103, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3181, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0284, device='cuda:0', grad_fn=<AddBackward0>)



  1%|          | 4/354 [00:21<31:47,  5.45s/it][A

	loss_cls: tensor(0.6569, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2281, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8849, device='cuda:0', grad_fn=<AddBackward0>)



  1%|▏         | 5/354 [00:27<31:34,  5.43s/it][A

	loss_cls: tensor(0.5177, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1704, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6881, device='cuda:0', grad_fn=<AddBackward0>)



  2%|▏         | 6/354 [00:32<31:37,  5.45s/it][A

	loss_cls: tensor(0.6202, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1806, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8009, device='cuda:0', grad_fn=<AddBackward0>)



  2%|▏         | 7/354 [00:38<31:21,  5.42s/it][A

	loss_cls: tensor(0.5723, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2613, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8336, device='cuda:0', grad_fn=<AddBackward0>)



  2%|▏         | 8/354 [00:43<31:10,  5.41s/it][A

	loss_cls: tensor(0.9721, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2082, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1803, device='cuda:0', grad_fn=<AddBackward0>)



  3%|▎         | 9/354 [00:48<31:14,  5.43s/it][A

	loss_cls: tensor(0.3871, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4163, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8034, device='cuda:0', grad_fn=<AddBackward0>)



  3%|▎         | 10/354 [00:54<31:00,  5.41s/it][A

	loss_cls: tensor(0.5343, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4033, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9376, device='cuda:0', grad_fn=<AddBackward0>)



  3%|▎         | 11/354 [00:59<30:58,  5.42s/it][A

	loss_cls: tensor(0.4781, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2963, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7744, device='cuda:0', grad_fn=<AddBackward0>)



  3%|▎         | 12/354 [01:05<30:45,  5.40s/it][A

	loss_cls: tensor(0.9236, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0610, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9846, device='cuda:0', grad_fn=<AddBackward0>)



  4%|▎         | 13/354 [01:10<30:41,  5.40s/it][A

	loss_cls: tensor(0.6992, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2880, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9872, device='cuda:0', grad_fn=<AddBackward0>)



  4%|▍         | 14/354 [01:15<30:28,  5.38s/it][A

	loss_cls: tensor(0.5623, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3056, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8680, device='cuda:0', grad_fn=<AddBackward0>)



  4%|▍         | 15/354 [01:21<30:15,  5.36s/it][A

	loss_cls: tensor(0.5837, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1988, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7825, device='cuda:0', grad_fn=<AddBackward0>)



  5%|▍         | 16/354 [01:26<30:18,  5.38s/it][A

	loss_cls: tensor(0.5197, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2611, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7809, device='cuda:0', grad_fn=<AddBackward0>)



  5%|▍         | 17/354 [01:31<30:10,  5.37s/it][A

	loss_cls: tensor(0.7713, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1126, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8840, device='cuda:0', grad_fn=<AddBackward0>)



  5%|▌         | 18/354 [01:37<30:12,  5.39s/it][A

	loss_cls: tensor(0.9585, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3015, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2600, device='cuda:0', grad_fn=<AddBackward0>)



  5%|▌         | 19/354 [01:42<30:00,  5.37s/it][A

	loss_cls: tensor(0.5794, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0634, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6428, device='cuda:0', grad_fn=<AddBackward0>)



  6%|▌         | 20/354 [01:48<29:59,  5.39s/it][A

	loss_cls: tensor(0.6690, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0806, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7496, device='cuda:0', grad_fn=<AddBackward0>)



  6%|▌         | 21/354 [01:53<29:48,  5.37s/it][A

	loss_cls: tensor(0.6592, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1255, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7847, device='cuda:0', grad_fn=<AddBackward0>)



  6%|▌         | 22/354 [01:58<29:47,  5.38s/it][A

	loss_cls: tensor(0.3958, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3108, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7066, device='cuda:0', grad_fn=<AddBackward0>)



  6%|▋         | 23/354 [02:04<29:36,  5.37s/it][A

	loss_cls: tensor(0.5125, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.6081, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1206, device='cuda:0', grad_fn=<AddBackward0>)



  7%|▋         | 24/354 [02:09<29:28,  5.36s/it][A

	loss_cls: tensor(0.7873, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3610, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1483, device='cuda:0', grad_fn=<AddBackward0>)



  7%|▋         | 25/354 [02:14<29:29,  5.38s/it][A

	loss_cls: tensor(0.6098, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2207, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8305, device='cuda:0', grad_fn=<AddBackward0>)



  7%|▋         | 26/354 [02:20<29:20,  5.37s/it][A

	loss_cls: tensor(0.4014, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1416, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5430, device='cuda:0', grad_fn=<AddBackward0>)



  8%|▊         | 27/354 [02:25<29:20,  5.38s/it][A

	loss_cls: tensor(0.4755, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1097, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5852, device='cuda:0', grad_fn=<AddBackward0>)



  8%|▊         | 28/354 [02:30<29:08,  5.36s/it][A

	loss_cls: tensor(0.8985, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2783, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1768, device='cuda:0', grad_fn=<AddBackward0>)



  8%|▊         | 29/354 [02:36<29:11,  5.39s/it][A

	loss_cls: tensor(0.5207, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2808, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8015, device='cuda:0', grad_fn=<AddBackward0>)



  8%|▊         | 30/354 [02:41<29:02,  5.38s/it][A

	loss_cls: tensor(0.3930, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2428, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6359, device='cuda:0', grad_fn=<AddBackward0>)



  9%|▉         | 31/354 [02:47<28:53,  5.37s/it][A

	loss_cls: tensor(0.7516, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4484, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2000, device='cuda:0', grad_fn=<AddBackward0>)



  9%|▉         | 32/354 [02:52<28:53,  5.38s/it][A

	loss_cls: tensor(0.5626, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0939, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6565, device='cuda:0', grad_fn=<AddBackward0>)



  9%|▉         | 33/354 [02:57<28:43,  5.37s/it][A

	loss_cls: tensor(0.5284, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1555, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6839, device='cuda:0', grad_fn=<AddBackward0>)



 10%|▉         | 34/354 [03:03<28:41,  5.38s/it][A

	loss_cls: tensor(0.6131, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0571, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6702, device='cuda:0', grad_fn=<AddBackward0>)



 10%|▉         | 35/354 [03:08<28:31,  5.36s/it][A

	loss_cls: tensor(0.4573, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0579, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5153, device='cuda:0', grad_fn=<AddBackward0>)



 10%|█         | 36/354 [03:14<28:31,  5.38s/it][A

	loss_cls: tensor(0.6924, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1992, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8916, device='cuda:0', grad_fn=<AddBackward0>)



 10%|█         | 37/354 [03:19<28:22,  5.37s/it][A

	loss_cls: tensor(0.4823, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3979, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8801, device='cuda:0', grad_fn=<AddBackward0>)



 11%|█         | 38/354 [03:24<28:12,  5.36s/it][A

	loss_cls: tensor(0.3409, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1846, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5255, device='cuda:0', grad_fn=<AddBackward0>)



 11%|█         | 39/354 [03:30<28:12,  5.37s/it][A

	loss_cls: tensor(0.6015, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0893, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6908, device='cuda:0', grad_fn=<AddBackward0>)



 11%|█▏        | 40/354 [03:35<28:02,  5.36s/it][A

	loss_cls: tensor(0.5034, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0757, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5791, device='cuda:0', grad_fn=<AddBackward0>)



 12%|█▏        | 41/354 [03:40<28:03,  5.38s/it][A

	loss_cls: tensor(0.5581, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3607, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9188, device='cuda:0', grad_fn=<AddBackward0>)



 12%|█▏        | 42/354 [03:46<27:54,  5.37s/it][A

	loss_cls: tensor(0.5267, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1752, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7019, device='cuda:0', grad_fn=<AddBackward0>)



 12%|█▏        | 43/354 [03:51<27:56,  5.39s/it][A

	loss_cls: tensor(1.0482, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3015, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.3497, device='cuda:0', grad_fn=<AddBackward0>)



 12%|█▏        | 44/354 [03:56<27:46,  5.38s/it][A

	loss_cls: tensor(0.3285, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1308, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4593, device='cuda:0', grad_fn=<AddBackward0>)



 13%|█▎        | 45/354 [04:02<27:38,  5.37s/it][A

	loss_cls: tensor(0.4313, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1740, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6053, device='cuda:0', grad_fn=<AddBackward0>)



 13%|█▎        | 46/354 [04:07<27:39,  5.39s/it][A

	loss_cls: tensor(0.4103, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2205, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6308, device='cuda:0', grad_fn=<AddBackward0>)



 13%|█▎        | 47/354 [04:13<27:28,  5.37s/it][A

	loss_cls: tensor(0.5034, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1028, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6062, device='cuda:0', grad_fn=<AddBackward0>)



 14%|█▎        | 48/354 [04:18<27:26,  5.38s/it][A

	loss_cls: tensor(0.4774, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1667, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6441, device='cuda:0', grad_fn=<AddBackward0>)



 14%|█▍        | 49/354 [04:23<27:17,  5.37s/it][A

	loss_cls: tensor(0.4343, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1005, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5348, device='cuda:0', grad_fn=<AddBackward0>)



 14%|█▍        | 50/354 [04:29<27:18,  5.39s/it][A

	loss_cls: tensor(0.8832, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5095, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.3927, device='cuda:0', grad_fn=<AddBackward0>)



 14%|█▍        | 51/354 [04:34<27:09,  5.38s/it][A

	loss_cls: tensor(0.9382, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2602, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1983, device='cuda:0', grad_fn=<AddBackward0>)



 15%|█▍        | 52/354 [04:40<27:04,  5.38s/it][A

	loss_cls: tensor(0.4804, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0802, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5607, device='cuda:0', grad_fn=<AddBackward0>)



 15%|█▍        | 53/354 [04:45<27:07,  5.41s/it][A

	loss_cls: tensor(0.6762, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1700, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8462, device='cuda:0', grad_fn=<AddBackward0>)



 15%|█▌        | 54/354 [04:50<26:55,  5.39s/it][A

	loss_cls: tensor(0.4007, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0824, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4831, device='cuda:0', grad_fn=<AddBackward0>)



 16%|█▌        | 55/354 [04:56<26:53,  5.40s/it][A

	loss_cls: tensor(0.4490, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3774, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8264, device='cuda:0', grad_fn=<AddBackward0>)



 16%|█▌        | 56/354 [05:01<26:42,  5.38s/it][A

	loss_cls: tensor(0.3089, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1829, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4918, device='cuda:0', grad_fn=<AddBackward0>)



 16%|█▌        | 57/354 [05:06<26:42,  5.39s/it][A

	loss_cls: tensor(0.3328, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2351, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5679, device='cuda:0', grad_fn=<AddBackward0>)



 16%|█▋        | 58/354 [05:12<26:31,  5.38s/it][A

	loss_cls: tensor(0.4413, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1214, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5627, device='cuda:0', grad_fn=<AddBackward0>)



 17%|█▋        | 59/354 [05:17<26:23,  5.37s/it][A

	loss_cls: tensor(0.5592, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1697, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7289, device='cuda:0', grad_fn=<AddBackward0>)



 17%|█▋        | 60/354 [05:23<26:21,  5.38s/it][A

	loss_cls: tensor(0.7583, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1524, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9107, device='cuda:0', grad_fn=<AddBackward0>)



 17%|█▋        | 61/354 [05:28<26:09,  5.36s/it][A

	loss_cls: tensor(0.4506, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0850, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5357, device='cuda:0', grad_fn=<AddBackward0>)



 18%|█▊        | 62/354 [05:33<26:10,  5.38s/it][A

	loss_cls: tensor(0.4615, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1727, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6342, device='cuda:0', grad_fn=<AddBackward0>)



 18%|█▊        | 63/354 [05:39<26:02,  5.37s/it][A

	loss_cls: tensor(0.7033, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2430, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9463, device='cuda:0', grad_fn=<AddBackward0>)



 18%|█▊        | 64/354 [05:44<26:03,  5.39s/it][A

	loss_cls: tensor(1.0061, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1232, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1293, device='cuda:0', grad_fn=<AddBackward0>)



 18%|█▊        | 65/354 [05:49<25:54,  5.38s/it][A

	loss_cls: tensor(0.5957, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0997, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6954, device='cuda:0', grad_fn=<AddBackward0>)



 19%|█▊        | 66/354 [05:55<25:47,  5.37s/it][A

	loss_cls: tensor(0.3175, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0911, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4086, device='cuda:0', grad_fn=<AddBackward0>)



 19%|█▉        | 67/354 [06:00<25:47,  5.39s/it][A

	loss_cls: tensor(0.7628, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1684, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9312, device='cuda:0', grad_fn=<AddBackward0>)



 19%|█▉        | 68/354 [06:06<25:35,  5.37s/it][A

	loss_cls: tensor(0.7086, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3870, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0956, device='cuda:0', grad_fn=<AddBackward0>)



 19%|█▉        | 69/354 [06:11<25:35,  5.39s/it][A

	loss_cls: tensor(0.7343, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2910, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0253, device='cuda:0', grad_fn=<AddBackward0>)



 20%|█▉        | 70/354 [06:16<25:26,  5.37s/it][A

	loss_cls: tensor(0.7189, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1887, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9076, device='cuda:0', grad_fn=<AddBackward0>)



 20%|██        | 71/354 [06:22<25:27,  5.40s/it][A

	loss_cls: tensor(0.6545, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2369, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8914, device='cuda:0', grad_fn=<AddBackward0>)



 20%|██        | 72/354 [06:27<25:19,  5.39s/it][A

	loss_cls: tensor(0.6434, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1800, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8234, device='cuda:0', grad_fn=<AddBackward0>)



 21%|██        | 73/354 [06:33<25:10,  5.37s/it][A

	loss_cls: tensor(0.9711, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2655, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2365, device='cuda:0', grad_fn=<AddBackward0>)



 21%|██        | 74/354 [06:38<25:11,  5.40s/it][A

	loss_cls: tensor(0.3892, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0901, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4793, device='cuda:0', grad_fn=<AddBackward0>)



 21%|██        | 75/354 [06:43<25:01,  5.38s/it][A

	loss_cls: tensor(0.5782, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2970, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8752, device='cuda:0', grad_fn=<AddBackward0>)



 21%|██▏       | 76/354 [06:49<24:59,  5.40s/it][A

	loss_cls: tensor(0.3666, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2491, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6157, device='cuda:0', grad_fn=<AddBackward0>)



 22%|██▏       | 77/354 [06:54<24:52,  5.39s/it][A

	loss_cls: tensor(0.4772, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4243, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9015, device='cuda:0', grad_fn=<AddBackward0>)



 22%|██▏       | 78/354 [07:00<24:51,  5.40s/it][A

	loss_cls: tensor(0.7442, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1395, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8837, device='cuda:0', grad_fn=<AddBackward0>)



 22%|██▏       | 79/354 [07:05<24:40,  5.38s/it][A

	loss_cls: tensor(0.8007, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1510, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9517, device='cuda:0', grad_fn=<AddBackward0>)



 23%|██▎       | 80/354 [07:10<24:33,  5.38s/it][A

	loss_cls: tensor(0.8557, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1149, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9706, device='cuda:0', grad_fn=<AddBackward0>)



 23%|██▎       | 81/354 [07:16<24:33,  5.40s/it][A

	loss_cls: tensor(0.7143, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0511, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7654, device='cuda:0', grad_fn=<AddBackward0>)



 23%|██▎       | 82/354 [07:21<24:25,  5.39s/it][A

	loss_cls: tensor(0.6995, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1490, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8485, device='cuda:0', grad_fn=<AddBackward0>)



 23%|██▎       | 83/354 [07:26<24:23,  5.40s/it][A

	loss_cls: tensor(0.5614, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1396, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7009, device='cuda:0', grad_fn=<AddBackward0>)



 24%|██▎       | 84/354 [07:32<24:15,  5.39s/it][A

	loss_cls: tensor(0.5895, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2800, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8696, device='cuda:0', grad_fn=<AddBackward0>)



 24%|██▍       | 85/354 [07:37<24:12,  5.40s/it][A

	loss_cls: tensor(0.6499, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3002, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9501, device='cuda:0', grad_fn=<AddBackward0>)



 24%|██▍       | 86/354 [07:43<24:02,  5.38s/it][A

	loss_cls: tensor(0.4741, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3285, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8026, device='cuda:0', grad_fn=<AddBackward0>)



 25%|██▍       | 87/354 [07:48<23:55,  5.38s/it][A

	loss_cls: tensor(0.5537, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2688, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8226, device='cuda:0', grad_fn=<AddBackward0>)



 25%|██▍       | 88/354 [07:53<23:53,  5.39s/it][A

	loss_cls: tensor(0.5890, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0872, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6761, device='cuda:0', grad_fn=<AddBackward0>)



 25%|██▌       | 89/354 [07:59<23:44,  5.38s/it][A

	loss_cls: tensor(0.5291, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2479, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7769, device='cuda:0', grad_fn=<AddBackward0>)



 25%|██▌       | 90/354 [08:04<23:42,  5.39s/it][A

	loss_cls: tensor(0.6971, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3213, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0184, device='cuda:0', grad_fn=<AddBackward0>)



 26%|██▌       | 91/354 [08:10<23:34,  5.38s/it][A

	loss_cls: tensor(0.8920, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2422, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1342, device='cuda:0', grad_fn=<AddBackward0>)



 26%|██▌       | 92/354 [08:15<23:31,  5.39s/it][A

	loss_cls: tensor(0.6005, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0412, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6417, device='cuda:0', grad_fn=<AddBackward0>)



 26%|██▋       | 93/354 [08:20<23:21,  5.37s/it][A

	loss_cls: tensor(0.8405, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2715, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1120, device='cuda:0', grad_fn=<AddBackward0>)



 27%|██▋       | 94/354 [08:26<23:12,  5.36s/it][A

	loss_cls: tensor(0.4985, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5288, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0272, device='cuda:0', grad_fn=<AddBackward0>)



 27%|██▋       | 95/354 [08:31<23:11,  5.37s/it][A

	loss_cls: tensor(0.6285, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2834, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9119, device='cuda:0', grad_fn=<AddBackward0>)



 27%|██▋       | 96/354 [08:36<23:01,  5.36s/it][A

	loss_cls: tensor(0.4946, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1318, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6265, device='cuda:0', grad_fn=<AddBackward0>)



 27%|██▋       | 97/354 [08:42<23:01,  5.38s/it][A

	loss_cls: tensor(0.4484, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1473, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5956, device='cuda:0', grad_fn=<AddBackward0>)



 28%|██▊       | 98/354 [08:47<22:51,  5.36s/it][A

	loss_cls: tensor(0.5693, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2055, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7749, device='cuda:0', grad_fn=<AddBackward0>)



 28%|██▊       | 99/354 [08:52<22:51,  5.38s/it][A

	loss_cls: tensor(0.8238, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2256, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0495, device='cuda:0', grad_fn=<AddBackward0>)



 28%|██▊       | 100/354 [08:58<22:45,  5.38s/it][A

	loss_cls: tensor(0.5217, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3638, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8855, device='cuda:0', grad_fn=<AddBackward0>)



 29%|██▊       | 101/354 [09:03<22:35,  5.36s/it][A

	loss_cls: tensor(0.8131, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2495, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0625, device='cuda:0', grad_fn=<AddBackward0>)



 29%|██▉       | 102/354 [09:09<22:45,  5.42s/it][A

	loss_cls: tensor(0.7931, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2511, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0442, device='cuda:0', grad_fn=<AddBackward0>)



 29%|██▉       | 103/354 [09:14<22:46,  5.44s/it][A

	loss_cls: tensor(0.7260, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3184, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0444, device='cuda:0', grad_fn=<AddBackward0>)



 29%|██▉       | 104/354 [09:20<22:51,  5.49s/it][A

	loss_cls: tensor(0.4819, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3070, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7888, device='cuda:0', grad_fn=<AddBackward0>)



 30%|██▉       | 105/354 [09:25<22:47,  5.49s/it][A

	loss_cls: tensor(0.6324, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1279, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7603, device='cuda:0', grad_fn=<AddBackward0>)



 30%|██▉       | 106/354 [09:31<22:46,  5.51s/it][A

	loss_cls: tensor(0.5244, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1758, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7002, device='cuda:0', grad_fn=<AddBackward0>)



 30%|███       | 107/354 [09:36<21:55,  5.33s/it][A

	loss_cls: tensor(0.4662, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2395, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7057, device='cuda:0', grad_fn=<AddBackward0>)



 31%|███       | 108/354 [09:41<21:19,  5.20s/it][A

	loss_cls: tensor(0.3780, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3597, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7377, device='cuda:0', grad_fn=<AddBackward0>)



 31%|███       | 109/354 [09:46<21:39,  5.30s/it][A

	loss_cls: tensor(0.4393, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1924, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6317, device='cuda:0', grad_fn=<AddBackward0>)



 31%|███       | 110/354 [09:52<21:47,  5.36s/it][A

	loss_cls: tensor(0.4990, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1552, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6543, device='cuda:0', grad_fn=<AddBackward0>)



 31%|███▏      | 111/354 [09:57<21:14,  5.24s/it][A

	loss_cls: tensor(0.5896, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2027, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7923, device='cuda:0', grad_fn=<AddBackward0>)



 32%|███▏      | 112/354 [10:02<20:45,  5.15s/it][A

	loss_cls: tensor(0.4564, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2456, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7020, device='cuda:0', grad_fn=<AddBackward0>)



 32%|███▏      | 113/354 [10:07<20:41,  5.15s/it][A

	loss_cls: tensor(0.6189, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2300, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8490, device='cuda:0', grad_fn=<AddBackward0>)



 32%|███▏      | 114/354 [10:12<20:59,  5.25s/it][A

	loss_cls: tensor(0.4964, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1620, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6584, device='cuda:0', grad_fn=<AddBackward0>)



 32%|███▏      | 115/354 [10:18<21:11,  5.32s/it][A

	loss_cls: tensor(0.6472, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2483, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8955, device='cuda:0', grad_fn=<AddBackward0>)



 33%|███▎      | 116/354 [10:23<21:24,  5.40s/it][A

	loss_cls: tensor(0.5426, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0445, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5871, device='cuda:0', grad_fn=<AddBackward0>)



 33%|███▎      | 117/354 [10:29<21:26,  5.43s/it][A

	loss_cls: tensor(0.5495, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0849, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6344, device='cuda:0', grad_fn=<AddBackward0>)



 33%|███▎      | 118/354 [10:34<21:30,  5.47s/it][A

	loss_cls: tensor(0.5940, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1841, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7781, device='cuda:0', grad_fn=<AddBackward0>)



 34%|███▎      | 119/354 [10:40<21:29,  5.49s/it][A

	loss_cls: tensor(0.4724, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2950, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7674, device='cuda:0', grad_fn=<AddBackward0>)



 34%|███▍      | 120/354 [10:45<21:31,  5.52s/it][A

	loss_cls: tensor(0.5362, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3223, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8586, device='cuda:0', grad_fn=<AddBackward0>)



 34%|███▍      | 121/354 [10:51<21:21,  5.50s/it][A

	loss_cls: tensor(0.4790, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3525, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8315, device='cuda:0', grad_fn=<AddBackward0>)



 34%|███▍      | 122/354 [10:56<21:15,  5.50s/it][A

	loss_cls: tensor(0.6410, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3727, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0137, device='cuda:0', grad_fn=<AddBackward0>)



 35%|███▍      | 123/354 [11:02<21:10,  5.50s/it][A

	loss_cls: tensor(0.6601, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0370, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6972, device='cuda:0', grad_fn=<AddBackward0>)



 35%|███▌      | 124/354 [11:07<21:03,  5.50s/it][A

	loss_cls: tensor(0.7782, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2340, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0123, device='cuda:0', grad_fn=<AddBackward0>)



 35%|███▌      | 125/354 [11:13<20:47,  5.45s/it][A

	loss_cls: tensor(0.5595, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.6415, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2011, device='cuda:0', grad_fn=<AddBackward0>)



 36%|███▌      | 126/354 [11:18<20:35,  5.42s/it][A

	loss_cls: tensor(0.6822, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2549, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9371, device='cuda:0', grad_fn=<AddBackward0>)



 36%|███▌      | 127/354 [11:24<20:32,  5.43s/it][A

	loss_cls: tensor(0.7426, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1987, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9412, device='cuda:0', grad_fn=<AddBackward0>)



 36%|███▌      | 128/354 [11:29<20:19,  5.40s/it][A

	loss_cls: tensor(0.4721, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1025, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5747, device='cuda:0', grad_fn=<AddBackward0>)



 36%|███▋      | 129/354 [11:34<20:17,  5.41s/it][A

	loss_cls: tensor(0.5564, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1825, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7390, device='cuda:0', grad_fn=<AddBackward0>)



 37%|███▋      | 130/354 [11:40<20:08,  5.39s/it][A

	loss_cls: tensor(0.5221, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2142, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7363, device='cuda:0', grad_fn=<AddBackward0>)



 37%|███▋      | 131/354 [11:45<19:59,  5.38s/it][A

	loss_cls: tensor(0.5319, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3550, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8869, device='cuda:0', grad_fn=<AddBackward0>)



 37%|███▋      | 132/354 [11:50<19:57,  5.39s/it][A

	loss_cls: tensor(0.5975, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0939, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6913, device='cuda:0', grad_fn=<AddBackward0>)



 38%|███▊      | 133/354 [11:56<19:48,  5.38s/it][A

	loss_cls: tensor(0.5693, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1246, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6940, device='cuda:0', grad_fn=<AddBackward0>)



 38%|███▊      | 134/354 [12:01<19:44,  5.39s/it][A

	loss_cls: tensor(0.4546, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1259, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5805, device='cuda:0', grad_fn=<AddBackward0>)



 38%|███▊      | 135/354 [12:07<19:34,  5.36s/it][A

	loss_cls: tensor(0.5263, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0685, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5948, device='cuda:0', grad_fn=<AddBackward0>)



 38%|███▊      | 136/354 [12:12<19:32,  5.38s/it][A

	loss_cls: tensor(0.4891, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2407, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7298, device='cuda:0', grad_fn=<AddBackward0>)



 39%|███▊      | 137/354 [12:17<19:23,  5.36s/it][A

	loss_cls: tensor(0.8142, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2413, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0555, device='cuda:0', grad_fn=<AddBackward0>)



 39%|███▉      | 138/354 [12:23<19:15,  5.35s/it][A

	loss_cls: tensor(0.6831, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4930, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1761, device='cuda:0', grad_fn=<AddBackward0>)



 39%|███▉      | 139/354 [12:28<19:12,  5.36s/it][A

	loss_cls: tensor(0.9326, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2987, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2313, device='cuda:0', grad_fn=<AddBackward0>)



 40%|███▉      | 140/354 [12:33<19:04,  5.35s/it][A

	loss_cls: tensor(0.6984, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5938, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2922, device='cuda:0', grad_fn=<AddBackward0>)



 40%|███▉      | 141/354 [12:39<19:01,  5.36s/it][A

	loss_cls: tensor(0.4536, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1117, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5653, device='cuda:0', grad_fn=<AddBackward0>)



 40%|████      | 142/354 [12:44<18:53,  5.35s/it][A

	loss_cls: tensor(0.7246, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1388, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8634, device='cuda:0', grad_fn=<AddBackward0>)



 40%|████      | 143/354 [12:49<18:52,  5.37s/it][A

	loss_cls: tensor(0.4588, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4119, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8707, device='cuda:0', grad_fn=<AddBackward0>)



 41%|████      | 144/354 [12:55<18:46,  5.37s/it][A

	loss_cls: tensor(0.6295, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0558, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6853, device='cuda:0', grad_fn=<AddBackward0>)



 41%|████      | 145/354 [13:00<18:40,  5.36s/it][A

	loss_cls: tensor(0.7496, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2400, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9896, device='cuda:0', grad_fn=<AddBackward0>)



 41%|████      | 146/354 [13:06<18:39,  5.38s/it][A

	loss_cls: tensor(0.6757, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2488, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9244, device='cuda:0', grad_fn=<AddBackward0>)



 42%|████▏     | 147/354 [13:11<18:31,  5.37s/it][A

	loss_cls: tensor(0.4771, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3141, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7911, device='cuda:0', grad_fn=<AddBackward0>)



 42%|████▏     | 148/354 [13:16<18:28,  5.38s/it][A

	loss_cls: tensor(0.5246, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1517, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6763, device='cuda:0', grad_fn=<AddBackward0>)



 42%|████▏     | 149/354 [13:22<18:21,  5.37s/it][A

	loss_cls: tensor(0.5315, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2066, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7381, device='cuda:0', grad_fn=<AddBackward0>)



 42%|████▏     | 150/354 [13:27<18:19,  5.39s/it][A

	loss_cls: tensor(0.4772, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4201, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8973, device='cuda:0', grad_fn=<AddBackward0>)



 43%|████▎     | 151/354 [13:32<18:12,  5.38s/it][A

	loss_cls: tensor(0.6266, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2923, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9189, device='cuda:0', grad_fn=<AddBackward0>)



 43%|████▎     | 152/354 [13:38<18:04,  5.37s/it][A

	loss_cls: tensor(0.8207, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1975, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0182, device='cuda:0', grad_fn=<AddBackward0>)



 43%|████▎     | 153/354 [13:43<18:02,  5.39s/it][A

	loss_cls: tensor(0.4804, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0885, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5689, device='cuda:0', grad_fn=<AddBackward0>)



 44%|████▎     | 154/354 [13:49<17:54,  5.37s/it][A

	loss_cls: tensor(0.6905, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1863, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8767, device='cuda:0', grad_fn=<AddBackward0>)



 44%|████▍     | 155/354 [13:54<17:52,  5.39s/it][A

	loss_cls: tensor(1.0106, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0940, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1046, device='cuda:0', grad_fn=<AddBackward0>)



 44%|████▍     | 156/354 [13:59<17:45,  5.38s/it][A

	loss_cls: tensor(0.7332, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3677, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1008, device='cuda:0', grad_fn=<AddBackward0>)



 44%|████▍     | 157/354 [14:05<17:42,  5.39s/it][A

	loss_cls: tensor(0.9118, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1456, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0573, device='cuda:0', grad_fn=<AddBackward0>)



 45%|████▍     | 158/354 [14:10<17:34,  5.38s/it][A

	loss_cls: tensor(0.5967, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2112, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8080, device='cuda:0', grad_fn=<AddBackward0>)



 45%|████▍     | 159/354 [14:15<17:27,  5.37s/it][A

	loss_cls: tensor(0.8139, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0849, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8988, device='cuda:0', grad_fn=<AddBackward0>)



 45%|████▌     | 160/354 [14:21<17:24,  5.38s/it][A

	loss_cls: tensor(0.4749, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4547, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9296, device='cuda:0', grad_fn=<AddBackward0>)



 45%|████▌     | 161/354 [14:26<17:15,  5.37s/it][A

	loss_cls: tensor(0.5749, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2869, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8618, device='cuda:0', grad_fn=<AddBackward0>)



 46%|████▌     | 162/354 [14:32<17:14,  5.39s/it][A

	loss_cls: tensor(0.4245, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1077, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5321, device='cuda:0', grad_fn=<AddBackward0>)



 46%|████▌     | 163/354 [14:37<17:08,  5.38s/it][A

	loss_cls: tensor(0.4820, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1661, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6481, device='cuda:0', grad_fn=<AddBackward0>)



 46%|████▋     | 164/354 [14:42<17:05,  5.40s/it][A

	loss_cls: tensor(0.5435, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1781, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7215, device='cuda:0', grad_fn=<AddBackward0>)



 47%|████▋     | 165/354 [14:48<16:57,  5.38s/it][A

	loss_cls: tensor(0.7194, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1364, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8558, device='cuda:0', grad_fn=<AddBackward0>)



 47%|████▋     | 166/354 [14:53<16:49,  5.37s/it][A

	loss_cls: tensor(0.4929, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1325, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6254, device='cuda:0', grad_fn=<AddBackward0>)



 47%|████▋     | 167/354 [14:59<16:48,  5.40s/it][A

	loss_cls: tensor(0.4447, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2030, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6477, device='cuda:0', grad_fn=<AddBackward0>)



 47%|████▋     | 168/354 [15:04<16:41,  5.38s/it][A

	loss_cls: tensor(0.6263, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1860, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8123, device='cuda:0', grad_fn=<AddBackward0>)



 48%|████▊     | 169/354 [15:09<16:36,  5.39s/it][A

	loss_cls: tensor(0.5847, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0327, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6174, device='cuda:0', grad_fn=<AddBackward0>)



 48%|████▊     | 170/354 [15:15<16:32,  5.39s/it][A

	loss_cls: tensor(0.4994, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4591, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9585, device='cuda:0', grad_fn=<AddBackward0>)



 48%|████▊     | 171/354 [15:20<16:29,  5.41s/it][A

	loss_cls: tensor(0.7256, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0591, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7847, device='cuda:0', grad_fn=<AddBackward0>)



 49%|████▊     | 172/354 [15:26<16:22,  5.40s/it][A

	loss_cls: tensor(0.5228, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1844, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7072, device='cuda:0', grad_fn=<AddBackward0>)



 49%|████▉     | 173/354 [15:31<16:13,  5.38s/it][A

	loss_cls: tensor(0.4940, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2528, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7469, device='cuda:0', grad_fn=<AddBackward0>)



 49%|████▉     | 174/354 [15:36<16:11,  5.40s/it][A

	loss_cls: tensor(0.3937, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1348, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5285, device='cuda:0', grad_fn=<AddBackward0>)



 49%|████▉     | 175/354 [15:42<16:04,  5.39s/it][A

	loss_cls: tensor(0.5462, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0955, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6417, device='cuda:0', grad_fn=<AddBackward0>)



 50%|████▉     | 176/354 [15:47<16:01,  5.40s/it][A

	loss_cls: tensor(0.7467, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2311, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9778, device='cuda:0', grad_fn=<AddBackward0>)



 50%|█████     | 177/354 [15:53<15:54,  5.39s/it][A

	loss_cls: tensor(0.7319, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1629, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8948, device='cuda:0', grad_fn=<AddBackward0>)



 50%|█████     | 178/354 [15:58<15:52,  5.41s/it][A

	loss_cls: tensor(0.4768, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2634, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7402, device='cuda:0', grad_fn=<AddBackward0>)



 51%|█████     | 179/354 [16:03<15:43,  5.39s/it][A

	loss_cls: tensor(0.6079, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1707, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7787, device='cuda:0', grad_fn=<AddBackward0>)



 51%|█████     | 180/354 [16:09<15:35,  5.38s/it][A

	loss_cls: tensor(0.2892, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1925, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4817, device='cuda:0', grad_fn=<AddBackward0>)



 51%|█████     | 181/354 [16:14<15:31,  5.39s/it][A

	loss_cls: tensor(0.3874, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1345, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5219, device='cuda:0', grad_fn=<AddBackward0>)



 51%|█████▏    | 182/354 [16:19<15:23,  5.37s/it][A

	loss_cls: tensor(0.6742, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0965, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7707, device='cuda:0', grad_fn=<AddBackward0>)



 52%|█████▏    | 183/354 [16:25<15:21,  5.39s/it][A

	loss_cls: tensor(0.4100, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0396, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4497, device='cuda:0', grad_fn=<AddBackward0>)



 52%|█████▏    | 184/354 [16:30<15:15,  5.38s/it][A

	loss_cls: tensor(0.4461, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1232, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5693, device='cuda:0', grad_fn=<AddBackward0>)



 52%|█████▏    | 185/354 [16:36<15:12,  5.40s/it][A

	loss_cls: tensor(0.3555, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0792, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4347, device='cuda:0', grad_fn=<AddBackward0>)



 53%|█████▎    | 186/354 [16:41<15:05,  5.39s/it][A

	loss_cls: tensor(0.3143, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0470, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.3613, device='cuda:0', grad_fn=<AddBackward0>)



 53%|█████▎    | 187/354 [16:46<14:58,  5.38s/it][A

	loss_cls: tensor(0.2728, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0321, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.3049, device='cuda:0', grad_fn=<AddBackward0>)



 53%|█████▎    | 188/354 [16:52<14:55,  5.39s/it][A

	loss_cls: tensor(0.6701, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2876, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9577, device='cuda:0', grad_fn=<AddBackward0>)



 53%|█████▎    | 189/354 [16:57<14:48,  5.38s/it][A

	loss_cls: tensor(0.7573, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3105, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0678, device='cuda:0', grad_fn=<AddBackward0>)



 54%|█████▎    | 190/354 [17:03<14:47,  5.41s/it][A

	loss_cls: tensor(0.7339, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.6758, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.4097, device='cuda:0', grad_fn=<AddBackward0>)



 54%|█████▍    | 191/354 [17:08<14:40,  5.40s/it][A

	loss_cls: tensor(0.6410, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2341, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8751, device='cuda:0', grad_fn=<AddBackward0>)



 54%|█████▍    | 192/354 [17:13<14:35,  5.40s/it][A

	loss_cls: tensor(0.4823, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1040, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5862, device='cuda:0', grad_fn=<AddBackward0>)



 55%|█████▍    | 193/354 [17:19<14:28,  5.39s/it][A

	loss_cls: tensor(0.6521, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2378, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8900, device='cuda:0', grad_fn=<AddBackward0>)



 55%|█████▍    | 194/354 [17:24<14:22,  5.39s/it][A

	loss_cls: tensor(0.7087, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2826, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9914, device='cuda:0', grad_fn=<AddBackward0>)



 55%|█████▌    | 195/354 [17:30<14:18,  5.40s/it][A

	loss_cls: tensor(0.7536, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2253, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9789, device='cuda:0', grad_fn=<AddBackward0>)



 55%|█████▌    | 196/354 [17:35<14:11,  5.39s/it][A

	loss_cls: tensor(1.1661, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3216, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.4878, device='cuda:0', grad_fn=<AddBackward0>)



 56%|█████▌    | 197/354 [17:40<14:10,  5.42s/it][A

	loss_cls: tensor(0.3335, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0435, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.3770, device='cuda:0', grad_fn=<AddBackward0>)



 56%|█████▌    | 198/354 [17:46<14:02,  5.40s/it][A

	loss_cls: tensor(0.6963, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2327, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9290, device='cuda:0', grad_fn=<AddBackward0>)



 56%|█████▌    | 199/354 [17:51<13:59,  5.42s/it][A

	loss_cls: tensor(0.6175, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1642, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7817, device='cuda:0', grad_fn=<AddBackward0>)



 56%|█████▋    | 200/354 [17:57<13:53,  5.41s/it][A

	loss_cls: tensor(0.6594, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2260, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8854, device='cuda:0', grad_fn=<AddBackward0>)



 57%|█████▋    | 201/354 [18:02<13:45,  5.39s/it][A

	loss_cls: tensor(0.9388, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2107, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1495, device='cuda:0', grad_fn=<AddBackward0>)



 57%|█████▋    | 202/354 [18:07<13:43,  5.42s/it][A

	loss_cls: tensor(0.4432, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0473, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4904, device='cuda:0', grad_fn=<AddBackward0>)



 57%|█████▋    | 203/354 [18:13<13:37,  5.41s/it][A

	loss_cls: tensor(0.6500, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2674, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9174, device='cuda:0', grad_fn=<AddBackward0>)



 58%|█████▊    | 204/354 [18:18<13:33,  5.42s/it][A

	loss_cls: tensor(0.5650, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3909, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9559, device='cuda:0', grad_fn=<AddBackward0>)



 58%|█████▊    | 205/354 [18:24<13:25,  5.41s/it][A

	loss_cls: tensor(0.4173, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3728, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7901, device='cuda:0', grad_fn=<AddBackward0>)



 58%|█████▊    | 206/354 [18:29<13:22,  5.42s/it][A

	loss_cls: tensor(0.6204, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2909, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9113, device='cuda:0', grad_fn=<AddBackward0>)



 58%|█████▊    | 207/354 [18:34<13:13,  5.40s/it][A

	loss_cls: tensor(0.5105, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1734, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6839, device='cuda:0', grad_fn=<AddBackward0>)



 59%|█████▉    | 208/354 [18:40<13:05,  5.38s/it][A

	loss_cls: tensor(0.4620, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1131, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5751, device='cuda:0', grad_fn=<AddBackward0>)



 59%|█████▉    | 209/354 [18:45<13:02,  5.40s/it][A

	loss_cls: tensor(0.5409, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2784, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8193, device='cuda:0', grad_fn=<AddBackward0>)



 59%|█████▉    | 210/354 [18:51<12:55,  5.39s/it][A

	loss_cls: tensor(0.6820, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2588, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9408, device='cuda:0', grad_fn=<AddBackward0>)



 60%|█████▉    | 211/354 [18:56<12:53,  5.41s/it][A

	loss_cls: tensor(0.5241, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2159, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7400, device='cuda:0', grad_fn=<AddBackward0>)



 60%|█████▉    | 212/354 [19:01<12:46,  5.40s/it][A

	loss_cls: tensor(0.5937, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1821, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7758, device='cuda:0', grad_fn=<AddBackward0>)



 60%|██████    | 213/354 [19:07<12:43,  5.41s/it][A

	loss_cls: tensor(0.3578, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1929, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5506, device='cuda:0', grad_fn=<AddBackward0>)



 60%|██████    | 214/354 [19:12<12:35,  5.39s/it][A

	loss_cls: tensor(0.6146, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3072, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9218, device='cuda:0', grad_fn=<AddBackward0>)



 61%|██████    | 215/354 [19:18<12:27,  5.38s/it][A

	loss_cls: tensor(0.4020, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1962, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5982, device='cuda:0', grad_fn=<AddBackward0>)



 61%|██████    | 216/354 [19:23<12:26,  5.41s/it][A

	loss_cls: tensor(0.5631, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1964, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7596, device='cuda:0', grad_fn=<AddBackward0>)



 61%|██████▏   | 217/354 [19:29<12:21,  5.41s/it][A

	loss_cls: tensor(0.5354, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1008, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6362, device='cuda:0', grad_fn=<AddBackward0>)



 62%|██████▏   | 218/354 [19:34<12:17,  5.42s/it][A

	loss_cls: tensor(1.1873, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1983, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.3856, device='cuda:0', grad_fn=<AddBackward0>)



 62%|██████▏   | 219/354 [19:39<12:10,  5.41s/it][A

	loss_cls: tensor(0.6448, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4038, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0486, device='cuda:0', grad_fn=<AddBackward0>)



 62%|██████▏   | 220/354 [19:45<12:05,  5.41s/it][A

	loss_cls: tensor(0.5565, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1506, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7072, device='cuda:0', grad_fn=<AddBackward0>)



 62%|██████▏   | 221/354 [19:50<11:58,  5.40s/it][A

	loss_cls: tensor(0.6505, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1782, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8287, device='cuda:0', grad_fn=<AddBackward0>)



 63%|██████▎   | 222/354 [19:56<11:53,  5.41s/it][A

	loss_cls: tensor(0.6696, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0442, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7137, device='cuda:0', grad_fn=<AddBackward0>)



 63%|██████▎   | 223/354 [20:01<11:47,  5.40s/it][A

	loss_cls: tensor(0.5869, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2434, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8303, device='cuda:0', grad_fn=<AddBackward0>)



 63%|██████▎   | 224/354 [20:06<11:42,  5.40s/it][A

	loss_cls: tensor(0.9305, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1337, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0642, device='cuda:0', grad_fn=<AddBackward0>)



 64%|██████▎   | 225/354 [20:12<11:37,  5.41s/it][A

	loss_cls: tensor(0.5932, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1977, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7909, device='cuda:0', grad_fn=<AddBackward0>)



 64%|██████▍   | 226/354 [20:17<11:30,  5.39s/it][A

	loss_cls: tensor(0.6339, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1700, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8039, device='cuda:0', grad_fn=<AddBackward0>)



 64%|██████▍   | 227/354 [20:23<11:27,  5.41s/it][A

	loss_cls: tensor(0.8703, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0856, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9560, device='cuda:0', grad_fn=<AddBackward0>)



 64%|██████▍   | 228/354 [20:28<11:19,  5.40s/it][A

	loss_cls: tensor(0.5153, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2225, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7378, device='cuda:0', grad_fn=<AddBackward0>)



 65%|██████▍   | 229/354 [20:33<11:16,  5.41s/it][A

	loss_cls: tensor(0.9649, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3066, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2716, device='cuda:0', grad_fn=<AddBackward0>)



 65%|██████▍   | 230/354 [20:39<11:09,  5.40s/it][A

	loss_cls: tensor(0.4942, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0262, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5204, device='cuda:0', grad_fn=<AddBackward0>)



 65%|██████▌   | 231/354 [20:44<11:03,  5.39s/it][A

	loss_cls: tensor(0.5585, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3402, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8987, device='cuda:0', grad_fn=<AddBackward0>)



 66%|██████▌   | 232/354 [20:50<10:59,  5.40s/it][A

	loss_cls: tensor(0.7036, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3935, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0971, device='cuda:0', grad_fn=<AddBackward0>)



 66%|██████▌   | 233/354 [20:55<10:52,  5.39s/it][A

	loss_cls: tensor(0.4892, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3065, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7957, device='cuda:0', grad_fn=<AddBackward0>)



 66%|██████▌   | 234/354 [21:00<10:48,  5.40s/it][A

	loss_cls: tensor(0.4092, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1813, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5904, device='cuda:0', grad_fn=<AddBackward0>)



 66%|██████▋   | 235/354 [21:06<10:40,  5.38s/it][A

	loss_cls: tensor(0.6908, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3929, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0838, device='cuda:0', grad_fn=<AddBackward0>)



 67%|██████▋   | 236/354 [21:11<10:38,  5.41s/it][A

	loss_cls: tensor(0.6383, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3275, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9658, device='cuda:0', grad_fn=<AddBackward0>)



 67%|██████▋   | 237/354 [21:17<10:31,  5.40s/it][A

	loss_cls: tensor(0.7406, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4446, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1852, device='cuda:0', grad_fn=<AddBackward0>)



 67%|██████▋   | 238/354 [21:22<10:24,  5.39s/it][A

	loss_cls: tensor(0.4173, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0427, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4600, device='cuda:0', grad_fn=<AddBackward0>)



 68%|██████▊   | 239/354 [21:27<10:25,  5.44s/it][A

	loss_cls: tensor(0.8197, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1911, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0108, device='cuda:0', grad_fn=<AddBackward0>)



 68%|██████▊   | 240/354 [21:33<10:21,  5.45s/it][A

	loss_cls: tensor(0.5769, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1405, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7174, device='cuda:0', grad_fn=<AddBackward0>)



 68%|██████▊   | 241/354 [21:38<10:19,  5.48s/it][A

	loss_cls: tensor(0.6326, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2385, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8711, device='cuda:0', grad_fn=<AddBackward0>)



 68%|██████▊   | 242/354 [21:44<10:13,  5.48s/it][A

	loss_cls: tensor(0.5054, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2957, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8011, device='cuda:0', grad_fn=<AddBackward0>)



 69%|██████▊   | 243/354 [21:50<10:11,  5.51s/it][A

	loss_cls: tensor(0.6182, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3015, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9197, device='cuda:0', grad_fn=<AddBackward0>)



 69%|██████▉   | 244/354 [21:55<10:05,  5.50s/it][A

	loss_cls: tensor(0.5200, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1082, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6282, device='cuda:0', grad_fn=<AddBackward0>)



 69%|██████▉   | 245/354 [22:00<09:58,  5.49s/it][A

	loss_cls: tensor(0.6857, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2198, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9056, device='cuda:0', grad_fn=<AddBackward0>)



 69%|██████▉   | 246/354 [22:06<09:54,  5.50s/it][A

	loss_cls: tensor(0.5410, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0930, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6340, device='cuda:0', grad_fn=<AddBackward0>)



 70%|██████▉   | 247/354 [22:11<09:47,  5.49s/it][A

	loss_cls: tensor(0.9380, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2210, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1590, device='cuda:0', grad_fn=<AddBackward0>)



 70%|███████   | 248/354 [22:17<09:44,  5.51s/it][A

	loss_cls: tensor(0.7549, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1129, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8678, device='cuda:0', grad_fn=<AddBackward0>)



 70%|███████   | 249/354 [22:23<09:37,  5.50s/it][A

	loss_cls: tensor(0.5793, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2590, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8383, device='cuda:0', grad_fn=<AddBackward0>)



 71%|███████   | 250/354 [22:28<09:35,  5.53s/it][A

	loss_cls: tensor(0.7498, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1910, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9408, device='cuda:0', grad_fn=<AddBackward0>)



 71%|███████   | 251/354 [22:34<09:28,  5.52s/it][A

	loss_cls: tensor(0.6027, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3974, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0001, device='cuda:0', grad_fn=<AddBackward0>)



 71%|███████   | 252/354 [22:39<09:22,  5.51s/it][A

	loss_cls: tensor(0.4020, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2192, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6212, device='cuda:0', grad_fn=<AddBackward0>)



 71%|███████▏  | 253/354 [22:45<09:18,  5.53s/it][A

	loss_cls: tensor(0.6867, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4403, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1270, device='cuda:0', grad_fn=<AddBackward0>)



 72%|███████▏  | 254/354 [22:50<09:11,  5.51s/it][A

	loss_cls: tensor(0.5045, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.6238, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1283, device='cuda:0', grad_fn=<AddBackward0>)



 72%|███████▏  | 255/354 [22:56<09:07,  5.53s/it][A

	loss_cls: tensor(0.4512, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2402, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6914, device='cuda:0', grad_fn=<AddBackward0>)



 72%|███████▏  | 256/354 [23:01<09:00,  5.51s/it][A

	loss_cls: tensor(0.5449, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3140, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8589, device='cuda:0', grad_fn=<AddBackward0>)



 73%|███████▎  | 257/354 [23:07<08:56,  5.53s/it][A

	loss_cls: tensor(0.5991, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1574, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7565, device='cuda:0', grad_fn=<AddBackward0>)



 73%|███████▎  | 258/354 [23:12<08:49,  5.51s/it][A

	loss_cls: tensor(0.7980, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3446, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1427, device='cuda:0', grad_fn=<AddBackward0>)



 73%|███████▎  | 259/354 [23:18<08:42,  5.50s/it][A

	loss_cls: tensor(0.6065, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4780, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0845, device='cuda:0', grad_fn=<AddBackward0>)



 73%|███████▎  | 260/354 [23:23<08:37,  5.51s/it][A

	loss_cls: tensor(0.4962, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0724, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5686, device='cuda:0', grad_fn=<AddBackward0>)



 74%|███████▎  | 261/354 [23:29<08:30,  5.49s/it][A

	loss_cls: tensor(0.5747, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2705, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8452, device='cuda:0', grad_fn=<AddBackward0>)



 74%|███████▍  | 262/354 [23:34<08:27,  5.51s/it][A

	loss_cls: tensor(0.5811, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0903, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6715, device='cuda:0', grad_fn=<AddBackward0>)



 74%|███████▍  | 263/354 [23:40<08:20,  5.50s/it][A

	loss_cls: tensor(0.4511, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4589, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9100, device='cuda:0', grad_fn=<AddBackward0>)



 75%|███████▍  | 264/354 [23:45<08:17,  5.52s/it][A

	loss_cls: tensor(0.5180, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4716, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9895, device='cuda:0', grad_fn=<AddBackward0>)



 75%|███████▍  | 265/354 [23:51<08:10,  5.51s/it][A

	loss_cls: tensor(0.5800, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2492, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8292, device='cuda:0', grad_fn=<AddBackward0>)



 75%|███████▌  | 266/354 [23:56<08:03,  5.50s/it][A

	loss_cls: tensor(0.7833, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2010, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9844, device='cuda:0', grad_fn=<AddBackward0>)



 75%|███████▌  | 267/354 [24:02<07:59,  5.51s/it][A

	loss_cls: tensor(0.8130, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2320, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0450, device='cuda:0', grad_fn=<AddBackward0>)



 76%|███████▌  | 268/354 [24:07<07:52,  5.50s/it][A

	loss_cls: tensor(0.6507, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2819, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9326, device='cuda:0', grad_fn=<AddBackward0>)



 76%|███████▌  | 269/354 [24:13<07:49,  5.52s/it][A

	loss_cls: tensor(0.5714, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3245, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8958, device='cuda:0', grad_fn=<AddBackward0>)



 76%|███████▋  | 270/354 [24:18<07:43,  5.51s/it][A

	loss_cls: tensor(0.5641, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2174, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7815, device='cuda:0', grad_fn=<AddBackward0>)



 77%|███████▋  | 271/354 [24:24<07:38,  5.53s/it][A

	loss_cls: tensor(0.5844, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0620, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6464, device='cuda:0', grad_fn=<AddBackward0>)



 77%|███████▋  | 272/354 [24:29<07:32,  5.52s/it][A

	loss_cls: tensor(0.4623, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2078, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6701, device='cuda:0', grad_fn=<AddBackward0>)



 77%|███████▋  | 273/354 [24:35<07:27,  5.52s/it][A

	loss_cls: tensor(0.6467, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3486, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9953, device='cuda:0', grad_fn=<AddBackward0>)



 77%|███████▋  | 274/354 [24:40<07:22,  5.54s/it][A

	loss_cls: tensor(0.8536, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1106, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9642, device='cuda:0', grad_fn=<AddBackward0>)



 78%|███████▊  | 275/354 [24:46<07:15,  5.51s/it][A

	loss_cls: tensor(0.5685, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2596, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8281, device='cuda:0', grad_fn=<AddBackward0>)



 78%|███████▊  | 276/354 [24:52<07:11,  5.53s/it][A

	loss_cls: tensor(0.7620, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1954, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9574, device='cuda:0', grad_fn=<AddBackward0>)



 78%|███████▊  | 277/354 [24:57<07:05,  5.52s/it][A

	loss_cls: tensor(0.5410, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3418, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8828, device='cuda:0', grad_fn=<AddBackward0>)



 79%|███████▊  | 278/354 [25:03<07:00,  5.53s/it][A

	loss_cls: tensor(0.5016, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2505, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7522, device='cuda:0', grad_fn=<AddBackward0>)



 79%|███████▉  | 279/354 [25:08<06:53,  5.52s/it][A

	loss_cls: tensor(0.6724, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2327, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9051, device='cuda:0', grad_fn=<AddBackward0>)



 79%|███████▉  | 280/354 [25:14<06:47,  5.50s/it][A

	loss_cls: tensor(0.7066, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0821, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7887, device='cuda:0', grad_fn=<AddBackward0>)



 79%|███████▉  | 281/354 [25:19<06:36,  5.43s/it][A

	loss_cls: tensor(0.5455, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4862, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0317, device='cuda:0', grad_fn=<AddBackward0>)



 80%|███████▉  | 282/354 [25:24<06:31,  5.44s/it][A

	loss_cls: tensor(0.6431, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4113, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0544, device='cuda:0', grad_fn=<AddBackward0>)



 80%|███████▉  | 283/354 [25:30<06:28,  5.47s/it][A

	loss_cls: tensor(0.5432, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1356, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6789, device='cuda:0', grad_fn=<AddBackward0>)



 80%|████████  | 284/354 [25:35<06:22,  5.46s/it][A

	loss_cls: tensor(0.8927, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1482, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0409, device='cuda:0', grad_fn=<AddBackward0>)



 81%|████████  | 285/354 [25:41<06:18,  5.49s/it][A

	loss_cls: tensor(0.5051, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2473, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7524, device='cuda:0', grad_fn=<AddBackward0>)



 81%|████████  | 286/354 [25:46<06:04,  5.36s/it][A

	loss_cls: tensor(0.6782, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1240, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8022, device='cuda:0', grad_fn=<AddBackward0>)



 81%|████████  | 287/354 [25:51<05:55,  5.30s/it][A

	loss_cls: tensor(0.6312, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5251, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1563, device='cuda:0', grad_fn=<AddBackward0>)



 81%|████████▏ | 288/354 [25:57<05:54,  5.38s/it][A

	loss_cls: tensor(0.5692, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3096, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8787, device='cuda:0', grad_fn=<AddBackward0>)



 82%|████████▏ | 289/354 [26:02<05:51,  5.41s/it][A

	loss_cls: tensor(0.6159, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3184, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9343, device='cuda:0', grad_fn=<AddBackward0>)



 82%|████████▏ | 290/354 [26:08<05:49,  5.46s/it][A

	loss_cls: tensor(0.6921, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2008, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8928, device='cuda:0', grad_fn=<AddBackward0>)



 82%|████████▏ | 291/354 [26:13<05:44,  5.46s/it][A

	loss_cls: tensor(0.5190, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3722, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8912, device='cuda:0', grad_fn=<AddBackward0>)



 82%|████████▏ | 292/354 [26:19<05:40,  5.49s/it][A

	loss_cls: tensor(0.8601, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1401, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0002, device='cuda:0', grad_fn=<AddBackward0>)



 83%|████████▎ | 293/354 [26:24<05:34,  5.48s/it][A

	loss_cls: tensor(0.6192, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3051, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9243, device='cuda:0', grad_fn=<AddBackward0>)



 83%|████████▎ | 294/354 [26:30<05:28,  5.47s/it][A

	loss_cls: tensor(0.3553, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1732, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5285, device='cuda:0', grad_fn=<AddBackward0>)



 83%|████████▎ | 295/354 [26:35<05:23,  5.49s/it][A

	loss_cls: tensor(0.5527, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1508, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7035, device='cuda:0', grad_fn=<AddBackward0>)



 84%|████████▎ | 296/354 [26:41<05:17,  5.48s/it][A

	loss_cls: tensor(0.6685, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3151, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9836, device='cuda:0', grad_fn=<AddBackward0>)



 84%|████████▍ | 297/354 [26:46<05:14,  5.52s/it][A

	loss_cls: tensor(0.4823, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1238, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6060, device='cuda:0', grad_fn=<AddBackward0>)



 84%|████████▍ | 298/354 [26:52<05:08,  5.51s/it][A

	loss_cls: tensor(0.7939, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1793, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9732, device='cuda:0', grad_fn=<AddBackward0>)



 84%|████████▍ | 299/354 [26:57<05:04,  5.53s/it][A

	loss_cls: tensor(0.5312, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1355, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6667, device='cuda:0', grad_fn=<AddBackward0>)



 85%|████████▍ | 300/354 [27:03<04:57,  5.51s/it][A

	loss_cls: tensor(1.0348, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2371, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2719, device='cuda:0', grad_fn=<AddBackward0>)



 85%|████████▌ | 301/354 [27:08<04:49,  5.46s/it][A

	loss_cls: tensor(0.4967, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3673, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8640, device='cuda:0', grad_fn=<AddBackward0>)



 85%|████████▌ | 302/354 [27:14<04:45,  5.50s/it][A

	loss_cls: tensor(0.4861, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0364, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5225, device='cuda:0', grad_fn=<AddBackward0>)



 86%|████████▌ | 303/354 [27:19<04:40,  5.50s/it][A

	loss_cls: tensor(0.8317, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2750, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1068, device='cuda:0', grad_fn=<AddBackward0>)



 86%|████████▌ | 304/354 [27:25<04:34,  5.48s/it][A

	loss_cls: tensor(0.6807, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2656, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9463, device='cuda:0', grad_fn=<AddBackward0>)



 86%|████████▌ | 305/354 [27:30<04:28,  5.49s/it][A

	loss_cls: tensor(0.4402, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0588, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4990, device='cuda:0', grad_fn=<AddBackward0>)



 86%|████████▋ | 306/354 [27:36<04:23,  5.50s/it][A

	loss_cls: tensor(0.7142, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2740, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9882, device='cuda:0', grad_fn=<AddBackward0>)



 87%|████████▋ | 307/354 [27:41<04:17,  5.48s/it][A

	loss_cls: tensor(0.5793, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1814, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7607, device='cuda:0', grad_fn=<AddBackward0>)



 87%|████████▋ | 308/354 [27:47<04:11,  5.48s/it][A

	loss_cls: tensor(0.4449, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1054, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5503, device='cuda:0', grad_fn=<AddBackward0>)



 87%|████████▋ | 309/354 [27:52<04:07,  5.51s/it][A

	loss_cls: tensor(0.4448, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1221, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5668, device='cuda:0', grad_fn=<AddBackward0>)



 88%|████████▊ | 310/354 [27:58<04:02,  5.51s/it][A

	loss_cls: tensor(0.5196, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.6506, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1702, device='cuda:0', grad_fn=<AddBackward0>)



 88%|████████▊ | 311/354 [28:03<03:58,  5.54s/it][A

	loss_cls: tensor(0.5850, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1047, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6897, device='cuda:0', grad_fn=<AddBackward0>)



 88%|████████▊ | 312/354 [28:09<03:52,  5.52s/it][A

	loss_cls: tensor(0.6777, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4246, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1023, device='cuda:0', grad_fn=<AddBackward0>)



 88%|████████▊ | 313/354 [28:14<03:46,  5.54s/it][A

	loss_cls: tensor(0.4903, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0770, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5673, device='cuda:0', grad_fn=<AddBackward0>)



 89%|████████▊ | 314/354 [28:20<03:40,  5.52s/it][A

	loss_cls: tensor(0.8741, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1702, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0443, device='cuda:0', grad_fn=<AddBackward0>)



 89%|████████▉ | 315/354 [28:25<03:34,  5.51s/it][A

	loss_cls: tensor(0.6260, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1965, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8225, device='cuda:0', grad_fn=<AddBackward0>)



 89%|████████▉ | 316/354 [28:31<03:30,  5.53s/it][A

	loss_cls: tensor(0.3773, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1683, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5456, device='cuda:0', grad_fn=<AddBackward0>)



 90%|████████▉ | 317/354 [28:36<03:24,  5.53s/it][A

	loss_cls: tensor(0.5409, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3941, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9350, device='cuda:0', grad_fn=<AddBackward0>)



 90%|████████▉ | 318/354 [28:42<03:19,  5.55s/it][A

	loss_cls: tensor(0.8326, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3373, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1700, device='cuda:0', grad_fn=<AddBackward0>)



 90%|█████████ | 319/354 [28:47<03:13,  5.54s/it][A

	loss_cls: tensor(0.6713, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1493, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8206, device='cuda:0', grad_fn=<AddBackward0>)



 90%|█████████ | 320/354 [28:53<03:08,  5.55s/it][A

	loss_cls: tensor(0.4834, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2381, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7215, device='cuda:0', grad_fn=<AddBackward0>)



 91%|█████████ | 321/354 [28:58<03:02,  5.52s/it][A

	loss_cls: tensor(0.5249, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1900, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7149, device='cuda:0', grad_fn=<AddBackward0>)



 91%|█████████ | 322/354 [29:04<02:56,  5.52s/it][A

	loss_cls: tensor(0.6440, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2035, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8475, device='cuda:0', grad_fn=<AddBackward0>)



 91%|█████████ | 323/354 [29:10<02:51,  5.52s/it][A

	loss_cls: tensor(0.5444, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1790, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7234, device='cuda:0', grad_fn=<AddBackward0>)



 92%|█████████▏| 324/354 [29:15<02:45,  5.51s/it][A

	loss_cls: tensor(0.6171, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1131, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7303, device='cuda:0', grad_fn=<AddBackward0>)



 92%|█████████▏| 325/354 [29:21<02:40,  5.52s/it][A

	loss_cls: tensor(0.6273, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2315, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8588, device='cuda:0', grad_fn=<AddBackward0>)



 92%|█████████▏| 326/354 [29:26<02:34,  5.50s/it][A

	loss_cls: tensor(0.6216, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2181, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8397, device='cuda:0', grad_fn=<AddBackward0>)



 92%|█████████▏| 327/354 [29:32<02:28,  5.52s/it][A

	loss_cls: tensor(0.4590, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4022, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8613, device='cuda:0', grad_fn=<AddBackward0>)



 93%|█████████▎| 328/354 [29:37<02:23,  5.51s/it][A

	loss_cls: tensor(0.4137, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2209, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6346, device='cuda:0', grad_fn=<AddBackward0>)



 93%|█████████▎| 329/354 [29:43<02:17,  5.50s/it][A

	loss_cls: tensor(0.5605, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2433, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8038, device='cuda:0', grad_fn=<AddBackward0>)



 93%|█████████▎| 330/354 [29:48<02:11,  5.50s/it][A

	loss_cls: tensor(0.5145, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2416, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7561, device='cuda:0', grad_fn=<AddBackward0>)



 94%|█████████▎| 331/354 [29:54<02:06,  5.50s/it][A

	loss_cls: tensor(0.5047, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3900, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8947, device='cuda:0', grad_fn=<AddBackward0>)



 94%|█████████▍| 332/354 [29:59<02:01,  5.52s/it][A

	loss_cls: tensor(0.4617, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2101, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6717, device='cuda:0', grad_fn=<AddBackward0>)



 94%|█████████▍| 333/354 [30:05<01:55,  5.52s/it][A

	loss_cls: tensor(0.5007, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2299, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7306, device='cuda:0', grad_fn=<AddBackward0>)



 94%|█████████▍| 334/354 [30:10<01:50,  5.53s/it][A

	loss_cls: tensor(0.6352, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2648, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8999, device='cuda:0', grad_fn=<AddBackward0>)



 95%|█████████▍| 335/354 [30:16<01:44,  5.51s/it][A

	loss_cls: tensor(0.7332, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1743, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9075, device='cuda:0', grad_fn=<AddBackward0>)



 95%|█████████▍| 336/354 [30:21<01:39,  5.53s/it][A

	loss_cls: tensor(0.7493, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1813, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9307, device='cuda:0', grad_fn=<AddBackward0>)



 95%|█████████▌| 337/354 [30:27<01:33,  5.51s/it][A

	loss_cls: tensor(0.5104, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0747, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5852, device='cuda:0', grad_fn=<AddBackward0>)



 95%|█████████▌| 338/354 [30:32<01:27,  5.49s/it][A

	loss_cls: tensor(0.5000, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3736, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8736, device='cuda:0', grad_fn=<AddBackward0>)



 96%|█████████▌| 339/354 [30:38<01:22,  5.51s/it][A

	loss_cls: tensor(0.3545, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3238, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6782, device='cuda:0', grad_fn=<AddBackward0>)



 96%|█████████▌| 340/354 [30:43<01:17,  5.52s/it][A

	loss_cls: tensor(0.8379, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0474, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8853, device='cuda:0', grad_fn=<AddBackward0>)



 96%|█████████▋| 341/354 [30:49<01:11,  5.53s/it][A

	loss_cls: tensor(0.6864, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2178, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9042, device='cuda:0', grad_fn=<AddBackward0>)



 97%|█████████▋| 342/354 [30:54<01:06,  5.52s/it][A

	loss_cls: tensor(0.4716, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2003, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6719, device='cuda:0', grad_fn=<AddBackward0>)



 97%|█████████▋| 343/354 [31:00<01:00,  5.52s/it][A

	loss_cls: tensor(0.6752, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1357, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8109, device='cuda:0', grad_fn=<AddBackward0>)



 97%|█████████▋| 344/354 [31:05<00:55,  5.51s/it][A

	loss_cls: tensor(0.8509, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2475, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0984, device='cuda:0', grad_fn=<AddBackward0>)



 97%|█████████▋| 345/354 [31:11<00:49,  5.50s/it][A

	loss_cls: tensor(0.4243, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1944, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6187, device='cuda:0', grad_fn=<AddBackward0>)



 98%|█████████▊| 346/354 [31:16<00:44,  5.51s/it][A

	loss_cls: tensor(0.4104, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2566, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6670, device='cuda:0', grad_fn=<AddBackward0>)



 98%|█████████▊| 347/354 [31:22<00:38,  5.49s/it][A

	loss_cls: tensor(0.4627, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2521, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7148, device='cuda:0', grad_fn=<AddBackward0>)



 98%|█████████▊| 348/354 [31:27<00:33,  5.50s/it][A

	loss_cls: tensor(1.1756, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2275, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.4031, device='cuda:0', grad_fn=<AddBackward0>)



 99%|█████████▊| 349/354 [31:33<00:27,  5.49s/it][A

	loss_cls: tensor(0.4342, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4021, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8363, device='cuda:0', grad_fn=<AddBackward0>)



 99%|█████████▉| 350/354 [31:38<00:22,  5.51s/it][A

	loss_cls: tensor(1.1194, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0801, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1996, device='cuda:0', grad_fn=<AddBackward0>)



 99%|█████████▉| 351/354 [31:44<00:16,  5.49s/it][A

	loss_cls: tensor(0.5166, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1155, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6321, device='cuda:0', grad_fn=<AddBackward0>)



 99%|█████████▉| 352/354 [31:49<00:10,  5.46s/it][A

	loss_cls: tensor(0.5034, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1232, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6266, device='cuda:0', grad_fn=<AddBackward0>)



100%|█████████▉| 353/354 [31:55<00:05,  5.49s/it][A

	loss_cls: tensor(0.4616, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3229, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7846, device='cuda:0', grad_fn=<AddBackward0>)



100%|██████████| 354/354 [31:57<00:00,  5.42s/it][A

  0%|          | 0/177 [00:00<?, ?it/s][A

	loss_cls: tensor(0.3634, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0844, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4478, device='cuda:0', grad_fn=<AddBackward0>)

	Training Loss: 0.8266013630029172

	Training cls acc: 0.6916784369114878

	Training cls prec: 0.5757281540650184

	Training cls rec: 0.6150961973525532

	Training cls f1: 0.5380665869739759

--
	Training ner acc: 0.9558042366023172

	Training ner prec: 0.2706444028192085

	Training ner rec: 0.2788292962303642

	Training ner f1: 0.2742496115647602

	Current Learning rate:  0.00034285714285714285



  1%|          | 1/177 [00:00<02:14,  1.31it/s][A
  1%|          | 2/177 [00:01<02:13,  1.31it/s][A
  2%|▏         | 3/177 [00:02<02:04,  1.39it/s][A
  2%|▏         | 4/177 [00:02<02:06,  1.37it/s][A
  3%|▎         | 5/177 [00:03<02:06,  1.36it/s][A
  3%|▎         | 6/177 [00:04<02:06,  1.36it/s][A
  4%|▍         | 7/177 [00:05<02:01,  1.40it/s][A
  5%|▍         | 8/177 [00:05<02:02,  1.38it/s][A
  5%|▌         | 9/177 [00:06<02:02,  1.37it/s][A
  6%|▌         | 10/177 [00:07<01:58,  1.41it/s][A
  6%|▌         | 11/177 [00:07<01:59,  1.39it/s][A
  7%|▋         | 12/177 [00:08<01:59,  1.38it/s][A
  7%|▋         | 13/177 [00:09<02:00,  1.37it/s][A
  8%|▊         | 14/177 [00:10<01:56,  1.40it/s][A
  8%|▊         | 15/177 [00:10<01:57,  1.38it/s][A
  9%|▉         | 16/177 [00:11<01:57,  1.37it/s][A
 10%|▉         | 17/177 [00:12<01:57,  1.36it/s][A
 10%|█         | 18/177 [00:13<01:53,  1.40it/s][A
 11%|█         | 19/177 [00:13<01:54,  1.38it/s][A
 11%|█▏        | 20/


	Validation Loss: 0.788709026272014

	Validation cls acc: 0.7062146892655368

	Validation cls prec: 0.6033662900188324

	Validation cls rec: 0.5972020446596717

	Validation cls f1: 0.5540277675870896

--
	Validation ner acc: 0.9535746935751271

	Validation ner prec: 0.42373004445097423

	Validation ner rec: 0.4344632768361582

	Validation ner f1: 0.4288712992033852



  0%|          | 1/354 [00:05<32:12,  5.47s/it][A

	loss_cls: tensor(0.6538, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1114, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7652, device='cuda:0', grad_fn=<AddBackward0>)



  1%|          | 2/354 [00:11<32:21,  5.51s/it][A

	loss_cls: tensor(0.5880, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1037, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6917, device='cuda:0', grad_fn=<AddBackward0>)



  1%|          | 3/354 [00:16<32:05,  5.48s/it][A

	loss_cls: tensor(0.5015, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0892, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5906, device='cuda:0', grad_fn=<AddBackward0>)



  1%|          | 4/354 [00:21<31:55,  5.47s/it][A

	loss_cls: tensor(0.5782, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1440, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7223, device='cuda:0', grad_fn=<AddBackward0>)



  1%|▏         | 5/354 [00:27<32:03,  5.51s/it][A

	loss_cls: tensor(0.4500, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0347, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4847, device='cuda:0', grad_fn=<AddBackward0>)



  2%|▏         | 6/354 [00:32<31:53,  5.50s/it][A

	loss_cls: tensor(0.6308, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0707, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7015, device='cuda:0', grad_fn=<AddBackward0>)



  2%|▏         | 7/354 [00:38<31:51,  5.51s/it][A

	loss_cls: tensor(0.5018, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0783, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5800, device='cuda:0', grad_fn=<AddBackward0>)



  2%|▏         | 8/354 [00:43<31:37,  5.48s/it][A

	loss_cls: tensor(0.5979, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0745, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6724, device='cuda:0', grad_fn=<AddBackward0>)



  3%|▎         | 9/354 [00:49<31:36,  5.50s/it][A

	loss_cls: tensor(0.5007, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1692, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6699, device='cuda:0', grad_fn=<AddBackward0>)



  3%|▎         | 10/354 [00:54<31:25,  5.48s/it][A

	loss_cls: tensor(0.6960, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1209, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8169, device='cuda:0', grad_fn=<AddBackward0>)



  3%|▎         | 11/354 [01:00<31:30,  5.51s/it][A

	loss_cls: tensor(0.5645, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3609, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9253, device='cuda:0', grad_fn=<AddBackward0>)



  3%|▎         | 12/354 [01:05<31:24,  5.51s/it][A

	loss_cls: tensor(0.6276, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1137, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7413, device='cuda:0', grad_fn=<AddBackward0>)



  4%|▎         | 13/354 [01:11<31:17,  5.51s/it][A

	loss_cls: tensor(0.3076, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3913, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6989, device='cuda:0', grad_fn=<AddBackward0>)



  4%|▍         | 14/354 [01:17<31:18,  5.52s/it][A

	loss_cls: tensor(0.7131, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.6562, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.3693, device='cuda:0', grad_fn=<AddBackward0>)



  4%|▍         | 15/354 [01:22<31:07,  5.51s/it][A

	loss_cls: tensor(0.8017, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3060, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1077, device='cuda:0', grad_fn=<AddBackward0>)



  5%|▍         | 16/354 [01:28<31:07,  5.52s/it][A

	loss_cls: tensor(0.4753, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3316, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8070, device='cuda:0', grad_fn=<AddBackward0>)



  5%|▍         | 17/354 [01:33<30:37,  5.45s/it][A

	loss_cls: tensor(0.6901, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1000, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7900, device='cuda:0', grad_fn=<AddBackward0>)



  5%|▌         | 18/354 [01:38<30:43,  5.49s/it][A

	loss_cls: tensor(0.6882, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5423, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2305, device='cuda:0', grad_fn=<AddBackward0>)



  5%|▌         | 19/354 [01:44<30:35,  5.48s/it][A

	loss_cls: tensor(0.7973, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1388, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9361, device='cuda:0', grad_fn=<AddBackward0>)



  6%|▌         | 20/354 [01:49<30:27,  5.47s/it][A

	loss_cls: tensor(0.4152, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1149, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5301, device='cuda:0', grad_fn=<AddBackward0>)



  6%|▌         | 21/354 [01:55<30:28,  5.49s/it][A

	loss_cls: tensor(0.6859, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1740, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8598, device='cuda:0', grad_fn=<AddBackward0>)



  6%|▌         | 22/354 [02:00<30:22,  5.49s/it][A

	loss_cls: tensor(0.8733, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3499, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2232, device='cuda:0', grad_fn=<AddBackward0>)



  6%|▋         | 23/354 [02:06<30:24,  5.51s/it][A

	loss_cls: tensor(0.6626, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3382, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0008, device='cuda:0', grad_fn=<AddBackward0>)



  7%|▋         | 24/354 [02:11<30:19,  5.51s/it][A

	loss_cls: tensor(0.6302, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2470, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8772, device='cuda:0', grad_fn=<AddBackward0>)



  7%|▋         | 25/354 [02:17<30:22,  5.54s/it][A

	loss_cls: tensor(0.5269, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2138, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7408, device='cuda:0', grad_fn=<AddBackward0>)



  7%|▋         | 26/354 [02:23<30:15,  5.54s/it][A

	loss_cls: tensor(0.6334, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1105, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7439, device='cuda:0', grad_fn=<AddBackward0>)



  8%|▊         | 27/354 [02:28<30:03,  5.51s/it][A

	loss_cls: tensor(0.6362, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3079, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9441, device='cuda:0', grad_fn=<AddBackward0>)



  8%|▊         | 28/354 [02:34<30:06,  5.54s/it][A

	loss_cls: tensor(0.6493, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2344, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8837, device='cuda:0', grad_fn=<AddBackward0>)



  8%|▊         | 29/354 [02:39<29:54,  5.52s/it][A

	loss_cls: tensor(0.5434, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1493, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6926, device='cuda:0', grad_fn=<AddBackward0>)



  8%|▊         | 30/354 [02:45<29:38,  5.49s/it][A

	loss_cls: tensor(0.5140, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1612, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6752, device='cuda:0', grad_fn=<AddBackward0>)



  9%|▉         | 31/354 [02:50<29:31,  5.48s/it][A

	loss_cls: tensor(0.4294, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2032, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6326, device='cuda:0', grad_fn=<AddBackward0>)



  9%|▉         | 32/354 [02:56<29:35,  5.51s/it][A

	loss_cls: tensor(0.4963, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1386, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6350, device='cuda:0', grad_fn=<AddBackward0>)



  9%|▉         | 33/354 [03:01<29:24,  5.50s/it][A

	loss_cls: tensor(1.0529, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3577, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.4105, device='cuda:0', grad_fn=<AddBackward0>)



 10%|▉         | 34/354 [03:07<29:16,  5.49s/it][A

	loss_cls: tensor(0.5752, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2096, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7848, device='cuda:0', grad_fn=<AddBackward0>)



 10%|▉         | 35/354 [03:12<29:17,  5.51s/it][A

	loss_cls: tensor(0.8536, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4253, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2789, device='cuda:0', grad_fn=<AddBackward0>)



 10%|█         | 36/354 [03:18<29:08,  5.50s/it][A

	loss_cls: tensor(0.5554, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3728, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9282, device='cuda:0', grad_fn=<AddBackward0>)



 10%|█         | 37/354 [03:23<29:09,  5.52s/it][A

	loss_cls: tensor(0.5910, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0444, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6355, device='cuda:0', grad_fn=<AddBackward0>)



 11%|█         | 38/354 [03:29<29:01,  5.51s/it][A

	loss_cls: tensor(0.8353, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2804, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1157, device='cuda:0', grad_fn=<AddBackward0>)



 11%|█         | 39/354 [03:34<29:02,  5.53s/it][A

	loss_cls: tensor(0.6390, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1499, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7890, device='cuda:0', grad_fn=<AddBackward0>)



 11%|█▏        | 40/354 [03:40<28:53,  5.52s/it][A

	loss_cls: tensor(0.4795, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2719, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7514, device='cuda:0', grad_fn=<AddBackward0>)



 12%|█▏        | 41/354 [03:45<28:43,  5.51s/it][A

	loss_cls: tensor(0.8389, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0425, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8814, device='cuda:0', grad_fn=<AddBackward0>)



 12%|█▏        | 42/354 [03:51<28:43,  5.52s/it][A

	loss_cls: tensor(0.8589, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1750, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0339, device='cuda:0', grad_fn=<AddBackward0>)



 12%|█▏        | 43/354 [03:56<28:28,  5.49s/it][A

	loss_cls: tensor(0.7450, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4376, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1826, device='cuda:0', grad_fn=<AddBackward0>)



 12%|█▏        | 44/354 [04:02<28:25,  5.50s/it][A

	loss_cls: tensor(0.5070, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3111, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8181, device='cuda:0', grad_fn=<AddBackward0>)



 13%|█▎        | 45/354 [04:07<28:18,  5.50s/it][A

	loss_cls: tensor(0.6323, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0399, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6723, device='cuda:0', grad_fn=<AddBackward0>)



 13%|█▎        | 46/354 [04:13<28:20,  5.52s/it][A

	loss_cls: tensor(0.8215, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1515, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9730, device='cuda:0', grad_fn=<AddBackward0>)



 13%|█▎        | 47/354 [04:18<28:10,  5.51s/it][A

	loss_cls: tensor(0.5853, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1760, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7613, device='cuda:0', grad_fn=<AddBackward0>)



 14%|█▎        | 48/354 [04:24<28:02,  5.50s/it][A

	loss_cls: tensor(0.6891, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5270, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2161, device='cuda:0', grad_fn=<AddBackward0>)



 14%|█▍        | 49/354 [04:29<28:02,  5.52s/it][A

	loss_cls: tensor(0.6040, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2113, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8153, device='cuda:0', grad_fn=<AddBackward0>)



 14%|█▍        | 50/354 [04:35<27:52,  5.50s/it][A

	loss_cls: tensor(0.4896, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1859, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6754, device='cuda:0', grad_fn=<AddBackward0>)



 14%|█▍        | 51/354 [04:40<27:02,  5.36s/it][A

	loss_cls: tensor(0.6137, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1455, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7591, device='cuda:0', grad_fn=<AddBackward0>)



 15%|█▍        | 52/354 [04:45<27:09,  5.40s/it][A

	loss_cls: tensor(0.6190, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1238, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7428, device='cuda:0', grad_fn=<AddBackward0>)



 15%|█▍        | 53/354 [04:51<27:20,  5.45s/it][A

	loss_cls: tensor(0.6198, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0500, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6698, device='cuda:0', grad_fn=<AddBackward0>)



 15%|█▌        | 54/354 [04:56<27:18,  5.46s/it][A

	loss_cls: tensor(0.6931, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2256, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9187, device='cuda:0', grad_fn=<AddBackward0>)



 16%|█▌        | 55/354 [05:02<27:15,  5.47s/it][A

	loss_cls: tensor(0.4491, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5200, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9692, device='cuda:0', grad_fn=<AddBackward0>)



 16%|█▌        | 56/354 [05:07<27:18,  5.50s/it][A

	loss_cls: tensor(0.6782, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1109, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7891, device='cuda:0', grad_fn=<AddBackward0>)



 16%|█▌        | 57/354 [05:12<26:32,  5.36s/it][A

	loss_cls: tensor(0.6504, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0908, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7412, device='cuda:0', grad_fn=<AddBackward0>)



 16%|█▋        | 58/354 [05:17<25:53,  5.25s/it][A

	loss_cls: tensor(0.5368, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3030, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8398, device='cuda:0', grad_fn=<AddBackward0>)



 17%|█▋        | 59/354 [05:22<25:19,  5.15s/it][A

	loss_cls: tensor(0.4216, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3004, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7220, device='cuda:0', grad_fn=<AddBackward0>)



 17%|█▋        | 60/354 [05:27<24:58,  5.10s/it][A

	loss_cls: tensor(0.4180, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0445, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4625, device='cuda:0', grad_fn=<AddBackward0>)



 17%|█▋        | 61/354 [05:32<24:37,  5.04s/it][A

	loss_cls: tensor(0.4007, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0412, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4419, device='cuda:0', grad_fn=<AddBackward0>)



 18%|█▊        | 62/354 [05:37<24:42,  5.08s/it][A

	loss_cls: tensor(0.6324, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0682, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7006, device='cuda:0', grad_fn=<AddBackward0>)



 18%|█▊        | 63/354 [05:43<25:18,  5.22s/it][A

	loss_cls: tensor(0.4428, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1395, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5823, device='cuda:0', grad_fn=<AddBackward0>)



 18%|█▊        | 64/354 [05:48<25:33,  5.29s/it][A

	loss_cls: tensor(0.4944, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2737, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7681, device='cuda:0', grad_fn=<AddBackward0>)



 18%|█▊        | 65/354 [05:54<25:55,  5.38s/it][A

	loss_cls: tensor(0.5591, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3685, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9276, device='cuda:0', grad_fn=<AddBackward0>)



 19%|█▊        | 66/354 [05:59<25:55,  5.40s/it][A

	loss_cls: tensor(0.5593, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4092, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9686, device='cuda:0', grad_fn=<AddBackward0>)



 19%|█▉        | 67/354 [06:05<26:00,  5.44s/it][A

	loss_cls: tensor(0.8410, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1587, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9997, device='cuda:0', grad_fn=<AddBackward0>)



 19%|█▉        | 68/354 [06:10<25:59,  5.45s/it][A

	loss_cls: tensor(0.6621, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1538, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8160, device='cuda:0', grad_fn=<AddBackward0>)



 19%|█▉        | 69/354 [06:16<25:56,  5.46s/it][A

	loss_cls: tensor(0.6603, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2887, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9490, device='cuda:0', grad_fn=<AddBackward0>)



 20%|█▉        | 70/354 [06:21<25:59,  5.49s/it][A

	loss_cls: tensor(0.6288, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2922, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9210, device='cuda:0', grad_fn=<AddBackward0>)



 20%|██        | 71/354 [06:27<25:53,  5.49s/it][A

	loss_cls: tensor(0.6970, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2342, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9312, device='cuda:0', grad_fn=<AddBackward0>)



 20%|██        | 72/354 [06:33<25:57,  5.52s/it][A

	loss_cls: tensor(0.5308, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3083, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8392, device='cuda:0', grad_fn=<AddBackward0>)



 21%|██        | 73/354 [06:38<25:47,  5.51s/it][A

	loss_cls: tensor(0.5878, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0824, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6702, device='cuda:0', grad_fn=<AddBackward0>)



 21%|██        | 74/354 [06:43<25:28,  5.46s/it][A

	loss_cls: tensor(0.8253, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2665, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0918, device='cuda:0', grad_fn=<AddBackward0>)



 21%|██        | 75/354 [06:49<25:22,  5.46s/it][A

	loss_cls: tensor(0.6168, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2451, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8619, device='cuda:0', grad_fn=<AddBackward0>)



 21%|██▏       | 76/354 [06:54<25:18,  5.46s/it][A

	loss_cls: tensor(0.6349, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1534, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7884, device='cuda:0', grad_fn=<AddBackward0>)



 22%|██▏       | 77/354 [07:00<25:20,  5.49s/it][A

	loss_cls: tensor(0.8795, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3142, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1937, device='cuda:0', grad_fn=<AddBackward0>)



 22%|██▏       | 78/354 [07:05<25:15,  5.49s/it][A

	loss_cls: tensor(0.5546, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2235, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7781, device='cuda:0', grad_fn=<AddBackward0>)



 22%|██▏       | 79/354 [07:11<25:18,  5.52s/it][A

	loss_cls: tensor(0.6231, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1457, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7687, device='cuda:0', grad_fn=<AddBackward0>)



 23%|██▎       | 80/354 [07:16<25:10,  5.51s/it][A

	loss_cls: tensor(0.3342, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2260, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5602, device='cuda:0', grad_fn=<AddBackward0>)



 23%|██▎       | 81/354 [07:22<25:10,  5.53s/it][A

	loss_cls: tensor(0.4850, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1348, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6197, device='cuda:0', grad_fn=<AddBackward0>)



 23%|██▎       | 82/354 [07:27<25:00,  5.52s/it][A

	loss_cls: tensor(0.4945, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2936, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7882, device='cuda:0', grad_fn=<AddBackward0>)



 23%|██▎       | 83/354 [07:33<24:51,  5.50s/it][A

	loss_cls: tensor(0.7395, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2480, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9875, device='cuda:0', grad_fn=<AddBackward0>)



 24%|██▎       | 84/354 [07:38<23:59,  5.33s/it][A

	loss_cls: tensor(0.4916, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1892, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6808, device='cuda:0', grad_fn=<AddBackward0>)



 24%|██▍       | 85/354 [07:43<23:19,  5.20s/it][A

	loss_cls: tensor(0.4475, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1964, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6439, device='cuda:0', grad_fn=<AddBackward0>)



 24%|██▍       | 86/354 [07:48<22:57,  5.14s/it][A

	loss_cls: tensor(0.9710, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0779, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0490, device='cuda:0', grad_fn=<AddBackward0>)



 25%|██▍       | 87/354 [07:53<23:19,  5.24s/it][A

	loss_cls: tensor(0.5690, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3397, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9087, device='cuda:0', grad_fn=<AddBackward0>)



 25%|██▍       | 88/354 [07:59<23:40,  5.34s/it][A

	loss_cls: tensor(0.4790, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1757, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6547, device='cuda:0', grad_fn=<AddBackward0>)



 25%|██▌       | 89/354 [08:04<23:42,  5.37s/it][A

	loss_cls: tensor(0.7231, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2821, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0052, device='cuda:0', grad_fn=<AddBackward0>)



 25%|██▌       | 90/354 [08:10<23:39,  5.38s/it][A

	loss_cls: tensor(0.6398, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0431, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6829, device='cuda:0', grad_fn=<AddBackward0>)



 26%|██▌       | 91/354 [08:15<23:45,  5.42s/it][A

	loss_cls: tensor(0.7222, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1753, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8975, device='cuda:0', grad_fn=<AddBackward0>)



 26%|██▌       | 92/354 [08:21<23:44,  5.44s/it][A

	loss_cls: tensor(0.8922, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2244, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1165, device='cuda:0', grad_fn=<AddBackward0>)



 26%|██▋       | 93/354 [08:26<23:45,  5.46s/it][A

	loss_cls: tensor(0.4208, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3729, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7937, device='cuda:0', grad_fn=<AddBackward0>)



 27%|██▋       | 94/354 [08:32<23:38,  5.46s/it][A

	loss_cls: tensor(0.4832, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1230, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6062, device='cuda:0', grad_fn=<AddBackward0>)



 27%|██▋       | 95/354 [08:37<23:39,  5.48s/it][A

	loss_cls: tensor(0.5109, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2880, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7988, device='cuda:0', grad_fn=<AddBackward0>)



 27%|██▋       | 96/354 [08:43<23:31,  5.47s/it][A

	loss_cls: tensor(0.4501, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2835, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7337, device='cuda:0', grad_fn=<AddBackward0>)



 27%|██▋       | 97/354 [08:48<23:26,  5.47s/it][A

	loss_cls: tensor(0.6189, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3007, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9196, device='cuda:0', grad_fn=<AddBackward0>)



 28%|██▊       | 98/354 [08:54<23:25,  5.49s/it][A

	loss_cls: tensor(0.7190, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4147, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1337, device='cuda:0', grad_fn=<AddBackward0>)



 28%|██▊       | 99/354 [08:59<23:19,  5.49s/it][A

	loss_cls: tensor(0.5454, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2349, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7802, device='cuda:0', grad_fn=<AddBackward0>)



 28%|██▊       | 100/354 [09:05<23:17,  5.50s/it][A

	loss_cls: tensor(0.6017, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1229, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7245, device='cuda:0', grad_fn=<AddBackward0>)



 29%|██▊       | 101/354 [09:10<23:09,  5.49s/it][A

	loss_cls: tensor(0.5097, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3817, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8914, device='cuda:0', grad_fn=<AddBackward0>)



 29%|██▉       | 102/354 [09:16<23:06,  5.50s/it][A

	loss_cls: tensor(0.5745, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1717, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7462, device='cuda:0', grad_fn=<AddBackward0>)



 29%|██▉       | 103/354 [09:21<22:57,  5.49s/it][A

	loss_cls: tensor(0.6522, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1030, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7552, device='cuda:0', grad_fn=<AddBackward0>)



 29%|██▉       | 104/354 [09:26<22:48,  5.47s/it][A

	loss_cls: tensor(0.5329, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2321, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7651, device='cuda:0', grad_fn=<AddBackward0>)



 30%|██▉       | 105/354 [09:32<22:50,  5.51s/it][A

	loss_cls: tensor(0.4890, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0893, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5783, device='cuda:0', grad_fn=<AddBackward0>)



 30%|██▉       | 106/354 [09:38<22:46,  5.51s/it][A

	loss_cls: tensor(0.7092, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1220, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8312, device='cuda:0', grad_fn=<AddBackward0>)



 30%|███       | 107/354 [09:43<22:43,  5.52s/it][A

	loss_cls: tensor(0.5402, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4896, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0298, device='cuda:0', grad_fn=<AddBackward0>)



 31%|███       | 108/354 [09:49<22:34,  5.51s/it][A

	loss_cls: tensor(0.4755, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0906, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5661, device='cuda:0', grad_fn=<AddBackward0>)



 31%|███       | 109/354 [09:54<22:30,  5.51s/it][A

	loss_cls: tensor(0.6464, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0565, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7029, device='cuda:0', grad_fn=<AddBackward0>)



 31%|███       | 110/354 [10:00<22:19,  5.49s/it][A

	loss_cls: tensor(0.8313, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0672, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8985, device='cuda:0', grad_fn=<AddBackward0>)



 31%|███▏      | 111/354 [10:05<22:15,  5.50s/it][A

	loss_cls: tensor(0.6430, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3117, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9548, device='cuda:0', grad_fn=<AddBackward0>)



 32%|███▏      | 112/354 [10:11<22:04,  5.47s/it][A

	loss_cls: tensor(0.4038, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2922, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6960, device='cuda:0', grad_fn=<AddBackward0>)



 32%|███▏      | 113/354 [10:16<21:55,  5.46s/it][A

	loss_cls: tensor(0.4534, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2740, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7274, device='cuda:0', grad_fn=<AddBackward0>)



 32%|███▏      | 114/354 [10:22<21:58,  5.49s/it][A

	loss_cls: tensor(0.5237, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1072, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6308, device='cuda:0', grad_fn=<AddBackward0>)



 32%|███▏      | 115/354 [10:27<21:50,  5.48s/it][A

	loss_cls: tensor(0.7105, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5661, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2766, device='cuda:0', grad_fn=<AddBackward0>)



 33%|███▎      | 116/354 [10:32<21:47,  5.49s/it][A

	loss_cls: tensor(0.3620, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1105, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4725, device='cuda:0', grad_fn=<AddBackward0>)



 33%|███▎      | 117/354 [10:38<21:37,  5.48s/it][A

	loss_cls: tensor(0.5496, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2414, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7910, device='cuda:0', grad_fn=<AddBackward0>)



 33%|███▎      | 118/354 [10:43<21:37,  5.50s/it][A

	loss_cls: tensor(0.4491, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1010, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5501, device='cuda:0', grad_fn=<AddBackward0>)



 34%|███▎      | 119/354 [10:49<21:31,  5.49s/it][A

	loss_cls: tensor(0.5113, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2304, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7417, device='cuda:0', grad_fn=<AddBackward0>)



 34%|███▍      | 120/354 [10:54<21:23,  5.48s/it][A

	loss_cls: tensor(0.5393, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2961, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8355, device='cuda:0', grad_fn=<AddBackward0>)



 34%|███▍      | 121/354 [11:00<21:25,  5.52s/it][A

	loss_cls: tensor(0.6087, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4188, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0275, device='cuda:0', grad_fn=<AddBackward0>)



 34%|███▍      | 122/354 [11:05<21:14,  5.49s/it][A

	loss_cls: tensor(0.6429, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1946, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8376, device='cuda:0', grad_fn=<AddBackward0>)



 35%|███▍      | 123/354 [11:11<21:11,  5.51s/it][A

	loss_cls: tensor(0.8703, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3218, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1921, device='cuda:0', grad_fn=<AddBackward0>)



 35%|███▌      | 124/354 [11:16<21:01,  5.49s/it][A

	loss_cls: tensor(0.7853, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0963, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8816, device='cuda:0', grad_fn=<AddBackward0>)



 35%|███▌      | 125/354 [11:22<21:00,  5.50s/it][A

	loss_cls: tensor(0.5534, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2831, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8365, device='cuda:0', grad_fn=<AddBackward0>)



 36%|███▌      | 126/354 [11:27<20:51,  5.49s/it][A

	loss_cls: tensor(0.4043, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1706, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5749, device='cuda:0', grad_fn=<AddBackward0>)



 36%|███▌      | 127/354 [11:33<20:44,  5.48s/it][A

	loss_cls: tensor(0.3859, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0904, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4764, device='cuda:0', grad_fn=<AddBackward0>)



 36%|███▌      | 128/354 [11:38<20:42,  5.50s/it][A

	loss_cls: tensor(0.5125, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1914, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7039, device='cuda:0', grad_fn=<AddBackward0>)



 36%|███▋      | 129/354 [11:44<20:33,  5.48s/it][A

	loss_cls: tensor(0.7397, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1569, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8966, device='cuda:0', grad_fn=<AddBackward0>)



 37%|███▋      | 130/354 [11:49<20:31,  5.50s/it][A

	loss_cls: tensor(0.4559, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1593, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6152, device='cuda:0', grad_fn=<AddBackward0>)



 37%|███▋      | 131/354 [11:55<20:22,  5.48s/it][A

	loss_cls: tensor(0.6574, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2974, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9548, device='cuda:0', grad_fn=<AddBackward0>)



 37%|███▋      | 132/354 [12:00<20:23,  5.51s/it][A

	loss_cls: tensor(0.7649, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2270, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9919, device='cuda:0', grad_fn=<AddBackward0>)



 38%|███▊      | 133/354 [12:06<20:15,  5.50s/it][A

	loss_cls: tensor(0.6953, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2517, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9470, device='cuda:0', grad_fn=<AddBackward0>)



 38%|███▊      | 134/354 [12:11<20:07,  5.49s/it][A

	loss_cls: tensor(0.5447, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1453, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6900, device='cuda:0', grad_fn=<AddBackward0>)



 38%|███▊      | 135/354 [12:17<20:01,  5.49s/it][A

	loss_cls: tensor(0.6552, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3209, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9760, device='cuda:0', grad_fn=<AddBackward0>)



 38%|███▊      | 136/354 [12:22<19:55,  5.48s/it][A

	loss_cls: tensor(0.4198, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0355, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4553, device='cuda:0', grad_fn=<AddBackward0>)



 39%|███▊      | 137/354 [12:28<19:54,  5.50s/it][A

	loss_cls: tensor(0.5713, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1874, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7587, device='cuda:0', grad_fn=<AddBackward0>)



 39%|███▉      | 138/354 [12:33<19:44,  5.48s/it][A

	loss_cls: tensor(0.4373, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0723, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5096, device='cuda:0', grad_fn=<AddBackward0>)



 39%|███▉      | 139/354 [12:39<19:43,  5.50s/it][A

	loss_cls: tensor(0.5076, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0446, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5522, device='cuda:0', grad_fn=<AddBackward0>)



 40%|███▉      | 140/354 [12:44<19:34,  5.49s/it][A

	loss_cls: tensor(0.8176, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0793, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8969, device='cuda:0', grad_fn=<AddBackward0>)



 40%|███▉      | 141/354 [12:50<19:29,  5.49s/it][A

	loss_cls: tensor(0.5568, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1490, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7058, device='cuda:0', grad_fn=<AddBackward0>)



 40%|████      | 142/354 [12:55<19:30,  5.52s/it][A

	loss_cls: tensor(0.6370, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1855, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8225, device='cuda:0', grad_fn=<AddBackward0>)



 40%|████      | 143/354 [13:01<19:20,  5.50s/it][A

	loss_cls: tensor(0.6670, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2462, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9131, device='cuda:0', grad_fn=<AddBackward0>)



 41%|████      | 144/354 [13:06<19:17,  5.51s/it][A

	loss_cls: tensor(0.4808, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2125, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6933, device='cuda:0', grad_fn=<AddBackward0>)



 41%|████      | 145/354 [13:12<19:09,  5.50s/it][A

	loss_cls: tensor(0.5476, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4953, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0429, device='cuda:0', grad_fn=<AddBackward0>)



 41%|████      | 146/354 [13:17<19:07,  5.51s/it][A

	loss_cls: tensor(0.5706, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0799, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6505, device='cuda:0', grad_fn=<AddBackward0>)



 42%|████▏     | 147/354 [13:23<18:56,  5.49s/it][A

	loss_cls: tensor(0.9835, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0843, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0677, device='cuda:0', grad_fn=<AddBackward0>)



 42%|████▏     | 148/354 [13:28<18:49,  5.48s/it][A

	loss_cls: tensor(0.6424, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3720, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0144, device='cuda:0', grad_fn=<AddBackward0>)



 42%|████▏     | 149/354 [13:34<18:45,  5.49s/it][A

	loss_cls: tensor(0.4454, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0298, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4752, device='cuda:0', grad_fn=<AddBackward0>)



 42%|████▏     | 150/354 [13:39<18:37,  5.48s/it][A

	loss_cls: tensor(0.6125, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2334, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8459, device='cuda:0', grad_fn=<AddBackward0>)



 43%|████▎     | 151/354 [13:45<18:35,  5.50s/it][A

	loss_cls: tensor(0.6705, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5588, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2293, device='cuda:0', grad_fn=<AddBackward0>)



 43%|████▎     | 152/354 [13:50<18:30,  5.50s/it][A

	loss_cls: tensor(0.4456, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1448, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5904, device='cuda:0', grad_fn=<AddBackward0>)



 43%|████▎     | 153/354 [13:56<18:28,  5.52s/it][A

	loss_cls: tensor(0.3901, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2472, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6373, device='cuda:0', grad_fn=<AddBackward0>)



 44%|████▎     | 154/354 [14:01<18:21,  5.51s/it][A

	loss_cls: tensor(0.7064, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.6648, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.3712, device='cuda:0', grad_fn=<AddBackward0>)



 44%|████▍     | 155/354 [14:07<18:11,  5.49s/it][A

	loss_cls: tensor(0.8317, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4458, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2775, device='cuda:0', grad_fn=<AddBackward0>)



 44%|████▍     | 156/354 [14:12<18:08,  5.50s/it][A

	loss_cls: tensor(0.6205, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0809, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7014, device='cuda:0', grad_fn=<AddBackward0>)



 44%|████▍     | 157/354 [14:18<18:01,  5.49s/it][A

	loss_cls: tensor(0.7378, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2096, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9474, device='cuda:0', grad_fn=<AddBackward0>)



 45%|████▍     | 158/354 [14:23<17:53,  5.48s/it][A

	loss_cls: tensor(0.7930, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2093, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0023, device='cuda:0', grad_fn=<AddBackward0>)



 45%|████▍     | 159/354 [14:29<17:47,  5.48s/it][A

	loss_cls: tensor(0.5478, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3036, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8515, device='cuda:0', grad_fn=<AddBackward0>)



 45%|████▌     | 160/354 [14:34<17:45,  5.49s/it][A

	loss_cls: tensor(0.6420, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2112, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8532, device='cuda:0', grad_fn=<AddBackward0>)



 45%|████▌     | 161/354 [14:40<17:37,  5.48s/it][A

	loss_cls: tensor(0.4614, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3242, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7856, device='cuda:0', grad_fn=<AddBackward0>)



 46%|████▌     | 162/354 [14:45<17:29,  5.47s/it][A

	loss_cls: tensor(0.4440, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2139, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6578, device='cuda:0', grad_fn=<AddBackward0>)



 46%|████▌     | 163/354 [14:51<17:27,  5.48s/it][A

	loss_cls: tensor(0.5167, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1921, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7088, device='cuda:0', grad_fn=<AddBackward0>)



 46%|████▋     | 164/354 [14:56<17:20,  5.48s/it][A

	loss_cls: tensor(0.6031, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2007, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8038, device='cuda:0', grad_fn=<AddBackward0>)



 47%|████▋     | 165/354 [15:02<17:21,  5.51s/it][A

	loss_cls: tensor(0.4704, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1260, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5963, device='cuda:0', grad_fn=<AddBackward0>)



 47%|████▋     | 166/354 [15:07<17:15,  5.51s/it][A

	loss_cls: tensor(0.5280, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1854, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7134, device='cuda:0', grad_fn=<AddBackward0>)



 47%|████▋     | 167/354 [15:13<17:11,  5.51s/it][A

	loss_cls: tensor(0.6568, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1976, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8544, device='cuda:0', grad_fn=<AddBackward0>)



 47%|████▋     | 168/354 [15:18<17:02,  5.50s/it][A

	loss_cls: tensor(0.4076, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2002, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6077, device='cuda:0', grad_fn=<AddBackward0>)



 48%|████▊     | 169/354 [15:24<16:56,  5.49s/it][A

	loss_cls: tensor(0.3833, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1286, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5119, device='cuda:0', grad_fn=<AddBackward0>)



 48%|████▊     | 170/354 [15:29<16:52,  5.50s/it][A

	loss_cls: tensor(0.5175, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3167, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8342, device='cuda:0', grad_fn=<AddBackward0>)



 48%|████▊     | 171/354 [15:35<16:44,  5.49s/it][A

	loss_cls: tensor(0.4421, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1765, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6185, device='cuda:0', grad_fn=<AddBackward0>)



 49%|████▊     | 172/354 [15:40<16:42,  5.51s/it][A

	loss_cls: tensor(0.6524, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4697, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1220, device='cuda:0', grad_fn=<AddBackward0>)



 49%|████▉     | 173/354 [15:46<16:37,  5.51s/it][A

	loss_cls: tensor(0.5524, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0315, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5839, device='cuda:0', grad_fn=<AddBackward0>)



 49%|████▉     | 174/354 [15:51<16:34,  5.53s/it][A

	loss_cls: tensor(0.5049, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1975, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7024, device='cuda:0', grad_fn=<AddBackward0>)



 49%|████▉     | 175/354 [15:57<16:25,  5.51s/it][A

	loss_cls: tensor(0.6015, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2810, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8825, device='cuda:0', grad_fn=<AddBackward0>)



 50%|████▉     | 176/354 [16:02<16:17,  5.49s/it][A

	loss_cls: tensor(0.4014, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0350, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4364, device='cuda:0', grad_fn=<AddBackward0>)



 50%|█████     | 177/354 [16:08<16:14,  5.50s/it][A

	loss_cls: tensor(1.1216, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3743, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.4959, device='cuda:0', grad_fn=<AddBackward0>)



 50%|█████     | 178/354 [16:13<16:07,  5.50s/it][A

	loss_cls: tensor(0.6158, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1133, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7291, device='cuda:0', grad_fn=<AddBackward0>)



 51%|█████     | 179/354 [16:19<16:06,  5.52s/it][A

	loss_cls: tensor(0.7547, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1194, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8741, device='cuda:0', grad_fn=<AddBackward0>)



 51%|█████     | 180/354 [16:24<15:57,  5.50s/it][A

	loss_cls: tensor(0.5102, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1710, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6812, device='cuda:0', grad_fn=<AddBackward0>)



 51%|█████     | 181/354 [16:30<15:52,  5.50s/it][A

	loss_cls: tensor(0.8843, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1437, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0280, device='cuda:0', grad_fn=<AddBackward0>)



 51%|█████▏    | 182/354 [16:35<15:45,  5.50s/it][A

	loss_cls: tensor(0.5469, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1269, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6738, device='cuda:0', grad_fn=<AddBackward0>)



 52%|█████▏    | 183/354 [16:41<15:39,  5.49s/it][A

	loss_cls: tensor(0.6049, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2654, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8703, device='cuda:0', grad_fn=<AddBackward0>)



 52%|█████▏    | 184/354 [16:46<15:35,  5.50s/it][A

	loss_cls: tensor(0.6830, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3120, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9950, device='cuda:0', grad_fn=<AddBackward0>)



 52%|█████▏    | 185/354 [16:52<15:28,  5.50s/it][A

	loss_cls: tensor(0.4808, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1195, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6003, device='cuda:0', grad_fn=<AddBackward0>)



 53%|█████▎    | 186/354 [16:57<15:27,  5.52s/it][A

	loss_cls: tensor(0.5285, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1565, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6850, device='cuda:0', grad_fn=<AddBackward0>)



 53%|█████▎    | 187/354 [17:03<15:20,  5.51s/it][A

	loss_cls: tensor(0.6146, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0657, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6803, device='cuda:0', grad_fn=<AddBackward0>)



 53%|█████▎    | 188/354 [17:08<15:17,  5.53s/it][A

	loss_cls: tensor(0.6415, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3071, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9486, device='cuda:0', grad_fn=<AddBackward0>)



 53%|█████▎    | 189/354 [17:14<15:05,  5.49s/it][A

	loss_cls: tensor(0.6106, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1368, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7474, device='cuda:0', grad_fn=<AddBackward0>)



 54%|█████▎    | 190/354 [17:19<14:58,  5.48s/it][A

	loss_cls: tensor(0.4783, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3217, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8000, device='cuda:0', grad_fn=<AddBackward0>)



 54%|█████▍    | 191/354 [17:25<14:56,  5.50s/it][A

	loss_cls: tensor(0.5875, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0617, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6491, device='cuda:0', grad_fn=<AddBackward0>)



 54%|█████▍    | 192/354 [17:30<14:50,  5.50s/it][A

	loss_cls: tensor(0.5807, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0995, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6802, device='cuda:0', grad_fn=<AddBackward0>)



 55%|█████▍    | 193/354 [17:36<14:50,  5.53s/it][A

	loss_cls: tensor(0.6563, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0426, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6989, device='cuda:0', grad_fn=<AddBackward0>)



 55%|█████▍    | 194/354 [17:41<14:41,  5.51s/it][A

	loss_cls: tensor(0.7150, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3905, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1056, device='cuda:0', grad_fn=<AddBackward0>)



 55%|█████▌    | 195/354 [17:47<14:38,  5.52s/it][A

	loss_cls: tensor(0.4993, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2293, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7287, device='cuda:0', grad_fn=<AddBackward0>)



 55%|█████▌    | 196/354 [17:52<14:30,  5.51s/it][A

	loss_cls: tensor(0.6046, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4911, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0957, device='cuda:0', grad_fn=<AddBackward0>)



 56%|█████▌    | 197/354 [17:58<14:24,  5.50s/it][A

	loss_cls: tensor(0.4119, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0327, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4447, device='cuda:0', grad_fn=<AddBackward0>)



 56%|█████▌    | 198/354 [18:03<14:21,  5.52s/it][A

	loss_cls: tensor(0.9969, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4238, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.4207, device='cuda:0', grad_fn=<AddBackward0>)



 56%|█████▌    | 199/354 [18:09<14:13,  5.51s/it][A

	loss_cls: tensor(0.7145, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3264, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0409, device='cuda:0', grad_fn=<AddBackward0>)



 56%|█████▋    | 200/354 [18:14<14:09,  5.52s/it][A

	loss_cls: tensor(0.4571, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1525, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6096, device='cuda:0', grad_fn=<AddBackward0>)



 57%|█████▋    | 201/354 [18:20<14:01,  5.50s/it][A

	loss_cls: tensor(0.5179, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1652, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6830, device='cuda:0', grad_fn=<AddBackward0>)



 57%|█████▋    | 202/354 [18:25<13:57,  5.51s/it][A

	loss_cls: tensor(0.4219, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1751, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5971, device='cuda:0', grad_fn=<AddBackward0>)



 57%|█████▋    | 203/354 [18:31<13:47,  5.48s/it][A

	loss_cls: tensor(0.4483, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1196, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5679, device='cuda:0', grad_fn=<AddBackward0>)



 58%|█████▊    | 204/354 [18:36<13:39,  5.46s/it][A

	loss_cls: tensor(0.7704, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2083, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9787, device='cuda:0', grad_fn=<AddBackward0>)



 58%|█████▊    | 205/354 [18:42<13:39,  5.50s/it][A

	loss_cls: tensor(0.5083, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2547, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7630, device='cuda:0', grad_fn=<AddBackward0>)



 58%|█████▊    | 206/354 [18:47<13:32,  5.49s/it][A

	loss_cls: tensor(0.3919, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4260, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8179, device='cuda:0', grad_fn=<AddBackward0>)



 58%|█████▊    | 207/354 [18:53<13:30,  5.51s/it][A

	loss_cls: tensor(0.5669, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2173, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7843, device='cuda:0', grad_fn=<AddBackward0>)



 59%|█████▉    | 208/354 [18:58<13:22,  5.50s/it][A

	loss_cls: tensor(0.6720, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3682, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0402, device='cuda:0', grad_fn=<AddBackward0>)



 59%|█████▉    | 209/354 [19:04<13:19,  5.51s/it][A

	loss_cls: tensor(0.5035, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1361, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6396, device='cuda:0', grad_fn=<AddBackward0>)



 59%|█████▉    | 210/354 [19:09<13:12,  5.51s/it][A

	loss_cls: tensor(0.5013, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1746, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6758, device='cuda:0', grad_fn=<AddBackward0>)



 60%|█████▉    | 211/354 [19:15<13:07,  5.51s/it][A

	loss_cls: tensor(0.3173, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0572, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.3745, device='cuda:0', grad_fn=<AddBackward0>)



 60%|█████▉    | 212/354 [19:20<13:02,  5.51s/it][A

	loss_cls: tensor(0.8909, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1057, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9965, device='cuda:0', grad_fn=<AddBackward0>)



 60%|██████    | 213/354 [19:26<12:55,  5.50s/it][A

	loss_cls: tensor(0.8300, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1926, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0225, device='cuda:0', grad_fn=<AddBackward0>)



 60%|██████    | 214/354 [19:31<12:52,  5.52s/it][A

	loss_cls: tensor(0.5225, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3234, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8458, device='cuda:0', grad_fn=<AddBackward0>)



 61%|██████    | 215/354 [19:37<12:44,  5.50s/it][A

	loss_cls: tensor(0.3661, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2980, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6641, device='cuda:0', grad_fn=<AddBackward0>)



 61%|██████    | 216/354 [19:42<12:40,  5.51s/it][A

	loss_cls: tensor(0.5892, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1446, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7338, device='cuda:0', grad_fn=<AddBackward0>)



 61%|██████▏   | 217/354 [19:48<12:33,  5.50s/it][A

	loss_cls: tensor(0.5231, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0703, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5935, device='cuda:0', grad_fn=<AddBackward0>)



 62%|██████▏   | 218/354 [19:53<12:29,  5.51s/it][A

	loss_cls: tensor(0.5204, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1755, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6959, device='cuda:0', grad_fn=<AddBackward0>)



 62%|██████▏   | 219/354 [19:59<12:22,  5.50s/it][A

	loss_cls: tensor(0.7753, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1968, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9721, device='cuda:0', grad_fn=<AddBackward0>)



 62%|██████▏   | 220/354 [20:04<12:15,  5.49s/it][A

	loss_cls: tensor(0.4218, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1063, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5281, device='cuda:0', grad_fn=<AddBackward0>)



 62%|██████▏   | 221/354 [20:10<12:12,  5.50s/it][A

	loss_cls: tensor(0.6822, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2718, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9540, device='cuda:0', grad_fn=<AddBackward0>)



 63%|██████▎   | 222/354 [20:15<12:04,  5.49s/it][A

	loss_cls: tensor(0.4273, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5118, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9391, device='cuda:0', grad_fn=<AddBackward0>)



 63%|██████▎   | 223/354 [20:21<12:01,  5.51s/it][A

	loss_cls: tensor(0.7076, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3954, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1030, device='cuda:0', grad_fn=<AddBackward0>)



 63%|██████▎   | 224/354 [20:26<11:53,  5.49s/it][A

	loss_cls: tensor(0.4565, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0723, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5288, device='cuda:0', grad_fn=<AddBackward0>)



 64%|██████▎   | 225/354 [20:32<11:51,  5.52s/it][A

	loss_cls: tensor(0.7842, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1177, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9019, device='cuda:0', grad_fn=<AddBackward0>)



 64%|██████▍   | 226/354 [20:37<11:40,  5.48s/it][A

	loss_cls: tensor(0.3963, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1472, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5435, device='cuda:0', grad_fn=<AddBackward0>)



 64%|██████▍   | 227/354 [20:43<11:35,  5.47s/it][A

	loss_cls: tensor(0.7215, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3072, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0286, device='cuda:0', grad_fn=<AddBackward0>)



 64%|██████▍   | 228/354 [20:48<11:31,  5.49s/it][A

	loss_cls: tensor(0.5337, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0433, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5770, device='cuda:0', grad_fn=<AddBackward0>)



 65%|██████▍   | 229/354 [20:54<11:24,  5.48s/it][A

	loss_cls: tensor(0.4537, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1382, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5919, device='cuda:0', grad_fn=<AddBackward0>)



 65%|██████▍   | 230/354 [20:59<11:21,  5.50s/it][A

	loss_cls: tensor(0.4019, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3554, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7573, device='cuda:0', grad_fn=<AddBackward0>)



 65%|██████▌   | 231/354 [21:05<11:14,  5.49s/it][A

	loss_cls: tensor(0.6196, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2870, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9066, device='cuda:0', grad_fn=<AddBackward0>)



 66%|██████▌   | 232/354 [21:10<11:11,  5.50s/it][A

	loss_cls: tensor(0.6468, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2484, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8951, device='cuda:0', grad_fn=<AddBackward0>)



 66%|██████▌   | 233/354 [21:16<10:57,  5.44s/it][A

	loss_cls: tensor(0.6344, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2070, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8414, device='cuda:0', grad_fn=<AddBackward0>)



 66%|██████▌   | 234/354 [21:21<10:46,  5.39s/it][A

	loss_cls: tensor(0.5357, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3188, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8545, device='cuda:0', grad_fn=<AddBackward0>)



 66%|██████▋   | 235/354 [21:26<10:39,  5.37s/it][A

	loss_cls: tensor(0.7424, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1766, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9190, device='cuda:0', grad_fn=<AddBackward0>)



 67%|██████▋   | 236/354 [21:31<10:30,  5.34s/it][A

	loss_cls: tensor(0.4797, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3634, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8430, device='cuda:0', grad_fn=<AddBackward0>)



 67%|██████▋   | 237/354 [21:37<10:24,  5.34s/it][A

	loss_cls: tensor(0.6934, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1209, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8143, device='cuda:0', grad_fn=<AddBackward0>)



 67%|██████▋   | 238/354 [21:42<10:16,  5.32s/it][A

	loss_cls: tensor(0.4119, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1414, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5533, device='cuda:0', grad_fn=<AddBackward0>)



 68%|██████▊   | 239/354 [21:47<10:14,  5.34s/it][A

	loss_cls: tensor(0.4245, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3499, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7743, device='cuda:0', grad_fn=<AddBackward0>)



 68%|██████▊   | 240/354 [21:53<10:07,  5.33s/it][A

	loss_cls: tensor(0.3911, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3801, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7713, device='cuda:0', grad_fn=<AddBackward0>)



 68%|██████▊   | 241/354 [21:58<10:00,  5.31s/it][A

	loss_cls: tensor(0.7046, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1305, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8351, device='cuda:0', grad_fn=<AddBackward0>)



 68%|██████▊   | 242/354 [22:03<09:55,  5.32s/it][A

	loss_cls: tensor(0.5862, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1489, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7351, device='cuda:0', grad_fn=<AddBackward0>)



 69%|██████▊   | 243/354 [22:09<09:48,  5.30s/it][A

	loss_cls: tensor(0.5993, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1110, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7103, device='cuda:0', grad_fn=<AddBackward0>)



 69%|██████▉   | 244/354 [22:14<09:43,  5.31s/it][A

	loss_cls: tensor(0.3281, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1440, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4721, device='cuda:0', grad_fn=<AddBackward0>)



 69%|██████▉   | 245/354 [22:19<09:37,  5.30s/it][A

	loss_cls: tensor(0.5398, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2435, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7832, device='cuda:0', grad_fn=<AddBackward0>)



 69%|██████▉   | 246/354 [22:25<09:34,  5.32s/it][A

	loss_cls: tensor(0.8463, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4446, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2909, device='cuda:0', grad_fn=<AddBackward0>)



 70%|██████▉   | 247/354 [22:30<09:28,  5.32s/it][A

	loss_cls: tensor(0.4578, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4330, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8908, device='cuda:0', grad_fn=<AddBackward0>)



 70%|███████   | 248/354 [22:35<09:22,  5.31s/it][A

	loss_cls: tensor(0.6233, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2077, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8310, device='cuda:0', grad_fn=<AddBackward0>)



 70%|███████   | 249/354 [22:41<09:21,  5.35s/it][A

	loss_cls: tensor(0.3661, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2931, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6592, device='cuda:0', grad_fn=<AddBackward0>)



 71%|███████   | 250/354 [22:46<09:15,  5.34s/it][A

	loss_cls: tensor(0.8350, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3050, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1400, device='cuda:0', grad_fn=<AddBackward0>)



 71%|███████   | 251/354 [22:51<09:12,  5.36s/it][A

	loss_cls: tensor(0.5212, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2410, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7622, device='cuda:0', grad_fn=<AddBackward0>)



 71%|███████   | 252/354 [22:57<09:04,  5.34s/it][A

	loss_cls: tensor(0.4078, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4396, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8475, device='cuda:0', grad_fn=<AddBackward0>)



 71%|███████▏  | 253/354 [23:02<09:00,  5.36s/it][A

	loss_cls: tensor(0.5980, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2279, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8259, device='cuda:0', grad_fn=<AddBackward0>)



 72%|███████▏  | 254/354 [23:07<08:54,  5.35s/it][A

	loss_cls: tensor(0.5776, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3292, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9068, device='cuda:0', grad_fn=<AddBackward0>)



 72%|███████▏  | 255/354 [23:13<08:48,  5.34s/it][A

	loss_cls: tensor(0.6159, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3100, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9259, device='cuda:0', grad_fn=<AddBackward0>)



 72%|███████▏  | 256/354 [23:18<08:44,  5.35s/it][A

	loss_cls: tensor(0.8357, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2850, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1207, device='cuda:0', grad_fn=<AddBackward0>)



 73%|███████▎  | 257/354 [23:23<08:37,  5.34s/it][A

	loss_cls: tensor(0.5413, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2804, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8217, device='cuda:0', grad_fn=<AddBackward0>)



 73%|███████▎  | 258/354 [23:29<08:34,  5.36s/it][A

	loss_cls: tensor(0.6513, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2041, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8554, device='cuda:0', grad_fn=<AddBackward0>)



 73%|███████▎  | 259/354 [23:34<08:28,  5.36s/it][A

	loss_cls: tensor(0.4945, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2148, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7093, device='cuda:0', grad_fn=<AddBackward0>)



 73%|███████▎  | 260/354 [23:40<08:24,  5.37s/it][A

	loss_cls: tensor(1.0722, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4134, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.4856, device='cuda:0', grad_fn=<AddBackward0>)



 74%|███████▎  | 261/354 [23:45<08:18,  5.36s/it][A

	loss_cls: tensor(0.5699, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3900, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9599, device='cuda:0', grad_fn=<AddBackward0>)



 74%|███████▍  | 262/354 [23:50<08:12,  5.35s/it][A

	loss_cls: tensor(0.7116, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3907, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1023, device='cuda:0', grad_fn=<AddBackward0>)



 74%|███████▍  | 263/354 [23:56<08:09,  5.38s/it][A

	loss_cls: tensor(0.4651, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2282, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6934, device='cuda:0', grad_fn=<AddBackward0>)



 75%|███████▍  | 264/354 [24:01<08:02,  5.36s/it][A

	loss_cls: tensor(0.6667, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2585, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9253, device='cuda:0', grad_fn=<AddBackward0>)



 75%|███████▍  | 265/354 [24:06<07:58,  5.38s/it][A

	loss_cls: tensor(0.5032, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.7407, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2439, device='cuda:0', grad_fn=<AddBackward0>)



 75%|███████▌  | 266/354 [24:12<07:52,  5.37s/it][A

	loss_cls: tensor(0.4609, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3567, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8176, device='cuda:0', grad_fn=<AddBackward0>)



 75%|███████▌  | 267/354 [24:17<07:48,  5.39s/it][A

	loss_cls: tensor(0.4812, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2262, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7073, device='cuda:0', grad_fn=<AddBackward0>)



 76%|███████▌  | 268/354 [24:22<07:41,  5.36s/it][A

	loss_cls: tensor(0.5921, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3688, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9608, device='cuda:0', grad_fn=<AddBackward0>)



 76%|███████▌  | 269/354 [24:28<07:35,  5.35s/it][A

	loss_cls: tensor(0.3608, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.6027, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9636, device='cuda:0', grad_fn=<AddBackward0>)



 76%|███████▋  | 270/354 [24:33<07:31,  5.38s/it][A

	loss_cls: tensor(0.5745, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2646, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8391, device='cuda:0', grad_fn=<AddBackward0>)



 77%|███████▋  | 271/354 [24:39<07:24,  5.36s/it][A

	loss_cls: tensor(1.0584, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1005, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1589, device='cuda:0', grad_fn=<AddBackward0>)



 77%|███████▋  | 272/354 [24:44<07:22,  5.40s/it][A

	loss_cls: tensor(0.7637, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1715, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9352, device='cuda:0', grad_fn=<AddBackward0>)



 77%|███████▋  | 273/354 [24:50<07:19,  5.42s/it][A

	loss_cls: tensor(0.4913, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3540, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8453, device='cuda:0', grad_fn=<AddBackward0>)



 77%|███████▋  | 274/354 [24:55<07:15,  5.44s/it][A

	loss_cls: tensor(0.6835, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3167, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0002, device='cuda:0', grad_fn=<AddBackward0>)



 78%|███████▊  | 275/354 [25:00<07:08,  5.43s/it][A

	loss_cls: tensor(0.5700, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1381, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7081, device='cuda:0', grad_fn=<AddBackward0>)



 78%|███████▊  | 276/354 [25:06<07:02,  5.41s/it][A

	loss_cls: tensor(0.4739, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2189, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6928, device='cuda:0', grad_fn=<AddBackward0>)



 78%|███████▊  | 277/354 [25:11<06:57,  5.42s/it][A

	loss_cls: tensor(0.6705, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3849, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0554, device='cuda:0', grad_fn=<AddBackward0>)



 79%|███████▊  | 278/354 [25:17<06:50,  5.41s/it][A

	loss_cls: tensor(0.6417, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3633, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0049, device='cuda:0', grad_fn=<AddBackward0>)



 79%|███████▉  | 279/354 [25:22<06:47,  5.43s/it][A

	loss_cls: tensor(0.4956, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4354, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9311, device='cuda:0', grad_fn=<AddBackward0>)



 79%|███████▉  | 280/354 [25:27<06:39,  5.41s/it][A

	loss_cls: tensor(0.7623, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2197, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9820, device='cuda:0', grad_fn=<AddBackward0>)



 79%|███████▉  | 281/354 [25:33<06:34,  5.41s/it][A

	loss_cls: tensor(0.5419, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1832, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7251, device='cuda:0', grad_fn=<AddBackward0>)



 80%|███████▉  | 282/354 [25:38<06:28,  5.39s/it][A

	loss_cls: tensor(0.5573, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2090, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7662, device='cuda:0', grad_fn=<AddBackward0>)



 80%|███████▉  | 283/354 [25:44<06:21,  5.37s/it][A

	loss_cls: tensor(0.7061, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0549, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7610, device='cuda:0', grad_fn=<AddBackward0>)



 80%|████████  | 284/354 [25:49<06:16,  5.38s/it][A

	loss_cls: tensor(0.5808, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1371, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7179, device='cuda:0', grad_fn=<AddBackward0>)



 81%|████████  | 285/354 [25:54<06:10,  5.37s/it][A

	loss_cls: tensor(0.5342, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2154, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7497, device='cuda:0', grad_fn=<AddBackward0>)



 81%|████████  | 286/354 [26:00<06:06,  5.39s/it][A

	loss_cls: tensor(0.4749, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3138, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7888, device='cuda:0', grad_fn=<AddBackward0>)



 81%|████████  | 287/354 [26:05<06:00,  5.38s/it][A

	loss_cls: tensor(0.5075, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1268, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6343, device='cuda:0', grad_fn=<AddBackward0>)



 81%|████████▏ | 288/354 [26:10<05:55,  5.39s/it][A

	loss_cls: tensor(0.5723, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2983, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8706, device='cuda:0', grad_fn=<AddBackward0>)



 82%|████████▏ | 289/354 [26:16<05:49,  5.37s/it][A

	loss_cls: tensor(0.4797, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2943, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7740, device='cuda:0', grad_fn=<AddBackward0>)



 82%|████████▏ | 290/354 [26:21<05:43,  5.36s/it][A

	loss_cls: tensor(0.8220, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1236, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9456, device='cuda:0', grad_fn=<AddBackward0>)



 82%|████████▏ | 291/354 [26:27<05:38,  5.37s/it][A

	loss_cls: tensor(0.5761, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0834, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6595, device='cuda:0', grad_fn=<AddBackward0>)



 82%|████████▏ | 292/354 [26:32<05:32,  5.37s/it][A

	loss_cls: tensor(0.5504, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0554, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6059, device='cuda:0', grad_fn=<AddBackward0>)



 83%|████████▎ | 293/354 [26:37<05:28,  5.39s/it][A

	loss_cls: tensor(0.5336, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1422, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6758, device='cuda:0', grad_fn=<AddBackward0>)



 83%|████████▎ | 294/354 [26:43<05:22,  5.37s/it][A

	loss_cls: tensor(0.5874, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2023, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7897, device='cuda:0', grad_fn=<AddBackward0>)



 83%|████████▎ | 295/354 [26:48<05:17,  5.38s/it][A

	loss_cls: tensor(0.8093, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2710, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0803, device='cuda:0', grad_fn=<AddBackward0>)



 84%|████████▎ | 296/354 [26:53<05:12,  5.38s/it][A

	loss_cls: tensor(0.5483, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2343, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7826, device='cuda:0', grad_fn=<AddBackward0>)



 84%|████████▍ | 297/354 [26:59<05:05,  5.37s/it][A

	loss_cls: tensor(0.5043, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0643, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5685, device='cuda:0', grad_fn=<AddBackward0>)



 84%|████████▍ | 298/354 [27:04<05:00,  5.37s/it][A

	loss_cls: tensor(0.4884, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1409, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6293, device='cuda:0', grad_fn=<AddBackward0>)



 84%|████████▍ | 299/354 [27:10<04:55,  5.37s/it][A

	loss_cls: tensor(0.4760, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1970, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6730, device='cuda:0', grad_fn=<AddBackward0>)



 85%|████████▍ | 300/354 [27:15<04:51,  5.39s/it][A

	loss_cls: tensor(0.5597, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4684, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0281, device='cuda:0', grad_fn=<AddBackward0>)



 85%|████████▌ | 301/354 [27:20<04:45,  5.38s/it][A

	loss_cls: tensor(0.7909, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1964, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9873, device='cuda:0', grad_fn=<AddBackward0>)



 85%|████████▌ | 302/354 [27:26<04:40,  5.39s/it][A

	loss_cls: tensor(0.7053, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0978, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8031, device='cuda:0', grad_fn=<AddBackward0>)



 86%|████████▌ | 303/354 [27:31<04:33,  5.36s/it][A

	loss_cls: tensor(0.3600, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1956, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5556, device='cuda:0', grad_fn=<AddBackward0>)



 86%|████████▌ | 304/354 [27:36<04:26,  5.34s/it][A

	loss_cls: tensor(0.4157, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0319, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4476, device='cuda:0', grad_fn=<AddBackward0>)



 86%|████████▌ | 305/354 [27:42<04:22,  5.36s/it][A

	loss_cls: tensor(0.7181, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2483, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9664, device='cuda:0', grad_fn=<AddBackward0>)



 86%|████████▋ | 306/354 [27:47<04:16,  5.35s/it][A

	loss_cls: tensor(0.3131, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3103, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6233, device='cuda:0', grad_fn=<AddBackward0>)



 87%|████████▋ | 307/354 [27:53<04:12,  5.38s/it][A

	loss_cls: tensor(0.8122, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0816, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8938, device='cuda:0', grad_fn=<AddBackward0>)



 87%|████████▋ | 308/354 [27:58<04:06,  5.37s/it][A

	loss_cls: tensor(0.6625, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1199, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7824, device='cuda:0', grad_fn=<AddBackward0>)



 87%|████████▋ | 309/354 [28:03<04:02,  5.38s/it][A

	loss_cls: tensor(0.8459, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3034, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1493, device='cuda:0', grad_fn=<AddBackward0>)



 88%|████████▊ | 310/354 [28:09<03:55,  5.36s/it][A

	loss_cls: tensor(0.5977, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1909, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7886, device='cuda:0', grad_fn=<AddBackward0>)



 88%|████████▊ | 311/354 [28:14<03:50,  5.37s/it][A

	loss_cls: tensor(0.6214, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1542, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7756, device='cuda:0', grad_fn=<AddBackward0>)



 88%|████████▊ | 312/354 [28:19<03:45,  5.36s/it][A

	loss_cls: tensor(0.7668, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1718, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9387, device='cuda:0', grad_fn=<AddBackward0>)



 88%|████████▊ | 313/354 [28:25<03:39,  5.36s/it][A

	loss_cls: tensor(0.5721, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1367, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7088, device='cuda:0', grad_fn=<AddBackward0>)



 89%|████████▊ | 314/354 [28:30<03:35,  5.38s/it][A

	loss_cls: tensor(0.5549, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2278, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7827, device='cuda:0', grad_fn=<AddBackward0>)



 89%|████████▉ | 315/354 [28:35<03:29,  5.37s/it][A

	loss_cls: tensor(0.4178, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1847, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6025, device='cuda:0', grad_fn=<AddBackward0>)



 89%|████████▉ | 316/354 [28:41<03:24,  5.38s/it][A

	loss_cls: tensor(0.5890, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1589, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7479, device='cuda:0', grad_fn=<AddBackward0>)



 90%|████████▉ | 317/354 [28:46<03:18,  5.37s/it][A

	loss_cls: tensor(0.6651, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1257, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7908, device='cuda:0', grad_fn=<AddBackward0>)



 90%|████████▉ | 318/354 [28:52<03:13,  5.38s/it][A

	loss_cls: tensor(0.5291, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1730, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7021, device='cuda:0', grad_fn=<AddBackward0>)



 90%|█████████ | 319/354 [28:57<03:08,  5.39s/it][A

	loss_cls: tensor(0.5413, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1707, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7120, device='cuda:0', grad_fn=<AddBackward0>)



 90%|█████████ | 320/354 [29:02<03:02,  5.37s/it][A

	loss_cls: tensor(0.3562, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2384, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5946, device='cuda:0', grad_fn=<AddBackward0>)



 91%|█████████ | 321/354 [29:08<02:57,  5.38s/it][A

	loss_cls: tensor(0.4174, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2730, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6904, device='cuda:0', grad_fn=<AddBackward0>)



 91%|█████████ | 322/354 [29:13<02:51,  5.37s/it][A

	loss_cls: tensor(0.5411, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0733, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6144, device='cuda:0', grad_fn=<AddBackward0>)



 91%|█████████ | 323/354 [29:18<02:46,  5.38s/it][A

	loss_cls: tensor(0.8236, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2341, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0577, device='cuda:0', grad_fn=<AddBackward0>)



 92%|█████████▏| 324/354 [29:24<02:40,  5.35s/it][A

	loss_cls: tensor(0.5780, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3213, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8992, device='cuda:0', grad_fn=<AddBackward0>)



 92%|█████████▏| 325/354 [29:29<02:36,  5.38s/it][A

	loss_cls: tensor(0.8870, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1695, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0565, device='cuda:0', grad_fn=<AddBackward0>)



 92%|█████████▏| 326/354 [29:35<02:30,  5.37s/it][A

	loss_cls: tensor(0.4528, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3934, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8462, device='cuda:0', grad_fn=<AddBackward0>)



 92%|█████████▏| 327/354 [29:40<02:24,  5.36s/it][A

	loss_cls: tensor(0.6737, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4802, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1539, device='cuda:0', grad_fn=<AddBackward0>)



 93%|█████████▎| 328/354 [29:45<02:19,  5.38s/it][A

	loss_cls: tensor(0.5589, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1978, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7567, device='cuda:0', grad_fn=<AddBackward0>)



 93%|█████████▎| 329/354 [29:51<02:14,  5.37s/it][A

	loss_cls: tensor(0.5737, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3591, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9328, device='cuda:0', grad_fn=<AddBackward0>)



 93%|█████████▎| 330/354 [29:56<02:09,  5.39s/it][A

	loss_cls: tensor(0.3313, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2911, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6224, device='cuda:0', grad_fn=<AddBackward0>)



 94%|█████████▎| 331/354 [30:01<02:03,  5.37s/it][A

	loss_cls: tensor(0.6567, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1099, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7665, device='cuda:0', grad_fn=<AddBackward0>)



 94%|█████████▍| 332/354 [30:07<01:58,  5.39s/it][A

	loss_cls: tensor(0.4805, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2309, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7115, device='cuda:0', grad_fn=<AddBackward0>)



 94%|█████████▍| 333/354 [30:12<01:52,  5.37s/it][A

	loss_cls: tensor(0.4148, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3553, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7701, device='cuda:0', grad_fn=<AddBackward0>)



 94%|█████████▍| 334/354 [30:18<01:47,  5.36s/it][A

	loss_cls: tensor(0.4372, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4461, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8833, device='cuda:0', grad_fn=<AddBackward0>)



 95%|█████████▍| 335/354 [30:23<01:42,  5.37s/it][A

	loss_cls: tensor(0.4197, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1286, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5483, device='cuda:0', grad_fn=<AddBackward0>)



 95%|█████████▍| 336/354 [30:28<01:36,  5.36s/it][A

	loss_cls: tensor(0.4642, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1947, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6589, device='cuda:0', grad_fn=<AddBackward0>)



 95%|█████████▌| 337/354 [30:34<01:31,  5.36s/it][A

	loss_cls: tensor(0.7661, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1750, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9411, device='cuda:0', grad_fn=<AddBackward0>)



 95%|█████████▌| 338/354 [30:39<01:25,  5.35s/it][A

	loss_cls: tensor(0.5256, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2876, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8132, device='cuda:0', grad_fn=<AddBackward0>)



 96%|█████████▌| 339/354 [30:44<01:20,  5.37s/it][A

	loss_cls: tensor(0.5305, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1838, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7143, device='cuda:0', grad_fn=<AddBackward0>)



 96%|█████████▌| 340/354 [30:50<01:14,  5.36s/it][A

	loss_cls: tensor(0.5966, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2416, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8382, device='cuda:0', grad_fn=<AddBackward0>)



 96%|█████████▋| 341/354 [30:55<01:09,  5.34s/it][A

	loss_cls: tensor(0.3579, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0718, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4297, device='cuda:0', grad_fn=<AddBackward0>)



 97%|█████████▋| 342/354 [31:00<01:04,  5.37s/it][A

	loss_cls: tensor(0.4867, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2962, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7828, device='cuda:0', grad_fn=<AddBackward0>)



 97%|█████████▋| 343/354 [31:06<00:59,  5.38s/it][A

	loss_cls: tensor(0.7824, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0996, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8821, device='cuda:0', grad_fn=<AddBackward0>)



 97%|█████████▋| 344/354 [31:11<00:53,  5.40s/it][A

	loss_cls: tensor(0.8936, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1376, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0311, device='cuda:0', grad_fn=<AddBackward0>)



 97%|█████████▋| 345/354 [31:17<00:48,  5.38s/it][A

	loss_cls: tensor(0.4635, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2808, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7443, device='cuda:0', grad_fn=<AddBackward0>)



 98%|█████████▊| 346/354 [31:22<00:43,  5.40s/it][A

	loss_cls: tensor(0.4751, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1807, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6558, device='cuda:0', grad_fn=<AddBackward0>)



 98%|█████████▊| 347/354 [31:27<00:37,  5.38s/it][A

	loss_cls: tensor(0.2871, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1125, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.3995, device='cuda:0', grad_fn=<AddBackward0>)



 98%|█████████▊| 348/354 [31:33<00:32,  5.37s/it][A

	loss_cls: tensor(0.4985, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3498, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8483, device='cuda:0', grad_fn=<AddBackward0>)



 99%|█████████▊| 349/354 [31:38<00:26,  5.39s/it][A

	loss_cls: tensor(0.5962, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5423, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1385, device='cuda:0', grad_fn=<AddBackward0>)



 99%|█████████▉| 350/354 [31:44<00:21,  5.37s/it][A

	loss_cls: tensor(0.4137, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2320, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6457, device='cuda:0', grad_fn=<AddBackward0>)



 99%|█████████▉| 351/354 [31:49<00:16,  5.39s/it][A

	loss_cls: tensor(0.4765, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0692, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5458, device='cuda:0', grad_fn=<AddBackward0>)



 99%|█████████▉| 352/354 [31:54<00:10,  5.37s/it][A

	loss_cls: tensor(0.3286, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5579, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8866, device='cuda:0', grad_fn=<AddBackward0>)



100%|█████████▉| 353/354 [32:00<00:05,  5.38s/it][A

	loss_cls: tensor(0.6496, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2730, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9227, device='cuda:0', grad_fn=<AddBackward0>)



100%|██████████| 354/354 [32:02<00:00,  5.43s/it][A

  0%|          | 0/177 [00:00<?, ?it/s][A

	loss_cls: tensor(0.7099, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4472, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1571, device='cuda:0', grad_fn=<AddBackward0>)

	Training Loss: 0.8161180585117663

	Training cls acc: 0.6926789077212806

	Training cls prec: 0.5771439757668572

	Training cls rec: 0.6228228457042017

	Training cls f1: 0.5390643725641036

--
	Training ner acc: 0.9548435326445638

	Training ner prec: 0.26742451338046797

	Training ner rec: 0.27665284402827833

	Training ner f1: 0.2718338962678999

	Current Learning rate:  0.00031428571428571427



  1%|          | 1/177 [00:00<02:14,  1.30it/s][A
  1%|          | 2/177 [00:01<02:01,  1.44it/s][A
  2%|▏         | 3/177 [00:02<02:03,  1.41it/s][A
  2%|▏         | 4/177 [00:02<02:04,  1.39it/s][A
  3%|▎         | 5/177 [00:03<02:04,  1.38it/s][A
  3%|▎         | 6/177 [00:04<01:59,  1.44it/s][A
  4%|▍         | 7/177 [00:04<02:00,  1.41it/s][A
  5%|▍         | 8/177 [00:05<02:01,  1.40it/s][A
  5%|▌         | 9/177 [00:06<01:56,  1.44it/s][A
  6%|▌         | 10/177 [00:07<01:56,  1.43it/s][A
  6%|▌         | 11/177 [00:07<01:57,  1.41it/s][A
  7%|▋         | 12/177 [00:08<01:58,  1.40it/s][A
  7%|▋         | 13/177 [00:09<01:53,  1.44it/s][A
  8%|▊         | 14/177 [00:09<01:54,  1.42it/s][A
  8%|▊         | 15/177 [00:10<01:55,  1.40it/s][A
  9%|▉         | 16/177 [00:11<01:55,  1.39it/s][A
 10%|▉         | 17/177 [00:12<01:52,  1.43it/s][A
 10%|█         | 18/177 [00:12<01:53,  1.40it/s][A
 11%|█         | 19/177 [00:13<01:53,  1.39it/s][A
 11%|█▏        | 20/


	Validation Loss: 0.7659513252434758

	Validation cls acc: 0.7535310734463276

	Validation cls prec: 0.6177495291902072

	Validation cls rec: 0.6157855797686306

	Validation cls f1: 0.5875537833164951

--
	Validation ner acc: 0.954700932586836

	Validation ner prec: 0.4144124103368515

	Validation ner rec: 0.42476459510357817

	Validation ner f1: 0.4193768269710225



  0%|          | 1/354 [00:05<31:56,  5.43s/it][A

	loss_cls: tensor(0.7649, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1372, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9021, device='cuda:0', grad_fn=<AddBackward0>)



  1%|          | 2/354 [00:10<31:30,  5.37s/it][A

	loss_cls: tensor(0.4626, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2979, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7605, device='cuda:0', grad_fn=<AddBackward0>)



  1%|          | 3/354 [00:16<31:36,  5.40s/it][A

	loss_cls: tensor(1.1712, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1652, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.3364, device='cuda:0', grad_fn=<AddBackward0>)



  1%|          | 4/354 [00:21<31:19,  5.37s/it][A

	loss_cls: tensor(1.2781, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1672, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.4453, device='cuda:0', grad_fn=<AddBackward0>)



  1%|▏         | 5/354 [00:26<31:20,  5.39s/it][A

	loss_cls: tensor(0.7234, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1394, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8628, device='cuda:0', grad_fn=<AddBackward0>)



  2%|▏         | 6/354 [00:32<31:06,  5.36s/it][A

	loss_cls: tensor(0.5318, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2382, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7700, device='cuda:0', grad_fn=<AddBackward0>)



  2%|▏         | 7/354 [00:37<31:03,  5.37s/it][A

	loss_cls: tensor(0.6948, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2902, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9850, device='cuda:0', grad_fn=<AddBackward0>)



  2%|▏         | 8/354 [00:42<30:56,  5.37s/it][A

	loss_cls: tensor(0.5015, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2995, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8010, device='cuda:0', grad_fn=<AddBackward0>)



  3%|▎         | 9/354 [00:48<30:46,  5.35s/it][A

	loss_cls: tensor(0.7479, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0989, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8468, device='cuda:0', grad_fn=<AddBackward0>)



  3%|▎         | 10/354 [00:53<30:49,  5.38s/it][A

	loss_cls: tensor(0.4998, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3218, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8216, device='cuda:0', grad_fn=<AddBackward0>)



  3%|▎         | 11/354 [00:59<30:37,  5.36s/it][A

	loss_cls: tensor(0.5709, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1760, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7470, device='cuda:0', grad_fn=<AddBackward0>)



  3%|▎         | 12/354 [01:04<30:43,  5.39s/it][A

	loss_cls: tensor(0.5579, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4029, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9608, device='cuda:0', grad_fn=<AddBackward0>)



  4%|▎         | 13/354 [01:09<30:34,  5.38s/it][A

	loss_cls: tensor(0.4668, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3185, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7853, device='cuda:0', grad_fn=<AddBackward0>)



  4%|▍         | 14/354 [01:15<30:32,  5.39s/it][A

	loss_cls: tensor(0.6339, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0817, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7156, device='cuda:0', grad_fn=<AddBackward0>)



  4%|▍         | 15/354 [01:20<30:18,  5.36s/it][A

	loss_cls: tensor(0.4964, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1608, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6572, device='cuda:0', grad_fn=<AddBackward0>)



  5%|▍         | 16/354 [01:25<30:05,  5.34s/it][A

	loss_cls: tensor(0.5297, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3439, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8737, device='cuda:0', grad_fn=<AddBackward0>)



  5%|▍         | 17/354 [01:31<30:06,  5.36s/it][A

	loss_cls: tensor(0.4937, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2377, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7314, device='cuda:0', grad_fn=<AddBackward0>)



  5%|▌         | 18/354 [01:36<29:55,  5.34s/it][A

	loss_cls: tensor(0.6390, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4933, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1324, device='cuda:0', grad_fn=<AddBackward0>)



  5%|▌         | 19/354 [01:41<29:54,  5.36s/it][A

	loss_cls: tensor(0.5854, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1671, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7525, device='cuda:0', grad_fn=<AddBackward0>)



  6%|▌         | 20/354 [01:47<29:41,  5.33s/it][A

	loss_cls: tensor(0.5528, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3306, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8834, device='cuda:0', grad_fn=<AddBackward0>)



  6%|▌         | 21/354 [01:52<29:39,  5.34s/it][A

	loss_cls: tensor(0.4348, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2612, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6960, device='cuda:0', grad_fn=<AddBackward0>)



  6%|▌         | 22/354 [01:57<29:32,  5.34s/it][A

	loss_cls: tensor(0.9067, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1228, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0294, device='cuda:0', grad_fn=<AddBackward0>)



  6%|▋         | 23/354 [02:03<29:24,  5.33s/it][A

	loss_cls: tensor(0.9547, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2515, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2062, device='cuda:0', grad_fn=<AddBackward0>)



  7%|▋         | 24/354 [02:08<29:23,  5.34s/it][A

	loss_cls: tensor(0.4488, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1306, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5793, device='cuda:0', grad_fn=<AddBackward0>)



  7%|▋         | 25/354 [02:13<29:13,  5.33s/it][A

	loss_cls: tensor(0.7297, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0707, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8004, device='cuda:0', grad_fn=<AddBackward0>)



  7%|▋         | 26/354 [02:19<29:12,  5.34s/it][A

	loss_cls: tensor(0.6785, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1915, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8701, device='cuda:0', grad_fn=<AddBackward0>)



  8%|▊         | 27/354 [02:24<29:01,  5.33s/it][A

	loss_cls: tensor(0.6668, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3674, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0342, device='cuda:0', grad_fn=<AddBackward0>)



  8%|▊         | 28/354 [02:29<29:02,  5.35s/it][A

	loss_cls: tensor(0.5380, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1457, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6837, device='cuda:0', grad_fn=<AddBackward0>)



  8%|▊         | 29/354 [02:35<28:54,  5.34s/it][A

	loss_cls: tensor(0.8365, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2587, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0951, device='cuda:0', grad_fn=<AddBackward0>)



  8%|▊         | 30/354 [02:40<28:46,  5.33s/it][A

	loss_cls: tensor(0.7406, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0855, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8260, device='cuda:0', grad_fn=<AddBackward0>)



  9%|▉         | 31/354 [02:45<28:44,  5.34s/it][A

	loss_cls: tensor(0.7102, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3392, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0495, device='cuda:0', grad_fn=<AddBackward0>)



  9%|▉         | 32/354 [02:51<28:33,  5.32s/it][A

	loss_cls: tensor(0.5524, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2467, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7991, device='cuda:0', grad_fn=<AddBackward0>)



  9%|▉         | 33/354 [02:56<28:32,  5.34s/it][A

	loss_cls: tensor(0.4542, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1727, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6269, device='cuda:0', grad_fn=<AddBackward0>)



 10%|▉         | 34/354 [03:01<28:24,  5.33s/it][A

	loss_cls: tensor(0.8315, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2026, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0341, device='cuda:0', grad_fn=<AddBackward0>)



 10%|▉         | 35/354 [03:07<28:28,  5.36s/it][A

	loss_cls: tensor(0.6865, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0403, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7268, device='cuda:0', grad_fn=<AddBackward0>)



 10%|█         | 36/354 [03:12<28:20,  5.35s/it][A

	loss_cls: tensor(0.5811, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2895, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8706, device='cuda:0', grad_fn=<AddBackward0>)



 10%|█         | 37/354 [03:18<28:14,  5.34s/it][A

	loss_cls: tensor(0.8436, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2874, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1309, device='cuda:0', grad_fn=<AddBackward0>)



 11%|█         | 38/354 [03:23<28:14,  5.36s/it][A

	loss_cls: tensor(0.6295, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3351, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9646, device='cuda:0', grad_fn=<AddBackward0>)



 11%|█         | 39/354 [03:28<28:08,  5.36s/it][A

	loss_cls: tensor(0.6857, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4578, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1436, device='cuda:0', grad_fn=<AddBackward0>)



 11%|█▏        | 40/354 [03:34<28:07,  5.37s/it][A

	loss_cls: tensor(0.7022, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2148, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9170, device='cuda:0', grad_fn=<AddBackward0>)



 12%|█▏        | 41/354 [03:39<27:58,  5.36s/it][A

	loss_cls: tensor(0.7028, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0643, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7671, device='cuda:0', grad_fn=<AddBackward0>)



 12%|█▏        | 42/354 [03:44<27:58,  5.38s/it][A

	loss_cls: tensor(0.6963, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1079, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8042, device='cuda:0', grad_fn=<AddBackward0>)



 12%|█▏        | 43/354 [03:50<27:48,  5.37s/it][A

	loss_cls: tensor(0.6524, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2523, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9046, device='cuda:0', grad_fn=<AddBackward0>)



 12%|█▏        | 44/354 [03:55<27:40,  5.36s/it][A

	loss_cls: tensor(0.5005, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3581, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8585, device='cuda:0', grad_fn=<AddBackward0>)



 13%|█▎        | 45/354 [04:01<27:37,  5.37s/it][A

	loss_cls: tensor(0.5650, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2090, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7741, device='cuda:0', grad_fn=<AddBackward0>)



 13%|█▎        | 46/354 [04:06<27:27,  5.35s/it][A

	loss_cls: tensor(0.7464, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4016, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1480, device='cuda:0', grad_fn=<AddBackward0>)



 13%|█▎        | 47/354 [04:11<27:28,  5.37s/it][A

	loss_cls: tensor(0.6335, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0816, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7151, device='cuda:0', grad_fn=<AddBackward0>)



 14%|█▎        | 48/354 [04:17<27:20,  5.36s/it][A

	loss_cls: tensor(0.5446, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3184, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8630, device='cuda:0', grad_fn=<AddBackward0>)



 14%|█▍        | 49/354 [04:22<27:20,  5.38s/it][A

	loss_cls: tensor(0.5453, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2750, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8203, device='cuda:0', grad_fn=<AddBackward0>)



 14%|█▍        | 50/354 [04:27<27:11,  5.37s/it][A

	loss_cls: tensor(0.5933, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2894, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8828, device='cuda:0', grad_fn=<AddBackward0>)



 14%|█▍        | 51/354 [04:33<27:02,  5.35s/it][A

	loss_cls: tensor(0.8529, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2321, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0851, device='cuda:0', grad_fn=<AddBackward0>)



 15%|█▍        | 52/354 [04:38<27:00,  5.37s/it][A

	loss_cls: tensor(0.4663, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5194, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9857, device='cuda:0', grad_fn=<AddBackward0>)



 15%|█▍        | 53/354 [04:43<26:52,  5.36s/it][A

	loss_cls: tensor(0.6598, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3674, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0272, device='cuda:0', grad_fn=<AddBackward0>)



 15%|█▌        | 54/354 [04:49<26:54,  5.38s/it][A

	loss_cls: tensor(0.5860, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2111, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7971, device='cuda:0', grad_fn=<AddBackward0>)



 16%|█▌        | 55/354 [04:54<26:47,  5.38s/it][A

	loss_cls: tensor(0.5638, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1680, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7318, device='cuda:0', grad_fn=<AddBackward0>)



 16%|█▌        | 56/354 [05:00<26:46,  5.39s/it][A

	loss_cls: tensor(0.5817, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1179, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6996, device='cuda:0', grad_fn=<AddBackward0>)



 16%|█▌        | 57/354 [05:05<26:35,  5.37s/it][A

	loss_cls: tensor(0.7046, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1762, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8808, device='cuda:0', grad_fn=<AddBackward0>)



 16%|█▋        | 58/354 [05:10<26:27,  5.36s/it][A

	loss_cls: tensor(0.9341, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1354, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0695, device='cuda:0', grad_fn=<AddBackward0>)



 17%|█▋        | 59/354 [05:16<26:28,  5.38s/it][A

	loss_cls: tensor(0.7180, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2001, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9182, device='cuda:0', grad_fn=<AddBackward0>)



 17%|█▋        | 60/354 [05:21<26:17,  5.36s/it][A

	loss_cls: tensor(0.6966, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1207, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8172, device='cuda:0', grad_fn=<AddBackward0>)



 17%|█▋        | 61/354 [05:26<26:19,  5.39s/it][A

	loss_cls: tensor(0.4641, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1419, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6060, device='cuda:0', grad_fn=<AddBackward0>)



 18%|█▊        | 62/354 [05:32<26:11,  5.38s/it][A

	loss_cls: tensor(0.7404, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2163, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9568, device='cuda:0', grad_fn=<AddBackward0>)



 18%|█▊        | 63/354 [05:37<26:11,  5.40s/it][A

	loss_cls: tensor(0.6221, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0685, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6905, device='cuda:0', grad_fn=<AddBackward0>)



 18%|█▊        | 64/354 [05:43<25:59,  5.38s/it][A

	loss_cls: tensor(0.7043, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1652, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8695, device='cuda:0', grad_fn=<AddBackward0>)



 18%|█▊        | 65/354 [05:48<25:52,  5.37s/it][A

	loss_cls: tensor(0.7967, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3334, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1301, device='cuda:0', grad_fn=<AddBackward0>)



 19%|█▊        | 66/354 [05:53<25:49,  5.38s/it][A

	loss_cls: tensor(0.8285, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1851, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0136, device='cuda:0', grad_fn=<AddBackward0>)



 19%|█▉        | 67/354 [05:59<25:41,  5.37s/it][A

	loss_cls: tensor(0.4961, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1433, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6394, device='cuda:0', grad_fn=<AddBackward0>)



 19%|█▉        | 68/354 [06:04<25:40,  5.39s/it][A

	loss_cls: tensor(0.6669, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1600, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8270, device='cuda:0', grad_fn=<AddBackward0>)



 19%|█▉        | 69/354 [06:10<25:34,  5.38s/it][A

	loss_cls: tensor(0.7005, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5818, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2823, device='cuda:0', grad_fn=<AddBackward0>)



 20%|█▉        | 70/354 [06:15<25:33,  5.40s/it][A

	loss_cls: tensor(0.4830, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2567, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7397, device='cuda:0', grad_fn=<AddBackward0>)



 20%|██        | 71/354 [06:20<25:24,  5.39s/it][A

	loss_cls: tensor(0.6257, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2529, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8787, device='cuda:0', grad_fn=<AddBackward0>)



 20%|██        | 72/354 [06:26<25:12,  5.36s/it][A

	loss_cls: tensor(0.6277, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3418, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9695, device='cuda:0', grad_fn=<AddBackward0>)



 21%|██        | 73/354 [06:31<25:09,  5.37s/it][A

	loss_cls: tensor(0.7643, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1002, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8645, device='cuda:0', grad_fn=<AddBackward0>)



 21%|██        | 74/354 [06:36<25:00,  5.36s/it][A

	loss_cls: tensor(0.4693, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1515, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6208, device='cuda:0', grad_fn=<AddBackward0>)



 21%|██        | 75/354 [06:42<25:01,  5.38s/it][A

	loss_cls: tensor(0.6036, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1609, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7645, device='cuda:0', grad_fn=<AddBackward0>)



 21%|██▏       | 76/354 [06:47<24:53,  5.37s/it][A

	loss_cls: tensor(0.6828, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1393, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8221, device='cuda:0', grad_fn=<AddBackward0>)



 22%|██▏       | 77/354 [06:53<24:51,  5.39s/it][A

	loss_cls: tensor(0.5931, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1701, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7633, device='cuda:0', grad_fn=<AddBackward0>)



 22%|██▏       | 78/354 [06:58<24:41,  5.37s/it][A

	loss_cls: tensor(0.4995, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1038, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6033, device='cuda:0', grad_fn=<AddBackward0>)



 22%|██▏       | 79/354 [07:03<24:33,  5.36s/it][A

	loss_cls: tensor(0.5410, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1946, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7356, device='cuda:0', grad_fn=<AddBackward0>)



 23%|██▎       | 80/354 [07:09<24:33,  5.38s/it][A

	loss_cls: tensor(0.6134, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0445, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6579, device='cuda:0', grad_fn=<AddBackward0>)



 23%|██▎       | 81/354 [07:14<24:25,  5.37s/it][A

	loss_cls: tensor(0.8830, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2046, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0877, device='cuda:0', grad_fn=<AddBackward0>)



 23%|██▎       | 82/354 [07:19<24:24,  5.38s/it][A

	loss_cls: tensor(0.5147, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4282, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9429, device='cuda:0', grad_fn=<AddBackward0>)



 23%|██▎       | 83/354 [07:25<24:13,  5.36s/it][A

	loss_cls: tensor(0.4168, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2686, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6855, device='cuda:0', grad_fn=<AddBackward0>)



 24%|██▎       | 84/354 [07:30<24:11,  5.38s/it][A

	loss_cls: tensor(0.5932, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0989, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6921, device='cuda:0', grad_fn=<AddBackward0>)



 24%|██▍       | 85/354 [07:35<24:03,  5.36s/it][A

	loss_cls: tensor(0.6409, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0944, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7353, device='cuda:0', grad_fn=<AddBackward0>)



 24%|██▍       | 86/354 [07:41<23:54,  5.35s/it][A

	loss_cls: tensor(0.4065, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2136, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6201, device='cuda:0', grad_fn=<AddBackward0>)



 25%|██▍       | 87/354 [07:46<23:54,  5.37s/it][A

	loss_cls: tensor(0.4251, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0345, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4596, device='cuda:0', grad_fn=<AddBackward0>)



 25%|██▍       | 88/354 [07:52<23:47,  5.37s/it][A

	loss_cls: tensor(0.8561, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3168, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1730, device='cuda:0', grad_fn=<AddBackward0>)



 25%|██▌       | 89/354 [07:57<23:48,  5.39s/it][A

	loss_cls: tensor(0.4655, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0292, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4947, device='cuda:0', grad_fn=<AddBackward0>)



 25%|██▌       | 90/354 [08:02<23:40,  5.38s/it][A

	loss_cls: tensor(0.5371, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0626, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5996, device='cuda:0', grad_fn=<AddBackward0>)



 26%|██▌       | 91/354 [08:08<23:36,  5.39s/it][A

	loss_cls: tensor(0.4948, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2058, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7006, device='cuda:0', grad_fn=<AddBackward0>)



 26%|██▌       | 92/354 [08:13<23:28,  5.38s/it][A

	loss_cls: tensor(0.6282, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1674, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7956, device='cuda:0', grad_fn=<AddBackward0>)



 26%|██▋       | 93/354 [08:18<23:19,  5.36s/it][A

	loss_cls: tensor(0.6341, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1109, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7450, device='cuda:0', grad_fn=<AddBackward0>)



 27%|██▋       | 94/354 [08:24<23:18,  5.38s/it][A

	loss_cls: tensor(0.9703, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0671, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0374, device='cuda:0', grad_fn=<AddBackward0>)



 27%|██▋       | 95/354 [08:29<23:10,  5.37s/it][A

	loss_cls: tensor(0.5718, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2726, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8445, device='cuda:0', grad_fn=<AddBackward0>)



 27%|██▋       | 96/354 [08:35<23:10,  5.39s/it][A

	loss_cls: tensor(0.7249, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3154, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0403, device='cuda:0', grad_fn=<AddBackward0>)



 27%|██▋       | 97/354 [08:40<23:01,  5.38s/it][A

	loss_cls: tensor(0.6319, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0819, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7138, device='cuda:0', grad_fn=<AddBackward0>)



 28%|██▊       | 98/354 [08:45<22:59,  5.39s/it][A

	loss_cls: tensor(0.5007, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2655, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7662, device='cuda:0', grad_fn=<AddBackward0>)



 28%|██▊       | 99/354 [08:51<22:49,  5.37s/it][A

	loss_cls: tensor(0.4967, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2653, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7620, device='cuda:0', grad_fn=<AddBackward0>)



 28%|██▊       | 100/354 [08:56<22:47,  5.38s/it][A

	loss_cls: tensor(0.9335, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4926, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.4261, device='cuda:0', grad_fn=<AddBackward0>)



 29%|██▊       | 101/354 [09:01<22:38,  5.37s/it][A

	loss_cls: tensor(0.6256, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1710, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7966, device='cuda:0', grad_fn=<AddBackward0>)



 29%|██▉       | 102/354 [09:07<22:33,  5.37s/it][A

	loss_cls: tensor(0.5268, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1365, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6633, device='cuda:0', grad_fn=<AddBackward0>)



 29%|██▉       | 103/354 [09:12<22:34,  5.40s/it][A

	loss_cls: tensor(0.6697, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.6246, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2943, device='cuda:0', grad_fn=<AddBackward0>)



 29%|██▉       | 104/354 [09:18<22:24,  5.38s/it][A

	loss_cls: tensor(0.6642, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4687, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1329, device='cuda:0', grad_fn=<AddBackward0>)



 30%|██▉       | 105/354 [09:23<22:22,  5.39s/it][A

	loss_cls: tensor(0.4648, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0437, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5086, device='cuda:0', grad_fn=<AddBackward0>)



 30%|██▉       | 106/354 [09:28<22:12,  5.37s/it][A

	loss_cls: tensor(0.5102, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1221, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6323, device='cuda:0', grad_fn=<AddBackward0>)



 30%|███       | 107/354 [09:34<22:10,  5.39s/it][A

	loss_cls: tensor(0.5552, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2468, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8020, device='cuda:0', grad_fn=<AddBackward0>)



 31%|███       | 108/354 [09:39<22:04,  5.38s/it][A

	loss_cls: tensor(0.5645, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0760, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6405, device='cuda:0', grad_fn=<AddBackward0>)



 31%|███       | 109/354 [09:45<21:56,  5.38s/it][A

	loss_cls: tensor(0.6262, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3077, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9339, device='cuda:0', grad_fn=<AddBackward0>)



 31%|███       | 110/354 [09:50<21:55,  5.39s/it][A

	loss_cls: tensor(0.7397, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3236, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0633, device='cuda:0', grad_fn=<AddBackward0>)



 31%|███▏      | 111/354 [09:55<21:49,  5.39s/it][A

	loss_cls: tensor(0.8917, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2773, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1690, device='cuda:0', grad_fn=<AddBackward0>)



 32%|███▏      | 112/354 [10:01<21:46,  5.40s/it][A

	loss_cls: tensor(0.7157, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2447, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9604, device='cuda:0', grad_fn=<AddBackward0>)



 32%|███▏      | 113/354 [10:06<21:36,  5.38s/it][A

	loss_cls: tensor(0.7698, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3364, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1062, device='cuda:0', grad_fn=<AddBackward0>)



 32%|███▏      | 114/354 [10:12<21:35,  5.40s/it][A

	loss_cls: tensor(0.6016, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1543, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7559, device='cuda:0', grad_fn=<AddBackward0>)



 32%|███▏      | 115/354 [10:17<21:25,  5.38s/it][A

	loss_cls: tensor(0.7830, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1597, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9427, device='cuda:0', grad_fn=<AddBackward0>)



 33%|███▎      | 116/354 [10:22<21:16,  5.37s/it][A

	loss_cls: tensor(0.7123, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3604, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0726, device='cuda:0', grad_fn=<AddBackward0>)



 33%|███▎      | 117/354 [10:28<21:15,  5.38s/it][A

	loss_cls: tensor(0.5043, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3382, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8426, device='cuda:0', grad_fn=<AddBackward0>)



 33%|███▎      | 118/354 [10:33<21:04,  5.36s/it][A

	loss_cls: tensor(0.5460, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3732, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9192, device='cuda:0', grad_fn=<AddBackward0>)



 34%|███▎      | 119/354 [10:38<21:01,  5.37s/it][A

	loss_cls: tensor(0.3824, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2075, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5900, device='cuda:0', grad_fn=<AddBackward0>)



 34%|███▍      | 120/354 [10:44<20:52,  5.35s/it][A

	loss_cls: tensor(0.5526, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4756, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0282, device='cuda:0', grad_fn=<AddBackward0>)



 34%|███▍      | 121/354 [10:49<20:50,  5.37s/it][A

	loss_cls: tensor(0.6388, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1909, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8297, device='cuda:0', grad_fn=<AddBackward0>)



 34%|███▍      | 122/354 [10:54<20:41,  5.35s/it][A

	loss_cls: tensor(0.5353, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0812, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6165, device='cuda:0', grad_fn=<AddBackward0>)



 35%|███▍      | 123/354 [11:00<20:32,  5.34s/it][A

	loss_cls: tensor(0.5841, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1392, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7233, device='cuda:0', grad_fn=<AddBackward0>)



 35%|███▌      | 124/354 [11:05<20:31,  5.35s/it][A

	loss_cls: tensor(0.7031, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0695, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7726, device='cuda:0', grad_fn=<AddBackward0>)



 35%|███▌      | 125/354 [11:10<20:25,  5.35s/it][A

	loss_cls: tensor(0.5359, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2105, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7464, device='cuda:0', grad_fn=<AddBackward0>)



 36%|███▌      | 126/354 [11:16<20:21,  5.36s/it][A

	loss_cls: tensor(0.6309, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1092, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7401, device='cuda:0', grad_fn=<AddBackward0>)



 36%|███▌      | 127/354 [11:21<20:17,  5.36s/it][A

	loss_cls: tensor(0.5732, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2137, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7869, device='cuda:0', grad_fn=<AddBackward0>)



 36%|███▌      | 128/354 [11:27<20:16,  5.38s/it][A

	loss_cls: tensor(0.5731, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1583, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7314, device='cuda:0', grad_fn=<AddBackward0>)



 36%|███▋      | 129/354 [11:32<20:11,  5.39s/it][A

	loss_cls: tensor(0.6031, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0935, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6965, device='cuda:0', grad_fn=<AddBackward0>)



 37%|███▋      | 130/354 [11:37<20:03,  5.37s/it][A

	loss_cls: tensor(0.6393, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1637, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8030, device='cuda:0', grad_fn=<AddBackward0>)



 37%|███▋      | 131/354 [11:43<20:03,  5.40s/it][A

	loss_cls: tensor(0.7273, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3800, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1073, device='cuda:0', grad_fn=<AddBackward0>)



 37%|███▋      | 132/354 [11:48<19:55,  5.38s/it][A

	loss_cls: tensor(0.4687, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.6641, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1328, device='cuda:0', grad_fn=<AddBackward0>)



 38%|███▊      | 133/354 [11:54<19:53,  5.40s/it][A

	loss_cls: tensor(0.6676, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2529, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9205, device='cuda:0', grad_fn=<AddBackward0>)



 38%|███▊      | 134/354 [11:59<19:45,  5.39s/it][A

	loss_cls: tensor(0.5615, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2828, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8443, device='cuda:0', grad_fn=<AddBackward0>)



 38%|███▊      | 135/354 [12:04<19:44,  5.41s/it][A

	loss_cls: tensor(0.6611, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1550, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8161, device='cuda:0', grad_fn=<AddBackward0>)



 38%|███▊      | 136/354 [12:10<19:36,  5.40s/it][A

	loss_cls: tensor(0.5464, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0888, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6352, device='cuda:0', grad_fn=<AddBackward0>)



 39%|███▊      | 137/354 [12:15<19:29,  5.39s/it][A

	loss_cls: tensor(0.3903, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1014, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4917, device='cuda:0', grad_fn=<AddBackward0>)



 39%|███▉      | 138/354 [12:21<19:26,  5.40s/it][A

	loss_cls: tensor(0.5527, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2982, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8509, device='cuda:0', grad_fn=<AddBackward0>)



 39%|███▉      | 139/354 [12:26<19:17,  5.38s/it][A

	loss_cls: tensor(0.6404, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3432, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9836, device='cuda:0', grad_fn=<AddBackward0>)



 40%|███▉      | 140/354 [12:31<19:14,  5.39s/it][A

	loss_cls: tensor(0.4880, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1075, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5955, device='cuda:0', grad_fn=<AddBackward0>)



 40%|███▉      | 141/354 [12:37<19:07,  5.39s/it][A

	loss_cls: tensor(0.3611, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3199, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6810, device='cuda:0', grad_fn=<AddBackward0>)



 40%|████      | 142/354 [12:42<19:05,  5.40s/it][A

	loss_cls: tensor(0.6774, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4004, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0778, device='cuda:0', grad_fn=<AddBackward0>)



 40%|████      | 143/354 [12:47<18:56,  5.38s/it][A

	loss_cls: tensor(0.5793, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2660, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8452, device='cuda:0', grad_fn=<AddBackward0>)



 41%|████      | 144/354 [12:53<18:48,  5.37s/it][A

	loss_cls: tensor(0.8579, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2291, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0870, device='cuda:0', grad_fn=<AddBackward0>)



 41%|████      | 145/354 [12:58<18:45,  5.39s/it][A

	loss_cls: tensor(0.5826, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1897, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7723, device='cuda:0', grad_fn=<AddBackward0>)



 41%|████      | 146/354 [13:04<18:36,  5.37s/it][A

	loss_cls: tensor(0.7235, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5045, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2280, device='cuda:0', grad_fn=<AddBackward0>)



 42%|████▏     | 147/354 [13:09<18:34,  5.39s/it][A

	loss_cls: tensor(0.4186, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0618, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4804, device='cuda:0', grad_fn=<AddBackward0>)



 42%|████▏     | 148/354 [13:14<18:28,  5.38s/it][A

	loss_cls: tensor(0.4949, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3871, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8821, device='cuda:0', grad_fn=<AddBackward0>)



 42%|████▏     | 149/354 [13:20<18:25,  5.39s/it][A

	loss_cls: tensor(0.6907, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1946, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8853, device='cuda:0', grad_fn=<AddBackward0>)



 42%|████▏     | 150/354 [13:25<18:17,  5.38s/it][A

	loss_cls: tensor(0.5834, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2124, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7959, device='cuda:0', grad_fn=<AddBackward0>)



 43%|████▎     | 151/354 [13:30<18:09,  5.37s/it][A

	loss_cls: tensor(0.3770, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3036, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6806, device='cuda:0', grad_fn=<AddBackward0>)



 43%|████▎     | 152/354 [13:36<18:10,  5.40s/it][A

	loss_cls: tensor(0.6409, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2046, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8455, device='cuda:0', grad_fn=<AddBackward0>)



 43%|████▎     | 153/354 [13:41<18:04,  5.40s/it][A

	loss_cls: tensor(1.0931, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1989, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2921, device='cuda:0', grad_fn=<AddBackward0>)



 44%|████▎     | 154/354 [13:47<18:01,  5.41s/it][A

	loss_cls: tensor(0.6483, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1867, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8350, device='cuda:0', grad_fn=<AddBackward0>)



 44%|████▍     | 155/354 [13:52<17:51,  5.38s/it][A

	loss_cls: tensor(0.3684, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2541, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6225, device='cuda:0', grad_fn=<AddBackward0>)



 44%|████▍     | 156/354 [13:58<17:48,  5.40s/it][A

	loss_cls: tensor(0.6909, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1822, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8731, device='cuda:0', grad_fn=<AddBackward0>)



 44%|████▍     | 157/354 [14:03<17:38,  5.37s/it][A

	loss_cls: tensor(0.5754, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3409, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9163, device='cuda:0', grad_fn=<AddBackward0>)



 45%|████▍     | 158/354 [14:08<17:29,  5.36s/it][A

	loss_cls: tensor(0.6102, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0968, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7070, device='cuda:0', grad_fn=<AddBackward0>)



 45%|████▍     | 159/354 [14:14<17:27,  5.37s/it][A

	loss_cls: tensor(0.4039, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0480, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4519, device='cuda:0', grad_fn=<AddBackward0>)



 45%|████▌     | 160/354 [14:19<17:19,  5.36s/it][A

	loss_cls: tensor(0.7631, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1767, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9398, device='cuda:0', grad_fn=<AddBackward0>)



 45%|████▌     | 161/354 [14:24<17:18,  5.38s/it][A

	loss_cls: tensor(0.5554, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3010, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8564, device='cuda:0', grad_fn=<AddBackward0>)



 46%|████▌     | 162/354 [14:30<17:10,  5.37s/it][A

	loss_cls: tensor(0.5045, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0785, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5830, device='cuda:0', grad_fn=<AddBackward0>)



 46%|████▌     | 163/354 [14:35<17:08,  5.38s/it][A

	loss_cls: tensor(0.6986, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5550, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2536, device='cuda:0', grad_fn=<AddBackward0>)



 46%|████▋     | 164/354 [14:40<17:00,  5.37s/it][A

	loss_cls: tensor(0.5776, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3604, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9381, device='cuda:0', grad_fn=<AddBackward0>)



 47%|████▋     | 165/354 [14:46<16:51,  5.35s/it][A

	loss_cls: tensor(0.4542, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2720, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7262, device='cuda:0', grad_fn=<AddBackward0>)



 47%|████▋     | 166/354 [14:51<16:48,  5.37s/it][A

	loss_cls: tensor(0.5232, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2134, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7366, device='cuda:0', grad_fn=<AddBackward0>)



 47%|████▋     | 167/354 [14:56<16:41,  5.36s/it][A

	loss_cls: tensor(0.5803, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0834, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6637, device='cuda:0', grad_fn=<AddBackward0>)



 47%|████▋     | 168/354 [15:02<16:40,  5.38s/it][A

	loss_cls: tensor(0.5440, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3049, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8489, device='cuda:0', grad_fn=<AddBackward0>)



 48%|████▊     | 169/354 [15:07<16:33,  5.37s/it][A

	loss_cls: tensor(0.5820, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3066, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8886, device='cuda:0', grad_fn=<AddBackward0>)



 48%|████▊     | 170/354 [15:13<16:33,  5.40s/it][A

	loss_cls: tensor(0.6783, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1505, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8288, device='cuda:0', grad_fn=<AddBackward0>)



 48%|████▊     | 171/354 [15:18<16:24,  5.38s/it][A

	loss_cls: tensor(0.3738, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3934, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7672, device='cuda:0', grad_fn=<AddBackward0>)



 49%|████▊     | 172/354 [15:23<16:15,  5.36s/it][A

	loss_cls: tensor(0.7040, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3249, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0288, device='cuda:0', grad_fn=<AddBackward0>)



 49%|████▉     | 173/354 [15:29<16:11,  5.37s/it][A

	loss_cls: tensor(0.6021, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2077, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8098, device='cuda:0', grad_fn=<AddBackward0>)



 49%|████▉     | 174/354 [15:34<16:03,  5.35s/it][A

	loss_cls: tensor(0.4280, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0596, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4876, device='cuda:0', grad_fn=<AddBackward0>)



 49%|████▉     | 175/354 [15:40<16:03,  5.39s/it][A

	loss_cls: tensor(0.5456, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2402, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7858, device='cuda:0', grad_fn=<AddBackward0>)



 50%|████▉     | 176/354 [15:45<15:59,  5.39s/it][A

	loss_cls: tensor(0.5622, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1949, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7571, device='cuda:0', grad_fn=<AddBackward0>)



 50%|█████     | 177/354 [15:50<15:54,  5.39s/it][A

	loss_cls: tensor(0.6009, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0613, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6623, device='cuda:0', grad_fn=<AddBackward0>)



 50%|█████     | 178/354 [15:56<15:46,  5.38s/it][A

	loss_cls: tensor(0.4083, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0559, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4642, device='cuda:0', grad_fn=<AddBackward0>)



 51%|█████     | 179/354 [16:01<15:35,  5.35s/it][A

	loss_cls: tensor(0.3673, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1547, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5220, device='cuda:0', grad_fn=<AddBackward0>)



 51%|█████     | 180/354 [16:06<15:31,  5.35s/it][A

	loss_cls: tensor(0.7550, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4377, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1927, device='cuda:0', grad_fn=<AddBackward0>)



 51%|█████     | 181/354 [16:12<15:25,  5.35s/it][A

	loss_cls: tensor(0.6329, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3012, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9341, device='cuda:0', grad_fn=<AddBackward0>)



 51%|█████▏    | 182/354 [16:17<15:23,  5.37s/it][A

	loss_cls: tensor(0.4316, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1499, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5815, device='cuda:0', grad_fn=<AddBackward0>)



 52%|█████▏    | 183/354 [16:22<15:17,  5.36s/it][A

	loss_cls: tensor(0.4242, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1074, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5316, device='cuda:0', grad_fn=<AddBackward0>)



 52%|█████▏    | 184/354 [16:28<15:14,  5.38s/it][A

	loss_cls: tensor(0.6135, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1335, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7470, device='cuda:0', grad_fn=<AddBackward0>)



 52%|█████▏    | 185/354 [16:33<15:07,  5.37s/it][A

	loss_cls: tensor(0.4954, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0639, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5593, device='cuda:0', grad_fn=<AddBackward0>)



 53%|█████▎    | 186/354 [16:38<14:58,  5.35s/it][A

	loss_cls: tensor(0.3648, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2846, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6494, device='cuda:0', grad_fn=<AddBackward0>)



 53%|█████▎    | 187/354 [16:44<14:56,  5.37s/it][A

	loss_cls: tensor(0.6153, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2003, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8157, device='cuda:0', grad_fn=<AddBackward0>)



 53%|█████▎    | 188/354 [16:49<14:48,  5.35s/it][A

	loss_cls: tensor(0.2647, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0967, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.3613, device='cuda:0', grad_fn=<AddBackward0>)



 53%|█████▎    | 189/354 [16:55<14:46,  5.37s/it][A

	loss_cls: tensor(0.4667, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2197, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6864, device='cuda:0', grad_fn=<AddBackward0>)



 54%|█████▎    | 190/354 [17:00<14:39,  5.36s/it][A

	loss_cls: tensor(0.6117, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2254, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8371, device='cuda:0', grad_fn=<AddBackward0>)



 54%|█████▍    | 191/354 [17:05<14:35,  5.37s/it][A

	loss_cls: tensor(0.9651, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4864, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.4515, device='cuda:0', grad_fn=<AddBackward0>)



 54%|█████▍    | 192/354 [17:11<14:27,  5.36s/it][A

	loss_cls: tensor(0.5643, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1411, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7054, device='cuda:0', grad_fn=<AddBackward0>)



 55%|█████▍    | 193/354 [17:16<14:18,  5.33s/it][A

	loss_cls: tensor(0.5495, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0879, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6374, device='cuda:0', grad_fn=<AddBackward0>)



 55%|█████▍    | 194/354 [17:21<14:16,  5.36s/it][A

	loss_cls: tensor(0.3954, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2502, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6457, device='cuda:0', grad_fn=<AddBackward0>)



 55%|█████▌    | 195/354 [17:27<14:10,  5.35s/it][A

	loss_cls: tensor(0.4572, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1621, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6193, device='cuda:0', grad_fn=<AddBackward0>)



 55%|█████▌    | 196/354 [17:32<14:08,  5.37s/it][A

	loss_cls: tensor(0.3033, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0828, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.3861, device='cuda:0', grad_fn=<AddBackward0>)



 56%|█████▌    | 197/354 [17:37<14:00,  5.35s/it][A

	loss_cls: tensor(0.4429, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1899, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6327, device='cuda:0', grad_fn=<AddBackward0>)



 56%|█████▌    | 198/354 [17:43<13:57,  5.37s/it][A

	loss_cls: tensor(0.7740, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2641, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0381, device='cuda:0', grad_fn=<AddBackward0>)



 56%|█████▌    | 199/354 [17:48<13:51,  5.37s/it][A

	loss_cls: tensor(0.4436, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1363, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5799, device='cuda:0', grad_fn=<AddBackward0>)



 56%|█████▋    | 200/354 [17:54<13:45,  5.36s/it][A

	loss_cls: tensor(0.2922, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0392, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.3314, device='cuda:0', grad_fn=<AddBackward0>)



 57%|█████▋    | 201/354 [17:59<13:40,  5.36s/it][A

	loss_cls: tensor(0.5643, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3525, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9169, device='cuda:0', grad_fn=<AddBackward0>)



 57%|█████▋    | 202/354 [18:04<13:33,  5.35s/it][A

	loss_cls: tensor(0.6967, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.6005, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2972, device='cuda:0', grad_fn=<AddBackward0>)



 57%|█████▋    | 203/354 [18:10<13:30,  5.37s/it][A

	loss_cls: tensor(0.4804, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1762, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6566, device='cuda:0', grad_fn=<AddBackward0>)



 58%|█████▊    | 204/354 [18:15<13:22,  5.35s/it][A

	loss_cls: tensor(0.3995, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2764, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6759, device='cuda:0', grad_fn=<AddBackward0>)



 58%|█████▊    | 205/354 [18:20<13:19,  5.37s/it][A

	loss_cls: tensor(0.4781, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3062, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7843, device='cuda:0', grad_fn=<AddBackward0>)



 58%|█████▊    | 206/354 [18:26<13:12,  5.36s/it][A

	loss_cls: tensor(0.3885, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1787, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5673, device='cuda:0', grad_fn=<AddBackward0>)



 58%|█████▊    | 207/354 [18:31<13:09,  5.37s/it][A

	loss_cls: tensor(0.5171, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0561, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5732, device='cuda:0', grad_fn=<AddBackward0>)



 59%|█████▉    | 208/354 [18:36<13:03,  5.37s/it][A

	loss_cls: tensor(0.6896, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2580, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9476, device='cuda:0', grad_fn=<AddBackward0>)



 59%|█████▉    | 209/354 [18:42<12:57,  5.36s/it][A

	loss_cls: tensor(0.6499, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3985, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0485, device='cuda:0', grad_fn=<AddBackward0>)



 59%|█████▉    | 210/354 [18:47<12:53,  5.37s/it][A

	loss_cls: tensor(1.0686, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1946, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2632, device='cuda:0', grad_fn=<AddBackward0>)



 60%|█████▉    | 211/354 [18:52<12:45,  5.35s/it][A

	loss_cls: tensor(0.4820, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1285, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6106, device='cuda:0', grad_fn=<AddBackward0>)



 60%|█████▉    | 212/354 [18:58<12:42,  5.37s/it][A

	loss_cls: tensor(0.4945, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2108, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7053, device='cuda:0', grad_fn=<AddBackward0>)



 60%|██████    | 213/354 [19:03<12:34,  5.35s/it][A

	loss_cls: tensor(0.3207, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1405, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4612, device='cuda:0', grad_fn=<AddBackward0>)



 60%|██████    | 214/354 [19:09<12:31,  5.37s/it][A

	loss_cls: tensor(0.3974, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1019, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4993, device='cuda:0', grad_fn=<AddBackward0>)



 61%|██████    | 215/354 [19:14<12:23,  5.35s/it][A

	loss_cls: tensor(0.3863, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1041, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4904, device='cuda:0', grad_fn=<AddBackward0>)



 61%|██████    | 216/354 [19:19<12:18,  5.35s/it][A

	loss_cls: tensor(0.5166, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3600, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8766, device='cuda:0', grad_fn=<AddBackward0>)



 61%|██████▏   | 217/354 [19:25<12:15,  5.37s/it][A

	loss_cls: tensor(0.6457, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4407, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0864, device='cuda:0', grad_fn=<AddBackward0>)



 62%|██████▏   | 218/354 [19:30<12:08,  5.35s/it][A

	loss_cls: tensor(1.1596, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4049, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.5645, device='cuda:0', grad_fn=<AddBackward0>)



 62%|██████▏   | 219/354 [19:35<12:03,  5.36s/it][A

	loss_cls: tensor(0.3607, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1147, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4754, device='cuda:0', grad_fn=<AddBackward0>)



 62%|██████▏   | 220/354 [19:41<11:57,  5.35s/it][A

	loss_cls: tensor(0.5288, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5275, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0562, device='cuda:0', grad_fn=<AddBackward0>)



 62%|██████▏   | 221/354 [19:46<11:54,  5.38s/it][A

	loss_cls: tensor(0.4519, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3896, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8416, device='cuda:0', grad_fn=<AddBackward0>)



 63%|██████▎   | 222/354 [19:52<11:49,  5.37s/it][A

	loss_cls: tensor(0.9516, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2687, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2204, device='cuda:0', grad_fn=<AddBackward0>)



 63%|██████▎   | 223/354 [19:57<11:43,  5.37s/it][A

	loss_cls: tensor(0.8506, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1307, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9813, device='cuda:0', grad_fn=<AddBackward0>)



 63%|██████▎   | 224/354 [20:02<11:38,  5.37s/it][A

	loss_cls: tensor(0.5955, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1396, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7351, device='cuda:0', grad_fn=<AddBackward0>)



 64%|██████▎   | 225/354 [20:08<11:31,  5.36s/it][A

	loss_cls: tensor(0.6402, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2014, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8416, device='cuda:0', grad_fn=<AddBackward0>)



 64%|██████▍   | 226/354 [20:13<11:28,  5.38s/it][A

	loss_cls: tensor(0.4205, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1955, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6160, device='cuda:0', grad_fn=<AddBackward0>)



 64%|██████▍   | 227/354 [20:18<11:20,  5.36s/it][A

	loss_cls: tensor(0.5194, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2743, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7938, device='cuda:0', grad_fn=<AddBackward0>)



 64%|██████▍   | 228/354 [20:24<11:16,  5.37s/it][A

	loss_cls: tensor(0.6943, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3783, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0725, device='cuda:0', grad_fn=<AddBackward0>)



 65%|██████▍   | 229/354 [20:29<11:08,  5.35s/it][A

	loss_cls: tensor(0.5400, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1966, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7366, device='cuda:0', grad_fn=<AddBackward0>)



 65%|██████▍   | 230/354 [20:34<11:02,  5.34s/it][A

	loss_cls: tensor(0.4572, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2571, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7143, device='cuda:0', grad_fn=<AddBackward0>)



 65%|██████▌   | 231/354 [20:40<10:59,  5.36s/it][A

	loss_cls: tensor(0.7357, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0846, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8203, device='cuda:0', grad_fn=<AddBackward0>)



 66%|██████▌   | 232/354 [20:45<10:52,  5.35s/it][A

	loss_cls: tensor(0.6295, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1117, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7412, device='cuda:0', grad_fn=<AddBackward0>)



 66%|██████▌   | 233/354 [20:50<10:49,  5.36s/it][A

	loss_cls: tensor(0.6140, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3006, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9146, device='cuda:0', grad_fn=<AddBackward0>)



 66%|██████▌   | 234/354 [20:56<10:43,  5.36s/it][A

	loss_cls: tensor(0.3941, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2391, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6331, device='cuda:0', grad_fn=<AddBackward0>)



 66%|██████▋   | 235/354 [21:01<10:39,  5.37s/it][A

	loss_cls: tensor(0.5223, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1240, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6463, device='cuda:0', grad_fn=<AddBackward0>)



 67%|██████▋   | 236/354 [21:07<10:32,  5.36s/it][A

	loss_cls: tensor(0.6649, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2484, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9133, device='cuda:0', grad_fn=<AddBackward0>)



 67%|██████▋   | 237/354 [21:12<10:25,  5.35s/it][A

	loss_cls: tensor(1.0008, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0602, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0610, device='cuda:0', grad_fn=<AddBackward0>)



 67%|██████▋   | 238/354 [21:17<10:22,  5.37s/it][A

	loss_cls: tensor(0.6112, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1971, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8083, device='cuda:0', grad_fn=<AddBackward0>)



 68%|██████▊   | 239/354 [21:23<10:15,  5.36s/it][A

	loss_cls: tensor(0.7343, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1310, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8653, device='cuda:0', grad_fn=<AddBackward0>)



 68%|██████▊   | 240/354 [21:28<10:12,  5.37s/it][A

	loss_cls: tensor(0.4609, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2496, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7106, device='cuda:0', grad_fn=<AddBackward0>)



 68%|██████▊   | 241/354 [21:33<10:05,  5.36s/it][A

	loss_cls: tensor(0.5339, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1263, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6602, device='cuda:0', grad_fn=<AddBackward0>)



 68%|██████▊   | 242/354 [21:39<10:01,  5.37s/it][A

	loss_cls: tensor(0.4278, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2505, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6783, device='cuda:0', grad_fn=<AddBackward0>)



 69%|██████▊   | 243/354 [21:44<09:54,  5.36s/it][A

	loss_cls: tensor(0.8400, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5228, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.3628, device='cuda:0', grad_fn=<AddBackward0>)



 69%|██████▉   | 244/354 [21:49<09:48,  5.35s/it][A

	loss_cls: tensor(0.4681, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3120, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7801, device='cuda:0', grad_fn=<AddBackward0>)



 69%|██████▉   | 245/354 [21:55<09:45,  5.37s/it][A

	loss_cls: tensor(0.5247, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4216, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9463, device='cuda:0', grad_fn=<AddBackward0>)



 69%|██████▉   | 246/354 [22:00<09:42,  5.39s/it][A

	loss_cls: tensor(0.4185, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1640, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5824, device='cuda:0', grad_fn=<AddBackward0>)



 70%|██████▉   | 247/354 [22:06<09:37,  5.39s/it][A

	loss_cls: tensor(0.9253, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2238, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1491, device='cuda:0', grad_fn=<AddBackward0>)



 70%|███████   | 248/354 [22:11<09:31,  5.39s/it][A

	loss_cls: tensor(0.3657, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1300, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4957, device='cuda:0', grad_fn=<AddBackward0>)



 70%|███████   | 249/354 [22:16<09:26,  5.39s/it][A

	loss_cls: tensor(0.4241, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1966, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6207, device='cuda:0', grad_fn=<AddBackward0>)



 71%|███████   | 250/354 [22:22<09:18,  5.37s/it][A

	loss_cls: tensor(0.4612, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2515, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7128, device='cuda:0', grad_fn=<AddBackward0>)



 71%|███████   | 251/354 [22:27<09:11,  5.35s/it][A

	loss_cls: tensor(0.4932, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1428, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6360, device='cuda:0', grad_fn=<AddBackward0>)



 71%|███████   | 252/354 [22:32<09:07,  5.37s/it][A

	loss_cls: tensor(0.6105, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2331, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8436, device='cuda:0', grad_fn=<AddBackward0>)



 71%|███████▏  | 253/354 [22:38<08:59,  5.34s/it][A

	loss_cls: tensor(0.4545, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1806, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6351, device='cuda:0', grad_fn=<AddBackward0>)



 72%|███████▏  | 254/354 [22:43<08:56,  5.37s/it][A

	loss_cls: tensor(0.5071, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3307, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8377, device='cuda:0', grad_fn=<AddBackward0>)



 72%|███████▏  | 255/354 [22:49<08:50,  5.35s/it][A

	loss_cls: tensor(0.3428, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2378, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5806, device='cuda:0', grad_fn=<AddBackward0>)



 72%|███████▏  | 256/354 [22:54<08:45,  5.37s/it][A

	loss_cls: tensor(0.4879, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2974, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7853, device='cuda:0', grad_fn=<AddBackward0>)



 73%|███████▎  | 257/354 [22:59<08:40,  5.36s/it][A

	loss_cls: tensor(0.4488, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1980, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6467, device='cuda:0', grad_fn=<AddBackward0>)



 73%|███████▎  | 258/354 [23:05<08:33,  5.35s/it][A

	loss_cls: tensor(1.1364, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1393, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2757, device='cuda:0', grad_fn=<AddBackward0>)



 73%|███████▎  | 259/354 [23:10<08:28,  5.36s/it][A

	loss_cls: tensor(0.6155, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2201, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8356, device='cuda:0', grad_fn=<AddBackward0>)



 73%|███████▎  | 260/354 [23:15<08:22,  5.35s/it][A

	loss_cls: tensor(0.7987, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2522, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0509, device='cuda:0', grad_fn=<AddBackward0>)



 74%|███████▎  | 261/354 [23:21<08:20,  5.38s/it][A

	loss_cls: tensor(0.6068, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2630, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8698, device='cuda:0', grad_fn=<AddBackward0>)



 74%|███████▍  | 262/354 [23:26<08:13,  5.36s/it][A

	loss_cls: tensor(0.5264, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0820, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6084, device='cuda:0', grad_fn=<AddBackward0>)



 74%|███████▍  | 263/354 [23:31<08:08,  5.37s/it][A

	loss_cls: tensor(0.4443, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1390, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5833, device='cuda:0', grad_fn=<AddBackward0>)



 75%|███████▍  | 264/354 [23:37<08:01,  5.35s/it][A

	loss_cls: tensor(0.7781, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2566, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0347, device='cuda:0', grad_fn=<AddBackward0>)



 75%|███████▍  | 265/354 [23:42<07:54,  5.34s/it][A

	loss_cls: tensor(0.5351, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3037, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8388, device='cuda:0', grad_fn=<AddBackward0>)



 75%|███████▌  | 266/354 [23:47<07:50,  5.35s/it][A

	loss_cls: tensor(0.5675, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1162, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6837, device='cuda:0', grad_fn=<AddBackward0>)



 75%|███████▌  | 267/354 [23:53<07:44,  5.34s/it][A

	loss_cls: tensor(0.6238, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0525, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6763, device='cuda:0', grad_fn=<AddBackward0>)



 76%|███████▌  | 268/354 [23:58<07:41,  5.36s/it][A

	loss_cls: tensor(0.4390, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2097, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6487, device='cuda:0', grad_fn=<AddBackward0>)



 76%|███████▌  | 269/354 [24:04<07:36,  5.37s/it][A

	loss_cls: tensor(0.5637, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1377, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7015, device='cuda:0', grad_fn=<AddBackward0>)



 76%|███████▋  | 270/354 [24:09<07:31,  5.38s/it][A

	loss_cls: tensor(0.4722, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1562, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6285, device='cuda:0', grad_fn=<AddBackward0>)



 77%|███████▋  | 271/354 [24:14<07:25,  5.37s/it][A

	loss_cls: tensor(0.7312, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3187, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0499, device='cuda:0', grad_fn=<AddBackward0>)



 77%|███████▋  | 272/354 [24:20<07:18,  5.35s/it][A

	loss_cls: tensor(0.4982, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0267, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5248, device='cuda:0', grad_fn=<AddBackward0>)



 77%|███████▋  | 273/354 [24:25<07:14,  5.37s/it][A

	loss_cls: tensor(0.5803, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3335, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9137, device='cuda:0', grad_fn=<AddBackward0>)



 77%|███████▋  | 274/354 [24:30<07:08,  5.36s/it][A

	loss_cls: tensor(0.4991, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0845, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5836, device='cuda:0', grad_fn=<AddBackward0>)



 78%|███████▊  | 275/354 [24:36<07:04,  5.38s/it][A

	loss_cls: tensor(0.6018, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3444, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9461, device='cuda:0', grad_fn=<AddBackward0>)



 78%|███████▊  | 276/354 [24:41<06:58,  5.36s/it][A

	loss_cls: tensor(0.4599, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1607, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6206, device='cuda:0', grad_fn=<AddBackward0>)



 78%|███████▊  | 277/354 [24:47<06:54,  5.38s/it][A

	loss_cls: tensor(0.6381, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0740, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7121, device='cuda:0', grad_fn=<AddBackward0>)



 79%|███████▊  | 278/354 [24:52<06:47,  5.37s/it][A

	loss_cls: tensor(0.7983, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2078, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0061, device='cuda:0', grad_fn=<AddBackward0>)



 79%|███████▉  | 279/354 [24:57<06:41,  5.35s/it][A

	loss_cls: tensor(0.4895, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2187, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7082, device='cuda:0', grad_fn=<AddBackward0>)



 79%|███████▉  | 280/354 [25:03<06:37,  5.37s/it][A

	loss_cls: tensor(0.6679, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0656, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7336, device='cuda:0', grad_fn=<AddBackward0>)



 79%|███████▉  | 281/354 [25:08<06:31,  5.36s/it][A

	loss_cls: tensor(0.5603, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1460, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7063, device='cuda:0', grad_fn=<AddBackward0>)



 80%|███████▉  | 282/354 [25:13<06:27,  5.38s/it][A

	loss_cls: tensor(0.8961, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3140, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2100, device='cuda:0', grad_fn=<AddBackward0>)



 80%|███████▉  | 283/354 [25:19<06:21,  5.37s/it][A

	loss_cls: tensor(0.4859, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4019, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8878, device='cuda:0', grad_fn=<AddBackward0>)



 80%|████████  | 284/354 [25:24<06:16,  5.38s/it][A

	loss_cls: tensor(0.5442, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2200, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7643, device='cuda:0', grad_fn=<AddBackward0>)



 81%|████████  | 285/354 [25:29<06:10,  5.37s/it][A

	loss_cls: tensor(0.9734, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2939, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2673, device='cuda:0', grad_fn=<AddBackward0>)



 81%|████████  | 286/354 [25:35<06:04,  5.36s/it][A

	loss_cls: tensor(0.4125, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4105, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8230, device='cuda:0', grad_fn=<AddBackward0>)



 81%|████████  | 287/354 [25:40<06:00,  5.38s/it][A

	loss_cls: tensor(0.5361, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1264, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6624, device='cuda:0', grad_fn=<AddBackward0>)



 81%|████████▏ | 288/354 [25:46<05:54,  5.37s/it][A

	loss_cls: tensor(0.4764, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3214, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7978, device='cuda:0', grad_fn=<AddBackward0>)



 82%|████████▏ | 289/354 [25:51<05:50,  5.39s/it][A

	loss_cls: tensor(0.3954, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1740, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5694, device='cuda:0', grad_fn=<AddBackward0>)



 82%|████████▏ | 290/354 [25:56<05:44,  5.38s/it][A

	loss_cls: tensor(0.5412, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0665, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6077, device='cuda:0', grad_fn=<AddBackward0>)



 82%|████████▏ | 291/354 [26:02<05:39,  5.39s/it][A

	loss_cls: tensor(0.7334, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1286, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8620, device='cuda:0', grad_fn=<AddBackward0>)



 82%|████████▏ | 292/354 [26:07<05:32,  5.36s/it][A

	loss_cls: tensor(0.4532, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0620, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5152, device='cuda:0', grad_fn=<AddBackward0>)



 83%|████████▎ | 293/354 [26:12<05:26,  5.35s/it][A

	loss_cls: tensor(0.6608, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1960, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8568, device='cuda:0', grad_fn=<AddBackward0>)



 83%|████████▎ | 294/354 [26:18<05:22,  5.38s/it][A

	loss_cls: tensor(0.4542, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1846, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6389, device='cuda:0', grad_fn=<AddBackward0>)



 83%|████████▎ | 295/354 [26:23<05:16,  5.37s/it][A

	loss_cls: tensor(0.4008, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1614, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5622, device='cuda:0', grad_fn=<AddBackward0>)



 84%|████████▎ | 296/354 [26:29<05:13,  5.40s/it][A

	loss_cls: tensor(0.5000, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1406, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6407, device='cuda:0', grad_fn=<AddBackward0>)



 84%|████████▍ | 297/354 [26:34<05:06,  5.38s/it][A

	loss_cls: tensor(0.4824, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1420, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6244, device='cuda:0', grad_fn=<AddBackward0>)



 84%|████████▍ | 298/354 [26:39<05:02,  5.39s/it][A

	loss_cls: tensor(1.0138, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3923, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.4061, device='cuda:0', grad_fn=<AddBackward0>)



 84%|████████▍ | 299/354 [26:45<04:55,  5.38s/it][A

	loss_cls: tensor(0.5995, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1778, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7773, device='cuda:0', grad_fn=<AddBackward0>)



 85%|████████▍ | 300/354 [26:50<04:51,  5.39s/it][A

	loss_cls: tensor(0.6770, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1717, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8487, device='cuda:0', grad_fn=<AddBackward0>)



 85%|████████▌ | 301/354 [26:56<04:45,  5.38s/it][A

	loss_cls: tensor(0.9152, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1264, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0416, device='cuda:0', grad_fn=<AddBackward0>)



 85%|████████▌ | 302/354 [27:01<04:39,  5.37s/it][A

	loss_cls: tensor(0.3976, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0821, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4798, device='cuda:0', grad_fn=<AddBackward0>)



 86%|████████▌ | 303/354 [27:06<04:34,  5.39s/it][A

	loss_cls: tensor(0.5144, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2809, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7953, device='cuda:0', grad_fn=<AddBackward0>)



 86%|████████▌ | 304/354 [27:12<04:29,  5.39s/it][A

	loss_cls: tensor(0.3731, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1425, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5156, device='cuda:0', grad_fn=<AddBackward0>)



 86%|████████▌ | 305/354 [27:17<04:24,  5.40s/it][A

	loss_cls: tensor(0.6750, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2935, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9685, device='cuda:0', grad_fn=<AddBackward0>)



 86%|████████▋ | 306/354 [27:22<04:18,  5.39s/it][A

	loss_cls: tensor(0.5633, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0505, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6138, device='cuda:0', grad_fn=<AddBackward0>)



 87%|████████▋ | 307/354 [27:28<04:14,  5.41s/it][A

	loss_cls: tensor(0.4189, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3342, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7532, device='cuda:0', grad_fn=<AddBackward0>)



 87%|████████▋ | 308/354 [27:33<04:08,  5.40s/it][A

	loss_cls: tensor(0.5509, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1721, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7230, device='cuda:0', grad_fn=<AddBackward0>)



 87%|████████▋ | 309/354 [27:39<04:02,  5.39s/it][A

	loss_cls: tensor(0.4472, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3578, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8050, device='cuda:0', grad_fn=<AddBackward0>)



 88%|████████▊ | 310/354 [27:44<03:58,  5.41s/it][A

	loss_cls: tensor(0.4293, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3390, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7683, device='cuda:0', grad_fn=<AddBackward0>)



 88%|████████▊ | 311/354 [27:50<03:52,  5.40s/it][A

	loss_cls: tensor(0.8897, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2656, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1553, device='cuda:0', grad_fn=<AddBackward0>)



 88%|████████▊ | 312/354 [27:55<03:47,  5.41s/it][A

	loss_cls: tensor(0.6535, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1620, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8155, device='cuda:0', grad_fn=<AddBackward0>)



 88%|████████▊ | 313/354 [28:00<03:41,  5.39s/it][A

	loss_cls: tensor(0.5303, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2239, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7542, device='cuda:0', grad_fn=<AddBackward0>)



 89%|████████▊ | 314/354 [28:06<03:36,  5.42s/it][A

	loss_cls: tensor(0.9794, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2670, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2464, device='cuda:0', grad_fn=<AddBackward0>)



 89%|████████▉ | 315/354 [28:11<03:30,  5.40s/it][A

	loss_cls: tensor(0.4299, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1391, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5690, device='cuda:0', grad_fn=<AddBackward0>)



 89%|████████▉ | 316/354 [28:16<03:24,  5.39s/it][A

	loss_cls: tensor(0.3639, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1532, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5171, device='cuda:0', grad_fn=<AddBackward0>)



 90%|████████▉ | 317/354 [28:22<03:20,  5.41s/it][A

	loss_cls: tensor(0.6315, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2262, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8578, device='cuda:0', grad_fn=<AddBackward0>)



 90%|████████▉ | 318/354 [28:27<03:14,  5.40s/it][A

	loss_cls: tensor(0.7615, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1512, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9127, device='cuda:0', grad_fn=<AddBackward0>)



 90%|█████████ | 319/354 [28:33<03:09,  5.42s/it][A

	loss_cls: tensor(0.9167, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1754, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0922, device='cuda:0', grad_fn=<AddBackward0>)



 90%|█████████ | 320/354 [28:38<03:03,  5.40s/it][A

	loss_cls: tensor(0.5260, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1250, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6510, device='cuda:0', grad_fn=<AddBackward0>)



 91%|█████████ | 321/354 [28:44<02:59,  5.43s/it][A

	loss_cls: tensor(0.6707, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2376, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9083, device='cuda:0', grad_fn=<AddBackward0>)



 91%|█████████ | 322/354 [28:49<02:53,  5.42s/it][A

	loss_cls: tensor(0.4139, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2717, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6855, device='cuda:0', grad_fn=<AddBackward0>)



 91%|█████████ | 323/354 [28:54<02:47,  5.41s/it][A

	loss_cls: tensor(0.8093, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4434, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2526, device='cuda:0', grad_fn=<AddBackward0>)



 92%|█████████▏| 324/354 [29:00<02:42,  5.43s/it][A

	loss_cls: tensor(0.9216, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2339, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1555, device='cuda:0', grad_fn=<AddBackward0>)



 92%|█████████▏| 325/354 [29:05<02:36,  5.41s/it][A

	loss_cls: tensor(0.5619, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0552, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6171, device='cuda:0', grad_fn=<AddBackward0>)



 92%|█████████▏| 326/354 [29:11<02:31,  5.42s/it][A

	loss_cls: tensor(0.3432, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2212, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5644, device='cuda:0', grad_fn=<AddBackward0>)



 92%|█████████▏| 327/354 [29:16<02:25,  5.41s/it][A

	loss_cls: tensor(0.6125, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5149, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1274, device='cuda:0', grad_fn=<AddBackward0>)



 93%|█████████▎| 328/354 [29:22<02:21,  5.43s/it][A

	loss_cls: tensor(0.4561, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1945, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6506, device='cuda:0', grad_fn=<AddBackward0>)



 93%|█████████▎| 329/354 [29:27<02:15,  5.42s/it][A

	loss_cls: tensor(0.3982, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4653, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8635, device='cuda:0', grad_fn=<AddBackward0>)



 93%|█████████▎| 330/354 [29:32<02:09,  5.41s/it][A

	loss_cls: tensor(0.7908, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5825, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.3734, device='cuda:0', grad_fn=<AddBackward0>)



 94%|█████████▎| 331/354 [29:38<02:04,  5.43s/it][A

	loss_cls: tensor(0.5308, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2883, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8191, device='cuda:0', grad_fn=<AddBackward0>)



 94%|█████████▍| 332/354 [29:43<01:59,  5.41s/it][A

	loss_cls: tensor(0.7494, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2796, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0290, device='cuda:0', grad_fn=<AddBackward0>)



 94%|█████████▍| 333/354 [29:49<01:53,  5.42s/it][A

	loss_cls: tensor(0.4620, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5034, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9654, device='cuda:0', grad_fn=<AddBackward0>)



 94%|█████████▍| 334/354 [29:54<01:48,  5.41s/it][A

	loss_cls: tensor(0.6261, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2100, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8362, device='cuda:0', grad_fn=<AddBackward0>)



 95%|█████████▍| 335/354 [29:59<01:43,  5.42s/it][A

	loss_cls: tensor(0.3958, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0818, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4776, device='cuda:0', grad_fn=<AddBackward0>)



 95%|█████████▍| 336/354 [30:05<01:37,  5.40s/it][A

	loss_cls: tensor(0.5432, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4778, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0210, device='cuda:0', grad_fn=<AddBackward0>)



 95%|█████████▌| 337/354 [30:10<01:31,  5.40s/it][A

	loss_cls: tensor(0.5094, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1288, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6382, device='cuda:0', grad_fn=<AddBackward0>)



 95%|█████████▌| 338/354 [30:16<01:26,  5.41s/it][A

	loss_cls: tensor(0.4777, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2524, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7301, device='cuda:0', grad_fn=<AddBackward0>)



 96%|█████████▌| 339/354 [30:21<01:20,  5.38s/it][A

	loss_cls: tensor(0.5113, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2372, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7485, device='cuda:0', grad_fn=<AddBackward0>)



 96%|█████████▌| 340/354 [30:26<01:15,  5.41s/it][A

	loss_cls: tensor(0.5272, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3288, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8560, device='cuda:0', grad_fn=<AddBackward0>)



 96%|█████████▋| 341/354 [30:32<01:10,  5.41s/it][A

	loss_cls: tensor(0.3696, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1331, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5027, device='cuda:0', grad_fn=<AddBackward0>)



 97%|█████████▋| 342/354 [30:37<01:05,  5.44s/it][A

	loss_cls: tensor(0.5807, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1443, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7250, device='cuda:0', grad_fn=<AddBackward0>)



 97%|█████████▋| 343/354 [30:43<00:59,  5.43s/it][A

	loss_cls: tensor(0.6445, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0737, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7182, device='cuda:0', grad_fn=<AddBackward0>)



 97%|█████████▋| 344/354 [30:48<00:54,  5.42s/it][A

	loss_cls: tensor(0.7850, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1205, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9055, device='cuda:0', grad_fn=<AddBackward0>)



 97%|█████████▋| 345/354 [30:54<00:48,  5.44s/it][A

	loss_cls: tensor(0.5407, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3374, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8781, device='cuda:0', grad_fn=<AddBackward0>)



 98%|█████████▊| 346/354 [30:59<00:43,  5.42s/it][A

	loss_cls: tensor(0.5058, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1255, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6312, device='cuda:0', grad_fn=<AddBackward0>)



 98%|█████████▊| 347/354 [31:05<00:38,  5.45s/it][A

	loss_cls: tensor(0.4915, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1289, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6204, device='cuda:0', grad_fn=<AddBackward0>)



 98%|█████████▊| 348/354 [31:10<00:32,  5.43s/it][A

	loss_cls: tensor(0.6965, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2684, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9649, device='cuda:0', grad_fn=<AddBackward0>)



 99%|█████████▊| 349/354 [31:15<00:27,  5.45s/it][A

	loss_cls: tensor(0.5072, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4226, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9298, device='cuda:0', grad_fn=<AddBackward0>)



 99%|█████████▉| 350/354 [31:21<00:21,  5.44s/it][A

	loss_cls: tensor(0.6207, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0815, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7022, device='cuda:0', grad_fn=<AddBackward0>)



 99%|█████████▉| 351/354 [31:26<00:16,  5.43s/it][A

	loss_cls: tensor(0.8067, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2813, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0880, device='cuda:0', grad_fn=<AddBackward0>)



 99%|█████████▉| 352/354 [31:32<00:10,  5.44s/it][A

	loss_cls: tensor(0.5658, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2643, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8302, device='cuda:0', grad_fn=<AddBackward0>)



100%|█████████▉| 353/354 [31:37<00:05,  5.42s/it][A

	loss_cls: tensor(0.6000, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2886, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8887, device='cuda:0', grad_fn=<AddBackward0>)



100%|██████████| 354/354 [31:39<00:00,  5.37s/it][A

  0%|          | 0/177 [00:00<?, ?it/s][A

	loss_cls: tensor(1.0694, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2666, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.3360, device='cuda:0', grad_fn=<AddBackward0>)

	Training Loss: 0.8242553873250713

	Training cls acc: 0.6922080979284368

	Training cls prec: 0.5718399538314792

	Training cls rec: 0.6041162697942359

	Training cls f1: 0.5360258169108252

--
	Training ner acc: 0.9555250671128647

	Training ner prec: 0.2672517985823376

	Training ner rec: 0.2749101699907006

	Training ner f1: 0.2705547366314039

	Current Learning rate:  0.0002857142857142857



  1%|          | 1/177 [00:00<01:58,  1.49it/s][A
  1%|          | 2/177 [00:01<02:05,  1.40it/s][A
  2%|▏         | 3/177 [00:02<02:06,  1.38it/s][A
  2%|▏         | 4/177 [00:02<02:06,  1.37it/s][A
  3%|▎         | 5/177 [00:03<02:01,  1.42it/s][A
  3%|▎         | 6/177 [00:04<02:02,  1.40it/s][A
  4%|▍         | 7/177 [00:05<02:03,  1.38it/s][A
  5%|▍         | 8/177 [00:05<01:58,  1.42it/s][A
  5%|▌         | 9/177 [00:06<01:59,  1.41it/s][A
  6%|▌         | 10/177 [00:07<01:59,  1.40it/s][A
  6%|▌         | 11/177 [00:07<02:00,  1.38it/s][A
  7%|▋         | 12/177 [00:08<01:56,  1.42it/s][A
  7%|▋         | 13/177 [00:09<01:57,  1.40it/s][A
  8%|▊         | 14/177 [00:10<01:57,  1.39it/s][A
  8%|▊         | 15/177 [00:10<01:57,  1.38it/s][A
  9%|▉         | 16/177 [00:11<01:53,  1.42it/s][A
 10%|▉         | 17/177 [00:12<01:54,  1.40it/s][A
 10%|█         | 18/177 [00:12<01:55,  1.38it/s][A
 11%|█         | 19/177 [00:13<01:51,  1.42it/s][A
 11%|█▏        | 20/


	Validation Loss: 0.7761024171854817

	Validation cls acc: 0.7125706214689266

	Validation cls prec: 0.6103309120258273

	Validation cls rec: 0.6195117029862792

	Validation cls f1: 0.5651396625972898

--
	Validation ner acc: 0.9543074445464209

	Validation ner prec: 0.41316543849209975

	Validation ner rec: 0.42363465160075336

	Validation ner f1: 0.41818129402208076



  0%|          | 1/354 [00:05<32:26,  5.51s/it][A

	loss_cls: tensor(0.7379, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1505, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8884, device='cuda:0', grad_fn=<AddBackward0>)



  1%|          | 2/354 [00:10<31:55,  5.44s/it][A

	loss_cls: tensor(0.4398, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2501, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6899, device='cuda:0', grad_fn=<AddBackward0>)



  1%|          | 3/354 [00:16<31:55,  5.46s/it][A

	loss_cls: tensor(0.5976, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4079, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0055, device='cuda:0', grad_fn=<AddBackward0>)



  1%|          | 4/354 [00:21<31:39,  5.43s/it][A

	loss_cls: tensor(0.4045, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1236, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5281, device='cuda:0', grad_fn=<AddBackward0>)



  1%|▏         | 5/354 [00:27<31:30,  5.42s/it][A

	loss_cls: tensor(1.0143, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1113, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1256, device='cuda:0', grad_fn=<AddBackward0>)



  2%|▏         | 6/354 [00:32<31:34,  5.44s/it][A

	loss_cls: tensor(0.6594, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0893, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7486, device='cuda:0', grad_fn=<AddBackward0>)



  2%|▏         | 7/354 [00:38<31:22,  5.43s/it][A

	loss_cls: tensor(0.5638, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2714, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8352, device='cuda:0', grad_fn=<AddBackward0>)



  2%|▏         | 8/354 [00:43<31:17,  5.43s/it][A

	loss_cls: tensor(0.4436, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1158, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5594, device='cuda:0', grad_fn=<AddBackward0>)



  3%|▎         | 9/354 [00:48<31:08,  5.42s/it][A

	loss_cls: tensor(0.5570, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2053, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7623, device='cuda:0', grad_fn=<AddBackward0>)



  3%|▎         | 10/354 [00:54<31:11,  5.44s/it][A

	loss_cls: tensor(0.5394, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4990, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0384, device='cuda:0', grad_fn=<AddBackward0>)



  3%|▎         | 11/354 [00:59<30:59,  5.42s/it][A

	loss_cls: tensor(0.4768, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3515, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8283, device='cuda:0', grad_fn=<AddBackward0>)



  3%|▎         | 12/354 [01:05<30:47,  5.40s/it][A

	loss_cls: tensor(0.5417, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1476, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6893, device='cuda:0', grad_fn=<AddBackward0>)



  4%|▎         | 13/354 [01:10<30:48,  5.42s/it][A

	loss_cls: tensor(0.4757, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0423, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5180, device='cuda:0', grad_fn=<AddBackward0>)



  4%|▍         | 14/354 [01:15<30:37,  5.40s/it][A

	loss_cls: tensor(0.7568, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1467, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9034, device='cuda:0', grad_fn=<AddBackward0>)



  4%|▍         | 15/354 [01:21<30:32,  5.40s/it][A

	loss_cls: tensor(0.4626, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0533, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5158, device='cuda:0', grad_fn=<AddBackward0>)



  5%|▍         | 16/354 [01:26<30:22,  5.39s/it][A

	loss_cls: tensor(0.7774, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0695, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8469, device='cuda:0', grad_fn=<AddBackward0>)



  5%|▍         | 17/354 [01:32<30:22,  5.41s/it][A

	loss_cls: tensor(0.6118, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0468, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6585, device='cuda:0', grad_fn=<AddBackward0>)



  5%|▌         | 18/354 [01:37<30:11,  5.39s/it][A

	loss_cls: tensor(0.7572, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0611, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8183, device='cuda:0', grad_fn=<AddBackward0>)



  5%|▌         | 19/354 [01:42<30:03,  5.38s/it][A

	loss_cls: tensor(0.5294, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0894, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6189, device='cuda:0', grad_fn=<AddBackward0>)



  6%|▌         | 20/354 [01:48<30:02,  5.40s/it][A

	loss_cls: tensor(0.5217, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0369, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5586, device='cuda:0', grad_fn=<AddBackward0>)



  6%|▌         | 21/354 [01:53<29:52,  5.38s/it][A

	loss_cls: tensor(0.7725, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1154, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8879, device='cuda:0', grad_fn=<AddBackward0>)



  6%|▌         | 22/354 [01:59<29:52,  5.40s/it][A

	loss_cls: tensor(0.6755, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0907, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7662, device='cuda:0', grad_fn=<AddBackward0>)



  6%|▋         | 23/354 [02:04<29:42,  5.38s/it][A

	loss_cls: tensor(0.2724, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2240, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4964, device='cuda:0', grad_fn=<AddBackward0>)



  7%|▋         | 24/354 [02:09<29:42,  5.40s/it][A

	loss_cls: tensor(0.7926, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.6224, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.4150, device='cuda:0', grad_fn=<AddBackward0>)



  7%|▋         | 25/354 [02:15<29:33,  5.39s/it][A

	loss_cls: tensor(0.6006, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1542, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7548, device='cuda:0', grad_fn=<AddBackward0>)



  7%|▋         | 26/354 [02:20<29:24,  5.38s/it][A

	loss_cls: tensor(0.9143, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2317, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1460, device='cuda:0', grad_fn=<AddBackward0>)



  8%|▊         | 27/354 [02:26<29:25,  5.40s/it][A

	loss_cls: tensor(0.3865, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1150, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5016, device='cuda:0', grad_fn=<AddBackward0>)



  8%|▊         | 28/354 [02:31<29:14,  5.38s/it][A

	loss_cls: tensor(0.3701, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0739, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4439, device='cuda:0', grad_fn=<AddBackward0>)



  8%|▊         | 29/354 [02:36<29:17,  5.41s/it][A

	loss_cls: tensor(0.4977, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2107, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7084, device='cuda:0', grad_fn=<AddBackward0>)



  8%|▊         | 30/354 [02:42<29:10,  5.40s/it][A

	loss_cls: tensor(0.9599, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2624, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2223, device='cuda:0', grad_fn=<AddBackward0>)



  9%|▉         | 31/354 [02:47<29:08,  5.41s/it][A

	loss_cls: tensor(0.5811, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1247, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7058, device='cuda:0', grad_fn=<AddBackward0>)



  9%|▉         | 32/354 [02:53<29:05,  5.42s/it][A

	loss_cls: tensor(0.4338, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0286, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4624, device='cuda:0', grad_fn=<AddBackward0>)



  9%|▉         | 33/354 [02:58<28:58,  5.42s/it][A

	loss_cls: tensor(0.3911, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3233, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7144, device='cuda:0', grad_fn=<AddBackward0>)



 10%|▉         | 34/354 [03:03<28:57,  5.43s/it][A

	loss_cls: tensor(0.7100, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2172, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9272, device='cuda:0', grad_fn=<AddBackward0>)



 10%|▉         | 35/354 [03:09<28:46,  5.41s/it][A

	loss_cls: tensor(0.8469, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5129, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.3598, device='cuda:0', grad_fn=<AddBackward0>)



 10%|█         | 36/354 [03:14<28:49,  5.44s/it][A

	loss_cls: tensor(1.1454, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2805, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.4260, device='cuda:0', grad_fn=<AddBackward0>)



 10%|█         | 37/354 [03:20<28:41,  5.43s/it][A

	loss_cls: tensor(0.6606, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3387, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9993, device='cuda:0', grad_fn=<AddBackward0>)



 11%|█         | 38/354 [03:25<28:41,  5.45s/it][A

	loss_cls: tensor(0.7942, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2585, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0527, device='cuda:0', grad_fn=<AddBackward0>)



 11%|█         | 39/354 [03:31<28:28,  5.42s/it][A

	loss_cls: tensor(0.4975, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1956, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6931, device='cuda:0', grad_fn=<AddBackward0>)



 11%|█▏        | 40/354 [03:36<28:19,  5.41s/it][A

	loss_cls: tensor(0.6839, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2384, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9223, device='cuda:0', grad_fn=<AddBackward0>)



 12%|█▏        | 41/354 [03:41<28:19,  5.43s/it][A

	loss_cls: tensor(0.4365, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1203, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5568, device='cuda:0', grad_fn=<AddBackward0>)



 12%|█▏        | 42/354 [03:47<28:10,  5.42s/it][A

	loss_cls: tensor(0.9475, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2837, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2312, device='cuda:0', grad_fn=<AddBackward0>)



 12%|█▏        | 43/354 [03:52<28:12,  5.44s/it][A

	loss_cls: tensor(0.3862, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0664, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4525, device='cuda:0', grad_fn=<AddBackward0>)



 12%|█▏        | 44/354 [03:58<28:04,  5.44s/it][A

	loss_cls: tensor(0.5633, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3183, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8816, device='cuda:0', grad_fn=<AddBackward0>)



 13%|█▎        | 45/354 [04:03<28:02,  5.44s/it][A

	loss_cls: tensor(0.3512, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3081, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6593, device='cuda:0', grad_fn=<AddBackward0>)



 13%|█▎        | 46/354 [04:09<27:52,  5.43s/it][A

	loss_cls: tensor(0.6580, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2732, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9312, device='cuda:0', grad_fn=<AddBackward0>)



 13%|█▎        | 47/354 [04:14<27:42,  5.41s/it][A

	loss_cls: tensor(0.5391, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0849, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6240, device='cuda:0', grad_fn=<AddBackward0>)



 14%|█▎        | 48/354 [04:20<27:43,  5.44s/it][A

	loss_cls: tensor(0.4962, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1107, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6068, device='cuda:0', grad_fn=<AddBackward0>)



 14%|█▍        | 49/354 [04:25<27:33,  5.42s/it][A

	loss_cls: tensor(0.4872, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1907, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6779, device='cuda:0', grad_fn=<AddBackward0>)



 14%|█▍        | 50/354 [04:30<27:34,  5.44s/it][A

	loss_cls: tensor(0.7127, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1137, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8263, device='cuda:0', grad_fn=<AddBackward0>)



 14%|█▍        | 51/354 [04:36<27:24,  5.43s/it][A

	loss_cls: tensor(0.8081, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3409, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1491, device='cuda:0', grad_fn=<AddBackward0>)



 15%|█▍        | 52/354 [04:41<27:24,  5.44s/it][A

	loss_cls: tensor(0.9599, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2934, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2533, device='cuda:0', grad_fn=<AddBackward0>)



 15%|█▍        | 53/354 [04:47<27:13,  5.43s/it][A

	loss_cls: tensor(0.5712, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2848, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8560, device='cuda:0', grad_fn=<AddBackward0>)



 15%|█▌        | 54/354 [04:52<27:01,  5.41s/it][A

	loss_cls: tensor(0.5446, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0478, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5924, device='cuda:0', grad_fn=<AddBackward0>)



 16%|█▌        | 55/354 [04:57<26:54,  5.40s/it][A

	loss_cls: tensor(0.4170, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3015, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7184, device='cuda:0', grad_fn=<AddBackward0>)



 16%|█▌        | 56/354 [05:03<26:49,  5.40s/it][A

	loss_cls: tensor(0.6414, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1199, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7613, device='cuda:0', grad_fn=<AddBackward0>)



 16%|█▌        | 57/354 [05:08<26:51,  5.43s/it][A

	loss_cls: tensor(0.5327, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2058, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7385, device='cuda:0', grad_fn=<AddBackward0>)



 16%|█▋        | 58/354 [05:14<26:41,  5.41s/it][A

	loss_cls: tensor(0.5129, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0892, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6021, device='cuda:0', grad_fn=<AddBackward0>)



 17%|█▋        | 59/354 [05:19<26:41,  5.43s/it][A

	loss_cls: tensor(0.5015, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1515, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6530, device='cuda:0', grad_fn=<AddBackward0>)



 17%|█▋        | 60/354 [05:24<26:30,  5.41s/it][A

	loss_cls: tensor(0.5807, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1029, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6836, device='cuda:0', grad_fn=<AddBackward0>)



 17%|█▋        | 61/354 [05:30<26:19,  5.39s/it][A

	loss_cls: tensor(0.7280, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1709, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8989, device='cuda:0', grad_fn=<AddBackward0>)



 18%|█▊        | 62/354 [05:35<26:16,  5.40s/it][A

	loss_cls: tensor(0.4052, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2025, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6077, device='cuda:0', grad_fn=<AddBackward0>)



 18%|█▊        | 63/354 [05:41<26:08,  5.39s/it][A

	loss_cls: tensor(0.5644, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2704, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8348, device='cuda:0', grad_fn=<AddBackward0>)



 18%|█▊        | 64/354 [05:46<26:07,  5.40s/it][A

	loss_cls: tensor(0.4886, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1173, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6058, device='cuda:0', grad_fn=<AddBackward0>)



 18%|█▊        | 65/354 [05:51<25:56,  5.38s/it][A

	loss_cls: tensor(0.7989, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1003, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8993, device='cuda:0', grad_fn=<AddBackward0>)



 19%|█▊        | 66/354 [05:57<25:55,  5.40s/it][A

	loss_cls: tensor(0.6500, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1813, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8313, device='cuda:0', grad_fn=<AddBackward0>)



 19%|█▉        | 67/354 [06:02<25:45,  5.38s/it][A

	loss_cls: tensor(0.8525, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2947, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1472, device='cuda:0', grad_fn=<AddBackward0>)



 19%|█▉        | 68/354 [06:08<25:36,  5.37s/it][A

	loss_cls: tensor(0.6994, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2051, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9044, device='cuda:0', grad_fn=<AddBackward0>)



 19%|█▉        | 69/354 [06:13<25:37,  5.39s/it][A

	loss_cls: tensor(0.6608, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1587, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8195, device='cuda:0', grad_fn=<AddBackward0>)



 20%|█▉        | 70/354 [06:18<25:28,  5.38s/it][A

	loss_cls: tensor(0.5620, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2585, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8205, device='cuda:0', grad_fn=<AddBackward0>)



 20%|██        | 71/354 [06:24<25:28,  5.40s/it][A

	loss_cls: tensor(0.4989, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1123, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6112, device='cuda:0', grad_fn=<AddBackward0>)



 20%|██        | 72/354 [06:29<25:19,  5.39s/it][A

	loss_cls: tensor(0.7435, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3741, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1176, device='cuda:0', grad_fn=<AddBackward0>)



 21%|██        | 73/354 [06:35<25:19,  5.41s/it][A

	loss_cls: tensor(0.5249, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1007, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6256, device='cuda:0', grad_fn=<AddBackward0>)



 21%|██        | 74/354 [06:40<25:10,  5.39s/it][A

	loss_cls: tensor(0.4924, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2230, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7154, device='cuda:0', grad_fn=<AddBackward0>)



 21%|██        | 75/354 [06:45<25:01,  5.38s/it][A

	loss_cls: tensor(0.4337, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0831, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5168, device='cuda:0', grad_fn=<AddBackward0>)



 21%|██▏       | 76/354 [06:51<25:01,  5.40s/it][A

	loss_cls: tensor(0.6597, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2820, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9417, device='cuda:0', grad_fn=<AddBackward0>)



 22%|██▏       | 77/354 [06:56<24:54,  5.40s/it][A

	loss_cls: tensor(0.5910, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2549, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8459, device='cuda:0', grad_fn=<AddBackward0>)



 22%|██▏       | 78/354 [07:02<24:48,  5.39s/it][A

	loss_cls: tensor(0.4696, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0859, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5555, device='cuda:0', grad_fn=<AddBackward0>)



 22%|██▏       | 79/354 [07:07<24:43,  5.40s/it][A

	loss_cls: tensor(0.5107, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0387, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5494, device='cuda:0', grad_fn=<AddBackward0>)



 23%|██▎       | 80/354 [07:12<24:45,  5.42s/it][A

	loss_cls: tensor(0.6126, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1529, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7655, device='cuda:0', grad_fn=<AddBackward0>)



 23%|██▎       | 81/354 [07:18<24:36,  5.41s/it][A

	loss_cls: tensor(0.6157, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3482, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9639, device='cuda:0', grad_fn=<AddBackward0>)



 23%|██▎       | 82/354 [07:23<24:35,  5.42s/it][A

	loss_cls: tensor(0.9599, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1600, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1199, device='cuda:0', grad_fn=<AddBackward0>)



 23%|██▎       | 83/354 [07:29<24:28,  5.42s/it][A

	loss_cls: tensor(0.5565, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1207, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6772, device='cuda:0', grad_fn=<AddBackward0>)



 24%|██▎       | 84/354 [07:34<24:21,  5.41s/it][A

	loss_cls: tensor(0.6867, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0903, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7770, device='cuda:0', grad_fn=<AddBackward0>)



 24%|██▍       | 85/354 [07:40<24:21,  5.43s/it][A

	loss_cls: tensor(0.5748, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4078, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9826, device='cuda:0', grad_fn=<AddBackward0>)



 24%|██▍       | 86/354 [07:45<24:10,  5.41s/it][A

	loss_cls: tensor(0.3465, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1204, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4669, device='cuda:0', grad_fn=<AddBackward0>)



 25%|██▍       | 87/354 [07:50<24:03,  5.41s/it][A

	loss_cls: tensor(0.4913, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1312, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6226, device='cuda:0', grad_fn=<AddBackward0>)



 25%|██▍       | 88/354 [07:56<23:56,  5.40s/it][A

	loss_cls: tensor(0.4055, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0796, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4851, device='cuda:0', grad_fn=<AddBackward0>)



 25%|██▌       | 89/354 [08:01<23:52,  5.41s/it][A

	loss_cls: tensor(0.8133, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3682, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1815, device='cuda:0', grad_fn=<AddBackward0>)



 25%|██▌       | 90/354 [08:06<23:47,  5.41s/it][A

	loss_cls: tensor(0.4857, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2425, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7282, device='cuda:0', grad_fn=<AddBackward0>)



 26%|██▌       | 91/354 [08:12<23:40,  5.40s/it][A

	loss_cls: tensor(0.5249, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2902, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8151, device='cuda:0', grad_fn=<AddBackward0>)



 26%|██▌       | 92/354 [08:17<23:38,  5.41s/it][A

	loss_cls: tensor(0.4875, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5085, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9960, device='cuda:0', grad_fn=<AddBackward0>)



 26%|██▋       | 93/354 [08:23<23:29,  5.40s/it][A

	loss_cls: tensor(0.4800, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3704, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8504, device='cuda:0', grad_fn=<AddBackward0>)



 27%|██▋       | 94/354 [08:28<23:27,  5.42s/it][A

	loss_cls: tensor(0.5691, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2829, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8519, device='cuda:0', grad_fn=<AddBackward0>)



 27%|██▋       | 95/354 [08:33<23:18,  5.40s/it][A

	loss_cls: tensor(0.4763, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2974, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7737, device='cuda:0', grad_fn=<AddBackward0>)



 27%|██▋       | 96/354 [08:39<23:19,  5.42s/it][A

	loss_cls: tensor(0.5200, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2017, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7217, device='cuda:0', grad_fn=<AddBackward0>)



 27%|██▋       | 97/354 [08:44<23:11,  5.42s/it][A

	loss_cls: tensor(0.4133, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2133, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6266, device='cuda:0', grad_fn=<AddBackward0>)



 28%|██▊       | 98/354 [08:50<23:03,  5.41s/it][A

	loss_cls: tensor(0.5010, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1287, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6297, device='cuda:0', grad_fn=<AddBackward0>)



 28%|██▊       | 99/354 [08:55<23:03,  5.42s/it][A

	loss_cls: tensor(0.5228, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1161, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6389, device='cuda:0', grad_fn=<AddBackward0>)



 28%|██▊       | 100/354 [09:01<22:56,  5.42s/it][A

	loss_cls: tensor(0.3570, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1899, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5469, device='cuda:0', grad_fn=<AddBackward0>)



 29%|██▊       | 101/354 [09:06<22:56,  5.44s/it][A

	loss_cls: tensor(0.5505, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2072, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7578, device='cuda:0', grad_fn=<AddBackward0>)



 29%|██▉       | 102/354 [09:12<22:48,  5.43s/it][A

	loss_cls: tensor(0.5231, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1276, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6507, device='cuda:0', grad_fn=<AddBackward0>)



 29%|██▉       | 103/354 [09:17<22:48,  5.45s/it][A

	loss_cls: tensor(0.3281, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2679, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5960, device='cuda:0', grad_fn=<AddBackward0>)



 29%|██▉       | 104/354 [09:22<22:38,  5.43s/it][A

	loss_cls: tensor(0.6445, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2443, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8888, device='cuda:0', grad_fn=<AddBackward0>)



 30%|██▉       | 105/354 [09:28<22:27,  5.41s/it][A

	loss_cls: tensor(0.6020, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1742, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7762, device='cuda:0', grad_fn=<AddBackward0>)



 30%|██▉       | 106/354 [09:33<22:26,  5.43s/it][A

	loss_cls: tensor(0.5784, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0606, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6391, device='cuda:0', grad_fn=<AddBackward0>)



 30%|███       | 107/354 [09:39<22:16,  5.41s/it][A

	loss_cls: tensor(0.4480, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2723, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7203, device='cuda:0', grad_fn=<AddBackward0>)



 31%|███       | 108/354 [09:44<22:13,  5.42s/it][A

	loss_cls: tensor(1.0720, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4403, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.5123, device='cuda:0', grad_fn=<AddBackward0>)



 31%|███       | 109/354 [09:49<22:04,  5.41s/it][A

	loss_cls: tensor(0.3757, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1019, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4776, device='cuda:0', grad_fn=<AddBackward0>)



 31%|███       | 110/354 [09:55<22:02,  5.42s/it][A

	loss_cls: tensor(1.0296, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3747, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.4043, device='cuda:0', grad_fn=<AddBackward0>)



 31%|███▏      | 111/354 [10:00<21:50,  5.39s/it][A

	loss_cls: tensor(0.5500, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0437, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5937, device='cuda:0', grad_fn=<AddBackward0>)



 32%|███▏      | 112/354 [10:06<21:41,  5.38s/it][A

	loss_cls: tensor(0.5762, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2577, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8339, device='cuda:0', grad_fn=<AddBackward0>)



 32%|███▏      | 113/354 [10:11<21:37,  5.39s/it][A

	loss_cls: tensor(0.9304, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1967, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1271, device='cuda:0', grad_fn=<AddBackward0>)



 32%|███▏      | 114/354 [10:16<21:28,  5.37s/it][A

	loss_cls: tensor(0.6231, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0956, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7187, device='cuda:0', grad_fn=<AddBackward0>)



 32%|███▏      | 115/354 [10:22<21:26,  5.38s/it][A

	loss_cls: tensor(0.6995, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4023, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1018, device='cuda:0', grad_fn=<AddBackward0>)



 33%|███▎      | 116/354 [10:27<21:18,  5.37s/it][A

	loss_cls: tensor(0.6477, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2615, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9092, device='cuda:0', grad_fn=<AddBackward0>)



 33%|███▎      | 117/354 [10:32<21:17,  5.39s/it][A

	loss_cls: tensor(0.3813, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1835, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5648, device='cuda:0', grad_fn=<AddBackward0>)



 33%|███▎      | 118/354 [10:38<21:07,  5.37s/it][A

	loss_cls: tensor(0.4499, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1467, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5967, device='cuda:0', grad_fn=<AddBackward0>)



 34%|███▎      | 119/354 [10:43<21:00,  5.36s/it][A

	loss_cls: tensor(1.0262, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4410, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.4672, device='cuda:0', grad_fn=<AddBackward0>)



 34%|███▍      | 120/354 [10:49<20:57,  5.37s/it][A

	loss_cls: tensor(0.6351, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5428, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1779, device='cuda:0', grad_fn=<AddBackward0>)



 34%|███▍      | 121/354 [10:54<20:47,  5.35s/it][A

	loss_cls: tensor(0.6100, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1003, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7103, device='cuda:0', grad_fn=<AddBackward0>)



 34%|███▍      | 122/354 [10:59<20:45,  5.37s/it][A

	loss_cls: tensor(0.9613, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3649, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.3262, device='cuda:0', grad_fn=<AddBackward0>)



 35%|███▍      | 123/354 [11:05<20:37,  5.36s/it][A

	loss_cls: tensor(0.5349, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2527, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7877, device='cuda:0', grad_fn=<AddBackward0>)



 35%|███▌      | 124/354 [11:10<20:35,  5.37s/it][A

	loss_cls: tensor(0.6009, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1574, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7583, device='cuda:0', grad_fn=<AddBackward0>)



 35%|███▌      | 125/354 [11:15<20:28,  5.36s/it][A

	loss_cls: tensor(0.8814, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1963, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0777, device='cuda:0', grad_fn=<AddBackward0>)



 36%|███▌      | 126/354 [11:21<20:20,  5.35s/it][A

	loss_cls: tensor(0.6224, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2150, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8374, device='cuda:0', grad_fn=<AddBackward0>)



 36%|███▌      | 127/354 [11:26<20:19,  5.37s/it][A

	loss_cls: tensor(0.5387, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1608, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6995, device='cuda:0', grad_fn=<AddBackward0>)



 36%|███▌      | 128/354 [11:31<20:08,  5.35s/it][A

	loss_cls: tensor(0.5805, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1399, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7204, device='cuda:0', grad_fn=<AddBackward0>)



 36%|███▋      | 129/354 [11:37<20:06,  5.36s/it][A

	loss_cls: tensor(0.5012, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1610, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6622, device='cuda:0', grad_fn=<AddBackward0>)



 37%|███▋      | 130/354 [11:42<19:58,  5.35s/it][A

	loss_cls: tensor(0.5804, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4910, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0714, device='cuda:0', grad_fn=<AddBackward0>)



 37%|███▋      | 131/354 [11:48<19:58,  5.37s/it][A

	loss_cls: tensor(0.4497, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2451, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6949, device='cuda:0', grad_fn=<AddBackward0>)



 37%|███▋      | 132/354 [11:53<19:49,  5.36s/it][A

	loss_cls: tensor(0.6283, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5466, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1749, device='cuda:0', grad_fn=<AddBackward0>)



 38%|███▊      | 133/354 [11:58<19:40,  5.34s/it][A

	loss_cls: tensor(0.9105, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1043, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0149, device='cuda:0', grad_fn=<AddBackward0>)



 38%|███▊      | 134/354 [12:04<19:39,  5.36s/it][A

	loss_cls: tensor(0.5625, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0890, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6515, device='cuda:0', grad_fn=<AddBackward0>)



 38%|███▊      | 135/354 [12:09<19:31,  5.35s/it][A

	loss_cls: tensor(0.4509, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2105, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6614, device='cuda:0', grad_fn=<AddBackward0>)



 38%|███▊      | 136/354 [12:14<19:30,  5.37s/it][A

	loss_cls: tensor(0.4876, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2764, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7640, device='cuda:0', grad_fn=<AddBackward0>)



 39%|███▊      | 137/354 [12:20<19:21,  5.35s/it][A

	loss_cls: tensor(0.6336, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2106, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8443, device='cuda:0', grad_fn=<AddBackward0>)



 39%|███▉      | 138/354 [12:25<19:19,  5.37s/it][A

	loss_cls: tensor(0.4948, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0806, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5754, device='cuda:0', grad_fn=<AddBackward0>)



 39%|███▉      | 139/354 [12:30<19:10,  5.35s/it][A

	loss_cls: tensor(0.5844, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3759, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9604, device='cuda:0', grad_fn=<AddBackward0>)



 40%|███▉      | 140/354 [12:36<19:01,  5.33s/it][A

	loss_cls: tensor(0.6817, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1659, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8476, device='cuda:0', grad_fn=<AddBackward0>)



 40%|███▉      | 141/354 [12:41<18:58,  5.34s/it][A

	loss_cls: tensor(0.5552, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2510, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8063, device='cuda:0', grad_fn=<AddBackward0>)



 40%|████      | 142/354 [12:46<18:49,  5.33s/it][A

	loss_cls: tensor(0.6836, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1734, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8571, device='cuda:0', grad_fn=<AddBackward0>)



 40%|████      | 143/354 [12:52<18:50,  5.36s/it][A

	loss_cls: tensor(0.7752, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0856, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8608, device='cuda:0', grad_fn=<AddBackward0>)



 41%|████      | 144/354 [12:57<18:42,  5.35s/it][A

	loss_cls: tensor(0.8062, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1582, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9644, device='cuda:0', grad_fn=<AddBackward0>)



 41%|████      | 145/354 [13:02<18:40,  5.36s/it][A

	loss_cls: tensor(0.5842, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0769, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6610, device='cuda:0', grad_fn=<AddBackward0>)



 41%|████      | 146/354 [13:08<18:31,  5.34s/it][A

	loss_cls: tensor(0.4517, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0823, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5340, device='cuda:0', grad_fn=<AddBackward0>)



 42%|████▏     | 147/354 [13:13<18:23,  5.33s/it][A

	loss_cls: tensor(0.5720, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1306, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7026, device='cuda:0', grad_fn=<AddBackward0>)



 42%|████▏     | 148/354 [13:18<18:21,  5.35s/it][A

	loss_cls: tensor(0.4158, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3327, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7485, device='cuda:0', grad_fn=<AddBackward0>)



 42%|████▏     | 149/354 [13:24<18:14,  5.34s/it][A

	loss_cls: tensor(0.6520, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3064, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9584, device='cuda:0', grad_fn=<AddBackward0>)



 42%|████▏     | 150/354 [13:29<18:15,  5.37s/it][A

	loss_cls: tensor(0.6709, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3101, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9810, device='cuda:0', grad_fn=<AddBackward0>)



 43%|████▎     | 151/354 [13:34<18:05,  5.35s/it][A

	loss_cls: tensor(0.6335, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2183, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8518, device='cuda:0', grad_fn=<AddBackward0>)



 43%|████▎     | 152/354 [13:40<18:04,  5.37s/it][A

	loss_cls: tensor(0.4756, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3728, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8483, device='cuda:0', grad_fn=<AddBackward0>)



 43%|████▎     | 153/354 [13:45<17:56,  5.35s/it][A

	loss_cls: tensor(0.3859, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4383, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8242, device='cuda:0', grad_fn=<AddBackward0>)



 44%|████▎     | 154/354 [13:51<17:47,  5.34s/it][A

	loss_cls: tensor(0.4579, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1351, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5930, device='cuda:0', grad_fn=<AddBackward0>)



 44%|████▍     | 155/354 [13:56<17:45,  5.35s/it][A

	loss_cls: tensor(0.4028, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0833, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4861, device='cuda:0', grad_fn=<AddBackward0>)



 44%|████▍     | 156/354 [14:01<17:36,  5.34s/it][A

	loss_cls: tensor(0.4571, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2425, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6996, device='cuda:0', grad_fn=<AddBackward0>)



 44%|████▍     | 157/354 [14:07<17:35,  5.36s/it][A

	loss_cls: tensor(0.7264, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1418, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8682, device='cuda:0', grad_fn=<AddBackward0>)



 45%|████▍     | 158/354 [14:12<17:27,  5.35s/it][A

	loss_cls: tensor(0.4693, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4578, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9271, device='cuda:0', grad_fn=<AddBackward0>)



 45%|████▍     | 159/354 [14:17<17:24,  5.36s/it][A

	loss_cls: tensor(0.5684, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1848, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7532, device='cuda:0', grad_fn=<AddBackward0>)



 45%|████▌     | 160/354 [14:23<17:15,  5.34s/it][A

	loss_cls: tensor(0.6067, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2838, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8905, device='cuda:0', grad_fn=<AddBackward0>)



 45%|████▌     | 161/354 [14:28<17:07,  5.32s/it][A

	loss_cls: tensor(0.5488, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2894, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8382, device='cuda:0', grad_fn=<AddBackward0>)



 46%|████▌     | 162/354 [14:33<17:04,  5.34s/it][A

	loss_cls: tensor(0.4870, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1661, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6530, device='cuda:0', grad_fn=<AddBackward0>)



 46%|████▌     | 163/354 [14:39<16:58,  5.33s/it][A

	loss_cls: tensor(0.6824, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2862, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9686, device='cuda:0', grad_fn=<AddBackward0>)



 46%|████▋     | 164/354 [14:44<16:56,  5.35s/it][A

	loss_cls: tensor(0.7820, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1807, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9627, device='cuda:0', grad_fn=<AddBackward0>)



 47%|████▋     | 165/354 [14:49<16:47,  5.33s/it][A

	loss_cls: tensor(0.5253, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4586, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9839, device='cuda:0', grad_fn=<AddBackward0>)



 47%|████▋     | 166/354 [14:55<16:45,  5.35s/it][A

	loss_cls: tensor(0.6346, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2034, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8380, device='cuda:0', grad_fn=<AddBackward0>)



 47%|████▋     | 167/354 [15:00<16:35,  5.32s/it][A

	loss_cls: tensor(1.1280, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2679, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.3959, device='cuda:0', grad_fn=<AddBackward0>)



 47%|████▋     | 168/354 [15:05<16:26,  5.31s/it][A

	loss_cls: tensor(0.3771, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0661, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4433, device='cuda:0', grad_fn=<AddBackward0>)



 48%|████▊     | 169/354 [15:11<16:26,  5.33s/it][A

	loss_cls: tensor(0.6769, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1065, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7835, device='cuda:0', grad_fn=<AddBackward0>)



 48%|████▊     | 170/354 [15:16<16:19,  5.32s/it][A

	loss_cls: tensor(0.4693, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1546, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6239, device='cuda:0', grad_fn=<AddBackward0>)



 48%|████▊     | 171/354 [15:21<16:21,  5.36s/it][A

	loss_cls: tensor(0.4531, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1497, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6028, device='cuda:0', grad_fn=<AddBackward0>)



 49%|████▊     | 172/354 [15:27<16:14,  5.36s/it][A

	loss_cls: tensor(0.6659, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1690, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8349, device='cuda:0', grad_fn=<AddBackward0>)



 49%|████▉     | 173/354 [15:32<16:11,  5.37s/it][A

	loss_cls: tensor(0.4465, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1409, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5874, device='cuda:0', grad_fn=<AddBackward0>)



 49%|████▉     | 174/354 [15:37<16:02,  5.35s/it][A

	loss_cls: tensor(0.6083, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2296, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8379, device='cuda:0', grad_fn=<AddBackward0>)



 49%|████▉     | 175/354 [15:43<15:52,  5.32s/it][A

	loss_cls: tensor(0.6577, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2763, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9339, device='cuda:0', grad_fn=<AddBackward0>)



 50%|████▉     | 176/354 [15:48<15:51,  5.34s/it][A

	loss_cls: tensor(0.4585, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1550, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6135, device='cuda:0', grad_fn=<AddBackward0>)



 50%|█████     | 177/354 [15:53<15:45,  5.34s/it][A

	loss_cls: tensor(0.5547, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2272, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7819, device='cuda:0', grad_fn=<AddBackward0>)



 50%|█████     | 178/354 [15:59<15:43,  5.36s/it][A

	loss_cls: tensor(0.5907, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2527, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8434, device='cuda:0', grad_fn=<AddBackward0>)



 51%|█████     | 179/354 [16:04<15:37,  5.36s/it][A

	loss_cls: tensor(0.3230, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.7824, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1054, device='cuda:0', grad_fn=<AddBackward0>)



 51%|█████     | 180/354 [16:10<15:35,  5.37s/it][A

	loss_cls: tensor(0.3526, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1837, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5363, device='cuda:0', grad_fn=<AddBackward0>)



 51%|█████     | 181/354 [16:15<15:25,  5.35s/it][A

	loss_cls: tensor(0.5581, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0537, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6118, device='cuda:0', grad_fn=<AddBackward0>)



 51%|█████▏    | 182/354 [16:20<15:22,  5.36s/it][A

	loss_cls: tensor(0.4390, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2331, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6721, device='cuda:0', grad_fn=<AddBackward0>)



 52%|█████▏    | 183/354 [16:26<15:18,  5.37s/it][A

	loss_cls: tensor(0.7326, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1289, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8614, device='cuda:0', grad_fn=<AddBackward0>)



 52%|█████▏    | 184/354 [16:31<15:12,  5.37s/it][A

	loss_cls: tensor(0.8184, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2760, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0945, device='cuda:0', grad_fn=<AddBackward0>)



 52%|█████▏    | 185/354 [16:36<15:12,  5.40s/it][A

	loss_cls: tensor(0.5566, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1692, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7258, device='cuda:0', grad_fn=<AddBackward0>)



 53%|█████▎    | 186/354 [16:42<15:05,  5.39s/it][A

	loss_cls: tensor(0.5885, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3428, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9313, device='cuda:0', grad_fn=<AddBackward0>)



 53%|█████▎    | 187/354 [16:47<15:01,  5.40s/it][A

	loss_cls: tensor(0.2678, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2383, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5062, device='cuda:0', grad_fn=<AddBackward0>)



 53%|█████▎    | 188/354 [16:53<14:52,  5.38s/it][A

	loss_cls: tensor(0.6942, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.7230, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.4172, device='cuda:0', grad_fn=<AddBackward0>)



 53%|█████▎    | 189/354 [16:58<14:50,  5.40s/it][A

	loss_cls: tensor(0.6189, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1864, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8053, device='cuda:0', grad_fn=<AddBackward0>)



 54%|█████▎    | 190/354 [17:03<14:43,  5.39s/it][A

	loss_cls: tensor(0.4352, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3314, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7666, device='cuda:0', grad_fn=<AddBackward0>)



 54%|█████▍    | 191/354 [17:09<14:36,  5.38s/it][A

	loss_cls: tensor(0.6212, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1003, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7216, device='cuda:0', grad_fn=<AddBackward0>)



 54%|█████▍    | 192/354 [17:14<14:33,  5.39s/it][A

	loss_cls: tensor(0.7194, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0795, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7990, device='cuda:0', grad_fn=<AddBackward0>)



 55%|█████▍    | 193/354 [17:19<14:23,  5.37s/it][A

	loss_cls: tensor(0.4677, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2059, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6736, device='cuda:0', grad_fn=<AddBackward0>)



 55%|█████▍    | 194/354 [17:25<14:20,  5.38s/it][A

	loss_cls: tensor(0.6829, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1619, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8448, device='cuda:0', grad_fn=<AddBackward0>)



 55%|█████▌    | 195/354 [17:30<14:12,  5.36s/it][A

	loss_cls: tensor(0.5271, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3047, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8318, device='cuda:0', grad_fn=<AddBackward0>)



 55%|█████▌    | 196/354 [17:36<14:10,  5.38s/it][A

	loss_cls: tensor(0.5990, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2103, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8094, device='cuda:0', grad_fn=<AddBackward0>)



 56%|█████▌    | 197/354 [17:41<14:04,  5.38s/it][A

	loss_cls: tensor(0.6405, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1643, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8048, device='cuda:0', grad_fn=<AddBackward0>)



 56%|█████▌    | 198/354 [17:46<13:57,  5.37s/it][A

	loss_cls: tensor(0.5995, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3739, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9734, device='cuda:0', grad_fn=<AddBackward0>)



 56%|█████▌    | 199/354 [17:52<13:55,  5.39s/it][A

	loss_cls: tensor(0.5019, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2931, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7950, device='cuda:0', grad_fn=<AddBackward0>)



 56%|█████▋    | 200/354 [17:57<13:48,  5.38s/it][A

	loss_cls: tensor(0.5021, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2007, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7028, device='cuda:0', grad_fn=<AddBackward0>)



 57%|█████▋    | 201/354 [18:03<13:44,  5.39s/it][A

	loss_cls: tensor(0.7870, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1292, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9162, device='cuda:0', grad_fn=<AddBackward0>)



 57%|█████▋    | 202/354 [18:08<13:36,  5.37s/it][A

	loss_cls: tensor(0.7038, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1952, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8990, device='cuda:0', grad_fn=<AddBackward0>)



 57%|█████▋    | 203/354 [18:13<13:34,  5.39s/it][A

	loss_cls: tensor(0.5202, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1243, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6445, device='cuda:0', grad_fn=<AddBackward0>)



 58%|█████▊    | 204/354 [18:19<13:27,  5.38s/it][A

	loss_cls: tensor(0.4390, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0762, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5151, device='cuda:0', grad_fn=<AddBackward0>)



 58%|█████▊    | 205/354 [18:24<13:20,  5.37s/it][A

	loss_cls: tensor(0.7875, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1881, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9756, device='cuda:0', grad_fn=<AddBackward0>)



 58%|█████▊    | 206/354 [18:29<13:18,  5.39s/it][A

	loss_cls: tensor(0.5908, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1418, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7326, device='cuda:0', grad_fn=<AddBackward0>)



 58%|█████▊    | 207/354 [18:35<13:11,  5.38s/it][A

	loss_cls: tensor(0.4873, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2602, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7475, device='cuda:0', grad_fn=<AddBackward0>)



 59%|█████▉    | 208/354 [18:40<13:07,  5.39s/it][A

	loss_cls: tensor(0.7306, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3448, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0754, device='cuda:0', grad_fn=<AddBackward0>)



 59%|█████▉    | 209/354 [18:46<12:59,  5.37s/it][A

	loss_cls: tensor(0.7440, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0717, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8156, device='cuda:0', grad_fn=<AddBackward0>)



 59%|█████▉    | 210/354 [18:51<12:55,  5.39s/it][A

	loss_cls: tensor(0.7005, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1606, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8612, device='cuda:0', grad_fn=<AddBackward0>)



 60%|█████▉    | 211/354 [18:56<12:48,  5.37s/it][A

	loss_cls: tensor(0.6041, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1851, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7892, device='cuda:0', grad_fn=<AddBackward0>)



 60%|█████▉    | 212/354 [19:02<12:41,  5.36s/it][A

	loss_cls: tensor(0.2704, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1924, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4628, device='cuda:0', grad_fn=<AddBackward0>)



 60%|██████    | 213/354 [19:07<12:37,  5.37s/it][A

	loss_cls: tensor(0.4523, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3058, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7581, device='cuda:0', grad_fn=<AddBackward0>)



 60%|██████    | 214/354 [19:12<12:30,  5.36s/it][A

	loss_cls: tensor(0.6535, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3140, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9675, device='cuda:0', grad_fn=<AddBackward0>)



 61%|██████    | 215/354 [19:18<12:26,  5.37s/it][A

	loss_cls: tensor(0.5753, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2315, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8068, device='cuda:0', grad_fn=<AddBackward0>)



 61%|██████    | 216/354 [19:23<12:19,  5.36s/it][A

	loss_cls: tensor(0.6011, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2153, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8164, device='cuda:0', grad_fn=<AddBackward0>)



 61%|██████▏   | 217/354 [19:28<12:16,  5.37s/it][A

	loss_cls: tensor(0.4631, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3680, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8310, device='cuda:0', grad_fn=<AddBackward0>)



 62%|██████▏   | 218/354 [19:34<12:11,  5.38s/it][A

	loss_cls: tensor(0.4614, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2003, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6616, device='cuda:0', grad_fn=<AddBackward0>)



 62%|██████▏   | 219/354 [19:39<12:04,  5.37s/it][A

	loss_cls: tensor(0.7507, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1278, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8785, device='cuda:0', grad_fn=<AddBackward0>)



 62%|██████▏   | 220/354 [19:45<12:02,  5.39s/it][A

	loss_cls: tensor(0.5265, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2637, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7902, device='cuda:0', grad_fn=<AddBackward0>)



 62%|██████▏   | 221/354 [19:50<11:54,  5.37s/it][A

	loss_cls: tensor(0.4863, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1608, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6472, device='cuda:0', grad_fn=<AddBackward0>)



 63%|██████▎   | 222/354 [19:55<11:51,  5.39s/it][A

	loss_cls: tensor(0.7286, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1008, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8294, device='cuda:0', grad_fn=<AddBackward0>)



 63%|██████▎   | 223/354 [20:01<11:45,  5.38s/it][A

	loss_cls: tensor(0.5644, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4134, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9778, device='cuda:0', grad_fn=<AddBackward0>)



 63%|██████▎   | 224/354 [20:06<11:42,  5.40s/it][A

	loss_cls: tensor(0.5991, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4597, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0588, device='cuda:0', grad_fn=<AddBackward0>)



 64%|██████▎   | 225/354 [20:12<11:34,  5.38s/it][A

	loss_cls: tensor(0.4781, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1409, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6190, device='cuda:0', grad_fn=<AddBackward0>)



 64%|██████▍   | 226/354 [20:17<11:26,  5.37s/it][A

	loss_cls: tensor(0.6649, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3615, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0264, device='cuda:0', grad_fn=<AddBackward0>)



 64%|██████▍   | 227/354 [20:22<11:23,  5.39s/it][A

	loss_cls: tensor(0.7754, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2196, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9949, device='cuda:0', grad_fn=<AddBackward0>)



 64%|██████▍   | 228/354 [20:28<11:16,  5.37s/it][A

	loss_cls: tensor(0.5450, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2897, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8347, device='cuda:0', grad_fn=<AddBackward0>)



 65%|██████▍   | 229/354 [20:33<11:14,  5.39s/it][A

	loss_cls: tensor(0.4540, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1464, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6004, device='cuda:0', grad_fn=<AddBackward0>)



 65%|██████▍   | 230/354 [20:38<11:07,  5.38s/it][A

	loss_cls: tensor(0.7550, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0891, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8441, device='cuda:0', grad_fn=<AddBackward0>)



 65%|██████▌   | 231/354 [20:44<11:04,  5.40s/it][A

	loss_cls: tensor(0.7350, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2427, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9777, device='cuda:0', grad_fn=<AddBackward0>)



 66%|██████▌   | 232/354 [20:49<10:56,  5.38s/it][A

	loss_cls: tensor(0.5444, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3091, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8535, device='cuda:0', grad_fn=<AddBackward0>)



 66%|██████▌   | 233/354 [20:55<10:49,  5.37s/it][A

	loss_cls: tensor(0.5883, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1476, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7359, device='cuda:0', grad_fn=<AddBackward0>)



 66%|██████▌   | 234/354 [21:00<10:45,  5.38s/it][A

	loss_cls: tensor(0.8314, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.7250, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.5564, device='cuda:0', grad_fn=<AddBackward0>)



 66%|██████▋   | 235/354 [21:05<10:37,  5.36s/it][A

	loss_cls: tensor(0.6857, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3131, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9988, device='cuda:0', grad_fn=<AddBackward0>)



 67%|██████▋   | 236/354 [21:11<10:35,  5.39s/it][A

	loss_cls: tensor(0.4488, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0958, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5446, device='cuda:0', grad_fn=<AddBackward0>)



 67%|██████▋   | 237/354 [21:16<10:28,  5.37s/it][A

	loss_cls: tensor(0.4002, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2592, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6595, device='cuda:0', grad_fn=<AddBackward0>)



 67%|██████▋   | 238/354 [21:22<10:24,  5.39s/it][A

	loss_cls: tensor(0.5805, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0860, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6665, device='cuda:0', grad_fn=<AddBackward0>)



 68%|██████▊   | 239/354 [21:27<10:17,  5.37s/it][A

	loss_cls: tensor(0.5336, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3215, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8551, device='cuda:0', grad_fn=<AddBackward0>)



 68%|██████▊   | 240/354 [21:32<10:10,  5.36s/it][A

	loss_cls: tensor(0.6804, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3459, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0262, device='cuda:0', grad_fn=<AddBackward0>)



 68%|██████▊   | 241/354 [21:38<10:07,  5.38s/it][A

	loss_cls: tensor(0.6141, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3064, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9205, device='cuda:0', grad_fn=<AddBackward0>)



 68%|██████▊   | 242/354 [21:43<10:02,  5.38s/it][A

	loss_cls: tensor(0.5362, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1160, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6523, device='cuda:0', grad_fn=<AddBackward0>)



 69%|██████▊   | 243/354 [21:48<09:59,  5.40s/it][A

	loss_cls: tensor(0.5072, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1332, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6405, device='cuda:0', grad_fn=<AddBackward0>)



 69%|██████▉   | 244/354 [21:54<09:51,  5.38s/it][A

	loss_cls: tensor(0.5730, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1161, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6891, device='cuda:0', grad_fn=<AddBackward0>)



 69%|██████▉   | 245/354 [21:59<09:48,  5.40s/it][A

	loss_cls: tensor(0.5616, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4499, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0115, device='cuda:0', grad_fn=<AddBackward0>)



 69%|██████▉   | 246/354 [22:05<09:41,  5.38s/it][A

	loss_cls: tensor(0.5197, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1767, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6964, device='cuda:0', grad_fn=<AddBackward0>)



 70%|██████▉   | 247/354 [22:10<09:33,  5.36s/it][A

	loss_cls: tensor(0.5058, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2322, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7379, device='cuda:0', grad_fn=<AddBackward0>)



 70%|███████   | 248/354 [22:15<09:30,  5.38s/it][A

	loss_cls: tensor(0.7649, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4404, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2053, device='cuda:0', grad_fn=<AddBackward0>)



 70%|███████   | 249/354 [22:21<09:24,  5.38s/it][A

	loss_cls: tensor(0.6435, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3198, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9632, device='cuda:0', grad_fn=<AddBackward0>)



 71%|███████   | 250/354 [22:26<09:20,  5.39s/it][A

	loss_cls: tensor(0.4338, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1626, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5965, device='cuda:0', grad_fn=<AddBackward0>)



 71%|███████   | 251/354 [22:31<09:14,  5.38s/it][A

	loss_cls: tensor(0.5220, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1995, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7215, device='cuda:0', grad_fn=<AddBackward0>)



 71%|███████   | 252/354 [22:37<09:10,  5.40s/it][A

	loss_cls: tensor(0.6995, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1897, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8892, device='cuda:0', grad_fn=<AddBackward0>)



 71%|███████▏  | 253/354 [22:42<09:03,  5.38s/it][A

	loss_cls: tensor(0.6131, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3937, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0067, device='cuda:0', grad_fn=<AddBackward0>)



 72%|███████▏  | 254/354 [22:48<08:57,  5.37s/it][A

	loss_cls: tensor(0.8327, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2170, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0497, device='cuda:0', grad_fn=<AddBackward0>)



 72%|███████▏  | 255/354 [22:53<08:52,  5.38s/it][A

	loss_cls: tensor(0.9985, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0916, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0901, device='cuda:0', grad_fn=<AddBackward0>)



 72%|███████▏  | 256/354 [22:58<08:46,  5.37s/it][A

	loss_cls: tensor(0.6526, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3007, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9533, device='cuda:0', grad_fn=<AddBackward0>)



 73%|███████▎  | 257/354 [23:04<08:41,  5.38s/it][A

	loss_cls: tensor(0.6075, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1927, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8003, device='cuda:0', grad_fn=<AddBackward0>)



 73%|███████▎  | 258/354 [23:09<08:35,  5.37s/it][A

	loss_cls: tensor(0.6635, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0968, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7602, device='cuda:0', grad_fn=<AddBackward0>)



 73%|███████▎  | 259/354 [23:14<08:31,  5.38s/it][A

	loss_cls: tensor(0.5655, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2594, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8249, device='cuda:0', grad_fn=<AddBackward0>)



 73%|███████▎  | 260/354 [23:20<08:25,  5.38s/it][A

	loss_cls: tensor(0.7734, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2878, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0612, device='cuda:0', grad_fn=<AddBackward0>)



 74%|███████▎  | 261/354 [23:25<08:20,  5.38s/it][A

	loss_cls: tensor(0.5112, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2860, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7973, device='cuda:0', grad_fn=<AddBackward0>)



 74%|███████▍  | 262/354 [23:31<08:16,  5.40s/it][A

	loss_cls: tensor(0.4503, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0431, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4934, device='cuda:0', grad_fn=<AddBackward0>)



 74%|███████▍  | 263/354 [23:36<08:10,  5.39s/it][A

	loss_cls: tensor(0.4858, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2118, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6976, device='cuda:0', grad_fn=<AddBackward0>)



 75%|███████▍  | 264/354 [23:42<08:07,  5.41s/it][A

	loss_cls: tensor(0.5143, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2319, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7461, device='cuda:0', grad_fn=<AddBackward0>)



 75%|███████▍  | 265/354 [23:47<08:00,  5.40s/it][A

	loss_cls: tensor(0.4809, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2451, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7260, device='cuda:0', grad_fn=<AddBackward0>)



 75%|███████▌  | 266/354 [23:52<07:56,  5.41s/it][A

	loss_cls: tensor(0.7608, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1845, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9453, device='cuda:0', grad_fn=<AddBackward0>)



 75%|███████▌  | 267/354 [23:58<07:49,  5.40s/it][A

	loss_cls: tensor(0.4339, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1024, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5362, device='cuda:0', grad_fn=<AddBackward0>)



 76%|███████▌  | 268/354 [24:03<07:43,  5.38s/it][A

	loss_cls: tensor(0.4856, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3328, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8184, device='cuda:0', grad_fn=<AddBackward0>)



 76%|███████▌  | 269/354 [24:09<07:39,  5.40s/it][A

	loss_cls: tensor(0.3701, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0651, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4352, device='cuda:0', grad_fn=<AddBackward0>)



 76%|███████▋  | 270/354 [24:14<07:32,  5.39s/it][A

	loss_cls: tensor(0.9459, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1421, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0881, device='cuda:0', grad_fn=<AddBackward0>)



 77%|███████▋  | 271/354 [24:19<07:28,  5.40s/it][A

	loss_cls: tensor(0.7911, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0905, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8816, device='cuda:0', grad_fn=<AddBackward0>)



 77%|███████▋  | 272/354 [24:25<07:20,  5.38s/it][A

	loss_cls: tensor(0.4403, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1374, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5777, device='cuda:0', grad_fn=<AddBackward0>)



 77%|███████▋  | 273/354 [24:30<07:16,  5.39s/it][A

	loss_cls: tensor(0.5188, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3407, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8595, device='cuda:0', grad_fn=<AddBackward0>)



 77%|███████▋  | 274/354 [24:35<07:09,  5.36s/it][A

	loss_cls: tensor(0.7373, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0817, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8190, device='cuda:0', grad_fn=<AddBackward0>)



 78%|███████▊  | 275/354 [24:41<07:01,  5.34s/it][A

	loss_cls: tensor(0.6766, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2367, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9134, device='cuda:0', grad_fn=<AddBackward0>)



 78%|███████▊  | 276/354 [24:46<06:57,  5.35s/it][A

	loss_cls: tensor(0.4589, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0946, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5535, device='cuda:0', grad_fn=<AddBackward0>)



 78%|███████▊  | 277/354 [24:51<06:51,  5.35s/it][A

	loss_cls: tensor(0.4789, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1166, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5955, device='cuda:0', grad_fn=<AddBackward0>)



 79%|███████▊  | 278/354 [24:57<06:47,  5.36s/it][A

	loss_cls: tensor(0.5015, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3837, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8852, device='cuda:0', grad_fn=<AddBackward0>)



 79%|███████▉  | 279/354 [25:02<06:40,  5.34s/it][A

	loss_cls: tensor(0.6193, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3517, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9710, device='cuda:0', grad_fn=<AddBackward0>)



 79%|███████▉  | 280/354 [25:07<06:36,  5.35s/it][A

	loss_cls: tensor(0.9046, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5761, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.4808, device='cuda:0', grad_fn=<AddBackward0>)



 79%|███████▉  | 281/354 [25:13<06:29,  5.34s/it][A

	loss_cls: tensor(0.4806, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3658, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8465, device='cuda:0', grad_fn=<AddBackward0>)



 80%|███████▉  | 282/354 [25:18<06:24,  5.35s/it][A

	loss_cls: tensor(0.9844, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5535, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.5379, device='cuda:0', grad_fn=<AddBackward0>)



 80%|███████▉  | 283/354 [25:23<06:18,  5.34s/it][A

	loss_cls: tensor(0.6178, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2893, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9071, device='cuda:0', grad_fn=<AddBackward0>)



 80%|████████  | 284/354 [25:29<06:13,  5.33s/it][A

	loss_cls: tensor(0.5405, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2090, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7494, device='cuda:0', grad_fn=<AddBackward0>)



 81%|████████  | 285/354 [25:34<06:09,  5.35s/it][A

	loss_cls: tensor(1.2139, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1969, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.4108, device='cuda:0', grad_fn=<AddBackward0>)



 81%|████████  | 286/354 [25:39<06:03,  5.34s/it][A

	loss_cls: tensor(0.7620, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1334, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8954, device='cuda:0', grad_fn=<AddBackward0>)



 81%|████████  | 287/354 [25:45<05:58,  5.35s/it][A

	loss_cls: tensor(0.6833, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1759, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8592, device='cuda:0', grad_fn=<AddBackward0>)



 81%|████████▏ | 288/354 [25:50<05:52,  5.35s/it][A

	loss_cls: tensor(0.4257, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2048, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6305, device='cuda:0', grad_fn=<AddBackward0>)



 82%|████████▏ | 289/354 [25:56<05:48,  5.36s/it][A

	loss_cls: tensor(0.5717, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2266, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7983, device='cuda:0', grad_fn=<AddBackward0>)



 82%|████████▏ | 290/354 [26:01<05:41,  5.34s/it][A

	loss_cls: tensor(0.5184, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3564, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8748, device='cuda:0', grad_fn=<AddBackward0>)



 82%|████████▏ | 291/354 [26:06<05:35,  5.33s/it][A

	loss_cls: tensor(0.6713, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2058, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8771, device='cuda:0', grad_fn=<AddBackward0>)



 82%|████████▏ | 292/354 [26:12<05:31,  5.35s/it][A

	loss_cls: tensor(0.7758, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2902, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0660, device='cuda:0', grad_fn=<AddBackward0>)



 83%|████████▎ | 293/354 [26:17<05:25,  5.34s/it][A

	loss_cls: tensor(0.7859, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1950, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9809, device='cuda:0', grad_fn=<AddBackward0>)



 83%|████████▎ | 294/354 [26:22<05:20,  5.34s/it][A

	loss_cls: tensor(0.4685, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1031, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5716, device='cuda:0', grad_fn=<AddBackward0>)



 83%|████████▎ | 295/354 [26:27<05:14,  5.32s/it][A

	loss_cls: tensor(0.5954, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3123, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9077, device='cuda:0', grad_fn=<AddBackward0>)



 84%|████████▎ | 296/354 [26:33<05:10,  5.35s/it][A

	loss_cls: tensor(0.5448, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1941, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7389, device='cuda:0', grad_fn=<AddBackward0>)



 84%|████████▍ | 297/354 [26:38<05:04,  5.34s/it][A

	loss_cls: tensor(0.5325, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1857, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7182, device='cuda:0', grad_fn=<AddBackward0>)



 84%|████████▍ | 298/354 [26:43<04:58,  5.33s/it][A

	loss_cls: tensor(0.3811, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3585, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7396, device='cuda:0', grad_fn=<AddBackward0>)



 84%|████████▍ | 299/354 [26:49<04:54,  5.35s/it][A

	loss_cls: tensor(0.8815, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1495, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0310, device='cuda:0', grad_fn=<AddBackward0>)



 85%|████████▍ | 300/354 [26:54<04:48,  5.34s/it][A

	loss_cls: tensor(0.6885, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1834, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8719, device='cuda:0', grad_fn=<AddBackward0>)



 85%|████████▌ | 301/354 [27:00<04:43,  5.35s/it][A

	loss_cls: tensor(0.6566, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0492, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7058, device='cuda:0', grad_fn=<AddBackward0>)



 85%|████████▌ | 302/354 [27:05<04:37,  5.33s/it][A

	loss_cls: tensor(0.6230, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2977, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9207, device='cuda:0', grad_fn=<AddBackward0>)



 86%|████████▌ | 303/354 [27:10<04:32,  5.34s/it][A

	loss_cls: tensor(0.5940, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1419, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7360, device='cuda:0', grad_fn=<AddBackward0>)



 86%|████████▌ | 304/354 [27:16<04:26,  5.33s/it][A

	loss_cls: tensor(0.4885, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4492, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9377, device='cuda:0', grad_fn=<AddBackward0>)



 86%|████████▌ | 305/354 [27:21<04:20,  5.31s/it][A

	loss_cls: tensor(0.4301, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1146, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5447, device='cuda:0', grad_fn=<AddBackward0>)



 86%|████████▋ | 306/354 [27:26<04:15,  5.33s/it][A

	loss_cls: tensor(0.4981, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1656, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6637, device='cuda:0', grad_fn=<AddBackward0>)



 87%|████████▋ | 307/354 [27:31<04:10,  5.32s/it][A

	loss_cls: tensor(0.6065, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1494, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7559, device='cuda:0', grad_fn=<AddBackward0>)



 87%|████████▋ | 308/354 [27:37<04:05,  5.34s/it][A

	loss_cls: tensor(0.4692, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3028, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7720, device='cuda:0', grad_fn=<AddBackward0>)



 87%|████████▋ | 309/354 [27:42<03:59,  5.33s/it][A

	loss_cls: tensor(0.5772, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2924, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8696, device='cuda:0', grad_fn=<AddBackward0>)



 88%|████████▊ | 310/354 [27:48<03:55,  5.34s/it][A

	loss_cls: tensor(0.6947, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1104, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8051, device='cuda:0', grad_fn=<AddBackward0>)



 88%|████████▊ | 311/354 [27:53<03:49,  5.34s/it][A

	loss_cls: tensor(0.7230, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0335, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7565, device='cuda:0', grad_fn=<AddBackward0>)



 88%|████████▊ | 312/354 [27:58<03:46,  5.38s/it][A

	loss_cls: tensor(0.4849, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3285, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8134, device='cuda:0', grad_fn=<AddBackward0>)



 88%|████████▊ | 313/354 [28:04<03:43,  5.45s/it][A

	loss_cls: tensor(0.4904, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1215, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6118, device='cuda:0', grad_fn=<AddBackward0>)



 89%|████████▊ | 314/354 [28:09<03:38,  5.46s/it][A

	loss_cls: tensor(0.5935, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3413, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9348, device='cuda:0', grad_fn=<AddBackward0>)



 89%|████████▉ | 315/354 [28:15<03:33,  5.49s/it][A

	loss_cls: tensor(0.3213, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0333, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.3545, device='cuda:0', grad_fn=<AddBackward0>)



 89%|████████▉ | 316/354 [28:20<03:28,  5.48s/it][A

	loss_cls: tensor(0.5443, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2216, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7658, device='cuda:0', grad_fn=<AddBackward0>)



 90%|████████▉ | 317/354 [28:26<03:23,  5.50s/it][A

	loss_cls: tensor(0.7630, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0575, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8205, device='cuda:0', grad_fn=<AddBackward0>)



 90%|████████▉ | 318/354 [28:32<03:17,  5.50s/it][A

	loss_cls: tensor(0.4555, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2362, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6917, device='cuda:0', grad_fn=<AddBackward0>)



 90%|█████████ | 319/354 [28:37<03:12,  5.49s/it][A

	loss_cls: tensor(0.4703, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2026, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6728, device='cuda:0', grad_fn=<AddBackward0>)



 90%|█████████ | 320/354 [28:43<03:07,  5.51s/it][A

	loss_cls: tensor(0.5187, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3385, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8571, device='cuda:0', grad_fn=<AddBackward0>)



 91%|█████████ | 321/354 [28:48<03:01,  5.50s/it][A

	loss_cls: tensor(0.6182, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4196, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0378, device='cuda:0', grad_fn=<AddBackward0>)



 91%|█████████ | 322/354 [28:54<02:56,  5.51s/it][A

	loss_cls: tensor(0.5334, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3936, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9270, device='cuda:0', grad_fn=<AddBackward0>)



 91%|█████████ | 323/354 [28:59<02:51,  5.52s/it][A

	loss_cls: tensor(0.7273, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3367, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0640, device='cuda:0', grad_fn=<AddBackward0>)



 92%|█████████▏| 324/354 [29:05<02:46,  5.55s/it][A

	loss_cls: tensor(0.5924, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0871, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6795, device='cuda:0', grad_fn=<AddBackward0>)



 92%|█████████▏| 325/354 [29:10<02:40,  5.53s/it][A

	loss_cls: tensor(0.3913, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1949, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5862, device='cuda:0', grad_fn=<AddBackward0>)



 92%|█████████▏| 326/354 [29:16<02:34,  5.51s/it][A

	loss_cls: tensor(0.5784, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2591, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8375, device='cuda:0', grad_fn=<AddBackward0>)



 92%|█████████▏| 327/354 [29:21<02:24,  5.34s/it][A

	loss_cls: tensor(0.5794, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1000, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6794, device='cuda:0', grad_fn=<AddBackward0>)



 93%|█████████▎| 328/354 [29:26<02:16,  5.26s/it][A

	loss_cls: tensor(0.5848, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2773, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8621, device='cuda:0', grad_fn=<AddBackward0>)



 93%|█████████▎| 329/354 [29:31<02:13,  5.35s/it][A

	loss_cls: tensor(0.8028, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2057, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0085, device='cuda:0', grad_fn=<AddBackward0>)



 93%|█████████▎| 330/354 [29:37<02:09,  5.41s/it][A

	loss_cls: tensor(0.5908, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3560, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9468, device='cuda:0', grad_fn=<AddBackward0>)



 94%|█████████▎| 331/354 [29:42<02:05,  5.46s/it][A

	loss_cls: tensor(0.7734, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3622, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1356, device='cuda:0', grad_fn=<AddBackward0>)



 94%|█████████▍| 332/354 [29:48<02:00,  5.47s/it][A

	loss_cls: tensor(0.6092, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1860, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7953, device='cuda:0', grad_fn=<AddBackward0>)



 94%|█████████▍| 333/354 [29:53<01:54,  5.48s/it][A

	loss_cls: tensor(0.7647, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3665, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1313, device='cuda:0', grad_fn=<AddBackward0>)



 94%|█████████▍| 334/354 [29:59<01:50,  5.51s/it][A

	loss_cls: tensor(0.7154, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2394, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9548, device='cuda:0', grad_fn=<AddBackward0>)



 95%|█████████▍| 335/354 [30:04<01:44,  5.50s/it][A

	loss_cls: tensor(0.4354, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0688, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5043, device='cuda:0', grad_fn=<AddBackward0>)



 95%|█████████▍| 336/354 [30:10<01:39,  5.52s/it][A

	loss_cls: tensor(0.8047, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1060, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9106, device='cuda:0', grad_fn=<AddBackward0>)



 95%|█████████▌| 337/354 [30:15<01:31,  5.41s/it][A

	loss_cls: tensor(0.6073, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4255, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0328, device='cuda:0', grad_fn=<AddBackward0>)



 95%|█████████▌| 338/354 [30:21<01:27,  5.46s/it][A

	loss_cls: tensor(0.4314, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1566, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5880, device='cuda:0', grad_fn=<AddBackward0>)



 96%|█████████▌| 339/354 [30:26<01:21,  5.46s/it][A

	loss_cls: tensor(0.6392, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3918, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0310, device='cuda:0', grad_fn=<AddBackward0>)



 96%|█████████▌| 340/354 [30:32<01:16,  5.47s/it][A

	loss_cls: tensor(0.6049, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3790, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9838, device='cuda:0', grad_fn=<AddBackward0>)



 96%|█████████▋| 341/354 [30:37<01:11,  5.49s/it][A

	loss_cls: tensor(0.3733, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0893, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4626, device='cuda:0', grad_fn=<AddBackward0>)



 97%|█████████▋| 342/354 [30:43<01:05,  5.49s/it][A

	loss_cls: tensor(0.4291, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2215, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6506, device='cuda:0', grad_fn=<AddBackward0>)



 97%|█████████▋| 343/354 [30:48<00:58,  5.32s/it][A

	loss_cls: tensor(0.3827, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1587, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5414, device='cuda:0', grad_fn=<AddBackward0>)



 97%|█████████▋| 344/354 [30:53<00:53,  5.34s/it][A

	loss_cls: tensor(0.7402, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1326, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8729, device='cuda:0', grad_fn=<AddBackward0>)



 97%|█████████▋| 345/354 [30:59<00:48,  5.39s/it][A

	loss_cls: tensor(0.4844, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2240, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7084, device='cuda:0', grad_fn=<AddBackward0>)



 98%|█████████▊| 346/354 [31:04<00:43,  5.42s/it][A

	loss_cls: tensor(0.6367, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4496, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0863, device='cuda:0', grad_fn=<AddBackward0>)



 98%|█████████▊| 347/354 [31:10<00:38,  5.45s/it][A

	loss_cls: tensor(0.4419, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1751, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6170, device='cuda:0', grad_fn=<AddBackward0>)



 98%|█████████▊| 348/354 [31:15<00:32,  5.48s/it][A

	loss_cls: tensor(0.4008, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1819, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5827, device='cuda:0', grad_fn=<AddBackward0>)



 99%|█████████▊| 349/354 [31:21<00:27,  5.49s/it][A

	loss_cls: tensor(0.6370, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2359, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8728, device='cuda:0', grad_fn=<AddBackward0>)



 99%|█████████▉| 350/354 [31:26<00:22,  5.51s/it][A

	loss_cls: tensor(0.5404, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5539, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0943, device='cuda:0', grad_fn=<AddBackward0>)



 99%|█████████▉| 351/354 [31:31<00:15,  5.32s/it][A

	loss_cls: tensor(0.9431, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1378, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0809, device='cuda:0', grad_fn=<AddBackward0>)



 99%|█████████▉| 352/354 [31:36<00:10,  5.35s/it][A

	loss_cls: tensor(0.6697, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1350, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8046, device='cuda:0', grad_fn=<AddBackward0>)



100%|█████████▉| 353/354 [31:42<00:05,  5.39s/it][A

	loss_cls: tensor(0.7415, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2397, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9812, device='cuda:0', grad_fn=<AddBackward0>)



100%|██████████| 354/354 [31:44<00:00,  5.38s/it][A

  0%|          | 0/177 [00:00<?, ?it/s][A

	loss_cls: tensor(0.9497, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0494, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9990, device='cuda:0', grad_fn=<AddBackward0>)

	Training Loss: 0.8202047122400359

	Training cls acc: 0.7055673258003766

	Training cls prec: 0.5817204609577491

	Training cls rec: 0.6192747048149589

	Training cls f1: 0.5478537043025883

--
	Training ner acc: 0.9557918158825526

	Training ner prec: 0.2654942541822457

	Training ner rec: 0.2727052345642949

	Training ner f1: 0.2682670467596357

	Current Learning rate:  0.0002571428571428571



  1%|          | 1/177 [00:00<02:14,  1.31it/s][A
  1%|          | 2/177 [00:01<02:12,  1.32it/s][A
  2%|▏         | 3/177 [00:02<02:11,  1.32it/s][A
  2%|▏         | 4/177 [00:02<02:05,  1.37it/s][A
  3%|▎         | 5/177 [00:03<02:06,  1.36it/s][A
  3%|▎         | 6/177 [00:04<02:07,  1.34it/s][A
  4%|▍         | 7/177 [00:05<02:02,  1.39it/s][A
  5%|▍         | 8/177 [00:05<02:03,  1.37it/s][A
  5%|▌         | 9/177 [00:06<02:04,  1.35it/s][A
  6%|▌         | 10/177 [00:07<02:04,  1.35it/s][A
  6%|▌         | 11/177 [00:08<02:00,  1.38it/s][A
  7%|▋         | 12/177 [00:08<02:01,  1.36it/s][A
  7%|▋         | 13/177 [00:09<02:01,  1.35it/s][A
  8%|▊         | 14/177 [00:10<02:01,  1.34it/s][A
  8%|▊         | 15/177 [00:11<01:57,  1.38it/s][A
  9%|▉         | 16/177 [00:11<01:58,  1.36it/s][A
 10%|▉         | 17/177 [00:12<01:58,  1.35it/s][A
 10%|█         | 18/177 [00:13<01:54,  1.39it/s][A
 11%|█         | 19/177 [00:13<01:54,  1.38it/s][A
 11%|█▏        | 20/


	Validation Loss: 0.7704003039052931

	Validation cls acc: 0.6551318267419963

	Validation cls prec: 0.5983185364541297

	Validation cls rec: 0.5630986010223298

	Validation cls f1: 0.5237360254309408

--
	Validation ner acc: 0.9555147763961864

	Validation ner prec: 0.4361649284396391

	Validation ner rec: 0.446421845574388

	Validation ner f1: 0.4410769800702948



  0%|          | 1/354 [00:05<32:20,  5.50s/it][A

	loss_cls: tensor(0.6055, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2251, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8306, device='cuda:0', grad_fn=<AddBackward0>)



  1%|          | 2/354 [00:11<32:25,  5.53s/it][A

	loss_cls: tensor(0.3460, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1661, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5121, device='cuda:0', grad_fn=<AddBackward0>)



  1%|          | 3/354 [00:16<32:08,  5.49s/it][A

	loss_cls: tensor(0.5344, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1161, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6505, device='cuda:0', grad_fn=<AddBackward0>)



  1%|          | 4/354 [00:22<32:04,  5.50s/it][A

	loss_cls: tensor(0.9123, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1928, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1051, device='cuda:0', grad_fn=<AddBackward0>)



  1%|▏         | 5/354 [00:27<31:51,  5.48s/it][A

	loss_cls: tensor(0.6219, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2049, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8268, device='cuda:0', grad_fn=<AddBackward0>)



  2%|▏         | 6/354 [00:33<31:55,  5.51s/it][A

	loss_cls: tensor(0.5477, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0702, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6179, device='cuda:0', grad_fn=<AddBackward0>)



  2%|▏         | 7/354 [00:38<31:42,  5.48s/it][A

	loss_cls: tensor(0.6092, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2443, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8535, device='cuda:0', grad_fn=<AddBackward0>)



  2%|▏         | 8/354 [00:43<31:36,  5.48s/it][A

	loss_cls: tensor(0.4743, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3193, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7935, device='cuda:0', grad_fn=<AddBackward0>)



  3%|▎         | 9/354 [00:49<31:35,  5.49s/it][A

	loss_cls: tensor(0.6807, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2191, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8999, device='cuda:0', grad_fn=<AddBackward0>)



  3%|▎         | 10/354 [00:54<31:23,  5.47s/it][A

	loss_cls: tensor(0.3488, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0521, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4009, device='cuda:0', grad_fn=<AddBackward0>)



  3%|▎         | 11/354 [01:00<31:23,  5.49s/it][A

	loss_cls: tensor(0.5740, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3458, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9198, device='cuda:0', grad_fn=<AddBackward0>)



  3%|▎         | 12/354 [01:05<31:16,  5.49s/it][A

	loss_cls: tensor(0.3636, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0628, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4263, device='cuda:0', grad_fn=<AddBackward0>)



  4%|▎         | 13/354 [01:11<31:17,  5.50s/it][A

	loss_cls: tensor(0.7480, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.6120, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.3600, device='cuda:0', grad_fn=<AddBackward0>)



  4%|▍         | 14/354 [01:16<31:07,  5.49s/it][A

	loss_cls: tensor(0.6670, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4242, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0912, device='cuda:0', grad_fn=<AddBackward0>)



  4%|▍         | 15/354 [01:22<31:00,  5.49s/it][A

	loss_cls: tensor(0.7103, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1043, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8146, device='cuda:0', grad_fn=<AddBackward0>)



  5%|▍         | 16/354 [01:27<31:05,  5.52s/it][A

	loss_cls: tensor(0.4954, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1406, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6360, device='cuda:0', grad_fn=<AddBackward0>)



  5%|▍         | 17/354 [01:33<30:54,  5.50s/it][A

	loss_cls: tensor(0.8726, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2318, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1044, device='cuda:0', grad_fn=<AddBackward0>)



  5%|▌         | 18/354 [01:38<30:50,  5.51s/it][A

	loss_cls: tensor(0.4430, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1962, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6392, device='cuda:0', grad_fn=<AddBackward0>)



  5%|▌         | 19/354 [01:44<30:42,  5.50s/it][A

	loss_cls: tensor(0.4457, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1362, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5819, device='cuda:0', grad_fn=<AddBackward0>)



  6%|▌         | 20/354 [01:49<30:42,  5.52s/it][A

	loss_cls: tensor(0.5208, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2674, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7882, device='cuda:0', grad_fn=<AddBackward0>)



  6%|▌         | 21/354 [01:55<30:29,  5.49s/it][A

	loss_cls: tensor(0.4820, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1271, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6091, device='cuda:0', grad_fn=<AddBackward0>)



  6%|▌         | 22/354 [02:00<30:20,  5.48s/it][A

	loss_cls: tensor(0.3946, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1015, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4961, device='cuda:0', grad_fn=<AddBackward0>)



  6%|▋         | 23/354 [02:06<30:21,  5.50s/it][A

	loss_cls: tensor(0.4688, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0492, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5180, device='cuda:0', grad_fn=<AddBackward0>)



  7%|▋         | 24/354 [02:11<30:13,  5.50s/it][A

	loss_cls: tensor(0.4349, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1386, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5735, device='cuda:0', grad_fn=<AddBackward0>)



  7%|▋         | 25/354 [02:17<30:11,  5.51s/it][A

	loss_cls: tensor(0.7956, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2036, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9991, device='cuda:0', grad_fn=<AddBackward0>)



  7%|▋         | 26/354 [02:22<29:28,  5.39s/it][A

	loss_cls: tensor(0.3144, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2611, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5755, device='cuda:0', grad_fn=<AddBackward0>)



  8%|▊         | 27/354 [02:28<29:33,  5.42s/it][A

	loss_cls: tensor(0.6752, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5399, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2151, device='cuda:0', grad_fn=<AddBackward0>)



  8%|▊         | 28/354 [02:33<29:36,  5.45s/it][A

	loss_cls: tensor(0.6267, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1327, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7593, device='cuda:0', grad_fn=<AddBackward0>)



  8%|▊         | 29/354 [02:39<29:33,  5.46s/it][A

	loss_cls: tensor(0.7902, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1765, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9667, device='cuda:0', grad_fn=<AddBackward0>)



  8%|▊         | 30/354 [02:44<29:34,  5.48s/it][A

	loss_cls: tensor(0.5218, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1328, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6546, device='cuda:0', grad_fn=<AddBackward0>)



  9%|▉         | 31/354 [02:50<29:29,  5.48s/it][A

	loss_cls: tensor(0.4838, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1997, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6835, device='cuda:0', grad_fn=<AddBackward0>)



  9%|▉         | 32/354 [02:55<29:35,  5.51s/it][A

	loss_cls: tensor(0.5449, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1883, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7332, device='cuda:0', grad_fn=<AddBackward0>)



  9%|▉         | 33/354 [03:01<29:25,  5.50s/it][A

	loss_cls: tensor(0.3795, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3667, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7462, device='cuda:0', grad_fn=<AddBackward0>)



 10%|▉         | 34/354 [03:06<29:25,  5.52s/it][A

	loss_cls: tensor(0.5793, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3665, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9459, device='cuda:0', grad_fn=<AddBackward0>)



 10%|▉         | 35/354 [03:12<29:13,  5.50s/it][A

	loss_cls: tensor(0.5767, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3885, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9652, device='cuda:0', grad_fn=<AddBackward0>)



 10%|█         | 36/354 [03:17<29:04,  5.49s/it][A

	loss_cls: tensor(0.6032, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1130, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7161, device='cuda:0', grad_fn=<AddBackward0>)



 10%|█         | 37/354 [03:23<29:07,  5.51s/it][A

	loss_cls: tensor(0.5072, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3275, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8347, device='cuda:0', grad_fn=<AddBackward0>)



 11%|█         | 38/354 [03:28<28:56,  5.50s/it][A

	loss_cls: tensor(0.4608, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1884, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6492, device='cuda:0', grad_fn=<AddBackward0>)



 11%|█         | 39/354 [03:34<28:57,  5.52s/it][A

	loss_cls: tensor(0.7185, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2848, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0033, device='cuda:0', grad_fn=<AddBackward0>)



 11%|█▏        | 40/354 [03:39<28:48,  5.51s/it][A

	loss_cls: tensor(0.4825, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2006, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6831, device='cuda:0', grad_fn=<AddBackward0>)



 12%|█▏        | 41/354 [03:45<28:46,  5.52s/it][A

	loss_cls: tensor(0.4896, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0759, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5655, device='cuda:0', grad_fn=<AddBackward0>)



 12%|█▏        | 42/354 [03:50<28:35,  5.50s/it][A

	loss_cls: tensor(0.7012, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1125, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8137, device='cuda:0', grad_fn=<AddBackward0>)



 12%|█▏        | 43/354 [03:56<28:26,  5.49s/it][A

	loss_cls: tensor(0.5711, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1847, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7559, device='cuda:0', grad_fn=<AddBackward0>)



 12%|█▏        | 44/354 [04:01<28:26,  5.51s/it][A

	loss_cls: tensor(0.6961, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3176, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0137, device='cuda:0', grad_fn=<AddBackward0>)



 13%|█▎        | 45/354 [04:07<28:13,  5.48s/it][A

	loss_cls: tensor(0.5504, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1173, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6677, device='cuda:0', grad_fn=<AddBackward0>)



 13%|█▎        | 46/354 [04:12<28:16,  5.51s/it][A

	loss_cls: tensor(0.5212, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1265, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6477, device='cuda:0', grad_fn=<AddBackward0>)



 13%|█▎        | 47/354 [04:18<28:06,  5.49s/it][A

	loss_cls: tensor(0.4399, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2922, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7321, device='cuda:0', grad_fn=<AddBackward0>)



 14%|█▎        | 48/354 [04:23<28:05,  5.51s/it][A

	loss_cls: tensor(0.5397, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0634, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6031, device='cuda:0', grad_fn=<AddBackward0>)



 14%|█▍        | 49/354 [04:28<27:44,  5.46s/it][A

	loss_cls: tensor(0.7497, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2493, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9990, device='cuda:0', grad_fn=<AddBackward0>)



 14%|█▍        | 50/354 [04:34<27:37,  5.45s/it][A

	loss_cls: tensor(0.5907, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1236, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7143, device='cuda:0', grad_fn=<AddBackward0>)



 14%|█▍        | 51/354 [04:39<27:38,  5.47s/it][A

	loss_cls: tensor(0.4039, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0974, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5012, device='cuda:0', grad_fn=<AddBackward0>)



 15%|█▍        | 52/354 [04:45<27:32,  5.47s/it][A

	loss_cls: tensor(0.3677, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2096, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5773, device='cuda:0', grad_fn=<AddBackward0>)



 15%|█▍        | 53/354 [04:50<27:35,  5.50s/it][A

	loss_cls: tensor(0.5170, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2587, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7758, device='cuda:0', grad_fn=<AddBackward0>)



 15%|█▌        | 54/354 [04:56<27:27,  5.49s/it][A

	loss_cls: tensor(0.5157, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2363, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7520, device='cuda:0', grad_fn=<AddBackward0>)



 16%|█▌        | 55/354 [05:02<27:28,  5.51s/it][A

	loss_cls: tensor(0.7849, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1353, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9203, device='cuda:0', grad_fn=<AddBackward0>)



 16%|█▌        | 56/354 [05:07<27:18,  5.50s/it][A

	loss_cls: tensor(0.5592, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2928, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8519, device='cuda:0', grad_fn=<AddBackward0>)



 16%|█▌        | 57/354 [05:12<27:10,  5.49s/it][A

	loss_cls: tensor(0.7945, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1847, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9792, device='cuda:0', grad_fn=<AddBackward0>)



 16%|█▋        | 58/354 [05:18<27:10,  5.51s/it][A

	loss_cls: tensor(0.5711, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0325, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6036, device='cuda:0', grad_fn=<AddBackward0>)



 17%|█▋        | 59/354 [05:23<27:02,  5.50s/it][A

	loss_cls: tensor(0.5688, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2313, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8000, device='cuda:0', grad_fn=<AddBackward0>)



 17%|█▋        | 60/354 [05:29<27:05,  5.53s/it][A

	loss_cls: tensor(0.5209, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2577, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7786, device='cuda:0', grad_fn=<AddBackward0>)



 17%|█▋        | 61/354 [05:35<26:55,  5.51s/it][A

	loss_cls: tensor(0.4379, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3307, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7686, device='cuda:0', grad_fn=<AddBackward0>)



 18%|█▊        | 62/354 [05:40<26:53,  5.53s/it][A

	loss_cls: tensor(0.3957, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0355, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4312, device='cuda:0', grad_fn=<AddBackward0>)



 18%|█▊        | 63/354 [05:46<26:42,  5.51s/it][A

	loss_cls: tensor(0.3951, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1536, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5487, device='cuda:0', grad_fn=<AddBackward0>)



 18%|█▊        | 64/354 [05:51<26:33,  5.49s/it][A

	loss_cls: tensor(0.5878, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2734, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8612, device='cuda:0', grad_fn=<AddBackward0>)



 18%|█▊        | 65/354 [05:57<26:31,  5.51s/it][A

	loss_cls: tensor(0.5315, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2839, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8153, device='cuda:0', grad_fn=<AddBackward0>)



 19%|█▊        | 66/354 [06:02<26:27,  5.51s/it][A

	loss_cls: tensor(0.4951, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4501, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9451, device='cuda:0', grad_fn=<AddBackward0>)



 19%|█▉        | 67/354 [06:08<26:27,  5.53s/it][A

	loss_cls: tensor(0.4404, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0367, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4771, device='cuda:0', grad_fn=<AddBackward0>)



 19%|█▉        | 68/354 [06:13<26:16,  5.51s/it][A

	loss_cls: tensor(0.6814, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1917, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8732, device='cuda:0', grad_fn=<AddBackward0>)



 19%|█▉        | 69/354 [06:19<26:14,  5.52s/it][A

	loss_cls: tensor(0.3457, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1593, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5050, device='cuda:0', grad_fn=<AddBackward0>)



 20%|█▉        | 70/354 [06:24<26:04,  5.51s/it][A

	loss_cls: tensor(0.7795, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4142, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1937, device='cuda:0', grad_fn=<AddBackward0>)



 20%|██        | 71/354 [06:30<25:53,  5.49s/it][A

	loss_cls: tensor(1.2285, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3232, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.5517, device='cuda:0', grad_fn=<AddBackward0>)



 20%|██        | 72/354 [06:35<25:49,  5.49s/it][A

	loss_cls: tensor(0.9542, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1130, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0672, device='cuda:0', grad_fn=<AddBackward0>)



 21%|██        | 73/354 [06:41<25:40,  5.48s/it][A

	loss_cls: tensor(0.4644, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4026, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8671, device='cuda:0', grad_fn=<AddBackward0>)



 21%|██        | 74/354 [06:46<25:41,  5.51s/it][A

	loss_cls: tensor(0.4388, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2075, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6463, device='cuda:0', grad_fn=<AddBackward0>)



 21%|██        | 75/354 [06:52<25:33,  5.50s/it][A

	loss_cls: tensor(0.6777, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4149, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0926, device='cuda:0', grad_fn=<AddBackward0>)



 21%|██▏       | 76/354 [06:57<25:29,  5.50s/it][A

	loss_cls: tensor(0.5156, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4681, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9837, device='cuda:0', grad_fn=<AddBackward0>)



 22%|██▏       | 77/354 [07:02<25:00,  5.42s/it][A

	loss_cls: tensor(0.3982, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0855, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4838, device='cuda:0', grad_fn=<AddBackward0>)



 22%|██▏       | 78/354 [07:08<24:57,  5.42s/it][A

	loss_cls: tensor(0.4572, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0965, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5536, device='cuda:0', grad_fn=<AddBackward0>)



 22%|██▏       | 79/354 [07:13<25:03,  5.47s/it][A

	loss_cls: tensor(0.7475, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5278, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2753, device='cuda:0', grad_fn=<AddBackward0>)



 23%|██▎       | 80/354 [07:19<25:00,  5.47s/it][A

	loss_cls: tensor(0.5263, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0645, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5908, device='cuda:0', grad_fn=<AddBackward0>)



 23%|██▎       | 81/354 [07:24<25:01,  5.50s/it][A

	loss_cls: tensor(0.6108, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1995, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8103, device='cuda:0', grad_fn=<AddBackward0>)



 23%|██▎       | 82/354 [07:30<24:55,  5.50s/it][A

	loss_cls: tensor(0.5203, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2000, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7203, device='cuda:0', grad_fn=<AddBackward0>)



 23%|██▎       | 83/354 [07:35<24:53,  5.51s/it][A

	loss_cls: tensor(0.5331, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2304, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7635, device='cuda:0', grad_fn=<AddBackward0>)



 24%|██▎       | 84/354 [07:41<24:44,  5.50s/it][A

	loss_cls: tensor(0.3776, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1709, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5485, device='cuda:0', grad_fn=<AddBackward0>)



 24%|██▍       | 85/354 [07:46<24:35,  5.49s/it][A

	loss_cls: tensor(0.3475, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1395, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4870, device='cuda:0', grad_fn=<AddBackward0>)



 24%|██▍       | 86/354 [07:52<24:35,  5.51s/it][A

	loss_cls: tensor(0.6515, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2751, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9267, device='cuda:0', grad_fn=<AddBackward0>)



 25%|██▍       | 87/354 [07:57<24:28,  5.50s/it][A

	loss_cls: tensor(0.3949, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2250, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6199, device='cuda:0', grad_fn=<AddBackward0>)



 25%|██▍       | 88/354 [08:03<24:30,  5.53s/it][A

	loss_cls: tensor(0.6730, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2397, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9127, device='cuda:0', grad_fn=<AddBackward0>)



 25%|██▌       | 89/354 [08:08<24:20,  5.51s/it][A

	loss_cls: tensor(0.3370, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2042, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5412, device='cuda:0', grad_fn=<AddBackward0>)



 25%|██▌       | 90/354 [08:14<24:17,  5.52s/it][A

	loss_cls: tensor(0.5877, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2135, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8011, device='cuda:0', grad_fn=<AddBackward0>)



 26%|██▌       | 91/354 [08:19<23:40,  5.40s/it][A

	loss_cls: tensor(0.4997, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0759, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5756, device='cuda:0', grad_fn=<AddBackward0>)



 26%|██▌       | 92/354 [08:25<23:46,  5.45s/it][A

	loss_cls: tensor(0.4034, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0915, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4949, device='cuda:0', grad_fn=<AddBackward0>)



 26%|██▋       | 93/354 [08:30<23:44,  5.46s/it][A

	loss_cls: tensor(0.4087, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1683, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5770, device='cuda:0', grad_fn=<AddBackward0>)



 27%|██▋       | 94/354 [08:36<23:39,  5.46s/it][A

	loss_cls: tensor(0.7504, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1745, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9249, device='cuda:0', grad_fn=<AddBackward0>)



 27%|██▋       | 95/354 [08:41<23:35,  5.47s/it][A

	loss_cls: tensor(0.6391, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3426, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9817, device='cuda:0', grad_fn=<AddBackward0>)



 27%|██▋       | 96/354 [08:47<23:27,  5.45s/it][A

	loss_cls: tensor(0.2726, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0272, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.2998, device='cuda:0', grad_fn=<AddBackward0>)



 27%|██▋       | 97/354 [08:52<23:29,  5.48s/it][A

	loss_cls: tensor(0.7505, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2285, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9790, device='cuda:0', grad_fn=<AddBackward0>)



 28%|██▊       | 98/354 [08:58<23:20,  5.47s/it][A

	loss_cls: tensor(1.0140, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2450, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2590, device='cuda:0', grad_fn=<AddBackward0>)



 28%|██▊       | 99/354 [09:03<23:23,  5.51s/it][A

	loss_cls: tensor(0.6299, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3590, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9890, device='cuda:0', grad_fn=<AddBackward0>)



 28%|██▊       | 100/354 [09:09<23:15,  5.50s/it][A

	loss_cls: tensor(0.6357, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1704, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8061, device='cuda:0', grad_fn=<AddBackward0>)



 29%|██▊       | 101/354 [09:14<23:08,  5.49s/it][A

	loss_cls: tensor(0.6353, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.6644, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2997, device='cuda:0', grad_fn=<AddBackward0>)



 29%|██▉       | 102/354 [09:20<23:09,  5.51s/it][A

	loss_cls: tensor(0.3979, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0774, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4752, device='cuda:0', grad_fn=<AddBackward0>)



 29%|██▉       | 103/354 [09:25<22:59,  5.50s/it][A

	loss_cls: tensor(0.5084, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0423, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5507, device='cuda:0', grad_fn=<AddBackward0>)



 29%|██▉       | 104/354 [09:31<22:58,  5.51s/it][A

	loss_cls: tensor(0.5170, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2305, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7475, device='cuda:0', grad_fn=<AddBackward0>)



 30%|██▉       | 105/354 [09:36<22:32,  5.43s/it][A

	loss_cls: tensor(0.7035, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1472, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8507, device='cuda:0', grad_fn=<AddBackward0>)



 30%|██▉       | 106/354 [09:41<22:37,  5.47s/it][A

	loss_cls: tensor(0.4114, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2668, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6783, device='cuda:0', grad_fn=<AddBackward0>)



 30%|███       | 107/354 [09:47<22:31,  5.47s/it][A

	loss_cls: tensor(0.6955, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2429, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9384, device='cuda:0', grad_fn=<AddBackward0>)



 31%|███       | 108/354 [09:52<22:26,  5.47s/it][A

	loss_cls: tensor(0.5937, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1777, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7714, device='cuda:0', grad_fn=<AddBackward0>)



 31%|███       | 109/354 [09:58<22:25,  5.49s/it][A

	loss_cls: tensor(0.5134, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1779, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6913, device='cuda:0', grad_fn=<AddBackward0>)



 31%|███       | 110/354 [10:03<22:18,  5.48s/it][A

	loss_cls: tensor(0.5412, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1931, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7343, device='cuda:0', grad_fn=<AddBackward0>)



 31%|███▏      | 111/354 [10:09<22:17,  5.50s/it][A

	loss_cls: tensor(0.6230, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1098, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7329, device='cuda:0', grad_fn=<AddBackward0>)



 32%|███▏      | 112/354 [10:14<22:10,  5.50s/it][A

	loss_cls: tensor(0.5851, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1121, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6972, device='cuda:0', grad_fn=<AddBackward0>)



 32%|███▏      | 113/354 [10:20<22:07,  5.51s/it][A

	loss_cls: tensor(0.7085, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1993, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9078, device='cuda:0', grad_fn=<AddBackward0>)



 32%|███▏      | 114/354 [10:25<22:01,  5.51s/it][A

	loss_cls: tensor(0.7390, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0617, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8007, device='cuda:0', grad_fn=<AddBackward0>)



 32%|███▏      | 115/354 [10:31<21:52,  5.49s/it][A

	loss_cls: tensor(0.6829, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1048, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7877, device='cuda:0', grad_fn=<AddBackward0>)



 33%|███▎      | 116/354 [10:36<21:51,  5.51s/it][A

	loss_cls: tensor(0.7935, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1678, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9613, device='cuda:0', grad_fn=<AddBackward0>)



 33%|███▎      | 117/354 [10:42<21:41,  5.49s/it][A

	loss_cls: tensor(0.4890, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2122, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7012, device='cuda:0', grad_fn=<AddBackward0>)



 33%|███▎      | 118/354 [10:47<21:32,  5.48s/it][A

	loss_cls: tensor(0.7011, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3574, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0585, device='cuda:0', grad_fn=<AddBackward0>)



 34%|███▎      | 119/354 [10:53<21:25,  5.47s/it][A

	loss_cls: tensor(0.6474, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1585, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8059, device='cuda:0', grad_fn=<AddBackward0>)



 34%|███▍      | 120/354 [10:58<21:24,  5.49s/it][A

	loss_cls: tensor(0.5528, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1971, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7499, device='cuda:0', grad_fn=<AddBackward0>)



 34%|███▍      | 121/354 [11:04<21:18,  5.49s/it][A

	loss_cls: tensor(0.2628, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0586, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.3213, device='cuda:0', grad_fn=<AddBackward0>)



 34%|███▍      | 122/354 [11:09<21:10,  5.48s/it][A

	loss_cls: tensor(0.5272, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0339, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5611, device='cuda:0', grad_fn=<AddBackward0>)



 35%|███▍      | 123/354 [11:15<21:08,  5.49s/it][A

	loss_cls: tensor(0.4720, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1590, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6309, device='cuda:0', grad_fn=<AddBackward0>)



 35%|███▌      | 124/354 [11:20<21:00,  5.48s/it][A

	loss_cls: tensor(0.5580, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1185, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6765, device='cuda:0', grad_fn=<AddBackward0>)



 35%|███▌      | 125/354 [11:26<20:58,  5.50s/it][A

	loss_cls: tensor(0.4920, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3095, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8015, device='cuda:0', grad_fn=<AddBackward0>)



 36%|███▌      | 126/354 [11:31<20:51,  5.49s/it][A

	loss_cls: tensor(1.1082, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4024, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.5106, device='cuda:0', grad_fn=<AddBackward0>)



 36%|███▌      | 127/354 [11:37<20:49,  5.50s/it][A

	loss_cls: tensor(0.4382, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2069, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6451, device='cuda:0', grad_fn=<AddBackward0>)



 36%|███▌      | 128/354 [11:42<20:39,  5.48s/it][A

	loss_cls: tensor(0.5177, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0906, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6082, device='cuda:0', grad_fn=<AddBackward0>)



 36%|███▋      | 129/354 [11:48<20:32,  5.48s/it][A

	loss_cls: tensor(0.5011, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0468, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5478, device='cuda:0', grad_fn=<AddBackward0>)



 37%|███▋      | 130/354 [11:53<20:33,  5.51s/it][A

	loss_cls: tensor(0.4211, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0969, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5180, device='cuda:0', grad_fn=<AddBackward0>)



 37%|███▋      | 131/354 [11:59<20:23,  5.49s/it][A

	loss_cls: tensor(0.5925, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3265, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9190, device='cuda:0', grad_fn=<AddBackward0>)



 37%|███▋      | 132/354 [12:04<20:21,  5.50s/it][A

	loss_cls: tensor(0.5745, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4143, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9887, device='cuda:0', grad_fn=<AddBackward0>)



 38%|███▊      | 133/354 [12:10<20:14,  5.49s/it][A

	loss_cls: tensor(0.6165, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3128, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9293, device='cuda:0', grad_fn=<AddBackward0>)



 38%|███▊      | 134/354 [12:15<20:12,  5.51s/it][A

	loss_cls: tensor(0.7286, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3897, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1183, device='cuda:0', grad_fn=<AddBackward0>)



 38%|███▊      | 135/354 [12:21<20:03,  5.50s/it][A

	loss_cls: tensor(0.6963, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2024, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8987, device='cuda:0', grad_fn=<AddBackward0>)



 38%|███▊      | 136/354 [12:26<19:54,  5.48s/it][A

	loss_cls: tensor(0.5992, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2868, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8861, device='cuda:0', grad_fn=<AddBackward0>)



 39%|███▊      | 137/354 [12:32<19:53,  5.50s/it][A

	loss_cls: tensor(0.6098, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1076, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7174, device='cuda:0', grad_fn=<AddBackward0>)



 39%|███▉      | 138/354 [12:37<19:42,  5.47s/it][A

	loss_cls: tensor(0.5845, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1347, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7192, device='cuda:0', grad_fn=<AddBackward0>)



 39%|███▉      | 139/354 [12:43<19:41,  5.49s/it][A

	loss_cls: tensor(0.7987, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3006, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0993, device='cuda:0', grad_fn=<AddBackward0>)



 40%|███▉      | 140/354 [12:48<19:33,  5.48s/it][A

	loss_cls: tensor(0.7126, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2733, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9859, device='cuda:0', grad_fn=<AddBackward0>)



 40%|███▉      | 141/354 [12:54<19:27,  5.48s/it][A

	loss_cls: tensor(0.7177, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2107, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9284, device='cuda:0', grad_fn=<AddBackward0>)



 40%|████      | 142/354 [12:59<19:21,  5.48s/it][A

	loss_cls: tensor(0.5369, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2456, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7825, device='cuda:0', grad_fn=<AddBackward0>)



 40%|████      | 143/354 [13:05<19:13,  5.47s/it][A

	loss_cls: tensor(0.4040, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2163, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6203, device='cuda:0', grad_fn=<AddBackward0>)



 41%|████      | 144/354 [13:10<19:15,  5.50s/it][A

	loss_cls: tensor(0.8991, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2900, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1891, device='cuda:0', grad_fn=<AddBackward0>)



 41%|████      | 145/354 [13:16<19:06,  5.48s/it][A

	loss_cls: tensor(0.6990, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5806, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2796, device='cuda:0', grad_fn=<AddBackward0>)



 41%|████      | 146/354 [13:21<19:05,  5.51s/it][A

	loss_cls: tensor(0.3918, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0498, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4416, device='cuda:0', grad_fn=<AddBackward0>)



 42%|████▏     | 147/354 [13:27<18:58,  5.50s/it][A

	loss_cls: tensor(0.4625, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2439, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7064, device='cuda:0', grad_fn=<AddBackward0>)



 42%|████▏     | 148/354 [13:32<18:55,  5.51s/it][A

	loss_cls: tensor(0.8505, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1866, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0371, device='cuda:0', grad_fn=<AddBackward0>)



 42%|████▏     | 149/354 [13:38<18:46,  5.50s/it][A

	loss_cls: tensor(0.7577, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2099, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9676, device='cuda:0', grad_fn=<AddBackward0>)



 42%|████▏     | 150/354 [13:43<18:39,  5.49s/it][A

	loss_cls: tensor(0.4961, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2168, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7128, device='cuda:0', grad_fn=<AddBackward0>)



 43%|████▎     | 151/354 [13:49<18:39,  5.51s/it][A

	loss_cls: tensor(0.5642, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1256, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6897, device='cuda:0', grad_fn=<AddBackward0>)



 43%|████▎     | 152/354 [13:54<18:30,  5.50s/it][A

	loss_cls: tensor(0.5005, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2633, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7638, device='cuda:0', grad_fn=<AddBackward0>)



 43%|████▎     | 153/354 [14:00<18:29,  5.52s/it][A

	loss_cls: tensor(0.6026, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2367, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8393, device='cuda:0', grad_fn=<AddBackward0>)



 44%|████▎     | 154/354 [14:05<18:20,  5.50s/it][A

	loss_cls: tensor(0.5786, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0817, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6603, device='cuda:0', grad_fn=<AddBackward0>)



 44%|████▍     | 155/354 [14:11<18:15,  5.51s/it][A

	loss_cls: tensor(0.5004, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0440, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5444, device='cuda:0', grad_fn=<AddBackward0>)



 44%|████▍     | 156/354 [14:16<17:46,  5.39s/it][A

	loss_cls: tensor(0.7122, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1405, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8527, device='cuda:0', grad_fn=<AddBackward0>)



 44%|████▍     | 157/354 [14:21<17:45,  5.41s/it][A

	loss_cls: tensor(0.6932, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2991, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9923, device='cuda:0', grad_fn=<AddBackward0>)



 45%|████▍     | 158/354 [14:27<17:48,  5.45s/it][A

	loss_cls: tensor(0.7195, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2275, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9471, device='cuda:0', grad_fn=<AddBackward0>)



 45%|████▍     | 159/354 [14:32<17:43,  5.45s/it][A

	loss_cls: tensor(0.6826, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1762, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8588, device='cuda:0', grad_fn=<AddBackward0>)



 45%|████▌     | 160/354 [14:38<17:45,  5.49s/it][A

	loss_cls: tensor(0.3834, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0308, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4142, device='cuda:0', grad_fn=<AddBackward0>)



 45%|████▌     | 161/354 [14:43<17:38,  5.49s/it][A

	loss_cls: tensor(0.6049, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3287, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9336, device='cuda:0', grad_fn=<AddBackward0>)



 46%|████▌     | 162/354 [14:49<17:37,  5.51s/it][A

	loss_cls: tensor(0.3631, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2772, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6403, device='cuda:0', grad_fn=<AddBackward0>)



 46%|████▌     | 163/354 [14:54<17:30,  5.50s/it][A

	loss_cls: tensor(0.5523, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2061, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7584, device='cuda:0', grad_fn=<AddBackward0>)



 46%|████▋     | 164/354 [15:00<17:15,  5.45s/it][A

	loss_cls: tensor(0.6311, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1086, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7397, device='cuda:0', grad_fn=<AddBackward0>)



 47%|████▋     | 165/354 [15:05<17:14,  5.47s/it][A

	loss_cls: tensor(0.8434, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3277, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1711, device='cuda:0', grad_fn=<AddBackward0>)



 47%|████▋     | 166/354 [15:11<17:07,  5.47s/it][A

	loss_cls: tensor(0.6773, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5740, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2512, device='cuda:0', grad_fn=<AddBackward0>)



 47%|████▋     | 167/354 [15:16<17:07,  5.49s/it][A

	loss_cls: tensor(0.7085, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1395, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8481, device='cuda:0', grad_fn=<AddBackward0>)



 47%|████▋     | 168/354 [15:22<16:51,  5.44s/it][A

	loss_cls: tensor(0.6852, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1754, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8606, device='cuda:0', grad_fn=<AddBackward0>)



 48%|████▊     | 169/354 [15:27<16:43,  5.42s/it][A

	loss_cls: tensor(0.3427, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1344, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4771, device='cuda:0', grad_fn=<AddBackward0>)



 48%|████▊     | 170/354 [15:32<16:32,  5.40s/it][A

	loss_cls: tensor(0.6473, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4809, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1282, device='cuda:0', grad_fn=<AddBackward0>)



 48%|████▊     | 171/354 [15:38<16:23,  5.37s/it][A

	loss_cls: tensor(0.5283, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4002, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9286, device='cuda:0', grad_fn=<AddBackward0>)



 49%|████▊     | 172/354 [15:43<16:19,  5.38s/it][A

	loss_cls: tensor(0.5783, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2730, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8513, device='cuda:0', grad_fn=<AddBackward0>)



 49%|████▉     | 173/354 [15:48<16:11,  5.37s/it][A

	loss_cls: tensor(0.5277, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1659, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6936, device='cuda:0', grad_fn=<AddBackward0>)



 49%|████▉     | 174/354 [15:54<16:09,  5.39s/it][A

	loss_cls: tensor(0.6325, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1374, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7699, device='cuda:0', grad_fn=<AddBackward0>)



 49%|████▉     | 175/354 [15:59<16:01,  5.37s/it][A

	loss_cls: tensor(0.5195, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1634, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6830, device='cuda:0', grad_fn=<AddBackward0>)



 50%|████▉     | 176/354 [16:05<16:00,  5.40s/it][A

	loss_cls: tensor(0.7776, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1960, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9736, device='cuda:0', grad_fn=<AddBackward0>)



 50%|█████     | 177/354 [16:10<15:52,  5.38s/it][A

	loss_cls: tensor(0.4928, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1798, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6725, device='cuda:0', grad_fn=<AddBackward0>)



 50%|█████     | 178/354 [16:15<15:44,  5.37s/it][A

	loss_cls: tensor(0.4618, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3473, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8091, device='cuda:0', grad_fn=<AddBackward0>)



 51%|█████     | 179/354 [16:21<15:42,  5.38s/it][A

	loss_cls: tensor(0.7462, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2347, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9809, device='cuda:0', grad_fn=<AddBackward0>)



 51%|█████     | 180/354 [16:26<15:35,  5.37s/it][A

	loss_cls: tensor(0.4544, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1521, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6065, device='cuda:0', grad_fn=<AddBackward0>)



 51%|█████     | 181/354 [16:31<15:32,  5.39s/it][A

	loss_cls: tensor(0.5085, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2615, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7699, device='cuda:0', grad_fn=<AddBackward0>)



 51%|█████▏    | 182/354 [16:37<15:23,  5.37s/it][A

	loss_cls: tensor(0.9813, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2848, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2662, device='cuda:0', grad_fn=<AddBackward0>)



 52%|█████▏    | 183/354 [16:42<15:21,  5.39s/it][A

	loss_cls: tensor(0.5922, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2173, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8095, device='cuda:0', grad_fn=<AddBackward0>)



 52%|█████▏    | 184/354 [16:47<15:11,  5.36s/it][A

	loss_cls: tensor(0.4545, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2767, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7312, device='cuda:0', grad_fn=<AddBackward0>)



 52%|█████▏    | 185/354 [16:53<15:04,  5.35s/it][A

	loss_cls: tensor(0.5155, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0779, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5934, device='cuda:0', grad_fn=<AddBackward0>)



 53%|█████▎    | 186/354 [16:58<15:03,  5.38s/it][A

	loss_cls: tensor(0.6402, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2592, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8994, device='cuda:0', grad_fn=<AddBackward0>)



 53%|█████▎    | 187/354 [17:04<14:57,  5.38s/it][A

	loss_cls: tensor(0.4270, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0449, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4719, device='cuda:0', grad_fn=<AddBackward0>)



 53%|█████▎    | 188/354 [17:09<14:53,  5.38s/it][A

	loss_cls: tensor(0.6858, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1301, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8158, device='cuda:0', grad_fn=<AddBackward0>)



 53%|█████▎    | 189/354 [17:14<14:47,  5.38s/it][A

	loss_cls: tensor(0.6682, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1238, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7920, device='cuda:0', grad_fn=<AddBackward0>)



 54%|█████▎    | 190/354 [17:20<14:44,  5.40s/it][A

	loss_cls: tensor(0.7346, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2303, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9649, device='cuda:0', grad_fn=<AddBackward0>)



 54%|█████▍    | 191/354 [17:25<14:36,  5.37s/it][A

	loss_cls: tensor(0.6818, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3511, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0330, device='cuda:0', grad_fn=<AddBackward0>)



 54%|█████▍    | 192/354 [17:31<14:32,  5.38s/it][A

	loss_cls: tensor(0.5751, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1227, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6979, device='cuda:0', grad_fn=<AddBackward0>)



 55%|█████▍    | 193/354 [17:36<14:25,  5.38s/it][A

	loss_cls: tensor(0.6708, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1892, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8600, device='cuda:0', grad_fn=<AddBackward0>)



 55%|█████▍    | 194/354 [17:41<14:19,  5.37s/it][A

	loss_cls: tensor(0.5952, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0380, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6331, device='cuda:0', grad_fn=<AddBackward0>)



 55%|█████▌    | 195/354 [17:47<14:17,  5.39s/it][A

	loss_cls: tensor(0.5392, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4428, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9820, device='cuda:0', grad_fn=<AddBackward0>)



 55%|█████▌    | 196/354 [17:52<14:10,  5.38s/it][A

	loss_cls: tensor(0.6978, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.7016, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.3994, device='cuda:0', grad_fn=<AddBackward0>)



 56%|█████▌    | 197/354 [17:57<14:07,  5.40s/it][A

	loss_cls: tensor(0.6728, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1520, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8248, device='cuda:0', grad_fn=<AddBackward0>)



 56%|█████▌    | 198/354 [18:03<13:58,  5.38s/it][A

	loss_cls: tensor(0.6690, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1961, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8651, device='cuda:0', grad_fn=<AddBackward0>)



 56%|█████▌    | 199/354 [18:08<13:55,  5.39s/it][A

	loss_cls: tensor(0.7305, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4893, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2197, device='cuda:0', grad_fn=<AddBackward0>)



 56%|█████▋    | 200/354 [18:14<13:49,  5.39s/it][A

	loss_cls: tensor(0.7223, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2780, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0003, device='cuda:0', grad_fn=<AddBackward0>)



 57%|█████▋    | 201/354 [18:19<13:42,  5.37s/it][A

	loss_cls: tensor(0.6836, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2361, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9196, device='cuda:0', grad_fn=<AddBackward0>)



 57%|█████▋    | 202/354 [18:24<13:39,  5.39s/it][A

	loss_cls: tensor(0.5814, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2754, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8568, device='cuda:0', grad_fn=<AddBackward0>)



 57%|█████▋    | 203/354 [18:30<13:31,  5.37s/it][A

	loss_cls: tensor(0.4535, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4037, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8572, device='cuda:0', grad_fn=<AddBackward0>)



 58%|█████▊    | 204/354 [18:35<13:28,  5.39s/it][A

	loss_cls: tensor(0.7232, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2632, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9864, device='cuda:0', grad_fn=<AddBackward0>)



 58%|█████▊    | 205/354 [18:40<13:20,  5.37s/it][A

	loss_cls: tensor(0.5214, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1807, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7022, device='cuda:0', grad_fn=<AddBackward0>)



 58%|█████▊    | 206/354 [18:46<13:17,  5.39s/it][A

	loss_cls: tensor(0.6192, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.6985, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.3177, device='cuda:0', grad_fn=<AddBackward0>)



 58%|█████▊    | 207/354 [18:51<13:10,  5.38s/it][A

	loss_cls: tensor(0.5621, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2087, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7708, device='cuda:0', grad_fn=<AddBackward0>)



 59%|█████▉    | 208/354 [18:57<13:04,  5.38s/it][A

	loss_cls: tensor(0.6006, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2019, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8026, device='cuda:0', grad_fn=<AddBackward0>)



 59%|█████▉    | 209/354 [19:02<13:01,  5.39s/it][A

	loss_cls: tensor(0.5431, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2972, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8403, device='cuda:0', grad_fn=<AddBackward0>)



 59%|█████▉    | 210/354 [19:07<12:54,  5.38s/it][A

	loss_cls: tensor(0.7935, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2046, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9981, device='cuda:0', grad_fn=<AddBackward0>)



 60%|█████▉    | 211/354 [19:13<12:55,  5.42s/it][A

	loss_cls: tensor(0.4561, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2015, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6577, device='cuda:0', grad_fn=<AddBackward0>)



 60%|█████▉    | 212/354 [19:18<12:48,  5.41s/it][A

	loss_cls: tensor(0.7234, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1620, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8854, device='cuda:0', grad_fn=<AddBackward0>)



 60%|██████    | 213/354 [19:24<12:45,  5.43s/it][A

	loss_cls: tensor(0.5784, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2423, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8207, device='cuda:0', grad_fn=<AddBackward0>)



 60%|██████    | 214/354 [19:29<12:37,  5.41s/it][A

	loss_cls: tensor(0.4387, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2303, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6690, device='cuda:0', grad_fn=<AddBackward0>)



 61%|██████    | 215/354 [19:35<12:30,  5.40s/it][A

	loss_cls: tensor(0.6626, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0775, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7401, device='cuda:0', grad_fn=<AddBackward0>)



 61%|██████    | 216/354 [19:40<12:27,  5.42s/it][A

	loss_cls: tensor(0.3885, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1955, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5840, device='cuda:0', grad_fn=<AddBackward0>)



 61%|██████▏   | 217/354 [19:45<12:19,  5.40s/it][A

	loss_cls: tensor(0.6051, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0769, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6820, device='cuda:0', grad_fn=<AddBackward0>)



 62%|██████▏   | 218/354 [19:51<12:14,  5.40s/it][A

	loss_cls: tensor(0.5254, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1373, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6627, device='cuda:0', grad_fn=<AddBackward0>)



 62%|██████▏   | 219/354 [19:56<12:07,  5.39s/it][A

	loss_cls: tensor(0.5307, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0893, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6200, device='cuda:0', grad_fn=<AddBackward0>)



 62%|██████▏   | 220/354 [20:02<12:06,  5.42s/it][A

	loss_cls: tensor(0.8551, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3031, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1581, device='cuda:0', grad_fn=<AddBackward0>)



 62%|██████▏   | 221/354 [20:07<11:59,  5.41s/it][A

	loss_cls: tensor(0.5253, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1083, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6336, device='cuda:0', grad_fn=<AddBackward0>)



 63%|██████▎   | 222/354 [20:12<11:52,  5.40s/it][A

	loss_cls: tensor(1.1522, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2384, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.3906, device='cuda:0', grad_fn=<AddBackward0>)



 63%|██████▎   | 223/354 [20:18<11:48,  5.41s/it][A

	loss_cls: tensor(0.5561, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3711, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9272, device='cuda:0', grad_fn=<AddBackward0>)



 63%|██████▎   | 224/354 [20:23<11:42,  5.40s/it][A

	loss_cls: tensor(0.5369, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0431, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5800, device='cuda:0', grad_fn=<AddBackward0>)



 64%|██████▎   | 225/354 [20:29<11:38,  5.41s/it][A

	loss_cls: tensor(0.7301, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4222, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1523, device='cuda:0', grad_fn=<AddBackward0>)



 64%|██████▍   | 226/354 [20:34<11:31,  5.40s/it][A

	loss_cls: tensor(0.5227, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3136, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8363, device='cuda:0', grad_fn=<AddBackward0>)



 64%|██████▍   | 227/354 [20:39<11:28,  5.42s/it][A

	loss_cls: tensor(0.6681, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0311, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6992, device='cuda:0', grad_fn=<AddBackward0>)



 64%|██████▍   | 228/354 [20:45<11:21,  5.41s/it][A

	loss_cls: tensor(0.4157, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2169, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6326, device='cuda:0', grad_fn=<AddBackward0>)



 65%|██████▍   | 229/354 [20:50<11:15,  5.40s/it][A

	loss_cls: tensor(0.6516, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1210, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7727, device='cuda:0', grad_fn=<AddBackward0>)



 65%|██████▍   | 230/354 [20:56<11:10,  5.41s/it][A

	loss_cls: tensor(0.7509, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3837, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1345, device='cuda:0', grad_fn=<AddBackward0>)



 65%|██████▌   | 231/354 [21:01<11:03,  5.40s/it][A

	loss_cls: tensor(0.5442, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0865, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6307, device='cuda:0', grad_fn=<AddBackward0>)



 66%|██████▌   | 232/354 [21:06<11:00,  5.41s/it][A

	loss_cls: tensor(0.6420, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.6821, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.3241, device='cuda:0', grad_fn=<AddBackward0>)



 66%|██████▌   | 233/354 [21:12<10:54,  5.41s/it][A

	loss_cls: tensor(0.5416, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2266, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7683, device='cuda:0', grad_fn=<AddBackward0>)



 66%|██████▌   | 234/354 [21:17<10:47,  5.40s/it][A

	loss_cls: tensor(0.3987, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1640, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5627, device='cuda:0', grad_fn=<AddBackward0>)



 66%|██████▋   | 235/354 [21:23<10:42,  5.40s/it][A

	loss_cls: tensor(1.0461, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3885, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.4346, device='cuda:0', grad_fn=<AddBackward0>)



 67%|██████▋   | 236/354 [21:28<10:35,  5.39s/it][A

	loss_cls: tensor(0.4545, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1275, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5820, device='cuda:0', grad_fn=<AddBackward0>)



 67%|██████▋   | 237/354 [21:34<10:34,  5.42s/it][A

	loss_cls: tensor(0.7191, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1572, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8762, device='cuda:0', grad_fn=<AddBackward0>)



 67%|██████▋   | 238/354 [21:39<10:26,  5.40s/it][A

	loss_cls: tensor(0.4916, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1157, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6073, device='cuda:0', grad_fn=<AddBackward0>)



 68%|██████▊   | 239/354 [21:44<10:23,  5.42s/it][A

	loss_cls: tensor(0.7787, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4377, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2164, device='cuda:0', grad_fn=<AddBackward0>)



 68%|██████▊   | 240/354 [21:50<10:16,  5.41s/it][A

	loss_cls: tensor(0.4548, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.6025, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0572, device='cuda:0', grad_fn=<AddBackward0>)



 68%|██████▊   | 241/354 [21:55<10:13,  5.43s/it][A

	loss_cls: tensor(0.6390, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2325, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8715, device='cuda:0', grad_fn=<AddBackward0>)



 68%|██████▊   | 242/354 [22:01<10:07,  5.42s/it][A

	loss_cls: tensor(0.8881, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2432, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1313, device='cuda:0', grad_fn=<AddBackward0>)



 69%|██████▊   | 243/354 [22:06<10:00,  5.41s/it][A

	loss_cls: tensor(0.5483, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1192, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6675, device='cuda:0', grad_fn=<AddBackward0>)



 69%|██████▉   | 244/354 [22:11<09:57,  5.43s/it][A

	loss_cls: tensor(0.5664, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2570, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8234, device='cuda:0', grad_fn=<AddBackward0>)



 69%|██████▉   | 245/354 [22:17<09:49,  5.41s/it][A

	loss_cls: tensor(0.7073, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5372, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2445, device='cuda:0', grad_fn=<AddBackward0>)



 69%|██████▉   | 246/354 [22:22<09:47,  5.44s/it][A

	loss_cls: tensor(0.8382, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3639, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2021, device='cuda:0', grad_fn=<AddBackward0>)



 70%|██████▉   | 247/354 [22:28<09:40,  5.42s/it][A

	loss_cls: tensor(0.5872, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1306, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7178, device='cuda:0', grad_fn=<AddBackward0>)



 70%|███████   | 248/354 [22:33<09:35,  5.43s/it][A

	loss_cls: tensor(0.6126, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3889, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0016, device='cuda:0', grad_fn=<AddBackward0>)



 70%|███████   | 249/354 [22:39<09:29,  5.42s/it][A

	loss_cls: tensor(0.5907, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2178, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8085, device='cuda:0', grad_fn=<AddBackward0>)



 71%|███████   | 250/354 [22:44<09:22,  5.41s/it][A

	loss_cls: tensor(0.4869, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0898, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5767, device='cuda:0', grad_fn=<AddBackward0>)



 71%|███████   | 251/354 [22:49<09:18,  5.42s/it][A

	loss_cls: tensor(0.5323, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4846, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0169, device='cuda:0', grad_fn=<AddBackward0>)



 71%|███████   | 252/354 [22:55<09:11,  5.41s/it][A

	loss_cls: tensor(0.4688, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2017, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6706, device='cuda:0', grad_fn=<AddBackward0>)



 71%|███████▏  | 253/354 [23:00<09:08,  5.43s/it][A

	loss_cls: tensor(0.7144, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1678, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8822, device='cuda:0', grad_fn=<AddBackward0>)



 72%|███████▏  | 254/354 [23:06<09:01,  5.42s/it][A

	loss_cls: tensor(0.7413, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4530, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1943, device='cuda:0', grad_fn=<AddBackward0>)



 72%|███████▏  | 255/354 [23:11<08:57,  5.43s/it][A

	loss_cls: tensor(0.4706, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1606, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6312, device='cuda:0', grad_fn=<AddBackward0>)



 72%|███████▏  | 256/354 [23:16<08:49,  5.41s/it][A

	loss_cls: tensor(0.6565, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2802, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9367, device='cuda:0', grad_fn=<AddBackward0>)



 73%|███████▎  | 257/354 [23:22<08:42,  5.39s/it][A

	loss_cls: tensor(0.6202, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1331, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7533, device='cuda:0', grad_fn=<AddBackward0>)



 73%|███████▎  | 258/354 [23:27<08:38,  5.40s/it][A

	loss_cls: tensor(0.5998, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2156, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8154, device='cuda:0', grad_fn=<AddBackward0>)



 73%|███████▎  | 259/354 [23:33<08:33,  5.40s/it][A

	loss_cls: tensor(0.7193, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3428, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0621, device='cuda:0', grad_fn=<AddBackward0>)



 73%|███████▎  | 260/354 [23:38<08:30,  5.43s/it][A

	loss_cls: tensor(0.5351, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3061, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8412, device='cuda:0', grad_fn=<AddBackward0>)



 74%|███████▎  | 261/354 [23:44<08:23,  5.41s/it][A

	loss_cls: tensor(0.5564, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1470, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7033, device='cuda:0', grad_fn=<AddBackward0>)



 74%|███████▍  | 262/354 [23:49<08:19,  5.43s/it][A

	loss_cls: tensor(0.5824, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1414, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7239, device='cuda:0', grad_fn=<AddBackward0>)



 74%|███████▍  | 263/354 [23:54<08:12,  5.41s/it][A

	loss_cls: tensor(0.6198, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0818, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7016, device='cuda:0', grad_fn=<AddBackward0>)



 75%|███████▍  | 264/354 [24:00<08:06,  5.40s/it][A

	loss_cls: tensor(0.6130, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1868, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7998, device='cuda:0', grad_fn=<AddBackward0>)



 75%|███████▍  | 265/354 [24:05<08:02,  5.42s/it][A

	loss_cls: tensor(0.6111, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3775, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9886, device='cuda:0', grad_fn=<AddBackward0>)



 75%|███████▌  | 266/354 [24:11<07:56,  5.41s/it][A

	loss_cls: tensor(0.6943, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2568, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9511, device='cuda:0', grad_fn=<AddBackward0>)



 75%|███████▌  | 267/354 [24:16<07:52,  5.43s/it][A

	loss_cls: tensor(0.5503, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1827, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7330, device='cuda:0', grad_fn=<AddBackward0>)



 76%|███████▌  | 268/354 [24:21<07:46,  5.42s/it][A

	loss_cls: tensor(0.6291, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3076, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9367, device='cuda:0', grad_fn=<AddBackward0>)



 76%|███████▌  | 269/354 [24:27<07:42,  5.44s/it][A

	loss_cls: tensor(0.5181, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3390, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8571, device='cuda:0', grad_fn=<AddBackward0>)



 76%|███████▋  | 270/354 [24:32<07:36,  5.43s/it][A

	loss_cls: tensor(0.5384, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1862, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7247, device='cuda:0', grad_fn=<AddBackward0>)



 77%|███████▋  | 271/354 [24:38<07:29,  5.42s/it][A

	loss_cls: tensor(0.7677, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2497, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0174, device='cuda:0', grad_fn=<AddBackward0>)



 77%|███████▋  | 272/354 [24:43<07:24,  5.43s/it][A

	loss_cls: tensor(0.5329, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1070, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6399, device='cuda:0', grad_fn=<AddBackward0>)



 77%|███████▋  | 273/354 [24:49<07:18,  5.42s/it][A

	loss_cls: tensor(0.6127, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0408, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6535, device='cuda:0', grad_fn=<AddBackward0>)



 77%|███████▋  | 274/354 [24:54<07:14,  5.43s/it][A

	loss_cls: tensor(0.5680, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3967, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9646, device='cuda:0', grad_fn=<AddBackward0>)



 78%|███████▊  | 275/354 [24:59<07:07,  5.41s/it][A

	loss_cls: tensor(0.6756, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2820, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9575, device='cuda:0', grad_fn=<AddBackward0>)



 78%|███████▊  | 276/354 [25:05<07:04,  5.45s/it][A

	loss_cls: tensor(0.4625, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5505, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0131, device='cuda:0', grad_fn=<AddBackward0>)



 78%|███████▊  | 277/354 [25:10<06:58,  5.43s/it][A

	loss_cls: tensor(0.4868, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1905, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6773, device='cuda:0', grad_fn=<AddBackward0>)



 79%|███████▊  | 278/354 [25:16<06:51,  5.41s/it][A

	loss_cls: tensor(0.7675, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2656, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0332, device='cuda:0', grad_fn=<AddBackward0>)



 79%|███████▉  | 279/354 [25:21<06:47,  5.43s/it][A

	loss_cls: tensor(0.7236, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0788, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8024, device='cuda:0', grad_fn=<AddBackward0>)



 79%|███████▉  | 280/354 [25:27<06:40,  5.41s/it][A

	loss_cls: tensor(0.5282, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1880, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7162, device='cuda:0', grad_fn=<AddBackward0>)



 79%|███████▉  | 281/354 [25:32<06:36,  5.44s/it][A

	loss_cls: tensor(0.5330, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1726, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7056, device='cuda:0', grad_fn=<AddBackward0>)



 80%|███████▉  | 282/354 [25:37<06:30,  5.43s/it][A

	loss_cls: tensor(0.4934, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3959, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8893, device='cuda:0', grad_fn=<AddBackward0>)



 80%|███████▉  | 283/354 [25:43<06:26,  5.44s/it][A

	loss_cls: tensor(0.5704, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4276, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9980, device='cuda:0', grad_fn=<AddBackward0>)



 80%|████████  | 284/354 [25:48<06:19,  5.42s/it][A

	loss_cls: tensor(0.5157, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1767, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6924, device='cuda:0', grad_fn=<AddBackward0>)



 81%|████████  | 285/354 [25:54<06:13,  5.41s/it][A

	loss_cls: tensor(0.5729, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5701, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1430, device='cuda:0', grad_fn=<AddBackward0>)



 81%|████████  | 286/354 [25:59<06:09,  5.43s/it][A

	loss_cls: tensor(0.8091, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1612, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9702, device='cuda:0', grad_fn=<AddBackward0>)



 81%|████████  | 287/354 [26:05<06:02,  5.42s/it][A

	loss_cls: tensor(0.5317, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0585, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5903, device='cuda:0', grad_fn=<AddBackward0>)



 81%|████████▏ | 288/354 [26:10<05:58,  5.43s/it][A

	loss_cls: tensor(0.5565, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1333, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6898, device='cuda:0', grad_fn=<AddBackward0>)



 82%|████████▏ | 289/354 [26:15<05:52,  5.42s/it][A

	loss_cls: tensor(0.5054, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2640, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7695, device='cuda:0', grad_fn=<AddBackward0>)



 82%|████████▏ | 290/354 [26:21<05:47,  5.44s/it][A

	loss_cls: tensor(0.5434, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3165, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8598, device='cuda:0', grad_fn=<AddBackward0>)



 82%|████████▏ | 291/354 [26:26<05:41,  5.42s/it][A

	loss_cls: tensor(0.5700, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1210, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6910, device='cuda:0', grad_fn=<AddBackward0>)



 82%|████████▏ | 292/354 [26:32<05:36,  5.43s/it][A

	loss_cls: tensor(0.6721, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1270, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7990, device='cuda:0', grad_fn=<AddBackward0>)



 83%|████████▎ | 293/354 [26:37<05:30,  5.42s/it][A

	loss_cls: tensor(0.4865, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1530, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6395, device='cuda:0', grad_fn=<AddBackward0>)



 83%|████████▎ | 294/354 [26:43<05:24,  5.42s/it][A

	loss_cls: tensor(0.5066, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2996, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8062, device='cuda:0', grad_fn=<AddBackward0>)



 83%|████████▎ | 295/354 [26:48<05:20,  5.43s/it][A

	loss_cls: tensor(0.6653, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0533, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7187, device='cuda:0', grad_fn=<AddBackward0>)



 84%|████████▎ | 296/354 [26:53<05:14,  5.42s/it][A

	loss_cls: tensor(0.5687, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2848, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8535, device='cuda:0', grad_fn=<AddBackward0>)



 84%|████████▍ | 297/354 [26:59<05:09,  5.43s/it][A

	loss_cls: tensor(0.8422, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3230, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1652, device='cuda:0', grad_fn=<AddBackward0>)



 84%|████████▍ | 298/354 [27:04<05:03,  5.41s/it][A

	loss_cls: tensor(0.5893, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2700, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8592, device='cuda:0', grad_fn=<AddBackward0>)



 84%|████████▍ | 299/354 [27:10<04:58,  5.44s/it][A

	loss_cls: tensor(0.4128, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1466, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5594, device='cuda:0', grad_fn=<AddBackward0>)



 85%|████████▍ | 300/354 [27:15<04:52,  5.43s/it][A

	loss_cls: tensor(0.5273, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3232, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8505, device='cuda:0', grad_fn=<AddBackward0>)



 85%|████████▌ | 301/354 [27:20<04:47,  5.42s/it][A

	loss_cls: tensor(0.5921, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0939, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6860, device='cuda:0', grad_fn=<AddBackward0>)



 85%|████████▌ | 302/354 [27:26<04:42,  5.42s/it][A

	loss_cls: tensor(0.6078, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3371, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9449, device='cuda:0', grad_fn=<AddBackward0>)



 86%|████████▌ | 303/354 [27:31<04:35,  5.41s/it][A

	loss_cls: tensor(0.5130, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1144, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6275, device='cuda:0', grad_fn=<AddBackward0>)



 86%|████████▌ | 304/354 [27:37<04:31,  5.42s/it][A

	loss_cls: tensor(0.6423, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2297, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8721, device='cuda:0', grad_fn=<AddBackward0>)



 86%|████████▌ | 305/354 [27:42<04:25,  5.41s/it][A

	loss_cls: tensor(0.4142, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1655, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5797, device='cuda:0', grad_fn=<AddBackward0>)



 86%|████████▋ | 306/354 [27:48<04:20,  5.43s/it][A

	loss_cls: tensor(0.6139, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2778, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8917, device='cuda:0', grad_fn=<AddBackward0>)



 87%|████████▋ | 307/354 [27:53<04:15,  5.43s/it][A

	loss_cls: tensor(0.3163, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1993, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5156, device='cuda:0', grad_fn=<AddBackward0>)



 87%|████████▋ | 308/354 [27:58<04:09,  5.42s/it][A

	loss_cls: tensor(0.6521, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1577, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8097, device='cuda:0', grad_fn=<AddBackward0>)



 87%|████████▋ | 309/354 [28:04<04:04,  5.44s/it][A

	loss_cls: tensor(0.5243, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1677, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6921, device='cuda:0', grad_fn=<AddBackward0>)



 88%|████████▊ | 310/354 [28:09<03:58,  5.42s/it][A

	loss_cls: tensor(0.6740, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3433, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0174, device='cuda:0', grad_fn=<AddBackward0>)



 88%|████████▊ | 311/354 [28:15<03:53,  5.44s/it][A

	loss_cls: tensor(1.0846, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3941, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.4787, device='cuda:0', grad_fn=<AddBackward0>)



 88%|████████▊ | 312/354 [28:20<03:47,  5.42s/it][A

	loss_cls: tensor(0.7310, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2607, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9916, device='cuda:0', grad_fn=<AddBackward0>)



 88%|████████▊ | 313/354 [28:26<03:43,  5.44s/it][A

	loss_cls: tensor(0.5744, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2035, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7779, device='cuda:0', grad_fn=<AddBackward0>)



 89%|████████▊ | 314/354 [28:31<03:37,  5.43s/it][A

	loss_cls: tensor(0.6835, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1518, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8353, device='cuda:0', grad_fn=<AddBackward0>)



 89%|████████▉ | 315/354 [28:36<03:31,  5.42s/it][A

	loss_cls: tensor(0.4673, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2282, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6955, device='cuda:0', grad_fn=<AddBackward0>)



 89%|████████▉ | 316/354 [28:42<03:26,  5.44s/it][A

	loss_cls: tensor(0.4333, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1480, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5813, device='cuda:0', grad_fn=<AddBackward0>)



 90%|████████▉ | 317/354 [28:47<03:20,  5.42s/it][A

	loss_cls: tensor(0.8117, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1244, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9361, device='cuda:0', grad_fn=<AddBackward0>)



 90%|████████▉ | 318/354 [28:53<03:15,  5.43s/it][A

	loss_cls: tensor(0.6317, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0429, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6746, device='cuda:0', grad_fn=<AddBackward0>)



 90%|█████████ | 319/354 [28:58<03:09,  5.42s/it][A

	loss_cls: tensor(0.4542, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1108, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5649, device='cuda:0', grad_fn=<AddBackward0>)



 90%|█████████ | 320/354 [29:04<03:04,  5.44s/it][A

	loss_cls: tensor(0.4984, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1133, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6117, device='cuda:0', grad_fn=<AddBackward0>)



 91%|█████████ | 321/354 [29:09<02:59,  5.43s/it][A

	loss_cls: tensor(0.5987, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2493, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8480, device='cuda:0', grad_fn=<AddBackward0>)



 91%|█████████ | 322/354 [29:14<02:53,  5.43s/it][A

	loss_cls: tensor(0.6892, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4533, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1425, device='cuda:0', grad_fn=<AddBackward0>)



 91%|█████████ | 323/354 [29:20<02:48,  5.44s/it][A

	loss_cls: tensor(0.5479, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3407, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8886, device='cuda:0', grad_fn=<AddBackward0>)



 92%|█████████▏| 324/354 [29:25<02:42,  5.42s/it][A

	loss_cls: tensor(0.5969, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1476, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7445, device='cuda:0', grad_fn=<AddBackward0>)



 92%|█████████▏| 325/354 [29:31<02:37,  5.43s/it][A

	loss_cls: tensor(0.6488, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1599, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8087, device='cuda:0', grad_fn=<AddBackward0>)



 92%|█████████▏| 326/354 [29:36<02:31,  5.42s/it][A

	loss_cls: tensor(0.7768, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1911, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9679, device='cuda:0', grad_fn=<AddBackward0>)



 92%|█████████▏| 327/354 [29:42<02:26,  5.42s/it][A

	loss_cls: tensor(0.6094, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3630, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9724, device='cuda:0', grad_fn=<AddBackward0>)



 93%|█████████▎| 328/354 [29:47<02:20,  5.42s/it][A

	loss_cls: tensor(0.5880, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1771, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7651, device='cuda:0', grad_fn=<AddBackward0>)



 93%|█████████▎| 329/354 [29:52<02:15,  5.42s/it][A

	loss_cls: tensor(0.5475, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5328, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0803, device='cuda:0', grad_fn=<AddBackward0>)



 93%|█████████▎| 330/354 [29:58<02:10,  5.43s/it][A

	loss_cls: tensor(0.5023, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4763, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9785, device='cuda:0', grad_fn=<AddBackward0>)



 94%|█████████▎| 331/354 [30:03<02:04,  5.42s/it][A

	loss_cls: tensor(0.7790, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1137, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8926, device='cuda:0', grad_fn=<AddBackward0>)



 94%|█████████▍| 332/354 [30:09<01:59,  5.44s/it][A

	loss_cls: tensor(0.8003, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1318, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9322, device='cuda:0', grad_fn=<AddBackward0>)



 94%|█████████▍| 333/354 [30:14<01:54,  5.43s/it][A

	loss_cls: tensor(0.6026, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1208, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7234, device='cuda:0', grad_fn=<AddBackward0>)



 94%|█████████▍| 334/354 [30:20<01:48,  5.45s/it][A

	loss_cls: tensor(0.6863, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2430, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9293, device='cuda:0', grad_fn=<AddBackward0>)



 95%|█████████▍| 335/354 [30:25<01:43,  5.43s/it][A

	loss_cls: tensor(0.6976, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1123, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8099, device='cuda:0', grad_fn=<AddBackward0>)



 95%|█████████▍| 336/354 [30:30<01:37,  5.43s/it][A

	loss_cls: tensor(0.8383, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1414, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9796, device='cuda:0', grad_fn=<AddBackward0>)



 95%|█████████▌| 337/354 [30:36<01:32,  5.44s/it][A

	loss_cls: tensor(0.5121, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3311, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8432, device='cuda:0', grad_fn=<AddBackward0>)



 95%|█████████▌| 338/354 [30:41<01:26,  5.42s/it][A

	loss_cls: tensor(0.5328, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3453, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8781, device='cuda:0', grad_fn=<AddBackward0>)



 96%|█████████▌| 339/354 [30:47<01:21,  5.44s/it][A

	loss_cls: tensor(0.5568, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3071, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8638, device='cuda:0', grad_fn=<AddBackward0>)



 96%|█████████▌| 340/354 [30:52<01:16,  5.44s/it][A

	loss_cls: tensor(0.5915, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2637, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8552, device='cuda:0', grad_fn=<AddBackward0>)



 96%|█████████▋| 341/354 [30:58<01:10,  5.45s/it][A

	loss_cls: tensor(0.7141, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2190, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9331, device='cuda:0', grad_fn=<AddBackward0>)



 97%|█████████▋| 342/354 [31:03<01:05,  5.44s/it][A

	loss_cls: tensor(0.7220, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1262, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8483, device='cuda:0', grad_fn=<AddBackward0>)



 97%|█████████▋| 343/354 [31:08<00:59,  5.42s/it][A

	loss_cls: tensor(1.4531, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1741, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.6272, device='cuda:0', grad_fn=<AddBackward0>)



 97%|█████████▋| 344/354 [31:14<00:54,  5.44s/it][A

	loss_cls: tensor(0.6047, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3184, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9231, device='cuda:0', grad_fn=<AddBackward0>)



 97%|█████████▋| 345/354 [31:19<00:48,  5.42s/it][A

	loss_cls: tensor(0.4401, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1125, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5526, device='cuda:0', grad_fn=<AddBackward0>)



 98%|█████████▊| 346/354 [31:25<00:43,  5.44s/it][A

	loss_cls: tensor(0.5733, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1077, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6811, device='cuda:0', grad_fn=<AddBackward0>)



 98%|█████████▊| 347/354 [31:30<00:38,  5.43s/it][A

	loss_cls: tensor(0.7316, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0855, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8172, device='cuda:0', grad_fn=<AddBackward0>)



 98%|█████████▊| 348/354 [31:36<00:32,  5.44s/it][A

	loss_cls: tensor(0.7155, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2071, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9226, device='cuda:0', grad_fn=<AddBackward0>)



 99%|█████████▊| 349/354 [31:41<00:27,  5.43s/it][A

	loss_cls: tensor(0.4939, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2462, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7401, device='cuda:0', grad_fn=<AddBackward0>)



 99%|█████████▉| 350/354 [31:46<00:21,  5.38s/it][A

	loss_cls: tensor(0.5000, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1899, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6899, device='cuda:0', grad_fn=<AddBackward0>)



 99%|█████████▉| 351/354 [31:52<00:16,  5.42s/it][A

	loss_cls: tensor(0.6847, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0410, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7258, device='cuda:0', grad_fn=<AddBackward0>)



 99%|█████████▉| 352/354 [31:57<00:10,  5.42s/it][A

	loss_cls: tensor(0.6613, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1677, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8290, device='cuda:0', grad_fn=<AddBackward0>)



100%|█████████▉| 353/354 [32:03<00:05,  5.45s/it][A

	loss_cls: tensor(0.5919, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1056, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6975, device='cuda:0', grad_fn=<AddBackward0>)



100%|██████████| 354/354 [32:05<00:00,  5.44s/it][A

  0%|          | 0/177 [00:00<?, ?it/s][A

	loss_cls: tensor(0.3850, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0800, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4649, device='cuda:0', grad_fn=<AddBackward0>)

	Training Loss: 0.8209730478498222

	Training cls acc: 0.6898540489642185

	Training cls prec: 0.5771620387616151

	Training cls rec: 0.6180157530793124

	Training cls f1: 0.5383628685445634

--
	Training ner acc: 0.9556909498386904

	Training ner prec: 0.26936968383257753

	Training ner rec: 0.2771772157316867

	Training ner f1: 0.27273497605318053

	Current Learning rate:  0.00022857142857142857



  1%|          | 1/177 [00:00<02:14,  1.31it/s][A
  1%|          | 2/177 [00:01<02:11,  1.33it/s][A
  2%|▏         | 3/177 [00:02<02:03,  1.41it/s][A
  2%|▏         | 4/177 [00:02<02:04,  1.38it/s][A
  3%|▎         | 5/177 [00:03<02:05,  1.37it/s][A
  3%|▎         | 6/177 [00:04<02:00,  1.42it/s][A
  4%|▍         | 7/177 [00:05<02:01,  1.40it/s][A
  5%|▍         | 8/177 [00:05<02:01,  1.39it/s][A
  5%|▌         | 9/177 [00:06<02:02,  1.37it/s][A
  6%|▌         | 10/177 [00:07<01:57,  1.42it/s][A
  6%|▌         | 11/177 [00:07<01:58,  1.40it/s][A
  7%|▋         | 12/177 [00:08<01:59,  1.38it/s][A
  7%|▋         | 13/177 [00:09<01:59,  1.37it/s][A
  8%|▊         | 14/177 [00:10<01:55,  1.41it/s][A
  8%|▊         | 15/177 [00:10<01:56,  1.39it/s][A
  9%|▉         | 16/177 [00:11<01:56,  1.38it/s][A
 10%|▉         | 17/177 [00:12<01:53,  1.41it/s][A
 10%|█         | 18/177 [00:12<01:53,  1.40it/s][A
 11%|█         | 19/177 [00:13<01:54,  1.38it/s][A
 11%|█▏        | 20/


	Validation Loss: 0.8039165049959711

	Validation cls acc: 0.638888888888889

	Validation cls prec: 0.5882364810330911

	Validation cls rec: 0.5333467850417003

	Validation cls f1: 0.5022715514240937

--
	Validation ner acc: 0.9558027291858076

	Validation ner prec: 0.4932107227987463

	Validation ner rec: 0.5032015065913371

	Validation ner f1: 0.49797131647838094



  0%|          | 1/354 [00:05<31:52,  5.42s/it][A

	loss_cls: tensor(0.6249, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4764, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1014, device='cuda:0', grad_fn=<AddBackward0>)



  1%|          | 2/354 [00:10<32:05,  5.47s/it][A

	loss_cls: tensor(0.6339, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2446, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8785, device='cuda:0', grad_fn=<AddBackward0>)



  1%|          | 3/354 [00:16<31:48,  5.44s/it][A

	loss_cls: tensor(0.4339, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0675, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5014, device='cuda:0', grad_fn=<AddBackward0>)



  1%|          | 4/354 [00:21<31:40,  5.43s/it][A

	loss_cls: tensor(0.7589, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1838, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9427, device='cuda:0', grad_fn=<AddBackward0>)



  1%|▏         | 5/354 [00:27<31:40,  5.45s/it][A

	loss_cls: tensor(0.7450, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4567, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2017, device='cuda:0', grad_fn=<AddBackward0>)



  2%|▏         | 6/354 [00:32<31:28,  5.43s/it][A

	loss_cls: tensor(0.5158, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1249, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6407, device='cuda:0', grad_fn=<AddBackward0>)



  2%|▏         | 7/354 [00:38<31:24,  5.43s/it][A

	loss_cls: tensor(0.5495, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0846, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6341, device='cuda:0', grad_fn=<AddBackward0>)



  2%|▏         | 8/354 [00:43<31:15,  5.42s/it][A

	loss_cls: tensor(0.8388, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1921, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0309, device='cuda:0', grad_fn=<AddBackward0>)



  3%|▎         | 9/354 [00:48<31:17,  5.44s/it][A

	loss_cls: tensor(0.7075, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2654, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9729, device='cuda:0', grad_fn=<AddBackward0>)



  3%|▎         | 10/354 [00:54<31:08,  5.43s/it][A

	loss_cls: tensor(0.5028, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3403, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8432, device='cuda:0', grad_fn=<AddBackward0>)



  3%|▎         | 11/354 [00:59<30:57,  5.42s/it][A

	loss_cls: tensor(0.5869, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1491, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7360, device='cuda:0', grad_fn=<AddBackward0>)



  3%|▎         | 12/354 [01:05<30:52,  5.42s/it][A

	loss_cls: tensor(0.6770, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3925, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0695, device='cuda:0', grad_fn=<AddBackward0>)



  4%|▎         | 13/354 [01:10<30:40,  5.40s/it][A

	loss_cls: tensor(0.5904, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1043, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6947, device='cuda:0', grad_fn=<AddBackward0>)



  4%|▍         | 14/354 [01:15<30:38,  5.41s/it][A

	loss_cls: tensor(0.4950, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0473, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5423, device='cuda:0', grad_fn=<AddBackward0>)



  4%|▍         | 15/354 [01:21<30:32,  5.40s/it][A

	loss_cls: tensor(0.5422, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2138, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7560, device='cuda:0', grad_fn=<AddBackward0>)



  5%|▍         | 16/354 [01:26<30:33,  5.43s/it][A

	loss_cls: tensor(0.6816, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2697, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9512, device='cuda:0', grad_fn=<AddBackward0>)



  5%|▍         | 17/354 [01:32<30:24,  5.41s/it][A

	loss_cls: tensor(0.4545, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3475, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8020, device='cuda:0', grad_fn=<AddBackward0>)



  5%|▌         | 18/354 [01:37<30:14,  5.40s/it][A

	loss_cls: tensor(0.5938, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5579, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1516, device='cuda:0', grad_fn=<AddBackward0>)



  5%|▌         | 19/354 [01:43<30:14,  5.42s/it][A

	loss_cls: tensor(0.6141, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1305, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7446, device='cuda:0', grad_fn=<AddBackward0>)



  6%|▌         | 20/354 [01:48<30:03,  5.40s/it][A

	loss_cls: tensor(0.5306, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1609, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6915, device='cuda:0', grad_fn=<AddBackward0>)



  6%|▌         | 21/354 [01:53<30:09,  5.43s/it][A

	loss_cls: tensor(0.4639, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1137, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5776, device='cuda:0', grad_fn=<AddBackward0>)



  6%|▌         | 22/354 [01:59<30:01,  5.43s/it][A

	loss_cls: tensor(0.5692, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1172, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6864, device='cuda:0', grad_fn=<AddBackward0>)



  6%|▋         | 23/354 [02:04<29:59,  5.44s/it][A

	loss_cls: tensor(0.9888, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2571, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2458, device='cuda:0', grad_fn=<AddBackward0>)



  7%|▋         | 24/354 [02:10<29:49,  5.42s/it][A

	loss_cls: tensor(0.7650, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1657, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9307, device='cuda:0', grad_fn=<AddBackward0>)



  7%|▋         | 25/354 [02:15<29:37,  5.40s/it][A

	loss_cls: tensor(0.4533, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0556, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5089, device='cuda:0', grad_fn=<AddBackward0>)



  7%|▋         | 26/354 [02:20<29:36,  5.42s/it][A

	loss_cls: tensor(0.5867, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2207, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8075, device='cuda:0', grad_fn=<AddBackward0>)



  8%|▊         | 27/354 [02:26<29:22,  5.39s/it][A

	loss_cls: tensor(0.4527, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3831, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8357, device='cuda:0', grad_fn=<AddBackward0>)



  8%|▊         | 28/354 [02:31<29:23,  5.41s/it][A

	loss_cls: tensor(0.5686, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3010, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8696, device='cuda:0', grad_fn=<AddBackward0>)



  8%|▊         | 29/354 [02:37<29:11,  5.39s/it][A

	loss_cls: tensor(0.8543, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1392, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9935, device='cuda:0', grad_fn=<AddBackward0>)



  8%|▊         | 30/354 [02:42<29:11,  5.41s/it][A

	loss_cls: tensor(0.4098, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1845, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5943, device='cuda:0', grad_fn=<AddBackward0>)



  9%|▉         | 31/354 [02:47<29:00,  5.39s/it][A

	loss_cls: tensor(0.3926, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2082, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6008, device='cuda:0', grad_fn=<AddBackward0>)



  9%|▉         | 32/354 [02:53<28:52,  5.38s/it][A

	loss_cls: tensor(0.3780, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1524, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5304, device='cuda:0', grad_fn=<AddBackward0>)



  9%|▉         | 33/354 [02:58<28:51,  5.40s/it][A

	loss_cls: tensor(0.8978, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1984, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0962, device='cuda:0', grad_fn=<AddBackward0>)



 10%|▉         | 34/354 [03:04<28:42,  5.38s/it][A

	loss_cls: tensor(0.3990, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1761, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5750, device='cuda:0', grad_fn=<AddBackward0>)



 10%|▉         | 35/354 [03:09<28:45,  5.41s/it][A

	loss_cls: tensor(0.4466, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1509, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5975, device='cuda:0', grad_fn=<AddBackward0>)



 10%|█         | 36/354 [03:14<28:37,  5.40s/it][A

	loss_cls: tensor(0.4717, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2694, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7411, device='cuda:0', grad_fn=<AddBackward0>)



 10%|█         | 37/354 [03:20<28:39,  5.42s/it][A

	loss_cls: tensor(0.8813, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2245, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1058, device='cuda:0', grad_fn=<AddBackward0>)



 11%|█         | 38/354 [03:25<28:30,  5.41s/it][A

	loss_cls: tensor(0.6477, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1554, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8031, device='cuda:0', grad_fn=<AddBackward0>)



 11%|█         | 39/354 [03:31<28:20,  5.40s/it][A

	loss_cls: tensor(0.7322, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1049, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8371, device='cuda:0', grad_fn=<AddBackward0>)



 11%|█▏        | 40/354 [03:36<28:18,  5.41s/it][A

	loss_cls: tensor(0.4850, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1343, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6194, device='cuda:0', grad_fn=<AddBackward0>)



 12%|█▏        | 41/354 [03:41<28:08,  5.39s/it][A

	loss_cls: tensor(0.6925, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5046, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1971, device='cuda:0', grad_fn=<AddBackward0>)



 12%|█▏        | 42/354 [03:47<28:07,  5.41s/it][A

	loss_cls: tensor(0.6288, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1938, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8225, device='cuda:0', grad_fn=<AddBackward0>)



 12%|█▏        | 43/354 [03:52<27:58,  5.40s/it][A

	loss_cls: tensor(0.9016, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4724, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.3740, device='cuda:0', grad_fn=<AddBackward0>)



 12%|█▏        | 44/354 [03:58<28:01,  5.43s/it][A

	loss_cls: tensor(0.5735, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2725, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8460, device='cuda:0', grad_fn=<AddBackward0>)



 13%|█▎        | 45/354 [04:03<27:53,  5.42s/it][A

	loss_cls: tensor(0.5835, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1372, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7207, device='cuda:0', grad_fn=<AddBackward0>)



 13%|█▎        | 46/354 [04:08<27:46,  5.41s/it][A

	loss_cls: tensor(0.4781, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3452, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8233, device='cuda:0', grad_fn=<AddBackward0>)



 13%|█▎        | 47/354 [04:14<27:46,  5.43s/it][A

	loss_cls: tensor(0.7475, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1875, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9350, device='cuda:0', grad_fn=<AddBackward0>)



 14%|█▎        | 48/354 [04:19<27:37,  5.42s/it][A

	loss_cls: tensor(0.5366, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1748, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7113, device='cuda:0', grad_fn=<AddBackward0>)



 14%|█▍        | 49/354 [04:25<27:40,  5.44s/it][A

	loss_cls: tensor(0.5430, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1430, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6860, device='cuda:0', grad_fn=<AddBackward0>)



 14%|█▍        | 50/354 [04:30<27:29,  5.43s/it][A

	loss_cls: tensor(0.3752, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1621, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5373, device='cuda:0', grad_fn=<AddBackward0>)



 14%|█▍        | 51/354 [04:36<27:27,  5.44s/it][A

	loss_cls: tensor(0.5584, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4226, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9810, device='cuda:0', grad_fn=<AddBackward0>)



 15%|█▍        | 52/354 [04:41<27:16,  5.42s/it][A

	loss_cls: tensor(0.7605, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2931, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0536, device='cuda:0', grad_fn=<AddBackward0>)



 15%|█▍        | 53/354 [04:46<27:05,  5.40s/it][A

	loss_cls: tensor(0.5977, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2409, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8385, device='cuda:0', grad_fn=<AddBackward0>)



 15%|█▌        | 54/354 [04:52<27:07,  5.42s/it][A

	loss_cls: tensor(0.5878, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2912, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8790, device='cuda:0', grad_fn=<AddBackward0>)



 16%|█▌        | 55/354 [04:57<26:59,  5.42s/it][A

	loss_cls: tensor(0.5510, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0503, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6014, device='cuda:0', grad_fn=<AddBackward0>)



 16%|█▌        | 56/354 [05:03<26:58,  5.43s/it][A

	loss_cls: tensor(0.7177, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0495, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7672, device='cuda:0', grad_fn=<AddBackward0>)



 16%|█▌        | 57/354 [05:08<26:49,  5.42s/it][A

	loss_cls: tensor(0.6487, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3840, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0327, device='cuda:0', grad_fn=<AddBackward0>)



 16%|█▋        | 58/354 [05:14<26:48,  5.43s/it][A

	loss_cls: tensor(0.5869, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4333, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0202, device='cuda:0', grad_fn=<AddBackward0>)



 17%|█▋        | 59/354 [05:19<26:36,  5.41s/it][A

	loss_cls: tensor(0.5779, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4358, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0136, device='cuda:0', grad_fn=<AddBackward0>)



 17%|█▋        | 60/354 [05:24<26:24,  5.39s/it][A

	loss_cls: tensor(0.5040, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2892, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7932, device='cuda:0', grad_fn=<AddBackward0>)



 17%|█▋        | 61/354 [05:30<26:23,  5.40s/it][A

	loss_cls: tensor(0.8878, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1192, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0071, device='cuda:0', grad_fn=<AddBackward0>)



 18%|█▊        | 62/354 [05:35<26:15,  5.40s/it][A

	loss_cls: tensor(0.5334, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0569, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5903, device='cuda:0', grad_fn=<AddBackward0>)



 18%|█▊        | 63/354 [05:41<26:15,  5.41s/it][A

	loss_cls: tensor(0.5487, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3999, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9486, device='cuda:0', grad_fn=<AddBackward0>)



 18%|█▊        | 64/354 [05:46<26:05,  5.40s/it][A

	loss_cls: tensor(0.5596, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1516, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7112, device='cuda:0', grad_fn=<AddBackward0>)



 18%|█▊        | 65/354 [05:51<26:05,  5.42s/it][A

	loss_cls: tensor(0.7501, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2461, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9961, device='cuda:0', grad_fn=<AddBackward0>)



 19%|█▊        | 66/354 [05:57<25:59,  5.41s/it][A

	loss_cls: tensor(0.6628, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3180, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9808, device='cuda:0', grad_fn=<AddBackward0>)



 19%|█▉        | 67/354 [06:02<25:51,  5.41s/it][A

	loss_cls: tensor(0.4600, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2489, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7089, device='cuda:0', grad_fn=<AddBackward0>)



 19%|█▉        | 68/354 [06:08<25:52,  5.43s/it][A

	loss_cls: tensor(0.4596, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3149, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7745, device='cuda:0', grad_fn=<AddBackward0>)



 19%|█▉        | 69/354 [06:13<25:44,  5.42s/it][A

	loss_cls: tensor(0.6003, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1325, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7327, device='cuda:0', grad_fn=<AddBackward0>)



 20%|█▉        | 70/354 [06:19<25:46,  5.44s/it][A

	loss_cls: tensor(0.6103, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1189, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7292, device='cuda:0', grad_fn=<AddBackward0>)



 20%|██        | 71/354 [06:24<25:35,  5.43s/it][A

	loss_cls: tensor(0.4990, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0648, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5639, device='cuda:0', grad_fn=<AddBackward0>)



 20%|██        | 72/354 [06:29<25:35,  5.44s/it][A

	loss_cls: tensor(0.6454, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2267, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8721, device='cuda:0', grad_fn=<AddBackward0>)



 21%|██        | 73/354 [06:35<25:22,  5.42s/it][A

	loss_cls: tensor(0.5975, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5281, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1255, device='cuda:0', grad_fn=<AddBackward0>)



 21%|██        | 74/354 [06:40<25:17,  5.42s/it][A

	loss_cls: tensor(0.4099, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2976, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7075, device='cuda:0', grad_fn=<AddBackward0>)



 21%|██        | 75/354 [06:46<25:11,  5.42s/it][A

	loss_cls: tensor(0.4101, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1171, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5273, device='cuda:0', grad_fn=<AddBackward0>)



 21%|██▏       | 76/354 [06:51<25:03,  5.41s/it][A

	loss_cls: tensor(0.5520, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3639, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9158, device='cuda:0', grad_fn=<AddBackward0>)



 22%|██▏       | 77/354 [06:57<25:01,  5.42s/it][A

	loss_cls: tensor(0.8231, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3502, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1733, device='cuda:0', grad_fn=<AddBackward0>)



 22%|██▏       | 78/354 [07:02<24:52,  5.41s/it][A

	loss_cls: tensor(0.4496, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0751, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5247, device='cuda:0', grad_fn=<AddBackward0>)



 22%|██▏       | 79/354 [07:07<24:50,  5.42s/it][A

	loss_cls: tensor(0.5129, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4417, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9546, device='cuda:0', grad_fn=<AddBackward0>)



 23%|██▎       | 80/354 [07:13<24:38,  5.40s/it][A

	loss_cls: tensor(0.4000, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3882, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7882, device='cuda:0', grad_fn=<AddBackward0>)



 23%|██▎       | 81/354 [07:18<24:38,  5.42s/it][A

	loss_cls: tensor(0.6049, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1785, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7834, device='cuda:0', grad_fn=<AddBackward0>)



 23%|██▎       | 82/354 [07:24<24:30,  5.41s/it][A

	loss_cls: tensor(0.6568, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4017, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0586, device='cuda:0', grad_fn=<AddBackward0>)



 23%|██▎       | 83/354 [07:29<24:23,  5.40s/it][A

	loss_cls: tensor(0.3719, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2119, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5838, device='cuda:0', grad_fn=<AddBackward0>)



 24%|██▎       | 84/354 [07:34<24:23,  5.42s/it][A

	loss_cls: tensor(0.4755, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1535, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6290, device='cuda:0', grad_fn=<AddBackward0>)



 24%|██▍       | 85/354 [07:40<24:14,  5.41s/it][A

	loss_cls: tensor(0.4709, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0761, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5470, device='cuda:0', grad_fn=<AddBackward0>)



 24%|██▍       | 86/354 [07:45<24:10,  5.41s/it][A

	loss_cls: tensor(0.5125, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1593, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6718, device='cuda:0', grad_fn=<AddBackward0>)



 25%|██▍       | 87/354 [07:51<23:59,  5.39s/it][A

	loss_cls: tensor(0.4872, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5761, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0633, device='cuda:0', grad_fn=<AddBackward0>)



 25%|██▍       | 88/354 [07:56<24:00,  5.41s/it][A

	loss_cls: tensor(0.4367, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2445, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6813, device='cuda:0', grad_fn=<AddBackward0>)



 25%|██▌       | 89/354 [08:01<23:52,  5.41s/it][A

	loss_cls: tensor(0.4469, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0792, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5261, device='cuda:0', grad_fn=<AddBackward0>)



 25%|██▌       | 90/354 [08:07<23:43,  5.39s/it][A

	loss_cls: tensor(0.5058, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1618, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6676, device='cuda:0', grad_fn=<AddBackward0>)



 26%|██▌       | 91/354 [08:12<23:44,  5.42s/it][A

	loss_cls: tensor(0.4606, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1080, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5686, device='cuda:0', grad_fn=<AddBackward0>)



 26%|██▌       | 92/354 [08:18<23:37,  5.41s/it][A

	loss_cls: tensor(0.6866, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2560, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9427, device='cuda:0', grad_fn=<AddBackward0>)



 26%|██▋       | 93/354 [08:23<23:35,  5.42s/it][A

	loss_cls: tensor(0.5527, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2068, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7595, device='cuda:0', grad_fn=<AddBackward0>)



 27%|██▋       | 94/354 [08:28<23:25,  5.40s/it][A

	loss_cls: tensor(0.5545, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2256, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7801, device='cuda:0', grad_fn=<AddBackward0>)



 27%|██▋       | 95/354 [08:34<23:25,  5.43s/it][A

	loss_cls: tensor(0.8913, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2478, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1391, device='cuda:0', grad_fn=<AddBackward0>)



 27%|██▋       | 96/354 [08:39<23:16,  5.41s/it][A

	loss_cls: tensor(0.5398, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3821, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9219, device='cuda:0', grad_fn=<AddBackward0>)



 27%|██▋       | 97/354 [08:45<23:07,  5.40s/it][A

	loss_cls: tensor(0.8647, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2375, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1022, device='cuda:0', grad_fn=<AddBackward0>)



 28%|██▊       | 98/354 [08:50<23:05,  5.41s/it][A

	loss_cls: tensor(0.7187, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4634, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1821, device='cuda:0', grad_fn=<AddBackward0>)



 28%|██▊       | 99/354 [08:55<22:56,  5.40s/it][A

	loss_cls: tensor(0.2373, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0449, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.2822, device='cuda:0', grad_fn=<AddBackward0>)



 28%|██▊       | 100/354 [09:01<22:55,  5.42s/it][A

	loss_cls: tensor(0.5204, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1827, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7030, device='cuda:0', grad_fn=<AddBackward0>)



 29%|██▊       | 101/354 [09:06<22:48,  5.41s/it][A

	loss_cls: tensor(0.4535, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1152, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5687, device='cuda:0', grad_fn=<AddBackward0>)



 29%|██▉       | 102/354 [09:12<22:47,  5.43s/it][A

	loss_cls: tensor(0.6110, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4307, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0418, device='cuda:0', grad_fn=<AddBackward0>)



 29%|██▉       | 103/354 [09:17<22:38,  5.41s/it][A

	loss_cls: tensor(0.4304, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1795, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6098, device='cuda:0', grad_fn=<AddBackward0>)



 29%|██▉       | 104/354 [09:23<22:31,  5.41s/it][A

	loss_cls: tensor(0.4891, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3453, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8344, device='cuda:0', grad_fn=<AddBackward0>)



 30%|██▉       | 105/354 [09:28<22:31,  5.43s/it][A

	loss_cls: tensor(0.6953, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1242, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8195, device='cuda:0', grad_fn=<AddBackward0>)



 30%|██▉       | 106/354 [09:33<22:20,  5.41s/it][A

	loss_cls: tensor(0.6394, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3495, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9889, device='cuda:0', grad_fn=<AddBackward0>)



 30%|███       | 107/354 [09:39<22:16,  5.41s/it][A

	loss_cls: tensor(0.5350, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0491, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5841, device='cuda:0', grad_fn=<AddBackward0>)



 31%|███       | 108/354 [09:44<22:08,  5.40s/it][A

	loss_cls: tensor(0.5927, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1076, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7003, device='cuda:0', grad_fn=<AddBackward0>)



 31%|███       | 109/354 [09:50<22:06,  5.41s/it][A

	loss_cls: tensor(0.9055, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3146, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2201, device='cuda:0', grad_fn=<AddBackward0>)



 31%|███       | 110/354 [09:55<21:57,  5.40s/it][A

	loss_cls: tensor(0.7190, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2455, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9645, device='cuda:0', grad_fn=<AddBackward0>)



 31%|███▏      | 111/354 [10:00<21:48,  5.38s/it][A

	loss_cls: tensor(0.5590, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1988, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7578, device='cuda:0', grad_fn=<AddBackward0>)



 32%|███▏      | 112/354 [10:06<21:45,  5.39s/it][A

	loss_cls: tensor(0.4851, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4955, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9806, device='cuda:0', grad_fn=<AddBackward0>)



 32%|███▏      | 113/354 [10:11<21:37,  5.38s/it][A

	loss_cls: tensor(0.4112, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1029, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5141, device='cuda:0', grad_fn=<AddBackward0>)



 32%|███▏      | 114/354 [10:17<21:39,  5.41s/it][A

	loss_cls: tensor(0.6049, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2273, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8322, device='cuda:0', grad_fn=<AddBackward0>)



 32%|███▏      | 115/354 [10:22<21:34,  5.42s/it][A

	loss_cls: tensor(0.6094, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3148, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9242, device='cuda:0', grad_fn=<AddBackward0>)



 33%|███▎      | 116/354 [10:28<21:34,  5.44s/it][A

	loss_cls: tensor(0.5255, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0701, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5956, device='cuda:0', grad_fn=<AddBackward0>)



 33%|███▎      | 117/354 [10:33<21:25,  5.42s/it][A

	loss_cls: tensor(0.7109, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3935, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1044, device='cuda:0', grad_fn=<AddBackward0>)



 33%|███▎      | 118/354 [10:38<21:15,  5.41s/it][A

	loss_cls: tensor(0.7708, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2406, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0114, device='cuda:0', grad_fn=<AddBackward0>)



 34%|███▎      | 119/354 [10:44<21:13,  5.42s/it][A

	loss_cls: tensor(0.9632, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1972, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1604, device='cuda:0', grad_fn=<AddBackward0>)



 34%|███▍      | 120/354 [10:49<21:05,  5.41s/it][A

	loss_cls: tensor(0.1872, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1938, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.3810, device='cuda:0', grad_fn=<AddBackward0>)



 34%|███▍      | 121/354 [10:55<21:05,  5.43s/it][A

	loss_cls: tensor(0.5196, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1515, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6711, device='cuda:0', grad_fn=<AddBackward0>)



 34%|███▍      | 122/354 [11:00<20:57,  5.42s/it][A

	loss_cls: tensor(0.7822, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1778, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9599, device='cuda:0', grad_fn=<AddBackward0>)



 35%|███▍      | 123/354 [11:05<20:55,  5.43s/it][A

	loss_cls: tensor(0.6810, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3094, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9904, device='cuda:0', grad_fn=<AddBackward0>)



 35%|███▌      | 124/354 [11:11<20:49,  5.43s/it][A

	loss_cls: tensor(0.8061, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2948, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1009, device='cuda:0', grad_fn=<AddBackward0>)



 35%|███▌      | 125/354 [11:16<20:39,  5.41s/it][A

	loss_cls: tensor(0.6700, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4239, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0939, device='cuda:0', grad_fn=<AddBackward0>)



 36%|███▌      | 126/354 [11:22<20:37,  5.43s/it][A

	loss_cls: tensor(0.6821, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3472, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0292, device='cuda:0', grad_fn=<AddBackward0>)



 36%|███▌      | 127/354 [11:27<20:24,  5.40s/it][A

	loss_cls: tensor(0.5979, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2991, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8971, device='cuda:0', grad_fn=<AddBackward0>)



 36%|███▌      | 128/354 [11:32<20:23,  5.41s/it][A

	loss_cls: tensor(0.6069, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4340, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0409, device='cuda:0', grad_fn=<AddBackward0>)



 36%|███▋      | 129/354 [11:38<20:15,  5.40s/it][A

	loss_cls: tensor(0.3564, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2121, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5684, device='cuda:0', grad_fn=<AddBackward0>)



 37%|███▋      | 130/354 [11:43<20:14,  5.42s/it][A

	loss_cls: tensor(0.5712, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2525, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8237, device='cuda:0', grad_fn=<AddBackward0>)



 37%|███▋      | 131/354 [11:49<20:05,  5.40s/it][A

	loss_cls: tensor(0.9292, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1325, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0617, device='cuda:0', grad_fn=<AddBackward0>)



 37%|███▋      | 132/354 [11:54<19:58,  5.40s/it][A

	loss_cls: tensor(0.8817, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2336, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1153, device='cuda:0', grad_fn=<AddBackward0>)



 38%|███▊      | 133/354 [12:00<19:56,  5.42s/it][A

	loss_cls: tensor(0.5037, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2152, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7189, device='cuda:0', grad_fn=<AddBackward0>)



 38%|███▊      | 134/354 [12:05<19:46,  5.39s/it][A

	loss_cls: tensor(0.4866, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1061, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5927, device='cuda:0', grad_fn=<AddBackward0>)



 38%|███▊      | 135/354 [12:10<19:45,  5.41s/it][A

	loss_cls: tensor(0.5288, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2795, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8083, device='cuda:0', grad_fn=<AddBackward0>)



 38%|███▊      | 136/354 [12:16<19:37,  5.40s/it][A

	loss_cls: tensor(0.5295, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2404, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7700, device='cuda:0', grad_fn=<AddBackward0>)



 39%|███▊      | 137/354 [12:21<19:44,  5.46s/it][A

	loss_cls: tensor(0.7369, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0576, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7944, device='cuda:0', grad_fn=<AddBackward0>)



 39%|███▉      | 138/354 [12:27<19:40,  5.47s/it][A

	loss_cls: tensor(0.7253, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1717, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8969, device='cuda:0', grad_fn=<AddBackward0>)



 39%|███▉      | 139/354 [12:32<19:36,  5.47s/it][A

	loss_cls: tensor(0.5278, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2680, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7958, device='cuda:0', grad_fn=<AddBackward0>)



 40%|███▉      | 140/354 [12:38<19:36,  5.50s/it][A

	loss_cls: tensor(0.5371, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1422, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6793, device='cuda:0', grad_fn=<AddBackward0>)



 40%|███▉      | 141/354 [12:43<19:31,  5.50s/it][A

	loss_cls: tensor(0.6662, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2693, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9355, device='cuda:0', grad_fn=<AddBackward0>)



 40%|████      | 142/354 [12:49<19:33,  5.54s/it][A

	loss_cls: tensor(0.5605, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1650, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7254, device='cuda:0', grad_fn=<AddBackward0>)



 40%|████      | 143/354 [12:54<19:26,  5.53s/it][A

	loss_cls: tensor(0.5654, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0844, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6499, device='cuda:0', grad_fn=<AddBackward0>)



 41%|████      | 144/354 [13:00<19:24,  5.55s/it][A

	loss_cls: tensor(0.8960, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1167, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0127, device='cuda:0', grad_fn=<AddBackward0>)



 41%|████      | 145/354 [13:06<19:14,  5.52s/it][A

	loss_cls: tensor(0.6956, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1451, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8407, device='cuda:0', grad_fn=<AddBackward0>)



 41%|████      | 146/354 [13:11<19:09,  5.53s/it][A

	loss_cls: tensor(0.7475, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0365, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7840, device='cuda:0', grad_fn=<AddBackward0>)



 42%|████▏     | 147/354 [13:17<19:06,  5.54s/it][A

	loss_cls: tensor(0.6025, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1549, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7574, device='cuda:0', grad_fn=<AddBackward0>)



 42%|████▏     | 148/354 [13:22<18:59,  5.53s/it][A

	loss_cls: tensor(0.8346, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4076, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2421, device='cuda:0', grad_fn=<AddBackward0>)



 42%|████▏     | 149/354 [13:28<18:56,  5.54s/it][A

	loss_cls: tensor(0.4388, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3478, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7867, device='cuda:0', grad_fn=<AddBackward0>)



 42%|████▏     | 150/354 [13:33<18:47,  5.53s/it][A

	loss_cls: tensor(0.4657, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2091, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6748, device='cuda:0', grad_fn=<AddBackward0>)



 43%|████▎     | 151/354 [13:39<18:44,  5.54s/it][A

	loss_cls: tensor(0.8196, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2258, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0454, device='cuda:0', grad_fn=<AddBackward0>)



 43%|████▎     | 152/354 [13:44<18:38,  5.53s/it][A

	loss_cls: tensor(0.5856, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3366, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9222, device='cuda:0', grad_fn=<AddBackward0>)



 43%|████▎     | 153/354 [13:50<18:31,  5.53s/it][A

	loss_cls: tensor(0.8115, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0819, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8934, device='cuda:0', grad_fn=<AddBackward0>)



 44%|████▎     | 154/354 [13:55<18:28,  5.54s/it][A

	loss_cls: tensor(0.6443, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3071, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9514, device='cuda:0', grad_fn=<AddBackward0>)



 44%|████▍     | 155/354 [14:01<18:19,  5.53s/it][A

	loss_cls: tensor(0.6273, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0904, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7177, device='cuda:0', grad_fn=<AddBackward0>)



 44%|████▍     | 156/354 [14:06<18:17,  5.54s/it][A

	loss_cls: tensor(0.6827, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2235, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9061, device='cuda:0', grad_fn=<AddBackward0>)



 44%|████▍     | 157/354 [14:12<18:07,  5.52s/it][A

	loss_cls: tensor(0.6187, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2061, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8247, device='cuda:0', grad_fn=<AddBackward0>)



 45%|████▍     | 158/354 [14:17<18:05,  5.54s/it][A

	loss_cls: tensor(0.6905, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4360, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1265, device='cuda:0', grad_fn=<AddBackward0>)



 45%|████▍     | 159/354 [14:23<17:53,  5.50s/it][A

	loss_cls: tensor(0.6459, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1702, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8161, device='cuda:0', grad_fn=<AddBackward0>)



 45%|████▌     | 160/354 [14:28<17:46,  5.50s/it][A

	loss_cls: tensor(0.8796, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2110, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0906, device='cuda:0', grad_fn=<AddBackward0>)



 45%|████▌     | 161/354 [14:34<17:45,  5.52s/it][A

	loss_cls: tensor(0.4689, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0670, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5359, device='cuda:0', grad_fn=<AddBackward0>)



 46%|████▌     | 162/354 [14:39<17:38,  5.51s/it][A

	loss_cls: tensor(0.7368, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1554, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8922, device='cuda:0', grad_fn=<AddBackward0>)



 46%|████▌     | 163/354 [14:45<17:38,  5.54s/it][A

	loss_cls: tensor(0.5657, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2367, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8024, device='cuda:0', grad_fn=<AddBackward0>)



 46%|████▋     | 164/354 [14:51<17:29,  5.52s/it][A

	loss_cls: tensor(0.3910, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0724, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4634, device='cuda:0', grad_fn=<AddBackward0>)



 47%|████▋     | 165/354 [14:56<17:29,  5.55s/it][A

	loss_cls: tensor(0.4035, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0338, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4373, device='cuda:0', grad_fn=<AddBackward0>)



 47%|████▋     | 166/354 [15:02<17:19,  5.53s/it][A

	loss_cls: tensor(0.4194, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1918, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6111, device='cuda:0', grad_fn=<AddBackward0>)



 47%|████▋     | 167/354 [15:07<17:11,  5.52s/it][A

	loss_cls: tensor(0.4481, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1552, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6034, device='cuda:0', grad_fn=<AddBackward0>)



 47%|████▋     | 168/354 [15:13<17:09,  5.54s/it][A

	loss_cls: tensor(0.6696, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3791, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0487, device='cuda:0', grad_fn=<AddBackward0>)



 48%|████▊     | 169/354 [15:18<17:01,  5.52s/it][A

	loss_cls: tensor(0.4338, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0891, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5229, device='cuda:0', grad_fn=<AddBackward0>)



 48%|████▊     | 170/354 [15:24<16:59,  5.54s/it][A

	loss_cls: tensor(0.6653, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0863, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7516, device='cuda:0', grad_fn=<AddBackward0>)



 48%|████▊     | 171/354 [15:29<16:51,  5.53s/it][A

	loss_cls: tensor(0.3340, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0954, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4294, device='cuda:0', grad_fn=<AddBackward0>)



 49%|████▊     | 172/354 [15:34<16:16,  5.36s/it][A

	loss_cls: tensor(0.5838, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.6082, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1920, device='cuda:0', grad_fn=<AddBackward0>)



 49%|████▉     | 173/354 [15:39<15:46,  5.23s/it][A

	loss_cls: tensor(0.7362, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3676, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1038, device='cuda:0', grad_fn=<AddBackward0>)



 49%|████▉     | 174/354 [15:44<15:42,  5.24s/it][A

	loss_cls: tensor(0.4910, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3473, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8383, device='cuda:0', grad_fn=<AddBackward0>)



 49%|████▉     | 175/354 [15:50<15:52,  5.32s/it][A

	loss_cls: tensor(0.4131, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1241, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5372, device='cuda:0', grad_fn=<AddBackward0>)



 50%|████▉     | 176/354 [15:55<15:56,  5.37s/it][A

	loss_cls: tensor(0.6602, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3028, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9630, device='cuda:0', grad_fn=<AddBackward0>)



 50%|█████     | 177/354 [16:01<16:01,  5.43s/it][A

	loss_cls: tensor(0.6670, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2296, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8966, device='cuda:0', grad_fn=<AddBackward0>)



 50%|█████     | 178/354 [16:07<15:59,  5.45s/it][A

	loss_cls: tensor(0.8892, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3718, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2610, device='cuda:0', grad_fn=<AddBackward0>)



 51%|█████     | 179/354 [16:12<15:59,  5.49s/it][A

	loss_cls: tensor(0.5860, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1358, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7218, device='cuda:0', grad_fn=<AddBackward0>)



 51%|█████     | 180/354 [16:18<15:54,  5.49s/it][A

	loss_cls: tensor(0.7425, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.6657, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.4082, device='cuda:0', grad_fn=<AddBackward0>)



 51%|█████     | 181/354 [16:23<15:52,  5.50s/it][A

	loss_cls: tensor(0.6490, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2772, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9261, device='cuda:0', grad_fn=<AddBackward0>)



 51%|█████▏    | 182/354 [16:29<15:45,  5.50s/it][A

	loss_cls: tensor(0.5044, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2079, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7122, device='cuda:0', grad_fn=<AddBackward0>)



 52%|█████▏    | 183/354 [16:34<15:39,  5.49s/it][A

	loss_cls: tensor(0.4939, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1626, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6565, device='cuda:0', grad_fn=<AddBackward0>)



 52%|█████▏    | 184/354 [16:40<15:38,  5.52s/it][A

	loss_cls: tensor(0.6300, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2545, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8845, device='cuda:0', grad_fn=<AddBackward0>)



 52%|█████▏    | 185/354 [16:45<15:31,  5.51s/it][A

	loss_cls: tensor(0.3539, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1913, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5452, device='cuda:0', grad_fn=<AddBackward0>)



 53%|█████▎    | 186/354 [16:51<15:28,  5.53s/it][A

	loss_cls: tensor(0.5834, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1490, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7323, device='cuda:0', grad_fn=<AddBackward0>)



 53%|█████▎    | 187/354 [16:56<15:19,  5.51s/it][A

	loss_cls: tensor(0.3522, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1504, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5026, device='cuda:0', grad_fn=<AddBackward0>)



 53%|█████▎    | 188/354 [17:01<14:47,  5.34s/it][A

	loss_cls: tensor(0.7702, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1624, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9326, device='cuda:0', grad_fn=<AddBackward0>)



 53%|█████▎    | 189/354 [17:07<14:43,  5.35s/it][A

	loss_cls: tensor(0.8176, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3453, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1629, device='cuda:0', grad_fn=<AddBackward0>)



 54%|█████▎    | 190/354 [17:12<14:45,  5.40s/it][A

	loss_cls: tensor(0.3584, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2166, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5750, device='cuda:0', grad_fn=<AddBackward0>)



 54%|█████▍    | 191/354 [17:18<14:48,  5.45s/it][A

	loss_cls: tensor(0.4435, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1693, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6128, device='cuda:0', grad_fn=<AddBackward0>)



 54%|█████▍    | 192/354 [17:23<14:19,  5.31s/it][A

	loss_cls: tensor(0.5578, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2323, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7901, device='cuda:0', grad_fn=<AddBackward0>)



 55%|█████▍    | 193/354 [17:28<14:26,  5.38s/it][A

	loss_cls: tensor(0.8851, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3941, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2791, device='cuda:0', grad_fn=<AddBackward0>)



 55%|█████▍    | 194/354 [17:33<14:18,  5.36s/it][A

	loss_cls: tensor(0.8282, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3177, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1459, device='cuda:0', grad_fn=<AddBackward0>)



 55%|█████▌    | 195/354 [17:39<14:23,  5.43s/it][A

	loss_cls: tensor(0.4754, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2144, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6898, device='cuda:0', grad_fn=<AddBackward0>)



 55%|█████▌    | 196/354 [17:45<14:21,  5.45s/it][A

	loss_cls: tensor(0.6338, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0852, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7190, device='cuda:0', grad_fn=<AddBackward0>)



 56%|█████▌    | 197/354 [17:50<14:18,  5.47s/it][A

	loss_cls: tensor(0.5412, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2861, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8273, device='cuda:0', grad_fn=<AddBackward0>)



 56%|█████▌    | 198/354 [17:56<14:15,  5.49s/it][A

	loss_cls: tensor(0.5627, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1678, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7305, device='cuda:0', grad_fn=<AddBackward0>)



 56%|█████▌    | 199/354 [18:01<14:10,  5.48s/it][A

	loss_cls: tensor(0.6201, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2113, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8314, device='cuda:0', grad_fn=<AddBackward0>)



 56%|█████▋    | 200/354 [18:07<14:08,  5.51s/it][A

	loss_cls: tensor(0.4049, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4693, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8742, device='cuda:0', grad_fn=<AddBackward0>)



 57%|█████▋    | 201/354 [18:12<14:03,  5.51s/it][A

	loss_cls: tensor(0.5359, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1939, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7297, device='cuda:0', grad_fn=<AddBackward0>)



 57%|█████▋    | 202/354 [18:18<14:00,  5.53s/it][A

	loss_cls: tensor(0.4001, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2567, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6568, device='cuda:0', grad_fn=<AddBackward0>)



 57%|█████▋    | 203/354 [18:23<13:53,  5.52s/it][A

	loss_cls: tensor(0.6871, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0850, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7721, device='cuda:0', grad_fn=<AddBackward0>)



 58%|█████▊    | 204/354 [18:29<13:46,  5.51s/it][A

	loss_cls: tensor(0.6830, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1664, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8494, device='cuda:0', grad_fn=<AddBackward0>)



 58%|█████▊    | 205/354 [18:34<13:43,  5.53s/it][A

	loss_cls: tensor(0.6405, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4504, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0909, device='cuda:0', grad_fn=<AddBackward0>)



 58%|█████▊    | 206/354 [18:40<13:36,  5.52s/it][A

	loss_cls: tensor(0.6351, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3908, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0259, device='cuda:0', grad_fn=<AddBackward0>)



 58%|█████▊    | 207/354 [18:45<13:32,  5.53s/it][A

	loss_cls: tensor(0.4645, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1707, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6352, device='cuda:0', grad_fn=<AddBackward0>)



 59%|█████▉    | 208/354 [18:51<13:25,  5.52s/it][A

	loss_cls: tensor(0.6305, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1828, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8133, device='cuda:0', grad_fn=<AddBackward0>)



 59%|█████▉    | 209/354 [18:56<13:22,  5.53s/it][A

	loss_cls: tensor(0.5647, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1301, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6949, device='cuda:0', grad_fn=<AddBackward0>)



 59%|█████▉    | 210/354 [19:02<13:15,  5.52s/it][A

	loss_cls: tensor(0.5344, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1409, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6753, device='cuda:0', grad_fn=<AddBackward0>)



 60%|█████▉    | 211/354 [19:07<12:42,  5.33s/it][A

	loss_cls: tensor(0.4183, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1817, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6000, device='cuda:0', grad_fn=<AddBackward0>)



 60%|█████▉    | 212/354 [19:12<12:46,  5.40s/it][A

	loss_cls: tensor(0.9072, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1043, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0115, device='cuda:0', grad_fn=<AddBackward0>)



 60%|██████    | 213/354 [19:18<12:44,  5.42s/it][A

	loss_cls: tensor(0.4951, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4561, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9512, device='cuda:0', grad_fn=<AddBackward0>)



 60%|██████    | 214/354 [19:23<12:46,  5.47s/it][A

	loss_cls: tensor(0.6712, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1675, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8387, device='cuda:0', grad_fn=<AddBackward0>)



 61%|██████    | 215/354 [19:29<12:42,  5.49s/it][A

	loss_cls: tensor(0.6536, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0773, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7309, device='cuda:0', grad_fn=<AddBackward0>)



 61%|██████    | 216/354 [19:34<12:39,  5.51s/it][A

	loss_cls: tensor(0.4794, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0613, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5407, device='cuda:0', grad_fn=<AddBackward0>)



 61%|██████▏   | 217/354 [19:40<12:33,  5.50s/it][A

	loss_cls: tensor(0.6297, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1875, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8172, device='cuda:0', grad_fn=<AddBackward0>)



 62%|██████▏   | 218/354 [19:45<12:27,  5.50s/it][A

	loss_cls: tensor(0.5874, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1055, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6930, device='cuda:0', grad_fn=<AddBackward0>)



 62%|██████▏   | 219/354 [19:51<12:25,  5.52s/it][A

	loss_cls: tensor(0.4904, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2772, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7676, device='cuda:0', grad_fn=<AddBackward0>)



 62%|██████▏   | 220/354 [19:56<12:18,  5.51s/it][A

	loss_cls: tensor(1.1045, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1959, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.3004, device='cuda:0', grad_fn=<AddBackward0>)



 62%|██████▏   | 221/354 [20:02<12:15,  5.53s/it][A

	loss_cls: tensor(1.1596, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1936, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.3531, device='cuda:0', grad_fn=<AddBackward0>)



 63%|██████▎   | 222/354 [20:08<12:09,  5.52s/it][A

	loss_cls: tensor(0.5889, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3083, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8971, device='cuda:0', grad_fn=<AddBackward0>)



 63%|██████▎   | 223/354 [20:13<12:05,  5.54s/it][A

	loss_cls: tensor(0.6665, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1987, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8652, device='cuda:0', grad_fn=<AddBackward0>)



 63%|██████▎   | 224/354 [20:19<11:57,  5.52s/it][A

	loss_cls: tensor(0.3446, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0338, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.3784, device='cuda:0', grad_fn=<AddBackward0>)



 64%|██████▎   | 225/354 [20:24<11:48,  5.49s/it][A

	loss_cls: tensor(0.4571, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2005, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6575, device='cuda:0', grad_fn=<AddBackward0>)



 64%|██████▍   | 226/354 [20:30<11:45,  5.51s/it][A

	loss_cls: tensor(0.5711, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3180, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8890, device='cuda:0', grad_fn=<AddBackward0>)



 64%|██████▍   | 227/354 [20:35<11:37,  5.49s/it][A

	loss_cls: tensor(0.6235, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2052, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8287, device='cuda:0', grad_fn=<AddBackward0>)



 64%|██████▍   | 228/354 [20:41<11:32,  5.50s/it][A

	loss_cls: tensor(0.7437, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3845, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1282, device='cuda:0', grad_fn=<AddBackward0>)



 65%|██████▍   | 229/354 [20:46<11:26,  5.49s/it][A

	loss_cls: tensor(0.7833, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2453, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0286, device='cuda:0', grad_fn=<AddBackward0>)



 65%|██████▍   | 230/354 [20:52<11:22,  5.50s/it][A

	loss_cls: tensor(0.4559, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0942, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5500, device='cuda:0', grad_fn=<AddBackward0>)



 65%|██████▌   | 231/354 [20:57<11:15,  5.49s/it][A

	loss_cls: tensor(0.4971, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2824, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7795, device='cuda:0', grad_fn=<AddBackward0>)



 66%|██████▌   | 232/354 [21:02<11:08,  5.48s/it][A

	loss_cls: tensor(0.6247, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0404, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6650, device='cuda:0', grad_fn=<AddBackward0>)



 66%|██████▌   | 233/354 [21:08<11:05,  5.50s/it][A

	loss_cls: tensor(0.5825, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2711, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8536, device='cuda:0', grad_fn=<AddBackward0>)



 66%|██████▌   | 234/354 [21:13<10:59,  5.49s/it][A

	loss_cls: tensor(0.5917, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1243, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7161, device='cuda:0', grad_fn=<AddBackward0>)



 66%|██████▋   | 235/354 [21:19<10:55,  5.51s/it][A

	loss_cls: tensor(0.4634, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1326, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5960, device='cuda:0', grad_fn=<AddBackward0>)



 67%|██████▋   | 236/354 [21:24<10:47,  5.49s/it][A

	loss_cls: tensor(0.5778, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2417, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8195, device='cuda:0', grad_fn=<AddBackward0>)



 67%|██████▋   | 237/354 [21:30<10:43,  5.50s/it][A

	loss_cls: tensor(0.6694, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0452, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7146, device='cuda:0', grad_fn=<AddBackward0>)



 67%|██████▋   | 238/354 [21:35<10:36,  5.48s/it][A

	loss_cls: tensor(0.8090, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1865, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9955, device='cuda:0', grad_fn=<AddBackward0>)



 68%|██████▊   | 239/354 [21:41<10:29,  5.48s/it][A

	loss_cls: tensor(0.4673, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3125, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7798, device='cuda:0', grad_fn=<AddBackward0>)



 68%|██████▊   | 240/354 [21:46<10:25,  5.49s/it][A

	loss_cls: tensor(0.6167, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1741, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7908, device='cuda:0', grad_fn=<AddBackward0>)



 68%|██████▊   | 241/354 [21:52<10:19,  5.49s/it][A

	loss_cls: tensor(0.6058, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2447, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8505, device='cuda:0', grad_fn=<AddBackward0>)



 68%|██████▊   | 242/354 [21:57<10:17,  5.51s/it][A

	loss_cls: tensor(0.3388, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1780, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5168, device='cuda:0', grad_fn=<AddBackward0>)



 69%|██████▊   | 243/354 [22:03<10:10,  5.50s/it][A

	loss_cls: tensor(0.7509, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0377, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7886, device='cuda:0', grad_fn=<AddBackward0>)



 69%|██████▉   | 244/354 [22:08<10:05,  5.51s/it][A

	loss_cls: tensor(0.5929, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2174, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8103, device='cuda:0', grad_fn=<AddBackward0>)



 69%|██████▉   | 245/354 [22:14<09:59,  5.50s/it][A

	loss_cls: tensor(0.5892, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0405, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6298, device='cuda:0', grad_fn=<AddBackward0>)



 69%|██████▉   | 246/354 [22:19<09:51,  5.48s/it][A

	loss_cls: tensor(0.6096, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3361, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9457, device='cuda:0', grad_fn=<AddBackward0>)



 70%|██████▉   | 247/354 [22:25<09:48,  5.50s/it][A

	loss_cls: tensor(0.8215, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5771, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.3986, device='cuda:0', grad_fn=<AddBackward0>)



 70%|███████   | 248/354 [22:30<09:41,  5.48s/it][A

	loss_cls: tensor(0.7187, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2416, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9604, device='cuda:0', grad_fn=<AddBackward0>)



 70%|███████   | 249/354 [22:36<09:38,  5.51s/it][A

	loss_cls: tensor(0.3804, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1766, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5570, device='cuda:0', grad_fn=<AddBackward0>)



 71%|███████   | 250/354 [22:41<09:30,  5.49s/it][A

	loss_cls: tensor(0.4312, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2492, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6804, device='cuda:0', grad_fn=<AddBackward0>)



 71%|███████   | 251/354 [22:47<09:24,  5.48s/it][A

	loss_cls: tensor(0.5126, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0952, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6078, device='cuda:0', grad_fn=<AddBackward0>)



 71%|███████   | 252/354 [22:52<09:17,  5.47s/it][A

	loss_cls: tensor(0.5597, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2126, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7723, device='cuda:0', grad_fn=<AddBackward0>)



 71%|███████▏  | 253/354 [22:58<09:11,  5.46s/it][A

	loss_cls: tensor(0.4973, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4794, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9767, device='cuda:0', grad_fn=<AddBackward0>)



 72%|███████▏  | 254/354 [23:03<09:08,  5.48s/it][A

	loss_cls: tensor(0.5213, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1059, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6273, device='cuda:0', grad_fn=<AddBackward0>)



 72%|███████▏  | 255/354 [23:09<09:02,  5.48s/it][A

	loss_cls: tensor(0.6175, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1225, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7399, device='cuda:0', grad_fn=<AddBackward0>)



 72%|███████▏  | 256/354 [23:14<08:59,  5.50s/it][A

	loss_cls: tensor(0.3380, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1122, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4502, device='cuda:0', grad_fn=<AddBackward0>)



 73%|███████▎  | 257/354 [23:20<08:51,  5.48s/it][A

	loss_cls: tensor(0.5569, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1247, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6815, device='cuda:0', grad_fn=<AddBackward0>)



 73%|███████▎  | 258/354 [23:25<08:47,  5.50s/it][A

	loss_cls: tensor(0.6241, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1475, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7716, device='cuda:0', grad_fn=<AddBackward0>)



 73%|███████▎  | 259/354 [23:31<08:40,  5.48s/it][A

	loss_cls: tensor(0.5531, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2570, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8101, device='cuda:0', grad_fn=<AddBackward0>)



 73%|███████▎  | 260/354 [23:36<08:34,  5.47s/it][A

	loss_cls: tensor(0.6174, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2542, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8717, device='cuda:0', grad_fn=<AddBackward0>)



 74%|███████▎  | 261/354 [23:42<08:31,  5.50s/it][A

	loss_cls: tensor(0.6772, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2610, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9382, device='cuda:0', grad_fn=<AddBackward0>)



 74%|███████▍  | 262/354 [23:47<08:25,  5.50s/it][A

	loss_cls: tensor(0.7834, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4533, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2367, device='cuda:0', grad_fn=<AddBackward0>)



 74%|███████▍  | 263/354 [23:53<08:23,  5.53s/it][A

	loss_cls: tensor(0.8988, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1451, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0439, device='cuda:0', grad_fn=<AddBackward0>)



 75%|███████▍  | 264/354 [23:58<08:16,  5.51s/it][A

	loss_cls: tensor(0.4907, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1579, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6486, device='cuda:0', grad_fn=<AddBackward0>)



 75%|███████▍  | 265/354 [24:04<08:11,  5.52s/it][A

	loss_cls: tensor(0.4948, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1866, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6814, device='cuda:0', grad_fn=<AddBackward0>)



 75%|███████▌  | 266/354 [24:09<08:04,  5.50s/it][A

	loss_cls: tensor(0.7790, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3232, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1022, device='cuda:0', grad_fn=<AddBackward0>)



 75%|███████▌  | 267/354 [24:15<07:57,  5.49s/it][A

	loss_cls: tensor(0.6074, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2123, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8196, device='cuda:0', grad_fn=<AddBackward0>)



 76%|███████▌  | 268/354 [24:20<07:53,  5.51s/it][A

	loss_cls: tensor(0.6308, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3693, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0002, device='cuda:0', grad_fn=<AddBackward0>)



 76%|███████▌  | 269/354 [24:26<07:47,  5.50s/it][A

	loss_cls: tensor(0.5652, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1020, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6672, device='cuda:0', grad_fn=<AddBackward0>)



 76%|███████▋  | 270/354 [24:31<07:43,  5.52s/it][A

	loss_cls: tensor(0.5301, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2974, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8275, device='cuda:0', grad_fn=<AddBackward0>)



 77%|███████▋  | 271/354 [24:37<07:36,  5.50s/it][A

	loss_cls: tensor(0.5404, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1181, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6585, device='cuda:0', grad_fn=<AddBackward0>)



 77%|███████▋  | 272/354 [24:42<07:31,  5.51s/it][A

	loss_cls: tensor(0.6292, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4179, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0471, device='cuda:0', grad_fn=<AddBackward0>)



 77%|███████▋  | 273/354 [24:48<07:23,  5.48s/it][A

	loss_cls: tensor(0.5327, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1858, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7185, device='cuda:0', grad_fn=<AddBackward0>)



 77%|███████▋  | 274/354 [24:53<07:19,  5.49s/it][A

	loss_cls: tensor(0.3912, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2992, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6904, device='cuda:0', grad_fn=<AddBackward0>)



 78%|███████▊  | 275/354 [24:59<07:13,  5.49s/it][A

	loss_cls: tensor(0.7440, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3447, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0887, device='cuda:0', grad_fn=<AddBackward0>)



 78%|███████▊  | 276/354 [25:04<07:07,  5.48s/it][A

	loss_cls: tensor(0.5708, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1380, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7088, device='cuda:0', grad_fn=<AddBackward0>)



 78%|███████▊  | 277/354 [25:10<07:03,  5.50s/it][A

	loss_cls: tensor(0.5690, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2215, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7905, device='cuda:0', grad_fn=<AddBackward0>)



 79%|███████▊  | 278/354 [25:15<06:57,  5.49s/it][A

	loss_cls: tensor(0.5762, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2520, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8282, device='cuda:0', grad_fn=<AddBackward0>)



 79%|███████▉  | 279/354 [25:21<06:53,  5.51s/it][A

	loss_cls: tensor(0.8923, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2861, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1784, device='cuda:0', grad_fn=<AddBackward0>)



 79%|███████▉  | 280/354 [25:26<06:46,  5.49s/it][A

	loss_cls: tensor(0.4900, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1598, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6497, device='cuda:0', grad_fn=<AddBackward0>)



 79%|███████▉  | 281/354 [25:32<06:42,  5.51s/it][A

	loss_cls: tensor(0.5974, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3170, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9144, device='cuda:0', grad_fn=<AddBackward0>)



 80%|███████▉  | 282/354 [25:37<06:36,  5.50s/it][A

	loss_cls: tensor(0.5307, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0502, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5810, device='cuda:0', grad_fn=<AddBackward0>)



 80%|███████▉  | 283/354 [25:43<06:30,  5.50s/it][A

	loss_cls: tensor(0.4983, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3019, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8002, device='cuda:0', grad_fn=<AddBackward0>)



 80%|████████  | 284/354 [25:48<06:26,  5.51s/it][A

	loss_cls: tensor(0.6553, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1605, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8158, device='cuda:0', grad_fn=<AddBackward0>)



 81%|████████  | 285/354 [25:54<06:19,  5.50s/it][A

	loss_cls: tensor(0.6012, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1857, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7870, device='cuda:0', grad_fn=<AddBackward0>)



 81%|████████  | 286/354 [25:59<06:14,  5.51s/it][A

	loss_cls: tensor(0.4341, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2843, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7184, device='cuda:0', grad_fn=<AddBackward0>)



 81%|████████  | 287/354 [26:05<06:08,  5.49s/it][A

	loss_cls: tensor(0.7150, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3271, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0421, device='cuda:0', grad_fn=<AddBackward0>)



 81%|████████▏ | 288/354 [26:10<06:03,  5.51s/it][A

	loss_cls: tensor(0.7918, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3472, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1390, device='cuda:0', grad_fn=<AddBackward0>)



 82%|████████▏ | 289/354 [26:16<05:57,  5.50s/it][A

	loss_cls: tensor(0.7891, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1023, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8913, device='cuda:0', grad_fn=<AddBackward0>)



 82%|████████▏ | 290/354 [26:21<05:51,  5.49s/it][A

	loss_cls: tensor(0.5923, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1101, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7024, device='cuda:0', grad_fn=<AddBackward0>)



 82%|████████▏ | 291/354 [26:27<05:47,  5.51s/it][A

	loss_cls: tensor(0.4704, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5176, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9881, device='cuda:0', grad_fn=<AddBackward0>)



 82%|████████▏ | 292/354 [26:32<05:40,  5.50s/it][A

	loss_cls: tensor(0.5114, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2278, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7392, device='cuda:0', grad_fn=<AddBackward0>)



 83%|████████▎ | 293/354 [26:38<05:35,  5.51s/it][A

	loss_cls: tensor(0.9596, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1108, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0704, device='cuda:0', grad_fn=<AddBackward0>)



 83%|████████▎ | 294/354 [26:43<05:29,  5.49s/it][A

	loss_cls: tensor(0.4378, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1447, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5825, device='cuda:0', grad_fn=<AddBackward0>)



 83%|████████▎ | 295/354 [26:49<05:25,  5.52s/it][A

	loss_cls: tensor(0.5282, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5236, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0518, device='cuda:0', grad_fn=<AddBackward0>)



 84%|████████▎ | 296/354 [26:54<05:18,  5.49s/it][A

	loss_cls: tensor(0.3515, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3338, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6853, device='cuda:0', grad_fn=<AddBackward0>)



 84%|████████▍ | 297/354 [27:00<05:11,  5.47s/it][A

	loss_cls: tensor(0.5580, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2723, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8302, device='cuda:0', grad_fn=<AddBackward0>)



 84%|████████▍ | 298/354 [27:05<05:07,  5.49s/it][A

	loss_cls: tensor(0.7034, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1918, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8952, device='cuda:0', grad_fn=<AddBackward0>)



 84%|████████▍ | 299/354 [27:11<05:01,  5.48s/it][A

	loss_cls: tensor(0.4585, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3286, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7872, device='cuda:0', grad_fn=<AddBackward0>)



 85%|████████▍ | 300/354 [27:16<04:56,  5.49s/it][A

	loss_cls: tensor(0.4880, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0915, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5795, device='cuda:0', grad_fn=<AddBackward0>)



 85%|████████▌ | 301/354 [27:22<04:50,  5.47s/it][A

	loss_cls: tensor(0.5795, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2975, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8769, device='cuda:0', grad_fn=<AddBackward0>)



 85%|████████▌ | 302/354 [27:27<04:46,  5.50s/it][A

	loss_cls: tensor(0.4316, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3454, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7770, device='cuda:0', grad_fn=<AddBackward0>)



 86%|████████▌ | 303/354 [27:33<04:40,  5.50s/it][A

	loss_cls: tensor(0.6267, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4423, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0689, device='cuda:0', grad_fn=<AddBackward0>)



 86%|████████▌ | 304/354 [27:38<04:35,  5.50s/it][A

	loss_cls: tensor(0.6832, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3072, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9904, device='cuda:0', grad_fn=<AddBackward0>)



 86%|████████▌ | 305/354 [27:44<04:30,  5.52s/it][A

	loss_cls: tensor(0.5176, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0719, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5895, device='cuda:0', grad_fn=<AddBackward0>)



 86%|████████▋ | 306/354 [27:49<04:24,  5.50s/it][A

	loss_cls: tensor(0.6671, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1464, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8135, device='cuda:0', grad_fn=<AddBackward0>)



 87%|████████▋ | 307/354 [27:55<04:19,  5.52s/it][A

	loss_cls: tensor(0.5051, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0599, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5650, device='cuda:0', grad_fn=<AddBackward0>)



 87%|████████▋ | 308/354 [28:00<04:13,  5.50s/it][A

	loss_cls: tensor(0.4107, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3174, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7281, device='cuda:0', grad_fn=<AddBackward0>)



 87%|████████▋ | 309/354 [28:06<04:08,  5.52s/it][A

	loss_cls: tensor(0.5710, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2577, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8286, device='cuda:0', grad_fn=<AddBackward0>)



 88%|████████▊ | 310/354 [28:11<04:02,  5.51s/it][A

	loss_cls: tensor(0.6672, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0591, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7263, device='cuda:0', grad_fn=<AddBackward0>)



 88%|████████▊ | 311/354 [28:17<03:56,  5.49s/it][A

	loss_cls: tensor(0.5512, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1855, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7367, device='cuda:0', grad_fn=<AddBackward0>)



 88%|████████▊ | 312/354 [28:22<03:50,  5.50s/it][A

	loss_cls: tensor(0.8430, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3008, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1438, device='cuda:0', grad_fn=<AddBackward0>)



 88%|████████▊ | 313/354 [28:28<03:44,  5.48s/it][A

	loss_cls: tensor(0.4719, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1041, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5761, device='cuda:0', grad_fn=<AddBackward0>)



 89%|████████▊ | 314/354 [28:33<03:40,  5.50s/it][A

	loss_cls: tensor(0.4795, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3511, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8306, device='cuda:0', grad_fn=<AddBackward0>)



 89%|████████▉ | 315/354 [28:39<03:34,  5.49s/it][A

	loss_cls: tensor(0.6272, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3472, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9745, device='cuda:0', grad_fn=<AddBackward0>)



 89%|████████▉ | 316/354 [28:44<03:29,  5.51s/it][A

	loss_cls: tensor(0.3375, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0964, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4339, device='cuda:0', grad_fn=<AddBackward0>)



 90%|████████▉ | 317/354 [28:50<03:23,  5.50s/it][A

	loss_cls: tensor(0.4598, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3764, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8362, device='cuda:0', grad_fn=<AddBackward0>)



 90%|████████▉ | 318/354 [28:55<03:17,  5.48s/it][A

	loss_cls: tensor(0.3902, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1177, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5079, device='cuda:0', grad_fn=<AddBackward0>)



 90%|█████████ | 319/354 [29:01<03:11,  5.48s/it][A

	loss_cls: tensor(0.5227, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0345, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5572, device='cuda:0', grad_fn=<AddBackward0>)



 90%|█████████ | 320/354 [29:06<03:05,  5.45s/it][A

	loss_cls: tensor(0.7986, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2434, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0420, device='cuda:0', grad_fn=<AddBackward0>)



 91%|█████████ | 321/354 [29:12<03:00,  5.48s/it][A

	loss_cls: tensor(0.4440, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1126, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5566, device='cuda:0', grad_fn=<AddBackward0>)



 91%|█████████ | 322/354 [29:17<02:55,  5.47s/it][A

	loss_cls: tensor(0.7233, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2396, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9629, device='cuda:0', grad_fn=<AddBackward0>)



 91%|█████████ | 323/354 [29:23<02:50,  5.49s/it][A

	loss_cls: tensor(0.6607, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1317, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7924, device='cuda:0', grad_fn=<AddBackward0>)



 92%|█████████▏| 324/354 [29:28<02:44,  5.48s/it][A

	loss_cls: tensor(0.5946, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2759, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8705, device='cuda:0', grad_fn=<AddBackward0>)



 92%|█████████▏| 325/354 [29:34<02:38,  5.48s/it][A

	loss_cls: tensor(0.6632, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4692, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1324, device='cuda:0', grad_fn=<AddBackward0>)



 92%|█████████▏| 326/354 [29:39<02:33,  5.49s/it][A

	loss_cls: tensor(0.8232, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2065, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0297, device='cuda:0', grad_fn=<AddBackward0>)



 92%|█████████▏| 327/354 [29:44<02:27,  5.47s/it][A

	loss_cls: tensor(0.7340, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2409, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9749, device='cuda:0', grad_fn=<AddBackward0>)



 93%|█████████▎| 328/354 [29:50<02:22,  5.50s/it][A

	loss_cls: tensor(0.5525, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0832, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6357, device='cuda:0', grad_fn=<AddBackward0>)



 93%|█████████▎| 329/354 [29:55<02:17,  5.48s/it][A

	loss_cls: tensor(0.4071, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1074, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5144, device='cuda:0', grad_fn=<AddBackward0>)



 93%|█████████▎| 330/354 [30:01<02:11,  5.50s/it][A

	loss_cls: tensor(0.4127, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0585, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4712, device='cuda:0', grad_fn=<AddBackward0>)



 94%|█████████▎| 331/354 [30:06<02:06,  5.49s/it][A

	loss_cls: tensor(0.4380, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0891, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5271, device='cuda:0', grad_fn=<AddBackward0>)



 94%|█████████▍| 332/354 [30:12<02:00,  5.48s/it][A

	loss_cls: tensor(0.6611, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2758, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9369, device='cuda:0', grad_fn=<AddBackward0>)



 94%|█████████▍| 333/354 [30:17<01:55,  5.50s/it][A

	loss_cls: tensor(0.3445, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0883, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4327, device='cuda:0', grad_fn=<AddBackward0>)



 94%|█████████▍| 334/354 [30:23<01:49,  5.49s/it][A

	loss_cls: tensor(0.6120, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1877, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7997, device='cuda:0', grad_fn=<AddBackward0>)



 95%|█████████▍| 335/354 [30:29<01:44,  5.52s/it][A

	loss_cls: tensor(0.7002, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1345, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8347, device='cuda:0', grad_fn=<AddBackward0>)



 95%|█████████▍| 336/354 [30:34<01:39,  5.51s/it][A

	loss_cls: tensor(0.6925, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1022, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7947, device='cuda:0', grad_fn=<AddBackward0>)



 95%|█████████▌| 337/354 [30:40<01:33,  5.52s/it][A

	loss_cls: tensor(0.4046, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1193, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5239, device='cuda:0', grad_fn=<AddBackward0>)



 95%|█████████▌| 338/354 [30:45<01:27,  5.49s/it][A

	loss_cls: tensor(0.4197, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0347, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4544, device='cuda:0', grad_fn=<AddBackward0>)



 96%|█████████▌| 339/354 [30:50<01:22,  5.48s/it][A

	loss_cls: tensor(0.7482, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1850, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9332, device='cuda:0', grad_fn=<AddBackward0>)



 96%|█████████▌| 340/354 [30:56<01:16,  5.50s/it][A

	loss_cls: tensor(0.5606, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3407, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9013, device='cuda:0', grad_fn=<AddBackward0>)



 96%|█████████▋| 341/354 [31:01<01:11,  5.49s/it][A

	loss_cls: tensor(0.9437, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1530, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0967, device='cuda:0', grad_fn=<AddBackward0>)



 97%|█████████▋| 342/354 [31:07<01:06,  5.56s/it][A

	loss_cls: tensor(0.7663, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2212, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9875, device='cuda:0', grad_fn=<AddBackward0>)



 97%|█████████▋| 343/354 [31:13<01:00,  5.53s/it][A

	loss_cls: tensor(0.4649, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1792, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6441, device='cuda:0', grad_fn=<AddBackward0>)



 97%|█████████▋| 344/354 [31:18<00:55,  5.53s/it][A

	loss_cls: tensor(0.4913, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2373, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7285, device='cuda:0', grad_fn=<AddBackward0>)



 97%|█████████▋| 345/354 [31:24<00:49,  5.52s/it][A

	loss_cls: tensor(0.7283, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1266, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8550, device='cuda:0', grad_fn=<AddBackward0>)



 98%|█████████▊| 346/354 [31:29<00:43,  5.50s/it][A

	loss_cls: tensor(0.6023, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1010, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7033, device='cuda:0', grad_fn=<AddBackward0>)



 98%|█████████▊| 347/354 [31:35<00:38,  5.51s/it][A

	loss_cls: tensor(0.4629, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0888, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5518, device='cuda:0', grad_fn=<AddBackward0>)



 98%|█████████▊| 348/354 [31:40<00:32,  5.50s/it][A

	loss_cls: tensor(0.4173, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1348, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5521, device='cuda:0', grad_fn=<AddBackward0>)



 99%|█████████▊| 349/354 [31:46<00:27,  5.52s/it][A

	loss_cls: tensor(0.5313, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3745, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9058, device='cuda:0', grad_fn=<AddBackward0>)



 99%|█████████▉| 350/354 [31:51<00:22,  5.51s/it][A

	loss_cls: tensor(0.7735, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2603, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0338, device='cuda:0', grad_fn=<AddBackward0>)



 99%|█████████▉| 351/354 [31:57<00:16,  5.53s/it][A

	loss_cls: tensor(0.6479, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4671, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1150, device='cuda:0', grad_fn=<AddBackward0>)



 99%|█████████▉| 352/354 [32:02<00:11,  5.53s/it][A

	loss_cls: tensor(0.9323, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2892, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2215, device='cuda:0', grad_fn=<AddBackward0>)



100%|█████████▉| 353/354 [32:08<00:05,  5.51s/it][A

	loss_cls: tensor(0.5939, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1746, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7685, device='cuda:0', grad_fn=<AddBackward0>)



100%|██████████| 354/354 [32:10<00:00,  5.45s/it][A

  0%|          | 0/177 [00:00<?, ?it/s][A

	loss_cls: tensor(0.3539, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0439, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.3978, device='cuda:0', grad_fn=<AddBackward0>)

	Training Loss: 0.8190844868872799

	Training cls acc: 0.6995056497175142

	Training cls prec: 0.5764192532201007

	Training cls rec: 0.6117358747655358

	Training cls f1: 0.542966477745508

--
	Training ner acc: 0.9556640386201961

	Training ner prec: 0.27318799188210846

	Training ner rec: 0.2811096251422396

	Training ner f1: 0.2765932442472858

	Current Learning rate:  0.0002



  1%|          | 1/177 [00:00<02:19,  1.27it/s][A
  1%|          | 2/177 [00:01<02:06,  1.39it/s][A
  2%|▏         | 3/177 [00:02<02:08,  1.36it/s][A
  2%|▏         | 4/177 [00:02<02:08,  1.34it/s][A
  3%|▎         | 5/177 [00:03<02:04,  1.38it/s][A
  3%|▎         | 6/177 [00:04<02:05,  1.36it/s][A
  4%|▍         | 7/177 [00:05<02:05,  1.35it/s][A
  5%|▍         | 8/177 [00:05<02:05,  1.34it/s][A
  5%|▌         | 9/177 [00:06<02:01,  1.38it/s][A
  6%|▌         | 10/177 [00:07<02:02,  1.37it/s][A
  6%|▌         | 11/177 [00:08<02:02,  1.35it/s][A
  7%|▋         | 12/177 [00:08<02:02,  1.34it/s][A
  7%|▋         | 13/177 [00:09<01:58,  1.38it/s][A
  8%|▊         | 14/177 [00:10<01:59,  1.36it/s][A
  8%|▊         | 15/177 [00:11<02:00,  1.34it/s][A
  9%|▉         | 16/177 [00:11<01:56,  1.38it/s][A
 10%|▉         | 17/177 [00:12<01:56,  1.37it/s][A
 10%|█         | 18/177 [00:13<01:57,  1.36it/s][A
 11%|█         | 19/177 [00:13<01:57,  1.35it/s][A
 11%|█▏        | 20/


	Validation Loss: 0.783369429788347

	Validation cls acc: 0.7186911487758945

	Validation cls prec: 0.6104015334947538

	Validation cls rec: 0.5972693032015066

	Validation cls f1: 0.5623131419741589

--
	Validation ner acc: 0.9541642795784374

	Validation ner prec: 0.41045530603842945

	Validation ner rec: 0.4209981167608287

	Validation ner f1: 0.4155066781683935



  0%|          | 1/354 [00:05<32:41,  5.56s/it][A

	loss_cls: tensor(0.5149, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.6372, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1521, device='cuda:0', grad_fn=<AddBackward0>)



  1%|          | 2/354 [00:11<32:22,  5.52s/it][A

	loss_cls: tensor(0.5255, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0874, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6130, device='cuda:0', grad_fn=<AddBackward0>)



  1%|          | 3/354 [00:16<32:21,  5.53s/it][A

	loss_cls: tensor(0.5314, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2715, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8029, device='cuda:0', grad_fn=<AddBackward0>)



  1%|          | 4/354 [00:22<32:12,  5.52s/it][A

	loss_cls: tensor(0.5568, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3938, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9506, device='cuda:0', grad_fn=<AddBackward0>)



  1%|▏         | 5/354 [00:27<32:14,  5.54s/it][A

	loss_cls: tensor(0.5520, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0841, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6362, device='cuda:0', grad_fn=<AddBackward0>)



  2%|▏         | 6/354 [00:33<32:09,  5.54s/it][A

	loss_cls: tensor(0.7040, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3656, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0696, device='cuda:0', grad_fn=<AddBackward0>)



  2%|▏         | 7/354 [00:38<31:38,  5.47s/it][A

	loss_cls: tensor(0.4888, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3591, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8479, device='cuda:0', grad_fn=<AddBackward0>)



  2%|▏         | 8/354 [00:43<30:35,  5.30s/it][A

	loss_cls: tensor(0.5564, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2203, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7767, device='cuda:0', grad_fn=<AddBackward0>)



  3%|▎         | 9/354 [00:48<30:20,  5.28s/it][A

	loss_cls: tensor(0.5033, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2103, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7136, device='cuda:0', grad_fn=<AddBackward0>)



  3%|▎         | 10/354 [00:54<30:47,  5.37s/it][A

	loss_cls: tensor(0.7690, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1293, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8984, device='cuda:0', grad_fn=<AddBackward0>)



  3%|▎         | 11/354 [00:59<30:54,  5.41s/it][A

	loss_cls: tensor(0.4779, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0581, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5361, device='cuda:0', grad_fn=<AddBackward0>)



  3%|▎         | 12/354 [01:05<30:51,  5.41s/it][A

	loss_cls: tensor(0.4220, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2604, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6824, device='cuda:0', grad_fn=<AddBackward0>)



  4%|▎         | 13/354 [01:10<30:56,  5.44s/it][A

	loss_cls: tensor(0.3736, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1290, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5026, device='cuda:0', grad_fn=<AddBackward0>)



  4%|▍         | 14/354 [01:16<30:55,  5.46s/it][A

	loss_cls: tensor(0.4462, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0599, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5060, device='cuda:0', grad_fn=<AddBackward0>)



  4%|▍         | 15/354 [01:21<31:01,  5.49s/it][A

	loss_cls: tensor(0.5138, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1429, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6566, device='cuda:0', grad_fn=<AddBackward0>)



  5%|▍         | 16/354 [01:27<30:54,  5.49s/it][A

	loss_cls: tensor(0.5529, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0902, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6432, device='cuda:0', grad_fn=<AddBackward0>)



  5%|▍         | 17/354 [01:32<31:00,  5.52s/it][A

	loss_cls: tensor(0.6692, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5461, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2152, device='cuda:0', grad_fn=<AddBackward0>)



  5%|▌         | 18/354 [01:38<30:53,  5.52s/it][A

	loss_cls: tensor(0.5414, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0937, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6351, device='cuda:0', grad_fn=<AddBackward0>)



  5%|▌         | 19/354 [01:43<30:54,  5.54s/it][A

	loss_cls: tensor(0.6538, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2558, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9096, device='cuda:0', grad_fn=<AddBackward0>)



  6%|▌         | 20/354 [01:49<30:45,  5.52s/it][A

	loss_cls: tensor(0.5047, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1422, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6469, device='cuda:0', grad_fn=<AddBackward0>)



  6%|▌         | 21/354 [01:54<30:35,  5.51s/it][A

	loss_cls: tensor(0.4952, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3404, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8356, device='cuda:0', grad_fn=<AddBackward0>)



  6%|▌         | 22/354 [02:00<30:33,  5.52s/it][A

	loss_cls: tensor(0.6130, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2528, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8658, device='cuda:0', grad_fn=<AddBackward0>)



  6%|▋         | 23/354 [02:05<30:25,  5.51s/it][A

	loss_cls: tensor(0.4370, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3205, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7575, device='cuda:0', grad_fn=<AddBackward0>)



  7%|▋         | 24/354 [02:11<30:31,  5.55s/it][A

	loss_cls: tensor(0.3778, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1217, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4995, device='cuda:0', grad_fn=<AddBackward0>)



  7%|▋         | 25/354 [02:17<30:23,  5.54s/it][A

	loss_cls: tensor(0.5761, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3966, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9726, device='cuda:0', grad_fn=<AddBackward0>)



  7%|▋         | 26/354 [02:22<30:26,  5.57s/it][A

	loss_cls: tensor(0.4019, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1052, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5071, device='cuda:0', grad_fn=<AddBackward0>)



  8%|▊         | 27/354 [02:28<30:12,  5.54s/it][A

	loss_cls: tensor(0.7582, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2981, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0563, device='cuda:0', grad_fn=<AddBackward0>)



  8%|▊         | 28/354 [02:33<30:00,  5.52s/it][A

	loss_cls: tensor(0.6981, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3995, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0976, device='cuda:0', grad_fn=<AddBackward0>)



  8%|▊         | 29/354 [02:39<30:00,  5.54s/it][A

	loss_cls: tensor(0.4310, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2034, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6344, device='cuda:0', grad_fn=<AddBackward0>)



  8%|▊         | 30/354 [02:44<29:49,  5.52s/it][A

	loss_cls: tensor(0.5725, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2404, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8130, device='cuda:0', grad_fn=<AddBackward0>)



  9%|▉         | 31/354 [02:50<29:51,  5.55s/it][A

	loss_cls: tensor(0.6031, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1425, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7456, device='cuda:0', grad_fn=<AddBackward0>)



  9%|▉         | 32/354 [02:55<29:41,  5.53s/it][A

	loss_cls: tensor(0.5540, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1621, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7161, device='cuda:0', grad_fn=<AddBackward0>)



  9%|▉         | 33/354 [03:01<29:38,  5.54s/it][A

	loss_cls: tensor(0.4427, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1397, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5824, device='cuda:0', grad_fn=<AddBackward0>)



 10%|▉         | 34/354 [03:06<29:26,  5.52s/it][A

	loss_cls: tensor(0.5488, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1234, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6722, device='cuda:0', grad_fn=<AddBackward0>)



 10%|▉         | 35/354 [03:12<29:13,  5.50s/it][A

	loss_cls: tensor(0.8599, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3258, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1857, device='cuda:0', grad_fn=<AddBackward0>)



 10%|█         | 36/354 [03:17<29:15,  5.52s/it][A

	loss_cls: tensor(0.6001, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0878, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6879, device='cuda:0', grad_fn=<AddBackward0>)



 10%|█         | 37/354 [03:23<29:07,  5.51s/it][A

	loss_cls: tensor(0.6091, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1377, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7469, device='cuda:0', grad_fn=<AddBackward0>)



 11%|█         | 38/354 [03:29<29:10,  5.54s/it][A

	loss_cls: tensor(0.3696, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4064, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7760, device='cuda:0', grad_fn=<AddBackward0>)



 11%|█         | 39/354 [03:34<29:01,  5.53s/it][A

	loss_cls: tensor(0.7713, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2452, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0165, device='cuda:0', grad_fn=<AddBackward0>)



 11%|█▏        | 40/354 [03:40<28:59,  5.54s/it][A

	loss_cls: tensor(0.7710, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2471, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0181, device='cuda:0', grad_fn=<AddBackward0>)



 12%|█▏        | 41/354 [03:45<28:49,  5.52s/it][A

	loss_cls: tensor(0.4761, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1613, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6374, device='cuda:0', grad_fn=<AddBackward0>)



 12%|█▏        | 42/354 [03:51<28:38,  5.51s/it][A

	loss_cls: tensor(0.5190, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0913, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6103, device='cuda:0', grad_fn=<AddBackward0>)



 12%|█▏        | 43/354 [03:56<28:39,  5.53s/it][A

	loss_cls: tensor(0.6038, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0811, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6849, device='cuda:0', grad_fn=<AddBackward0>)



 12%|█▏        | 44/354 [04:02<28:31,  5.52s/it][A

	loss_cls: tensor(0.4152, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1687, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5839, device='cuda:0', grad_fn=<AddBackward0>)



 13%|█▎        | 45/354 [04:07<28:32,  5.54s/it][A

	loss_cls: tensor(0.6294, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4980, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1274, device='cuda:0', grad_fn=<AddBackward0>)



 13%|█▎        | 46/354 [04:13<28:25,  5.54s/it][A

	loss_cls: tensor(0.6501, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2312, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8813, device='cuda:0', grad_fn=<AddBackward0>)



 13%|█▎        | 47/354 [04:18<28:23,  5.55s/it][A

	loss_cls: tensor(0.5184, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3816, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9000, device='cuda:0', grad_fn=<AddBackward0>)



 14%|█▎        | 48/354 [04:24<28:10,  5.52s/it][A

	loss_cls: tensor(0.4317, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2187, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6503, device='cuda:0', grad_fn=<AddBackward0>)



 14%|█▍        | 49/354 [04:29<27:58,  5.50s/it][A

	loss_cls: tensor(0.4497, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4358, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8854, device='cuda:0', grad_fn=<AddBackward0>)



 14%|█▍        | 50/354 [04:35<27:58,  5.52s/it][A

	loss_cls: tensor(0.6001, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1596, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7597, device='cuda:0', grad_fn=<AddBackward0>)



 14%|█▍        | 51/354 [04:40<27:50,  5.51s/it][A

	loss_cls: tensor(0.8367, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0719, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9086, device='cuda:0', grad_fn=<AddBackward0>)



 15%|█▍        | 52/354 [04:46<27:51,  5.53s/it][A

	loss_cls: tensor(0.4914, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1217, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6131, device='cuda:0', grad_fn=<AddBackward0>)



 15%|█▍        | 53/354 [04:51<27:41,  5.52s/it][A

	loss_cls: tensor(0.6646, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0416, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7062, device='cuda:0', grad_fn=<AddBackward0>)



 15%|█▌        | 54/354 [04:57<27:41,  5.54s/it][A

	loss_cls: tensor(0.4583, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3027, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7610, device='cuda:0', grad_fn=<AddBackward0>)



 16%|█▌        | 55/354 [05:02<27:31,  5.52s/it][A

	loss_cls: tensor(0.7602, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4078, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1680, device='cuda:0', grad_fn=<AddBackward0>)



 16%|█▌        | 56/354 [05:08<27:21,  5.51s/it][A

	loss_cls: tensor(0.6125, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0626, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6751, device='cuda:0', grad_fn=<AddBackward0>)



 16%|█▌        | 57/354 [05:13<27:20,  5.52s/it][A

	loss_cls: tensor(0.5378, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0387, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5765, device='cuda:0', grad_fn=<AddBackward0>)



 16%|█▋        | 58/354 [05:19<27:06,  5.49s/it][A

	loss_cls: tensor(0.6261, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2621, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8882, device='cuda:0', grad_fn=<AddBackward0>)



 17%|█▋        | 59/354 [05:24<27:07,  5.52s/it][A

	loss_cls: tensor(0.8207, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1773, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9980, device='cuda:0', grad_fn=<AddBackward0>)



 17%|█▋        | 60/354 [05:30<26:58,  5.51s/it][A

	loss_cls: tensor(0.5868, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2514, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8382, device='cuda:0', grad_fn=<AddBackward0>)



 17%|█▋        | 61/354 [05:36<26:59,  5.53s/it][A

	loss_cls: tensor(0.6083, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3299, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9381, device='cuda:0', grad_fn=<AddBackward0>)



 18%|█▊        | 62/354 [05:41<26:48,  5.51s/it][A

	loss_cls: tensor(0.3953, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2346, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6299, device='cuda:0', grad_fn=<AddBackward0>)



 18%|█▊        | 63/354 [05:46<26:40,  5.50s/it][A

	loss_cls: tensor(0.4410, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.6943, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1354, device='cuda:0', grad_fn=<AddBackward0>)



 18%|█▊        | 64/354 [05:52<26:39,  5.51s/it][A

	loss_cls: tensor(0.5500, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4124, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9623, device='cuda:0', grad_fn=<AddBackward0>)



 18%|█▊        | 65/354 [05:58<26:33,  5.51s/it][A

	loss_cls: tensor(0.6806, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2320, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9126, device='cuda:0', grad_fn=<AddBackward0>)



 19%|█▊        | 66/354 [06:03<26:33,  5.53s/it][A

	loss_cls: tensor(0.6460, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2173, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8633, device='cuda:0', grad_fn=<AddBackward0>)



 19%|█▉        | 67/354 [06:09<26:23,  5.52s/it][A

	loss_cls: tensor(0.6813, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0796, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7609, device='cuda:0', grad_fn=<AddBackward0>)



 19%|█▉        | 68/354 [06:14<26:20,  5.53s/it][A

	loss_cls: tensor(0.4807, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0685, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5492, device='cuda:0', grad_fn=<AddBackward0>)



 19%|█▉        | 69/354 [06:19<25:28,  5.36s/it][A

	loss_cls: tensor(0.5343, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3413, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8756, device='cuda:0', grad_fn=<AddBackward0>)



 20%|█▉        | 70/354 [06:25<25:38,  5.42s/it][A

	loss_cls: tensor(0.9809, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.6023, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.5832, device='cuda:0', grad_fn=<AddBackward0>)



 20%|██        | 71/354 [06:30<25:40,  5.44s/it][A

	loss_cls: tensor(0.7395, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3268, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0664, device='cuda:0', grad_fn=<AddBackward0>)



 20%|██        | 72/354 [06:36<25:39,  5.46s/it][A

	loss_cls: tensor(0.7913, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1606, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9519, device='cuda:0', grad_fn=<AddBackward0>)



 21%|██        | 73/354 [06:41<25:43,  5.49s/it][A

	loss_cls: tensor(0.4936, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2611, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7548, device='cuda:0', grad_fn=<AddBackward0>)



 21%|██        | 74/354 [06:46<25:00,  5.36s/it][A

	loss_cls: tensor(0.6019, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2748, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8767, device='cuda:0', grad_fn=<AddBackward0>)



 21%|██        | 75/354 [06:52<25:13,  5.42s/it][A

	loss_cls: tensor(0.6691, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1589, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8280, device='cuda:0', grad_fn=<AddBackward0>)



 21%|██▏       | 76/354 [06:57<25:12,  5.44s/it][A

	loss_cls: tensor(0.6483, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0869, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7351, device='cuda:0', grad_fn=<AddBackward0>)



 22%|██▏       | 77/354 [07:03<25:22,  5.50s/it][A

	loss_cls: tensor(0.6813, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2259, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9072, device='cuda:0', grad_fn=<AddBackward0>)



 22%|██▏       | 78/354 [07:08<25:17,  5.50s/it][A

	loss_cls: tensor(0.4324, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2488, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6812, device='cuda:0', grad_fn=<AddBackward0>)



 22%|██▏       | 79/354 [07:14<25:11,  5.50s/it][A

	loss_cls: tensor(0.5598, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2117, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7715, device='cuda:0', grad_fn=<AddBackward0>)



 23%|██▎       | 80/354 [07:19<25:05,  5.49s/it][A

	loss_cls: tensor(0.7642, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3677, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1319, device='cuda:0', grad_fn=<AddBackward0>)



 23%|██▎       | 81/354 [07:25<24:49,  5.46s/it][A

	loss_cls: tensor(0.4241, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1157, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5398, device='cuda:0', grad_fn=<AddBackward0>)



 23%|██▎       | 82/354 [07:30<24:43,  5.45s/it][A

	loss_cls: tensor(0.5487, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1246, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6733, device='cuda:0', grad_fn=<AddBackward0>)



 23%|██▎       | 83/354 [07:36<24:28,  5.42s/it][A

	loss_cls: tensor(0.8725, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2131, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0856, device='cuda:0', grad_fn=<AddBackward0>)



 24%|██▎       | 84/354 [07:41<24:23,  5.42s/it][A

	loss_cls: tensor(0.5382, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0737, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6119, device='cuda:0', grad_fn=<AddBackward0>)



 24%|██▍       | 85/354 [07:46<24:15,  5.41s/it][A

	loss_cls: tensor(0.7881, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3275, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1156, device='cuda:0', grad_fn=<AddBackward0>)



 24%|██▍       | 86/354 [07:52<24:03,  5.39s/it][A

	loss_cls: tensor(0.4180, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0902, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5081, device='cuda:0', grad_fn=<AddBackward0>)



 25%|██▍       | 87/354 [07:57<24:01,  5.40s/it][A

	loss_cls: tensor(0.6064, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0864, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6928, device='cuda:0', grad_fn=<AddBackward0>)



 25%|██▍       | 88/354 [08:02<23:48,  5.37s/it][A

	loss_cls: tensor(0.5072, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0508, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5580, device='cuda:0', grad_fn=<AddBackward0>)



 25%|██▌       | 89/354 [08:08<23:44,  5.37s/it][A

	loss_cls: tensor(0.5288, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2340, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7628, device='cuda:0', grad_fn=<AddBackward0>)



 25%|██▌       | 90/354 [08:13<23:33,  5.35s/it][A

	loss_cls: tensor(0.6091, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2643, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8734, device='cuda:0', grad_fn=<AddBackward0>)



 26%|██▌       | 91/354 [08:19<23:33,  5.37s/it][A

	loss_cls: tensor(0.6212, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1128, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7340, device='cuda:0', grad_fn=<AddBackward0>)



 26%|██▌       | 92/354 [08:24<23:26,  5.37s/it][A

	loss_cls: tensor(0.4836, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1006, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5842, device='cuda:0', grad_fn=<AddBackward0>)



 26%|██▋       | 93/354 [08:29<23:19,  5.36s/it][A

	loss_cls: tensor(0.4307, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0310, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4617, device='cuda:0', grad_fn=<AddBackward0>)



 27%|██▋       | 94/354 [08:35<23:18,  5.38s/it][A

	loss_cls: tensor(0.6433, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4721, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1154, device='cuda:0', grad_fn=<AddBackward0>)



 27%|██▋       | 95/354 [08:40<23:11,  5.37s/it][A

	loss_cls: tensor(0.5550, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1057, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6607, device='cuda:0', grad_fn=<AddBackward0>)



 27%|██▋       | 96/354 [08:45<23:09,  5.39s/it][A

	loss_cls: tensor(0.5370, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3935, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9305, device='cuda:0', grad_fn=<AddBackward0>)



 27%|██▋       | 97/354 [08:51<23:01,  5.38s/it][A

	loss_cls: tensor(0.6871, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2186, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9058, device='cuda:0', grad_fn=<AddBackward0>)



 28%|██▊       | 98/354 [08:56<23:00,  5.39s/it][A

	loss_cls: tensor(0.5407, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3046, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8453, device='cuda:0', grad_fn=<AddBackward0>)



 28%|██▊       | 99/354 [09:02<22:51,  5.38s/it][A

	loss_cls: tensor(0.6546, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2288, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8834, device='cuda:0', grad_fn=<AddBackward0>)



 28%|██▊       | 100/354 [09:07<22:44,  5.37s/it][A

	loss_cls: tensor(0.8107, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1629, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9736, device='cuda:0', grad_fn=<AddBackward0>)



 29%|██▊       | 101/354 [09:12<22:43,  5.39s/it][A

	loss_cls: tensor(0.5036, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1365, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6401, device='cuda:0', grad_fn=<AddBackward0>)



 29%|██▉       | 102/354 [09:18<22:32,  5.37s/it][A

	loss_cls: tensor(0.6425, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1013, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7438, device='cuda:0', grad_fn=<AddBackward0>)



 29%|██▉       | 103/354 [09:23<22:31,  5.39s/it][A

	loss_cls: tensor(0.4296, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2332, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6628, device='cuda:0', grad_fn=<AddBackward0>)



 29%|██▉       | 104/354 [09:28<22:21,  5.37s/it][A

	loss_cls: tensor(0.3464, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0746, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4210, device='cuda:0', grad_fn=<AddBackward0>)



 30%|██▉       | 105/354 [09:34<22:23,  5.40s/it][A

	loss_cls: tensor(0.4521, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2524, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7045, device='cuda:0', grad_fn=<AddBackward0>)



 30%|██▉       | 106/354 [09:39<22:16,  5.39s/it][A

	loss_cls: tensor(0.6538, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0803, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7341, device='cuda:0', grad_fn=<AddBackward0>)



 30%|███       | 107/354 [09:45<22:08,  5.38s/it][A

	loss_cls: tensor(0.4168, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0357, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4525, device='cuda:0', grad_fn=<AddBackward0>)



 31%|███       | 108/354 [09:50<22:06,  5.39s/it][A

	loss_cls: tensor(0.5097, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0703, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5800, device='cuda:0', grad_fn=<AddBackward0>)



 31%|███       | 109/354 [09:55<21:57,  5.38s/it][A

	loss_cls: tensor(0.6517, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1950, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8467, device='cuda:0', grad_fn=<AddBackward0>)



 31%|███       | 110/354 [10:01<21:56,  5.40s/it][A

	loss_cls: tensor(0.5240, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1419, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6659, device='cuda:0', grad_fn=<AddBackward0>)



 31%|███▏      | 111/354 [10:06<21:49,  5.39s/it][A

	loss_cls: tensor(0.4044, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0381, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4425, device='cuda:0', grad_fn=<AddBackward0>)



 32%|███▏      | 112/354 [10:12<21:48,  5.41s/it][A

	loss_cls: tensor(0.2432, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0416, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.2848, device='cuda:0', grad_fn=<AddBackward0>)



 32%|███▏      | 113/354 [10:17<21:40,  5.39s/it][A

	loss_cls: tensor(0.5903, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3566, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9469, device='cuda:0', grad_fn=<AddBackward0>)



 32%|███▏      | 114/354 [10:22<21:31,  5.38s/it][A

	loss_cls: tensor(0.4022, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1842, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5864, device='cuda:0', grad_fn=<AddBackward0>)



 32%|███▏      | 115/354 [10:28<21:37,  5.43s/it][A

	loss_cls: tensor(0.8283, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2859, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1142, device='cuda:0', grad_fn=<AddBackward0>)



 33%|███▎      | 116/354 [10:33<21:26,  5.41s/it][A

	loss_cls: tensor(0.6163, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2453, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8616, device='cuda:0', grad_fn=<AddBackward0>)



 33%|███▎      | 117/354 [10:39<21:25,  5.42s/it][A

	loss_cls: tensor(0.9203, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3866, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.3069, device='cuda:0', grad_fn=<AddBackward0>)



 33%|███▎      | 118/354 [10:44<21:15,  5.41s/it][A

	loss_cls: tensor(0.3001, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0343, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.3344, device='cuda:0', grad_fn=<AddBackward0>)



 34%|███▎      | 119/354 [10:50<21:13,  5.42s/it][A

	loss_cls: tensor(0.6770, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1909, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8679, device='cuda:0', grad_fn=<AddBackward0>)



 34%|███▍      | 120/354 [10:55<21:03,  5.40s/it][A

	loss_cls: tensor(0.5187, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1452, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6639, device='cuda:0', grad_fn=<AddBackward0>)



 34%|███▍      | 121/354 [11:00<20:55,  5.39s/it][A

	loss_cls: tensor(0.8078, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2674, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0752, device='cuda:0', grad_fn=<AddBackward0>)



 34%|███▍      | 122/354 [11:06<20:53,  5.40s/it][A

	loss_cls: tensor(0.5486, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1610, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7096, device='cuda:0', grad_fn=<AddBackward0>)



 35%|███▍      | 123/354 [11:11<20:44,  5.39s/it][A

	loss_cls: tensor(0.4632, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1260, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5892, device='cuda:0', grad_fn=<AddBackward0>)



 35%|███▌      | 124/354 [11:17<20:45,  5.41s/it][A

	loss_cls: tensor(1.0222, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1619, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1841, device='cuda:0', grad_fn=<AddBackward0>)



 35%|███▌      | 125/354 [11:22<20:37,  5.41s/it][A

	loss_cls: tensor(1.2305, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1043, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.3347, device='cuda:0', grad_fn=<AddBackward0>)



 36%|███▌      | 126/354 [11:27<20:35,  5.42s/it][A

	loss_cls: tensor(0.4160, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2465, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6625, device='cuda:0', grad_fn=<AddBackward0>)



 36%|███▌      | 127/354 [11:33<20:28,  5.41s/it][A

	loss_cls: tensor(0.7944, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2158, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0102, device='cuda:0', grad_fn=<AddBackward0>)



 36%|███▌      | 128/354 [11:38<20:21,  5.41s/it][A

	loss_cls: tensor(0.3586, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0338, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.3924, device='cuda:0', grad_fn=<AddBackward0>)



 36%|███▋      | 129/354 [11:44<20:21,  5.43s/it][A

	loss_cls: tensor(0.7486, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3931, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1417, device='cuda:0', grad_fn=<AddBackward0>)



 37%|███▋      | 130/354 [11:49<20:14,  5.42s/it][A

	loss_cls: tensor(0.5712, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3080, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8791, device='cuda:0', grad_fn=<AddBackward0>)



 37%|███▋      | 131/354 [11:55<20:13,  5.44s/it][A

	loss_cls: tensor(1.0854, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3123, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.3977, device='cuda:0', grad_fn=<AddBackward0>)



 37%|███▋      | 132/354 [12:00<20:05,  5.43s/it][A

	loss_cls: tensor(0.7599, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3754, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1353, device='cuda:0', grad_fn=<AddBackward0>)



 38%|███▊      | 133/354 [12:05<20:03,  5.44s/it][A

	loss_cls: tensor(0.3806, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2903, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6709, device='cuda:0', grad_fn=<AddBackward0>)



 38%|███▊      | 134/354 [12:11<19:54,  5.43s/it][A

	loss_cls: tensor(0.6447, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1166, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7613, device='cuda:0', grad_fn=<AddBackward0>)



 38%|███▊      | 135/354 [12:16<19:45,  5.42s/it][A

	loss_cls: tensor(0.4103, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0458, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4561, device='cuda:0', grad_fn=<AddBackward0>)



 38%|███▊      | 136/354 [12:22<19:42,  5.43s/it][A

	loss_cls: tensor(0.7117, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2722, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9839, device='cuda:0', grad_fn=<AddBackward0>)



 39%|███▊      | 137/354 [12:27<19:33,  5.41s/it][A

	loss_cls: tensor(0.6082, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3435, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9517, device='cuda:0', grad_fn=<AddBackward0>)



 39%|███▉      | 138/354 [12:32<19:31,  5.42s/it][A

	loss_cls: tensor(0.5079, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2872, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7951, device='cuda:0', grad_fn=<AddBackward0>)



 39%|███▉      | 139/354 [12:38<19:21,  5.40s/it][A

	loss_cls: tensor(0.5987, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1622, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7609, device='cuda:0', grad_fn=<AddBackward0>)



 40%|███▉      | 140/354 [12:43<19:17,  5.41s/it][A

	loss_cls: tensor(0.8126, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1737, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9864, device='cuda:0', grad_fn=<AddBackward0>)



 40%|███▉      | 141/354 [12:49<19:07,  5.39s/it][A

	loss_cls: tensor(0.5819, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2652, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8471, device='cuda:0', grad_fn=<AddBackward0>)



 40%|████      | 142/354 [12:54<18:56,  5.36s/it][A

	loss_cls: tensor(0.5927, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3355, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9282, device='cuda:0', grad_fn=<AddBackward0>)



 40%|████      | 143/354 [12:59<18:52,  5.37s/it][A

	loss_cls: tensor(0.6167, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1427, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7595, device='cuda:0', grad_fn=<AddBackward0>)



 41%|████      | 144/354 [13:05<18:45,  5.36s/it][A

	loss_cls: tensor(0.4926, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2053, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6979, device='cuda:0', grad_fn=<AddBackward0>)



 41%|████      | 145/354 [13:10<18:43,  5.38s/it][A

	loss_cls: tensor(0.4990, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2573, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7563, device='cuda:0', grad_fn=<AddBackward0>)



 41%|████      | 146/354 [13:15<18:34,  5.36s/it][A

	loss_cls: tensor(0.5979, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1944, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7923, device='cuda:0', grad_fn=<AddBackward0>)



 42%|████▏     | 147/354 [13:21<18:32,  5.37s/it][A

	loss_cls: tensor(0.6523, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1170, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7693, device='cuda:0', grad_fn=<AddBackward0>)



 42%|████▏     | 148/354 [13:26<18:23,  5.36s/it][A

	loss_cls: tensor(0.5380, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3277, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8657, device='cuda:0', grad_fn=<AddBackward0>)



 42%|████▏     | 149/354 [13:31<18:13,  5.34s/it][A

	loss_cls: tensor(0.5457, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4320, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9777, device='cuda:0', grad_fn=<AddBackward0>)



 42%|████▏     | 150/354 [13:37<18:12,  5.36s/it][A

	loss_cls: tensor(0.6590, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3093, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9683, device='cuda:0', grad_fn=<AddBackward0>)



 43%|████▎     | 151/354 [13:42<18:08,  5.36s/it][A

	loss_cls: tensor(0.6794, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2060, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8854, device='cuda:0', grad_fn=<AddBackward0>)



 43%|████▎     | 152/354 [13:48<18:08,  5.39s/it][A

	loss_cls: tensor(0.4249, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3977, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8226, device='cuda:0', grad_fn=<AddBackward0>)



 43%|████▎     | 153/354 [13:53<18:00,  5.38s/it][A

	loss_cls: tensor(0.4634, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3082, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7716, device='cuda:0', grad_fn=<AddBackward0>)



 44%|████▎     | 154/354 [13:58<17:56,  5.38s/it][A

	loss_cls: tensor(0.6398, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1735, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8133, device='cuda:0', grad_fn=<AddBackward0>)



 44%|████▍     | 155/354 [14:04<17:46,  5.36s/it][A

	loss_cls: tensor(0.4537, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1895, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6432, device='cuda:0', grad_fn=<AddBackward0>)



 44%|████▍     | 156/354 [14:09<17:38,  5.35s/it][A

	loss_cls: tensor(0.6605, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1336, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7941, device='cuda:0', grad_fn=<AddBackward0>)



 44%|████▍     | 157/354 [14:14<17:37,  5.37s/it][A

	loss_cls: tensor(0.9049, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1882, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0931, device='cuda:0', grad_fn=<AddBackward0>)



 45%|████▍     | 158/354 [14:20<17:30,  5.36s/it][A

	loss_cls: tensor(0.7370, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1699, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9069, device='cuda:0', grad_fn=<AddBackward0>)



 45%|████▍     | 159/354 [14:25<17:28,  5.38s/it][A

	loss_cls: tensor(0.6718, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4557, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1275, device='cuda:0', grad_fn=<AddBackward0>)



 45%|████▌     | 160/354 [14:30<17:19,  5.36s/it][A

	loss_cls: tensor(0.7495, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0958, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8453, device='cuda:0', grad_fn=<AddBackward0>)



 45%|████▌     | 161/354 [14:36<17:15,  5.37s/it][A

	loss_cls: tensor(0.5681, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3229, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8910, device='cuda:0', grad_fn=<AddBackward0>)



 46%|████▌     | 162/354 [14:41<17:07,  5.35s/it][A

	loss_cls: tensor(0.4903, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2197, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7100, device='cuda:0', grad_fn=<AddBackward0>)



 46%|████▌     | 163/354 [14:46<16:59,  5.34s/it][A

	loss_cls: tensor(0.5785, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2837, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8622, device='cuda:0', grad_fn=<AddBackward0>)



 46%|████▋     | 164/354 [14:52<16:58,  5.36s/it][A

	loss_cls: tensor(0.6653, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1671, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8323, device='cuda:0', grad_fn=<AddBackward0>)



 47%|████▋     | 165/354 [14:57<16:48,  5.34s/it][A

	loss_cls: tensor(0.6099, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0570, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6669, device='cuda:0', grad_fn=<AddBackward0>)



 47%|████▋     | 166/354 [15:03<16:46,  5.35s/it][A

	loss_cls: tensor(0.6016, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1347, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7363, device='cuda:0', grad_fn=<AddBackward0>)



 47%|████▋     | 167/354 [15:08<16:37,  5.33s/it][A

	loss_cls: tensor(0.6271, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2236, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8506, device='cuda:0', grad_fn=<AddBackward0>)



 47%|████▋     | 168/354 [15:13<16:33,  5.34s/it][A

	loss_cls: tensor(0.6780, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3188, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9968, device='cuda:0', grad_fn=<AddBackward0>)



 48%|████▊     | 169/354 [15:18<16:24,  5.32s/it][A

	loss_cls: tensor(0.5252, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1234, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6486, device='cuda:0', grad_fn=<AddBackward0>)



 48%|████▊     | 170/354 [15:24<16:20,  5.33s/it][A

	loss_cls: tensor(0.6302, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3746, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0047, device='cuda:0', grad_fn=<AddBackward0>)



 48%|████▊     | 171/354 [15:29<16:14,  5.32s/it][A

	loss_cls: tensor(0.4637, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4497, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9133, device='cuda:0', grad_fn=<AddBackward0>)



 49%|████▊     | 172/354 [15:34<16:08,  5.32s/it][A

	loss_cls: tensor(0.6578, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5224, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1802, device='cuda:0', grad_fn=<AddBackward0>)



 49%|████▉     | 173/354 [15:40<16:07,  5.34s/it][A

	loss_cls: tensor(0.5233, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0856, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6090, device='cuda:0', grad_fn=<AddBackward0>)



 49%|████▉     | 174/354 [15:45<16:05,  5.36s/it][A

	loss_cls: tensor(0.5155, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5396, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0552, device='cuda:0', grad_fn=<AddBackward0>)



 49%|████▉     | 175/354 [15:51<16:07,  5.40s/it][A

	loss_cls: tensor(0.4665, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1738, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6403, device='cuda:0', grad_fn=<AddBackward0>)



 50%|████▉     | 176/354 [15:56<16:04,  5.42s/it][A

	loss_cls: tensor(0.5363, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3803, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9166, device='cuda:0', grad_fn=<AddBackward0>)



 50%|█████     | 177/354 [16:01<15:47,  5.35s/it][A

	loss_cls: tensor(0.7894, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2208, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0102, device='cuda:0', grad_fn=<AddBackward0>)



 50%|█████     | 178/354 [16:07<15:51,  5.40s/it][A

	loss_cls: tensor(0.6964, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1356, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8320, device='cuda:0', grad_fn=<AddBackward0>)



 51%|█████     | 179/354 [16:12<15:50,  5.43s/it][A

	loss_cls: tensor(0.4890, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3752, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8643, device='cuda:0', grad_fn=<AddBackward0>)



 51%|█████     | 180/354 [16:18<15:48,  5.45s/it][A

	loss_cls: tensor(0.5974, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3346, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9320, device='cuda:0', grad_fn=<AddBackward0>)



 51%|█████     | 181/354 [16:23<15:45,  5.46s/it][A

	loss_cls: tensor(0.6569, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2011, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8580, device='cuda:0', grad_fn=<AddBackward0>)



 51%|█████▏    | 182/354 [16:29<15:44,  5.49s/it][A

	loss_cls: tensor(0.5184, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3151, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8335, device='cuda:0', grad_fn=<AddBackward0>)



 52%|█████▏    | 183/354 [16:34<15:39,  5.50s/it][A

	loss_cls: tensor(0.5868, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0583, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6451, device='cuda:0', grad_fn=<AddBackward0>)



 52%|█████▏    | 184/354 [16:40<15:38,  5.52s/it][A

	loss_cls: tensor(0.7866, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1986, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9852, device='cuda:0', grad_fn=<AddBackward0>)



 52%|█████▏    | 185/354 [16:46<15:33,  5.52s/it][A

	loss_cls: tensor(0.5812, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1333, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7144, device='cuda:0', grad_fn=<AddBackward0>)



 53%|█████▎    | 186/354 [16:51<15:25,  5.51s/it][A

	loss_cls: tensor(0.7107, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0805, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7913, device='cuda:0', grad_fn=<AddBackward0>)



 53%|█████▎    | 187/354 [16:57<15:23,  5.53s/it][A

	loss_cls: tensor(0.8031, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1091, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9122, device='cuda:0', grad_fn=<AddBackward0>)



 53%|█████▎    | 188/354 [17:02<15:16,  5.52s/it][A

	loss_cls: tensor(0.4276, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2603, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6879, device='cuda:0', grad_fn=<AddBackward0>)



 53%|█████▎    | 189/354 [17:08<15:12,  5.53s/it][A

	loss_cls: tensor(0.4794, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1172, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5966, device='cuda:0', grad_fn=<AddBackward0>)



 54%|█████▎    | 190/354 [17:13<15:05,  5.52s/it][A

	loss_cls: tensor(0.7002, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1321, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8322, device='cuda:0', grad_fn=<AddBackward0>)



 54%|█████▍    | 191/354 [17:18<14:33,  5.36s/it][A

	loss_cls: tensor(0.4985, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0429, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5415, device='cuda:0', grad_fn=<AddBackward0>)



 54%|█████▍    | 192/354 [17:24<14:34,  5.40s/it][A

	loss_cls: tensor(0.4612, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2453, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7064, device='cuda:0', grad_fn=<AddBackward0>)



 55%|█████▍    | 193/354 [17:29<14:33,  5.43s/it][A

	loss_cls: tensor(0.5596, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3078, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8674, device='cuda:0', grad_fn=<AddBackward0>)



 55%|█████▍    | 194/354 [17:35<14:35,  5.47s/it][A

	loss_cls: tensor(0.6611, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2918, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9529, device='cuda:0', grad_fn=<AddBackward0>)



 55%|█████▌    | 195/354 [17:40<14:30,  5.47s/it][A

	loss_cls: tensor(0.4679, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0343, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5022, device='cuda:0', grad_fn=<AddBackward0>)



 55%|█████▌    | 196/354 [17:46<14:29,  5.50s/it][A

	loss_cls: tensor(0.7202, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2922, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0125, device='cuda:0', grad_fn=<AddBackward0>)



 56%|█████▌    | 197/354 [17:51<14:21,  5.49s/it][A

	loss_cls: tensor(0.5672, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0375, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6046, device='cuda:0', grad_fn=<AddBackward0>)



 56%|█████▌    | 198/354 [17:57<14:20,  5.51s/it][A

	loss_cls: tensor(0.3046, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1103, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4149, device='cuda:0', grad_fn=<AddBackward0>)



 56%|█████▌    | 199/354 [18:02<14:13,  5.51s/it][A

	loss_cls: tensor(1.0122, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4411, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.4532, device='cuda:0', grad_fn=<AddBackward0>)



 56%|█████▋    | 200/354 [18:08<14:07,  5.50s/it][A

	loss_cls: tensor(0.5973, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2460, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8433, device='cuda:0', grad_fn=<AddBackward0>)



 57%|█████▋    | 201/354 [18:13<14:04,  5.52s/it][A

	loss_cls: tensor(0.5164, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2030, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7193, device='cuda:0', grad_fn=<AddBackward0>)



 57%|█████▋    | 202/354 [18:19<13:57,  5.51s/it][A

	loss_cls: tensor(0.7277, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2216, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9493, device='cuda:0', grad_fn=<AddBackward0>)



 57%|█████▋    | 203/354 [18:24<13:54,  5.53s/it][A

	loss_cls: tensor(0.8162, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2349, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0511, device='cuda:0', grad_fn=<AddBackward0>)



 58%|█████▊    | 204/354 [18:30<13:47,  5.52s/it][A

	loss_cls: tensor(0.5362, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3397, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8759, device='cuda:0', grad_fn=<AddBackward0>)



 58%|█████▊    | 205/354 [18:35<13:45,  5.54s/it][A

	loss_cls: tensor(0.6783, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2650, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9433, device='cuda:0', grad_fn=<AddBackward0>)



 58%|█████▊    | 206/354 [18:41<13:37,  5.53s/it][A

	loss_cls: tensor(0.5036, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3982, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9018, device='cuda:0', grad_fn=<AddBackward0>)



 58%|█████▊    | 207/354 [18:46<13:30,  5.51s/it][A

	loss_cls: tensor(0.5173, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4390, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9563, device='cuda:0', grad_fn=<AddBackward0>)



 59%|█████▉    | 208/354 [18:52<13:09,  5.40s/it][A

	loss_cls: tensor(0.8628, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2084, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0712, device='cuda:0', grad_fn=<AddBackward0>)



 59%|█████▉    | 209/354 [18:57<13:07,  5.43s/it][A

	loss_cls: tensor(0.5135, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2100, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7234, device='cuda:0', grad_fn=<AddBackward0>)



 59%|█████▉    | 210/354 [19:03<13:08,  5.47s/it][A

	loss_cls: tensor(0.7111, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1154, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8265, device='cuda:0', grad_fn=<AddBackward0>)



 60%|█████▉    | 211/354 [19:08<13:04,  5.48s/it][A

	loss_cls: tensor(0.6724, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1665, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8389, device='cuda:0', grad_fn=<AddBackward0>)



 60%|█████▉    | 212/354 [19:14<13:03,  5.52s/it][A

	loss_cls: tensor(0.8441, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4978, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.3418, device='cuda:0', grad_fn=<AddBackward0>)



 60%|██████    | 213/354 [19:19<12:57,  5.51s/it][A

	loss_cls: tensor(0.3889, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0714, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4603, device='cuda:0', grad_fn=<AddBackward0>)



 60%|██████    | 214/354 [19:25<12:51,  5.51s/it][A

	loss_cls: tensor(0.7673, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3318, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0990, device='cuda:0', grad_fn=<AddBackward0>)



 61%|██████    | 215/354 [19:30<12:51,  5.55s/it][A

	loss_cls: tensor(0.5716, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3111, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8827, device='cuda:0', grad_fn=<AddBackward0>)



 61%|██████    | 216/354 [19:36<12:46,  5.55s/it][A

	loss_cls: tensor(0.4480, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4304, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8784, device='cuda:0', grad_fn=<AddBackward0>)



 61%|██████▏   | 217/354 [19:42<12:41,  5.56s/it][A

	loss_cls: tensor(0.6457, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1761, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8217, device='cuda:0', grad_fn=<AddBackward0>)



 62%|██████▏   | 218/354 [19:47<12:32,  5.54s/it][A

	loss_cls: tensor(0.5744, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1384, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7129, device='cuda:0', grad_fn=<AddBackward0>)



 62%|██████▏   | 219/354 [19:53<12:28,  5.54s/it][A

	loss_cls: tensor(0.5789, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2518, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8307, device='cuda:0', grad_fn=<AddBackward0>)



 62%|██████▏   | 220/354 [19:58<12:17,  5.50s/it][A

	loss_cls: tensor(0.6119, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3623, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9742, device='cuda:0', grad_fn=<AddBackward0>)



 62%|██████▏   | 221/354 [20:03<12:08,  5.47s/it][A

	loss_cls: tensor(0.5608, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1784, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7392, device='cuda:0', grad_fn=<AddBackward0>)



 63%|██████▎   | 222/354 [20:09<12:05,  5.49s/it][A

	loss_cls: tensor(0.7216, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1829, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9045, device='cuda:0', grad_fn=<AddBackward0>)



 63%|██████▎   | 223/354 [20:14<11:56,  5.47s/it][A

	loss_cls: tensor(0.6082, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1970, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8052, device='cuda:0', grad_fn=<AddBackward0>)



 63%|██████▎   | 224/354 [20:20<11:55,  5.50s/it][A

	loss_cls: tensor(0.5982, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3366, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9348, device='cuda:0', grad_fn=<AddBackward0>)



 64%|██████▎   | 225/354 [20:25<11:48,  5.49s/it][A

	loss_cls: tensor(0.7983, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2085, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0069, device='cuda:0', grad_fn=<AddBackward0>)



 64%|██████▍   | 226/354 [20:31<11:45,  5.51s/it][A

	loss_cls: tensor(0.5286, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1727, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7013, device='cuda:0', grad_fn=<AddBackward0>)



 64%|██████▍   | 227/354 [20:36<11:35,  5.48s/it][A

	loss_cls: tensor(0.5680, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1629, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7310, device='cuda:0', grad_fn=<AddBackward0>)



 64%|██████▍   | 228/354 [20:42<11:29,  5.47s/it][A

	loss_cls: tensor(0.5665, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5832, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1497, device='cuda:0', grad_fn=<AddBackward0>)



 65%|██████▍   | 229/354 [20:47<11:27,  5.50s/it][A

	loss_cls: tensor(0.6127, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0666, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6793, device='cuda:0', grad_fn=<AddBackward0>)



 65%|██████▍   | 230/354 [20:53<11:20,  5.49s/it][A

	loss_cls: tensor(0.5860, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1701, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7560, device='cuda:0', grad_fn=<AddBackward0>)



 65%|██████▌   | 231/354 [20:58<11:17,  5.51s/it][A

	loss_cls: tensor(0.6999, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0989, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7988, device='cuda:0', grad_fn=<AddBackward0>)



 66%|██████▌   | 232/354 [21:04<11:10,  5.49s/it][A

	loss_cls: tensor(0.5049, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2907, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7955, device='cuda:0', grad_fn=<AddBackward0>)



 66%|██████▌   | 233/354 [21:09<11:06,  5.51s/it][A

	loss_cls: tensor(0.4629, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1465, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6094, device='cuda:0', grad_fn=<AddBackward0>)



 66%|██████▌   | 234/354 [21:15<11:00,  5.50s/it][A

	loss_cls: tensor(0.4330, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1605, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5935, device='cuda:0', grad_fn=<AddBackward0>)



 66%|██████▋   | 235/354 [21:20<10:53,  5.49s/it][A

	loss_cls: tensor(0.5127, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1221, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6348, device='cuda:0', grad_fn=<AddBackward0>)



 67%|██████▋   | 236/354 [21:26<10:49,  5.50s/it][A

	loss_cls: tensor(0.8306, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1575, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9881, device='cuda:0', grad_fn=<AddBackward0>)



 67%|██████▋   | 237/354 [21:31<10:42,  5.49s/it][A

	loss_cls: tensor(0.5042, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1624, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6667, device='cuda:0', grad_fn=<AddBackward0>)



 67%|██████▋   | 238/354 [21:37<10:40,  5.52s/it][A

	loss_cls: tensor(0.3357, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1926, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5283, device='cuda:0', grad_fn=<AddBackward0>)



 68%|██████▊   | 239/354 [21:42<10:33,  5.50s/it][A

	loss_cls: tensor(0.3930, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1471, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5401, device='cuda:0', grad_fn=<AddBackward0>)



 68%|██████▊   | 240/354 [21:48<10:28,  5.51s/it][A

	loss_cls: tensor(0.8038, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2394, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0432, device='cuda:0', grad_fn=<AddBackward0>)



 68%|██████▊   | 241/354 [21:53<10:20,  5.49s/it][A

	loss_cls: tensor(0.5641, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2034, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7675, device='cuda:0', grad_fn=<AddBackward0>)



 68%|██████▊   | 242/354 [21:59<10:14,  5.49s/it][A

	loss_cls: tensor(0.5618, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1974, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7592, device='cuda:0', grad_fn=<AddBackward0>)



 69%|██████▊   | 243/354 [22:04<10:09,  5.49s/it][A

	loss_cls: tensor(0.4653, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2443, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7096, device='cuda:0', grad_fn=<AddBackward0>)



 69%|██████▉   | 244/354 [22:10<10:01,  5.47s/it][A

	loss_cls: tensor(0.5827, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3303, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9130, device='cuda:0', grad_fn=<AddBackward0>)



 69%|██████▉   | 245/354 [22:15<09:59,  5.50s/it][A

	loss_cls: tensor(0.4896, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1175, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6071, device='cuda:0', grad_fn=<AddBackward0>)



 69%|██████▉   | 246/354 [22:21<09:52,  5.49s/it][A

	loss_cls: tensor(0.4101, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3310, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7412, device='cuda:0', grad_fn=<AddBackward0>)



 70%|██████▉   | 247/354 [22:26<09:49,  5.51s/it][A

	loss_cls: tensor(0.4911, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0305, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5215, device='cuda:0', grad_fn=<AddBackward0>)



 70%|███████   | 248/354 [22:32<09:42,  5.50s/it][A

	loss_cls: tensor(0.5441, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2338, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7779, device='cuda:0', grad_fn=<AddBackward0>)



 70%|███████   | 249/354 [22:37<09:36,  5.49s/it][A

	loss_cls: tensor(0.4621, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1854, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6475, device='cuda:0', grad_fn=<AddBackward0>)



 71%|███████   | 250/354 [22:43<09:32,  5.51s/it][A

	loss_cls: tensor(0.6786, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2366, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9153, device='cuda:0', grad_fn=<AddBackward0>)



 71%|███████   | 251/354 [22:48<09:26,  5.50s/it][A

	loss_cls: tensor(0.4461, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2527, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6988, device='cuda:0', grad_fn=<AddBackward0>)



 71%|███████   | 252/354 [22:54<09:23,  5.52s/it][A

	loss_cls: tensor(0.7323, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3957, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1279, device='cuda:0', grad_fn=<AddBackward0>)



 71%|███████▏  | 253/354 [22:59<09:16,  5.51s/it][A

	loss_cls: tensor(0.4221, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1773, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5994, device='cuda:0', grad_fn=<AddBackward0>)



 72%|███████▏  | 254/354 [23:05<09:11,  5.52s/it][A

	loss_cls: tensor(0.5290, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1223, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6513, device='cuda:0', grad_fn=<AddBackward0>)



 72%|███████▏  | 255/354 [23:10<09:04,  5.50s/it][A

	loss_cls: tensor(0.4440, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1166, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5606, device='cuda:0', grad_fn=<AddBackward0>)



 72%|███████▏  | 256/354 [23:16<08:57,  5.49s/it][A

	loss_cls: tensor(0.4302, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1047, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5349, device='cuda:0', grad_fn=<AddBackward0>)



 73%|███████▎  | 257/354 [23:21<08:54,  5.51s/it][A

	loss_cls: tensor(0.5663, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3810, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9474, device='cuda:0', grad_fn=<AddBackward0>)



 73%|███████▎  | 258/354 [23:27<08:47,  5.50s/it][A

	loss_cls: tensor(0.8413, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1663, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0076, device='cuda:0', grad_fn=<AddBackward0>)



 73%|███████▎  | 259/354 [23:32<08:43,  5.51s/it][A

	loss_cls: tensor(0.6340, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1343, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7683, device='cuda:0', grad_fn=<AddBackward0>)



 73%|███████▎  | 260/354 [23:38<08:36,  5.50s/it][A

	loss_cls: tensor(0.8312, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4197, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2509, device='cuda:0', grad_fn=<AddBackward0>)



 74%|███████▎  | 261/354 [23:43<08:32,  5.52s/it][A

	loss_cls: tensor(0.5897, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1138, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7036, device='cuda:0', grad_fn=<AddBackward0>)



 74%|███████▍  | 262/354 [23:49<08:25,  5.50s/it][A

	loss_cls: tensor(0.5120, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0481, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5601, device='cuda:0', grad_fn=<AddBackward0>)



 74%|███████▍  | 263/354 [23:54<08:18,  5.47s/it][A

	loss_cls: tensor(0.5575, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4519, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0094, device='cuda:0', grad_fn=<AddBackward0>)



 75%|███████▍  | 264/354 [24:00<08:15,  5.50s/it][A

	loss_cls: tensor(0.7741, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1954, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9695, device='cuda:0', grad_fn=<AddBackward0>)



 75%|███████▍  | 265/354 [24:05<08:08,  5.49s/it][A

	loss_cls: tensor(0.6433, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2655, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9088, device='cuda:0', grad_fn=<AddBackward0>)



 75%|███████▌  | 266/354 [24:11<08:05,  5.52s/it][A

	loss_cls: tensor(0.7552, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1521, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9073, device='cuda:0', grad_fn=<AddBackward0>)



 75%|███████▌  | 267/354 [24:16<07:58,  5.50s/it][A

	loss_cls: tensor(0.4326, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2245, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6571, device='cuda:0', grad_fn=<AddBackward0>)



 76%|███████▌  | 268/354 [24:22<07:53,  5.51s/it][A

	loss_cls: tensor(0.6892, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2593, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9485, device='cuda:0', grad_fn=<AddBackward0>)



 76%|███████▌  | 269/354 [24:27<07:47,  5.50s/it][A

	loss_cls: tensor(0.6136, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3699, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9835, device='cuda:0', grad_fn=<AddBackward0>)



 76%|███████▋  | 270/354 [24:33<07:42,  5.51s/it][A

	loss_cls: tensor(0.4715, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2816, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7531, device='cuda:0', grad_fn=<AddBackward0>)



 77%|███████▋  | 271/354 [24:38<07:36,  5.50s/it][A

	loss_cls: tensor(0.8022, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1502, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9524, device='cuda:0', grad_fn=<AddBackward0>)



 77%|███████▋  | 272/354 [24:44<07:30,  5.49s/it][A

	loss_cls: tensor(0.3473, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3256, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6729, device='cuda:0', grad_fn=<AddBackward0>)



 77%|███████▋  | 273/354 [24:49<07:25,  5.50s/it][A

	loss_cls: tensor(0.4249, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3387, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7637, device='cuda:0', grad_fn=<AddBackward0>)



 77%|███████▋  | 274/354 [24:55<07:19,  5.49s/it][A

	loss_cls: tensor(0.7303, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1262, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8565, device='cuda:0', grad_fn=<AddBackward0>)



 78%|███████▊  | 275/354 [25:00<07:14,  5.50s/it][A

	loss_cls: tensor(0.9091, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0814, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9905, device='cuda:0', grad_fn=<AddBackward0>)



 78%|███████▊  | 276/354 [25:06<07:05,  5.46s/it][A

	loss_cls: tensor(0.4748, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2988, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7735, device='cuda:0', grad_fn=<AddBackward0>)



 78%|███████▊  | 277/354 [25:11<07:02,  5.48s/it][A

	loss_cls: tensor(0.5851, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2856, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8707, device='cuda:0', grad_fn=<AddBackward0>)



 79%|███████▊  | 278/354 [25:17<06:56,  5.49s/it][A

	loss_cls: tensor(0.5402, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1838, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7240, device='cuda:0', grad_fn=<AddBackward0>)



 79%|███████▉  | 279/354 [25:22<06:51,  5.49s/it][A

	loss_cls: tensor(0.8161, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3168, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1329, device='cuda:0', grad_fn=<AddBackward0>)



 79%|███████▉  | 280/354 [25:28<06:47,  5.51s/it][A

	loss_cls: tensor(0.5017, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2404, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7421, device='cuda:0', grad_fn=<AddBackward0>)



 79%|███████▉  | 281/354 [25:33<06:41,  5.49s/it][A

	loss_cls: tensor(0.7406, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3776, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1182, device='cuda:0', grad_fn=<AddBackward0>)



 80%|███████▉  | 282/354 [25:39<06:36,  5.51s/it][A

	loss_cls: tensor(0.8011, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1813, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9824, device='cuda:0', grad_fn=<AddBackward0>)



 80%|███████▉  | 283/354 [25:44<06:31,  5.51s/it][A

	loss_cls: tensor(0.4594, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1901, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6496, device='cuda:0', grad_fn=<AddBackward0>)



 80%|████████  | 284/354 [25:50<06:26,  5.53s/it][A

	loss_cls: tensor(0.5946, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3782, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9728, device='cuda:0', grad_fn=<AddBackward0>)



 81%|████████  | 285/354 [25:55<06:20,  5.51s/it][A

	loss_cls: tensor(0.6117, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5054, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1171, device='cuda:0', grad_fn=<AddBackward0>)



 81%|████████  | 286/354 [26:01<06:13,  5.49s/it][A

	loss_cls: tensor(0.4578, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2701, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7279, device='cuda:0', grad_fn=<AddBackward0>)



 81%|████████  | 287/354 [26:06<06:08,  5.50s/it][A

	loss_cls: tensor(0.5314, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2800, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8114, device='cuda:0', grad_fn=<AddBackward0>)



 81%|████████▏ | 288/354 [26:12<06:02,  5.49s/it][A

	loss_cls: tensor(0.6117, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0951, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7067, device='cuda:0', grad_fn=<AddBackward0>)



 82%|████████▏ | 289/354 [26:17<05:56,  5.48s/it][A

	loss_cls: tensor(1.1409, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0757, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2165, device='cuda:0', grad_fn=<AddBackward0>)



 82%|████████▏ | 290/354 [26:23<05:49,  5.46s/it][A

	loss_cls: tensor(0.5572, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1454, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7026, device='cuda:0', grad_fn=<AddBackward0>)



 82%|████████▏ | 291/354 [26:28<05:46,  5.50s/it][A

	loss_cls: tensor(0.4790, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3679, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8469, device='cuda:0', grad_fn=<AddBackward0>)



 82%|████████▏ | 292/354 [26:34<05:40,  5.49s/it][A

	loss_cls: tensor(0.3484, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2571, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6055, device='cuda:0', grad_fn=<AddBackward0>)



 83%|████████▎ | 293/354 [26:39<05:34,  5.49s/it][A

	loss_cls: tensor(1.0093, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1631, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1724, device='cuda:0', grad_fn=<AddBackward0>)



 83%|████████▎ | 294/354 [26:45<05:29,  5.50s/it][A

	loss_cls: tensor(0.4524, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4797, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9321, device='cuda:0', grad_fn=<AddBackward0>)



 83%|████████▎ | 295/354 [26:50<05:24,  5.49s/it][A

	loss_cls: tensor(0.3711, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2490, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6201, device='cuda:0', grad_fn=<AddBackward0>)



 84%|████████▎ | 296/354 [26:56<05:19,  5.50s/it][A

	loss_cls: tensor(0.4851, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3756, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8606, device='cuda:0', grad_fn=<AddBackward0>)



 84%|████████▍ | 297/354 [27:01<05:13,  5.49s/it][A

	loss_cls: tensor(0.8670, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1247, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9916, device='cuda:0', grad_fn=<AddBackward0>)



 84%|████████▍ | 298/354 [27:07<05:09,  5.52s/it][A

	loss_cls: tensor(0.5920, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3107, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9027, device='cuda:0', grad_fn=<AddBackward0>)



 84%|████████▍ | 299/354 [27:12<05:03,  5.51s/it][A

	loss_cls: tensor(1.0781, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2907, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.3688, device='cuda:0', grad_fn=<AddBackward0>)



 85%|████████▍ | 300/354 [27:18<04:57,  5.51s/it][A

	loss_cls: tensor(0.6779, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1652, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8431, device='cuda:0', grad_fn=<AddBackward0>)



 85%|████████▌ | 301/354 [27:23<04:52,  5.52s/it][A

	loss_cls: tensor(0.7917, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1619, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9535, device='cuda:0', grad_fn=<AddBackward0>)



 85%|████████▌ | 302/354 [27:29<04:45,  5.50s/it][A

	loss_cls: tensor(0.5161, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3052, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8213, device='cuda:0', grad_fn=<AddBackward0>)



 86%|████████▌ | 303/354 [27:34<04:41,  5.52s/it][A

	loss_cls: tensor(0.5908, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1204, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7112, device='cuda:0', grad_fn=<AddBackward0>)



 86%|████████▌ | 304/354 [27:40<04:35,  5.50s/it][A

	loss_cls: tensor(0.5043, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2603, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7646, device='cuda:0', grad_fn=<AddBackward0>)



 86%|████████▌ | 305/354 [27:45<04:30,  5.53s/it][A

	loss_cls: tensor(0.5677, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2120, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7796, device='cuda:0', grad_fn=<AddBackward0>)



 86%|████████▋ | 306/354 [27:51<04:25,  5.53s/it][A

	loss_cls: tensor(0.6501, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3313, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9814, device='cuda:0', grad_fn=<AddBackward0>)



 87%|████████▋ | 307/354 [27:56<04:18,  5.51s/it][A

	loss_cls: tensor(0.7203, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1496, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8699, device='cuda:0', grad_fn=<AddBackward0>)



 87%|████████▋ | 308/354 [28:02<04:14,  5.53s/it][A

	loss_cls: tensor(0.5166, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1282, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6449, device='cuda:0', grad_fn=<AddBackward0>)



 87%|████████▋ | 309/354 [28:07<04:07,  5.50s/it][A

	loss_cls: tensor(0.9281, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1067, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0349, device='cuda:0', grad_fn=<AddBackward0>)



 88%|████████▊ | 310/354 [28:13<04:02,  5.52s/it][A

	loss_cls: tensor(0.3510, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1547, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5057, device='cuda:0', grad_fn=<AddBackward0>)



 88%|████████▊ | 311/354 [28:18<03:57,  5.51s/it][A

	loss_cls: tensor(0.7174, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1990, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9164, device='cuda:0', grad_fn=<AddBackward0>)



 88%|████████▊ | 312/354 [28:24<03:51,  5.50s/it][A

	loss_cls: tensor(0.4461, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3304, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7765, device='cuda:0', grad_fn=<AddBackward0>)



 88%|████████▊ | 313/354 [28:29<03:45,  5.49s/it][A

	loss_cls: tensor(0.4954, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5720, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0674, device='cuda:0', grad_fn=<AddBackward0>)



 89%|████████▊ | 314/354 [28:35<03:39,  5.49s/it][A

	loss_cls: tensor(0.5268, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2984, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8253, device='cuda:0', grad_fn=<AddBackward0>)



 89%|████████▉ | 315/354 [28:40<03:34,  5.51s/it][A

	loss_cls: tensor(0.5789, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2142, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7931, device='cuda:0', grad_fn=<AddBackward0>)



 89%|████████▉ | 316/354 [28:46<03:28,  5.48s/it][A

	loss_cls: tensor(0.8692, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2683, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1375, device='cuda:0', grad_fn=<AddBackward0>)



 90%|████████▉ | 317/354 [28:51<03:24,  5.51s/it][A

	loss_cls: tensor(0.5912, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2368, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8281, device='cuda:0', grad_fn=<AddBackward0>)



 90%|████████▉ | 318/354 [28:57<03:18,  5.51s/it][A

	loss_cls: tensor(0.4203, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2216, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6419, device='cuda:0', grad_fn=<AddBackward0>)



 90%|█████████ | 319/354 [29:03<03:13,  5.52s/it][A

	loss_cls: tensor(0.6611, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2741, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9351, device='cuda:0', grad_fn=<AddBackward0>)



 90%|█████████ | 320/354 [29:08<03:07,  5.51s/it][A

	loss_cls: tensor(0.4660, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1932, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6591, device='cuda:0', grad_fn=<AddBackward0>)



 91%|█████████ | 321/354 [29:13<03:01,  5.49s/it][A

	loss_cls: tensor(0.6731, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2069, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8801, device='cuda:0', grad_fn=<AddBackward0>)



 91%|█████████ | 322/354 [29:19<02:56,  5.51s/it][A

	loss_cls: tensor(0.5620, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2493, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8113, device='cuda:0', grad_fn=<AddBackward0>)



 91%|█████████ | 323/354 [29:24<02:50,  5.49s/it][A

	loss_cls: tensor(0.4283, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1880, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6163, device='cuda:0', grad_fn=<AddBackward0>)



 92%|█████████▏| 324/354 [29:30<02:45,  5.52s/it][A

	loss_cls: tensor(0.5856, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2399, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8255, device='cuda:0', grad_fn=<AddBackward0>)



 92%|█████████▏| 325/354 [29:36<02:39,  5.51s/it][A

	loss_cls: tensor(0.4619, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1023, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5642, device='cuda:0', grad_fn=<AddBackward0>)



 92%|█████████▏| 326/354 [29:41<02:34,  5.52s/it][A

	loss_cls: tensor(0.6586, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1295, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7881, device='cuda:0', grad_fn=<AddBackward0>)



 92%|█████████▏| 327/354 [29:47<02:28,  5.51s/it][A

	loss_cls: tensor(0.5444, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1180, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6624, device='cuda:0', grad_fn=<AddBackward0>)



 93%|█████████▎| 328/354 [29:52<02:23,  5.50s/it][A

	loss_cls: tensor(0.7035, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0489, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7525, device='cuda:0', grad_fn=<AddBackward0>)



 93%|█████████▎| 329/354 [29:58<02:18,  5.52s/it][A

	loss_cls: tensor(0.5209, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1888, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7097, device='cuda:0', grad_fn=<AddBackward0>)



 93%|█████████▎| 330/354 [30:03<02:12,  5.51s/it][A

	loss_cls: tensor(0.5849, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0462, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6311, device='cuda:0', grad_fn=<AddBackward0>)



 94%|█████████▎| 331/354 [30:09<02:07,  5.53s/it][A

	loss_cls: tensor(0.6626, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2012, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8637, device='cuda:0', grad_fn=<AddBackward0>)



 94%|█████████▍| 332/354 [30:14<02:01,  5.51s/it][A

	loss_cls: tensor(0.7007, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0467, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7474, device='cuda:0', grad_fn=<AddBackward0>)



 94%|█████████▍| 333/354 [30:20<01:56,  5.53s/it][A

	loss_cls: tensor(0.5104, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1678, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6782, device='cuda:0', grad_fn=<AddBackward0>)



 94%|█████████▍| 334/354 [30:25<01:50,  5.51s/it][A

	loss_cls: tensor(0.5666, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0675, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6342, device='cuda:0', grad_fn=<AddBackward0>)



 95%|█████████▍| 335/354 [30:30<01:43,  5.45s/it][A

	loss_cls: tensor(0.6693, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3069, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9762, device='cuda:0', grad_fn=<AddBackward0>)



 95%|█████████▍| 336/354 [30:36<01:38,  5.49s/it][A

	loss_cls: tensor(0.6172, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1849, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8021, device='cuda:0', grad_fn=<AddBackward0>)



 95%|█████████▌| 337/354 [30:42<01:33,  5.48s/it][A

	loss_cls: tensor(0.7454, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1832, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9286, device='cuda:0', grad_fn=<AddBackward0>)



 95%|█████████▌| 338/354 [30:47<01:28,  5.51s/it][A

	loss_cls: tensor(0.4567, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3019, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7587, device='cuda:0', grad_fn=<AddBackward0>)



 96%|█████████▌| 339/354 [30:53<01:22,  5.51s/it][A

	loss_cls: tensor(0.5534, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1497, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7031, device='cuda:0', grad_fn=<AddBackward0>)



 96%|█████████▌| 340/354 [30:58<01:17,  5.52s/it][A

	loss_cls: tensor(0.4417, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2831, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7248, device='cuda:0', grad_fn=<AddBackward0>)



 96%|█████████▋| 341/354 [31:04<01:11,  5.50s/it][A

	loss_cls: tensor(0.5806, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3401, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9207, device='cuda:0', grad_fn=<AddBackward0>)



 97%|█████████▋| 342/354 [31:09<01:05,  5.49s/it][A

	loss_cls: tensor(0.7292, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1118, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8410, device='cuda:0', grad_fn=<AddBackward0>)



 97%|█████████▋| 343/354 [31:15<01:00,  5.51s/it][A

	loss_cls: tensor(0.5327, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3117, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8444, device='cuda:0', grad_fn=<AddBackward0>)



 97%|█████████▋| 344/354 [31:20<00:54,  5.50s/it][A

	loss_cls: tensor(0.4668, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3156, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7824, device='cuda:0', grad_fn=<AddBackward0>)



 97%|█████████▋| 345/354 [31:26<00:49,  5.52s/it][A

	loss_cls: tensor(0.7263, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2426, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9689, device='cuda:0', grad_fn=<AddBackward0>)



 98%|█████████▊| 346/354 [31:31<00:44,  5.51s/it][A

	loss_cls: tensor(0.5466, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4738, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0205, device='cuda:0', grad_fn=<AddBackward0>)



 98%|█████████▊| 347/354 [31:37<00:38,  5.53s/it][A

	loss_cls: tensor(0.7445, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1557, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9002, device='cuda:0', grad_fn=<AddBackward0>)



 98%|█████████▊| 348/354 [31:42<00:33,  5.51s/it][A

	loss_cls: tensor(0.4818, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0344, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5162, device='cuda:0', grad_fn=<AddBackward0>)



 99%|█████████▊| 349/354 [31:48<00:27,  5.50s/it][A

	loss_cls: tensor(0.3954, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2308, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6263, device='cuda:0', grad_fn=<AddBackward0>)



 99%|█████████▉| 350/354 [31:53<00:22,  5.52s/it][A

	loss_cls: tensor(0.5925, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3269, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9194, device='cuda:0', grad_fn=<AddBackward0>)



 99%|█████████▉| 351/354 [31:59<00:16,  5.51s/it][A

	loss_cls: tensor(0.5318, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3166, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8484, device='cuda:0', grad_fn=<AddBackward0>)



 99%|█████████▉| 352/354 [32:04<00:11,  5.53s/it][A

	loss_cls: tensor(0.6503, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2252, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8755, device='cuda:0', grad_fn=<AddBackward0>)



100%|█████████▉| 353/354 [32:10<00:05,  5.52s/it][A

	loss_cls: tensor(0.5848, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2475, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8322, device='cuda:0', grad_fn=<AddBackward0>)



100%|██████████| 354/354 [32:12<00:00,  5.46s/it][A

  0%|          | 0/177 [00:00<?, ?it/s][A

	loss_cls: tensor(0.7571, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4323, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1894, device='cuda:0', grad_fn=<AddBackward0>)

	Training Loss: 0.8202556977837773

	Training cls acc: 0.6898540489642185

	Training cls prec: 0.5779305851869412

	Training cls rec: 0.6192603888578465

	Training cls f1: 0.5402122487631105

--
	Training ner acc: 0.9550640864246178

	Training ner prec: 0.2700776721961822

	Training ner rec: 0.2786706949061574

	Training ner f1: 0.2739187889220468

	Current Learning rate:  0.00017142857142857143



  1%|          | 1/177 [00:00<01:57,  1.49it/s][A
  1%|          | 2/177 [00:01<02:04,  1.41it/s][A
  2%|▏         | 3/177 [00:02<02:05,  1.39it/s][A
  2%|▏         | 4/177 [00:02<02:00,  1.43it/s][A
  3%|▎         | 5/177 [00:03<02:00,  1.42it/s][A
  3%|▎         | 6/177 [00:04<02:02,  1.40it/s][A
  4%|▍         | 7/177 [00:04<02:02,  1.38it/s][A
  5%|▍         | 8/177 [00:05<01:58,  1.43it/s][A
  5%|▌         | 9/177 [00:06<01:58,  1.42it/s][A
  6%|▌         | 10/177 [00:07<01:59,  1.39it/s][A
  6%|▌         | 11/177 [00:07<01:59,  1.39it/s][A
  7%|▋         | 12/177 [00:08<01:55,  1.43it/s][A
  7%|▋         | 13/177 [00:09<01:56,  1.41it/s][A
  8%|▊         | 14/177 [00:09<01:57,  1.39it/s][A
  8%|▊         | 15/177 [00:10<01:53,  1.43it/s][A
  9%|▉         | 16/177 [00:11<01:53,  1.42it/s][A
 10%|▉         | 17/177 [00:12<01:53,  1.41it/s][A
 10%|█         | 18/177 [00:12<01:54,  1.39it/s][A
 11%|█         | 19/177 [00:13<01:50,  1.44it/s][A
 11%|█▏        | 20/


	Validation Loss: 0.7750742650974942

	Validation cls acc: 0.7274011299435028

	Validation cls prec: 0.6123654829163304

	Validation cls rec: 0.5957761635727739

	Validation cls f1: 0.5672099087353325

--
	Validation ner acc: 0.9533820332755699

	Validation ner prec: 0.41331591942627693

	Validation ner rec: 0.4240112994350283

	Validation ner f1: 0.4184399331305531



  0%|          | 1/354 [00:05<32:47,  5.57s/it][A

	loss_cls: tensor(1.0433, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3490, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.3923, device='cuda:0', grad_fn=<AddBackward0>)



  1%|          | 2/354 [00:11<32:29,  5.54s/it][A

	loss_cls: tensor(0.8752, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4140, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2892, device='cuda:0', grad_fn=<AddBackward0>)



  1%|          | 3/354 [00:16<32:15,  5.51s/it][A

	loss_cls: tensor(0.6548, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2776, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9323, device='cuda:0', grad_fn=<AddBackward0>)



  1%|          | 4/354 [00:22<32:09,  5.51s/it][A

	loss_cls: tensor(0.4630, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2819, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7448, device='cuda:0', grad_fn=<AddBackward0>)



  1%|▏         | 5/354 [00:27<31:53,  5.48s/it][A

	loss_cls: tensor(0.4332, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2985, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7317, device='cuda:0', grad_fn=<AddBackward0>)



  2%|▏         | 6/354 [00:33<32:01,  5.52s/it][A

	loss_cls: tensor(0.4124, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4363, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8487, device='cuda:0', grad_fn=<AddBackward0>)



  2%|▏         | 7/354 [00:38<31:53,  5.52s/it][A

	loss_cls: tensor(0.5028, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1109, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6137, device='cuda:0', grad_fn=<AddBackward0>)



  2%|▏         | 8/354 [00:44<31:56,  5.54s/it][A

	loss_cls: tensor(0.5478, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1377, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6855, device='cuda:0', grad_fn=<AddBackward0>)



  3%|▎         | 9/354 [00:49<31:46,  5.53s/it][A

	loss_cls: tensor(0.4888, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2384, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7272, device='cuda:0', grad_fn=<AddBackward0>)



  3%|▎         | 10/354 [00:55<31:38,  5.52s/it][A

	loss_cls: tensor(0.4853, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2728, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7581, device='cuda:0', grad_fn=<AddBackward0>)



  3%|▎         | 11/354 [01:00<31:40,  5.54s/it][A

	loss_cls: tensor(0.7620, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2968, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0588, device='cuda:0', grad_fn=<AddBackward0>)



  3%|▎         | 12/354 [01:06<31:27,  5.52s/it][A

	loss_cls: tensor(0.3633, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2369, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6001, device='cuda:0', grad_fn=<AddBackward0>)



  4%|▎         | 13/354 [01:11<31:31,  5.55s/it][A

	loss_cls: tensor(0.5692, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2355, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8048, device='cuda:0', grad_fn=<AddBackward0>)



  4%|▍         | 14/354 [01:17<31:21,  5.53s/it][A

	loss_cls: tensor(0.6187, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2294, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8481, device='cuda:0', grad_fn=<AddBackward0>)



  4%|▍         | 15/354 [01:22<31:18,  5.54s/it][A

	loss_cls: tensor(0.4127, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2242, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6369, device='cuda:0', grad_fn=<AddBackward0>)



  5%|▍         | 16/354 [01:28<31:13,  5.54s/it][A

	loss_cls: tensor(0.5721, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3501, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9223, device='cuda:0', grad_fn=<AddBackward0>)



  5%|▍         | 17/354 [01:33<31:04,  5.53s/it][A

	loss_cls: tensor(0.5390, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1196, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6586, device='cuda:0', grad_fn=<AddBackward0>)



  5%|▌         | 18/354 [01:39<31:03,  5.55s/it][A

	loss_cls: tensor(0.4206, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2872, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7078, device='cuda:0', grad_fn=<AddBackward0>)



  5%|▌         | 19/354 [01:45<30:52,  5.53s/it][A

	loss_cls: tensor(0.4966, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2999, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7965, device='cuda:0', grad_fn=<AddBackward0>)



  6%|▌         | 20/354 [01:50<30:56,  5.56s/it][A

	loss_cls: tensor(0.6235, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3092, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9327, device='cuda:0', grad_fn=<AddBackward0>)



  6%|▌         | 21/354 [01:56<30:45,  5.54s/it][A

	loss_cls: tensor(0.8135, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1225, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9360, device='cuda:0', grad_fn=<AddBackward0>)



  6%|▌         | 22/354 [02:01<30:29,  5.51s/it][A

	loss_cls: tensor(0.4946, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0851, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5798, device='cuda:0', grad_fn=<AddBackward0>)



  6%|▋         | 23/354 [02:06<30:07,  5.46s/it][A

	loss_cls: tensor(0.4628, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2398, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7026, device='cuda:0', grad_fn=<AddBackward0>)



  7%|▋         | 24/354 [02:12<29:52,  5.43s/it][A

	loss_cls: tensor(0.8232, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2374, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0606, device='cuda:0', grad_fn=<AddBackward0>)



  7%|▋         | 25/354 [02:17<29:47,  5.43s/it][A

	loss_cls: tensor(0.5526, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4731, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0257, device='cuda:0', grad_fn=<AddBackward0>)



  7%|▋         | 26/354 [02:23<29:35,  5.41s/it][A

	loss_cls: tensor(0.7561, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3746, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1307, device='cuda:0', grad_fn=<AddBackward0>)



  8%|▊         | 27/354 [02:28<29:38,  5.44s/it][A

	loss_cls: tensor(0.5599, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1822, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7421, device='cuda:0', grad_fn=<AddBackward0>)



  8%|▊         | 28/354 [02:34<29:27,  5.42s/it][A

	loss_cls: tensor(0.2931, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0964, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.3895, device='cuda:0', grad_fn=<AddBackward0>)



  8%|▊         | 29/354 [02:39<29:24,  5.43s/it][A

	loss_cls: tensor(0.7238, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0558, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7797, device='cuda:0', grad_fn=<AddBackward0>)



  8%|▊         | 30/354 [02:44<29:13,  5.41s/it][A

	loss_cls: tensor(0.6750, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0627, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7376, device='cuda:0', grad_fn=<AddBackward0>)



  9%|▉         | 31/354 [02:50<29:01,  5.39s/it][A

	loss_cls: tensor(0.6924, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4808, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1731, device='cuda:0', grad_fn=<AddBackward0>)



  9%|▉         | 32/354 [02:55<28:58,  5.40s/it][A

	loss_cls: tensor(0.4225, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2444, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6670, device='cuda:0', grad_fn=<AddBackward0>)



  9%|▉         | 33/354 [03:00<28:50,  5.39s/it][A

	loss_cls: tensor(0.4544, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2028, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6572, device='cuda:0', grad_fn=<AddBackward0>)



 10%|▉         | 34/354 [03:06<28:53,  5.42s/it][A

	loss_cls: tensor(0.5559, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0408, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5967, device='cuda:0', grad_fn=<AddBackward0>)



 10%|▉         | 35/354 [03:11<28:42,  5.40s/it][A

	loss_cls: tensor(0.6012, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3015, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9027, device='cuda:0', grad_fn=<AddBackward0>)



 10%|█         | 36/354 [03:17<28:40,  5.41s/it][A

	loss_cls: tensor(0.7956, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2572, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0528, device='cuda:0', grad_fn=<AddBackward0>)



 10%|█         | 37/354 [03:22<28:29,  5.39s/it][A

	loss_cls: tensor(0.8590, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5973, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.4563, device='cuda:0', grad_fn=<AddBackward0>)



 11%|█         | 38/354 [03:27<28:21,  5.38s/it][A

	loss_cls: tensor(0.8001, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3593, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1594, device='cuda:0', grad_fn=<AddBackward0>)



 11%|█         | 39/354 [03:33<28:20,  5.40s/it][A

	loss_cls: tensor(0.5130, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0717, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5847, device='cuda:0', grad_fn=<AddBackward0>)



 11%|█▏        | 40/354 [03:38<28:12,  5.39s/it][A

	loss_cls: tensor(0.4188, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2069, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6257, device='cuda:0', grad_fn=<AddBackward0>)



 12%|█▏        | 41/354 [03:44<28:17,  5.42s/it][A

	loss_cls: tensor(0.5112, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2121, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7234, device='cuda:0', grad_fn=<AddBackward0>)



 12%|█▏        | 42/354 [03:49<28:08,  5.41s/it][A

	loss_cls: tensor(0.6801, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1403, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8204, device='cuda:0', grad_fn=<AddBackward0>)



 12%|█▏        | 43/354 [03:55<28:04,  5.42s/it][A

	loss_cls: tensor(0.4545, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3761, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8306, device='cuda:0', grad_fn=<AddBackward0>)



 12%|█▏        | 44/354 [04:00<27:52,  5.39s/it][A

	loss_cls: tensor(0.8489, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1508, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9997, device='cuda:0', grad_fn=<AddBackward0>)



 13%|█▎        | 45/354 [04:05<27:43,  5.38s/it][A

	loss_cls: tensor(0.6180, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1176, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7356, device='cuda:0', grad_fn=<AddBackward0>)



 13%|█▎        | 46/354 [04:11<27:43,  5.40s/it][A

	loss_cls: tensor(0.5294, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3936, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9230, device='cuda:0', grad_fn=<AddBackward0>)



 13%|█▎        | 47/354 [04:16<27:36,  5.40s/it][A

	loss_cls: tensor(0.4400, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1891, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6291, device='cuda:0', grad_fn=<AddBackward0>)



 14%|█▎        | 48/354 [04:22<27:36,  5.41s/it][A

	loss_cls: tensor(0.6016, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1665, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7681, device='cuda:0', grad_fn=<AddBackward0>)



 14%|█▍        | 49/354 [04:27<27:26,  5.40s/it][A

	loss_cls: tensor(0.3410, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1175, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4585, device='cuda:0', grad_fn=<AddBackward0>)



 14%|█▍        | 50/354 [04:32<27:24,  5.41s/it][A

	loss_cls: tensor(0.6886, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1131, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8016, device='cuda:0', grad_fn=<AddBackward0>)



 14%|█▍        | 51/354 [04:38<27:12,  5.39s/it][A

	loss_cls: tensor(0.8100, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3157, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1257, device='cuda:0', grad_fn=<AddBackward0>)



 15%|█▍        | 52/354 [04:43<27:04,  5.38s/it][A

	loss_cls: tensor(0.5553, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0894, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6447, device='cuda:0', grad_fn=<AddBackward0>)



 15%|█▍        | 53/354 [04:49<27:07,  5.41s/it][A

	loss_cls: tensor(0.5911, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1074, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6986, device='cuda:0', grad_fn=<AddBackward0>)



 15%|█▌        | 54/354 [04:54<27:01,  5.40s/it][A

	loss_cls: tensor(0.5200, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2325, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7525, device='cuda:0', grad_fn=<AddBackward0>)



 16%|█▌        | 55/354 [04:59<27:00,  5.42s/it][A

	loss_cls: tensor(0.7615, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2441, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0056, device='cuda:0', grad_fn=<AddBackward0>)



 16%|█▌        | 56/354 [05:05<26:47,  5.39s/it][A

	loss_cls: tensor(0.3950, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2804, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6754, device='cuda:0', grad_fn=<AddBackward0>)



 16%|█▌        | 57/354 [05:10<26:44,  5.40s/it][A

	loss_cls: tensor(0.5289, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0592, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5881, device='cuda:0', grad_fn=<AddBackward0>)



 16%|█▋        | 58/354 [05:16<26:37,  5.40s/it][A

	loss_cls: tensor(0.4119, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1196, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5315, device='cuda:0', grad_fn=<AddBackward0>)



 17%|█▋        | 59/354 [05:21<26:33,  5.40s/it][A

	loss_cls: tensor(0.5450, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2801, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8250, device='cuda:0', grad_fn=<AddBackward0>)



 17%|█▋        | 60/354 [05:26<26:27,  5.40s/it][A

	loss_cls: tensor(0.6041, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0923, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6964, device='cuda:0', grad_fn=<AddBackward0>)



 17%|█▋        | 61/354 [05:32<26:19,  5.39s/it][A

	loss_cls: tensor(0.8226, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2864, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1090, device='cuda:0', grad_fn=<AddBackward0>)



 18%|█▊        | 62/354 [05:37<26:18,  5.41s/it][A

	loss_cls: tensor(0.7547, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4187, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1734, device='cuda:0', grad_fn=<AddBackward0>)



 18%|█▊        | 63/354 [05:43<26:10,  5.40s/it][A

	loss_cls: tensor(0.5756, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3267, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9023, device='cuda:0', grad_fn=<AddBackward0>)



 18%|█▊        | 64/354 [05:48<26:08,  5.41s/it][A

	loss_cls: tensor(0.7564, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3689, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1253, device='cuda:0', grad_fn=<AddBackward0>)



 18%|█▊        | 65/354 [05:53<25:55,  5.38s/it][A

	loss_cls: tensor(0.4851, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0399, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5250, device='cuda:0', grad_fn=<AddBackward0>)



 19%|█▊        | 66/354 [05:59<25:56,  5.41s/it][A

	loss_cls: tensor(0.5076, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1456, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6533, device='cuda:0', grad_fn=<AddBackward0>)



 19%|█▉        | 67/354 [06:04<25:49,  5.40s/it][A

	loss_cls: tensor(0.6994, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1897, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8892, device='cuda:0', grad_fn=<AddBackward0>)



 19%|█▉        | 68/354 [06:09<25:41,  5.39s/it][A

	loss_cls: tensor(0.8561, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3862, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2422, device='cuda:0', grad_fn=<AddBackward0>)



 19%|█▉        | 69/354 [06:15<25:41,  5.41s/it][A

	loss_cls: tensor(0.6607, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2213, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8820, device='cuda:0', grad_fn=<AddBackward0>)



 20%|█▉        | 70/354 [06:20<25:33,  5.40s/it][A

	loss_cls: tensor(0.5811, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2283, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8094, device='cuda:0', grad_fn=<AddBackward0>)



 20%|██        | 71/354 [06:26<25:32,  5.41s/it][A

	loss_cls: tensor(0.4405, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1039, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5444, device='cuda:0', grad_fn=<AddBackward0>)



 20%|██        | 72/354 [06:31<25:23,  5.40s/it][A

	loss_cls: tensor(0.6798, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0401, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7199, device='cuda:0', grad_fn=<AddBackward0>)



 21%|██        | 73/354 [06:37<25:24,  5.43s/it][A

	loss_cls: tensor(0.4621, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2654, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7275, device='cuda:0', grad_fn=<AddBackward0>)



 21%|██        | 74/354 [06:42<25:17,  5.42s/it][A

	loss_cls: tensor(0.6607, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2387, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8994, device='cuda:0', grad_fn=<AddBackward0>)



 21%|██        | 75/354 [06:47<25:13,  5.42s/it][A

	loss_cls: tensor(1.1101, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2156, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.3258, device='cuda:0', grad_fn=<AddBackward0>)



 21%|██▏       | 76/354 [06:53<25:10,  5.44s/it][A

	loss_cls: tensor(0.6058, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0498, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6556, device='cuda:0', grad_fn=<AddBackward0>)



 22%|██▏       | 77/354 [06:58<25:00,  5.42s/it][A

	loss_cls: tensor(0.4881, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3149, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8030, device='cuda:0', grad_fn=<AddBackward0>)



 22%|██▏       | 78/354 [07:04<24:55,  5.42s/it][A

	loss_cls: tensor(0.5914, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1374, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7288, device='cuda:0', grad_fn=<AddBackward0>)



 22%|██▏       | 79/354 [07:09<24:46,  5.40s/it][A

	loss_cls: tensor(1.1849, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1463, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.3312, device='cuda:0', grad_fn=<AddBackward0>)



 23%|██▎       | 80/354 [07:15<24:46,  5.42s/it][A

	loss_cls: tensor(0.4357, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2058, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6415, device='cuda:0', grad_fn=<AddBackward0>)



 23%|██▎       | 81/354 [07:20<24:35,  5.41s/it][A

	loss_cls: tensor(0.4590, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1485, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6076, device='cuda:0', grad_fn=<AddBackward0>)



 23%|██▎       | 82/354 [07:25<24:26,  5.39s/it][A

	loss_cls: tensor(0.4479, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2528, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7007, device='cuda:0', grad_fn=<AddBackward0>)



 23%|██▎       | 83/354 [07:31<24:24,  5.40s/it][A

	loss_cls: tensor(0.4080, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0566, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4646, device='cuda:0', grad_fn=<AddBackward0>)



 24%|██▎       | 84/354 [07:36<24:15,  5.39s/it][A

	loss_cls: tensor(0.5795, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0390, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6185, device='cuda:0', grad_fn=<AddBackward0>)



 24%|██▍       | 85/354 [07:41<24:13,  5.40s/it][A

	loss_cls: tensor(0.7092, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3095, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0187, device='cuda:0', grad_fn=<AddBackward0>)



 24%|██▍       | 86/354 [07:47<24:04,  5.39s/it][A

	loss_cls: tensor(0.5270, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3034, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8304, device='cuda:0', grad_fn=<AddBackward0>)



 25%|██▍       | 87/354 [07:52<24:03,  5.41s/it][A

	loss_cls: tensor(0.5871, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2825, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8696, device='cuda:0', grad_fn=<AddBackward0>)



 25%|██▍       | 88/354 [07:58<23:53,  5.39s/it][A

	loss_cls: tensor(0.5306, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0393, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5699, device='cuda:0', grad_fn=<AddBackward0>)



 25%|██▌       | 89/354 [08:03<23:46,  5.38s/it][A

	loss_cls: tensor(0.7160, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2586, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9746, device='cuda:0', grad_fn=<AddBackward0>)



 25%|██▌       | 90/354 [08:08<23:44,  5.40s/it][A

	loss_cls: tensor(0.4218, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0608, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4826, device='cuda:0', grad_fn=<AddBackward0>)



 26%|██▌       | 91/354 [08:14<23:35,  5.38s/it][A

	loss_cls: tensor(0.6948, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3069, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0017, device='cuda:0', grad_fn=<AddBackward0>)



 26%|██▌       | 92/354 [08:19<23:34,  5.40s/it][A

	loss_cls: tensor(0.4975, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2609, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7583, device='cuda:0', grad_fn=<AddBackward0>)



 26%|██▋       | 93/354 [08:25<23:27,  5.39s/it][A

	loss_cls: tensor(0.6086, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3898, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9983, device='cuda:0', grad_fn=<AddBackward0>)



 27%|██▋       | 94/354 [08:30<23:27,  5.41s/it][A

	loss_cls: tensor(0.8241, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3531, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1772, device='cuda:0', grad_fn=<AddBackward0>)



 27%|██▋       | 95/354 [08:35<23:16,  5.39s/it][A

	loss_cls: tensor(0.5666, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1424, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7090, device='cuda:0', grad_fn=<AddBackward0>)



 27%|██▋       | 96/354 [08:41<23:07,  5.38s/it][A

	loss_cls: tensor(0.5152, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2776, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7927, device='cuda:0', grad_fn=<AddBackward0>)



 27%|██▋       | 97/354 [08:46<23:06,  5.39s/it][A

	loss_cls: tensor(0.5689, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0380, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6068, device='cuda:0', grad_fn=<AddBackward0>)



 28%|██▊       | 98/354 [08:52<22:57,  5.38s/it][A

	loss_cls: tensor(0.7831, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1718, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9549, device='cuda:0', grad_fn=<AddBackward0>)



 28%|██▊       | 99/354 [08:57<22:58,  5.40s/it][A

	loss_cls: tensor(0.5143, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1076, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6219, device='cuda:0', grad_fn=<AddBackward0>)



 28%|██▊       | 100/354 [09:02<22:49,  5.39s/it][A

	loss_cls: tensor(0.8302, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2380, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0682, device='cuda:0', grad_fn=<AddBackward0>)



 29%|██▊       | 101/354 [09:08<22:48,  5.41s/it][A

	loss_cls: tensor(0.5327, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2493, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7820, device='cuda:0', grad_fn=<AddBackward0>)



 29%|██▉       | 102/354 [09:13<22:42,  5.41s/it][A

	loss_cls: tensor(1.0160, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1412, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1572, device='cuda:0', grad_fn=<AddBackward0>)



 29%|██▉       | 103/354 [09:19<22:33,  5.39s/it][A

	loss_cls: tensor(0.6271, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0921, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7193, device='cuda:0', grad_fn=<AddBackward0>)



 29%|██▉       | 104/354 [09:24<22:32,  5.41s/it][A

	loss_cls: tensor(0.5039, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1481, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6520, device='cuda:0', grad_fn=<AddBackward0>)



 30%|██▉       | 105/354 [09:29<22:22,  5.39s/it][A

	loss_cls: tensor(0.6399, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1795, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8193, device='cuda:0', grad_fn=<AddBackward0>)



 30%|██▉       | 106/354 [09:35<22:22,  5.42s/it][A

	loss_cls: tensor(0.6231, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1321, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7552, device='cuda:0', grad_fn=<AddBackward0>)



 30%|███       | 107/354 [09:40<22:14,  5.40s/it][A

	loss_cls: tensor(0.3924, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2080, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6004, device='cuda:0', grad_fn=<AddBackward0>)



 31%|███       | 108/354 [09:46<22:14,  5.42s/it][A

	loss_cls: tensor(0.3995, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0509, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4504, device='cuda:0', grad_fn=<AddBackward0>)



 31%|███       | 109/354 [09:51<22:05,  5.41s/it][A

	loss_cls: tensor(0.5300, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2156, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7456, device='cuda:0', grad_fn=<AddBackward0>)



 31%|███       | 110/354 [09:56<21:53,  5.38s/it][A

	loss_cls: tensor(0.3593, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2282, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5874, device='cuda:0', grad_fn=<AddBackward0>)



 31%|███▏      | 111/354 [10:02<21:51,  5.40s/it][A

	loss_cls: tensor(0.3112, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0628, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.3740, device='cuda:0', grad_fn=<AddBackward0>)



 32%|███▏      | 112/354 [10:07<21:41,  5.38s/it][A

	loss_cls: tensor(0.3971, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1106, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5077, device='cuda:0', grad_fn=<AddBackward0>)



 32%|███▏      | 113/354 [10:13<21:46,  5.42s/it][A

	loss_cls: tensor(0.5767, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2567, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8334, device='cuda:0', grad_fn=<AddBackward0>)



 32%|███▏      | 114/354 [10:18<21:38,  5.41s/it][A

	loss_cls: tensor(0.6042, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5032, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1074, device='cuda:0', grad_fn=<AddBackward0>)



 32%|███▏      | 115/354 [10:24<21:43,  5.46s/it][A

	loss_cls: tensor(0.8110, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2809, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0919, device='cuda:0', grad_fn=<AddBackward0>)



 33%|███▎      | 116/354 [10:29<21:52,  5.51s/it][A

	loss_cls: tensor(0.5405, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1802, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7207, device='cuda:0', grad_fn=<AddBackward0>)



 33%|███▎      | 117/354 [10:35<22:26,  5.68s/it][A

	loss_cls: tensor(0.8947, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1731, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0678, device='cuda:0', grad_fn=<AddBackward0>)



 33%|███▎      | 118/354 [10:41<22:39,  5.76s/it][A

	loss_cls: tensor(0.9352, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3048, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2401, device='cuda:0', grad_fn=<AddBackward0>)



 34%|███▎      | 119/354 [10:47<22:21,  5.71s/it][A

	loss_cls: tensor(0.6341, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2584, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8926, device='cuda:0', grad_fn=<AddBackward0>)



 34%|███▍      | 120/354 [10:53<22:16,  5.71s/it][A

	loss_cls: tensor(0.5705, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1959, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7664, device='cuda:0', grad_fn=<AddBackward0>)



 34%|███▍      | 121/354 [10:58<22:08,  5.70s/it][A

	loss_cls: tensor(0.7998, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3021, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1019, device='cuda:0', grad_fn=<AddBackward0>)



 34%|███▍      | 122/354 [11:04<22:06,  5.72s/it][A

	loss_cls: tensor(0.3983, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2604, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6587, device='cuda:0', grad_fn=<AddBackward0>)



 35%|███▍      | 123/354 [11:10<21:44,  5.65s/it][A

	loss_cls: tensor(0.9055, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2531, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1586, device='cuda:0', grad_fn=<AddBackward0>)



 35%|███▌      | 124/354 [11:15<21:34,  5.63s/it][A

	loss_cls: tensor(0.6747, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1715, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8462, device='cuda:0', grad_fn=<AddBackward0>)



 35%|███▌      | 125/354 [11:21<21:34,  5.65s/it][A

	loss_cls: tensor(0.7287, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4516, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1804, device='cuda:0', grad_fn=<AddBackward0>)



 36%|███▌      | 126/354 [11:26<21:19,  5.61s/it][A

	loss_cls: tensor(0.7012, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1488, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8500, device='cuda:0', grad_fn=<AddBackward0>)



 36%|███▌      | 127/354 [11:32<21:07,  5.59s/it][A

	loss_cls: tensor(0.5666, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1794, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7460, device='cuda:0', grad_fn=<AddBackward0>)



 36%|███▌      | 128/354 [11:37<20:49,  5.53s/it][A

	loss_cls: tensor(0.4733, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0670, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5403, device='cuda:0', grad_fn=<AddBackward0>)



 36%|███▋      | 129/354 [11:43<20:40,  5.51s/it][A

	loss_cls: tensor(0.6462, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1176, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7639, device='cuda:0', grad_fn=<AddBackward0>)



 37%|███▋      | 130/354 [11:48<20:26,  5.47s/it][A

	loss_cls: tensor(0.4735, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2170, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6905, device='cuda:0', grad_fn=<AddBackward0>)



 37%|███▋      | 131/354 [11:53<20:14,  5.45s/it][A

	loss_cls: tensor(0.4241, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3855, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8096, device='cuda:0', grad_fn=<AddBackward0>)



 37%|███▋      | 132/354 [11:59<20:10,  5.45s/it][A

	loss_cls: tensor(0.5794, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1788, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7582, device='cuda:0', grad_fn=<AddBackward0>)



 38%|███▊      | 133/354 [12:04<19:59,  5.43s/it][A

	loss_cls: tensor(0.4469, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1585, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6055, device='cuda:0', grad_fn=<AddBackward0>)



 38%|███▊      | 134/354 [12:10<19:57,  5.44s/it][A

	loss_cls: tensor(0.4190, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1223, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5413, device='cuda:0', grad_fn=<AddBackward0>)



 38%|███▊      | 135/354 [12:15<19:47,  5.42s/it][A

	loss_cls: tensor(0.4002, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1030, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5032, device='cuda:0', grad_fn=<AddBackward0>)



 38%|███▊      | 136/354 [12:21<19:43,  5.43s/it][A

	loss_cls: tensor(0.7276, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1180, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8457, device='cuda:0', grad_fn=<AddBackward0>)



 39%|███▊      | 137/354 [12:26<19:32,  5.40s/it][A

	loss_cls: tensor(0.5141, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1925, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7066, device='cuda:0', grad_fn=<AddBackward0>)



 39%|███▉      | 138/354 [12:31<19:23,  5.38s/it][A

	loss_cls: tensor(0.8420, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1974, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0394, device='cuda:0', grad_fn=<AddBackward0>)



 39%|███▉      | 139/354 [12:37<19:21,  5.40s/it][A

	loss_cls: tensor(0.8738, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5357, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.4096, device='cuda:0', grad_fn=<AddBackward0>)



 40%|███▉      | 140/354 [12:42<19:13,  5.39s/it][A

	loss_cls: tensor(0.4606, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2531, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7137, device='cuda:0', grad_fn=<AddBackward0>)



 40%|███▉      | 141/354 [12:48<19:11,  5.41s/it][A

	loss_cls: tensor(0.6247, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0441, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6688, device='cuda:0', grad_fn=<AddBackward0>)



 40%|████      | 142/354 [12:53<19:01,  5.38s/it][A

	loss_cls: tensor(0.5194, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4318, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9512, device='cuda:0', grad_fn=<AddBackward0>)



 40%|████      | 143/354 [12:58<19:01,  5.41s/it][A

	loss_cls: tensor(0.5257, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2220, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7477, device='cuda:0', grad_fn=<AddBackward0>)



 41%|████      | 144/354 [13:04<18:54,  5.40s/it][A

	loss_cls: tensor(0.5162, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1831, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6993, device='cuda:0', grad_fn=<AddBackward0>)



 41%|████      | 145/354 [13:09<18:46,  5.39s/it][A

	loss_cls: tensor(0.5168, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2570, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7738, device='cuda:0', grad_fn=<AddBackward0>)



 41%|████      | 146/354 [13:15<18:45,  5.41s/it][A

	loss_cls: tensor(0.8309, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3553, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1863, device='cuda:0', grad_fn=<AddBackward0>)



 42%|████▏     | 147/354 [13:20<18:36,  5.40s/it][A

	loss_cls: tensor(0.5861, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2832, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8694, device='cuda:0', grad_fn=<AddBackward0>)



 42%|████▏     | 148/354 [13:25<18:34,  5.41s/it][A

	loss_cls: tensor(0.4885, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1115, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6000, device='cuda:0', grad_fn=<AddBackward0>)



 42%|████▏     | 149/354 [13:31<18:26,  5.40s/it][A

	loss_cls: tensor(0.3829, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3666, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7495, device='cuda:0', grad_fn=<AddBackward0>)



 42%|████▏     | 150/354 [13:36<18:24,  5.41s/it][A

	loss_cls: tensor(0.5012, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0728, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5740, device='cuda:0', grad_fn=<AddBackward0>)



 43%|████▎     | 151/354 [13:42<18:15,  5.40s/it][A

	loss_cls: tensor(0.6232, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2327, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8559, device='cuda:0', grad_fn=<AddBackward0>)



 43%|████▎     | 152/354 [13:47<18:05,  5.37s/it][A

	loss_cls: tensor(0.6992, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4639, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1631, device='cuda:0', grad_fn=<AddBackward0>)



 43%|████▎     | 153/354 [13:52<18:02,  5.38s/it][A

	loss_cls: tensor(0.5504, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0548, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6052, device='cuda:0', grad_fn=<AddBackward0>)



 44%|████▎     | 154/354 [13:58<17:54,  5.37s/it][A

	loss_cls: tensor(0.6448, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5684, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2132, device='cuda:0', grad_fn=<AddBackward0>)



 44%|████▍     | 155/354 [14:03<17:54,  5.40s/it][A

	loss_cls: tensor(0.4609, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1120, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5729, device='cuda:0', grad_fn=<AddBackward0>)



 44%|████▍     | 156/354 [14:08<17:45,  5.38s/it][A

	loss_cls: tensor(0.6514, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2725, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9240, device='cuda:0', grad_fn=<AddBackward0>)



 44%|████▍     | 157/354 [14:14<17:42,  5.39s/it][A

	loss_cls: tensor(0.4381, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1612, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5993, device='cuda:0', grad_fn=<AddBackward0>)



 45%|████▍     | 158/354 [14:19<17:34,  5.38s/it][A

	loss_cls: tensor(0.4352, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1380, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5732, device='cuda:0', grad_fn=<AddBackward0>)



 45%|████▍     | 159/354 [14:25<17:29,  5.38s/it][A

	loss_cls: tensor(0.7339, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2688, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0027, device='cuda:0', grad_fn=<AddBackward0>)



 45%|████▌     | 160/354 [14:30<17:23,  5.38s/it][A

	loss_cls: tensor(0.6164, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2215, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8380, device='cuda:0', grad_fn=<AddBackward0>)



 45%|████▌     | 161/354 [14:35<17:17,  5.38s/it][A

	loss_cls: tensor(0.3698, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3782, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7480, device='cuda:0', grad_fn=<AddBackward0>)



 46%|████▌     | 162/354 [14:41<17:15,  5.40s/it][A

	loss_cls: tensor(0.5225, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0480, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5705, device='cuda:0', grad_fn=<AddBackward0>)



 46%|████▌     | 163/354 [14:46<17:08,  5.38s/it][A

	loss_cls: tensor(0.6944, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4689, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1633, device='cuda:0', grad_fn=<AddBackward0>)



 46%|████▋     | 164/354 [14:52<17:05,  5.40s/it][A

	loss_cls: tensor(0.6756, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1767, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8523, device='cuda:0', grad_fn=<AddBackward0>)



 47%|████▋     | 165/354 [14:57<16:56,  5.38s/it][A

	loss_cls: tensor(0.5148, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3348, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8496, device='cuda:0', grad_fn=<AddBackward0>)



 47%|████▋     | 166/354 [15:02<16:53,  5.39s/it][A

	loss_cls: tensor(0.6979, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1282, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8261, device='cuda:0', grad_fn=<AddBackward0>)



 47%|████▋     | 167/354 [15:08<16:46,  5.38s/it][A

	loss_cls: tensor(0.5464, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1630, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7094, device='cuda:0', grad_fn=<AddBackward0>)



 47%|████▋     | 168/354 [15:13<16:40,  5.38s/it][A

	loss_cls: tensor(0.8214, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3601, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1816, device='cuda:0', grad_fn=<AddBackward0>)



 48%|████▊     | 169/354 [15:18<16:39,  5.41s/it][A

	loss_cls: tensor(0.5240, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2004, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7244, device='cuda:0', grad_fn=<AddBackward0>)



 48%|████▊     | 170/354 [15:24<16:32,  5.39s/it][A

	loss_cls: tensor(0.5568, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5087, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0655, device='cuda:0', grad_fn=<AddBackward0>)



 48%|████▊     | 171/354 [15:29<16:28,  5.40s/it][A

	loss_cls: tensor(0.5407, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2789, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8196, device='cuda:0', grad_fn=<AddBackward0>)



 49%|████▊     | 172/354 [15:35<16:18,  5.38s/it][A

	loss_cls: tensor(0.5829, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1509, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7338, device='cuda:0', grad_fn=<AddBackward0>)



 49%|████▉     | 173/354 [15:40<16:15,  5.39s/it][A

	loss_cls: tensor(0.4571, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1211, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5782, device='cuda:0', grad_fn=<AddBackward0>)



 49%|████▉     | 174/354 [15:45<16:05,  5.36s/it][A

	loss_cls: tensor(0.7195, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4040, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1235, device='cuda:0', grad_fn=<AddBackward0>)



 49%|████▉     | 175/354 [15:51<15:56,  5.34s/it][A

	loss_cls: tensor(0.4606, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0500, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5106, device='cuda:0', grad_fn=<AddBackward0>)



 50%|████▉     | 176/354 [15:56<15:54,  5.36s/it][A

	loss_cls: tensor(0.4639, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1327, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5966, device='cuda:0', grad_fn=<AddBackward0>)



 50%|█████     | 177/354 [16:01<15:48,  5.36s/it][A

	loss_cls: tensor(0.5470, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2568, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8038, device='cuda:0', grad_fn=<AddBackward0>)



 50%|█████     | 178/354 [16:07<15:44,  5.37s/it][A

	loss_cls: tensor(0.4902, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1323, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6225, device='cuda:0', grad_fn=<AddBackward0>)



 51%|█████     | 179/354 [16:12<15:37,  5.36s/it][A

	loss_cls: tensor(0.6391, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1626, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8017, device='cuda:0', grad_fn=<AddBackward0>)



 51%|█████     | 180/354 [16:17<15:35,  5.38s/it][A

	loss_cls: tensor(0.4489, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3606, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8095, device='cuda:0', grad_fn=<AddBackward0>)



 51%|█████     | 181/354 [16:23<15:28,  5.37s/it][A

	loss_cls: tensor(0.5566, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2992, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8558, device='cuda:0', grad_fn=<AddBackward0>)



 51%|█████▏    | 182/354 [16:28<15:25,  5.38s/it][A

	loss_cls: tensor(0.5423, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1090, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6513, device='cuda:0', grad_fn=<AddBackward0>)



 52%|█████▏    | 183/354 [16:34<15:23,  5.40s/it][A

	loss_cls: tensor(0.4839, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2028, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6867, device='cuda:0', grad_fn=<AddBackward0>)



 52%|█████▏    | 184/354 [16:39<15:19,  5.41s/it][A

	loss_cls: tensor(0.7187, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1146, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8333, device='cuda:0', grad_fn=<AddBackward0>)



 52%|█████▏    | 185/354 [16:45<15:15,  5.42s/it][A

	loss_cls: tensor(0.7573, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5948, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.3521, device='cuda:0', grad_fn=<AddBackward0>)



 53%|█████▎    | 186/354 [16:50<15:04,  5.38s/it][A

	loss_cls: tensor(0.7492, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2533, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0025, device='cuda:0', grad_fn=<AddBackward0>)



 53%|█████▎    | 187/354 [16:55<15:00,  5.39s/it][A

	loss_cls: tensor(0.5011, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1719, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6730, device='cuda:0', grad_fn=<AddBackward0>)



 53%|█████▎    | 188/354 [17:01<14:52,  5.38s/it][A

	loss_cls: tensor(0.3572, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4160, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7732, device='cuda:0', grad_fn=<AddBackward0>)



 53%|█████▎    | 189/354 [17:06<14:45,  5.37s/it][A

	loss_cls: tensor(0.3741, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0443, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4184, device='cuda:0', grad_fn=<AddBackward0>)



 54%|█████▎    | 190/354 [17:11<14:43,  5.39s/it][A

	loss_cls: tensor(0.2678, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1832, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4510, device='cuda:0', grad_fn=<AddBackward0>)



 54%|█████▍    | 191/354 [17:17<14:36,  5.38s/it][A

	loss_cls: tensor(0.4134, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2853, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6987, device='cuda:0', grad_fn=<AddBackward0>)



 54%|█████▍    | 192/354 [17:22<14:32,  5.39s/it][A

	loss_cls: tensor(0.6079, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1920, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7999, device='cuda:0', grad_fn=<AddBackward0>)



 55%|█████▍    | 193/354 [17:28<14:27,  5.39s/it][A

	loss_cls: tensor(0.7355, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4089, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1444, device='cuda:0', grad_fn=<AddBackward0>)



 55%|█████▍    | 194/354 [17:33<14:23,  5.40s/it][A

	loss_cls: tensor(0.6484, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2763, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9247, device='cuda:0', grad_fn=<AddBackward0>)



 55%|█████▌    | 195/354 [17:38<14:18,  5.40s/it][A

	loss_cls: tensor(0.4915, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2579, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7494, device='cuda:0', grad_fn=<AddBackward0>)



 55%|█████▌    | 196/354 [17:44<14:09,  5.38s/it][A

	loss_cls: tensor(1.0139, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2056, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2195, device='cuda:0', grad_fn=<AddBackward0>)



 56%|█████▌    | 197/354 [17:49<14:05,  5.39s/it][A

	loss_cls: tensor(0.3787, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3021, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6808, device='cuda:0', grad_fn=<AddBackward0>)



 56%|█████▌    | 198/354 [17:54<13:55,  5.35s/it][A

	loss_cls: tensor(0.4205, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1417, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5622, device='cuda:0', grad_fn=<AddBackward0>)



 56%|█████▌    | 199/354 [18:00<13:52,  5.37s/it][A

	loss_cls: tensor(0.5921, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2560, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8481, device='cuda:0', grad_fn=<AddBackward0>)



 56%|█████▋    | 200/354 [18:05<13:45,  5.36s/it][A

	loss_cls: tensor(0.9608, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1756, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1364, device='cuda:0', grad_fn=<AddBackward0>)



 57%|█████▋    | 201/354 [18:11<13:42,  5.37s/it][A

	loss_cls: tensor(0.5155, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1207, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6362, device='cuda:0', grad_fn=<AddBackward0>)



 57%|█████▋    | 202/354 [18:16<13:35,  5.36s/it][A

	loss_cls: tensor(0.5249, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1920, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7169, device='cuda:0', grad_fn=<AddBackward0>)



 57%|█████▋    | 203/354 [18:21<13:28,  5.35s/it][A

	loss_cls: tensor(0.5116, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1715, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6831, device='cuda:0', grad_fn=<AddBackward0>)



 58%|█████▊    | 204/354 [18:27<13:26,  5.38s/it][A

	loss_cls: tensor(0.5179, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2737, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7916, device='cuda:0', grad_fn=<AddBackward0>)



 58%|█████▊    | 205/354 [18:32<13:18,  5.36s/it][A

	loss_cls: tensor(0.3991, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0466, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4457, device='cuda:0', grad_fn=<AddBackward0>)



 58%|█████▊    | 206/354 [18:37<13:16,  5.38s/it][A

	loss_cls: tensor(0.5449, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1551, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7000, device='cuda:0', grad_fn=<AddBackward0>)



 58%|█████▊    | 207/354 [18:43<13:10,  5.38s/it][A

	loss_cls: tensor(0.6246, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1678, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7923, device='cuda:0', grad_fn=<AddBackward0>)



 59%|█████▉    | 208/354 [18:48<13:09,  5.41s/it][A

	loss_cls: tensor(0.4361, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1401, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5761, device='cuda:0', grad_fn=<AddBackward0>)



 59%|█████▉    | 209/354 [18:54<13:00,  5.38s/it][A

	loss_cls: tensor(0.5544, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0680, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6225, device='cuda:0', grad_fn=<AddBackward0>)



 59%|█████▉    | 210/354 [18:59<12:53,  5.37s/it][A

	loss_cls: tensor(0.6042, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2658, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8700, device='cuda:0', grad_fn=<AddBackward0>)



 60%|█████▉    | 211/354 [19:04<12:50,  5.38s/it][A

	loss_cls: tensor(0.5336, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0369, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5705, device='cuda:0', grad_fn=<AddBackward0>)



 60%|█████▉    | 212/354 [19:10<12:42,  5.37s/it][A

	loss_cls: tensor(0.5663, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1904, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7567, device='cuda:0', grad_fn=<AddBackward0>)



 60%|██████    | 213/354 [19:15<12:40,  5.39s/it][A

	loss_cls: tensor(0.8909, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2426, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1335, device='cuda:0', grad_fn=<AddBackward0>)



 60%|██████    | 214/354 [19:21<12:35,  5.40s/it][A

	loss_cls: tensor(0.6081, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1877, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7958, device='cuda:0', grad_fn=<AddBackward0>)



 61%|██████    | 215/354 [19:26<12:30,  5.40s/it][A

	loss_cls: tensor(0.5568, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0664, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6232, device='cuda:0', grad_fn=<AddBackward0>)



 61%|██████    | 216/354 [19:31<12:22,  5.38s/it][A

	loss_cls: tensor(0.5908, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.6346, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2254, device='cuda:0', grad_fn=<AddBackward0>)



 61%|██████▏   | 217/354 [19:37<12:15,  5.37s/it][A

	loss_cls: tensor(0.4786, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2881, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7667, device='cuda:0', grad_fn=<AddBackward0>)



 62%|██████▏   | 218/354 [19:42<12:12,  5.39s/it][A

	loss_cls: tensor(0.5296, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2746, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8041, device='cuda:0', grad_fn=<AddBackward0>)



 62%|██████▏   | 219/354 [19:47<12:06,  5.38s/it][A

	loss_cls: tensor(0.6464, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4153, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0617, device='cuda:0', grad_fn=<AddBackward0>)



 62%|██████▏   | 220/354 [19:53<12:03,  5.40s/it][A

	loss_cls: tensor(0.7238, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5025, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2263, device='cuda:0', grad_fn=<AddBackward0>)



 62%|██████▏   | 221/354 [19:58<11:56,  5.39s/it][A

	loss_cls: tensor(0.7989, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1654, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9643, device='cuda:0', grad_fn=<AddBackward0>)



 63%|██████▎   | 222/354 [20:04<11:52,  5.39s/it][A

	loss_cls: tensor(0.3729, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4854, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8582, device='cuda:0', grad_fn=<AddBackward0>)



 63%|██████▎   | 223/354 [20:09<11:43,  5.37s/it][A

	loss_cls: tensor(0.4633, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2312, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6945, device='cuda:0', grad_fn=<AddBackward0>)



 63%|██████▎   | 224/354 [20:14<11:36,  5.36s/it][A

	loss_cls: tensor(0.5017, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3087, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8103, device='cuda:0', grad_fn=<AddBackward0>)



 64%|██████▎   | 225/354 [20:20<11:32,  5.37s/it][A

	loss_cls: tensor(0.6378, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2516, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8894, device='cuda:0', grad_fn=<AddBackward0>)



 64%|██████▍   | 226/354 [20:25<11:26,  5.36s/it][A

	loss_cls: tensor(0.5288, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1854, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7143, device='cuda:0', grad_fn=<AddBackward0>)



 64%|██████▍   | 227/354 [20:30<11:22,  5.37s/it][A

	loss_cls: tensor(0.8114, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1090, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9204, device='cuda:0', grad_fn=<AddBackward0>)



 64%|██████▍   | 228/354 [20:36<11:14,  5.35s/it][A

	loss_cls: tensor(0.8543, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2914, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1457, device='cuda:0', grad_fn=<AddBackward0>)



 65%|██████▍   | 229/354 [20:41<11:10,  5.36s/it][A

	loss_cls: tensor(0.5895, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1030, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6924, device='cuda:0', grad_fn=<AddBackward0>)



 65%|██████▍   | 230/354 [20:46<11:03,  5.35s/it][A

	loss_cls: tensor(0.4655, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0636, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5290, device='cuda:0', grad_fn=<AddBackward0>)



 65%|██████▌   | 231/354 [20:52<10:55,  5.33s/it][A

	loss_cls: tensor(0.3981, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1699, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5680, device='cuda:0', grad_fn=<AddBackward0>)



 66%|██████▌   | 232/354 [20:57<10:51,  5.34s/it][A

	loss_cls: tensor(0.6293, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0879, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7172, device='cuda:0', grad_fn=<AddBackward0>)



 66%|██████▌   | 233/354 [21:02<10:44,  5.33s/it][A

	loss_cls: tensor(0.5131, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1753, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6884, device='cuda:0', grad_fn=<AddBackward0>)



 66%|██████▌   | 234/354 [21:08<10:42,  5.35s/it][A

	loss_cls: tensor(0.7592, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1906, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9498, device='cuda:0', grad_fn=<AddBackward0>)



 66%|██████▋   | 235/354 [21:13<10:36,  5.35s/it][A

	loss_cls: tensor(0.6570, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1529, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8099, device='cuda:0', grad_fn=<AddBackward0>)



 67%|██████▋   | 236/354 [21:19<10:38,  5.41s/it][A

	loss_cls: tensor(0.6747, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2850, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9597, device='cuda:0', grad_fn=<AddBackward0>)



 67%|██████▋   | 237/354 [21:24<10:36,  5.44s/it][A

	loss_cls: tensor(0.4582, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2301, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6883, device='cuda:0', grad_fn=<AddBackward0>)



 67%|██████▋   | 238/354 [21:30<10:28,  5.42s/it][A

	loss_cls: tensor(0.7195, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0471, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7666, device='cuda:0', grad_fn=<AddBackward0>)



 68%|██████▊   | 239/354 [21:35<10:27,  5.46s/it][A

	loss_cls: tensor(0.6757, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2387, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9144, device='cuda:0', grad_fn=<AddBackward0>)



 68%|██████▊   | 240/354 [21:41<10:24,  5.48s/it][A

	loss_cls: tensor(0.4972, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1767, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6739, device='cuda:0', grad_fn=<AddBackward0>)



 68%|██████▊   | 241/354 [21:46<10:22,  5.51s/it][A

	loss_cls: tensor(0.5895, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2051, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7946, device='cuda:0', grad_fn=<AddBackward0>)



 68%|██████▊   | 242/354 [21:52<10:17,  5.52s/it][A

	loss_cls: tensor(0.6155, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2180, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8335, device='cuda:0', grad_fn=<AddBackward0>)



 69%|██████▊   | 243/354 [21:57<10:15,  5.54s/it][A

	loss_cls: tensor(0.4346, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1551, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5897, device='cuda:0', grad_fn=<AddBackward0>)



 69%|██████▉   | 244/354 [22:03<10:08,  5.54s/it][A

	loss_cls: tensor(0.5718, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1914, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7633, device='cuda:0', grad_fn=<AddBackward0>)



 69%|██████▉   | 245/354 [22:08<10:01,  5.52s/it][A

	loss_cls: tensor(0.4144, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1195, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5339, device='cuda:0', grad_fn=<AddBackward0>)



 69%|██████▉   | 246/354 [22:14<09:58,  5.54s/it][A

	loss_cls: tensor(0.4803, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0352, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5156, device='cuda:0', grad_fn=<AddBackward0>)



 70%|██████▉   | 247/354 [22:19<09:52,  5.53s/it][A

	loss_cls: tensor(0.6095, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1656, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7752, device='cuda:0', grad_fn=<AddBackward0>)



 70%|███████   | 248/354 [22:25<09:49,  5.56s/it][A

	loss_cls: tensor(0.6240, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1807, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8047, device='cuda:0', grad_fn=<AddBackward0>)



 70%|███████   | 249/354 [22:31<09:41,  5.54s/it][A

	loss_cls: tensor(0.8519, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1721, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0239, device='cuda:0', grad_fn=<AddBackward0>)



 71%|███████   | 250/354 [22:36<09:37,  5.55s/it][A

	loss_cls: tensor(0.8643, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2405, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1048, device='cuda:0', grad_fn=<AddBackward0>)



 71%|███████   | 251/354 [22:42<09:27,  5.51s/it][A

	loss_cls: tensor(0.6690, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0537, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7227, device='cuda:0', grad_fn=<AddBackward0>)



 71%|███████   | 252/354 [22:47<09:21,  5.51s/it][A

	loss_cls: tensor(0.8068, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1686, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9754, device='cuda:0', grad_fn=<AddBackward0>)



 71%|███████▏  | 253/354 [22:53<09:19,  5.54s/it][A

	loss_cls: tensor(0.5543, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2288, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7831, device='cuda:0', grad_fn=<AddBackward0>)



 72%|███████▏  | 254/354 [22:58<09:13,  5.53s/it][A

	loss_cls: tensor(0.5168, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1473, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6641, device='cuda:0', grad_fn=<AddBackward0>)



 72%|███████▏  | 255/354 [23:04<09:08,  5.54s/it][A

	loss_cls: tensor(0.5053, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2033, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7086, device='cuda:0', grad_fn=<AddBackward0>)



 72%|███████▏  | 256/354 [23:09<09:02,  5.53s/it][A

	loss_cls: tensor(0.8095, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2112, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0207, device='cuda:0', grad_fn=<AddBackward0>)



 73%|███████▎  | 257/354 [23:15<08:58,  5.55s/it][A

	loss_cls: tensor(0.7861, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3016, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0877, device='cuda:0', grad_fn=<AddBackward0>)



 73%|███████▎  | 258/354 [23:20<08:51,  5.53s/it][A

	loss_cls: tensor(0.5479, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5612, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1091, device='cuda:0', grad_fn=<AddBackward0>)



 73%|███████▎  | 259/354 [23:26<08:46,  5.54s/it][A

	loss_cls: tensor(0.5731, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4809, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0540, device='cuda:0', grad_fn=<AddBackward0>)



 73%|███████▎  | 260/354 [23:31<08:39,  5.53s/it][A

	loss_cls: tensor(0.3440, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0605, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4045, device='cuda:0', grad_fn=<AddBackward0>)



 74%|███████▎  | 261/354 [23:37<08:30,  5.49s/it][A

	loss_cls: tensor(0.4901, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.6600, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1502, device='cuda:0', grad_fn=<AddBackward0>)



 74%|███████▍  | 262/354 [23:42<08:25,  5.49s/it][A

	loss_cls: tensor(0.4431, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1685, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6116, device='cuda:0', grad_fn=<AddBackward0>)



 74%|███████▍  | 263/354 [23:48<08:15,  5.45s/it][A

	loss_cls: tensor(0.6063, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0743, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6806, device='cuda:0', grad_fn=<AddBackward0>)



 75%|███████▍  | 264/354 [23:53<08:10,  5.45s/it][A

	loss_cls: tensor(0.7192, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2630, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9822, device='cuda:0', grad_fn=<AddBackward0>)



 75%|███████▍  | 265/354 [23:58<08:02,  5.42s/it][A

	loss_cls: tensor(0.5377, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1475, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6852, device='cuda:0', grad_fn=<AddBackward0>)



 75%|███████▌  | 266/354 [24:04<07:57,  5.43s/it][A

	loss_cls: tensor(0.5142, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4178, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9320, device='cuda:0', grad_fn=<AddBackward0>)



 75%|███████▌  | 267/354 [24:09<07:50,  5.41s/it][A

	loss_cls: tensor(0.6030, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2856, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8886, device='cuda:0', grad_fn=<AddBackward0>)



 76%|███████▌  | 268/354 [24:15<07:43,  5.39s/it][A

	loss_cls: tensor(0.6122, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1108, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7230, device='cuda:0', grad_fn=<AddBackward0>)



 76%|███████▌  | 269/354 [24:20<07:40,  5.41s/it][A

	loss_cls: tensor(0.4446, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1536, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5981, device='cuda:0', grad_fn=<AddBackward0>)



 76%|███████▋  | 270/354 [24:25<07:33,  5.40s/it][A

	loss_cls: tensor(0.7166, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1007, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8172, device='cuda:0', grad_fn=<AddBackward0>)



 77%|███████▋  | 271/354 [24:31<07:28,  5.41s/it][A

	loss_cls: tensor(0.4354, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2570, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6924, device='cuda:0', grad_fn=<AddBackward0>)



 77%|███████▋  | 272/354 [24:36<07:21,  5.39s/it][A

	loss_cls: tensor(0.5003, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0564, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5567, device='cuda:0', grad_fn=<AddBackward0>)



 77%|███████▋  | 273/354 [24:42<07:18,  5.41s/it][A

	loss_cls: tensor(0.5285, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1914, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7199, device='cuda:0', grad_fn=<AddBackward0>)



 77%|███████▋  | 274/354 [24:47<07:12,  5.40s/it][A

	loss_cls: tensor(0.6666, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2229, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8895, device='cuda:0', grad_fn=<AddBackward0>)



 78%|███████▊  | 275/354 [24:52<07:05,  5.39s/it][A

	loss_cls: tensor(0.5436, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0844, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6280, device='cuda:0', grad_fn=<AddBackward0>)



 78%|███████▊  | 276/354 [24:58<07:01,  5.40s/it][A

	loss_cls: tensor(0.5170, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1556, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6725, device='cuda:0', grad_fn=<AddBackward0>)



 78%|███████▊  | 277/354 [25:03<06:55,  5.39s/it][A

	loss_cls: tensor(0.7912, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5131, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.3044, device='cuda:0', grad_fn=<AddBackward0>)



 79%|███████▊  | 278/354 [25:09<06:50,  5.40s/it][A

	loss_cls: tensor(0.8195, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2391, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0587, device='cuda:0', grad_fn=<AddBackward0>)



 79%|███████▉  | 279/354 [25:14<06:44,  5.39s/it][A

	loss_cls: tensor(0.5553, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2484, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8037, device='cuda:0', grad_fn=<AddBackward0>)



 79%|███████▉  | 280/354 [25:19<06:40,  5.41s/it][A

	loss_cls: tensor(0.3881, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2103, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5984, device='cuda:0', grad_fn=<AddBackward0>)



 79%|███████▉  | 281/354 [25:25<06:34,  5.40s/it][A

	loss_cls: tensor(0.6610, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1493, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8103, device='cuda:0', grad_fn=<AddBackward0>)



 80%|███████▉  | 282/354 [25:30<06:28,  5.40s/it][A

	loss_cls: tensor(0.6521, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2070, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8591, device='cuda:0', grad_fn=<AddBackward0>)



 80%|███████▉  | 283/354 [25:36<06:24,  5.41s/it][A

	loss_cls: tensor(0.5995, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2079, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8075, device='cuda:0', grad_fn=<AddBackward0>)



 80%|████████  | 284/354 [25:41<06:17,  5.40s/it][A

	loss_cls: tensor(0.7974, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4408, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2383, device='cuda:0', grad_fn=<AddBackward0>)



 81%|████████  | 285/354 [25:47<06:15,  5.45s/it][A

	loss_cls: tensor(0.4570, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2508, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7078, device='cuda:0', grad_fn=<AddBackward0>)



 81%|████████  | 286/354 [25:52<06:11,  5.47s/it][A

	loss_cls: tensor(0.4664, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2478, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7142, device='cuda:0', grad_fn=<AddBackward0>)



 81%|████████  | 287/354 [25:58<06:09,  5.52s/it][A

	loss_cls: tensor(0.5338, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2656, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7993, device='cuda:0', grad_fn=<AddBackward0>)



 81%|████████▏ | 288/354 [26:03<06:03,  5.51s/it][A

	loss_cls: tensor(0.4336, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3843, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8179, device='cuda:0', grad_fn=<AddBackward0>)



 82%|████████▏ | 289/354 [26:09<05:58,  5.51s/it][A

	loss_cls: tensor(0.5025, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1200, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6225, device='cuda:0', grad_fn=<AddBackward0>)



 82%|████████▏ | 290/354 [26:14<05:53,  5.52s/it][A

	loss_cls: tensor(0.5386, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3378, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8764, device='cuda:0', grad_fn=<AddBackward0>)



 82%|████████▏ | 291/354 [26:20<05:46,  5.50s/it][A

	loss_cls: tensor(0.8616, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1774, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0390, device='cuda:0', grad_fn=<AddBackward0>)



 82%|████████▏ | 292/354 [26:25<05:42,  5.52s/it][A

	loss_cls: tensor(0.4105, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2488, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6593, device='cuda:0', grad_fn=<AddBackward0>)



 83%|████████▎ | 293/354 [26:31<05:36,  5.52s/it][A

	loss_cls: tensor(0.8202, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2829, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1031, device='cuda:0', grad_fn=<AddBackward0>)



 83%|████████▎ | 294/354 [26:36<05:32,  5.54s/it][A

	loss_cls: tensor(0.2573, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4065, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6637, device='cuda:0', grad_fn=<AddBackward0>)



 83%|████████▎ | 295/354 [26:42<05:26,  5.53s/it][A

	loss_cls: tensor(0.4678, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1449, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6127, device='cuda:0', grad_fn=<AddBackward0>)



 84%|████████▎ | 296/354 [26:47<05:20,  5.53s/it][A

	loss_cls: tensor(0.4704, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2560, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7264, device='cuda:0', grad_fn=<AddBackward0>)



 84%|████████▍ | 297/354 [26:53<05:16,  5.55s/it][A

	loss_cls: tensor(0.4375, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2721, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7096, device='cuda:0', grad_fn=<AddBackward0>)



 84%|████████▍ | 298/354 [26:59<05:09,  5.53s/it][A

	loss_cls: tensor(0.5814, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1834, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7649, device='cuda:0', grad_fn=<AddBackward0>)



 84%|████████▍ | 299/354 [27:04<05:03,  5.51s/it][A

	loss_cls: tensor(0.5343, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1524, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6867, device='cuda:0', grad_fn=<AddBackward0>)



 85%|████████▍ | 300/354 [27:09<04:55,  5.47s/it][A

	loss_cls: tensor(0.6761, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0566, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7328, device='cuda:0', grad_fn=<AddBackward0>)



 85%|████████▌ | 301/354 [27:15<04:50,  5.47s/it][A

	loss_cls: tensor(0.4997, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0525, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5522, device='cuda:0', grad_fn=<AddBackward0>)



 85%|████████▌ | 302/354 [27:20<04:42,  5.44s/it][A

	loss_cls: tensor(0.5232, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0481, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5713, device='cuda:0', grad_fn=<AddBackward0>)



 86%|████████▌ | 303/354 [27:26<04:36,  5.42s/it][A

	loss_cls: tensor(0.7193, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0758, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7951, device='cuda:0', grad_fn=<AddBackward0>)



 86%|████████▌ | 304/354 [27:31<04:30,  5.42s/it][A

	loss_cls: tensor(0.5203, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0837, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6039, device='cuda:0', grad_fn=<AddBackward0>)



 86%|████████▌ | 305/354 [27:36<04:24,  5.39s/it][A

	loss_cls: tensor(0.7669, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3606, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1275, device='cuda:0', grad_fn=<AddBackward0>)



 86%|████████▋ | 306/354 [27:42<04:19,  5.41s/it][A

	loss_cls: tensor(0.6741, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0953, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7695, device='cuda:0', grad_fn=<AddBackward0>)



 87%|████████▋ | 307/354 [27:47<04:13,  5.39s/it][A

	loss_cls: tensor(0.7028, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4601, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1629, device='cuda:0', grad_fn=<AddBackward0>)



 87%|████████▋ | 308/354 [27:53<04:09,  5.42s/it][A

	loss_cls: tensor(0.5701, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2545, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8247, device='cuda:0', grad_fn=<AddBackward0>)



 87%|████████▋ | 309/354 [27:58<04:02,  5.39s/it][A

	loss_cls: tensor(0.5866, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2445, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8311, device='cuda:0', grad_fn=<AddBackward0>)



 88%|████████▊ | 310/354 [28:03<03:56,  5.38s/it][A

	loss_cls: tensor(0.5442, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2946, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8388, device='cuda:0', grad_fn=<AddBackward0>)



 88%|████████▊ | 311/354 [28:09<03:51,  5.39s/it][A

	loss_cls: tensor(0.7471, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3583, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1053, device='cuda:0', grad_fn=<AddBackward0>)



 88%|████████▊ | 312/354 [28:14<03:45,  5.36s/it][A

	loss_cls: tensor(0.4940, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1298, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6238, device='cuda:0', grad_fn=<AddBackward0>)



 88%|████████▊ | 313/354 [28:19<03:40,  5.39s/it][A

	loss_cls: tensor(0.6601, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3923, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0524, device='cuda:0', grad_fn=<AddBackward0>)



 89%|████████▊ | 314/354 [28:25<03:34,  5.37s/it][A

	loss_cls: tensor(0.7060, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3468, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0528, device='cuda:0', grad_fn=<AddBackward0>)



 89%|████████▉ | 315/354 [28:30<03:29,  5.38s/it][A

	loss_cls: tensor(0.6818, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2600, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9418, device='cuda:0', grad_fn=<AddBackward0>)



 89%|████████▉ | 316/354 [28:36<03:23,  5.36s/it][A

	loss_cls: tensor(0.5797, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1871, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7668, device='cuda:0', grad_fn=<AddBackward0>)



 90%|████████▉ | 317/354 [28:41<03:17,  5.34s/it][A

	loss_cls: tensor(0.5253, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4780, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0034, device='cuda:0', grad_fn=<AddBackward0>)



 90%|████████▉ | 318/354 [28:46<03:12,  5.36s/it][A

	loss_cls: tensor(0.3005, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1651, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4656, device='cuda:0', grad_fn=<AddBackward0>)



 90%|█████████ | 319/354 [28:52<03:07,  5.34s/it][A

	loss_cls: tensor(0.4709, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1890, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6599, device='cuda:0', grad_fn=<AddBackward0>)



 90%|█████████ | 320/354 [28:57<03:02,  5.37s/it][A

	loss_cls: tensor(0.6369, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2241, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8610, device='cuda:0', grad_fn=<AddBackward0>)



 91%|█████████ | 321/354 [29:02<02:56,  5.35s/it][A

	loss_cls: tensor(0.5398, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1147, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6545, device='cuda:0', grad_fn=<AddBackward0>)



 91%|█████████ | 322/354 [29:08<02:51,  5.36s/it][A

	loss_cls: tensor(0.7640, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2053, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9694, device='cuda:0', grad_fn=<AddBackward0>)



 91%|█████████ | 323/354 [29:13<02:45,  5.34s/it][A

	loss_cls: tensor(0.7071, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3717, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0788, device='cuda:0', grad_fn=<AddBackward0>)



 92%|█████████▏| 324/354 [29:18<02:40,  5.33s/it][A

	loss_cls: tensor(0.6588, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2715, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9302, device='cuda:0', grad_fn=<AddBackward0>)



 92%|█████████▏| 325/354 [29:24<02:35,  5.35s/it][A

	loss_cls: tensor(0.3045, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0922, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.3967, device='cuda:0', grad_fn=<AddBackward0>)



 92%|█████████▏| 326/354 [29:29<02:29,  5.34s/it][A

	loss_cls: tensor(0.5080, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0760, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5840, device='cuda:0', grad_fn=<AddBackward0>)



 92%|█████████▏| 327/354 [29:34<02:25,  5.37s/it][A

	loss_cls: tensor(0.6880, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2252, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9132, device='cuda:0', grad_fn=<AddBackward0>)



 93%|█████████▎| 328/354 [29:40<02:19,  5.35s/it][A

	loss_cls: tensor(0.6609, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2959, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9568, device='cuda:0', grad_fn=<AddBackward0>)



 93%|█████████▎| 329/354 [29:45<02:14,  5.36s/it][A

	loss_cls: tensor(0.4934, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0424, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5359, device='cuda:0', grad_fn=<AddBackward0>)



 93%|█████████▎| 330/354 [29:50<02:08,  5.36s/it][A

	loss_cls: tensor(0.5286, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4865, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0151, device='cuda:0', grad_fn=<AddBackward0>)



 94%|█████████▎| 331/354 [29:56<02:03,  5.36s/it][A

	loss_cls: tensor(0.4617, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2142, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6759, device='cuda:0', grad_fn=<AddBackward0>)



 94%|█████████▍| 332/354 [30:01<01:58,  5.36s/it][A

	loss_cls: tensor(0.7819, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2855, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0673, device='cuda:0', grad_fn=<AddBackward0>)



 94%|█████████▍| 333/354 [30:07<01:52,  5.35s/it][A

	loss_cls: tensor(0.6279, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0516, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6795, device='cuda:0', grad_fn=<AddBackward0>)



 94%|█████████▍| 334/354 [30:12<01:47,  5.37s/it][A

	loss_cls: tensor(0.7012, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.6056, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.3068, device='cuda:0', grad_fn=<AddBackward0>)



 95%|█████████▍| 335/354 [30:17<01:41,  5.35s/it][A

	loss_cls: tensor(0.6497, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0662, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7159, device='cuda:0', grad_fn=<AddBackward0>)



 95%|█████████▍| 336/354 [30:23<01:36,  5.37s/it][A

	loss_cls: tensor(0.4295, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0937, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5231, device='cuda:0', grad_fn=<AddBackward0>)



 95%|█████████▌| 337/354 [30:28<01:30,  5.35s/it][A

	loss_cls: tensor(0.4334, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1877, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6211, device='cuda:0', grad_fn=<AddBackward0>)



 95%|█████████▌| 338/354 [30:33<01:25,  5.33s/it][A

	loss_cls: tensor(0.6877, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1484, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8360, device='cuda:0', grad_fn=<AddBackward0>)



 96%|█████████▌| 339/354 [30:39<01:20,  5.34s/it][A

	loss_cls: tensor(0.5538, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3251, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8788, device='cuda:0', grad_fn=<AddBackward0>)



 96%|█████████▌| 340/354 [30:44<01:14,  5.33s/it][A

	loss_cls: tensor(0.8537, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2516, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1053, device='cuda:0', grad_fn=<AddBackward0>)



 96%|█████████▋| 341/354 [30:49<01:09,  5.36s/it][A

	loss_cls: tensor(0.3379, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0385, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.3763, device='cuda:0', grad_fn=<AddBackward0>)



 97%|█████████▋| 342/354 [30:55<01:04,  5.34s/it][A

	loss_cls: tensor(0.7047, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1916, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8962, device='cuda:0', grad_fn=<AddBackward0>)



 97%|█████████▋| 343/354 [31:00<00:58,  5.36s/it][A

	loss_cls: tensor(0.3614, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2153, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5767, device='cuda:0', grad_fn=<AddBackward0>)



 97%|█████████▋| 344/354 [31:05<00:53,  5.34s/it][A

	loss_cls: tensor(0.6198, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3494, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9692, device='cuda:0', grad_fn=<AddBackward0>)



 97%|█████████▋| 345/354 [31:11<00:47,  5.32s/it][A

	loss_cls: tensor(0.5484, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3187, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8671, device='cuda:0', grad_fn=<AddBackward0>)



 98%|█████████▊| 346/354 [31:16<00:42,  5.35s/it][A

	loss_cls: tensor(0.4281, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4034, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8315, device='cuda:0', grad_fn=<AddBackward0>)



 98%|█████████▊| 347/354 [31:21<00:37,  5.34s/it][A

	loss_cls: tensor(0.5181, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1855, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7037, device='cuda:0', grad_fn=<AddBackward0>)



 98%|█████████▊| 348/354 [31:27<00:32,  5.36s/it][A

	loss_cls: tensor(0.4698, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0532, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5230, device='cuda:0', grad_fn=<AddBackward0>)



 99%|█████████▊| 349/354 [31:32<00:26,  5.34s/it][A

	loss_cls: tensor(0.7305, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5500, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2804, device='cuda:0', grad_fn=<AddBackward0>)



 99%|█████████▉| 350/354 [31:37<00:21,  5.36s/it][A

	loss_cls: tensor(0.7048, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2047, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9095, device='cuda:0', grad_fn=<AddBackward0>)



 99%|█████████▉| 351/354 [31:43<00:16,  5.34s/it][A

	loss_cls: tensor(0.6860, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2403, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9263, device='cuda:0', grad_fn=<AddBackward0>)



 99%|█████████▉| 352/354 [31:48<00:10,  5.33s/it][A

	loss_cls: tensor(0.6294, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3387, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9681, device='cuda:0', grad_fn=<AddBackward0>)



100%|█████████▉| 353/354 [31:53<00:05,  5.35s/it][A

	loss_cls: tensor(0.6375, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1361, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7736, device='cuda:0', grad_fn=<AddBackward0>)



100%|██████████| 354/354 [31:56<00:00,  5.41s/it][A

  0%|          | 0/177 [00:00<?, ?it/s][A

	loss_cls: tensor(1.4647, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2581, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.7229, device='cuda:0', grad_fn=<AddBackward0>)

	Training Loss: 0.8158236057576487

	Training cls acc: 0.7083333333333334

	Training cls prec: 0.5727486927698793

	Training cls rec: 0.6083949713187001

	Training cls f1: 0.5444846446205966

--
	Training ner acc: 0.9552811662845324

	Training ner prec: 0.2811046878233874

	Training ner rec: 0.290195179129611

	Training ner f1: 0.28543641786962315

	Current Learning rate:  0.00014285714285714284



  1%|          | 1/177 [00:00<02:05,  1.41it/s][A
  1%|          | 2/177 [00:01<02:04,  1.40it/s][A
  2%|▏         | 3/177 [00:02<02:03,  1.40it/s][A
  2%|▏         | 4/177 [00:02<02:00,  1.44it/s][A
  3%|▎         | 5/177 [00:03<02:00,  1.43it/s][A
  3%|▎         | 6/177 [00:04<01:59,  1.43it/s][A
  4%|▍         | 7/177 [00:04<01:55,  1.47it/s][A
  5%|▍         | 8/177 [00:05<02:01,  1.39it/s][A
  5%|▌         | 9/177 [00:06<02:02,  1.38it/s][A
  6%|▌         | 10/177 [00:07<02:02,  1.37it/s][A
  6%|▌         | 11/177 [00:07<01:57,  1.42it/s][A
  7%|▋         | 12/177 [00:08<01:58,  1.40it/s][A
  7%|▋         | 13/177 [00:09<01:58,  1.38it/s][A
  8%|▊         | 14/177 [00:09<01:54,  1.42it/s][A
  8%|▊         | 15/177 [00:10<01:54,  1.42it/s][A
  9%|▉         | 16/177 [00:11<01:55,  1.40it/s][A
 10%|▉         | 17/177 [00:12<01:55,  1.38it/s][A
 10%|█         | 18/177 [00:12<01:51,  1.42it/s][A
 11%|█         | 19/177 [00:13<01:52,  1.41it/s][A
 11%|█▏        | 20/


	Validation Loss: 0.7698006274673225

	Validation cls acc: 0.7076271186440678

	Validation cls prec: 0.6092211460855529

	Validation cls rec: 0.5934221146085553

	Validation cls f1: 0.5555373690966912

--
	Validation ner acc: 0.9544153412265982

	Validation ner prec: 0.449782882767843

	Validation ner rec: 0.46026365348399245

	Validation ner f1: 0.4547942008744746



  0%|          | 1/354 [00:05<31:21,  5.33s/it][A

	loss_cls: tensor(0.5054, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0928, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5982, device='cuda:0', grad_fn=<AddBackward0>)



  1%|          | 2/354 [00:10<31:54,  5.44s/it][A

	loss_cls: tensor(0.4152, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0435, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4586, device='cuda:0', grad_fn=<AddBackward0>)



  1%|          | 3/354 [00:16<31:30,  5.39s/it][A

	loss_cls: tensor(0.7471, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1973, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9443, device='cuda:0', grad_fn=<AddBackward0>)



  1%|          | 4/354 [00:21<31:26,  5.39s/it][A

	loss_cls: tensor(0.6422, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1344, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7766, device='cuda:0', grad_fn=<AddBackward0>)



  1%|▏         | 5/354 [00:26<31:10,  5.36s/it][A

	loss_cls: tensor(0.7268, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1250, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8518, device='cuda:0', grad_fn=<AddBackward0>)



  2%|▏         | 6/354 [00:32<30:57,  5.34s/it][A

	loss_cls: tensor(0.8613, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1534, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0146, device='cuda:0', grad_fn=<AddBackward0>)



  2%|▏         | 7/354 [00:37<30:58,  5.36s/it][A

	loss_cls: tensor(0.6013, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4060, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0074, device='cuda:0', grad_fn=<AddBackward0>)



  2%|▏         | 8/354 [00:42<30:46,  5.34s/it][A

	loss_cls: tensor(0.5015, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1547, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6561, device='cuda:0', grad_fn=<AddBackward0>)



  3%|▎         | 9/354 [00:48<30:47,  5.36s/it][A

	loss_cls: tensor(0.5884, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3335, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9219, device='cuda:0', grad_fn=<AddBackward0>)



  3%|▎         | 10/354 [00:53<30:39,  5.35s/it][A

	loss_cls: tensor(0.3919, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0932, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4850, device='cuda:0', grad_fn=<AddBackward0>)



  3%|▎         | 11/354 [00:58<30:38,  5.36s/it][A

	loss_cls: tensor(0.3701, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4713, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8413, device='cuda:0', grad_fn=<AddBackward0>)



  3%|▎         | 12/354 [01:04<30:26,  5.34s/it][A

	loss_cls: tensor(0.5621, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1091, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6713, device='cuda:0', grad_fn=<AddBackward0>)



  4%|▎         | 13/354 [01:09<30:16,  5.33s/it][A

	loss_cls: tensor(0.4634, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3379, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8013, device='cuda:0', grad_fn=<AddBackward0>)



  4%|▍         | 14/354 [01:14<30:18,  5.35s/it][A

	loss_cls: tensor(0.4652, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0401, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5053, device='cuda:0', grad_fn=<AddBackward0>)



  4%|▍         | 15/354 [01:20<30:07,  5.33s/it][A

	loss_cls: tensor(0.7889, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4630, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2520, device='cuda:0', grad_fn=<AddBackward0>)



  5%|▍         | 16/354 [01:25<30:08,  5.35s/it][A

	loss_cls: tensor(0.4393, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1195, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5588, device='cuda:0', grad_fn=<AddBackward0>)



  5%|▍         | 17/354 [01:30<29:57,  5.33s/it][A

	loss_cls: tensor(0.5215, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3238, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8453, device='cuda:0', grad_fn=<AddBackward0>)



  5%|▌         | 18/354 [01:36<29:58,  5.35s/it][A

	loss_cls: tensor(0.2899, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2608, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5507, device='cuda:0', grad_fn=<AddBackward0>)



  5%|▌         | 19/354 [01:41<29:50,  5.34s/it][A

	loss_cls: tensor(0.4525, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1877, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6402, device='cuda:0', grad_fn=<AddBackward0>)



  6%|▌         | 20/354 [01:46<29:40,  5.33s/it][A

	loss_cls: tensor(0.6874, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2012, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8887, device='cuda:0', grad_fn=<AddBackward0>)



  6%|▌         | 21/354 [01:52<29:37,  5.34s/it][A

	loss_cls: tensor(0.6497, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3433, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9930, device='cuda:0', grad_fn=<AddBackward0>)



  6%|▌         | 22/354 [01:57<29:28,  5.33s/it][A

	loss_cls: tensor(0.5593, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1547, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7140, device='cuda:0', grad_fn=<AddBackward0>)



  6%|▋         | 23/354 [02:03<29:30,  5.35s/it][A

	loss_cls: tensor(0.4158, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1865, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6022, device='cuda:0', grad_fn=<AddBackward0>)



  7%|▋         | 24/354 [02:08<29:22,  5.34s/it][A

	loss_cls: tensor(0.6053, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1774, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7827, device='cuda:0', grad_fn=<AddBackward0>)



  7%|▋         | 25/354 [02:13<29:29,  5.38s/it][A

	loss_cls: tensor(0.4845, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2146, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6991, device='cuda:0', grad_fn=<AddBackward0>)



  7%|▋         | 26/354 [02:19<29:17,  5.36s/it][A

	loss_cls: tensor(0.8119, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3580, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1700, device='cuda:0', grad_fn=<AddBackward0>)



  8%|▊         | 27/354 [02:24<29:05,  5.34s/it][A

	loss_cls: tensor(0.6877, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1386, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8263, device='cuda:0', grad_fn=<AddBackward0>)



  8%|▊         | 28/354 [02:29<29:03,  5.35s/it][A

	loss_cls: tensor(0.4306, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1141, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5447, device='cuda:0', grad_fn=<AddBackward0>)



  8%|▊         | 29/354 [02:35<28:54,  5.34s/it][A

	loss_cls: tensor(0.5396, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0485, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5881, device='cuda:0', grad_fn=<AddBackward0>)



  8%|▊         | 30/354 [02:40<28:53,  5.35s/it][A

	loss_cls: tensor(0.3684, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1510, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5194, device='cuda:0', grad_fn=<AddBackward0>)



  9%|▉         | 31/354 [02:45<28:44,  5.34s/it][A

	loss_cls: tensor(0.5339, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4167, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9505, device='cuda:0', grad_fn=<AddBackward0>)



  9%|▉         | 32/354 [02:51<28:44,  5.36s/it][A

	loss_cls: tensor(0.6365, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0691, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7055, device='cuda:0', grad_fn=<AddBackward0>)



  9%|▉         | 33/354 [02:56<28:34,  5.34s/it][A

	loss_cls: tensor(0.5846, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1788, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7634, device='cuda:0', grad_fn=<AddBackward0>)



 10%|▉         | 34/354 [03:01<28:25,  5.33s/it][A

	loss_cls: tensor(0.6637, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3422, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0060, device='cuda:0', grad_fn=<AddBackward0>)



 10%|▉         | 35/354 [03:07<28:27,  5.35s/it][A

	loss_cls: tensor(0.8027, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3624, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1651, device='cuda:0', grad_fn=<AddBackward0>)



 10%|█         | 36/354 [03:12<28:18,  5.34s/it][A

	loss_cls: tensor(0.4449, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2893, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7341, device='cuda:0', grad_fn=<AddBackward0>)



 10%|█         | 37/354 [03:17<28:18,  5.36s/it][A

	loss_cls: tensor(0.5785, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2253, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8039, device='cuda:0', grad_fn=<AddBackward0>)



 11%|█         | 38/354 [03:23<28:05,  5.33s/it][A

	loss_cls: tensor(0.4542, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5225, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9768, device='cuda:0', grad_fn=<AddBackward0>)



 11%|█         | 39/354 [03:28<28:04,  5.35s/it][A

	loss_cls: tensor(0.5100, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2658, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7757, device='cuda:0', grad_fn=<AddBackward0>)



 11%|█▏        | 40/354 [03:33<27:53,  5.33s/it][A

	loss_cls: tensor(0.6393, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2255, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8648, device='cuda:0', grad_fn=<AddBackward0>)



 12%|█▏        | 41/354 [03:39<27:45,  5.32s/it][A

	loss_cls: tensor(0.6452, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2466, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8917, device='cuda:0', grad_fn=<AddBackward0>)



 12%|█▏        | 42/354 [03:44<27:46,  5.34s/it][A

	loss_cls: tensor(0.5180, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1026, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6205, device='cuda:0', grad_fn=<AddBackward0>)



 12%|█▏        | 43/354 [03:49<27:37,  5.33s/it][A

	loss_cls: tensor(0.4449, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0473, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4923, device='cuda:0', grad_fn=<AddBackward0>)



 12%|█▏        | 44/354 [03:55<27:37,  5.35s/it][A

	loss_cls: tensor(0.5970, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2877, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8847, device='cuda:0', grad_fn=<AddBackward0>)



 13%|█▎        | 45/354 [04:00<27:24,  5.32s/it][A

	loss_cls: tensor(0.5917, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1084, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7002, device='cuda:0', grad_fn=<AddBackward0>)



 13%|█▎        | 46/354 [04:05<27:24,  5.34s/it][A

	loss_cls: tensor(0.6023, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2019, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8041, device='cuda:0', grad_fn=<AddBackward0>)



 13%|█▎        | 47/354 [04:11<27:12,  5.32s/it][A

	loss_cls: tensor(0.5949, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1060, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7009, device='cuda:0', grad_fn=<AddBackward0>)



 14%|█▎        | 48/354 [04:16<27:18,  5.36s/it][A

	loss_cls: tensor(0.5982, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1158, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7140, device='cuda:0', grad_fn=<AddBackward0>)



 14%|█▍        | 49/354 [04:22<27:25,  5.39s/it][A

	loss_cls: tensor(0.5632, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2135, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7767, device='cuda:0', grad_fn=<AddBackward0>)



 14%|█▍        | 50/354 [04:27<27:30,  5.43s/it][A

	loss_cls: tensor(0.6372, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2024, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8396, device='cuda:0', grad_fn=<AddBackward0>)



 14%|█▍        | 51/354 [04:33<27:40,  5.48s/it][A

	loss_cls: tensor(0.6768, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2919, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9686, device='cuda:0', grad_fn=<AddBackward0>)



 15%|█▍        | 52/354 [04:38<27:36,  5.49s/it][A

	loss_cls: tensor(0.4863, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1578, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6441, device='cuda:0', grad_fn=<AddBackward0>)



 15%|█▍        | 53/354 [04:44<27:41,  5.52s/it][A

	loss_cls: tensor(0.5321, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1965, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7285, device='cuda:0', grad_fn=<AddBackward0>)



 15%|█▌        | 54/354 [04:49<27:33,  5.51s/it][A

	loss_cls: tensor(0.5703, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2465, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8167, device='cuda:0', grad_fn=<AddBackward0>)



 16%|█▌        | 55/354 [04:55<27:34,  5.53s/it][A

	loss_cls: tensor(0.5030, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3043, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8073, device='cuda:0', grad_fn=<AddBackward0>)



 16%|█▌        | 56/354 [05:00<27:25,  5.52s/it][A

	loss_cls: tensor(0.4517, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1628, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6145, device='cuda:0', grad_fn=<AddBackward0>)



 16%|█▌        | 57/354 [05:06<27:17,  5.51s/it][A

	loss_cls: tensor(0.5997, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1392, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7389, device='cuda:0', grad_fn=<AddBackward0>)



 16%|█▋        | 58/354 [05:11<27:17,  5.53s/it][A

	loss_cls: tensor(0.5689, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.6754, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2444, device='cuda:0', grad_fn=<AddBackward0>)



 17%|█▋        | 59/354 [05:17<27:07,  5.52s/it][A

	loss_cls: tensor(0.6694, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0809, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7503, device='cuda:0', grad_fn=<AddBackward0>)



 17%|█▋        | 60/354 [05:22<27:05,  5.53s/it][A

	loss_cls: tensor(0.4583, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3053, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7636, device='cuda:0', grad_fn=<AddBackward0>)



 17%|█▋        | 61/354 [05:28<26:50,  5.50s/it][A

	loss_cls: tensor(0.6485, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3194, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9679, device='cuda:0', grad_fn=<AddBackward0>)



 18%|█▊        | 62/354 [05:33<26:37,  5.47s/it][A

	loss_cls: tensor(0.7710, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2285, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9996, device='cuda:0', grad_fn=<AddBackward0>)



 18%|█▊        | 63/354 [05:39<26:20,  5.43s/it][A

	loss_cls: tensor(0.6487, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3690, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0178, device='cuda:0', grad_fn=<AddBackward0>)



 18%|█▊        | 64/354 [05:44<26:08,  5.41s/it][A

	loss_cls: tensor(0.6183, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1834, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8017, device='cuda:0', grad_fn=<AddBackward0>)



 18%|█▊        | 65/354 [05:49<26:03,  5.41s/it][A

	loss_cls: tensor(0.5437, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3466, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8903, device='cuda:0', grad_fn=<AddBackward0>)



 19%|█▊        | 66/354 [05:55<25:50,  5.38s/it][A

	loss_cls: tensor(0.5717, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2470, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8187, device='cuda:0', grad_fn=<AddBackward0>)



 19%|█▉        | 67/354 [06:00<25:48,  5.40s/it][A

	loss_cls: tensor(0.7763, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2251, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0014, device='cuda:0', grad_fn=<AddBackward0>)



 19%|█▉        | 68/354 [06:05<25:39,  5.38s/it][A

	loss_cls: tensor(0.7127, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2210, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9336, device='cuda:0', grad_fn=<AddBackward0>)



 19%|█▉        | 69/354 [06:11<25:38,  5.40s/it][A

	loss_cls: tensor(0.5183, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2997, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8180, device='cuda:0', grad_fn=<AddBackward0>)



 20%|█▉        | 70/354 [06:16<25:29,  5.39s/it][A

	loss_cls: tensor(0.4813, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1761, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6574, device='cuda:0', grad_fn=<AddBackward0>)



 20%|██        | 71/354 [06:22<25:21,  5.38s/it][A

	loss_cls: tensor(0.6042, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2639, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8681, device='cuda:0', grad_fn=<AddBackward0>)



 20%|██        | 72/354 [06:27<25:21,  5.39s/it][A

	loss_cls: tensor(0.5169, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1360, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6529, device='cuda:0', grad_fn=<AddBackward0>)



 21%|██        | 73/354 [06:32<25:11,  5.38s/it][A

	loss_cls: tensor(0.3031, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1186, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4217, device='cuda:0', grad_fn=<AddBackward0>)



 21%|██        | 74/354 [06:38<25:09,  5.39s/it][A

	loss_cls: tensor(0.6227, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1512, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7740, device='cuda:0', grad_fn=<AddBackward0>)



 21%|██        | 75/354 [06:43<25:22,  5.46s/it][A

	loss_cls: tensor(0.7601, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2694, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0294, device='cuda:0', grad_fn=<AddBackward0>)



 21%|██▏       | 76/354 [06:49<25:33,  5.52s/it][A

	loss_cls: tensor(0.5474, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2283, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7757, device='cuda:0', grad_fn=<AddBackward0>)



 22%|██▏       | 77/354 [06:55<25:31,  5.53s/it][A

	loss_cls: tensor(0.5258, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3679, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8937, device='cuda:0', grad_fn=<AddBackward0>)



 22%|██▏       | 78/354 [07:00<25:23,  5.52s/it][A

	loss_cls: tensor(0.7414, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0569, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7983, device='cuda:0', grad_fn=<AddBackward0>)



 22%|██▏       | 79/354 [07:06<25:30,  5.57s/it][A

	loss_cls: tensor(0.7309, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0660, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7968, device='cuda:0', grad_fn=<AddBackward0>)



 23%|██▎       | 80/354 [07:11<25:10,  5.51s/it][A

	loss_cls: tensor(0.6546, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2115, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8661, device='cuda:0', grad_fn=<AddBackward0>)



 23%|██▎       | 81/354 [07:17<25:00,  5.50s/it][A

	loss_cls: tensor(0.5201, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0961, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6163, device='cuda:0', grad_fn=<AddBackward0>)



 23%|██▎       | 82/354 [07:22<24:44,  5.46s/it][A

	loss_cls: tensor(0.2926, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2190, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5115, device='cuda:0', grad_fn=<AddBackward0>)



 23%|██▎       | 83/354 [07:27<24:38,  5.46s/it][A

	loss_cls: tensor(0.7119, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5192, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2311, device='cuda:0', grad_fn=<AddBackward0>)



 24%|██▎       | 84/354 [07:33<24:26,  5.43s/it][A

	loss_cls: tensor(0.6112, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2811, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8924, device='cuda:0', grad_fn=<AddBackward0>)



 24%|██▍       | 85/354 [07:38<24:17,  5.42s/it][A

	loss_cls: tensor(0.8179, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5770, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.3949, device='cuda:0', grad_fn=<AddBackward0>)



 24%|██▍       | 86/354 [07:44<24:13,  5.42s/it][A

	loss_cls: tensor(0.7775, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2949, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0723, device='cuda:0', grad_fn=<AddBackward0>)



 25%|██▍       | 87/354 [07:49<24:04,  5.41s/it][A

	loss_cls: tensor(0.7790, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2772, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0562, device='cuda:0', grad_fn=<AddBackward0>)



 25%|██▍       | 88/354 [07:55<24:02,  5.42s/it][A

	loss_cls: tensor(0.5218, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1033, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6251, device='cuda:0', grad_fn=<AddBackward0>)



 25%|██▌       | 89/354 [08:00<23:51,  5.40s/it][A

	loss_cls: tensor(0.8000, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2576, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0576, device='cuda:0', grad_fn=<AddBackward0>)



 25%|██▌       | 90/354 [08:05<23:50,  5.42s/it][A

	loss_cls: tensor(0.5614, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1324, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6938, device='cuda:0', grad_fn=<AddBackward0>)



 26%|██▌       | 91/354 [08:11<23:42,  5.41s/it][A

	loss_cls: tensor(0.3415, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0488, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.3904, device='cuda:0', grad_fn=<AddBackward0>)



 26%|██▌       | 92/354 [08:16<23:33,  5.39s/it][A

	loss_cls: tensor(0.5123, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1414, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6537, device='cuda:0', grad_fn=<AddBackward0>)



 26%|██▋       | 93/354 [08:22<23:31,  5.41s/it][A

	loss_cls: tensor(0.7111, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1746, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8857, device='cuda:0', grad_fn=<AddBackward0>)



 27%|██▋       | 94/354 [08:27<23:21,  5.39s/it][A

	loss_cls: tensor(0.6404, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4186, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0589, device='cuda:0', grad_fn=<AddBackward0>)



 27%|██▋       | 95/354 [08:32<23:18,  5.40s/it][A

	loss_cls: tensor(0.5059, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1794, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6853, device='cuda:0', grad_fn=<AddBackward0>)



 27%|██▋       | 96/354 [08:38<23:15,  5.41s/it][A

	loss_cls: tensor(0.4875, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2888, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7763, device='cuda:0', grad_fn=<AddBackward0>)



 27%|██▋       | 97/354 [08:43<23:17,  5.44s/it][A

	loss_cls: tensor(0.6306, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2867, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9173, device='cuda:0', grad_fn=<AddBackward0>)



 28%|██▊       | 98/354 [08:49<23:10,  5.43s/it][A

	loss_cls: tensor(0.5924, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1466, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7390, device='cuda:0', grad_fn=<AddBackward0>)



 28%|██▊       | 99/354 [08:54<23:01,  5.42s/it][A

	loss_cls: tensor(0.8691, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2731, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1422, device='cuda:0', grad_fn=<AddBackward0>)



 28%|██▊       | 100/354 [09:00<22:59,  5.43s/it][A

	loss_cls: tensor(0.4903, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1675, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6578, device='cuda:0', grad_fn=<AddBackward0>)



 29%|██▊       | 101/354 [09:05<22:49,  5.41s/it][A

	loss_cls: tensor(0.4647, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3498, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8146, device='cuda:0', grad_fn=<AddBackward0>)



 29%|██▉       | 102/354 [09:10<22:50,  5.44s/it][A

	loss_cls: tensor(0.5345, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0513, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5859, device='cuda:0', grad_fn=<AddBackward0>)



 29%|██▉       | 103/354 [09:16<22:42,  5.43s/it][A

	loss_cls: tensor(0.5595, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1727, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7322, device='cuda:0', grad_fn=<AddBackward0>)



 29%|██▉       | 104/354 [09:21<22:40,  5.44s/it][A

	loss_cls: tensor(0.5050, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2639, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7689, device='cuda:0', grad_fn=<AddBackward0>)



 30%|██▉       | 105/354 [09:27<22:31,  5.43s/it][A

	loss_cls: tensor(0.4461, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2218, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6680, device='cuda:0', grad_fn=<AddBackward0>)



 30%|██▉       | 106/354 [09:32<22:23,  5.42s/it][A

	loss_cls: tensor(0.4877, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1118, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5995, device='cuda:0', grad_fn=<AddBackward0>)



 30%|███       | 107/354 [09:38<22:21,  5.43s/it][A

	loss_cls: tensor(0.5881, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5511, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1392, device='cuda:0', grad_fn=<AddBackward0>)



 31%|███       | 108/354 [09:43<22:11,  5.41s/it][A

	loss_cls: tensor(0.4829, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2163, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6992, device='cuda:0', grad_fn=<AddBackward0>)



 31%|███       | 109/354 [09:48<22:11,  5.43s/it][A

	loss_cls: tensor(0.9854, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1911, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1765, device='cuda:0', grad_fn=<AddBackward0>)



 31%|███       | 110/354 [09:54<22:01,  5.42s/it][A

	loss_cls: tensor(0.5759, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1532, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7291, device='cuda:0', grad_fn=<AddBackward0>)



 31%|███▏      | 111/354 [09:59<21:59,  5.43s/it][A

	loss_cls: tensor(0.8289, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2506, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0795, device='cuda:0', grad_fn=<AddBackward0>)



 32%|███▏      | 112/354 [10:05<21:51,  5.42s/it][A

	loss_cls: tensor(0.7051, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0535, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7586, device='cuda:0', grad_fn=<AddBackward0>)



 32%|███▏      | 113/354 [10:10<21:49,  5.43s/it][A

	loss_cls: tensor(0.5463, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3236, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8699, device='cuda:0', grad_fn=<AddBackward0>)



 32%|███▏      | 114/354 [10:16<22:06,  5.53s/it][A

	loss_cls: tensor(0.4869, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1794, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6663, device='cuda:0', grad_fn=<AddBackward0>)



 32%|███▏      | 115/354 [10:21<22:10,  5.57s/it][A

	loss_cls: tensor(0.5616, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0475, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6091, device='cuda:0', grad_fn=<AddBackward0>)



 33%|███▎      | 116/354 [10:27<22:22,  5.64s/it][A

	loss_cls: tensor(0.6326, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2305, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8631, device='cuda:0', grad_fn=<AddBackward0>)



 33%|███▎      | 117/354 [10:33<22:07,  5.60s/it][A

	loss_cls: tensor(0.5421, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0611, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6031, device='cuda:0', grad_fn=<AddBackward0>)



 33%|███▎      | 118/354 [10:38<21:56,  5.58s/it][A

	loss_cls: tensor(0.4652, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3337, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7989, device='cuda:0', grad_fn=<AddBackward0>)



 34%|███▎      | 119/354 [10:44<21:41,  5.54s/it][A

	loss_cls: tensor(0.4943, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3465, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8409, device='cuda:0', grad_fn=<AddBackward0>)



 34%|███▍      | 120/354 [10:49<21:27,  5.50s/it][A

	loss_cls: tensor(0.3756, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0657, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4413, device='cuda:0', grad_fn=<AddBackward0>)



 34%|███▍      | 121/354 [10:55<21:22,  5.50s/it][A

	loss_cls: tensor(0.4527, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0442, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4969, device='cuda:0', grad_fn=<AddBackward0>)



 34%|███▍      | 122/354 [11:00<21:12,  5.48s/it][A

	loss_cls: tensor(0.3653, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3854, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7507, device='cuda:0', grad_fn=<AddBackward0>)



 35%|███▍      | 123/354 [11:06<21:09,  5.50s/it][A

	loss_cls: tensor(0.4481, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1012, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5493, device='cuda:0', grad_fn=<AddBackward0>)



 35%|███▌      | 124/354 [11:11<20:59,  5.48s/it][A

	loss_cls: tensor(0.6170, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0591, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6761, device='cuda:0', grad_fn=<AddBackward0>)



 35%|███▌      | 125/354 [11:17<20:56,  5.49s/it][A

	loss_cls: tensor(0.4281, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1950, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6231, device='cuda:0', grad_fn=<AddBackward0>)



 36%|███▌      | 126/354 [11:22<20:45,  5.46s/it][A

	loss_cls: tensor(0.6567, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2850, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9417, device='cuda:0', grad_fn=<AddBackward0>)



 36%|███▌      | 127/354 [11:27<20:36,  5.45s/it][A

	loss_cls: tensor(0.6959, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1800, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8759, device='cuda:0', grad_fn=<AddBackward0>)



 36%|███▌      | 128/354 [11:33<20:34,  5.46s/it][A

	loss_cls: tensor(0.6003, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1573, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7577, device='cuda:0', grad_fn=<AddBackward0>)



 36%|███▋      | 129/354 [11:38<20:27,  5.46s/it][A

	loss_cls: tensor(0.7059, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2974, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0032, device='cuda:0', grad_fn=<AddBackward0>)



 37%|███▋      | 130/354 [11:44<20:27,  5.48s/it][A

	loss_cls: tensor(0.4550, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1182, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5732, device='cuda:0', grad_fn=<AddBackward0>)



 37%|███▋      | 131/354 [11:49<20:18,  5.46s/it][A

	loss_cls: tensor(0.5451, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1926, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7376, device='cuda:0', grad_fn=<AddBackward0>)



 37%|███▋      | 132/354 [11:55<20:13,  5.47s/it][A

	loss_cls: tensor(0.8013, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2581, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0594, device='cuda:0', grad_fn=<AddBackward0>)



 38%|███▊      | 133/354 [12:00<20:05,  5.46s/it][A

	loss_cls: tensor(0.5186, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2244, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7430, device='cuda:0', grad_fn=<AddBackward0>)



 38%|███▊      | 134/354 [12:06<19:57,  5.44s/it][A

	loss_cls: tensor(0.4554, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1376, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5930, device='cuda:0', grad_fn=<AddBackward0>)



 38%|███▊      | 135/354 [12:11<19:54,  5.46s/it][A

	loss_cls: tensor(0.5344, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1922, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7266, device='cuda:0', grad_fn=<AddBackward0>)



 38%|███▊      | 136/354 [12:17<19:46,  5.44s/it][A

	loss_cls: tensor(0.4506, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3249, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7754, device='cuda:0', grad_fn=<AddBackward0>)



 39%|███▊      | 137/354 [12:22<19:44,  5.46s/it][A

	loss_cls: tensor(0.4328, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1343, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5671, device='cuda:0', grad_fn=<AddBackward0>)



 39%|███▉      | 138/354 [12:27<19:35,  5.44s/it][A

	loss_cls: tensor(0.4388, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2260, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6648, device='cuda:0', grad_fn=<AddBackward0>)



 39%|███▉      | 139/354 [12:33<19:32,  5.45s/it][A

	loss_cls: tensor(0.5661, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1901, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7561, device='cuda:0', grad_fn=<AddBackward0>)



 40%|███▉      | 140/354 [12:38<19:25,  5.45s/it][A

	loss_cls: tensor(0.7693, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5164, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2857, device='cuda:0', grad_fn=<AddBackward0>)



 40%|███▉      | 141/354 [12:44<19:10,  5.40s/it][A

	loss_cls: tensor(0.5810, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2529, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8339, device='cuda:0', grad_fn=<AddBackward0>)



 40%|████      | 142/354 [12:49<19:12,  5.44s/it][A

	loss_cls: tensor(0.4934, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4698, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9632, device='cuda:0', grad_fn=<AddBackward0>)



 40%|████      | 143/354 [12:55<19:05,  5.43s/it][A

	loss_cls: tensor(0.5951, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3950, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9901, device='cuda:0', grad_fn=<AddBackward0>)



 41%|████      | 144/354 [13:00<19:04,  5.45s/it][A

	loss_cls: tensor(0.5629, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0471, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6100, device='cuda:0', grad_fn=<AddBackward0>)



 41%|████      | 145/354 [13:05<18:56,  5.44s/it][A

	loss_cls: tensor(0.6696, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3537, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0233, device='cuda:0', grad_fn=<AddBackward0>)



 41%|████      | 146/354 [13:11<18:54,  5.45s/it][A

	loss_cls: tensor(0.6476, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1375, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7850, device='cuda:0', grad_fn=<AddBackward0>)



 42%|████▏     | 147/354 [13:16<18:46,  5.44s/it][A

	loss_cls: tensor(0.4822, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3218, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8040, device='cuda:0', grad_fn=<AddBackward0>)



 42%|████▏     | 148/354 [13:22<18:43,  5.45s/it][A

	loss_cls: tensor(0.8866, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3588, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2454, device='cuda:0', grad_fn=<AddBackward0>)



 42%|████▏     | 149/354 [13:27<18:37,  5.45s/it][A

	loss_cls: tensor(0.8851, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2357, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1209, device='cuda:0', grad_fn=<AddBackward0>)



 42%|████▏     | 150/354 [13:33<18:29,  5.44s/it][A

	loss_cls: tensor(0.4688, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0630, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5318, device='cuda:0', grad_fn=<AddBackward0>)



 43%|████▎     | 151/354 [13:38<18:28,  5.46s/it][A

	loss_cls: tensor(0.4881, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3320, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8201, device='cuda:0', grad_fn=<AddBackward0>)



 43%|████▎     | 152/354 [13:44<18:19,  5.44s/it][A

	loss_cls: tensor(0.3697, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1946, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5643, device='cuda:0', grad_fn=<AddBackward0>)



 43%|████▎     | 153/354 [13:49<18:16,  5.46s/it][A

	loss_cls: tensor(0.4470, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3615, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8085, device='cuda:0', grad_fn=<AddBackward0>)



 44%|████▎     | 154/354 [13:54<18:07,  5.44s/it][A

	loss_cls: tensor(0.3520, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1971, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5491, device='cuda:0', grad_fn=<AddBackward0>)



 44%|████▍     | 155/354 [14:00<18:05,  5.46s/it][A

	loss_cls: tensor(0.6039, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0827, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6866, device='cuda:0', grad_fn=<AddBackward0>)



 44%|████▍     | 156/354 [14:06<18:05,  5.48s/it][A

	loss_cls: tensor(0.8820, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1417, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0238, device='cuda:0', grad_fn=<AddBackward0>)



 44%|████▍     | 157/354 [14:11<17:57,  5.47s/it][A

	loss_cls: tensor(0.8426, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2796, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1222, device='cuda:0', grad_fn=<AddBackward0>)



 45%|████▍     | 158/354 [14:16<17:53,  5.48s/it][A

	loss_cls: tensor(0.5783, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3357, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9140, device='cuda:0', grad_fn=<AddBackward0>)



 45%|████▍     | 159/354 [14:22<17:45,  5.46s/it][A

	loss_cls: tensor(0.7019, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0743, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7762, device='cuda:0', grad_fn=<AddBackward0>)



 45%|████▌     | 160/354 [14:27<17:42,  5.48s/it][A

	loss_cls: tensor(0.6182, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1110, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7292, device='cuda:0', grad_fn=<AddBackward0>)



 45%|████▌     | 161/354 [14:33<17:32,  5.45s/it][A

	loss_cls: tensor(0.6781, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3258, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0039, device='cuda:0', grad_fn=<AddBackward0>)



 46%|████▌     | 162/354 [14:38<17:30,  5.47s/it][A

	loss_cls: tensor(0.4623, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5580, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0203, device='cuda:0', grad_fn=<AddBackward0>)



 46%|████▌     | 163/354 [14:44<17:21,  5.45s/it][A

	loss_cls: tensor(0.5244, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0749, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5992, device='cuda:0', grad_fn=<AddBackward0>)



 46%|████▋     | 164/354 [14:49<17:13,  5.44s/it][A

	loss_cls: tensor(0.3464, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2342, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5806, device='cuda:0', grad_fn=<AddBackward0>)



 47%|████▋     | 165/354 [14:55<17:07,  5.44s/it][A

	loss_cls: tensor(0.5722, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0887, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6609, device='cuda:0', grad_fn=<AddBackward0>)



 47%|████▋     | 166/354 [15:00<17:00,  5.43s/it][A

	loss_cls: tensor(0.6042, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1144, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7187, device='cuda:0', grad_fn=<AddBackward0>)



 47%|████▋     | 167/354 [15:05<16:57,  5.44s/it][A

	loss_cls: tensor(0.4941, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1195, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6136, device='cuda:0', grad_fn=<AddBackward0>)



 47%|████▋     | 168/354 [15:11<16:50,  5.44s/it][A

	loss_cls: tensor(0.5818, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5010, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0828, device='cuda:0', grad_fn=<AddBackward0>)



 48%|████▊     | 169/354 [15:16<16:49,  5.46s/it][A

	loss_cls: tensor(0.6510, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3473, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9983, device='cuda:0', grad_fn=<AddBackward0>)



 48%|████▊     | 170/354 [15:22<16:41,  5.45s/it][A

	loss_cls: tensor(0.6877, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1043, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7920, device='cuda:0', grad_fn=<AddBackward0>)



 48%|████▊     | 171/354 [15:27<16:34,  5.43s/it][A

	loss_cls: tensor(0.5508, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1383, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6891, device='cuda:0', grad_fn=<AddBackward0>)



 49%|████▊     | 172/354 [15:33<16:32,  5.45s/it][A

	loss_cls: tensor(0.6327, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2674, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9002, device='cuda:0', grad_fn=<AddBackward0>)



 49%|████▉     | 173/354 [15:38<16:24,  5.44s/it][A

	loss_cls: tensor(0.5300, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1896, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7196, device='cuda:0', grad_fn=<AddBackward0>)



 49%|████▉     | 174/354 [15:44<16:21,  5.45s/it][A

	loss_cls: tensor(0.6275, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3157, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9433, device='cuda:0', grad_fn=<AddBackward0>)



 49%|████▉     | 175/354 [15:49<16:16,  5.45s/it][A

	loss_cls: tensor(0.4750, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2089, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6839, device='cuda:0', grad_fn=<AddBackward0>)



 50%|████▉     | 176/354 [15:55<16:11,  5.46s/it][A

	loss_cls: tensor(0.5142, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2030, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7172, device='cuda:0', grad_fn=<AddBackward0>)



 50%|█████     | 177/354 [16:00<16:06,  5.46s/it][A

	loss_cls: tensor(0.5890, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2941, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8831, device='cuda:0', grad_fn=<AddBackward0>)



 50%|█████     | 178/354 [16:05<15:56,  5.44s/it][A

	loss_cls: tensor(0.4462, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1136, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5597, device='cuda:0', grad_fn=<AddBackward0>)



 51%|█████     | 179/354 [16:11<15:54,  5.45s/it][A

	loss_cls: tensor(0.4717, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3093, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7810, device='cuda:0', grad_fn=<AddBackward0>)



 51%|█████     | 180/354 [16:16<15:46,  5.44s/it][A

	loss_cls: tensor(1.0750, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2537, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.3287, device='cuda:0', grad_fn=<AddBackward0>)



 51%|█████     | 181/354 [16:22<15:42,  5.45s/it][A

	loss_cls: tensor(0.4923, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0375, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5299, device='cuda:0', grad_fn=<AddBackward0>)



 51%|█████▏    | 182/354 [16:27<15:34,  5.44s/it][A

	loss_cls: tensor(0.5072, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2685, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7757, device='cuda:0', grad_fn=<AddBackward0>)



 52%|█████▏    | 183/354 [16:33<15:33,  5.46s/it][A

	loss_cls: tensor(0.4409, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1351, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5760, device='cuda:0', grad_fn=<AddBackward0>)



 52%|█████▏    | 184/354 [16:38<15:26,  5.45s/it][A

	loss_cls: tensor(0.5393, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1535, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6928, device='cuda:0', grad_fn=<AddBackward0>)



 52%|█████▏    | 185/354 [16:44<15:19,  5.44s/it][A

	loss_cls: tensor(0.5764, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3398, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9162, device='cuda:0', grad_fn=<AddBackward0>)



 53%|█████▎    | 186/354 [16:49<15:16,  5.45s/it][A

	loss_cls: tensor(0.3978, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0407, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4384, device='cuda:0', grad_fn=<AddBackward0>)



 53%|█████▎    | 187/354 [16:54<15:06,  5.43s/it][A

	loss_cls: tensor(0.6160, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3104, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9264, device='cuda:0', grad_fn=<AddBackward0>)



 53%|█████▎    | 188/354 [17:00<15:00,  5.43s/it][A

	loss_cls: tensor(0.9149, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1522, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0672, device='cuda:0', grad_fn=<AddBackward0>)



 53%|█████▎    | 189/354 [17:05<14:55,  5.43s/it][A

	loss_cls: tensor(0.5123, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2759, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7882, device='cuda:0', grad_fn=<AddBackward0>)



 54%|█████▎    | 190/354 [17:11<14:53,  5.45s/it][A

	loss_cls: tensor(0.6095, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2828, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8923, device='cuda:0', grad_fn=<AddBackward0>)



 54%|█████▍    | 191/354 [17:16<14:47,  5.44s/it][A

	loss_cls: tensor(0.5740, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1105, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6845, device='cuda:0', grad_fn=<AddBackward0>)



 54%|█████▍    | 192/354 [17:22<14:40,  5.43s/it][A

	loss_cls: tensor(1.0079, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4159, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.4239, device='cuda:0', grad_fn=<AddBackward0>)



 55%|█████▍    | 193/354 [17:27<14:37,  5.45s/it][A

	loss_cls: tensor(0.9973, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3299, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.3272, device='cuda:0', grad_fn=<AddBackward0>)



 55%|█████▍    | 194/354 [17:32<14:29,  5.44s/it][A

	loss_cls: tensor(0.5688, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2469, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8157, device='cuda:0', grad_fn=<AddBackward0>)



 55%|█████▌    | 195/354 [17:38<14:27,  5.46s/it][A

	loss_cls: tensor(0.4598, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2559, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7157, device='cuda:0', grad_fn=<AddBackward0>)



 55%|█████▌    | 196/354 [17:43<14:21,  5.45s/it][A

	loss_cls: tensor(0.5489, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2999, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8488, device='cuda:0', grad_fn=<AddBackward0>)



 56%|█████▌    | 197/354 [17:49<14:19,  5.47s/it][A

	loss_cls: tensor(0.2637, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0690, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.3328, device='cuda:0', grad_fn=<AddBackward0>)



 56%|█████▌    | 198/354 [17:54<14:11,  5.46s/it][A

	loss_cls: tensor(0.5685, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2664, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8349, device='cuda:0', grad_fn=<AddBackward0>)



 56%|█████▌    | 199/354 [18:00<14:04,  5.45s/it][A

	loss_cls: tensor(0.6112, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2348, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8460, device='cuda:0', grad_fn=<AddBackward0>)



 56%|█████▋    | 200/354 [18:05<14:02,  5.47s/it][A

	loss_cls: tensor(0.3704, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3856, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7560, device='cuda:0', grad_fn=<AddBackward0>)



 57%|█████▋    | 201/354 [18:11<13:54,  5.46s/it][A

	loss_cls: tensor(1.1576, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2962, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.4538, device='cuda:0', grad_fn=<AddBackward0>)



 57%|█████▋    | 202/354 [18:16<14:01,  5.54s/it][A

	loss_cls: tensor(0.7548, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1889, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9437, device='cuda:0', grad_fn=<AddBackward0>)



 57%|█████▋    | 203/354 [18:22<13:51,  5.51s/it][A

	loss_cls: tensor(0.6022, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1625, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7647, device='cuda:0', grad_fn=<AddBackward0>)



 58%|█████▊    | 204/354 [18:27<13:45,  5.51s/it][A

	loss_cls: tensor(0.5238, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1344, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6582, device='cuda:0', grad_fn=<AddBackward0>)



 58%|█████▊    | 205/354 [18:33<13:35,  5.47s/it][A

	loss_cls: tensor(0.4627, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3240, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7867, device='cuda:0', grad_fn=<AddBackward0>)



 58%|█████▊    | 206/354 [18:38<13:26,  5.45s/it][A

	loss_cls: tensor(0.5126, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1094, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6219, device='cuda:0', grad_fn=<AddBackward0>)



 58%|█████▊    | 207/354 [18:44<13:22,  5.46s/it][A

	loss_cls: tensor(0.5096, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1205, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6301, device='cuda:0', grad_fn=<AddBackward0>)



 59%|█████▉    | 208/354 [18:49<13:15,  5.45s/it][A

	loss_cls: tensor(0.6674, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2754, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9428, device='cuda:0', grad_fn=<AddBackward0>)



 59%|█████▉    | 209/354 [18:55<13:13,  5.47s/it][A

	loss_cls: tensor(0.5959, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1526, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7485, device='cuda:0', grad_fn=<AddBackward0>)



 59%|█████▉    | 210/354 [19:00<13:05,  5.45s/it][A

	loss_cls: tensor(0.6692, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1719, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8411, device='cuda:0', grad_fn=<AddBackward0>)



 60%|█████▉    | 211/354 [19:05<13:00,  5.46s/it][A

	loss_cls: tensor(0.5773, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0986, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6759, device='cuda:0', grad_fn=<AddBackward0>)



 60%|█████▉    | 212/354 [19:11<12:52,  5.44s/it][A

	loss_cls: tensor(0.6366, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1961, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8327, device='cuda:0', grad_fn=<AddBackward0>)



 60%|██████    | 213/354 [19:16<12:45,  5.43s/it][A

	loss_cls: tensor(0.6348, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0933, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7281, device='cuda:0', grad_fn=<AddBackward0>)



 60%|██████    | 214/354 [19:22<12:43,  5.46s/it][A

	loss_cls: tensor(0.4864, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1339, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6203, device='cuda:0', grad_fn=<AddBackward0>)



 61%|██████    | 215/354 [19:27<12:34,  5.43s/it][A

	loss_cls: tensor(0.6923, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4278, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1201, device='cuda:0', grad_fn=<AddBackward0>)



 61%|██████    | 216/354 [19:33<12:33,  5.46s/it][A

	loss_cls: tensor(0.7161, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1672, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8832, device='cuda:0', grad_fn=<AddBackward0>)



 61%|██████▏   | 217/354 [19:38<12:26,  5.45s/it][A

	loss_cls: tensor(0.8448, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2814, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1262, device='cuda:0', grad_fn=<AddBackward0>)



 62%|██████▏   | 218/354 [19:44<12:23,  5.47s/it][A

	loss_cls: tensor(0.5921, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4898, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0820, device='cuda:0', grad_fn=<AddBackward0>)



 62%|██████▏   | 219/354 [19:49<12:15,  5.45s/it][A

	loss_cls: tensor(0.4266, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0461, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4727, device='cuda:0', grad_fn=<AddBackward0>)



 62%|██████▏   | 220/354 [19:54<12:08,  5.44s/it][A

	loss_cls: tensor(0.6259, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1062, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7322, device='cuda:0', grad_fn=<AddBackward0>)



 62%|██████▏   | 221/354 [20:00<12:04,  5.45s/it][A

	loss_cls: tensor(0.7609, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2392, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0001, device='cuda:0', grad_fn=<AddBackward0>)



 63%|██████▎   | 222/354 [20:05<11:58,  5.44s/it][A

	loss_cls: tensor(0.7795, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2239, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0033, device='cuda:0', grad_fn=<AddBackward0>)



 63%|██████▎   | 223/354 [20:11<11:56,  5.47s/it][A

	loss_cls: tensor(0.4164, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1796, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5960, device='cuda:0', grad_fn=<AddBackward0>)



 63%|██████▎   | 224/354 [20:16<11:49,  5.46s/it][A

	loss_cls: tensor(0.3366, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5961, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9327, device='cuda:0', grad_fn=<AddBackward0>)



 64%|██████▎   | 225/354 [20:22<11:45,  5.47s/it][A

	loss_cls: tensor(0.5060, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1117, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6177, device='cuda:0', grad_fn=<AddBackward0>)



 64%|██████▍   | 226/354 [20:27<11:37,  5.45s/it][A

	loss_cls: tensor(0.7297, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3903, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1200, device='cuda:0', grad_fn=<AddBackward0>)



 64%|██████▍   | 227/354 [20:33<11:31,  5.44s/it][A

	loss_cls: tensor(0.5942, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4486, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0428, device='cuda:0', grad_fn=<AddBackward0>)



 64%|██████▍   | 228/354 [20:38<11:27,  5.46s/it][A

	loss_cls: tensor(0.8771, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2632, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1404, device='cuda:0', grad_fn=<AddBackward0>)



 65%|██████▍   | 229/354 [20:44<11:21,  5.45s/it][A

	loss_cls: tensor(0.8940, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3282, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2222, device='cuda:0', grad_fn=<AddBackward0>)



 65%|██████▍   | 230/354 [20:49<11:19,  5.48s/it][A

	loss_cls: tensor(0.5735, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3093, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8829, device='cuda:0', grad_fn=<AddBackward0>)



 65%|██████▌   | 231/354 [20:54<11:10,  5.45s/it][A

	loss_cls: tensor(0.5335, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1214, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6549, device='cuda:0', grad_fn=<AddBackward0>)



 66%|██████▌   | 232/354 [21:00<11:06,  5.46s/it][A

	loss_cls: tensor(0.8466, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1607, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0072, device='cuda:0', grad_fn=<AddBackward0>)



 66%|██████▌   | 233/354 [21:05<10:59,  5.45s/it][A

	loss_cls: tensor(0.5309, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2028, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7337, device='cuda:0', grad_fn=<AddBackward0>)



 66%|██████▌   | 234/354 [21:11<10:49,  5.42s/it][A

	loss_cls: tensor(0.8188, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2124, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0313, device='cuda:0', grad_fn=<AddBackward0>)



 66%|██████▋   | 235/354 [21:16<10:47,  5.44s/it][A

	loss_cls: tensor(0.8144, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1108, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9252, device='cuda:0', grad_fn=<AddBackward0>)



 67%|██████▋   | 236/354 [21:22<10:42,  5.44s/it][A

	loss_cls: tensor(0.5609, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3544, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9153, device='cuda:0', grad_fn=<AddBackward0>)



 67%|██████▋   | 237/354 [21:27<10:39,  5.47s/it][A

	loss_cls: tensor(0.6098, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2090, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8188, device='cuda:0', grad_fn=<AddBackward0>)



 67%|██████▋   | 238/354 [21:33<10:33,  5.46s/it][A

	loss_cls: tensor(0.7594, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1333, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8927, device='cuda:0', grad_fn=<AddBackward0>)



 68%|██████▊   | 239/354 [21:38<10:29,  5.48s/it][A

	loss_cls: tensor(0.4994, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2015, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7008, device='cuda:0', grad_fn=<AddBackward0>)



 68%|██████▊   | 240/354 [21:44<10:22,  5.46s/it][A

	loss_cls: tensor(0.5578, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1909, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7487, device='cuda:0', grad_fn=<AddBackward0>)



 68%|██████▊   | 241/354 [21:49<10:15,  5.45s/it][A

	loss_cls: tensor(0.4998, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3330, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8329, device='cuda:0', grad_fn=<AddBackward0>)



 68%|██████▊   | 242/354 [21:55<10:13,  5.47s/it][A

	loss_cls: tensor(0.5059, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1644, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6702, device='cuda:0', grad_fn=<AddBackward0>)



 69%|██████▊   | 243/354 [22:00<10:06,  5.47s/it][A

	loss_cls: tensor(0.6207, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1025, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7232, device='cuda:0', grad_fn=<AddBackward0>)



 69%|██████▉   | 244/354 [22:06<10:03,  5.49s/it][A

	loss_cls: tensor(0.4922, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1724, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6646, device='cuda:0', grad_fn=<AddBackward0>)



 69%|██████▉   | 245/354 [22:11<09:56,  5.48s/it][A

	loss_cls: tensor(0.5661, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0523, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6184, device='cuda:0', grad_fn=<AddBackward0>)



 69%|██████▉   | 246/354 [22:17<09:53,  5.49s/it][A

	loss_cls: tensor(0.8066, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0618, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8684, device='cuda:0', grad_fn=<AddBackward0>)



 70%|██████▉   | 247/354 [22:22<09:45,  5.47s/it][A

	loss_cls: tensor(0.5972, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1038, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7010, device='cuda:0', grad_fn=<AddBackward0>)



 70%|███████   | 248/354 [22:27<09:41,  5.48s/it][A

	loss_cls: tensor(0.5953, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1599, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7552, device='cuda:0', grad_fn=<AddBackward0>)



 70%|███████   | 249/354 [22:33<09:35,  5.48s/it][A

	loss_cls: tensor(0.5868, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1568, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7436, device='cuda:0', grad_fn=<AddBackward0>)



 71%|███████   | 250/354 [22:38<09:27,  5.46s/it][A

	loss_cls: tensor(0.8071, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1334, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9405, device='cuda:0', grad_fn=<AddBackward0>)



 71%|███████   | 251/354 [22:44<09:24,  5.48s/it][A

	loss_cls: tensor(0.5820, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0766, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6586, device='cuda:0', grad_fn=<AddBackward0>)



 71%|███████   | 252/354 [22:49<09:16,  5.46s/it][A

	loss_cls: tensor(0.4438, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1672, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6111, device='cuda:0', grad_fn=<AddBackward0>)



 71%|███████▏  | 253/354 [22:55<09:12,  5.47s/it][A

	loss_cls: tensor(0.5242, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2801, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8043, device='cuda:0', grad_fn=<AddBackward0>)



 72%|███████▏  | 254/354 [23:00<09:04,  5.45s/it][A

	loss_cls: tensor(0.4662, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1555, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6217, device='cuda:0', grad_fn=<AddBackward0>)



 72%|███████▏  | 255/354 [23:06<09:00,  5.46s/it][A

	loss_cls: tensor(0.3312, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1412, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4724, device='cuda:0', grad_fn=<AddBackward0>)



 72%|███████▏  | 256/354 [23:11<08:55,  5.46s/it][A

	loss_cls: tensor(0.6939, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1062, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8001, device='cuda:0', grad_fn=<AddBackward0>)



 73%|███████▎  | 257/354 [23:17<08:48,  5.44s/it][A

	loss_cls: tensor(0.3693, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0277, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.3970, device='cuda:0', grad_fn=<AddBackward0>)



 73%|███████▎  | 258/354 [23:22<08:43,  5.45s/it][A

	loss_cls: tensor(0.5014, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3991, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9006, device='cuda:0', grad_fn=<AddBackward0>)



 73%|███████▎  | 259/354 [23:27<08:37,  5.45s/it][A

	loss_cls: tensor(0.9177, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3252, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2428, device='cuda:0', grad_fn=<AddBackward0>)



 73%|███████▎  | 260/354 [23:33<08:33,  5.46s/it][A

	loss_cls: tensor(0.3359, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3382, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6741, device='cuda:0', grad_fn=<AddBackward0>)



 74%|███████▎  | 261/354 [23:38<08:26,  5.45s/it][A

	loss_cls: tensor(0.6702, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2062, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8763, device='cuda:0', grad_fn=<AddBackward0>)



 74%|███████▍  | 262/354 [23:44<08:23,  5.47s/it][A

	loss_cls: tensor(0.4928, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1345, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6273, device='cuda:0', grad_fn=<AddBackward0>)
	loss_cls: 


 74%|███████▍  | 263/354 [23:51<09:10,  6.05s/it][A

tensor(0.9819, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.6530, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.6348, device='cuda:0', grad_fn=<AddBackward0>)



 75%|███████▍  | 264/354 [23:57<08:48,  5.87s/it][A

	loss_cls: tensor(0.4148, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1052, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5200, device='cuda:0', grad_fn=<AddBackward0>)



 75%|███████▍  | 265/354 [24:03<08:41,  5.86s/it][A

	loss_cls: tensor(0.8003, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3235, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1238, device='cuda:0', grad_fn=<AddBackward0>)



 75%|███████▌  | 266/354 [24:08<08:29,  5.79s/it][A

	loss_cls: tensor(0.6733, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1904, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8638, device='cuda:0', grad_fn=<AddBackward0>)



 75%|███████▌  | 267/354 [24:14<08:15,  5.70s/it][A

	loss_cls: tensor(0.7750, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4376, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2126, device='cuda:0', grad_fn=<AddBackward0>)



 76%|███████▌  | 268/354 [24:19<08:02,  5.61s/it][A

	loss_cls: tensor(0.4153, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1188, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5341, device='cuda:0', grad_fn=<AddBackward0>)



 76%|███████▌  | 269/354 [24:25<07:54,  5.58s/it][A

	loss_cls: tensor(0.5241, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0419, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5660, device='cuda:0', grad_fn=<AddBackward0>)



 76%|███████▋  | 270/354 [24:30<07:45,  5.54s/it][A

	loss_cls: tensor(0.4809, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1825, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6634, device='cuda:0', grad_fn=<AddBackward0>)



 77%|███████▋  | 271/354 [24:35<07:35,  5.49s/it][A

	loss_cls: tensor(0.4722, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0388, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5110, device='cuda:0', grad_fn=<AddBackward0>)



 77%|███████▋  | 272/354 [24:41<07:30,  5.49s/it][A

	loss_cls: tensor(0.4678, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0616, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5294, device='cuda:0', grad_fn=<AddBackward0>)



 77%|███████▋  | 273/354 [24:46<07:22,  5.46s/it][A

	loss_cls: tensor(0.4820, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0496, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5315, device='cuda:0', grad_fn=<AddBackward0>)



 77%|███████▋  | 274/354 [24:52<07:17,  5.46s/it][A

	loss_cls: tensor(0.6377, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1355, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7733, device='cuda:0', grad_fn=<AddBackward0>)



 78%|███████▊  | 275/354 [24:57<07:10,  5.45s/it][A

	loss_cls: tensor(0.4911, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2726, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7637, device='cuda:0', grad_fn=<AddBackward0>)



 78%|███████▊  | 276/354 [25:03<07:06,  5.46s/it][A

	loss_cls: tensor(0.4135, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0682, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4817, device='cuda:0', grad_fn=<AddBackward0>)



 78%|███████▊  | 277/354 [25:08<06:59,  5.44s/it][A

	loss_cls: tensor(0.5860, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2712, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8573, device='cuda:0', grad_fn=<AddBackward0>)



 79%|███████▊  | 278/354 [25:13<06:53,  5.44s/it][A

	loss_cls: tensor(0.4871, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1777, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6648, device='cuda:0', grad_fn=<AddBackward0>)



 79%|███████▉  | 279/354 [25:19<06:49,  5.46s/it][A

	loss_cls: tensor(0.8353, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2786, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1139, device='cuda:0', grad_fn=<AddBackward0>)



 79%|███████▉  | 280/354 [25:24<06:42,  5.43s/it][A

	loss_cls: tensor(0.8618, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1623, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0241, device='cuda:0', grad_fn=<AddBackward0>)



 79%|███████▉  | 281/354 [25:30<06:40,  5.49s/it][A

	loss_cls: tensor(1.1208, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1378, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2586, device='cuda:0', grad_fn=<AddBackward0>)



 80%|███████▉  | 282/354 [25:36<06:36,  5.50s/it][A

	loss_cls: tensor(0.4860, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3482, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8342, device='cuda:0', grad_fn=<AddBackward0>)



 80%|███████▉  | 283/354 [25:41<06:32,  5.53s/it][A

	loss_cls: tensor(0.5286, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2601, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7887, device='cuda:0', grad_fn=<AddBackward0>)



 80%|████████  | 284/354 [25:47<06:27,  5.53s/it][A

	loss_cls: tensor(0.6322, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2806, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9129, device='cuda:0', grad_fn=<AddBackward0>)



 81%|████████  | 285/354 [25:52<06:21,  5.53s/it][A

	loss_cls: tensor(0.5629, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2611, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8240, device='cuda:0', grad_fn=<AddBackward0>)



 81%|████████  | 286/354 [25:58<06:17,  5.55s/it][A

	loss_cls: tensor(0.4215, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0734, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4948, device='cuda:0', grad_fn=<AddBackward0>)



 81%|████████  | 287/354 [26:03<06:10,  5.53s/it][A

	loss_cls: tensor(0.6748, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3357, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0104, device='cuda:0', grad_fn=<AddBackward0>)



 81%|████████▏ | 288/354 [26:09<06:06,  5.55s/it][A

	loss_cls: tensor(0.5626, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4257, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9883, device='cuda:0', grad_fn=<AddBackward0>)



 82%|████████▏ | 289/354 [26:14<05:59,  5.54s/it][A

	loss_cls: tensor(0.8028, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3018, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1046, device='cuda:0', grad_fn=<AddBackward0>)



 82%|████████▏ | 290/354 [26:20<05:55,  5.56s/it][A

	loss_cls: tensor(0.6117, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5651, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1768, device='cuda:0', grad_fn=<AddBackward0>)



 82%|████████▏ | 291/354 [26:26<05:50,  5.56s/it][A

	loss_cls: tensor(0.5616, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2340, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7956, device='cuda:0', grad_fn=<AddBackward0>)



 82%|████████▏ | 292/354 [26:31<05:43,  5.54s/it][A

	loss_cls: tensor(0.8346, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3427, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1773, device='cuda:0', grad_fn=<AddBackward0>)



 83%|████████▎ | 293/354 [26:37<05:38,  5.54s/it][A

	loss_cls: tensor(0.4295, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2579, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6873, device='cuda:0', grad_fn=<AddBackward0>)



 83%|████████▎ | 294/354 [26:42<05:31,  5.53s/it][A

	loss_cls: tensor(0.7281, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2072, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9354, device='cuda:0', grad_fn=<AddBackward0>)



 83%|████████▎ | 295/354 [26:48<05:27,  5.56s/it][A

	loss_cls: tensor(0.8001, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2127, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0128, device='cuda:0', grad_fn=<AddBackward0>)



 84%|████████▎ | 296/354 [26:53<05:21,  5.55s/it][A

	loss_cls: tensor(0.6431, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1177, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7608, device='cuda:0', grad_fn=<AddBackward0>)



 84%|████████▍ | 297/354 [26:59<05:16,  5.56s/it][A

	loss_cls: tensor(0.4528, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3013, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7541, device='cuda:0', grad_fn=<AddBackward0>)



 84%|████████▍ | 298/354 [27:04<05:10,  5.54s/it][A

	loss_cls: tensor(0.4910, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2158, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7068, device='cuda:0', grad_fn=<AddBackward0>)



 84%|████████▍ | 299/354 [27:10<05:04,  5.53s/it][A

	loss_cls: tensor(0.8289, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2688, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0977, device='cuda:0', grad_fn=<AddBackward0>)



 85%|████████▍ | 300/354 [27:15<04:59,  5.55s/it][A

	loss_cls: tensor(0.6367, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1168, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7536, device='cuda:0', grad_fn=<AddBackward0>)



 85%|████████▌ | 301/354 [27:21<04:53,  5.53s/it][A

	loss_cls: tensor(0.5377, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1204, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6581, device='cuda:0', grad_fn=<AddBackward0>)



 85%|████████▌ | 302/354 [27:27<04:49,  5.56s/it][A

	loss_cls: tensor(0.5619, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2493, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8112, device='cuda:0', grad_fn=<AddBackward0>)



 86%|████████▌ | 303/354 [27:32<04:41,  5.52s/it][A

	loss_cls: tensor(0.6681, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1998, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8679, device='cuda:0', grad_fn=<AddBackward0>)



 86%|████████▌ | 304/354 [27:38<04:37,  5.55s/it][A

	loss_cls: tensor(0.7749, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0983, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8731, device='cuda:0', grad_fn=<AddBackward0>)



 86%|████████▌ | 305/354 [27:43<04:31,  5.54s/it][A

	loss_cls: tensor(0.4776, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1879, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6655, device='cuda:0', grad_fn=<AddBackward0>)



 86%|████████▋ | 306/354 [27:49<04:25,  5.54s/it][A

	loss_cls: tensor(0.5029, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3777, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8806, device='cuda:0', grad_fn=<AddBackward0>)



 87%|████████▋ | 307/354 [27:54<04:20,  5.55s/it][A

	loss_cls: tensor(0.5024, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1934, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6958, device='cuda:0', grad_fn=<AddBackward0>)



 87%|████████▋ | 308/354 [28:00<04:14,  5.54s/it][A

	loss_cls: tensor(0.5352, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0784, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6136, device='cuda:0', grad_fn=<AddBackward0>)



 87%|████████▋ | 309/354 [28:05<04:10,  5.56s/it][A

	loss_cls: tensor(0.7312, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3421, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0733, device='cuda:0', grad_fn=<AddBackward0>)



 88%|████████▊ | 310/354 [28:11<04:04,  5.55s/it][A

	loss_cls: tensor(0.9601, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1831, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1432, device='cuda:0', grad_fn=<AddBackward0>)



 88%|████████▊ | 311/354 [28:16<03:59,  5.57s/it][A

	loss_cls: tensor(0.5155, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2608, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7764, device='cuda:0', grad_fn=<AddBackward0>)



 88%|████████▊ | 312/354 [28:22<03:53,  5.55s/it][A

	loss_cls: tensor(0.7724, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1732, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9456, device='cuda:0', grad_fn=<AddBackward0>)



 88%|████████▊ | 313/354 [28:27<03:46,  5.53s/it][A

	loss_cls: tensor(0.5064, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2001, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7065, device='cuda:0', grad_fn=<AddBackward0>)



 89%|████████▊ | 314/354 [28:33<03:41,  5.55s/it][A

	loss_cls: tensor(0.5733, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5867, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1600, device='cuda:0', grad_fn=<AddBackward0>)



 89%|████████▉ | 315/354 [28:38<03:29,  5.36s/it][A

	loss_cls: tensor(0.4018, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1893, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5910, device='cuda:0', grad_fn=<AddBackward0>)



 89%|████████▉ | 316/354 [28:43<03:19,  5.26s/it][A

	loss_cls: tensor(0.6201, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0479, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6680, device='cuda:0', grad_fn=<AddBackward0>)



 90%|████████▉ | 317/354 [28:48<03:11,  5.18s/it][A

	loss_cls: tensor(0.5158, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2252, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7410, device='cuda:0', grad_fn=<AddBackward0>)



 90%|████████▉ | 318/354 [28:53<03:06,  5.17s/it][A

	loss_cls: tensor(0.4645, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1112, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5757, device='cuda:0', grad_fn=<AddBackward0>)



 90%|█████████ | 319/354 [28:59<03:04,  5.27s/it][A

	loss_cls: tensor(0.5440, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5831, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1271, device='cuda:0', grad_fn=<AddBackward0>)



 90%|█████████ | 320/354 [29:04<03:01,  5.34s/it][A

	loss_cls: tensor(0.5750, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1685, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7435, device='cuda:0', grad_fn=<AddBackward0>)



 91%|█████████ | 321/354 [29:10<02:58,  5.42s/it][A

	loss_cls: tensor(0.5158, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3637, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8796, device='cuda:0', grad_fn=<AddBackward0>)



 91%|█████████ | 322/354 [29:15<02:54,  5.46s/it][A

	loss_cls: tensor(0.7916, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2890, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0806, device='cuda:0', grad_fn=<AddBackward0>)



 91%|█████████ | 323/354 [29:21<02:49,  5.47s/it][A

	loss_cls: tensor(0.6475, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1773, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8248, device='cuda:0', grad_fn=<AddBackward0>)



 92%|█████████▏| 324/354 [29:26<02:43,  5.45s/it][A

	loss_cls: tensor(0.8960, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1504, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0464, device='cuda:0', grad_fn=<AddBackward0>)



 92%|█████████▏| 325/354 [29:32<02:38,  5.46s/it][A

	loss_cls: tensor(0.5016, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1536, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6552, device='cuda:0', grad_fn=<AddBackward0>)



 92%|█████████▏| 326/354 [29:37<02:31,  5.42s/it][A

	loss_cls: tensor(1.0485, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1522, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2007, device='cuda:0', grad_fn=<AddBackward0>)



 92%|█████████▏| 327/354 [29:42<02:26,  5.42s/it][A

	loss_cls: tensor(0.5323, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1722, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7045, device='cuda:0', grad_fn=<AddBackward0>)



 93%|█████████▎| 328/354 [29:48<02:21,  5.43s/it][A

	loss_cls: tensor(0.5992, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0942, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6933, device='cuda:0', grad_fn=<AddBackward0>)



 93%|█████████▎| 329/354 [29:53<02:15,  5.42s/it][A

	loss_cls: tensor(0.7263, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1461, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8724, device='cuda:0', grad_fn=<AddBackward0>)



 93%|█████████▎| 330/354 [29:59<02:10,  5.43s/it][A

	loss_cls: tensor(0.8231, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1114, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9346, device='cuda:0', grad_fn=<AddBackward0>)



 94%|█████████▎| 331/354 [30:04<02:04,  5.41s/it][A

	loss_cls: tensor(0.4620, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3513, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8133, device='cuda:0', grad_fn=<AddBackward0>)



 94%|█████████▍| 332/354 [30:10<01:59,  5.42s/it][A

	loss_cls: tensor(0.5655, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3156, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8811, device='cuda:0', grad_fn=<AddBackward0>)



 94%|█████████▍| 333/354 [30:15<01:53,  5.41s/it][A

	loss_cls: tensor(0.4508, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3426, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7933, device='cuda:0', grad_fn=<AddBackward0>)



 94%|█████████▍| 334/354 [30:20<01:47,  5.39s/it][A

	loss_cls: tensor(0.7929, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2244, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0173, device='cuda:0', grad_fn=<AddBackward0>)



 95%|█████████▍| 335/354 [30:26<01:42,  5.41s/it][A

	loss_cls: tensor(0.6089, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1252, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7341, device='cuda:0', grad_fn=<AddBackward0>)



 95%|█████████▍| 336/354 [30:31<01:37,  5.40s/it][A

	loss_cls: tensor(0.6799, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2778, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9577, device='cuda:0', grad_fn=<AddBackward0>)



 95%|█████████▌| 337/354 [30:37<01:32,  5.43s/it][A

	loss_cls: tensor(0.7025, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2920, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9945, device='cuda:0', grad_fn=<AddBackward0>)



 95%|█████████▌| 338/354 [30:42<01:26,  5.42s/it][A

	loss_cls: tensor(0.6971, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3933, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0904, device='cuda:0', grad_fn=<AddBackward0>)



 96%|█████████▌| 339/354 [30:47<01:21,  5.44s/it][A

	loss_cls: tensor(0.5096, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2203, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7299, device='cuda:0', grad_fn=<AddBackward0>)



 96%|█████████▌| 340/354 [30:53<01:15,  5.42s/it][A

	loss_cls: tensor(0.4955, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1767, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6723, device='cuda:0', grad_fn=<AddBackward0>)



 96%|█████████▋| 341/354 [30:58<01:10,  5.41s/it][A

	loss_cls: tensor(0.8105, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1720, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9825, device='cuda:0', grad_fn=<AddBackward0>)



 97%|█████████▋| 342/354 [31:04<01:05,  5.42s/it][A

	loss_cls: tensor(0.5786, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3586, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9372, device='cuda:0', grad_fn=<AddBackward0>)



 97%|█████████▋| 343/354 [31:09<00:59,  5.41s/it][A

	loss_cls: tensor(0.7391, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3729, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1120, device='cuda:0', grad_fn=<AddBackward0>)



 97%|█████████▋| 344/354 [31:15<00:54,  5.44s/it][A

	loss_cls: tensor(0.7643, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1455, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9099, device='cuda:0', grad_fn=<AddBackward0>)



 97%|█████████▋| 345/354 [31:20<00:48,  5.43s/it][A

	loss_cls: tensor(0.6483, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0519, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7002, device='cuda:0', grad_fn=<AddBackward0>)



 98%|█████████▊| 346/354 [31:25<00:43,  5.44s/it][A

	loss_cls: tensor(0.6766, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1555, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8321, device='cuda:0', grad_fn=<AddBackward0>)



 98%|█████████▊| 347/354 [31:31<00:37,  5.43s/it][A

	loss_cls: tensor(0.5367, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1937, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7304, device='cuda:0', grad_fn=<AddBackward0>)



 98%|█████████▊| 348/354 [31:36<00:32,  5.43s/it][A

	loss_cls: tensor(0.8097, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3259, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1357, device='cuda:0', grad_fn=<AddBackward0>)



 99%|█████████▊| 349/354 [31:42<00:27,  5.42s/it][A

	loss_cls: tensor(0.4356, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1487, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5842, device='cuda:0', grad_fn=<AddBackward0>)



 99%|█████████▉| 350/354 [31:47<00:21,  5.41s/it][A

	loss_cls: tensor(0.6236, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3513, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9749, device='cuda:0', grad_fn=<AddBackward0>)



 99%|█████████▉| 351/354 [31:53<00:16,  5.43s/it][A

	loss_cls: tensor(0.6410, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3998, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0408, device='cuda:0', grad_fn=<AddBackward0>)



 99%|█████████▉| 352/354 [31:58<00:10,  5.42s/it][A

	loss_cls: tensor(0.8161, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1918, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0079, device='cuda:0', grad_fn=<AddBackward0>)



100%|█████████▉| 353/354 [32:03<00:05,  5.43s/it][A

	loss_cls: tensor(0.6656, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3340, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9996, device='cuda:0', grad_fn=<AddBackward0>)



100%|██████████| 354/354 [32:05<00:00,  5.44s/it][A

  0%|          | 0/177 [00:00<?, ?it/s][A

	loss_cls: tensor(0.8174, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4024, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2198, device='cuda:0', grad_fn=<AddBackward0>)

	Training Loss: 0.8223844445863012

	Training cls acc: 0.6940913370998117

	Training cls prec: 0.5732177466287635

	Training cls rec: 0.6209063888089312

	Training cls f1: 0.5413156518708553

--
	Training ner acc: 0.9552812482305645

	Training ner prec: 0.2705049117567998

	Training ner rec: 0.27922483546134064

	Training ner f1: 0.27452484251029474

	Current Learning rate:  0.00011428571428571428



  1%|          | 1/177 [00:00<02:15,  1.30it/s][A
  1%|          | 2/177 [00:01<02:10,  1.34it/s][A
  2%|▏         | 3/177 [00:02<02:03,  1.40it/s][A
  2%|▏         | 4/177 [00:02<02:05,  1.38it/s][A
  3%|▎         | 5/177 [00:03<02:06,  1.36it/s][A
  3%|▎         | 6/177 [00:04<02:01,  1.41it/s][A
  4%|▍         | 7/177 [00:05<02:02,  1.39it/s][A
  5%|▍         | 8/177 [00:05<02:03,  1.37it/s][A
  5%|▌         | 9/177 [00:06<02:03,  1.36it/s][A
  6%|▌         | 10/177 [00:07<01:59,  1.40it/s][A
  6%|▌         | 11/177 [00:07<02:00,  1.38it/s][A
  7%|▋         | 12/177 [00:08<02:00,  1.37it/s][A
  7%|▋         | 13/177 [00:09<01:56,  1.41it/s][A
  8%|▊         | 14/177 [00:10<01:56,  1.40it/s][A
  8%|▊         | 15/177 [00:10<01:57,  1.38it/s][A
  9%|▉         | 16/177 [00:11<01:57,  1.37it/s][A
 10%|▉         | 17/177 [00:12<01:53,  1.41it/s][A
 10%|█         | 18/177 [00:13<01:54,  1.39it/s][A
 11%|█         | 19/177 [00:13<01:54,  1.38it/s][A
 11%|█▏        | 20/


	Validation Loss: 0.789600205286748

	Validation cls acc: 0.6732580037664783

	Validation cls prec: 0.6021926284638149

	Validation cls rec: 0.5723601022329836

	Validation cls f1: 0.5319319726099386

--
	Validation ner acc: 0.9544103979400342

	Validation ner prec: 0.4261632007023146

	Validation ner rec: 0.436723163841808

	Validation ner f1: 0.43122155623365044



  0%|          | 1/354 [00:05<31:31,  5.36s/it][A

	loss_cls: tensor(0.5423, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0514, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5937, device='cuda:0', grad_fn=<AddBackward0>)



  1%|          | 2/354 [00:10<31:21,  5.34s/it][A

	loss_cls: tensor(0.5943, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1762, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7705, device='cuda:0', grad_fn=<AddBackward0>)



  1%|          | 3/354 [00:16<31:26,  5.38s/it][A

	loss_cls: tensor(0.5971, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1563, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7534, device='cuda:0', grad_fn=<AddBackward0>)



  1%|          | 4/354 [00:21<31:17,  5.36s/it][A

	loss_cls: tensor(0.5002, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0671, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5673, device='cuda:0', grad_fn=<AddBackward0>)



  1%|▏         | 5/354 [00:26<31:22,  5.39s/it][A

	loss_cls: tensor(0.7544, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2347, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9891, device='cuda:0', grad_fn=<AddBackward0>)



  2%|▏         | 6/354 [00:32<31:11,  5.38s/it][A

	loss_cls: tensor(0.5865, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1795, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7660, device='cuda:0', grad_fn=<AddBackward0>)



  2%|▏         | 7/354 [00:37<31:14,  5.40s/it][A

	loss_cls: tensor(0.9079, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3732, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2811, device='cuda:0', grad_fn=<AddBackward0>)



  2%|▏         | 8/354 [00:43<31:02,  5.38s/it][A

	loss_cls: tensor(0.5967, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1812, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7779, device='cuda:0', grad_fn=<AddBackward0>)



  3%|▎         | 9/354 [00:48<30:49,  5.36s/it][A

	loss_cls: tensor(0.5401, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2548, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7949, device='cuda:0', grad_fn=<AddBackward0>)



  3%|▎         | 10/354 [00:53<30:49,  5.38s/it][A

	loss_cls: tensor(0.6050, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2274, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8325, device='cuda:0', grad_fn=<AddBackward0>)



  3%|▎         | 11/354 [00:59<30:43,  5.37s/it][A

	loss_cls: tensor(0.6003, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2976, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8979, device='cuda:0', grad_fn=<AddBackward0>)



  3%|▎         | 12/354 [01:04<30:45,  5.40s/it][A

	loss_cls: tensor(0.5686, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2356, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8043, device='cuda:0', grad_fn=<AddBackward0>)



  4%|▎         | 13/354 [01:09<30:40,  5.40s/it][A

	loss_cls: tensor(0.7411, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1433, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8844, device='cuda:0', grad_fn=<AddBackward0>)



  4%|▍         | 14/354 [01:15<31:33,  5.57s/it][A

	loss_cls: tensor(0.6160, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1631, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7791, device='cuda:0', grad_fn=<AddBackward0>)



  4%|▍         | 15/354 [01:21<31:23,  5.56s/it][A

	loss_cls: tensor(0.5045, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1790, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6836, device='cuda:0', grad_fn=<AddBackward0>)



  5%|▍         | 16/354 [01:27<32:08,  5.70s/it][A

	loss_cls: tensor(0.5511, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1622, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7133, device='cuda:0', grad_fn=<AddBackward0>)



  5%|▍         | 17/354 [01:33<32:04,  5.71s/it][A

	loss_cls: tensor(0.7265, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1677, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8942, device='cuda:0', grad_fn=<AddBackward0>)



  5%|▌         | 18/354 [01:38<31:26,  5.61s/it][A

	loss_cls: tensor(0.7677, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.6530, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.4208, device='cuda:0', grad_fn=<AddBackward0>)



  5%|▌         | 19/354 [01:44<30:58,  5.55s/it][A

	loss_cls: tensor(0.5996, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0910, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6906, device='cuda:0', grad_fn=<AddBackward0>)



  6%|▌         | 20/354 [01:49<30:37,  5.50s/it][A

	loss_cls: tensor(0.4689, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1628, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6317, device='cuda:0', grad_fn=<AddBackward0>)



  6%|▌         | 21/354 [01:54<30:28,  5.49s/it][A

	loss_cls: tensor(0.5744, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0675, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6419, device='cuda:0', grad_fn=<AddBackward0>)



  6%|▌         | 22/354 [02:00<30:15,  5.47s/it][A

	loss_cls: tensor(0.5283, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2004, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7287, device='cuda:0', grad_fn=<AddBackward0>)



  6%|▋         | 23/354 [02:05<30:00,  5.44s/it][A

	loss_cls: tensor(0.5271, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1303, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6574, device='cuda:0', grad_fn=<AddBackward0>)



  7%|▋         | 24/354 [02:11<29:53,  5.43s/it][A

	loss_cls: tensor(0.5371, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3882, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9253, device='cuda:0', grad_fn=<AddBackward0>)



  7%|▋         | 25/354 [02:16<29:40,  5.41s/it][A

	loss_cls: tensor(0.5231, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2645, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7876, device='cuda:0', grad_fn=<AddBackward0>)



  7%|▋         | 26/354 [02:21<29:40,  5.43s/it][A

	loss_cls: tensor(0.5399, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1715, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7114, device='cuda:0', grad_fn=<AddBackward0>)



  8%|▊         | 27/354 [02:27<29:27,  5.41s/it][A

	loss_cls: tensor(0.5652, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2512, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8164, device='cuda:0', grad_fn=<AddBackward0>)



  8%|▊         | 28/354 [02:32<29:23,  5.41s/it][A

	loss_cls: tensor(0.5312, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0451, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5763, device='cuda:0', grad_fn=<AddBackward0>)



  8%|▊         | 29/354 [02:38<29:12,  5.39s/it][A

	loss_cls: tensor(0.5022, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0464, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5486, device='cuda:0', grad_fn=<AddBackward0>)



  8%|▊         | 30/354 [02:43<28:59,  5.37s/it][A

	loss_cls: tensor(0.6900, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0826, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7726, device='cuda:0', grad_fn=<AddBackward0>)



  9%|▉         | 31/354 [02:48<29:00,  5.39s/it][A

	loss_cls: tensor(0.7445, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2062, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9506, device='cuda:0', grad_fn=<AddBackward0>)



  9%|▉         | 32/354 [02:54<28:49,  5.37s/it][A

	loss_cls: tensor(0.6974, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1072, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8046, device='cuda:0', grad_fn=<AddBackward0>)



  9%|▉         | 33/354 [02:59<28:52,  5.40s/it][A

	loss_cls: tensor(0.6546, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3768, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0314, device='cuda:0', grad_fn=<AddBackward0>)



 10%|▉         | 34/354 [03:04<28:43,  5.39s/it][A

	loss_cls: tensor(0.6141, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1542, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7682, device='cuda:0', grad_fn=<AddBackward0>)



 10%|▉         | 35/354 [03:10<28:43,  5.40s/it][A

	loss_cls: tensor(0.6182, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1497, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7679, device='cuda:0', grad_fn=<AddBackward0>)



 10%|█         | 36/354 [03:15<28:32,  5.39s/it][A

	loss_cls: tensor(0.6462, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2662, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9125, device='cuda:0', grad_fn=<AddBackward0>)



 10%|█         | 37/354 [03:21<28:32,  5.40s/it][A

	loss_cls: tensor(0.5452, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1401, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6853, device='cuda:0', grad_fn=<AddBackward0>)



 11%|█         | 38/354 [03:27<29:12,  5.55s/it][A

	loss_cls: tensor(0.5262, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2612, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7874, device='cuda:0', grad_fn=<AddBackward0>)



 11%|█         | 39/354 [03:32<29:08,  5.55s/it][A

	loss_cls: tensor(0.4216, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2764, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6980, device='cuda:0', grad_fn=<AddBackward0>)



 11%|█▏        | 40/354 [03:38<29:22,  5.61s/it][A

	loss_cls: tensor(0.7693, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2248, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9940, device='cuda:0', grad_fn=<AddBackward0>)



 12%|█▏        | 41/354 [03:43<28:56,  5.55s/it][A

	loss_cls: tensor(0.6332, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3283, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9615, device='cuda:0', grad_fn=<AddBackward0>)



 12%|█▏        | 42/354 [03:49<28:44,  5.53s/it][A

	loss_cls: tensor(0.4961, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3143, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8104, device='cuda:0', grad_fn=<AddBackward0>)



 12%|█▏        | 43/354 [03:54<28:33,  5.51s/it][A

	loss_cls: tensor(0.4829, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3132, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7961, device='cuda:0', grad_fn=<AddBackward0>)



 12%|█▏        | 44/354 [04:00<28:22,  5.49s/it][A

	loss_cls: tensor(0.5618, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3567, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9185, device='cuda:0', grad_fn=<AddBackward0>)



 13%|█▎        | 45/354 [04:05<28:04,  5.45s/it][A

	loss_cls: tensor(0.6146, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2782, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8928, device='cuda:0', grad_fn=<AddBackward0>)



 13%|█▎        | 46/354 [04:10<27:51,  5.43s/it][A

	loss_cls: tensor(0.4041, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1040, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5081, device='cuda:0', grad_fn=<AddBackward0>)



 13%|█▎        | 47/354 [04:16<27:49,  5.44s/it][A

	loss_cls: tensor(0.6054, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2094, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8148, device='cuda:0', grad_fn=<AddBackward0>)



 14%|█▎        | 48/354 [04:21<27:38,  5.42s/it][A

	loss_cls: tensor(0.5270, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0699, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5969, device='cuda:0', grad_fn=<AddBackward0>)



 14%|█▍        | 49/354 [04:27<27:34,  5.42s/it][A

	loss_cls: tensor(0.4954, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5644, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0598, device='cuda:0', grad_fn=<AddBackward0>)



 14%|█▍        | 50/354 [04:32<27:20,  5.40s/it][A

	loss_cls: tensor(0.5884, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1767, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7651, device='cuda:0', grad_fn=<AddBackward0>)



 14%|█▍        | 51/354 [04:37<27:18,  5.41s/it][A

	loss_cls: tensor(0.4320, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1757, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6077, device='cuda:0', grad_fn=<AddBackward0>)



 15%|█▍        | 52/354 [04:43<27:09,  5.40s/it][A

	loss_cls: tensor(0.6439, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0475, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6914, device='cuda:0', grad_fn=<AddBackward0>)



 15%|█▍        | 53/354 [04:48<27:00,  5.38s/it][A

	loss_cls: tensor(0.4535, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5142, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9677, device='cuda:0', grad_fn=<AddBackward0>)



 15%|█▌        | 54/354 [04:54<27:00,  5.40s/it][A

	loss_cls: tensor(0.4670, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2953, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7623, device='cuda:0', grad_fn=<AddBackward0>)



 16%|█▌        | 55/354 [04:59<26:53,  5.40s/it][A

	loss_cls: tensor(0.7418, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0813, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8231, device='cuda:0', grad_fn=<AddBackward0>)



 16%|█▌        | 56/354 [05:04<26:51,  5.41s/it][A

	loss_cls: tensor(0.7689, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2423, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0112, device='cuda:0', grad_fn=<AddBackward0>)



 16%|█▌        | 57/354 [05:10<26:42,  5.40s/it][A

	loss_cls: tensor(0.5964, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4886, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0850, device='cuda:0', grad_fn=<AddBackward0>)



 16%|█▋        | 58/354 [05:15<26:43,  5.42s/it][A

	loss_cls: tensor(0.7319, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2827, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0147, device='cuda:0', grad_fn=<AddBackward0>)



 17%|█▋        | 59/354 [05:21<26:34,  5.40s/it][A

	loss_cls: tensor(0.6033, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0588, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6622, device='cuda:0', grad_fn=<AddBackward0>)



 17%|█▋        | 60/354 [05:26<26:26,  5.40s/it][A

	loss_cls: tensor(0.8422, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2587, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1009, device='cuda:0', grad_fn=<AddBackward0>)



 17%|█▋        | 61/354 [05:31<26:23,  5.41s/it][A

	loss_cls: tensor(0.4767, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2134, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6901, device='cuda:0', grad_fn=<AddBackward0>)



 18%|█▊        | 62/354 [05:37<26:15,  5.39s/it][A

	loss_cls: tensor(0.5376, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1183, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6559, device='cuda:0', grad_fn=<AddBackward0>)



 18%|█▊        | 63/354 [05:42<26:13,  5.41s/it][A

	loss_cls: tensor(0.4834, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1632, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6466, device='cuda:0', grad_fn=<AddBackward0>)



 18%|█▊        | 64/354 [05:48<26:03,  5.39s/it][A

	loss_cls: tensor(0.4540, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2166, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6706, device='cuda:0', grad_fn=<AddBackward0>)



 18%|█▊        | 65/354 [05:53<26:06,  5.42s/it][A

	loss_cls: tensor(0.7793, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2520, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0313, device='cuda:0', grad_fn=<AddBackward0>)



 19%|█▊        | 66/354 [05:58<26:00,  5.42s/it][A

	loss_cls: tensor(0.7287, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5061, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2348, device='cuda:0', grad_fn=<AddBackward0>)



 19%|█▉        | 67/354 [06:04<25:50,  5.40s/it][A

	loss_cls: tensor(0.5709, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4105, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9814, device='cuda:0', grad_fn=<AddBackward0>)



 19%|█▉        | 68/354 [06:09<25:52,  5.43s/it][A

	loss_cls: tensor(0.5187, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2916, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8103, device='cuda:0', grad_fn=<AddBackward0>)



 19%|█▉        | 69/354 [06:15<25:45,  5.42s/it][A

	loss_cls: tensor(0.4359, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1303, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5662, device='cuda:0', grad_fn=<AddBackward0>)



 20%|█▉        | 70/354 [06:20<25:41,  5.43s/it][A

	loss_cls: tensor(0.4901, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1740, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6641, device='cuda:0', grad_fn=<AddBackward0>)



 20%|██        | 71/354 [06:26<25:30,  5.41s/it][A

	loss_cls: tensor(0.7508, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1016, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8525, device='cuda:0', grad_fn=<AddBackward0>)



 20%|██        | 72/354 [06:31<25:34,  5.44s/it][A

	loss_cls: tensor(0.5203, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2131, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7334, device='cuda:0', grad_fn=<AddBackward0>)



 21%|██        | 73/354 [06:37<25:30,  5.45s/it][A

	loss_cls: tensor(0.5760, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1648, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7408, device='cuda:0', grad_fn=<AddBackward0>)



 21%|██        | 74/354 [06:42<25:16,  5.42s/it][A

	loss_cls: tensor(0.3493, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0475, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.3969, device='cuda:0', grad_fn=<AddBackward0>)



 21%|██        | 75/354 [06:47<25:11,  5.42s/it][A

	loss_cls: tensor(0.6638, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1464, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8102, device='cuda:0', grad_fn=<AddBackward0>)



 21%|██▏       | 76/354 [06:53<24:58,  5.39s/it][A

	loss_cls: tensor(0.6288, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2634, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8922, device='cuda:0', grad_fn=<AddBackward0>)



 22%|██▏       | 77/354 [06:58<24:55,  5.40s/it][A

	loss_cls: tensor(0.5861, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1817, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7678, device='cuda:0', grad_fn=<AddBackward0>)



 22%|██▏       | 78/354 [07:03<24:46,  5.39s/it][A

	loss_cls: tensor(0.6671, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0553, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7224, device='cuda:0', grad_fn=<AddBackward0>)



 22%|██▏       | 79/354 [07:09<24:45,  5.40s/it][A

	loss_cls: tensor(0.5951, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3392, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9343, device='cuda:0', grad_fn=<AddBackward0>)



 23%|██▎       | 80/354 [07:14<24:37,  5.39s/it][A

	loss_cls: tensor(0.5584, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3997, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9581, device='cuda:0', grad_fn=<AddBackward0>)



 23%|██▎       | 81/354 [07:20<24:27,  5.38s/it][A

	loss_cls: tensor(0.5058, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2429, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7487, device='cuda:0', grad_fn=<AddBackward0>)



 23%|██▎       | 82/354 [07:25<24:25,  5.39s/it][A

	loss_cls: tensor(0.5463, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3449, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8912, device='cuda:0', grad_fn=<AddBackward0>)



 23%|██▎       | 83/354 [07:30<24:16,  5.37s/it][A

	loss_cls: tensor(0.4958, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4462, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9420, device='cuda:0', grad_fn=<AddBackward0>)



 24%|██▎       | 84/354 [07:36<24:16,  5.40s/it][A

	loss_cls: tensor(0.6483, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3752, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0235, device='cuda:0', grad_fn=<AddBackward0>)



 24%|██▍       | 85/354 [07:41<24:09,  5.39s/it][A

	loss_cls: tensor(0.5227, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1900, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7126, device='cuda:0', grad_fn=<AddBackward0>)



 24%|██▍       | 86/354 [07:47<24:10,  5.41s/it][A

	loss_cls: tensor(0.5276, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1469, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6745, device='cuda:0', grad_fn=<AddBackward0>)



 25%|██▍       | 87/354 [07:52<24:01,  5.40s/it][A

	loss_cls: tensor(0.4731, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.7055, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1786, device='cuda:0', grad_fn=<AddBackward0>)



 25%|██▍       | 88/354 [07:57<23:52,  5.39s/it][A

	loss_cls: tensor(0.4532, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1255, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5787, device='cuda:0', grad_fn=<AddBackward0>)



 25%|██▌       | 89/354 [08:03<23:52,  5.41s/it][A

	loss_cls: tensor(0.5068, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2032, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7100, device='cuda:0', grad_fn=<AddBackward0>)



 25%|██▌       | 90/354 [08:08<23:39,  5.38s/it][A

	loss_cls: tensor(0.3451, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1113, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4563, device='cuda:0', grad_fn=<AddBackward0>)



 26%|██▌       | 91/354 [08:14<23:40,  5.40s/it][A

	loss_cls: tensor(0.4193, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1343, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5536, device='cuda:0', grad_fn=<AddBackward0>)



 26%|██▌       | 92/354 [08:19<23:31,  5.39s/it][A

	loss_cls: tensor(0.5737, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1428, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7165, device='cuda:0', grad_fn=<AddBackward0>)



 26%|██▋       | 93/354 [08:24<23:30,  5.40s/it][A

	loss_cls: tensor(0.5497, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2557, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8054, device='cuda:0', grad_fn=<AddBackward0>)



 27%|██▋       | 94/354 [08:30<23:21,  5.39s/it][A

	loss_cls: tensor(0.4236, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1780, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6016, device='cuda:0', grad_fn=<AddBackward0>)



 27%|██▋       | 95/354 [08:35<23:14,  5.38s/it][A

	loss_cls: tensor(0.5000, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1217, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6216, device='cuda:0', grad_fn=<AddBackward0>)



 27%|██▋       | 96/354 [08:41<23:14,  5.40s/it][A

	loss_cls: tensor(0.4758, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0785, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5543, device='cuda:0', grad_fn=<AddBackward0>)



 27%|██▋       | 97/354 [08:46<23:05,  5.39s/it][A

	loss_cls: tensor(0.5978, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3449, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9427, device='cuda:0', grad_fn=<AddBackward0>)



 28%|██▊       | 98/354 [08:51<23:02,  5.40s/it][A

	loss_cls: tensor(0.6697, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4259, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0956, device='cuda:0', grad_fn=<AddBackward0>)



 28%|██▊       | 99/354 [08:57<22:55,  5.39s/it][A

	loss_cls: tensor(0.5577, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0541, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6118, device='cuda:0', grad_fn=<AddBackward0>)



 28%|██▊       | 100/354 [09:02<22:52,  5.40s/it][A

	loss_cls: tensor(0.5846, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2758, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8604, device='cuda:0', grad_fn=<AddBackward0>)



 29%|██▊       | 101/354 [09:07<22:43,  5.39s/it][A

	loss_cls: tensor(0.3897, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4869, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8766, device='cuda:0', grad_fn=<AddBackward0>)



 29%|██▉       | 102/354 [09:13<22:35,  5.38s/it][A

	loss_cls: tensor(0.7064, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1334, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8399, device='cuda:0', grad_fn=<AddBackward0>)



 29%|██▉       | 103/354 [09:18<22:35,  5.40s/it][A

	loss_cls: tensor(0.4912, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3405, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8317, device='cuda:0', grad_fn=<AddBackward0>)



 29%|██▉       | 104/354 [09:24<22:27,  5.39s/it][A

	loss_cls: tensor(0.6258, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0538, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6797, device='cuda:0', grad_fn=<AddBackward0>)



 30%|██▉       | 105/354 [09:29<22:27,  5.41s/it][A

	loss_cls: tensor(0.4344, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1077, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5421, device='cuda:0', grad_fn=<AddBackward0>)



 30%|██▉       | 106/354 [09:34<22:18,  5.40s/it][A

	loss_cls: tensor(0.5428, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2211, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7638, device='cuda:0', grad_fn=<AddBackward0>)



 30%|███       | 107/354 [09:40<22:16,  5.41s/it][A

	loss_cls: tensor(0.8377, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2094, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0471, device='cuda:0', grad_fn=<AddBackward0>)



 31%|███       | 108/354 [09:45<22:05,  5.39s/it][A

	loss_cls: tensor(0.6305, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2126, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8430, device='cuda:0', grad_fn=<AddBackward0>)



 31%|███       | 109/354 [09:51<21:58,  5.38s/it][A

	loss_cls: tensor(1.1432, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2470, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.3902, device='cuda:0', grad_fn=<AddBackward0>)



 31%|███       | 110/354 [09:56<21:58,  5.40s/it][A

	loss_cls: tensor(0.6271, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2359, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8630, device='cuda:0', grad_fn=<AddBackward0>)



 31%|███▏      | 111/354 [10:01<21:50,  5.39s/it][A

	loss_cls: tensor(0.5643, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0454, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6097, device='cuda:0', grad_fn=<AddBackward0>)



 32%|███▏      | 112/354 [10:07<21:52,  5.42s/it][A

	loss_cls: tensor(0.4218, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0645, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4863, device='cuda:0', grad_fn=<AddBackward0>)



 32%|███▏      | 113/354 [10:12<21:44,  5.41s/it][A

	loss_cls: tensor(0.8519, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1138, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9658, device='cuda:0', grad_fn=<AddBackward0>)



 32%|███▏      | 114/354 [10:18<21:40,  5.42s/it][A

	loss_cls: tensor(0.5722, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2520, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8242, device='cuda:0', grad_fn=<AddBackward0>)



 32%|███▏      | 115/354 [10:23<21:32,  5.41s/it][A

	loss_cls: tensor(0.5350, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1757, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7107, device='cuda:0', grad_fn=<AddBackward0>)



 33%|███▎      | 116/354 [10:28<21:22,  5.39s/it][A

	loss_cls: tensor(0.7289, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0832, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8121, device='cuda:0', grad_fn=<AddBackward0>)



 33%|███▎      | 117/354 [10:34<21:22,  5.41s/it][A

	loss_cls: tensor(0.4695, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1213, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5908, device='cuda:0', grad_fn=<AddBackward0>)



 33%|███▎      | 118/354 [10:39<21:13,  5.39s/it][A

	loss_cls: tensor(0.3378, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1029, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4407, device='cuda:0', grad_fn=<AddBackward0>)



 34%|███▎      | 119/354 [10:45<21:09,  5.40s/it][A

	loss_cls: tensor(0.4238, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1732, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5970, device='cuda:0', grad_fn=<AddBackward0>)



 34%|███▍      | 120/354 [10:50<21:01,  5.39s/it][A

	loss_cls: tensor(0.7134, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2938, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0072, device='cuda:0', grad_fn=<AddBackward0>)



 34%|███▍      | 121/354 [10:56<21:01,  5.41s/it][A

	loss_cls: tensor(0.5352, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1574, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6927, device='cuda:0', grad_fn=<AddBackward0>)



 34%|███▍      | 122/354 [11:01<20:50,  5.39s/it][A

	loss_cls: tensor(0.4251, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0581, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4832, device='cuda:0', grad_fn=<AddBackward0>)



 35%|███▍      | 123/354 [11:06<20:41,  5.37s/it][A

	loss_cls: tensor(0.7580, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1794, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9374, device='cuda:0', grad_fn=<AddBackward0>)



 35%|███▌      | 124/354 [11:12<20:39,  5.39s/it][A

	loss_cls: tensor(0.7389, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3446, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0835, device='cuda:0', grad_fn=<AddBackward0>)



 35%|███▌      | 125/354 [11:17<20:30,  5.38s/it][A

	loss_cls: tensor(0.7706, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1306, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9012, device='cuda:0', grad_fn=<AddBackward0>)



 36%|███▌      | 126/354 [11:22<20:29,  5.39s/it][A

	loss_cls: tensor(0.6167, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3887, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0054, device='cuda:0', grad_fn=<AddBackward0>)



 36%|███▌      | 127/354 [11:28<20:21,  5.38s/it][A

	loss_cls: tensor(0.6672, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1978, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8650, device='cuda:0', grad_fn=<AddBackward0>)



 36%|███▌      | 128/354 [11:33<20:21,  5.40s/it][A

	loss_cls: tensor(0.7051, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1766, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8817, device='cuda:0', grad_fn=<AddBackward0>)



 36%|███▋      | 129/354 [11:39<20:11,  5.38s/it][A

	loss_cls: tensor(0.6372, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1700, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8072, device='cuda:0', grad_fn=<AddBackward0>)



 37%|███▋      | 130/354 [11:44<20:01,  5.37s/it][A

	loss_cls: tensor(0.4957, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1805, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6762, device='cuda:0', grad_fn=<AddBackward0>)



 37%|███▋      | 131/354 [11:49<20:02,  5.39s/it][A

	loss_cls: tensor(0.3561, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1174, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4735, device='cuda:0', grad_fn=<AddBackward0>)



 37%|███▋      | 132/354 [11:55<19:56,  5.39s/it][A

	loss_cls: tensor(0.5743, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2190, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7933, device='cuda:0', grad_fn=<AddBackward0>)



 38%|███▊      | 133/354 [12:00<19:55,  5.41s/it][A

	loss_cls: tensor(0.4470, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2058, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6528, device='cuda:0', grad_fn=<AddBackward0>)



 38%|███▊      | 134/354 [12:06<19:47,  5.40s/it][A

	loss_cls: tensor(0.7186, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5577, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2763, device='cuda:0', grad_fn=<AddBackward0>)



 38%|███▊      | 135/354 [12:11<19:44,  5.41s/it][A

	loss_cls: tensor(0.4913, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2040, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6954, device='cuda:0', grad_fn=<AddBackward0>)



 38%|███▊      | 136/354 [12:16<19:38,  5.41s/it][A

	loss_cls: tensor(1.0025, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2085, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2109, device='cuda:0', grad_fn=<AddBackward0>)



 39%|███▊      | 137/354 [12:22<19:35,  5.42s/it][A

	loss_cls: tensor(0.8228, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2266, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0494, device='cuda:0', grad_fn=<AddBackward0>)



 39%|███▉      | 138/354 [12:27<19:27,  5.40s/it][A

	loss_cls: tensor(0.3892, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2993, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6885, device='cuda:0', grad_fn=<AddBackward0>)



 39%|███▉      | 139/354 [12:33<19:18,  5.39s/it][A

	loss_cls: tensor(0.4936, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4247, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9183, device='cuda:0', grad_fn=<AddBackward0>)



 40%|███▉      | 140/354 [12:38<19:16,  5.41s/it][A

	loss_cls: tensor(0.4397, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2598, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6995, device='cuda:0', grad_fn=<AddBackward0>)



 40%|███▉      | 141/354 [12:43<19:07,  5.39s/it][A

	loss_cls: tensor(0.4178, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1650, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5828, device='cuda:0', grad_fn=<AddBackward0>)



 40%|████      | 142/354 [12:49<19:04,  5.40s/it][A

	loss_cls: tensor(0.4126, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0472, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4598, device='cuda:0', grad_fn=<AddBackward0>)



 40%|████      | 143/354 [12:54<18:53,  5.37s/it][A

	loss_cls: tensor(0.4063, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1199, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5262, device='cuda:0', grad_fn=<AddBackward0>)



 41%|████      | 144/354 [12:59<18:49,  5.38s/it][A

	loss_cls: tensor(0.4482, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0411, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4893, device='cuda:0', grad_fn=<AddBackward0>)



 41%|████      | 145/354 [13:05<18:42,  5.37s/it][A

	loss_cls: tensor(0.6207, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3932, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0139, device='cuda:0', grad_fn=<AddBackward0>)



 41%|████      | 146/354 [13:10<18:36,  5.37s/it][A

	loss_cls: tensor(0.6784, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3152, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9937, device='cuda:0', grad_fn=<AddBackward0>)



 42%|████▏     | 147/354 [13:16<18:33,  5.38s/it][A

	loss_cls: tensor(0.5159, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2059, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7219, device='cuda:0', grad_fn=<AddBackward0>)



 42%|████▏     | 148/354 [13:21<18:26,  5.37s/it][A

	loss_cls: tensor(0.7983, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2439, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0422, device='cuda:0', grad_fn=<AddBackward0>)



 42%|████▏     | 149/354 [13:26<18:21,  5.38s/it][A

	loss_cls: tensor(0.6025, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0889, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6914, device='cuda:0', grad_fn=<AddBackward0>)



 42%|████▏     | 150/354 [13:32<18:15,  5.37s/it][A

	loss_cls: tensor(0.5776, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0798, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6574, device='cuda:0', grad_fn=<AddBackward0>)



 43%|████▎     | 151/354 [13:37<18:25,  5.45s/it][A

	loss_cls: tensor(0.3775, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2079, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5854, device='cuda:0', grad_fn=<AddBackward0>)



 43%|████▎     | 152/354 [13:43<18:17,  5.43s/it][A

	loss_cls: tensor(1.0321, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2211, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2533, device='cuda:0', grad_fn=<AddBackward0>)



 43%|████▎     | 153/354 [13:48<18:07,  5.41s/it][A

	loss_cls: tensor(0.4142, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4926, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9068, device='cuda:0', grad_fn=<AddBackward0>)



 44%|████▎     | 154/354 [13:54<18:04,  5.42s/it][A

	loss_cls: tensor(0.9950, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1678, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1628, device='cuda:0', grad_fn=<AddBackward0>)



 44%|████▍     | 155/354 [13:59<17:55,  5.40s/it][A

	loss_cls: tensor(0.6712, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2750, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9462, device='cuda:0', grad_fn=<AddBackward0>)



 44%|████▍     | 156/354 [14:04<17:52,  5.42s/it][A

	loss_cls: tensor(0.4678, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1243, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5922, device='cuda:0', grad_fn=<AddBackward0>)



 44%|████▍     | 157/354 [14:10<17:43,  5.40s/it][A

	loss_cls: tensor(0.6637, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2250, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8888, device='cuda:0', grad_fn=<AddBackward0>)



 45%|████▍     | 158/354 [14:15<17:44,  5.43s/it][A

	loss_cls: tensor(0.6874, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3272, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0146, device='cuda:0', grad_fn=<AddBackward0>)



 45%|████▍     | 159/354 [14:21<17:37,  5.42s/it][A

	loss_cls: tensor(0.7217, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3567, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0785, device='cuda:0', grad_fn=<AddBackward0>)



 45%|████▌     | 160/354 [14:26<17:29,  5.41s/it][A

	loss_cls: tensor(0.4314, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2740, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7054, device='cuda:0', grad_fn=<AddBackward0>)



 45%|████▌     | 161/354 [14:31<17:25,  5.42s/it][A

	loss_cls: tensor(0.5552, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1773, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7326, device='cuda:0', grad_fn=<AddBackward0>)



 46%|████▌     | 162/354 [14:37<17:20,  5.42s/it][A

	loss_cls: tensor(0.6893, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0883, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7776, device='cuda:0', grad_fn=<AddBackward0>)



 46%|████▌     | 163/354 [14:42<17:16,  5.43s/it][A

	loss_cls: tensor(0.5746, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1984, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7730, device='cuda:0', grad_fn=<AddBackward0>)



 46%|████▋     | 164/354 [14:48<17:09,  5.42s/it][A

	loss_cls: tensor(0.6797, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5214, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2012, device='cuda:0', grad_fn=<AddBackward0>)



 47%|████▋     | 165/354 [14:53<17:07,  5.44s/it][A

	loss_cls: tensor(0.4804, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3181, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7985, device='cuda:0', grad_fn=<AddBackward0>)



 47%|████▋     | 166/354 [14:59<17:00,  5.43s/it][A

	loss_cls: tensor(0.5529, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3054, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8583, device='cuda:0', grad_fn=<AddBackward0>)



 47%|████▋     | 167/354 [15:04<16:52,  5.41s/it][A

	loss_cls: tensor(0.6971, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2417, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9388, device='cuda:0', grad_fn=<AddBackward0>)



 47%|████▋     | 168/354 [15:09<16:49,  5.43s/it][A

	loss_cls: tensor(0.5620, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3788, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9408, device='cuda:0', grad_fn=<AddBackward0>)



 48%|████▊     | 169/354 [15:15<16:40,  5.41s/it][A

	loss_cls: tensor(0.6121, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.6168, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2289, device='cuda:0', grad_fn=<AddBackward0>)



 48%|████▊     | 170/354 [15:20<16:38,  5.43s/it][A

	loss_cls: tensor(0.8425, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1878, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0303, device='cuda:0', grad_fn=<AddBackward0>)



 48%|████▊     | 171/354 [15:26<16:31,  5.42s/it][A

	loss_cls: tensor(0.7837, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2845, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0682, device='cuda:0', grad_fn=<AddBackward0>)



 49%|████▊     | 172/354 [15:31<16:28,  5.43s/it][A

	loss_cls: tensor(0.6083, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2120, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8202, device='cuda:0', grad_fn=<AddBackward0>)



 49%|████▉     | 173/354 [15:36<16:20,  5.42s/it][A

	loss_cls: tensor(0.5709, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1676, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7385, device='cuda:0', grad_fn=<AddBackward0>)



 49%|████▉     | 174/354 [15:42<16:13,  5.41s/it][A

	loss_cls: tensor(0.6213, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1654, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7867, device='cuda:0', grad_fn=<AddBackward0>)



 49%|████▉     | 175/354 [15:47<16:10,  5.42s/it][A

	loss_cls: tensor(0.5023, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3073, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8095, device='cuda:0', grad_fn=<AddBackward0>)



 50%|████▉     | 176/354 [15:53<16:03,  5.41s/it][A

	loss_cls: tensor(0.5613, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0571, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6184, device='cuda:0', grad_fn=<AddBackward0>)



 50%|█████     | 177/354 [15:58<16:01,  5.43s/it][A

	loss_cls: tensor(0.3426, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2206, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5633, device='cuda:0', grad_fn=<AddBackward0>)



 50%|█████     | 178/354 [16:04<15:54,  5.43s/it][A

	loss_cls: tensor(0.7187, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4143, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1330, device='cuda:0', grad_fn=<AddBackward0>)



 51%|█████     | 179/354 [16:09<15:51,  5.43s/it][A

	loss_cls: tensor(0.4553, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2558, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7111, device='cuda:0', grad_fn=<AddBackward0>)



 51%|█████     | 180/354 [16:14<15:43,  5.42s/it][A

	loss_cls: tensor(0.4910, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2605, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7514, device='cuda:0', grad_fn=<AddBackward0>)



 51%|█████     | 181/354 [16:20<15:35,  5.41s/it][A

	loss_cls: tensor(0.5865, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1988, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7853, device='cuda:0', grad_fn=<AddBackward0>)



 51%|█████▏    | 182/354 [16:25<15:33,  5.43s/it][A

	loss_cls: tensor(0.4814, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3922, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8736, device='cuda:0', grad_fn=<AddBackward0>)



 52%|█████▏    | 183/354 [16:31<15:27,  5.43s/it][A

	loss_cls: tensor(1.0201, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0813, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1013, device='cuda:0', grad_fn=<AddBackward0>)



 52%|█████▏    | 184/354 [16:36<15:26,  5.45s/it][A

	loss_cls: tensor(0.5225, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2475, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7700, device='cuda:0', grad_fn=<AddBackward0>)



 52%|█████▏    | 185/354 [16:42<15:18,  5.43s/it][A

	loss_cls: tensor(0.3901, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2740, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6641, device='cuda:0', grad_fn=<AddBackward0>)



 53%|█████▎    | 186/354 [16:47<15:15,  5.45s/it][A

	loss_cls: tensor(0.5270, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2034, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7304, device='cuda:0', grad_fn=<AddBackward0>)



 53%|█████▎    | 187/354 [16:52<15:07,  5.43s/it][A

	loss_cls: tensor(0.5656, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4134, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9790, device='cuda:0', grad_fn=<AddBackward0>)



 53%|█████▎    | 188/354 [16:58<15:00,  5.42s/it][A

	loss_cls: tensor(0.5718, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1757, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7475, device='cuda:0', grad_fn=<AddBackward0>)



 53%|█████▎    | 189/354 [17:03<14:58,  5.44s/it][A

	loss_cls: tensor(0.6479, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2617, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9096, device='cuda:0', grad_fn=<AddBackward0>)



 54%|█████▎    | 190/354 [17:09<14:49,  5.43s/it][A

	loss_cls: tensor(0.5789, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2594, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8383, device='cuda:0', grad_fn=<AddBackward0>)



 54%|█████▍    | 191/354 [17:14<14:48,  5.45s/it][A

	loss_cls: tensor(0.6210, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1384, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7594, device='cuda:0', grad_fn=<AddBackward0>)



 54%|█████▍    | 192/354 [17:20<14:40,  5.44s/it][A

	loss_cls: tensor(0.5046, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1288, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6334, device='cuda:0', grad_fn=<AddBackward0>)



 55%|█████▍    | 193/354 [17:25<14:36,  5.44s/it][A

	loss_cls: tensor(0.5884, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1906, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7790, device='cuda:0', grad_fn=<AddBackward0>)



 55%|█████▍    | 194/354 [17:30<14:27,  5.42s/it][A

	loss_cls: tensor(0.4837, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1649, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6486, device='cuda:0', grad_fn=<AddBackward0>)



 55%|█████▌    | 195/354 [17:36<14:20,  5.41s/it][A

	loss_cls: tensor(0.7024, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1220, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8244, device='cuda:0', grad_fn=<AddBackward0>)



 55%|█████▌    | 196/354 [17:41<14:17,  5.42s/it][A

	loss_cls: tensor(0.4211, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2172, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6383, device='cuda:0', grad_fn=<AddBackward0>)



 56%|█████▌    | 197/354 [17:47<14:09,  5.41s/it][A

	loss_cls: tensor(0.4669, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2580, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7249, device='cuda:0', grad_fn=<AddBackward0>)



 56%|█████▌    | 198/354 [17:52<14:08,  5.44s/it][A

	loss_cls: tensor(0.4448, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1012, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5460, device='cuda:0', grad_fn=<AddBackward0>)



 56%|█████▌    | 199/354 [17:58<14:00,  5.42s/it][A

	loss_cls: tensor(0.5862, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2731, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8593, device='cuda:0', grad_fn=<AddBackward0>)



 56%|█████▋    | 200/354 [18:03<13:56,  5.43s/it][A

	loss_cls: tensor(0.5889, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1775, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7664, device='cuda:0', grad_fn=<AddBackward0>)



 57%|█████▋    | 201/354 [18:08<13:48,  5.42s/it][A

	loss_cls: tensor(0.6122, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3514, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9636, device='cuda:0', grad_fn=<AddBackward0>)



 57%|█████▋    | 202/354 [18:14<13:41,  5.40s/it][A

	loss_cls: tensor(0.5996, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2771, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8767, device='cuda:0', grad_fn=<AddBackward0>)



 57%|█████▋    | 203/354 [18:19<13:38,  5.42s/it][A

	loss_cls: tensor(0.5311, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3656, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8966, device='cuda:0', grad_fn=<AddBackward0>)



 58%|█████▊    | 204/354 [18:25<13:31,  5.41s/it][A

	loss_cls: tensor(0.2674, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0560, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.3233, device='cuda:0', grad_fn=<AddBackward0>)



 58%|█████▊    | 205/354 [18:30<13:28,  5.42s/it][A

	loss_cls: tensor(0.5472, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1110, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6582, device='cuda:0', grad_fn=<AddBackward0>)



 58%|█████▊    | 206/354 [18:36<13:23,  5.43s/it][A

	loss_cls: tensor(0.3954, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1248, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5202, device='cuda:0', grad_fn=<AddBackward0>)



 58%|█████▊    | 207/354 [18:41<13:21,  5.45s/it][A

	loss_cls: tensor(0.4019, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1774, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5793, device='cuda:0', grad_fn=<AddBackward0>)



 59%|█████▉    | 208/354 [18:47<13:17,  5.47s/it][A

	loss_cls: tensor(0.5711, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2423, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8134, device='cuda:0', grad_fn=<AddBackward0>)



 59%|█████▉    | 209/354 [18:52<13:13,  5.47s/it][A

	loss_cls: tensor(0.5101, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3464, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8565, device='cuda:0', grad_fn=<AddBackward0>)



 59%|█████▉    | 210/354 [18:58<13:12,  5.50s/it][A

	loss_cls: tensor(0.6637, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3593, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0230, device='cuda:0', grad_fn=<AddBackward0>)



 60%|█████▉    | 211/354 [19:03<13:06,  5.50s/it][A

	loss_cls: tensor(0.4044, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0496, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4540, device='cuda:0', grad_fn=<AddBackward0>)



 60%|█████▉    | 212/354 [19:09<13:04,  5.52s/it][A

	loss_cls: tensor(0.4010, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2357, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6366, device='cuda:0', grad_fn=<AddBackward0>)



 60%|██████    | 213/354 [19:14<12:57,  5.52s/it][A

	loss_cls: tensor(0.6842, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1369, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8210, device='cuda:0', grad_fn=<AddBackward0>)



 60%|██████    | 214/354 [19:20<12:54,  5.53s/it][A

	loss_cls: tensor(0.4892, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4109, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9001, device='cuda:0', grad_fn=<AddBackward0>)



 61%|██████    | 215/354 [19:25<12:49,  5.54s/it][A

	loss_cls: tensor(0.5068, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4146, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9215, device='cuda:0', grad_fn=<AddBackward0>)



 61%|██████    | 216/354 [19:31<12:40,  5.51s/it][A

	loss_cls: tensor(0.5192, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0658, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5850, device='cuda:0', grad_fn=<AddBackward0>)



 61%|██████▏   | 217/354 [19:36<12:36,  5.52s/it][A

	loss_cls: tensor(0.5477, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2316, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7793, device='cuda:0', grad_fn=<AddBackward0>)



 62%|██████▏   | 218/354 [19:42<12:31,  5.53s/it][A

	loss_cls: tensor(0.6161, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0489, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6650, device='cuda:0', grad_fn=<AddBackward0>)



 62%|██████▏   | 219/354 [19:47<12:27,  5.53s/it][A

	loss_cls: tensor(0.6918, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2417, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9335, device='cuda:0', grad_fn=<AddBackward0>)



 62%|██████▏   | 220/354 [19:53<12:20,  5.53s/it][A

	loss_cls: tensor(0.5228, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1412, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6640, device='cuda:0', grad_fn=<AddBackward0>)



 62%|██████▏   | 221/354 [19:58<12:17,  5.55s/it][A

	loss_cls: tensor(0.6281, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0439, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6720, device='cuda:0', grad_fn=<AddBackward0>)



 63%|██████▎   | 222/354 [20:04<12:11,  5.54s/it][A

	loss_cls: tensor(0.5625, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3387, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9012, device='cuda:0', grad_fn=<AddBackward0>)



 63%|██████▎   | 223/354 [20:10<12:04,  5.53s/it][A

	loss_cls: tensor(0.5668, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1728, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7396, device='cuda:0', grad_fn=<AddBackward0>)



 63%|██████▎   | 224/354 [20:15<12:00,  5.54s/it][A

	loss_cls: tensor(0.7112, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4148, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1261, device='cuda:0', grad_fn=<AddBackward0>)



 64%|██████▎   | 225/354 [20:21<11:54,  5.54s/it][A

	loss_cls: tensor(0.4622, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2622, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7244, device='cuda:0', grad_fn=<AddBackward0>)



 64%|██████▍   | 226/354 [20:26<11:52,  5.56s/it][A

	loss_cls: tensor(0.5178, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0882, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6060, device='cuda:0', grad_fn=<AddBackward0>)



 64%|██████▍   | 227/354 [20:32<11:44,  5.55s/it][A

	loss_cls: tensor(0.3488, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0688, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4176, device='cuda:0', grad_fn=<AddBackward0>)



 64%|██████▍   | 228/354 [20:37<11:40,  5.56s/it][A

	loss_cls: tensor(0.4007, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2866, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6873, device='cuda:0', grad_fn=<AddBackward0>)



 65%|██████▍   | 229/354 [20:43<11:31,  5.54s/it][A

	loss_cls: tensor(0.6939, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3435, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0375, device='cuda:0', grad_fn=<AddBackward0>)



 65%|██████▍   | 230/354 [20:48<11:24,  5.52s/it][A

	loss_cls: tensor(0.5253, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1101, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6354, device='cuda:0', grad_fn=<AddBackward0>)



 65%|██████▌   | 231/354 [20:54<11:21,  5.54s/it][A

	loss_cls: tensor(0.6267, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3678, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9945, device='cuda:0', grad_fn=<AddBackward0>)



 66%|██████▌   | 232/354 [20:59<11:14,  5.53s/it][A

	loss_cls: tensor(0.7105, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4368, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1473, device='cuda:0', grad_fn=<AddBackward0>)



 66%|██████▌   | 233/354 [21:05<11:11,  5.55s/it][A

	loss_cls: tensor(0.4254, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1346, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5601, device='cuda:0', grad_fn=<AddBackward0>)



 66%|██████▌   | 234/354 [21:11<11:05,  5.55s/it][A

	loss_cls: tensor(0.8917, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2691, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1608, device='cuda:0', grad_fn=<AddBackward0>)



 66%|██████▋   | 235/354 [21:16<11:01,  5.56s/it][A

	loss_cls: tensor(0.6424, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1951, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8376, device='cuda:0', grad_fn=<AddBackward0>)



 67%|██████▋   | 236/354 [21:22<10:55,  5.55s/it][A

	loss_cls: tensor(0.6400, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1175, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7575, device='cuda:0', grad_fn=<AddBackward0>)



 67%|██████▋   | 237/354 [21:27<10:50,  5.56s/it][A

	loss_cls: tensor(0.7962, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2009, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9971, device='cuda:0', grad_fn=<AddBackward0>)



 67%|██████▋   | 238/354 [21:33<10:44,  5.56s/it][A

	loss_cls: tensor(0.6210, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2595, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8806, device='cuda:0', grad_fn=<AddBackward0>)



 68%|██████▊   | 239/354 [21:38<10:38,  5.55s/it][A

	loss_cls: tensor(0.9709, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2245, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1954, device='cuda:0', grad_fn=<AddBackward0>)



 68%|██████▊   | 240/354 [21:44<10:34,  5.57s/it][A

	loss_cls: tensor(0.6744, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2524, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9269, device='cuda:0', grad_fn=<AddBackward0>)



 68%|██████▊   | 241/354 [21:49<10:07,  5.37s/it][A

	loss_cls: tensor(0.9107, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2654, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1762, device='cuda:0', grad_fn=<AddBackward0>)



 68%|██████▊   | 242/354 [21:54<09:48,  5.25s/it][A

	loss_cls: tensor(0.5361, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2169, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7530, device='cuda:0', grad_fn=<AddBackward0>)



 69%|██████▊   | 243/354 [21:59<09:32,  5.16s/it][A

	loss_cls: tensor(1.0300, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0996, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1296, device='cuda:0', grad_fn=<AddBackward0>)



 69%|██████▉   | 244/354 [22:04<09:21,  5.10s/it][A

	loss_cls: tensor(0.5282, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1718, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7000, device='cuda:0', grad_fn=<AddBackward0>)



 69%|██████▉   | 245/354 [22:09<09:11,  5.06s/it][A

	loss_cls: tensor(0.9116, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2686, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1801, device='cuda:0', grad_fn=<AddBackward0>)



 69%|██████▉   | 246/354 [22:14<09:02,  5.02s/it][A

	loss_cls: tensor(0.6011, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0472, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6483, device='cuda:0', grad_fn=<AddBackward0>)



 70%|██████▉   | 247/354 [22:19<08:55,  5.00s/it][A

	loss_cls: tensor(0.5487, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1821, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7308, device='cuda:0', grad_fn=<AddBackward0>)



 70%|███████   | 248/354 [22:24<08:47,  4.98s/it][A

	loss_cls: tensor(0.7152, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2975, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0127, device='cuda:0', grad_fn=<AddBackward0>)



 70%|███████   | 249/354 [22:28<08:43,  4.98s/it][A

	loss_cls: tensor(0.3838, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4926, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8764, device='cuda:0', grad_fn=<AddBackward0>)



 71%|███████   | 250/354 [22:33<08:37,  4.97s/it][A

	loss_cls: tensor(0.4112, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1034, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5146, device='cuda:0', grad_fn=<AddBackward0>)



 71%|███████   | 251/354 [22:39<08:46,  5.11s/it][A

	loss_cls: tensor(0.5981, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2190, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8171, device='cuda:0', grad_fn=<AddBackward0>)



 71%|███████   | 252/354 [22:44<08:54,  5.24s/it][A

	loss_cls: tensor(0.6221, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1328, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7549, device='cuda:0', grad_fn=<AddBackward0>)



 71%|███████▏  | 253/354 [22:50<08:56,  5.31s/it][A

	loss_cls: tensor(0.6889, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3823, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0712, device='cuda:0', grad_fn=<AddBackward0>)



 72%|███████▏  | 254/354 [22:55<08:57,  5.38s/it][A

	loss_cls: tensor(0.5436, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0966, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6403, device='cuda:0', grad_fn=<AddBackward0>)



 72%|███████▏  | 255/354 [23:01<08:56,  5.42s/it][A

	loss_cls: tensor(0.5637, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3267, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8904, device='cuda:0', grad_fn=<AddBackward0>)



 72%|███████▏  | 256/354 [23:07<08:56,  5.47s/it][A

	loss_cls: tensor(0.3494, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1786, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5280, device='cuda:0', grad_fn=<AddBackward0>)



 73%|███████▎  | 257/354 [23:12<08:51,  5.48s/it][A

	loss_cls: tensor(0.6770, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2272, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9042, device='cuda:0', grad_fn=<AddBackward0>)



 73%|███████▎  | 258/354 [23:18<08:50,  5.53s/it][A

	loss_cls: tensor(0.7203, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1894, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9097, device='cuda:0', grad_fn=<AddBackward0>)



 73%|███████▎  | 259/354 [23:23<08:45,  5.53s/it][A

	loss_cls: tensor(0.4473, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3731, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8204, device='cuda:0', grad_fn=<AddBackward0>)



 73%|███████▎  | 260/354 [23:29<08:39,  5.53s/it][A

	loss_cls: tensor(0.3352, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0381, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.3733, device='cuda:0', grad_fn=<AddBackward0>)



 74%|███████▎  | 261/354 [23:34<08:36,  5.55s/it][A

	loss_cls: tensor(0.4602, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4447, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9049, device='cuda:0', grad_fn=<AddBackward0>)



 74%|███████▍  | 262/354 [23:40<08:29,  5.54s/it][A

	loss_cls: tensor(0.4751, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1772, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6523, device='cuda:0', grad_fn=<AddBackward0>)



 74%|███████▍  | 263/354 [23:45<08:25,  5.55s/it][A

	loss_cls: tensor(0.5426, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2885, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8311, device='cuda:0', grad_fn=<AddBackward0>)



 75%|███████▍  | 264/354 [23:51<08:18,  5.54s/it][A

	loss_cls: tensor(0.4604, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2341, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6945, device='cuda:0', grad_fn=<AddBackward0>)



 75%|███████▍  | 265/354 [23:57<08:15,  5.57s/it][A

	loss_cls: tensor(0.7840, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5083, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2923, device='cuda:0', grad_fn=<AddBackward0>)



 75%|███████▌  | 266/354 [24:02<08:09,  5.56s/it][A

	loss_cls: tensor(0.3891, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4198, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8089, device='cuda:0', grad_fn=<AddBackward0>)



 75%|███████▌  | 267/354 [24:08<08:02,  5.54s/it][A

	loss_cls: tensor(0.7008, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2909, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9916, device='cuda:0', grad_fn=<AddBackward0>)



 76%|███████▌  | 268/354 [24:13<07:58,  5.57s/it][A

	loss_cls: tensor(1.0140, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1894, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2035, device='cuda:0', grad_fn=<AddBackward0>)



 76%|███████▌  | 269/354 [24:19<07:51,  5.55s/it][A

	loss_cls: tensor(0.5584, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2094, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7678, device='cuda:0', grad_fn=<AddBackward0>)



 76%|███████▋  | 270/354 [24:24<07:47,  5.57s/it][A

	loss_cls: tensor(0.5535, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2319, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7854, device='cuda:0', grad_fn=<AddBackward0>)



 77%|███████▋  | 271/354 [24:30<07:40,  5.54s/it][A

	loss_cls: tensor(0.3298, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3341, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6639, device='cuda:0', grad_fn=<AddBackward0>)



 77%|███████▋  | 272/354 [24:35<07:35,  5.56s/it][A

	loss_cls: tensor(0.5784, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2093, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7876, device='cuda:0', grad_fn=<AddBackward0>)



 77%|███████▋  | 273/354 [24:41<07:28,  5.54s/it][A

	loss_cls: tensor(0.4637, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3059, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7696, device='cuda:0', grad_fn=<AddBackward0>)



 77%|███████▋  | 274/354 [24:46<07:22,  5.53s/it][A

	loss_cls: tensor(0.6816, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1421, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8236, device='cuda:0', grad_fn=<AddBackward0>)



 78%|███████▊  | 275/354 [24:52<07:17,  5.53s/it][A

	loss_cls: tensor(0.5123, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3045, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8168, device='cuda:0', grad_fn=<AddBackward0>)



 78%|███████▊  | 276/354 [24:57<07:10,  5.52s/it][A

	loss_cls: tensor(0.5784, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1367, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7151, device='cuda:0', grad_fn=<AddBackward0>)



 78%|███████▊  | 277/354 [25:03<07:07,  5.55s/it][A

	loss_cls: tensor(0.6660, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1068, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7728, device='cuda:0', grad_fn=<AddBackward0>)



 79%|███████▊  | 278/354 [25:09<07:03,  5.57s/it][A

	loss_cls: tensor(0.8124, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2046, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0169, device='cuda:0', grad_fn=<AddBackward0>)



 79%|███████▉  | 279/354 [25:14<06:58,  5.58s/it][A

	loss_cls: tensor(0.7191, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0754, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7945, device='cuda:0', grad_fn=<AddBackward0>)



 79%|███████▉  | 280/354 [25:20<06:51,  5.56s/it][A

	loss_cls: tensor(0.7000, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1058, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8058, device='cuda:0', grad_fn=<AddBackward0>)



 79%|███████▉  | 281/354 [25:25<06:44,  5.54s/it][A

	loss_cls: tensor(0.5009, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0974, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5983, device='cuda:0', grad_fn=<AddBackward0>)



 80%|███████▉  | 282/354 [25:31<06:40,  5.56s/it][A

	loss_cls: tensor(0.5312, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2082, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7394, device='cuda:0', grad_fn=<AddBackward0>)



 80%|███████▉  | 283/354 [25:36<06:33,  5.54s/it][A

	loss_cls: tensor(0.4322, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2781, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7103, device='cuda:0', grad_fn=<AddBackward0>)



 80%|████████  | 284/354 [25:42<06:29,  5.56s/it][A

	loss_cls: tensor(0.5488, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4033, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9521, device='cuda:0', grad_fn=<AddBackward0>)



 81%|████████  | 285/354 [25:48<06:22,  5.54s/it][A

	loss_cls: tensor(0.5488, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1082, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6570, device='cuda:0', grad_fn=<AddBackward0>)



 81%|████████  | 286/354 [25:53<06:18,  5.56s/it][A

	loss_cls: tensor(0.5639, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1955, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7594, device='cuda:0', grad_fn=<AddBackward0>)



 81%|████████  | 287/354 [25:59<06:11,  5.54s/it][A

	loss_cls: tensor(0.7115, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1853, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8969, device='cuda:0', grad_fn=<AddBackward0>)



 81%|████████▏ | 288/354 [26:04<06:05,  5.53s/it][A

	loss_cls: tensor(0.7748, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4230, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1978, device='cuda:0', grad_fn=<AddBackward0>)



 82%|████████▏ | 289/354 [26:10<06:00,  5.54s/it][A

	loss_cls: tensor(0.4040, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1449, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5489, device='cuda:0', grad_fn=<AddBackward0>)



 82%|████████▏ | 290/354 [26:15<05:54,  5.53s/it][A

	loss_cls: tensor(0.4936, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1680, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6615, device='cuda:0', grad_fn=<AddBackward0>)



 82%|████████▏ | 291/354 [26:21<05:50,  5.57s/it][A

	loss_cls: tensor(0.4296, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1294, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5590, device='cuda:0', grad_fn=<AddBackward0>)



 82%|████████▏ | 292/354 [26:26<05:44,  5.56s/it][A

	loss_cls: tensor(0.4603, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1772, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6374, device='cuda:0', grad_fn=<AddBackward0>)



 83%|████████▎ | 293/354 [26:32<05:39,  5.57s/it][A

	loss_cls: tensor(0.7019, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0471, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7490, device='cuda:0', grad_fn=<AddBackward0>)



 83%|████████▎ | 294/354 [26:37<05:32,  5.55s/it][A

	loss_cls: tensor(0.6533, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2518, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9051, device='cuda:0', grad_fn=<AddBackward0>)



 83%|████████▎ | 295/354 [26:43<05:26,  5.53s/it][A

	loss_cls: tensor(0.3930, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1952, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5883, device='cuda:0', grad_fn=<AddBackward0>)



 84%|████████▎ | 296/354 [26:49<05:22,  5.55s/it][A

	loss_cls: tensor(0.6314, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1253, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7568, device='cuda:0', grad_fn=<AddBackward0>)



 84%|████████▍ | 297/354 [26:54<05:15,  5.54s/it][A

	loss_cls: tensor(0.4848, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0383, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5231, device='cuda:0', grad_fn=<AddBackward0>)



 84%|████████▍ | 298/354 [27:00<05:11,  5.57s/it][A

	loss_cls: tensor(1.0053, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1854, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1908, device='cuda:0', grad_fn=<AddBackward0>)



 84%|████████▍ | 299/354 [27:05<05:04,  5.53s/it][A

	loss_cls: tensor(0.4908, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4467, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9375, device='cuda:0', grad_fn=<AddBackward0>)



 85%|████████▍ | 300/354 [27:11<04:58,  5.53s/it][A

	loss_cls: tensor(0.4852, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2166, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7019, device='cuda:0', grad_fn=<AddBackward0>)



 85%|████████▌ | 301/354 [27:16<04:52,  5.53s/it][A

	loss_cls: tensor(0.5320, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1909, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7229, device='cuda:0', grad_fn=<AddBackward0>)



 85%|████████▌ | 302/354 [27:22<04:47,  5.53s/it][A

	loss_cls: tensor(0.7522, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2753, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0275, device='cuda:0', grad_fn=<AddBackward0>)



 86%|████████▌ | 303/354 [27:27<04:32,  5.35s/it][A

	loss_cls: tensor(0.5512, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1466, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6977, device='cuda:0', grad_fn=<AddBackward0>)



 86%|████████▌ | 304/354 [27:32<04:21,  5.23s/it][A

	loss_cls: tensor(0.4447, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3825, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8272, device='cuda:0', grad_fn=<AddBackward0>)



 86%|████████▌ | 305/354 [27:37<04:20,  5.32s/it][A

	loss_cls: tensor(1.2135, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4031, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.6166, device='cuda:0', grad_fn=<AddBackward0>)



 86%|████████▋ | 306/354 [27:42<04:09,  5.20s/it][A

	loss_cls: tensor(0.4675, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1301, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5976, device='cuda:0', grad_fn=<AddBackward0>)



 87%|████████▋ | 307/354 [27:47<04:01,  5.14s/it][A

	loss_cls: tensor(0.6448, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1412, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7860, device='cuda:0', grad_fn=<AddBackward0>)



 87%|████████▋ | 308/354 [27:53<04:02,  5.28s/it][A

	loss_cls: tensor(0.4437, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2159, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6596, device='cuda:0', grad_fn=<AddBackward0>)



 87%|████████▋ | 309/354 [27:58<04:00,  5.34s/it][A

	loss_cls: tensor(0.4868, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2004, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6871, device='cuda:0', grad_fn=<AddBackward0>)



 88%|████████▊ | 310/354 [28:04<03:58,  5.42s/it][A

	loss_cls: tensor(0.5339, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4140, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9479, device='cuda:0', grad_fn=<AddBackward0>)



 88%|████████▊ | 311/354 [28:09<03:54,  5.45s/it][A

	loss_cls: tensor(0.4979, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3887, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8866, device='cuda:0', grad_fn=<AddBackward0>)



 88%|████████▊ | 312/354 [28:15<03:50,  5.50s/it][A

	loss_cls: tensor(0.7294, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3017, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0312, device='cuda:0', grad_fn=<AddBackward0>)



 88%|████████▊ | 313/354 [28:20<03:45,  5.50s/it][A

	loss_cls: tensor(0.5397, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1149, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6547, device='cuda:0', grad_fn=<AddBackward0>)



 89%|████████▊ | 314/354 [28:26<03:41,  5.53s/it][A

	loss_cls: tensor(0.5855, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1653, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7507, device='cuda:0', grad_fn=<AddBackward0>)



 89%|████████▉ | 315/354 [28:32<03:35,  5.52s/it][A

	loss_cls: tensor(0.7397, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3886, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1283, device='cuda:0', grad_fn=<AddBackward0>)



 89%|████████▉ | 316/354 [28:37<03:30,  5.53s/it][A

	loss_cls: tensor(0.8856, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.6072, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.4928, device='cuda:0', grad_fn=<AddBackward0>)



 90%|████████▉ | 317/354 [28:43<03:24,  5.54s/it][A

	loss_cls: tensor(0.8375, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2245, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0621, device='cuda:0', grad_fn=<AddBackward0>)



 90%|████████▉ | 318/354 [28:48<03:19,  5.55s/it][A

	loss_cls: tensor(0.3655, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3382, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7037, device='cuda:0', grad_fn=<AddBackward0>)



 90%|█████████ | 319/354 [28:54<03:14,  5.56s/it][A

	loss_cls: tensor(0.4020, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2407, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6427, device='cuda:0', grad_fn=<AddBackward0>)



 90%|█████████ | 320/354 [28:59<03:09,  5.57s/it][A

	loss_cls: tensor(0.6826, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1138, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7964, device='cuda:0', grad_fn=<AddBackward0>)



 91%|█████████ | 321/354 [29:05<03:04,  5.58s/it][A

	loss_cls: tensor(0.6786, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2657, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9443, device='cuda:0', grad_fn=<AddBackward0>)



 91%|█████████ | 322/354 [29:10<02:57,  5.55s/it][A

	loss_cls: tensor(0.6896, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1543, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8439, device='cuda:0', grad_fn=<AddBackward0>)



 91%|█████████ | 323/354 [29:16<02:50,  5.51s/it][A

	loss_cls: tensor(0.5241, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1028, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6269, device='cuda:0', grad_fn=<AddBackward0>)



 92%|█████████▏| 324/354 [29:21<02:46,  5.54s/it][A

	loss_cls: tensor(0.5289, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2933, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8222, device='cuda:0', grad_fn=<AddBackward0>)



 92%|█████████▏| 325/354 [29:27<02:40,  5.53s/it][A

	loss_cls: tensor(0.5675, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2301, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7976, device='cuda:0', grad_fn=<AddBackward0>)



 92%|█████████▏| 326/354 [29:33<02:35,  5.56s/it][A

	loss_cls: tensor(0.4901, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0539, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5440, device='cuda:0', grad_fn=<AddBackward0>)



 92%|█████████▏| 327/354 [29:38<02:29,  5.54s/it][A

	loss_cls: tensor(0.5044, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3454, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8498, device='cuda:0', grad_fn=<AddBackward0>)



 93%|█████████▎| 328/354 [29:44<02:24,  5.56s/it][A

	loss_cls: tensor(0.7113, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3795, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0907, device='cuda:0', grad_fn=<AddBackward0>)



 93%|█████████▎| 329/354 [29:49<02:18,  5.54s/it][A

	loss_cls: tensor(0.4793, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3884, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8677, device='cuda:0', grad_fn=<AddBackward0>)



 93%|█████████▎| 330/354 [29:55<02:12,  5.53s/it][A

	loss_cls: tensor(0.5851, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0615, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6466, device='cuda:0', grad_fn=<AddBackward0>)



 94%|█████████▎| 331/354 [30:00<02:07,  5.54s/it][A

	loss_cls: tensor(0.4943, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0555, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5497, device='cuda:0', grad_fn=<AddBackward0>)



 94%|█████████▍| 332/354 [30:06<02:01,  5.54s/it][A

	loss_cls: tensor(0.3345, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0564, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.3909, device='cuda:0', grad_fn=<AddBackward0>)



 94%|█████████▍| 333/354 [30:11<01:56,  5.55s/it][A

	loss_cls: tensor(0.3833, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1848, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5681, device='cuda:0', grad_fn=<AddBackward0>)



 94%|█████████▍| 334/354 [30:17<01:50,  5.54s/it][A

	loss_cls: tensor(0.7808, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3206, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1015, device='cuda:0', grad_fn=<AddBackward0>)



 95%|█████████▍| 335/354 [30:22<01:45,  5.55s/it][A

	loss_cls: tensor(0.3869, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1904, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5772, device='cuda:0', grad_fn=<AddBackward0>)



 95%|█████████▍| 336/354 [30:28<01:39,  5.53s/it][A

	loss_cls: tensor(0.9732, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0782, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0513, device='cuda:0', grad_fn=<AddBackward0>)



 95%|█████████▌| 337/354 [30:34<01:34,  5.54s/it][A

	loss_cls: tensor(0.4856, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1532, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6387, device='cuda:0', grad_fn=<AddBackward0>)



 95%|█████████▌| 338/354 [30:39<01:28,  5.54s/it][A

	loss_cls: tensor(0.6446, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1758, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8204, device='cuda:0', grad_fn=<AddBackward0>)



 96%|█████████▌| 339/354 [30:45<01:22,  5.53s/it][A

	loss_cls: tensor(0.5516, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5122, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0638, device='cuda:0', grad_fn=<AddBackward0>)



 96%|█████████▌| 340/354 [30:50<01:17,  5.55s/it][A

	loss_cls: tensor(0.3994, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1650, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5645, device='cuda:0', grad_fn=<AddBackward0>)



 96%|█████████▋| 341/354 [30:56<01:11,  5.53s/it][A

	loss_cls: tensor(0.4890, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1029, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5919, device='cuda:0', grad_fn=<AddBackward0>)



 97%|█████████▋| 342/354 [31:01<01:06,  5.55s/it][A

	loss_cls: tensor(0.8130, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2805, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0934, device='cuda:0', grad_fn=<AddBackward0>)



 97%|█████████▋| 343/354 [31:07<01:00,  5.53s/it][A

	loss_cls: tensor(0.9550, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3390, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2940, device='cuda:0', grad_fn=<AddBackward0>)



 97%|█████████▋| 344/354 [31:12<00:55,  5.55s/it][A

	loss_cls: tensor(0.4327, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1403, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5729, device='cuda:0', grad_fn=<AddBackward0>)



 97%|█████████▋| 345/354 [31:18<00:49,  5.52s/it][A

	loss_cls: tensor(0.4981, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0504, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5485, device='cuda:0', grad_fn=<AddBackward0>)



 98%|█████████▊| 346/354 [31:23<00:44,  5.50s/it][A

	loss_cls: tensor(0.5295, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1376, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6671, device='cuda:0', grad_fn=<AddBackward0>)



 98%|█████████▊| 347/354 [31:29<00:38,  5.53s/it][A

	loss_cls: tensor(0.6216, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1223, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7439, device='cuda:0', grad_fn=<AddBackward0>)



 98%|█████████▊| 348/354 [31:34<00:33,  5.53s/it][A

	loss_cls: tensor(0.7106, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2899, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0005, device='cuda:0', grad_fn=<AddBackward0>)



 99%|█████████▊| 349/354 [31:40<00:27,  5.54s/it][A

	loss_cls: tensor(0.4852, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3696, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8549, device='cuda:0', grad_fn=<AddBackward0>)



 99%|█████████▉| 350/354 [31:45<00:21,  5.42s/it][A

	loss_cls: tensor(0.6942, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4647, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1590, device='cuda:0', grad_fn=<AddBackward0>)



 99%|█████████▉| 351/354 [31:51<00:16,  5.48s/it][A

	loss_cls: tensor(0.6770, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1432, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8202, device='cuda:0', grad_fn=<AddBackward0>)



 99%|█████████▉| 352/354 [31:56<00:10,  5.49s/it][A

	loss_cls: tensor(0.5504, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1306, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6811, device='cuda:0', grad_fn=<AddBackward0>)



100%|█████████▉| 353/354 [32:02<00:05,  5.49s/it][A

	loss_cls: tensor(0.6353, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1616, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7969, device='cuda:0', grad_fn=<AddBackward0>)



100%|██████████| 354/354 [32:04<00:00,  5.44s/it][A

  0%|          | 0/177 [00:00<?, ?it/s][A

	loss_cls: tensor(0.3969, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0461, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4430, device='cuda:0', grad_fn=<AddBackward0>)

	Training Loss: 0.8092927219672391

	Training cls acc: 0.7013300376647834

	Training cls prec: 0.5794325221232001

	Training cls rec: 0.6179882441005322

	Training cls f1: 0.5474820526936088

--
	Training ner acc: 0.9550985197400338

	Training ner prec: 0.2777221399828292

	Training ner rec: 0.285645373923921

	Training ner f1: 0.2810620769952919

	Current Learning rate:  8.571428571428571e-05



  1%|          | 1/177 [00:00<02:15,  1.30it/s][A
  1%|          | 2/177 [00:01<02:06,  1.38it/s][A
  2%|▏         | 3/177 [00:02<02:08,  1.35it/s][A
  2%|▏         | 4/177 [00:02<02:08,  1.34it/s][A
  3%|▎         | 5/177 [00:03<02:03,  1.39it/s][A
  3%|▎         | 6/177 [00:04<02:04,  1.37it/s][A
  4%|▍         | 7/177 [00:05<02:05,  1.35it/s][A
  5%|▍         | 8/177 [00:05<02:05,  1.35it/s][A
  5%|▌         | 9/177 [00:06<02:01,  1.38it/s][A
  6%|▌         | 10/177 [00:07<02:02,  1.36it/s][A
  6%|▌         | 11/177 [00:08<02:03,  1.35it/s][A
  7%|▋         | 12/177 [00:08<01:58,  1.39it/s][A
  7%|▋         | 13/177 [00:09<01:59,  1.38it/s][A
  8%|▊         | 14/177 [00:10<02:00,  1.36it/s][A
  8%|▊         | 15/177 [00:11<02:00,  1.34it/s][A
  9%|▉         | 16/177 [00:11<01:56,  1.38it/s][A
 10%|▉         | 17/177 [00:12<01:57,  1.36it/s][A
 10%|█         | 18/177 [00:13<01:57,  1.35it/s][A
 11%|█         | 19/177 [00:13<01:57,  1.34it/s][A
 11%|█▏        | 20/


	Validation Loss: 0.7689872727555743

	Validation cls acc: 0.7208097928436912

	Validation cls prec: 0.6047585418348129

	Validation cls rec: 0.5953524347592144

	Validation cls f1: 0.5602055822394806

--
	Validation ner acc: 0.9542322607041435

	Validation ner prec: 0.41232108567706194

	Validation ner rec: 0.42297551789077215

	Validation ner f1: 0.417430184718797



  0%|          | 1/354 [00:05<31:54,  5.42s/it][A

	loss_cls: tensor(0.4859, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2788, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7647, device='cuda:0', grad_fn=<AddBackward0>)



  1%|          | 2/354 [00:10<31:32,  5.38s/it][A

	loss_cls: tensor(0.5120, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2856, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7976, device='cuda:0', grad_fn=<AddBackward0>)



  1%|          | 3/354 [00:16<31:37,  5.40s/it][A

	loss_cls: tensor(0.5726, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1934, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7660, device='cuda:0', grad_fn=<AddBackward0>)



  1%|          | 4/354 [00:21<31:23,  5.38s/it][A

	loss_cls: tensor(0.4719, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0781, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5500, device='cuda:0', grad_fn=<AddBackward0>)



  1%|▏         | 5/354 [00:26<31:12,  5.37s/it][A

	loss_cls: tensor(0.4444, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2233, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6677, device='cuda:0', grad_fn=<AddBackward0>)



  2%|▏         | 6/354 [00:32<31:09,  5.37s/it][A

	loss_cls: tensor(0.4144, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2271, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6415, device='cuda:0', grad_fn=<AddBackward0>)



  2%|▏         | 7/354 [00:37<31:04,  5.37s/it][A

	loss_cls: tensor(0.3876, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0415, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4291, device='cuda:0', grad_fn=<AddBackward0>)



  2%|▏         | 8/354 [00:43<31:08,  5.40s/it][A

	loss_cls: tensor(0.7032, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1496, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8528, device='cuda:0', grad_fn=<AddBackward0>)



  3%|▎         | 9/354 [00:48<30:54,  5.37s/it][A

	loss_cls: tensor(0.4245, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3267, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7512, device='cuda:0', grad_fn=<AddBackward0>)



  3%|▎         | 10/354 [00:53<30:54,  5.39s/it][A

	loss_cls: tensor(0.8629, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1147, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9776, device='cuda:0', grad_fn=<AddBackward0>)



  3%|▎         | 11/354 [00:59<30:41,  5.37s/it][A

	loss_cls: tensor(0.6186, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1521, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7707, device='cuda:0', grad_fn=<AddBackward0>)



  3%|▎         | 12/354 [01:04<30:31,  5.35s/it][A

	loss_cls: tensor(0.6523, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2201, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8724, device='cuda:0', grad_fn=<AddBackward0>)



  4%|▎         | 13/354 [01:09<30:28,  5.36s/it][A

	loss_cls: tensor(0.4912, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0973, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5885, device='cuda:0', grad_fn=<AddBackward0>)



  4%|▍         | 14/354 [01:15<30:28,  5.38s/it][A

	loss_cls: tensor(0.6981, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0774, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7755, device='cuda:0', grad_fn=<AddBackward0>)



  4%|▍         | 15/354 [01:20<30:25,  5.38s/it][A

	loss_cls: tensor(0.6461, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0592, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7053, device='cuda:0', grad_fn=<AddBackward0>)



  5%|▍         | 16/354 [01:26<30:22,  5.39s/it][A

	loss_cls: tensor(0.5105, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0370, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5476, device='cuda:0', grad_fn=<AddBackward0>)



  5%|▍         | 17/354 [01:31<30:19,  5.40s/it][A

	loss_cls: tensor(0.4805, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3189, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7994, device='cuda:0', grad_fn=<AddBackward0>)



  5%|▌         | 18/354 [01:36<30:09,  5.39s/it][A

	loss_cls: tensor(0.5089, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2782, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7870, device='cuda:0', grad_fn=<AddBackward0>)



  5%|▌         | 19/354 [01:42<29:58,  5.37s/it][A

	loss_cls: tensor(0.4514, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0874, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5388, device='cuda:0', grad_fn=<AddBackward0>)



  6%|▌         | 20/354 [01:47<30:00,  5.39s/it][A

	loss_cls: tensor(0.7698, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.7536, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.5234, device='cuda:0', grad_fn=<AddBackward0>)



  6%|▌         | 21/354 [01:52<29:51,  5.38s/it][A

	loss_cls: tensor(0.4256, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2200, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6455, device='cuda:0', grad_fn=<AddBackward0>)



  6%|▌         | 22/354 [01:58<29:52,  5.40s/it][A

	loss_cls: tensor(0.5019, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3063, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8082, device='cuda:0', grad_fn=<AddBackward0>)



  6%|▋         | 23/354 [02:03<29:40,  5.38s/it][A

	loss_cls: tensor(0.5924, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2434, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8358, device='cuda:0', grad_fn=<AddBackward0>)



  7%|▋         | 24/354 [02:09<29:39,  5.39s/it][A

	loss_cls: tensor(0.5720, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1025, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6745, device='cuda:0', grad_fn=<AddBackward0>)



  7%|▋         | 25/354 [02:14<29:26,  5.37s/it][A

	loss_cls: tensor(0.3882, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0708, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4590, device='cuda:0', grad_fn=<AddBackward0>)



  7%|▋         | 26/354 [02:19<29:16,  5.36s/it][A

	loss_cls: tensor(0.4191, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0404, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4595, device='cuda:0', grad_fn=<AddBackward0>)



  8%|▊         | 27/354 [02:25<29:15,  5.37s/it][A

	loss_cls: tensor(0.4372, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0395, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4767, device='cuda:0', grad_fn=<AddBackward0>)



  8%|▊         | 28/354 [02:30<29:07,  5.36s/it][A

	loss_cls: tensor(0.5212, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2871, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8083, device='cuda:0', grad_fn=<AddBackward0>)



  8%|▊         | 29/354 [02:35<29:08,  5.38s/it][A

	loss_cls: tensor(0.6647, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2085, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8732, device='cuda:0', grad_fn=<AddBackward0>)



  8%|▊         | 30/354 [02:41<28:57,  5.36s/it][A

	loss_cls: tensor(0.6287, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0446, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6734, device='cuda:0', grad_fn=<AddBackward0>)



  9%|▉         | 31/354 [02:46<28:55,  5.37s/it][A

	loss_cls: tensor(1.0254, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1822, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2075, device='cuda:0', grad_fn=<AddBackward0>)



  9%|▉         | 32/354 [02:52<28:45,  5.36s/it][A

	loss_cls: tensor(0.9062, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2230, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1292, device='cuda:0', grad_fn=<AddBackward0>)



  9%|▉         | 33/354 [02:57<28:44,  5.37s/it][A

	loss_cls: tensor(0.8795, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3869, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2663, device='cuda:0', grad_fn=<AddBackward0>)



 10%|▉         | 34/354 [03:02<28:38,  5.37s/it][A

	loss_cls: tensor(0.2675, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0343, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.3018, device='cuda:0', grad_fn=<AddBackward0>)



 10%|▉         | 35/354 [03:08<28:28,  5.36s/it][A

	loss_cls: tensor(0.5860, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1753, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7613, device='cuda:0', grad_fn=<AddBackward0>)



 10%|█         | 36/354 [03:13<28:31,  5.38s/it][A

	loss_cls: tensor(0.4445, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1920, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6365, device='cuda:0', grad_fn=<AddBackward0>)



 10%|█         | 37/354 [03:18<28:20,  5.37s/it][A

	loss_cls: tensor(0.4144, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1756, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5899, device='cuda:0', grad_fn=<AddBackward0>)



 11%|█         | 38/354 [03:24<28:28,  5.41s/it][A

	loss_cls: tensor(0.9595, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2324, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1919, device='cuda:0', grad_fn=<AddBackward0>)



 11%|█         | 39/354 [03:29<28:20,  5.40s/it][A

	loss_cls: tensor(0.5969, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3107, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9076, device='cuda:0', grad_fn=<AddBackward0>)



 11%|█▏        | 40/354 [03:35<28:15,  5.40s/it][A

	loss_cls: tensor(0.5007, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1934, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6940, device='cuda:0', grad_fn=<AddBackward0>)



 12%|█▏        | 41/354 [03:40<28:03,  5.38s/it][A

	loss_cls: tensor(1.1805, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1581, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.3385, device='cuda:0', grad_fn=<AddBackward0>)



 12%|█▏        | 42/354 [03:45<27:54,  5.37s/it][A

	loss_cls: tensor(0.4626, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1449, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6074, device='cuda:0', grad_fn=<AddBackward0>)



 12%|█▏        | 43/354 [03:51<27:52,  5.38s/it][A

	loss_cls: tensor(0.3675, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1025, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4700, device='cuda:0', grad_fn=<AddBackward0>)



 12%|█▏        | 44/354 [03:56<27:44,  5.37s/it][A

	loss_cls: tensor(0.4596, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2421, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7017, device='cuda:0', grad_fn=<AddBackward0>)



 13%|█▎        | 45/354 [04:02<27:42,  5.38s/it][A

	loss_cls: tensor(0.6152, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1477, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7629, device='cuda:0', grad_fn=<AddBackward0>)



 13%|█▎        | 46/354 [04:07<27:31,  5.36s/it][A

	loss_cls: tensor(0.6189, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0890, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7079, device='cuda:0', grad_fn=<AddBackward0>)



 13%|█▎        | 47/354 [04:12<27:33,  5.38s/it][A

	loss_cls: tensor(0.6019, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2123, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8143, device='cuda:0', grad_fn=<AddBackward0>)



 14%|█▎        | 48/354 [04:18<27:25,  5.38s/it][A

	loss_cls: tensor(1.0830, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4792, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.5622, device='cuda:0', grad_fn=<AddBackward0>)



 14%|█▍        | 49/354 [04:23<27:17,  5.37s/it][A

	loss_cls: tensor(0.5494, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2695, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8188, device='cuda:0', grad_fn=<AddBackward0>)



 14%|█▍        | 50/354 [04:28<27:18,  5.39s/it][A

	loss_cls: tensor(0.4115, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4783, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8898, device='cuda:0', grad_fn=<AddBackward0>)



 14%|█▍        | 51/354 [04:34<27:05,  5.37s/it][A

	loss_cls: tensor(0.4189, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2703, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6891, device='cuda:0', grad_fn=<AddBackward0>)



 15%|█▍        | 52/354 [04:39<27:04,  5.38s/it][A

	loss_cls: tensor(0.5906, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2349, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8254, device='cuda:0', grad_fn=<AddBackward0>)



 15%|█▍        | 53/354 [04:44<26:52,  5.36s/it][A

	loss_cls: tensor(0.7754, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2605, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0359, device='cuda:0', grad_fn=<AddBackward0>)



 15%|█▌        | 54/354 [04:50<26:53,  5.38s/it][A

	loss_cls: tensor(0.7061, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2495, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9556, device='cuda:0', grad_fn=<AddBackward0>)



 16%|█▌        | 55/354 [04:55<26:46,  5.37s/it][A

	loss_cls: tensor(0.5607, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2902, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8510, device='cuda:0', grad_fn=<AddBackward0>)



 16%|█▌        | 56/354 [05:01<26:39,  5.37s/it][A

	loss_cls: tensor(0.3722, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0689, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4411, device='cuda:0', grad_fn=<AddBackward0>)



 16%|█▌        | 57/354 [05:06<26:36,  5.38s/it][A

	loss_cls: tensor(0.5421, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1914, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7335, device='cuda:0', grad_fn=<AddBackward0>)



 16%|█▋        | 58/354 [05:11<26:27,  5.36s/it][A

	loss_cls: tensor(0.5697, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2678, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8375, device='cuda:0', grad_fn=<AddBackward0>)



 17%|█▋        | 59/354 [05:17<26:23,  5.37s/it][A

	loss_cls: tensor(0.5605, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3476, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9081, device='cuda:0', grad_fn=<AddBackward0>)



 17%|█▋        | 60/354 [05:22<26:16,  5.36s/it][A

	loss_cls: tensor(0.8143, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1896, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0039, device='cuda:0', grad_fn=<AddBackward0>)



 17%|█▋        | 61/354 [05:27<26:14,  5.37s/it][A

	loss_cls: tensor(0.6475, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2354, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8829, device='cuda:0', grad_fn=<AddBackward0>)



 18%|█▊        | 62/354 [05:33<26:15,  5.39s/it][A

	loss_cls: tensor(0.4024, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2087, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6111, device='cuda:0', grad_fn=<AddBackward0>)



 18%|█▊        | 63/354 [05:38<26:06,  5.38s/it][A

	loss_cls: tensor(0.5126, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2799, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7925, device='cuda:0', grad_fn=<AddBackward0>)



 18%|█▊        | 64/354 [05:44<26:03,  5.39s/it][A

	loss_cls: tensor(1.0904, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0765, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1669, device='cuda:0', grad_fn=<AddBackward0>)



 18%|█▊        | 65/354 [05:49<25:51,  5.37s/it][A

	loss_cls: tensor(0.9634, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1016, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0650, device='cuda:0', grad_fn=<AddBackward0>)



 19%|█▊        | 66/354 [05:54<25:50,  5.38s/it][A

	loss_cls: tensor(0.3848, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2237, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6084, device='cuda:0', grad_fn=<AddBackward0>)



 19%|█▉        | 67/354 [06:00<25:41,  5.37s/it][A

	loss_cls: tensor(0.6771, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2792, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9563, device='cuda:0', grad_fn=<AddBackward0>)



 19%|█▉        | 68/354 [06:05<25:41,  5.39s/it][A

	loss_cls: tensor(0.5641, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3152, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8793, device='cuda:0', grad_fn=<AddBackward0>)



 19%|█▉        | 69/354 [06:10<25:30,  5.37s/it][A

	loss_cls: tensor(0.4904, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0833, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5737, device='cuda:0', grad_fn=<AddBackward0>)



 20%|█▉        | 70/354 [06:16<25:23,  5.37s/it][A

	loss_cls: tensor(0.7891, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2000, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9891, device='cuda:0', grad_fn=<AddBackward0>)



 20%|██        | 71/354 [06:21<25:21,  5.38s/it][A

	loss_cls: tensor(0.4983, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1181, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6164, device='cuda:0', grad_fn=<AddBackward0>)



 20%|██        | 72/354 [06:27<25:12,  5.36s/it][A

	loss_cls: tensor(0.3923, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1110, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5033, device='cuda:0', grad_fn=<AddBackward0>)



 21%|██        | 73/354 [06:32<25:10,  5.38s/it][A

	loss_cls: tensor(0.5319, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2603, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7922, device='cuda:0', grad_fn=<AddBackward0>)



 21%|██        | 74/354 [06:37<25:04,  5.37s/it][A

	loss_cls: tensor(0.4951, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2422, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7372, device='cuda:0', grad_fn=<AddBackward0>)



 21%|██        | 75/354 [06:43<25:03,  5.39s/it][A

	loss_cls: tensor(0.6300, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2452, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8753, device='cuda:0', grad_fn=<AddBackward0>)



 21%|██▏       | 76/354 [06:48<24:52,  5.37s/it][A

	loss_cls: tensor(0.6757, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1351, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8108, device='cuda:0', grad_fn=<AddBackward0>)



 22%|██▏       | 77/354 [06:53<24:42,  5.35s/it][A

	loss_cls: tensor(0.6979, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4048, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1028, device='cuda:0', grad_fn=<AddBackward0>)



 22%|██▏       | 78/354 [06:59<24:42,  5.37s/it][A

	loss_cls: tensor(0.5326, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2803, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8129, device='cuda:0', grad_fn=<AddBackward0>)



 22%|██▏       | 79/354 [07:04<24:31,  5.35s/it][A

	loss_cls: tensor(0.4114, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0470, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4584, device='cuda:0', grad_fn=<AddBackward0>)



 23%|██▎       | 80/354 [07:10<24:33,  5.38s/it][A

	loss_cls: tensor(0.3957, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2163, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6119, device='cuda:0', grad_fn=<AddBackward0>)



 23%|██▎       | 81/354 [07:15<24:23,  5.36s/it][A

	loss_cls: tensor(0.6795, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1301, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8097, device='cuda:0', grad_fn=<AddBackward0>)



 23%|██▎       | 82/354 [07:20<24:23,  5.38s/it][A

	loss_cls: tensor(0.6958, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1538, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8496, device='cuda:0', grad_fn=<AddBackward0>)



 23%|██▎       | 83/354 [07:26<24:14,  5.37s/it][A

	loss_cls: tensor(0.4912, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3465, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8377, device='cuda:0', grad_fn=<AddBackward0>)



 24%|██▎       | 84/354 [07:31<24:06,  5.36s/it][A

	loss_cls: tensor(0.6043, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1515, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7558, device='cuda:0', grad_fn=<AddBackward0>)



 24%|██▍       | 85/354 [07:36<24:07,  5.38s/it][A

	loss_cls: tensor(0.4229, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2473, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6702, device='cuda:0', grad_fn=<AddBackward0>)



 24%|██▍       | 86/354 [07:42<24:01,  5.38s/it][A

	loss_cls: tensor(0.7015, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2707, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9722, device='cuda:0', grad_fn=<AddBackward0>)



 25%|██▍       | 87/354 [07:47<24:06,  5.42s/it][A

	loss_cls: tensor(0.8582, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2671, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1253, device='cuda:0', grad_fn=<AddBackward0>)



 25%|██▍       | 88/354 [07:53<23:57,  5.40s/it][A

	loss_cls: tensor(0.4475, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3975, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8450, device='cuda:0', grad_fn=<AddBackward0>)



 25%|██▌       | 89/354 [07:58<23:51,  5.40s/it][A

	loss_cls: tensor(0.4290, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1420, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5709, device='cuda:0', grad_fn=<AddBackward0>)



 25%|██▌       | 90/354 [08:03<23:42,  5.39s/it][A

	loss_cls: tensor(1.2931, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4291, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.7222, device='cuda:0', grad_fn=<AddBackward0>)



 26%|██▌       | 91/354 [08:09<23:31,  5.37s/it][A

	loss_cls: tensor(0.8380, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0998, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9378, device='cuda:0', grad_fn=<AddBackward0>)



 26%|██▌       | 92/354 [08:14<23:29,  5.38s/it][A

	loss_cls: tensor(0.5390, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0466, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5855, device='cuda:0', grad_fn=<AddBackward0>)



 26%|██▋       | 93/354 [08:19<23:19,  5.36s/it][A

	loss_cls: tensor(0.7046, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3187, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0233, device='cuda:0', grad_fn=<AddBackward0>)



 27%|██▋       | 94/354 [08:25<23:22,  5.39s/it][A

	loss_cls: tensor(0.5322, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3121, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8442, device='cuda:0', grad_fn=<AddBackward0>)



 27%|██▋       | 95/354 [08:30<23:11,  5.37s/it][A

	loss_cls: tensor(0.8292, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1927, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0219, device='cuda:0', grad_fn=<AddBackward0>)



 27%|██▋       | 96/354 [08:36<23:10,  5.39s/it][A

	loss_cls: tensor(0.3648, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0703, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4351, device='cuda:0', grad_fn=<AddBackward0>)



 27%|██▋       | 97/354 [08:41<22:59,  5.37s/it][A

	loss_cls: tensor(0.4918, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0641, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5559, device='cuda:0', grad_fn=<AddBackward0>)



 28%|██▊       | 98/354 [08:46<22:51,  5.36s/it][A

	loss_cls: tensor(0.3879, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0892, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4771, device='cuda:0', grad_fn=<AddBackward0>)



 28%|██▊       | 99/354 [08:52<22:49,  5.37s/it][A

	loss_cls: tensor(0.6930, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2995, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9925, device='cuda:0', grad_fn=<AddBackward0>)



 28%|██▊       | 100/354 [08:57<22:42,  5.36s/it][A

	loss_cls: tensor(0.6234, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2713, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8947, device='cuda:0', grad_fn=<AddBackward0>)



 29%|██▊       | 101/354 [09:03<22:41,  5.38s/it][A

	loss_cls: tensor(0.5692, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0489, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6180, device='cuda:0', grad_fn=<AddBackward0>)



 29%|██▉       | 102/354 [09:08<22:32,  5.37s/it][A

	loss_cls: tensor(0.4230, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4967, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9197, device='cuda:0', grad_fn=<AddBackward0>)



 29%|██▉       | 103/354 [09:13<22:31,  5.38s/it][A

	loss_cls: tensor(0.4822, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2024, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6845, device='cuda:0', grad_fn=<AddBackward0>)



 29%|██▉       | 104/354 [09:19<22:22,  5.37s/it][A

	loss_cls: tensor(0.6568, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0982, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7550, device='cuda:0', grad_fn=<AddBackward0>)



 30%|██▉       | 105/354 [09:24<22:14,  5.36s/it][A

	loss_cls: tensor(0.6006, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4273, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0279, device='cuda:0', grad_fn=<AddBackward0>)



 30%|██▉       | 106/354 [09:29<22:12,  5.37s/it][A

	loss_cls: tensor(0.4366, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0439, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4805, device='cuda:0', grad_fn=<AddBackward0>)



 30%|███       | 107/354 [09:35<22:03,  5.36s/it][A

	loss_cls: tensor(0.5751, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0920, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6672, device='cuda:0', grad_fn=<AddBackward0>)



 31%|███       | 108/354 [09:40<22:02,  5.38s/it][A

	loss_cls: tensor(0.8252, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.6268, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.4520, device='cuda:0', grad_fn=<AddBackward0>)



 31%|███       | 109/354 [09:46<22:00,  5.39s/it][A

	loss_cls: tensor(0.5596, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1792, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7387, device='cuda:0', grad_fn=<AddBackward0>)



 31%|███       | 110/354 [09:51<22:02,  5.42s/it][A

	loss_cls: tensor(0.7247, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5029, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2276, device='cuda:0', grad_fn=<AddBackward0>)



 31%|███▏      | 111/354 [09:56<21:48,  5.38s/it][A

	loss_cls: tensor(0.4218, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4444, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8662, device='cuda:0', grad_fn=<AddBackward0>)



 32%|███▏      | 112/354 [10:02<21:40,  5.37s/it][A

	loss_cls: tensor(1.0240, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1542, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1781, device='cuda:0', grad_fn=<AddBackward0>)



 32%|███▏      | 113/354 [10:07<21:35,  5.38s/it][A

	loss_cls: tensor(0.8053, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2259, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0312, device='cuda:0', grad_fn=<AddBackward0>)



 32%|███▏      | 114/354 [10:12<21:29,  5.37s/it][A

	loss_cls: tensor(0.6858, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2674, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9532, device='cuda:0', grad_fn=<AddBackward0>)



 32%|███▏      | 115/354 [10:18<21:27,  5.39s/it][A

	loss_cls: tensor(0.4423, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2866, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7288, device='cuda:0', grad_fn=<AddBackward0>)



 33%|███▎      | 116/354 [10:23<21:21,  5.38s/it][A

	loss_cls: tensor(0.5551, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1290, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6841, device='cuda:0', grad_fn=<AddBackward0>)



 33%|███▎      | 117/354 [10:29<21:18,  5.39s/it][A

	loss_cls: tensor(0.4914, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1185, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6099, device='cuda:0', grad_fn=<AddBackward0>)



 33%|███▎      | 118/354 [10:34<21:10,  5.38s/it][A

	loss_cls: tensor(0.4668, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1506, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6174, device='cuda:0', grad_fn=<AddBackward0>)



 34%|███▎      | 119/354 [10:39<20:58,  5.36s/it][A

	loss_cls: tensor(0.5066, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0457, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5524, device='cuda:0', grad_fn=<AddBackward0>)



 34%|███▍      | 120/354 [10:45<20:55,  5.37s/it][A

	loss_cls: tensor(0.5635, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1453, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7088, device='cuda:0', grad_fn=<AddBackward0>)



 34%|███▍      | 121/354 [10:50<20:48,  5.36s/it][A

	loss_cls: tensor(0.5537, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4777, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0314, device='cuda:0', grad_fn=<AddBackward0>)



 34%|███▍      | 122/354 [10:56<20:58,  5.42s/it][A

	loss_cls: tensor(0.4975, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5269, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0243, device='cuda:0', grad_fn=<AddBackward0>)



 35%|███▍      | 123/354 [11:01<20:57,  5.44s/it][A

	loss_cls: tensor(1.0620, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1634, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2254, device='cuda:0', grad_fn=<AddBackward0>)



 35%|███▌      | 124/354 [11:07<21:01,  5.48s/it][A

	loss_cls: tensor(0.5986, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1184, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7170, device='cuda:0', grad_fn=<AddBackward0>)



 35%|███▌      | 125/354 [11:12<20:57,  5.49s/it][A

	loss_cls: tensor(0.5860, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2527, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8387, device='cuda:0', grad_fn=<AddBackward0>)



 36%|███▌      | 126/354 [11:18<20:51,  5.49s/it][A

	loss_cls: tensor(0.4831, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2309, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7140, device='cuda:0', grad_fn=<AddBackward0>)



 36%|███▌      | 127/354 [11:23<20:52,  5.52s/it][A

	loss_cls: tensor(0.3271, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0418, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.3689, device='cuda:0', grad_fn=<AddBackward0>)



 36%|███▌      | 128/354 [11:29<20:45,  5.51s/it][A

	loss_cls: tensor(0.6100, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1962, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8062, device='cuda:0', grad_fn=<AddBackward0>)



 36%|███▋      | 129/354 [11:34<20:45,  5.53s/it][A

	loss_cls: tensor(0.6419, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1681, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8100, device='cuda:0', grad_fn=<AddBackward0>)



 37%|███▋      | 130/354 [11:40<20:37,  5.52s/it][A

	loss_cls: tensor(0.8317, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2821, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1137, device='cuda:0', grad_fn=<AddBackward0>)



 37%|███▋      | 131/354 [11:45<20:33,  5.53s/it][A

	loss_cls: tensor(0.6942, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1809, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8751, device='cuda:0', grad_fn=<AddBackward0>)



 37%|███▋      | 132/354 [11:51<20:22,  5.51s/it][A

	loss_cls: tensor(0.4895, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0472, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5367, device='cuda:0', grad_fn=<AddBackward0>)



 38%|███▊      | 133/354 [11:56<20:14,  5.49s/it][A

	loss_cls: tensor(0.4480, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0452, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4932, device='cuda:0', grad_fn=<AddBackward0>)



 38%|███▊      | 134/354 [12:02<20:01,  5.46s/it][A

	loss_cls: tensor(0.6115, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2501, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8616, device='cuda:0', grad_fn=<AddBackward0>)



 38%|███▊      | 135/354 [12:07<19:50,  5.43s/it][A

	loss_cls: tensor(0.5473, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3181, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8654, device='cuda:0', grad_fn=<AddBackward0>)



 38%|███▊      | 136/354 [12:12<19:44,  5.43s/it][A

	loss_cls: tensor(0.6227, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1416, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7643, device='cuda:0', grad_fn=<AddBackward0>)



 39%|███▊      | 137/354 [12:18<19:33,  5.41s/it][A

	loss_cls: tensor(0.5863, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1335, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7198, device='cuda:0', grad_fn=<AddBackward0>)



 39%|███▉      | 138/354 [12:23<19:30,  5.42s/it][A

	loss_cls: tensor(0.4477, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1441, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5918, device='cuda:0', grad_fn=<AddBackward0>)



 39%|███▉      | 139/354 [12:29<19:17,  5.39s/it][A

	loss_cls: tensor(0.5970, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1389, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7359, device='cuda:0', grad_fn=<AddBackward0>)



 40%|███▉      | 140/354 [12:34<19:14,  5.40s/it][A

	loss_cls: tensor(0.3975, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1088, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5063, device='cuda:0', grad_fn=<AddBackward0>)



 40%|███▉      | 141/354 [12:39<19:08,  5.39s/it][A

	loss_cls: tensor(0.6179, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1972, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8151, device='cuda:0', grad_fn=<AddBackward0>)



 40%|████      | 142/354 [12:45<19:00,  5.38s/it][A

	loss_cls: tensor(0.5225, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2477, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7702, device='cuda:0', grad_fn=<AddBackward0>)



 40%|████      | 143/354 [12:50<18:58,  5.40s/it][A

	loss_cls: tensor(0.7287, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3541, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0829, device='cuda:0', grad_fn=<AddBackward0>)



 41%|████      | 144/354 [12:55<18:50,  5.38s/it][A

	loss_cls: tensor(0.4079, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2038, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6117, device='cuda:0', grad_fn=<AddBackward0>)



 41%|████      | 145/354 [13:01<18:44,  5.38s/it][A

	loss_cls: tensor(0.7097, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1929, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9026, device='cuda:0', grad_fn=<AddBackward0>)



 41%|████      | 146/354 [13:06<18:39,  5.38s/it][A

	loss_cls: tensor(0.5439, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3306, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8745, device='cuda:0', grad_fn=<AddBackward0>)



 42%|████▏     | 147/354 [13:12<18:36,  5.40s/it][A

	loss_cls: tensor(0.4580, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2040, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6619, device='cuda:0', grad_fn=<AddBackward0>)



 42%|████▏     | 148/354 [13:17<18:30,  5.39s/it][A

	loss_cls: tensor(0.5566, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3222, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8788, device='cuda:0', grad_fn=<AddBackward0>)



 42%|████▏     | 149/354 [13:22<18:21,  5.38s/it][A

	loss_cls: tensor(0.4514, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1521, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6034, device='cuda:0', grad_fn=<AddBackward0>)



 42%|████▏     | 150/354 [13:28<18:19,  5.39s/it][A

	loss_cls: tensor(0.6263, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.6666, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2929, device='cuda:0', grad_fn=<AddBackward0>)



 43%|████▎     | 151/354 [13:33<18:13,  5.39s/it][A

	loss_cls: tensor(0.4551, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0768, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5319, device='cuda:0', grad_fn=<AddBackward0>)



 43%|████▎     | 152/354 [13:39<18:13,  5.42s/it][A

	loss_cls: tensor(0.7466, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1573, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9038, device='cuda:0', grad_fn=<AddBackward0>)



 43%|████▎     | 153/354 [13:44<18:06,  5.41s/it][A

	loss_cls: tensor(0.3807, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0418, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4225, device='cuda:0', grad_fn=<AddBackward0>)



 44%|████▎     | 154/354 [13:50<18:05,  5.43s/it][A

	loss_cls: tensor(0.6911, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2150, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9061, device='cuda:0', grad_fn=<AddBackward0>)



 44%|████▍     | 155/354 [13:55<17:59,  5.42s/it][A

	loss_cls: tensor(0.8825, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1379, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0204, device='cuda:0', grad_fn=<AddBackward0>)



 44%|████▍     | 156/354 [14:00<17:53,  5.42s/it][A

	loss_cls: tensor(0.7332, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1547, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8880, device='cuda:0', grad_fn=<AddBackward0>)



 44%|████▍     | 157/354 [14:06<17:50,  5.43s/it][A

	loss_cls: tensor(0.3375, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1288, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4663, device='cuda:0', grad_fn=<AddBackward0>)



 45%|████▍     | 158/354 [14:11<17:41,  5.42s/it][A

	loss_cls: tensor(0.3487, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1645, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5132, device='cuda:0', grad_fn=<AddBackward0>)



 45%|████▍     | 159/354 [14:17<17:38,  5.43s/it][A

	loss_cls: tensor(0.7034, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2751, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9785, device='cuda:0', grad_fn=<AddBackward0>)



 45%|████▌     | 160/354 [14:22<17:29,  5.41s/it][A

	loss_cls: tensor(0.7322, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4266, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1588, device='cuda:0', grad_fn=<AddBackward0>)



 45%|████▌     | 161/354 [14:28<17:27,  5.43s/it][A

	loss_cls: tensor(0.5556, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2161, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7716, device='cuda:0', grad_fn=<AddBackward0>)



 46%|████▌     | 162/354 [14:33<17:21,  5.42s/it][A

	loss_cls: tensor(0.6238, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3638, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9877, device='cuda:0', grad_fn=<AddBackward0>)



 46%|████▌     | 163/354 [14:38<17:13,  5.41s/it][A

	loss_cls: tensor(0.6729, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1796, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8525, device='cuda:0', grad_fn=<AddBackward0>)



 46%|████▋     | 164/354 [14:44<17:13,  5.44s/it][A

	loss_cls: tensor(0.7856, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1538, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9394, device='cuda:0', grad_fn=<AddBackward0>)



 47%|████▋     | 165/354 [14:49<17:03,  5.42s/it][A

	loss_cls: tensor(0.4523, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0487, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5010, device='cuda:0', grad_fn=<AddBackward0>)



 47%|████▋     | 166/354 [14:55<17:01,  5.43s/it][A

	loss_cls: tensor(0.5783, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1533, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7316, device='cuda:0', grad_fn=<AddBackward0>)



 47%|████▋     | 167/354 [15:00<16:54,  5.43s/it][A

	loss_cls: tensor(0.4156, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0469, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4625, device='cuda:0', grad_fn=<AddBackward0>)



 47%|████▋     | 168/354 [15:06<16:52,  5.44s/it][A

	loss_cls: tensor(0.5257, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4211, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9468, device='cuda:0', grad_fn=<AddBackward0>)



 48%|████▊     | 169/354 [15:11<16:43,  5.42s/it][A

	loss_cls: tensor(0.4228, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2902, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7129, device='cuda:0', grad_fn=<AddBackward0>)



 48%|████▊     | 170/354 [15:16<16:35,  5.41s/it][A

	loss_cls: tensor(0.3457, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3564, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7021, device='cuda:0', grad_fn=<AddBackward0>)



 48%|████▊     | 171/354 [15:22<16:33,  5.43s/it][A

	loss_cls: tensor(0.5992, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4121, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0113, device='cuda:0', grad_fn=<AddBackward0>)



 49%|████▊     | 172/354 [15:27<16:25,  5.41s/it][A

	loss_cls: tensor(0.3966, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2446, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6412, device='cuda:0', grad_fn=<AddBackward0>)



 49%|████▉     | 173/354 [15:33<16:22,  5.43s/it][A

	loss_cls: tensor(0.6004, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1281, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7285, device='cuda:0', grad_fn=<AddBackward0>)



 49%|████▉     | 174/354 [15:38<16:16,  5.42s/it][A

	loss_cls: tensor(0.7222, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1330, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8552, device='cuda:0', grad_fn=<AddBackward0>)



 49%|████▉     | 175/354 [15:43<16:13,  5.44s/it][A

	loss_cls: tensor(0.5750, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3037, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8787, device='cuda:0', grad_fn=<AddBackward0>)



 50%|████▉     | 176/354 [15:49<16:05,  5.42s/it][A

	loss_cls: tensor(0.5397, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2395, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7792, device='cuda:0', grad_fn=<AddBackward0>)



 50%|█████     | 177/354 [15:54<15:57,  5.41s/it][A

	loss_cls: tensor(0.4911, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0480, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5391, device='cuda:0', grad_fn=<AddBackward0>)



 50%|█████     | 178/354 [16:00<15:56,  5.43s/it][A

	loss_cls: tensor(0.5860, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2423, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8283, device='cuda:0', grad_fn=<AddBackward0>)



 51%|█████     | 179/354 [16:05<15:51,  5.44s/it][A

	loss_cls: tensor(0.6089, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0536, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6625, device='cuda:0', grad_fn=<AddBackward0>)



 51%|█████     | 180/354 [16:11<15:56,  5.50s/it][A

	loss_cls: tensor(0.7800, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3654, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1454, device='cuda:0', grad_fn=<AddBackward0>)



 51%|█████     | 181/354 [16:16<15:52,  5.50s/it][A

	loss_cls: tensor(0.7219, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3197, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0416, device='cuda:0', grad_fn=<AddBackward0>)



 51%|█████▏    | 182/354 [16:22<15:52,  5.54s/it][A

	loss_cls: tensor(0.5646, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3707, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9353, device='cuda:0', grad_fn=<AddBackward0>)



 52%|█████▏    | 183/354 [16:27<15:44,  5.52s/it][A

	loss_cls: tensor(0.4905, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2431, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7336, device='cuda:0', grad_fn=<AddBackward0>)



 52%|█████▏    | 184/354 [16:33<15:38,  5.52s/it][A

	loss_cls: tensor(0.6814, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3366, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0179, device='cuda:0', grad_fn=<AddBackward0>)



 52%|█████▏    | 185/354 [16:39<15:35,  5.53s/it][A

	loss_cls: tensor(0.8708, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1966, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0674, device='cuda:0', grad_fn=<AddBackward0>)



 53%|█████▎    | 186/354 [16:43<14:58,  5.35s/it][A

	loss_cls: tensor(0.5321, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1708, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7029, device='cuda:0', grad_fn=<AddBackward0>)



 53%|█████▎    | 187/354 [16:49<14:38,  5.26s/it][A

	loss_cls: tensor(0.6315, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1798, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8112, device='cuda:0', grad_fn=<AddBackward0>)



 53%|█████▎    | 188/354 [16:54<14:47,  5.35s/it][A

	loss_cls: tensor(0.4461, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3241, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7702, device='cuda:0', grad_fn=<AddBackward0>)



 53%|█████▎    | 189/354 [17:00<14:52,  5.41s/it][A

	loss_cls: tensor(0.5110, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2154, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7265, device='cuda:0', grad_fn=<AddBackward0>)



 54%|█████▎    | 190/354 [17:05<14:51,  5.44s/it][A

	loss_cls: tensor(0.3834, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2018, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5852, device='cuda:0', grad_fn=<AddBackward0>)



 54%|█████▍    | 191/354 [17:11<14:46,  5.44s/it][A

	loss_cls: tensor(0.6361, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1781, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8142, device='cuda:0', grad_fn=<AddBackward0>)



 54%|█████▍    | 192/354 [17:16<14:47,  5.48s/it][A

	loss_cls: tensor(0.3381, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2025, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5406, device='cuda:0', grad_fn=<AddBackward0>)



 55%|█████▍    | 193/354 [17:22<14:42,  5.48s/it][A

	loss_cls: tensor(0.6033, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2278, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8311, device='cuda:0', grad_fn=<AddBackward0>)



 55%|█████▍    | 194/354 [17:27<14:43,  5.52s/it][A

	loss_cls: tensor(0.4725, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3225, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7950, device='cuda:0', grad_fn=<AddBackward0>)



 55%|█████▌    | 195/354 [17:33<14:36,  5.51s/it][A

	loss_cls: tensor(0.6308, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1597, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7905, device='cuda:0', grad_fn=<AddBackward0>)



 55%|█████▌    | 196/354 [17:38<14:35,  5.54s/it][A

	loss_cls: tensor(0.4013, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1787, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5800, device='cuda:0', grad_fn=<AddBackward0>)



 56%|█████▌    | 197/354 [17:44<14:26,  5.52s/it][A

	loss_cls: tensor(0.4408, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3667, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8075, device='cuda:0', grad_fn=<AddBackward0>)



 56%|█████▌    | 198/354 [17:49<14:18,  5.50s/it][A

	loss_cls: tensor(0.3592, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1871, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5462, device='cuda:0', grad_fn=<AddBackward0>)



 56%|█████▌    | 199/354 [17:55<14:15,  5.52s/it][A

	loss_cls: tensor(0.4943, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1044, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5987, device='cuda:0', grad_fn=<AddBackward0>)



 56%|█████▋    | 200/354 [18:00<14:10,  5.53s/it][A

	loss_cls: tensor(0.5279, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2644, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7924, device='cuda:0', grad_fn=<AddBackward0>)



 57%|█████▋    | 201/354 [18:06<14:09,  5.55s/it][A

	loss_cls: tensor(0.4979, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2389, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7367, device='cuda:0', grad_fn=<AddBackward0>)



 57%|█████▋    | 202/354 [18:11<14:01,  5.53s/it][A

	loss_cls: tensor(0.6546, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1628, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8174, device='cuda:0', grad_fn=<AddBackward0>)



 57%|█████▋    | 203/354 [18:17<13:51,  5.50s/it][A

	loss_cls: tensor(0.5128, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1694, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6822, device='cuda:0', grad_fn=<AddBackward0>)



 58%|█████▊    | 204/354 [18:23<13:54,  5.56s/it][A

	loss_cls: tensor(0.6901, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3145, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0046, device='cuda:0', grad_fn=<AddBackward0>)



 58%|█████▊    | 205/354 [18:28<13:59,  5.64s/it][A

	loss_cls: tensor(0.4536, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0538, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5074, device='cuda:0', grad_fn=<AddBackward0>)



 58%|█████▊    | 206/354 [18:34<13:59,  5.67s/it][A

	loss_cls: tensor(0.5767, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0523, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6290, device='cuda:0', grad_fn=<AddBackward0>)



 58%|█████▊    | 207/354 [18:40<13:53,  5.67s/it][A

	loss_cls: tensor(0.9694, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3387, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.3081, device='cuda:0', grad_fn=<AddBackward0>)



 59%|█████▉    | 208/354 [18:46<13:58,  5.74s/it][A

	loss_cls: tensor(0.2845, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1679, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4524, device='cuda:0', grad_fn=<AddBackward0>)



 59%|█████▉    | 209/354 [18:51<13:38,  5.64s/it][A

	loss_cls: tensor(0.7355, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2707, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0061, device='cuda:0', grad_fn=<AddBackward0>)



 59%|█████▉    | 210/354 [18:57<13:25,  5.59s/it][A

	loss_cls: tensor(0.7389, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1750, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9139, device='cuda:0', grad_fn=<AddBackward0>)



 60%|█████▉    | 211/354 [19:02<13:11,  5.54s/it][A

	loss_cls: tensor(0.5333, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0427, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5760, device='cuda:0', grad_fn=<AddBackward0>)



 60%|█████▉    | 212/354 [19:07<12:59,  5.49s/it][A

	loss_cls: tensor(0.4460, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1515, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5975, device='cuda:0', grad_fn=<AddBackward0>)



 60%|██████    | 213/354 [19:13<12:54,  5.49s/it][A

	loss_cls: tensor(0.5461, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0399, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5860, device='cuda:0', grad_fn=<AddBackward0>)



 60%|██████    | 214/354 [19:18<12:45,  5.47s/it][A

	loss_cls: tensor(0.4674, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0401, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5075, device='cuda:0', grad_fn=<AddBackward0>)



 61%|██████    | 215/354 [19:24<12:41,  5.48s/it][A

	loss_cls: tensor(0.6016, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1687, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7703, device='cuda:0', grad_fn=<AddBackward0>)



 61%|██████    | 216/354 [19:29<12:33,  5.46s/it][A

	loss_cls: tensor(0.3751, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1552, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5303, device='cuda:0', grad_fn=<AddBackward0>)



 61%|██████▏   | 217/354 [19:35<12:28,  5.47s/it][A

	loss_cls: tensor(0.7468, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1517, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8985, device='cuda:0', grad_fn=<AddBackward0>)



 62%|██████▏   | 218/354 [19:40<12:21,  5.45s/it][A

	loss_cls: tensor(0.5460, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5995, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1455, device='cuda:0', grad_fn=<AddBackward0>)



 62%|██████▏   | 219/354 [19:46<12:12,  5.43s/it][A

	loss_cls: tensor(0.4893, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0681, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5573, device='cuda:0', grad_fn=<AddBackward0>)



 62%|██████▏   | 220/354 [19:51<12:09,  5.45s/it][A

	loss_cls: tensor(0.4683, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1888, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6572, device='cuda:0', grad_fn=<AddBackward0>)



 62%|██████▏   | 221/354 [19:56<12:01,  5.43s/it][A

	loss_cls: tensor(0.5235, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1088, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6323, device='cuda:0', grad_fn=<AddBackward0>)



 63%|██████▎   | 222/354 [20:02<11:59,  5.45s/it][A

	loss_cls: tensor(0.6245, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2398, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8643, device='cuda:0', grad_fn=<AddBackward0>)



 63%|██████▎   | 223/354 [20:07<11:52,  5.44s/it][A

	loss_cls: tensor(0.5145, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3289, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8433, device='cuda:0', grad_fn=<AddBackward0>)



 63%|██████▎   | 224/354 [20:13<11:49,  5.46s/it][A

	loss_cls: tensor(0.8233, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3960, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2193, device='cuda:0', grad_fn=<AddBackward0>)



 64%|██████▎   | 225/354 [20:18<11:43,  5.45s/it][A

	loss_cls: tensor(0.6345, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2056, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8401, device='cuda:0', grad_fn=<AddBackward0>)



 64%|██████▍   | 226/354 [20:24<11:32,  5.41s/it][A

	loss_cls: tensor(0.6228, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4339, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0568, device='cuda:0', grad_fn=<AddBackward0>)



 64%|██████▍   | 227/354 [20:29<11:32,  5.45s/it][A

	loss_cls: tensor(0.5835, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1368, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7202, device='cuda:0', grad_fn=<AddBackward0>)



 64%|██████▍   | 228/354 [20:35<11:27,  5.46s/it][A

	loss_cls: tensor(0.4118, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1570, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5688, device='cuda:0', grad_fn=<AddBackward0>)



 65%|██████▍   | 229/354 [20:40<11:24,  5.48s/it][A

	loss_cls: tensor(0.4449, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5215, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9664, device='cuda:0', grad_fn=<AddBackward0>)



 65%|██████▍   | 230/354 [20:46<11:18,  5.47s/it][A

	loss_cls: tensor(0.5682, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3661, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9343, device='cuda:0', grad_fn=<AddBackward0>)



 65%|██████▌   | 231/354 [20:51<11:14,  5.49s/it][A

	loss_cls: tensor(0.6805, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4451, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1256, device='cuda:0', grad_fn=<AddBackward0>)



 66%|██████▌   | 232/354 [20:57<11:07,  5.48s/it][A

	loss_cls: tensor(0.6715, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0830, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7545, device='cuda:0', grad_fn=<AddBackward0>)



 66%|██████▌   | 233/354 [21:02<11:03,  5.48s/it][A

	loss_cls: tensor(0.6414, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2669, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9083, device='cuda:0', grad_fn=<AddBackward0>)



 66%|██████▌   | 234/354 [21:08<10:57,  5.48s/it][A

	loss_cls: tensor(0.6410, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2932, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9342, device='cuda:0', grad_fn=<AddBackward0>)



 66%|██████▋   | 235/354 [21:13<10:50,  5.46s/it][A

	loss_cls: tensor(0.4604, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2350, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6955, device='cuda:0', grad_fn=<AddBackward0>)



 67%|██████▋   | 236/354 [21:18<10:47,  5.48s/it][A

	loss_cls: tensor(0.4979, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1134, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6113, device='cuda:0', grad_fn=<AddBackward0>)



 67%|██████▋   | 237/354 [21:24<10:40,  5.47s/it][A

	loss_cls: tensor(0.4623, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3373, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7996, device='cuda:0', grad_fn=<AddBackward0>)



 67%|██████▋   | 238/354 [21:29<10:36,  5.48s/it][A

	loss_cls: tensor(0.6770, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5379, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2149, device='cuda:0', grad_fn=<AddBackward0>)



 68%|██████▊   | 239/354 [21:35<10:28,  5.47s/it][A

	loss_cls: tensor(0.5888, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2185, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8072, device='cuda:0', grad_fn=<AddBackward0>)



 68%|██████▊   | 240/354 [21:40<10:25,  5.49s/it][A

	loss_cls: tensor(0.8738, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2323, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1061, device='cuda:0', grad_fn=<AddBackward0>)



 68%|██████▊   | 241/354 [21:46<10:18,  5.48s/it][A

	loss_cls: tensor(0.4586, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0760, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5346, device='cuda:0', grad_fn=<AddBackward0>)



 68%|██████▊   | 242/354 [21:51<10:12,  5.47s/it][A

	loss_cls: tensor(0.4772, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3159, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7931, device='cuda:0', grad_fn=<AddBackward0>)



 69%|██████▊   | 243/354 [21:57<10:08,  5.48s/it][A

	loss_cls: tensor(0.5487, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3649, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9137, device='cuda:0', grad_fn=<AddBackward0>)



 69%|██████▉   | 244/354 [22:02<10:01,  5.47s/it][A

	loss_cls: tensor(0.3433, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1626, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5059, device='cuda:0', grad_fn=<AddBackward0>)



 69%|██████▉   | 245/354 [22:08<09:57,  5.48s/it][A

	loss_cls: tensor(0.5183, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3169, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8352, device='cuda:0', grad_fn=<AddBackward0>)



 69%|██████▉   | 246/354 [22:13<09:50,  5.47s/it][A

	loss_cls: tensor(0.6260, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2228, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8488, device='cuda:0', grad_fn=<AddBackward0>)



 70%|██████▉   | 247/354 [22:19<09:47,  5.49s/it][A

	loss_cls: tensor(0.8050, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2424, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0474, device='cuda:0', grad_fn=<AddBackward0>)



 70%|███████   | 248/354 [22:24<09:41,  5.48s/it][A

	loss_cls: tensor(1.0868, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2959, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.3827, device='cuda:0', grad_fn=<AddBackward0>)



 70%|███████   | 249/354 [22:30<09:32,  5.45s/it][A

	loss_cls: tensor(0.5006, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2129, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7135, device='cuda:0', grad_fn=<AddBackward0>)



 71%|███████   | 250/354 [22:35<09:27,  5.45s/it][A

	loss_cls: tensor(0.5759, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4021, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9780, device='cuda:0', grad_fn=<AddBackward0>)



 71%|███████   | 251/354 [22:40<09:21,  5.45s/it][A

	loss_cls: tensor(0.5789, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1276, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7065, device='cuda:0', grad_fn=<AddBackward0>)



 71%|███████   | 252/354 [22:46<09:18,  5.48s/it][A

	loss_cls: tensor(0.4825, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3220, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8045, device='cuda:0', grad_fn=<AddBackward0>)



 71%|███████▏  | 253/354 [22:51<09:11,  5.46s/it][A

	loss_cls: tensor(0.6895, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2238, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9134, device='cuda:0', grad_fn=<AddBackward0>)



 72%|███████▏  | 254/354 [22:57<09:06,  5.47s/it][A

	loss_cls: tensor(0.6929, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1496, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8425, device='cuda:0', grad_fn=<AddBackward0>)



 72%|███████▏  | 255/354 [23:02<09:00,  5.46s/it][A

	loss_cls: tensor(0.8447, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4576, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.3023, device='cuda:0', grad_fn=<AddBackward0>)



 72%|███████▏  | 256/354 [23:08<08:54,  5.45s/it][A

	loss_cls: tensor(0.6896, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1957, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8853, device='cuda:0', grad_fn=<AddBackward0>)



 73%|███████▎  | 257/354 [23:13<08:50,  5.47s/it][A

	loss_cls: tensor(0.7960, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3114, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1074, device='cuda:0', grad_fn=<AddBackward0>)



 73%|███████▎  | 258/354 [23:19<08:43,  5.45s/it][A

	loss_cls: tensor(0.5604, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1175, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6780, device='cuda:0', grad_fn=<AddBackward0>)



 73%|███████▎  | 259/354 [23:24<08:37,  5.45s/it][A

	loss_cls: tensor(0.6992, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2286, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9278, device='cuda:0', grad_fn=<AddBackward0>)



 73%|███████▎  | 260/354 [23:30<08:31,  5.44s/it][A

	loss_cls: tensor(0.6785, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2200, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8984, device='cuda:0', grad_fn=<AddBackward0>)



 74%|███████▎  | 261/354 [23:35<08:28,  5.47s/it][A

	loss_cls: tensor(0.5225, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2562, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7787, device='cuda:0', grad_fn=<AddBackward0>)



 74%|███████▍  | 262/354 [23:41<08:22,  5.46s/it][A

	loss_cls: tensor(0.4972, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3362, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8334, device='cuda:0', grad_fn=<AddBackward0>)



 74%|███████▍  | 263/354 [23:46<08:17,  5.47s/it][A

	loss_cls: tensor(0.5869, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2623, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8492, device='cuda:0', grad_fn=<AddBackward0>)



 75%|███████▍  | 264/354 [23:52<08:14,  5.49s/it][A

	loss_cls: tensor(0.6271, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2765, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9036, device='cuda:0', grad_fn=<AddBackward0>)



 75%|███████▍  | 265/354 [23:57<08:07,  5.47s/it][A

	loss_cls: tensor(0.7495, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1471, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8966, device='cuda:0', grad_fn=<AddBackward0>)



 75%|███████▌  | 266/354 [24:03<08:02,  5.48s/it][A

	loss_cls: tensor(0.5380, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0518, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5898, device='cuda:0', grad_fn=<AddBackward0>)



 75%|███████▌  | 267/354 [24:08<07:55,  5.46s/it][A

	loss_cls: tensor(0.5182, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2434, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7617, device='cuda:0', grad_fn=<AddBackward0>)



 76%|███████▌  | 268/354 [24:13<07:51,  5.48s/it][A

	loss_cls: tensor(0.5487, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0784, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6272, device='cuda:0', grad_fn=<AddBackward0>)



 76%|███████▌  | 269/354 [24:19<07:44,  5.46s/it][A

	loss_cls: tensor(0.5137, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2115, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7252, device='cuda:0', grad_fn=<AddBackward0>)



 76%|███████▋  | 270/354 [24:24<07:37,  5.45s/it][A

	loss_cls: tensor(0.8840, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1576, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0416, device='cuda:0', grad_fn=<AddBackward0>)



 77%|███████▋  | 271/354 [24:30<07:33,  5.46s/it][A

	loss_cls: tensor(0.6168, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2165, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8333, device='cuda:0', grad_fn=<AddBackward0>)



 77%|███████▋  | 272/354 [24:35<07:26,  5.45s/it][A

	loss_cls: tensor(0.8550, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1534, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0084, device='cuda:0', grad_fn=<AddBackward0>)



 77%|███████▋  | 273/354 [24:41<07:23,  5.47s/it][A

	loss_cls: tensor(0.5425, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1270, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6695, device='cuda:0', grad_fn=<AddBackward0>)



 77%|███████▋  | 274/354 [24:46<07:17,  5.47s/it][A

	loss_cls: tensor(0.4477, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2362, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6839, device='cuda:0', grad_fn=<AddBackward0>)



 78%|███████▊  | 275/354 [24:52<07:12,  5.48s/it][A

	loss_cls: tensor(0.6301, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2454, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8755, device='cuda:0', grad_fn=<AddBackward0>)



 78%|███████▊  | 276/354 [24:57<07:06,  5.47s/it][A

	loss_cls: tensor(0.9133, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3222, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2355, device='cuda:0', grad_fn=<AddBackward0>)



 78%|███████▊  | 277/354 [25:03<07:00,  5.46s/it][A

	loss_cls: tensor(0.5047, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1971, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7018, device='cuda:0', grad_fn=<AddBackward0>)



 79%|███████▊  | 278/354 [25:08<06:55,  5.47s/it][A

	loss_cls: tensor(0.9426, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1197, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0623, device='cuda:0', grad_fn=<AddBackward0>)



 79%|███████▉  | 279/354 [25:13<06:48,  5.45s/it][A

	loss_cls: tensor(0.5756, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1726, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7482, device='cuda:0', grad_fn=<AddBackward0>)



 79%|███████▉  | 280/354 [25:19<06:45,  5.48s/it][A

	loss_cls: tensor(0.4573, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3017, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7590, device='cuda:0', grad_fn=<AddBackward0>)



 79%|███████▉  | 281/354 [25:24<06:38,  5.46s/it][A

	loss_cls: tensor(0.5391, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2397, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7789, device='cuda:0', grad_fn=<AddBackward0>)



 80%|███████▉  | 282/354 [25:30<06:34,  5.48s/it][A

	loss_cls: tensor(0.5423, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1551, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6974, device='cuda:0', grad_fn=<AddBackward0>)



 80%|███████▉  | 283/354 [25:35<06:27,  5.46s/it][A

	loss_cls: tensor(0.6487, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2197, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8684, device='cuda:0', grad_fn=<AddBackward0>)



 80%|████████  | 284/354 [25:41<06:21,  5.45s/it][A

	loss_cls: tensor(0.8027, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2693, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0720, device='cuda:0', grad_fn=<AddBackward0>)



 81%|████████  | 285/354 [25:46<06:16,  5.46s/it][A

	loss_cls: tensor(0.6122, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2090, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8211, device='cuda:0', grad_fn=<AddBackward0>)



 81%|████████  | 286/354 [25:52<06:10,  5.45s/it][A

	loss_cls: tensor(0.6078, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1136, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7214, device='cuda:0', grad_fn=<AddBackward0>)



 81%|████████  | 287/354 [25:57<06:06,  5.47s/it][A

	loss_cls: tensor(0.4210, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.6537, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0746, device='cuda:0', grad_fn=<AddBackward0>)



 81%|████████▏ | 288/354 [26:03<06:00,  5.46s/it][A

	loss_cls: tensor(0.6225, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0742, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6968, device='cuda:0', grad_fn=<AddBackward0>)



 82%|████████▏ | 289/354 [26:08<05:56,  5.48s/it][A

	loss_cls: tensor(0.5704, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3186, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8891, device='cuda:0', grad_fn=<AddBackward0>)



 82%|████████▏ | 290/354 [26:14<05:49,  5.47s/it][A

	loss_cls: tensor(0.6514, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1851, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8365, device='cuda:0', grad_fn=<AddBackward0>)



 82%|████████▏ | 291/354 [26:19<05:43,  5.46s/it][A

	loss_cls: tensor(0.6892, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3951, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0843, device='cuda:0', grad_fn=<AddBackward0>)



 82%|████████▏ | 292/354 [26:25<05:39,  5.47s/it][A

	loss_cls: tensor(0.5222, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1190, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6412, device='cuda:0', grad_fn=<AddBackward0>)



 83%|████████▎ | 293/354 [26:30<05:32,  5.46s/it][A

	loss_cls: tensor(0.5326, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2654, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7980, device='cuda:0', grad_fn=<AddBackward0>)



 83%|████████▎ | 294/354 [26:36<05:28,  5.48s/it][A

	loss_cls: tensor(0.4087, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3221, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7307, device='cuda:0', grad_fn=<AddBackward0>)



 83%|████████▎ | 295/354 [26:41<05:22,  5.46s/it][A

	loss_cls: tensor(0.5452, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2149, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7601, device='cuda:0', grad_fn=<AddBackward0>)



 84%|████████▎ | 296/354 [26:46<05:15,  5.45s/it][A

	loss_cls: tensor(0.4435, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2679, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7114, device='cuda:0', grad_fn=<AddBackward0>)



 84%|████████▍ | 297/354 [26:52<05:10,  5.44s/it][A

	loss_cls: tensor(0.4531, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1520, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6050, device='cuda:0', grad_fn=<AddBackward0>)



 84%|████████▍ | 298/354 [26:57<05:04,  5.44s/it][A

	loss_cls: tensor(0.3981, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3194, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7175, device='cuda:0', grad_fn=<AddBackward0>)



 84%|████████▍ | 299/354 [27:03<05:00,  5.46s/it][A

	loss_cls: tensor(0.5015, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2370, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7385, device='cuda:0', grad_fn=<AddBackward0>)



 85%|████████▍ | 300/354 [27:08<04:54,  5.45s/it][A

	loss_cls: tensor(0.6380, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0476, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6857, device='cuda:0', grad_fn=<AddBackward0>)



 85%|████████▌ | 301/354 [27:14<04:50,  5.48s/it][A

	loss_cls: tensor(0.6417, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0792, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7209, device='cuda:0', grad_fn=<AddBackward0>)



 85%|████████▌ | 302/354 [27:19<04:44,  5.47s/it][A

	loss_cls: tensor(0.6666, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1262, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7927, device='cuda:0', grad_fn=<AddBackward0>)



 86%|████████▌ | 303/354 [27:25<04:39,  5.48s/it][A

	loss_cls: tensor(0.4167, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4512, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8679, device='cuda:0', grad_fn=<AddBackward0>)



 86%|████████▌ | 304/354 [27:30<04:33,  5.46s/it][A

	loss_cls: tensor(0.4476, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3197, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7673, device='cuda:0', grad_fn=<AddBackward0>)



 86%|████████▌ | 305/354 [27:36<04:26,  5.45s/it][A

	loss_cls: tensor(0.5425, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2520, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7945, device='cuda:0', grad_fn=<AddBackward0>)



 86%|████████▋ | 306/354 [27:41<04:22,  5.47s/it][A

	loss_cls: tensor(0.4943, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1351, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6294, device='cuda:0', grad_fn=<AddBackward0>)



 87%|████████▋ | 307/354 [27:46<04:16,  5.46s/it][A

	loss_cls: tensor(0.6000, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0839, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6838, device='cuda:0', grad_fn=<AddBackward0>)



 87%|████████▋ | 308/354 [27:52<04:12,  5.48s/it][A

	loss_cls: tensor(0.3799, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1194, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4993, device='cuda:0', grad_fn=<AddBackward0>)



 87%|████████▋ | 309/354 [27:57<04:05,  5.47s/it][A

	loss_cls: tensor(0.3493, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2746, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6239, device='cuda:0', grad_fn=<AddBackward0>)



 88%|████████▊ | 310/354 [28:03<04:01,  5.48s/it][A

	loss_cls: tensor(0.4142, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2810, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6952, device='cuda:0', grad_fn=<AddBackward0>)



 88%|████████▊ | 311/354 [28:08<03:54,  5.46s/it][A

	loss_cls: tensor(0.6423, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2933, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9356, device='cuda:0', grad_fn=<AddBackward0>)



 88%|████████▊ | 312/354 [28:14<03:48,  5.45s/it][A

	loss_cls: tensor(0.6897, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1517, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8414, device='cuda:0', grad_fn=<AddBackward0>)



 88%|████████▊ | 313/354 [28:19<03:43,  5.46s/it][A

	loss_cls: tensor(0.4778, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5378, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0156, device='cuda:0', grad_fn=<AddBackward0>)



 89%|████████▊ | 314/354 [28:25<03:38,  5.46s/it][A

	loss_cls: tensor(0.6516, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3137, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9653, device='cuda:0', grad_fn=<AddBackward0>)



 89%|████████▉ | 315/354 [28:30<03:33,  5.47s/it][A

	loss_cls: tensor(0.7686, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1441, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9127, device='cuda:0', grad_fn=<AddBackward0>)



 89%|████████▉ | 316/354 [28:36<03:27,  5.46s/it][A

	loss_cls: tensor(0.7483, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2103, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9586, device='cuda:0', grad_fn=<AddBackward0>)



 90%|████████▉ | 317/354 [28:41<03:22,  5.47s/it][A

	loss_cls: tensor(0.6171, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2717, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8888, device='cuda:0', grad_fn=<AddBackward0>)



 90%|████████▉ | 318/354 [28:47<03:16,  5.46s/it][A

	loss_cls: tensor(0.4135, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3399, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7534, device='cuda:0', grad_fn=<AddBackward0>)



 90%|█████████ | 319/354 [28:52<03:10,  5.43s/it][A

	loss_cls: tensor(0.7809, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1959, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9768, device='cuda:0', grad_fn=<AddBackward0>)



 90%|█████████ | 320/354 [28:58<03:05,  5.47s/it][A

	loss_cls: tensor(0.7590, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2951, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0541, device='cuda:0', grad_fn=<AddBackward0>)



 91%|█████████ | 321/354 [29:03<03:00,  5.46s/it][A

	loss_cls: tensor(0.6616, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0914, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7529, device='cuda:0', grad_fn=<AddBackward0>)



 91%|█████████ | 322/354 [29:08<02:55,  5.48s/it][A

	loss_cls: tensor(0.5095, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3319, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8414, device='cuda:0', grad_fn=<AddBackward0>)



 91%|█████████ | 323/354 [29:14<02:49,  5.47s/it][A

	loss_cls: tensor(0.5904, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2364, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8268, device='cuda:0', grad_fn=<AddBackward0>)



 92%|█████████▏| 324/354 [29:19<02:44,  5.48s/it][A

	loss_cls: tensor(0.5980, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2231, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8211, device='cuda:0', grad_fn=<AddBackward0>)



 92%|█████████▏| 325/354 [29:25<02:38,  5.47s/it][A

	loss_cls: tensor(0.6080, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2347, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8426, device='cuda:0', grad_fn=<AddBackward0>)



 92%|█████████▏| 326/354 [29:30<02:32,  5.46s/it][A

	loss_cls: tensor(0.7965, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5710, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.3675, device='cuda:0', grad_fn=<AddBackward0>)



 92%|█████████▏| 327/354 [29:36<02:27,  5.48s/it][A

	loss_cls: tensor(0.3681, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2786, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6467, device='cuda:0', grad_fn=<AddBackward0>)



 93%|█████████▎| 328/354 [29:41<02:22,  5.46s/it][A

	loss_cls: tensor(0.4802, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3371, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8173, device='cuda:0', grad_fn=<AddBackward0>)



 93%|█████████▎| 329/354 [29:47<02:17,  5.49s/it][A

	loss_cls: tensor(0.5459, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1516, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6975, device='cuda:0', grad_fn=<AddBackward0>)



 93%|█████████▎| 330/354 [29:52<02:11,  5.47s/it][A

	loss_cls: tensor(0.5934, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1603, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7537, device='cuda:0', grad_fn=<AddBackward0>)



 94%|█████████▎| 331/354 [29:58<02:05,  5.48s/it][A

	loss_cls: tensor(0.4504, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3463, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7967, device='cuda:0', grad_fn=<AddBackward0>)



 94%|█████████▍| 332/354 [30:03<01:59,  5.45s/it][A

	loss_cls: tensor(0.3781, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0760, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4541, device='cuda:0', grad_fn=<AddBackward0>)



 94%|█████████▍| 333/354 [30:09<01:54,  5.46s/it][A

	loss_cls: tensor(0.4072, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1968, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6040, device='cuda:0', grad_fn=<AddBackward0>)



 94%|█████████▍| 334/354 [30:14<01:49,  5.46s/it][A

	loss_cls: tensor(0.4233, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1080, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5313, device='cuda:0', grad_fn=<AddBackward0>)



 95%|█████████▍| 335/354 [30:19<01:43,  5.44s/it][A

	loss_cls: tensor(0.4415, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2665, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7080, device='cuda:0', grad_fn=<AddBackward0>)



 95%|█████████▍| 336/354 [30:25<01:38,  5.47s/it][A

	loss_cls: tensor(0.5288, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1501, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6789, device='cuda:0', grad_fn=<AddBackward0>)



 95%|█████████▌| 337/354 [30:30<01:32,  5.45s/it][A

	loss_cls: tensor(0.7446, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4075, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1522, device='cuda:0', grad_fn=<AddBackward0>)



 95%|█████████▌| 338/354 [30:36<01:27,  5.47s/it][A

	loss_cls: tensor(0.4451, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1806, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6257, device='cuda:0', grad_fn=<AddBackward0>)



 96%|█████████▌| 339/354 [30:41<01:21,  5.45s/it][A

	loss_cls: tensor(0.4554, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1482, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6036, device='cuda:0', grad_fn=<AddBackward0>)



 96%|█████████▌| 340/354 [30:47<01:16,  5.47s/it][A

	loss_cls: tensor(0.5939, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1693, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7632, device='cuda:0', grad_fn=<AddBackward0>)



 96%|█████████▋| 341/354 [30:52<01:10,  5.46s/it][A

	loss_cls: tensor(0.9460, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1668, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1127, device='cuda:0', grad_fn=<AddBackward0>)



 97%|█████████▋| 342/354 [30:58<01:05,  5.44s/it][A

	loss_cls: tensor(0.6367, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3340, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9707, device='cuda:0', grad_fn=<AddBackward0>)



 97%|█████████▋| 343/354 [31:03<00:59,  5.43s/it][A

	loss_cls: tensor(0.5887, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5142, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1030, device='cuda:0', grad_fn=<AddBackward0>)



 97%|█████████▋| 344/354 [31:09<00:54,  5.43s/it][A

	loss_cls: tensor(0.7286, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1740, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9026, device='cuda:0', grad_fn=<AddBackward0>)



 97%|█████████▋| 345/354 [31:14<00:49,  5.45s/it][A

	loss_cls: tensor(0.4656, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1148, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5804, device='cuda:0', grad_fn=<AddBackward0>)



 98%|█████████▊| 346/354 [31:19<00:43,  5.44s/it][A

	loss_cls: tensor(0.6982, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1001, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7983, device='cuda:0', grad_fn=<AddBackward0>)



 98%|█████████▊| 347/354 [31:25<00:38,  5.46s/it][A

	loss_cls: tensor(0.5411, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2783, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8194, device='cuda:0', grad_fn=<AddBackward0>)



 98%|█████████▊| 348/354 [31:30<00:32,  5.46s/it][A

	loss_cls: tensor(0.7836, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1400, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9236, device='cuda:0', grad_fn=<AddBackward0>)



 99%|█████████▊| 349/354 [31:36<00:27,  5.46s/it][A

	loss_cls: tensor(0.5305, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1024, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6329, device='cuda:0', grad_fn=<AddBackward0>)



 99%|█████████▉| 350/354 [31:41<00:21,  5.48s/it][A

	loss_cls: tensor(1.0357, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2273, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2630, device='cuda:0', grad_fn=<AddBackward0>)



 99%|█████████▉| 351/354 [31:47<00:16,  5.46s/it][A

	loss_cls: tensor(0.4712, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4160, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8872, device='cuda:0', grad_fn=<AddBackward0>)



 99%|█████████▉| 352/354 [31:52<00:10,  5.47s/it][A

	loss_cls: tensor(0.5609, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1664, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7273, device='cuda:0', grad_fn=<AddBackward0>)



100%|█████████▉| 353/354 [31:58<00:05,  5.45s/it][A

	loss_cls: tensor(0.9330, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1552, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0882, device='cuda:0', grad_fn=<AddBackward0>)



100%|██████████| 354/354 [32:00<00:00,  5.42s/it][A

  0%|          | 0/177 [00:00<?, ?it/s][A

	loss_cls: tensor(0.2663, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1273, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.3935, device='cuda:0', grad_fn=<AddBackward0>)

	Training Loss: 0.8086691361531025

	Training cls acc: 0.7105108286252354

	Training cls prec: 0.5793150783616885

	Training cls rec: 0.6216456213278247

	Training cls f1: 0.5512535932870295

--
	Training ner acc: 0.9553852802343535

	Training ner prec: 0.27717740692571896

	Training ner rec: 0.2844938620587641

	Training ner f1: 0.28020301670681813

	Current Learning rate:  5.714285714285714e-05



  1%|          | 1/177 [00:00<02:08,  1.37it/s][A
  1%|          | 2/177 [00:01<02:10,  1.34it/s][A
  2%|▏         | 3/177 [00:02<02:10,  1.33it/s][A
  2%|▏         | 4/177 [00:02<02:03,  1.40it/s][A
  3%|▎         | 5/177 [00:03<02:04,  1.38it/s][A
  3%|▎         | 6/177 [00:04<02:05,  1.36it/s][A
  4%|▍         | 7/177 [00:05<02:05,  1.35it/s][A
  5%|▍         | 8/177 [00:05<02:00,  1.40it/s][A
  5%|▌         | 9/177 [00:06<02:02,  1.37it/s][A
  6%|▌         | 10/177 [00:07<02:02,  1.36it/s][A
  6%|▌         | 11/177 [00:07<01:58,  1.40it/s][A
  7%|▋         | 12/177 [00:08<01:59,  1.38it/s][A
  7%|▋         | 13/177 [00:09<01:59,  1.37it/s][A
  8%|▊         | 14/177 [00:10<02:00,  1.35it/s][A
  8%|▊         | 15/177 [00:10<01:57,  1.37it/s][A
  9%|▉         | 16/177 [00:11<01:57,  1.37it/s][A
 10%|▉         | 17/177 [00:12<01:58,  1.36it/s][A
 10%|█         | 18/177 [00:13<01:58,  1.34it/s][A
 11%|█         | 19/177 [00:13<01:54,  1.38it/s][A
 11%|█▏        | 20/


	Validation Loss: 0.7767422110010676

	Validation cls acc: 0.714924670433145

	Validation cls prec: 0.6138182674199624

	Validation cls rec: 0.6110236750067258

	Validation cls f1: 0.5662671760129387

--
	Validation ner acc: 0.9537176374651111

	Validation ner prec: 0.3925570162682131

	Validation ner rec: 0.40320150659133713

	Validation ner f1: 0.39767172826932606



  0%|          | 1/354 [00:05<31:44,  5.39s/it][A

	loss_cls: tensor(0.5799, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2465, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8264, device='cuda:0', grad_fn=<AddBackward0>)



  1%|          | 2/354 [00:10<31:57,  5.45s/it][A

	loss_cls: tensor(0.4831, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0584, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5415, device='cuda:0', grad_fn=<AddBackward0>)



  1%|          | 3/354 [00:16<31:47,  5.44s/it][A

	loss_cls: tensor(0.4512, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0404, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4916, device='cuda:0', grad_fn=<AddBackward0>)



  1%|          | 4/354 [00:21<31:54,  5.47s/it][A

	loss_cls: tensor(0.4981, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2215, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7196, device='cuda:0', grad_fn=<AddBackward0>)



  1%|▏         | 5/354 [00:27<31:42,  5.45s/it][A

	loss_cls: tensor(0.8795, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2153, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0948, device='cuda:0', grad_fn=<AddBackward0>)



  2%|▏         | 6/354 [00:32<31:44,  5.47s/it][A

	loss_cls: tensor(0.5391, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3115, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8506, device='cuda:0', grad_fn=<AddBackward0>)



  2%|▏         | 7/354 [00:38<31:31,  5.45s/it][A

	loss_cls: tensor(0.6114, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0631, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6745, device='cuda:0', grad_fn=<AddBackward0>)



  2%|▏         | 8/354 [00:43<31:20,  5.44s/it][A

	loss_cls: tensor(0.5313, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1376, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6689, device='cuda:0', grad_fn=<AddBackward0>)



  3%|▎         | 9/354 [00:49<31:20,  5.45s/it][A

	loss_cls: tensor(0.6138, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1836, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7973, device='cuda:0', grad_fn=<AddBackward0>)



  3%|▎         | 10/354 [00:54<31:12,  5.44s/it][A

	loss_cls: tensor(0.4206, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2934, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7140, device='cuda:0', grad_fn=<AddBackward0>)



  3%|▎         | 11/354 [00:59<31:13,  5.46s/it][A

	loss_cls: tensor(0.4879, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2701, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7580, device='cuda:0', grad_fn=<AddBackward0>)



  3%|▎         | 12/354 [01:05<30:52,  5.42s/it][A

	loss_cls: tensor(0.5528, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1164, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6693, device='cuda:0', grad_fn=<AddBackward0>)



  4%|▎         | 13/354 [01:10<30:57,  5.45s/it][A

	loss_cls: tensor(0.8395, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1350, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9745, device='cuda:0', grad_fn=<AddBackward0>)



  4%|▍         | 14/354 [01:16<30:49,  5.44s/it][A

	loss_cls: tensor(0.3341, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2247, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5588, device='cuda:0', grad_fn=<AddBackward0>)



  4%|▍         | 15/354 [01:21<30:39,  5.42s/it][A

	loss_cls: tensor(1.5387, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1691, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.7078, device='cuda:0', grad_fn=<AddBackward0>)



  5%|▍         | 16/354 [01:27<30:42,  5.45s/it][A

	loss_cls: tensor(0.6132, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1837, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7969, device='cuda:0', grad_fn=<AddBackward0>)



  5%|▍         | 17/354 [01:32<30:34,  5.44s/it][A

	loss_cls: tensor(0.5374, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5738, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1112, device='cuda:0', grad_fn=<AddBackward0>)



  5%|▌         | 18/354 [01:38<30:37,  5.47s/it][A

	loss_cls: tensor(0.4887, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2274, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7161, device='cuda:0', grad_fn=<AddBackward0>)



  5%|▌         | 19/354 [01:43<30:23,  5.44s/it][A

	loss_cls: tensor(0.6616, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4430, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1046, device='cuda:0', grad_fn=<AddBackward0>)



  6%|▌         | 20/354 [01:48<30:24,  5.46s/it][A

	loss_cls: tensor(0.5686, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1211, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6896, device='cuda:0', grad_fn=<AddBackward0>)



  6%|▌         | 21/354 [01:54<30:10,  5.44s/it][A

	loss_cls: tensor(0.5515, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1055, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6570, device='cuda:0', grad_fn=<AddBackward0>)



  6%|▌         | 22/354 [01:59<30:08,  5.45s/it][A

	loss_cls: tensor(0.5395, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1277, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6672, device='cuda:0', grad_fn=<AddBackward0>)



  6%|▋         | 23/354 [02:05<30:01,  5.44s/it][A

	loss_cls: tensor(0.5993, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4362, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0354, device='cuda:0', grad_fn=<AddBackward0>)



  7%|▋         | 24/354 [02:10<29:54,  5.44s/it][A

	loss_cls: tensor(0.6525, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1763, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8288, device='cuda:0', grad_fn=<AddBackward0>)



  7%|▋         | 25/354 [02:16<29:55,  5.46s/it][A

	loss_cls: tensor(0.7976, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2995, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0971, device='cuda:0', grad_fn=<AddBackward0>)



  7%|▋         | 26/354 [02:21<29:45,  5.44s/it][A

	loss_cls: tensor(0.3681, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0661, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4342, device='cuda:0', grad_fn=<AddBackward0>)



  8%|▊         | 27/354 [02:27<29:43,  5.45s/it][A

	loss_cls: tensor(0.5552, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1792, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7344, device='cuda:0', grad_fn=<AddBackward0>)



  8%|▊         | 28/354 [02:32<29:34,  5.44s/it][A

	loss_cls: tensor(0.5444, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1156, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6599, device='cuda:0', grad_fn=<AddBackward0>)



  8%|▊         | 29/354 [02:37<29:33,  5.46s/it][A

	loss_cls: tensor(0.5674, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2269, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7943, device='cuda:0', grad_fn=<AddBackward0>)



  8%|▊         | 30/354 [02:43<29:25,  5.45s/it][A

	loss_cls: tensor(0.5817, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0484, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6301, device='cuda:0', grad_fn=<AddBackward0>)



  9%|▉         | 31/354 [02:48<29:18,  5.44s/it][A

	loss_cls: tensor(0.4031, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0727, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4758, device='cuda:0', grad_fn=<AddBackward0>)



  9%|▉         | 32/354 [02:54<29:19,  5.47s/it][A

	loss_cls: tensor(0.5397, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3593, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8991, device='cuda:0', grad_fn=<AddBackward0>)



  9%|▉         | 33/354 [02:59<29:09,  5.45s/it][A

	loss_cls: tensor(0.6591, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2830, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9420, device='cuda:0', grad_fn=<AddBackward0>)



 10%|▉         | 34/354 [03:05<29:08,  5.46s/it][A

	loss_cls: tensor(0.5666, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2137, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7803, device='cuda:0', grad_fn=<AddBackward0>)



 10%|▉         | 35/354 [03:10<28:54,  5.44s/it][A

	loss_cls: tensor(0.5471, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2999, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8470, device='cuda:0', grad_fn=<AddBackward0>)



 10%|█         | 36/354 [03:16<28:52,  5.45s/it][A

	loss_cls: tensor(0.8197, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1992, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0189, device='cuda:0', grad_fn=<AddBackward0>)



 10%|█         | 37/354 [03:21<28:46,  5.45s/it][A

	loss_cls: tensor(0.4638, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0984, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5622, device='cuda:0', grad_fn=<AddBackward0>)



 11%|█         | 38/354 [03:27<28:40,  5.44s/it][A

	loss_cls: tensor(0.5572, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1916, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7488, device='cuda:0', grad_fn=<AddBackward0>)



 11%|█         | 39/354 [03:32<28:42,  5.47s/it][A

	loss_cls: tensor(0.8651, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3572, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2223, device='cuda:0', grad_fn=<AddBackward0>)



 11%|█▏        | 40/354 [03:37<28:33,  5.46s/it][A

	loss_cls: tensor(0.4703, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0606, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5310, device='cuda:0', grad_fn=<AddBackward0>)



 12%|█▏        | 41/354 [03:43<28:32,  5.47s/it][A

	loss_cls: tensor(0.5270, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2305, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7575, device='cuda:0', grad_fn=<AddBackward0>)



 12%|█▏        | 42/354 [03:48<28:23,  5.46s/it][A

	loss_cls: tensor(0.4980, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1515, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6496, device='cuda:0', grad_fn=<AddBackward0>)



 12%|█▏        | 43/354 [03:54<28:24,  5.48s/it][A

	loss_cls: tensor(0.4816, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1559, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6375, device='cuda:0', grad_fn=<AddBackward0>)



 12%|█▏        | 44/354 [03:59<28:16,  5.47s/it][A

	loss_cls: tensor(0.6127, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2420, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8547, device='cuda:0', grad_fn=<AddBackward0>)



 13%|█▎        | 45/354 [04:05<28:06,  5.46s/it][A

	loss_cls: tensor(0.4220, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1201, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5420, device='cuda:0', grad_fn=<AddBackward0>)



 13%|█▎        | 46/354 [04:10<28:04,  5.47s/it][A

	loss_cls: tensor(0.5901, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1632, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7532, device='cuda:0', grad_fn=<AddBackward0>)



 13%|█▎        | 47/354 [04:16<27:54,  5.45s/it][A

	loss_cls: tensor(0.5252, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0407, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5659, device='cuda:0', grad_fn=<AddBackward0>)



 14%|█▎        | 48/354 [04:21<27:54,  5.47s/it][A

	loss_cls: tensor(0.6401, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2011, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8412, device='cuda:0', grad_fn=<AddBackward0>)



 14%|█▍        | 49/354 [04:27<27:45,  5.46s/it][A

	loss_cls: tensor(0.5449, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0377, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5826, device='cuda:0', grad_fn=<AddBackward0>)



 14%|█▍        | 50/354 [04:32<27:44,  5.47s/it][A

	loss_cls: tensor(0.4005, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1139, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5144, device='cuda:0', grad_fn=<AddBackward0>)



 14%|█▍        | 51/354 [04:38<27:36,  5.47s/it][A

	loss_cls: tensor(0.4483, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1924, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6407, device='cuda:0', grad_fn=<AddBackward0>)



 15%|█▍        | 52/354 [04:43<27:27,  5.45s/it][A

	loss_cls: tensor(0.3627, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2080, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5707, device='cuda:0', grad_fn=<AddBackward0>)



 15%|█▍        | 53/354 [04:49<27:27,  5.47s/it][A

	loss_cls: tensor(0.7004, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1836, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8840, device='cuda:0', grad_fn=<AddBackward0>)



 15%|█▌        | 54/354 [04:54<27:16,  5.46s/it][A

	loss_cls: tensor(0.5002, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2716, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7718, device='cuda:0', grad_fn=<AddBackward0>)



 16%|█▌        | 55/354 [04:59<27:16,  5.47s/it][A

	loss_cls: tensor(0.7850, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0977, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8827, device='cuda:0', grad_fn=<AddBackward0>)



 16%|█▌        | 56/354 [05:05<27:07,  5.46s/it][A

	loss_cls: tensor(0.6674, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5193, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1867, device='cuda:0', grad_fn=<AddBackward0>)



 16%|█▌        | 57/354 [05:10<27:05,  5.47s/it][A

	loss_cls: tensor(0.5893, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0670, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6563, device='cuda:0', grad_fn=<AddBackward0>)



 16%|█▋        | 58/354 [05:16<26:52,  5.45s/it][A

	loss_cls: tensor(0.4532, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3634, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8166, device='cuda:0', grad_fn=<AddBackward0>)



 17%|█▋        | 59/354 [05:21<26:41,  5.43s/it][A

	loss_cls: tensor(0.4908, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3570, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8478, device='cuda:0', grad_fn=<AddBackward0>)



 17%|█▋        | 60/354 [05:27<26:43,  5.45s/it][A

	loss_cls: tensor(0.5620, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1968, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7588, device='cuda:0', grad_fn=<AddBackward0>)



 17%|█▋        | 61/354 [05:32<26:35,  5.45s/it][A

	loss_cls: tensor(0.4314, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0983, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5298, device='cuda:0', grad_fn=<AddBackward0>)



 18%|█▊        | 62/354 [05:38<26:38,  5.47s/it][A

	loss_cls: tensor(0.4554, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0671, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5225, device='cuda:0', grad_fn=<AddBackward0>)



 18%|█▊        | 63/354 [05:43<26:29,  5.46s/it][A

	loss_cls: tensor(0.4615, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0489, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5104, device='cuda:0', grad_fn=<AddBackward0>)



 18%|█▊        | 64/354 [05:49<26:28,  5.48s/it][A

	loss_cls: tensor(0.4068, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2364, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6432, device='cuda:0', grad_fn=<AddBackward0>)



 18%|█▊        | 65/354 [05:54<26:18,  5.46s/it][A

	loss_cls: tensor(0.5172, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1363, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6535, device='cuda:0', grad_fn=<AddBackward0>)



 19%|█▊        | 66/354 [05:59<26:08,  5.45s/it][A

	loss_cls: tensor(0.5603, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1005, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6608, device='cuda:0', grad_fn=<AddBackward0>)



 19%|█▉        | 67/354 [06:05<26:07,  5.46s/it][A

	loss_cls: tensor(0.7281, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2075, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9356, device='cuda:0', grad_fn=<AddBackward0>)



 19%|█▉        | 68/354 [06:10<25:59,  5.45s/it][A

	loss_cls: tensor(0.4116, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.7099, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1215, device='cuda:0', grad_fn=<AddBackward0>)



 19%|█▉        | 69/354 [06:16<25:59,  5.47s/it][A

	loss_cls: tensor(1.0993, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1464, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2458, device='cuda:0', grad_fn=<AddBackward0>)



 20%|█▉        | 70/354 [06:21<25:51,  5.46s/it][A

	loss_cls: tensor(0.4265, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1046, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5311, device='cuda:0', grad_fn=<AddBackward0>)



 20%|██        | 71/354 [06:27<25:48,  5.47s/it][A

	loss_cls: tensor(0.4396, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2028, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6424, device='cuda:0', grad_fn=<AddBackward0>)



 20%|██        | 72/354 [06:32<25:40,  5.46s/it][A

	loss_cls: tensor(0.5235, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3056, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8291, device='cuda:0', grad_fn=<AddBackward0>)



 21%|██        | 73/354 [06:38<25:28,  5.44s/it][A

	loss_cls: tensor(0.6870, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1913, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8783, device='cuda:0', grad_fn=<AddBackward0>)



 21%|██        | 74/354 [06:43<25:24,  5.44s/it][A

	loss_cls: tensor(0.5164, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0395, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5559, device='cuda:0', grad_fn=<AddBackward0>)



 21%|██        | 75/354 [06:48<25:11,  5.42s/it][A

	loss_cls: tensor(0.3760, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0337, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4097, device='cuda:0', grad_fn=<AddBackward0>)



 21%|██▏       | 76/354 [06:54<25:12,  5.44s/it][A

	loss_cls: tensor(0.5942, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2079, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8021, device='cuda:0', grad_fn=<AddBackward0>)



 22%|██▏       | 77/354 [06:59<25:03,  5.43s/it][A

	loss_cls: tensor(0.4266, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1344, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5610, device='cuda:0', grad_fn=<AddBackward0>)



 22%|██▏       | 78/354 [07:05<25:04,  5.45s/it][A

	loss_cls: tensor(0.5704, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2217, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7921, device='cuda:0', grad_fn=<AddBackward0>)



 22%|██▏       | 79/354 [07:10<24:54,  5.43s/it][A

	loss_cls: tensor(0.4812, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1036, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5848, device='cuda:0', grad_fn=<AddBackward0>)



 23%|██▎       | 80/354 [07:16<24:45,  5.42s/it][A

	loss_cls: tensor(0.4704, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1181, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5885, device='cuda:0', grad_fn=<AddBackward0>)



 23%|██▎       | 81/354 [07:21<24:45,  5.44s/it][A

	loss_cls: tensor(1.0395, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1848, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2243, device='cuda:0', grad_fn=<AddBackward0>)



 23%|██▎       | 82/354 [07:26<24:29,  5.40s/it][A

	loss_cls: tensor(0.6353, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5076, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1430, device='cuda:0', grad_fn=<AddBackward0>)



 23%|██▎       | 83/354 [07:32<24:35,  5.44s/it][A

	loss_cls: tensor(0.5362, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3873, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9235, device='cuda:0', grad_fn=<AddBackward0>)



 24%|██▎       | 84/354 [07:37<24:25,  5.43s/it][A

	loss_cls: tensor(0.4087, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1654, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5742, device='cuda:0', grad_fn=<AddBackward0>)



 24%|██▍       | 85/354 [07:43<24:23,  5.44s/it][A

	loss_cls: tensor(0.7111, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1585, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8696, device='cuda:0', grad_fn=<AddBackward0>)



 24%|██▍       | 86/354 [07:48<24:20,  5.45s/it][A

	loss_cls: tensor(0.5976, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1839, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7816, device='cuda:0', grad_fn=<AddBackward0>)



 25%|██▍       | 87/354 [07:54<24:08,  5.42s/it][A

	loss_cls: tensor(0.4646, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3395, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8040, device='cuda:0', grad_fn=<AddBackward0>)



 25%|██▍       | 88/354 [07:59<24:07,  5.44s/it][A

	loss_cls: tensor(0.7084, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2264, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9348, device='cuda:0', grad_fn=<AddBackward0>)



 25%|██▌       | 89/354 [08:05<23:57,  5.42s/it][A

	loss_cls: tensor(0.6977, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2025, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9002, device='cuda:0', grad_fn=<AddBackward0>)



 25%|██▌       | 90/354 [08:10<23:55,  5.44s/it][A

	loss_cls: tensor(0.4724, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1913, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6637, device='cuda:0', grad_fn=<AddBackward0>)



 26%|██▌       | 91/354 [08:15<23:42,  5.41s/it][A

	loss_cls: tensor(0.3828, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1564, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5392, device='cuda:0', grad_fn=<AddBackward0>)



 26%|██▌       | 92/354 [08:21<23:41,  5.43s/it][A

	loss_cls: tensor(0.5099, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3561, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8660, device='cuda:0', grad_fn=<AddBackward0>)



 26%|██▋       | 93/354 [08:26<23:30,  5.40s/it][A

	loss_cls: tensor(0.4691, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1860, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6552, device='cuda:0', grad_fn=<AddBackward0>)



 27%|██▋       | 94/354 [08:32<23:20,  5.39s/it][A

	loss_cls: tensor(0.5608, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2915, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8523, device='cuda:0', grad_fn=<AddBackward0>)



 27%|██▋       | 95/354 [08:37<23:18,  5.40s/it][A

	loss_cls: tensor(0.5543, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3964, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9507, device='cuda:0', grad_fn=<AddBackward0>)



 27%|██▋       | 96/354 [08:42<23:09,  5.38s/it][A

	loss_cls: tensor(0.4146, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1648, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5795, device='cuda:0', grad_fn=<AddBackward0>)



 27%|██▋       | 97/354 [08:48<23:10,  5.41s/it][A

	loss_cls: tensor(0.5810, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3456, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9266, device='cuda:0', grad_fn=<AddBackward0>)



 28%|██▊       | 98/354 [08:53<23:03,  5.41s/it][A

	loss_cls: tensor(0.4513, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0769, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5282, device='cuda:0', grad_fn=<AddBackward0>)



 28%|██▊       | 99/354 [08:59<23:02,  5.42s/it][A

	loss_cls: tensor(0.5515, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1831, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7346, device='cuda:0', grad_fn=<AddBackward0>)



 28%|██▊       | 100/354 [09:04<22:52,  5.41s/it][A

	loss_cls: tensor(0.6384, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2531, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8915, device='cuda:0', grad_fn=<AddBackward0>)



 29%|██▊       | 101/354 [09:09<22:43,  5.39s/it][A

	loss_cls: tensor(0.6990, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2431, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9421, device='cuda:0', grad_fn=<AddBackward0>)



 29%|██▉       | 102/354 [09:15<22:42,  5.41s/it][A

	loss_cls: tensor(0.8936, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2928, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1864, device='cuda:0', grad_fn=<AddBackward0>)



 29%|██▉       | 103/354 [09:20<22:34,  5.40s/it][A

	loss_cls: tensor(0.5331, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2709, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8040, device='cuda:0', grad_fn=<AddBackward0>)



 29%|██▉       | 104/354 [09:26<22:37,  5.43s/it][A

	loss_cls: tensor(0.6034, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0478, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6513, device='cuda:0', grad_fn=<AddBackward0>)



 30%|██▉       | 105/354 [09:31<22:32,  5.43s/it][A

	loss_cls: tensor(0.7018, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2027, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9045, device='cuda:0', grad_fn=<AddBackward0>)



 30%|██▉       | 106/354 [09:37<22:32,  5.45s/it][A

	loss_cls: tensor(0.6561, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2072, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8633, device='cuda:0', grad_fn=<AddBackward0>)



 30%|███       | 107/354 [09:42<22:21,  5.43s/it][A

	loss_cls: tensor(0.6332, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2223, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8555, device='cuda:0', grad_fn=<AddBackward0>)



 31%|███       | 108/354 [09:47<22:12,  5.42s/it][A

	loss_cls: tensor(0.4919, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3208, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8127, device='cuda:0', grad_fn=<AddBackward0>)



 31%|███       | 109/354 [09:53<22:07,  5.42s/it][A

	loss_cls: tensor(0.6666, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2187, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8852, device='cuda:0', grad_fn=<AddBackward0>)



 31%|███       | 110/354 [09:58<21:59,  5.41s/it][A

	loss_cls: tensor(0.5074, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1679, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6753, device='cuda:0', grad_fn=<AddBackward0>)



 31%|███▏      | 111/354 [10:04<21:56,  5.42s/it][A

	loss_cls: tensor(0.7137, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1447, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8584, device='cuda:0', grad_fn=<AddBackward0>)



 32%|███▏      | 112/354 [10:09<21:47,  5.40s/it][A

	loss_cls: tensor(0.8652, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4893, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.3546, device='cuda:0', grad_fn=<AddBackward0>)



 32%|███▏      | 113/354 [10:14<21:43,  5.41s/it][A

	loss_cls: tensor(0.5366, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4554, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9920, device='cuda:0', grad_fn=<AddBackward0>)



 32%|███▏      | 114/354 [10:20<21:36,  5.40s/it][A

	loss_cls: tensor(0.5796, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1983, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7779, device='cuda:0', grad_fn=<AddBackward0>)



 32%|███▏      | 115/354 [10:25<21:29,  5.40s/it][A

	loss_cls: tensor(0.4335, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1568, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5903, device='cuda:0', grad_fn=<AddBackward0>)



 33%|███▎      | 116/354 [10:31<21:28,  5.41s/it][A

	loss_cls: tensor(0.6417, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3774, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0191, device='cuda:0', grad_fn=<AddBackward0>)



 33%|███▎      | 117/354 [10:36<21:22,  5.41s/it][A

	loss_cls: tensor(0.4781, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1495, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6276, device='cuda:0', grad_fn=<AddBackward0>)



 33%|███▎      | 118/354 [10:42<21:20,  5.43s/it][A

	loss_cls: tensor(0.5597, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1773, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7370, device='cuda:0', grad_fn=<AddBackward0>)



 34%|███▎      | 119/354 [10:47<21:09,  5.40s/it][A

	loss_cls: tensor(0.4342, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3267, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7609, device='cuda:0', grad_fn=<AddBackward0>)



 34%|███▍      | 120/354 [10:52<21:08,  5.42s/it][A

	loss_cls: tensor(0.6202, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3658, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9860, device='cuda:0', grad_fn=<AddBackward0>)



 34%|███▍      | 121/354 [10:58<20:57,  5.40s/it][A

	loss_cls: tensor(0.7001, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1350, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8352, device='cuda:0', grad_fn=<AddBackward0>)



 34%|███▍      | 122/354 [11:03<20:54,  5.41s/it][A

	loss_cls: tensor(0.6465, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4604, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1070, device='cuda:0', grad_fn=<AddBackward0>)



 35%|███▍      | 123/354 [11:08<20:45,  5.39s/it][A

	loss_cls: tensor(0.6565, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2657, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9222, device='cuda:0', grad_fn=<AddBackward0>)



 35%|███▌      | 124/354 [11:14<20:38,  5.39s/it][A

	loss_cls: tensor(0.3346, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1119, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4465, device='cuda:0', grad_fn=<AddBackward0>)



 35%|███▌      | 125/354 [11:19<20:36,  5.40s/it][A

	loss_cls: tensor(0.4506, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3303, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7809, device='cuda:0', grad_fn=<AddBackward0>)



 36%|███▌      | 126/354 [11:25<20:30,  5.40s/it][A

	loss_cls: tensor(0.3726, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2130, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5856, device='cuda:0', grad_fn=<AddBackward0>)



 36%|███▌      | 127/354 [11:30<20:25,  5.40s/it][A

	loss_cls: tensor(0.6417, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2034, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8451, device='cuda:0', grad_fn=<AddBackward0>)



 36%|███▌      | 128/354 [11:35<20:19,  5.40s/it][A

	loss_cls: tensor(0.9863, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2371, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2234, device='cuda:0', grad_fn=<AddBackward0>)



 36%|███▋      | 129/354 [11:41<20:15,  5.40s/it][A

	loss_cls: tensor(0.4255, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1718, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5974, device='cuda:0', grad_fn=<AddBackward0>)



 37%|███▋      | 130/354 [11:46<20:09,  5.40s/it][A

	loss_cls: tensor(0.7339, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2485, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9823, device='cuda:0', grad_fn=<AddBackward0>)



 37%|███▋      | 131/354 [11:52<20:03,  5.40s/it][A

	loss_cls: tensor(0.9314, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1618, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0932, device='cuda:0', grad_fn=<AddBackward0>)



 37%|███▋      | 132/354 [11:57<20:03,  5.42s/it][A

	loss_cls: tensor(0.5194, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2035, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7229, device='cuda:0', grad_fn=<AddBackward0>)



 38%|███▊      | 133/354 [12:02<19:54,  5.40s/it][A

	loss_cls: tensor(0.5776, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3427, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9203, device='cuda:0', grad_fn=<AddBackward0>)



 38%|███▊      | 134/354 [12:08<19:51,  5.42s/it][A

	loss_cls: tensor(0.3541, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1886, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5426, device='cuda:0', grad_fn=<AddBackward0>)



 38%|███▊      | 135/354 [12:13<19:41,  5.39s/it][A

	loss_cls: tensor(0.5391, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1856, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7247, device='cuda:0', grad_fn=<AddBackward0>)



 38%|███▊      | 136/354 [12:19<19:40,  5.42s/it][A

	loss_cls: tensor(0.4866, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2023, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6889, device='cuda:0', grad_fn=<AddBackward0>)



 39%|███▊      | 137/354 [12:24<19:32,  5.40s/it][A

	loss_cls: tensor(0.5949, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3097, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9047, device='cuda:0', grad_fn=<AddBackward0>)



 39%|███▉      | 138/354 [12:29<19:25,  5.40s/it][A

	loss_cls: tensor(0.5297, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1227, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6524, device='cuda:0', grad_fn=<AddBackward0>)



 39%|███▉      | 139/354 [12:35<19:24,  5.41s/it][A

	loss_cls: tensor(0.4334, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1129, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5464, device='cuda:0', grad_fn=<AddBackward0>)



 40%|███▉      | 140/354 [12:40<19:16,  5.40s/it][A

	loss_cls: tensor(0.8602, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3206, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1808, device='cuda:0', grad_fn=<AddBackward0>)



 40%|███▉      | 141/354 [12:46<19:12,  5.41s/it][A

	loss_cls: tensor(0.6859, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2554, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9413, device='cuda:0', grad_fn=<AddBackward0>)



 40%|████      | 142/354 [12:51<19:05,  5.40s/it][A

	loss_cls: tensor(0.4469, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1866, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6336, device='cuda:0', grad_fn=<AddBackward0>)



 40%|████      | 143/354 [12:57<19:02,  5.42s/it][A

	loss_cls: tensor(0.4744, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2079, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6824, device='cuda:0', grad_fn=<AddBackward0>)



 41%|████      | 144/354 [13:02<18:56,  5.41s/it][A

	loss_cls: tensor(0.4577, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1918, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6495, device='cuda:0', grad_fn=<AddBackward0>)



 41%|████      | 145/354 [13:07<18:48,  5.40s/it][A

	loss_cls: tensor(0.9084, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3331, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2415, device='cuda:0', grad_fn=<AddBackward0>)



 41%|████      | 146/354 [13:13<18:48,  5.42s/it][A

	loss_cls: tensor(0.5895, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1932, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7827, device='cuda:0', grad_fn=<AddBackward0>)



 42%|████▏     | 147/354 [13:18<18:38,  5.40s/it][A

	loss_cls: tensor(0.5266, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1897, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7164, device='cuda:0', grad_fn=<AddBackward0>)



 42%|████▏     | 148/354 [13:24<18:35,  5.41s/it][A

	loss_cls: tensor(0.5708, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4302, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0010, device='cuda:0', grad_fn=<AddBackward0>)



 42%|████▏     | 149/354 [13:29<18:28,  5.41s/it][A

	loss_cls: tensor(0.7053, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2949, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0002, device='cuda:0', grad_fn=<AddBackward0>)



 42%|████▏     | 150/354 [13:35<18:27,  5.43s/it][A

	loss_cls: tensor(0.5290, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1052, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6342, device='cuda:0', grad_fn=<AddBackward0>)



 43%|████▎     | 151/354 [13:40<18:18,  5.41s/it][A

	loss_cls: tensor(0.6845, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2751, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9596, device='cuda:0', grad_fn=<AddBackward0>)



 43%|████▎     | 152/354 [13:45<18:17,  5.44s/it][A

	loss_cls: tensor(0.6789, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2702, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9492, device='cuda:0', grad_fn=<AddBackward0>)



 43%|████▎     | 153/354 [13:51<18:19,  5.47s/it][A

	loss_cls: tensor(0.5925, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3871, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9796, device='cuda:0', grad_fn=<AddBackward0>)



 44%|████▎     | 154/354 [13:56<18:17,  5.49s/it][A

	loss_cls: tensor(0.4985, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3707, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8692, device='cuda:0', grad_fn=<AddBackward0>)



 44%|████▍     | 155/354 [14:02<18:17,  5.52s/it][A

	loss_cls: tensor(0.3874, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2861, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6735, device='cuda:0', grad_fn=<AddBackward0>)



 44%|████▍     | 156/354 [14:08<18:14,  5.53s/it][A

	loss_cls: tensor(0.5810, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3935, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9745, device='cuda:0', grad_fn=<AddBackward0>)



 44%|████▍     | 157/354 [14:13<18:03,  5.50s/it][A

	loss_cls: tensor(0.4334, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1183, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5517, device='cuda:0', grad_fn=<AddBackward0>)



 45%|████▍     | 158/354 [14:19<18:02,  5.52s/it][A

	loss_cls: tensor(0.8929, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1086, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0015, device='cuda:0', grad_fn=<AddBackward0>)



 45%|████▍     | 159/354 [14:24<17:56,  5.52s/it][A

	loss_cls: tensor(0.4066, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0530, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4595, device='cuda:0', grad_fn=<AddBackward0>)



 45%|████▌     | 160/354 [14:30<17:55,  5.54s/it][A

	loss_cls: tensor(0.8423, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3828, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2251, device='cuda:0', grad_fn=<AddBackward0>)



 45%|████▌     | 161/354 [14:35<17:47,  5.53s/it][A

	loss_cls: tensor(0.7631, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1053, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8684, device='cuda:0', grad_fn=<AddBackward0>)



 46%|████▌     | 162/354 [14:41<17:45,  5.55s/it][A

	loss_cls: tensor(0.8213, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3050, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1263, device='cuda:0', grad_fn=<AddBackward0>)



 46%|████▌     | 163/354 [14:46<17:39,  5.55s/it][A

	loss_cls: tensor(0.6764, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2087, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8851, device='cuda:0', grad_fn=<AddBackward0>)



 46%|████▋     | 164/354 [14:52<17:37,  5.57s/it][A

	loss_cls: tensor(0.6587, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3062, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9649, device='cuda:0', grad_fn=<AddBackward0>)



 47%|████▋     | 165/354 [14:57<17:29,  5.55s/it][A

	loss_cls: tensor(0.5342, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2195, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7537, device='cuda:0', grad_fn=<AddBackward0>)



 47%|████▋     | 166/354 [15:03<17:22,  5.55s/it][A

	loss_cls: tensor(0.5732, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1743, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7474, device='cuda:0', grad_fn=<AddBackward0>)



 47%|████▋     | 167/354 [15:09<17:19,  5.56s/it][A

	loss_cls: tensor(0.6590, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2617, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9206, device='cuda:0', grad_fn=<AddBackward0>)



 47%|████▋     | 168/354 [15:14<17:10,  5.54s/it][A

	loss_cls: tensor(0.5367, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3219, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8586, device='cuda:0', grad_fn=<AddBackward0>)



 48%|████▊     | 169/354 [15:20<17:09,  5.56s/it][A

	loss_cls: tensor(0.8600, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0781, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9381, device='cuda:0', grad_fn=<AddBackward0>)



 48%|████▊     | 170/354 [15:25<16:52,  5.50s/it][A

	loss_cls: tensor(0.6433, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1625, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8058, device='cuda:0', grad_fn=<AddBackward0>)



 48%|████▊     | 171/354 [15:31<16:43,  5.48s/it][A

	loss_cls: tensor(1.1527, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3546, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.5074, device='cuda:0', grad_fn=<AddBackward0>)



 49%|████▊     | 172/354 [15:36<16:31,  5.45s/it][A

	loss_cls: tensor(0.6248, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3565, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9813, device='cuda:0', grad_fn=<AddBackward0>)



 49%|████▉     | 173/354 [15:41<16:19,  5.41s/it][A

	loss_cls: tensor(0.7051, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4942, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1993, device='cuda:0', grad_fn=<AddBackward0>)



 49%|████▉     | 174/354 [15:47<16:14,  5.41s/it][A

	loss_cls: tensor(0.5870, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3989, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9859, device='cuda:0', grad_fn=<AddBackward0>)



 49%|████▉     | 175/354 [15:52<16:07,  5.41s/it][A

	loss_cls: tensor(0.4166, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4085, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8252, device='cuda:0', grad_fn=<AddBackward0>)



 50%|████▉     | 176/354 [15:57<16:05,  5.42s/it][A

	loss_cls: tensor(0.4699, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2713, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7413, device='cuda:0', grad_fn=<AddBackward0>)



 50%|█████     | 177/354 [16:03<15:57,  5.41s/it][A

	loss_cls: tensor(0.7053, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2079, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9133, device='cuda:0', grad_fn=<AddBackward0>)



 50%|█████     | 178/354 [16:08<15:54,  5.42s/it][A

	loss_cls: tensor(0.6654, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0899, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7553, device='cuda:0', grad_fn=<AddBackward0>)



 51%|█████     | 179/354 [16:14<15:45,  5.40s/it][A

	loss_cls: tensor(0.5287, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0736, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6024, device='cuda:0', grad_fn=<AddBackward0>)



 51%|█████     | 180/354 [16:19<15:38,  5.39s/it][A

	loss_cls: tensor(0.5045, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3789, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8833, device='cuda:0', grad_fn=<AddBackward0>)



 51%|█████     | 181/354 [16:24<15:34,  5.40s/it][A

	loss_cls: tensor(0.6168, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1482, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7649, device='cuda:0', grad_fn=<AddBackward0>)



 51%|█████▏    | 182/354 [16:30<15:28,  5.40s/it][A

	loss_cls: tensor(0.4514, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4359, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8873, device='cuda:0', grad_fn=<AddBackward0>)



 52%|█████▏    | 183/354 [16:35<15:28,  5.43s/it][A

	loss_cls: tensor(0.5556, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0613, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6170, device='cuda:0', grad_fn=<AddBackward0>)



 52%|█████▏    | 184/354 [16:41<15:21,  5.42s/it][A

	loss_cls: tensor(0.4969, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3015, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7984, device='cuda:0', grad_fn=<AddBackward0>)



 52%|█████▏    | 185/354 [16:46<15:17,  5.43s/it][A

	loss_cls: tensor(1.0745, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2345, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.3090, device='cuda:0', grad_fn=<AddBackward0>)



 53%|█████▎    | 186/354 [16:52<15:11,  5.43s/it][A

	loss_cls: tensor(0.5378, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1548, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6926, device='cuda:0', grad_fn=<AddBackward0>)



 53%|█████▎    | 187/354 [16:57<15:04,  5.42s/it][A

	loss_cls: tensor(0.7928, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2230, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0158, device='cuda:0', grad_fn=<AddBackward0>)



 53%|█████▎    | 188/354 [17:03<15:03,  5.44s/it][A

	loss_cls: tensor(0.4901, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4376, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9276, device='cuda:0', grad_fn=<AddBackward0>)



 53%|█████▎    | 189/354 [17:08<14:57,  5.44s/it][A

	loss_cls: tensor(0.5331, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2498, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7829, device='cuda:0', grad_fn=<AddBackward0>)



 54%|█████▎    | 190/354 [17:13<14:55,  5.46s/it][A

	loss_cls: tensor(0.4604, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2097, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6701, device='cuda:0', grad_fn=<AddBackward0>)



 54%|█████▍    | 191/354 [17:19<14:47,  5.45s/it][A

	loss_cls: tensor(0.7527, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2024, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9550, device='cuda:0', grad_fn=<AddBackward0>)



 54%|█████▍    | 192/354 [17:24<14:44,  5.46s/it][A

	loss_cls: tensor(0.6459, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1571, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8030, device='cuda:0', grad_fn=<AddBackward0>)



 55%|█████▍    | 193/354 [17:30<14:36,  5.45s/it][A

	loss_cls: tensor(0.3975, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4258, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8233, device='cuda:0', grad_fn=<AddBackward0>)



 55%|█████▍    | 194/354 [17:35<14:28,  5.43s/it][A

	loss_cls: tensor(0.7013, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1729, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8743, device='cuda:0', grad_fn=<AddBackward0>)



 55%|█████▌    | 195/354 [17:41<14:25,  5.44s/it][A

	loss_cls: tensor(0.4720, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5263, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9983, device='cuda:0', grad_fn=<AddBackward0>)



 55%|█████▌    | 196/354 [17:46<14:17,  5.43s/it][A

	loss_cls: tensor(0.6576, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3544, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0121, device='cuda:0', grad_fn=<AddBackward0>)



 56%|█████▌    | 197/354 [17:51<14:13,  5.44s/it][A

	loss_cls: tensor(0.7626, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0922, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8549, device='cuda:0', grad_fn=<AddBackward0>)



 56%|█████▌    | 198/354 [17:57<14:06,  5.43s/it][A

	loss_cls: tensor(0.5058, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3541, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8599, device='cuda:0', grad_fn=<AddBackward0>)



 56%|█████▌    | 199/354 [18:02<14:03,  5.44s/it][A

	loss_cls: tensor(0.5735, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0921, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6657, device='cuda:0', grad_fn=<AddBackward0>)



 56%|█████▋    | 200/354 [18:08<13:58,  5.44s/it][A

	loss_cls: tensor(0.6344, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2749, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9093, device='cuda:0', grad_fn=<AddBackward0>)



 57%|█████▋    | 201/354 [18:13<13:50,  5.43s/it][A

	loss_cls: tensor(0.6086, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1464, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7550, device='cuda:0', grad_fn=<AddBackward0>)



 57%|█████▋    | 202/354 [18:19<13:46,  5.44s/it][A

	loss_cls: tensor(0.4313, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2700, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7013, device='cuda:0', grad_fn=<AddBackward0>)



 57%|█████▋    | 203/354 [18:24<13:39,  5.43s/it][A

	loss_cls: tensor(0.6730, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1259, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7989, device='cuda:0', grad_fn=<AddBackward0>)



 58%|█████▊    | 204/354 [18:30<13:35,  5.44s/it][A

	loss_cls: tensor(0.7266, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3937, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1204, device='cuda:0', grad_fn=<AddBackward0>)



 58%|█████▊    | 205/354 [18:35<13:28,  5.43s/it][A

	loss_cls: tensor(0.5274, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0567, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5841, device='cuda:0', grad_fn=<AddBackward0>)



 58%|█████▊    | 206/354 [18:40<13:23,  5.43s/it][A

	loss_cls: tensor(0.4181, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0829, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5010, device='cuda:0', grad_fn=<AddBackward0>)



 58%|█████▊    | 207/354 [18:46<13:15,  5.41s/it][A

	loss_cls: tensor(0.7137, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2805, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9942, device='cuda:0', grad_fn=<AddBackward0>)



 59%|█████▉    | 208/354 [18:51<13:08,  5.40s/it][A

	loss_cls: tensor(1.0066, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1116, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1183, device='cuda:0', grad_fn=<AddBackward0>)



 59%|█████▉    | 209/354 [18:57<13:05,  5.41s/it][A

	loss_cls: tensor(0.5435, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1291, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6726, device='cuda:0', grad_fn=<AddBackward0>)



 59%|█████▉    | 210/354 [19:02<12:59,  5.41s/it][A

	loss_cls: tensor(0.5412, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1849, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7261, device='cuda:0', grad_fn=<AddBackward0>)



 60%|█████▉    | 211/354 [19:07<12:55,  5.42s/it][A

	loss_cls: tensor(0.6869, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2189, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9058, device='cuda:0', grad_fn=<AddBackward0>)



 60%|█████▉    | 212/354 [19:13<12:48,  5.42s/it][A

	loss_cls: tensor(0.4631, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2968, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7599, device='cuda:0', grad_fn=<AddBackward0>)



 60%|██████    | 213/354 [19:18<12:44,  5.42s/it][A

	loss_cls: tensor(0.5936, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2364, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8299, device='cuda:0', grad_fn=<AddBackward0>)



 60%|██████    | 214/354 [19:24<12:37,  5.41s/it][A

	loss_cls: tensor(0.6429, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1088, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7517, device='cuda:0', grad_fn=<AddBackward0>)



 61%|██████    | 215/354 [19:29<12:30,  5.40s/it][A

	loss_cls: tensor(0.4507, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2087, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6594, device='cuda:0', grad_fn=<AddBackward0>)



 61%|██████    | 216/354 [19:34<12:26,  5.41s/it][A

	loss_cls: tensor(0.6957, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1409, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8366, device='cuda:0', grad_fn=<AddBackward0>)



 61%|██████▏   | 217/354 [19:40<12:19,  5.40s/it][A

	loss_cls: tensor(0.5968, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1913, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7881, device='cuda:0', grad_fn=<AddBackward0>)



 62%|██████▏   | 218/354 [19:45<12:17,  5.42s/it][A

	loss_cls: tensor(0.5626, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1987, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7613, device='cuda:0', grad_fn=<AddBackward0>)



 62%|██████▏   | 219/354 [19:51<12:09,  5.41s/it][A

	loss_cls: tensor(0.5712, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1031, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6743, device='cuda:0', grad_fn=<AddBackward0>)



 62%|██████▏   | 220/354 [19:56<12:05,  5.42s/it][A

	loss_cls: tensor(0.5468, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3531, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8999, device='cuda:0', grad_fn=<AddBackward0>)



 62%|██████▏   | 221/354 [20:01<11:58,  5.40s/it][A

	loss_cls: tensor(0.7811, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1696, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9506, device='cuda:0', grad_fn=<AddBackward0>)



 63%|██████▎   | 222/354 [20:07<11:55,  5.42s/it][A

	loss_cls: tensor(0.4695, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1512, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6207, device='cuda:0', grad_fn=<AddBackward0>)



 63%|██████▎   | 223/354 [20:12<11:51,  5.43s/it][A

	loss_cls: tensor(0.5725, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0677, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6402, device='cuda:0', grad_fn=<AddBackward0>)



 63%|██████▎   | 224/354 [20:18<11:44,  5.42s/it][A

	loss_cls: tensor(0.5225, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1019, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6245, device='cuda:0', grad_fn=<AddBackward0>)



 64%|██████▎   | 225/354 [20:23<11:40,  5.43s/it][A

	loss_cls: tensor(0.5156, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1328, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6483, device='cuda:0', grad_fn=<AddBackward0>)



 64%|██████▍   | 226/354 [20:29<11:34,  5.42s/it][A

	loss_cls: tensor(0.7674, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3912, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1585, device='cuda:0', grad_fn=<AddBackward0>)



 64%|██████▍   | 227/354 [20:34<11:31,  5.44s/it][A

	loss_cls: tensor(0.6128, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2032, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8160, device='cuda:0', grad_fn=<AddBackward0>)



 64%|██████▍   | 228/354 [20:40<11:23,  5.42s/it][A

	loss_cls: tensor(0.5726, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3266, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8993, device='cuda:0', grad_fn=<AddBackward0>)



 65%|██████▍   | 229/354 [20:45<11:19,  5.44s/it][A

	loss_cls: tensor(0.6892, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1987, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8879, device='cuda:0', grad_fn=<AddBackward0>)



 65%|██████▍   | 230/354 [20:50<11:12,  5.42s/it][A

	loss_cls: tensor(0.9222, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2030, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1252, device='cuda:0', grad_fn=<AddBackward0>)



 65%|██████▌   | 231/354 [20:56<11:05,  5.41s/it][A

	loss_cls: tensor(0.4063, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1889, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5952, device='cuda:0', grad_fn=<AddBackward0>)



 66%|██████▌   | 232/354 [21:01<11:03,  5.44s/it][A

	loss_cls: tensor(0.7518, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2564, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0082, device='cuda:0', grad_fn=<AddBackward0>)



 66%|██████▌   | 233/354 [21:07<10:55,  5.42s/it][A

	loss_cls: tensor(0.7684, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2820, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0504, device='cuda:0', grad_fn=<AddBackward0>)



 66%|██████▌   | 234/354 [21:12<10:52,  5.43s/it][A

	loss_cls: tensor(0.4840, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0427, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5267, device='cuda:0', grad_fn=<AddBackward0>)



 66%|██████▋   | 235/354 [21:17<10:43,  5.41s/it][A

	loss_cls: tensor(0.5018, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0394, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5412, device='cuda:0', grad_fn=<AddBackward0>)



 67%|██████▋   | 236/354 [21:23<10:41,  5.43s/it][A

	loss_cls: tensor(0.7284, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1580, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8864, device='cuda:0', grad_fn=<AddBackward0>)



 67%|██████▋   | 237/354 [21:28<10:33,  5.42s/it][A

	loss_cls: tensor(0.5052, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0429, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5481, device='cuda:0', grad_fn=<AddBackward0>)



 67%|██████▋   | 238/354 [21:34<10:27,  5.41s/it][A

	loss_cls: tensor(0.7285, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2243, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9528, device='cuda:0', grad_fn=<AddBackward0>)



 68%|██████▊   | 239/354 [21:39<10:23,  5.42s/it][A

	loss_cls: tensor(0.5818, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1470, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7287, device='cuda:0', grad_fn=<AddBackward0>)



 68%|██████▊   | 240/354 [21:45<10:17,  5.41s/it][A

	loss_cls: tensor(0.6680, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1572, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8251, device='cuda:0', grad_fn=<AddBackward0>)



 68%|██████▊   | 241/354 [21:50<10:12,  5.42s/it][A

	loss_cls: tensor(0.5759, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0878, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6637, device='cuda:0', grad_fn=<AddBackward0>)



 68%|██████▊   | 242/354 [21:55<10:05,  5.41s/it][A

	loss_cls: tensor(0.7588, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1236, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8824, device='cuda:0', grad_fn=<AddBackward0>)



 69%|██████▊   | 243/354 [22:01<10:01,  5.42s/it][A

	loss_cls: tensor(0.4560, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2015, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6575, device='cuda:0', grad_fn=<AddBackward0>)



 69%|██████▉   | 244/354 [22:06<09:54,  5.41s/it][A

	loss_cls: tensor(0.5686, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2577, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8263, device='cuda:0', grad_fn=<AddBackward0>)



 69%|██████▉   | 245/354 [22:12<09:49,  5.41s/it][A

	loss_cls: tensor(0.6024, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1682, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7706, device='cuda:0', grad_fn=<AddBackward0>)



 69%|██████▉   | 246/354 [22:17<09:46,  5.43s/it][A

	loss_cls: tensor(0.6086, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0999, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7085, device='cuda:0', grad_fn=<AddBackward0>)



 70%|██████▉   | 247/354 [22:23<09:41,  5.43s/it][A

	loss_cls: tensor(0.5097, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1179, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6276, device='cuda:0', grad_fn=<AddBackward0>)



 70%|███████   | 248/354 [22:28<09:37,  5.45s/it][A

	loss_cls: tensor(0.5236, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4274, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9510, device='cuda:0', grad_fn=<AddBackward0>)



 70%|███████   | 249/354 [22:33<09:29,  5.42s/it][A

	loss_cls: tensor(0.5812, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2504, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8316, device='cuda:0', grad_fn=<AddBackward0>)



 71%|███████   | 250/354 [22:39<09:25,  5.44s/it][A

	loss_cls: tensor(0.6438, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1322, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7760, device='cuda:0', grad_fn=<AddBackward0>)



 71%|███████   | 251/354 [22:44<09:17,  5.42s/it][A

	loss_cls: tensor(0.5286, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1687, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6973, device='cuda:0', grad_fn=<AddBackward0>)



 71%|███████   | 252/354 [22:50<09:11,  5.41s/it][A

	loss_cls: tensor(0.7478, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0911, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8389, device='cuda:0', grad_fn=<AddBackward0>)



 71%|███████▏  | 253/354 [22:55<09:07,  5.42s/it][A

	loss_cls: tensor(0.5322, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2028, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7349, device='cuda:0', grad_fn=<AddBackward0>)



 72%|███████▏  | 254/354 [23:00<09:00,  5.40s/it][A

	loss_cls: tensor(0.4602, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4437, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9039, device='cuda:0', grad_fn=<AddBackward0>)



 72%|███████▏  | 255/354 [23:06<08:55,  5.41s/it][A

	loss_cls: tensor(0.4951, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2607, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7558, device='cuda:0', grad_fn=<AddBackward0>)



 72%|███████▏  | 256/354 [23:11<08:49,  5.40s/it][A

	loss_cls: tensor(0.5567, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0376, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5944, device='cuda:0', grad_fn=<AddBackward0>)



 73%|███████▎  | 257/354 [23:17<08:44,  5.41s/it][A

	loss_cls: tensor(0.3454, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2272, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5725, device='cuda:0', grad_fn=<AddBackward0>)



 73%|███████▎  | 258/354 [23:22<08:38,  5.40s/it][A

	loss_cls: tensor(0.5256, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2099, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7355, device='cuda:0', grad_fn=<AddBackward0>)



 73%|███████▎  | 259/354 [23:27<08:31,  5.39s/it][A

	loss_cls: tensor(0.4756, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1947, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6703, device='cuda:0', grad_fn=<AddBackward0>)



 73%|███████▎  | 260/354 [23:33<08:27,  5.40s/it][A

	loss_cls: tensor(0.6008, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3770, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9778, device='cuda:0', grad_fn=<AddBackward0>)



 74%|███████▎  | 261/354 [23:38<08:20,  5.38s/it][A

	loss_cls: tensor(0.4933, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3667, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8600, device='cuda:0', grad_fn=<AddBackward0>)



 74%|███████▍  | 262/354 [23:44<08:16,  5.40s/it][A

	loss_cls: tensor(0.6924, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3123, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0046, device='cuda:0', grad_fn=<AddBackward0>)



 74%|███████▍  | 263/354 [23:49<08:10,  5.39s/it][A

	loss_cls: tensor(0.6863, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2269, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9132, device='cuda:0', grad_fn=<AddBackward0>)



 75%|███████▍  | 264/354 [23:54<08:06,  5.40s/it][A

	loss_cls: tensor(0.4731, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1950, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6681, device='cuda:0', grad_fn=<AddBackward0>)



 75%|███████▍  | 265/354 [24:00<07:59,  5.39s/it][A

	loss_cls: tensor(0.3643, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3043, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6685, device='cuda:0', grad_fn=<AddBackward0>)



 75%|███████▌  | 266/354 [24:05<07:53,  5.38s/it][A

	loss_cls: tensor(0.7946, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1948, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9894, device='cuda:0', grad_fn=<AddBackward0>)



 75%|███████▌  | 267/354 [24:11<07:49,  5.39s/it][A

	loss_cls: tensor(0.6465, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2617, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9082, device='cuda:0', grad_fn=<AddBackward0>)



 76%|███████▌  | 268/354 [24:16<07:42,  5.38s/it][A

	loss_cls: tensor(0.4818, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1246, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6063, device='cuda:0', grad_fn=<AddBackward0>)



 76%|███████▌  | 269/354 [24:21<07:40,  5.41s/it][A

	loss_cls: tensor(0.7693, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2355, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0048, device='cuda:0', grad_fn=<AddBackward0>)



 76%|███████▋  | 270/354 [24:27<07:35,  5.42s/it][A

	loss_cls: tensor(1.1581, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1701, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.3282, device='cuda:0', grad_fn=<AddBackward0>)



 77%|███████▋  | 271/354 [24:32<07:31,  5.44s/it][A

	loss_cls: tensor(0.6207, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1909, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8116, device='cuda:0', grad_fn=<AddBackward0>)



 77%|███████▋  | 272/354 [24:38<07:24,  5.42s/it][A

	loss_cls: tensor(0.4749, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1618, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6367, device='cuda:0', grad_fn=<AddBackward0>)



 77%|███████▋  | 273/354 [24:43<07:17,  5.40s/it][A

	loss_cls: tensor(0.5620, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1679, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7299, device='cuda:0', grad_fn=<AddBackward0>)



 77%|███████▋  | 274/354 [24:48<07:12,  5.41s/it][A

	loss_cls: tensor(0.4262, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0993, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5256, device='cuda:0', grad_fn=<AddBackward0>)



 78%|███████▊  | 275/354 [24:54<07:05,  5.39s/it][A

	loss_cls: tensor(0.4851, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1747, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6598, device='cuda:0', grad_fn=<AddBackward0>)



 78%|███████▊  | 276/354 [24:59<07:02,  5.42s/it][A

	loss_cls: tensor(0.6423, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1921, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8344, device='cuda:0', grad_fn=<AddBackward0>)



 78%|███████▊  | 277/354 [25:05<06:56,  5.41s/it][A

	loss_cls: tensor(0.5702, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0872, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6574, device='cuda:0', grad_fn=<AddBackward0>)



 79%|███████▊  | 278/354 [25:10<06:52,  5.43s/it][A

	loss_cls: tensor(0.7309, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2710, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0020, device='cuda:0', grad_fn=<AddBackward0>)



 79%|███████▉  | 279/354 [25:15<06:45,  5.41s/it][A

	loss_cls: tensor(0.5316, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2131, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7447, device='cuda:0', grad_fn=<AddBackward0>)



 79%|███████▉  | 280/354 [25:21<06:39,  5.39s/it][A

	loss_cls: tensor(0.2793, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2515, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5308, device='cuda:0', grad_fn=<AddBackward0>)



 79%|███████▉  | 281/354 [25:26<06:34,  5.41s/it][A

	loss_cls: tensor(0.4142, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3071, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7213, device='cuda:0', grad_fn=<AddBackward0>)



 80%|███████▉  | 282/354 [25:32<06:29,  5.40s/it][A

	loss_cls: tensor(0.5740, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2048, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7787, device='cuda:0', grad_fn=<AddBackward0>)



 80%|███████▉  | 283/354 [25:37<06:25,  5.43s/it][A

	loss_cls: tensor(0.4136, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3117, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7253, device='cuda:0', grad_fn=<AddBackward0>)



 80%|████████  | 284/354 [25:43<06:18,  5.41s/it][A

	loss_cls: tensor(0.7531, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1342, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8873, device='cuda:0', grad_fn=<AddBackward0>)



 81%|████████  | 285/354 [25:48<06:13,  5.41s/it][A

	loss_cls: tensor(0.4436, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1278, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5714, device='cuda:0', grad_fn=<AddBackward0>)



 81%|████████  | 286/354 [25:53<06:07,  5.40s/it][A

	loss_cls: tensor(0.5235, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3462, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8698, device='cuda:0', grad_fn=<AddBackward0>)



 81%|████████  | 287/354 [25:59<06:01,  5.39s/it][A

	loss_cls: tensor(0.5188, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2200, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7387, device='cuda:0', grad_fn=<AddBackward0>)



 81%|████████▏ | 288/354 [26:04<05:56,  5.41s/it][A

	loss_cls: tensor(0.5771, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2447, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8218, device='cuda:0', grad_fn=<AddBackward0>)



 82%|████████▏ | 289/354 [26:10<05:50,  5.40s/it][A

	loss_cls: tensor(0.7484, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2750, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0234, device='cuda:0', grad_fn=<AddBackward0>)



 82%|████████▏ | 290/354 [26:15<05:47,  5.42s/it][A

	loss_cls: tensor(0.5631, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1762, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7392, device='cuda:0', grad_fn=<AddBackward0>)



 82%|████████▏ | 291/354 [26:20<05:41,  5.42s/it][A

	loss_cls: tensor(0.5086, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2288, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7374, device='cuda:0', grad_fn=<AddBackward0>)



 82%|████████▏ | 292/354 [26:26<05:36,  5.43s/it][A

	loss_cls: tensor(0.4057, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3036, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7093, device='cuda:0', grad_fn=<AddBackward0>)



 83%|████████▎ | 293/354 [26:31<05:32,  5.45s/it][A

	loss_cls: tensor(0.8649, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3275, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1924, device='cuda:0', grad_fn=<AddBackward0>)



 83%|████████▎ | 294/354 [26:37<05:24,  5.41s/it][A

	loss_cls: tensor(0.4068, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2268, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6336, device='cuda:0', grad_fn=<AddBackward0>)



 83%|████████▎ | 295/354 [26:42<05:19,  5.41s/it][A

	loss_cls: tensor(0.4816, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2648, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7464, device='cuda:0', grad_fn=<AddBackward0>)



 84%|████████▎ | 296/354 [26:47<05:12,  5.39s/it][A

	loss_cls: tensor(0.5522, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2368, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7890, device='cuda:0', grad_fn=<AddBackward0>)



 84%|████████▍ | 297/354 [26:53<05:08,  5.41s/it][A

	loss_cls: tensor(0.8306, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1284, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9589, device='cuda:0', grad_fn=<AddBackward0>)



 84%|████████▍ | 298/354 [26:58<05:02,  5.41s/it][A

	loss_cls: tensor(0.5618, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3293, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8912, device='cuda:0', grad_fn=<AddBackward0>)



 84%|████████▍ | 299/354 [27:04<04:57,  5.42s/it][A

	loss_cls: tensor(0.4240, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1829, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6069, device='cuda:0', grad_fn=<AddBackward0>)



 85%|████████▍ | 300/354 [27:09<04:51,  5.40s/it][A

	loss_cls: tensor(0.4064, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4196, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8260, device='cuda:0', grad_fn=<AddBackward0>)



 85%|████████▌ | 301/354 [27:14<04:45,  5.38s/it][A

	loss_cls: tensor(0.5605, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2120, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7724, device='cuda:0', grad_fn=<AddBackward0>)



 85%|████████▌ | 302/354 [27:20<04:40,  5.40s/it][A

	loss_cls: tensor(0.6158, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.7588, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.3746, device='cuda:0', grad_fn=<AddBackward0>)



 86%|████████▌ | 303/354 [27:25<04:35,  5.40s/it][A

	loss_cls: tensor(0.7241, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1623, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8864, device='cuda:0', grad_fn=<AddBackward0>)



 86%|████████▌ | 304/354 [27:31<04:31,  5.42s/it][A

	loss_cls: tensor(0.7315, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3821, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1136, device='cuda:0', grad_fn=<AddBackward0>)



 86%|████████▌ | 305/354 [27:36<04:24,  5.41s/it][A

	loss_cls: tensor(0.8525, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4949, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.3474, device='cuda:0', grad_fn=<AddBackward0>)



 86%|████████▋ | 306/354 [27:42<04:20,  5.42s/it][A

	loss_cls: tensor(0.5868, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1929, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7797, device='cuda:0', grad_fn=<AddBackward0>)



 87%|████████▋ | 307/354 [27:47<04:13,  5.40s/it][A

	loss_cls: tensor(0.4580, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1368, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5949, device='cuda:0', grad_fn=<AddBackward0>)



 87%|████████▋ | 308/354 [27:52<04:07,  5.39s/it][A

	loss_cls: tensor(0.6036, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1398, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7434, device='cuda:0', grad_fn=<AddBackward0>)



 87%|████████▋ | 309/354 [27:58<04:03,  5.40s/it][A

	loss_cls: tensor(0.6834, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0979, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7814, device='cuda:0', grad_fn=<AddBackward0>)



 88%|████████▊ | 310/354 [28:03<03:57,  5.40s/it][A

	loss_cls: tensor(0.4116, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2866, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6982, device='cuda:0', grad_fn=<AddBackward0>)



 88%|████████▊ | 311/354 [28:09<03:52,  5.41s/it][A

	loss_cls: tensor(0.6641, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5345, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1986, device='cuda:0', grad_fn=<AddBackward0>)



 88%|████████▊ | 312/354 [28:14<03:46,  5.40s/it][A

	loss_cls: tensor(0.6088, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2637, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8726, device='cuda:0', grad_fn=<AddBackward0>)



 88%|████████▊ | 313/354 [28:19<03:41,  5.41s/it][A

	loss_cls: tensor(0.4490, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1254, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5744, device='cuda:0', grad_fn=<AddBackward0>)



 89%|████████▊ | 314/354 [28:25<03:35,  5.39s/it][A

	loss_cls: tensor(0.4326, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1399, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5725, device='cuda:0', grad_fn=<AddBackward0>)



 89%|████████▉ | 315/354 [28:30<03:29,  5.37s/it][A

	loss_cls: tensor(0.6856, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1765, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8622, device='cuda:0', grad_fn=<AddBackward0>)



 89%|████████▉ | 316/354 [28:35<03:25,  5.40s/it][A

	loss_cls: tensor(0.5792, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2198, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7990, device='cuda:0', grad_fn=<AddBackward0>)



 90%|████████▉ | 317/354 [28:41<03:19,  5.39s/it][A

	loss_cls: tensor(0.6271, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1478, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7748, device='cuda:0', grad_fn=<AddBackward0>)



 90%|████████▉ | 318/354 [28:46<03:15,  5.42s/it][A

	loss_cls: tensor(0.5359, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1718, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7076, device='cuda:0', grad_fn=<AddBackward0>)



 90%|█████████ | 319/354 [28:52<03:09,  5.41s/it][A

	loss_cls: tensor(0.5399, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1051, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6450, device='cuda:0', grad_fn=<AddBackward0>)



 90%|█████████ | 320/354 [28:57<03:04,  5.42s/it][A

	loss_cls: tensor(0.6197, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0559, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6756, device='cuda:0', grad_fn=<AddBackward0>)



 91%|█████████ | 321/354 [29:03<02:58,  5.40s/it][A

	loss_cls: tensor(0.6346, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3119, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9465, device='cuda:0', grad_fn=<AddBackward0>)



 91%|█████████ | 322/354 [29:08<02:52,  5.40s/it][A

	loss_cls: tensor(0.5130, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1578, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6708, device='cuda:0', grad_fn=<AddBackward0>)



 91%|█████████ | 323/354 [29:13<02:47,  5.40s/it][A

	loss_cls: tensor(0.5258, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3111, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8369, device='cuda:0', grad_fn=<AddBackward0>)



 92%|█████████▏| 324/354 [29:19<02:41,  5.39s/it][A

	loss_cls: tensor(0.8067, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2646, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0714, device='cuda:0', grad_fn=<AddBackward0>)



 92%|█████████▏| 325/354 [29:24<02:37,  5.42s/it][A

	loss_cls: tensor(1.2338, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4858, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.7196, device='cuda:0', grad_fn=<AddBackward0>)



 92%|█████████▏| 326/354 [29:30<02:31,  5.41s/it][A

	loss_cls: tensor(0.6524, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2927, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9451, device='cuda:0', grad_fn=<AddBackward0>)



 92%|█████████▏| 327/354 [29:35<02:26,  5.42s/it][A

	loss_cls: tensor(0.8230, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3601, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1831, device='cuda:0', grad_fn=<AddBackward0>)



 93%|█████████▎| 328/354 [29:40<02:20,  5.40s/it][A

	loss_cls: tensor(0.4606, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3951, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8557, device='cuda:0', grad_fn=<AddBackward0>)



 93%|█████████▎| 329/354 [29:46<02:15,  5.41s/it][A

	loss_cls: tensor(1.3300, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1570, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.4870, device='cuda:0', grad_fn=<AddBackward0>)



 93%|█████████▎| 330/354 [29:51<02:09,  5.40s/it][A

	loss_cls: tensor(0.4005, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3050, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7054, device='cuda:0', grad_fn=<AddBackward0>)



 94%|█████████▎| 331/354 [29:57<02:03,  5.39s/it][A

	loss_cls: tensor(0.5337, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1609, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6946, device='cuda:0', grad_fn=<AddBackward0>)



 94%|█████████▍| 332/354 [30:02<01:59,  5.42s/it][A

	loss_cls: tensor(0.7802, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2078, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9880, device='cuda:0', grad_fn=<AddBackward0>)



 94%|█████████▍| 333/354 [30:07<01:53,  5.40s/it][A

	loss_cls: tensor(0.4687, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2300, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6987, device='cuda:0', grad_fn=<AddBackward0>)



 94%|█████████▍| 334/354 [30:13<01:48,  5.41s/it][A

	loss_cls: tensor(0.8352, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1838, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0190, device='cuda:0', grad_fn=<AddBackward0>)



 95%|█████████▍| 335/354 [30:18<01:42,  5.40s/it][A

	loss_cls: tensor(0.5071, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0493, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5564, device='cuda:0', grad_fn=<AddBackward0>)



 95%|█████████▍| 336/354 [30:24<01:37,  5.42s/it][A

	loss_cls: tensor(0.5731, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1070, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6801, device='cuda:0', grad_fn=<AddBackward0>)



 95%|█████████▌| 337/354 [30:29<01:31,  5.40s/it][A

	loss_cls: tensor(0.4673, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2525, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7198, device='cuda:0', grad_fn=<AddBackward0>)



 95%|█████████▌| 338/354 [30:34<01:26,  5.40s/it][A

	loss_cls: tensor(0.6414, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1325, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7739, device='cuda:0', grad_fn=<AddBackward0>)



 96%|█████████▌| 339/354 [30:40<01:21,  5.41s/it][A

	loss_cls: tensor(0.6060, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3834, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9894, device='cuda:0', grad_fn=<AddBackward0>)



 96%|█████████▌| 340/354 [30:45<01:15,  5.42s/it][A

	loss_cls: tensor(0.5966, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2982, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8948, device='cuda:0', grad_fn=<AddBackward0>)



 96%|█████████▋| 341/354 [30:51<01:10,  5.42s/it][A

	loss_cls: tensor(0.7627, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2375, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0002, device='cuda:0', grad_fn=<AddBackward0>)



 97%|█████████▋| 342/354 [30:56<01:04,  5.41s/it][A

	loss_cls: tensor(0.6603, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1398, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8001, device='cuda:0', grad_fn=<AddBackward0>)



 97%|█████████▋| 343/354 [31:02<00:59,  5.44s/it][A

	loss_cls: tensor(0.6597, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4050, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0647, device='cuda:0', grad_fn=<AddBackward0>)



 97%|█████████▋| 344/354 [31:07<00:54,  5.42s/it][A

	loss_cls: tensor(0.5018, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1558, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6575, device='cuda:0', grad_fn=<AddBackward0>)



 97%|█████████▋| 345/354 [31:12<00:48,  5.40s/it][A

	loss_cls: tensor(0.7091, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1473, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8564, device='cuda:0', grad_fn=<AddBackward0>)



 98%|█████████▊| 346/354 [31:18<00:43,  5.42s/it][A

	loss_cls: tensor(0.6681, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3741, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0422, device='cuda:0', grad_fn=<AddBackward0>)



 98%|█████████▊| 347/354 [31:23<00:37,  5.40s/it][A

	loss_cls: tensor(0.7414, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0759, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8172, device='cuda:0', grad_fn=<AddBackward0>)



 98%|█████████▊| 348/354 [31:29<00:32,  5.41s/it][A

	loss_cls: tensor(0.7075, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2110, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9185, device='cuda:0', grad_fn=<AddBackward0>)



 99%|█████████▊| 349/354 [31:34<00:26,  5.40s/it][A

	loss_cls: tensor(0.7516, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2437, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9952, device='cuda:0', grad_fn=<AddBackward0>)



 99%|█████████▉| 350/354 [31:39<00:21,  5.41s/it][A

	loss_cls: tensor(0.7119, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1559, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8678, device='cuda:0', grad_fn=<AddBackward0>)



 99%|█████████▉| 351/354 [31:45<00:16,  5.39s/it][A

	loss_cls: tensor(0.6268, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1279, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7547, device='cuda:0', grad_fn=<AddBackward0>)



 99%|█████████▉| 352/354 [31:50<00:10,  5.38s/it][A

	loss_cls: tensor(0.4430, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1572, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6002, device='cuda:0', grad_fn=<AddBackward0>)



100%|█████████▉| 353/354 [31:56<00:05,  5.39s/it][A

	loss_cls: tensor(0.4243, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2225, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6468, device='cuda:0', grad_fn=<AddBackward0>)



100%|██████████| 354/354 [31:58<00:00,  5.42s/it][A

  0%|          | 0/177 [00:00<?, ?it/s][A

	loss_cls: tensor(0.5939, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0487, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6426, device='cuda:0', grad_fn=<AddBackward0>)

	Training Loss: 0.8165552487312737

	Training cls acc: 0.6992702448210922

	Training cls prec: 0.5775583565837803

	Training cls rec: 0.6127355048435557

	Training cls f1: 0.5441252037837586

--
	Training ner acc: 0.9555629884464638

	Training ner prec: 0.2570565042700972

	Training ner rec: 0.2657109788850915

	Training ner f1: 0.26098117506260404

	Current Learning rate:  2.857142857142857e-05



  1%|          | 1/177 [00:00<02:16,  1.29it/s][A
  1%|          | 2/177 [00:01<02:14,  1.30it/s][A
  2%|▏         | 3/177 [00:02<02:05,  1.39it/s][A
  2%|▏         | 4/177 [00:02<02:05,  1.38it/s][A
  3%|▎         | 5/177 [00:03<02:06,  1.36it/s][A
  3%|▎         | 6/177 [00:04<02:06,  1.36it/s][A
  4%|▍         | 7/177 [00:05<02:01,  1.40it/s][A
  5%|▍         | 8/177 [00:05<02:02,  1.38it/s][A
  5%|▌         | 9/177 [00:06<02:02,  1.37it/s][A
  6%|▌         | 10/177 [00:07<01:58,  1.41it/s][A
  6%|▌         | 11/177 [00:07<01:58,  1.40it/s][A
  7%|▋         | 12/177 [00:08<01:59,  1.38it/s][A
  7%|▋         | 13/177 [00:09<01:59,  1.37it/s][A
  8%|▊         | 14/177 [00:10<01:55,  1.41it/s][A
  8%|▊         | 15/177 [00:10<01:57,  1.38it/s][A
  9%|▉         | 16/177 [00:11<01:57,  1.37it/s][A
 10%|▉         | 17/177 [00:12<01:57,  1.36it/s][A
 10%|█         | 18/177 [00:13<01:53,  1.41it/s][A
 11%|█         | 19/177 [00:13<01:54,  1.38it/s][A
 11%|█▏        | 20/


	Validation Loss: 0.7714538323340443

	Validation cls acc: 0.7076271186440678

	Validation cls prec: 0.6088613128867366

	Validation cls rec: 0.5954465967177832

	Validation cls f1: 0.5566463354598948

--
	Validation ner acc: 0.9549828855784597

	Validation ner prec: 0.42116140458729495

	Validation ner rec: 0.4315442561205273

	Validation ner f1: 0.42614423806120827



  0%|          | 1/354 [00:05<31:45,  5.40s/it][A

	loss_cls: tensor(0.4971, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1236, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6207, device='cuda:0', grad_fn=<AddBackward0>)



  1%|          | 2/354 [00:10<31:49,  5.42s/it][A

	loss_cls: tensor(0.5070, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1863, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6934, device='cuda:0', grad_fn=<AddBackward0>)



  1%|          | 3/354 [00:16<31:28,  5.38s/it][A

	loss_cls: tensor(0.6733, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3717, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0450, device='cuda:0', grad_fn=<AddBackward0>)



  1%|          | 4/354 [00:21<31:19,  5.37s/it][A

	loss_cls: tensor(0.9028, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2463, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1491, device='cuda:0', grad_fn=<AddBackward0>)



  1%|▏         | 5/354 [00:26<31:22,  5.39s/it][A

	loss_cls: tensor(0.5487, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4088, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9575, device='cuda:0', grad_fn=<AddBackward0>)



  2%|▏         | 6/354 [00:32<31:15,  5.39s/it][A

	loss_cls: tensor(0.8661, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2090, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0752, device='cuda:0', grad_fn=<AddBackward0>)



  2%|▏         | 7/354 [00:37<31:16,  5.41s/it][A

	loss_cls: tensor(0.5019, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2808, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7827, device='cuda:0', grad_fn=<AddBackward0>)



  2%|▏         | 8/354 [00:43<31:08,  5.40s/it][A

	loss_cls: tensor(0.4924, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2387, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7311, device='cuda:0', grad_fn=<AddBackward0>)



  3%|▎         | 9/354 [00:48<31:09,  5.42s/it][A

	loss_cls: tensor(0.5993, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0470, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6463, device='cuda:0', grad_fn=<AddBackward0>)



  3%|▎         | 10/354 [00:54<31:06,  5.43s/it][A

	loss_cls: tensor(0.8945, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1677, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0622, device='cuda:0', grad_fn=<AddBackward0>)



  3%|▎         | 11/354 [00:59<31:13,  5.46s/it][A

	loss_cls: tensor(0.7080, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1639, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8719, device='cuda:0', grad_fn=<AddBackward0>)



  3%|▎         | 12/354 [01:05<31:03,  5.45s/it][A

	loss_cls: tensor(0.4266, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4803, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9069, device='cuda:0', grad_fn=<AddBackward0>)



  4%|▎         | 13/354 [01:10<30:49,  5.42s/it][A

	loss_cls: tensor(0.4750, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0495, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5244, device='cuda:0', grad_fn=<AddBackward0>)



  4%|▍         | 14/354 [01:15<30:49,  5.44s/it][A

	loss_cls: tensor(0.4237, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1631, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5869, device='cuda:0', grad_fn=<AddBackward0>)



  4%|▍         | 15/354 [01:21<30:36,  5.42s/it][A

	loss_cls: tensor(0.7934, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3618, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1552, device='cuda:0', grad_fn=<AddBackward0>)



  5%|▍         | 16/354 [01:26<30:35,  5.43s/it][A

	loss_cls: tensor(0.3171, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2003, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5174, device='cuda:0', grad_fn=<AddBackward0>)



  5%|▍         | 17/354 [01:32<30:22,  5.41s/it][A

	loss_cls: tensor(0.4814, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0874, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5689, device='cuda:0', grad_fn=<AddBackward0>)



  5%|▌         | 18/354 [01:37<30:22,  5.42s/it][A

	loss_cls: tensor(0.5566, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1138, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6704, device='cuda:0', grad_fn=<AddBackward0>)



  5%|▌         | 19/354 [01:42<30:13,  5.41s/it][A

	loss_cls: tensor(0.3831, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2330, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6161, device='cuda:0', grad_fn=<AddBackward0>)



  6%|▌         | 20/354 [01:48<30:05,  5.41s/it][A

	loss_cls: tensor(0.4275, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2161, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6436, device='cuda:0', grad_fn=<AddBackward0>)



  6%|▌         | 21/354 [01:53<30:03,  5.42s/it][A

	loss_cls: tensor(0.6461, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1376, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7838, device='cuda:0', grad_fn=<AddBackward0>)



  6%|▌         | 22/354 [01:59<29:54,  5.40s/it][A

	loss_cls: tensor(0.7769, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0900, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8669, device='cuda:0', grad_fn=<AddBackward0>)



  6%|▋         | 23/354 [02:04<29:51,  5.41s/it][A

	loss_cls: tensor(0.4292, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3269, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7561, device='cuda:0', grad_fn=<AddBackward0>)



  7%|▋         | 24/354 [02:09<29:41,  5.40s/it][A

	loss_cls: tensor(0.5173, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4048, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9222, device='cuda:0', grad_fn=<AddBackward0>)



  7%|▋         | 25/354 [02:15<29:40,  5.41s/it][A

	loss_cls: tensor(0.4821, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1689, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6509, device='cuda:0', grad_fn=<AddBackward0>)



  7%|▋         | 26/354 [02:20<29:30,  5.40s/it][A

	loss_cls: tensor(0.5668, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2349, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8017, device='cuda:0', grad_fn=<AddBackward0>)



  8%|▊         | 27/354 [02:26<29:21,  5.39s/it][A

	loss_cls: tensor(0.6045, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5375, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1420, device='cuda:0', grad_fn=<AddBackward0>)



  8%|▊         | 28/354 [02:31<29:23,  5.41s/it][A

	loss_cls: tensor(0.5251, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1892, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7143, device='cuda:0', grad_fn=<AddBackward0>)



  8%|▊         | 29/354 [02:36<29:12,  5.39s/it][A

	loss_cls: tensor(0.3907, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2183, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6090, device='cuda:0', grad_fn=<AddBackward0>)



  8%|▊         | 30/354 [02:42<29:11,  5.41s/it][A

	loss_cls: tensor(0.4419, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1952, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6371, device='cuda:0', grad_fn=<AddBackward0>)



  9%|▉         | 31/354 [02:47<28:59,  5.39s/it][A

	loss_cls: tensor(0.5025, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0912, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5937, device='cuda:0', grad_fn=<AddBackward0>)



  9%|▉         | 32/354 [02:53<28:58,  5.40s/it][A

	loss_cls: tensor(0.4948, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2201, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7149, device='cuda:0', grad_fn=<AddBackward0>)



  9%|▉         | 33/354 [02:58<28:51,  5.40s/it][A

	loss_cls: tensor(0.5499, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2400, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7899, device='cuda:0', grad_fn=<AddBackward0>)



 10%|▉         | 34/354 [03:03<28:44,  5.39s/it][A

	loss_cls: tensor(0.5070, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1860, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6929, device='cuda:0', grad_fn=<AddBackward0>)



 10%|▉         | 35/354 [03:09<28:46,  5.41s/it][A

	loss_cls: tensor(0.7319, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3295, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0614, device='cuda:0', grad_fn=<AddBackward0>)



 10%|█         | 36/354 [03:14<28:36,  5.40s/it][A

	loss_cls: tensor(0.6525, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0875, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7400, device='cuda:0', grad_fn=<AddBackward0>)



 10%|█         | 37/354 [03:20<28:35,  5.41s/it][A

	loss_cls: tensor(0.5883, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0678, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6561, device='cuda:0', grad_fn=<AddBackward0>)



 11%|█         | 38/354 [03:25<28:27,  5.40s/it][A

	loss_cls: tensor(0.5034, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1729, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6763, device='cuda:0', grad_fn=<AddBackward0>)



 11%|█         | 39/354 [03:30<28:25,  5.41s/it][A

	loss_cls: tensor(0.4903, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2065, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6968, device='cuda:0', grad_fn=<AddBackward0>)



 11%|█▏        | 40/354 [03:36<28:17,  5.41s/it][A

	loss_cls: tensor(0.4997, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2178, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7176, device='cuda:0', grad_fn=<AddBackward0>)



 12%|█▏        | 41/354 [03:41<28:05,  5.39s/it][A

	loss_cls: tensor(0.8296, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3426, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1722, device='cuda:0', grad_fn=<AddBackward0>)



 12%|█▏        | 42/354 [03:47<28:08,  5.41s/it][A

	loss_cls: tensor(0.4487, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2677, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7164, device='cuda:0', grad_fn=<AddBackward0>)



 12%|█▏        | 43/354 [03:52<27:57,  5.39s/it][A

	loss_cls: tensor(0.5363, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1267, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6630, device='cuda:0', grad_fn=<AddBackward0>)



 12%|█▏        | 44/354 [03:57<27:55,  5.41s/it][A

	loss_cls: tensor(0.5165, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0778, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5943, device='cuda:0', grad_fn=<AddBackward0>)



 13%|█▎        | 45/354 [04:03<27:45,  5.39s/it][A

	loss_cls: tensor(0.7973, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2643, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0616, device='cuda:0', grad_fn=<AddBackward0>)



 13%|█▎        | 46/354 [04:08<27:46,  5.41s/it][A

	loss_cls: tensor(0.5776, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3148, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8924, device='cuda:0', grad_fn=<AddBackward0>)



 13%|█▎        | 47/354 [04:14<27:35,  5.39s/it][A

	loss_cls: tensor(0.4614, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4834, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9448, device='cuda:0', grad_fn=<AddBackward0>)



 14%|█▎        | 48/354 [04:19<27:29,  5.39s/it][A

	loss_cls: tensor(0.4809, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2198, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7006, device='cuda:0', grad_fn=<AddBackward0>)



 14%|█▍        | 49/354 [04:24<27:25,  5.40s/it][A

	loss_cls: tensor(0.7895, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2504, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0399, device='cuda:0', grad_fn=<AddBackward0>)



 14%|█▍        | 50/354 [04:30<27:18,  5.39s/it][A

	loss_cls: tensor(0.5437, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2577, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8014, device='cuda:0', grad_fn=<AddBackward0>)



 14%|█▍        | 51/354 [04:35<27:17,  5.40s/it][A

	loss_cls: tensor(0.5960, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1193, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7154, device='cuda:0', grad_fn=<AddBackward0>)



 15%|█▍        | 52/354 [04:41<27:10,  5.40s/it][A

	loss_cls: tensor(0.4538, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1337, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5875, device='cuda:0', grad_fn=<AddBackward0>)



 15%|█▍        | 53/354 [04:46<27:09,  5.41s/it][A

	loss_cls: tensor(0.4882, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1118, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6000, device='cuda:0', grad_fn=<AddBackward0>)



 15%|█▌        | 54/354 [04:51<27:00,  5.40s/it][A

	loss_cls: tensor(0.6700, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2775, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9475, device='cuda:0', grad_fn=<AddBackward0>)



 16%|█▌        | 55/354 [04:57<26:52,  5.39s/it][A

	loss_cls: tensor(0.6859, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0504, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7363, device='cuda:0', grad_fn=<AddBackward0>)



 16%|█▌        | 56/354 [05:02<26:49,  5.40s/it][A

	loss_cls: tensor(0.5745, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1621, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7367, device='cuda:0', grad_fn=<AddBackward0>)



 16%|█▌        | 57/354 [05:08<26:46,  5.41s/it][A

	loss_cls: tensor(0.6011, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2075, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8085, device='cuda:0', grad_fn=<AddBackward0>)



 16%|█▋        | 58/354 [05:13<26:53,  5.45s/it][A

	loss_cls: tensor(0.7392, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3076, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0467, device='cuda:0', grad_fn=<AddBackward0>)



 17%|█▋        | 59/354 [05:19<26:41,  5.43s/it][A

	loss_cls: tensor(0.5771, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2540, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8311, device='cuda:0', grad_fn=<AddBackward0>)



 17%|█▋        | 60/354 [05:24<26:38,  5.44s/it][A

	loss_cls: tensor(0.5636, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1599, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7235, device='cuda:0', grad_fn=<AddBackward0>)



 17%|█▋        | 61/354 [05:29<26:27,  5.42s/it][A

	loss_cls: tensor(0.8590, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1232, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9821, device='cuda:0', grad_fn=<AddBackward0>)



 18%|█▊        | 62/354 [05:35<26:17,  5.40s/it][A

	loss_cls: tensor(0.6088, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3407, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9495, device='cuda:0', grad_fn=<AddBackward0>)



 18%|█▊        | 63/354 [05:40<26:15,  5.41s/it][A

	loss_cls: tensor(0.5025, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4676, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9700, device='cuda:0', grad_fn=<AddBackward0>)



 18%|█▊        | 64/354 [05:46<26:05,  5.40s/it][A

	loss_cls: tensor(0.6737, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2727, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9463, device='cuda:0', grad_fn=<AddBackward0>)



 18%|█▊        | 65/354 [05:51<26:06,  5.42s/it][A

	loss_cls: tensor(0.5266, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2719, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7985, device='cuda:0', grad_fn=<AddBackward0>)



 19%|█▊        | 66/354 [05:56<25:58,  5.41s/it][A

	loss_cls: tensor(0.3753, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0756, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4509, device='cuda:0', grad_fn=<AddBackward0>)



 19%|█▉        | 67/354 [06:02<25:56,  5.42s/it][A

	loss_cls: tensor(0.4280, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1101, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5382, device='cuda:0', grad_fn=<AddBackward0>)



 19%|█▉        | 68/354 [06:07<25:47,  5.41s/it][A

	loss_cls: tensor(0.5974, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3276, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9250, device='cuda:0', grad_fn=<AddBackward0>)



 19%|█▉        | 69/354 [06:13<25:40,  5.41s/it][A

	loss_cls: tensor(0.4849, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3111, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7960, device='cuda:0', grad_fn=<AddBackward0>)



 20%|█▉        | 70/354 [06:18<25:37,  5.41s/it][A

	loss_cls: tensor(0.6570, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4127, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0697, device='cuda:0', grad_fn=<AddBackward0>)



 20%|██        | 71/354 [06:23<25:27,  5.40s/it][A

	loss_cls: tensor(0.4095, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0710, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4805, device='cuda:0', grad_fn=<AddBackward0>)



 20%|██        | 72/354 [06:29<25:29,  5.43s/it][A

	loss_cls: tensor(0.5545, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4856, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0401, device='cuda:0', grad_fn=<AddBackward0>)



 21%|██        | 73/354 [06:34<25:19,  5.41s/it][A

	loss_cls: tensor(0.4643, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3219, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7862, device='cuda:0', grad_fn=<AddBackward0>)



 21%|██        | 74/354 [06:40<25:16,  5.42s/it][A

	loss_cls: tensor(0.6971, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2412, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9383, device='cuda:0', grad_fn=<AddBackward0>)



 21%|██        | 75/354 [06:45<25:07,  5.40s/it][A

	loss_cls: tensor(0.3691, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0451, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4143, device='cuda:0', grad_fn=<AddBackward0>)



 21%|██▏       | 76/354 [06:50<24:58,  5.39s/it][A

	loss_cls: tensor(0.4886, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0735, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5621, device='cuda:0', grad_fn=<AddBackward0>)



 22%|██▏       | 77/354 [06:56<24:57,  5.41s/it][A

	loss_cls: tensor(0.5302, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2372, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7674, device='cuda:0', grad_fn=<AddBackward0>)



 22%|██▏       | 78/354 [07:01<24:49,  5.40s/it][A

	loss_cls: tensor(0.5818, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1668, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7485, device='cuda:0', grad_fn=<AddBackward0>)



 22%|██▏       | 79/354 [07:07<24:49,  5.42s/it][A

	loss_cls: tensor(1.1358, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3063, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.4421, device='cuda:0', grad_fn=<AddBackward0>)



 23%|██▎       | 80/354 [07:12<24:39,  5.40s/it][A

	loss_cls: tensor(0.4974, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1393, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6367, device='cuda:0', grad_fn=<AddBackward0>)



 23%|██▎       | 81/354 [07:18<24:44,  5.44s/it][A

	loss_cls: tensor(0.6095, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1909, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8004, device='cuda:0', grad_fn=<AddBackward0>)



 23%|██▎       | 82/354 [07:23<24:34,  5.42s/it][A

	loss_cls: tensor(0.3885, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3122, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7007, device='cuda:0', grad_fn=<AddBackward0>)



 23%|██▎       | 83/354 [07:28<24:23,  5.40s/it][A

	loss_cls: tensor(0.5960, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1493, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7453, device='cuda:0', grad_fn=<AddBackward0>)



 24%|██▎       | 84/354 [07:34<24:23,  5.42s/it][A

	loss_cls: tensor(0.4187, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1920, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6107, device='cuda:0', grad_fn=<AddBackward0>)



 24%|██▍       | 85/354 [07:39<24:13,  5.40s/it][A

	loss_cls: tensor(0.6111, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1871, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7982, device='cuda:0', grad_fn=<AddBackward0>)



 24%|██▍       | 86/354 [07:45<24:13,  5.42s/it][A

	loss_cls: tensor(0.4153, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3340, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7493, device='cuda:0', grad_fn=<AddBackward0>)



 25%|██▍       | 87/354 [07:50<24:01,  5.40s/it][A

	loss_cls: tensor(0.7030, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5945, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2975, device='cuda:0', grad_fn=<AddBackward0>)



 25%|██▍       | 88/354 [07:55<24:02,  5.42s/it][A

	loss_cls: tensor(0.5366, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1911, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7277, device='cuda:0', grad_fn=<AddBackward0>)



 25%|██▌       | 89/354 [08:01<23:52,  5.40s/it][A

	loss_cls: tensor(0.4232, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2444, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6676, device='cuda:0', grad_fn=<AddBackward0>)



 25%|██▌       | 90/354 [08:06<23:40,  5.38s/it][A

	loss_cls: tensor(0.7052, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1519, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8571, device='cuda:0', grad_fn=<AddBackward0>)



 26%|██▌       | 91/354 [08:12<23:38,  5.39s/it][A

	loss_cls: tensor(0.6052, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3777, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9829, device='cuda:0', grad_fn=<AddBackward0>)



 26%|██▌       | 92/354 [08:17<23:29,  5.38s/it][A

	loss_cls: tensor(0.3029, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3124, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6153, device='cuda:0', grad_fn=<AddBackward0>)



 26%|██▋       | 93/354 [08:22<23:31,  5.41s/it][A

	loss_cls: tensor(0.4578, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2202, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6779, device='cuda:0', grad_fn=<AddBackward0>)



 27%|██▋       | 94/354 [08:28<23:24,  5.40s/it][A

	loss_cls: tensor(0.4031, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2153, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6185, device='cuda:0', grad_fn=<AddBackward0>)



 27%|██▋       | 95/354 [08:33<23:21,  5.41s/it][A

	loss_cls: tensor(0.6481, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3390, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9872, device='cuda:0', grad_fn=<AddBackward0>)



 27%|██▋       | 96/354 [08:39<23:12,  5.40s/it][A

	loss_cls: tensor(0.5505, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2585, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8090, device='cuda:0', grad_fn=<AddBackward0>)



 27%|██▋       | 97/354 [08:44<23:03,  5.38s/it][A

	loss_cls: tensor(0.8373, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3999, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2372, device='cuda:0', grad_fn=<AddBackward0>)



 28%|██▊       | 98/354 [08:49<23:01,  5.40s/it][A

	loss_cls: tensor(0.4233, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1484, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5717, device='cuda:0', grad_fn=<AddBackward0>)



 28%|██▊       | 99/354 [08:55<22:53,  5.39s/it][A

	loss_cls: tensor(0.4805, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1673, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6478, device='cuda:0', grad_fn=<AddBackward0>)



 28%|██▊       | 100/354 [09:00<22:54,  5.41s/it][A

	loss_cls: tensor(0.5675, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2294, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7969, device='cuda:0', grad_fn=<AddBackward0>)



 29%|██▊       | 101/354 [09:06<22:44,  5.39s/it][A

	loss_cls: tensor(0.5818, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3124, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8942, device='cuda:0', grad_fn=<AddBackward0>)



 29%|██▉       | 102/354 [09:11<22:43,  5.41s/it][A

	loss_cls: tensor(0.5607, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2312, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7919, device='cuda:0', grad_fn=<AddBackward0>)



 29%|██▉       | 103/354 [09:16<22:33,  5.39s/it][A

	loss_cls: tensor(0.6371, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3271, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9643, device='cuda:0', grad_fn=<AddBackward0>)



 29%|██▉       | 104/354 [09:22<22:28,  5.40s/it][A

	loss_cls: tensor(0.6573, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1879, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8451, device='cuda:0', grad_fn=<AddBackward0>)



 30%|██▉       | 105/354 [09:27<22:26,  5.41s/it][A

	loss_cls: tensor(0.4034, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4655, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8688, device='cuda:0', grad_fn=<AddBackward0>)



 30%|██▉       | 106/354 [09:33<22:22,  5.41s/it][A

	loss_cls: tensor(0.5673, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1715, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7388, device='cuda:0', grad_fn=<AddBackward0>)



 30%|███       | 107/354 [09:38<22:21,  5.43s/it][A

	loss_cls: tensor(0.3653, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1472, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5125, device='cuda:0', grad_fn=<AddBackward0>)



 31%|███       | 108/354 [09:43<22:12,  5.42s/it][A

	loss_cls: tensor(0.4018, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1135, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5154, device='cuda:0', grad_fn=<AddBackward0>)



 31%|███       | 109/354 [09:49<22:09,  5.43s/it][A

	loss_cls: tensor(0.6631, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0468, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7099, device='cuda:0', grad_fn=<AddBackward0>)



 31%|███       | 110/354 [09:54<22:00,  5.41s/it][A

	loss_cls: tensor(0.6279, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1000, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7280, device='cuda:0', grad_fn=<AddBackward0>)



 31%|███▏      | 111/354 [10:00<21:56,  5.42s/it][A

	loss_cls: tensor(0.7558, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2537, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0095, device='cuda:0', grad_fn=<AddBackward0>)



 32%|███▏      | 112/354 [10:05<21:48,  5.41s/it][A

	loss_cls: tensor(0.5750, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2438, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8188, device='cuda:0', grad_fn=<AddBackward0>)



 32%|███▏      | 113/354 [10:11<21:41,  5.40s/it][A

	loss_cls: tensor(0.6319, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1198, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7517, device='cuda:0', grad_fn=<AddBackward0>)



 32%|███▏      | 114/354 [10:16<21:42,  5.43s/it][A

	loss_cls: tensor(0.3904, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1755, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5659, device='cuda:0', grad_fn=<AddBackward0>)



 32%|███▏      | 115/354 [10:21<21:33,  5.41s/it][A

	loss_cls: tensor(0.4425, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1230, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5655, device='cuda:0', grad_fn=<AddBackward0>)



 33%|███▎      | 116/354 [10:27<21:28,  5.41s/it][A

	loss_cls: tensor(0.5289, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1392, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6681, device='cuda:0', grad_fn=<AddBackward0>)



 33%|███▎      | 117/354 [10:32<21:20,  5.40s/it][A

	loss_cls: tensor(0.4981, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0446, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5426, device='cuda:0', grad_fn=<AddBackward0>)



 33%|███▎      | 118/354 [10:38<21:18,  5.42s/it][A

	loss_cls: tensor(0.5406, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2566, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7972, device='cuda:0', grad_fn=<AddBackward0>)



 34%|███▎      | 119/354 [10:43<21:08,  5.40s/it][A

	loss_cls: tensor(0.6212, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1364, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7576, device='cuda:0', grad_fn=<AddBackward0>)



 34%|███▍      | 120/354 [10:48<21:02,  5.39s/it][A

	loss_cls: tensor(0.6381, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2049, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8431, device='cuda:0', grad_fn=<AddBackward0>)



 34%|███▍      | 121/354 [10:54<20:59,  5.41s/it][A

	loss_cls: tensor(0.4962, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1603, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6566, device='cuda:0', grad_fn=<AddBackward0>)



 34%|███▍      | 122/354 [10:59<20:52,  5.40s/it][A

	loss_cls: tensor(0.6046, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3529, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9574, device='cuda:0', grad_fn=<AddBackward0>)



 35%|███▍      | 123/354 [11:05<20:49,  5.41s/it][A

	loss_cls: tensor(0.7837, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2836, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0674, device='cuda:0', grad_fn=<AddBackward0>)



 35%|███▌      | 124/354 [11:10<20:40,  5.39s/it][A

	loss_cls: tensor(0.6631, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4542, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1173, device='cuda:0', grad_fn=<AddBackward0>)



 35%|███▌      | 125/354 [11:15<20:37,  5.41s/it][A

	loss_cls: tensor(0.4567, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0867, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5435, device='cuda:0', grad_fn=<AddBackward0>)



 36%|███▌      | 126/354 [11:21<20:29,  5.39s/it][A

	loss_cls: tensor(0.5261, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2126, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7386, device='cuda:0', grad_fn=<AddBackward0>)



 36%|███▌      | 127/354 [11:26<20:21,  5.38s/it][A

	loss_cls: tensor(0.4403, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1411, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5814, device='cuda:0', grad_fn=<AddBackward0>)



 36%|███▌      | 128/354 [11:32<20:21,  5.40s/it][A

	loss_cls: tensor(0.7387, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2100, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9488, device='cuda:0', grad_fn=<AddBackward0>)



 36%|███▋      | 129/354 [11:37<20:15,  5.40s/it][A

	loss_cls: tensor(0.4757, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2233, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6991, device='cuda:0', grad_fn=<AddBackward0>)



 37%|███▋      | 130/354 [11:42<20:12,  5.41s/it][A

	loss_cls: tensor(0.4375, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0808, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5183, device='cuda:0', grad_fn=<AddBackward0>)



 37%|███▋      | 131/354 [11:48<20:02,  5.39s/it][A

	loss_cls: tensor(0.5631, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2360, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7991, device='cuda:0', grad_fn=<AddBackward0>)



 37%|███▋      | 132/354 [11:53<20:01,  5.41s/it][A

	loss_cls: tensor(0.4995, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2023, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7018, device='cuda:0', grad_fn=<AddBackward0>)



 38%|███▊      | 133/354 [11:59<19:52,  5.40s/it][A

	loss_cls: tensor(0.6590, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3136, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9726, device='cuda:0', grad_fn=<AddBackward0>)



 38%|███▊      | 134/354 [12:04<19:45,  5.39s/it][A

	loss_cls: tensor(0.4792, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2942, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7734, device='cuda:0', grad_fn=<AddBackward0>)



 38%|███▊      | 135/354 [12:09<19:43,  5.40s/it][A

	loss_cls: tensor(0.3163, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3302, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6465, device='cuda:0', grad_fn=<AddBackward0>)



 38%|███▊      | 136/354 [12:15<19:34,  5.39s/it][A

	loss_cls: tensor(0.3778, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0915, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4693, device='cuda:0', grad_fn=<AddBackward0>)



 39%|███▊      | 137/354 [12:20<19:30,  5.40s/it][A

	loss_cls: tensor(0.7260, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2266, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9527, device='cuda:0', grad_fn=<AddBackward0>)



 39%|███▉      | 138/354 [12:26<19:23,  5.39s/it][A

	loss_cls: tensor(0.6320, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3445, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9765, device='cuda:0', grad_fn=<AddBackward0>)



 39%|███▉      | 139/354 [12:31<19:20,  5.40s/it][A

	loss_cls: tensor(0.5195, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1450, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6646, device='cuda:0', grad_fn=<AddBackward0>)



 40%|███▉      | 140/354 [12:36<19:12,  5.39s/it][A

	loss_cls: tensor(0.7937, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2456, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0393, device='cuda:0', grad_fn=<AddBackward0>)



 40%|███▉      | 141/354 [12:42<19:04,  5.37s/it][A

	loss_cls: tensor(0.5976, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3989, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9965, device='cuda:0', grad_fn=<AddBackward0>)



 40%|████      | 142/354 [12:47<19:03,  5.39s/it][A

	loss_cls: tensor(0.6818, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3194, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0012, device='cuda:0', grad_fn=<AddBackward0>)



 40%|████      | 143/354 [12:52<18:55,  5.38s/it][A

	loss_cls: tensor(0.4901, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0912, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5814, device='cuda:0', grad_fn=<AddBackward0>)



 41%|████      | 144/354 [12:58<18:51,  5.39s/it][A

	loss_cls: tensor(0.4492, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4385, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8877, device='cuda:0', grad_fn=<AddBackward0>)



 41%|████      | 145/354 [13:03<18:44,  5.38s/it][A

	loss_cls: tensor(0.6460, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1754, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8214, device='cuda:0', grad_fn=<AddBackward0>)



 41%|████      | 146/354 [13:09<18:44,  5.41s/it][A

	loss_cls: tensor(1.0120, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1833, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1952, device='cuda:0', grad_fn=<AddBackward0>)



 42%|████▏     | 147/354 [13:14<18:35,  5.39s/it][A

	loss_cls: tensor(0.5558, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0998, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6556, device='cuda:0', grad_fn=<AddBackward0>)



 42%|████▏     | 148/354 [13:19<18:27,  5.38s/it][A

	loss_cls: tensor(0.5505, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2096, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7601, device='cuda:0', grad_fn=<AddBackward0>)



 42%|████▏     | 149/354 [13:25<18:23,  5.38s/it][A

	loss_cls: tensor(0.4114, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1246, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5360, device='cuda:0', grad_fn=<AddBackward0>)



 42%|████▏     | 150/354 [13:30<18:14,  5.37s/it][A

	loss_cls: tensor(0.4893, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3738, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8631, device='cuda:0', grad_fn=<AddBackward0>)



 43%|████▎     | 151/354 [13:36<18:11,  5.38s/it][A

	loss_cls: tensor(0.6481, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2949, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9430, device='cuda:0', grad_fn=<AddBackward0>)



 43%|████▎     | 152/354 [13:41<18:01,  5.35s/it][A

	loss_cls: tensor(0.5151, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1468, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6619, device='cuda:0', grad_fn=<AddBackward0>)



 43%|████▎     | 153/354 [13:46<18:00,  5.37s/it][A

	loss_cls: tensor(0.8093, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2916, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1008, device='cuda:0', grad_fn=<AddBackward0>)



 44%|████▎     | 154/354 [13:52<17:54,  5.37s/it][A

	loss_cls: tensor(0.5388, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3004, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8393, device='cuda:0', grad_fn=<AddBackward0>)



 44%|████▍     | 155/354 [13:57<17:47,  5.36s/it][A

	loss_cls: tensor(0.8518, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0963, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9481, device='cuda:0', grad_fn=<AddBackward0>)



 44%|████▍     | 156/354 [14:02<17:45,  5.38s/it][A

	loss_cls: tensor(0.6310, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1501, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7811, device='cuda:0', grad_fn=<AddBackward0>)



 44%|████▍     | 157/354 [14:08<17:37,  5.37s/it][A

	loss_cls: tensor(0.5061, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1609, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6669, device='cuda:0', grad_fn=<AddBackward0>)



 45%|████▍     | 158/354 [14:13<17:36,  5.39s/it][A

	loss_cls: tensor(1.0030, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4560, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.4590, device='cuda:0', grad_fn=<AddBackward0>)



 45%|████▍     | 159/354 [14:18<17:28,  5.38s/it][A

	loss_cls: tensor(0.6520, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2454, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8974, device='cuda:0', grad_fn=<AddBackward0>)



 45%|████▌     | 160/354 [14:24<17:26,  5.39s/it][A

	loss_cls: tensor(0.5991, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2687, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8678, device='cuda:0', grad_fn=<AddBackward0>)



 45%|████▌     | 161/354 [14:29<17:16,  5.37s/it][A

	loss_cls: tensor(1.0356, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4721, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.5076, device='cuda:0', grad_fn=<AddBackward0>)



 46%|████▌     | 162/354 [14:35<17:09,  5.36s/it][A

	loss_cls: tensor(0.7056, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0837, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7893, device='cuda:0', grad_fn=<AddBackward0>)



 46%|████▌     | 163/354 [14:40<17:06,  5.38s/it][A

	loss_cls: tensor(0.6577, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1071, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7648, device='cuda:0', grad_fn=<AddBackward0>)



 46%|████▋     | 164/354 [14:45<17:00,  5.37s/it][A

	loss_cls: tensor(0.4470, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1294, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5764, device='cuda:0', grad_fn=<AddBackward0>)



 47%|████▋     | 165/354 [14:51<16:59,  5.39s/it][A

	loss_cls: tensor(0.5245, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1328, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6572, device='cuda:0', grad_fn=<AddBackward0>)



 47%|████▋     | 166/354 [14:56<16:52,  5.38s/it][A

	loss_cls: tensor(0.5410, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1930, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7340, device='cuda:0', grad_fn=<AddBackward0>)



 47%|████▋     | 167/354 [15:02<16:49,  5.40s/it][A

	loss_cls: tensor(0.3852, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0945, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4798, device='cuda:0', grad_fn=<AddBackward0>)



 47%|████▋     | 168/354 [15:07<16:42,  5.39s/it][A

	loss_cls: tensor(0.6517, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1368, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7885, device='cuda:0', grad_fn=<AddBackward0>)



 48%|████▊     | 169/354 [15:12<16:33,  5.37s/it][A

	loss_cls: tensor(0.3986, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1593, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5579, device='cuda:0', grad_fn=<AddBackward0>)



 48%|████▊     | 170/354 [15:18<16:31,  5.39s/it][A

	loss_cls: tensor(0.6288, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2269, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8557, device='cuda:0', grad_fn=<AddBackward0>)



 48%|████▊     | 171/354 [15:23<16:23,  5.38s/it][A

	loss_cls: tensor(0.5030, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0510, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5540, device='cuda:0', grad_fn=<AddBackward0>)



 49%|████▊     | 172/354 [15:29<16:24,  5.41s/it][A

	loss_cls: tensor(0.5235, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0459, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5694, device='cuda:0', grad_fn=<AddBackward0>)



 49%|████▉     | 173/354 [15:34<16:16,  5.39s/it][A

	loss_cls: tensor(0.5406, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2429, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7835, device='cuda:0', grad_fn=<AddBackward0>)



 49%|████▉     | 174/354 [15:39<16:12,  5.40s/it][A

	loss_cls: tensor(0.5164, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3477, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8642, device='cuda:0', grad_fn=<AddBackward0>)



 49%|████▉     | 175/354 [15:45<16:04,  5.39s/it][A

	loss_cls: tensor(0.6521, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0494, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7015, device='cuda:0', grad_fn=<AddBackward0>)



 50%|████▉     | 176/354 [15:50<15:56,  5.37s/it][A

	loss_cls: tensor(0.3763, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3584, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7346, device='cuda:0', grad_fn=<AddBackward0>)



 50%|█████     | 177/354 [15:56<15:57,  5.41s/it][A

	loss_cls: tensor(0.7499, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0495, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7994, device='cuda:0', grad_fn=<AddBackward0>)



 50%|█████     | 178/354 [16:01<15:51,  5.40s/it][A

	loss_cls: tensor(0.7611, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1877, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9488, device='cuda:0', grad_fn=<AddBackward0>)



 51%|█████     | 179/354 [16:06<15:49,  5.42s/it][A

	loss_cls: tensor(0.4575, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2065, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6640, device='cuda:0', grad_fn=<AddBackward0>)



 51%|█████     | 180/354 [16:12<15:41,  5.41s/it][A

	loss_cls: tensor(1.1396, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1708, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.3105, device='cuda:0', grad_fn=<AddBackward0>)



 51%|█████     | 181/354 [16:17<15:37,  5.42s/it][A

	loss_cls: tensor(0.6313, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4350, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0663, device='cuda:0', grad_fn=<AddBackward0>)



 51%|█████▏    | 182/354 [16:23<15:28,  5.40s/it][A

	loss_cls: tensor(0.4234, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1392, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5626, device='cuda:0', grad_fn=<AddBackward0>)



 52%|█████▏    | 183/354 [16:28<15:21,  5.39s/it][A

	loss_cls: tensor(0.4625, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1614, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6239, device='cuda:0', grad_fn=<AddBackward0>)



 52%|█████▏    | 184/354 [16:33<15:18,  5.40s/it][A

	loss_cls: tensor(0.4249, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1568, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5816, device='cuda:0', grad_fn=<AddBackward0>)



 52%|█████▏    | 185/354 [16:39<15:10,  5.39s/it][A

	loss_cls: tensor(0.4501, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4653, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9154, device='cuda:0', grad_fn=<AddBackward0>)



 53%|█████▎    | 186/354 [16:44<15:09,  5.41s/it][A

	loss_cls: tensor(0.4442, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0382, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4824, device='cuda:0', grad_fn=<AddBackward0>)



 53%|█████▎    | 187/354 [16:50<15:00,  5.39s/it][A

	loss_cls: tensor(0.5063, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0387, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5449, device='cuda:0', grad_fn=<AddBackward0>)



 53%|█████▎    | 188/354 [16:55<14:57,  5.41s/it][A

	loss_cls: tensor(0.4618, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0717, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5335, device='cuda:0', grad_fn=<AddBackward0>)



 53%|█████▎    | 189/354 [17:00<14:48,  5.39s/it][A

	loss_cls: tensor(0.4451, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1461, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5912, device='cuda:0', grad_fn=<AddBackward0>)



 54%|█████▎    | 190/354 [17:06<14:41,  5.38s/it][A

	loss_cls: tensor(0.7394, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0793, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8187, device='cuda:0', grad_fn=<AddBackward0>)



 54%|█████▍    | 191/354 [17:11<14:38,  5.39s/it][A

	loss_cls: tensor(0.5188, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1101, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6289, device='cuda:0', grad_fn=<AddBackward0>)



 54%|█████▍    | 192/354 [17:16<14:32,  5.38s/it][A

	loss_cls: tensor(0.4989, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2617, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7606, device='cuda:0', grad_fn=<AddBackward0>)



 55%|█████▍    | 193/354 [17:22<14:28,  5.39s/it][A

	loss_cls: tensor(0.4155, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1244, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5399, device='cuda:0', grad_fn=<AddBackward0>)



 55%|█████▍    | 194/354 [17:27<14:21,  5.38s/it][A

	loss_cls: tensor(0.6552, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4110, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0663, device='cuda:0', grad_fn=<AddBackward0>)



 55%|█████▌    | 195/354 [17:33<14:17,  5.39s/it][A

	loss_cls: tensor(0.5090, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3019, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8109, device='cuda:0', grad_fn=<AddBackward0>)



 55%|█████▌    | 196/354 [17:38<14:09,  5.38s/it][A

	loss_cls: tensor(0.4026, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1482, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5508, device='cuda:0', grad_fn=<AddBackward0>)



 56%|█████▌    | 197/354 [17:43<14:01,  5.36s/it][A

	loss_cls: tensor(0.5708, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0900, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6608, device='cuda:0', grad_fn=<AddBackward0>)



 56%|█████▌    | 198/354 [17:49<13:59,  5.38s/it][A

	loss_cls: tensor(0.6099, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1766, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7865, device='cuda:0', grad_fn=<AddBackward0>)



 56%|█████▌    | 199/354 [17:54<13:53,  5.37s/it][A

	loss_cls: tensor(0.6483, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1834, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8318, device='cuda:0', grad_fn=<AddBackward0>)



 56%|█████▋    | 200/354 [18:00<13:52,  5.41s/it][A

	loss_cls: tensor(0.5126, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2965, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8091, device='cuda:0', grad_fn=<AddBackward0>)



 57%|█████▋    | 201/354 [18:05<13:44,  5.39s/it][A

	loss_cls: tensor(0.7076, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1756, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8832, device='cuda:0', grad_fn=<AddBackward0>)



 57%|█████▋    | 202/354 [18:10<13:41,  5.41s/it][A

	loss_cls: tensor(0.5190, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1814, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7005, device='cuda:0', grad_fn=<AddBackward0>)



 57%|█████▋    | 203/354 [18:16<13:32,  5.38s/it][A

	loss_cls: tensor(0.7440, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2756, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0196, device='cuda:0', grad_fn=<AddBackward0>)



 58%|█████▊    | 204/354 [18:21<13:25,  5.37s/it][A

	loss_cls: tensor(0.5078, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3162, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8240, device='cuda:0', grad_fn=<AddBackward0>)



 58%|█████▊    | 205/354 [18:26<13:22,  5.39s/it][A

	loss_cls: tensor(0.9954, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3596, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.3550, device='cuda:0', grad_fn=<AddBackward0>)



 58%|█████▊    | 206/354 [18:32<13:17,  5.39s/it][A

	loss_cls: tensor(0.5474, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5197, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0671, device='cuda:0', grad_fn=<AddBackward0>)



 58%|█████▊    | 207/354 [18:37<13:14,  5.41s/it][A

	loss_cls: tensor(0.6030, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2086, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8116, device='cuda:0', grad_fn=<AddBackward0>)



 59%|█████▉    | 208/354 [18:43<13:07,  5.40s/it][A

	loss_cls: tensor(0.4235, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4384, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8620, device='cuda:0', grad_fn=<AddBackward0>)



 59%|█████▉    | 209/354 [18:48<13:03,  5.41s/it][A

	loss_cls: tensor(0.4391, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1352, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5743, device='cuda:0', grad_fn=<AddBackward0>)



 59%|█████▉    | 210/354 [18:53<12:55,  5.39s/it][A

	loss_cls: tensor(0.6363, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1409, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7772, device='cuda:0', grad_fn=<AddBackward0>)



 60%|█████▉    | 211/354 [18:59<12:51,  5.40s/it][A

	loss_cls: tensor(0.7027, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2539, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9567, device='cuda:0', grad_fn=<AddBackward0>)



 60%|█████▉    | 212/354 [19:04<12:45,  5.39s/it][A

	loss_cls: tensor(0.5050, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1156, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6205, device='cuda:0', grad_fn=<AddBackward0>)



 60%|██████    | 213/354 [19:10<12:38,  5.38s/it][A

	loss_cls: tensor(0.4110, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0656, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4765, device='cuda:0', grad_fn=<AddBackward0>)



 60%|██████    | 214/354 [19:15<12:35,  5.40s/it][A

	loss_cls: tensor(0.5396, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0572, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5968, device='cuda:0', grad_fn=<AddBackward0>)



 61%|██████    | 215/354 [19:20<12:28,  5.38s/it][A

	loss_cls: tensor(0.8455, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1437, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9891, device='cuda:0', grad_fn=<AddBackward0>)



 61%|██████    | 216/354 [19:26<12:24,  5.39s/it][A

	loss_cls: tensor(0.5100, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0904, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6004, device='cuda:0', grad_fn=<AddBackward0>)



 61%|██████▏   | 217/354 [19:31<12:16,  5.38s/it][A

	loss_cls: tensor(0.6047, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3021, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9069, device='cuda:0', grad_fn=<AddBackward0>)



 62%|██████▏   | 218/354 [19:37<12:12,  5.39s/it][A

	loss_cls: tensor(0.7803, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1197, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9001, device='cuda:0', grad_fn=<AddBackward0>)



 62%|██████▏   | 219/354 [19:42<12:06,  5.38s/it][A

	loss_cls: tensor(0.5438, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2978, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8415, device='cuda:0', grad_fn=<AddBackward0>)



 62%|██████▏   | 220/354 [19:47<11:59,  5.37s/it][A

	loss_cls: tensor(0.3097, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3534, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6631, device='cuda:0', grad_fn=<AddBackward0>)



 62%|██████▏   | 221/354 [19:53<11:57,  5.39s/it][A

	loss_cls: tensor(1.0669, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2820, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.3489, device='cuda:0', grad_fn=<AddBackward0>)



 63%|██████▎   | 222/354 [19:58<11:50,  5.38s/it][A

	loss_cls: tensor(0.5015, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0506, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5521, device='cuda:0', grad_fn=<AddBackward0>)



 63%|██████▎   | 223/354 [20:03<11:45,  5.39s/it][A

	loss_cls: tensor(0.4894, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1573, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6467, device='cuda:0', grad_fn=<AddBackward0>)



 63%|██████▎   | 224/354 [20:09<11:38,  5.37s/it][A

	loss_cls: tensor(0.7360, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3223, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0583, device='cuda:0', grad_fn=<AddBackward0>)



 64%|██████▎   | 225/354 [20:14<11:37,  5.40s/it][A

	loss_cls: tensor(0.4377, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3081, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7458, device='cuda:0', grad_fn=<AddBackward0>)



 64%|██████▍   | 226/354 [20:20<11:33,  5.42s/it][A

	loss_cls: tensor(0.6939, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2640, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9579, device='cuda:0', grad_fn=<AddBackward0>)



 64%|██████▍   | 227/354 [20:25<11:27,  5.41s/it][A

	loss_cls: tensor(0.4657, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0467, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5125, device='cuda:0', grad_fn=<AddBackward0>)



 64%|██████▍   | 228/354 [20:31<11:23,  5.42s/it][A

	loss_cls: tensor(0.5138, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0416, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5554, device='cuda:0', grad_fn=<AddBackward0>)



 65%|██████▍   | 229/354 [20:36<11:14,  5.40s/it][A

	loss_cls: tensor(0.8144, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3182, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1327, device='cuda:0', grad_fn=<AddBackward0>)



 65%|██████▍   | 230/354 [20:41<11:09,  5.40s/it][A

	loss_cls: tensor(0.6844, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2528, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9371, device='cuda:0', grad_fn=<AddBackward0>)



 65%|██████▌   | 231/354 [20:47<11:01,  5.38s/it][A

	loss_cls: tensor(0.5227, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0873, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6100, device='cuda:0', grad_fn=<AddBackward0>)



 66%|██████▌   | 232/354 [20:52<10:58,  5.40s/it][A

	loss_cls: tensor(0.7231, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1761, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8992, device='cuda:0', grad_fn=<AddBackward0>)



 66%|██████▌   | 233/354 [20:57<10:50,  5.38s/it][A

	loss_cls: tensor(0.5348, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2140, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7488, device='cuda:0', grad_fn=<AddBackward0>)



 66%|██████▌   | 234/354 [21:03<10:45,  5.38s/it][A

	loss_cls: tensor(0.5090, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3413, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8503, device='cuda:0', grad_fn=<AddBackward0>)



 66%|██████▋   | 235/354 [21:08<10:41,  5.39s/it][A

	loss_cls: tensor(0.4341, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2176, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6516, device='cuda:0', grad_fn=<AddBackward0>)



 67%|██████▋   | 236/354 [21:14<10:34,  5.38s/it][A

	loss_cls: tensor(0.6111, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0454, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6564, device='cuda:0', grad_fn=<AddBackward0>)



 67%|██████▋   | 237/354 [21:19<10:30,  5.39s/it][A

	loss_cls: tensor(0.7393, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5438, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2831, device='cuda:0', grad_fn=<AddBackward0>)



 67%|██████▋   | 238/354 [21:24<10:24,  5.38s/it][A

	loss_cls: tensor(0.9308, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2167, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1475, device='cuda:0', grad_fn=<AddBackward0>)



 68%|██████▊   | 239/354 [21:30<10:21,  5.40s/it][A

	loss_cls: tensor(0.5700, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0466, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6166, device='cuda:0', grad_fn=<AddBackward0>)



 68%|██████▊   | 240/354 [21:35<10:16,  5.41s/it][A

	loss_cls: tensor(0.6997, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3475, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0472, device='cuda:0', grad_fn=<AddBackward0>)



 68%|██████▊   | 241/354 [21:41<10:08,  5.39s/it][A

	loss_cls: tensor(0.7030, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1619, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8649, device='cuda:0', grad_fn=<AddBackward0>)



 68%|██████▊   | 242/354 [21:46<10:04,  5.40s/it][A

	loss_cls: tensor(0.4530, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3912, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8442, device='cuda:0', grad_fn=<AddBackward0>)



 69%|██████▊   | 243/354 [21:51<09:56,  5.37s/it][A

	loss_cls: tensor(0.9164, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1974, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1139, device='cuda:0', grad_fn=<AddBackward0>)



 69%|██████▉   | 244/354 [21:57<09:52,  5.39s/it][A

	loss_cls: tensor(0.5240, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1674, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6914, device='cuda:0', grad_fn=<AddBackward0>)



 69%|██████▉   | 245/354 [22:02<09:45,  5.38s/it][A

	loss_cls: tensor(0.6873, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0693, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7566, device='cuda:0', grad_fn=<AddBackward0>)



 69%|██████▉   | 246/354 [22:07<09:42,  5.40s/it][A

	loss_cls: tensor(0.6183, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3522, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9705, device='cuda:0', grad_fn=<AddBackward0>)



 70%|██████▉   | 247/354 [22:13<09:34,  5.36s/it][A

	loss_cls: tensor(0.5207, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2147, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7354, device='cuda:0', grad_fn=<AddBackward0>)



 70%|███████   | 248/354 [22:18<09:28,  5.36s/it][A

	loss_cls: tensor(0.5286, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3703, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8989, device='cuda:0', grad_fn=<AddBackward0>)



 70%|███████   | 249/354 [22:24<09:25,  5.38s/it][A

	loss_cls: tensor(0.5347, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1129, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6476, device='cuda:0', grad_fn=<AddBackward0>)



 71%|███████   | 250/354 [22:29<09:19,  5.38s/it][A

	loss_cls: tensor(0.6537, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1878, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8415, device='cuda:0', grad_fn=<AddBackward0>)



 71%|███████   | 251/354 [22:34<09:15,  5.39s/it][A

	loss_cls: tensor(0.6435, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3881, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0317, device='cuda:0', grad_fn=<AddBackward0>)



 71%|███████   | 252/354 [22:40<09:14,  5.43s/it][A

	loss_cls: tensor(1.2795, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1876, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.4672, device='cuda:0', grad_fn=<AddBackward0>)



 71%|███████▏  | 253/354 [22:45<09:13,  5.48s/it][A

	loss_cls: tensor(0.6608, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2872, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9480, device='cuda:0', grad_fn=<AddBackward0>)



 72%|███████▏  | 254/354 [22:51<09:08,  5.49s/it][A

	loss_cls: tensor(0.9497, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4184, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.3681, device='cuda:0', grad_fn=<AddBackward0>)



 72%|███████▏  | 255/354 [22:56<09:03,  5.49s/it][A

	loss_cls: tensor(0.8749, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3849, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2598, device='cuda:0', grad_fn=<AddBackward0>)



 72%|███████▏  | 256/354 [23:02<09:01,  5.53s/it][A

	loss_cls: tensor(0.5693, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1453, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7147, device='cuda:0', grad_fn=<AddBackward0>)



 73%|███████▎  | 257/354 [23:08<08:55,  5.52s/it][A

	loss_cls: tensor(0.6063, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4217, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0279, device='cuda:0', grad_fn=<AddBackward0>)



 73%|███████▎  | 258/354 [23:13<08:51,  5.54s/it][A

	loss_cls: tensor(0.5345, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2517, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7861, device='cuda:0', grad_fn=<AddBackward0>)



 73%|███████▎  | 259/354 [23:19<08:45,  5.53s/it][A

	loss_cls: tensor(0.6738, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1774, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8512, device='cuda:0', grad_fn=<AddBackward0>)



 73%|███████▎  | 260/354 [23:24<08:41,  5.55s/it][A

	loss_cls: tensor(0.7002, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3140, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0142, device='cuda:0', grad_fn=<AddBackward0>)



 74%|███████▎  | 261/354 [23:30<08:30,  5.49s/it][A

	loss_cls: tensor(0.7695, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1618, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9312, device='cuda:0', grad_fn=<AddBackward0>)



 74%|███████▍  | 262/354 [23:35<08:25,  5.49s/it][A

	loss_cls: tensor(0.5119, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1908, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7028, device='cuda:0', grad_fn=<AddBackward0>)



 74%|███████▍  | 263/354 [23:41<08:21,  5.51s/it][A

	loss_cls: tensor(0.2813, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3626, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6440, device='cuda:0', grad_fn=<AddBackward0>)



 75%|███████▍  | 264/354 [23:46<08:15,  5.50s/it][A

	loss_cls: tensor(0.5441, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2005, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7446, device='cuda:0', grad_fn=<AddBackward0>)



 75%|███████▍  | 265/354 [23:52<08:12,  5.53s/it][A

	loss_cls: tensor(0.6169, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2801, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8969, device='cuda:0', grad_fn=<AddBackward0>)



 75%|███████▌  | 266/354 [23:57<08:05,  5.52s/it][A

	loss_cls: tensor(0.7944, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2712, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0656, device='cuda:0', grad_fn=<AddBackward0>)



 75%|███████▌  | 267/354 [24:03<08:01,  5.53s/it][A

	loss_cls: tensor(0.7051, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2093, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9145, device='cuda:0', grad_fn=<AddBackward0>)



 76%|███████▌  | 268/354 [24:08<07:55,  5.53s/it][A

	loss_cls: tensor(0.7039, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1502, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8541, device='cuda:0', grad_fn=<AddBackward0>)



 76%|███████▌  | 269/354 [24:14<07:49,  5.52s/it][A

	loss_cls: tensor(0.6027, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1544, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7571, device='cuda:0', grad_fn=<AddBackward0>)



 76%|███████▋  | 270/354 [24:19<07:40,  5.48s/it][A

	loss_cls: tensor(0.6511, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1750, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8261, device='cuda:0', grad_fn=<AddBackward0>)



 77%|███████▋  | 271/354 [24:25<07:36,  5.50s/it][A

	loss_cls: tensor(0.5522, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2138, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7659, device='cuda:0', grad_fn=<AddBackward0>)



 77%|███████▋  | 272/354 [24:30<07:36,  5.57s/it][A

	loss_cls: tensor(0.6944, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.6704, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.3648, device='cuda:0', grad_fn=<AddBackward0>)



 77%|███████▋  | 273/354 [24:36<07:41,  5.69s/it][A

	loss_cls: tensor(0.6208, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2188, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8395, device='cuda:0', grad_fn=<AddBackward0>)



 77%|███████▋  | 274/354 [24:42<07:40,  5.76s/it][A

	loss_cls: tensor(0.5420, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4063, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9483, device='cuda:0', grad_fn=<AddBackward0>)



 78%|███████▊  | 275/354 [24:48<07:32,  5.72s/it][A

	loss_cls: tensor(0.5461, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1682, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7143, device='cuda:0', grad_fn=<AddBackward0>)



 78%|███████▊  | 276/354 [24:54<07:25,  5.72s/it][A

	loss_cls: tensor(0.5855, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2429, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8283, device='cuda:0', grad_fn=<AddBackward0>)



 78%|███████▊  | 277/354 [24:59<07:21,  5.73s/it][A

	loss_cls: tensor(0.6057, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2516, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8573, device='cuda:0', grad_fn=<AddBackward0>)



 79%|███████▊  | 278/354 [25:05<07:13,  5.71s/it][A

	loss_cls: tensor(0.5444, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1613, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7057, device='cuda:0', grad_fn=<AddBackward0>)



 79%|███████▉  | 279/354 [25:11<07:07,  5.69s/it][A

	loss_cls: tensor(0.8373, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4121, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.2494, device='cuda:0', grad_fn=<AddBackward0>)



 79%|███████▉  | 280/354 [25:16<06:58,  5.65s/it][A

	loss_cls: tensor(0.5227, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2702, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7929, device='cuda:0', grad_fn=<AddBackward0>)



 79%|███████▉  | 281/354 [25:22<06:51,  5.64s/it][A

	loss_cls: tensor(0.7210, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3754, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0964, device='cuda:0', grad_fn=<AddBackward0>)



 80%|███████▉  | 282/354 [25:28<06:43,  5.61s/it][A

	loss_cls: tensor(0.6350, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1047, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7397, device='cuda:0', grad_fn=<AddBackward0>)



 80%|███████▉  | 283/354 [25:33<06:36,  5.59s/it][A

	loss_cls: tensor(0.7864, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3332, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1197, device='cuda:0', grad_fn=<AddBackward0>)



 80%|████████  | 284/354 [25:39<06:31,  5.60s/it][A

	loss_cls: tensor(0.5813, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0490, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6304, device='cuda:0', grad_fn=<AddBackward0>)



 81%|████████  | 285/354 [25:44<06:25,  5.58s/it][A

	loss_cls: tensor(0.6317, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3305, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9622, device='cuda:0', grad_fn=<AddBackward0>)



 81%|████████  | 286/354 [25:50<06:21,  5.61s/it][A

	loss_cls: tensor(0.4508, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1624, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6132, device='cuda:0', grad_fn=<AddBackward0>)



 81%|████████  | 287/354 [25:55<06:15,  5.60s/it][A

	loss_cls: tensor(0.5542, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2977, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8519, device='cuda:0', grad_fn=<AddBackward0>)



 81%|████████▏ | 288/354 [26:01<06:09,  5.60s/it][A

	loss_cls: tensor(0.6981, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3983, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0964, device='cuda:0', grad_fn=<AddBackward0>)



 82%|████████▏ | 289/354 [26:07<06:02,  5.58s/it][A

	loss_cls: tensor(0.6615, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1550, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8166, device='cuda:0', grad_fn=<AddBackward0>)



 82%|████████▏ | 290/354 [26:12<05:56,  5.57s/it][A

	loss_cls: tensor(0.5618, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1557, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7175, device='cuda:0', grad_fn=<AddBackward0>)



 82%|████████▏ | 291/354 [26:18<05:51,  5.58s/it][A

	loss_cls: tensor(0.7805, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3173, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0979, device='cuda:0', grad_fn=<AddBackward0>)



 82%|████████▏ | 292/354 [26:23<05:36,  5.43s/it][A

	loss_cls: tensor(0.5102, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1180, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6283, device='cuda:0', grad_fn=<AddBackward0>)



 83%|████████▎ | 293/354 [26:28<05:31,  5.44s/it][A

	loss_cls: tensor(0.5182, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0807, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5989, device='cuda:0', grad_fn=<AddBackward0>)



 83%|████████▎ | 294/354 [26:34<05:29,  5.49s/it][A

	loss_cls: tensor(0.5744, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5127, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0872, device='cuda:0', grad_fn=<AddBackward0>)



 83%|████████▎ | 295/354 [26:40<05:26,  5.54s/it][A

	loss_cls: tensor(0.5613, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1350, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6962, device='cuda:0', grad_fn=<AddBackward0>)



 84%|████████▎ | 296/354 [26:45<05:21,  5.55s/it][A

	loss_cls: tensor(0.6508, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2541, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9050, device='cuda:0', grad_fn=<AddBackward0>)



 84%|████████▍ | 297/354 [26:50<05:12,  5.49s/it][A

	loss_cls: tensor(0.4765, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1846, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6611, device='cuda:0', grad_fn=<AddBackward0>)



 84%|████████▍ | 298/354 [26:56<05:08,  5.52s/it][A

	loss_cls: tensor(0.6254, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1542, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7796, device='cuda:0', grad_fn=<AddBackward0>)



 84%|████████▍ | 299/354 [27:02<05:03,  5.53s/it][A

	loss_cls: tensor(0.5649, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0530, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6180, device='cuda:0', grad_fn=<AddBackward0>)



 85%|████████▍ | 300/354 [27:07<05:00,  5.56s/it][A

	loss_cls: tensor(0.5399, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1910, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7309, device='cuda:0', grad_fn=<AddBackward0>)



 85%|████████▌ | 301/354 [27:13<04:54,  5.56s/it][A

	loss_cls: tensor(0.5372, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1265, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6636, device='cuda:0', grad_fn=<AddBackward0>)



 85%|████████▌ | 302/354 [27:18<04:50,  5.58s/it][A

	loss_cls: tensor(0.6111, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3004, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9115, device='cuda:0', grad_fn=<AddBackward0>)



 86%|████████▌ | 303/354 [27:24<04:37,  5.45s/it][A

	loss_cls: tensor(0.9192, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2697, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.1889, device='cuda:0', grad_fn=<AddBackward0>)



 86%|████████▌ | 304/354 [27:29<04:33,  5.47s/it][A

	loss_cls: tensor(0.4347, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1278, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5624, device='cuda:0', grad_fn=<AddBackward0>)



 86%|████████▌ | 305/354 [27:35<04:29,  5.51s/it][A

	loss_cls: tensor(0.4566, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1150, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5715, device='cuda:0', grad_fn=<AddBackward0>)



 86%|████████▋ | 306/354 [27:40<04:15,  5.32s/it][A

	loss_cls: tensor(0.3890, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1026, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4916, device='cuda:0', grad_fn=<AddBackward0>)



 87%|████████▋ | 307/354 [27:45<04:13,  5.39s/it][A

	loss_cls: tensor(0.5614, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1804, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7419, device='cuda:0', grad_fn=<AddBackward0>)



 87%|████████▋ | 308/354 [27:51<04:10,  5.44s/it][A

	loss_cls: tensor(0.5925, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1365, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7289, device='cuda:0', grad_fn=<AddBackward0>)



 87%|████████▋ | 309/354 [27:56<04:07,  5.50s/it][A

	loss_cls: tensor(0.4065, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2456, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6521, device='cuda:0', grad_fn=<AddBackward0>)



 88%|████████▊ | 310/354 [28:02<04:02,  5.51s/it][A

	loss_cls: tensor(0.5321, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1182, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6502, device='cuda:0', grad_fn=<AddBackward0>)



 88%|████████▊ | 311/354 [28:07<03:58,  5.54s/it][A

	loss_cls: tensor(0.6643, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1674, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8317, device='cuda:0', grad_fn=<AddBackward0>)



 88%|████████▊ | 312/354 [28:13<03:53,  5.55s/it][A

	loss_cls: tensor(0.5455, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1697, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7152, device='cuda:0', grad_fn=<AddBackward0>)



 88%|████████▊ | 313/354 [28:19<03:47,  5.56s/it][A

	loss_cls: tensor(0.4398, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3165, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7563, device='cuda:0', grad_fn=<AddBackward0>)



 89%|████████▊ | 314/354 [28:24<03:42,  5.55s/it][A

	loss_cls: tensor(0.5579, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0460, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6038, device='cuda:0', grad_fn=<AddBackward0>)



 89%|████████▉ | 315/354 [28:30<03:36,  5.56s/it][A

	loss_cls: tensor(0.4935, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2809, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7744, device='cuda:0', grad_fn=<AddBackward0>)



 89%|████████▉ | 316/354 [28:35<03:29,  5.52s/it][A

	loss_cls: tensor(0.4951, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1627, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6578, device='cuda:0', grad_fn=<AddBackward0>)



 90%|████████▉ | 317/354 [28:41<03:24,  5.53s/it][A

	loss_cls: tensor(0.7927, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1862, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9790, device='cuda:0', grad_fn=<AddBackward0>)



 90%|████████▉ | 318/354 [28:46<03:13,  5.36s/it][A

	loss_cls: tensor(0.5671, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1218, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6890, device='cuda:0', grad_fn=<AddBackward0>)



 90%|█████████ | 319/354 [28:51<03:03,  5.23s/it][A

	loss_cls: tensor(1.0314, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3455, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.3769, device='cuda:0', grad_fn=<AddBackward0>)



 90%|█████████ | 320/354 [28:56<02:54,  5.14s/it][A

	loss_cls: tensor(0.3990, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2650, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6640, device='cuda:0', grad_fn=<AddBackward0>)



 91%|█████████ | 321/354 [29:01<02:53,  5.25s/it][A

	loss_cls: tensor(0.5550, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2212, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7762, device='cuda:0', grad_fn=<AddBackward0>)



 91%|█████████ | 322/354 [29:07<02:51,  5.35s/it][A

	loss_cls: tensor(0.6094, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4424, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0518, device='cuda:0', grad_fn=<AddBackward0>)



 91%|█████████ | 323/354 [29:12<02:48,  5.42s/it][A

	loss_cls: tensor(0.3365, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2208, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5574, device='cuda:0', grad_fn=<AddBackward0>)



 92%|█████████▏| 324/354 [29:18<02:43,  5.46s/it][A

	loss_cls: tensor(0.4659, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1181, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5840, device='cuda:0', grad_fn=<AddBackward0>)



 92%|█████████▏| 325/354 [29:23<02:39,  5.51s/it][A

	loss_cls: tensor(0.4196, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3557, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7753, device='cuda:0', grad_fn=<AddBackward0>)



 92%|█████████▏| 326/354 [29:29<02:34,  5.53s/it][A

	loss_cls: tensor(0.2996, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3202, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6198, device='cuda:0', grad_fn=<AddBackward0>)



 92%|█████████▏| 327/354 [29:34<02:29,  5.53s/it][A

	loss_cls: tensor(0.7148, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0446, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7594, device='cuda:0', grad_fn=<AddBackward0>)



 93%|█████████▎| 328/354 [29:40<02:24,  5.57s/it][A

	loss_cls: tensor(0.4773, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1763, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6536, device='cuda:0', grad_fn=<AddBackward0>)



 93%|█████████▎| 329/354 [29:46<02:19,  5.56s/it][A

	loss_cls: tensor(1.1365, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3298, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.4663, device='cuda:0', grad_fn=<AddBackward0>)



 93%|█████████▎| 330/354 [29:51<02:13,  5.58s/it][A

	loss_cls: tensor(0.5608, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0713, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6321, device='cuda:0', grad_fn=<AddBackward0>)



 94%|█████████▎| 331/354 [29:57<02:08,  5.58s/it][A

	loss_cls: tensor(0.7692, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2501, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0193, device='cuda:0', grad_fn=<AddBackward0>)



 94%|█████████▍| 332/354 [30:03<02:03,  5.61s/it][A

	loss_cls: tensor(0.4280, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2535, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6815, device='cuda:0', grad_fn=<AddBackward0>)



 94%|█████████▍| 333/354 [30:08<01:57,  5.60s/it][A

	loss_cls: tensor(0.5756, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1254, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7010, device='cuda:0', grad_fn=<AddBackward0>)



 94%|█████████▍| 334/354 [30:14<01:52,  5.62s/it][A

	loss_cls: tensor(0.7224, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1801, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9025, device='cuda:0', grad_fn=<AddBackward0>)



 95%|█████████▍| 335/354 [30:19<01:46,  5.62s/it][A

	loss_cls: tensor(0.8834, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4681, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.3515, device='cuda:0', grad_fn=<AddBackward0>)



 95%|█████████▍| 336/354 [30:25<01:40,  5.60s/it][A

	loss_cls: tensor(0.5622, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3868, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9489, device='cuda:0', grad_fn=<AddBackward0>)



 95%|█████████▌| 337/354 [30:31<01:35,  5.60s/it][A

	loss_cls: tensor(0.5727, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2120, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7847, device='cuda:0', grad_fn=<AddBackward0>)



 95%|█████████▌| 338/354 [30:36<01:29,  5.59s/it][A

	loss_cls: tensor(0.5667, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2342, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8009, device='cuda:0', grad_fn=<AddBackward0>)



 96%|█████████▌| 339/354 [30:42<01:23,  5.57s/it][A

	loss_cls: tensor(0.4024, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0478, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4502, device='cuda:0', grad_fn=<AddBackward0>)



 96%|█████████▌| 340/354 [30:47<01:17,  5.53s/it][A

	loss_cls: tensor(0.3022, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1768, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4790, device='cuda:0', grad_fn=<AddBackward0>)



 96%|█████████▋| 341/354 [30:53<01:11,  5.54s/it][A

	loss_cls: tensor(0.6383, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2203, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.8586, device='cuda:0', grad_fn=<AddBackward0>)



 97%|█████████▋| 342/354 [30:58<01:06,  5.57s/it][A

	loss_cls: tensor(0.4299, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0729, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.5029, device='cuda:0', grad_fn=<AddBackward0>)



 97%|█████████▋| 343/354 [31:04<01:01,  5.57s/it][A

	loss_cls: tensor(0.5811, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1197, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7008, device='cuda:0', grad_fn=<AddBackward0>)



 97%|█████████▋| 344/354 [31:09<00:55,  5.58s/it][A

	loss_cls: tensor(0.4620, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1954, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6574, device='cuda:0', grad_fn=<AddBackward0>)



 97%|█████████▋| 345/354 [31:15<00:50,  5.64s/it][A

	loss_cls: tensor(0.4154, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0794, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4948, device='cuda:0', grad_fn=<AddBackward0>)



 98%|█████████▊| 346/354 [31:21<00:45,  5.67s/it][A

	loss_cls: tensor(0.5827, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1326, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7153, device='cuda:0', grad_fn=<AddBackward0>)



 98%|█████████▊| 347/354 [31:27<00:39,  5.64s/it][A

	loss_cls: tensor(0.5260, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.1741, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.7002, device='cuda:0', grad_fn=<AddBackward0>)



 98%|█████████▊| 348/354 [31:32<00:33,  5.61s/it][A

	loss_cls: tensor(0.6965, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3217, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0181, device='cuda:0', grad_fn=<AddBackward0>)



 99%|█████████▊| 349/354 [31:38<00:28,  5.62s/it][A

	loss_cls: tensor(0.4482, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0480, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4963, device='cuda:0', grad_fn=<AddBackward0>)



 99%|█████████▉| 350/354 [31:43<00:22,  5.61s/it][A

	loss_cls: tensor(0.5950, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3258, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9208, device='cuda:0', grad_fn=<AddBackward0>)



 99%|█████████▉| 351/354 [31:49<00:16,  5.62s/it][A

	loss_cls: tensor(0.4312, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2597, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.6909, device='cuda:0', grad_fn=<AddBackward0>)



 99%|█████████▉| 352/354 [31:55<00:11,  5.61s/it][A

	loss_cls: tensor(0.4029, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.0440, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.4469, device='cuda:0', grad_fn=<AddBackward0>)



100%|█████████▉| 353/354 [32:00<00:05,  5.63s/it][A

	loss_cls: tensor(0.6510, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2873, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(0.9383, device='cuda:0', grad_fn=<AddBackward0>)



100%|██████████| 354/354 [32:02<00:00,  5.43s/it][A

  0%|          | 0/177 [00:00<?, ?it/s][A

	loss_cls: tensor(0.8654, device='cuda:0', grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2158, device='cuda:0', grad_fn=<NllLossBackward>)
loss: tensor(1.0812, device='cuda:0', grad_fn=<AddBackward0>)

	Training Loss: 0.8050564634429533

	Training cls acc: 0.6984463276836158

	Training cls prec: 0.5791820879214947

	Training cls rec: 0.6366875179904841

	Training cls f1: 0.5512897897241895

--
	Training ner acc: 0.9554634453088704

	Training ner prec: 0.27128877952667957

	Training ner rec: 0.27984475647273405

	Training ner f1: 0.2751091367270245

	Current Learning rate:  0.0



  1%|          | 1/177 [00:00<02:21,  1.24it/s][A
  1%|          | 2/177 [00:01<02:09,  1.35it/s][A
  2%|▏         | 3/177 [00:02<02:10,  1.33it/s][A
  2%|▏         | 4/177 [00:03<02:11,  1.32it/s][A
  3%|▎         | 5/177 [00:03<02:11,  1.31it/s][A
  3%|▎         | 6/177 [00:04<02:08,  1.34it/s][A
  4%|▍         | 7/177 [00:05<02:08,  1.32it/s][A
  5%|▍         | 8/177 [00:06<02:08,  1.32it/s][A
  5%|▌         | 9/177 [00:06<02:04,  1.35it/s][A
  6%|▌         | 10/177 [00:07<02:04,  1.34it/s][A
  6%|▌         | 11/177 [00:08<02:05,  1.33it/s][A
  7%|▋         | 12/177 [00:09<02:05,  1.32it/s][A
  7%|▋         | 13/177 [00:09<02:00,  1.36it/s][A
  8%|▊         | 14/177 [00:10<02:01,  1.34it/s][A
  8%|▊         | 15/177 [00:11<02:02,  1.32it/s][A
  9%|▉         | 16/177 [00:12<02:02,  1.32it/s][A
 10%|▉         | 17/177 [00:12<01:58,  1.35it/s][A
 10%|█         | 18/177 [00:13<01:59,  1.33it/s][A
 11%|█         | 19/177 [00:14<01:59,  1.32it/s][A
 11%|█▏        | 20/


	Validation Loss: 0.7777009661925041

	Validation cls acc: 0.7083333333333334

	Validation cls prec: 0.6244552058111381

	Validation cls rec: 0.6146825396825397

	Validation cls f1: 0.5725970827665743

--
	Validation ner acc: 0.9532661678264976

	Validation ner prec: 0.4259235845125581

	Validation ner rec: 0.4364406779661017

	Validation ner f1: 0.4309571785820821





### Evaluation on the test dataset

In [13]:

############ test eval metrics ######################
# One accumulator list per reported quantity: every test batch appends a single
# value, and the mean over all batches is printed at the end of the cell.
test_loss = []
test_cls_acc = []
test_cls_prec = []
test_cls_rec = []
test_cls_f1 = []
test_ner_acc = []
test_ner_prec = []
test_ner_rec = []
test_ner_f1 = []

# NOTE(review): precision/recall/F1 are computed per batch and then averaged, which is
# not the same as computing them once over all test predictions — confirm this is intended.

# Evaluation mode does not change between batches, so switch once before the loop.
model.eval()

########################################################
for batch in tqdm(test_loader):
    # `batch` is kept as a tuple of tensors on `device`; the cell that maps BIO tags
    # back to tokens re-uses the last `batch` (and the last `logits_ner`) after this loop.
    batch = tuple(batch[t].to(device) for t in batch)      # batch to GPU
    t_input_ids, t_input_mask, t_token_type_ids, t_labels, t_bio_tags = batch     # unpack inputs from dataloader

    with torch.no_grad(): # no gradients are needed for evaluation -> saves memory + speeds up the pass
        # forward pass, returns one set of logits per task
        logits_cls, logits_ner = model(
            input_ids=t_input_ids,
            attention_mask=t_input_mask,
            token_type_ids=t_token_type_ids,
        )

        ############### LOSS Function #######################################
        ### CLS
        t_loss_cls = loss_fn_cls(logits_cls, t_labels)

        ### NER
        # similar to the class RobertaForToken classification in transformers: https://github.com/huggingface/transformers/blob/master/src/transformers/models/roberta/modeling_roberta.py
        # keep only positions the attention mask marks as active (this includes the <CLS>, <SEP> tokens)
        t_active_loss = t_input_mask.view(-1) == 1
        # flatten to (positions, N_bio_tags) before selecting the active positions
        t_active_logits = logits_ner.view(-1, N_bio_tags)[t_active_loss]
        t_active_tags = t_bio_tags.view(-1)[t_active_loss]
        t_loss_ner = loss_fn_ner(t_active_logits, t_active_tags)

        # the reported loss is the plain sum of both task losses
        t_loss = t_loss_cls + t_loss_ner

    test_loss.append(t_loss.item())

    ################# PERFORMANCE MEASURES ########################################
    ### CLS
    logits_cls = logits_cls.detach().to('cpu').numpy()
    label_ids = t_labels.to('cpu').numpy()

    pred_flat = np.argmax(logits_cls, axis=1).flatten()    # predicted class = highest logit
    labels_flat = label_ids.flatten()

    metrics_cls = compute_metrics(pred_flat, labels_flat)
    test_cls_acc.append(metrics_cls["accuracy"])
    test_cls_prec.append(metrics_cls["precision"])
    test_cls_rec.append(metrics_cls["recall"])
    test_cls_f1.append(metrics_cls["f1"])

    #### NER
    logits_ner = logits_ner.detach().to('cpu').numpy()
    tags_ids = t_bio_tags.to('cpu').numpy()

    # calculate performance measures only on tokens and not subwords or special tokens
    tags_mask = tags_ids != -100 # only get token labels and not labels from subwords or special tokens
    pred = np.argmax(logits_ner, axis=2)[tags_mask] # convert logits to predicted tags at the kept positions
    tags = tags_ids[tags_mask]

    metrics = compute_metrics(pred, tags)
    test_ner_acc.append(metrics["accuracy"])
    test_ner_prec.append(metrics["precision"])
    test_ner_rec.append(metrics["recall"])
    test_ner_f1.append(metrics["f1"])


# Report the mean of the per-batch values.
print(f'\n\tTest Loss: {np.mean(test_loss)}')
print(f'\n\tTest cls acc: {np.mean(test_cls_acc)}')
print(f'\n\tTest cls prec: {np.mean(test_cls_prec)}')
print(f'\n\tTest cls rec: {np.mean(test_cls_rec)}')
print(f'\n\tTest cls f1: {np.mean(test_cls_f1)}')
print(f'\n--\n\tTest ner acc: {np.mean(test_ner_acc)}')
print(f'\n\tTest ner prec: {np.mean(test_ner_prec)}')
print(f'\n\tTest ner rec: {np.mean(test_ner_rec)}')
print(f'\n\tTest ner f1: {np.mean(test_ner_f1)}')


  , "labels" : torch.tensor(self.labels[idx], dtype=torch.long)
100%|██████████| 221/221 [03:12<00:00,  1.15it/s]


	Test Loss: 0.750974503880982

	Test cls acc: 0.6932773109243698

	Test cls prec: 0.5939291101055807

	Test cls rec: 0.5739657401422108

	Test cls f1: 0.5384079972315267

--
	Test ner acc: 0.9584356732616232

	Test ner prec: 0.44846081032251667

	Test ner rec: 0.4582956259426848

	Test ner f1: 0.45318474139426296





### Map BIO tags back to tokens

In [14]:
# Take the last batch of the test set. This relies on `batch` (tuple of tensors) and
# `logits_ner` (numpy array) still holding the values left behind by the final
# iteration of the test-evaluation loop.
t_input_ids, t_input_mask, t_token_type_ids, t_labels, t_bio_tags = batch

# `batch` is a tuple of five tensors, so len(batch) counts fields, not sentences;
# the number of sentences is the first dimension of any of those tensors.
n_sentences = t_input_ids.shape[0]
n_to_show = 1  # only the first sentence of the batch is printed

for i in range(min(n_to_show, n_sentences)):
    sentence_tags = t_bio_tags[i].to("cpu").numpy()
    tags_mask = sentence_tags != -100 # only get token labels and not labels from subwords or special tokens
    pred_tags = np.argmax(logits_ner[i], axis=1)[tags_mask]   # predicted tag id at every kept position
    true_tags = sentence_tags[tags_mask]                      # mask applied on the CPU copy

    tokens = tokenizer.convert_ids_to_tokens(t_input_ids[i])

    print("\n\nPadded Sentence:")
    print(tokens)
    print("true labels:")
    print(t_bio_tags[i])
    # separate loop names so the prediction array is not shadowed by its own elements
    for token, true_label, pred_label in zip(np.array(tokens)[tags_mask], true_tags, pred_tags):
        print(token, "\t\ttrue:", true_label, "  pred:", pred_label)




Padded Sentence:
['<s>', '16', 'year', 'Dia@@', 'betic', 'Anniversary', 'today', '!', '</s>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>']
true labels:
tensor([-100,    0,    0,    0, -100,    0,    0,    0, -100, -100, -100, -100,
        -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
        -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
        -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -

### Save model

In [15]:
# Persist the model's state_dict (not the module object) under the checkpoint file name.
checkpoint_path = "finetuned-decoder-multi-task-35-epochs.pth"
model_weights = model.state_dict()
torch.save(model_weights, checkpoint_path)

### Load model locally

In [16]:
# Rebuild the architecture and restore the fine-tuned weights from disk.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = CausalMultiTask()
# map_location remaps the stored tensors onto `device`; without it a checkpoint written
# from a CUDA run cannot be deserialised on a CPU-only machine, which is exactly the
# fallback selected above.
state_dict = torch.load("finetuned-decoder-multi-task-35-epochs.pth", map_location=device)
model.load_state_dict(state_dict)
model.to(device)
# eval() returns the module, so as the last expression it also displays the architecture.
model.eval()

You are using a model of type roberta to instantiate a model of type bert. This is not supported for all configurations of models and can yield errors.
Some weights of the model checkpoint at vinai/bertweet-base were not used when initializing BertModel: ['roberta.encoder.layer.5.output.dense.weight', 'roberta.encoder.layer.8.attention.self.value.bias', 'roberta.encoder.layer.1.output.dense.weight', 'roberta.encoder.layer.5.intermediate.dense.weight', 'roberta.embeddings.position_embeddings.weight', 'roberta.encoder.layer.1.attention.output.dense.weight', 'roberta.encoder.layer.7.attention.self.query.bias', 'roberta.encoder.layer.0.attention.self.value.bias', 'roberta.encoder.layer.1.attention.output.dense.bias', 'roberta.encoder.layer.0.intermediate.dense.bias', 'roberta.encoder.layer.4.attention.self.value.weight', 'roberta.encoder.layer.10.intermediate.dense.weight', 'roberta.encoder.layer.8.attention.output.LayerNorm.bias', 'lm_head.bias', 'roberta.encoder.layer.3.output.LayerNorm.

CausalMultiTask(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(64001, 768, padding_idx=1)
      (position_embeddings): Embedding(130, 768)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=Tr

### Small example

In [17]:
# Call the model's `bert` sub-module directly. With return_dict=False the result is a
# tuple, unpacked here into a per-token output and a per-sequence output.
# NOTE(review): the second element is presumably BertModel's pooled output derived from
# the first (<s>/CLS) position rather than the raw CLS hidden state — confirm.
# `input_ids`, `attention_mask` and `token_type_ids` must already exist from an earlier cell.
encoder_outputs = model.bert(
    input_ids,
    attention_mask=attention_mask,
    token_type_ids=token_type_ids,
    return_dict=False,
)
output_seq, output_cls = encoder_outputs
for encoder_output in (output_seq, output_cls):
    print(encoder_output.shape)

torch.Size([6, 73, 768])
torch.Size([6, 768])


In [None]:
QUESTIONS:
- Do we update only the parameters of the task-specific layers, or those of the whole BERT model?
- In a multi-task setting, do we have to update all parameters? Otherwise
the tasks would not benefit from each other.