In [1]:
import pandas as pd
import numpy as np
import spacy 
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, matthews_corrcoef
from transformers import BertForSequenceClassification, AutoTokenizer
from transformers import AdamW, get_linear_schedule_with_warmup
from tqdm import tqdm, trange
import random
import os
import torch.nn.functional as F
import torch
from torch.nn import CrossEntropyLoss
from torch.utils.data import DataLoader
import transformers
from tqdm import tqdm, trange
from utils import normalizeTweet, split_into_sentences, bio_tagging, create_training_data



data = pd.read_excel("/home/adrian/workspace/causality/Causal-associations-diabetes-twitter/data/Causality + hypoglycemia.xlsx", sheet_name=">5000_samples_")
#data = pd.read_excel("/home/adrian/Downloads/Causality + hypoglycemia.xlsx", sheet_name=">5000_samples_")
print("Total count:", data.shape[0])
data = data[data["Causal association"].notnull()]
print("Labeled count:", data.shape[0])

data.head()

Total count: 5456
Labeled count: 5000


Unnamed: 0,id,text,full_text,Intent,Cause,Effect,Causal association,Charline association0=no;1=yes,Remarks
0,908171203029868545,"tonight , I learned my older girl will back he...","tonight , I learned my older girl will back he...",,,,0.0,,
1,1203645589214367745,USER USER I knew diabetes and fibromyalgia wer...,USER USER I knew diabetes and fibromyalgia wer...,joke,,,0.0,,
2,1310596731063525376,⬇ ️ ⬇ ️ ⬇ ️ THIS ⬇ ️ ⬇ ️ ⬇ ️ My wife has type ...,⬇ ️ ⬇ ️ ⬇ ️ THIS ⬇ ️ ⬇ ️ ⬇ ️ My wife has type ...,mS,,,0.0,,
3,1125198453167022085,USER Cheers ! Have one for this diabetic too !,USER Cheers ! Have one for this diabetic too !,mS,,,0.0,,
4,1248600944138268673,USER Additionally the medicines are being char...,USER Additionally the medicines are being char...,,medicines are being charged at MRP,costing much higher,1.0,,


### Interrater-reliabilty measure

In [2]:
from sklearn.metrics import cohen_kappa_score

charline = data[data["Charline association0=no;1=yes"].notnull()]
coder1 = charline["Causal association"].values
coder2 = charline["Charline association0=no;1=yes"]
score = cohen_kappa_score(coder1,coder2)
#print('Cohen\'s Kappa:',score)

### Data Preprocessing

In [2]:
data["Causal association"].value_counts()

0.0    3720
1.0    1280
Name: Causal association, dtype: int64

In [3]:
trainingData = create_training_data(data, min_words_in_sentences=3)
trainingData.head()

Unnamed: 0,tweet,Causal association,BIOtags
0,"tonight , I learned my older girl will back he...",0.0,"[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
1,⬇ ️ ⬇ ️ ⬇ ️ THIS ⬇ ️ ⬇ ️ ⬇ ️ My wife has type ...,0.0,"[O, O, O, O, O, O, O, O, O, O, O, O, O, O]"
2,I'm a trans woman .,0.0,"[O, O, O, O, O, O]"
3,"Both of us could use a world where "" brave and...",0.0,"[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
4,"Make a world where people can just be , withou...",0.0,"[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."


In [4]:
for i,row in trainingData.sample(n=20).iterrows():
    print("\n")
    print(row["tweet"])
    print(row["BIOtags"])



USER He got really dramatic about his diabetes so .
['O', 'O', 'B-E', 'I-E', 'I-E', 'O', 'O', 'B-C', 'O', 'O']


USER To be honest .
['O', 'O', 'O', 'O', 'O']


USER USER * PLEASE HELP * my friends father is in Naguabo , Puerto Rico.
['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']


Amiyah is a diabetic and downstairs sneaking the candy cains off the tree .
['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']


USER Steve , I've recently had a diagnosis of being pre-diabetic as an Hb1ac test came back one point below the level for a diabetes diagnosis .
['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-E', 'I-E', 'O', 'O', 'B-C', 'I-C', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']


My 90 yr .
['O', 'O', 'O', 'O']


HTTPURL Now to persuade my GP it is worth it !
['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']


Very informative for kids dealing with the newness of T1D .
['O', 'O', 'O', 'O', 'O', 'O', '

In [7]:
trainingData["Causal association"].value_counts()

0.0    7607
1.0    1019
Name: Causal association, dtype: int64

### Training

In [5]:
trainingDataSample = trainingData.sample(n=200)
train = trainingDataSample.sample(frac=0.8, random_state=0)
test = trainingDataSample.drop(train.index)
validate = train.sample(frac=0.2, random_state=0)
train = train.drop(validate.index)
print("Train:", train.shape)
print("Validate:", validate.shape)
print("Test:", test.shape)

Train: (128, 3)
Validate: (32, 3)
Test: (40, 3)


In [6]:

# Transform labels + encodings into Pytorch DataSet object (including __len__, __getitem__)
class TweetDataSet(torch.utils.data.Dataset):
    def __init__(self, text, labels, bio_tags, tokenizer):
        self.text = text
        self.labels = labels
        self.tokenizer = tokenizer
        self.bio_tags = bio_tags
        self.tag2id = {label: idx for idx, label in enumerate(["O", "B-C", "I-C", "B-E", "I-E"])}
        self.tag2id[-100] = -100
        self.id2tag = {id:tag for tag,id in self.tag2id.items()}

    def __getitem__(self, idx):
        inputs = self.tokenizer(self.text, padding=True, truncation=True, return_token_type_ids=True)
        ids = inputs["input_ids"]
        mask = inputs["attention_mask"]
        token_type_ids = inputs["token_type_ids"]
        bio_tags_extended = self.extend_tags(self.text[idx], self.bio_tags[idx], ids[idx])
        assert(len(ids[idx]) == len(bio_tags_extended), "token ids and BIO tags lengths do not match!")
        return {
                "input_ids" : torch.tensor(ids[idx], dtype=torch.long)
              , "attention_mask" : torch.tensor(mask[idx], dtype=torch.long)
              , "token_type_ids" : torch.tensor(token_type_ids[idx], dtype=torch.long)
              , "labels" : torch.tensor(self.labels[idx], dtype=torch.float)
              , "bio_tags" : torch.tensor(list(map(lambda bioTags: self.tag2id[bioTags], bio_tags_extended))
, dtype=torch.long)
        }

    def __len__(self):
        return len(self.labels)

    
    def extend_tags(self, tokens_old, tags_old, ids_tokenized_padded):
        """ 
            Each token has a BIO tag label. 
            However BERT's tokenization splits tokens into subwords. How to label those subwords?
            
            Option 1:
            ---------
            
            add the same label to each subword than the first subword. Only replace "B" by "I"
            Ex. 
            #lowbloodsugar => '#low@@', 'blood@@', 'sugar@@'
               "B-C"       =>   "B-C" ,   "I-C"  ,   "I-C"
            
            Option 2 (implemented):      
            ---------
            
            From : https://huggingface.co/transformers/custom_datasets.html#token-classification-with-w-nut-emerging-entities
            A common obstacle with using pre-trained models for token-level classification: many of the tokens in
            the W-NUT corpus are not in DistilBert’s vocabulary. Bert and many models like it use a method called 
            WordPiece Tokenization, meaning that single words are split into multiple tokens such that each token
            is likely to be in the vocabulary. For example, DistilBert’s tokenizer would split the Twitter 
            handle @huggingface into the tokens ['@', 'hugging', '##face']. This is a problem for us because we 
            have exactly one tag per token. If the tokenizer splits a token into multiple sub-tokens, then we will
            end up with a mismatch between our tokens and our labels.

            One way to handle this is to only train on the tag labels for the first subtoken of a split token. 
            We can do this in 🤗 Transformers by setting the labels we wish to ignore to -100. 
            In the example above, if the label for @HuggingFace is 3 (indexing B-corporation), we would set 
            the labels of ['@', 'hugging', '##face'] to [3, -100, -100].
        """
        tags = [-100] # add for start token <CLS>
        for token_old, tag in zip(tokens_old.split(" "), tags_old):
#            print(F"\ntoken_old: {token_old};    tag: {tag}")
            for i, sub_token in enumerate(self.tokenizer.tokenize(token_old)):
                if (i == 0):
                    tags.append(tag)
                else: 
                    tags.append(-100)
           
        tags.append(-100) # 0 for last token
    
        # append -100 for all padded elements
        padded_elements = ids_tokenized_padded.count(1) # id 1 is <PAD> ; Alternative: where attention_mask == 0 add -100
        tags.extend([-100]*padded_elements)
        
        return tags
        
        
    
tokenizer = AutoTokenizer.from_pretrained("vinai/bertweet-base")

train_dataset = TweetDataSet(train["tweet"].map(normalizeTweet).values.tolist()
                           , train["Causal association"].values.tolist()
                           , train["BIOtags"].values.tolist()
                           , tokenizer)
val_dataset = TweetDataSet(validate["tweet"].map(normalizeTweet).values.tolist()
                           , validate["Causal association"].values.tolist()
                           , validate["BIOtags"].values.tolist()
                           , tokenizer)
test_dataset = TweetDataSet(test["tweet"].map(normalizeTweet).values.tolist()
                           , test["Causal association"].values.tolist()
                           , test["BIOtags"].values.tolist()
                           , tokenizer)
print(len(train_dataset))
print(len(val_dataset))
print(len(test_dataset))

# put data to batches
train_loader = DataLoader(train_dataset, batch_size=2, shuffle=True)
validation_loader = DataLoader(val_dataset, batch_size=8, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=8, shuffle=True)


  assert(len(ids[idx]) == len(bio_tags_extended), "token ids and BIO tags lengths do not match!")
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


128
32
40


In [7]:
# 1) Trainer 
from transformers import DistilBertForSequenceClassification, Trainer, TrainingArguments
from transformers import AutoModelForSequenceClassification
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def compute_metrics(pred, labels):
    """
        Dataset is unbalanced -> measure weighted metrics
        Calculate metrics for each label, and find their average wieghted by support (Number of true instances for each label)
        This alters 'macro' to account for label imbalance;
        it can result in an F-Score taht is not between precision and recall
    """
    precision, recall, f1, _ = precision_recall_fscore_support(labels, pred, average='weighted') #binary
    acc = accuracy_score(labels, pred)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }



class CausalNER(torch.nn.Module):
    """ Model Bert"""
    def __init__(self):
        super(CausalNER, self).__init__()
        self.num_labels = 5 # B-C, I-C, B-E, I-E, O
        self.bert = transformers.BertModel.from_pretrained("vinai/bertweet-base")
        self.dropout = torch.nn.Dropout(0.3)
        self.linear1 = torch.nn.Linear(768, 256)
        self.linear2 = torch.nn.Linear(256, self.num_labels)
        self.softmax = torch.nn.Softmax(-1)
        
    def forward(self, input_ids, attention_mask, token_type_ids):
#        _, output_1 = self.bert(input_ids, attention_mask = attention_mask, token_type_ids=token_type_ids, return_dict=False) # if output 1 is our cls token
        output_seq, _ = self.bert(input_ids, attention_mask = attention_mask, token_type_ids=token_type_ids, return_dict=False) # if output 1 is our cls token
        output_2 = self.dropout(output_seq)
        output_3 = self.linear1(output_2)
        output_4 = self.dropout(output_3)
        output_5 = self.linear2(output_4)
        logit = self.softmax(output_5)
        return logit


## Model parameters
batchsize_train = 16
lr = 5e-5
adam_eps = 1e-8
epochs = 3
num_warmup_steps = 0
num_training_steps = len(train_loader)*epochs

# Store our loss and learning rate for plotting
train_loss_set = []
learning_rate = []


device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

model = CausalNER()
model.to(device)

# fine-tune only the task-specific parameters -> Vivek? 
for param in model.bert.parameters():
    param.requires_grad = False
    


optim = AdamW(model.parameters(), lr=lr, eps=adam_eps)
scheduler = get_linear_schedule_with_warmup(optim, num_warmup_steps=num_warmup_steps, num_training_steps=num_training_steps)
loss_fn = CrossEntropyLoss(ignore_index=-100) # ignore subwords/tokens with label -100 


You are using a model of type roberta to instantiate a model of type bert. This is not supported for all configurations of models and can yield errors.
Some weights of the model checkpoint at vinai/bertweet-base were not used when initializing BertModel: ['roberta.encoder.layer.1.attention.self.query.bias', 'roberta.encoder.layer.7.intermediate.dense.bias', 'roberta.encoder.layer.1.output.dense.bias', 'roberta.encoder.layer.5.attention.output.dense.weight', 'roberta.encoder.layer.3.attention.self.query.weight', 'roberta.encoder.layer.9.attention.output.dense.bias', 'roberta.embeddings.position_ids', 'roberta.encoder.layer.9.attention.output.LayerNorm.bias', 'roberta.encoder.layer.0.attention.self.key.bias', 'roberta.encoder.layer.8.intermediate.dense.weight', 'roberta.encoder.layer.4.attention.self.query.weight', 'roberta.encoder.layer.1.output.LayerNorm.bias', 'roberta.encoder.layer.11.attention.self.value.bias', 'roberta.encoder.layer.3.attention.self.query.bias', 'roberta.encoder.la

Some weights of BertModel were not initialized from the model checkpoint at vinai/bertweet-base and are newly initialized: ['encoder.layer.9.intermediate.dense.weight', 'encoder.layer.1.output.LayerNorm.bias', 'encoder.layer.11.intermediate.dense.weight', 'encoder.layer.11.attention.self.value.weight', 'encoder.layer.3.attention.output.dense.weight', 'encoder.layer.11.attention.self.key.weight', 'encoder.layer.1.attention.output.dense.weight', 'encoder.layer.5.attention.output.dense.weight', 'encoder.layer.8.attention.output.LayerNorm.weight', 'encoder.layer.2.attention.self.key.weight', 'encoder.layer.0.output.LayerNorm.bias', 'encoder.layer.7.output.dense.bias', 'encoder.layer.5.attention.output.LayerNorm.weight', 'encoder.layer.10.attention.self.key.bias', 'encoder.layer.6.attention.self.key.weight', 'encoder.layer.4.attention.output.LayerNorm.bias', 'encoder.layer.9.attention.output.dense.weight', 'encoder.layer.4.intermediate.dense.weight', 'encoder.layer.0.attention.self.value.we

### Training

In [8]:
N_bio_tags = 5 # "O", "B-C", "I-C", "B-E", "I-C"
for epoch in trange(1, epochs+1, desc='Epoch'):
    print("<" + "="*22 + F" Epoch {epoch} "+ "="*22 + ">")

    
    ############ training eval metrics ######################
    tr_accuracy, tr_mcc_accuracy, nb_tr_steps = 0, 0, 0 # Tracking variables
    train_loss = []
    train_acc = []
    train_prec = []
    train_rec = []
    train_f1 = []
    
    #########################################################
    
    
    for batch in tqdm(train_loader):
        optim.zero_grad() # gradients get accumulated by default -> clear previous accumulated gradients
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        token_type_ids = batch["token_type_ids"].to(device)
        labels = batch['labels'].to(device)
        bio_tags = batch['bio_tags'].to(device)
        
        ################################################
        model.train() # set model to training mode
        logits = model(**{"input_ids":input_ids, "attention_mask":attention_mask, "token_type_ids":token_type_ids}) # forward pass

        ################################################ 
        # similar to the class RobertaForToken classification in transformers: https://github.com/huggingface/transformers/blob/master/src/transformers/models/roberta/modeling_roberta.py
        active_loss = attention_mask.view(-1) == 1  # either based on attention_mask (includes <CLS>, <SEP> token)
        #active_loss2 = bio_tags.view(-1) != -100   # excludes all special tokens including <CLS>, <SEP>
        active_logits = logits.view(-1, N_bio_tags)[active_loss] # 5 
        active_tags = bio_tags.view(-1)[active_loss]
        loss = loss_fn(active_logits, active_tags)             
        print("loss:", loss)
        loss.backward() # backward pass
        optim.step()    # update parameters and take a steup using the computed gradient
        scheduler.step()# update learning rate scheduler
        train_loss.append(loss.item())
            
            
        ################## Training Performance Measures ##########
        logits = logits.detach().to('cpu').numpy()
        tags_ids = bio_tags.to('cpu').numpy()

        # calculate performance measures only on tokens and not subwords or special tokens
        tags_mask = tags_ids != -100 # only get token labels and not labels from subwords or special tokens
        pred = np.argmax(logits, axis=2)[tags_mask] #.flatten() # convert logits to list of predicted labels
        tags = tags_ids[tags_mask]#.flatten()
        
        # move logits and labels to CPU
        #logits = logits.squeeze().detach().to('cpu').numpy()
#        label_ids = labels.to('cpu').numpy()
        #label_ids = bio_tags.squeeze().to('cpu').numpy()
        
        #labels_mask = label_ids != -100 # only get token labels and not labels from subwords or special tokens
        #pred_flat = np.argmax(logits, axis=1)[labels_mask] #.flatten() # convert logits to list of predicted labels
        #labels_flat = label_ids[labels_mask]#.flatten()
                          
            
            
        tr_accuracy += accuracy_score(tags, pred)
        tr_mcc_accuracy += matthews_corrcoef(tags, pred)                          
                
        metrics = compute_metrics(pred, tags)
        train_acc.append(metrics["accuracy"])
        train_prec.append(metrics["precision"])
        train_rec.append(metrics["recall"])
        train_f1.append(metrics["f1"])
                          
        nb_tr_steps += 1
           
    print(F'\n\tTraining Loss: {np.mean(train_loss)}')
    print(F'\n\tTraining acc: {np.mean(train_acc)}')
    print(F'\n\tTraining MCC acc: {tr_mcc_accuracy / nb_tr_steps}')
    print(F'\n\tTraining prec: {np.mean(train_prec)}')
    print(F'\n\tTraining rec: {np.mean(train_rec)}')
    print(F'\n\tTraining f1: {np.mean(train_f1)}')
                          
                          
    # store the current learning rate
    for param_group in optim.param_groups:
        print("\n\tCurrent Learning rate: ", param_group['lr'])
        learning_rate.append(param_group['lr'])
    

    ############# Validation ################
    
    eval_accuracy, eval_mcc_accuracy, nb_eval_steps = 0, 0, 0 # Tracking variables
    val_accuracy = []
    val_loss = []
    val_acc = []
    val_prec = []
    val_rec = []
    val_f1 = []

    # Evaluate data for one epoch
    for batch in tqdm(validation_loader):
        batch = tuple(batch[t].to(device) for t in batch)      # batch to GPU
        v_input_ids, v_input_mask, v_token_type_ids, v_labels, v_bio_tags = batch  # unpack inputs from dataloader
        
        with torch.no_grad(): # tell model not to compute or store gradients -> saves memory + speeds up validation
            model.eval() # put model in evaluation mode for validation set
            logits = model(**{"input_ids":v_input_ids, "attention_mask":v_input_mask, "token_type_ids":v_token_type_ids}) # forward pass, calculates logit predictions

        ######################################################
        
        # similar to the class RobertaForToken classification in transformers: https://github.com/huggingface/transformers/blob/master/src/transformers/models/roberta/modeling_roberta.py
        v_active_loss = v_input_mask.view(-1) == 1  # either based on attention_mask (includes <CLS>, <SEP> token)
        v_active_logits = logits.view(-1, N_bio_tags)[v_active_loss] # 5 
        v_active_tags = v_bio_tags.view(-1)[v_active_loss]
        v_loss = loss_fn(v_active_logits, v_active_tags)             
        val_loss.append(v_loss.item())
              
        #########################################################
        logits = logits.detach().to('cpu').numpy()
        tags_ids = v_bio_tags.to('cpu').numpy()

        # calculate performance measures only on tokens and not subwords or special tokens
        tags_mask = tags_ids != -100 # only get token labels and not labels from subwords or special tokens
        pred = np.argmax(logits, axis=2)[tags_mask] #.flatten() # convert logits to list of predicted labels
        tags = tags_ids[tags_mask]#.flatten()        
        
        # move logits and labels to CPU
        #logits = logits.squeeze().detach().to('cpu').numpy()
        #label_ids = b_labels.squeeze().to('cpu').numpy()
        
        #labels_mask = label_ids != -100 # only get token labels and not labels from subwords or special tokens
        #pred_flat = np.argmax(logits, axis=1)[labels_mask] #.flatten() # convert logits to list of predicted labels
        #labels_flat = label_ids[labels_mask]#.flatten()
        
        eval_accuracy += accuracy_score(tags, pred)
        eval_mcc_accuracy += matthews_corrcoef(tags, pred)
        
        metrics = compute_metrics(pred, tags)
        val_acc.append(metrics["accuracy"])
        val_prec.append(metrics["precision"])
        val_rec.append(metrics["recall"])
        val_f1.append(metrics["f1"])
                              
        nb_eval_steps += 1
        
    print(F'\n\tValidation Loss: {np.mean(val_loss)}')
    print(F'\n\tValidation acc: {np.mean(val_acc)}')
    print(F'\n\tValidation MCC acc: {eval_mcc_accuracy / nb_eval_steps}')
    print(F'\n\tValidation prec: {np.mean(val_prec)}')
    print(F'\n\tValidation rec: {np.mean(val_rec)}')
    print(F'\n\tValidation f1: {np.mean(val_f1)}')
    


Epoch:   0%|          | 0/3 [00:00<?, ?it/s]
  0%|          | 0/64 [00:00<?, ?it/s][A



  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  _warn_prf(average, modifier, msg_start, len(result))

  2%|▏         | 1/64 [00:00<00:48,  1.29it/s][A

loss: tensor(1.6065, grad_fn=<NllLossBackward>)


  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  _warn_prf(average, modifier, msg_start, len(result))

  3%|▎         | 2/64 [00:01<00:31,  2.00it/s][A

loss: tensor(1.5727, grad_fn=<NllLossBackward>)


  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  _warn_prf(average, modifier, msg_start, len(result))

  5%|▍         | 3/64 [00:01<00:25,  2.35it/s][A

loss: tensor(1.5639, grad_fn=<NllLossBackward>)


  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  _warn_prf(average, modifier, msg_start, len(result))

  6%|▋         | 4/64 [00:01<00:22,  2.73it/s][A

loss: tensor(1.5055, grad_fn=<NllLossBackward>)


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))

  _warn_prf(average, modifier, msg_start, len(result))

  9%|▉         | 6/64 [00:02<00:16,  3.48it/s][A

loss: tensor(1.4856, grad_fn=<NllLossBackward>)
loss: tensor(1.4894, grad_fn=<NllLossBackward>)


  _warn_prf(average, modifier, msg_start, len(result))

  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  _warn_prf(average, modifier, msg_start, len(result))

 12%|█▎        | 8/64 [00:02<00:13,  4.26it/s][A

loss: tensor(1.4373, grad_fn=<NllLossBackward>)
loss: tensor(1.3561, grad_fn=<NllLossBackward>)


  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  _warn_prf(average, modifier, msg_start, len(result))

 14%|█▍        | 9/64 [00:02<00:12,  4.50it/s][A

loss: tensor(1.3339, grad_fn=<NllLossBackward>)


  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  _warn_prf(average, modifier, msg_start, len(result))

 16%|█▌        | 10/64 [00:02<00:11,  4.56it/s][A

loss: tensor(1.2971, grad_fn=<NllLossBackward>)


  _warn_prf(average, modifier, msg_start, len(result))

 17%|█▋        | 11/64 [00:03<00:11,  4.56it/s][A

loss: tensor(1.3600, grad_fn=<NllLossBackward>)


  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)

 19%|█▉        | 12/64 [00:03<00:11,  4.42it/s][A

loss: tensor(1.2211, grad_fn=<NllLossBackward>)


  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)

 20%|██        | 13/64 [00:03<00:11,  4.51it/s][A

loss: tensor(1.2458, grad_fn=<NllLossBackward>)
loss: tensor(1.2060, grad_fn=<NllLossBackward>)


  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)

  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)

  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)



loss: tensor(1.1280, grad_fn=<NllLossBackward>)
loss: tensor(1.1098, grad_fn=<NllLossBackward>)


  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)

  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  _warn_prf(average, modifier, msg_start, len(result))

 28%|██▊       | 18/64 [00:04<00:09,  4.92it/s][A

loss: tensor(1.1024, grad_fn=<NllLossBackward>)
loss: tensor(1.1682, grad_fn=<NllLossBackward>)


  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)

 30%|██▉       | 19/64 [00:04<00:09,  4.66it/s][A

loss: tensor(1.0491, grad_fn=<NllLossBackward>)


  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  _warn_prf(average, modifier, msg_start, len(result))

  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)

 33%|███▎      | 21/64 [00:05<00:09,  4.57it/s][A

loss: tensor(1.1133, grad_fn=<NllLossBackward>)
loss: tensor(1.0156, grad_fn=<NllLossBackward>)


  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)

 34%|███▍      | 22/64 [00:05<00:08,  4.73it/s][A

loss: tensor(0.9996, grad_fn=<NllLossBackward>)


  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)

 36%|███▌      | 23/64 [00:05<00:08,  4.66it/s][A

loss: tensor(0.9979, grad_fn=<NllLossBackward>)


  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)

 38%|███▊      | 24/64 [00:06<00:09,  4.39it/s][A

loss: tensor(0.9787, grad_fn=<NllLossBackward>)
loss: tensor(1.0474, grad_fn=<NllLossBackward>)


  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  _warn_prf(average, modifier, msg_start, len(result))

  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  _warn_prf(average, modifier, msg_start, len(result))

 41%|████      | 26/64 [00:06<00:07,  4.83it/s][A

loss: tensor(1.2054, grad_fn=<NllLossBackward>)


  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)

 42%|████▏     | 27/64 [00:06<00:07,  4.69it/s][A

loss: tensor(0.9661, grad_fn=<NllLossBackward>)


  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)

  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)

 45%|████▌     | 29/64 [00:07<00:07,  4.42it/s][A

loss: tensor(0.9508, grad_fn=<NllLossBackward>)
loss: tensor(0.9529, grad_fn=<NllLossBackward>)


  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)

 47%|████▋     | 30/64 [00:07<00:07,  4.39it/s][A

loss: tensor(0.9452, grad_fn=<NllLossBackward>)
loss: 

  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)

 48%|████▊     | 31/64 [00:07<00:07,  4.54it/s][A

tensor(0.9435, grad_fn=<NllLossBackward>)


  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)

  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  _warn_prf(average, modifier, msg_start, len(result))

 52%|█████▏    | 33/64 [00:07<00:06,  4.78it/s][A

loss: tensor(0.9391, grad_fn=<NllLossBackward>)
loss: tensor(1.0137, grad_fn=<NllLossBackward>)


  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  _warn_prf(average, modifier, msg_start, len(result))

 53%|█████▎    | 34/64 [00:08<00:06,  4.81it/s][A

loss: tensor(1.0210, grad_fn=<NllLossBackward>)


  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)

 55%|█████▍    | 35/64 [00:08<00:06,  4.49it/s][A

loss: tensor(0.9327, grad_fn=<NllLossBackward>)


  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)

 56%|█████▋    | 36/64 [00:08<00:06,  4.36it/s][A

loss: tensor(0.9299, grad_fn=<NllLossBackward>)


  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)

 58%|█████▊    | 37/64 [00:08<00:06,  4.31it/s][A

loss: tensor(0.9286, grad_fn=<NllLossBackward>)


  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)

  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  _warn_prf(average, modifier, msg_start, len(result))

 61%|██████    | 39/64 [00:09<00:05,  4.53it/s][A

loss: tensor(0.9297, grad_fn=<NllLossBackward>)
loss: tensor(1.0413, grad_fn=<NllLossBackward>)


  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  _warn_prf(average, modifier, msg_start, len(result))

 62%|██████▎   | 40/64 [00:09<00:05,  4.74it/s][A

loss: tensor(0.9785, grad_fn=<NllLossBackward>)


  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)

 64%|██████▍   | 41/64 [00:09<00:04,  4.65it/s][A

loss: tensor(0.9264, grad_fn=<NllLossBackward>)


  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)

 66%|██████▌   | 42/64 [00:10<00:05,  4.11it/s][A

loss: tensor(0.9252, grad_fn=<NllLossBackward>)


  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  _warn_prf(average, modifier, msg_start, len(result))

  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)

 69%|██████▉   | 44/64 [00:10<00:04,  4.14it/s][A

loss: tensor(0.9718, grad_fn=<NllLossBackward>)
loss: tensor(0.9207, grad_fn=<NllLossBackward>)


  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)

 70%|███████   | 45/64 [00:10<00:04,  4.30it/s][A

loss: tensor(0.9220, grad_fn=<NllLossBackward>)


  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)

 72%|███████▏  | 46/64 [00:11<00:04,  4.18it/s][A

loss: tensor(0.9202, grad_fn=<NllLossBackward>)


  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)

 73%|███████▎  | 47/64 [00:11<00:03,  4.35it/s][A

loss: tensor(0.9251, grad_fn=<NllLossBackward>)


  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)

 75%|███████▌  | 48/64 [00:11<00:03,  4.06it/s][A

loss: tensor(0.9201, grad_fn=<NllLossBackward>)


  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)

  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)

 78%|███████▊  | 50/64 [00:11<00:03,  4.25it/s][A

loss: tensor(0.9204, grad_fn=<NllLossBackward>)
loss: tensor(0.9194, grad_fn=<NllLossBackward>)


  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)

  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)

 81%|████████▏ | 52/64 [00:12<00:02,  4.65it/s][A

loss: tensor(0.9186, grad_fn=<NllLossBackward>)
loss: tensor(0.9223, grad_fn=<NllLossBackward>)


  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)

 83%|████████▎ | 53/64 [00:12<00:02,  4.14it/s][A

loss: tensor(0.9180, grad_fn=<NllLossBackward>)


  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)

 84%|████████▍ | 54/64 [00:12<00:02,  3.90it/s][A

loss: tensor(0.9150, grad_fn=<NllLossBackward>)


  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)

 86%|████████▌ | 55/64 [00:13<00:02,  3.87it/s][A

loss: tensor(0.9165, grad_fn=<NllLossBackward>)


  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)

 88%|████████▊ | 56/64 [00:13<00:02,  3.89it/s][A

loss: tensor(0.9180, grad_fn=<NllLossBackward>)


  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)

  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  _warn_prf(average, modifier, msg_start, len(result))

 91%|█████████ | 58/64 [00:13<00:01,  4.20it/s][A

loss: tensor(0.9175, grad_fn=<NllLossBackward>)
loss: tensor(1.1270, grad_fn=<NllLossBackward>)


  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)

  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  _warn_prf(average, modifier, msg_start, len(result))

 94%|█████████▍| 60/64 [00:14<00:00,  4.69it/s][A

loss: tensor(0.9174, grad_fn=<NllLossBackward>)
loss: tensor(1.2129, grad_fn=<NllLossBackward>)


  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)

  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)

 97%|█████████▋| 62/64 [00:14<00:00,  4.78it/s][A

loss: tensor(0.9122, grad_fn=<NllLossBackward>)
loss: tensor(0.9168, grad_fn=<NllLossBackward>)


  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)

 98%|█████████▊| 63/64 [00:14<00:00,  4.76it/s][A

loss: tensor(0.9156, grad_fn=<NllLossBackward>)


  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)

100%|██████████| 64/64 [00:15<00:00,  4.21it/s][A

  0%|          | 0/4 [00:00<?, ?it/s][A

loss: tensor(0.9160, grad_fn=<NllLossBackward>)

	Training Loss: 1.0731903361156583

	Training acc: 0.9139520875586123

	Training MCC acc: -0.0010303733675085152

	Training prec: 0.9458404984702412

	Training rec: 0.9139520875586123

	Training f1: 0.9175125310063432

	Current Learning rate:  3.3333333333333335e-05


  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)

  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  _warn_prf(average, modifier, msg_start, len(result))

  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)

  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  _warn_prf(average, modifier, msg_start, len(result))

100%|██████████| 4/4 [00:02<00:00,  1.82it/s][A
Epoch:  33%|███▎      | 1/3 [00:17<00:34, 17.42s/it]
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)

  2%|▏         | 1/64 [00:00<00:11,  5.45it/s][A


	Validation Loss: 0.9212558716535568

	Validation acc: 0.9875164690382081

	Validation MCC acc: 0.0

	Validation prec: 0.9753927876461816

	Validation rec: 0.9875164690382081

	Validation f1: 0.9813661137840431
loss: tensor(0.9163, grad_fn=<NllLossBackward>)


  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  _warn_prf(average, modifier, msg_start, len(result))

  3%|▎         | 2/64 [00:00<00:12,  5.14it/s][A

loss: tensor(1.0806, grad_fn=<NllLossBackward>)


  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)

  5%|▍         | 3/64 [00:00<00:13,  4.43it/s][A

loss: tensor(0.9135, grad_fn=<NllLossBackward>)


  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)

  6%|▋         | 4/64 [00:00<00:13,  4.54it/s][A

loss: tensor(0.9204, grad_fn=<NllLossBackward>)


  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)

  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)

  9%|▉         | 6/64 [00:01<00:11,  4.87it/s][A

loss: tensor(0.9164, grad_fn=<NllLossBackward>)
loss: tensor(0.9151, grad_fn=<NllLossBackward>)


  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)

 11%|█         | 7/64 [00:01<00:13,  4.35it/s][A

loss: tensor(0.9136, grad_fn=<NllLossBackward>)


  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)

 12%|█▎        | 8/64 [00:01<00:13,  4.01it/s][A

loss: tensor(0.9127, grad_fn=<NllLossBackward>)


  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)

 14%|█▍        | 9/64 [00:02<00:13,  3.98it/s][A

loss: tensor(0.9153, grad_fn=<NllLossBackward>)


  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  _warn_prf(average, modifier, msg_start, len(result))

 16%|█▌        | 10/64 [00:02<00:12,  4.17it/s][A

loss: tensor(0.9892, grad_fn=<NllLossBackward>)


  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)

 17%|█▋        | 11/64 [00:02<00:12,  4.11it/s][A

loss: tensor(0.9132, grad_fn=<NllLossBackward>)


  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  _warn_prf(average, modifier, msg_start, len(result))

  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)

 20%|██        | 13/64 [00:03<00:11,  4.27it/s]

loss: tensor(0.9957, grad_fn=<NllLossBackward>)
loss: tensor(0.9136, grad_fn=<NllLossBackward>)


  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)

 22%|██▏       | 14/64 [00:03<00:13,  3.81it/s][A

loss: tensor(0.9125, grad_fn=<NllLossBackward>)


  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)

 23%|██▎       | 15/64 [00:03<00:12,  3.93it/s][A

loss: tensor(0.9121, grad_fn=<NllLossBackward>)


  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  _warn_prf(average, modifier, msg_start, len(result))

 25%|██▌       | 16/64 [00:03<00:13,  3.63it/s][A

loss: tensor(1.0576, grad_fn=<NllLossBackward>)


  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)

 27%|██▋       | 17/64 [00:04<00:11,  3.93it/s][A

loss: tensor(0.9122, grad_fn=<NllLossBackward>)


  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  _warn_prf(average, modifier, msg_start, len(result))

  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  _warn_prf(average, modifier, msg_start, len(result))

 30%|██▉       | 19/64 [00:04<00:10,  4.29it/s][A

loss: tensor(1.0105, grad_fn=<NllLossBackward>)
loss: tensor(1.0026, grad_fn=<NllLossBackward>)


  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)

  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)

 33%|███▎      | 21/64 [00:04<00:08,  4.78it/s][A

loss: tensor(0.9120, grad_fn=<NllLossBackward>)
loss: tensor(0.9139, grad_fn=<NllLossBackward>)


  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)

 34%|███▍      | 22/64 [00:05<00:08,  4.82it/s][A

loss: tensor(0.9125, grad_fn=<NllLossBackward>)
loss: tensor(0.9120, grad_fn=<NllLossBackward>)


  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)

  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)

 38%|███▊      | 24/64 [00:05<00:08,  4.51it/s][A

loss: tensor(0.9121, grad_fn=<NllLossBackward>)


  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)

 39%|███▉      | 25/64 [00:05<00:09,  4.10it/s][A

loss: tensor(0.9115, grad_fn=<NllLossBackward>)


  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)

 41%|████      | 26/64 [00:06<00:09,  4.00it/s][A

loss: tensor(0.9140, grad_fn=<NllLossBackward>)


  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)

 42%|████▏     | 27/64 [00:06<00:08,  4.23it/s][A

loss: tensor(0.9112, grad_fn=<NllLossBackward>)
loss: tensor(0.9137, grad_fn=<NllLossBackward>)


  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)

  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  _warn_prf(average, modifier, msg_start, len(result))

 45%|████▌     | 29/64 [00:06<00:07,  4.66it/s][A

loss: tensor(0.9738, grad_fn=<NllLossBackward>)


  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)

 47%|████▋     | 30/64 [00:07<00:08,  4.15it/s][A

loss: tensor(0.9112, grad_fn=<NllLossBackward>)


  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)

 48%|████▊     | 31/64 [00:07<00:08,  4.04it/s][A

loss: tensor(0.9113, grad_fn=<NllLossBackward>)


  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)

 50%|█████     | 32/64 [00:07<00:07,  4.21it/s][A

loss: tensor(0.9116, grad_fn=<NllLossBackward>)
loss: tensor(1.0685, grad_fn=<NllLossBackward>)


  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  _warn_prf(average, modifier, msg_start, len(result))

  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)

  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)

 55%|█████▍    | 35/64 [00:08<00:06,  4.67it/s][A

loss: tensor(0.9102, grad_fn=<NllLossBackward>)
loss: tensor(0.9135, grad_fn=<NllLossBackward>)


  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)

 56%|█████▋    | 36/64 [00:08<00:05,  4.79it/s][A

loss: tensor(0.9090, grad_fn=<NllLossBackward>)


  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)

 58%|█████▊    | 37/64 [00:08<00:05,  4.55it/s][A

loss: tensor(0.9124, grad_fn=<NllLossBackward>)


  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)

 59%|█████▉    | 38/64 [00:08<00:05,  4.65it/s][A

loss: tensor(0.9124, grad_fn=<NllLossBackward>)


  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)

 61%|██████    | 39/64 [00:09<00:05,  4.49it/s][A

loss: tensor(0.9114, grad_fn=<NllLossBackward>)


  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  _warn_prf(average, modifier, msg_start, len(result))

 62%|██████▎   | 40/64 [00:09<00:05,  4.46it/s][A

loss: tensor(1.1414, grad_fn=<NllLossBackward>)


  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)

 64%|██████▍   | 41/64 [00:09<00:05,  4.46it/s][A

loss: tensor(0.9124, grad_fn=<NllLossBackward>)


  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)

  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)

 67%|██████▋   | 43/64 [00:09<00:04,  4.54it/s][A

loss: tensor(0.9118, grad_fn=<NllLossBackward>)
loss: tensor(0.9108, grad_fn=<NllLossBackward>)


  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)

 69%|██████▉   | 44/64 [00:10<00:04,  4.20it/s][A

loss: tensor(0.9107, grad_fn=<NllLossBackward>)


  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  _warn_prf(average, modifier, msg_start, len(result))

 70%|███████   | 45/64 [00:10<00:04,  3.97it/s][A

loss: tensor(0.9659, grad_fn=<NllLossBackward>)


  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  _warn_prf(average, modifier, msg_start, len(result))

 72%|███████▏  | 46/64 [00:10<00:04,  3.82it/s][A

loss: tensor(0.9981, grad_fn=<NllLossBackward>)


  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)

 73%|███████▎  | 47/64 [00:11<00:04,  3.82it/s][A

loss: tensor(0.9111, grad_fn=<NllLossBackward>)


  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)

  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)

 77%|███████▋  | 49/64 [00:11<00:03,  4.27it/s][A

loss: tensor(0.9108, grad_fn=<NllLossBackward>)
loss: tensor(0.9099, grad_fn=<NllLossBackward>)


  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)

 78%|███████▊  | 50/64 [00:11<00:03,  4.48it/s][A

loss: tensor(0.9101, grad_fn=<NllLossBackward>)
loss: 

  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  _warn_prf(average, modifier, msg_start, len(result))

 80%|███████▉  | 51/64 [00:11<00:02,  4.60it/s][A

tensor(1.0436, grad_fn=<NllLossBackward>)


  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  _warn_prf(average, modifier, msg_start, len(result))

 81%|████████▏ | 52/64 [00:12<00:02,  4.42it/s][A

loss: tensor(1.0048, grad_fn=<NllLossBackward>)


  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)

 83%|████████▎ | 53/64 [00:12<00:02,  4.28it/s][A

loss: tensor(0.9107, grad_fn=<NllLossBackward>)


  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)

 84%|████████▍ | 54/64 [00:12<00:02,  3.88it/s][A

loss: tensor(0.9129, grad_fn=<NllLossBackward>)


  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  _warn_prf(average, modifier, msg_start, len(result))

 86%|████████▌ | 55/64 [00:12<00:02,  4.04it/s][A

loss: tensor(1.1166, grad_fn=<NllLossBackward>)


  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)

 88%|████████▊ | 56/64 [00:13<00:01,  4.22it/s][A

loss: tensor(0.9090, grad_fn=<NllLossBackward>)


  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)

 89%|████████▉ | 57/64 [00:13<00:01,  4.01it/s][A

loss: tensor(0.9097, grad_fn=<NllLossBackward>)


  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)

 91%|█████████ | 58/64 [00:13<00:01,  4.01it/s][A

loss: tensor(0.9107, grad_fn=<NllLossBackward>)


  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)

 92%|█████████▏| 59/64 [00:13<00:01,  4.07it/s][A

loss: tensor(0.9102, grad_fn=<NllLossBackward>)


  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)

 94%|█████████▍| 60/64 [00:14<00:01,  3.84it/s][A

loss: tensor(0.9101, grad_fn=<NllLossBackward>)


  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  _warn_prf(average, modifier, msg_start, len(result))

 95%|█████████▌| 61/64 [00:14<00:00,  3.59it/s][A

loss: tensor(0.9747, grad_fn=<NllLossBackward>)


  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)

 97%|█████████▋| 62/64 [00:14<00:00,  3.62it/s][A

loss: tensor(0.9095, grad_fn=<NllLossBackward>)


  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)

 98%|█████████▊| 63/64 [00:15<00:00,  3.70it/s][A

loss: tensor(0.9112, grad_fn=<NllLossBackward>)


  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  _warn_prf(average, modifier, msg_start, len(result))

100%|██████████| 64/64 [00:15<00:00,  4.20it/s][A

  0%|          | 0/4 [00:00<?, ?it/s][A

loss: tensor(0.9861, grad_fn=<NllLossBackward>)

	Training Loss: 0.9405305124819279

	Training acc: 0.971285720087811

	Training MCC acc: 0.0

	Training prec: 0.9465418281502342

	Training rec: 0.971285720087811

	Training f1: 0.958009344689859

	Current Learning rate:  1.6666666666666667e-05


  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)

  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)

  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)

  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  _warn_prf(average, modifier, msg_start, len(result))

100%|██████████| 4/4 [00:02<00:00,  1.77it/s][A
Epoch:  67%|██████▋   | 2/3 [00:34<00:17, 17.48s/it]
  0%|          | 0/64 [00:00<?, ?it/s][A


	Validation Loss: 0.9228370785713196

	Validation acc: 0.9838709677419355

	Validation MCC acc: 0.0

	Validation prec: 0.9687825182101977

	Validation rec: 0.9838709677419355

	Validation f1: 0.9760752688172043


  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)

  2%|▏         | 1/64 [00:00<00:14,  4.50it/s][A

loss: tensor(0.9111, grad_fn=<NllLossBackward>)


  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  _warn_prf(average, modifier, msg_start, len(result))

  3%|▎         | 2/64 [00:00<00:14,  4.37it/s][A

loss: tensor(0.9781, grad_fn=<NllLossBackward>)


  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)

  5%|▍         | 3/64 [00:00<00:13,  4.56it/s][A

loss: tensor(0.9103, grad_fn=<NllLossBackward>)


  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)

  6%|▋         | 4/64 [00:00<00:13,  4.57it/s][A

loss: tensor(0.9099, grad_fn=<NllLossBackward>)


  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)

  8%|▊         | 5/64 [00:01<00:12,  4.68it/s][A

loss: tensor(0.9106, grad_fn=<NllLossBackward>)


  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  _warn_prf(average, modifier, msg_start, len(result))

  9%|▉         | 6/64 [00:01<00:12,  4.69it/s][A

loss: tensor(1.0096, grad_fn=<NllLossBackward>)


  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  _warn_prf(average, modifier, msg_start, len(result))

 11%|█         | 7/64 [00:01<00:12,  4.41it/s][A

loss: tensor(1.0814, grad_fn=<NllLossBackward>)


  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)

 12%|█▎        | 8/64 [00:01<00:12,  4.51it/s][A

loss: tensor(0.9116, grad_fn=<NllLossBackward>)


  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)

 14%|█▍        | 9/64 [00:01<00:12,  4.45it/s][A

loss: tensor(0.9093, grad_fn=<NllLossBackward>)


  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)

 16%|█▌        | 10/64 [00:02<00:12,  4.38it/s][A

loss: tensor(0.9099, grad_fn=<NllLossBackward>)


  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  _warn_prf(average, modifier, msg_start, len(result))

 17%|█▋        | 11/64 [00:02<00:11,  4.46it/s][A

loss: tensor(1.2082, grad_fn=<NllLossBackward>)


  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)

 19%|█▉        | 12/64 [00:02<00:12,  4.10it/s][A

loss: tensor(0.9095, grad_fn=<NllLossBackward>)


  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)

 20%|██        | 13/64 [00:02<00:11,  4.31it/s][A

loss: tensor(0.9082, grad_fn=<NllLossBackward>)


  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  _warn_prf(average, modifier, msg_start, len(result))

 22%|██▏       | 14/64 [00:03<00:11,  4.42it/s][A

loss: tensor(1.0099, grad_fn=<NllLossBackward>)


  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)

 23%|██▎       | 15/64 [00:03<00:12,  4.00it/s][A

loss: tensor(0.9098, grad_fn=<NllLossBackward>)


  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  _warn_prf(average, modifier, msg_start, len(result))

 25%|██▌       | 16/64 [00:03<00:11,  4.08it/s][A

loss: tensor(0.9890, grad_fn=<NllLossBackward>)


  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  _warn_prf(average, modifier, msg_start, len(result))

 27%|██▋       | 17/64 [00:03<00:10,  4.28it/s][A

loss: tensor(1.1422, grad_fn=<NllLossBackward>)


  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)

 28%|██▊       | 18/64 [00:04<00:10,  4.44it/s][A

loss: tensor(0.9100, grad_fn=<NllLossBackward>)


  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)

 30%|██▉       | 19/64 [00:04<00:10,  4.42it/s][A

loss: tensor(0.9107, grad_fn=<NllLossBackward>)


  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)

 31%|███▏      | 20/64 [00:04<00:09,  4.46it/s][A

loss: tensor(0.9090, grad_fn=<NllLossBackward>)


  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  _warn_prf(average, modifier, msg_start, len(result))

 33%|███▎      | 21/64 [00:04<00:09,  4.55it/s][A

loss: tensor(0.9832, grad_fn=<NllLossBackward>)


  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  _warn_prf(average, modifier, msg_start, len(result))

 34%|███▍      | 22/64 [00:04<00:09,  4.58it/s][A

loss: tensor(0.9693, grad_fn=<NllLossBackward>)


  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)

 36%|███▌      | 23/64 [00:05<00:09,  4.24it/s][A

loss: tensor(0.9094, grad_fn=<NllLossBackward>)


  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)

 38%|███▊      | 24/64 [00:05<00:09,  4.27it/s][A

loss: tensor(0.9086, grad_fn=<NllLossBackward>)


  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)

 39%|███▉      | 25/64 [00:05<00:08,  4.39it/s][A

loss: tensor(0.9103, grad_fn=<NllLossBackward>)


  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)

 41%|████      | 26/64 [00:05<00:08,  4.44it/s][A

loss: tensor(0.9106, grad_fn=<NllLossBackward>)


  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)

 42%|████▏     | 27/64 [00:06<00:08,  4.43it/s][A

loss: tensor(0.9088, grad_fn=<NllLossBackward>)


  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)

 44%|████▍     | 28/64 [00:06<00:07,  4.53it/s][A

loss: tensor(0.9086, grad_fn=<NllLossBackward>)


  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)

 45%|████▌     | 29/64 [00:06<00:07,  4.56it/s][A

loss: tensor(0.9105, grad_fn=<NllLossBackward>)


  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)

 47%|████▋     | 30/64 [00:06<00:07,  4.50it/s][A

loss: tensor(0.9107, grad_fn=<NllLossBackward>)


  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)

 48%|████▊     | 31/64 [00:07<00:07,  4.61it/s][A

loss: tensor(0.9109, grad_fn=<NllLossBackward>)


  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)

 50%|█████     | 32/64 [00:07<00:06,  4.69it/s][A

loss: tensor(0.9091, grad_fn=<NllLossBackward>)


  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)

 52%|█████▏    | 33/64 [00:07<00:06,  4.74it/s][A

loss: tensor(0.9087, grad_fn=<NllLossBackward>)


  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)

 53%|█████▎    | 34/64 [00:07<00:06,  4.80it/s][A

loss: tensor(0.9087, grad_fn=<NllLossBackward>)


  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  _warn_prf(average, modifier, msg_start, len(result))

 55%|█████▍    | 35/64 [00:07<00:06,  4.24it/s][A

loss: tensor(1.0662, grad_fn=<NllLossBackward>)


  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  _warn_prf(average, modifier, msg_start, len(result))

 56%|█████▋    | 36/64 [00:08<00:07,  3.96it/s][A

loss: tensor(0.9848, grad_fn=<NllLossBackward>)
loss: tensor(0.9084, grad_fn=<NllLossBackward>)


  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)

  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)

 59%|█████▉    | 38/64 [00:08<00:05,  4.39it/s][A

loss: tensor(0.9083, grad_fn=<NllLossBackward>)
loss: tensor(0.9085, grad_fn=<NllLossBackward>)


  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)

  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)

 62%|██████▎   | 40/64 [00:09<00:05,  4.64it/s][A

loss: tensor(0.9094, grad_fn=<NllLossBackward>)


  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)

 64%|██████▍   | 41/64 [00:09<00:05,  4.54it/s][A

loss: tensor(0.9095, grad_fn=<NllLossBackward>)


  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)

 66%|██████▌   | 42/64 [00:09<00:05,  4.21it/s][A

loss: tensor(0.9109, grad_fn=<NllLossBackward>)


  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  _warn_prf(average, modifier, msg_start, len(result))

 67%|██████▋   | 43/64 [00:09<00:05,  3.84it/s][A

loss: tensor(0.9952, grad_fn=<NllLossBackward>)


  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  _warn_prf(average, modifier, msg_start, len(result))

 69%|██████▉   | 44/64 [00:10<00:05,  3.97it/s][A

loss: tensor(0.9861, grad_fn=<NllLossBackward>)


  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)

 70%|███████   | 45/64 [00:10<00:04,  4.11it/s][A

loss: tensor(0.9095, grad_fn=<NllLossBackward>)


  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)

 72%|███████▏  | 46/64 [00:10<00:04,  4.20it/s][A

loss: tensor(0.9096, grad_fn=<NllLossBackward>)


  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)

 73%|███████▎  | 47/64 [00:10<00:03,  4.38it/s][A

loss: tensor(0.9084, grad_fn=<NllLossBackward>)


  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  _warn_prf(average, modifier, msg_start, len(result))

 75%|███████▌  | 48/64 [00:11<00:04,  3.97it/s][A

loss: tensor(1.1099, grad_fn=<NllLossBackward>)


  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)

 77%|███████▋  | 49/64 [00:11<00:03,  3.78it/s][A

loss: tensor(0.9108, grad_fn=<NllLossBackward>)


  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)

 78%|███████▊  | 50/64 [00:11<00:03,  3.98it/s][A

loss: tensor(0.9098, grad_fn=<NllLossBackward>)


  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)

 80%|███████▉  | 51/64 [00:11<00:03,  4.06it/s][A

loss: tensor(0.9095, grad_fn=<NllLossBackward>)


  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)

 81%|████████▏ | 52/64 [00:12<00:03,  3.99it/s][A

loss: tensor(0.9094, grad_fn=<NllLossBackward>)


  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)

 83%|████████▎ | 53/64 [00:12<00:02,  3.72it/s][A

loss: tensor(0.9091, grad_fn=<NllLossBackward>)


  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)

 84%|████████▍ | 54/64 [00:12<00:02,  3.64it/s][A

loss: tensor(0.9096, grad_fn=<NllLossBackward>)


  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  _warn_prf(average, modifier, msg_start, len(result))

 86%|████████▌ | 55/64 [00:12<00:02,  3.57it/s][A

loss: tensor(1.0683, grad_fn=<NllLossBackward>)


  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  _warn_prf(average, modifier, msg_start, len(result))

 88%|████████▊ | 56/64 [00:13<00:02,  3.58it/s][A

loss: tensor(1.1072, grad_fn=<NllLossBackward>)


  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)

 89%|████████▉ | 57/64 [00:13<00:01,  3.74it/s][A

loss: tensor(0.9091, grad_fn=<NllLossBackward>)


  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)

 91%|█████████ | 58/64 [00:13<00:01,  3.62it/s][A

loss: tensor(0.9082, grad_fn=<NllLossBackward>)


  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)

 92%|█████████▏| 59/64 [00:13<00:01,  3.90it/s][A

loss: tensor(0.9092, grad_fn=<NllLossBackward>)


  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)

 94%|█████████▍| 60/64 [00:14<00:01,  3.94it/s][A

loss: tensor(0.9096, grad_fn=<NllLossBackward>)


  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)

 95%|█████████▌| 61/64 [00:14<00:00,  3.86it/s][A

loss: tensor(0.9075, grad_fn=<NllLossBackward>)


  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)

 97%|█████████▋| 62/64 [00:14<00:00,  3.54it/s][A

loss: tensor(0.9076, grad_fn=<NllLossBackward>)


  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)

 98%|█████████▊| 63/64 [00:15<00:00,  3.33it/s][A

loss: tensor(0.9116, grad_fn=<NllLossBackward>)


  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  _warn_prf(average, modifier, msg_start, len(result))

100%|██████████| 64/64 [00:15<00:00,  4.15it/s][A

  0%|          | 0/4 [00:00<?, ?it/s][A

loss: tensor(1.1659, grad_fn=<NllLossBackward>)

	Training Loss: 0.9469197122380137

	Training acc: 0.9624451150070873

	Training MCC acc: 0.0

	Training prec: 0.9316285439700391

	Training rec: 0.9624451150070873

	Training f1: 0.9455545101859788

	Current Learning rate:  0.0


  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  _warn_prf(average, modifier, msg_start, len(result))

  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)

  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)

  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  _warn_prf(average, modifier, msg_start, len(result))

100%|██████████| 4/4 [00:02<00:00,  1.77it/s][A
Epoch: 100%|██████████| 3/3 [00:52<00:00, 17.54s/it]


	Validation Loss: 0.9177711308002472

	Validation acc: 0.9886350689301583

	Validation MCC acc: 0.0

	Validation prec: 0.9775589162010786

	Validation rec: 0.9886350689301583

	Validation f1: 0.9830258061006851





### Evaluation on the test dataset

In [15]:

############ test eval metrics ######################
test_mcc_accuracy, nb_test_steps = 0, 0# Tracking variables
test_loss = []
test_acc = []
test_prec = []
test_rec = []
test_f1 = []

########################################################
for batch in tqdm(test_loader):
    batch = tuple(batch[t].to(device) for t in batch)      # batch to GPU
    t_input_ids, t_input_mask, t_token_type_ids, t_labels, t_bio_tags = batch     # unpack inputs from dataloader

    with torch.no_grad(): # tell model not to compute or store gradients -> saves memory + speeds up validation
        model.eval() # put model in evaluation mode for validation set
        logits = model(**{"input_ids":t_input_ids, "attention_mask":t_input_mask, "token_type_ids":t_token_type_ids}) # forward pass, calculates logit predictions

    ######################################################

    # similar to the class RobertaForToken classification in transformers: https://github.com/huggingface/transformers/blob/master/src/transformers/models/roberta/modeling_roberta.py
    t_active_loss = t_input_mask.view(-1) == 1  # either based on attention_mask (includes <CLS>, <SEP> token)
    t_active_logits = logits.view(-1, N_bio_tags)[t_active_loss] # 5 
    t_active_tags = t_bio_tags.view(-1)[t_active_loss]
    t_loss = loss_fn(t_active_logits, t_active_tags)             
    test_loss.append(t_loss.item())

    #########################################################
    logits = logits.detach().to('cpu').numpy()
    tags_ids = t_bio_tags.to('cpu').numpy()

    # calculate performance measures only on tokens and not subwords or special tokens
    tags_mask = tags_ids != -100 # only get token labels and not labels from subwords or special tokens
    pred = np.argmax(logits, axis=2)[tags_mask] #.flatten() # convert logits to list of predicted labels
    tags = tags_ids[tags_mask]#.flatten()                          

    test_mcc_accuracy += matthews_corrcoef(tags, pred)
    metrics = compute_metrics(pred, tags)
    test_acc.append(metrics["accuracy"])
    test_prec.append(metrics["precision"])
    test_rec.append(metrics["recall"])
    test_f1.append(metrics["f1"])

    nb_test_steps += 1

print(F'\n\tTest Loss: {np.mean(test_loss)}')
print(F'\n\tTest acc: {np.mean(test_acc)}')
print(F'\n\tTest MCC acc: {eval_mcc_accuracy / nb_eval_steps}')
print(F'\n\tTest prec: {np.mean(test_prec)}')
print(F'\n\tTest rec: {np.mean(test_rec)}')
print(F'\n\tTest f1: {np.mean(test_f1)}')


  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  _warn_prf(average, modifier, msg_start, len(result))
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  _warn_prf(average, modifier, msg_start, len(result))
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  _warn_prf(average, modifier, msg_start, len(result))
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
100%|██████████| 5/5 [00:02<00:00,  1.78it/s]


	Test Loss: 0.9306409001350403

	Test acc: 0.9757000656404777

	Test MCC acc: 0.0

	Test prec: 0.9524102891686322

	Test rec: 0.9757000656404777

	Test f1: 0.9638081231229314





In [27]:
print(tags_ids.shape)
print(pred.shape)
for i, ids in enumerate(t_input_ids):
    print(tokenizer.convert_ids_to_tokens(ids))
    print(tags_ids[i])
    #print(pred.shape)
    #print(tags.shape)
print(t_input_ids.shape)

    #logits = logits.detach().to('cpu').numpy()
    #tags_ids = t_bio_tags.to('cpu').numpy()

    # calculate performance measures only on tokens and not subwords or special tokens
    #tags_mask = tags_ids != -100 # only get token labels and not labels from subwords or special tokens
    #pred = np.argmax(logits, axis=2)[tags_mask] #.flatten() # convert logits to list of predicted labels
    #tags = tags_ids[tags_mask]#.flatten()      

(8, 50)
(108,)
['<s>', 'Have', 'the', 'most', 'spectacular', 'celebrations', '.', '</s>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>']
[-100    0    0    0    0    0    0 -100 -100 -100 -100 -100 -100 -100
 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100
 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100
 -100 -100 -100 -100 -100 -100 -100 -100]
['<s>', '"', 'Careful', ',', 'I', 'am', 'cancer@@', 'ous', '"', 'or', '"', 'want', 'diabetes', ',', 'drink', 'me', '!', '</s>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad

### Save model

In [None]:
torch.save(model.state_dict(), "finetuned-35-epochs.pth")

### Load model locally

In [None]:
device = torch.device("cuda", if torch.cuda.is_available() else "cpu")
model = CausalityBERT()
model.load_state_dict(torch.load("finetuned-35-epochs.pth"))
model.to(device)
model.eval()

# Questions to Vivek?


In [None]:
If there is only one cause and no effect ; or only one effect and no cause => ignore ? 

In [9]:
data.head()

Unnamed: 0,id,text,full_text,Intent,Cause,Effect,Causal association,Charline association0=no;1=yes,Remarks
0,908171203029868545,"tonight , I learned my older girl will back he...","tonight , I learned my older girl will back he...",,,,0.0,,
1,1203645589214367745,USER USER I knew diabetes and fibromyalgia wer...,USER USER I knew diabetes and fibromyalgia wer...,joke,,,0.0,,
2,1310596731063525376,⬇ ️ ⬇ ️ ⬇ ️ THIS ⬇ ️ ⬇ ️ ⬇ ️ My wife has type ...,⬇ ️ ⬇ ️ ⬇ ️ THIS ⬇ ️ ⬇ ️ ⬇ ️ My wife has type ...,mS,,,0.0,,
3,1125198453167022085,USER Cheers ! Have one for this diabetic too !,USER Cheers ! Have one for this diabetic too !,mS,,,0.0,,
4,1248600944138268673,USER Additionally the medicines are being char...,USER Additionally the medicines are being char...,,medicines are being charged at MRP,costing much higher,1.0,,


### Small example

In [52]:
# Small steps
sample = trainingData.sample(n=5, random_state=11)[3:]
sample.head()

Unnamed: 0,tweet,Causal association,BIOtags
447,I've been light headed and shakey for the last...,1.0,"[O, O, O, B-E, I-E, O, B-E, O, O, O, O, O, O, ..."
7584,2 before to 0.,0.0,"[O, O, O, O, O]"


In [54]:
N_bio_tags = 5 
train_dataset = TweetDataSet(sample["tweet"].map(normalizeTweet).values.tolist()
                           , sample["Causal association"].values.tolist()
                           , sample["BIOtags"].values.tolist()
                           , tokenizer)

train_loader = DataLoader(train_dataset, batch_size=2, shuffle=True)

print("Tweet:")
print(sample.iloc[0]["tweet"])
print("BIO tags:")
print(sample.iloc[0]["BIOtags"])
print("\ntokenized:")
print(tokenizer.convert_ids_to_tokens(train_dataset[1]["input_ids"]))
print("BIO tags extended:")
print(train_dataset[0]["bio_tags"])
print("\nids:")
print(train_dataset[0]["input_ids"])
print("BIO tags extended:")
print(train_dataset[0]["bio_tags"])
print("attention mask:")
print(train_dataset[0]["attention_mask"])


Tweet:
I've been light headed and shakey for the last 4 hours due to low blood sugar and it's uncomfortable and debilitating !
BIO tags:
['O', 'O', 'O', 'B-E', 'I-E', 'O', 'B-E', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-C', 'I-C', 'I-C', 'O', 'O', 'O', 'O', 'O', 'O', 'O']

tokenized:
['<s>', '2', 'before', 'to', '0', '.', '</s>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>']
BIO tags extended:
tensor([-100,    0,    0,    0,    3,    4,    0,    3, -100,    0,    0,    0,
           0,    0,    0,    0,    1,    2,    2,    0,    0,    0,    0,    0,
           0, -100, -100,    0, -100])

ids:
tensor([    0,     8,   120,   108,   937,  4432,    13,  2258,  1499,    19,
            6,   175,   204,   493,  1006,     9,  1101,  1945,  4057,    13,
           18,    20,  6976,    13, 13084, 41480,  1526,    12,     2])
BIO tags extended:
te

In [57]:
for batch in tqdm(train_loader):
    optim.zero_grad() # gradients get accumulated by default -> clear previous accumulated gradients
    input_ids = batch['input_ids'].to(device)
    attention_mask = batch['attention_mask'].to(device)
    token_type_ids = batch["token_type_ids"].to(device)
    bio_tags = batch['bio_tags'].to(device)
    print("BATCH:")
    print("tweet A:", tokenizer.convert_ids_to_tokens(input_ids[0]))
    print("tweet B:", tokenizer.convert_ids_to_tokens(input_ids[1]))
    print("tweet A shape:", len(tokenizer.convert_ids_to_tokens(input_ids[0])))
    print("tweet B shape:", len(tokenizer.convert_ids_to_tokens(input_ids[1])))    
    print("============\n")
    
    ################################################
    model.train() # set model to training mode
    logits = model(**{"input_ids":input_ids, "attention_mask":attention_mask, "token_type_ids":token_type_ids}) # forward pass

    print("logits.shape:", logits.shape)
    print("bio_tags.shape:", bio_tags.shape)
    print("============\n")

100%|██████████| 1/1 [00:00<00:00,  6.77it/s]

BATCH:
tweet A: ['<s>', 'I', "'ve", 'been', 'light', 'headed', 'and', 'sha@@', 'key', 'for', 'the', 'last', '4', 'hours', 'due', 'to', 'low', 'blood', 'sugar', 'and', 'it', "'s", 'uncomfortable', 'and', 'deb@@', 'ilit@@', 'ating', '!', '</s>']
tweet B: ['<s>', '2', 'before', 'to', '0', '.', '</s>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>']
tweet A shape: 29
tweet B shape: 29

logits.shape: torch.Size([2, 29, 5])
bio_tags.shape: torch.Size([2, 29])






In [58]:


#################################################
# similar to the class RobertaForToken classification in transformers: https://github.com/huggingface/transformers/blob/master/src/transformers/models/roberta/modeling_roberta.py
active_loss = attention_mask.view(-1) == 1  # either based on attention_mask (includes <CLS>, <SEP> token)
print("active_loss.shape:", active_loss.shape)
print("active_loss:", active_loss)

#active_loss2 = bio_tags.view(-1) != -100   # excludes all special tokens including <CLS>, <SEP>
active_logits = logits.view(-1, N_bio_tags)[active_loss] # 5 
active_tags = bio_tags.view(-1)[active_loss]
loss = loss_fn(active_logits, active_tags)
print("active_logits:", active_logits.shape)
print("active_tags:", active_tags.shape)
print("loss:", loss)
print("============\n")


active_loss.shape: torch.Size([58])
active_loss: tensor([ True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True, False, False, False, False,
        False, False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False])
active_logits: torch.Size([36, 5])
active_tags: torch.Size([36])
loss: tensor(1.1157, grad_fn=<NllLossBackward>)



In [59]:

logits = logits.detach().to('cpu').numpy()
tags_ids = bio_tags.to('cpu').numpy()

# calculate performance measures only on tokens and not subwords or special tokens
tags_mask = tags_ids != -100 # only get token labels and not labels from subwords or special tokens
pred = np.argmax(logits, axis=2)[tags_mask] #.flatten() # convert logits to list of predicted labels
print("pred.shape:", pred.shape)
print("pred:", pred)    
tags = tags_ids[tags_mask]#.flatten()
print("tags.shape", tags.shape)
print("tags:", tags)

print("acc:", accuracy_score(tags, pred))

pred.shape: (29,)
pred: [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
tags.shape (29,)
tags: [0 0 0 3 4 0 3 0 0 0 0 0 0 0 1 2 2 0 0 0 0 0 0 0 0 0 0 0 0]
acc: 0.7931034482758621
