In [1]:
import pandas as pd
import numpy as np
import spacy 
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, matthews_corrcoef
from transformers import BertForSequenceClassification, AutoTokenizer
from transformers import AdamW, get_linear_schedule_with_warmup
from tqdm import tqdm, trange
import random
import os
import torch.nn.functional as F
import torch
from torch.nn import CrossEntropyLoss
from torch.utils.data import DataLoader
import transformers
from tqdm import tqdm, trange
from utils import normalizeTweet, split_into_sentences, bio_tagging, create_training_data



data = pd.read_excel("/home/adrian/workspace/causality/Causal-associations-diabetes-twitter/data/Causality + hypoglycemia.xlsx", sheet_name=">5000_samples_")
#data = pd.read_excel("/home/adrian/Downloads/Causality + hypoglycemia.xlsx", sheet_name=">5000_samples_")
print("Total count:", data.shape[0])
data = data[data["Causal association"].notnull()]
print("Labeled count:", data.shape[0])

data.head()

Total count: 5456
Labeled count: 5000


Unnamed: 0,id,text,full_text,Intent,Cause,Effect,Causal association,Charline association0=no;1=yes,Remarks
0,908171203029868545,"tonight , I learned my older girl will back he...","tonight , I learned my older girl will back he...",,,,0.0,,
1,1203645589214367745,USER USER I knew diabetes and fibromyalgia wer...,USER USER I knew diabetes and fibromyalgia wer...,joke,,,0.0,,
2,1310596731063525376,⬇ ️ ⬇ ️ ⬇ ️ THIS ⬇ ️ ⬇ ️ ⬇ ️ My wife has type ...,⬇ ️ ⬇ ️ ⬇ ️ THIS ⬇ ️ ⬇ ️ ⬇ ️ My wife has type ...,mS,,,0.0,,
3,1125198453167022085,USER Cheers ! Have one for this diabetic too !,USER Cheers ! Have one for this diabetic too !,mS,,,0.0,,
4,1248600944138268673,USER Additionally the medicines are being char...,USER Additionally the medicines are being char...,,medicines are being charged at MRP,costing much higher,1.0,,


### Interrater-reliabilty measure

In [2]:
from sklearn.metrics import cohen_kappa_score

charline = data[data["Charline association0=no;1=yes"].notnull()]
coder1 = charline["Causal association"].values
coder2 = charline["Charline association0=no;1=yes"]
score = cohen_kappa_score(coder1,coder2)
#print('Cohen\'s Kappa:',score)

### Data Preprocessing

In [3]:
data["Causal association"].value_counts()

0.0    3720
1.0    1280
Name: Causal association, dtype: int64

In [4]:
trainingData = create_training_data(data, min_words_in_sentences=3)
trainingData.head()

Unnamed: 0,tweet,Causal association,BIOtags
0,"tonight , I learned my older girl will back he...",0.0,"[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
1,⬇ ️ ⬇ ️ ⬇ ️ THIS ⬇ ️ ⬇ ️ ⬇ ️ My wife has type ...,0.0,"[O, O, O, O, O, O, O, O, O, O, O, O, O, O]"
2,I'm a trans woman .,0.0,"[O, O, O, O, O, O]"
3,"Both of us could use a world where "" brave and...",0.0,"[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
4,"Make a world where people can just be , withou...",0.0,"[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."


In [5]:
for i,row in trainingData.sample(n=10).iterrows():
    print("\n")
    print(row["tweet"])
    print(row["BIOtags"])



USER USER USER Hehehehe !
['O', 'O', 'O', 'O', 'O']


Ok while busy curing diabetic and pushing sch doc ins for his needs .
['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']


One easy thing to do is pay fair rates for preventative diabetic shoes .
['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']


its like they want physically / mentally sound people .
['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']


USER GHC 2.
['O', 'O', 'O', 'O']


After a long day of driving back from ATL , I made it back in time to administer Ivan's insulin .
['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']


USER Yes , he needs to give himself a giant dose of insulin .
['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']


Lowest my sugars have been for a long time .
['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']


Slowly but surely improving !
['O', 'O', 'O', 'O', 'O']


After fol

In [7]:
trainingData["Causal association"].value_counts()

0.0    7607
1.0    1019
Name: Causal association, dtype: int64

### Training

In [19]:
trainingDataSample = trainingData.sample(n=200)
train = trainingDataSample.sample(frac=0.8, random_state=0)
test = trainingDataSample.drop(train.index)
validate = train.sample(frac=0.2, random_state=0)
train = train.drop(validate.index)
print("Train:", train.shape)
print("Validate:", validate.shape)
print("Test:", test.shape)

Train: (128, 3)
Validate: (32, 3)
Test: (40, 3)


In [26]:

# Transform labels + encodings into Pytorch DataSet object (including __len__, __getitem__)
class TweetDataSet(torch.utils.data.Dataset):
    def __init__(self, text, labels, bio_tags, tokenizer):
        self.text = text
        self.labels = labels
        self.tokenizer = tokenizer
        self.bio_tags = bio_tags
        self.tag2id = {label: idx for idx, label in enumerate(["O", "B-C", "I-C", "B-E", "I-E"])}
        self.tag2id[-100] = -100
        self.id2tag = {id:tag for tag,id in self.tag2id.items()}

    def __getitem__(self, idx):
        inputs = self.tokenizer(self.text, padding=True, truncation=True, return_token_type_ids=True)
        ids = inputs["input_ids"]
        mask = inputs["attention_mask"]
        token_type_ids = inputs["token_type_ids"]
        bio_tags_extended = self.extend_tags(self.text[idx], self.bio_tags[idx], ids[idx])
        assert(len(ids[idx]) == len(bio_tags_extended), "token ids and BIO tags lengths do not match!")
        return {
                "input_ids" : torch.tensor(ids[idx], dtype=torch.long)
              , "attention_mask" : torch.tensor(mask[idx], dtype=torch.long)
              , "token_type_ids" : torch.tensor(token_type_ids[idx], dtype=torch.long)
              , "labels" : torch.tensor(self.labels[idx], dtype=torch.long)
              , "bio_tags" : torch.tensor(list(map(lambda bioTags: self.tag2id[bioTags], bio_tags_extended))
, dtype=torch.long)
        }

    def __len__(self):
        return len(self.labels)

    
    def extend_tags(self, tokens_old, tags_old, ids_tokenized_padded):
        """ 
            Each token has a BIO tag label. 
            However BERT's tokenization splits tokens into subwords. How to label those subwords?
            
            Option 1:
            ---------
            
            add the same label to each subword than the first subword. Only replace "B" by "I"
            Ex. 
            #lowbloodsugar => '#low@@', 'blood@@', 'sugar@@'
               "B-C"       =>   "B-C" ,   "I-C"  ,   "I-C"
            
            Option 2 (implemented):      
            ---------
            
            From : https://huggingface.co/transformers/custom_datasets.html#token-classification-with-w-nut-emerging-entities
            A common obstacle with using pre-trained models for token-level classification: many of the tokens in
            the W-NUT corpus are not in DistilBert’s vocabulary. Bert and many models like it use a method called 
            WordPiece Tokenization, meaning that single words are split into multiple tokens such that each token
            is likely to be in the vocabulary. For example, DistilBert’s tokenizer would split the Twitter 
            handle @huggingface into the tokens ['@', 'hugging', '##face']. This is a problem for us because we 
            have exactly one tag per token. If the tokenizer splits a token into multiple sub-tokens, then we will
            end up with a mismatch between our tokens and our labels.

            One way to handle this is to only train on the tag labels for the first subtoken of a split token. 
            We can do this in 🤗 Transformers by setting the labels we wish to ignore to -100. 
            In the example above, if the label for @HuggingFace is 3 (indexing B-corporation), we would set 
            the labels of ['@', 'hugging', '##face'] to [3, -100, -100].
        """
        tags = [-100] # add for start token <CLS>
        for token_old, tag in zip(tokens_old.split(" "), tags_old):
#            print(F"\ntoken_old: {token_old};    tag: {tag}")
            for i, sub_token in enumerate(self.tokenizer.tokenize(token_old)):
                if (i == 0):
                    tags.append(tag)
                else: 
                    tags.append(-100)
           
        tags.append(-100) # 0 for end of sentence token
    
        # append -100 for all padded elements
        padded_elements = ids_tokenized_padded.count(1) # id 1 is <PAD> ; Alternative: where attention_mask == 0 add -100
        tags.extend([-100]*padded_elements)
        
        return tags
        
        
    
tokenizer = AutoTokenizer.from_pretrained("vinai/bertweet-base")

train_dataset = TweetDataSet(train["tweet"].map(normalizeTweet).values.tolist()
                           , train["Causal association"].values.tolist()
                           , train["BIOtags"].values.tolist()
                           , tokenizer)
val_dataset = TweetDataSet(validate["tweet"].map(normalizeTweet).values.tolist()
                           , validate["Causal association"].values.tolist()
                           , validate["BIOtags"].values.tolist()
                           , tokenizer)
test_dataset = TweetDataSet(test["tweet"].map(normalizeTweet).values.tolist()
                           , test["Causal association"].values.tolist()
                           , test["BIOtags"].values.tolist()
                           , tokenizer)
print(len(train_dataset))
print(len(val_dataset))
print(len(test_dataset))

# put data to batches
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
validation_loader = DataLoader(val_dataset, batch_size=8, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=8, shuffle=True)


  assert(len(ids[idx]) == len(bio_tags_extended), "token ids and BIO tags lengths do not match!")
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


128
32
40


In [27]:
# 1) Trainer 
from transformers import DistilBertForSequenceClassification, Trainer, TrainingArguments
from transformers import AutoModelForSequenceClassification
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def compute_metrics(pred, labels):
    """
        Dataset is unbalanced -> measure weighted metrics
        Calculate metrics for each label, and find their average wieghted by support (Number of true instances for each label)
        This alters 'macro' to account for label imbalance;
        it can result in an F-Score taht is not between precision and recall
    """
    precision, recall, f1, _ = precision_recall_fscore_support(labels, pred, average='weighted') #binary
    acc = accuracy_score(labels, pred)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }



class CausalMultiTask(torch.nn.Module):
    """ Model Bert"""
    def __init__(self):
        super(CausalMultiTask, self).__init__()
        self.num_labels = 5 # B-C, I-C, B-E, I-E, O
        self.bert = transformers.BertModel.from_pretrained("vinai/bertweet-base")
        self.dropout = torch.nn.Dropout(0.3)
        self.linear1 = torch.nn.Linear(768, 256)
        self.linear2 = torch.nn.Linear(256, self.num_labels)
        self.softmax = torch.nn.Softmax(-1)
        
    def forward(self, input_ids, attention_mask, token_type_ids):
        output_seq, output_cls = self.bert(input_ids, attention_mask = attention_mask, token_type_ids=token_type_ids, return_dict=False) # if output 1 is our cls token

        output_cls_2 = self.dropout(output_cls)
        output_cls_3 = self.linear1(output_cls_2)
        output_cls_4 = self.dropout(output_cls_3)
        output_cls_5 = self.linear2(output_cls_4)
        logit_cls = self.softmax(output_cls_5)
        
        output_ner_2 = self.dropout(output_seq)
        output_ner_3 = self.linear1(output_ner_2)
        output_ner_4 = self.dropout(output_ner_3)
        output_ner_5 = self.linear2(output_ner_4)
        logit_ner = self.softmax(output_ner_5)        
        
        return logit_cls, logit_ner


## Model parameters
batchsize_train = 16
lr = 5e-5
adam_eps = 1e-8
epochs = 3
num_warmup_steps = 0
num_training_steps = len(train_loader)*epochs

# Store our loss and learning rate for plotting
train_loss_set = []
learning_rate = []


device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

model = CausalMultiTask()
model.to(device)

# fine-tune only the task-specific parameters -> Vivek? 
for param in model.bert.parameters():
    param.requires_grad = False
    


optim = AdamW(model.parameters(), lr=lr, eps=adam_eps)
scheduler = get_linear_schedule_with_warmup(optim, num_warmup_steps=num_warmup_steps, num_training_steps=num_training_steps)

# TODO: QUESTION: Do we need two different loss functions for the two different tasks?
loss_fn = CrossEntropyLoss(ignore_index=-100) # ignore subwords/tokens with label -100 


You are using a model of type roberta to instantiate a model of type bert. This is not supported for all configurations of models and can yield errors.
Some weights of the model checkpoint at vinai/bertweet-base were not used when initializing BertModel: ['roberta.encoder.layer.7.output.LayerNorm.weight', 'roberta.encoder.layer.4.output.dense.bias', 'roberta.encoder.layer.4.attention.self.value.weight', 'roberta.encoder.layer.8.attention.self.key.weight', 'roberta.encoder.layer.10.output.LayerNorm.weight', 'roberta.encoder.layer.5.output.dense.bias', 'roberta.encoder.layer.11.attention.output.dense.weight', 'roberta.encoder.layer.6.output.LayerNorm.weight', 'roberta.encoder.layer.4.intermediate.dense.weight', 'roberta.encoder.layer.4.output.dense.weight', 'roberta.encoder.layer.0.attention.self.value.weight', 'roberta.embeddings.position_embeddings.weight', 'roberta.encoder.layer.3.intermediate.dense.bias', 'roberta.encoder.layer.6.attention.output.LayerNorm.bias', 'roberta.encoder.lay

Some weights of BertModel were not initialized from the model checkpoint at vinai/bertweet-base and are newly initialized: ['encoder.layer.10.output.dense.weight', 'encoder.layer.11.output.dense.bias', 'encoder.layer.0.output.dense.weight', 'encoder.layer.4.output.LayerNorm.weight', 'encoder.layer.8.intermediate.dense.weight', 'encoder.layer.2.attention.self.value.weight', 'embeddings.LayerNorm.weight', 'encoder.layer.6.attention.self.query.weight', 'encoder.layer.11.attention.self.key.weight', 'encoder.layer.3.attention.self.query.bias', 'encoder.layer.3.attention.self.value.weight', 'encoder.layer.11.output.dense.weight', 'encoder.layer.7.attention.self.key.weight', 'encoder.layer.3.output.LayerNorm.weight', 'encoder.layer.10.attention.output.LayerNorm.bias', 'encoder.layer.1.attention.self.value.bias', 'encoder.layer.0.intermediate.dense.weight', 'encoder.layer.4.attention.self.query.bias', 'encoder.layer.7.output.LayerNorm.bias', 'encoder.layer.2.attention.output.dense.bias', 'enco

### Training

In [29]:
N_bio_tags = 5 # "O", "B-C", "I-C", "B-E", "I-C"
for epoch in trange(1, epochs+1, desc='Epoch'):
    print("<" + "="*22 + F" Epoch {epoch} "+ "="*22 + ">")

    
    ############ training eval metrics ######################
    nb_tr_steps = 0 # Tracking variables
    train_loss = []
    train_cls_acc = []
    train_cls_prec = []
    train_cls_rec = []
    train_cls_f1 = []
    train_ner_acc = []
    train_ner_prec = []
    train_ner_rec = []
    train_ner_f1 = []    
    #########################################################
    
    
    for batch in tqdm(train_loader):
        optim.zero_grad() # gradients get accumulated by default -> clear previous accumulated gradients
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        token_type_ids = batch["token_type_ids"].to(device)
        labels = batch['labels'].to(device)
        bio_tags = batch['bio_tags'].to(device)
        
        ################################################
        model.train() # set model to training mode
        logits_cls, logits_ner = model(**{"input_ids":input_ids, "attention_mask":attention_mask, "token_type_ids":token_type_ids}) # forward pass

        ################# Loss function ############################### 
        ### CLS
        loss_cls = loss_fn(logits_cls, labels)
        print("loss_cls:", loss_cls)
        
        ### NER
        # similar to the class RobertaForToken classification in transformers: https://github.com/huggingface/transformers/blob/master/src/transformers/models/roberta/modeling_roberta.py
        active_loss = attention_mask.view(-1) == 1  # either based on attention_mask (includes <CLS>, <SEP> token)
        active_logits = logits_ner.view(-1, N_bio_tags)[active_loss] # N_bio_tags=5 
        active_tags = bio_tags.view(-1)[active_loss]
        loss_ner = loss_fn(active_logits, active_tags)             
        print("loss_ner:", loss_ner)   
        
        loss = loss_cls + loss_ner  # combine binary classification loss and named entity recognition loss
        print("loss:", loss)      
        loss.backward() # backward pass
        optim.step()    # update parameters and take a steup using the computed gradient
        scheduler.step()# update learning rate scheduler
        train_loss.append(loss.item())
            
            
        ################## Training Performance Measures ##########
        ### CLS
        logits_cls = logits_cls.detach().to('cpu').numpy()
        label_ids = labels.to('cpu').numpy()

        pred_flat = np.argmax(logits_cls, axis=1).flatten()
        labels_flat = label_ids.flatten()
        
        metrics_cls = compute_metrics(pred_flat, labels_flat)
        train_cls_acc.append(metrics_cls["accuracy"])
        train_cls_prec.append(metrics_cls["precision"])
        train_cls_rec.append(metrics_cls["recall"])
        train_cls_f1.append(metrics_cls["f1"])
        
        #### NER 
        logits_ner = logits_ner.detach().to('cpu').numpy()
        tags_ids = bio_tags.to('cpu').numpy()

        # calculate performance measures only on tokens and not subwords or special tokens
        tags_mask = tags_ids != -100 # only get token labels and not labels from subwords or special tokens
        pred = np.argmax(logits_ner, axis=2)[tags_mask] #.flatten() # convert logits to list of predicted labels
        tags = tags_ids[tags_mask]                      
                
        metrics_ner = compute_metrics(pred, tags)
        train_ner_acc.append(metrics_ner["accuracy"])
        train_ner_prec.append(metrics_ner["precision"])
        train_ner_rec.append(metrics_ner["recall"])
        train_ner_f1.append(metrics_ner["f1"])
                          
        nb_tr_steps += 1
           
    print(F'\n\tTraining Loss: {np.mean(train_loss)}')
    print(F'\n\tTraining cls acc: {np.mean(train_cls_acc)}')
    print(F'\n\tTraining cls prec: {np.mean(train_cls_prec)}')
    print(F'\n\tTraining cls rec: {np.mean(train_cls_rec)}')
    print(F'\n\tTraining cls f1: {np.mean(train_cls_f1)}')
    print(F'\n--\n\tTraining ner acc: {np.mean(train_ner_acc)}')
    print(F'\n\tTraining ner prec: {np.mean(train_ner_prec)}')
    print(F'\n\tTraining ner rec: {np.mean(train_ner_rec)}')
    print(F'\n\tTraining ner f1: {np.mean(train_ner_f1)}')
                          
                          
    # store the current learning rate
    for param_group in optim.param_groups:
        print("\n\tCurrent Learning rate: ", param_group['lr'])
        learning_rate.append(param_group['lr'])
    

    ############# Validation ################
    
    nb_eval_steps = 0 # Tracking variables
    val_accuracy = []
    val_loss = []
    val_cls_acc = []
    val_cls_prec = []
    val_cls_rec = []
    val_cls_f1 = []
    val_ner_acc = []
    val_ner_prec = []
    val_ner_rec = []
    val_ner_f1 = []
    
    # Evaluate data for one epoch
    for batch in tqdm(validation_loader):
        batch = tuple(batch[t].to(device) for t in batch)      # batch to GPU
        v_input_ids, v_input_mask, v_token_type_ids, v_labels, v_bio_tags = batch  # unpack inputs from dataloader
        
        with torch.no_grad(): # tell model not to compute or store gradients -> saves memory + speeds up validation
            model.eval() # put model in evaluation mode for validation set
            logits_cls, logits_ner = model(**{"input_ids":v_input_ids, "attention_mask":v_input_mask, "token_type_ids":v_token_type_ids}) # forward pass, calculates logit predictions

        ############### LOSS Function #######################################
        ### CLS
        v_loss_cls = loss_fn(logits_cls, v_labels)
        
        ### NER
        # similar to the class RobertaForToken classification in transformers: https://github.com/huggingface/transformers/blob/master/src/transformers/models/roberta/modeling_roberta.py
        v_active_loss = v_input_mask.view(-1) == 1  # either based on attention_mask (includes <CLS>, <SEP> token)
        v_active_logits = logits_ner.view(-1, N_bio_tags)[v_active_loss] # 5 
        v_active_tags = v_bio_tags.view(-1)[v_active_loss]
        v_loss_ner = loss_fn(v_active_logits, v_active_tags)             
        v_loss = v_loss_cls + v_loss_ner
        val_loss.append(v_loss.item())

   
        ################# PERFORMANCE MEASURES ########################################
        ### CLS
        logits_cls = logits_cls.detach().to('cpu').numpy()
        label_ids = v_labels.to('cpu').numpy()

        pred_flat = np.argmax(logits_cls, axis=1).flatten()
        labels_flat = label_ids.flatten()
        
        metrics_cls = compute_metrics(pred_flat, labels_flat)
        val_cls_acc.append(metrics_cls["accuracy"])
        val_cls_prec.append(metrics_cls["precision"])
        val_cls_rec.append(metrics_cls["recall"])
        val_cls_f1.append(metrics_cls["f1"])
        
        #### NER     
        logits_ner = logits_ner.detach().to('cpu').numpy()
        tags_ids = v_bio_tags.to('cpu').numpy()

        # calculate performance measures only on tokens and not subwords or special tokens
        tags_mask = tags_ids != -100 # only get token labels and not labels from subwords or special tokens
        pred = np.argmax(logits_ner, axis=2)[tags_mask] #.flatten() # convert logits to list of predicted labels
        tags = tags_ids[tags_mask]#.flatten()        
        
        metrics = compute_metrics(pred, tags)
        val_ner_acc.append(metrics["accuracy"])
        val_ner_prec.append(metrics["precision"])
        val_ner_rec.append(metrics["recall"])
        val_ner_f1.append(metrics["f1"])
                              
        nb_eval_steps += 1
           
    print(F'\n\tValidation Loss: {np.mean(val_loss)}')
    print(F'\n\tValidation cls acc: {np.mean(val_cls_acc)}')
    print(F'\n\tValidation cls prec: {np.mean(val_cls_prec)}')
    print(F'\n\tValidation cls rec: {np.mean(val_cls_rec)}')
    print(F'\n\tValidation cls f1: {np.mean(val_cls_f1)}')
    print(F'\n--\n\tValidation ner acc: {np.mean(val_ner_acc)}')
    print(F'\n\tValidation ner prec: {np.mean(val_ner_prec)}')
    print(F'\n\tValidation ner rec: {np.mean(val_ner_rec)}')
    print(F'\n\tValidation ner f1: {np.mean(val_ner_f1)}')


Epoch:   0%|          | 0/3 [00:00<?, ?it/s]
  , "labels" : torch.tensor(self.labels[idx], dtype=torch.long)




  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))

 12%|█▎        | 1/8 [00:01<00:12,  1.80s/it][A

loss_cls: tensor(1.5994, grad_fn=<NllLossBackward>)
loss_ner: tensor(1.3959, grad_fn=<NllLossBackward>)
loss: tensor(2.9953, grad_fn=<AddBackward0>)


  , "labels" : torch.tensor(self.labels[idx], dtype=torch.long)
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))

 25%|██▌       | 2/8 [00:03<00:10,  1.73s/it][A

loss_cls: tensor(1.5882, grad_fn=<NllLossBackward>)
loss_ner: tensor(1.3725, grad_fn=<NllLossBackward>)
loss: tensor(2.9607, grad_fn=<AddBackward0>)


  , "labels" : torch.tensor(self.labels[idx], dtype=torch.long)
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))

 38%|███▊      | 3/8 [00:05<00:08,  1.68s/it][A

loss_cls: tensor(1.5799, grad_fn=<NllLossBackward>)
loss_ner: tensor(1.3490, grad_fn=<NllLossBackward>)
loss: tensor(2.9289, grad_fn=<AddBackward0>)


  , "labels" : torch.tensor(self.labels[idx], dtype=torch.long)
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))

 50%|█████     | 4/8 [00:06<00:06,  1.62s/it][A

loss_cls: tensor(1.5713, grad_fn=<NllLossBackward>)
loss_ner: tensor(1.3785, grad_fn=<NllLossBackward>)
loss: tensor(2.9497, grad_fn=<AddBackward0>)


  , "labels" : torch.tensor(self.labels[idx], dtype=torch.long)
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))

 62%|██████▎   | 5/8 [00:08<00:04,  1.59s/it][A

loss_cls: tensor(1.5724, grad_fn=<NllLossBackward>)
loss_ner: tensor(1.3347, grad_fn=<NllLossBackward>)
loss: tensor(2.9071, grad_fn=<AddBackward0>)


  , "labels" : torch.tensor(self.labels[idx], dtype=torch.long)
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))

 75%|███████▌  | 6/8 [00:09<00:03,  1.64s/it][A

loss_cls: tensor(1.5667, grad_fn=<NllLossBackward>)
loss_ner: tensor(1.2890, grad_fn=<NllLossBackward>)
loss: tensor(2.8557, grad_fn=<AddBackward0>)


  , "labels" : torch.tensor(self.labels[idx], dtype=torch.long)
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))

 88%|████████▊ | 7/8 [00:11<00:01,  1.73s/it][A

loss_cls: tensor(1.5920, grad_fn=<NllLossBackward>)
loss_ner: tensor(1.3175, grad_fn=<NllLossBackward>)
loss: tensor(2.9095, grad_fn=<AddBackward0>)


  , "labels" : torch.tensor(self.labels[idx], dtype=torch.long)
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))

100%|██████████| 8/8 [00:13<00:00,  1.71s/it][A

  0%|          | 0/4 [00:00<?, ?it/s][A

loss_cls: tensor(1.5813, grad_fn=<NllLossBackward>)
loss_ner: tensor(1.2793, grad_fn=<NllLossBackward>)
loss: tensor(2.8606, grad_fn=<AddBackward0>)

	Training Loss: 2.9209513068199158

	Training cls acc: 0.46875

	Training cls prec: 0.761810064935065

	Training cls rec: 0.46875

	Training cls f1: 0.5668524184149184

--
	Training ner acc: 0.9240265623283583

	Training ner prec: 0.910179399279005

	Training ner rec: 0.9240265623283583

	Training ner f1: 0.9163116097744446

	Current Learning rate:  1.6666666666666667e-05


  , "labels" : torch.tensor(self.labels[idx], dtype=torch.long)
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))

  , "labels" : torch.tensor(self.labels[idx], dtype=torch.long)
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))

  , "labels" : torch.tensor(self.labels[idx], dtype=torch.long)
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))

  , "labels" : torch.tensor(self.labels[idx], dtype=torch.long)
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))

100%|██████████| 4/4 [00:02<00:00,  1.53it/s][A
Epoch:  33%|███▎      | 1/3 [00:16<00:32, 16.34s/it]
  , "labels" : torch.tensor(self.labels[idx], dtype=torch.long)



	Validation Loss: 2.7894954085350037

	Validation cls acc: 0.78125

	Validation cls prec: 0.61328125

	Validation cls rec: 0.78125

	Validation cls f1: 0.6863095238095237

--
	Validation ner acc: 0.9381458822573987

	Validation ner prec: 0.8808603924137706

	Validation ner rec: 0.9381458822573987

	Validation ner f1: 0.9084086039899638


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))

 12%|█▎        | 1/8 [00:01<00:12,  1.83s/it][A

loss_cls: tensor(1.5636, grad_fn=<NllLossBackward>)
loss_ner: tensor(1.2522, grad_fn=<NllLossBackward>)
loss: tensor(2.8158, grad_fn=<AddBackward0>)


  , "labels" : torch.tensor(self.labels[idx], dtype=torch.long)
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))

 25%|██▌       | 2/8 [00:03<00:10,  1.75s/it][A

loss_cls: tensor(1.5565, grad_fn=<NllLossBackward>)
loss_ner: tensor(1.2323, grad_fn=<NllLossBackward>)
loss: tensor(2.7888, grad_fn=<AddBackward0>)


  , "labels" : torch.tensor(self.labels[idx], dtype=torch.long)
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))

 38%|███▊      | 3/8 [00:05<00:08,  1.69s/it][A

loss_cls: tensor(1.5693, grad_fn=<NllLossBackward>)
loss_ner: tensor(1.2684, grad_fn=<NllLossBackward>)
loss: tensor(2.8377, grad_fn=<AddBackward0>)


  , "labels" : torch.tensor(self.labels[idx], dtype=torch.long)
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))

 50%|█████     | 4/8 [00:06<00:06,  1.66s/it][A

loss_cls: tensor(1.5573, grad_fn=<NllLossBackward>)
loss_ner: tensor(1.2532, grad_fn=<NllLossBackward>)
loss: tensor(2.8105, grad_fn=<AddBackward0>)


  , "labels" : torch.tensor(self.labels[idx], dtype=torch.long)
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))

 62%|██████▎   | 5/8 [00:08<00:05,  1.73s/it][A

loss_cls: tensor(1.5527, grad_fn=<NllLossBackward>)
loss_ner: tensor(1.2111, grad_fn=<NllLossBackward>)
loss: tensor(2.7638, grad_fn=<AddBackward0>)


  , "labels" : torch.tensor(self.labels[idx], dtype=torch.long)
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))

 75%|███████▌  | 6/8 [00:10<00:03,  1.68s/it][A

loss_cls: tensor(1.5677, grad_fn=<NllLossBackward>)
loss_ner: tensor(1.2227, grad_fn=<NllLossBackward>)
loss: tensor(2.7903, grad_fn=<AddBackward0>)


  , "labels" : torch.tensor(self.labels[idx], dtype=torch.long)
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))

 88%|████████▊ | 7/8 [00:11<00:01,  1.66s/it][A

loss_cls: tensor(1.5614, grad_fn=<NllLossBackward>)
loss_ner: tensor(1.2193, grad_fn=<NllLossBackward>)
loss: tensor(2.7806, grad_fn=<AddBackward0>)


  , "labels" : torch.tensor(self.labels[idx], dtype=torch.long)
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))

100%|██████████| 8/8 [00:13<00:00,  1.70s/it][A

  0%|          | 0/4 [00:00<?, ?it/s][A

loss_cls: tensor(1.5526, grad_fn=<NllLossBackward>)
loss_ner: tensor(1.2272, grad_fn=<NllLossBackward>)
loss: tensor(2.7798, grad_fn=<AddBackward0>)

	Training Loss: 2.795923173427582

	Training cls acc: 0.65625

	Training cls prec: 0.7714361159673659

	Training cls rec: 0.65625

	Training cls f1: 0.7046866128329303

--
	Training ner acc: 0.9514715702868067

	Training ner prec: 0.9070164538607244

	Training ner rec: 0.9514715702868067

	Training ner f1: 0.9285425897627648

	Current Learning rate:  0.0


  , "labels" : torch.tensor(self.labels[idx], dtype=torch.long)
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))

  , "labels" : torch.tensor(self.labels[idx], dtype=torch.long)

  , "labels" : torch.tensor(self.labels[idx], dtype=torch.long)
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))

  , "labels" : torch.tensor(self.labels[idx], dtype=torch.long)
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))

100%|██████████| 4/4 [00:02<00:00,  1.37it/s][A
Epoch:  67%|██████▋   | 2/3 [00:32<00:16, 16.47s/it]
  , "labels" : torch.tensor(self.labels[idx], dtype=torch.long)



	Validation Loss: 2.728271245956421

	Validation cls acc: 0.78125

	Validation cls prec: 0.64453125

	Validation cls rec: 0.78125

	Validation cls f1: 0.6982142857142857

--
	Validation ner acc: 0.9422284808066665

	Validation ner prec: 0.891270951750364

	Validation ner rec: 0.9422284808066665

	Validation ner f1: 0.9151745672166882


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))

 12%|█▎        | 1/8 [00:01<00:13,  1.88s/it][A

loss_cls: tensor(1.5778, grad_fn=<NllLossBackward>)
loss_ner: tensor(1.2596, grad_fn=<NllLossBackward>)
loss: tensor(2.8374, grad_fn=<AddBackward0>)


  , "labels" : torch.tensor(self.labels[idx], dtype=torch.long)
  _warn_prf(average, modifier, msg_start, len(result))

 25%|██▌       | 2/8 [00:03<00:10,  1.80s/it][A

loss_cls: tensor(1.5540, grad_fn=<NllLossBackward>)
loss_ner: tensor(1.2038, grad_fn=<NllLossBackward>)
loss: tensor(2.7578, grad_fn=<AddBackward0>)


  , "labels" : torch.tensor(self.labels[idx], dtype=torch.long)
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))

 38%|███▊      | 3/8 [00:05<00:08,  1.78s/it][A

loss_cls: tensor(1.5462, grad_fn=<NllLossBackward>)
loss_ner: tensor(1.2099, grad_fn=<NllLossBackward>)
loss: tensor(2.7561, grad_fn=<AddBackward0>)


  , "labels" : torch.tensor(self.labels[idx], dtype=torch.long)
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))

 50%|█████     | 4/8 [00:07<00:07,  1.99s/it][A

loss_cls: tensor(1.5635, grad_fn=<NllLossBackward>)
loss_ner: tensor(1.2163, grad_fn=<NllLossBackward>)
loss: tensor(2.7798, grad_fn=<AddBackward0>)


  , "labels" : torch.tensor(self.labels[idx], dtype=torch.long)
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))

 62%|██████▎   | 5/8 [00:09<00:05,  1.92s/it][A

loss_cls: tensor(1.5564, grad_fn=<NllLossBackward>)
loss_ner: tensor(1.2112, grad_fn=<NllLossBackward>)
loss: tensor(2.7676, grad_fn=<AddBackward0>)


  , "labels" : torch.tensor(self.labels[idx], dtype=torch.long)
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))

 75%|███████▌  | 6/8 [00:11<00:03,  1.90s/it][A

loss_cls: tensor(1.5583, grad_fn=<NllLossBackward>)
loss_ner: tensor(1.2029, grad_fn=<NllLossBackward>)
loss: tensor(2.7612, grad_fn=<AddBackward0>)


  , "labels" : torch.tensor(self.labels[idx], dtype=torch.long)
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))

 88%|████████▊ | 7/8 [00:13<00:01,  1.90s/it][A

loss_cls: tensor(1.5574, grad_fn=<NllLossBackward>)
loss_ner: tensor(1.2282, grad_fn=<NllLossBackward>)
loss: tensor(2.7856, grad_fn=<AddBackward0>)


  , "labels" : torch.tensor(self.labels[idx], dtype=torch.long)
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))

100%|██████████| 8/8 [00:15<00:00,  1.88s/it][A

  0%|          | 0/4 [00:00<?, ?it/s][A

loss_cls: tensor(1.5692, grad_fn=<NllLossBackward>)
loss_ner: tensor(1.2079, grad_fn=<NllLossBackward>)
loss: tensor(2.7771, grad_fn=<AddBackward0>)

	Training Loss: 2.7778351604938507

	Training cls acc: 0.6796875

	Training cls prec: 0.7780403190559441

	Training cls rec: 0.6796875

	Training cls f1: 0.7226079866812625

--
	Training ner acc: 0.9532499298849965

	Training ner prec: 0.910930461343817

	Training ner rec: 0.9532499298849965

	Training ner f1: 0.9313034473575448

	Current Learning rate:  0.0


  , "labels" : torch.tensor(self.labels[idx], dtype=torch.long)
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))

  , "labels" : torch.tensor(self.labels[idx], dtype=torch.long)

  , "labels" : torch.tensor(self.labels[idx], dtype=torch.long)
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))

  , "labels" : torch.tensor(self.labels[idx], dtype=torch.long)
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))

100%|██████████| 4/4 [00:02<00:00,  1.51it/s][A
Epoch: 100%|██████████| 3/3 [00:50<00:00, 16.87s/it]


	Validation Loss: 2.730051279067993

	Validation cls acc: 0.78125

	Validation cls prec: 0.64453125

	Validation cls rec: 0.78125

	Validation cls f1: 0.6982142857142857

--
	Validation ner acc: 0.9391739980449658

	Validation ner prec: 0.8837985879989947

	Validation ner rec: 0.9391739980449658

	Validation ner f1: 0.9101943125150523





### Evaluation on the test dataset

In [32]:

############ test eval metrics ######################
nb_test_steps = 0 # Tracking variables
test_loss = []
test_loss = []
test_cls_acc = []
test_cls_prec = []
test_cls_rec = []
test_cls_f1 = []
test_ner_acc = []
test_ner_prec = []
test_ner_rec = []
test_ner_f1 = []

########################################################
for batch in tqdm(test_loader):
    batch = tuple(batch[t].to(device) for t in batch)      # batch to GPU
    t_input_ids, t_input_mask, t_token_type_ids, t_labels, t_bio_tags = batch     # unpack inputs from dataloader

    with torch.no_grad(): # tell model not to compute or store gradients -> saves memory + speeds up validation
        model.eval() # put model in evaluation mode for validation set
        logits_cls, logits_ner = model(**{"input_ids":t_input_ids, "attention_mask":t_input_mask, "token_type_ids":t_token_type_ids}) # forward pass, calculates logit predictions


    ############### LOSS Function #######################################
    ### CLS
    t_loss_cls = loss_fn(logits_cls, t_labels)

    ### NER
    # similar to the class RobertaForToken classification in transformers: https://github.com/huggingface/transformers/blob/master/src/transformers/models/roberta/modeling_roberta.py
    t_active_loss = t_input_mask.view(-1) == 1  # either based on attention_mask (includes <CLS>, <SEP> token)
    t_active_logits = logits_ner.view(-1, N_bio_tags)[t_active_loss] # 5 
    t_active_tags = t_bio_tags.view(-1)[t_active_loss]
    t_loss_ner = loss_fn(t_active_logits, t_active_tags)             
    t_loss = t_loss_cls + t_loss_ner
    test_loss.append(t_loss.item())


    ################# PERFORMANCE MEASURES ########################################
    ### CLS
    logits_cls = logits_cls.detach().to('cpu').numpy()
    label_ids = t_labels.to('cpu').numpy()

    pred_flat = np.argmax(logits_cls, axis=1).flatten()
    labels_flat = label_ids.flatten()

    metrics_cls = compute_metrics(pred_flat, labels_flat)
    test_cls_acc.append(metrics_cls["accuracy"])
    test_cls_prec.append(metrics_cls["precision"])
    test_cls_rec.append(metrics_cls["recall"])
    test_cls_f1.append(metrics_cls["f1"])

    #### NER     
    logits_ner = logits_ner.detach().to('cpu').numpy()
    tags_ids = t_bio_tags.to('cpu').numpy()

    # calculate performance measures only on tokens and not subwords or special tokens
    tags_mask = tags_ids != -100 # only get token labels and not labels from subwords or special tokens
    pred = np.argmax(logits_ner, axis=2)[tags_mask] #.flatten() # convert logits to list of predicted labels
    tags = tags_ids[tags_mask]#.flatten()        

    metrics = compute_metrics(pred, tags)
    test_ner_acc.append(metrics["accuracy"])
    test_ner_prec.append(metrics["precision"])
    test_ner_rec.append(metrics["recall"])
    test_ner_f1.append(metrics["f1"])

    nb_eval_steps += 1

print(F'\n\tTest Loss: {np.mean(test_loss)}')
print(F'\n\tTest cls acc: {np.mean(test_cls_acc)}')
print(F'\n\tTest cls prec: {np.mean(test_cls_prec)}')
print(F'\n\tTest cls rec: {np.mean(test_cls_rec)}')
print(F'\n\tTest cls f1: {np.mean(test_cls_f1)}')
print(F'\n--\n\tTest ner acc: {np.mean(test_ner_acc)}')
print(F'\n\tTest ner prec: {np.mean(test_ner_prec)}')
print(F'\n\tTest ner rec: {np.mean(test_ner_rec)}')
print(F'\n\tTest ner f1: {np.mean(test_ner_f1)}')


  , "labels" : torch.tensor(self.labels[idx], dtype=torch.long)
  , "labels" : torch.tensor(self.labels[idx], dtype=torch.long)
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  , "labels" : torch.tensor(self.labels[idx], dtype=torch.long)
  , "labels" : torch.tensor(self.labels[idx], dtype=torch.long)
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  , "labels" : torch.tensor(self.labels[idx], dtype=torch.long)
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
100%|██████████| 5/5 [00:02<00:00,  1.86it/s]


	Test Loss: 2.700912284851074

	Test cls acc: 0.9

	Test cls prec: 0.81875

	Test cls rec: 0.9

	Test cls f1: 0.8552380952380952

--
	Test ner acc: 0.9635146352118829

	Test ner prec: 0.929451889996242

	Test ner rec: 0.9635146352118829

	Test ner f1: 0.9459001908117897





### Save model

In [None]:
torch.save(model.state_dict(), "finetuned-NER-35-epochs.pth")

### Load model locally

In [None]:
device = torch.device("cuda", if torch.cuda.is_available() else "cpu")
model = CausalityBERT()
model.load_state_dict(torch.load("finetuned-35-epochs.pth"))
model.to(device)
model.eval()

### Small example

In [33]:
# Small steps
sample = trainingData.sample(n=5, random_state=11)[3:]
sample.head()

Unnamed: 0,tweet,Causal association,BIOtags
447,I've been light headed and shakey for the last...,1.0,"[O, O, O, B-E, I-E, O, B-E, O, O, O, O, O, O, ..."
7584,2 before to 0.,0.0,"[O, O, O, O, O]"


In [34]:
N_bio_tags = 5 
train_dataset = TweetDataSet(sample["tweet"].map(normalizeTweet).values.tolist()
                           , sample["Causal association"].values.tolist()
                           , sample["BIOtags"].values.tolist()
                           , tokenizer)

train_loader = DataLoader(train_dataset, batch_size=2, shuffle=True)

print("Tweet:")
print(sample.iloc[0]["tweet"])
print("BIO tags:")
print(sample.iloc[0]["BIOtags"])
print("\ntokenized:")
print(tokenizer.convert_ids_to_tokens(train_dataset[1]["input_ids"]))
print("BIO tags extended:")
print(train_dataset[0]["bio_tags"])
print("\nids:")
print(train_dataset[0]["input_ids"])
print("BIO tags extended:")
print(train_dataset[0]["bio_tags"])
print("attention mask:")
print(train_dataset[0]["attention_mask"])


Tweet:
I've been light headed and shakey for the last 4 hours due to low blood sugar and it's uncomfortable and debilitating !
BIO tags:
['O', 'O', 'O', 'B-E', 'I-E', 'O', 'B-E', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-C', 'I-C', 'I-C', 'O', 'O', 'O', 'O', 'O', 'O', 'O']

tokenized:
['<s>', '2', 'before', 'to', '0', '.', '</s>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>']
BIO tags extended:
tensor([-100,    0,    0,    0,    3,    4,    0,    3, -100,    0,    0,    0,
           0,    0,    0,    0,    1,    2,    2,    0,    0,    0,    0,    0,
           0, -100, -100,    0, -100])

ids:
tensor([    0,     8,   120,   108,   937,  4432,    13,  2258,  1499,    19,
            6,   175,   204,   493,  1006,     9,  1101,  1945,  4057,    13,
           18,    20,  6976,    13, 13084, 41480,  1526,    12,     2])
BIO tags extended:
te

  , "labels" : torch.tensor(self.labels[idx], dtype=torch.long)


In [39]:
for batch in tqdm(train_loader):
    optim.zero_grad() # gradients get accumulated by default -> clear previous accumulated gradients
    input_ids = batch['input_ids'].to(device)
    attention_mask = batch['attention_mask'].to(device)
    token_type_ids = batch["token_type_ids"].to(device)
    labels = batch["labels"].to(device)
    bio_tags = batch['bio_tags'].to(device)
    print("BATCH:")
    print("tweet A:", tokenizer.convert_ids_to_tokens(input_ids[0]))
    print("tweet B:", tokenizer.convert_ids_to_tokens(input_ids[1]))
    print("tweet A shape:", len(tokenizer.convert_ids_to_tokens(input_ids[0])))
    print("tweet B shape:", len(tokenizer.convert_ids_to_tokens(input_ids[1])))    
    print("============\n")
    
    ################################################
    model.train() # set model to training mode
    logits_cls, logits_ner = model(**{"input_ids":input_ids, "attention_mask":attention_mask, "token_type_ids":token_type_ids}) # forward pass

    print("logits_cls.shape:", logits_cls.shape)
    print("logits_ner.shape:", logits_ner.shape)
    print("bio_tags.shape:", bio_tags.shape)
    print("============\n")

  , "labels" : torch.tensor(self.labels[idx], dtype=torch.long)
100%|██████████| 1/1 [00:00<00:00,  5.85it/s]

BATCH:
tweet A: ['<s>', '2', 'before', 'to', '0', '.', '</s>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>']
tweet B: ['<s>', 'I', "'ve", 'been', 'light', 'headed', 'and', 'sha@@', 'key', 'for', 'the', 'last', '4', 'hours', 'due', 'to', 'low', 'blood', 'sugar', 'and', 'it', "'s", 'uncomfortable', 'and', 'deb@@', 'ilit@@', 'ating', '!', '</s>']
tweet A shape: 29
tweet B shape: 29

logits_cls.shape: torch.Size([2, 5])
logits_ner.shape: torch.Size([2, 29, 5])
bio_tags.shape: torch.Size([2, 29])






In [40]:
loss_cls = loss_fn(logits_cls, labels)
print("loss_cls:", loss_cls)
        

#################################################
# similar to the class RobertaForToken classification in transformers: https://github.com/huggingface/transformers/blob/master/src/transformers/models/roberta/modeling_roberta.py
active_loss = attention_mask.view(-1) == 1  # either based on attention_mask (includes <CLS>, <SEP> token)
print("active_loss.shape:", active_loss.shape)
print("active_loss:", active_loss)

#active_loss2 = bio_tags.view(-1) != -100   # excludes all special tokens including <CLS>, <SEP>
active_logits = logits_ner.view(-1, N_bio_tags)[active_loss] # 5 
active_tags = bio_tags.view(-1)[active_loss]
loss_ner = loss_fn(active_logits, active_tags)
print("loss_ner:", loss_ner)
print("active_logits:", active_logits.shape)
print("active_tags:", active_tags.shape)
loss = loss_cls + loss_ner  # combine binary classification loss and named entity recognition loss
print("loss:", loss)
print("============\n")
                        

loss_cls: tensor(1.5943, grad_fn=<NllLossBackward>)
active_loss.shape: torch.Size([58])
active_loss: tensor([ True,  True,  True,  True,  True,  True,  True, False, False, False,
        False, False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True])
loss_ner: tensor(1.3226, grad_fn=<NllLossBackward>)
active_logits: torch.Size([36, 5])
active_tags: torch.Size([36])
loss: tensor(2.9169, grad_fn=<AddBackward0>)



In [41]:
output_seq, output_cls = model.bert(input_ids, attention_mask = attention_mask, token_type_ids=token_type_ids, return_dict=False) # if output 1 is our cls token
print(output_seq.shape)
print(output_cls.shape)

torch.Size([2, 29, 768])
torch.Size([2, 768])


In [None]:

TODO: 
    - write annotation guidelines
    - check model predictions, where does it fail?
    - 
