In [1]:
import pandas as pd
import numpy as np
import spacy 
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, matthews_corrcoef
from transformers import BertForSequenceClassification, AutoTokenizer
from transformers import AdamW, get_linear_schedule_with_warmup
from tqdm import tqdm, trange
import random
import os
import torch.nn.functional as F
import torch
from torch.nn import CrossEntropyLoss
from torch.utils.data import DataLoader
import transformers
from tqdm import tqdm, trange
from utils import normalizeTweet, split_into_sentences, bio_tagging, create_training_data



data = pd.read_excel("/home/adrian/workspace/causality/Causal-associations-diabetes-twitter/data/Causality + hypoglycemia.xlsx", sheet_name=">5000_samples_")
#data = pd.read_excel("/home/adrian/Downloads/Causality + hypoglycemia.xlsx", sheet_name=">5000_samples_")
print("Total count:", data.shape[0])
data = data[data["Causal association"].notnull()]
print("Labeled count:", data.shape[0])

data.head()

Total count: 5434
Labeled count: 5000


Unnamed: 0,id,text,full_text,Intent,Cause,Effect,Causal association,Charline association0=no;1=yes,Remarks
0,908171203029868545,"tonight , I learned my older girl will back he...","tonight , I learned my older girl will back he...",,,,0.0,,
1,1203645589214367745,USER USER I knew diabetes and fibromyalgia wer...,USER USER I knew diabetes and fibromyalgia wer...,joke,,,0.0,,
2,1310596731063525376,⬇ ️ ⬇ ️ ⬇ ️ THIS ⬇ ️ ⬇ ️ ⬇ ️ My wife has type ...,⬇ ️ ⬇ ️ ⬇ ️ THIS ⬇ ️ ⬇ ️ ⬇ ️ My wife has type ...,mS,,,0.0,,
3,1125198453167022085,USER Cheers ! Have one for this diabetic too !,USER Cheers ! Have one for this diabetic too !,mS,,,0.0,,
4,1248600944138268673,USER Additionally the medicines are being char...,USER Additionally the medicines are being char...,,medicines are being charged at MRP,costing much higher,1.0,,


## Add BIO tags

In [2]:
data["tokenized"] = data["full_text"].map(lambda tweet: normalizeTweet(tweet).split(" "))
data["bio_tags"] = data.apply(lambda row: bio_tagging(row["full_text"],row["Cause"], row["Effect"]), axis=1)
data.head()

Unnamed: 0,id,text,full_text,Intent,Cause,Effect,Causal association,Charline association0=no;1=yes,Remarks,tokenized,bio_tags
0,908171203029868545,"tonight , I learned my older girl will back he...","tonight , I learned my older girl will back he...",,,,0.0,,,"[tonight, ,, I, learned, my, older, girl, will...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
1,1203645589214367745,USER USER I knew diabetes and fibromyalgia wer...,USER USER I knew diabetes and fibromyalgia wer...,joke,,,0.0,,,"[USER, USER, I, knew, diabetes, and, fibromyal...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
2,1310596731063525376,⬇ ️ ⬇ ️ ⬇ ️ THIS ⬇ ️ ⬇ ️ ⬇ ️ My wife has type ...,⬇ ️ ⬇ ️ ⬇ ️ THIS ⬇ ️ ⬇ ️ ⬇ ️ My wife has type ...,mS,,,0.0,,,"[:down_arrow:, :down_arrow:, :down_arrow:, THI...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
3,1125198453167022085,USER Cheers ! Have one for this diabetic too !,USER Cheers ! Have one for this diabetic too !,mS,,,0.0,,,"[USER, Cheers, !, Have, one, for, this, diabet...","[O, O, O, O, O, O, O, O, O, O]"
4,1248600944138268673,USER Additionally the medicines are being char...,USER Additionally the medicines are being char...,,medicines are being charged at MRP,costing much higher,1.0,,,"[USER, Additionally, the, medicines, are, bein...","[O, O, O, B-C, I-C, I-C, I-C, I-C, I-C, O, O, ..."


### split tweets into sentences => new dataframe with more rows

In [3]:
def get_start_end_index_of_sentence_in_tweet(tweet, sentence):
    """ 
    The sentence tokens are included in the tweet tokens.
    Return the start end end indices of the sentence tokens in the tweet tokens

    """

    sentence_start_word = sentence[0]
    start_indices = [i for i, x in enumerate(tweet) if x == sentence_start_word] # find all indices of the start word of the sentence 
    try:
        for start_index in start_indices:
            isTrueStartIndex = all([tweet[start_index+i] == sentence[i] for i in range(len(sentence))])
            #print("start_index:", start_index, "isTrueStartIndex:", isTrueStartIndex)
            if isTrueStartIndex:
                return start_index, start_index + len(sentence) 
    except:
        print("ERROR: StartIndex should have been found for sentence:")
        print("tweet:")
        print(tweet)
        print("sentence:")
        print(sentence)
    return -1, -2 # should not be returned


def split_tweets_to_sentences(data):
    """ 
        Splits tweets into sentences and associates the appropriate intent, causes, effects and causal association
        to each sentence.
        
        Parameters:
        - min_words_in_sentences: Minimal number of words in a sentence such that the sentence is kept. 
                                  Assumption: A sentence with too few words does not have enough information
                              
                              
                              
        Ex.:
        full_text                              | Intent | Cause | Effect | Causal association | ...
        --------------------------------------------------------------------------------------------
        what? type 1 causes insulin dependence | q;msS  | type 1|insulin dependence | 1       | ...  
        
        New dataframe returned: 
        full_text                              | Intent | Cause | Effect | Causal association | ...
        --------------------------------------------------------------------------------------------
        what?                                  |   q    |       |        |       0            | ...
        type 1 causes insulin dependence       |        | type 1| insulin dependence | 1       | ...  
    """

    newDF = pd.DataFrame(columns=["sentence", "Intent", "Cause", "Effect", "Causal association", "tokenized", "bio_tags"])
    
    for i,row in data.iterrows():
        causes = row["Cause"]
        effects = row["Effect"]
        sentences = split_into_sentences(normalizeTweet(row["full_text"]))

        # single sentence in tweet
        if len(sentences) == 1:
            singleSentenceIntent = ""
            if isinstance(row["Intent"], str):
                if len(row["Intent"].split(";")) > 1:
                    singleSentenceIntent = row["Intent"].strip().replace(";msS", "").replace("msS;", "").replace(";mS", "").replace("mS;", "")
                else:
                    if row["Intent"] == "mS" or row["Intent"] == "msS":
                        singleSentenceIntent = ""
                    else:
                        singleSentenceIntent = row["Intent"].strip()
                    
            newDF=newDF.append(pd.Series({"sentence": sentences[0] # only one sentence
                         , "Intent": singleSentenceIntent
                         , "Cause" : row["Cause"]
                         , "Effect": row["Effect"]
                         , "Causal association" : row["Causal association"]
                         , "tokenized": row["tokenized"]
                         , "bio_tags": row["bio_tags"]}), ignore_index=True)
        
        # tweet has several sentences
        else: 
            intents = str(row["Intent"]).strip().split(";")
            
            for sentence in sentences:
                sent_tokenized = sentence.split(" ")
                
                causeInSentence = np.nan if not isinstance(causes, str) or not any([cause in sentence for cause in causes.split(";")]) else ";".join([cause for cause in causes.split(";") if cause in sentence])
                effectInSentence = np.nan if not isinstance(effects, str) or not any([effect in sentence for effect in effects.split(";")]) else ";".join([effect for effect in effects.split(";") if effect in sentence])
                causalAssociationInSentence = 1 if isinstance(causeInSentence, str) and isinstance(effectInSentence, str) else 0
                
                startIndex, endIndex = get_start_end_index_of_sentence_in_tweet(row["tokenized"], sent_tokenized)
                sentence_tokenized = row["tokenized"][startIndex:endIndex]
                sentence_bio_tags = row["bio_tags"][startIndex:endIndex]
                
                if "q" in intents and sentence[-1] == "?": # if current sentence is question
                    newDF=newDF.append(pd.Series({"sentence": sentence, "Intent": "q", "Cause" : causeInSentence
                                                , "Effect": effectInSentence, "Causal association" : causalAssociationInSentence
                                                , "tokenized": sentence_tokenized, "bio_tags": sentence_bio_tags}), ignore_index=True)                    
                elif "joke" in intents: # all sentences with "joke" in tweet keep the intent "joke"
                    newDF=newDF.append(pd.Series({"sentence": sentence, "Intent": "joke", "Cause" : causeInSentence
                                                , "Effect": effectInSentence, "Causal association" : causalAssociationInSentence
                                                , "tokenized": sentence_tokenized, "bio_tags": sentence_bio_tags}), ignore_index=True)   
                elif "neg" in intents: # all sentences with "neg" in tweet keep intent "neg"
                    newDF=newDF.append(pd.Series({"sentence": sentence, "Intent": "neg", "Cause" : causeInSentence
                                                , "Effect": effectInSentence, "Causal association" : causalAssociationInSentence
                                                , "tokenized": sentence_tokenized, "bio_tags": sentence_bio_tags}), ignore_index=True)               
                elif isinstance(causeInSentence, str) and isinstance(effectInSentence, str): # cause effect sentence
                    causalIntent = ""
                    if len(causeInSentence.split(";")) > 1:
                        causalIntent = "mC"
                        if len(effectInSentence.split(";")) > 1:
                            causalIntent = "mC;mE"
                    elif len(effectInSentence.split(";")) > 1:
                        causalIntent = "mE"
                    newDF=newDF.append(pd.Series({"sentence": sentence, "Intent": causalIntent, "Cause" : causeInSentence
                                                , "Effect": effectInSentence, "Causal association" : causalAssociationInSentence
                                                , "tokenized": sentence_tokenized, "bio_tags": sentence_bio_tags}), ignore_index=True)                                  
                else:
                    nonCausalIntent = ""
                    if isinstance(causeInSentence, str): # only cause is given
                        if len(causeInSentence.split(";")) > 1:
                            nonCausalIntent = "mC"
                    elif isinstance(effectInSentence, str): # only effect is given
                        if len(effectInSentence.split(";")) > 1:
                            nonCausalIntent = "mE"
                    newDF=newDF.append(pd.Series({"sentence": sentence, "Intent": nonCausalIntent, "Cause" : causeInSentence
                                                , "Effect": effectInSentence, "Causal association" : causalAssociationInSentence
                                                , "tokenized": sentence_tokenized, "bio_tags": sentence_bio_tags}), ignore_index=True)

    return newDF
       
# sample: has one example for each possible "Intent" value
#allIntents = data["Intent"].value_counts().keys().tolist()
#sample = data[data["Intent"] == "mS"][0:1]
#for intent in allIntents:
#    sample = sample.append(data[data["Intent"] == intent][1:2])
#print(sample.shape)

#i = 19
#test = sample[i:i+1]
#dataSentences = split_tweets_to_sentences(test)
#dataSentences.head(30)
#test.head()

print("N tweets:", data.shape[0])
dataSentences = split_tweets_to_sentences(data)
print("N sentences:", dataSentences.shape[0])
dataSentences.head()

N tweets: 5000
N sentences: 11784


Unnamed: 0,sentence,Intent,Cause,Effect,Causal association,tokenized,bio_tags
0,"tonight , I learned my older girl will back he...",,,,0,"[tonight, ,, I, learned, my, older, girl, will...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O]"
1,Fiercely .,,,,0,"[Fiercely, .]","[O, O]"
2,#impressive #bigsister #type1 #type1times2,,,,0,"[#impressive, #bigsister, #type1, #type1times2]","[O, O, O, O]"
3,USER USER I knew diabetes and fibromyalgia wer...,joke,,,0,"[USER, USER, I, knew, diabetes, and, fibromyal...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O]"
4,:face_with_rolling_eyes:,joke,,,0,[:face_with_rolling_eyes:],[O]


### Filter out negation, jokes, questions and sentences with minimal token length of 3

In [4]:
print("N sentences before filtering: ", dataSentences.shape[0])
dataSentFiltered = dataSentences[~dataSentences["Intent"].str.contains("neg|joke|q")] # remove sentences with joke, q, neg
dataSentFiltered = dataSentFiltered[dataSentFiltered["tokenized"].map(len) >= 3] # only keep sentences with at least 3 words
print("N sentences after filtering: ", dataSentFiltered.shape[0])
dataSentFiltered.head()


N sentences before filtering:  11784
N sentences after filtering:  8835


Unnamed: 0,sentence,Intent,Cause,Effect,Causal association,tokenized,bio_tags
0,"tonight , I learned my older girl will back he...",,,,0,"[tonight, ,, I, learned, my, older, girl, will...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O]"
2,#impressive #bigsister #type1 #type1times2,,,,0,"[#impressive, #bigsister, #type1, #type1times2]","[O, O, O, O]"
5,:down_arrow: :down_arrow: :down_arrow: THIS :d...,,,,0,"[:down_arrow:, :down_arrow:, :down_arrow:, THI...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O]"
6,I 'm a trans woman .,,,,0,"[I, 'm, a, trans, woman, .]","[O, O, O, O, O, O]"
7,"Both of us could use a world where "" brave and...",,,,0,"[Both, of, us, could, use, a, world, where, "",...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."


### only work on cause-effect tweets?

In [5]:
# only take sentences with cause and effect
#trainingData = dataSentFiltered[dataSentFiltered["Causal association"] == 1]
#trainingData.shape

# not in the multitask setting

### Create training, validation, test sets

In [5]:
####################### Stratified splits ####################
trainingData = dataSentFiltered#.sample(n=300, random_state=0) # TODO: remove sample
train, test = train_test_split(trainingData, test_size=0.2, stratify=trainingData[["Causal association"]], random_state=0)
train, val = train_test_split(train, test_size=0.2, stratify=train[["Causal association"]], random_state=0)

data_count_info = trainingData["Causal association"].value_counts(normalize=True)
train_count_info = train["Causal association"].value_counts(normalize=True)
val_count_info = val["Causal association"].value_counts(normalize=True)
test_count_info = test["Causal association"].value_counts(normalize=True)

# for class-imbalanced dataset, the class weight for a ith class
# to be specified for balancing in the loss function is given by:
# weight[i] = num_samples / (num_classes * num_samples[i])
# since train_count_info obtained above has fraction of samples
# for ith class, hence the corresponding weight calculation is:
class_weight = (1/train_count_info)/len(train_count_info)

print("All: \tCount = {}, % of 0 = {}, % of 1 = {}".format(
    len(trainingData["Causal association"]), *data_count_info.round(4).to_list()))
print("Train: \tCount = {}, % of 0 = {}, % of 1 = {}".format(
    len(train["Causal association"]), *train_count_info.round(4).to_list()))
print("Val: \tCount = {}, % of 0 = {}, % of 1 = {}".format(
    len(val["Causal association"]), *val_count_info.round(4).to_list()))
print("Test: \tCount = {}, % of 0 = {}, % of 1 = {}".format(
    len(test["Causal association"]), *test_count_info.round(4).to_list()))
print("Balancing class wts: for 0 = {}, for 1 = {}".format(
    *class_weight.round(4).to_list()))

All: 	Count = 300, % of 0 = 0.8767, % of 1 = 0.1233
Train: 	Count = 192, % of 0 = 0.875, % of 1 = 0.125
Val: 	Count = 48, % of 0 = 0.875, % of 1 = 0.125
Test: 	Count = 60, % of 0 = 0.8833, % of 1 = 0.1167
Balancing class wts: for 0 = 0.5714, for 1 = 4.0


In [6]:
trainingData["Causal association"].value_counts()

0    263
1     37
Name: Causal association, dtype: int64

In [9]:
# Transform labels + encodings into Pytorch DataSet object (including __len__, __getitem__)
class TweetDataSet(torch.utils.data.Dataset):
    def __init__(self, text, labels, bio_tags, tokenizer):
        self.text = text
        self.labels = labels
        self.tokenizer = tokenizer
        self.bio_tags = bio_tags
        self.tag2id = {label: idx for idx, label in enumerate(["O", "B-C", "I-C", "B-E", "I-E"])}
        self.tag2id[-100] = -100
        self.id2tag = {id:tag for tag,id in self.tag2id.items()}

    def __getitem__(self, idx):
        inputs = self.tokenizer(self.text, padding=True, truncation=True, return_token_type_ids=True)
        ids = inputs["input_ids"]
        mask = inputs["attention_mask"]
        token_type_ids = inputs["token_type_ids"]
        bio_tags_extended = self.extend_tags(self.text[idx], self.bio_tags[idx], ids[idx])
        assert(len(ids[idx]) == len(bio_tags_extended), "token ids and BIO tags lengths do not match!")
        return {
                "input_ids" : torch.tensor(ids[idx], dtype=torch.long)
              , "attention_mask" : torch.tensor(mask[idx], dtype=torch.long)
              , "token_type_ids" : torch.tensor(token_type_ids[idx], dtype=torch.long)
              , "labels" : torch.tensor(self.labels[idx], dtype=torch.long)
              , "bio_tags" : torch.tensor(list(map(lambda bioTags: self.tag2id[bioTags], bio_tags_extended))
, dtype=torch.long)
        }

    def __len__(self):
        return len(self.labels)

    
    def extend_tags(self, tokens_old, tags_old, ids_tokenized_padded):
        """ 
            Each token has a BIO tag label. 
            However BERT's tokenization splits tokens into subwords. How to label those subwords?
            
            Option 1:
            ---------
            
            add the same label to each subword than the first subword. Only replace "B" by "I"
            Ex. 
            #lowbloodsugar => '#low@@', 'blood@@', 'sugar@@'
               "B-C"       =>   "B-C" ,   "I-C"  ,   "I-C"
            
            Option 2 (implemented):      
            ---------
            
            From : https://huggingface.co/transformers/custom_datasets.html#token-classification-with-w-nut-emerging-entities
            A common obstacle with using pre-trained models for token-level classification: many of the tokens in
            the W-NUT corpus are not in DistilBert’s vocabulary. Bert and many models like it use a method called 
            WordPiece Tokenization, meaning that single words are split into multiple tokens such that each token
            is likely to be in the vocabulary. For example, DistilBert’s tokenizer would split the Twitter 
            handle @huggingface into the tokens ['@', 'hugging', '##face']. This is a problem for us because we 
            have exactly one tag per token. If the tokenizer splits a token into multiple sub-tokens, then we will
            end up with a mismatch between our tokens and our labels.

            One way to handle this is to only train on the tag labels for the first subtoken of a split token. 
            We can do this in 🤗 Transformers by setting the labels we wish to ignore to -100. 
            In the example above, if the label for @HuggingFace is 3 (indexing B-corporation), we would set 
            the labels of ['@', 'hugging', '##face'] to [3, -100, -100].
        """
        tags = [-100] # add for start token <CLS>
        for token_old, tag in zip(tokens_old.split(" "), tags_old):
#            print(F"\ntoken_old: {token_old};    tag: {tag}")
            for i, sub_token in enumerate(self.tokenizer.tokenize(token_old)):
                if (i == 0):
                    tags.append(tag)
                else: 
                    tags.append(-100)
           
        tags.append(-100) # 0 for end of sentence token
    
        # append -100 for all padded elements
        padded_elements = ids_tokenized_padded.count(1) # id 1 is <PAD> ; Alternative: where attention_mask == 0 add -100
        tags.extend([-100]*padded_elements)
        
        return tags
        
        
    
tokenizer = AutoTokenizer.from_pretrained("vinai/bertweet-base")

train_dataset = TweetDataSet(train["sentence"].values.tolist()
                           , train["Causal association"].values.tolist()
                           , train["bio_tags"].values.tolist()
                           , tokenizer)
val_dataset = TweetDataSet(val["sentence"].values.tolist()
                           , val["Causal association"].values.tolist()
                           , val["bio_tags"].values.tolist()
                           , tokenizer)
test_dataset = TweetDataSet(test["sentence"].values.tolist()
                           , test["Causal association"].values.tolist()
                           , test["bio_tags"].values.tolist()
                           , tokenizer)
print(len(train_dataset))
print(len(val_dataset))
print(len(test_dataset))

# put data to batches
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
validation_loader = DataLoader(val_dataset, batch_size=8, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=8, shuffle=True)


  assert(len(ids[idx]) == len(bio_tags_extended), "token ids and BIO tags lengths do not match!")
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


192
48
60


In [11]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def compute_metrics(pred, labels):
    """
        Dataset is unbalanced -> measure weighted metrics
        Calculate metrics for each label, and find their average wieghted by support (Number of true instances for each label)
        This alters 'macro' to account for label imbalance;
        it can result in an F-Score taht is not between precision and recall
    """
    precision, recall, f1, _ = precision_recall_fscore_support(labels, pred, average='macro') #binary
    acc = accuracy_score(labels, pred)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }



class CausalMultiTask(torch.nn.Module):
    """ Model Bert"""
    def __init__(self):
        super(CausalMultiTask, self).__init__()
        self.num_labels_NER = 5 # B-C, I-C, B-E, I-E, O
        self.num_labels_CLS = 2 # 0, 1
        self.bert = transformers.BertModel.from_pretrained("vinai/bertweet-base")
        self.dropout = torch.nn.Dropout(0.3)
        self.linear1 = torch.nn.Linear(768, 256)
        self.linear_NER = torch.nn.Linear(256, self.num_labels_NER)
        self.linear_CLS = torch.nn.Linear(256, self.num_labels_CLS)
        #self.softmax = torch.nn.Softmax(-1)
        
    def forward(self, input_ids, attention_mask, token_type_ids):
        output_seq, output_cls = self.bert(input_ids, attention_mask = attention_mask, token_type_ids=token_type_ids, return_dict=False) # if output 1 is our cls token

        # classification
        output_cls_2 = self.dropout(output_cls)
        output_cls_3 = self.linear1(output_cls_2)
        output_cls_4 = self.dropout(output_cls_3)
        output_cls_5 = self.linear_CLS(output_cls_4)
        #logit_cls = self.softmax(output_cls_5)
        
        # named entity recognition
        output_ner_2 = self.dropout(output_seq)
        output_ner_3 = self.linear1(output_ner_2)
        output_ner_4 = self.dropout(output_ner_3)
        output_ner_5 = self.linear_NER(output_ner_4)
        #logit_ner = self.softmax(output_ner_5)        
        
        return output_cls_5, output_ner_5



### Model parameters

In [12]:
## Model parameters
batchsize_train = 16
lr = 5e-5
adam_eps = 1e-8
epochs = 3
num_warmup_steps = 0
num_training_steps = len(train_loader)*epochs

In [13]:

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

model = CausalMultiTask()
model.to(device)

# TODO: Check in Multi-task setting, if underlying BERT parameters shall
# be updated too to benefit from common training
for param in model.bert.parameters():
    param.requires_grad = False
    


optim = AdamW(model.parameters(), lr=lr, eps=adam_eps)
scheduler = get_linear_schedule_with_warmup(optim, num_warmup_steps=num_warmup_steps, num_training_steps=num_training_steps)

loss_fn_ner = CrossEntropyLoss(ignore_index=-100) # ignore subwords/tokens with label -100 
## penalising more for class with less number of exaplmes 
loss_fn_cls = CrossEntropyLoss(torch.tensor(class_weight.to_list()).to(device))


You are using a model of type roberta to instantiate a model of type bert. This is not supported for all configurations of models and can yield errors.
Some weights of the model checkpoint at vinai/bertweet-base were not used when initializing BertModel: ['roberta.encoder.layer.4.attention.self.query.weight', 'roberta.encoder.layer.10.intermediate.dense.weight', 'roberta.encoder.layer.3.attention.self.value.weight', 'roberta.encoder.layer.2.attention.self.value.bias', 'roberta.encoder.layer.3.output.dense.bias', 'roberta.encoder.layer.7.attention.self.value.bias', 'roberta.encoder.layer.9.attention.output.dense.bias', 'roberta.encoder.layer.7.output.LayerNorm.bias', 'roberta.encoder.layer.2.attention.output.dense.weight', 'roberta.encoder.layer.6.attention.output.LayerNorm.weight', 'roberta.encoder.layer.4.attention.self.key.bias', 'roberta.encoder.layer.0.attention.self.query.bias', 'roberta.encoder.layer.4.intermediate.dense.bias', 'roberta.pooler.dense.weight', 'roberta.encoder.laye

Some weights of BertModel were not initialized from the model checkpoint at vinai/bertweet-base and are newly initialized: ['encoder.layer.3.attention.self.value.weight', 'encoder.layer.1.intermediate.dense.bias', 'encoder.layer.5.attention.self.query.bias', 'encoder.layer.4.attention.output.LayerNorm.weight', 'encoder.layer.3.attention.self.value.bias', 'encoder.layer.3.output.dense.weight', 'encoder.layer.4.attention.self.value.bias', 'encoder.layer.10.attention.self.value.bias', 'encoder.layer.7.output.dense.weight', 'encoder.layer.0.attention.output.dense.weight', 'encoder.layer.6.attention.self.value.weight', 'encoder.layer.3.attention.self.key.weight', 'encoder.layer.8.output.LayerNorm.bias', 'encoder.layer.2.attention.output.LayerNorm.weight', 'pooler.dense.weight', 'encoder.layer.5.attention.output.LayerNorm.weight', 'encoder.layer.10.output.dense.bias', 'encoder.layer.10.output.dense.weight', 'encoder.layer.2.intermediate.dense.weight', 'encoder.layer.10.attention.self.query.b

### Training

In [15]:
# Store our loss and learning rate for plotting
learning_rate = []

N_bio_tags = 5 # "O", "B-C", "I-C", "B-E", "I-C"
for epoch in trange(1, epochs+1, desc='Epoch'):
    print("<" + "="*22 + F" Epoch {epoch} "+ "="*22 + ">")

    
    ############ training eval metrics ######################
    nb_tr_steps = 0 # Tracking variables
    train_loss = []
    train_cls_acc = []
    train_cls_prec = []
    train_cls_rec = []
    train_cls_f1 = []
    train_ner_acc = []
    train_ner_prec = []
    train_ner_rec = []
    train_ner_f1 = []    
    #########################################################
    
    
    for batch in tqdm(train_loader):
        optim.zero_grad() # gradients get accumulated by default -> clear previous accumulated gradients
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        token_type_ids = batch["token_type_ids"].to(device)
        labels = batch['labels'].to(device)
        bio_tags = batch['bio_tags'].to(device)
        
        ################################################
        model.train() # set model to training mode
        logits_cls, logits_ner = model(**{"input_ids":input_ids, "attention_mask":attention_mask, "token_type_ids":token_type_ids}) # forward pass

        ################# Loss function ############################### 
        ### CLS
        loss_cls = loss_fn_cls(logits_cls, labels)
        print("\tloss_cls:", loss_cls)
        
        ### NER
        # similar to the class RobertaForToken classification in transformers: https://github.com/huggingface/transformers/blob/master/src/transformers/models/roberta/modeling_roberta.py
        active_loss = attention_mask.view(-1) == 1  # either based on attention_mask (includes <CLS>, <SEP> token)
        active_logits = logits_ner.view(-1, N_bio_tags)[active_loss] # N_bio_tags=5 
        active_tags = bio_tags.view(-1)[active_loss]
        loss_ner = loss_fn_ner(active_logits, active_tags)             
        print("\tloss_ner:", loss_ner)   
        
        loss = loss_cls + loss_ner  # combine binary classification loss and named entity recognition loss
        print("loss:", loss)      
        loss.backward() # backward pass
        optim.step()    # update parameters and take a steup using the computed gradient
        scheduler.step()# update learning rate scheduler
        train_loss.append(loss.item())
            
            
        ################## Training Performance Measures ##########
        ### CLS
        logits_cls = logits_cls.detach().to('cpu').numpy()
        label_ids = labels.to('cpu').numpy()

        pred_flat = np.argmax(logits_cls, axis=1).flatten()
        labels_flat = label_ids.flatten()
        
        metrics_cls = compute_metrics(pred_flat, labels_flat)
        train_cls_acc.append(metrics_cls["accuracy"])
        train_cls_prec.append(metrics_cls["precision"])
        train_cls_rec.append(metrics_cls["recall"])
        train_cls_f1.append(metrics_cls["f1"])
        
        #### NER 
        logits_ner = logits_ner.detach().to('cpu').numpy()
        tags_ids = bio_tags.to('cpu').numpy()

        # calculate performance measures only on tokens and not subwords or special tokens
        tags_mask = tags_ids != -100 # only get token labels and not labels from subwords or special tokens
        pred = np.argmax(logits_ner, axis=2)[tags_mask] #.flatten() # convert logits to list of predicted labels
        tags = tags_ids[tags_mask]                      
                
        metrics_ner = compute_metrics(pred, tags)
        train_ner_acc.append(metrics_ner["accuracy"])
        train_ner_prec.append(metrics_ner["precision"])
        train_ner_rec.append(metrics_ner["recall"])
        train_ner_f1.append(metrics_ner["f1"])
                          
        nb_tr_steps += 1
           
    print(F'\n\tTraining Loss: {np.mean(train_loss)}')
    print(F'\n\tTraining cls acc: {np.mean(train_cls_acc)}')
    print(F'\n\tTraining cls prec: {np.mean(train_cls_prec)}')
    print(F'\n\tTraining cls rec: {np.mean(train_cls_rec)}')
    print(F'\n\tTraining cls f1: {np.mean(train_cls_f1)}')
    print(F'\n--\n\tTraining ner acc: {np.mean(train_ner_acc)}')
    print(F'\n\tTraining ner prec: {np.mean(train_ner_prec)}')
    print(F'\n\tTraining ner rec: {np.mean(train_ner_rec)}')
    print(F'\n\tTraining ner f1: {np.mean(train_ner_f1)}')
                          
                          
    # store the current learning rate
    for param_group in optim.param_groups:
        print("\n\tCurrent Learning rate: ", param_group['lr'])
        learning_rate.append(param_group['lr'])
    

    ############# Validation ################
    
    val_accuracy = []
    val_loss = []
    val_cls_acc = []
    val_cls_prec = []
    val_cls_rec = []
    val_cls_f1 = []
    val_ner_acc = []
    val_ner_prec = []
    val_ner_rec = []
    val_ner_f1 = []
    
    # Evaluate data for one epoch
    for batch in tqdm(validation_loader):
        batch = tuple(batch[t].to(device) for t in batch)      # batch to GPU
        v_input_ids, v_input_mask, v_token_type_ids, v_labels, v_bio_tags = batch  # unpack inputs from dataloader
        
        with torch.no_grad(): # tell model not to compute or store gradients -> saves memory + speeds up validation
            model.eval() # put model in evaluation mode for validation set
            logits_cls, logits_ner = model(**{"input_ids":v_input_ids, "attention_mask":v_input_mask, "token_type_ids":v_token_type_ids}) # forward pass, calculates logit predictions

        ############### LOSS Function #######################################
        ### CLS
        v_loss_cls = loss_fn_cls(logits_cls, v_labels)
        
        ### NER
        # similar to the class RobertaForToken classification in transformers: https://github.com/huggingface/transformers/blob/master/src/transformers/models/roberta/modeling_roberta.py
        v_active_loss = v_input_mask.view(-1) == 1  # either based on attention_mask (includes <CLS>, <SEP> token)
        v_active_logits = logits_ner.view(-1, N_bio_tags)[v_active_loss] # 5 
        v_active_tags = v_bio_tags.view(-1)[v_active_loss]
        v_loss_ner = loss_fn_ner(v_active_logits, v_active_tags)             
        v_loss = v_loss_cls + v_loss_ner
        val_loss.append(v_loss.item())

   
        ################# PERFORMANCE MEASURES ########################################
        ### CLS
        logits_cls = logits_cls.detach().to('cpu').numpy()
        label_ids = v_labels.to('cpu').numpy()

        pred_flat = np.argmax(logits_cls, axis=1).flatten()
        labels_flat = label_ids.flatten()
        
        metrics_cls = compute_metrics(pred_flat, labels_flat)
        val_cls_acc.append(metrics_cls["accuracy"])
        val_cls_prec.append(metrics_cls["precision"])
        val_cls_rec.append(metrics_cls["recall"])
        val_cls_f1.append(metrics_cls["f1"])
        
        #### NER     
        logits_ner = logits_ner.detach().to('cpu').numpy()
        tags_ids = v_bio_tags.to('cpu').numpy()

        # calculate performance measures only on tokens and not subwords or special tokens
        tags_mask = tags_ids != -100 # only get token labels and not labels from subwords or special tokens
        pred = np.argmax(logits_ner, axis=2)[tags_mask] #.flatten() # convert logits to list of predicted labels
        tags = tags_ids[tags_mask]#.flatten()        
        
        metrics = compute_metrics(pred, tags)
        val_ner_acc.append(metrics["accuracy"])
        val_ner_prec.append(metrics["precision"])
        val_ner_rec.append(metrics["recall"])
        val_ner_f1.append(metrics["f1"])
                              
           
    print(F'\n\tValidation Loss: {np.mean(val_loss)}')
    print(F'\n\tValidation cls acc: {np.mean(val_cls_acc)}')
    print(F'\n\tValidation cls prec: {np.mean(val_cls_prec)}')
    print(F'\n\tValidation cls rec: {np.mean(val_cls_rec)}')
    print(F'\n\tValidation cls f1: {np.mean(val_cls_f1)}')
    print(F'\n--\n\tValidation ner acc: {np.mean(val_ner_acc)}')
    print(F'\n\tValidation ner prec: {np.mean(val_ner_prec)}')
    print(F'\n\tValidation ner rec: {np.mean(val_ner_rec)}')
    print(F'\n\tValidation ner f1: {np.mean(val_ner_f1)}')


Epoch:   0%|          | 0/3 [00:00<?, ?it/s]
  , "labels" : torch.tensor(self.labels[idx], dtype=torch.long)




  _warn_prf(average, modifier, msg_start, len(result))

  8%|▊         | 1/12 [00:01<00:16,  1.46s/it][A

	loss_cls: tensor(0.6297, grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5720, grad_fn=<NllLossBackward>)
loss: tensor(1.2017, grad_fn=<AddBackward0>)


  , "labels" : torch.tensor(self.labels[idx], dtype=torch.long)
  _warn_prf(average, modifier, msg_start, len(result))

 17%|█▋        | 2/12 [00:02<00:14,  1.45s/it][A

	loss_cls: tensor(0.6966, grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5875, grad_fn=<NllLossBackward>)
loss: tensor(1.2842, grad_fn=<AddBackward0>)


  , "labels" : torch.tensor(self.labels[idx], dtype=torch.long)
  _warn_prf(average, modifier, msg_start, len(result))

 25%|██▌       | 3/12 [00:04<00:12,  1.44s/it][A

	loss_cls: tensor(0.7038, grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5799, grad_fn=<NllLossBackward>)
loss: tensor(1.2837, grad_fn=<AddBackward0>)


  , "labels" : torch.tensor(self.labels[idx], dtype=torch.long)
  _warn_prf(average, modifier, msg_start, len(result))

 33%|███▎      | 4/12 [00:05<00:11,  1.44s/it][A

	loss_cls: tensor(0.6732, grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4991, grad_fn=<NllLossBackward>)
loss: tensor(1.1723, grad_fn=<AddBackward0>)


  , "labels" : torch.tensor(self.labels[idx], dtype=torch.long)
  _warn_prf(average, modifier, msg_start, len(result))

 42%|████▏     | 5/12 [00:07<00:10,  1.45s/it][A

	loss_cls: tensor(0.6762, grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5257, grad_fn=<NllLossBackward>)
loss: tensor(1.2019, grad_fn=<AddBackward0>)


  , "labels" : torch.tensor(self.labels[idx], dtype=torch.long)
  _warn_prf(average, modifier, msg_start, len(result))

 50%|█████     | 6/12 [00:08<00:08,  1.46s/it][A

	loss_cls: tensor(0.6226, grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5028, grad_fn=<NllLossBackward>)
loss: tensor(1.1254, grad_fn=<AddBackward0>)


  , "labels" : torch.tensor(self.labels[idx], dtype=torch.long)
  _warn_prf(average, modifier, msg_start, len(result))

 58%|█████▊    | 7/12 [00:10<00:07,  1.47s/it][A

	loss_cls: tensor(0.7475, grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5041, grad_fn=<NllLossBackward>)
loss: tensor(1.2517, grad_fn=<AddBackward0>)


  , "labels" : torch.tensor(self.labels[idx], dtype=torch.long)
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))

 67%|██████▋   | 8/12 [00:11<00:05,  1.49s/it][A

	loss_cls: tensor(0.6475, grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5124, grad_fn=<NllLossBackward>)
loss: tensor(1.1599, grad_fn=<AddBackward0>)


  , "labels" : torch.tensor(self.labels[idx], dtype=torch.long)
  _warn_prf(average, modifier, msg_start, len(result))

 75%|███████▌  | 9/12 [00:13<00:04,  1.49s/it][A

	loss_cls: tensor(0.7523, grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4820, grad_fn=<NllLossBackward>)
loss: tensor(1.2343, grad_fn=<AddBackward0>)


  , "labels" : torch.tensor(self.labels[idx], dtype=torch.long)
  _warn_prf(average, modifier, msg_start, len(result))

 83%|████████▎ | 10/12 [00:14<00:03,  1.50s/it][A

	loss_cls: tensor(0.6937, grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5823, grad_fn=<NllLossBackward>)
loss: tensor(1.2761, grad_fn=<AddBackward0>)


  , "labels" : torch.tensor(self.labels[idx], dtype=torch.long)
  _warn_prf(average, modifier, msg_start, len(result))

 92%|█████████▏| 11/12 [00:16<00:01,  1.50s/it][A

	loss_cls: tensor(0.6364, grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4915, grad_fn=<NllLossBackward>)
loss: tensor(1.1279, grad_fn=<AddBackward0>)


  , "labels" : torch.tensor(self.labels[idx], dtype=torch.long)
  _warn_prf(average, modifier, msg_start, len(result))

100%|██████████| 12/12 [00:17<00:00,  1.47s/it][A

  0%|          | 0/6 [00:00<?, ?it/s][A

	loss_cls: tensor(0.6961, grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4245, grad_fn=<NllLossBackward>)
loss: tensor(1.1206, grad_fn=<AddBackward0>)

	Training Loss: 1.203305612007777

	Training cls acc: 0.7395833333333334

	Training cls prec: 0.5279723748473748

	Training cls rec: 0.5344169719169719

	Training cls f1: 0.5147047477482259

--
	Training ner acc: 0.9529474373367991

	Training ner prec: 0.21381219708526158

	Training ner rec: 0.22336709929302526

	Training ner f1: 0.21845238280744628

	Current Learning rate:  1.6666666666666667e-05


  , "labels" : torch.tensor(self.labels[idx], dtype=torch.long)
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))

  , "labels" : torch.tensor(self.labels[idx], dtype=torch.long)
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))

  , "labels" : torch.tensor(self.labels[idx], dtype=torch.long)
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))

  , "labels" : torch.tensor(self.labels[idx], dtype=torch.long)
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))

  , "labels" : torch.tensor(self.labels[idx], dtype=torch.long)

  , "labels" : torch.tensor(self.labels[idx], dtype=torch.long)
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))

100%|██████████| 6/6 [00:03<00:00,  1.94it/s][A
Epoch:  33%


	Validation Loss: 0.985430101553599

	Validation cls acc: 0.875

	Validation cls prec: 0.5208333333333334

	Validation cls rec: 0.5833333333333334

	Validation cls f1: 0.5492063492063493

--
	Validation ner acc: 0.9647872978379096

	Validation ner prec: 0.3779413764869827

	Validation ner rec: 0.38611111111111107

	Validation ner f1: 0.38191068719037785


  _warn_prf(average, modifier, msg_start, len(result))

  8%|▊         | 1/12 [00:01<00:15,  1.45s/it][A

	loss_cls: tensor(0.6990, grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4264, grad_fn=<NllLossBackward>)
loss: tensor(1.1255, grad_fn=<AddBackward0>)


  , "labels" : torch.tensor(self.labels[idx], dtype=torch.long)
  _warn_prf(average, modifier, msg_start, len(result))

 17%|█▋        | 2/12 [00:02<00:14,  1.45s/it][A

	loss_cls: tensor(0.6567, grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4286, grad_fn=<NllLossBackward>)
loss: tensor(1.0854, grad_fn=<AddBackward0>)


  , "labels" : torch.tensor(self.labels[idx], dtype=torch.long)
  _warn_prf(average, modifier, msg_start, len(result))

 25%|██▌       | 3/12 [00:04<00:13,  1.50s/it][A

	loss_cls: tensor(0.6741, grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3078, grad_fn=<NllLossBackward>)
loss: tensor(0.9819, grad_fn=<AddBackward0>)


  , "labels" : torch.tensor(self.labels[idx], dtype=torch.long)
  _warn_prf(average, modifier, msg_start, len(result))

 33%|███▎      | 4/12 [00:06<00:12,  1.53s/it][A

	loss_cls: tensor(0.7031, grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4132, grad_fn=<NllLossBackward>)
loss: tensor(1.1163, grad_fn=<AddBackward0>)


  , "labels" : torch.tensor(self.labels[idx], dtype=torch.long)
  _warn_prf(average, modifier, msg_start, len(result))

 42%|████▏     | 5/12 [00:07<00:10,  1.56s/it][A

	loss_cls: tensor(0.5790, grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3531, grad_fn=<NllLossBackward>)
loss: tensor(0.9321, grad_fn=<AddBackward0>)


  , "labels" : torch.tensor(self.labels[idx], dtype=torch.long)
  _warn_prf(average, modifier, msg_start, len(result))

 50%|█████     | 6/12 [00:09<00:09,  1.59s/it][A

	loss_cls: tensor(0.6855, grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3713, grad_fn=<NllLossBackward>)
loss: tensor(1.0568, grad_fn=<AddBackward0>)


  , "labels" : torch.tensor(self.labels[idx], dtype=torch.long)
  _warn_prf(average, modifier, msg_start, len(result))

 58%|█████▊    | 7/12 [00:10<00:08,  1.61s/it][A

	loss_cls: tensor(0.6458, grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3985, grad_fn=<NllLossBackward>)
loss: tensor(1.0443, grad_fn=<AddBackward0>)


  , "labels" : torch.tensor(self.labels[idx], dtype=torch.long)
  _warn_prf(average, modifier, msg_start, len(result))

 67%|██████▋   | 8/12 [00:12<00:06,  1.61s/it][A

	loss_cls: tensor(0.7569, grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4279, grad_fn=<NllLossBackward>)
loss: tensor(1.1848, grad_fn=<AddBackward0>)


  , "labels" : torch.tensor(self.labels[idx], dtype=torch.long)
  _warn_prf(average, modifier, msg_start, len(result))

 75%|███████▌  | 9/12 [00:14<00:04,  1.62s/it][A

	loss_cls: tensor(0.6137, grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3612, grad_fn=<NllLossBackward>)
loss: tensor(0.9749, grad_fn=<AddBackward0>)


  , "labels" : torch.tensor(self.labels[idx], dtype=torch.long)
  _warn_prf(average, modifier, msg_start, len(result))

 83%|████████▎ | 10/12 [00:15<00:03,  1.61s/it][A

	loss_cls: tensor(0.7671, grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3023, grad_fn=<NllLossBackward>)
loss: tensor(1.0694, grad_fn=<AddBackward0>)


  , "labels" : torch.tensor(self.labels[idx], dtype=torch.long)
  _warn_prf(average, modifier, msg_start, len(result))

 92%|█████████▏| 11/12 [00:17<00:01,  1.60s/it][A

	loss_cls: tensor(0.7047, grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5797, grad_fn=<NllLossBackward>)
loss: tensor(1.2844, grad_fn=<AddBackward0>)


  , "labels" : torch.tensor(self.labels[idx], dtype=torch.long)
  _warn_prf(average, modifier, msg_start, len(result))

100%|██████████| 12/12 [00:19<00:00,  1.59s/it][A

  0%|          | 0/6 [00:00<?, ?it/s][A

	loss_cls: tensor(0.6186, grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3233, grad_fn=<NllLossBackward>)
loss: tensor(0.9419, grad_fn=<AddBackward0>)

	Training Loss: 1.0664655317862828

	Training cls acc: 0.6979166666666666

	Training cls prec: 0.5076555389055389

	Training cls rec: 0.5582875457875459

	Training cls f1: 0.497125009941102

--
	Training ner acc: 0.9534483094329529

	Training ner prec: 0.20686699173052678

	Training ner rec: 0.2166666666666667

	Training ner f1: 0.21161547695056557

	Current Learning rate:  0.0


  , "labels" : torch.tensor(self.labels[idx], dtype=torch.long)
  _warn_prf(average, modifier, msg_start, len(result))

  , "labels" : torch.tensor(self.labels[idx], dtype=torch.long)
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))

  , "labels" : torch.tensor(self.labels[idx], dtype=torch.long)
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))

  , "labels" : torch.tensor(self.labels[idx], dtype=torch.long)
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))

  , "labels" : torch.tensor(self.labels[idx], dtype=torch.long)

  , "labels" : torch.tensor(self.labels[idx], dtype=torch.long)

100%|██████████| 6/6 [00:03<00:00,  1.76it/s][A
Epoch:  67%|██████▋   | 2/3 [00:43<00:21, 21.79s/it]
  0%|          | 0/12 [00:00<?, ?it/s][A


	Validation Loss: 0.919186015923818

	Validation cls acc: 0.875

	Validation cls prec: 0.6875

	Validation cls rec: 0.75

	Validation cls f1: 0.7142857142857143

--
	Validation ner acc: 0.9688355395550389

	Validation ner prec: 0.49026612171771394

	Validation ner rec: 0.49722222222222223

	Validation ner f1: 0.4936486821681747


  , "labels" : torch.tensor(self.labels[idx], dtype=torch.long)
  _warn_prf(average, modifier, msg_start, len(result))

  8%|▊         | 1/12 [00:01<00:17,  1.58s/it][A

	loss_cls: tensor(0.7090, grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.5453, grad_fn=<NllLossBackward>)
loss: tensor(1.2543, grad_fn=<AddBackward0>)


  , "labels" : torch.tensor(self.labels[idx], dtype=torch.long)
  _warn_prf(average, modifier, msg_start, len(result))

 17%|█▋        | 2/12 [00:03<00:15,  1.59s/it][A

	loss_cls: tensor(0.6096, grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3955, grad_fn=<NllLossBackward>)
loss: tensor(1.0051, grad_fn=<AddBackward0>)


  , "labels" : torch.tensor(self.labels[idx], dtype=torch.long)
  _warn_prf(average, modifier, msg_start, len(result))

 25%|██▌       | 3/12 [00:04<00:14,  1.58s/it][A

	loss_cls: tensor(0.5958, grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2477, grad_fn=<NllLossBackward>)
loss: tensor(0.8436, grad_fn=<AddBackward0>)


  , "labels" : torch.tensor(self.labels[idx], dtype=torch.long)
  _warn_prf(average, modifier, msg_start, len(result))

 33%|███▎      | 4/12 [00:06<00:12,  1.58s/it][A

	loss_cls: tensor(0.7938, grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3371, grad_fn=<NllLossBackward>)
loss: tensor(1.1309, grad_fn=<AddBackward0>)


  , "labels" : torch.tensor(self.labels[idx], dtype=torch.long)
  _warn_prf(average, modifier, msg_start, len(result))

 42%|████▏     | 5/12 [00:07<00:11,  1.58s/it][A

	loss_cls: tensor(0.6902, grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3228, grad_fn=<NllLossBackward>)
loss: tensor(1.0130, grad_fn=<AddBackward0>)


  , "labels" : torch.tensor(self.labels[idx], dtype=torch.long)
  _warn_prf(average, modifier, msg_start, len(result))

 50%|█████     | 6/12 [00:09<00:09,  1.59s/it][A

	loss_cls: tensor(0.6215, grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3893, grad_fn=<NllLossBackward>)
loss: tensor(1.0108, grad_fn=<AddBackward0>)


  , "labels" : torch.tensor(self.labels[idx], dtype=torch.long)
  _warn_prf(average, modifier, msg_start, len(result))

 58%|█████▊    | 7/12 [00:11<00:07,  1.59s/it][A

	loss_cls: tensor(0.6907, grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4559, grad_fn=<NllLossBackward>)
loss: tensor(1.1466, grad_fn=<AddBackward0>)


  , "labels" : torch.tensor(self.labels[idx], dtype=torch.long)
  _warn_prf(average, modifier, msg_start, len(result))

 67%|██████▋   | 8/12 [00:12<00:06,  1.60s/it][A

	loss_cls: tensor(0.7285, grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3126, grad_fn=<NllLossBackward>)
loss: tensor(1.0411, grad_fn=<AddBackward0>)


  , "labels" : torch.tensor(self.labels[idx], dtype=torch.long)
  _warn_prf(average, modifier, msg_start, len(result))

 75%|███████▌  | 9/12 [00:14<00:04,  1.59s/it][A

	loss_cls: tensor(0.6973, grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4479, grad_fn=<NllLossBackward>)
loss: tensor(1.1452, grad_fn=<AddBackward0>)


  , "labels" : torch.tensor(self.labels[idx], dtype=torch.long)
  _warn_prf(average, modifier, msg_start, len(result))

 83%|████████▎ | 10/12 [00:15<00:03,  1.58s/it][A

	loss_cls: tensor(0.6593, grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.4517, grad_fn=<NllLossBackward>)
loss: tensor(1.1111, grad_fn=<AddBackward0>)


  , "labels" : torch.tensor(self.labels[idx], dtype=torch.long)
  _warn_prf(average, modifier, msg_start, len(result))

 92%|█████████▏| 11/12 [00:17<00:01,  1.58s/it][A

	loss_cls: tensor(0.7119, grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.3736, grad_fn=<NllLossBackward>)
loss: tensor(1.0855, grad_fn=<AddBackward0>)


  , "labels" : torch.tensor(self.labels[idx], dtype=torch.long)
  _warn_prf(average, modifier, msg_start, len(result))

100%|██████████| 12/12 [00:18<00:00,  1.58s/it][A

  0%|          | 0/6 [00:00<?, ?it/s][A

	loss_cls: tensor(0.7029, grad_fn=<NllLossBackward>)
	loss_ner: tensor(0.2710, grad_fn=<NllLossBackward>)
loss: tensor(0.9739, grad_fn=<AddBackward0>)

	Training Loss: 1.0634136249621708

	Training cls acc: 0.7083333333333334

	Training cls prec: 0.5063482813482814

	Training cls rec: 0.5257613913863913

	Training cls f1: 0.4889639713056952

--
	Training ner acc: 0.9545991294854549

	Training ner prec: 0.28176025433699636

	Training ner rec: 0.2916666666666667

	Training ner f1: 0.2865565877035095

	Current Learning rate:  0.0


  , "labels" : torch.tensor(self.labels[idx], dtype=torch.long)
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))

  , "labels" : torch.tensor(self.labels[idx], dtype=torch.long)
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))

  , "labels" : torch.tensor(self.labels[idx], dtype=torch.long)
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))

  , "labels" : torch.tensor(self.labels[idx], dtype=torch.long)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))

100%|██████████| 6/6 [00:03<00:00,  1.78it/s][A
Epoch: 100%|██████████| 3/3 [01:05<00:00, 21.89s/it]


	Validation Loss: 0.945095827182134

	Validation cls acc: 0.875

	Validation cls prec: 0.5208333333333334

	Validation cls rec: 0.5833333333333334

	Validation cls f1: 0.5492063492063493

--
	Validation ner acc: 0.9659425461263754

	Validation ner prec: 0.3776989156576507

	Validation ner rec: 0.3861111111111111

	Validation ner f1: 0.3818128665449367





### Evaluation on the test dataset

In [16]:

############ test eval metrics ######################
test_loss = []
test_loss = []
test_cls_acc = []
test_cls_prec = []
test_cls_rec = []
test_cls_f1 = []
test_ner_acc = []
test_ner_prec = []
test_ner_rec = []
test_ner_f1 = []

########################################################
for batch in tqdm(test_loader):
    batch = tuple(batch[t].to(device) for t in batch)      # batch to GPU
    t_input_ids, t_input_mask, t_token_type_ids, t_labels, t_bio_tags = batch     # unpack inputs from dataloader

    with torch.no_grad(): # tell model not to compute or store gradients -> saves memory + speeds up validation
        model.eval() # put model in evaluation mode for validation set
        logits_cls, logits_ner = model(**{"input_ids":t_input_ids, "attention_mask":t_input_mask, "token_type_ids":t_token_type_ids}) # forward pass, calculates logit predictions


    ############### LOSS Function #######################################
    ### CLS
    t_loss_cls = loss_fn_cls(logits_cls, t_labels)

    ### NER
    # similar to the class RobertaForToken classification in transformers: https://github.com/huggingface/transformers/blob/master/src/transformers/models/roberta/modeling_roberta.py
    t_active_loss = t_input_mask.view(-1) == 1  # either based on attention_mask (includes <CLS>, <SEP> token)
    t_active_logits = logits_ner.view(-1, N_bio_tags)[t_active_loss] # 5 
    t_active_tags = t_bio_tags.view(-1)[t_active_loss]
    t_loss_ner = loss_fn_ner(t_active_logits, t_active_tags)             
    t_loss = t_loss_cls + t_loss_ner
    test_loss.append(t_loss.item())


    ################# PERFORMANCE MEASURES ########################################
    ### CLS
    logits_cls = logits_cls.detach().to('cpu').numpy()
    label_ids = t_labels.to('cpu').numpy()

    pred_flat = np.argmax(logits_cls, axis=1).flatten()
    labels_flat = label_ids.flatten()

    metrics_cls = compute_metrics(pred_flat, labels_flat)
    test_cls_acc.append(metrics_cls["accuracy"])
    test_cls_prec.append(metrics_cls["precision"])
    test_cls_rec.append(metrics_cls["recall"])
    test_cls_f1.append(metrics_cls["f1"])

    #### NER     
    logits_ner = logits_ner.detach().to('cpu').numpy()
    tags_ids = t_bio_tags.to('cpu').numpy()

    # calculate performance measures only on tokens and not subwords or special tokens
    tags_mask = tags_ids != -100 # only get token labels and not labels from subwords or special tokens
    pred = np.argmax(logits_ner, axis=2)[tags_mask] #.flatten() # convert logits to list of predicted labels
    tags = tags_ids[tags_mask]#.flatten()        

    metrics = compute_metrics(pred, tags)
    test_ner_acc.append(metrics["accuracy"])
    test_ner_prec.append(metrics["precision"])
    test_ner_rec.append(metrics["recall"])
    test_ner_f1.append(metrics["f1"])


print(F'\n\tTest Loss: {np.mean(test_loss)}')
print(F'\n\tTest cls acc: {np.mean(test_cls_acc)}')
print(F'\n\tTest cls prec: {np.mean(test_cls_prec)}')
print(F'\n\tTest cls rec: {np.mean(test_cls_rec)}')
print(F'\n\tTest cls f1: {np.mean(test_cls_f1)}')
print(F'\n--\n\tTest ner acc: {np.mean(test_ner_acc)}')
print(F'\n\tTest ner prec: {np.mean(test_ner_prec)}')
print(F'\n\tTest ner rec: {np.mean(test_ner_rec)}')
print(F'\n\tTest ner f1: {np.mean(test_ner_f1)}')


  , "labels" : torch.tensor(self.labels[idx], dtype=torch.long)
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  , "labels" : torch.tensor(self.labels[idx], dtype=torch.long)
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  , "labels" : torch.tensor(self.labels[idx], dtype=torch.long)
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  , "labels" : torch.tensor(self.labels[idx], dtype=torch.long)
  _warn_prf(average, modifier, msg_start, len(result))
  , "labels" : torch.tensor(self.labels[idx], dtype=torch.long)
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  , "labels" : torch.tensor(self.labels[idx], dtype=torch.long)
  _warn_prf(average, modifier, msg_start, len(result))
  , "labels" : torch.tensor(self.labels[idx], dtype=torch.long)
10


	Test Loss: 0.9592961147427559

	Test cls acc: 0.890625

	Test cls prec: 0.6953125

	Test cls rec: 0.75

	Test cls f1: 0.7183150183150183

--
	Test ner acc: 0.955632641696913

	Test ner prec: 0.44417688174184866

	Test ner rec: 0.45416666666666666

	Test ner f1: 0.44899543916206364





### bio tags back to tokens

In [18]:
# take last batch of test set:
t_input_ids, t_input_mask, t_token_type_ids, t_labels, t_bio_tags = batch 

for i in range(len(batch)):
    tags_mask = t_bio_tags[i].to("cpu").numpy() != -100 # only get token labels and not labels from subwords or special tokens
    pred = np.argmax(logits_ner[i], axis=1)[tags_mask]
    true_tags = t_bio_tags[i][tags_mask].to("cpu").numpy()    
    

    tokens = tokenizer.convert_ids_to_tokens(t_input_ids[i])

    print("\n\nPadded Sentence:")
    print(tokens)
    print("true labels:")
    print(t_bio_tags[i])
    for token, true_label, pred in zip(np.array(tokens)[tags_mask], true_tags, pred):
        print(token, "\t\ttrue:", true_label, "  pred:", pred)

    
    break




Padded Sentence:
['<s>', 'Happens', 'more', 'often', ',', 'only', 'at', 'such', 'an', 'insane', 'crowded', 'place', 'as', 'CS', 'one', 'is', 'likely', 'to', 'hit', 'some', '.', '</s>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>']
true labels:
tensor([-100,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0, -100, -100, -100,
        -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
        -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
        -100, -100, -100, -100, -100, -100])
Happens 		true: 0   pred: 0
more 		true: 0   pred: 0
often 		true: 0   pred: 0
, 		true: 0   pred: 0
only 		true: 0   pred: 0
at 		true: 0   pred: 

### Save model

In [None]:
torch.save(model.state_dict(), "finetuned-NER-35-epochs.pth")

### Load model locally

In [None]:
device = torch.device("cuda", if torch.cuda.is_available() else "cpu")
model = CausalityBERT()
model.load_state_dict(torch.load("finetuned-35-epochs.pth"))
model.to(device)
model.eval()

### Small example

In [41]:
output_seq, output_cls = model.bert(input_ids, attention_mask = attention_mask, token_type_ids=token_type_ids, return_dict=False) # if output 1 is our cls token
print(output_seq.shape)
print(output_cls.shape)

torch.Size([2, 29, 768])
torch.Size([2, 768])


In [None]:
QUESTIONS:
- Do we only update parameters of task-specific layer? Or the whole BERT model?
In a multitask setting we have to update all parameters, otherwise
they don't benefit ?