In [1]:
import pandas as pd
import numpy as np
import spacy 
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, matthews_corrcoef
from transformers import BertForSequenceClassification, AutoTokenizer
from transformers import AdamW, get_linear_schedule_with_warmup
from tqdm import tqdm, trange
import random
import os
import torch.nn.functional as F
import torch
from torch.nn import CrossEntropyLoss
from torch.utils.data import DataLoader
import transformers
from tqdm import tqdm, trange
from utils import normalizeTweet, split_into_sentences, bio_tagging, create_training_data



#data = pd.read_excel("/home/adrian/workspace/causality/Causal-associations-diabetes-twitter/data/Causality + hypoglycemia.xlsx", sheet_name=">5000_samples_")
data = pd.read_excel("/home/adrian/Downloads/Causality + hypoglycemia.xlsx", sheet_name=">5000_samples_")
print("Total count:", data.shape[0])
data = data[data["Causal association"].notnull()]
data = data[["full_text", "Intent", "Cause", "Effect", "Causal association"]]
print("Labeled count:", data.shape[0])

data.head()

Total count: 5434
Labeled count: 5000


Unnamed: 0,full_text,Intent,Cause,Effect,Causal association
0,"tonight , I learned my older girl will back he...",,,,0.0
1,USER USER I knew diabetes and fibromyalgia wer...,joke,,,0.0
2,⬇ ️ ⬇ ️ ⬇ ️ THIS ⬇ ️ ⬇ ️ ⬇ ️ My wife has type ...,mS,,,0.0
3,USER Cheers ! Have one for this diabetic too !,mS,,,0.0
4,USER Additionally the medicines are being char...,,medicines are being charged at MRP,costing much higher,1.0


## Add BIO tags

In [2]:
data["tokenized"] = data["full_text"].map(lambda tweet: normalizeTweet(tweet).split(" "))
data["bio_tags"] = data.apply(lambda row: bio_tagging(row["full_text"],row["Cause"], row["Effect"]), axis=1)
data.head(n=20)

Unnamed: 0,full_text,Intent,Cause,Effect,Causal association,tokenized,bio_tags
0,"tonight , I learned my older girl will back he...",,,,0.0,"[tonight, ,, I, learned, my, older, girl, will...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
1,USER USER I knew diabetes and fibromyalgia wer...,joke,,,0.0,"[USER, USER, I, knew, diabetes, and, fibromyal...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
2,⬇ ️ ⬇ ️ ⬇ ️ THIS ⬇ ️ ⬇ ️ ⬇ ️ My wife has type ...,mS,,,0.0,"[:down_arrow:, :down_arrow:, :down_arrow:, THI...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
3,USER Cheers ! Have one for this diabetic too !,mS,,,0.0,"[USER, Cheers, !, Have, one, for, this, diabet...","[O, O, O, O, O, O, O, O, O, O]"
4,USER Additionally the medicines are being char...,,medicines are being charged at MRP,costing much higher,1.0,"[USER, Additionally, the, medicines, are, bein...","[O, O, O, B-C, I-C, I-C, I-C, I-C, I-C, O, O, ..."
5,USER USER We have those days Esp . if it inter...,msS,diabetic,hate,1.0,"[USER, USER, We, have, those, days, Esp, ., if...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
6,Why all of a sudden are people hungry and vuln...,q,,,0.0,"[Why, all, of, a, sudden, are, people, hungry,...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
7,"i got lime for my glucose test , wasn't that b...",,glucose test,nauseous,1.0,"[i, got, lime, for, my, glucose, test, ,, was,...","[O, O, O, O, O, B-C, I-C, O, O, O, O, O, O, O,..."
8,This stickur of Unkel Funny iz ware i am shave...,,,,0.0,"[This, stickur, of, Unkel, Funny, iz, ware, i,...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
9,For the second time in my life I gave myself i...,mS,,,0.0,"[For, the, second, time, in, my, life, I, gave...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."


## Split all tweets into sentences => new dataframe with more rows

In [3]:
def get_start_end_index_of_sentence_in_tweet(tweet, sentence):
    """ 
    The sentence tokens are included in the tweet tokens.
    Return the start end end indices of the sentence tokens in the tweet tokens

    """

    sentence_start_word = sentence[0]
    start_indices = [i for i, x in enumerate(tweet) if x == sentence_start_word] # find all indices of the start word of the sentence 
    try:
        for start_index in start_indices:
            isTrueStartIndex = all([tweet[start_index+i] == sentence[i] for i in range(len(sentence))])
            #print("start_index:", start_index, "isTrueStartIndex:", isTrueStartIndex)
            if isTrueStartIndex:
                return start_index, start_index + len(sentence) 
    except:
        print("ERROR: StartIndex should have been found for sentence:")
        print("tweet:")
        print(tweet)
        print("sentence:")
        print(sentence)
    return -1, -2 # should not be returned


def split_tweets_to_sentences(data):
    """ 
        Splits tweets into sentences and associates the appropriate intent, causes, effects and causal association
        to each sentence.
        
        Parameters:
        - min_words_in_sentences: Minimal number of words in a sentence such that the sentence is kept. 
                                  Assumption: A sentence with too few words does not have enough information
                              
                              
                              
        Ex.:
        full_text                              | Intent | Cause | Effect | Causal association | ...
        --------------------------------------------------------------------------------------------
        what? type 1 causes insulin dependence | q;msS  | type 1|insulin dependence | 1       | ...  
        
        New dataframe returned: 
        full_text                              | Intent | Cause | Effect | Causal association | ...
        --------------------------------------------------------------------------------------------
        what?                                  |   q    |       |        |       0            | ...
        type 1 causes insulin dependence       |        | type 1| insulin dependence | 1       | ...  
    """

    newDF = pd.DataFrame(columns=["sentence", "Intent", "Cause", "Effect", "Causal association", "tokenized", "bio_tags"])
    
    for i,row in data.iterrows():
        causes = row["Cause"]
        effects = row["Effect"]
        sentences = split_into_sentences(normalizeTweet(row["full_text"]))

        # single sentence in tweet
        if len(sentences) == 1:
            singleSentenceIntent = ""
            if isinstance(row["Intent"], str):
                if len(row["Intent"].split(";")) > 1:
                    singleSentenceIntent = row["Intent"].strip().replace(";msS", "").replace("msS;", "").replace(";mS", "").replace("mS;", "")
                else:
                    if row["Intent"] == "mS" or row["Intent"] == "msS":
                        singleSentenceIntent = ""
                    else:
                        singleSentenceIntent = row["Intent"].strip()
                    
            newDF=newDF.append(pd.Series({"sentence": sentences[0] # only one sentence
                         , "Intent": singleSentenceIntent
                         , "Cause" : row["Cause"]
                         , "Effect": row["Effect"]
                         , "Causal association" : row["Causal association"]
                         , "tokenized": row["tokenized"]
                         , "bio_tags": row["bio_tags"]}), ignore_index=True)
        
        # tweet has several sentences
        else: 
            intents = str(row["Intent"]).strip().split(";")
            
            for sentence in sentences:
                sent_tokenized = sentence.split(" ")
                
                causeInSentence = np.nan if not isinstance(causes, str) or not any([cause in sentence for cause in causes.split(";")]) else ";".join([cause for cause in causes.split(";") if cause in sentence])
                effectInSentence = np.nan if not isinstance(effects, str) or not any([effect in sentence for effect in effects.split(";")]) else ";".join([effect for effect in effects.split(";") if effect in sentence])
                causalAssociationInSentence = 1 if isinstance(causeInSentence, str) and isinstance(effectInSentence, str) else 0
                
                startIndex, endIndex = get_start_end_index_of_sentence_in_tweet(row["tokenized"], sent_tokenized)
                sentence_tokenized = row["tokenized"][startIndex:endIndex]
                sentence_bio_tags = row["bio_tags"][startIndex:endIndex]
                
                if "q" in intents and sentence[-1] == "?": # if current sentence is question
                    newDF=newDF.append(pd.Series({"sentence": sentence, "Intent": "q", "Cause" : causeInSentence
                                                , "Effect": effectInSentence, "Causal association" : causalAssociationInSentence
                                                , "tokenized": sentence_tokenized, "bio_tags": sentence_bio_tags}), ignore_index=True)                    
                elif "joke" in intents: # all sentences with "joke" in tweet keep the intent "joke"
                    newDF=newDF.append(pd.Series({"sentence": sentence, "Intent": "joke", "Cause" : causeInSentence
                                                , "Effect": effectInSentence, "Causal association" : causalAssociationInSentence
                                                , "tokenized": sentence_tokenized, "bio_tags": sentence_bio_tags}), ignore_index=True)   
                elif "neg" in intents: # all sentences with "neg" in tweet keep intent "neg"
                    newDF=newDF.append(pd.Series({"sentence": sentence, "Intent": "neg", "Cause" : causeInSentence
                                                , "Effect": effectInSentence, "Causal association" : causalAssociationInSentence
                                                , "tokenized": sentence_tokenized, "bio_tags": sentence_bio_tags}), ignore_index=True)               
                elif isinstance(causeInSentence, str) and isinstance(effectInSentence, str): # cause effect sentence
                    causalIntent = ""
                    if len(causeInSentence.split(";")) > 1:
                        causalIntent = "mC"
                        if len(effectInSentence.split(";")) > 1:
                            causalIntent = "mC;mE"
                    elif len(effectInSentence.split(";")) > 1:
                        causalIntent = "mE"
                    newDF=newDF.append(pd.Series({"sentence": sentence, "Intent": causalIntent, "Cause" : causeInSentence
                                                , "Effect": effectInSentence, "Causal association" : causalAssociationInSentence
                                                , "tokenized": sentence_tokenized, "bio_tags": sentence_bio_tags}), ignore_index=True)                                  
                else:
                    nonCausalIntent = ""
                    if isinstance(causeInSentence, str): # only cause is given
                        if len(causeInSentence.split(";")) > 1:
                            nonCausalIntent = "mC"
                    elif isinstance(effectInSentence, str): # only effect is given
                        if len(effectInSentence.split(";")) > 1:
                            nonCausalIntent = "mE"
                    newDF=newDF.append(pd.Series({"sentence": sentence, "Intent": nonCausalIntent, "Cause" : causeInSentence
                                                , "Effect": effectInSentence, "Causal association" : causalAssociationInSentence
                                                , "tokenized": sentence_tokenized, "bio_tags": sentence_bio_tags}), ignore_index=True)

    return newDF
       
# sample: has one example for each possible "Intent" value
#allIntents = data["Intent"].value_counts().keys().tolist()
#sample = data[data["Intent"] == "mS"][0:1]
#for intent in allIntents:
#    sample = sample.append(data[data["Intent"] == intent][1:2])
#print(sample.shape)

#i = 19
#test = sample[i:i+1]
#dataSentences = split_tweets_to_sentences(test)
#dataSentences.head(30)
#test.head()

print("N tweets:", data.shape[0])
dataSentences = split_tweets_to_sentences(data)
print("N sentences:", dataSentences.shape[0])
dataSentences.head()

N tweets: 5000
N sentences: 11784


Unnamed: 0,sentence,Intent,Cause,Effect,Causal association,tokenized,bio_tags
0,"tonight , I learned my older girl will back he...",,,,0,"[tonight, ,, I, learned, my, older, girl, will...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O]"
1,Fiercely .,,,,0,"[Fiercely, .]","[O, O]"
2,#impressive #bigsister #type1 #type1times2,,,,0,"[#impressive, #bigsister, #type1, #type1times2]","[O, O, O, O]"
3,USER USER I knew diabetes and fibromyalgia wer...,joke,,,0,"[USER, USER, I, knew, diabetes, and, fibromyal...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O]"
4,:face_with_rolling_eyes:,joke,,,0,[:face_with_rolling_eyes:],[O]


### Filter out negation, jokes, questions and sentences with a minimal token length of 3

In [7]:
print("N sentences before filtering: ", dataSentences.shape[0])
dataSentFiltered = dataSentences[~dataSentences["Intent"].str.contains("neg|joke|q")] # remove sentences with joke, q, neg
dataSentFiltered = dataSentFiltered[dataSentFiltered["tokenized"].map(len) >= 3] # only keep sentences with at least 3 words
print("N sentences after filtering: ", dataSentFiltered.shape[0])
dataSentFiltered.head()


N sentences before filtering:  11784
N sentences after filtering:  8835


Unnamed: 0,sentence,Intent,Cause,Effect,Causal association,tokenized,bio_tags
0,"tonight , I learned my older girl will back he...",,,,0,"[tonight, ,, I, learned, my, older, girl, will...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O]"
2,#impressive #bigsister #type1 #type1times2,,,,0,"[#impressive, #bigsister, #type1, #type1times2]","[O, O, O, O]"
5,:down_arrow: :down_arrow: :down_arrow: THIS :d...,,,,0,"[:down_arrow:, :down_arrow:, :down_arrow:, THI...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O]"
6,I 'm a trans woman .,,,,0,"[I, 'm, a, trans, woman, .]","[O, O, O, O, O, O]"
7,"Both of us could use a world where "" brave and...",,,,0,"[Both, of, us, could, use, a, world, where, "",...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."


In [11]:
dataSentFiltered["Intent"].value_counts()

         8705
mE         72
mC         47
mC;mE      10
mE;mC       1
Name: Intent, dtype: int64

### Only work on cause-effect tweets

In [12]:
dataSentFiltered["Causal association"].value_counts()

0.0    7799
1.0    1036
Name: Causal association, dtype: int64

In [15]:
# only take sentences with cause and effect
trainingData = dataSentFiltered[dataSentFiltered["Causal association"] == 1]
trainingData.shape

(1036, 7)

### Create training, validation, test sets

In [18]:
trainingDataSample = trainingData#.sample(n=200)   # VIVEK: DELETE TAKING SAMPLE. THIS WAS ONLY FOR TESTING
train = trainingDataSample.sample(frac=0.8, random_state=0)
test = trainingDataSample.drop(train.index)
validate = train.sample(frac=0.2, random_state=0)
train = train.drop(validate.index)
print("Train:", train.shape)
print("Validate:", validate.shape)
print("Test:", test.shape)

Train: (128, 7)
Validate: (32, 7)
Test: (40, 7)


In [46]:

# Transform labels + encodings into Pytorch DataSet object (including __len__, __getitem__)
class TweetDataSet(torch.utils.data.Dataset):
    def __init__(self, text, labels, bio_tags, tokenizer):
        self.text = text
        self.labels = labels
        self.tokenizer = tokenizer
        self.bio_tags = bio_tags
        self.tag2id = {label: idx for idx, label in enumerate(["O", "B-C", "I-C", "B-E", "I-E"])}
        self.tag2id[-100] = -100
        self.id2tag = {id:tag for tag,id in self.tag2id.items()}

    def __getitem__(self, idx):
        inputs = self.tokenizer(self.text, padding=True, truncation=True, return_token_type_ids=True)
        ids = inputs["input_ids"]
        mask = inputs["attention_mask"]
        token_type_ids = inputs["token_type_ids"]
        bio_tags_extended = self.extend_tags(self.text[idx], self.bio_tags[idx], ids[idx])
        assert(len(ids[idx]) == len(bio_tags_extended), "token ids and BIO tags lengths do not match!")
        return {
                "input_ids" : torch.tensor(ids[idx], dtype=torch.long)
              , "attention_mask" : torch.tensor(mask[idx], dtype=torch.long)
              , "token_type_ids" : torch.tensor(token_type_ids[idx], dtype=torch.long)
              , "labels" : torch.tensor(self.labels[idx], dtype=torch.long)
              , "bio_tags" : torch.tensor(list(map(lambda bioTags: self.tag2id[bioTags], bio_tags_extended))
, dtype=torch.long)
        }

    def __len__(self):
        return len(self.labels)

    
    def extend_tags(self, tokens_old, tags_old, ids_tokenized_padded):
        """ 
            Each token has a BIO tag label. 
            However BERT's tokenization splits tokens into subwords. How to label those subwords?
            
            Option 1:
            ---------
            
            add the same label to each subword than the first subword. Only replace "B" by "I"
            Ex. 
            #lowbloodsugar => '#low@@', 'blood@@', 'sugar@@'
               "B-C"       =>   "B-C" ,   "I-C"  ,   "I-C"
            
            Option 2 (implemented):      
            ---------
            
            From : https://huggingface.co/transformers/custom_datasets.html#token-classification-with-w-nut-emerging-entities
            A common obstacle with using pre-trained models for token-level classification: many of the tokens in
            the W-NUT corpus are not in DistilBert’s vocabulary. Bert and many models like it use a method called 
            WordPiece Tokenization, meaning that single words are split into multiple tokens such that each token
            is likely to be in the vocabulary. For example, DistilBert’s tokenizer would split the Twitter 
            handle @huggingface into the tokens ['@', 'hugging', '##face']. This is a problem for us because we 
            have exactly one tag per token. If the tokenizer splits a token into multiple sub-tokens, then we will
            end up with a mismatch between our tokens and our labels.

            One way to handle this is to only train on the tag labels for the first subtoken of a split token. 
            We can do this in 🤗 Transformers by setting the labels we wish to ignore to -100. 
            In the example above, if the label for @HuggingFace is 3 (indexing B-corporation), we would set 
            the labels of ['@', 'hugging', '##face'] to [3, -100, -100].
        """
        tags = [-100] # add for start token <CLS>
        for token_old, tag in zip(tokens_old.split(" "), tags_old):
#            print(F"\ntoken_old: {token_old};    tag: {tag}")
            for i, sub_token in enumerate(self.tokenizer.tokenize(token_old)):
                if (i == 0):
                    tags.append(tag)
                else: 
                    tags.append(-100)
           
        tags.append(-100) # 0 for end of sentence token
    
        # append -100 for all padded elements
        padded_elements = ids_tokenized_padded.count(1) # id 1 is <PAD> ; Alternative: where attention_mask == 0 add -100
        tags.extend([-100]*padded_elements)
        
        return tags
        
        
    
tokenizer = AutoTokenizer.from_pretrained("vinai/bertweet-base")

train_dataset = TweetDataSet(train["sentence"].values.tolist()
                           , train["Causal association"].values.tolist()
                           , train["bio_tags"].values.tolist()
                           , tokenizer)
val_dataset = TweetDataSet(validate["sentence"].values.tolist()
                           , validate["Causal association"].values.tolist()
                           , validate["bio_tags"].values.tolist()
                           , tokenizer)
test_dataset = TweetDataSet(test["sentence"].values.tolist()
                           , test["Causal association"].values.tolist()
                           , test["bio_tags"].values.tolist()
                           , tokenizer)
print(len(train_dataset))
print(len(val_dataset))
print(len(test_dataset))

# put data to batches
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
validation_loader = DataLoader(val_dataset, batch_size=8, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=8, shuffle=True)


  assert(len(ids[idx]) == len(bio_tags_extended), "token ids and BIO tags lengths do not match!")
  assert(len(ids[idx]) == len(bio_tags_extended), "token ids and BIO tags lengths do not match!")
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


128
32
40


In [53]:
from transformers import DistilBertForSequenceClassification, Trainer, TrainingArguments
from transformers import AutoModelForSequenceClassification
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def compute_metrics(pred, labels):
    """
        Dataset is unbalanced -> measure weighted metrics
        Calculate metrics for each label, and find their average wieghted by support (Number of true instances for each label)
        This alters 'macro' to account for label imbalance;
        it can result in an F-Score taht is not between precision and recall
    """
    precision, recall, f1, _ = precision_recall_fscore_support(labels, pred, average='macro') # TODO: check weightin
    acc = accuracy_score(labels, pred)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }



class CausalNER(torch.nn.Module):
    """ Model Bert"""
    def __init__(self):
        super(CausalNER, self).__init__()
        self.num_labels = 5 # B-C, I-C, B-E, I-E, O
        self.bert = transformers.BertModel.from_pretrained("vinai/bertweet-base")
        self.dropout = torch.nn.Dropout(0.3)
        self.linear1 = torch.nn.Linear(768, 256)
        self.linear2 = torch.nn.Linear(256, self.num_labels)
        self.softmax = torch.nn.Softmax(-1)
        
    def forward(self, input_ids, attention_mask, token_type_ids):
#        _, output_1 = self.bert(input_ids, attention_mask = attention_mask, token_type_ids=token_type_ids, return_dict=False) # if output 1 is our cls token
        output_seq, _ = self.bert(input_ids, attention_mask = attention_mask, token_type_ids=token_type_ids, return_dict=False) # if output 1 is our cls token
        output_2 = self.dropout(output_seq)
        output_3 = self.linear1(output_2)
        output_4 = self.dropout(output_3)
        output_5 = self.linear2(output_4)
        return output_5


### Model parameters

In [54]:
batchsize_train = 16
lr = 1e-3
adam_eps = 1e-8
epochs = 3
num_warmup_steps = 0
num_training_steps = len(train_loader)*epochs

In [51]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

model = CausalNER()
model.to(device)

# fine-tune only the task-specific parameters -> Vivek? 
for param in model.bert.parameters():
    param.requires_grad = False
    


optim = AdamW(model.parameters(), lr=lr, eps=adam_eps)
# scheduler with a linearly decreasing learning rate from the initial lr set in the optimizer to 0;
# after a warmup period during which it increases linearly from to the initial lr set in the optimizer
scheduler = get_linear_schedule_with_warmup(optim, num_warmup_steps=num_warmup_steps, num_training_steps=num_training_steps) 

loss_fn = CrossEntropyLoss(ignore_index=-100) # ignore subwords/tokens with label -100 


You are using a model of type roberta to instantiate a model of type bert. This is not supported for all configurations of models and can yield errors.
Some weights of the model checkpoint at vinai/bertweet-base were not used when initializing BertModel: ['lm_head.bias', 'roberta.encoder.layer.7.output.dense.weight', 'roberta.encoder.layer.10.attention.output.dense.bias', 'roberta.encoder.layer.2.output.dense.weight', 'roberta.encoder.layer.1.attention.self.key.weight', 'roberta.encoder.layer.6.attention.self.key.weight', 'roberta.encoder.layer.9.attention.self.query.bias', 'roberta.embeddings.token_type_embeddings.weight', 'roberta.encoder.layer.7.attention.output.LayerNorm.bias', 'roberta.encoder.layer.11.output.dense.bias', 'roberta.encoder.layer.7.attention.self.value.weight', 'roberta.encoder.layer.0.intermediate.dense.weight', 'roberta.encoder.layer.9.attention.self.key.weight', 'roberta.encoder.layer.2.attention.output.dense.weight', 'roberta.encoder.layer.4.attention.output.Lay

Some weights of BertModel were not initialized from the model checkpoint at vinai/bertweet-base and are newly initialized: ['encoder.layer.10.attention.output.dense.weight', 'encoder.layer.6.attention.output.dense.bias', 'encoder.layer.3.attention.self.query.bias', 'encoder.layer.3.attention.self.query.weight', 'encoder.layer.8.output.dense.weight', 'encoder.layer.0.attention.self.query.weight', 'encoder.layer.5.intermediate.dense.bias', 'encoder.layer.4.attention.output.LayerNorm.bias', 'encoder.layer.2.attention.output.LayerNorm.bias', 'encoder.layer.5.output.dense.weight', 'encoder.layer.11.output.dense.weight', 'encoder.layer.6.attention.output.LayerNorm.bias', 'encoder.layer.9.attention.output.LayerNorm.weight', 'encoder.layer.0.attention.self.value.weight', 'encoder.layer.1.attention.self.value.weight', 'encoder.layer.1.attention.self.value.bias', 'encoder.layer.7.attention.self.key.bias', 'encoder.layer.7.attention.self.value.weight', 'encoder.layer.7.attention.self.query.weight

### Training

In [55]:
# Store our loss and learning rate for plotting
learning_rate = []

N_bio_tags = 5 # "O", "B-C", "I-C", "B-E", "I-C"
for epoch in trange(1, epochs+1, desc='Epoch'):
    print("<" + "="*22 + F" Epoch {epoch} "+ "="*22 + ">")

    
    ############ training eval metrics ######################
    train_loss = []
    train_acc = []
    train_prec = []
    train_rec = []
    train_f1 = []
    
    #########################################################
    
    
    for batch in tqdm(train_loader):
        optim.zero_grad() # gradients get accumulated by default -> clear previous accumulated gradients
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        token_type_ids = batch["token_type_ids"].to(device)
        labels = batch['labels'].to(device)
        bio_tags = batch['bio_tags'].to(device)
        
        ################################################
        model.train() # set model to training mode
        logits = model(**{"input_ids":input_ids, "attention_mask":attention_mask, "token_type_ids":token_type_ids}) # forward pass

        ################################################ 
        # similar to the class RobertaForToken classification in transformers: https://github.com/huggingface/transformers/blob/master/src/transformers/models/roberta/modeling_roberta.py
        active_loss = attention_mask.view(-1) == 1  # either based on attention_mask (includes <CLS>, <SEP> token)
        active_logits = logits.view(-1, N_bio_tags)[active_loss] # N_bio_tags=5 
        active_tags = bio_tags.view(-1)[active_loss]
        loss = loss_fn(active_logits, active_tags)             
        print("loss:", loss)       ## TODO VIVEK: check loss function calculation
        loss.backward() # backward pass
        optim.step()    # update parameters and take a steup using the computed gradient
        scheduler.step()# update learning rate scheduler
        train_loss.append(loss.item())
            
            
        ################## Training Performance Measures ##########
        logits = logits.detach().to('cpu').numpy()
        tags_ids = bio_tags.to('cpu').numpy()

        # calculate performance measures only on tokens and not subwords or special tokens
        tags_mask = tags_ids != -100 # only get token labels and not labels from subwords or special tokens
        pred = np.argmax(logits, axis=2)[tags_mask] #.flatten() # convert logits to list of predicted labels
        tags = tags_ids[tags_mask]                      
                
        metrics = compute_metrics(pred, tags)
        train_acc.append(metrics["accuracy"])
        train_prec.append(metrics["precision"])
        train_rec.append(metrics["recall"])
        train_f1.append(metrics["f1"])
                          
           
    print(F'\n\tTraining Loss: {np.mean(train_loss)}')
    print(F'\n\tTraining acc: {np.mean(train_acc)}')
    print(F'\n\tTraining prec: {np.mean(train_prec)}')
    print(F'\n\tTraining rec: {np.mean(train_rec)}')
    print(F'\n\tTraining f1: {np.mean(train_f1)}')
                          
                          
    # store the current learning rate
    for param_group in optim.param_groups:
        print("\n\tCurrent Learning rate: ", param_group['lr'])
        learning_rate.append(param_group['lr'])
    

    ############# Validation ################
    
    nb_eval_steps = 0 # Tracking variables
    val_accuracy = []
    val_loss = []
    val_acc = []
    val_prec = []
    val_rec = []
    val_f1 = []

    # Evaluate data for one epoch
    for batch in tqdm(validation_loader):
        batch = tuple(batch[t].to(device) for t in batch)      # batch to GPU
        v_input_ids, v_input_mask, v_token_type_ids, v_labels, v_bio_tags = batch  # unpack inputs from dataloader
        
        with torch.no_grad(): # tell model not to compute or store gradients -> saves memory + speeds up validation
            model.eval() # put model in evaluation mode for validation set
            logits = model(**{"input_ids":v_input_ids, "attention_mask":v_input_mask, "token_type_ids":v_token_type_ids}) # forward pass, calculates logit predictions

        ######################################################
        
        # similar to the class RobertaForToken classification in transformers: https://github.com/huggingface/transformers/blob/master/src/transformers/models/roberta/modeling_roberta.py
        v_active_loss = v_input_mask.view(-1) == 1  # either based on attention_mask (includes <CLS>, <SEP> token)
        v_active_logits = logits.view(-1, N_bio_tags)[v_active_loss] # 5 
        v_active_tags = v_bio_tags.view(-1)[v_active_loss]
        v_loss = loss_fn(v_active_logits, v_active_tags)             
        val_loss.append(v_loss.item())
              
        #########################################################
        logits = logits.detach().to('cpu').numpy()
        tags_ids = v_bio_tags.to('cpu').numpy()

        # calculate performance measures only on tokens and not subwords or special tokens
        tags_mask = tags_ids != -100 # only get token labels and not labels from subwords or special tokens
        pred = np.argmax(logits, axis=2)[tags_mask] #.flatten() # convert logits to list of predicted labels
        tags = tags_ids[tags_mask]#.flatten()        
        
        metrics = compute_metrics(pred, tags)
        val_acc.append(metrics["accuracy"])
        val_prec.append(metrics["precision"])
        val_rec.append(metrics["recall"])
        val_f1.append(metrics["f1"])
                              
        nb_eval_steps += 1
        
    print(F'\n\tValidation Loss: {np.mean(val_loss)}')
    print(F'\n\tValidation acc: {np.mean(val_acc)}')
    print(F'\n\tValidation prec: {np.mean(val_prec)}')
    print(F'\n\tValidation rec: {np.mean(val_rec)}')
    print(F'\n\tValidation f1: {np.mean(val_f1)}')
    


Epoch:   0%|          | 0/3 [00:00<?, ?it/s]
  , "labels" : torch.tensor(self.labels[idx], dtype=torch.long)




  _warn_prf(average, modifier, msg_start, len(result))

 12%|█▎        | 1/8 [00:01<00:10,  1.49s/it][A

loss: tensor(0.8383, grad_fn=<NllLossBackward>)


  , "labels" : torch.tensor(self.labels[idx], dtype=torch.long)
  _warn_prf(average, modifier, msg_start, len(result))

 25%|██▌       | 2/8 [00:03<00:09,  1.55s/it][A

loss: tensor(0.9779, grad_fn=<NllLossBackward>)


  , "labels" : torch.tensor(self.labels[idx], dtype=torch.long)
  _warn_prf(average, modifier, msg_start, len(result))

 38%|███▊      | 3/8 [00:04<00:07,  1.55s/it][A

loss: tensor(1.1006, grad_fn=<NllLossBackward>)


  , "labels" : torch.tensor(self.labels[idx], dtype=torch.long)
  _warn_prf(average, modifier, msg_start, len(result))

 50%|█████     | 4/8 [00:06<00:06,  1.52s/it][A

loss: tensor(1.1350, grad_fn=<NllLossBackward>)


  , "labels" : torch.tensor(self.labels[idx], dtype=torch.long)
  _warn_prf(average, modifier, msg_start, len(result))

 62%|██████▎   | 5/8 [00:07<00:04,  1.53s/it][A

loss: tensor(0.9612, grad_fn=<NllLossBackward>)


  , "labels" : torch.tensor(self.labels[idx], dtype=torch.long)
  _warn_prf(average, modifier, msg_start, len(result))

 75%|███████▌  | 6/8 [00:09<00:03,  1.55s/it][A

loss: tensor(0.9111, grad_fn=<NllLossBackward>)


  , "labels" : torch.tensor(self.labels[idx], dtype=torch.long)
  _warn_prf(average, modifier, msg_start, len(result))

 88%|████████▊ | 7/8 [00:10<00:01,  1.56s/it][A

loss: tensor(1.1876, grad_fn=<NllLossBackward>)


  , "labels" : torch.tensor(self.labels[idx], dtype=torch.long)
  _warn_prf(average, modifier, msg_start, len(result))

100%|██████████| 8/8 [00:12<00:00,  1.54s/it][A

  0%|          | 0/4 [00:00<?, ?it/s][A

loss: tensor(0.7778, grad_fn=<NllLossBackward>)

	Training Loss: 0.9861996546387672

	Training acc: 0.8106001263088918

	Training prec: 0.16212002526177835

	Training rec: 0.2

	Training f1: 0.17902499396597074

	Current Learning rate:  0.000625


  , "labels" : torch.tensor(self.labels[idx], dtype=torch.long)
  _warn_prf(average, modifier, msg_start, len(result))

  , "labels" : torch.tensor(self.labels[idx], dtype=torch.long)
  _warn_prf(average, modifier, msg_start, len(result))

  , "labels" : torch.tensor(self.labels[idx], dtype=torch.long)
  _warn_prf(average, modifier, msg_start, len(result))

  , "labels" : torch.tensor(self.labels[idx], dtype=torch.long)
  _warn_prf(average, modifier, msg_start, len(result))

100%|██████████| 4/4 [00:02<00:00,  1.52it/s][A
Epoch:  33%|███▎      | 1/3 [00:15<00:30, 15.01s/it]
  , "labels" : torch.tensor(self.labels[idx], dtype=torch.long)



	Validation Loss: 0.7920175939798355

	Validation acc: 0.7953049207707834

	Validation prec: 0.15906098415415668

	Validation rec: 0.2

	Validation f1: 0.17697670027160403


  _warn_prf(average, modifier, msg_start, len(result))

 12%|█▎        | 1/8 [00:01<00:10,  1.54s/it][A

loss: tensor(0.7760, grad_fn=<NllLossBackward>)


  , "labels" : torch.tensor(self.labels[idx], dtype=torch.long)
  _warn_prf(average, modifier, msg_start, len(result))

 25%|██▌       | 2/8 [00:03<00:09,  1.50s/it][A

loss: tensor(0.7718, grad_fn=<NllLossBackward>)


  , "labels" : torch.tensor(self.labels[idx], dtype=torch.long)

 38%|███▊      | 3/8 [00:04<00:07,  1.54s/it][A

loss: tensor(0.7309, grad_fn=<NllLossBackward>)


  , "labels" : torch.tensor(self.labels[idx], dtype=torch.long)

 50%|█████     | 4/8 [00:06<00:06,  1.54s/it][A

loss: tensor(0.8183, grad_fn=<NllLossBackward>)


  , "labels" : torch.tensor(self.labels[idx], dtype=torch.long)
  _warn_prf(average, modifier, msg_start, len(result))

 62%|██████▎   | 5/8 [00:07<00:04,  1.53s/it][A

loss: tensor(0.7252, grad_fn=<NllLossBackward>)


  , "labels" : torch.tensor(self.labels[idx], dtype=torch.long)
  _warn_prf(average, modifier, msg_start, len(result))

 75%|███████▌  | 6/8 [00:09<00:03,  1.53s/it][A

loss: tensor(0.8045, grad_fn=<NllLossBackward>)


  , "labels" : torch.tensor(self.labels[idx], dtype=torch.long)
  _warn_prf(average, modifier, msg_start, len(result))

 88%|████████▊ | 7/8 [00:10<00:01,  1.54s/it][A

loss: tensor(0.8654, grad_fn=<NllLossBackward>)


  , "labels" : torch.tensor(self.labels[idx], dtype=torch.long)
  _warn_prf(average, modifier, msg_start, len(result))

100%|██████████| 8/8 [00:12<00:00,  1.53s/it][A

  0%|          | 0/4 [00:00<?, ?it/s][A

loss: tensor(0.8671, grad_fn=<NllLossBackward>)

	Training Loss: 0.7948975712060928

	Training acc: 0.8109252852908607

	Training prec: 0.3002201303231583

	Training rec: 0.2206228476804964

	Training f1: 0.2151295626815598

	Current Learning rate:  0.0002916666666666667


  , "labels" : torch.tensor(self.labels[idx], dtype=torch.long)
  _warn_prf(average, modifier, msg_start, len(result))

  , "labels" : torch.tensor(self.labels[idx], dtype=torch.long)
  _warn_prf(average, modifier, msg_start, len(result))

  , "labels" : torch.tensor(self.labels[idx], dtype=torch.long)
  _warn_prf(average, modifier, msg_start, len(result))

  , "labels" : torch.tensor(self.labels[idx], dtype=torch.long)
  _warn_prf(average, modifier, msg_start, len(result))

100%|██████████| 4/4 [00:02<00:00,  1.62it/s][A
Epoch:  67%|██████▋   | 2/3 [00:29<00:14, 14.86s/it]
  , "labels" : torch.tensor(self.labels[idx], dtype=torch.long)



	Validation Loss: 0.7585615515708923

	Validation acc: 0.7948491849013122

	Validation prec: 0.1589698369802624

	Validation rec: 0.2

	Validation f1: 0.17701825090093454


  _warn_prf(average, modifier, msg_start, len(result))

 12%|█▎        | 1/8 [00:01<00:10,  1.53s/it][A

loss: tensor(0.8501, grad_fn=<NllLossBackward>)


  , "labels" : torch.tensor(self.labels[idx], dtype=torch.long)
  _warn_prf(average, modifier, msg_start, len(result))

 25%|██▌       | 2/8 [00:03<00:09,  1.55s/it][A

loss: tensor(0.8858, grad_fn=<NllLossBackward>)


  , "labels" : torch.tensor(self.labels[idx], dtype=torch.long)
  _warn_prf(average, modifier, msg_start, len(result))

 38%|███▊      | 3/8 [00:04<00:07,  1.54s/it][A

loss: tensor(0.6826, grad_fn=<NllLossBackward>)


  , "labels" : torch.tensor(self.labels[idx], dtype=torch.long)
  _warn_prf(average, modifier, msg_start, len(result))

 50%|█████     | 4/8 [00:06<00:06,  1.52s/it][A

loss: tensor(0.5518, grad_fn=<NllLossBackward>)


  , "labels" : torch.tensor(self.labels[idx], dtype=torch.long)
  _warn_prf(average, modifier, msg_start, len(result))

 62%|██████▎   | 5/8 [00:07<00:04,  1.50s/it][A

loss: tensor(0.8981, grad_fn=<NllLossBackward>)


  , "labels" : torch.tensor(self.labels[idx], dtype=torch.long)
  _warn_prf(average, modifier, msg_start, len(result))

 75%|███████▌  | 6/8 [00:09<00:03,  1.52s/it][A

loss: tensor(0.5713, grad_fn=<NllLossBackward>)


  , "labels" : torch.tensor(self.labels[idx], dtype=torch.long)
  _warn_prf(average, modifier, msg_start, len(result))

 88%|████████▊ | 7/8 [00:10<00:01,  1.52s/it][A

loss: tensor(0.6077, grad_fn=<NllLossBackward>)


  , "labels" : torch.tensor(self.labels[idx], dtype=torch.long)
  _warn_prf(average, modifier, msg_start, len(result))

100%|██████████| 8/8 [00:12<00:00,  1.53s/it][A

  0%|          | 0/4 [00:00<?, ?it/s][A

loss: tensor(0.6349, grad_fn=<NllLossBackward>)

	Training Loss: 0.7102793455123901

	Training acc: 0.8101320199562618

	Training prec: 0.2662504280606497

	Training rec: 0.210345840549647

	Training f1: 0.19761690470600646

	Current Learning rate:  0.0


  , "labels" : torch.tensor(self.labels[idx], dtype=torch.long)
  _warn_prf(average, modifier, msg_start, len(result))

  , "labels" : torch.tensor(self.labels[idx], dtype=torch.long)
  _warn_prf(average, modifier, msg_start, len(result))

  , "labels" : torch.tensor(self.labels[idx], dtype=torch.long)
  _warn_prf(average, modifier, msg_start, len(result))

  , "labels" : torch.tensor(self.labels[idx], dtype=torch.long)
  _warn_prf(average, modifier, msg_start, len(result))

100%|██████████| 4/4 [00:02<00:00,  1.54it/s][A
Epoch: 100%|██████████| 3/3 [00:44<00:00, 14.86s/it]


	Validation Loss: 0.7914430946111679

	Validation acc: 0.7942995787643284

	Validation prec: 0.1588599157528657

	Validation rec: 0.2

	Validation f1: 0.17694625245016293





### Evaluation on the test dataset

In [56]:

############ test eval metrics ######################
nb_test_steps = 0 # Tracking variables
test_loss = []
test_acc = []
test_prec = []
test_rec = []
test_f1 = []

########################################################
for batch in tqdm(test_loader):
    batch = tuple(batch[t].to(device) for t in batch)      # batch to GPU
    t_input_ids, t_input_mask, t_token_type_ids, t_labels, t_bio_tags = batch     # unpack inputs from dataloader

    with torch.no_grad(): # tell model not to compute or store gradients -> saves memory + speeds up validation
        model.eval() # put model in evaluation mode for validation set
        logits = model(**{"input_ids":t_input_ids, "attention_mask":t_input_mask, "token_type_ids":t_token_type_ids}) # forward pass, calculates logit predictions

    ######################################################

    # similar to the class RobertaForToken classification in transformers: https://github.com/huggingface/transformers/blob/master/src/transformers/models/roberta/modeling_roberta.py
    t_active_loss = t_input_mask.view(-1) == 1  # either based on attention_mask (includes <CLS>, <SEP> token)
    t_active_logits = logits.view(-1, N_bio_tags)[t_active_loss] # 5 
    t_active_tags = t_bio_tags.view(-1)[t_active_loss]
    t_loss = loss_fn(t_active_logits, t_active_tags)             
    test_loss.append(t_loss.item())

    #########################################################
    logits = logits.detach().to('cpu').numpy()
    tags_ids = t_bio_tags.to('cpu').numpy()

    # calculate performance measures only on tokens and not subwords or special tokens
    tags_mask = tags_ids != -100 # only get token labels and not labels from subwords or special tokens
    pred = np.argmax(logits, axis=2)[tags_mask] #.flatten() # convert logits to list of predicted labels
    tags = tags_ids[tags_mask]#.flatten()                          

    metrics = compute_metrics(pred, tags)
    test_acc.append(metrics["accuracy"])
    test_prec.append(metrics["precision"])
    test_rec.append(metrics["recall"])
    test_f1.append(metrics["f1"])

    nb_test_steps += 1

print(F'\n\tTest Loss: {np.mean(test_loss)}')
print(F'\n\tTest acc: {np.mean(test_acc)}')
print(F'\n\tTest prec: {np.mean(test_prec)}')
print(F'\n\tTest rec: {np.mean(test_rec)}')
print(F'\n\tTest f1: {np.mean(test_f1)}')


  , "labels" : torch.tensor(self.labels[idx], dtype=torch.long)
  _warn_prf(average, modifier, msg_start, len(result))
  , "labels" : torch.tensor(self.labels[idx], dtype=torch.long)
  _warn_prf(average, modifier, msg_start, len(result))
  , "labels" : torch.tensor(self.labels[idx], dtype=torch.long)
  _warn_prf(average, modifier, msg_start, len(result))
  , "labels" : torch.tensor(self.labels[idx], dtype=torch.long)
  _warn_prf(average, modifier, msg_start, len(result))
  , "labels" : torch.tensor(self.labels[idx], dtype=torch.long)
  _warn_prf(average, modifier, msg_start, len(result))
100%|██████████| 5/5 [00:03<00:00,  1.46it/s]


	Test Loss: 0.7850914478302002

	Test acc: 0.7923388343776085

	Test prec: 0.15846776687552172

	Test rec: 0.2

	Test f1: 0.17676806023077402





### bio tags back to tokens

In [103]:
# take last batch of test set:
t_input_ids, t_input_mask, t_token_type_ids, t_labels, t_bio_tags = batch 

for i in range(len(batch)):
    tags_mask = t_bio_tags[i].to("cpu").numpy() != -100 # only get token labels and not labels from subwords or special tokens
    pred = np.argmax(logits[i], axis=1)[tags_mask]
    true_tags = t_bio_tags[i][tags_mask].to("cpu").numpy()    
    

    tokens = tokenizer.convert_ids_to_tokens(t_input_ids[i])

    print("\n\nPadded Sentence:")
    print(tokens)
    print("true labels:")
    print(t_bio_tags[i])
    for token, true_label, pred in zip(np.array(tokens)[tags_mask], true_tags, pred):
        print(token, "\t\ttrue:", true_label, "  pred:", pred)

    
    break






Padded Sentence:
['<s>', 'Diabetes', 'is', 'the', 'worst', 'fucking', 'thing', 'to', 'have', 'to', 'deal', 'with', 'when', 'you', "'re", 'trying', 'to', 'perform', 'well', 'in', 'anything', '.', '</s>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>']
true labels:
tensor([-100,    1,    0,    0,    3,    4,    4,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0, -100, -100,
        -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
        -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
        -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
        -100])
Diabetes 		true: 1   pred

### Save model

In [None]:
torch.save(model.state_dict(), "finetuned-NER-35-epochs.pth")

### Load model locally

In [None]:
device = torch.device("cuda", if torch.cuda.is_available() else "cpu")
model = CausalityBERT()
model.load_state_dict(torch.load("finetuned-35-epochs.pth"))
model.to(device)
model.eval()