## A model build using TweetBERT to classify tweet as causal or non-causal 

The causal sentence prediction model will be trained in several steps using an active learning approach, where in each step the training dataset will be augmented.
In each step the causal sentence classifier is trained and applied on a subsample of unlabeled tweets to identify tweets with causal elements. Those tweets are then manually labeled for the two tasks: causal sentence prediction and cause-effect identification (NER). The newly labeled data will be added to the training dataset and the causal sentence classifier will be retrained with the augmented dataset to increase performance

In [1]:
import pandas as pd
import numpy as np
import spacy 
from sklearn.model_selection import train_test_split
from transformers import BertForSequenceClassification, AutoTokenizer
from transformers import AdamW, get_linear_schedule_with_warmup
from tqdm import tqdm, trange
import random
import os
import torch.nn.functional as F
import torch
from torch.nn import CrossEntropyLoss
from torch.utils.data import DataLoader, SubsetRandomSampler
import transformers
from tqdm import tqdm, trange
import io
from utils import normalizeTweet, split_into_sentences, EarlyStopping
import matplotlib.pyplot as plt

########################### Check if cuda available ############################
# print("Cuda available: ", torch.cuda.is_available())
# device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')



########################### DATA FILE ###################################
# dataPath = "/home/adrian/workspace/causality/Causal-associations-diabetes-twitter/data/Causality_tweets_data.xlsx"
# dataPath = "data/Causality_tweets_data.xlsx"
dataPath = "data/Causality_tweets_data.xlsx"


########################### MODEL PARAMETERS ############################
active_learning_round = 3 # will change the saved model name 
lr = 1e-3    
adam_eps = 1e-8
epochs = 35
num_warmup_steps = 0
early_patience = 7# how long to wait after last time validation loss improved

train_batch_size = 16
val_batch_size = 16
test_batch_size = 16
test_to_train_ratio = 0.1 # 10% test and 90% train
val_to_train_ratio = 0.2

#metrics_average = "binary" # this will give measure for class_1,i.e., causal class
finetuned_model = "./model-causal-tweet/model_3_finetuned-5-epochs-lr_0.001.pth"

#
# saveModelName = "./model-causal-model/model_1_finetuned-{}-epochs-lr_{}.pth".format(epochs, lr) # it should be epoch so that the name shows at what epoch teh mdel ws saved
#finetuned_model = "./model-causal-model/model_2_finetuned-30-epochs-lr_0.001.pth" # load finetuned model from previous round to continue fine-tuning on new data

#### Checking if thec cuda is available and then select the `gpu`

In [2]:
########################### Check if cuda available ############################
print("Cuda available: ", torch.cuda.is_available())
device = torch.device('cuda:1' if torch.cuda.is_available() else 'cpu')
print("Selected {} for this notebook".format(device))

Cuda available:  False
Selected cpu for this notebook


In [3]:
##### DATA TO LOAD ######

data_round0 = pd.read_excel(dataPath, sheet_name="round0")
data_round0 = data_round0[data_round0["Causal association"].notnull()] # some tweets at the end are not labeled yet
data_round0 = data_round0[["full_text", "Intent", "Cause", "Effect", "Causal association"]]
print("Data round 0 (tweets!):")
print(data_round0["Causal association"].value_counts())
print("-----"*5)


##### additional data labeled through active learning strategy - round 1 ########
data_round1 = pd.read_excel(dataPath, sheet_name="round1")
data_round1 = data_round1[data_round1["Causal association"].notnull()]
data_round1 = data_round1[["sentence", "Intent", "Cause", "Effect", "Causal association"]]
data_round1.rename(columns ={"sentence":"full_text"}, inplace=True) # rename for merge
print("Sentences round 1:")
print(data_round1["Causal association"].value_counts())
print("-----"*5)

##### additional data labeled through active learning strategy - round 2 ########
data_round2 = pd.read_excel(dataPath, sheet_name="round2")
data_round2 = data_round2[data_round2["Causal association"].notnull()]
data_round2 = data_round2[["sentence", "Intent", "Cause", "Effect", "Causal association"]]
data_round2.rename(columns ={"sentence":"full_text"}, inplace=True) # rename for merge
print("sentences round 2:")
print(data_round2["Causal association"].value_counts())
print("-----"*5)
#### merge datasets ######
data_old = data_round0.append(data_round1).append(data_round2)
print("\nAfter merge old data:")
print(data_old["Causal association"].value_counts())
data_old.head()
print("-----"*5)

##### new additional data labeled through active learning strategy - round 3 (model is only retrained with this data) #####################
data_round3 = pd.read_excel(dataPath, sheet_name="round3")
data_round3 = data_round3[data_round3["Causal association"].notnull()]
data_round3 = data_round3[["sentence", "Intent", "Cause", "Effect", "Causal association"]]
data_round3.rename(columns ={"sentence":"full_text"}, inplace=True) # rename for merge
data_new = data_round3
print("sentences round 3:")
print(data_new["Causal association"].value_counts())

print("----"*5)
print(" 0:non causal tweet \n 1: causal tweet.\n \n each tweet may have more than one sentence and we are splitting them and labelling by checking if cause or effect occur in them or not")

Data round 0 (tweets!):
0.0    3710
1.0    1290
Name: Causal association, dtype: int64
-------------------------
Sentences round 1:
0.0    1763
1.0     429
Name: Causal association, dtype: int64
-------------------------
sentences round 2:
0.0    1658
1.0     150
Name: Causal association, dtype: int64
-------------------------

After merge old data:
0.0    7131
1.0    1869
Name: Causal association, dtype: int64
-------------------------
sentences round 3:
0.0    1886
1.0     215
Name: Causal association, dtype: int64
--------------------
 0:non causal tweet 
 1: causal tweet.
 
 each tweet may have more than one sentence and we are splitting them and labelling by checking if cause or effect occur in them or not


### Preprocessing

In [4]:
def get_start_end_index_of_sentence_in_tweet(tweet, sentence):
    """ 
    The sentence tokens are included in the tweet tokens.
    Return the start end end indices of the sentence tokens in the tweet tokens

    """

    sentence_start_word = sentence[0]
    start_indices = [i for i, x in enumerate(tweet) if x == sentence_start_word] # find all indices of the start word of the sentence 
    try:
        for start_index in start_indices:
            isTrueStartIndex = all([tweet[start_index+i] == sentence[i] for i in range(len(sentence))])
            #print("start_index:", start_index, "isTrueStartIndex:", isTrueStartIndex)
            if isTrueStartIndex:
                return start_index, start_index + len(sentence) 
    except:
        print("ERROR: StartIndex should have been found for sentence:")
        print("tweet:")
        print(tweet)
        print("sentence:")
        print(sentence)
    return -1, -2 # should not be returned


def split_tweets_to_sentences(data):
    """ 
        Splits tweets into sentences and associates the appropriate intent, causes, effects and causal association
        to each sentence.
        
        Parameters:
        - min_words_in_sentences: Minimal number of words in a sentence such that the sentence is kept. 
                                  Assumption: A sentence with too few words does not have enough information
                              
                              
                              
        Ex.:
        full_text                              | Intent | Cause | Effect | Causal association | ...
        --------------------------------------------------------------------------------------------
        what? type 1 causes insulin dependence | q;msS  | type 1|insulin dependence | 1       | ...  
        
        New dataframe returned: 
        full_text                              | Intent | Cause | Effect | Causal association | ...
        --------------------------------------------------------------------------------------------
        what?                                  |   q    |       |        |       0            | ...
        type 1 causes insulin dependence       |        | type 1| insulin dependence | 1       | ...  
    """

    newDF = pd.DataFrame(columns=["sentence", "Intent", "Cause", "Effect", "Causal association", "tokenized"])
    
    for i,row in data.iterrows():
        causes = row["Cause"]
        effects = row["Effect"]
        sentences = split_into_sentences(normalizeTweet(row["full_text"]))
        
        # single sentence in tweet
        if len(sentences) == 1:
            singleSentenceIntent = ""
            if isinstance(row["Intent"], str):
                if len(row["Intent"].split(";")) > 1:
                    singleSentenceIntent = row["Intent"].strip().replace(";msS", "").replace("msS;", "").replace(";mS", "").replace("mS;", "")
                else:
                    if row["Intent"] == "mS" or row["Intent"] == "msS":
                        singleSentenceIntent = ""
                    else:
                        singleSentenceIntent = row["Intent"].strip()
                    
            newDF=newDF.append(pd.Series({"sentence": sentences[0] # only one sentence
                         , "Intent": singleSentenceIntent
                         , "Cause" : row["Cause"]
                         , "Effect": row["Effect"]
                         , "Causal association" : row["Causal association"]
                         , "tokenized": row["tokenized"]}), ignore_index=True)
        
        # tweet has several sentences
        else: 
            intents = str(row["Intent"]).strip().split(";")
            for sentence in sentences:
                sent_tokenized = sentence.split(" ")
                causeInSentence = np.nan if not isinstance(causes, str) or not any([cause in sentence for cause in causes.split(";")]) else ";".join([cause for cause in causes.split(";") if cause in sentence])
                effectInSentence = np.nan if not isinstance(effects, str) or not any([effect in sentence for effect in effects.split(";")]) else ";".join([effect for effect in effects.split(";") if effect in sentence])
                causalAssociationInSentence = 1 if isinstance(causeInSentence, str) and isinstance(effectInSentence, str) else 0
                startIndex, endIndex = get_start_end_index_of_sentence_in_tweet(row["tokenized"], sent_tokenized)
                sentence_tokenized = row["tokenized"][startIndex:endIndex]
                
                if "q" in intents and sentence[-1] == "?": # if current sentence is question
                    newDF=newDF.append(pd.Series({"sentence": sentence, "Intent": "q", "Cause" : causeInSentence
                                                , "Effect": effectInSentence, "Causal association" : causalAssociationInSentence
                                                , "tokenized": sentence_tokenized}), ignore_index=True)                    
                elif "joke" in intents: # all sentences with "joke" in tweet keep the intent "joke"
                    newDF=newDF.append(pd.Series({"sentence": sentence, "Intent": "joke", "Cause" : causeInSentence
                                                , "Effect": effectInSentence, "Causal association" : causalAssociationInSentence
                                                , "tokenized": sentence_tokenized}), ignore_index=True)   
                elif "neg" in intents: # all sentences with "neg" in tweet keep intent "neg"
                    newDF=newDF.append(pd.Series({"sentence": sentence, "Intent": "neg", "Cause" : causeInSentence
                                                , "Effect": effectInSentence, "Causal association" : causalAssociationInSentence
                                                , "tokenized": sentence_tokenized}), ignore_index=True)               
                elif isinstance(causeInSentence, str) and isinstance(effectInSentence, str): # cause effect sentence
                    causalIntent = ""
                    if len(causeInSentence.split(";")) > 1:
                        causalIntent = "mC"
                        if len(effectInSentence.split(";")) > 1:
                            causalIntent = "mC;mE"
                    elif len(effectInSentence.split(";")) > 1:
                        causalIntent = "mE"
                    newDF=newDF.append(pd.Series({"sentence": sentence, "Intent": causalIntent, "Cause" : causeInSentence
                                                , "Effect": effectInSentence, "Causal association" : causalAssociationInSentence
                                                , "tokenized": sentence_tokenized}), ignore_index=True)                                  
                else:
                    nonCausalIntent = ""
                    if isinstance(causeInSentence, str): # only cause is given
                        if len(causeInSentence.split(";")) > 1:
                            nonCausalIntent = "mC"
                    elif isinstance(effectInSentence, str): # only effect is given
                        if len(effectInSentence.split(";")) > 1:
                            nonCausalIntent = "mE"
                    newDF=newDF.append(pd.Series({"sentence": sentence, "Intent": nonCausalIntent, "Cause" : causeInSentence
                                                , "Effect": effectInSentence, "Causal association" : causalAssociationInSentence
                                                , "tokenized": sentence_tokenized}), ignore_index=True)

    return newDF

In [5]:
### Split tweets into sentences (train classifier on sentence level) ####

print("Count of  tweets old:", data_old.shape[0])
print("Count of  tweets new:", data_new.shape[0])

data_old["tokenized"] = data_old["full_text"].map(lambda tweet: normalizeTweet(tweet).split(" "))
dataSentences_old = split_tweets_to_sentences(data_old)
print("Count of sentences old:", dataSentences_old.shape[0])


data_new["tokenized"] = data_new["full_text"].map(lambda tweet: normalizeTweet(tweet).split(" "))
dataSentences_new = split_tweets_to_sentences(data_new)
print("Count of sentences new:", dataSentences_new.shape[0])
dataSentences_new.head()

Count of  tweets old: 9000
Count of  tweets new: 2101
Count of sentences old: 15756
Count of sentences new: 2101


Unnamed: 0,sentence,Intent,Cause,Effect,Causal association,tokenized
0,I got super sick the last day I was in Mexico ...,,,,0.0,"[I, got, super, sick, the, last, day, I, was, ..."
1,"@USER @USER I know this is just twitter , but ...",mC;mE,type 1 diabetic;low blood sugar,die;sad,1.0,"[@USER, @USER, I, know, this, is, just, twitte..."
2,While I was lifting today spotify reminded me ...,,,,0.0,"[While, I, was, lifting, today, spotify, remin..."
3,@USER Many already send for private finger pri...,mC,finger prick tests;medicine,alive,1.0,"[@USER, Many, already, send, for, private, fin..."
4,"@USER @USER Speaking of testimony , I beat typ...",,Daniel diet,beat type 2 diabetes,1.0,"[@USER, @USER, Speaking, of, testimony, ,, I, ..."


In [6]:
########## Remove sentences with joke, question, negation and keep only sentences with more than 3 tokens #####

print("Count ofsentences old before filtering: ", dataSentences_old.shape[0])
dataSentFiltered_old = dataSentences_old[~dataSentences_old["Intent"].str.contains("neg|joke|q")] 
dataSentFiltered_old = dataSentFiltered_old[dataSentFiltered_old["tokenized"].map(len) > 3] 
print("Count of sentences old after filtering: ", dataSentFiltered_old.shape[0])
print("\n")
print("Distribution old:")
print(dataSentFiltered_old["Causal association"].value_counts())
print("----"*5)

print("Count of sentences new before filtering: ", dataSentences_new.shape[0])
dataSentFiltered_new = dataSentences_new[~dataSentences_new["Intent"].str.contains("neg|joke|q")] 
dataSentFiltered_new = dataSentFiltered_new[dataSentFiltered_new["tokenized"].map(len) > 3] 
print("Count of sentences new after filtering: ", dataSentFiltered_new.shape[0])
print("Distribution new:")
print("\n")
print(dataSentFiltered_new["Causal association"].value_counts())
dataSentFiltered_new.head()



Count ofsentences old before filtering:  15756
Count of sentences old after filtering:  12229


Distribution old:
0.0    10625
1.0     1604
Name: Causal association, dtype: int64
--------------------
Count of sentences new before filtering:  2101
Count of sentences new after filtering:  2056
Distribution new:


0.0    1855
1.0     201
Name: Causal association, dtype: int64


Unnamed: 0,sentence,Intent,Cause,Effect,Causal association,tokenized
0,I got super sick the last day I was in Mexico ...,,,,0.0,"[I, got, super, sick, the, last, day, I, was, ..."
1,"@USER @USER I know this is just twitter , but ...",mC;mE,type 1 diabetic;low blood sugar,die;sad,1.0,"[@USER, @USER, I, know, this, is, just, twitte..."
2,While I was lifting today spotify reminded me ...,,,,0.0,"[While, I, was, lifting, today, spotify, remin..."
3,@USER Many already send for private finger pri...,mC,finger prick tests;medicine,alive,1.0,"[@USER, Many, already, send, for, private, fin..."
4,"@USER @USER Speaking of testimony , I beat typ...",,Daniel diet,beat type 2 diabetes,1.0,"[@USER, @USER, Speaking, of, testimony, ,, I, ..."


### Data split and calculate class weight

In [9]:
####################### Stratified splits ####################


## ONLY FOR TESTING ---------------
#dataSentFiltered = dataSentFiltered[0:500] # for testing

text_old = dataSentFiltered_old["sentence"].map(normalizeTweet).values.tolist()
labels_old = dataSentFiltered_old["Causal association"].values.tolist()

# first split the data into training and testing label in the ratio of 90:10
train_texts_old, test_texts, train_labels_old, test_labels = train_test_split(text_old, labels_old, test_size=test_to_train_ratio, stratify=labels_old, random_state=9)
train_texts_old, val_texts, train_labels_old, val_labels = train_test_split(train_texts_old, train_labels_old, test_size=val_to_train_ratio, stratify=train_labels_old, random_state=9)

# Redefine training set:Take only new labeled tweets from round 3 for training; test and val set come from old data
train_texts = dataSentFiltered_new["sentence"].map(normalizeTweet).values.tolist()
train_labels = dataSentFiltered_new["Causal association"].values.tolist()

labels = train_labels + val_labels +test_labels # combining new train with validataion and test data from previous rounds


data_count_info = pd.Series(labels).value_counts(normalize=True)
train_count_info = pd.Series(train_labels).value_counts(normalize=True)
test_count_info = pd.Series(test_labels).value_counts(normalize=True)

# for class-imbalanced dataset, the class weight for a ith class
# to be specified for balancing in the loss function is given by:
# weight[i] = num_samples / (num_classes * num_samples[i])
# since train_count_info obtained above has fraction of samples
# for ith class, hence the corresponding weight calculation is:
class_weight = (1/train_count_info)/len(train_count_info)

print("All: Count = {}, % of 0 = {}, % of 1 = {}".format(len(labels), *data_count_info.round(4).to_list()))
print("\n")
print("Train: Count = {}, % of 0 = {}, % of 1 = {}".format(len(train_labels), *train_count_info.round(4).to_list()))
print("\n")
print("Test: Count = {}, % of 0 = {}, % of 1 = {}".format(len(test_labels), *test_count_info.round(4).to_list()))
print("\n")
print("Balancing class wts: for 0 = {}, for 1 = {}".format(*class_weight.round(4).to_list()))
print("\n")

All: Count = 5481, % of 0 = 0.8814, % of 1 = 0.1186


Train: Count = 2056, % of 0 = 0.9022, % of 1 = 0.0978


Test: Count = 1223, % of 0 = 0.8692, % of 1 = 0.1308


Balancing class wts: for 0 = 0.5542, for 1 = 5.1144




#### Defining our DataLoader 

In [10]:
class TweetDataSet(torch.utils.data.Dataset):
    def __init__(self, text, labels, tokenizer):
        self.text = text
        self.labels = labels
        self.tokenizer = tokenizer

    def __getitem__(self, idx):
        inputs = self.tokenizer(self.text, padding=True, truncation=True, return_token_type_ids=True)
        ids = inputs["input_ids"]
        mask = inputs["attention_mask"]
        token_type_ids = inputs["token_type_ids"]
        return {
                "input_ids" : torch.tensor(ids[idx], dtype=torch.long)
              , "attention_mask" : torch.tensor(mask[idx], dtype=torch.long)
              , "token_type_ids" : torch.tensor(token_type_ids[idx], dtype=torch.long)
              , "labels" : torch.tensor(self.labels[idx], dtype=torch.long)
        }      

    def __len__(self):
        return len(self.labels)

    
tokenizer = AutoTokenizer.from_pretrained("vinai/bertweet-base")

train_dataset = TweetDataSet(train_texts, train_labels, tokenizer)
test_dataset = TweetDataSet(test_texts, test_labels, tokenizer)
print(len(train_dataset))
print(len(test_dataset))

# During training: In each epoch one part of the training data will be used as validation set
train_loader = DataLoader(train_dataset, batch_size=train_batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=test_batch_size, shuffle=True)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


2056
1223


### Evaluation Metrics

In [20]:
## we are measuring weighted metrics - as our dataset is imbalanced 
# Calculate metrics for each label, and find their average weighted by support
# (the number of true instances for each label). 
# This alters ‘macro’ to account for label imbalance; 
# it can result in an F-score that is not between precision and recall.


from sklearn.metrics import accuracy_score, precision_recall_fscore_support, matthews_corrcoef

def compute_metrics(pred, labels, average="macro"):
#     precision, recall, f1, _ = precision_recall_fscore_support(labels, pred, average='weighted')
    precision, recall, f1, _ = precision_recall_fscore_support(labels,pred, average=average)
    acc = accuracy_score(labels, pred)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }



### Model defintion

In [12]:


class CausalityBERT(torch.nn.Module):
    """ Model Bert"""
    def __init__(self):
        super(CausalityBERT, self).__init__()
        self.num_labels = 2
        self.bert = transformers.BertModel.from_pretrained("vinai/bertweet-base")
        self.dropout = torch.nn.Dropout(0.3)
        self.linear1 = torch.nn.Linear(768, 256)
        self.linear2 = torch.nn.Linear(256, self.num_labels)
        
    def forward(self, input_ids, attention_mask, token_type_ids):
        _, output_1 = self.bert(input_ids, attention_mask = attention_mask, token_type_ids=token_type_ids, return_dict=False) # if output 1 is our cls token        
        output_2 = self.dropout(output_1)
        output_3 = self.linear1(output_2)  
        output_4 = self.dropout(output_3)
        output_5 = self.linear2(output_4)
        return output_5

### Moving the model to  GPU and defining training parameters: 
    * num_training_steps 
    * optimizers 
    * scheduler 
    * loss funciton (weighted) 

In [13]:
# device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model = CausalityBERT() ## just load the model trained in previous round here 
model.load_state_dict(torch.load(finetuned_model, map_location='cpu')) # load model trained in previous round
model.to(device)


You are using a model of type roberta to instantiate a model of type bert. This is not supported for all configurations of models and can yield errors.
Some weights of the model checkpoint at vinai/bertweet-base were not used when initializing BertModel: ['roberta.encoder.layer.11.output.LayerNorm.weight', 'roberta.encoder.layer.2.attention.output.dense.weight', 'roberta.encoder.layer.7.attention.self.key.bias', 'roberta.encoder.layer.1.attention.self.query.weight', 'roberta.encoder.layer.3.attention.self.key.bias', 'roberta.encoder.layer.3.intermediate.dense.weight', 'roberta.encoder.layer.9.attention.output.dense.bias', 'roberta.encoder.layer.10.attention.self.value.bias', 'roberta.encoder.layer.2.output.dense.weight', 'roberta.encoder.layer.1.attention.self.key.bias', 'roberta.encoder.layer.5.attention.self.value.weight', 'roberta.encoder.layer.0.attention.output.LayerNorm.bias', 'roberta.encoder.layer.4.attention.output.LayerNorm.bias', 'roberta.encoder.layer.3.output.dense.bias', 

Some weights of BertModel were not initialized from the model checkpoint at vinai/bertweet-base and are newly initialized: ['encoder.layer.11.attention.self.key.weight', 'encoder.layer.1.attention.output.LayerNorm.weight', 'encoder.layer.1.output.dense.bias', 'encoder.layer.9.output.LayerNorm.weight', 'encoder.layer.9.attention.self.value.bias', 'encoder.layer.6.attention.output.LayerNorm.bias', 'encoder.layer.10.intermediate.dense.bias', 'encoder.layer.11.attention.output.dense.weight', 'encoder.layer.3.intermediate.dense.bias', 'encoder.layer.10.output.LayerNorm.weight', 'encoder.layer.8.attention.self.query.weight', 'encoder.layer.3.output.dense.weight', 'encoder.layer.6.attention.self.value.bias', 'encoder.layer.8.attention.output.dense.bias', 'encoder.layer.5.output.LayerNorm.bias', 'encoder.layer.0.attention.self.key.bias', 'encoder.layer.9.attention.output.LayerNorm.weight', 'encoder.layer.11.attention.output.dense.bias', 'encoder.layer.8.attention.self.value.bias', 'encoder.lay

CausalityBERT(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(64001, 768, padding_idx=1)
      (position_embeddings): Embedding(130, 768)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True

## evaluation on the test dataset 



In [21]:
#loss_fn = CrossEntropyLoss()
test_loss = []
test_acc = []
test_prec = []
test_rec = []
test_f1 = []

test_acc_w = [] # weighted
test_prec_w = []
test_rec_w = []
test_f1_w = []

test_acc_b = [] # binary
test_prec_b = []
test_rec_b = []
test_f1_b = []

for batch in tqdm(test_loader):
    batch = tuple(batch[t].to(device) for t in batch)      # batch to GPU
    b_input_ids, b_input_mask, b_token_type_ids, b_labels = batch     # unpack inputs from dataloader

    with torch.no_grad():
        model.eval()
        logits = model(**{"input_ids":b_input_ids, "attention_mask":b_input_mask, "token_type_ids":b_token_type_ids}) # forward pass, calculates logit predictions 
    
    # move logits and labels to CPU
    logits = logits.detach().to('cpu').numpy()
    label_ids = b_labels.to('cpu').numpy()

    pred_flat = np.argmax(logits, axis=1).flatten()
    labels_flat = label_ids.flatten()
    
    metrics = compute_metrics(pred_flat, labels_flat, "macro")
    test_acc.append(metrics["accuracy"])
    test_prec.append(metrics["precision"])
    test_rec.append(metrics["recall"])
    test_f1.append(metrics["f1"])

    metrics = compute_metrics(pred_flat, labels_flat, "weighted")
    test_acc_w.append(metrics["accuracy"])
    test_prec_w.append(metrics["precision"])
    test_rec_w.append(metrics["recall"])
    test_f1_w.append(metrics["f1"])
    
    metrics = compute_metrics(pred_flat, labels_flat, "binary")
    test_acc_b.append(metrics["accuracy"])
    test_prec_b.append(metrics["precision"])
    test_rec_b.append(metrics["recall"])
    test_f1_b.append(metrics["f1"])
    
print(F'\n\ttest loss: {np.mean(test_loss)}')
print(F'\n\ttest acc: {np.mean(test_acc)}')
print(F'\n\ttest prec: {np.mean(test_prec)}')
print(F'\n\ttest rec: {np.mean(test_rec)}')
print(F'\n\ttest f1: {np.mean(test_f1)}')
print()
print(F'\n\ttest acc weighted: {np.mean(test_acc_w)}')
print(F'\n\ttest prec weighted: {np.mean(test_prec_w)}')
print(F'\n\ttest rec weighted: {np.mean(test_rec_w)}')
print(F'\n\ttest f1 weighted: {np.mean(test_f1_w)}')
print()
print(F'\n\ttest acc binary: {np.mean(test_acc_b)}')
print(F'\n\ttest prec binary: {np.mean(test_prec_b)}')
print(F'\n\ttest rec binary: {np.mean(test_rec_b)}')
print(F'\n\ttest f1 binary: {np.mean(test_f1_b)}')






  , "labels" : torch.tensor(self.labels[idx], dtype=torch.long)
  , "labels" : torch.tensor(self.labels[idx], dtype=torch.long)
  , "labels" : torch.tensor(self.labels[idx], dtype=torch.long)
  , "labels" : torch.tensor(self.labels[idx], dtype=torch.long)
  , "labels" : torch.tensor(self.labels[idx], dtype=torch.long)
  , "labels" : torch.tensor(self.labels[idx], dtype=torch.long)
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  , "labels" : torch.tensor(self.labels[idx], dtype=torch.long)
  , "labels" : torch.tensor(self.labels[idx], dtype=torch.long)
  , "labels" : torch.tensor(self.labels[idx], dtype=torch.long)
  , "labels" : torch.tensor(self.labels[idx], dtype=torch.long)
  , "labels" : torch.tensor(self.labels[idx], dtype=torch.long)
  , "labels" : torch.tensor(self.labels[idx], dtype=torch.long)
  , "labels" : torch.tensor(self.labels[idx], dtype=torch.long)
  ,

  , "labels" : torch.tensor(self.labels[idx], dtype=torch.long)
  , "labels" : torch.tensor(self.labels[idx], dtype=torch.long)
  , "labels" : torch.tensor(self.labels[idx], dtype=torch.long)
  , "labels" : torch.tensor(self.labels[idx], dtype=torch.long)
  , "labels" : torch.tensor(self.labels[idx], dtype=torch.long)
  , "labels" : torch.tensor(self.labels[idx], dtype=torch.long)
  , "labels" : torch.tensor(self.labels[idx], dtype=torch.long)
  , "labels" : torch.tensor(self.labels[idx], dtype=torch.long)
  , "labels" : torch.tensor(self.labels[idx], dtype=torch.long)
  , "labels" : torch.tensor(self.labels[idx], dtype=torch.long)
  , "labels" : torch.tensor(self.labels[idx], dtype=torch.long)
  , "labels" : torch.tensor(self.labels[idx], dtype=torch.long)
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  , "labels" : torch.tensor(self.labels[idx], dtype=torch.long)
  ,

  , "labels" : torch.tensor(self.labels[idx], dtype=torch.long)
  , "labels" : torch.tensor(self.labels[idx], dtype=torch.long)
  , "labels" : torch.tensor(self.labels[idx], dtype=torch.long)
  , "labels" : torch.tensor(self.labels[idx], dtype=torch.long)
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  , "labels" : torch.tensor(self.labels[idx], dtype=torch.long)
  , "labels" : torch.tensor(self.labels[idx], dtype=torch.long)
  , "labels" : torch.tensor(self.labels[idx], dtype=torch.long)
  , "labels" : torch.tensor(self.labels[idx], dtype=torch.long)
  , "labels" : torch.tensor(self.labels[idx], dtype=torch.long)
  , "labels" : torch.tensor(self.labels[idx], dtype=torch.long)
  , "labels" : torch.tensor(self.labels[idx], dtype=torch.long)
  , "labels" : torch.tensor(self.labels[idx], dtype=torch.long)
  , "labels" : torch.tensor(self.labels[idx], dtype=torch.long)
  ,

  , "labels" : torch.tensor(self.labels[idx], dtype=torch.long)
  , "labels" : torch.tensor(self.labels[idx], dtype=torch.long)
  , "labels" : torch.tensor(self.labels[idx], dtype=torch.long)
  , "labels" : torch.tensor(self.labels[idx], dtype=torch.long)
  , "labels" : torch.tensor(self.labels[idx], dtype=torch.long)
  , "labels" : torch.tensor(self.labels[idx], dtype=torch.long)
  , "labels" : torch.tensor(self.labels[idx], dtype=torch.long)
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  , "labels" : torch.tensor(self.labels[idx], dtype=torch.long)
  , "labels" : torch.tensor(self.labels[idx], dtype=torch.long)
  , "labels" : torch.tensor(self.labels[idx], dtype=torch.long)
  , "labels" : torch.tensor(self.labels[idx], dtype=torch.long)
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, 

  , "labels" : torch.tensor(self.labels[idx], dtype=torch.long)
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  , "labels" : torch.tensor(self.labels[idx], dtype=torch.long)
  , "labels" : torch.tensor(self.labels[idx], dtype=torch.long)
  , "labels" : torch.tensor(self.labels[idx], dtype=torch.long)
  , "labels" : torch.tensor(self.labels[idx], dtype=torch.long)
  , "labels" : torch.tensor(self.labels[idx], dtype=torch.long)
  , "labels" : torch.tensor(self.labels[idx], dtype=torch.long)
  , "labels" : torch.tensor(self.labels[idx], dtype=torch.long)
  , "labels" : torch.tensor(self.labels[idx], dtype=torch.long)
  , "labels" : torch.tensor(self.labels[idx], dtype=torch.long)
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
100%|█████████████████████████


	test loss: nan

	test acc: 0.6527133580705009

	test prec: 0.5951582328205705

	test rec: 0.6677101037490646

	test f1: 0.5432614223257578


	test acc weighted: 0.6527133580705009

	test prec weighted: 0.8760435136652668

	test rec weighted: 0.6527133580705009

	test f1 weighted: 0.7125962518412591


	test acc binary: 0.6527133580705009

	test prec binary: 0.23735359157437072

	test rec binary: 0.7045454545454546

	test f1 binary: 0.3364920072712281



  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


In [19]:
for p, r, f in zip(test_prec, test_rec, test_f1):
    print(p,r,f)

0.49090909090909096 0.48717948717948717 0.3650793650793651
0.7833333333333333 0.8090909090909091 0.7922077922077921
0.5 0.21875 0.30434782608695654
0.6875 0.8076923076923077 0.6536796536796536
0.625 0.6538461538461539 0.43529411764705883
0.6666666666666667 0.6666666666666667 0.625
0.5625 0.7666666666666666 0.4589371980676329
0.5833333333333334 0.8333333333333333 0.5428571428571429
0.625 0.9 0.6444444444444445
0.4583333333333333 0.36666666666666664 0.4074074074074074
0.75 0.8846153846153846 0.7681159420289856
0.5714285714285714 0.8 0.49999999999999994
0.6545454545454545 0.717948717948718 0.6666666666666666
0.625 0.7857142857142857 0.5636363636363637
0.5 0.375 0.42857142857142855
0.6666666666666666 0.8571428571428572 0.6666666666666666
0.625 0.9 0.6444444444444445
0.6111111111111112 0.75 0.5151515151515151
0.5714285714285714 0.8 0.49999999999999994
0.8333333333333333 0.9166666666666667 0.8545454545454545
0.5416666666666667 0.5512820512820513 0.5428571428571429
0.6111111111111112 0.75 0.5

### Print predictions of last test set batch:

In [123]:
# take last batch of test set:

for i in range(len(batch)):
    tokens = tokenizer.convert_ids_to_tokens(b_input_ids[i])
    print("\nPadded Sentence:")
    print(tokens)
    print("prediction:", pred_flat[i])
    


Padded Sentence:
['<s>', '@USER', 'Hey', 'Chee@@', 'to', 'man', ',', 'I', 'am', 'older', '.', '</s>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>']
prediction: 0

Padded Sentence:
['<s>', 'My', 'body', 'almost', 'went', 'into', 'diabetic', 'coma', '.', '</s>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '

In [124]:
# add seed => check
# add binary accuracy  => ???????
# add plot loss function accuracy => validation accu

# y - axis: loss function; validation accuracy
# x - axis: epochs


# epochs, learning rate => ok

# 90% training => 10% test  => ok
# how to use random batch of training set for validation

# clean notebook

# clean data sheet => check

# Check Pytorch: EarlyStopping add => check