## Bertweet model for sentiment with intermediate fine-tuning on emoji prediction

### 1. install pakages

In [1]:
!pip install transformers



In [2]:
import tensorflow
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
import torch.nn as nn
import torch.nn.functional as F
import torch.autograd as autograd
from tqdm import tqdm, trange
import pandas as pd
import numpy as np
import io
import os
import matplotlib.pyplot as plt
from keras.preprocessing.sequence import pad_sequences
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score, classification_report, confusion_matrix
import matplotlib
import matplotlib.pyplot as plt

In [3]:
from transformers.optimization import AdamW
from transformers import get_linear_schedule_with_warmup
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
import torch
from transformers import AutoModel, AutoTokenizer

bertweet = AutoModel.from_pretrained("vinai/bertweet-base")

# For transformers v4.x+:
tokenizer = AutoTokenizer.from_pretrained("vinai/bertweet-base", use_fast=False)

Some weights of the model checkpoint at vinai/bertweet-base were not used when initializing RobertaModel: ['lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.bias', 'lm_head.decoder.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
emoji is not installed, thus not converting emoticons or emojis into text. Please install emoji: pip3 install emoji
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [5]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

### 2. data preparation 

In [6]:
import pandas as pd
# define a function for data preparation
def data_prepare(file_path, lab2ind, tokenizer, max_len = 64, mode = 'train'):
    '''
    file_path: the path to input file. 
                In train mode, the input must be a tsv file that includes two columns where the first is text, and second column is label.
                The first row must be header of columns.

                In predict mode, the input must be a tsv file that includes only one column where the first is text.
                The first row must be header of column.

    lab2ind: dictionary of label classes
    tokenizer: BERT tokenizer
    max_len: maximal length of input sequence
    mode: train or predict
    '''
    # if we are in train mode, we will load two columns (i.e., text and label).
    if mode == 'train':
        # Use pandas to load dataset
        df = pd.read_csv(file_path, delimiter='\t',header=0, names=['content','label'])
        print("Data size ", df.shape)
        labels = df.label.values
        
        # Create sentence and label lists
        labels = [lab2ind[i] for i in labels] 
        print("Label is ", labels[0])
        
        # Convert data into torch tensors
        labels = torch.tensor(labels)

    # if we are in predict mode, we will load one column (i.e., text).
    elif mode == 'predict':
        df = pd.read_csv(file_path, delimiter='\t',header=0, names=['content'])
        print("Data size ", df.shape)
        # create placeholder
        labels = []
    else:
        print("the type of mode should be either 'train' or 'predict'. ")
        return
        
    # Create sentence and label lists
    content = df.content.values
    #### REF START ####

    # We need to add a special token at the beginning for BERT to work properly.
    content = ["[CLS] " + text for text in content]

    # Import the BERT tokenizer, used to convert our text into tokens that correspond to BERT's vocabulary.
    tokenized_texts = [tokenizer.tokenize(text) for text in content]
    
    # if the sequence is longer the maximal length, we truncate it to the pre-defined maximal length
    tokenized_texts = [ text[:max_len+1] for text in tokenized_texts]

    # We also need to add a special token at the end.
    tokenized_texts = [ text+['[SEP]'] for text in tokenized_texts]
    print ("Tokenize the first sentence:\n",tokenized_texts[0])
    
    # Use the BERT tokenizer to convert the tokens to their index numbers in the BERT vocabulary
    input_ids = [tokenizer.convert_tokens_to_ids(x) for x in tokenized_texts]
    print ("Index numbers of the first sentence:\n",input_ids[0])

    # Pad our input seqeunce to the fixed length (i.e., max_len) with index of [PAD] token
    pad_ind = tokenizer.convert_tokens_to_ids(['[PAD]'])[0]
    input_ids = pad_sequences(input_ids, maxlen=max_len+2, dtype="long", truncating="post", padding="post", value=pad_ind)
    print ("Index numbers of the first sentence after padding:\n",input_ids[0])

    # Create attention masks
    attention_masks = []

    # Create a mask of 1s for each token followed by 0s for pad tokens
    for seq in input_ids:
        seq_mask = [float(i>0) for i in seq]
        attention_masks.append(seq_mask)

    # Convert all of our data into torch tensors, the required datatype for our model
    inputs = torch.tensor(input_ids)
    masks = torch.tensor(attention_masks)
    #### REF END ####

    return inputs, labels, masks    

In [7]:
lab2ind_emoji = {0:0, 1:1, 2:2, 3:3, 4:4, 5:5, 6:6, 7:7, 8:8, 9:9, 10:10, 11:11, 12:12, 13:13, 14:14, 15:15, 16:16, 17:17, 18:18, 19:19}

In [8]:
lab2ind_sentiment = {0: 0, 1: 1, 2:2}

In [9]:
train_inputs_emoji, train_labels_emoji, train_masks_emoji = data_prepare("train_emoji.tsv", lab2ind_emoji,tokenizer)

Data size  (45000, 2)
Label is  12
Tokenize the first sentence:
 ['[@@', 'CL@@', 'S@@', ']', 'Sunday', 'afternoon', 'walking', 'through', 'Venice', 'in', 'the', 'sun', 'with', '@@@', 'user', '️', '️', '️', '@', 'Ab@@', 'bot', 'Kin@@', 'ney@@', ',', 'Venice', '[SEP]']
Index numbers of the first sentence:
 [61658, 6411, 381, 317, 970, 2464, 1508, 292, 18911, 16, 6, 1599, 30, 5238, 4699, 3, 3, 3, 59157, 3547, 5178, 12229, 11558, 7, 18911, 3]
Index numbers of the first sentence after padding:
 [61658  6411   381   317   970  2464  1508   292 18911    16     6  1599
    30  5238  4699     3     3     3 59157  3547  5178 12229 11558     7
 18911     3     3     3     3     3     3     3     3     3     3     3
     3     3     3     3     3     3     3     3     3     3     3     3
     3     3     3     3     3     3     3     3     3     3     3     3
     3     3     3     3     3     3]


In [10]:
validation_inputs_emoji, validation_labels_emoji, validation_masks_emoji = data_prepare("val_emoji.tsv", lab2ind_emoji,tokenizer)

Data size  (5000, 2)
Label is  0
Tokenize the first sentence:
 ['[@@', 'CL@@', 'S@@', ']', 'A', 'little', 'throwback', 'with', 'my', 'favourite', 'person', '@', 'Water', 'Wall', '[SEP]']
Index numbers of the first sentence:
 [61658, 6411, 381, 317, 104, 263, 16189, 30, 23, 2060, 282, 59157, 3782, 3072, 3]
Index numbers of the first sentence after padding:
 [61658  6411   381   317   104   263 16189    30    23  2060   282 59157
  3782  3072     3     3     3     3     3     3     3     3     3     3
     3     3     3     3     3     3     3     3     3     3     3     3
     3     3     3     3     3     3     3     3     3     3     3     3
     3     3     3     3     3     3     3     3     3     3     3     3
     3     3     3     3     3     3]


In [11]:
batch_size = 32
# We'll take training samples in random order in each epoch. 
train_data_emoji = TensorDataset(train_inputs_emoji, train_masks_emoji, train_labels_emoji)
train_dataloader_emoji = DataLoader(train_data_emoji, 
                              sampler = RandomSampler(train_data_emoji), # Select batches randomly
                              batch_size=batch_size)

In [12]:
validation_data_emoji = TensorDataset(validation_inputs_emoji, validation_masks_emoji, validation_labels_emoji)
validation_dataloader_emoji = DataLoader(validation_data_emoji, 
                                   sampler = SequentialSampler(validation_data_emoji), # Pull out batches sequentially.
                                   batch_size=batch_size)

In [13]:
train_inputs_sentiment, train_labels_sentiment, train_masks_sentiment = data_prepare("train_sentiment.tsv", lab2ind_sentiment,tokenizer)

Data size  (45615, 2)
Label is  2
Tokenize the first sentence:
 ['[@@', 'CL@@', 'S@@', ']', '"@@', 'QT', '@@@', 'user', 'In', 'the', 'original', 'draft', 'of', 'the', '7th', 'book@@', ',', 'Rem@@', 'us', 'Lup@@', 'in', 'survived', 'the', 'Battle', 'of', 'Hog@@', 'war@@', 'ts@@', '.', '#HappyBirthday@@', 'Rem@@', 'us@@', 'Lup@@', 'in@@', '"', '[SEP]']
Index numbers of the first sentence:
 [61658, 6411, 381, 317, 61933, 19989, 5238, 4699, 173, 6, 1782, 4271, 15, 6, 4133, 8470, 7, 8867, 148, 30411, 16, 9610, 6, 2606, 15, 60201, 3628, 4813, 4, 11835, 8867, 1924, 30411, 520, 26, 3]
Index numbers of the first sentence after padding:
 [61658  6411   381   317 61933 19989  5238  4699   173     6  1782  4271
    15     6  4133  8470     7  8867   148 30411    16  9610     6  2606
    15 60201  3628  4813     4 11835  8867  1924 30411   520    26     3
     3     3     3     3     3     3     3     3     3     3     3     3
     3     3     3     3     3     3     3     3     3     3     3     3

In [14]:
validation_inputs_sentiment, validation_labels_sentiment, validation_masks_sentiment = data_prepare("val_sentiment.tsv", lab2ind_sentiment,tokenizer)

Data size  (2000, 2)
Label is  1
Tokenize the first sentence:
 ['[@@', 'CL@@', 'S@@', ']', 'Dark', 'Souls', '3', 'April', 'Launch', 'Date', 'Confirmed', 'With', 'New', 'Trail@@', 'er:', 'Embrace', 'the', 'dark@@', 'ness@@', '.', '[SEP]']
Index numbers of the first sentence:
 [61658, 6411, 381, 317, 3713, 20987, 163, 1249, 9767, 2553, 22650, 458, 210, 53681, 25570, 26755, 6, 13863, 16571, 4, 3]
Index numbers of the first sentence after padding:
 [61658  6411   381   317  3713 20987   163  1249  9767  2553 22650   458
   210 53681 25570 26755     6 13863 16571     4     3     3     3     3
     3     3     3     3     3     3     3     3     3     3     3     3
     3     3     3     3     3     3     3     3     3     3     3     3
     3     3     3     3     3     3     3     3     3     3     3     3
     3     3     3     3     3     3]


In [15]:
batch_size = 32
# We'll take training samples in random order in each epoch. 
train_data_sentiment = TensorDataset(train_inputs_sentiment, train_masks_sentiment, train_labels_sentiment)
train_dataloader_sentiment = DataLoader(train_data_sentiment, 
                              sampler = RandomSampler(train_data_sentiment), # Select batches randomly
                              batch_size=batch_size)

In [16]:
validation_data_sentiment = TensorDataset(validation_inputs_sentiment, validation_masks_sentiment, validation_labels_sentiment)
validation_dataloader_sentiment = DataLoader(validation_data_sentiment, 
                                   sampler = SequentialSampler(validation_data_sentiment), # Pull out batches sequentially.
                                   batch_size=batch_size)

### 3. Bertweet model - emoji

In [17]:
class Bertweet_cls(nn.Module):

    def __init__(self, lab2ind, model_path, hidden_size):
        super(Bertweet_cls, self).__init__()
        self.model_path = model_path
        self.hidden_size = hidden_size
        self.bert_model = AutoModel.from_pretrained(model_path, output_hidden_states=True, output_attentions=True)
        
        self.label_num = len(lab2ind)
        
        self.dense = nn.Linear(self.hidden_size, self.hidden_size)
        self.dropout = nn.Dropout(0.1)
        self.fc = nn.Linear(self.hidden_size, self.label_num)

    def forward(self, bert_ids, bert_mask):
        outputs = self.bert_model(input_ids=bert_ids, attention_mask = bert_mask)
        pooler_output = outputs['pooler_output']
        attentions = outputs['attentions']
        
        x = self.dense(pooler_output)
        x = torch.tanh(x)
        x = self.dropout(x)
        fc_output = self.fc(x)

        return fc_output, attentions

In [18]:
bertweet_model = Bertweet_cls(lab2ind_emoji, "vinai/bertweet-base", 768).to(device)

Some weights of the model checkpoint at vinai/bertweet-base were not used when initializing RobertaModel: ['lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.bias', 'lm_head.decoder.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [19]:
# Parameters for emoji model:
lr = 0.0001
max_grad_norm = 1.0
epochs = 5
warmup_proportion = 0.1
num_training_steps  = len(train_dataloader_emoji) * epochs
num_warmup_steps = num_training_steps * warmup_proportion

### In Transformers, optimizer and schedules are instantiated like this:
# Note: AdamW is a class from the huggingface library
# the 'W' stands for 'Weight Decay"
optimizer = AdamW(bertweet_model.parameters(), lr=lr, correct_bias=False)
# schedules
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=num_warmup_steps, num_training_steps=num_training_steps)  # PyTorch scheduler

# We use nn.CrossEntropyLoss() as our loss function. 
criterion = nn.CrossEntropyLoss()



### 4. model training - emoji


In [20]:
def train(model, iterator, optimizer, scheduler, criterion):
    
    model.train()
    epoch_loss = 0
    
    for i, batch in enumerate(iterator):
        
        # Add batch to GPU
        batch = tuple(t.to(device) for t in batch)
        # Unpack the inputs from our dataloader
        input_ids, input_mask, labels = batch

        outputs,_ = model(input_ids, input_mask)

        loss = criterion(outputs, labels)
        # delete used variables to free GPU memory
        del batch, input_ids, input_mask, labels
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)  # Gradient clipping is not in AdamW anymore
        optimizer.step()
        scheduler.step()
        epoch_loss += loss.cpu().item()
        optimizer.zero_grad()
    
    # free GPU memory
    if device == 'cuda':
        torch.cuda.empty_cache()

    return epoch_loss / len(iterator)

In [21]:
def evaluate(model, iterator, criterion):
    
    model.eval()
    
    epoch_loss = 0
    all_pred=[]
    all_label = []
    
    with torch.no_grad():
    
        for i, batch in enumerate(iterator):

            # Add batch to GPU
            batch = tuple(t.to(device) for t in batch)
            # Unpack the inputs from our dataloader
            input_ids, input_mask, labels = batch

            outputs,_ = model(input_ids, input_mask)
            
            loss = criterion(outputs, labels)

            # delete used variables to free GPU memory
            del batch, input_ids, input_mask
            epoch_loss += loss.cpu().item()

            # identify the predicted class for each example in the batch
            probabilities, predicted = torch.max(outputs.cpu().data, 1)
            # put all the true labels and predictions to two lists
            all_pred.extend(predicted)
            all_label.extend(labels.cpu())
    
    accuracy = accuracy_score(all_label, all_pred)
    f1score = f1_score(all_label, all_pred, average='macro') 
    return epoch_loss / len(iterator), accuracy, f1score

In [22]:
import os
save_path = './drive/My Drive/Colab Notebooks/ckpt_BERTweet/'
if os.path.exists(save_path) == False:
    os.makedirs(save_path)

In [23]:
# Train the model
loss_list = []
acc_list = []

for epoch in trange(epochs, desc="Epoch"):
    train_loss = train(bertweet_model, train_dataloader_emoji, optimizer, scheduler, criterion)  
    val_loss, val_acc, val_f1 = evaluate(bertweet_model, validation_dataloader_emoji, criterion)

    # Create checkpoint at end of each epoch
    state = {
        'epoch': epoch,
        'state_dict': bertweet_model.state_dict(),
        'optimizer': optimizer.state_dict(),
        'scheduler': scheduler.state_dict()
        }

    torch.save(state, "./drive/My Drive/Colab Notebooks/ckpt_BERTweet/BERT_"+str(epoch+1)+".pt")

    print('\n Epoch [{}/{}], Train Loss: {:.4f}, Validation Loss: {:.4f}, Validation Accuracy: {:.4f}, Validation F1: {:.4f}'.format(epoch+1, epochs, train_loss, val_loss, val_acc, val_f1))

Epoch:  20%|██        | 1/5 [18:07<1:12:30, 1087.62s/it]


 Epoch [1/5], Train Loss: 2.1043, Validation Loss: 2.3254, Validation Accuracy: 0.2946, Validation F1: 0.2040


Epoch:  40%|████      | 2/5 [36:03<54:02, 1080.87s/it]  


 Epoch [2/5], Train Loss: 1.7357, Validation Loss: 2.3789, Validation Accuracy: 0.2930, Validation F1: 0.2387


Epoch:  60%|██████    | 3/5 [54:02<35:59, 1079.89s/it]


 Epoch [3/5], Train Loss: 1.4122, Validation Loss: 2.3885, Validation Accuracy: 0.3122, Validation F1: 0.2636


Epoch:  80%|████████  | 4/5 [1:12:02<17:59, 1079.77s/it]


 Epoch [4/5], Train Loss: 1.0733, Validation Loss: 2.6220, Validation Accuracy: 0.3108, Validation F1: 0.2828


Epoch: 100%|██████████| 5/5 [1:30:02<00:00, 1080.54s/it]


 Epoch [5/5], Train Loss: 0.7778, Validation Loss: 2.8473, Validation Accuracy: 0.3096, Validation F1: 0.2878





In [24]:
best_emoji = torch.load("./drive/My Drive/Colab Notebooks/ckpt_BERTweet/BERT_5.pt",map_location=torch.device('cpu'))

In [25]:
# initialize weight for the linear layer
best_emoji['state_dict']['fc.weight']= torch.randn([3, 768]) #need to be normal;

In [26]:
best_emoji['state_dict']['fc.bias']=torch.zeros([3]) #all 0s for bias

In [27]:
torch.save(best_emoji, "./drive/My Drive/Colab Notebooks/ckpt_BERTweet/best_emoji"+".pt")

### 5. Bertweet model - sentiment

In [28]:
best_emoji_checkpoint = torch.load("./drive/My Drive/Colab Notebooks/ckpt_BERTweet/best_emoji.pt",map_location=torch.device('cpu'))

In [29]:
model = Bertweet_cls(lab2ind_sentiment, "vinai/bertweet-base", 768).to(device)
model.load_state_dict(best_emoji_checkpoint['state_dict'])

Some weights of the model checkpoint at vinai/bertweet-base were not used when initializing RobertaModel: ['lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.bias', 'lm_head.decoder.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


<All keys matched successfully>

In [30]:
# Parameters for sentiment:
lr = 0.00002
max_grad_norm = 1.0
epochs = 3
warmup_proportion = 0.1
num_training_steps  = len(train_dataloader_sentiment) * epochs
num_warmup_steps = num_training_steps * warmup_proportion

### In Transformers, optimizer and schedules are instantiated like this:
# Note: AdamW is a class from the huggingface library
# the 'W' stands for 'Weight Decay"
optimizer = AdamW(model.parameters(), lr=lr, correct_bias=False)
# schedules
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=num_warmup_steps, num_training_steps=num_training_steps)  # PyTorch scheduler

# We use nn.CrossEntropyLoss() as our loss function. 
criterion = nn.CrossEntropyLoss()



### 6. model training - sentiment

In [31]:
# Train the sentiment model
loss_list = []
acc_list = []

for epoch in trange(epochs, desc="Epoch"):
    train_loss = train(model, train_dataloader_sentiment, optimizer, scheduler, criterion)  
    val_loss, val_acc, val_f1 = evaluate(model, validation_dataloader_sentiment, criterion)

    # Create checkpoint at end of each epoch
    state = {
        'epoch': epoch,
        'state_dict': model.state_dict(),
        'optimizer': optimizer.state_dict(),
        'scheduler': scheduler.state_dict()
        }

    torch.save(state, "./drive/My Drive/Colab Notebooks/ckpt_BERTweet/BERT_sentiment"+str(epoch+1)+".pt")

    print('\n Epoch [{}/{}], Train Loss: {:.4f}, Validation Loss: {:.4f}, Validation Accuracy: {:.4f}, Validation F1: {:.4f}'.format(epoch+1, epochs, train_loss, val_loss, val_acc, val_f1))

Epoch:  33%|███▎      | 1/3 [17:48<35:37, 1068.57s/it]


 Epoch [1/3], Train Loss: 1.3196, Validation Loss: 0.6831, Validation Accuracy: 0.7195, Validation F1: 0.7051


Epoch:  67%|██████▋   | 2/3 [35:37<17:48, 1068.78s/it]


 Epoch [2/3], Train Loss: 0.5974, Validation Loss: 0.6597, Validation Accuracy: 0.7265, Validation F1: 0.7121


Epoch: 100%|██████████| 3/3 [53:27<00:00, 1069.09s/it]


 Epoch [3/3], Train Loss: 0.4628, Validation Loss: 0.6841, Validation Accuracy: 0.7340, Validation F1: 0.7139





smaller learning rate for sentiment than for emoji bc it is more fine-grined.

### 7. model evaluation - sentiment

In [32]:
test_inputs_sentiment, test_labels_sentiment, test_masks_sentiment = data_prepare("test_sentiment.tsv", lab2ind_sentiment,tokenizer)

Data size  (12284, 2)
Label is  1
Tokenize the first sentence:
 ['[@@', 'CL@@', 'S@@', ']', '@@@', 'user', '@@@', 'user', 'what', 'do', 'these', "'@@", '1/2', 'naked', 'pic@@', 's@@', "'", 'have', 'to', 'do', 'with', 'anything@@', '?', 'They@@', "'re", 'not', 'even', 'like', 'that@@', '.', '[SEP]']
Index numbers of the first sentence:
 [61658, 6411, 381, 317, 5238, 4699, 5238, 4699, 66, 32, 198, 1909, 3300, 1842, 7781, 423, 69, 36, 9, 32, 30, 48735, 21, 32752, 81, 46, 132, 43, 6139, 4, 3]
Index numbers of the first sentence after padding:
 [61658  6411   381   317  5238  4699  5238  4699    66    32   198  1909
  3300  1842  7781   423    69    36     9    32    30 48735    21 32752
    81    46   132    43  6139     4     3     3     3     3     3     3
     3     3     3     3     3     3     3     3     3     3     3     3
     3     3     3     3     3     3     3     3     3     3     3     3
     3     3     3     3     3     3]


In [33]:
batch_size = 32
# We'll take training samples in random order in each epoch. 
test_data_sentiment = TensorDataset(test_inputs_sentiment, test_masks_sentiment, test_labels_sentiment)
test_dataloader_sentiment = DataLoader(test_data_sentiment, 
                              sampler = RandomSampler(test_data_sentiment), # Select batches randomly
                              batch_size=batch_size)

In [34]:
best_sentiment_checkpoint = torch.load("./drive/My Drive/Colab Notebooks/ckpt_BERTweet/BERT_sentiment3.pt",map_location=torch.device('cpu'))

In [35]:
model_sentiment = Bertweet_cls(lab2ind_sentiment, "vinai/bertweet-base", 768).to(device)
model_sentiment.load_state_dict(best_sentiment_checkpoint['state_dict'])

Some weights of the model checkpoint at vinai/bertweet-base were not used when initializing RobertaModel: ['lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.bias', 'lm_head.decoder.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


<All keys matched successfully>

In [36]:
test_loss, test_acc, test_f1 = evaluate(model_sentiment, test_dataloader_sentiment, criterion)
print('Test Loss: {:.4f}, Test Accuracy: {:.4f}, Test F1: {:.4f}'.format(test_loss, test_acc, test_f1))

Test Loss: 0.7273, Test Accuracy: 0.7044, Test F1: 0.7056


Test on vaccine data

In [37]:
lab2ind_vac = {'negative': 0, 'neutral':1, 'positive':2}

In [39]:
vac_inputs_sentiment, vac_labels_sentiment, vac_masks_sentiment = data_prepare("test_vaccines.tsv", lab2ind_vac,tokenizer)

Data size  (101, 2)
Label is  1
Tokenize the first sentence:
 ['[@@', 'CL@@', 'S@@', ']', 'According', 'to', 'the', 'latest', '#v@@', 'acc@@', 'ine', 'report', 'by', 'the', '#UK@@', 'Health@@', 'Sec@@', 'urity@@', 'Ag@@', 'ency@@', ',', '#V@@', 'acc@@', 'ine@@', 'Eff@@', 'ectiveness', 'for', 'tri@@', 'ple-@@', 'jab@@', 'bed', 'young', 'and', 'older', 'people', 'are', 'below', 'zero@@', ',', 'and', 'more', 'likely', 'to', 'get', 'with', '#Co@@', 'vid', '[SEP]']
Index numbers of the first sentence:
 [61658, 6411, 381, 317, 3877, 9, 6, 1405, 6615, 6530, 1466, 1649, 61, 6, 20461, 15251, 11060, 43243, 7709, 35615, 7, 2863, 6530, 3800, 24037, 61598, 19, 3203, 35939, 27819, 424, 859, 13, 1879, 83, 41, 2703, 46631, 7, 13, 89, 2056, 9, 51, 30, 8232, 4759, 3]
Index numbers of the first sentence after padding:
 [61658  6411   381   317  3877     9     6  1405  6615  6530  1466  1649
    61     6 20461 15251 11060 43243  7709 35615     7  2863  6530  3800
 24037 61598    19  3203 35939 27819   424

In [40]:
batch_size = 32
# We'll take training samples in random order in each epoch. 
vac_data_sentiment = TensorDataset(vac_inputs_sentiment, vac_masks_sentiment, vac_labels_sentiment)
vac_dataloader_sentiment = DataLoader(vac_data_sentiment, 
                              sampler = RandomSampler(vac_data_sentiment), # Select batches randomly
                              batch_size=batch_size)

In [41]:
best_sentiment_checkpoint = torch.load("./drive/My Drive/Colab Notebooks/ckpt_BERTweet/BERT_sentiment3.pt",map_location=torch.device('cpu'))

In [42]:
model_sentiment = Bertweet_cls(lab2ind_vac, "vinai/bertweet-base", 768).to(device)
model_sentiment.load_state_dict(best_sentiment_checkpoint['state_dict'])

Some weights of the model checkpoint at vinai/bertweet-base were not used when initializing RobertaModel: ['lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.bias', 'lm_head.decoder.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


<All keys matched successfully>

In [43]:
criterion = nn.CrossEntropyLoss()
test_loss, test_acc, test_f1 = evaluate(model_sentiment, vac_dataloader_sentiment, criterion)
print('Test Loss: {:.4f}, Test Accuracy: {:.4f}, Test F1: {:.4f}'.format(test_loss, test_acc, test_f1))

Test Loss: 0.5339, Test Accuracy: 0.7624, Test F1: 0.7461


Test on masks

In [44]:
mask_inputs_sentiment, mask_labels_sentiment, mask_masks_sentiment = data_prepare("test_masks.tsv", lab2ind_vac,tokenizer)

Data size  (90, 2)
Label is  2
Tokenize the first sentence:
 ['[@@', 'CL@@', 'S@@', ']', 'Thank', 'you', 'to', 'all', 'those', 'who', 'still', 'limit', 'indoor', 'social', 'gather@@', 'ings@@', ',', 'get', 'boo@@', 'sted@@', ',', 'mask', 'at', 'the', 'grocery', 'store', '&@@', 'amp@@', ';', 'send', 'their', 'children', 'to', 'school', 'in', 'mask@@', 's.', 'I', 'see', 'you', 'and', 'appreciate', 'you', 'caring', 'about', 'the', 'safety', 'of', 'the', 'rest', 'of', 'us@@', '.', 'We', 'over', 'me@@', '.', '#COVID19', '#Covid@@', 'Is@@', 'Not@@', 'Over', '#M@@', 'ask@@', 'Up', '[SEP]']
Index numbers of the first sentence:
 [61658, 6411, 381, 317, 396, 14, 9, 48, 268, 87, 135, 3947, 17617, 1009, 62780, 17726, 7, 51, 4910, 41756, 7, 7146, 35, 6, 8923, 1297, 19295, 6755, 208, 786, 130, 994, 9, 230, 16, 61163, 32188, 8, 95, 14, 13, 1621, 14, 5325, 62, 6, 3705, 15, 6, 765, 15, 1924, 4, 134, 141, 1677, 4, 4270, 60605, 3166, 8719, 1773, 1230, 11686, 857, 3]
Index numbers of the first sentence af

In [45]:
batch_size = 32
# We'll take training samples in random order in each epoch. 
mask_data_sentiment = TensorDataset(mask_inputs_sentiment, mask_masks_sentiment, mask_labels_sentiment)
mask_dataloader_sentiment = DataLoader(mask_data_sentiment, 
                              sampler = RandomSampler(mask_data_sentiment), # Select batches randomly
                              batch_size=batch_size)

In [46]:
test_loss, test_acc, test_f1 = evaluate(model_sentiment, mask_dataloader_sentiment, criterion)
print('Test Loss: {:.4f}, Test Accuracy: {:.4f}, Test F1: {:.4f}'.format(test_loss, test_acc, test_f1))

Test Loss: 0.6645, Test Accuracy: 0.7222, Test F1: 0.6913


### 8. predict 

In [48]:
def sententce_prepocess(content, tokenizer, max_len = 64):
    """
    content: list of string. Each string is a sample. We only include one sample in this list.
    tokenizer: BertTokenizerFast
    """
    #### REF START ####

    # We need to add a special token at the beginning for BERT to work properly.
    content = ["[CLS] " + text for text in content]

    # Import the BERT tokenizer, used to convert our text into tokens that correspond to BERT's vocabulary.
    tokenized_texts = [tokenizer.tokenize(text) for text in content]
    
    # if the sequence is longer the maximal length, we truncate it to the pre-defined maximal length
    tokenized_texts = [ text[:max_len+1] for text in tokenized_texts]

    # We also need to add a special token at the end.
    tokenized_texts = [ text+['[SEP]'] for text in tokenized_texts]
    #print ("Tokenize the first sentence:\n",tokenized_texts[0])
    
    # Use the BERT tokenizer to convert the tokens to their index numbers in the BERT vocabulary
    input_ids = [tokenizer.convert_tokens_to_ids(x) for x in tokenized_texts]
    #print ("Index numbers of the first sentence:\n",input_ids[0])

    # Pad our input seqeunce to the fixed length (i.e., max_len) with index of [PAD] token
    pad_ind = tokenizer.convert_tokens_to_ids(['[PAD]'])[0]
    input_ids = pad_sequences(input_ids, maxlen=max_len+2, dtype="long", truncating="post", padding="post", value=pad_ind)
    #print ("Index numbers of the first sentence after padding:\n",input_ids[0])

    # Create attention masks
    attention_masks = []

    # Create a mask of 1s for each token followed by 0s for pad tokens
    for seq in input_ids:
        seq_mask = [float(i>0) for i in seq]
        attention_masks.append(seq_mask)

    # Convert all of our data into torch tensors, the required datatype for our model
    inputs = torch.tensor(input_ids)
    masks = torch.tensor(attention_masks)
    #### REF END ####

    return tokenized_texts, inputs, masks

In [49]:
masks_df = pd.read_csv('masks_twitter.csv')
masks_df.head()

Unnamed: 0,author_id,created_at,tweet_id,tweet_text,CNN_prediction,CNN_confidence,BERTweet_prediction,BERTweet_confidence,BERTweet_fine-tuned_prediction,BERTweet_fine-tuned_confidence,final_prediction
0,835967581693612033,2021-07,1421619622445469697,"The manipulative, politically opportune lying ...",,,,,,,
1,825765275853344773,2021-07,1421619085876547586,@ReallyAmerican1 Someone should introduce a bi...,,,,,,,
2,1412770891335864320,2021-07,1421617357051596804,Just wear a friggin #mask we will never #MakeA...,,,,,,,
3,1254787933493575684,2021-07,1421617137991536642,Proof Vaccines 💉 Do Not work but Harm\n\nCDC r...,,,,,,,
4,841625519779139584,2021-07,1421612879791501312,@barali4793 @RadioFreeTom We now have a huge p...,,,,,,,


In [None]:
emoji_result = []
emoji_confid = []
for index, row in masks_df.iterrows():
    tokenized_texts, input_ids, masks = sententce_prepocess([row['tweet_text']], tokenizer)
    input_ids, masks = input_ids.to(device), masks.to(device)
    outputs,_ = model_sentiment(input_ids, masks)
    probabilities, predicted = torch.max(outputs.cpu().data, 1)
    emoji_result.append(predicted.item())
    emoji_confid.append(probabilities.item())
