## Prerequisites

In [1]:
!pip install emoji

Collecting emoji
  Downloading emoji-1.7.0.tar.gz (175 kB)
[?25l[K     |█▉                              | 10 kB 30.2 MB/s eta 0:00:01[K     |███▊                            | 20 kB 9.1 MB/s eta 0:00:01[K     |█████▋                          | 30 kB 8.2 MB/s eta 0:00:01[K     |███████▌                        | 40 kB 7.8 MB/s eta 0:00:01[K     |█████████▍                      | 51 kB 4.3 MB/s eta 0:00:01[K     |███████████▏                    | 61 kB 5.1 MB/s eta 0:00:01[K     |█████████████                   | 71 kB 5.3 MB/s eta 0:00:01[K     |███████████████                 | 81 kB 4.1 MB/s eta 0:00:01[K     |████████████████▉               | 92 kB 4.6 MB/s eta 0:00:01[K     |██████████████████▊             | 102 kB 5.0 MB/s eta 0:00:01[K     |████████████████████▌           | 112 kB 5.0 MB/s eta 0:00:01[K     |██████████████████████▍         | 122 kB 5.0 MB/s eta 0:00:01[K     |████████████████████████▎       | 133 kB 5.0 MB/s eta 0:00:01[K     |██████████

In [2]:
!pip install torch torchvision transformers

Collecting transformers
  Downloading transformers-4.18.0-py3-none-any.whl (4.0 MB)
[K     |████████████████████████████████| 4.0 MB 4.3 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.50.tar.gz (880 kB)
[K     |████████████████████████████████| 880 kB 41.6 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.5.1-py3-none-any.whl (77 kB)
[K     |████████████████████████████████| 77 kB 6.7 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 42.4 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 60.5 MB/s 
Collecting click==8.0
  Downloading click-8.0.0-py3-none-any.whl (96 kB)
[K     |████████████████████████████████| 96 kB 5.5 MB/s 
Building wheel

Note: In order to run BERTweet, you need to install the latest version of transformers:
* `git clone https://github.com/huggingface/transformers.git`
* `cd transformers`
* `pip3 install --upgrade .`

In [3]:
import os
import json
import random
import time
import datetime
import torch
import argparse
import numpy as np
import pandas as pd
from torch.nn import functional as F
from sklearn.metrics import f1_score, recall_score, precision_score, accuracy_score
from transformers import get_linear_schedule_with_warmup,AdamW,AutoModel, AutoTokenizer, AutoModelForSequenceClassification
from torch.utils.data import TensorDataset,DataLoader, RandomSampler, SequentialSampler, Dataset

Define some helper functions

In [4]:
batch_size = 8

In [5]:
def calculate_scores(preds, labels):
    pred_flat = np.argmax(np.concatenate(preds), axis=1).flatten()
    results = dict()
    results['precision_score'] = precision_score(labels, pred_flat, average='binary')
    results['recall_score'] = recall_score(labels, pred_flat, average='binary')
    results['f1_score'] = f1_score(labels, pred_flat, average='binary')
    return results


def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)


def format_time(elapsed):
    elapsed_rounded = int(round((elapsed)))
    return str(datetime.timedelta(seconds=elapsed_rounded))

## Preprocessing

For BERTweet we will only load the data and do not perform any preprocessing at all (even links + usernames will not be removed from the input that we feed to BERTweet)

In [33]:
def convert_label(label):
    if label == "rumour":
        return 1
    elif label == "non-rumour":
        return 0
    else:
        raise Exception("label classes must be 'rumour' or 'non-rumour'")
        
        
def convert_prediction(pred):
    if pred == 1:
        return "rumour"
    elif pred == 0:
        return "nonrumour"
    else:
        raise Exception("prediction classes must be '0' or '1'")

In [None]:
def load_data(data_file, label_file):
    
    if label_file != None:
        y_true = json.load(open(label_file))
    
    with open(data_file, 'r') as data_train:
        raw_list = list(data_train)

    data_list = []


    for event in raw_list:
        tweets_in_event = json.loads(event)

        tweet = {}

        tweet['id'] = tweets_in_event[0]['id']
        tweet['text'] = tweets_in_event[0]['text']
        
        # append text from follow-up tweets in tweet chain
        follow_up_tweets = ""
        for i in range(1, len(tweets_in_event)):
            follow_up_tweets = follow_up_tweets + tweets_in_event[i]['text'] + " "
        
        # Concatenate text from all tweets in the field 'text'
        tweet['text'] = tweet['text'] + " " + follow_up_tweets
        
        tweet['text'] = tweet['text'].strip()
        
        if label_file != None:
            tweet['label'] = convert_label(y_true[str(tweet['id'])])
        
        data_list.append(tweet)

    df = pd.DataFrame(data_list)

    return df

In [11]:
# train_df = load_data(data_file = '../data/train.data.jsonl', label_file = '../data/train.label.json')
# dev_df = load_data(data_file = '../data/dev.data.jsonl', label_file = '../data/dev.label.json')
# test_df = load_data(data_file = '../data/test.data.jsonl', label_file = None)

train_df = pd.read_csv('train.csv')
dev_df = pd.read_csv('dev.csv')
test_df = pd.read_csv('test.csv')

# BERTweet model

In [12]:
def bert_encode(df, tokenizer):
    input_ids = []
    attention_masks = []
    for sentence in df[["text"]].values:
        sentence = sentence.item()
        encoded_dict = tokenizer.encode_plus(
                            sentence,                      
                            add_special_tokens = True,  
                            max_length = 128,
                            pad_to_max_length = True,
                            truncation = True,
                            return_attention_mask = True,   
                            return_tensors = 'pt',    
                    )
           
        input_ids.append(encoded_dict['input_ids'])
        attention_masks.append(encoded_dict['attention_mask'])
    input_ids = torch.cat(input_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)

    inputs = {
    'input_word_ids': input_ids,
    'input_mask': attention_masks}

    return inputs

In [13]:
def prepare_dataloaders(train_df,test_df,dev_df, batch_size):
    # Load the AutoTokenizer with a normalization mode if the input Tweet is raw
    
    tokenizer = AutoTokenizer.from_pretrained("vinai/bertweet-base", use_fast=False, normalization=True)

    tweet_valid = bert_encode(dev_df, tokenizer)
    tweet_valid_labels = dev_df.label.astype(int)
    
    tweet_train = bert_encode(train_df, tokenizer)
    tweet_train_labels = train_df.label.astype(int)
    
    tweet_test = bert_encode(test_df, tokenizer)


    input_ids, attention_masks = tweet_train.values()
    labels = torch.tensor(tweet_train_labels.values)
    train_dataset = TensorDataset(input_ids, attention_masks, labels)
    
    input_ids, attention_masks = tweet_valid.values()
    labels = torch.tensor(tweet_valid_labels.values)
    val_dataset = TensorDataset(input_ids, attention_masks, labels)
    
    input_ids, attention_masks = tweet_test.values()
    test_dataset = TensorDataset(input_ids, attention_masks)

    
    train_dataloader = DataLoader(
                train_dataset,
                sampler = RandomSampler(train_dataset), 
                batch_size = batch_size 
            )


    validation_dataloader = DataLoader(
                val_dataset, 
                sampler = SequentialSampler(val_dataset),
                batch_size = batch_size 
            )


    test_dataloader = DataLoader(
                test_dataset, 
                sampler = SequentialSampler(test_dataset), 
                batch_size = batch_size
            )
    
    return train_dataloader,validation_dataloader,test_dataloader

In [14]:
train_dataloader,validation_dataloader,test_dataloader = prepare_dataloaders(train_df, test_df, dev_df, batch_size = batch_size)

Downloading:   0%|          | 0.00/558 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/824k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.03M [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


## Experiments

In [56]:
def test_encode(sentence):
    tokenizer = AutoTokenizer.from_pretrained("vinai/bertweet-base", use_fast=False, normalization=True)

    encoded_dict = tokenizer.encode_plus(
                        sentence,                      
                        add_special_tokens = True,
                        max_length = 512,
                        pad_to_max_length = True,
                        truncation = True,
                        return_attention_mask = True,   
                        return_tensors = 'pt',    
                )
           
    return encoded_dict['input_ids']

In [57]:
def test_decode(tokens):
    tokenizer = AutoTokenizer.from_pretrained("vinai/bertweet-base", use_fast=False, normalization=True)
    return tokenizer.convert_ids_to_tokens(tokens)

In [58]:
train_df.text[0]

'4. Can eating garlic help prevent infection with the new coronavirus? #COVID19Malaysia https://t.co/q133xXBiwl 5. Can regularly rinsing your nose with saline help prevent infection with the new coronavirus? https://t.co/ccMjhhD7BK 6. Do vaccines against pneumonia protect you against the new coronavirus? https://t.co/wL0mlEqU95 7. Can spraying alcohol or chlorine all over your body kill the new coronavirus? #Chamber https://t.co/zunVR7Ht0V 8. How effective are thermal scanners in detecting people infected with the new coronavirus? https://t.co/nyLOyKAb1H 9. Can an ultraviolet disinfection lamp kill the new coronavirus? https://t.co/ZrlllbkIjm 10. Are hand dryers effective in killing the new coronavirus? https://t.co/cSDKXO1bGr 11. The new coronavirus CANNOT be transmitted through mosquito bites. https://t.co/ZRL8bjRkpl 12. Taking a hot bath does not prevent the new coronavirus disease https://t.co/bICOqSTOuD 13. Cold weather and snow CANNOT kill the new coronavirus. https://t.co/7yeQQ6

In [59]:
text_test = train_df.text[0]
text_encoded = test_encode(text_test)
text_decoded = test_decode(text_encoded[0, :130])


print(f'Shape      : {text_encoded.shape}')
print(f'Word Ids   : {text_encoded}')
print(f'Decoded Words   : {text_decoded}')

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Shape      : torch.Size([1, 512])
Word Ids   : tensor([[    0,   204,     4,   427,  1114, 16282,   272,  5742, 12000,    30,
             6,   127,  1456,    21, 41887,  8658,    10,   211,     4,   427,
          9139, 23210,  1248,    44,  2756,    30,  4360,  1466,   272,  5742,
         12000,    30,     6,   127,  1456,    21,    10,   339,     4,   172,
         31965,   480, 38694,  2682,    14,   480,     6,   127,  1456,    21,
            10,   380,     4,   427, 47043,  4273,    72, 24660,  1466,    48,
           141,    44,   716,   897,     6,   127,  1456,    21,   995,  5553,
          4281,    10,   440,     4,   203,  5433,    41, 29620,  2228, 37527,
            16, 13486, 12120,    83, 12398,    30,     6,   127,  1456,    21,
            10,   540,     4,   427,    74, 36037, 31902,  1215, 12000, 17693,
           897,     6,   127,  1456,    21,    10,   251,     4,   533,   803,
          4691, 19110,  5433,    16,  1863,     6,   127,  1456,    21,    10,
     

## Prepare optimizer for BERTweet model

In [60]:
def prepare_model(num_classes, model_to_load=None, total_steps=-1):


    model = AutoModelForSequenceClassification.from_pretrained(
        "vinai/bertweet-base",
        num_labels = num_classes,  
        output_attentions = False, 
        output_hidden_states = False,
    )

    optimizer = AdamW(model.parameters(),
                    lr = 5e-5,
                    eps = 1e-8,
                    weight_decay = 1e-2
                    )
    
    scheduler = get_linear_schedule_with_warmup(optimizer, 
                                                num_warmup_steps = 0, 
                                                num_training_steps = total_steps)

    if model_to_load is not None:
        model.roberta.load_state_dict(torch.load(model_to_load))
        print("Loaded pre-trained model")

    return model, optimizer, scheduler

# BERTweet for development 

This BERTweet model was used for development purposes and makes use of the validation set to evaluate the performance on the given task.

## Training

In [61]:
epochs = 10
total_steps = len(train_dataloader) * epochs

model, optimizer, scheduler = prepare_model(num_classes=2, model_to_load=None, total_steps = total_steps)

Some weights of the model checkpoint at vinai/bertweet-base were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.weight', 'lm_head.decoder.weight', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.decoder.bias', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at vinai/bertweet-base and are newly initialized: 

In [62]:
def validate(model,validation_dataloader, val_labels):
    model.eval()
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    model.to(device)
    
    preds = []
    total_eval_accuracy = 0
    total_eval_loss = 0
    nb_eval_steps = 0
    t0 = time.time()
    
    for batch in validation_dataloader:
        
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)
        with torch.no_grad():        
            outputs = model(b_input_ids, 
                                   token_type_ids=None, 
                                   attention_mask=b_input_mask,
                                   labels=b_labels)
            loss = outputs.loss
            logits = outputs.logits

        total_eval_loss += loss.item()

        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()

        preds.append(logits)
        total_eval_accuracy += flat_accuracy(logits, label_ids)
        
    
    avg_val_accuracy = total_eval_accuracy / len(test_dataloader)
    print("  Accuracy: {0:.3f} %".format(avg_val_accuracy*100))
    avg_val_loss = total_eval_loss / len(test_dataloader)
    print("  Test Loss: {0:.3f}".format(avg_val_loss))
    
    scores = calculate_scores(preds, val_labels)
    print("  Precision Score: {0:.3f} %".format(scores['precision_score']*100))
    print("  Recall Score: {0:.3f} %".format(scores['recall_score']*100))
    print("  F1 Score: {0:.3f} %".format(scores['f1_score']*100))

    
    return preds, avg_val_accuracy, avg_val_loss

In [63]:
def train(model, optimizer, scheduler, train_dataloader, validation_dataloader, val_labels, epochs):
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    model.to(device)
    
    training_stats = []
    total_t0 = time.time()

    for epoch_i in range(0, epochs):

        print("")
        print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
        print('Training:')
        
        t0 = time.time()
        total_train_loss = 0
        model.train()
        for step, batch in enumerate(train_dataloader):
            if step % 40 == 0 and not step == 0:
                elapsed = format_time(time.time() - t0)
                print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))
            
            b_input_ids = batch[0].to(device)
            b_input_mask = batch[1].to(device)
            b_labels = batch[2].to(device)
            model.zero_grad() 
            
            outputs = model(b_input_ids, 
                                token_type_ids=None, 
                                attention_mask=b_input_mask, 
                                labels=b_labels)
            
            loss = outputs.loss
            logits = outputs.logits
            total_train_loss += loss.item()
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            scheduler.step()
        avg_train_loss = total_train_loss / len(train_dataloader)            
        training_time = format_time(time.time() - t0)

        print("")
        print("  Training loss: {0:.2f}".format(avg_train_loss))
        print("  Training epoch took: {:}".format(training_time))
            
        _, avg_val_accuracy, avg_val_loss = validate(model,validation_dataloader, val_labels)
        training_stats.append(
            {
                'epoch': epoch_i + 1,
                'Training Loss': avg_train_loss,
                'Valid. Loss': avg_val_loss,
                'Valid. Accur.': avg_val_accuracy,
                'Training Time': training_time
            }
        )

    print("")
    print("Training complete!")

    print("Total training took {:} (h:mm:ss)".format(format_time(time.time()-total_t0)))




In [64]:
train(model,optimizer,scheduler,train_dataloader,validation_dataloader, dev_df.label.astype(int), epochs)


Training:
  Batch    40  of    226.    Elapsed: 0:00:05.
  Batch    80  of    226.    Elapsed: 0:00:10.
  Batch   120  of    226.    Elapsed: 0:00:16.
  Batch   160  of    226.    Elapsed: 0:00:21.
  Batch   200  of    226.    Elapsed: 0:00:26.

  Training loss: 0.40
  Training epoch took: 0:00:29
  Accuracy: 96.071 %
  Test Loss: 0.285
  Precision Score: 80.000 %
  Recall Score: 72.993 %
  F1 Score: 76.336 %

Training:
  Batch    40  of    226.    Elapsed: 0:00:05.
  Batch    80  of    226.    Elapsed: 0:00:10.
  Batch   120  of    226.    Elapsed: 0:00:16.
  Batch   160  of    226.    Elapsed: 0:00:21.
  Batch   200  of    226.    Elapsed: 0:00:26.

  Training loss: 0.23
  Training epoch took: 0:00:29
  Accuracy: 100.179 %
  Test Loss: 0.262
  Precision Score: 85.507 %
  Recall Score: 86.131 %
  F1 Score: 85.818 %

Training:
  Batch    40  of    226.    Elapsed: 0:00:05.
  Batch    80  of    226.    Elapsed: 0:00:10.
  Batch   120  of    226.    Elapsed: 0:00:16.
  Batch   160  of  

In [65]:
torch.save(model.cpu().roberta.state_dict(),"model")

## Inference

In [66]:
def predict(model, test_dataloader):
    model.eval()
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    model.to(device)
    preds = []

    for batch in test_dataloader:
        
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        with torch.no_grad():        
            outputs = model(b_input_ids, 
                                   token_type_ids=None, 
                                   attention_mask=b_input_mask)
            logits = outputs.logits

        logits = logits.detach().cpu().numpy()
        for logit in logits:
            preds.append(logit)

    return preds

In [67]:
result = predict(model,test_dataloader)

In [68]:
from scipy.special import softmax

pred_labels = np.argmax(result, axis = 1)

with open('test.predictions.txt', 'w') as output:
  output.write('Id,Predicted\n')
  counter = 0
  for elem in list(pred_labels):
    output.write(str(counter) + ',' + str(elem)+'\n')
    counter+=1

pred_scores = softmax(result, axis=1)[:, 1]

predicted_labels = [convert_prediction(pred) for pred in pred_labels]

output = pd.DataFrame({'id':test_df.id,'target':predicted_labels})
output

Unnamed: 0,id,target
0,1246482832316301319,nonrumour
1,1252279738099433473,nonrumour
2,1236050255394877440,nonrumour
3,1235582115900796928,nonrumour
4,1258787515592572928,nonrumour
...,...,...
553,427944719612915712,nonrumour
554,531206167302012929,nonrumour
555,553099685888790528,rumour
556,1222928724112396288,nonrumour


In [None]:
submission = pd.Series(output.target.values,index=output.id).to_dict()
with open('test-output_v21.json', 'w') as f:
    json.dump(submission, f)

# BERTweet for CodaLab submission

For the CodaLab compeition, the train and development dataset has been merged to increase the size of the training set

In [37]:
combined_df = train_df.append(dev_df, ignore_index = True)
combined_df

Unnamed: 0.1,Unnamed: 0,text,label,id
0,0,4. Can eating garlic help prevent infection wi...,0,1250219300389974016
1,1,French police chief killed himself after #Char...,1,554886875303780352
2,2,Coronavirus disease (COVID-19) advice for the ...,0,1237901309011021825
3,3,Ottawa police confirm that there were multiple...,0,524958128392376320
4,4,if the primary focus of a government isn't to ...,0,1239295488677085185
...,...,...,...,...
2397,590,WHAT ARE THE TREATMENT OPTIONS FOR COVID-19 (I...,0,1249582429565829120
2398,591,"After speculation that he’s been arrested, Ban...",1,524881688825167872
2399,592,*Your questions answered*❓\n\n*Reply with the ...,0,1240908749256232960
2400,593,"►#Anonymous Operation #KKK ►Ku Klux Klan, We n...",1,661102820976930816


In [38]:
combined_df.text[0]

'4. Can eating garlic help prevent infection with the new coronavirus? #COVID19Malaysia https://t.co/q133xXBiwl 5. Can regularly rinsing your nose with saline help prevent infection with the new coronavirus? https://t.co/ccMjhhD7BK 6. Do vaccines against pneumonia protect you against the new coronavirus? https://t.co/wL0mlEqU95 7. Can spraying alcohol or chlorine all over your body kill the new coronavirus? #Chamber https://t.co/zunVR7Ht0V 8. How effective are thermal scanners in detecting people infected with the new coronavirus? https://t.co/nyLOyKAb1H 9. Can an ultraviolet disinfection lamp kill the new coronavirus? https://t.co/ZrlllbkIjm 10. Are hand dryers effective in killing the new coronavirus? https://t.co/cSDKXO1bGr 11. The new coronavirus CANNOT be transmitted through mosquito bites. https://t.co/ZRL8bjRkpl 12. Taking a hot bath does not prevent the new coronavirus disease https://t.co/bICOqSTOuD 13. Cold weather and snow CANNOT kill the new coronavirus. https://t.co/7yeQQ6

In [39]:
def prepare_dataloaders(combined_df, test_df, batch_size):    
    tokenizer = AutoTokenizer.from_pretrained("vinai/bertweet-base", use_fast=False, normalization=True)

    
    tweet_train = bert_encode(combined_df, tokenizer)
    tweet_train_labels = combined_df.label.astype(int)
    
    tweet_test = bert_encode(test_df, tokenizer)


    input_ids, attention_masks = tweet_train.values()
    labels = torch.tensor(tweet_train_labels.values)
    train_dataset = TensorDataset(input_ids, attention_masks, labels)
    
    input_ids, attention_masks = tweet_test.values()
    test_dataset = TensorDataset(input_ids, attention_masks)

    
    train_dataloader = DataLoader(
                train_dataset,
                sampler = RandomSampler(train_dataset), 
                batch_size = batch_size 
            )

    test_dataloader = DataLoader(
                test_dataset, 
                sampler = SequentialSampler(test_dataset), 
                batch_size = batch_size
            )
    
    return train_dataloader,test_dataloader

In [40]:
combined_dataloader,test_dataloader = prepare_dataloaders(combined_df, test_df, batch_size=batch_size)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [None]:
epochs = 7
total_steps = len(combined_dataloader) * epochs

model, optimizer, scheduler = prepare_model(num_classes=2, model_to_load = None, total_steps = total_steps)

Some weights of the model checkpoint at vinai/bertweet-base were not used when initializing RobertaForSequenceClassification: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'lm_head.decoder.bias', 'roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at vinai/bertweet-base and are newly initialized: 

## Training

In [None]:
def train(model,optimizer,scheduler,train_dataloader,epochs):
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    model.to(device)

    training_stats = []
    total_t0 = time.time()

    for epoch_i in range(0, epochs):

        print("")
        print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
        print('Training:')
        
        t0 = time.time()
        total_train_loss = 0
        model.train()
        for step, batch in enumerate(train_dataloader):
            if step % 40 == 0 and not step == 0:
                elapsed = format_time(time.time() - t0)
                print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))
            b_input_ids = batch[0].to(device)
            b_input_mask = batch[1].to(device)
            b_labels = batch[2].to(device)
            model.zero_grad()        
            outputs = model(b_input_ids, 
                                token_type_ids=None, 
                                attention_mask=b_input_mask, 
                                labels=b_labels)
            loss = outputs.loss
            logits = outputs.logits
            total_train_loss += loss.item()
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            scheduler.step()
        avg_train_loss = total_train_loss / len(train_dataloader)            
        training_time = format_time(time.time() - t0)

        print("")
        print("  Training loss: {0:.2f}".format(avg_train_loss))
        print("  Training epoch took: {:}".format(training_time))

    print("")
    print("Training complete!")

    print("Total training took {:} (h:mm:ss)".format(format_time(time.time()-total_t0)))

In [None]:
train(model,optimizer,scheduler,train_dataloader, epochs)


Training:
  Batch    40  of  1,161.    Elapsed: 0:00:12.
  Batch    80  of  1,161.    Elapsed: 0:00:22.
  Batch   120  of  1,161.    Elapsed: 0:00:33.
  Batch   160  of  1,161.    Elapsed: 0:00:44.
  Batch   200  of  1,161.    Elapsed: 0:00:54.
  Batch   240  of  1,161.    Elapsed: 0:01:05.
  Batch   280  of  1,161.    Elapsed: 0:01:16.
  Batch   320  of  1,161.    Elapsed: 0:01:26.
  Batch   360  of  1,161.    Elapsed: 0:01:37.
  Batch   400  of  1,161.    Elapsed: 0:01:48.
  Batch   440  of  1,161.    Elapsed: 0:01:58.
  Batch   480  of  1,161.    Elapsed: 0:02:09.
  Batch   520  of  1,161.    Elapsed: 0:02:20.
  Batch   560  of  1,161.    Elapsed: 0:02:30.
  Batch   600  of  1,161.    Elapsed: 0:02:41.
  Batch   640  of  1,161.    Elapsed: 0:02:52.
  Batch   680  of  1,161.    Elapsed: 0:03:02.
  Batch   720  of  1,161.    Elapsed: 0:03:13.
  Batch   760  of  1,161.    Elapsed: 0:03:24.
  Batch   800  of  1,161.    Elapsed: 0:03:34.
  Batch   840  of  1,161.    Elapsed: 0:03:45.
  

In [None]:
torch.save(model.cpu().roberta.state_dict(),"./bertweet/bertweet_v35")

## Inference

In [None]:
def predict(model,test_dataloader):
    model.eval()
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    model.to(device)
    preds = []

    for batch in test_dataloader:
        
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        with torch.no_grad():        
            outputs = model(b_input_ids, 
                                   token_type_ids=None, 
                                   attention_mask=b_input_mask)
            logits = outputs.logits

        logits = logits.detach().cpu().numpy()
        for logit in logits:
            preds.append(logit)

    return preds

In [None]:
result = predict(model,test_dataloader)

In [None]:
from scipy.special import softmax

pred_labels = np.argmax(result, axis = 1)

pred_scores = softmax(result, axis=1)[:, 1]

In [None]:
predicted_labels = [convert_prediction(pred) for pred in pred_labels]

output = pd.DataFrame({'id':test_df.id,'target':predicted_labels})
output

Unnamed: 0,id,target
0,544382249178001408,rumour
1,525027317551079424,rumour
2,544273220128739329,rumour
3,499571799764770816,non-rumour
4,552844104418091008,non-rumour
...,...,...
576,553581227165642752,non-rumour
577,552816302780579840,non-rumour
578,580350000074457088,rumour
579,498584409055174656,non-rumour


In [None]:
submission = pd.Series(output.target.values,index=output.id).to_dict()
with open('test-output.json', 'w') as f:
    json.dump(submission, f)

# Perform Inference on COVID dataset

In [None]:
covid_df = load_data(data_file = '../data/covid.data.jsonl', label_file = None)

In [None]:
covid_df

Unnamed: 0,id,text
0,1272262651100434433,"According to the New York Times, Warner Bros. ..."
1,1287153210990395392,Hurricane Hanna has made landfall in Texas.\n\...
2,1266555444283179008,Monkeys on the loose in India with stolen coro...
3,1257715199655755779,Eastleigh and Swahili Arabs in Mombasa where c...
4,1274338812173393920,"“If Trump felt comfortable having it here, the..."
...,...,...
17453,1249502859185590272,I wonder how many lives could’ve been saved if...
17454,1284050414619459586,The @thetimes front page on 17th March. The fi...
17455,1274505289614725122,Trump just completed the racism trifecta in a ...
17456,1267884642637676545,Here are a few of my photographs from today’s ...


In [None]:
train_dataloader,covid_dataloader = prepare_dataloaders(combined_df, covid_df, batch_size = batch_size)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [None]:
result = predict(model,covid_dataloader)

In [None]:
from scipy.special import softmax

pred_labels = np.argmax(result, axis = 1)

pred_scores = softmax(result, axis=1)[:, 1]

In [None]:
pred_labels

array([0, 0, 0, ..., 0, 0, 0])

In [None]:
predicted_labels = [convert_prediction(pred) for pred in pred_labels]

output = pd.DataFrame({'id':covid_df.id,'target':predicted_labels})
output

Unnamed: 0,id,target
0,1272262651100434433,non-rumour
1,1287153210990395392,non-rumour
2,1266555444283179008,non-rumour
3,1257715199655755779,non-rumour
4,1274338812173393920,non-rumour
...,...,...
17453,1249502859185590272,non-rumour
17454,1284050414619459586,non-rumour
17455,1274505289614725122,non-rumour
17456,1267884642637676545,non-rumour


In [None]:
submission = pd.Series(output.target.values,index=output.id).to_dict()
with open('covid-output.json', 'w') as f:
    json.dump(submission, f)