## Prerequisites

In [22]:
!pip install emoji



In [23]:
!pip install torch torchvision transformers



In [24]:
import os
import random
import time
import datetime
import torch
import argparse
import numpy as np
import pandas as pd
from torch.nn import functional as F
from sklearn.metrics import f1_score, recall_score, precision_score, accuracy_score
from transformers import get_linear_schedule_with_warmup,AdamW,AutoModel, AutoTokenizer, AutoModelForSequenceClassification, AutoConfig
from torch.utils.data import TensorDataset,DataLoader, RandomSampler, SequentialSampler, Dataset

Define some helper functions

In [25]:
# Mini-batch size; prepare_dataloader binds this value as the default of its
# `batch_size` argument at definition time.
batch_size = 8

In [26]:
def calculate_scores(preds, labels):
    """Compute binary precision, recall and F1 from per-batch score arrays.

    Args:
        preds: Sequence of per-batch 2-D score arrays. They are concatenated
            along the first axis and arg-maxed over axis 1 to obtain one
            predicted class id per row.
        labels: Ground-truth class ids, in the same order as the rows of the
            concatenated predictions.

    Returns:
        dict with the keys 'precision_score', 'recall_score' and 'f1_score',
        each computed with ``average='binary'``.
    """
    predicted = np.argmax(np.concatenate(preds), axis=1).flatten()
    metric_fns = {
        'precision_score': precision_score,
        'recall_score': recall_score,
        'f1_score': f1_score,
    }
    return {
        name: metric(labels, predicted, average='binary')
        for name, metric in metric_fns.items()
    }


def flat_accuracy(preds, labels):
    """Return the fraction of rows whose arg-max class matches the label.

    Args:
        preds: 2-D array of per-class scores; the predicted class of a row is
            the arg-max over axis 1.
        labels: Array of true class ids; it is flattened before comparison.

    Returns:
        Number of matching positions divided by the number of labels.
    """
    predicted = np.argmax(preds, axis=1).flatten()
    expected = labels.flatten()
    n_correct = (predicted == expected).sum()
    return n_correct / len(expected)


def format_time(elapsed):
    """Format a duration given in seconds as a ``datetime.timedelta`` string.

    The value is rounded to the nearest whole second first, so the result
    looks like ``'0:01:24'`` (with a day prefix for durations of a day or more).
    """
    whole_seconds = int(round(elapsed))
    delta = datetime.timedelta(seconds=whole_seconds)
    return str(delta)

## Preprocessing

For BERTweet we only load the data and do not perform any manual preprocessing: links and usernames are left in the text that we feed to the BERTweet tokenizer.

In [27]:
# Read the train / dev / test splits from CSV files in the working directory.
train_df, dev_df, test_df = map(
    pd.read_csv,
    ('train_sr.csv', 'dev_sr.csv', 'test_sr.csv'),
)

# BERTweet model

In [28]:
def bert_encode(df, tokenizer, max_length=128):
    """Tokenize the ``text`` column of ``df`` into fixed-length id/mask tensors.

    Args:
        df: DataFrame with a ``text`` column holding one string per row.
        tokenizer: Hugging Face tokenizer; it is called once on the whole list
            of texts.
        max_length: Length every sequence is padded or truncated to
            (defaults to 128).

    Returns:
        dict with two entries, in this order:
            'input_word_ids': tensor of token ids, one row per input text.
            'input_mask': attention-mask tensor of the same shape.
    """
    texts = df["text"].tolist()

    # `padding='max_length'` replaces the deprecated `pad_to_max_length=True`.
    # Encoding the whole list in one call yields the same stacked tensors as
    # encoding row by row and concatenating along dim 0.
    encoded = tokenizer(
        texts,
        add_special_tokens=True,
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt',
    )

    return {
        'input_word_ids': encoded['input_ids'],
        'input_mask': encoded['attention_mask'],
    }

In [29]:
# Shared BERTweet tokenizer (slow implementation, normalization switched on);
# prepare_dataloader reads this module-level name.
tokenizer = AutoTokenizer.from_pretrained(
    "vinai/bertweet-base",
    use_fast=False,
    normalization=True,
)

def prepare_dataloader(df,  sorts, batch_size=batch_size):
    """Encode ``df`` with the module-level tokenizer and wrap it in a DataLoader.

    Args:
        df: DataFrame with a ``text`` column and, unless ``sorts == 'Test'``,
            a ``label`` column castable to int. Its index must be castable to int.
        sorts: Which split is being prepared:
            'Train' -> (input_ids, mask, labels, index) tuples, random order.
            'Dev'   -> (input_ids, mask, labels, index) tuples, sequential order.
            'Test'  -> (input_ids, mask, index) tuples, sequential order.
        batch_size: Batch size handed to the DataLoader.

    Returns:
        torch.utils.data.DataLoader over the encoded split.

    Raises:
        ValueError: If ``sorts`` is not 'Train', 'Dev' or 'Test'.
    """
    # Fail early with a clear message; previously an unknown value fell through
    # every branch and crashed with UnboundLocalError at the return statement.
    if sorts not in ('Train', 'Dev', 'Test'):
        raise ValueError(
            "sorts must be 'Train', 'Dev' or 'Test', got {!r}".format(sorts)
        )

    encoded = bert_encode(df, tokenizer)
    # Look the tensors up by key instead of relying on dict value order.
    input_ids = encoded['input_word_ids']
    attention_masks = encoded['input_mask']
    indexs = torch.tensor(df.index.astype(int).values)

    if sorts == 'Test':
        # The test split carries no labels.
        dataset = TensorDataset(input_ids, attention_masks, indexs)
    else:
        labels = torch.tensor(df.label.astype(int).values)
        dataset = TensorDataset(input_ids, attention_masks, labels, indexs)

    # Only the training split is shuffled.
    if sorts == 'Train':
        sampler = RandomSampler(dataset)
    else:
        sampler = SequentialSampler(dataset)

    return DataLoader(dataset, sampler=sampler, batch_size=batch_size)


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [30]:
# Build one DataLoader per split; the split tag selects labels and sampling order.
train_dataloader, validation_dataloader, test_dataloader = (
    prepare_dataloader(frame, sorts=split)
    for frame, split in ((train_df, 'Train'), (dev_df, 'Dev'), (test_df, 'Test'))
)



## Prepare model, optimizer and scheduler for BERTweet

In [31]:
def prepare_model(num_classes, model_to_load=None, total_steps=-1):
    """Build the BERTweet classifier together with its optimizer and LR scheduler.

    Args:
        num_classes: Number of output labels of the classification head.
        model_to_load: Optional path to a state dict for the ``roberta``
            sub-module (the format written by ``model.roberta.state_dict()``).
            When given, it is loaded on top of the pre-trained weights.
        total_steps: Total number of scheduler steps for the linear decay
            schedule (no warm-up). Callers that train should pass
            ``len(train_dataloader) * epochs``.

    Returns:
        Tuple ``(model, optimizer, scheduler)``.
    """
    configuration = AutoConfig.from_pretrained('vinai/bertweet-base')
    # Dropout is switched off for both hidden states and attention probabilities.
    configuration.hidden_dropout_prob = 0
    configuration.attention_probs_dropout_prob = 0
    configuration.num_labels = num_classes
    configuration.output_attentions = False
    configuration.output_hidden_states = False

    model = AutoModelForSequenceClassification.from_pretrained(
        "vinai/bertweet-base",
        config = configuration
    )

    if model_to_load is not None:
        # SECURITY: torch.load unpickles the file; only load checkpoints you trust.
        # map_location='cpu' lets a checkpoint be restored on a machine without a GPU
        # regardless of the device it was saved from.
        encoder_state = torch.load(model_to_load, map_location='cpu')
        model.roberta.load_state_dict(encoder_state)
        print("Loaded pre-trained model")

    # torch.optim.AdamW replaces the deprecated AdamW shipped with transformers.
    optimizer = torch.optim.AdamW(
        model.parameters(),
        lr = 5e-5,
        eps = 1e-8,
        weight_decay = 1e-2
    )

    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps = 0,
        num_training_steps = total_steps
    )

    return model, optimizer, scheduler

# BERTweet for development 

This BERTweet model was used for development purposes and makes use of the validation set to evaluate the performance on the given task.

## Training

In [32]:
epochs = 10
# The training loop steps the scheduler once per batch, so the schedule has to
# cover every batch of every epoch.
total_steps = epochs * len(train_dataloader)

model, optimizer, scheduler = prepare_model(
    num_classes=2,
    model_to_load=None,
    total_steps=total_steps,
)

Some weights of the model checkpoint at vinai/bertweet-base were not used when initializing RobertaForSequenceClassification: ['lm_head.decoder.bias', 'lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.decoder.weight', 'lm_head.dense.weight', 'roberta.pooler.dense.weight', 'lm_head.layer_norm.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at vinai/bertweet-base and are newly initialized: 

In [33]:
def validate(model,validation_dataloader, val_labels):
    """Evaluate ``model`` on a labelled dataloader without updating weights.

    Args:
        model: Sequence-classification model whose forward pass returns an
            object with ``loss`` and ``logits`` when ``labels`` are supplied.
        validation_dataloader: Iterable of batches laid out as
            ``(input_ids, attention_mask, labels, ...)``.
        val_labels: Ground-truth labels for the whole split, in dataloader
            order; passed to ``calculate_scores``.

    Returns:
        Tuple ``(preds, avg_val_accuracy, avg_val_loss)`` where ``preds`` is the
        list of per-batch logit arrays and both averages are means over batches.
    """
    model.eval()
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    model.to(device)

    preds = []
    total_eval_accuracy = 0
    total_eval_loss = 0

    for batch in validation_dataloader:

        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)
        with torch.no_grad():
            outputs = model(b_input_ids,
                                   token_type_ids=None,
                                   attention_mask=b_input_mask,
                                   labels=b_labels)
            loss = outputs.loss
            logits = outputs.logits

        total_eval_loss += loss.item()

        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()

        preds.append(logits)
        total_eval_accuracy += flat_accuracy(logits, label_ids)

    # Average over the batches of the loader that was actually evaluated. The
    # previous code divided by len(test_dataloader), a different (global) loader,
    # which skewed both numbers whenever the two splits differ in size.
    num_batches = len(validation_dataloader)
    avg_val_accuracy = total_eval_accuracy / num_batches
    # print("  Accuracy: {0:.3f} %".format(avg_val_accuracy*100))
    avg_val_loss = total_eval_loss / num_batches
    # print("  Test Loss: {0:.3f}".format(avg_val_loss))

    # Precision / recall / F1 are computed for the (currently disabled) report
    # below; they are not part of the return value.
    scores = calculate_scores(preds, val_labels)
    # print("  Precision Score: {0:.3f} %".format(scores['precision_score']*100))
    # print("  Recall Score: {0:.3f} %".format(scores['recall_score']*100))
    # print("  F1 Score: {0:.3f} %".format(scores['f1_score']*100))

    return preds, avg_val_accuracy, avg_val_loss

In [34]:
def train(model, optimizer, scheduler, train_dataloader, validation_dataloader, val_labels, epochs):
    """Fine-tune ``model`` for ``epochs`` epochs, validating after each one.

    Args:
        model: Sequence-classification model whose forward pass returns an
            object with a ``loss`` attribute when ``labels`` are supplied.
        optimizer: Optimizer over the model parameters.
        scheduler: Learning-rate scheduler, stepped once per batch.
        train_dataloader: Iterable of batches laid out as
            ``(input_ids, attention_mask, labels, ...)``.
        validation_dataloader: Labelled dataloader passed to ``validate``.
        val_labels: Ground-truth labels of the validation split, passed to
            ``validate``.
        epochs: Number of passes over ``train_dataloader``.

    Returns:
        List with one dict per epoch holding the keys 'epoch', 'Training Loss',
        'Valid. Loss', 'Valid. Accur.' and 'Training Time'. (These statistics
        used to be collected and then thrown away.)
    """
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    model.to(device)

    training_stats = []
    total_t0 = time.time()

    for epoch_i in range(0, epochs):

        print("")
        print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
        print('Training:')

        t0 = time.time()
        total_train_loss = 0
        # validate() puts the model in eval mode, so re-enable training mode each epoch.
        model.train()
        for step, batch in enumerate(train_dataloader):
            # Progress line every 400 batches (but not for the very first one).
            if step % 400 == 0 and not step == 0:
                elapsed = format_time(time.time() - t0)
                print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))

            b_input_ids = batch[0].to(device)
            b_input_mask = batch[1].to(device)
            b_labels = batch[2].to(device)
            # Clear gradients left over from the previous step.
            model.zero_grad()

            outputs = model(b_input_ids,
                                token_type_ids=None,
                                attention_mask=b_input_mask,
                                labels=b_labels)

            loss = outputs.loss
            total_train_loss += loss.item()
            loss.backward()
            # Clip the global gradient norm to 1.0 before the parameter update.
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            scheduler.step()
        avg_train_loss = total_train_loss / len(train_dataloader)
        training_time = format_time(time.time() - t0)

        print("")
        print("  Training loss: {0:.2f}".format(avg_train_loss))
        print("  Training epoch took: {:}".format(training_time))

        _, avg_val_accuracy, avg_val_loss = validate(model,validation_dataloader, val_labels)
        training_stats.append(
            {
                'epoch': epoch_i + 1,
                'Training Loss': avg_train_loss,
                'Valid. Loss': avg_val_loss,
                'Valid. Accur.': avg_val_accuracy,
                'Training Time': training_time
            }
        )

    print("")
    print("Training complete!")

    print("Total training took {:} (h:mm:ss)".format(format_time(time.time()-total_t0)))

    return training_stats




In [None]:
# Fine-tune on the training split; the dev labels are forwarded to validation.
train(
    model,
    optimizer,
    scheduler,
    train_dataloader,
    validation_dataloader,
    val_labels=dev_df.label.astype(int),
    epochs=epochs,
)


Training:
  Batch   400  of  2,836.    Elapsed: 0:01:24.


In [None]:
# Colab-only: mount Google Drive at /content/drive so the checkpoint saved in
# the next cell is written to Drive.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Only the `roberta` sub-module's weights are written (the matching load in
# prepare_model restores into model.roberta). Note that model.cpu() also moves
# the live model to the CPU.
torch.save(
    model.cpu().roberta.state_dict(),
    "/content/drive/MyDrive/Colab/model_bert",
)

# Prediction

In [None]:
def predict(model, test_dataloader):
    """Run ``model`` over a dataloader and collect one logit vector per example.

    Args:
        model: Model whose forward pass returns an object with a ``logits``
            attribute.
        test_dataloader: Iterable of batches whose first two items are the
            input ids and the attention mask; any further items are ignored.

    Returns:
        List of 1-D numpy arrays (one per example, in dataloader order).
    """
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    model.eval()
    model.to(device)

    preds = []
    # Inference only: no gradients are needed.
    with torch.no_grad():
        for batch in test_dataloader:
            input_ids = batch[0].to(device)
            input_mask = batch[1].to(device)
            outputs = model(input_ids,
                            token_type_ids=None,
                            attention_mask=input_mask)
            # Iterating the 2-D batch array yields one row per example.
            preds.extend(outputs.logits.detach().cpu().numpy())

    return preds

In [None]:
# Logits for the dev split, in dataloader (sequential) order.
result_dev = predict(model, test_dataloader=validation_dataloader)

In [None]:
from scipy.special import softmax

# Predicted class = arg-max over the logits of each example.
pred_labels = np.argmax(result_dev, axis=1)

# Write an "Id,Predicted" CSV where Id is the example's position in the dev split.
with open('dev.predictions_sr.txt', 'w') as output:
    output.write('Id,Predicted\n')
    for counter, elem in enumerate(pred_labels):
        output.write(str(counter) + ',' + str(elem) + '\n')

In [None]:
# Logits for the unlabeled test split, in dataloader (sequential) order.
result_test = predict(model, test_dataloader=test_dataloader)

In [None]:
from scipy.special import softmax

# Predicted class = arg-max over the logits of each example.
pred_labels = np.argmax(result_test, axis=1)

# Write an "Id,Predicted" CSV where Id is the example's position in the test split.
with open('test.predictions_sr.txt', 'w') as output:
    output.write('Id,Predicted\n')
    for counter, elem in enumerate(pred_labels):
        output.write(str(counter) + ',' + str(elem) + '\n')