To test if a general purpose sentiment analysis model can be used to predict the sentiment of political tweets.

In [1]:
# Define Model
import torch
import torch.nn as nn
from tqdm import tqdm
from transformers import RobertaModel
from transformers import RobertaTokenizer

class RobertaClassifier(nn.Module):
    def __init__(self, freeze=False):
        super(RobertaClassifier, self).__init__()
        # Specify hidden size of BERT, hidden size of our classifier, and number of labels
        D_in, H, D_out = 768, 50, 2

        # Instantiate BERT model
        self.bert = RobertaModel.from_pretrained("roberta-base")

        # Instantiate an one-layer feed-forward classifier
        self.classifier = nn.Sequential(
            nn.Linear(D_in, H),
            nn.ReLU(),
            nn.Linear(H, D_out)
        )

        if freeze:
            for param in self.bert.parameters():
                param.requires_grad = False
    
    def forward(self, input_ids, attention_mask):

        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        last_hidden_state_cls = outputs[0][:, 0, :]

        logits = self.classifier(last_hidden_state_cls)

        return logits
    
# Instantiate the model
model = RobertaClassifier()

# Load pre-trained weights
model.load_state_dict(torch.load('final_roberta_classifier.pt'))

model.to('cuda')
# Set the model to evaluation mode
model.eval()

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.dense.bias', 'lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


RobertaClassifier(
  (bert): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): Laye

In [2]:
import re

def text_preprocessing(text):
    # Remove entity mentions starting with '@user'
    text = re.sub(r'@\w+', '', text)
    
    # Remove URLs
    text = re.sub(r'(http|https)://[^\s]+', '', text)
    
    # Remove text between two asterisks -- Usually emojis/symbols in the dataset
    text = re.sub(r'\*.*?\*', '', text)
    
    # Correct common errors
    text = text.replace('&amp;', '&')
    
    # Remove excess whitespace characters
    text = re.sub(r'\s+', ' ', text).strip()
    
    return text



In [3]:
def preprocess_for_roberta(data):
    # Initialize lists to store the input_ids and attention_masks
    input_ids = []
    attention_masks = []

    tokenizer = RobertaTokenizer.from_pretrained('roberta-base', truncation=True, do_lower_case=True)
    
    for text in tqdm(data):
        # Use tokenizer.encode_plus to tokenize and encode the text
        encoded = tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            padding='max_length',
            truncation=True,
            pad_to_max_length=True,
            max_length=360, # Determined from the processed text
            return_attention_mask=True,
        )

        # Add the input_ids and attention_mask to the lists
        input_ids.append(encoded.get('input_ids'))
        attention_masks.append(encoded.get('attention_mask'))

    # Convert the lists to tensors
    input_ids = torch.tensor(input_ids)
    attention_masks = torch.tensor(attention_masks)


    return input_ids, attention_masks

In [4]:
def input_to_roberta(text=""):
    text = text_preprocessing(text)
    ids, masks = preprocess_for_roberta(text)
    label = model(ids.to('cuda'), masks.to('cuda'))
    #print(f'Predicted Label: {label}')
    return label
    


In [5]:
import torch.nn.functional as F

def roberta_predict(model, test_dataloader):
    """Perform a forward pass on the trained BERT model to predict probabilities
    on the test set.
    """
    # Put the model into the evaluation mode. The dropout layers are disabled during
    # the test time.

    all_logits = []

    # For each batch in our test set...
    for batch in test_dataloader:
        # Load batch to GPU
        b_input_ids, b_attn_mask = tuple(t.to('cuda') for t in batch)[:2]

        # Compute logits
        with torch.no_grad():
            logits = model(b_input_ids, b_attn_mask)
        all_logits.append(logits)
    
    # Concatenate logits from each batch
    all_logits = torch.cat(all_logits, dim=0)

    # Apply softmax to calculate probabilities
    probs = F.softmax(all_logits, dim=1).cpu()

    return probs

In [6]:
def simplify_label(labels):
    labels = labels.float()
    column_means = torch.mean(labels, dim=0)
    return (column_means)

In [7]:
import gc
torch.cuda.empty_cache()
gc.collect()

69

In [8]:
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

# Create the DataLoader for our testing set
def create_dataloader(test_inputs, test_masks, batch_size):
    test_data = TensorDataset(test_inputs, test_masks)
    test_sampler = SequentialSampler(test_data)
    test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=batch_size)
    return test_dataloader


In [12]:
batch_size = 32
test_data = ["I want more gun control", "I'm against gun control"]
test_inputs, test_masks = preprocess_for_roberta(test_data)

# Create dataloader
test_dataloader = create_dataloader(test_inputs, test_masks, batch_size)

# Compute predicted probabilities on the test set
probs = torch.argmax(roberta_predict(model, test_dataloader), dim=1)



100%|██████████| 2/2 [00:00<00:00, 2050.50it/s]


In [13]:
labels_map = {0: "Negative", 1: "Positive"}

def create_prediction_dictionary(sentences, labels):
    prediction_dict = {}
    for i in range(len(sentences)):
        prediction_dict[sentences[i]] = labels_map[labels[i].item()]
    return prediction_dict




In [14]:
sentiment_dict = create_prediction_dictionary(test_data, probs)
sentiment_dict

{'I want more gun control': 'Negative', "I'm against gun control": 'Negative'}