To test if a general purpose sentiment analysis model can be used to predict the sentiment of political tweets.

In [9]:
# Define Model
import torch
import torch.nn as nn
from tqdm import tqdm
import pandas as pd
from transformers import RobertaModel
from transformers import RobertaTokenizer
from roberta_classifer import RobertaClassifier
    
# Instantiate the model
model = RobertaClassifier()

# Load pre-trained weights
#model.load_state_dict(torch.load('fine_tuned_roberta_classifier.pt'))

model.to('cuda')
# Set the model to evaluation mode
#model.eval()

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


RobertaClassifier(
  (bert): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): Laye

In [10]:
pro_dataset = pd.read_csv('data/pro-posts.csv')
against_dataset = pd.read_csv('data/against-posts.csv')
#Combine the datasets
dataset = pd.concat([pro_dataset, against_dataset], ignore_index=True)
#Shuffle the dataset
dataset = dataset.sample(frac=1).reset_index(drop=True)

from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(dataset['text'], dataset['label'], test_size=0.2, random_state=42)


# Train

In [11]:
train_inputs, train_masks = model.preprocess_for_roberta(x_train)
print('Done training masking.')
test_inputs, test_masks = model.preprocess_for_roberta(x_test)
print('Done testing masking.')


TypeError: RobertaClassifier.preprocess_for_roberta() takes 1 positional argument but 2 were given

In [None]:
# Batch Size
batch_size = 16

# Turn labels into a Tensor
y_train = torch.tensor(y_train)
y_test = torch.tensor(y_test)

# Create the DataLoader for our training set
train_dataloader = model.create_train_dataloader(train_inputs, train_masks, y_train, batch_size)

# Create the DataLoader for our testing set
test_dataloader = model.create_test_dataloader(test_inputs, test_masks, y_test, batch_size)




In [None]:
import time
import random
import gc
import numpy as np
# Loss
loss_fn = nn.CrossEntropyLoss()

random.seed(20)
np.random.seed(20)
torch.manual_seed(20)
torch.cuda.manual_seed_all(20)

def train(roberta_classifer, optimizer, scheduler, epochs=4):
    loss_hist = []
    for epoch in tqdm(range(epochs)):
        print(f"{'Epoch':^7} | {'Batch':^7} | {'Train Loss':^12} | {'Val Loss':^10} | {'Val Acc':^9} | {'Elapsed':^9}")

        # Measure the elapsed time of each epoch
        t0_epoch, t0_batch = time.time(), time.time()

        # Reset tracking variables at the beginning of each epoch
        total_loss, batch_loss, batch_counts = 0, 0, 0

        # Put the model into the training mode
        roberta_classifer.train()

        progress_bar = tqdm(total=len(train_dataloader), desc=f'Epoch {epoch+1}', position=0)
        for step, batch in enumerate(train_dataloader):
            batch_counts +=1

            # Load batch to GPU
            batch_inputs, batch_masks, batch_labels = tuple(t.to('cuda') for t in batch)

            # Zero out gradients
            roberta_classifer.zero_grad()

            # Perform a forward pass.
            logits = roberta_classifer(batch_inputs, batch_masks)

            # Compute loss
            loss = loss_fn(logits, batch_labels)
            batch_loss += loss.item()
            total_loss += loss.item()
            
            # Perform a backward pass
            loss.backward()

            
            # Clip norm
            torch.nn.utils.clip_grad_norm_(roberta_classifer.parameters(), 1.0)

            # step optimizer, update params
            optimizer.step()
            scheduler.step()
            
            progress_bar.update(1)
            progress_bar.set_postfix({'Elapsed': time.time() - t0_epoch, 'Loss': total_loss / batch_counts})
        
        # Calculate the average loss
        avg_train_loss = total_loss / len(train_dataloader)
        loss_hist.append(avg_train_loss)
        print("-"*100)
        




## Start Training Loop

In [None]:
import time
import random
import gc
import numpy as np
# Loss
loss_fn = nn.CrossEntropyLoss()

random.seed(20)
np.random.seed(20)
torch.manual_seed(20)
torch.cuda.manual_seed_all(20)

def train(roberta_classifier, optimizer, scheduler, epochs=4):
    loss_hist = []
    for epoch in tqdm(range(epochs)):
        print(f"{'Epoch':^7} | {'Batch':^7} | {'Train Loss':^12} | {'Val Loss':^10} | {'Val Acc':^9} | {'Elapsed':^9}")

        # Measure the elapsed time of each epoch
        t0_epoch, t0_batch = time.time(), time.time()

        # Reset tracking variables at the beginning of each epoch
        total_loss, batch_loss, batch_counts = 0, 0, 0

        # Put the model into the training mode
        roberta_classifier.train()

        progress_bar = tqdm(total=len(train_dataloader), desc=f'Epoch {epoch+1}', position=0)
        for step, batch in enumerate(train_dataloader):
            batch_counts +=1

            # Load batch to GPU
            batch_inputs, batch_masks, batch_labels = tuple(t.to('cuda') for t in batch)

            # Zero out gradients
            roberta_classifier.zero_grad()

            # Perform a forward pass.
            logits = roberta_classifier(batch_inputs, batch_masks)

            # Compute loss
            loss = loss_fn(logits, batch_labels)
            batch_loss += loss.item()
            total_loss += loss.item()
            
            # Perform a backward pass
            loss.backward()

            
            # Clip norm
            torch.nn.utils.clip_grad_norm_(roberta_classifier.parameters(), 1.0)

            # step optimizer, update params
            optimizer.step()
            scheduler.step()

            # Progress update every 20 batches.
            # if step % 20 == 0 and not step == 0:
            #     # Calculate elapsed time in minutes.
            #     elapsed = time.time() - t0_batch

            #     # Print training results
            #     print(f"{epoch:^7} | {step:^7} | {batch_loss / batch_counts:^12.6f} | {'-':^10} | {'-':^9} | {elapsed:^9.2f}")

            #     # Reset batch tracking variables
            #     batch_loss, batch_counts = 0, 0
            #     t0_batch = time.time()
            progress_bar.update(1)
            progress_bar.set_postfix({'Elapsed': time.time() - t0_epoch, 'Loss': total_loss / batch_counts})
        
        # Calculate the average loss
        avg_train_loss = total_loss / len(train_dataloader)
        loss_hist.append(avg_train_loss)
        print("-"*100)



In [None]:
num_epochs = 4


train(model, model.get_optimizer(), model.get_scheduler, epochs=num_epochs)

In [None]:
from sklearn.metrics import accuracy_score, roc_curve, auc
import matplotlib.pyplot as plt

def evaluate_roc(probs, y_true):
    """
    - Print AUC and accuracy on the test set
    - Plot ROC
    @params    probs (np.array): an array of predicted probabilities with shape (len(y_true), 2)
    @params    y_true (np.array): an array of the true values with shape (len(y_true),)
    """
    preds = probs[:, 1]
    fpr, tpr, threshold = roc_curve(y_true, preds)
    roc_auc = auc(fpr, tpr)
    print(f'AUC: {roc_auc:.4f}')
       
    # Get accuracy over the test set
    y_pred = np.where(preds >= 0.5, 1, 0)
    accuracy = accuracy_score(y_true, y_pred)
    print(f'Accuracy: {accuracy*100:.2f}%')
    
    # Plot ROC AUC
    figure = plt.figure()
    plt.title(f'Receiver Operating Characteristic')
    plt.plot(fpr, tpr, 'b', label = 'AUC = %0.2f' % roc_auc)
    plt.legend(loc = 'lower right')
    plt.plot([0, 1], [0, 1],'r--')
    plt.xlim([0, 1])
    plt.ylim([0, 1])
    plt.ylabel('True Positive Rate')
    plt.xlabel('False Positive Rate')
    plt.annotate(f'Acc: {accuracy*100:.2f}%', xy=(0.8, 0.2))
    plt.show()
    return figure

In [None]:
import torch.nn.functional as F

def roberta_predict(model, test_dataloader):
    """Perform a forward pass on the trained BERT model to predict probabilities
    on the test set.
    """
    # Put the model into the evaluation mode. The dropout layers are disabled during
    # the test time.

    all_logits = []

    # For each batch in our test set...
    for batch in test_dataloader:
        # Load batch to GPU
        b_input_ids, b_attn_mask = tuple(t.to('cuda') for t in batch)[:2]

        # Compute logits
        with torch.no_grad():
            logits = model(b_input_ids, b_attn_mask)
        all_logits.append(logits)
    
    # Concatenate logits from each batch
    all_logits = torch.cat(all_logits, dim=0)

    # Apply softmax to calculate confidence
    probs = F.softmax(all_logits, dim=1).cpu()

    return probs


In [None]:
batch_size = 12
test_inputs, test_masks = model.preprocess_for_roberta(test_data)

# Create dataloader
test_dataloader = model.create_dataloader(test_inputs, test_masks, batch_size)

# Compute predicted probabilities on the test set
probs = torch.argmax(roberta_predict(model, test_dataloader), dim=1)



In [None]:
import gc
torch.cuda.empty_cache()
gc.collect()

In [None]:
labels_map = {0: "Negative", 1: "Positive"}

def create_prediction_dictionary(sentences, labels):
    prediction_dict = {}
    for i in range(len(sentences)):
        prediction_dict[sentences[i]] = labels_map[labels[i].item()]
    return prediction_dict




In [None]:
sentiment_dict = create_prediction_dictionary(test_data, probs)
sentiment_dict