### Overview

This notebook is a solution to a three-class sentiment classification problem, where the RoBERTa text classifier is used to classify Facebook posts into three different sentiment classes based on a training dataset of ~8000 Facebook posts.

- Language Model Used:
	- The RoBERTa model was proposed in RoBERTa: A Robustly Optimized BERT Pretraining Approach by Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, Veselin Stoyanov. It is based on Google’s BERT model released in 2018.
	- [Blog-Post](https://ai.facebook.com/blog/roberta-an-optimized-method-for-pretraining-self-supervised-nlp-systems/)
	- [Research Paper](https://arxiv.org/pdf/1907.11692)
	- [Documentation for python](https://huggingface.co/transformers/model_doc/roberta.html)

### Environment Setup

In [1]:
# Install the packages needed
!pip install imbalanced-learn
!pip install torch
!pip install transformers

Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl (410.6 MB)
Collecting nvidia-cufft-cu12==11.0.2.54 (from torch)
  Using cached nvidia_cufft_cu12-11.0.2.54-py3-none-manylinux1_x86_64.whl (121.6 MB)
Collecting nvidia-curand-cu12==10.3.2.106 (from torch)
  Using cached nvidia_curand_cu12-10.3.2.106-py3-none-manylinux1_x86_64.whl (56.5 MB)
Collectin

In [2]:
# Importing the libraries needed
import pandas as pd
import numpy as np
from imblearn.over_sampling import RandomOverSampler
from sklearn.model_selection import train_test_split
import torch
from torch import nn
import seaborn as sns
import transformers
import json
from tqdm import tqdm
from torch.utils.data import Dataset, DataLoader
from transformers import RobertaModel, RobertaTokenizer
import logging
logging.basicConfig(level=logging.ERROR)

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
# Setting up the device for GPU usage
from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'

### Preprocess the Dataset and Prepare Dataloaders

In [5]:
# Load and preprocess the data and labels

data = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/FB_posts_labeled.txt', delimiter='\t')
# Create label encoder for 3 classes: Appreciation, Complaint, and Feedback
data['label'] = 0
data.loc[data['Complaint'] == 1, 'label'] = 1
data.loc[data['Feedback'] == 1, 'label'] = 2
new_df = data[['message','label']]
# data = data.rename(columns={'message':'Phrase', 'label':'Sentiment'})
# new_df = data[['Phrase', 'Sentiment']]

In [6]:
# Address the data imbalance problem

# Splitting the data into train and validation sets
train_df, val_df = train_test_split(new_df, test_size=0.2, random_state=42, stratify=new_df['label'])

# Applying RandomOverSampler to handle imbalanced data
ros = RandomOverSampler(random_state=42)
train_features, train_labels = train_df['message'].values.reshape(-1, 1), train_df['label']
train_features_resampled, train_labels_resampled = ros.fit_resample(train_features, train_labels)

# Combine the resampled data back into a dataframe for easier processing
train_df_resampled = pd.DataFrame({
    'message': train_features_resampled.flatten(),  # Ensuring the features are a flat array
    'label': train_labels_resampled
}).reset_index(drop=True)

val_df = val_df.reset_index(drop=True)

In [7]:
# Defining key variables that will be used later on in the training

MAX_LEN = 256
TRAIN_BATCH_SIZE = 12
VALID_BATCH_SIZE = 10
LEARNING_RATE = 1e-05

tokenizer = RobertaTokenizer.from_pretrained('roberta-base', truncation=True, do_lower_case=True)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

In [8]:

class SentimentData(Dataset):
    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.text = dataframe.message
        self.targets = self.data.label
        self.max_len = max_len

    def __len__(self):
        return len(self.text)

    def __getitem__(self, index):
        text = str(self.text[index])
        text = " ".join(text.split())

        inputs = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            pad_to_max_length=True,
            return_token_type_ids=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs["token_type_ids"]


        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'targets': torch.tensor(self.targets[index], dtype=torch.float)
        }

In [9]:
# Update the dataset instances with resampled train data and untouched validation data
train_set = SentimentData(train_df_resampled, tokenizer, MAX_LEN)
valid_set = SentimentData(val_df, tokenizer, MAX_LEN)

# DataLoader instances as defined in your notebook
train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0}

valid_params = {'batch_size': VALID_BATCH_SIZE,
                'shuffle': False,
                'num_workers': 0}

training_loader = DataLoader(train_set, **train_params)
valid_loader = DataLoader(valid_set, **valid_params)

In [10]:
# Check the balanced dataset
train_df_resampled.label.value_counts()

label
2    3404
0    3404
1    3404
Name: count, dtype: int64

### Creat the Neural Network for Fine Tuning

#### Neural Network
 - Create a neural network with the `RobertaClass`
 - This network will have the Roberta Language model followed by a `dropout` and finally a `Linear` layer to obtain the final outputs.

#### Loss Function and Optimizer
 - The `Loss Function` is used the calculate the difference in the output created by the model and the actual output.
 - `Optimizer` is used to update the weights of the neural network to improve its performance.

In [11]:
class RobertaClass(torch.nn.Module):
    def __init__(self):
        super(RobertaClass, self).__init__()
        self.l1 = RobertaModel.from_pretrained("roberta-base")
        self.pre_classifier = torch.nn.Linear(768, 768)
        self.dropout = torch.nn.Dropout(0.3)
        self.classifier = torch.nn.Linear(768, 3)

    def forward(self, input_ids, attention_mask, token_type_ids):
        output_1 = self.l1(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
        hidden_state = output_1[0]
        pooler = hidden_state[:, 0]
        pooler = self.pre_classifier(pooler)
        pooler = torch.nn.ReLU()(pooler)
        pooler = self.dropout(pooler)
        output = self.classifier(pooler)
        return output

In [12]:
model = RobertaClass()
model.to(device)

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


RobertaClass(
  (l1): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((

### Fine Tune the Model

In [13]:
# Creating the loss function and optimizer
loss_function = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params =  model.parameters(), lr=LEARNING_RATE)

In [14]:
def calcuate_accuracy(preds, targets):
    n_correct = (preds==targets).sum().item()
    return n_correct

In [15]:
# Defining the training function on the 80% of the dataset for tuning the model

def train(epoch):
    tr_loss = 0
    n_correct = 0
    nb_tr_steps = 0
    nb_tr_examples = 0
    model.train()
    for _,data in tqdm(enumerate(training_loader, 0)):
        ids = data['ids'].to(device, dtype = torch.long)
        mask = data['mask'].to(device, dtype = torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
        targets = data['targets'].to(device, dtype = torch.long)

        outputs = model(ids, mask, token_type_ids)
        loss = loss_function(outputs, targets)
        tr_loss += loss.item()
        big_val, big_idx = torch.max(outputs.data, dim=1)
        n_correct += calcuate_accuracy(big_idx, targets)

        nb_tr_steps += 1
        nb_tr_examples+=targets.size(0)

        if _%5000==0:
            loss_step = tr_loss/nb_tr_steps
            accu_step = (n_correct*100)/nb_tr_examples
            print(f"Training Loss per 5000 steps: {loss_step}")
            print(f"Training Accuracy per 5000 steps: {accu_step}")

        optimizer.zero_grad()
        loss.backward()
        # # When using GPU
        optimizer.step()

    print(f'The Total Accuracy for Epoch {epoch}: {(n_correct*100)/nb_tr_examples}')
    epoch_loss = tr_loss/nb_tr_steps
    epoch_accu = (n_correct*100)/nb_tr_examples
    print(f"Training Loss Epoch: {epoch_loss}")
    print(f"Training Accuracy Epoch: {epoch_accu}")

    return

In [16]:
EPOCHS = 4
for epoch in range(EPOCHS):
    train(epoch)

0it [00:00, ?it/s]Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Training Loss per 5000 steps: 1.1893534660339355
Training Accuracy per 5000 steps: 8.333333333333334


851it [07:06,  1.99it/s]


The Total Accuracy for Epoch 0: 85.59537798668234
Training Loss Epoch: 0.3763131667935365
Training Accuracy Epoch: 85.59537798668234


1it [00:00,  2.03it/s]

Training Loss per 5000 steps: 0.08082928508520126
Training Accuracy per 5000 steps: 100.0


851it [07:10,  1.98it/s]


The Total Accuracy for Epoch 1: 94.44770857814336
Training Loss Epoch: 0.16937742554434912
Training Accuracy Epoch: 94.44770857814336


1it [00:00,  2.12it/s]

Training Loss per 5000 steps: 0.014671084471046925
Training Accuracy per 5000 steps: 100.0


851it [07:10,  1.98it/s]


The Total Accuracy for Epoch 2: 96.46494320407363
Training Loss Epoch: 0.10492863090485598
Training Accuracy Epoch: 96.46494320407363


1it [00:00,  2.04it/s]

Training Loss per 5000 steps: 0.026080826297402382
Training Accuracy per 5000 steps: 100.0


851it [07:10,  1.98it/s]

The Total Accuracy for Epoch 3: 97.66940853897376
Training Loss Epoch: 0.07261810825642999
Training Accuracy Epoch: 97.66940853897376





<a id='section06'></a>
### Validate the Model

In [17]:
def valid(model, valid_loader):
    model.eval()
    n_correct = 0; n_wrong = 0; total = 0; tr_loss=0; nb_tr_steps=0; nb_tr_examples=0
    with torch.no_grad():
        for _, data in tqdm(enumerate(valid_loader, 0)):
            ids = data['ids'].to(device, dtype = torch.long)
            mask = data['mask'].to(device, dtype = torch.long)
            token_type_ids = data['token_type_ids'].to(device, dtype=torch.long)
            targets = data['targets'].to(device, dtype = torch.long)
            outputs = model(ids, mask, token_type_ids).squeeze()
            loss = loss_function(outputs, targets)
            tr_loss += loss.item()
            big_val, big_idx = torch.max(outputs.data, dim=1)
            n_correct += calcuate_accuracy(big_idx, targets)

            nb_tr_steps += 1
            nb_tr_examples+=targets.size(0)

            if _%5000==0:
                loss_step = tr_loss/nb_tr_steps
                accu_step = (n_correct*100)/nb_tr_examples
                print(f"Validation Loss per 100 steps: {loss_step}")
                print(f"Validation Accuracy per 100 steps: {accu_step}")
    epoch_loss = tr_loss/nb_tr_steps
    epoch_accu = (n_correct*100)/nb_tr_examples
    print(f"Validation Loss Epoch: {epoch_loss}")
    print(f"Validation Accuracy Epoch: {epoch_accu}")

    return epoch_accu


In [18]:
acc = valid(model, valid_loader)
print("Accuracy on test data = %0.2f%%" % acc)

2it [00:00,  4.21it/s]

Validation Loss per 100 steps: 1.3606959581375122
Validation Accuracy per 100 steps: 70.0


160it [00:21,  7.56it/s]

Validation Loss Epoch: 0.37659847633040044
Validation Accuracy Epoch: 90.58380414312617
Accuracy on test data = 90.58%





<a id='section07'></a>
### Save the Trained Model Artifacts

In [19]:
output_model_file = 'pytorch_roberta_sentiment.bin'
output_vocab_file = './'

model_to_save = model
torch.save(model_to_save, output_model_file)
tokenizer.save_vocabulary(output_vocab_file)

print('All files saved')
print('This tutorial is completed')

All files saved
This tutorial is completed


<a id='section08'></a>
### Make Predictions Using the Model

In [20]:
# load actual unlabeled data
data_1 = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/FB_posts_unlabeled.txt', delimiter='\t')
data_1['label'] = 0
# data_1 = data_1.rename(columns={'message':'Phrase', 'label':'Sentiment'})
new_df_1 = data_1[['message', 'label']]

unlabeled_params = {'batch_size': VALID_BATCH_SIZE,
                'shuffle': False,
                'num_workers': 0
                }

unlabeled_set = SentimentData(new_df_1, tokenizer, MAX_LEN)
unlabeled_data_loader = DataLoader(unlabeled_set, **unlabeled_params)

In [21]:
model.eval()
n_correct = 0; n_wrong = 0; total = 0; tr_loss=0; nb_tr_steps=0; nb_tr_examples=0

results = []

with torch.no_grad():
    for _, data in tqdm(enumerate(unlabeled_data_loader, 0)):
        ids = data['ids'].to(device, dtype = torch.long)
        mask = data['mask'].to(device, dtype = torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype=torch.long)
        targets = data['targets'].to(device, dtype = torch.long)
        outputs = model(ids, mask, token_type_ids).squeeze()

        results.append(outputs)

204it [00:25,  8.04it/s]


In [22]:
# Process the output to readable csv file
results_df = []
for i in results:
  for j in i:
    results_df.append(j)

pred = []
for t in results_df:
  arr = t.cpu().numpy()
  idx = np.argmax(arr)
  pred.append(idx)

data_1['pred'] = pred

data_1['Appreciation_pred'] = np.where(data_1['pred']==0, 1, 0)
data_1['Complaint_pred'] = np.where(data_1['pred']==1, 1, 0)
data_1['Feedback_pred'] = np.where(data_1['pred']==2, 1, 0)

In [23]:
data_1[['postId', 'Appreciation_pred', 'Complaint_pred', 'Feedback_pred']].to_csv('output.csv',index=False)