In [1]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import contractions
import langid
import spacy
import re
import emoji
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from bs4 import BeautifulSoup
from sklearn import metrics
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [2]:
splits = {'train': 'simplified/train-00000-of-00001.parquet', 'validation': 'simplified/validation-00000-of-00001.parquet', 'test': 'simplified/test-00000-of-00001.parquet'}
train_df = pd.read_parquet("hf://datasets/google-research-datasets/go_emotions/" + splits["train"])
valid_df = pd.read_parquet("hf://datasets/google-research-datasets/go_emotions/" + splits["validation"])
test_df = pd.read_parquet("hf://datasets/google-research-datasets/go_emotions/" + splits["test"])

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
emotions = ['admiration', 'amusement', 'anger', 'annoyance', 'approval', 'caring', 'confusion', 'curiosity', 'desire', 'disappointment', 'disapproval', 'disgust', 'embarrassment', 'excitement', 'fear', 'gratitude', 'grief', 'joy', 'love', 'nervousness', 'optimism', 'pride', 'realization', 'relief', 'remorse', 'sadness', 'surprise', 'neutral']

In [4]:
id2label = {i:emotions[i] for i in range(len(emotions))}

In [5]:
for i in id2label:
    train_df[id2label[i]] = train_df.labels.apply(lambda x: 1 if i in x else 0)
    valid_df[id2label[i]] = valid_df.labels.apply(lambda x: 1 if i in x else 0)
    test_df[id2label[i]] = test_df.labels.apply(lambda x: 1 if i in x else 0)

In [6]:
def demojize_text(text):
    return emoji.demojize(text)

In [7]:
def clean_text(text):

    # Ensure the input is a string
    text = str(text).lower()
    
    # Remove specific patterns and unwanted characters
    text = re.sub(r'\:(.*?)\:', '', text)
    text = re.sub(r'\[.*?\]', '', text)
    
    # Remove HTML content
    text = BeautifulSoup(text, 'html.parser').get_text()
    # Remove URLs
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    
    # Remove HTML tags, newlines, and words with numbers
    text = re.sub(r'<.*?>+', '', text)
    text = re.sub(r'\n', '', text)
    text = re.sub(r'\w*\d\w*', '', text)
    
    # Remove all punctuation
    text = re.sub(r"[^\w\s]", "", text)  # Removes everything except word characters and spaces
    
    # Replace multiple spaces with a single space
    text = re.sub(r"\s+", " ", text).strip()
    return text

In [8]:
def remove_stopwords(text):
    new_text=[]
    for word in text.split():
        if word in stopwords.words('english'):
            new_text.append('')
        else:
            new_text.append(word)
            
    x=new_text[:]
    new_text.clear()
    return " ".join(x)

In [9]:
def text_preprocessing(df):
    df['text'] = df['text'].apply(lambda x: contractions.fix(x))
    df['text'] = df['text'].apply(lambda x: demojize_text(x))
    df['text'] = df['text'].apply(lambda x: clean_text(x))
    df['text'] = df['text'].apply(lambda x: remove_stopwords(x))
    return df

In [10]:
train_df = text_preprocessing(train_df)
test_df = text_preprocessing(test_df)
valid_df = text_preprocessing(valid_df)

  text = BeautifulSoup(text, 'html.parser').get_text()


In [11]:
# from transformers import BertTokenizer, BertModel
from transformers import RobertaTokenizer, RobertaModel
from torch.utils.data import Dataset, DataLoader
import torch

In [12]:
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

In [13]:
if torch.backends.mps.is_available():
   device = torch.device("mps")

In [14]:
MAX_LEN = 256
TRAIN_BATCH_SIZE = 16
VALID_BATCH_SIZE = 8
EPOCHS = 5
LEARNING_RATE = 2e-05

In [15]:
class EmotionDataset:
    def __init__(self,df,labels,tokenizer,max_len):
        self.data = df.text
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.targets = df[labels].values

    def __len__(self):
        return len(self.data)
    
    def __getitem__(self,idx):
        text = self.data[idx]
        inputs = self.tokenizer.encode_plus(
            text,
            truncation=True,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            return_token_type_ids=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs["token_type_ids"]
        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'targets': torch.tensor(self.targets[idx], dtype=torch.float)
        }

In [16]:
from sklearn.utils.class_weight import compute_class_weight

# Flatten the binary labels to calculate frequency per class
class_counts = train_df[emotions].sum(axis=0)
total_samples = train_df[emotions].shape[0]

# Compute weights: inverse frequency
class_weights = total_samples / (len(emotions) * class_counts)

print("Class Weights:", class_weights)

Class Weights: admiration         0.375389
amusement          0.665961
anger              0.989379
annoyance          0.627675
approval           0.527512
caring             1.426272
confusion          1.133302
curiosity          0.707603
desire             2.418654
disappointment     1.221716
disapproval        0.766744
disgust            1.955053
embarrassment      5.116690
excitement         1.817535
fear               2.601270
gratitude          0.582403
grief             20.134508
joy                1.067739
love               0.743220
nervousness        9.453397
optimism           0.980618
pride             13.967181
realization        1.396718
relief            10.133053
remorse            2.844692
sadness            1.169198
surprise           1.462601
neutral            0.109034
dtype: float64


In [17]:
training_set = EmotionDataset(train_df,emotions, tokenizer, MAX_LEN)
validation_set = EmotionDataset(valid_df,emotions, tokenizer, MAX_LEN)
testing_set = EmotionDataset(test_df,emotions, tokenizer, MAX_LEN)

In [18]:
train_loader = DataLoader(training_set, batch_size=TRAIN_BATCH_SIZE, 
                          num_workers=0, shuffle=True, pin_memory=True)
valid_loader = DataLoader(validation_set, batch_size=VALID_BATCH_SIZE, 
                          num_workers=0, shuffle=False, pin_memory=True)
test_loader = DataLoader(testing_set, batch_size=VALID_BATCH_SIZE, 
                          num_workers=0, shuffle=False, pin_memory=True)

In [19]:
class RoBERTaClass(torch.nn.Module):
    def __init__(self):
        super(RoBERTaClass, self).__init__()
        self.l1 = RobertaModel.from_pretrained('roberta-base', return_dict=True)
        self.l2 = torch.nn.Dropout(0.3)
        self.l3 = torch.nn.Linear(768, 28)

    def forward(self, ids, mask, token_type_ids):
        _, output_1= self.l1(ids, attention_mask = mask, token_type_ids = token_type_ids, return_dict=False)
        output_2 = self.l2(output_1)
        output = self.l3(output_2)
        return output

model = RoBERTaClass()
model.to(device)

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


RoBERTaClass(
  (l1): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNo

In [20]:
def loss_fn(outputs,class_weights, targets):
    class_weights_tensor = torch.tensor(class_weights, dtype=torch.float)
    class_weights_tensor = class_weights_tensor.to(device)
    return torch.nn.BCEWithLogitsLoss(pos_weight=class_weights_tensor)(outputs, targets)

In [21]:
from transformers import AdamW, get_linear_schedule_with_warmup

optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)
total_steps = len(train_loader) * EPOCHS
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)



In [22]:

def train(epoch, class_weights):
    model.train()  # Set model to training mode
    running_loss = 0.0  # Initialize running loss
    total_batches = len(train_loader)  # Total number of batches
    accumulation_steps = 4  # Accumulate gradients to simulate larger batch size

    for batch_idx, data in enumerate(train_loader, 1):  # Start counting batches from 1
        # Move data to the device
        input_ids = data['ids'].to(device, dtype=torch.long)
        attention_mask = data['mask'].to(device, dtype=torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype=torch.long)
        targets = data['targets'].to(device, dtype=torch.float)

        # Forward pass
        outputs = model(input_ids, attention_mask, token_type_ids)
        # Compute loss
        loss = loss_fn(outputs, class_weights, targets)
        loss = loss / accumulation_steps  # Scale loss for accumulation

        # Backward pass
        loss.backward()
        
        # Perform optimization step only after accumulation steps
        if (batch_idx % accumulation_steps) == 0:
            optimizer.step()
            scheduler.step()  # Update learning rate
            optimizer.zero_grad()

        # Accumulate loss
        running_loss += loss.item() * accumulation_steps  # Undo the scaling for tracking

    # Compute average loss for the epoch
    average_loss = running_loss / total_batches
    print(f'\nEpoch: {epoch} completed. Average Loss: {average_loss:.4f}\n')


In [23]:
def validation():
    model.eval()
    fin_targets=[]
    fin_outputs=[]
    with torch.no_grad():
        for _, data in enumerate(test_loader, 0):
            ids = data['ids'].to(device, dtype = torch.long)
            mask = data['mask'].to(device, dtype = torch.long)
            token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
            targets = data['targets'].to(device, dtype = torch.float)
            outputs = model(ids, mask, token_type_ids)
            fin_targets.extend(targets.cpu().detach().numpy().tolist())
            fin_outputs.extend(torch.sigmoid(outputs).cpu().detach().numpy().tolist())
    return fin_outputs, fin_targets

In [24]:
for epoch in range(EPOCHS):
    train(epoch,class_weights)

  class_weights_tensor = torch.tensor(class_weights, dtype=torch.float)



Epoch: 0 completed. Average Loss: 0.1758

F1 Score (Micro) -t1 = 0.00785052598524101
F1 Score (Macro) -t1 = 0.018601190476190476
F1 Score (Micro) -t2 = 0.02311960542540074
F1 Score (Macro) -t2 = 0.04367287540774253


  class_weights_tensor = torch.tensor(class_weights, dtype=torch.float)



Epoch: 1 completed. Average Loss: 0.1207

F1 Score (Micro) -t1 = 0.2315873437915448
F1 Score (Macro) -t1 = 0.24081845555017917
F1 Score (Micro) -t2 = 0.31791975526532534
F1 Score (Macro) -t2 = 0.32285622973955486


  class_weights_tensor = torch.tensor(class_weights, dtype=torch.float)



Epoch: 2 completed. Average Loss: 0.1012

F1 Score (Micro) -t1 = 0.3246250604741171
F1 Score (Macro) -t1 = 0.3312325012681277
F1 Score (Micro) -t2 = 0.37608131487889274
F1 Score (Macro) -t2 = 0.37243289857823964


  class_weights_tensor = torch.tensor(class_weights, dtype=torch.float)



Epoch: 3 completed. Average Loss: 0.0919

F1 Score (Micro) -t1 = 0.3645904631530915
F1 Score (Macro) -t1 = 0.3645036822713374
F1 Score (Micro) -t2 = 0.40468858647064665
F1 Score (Macro) -t2 = 0.3902262615500729


  class_weights_tensor = torch.tensor(class_weights, dtype=torch.float)



Epoch: 4 completed. Average Loss: 0.0855

F1 Score (Micro) -t1 = 0.37701657458563537
F1 Score (Macro) -t1 = 0.37237325017439143
F1 Score (Micro) -t2 = 0.40340076223981236
F1 Score (Macro) -t2 = 0.3857807002784775


In [None]:
outputs, targets = validation()

print("Scores with 0.5 threshold")
outputs_t1 = np.array(outputs) >= 0.5

f1_score_micro = metrics.f1_score(targets, outputs_t1, average='micro')
f1_score_macro = metrics.f1_score(targets, outputs_t1, average='macro')

print(f"F1 Score (Micro) = {f1_score_micro}")
print(f"F1 Score (Macro) = {f1_score_macro}")

print("Scores with 0.3 threshold")
outputs_t2 = np.array(outputs) >= 0.3

f1_score_micro = metrics.f1_score(targets, outputs_t2, average='micro')
f1_score_macro = metrics.f1_score(targets, outputs_t2, average='macro')

print(f"F1 Score (Micro) = {f1_score_micro}")
print(f"F1 Score (Macro) = {f1_score_macro}")