In [1]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import contractions
import langid
import spacy
import re
import emoji
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from bs4 import BeautifulSoup
from spacy.language import Language
from spacy_language_detection import LanguageDetector
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [2]:
splits = {'train': 'simplified/train-00000-of-00001.parquet', 'validation': 'simplified/validation-00000-of-00001.parquet', 'test': 'simplified/test-00000-of-00001.parquet'}
train_df = pd.read_parquet("hf://datasets/google-research-datasets/go_emotions/" + splits["train"])
valid_df = pd.read_parquet("hf://datasets/google-research-datasets/go_emotions/" + splits["validation"])
test_df = pd.read_parquet("hf://datasets/google-research-datasets/go_emotions/" + splits["test"])

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
emotions = ['admiration', 'amusement', 'anger', 'annoyance', 'approval', 'caring', 'confusion', 'curiosity', 'desire', 'disappointment', 'disapproval', 'disgust', 'embarrassment', 'excitement', 'fear', 'gratitude', 'grief', 'joy', 'love', 'nervousness', 'optimism', 'pride', 'realization', 'relief', 'remorse', 'sadness', 'surprise', 'neutral']

In [4]:
len(emotions)

28

In [5]:
id2label = {i:emotions[i] for i in range(len(emotions))}

In [6]:
for i in id2label:
    train_df[id2label[i]] = train_df.labels.apply(lambda x: 1 if i in x else 0)
    valid_df[id2label[i]] = valid_df.labels.apply(lambda x: 1 if i in x else 0)
    test_df[id2label[i]] = test_df.labels.apply(lambda x: 1 if i in x else 0)

In [7]:
def demojize_text(text):
    return emoji.demojize(text)

In [8]:
def clean_text(text):

    # Ensure the input is a string
    text = str(text).lower()
    
    # Remove specific patterns and unwanted characters
    text = re.sub(r'\:(.*?)\:', '', text)
    text = re.sub(r'\[.*?\]', '', text)
    
    # Remove HTML content
    text = BeautifulSoup(text, 'html.parser').get_text()
    # Remove URLs
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    
    # Remove HTML tags, newlines, and words with numbers
    text = re.sub(r'<.*?>+', '', text)
    text = re.sub(r'\n', '', text)
    text = re.sub(r'\w*\d\w*', '', text)
    
    # Remove all punctuation
    text = re.sub(r"[^\w\s]", "", text)  # Removes everything except word characters and spaces
    
    # Replace multiple spaces with a single space
    text = re.sub(r"\s+", " ", text).strip()
    return text

In [9]:
def remove_stopwords(text):
    new_text=[]
    for word in text.split():
        if word in stopwords.words('english'):
            new_text.append('')
        else:
            new_text.append(word)
            
    x=new_text[:]
    new_text.clear()
    return " ".join(x)

In [10]:
def text_preprocessing(df):
    df['text'] = df['text'].apply(lambda x: contractions.fix(x))
    df['text'] = df['text'].apply(lambda x: demojize_text(x))
    df['text'] = df['text'].apply(lambda x: clean_text(x))
    df['text'] = df['text'].apply(lambda x: remove_stopwords(x))
    return df

In [11]:
train_df = text_preprocessing(train_df)
test_df = text_preprocessing(test_df)
valid_df = text_preprocessing(valid_df)

  text = BeautifulSoup(text, 'html.parser').get_text()


In [12]:
from transformers import BertTokenizer, BertModel
from torch.utils.data import Dataset, DataLoader
import torch

In [13]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [14]:
if torch.backends.mps.is_available():
   device = torch.device("mps")

In [15]:
MAX_LEN = 200
TRAIN_BATCH_SIZE = 8
VALID_BATCH_SIZE = 8
EPOCHS = 3
LEARNING_RATE = 1e-05

In [16]:
class EmotionDataset:
    def __init__(self,df,labels,tokenizer,max_len):
        self.data = df.text
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.targets = df[labels].values

    def __len__(self):
        return len(self.data)
    
    def __getitem__(self,idx):
        text = self.data[idx]
        inputs = self.tokenizer.encode_plus(
            text,
            truncation=True,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            return_token_type_ids=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs["token_type_ids"]
        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'targets': torch.tensor(self.targets[idx], dtype=torch.float)
        }

In [17]:
# emo_d = EmotionDataset(train_df,emotions,tokenizer,MAX_LEN)

In [18]:
training_set = EmotionDataset(train_df,emotions, tokenizer, MAX_LEN)
validation_set = EmotionDataset(valid_df,emotions, tokenizer, MAX_LEN)
testing_set = EmotionDataset(test_df,emotions, tokenizer, MAX_LEN)

In [19]:
train_loader = DataLoader(training_set, batch_size=TRAIN_BATCH_SIZE, 
                          num_workers=0, shuffle=True, pin_memory=True)
valid_loader = DataLoader(validation_set, batch_size=VALID_BATCH_SIZE, 
                          num_workers=0, shuffle=False, pin_memory=True)
test_loader = DataLoader(testing_set, batch_size=VALID_BATCH_SIZE, 
                          num_workers=0, shuffle=False, pin_memory=True)

In [20]:
class BERTClass(torch.nn.Module):
    def __init__(self):
        super(BERTClass, self).__init__()
        self.l1 = BertModel.from_pretrained('bert-base-uncased')
        self.l2 = torch.nn.Dropout(0.3)
        self.l3 = torch.nn.Linear(768, 28)

    def forward(self, ids, mask, token_type_ids):
        _, output_1= self.l1(ids, attention_mask = mask, token_type_ids = token_type_ids, return_dict=False)
        output_2 = self.l2(output_1)
        output = self.l3(output_2)
        return output

model = BERTClass()
model.to(device)

BERTClass(
  (l1): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affin

In [21]:
def loss_fn(outputs, targets):
    return torch.nn.BCEWithLogitsLoss()(outputs, targets)

In [22]:
optimizer = torch.optim.Adam(params =  model.parameters(), lr=LEARNING_RATE)

In [27]:
def train(epoch):
    model.train()  # Set model to training mode
    running_loss = 0.0  # Initialize running loss
    total_batches = len(train_loader)  # Total number of batches

    for batch_idx, data in enumerate(train_loader, 1):  # Start counting batches from 1
        # Move data to the device
        input_ids = data['ids'].to(device, dtype=torch.long)
        attention_mask = data['mask'].to(device, dtype=torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype=torch.long)
        targets = data['targets'].to(device, dtype=torch.float)

        # Forward pass
        outputs = model(input_ids, attention_mask, token_type_ids)
        # Compute loss
        loss = loss_fn(outputs, targets)

        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Accumulate loss
        running_loss += loss.item()

    # Compute average loss for the epoch
    average_loss = running_loss / total_batches
    print(f'\nEpoch: {epoch} completed. Average Loss: {average_loss:.4f}\n')


In [28]:
for epoch in range(EPOCHS):
    train(epoch)


Epoch: 0 completed. Average Loss: 0.1367


Epoch: 1 completed. Average Loss: 0.1017


Epoch: 2 completed. Average Loss: 0.0915



In [30]:
def validation(epoch):
    model.eval()
    fin_targets=[]
    fin_outputs=[]
    with torch.no_grad():
        for _, data in enumerate(test_loader, 0):
            ids = data['ids'].to(device, dtype = torch.long)
            mask = data['mask'].to(device, dtype = torch.long)
            token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
            targets = data['targets'].to(device, dtype = torch.float)
            outputs = model(ids, mask, token_type_ids)
            fin_targets.extend(targets.cpu().detach().numpy().tolist())
            fin_outputs.extend(torch.sigmoid(outputs).cpu().detach().numpy().tolist())
    return fin_outputs, fin_targets

In [31]:
from sklearn import metrics
for epoch in range(EPOCHS):
    outputs, targets = validation(epoch)
    outputs = np.array(outputs) >= 0.5
    accuracy = metrics.accuracy_score(targets, outputs)
    f1_score_micro = metrics.f1_score(targets, outputs, average='micro')
    f1_score_macro = metrics.f1_score(targets, outputs, average='macro')
    print(f"Accuracy Score = {accuracy}")
    print(f"F1 Score (Micro) = {f1_score_micro}")
    print(f"F1 Score (Macro) = {f1_score_macro}")

Accuracy Score = 0.3974571586511885
F1 Score (Micro) = 0.5214508580343213
F1 Score (Macro) = 0.34540146815399464
Accuracy Score = 0.3974571586511885
F1 Score (Micro) = 0.5214508580343213
F1 Score (Macro) = 0.34540146815399464
Accuracy Score = 0.3974571586511885
F1 Score (Micro) = 0.5214508580343213
F1 Score (Macro) = 0.34540146815399464
