<a href="https://colab.research.google.com/github/QuangMinhPhan23/emotions-classification-for-tweets/blob/main/rnn.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install -U datasets huggingface_hub fsspec


Collecting fsspec
  Using cached fsspec-2025.7.0-py3-none-any.whl.metadata (12 kB)


In [None]:
from datasets import load_dataset
# Load the "split" configuration of dair-ai/emotion; later cells read its
# 'train', 'validation' and 'test' splits.
dataset = load_dataset(path="dair-ai/emotion", name="split")

In [None]:
# Peek at the first training example and at the label-id -> label-name list.
print(dataset['train'][0], dataset['train'].features['label'].names, sep="\n")

{'text': 'i didnt feel humiliated', 'label': 0}
['sadness', 'joy', 'love', 'anger', 'fear', 'surprise']


In [None]:
from collections import Counter
# Class balance of the training split: label id -> number of examples.
train_labels = dataset['train']['label']
Counter(train_labels)

Counter({0: 4666, 3: 2159, 2: 1304, 5: 572, 4: 1937, 1: 5362})

In [None]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
MAX_LEN = 50  # every text is truncated / padded to exactly this many token ids


def tokenize(example):
    """Tokenize the 'text' field to fixed-length token ids.

    Args:
        example: A mapping with a 'text' entry; either a single string or,
            when called through ``map(..., batched=True)``, a list of strings.

    Returns:
        The tokenizer output for that text, truncated and padded to MAX_LEN.
    """
    return tokenizer(example['text'], truncation=True, padding='max_length', max_length=MAX_LEN)


# batched=True hands the tokenizer whole batches of texts instead of one
# example per call; the resulting columns are the same, only faster to build.
dataset = dataset.map(tokenize, batched=True)


Map:   0%|          | 0/16000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

In [None]:
from torch.utils.data import DataLoader, Dataset
import torch
class EmotionDataset(Dataset):
    """Map-style dataset exposing tokenized examples as (features, label) pairs.

    Args:
        dataset_split: A mapping-like object whose 'input_ids',
            'attention_mask' and 'label' entries are index-aligned sequences.
    """

    def __init__(self, dataset_split):
        self.input_ids = dataset_split['input_ids']
        self.attention_mask = dataset_split['attention_mask']
        self.labels = dataset_split['label']

    def __len__(self):
        """Return the number of examples (one per label)."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return ``({'input_ids', 'attention_mask'}, label)`` for example ``idx``.

        All three values are returned as int64 tensors.
        """
        # torch.as_tensor accepts plain Python lists/ints and, unlike
        # torch.tensor, does not copy-construct (and warn) when the stored
        # element is already a tensor.
        features = {
            'input_ids': torch.as_tensor(self.input_ids[idx], dtype=torch.long),
            'attention_mask': torch.as_tensor(self.attention_mask[idx], dtype=torch.long),
        }
        return features, torch.as_tensor(self.labels[idx], dtype=torch.long)


# Wrap each tokenized split in an EmotionDataset (order: train, validation, test).
train_dataset, val_dataset, test_dataset = (
    EmotionDataset(dataset[split_name])
    for split_name in ('train', 'validation', 'test')
)


In [None]:
class RNNEmotionClassifier(torch.nn.Module):
    """Embedding -> single-layer GRU -> linear classifier over token ids.

    Args:
        vocab_size: Number of rows in the embedding table.
        embed_dim: Size of each token embedding.
        hidden_dim: Size of the GRU hidden state.
        output_dim: Number of output classes (logits per example).
        pad_idx: Token id used for padding; passed as the embedding's padding_idx.
    """

    def __init__(self, vocab_size, embed_dim, hidden_dim, output_dim, pad_idx):
        super().__init__()
        self.embedding = torch.nn.Embedding(vocab_size, embed_dim, padding_idx=pad_idx)
        self.rnn = torch.nn.GRU(embed_dim, hidden_dim, batch_first=True)
        self.fc = torch.nn.Linear(hidden_dim, output_dim)

    def forward(self, input_ids, attention_mask=None):
        """Return class logits of shape (batch, output_dim).

        Args:
            input_ids: Integer tensor of token ids, shape (batch, seq_len).
            attention_mask: Optional (batch, seq_len) tensor with 1 for real
                tokens and 0 for padding. Assumes padding sits at the end of
                each row. When given, the GRU stops at each row's last real
                token instead of also stepping through the padding positions.
                When omitted, the whole padded sequence is processed.
        """
        embedded = self.embedding(input_ids)
        if attention_mask is not None:
            # pack_padded_sequence needs CPU int64 lengths >= 1.
            lengths = attention_mask.sum(dim=1).long().clamp(min=1).cpu()
            embedded = torch.nn.utils.rnn.pack_padded_sequence(
                embedded, lengths, batch_first=True, enforce_sorted=False
            )
        _, hidden = self.rnn(embedded)
        # hidden is (num_layers, batch, hidden_dim); take the (only) last layer.
        return self.fc(hidden[-1])


In [None]:
# Sizes taken from the tokenizer so the embedding table matches its vocabulary.
vocab_size = tokenizer.vocab_size
pad_idx = tokenizer.pad_token_id

# Model hyperparameters.
embed_dim, hidden_dim = 128, 256
output_dim = 6  # 6 emotions

model = RNNEmotionClassifier(
    vocab_size=vocab_size,
    embed_dim=embed_dim,
    hidden_dim=hidden_dim,
    output_dim=output_dim,
    pad_idx=pad_idx,
)


In [None]:
from torch.utils.data import DataLoader, Dataset
BATCH_SIZE = 32

# Only the training loader shuffles; evaluation loaders keep dataset order.
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE)
# Wrap the held-out test split here (not the validation split), so the final
# report is computed on data that is distinct from val_loader.
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE)

In [None]:
# Adam over every model parameter, trained against a cross-entropy objective.
optimizer = torch.optim.Adam(params=model.parameters(), lr=0.001)
criterion = torch.nn.CrossEntropyLoss()

In [None]:
NUM_EPOCHS = 5

for epoch in range(NUM_EPOCHS):
    model.train()
    running_loss, seen = 0.0, 0
    for inputs, labels in train_loader:
        optimizer.zero_grad()
        outputs = model(inputs['input_ids'])
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        # Accumulate a sample-weighted sum so the reported figure is the mean
        # over the whole epoch rather than the (noisy) loss of the final batch.
        # Assumes criterion returns a per-batch mean (the CrossEntropyLoss default).
        batch_size = labels.size(0)
        running_loss += loss.item() * batch_size
        seen += batch_size
    print(f"Epoch {epoch} loss: {running_loss / max(seen, 1):.4f}")

  'input_ids': torch.tensor(self.input_ids[idx]),
  'attention_mask': torch.tensor(self.attention_mask[idx])
  }, torch.tensor(self.labels[idx])


Epoch 0 loss: 1.4803
Epoch 1 loss: 1.6445
Epoch 2 loss: 0.2620
Epoch 3 loss: 0.1680
Epoch 4 loss: 0.2176


In [None]:
from sklearn.metrics import classification_report, confusion_matrix
def evaluate(loader):
    """Run the global ``model`` over ``loader`` and collect predictions.

    Args:
        loader: Iterable of ``(features, labels)`` batches, where ``features``
            has an 'input_ids' entry.

    Returns:
        A ``(predictions, labels)`` pair of flat Python lists, in loader order.
    """
    model.eval()
    predicted, expected = [], []
    with torch.no_grad():
        for features, targets in loader:
            logits = model(features['input_ids'])
            # Highest-scoring class per example.
            predicted += logits.argmax(dim=1).tolist()
            expected += targets.tolist()
    return predicted, expected

# Score the model on test_loader and print per-class precision / recall / F1.
preds, labels = evaluate(test_loader)
print(
    classification_report(
        y_true=labels,
        y_pred=preds,
        target_names=dataset['train'].features['label'].names,
    )
)


  'input_ids': torch.tensor(self.input_ids[idx]),
  'attention_mask': torch.tensor(self.attention_mask[idx])
  }, torch.tensor(self.labels[idx])


              precision    recall  f1-score   support

     sadness       0.94      0.94      0.94       550
         joy       0.94      0.90      0.92       704
        love       0.73      0.85      0.79       178
       anger       0.90      0.90      0.90       275
        fear       0.87      0.82      0.84       212
    surprise       0.79      0.91      0.85        81

    accuracy                           0.90      2000
   macro avg       0.86      0.89      0.87      2000
weighted avg       0.90      0.90      0.90      2000

