In [1]:
from google.colab import files
uploaded = files.upload()

Saving tweet_emotions.csv to tweet_emotions.csv


In [4]:

import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
from sklearn.preprocessing import LabelEncoder
from torch.nn.utils.rnn import pad_sequence
import re
from collections import Counter
from tqdm import tqdm
import random

In [5]:

df = pd.read_csv('tweet_emotions.csv')

In [6]:
print(df.head())
print(df.columns)

     tweet_id   sentiment                                            content
0  1956967341       empty  @tiffanylue i know  i was listenin to bad habi...
1  1956967666     sadness  Layin n bed with a headache  ughhhh...waitin o...
2  1956967696     sadness                Funeral ceremony...gloomy friday...
3  1956967789  enthusiasm               wants to hang out with friends SOON!
4  1956968416     neutral  @dannycastillo We want to trade with someone w...
Index(['tweet_id', 'sentiment', 'content'], dtype='object')


In [7]:
#Encode Target Labels
le = LabelEncoder()
df['emotion_label'] = le.fit_transform(df['sentiment'])
num_classes = len(le.classes_)
print("Emotion Classes:", list(le.classes_))

Emotion Classes: ['anger', 'boredom', 'empty', 'enthusiasm', 'fun', 'happiness', 'hate', 'love', 'neutral', 'relief', 'sadness', 'surprise', 'worry']


In [8]:
#Train/Test split
train_df, test_df = train_test_split(df, test_size=0.2, stratify=df['emotion_label'], random_state=42)

#Tokenization & Vocabulary
def simple_tokenize(text):
    return re.findall(r'\b\w+\b', str(text).lower())

In [9]:
#Building vocab from training data
counter = Counter()
for text in train_df['content']:
    counter.update(simple_tokenize(text))
vocab = ['<PAD>', '<UNK>'] + [w for w, c in counter.items() if c >= 2]
word2idx = {w: i for i, w in enumerate(vocab)}

def encode(text):
    return [word2idx.get(w, word2idx['<UNK>']) for w in simple_tokenize(text)]

VOCAB_SIZE = len(word2idx)
MAXLEN = 40  # tweets are short

In [10]:
#Pytorch Dataset & Dataloader
class TextEmotionDataset(Dataset):
    def __init__(self, df, word2idx, maxlen=40):
        self.texts = df['content'].tolist()
        self.labels = df['emotion_label'].tolist()
        self.word2idx = word2idx
        self.maxlen = maxlen
    def __len__(self):
        return len(self.texts)
    def __getitem__(self, i):
        ids = encode(self.texts[i])
        if len(ids) > self.maxlen:
            ids = ids[:self.maxlen]
        return torch.tensor(ids, dtype=torch.long), torch.tensor(self.labels[i], dtype=torch.long)

def collate_fn(batch):
    texts, labels = zip(*batch)
    lens = [len(x) for x in texts]
    padded = pad_sequence(texts, batch_first=True, padding_value=0)
    if padded.size(1) < MAXLEN:
        pad = torch.zeros(padded.size(0), MAXLEN - padded.size(1), dtype=torch.long)
        padded = torch.cat([padded, pad], dim=1)
    elif padded.size(1) > MAXLEN:
        padded = padded[:, :MAXLEN]
    return padded, torch.tensor(labels)

train_ds = TextEmotionDataset(train_df, word2idx, MAXLEN)
test_ds = TextEmotionDataset(test_df, word2idx, MAXLEN)
train_loader = DataLoader(train_ds, batch_size=32, shuffle=True, collate_fn=collate_fn)
test_loader = DataLoader(test_ds, batch_size=32, shuffle=False, collate_fn=collate_fn)


In [11]:
#LSTM Model
class EmotionLSTM(nn.Module):
    def __init__(self, vocab_size, emb_dim, hid_dim, num_classes, n_layers=1, bidirectional=True, dropout=0.5):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, emb_dim, padding_idx=0)
        self.lstm = nn.LSTM(emb_dim, hid_dim, batch_first=True, num_layers=n_layers,
                            bidirectional=bidirectional, dropout=dropout)
        d = hid_dim * 2 if bidirectional else hid_dim
        self.fc = nn.Linear(d, num_classes)
    def forward(self, x):
        emb = self.embedding(x)
        _, (h_n, _) = self.lstm(emb)
        if self.lstm.bidirectional:
            h = torch.cat([h_n[-2], h_n[-1]], dim=-1)
        else:
            h = h_n[-1]
        return self.fc(h)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = EmotionLSTM(vocab_size=VOCAB_SIZE, emb_dim=100, hid_dim=128, num_classes=num_classes).to(device)



In [12]:
# 8. Train the Model
optimizer = optim.Adam(model.parameters(), lr=0.001)
criterion = nn.CrossEntropyLoss()

epochs = 7
for epoch in range(epochs):
    model.train()
    total_loss = 0
    for x, y in tqdm(train_loader):
        x, y = x.to(device), y.to(device)
        optimizer.zero_grad()
        output = model(x)
        loss = criterion(output, y)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    avg_loss = total_loss / len(train_loader)
    print(f'Epoch {epoch+1} - Loss: {avg_loss:.4f}')

#Accuracy and F1-score
model.eval()
all_preds, all_trues = [], []
with torch.no_grad():
    for x, y in test_loader:
        x = x.to(device)
        output = model(x)
        preds = torch.argmax(output, dim=1).cpu().numpy()
        all_preds.extend(preds)
        all_trues.extend(y.numpy())

accuracy = accuracy_score(all_trues, all_preds)
f1 = f1_score(all_trues, all_preds, average='weighted')
print(f'Accuracy: {accuracy:.3f}')
print(f'F1 score: {f1:.3f}')

100%|██████████| 1000/1000 [01:26<00:00, 11.58it/s]


Epoch 1 - Loss: 2.0411


100%|██████████| 1000/1000 [01:27<00:00, 11.48it/s]


Epoch 2 - Loss: 1.8743


100%|██████████| 1000/1000 [01:27<00:00, 11.48it/s]


Epoch 3 - Loss: 1.7513


100%|██████████| 1000/1000 [01:27<00:00, 11.45it/s]


Epoch 4 - Loss: 1.6116


100%|██████████| 1000/1000 [01:27<00:00, 11.43it/s]


Epoch 5 - Loss: 1.4415


100%|██████████| 1000/1000 [01:27<00:00, 11.44it/s]


Epoch 6 - Loss: 1.2434


100%|██████████| 1000/1000 [01:27<00:00, 11.46it/s]


Epoch 7 - Loss: 1.0357
Accuracy: 0.300
F1 score: 0.288


In [13]:
#Sample Predictions
sample_indices = np.random.choice(len(test_ds), 5, replace=False)
for idx in sample_indices:
    text = test_df.iloc[idx]['content']
    true = le.inverse_transform([test_df.iloc[idx]['emotion_label']])[0]

# Predict
    x_seq = torch.tensor(encode(text), dtype=torch.long).unsqueeze(0).to(device)
    if x_seq.size(1) < MAXLEN:
        pad = torch.zeros(1, MAXLEN - x_seq.size(1), dtype=torch.long).to(device)
        x_padded = torch.cat([x_seq, pad], dim=1)
    else:
        x_padded = x_seq[:, :MAXLEN]
    pred_idx = torch.argmax(model(x_padded)).item()
    pred = le.inverse_transform([pred_idx])[0]
    print(f"Text: {text}\nActual Emotion: {true}\nPredicted Emotion: {pred}\n{'-'*50}")

Text: back from taking jarid to the airport  but looking forward to meeting my new nephew in 5 days.
Actual Emotion: enthusiasm
Predicted Emotion: worry
--------------------------------------------------
Text: y do i only have 2 people following me  people follow me please x
Actual Emotion: worry
Predicted Emotion: neutral
--------------------------------------------------
Text: @mmitchelldaviss It saddens me, you're youtube has changed. It's just like every other youtube kid, you used to be different
Actual Emotion: worry
Predicted Emotion: neutral
--------------------------------------------------
Text: @alliemunchkin :3, Youtube loves my HD videos  well maybe.
Actual Emotion: happiness
Predicted Emotion: surprise
--------------------------------------------------
Text: They took my property
Actual Emotion: sadness
Predicted Emotion: worry
--------------------------------------------------


Sometimes the model gets confused by subtle or mixed emotions, or it might not have seen enough similar examples during training. Tweets can also be tricky—they’re short, informal, and can mean different things depending on context, which the model can miss.