<a href="https://www.kaggle.com/code/swarna21saha/bilstm?scriptVersionId=281498281" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/lastly/collect_preprocessed_dataset - collect_preprocessed_dataset.csv


In [2]:
!pip install scikit-learn pandas numpy torch

import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, hamming_loss, jaccard_score
from torch.nn.utils.rnn import pad_sequence


Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch)
  Downloading nvidia_curand_cu12-10.3.5

In [3]:
df = pd.read_csv("/kaggle/input/lastly/collect_preprocessed_dataset - collect_preprocessed_dataset.csv")

emotion_cols = ['Love','Joy','Anger','Surprise','Sadness','Fear','Hate']
df[emotion_cols] = df[emotion_cols].astype(int)

texts = df["Data"].astype(str).tolist()
labels = df[emotion_cols].values


In [4]:
from collections import Counter

def build_vocab(texts, min_freq=2):
    counter = Counter()
    for text in texts:
        counter.update(text.split())
    vocab = {"<PAD>":0, "<UNK>":1}
    for word, freq in counter.items():
        if freq >= min_freq:
            vocab[word] = len(vocab)
    return vocab

vocab = build_vocab(texts)


In [5]:
def encode(text):
    return torch.tensor([vocab.get(w, 1) for w in text.split()], dtype=torch.long)


In [6]:
class EmotionDataset(Dataset):
    def __init__(self, texts, labels):
        self.texts = [encode(t) for t in texts]
        self.labels = torch.tensor(labels, dtype=torch.float32)

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        return self.texts[idx], self.labels[idx]

def collate_fn(batch):
    sequences, labels = zip(*batch)
    padded = pad_sequence(sequences, batch_first=True, padding_value=0)
    return padded, torch.stack(labels)


In [7]:
X_train, X_val, y_train, y_val = train_test_split(
    texts, labels, test_size=0.2, random_state=42
)

train_ds = EmotionDataset(X_train, y_train)
val_ds = EmotionDataset(X_val, y_val)

train_loader = DataLoader(train_ds, batch_size=32, shuffle=True, collate_fn=collate_fn)
val_loader = DataLoader(val_ds, batch_size=32, shuffle=False, collate_fn=collate_fn)


In [8]:
class BiLSTMClassifier(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim, num_labels):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.lstm = nn.LSTM(embed_dim, hidden_dim, batch_first=True, bidirectional=True)
        self.fc = nn.Linear(hidden_dim*2, num_labels)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        x = self.embedding(x)
        out, _ = self.lstm(x)
        out = out[:, -1, :]   # last time step
        out = self.fc(out)
        return self.sigmoid(out)


In [9]:
model = BiLSTMClassifier(
    vocab_size=len(vocab),
    embed_dim=128,
    hidden_dim=256,
    num_labels=7
)

device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)


In [10]:
def train_epoch():
    model.train()
    total_loss = 0
    for x, y in train_loader:
        x, y = x.to(device), y.to(device)
        optimizer.zero_grad()
        preds = model(x)
        loss = criterion(preds, y)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    return total_loss / len(train_loader)


In [11]:
def evaluate():
    model.eval()
    all_preds, all_labels = [], []
    with torch.no_grad():
        for x, y in val_loader:
            x = x.to(device)
            preds = model(x).cpu().numpy()
            all_preds.append(preds)
            all_labels.append(y.numpy())

    preds = np.vstack(all_preds)
    labels = np.vstack(all_labels)

    preds_bin = (preds > 0.5).astype(int)

    metrics = {
        "accuracy_exact": accuracy_score(labels, preds_bin),
        "precision_macro": precision_score(labels, preds_bin, average="macro"),
        "recall_macro": recall_score(labels, preds_bin, average="macro"),
        "f1_macro": f1_score(labels, preds_bin, average="macro"),
        "hamming": hamming_loss(labels, preds_bin),
        "jaccard_samples": jaccard_score(labels, preds_bin, average="samples"),
    }
    return metrics


In [13]:
for epoch in range(20):
    loss = train_epoch()
    metrics = evaluate()
    print(f"\nEpoch {epoch+1}")
    print("Training loss:", loss)
    for k,v in metrics.items():
        print(f"{k}: {v:.4f}")


  _warn_prf(average, modifier, msg_start, len(result))



Epoch 1
Training loss: 0.05233919426822906
accuracy_exact: 0.5040
precision_macro: 0.6108
recall_macro: 0.5850
f1_macro: 0.5970
hamming: 0.1221
jaccard_samples: 0.5530


  _warn_prf(average, modifier, msg_start, len(result))



Epoch 2
Training loss: 0.04433044120562316
accuracy_exact: 0.5090
precision_macro: 0.6108
recall_macro: 0.5927
f1_macro: 0.6004
hamming: 0.1223
jaccard_samples: 0.5589


  _warn_prf(average, modifier, msg_start, len(result))



Epoch 3
Training loss: 0.038514328504652966
accuracy_exact: 0.5052
precision_macro: 0.6153
recall_macro: 0.5825
f1_macro: 0.5954
hamming: 0.1225
jaccard_samples: 0.5533


  _warn_prf(average, modifier, msg_start, len(result))



Epoch 4
Training loss: 0.033946550436165374
accuracy_exact: 0.5176
precision_macro: 0.6137
recall_macro: 0.6012
f1_macro: 0.6066
hamming: 0.1206
jaccard_samples: 0.5669


  _warn_prf(average, modifier, msg_start, len(result))



Epoch 5
Training loss: 0.028316812768067374
accuracy_exact: 0.5110
precision_macro: 0.6141
recall_macro: 0.6041
f1_macro: 0.6083
hamming: 0.1211
jaccard_samples: 0.5651


  _warn_prf(average, modifier, msg_start, len(result))



Epoch 6
Training loss: 0.027107818325982837
accuracy_exact: 0.5059
precision_macro: 0.6056
recall_macro: 0.6052
f1_macro: 0.6041
hamming: 0.1229
jaccard_samples: 0.5628


  _warn_prf(average, modifier, msg_start, len(result))



Epoch 7
Training loss: 0.024190769133414943
accuracy_exact: 0.5110
precision_macro: 0.6068
recall_macro: 0.6064
f1_macro: 0.6039
hamming: 0.1233
jaccard_samples: 0.5651


  _warn_prf(average, modifier, msg_start, len(result))



Epoch 8
Training loss: 0.02101260435084368
accuracy_exact: 0.5081
precision_macro: 0.6055
recall_macro: 0.6122
f1_macro: 0.6076
hamming: 0.1223
jaccard_samples: 0.5670


  _warn_prf(average, modifier, msg_start, len(result))



Epoch 9
Training loss: 0.01874416186216678
accuracy_exact: 0.5036
precision_macro: 0.5999
recall_macro: 0.6086
f1_macro: 0.6031
hamming: 0.1245
jaccard_samples: 0.5624


  _warn_prf(average, modifier, msg_start, len(result))



Epoch 10
Training loss: 0.020001800710669232
accuracy_exact: 0.5031
precision_macro: 0.6106
recall_macro: 0.5929
f1_macro: 0.5994
hamming: 0.1228
jaccard_samples: 0.5559


  _warn_prf(average, modifier, msg_start, len(result))



Epoch 11
Training loss: 0.01575228157020027
accuracy_exact: 0.5031
precision_macro: 0.6126
recall_macro: 0.5996
f1_macro: 0.6037
hamming: 0.1237
jaccard_samples: 0.5565


  _warn_prf(average, modifier, msg_start, len(result))



Epoch 12
Training loss: 0.01403747726450594
accuracy_exact: 0.5007
precision_macro: 0.6063
recall_macro: 0.6004
f1_macro: 0.6020
hamming: 0.1225
jaccard_samples: 0.5560


  _warn_prf(average, modifier, msg_start, len(result))



Epoch 13
Training loss: 0.010923587019042107
accuracy_exact: 0.5059
precision_macro: 0.6053
recall_macro: 0.6068
f1_macro: 0.6047
hamming: 0.1240
jaccard_samples: 0.5615


  _warn_prf(average, modifier, msg_start, len(result))



Epoch 14
Training loss: 0.010264394717649276
accuracy_exact: 0.4975
precision_macro: 0.5967
recall_macro: 0.5922
f1_macro: 0.5939
hamming: 0.1241
jaccard_samples: 0.5510


  _warn_prf(average, modifier, msg_start, len(result))



Epoch 15
Training loss: 0.015073432525952384
accuracy_exact: 0.5081
precision_macro: 0.6169
recall_macro: 0.5913
f1_macro: 0.6033
hamming: 0.1203
jaccard_samples: 0.5563


  _warn_prf(average, modifier, msg_start, len(result))



Epoch 16
Training loss: 0.01492587977738639
accuracy_exact: 0.5011
precision_macro: 0.6059
recall_macro: 0.5908
f1_macro: 0.5975
hamming: 0.1221
jaccard_samples: 0.5512


  _warn_prf(average, modifier, msg_start, len(result))



Epoch 17
Training loss: 0.009288254432730115
accuracy_exact: 0.4948
precision_macro: 0.6061
recall_macro: 0.5979
f1_macro: 0.5996
hamming: 0.1238
jaccard_samples: 0.5508


  _warn_prf(average, modifier, msg_start, len(result))



Epoch 18
Training loss: 0.010529981775002905
accuracy_exact: 0.4971
precision_macro: 0.6056
recall_macro: 0.5938
f1_macro: 0.5979
hamming: 0.1230
jaccard_samples: 0.5497


  _warn_prf(average, modifier, msg_start, len(result))



Epoch 19
Training loss: 0.011010574558268076
accuracy_exact: 0.5079
precision_macro: 0.6125
recall_macro: 0.6015
f1_macro: 0.6066
hamming: 0.1212
jaccard_samples: 0.5599

Epoch 20
Training loss: 0.008313596192070774
accuracy_exact: 0.5131
precision_macro: 0.6175
recall_macro: 0.5996
f1_macro: 0.6052
hamming: 0.1206
jaccard_samples: 0.5614


  _warn_prf(average, modifier, msg_start, len(result))


In [14]:
torch.save(model.state_dict(), "/kaggle/working/bilstm_emotion.pt")
print("Model saved!")


Model saved!


In [16]:
label_cols = ['Love','Joy','Anger','Surprise','Sadness','Fear','Hate']


In [17]:
def predict(text):
    model.eval()
    tokens = encode(text)
    tokens = tokens.unsqueeze(0).to(device)

    with torch.no_grad():
        probs = model(tokens).cpu().numpy()[0]

    return dict(zip(label_cols, probs))

print(predict("I am feeling very happy today"))


{'Love': 0.008630006, 'Joy': 0.18931852, 'Anger': 0.003203986, 'Surprise': 0.98320675, 'Sadness': 0.010412657, 'Fear': 0.018196339, 'Hate': 0.0039367643}
