In [20]:
!pip install transformers datasets



In [21]:
import pandas as pd

# Read the training split and the development split from their CSV files.
train_data = pd.read_csv("/content/eng.csv")
dev_data = pd.read_csv("/content/eng_a.csv")

# Summarise both frames: dimensions first, then column names, then the NaN
# count per column. The training frame is always reported before the dev frame.
train_shape, dev_shape = train_data.shape, dev_data.shape
print("Train Dataset Shape:", train_shape)
print("Dev Dataset Shape:", dev_shape)

train_columns, dev_columns = train_data.columns, dev_data.columns
print("Train Columns:", train_columns)
print("Dev Columns:", dev_columns)

train_missing = train_data.isna().sum()
dev_missing = dev_data.isna().sum()
print("Missing Values in Train:\n", train_missing)
print("Missing Values in Dev:\n", dev_missing)


Train Dataset Shape: (2768, 7)
Dev Dataset Shape: (116, 7)
Train Columns: Index(['id', 'text', 'Anger', 'Fear', 'Joy', 'Sadness', 'Surprise'], dtype='object')
Dev Columns: Index(['id', 'text', 'Anger', 'Fear', 'Joy', 'Sadness', 'Surprise'], dtype='object')
Missing Values in Train:
 id          0
text        0
Anger       0
Fear        0
Joy         0
Sadness     0
Surprise    0
dtype: int64
Missing Values in Dev:
 id            0
text          0
Anger       116
Fear        116
Joy         116
Sadness     116
Surprise    116
dtype: int64


In [22]:
# Replace every missing cell of the development frame with 0, modifying the
# frame in place. NOTE(review): the inspection output shows the emotion columns
# of this split are entirely NaN, so the zeros act as placeholders, not labels.
dev_data.fillna(value=0, inplace=True)

In [23]:
import re

def preprocess_text(text):
    """Normalise a raw sentence to lowercase ASCII letters separated by single spaces.

    Args:
        text: The raw sentence. ``None`` and float NaN (how pandas represents a
            missing cell) are treated as empty text; any other non-string
            value is converted with ``str()`` before cleaning.

    Returns:
        The lowercased text with every character other than ``a``-``z`` and
        whitespace removed, runs of whitespace collapsed to one space, and
        leading/trailing whitespace stripped.
    """
    # NaN is the only float that compares unequal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    text = str(text).lower()
    text = re.sub(r'[^a-z\s]', '', text)  # drop digits, punctuation, non-ASCII
    text = re.sub(r'\s+', ' ', text).strip()  # collapse whitespace runs
    return text

# Clean the text column of both splits element by element with the shared normaliser.
train_data['text'] = train_data['text'].map(preprocess_text)
dev_data['text'] = dev_data['text'].map(preprocess_text)

In [24]:
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer

# Hold out 20% of the labelled rows for validation; the fixed random_state
# makes the partition repeatable across runs.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    train_data['text'],
    train_data.loc[:, ['Anger', 'Fear', 'Joy', 'Sadness', 'Surprise']].to_numpy(),
    test_size=0.2, random_state=42,
)

In [25]:
# Load the pretrained tokenizer for the 'bert-base-uncased' checkpoint, the same
# checkpoint name the classifier's encoder is created from further down.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [26]:
# Batch tokenization helper
def tokenize_texts(texts, tokenizer, max_length=128):
    """Encode a collection of texts as one padded batch of PyTorch tensors.

    Args:
        texts: Iterable of strings; it is materialised into a list first.
        tokenizer: Callable that accepts a list of strings plus the
            ``padding``, ``truncation``, ``max_length`` and ``return_tensors``
            keyword arguments.
        max_length: Truncation limit handed to the tokenizer.

    Returns:
        Whatever ``tokenizer`` returns for the batch.
    """
    batch = list(texts)
    options = {
        "padding": True,
        "truncation": True,
        "max_length": max_length,
        "return_tensors": "pt",
    }
    return tokenizer(batch, **options)

# Encode the three text collections (train, validation, dev — in that order)
# with the shared tokenizer and the default max_length.
train_encodings, val_encodings, dev_encodings = (
    tokenize_texts(split_texts, tokenizer)
    for split_texts in (train_texts, val_texts, dev_data['text'])
)

In [27]:
import torch
from torch import nn
from transformers import BertModel

# Model definition
class MultiLabelEmotionClassifier(nn.Module):
    """BERT encoder followed by one linear layer that emits a logit per label.

    Args:
        model_name: Checkpoint identifier passed to ``BertModel.from_pretrained``.
        num_labels: Number of output logits.
    """

    def __init__(self, model_name, num_labels):
        super().__init__()
        self.bert = BertModel.from_pretrained(model_name)
        hidden_size = self.bert.config.hidden_size
        self.classifier = nn.Linear(hidden_size, num_labels)

    def forward(self, input_ids, attention_mask):
        """Return the classifier output for a batch.

        The encoder's ``pooler_output`` is fed straight into the linear layer;
        no activation is applied here, so the values are raw logits.
        """
        encoded = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        return self.classifier(encoded.pooler_output)

# Build the classifier with one output per emotion column.
num_labels = 5
model = MultiLabelEmotionClassifier(model_name='bert-base-uncased', num_labels=num_labels)

In [28]:
from torch.utils.data import DataLoader, Dataset
from transformers import AdamW

# Create Dataset class
class EmotionDataset(Dataset):
    """Map-style dataset pairing tokenizer encodings with multi-label targets.

    Args:
        encodings: Mapping from field name (e.g. ``input_ids``) to a sequence
            indexable by example position. Values may be tensors or nested
            lists.
        labels: Sequence of per-example label vectors, indexable by position.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        """Return the number of examples, taken from the label sequence."""
        return len(self.labels)

    @staticmethod
    def _as_tensor_copy(value, dtype=None):
        """Return an independent tensor holding ``value``, optionally cast to ``dtype``."""
        # torch.tensor(<Tensor>) emits a UserWarning recommending
        # clone().detach(), so existing tensors are copied that way and only
        # non-tensor data goes through torch.tensor.
        if isinstance(value, torch.Tensor):
            copied = value.clone().detach()
            return copied if dtype is None else copied.to(dtype)
        return torch.tensor(value, dtype=dtype)

    def __getitem__(self, idx):
        """Return one example as a dict of tensors.

        The dict holds every encoding field at position ``idx`` plus a
        ``labels`` entry containing the float label vector for that example.
        """
        item = {key: self._as_tensor_copy(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = self._as_tensor_copy(self.labels[idx], dtype=torch.float)
        return item

In [29]:
# Wrap the tokenized training and validation splits, with their label arrays,
# in dataset objects.
train_dataset = EmotionDataset(encodings=train_encodings, labels=train_labels)
val_dataset = EmotionDataset(encodings=val_encodings, labels=val_labels)

In [30]:
# Batch both datasets in groups of 32; only the training batches are shuffled.
train_loader = DataLoader(dataset=train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(dataset=val_dataset, batch_size=32)

In [31]:
# Optimizer and loss function.
# The AdamW class shipped with transformers is deprecated in favour of the
# PyTorch implementation, so torch.optim.AdamW is used here. eps and
# weight_decay are set explicitly to the values the transformers class
# defaulted to (1e-6 and 0.0); PyTorch's own defaults (1e-8 and 0.01) would
# otherwise change the optimisation.
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4, eps=1e-6, weight_decay=0.0)
# Binary cross-entropy on raw logits: each emotion is scored independently,
# which is what a multi-label target needs.
loss_fn = nn.BCEWithLogitsLoss()



In [32]:
# Fine-tune the classifier on the GPU when one is available, otherwise on CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

epochs = 10
for epoch in range(epochs):
    model.train()  # training-mode behaviour is re-enabled at the start of every epoch
    total_loss = 0
    for batch in train_loader:
        # Move only the tensors the model and the loss consume.
        input_ids, attention_mask, labels = (
            batch[field].to(device)
            for field in ('input_ids', 'attention_mask', 'labels')
        )

        optimizer.zero_grad()
        logits = model(input_ids, attention_mask)
        loss = loss_fn(logits, labels)
        loss.backward()
        optimizer.step()

        # Accumulate the scalar batch loss for the epoch average.
        total_loss += loss.item()

    # Mean of the per-batch losses for this epoch.
    avg_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch + 1}, Loss: {avg_loss}")

  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 1, Loss: 0.5277801769120353
Epoch 2, Loss: 0.3639319432633264
Epoch 3, Loss: 0.23224520512989588
Epoch 4, Loss: 0.13458579682878086
Epoch 5, Loss: 0.0843249094539455
Epoch 6, Loss: 0.055035713315010074
Epoch 7, Loss: 0.03315222029175077
Epoch 8, Loss: 0.02661127466708422
Epoch 9, Loss: 0.023717906445797
Epoch 10, Loss: 0.01656039927953056


In [33]:
from sklearn.metrics import f1_score

# Evaluation function
def evaluate_model(model, data_loader):
    """Compute the micro-averaged F1 score of ``model`` over ``data_loader``.

    Args:
        model: Callable taking ``(input_ids, attention_mask)`` and returning
            logits; it is switched to eval mode before scoring.
        data_loader: Iterable of batches, each a mapping with ``input_ids``,
            ``attention_mask`` and ``labels`` tensors.

    Returns:
        The micro-averaged F1 score, with a label predicted whenever its
        sigmoid probability exceeds 0.5.
    """
    model.eval()
    all_preds = []
    all_labels = []
    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)

            logits = model(input_ids, attention_mask)
            # Threshold on the tensor itself so this cell has no numpy dependency.
            binary_preds = (torch.sigmoid(logits) > 0.5).int().cpu().numpy()
            all_preds.extend(binary_preds)
            # The labels are only compared on the host, so they never need
            # to visit the device.
            all_labels.extend(batch['labels'].cpu().numpy())

    # The function previously ended after the loop and returned None.
    return f1_score(all_labels, all_preds, average='micro')

In [42]:
from sklearn.metrics import f1_score
import numpy as np

# Evaluation function
def evaluate_model(model, data_loader, threshold=0.5):
    """Compute the micro-averaged F1 score of ``model`` over ``data_loader``.

    Args:
        model: Callable taking ``(input_ids, attention_mask)`` and returning
            logits; it is switched to eval mode before scoring.
        data_loader: Iterable of batches, each a mapping with ``input_ids``,
            ``attention_mask`` and ``labels`` tensors.
        threshold: Sigmoid probability above which a label counts as
            predicted. Defaults to 0.5, the value that used to be hard-coded.

    Returns:
        The micro-averaged F1 score of the thresholded predictions.
    """
    model.eval()
    all_preds = []
    all_labels = []
    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)

            logits = model(input_ids, attention_mask)
            all_preds.extend(torch.sigmoid(logits).cpu().numpy())
            # The labels are only compared on the host, so skip the
            # host -> device -> host round trip.
            all_labels.extend(batch['labels'].cpu().numpy())

    # Turn probabilities into 0/1 decisions before scoring.
    binary_preds = (np.array(all_preds) > threshold).astype(int)
    return f1_score(all_labels, binary_preds, average='micro')

In [53]:
# Score the held-out validation split with a freshly built loader.
val_loader = DataLoader(dataset=val_dataset, batch_size=32)
f1_val = evaluate_model(model=model, data_loader=val_loader)
print("Validation F1 Score:", f1_val)

  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Validation F1 Score: 0.7149425287356321


In [54]:
# Same validation scoring as the previous cell; only the printed caption
# differs, spelling out that the metric is micro-averaged.
val_loader = DataLoader(dataset=val_dataset, batch_size=32)
f1_val = evaluate_model(model=model, data_loader=val_loader)
print("Micro-Average F1 Score (micro):", f1_val)


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Micro-Average F1 Score (micro): 0.7149425287356321


In [48]:
def predict_emotions(sentence, threshold=0.5):
    """Return the emotion labels the trained model assigns to one sentence.

    Args:
        sentence: Raw input text. It is normalised with ``preprocess_text``
            before tokenization, exactly as the training text was.
        threshold: Sigmoid probability above which an emotion is reported.
            Defaults to 0.5.

    Returns:
        A list with the names of the predicted emotions, in the order
        Anger, Fear, Joy, Sadness, Surprise. It may be empty.
    """
    # The model was fitted on lowercased, letters-only text; feed it input
    # from the same distribution.
    cleaned = preprocess_text(sentence)

    # Tokenize the input
    inputs = tokenizer(cleaned, return_tensors="pt", max_length=128, truncation=True, padding="max_length").to(device)

    # Put the model in inference mode. Without this the prediction depends on
    # whether an earlier cell happened to call model.eval() after training.
    model.eval()
    with torch.no_grad():
        # Only input_ids and attention_mask are part of the model's signature.
        logits = model(inputs['input_ids'], inputs['attention_mask'])
        probs = torch.sigmoid(logits)[0].tolist()

    # Same order as the label columns used to build the training targets.
    emotion_labels = ['Anger', 'Fear', 'Joy', 'Sadness', 'Surprise']

    return [label for label, prob in zip(emotion_labels, probs) if prob > threshold]

In [51]:
# Try the predictor on a single example sentence.
sentence = "She threw a party for her Graduation"
result = predict_emotions(sentence=sentence)
print("Predicted Emotions:", result)

Predicted Emotions: ['Joy']
