In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report, f1_score
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from torch.utils.data import DataLoader, TensorDataset



# Encode labels: fit the mapping on the training emotions only, then reuse it
# for validation (LabelEncoder.transform raises ValueError for a label it did
# not see during fit).
label_encoder = LabelEncoder()
df['emotion_encoded'] = label_encoder.fit_transform(df['emotion'])
val_df['emotion_encoded'] = label_encoder.transform(val_df['emotion'])

# Load the HingBERT (l3cube-pune/hing-bert) tokenizer and model for sequence classification
tokenizer = AutoTokenizer.from_pretrained("l3cube-pune/hing-bert")
# Size the classification head from the fitted encoder instead of a hard-coded 8,
# so the number of logits always matches the number of emotion classes in the data.
model = AutoModelForSequenceClassification.from_pretrained(
    "l3cube-pune/hing-bert",
    num_labels=len(label_encoder.classes_),
)

# Tokenize and prepare data
def tokenize_data(data, tokenizer, max_length=128):
    """Tokenize the ``'utterance'`` column of ``data`` into fixed-length tensors.

    Args:
        data: Table-like object whose ``'utterance'`` column supports ``.tolist()``.
        tokenizer: Callable invoked with the list of utterances and the
            truncation / padding options below.
        max_length: Length every sequence is truncated or padded to.

    Returns:
        Whatever ``tokenizer`` returns for the batch (requested as PyTorch
        tensors via ``return_tensors='pt'``).
    """
    utterances = data['utterance'].tolist()
    encode_options = dict(
        truncation=True,
        padding='max_length',
        max_length=max_length,
        return_tensors='pt',
    )
    return tokenizer(utterances, **encode_options)

# Tokenize both splits with the shared tokenizer
train_tokenized = tokenize_data(df, tokenizer)
val_tokenized = tokenize_data(val_df, tokenizer)

# Convert labels to PyTorch tensors.
# The loss expects int64 (torch.long) class indices; set the dtype explicitly
# rather than inheriting whatever integer dtype the pandas column carries
# (it can be int32 where NumPy's default integer is 32-bit).
y_train = torch.tensor(df['emotion_encoded'].values, dtype=torch.long)
y_val = torch.tensor(val_df['emotion_encoded'].values, dtype=torch.long)

# Create DataLoader for training and validation.
# Each dataset item is (input_ids, attention_mask, label).
train_dataset = TensorDataset(train_tokenized['input_ids'], train_tokenized['attention_mask'], y_train)
val_dataset = TensorDataset(val_tokenized['input_ids'], val_tokenized['attention_mask'], y_val)

# Shuffle only the training split; validation order is kept fixed
train_dataloader = DataLoader(train_dataset, batch_size=64, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=64, shuffle=False)

# Training setup: use the GPU when one is available, otherwise the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model = model.to(device)  # Module.to returns the same module after moving it

# AdamW over every model parameter with a small fine-tuning learning rate
optimizer = torch.optim.AdamW(params=model.parameters(), lr=1e-5)
# Defined here, although the loop below reads the loss from the model output
criterion = torch.nn.CrossEntropyLoss()

# Number of full passes over the training data
num_epochs = 5

for epoch in range(num_epochs):
    # ---- Training pass ----
    model.train()
    for batch in train_dataloader:
        # A batch is (input_ids, attention_mask, labels); move all three to the device
        input_ids, attention_mask, labels = (tensor.to(device) for tensor in batch)

        optimizer.zero_grad()
        # Labels are passed in so the loss can be read from the model output
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

    # ---- Validation pass ----
    model.eval()
    val_predictions, val_true_labels = [], []

    with torch.no_grad():
        for batch in val_dataloader:
            input_ids, attention_mask, labels = (tensor.to(device) for tensor in batch)

            outputs = model(input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            # Predicted class = index of the largest logit in each row
            predictions = logits.argmax(dim=1).cpu().numpy()

            val_predictions.extend(predictions)
            val_true_labels.extend(labels.cpu().numpy())

    # Metrics over the whole validation split for this epoch
    val_accuracy = accuracy_score(val_true_labels, val_predictions)
    val_report = classification_report(val_true_labels, val_predictions)
    val_weighted_f1 = f1_score(val_true_labels, val_predictions, average='weighted')

    print(f"Epoch {epoch + 1}/{num_epochs}")
    print("Validation Accuracy:", val_accuracy)
    print("Validation Classification Report:\n", val_report)
    print("Validation Weighted F1 Score:", val_weighted_f1)
    print("-" * 50)


HingBERT with LSTM

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report, f1_score
import torch
from transformers import AutoTokenizer, AutoModel
from torch.utils.data import DataLoader, TensorDataset
from transformers import AdamW, get_linear_schedule_with_warmup
from torch.nn import LSTM, Dropout



# Encode labels: learn the emotion -> integer mapping from the training split,
# then apply that same mapping to both splits
label_encoder = LabelEncoder()
label_encoder.fit(df['emotion'])
df['emotion_encoded'] = label_encoder.transform(df['emotion'])
val_df['emotion_encoded'] = label_encoder.transform(val_df['emotion'])

# Load the Hinglish BERT tokenizer and the bare encoder (no classification head)
base_model = AutoModel.from_pretrained("l3cube-pune/hing-bert")
tokenizer = AutoTokenizer.from_pretrained("l3cube-pune/hing-bert")

# Define a custom classifier with an additional LSTM layer
class CustomClassifier(torch.nn.Module):
    """Transformer encoder followed by a bidirectional LSTM and a linear head.

    The encoder's per-token hidden states are run through a bidirectional
    LSTM, mean-pooled over the real (non-padding) tokens, passed through
    dropout and projected to ``num_labels`` logits.

    Args:
        base_model: Encoder module. It is called as
            ``base_model(input_ids, attention_mask=...)``, must return an object
            with a ``last_hidden_state`` attribute, and must expose
            ``config.hidden_size``.
        num_labels: Number of output classes.
        lstm_hidden_size: Hidden size of each LSTM direction (default 128).
        dropout: Dropout probability applied to the pooled vector (default 0.5).
    """

    def __init__(self, base_model, num_labels, lstm_hidden_size=128, dropout=0.5):
        super().__init__()
        self.bert = base_model
        # Read the encoder width from its config instead of assuming 768
        self.lstm = LSTM(
            input_size=base_model.config.hidden_size,
            hidden_size=lstm_hidden_size,
            batch_first=True,
            bidirectional=True,
        )
        self.dropout = Dropout(dropout)  # Experiment with dropout rate
        # A bidirectional LSTM concatenates forward and backward states
        self.classifier = torch.nn.Linear(2 * lstm_hidden_size, num_labels)

    def forward(self, input_ids, attention_mask):
        """Return logits of shape ``(batch, num_labels)``.

        Padding positions (``attention_mask == 0``) are excluded from both the
        LSTM and the mean pooling, so the result does not depend on how much
        padding a sequence carries.
        """
        outputs = self.bert(input_ids, attention_mask=attention_mask)
        last_hidden_state = outputs.last_hidden_state

        # Number of real tokens per row; clamp so a fully masked row cannot
        # produce a zero length (packing and the division below both need >= 1).
        lengths = attention_mask.sum(dim=1).clamp(min=1)

        # Pack so the LSTM (in particular its backward direction) never consumes
        # padding. NOTE: packing keeps the first `length` positions of each row,
        # i.e. it assumes right-padded batches - confirm if the tokenizer's
        # padding side is ever changed.
        packed_input = torch.nn.utils.rnn.pack_padded_sequence(
            last_hidden_state,
            lengths.cpu(),  # lengths must live on the CPU
            batch_first=True,
            enforce_sorted=False,
        )
        packed_output, _ = self.lstm(packed_input)
        lstm_output, _ = torch.nn.utils.rnn.pad_packed_sequence(
            packed_output,
            batch_first=True,
            total_length=last_hidden_state.size(1),
        )

        # Unpacked padding positions are zeros, so sum / length is the mean
        # over real tokens only.
        avg_pooled = lstm_output.sum(dim=1) / lengths.unsqueeze(1).to(lstm_output.dtype)
        dropout_output = self.dropout(avg_pooled)
        logits = self.classifier(dropout_output)
        return logits

# Wrap the pretrained encoder in the LSTM classifier, one logit per emotion class
model = CustomClassifier(
    base_model=base_model,
    num_labels=len(label_encoder.classes_),
)

# Tokenize and prepare data
def tokenize_data(data, tokenizer, max_length=128):
    """Run ``tokenizer`` over the ``'utterance'`` column of ``data``.

    Every utterance is truncated or padded to ``max_length`` and the result is
    requested as PyTorch tensors (``return_tensors='pt'``).

    Args:
        data: Table-like object whose ``'utterance'`` column supports ``.tolist()``.
        tokenizer: Callable tokenizer applied to the list of utterances.
        max_length: Fixed sequence length used for truncation and padding.

    Returns:
        The tokenizer's output for the whole column.
    """
    texts = data['utterance'].tolist()
    return tokenizer(
        texts,
        truncation=True,
        padding='max_length',
        max_length=max_length,
        return_tensors='pt',
    )

# Tokenize both splits with the shared tokenizer
train_tokenized = tokenize_data(df, tokenizer)
val_tokenized = tokenize_data(val_df, tokenizer)

# Convert labels to PyTorch tensors.
# CrossEntropyLoss expects int64 (torch.long) class indices; set the dtype
# explicitly rather than inheriting whatever integer dtype the pandas column
# carries (it can be int32 where NumPy's default integer is 32-bit).
y_train = torch.tensor(df['emotion_encoded'].values, dtype=torch.long)
y_val = torch.tensor(val_df['emotion_encoded'].values, dtype=torch.long)

# Create DataLoader for training and validation.
# Each dataset item is (input_ids, attention_mask, label).
train_dataset = TensorDataset(train_tokenized['input_ids'], train_tokenized['attention_mask'], y_train)
val_dataset = TensorDataset(val_tokenized['input_ids'], val_tokenized['attention_mask'], y_val)

# Shuffle only the training split; validation order is kept fixed
train_dataloader = DataLoader(train_dataset, batch_size=64, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=64, shuffle=False)

# Training setup: run on the GPU when one is available, otherwise on the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# transformers.AdamW is deprecated in favour of PyTorch's implementation.
# eps and weight_decay are set to the values transformers.AdamW used by default,
# so the optimizer hyper-parameters stay the same.
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5, eps=1e-6, weight_decay=0.0)
criterion = torch.nn.CrossEntropyLoss()

# Define the epoch budget in this cell so it does not silently depend on a
# value left in the kernel by an earlier cell.
num_epochs = 5

# Use a learning rate scheduler: linear decay over all optimizer steps, no warmup
total_steps = len(train_dataloader) * num_epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# Define early stopping parameters
best_val_accuracy = 0.0      # best validation accuracy seen so far
patience = 3                 # epochs without improvement tolerated before stopping
early_stopping_counter = 0   # consecutive epochs without improvement

# CPU snapshot of the best-scoring weights (None until the first improvement)
best_model_state = None

for epoch in range(num_epochs):
    # ---- Training pass ----
    model.train()
    for batch in train_dataloader:
        input_ids, attention_mask, labels = batch
        input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)

        optimizer.zero_grad()
        # The custom classifier returns raw logits, so the loss is computed here
        outputs = model(input_ids, attention_mask=attention_mask)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        scheduler.step()  # the schedule is defined per optimizer step, not per epoch

    # ---- Validation pass ----
    model.eval()
    val_predictions = []
    val_true_labels = []

    with torch.no_grad():
        for batch in val_dataloader:
            input_ids, attention_mask, labels = batch
            input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)

            outputs = model(input_ids, attention_mask=attention_mask)
            logits = outputs
            # Predicted class = index of the largest logit in each row
            predictions = torch.argmax(logits, dim=1).cpu().numpy()

            val_predictions.extend(predictions)
            val_true_labels.extend(labels.cpu().numpy())

    val_accuracy = accuracy_score(val_true_labels, val_predictions)
    val_report = classification_report(val_true_labels, val_predictions)
    val_weighted_f1 = f1_score(val_true_labels, val_predictions, average='weighted')

    print(f"Epoch {epoch + 1}/{num_epochs}")
    print("Validation Accuracy:", val_accuracy)
    print("Validation Classification Report:\n", val_report)
    print("Validation Weighted F1 Score:", val_weighted_f1)
    print("-" * 50)

    # Save the model if validation accuracy improves
    if val_accuracy > best_val_accuracy:
        best_val_accuracy = val_accuracy
        early_stopping_counter = 0
        # Keep a detached CPU copy of the weights so later, worse epochs
        # cannot overwrite the best model.
        best_model_state = {
            name: tensor.detach().cpu().clone()
            for name, tensor in model.state_dict().items()
        }
    else:
        early_stopping_counter += 1

    # Early stopping
    if early_stopping_counter >= patience:
        print("Early stopping. No improvement in validation accuracy.")
        break

# Leave `model` holding the best validation weights rather than the last epoch's
if best_model_state is not None:
    model.load_state_dict(best_model_state)
