Installs

In [None]:
!pip install datasets
!pip install wordcloud
!pip install textblob
!pip install nltk
!pip install tensorflow-gpu

Imports

In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
import torch.nn.functional as F
from transformers import BertTokenizer, BertConfig, AdamW, BertForSequenceClassification, get_linear_schedule_with_warmup
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, matthews_corrcoef
from tqdm import tqdm, trange, tnrange, tqdm_notebook
import random
import os
import io
from datasets import load_dataset
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re
import string

Load dataset

In [None]:
# Fetch the "unsplit" configuration of dair-ai/emotion and move its 'train'
# split into a pandas DataFrame.
dataset = load_dataset("dair-ai/emotion", "unsplit")
data = pd.DataFrame(dataset['train'])
print("data", len(data), data.shape)

# Continue with a reproducible 1% random subset (fixed random_state) and a
# fresh 0..n-1 index.
data = (
    data
    .sample(frac=0.01, random_state=42)
    .reset_index(drop=True)
)
print("data frac", len(data), data.shape)

Pre-processing

In [None]:
# NLTK corpora are downloaded as before; nothing in this cell reads them.
nltk.download('stopwords')
nltk.download('wordnet')

# Every sentence is padded / truncated to exactly this many token ids.
MAX_LEN = 256

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
print("data.text.val",len(data['text'].values))
print("data.text",len(data['text']))

# Encode all sentences in one call. `padding='max_length'` replaces the
# deprecated `pad_to_max_length=True`, and `truncation=True` makes the
# cut-off at MAX_LEN explicit instead of relying on an implicit default.
encoded = tokenizer(
    data['text'].tolist(),
    add_special_tokens=True,
    max_length=MAX_LEN,
    padding='max_length',
    truncation=True,
)
input_ids = encoded['input_ids']
print("inputs.ids",len(input_ids))

labels = data['label'].values

print("labels, data.labels",len(labels))

print("Actual sentence before tokenization: ", data['text'].values[2])
print("Encoded Input from dataset: ", input_ids[2])

# Use the tokenizer's own attention mask (1 = real token, 0 = padding) rather
# than inferring padding from `id > 0`, which hard-codes the pad token id.
attention_masks = encoded['attention_mask']
print(attention_masks[2])

# Split ids, masks and labels together so the three always stay aligned:
# 80% train, 20% held out, stratified by label.
(train_inputs, temp_inputs,
 train_masks, temp_masks,
 train_labels, temp_labels) = train_test_split(
    input_ids, attention_masks, labels,
    random_state=41, test_size=0.2, stratify=labels,
)

# Halve the held-out part into validation and test, again stratified.
(validation_inputs, test_inputs,
 validation_masks, test_masks,
 validation_labels, test_labels) = train_test_split(
    temp_inputs, temp_masks, temp_labels,
    random_state=41, test_size=0.5, stratify=temp_labels,
)

# Convert every split to tensors for TensorDataset / DataLoader.
train_inputs = torch.tensor(train_inputs)
validation_inputs = torch.tensor(validation_inputs)
test_inputs = torch.tensor(test_inputs)
train_labels = torch.tensor(train_labels)
validation_labels = torch.tensor(validation_labels)
test_labels = torch.tensor(test_labels)
train_masks = torch.tensor(train_masks)
validation_masks = torch.tensor(validation_masks)
test_masks = torch.tensor(test_masks)

print(len(train_inputs))
print(len(test_inputs))

Data loading for model

In [None]:
batch_size = 32

# Training batches are drawn in random order.
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)
print(len(train_data))

# Validation is evaluation only, so iterate in a fixed order exactly like the
# test loader; shuffling brings no benefit here and makes runs harder to compare.
validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels)
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=batch_size)

test_data = TensorDataset(test_inputs, test_masks, test_labels)
test_sampler = SequentialSampler(test_data)
test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=batch_size)
print(len(test_data))

Device setup

In [None]:
# Quick hardware overview, shown as the cell output. The device name is only
# queried when CUDA is present, because get_device_name(0) raises otherwise.
(
    torch.cuda.is_available(),
    tf.test.gpu_device_name(),
    torch.cuda.device_count(),
    torch.cuda.get_device_name(0) if torch.cuda.is_available() else None,
)

In [None]:
# Report whether TensorFlow can see a GPU (informational only).
if tf.test.gpu_device_name():
    print('GPU found')
else:
    print("No GPU found")

# Use CUDA when PyTorch can see it, otherwise fall back to the CPU. The
# selection is not overridden afterwards, so the notebook also runs without a GPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()

# Seed every random number generator used by the notebook.
SEED = 32
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
if device.type == "cuda":
    torch.cuda.manual_seed_all(SEED)
print(device)

Model initialisation

In [None]:
# Optimisation hyper-parameters.
epochs = 3
lr = 2e-5
adam_epsilon = 1e-8

# The linear schedule is stepped once per training batch, so its horizon is
# (batches per epoch) x (epochs); no warm-up steps are used.
num_warmup_steps = 0
num_training_steps = epochs * len(train_dataloader)

# Pre-trained BERT with a 6-way classification head, placed on the selected device.
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=6)
model = model.to(device)

optimizer = AdamW(
    model.parameters(),
    lr=lr,
    eps=adam_epsilon,
    correct_bias=False,
)
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=num_warmup_steps,
    num_training_steps=num_training_steps,
)

Training loop

In [None]:
# Per-epoch history (one entry per epoch), read later by the plotting cell.
train_loss_set = []
train_accuracy_set = []
learning_rate = []

model.zero_grad()

for epoch in tnrange(1, epochs + 1, desc='Epoch'):

    print("<" + "=" * 22 + f" Epoch {epoch} " + "=" * 22 + ">")

    # ------------------------------ training ------------------------------
    model.train()
    batch_loss = 0
    # Count correct predictions over all samples so a shorter final batch
    # does not get the same weight as a full one.
    train_correct, train_seen = 0, 0

    for batch in train_dataloader:
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch

        # Passing labels makes the model return the loss first, then the logits.
        outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)
        loss, logits = outputs[:2]

        loss.backward()

        # Clip the gradient norm to 1.0 before the parameter update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()
        scheduler.step()  # the learning-rate schedule advances once per batch
        optimizer.zero_grad()

        batch_loss += loss.item()

        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()
        pred_flat = np.argmax(logits, axis=1).flatten()
        labels_flat = label_ids.flatten()
        train_correct += int((pred_flat == labels_flat).sum())
        train_seen += labels_flat.size

    avg_train_loss = batch_loss / len(train_dataloader)
    avg_train_accuracy = train_correct / train_seen

    for param_group in optimizer.param_groups:
        print("\n\tCurrent Learning rate: ", param_group['lr'])
        learning_rate.append(param_group['lr'])

    train_loss_set.append(avg_train_loss)
    train_accuracy_set.append(avg_train_accuracy)
    print(f'\n\tAverage Training loss: {avg_train_loss}')
    print(f'\n\tAverage Training accuracy: {avg_train_accuracy}')

    # ----------------------------- validation -----------------------------
    model.eval()

    epoch_preds, epoch_labels = [], []

    for batch in validation_dataloader:
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        with torch.no_grad():
            logits = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)

        # Without labels the first element of the model output is the logits.
        logits = logits[0].to('cpu').numpy()
        label_ids = b_labels.to('cpu').numpy()

        epoch_preds.append(np.argmax(logits, axis=1).flatten())
        epoch_labels.append(label_ids.flatten())

    # Score the whole validation set at once: averaging per-batch scores is
    # biased by a short final batch, and MCC in particular is not the mean of
    # per-batch MCC values.
    pred_flat = np.concatenate(epoch_preds)
    labels_flat = np.concatenate(epoch_labels)

    # Predictions of the current epoch for the full validation set.
    df_metrics = pd.DataFrame({'Epoch': epoch, 'Actual_class': labels_flat, 'Predicted_class': pred_flat})

    eval_accuracy = accuracy_score(labels_flat, pred_flat)
    eval_mcc_accuracy = matthews_corrcoef(labels_flat, pred_flat)

    print(f'\n\tValidation Accuracy: {eval_accuracy}')
    print(f'\n\tValidation MCC Accuracy: {eval_mcc_accuracy}')

# Leave the model in evaluation mode for the cells that follow.
model.eval()

Testing loop

In [None]:
# Make sure dropout etc. are disabled even if this cell is run on its own.
model.eval()

test_preds, test_targets = [], []

for batch in test_dataloader:
    batch = tuple(t.to(device) for t in batch)
    b_input_ids, b_input_mask, b_labels = batch
    with torch.no_grad():
        logits = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)

    # Without labels the first element of the model output is the logits.
    logits = logits[0].to('cpu').numpy()
    label_ids = b_labels.to('cpu').numpy()

    test_preds.append(np.argmax(logits, axis=1).flatten())
    test_targets.append(label_ids.flatten())

# Expose predictions and labels for the ENTIRE test set under the names the
# plotting cell reads, instead of leaving only the last batch behind.
pred_flat = np.concatenate(test_preds)
labels_flat = np.concatenate(test_targets)

df_metrics = pd.DataFrame({'Actual_class': labels_flat, 'Predicted_class': pred_flat})

# Score the whole test set at once; a mean of per-batch scores is biased by a
# short final batch and is not a valid way to aggregate MCC.
test_accuracy = accuracy_score(labels_flat, pred_flat)
test_mcc_accuracy = matthews_corrcoef(labels_flat, pred_flat)

print(f'\n\tTest Accuracy: {test_accuracy}')
print(f'\n\tTest MCC Accuracy: {test_mcc_accuracy}')

Plotting functions and metrics

In [None]:
def plot_confusion_matrix(cm, classes, normalize=False, title='Confusion matrix', cmap=plt.cm.Blues):
    """
    Print a confusion matrix and draw it as an annotated heat map.

    The plot is drawn on the current matplotlib figure; the caller is
    responsible for calling `plt.show()`.

    Parameters
    ----------
    cm : array-like, 2-D
        Confusion matrix with true classes on rows and predicted classes on
        columns.
    classes : sequence
        Tick labels for both axes, in the same order as the rows of `cm`.
    normalize : bool, default False
        If True, every row is divided by its sum. Rows that sum to zero
        (classes with no true samples) are shown as zeros instead of NaN.
    title : str
        Figure title.
    cmap : matplotlib colormap
        Colormap passed to `plt.imshow`.
    """
    cm = np.asarray(cm)

    if normalize:
        row_totals = cm.sum(axis=1, keepdims=True)
        # Divide only where the row total is non-zero so empty rows do not
        # produce NaN cells (which would also turn the colour threshold into NaN).
        cm = np.divide(
            cm.astype('float'),
            row_totals,
            out=np.zeros(cm.shape, dtype=float),
            where=row_totals != 0,
        )
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    print(cm)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    # Integer counts are printed as-is; anything else gets two decimals
    # (the 'd' format would raise on a float matrix).
    fmt = 'd' if np.issubdtype(cm.dtype, np.integer) else '.2f'
    # Cells darker than half the maximum get white text for contrast.
    thresh = cm.max() / 2.
    for i, j in np.ndindex(cm.shape):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.tight_layout()

#  confusion matrix for test set
# NOTE(review): `labels_flat` / `pred_flat` are whatever the testing cell left
# behind; make sure they hold the entire test set and not just one batch.
class_ids = [0, 1, 2, 3, 4, 5]
# Passing `labels=` pins the matrix to 6x6 even when a class is absent from
# the data, so the rows/columns always line up with the tick labels.
cm = confusion_matrix(labels_flat, pred_flat, labels=class_ids)
plot_confusion_matrix(cm, classes=class_ids, title='Confusion Matrix for Test Set')
plt.show()

#  accuracy over epochs
# The x-axis follows the recorded history length, so the plot still works if
# training was stopped before all `epochs` finished.
plt.figure(figsize=(10,5))
plt.bar(range(1, len(train_accuracy_set) + 1), train_accuracy_set, color='skyblue', label='Train Accuracy')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.title('Train Accuracy over Epochs')
plt.legend()
plt.show()

#  loss over epochs
plt.figure(figsize=(10,5))
plt.bar(range(1, len(train_loss_set) + 1), train_loss_set, color='salmon', label='Train Loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.title('Train Loss over Epochs')
plt.legend()
plt.show()
