In [None]:
import csv
import glob
import json
import numpy as np
import pandas as pd
import tensorflow as tf
import re

from tqdm.auto import tqdm
from sklearn.metrics import confusion_matrix, classification_report

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torchmetrics import AUROC

from transformers import BertTokenizerFast as BertTokenizer, AlbertModel, AlbertTokenizer, BertModel, AdamW, get_linear_schedule_with_warmup
from transformers import AutoConfig, AutoTokenizer

import pytorch_lightning as pl
from pytorch_lightning.metrics.functional import accuracy, f1, auroc
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping
from pytorch_lightning.loggers import TensorBoardLogger


from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, multilabel_confusion_matrix

import seaborn as sns
from pylab import rcParams
import matplotlib.pyplot as plt
from matplotlib import rc

# Render matplotlib figures inside the notebook, using the 'retina' figure format.
%matplotlib inline
%config InlineBackend.figure_format='retina'

# Single seed handed to pl.seed_everything at the end of this cell.
RANDOM_SEED = 42

# Global plot styling: seaborn theme first, then the custom palette, then the
# default figure size (width, height).
sns.set(style='whitegrid', palette='muted', font_scale=1.2)
HAPPY_COLORS_PALETTE = ["#01BEFE", "#FFDD00", "#FF7D00", "#FF006D", "#ADFF02", "#8F00FF"]
sns.set_palette(sns.color_palette(HAPPY_COLORS_PALETTE))
rcParams['figure.figsize'] = 12, 8
# Seed the random number generators through PyTorch Lightning for reproducibility.
pl.seed_everything(RANDOM_SEED)

# 1. MELD 

**Column Specification:**

**Column Name = Description** 

Sr No. = Serial numbers of the utterances mainly for referencing the utterances in case of different versions or multiple copies with different subsets.                

Utterance = Individual utterances from EmotionLines as a string.                                           

Speaker = Name of the speaker associated with the utterance.                                             

Emotion = The emotion (neutral, joy, sadness, anger, surprise, fear, disgust) expressed in the utterance.

Sentiment = The sentiment (positive, neutral, negative) expressed by the speaker in the utterance.         

Dialogue_ID = The index of the dialogue starting from 0.                                                     

Utterance_ID = The index of the particular utterance in the dialogue starting from 0.                         

Season = The season no. of Friends TV Show to which a particular utterance belongs.

**Data Splits:**

Train = 9989

Dev = 1109

Test = 2610

Total = 13708

In [None]:
# Directory holding the pre-processed MELD splits. Change this one constant to
# point the notebook at a different copy of the data.
DATA_DIR = r"C:/Data/Sentiment Analysis/MELD/Processed/Processed_final/v2"

# One CSV per split: train / dev (validation) / test.
df_train = pd.read_csv(f"{DATA_DIR}/train.csv")
df_dev = pd.read_csv(f"{DATA_DIR}/dev.csv")
df_test = pd.read_csv(f"{DATA_DIR}/test.csv")

# Quick sanity check of the (rows, columns) of each split.
print(df_train.shape, df_dev.shape, df_test.shape)

In [None]:
# Keep only the model input (utterance text) and the target column.
# `.copy()` detaches each frame from the one read from disk, so later column
# assignments operate on an independent DataFrame instead of a slice
# (avoids SettingWithCopyWarning and silent no-ops under copy-on-write).
df_train = df_train[["transcription", "sentiment"]].copy()
df_test = df_test[["transcription", "sentiment"]].copy()
df_dev = df_dev[["transcription", "sentiment"]].copy()

In [None]:
# Integer id for each sentiment class; the iteration order of this dict is also
# used later as the class order when printing predictions and reports.
encode_map = {'negative': 0, 'neutral': 1, 'positive': 2}

# Build a new, integer-typed `sentiment` column and assign it back explicitly.
# The previous `df[col].replace(..., inplace=True)` mutated a temporary Series
# (chained assignment), which is a no-op under pandas copy-on-write.
# `encode_map.get(label, label)` leaves already-encoded values untouched, so the
# cell can be re-run safely; `.astype('int64')` fails loudly on unknown labels.
df_train, df_dev, df_test = (
    frame.assign(
        sentiment=frame['sentiment']
        .map(lambda label: encode_map.get(label, label))
        .astype('int64')
    )
    for frame in (df_train, df_dev, df_test)
)

# EDA

In [None]:
# Stack train, dev and test row-wise (in that order) for the exploratory plots.
frames = [df_train, df_dev, df_test]
combine = pd.concat(frames, axis="index")
# Replace the per-split row labels with one fresh 0..n-1 index.
all_data = combine.reset_index(drop=True)

In [None]:
# Number of utterances per sentiment class across all splits.
sns.countplot(data=all_data, x='sentiment')

# Preprocessing

In [None]:
# Total number of rows in the combined train + dev + test frame.
len(all_data)

In [None]:
# First five rows; sentiment encoding is negative=0, neutral=1, positive=2.
all_data.head(5)

In [None]:
# Every column after the first one (the utterance text) is treated as a label.
LABEL_COLUMNS = list(all_data.columns[1:])

In [None]:
# Display the label column names selected above.
LABEL_COLUMNS

In [None]:
# Rows per class; sentiment encoding is negative=0, neutral=1, positive=2.
all_data['sentiment'].value_counts()

# Tokenization

In [None]:
# Checkpoint name shared by the tokenizer and every AlbertModel loaded in this notebook.
MODEL_NAME = 'albert-base-v2'
# Tokenizer matching that checkpoint.
tokenizer = AlbertTokenizer.from_pretrained(MODEL_NAME)

# Number of tokens

In [None]:
# Token count of every utterance (encoded length as returned by the tokenizer),
# capped at 512 tokens by truncation.
token_counts = [
    len(tokenizer.encode(utterance, max_length=512, truncation=True))
    for utterance in all_data["transcription"]
]

In [None]:
# Distribution of utterance lengths in tokens, zoomed to the 0-60 range.
token_count_ax = sns.histplot(token_counts)
token_count_ax.set_xlim([0, 60]);

**Most of the utterances contain fewer than 60 tokens, so the maximum sequence length is set to 60.**

In [None]:
# Maximum sequence length (in tokens) passed as `max_token_len` to the datasets below.
MAX_TOKEN_COUNT = 60

## Tokenization process for the whole dataset

In [None]:
class Dataset(torch.utils.data.Dataset):
    """Map-style dataset that tokenizes one utterance per item.

    The base class is referenced by its fully qualified name because this class
    reuses the name ``Dataset``: inheriting from the bare name would make the
    class subclass *itself* whenever the cell is executed a second time.

    Args:
        data: Frame with a ``transcription`` column (utterance text) and the
            columns listed in the notebook-level ``LABEL_COLUMNS``.
        tokenizer: Tokenizer used to encode each utterance.
        max_token_len: Length every encoded utterance is padded/truncated to.
    """

    def __init__(
        self,
        data: pd.DataFrame,
        tokenizer: AlbertTokenizer,
        max_token_len: int = 60
    ):

        self.tokenizer = tokenizer
        self.data = data
        self.max_token_len = max_token_len

    def __len__(self):
        """Return the number of rows (utterances) in the wrapped frame."""
        return len(self.data)

    def __getitem__(self, index: int):
        """Return the raw text, token ids, attention mask and label(s) of row ``index``.

        Returns:
            dict with keys ``Utterance`` (str), ``input_ids`` and
            ``attention_mask`` (1-D tensors of length ``max_token_len``) and
            ``labels`` (int64 tensor with one entry per label column).
        """
        data_row = self.data.iloc[index]

        Utterance = data_row.transcription
        # A row mixing text and labels is an object-dtype Series; convert the
        # label part to a real int64 array before handing it to torch instead
        # of relying on torch to interpret a pandas object.
        labels = data_row[LABEL_COLUMNS].to_numpy(dtype="int64")

        encoding = self.tokenizer.encode_plus(
            Utterance,
            add_special_tokens=True,
            max_length=self.max_token_len,
            return_token_type_ids=False,
            padding="max_length",
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        return dict(
            Utterance=Utterance,
            # return_tensors='pt' adds a leading batch axis; flatten removes it.
            input_ids=encoding["input_ids"].flatten(),
            attention_mask=encoding["attention_mask"].flatten(),
            labels=torch.from_numpy(labels)
        )
    

## Sample:

In [None]:
# Sample item from the training dataset.
# `df_train` is the frame prepared above; the previously referenced
# `df_train_final` is never defined in this notebook and raised NameError
# on a fresh kernel.
train_dataset = Dataset(
  df_train,
  tokenizer,
  max_token_len=MAX_TOKEN_COUNT
)

sample_item = train_dataset[0]
sample_item.keys()

In [None]:
# Raw text of the sampled utterance.
sample_item["Utterance"]

In [None]:
# Label tensor of the sampled utterance.
sample_item["labels"]

In [None]:
# Shape of the token-id tensor for one item.
sample_item["input_ids"].shape

In [None]:
# First 27 entries of the attention mask.
sample_item["attention_mask"].squeeze()[:27]

In [None]:
# Full dictionary returned by the dataset for one item.
sample_item

## Running the ALBERT model on a sample batch

In [None]:
# Pre-trained encoder, loaded here only to inspect its outputs.
albert_model = AlbertModel.from_pretrained(MODEL_NAME, return_dict=True)

# Pull the first batch of 8 items from the training dataset.
sample_loader = DataLoader(train_dataset, batch_size=8)
sample_batch = next(iter(sample_loader))

# Shapes of the three tensors in the batch.
tuple(sample_batch[key].shape for key in ("input_ids", "attention_mask", "labels"))

In [None]:
# Forward the sample batch through the encoder.
output = albert_model(
    input_ids=sample_batch["input_ids"],
    attention_mask=sample_batch["attention_mask"],
)
# Shapes of the per-token hidden states and of the pooled output.
output.last_hidden_state.shape, output.pooler_output.shape

In [None]:
# Hidden size reported by the loaded model's configuration.
albert_model.config.hidden_size

**The 768 dimension comes from the ALBERT hidden size.
The larger versions of ALBERT have more attention heads and a larger hidden size.**

# Encapsulating all data

In [None]:
class DataModule(pl.LightningDataModule):
    """Bundles the three data splits and exposes one DataLoader per split.

    Note the positional order of the frames: ``df_train, df_test, df_dev``.

    Args:
        df_train: Frame used for training.
        df_test: Frame used for testing.
        df_dev: Frame used for validation.
        tokenizer: Tokenizer forwarded to each ``Dataset``.
        batch_size: Batch size of every DataLoader.
        max_token_len: Sequence length forwarded to each ``Dataset``.
    """

    def __init__(self, df_train, df_test, df_dev, tokenizer, batch_size=16, max_token_len=60):
        super().__init__()
        self.train_df, self.test_df, self.dev_df = df_train, df_test, df_dev
        self.tokenizer = tokenizer
        self.batch_size = batch_size
        self.max_token_len = max_token_len

    def _as_dataset(self, frame):
        """Wrap ``frame`` in a ``Dataset`` using this module's tokenizer settings."""
        return Dataset(frame, self.tokenizer, self.max_token_len)

    def _as_loader(self, dataset, shuffle=False):
        """Build a single-process DataLoader over ``dataset``."""
        return DataLoader(
            dataset,
            batch_size=self.batch_size,
            shuffle=shuffle,
            num_workers=0
        )

    def setup(self, stage=None):
        """Create the train, dev and test datasets (``stage`` is not used)."""
        self.train_dataset = self._as_dataset(self.train_df)
        self.dev_dataset = self._as_dataset(self.dev_df)
        self.test_dataset = self._as_dataset(self.test_df)

    def train_dataloader(self):
        """Shuffled loader over the training dataset."""
        return self._as_loader(self.train_dataset, shuffle=True)

    def val_dataloader(self):
        """Unshuffled loader over the dev dataset."""
        return self._as_loader(self.dev_dataset)

    def test_dataloader(self):
        """Unshuffled loader over the test dataset."""
        return self._as_loader(self.test_dataset)

**Instance for the data module:**

In [None]:
N_EPOCHS = 10
BATCH_SIZE = 16

# Pass the frames by keyword: DataModule's positional order is
# (df_train, df_test, df_dev), so the previous positional call
# (df_train, df_dev, df_test) validated on the test split and tested on the
# dev split.
data_module = DataModule(
    df_train=df_train,
    df_dev=df_dev,
    df_test=df_test,
    tokenizer=tokenizer,
    batch_size=BATCH_SIZE,
    max_token_len=MAX_TOKEN_COUNT,
)

# Modelling

**The first model is a pre-trained ALBERT encoder (`AlbertModel`) followed by a linear layer that maps the ALBERT representation to the sentiment classes.**

In [None]:
class Tagger(pl.LightningModule):
    """ALBERT encoder with a linear classification head.

    Args:
        n_classes: Number of output classes of the linear head.
        n_training_steps: Total optimizer steps, used by the LR scheduler.
        n_warmup_steps: Warm-up steps, used by the LR scheduler.
        freeze_albert: When True (default) the encoder weights are frozen and
            only the linear head is trained. Pass False to fine-tune the
            encoder as well.
    """

    def __init__(self, n_classes: int, n_training_steps=None, n_warmup_steps=None,
                 freeze_albert: bool = True):
        super().__init__()
        self.albert = AlbertModel.from_pretrained(MODEL_NAME, return_dict=True)
        if freeze_albert:
            # The tensor attribute is `requires_grad`; the former spelling
            # `require_grad` only created an unused attribute, so the encoder
            # was never actually frozen.
            for parameter in self.albert.parameters():
                parameter.requires_grad = False
        self.classifier = nn.Linear(self.albert.config.hidden_size, n_classes)
        self.n_training_steps = n_training_steps
        self.n_warmup_steps = n_warmup_steps
        self.criterion = nn.CrossEntropyLoss()

    def forward(self, input_ids, attention_mask, labels=None):
        """Return ``(loss, probabilities)`` for a batch.

        ``probabilities`` has one row per sample and one column per class.
        ``loss`` is the cross-entropy when ``labels`` is given, otherwise 0.
        """
        encoded = self.albert(input_ids, attention_mask=attention_mask)
        logits = self.classifier(encoded.pooler_output)
        probabilities = torch.softmax(logits, dim=1)
        loss = 0

        if labels is not None:
            # CrossEntropyLoss applies log-softmax itself, so it must receive
            # the raw logits; feeding it probabilities applies softmax twice.
            # Labels arrive with a trailing axis of size 1 and are flattened
            # to one class index per sample.
            loss = self.criterion(logits, labels.flatten())
        return loss, probabilities

    def _shared_step(self, batch, stage):
        """Run one forward pass, log ``{stage}_loss`` and return loss, outputs, labels."""
        labels = batch["labels"]
        loss, outputs = self(batch["input_ids"], batch["attention_mask"], labels)
        self.log(f"{stage}_loss", loss, prog_bar=True, logger=True)
        return loss, outputs, labels

    def training_step(self, batch, batch_idx):
        """Return the loss plus detached predictions/labels for epoch-end metrics."""
        loss, outputs, labels = self._shared_step(batch, "train")
        # Only the loss needs the autograd graph; detach what is merely stored.
        return {"loss": loss, "predictions": outputs.detach(), "labels": labels}

    def validation_step(self, batch, batch_idx):
        """Log and return the validation loss of one batch."""
        loss, _, _ = self._shared_step(batch, "val")
        return loss

    def test_step(self, batch, batch_idx):
        """Log and return the test loss of one batch."""
        loss, _, _ = self._shared_step(batch, "test")
        return loss

    def training_epoch_end(self, outputs):
        """Print accuracy and AUROC over all training batches of the epoch."""
        # Concatenate the per-batch tensors; labels are flattened to shape (N,)
        # so they line up element-wise with the arg-max predictions.
        labels = torch.cat(
            [output["labels"].detach().cpu() for output in outputs]
        ).flatten().int()
        predictions = torch.cat(
            [output["predictions"].detach().cpu() for output in outputs]
        )
        pred = torch.argmax(predictions, dim=1)
        n_classes = self.classifier.out_features

        train_acc = accuracy(pred, labels, num_classes=n_classes)
        print("Training Accuracy:", train_acc)

        auroc_metric = AUROC(num_classes=n_classes)
        print("AUROC:", auroc_metric(predictions, labels))

    def configure_optimizers(self):
        """AdamW with linear warm-up/decay, stepped after every batch."""
        optimizer = AdamW(self.parameters(), lr=2e-5)

        scheduler = get_linear_schedule_with_warmup(
          optimizer,
          num_warmup_steps=self.n_warmup_steps,
          num_training_steps=self.n_training_steps
        )

        return dict(
            optimizer=optimizer,
            lr_scheduler=dict(
                scheduler=scheduler,
                interval='step'
            )
        )

In [None]:
# The training DataLoader keeps the final partial batch (drop_last is not set),
# so the number of optimizer steps per epoch is the ceiling of N / batch size.
steps_per_epoch = (len(df_train) + BATCH_SIZE - 1) // BATCH_SIZE
total_training_steps = steps_per_epoch * N_EPOCHS

In [None]:
# Use 1/5 of the training steps as learning-rate warm-up.
warmup_steps = total_training_steps // 5
# Display both values as a quick sanity check.
warmup_steps, total_training_steps

**Instance of the model**

In [None]:
# Three output classes: negative, neutral, positive (see encode_map).
model = Tagger(
  n_classes=3,
  n_warmup_steps=warmup_steps,
  n_training_steps=total_training_steps
)

# Training

Checkpointing that saves the best model (based on validation loss):

In [None]:
# Directory where the best checkpoint is written. Change this one constant to
# store checkpoints elsewhere.
CHECKPOINT_DIR = "C:/Users/id301281/NLP/NLU/MELD/ALBERT"

# Keep only the single checkpoint with the lowest validation loss.
checkpoint_callback = ModelCheckpoint(
    dirpath=CHECKPOINT_DIR,
    filename="best-checkpoint",
    save_top_k=1,
    verbose=True,
    monitor="val_loss",
    mode="min"
    )

Early stopping triggers when the validation loss hasn't improved for the last 3 epochs (`patience=3`):

In [None]:
# Stop when val_loss has not improved for 3 consecutive validation runs.
# `mode="min"` states explicitly that lower is better, matching the checkpoint
# callback instead of relying on the library default.
early_stopping_callback = EarlyStopping(monitor='val_loss', patience=3, mode='min')

Starting training process:

In [None]:
# Use one GPU when available and fall back to CPU otherwise, so the notebook
# also runs on machines without CUDA (consistent with the evaluation cell).
trainer = pl.Trainer(
    callbacks=[checkpoint_callback, early_stopping_callback],
    max_epochs=N_EPOCHS,
    gpus=1 if torch.cuda.is_available() else 0,
    progress_bar_refresh_rate=30
)

In [None]:
# Train the model using the loaders provided by data_module.
trainer.fit(model, data_module)

In [None]:
# Evaluate on the test loader.
# NOTE(review): called without arguments — presumably reuses the model and data
# module from fit(); confirm for the installed Lightning version.
trainer.test()

# Prediction

In [None]:
# Path of the best checkpoint recorded by the trainer's checkpoint callback.
best_checkpoint_path = trainer.checkpoint_callback.best_model_path

# Rebuild the model from that checkpoint (n_classes must be passed again).
trained_model = Tagger.load_from_checkpoint(best_checkpoint_path, n_classes=3)

# Inference only: evaluation mode, parameters frozen.
trained_model.eval()
trained_model.freeze()

### Test 1 

In [None]:
test_comment = "Hi, I'm Tracy, How can I help?"

# Encode exactly like the training data: same maximum length, with truncation
# requested explicitly (max_length without truncation triggers a tokenizer
# warning).
encoding = tokenizer.encode_plus(
  test_comment,
  add_special_tokens=True,
  max_length=MAX_TOKEN_COUNT,
  return_token_type_ids=False,
  padding="max_length",
  truncation=True,
  return_attention_mask=True,
  return_tensors='pt',
)

# Run on whatever device the model currently lives on, without autograd.
with torch.no_grad():
    _, test_prediction = trained_model(
        encoding["input_ids"].to(trained_model.device),
        encoding["attention_mask"].to(trained_model.device)
    )
test_prediction = test_prediction.flatten().cpu().numpy()

# encode_map iterates in class-id order, so it lines up with the probabilities.
for i, prediction in zip(encode_map, test_prediction):
    print(f"{i}: {prediction}")

### Test 2

In [None]:
# Minimum probability a class needs in order to be reported.
THRESHOLD = 0.5

test_comment = "You such a loser! You'll regret everything you've done to me!"

# Encode exactly like the training data (same maximum length, explicit
# truncation) rather than padding to 512 tokens.
encoding = tokenizer.encode_plus(
    test_comment,
    add_special_tokens=True,
    max_length=MAX_TOKEN_COUNT,
    return_token_type_ids=False,
    padding="max_length",
    truncation=True,
    return_attention_mask=True,
    return_tensors='pt',
)

# Run on whatever device the model currently lives on, without autograd.
with torch.no_grad():
    _, test_prediction = trained_model(
        encoding["input_ids"].to(trained_model.device),
        encoding["attention_mask"].to(trained_model.device)
    )
test_prediction = test_prediction.flatten().cpu().numpy()

# Report only the classes that reach the threshold. The print used to sit
# outside the loop, so it always showed the last class regardless of the
# threshold.
for i, prediction in zip(encode_map, test_prediction):
    if prediction >= THRESHOLD:
        print(f"{i}: {prediction}")

# Evaluation

In [None]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
trained_model = trained_model.to(device)

# Despite its name, this dataset wraps the held-out *test* split.
val_dataset = Dataset(
  df_test,
  tokenizer,
  max_token_len=MAX_TOKEN_COUNT
)

preds = []
labels = []

# Batched inference without autograd instead of one forward pass per sample.
with torch.no_grad():
    for batch in tqdm(DataLoader(val_dataset, batch_size=BATCH_SIZE)):
        _, prediction = trained_model(
            batch["input_ids"].to(device),
            batch["attention_mask"].to(device)
        )

        preds.append(prediction.cpu())
        labels.append(batch["labels"].int())

# preds: one row of class probabilities per sample; labels: one row per sample
# with a trailing axis of size 1 (as returned by the dataset).
preds = torch.cat(preds)
labels = torch.cat(labels)

### Accuracy

In [None]:
# Compare predicted class ids with true class ids, both as flat vectors.
# `preds` holds per-class probabilities and `labels` has a trailing axis of
# size 1, so neither can be passed to `accuracy` unchanged.
accuracy(torch.argmax(preds, dim=1), labels.flatten())

### ROC

In [None]:
# A single AUROC value aggregated over the three classes is computed here (not
# one value per class), so the heading says so.
print("AUROC (aggregated over classes)")

label = labels.flatten()
# Named `auroc_metric` / `auroc_score` so the `auroc` function imported at the
# top of the notebook is not overwritten.
auroc_metric = AUROC(num_classes=3)
auroc_score = auroc_metric(preds, label)
print("AUROC:", auroc_score)

### Classification report

In [None]:
# scikit-learn expects flat NumPy vectors of class ids: take the arg-max class
# of each prediction row and drop the trailing axis of the labels.
y_pred = torch.argmax(preds, dim=1).numpy()
y_test = labels.flatten().numpy()

In [None]:
# Per-class precision / recall / F1; class names come from encode_map's keys,
# listed in class-id order.
print(classification_report(y_test, y_pred, target_names=list(encode_map)))

### Confusion matrix

In [None]:
def show_confusion_matrix(confusion_matrix):
    """Draw ``confusion_matrix`` as an annotated heat map.

    Rows are labelled as the true sentiment and columns as the predicted
    sentiment; cell values are rendered as integers.
    """
    hmap = sns.heatmap(confusion_matrix, annot=True, fmt="d", cmap="Blues")

    # Keep the row labels horizontal and tilt the column labels.
    y_axis, x_axis = hmap.yaxis, hmap.xaxis
    y_axis.set_ticklabels(y_axis.get_ticklabels(), rotation=0, ha='right')
    x_axis.set_ticklabels(x_axis.get_ticklabels(), rotation=30, ha='right')

    plt.ylabel('True sentiment')
    plt.xlabel('Predicted sentiment')

# Confusion matrix of true vs. predicted class ids, labelled with the class
# names (encode_map's keys, in class-id order) on both axes.
class_names = list(encode_map)
cm = confusion_matrix(y_test, y_pred)
df_cm = pd.DataFrame(cm, index=class_names, columns=class_names)
show_confusion_matrix(df_cm)