In [None]:
from transformers import RobertaModel, RobertaTokenizer, AdamW, get_linear_schedule_with_warmup
import pandas as pd
from torch.utils.data import Dataset, DataLoader
import torch
import torch.nn as nn
from torchmetrics import AUROC

from tqdm.auto import tqdm

import pytorch_lightning as pl
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping
from pytorch_lightning.loggers import TensorBoardLogger
from pytorch_lightning.metrics.functional import accuracy, f1, auroc

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix

import seaborn as sns
from pylab import rcParams
import matplotlib.pyplot as plt
from matplotlib import rc

%matplotlib inline
%config InlineBackend.figure_format='retina'

RANDOM_SEED = 42

sns.set(style='whitegrid', palette='muted', font_scale=1.2)
HAPPY_COLORS_PALETTE = ["#01BEFE", "#FFDD00", "#FF7D00", "#FF006D", "#ADFF02", "#8F00FF"]
sns.set_palette(sns.color_palette(HAPPY_COLORS_PALETTE))
rcParams['figure.figsize'] = 12, 8
pl.seed_everything(RANDOM_SEED)

In [None]:
# Download the data here: https://affective-meld.github.io/
# Point DATA_DIR at the folder holding the processed train/dev/test CSV files.
DATA_DIR = r"C:/Data/Sentiment Analysis/MELD/Processed/Processed_final/v2"  # please put your path

df_train_final = pd.read_csv(f"{DATA_DIR}/train.csv")
df_dev_final = pd.read_csv(f"{DATA_DIR}/dev.csv")
df_test_final = pd.read_csv(f"{DATA_DIR}/test.csv")

In [None]:
print(df_train_final.shape, df_dev_final.shape,df_test_final.shape)

In [None]:
df_train_final

In [None]:
# Map the sentiment strings to integer class ids (neg=0, neu=1, pos=2).
encode_map = {'negative': 0,'neutral': 1,'positive': 2}
# Assign the result back explicitly. The previous
# `df[col].replace(..., inplace=True)` is chained in-place assignment, which
# newer pandas versions warn about and which does not update the frame under
# Copy-on-Write. Renaming the categories keeps the column categorical and is
# safe to re-run (categories missing from the mapping pass through unchanged).
df_train_final['sentiment'] = (
    df_train_final['sentiment'].astype('category').cat.rename_categories(encode_map)
)

In [None]:
# Map the sentiment strings to integer class ids (neg=0, neu=1, pos=2).
encode_map = {'negative': 0,'neutral': 1,'positive': 2}
# Assign the result back explicitly. The previous
# `df[col].replace(..., inplace=True)` is chained in-place assignment, which
# newer pandas versions warn about and which does not update the frame under
# Copy-on-Write. Renaming the categories keeps the column categorical and is
# safe to re-run (categories missing from the mapping pass through unchanged).
df_dev_final['sentiment'] = (
    df_dev_final['sentiment'].astype('category').cat.rename_categories(encode_map)
)

In [None]:
# Map the sentiment strings to integer class ids (neg=0, neu=1, pos=2).
encode_map = {'negative': 0,'neutral': 1,'positive': 2}
# Assign the result back explicitly. The previous
# `df[col].replace(..., inplace=True)` is chained in-place assignment, which
# newer pandas versions warn about and which does not update the frame under
# Copy-on-Write. Renaming the categories keeps the column categorical and is
# safe to re-run (categories missing from the mapping pass through unchanged).
df_test_final['sentiment'] = (
    df_test_final['sentiment'].astype('category').cat.rename_categories(encode_map)
)

In [None]:
# Keep only the file identifier, the utterance text and the encoded label.
selected_columns = ["name", "transcription", "sentiment"]
df_train_final, df_test_final, df_dev_final = (
    frame[selected_columns]
    for frame in (df_train_final, df_test_final, df_dev_final)
)

In [None]:
# Rename the identifier column to the name the Dataset class reads (`file_ID`).
df_train_final, df_dev_final, df_test_final = (
    frame.rename(columns={"name": "file_ID"})
    for frame in (df_train_final, df_dev_final, df_test_final)
)

In [None]:
frames = [df_train_final, df_dev_final, df_test_final]
combine = pd.concat(frames)
all_data = combine.reset_index(drop=True)

In [None]:
len(all_data)

In [None]:
all_data.head(5) #neg=0, neu=1, pos=2

In [None]:
LABEL_COLUMNS = all_data.columns.tolist()[2:]

In [None]:
LABEL_COLUMNS

In [None]:
MAX_TOKEN_COUNT = 60

In [None]:
MODEL_NAME = 'roberta-base'
tokenizer = RobertaTokenizer.from_pretrained(MODEL_NAME)

### Tokenization 

In [None]:
class Dataset(Dataset):
    """Map-style dataset that tokenizes one utterance per row of a DataFrame.

    Each row must expose `transcription`, `file_ID` and the columns listed in
    the module-level `LABEL_COLUMNS`.

    Args:
        data: frame holding the utterances and their labels.
        tokenizer: tokenizer used to encode each utterance.
        max_token_len: length every encoded utterance is padded/truncated to.
    """

    def __init__(
        self,
        data: pd.DataFrame,
        tokenizer: RobertaTokenizer,
        max_token_len: int = 60
    ):
        self.data = data
        self.tokenizer = tokenizer
        self.max_token_len = max_token_len

    def __len__(self):
        """Number of rows (utterances) in the wrapped frame."""
        return len(self.data)

    def __getitem__(self, index: int):
        """Return the encoded utterance, its label tensor and its file id."""
        row = self.data.iloc[index]
        text = row.transcription

        # Pad/truncate to a fixed length so samples can be batched directly.
        encoded = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_token_len,
            return_token_type_ids=False,
            padding="max_length",
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        # The tokenizer returns tensors with a leading batch axis of 1;
        # flatten() drops it so the DataLoader can add the real batch axis.
        return {
            "Utterance": text,
            "input_ids": encoded["input_ids"].flatten(),
            "attention_mask": encoded["attention_mask"].flatten(),
            "labels": torch.LongTensor(row[LABEL_COLUMNS]),
            "FileID": row.file_ID,
        }

### Data loaders 

In [None]:
class DataModule(pl.LightningDataModule):
    """Lightning data module exposing the train / dev / test frames as loaders.

    Note the positional order of the frame arguments: train, test, dev.

    Args:
        df_train_final: training frame.
        df_test_final: test frame.
        df_dev_final: validation (dev) frame.
        tokenizer: tokenizer forwarded to every `Dataset`.
        batch_size: batch size shared by all three loaders.
        max_token_len: fixed encoded length forwarded to every `Dataset`.
    """

    def __init__(self, df_train_final, df_test_final, df_dev_final, tokenizer, batch_size=16, max_token_len=60):
        super().__init__()
        self.train_df, self.test_df, self.dev_df = df_train_final, df_test_final, df_dev_final
        self.tokenizer = tokenizer
        self.batch_size = batch_size
        self.max_token_len = max_token_len

    def _build_dataset(self, frame):
        """Wrap one frame in a tokenizing Dataset."""
        return Dataset(frame, self.tokenizer, self.max_token_len)

    def _build_loader(self, dataset, shuffle=False):
        """Create a single-process DataLoader over `dataset`."""
        return DataLoader(
            dataset,
            batch_size=self.batch_size,
            shuffle=shuffle,
            num_workers=0
        )

    def setup(self, stage=None):
        """Create the three datasets; `stage` is accepted but not used."""
        self.train_dataset = self._build_dataset(self.train_df)
        self.dev_dataset = self._build_dataset(self.dev_df)
        self.test_dataset = self._build_dataset(self.test_df)

    def train_dataloader(self):
        """Shuffled loader over the training set."""
        return self._build_loader(self.train_dataset, shuffle=True)

    def val_dataloader(self):
        """Unshuffled loader over the dev set."""
        return self._build_loader(self.dev_dataset)

    def test_dataloader(self):
        """Unshuffled loader over the test set."""
        return self._build_loader(self.test_dataset)

#### Data module instance:

In [None]:
N_EPOCHS = 10
BATCH_SIZE = 16

# Pass the splits by keyword. DataModule's positional order is
# (train, test, dev), so the previous positional call (train, dev, test)
# silently swapped the sets: validation and early stopping ran on the test
# split, and trainer.test() ran on the dev split.
data_module = DataModule(
    df_train_final=df_train_final,
    df_test_final=df_test_final,
    df_dev_final=df_dev_final,
    tokenizer=tokenizer,
    batch_size=BATCH_SIZE,
    max_token_len=MAX_TOKEN_COUNT,
)

### Embedding model 

### Last four hidden layers 

In [None]:
class TextModel(pl.LightningModule):
    """RoBERTa sentiment classifier built on the last four hidden layers.

    The last four hidden-state layers of the encoder are concatenated on the
    feature axis, averaged over the token axis and passed through a stack of
    linear layers that ends in an `n_classes`-way classifier.

    Args:
        n_classes: number of output classes.
        n_training_steps: total optimizer steps, used by the LR scheduler.
        n_warmup_steps: warm-up steps, used by the LR scheduler.
    """

    def __init__(self,n_classes: int, n_training_steps=None, n_warmup_steps=None):
        super().__init__()
        self.roberta = RobertaModel.from_pretrained(MODEL_NAME, return_dict=True, output_hidden_states=True)
        # Freeze the encoder. The tensor attribute is `requires_grad`; the
        # previous `require_grad` spelling only created an unused attribute,
        # so the encoder was silently being fine-tuned.
        for parameter in self.roberta.parameters():
            parameter.requires_grad = False

        # 3072 input features = four concatenated hidden layers.
        # NOTE(review): there is no non-linearity between these layers, so the
        # stack is equivalent to a single linear map - confirm this is intended.
        self.linear_1 = nn.Linear(3072, 1872)
        self.linear_2 = nn.Linear(1872, 1172)
        self.linear_3 = nn.Linear(1172, 256)
        self.classifier = nn.Linear(256, n_classes)
        self.n_classes = n_classes
        self.n_training_steps = n_training_steps
        self.n_warmup_steps = n_warmup_steps
        self.criterion = nn.CrossEntropyLoss()

    def forward(self, input_ids, attention_mask, labels=None):
        """Run the encoder and classification head.

        Args:
            input_ids: token ids, one row per utterance.
            attention_mask: mask matching `input_ids`.
            labels: optional class indices; flattened before the loss.

        Returns:
            Tuple `(loss, probabilities)`. `loss` is the cross-entropy when
            `labels` is given and `0` otherwise; `probabilities` is the
            softmax over classes.
        """
        output = self.roberta(input_ids, attention_mask=attention_mask)

        hidden_states = output.hidden_states
        # Concatenate the last four layers over the feature dimension.
        last_four_layers = [hidden_states[i] for i in (-1, -2, -3, -4)]
        cat_hidden_states = torch.cat(tuple(last_four_layers), dim=-1)
        # Average over the token dimension.
        # NOTE(review): padding positions are included in this mean because
        # the attention mask is not applied here.
        pooled = torch.mean(cat_hidden_states, dim=1)

        features = self.linear_3(self.linear_2(self.linear_1(pooled)))
        logits = self.classifier(features)
        probabilities = torch.softmax(logits, dim=1)

        loss = 0
        if labels is not None:
            # CrossEntropyLoss applies log-softmax itself, so it must receive
            # the raw logits. Feeding it softmax output (as before) squashes
            # the loss and its gradients.
            loss = self.criterion(logits, labels.flatten())

        return loss, probabilities

    def _forward_batch(self, batch):
        """Unpack a batch dict and run `forward` with its labels."""
        return self(batch["input_ids"], batch["attention_mask"], batch["labels"])

    def training_step(self, batch, batch_idx):
        """Compute and log the training loss for one batch."""
        loss, outputs = self._forward_batch(batch)
        self.log("train_loss", loss, prog_bar=True, logger=True)
        # Detach the predictions: they are only used for epoch-end metrics,
        # and keeping them attached would hold every batch's autograd graph
        # in memory until the epoch finishes.
        return {"loss": loss, "predictions": outputs.detach(), "labels": batch["labels"]}

    def validation_step(self, batch, batch_idx):
        """Compute and log the validation loss for one batch."""
        loss, _ = self._forward_batch(batch)
        self.log("val_loss", loss, prog_bar=True, logger=True)
        return loss

    def test_step(self, batch, batch_idx):
        """Compute and log the test loss for one batch."""
        loss, _ = self._forward_batch(batch)
        self.log("test_loss", loss, prog_bar=True, logger=True)
        return loss

    def training_epoch_end(self, outputs):
        """Print accuracy and AUROC over all training batches of the epoch."""
        # Flatten the labels to 1-D so they line up with the argmax predictions.
        labels = torch.cat([out["labels"].detach().cpu() for out in outputs]).flatten().int()
        predictions = torch.cat([out["predictions"].detach().cpu() for out in outputs])
        pred = torch.argmax(predictions, dim=1)

        train_acc = accuracy(pred, labels, num_classes=self.n_classes)
        print("Training Accuracy:", train_acc)

        auroc_metric = AUROC(num_classes=self.n_classes)
        print("AUROC:", auroc_metric(predictions, labels))

    def configure_optimizers(self): #configuring the optimizers
        """AdamW with a linear warm-up / linear decay schedule stepped per batch."""
        optimizer = AdamW(self.parameters(), lr=2e-5)

        scheduler = get_linear_schedule_with_warmup(
          optimizer,
          num_warmup_steps=self.n_warmup_steps,
          num_training_steps=self.n_training_steps

        )

        return dict(
            optimizer=optimizer,
            lr_scheduler=dict(
                scheduler=scheduler,
                interval='step'
            )

        )

In [None]:
# Round up: the training DataLoader does not set drop_last, so the final
# partial batch also triggers an optimizer/scheduler step.
steps_per_epoch = (len(df_train_final) + BATCH_SIZE - 1) // BATCH_SIZE
total_training_steps = steps_per_epoch * N_EPOCHS

In [None]:
#1/5 of the training steps as warm-up
warmup_steps = total_training_steps // 5
warmup_steps, total_training_steps

In [None]:
model = TextModel(
  n_classes=3,
  n_warmup_steps=warmup_steps,
  n_training_steps=total_training_steps
)

### Training: 

In [None]:
checkpoint_callback = ModelCheckpoint(
    dirpath="C:/Users/id301281/NLP/NLU/MELD/RoBERTa",
    filename="best-checkpoint",
    save_top_k=1,
    verbose=True,
    monitor="val_loss",
    mode="min"
    )

In [None]:
early_stopping_callback = EarlyStopping(monitor='val_loss', patience=3)

In [None]:
trainer = pl.Trainer(
    callbacks=[checkpoint_callback, early_stopping_callback],
    max_epochs=N_EPOCHS,
    # Use one GPU when CUDA is available and fall back to CPU otherwise,
    # instead of failing on machines without a GPU.
    gpus=1 if torch.cuda.is_available() else 0,
    progress_bar_refresh_rate=30
)

In [None]:
torch.cuda.is_available()

In [None]:
trainer.fit(model, data_module)

In [None]:
trainer.test()

In [None]:
# Restore the best checkpoint recorded by the ModelCheckpoint callback.
best_checkpoint_path = trainer.checkpoint_callback.best_model_path
trained_model = TextModel.load_from_checkpoint(best_checkpoint_path, n_classes=3)

# Put the model in evaluation mode and freeze it for inference.
trained_model.eval()
trained_model.freeze()

In [None]:
# Run the trained model over the test frame one utterance at a time and
# collect the class probabilities together with the true labels.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
trained_model = trained_model.to(device)

val_dataset = Dataset(df_test_final, tokenizer, max_token_len=MAX_TOKEN_COUNT)

preds, labels = [], []

for item in tqdm(val_dataset):
    # Add a batch axis of size 1 before moving the tensors to the device.
    batch_ids = item["input_ids"].unsqueeze(dim=0).to(device)
    batch_mask = item["attention_mask"].unsqueeze(dim=0).to(device)
    _, prediction = trained_model(batch_ids, batch_mask)

    preds.append(prediction.flatten())
    labels.append(item["labels"].int())

preds = torch.stack(preds).detach().cpu()
labels = torch.stack(labels).detach().cpu()

In [None]:
# Compare class indices with class indices: reduce the per-class probabilities
# with argmax and flatten the stacked label column so both tensors are 1-D.
accuracy(torch.argmax(preds, dim=1), labels.flatten())

In [None]:
print("AUROC per tag")

# AUROC is computed from the class probabilities against the 1-D label vector.
label = labels.flatten()
auroc = AUROC(num_classes=3)(preds, label)
print("AUROC:", auroc)

In [None]:
# Hand scikit-learn plain 1-D NumPy arrays: predicted class ids from argmax,
# and the label column flattened from its stacked (N, 1) layout.
y_pred = torch.argmax(preds, dim=1).numpy()
y_test = labels.flatten().numpy()

In [None]:
print(classification_report(y_test, y_pred, target_names=encode_map))

In [None]:
def show_confusion_matrix(confusion_matrix):
    """Draw an annotated heatmap of a confusion matrix.

    Args:
        confusion_matrix: matrix of integer counts (annotated with "d"
            formatting) whose row/column labels become the tick labels.
    """
    axes = sns.heatmap(confusion_matrix, annot=True, fmt="d", cmap="Blues")
    # Keep the row labels horizontal and tilt the column labels.
    for axis, angle in ((axes.yaxis, 0), (axes.xaxis, 30)):
        axis.set_ticklabels(axis.get_ticklabels(), rotation=angle, ha='right')
    plt.ylabel('True sentiment')
    plt.xlabel('Predicted sentiment')

# Label rows and columns with the sentiment names (the keys of encode_map).
class_names = list(encode_map)
cm = confusion_matrix(y_test, y_pred)
df_cm = pd.DataFrame(cm, index=class_names, columns=class_names)
show_confusion_matrix(df_cm)

### Storing test predictions:

In [None]:
# Load the best checkpoint again before exporting per-utterance predictions.
checkpoint_to_export = trainer.checkpoint_callback.best_model_path
trained_model = TextModel.load_from_checkpoint(checkpoint_to_export, n_classes=3)

# Put the model in evaluation mode and freeze it for inference.
trained_model.eval()
trained_model.freeze()

### Last four hidden layers 

In [None]:
# Score every test utterance and collect, per file id, the three class
# probabilities, the predicted class and the true class.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
trained_model = trained_model.to(device)

test_dataset = Dataset(
  df_test_final,
  tokenizer,
  max_token_len=MAX_TOKEN_COUNT
)

test_predictions = { "fileID_roberta_lfl": [], "Negative_roberta_lfl" : [], "Neutral_roberta_lfl" : [], "Positive_roberta_lfl" : [], "predicted_roberta_lfl": [], "actual_roberta_lfl": []}


# Inference only: make sure no autograd state is recorded.
with torch.no_grad():
    for item in tqdm(test_dataset):
        _, prediction = trained_model(
            item["input_ids"].unsqueeze(dim=0).to(device),
            item["attention_mask"].unsqueeze(dim=0).to(device)
        )

        pred = prediction.flatten().cpu().numpy()
        test_predictions["predicted_roberta_lfl"].append(int(pred.argmax()))
        test_predictions["Negative_roberta_lfl"].append(pred[0])
        test_predictions["Neutral_roberta_lfl"].append(pred[1])
        test_predictions["Positive_roberta_lfl"].append(pred[2])
        # Store a plain int so the column gets an integer dtype rather than
        # an object column of 0-d arrays.
        # Assumes a single label per utterance - TODO confirm LABEL_COLUMNS has one entry.
        test_predictions["actual_roberta_lfl"].append(item["labels"].item())
        test_predictions["fileID_roberta_lfl"].append(item["FileID"])

In [None]:
df = pd.DataFrame(data=test_predictions)
df

In [None]:
#saving to csv
df.to_csv(r"C:/Data/Sentiment Analysis/MELD/ensemble preds/roberta_text/test_preds_roberta_lfl_v2.csv")