#### Importing Libraries

In [None]:
import json
import torch
import pandas as pd
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset
from transformers import BertModel, DistilBertModel, BertTokenizer
from transformers import AutoTokenizer, AutoModel
from sklearn.metrics import f1_score
from tqdm import tqdm, trange

In [None]:
# Use the GPU when CUDA is available; otherwise run everything on the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(device)

In [None]:
# Run metadata and hyper-parameters; batch_size and learning_rate are read
# back from this dict when the data loaders and optimizer are built.
model_config = {
    'model': 'BERT Base Uncased Task 2',
    'batch_size': 32,
    'learning_rate': 1e-5,
    'optimizer': 'AdamW',
    'loss_function': 'BCELoss',
    'epochs': 5,
}

#### Data Loading

In [None]:
# Load the raw dialogue records for both splits. The context managers close
# each file handle as soon as it has been parsed, instead of leaving the
# handles open until the garbage collector happens to reclaim them.
with open('train_file.json') as train_file:
    train = json.load(train_file)
with open('val_file.json') as val_file:
    test = json.load(val_file)

In [None]:
# Report how many records each split contains.
for split_label, split_records in (('Train size:', train), ('Test size:', test)):
    print(split_label, len(split_records))

#### Data Preprocessing

In [None]:
def prepare_data(data):
    '''
    Turn the raw dialogue records into a DataFrame with one row per record.

    input: sequence of dicts, each providing the keys 'episode', 'speakers',
           'emotions', 'utterances' and 'triggers'
    output: DataFrame with the columns Dialogue_ID, Speaker, Emotion,
            Utterance and ERF_Label, filled from those keys in that order
    '''
    source_keys = ('episode', 'speakers', 'emotions', 'utterances', 'triggers')
    column_names = ['Dialogue_ID', 'Speaker', 'Emotion', 'Utterance', 'ERF_Label']
    # One tuple per record, fields in the same order as the target columns.
    rows = [tuple(record[key] for key in source_keys) for record in data]
    return pd.DataFrame(rows, columns=column_names)

# Build one frame per split, then write each one to disk without the index column.
df_train, df_test = prepare_data(train), prepare_data(test)
for split_frame, csv_name in ((df_train, 'train.csv'), (df_test, 'test.csv')):
    split_frame.to_csv(csv_name, index=False)

##### Visualization

In [None]:
# Preview the first rows of the training frame.
df_train.head()

In [None]:
# Preview the first rows of the test frame.
df_test.head()

##### Data Cleaning

In [None]:
# Replacement table used to normalise utterance text: each key is swapped for
# its plain-ASCII value.
# NOTE(review): the \x85-\x97 keys look like Windows-1252 punctuation that was
# decoded as Latin-1 - confirm against the raw data.
unicode_mapping = {
    # C1 control code points
    '\x85': '...',
    '\x91': "'",
    '\x92': "'",
    '\x93': '"',
    '\x94': '"',
    '\x97': '--',
    # Typographic punctuation
    '\u2014': '--',
    '\u2019': "'",
    '\u2026': '...',
    # Accented letter
    '\xe9': 'e',
}

def clean_utterance(utterance_list):
    '''
    Apply every replacement in unicode_mapping to each utterance.

    input: list of utterances (strings)
    output: new list with the cleaned utterances, in the same order
    '''
    def _normalise(text):
        # Replacements are applied one after another, in mapping order.
        for raw, replacement in unicode_mapping.items():
            text = text.replace(raw, replacement)
        return text

    return [_normalise(utterance) for utterance in utterance_list]

# Clean the utterance lists of both splits and store them back on the frames.
train_uttr = df_train['Utterance'].apply(clean_utterance)
df_train['Utterance'] = train_uttr
test_uttr = df_test['Utterance'].apply(clean_utterance)
df_test['Utterance'] = test_uttr
    

In [None]:
def erf_labelling(data):
    '''
    Force every ERF label to be binary, modifying the label lists in place.

    Any label that is neither 1.0 nor 0.0 (for example a missing value) is
    replaced with 0.0.

    input: DataFrame whose 'ERF_Label' column holds one mutable list of
           labels per row
    output: the same DataFrame object, with its label lists updated
    '''
    # Walk the stored lists directly rather than via data['ERF_Label'][i]:
    # using a positional counter as an index label raises KeyError (or hits
    # the wrong row) as soon as the frame no longer has a default 0..n-1 index.
    for labels in data['ERF_Label']:
        for position, value in enumerate(labels):
            if value != 1.0 and value != 0.0:
                labels[position] = 0.0
    return data

# Binarise the trigger labels of both splits (train first, then test).
df_train, df_test = erf_labelling(df_train), erf_labelling(df_test)

##### Sanity Check 

In [None]:
# Show the cleaned utterance list of the row labelled 0 as a quick visual check.
print(df_train['Utterance'][0])

In [None]:
# Longest episode in the training split, counted in utterances (0 when the split is empty).
max_len = max((len(utterances) for utterances in df_train['Utterance']), default=0)
print(max_len)

#### Datasets and Tokenization

##### Tokenizer

In [None]:
# Load the pretrained tokenizer for the "bert-base-uncased" checkpoint.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

##### Dataset

In [None]:
class Dataset(Dataset):
    '''
    Torch dataset that yields one example per utterance of every dialogue.

    Each example is the whole dialogue rendered as text, with the target
    utterance fenced by '</s></s>' markers; the target is that utterance's
    ERF label.

    The class keeps the name `Dataset` (shadowing the torch base class it
    inherits from) because callers in this file instantiate it by that name.

    NOTE(review): '<s>' and '</s>' look like RoBERTa/BART-style special
    tokens. A BERT tokenizer presumably treats them as ordinary text rather
    than as separators - confirm, and consider the tokenizer's own separator
    token. The strings are left unchanged here.

    input: data      - DataFrame with the columns Speaker, Utterance, Emotion
                       and ERF_Label, each holding one list per dialogue
           tokenizer - callable used to encode the text in __getitem__
           max_len   - number of tokens every example is padded/truncated to
    '''
    def __init__(self, data, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.data = self.process(data, tokenizer)

    def __len__(self):
        return len(self.data['input'])

    def __getitem__(self, index):
        '''
        Tokenize example `index`.

        output: dict with 1-D 'input_ids' and 'attention_mask' tensors and a
                scalar float32 'target' tensor
        '''
        text = self.data['input'][index]
        label = self.data['target'][index]
        encoding = self.tokenizer(text, return_tensors='pt', padding='max_length', max_length=self.max_len, truncation=True)
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            # Pin the dtype: integer labels would otherwise become int64
            # tensors, which BCELoss refuses to accept as targets.
            'target': torch.tensor(label, dtype=torch.float32),
        }

    def process(self, data, tokenizer):
        '''
        Build the text inputs and targets for every utterance in `data`.

        output: dict with parallel lists 'input' (strings) and 'target' (labels)
        '''
        new_data = {'input': [], 'target': []}
        # Access rows by position so frames with a non-default index
        # (filtered, shuffled, re-indexed) are handled correctly.
        for row_position in range(len(data)):
            row = data.iloc[row_position]
            speakers = row['Speaker']
            utterances = row['Utterance']
            emotions = row['Emotion']
            labels = row['ERF_Label']
            # "speaker: utterance:emotion" for every turn of the dialogue
            rendered = [
                speakers[j] + ': ' + utterances[j] + ":" + emotions[j]
                for j in range(len(utterances))
            ]
            for j in range(len(utterances)):
                # context before the jth utterance
                before = ' '.join(rendered[:j])
                # context after the jth utterance
                after = ' '.join(rendered[j+1:])
                text = '<s> ' + before + '</s></s>' + rendered[j] + '</s></s>' + after + '</s>'
                new_data['input'].append(text)
                new_data['target'].append(labels[j])
        return new_data

In [None]:
def collate_fn(batch):
    '''
    Merge a list of per-example dicts into one dict of batched tensors.

    input: list of dicts, each with 'input_ids', 'attention_mask' and 'target' tensors
    output: dict with the same three keys, every value stacked along a new first dimension
    '''
    return {
        field: torch.stack([example[field] for example in batch])
        for field in ('input_ids', 'attention_mask', 'target')
    }

In [None]:
# Token budget per example. The previous value of 2 truncated every input to
# two tokens, so the model never saw any dialogue text. Use the tokenizer's
# own limit, capped at 512; lower this if GPU memory is tight, since every
# example is padded to this length.
MAX_TOKENS = min(tokenizer.model_max_length, 512)

train_Dataset = Dataset(df_train, tokenizer, MAX_TOKENS)
train_loader = DataLoader(train_Dataset, batch_size=model_config['batch_size'], shuffle=True, collate_fn=collate_fn)
test_Dataset = Dataset(df_test, tokenizer, MAX_TOKENS)
# Evaluation does not benefit from shuffling; a fixed order keeps runs reproducible.
test_loader = DataLoader(test_Dataset, batch_size=model_config['batch_size'], shuffle=False, collate_fn=collate_fn)

#### Model Architecture

In [None]:
class Model(nn.Module):
    '''
    BERT encoder followed by a linear classification head.

    The hidden state of the first token of each sequence is passed through
    the linear layer and a sigmoid, so forward() returns probabilities in
    [0, 1] that can be fed straight into BCELoss.

    The encoder attribute keeps its original name `bart` so that previously
    saved state_dict files still load.

    input: num_classes - number of outputs of the classification head
    '''
    def __init__(self, num_classes=2):
        super(Model, self).__init__()
        self.bart = BertModel.from_pretrained('bert-base-uncased')
        self.classifier = nn.Linear(self.bart.config.hidden_size, num_classes)

    def forward(self, input_ids, attention_mask):
        '''
        input: input_ids, attention_mask - token ids and mask produced by the tokenizer
        output: tensor of probabilities with num_classes values per sequence
        '''
        outputs = self.bart(input_ids, attention_mask=attention_mask)
        # outputs[0] holds the per-token hidden states; keep the first token only.
        first_token_state = outputs[0][:, 0, :]
        logits = self.classifier(first_token_state)
        # Return probabilities and do NOT round here: rounding has zero
        # gradient everywhere, which would stop the loss from ever updating
        # the weights. Threshold the probabilities at evaluation time instead.
        return torch.sigmoid(logits)

In [None]:
# One output unit: the network emits a single value per example.
model = Model(1).to(device)

# AdamW over all model parameters, with the learning rate from model_config; binary cross-entropy as the loss.
optimizer = torch.optim.AdamW(model.parameters(), lr=model_config['learning_rate'])
criterion = nn.BCELoss()

##### WandB Setup

In [None]:
import wandb

# Never hard-code credentials in source. Called without a key, wandb.login()
# picks up the WANDB_API_KEY environment variable or a previously cached
# login, and otherwise prompts for the key interactively.
# NOTE: the API key that used to be embedded here has been exposed and
# should be revoked in the W&B account settings.
wandb.login()

In [None]:
# Start a W&B run in project 'assignment-4' under the 'nlp-assignments' entity,
# recording model_config as the run's configuration.
wandb.init(project='assignment-4', entity='nlp-assignments', config=model_config)

#### Model Training and Evaluation

In [None]:
def train_epoch(model, dataloader, criterion, optimizer, minibatch):
    '''
    Run one optimisation pass over `dataloader`, logging every step's loss to wandb.

    input: model, dataloader, criterion, optimizer - the usual training objects
           minibatch - running step counter used as the wandb x-axis value
    output: (minibatch counter after the last step, model)
    '''
    model.train()
    for step_batch in tqdm(dataloader):
        ids, mask, labels = (
            step_batch[field].to(device)
            for field in ('input_ids', 'attention_mask', 'target')
        )
        optimizer.zero_grad()
        # Flatten the predictions so they line up with the 1-D target vector.
        step_loss = criterion(model(ids, mask).view(-1), labels)
        step_loss.backward()
        optimizer.step()
        wandb.log({'train_minibatch_loss': step_loss.item(), 'minibatch': minibatch})
        minibatch += 1
    return minibatch, model

In [None]:
def evaluation(model, dataloader, optimizer, criterion, epoch):
    '''
    Score `model` on `dataloader` without updating any weights.

    input: model, dataloader, criterion - objects used for scoring
           optimizer, epoch - accepted for signature compatibility, not used
    output: (mean loss per batch, micro F1, macro F1, weighted F1)
    '''
    model.eval()
    loss_total = 0
    predictions = []
    references = []
    with torch.no_grad():
        for eval_batch in tqdm(dataloader):
            ids = eval_batch['input_ids'].to(device)
            mask = eval_batch['attention_mask'].to(device)
            labels = eval_batch['target'].to(device)
            batch_output = model(ids, mask)
            loss_total += criterion(batch_output.view(-1), labels).item()
            predictions.extend(batch_output.cpu().numpy())
            references.extend(labels.cpu().numpy())
    # Round the collected outputs to hard 0/1 decisions before scoring.
    predictions = np.round(np.array(predictions))
    references = np.array(references)
    micro_f1, macro_f1, weighted_f1 = (
        f1_score(references, predictions, average=mode)
        for mode in ('micro', 'macro', 'weighted')
    )
    return loss_total / len(dataloader), micro_f1, macro_f1, weighted_f1

In [None]:
def train(model,train_loader,test_loader,criterion,optimizer,num_class,epochs):
    '''
    Train for `epochs` epochs, evaluating on both loaders after each one and
    logging the results to wandb.

    input: model, train_loader, test_loader, criterion, optimizer - training objects
           num_class - accepted for signature compatibility, not used
           epochs - number of passes over train_loader
    output: (list of per-epoch training losses, list of per-epoch validation losses)
    '''
    epoch_metrics = (
        'train_epoch_loss', 'val_epoch_loss',
        'f1_micro', 'f1_macro', 'f1_weighted',
        'train_f1_micro', 'train_f1_macro', 'train_f1_weighted',
    )
    # Epoch-level metrics are plotted against 'epoch', step losses against 'minibatch'.
    wandb.define_metric('epoch')
    for metric_name in epoch_metrics:
        wandb.define_metric(metric_name, step_metric='epoch')
    wandb.define_metric('minibatch')
    wandb.define_metric('train_minibatch_loss', step_metric='minibatch')

    train_losses, test_losses = [], []
    minibatch = 1
    for epoch in range(epochs):
        minibatch, model = train_epoch(model, train_loader, criterion, optimizer, minibatch)
        val_scores = evaluation(model, test_loader, optimizer, criterion, epoch)
        train_scores = evaluation(model, train_loader, optimizer, criterion, epoch)
        train_losses.append(train_scores[0])
        test_losses.append(val_scores[0])
        wandb.log({
            'epoch': epoch,
            'train_epoch_loss': train_scores[0],
            'val_epoch_loss': val_scores[0],
            'f1_micro': val_scores[1],
            'f1_macro': val_scores[2],
            'f1_weighted': val_scores[3],
            'train_f1_micro': train_scores[1],
            'train_f1_macro': train_scores[2],
            'train_f1_weighted': train_scores[3],
        })
    return train_losses, test_losses

In [None]:
# Take the epoch count from model_config so the run trains for exactly the
# number of epochs that was recorded in the wandb run configuration (the call
# previously hard-coded 3 while the logged config said 5).
train_losses, test_losses = train(model, train_loader, test_loader, criterion, optimizer, 1, model_config['epochs'])
wandb.finish()

In [None]:
# Save only the model's state_dict (not the full module object) to 'Task2_BERT.pt'.
torch.save(model.state_dict(), 'Task2_BERT.pt')

In [None]:
def plot_loss(loss,label):
    '''
    Draw one loss curve on the current axes and display the figure.

    input: loss  - sequence of loss values, one per epoch
           label - legend entry for the curve
    '''
    axes = plt.gca()
    axes.plot(loss, label=label)
    axes.set_xlabel('Epoch')
    axes.set_ylabel('Loss')
    axes.legend()
    plt.show()
    
# Show the validation curve first, then the training curve, each in its own figure.
for curve, curve_label in ((test_losses, 'val'), (train_losses, 'train')):
    plot_loss(curve, curve_label)