In [None]:
import torch
from datasets import load_dataset
from transformers import (
    RobertaTokenizerFast,
    RobertaForSequenceClassification,
    TrainingArguments,
    Trainer,
    AutoConfig,
)
# from sklearnex import patch_sklearn
# patch_sklearn()

import json
import pandas as pd
import numpy as np

# Fixed length that the per-dialogue label lists are padded to later in this notebook.
max_length = 128

In [None]:
# Location of the training JSON file read in the next cell.
PATH = '/kaggle/input/emotion-dataset/train_file.json'

# Checkpoint name handed to the `from_pretrained` calls further down.
model_id = "roberta-large"

# Use the GPU when torch can see one, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [None]:
# Parse the training JSON file.
# The encoding is pinned explicitly: JSON text is UTF-8, whereas a bare
# `open()` falls back to the platform's locale encoding.
with open(PATH, encoding="utf-8") as f:
    data = json.load(f)

# Build the DataFrame once the file handle has been released; it only needs
# the already-parsed `data`.
df = pd.DataFrame(data)


In [None]:
# Preview the first rows of the DataFrame built from the JSON file.
df.head()

In [None]:
# Literal marker strings placed around and between the utterances of a dialogue.
SOS = '<s>'
SEP = '[SEP]'
EOS = '</s>'

text = []          # one flattened string per dialogue
sentence_len = []  # whitespace-separated word count per dialogue

# NOTE(review): the tokenizer is later called with its default special-token
# handling, which presumably already adds `<s>`/`</s>`, and `[SEP]` does not
# look like a RoBERTa special token - confirm these literal markers are intended.
for row_pos, (dialogue_speakers, dialogue_utterances) in enumerate(
    zip(df['speakers'], df['utterances'])
):
    turns = list(zip(dialogue_speakers, dialogue_utterances))

    # "<s> speaker:utterance [SEP] speaker:utterance [SEP] "
    sentence = SOS + ' '
    for who, said in turns:
        sentence = sentence + who + ':' + said + ' ' + SEP + ' '

    # Total number of whitespace-separated words across the dialogue's utterances.
    length = sum(len(said.split()) for _, said in turns)

    # Surface dialogues that contain exactly two words in total for inspection.
    if length == 2:
        print(sentence)
        print(df.iloc[row_pos])

    sentence_len.append(length)
    text.append(sentence + EOS)

In [None]:
# Wrap the flattened dialogue strings in a single-column DataFrame.
text_df = pd.DataFrame({'text': text})

# Number of dialogue strings vs. number of distinct ones (a gap means duplicates).
total_count = len(text)
unique_count = len(set(text))
print(total_count, unique_count)

text_df.head()

In [None]:
# Emotion name -> integer class id. The position in the list is the id, so
# 'pad' is 0 and 'neutral' is 7.
label_encoding = {
    name: class_id
    for class_id, name in enumerate(
        ['pad', 'surprise', 'sadness', 'anger', 'fear', 'disgust', 'joy', 'neutral']
    )
}


In [None]:
# Encode every dialogue's emotion names as class ids and force each label list
# to exactly `max_length` entries so the lists can be stacked into one tensor
# per batch:
#   * shorter lists are right-padded with the 'pad' id;
#   * longer lists are truncated. Previously they were left over-long, because
#     multiplying a list by a negative count yields an empty pad, which
#     produced ragged labels.
y_train = []
for emotions in df["emotions"]:
    encoded = [label_encoding[name] for name in emotions][:max_length]
    encoded.extend([label_encoding['pad']] * (max_length - len(encoded)))
    y_train.append(encoded)

# Model inputs: the flattened dialogue strings, row-aligned with `y_train`.
x_train = text_df['text']

In [None]:
# Spot-check the encoded, padded label lists of the first two dialogues.
y_train[0],y_train[1]

In [None]:
from torch import nn

# Load the pretrained configuration, tokenizer and model weights for `model_id`.
config = AutoConfig.from_pretrained(model_id)
tokenizer = RobertaTokenizerFast.from_pretrained(model_id)
model = RobertaForSequenceClassification.from_pretrained(model_id, config=config)

# Replace the stock classification head with a single linear projection.
#   * One output per entry of `label_encoding` (ids 0..7). The previous
#     hard-coded 7 outputs left the highest id ('neutral' = 7) out of range
#     for the loss.
#   * No Softmax layer: the notebook trains with nn.CrossEntropyLoss, which
#     expects raw logits and applies log-softmax itself. The removed
#     `nn.Softmax()` also had no `dim`, relying on deprecated implicit axis
#     selection.
# The layer stays wrapped in nn.Sequential so its parameters keep the
# `classifier.0.*` names in the state dict.
# NOTE(review): the stock head presumably pools the first token before
# classifying, whereas this plain linear layer is applied to whatever tensor
# the model hands to `classifier` - confirm the resulting logits shape.
model.classifier = nn.Sequential(
    nn.Linear(model.config.hidden_size, len(label_encoding)),
)

# NOTE(review): lr=5e-4 looks high for fine-tuning a large pretrained
# transformer - confirm it is intended.
optimizer = torch.optim.AdamW(model.parameters(), lr=0.0005)

In [None]:
# x_train_tokenized = tokenizer(text_df["text"].tolist(),padding=True,truncation=True,max_length=128)
# x_train_tokenized[0]

In [None]:
# Move the model to the selected device, then show the module structure as the
# cell's output.
model = model.to(device)
model

In [None]:
# class Dataset(torch.utils.data.Dataset):
#     def __init__(self,encodings,labels=None):
#         self.labels = labels
#         self.encodings = encodings
    
#     def __getitem__(self,idx):
#         item = {key:torch.tensor(val[idx]) for key,val in self.encodings.items()}
#         if self.labels is not None:
#             item["labels"]  = torch.tensor(self.labels[idx])
#         return item
# #             return input_data["input_ids"],input_data["attention_mask"],self.labels[idx]


#     def __len__(self):
#         return len(self.encodings["input_ids"])

In [None]:
class Dataset(torch.utils.data.Dataset):
    """Map-style dataset that tokenizes one text per access and pairs it with its labels.

    Args:
        data: Indexable collection of input strings; ``data[idx]`` must return
            the text of sample ``idx``.
        tokenizer: Callable invoked as ``tokenizer(text, max_length=...,
            truncation=True, return_tensors="pt", padding="max_length")`` that
            returns a mapping with ``'input_ids'`` and ``'attention_mask'``
            tensors carrying a leading batch axis of size 1.
        labels: Indexable collection; ``labels[idx]`` is the label (or label
            list) of sample ``idx``.
        max_length: Length every text is padded/truncated to by the tokenizer.
            Defaults to 128, the value that used to be hard-coded, so existing
            three-argument callers behave exactly as before.
    """

    def __init__(self, data, tokenizer, labels, max_length=128):
        self.data = data
        self.tokenizer = tokenizer
        self.labels = labels
        self.max_length = max_length

    def __len__(self):
        """Return the number of samples (the length of ``data``)."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``(input_ids, attention_mask, labels)`` for sample ``idx``."""
        encoded = self.tokenizer(
            self.data[idx],
            max_length=self.max_length,
            truncation=True,
            return_tensors="pt",
            padding="max_length",
        )
        # `return_tensors="pt"` yields a leading batch axis of size 1; drop it
        # so the DataLoader can add the real batch axis when collating.
        input_ids = encoded['input_ids'].squeeze(0)
        attention_mask = encoded['attention_mask'].squeeze(0)
        # Labels carry no such batch axis, so they are not squeezed: the old
        # `squeeze(0)` was a no-op for normal label lists and would collapse a
        # length-1 label list into a scalar.
        labels = torch.tensor(self.labels[idx])
        return input_ids, attention_mask, labels

In [None]:
# Build the training dataset from the dialogue texts, the tokenizer and the
# padded label lists.
train_dataset = Dataset(x_train,tokenizer,y_train)

In [None]:
# Sanity check: print the length of every tensor in the first sample only.
for sample in train_dataset:
    for tensor in sample:
        print(len(tensor))
    break

In [None]:
from torch.utils.data import DataLoader
# Serve the training set in batches of 16, in dataset order.
# NOTE(review): shuffle=False on the training loader means every epoch sees the
# samples in the same order - confirm this is intentional.
train_dataloader=DataLoader(train_dataset,batch_size=16,shuffle=False)

In [None]:
# Loss used by the training loop below.
# NOTE(review): the label lists are padded with id 0 ('pad') and no
# `ignore_index` is set here, so padded positions presumably count towards the
# loss - confirm whether that is intended.
loss_fn = nn.CrossEntropyLoss()

In [None]:
from tqdm.notebook import tqdm
import gc
# Per-epoch loss history, appended to by the training loop.
train_losses=[]
val_losses=[]


def train_epoch(model, optimizer, epoch):
    """Run one optimisation pass over the module-level ``train_dataloader``.

    Relies on the notebook globals ``train_dataloader``, ``device``,
    ``loss_fn``, ``tqdm`` and ``gc``, and appends the epoch's average loss to
    ``train_losses``.

    Args:
        model: Module called as ``model(input_ids, attention_mask=...)`` whose
            result exposes a ``.logits`` tensor with the class scores on the
            last axis.
        optimizer: Optimizer over ``model``'s parameters.
        epoch: Epoch number, used only for progress/log messages.

    Returns:
        The mean of the per-batch losses (``nan`` if the loader yields no
        batches).
    """
    model.train()
    running_loss = 0.0
    progress = tqdm(train_dataloader, desc=f"Epoch:{epoch}", total=len(train_dataloader), leave=False)
    for batch in progress:
        # Each batch is (input_ids, attention_mask, labels).
        input_ids = batch[0].to(device)
        attention_mask = batch[1].to(device)
        labels = batch[2].to(device)

        optimizer.zero_grad()
        logits = model(input_ids, attention_mask=attention_mask).logits
        # The loss needs (N, num_classes) scores against (N,) class ids, so
        # every leading axis is folded into N. The previous code flattened the
        # logits to 1-D and scaled them by 7, which destroyed the class axis
        # and could not be matched against the integer label tensor.
        loss = loss_fn(logits.reshape(-1, logits.size(-1)), labels.reshape(-1))
        loss.backward()
        optimizer.step()

        batch_loss = loss.item()
        running_loss += batch_loss
        # Report the batch loss as is. It used to be divided by `len(batch)`,
        # which is the number of tensors in the batch tuple (3), not the
        # batch size.
        progress.set_postfix({'training_loss': f'{batch_loss:.3f}'})

        # Release per-step tensors eagerly to keep peak memory down.
        del input_ids, attention_mask, labels, logits, loss
        gc.collect()
        torch.cuda.empty_cache()

    # `len(train_dataloader)` is the batch count. The old
    # `len(list(train_dataloader))` iterated (and re-tokenized) the entire
    # dataset a second time just to count batches.
    num_batches = len(train_dataloader)
    x = running_loss / num_batches if num_batches else float("nan")
    train_losses.append(x)
#     wandb.log({'epoch':epoch,'train_loss':x})
    tqdm.write(f"Epoch:{epoch}, Avg Train Loss: {x}")
    gc.collect()
    torch.cuda.empty_cache()
    return x



def evaluate(model,val_dataloader):
    """Run ``model`` over ``val_dataloader`` without gradients and collect its outputs.

    Args:
        model: Module called as ``model(input_ids, attention_mask=...)`` whose
            result exposes a ``.logits`` tensor. It is switched to eval mode.
        val_dataloader: Iterable of batches; element 0 of each batch is used as
            the input ids and element 1 as the attention mask.

    Returns:
        A flat list with every logit of every batch, cast to float64 and
        multiplied by 5, in iteration order.
    """
    model.eval()
    collected = []
    with torch.no_grad():
        for batch in val_dataloader:
            ids = batch[0].to(device)
            mask = batch[1].to(device)
            output = model(ids, attention_mask=mask)
            # NOTE(review): the fixed factor 5 is kept as is; its purpose is
            # not evident from this function - confirm.
            scaled = output.logits.to(torch.float64).view(-1) * 5
            collected += list(scaled.detach().cpu().numpy())
    return collected

In [None]:
# Release memory held over from earlier cells before training starts.
gc.collect()
torch.cuda.empty_cache()

for epoch in range(1, 2):
    train_loss = train_epoch(model, optimizer, epoch)

    # Checkpoint immediately after the epoch so that nothing running later in
    # the loop can cost the trained weights.
    torch.save(model.state_dict(), f"model1A_epoch{epoch}.pth")

    # No validation step is run here. `evaluate` requires a `val_dataloader`
    # argument (the former `evaluate(model)` call raised TypeError before the
    # checkpoint was written), this notebook defines no validation loader, and
    # `evaluate` returns a list of model outputs rather than a scalar loss, so
    # it could not be formatted with `:.3f` either.
    print(f"Epoch: {epoch}, Train loss: {train_loss:.3f}")