In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Background on the corpus and its annotation scheme: https://github.com/JULIELab/EmoBank
df = pd.read_csv("emobank.csv")

# Quick look at the sentence text next to its V, A and D score columns.
print(df.loc[:, ['text', 'V', 'A', 'D']].head())

# Pairwise view of the three score columns; y=1.02 lifts the title just above
# the top edge of the figure so it does not overlap the upper row of panels.
sns.pairplot(data=df.loc[:, ['V', 'A', 'D']])
plt.suptitle("VAD Distributions", y=1.02)
plt.show()

In [None]:
from sklearn.model_selection import train_test_split

# Keep only the model input (text) and the three regression target columns.
df = df.loc[:, ['text', 'V', 'A', 'D']]

# Hold out 10% of the rows for evaluation; the fixed seed makes the split reproducible.
train_df, test_df = train_test_split(
    df,
    random_state=42,
    test_size=0.1,
)

In [None]:
# Rows with a missing text or score cannot be tokenized or used as regression
# targets, so drop them from BOTH splits: the held-out split is tokenized and
# evaluated as well, and a single NaN there would break or poison evaluation.
# Re-assigning (instead of `inplace=True`) keeps this cell safe to re-run and
# avoids pandas' SettingWithCopyWarning on frames returned by train_test_split.
print(len(train_df))
train_df = train_df.dropna(axis=0)
test_df = test_df.dropna(axis=0)
print(len(train_df))

In [None]:
from transformers import AutoTokenizer
from torch.utils.data import Dataset
import torch
import numpy as np

# Tokenizer for the "bert-base-uncased" checkpoint. VADDataset reads this
# module-level name directly, so it must be defined before any dataset is built.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

class VADDataset(Dataset):
    """Torch dataset pairing tokenized sentences with their V/A/D regression targets.

    The whole frame is tokenized once at construction time using the
    module-level ``tokenizer``; individual samples are turned into tensors
    lazily when they are indexed.
    """

    def __init__(self, df):
        """Tokenize ``df["text"]`` and cache the ``V``/``A``/``D`` columns as float32.

        Args:
            df: DataFrame with a ``text`` column and ``V``, ``A`` and ``D``
                columns that can be cast to float32.
        """
        sentences = df["text"].tolist()
        # Truncate to at most 128 tokens; padding=True pads every sequence to
        # the longest one in this call (i.e. across the whole frame).
        self.encodings = tokenizer(sentences, max_length=128, padding=True, truncation=True)
        targets = df[['V', 'A', 'D']].values
        self.labels = targets.astype(np.float32)

    def __len__(self):
        """Return the number of samples (one per label row)."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return sample ``idx`` as a dict of tensors plus a float32 ``labels`` entry."""
        sample = {}
        for field, rows in self.encodings.items():
            sample[field] = torch.tensor(rows[idx])
        sample['labels'] = torch.tensor(self.labels[idx], dtype=torch.float32)
        return sample

# Each split is tokenized once here, at construction time.
train_dataset = VADDataset(df=train_df)
test_dataset = VADDataset(df=test_df)

In [None]:
from transformers import AutoModelForSequenceClassification

# Pretrained BERT encoder with a 3-output sequence-classification head (one
# output per V/A/D target). problem_type="regression" selects a regression
# (MSE) loss against the float labels instead of cross-entropy.
model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=3,
    problem_type="regression",
)

In [None]:
from transformers import Trainer, TrainingArguments

import inspect

# Single source of truth for where checkpoints, logs and the final export go.
# NOTE(review): assumes Google Drive is already mounted at /content/drive —
# no mount cell is visible in this notebook; confirm before running.
OUTPUT_DIR = "/content/drive/MyDrive/vad_regression"
LOGGING_DIR = f"{OUTPUT_DIR}/logs"
EXPORT_DIR = f"{OUTPUT_DIR}/vad-bert"

# `evaluation_strategy` was renamed to `eval_strategy` in newer transformers
# releases and the old keyword is no longer accepted by current versions.
# Pick whichever name the installed version exposes so the cell runs on both.
_training_arg_names = inspect.signature(TrainingArguments.__init__).parameters
EVAL_STRATEGY_KWARG = (
    "eval_strategy" if "eval_strategy" in _training_arg_names else "evaluation_strategy"
)

training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    weight_decay=0.01,
    logging_dir=LOGGING_DIR,
    logging_strategy="epoch",
    report_to="tensorboard",
    # Evaluate on the held-out split at the end of every epoch.
    **{EVAL_STRATEGY_KWARG: "epoch"},
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

trainer.train()

# Persist the fine-tuned weights and the tokenizer side by side so the export
# directory can be reloaded on its own with `from_pretrained`.
model.save_pretrained(EXPORT_DIR)
tokenizer.save_pretrained(EXPORT_DIR)

In [None]:
from huggingface_hub import notebook_login

# Interactive Hugging Face Hub login; run this before the push_to_hub calls
# so the uploads are authenticated.
notebook_login()

In [None]:
# Upload the fine-tuned model first, then its tokenizer, to the same Hub repo
# so the repository is loadable on its own.
for artifact in (model, tokenizer):
    artifact.push_to_hub("RobroKools/vad-bert")