In [1]:
!pip install evaluate



In [1]:
import os
import math
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch import optim
from tqdm import tqdm
from sklearn.model_selection import train_test_split

from transformers import AdamW
from transformers import BertForSequenceClassification
from transformers import AutoTokenizer, TrainingArguments, Trainer
from transformers import DataCollatorWithPadding
from torch.utils.data import Dataset, DataLoader
import evaluate
import warnings  

warnings.filterwarnings('ignore')
os.environ["TOKENIZERS_PARALLELISM"] = "false"
torch.set_float32_matmul_precision('high')

In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [3]:
def set_random_seed(seed):
    """
    Set the random seed for NumPy and PyTorch.
    Parameters:
    seed (int): The seed value to set for random number generation.
    """
    np.random.seed(seed)  # Set seed for NumPy
    torch.manual_seed(seed)  # Set seed for CPU
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)  # Set seed for GPU
        torch.cuda.manual_seed_all(seed) #Set for all GPUs

set_random_seed(42)

In [22]:
lr = 6e-4
batch_size = 128
num_epochs = 10

training_args = TrainingArguments(
    output_dir="bert_checkpoints",
    learning_rate=lr,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=num_epochs,
    logging_strategy="epoch",
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

In [23]:
df1 = pd.read_csv('data/goemotions_1.csv')
df2 = pd.read_csv('data/goemotions_2.csv')
df3 = pd.read_csv('data/goemotions_3.csv')
print(df1.shape, df2.shape, df3.shape)

# df = pd.concat((df1, df2, df3))
df = df1.copy()
print(df.shape)

train_df, test_df = train_test_split(df, test_size=.3, shuffle=True)

(70000, 37) (70000, 37) (71225, 37)
(70000, 37)


In [24]:
class TextDataset(Dataset):
    def __init__(self, df, tokenizer):
        self.tokenizer = tokenizer
        self.text = df.text.values
        emotions = df.loc[:, 'admiration':'neutral']
        self.emotion_names = emotions.columns
        self.emotions_values =emotions.values
    def __getitem__(self, idx):
        dict_idx = self.tokenizer(self.text[idx], truncation=True, padding='max_length', max_length=512)
        dict_idx['labels'] = self.emotions_values[idx]
        return dict_idx
    def __len__(self):return self.text.shape[0]

In [25]:
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
train_dataset = TextDataset(train_df, tokenizer = tokenizer)
test_dataset = TextDataset(test_df,tokenizer = tokenizer)

In [26]:
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels = len(train_dataset.emotion_names),
    output_attentions = False,
    output_hidden_states = False,
).to(device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [27]:
# freeze base model parameters
for name, param in model.base_model.named_parameters():
    param.requires_grad = False

# unfreeze base model pooling layers
for name, param in model.base_model.named_parameters():
    if "pooler" in name:
        param.requires_grad = True

In [28]:
accuracy = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    # get predictions
    predictions, labels = eval_pred
    
    # predict most probable class
    predicted_classes = np.argmax(predictions, axis=1)
    labels = np.argmax(labels, axis = 1)
    # compute accuracy
    acc = np.round(accuracy.compute(predictions=predicted_classes, references=labels)['accuracy'],3)
    
    return {"Accuracy": acc}

In [29]:
class CustomTrainer(Trainer):
    
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.epoch_predictions = []
        self.epoch_labels = []
        self.epoch_loss = []
    
    def compute_loss(self, model, inputs, return_outputs=False):
        outputs = model(
            **inputs
        )

        loss = torch.nn.CrossEntropyLoss()(outputs['logits'],
                                            inputs['labels'].type(torch.float32))
        return (loss, outputs) if return_outputs else loss

class CosineLearningRate():
    def __init__(self, max_lr, min_lr, warmupsteps, max_steps):
        self.max_lr = max_lr
        self.min_lr = min_lr
        self.warmupsteps = warmupsteps
        self.max_steps = max_steps
    
    def get_lr(self, it):
        if it < self.warmupsteps:
            return self.max_lr * (it + 1)
        if it > self.max_steps:
            return self.min_lr
        decay_ratio = (it - self.warmupsteps) / (self.max_steps - self.warmupsteps)
        assert 0 <= decay_ratio <= 1
        coeff = 0.5 * (1.0 + math.cos(math.pi * decay_ratio))
        return self.min_lr + coeff * (self.max_lr - self.min_lr)

In [32]:
optimizer = torch.optim.AdamW(model.parameters(), betas = (0.9, 0.95), eps = 1e-8)
coslr = CosineLearningRate(max_lr = lr, 
                           min_lr = lr * .1, 
                           warmupsteps = 10, 
                           max_steps = 50)
lr_scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer = optimizer, lr_lambda = coslr.get_lr)
trainer = CustomTrainer(model = model, 
                 args = training_args,
                 train_dataset = train_dataset,
                 eval_dataset = test_dataset,
                 compute_metrics = compute_metrics,
                 optimizers = (optimizer, lr_scheduler),
                 )
trainer.train()

  0%|          | 0/3830 [00:00<?, ?it/s]

ValueError: Expected input batch_size (128) to match target batch_size (3584).

In [31]:
# apply model to validation dataset
predictions = trainer.predict(test_dataset)

# Extract the logits and labels from the predictions object
logits = predictions.predictions
labels = predictions.label_ids

# Use your compute_metrics function
metrics = compute_metrics((logits, labels))
print(metrics)

ValueError: Expected input batch_size (128) to match target batch_size (3584).

In [None]:
emotions = df.loc[:, 'admiration':'neutral'].columns
sample = torch.tensor(test_dataset[4]['input_ids']).reshape(1, -1).to(device)
print(tokenizer.decode(test_dataset[4]['input_ids'])) 
print(np.argmax(test_dataset[4]['labels']))
print(torch.argmax(model(sample)[0]))

NameError: name 'df' is not defined

In [None]:
torch.tensor(test_dataset[4]['input_ids']).to(device)

NameError: name 'test_dataset' is not defined

In [None]:
!ls

bert_checkpoints
