## Data pre-processing

In [1]:
import tokenizers
import torch
from tqdm.auto import tqdm
from datasets import load_dataset
from torch.utils.data import DataLoader

torch.cuda.empty_cache() 

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load every available split of the "dair-ai/emotion" dataset via Hugging Face `datasets`.
dataset = load_dataset("dair-ai/emotion")

In [3]:
# Show the splits together with their columns and row counts.
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 16000
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 2000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 2000
    })
})

In [4]:
# Peek at the first five training rows: raw text plus integer class labels.
dataset['train'][:5]

{'text': ['i didnt feel humiliated',
  'i can go from feeling so hopeless to so damned hopeful just from being around someone who cares and is awake',
  'im grabbing a minute to post i feel greedy wrong',
  'i am ever feeling nostalgic about the fireplace i will know that it is still on the property',
  'i am feeling grouchy'],
 'label': [0, 0, 3, 2, 3]}

In [5]:
import tiktoken

In [6]:
# tiktoken's "gpt2" encoding; encode_text() below uses it to turn text into token ids.
token_encoder = tiktoken.get_encoding("gpt2")

In [7]:
# Vocabulary size of the encoder; reused later as `vocab_size` in the model config.
token_encoder.n_vocab

50257

In [8]:
def encode_text(x, num_classes=5, max_seq_len=36, pad_id=0, encoder=None):
    """Tokenise one dataset row and one-hot encode its label.

    Args:
        x: Mapping with a 'text' entry (converted with ``str``) and an
            integer 'label' entry.
        num_classes: Length of the one-hot label vector.
        max_seq_len: Exact length of the returned token list; longer
            encodings are truncated and shorter ones are right-padded.
        pad_id: Token id appended as padding.
        encoder: Object exposing ``encode(text)`` that returns a list of
            token ids. Defaults to the notebook-level ``token_encoder``.

    Returns:
        dict with keys 'text' (the string), 'encoded_text' (list of exactly
        ``max_seq_len`` ids) and 'label' (one-hot list of ``num_classes``).
    """
    # Resolve the encoder lazily so the notebook global is only needed when
    # the caller does not supply one.
    enc = token_encoder if encoder is None else encoder

    text = str(x['text'])

    # Slicing already handles the "short enough" case, so no length test is
    # needed before truncating.
    tokens = list(enc.encode(text))[:max_seq_len]
    # NOTE(review): with the GPT-2 vocabulary, id 0 is presumably also a real
    # token, so padding is indistinguishable from it - confirm the model
    # masks or tolerates this.
    tokens = tokens + [pad_id] * (max_seq_len - len(tokens))

    # The hot index is `label - 1`, so label 0 wraps around to the LAST slot
    # through Python's negative indexing.
    # NOTE(review): the dair-ai/emotion dataset card lists six labels (0-5);
    # with five slots, labels 0 and 5 would both land in the last slot.
    # Confirm, and if so widen num_classes here AND in the model config
    # together and index with the label directly.
    one_hot = [0] * num_classes
    one_hot[x['label'] - 1] = 1

    return {
        'text': text,
        'encoded_text': tokens,
        'label': one_hot,
    }


In [9]:
# Apply encode_text to every row of each split (same order as before:
# train, test, validation) and bind the results to the three split names.
tokenized_dataset_train, tokenized_dataset_test, tokenized_dataset_validation = (
    dataset[split_name].map(encode_text)
    for split_name in ('train', 'test', 'validation')
)

In [10]:
# Sanity check: one encoded example should have exactly max_seq_len (36) token ids.
len(tokenized_dataset_train[8]['encoded_text'])

36

In [11]:
# Shuffled iterators over the three tokenised splits.
train_dataloader = DataLoader(
    dataset=tokenized_dataset_train,
    shuffle=True,
    batch_size=512,
)
# No batch_size is passed for the test split, so DataLoader's default applies.
# NOTE(review): confirm this is intentional and not an omission.
test_dataloader = DataLoader(
    dataset=tokenized_dataset_test,
    shuffle=True,
)
val_dataloader = DataLoader(
    dataset=tokenized_dataset_validation,
    shuffle=True,
    batch_size=512,
)

## Model Building

In [12]:
import torch

# Report what CUDA device (if any) PyTorch can see.
# The device queries raise when no GPU/driver is present, so they are only
# issued when CUDA is available. This keeps "Restart & Run All" working on
# CPU-only machines; the printed output on a CUDA machine is unchanged.
print(torch.cuda.is_available())
if torch.cuda.is_available():
    print(torch.cuda.current_device())
    print(torch.cuda.get_device_name(torch.cuda.current_device()))

True
0
NVIDIA GeForce GTX 1650


In [13]:
import sys
# Add the parent directory to the import path; presumably this is where the
# project-local `model` package imported in the next cell lives.
sys.path.append('..')

In [14]:
from model.transformers import EncoderClassifier
import torch

In [15]:
# Keep these in sync with the values encode_text() uses when building the
# one-hot labels and the fixed-length token lists.
num_classes = 5
max_seq_len = 36

In [26]:
# Hyper-parameters handed to EncoderClassifier. Vocabulary size and sequence
# length come from the tokenizer and the pre-processing constants above.
config = dict(
    num_layers=4,
    vocab_size=token_encoder.n_vocab,
    embed_dims=768,
    max_seq_len=max_seq_len,
    n_segments=5,
    heads=8,
    dropout=0.3,
    device="cpu",
    ff_layer_sizes=[768, 256, 768],
    batch_size=512,
    num_classes=5,
)

In [27]:
# Build the project's EncoderClassifier from the config dict defined above.
model = EncoderClassifier(config)

In [28]:
# model_file = "./models/model_epoch_62.pth"
# model.load_state_dict(torch.load(model_file))

# em = token_encoder.encode("i feel low energy i m just thirsty")
# n = 36 - len(em)
# inp = torch.Tensor(em + [0 for _ in range(n)]).int()
# inp = inp.reshape((36,1))

# o = model(inp)
# o

In [29]:
# Shell out to nvidia-smi to inspect current GPU memory use and utilisation.
!nvidia-smi

Wed Nov 29 09:19:36 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 527.41       Driver Version: 527.41       CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name            TCC/WDDM | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA GeForce ... WDDM  | 00000000:01:00.0 Off |                  N/A |
| N/A   72C    P8     3W /  N/A |   3936MiB /  4096MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [30]:
from tqdm.autonotebook import tqdm
from torch.optim import Adam

# Loss and optimiser for training; the two objects are independent of each
# other, so their creation order does not matter.
criterion = torch.nn.CrossEntropyLoss()
optim = Adam(params=model.parameters(), lr=1e-4)

In [None]:
def _run_epoch(model, dataloader, criterion, device, optimizer=None, desc=None):
    """Run one full pass over `dataloader` and return (mean batch loss, accuracy).

    Args:
        model: Module called as ``model(inputs)``.
        dataloader: Yields dict batches with 'encoded_text' and 'label'
            entries, each a sequence of tensors that can be stacked.
        criterion: Loss callable taking ``(outputs, targets)``.
        device: Device the batch tensors are moved to.
        optimizer: When given, the model is put in train mode and updated
            after every batch. When ``None``, the model is put in eval mode
            and gradients are disabled.
        desc: Optional progress-bar description.

    Returns:
        Tuple ``(average_loss, accuracy)``: the mean of the per-batch losses
        and the fraction of examples whose arg-max prediction matches the
        arg-max of the target.
    """
    training = optimizer is not None
    model.train(training)  # train(False) is the same switch as eval()

    total_loss = 0.0
    correct = 0
    seen = 0

    # Autograd is only needed when weights are updated; evaluation skips graph
    # construction to save memory and time.
    with torch.set_grad_enabled(training):
        for batch in tqdm(dataloader, leave=True, desc=desc):
            # Presumably default collation yields one tensor per token
            # position / class slot, so stacking puts the batch on dim 1
            # - TODO confirm against the model's expected input layout.
            inputs = torch.stack(batch['encoded_text']).int().to(device)
            # Transposed so that dim 1 is the axis the arg-max is taken over
            # below, matching `outputs`.
            targets = torch.stack(batch['label']).float().to(device).transpose(0, 1)

            if training:
                optimizer.zero_grad()

            outputs = model(inputs)
            loss = criterion(outputs, targets)

            if training:
                loss.backward()
                optimizer.step()

            total_loss += loss.item()
            matches = outputs.argmax(dim=1) == targets.argmax(dim=1)
            correct += matches.sum().item()
            # Count examples, not batches: accuracy must be normalised by the
            # number of predictions that were actually compared.
            seen += matches.numel()

    # max(..., 1) avoids a ZeroDivisionError on an empty dataloader.
    return total_loss / max(len(dataloader), 1), correct / max(seen, 1)


epochs = 100
step = 0  # running count of optimisation steps across all epochs
device = config['device']
acc_list = []
loss_list = []
val_acc_list = []
val_loss_list = []

for epoch in range(epochs):
    # Training pass (updates weights).
    average_loss, accuracy = _run_epoch(
        model, train_dataloader, criterion, device,
        optimizer=optim, desc=f"Epoch {epoch}:",
    )
    step += len(train_dataloader)
    acc_list.append(accuracy)
    loss_list.append(average_loss)

    print(f"Epoch {epoch + 1}/{epochs}, Training Loss: {average_loss:.4f}, Training Accuracy: {accuracy:.4f}")

    # Validation pass (no weight updates, no gradient tracking).
    average_val_loss, val_accuracy = _run_epoch(model, val_dataloader, criterion, device)
    val_acc_list.append(val_accuracy)
    val_loss_list.append(average_val_loss)

    print(f"Epoch {epoch + 1}/{epochs}, Validation Loss: {average_val_loss:.4f}, Validation Accuracy: {val_accuracy:.4f}")

    # Save the model after each epoch if needed
    # torch.save(model.state_dict(), f'models/model_epoch_{epoch + 1}.pth')