## Data pre-processing

In [23]:
import tokenizers
import torch
from tqdm.auto import tqdm
from datasets import load_dataset
from torch.utils.data import DataLoader

In [24]:
dataset = load_dataset("dair-ai/emotion")

In [25]:
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 16000
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 2000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 2000
    })
})

In [26]:
dataset['train'][:5]

{'text': ['i didnt feel humiliated',
  'i can go from feeling so hopeless to so damned hopeful just from being around someone who cares and is awake',
  'im grabbing a minute to post i feel greedy wrong',
  'i am ever feeling nostalgic about the fireplace i will know that it is still on the property',
  'i am feeling grouchy'],
 'label': [0, 0, 3, 2, 3]}

In [27]:
import tiktoken

In [127]:
token_encoder = tiktoken.get_encoding("gpt2")
max_sqe_len = 36

In [142]:
token_encoder.n_vocab

50257

In [203]:
def encode_text(x):
    text = str(x['text'])
    output = token_encoder.encode(text)
    output = output if len(output) <= max_sqe_len else output[:max_sqe_len]
    if len(output) < max_sqe_len:
        for _ in range(max_sqe_len):
            output.append(0)
    result = {
        'text': [text],
        'encoded_text': [output],
        'label': [x['label']]
    }
    return result

In [204]:
tokenized_dataset_train = dataset['train'].map(encode_text)
tokenized_dataset_test = dataset['test'].map(encode_text)
tokenized_dataset_validation = dataset['validation'].map(encode_text)

Map: 100%|██████████| 16000/16000 [00:04<00:00, 3346.68 examples/s]
Map: 100%|██████████| 2000/2000 [00:00<00:00, 2919.74 examples/s]
Map: 100%|██████████| 2000/2000 [00:00<00:00, 3423.97 examples/s]


In [205]:
len(tokenized_dataset_train[0]['encoded_text'])

1

In [206]:
train_dataloader = DataLoader(tokenized_dataset_train, batch_size=16, shuffle=True)
test_dataloader = DataLoader(tokenized_dataset_test, shuffle=True)
val_dataloader = DataLoader(tokenized_dataset_validation, batch_size=16, shuffle=True)

## Model Building

In [139]:
import sys
sys.path.append('..')

In [140]:
from model.transformers import EncoderClassifier
import torch

In [207]:
config = {
    "num_layers": 4,
    "vocab_size": token_encoder.n_vocab,
    "embed_dims": 768,
    "max_seq_len": max_sqe_len,
    "n_segments": 5,
    "heads": 8,
    "dropout": 0.3,
    "device": "cpu",
    "ff_layer_sizes": [768, 256, 768],
    "batch_size": 16,
    "num_classes": 5
}

In [208]:
model = EncoderClassifier(config)

In [209]:
from tqdm.autonotebook import tqdm
from torch.optim import Adam

optim = Adam(model.parameters(), lr=1e-4)
criterion = torch.nn.CrossEntropyLoss()

In [221]:
z = [0 for _ in range(config['num_classes'])]
z[labels[0]-1] = 1
z

[0, 0, 0, 0, 1]

In [224]:
labels_

[tensor([5])]

In [223]:
epochs = 10
step = 0
device = config['device']

for epoch in range(epochs):
    loop = tqdm(train_dataloader, leave=True)
    
    model.train()
    
    total_loss = 0.0
    correct_predictions = 0
    for batch in loop:
        optim.zero_grad()

        inputs = [t for t in batch['encoded_text']]
        inputs = torch.Tensor(inputs).to(device, dtype=torch.int)
        inputs = inputs.transpose(0, 1)
        
        labels_ = [t for t in batch['label']]
        labels = torch.LongTensor(labels_).to(device)
        
        outputs = model(inputs)
        outputs = outputs[0]
        # labels = 
        loss = criterion(outputs, labels)
        
        loss.backward()
        optim.step()

        loop.set_description(f'Epoch {epoch}')
        loop.set_postfix(loss=loss.item())
        step += 1
        
        total_loss += loss.item()
        _, predicted = torch.max(outputs, 1)
        correct_predictions += (predicted == labels).sum().item()

    average_loss = total_loss / len(train_dataloader)
    accuracy = correct_predictions / 16000
    
    print(f"Epoch {epoch + 1}/{epochs}, Loss: {average_loss:.4f}, Accuracy: {accuracy:.4f}")


Epoch 0:   0%|          | 29/16000 [00:43<6:37:52,  1.49s/it, loss=1.61]


IndexError: Target 5 is out of bounds.

In [161]:
batch['encoded_text']

AttributeError: 'list' object has no attribute 'to'