# Overview
- Here we will be training a pretrained model. In this case the model is `bert-base-multilingual-uncased` from `Huggingface`.
- The tokenizer we will be using is `BertTokenizer`
- We will be using the `GoEmotions` dataset that was preprocessed earlier for training purposes.



In [1]:
import numpy as np
import pandas as pd
from sklearn import metrics
import transformers
import torch
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, BertModel, BertConfig
from torch import cuda
!pip install torchviz
from torchviz import make_dot
device = 'cuda:0' if cuda.is_available() else 'cpu'



In [2]:
from google.colab import drive
drive.mount("/content/drive/")

Mounted at /content/drive/


In [2]:
X_train, X_test, X_val = np.load("/content/drive/MyDrive/NLP/Datasets/goemotion_train_text_processed.npz", allow_pickle=True), np.load("/content/drive/MyDrive/NLP/Datasets/goemotion_test_text_processed.npz", allow_pickle=True), np.load("/content/drive/MyDrive/NLP/Datasets/goemotion_val_text_processed.npz", allow_pickle=True)
X_train, X_test, X_val = X_train.f.arr_0, X_test.f.arr_0, X_val.f.arr_0
y_train, y_test, y_val = np.load("/content/drive/MyDrive/NLP/Datasets/goemotion_train_labels.npz", allow_pickle=True), np.load("/content/drive/MyDrive/NLP/Datasets/goemotion_test_labels.npz", allow_pickle=True), np.load("/content/drive/MyDrive/NLP/Datasets/goemotion_val_labels.npz", allow_pickle=True)
y_train, y_test, y_val = y_train.f.arr_0, y_test.f.arr_0, y_val.f.arr_0

### 1. Defining parameters

In [3]:
MAX_LEN = 50
BATCH_SIZE = 32
EPOCHS = 4
LEARNING_RATE = 1e-05
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-uncased')

### 2. Defining Custom Dataset using `Dataset` class
Here in constructor
- X : The list of sentences
- label : The class labeled in the dataset against each sentence
- tokenizer : The tokenizer to be used in embedding the words
- max_len : Length of the sequence

In `__getitem__(self, index)` method
- the input text string returs `input_ids`, `attention_mask`, `token_type_ids` as input arguments for the `model`.

In [4]:
class CustomDataset(Dataset):
    def __init__(self, X, label, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.text = X
        a = label
        labels = torch.zeros((a.size, a.max() + 1))
        labels[np.arange(a.size), a] = 1
        self.targets = labels.type(torch.float).to(device)
        self.max_len = max_len

    def __len__(self):
        return len(self.text)

    def __getitem__(self, index):
        text = str(self.text[index])
        text = " ".join(text.split())

        inputs = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            return_token_type_ids=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs["token_type_ids"]


        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'targets': self.targets[index].clone().detach().requires_grad_(True)
        }

In [5]:
training_set = CustomDataset(X_train, y_train, tokenizer, MAX_LEN)
testing_set = CustomDataset(X_test, y_test, tokenizer, MAX_LEN)

### 3. Perparing Datasets for training in batches of size 4


In [14]:
train_params = {'batch_size': BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

test_params = {'batch_size': 1,
                'shuffle': True,
                'num_workers': 0
                }

training_loader = DataLoader(training_set, **train_params)
testing_loader = DataLoader(testing_set, **test_params)

### 4. Defining and Initiating the `model`
- Using weights from `HuggingFace` `BERT` multilingual uncased version
- On the original output from `BERT`, it has a `logits` layer with `768` features
- To fine tune it a `Dropout` layer and a `Linear` layer has been added sequentially
- Final layer has `28` outputs



In [11]:
class BERTClass(torch.nn.Module):
    def __init__(self):
        super(BERTClass, self).__init__()
        self.l1 = transformers.BertModel.from_pretrained('bert-base-multilingual-uncased')
        self.l2 = torch.nn.Dropout(0.3)
        self.l3 = torch.nn.Linear(768, 28)

    def forward(self, ids, mask, token_type_ids):
        _, output_1= self.l1(ids, attention_mask = mask, token_type_ids = token_type_ids, return_dict=False)
        output_2 = self.l2(output_1)
        output = self.l3(output_2)
        return output

model = BERTClass()
model.to(device)

BERTClass(
  (l1): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(105879, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=T

### 5. Saving a view of the `model` to be finetuned

In [64]:
# Create dummy input tensors
dummy_ids = torch.randint(0, 100, (1, 128)).to(device)
dummy_mask = torch.randint(0, 2, (1, 128)).to(device)
dummy_token_type_ids = torch.randint(0, 2, (1, 128)).to(device)

# Generate a visualization of the model's computation graph
dot = make_dot(model(dummy_ids, dummy_mask, dummy_token_type_ids),
               params=dict(model.named_parameters()))

# Save the graph to a file
dot.render("bert_model_graph")

# Display the graph
dot.view()

'bert_model_graph.pdf'

### 6. Defining `optimizer` and `loss`

In [12]:
def loss_fn(outputs, targets):
    return torch.nn.CrossEntropyLoss()(outputs, targets)

optimizer = torch.optim.Adam(params =  model.parameters(), lr=LEARNING_RATE)

## 7. Training the `model` for `4` epochs with batch size `4`

In [15]:
def evaluate(model, data_loader, loss_fn, device):
    model.eval()
    total_loss = 0.0
    total_samples = 0

    with torch.no_grad():
        for data in data_loader:
            ids = data['ids'].to(device, dtype = torch.long)
            mask = data['mask'].to(device, dtype = torch.long)
            token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
            targets = data['targets'].to(device, dtype = torch.float)
            outputs = model(ids, mask, token_type_ids)
            loss = loss_fn(outputs, targets)

            total_loss += loss.item()
            total_samples += targets.size(0)

    average_loss = total_loss / total_samples
    print(f'Validation Loss: {average_loss}')


def train(epoch):
    model.train()
    try:
      for _, data in enumerate(training_loader, 0):
          ids = data['ids'].to(device, dtype = torch.long)
          mask = data['mask'].to(device, dtype = torch.long)
          token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
          targets = data['targets'].to(device, dtype = torch.float)

          outputs = model(ids, mask, token_type_ids)

          optimizer.zero_grad()
          loss = loss_fn(outputs, targets)
          loss.backward()
          optimizer.step()
          if _%1000==0:
              print(f'Epoch: {epoch}, Loss:  {loss.item()}')
              if testing_loader is not None:
                evaluate(model, testing_loader, loss_fn, device)
    except Exception as e:
      pass

for epoch in range(EPOCHS):
    train(epoch)

Epoch: 0, Loss:  3.3253016471862793
Validation Loss: 3.2862547790013514
Epoch: 1, Loss:  2.938786745071411
Validation Loss: 3.008619713480134
Epoch: 2, Loss:  1.847422480583191
Validation Loss: 1.6520635094888327
Epoch: 3, Loss:  1.2256757020950317
Validation Loss: 1.5537537498485763


### 8. Saving the model states for inference

In [16]:
torch.save(model.state_dict(), "/content/drive/MyDrive/NLP/model.pt")