In [1]:
!pip install numpy
!pip install torch
!pip install transformers
!pip install tensorflow
!pip install tensorboard



In [2]:
from transformers import RobertaTokenizer
import torch
from pathlib import Path
from tqdm.auto import tqdm
import random
import os

import tensorflow as tf

2024-02-11 21:56:30.687884: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:
# Compare what each framework can see: TensorFlow's list of GPU devices vs. PyTorch's CUDA check.
print(tf.config.list_physical_devices('GPU'))
torch.cuda.is_available()
# NOTE(review): TensorFlow lists no GPUs while PyTorch reports CUDA as available.
# Presumably the installed TensorFlow build lacks CUDA support or cannot locate its CUDA
# libraries — TODO confirm. The training cells shown in this notebook only use PyTorch.

[]


True

In [4]:
# List the GPUs as reported by the NVIDIA driver.
!nvidia-smi

Sun Feb 11 21:56:32 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 545.23.08              Driver Version: 545.23.08    CUDA Version: 12.3     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA GeForce RTX 2080 Ti     On  | 00000000:01:00.0 Off |                  N/A |
| 27%   25C    P8              21W / 250W |      5MiB / 11264MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
|   1  NVIDIA GeForce RTX 2080 Ti     On  | 00000000:02:0

In [5]:
# Run identifiers: `filename` selects the data sub-directory, `samples_count` is a size label,
# and `model_name` is the directory the tokenizer is loaded from and the model is saved to.
filename = "ecfp0"
samples_count = "2M"
model_name = "_".join(("molberto", filename, samples_count))

In [6]:
# Load the tokenizer stored under `model_name`.
# NOTE(review): loading emits a warning that the checkpoint's tokenizer class is 'BertTokenizer'
# while it is loaded here through RobertaTokenizer — confirm the resulting tokenization is intended.
tokenizer = RobertaTokenizer.from_pretrained(model_name, max_len=512)

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'BertTokenizer'. 
The class this function is called from is 'RobertaTokenizer'.


In [7]:
def mlm(tensor, mask_prob=.15, mask_token_id=4, special_token_ids=(0, 1, 2)):
    """Randomly replace tokens with the mask token for masked-language modelling.

    The tensor is modified in place and also returned, so pass a clone if the
    original ids must be kept (e.g. as labels).

    Args:
        tensor: integer tensor of token ids (any shape).
        mask_prob: probability with which each non-special position is masked.
        mask_token_id: id written into the masked positions.
        special_token_ids: ids that are never masked (defaults: 0 <s>, 1 <pad>, 2 </s>).

    Returns:
        The same tensor object, with the selected positions set to ``mask_token_id``.
    """
    # one uniform draw in [0, 1) per token position
    rand = torch.rand(tensor.shape, device=tensor.device)
    # candidate positions: draw falls below the masking probability
    mask_arr = rand < mask_prob
    # exclude special tokens from masking
    for special_id in special_token_ids:
        mask_arr &= tensor != special_id
    # boolean-mask assignment updates every row at once (no per-row Python loop needed)
    tensor[mask_arr] = mask_token_id
    return tensor

In [8]:
# Collect the corpus files. Sorted so that the `[:200]` selection below is the same on every
# run (glob order is not guaranteed).
paths = sorted(str(x) for x in Path(f'data/{filename}').glob('*.txt'))
# initialize lists of tensors
input_ids = []
mask = []
labels = []
# take the first 200 files for training
for path in tqdm(paths[:200]):
    # open the file and split into list by newline characters
    with open(path, 'r', encoding='utf-8') as fp:
        # drop blank lines (e.g. the empty string after a trailing newline) so they do not
        # turn into empty, padding-only training samples
        lines = [line for line in fp.read().split('\n') if line.strip()]
    if not lines:
        # nothing to tokenize in this file
        continue
    # encode
    sample = tokenizer(lines, max_length=512, truncation=True, padding='max_length')
    # convert tokens to tensor
    # NOTE(review): labels keep every token id (padding included), so the MLM loss is presumably
    # computed over all positions rather than only the masked ones; the common convention is to
    # set non-masked label positions to -100 — confirm this is intended.
    labels.append(torch.tensor(sample.input_ids))
    # create attention mask tensor
    mask.append(torch.tensor(sample.attention_mask))
    # mask ~15% of tokens to create inputs (on a copy, so labels stay unmasked)
    input_ids.append(mlm(labels[-1].detach().clone()))
# convert lists of tensors into tensors
input_ids = torch.cat(input_ids)
mask = torch.cat(mask)
labels = torch.cat(labels)

  0%|          | 0/200 [00:00<?, ?it/s]

We have 1,992,675 tokenized sequences (roughly 2M), each padded or truncated to 512 tokens:

In [9]:
# (number of sequences, tokens per sequence)
input_ids.shape

torch.Size([1992675, 512])

In [10]:
# inspect the first (already masked) input sequence
input_ids[0]

tensor([  0, 344,   4, 279, 273, 273, 279,   4,   4, 337,   4, 307, 337, 279,
        333, 279, 356,   4, 327, 294, 294, 294, 294, 294, 273, 410, 273, 279,
        348, 320,   4,   2,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,
          1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,
          1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,
          1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,
          1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,
          1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,
          1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,
          1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,
          1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,
          1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,
          1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,  

We can see the special tokens here: `0` is our **\<s\>** token, `2` our **\</s\>** token, `4` our **\<mask\>** token, and the rest of the sequence is filled up to length 512 with `1` - the **\<pad\>** token.

Let's save these tensors to file in case we need to do any further training later.

In [11]:
import os

# Create the output directory; exist_ok avoids the check-then-create race and keeps the cell
# safely re-runnable.
os.makedirs('molberto_training', exist_ok=True)

# torch.save(input_ids, 'molberto_training/input_ids.pt')
# torch.save(mask, 'molberto_training/attention_mask.pt')
# torch.save(labels, 'molberto_training/labels.pt')

# del input_ids, mask, labels

In [12]:
# input_ids = torch.load('molberto_training/input_ids.pt')
# mask = torch.load('molberto_training/attention_mask.pt')
# labels = torch.load('molberto_training/labels.pt')

### Dataset and DataLoader

In [13]:
# bundle the three aligned tensors under the keyword names the model's forward pass is called with
encodings = dict(input_ids=input_ids, attention_mask=mask, labels=labels)

In [14]:
class Dataset(torch.utils.data.Dataset):
    """Map-style dataset over a dict of tensors indexed along their first dimension.

    Item ``i`` is a dict with the same keys as ``encodings``, each holding
    element ``i`` of the corresponding tensor.
    """

    def __init__(self, encodings):
        # keep a reference to the encodings dict (no copy is made)
        self.encodings = encodings

    def __len__(self):
        # the sample count is the leading dimension of the 'input_ids' tensor
        n_samples = self.encodings['input_ids'].shape[0]
        return n_samples

    def __getitem__(self, i):
        # gather entry i from every tensor, keyed like the encodings dict
        item = {}
        for key, tensor in self.encodings.items():
            item[key] = tensor[i]
        return item

In [15]:
# wrap the encodings dict so it can be consumed by a DataLoader
dataset = Dataset(encodings=encodings)

In [16]:
# shuffled mini-batches of 32 samples
loader = torch.utils.data.DataLoader(dataset=dataset, shuffle=True, batch_size=32)

Next we move on to building our model. We first need to create a RoBERTa config object, which describes the features we want to initialize our RoBERTa model with.

In [17]:
from transformers import RobertaConfig

# architecture hyper-parameters for the RoBERTa model built below
config = RobertaConfig(
    hidden_size=768,
    num_hidden_layers=6,
    num_attention_heads=12,
    max_position_embeddings=514,
    type_vocab_size=1,
    vocab_size=30522,  # we align this to the tokenizer vocab set in previous notebook
)

Then we import and initialize a RoBERTa model with a language modeling head.

In [18]:
from transformers import RobertaForMaskedLM

# freshly initialized (untrained) RoBERTa with a masked-LM head, built from the config
model = RobertaForMaskedLM(config=config)

  torch.utils._pytree._register_pytree_node(


And now we move on to training. First we set up GPU/CPU usage.

In [19]:
# check whether PyTorch can use CUDA before choosing a device
torch.cuda.is_available()

True

In [20]:
if torch.cuda.is_available():
    # prefer the second GPU (index 1), but fall back to index 0 when only one GPU is visible
    # instead of failing with an invalid device ordinal
    device = torch.device('cuda', index=min(1, torch.cuda.device_count() - 1))
else:
    device = torch.device('cpu')
# and move our model over to the selected device
model.to(device)

RobertaForMaskedLM(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-5): 6 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): La

Activate the training mode of our model, and initialize our optimizer (AdamW - Adam with weight decay, which reduces the chance of overfitting).

In [21]:
# transformers' own AdamW is deprecated in favour of the PyTorch implementation
from torch.optim import AdamW

# activate training mode
model.train()
# initialize optimizer; eps and weight_decay are set explicitly to the defaults of the
# deprecated transformers implementation (PyTorch's own defaults are 1e-8 and 0.01)
optim = AdamW(model.parameters(), lr=1e-5, eps=1e-6, weight_decay=0.0)



In [22]:
from torch.utils.tensorboard import SummaryWriter

# TensorBoard event writer with default settings, via the name imported above
writer = SummaryWriter()

Now we move onto the training loop.

In [23]:
from tqdm import tqdm  # for our progress bar

epochs = 1  # trained for 4 in total
step = 0  # global step counter used as the TensorBoard x-axis

for epoch in range(epochs):
    loop = tqdm(loader, leave=True)
    # the description is constant within an epoch, so set it once
    loop.set_description(f'Epoch {epoch}')
    for batch in loop:
        # initialize calculated gradients (from prev step)
        optim.zero_grad()
        # pull all tensor batches required for training; batch-local names are used so the
        # notebook-level `input_ids` / `labels` dataset tensors are not rebound to a single batch
        batch_input_ids = batch['input_ids'].to(device)
        batch_attention_mask = batch['attention_mask'].to(device)
        batch_labels = batch['labels'].to(device)
        # process
        outputs = model(batch_input_ids, attention_mask=batch_attention_mask,
                        labels=batch_labels)
        # extract loss
        loss = outputs.loss
        # calculate loss for every parameter that needs grad update
        loss.backward()
        # update parameters
        optim.step()
        # convert to a Python float once and reuse it for logging and the progress bar
        loss_value = loss.item()
        # take loss for tensorboard
        writer.add_scalar('Loss/train', loss_value, step)
        # print relevant info to progress bar
        loop.set_postfix(loss=loss_value)
        step += 1

# make sure all buffered events are written to disk
writer.flush()

Epoch 0: 100%|███████████████████████████████████████████████████████████████████████| 62272/62272 [9:07:17<00:00,  1.90it/s, loss=0.00369]


In [24]:
# save the trained model under `model_name` (the same path the tokenizer was loaded from)
model.save_pretrained(model_name)

In [25]:
# release cached, currently unused GPU memory held by PyTorch
torch.cuda.empty_cache()

In [26]:
# number of CUDA devices visible to PyTorch, and the index of the current default device
# (the default device index is independent of the `device` the model was moved to)
print(torch.cuda.device_count())
print(torch.cuda.current_device())

8
0


In [27]:
# display the device object for the second GPU (the same device selected for training above)
torch.device('cuda', index=1)

device(type='cuda', index=1)