In [1]:
!pip install numpy
!pip install torch
!pip install transformers
!pip install tensorflow
!pip install wandb



In [1]:
from transformers import RobertaTokenizer
import torch
from pathlib import Path
from tqdm.auto import tqdm
import random
import os
import wandb

In [3]:
!python3 -m wandb login eb7b1964fb84cd81de96b2a273ecf2bb6254aeac

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /home/alexeyorlov53/.netrc


In [2]:
radius = 'ecfp0'
folder = 'ecfps_cutted'
folder_with_paths = folder + '/' + radius
samples_count = '2M'
model_name = f'molberto_{radius}_{samples_count}'

In [3]:
# initialize the tokenizer using the tokenizer we initialized and saved to file
tokenizer = RobertaTokenizer.from_pretrained(model_name, max_len=512)

In [4]:
def mlm(tensor):
    # create random array of floats with equal dims to tensor
    rand = torch.rand(tensor.shape)
    # mask random 15% where token is not 0 <s>, 1 <pad>, or 2 <s/>
    mask_arr = (rand < .15) * (tensor != 0) * (tensor != 1) * (tensor != 2)
    # loop through each row in tensor (cannot do in parallel)
    for i in range(tensor.shape[0]):
        # get indices of mask positions from mask array
        selection = torch.flatten(mask_arr[i].nonzero()).tolist()
        # mask tensor
        tensor[i, selection] = 4
    return tensor

In [5]:
paths = [str(x) for x in Path(folder_with_paths).glob('*.txt')]
len(paths)

238

In [None]:
# take sentences for training
def tokenize_ecfps(start, end):
    input_ids = []
    mask = []
    labels = []
    for path in tqdm(paths[start:end]):
        with open(path, 'r', encoding='utf-8') as fp:
            lines = fp.read().split('\n')
        sample = tokenizer(lines, max_length=512, truncation=True, padding='max_length')
        labels.append(torch.tensor(sample['input_ids']))
        mask.append(torch.tensor(sample['attention_mask']))
        input_ids.append(mlm(labels[-1].detach().clone())) # mask ~15% of tokens to create inputs
    
    input_ids = torch.cat(input_ids)
    mask = torch.cat(mask)
    labels = torch.cat(labels)
    return input_ids, mask, labels

In [9]:
train_input_ids, train_mask, train_labels = tokenize_ecfps(0, 200)
validation_input_ids, validation_mask, validation_labels = tokenize_ecfps(200, 220)
test_input_ids, test_mask, test_labels = tokenize_ecfps(220, 238)

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

In [10]:
print(train_input_ids.shape)
print(validation_input_ids.shape)
print(validation_input_ids.shape)

torch.Size([1992675, 512])
torch.Size([200000, 512])
torch.Size([200000, 512])


In [11]:
train_labels[0]

tensor([  0, 344, 348, 279, 273, 279, 298, 348, 318, 307, 414, 320, 320, 320,
        279, 279, 279, 298, 348, 318, 307, 414, 320, 320, 320, 273, 279, 348,
        320, 279, 279, 368, 294, 368, 279, 279, 368, 294, 368, 225,   2,   1,
          1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,
          1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,
          1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,
          1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,
          1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,
          1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,
          1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,
          1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,
          1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,
          1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,  

In [12]:
# torch.save(input_ids, 'molberto_training/input_ids.pt')
# torch.save(mask, 'molberto_training/attention_mask.pt')
# torch.save(labels, 'molberto_training/labels.pt')

# del input_ids, mask, labels

In [13]:
# input_ids = torch.load('molberto_training/input_ids.pt')
# mask = torch.load('molberto_training/attention_mask.pt')
# labels = torch.load('molberto_training/labels.pt')

### dataset and dataloader

In [14]:
class Dataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __len__(self):
        return self.encodings['input_ids'].shape[0]

    def __getitem__(self, i):
        return {key: tensor[i] for key, tensor in self.encodings.items()}

In [15]:
train_dataset = Dataset({'input_ids': train_input_ids, 'attention_mask': train_mask, 'labels': train_labels})
validation_dataset = Dataset({'input_ids': validation_input_ids, 'attention_mask': validation_mask, 'labels': validation_labels})
test_dataset = Dataset({'input_ids': test_input_ids, 'attention_mask': test_mask, 'labels': test_labels})

In [16]:
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=32, shuffle=True)
validation_loader = torch.utils.data.DataLoader(validation_dataset, batch_size=32, shuffle=False)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=32, shuffle=False)

And move onto building our model, we first need to create a RoBERTa config object, which will describe which features we want to initialize our RoBERTa model with.

In [17]:
from transformers import RobertaConfig

config = RobertaConfig(
    vocab_size=30_522,  # we align this to the tokenizer vocab set in previous notebook
    max_position_embeddings=514,
    hidden_size=768,
    num_attention_heads=12,
    num_hidden_layers=6,
    type_vocab_size=1
)

Then we import and initialize a RoBERTa model with a language modeling head.

In [18]:
from transformers import RobertaForMaskedLM

model = RobertaForMaskedLM(config)

  torch.utils._pytree._register_pytree_node(


And now we move onto training. First we setup GPU/CPU usage.

In [19]:
torch.cuda.is_available()

True

In [20]:
device = torch.device('cuda', index=2) if torch.cuda.is_available() else torch.device('cpu')
# and move our model over to the selected device
model.to(device)

RobertaForMaskedLM(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-5): 6 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): La

Activate the training mode of our model, and initialize our optimizer (Adam with weighted decay - reduces chance of overfitting).

In [21]:

from transformers import AdamW

# activate training mode
model.train()
# initialize optimizer
optim = AdamW(model.parameters(), lr=1e-5)

2024-03-01 09:38:25.288448: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [22]:
wandb.init(
    project="efcp_transformer",
    name="RobertaForMLM on ecpf0 training (2M)",
    config={}
)

[34m[1mwandb[0m: Currently logged in as: [33morlov-aleksei53[0m ([33mmoleculary-ai[0m). Use [1m`wandb login --relogin`[0m to force relogin


Now we move onto the training loop.

In [23]:
from tqdm import tqdm  # for our progress bar
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score

epochs = 1
step = 0

validation_iterator = iter(validation_loader)
for epoch in range(epochs):
    loop = tqdm(train_loader, leave=True)
    for batch in loop:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        
        outputs = model(input_ids, attention_mask=attention_mask,
                        labels=labels)
        loss = outputs.loss
        logits = outputs.logits
        true_labels = batch['labels'].numpy().flatten()
        pred_labels = torch.nn.functional.softmax(logits, dim=1).argmax(axis=-1).cpu().detach().numpy().flatten()

        # write down loss and metrics
        wandb.log({"loss/train": loss}, step=step)
        wandb.log({"accuracy/train": accuracy_score(true_labels, pred_labels)}, step=step)
        wandb.log({"f1/train": f1_score(true_labels, pred_labels, average='micro')}, step=step)
        wandb.log({"precision/train": precision_score(true_labels, pred_labels, average='micro')}, step=step)
        wandb.log({"recall/train": recall_score(true_labels, pred_labels, average='micro')}, step=step)
        
        loss.backward()
        optim.step()
        optim.zero_grad()

        with torch.no_grad():
            try:
                validation_batch = next(validation_iterator)
            except StopIteration:
                validation_dataset = Dataset({'input_ids': validation_input_ids, 'attention_mask': validation_mask, 'labels': validation_labels})
                validation_batch = iter(validation_dataset)
                validation_batch = next(validation_iterator)
            
            input_ids = validation_batch['input_ids'].to(device)
            attention_mask = validation_batch['attention_mask'].to(device)
            labels = validation_batch['labels'].to(device)
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)

            loss = outputs.loss
            logits = outputs.logits
            true_labels = batch['labels'].numpy().flatten()
            pred_labels = torch.nn.functional.softmax(logits, dim=1).argmax(axis=-1).cpu().detach().numpy().flatten()
    
            # write down loss and metrics
            wandb.log({"loss/validation": loss}, step=step)
            wandb.log({"accuracy/validation": accuracy_score(true_labels, pred_labels)}, step=step)
            wandb.log({"f1/validation": f1_score(true_labels, pred_labels, average='micro')}, step=step)
            wandb.log({"precision/validation": precision_score(true_labels, pred_labels, average='micro')}, step=step)
            wandb.log({"recall/validation": recall_score(true_labels, pred_labels, average='micro')}, step=step)
            
        loop.set_description(f'Epoch {epoch}')
        loop.set_postfix(loss=loss.item())
        if step > 2500:
            break
        step += len(batch)


Epoch 0:   1%|▉                                                                         | 834/62272 [29:39<36:25:20,  2.13s/it, loss=0.418]


In [24]:
wandb.finish()

VBox(children=(Label(value='0.007 MB of 0.007 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
accuracy/train,▁▁▁▁▁▁▁▁▁▂▁▁▂▁▂▂▂▄▅▆▆▇▇▇████████████████
accuracy/validation,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▂▃▃▅▅▆▇▇█████████████████
f1/train,▁▁▁▁▁▁▁▁▁▂▁▁▂▁▂▂▂▄▅▆▆▇▇▇████████████████
f1/validation,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▂▃▃▅▅▆▇▇█████████████████
loss/train,███▇▇▇▇▆▆▆▆▅▅▅▅▅▄▄▄▄▄▃▃▃▃▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁
loss/validation,███▇▇▇▆▆▆▆▆▅▅▅▅▅▄▄▄▄▄▃▃▃▃▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁
precision/train,▁▁▁▁▁▁▁▁▁▂▁▁▂▁▂▂▂▄▅▆▆▇▇▇████████████████
precision/validation,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▂▃▃▅▅▆▇▇█████████████████
recall/train,▁▁▁▁▁▁▁▁▁▂▁▁▂▁▂▂▂▄▅▆▆▇▇▇████████████████
recall/validation,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▂▃▃▅▅▆▇▇█████████████████

0,1
accuracy/train,0.98883
accuracy/validation,0.92694
f1/train,0.98883
f1/validation,0.92694
loss/train,0.41928
loss/validation,0.41759
precision/train,0.98883
precision/validation,0.92694
recall/train,0.98883
recall/validation,0.92694


In [25]:
wandb.init(
    project="efcp_transformer",
    name="RobertaForMLM on ecpf0 testing (2M)",
    config={}
)

VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011113502711264624, max=1.0…

In [26]:
step = 0

with torch.no_grad():
    loop = tqdm(test_loader, leave=True)
    for batch in loop:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        
        outputs = model(input_ids, attention_mask=attention_mask,
                        labels=labels)
        loss = outputs.loss
        logits = outputs.logits
        true_labels = batch['labels'].numpy().flatten()
        pred_labels = torch.nn.functional.softmax(logits, dim=1).argmax(axis=-1).cpu().detach().numpy().flatten()

        # write down loss and metrics
        wandb.log({"loss/test": loss}, step=step)
        wandb.log({"accuracy/test": accuracy_score(true_labels, pred_labels)}, step=step)
        wandb.log({"f1/test": f1_score(true_labels, pred_labels, average='micro')}, step=step)
        wandb.log({"precision/test": precision_score(true_labels, pred_labels, average='micro')}, step=step)
        wandb.log({"recall/test": recall_score(true_labels, pred_labels, average='micro')}, step=step)
        
        loop.set_description(f'Epoch {epoch}')
        loop.set_postfix(loss=loss.item())
        step += len(batch)

wandb.finish()

Epoch 0:   8%|██████▎                                                                       | 459/5625 [04:53<55:02,  1.56it/s, loss=0.416]


KeyboardInterrupt: 

In [27]:
wandb.finish()

VBox(children=(Label(value='0.007 MB of 0.007 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
accuracy/test,▆▅█▆▁▇▇▇▅▅▄▆▇▇▆▅▇▇▆▆█▆▇▃▆▄▅▆▇▇▇▇▆▇█▆▅▆▆▅
f1/test,▆▅█▆▁▇▇▇▅▅▄▆▇▇▆▅▇▇▆▆█▆▇▃▆▄▅▆▇▇▇▇▆▇█▆▅▆▆▅
loss/test,██▄▅█▄▅▁█▅▃▇▄▄▇▇▆▄▅▅▂▇▇▄▇▆▇▆▄▃▆▃▆▄▂▄█▅▆▄
precision/test,▆▅█▆▁▇▇▇▅▅▄▆▇▇▆▅▇▇▆▆█▆▇▃▆▄▅▆▇▇▇▇▆▇█▆▅▆▆▅
recall/test,▆▅█▆▁▇▇▇▅▅▄▆▇▇▆▅▇▇▆▆█▆▇▃▆▄▅▆▇▇▇▇▆▇█▆▅▆▆▅

0,1
accuracy/test,0.99158
f1/test,0.99158
loss/test,0.41582
precision/test,0.99158
recall/test,0.99158


In [28]:
model.save_pretrained(f'molberto_{radius}_2.5K_dummy')

In [None]:
torch.cuda.empty_cache()

In [None]:
print(torch.cuda.device_count())
print(torch.cuda.current_device())

In [None]:
torch.device('cuda', index=1)