In [1]:
import torch
import pandas as pd
from pathlib import Path
from tqdm.auto import tqdm
import numpy as np
import wandb

In [2]:
!python3 -m wandb login eb7b1964fb84cd81de96b2a273ecf2bb6254aeac

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /home/alexeyorlov53/.netrc


In [None]:
device = torch.device("cuda", index=4) if torch.cuda.is_available() else torch.device('cpu')

### Upload and Split Dataset

In [2]:
dataframe = pd.read_csv("data_10k.csv")

In [3]:
dataframe = dataframe.drop(columns=['ecfp2', 'ecfp3', 'Molecular Weight', 'Bioactivities', 'AlogP', 'Polar Surface Area', 'CX Acidic pKa', 'CX Basic pKa'])

In [4]:
dataframe

Unnamed: 0,Smiles,ecfp1
0,COc1cc(C2(C)CCCc3nc(SCc4ncccn4)n(-c4ccc(F)cc4)...,"['2246728737', '864674487', '3217380708', '321..."
1,COC(=O)c1sc(NC(=O)C2c3ccccc3Oc3ccccc32)c(C(=O)...,"['2246728737', '864674487', '2246699815', '864..."
2,CC[C@H]1OC(=O)C[C@@H](O)[C@H](C)[C@@H](O[C@@H]...,"['2246728737', '2245384272', '2976033787', '31..."
3,Cc1cccc(-n2cc(C(=O)N3CCC[C@@H]([n+]4cc[nH]c4)C...,"['2246728737', '3217380708', '3218693969', '32..."
4,CCOC(=O)[C@H](C1CC1)N1C(=O)[C@@H](CC(=O)O)C[C@...,"['2246728737', '2245384272', '864674487', '224..."
...,...,...
9995,CCN1CCN(CC(O)c2ccc(Br)cc2)CC1,"['2246728737', '2245384272', '2092489639', '29..."
9996,O=C(O)CNC(=O)CNC(=O)CNC(=O)CSC(=O)c1ccccc1,"['864942730', '2246699815', '864662311', '2245..."
9997,O=C(N[C@@]12CCC[C@@](C#Cc3ccccn3)(CC1)C2)c1ccc...,"['864942730', '2246699815', '847961216', '2976..."
9998,CCOc1ccccc1-c1cc(C(=O)N2CCOCC2)c2ccccc2n1,"['2246728737', '2245384272', '864674487', '321..."


In [5]:
# this because pandas thinks columns with arrays are strings
def preprocess_data_dataset(df, column):
    for row in tqdm(range(len(df))):
        str_ints = eval(df.iloc[row][column])
        str_fingerprint = ' '.join(str_ints[0])
        df.at[row, column] = str_fingerprint

In [6]:
preprocess_data_dataset(dataframe, 'ecfp1')

  0%|          | 0/10000 [00:00<?, ?it/s]

In [7]:
from MolCLR.dataset.dataset import MoleculeDatasetWrapper



In [10]:
molclr_dataset = MoleculeDatasetWrapper(data=dataframe, batch_size=64, num_workers=8, valid_size=0.2, tokenizer_name='molberto_ecfp0_2M')

TypeError: MoleculeDatasetWrapper.__init__() got an unexpected keyword argument 'tokenizer_name'

In [None]:
train_loader, valid_loader = self.dataset.get_data_loaders()

if self.config['model_type'] == 'gin':
    from models.ginet_molclr import GINet
    model = GINet(**self.config["model"]).to(self.device)
    model = self._load_pre_trained_weights(model)
elif self.config['model_type'] == 'gcn':
    from models.gcn_molclr import GCN
    model = GCN(**self.config["model"]).to(self.device)
    model = self._load_pre_trained_weights(model)
else:
    raise ValueError('Undefined GNN model.')
print(model)

optimizer = torch.optim.Adam(
    model.parameters(), self.config['init_lr'], 
    weight_decay=eval(self.config['weight_decay'])
)
scheduler = CosineAnnealingLR(
    optimizer, T_max=self.config['epochs']-self.config['warm_up'], 
    eta_min=0, last_epoch=-1
)

if apex_support and self.config['fp16_precision']:
    model, optimizer = amp.initialize(
        model, optimizer, opt_level='O2', keep_batchnorm_fp32=True
    )

model_checkpoints_folder = os.path.join(self.writer.log_dir, 'checkpoints')

# save config file
_save_config_file(model_checkpoints_folder)

n_iter = 0
valid_n_iter = 0
best_valid_loss = np.inf

for epoch_counter in range(self.config['epochs']):
    for bn, (xis, xjs) in enumerate(train_loader):
        optimizer.zero_grad()

        xis = xis.to(self.device)
        xjs = xjs.to(self.device)

        loss = self._step(model, xis, xjs, n_iter)

        if n_iter % self.config['log_every_n_steps'] == 0:
            self.writer.add_scalar('train_loss', loss, global_step=n_iter)
            self.writer.add_scalar('cosine_lr_decay', scheduler.get_last_lr()[0], global_step=n_iter)
            print(epoch_counter, bn, loss.item())

        if apex_support and self.config['fp16_precision']:
            with amp.scale_loss(loss, optimizer) as scaled_loss:
                scaled_loss.backward()
        else:
            loss.backward()

        optimizer.step()
        n_iter += 1

    # validate the model if requested
    if epoch_counter % self.config['eval_every_n_epochs'] == 0:
        valid_loss = self._validate(model, valid_loader)
        print(epoch_counter, bn, valid_loss, '(validation)')
        if valid_loss < best_valid_loss:
            # save the model weights
            best_valid_loss = valid_loss
            torch.save(model.state_dict(), os.path.join(model_checkpoints_folder, 'model.pth'))
    
        self.writer.add_scalar('validation_loss', valid_loss, global_step=valid_n_iter)
        valid_n_iter += 1
    
    if (epoch_counter+1) % self.config['save_every_n_epochs'] == 0:
        torch.save(model.state_dict(), os.path.join(model_checkpoints_folder, 'model_{}.pth'.format(str(epoch_counter))))

    # warmup for the first few epochs
    if epoch_counter >= self.config['warm_up']:
        scheduler.step()

### Create Transformer Model

In [13]:
from transformers import RobertaForMaskedLM
from transformers import RobertaConfig

from MolCLR.models.gcn_molclr import GCN
from MolCLR.molclr import MolCLR

class MolecularDoubleApproach(torch.nn.Module):
    def __init__(self):
        super(MolecularPropertiesClassification, self).__init__()

        config = RobertaConfig(
            vocab_size=30_522,  # we align this to the tokenizer vocab set in previous notebook
            max_position_embeddings=514,
            hidden_size=768,
            num_attention_heads=12,
            num_hidden_layers=6,
            type_vocab_size=1
        )
        self.model1 = RobertaForMaskedLM(config)
        # removing last layer of transformer
        self.model1.pooler = torch.nn.Identity()
        
        self.model2 = GCN(**config["model"]).to(self.device)
        self.model2 = self._load_pre_trained_weights(model)

        self.linear1 = torch.nn.Linear(768 * 2, 768, bias=True)
        self.linear2 = torch.nn.Linear(768, 2, bias=True)

    def forward(self, input_ids = None, attention_mask=None):
        outputs1 = self.model1(input_ids=input_ids, attention_mask=attention_mask)
        outputs2 = MolCLR._step(self.model2, xis, xjs)
        last_hidden_state1 = outputs1[0]
        last_hidden_state2 = outputs2[0]
        
        first_linear_out = self.linear1( \
            torch.cat((last_hidden_state1[:, 0, : ], last_hidden_state2[:, 0, : ]), dim=-1).view(-1, 2 * 768))
        logits = self.linear2(torch.nn.functional.sigmoid(first_linear_out))

        return logits
        

ModuleNotFoundError: No module named 'torch_sparse'

### Create PyTorch DataLoader

In [20]:
from torch.utils.data import DataLoader

train_dataloader = DataLoader(
    tokenized_dataset['train'], shuffle = True, batch_size = 64, collate_fn = data_collator
)

eval_dataloader = DataLoader(
    tokenized_dataset['validation'], shuffle = True, batch_size = 64, collate_fn = data_collator
)

In [11]:
model = MolecularDoubleApproach().to(device)

NameError: name 'MolecularDoubleApproach' is not defined

In [22]:
model

MolecularPropertiesClassification(
  (transformer1): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-5): 6 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
         

In [23]:
from transformers import AdamW, get_scheduler

optimizer = AdamW(model.parameters(), lr=5e-5)

num_epoch = 100

num_training_steps = num_epoch * len(train_dataloader)

lr_scheduler = get_scheduler(
    'linear',
    optimizer = optimizer,
    num_warmup_steps = 0,
    num_training_steps = num_training_steps,
)

loss_func = torch.nn.CrossEntropyLoss()



In [24]:
wandb.init(
    project="efcp_transformer",
    name="RobertaForMaskedLM + MolCLR (GCN)",
    config={}
)

[34m[1mwandb[0m: Currently logged in as: [33morlov-aleksei53[0m ([33mmoleculary-ai[0m). Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Tracking run with wandb version 0.16.3
[34m[1mwandb[0m: Run data is saved locally in [35m[1m/home/alexeyorlov53/Transformers-for-Molecules/wandb/run-20240301_005038-nwiqbjs7[0m
[34m[1mwandb[0m: Run [1m`wandb offline`[0m to turn off syncing.
[34m[1mwandb[0m: Syncing run [33mDoubleTransformer with LinearClassifier BBBP training [0m
[34m[1mwandb[0m: ⭐️ View project at [34m[4mhttps://wandb.ai/moleculary-ai/efcp_transformer[0m
[34m[1mwandb[0m: 🚀 View run at [34m[4mhttps://wandb.ai/moleculary-ai/efcp_transformer/runs/nwiqbjs7[0m


### Training

In [25]:
from tqdm.auto import tqdm
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score

progress_bar_train = tqdm(range(num_training_steps))
progress_bar_eval = tqdm(range(num_epoch * len(eval_dataloader)))

for epoch in range(num_epoch):
    model.train()
    total_pred_labels = []
    total_true_labels = []
    epoch_loss = 0
    for batch in train_dataloader:
        input_batch = { k: v.to(device) for k, v in batch.items() if k in ['input_ids', 'attention_mask'] }
        batch['target'] = batch['target'].to(device)
        
        logits = model(**input_batch)
        
        loss = loss_func(logits.view(-1, 2), batch['target'].view(-1))
        loss.backward()
        epoch_loss += loss.item()
        
        pred_labels = torch.argmax(logits, dim=-1)
        true_labels = batch['target']
        total_pred_labels.append(pred_labels)
        total_true_labels.append(true_labels)
        
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar_train.update(1)

    total_pred_labels = torch.cat(total_pred_labels).cpu().detach().numpy()
    total_true_labels = torch.cat(total_true_labels).cpu().detach().numpy()
    
    wandb.log({"loss/train": epoch_loss / len(train_dataloader)}, step=epoch)
    wandb.log({"accuracy/train": accuracy_score(total_true_labels, total_pred_labels)}, step=epoch)
    wandb.log({"f1/train": f1_score(total_true_labels, total_pred_labels, average='micro')}, step=epoch)
    wandb.log({"precision/train": precision_score(total_true_labels, total_pred_labels, average='micro')}, step=epoch)
    wandb.log({"recall/train": recall_score(total_true_labels, total_pred_labels, average='micro')}, step=epoch)

    model.eval()
    total_pred_labels = []
    total_true_labels = []
    epoch_loss = 0
    for batch in eval_dataloader:
        input_batch = { k: v.to(device) for k, v in batch.items() if k in ['input_ids', 'attention_mask'] }
        batch['target'] = batch['target'].to(device)
        
        with torch.no_grad():
            logits = model(**input_batch)
            loss = loss_func(logits.view(-1, 2), batch['target'].view(-1))
            epoch_loss += loss.item()

            pred_labels = torch.argmax(logits, dim=-1)
            true_labels = batch['target']
            total_pred_labels.append(pred_labels)
            total_true_labels.append(true_labels)
        
        progress_bar_eval.update(1)

    total_pred_labels = torch.cat(total_pred_labels).cpu().detach().numpy()
    total_true_labels = torch.cat(total_true_labels).cpu().detach().numpy()
    
    wandb.log({"loss/validation": epoch_loss / len(eval_dataloader)}, step=epoch)
    wandb.log({"accuracy/validation": accuracy_score(total_true_labels, total_pred_labels)}, step=epoch)
    wandb.log({"f1/validation": f1_score(total_true_labels, total_pred_labels, average='micro')}, step=epoch)
    wandb.log({"precision/validation": precision_score(total_true_labels, total_pred_labels, average='micro')}, step=epoch)
    wandb.log({"recall/validation": recall_score(total_true_labels, total_pred_labels, average='micro')}, step=epoch)

  0%|          | 0/2500 [00:00<?, ?it/s]

  0%|          | 0/400 [00:00<?, ?it/s]

You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


In [26]:
wandb.finish()

[34m[1mwandb[0m:                                                                                
[34m[1mwandb[0m: 
[34m[1mwandb[0m: Run history:
[34m[1mwandb[0m:       accuracy/train ▁▂▅▆▆▇▇▇▇▇▇▇█▇▇█▇▇▇▇█▇█████████▇████████
[34m[1mwandb[0m:  accuracy/validation ▁▂▇▇▇▇▇██▇█▇████████████▇█▇█▇███████████
[34m[1mwandb[0m:             f1/train ▁▂▅▆▆▇▇▇▇▇▇▇█▇▇█▇▇▇▇█▇█████████▇████████
[34m[1mwandb[0m:        f1/validation ▁▂▇▇▇▇▇██▇█▇████████████▇█▇█▇███████████
[34m[1mwandb[0m:           loss/train █▆▄▄▃▃▂▂▂▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
[34m[1mwandb[0m:      loss/validation ▅▄▃▆▄▃▇▅▁▂▁▁▃▃▇▁▁█▂▂▁▂▁▁▁▅▆▂▁▁▇▁▅▁▃▁▁▂▄▂
[34m[1mwandb[0m:      precision/train ▁▂▅▆▆▇▇▇▇▇▇▇█▇▇█▇▇▇▇█▇█████████▇████████
[34m[1mwandb[0m: precision/validation ▁▂▇▇▇▇▇██▇█▇████████████▇█▇█▇███████████
[34m[1mwandb[0m:         recall/train ▁▂▅▆▆▇▇▇▇▇▇▇█▇▇█▇▇▇▇█▇█████████▇████████
[34m[1mwandb[0m:    recall/validation ▁▂▇▇▇▇▇██▇█▇████████████▇█▇█▇███████████
[34m[1mwandb[0m: 
[34m

In [27]:
test_dataloader = DataLoader(
    tokenized_dataset['test'], batch_size = 64, collate_fn = data_collator
)

model.eval()
total_pred_labels = []
total_true_labels = []
epoch_loss = 0
for batch in tqdm(test_dataloader):
    input_batch = { k: v.to(device) for k, v in batch.items() if k in ['input_ids', 'attention_mask'] }
    batch['target'] = batch['target'].to(device)

    with torch.no_grad():
        logits = model(**input_batch)
        loss = loss_func(logits.view(-1, 2), batch['target'].view(-1))
        epoch_loss += loss.item()

        pred_labels = torch.argmax(logits, dim=-1)
        true_labels = batch['target']
        total_pred_labels.append(pred_labels)
        total_true_labels.append(true_labels)

total_pred_labels = torch.cat(total_pred_labels).cpu().detach().numpy()
total_true_labels = torch.cat(total_true_labels).cpu().detach().numpy()

wandb.log({"loss/validation": epoch_loss / len(eval_dataloader)}, step=epoch)
wandb.log({"accuracy/validation": accuracy_score(total_true_labels, total_pred_labels)}, step=epoch)
wandb.log({"f1/validation": f1_score(total_true_labels, total_pred_labels, average='micro')}, step=epoch)
wandb.log({"precision/validation": precision_score(total_true_labels, total_pred_labels, average='micro')}, step=epoch)
wandb.log({"recall/validation": recall_score(total_true_labels, total_pred_labels, average='micro')}, step=epoch)

  0%|          | 0/4 [00:00<?, ?it/s]

Error: You must call wandb.init() before wandb.log()

In [28]:
torch.cuda.empty_cache()