In [1]:
import torch
import pandas as pd
from pathlib import Path
from tqdm.auto import tqdm
import numpy as np
import wandb

In [2]:
!python3 -m wandb login eb7b1964fb84cd81de96b2a273ecf2bb6254aeac

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /home/alexeyorlov53/.netrc


In [3]:
filename = 'ecfp0'
samples_count1 = '10M'
model_name1 = f'molberto_{filename}_{samples_count1}'

In [4]:
batch_size = 32

In [5]:
gpu_number = 2

In [6]:
target = 'CT_TOX'

In [7]:
lr = 0.00005

### Upload and Split Dataset

In [8]:
dataframe = pd.read_csv("clintox_with_ecfp.csv")

In [9]:
def preprocess_data_dataset(df, column):
    for row in tqdm(range(len(df))):
        str_ints = eval(df.iloc[row][column])
        str_fingerprint = ' '.join(str_ints)
        df.at[row, column] = str_fingerprint

In [10]:
preprocess_data_dataset(dataframe, 'ecfp0')

  0%|          | 0/1479 [00:00<?, ?it/s]

In [11]:
dataframe

Unnamed: 0,smiles,FDA_APPROVED,CT_TOX,ecfp0
0,[C@@H]1([C@@H]([C@@H]([C@H]([C@@H]([C@@H]1Cl)C...,1,0,2976033787 2976033787 2976033787 2976033787 29...
1,[C@H]([C@@H]([C@@H](C(=O)[O-])O)O)([C@H](C(=O)...,1,0,2245273601 2245273601 2245273601 2246699815 86...
2,[H]/[NH+]=C(/C1=CC(=O)/C(=C\C=c2ccc(=C([NH3+])...,1,0,4277593716 847954377 2246699815 3217380708 321...
3,[H]/[NH+]=C(\N)/c1ccc(cc1)OCCCCCOc2ccc(cc2)/C(...,1,0,4277593716 847954377 2246699815 847957139 3217...
4,[N+](=O)([O-])[O-],1,0,848127915 864942730 864942795 864942795
...,...,...,...,...
1474,O[Si](=O)O,1,0,864662311 3387140397 864942730 864662311
1475,O=[Ti]=O,1,0,864942730 869071688 864942730
1476,O=[Zn],1,0,864942730 971583629
1477,OCl(=O)(=O)=O,1,0,864662311 3858440414 864942795 864942795 86494...


In [12]:
dataframe = dataframe.dropna(subset=[target]).reset_index(drop=True)

In [13]:
from datasets import Dataset, DatasetDict

dataset = Dataset.from_pandas(dataframe)
train_testvalid = dataset.train_test_split(test_size=0.2, seed=15)

test_valid = train_testvalid['test'].train_test_split(test_size=0.5, seed=15)

# 10% for test, 10 for validation, 80% for train
dataset = DatasetDict({
    'train': train_testvalid['train'],
    'test': test_valid['test'],
    'validation': test_valid['train']})

dataset

DatasetDict({
    train: Dataset({
        features: ['smiles', 'FDA_APPROVED', 'CT_TOX', 'ecfp0'],
        num_rows: 1183
    })
    test: Dataset({
        features: ['smiles', 'FDA_APPROVED', 'CT_TOX', 'ecfp0'],
        num_rows: 148
    })
    validation: Dataset({
        features: ['smiles', 'FDA_APPROVED', 'CT_TOX', 'ecfp0'],
        num_rows: 148
    })
})

### Tokenize Data

In [14]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name1)

tokenizer.model_max_len=512

In [15]:
def tokenize(batch):
  return tokenizer(batch["ecfp0"], truncation=True, max_length=512, padding='max_length')

tokenized_dataset = dataset.map(tokenize, batched=True)
tokenized_dataset

Map:   0%|          | 0/1183 [00:00<?, ? examples/s]

Map:   0%|          | 0/148 [00:00<?, ? examples/s]

Map:   0%|          | 0/148 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['smiles', 'FDA_APPROVED', 'CT_TOX', 'ecfp0', 'input_ids', 'attention_mask'],
        num_rows: 1183
    })
    test: Dataset({
        features: ['smiles', 'FDA_APPROVED', 'CT_TOX', 'ecfp0', 'input_ids', 'attention_mask'],
        num_rows: 148
    })
    validation: Dataset({
        features: ['smiles', 'FDA_APPROVED', 'CT_TOX', 'ecfp0', 'input_ids', 'attention_mask'],
        num_rows: 148
    })
})

In [16]:
columns = ["input_ids", "attention_mask"]
columns.extend([target]) # our labels
print(columns)
tokenized_dataset.set_format('torch', columns=columns)

from transformers import DataCollatorWithPadding
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

['input_ids', 'attention_mask', 'CT_TOX']


### Create Transformer Model

In [17]:
from transformers import AutoModel, AutoConfig

class MolecularPropertiesClassification(torch.nn.Module):
    def __init__(self, model_name1):
        super(MolecularPropertiesClassification, self).__init__()

        config1 = AutoConfig.from_pretrained(model_name1)
        self.transformer1 = AutoModel.from_pretrained(model_name1, config=config1)
        # removing last layer of transformer
        self.transformer1.pooler = torch.nn.Identity()
        # freezing transformer weights
        for param in self.transformer1.parameters():
            param.requires_grad = False

        self.linear1 = torch.nn.Linear(768, 768, bias=True)
        self.linear2 = torch.nn.Linear(768, 2, bias=True)

    def forward(self, input_ids = None, attention_mask=None):
        outputs1 = self.transformer1(input_ids=input_ids, attention_mask=attention_mask)
        last_hidden_state1 = outputs1[0]
        
        first_linear_out = self.linear1(last_hidden_state1[:, 0, : ].view(-1, 768))
        logits = self.linear2(torch.nn.functional.sigmoid(first_linear_out))

        return logits
        

### Create PyTorch DataLoader

In [18]:
from torch.utils.data import DataLoader

train_dataloader = DataLoader(
    tokenized_dataset['train'], shuffle = True, batch_size = batch_size, collate_fn = data_collator
)

eval_dataloader = DataLoader(
    tokenized_dataset['validation'], shuffle = True, batch_size = batch_size, collate_fn = data_collator
)

test_dataloader = DataLoader(
    tokenized_dataset['test'], shuffle = True, batch_size = batch_size, collate_fn = data_collator
)

In [19]:
device = torch.device("cuda", index=gpu_number) if torch.cuda.is_available() else torch.device('cpu')

model = MolecularPropertiesClassification(model_name1).to(device)

Some weights of RobertaModel were not initialized from the model checkpoint at molberto_ecfp0_10M and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [20]:
model

MolecularPropertiesClassification(
  (transformer1): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-5): 6 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
         

In [21]:
import numpy as np
from sklearn.utils.class_weight import compute_class_weight

# Example labels; replace this with your actual label array
labels = np.array([0]*(len(dataframe[target])-sum(dataframe[target])) + [1]*sum(dataframe[target]))  # 1367 False, 112 True

# Compute class weights
class_weights = compute_class_weight('balanced', classes=np.unique(labels), y=labels)
class_weights = torch.tensor(class_weights, dtype=torch.float).to(device)

In [22]:
class_weights

tensor([0.5410, 6.6027], device='cuda:2')

In [23]:
import torch
import torch.nn as nn

class WeightedCrossEntropyLoss(nn.Module):
    def __init__(self, weights):
        super(WeightedCrossEntropyLoss, self).__init__()
        self.weights = weights

    def forward(self, logits, targets):
        # Apply softmax to get probabilities
        probs = torch.softmax(logits, dim=1)

        # Compute the cross-entropy loss
        loss = -torch.sum(self.weights * targets * torch.log(probs + 1e-10), dim=1)
        return torch.mean(loss)  # Return the average loss

In [24]:
def compute_loss(logits, targets):
        loss_fn = WeightedCrossEntropyLoss(weights=class_weights.to(device))

        e = torch.eye(2).to(device)
        loss = loss_fn(logits, e[targets])
        return loss

In [25]:
from transformers import AdamW, get_scheduler

optimizer = AdamW(model.parameters(), lr=lr)

num_epoch = 100

num_training_steps = num_epoch * len(train_dataloader)

lr_scheduler = get_scheduler(
    'linear',
    optimizer = optimizer,
    num_warmup_steps = 0,
    num_training_steps = num_training_steps,
)

loss_func = torch.nn.CrossEntropyLoss()



In [26]:
wandb.init(
    project="efcp_transformer",
    name='ECFP-BERT-' + samples_count1 + "-ClinTox" + ' ' + target + " weighted_classes" + " lr=" + str(lr),
    config={}
)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
[34m[1mwandb[0m: Currently logged in as: [33morlov-aleksei53[0m ([33mmoleculary-ai[0m). Use [1m`wandb login --relogin`[0m to force relogin
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


### Training

In [27]:
from tqdm.auto import tqdm
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score, roc_auc_score

progress_bar_train = tqdm(range(num_training_steps))
progress_bar_eval = tqdm(range(num_epoch * len(eval_dataloader)))

for epoch in range(num_epoch):
    model.train()
    total_pred_labels = []
    total_true_labels = []
    epoch_loss = 0
    for batch in train_dataloader:
        input_batch = { k: v.to(device) for k, v in batch.items() if k in ['input_ids', 'attention_mask'] }
        batch[target] = batch[target].to(device)
        
        logits = model(**input_batch)
        
        # loss = loss_func(logits.view(-1, 2), batch[target].view(-1))
        loss = compute_loss(logits.view(-1, 2), batch[target].view(-1))
        loss.backward()
        epoch_loss += loss.item()
        
        pred_labels = torch.argmax(logits, dim=-1)
        true_labels = batch[target]
        total_pred_labels.append(pred_labels)
        total_true_labels.append(true_labels)
        
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar_train.update(1)

    total_pred_labels = torch.cat(total_pred_labels).cpu().detach().numpy()
    total_true_labels = torch.cat(total_true_labels).cpu().detach().numpy()
    
    wandb.log({"loss/train": epoch_loss / len(train_dataloader)})
    wandb.log({"accuracy/train": accuracy_score(total_true_labels, total_pred_labels)})
    wandb.log({"f1/train": f1_score(total_true_labels, total_pred_labels, average='micro')})
    wandb.log({"precision/train": precision_score(total_true_labels, total_pred_labels, average='micro')})
    wandb.log({"recall/train": recall_score(total_true_labels, total_pred_labels, average='micro')})
    wandb.log({"roc_auc_score/train": roc_auc_score(total_true_labels, total_pred_labels)})

    model.eval()
    total_pred_labels = []
    total_true_labels = []
    epoch_loss = 0
    for batch in eval_dataloader:
        input_batch = { k: v.to(device) for k, v in batch.items() if k in ['input_ids', 'attention_mask'] }
        batch[target] = batch[target].to(device)
        
        with torch.no_grad():
            logits = model(**input_batch)
            # loss = loss_func(logits.view(-1, 2), batch[target].view(-1))
            loss = compute_loss(logits.view(-1, 2), batch[target].view(-1))
            epoch_loss += loss.item()

            pred_labels = torch.argmax(logits, dim=-1)
            true_labels = batch[target]
            total_pred_labels.append(pred_labels)
            total_true_labels.append(true_labels)
        
        progress_bar_eval.update(1)

    total_pred_labels = torch.cat(total_pred_labels).cpu().detach().numpy()
    total_true_labels = torch.cat(total_true_labels).cpu().detach().numpy()
    
    wandb.log({"loss/validation": epoch_loss / len(eval_dataloader)})
    wandb.log({"accuracy/validation": accuracy_score(total_true_labels, total_pred_labels)})
    wandb.log({"f1/validation": f1_score(total_true_labels, total_pred_labels, average='micro')})
    wandb.log({"precision/validation": precision_score(total_true_labels, total_pred_labels, average='micro')})
    wandb.log({"recall/validation": recall_score(total_true_labels, total_pred_labels, average='micro')})
    wandb.log({"roc_auc_score/validation": roc_auc_score(total_true_labels, total_pred_labels)})

  0%|          | 0/3700 [00:00<?, ?it/s]

  0%|          | 0/500 [00:00<?, ?it/s]

In [28]:
def test_loop():
    model.eval()
    total_pred_labels = []
    total_true_labels = []
    epoch_loss = 0
    for batch in test_dataloader:
        input_batch = { k: v.to(device) for k, v in batch.items() if k in ['input_ids', 'attention_mask'] }
        batch[target] = batch[target].to(device)
        
        with torch.no_grad():
            logits = model(**input_batch)
            # loss = loss_func(logits.view(-1, 2), batch[target].view(-1))
            loss = compute_loss(logits.view(-1, 2), batch[target].view(-1))
            epoch_loss += loss.item()

            pred_labels = torch.argmax(logits, dim=-1)
            true_labels = batch[target]
            total_pred_labels.append(pred_labels)
            total_true_labels.append(true_labels)
        
        progress_bar_eval.update(1)

    total_pred_labels = torch.cat(total_pred_labels).cpu().detach().numpy()
    total_true_labels = torch.cat(total_true_labels).cpu().detach().numpy()
    
    wandb.log({"loss/test": epoch_loss / len(test_dataloader)})
    wandb.log({"accuracy/test": accuracy_score(total_true_labels, total_pred_labels)})
    wandb.log({"f1/test": f1_score(total_true_labels, total_pred_labels, average='micro')})
    wandb.log({"precision/test": precision_score(total_true_labels, total_pred_labels, average='micro')})
    wandb.log({"recall/test": recall_score(total_true_labels, total_pred_labels, average='micro')})
    wandb.log({"roc_auc_score/test": roc_auc_score(total_true_labels, total_pred_labels)})

In [29]:
test_loop()

In [30]:
wandb.finish()

VBox(children=(Label(value='0.007 MB of 0.007 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
accuracy/test,▁
accuracy/train,▁▅▄▆▆▇▇▅▆▅▆▆▇▆▆▇▆▇▇█▇▇██▇▇▇█▇█▇▇▇█▇███▇▇
accuracy/validation,▁▆▇▅█▇█▇▇▇██▇▇██▇▇▇▇▇█▇▇▇█▇▇█▇▇▇▇▇▇▇▇▇▇▇
f1/test,▁
f1/train,▁▅▄▆▆▇▇▅▆▅▆▆▇▆▆▇▆▇▇█▇▇██▇▇▇█▇█▇▇▇█▇███▇▇
f1/validation,▁▆▇▅█▇█▇▇▇██▇▇██▇▇▇▇▇█▇▇▇█▇▇█▇▇▇▇▇▇▇▇▇▇▇
loss/test,▁
loss/train,█▇▆▆▅▅▄▄▄▄▃▄▃▃▃▂▃▃▃▃▂▃▂▂▃▂▂▂▂▂▂▂▂▁▁▂▂▂▂▂
loss/validation,█▆▅▄▅▄▂▃▃▂▂▃▃▂▅▃▂▃▂▄▁▂▃▁▄▂▃▁▅▁▃▁▄▃▂▃▂▂▂▃
precision/test,▁

0,1
accuracy/test,0.72973
accuracy/train,0.74134
accuracy/validation,0.71622
f1/test,0.72973
f1/train,0.74134
f1/validation,0.71622
loss/test,0.5952
loss/train,0.5071
loss/validation,0.55905
precision/test,0.72973


In [31]:
torch.cuda.empty_cache()