In [1]:
import torch
import pandas as pd
from pathlib import Path
from tqdm.auto import tqdm
import numpy as np

In [2]:
filename = 'ecfp0'
samples_count = '10M'
model_name = f'molberto_{filename}_{samples_count}'

In [3]:
molecular_properties = ['Molecular Weight', 'Bioactivities', 'AlogP', 'Polar Surface Area', 'CX Acidic pKa', 'CX Basic pKa']

### Upload and Split Dataset

In [4]:
dataframe = pd.read_csv("data.csv")

In [5]:
dataframe = dataframe.drop(columns=['Unnamed: 0', 'Smiles', 'ecfp2', 'ecfp3'])

In [6]:
def preprocess_data_dataset(df, column):
    for row in tqdm(range(len(df))):
        str_ints = eval(df.iloc[row][column])
        str_fingerprint = ' '.join(str_ints)
        df.at[row, column] = str_fingerprint

In [7]:
preprocess_data_dataset(dataframe, 'ecfp1')

  0%|          | 0/2372673 [00:00<?, ?it/s]

In [8]:
dataframe

Unnamed: 0,Molecular Weight,Bioactivities,AlogP,Polar Surface Area,CX Acidic pKa,CX Basic pKa,ecfp1
0,415.99,6.0,4.09,56.15,,,2246728737 2245273601 1026928756 3217380708 20...
1,215.25,51.0,2.79,42.23,3.97,,2246728737 3217380708 3218693969 3218693969 32...
2,475.94,2.0,6.00,37.39,,7.76,882399112 3217380708 3218693969 3217380708 882...
3,548.59,1.0,6.37,186.56,6.08,2.29,2246728737 3217380708 3218693969 3217380708 32...
4,314.35,23.0,1.33,92.92,8.77,5.70,2246728737 3217380708 2041434490 3217380708 32...
...,...,...,...,...,...,...,...
2372668,460.49,3.0,4.78,95.78,,0.68,2246728737 864674487 2246699815 864942730 3217...
2372669,382.42,6.0,3.84,69.64,9.49,,864942730 3217380708 2132511834 3217380708 321...
2372670,844.84,,6.05,359.42,-3.92,2.35,847957139 3217380708 3218693969 3217380708 999...
2372671,480.00,4.0,2.55,90.09,11.49,7.19,847957139 2246699815 1026654305 847961216 8473...


In [9]:
print('Percentage on NaNs:')
dataframe.isna().mean()

Percentage on NaNs:


Molecular Weight      0.000000
Bioactivities         0.039649
AlogP                 0.025650
Polar Surface Area    0.025650
CX Acidic pKa         0.443230
CX Basic pKa          0.370395
ecfp1                 0.000000
dtype: float64

In [10]:
rows_with_nans = dataframe['Molecular Weight'].isna() | \
                 dataframe['Bioactivities'].isna() | \
                 dataframe['AlogP'].isna() | \
                 dataframe['Polar Surface Area'].isna() | \
                 dataframe['CX Acidic pKa'].isna() | \
                 dataframe['CX Basic pKa'].isna()
print(f'Count of rows without NaNs: {dataframe.shape[0] - dataframe.loc[rows_with_nans].shape[0]}')

Count of rows without NaNs: 763202


In [11]:
# remove 2 last properties to reduce NaN counts
molecular_properties = ['Molecular Weight', 'Bioactivities', 'AlogP', 'Polar Surface Area']
dataframe = dataframe.drop(columns=['CX Acidic pKa', 'CX Basic pKa'])

In [12]:
# drop NaN's
dataframe = dataframe.dropna().reset_index(drop=True)

In [13]:
dataframe

Unnamed: 0,Molecular Weight,Bioactivities,AlogP,Polar Surface Area,ecfp1
0,415.99,6.0,4.09,56.15,2246728737 2245273601 1026928756 3217380708 20...
1,215.25,51.0,2.79,42.23,2246728737 3217380708 3218693969 3218693969 32...
2,475.94,2.0,6.00,37.39,882399112 3217380708 3218693969 3217380708 882...
3,548.59,1.0,6.37,186.56,2246728737 3217380708 3218693969 3217380708 32...
4,314.35,23.0,1.33,92.92,2246728737 3217380708 2041434490 3217380708 32...
...,...,...,...,...,...
2220510,398.44,2.0,3.72,99.43,2246728737 2245384272 2092489639 3217380708 86...
2220511,460.49,3.0,4.78,95.78,2246728737 864674487 2246699815 864942730 3217...
2220512,382.42,6.0,3.84,69.64,864942730 3217380708 2132511834 3217380708 321...
2220513,480.00,4.0,2.55,90.09,847957139 2246699815 1026654305 847961216 8473...


In [14]:
from datasets import Dataset, DatasetDict

dataset = Dataset.from_pandas(dataframe)
train_testvalid = dataset.train_test_split(test_size=0.2, seed=15)

test_valid = train_testvalid['test'].train_test_split(test_size=0.5, seed=15)

# 10% for test, 10 for validation, 80% for train
dataset = DatasetDict({
    'train': train_testvalid['train'],
    'test': test_valid['test'],
    'validation': test_valid['train']})

dataset

DatasetDict({
    train: Dataset({
        features: ['Molecular Weight', 'Bioactivities', 'AlogP', 'Polar Surface Area', 'ecfp1'],
        num_rows: 1776412
    })
    test: Dataset({
        features: ['Molecular Weight', 'Bioactivities', 'AlogP', 'Polar Surface Area', 'ecfp1'],
        num_rows: 222052
    })
    validation: Dataset({
        features: ['Molecular Weight', 'Bioactivities', 'AlogP', 'Polar Surface Area', 'ecfp1'],
        num_rows: 222051
    })
})

### Tokenize Data

In [15]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)

tokenizer.model_max_len=512

In [16]:
def tokenize(batch):
  return tokenizer(batch["ecfp1"], truncation=True, max_length=512, padding='max_length')

tokenized_dataset = dataset.map(tokenize, batched=True)
tokenized_dataset

Map:   0%|          | 0/1776412 [00:00<?, ? examples/s]

Map:   0%|          | 0/222052 [00:00<?, ? examples/s]

Map:   0%|          | 0/222051 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['Molecular Weight', 'Bioactivities', 'AlogP', 'Polar Surface Area', 'ecfp1', 'input_ids', 'attention_mask'],
        num_rows: 1776412
    })
    test: Dataset({
        features: ['Molecular Weight', 'Bioactivities', 'AlogP', 'Polar Surface Area', 'ecfp1', 'input_ids', 'attention_mask'],
        num_rows: 222052
    })
    validation: Dataset({
        features: ['Molecular Weight', 'Bioactivities', 'AlogP', 'Polar Surface Area', 'ecfp1', 'input_ids', 'attention_mask'],
        num_rows: 222051
    })
})

In [17]:
columns = ["input_ids", "attention_mask"]
columns.extend(molecular_properties) # our labels
print(columns)
tokenized_dataset.set_format('torch', columns=columns)

from transformers import DataCollatorWithPadding
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

['input_ids', 'attention_mask', 'Molecular Weight', 'Bioactivities', 'AlogP', 'Polar Surface Area']


2024-02-27 13:22:10.394550: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


### Create Model

In [18]:
from transformers import AutoModel, AutoConfig

class MolecularPropertiesRegression(torch.nn.Module):
    def __init__(self, model_name, num_properties):
        super(MolecularPropertiesRegression, self).__init__()
        self.num_properties = num_properties

        config = AutoConfig.from_pretrained(model_name)
        self.transformer = AutoModel.from_pretrained(model_name, config=config)
        # removing last layer of transformer
        self.transformer.pooler = torch.nn.Identity()
        # freezing transformer weights
        for param in self.transformer.parameters():
            param.requires_grad = False
        self.regressor = torch.nn.Linear(768, num_properties)

    def forward(self, input_ids = None, attention_mask=None):
        outputs = self.transformer(input_ids=input_ids, attention_mask=attention_mask)
        
        last_hidden_state = outputs[0]
        # last_hidden_state is the shape of (batch_size=32, input_sequence_length=512, hidden_size=768)
        # so we take only hidden emdedding for [CLS] token (first) as it contains the entire context
        # and would be sufficient for simple downstream tasks such as classification/regression
        predicted_property_values = self.regressor(last_hidden_state[:, 0, : ].view(-1, 768))

        return predicted_property_values
        

In [19]:
# this is how last layer is removed from Roberta
config = AutoConfig.from_pretrained(model_name)
transformer = AutoModel.from_pretrained(model_name, config=config)
transformer.pooler = torch.nn.Identity()
transformer

  torch.utils._pytree._register_pytree_node(
Some weights of RobertaModel were not initialized from the model checkpoint at molberto_ecfp0_10M and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


RobertaModel(
  (embeddings): RobertaEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=1)
    (position_embeddings): Embedding(514, 768, padding_idx=1)
    (token_type_embeddings): Embedding(1, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): RobertaEncoder(
    (layer): ModuleList(
      (0-5): 6 x RobertaLayer(
        (attention): RobertaAttention(
          (self): RobertaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): RobertaSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout)

### Create PyTorch DataLoader

In [20]:
from torch.utils.data import DataLoader

train_dataloader = DataLoader(
    tokenized_dataset['train'], shuffle = True, batch_size = 1024, collate_fn = data_collator
)

eval_dataloader = DataLoader(
    tokenized_dataset['validation'], shuffle = True, batch_size = 512, collate_fn = data_collator
)

In [21]:
device = torch.device("cuda", index=2) if torch.cuda.is_available() else torch.device('cpu')

model = MolecularPropertiesRegression(model_name, num_properties=len(molecular_properties)).to(device)

Some weights of RobertaModel were not initialized from the model checkpoint at molberto_ecfp0_10M and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [22]:
model

MolecularPropertiesRegression(
  (transformer): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-5): 6 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              

In [23]:
from transformers import AdamW, get_scheduler

optimizer = AdamW(model.parameters(), lr=5e-5)

num_epoch = 1

num_training_steps = num_epoch * len(train_dataloader)

lr_scheduler = get_scheduler(
    'linear',
    optimizer = optimizer,
    num_warmup_steps = 0,
    num_training_steps = num_training_steps,
)

loss_func = torch.nn.MSELoss()



In [24]:
from datasets import load_metric

# a metric for each property
metrics = { k: load_metric("mse") for k in molecular_properties }

  metrics = { k: load_metric("mse") for k in molecular_properties }


### Training

In [25]:
from tqdm.auto import tqdm

progress_bar_train = tqdm(range(num_training_steps))
progress_bar_eval = tqdm(range(num_epoch * len(eval_dataloader)))


for epoch in range(num_epoch):
    model.train()
    for batch in train_dataloader:
        input_batch = { k: v.to(device) for k, v in batch.items() if k in ['input_ids', 'attention_mask'] }
        labels_batch = { k: v.to(device) for k, v in batch.items() if k in molecular_properties }

        labeled_property_values = torch.stack(list(labels_batch.values())).T
        predicted_property_values = model(**input_batch)
        
        loss = loss_func(predicted_property_values, labeled_property_values)
        loss.backward()
        
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar_train.update(1)

    model.eval()
    for batch in eval_dataloader:
        input_batch = { k: v.to(device) for k, v in batch.items() if k in ['input_ids', 'attention_mask'] }
        labels_batch = { k: v.to(device) for k, v in batch.items() if k in molecular_properties }

        labeled_property_values = torch.stack(list(labels_batch.values())).T
        with torch.no_grad():
            predicted_property_values = model(**input_batch)

        for i, molecular_property in enumerate(molecular_properties):
            metrics[molecular_property].add_batch(predictions = [predicted_property_values[0][i]], references = [labeled_property_values[0][i]])
        progress_bar_eval.update(1)
    
    for molecular_property in molecular_properties:
        print(f'Metric for {molecular_property}:', metrics[molecular_property].compute()) 

  0%|          | 0/1735 [00:00<?, ?it/s]

  0%|          | 0/434 [00:00<?, ?it/s]

You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Metric for Molecular Weight: {'mse': 161181.593241697}
Metric for Bioactivities: {'mse': 182.00275182570348}
Metric for AlogP: {'mse': 3.250546004155947}
Metric for Polar Surface Area: {'mse': 5509.086779695163}


In [None]:
torch.cuda.empty_cache()