In [1]:
import torch
import pandas as pd
from pathlib import Path
from tqdm.auto import tqdm
import numpy as np

In [2]:
filename = 'ecfp0'
samples_count = '10M'
model_name = f'molberto_{filename}_{samples_count}'

In [3]:
molecular_properties = ['Molecular Weight', 'Bioactivities', 'AlogP', 'Polar Surface Area', 'CX Acidic pKa', 'CX Basic pKa']

### Upload and Split Dataset

In [4]:
dataframe = pd.read_csv("data.csv")

In [5]:
dataframe = dataframe.drop(columns=['Unnamed: 0', 'Smiles', 'ecfp2', 'ecfp3'])

In [6]:
def preprocess_data_dataset(df, column):
    for row in tqdm(range(len(df))):
        str_ints = eval(df.iloc[row][column])
        str_fingerprint = ' '.join(str_ints)
        df.at[row, column] = str_fingerprint

In [7]:
preprocess_data_dataset(dataframe, 'ecfp1')

  0%|          | 0/2372673 [00:00<?, ?it/s]

In [8]:
dataframe

Unnamed: 0,Molecular Weight,Bioactivities,AlogP,Polar Surface Area,CX Acidic pKa,CX Basic pKa,ecfp1
0,415.99,6.0,4.09,56.15,,,2246728737 2245273601 1026928756 3217380708 20...
1,215.25,51.0,2.79,42.23,3.97,,2246728737 3217380708 3218693969 3218693969 32...
2,475.94,2.0,6.00,37.39,,7.76,882399112 3217380708 3218693969 3217380708 882...
3,548.59,1.0,6.37,186.56,6.08,2.29,2246728737 3217380708 3218693969 3217380708 32...
4,314.35,23.0,1.33,92.92,8.77,5.70,2246728737 3217380708 2041434490 3217380708 32...
...,...,...,...,...,...,...,...
2372668,460.49,3.0,4.78,95.78,,0.68,2246728737 864674487 2246699815 864942730 3217...
2372669,382.42,6.0,3.84,69.64,9.49,,864942730 3217380708 2132511834 3217380708 321...
2372670,844.84,,6.05,359.42,-3.92,2.35,847957139 3217380708 3218693969 3217380708 999...
2372671,480.00,4.0,2.55,90.09,11.49,7.19,847957139 2246699815 1026654305 847961216 8473...


### Dealing with NaNs

In [9]:
print('Percentage on NaNs:')
dataframe.isna().mean()

Percentage on NaNs:


Molecular Weight      0.000000
Bioactivities         0.039649
AlogP                 0.025650
Polar Surface Area    0.025650
CX Acidic pKa         0.443230
CX Basic pKa          0.370395
ecfp1                 0.000000
dtype: float64

In [10]:
rows_with_nans = dataframe['Molecular Weight'].isna() | \
                 dataframe['Bioactivities'].isna() | \
                 dataframe['AlogP'].isna() | \
                 dataframe['Polar Surface Area'].isna() | \
                 dataframe['CX Acidic pKa'].isna() | \
                 dataframe['CX Basic pKa'].isna()
print(f'Count of rows without NaNs: {dataframe.shape[0] - dataframe.loc[rows_with_nans].shape[0]}')

Count of rows without NaNs: 763202


In [11]:
# remove 2 last properties to reduce NaN counts
molecular_properties = ['Molecular Weight', 'Bioactivities', 'AlogP', 'Polar Surface Area']
dataframe = dataframe.drop(columns=['CX Acidic pKa', 'CX Basic pKa'])

In [12]:
# drop NaN's
dataframe = dataframe.dropna().reset_index(drop=True)

In [13]:
dataframe

Unnamed: 0,Molecular Weight,Bioactivities,AlogP,Polar Surface Area,ecfp1
0,415.99,6.0,4.09,56.15,2246728737 2245273601 1026928756 3217380708 20...
1,215.25,51.0,2.79,42.23,2246728737 3217380708 3218693969 3218693969 32...
2,475.94,2.0,6.00,37.39,882399112 3217380708 3218693969 3217380708 882...
3,548.59,1.0,6.37,186.56,2246728737 3217380708 3218693969 3217380708 32...
4,314.35,23.0,1.33,92.92,2246728737 3217380708 2041434490 3217380708 32...
...,...,...,...,...,...
2220510,398.44,2.0,3.72,99.43,2246728737 2245384272 2092489639 3217380708 86...
2220511,460.49,3.0,4.78,95.78,2246728737 864674487 2246699815 864942730 3217...
2220512,382.42,6.0,3.84,69.64,864942730 3217380708 2132511834 3217380708 321...
2220513,480.00,4.0,2.55,90.09,847957139 2246699815 1026654305 847961216 8473...


### Split Data

In [14]:
from sklearn.model_selection import train_test_split

train_dataframe, test_dataframe = train_test_split(dataframe, test_size=0.20, random_state=42)
validation_dataframe, test_dataframe = train_test_split(test_dataframe, test_size=0.50, random_state=42)

In [15]:
# 80% for train, 10% for test, 10 for validation
print(len(train_dataframe), len(test_dataframe), len(validation_dataframe))

1776412 222052 222051


### Normalize Data

In [16]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

In [17]:
train_dataframe[molecular_properties] = scaler.fit_transform(train_dataframe[molecular_properties])
test_dataframe[molecular_properties] = scaler.transform(test_dataframe[molecular_properties])
validation_dataframe[molecular_properties] = scaler.transform(validation_dataframe[molecular_properties])

# scaler.inverse_transform(train_dataframe[molecular_property])

In [18]:
train_dataframe

Unnamed: 0,Molecular Weight,Bioactivities,AlogP,Polar Surface Area,ecfp1
1413264,-0.159203,-0.142348,-0.872408,-0.479943,2246728737 2246699815 2246703798 3217380708 32...
1413646,-0.095690,-0.017679,-0.766013,1.025944,2246728737 2245384272 3217380708 2041434490 21...
624451,0.589114,-0.121570,-1.138397,1.002462,864942730 2246699815 864674487 2976033787 2968...
1193918,-1.954719,0.065433,-1.319269,-0.497613,2246728737 2092489639 3218693969 2041434490 32...
2045880,-0.987089,-0.121570,-0.430868,0.186628,2246728737 3217380708 3218693969 3218693969 32...
...,...,...,...,...,...
1570006,-0.022798,-0.142348,0.968229,-0.289295,2246728737 2245384272 2245384272 2245384272 22...
732180,0.552628,-0.163126,0.696921,0.374951,2246728737 3217380708 3218693969 3217380708 86...
110268,3.072694,-0.121570,0.441573,1.515817,2246728737 864674487 3217380708 3218693969 321...
1692743,-0.644091,-0.163126,-0.547903,-0.066329,2246728737 2246699815 847336149 847961216 2246...


### Tokenize Data

In [19]:
from datasets import Dataset, DatasetDict

dataset = DatasetDict({
    'train': Dataset.from_pandas(train_dataframe),
    'test': Dataset.from_pandas(test_dataframe),
    'validation': Dataset.from_pandas(validation_dataframe)
})

dataset

DatasetDict({
    train: Dataset({
        features: ['Molecular Weight', 'Bioactivities', 'AlogP', 'Polar Surface Area', 'ecfp1', '__index_level_0__'],
        num_rows: 1776412
    })
    test: Dataset({
        features: ['Molecular Weight', 'Bioactivities', 'AlogP', 'Polar Surface Area', 'ecfp1', '__index_level_0__'],
        num_rows: 222052
    })
    validation: Dataset({
        features: ['Molecular Weight', 'Bioactivities', 'AlogP', 'Polar Surface Area', 'ecfp1', '__index_level_0__'],
        num_rows: 222051
    })
})

In [20]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)

tokenizer.model_max_len=512

In [21]:
def tokenize(batch):
  return tokenizer(batch["ecfp1"], truncation=True, max_length=512, padding='max_length')

tokenized_dataset = dataset.map(tokenize, batched=True)
tokenized_dataset

Map:   0%|          | 0/1776412 [00:00<?, ? examples/s]

Map:   0%|          | 0/222052 [00:00<?, ? examples/s]

Map:   0%|          | 0/222051 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['Molecular Weight', 'Bioactivities', 'AlogP', 'Polar Surface Area', 'ecfp1', '__index_level_0__', 'input_ids', 'attention_mask'],
        num_rows: 1776412
    })
    test: Dataset({
        features: ['Molecular Weight', 'Bioactivities', 'AlogP', 'Polar Surface Area', 'ecfp1', '__index_level_0__', 'input_ids', 'attention_mask'],
        num_rows: 222052
    })
    validation: Dataset({
        features: ['Molecular Weight', 'Bioactivities', 'AlogP', 'Polar Surface Area', 'ecfp1', '__index_level_0__', 'input_ids', 'attention_mask'],
        num_rows: 222051
    })
})

In [22]:
columns = ["input_ids", "attention_mask"]
columns.extend(molecular_properties) # our labels
print(columns)
tokenized_dataset.set_format('torch', columns=columns)

['input_ids', 'attention_mask', 'Molecular Weight', 'Bioactivities', 'AlogP', 'Polar Surface Area']


### Create Model

In [23]:
from transformers import AutoModel, AutoConfig

class MolecularPropertiesRegression(torch.nn.Module):
    def __init__(self, model_name, num_properties):
        super(MolecularPropertiesRegression, self).__init__()
        self.num_properties = num_properties

        config = AutoConfig.from_pretrained(model_name)
        self.transformer = AutoModel.from_pretrained(model_name, config=config)
        # removing last layer of transformer
        self.transformer.pooler = torch.nn.Identity()
        # freezing transformer weights
        for param in self.transformer.parameters():
            param.requires_grad = False
        self.regressor = torch.nn.Linear(768, num_properties)

    def forward(self, input_ids = None, attention_mask=None):
        outputs = self.transformer(input_ids=input_ids, attention_mask=attention_mask)
        
        last_hidden_state = outputs[0]
        # last_hidden_state is the shape of (batch_size=32, input_sequence_length=512, hidden_size=768)
        # so we take only hidden emdedding for [CLS] token (first) as it contains the entire context
        # and would be sufficient for simple downstream tasks such as classification/regression
        predicted_property_values = self.regressor(last_hidden_state[:, 0, : ].view(-1, 768))

        return predicted_property_values
        

### Create PyTorch DataLoader

In [24]:
from torch.utils.data import DataLoader
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

train_dataloader = DataLoader(
    tokenized_dataset['train'], shuffle = True, batch_size = 1024, collate_fn = data_collator
)

eval_dataloader = DataLoader(
    tokenized_dataset['validation'], shuffle = True, batch_size = 512, collate_fn = data_collator
)

2024-02-27 22:10:18.764657: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [25]:
device = torch.device("cuda", index=2) if torch.cuda.is_available() else torch.device('cpu')

model = MolecularPropertiesRegression(model_name, num_properties=len(molecular_properties)).to(device)

  torch.utils._pytree._register_pytree_node(
Some weights of RobertaModel were not initialized from the model checkpoint at molberto_ecfp0_10M and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [26]:
model

MolecularPropertiesRegression(
  (transformer): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-5): 6 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              

In [27]:
from transformers import AdamW, get_scheduler

optimizer = AdamW(model.parameters(), lr=5e-5)

num_epoch = 3

num_training_steps = num_epoch * len(train_dataloader)

lr_scheduler = get_scheduler(
    'linear',
    optimizer = optimizer,
    num_warmup_steps = 0,
    num_training_steps = num_training_steps,
)

loss_func = torch.nn.MSELoss()



In [28]:
from datasets import load_metric

# a metric for each property
metrics = { k: load_metric("mse") for k in molecular_properties }

  metrics = { k: load_metric("mse") for k in molecular_properties }


### Training

In [29]:
from tqdm.auto import tqdm

progress_bar_train = tqdm(range(num_training_steps))
progress_bar_eval = tqdm(range(num_epoch * len(eval_dataloader)))


for epoch in range(num_epoch):
    model.train()
    for batch in train_dataloader:
        input_batch = { k: v.to(device) for k, v in batch.items() if k in ['input_ids', 'attention_mask'] }
        labels_batch = { k: v.to(device) for k, v in batch.items() if k in molecular_properties }

        labeled_property_values = torch.stack(list(labels_batch.values())).T
        predicted_property_values = model(**input_batch)
        
        loss = loss_func(predicted_property_values, labeled_property_values)
        loss.backward()
        
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar_train.update(1)

    model.eval()
    for batch in eval_dataloader:
        input_batch = { k: v.to(device) for k, v in batch.items() if k in ['input_ids', 'attention_mask'] }
        labels_batch = { k: v.to(device) for k, v in batch.items() if k in molecular_properties }

        labeled_property_values = torch.stack(list(labels_batch.values())).T
        with torch.no_grad():
            predicted_property_values = model(**input_batch)

        for i, molecular_property in enumerate(molecular_properties):
            metrics[molecular_property].add_batch(predictions = [predicted_property_values[0][i]], references = [labeled_property_values[0][i]])
        progress_bar_eval.update(1)
    
    for molecular_property in molecular_properties:
        print(f'Metric for {molecular_property}:', metrics[molecular_property].compute()) 

  0%|          | 0/5205 [00:00<?, ?it/s]

  0%|          | 0/1302 [00:00<?, ?it/s]

You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Metric for Molecular Weight: {'mse': 0.18098786887360357}
Metric for Bioactivities: {'mse': 0.2755645191115328}
Metric for AlogP: {'mse': 0.3097385854907803}
Metric for Polar Surface Area: {'mse': 0.29978269738031266}
Metric for Molecular Weight: {'mse': 0.1134967267640417}
Metric for Bioactivities: {'mse': 2.155692200645302}
Metric for AlogP: {'mse': 0.2586677511742193}
Metric for Polar Surface Area: {'mse': 0.1533400276284276}
Metric for Molecular Weight: {'mse': 0.11809765318889656}
Metric for Bioactivities: {'mse': 0.05533164173159428}
Metric for AlogP: {'mse': 0.18770166932345378}
Metric for Polar Surface Area: {'mse': 0.1518035156499337}


## Post Training Evaluation

In [30]:
model.eval()

test_dataloader = DataLoader(
    tokenized_dataset['test'], batch_size = 512, collate_fn = data_collator
)

for batch in tqdm(test_dataloader):
        input_batch = { k: v.to(device) for k, v in batch.items() if k in ['input_ids', 'attention_mask'] }
        labels_batch = { k: v.to(device) for k, v in batch.items() if k in molecular_properties }

        labeled_property_values = torch.stack(list(labels_batch.values())).T
        with torch.no_grad():
            predicted_property_values = model(**input_batch)

        for i, molecular_property in enumerate(molecular_properties):
            metrics[molecular_property].add_batch(predictions = [predicted_property_values[0][i]], references = [labeled_property_values[0][i]])
    
for molecular_property in molecular_properties:
    print(f'Metric for {molecular_property}:', metrics[molecular_property].compute())

  0%|          | 0/434 [00:00<?, ?it/s]

Metric for Molecular Weight: {'mse': 0.12865685785221123}
Metric for Bioactivities: {'mse': 0.10600101066374959}
Metric for AlogP: {'mse': 0.20758568381205134}
Metric for Polar Surface Area: {'mse': 0.1454521742691242}


In [31]:
torch.save(model, model_name + '_and_linear_3epochs.pt')

In [32]:
torch.cuda.empty_cache()