# CuteSmileyBert: a toy transformer for SMILES strings

Chemical data is so complex that it is hard to find representations of molecules that machine learning models can work with. We could try different kinds of human-engineered featurization, especially for the protein structures, but they will all inevitably lose some information or introduce noise. For this reason, I believe that the best encoding for this specific task will be a BERT-like transformer of protein structures. ESM2 demonstrated an understanding of protein structures despite being trained only on sequences, so we can reasonably expect similar results on SMILES strings.

CuteSmileyBert will have only 1 million parameters, which is absolutely **tiny**. I am not expecting it to work very well, but I am curious if it will work at all, and if we can demonstrate some form of scaling laws.

In [3]:
import sys
# Adds the parent directory to the import path; the next cell imports from `src`.
# NOTE(review): the recorded output further down shows
# "ModuleNotFoundError: No module named 'src'", so this relative path
# evidently depends on the notebook's working directory -- confirm.
sys.path.append("..")

# Download the dataset

In [4]:
import re
from pathlib import Path
import random

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

from src.download_dataset import download_datasets, extract_files

# The data folder sits under the directory one level above this notebook
ROOT_DIR = Path("../")
DATA_DIR = ROOT_DIR / "data"

# Maps each archive file name to the URL it is downloaded from
smiles_datasets = {
    'SMILES_Big_Dataset.csv.zip': 'https://www.kaggle.com/api/v1/datasets/download/yanmaksi/big-molecules-smiles-dataset'
}

# Download the dataset and extract it (helpers from src.download_dataset)
download_datasets(smiles_datasets)
extract_files(smiles_datasets)

# Open the first (and only) entry of the mapping with Pandas
filename = next(iter(smiles_datasets))
file_path = DATA_DIR / filename
df = pd.read_csv(file_path)

ModuleNotFoundError: No module named 'src'

In [5]:
# "SMILES" is the column we are interested in: keep it as a plain Python list
smiles_list = df["SMILES"].tolist()

# Preview the first five entries
df["SMILES"].head(5)

0           O=S(=O)(Nc1cccc(-c2cnc3ccccc3n2)c1)c1cccs1
1    O=c1cc(-c2nc(-c3ccc(-c4cn(CCP(=O)(O)O)nn4)cc3)...
2               NC(=O)c1ccc2c(c1)nc(C1CCC(O)CC1)n2CCCO
3                  NCCCn1c(C2CCNCC2)nc2cc(C(N)=O)ccc21
4                    CNC(=S)Nc1cccc(-c2cnc3ccccc3n2)c1
Name: SMILES, dtype: object

In [None]:
# Regex that splits a SMILES string into tokens.
# Alternatives are tried left to right, so the multi-character tokens
# (%NN, the two-letter element symbols, @@) are listed before the
# single-character fallbacks (\d, [A-Za-z]).
SMILES_REGEX = re.compile(
    r"(\%\d\d|Br|Cl|Si|Na|Ca|Li|@@?|=|#|\(|\)|\.|\[|\]|\/|\\|:|~|\+|\-|\d|[A-Za-z])"
)

# Special tokens to encode different types of empty or masked spaces
# (each one is defined exactly once)
PAD_TOKEN = "<PAD>"
BOS_TOKEN = "<BOS>"
EOS_TOKEN = "<EOS>"
UNK_TOKEN = "<UNK>"
MASK_TOKEN = "<MASK>"

SPECIAL_TOKENS = [PAD_TOKEN, BOS_TOKEN, EOS_TOKEN, UNK_TOKEN, MASK_TOKEN]

# This is the max padded length of a "sentence"
MAX_LEN = 150

def tokenize_smiles(smiles: str) -> list[str]:
    """Split a SMILES string into its tokens, in order of appearance.

    Only substrings matched by SMILES_REGEX are returned; any character the
    pattern does not match is skipped.
    """
    return [match.group() for match in SMILES_REGEX.finditer(smiles)]

def build_vocab(smiles_list: list[str]) -> tuple[dict[str, int], dict[int, str]]:
    """Build the token <-> index lookup tables for a list of SMILES strings.

    The vocabulary holds every distinct token found in ``smiles_list`` plus
    the entries of SPECIAL_TOKENS; a token's index is its rank in sorted order.

    Returns:
        ``(vocab, inv_vocab)``: token -> index and index -> token.
    """
    # A set comprehension de-duplicates tokens across the whole corpus
    corpus_tokens = {tok for smiles in smiles_list for tok in tokenize_smiles(smiles)}
    ordered = sorted(corpus_tokens.union(SPECIAL_TOKENS))

    # Fill both directions of the mapping in a single pass
    vocab = {}
    inv_vocab = {}
    for index, tok in enumerate(ordered):
        vocab[tok] = index
        inv_vocab[index] = tok
    return vocab, inv_vocab

def encode_smiles(
    smiles: str, 
    vocab: dict[str, int], 
    max_len: int=MAX_LEN
) -> list[int]:
    """Turn a SMILES string into a fixed-length list of token IDs.

    The sequence is laid out as ``<BOS> tokens... <EOS> <PAD>...``. Strings
    that are too long have their *tokens* truncated so that ``<EOS>`` is
    always kept (previously the trailing slice cut ``<EOS>`` off).

    Args:
        smiles: The SMILES string to encode.
        vocab: Token -> index mapping; must contain the special tokens.
        max_len: Length of the returned list.

    Returns:
        Token IDs padded or truncated to ``max_len`` entries; tokens missing
        from ``vocab`` map to the ``<UNK>`` id.
    """
    # Reserve two slots for <BOS>/<EOS> so truncation never drops <EOS>
    body = tokenize_smiles(smiles)[:max(max_len - 2, 0)]
    tokens = [BOS_TOKEN, *body, EOS_TOKEN]

    # Creates a list with the integer IDs of each token
    unk_id = vocab[UNK_TOKEN]
    token_ids = [vocab.get(t, unk_id) for t in tokens]

    # Fill the remainder with pad tokens (a non-positive count yields no padding)
    padding = [vocab[PAD_TOKEN]] * (max_len - len(token_ids))
    # The final slice only matters for max_len < 2, where <BOS>/<EOS> alone overflow
    return (token_ids + padding)[:max_len]

def mask_tokens(
    input_ids: torch.Tensor, 
    vocab: dict[str, int], 
    mask_prob: float=0.15
) -> tuple[torch.Tensor, torch.Tensor]:
    """Apply BERT-style random masking to a tensor of token IDs.

    Each non-padding position is selected with probability ``mask_prob``.
    Of the selected positions, 80% are replaced by ``<MASK>``, 10% by a
    random token ID, and 10% are left unchanged.

    Args:
        input_ids: Integer tensor of token IDs (any shape); it is not modified.
        vocab: Token -> index mapping containing ``<MASK>`` and ``<PAD>``.
        mask_prob: Probability of selecting a non-padding position.

    Returns:
        ``(masked_input, labels)``. ``labels`` holds the original ID at the
        selected positions and -100 everywhere else.
    """
    # Two copies of ID vector so we can mask one and predict the other
    input_ids = input_ids.clone()
    labels = input_ids.clone()

    # Get the IDs of mask and pad tokens, plus vocabulary size
    mask_token_id = vocab[MASK_TOKEN]
    pad_token_id = vocab[PAD_TOKEN]
    vocab_size = len(vocab)
    shape, device = input_ids.shape, input_ids.device

    def _bernoulli(prob: float) -> torch.Tensor:
        # Draw on the input's device so the boolean ops below never mix devices
        return torch.bernoulli(torch.full(shape, prob, device=device)).bool()

    # Do not mask padding tokens
    # NOTE(review): <BOS>/<EOS> are still eligible for masking -- confirm this is intended
    maskable = input_ids != pad_token_id
    masked_indices = _bernoulli(mask_prob) & maskable
    labels[~masked_indices] = -100

    # Replace 80% of the selected tokens with <MASK>
    replace_mask = _bernoulli(0.8) & masked_indices
    input_ids[replace_mask] = mask_token_id

    # Replace 10% with a random token: that is HALF of the 20% not replaced
    # above, hence p=0.5 (p=0.1 here would only randomise ~2% overall)
    random_mask = _bernoulli(0.5) & masked_indices & ~replace_mask
    random_tokens = torch.randint(vocab_size, shape, dtype=torch.long, device=device)
    input_ids[random_mask] = random_tokens[random_mask]

    # The remaining 10% keep their original token (no-op)
    return input_ids, labels

In [7]:
# Dataset class with input-target pairs for masked vs original strings
class SMILESMaskedDataset(torch.utils.data.Dataset):
    """Masked-language-modelling dataset over SMILES strings.

    Every string is encoded once at construction time. Masking is applied
    anew on each item access, so the same string can receive a different
    mask every time it is drawn.
    """

    def __init__(
        self, 
        smiles_list: list[str], 
        vocab: dict[str, int], 
        max_len: int=150
    ) -> None:
        """Encode every string in ``smiles_list`` to ``max_len`` token IDs.

        Args:
            smiles_list: Raw SMILES strings.
            vocab: Token -> index mapping passed to the encoder and the masker.
            max_len: Fixed length of every encoded sequence.
        """
        self.vocab = vocab
        self.max_len = max_len
        # "data" is a list holding the encoded form of every SMILES string
        self.data = [encode_smiles(s, vocab, max_len) for s in smiles_list]

    # Length method, required by pytorch
    def __len__(self) -> int:
        """Return the number of encoded strings."""
        return len(self.data)

    def __getitem__(self, idx: int) -> tuple[torch.Tensor, torch.Tensor]:
        """Return ``(masked_input, labels)`` tensors for item ``idx``."""
        input_ids = torch.tensor(self.data[idx], dtype=torch.long)
        return mask_tokens(input_ids, self.vocab)

In [None]:
# Our Transformer class
class CuteSmileyBERT(nn.Module):
    """BERT-style Transformer encoder that predicts a token at every position.

    Token embeddings and learnt positional embeddings are summed, passed
    through a stack of ``nn.TransformerEncoderLayer`` blocks, and projected
    to vocabulary logits by a linear head.

    Args:
        vocab_size: Number of distinct token IDs.
        d_model: Width of the embeddings and of every encoder layer.
        nhead: Number of attention heads per layer.
        num_layers: Number of stacked encoder layers.
        dim_feedforward: Hidden width of each layer's feed-forward block.
        dropout: Dropout probability used inside each encoder layer.
        max_len: Longest sequence the positional embedding table supports.
    """

    def __init__(self, 
        vocab_size: int, 
        d_model: int=256, 
        nhead: int=8, 
        num_layers: int=6, 
        dim_feedforward: int=1024, 
        dropout: float=0.1, 
        max_len: int=150
    ) -> None:
        # Initialize the nn.Module parent class, needed for proper inheritance
        super().__init__()
        # Kept so forward() can reject sequences longer than the position table
        self.max_len = max_len

        # Token embeddings: a learnt weight matrix of shape [vocab_size, d_model]
        self.embed = nn.Embedding(vocab_size, d_model)
        # Positional embeddings: learnt from integer positions only, with no
        # information on the tokens; they are added to the token embeddings
        self.pos_embed = nn.Embedding(max_len, d_model)

        # A single transformer layer, with attention and feedforward/MLP sub-layers
        # batch_first means that the input shape is [batch, seq_len, d_model]
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=d_model, nhead=nhead,
            dim_feedforward=dim_feedforward, dropout=dropout, batch_first=True
        )

        # The model's main structure: a sequence of num_layers encoder layers
        self.encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)

        # Final linear layer mapping each d_model embedding to vocab_size logits
        self.lm_head = nn.Linear(d_model, vocab_size)

    def forward(
        self,
        input_ids: torch.Tensor,
        padding_mask: "torch.Tensor | None" = None,
    ) -> torch.Tensor:
        """Run a forward pass and return per-position vocabulary logits.

        Args:
            input_ids: Integer tensor of shape [batch, seq_len].
            padding_mask: Optional boolean tensor of shape [batch, seq_len],
                True at positions attention should ignore (e.g.
                ``input_ids == pad_id``). With the default ``None`` every
                position is attended to, as before.

        Returns:
            Logits of shape [batch, seq_len, vocab_size].

        Raises:
            ValueError: If ``seq_len`` exceeds ``max_len``.
        """
        # Unpacking also enforces a 2-D input; only seq_len is needed below
        _, seq_len = input_ids.shape
        if seq_len > self.max_len:
            raise ValueError(f"Sequence length {seq_len} exceeds maximum {self.max_len}")

        # [1, seq_len] tensor of integer positions, broadcast over the batch
        positions = torch.arange(seq_len, device=input_ids.device).unsqueeze(0)
        # Positional embeddings are summed with the token embeddings
        x = self.embed(input_ids) + self.pos_embed(positions)

        # Encoder stack; masked (padding) keys are excluded from attention
        x = self.encoder(x, src_key_padding_mask=padding_mask)
        # The linear layer outputs logits
        return self.lm_head(x)

In [None]:
# Now we're going to wrap our model inside a Hugging Face Transformers class
# This gives us a standard definition that can be easily pushed and pulled

from transformers import PretrainedConfig, PreTrainedModel

class CSBConfig(PretrainedConfig):
    """Hugging Face config holding the CuteSmileyBERT hyper-parameters.

    Every argument is stored as an attribute of the same name; extra keyword
    arguments are forwarded to ``PretrainedConfig``.
    """
    model_type = "CuteSmileyBERT"
    def __init__(self, vocab_size=100, d_model=256, nhead=8, num_layers=6, dim_feedforward=1024, dropout=0.1, max_len=150, **kwargs):
        super().__init__(**kwargs)
        self.vocab_size = vocab_size
        self.d_model = d_model
        self.nhead = nhead
        self.num_layers = num_layers
        self.dim_feedforward = dim_feedforward
        self.dropout = dropout
        self.max_len = max_len


# Alias: the training cell of this notebook refers to the config class by
# this longer name, which was otherwise undefined (NameError)
CuteSmileyBERTConfig = CSBConfig


class CuteSmileyBERT_HF(PreTrainedModel):
    """Hugging Face wrapper that builds a CuteSmileyBERT from a CSBConfig."""
    # Must name the config class defined above
    config_class = CSBConfig
    def __init__(self, config):
        super().__init__(config)
        # The wrapped network is built entirely from the config's attributes
        self.model = CuteSmileyBERT(
            vocab_size=config.vocab_size,
            d_model=config.d_model,
            nhead=config.nhead,
            num_layers=config.num_layers,
            dim_feedforward=config.dim_feedforward,
            dropout=config.dropout,
            max_len=config.max_len
        )
        self.post_init()

    def forward(self, input_ids):
        """Delegate to the wrapped model and return its logits."""
        return self.model(input_ids)

In [5]:
# Report whether PyTorch sees a CUDA device and, if so, which one
cuda_available = torch.cuda.is_available()
print("CUDA available:", cuda_available)
if not cuda_available:
    print("No CUDA device detected.")
else:
    device_index = torch.cuda.current_device()
    print("Current device:", device_index)
    print("Device name:", torch.cuda.get_device_name(device_index))

CUDA available: False
No CUDA device detected.


In [None]:
# Train the model

vocab, inv_vocab = build_vocab(smiles_list)

dataset = SMILESMaskedDataset(smiles_list, vocab, max_len=150)
loader = DataLoader(dataset, batch_size=32, shuffle=True)

# CSBConfig is the config class defined in the Hugging Face wrapper cell
config = CSBConfig(vocab_size=len(vocab))
hf_model = CuteSmileyBERT_HF(config)

# Use the GPU when one is available, otherwise stay on the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
hf_model.to(device)

# Positions labelled -100 (the unmasked ones) do not contribute to the loss
criterion = nn.CrossEntropyLoss(ignore_index=-100)
# Optimise the parameters of the model that is actually being trained
optimizer = torch.optim.AdamW(hf_model.parameters(), lr=1e-4)

hf_model.train()
for epoch in range(3):
    running_loss = 0.0
    num_batches = 0
    for masked_input, labels in loader:
        masked_input = masked_input.to(device)
        labels = labels.to(device)

        optimizer.zero_grad()
        logits = hf_model(masked_input)
        # Flatten [batch, seq_len, vocab] -> [batch * seq_len, vocab] for the loss
        loss = criterion(logits.view(-1, logits.size(-1)), labels.view(-1))
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        num_batches += 1
    # Report the mean loss over the epoch rather than the last batch only;
    # max(..., 1) keeps an empty loader from raising
    print(f"Epoch {epoch+1}: loss = {running_loss / max(num_batches, 1):.4f}")

In [None]:
# Write the model to a local directory in Hugging Face format
hf_model.save_pretrained("./cutesmileybert")

# Push the model first, then its config, to the same Hub repository
hub_repo = "marcosbolanos/cutesmileybert"
for artifact in (hf_model, config):
    artifact.push_to_hub(hub_repo)