# Pre-training
- This notebook contains the code for pre-training the general domain language model on 1 million molecules from the ChEMBL database using the SMILES representation
- The code is adapted from https://github.com/XinhaoLi74/MolPMoFiT/blob/master/notebooks/01_MSPM_Pretraining.ipynb

### Install RDKit on Google colaboratory

In [None]:
import sys
import os
import requests
import subprocess
import shutil
from logging import getLogger, StreamHandler, INFO


# Module-level logger used by the installer to report progress.
logger = getLogger(__name__)
# Attach a stream handler only once: re-running this cell would otherwise stack
# another handler on the same logger and emit every message multiple times.
if not any(isinstance(h, StreamHandler) for h in logger.handlers):
    logger.addHandler(StreamHandler())
logger.setLevel(INFO)


def install(
        chunk_size=4096,
        file_name="Miniconda3-latest-Linux-x86_64.sh",
        url_base="https://repo.continuum.io/miniconda/",
        conda_path=os.path.expanduser(os.path.join("~", "miniconda")),
        rdkit_version=None,
        add_python_path=True,
        force=False):
    """Install RDKit through a freshly downloaded Miniconda.

    ```
    import rdkit_installer
    rdkit_installer.install()
    ```

    Args:
        chunk_size: Number of bytes requested per chunk while downloading.
        file_name: Installer script name; also the local download target
            (relative to the current working directory).
        url_base: URL prefix the installer is fetched from.
        conda_path: Directory Miniconda is installed into. Anything already
            present at this path is removed before installing.
        rdkit_version: Exact RDKit version to pin, or None for no pin.
        add_python_path: Append conda's site-packages directory to sys.path.
        force: Re-install even when an rdkit package directory already exists.

    Raises:
        requests.HTTPError: The installer download returned an error status.
        subprocess.CalledProcessError: The installer or ``conda install``
            exited with a non-zero status.
    """

    # site-packages of the conda tree for the running interpreter's major.minor
    python_path = os.path.join(
        conda_path,
        "lib",
        "python{0}.{1}".format(*sys.version_info),
        "site-packages",
    )

    if add_python_path and python_path not in sys.path:
        logger.info("add %s to PYTHONPATH", python_path)
        sys.path.append(python_path)

    if os.path.isdir(os.path.join(python_path, "rdkit")):
        logger.info("rdkit is already installed")
        if not force:
            return

        logger.info("force re-install")

    url = url_base + file_name
    python_version = "{0}.{1}.{2}".format(*sys.version_info)

    logger.info("python version: %s", python_version)

    # Start from a clean target: drop a previous install (or a stray file).
    if os.path.isdir(conda_path):
        logger.warning("remove current miniconda")
        shutil.rmtree(conda_path)
    elif os.path.isfile(conda_path):
        logger.warning("remove %s", conda_path)
        os.remove(conda_path)

    logger.info('fetching installer from %s', url)
    # The context manager releases the streamed connection even when the
    # status check or the file write fails.
    with requests.get(url, stream=True) as res:
        res.raise_for_status()
        with open(file_name, 'wb') as f:
            for chunk in res.iter_content(chunk_size):
                f.write(chunk)
    logger.info('done')

    logger.info('installing miniconda to %s', conda_path)
    # -b: batch (non-interactive) mode, -p: install prefix
    subprocess.check_call(["bash", file_name, "-b", "-p", conda_path])
    logger.info('done')

    logger.info("installing rdkit")
    rdkit_spec = "rdkit" if rdkit_version is None else "rdkit=={}".format(rdkit_version)
    subprocess.check_call([
        os.path.join(conda_path, "bin", "conda"),
        "install",
        "--yes",
        "-c", "rdkit",
        # pin conda's python to the exact version of the running interpreter
        "python=={}".format(python_version),
        rdkit_spec])
    logger.info("done")

    # Importing here verifies that the package is reachable via sys.path.
    import rdkit
    logger.info("rdkit-%s installation finished!", rdkit.__version__)


# Run the installer only when this code is executed as the main module
# (it is skipped when the code is imported from elsewhere).
if __name__ == "__main__":
    install()

Import the important libraries

In [None]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

from rdkit import Chem
from rdkit.Chem import Draw
from rdkit.Chem.Draw import IPythonConsole

from fastai import *
from fastai.text import *

import numpy as np
import threading

## Data
Mount Google Drive in Google Colab to access the files stored on the drive

In [None]:
# Colab-only step: mount Google Drive under /content/gdrive so the data and
# result paths used below resolve.
from google.colab import drive
drive.mount('/content/gdrive')

Load the training and validation data

In [None]:
# Read the training and validation splits from the mounted drive.
# NOTE(review): `pd` is not imported explicitly in the visible cells; presumably
# it is provided by the fastai star imports — confirm.
train = pd.read_csv('/content/gdrive/My Drive/data/ChemBL-LM_train.csv')
valid = pd.read_csv('/content/gdrive/My Drive/data/ChemBL-LM_val.csv')
# Last expression of the cell: show the (rows, columns) shape of both frames
train.shape, valid.shape

In [None]:
# Create a path to save the results
result_path = Path('/content/gdrive/My Drive/results')
name = 'pre-trained'  # also used as the prefix of the saved databunch / weights / vocab
path = result_path/name
# parents=True also creates the `results` directory if it does not exist yet
path.mkdir(exist_ok=True, parents=True)

# Sub-directory that later receives the saved vocabulary file
mdl_path = path/'models'
mdl_path.mkdir(exist_ok=True)

## SMILES augmentation

In [None]:
def randomize_smiles(smiles):
    """Return a randomized, non-canonical SMILES string for `smiles`.

    The parsed molecule's atoms are renumbered with a random permutation drawn
    from NumPy's global random state, and the renumbered molecule is written
    back with `canonical=False`, `isomericSmiles=True`, `kekuleSmiles=False`.

    NOTE(review): the result of `Chem.MolFromSmiles` is used without a check;
    presumably every input is a parseable SMILES string — confirm.
    """
    mol = Chem.MolFromSmiles(smiles)
    # Start from the identity ordering and shuffle it in place.
    atom_order = list(range(mol.GetNumAtoms()))
    np.random.shuffle(atom_order)
    renumbered = Chem.RenumberAtoms(mol, atom_order)
    return Chem.MolToSmiles(
        renumbered,
        canonical=False,
        isomericSmiles=True,
        kekuleSmiles=False,
    )

def smiles_augmentation(df, N_rounds):
    """Augment a SMILES table with randomized SMILES strings.

    For every row of `df`, `N_rounds` randomized SMILES are generated with
    `randomize_smiles`. The new rows are appended to the original ones, the
    combined table is shuffled, and rows with a duplicated `SMILES` value are
    dropped (the first occurrence in shuffled order is kept).

    Args:
        df: DataFrame with a `SMILES` column. If it also has a `canonical`
            column, augmented rows get the value 'no' there. Any other column
            is left as NaN for the augmented rows.
        N_rounds: Number of randomized SMILES generated per input row.

    Returns:
        A new DataFrame; `df` itself is not modified.
    """
    # Read the column once instead of materialising a row Series per lookup.
    source_smiles = df['SMILES'].tolist()

    # Only the columns that are actually filled go into the augmented frame;
    # seeding one list per column of `df` made frame construction fail with
    # unequal lengths whenever `df` had extra columns.
    augmented = {
        'SMILES': [randomize_smiles(smi)
                   for smi in source_smiles
                   for _ in range(N_rounds)],
    }
    if 'canonical' in df.columns:
        augmented['canonical'] = ['no'] * len(augmented['SMILES'])
    df_aug = pd.DataFrame(augmented)

    # merge with the original rows
    merged = pd.concat([df, df_aug], sort=False).reset_index(drop=True)
    # shuffle the data
    merged = merged.reindex(np.random.permutation(merged.index))
    return merged.drop_duplicates('SMILES')

Randomized SMILES are used for data augmentation: each SMILES string is augmented with four randomized variants, and duplicate SMILES strings are removed afterwards.

In [None]:
%%time
# Generate four randomized SMILES per molecule for both splits.
# NOTE(review): no NumPy seed is set in the visible cells, so the augmented
# sets presumably differ between runs — confirm whether that is intended.
train_aug = smiles_augmentation(train, 4)
valid_aug = smiles_augmentation(valid, 4)

## Data pre-processing

Define a custom tokenizer 

In [None]:
# Don't include the default special tokens of fastai, only keep the padding token
BOS,EOS,FLD,UNK,PAD = 'xxbos','xxeos','xxfld','xxunk','xxpad'
TK_MAJ,TK_UP,TK_REP,TK_WREP = 'xxmaj','xxup','xxrep','xxwrep'
# Overwrite the library-wide special-token list with the padding token alone
defaults.text_spec_tok = [PAD]

# Bracket expressions that are kept as their own vocabulary entries; the
# tokenizer maps every other bracket expression to '[UNK]'.
# A comma was missing between '[n-]' and '[NH+]', which silently concatenated
# them into the single bogus entry '[n-][NH+]'.
special_tokens = ['[BOS]', '[C@H]', '[C@@H]','[C@]', '[C@@]','[C-]','[C+]', '[c-]', '[c+]','[cH-]',
                   '[nH]', '[N+]', '[N-]', '[n+]', '[n-]', '[NH+]', '[NH2+]',
                   '[O-]', '[S+]', '[s+]', '[S-]', '[O+]', '[SH]', '[B-]','[BH2-]', '[BH3-]','[b-]',
                   '[PH]','[P+]', '[I+]', 
                  '[Si]','[SiH2]', '[Se]','[SeH]', '[se]', '[Se+]', '[se+]','[te]','[te+]', '[Te]']

class MolTokenizer(BaseTokenizer):
    """Character-level SMILES tokenizer.

    Bracket expressions such as '[C@@H]' are kept as single tokens, 'Br' and
    'Cl' are kept as two-letter tokens, and every other character becomes its
    own token. When a non-empty `special_tokens` list is given, bracket
    expressions that are not in that list are replaced by '[UNK]'.
    """
    def __init__(self, lang = 'en', special_tokens = special_tokens):
        self.lang = lang
        self.special_tokens = special_tokens

    def tokenizer(self, smiles):
        """Split one SMILES string into a list of tokens.

        '[BOS]' is prepended to mark the start of the string. With a non-empty
        `self.special_tokens`, '[BOS]' itself is subject to the same lookup as
        any other bracket expression.
        """
        # add the specific token '[BOS]' to represent the start of the SMILES
        smiles = '[BOS]' + smiles
        # The capturing group makes re.split keep the bracket expressions
        # (1 to 10 characters between '[' and ']') as separate pieces.
        pieces = re.split(r'(\[[^\[\]]{1,10}\])', smiles)
        tokens = []

        for piece in pieces:
            if piece.startswith('['):
                # Look the expression up in this instance's list (not in the
                # module-level one) so a custom list passed to the constructor
                # is honoured.
                if self.special_tokens and piece not in self.special_tokens:
                    tokens.append('[UNK]')
                else:
                    tokens.append(piece)
            else:
                # Keep 'Br' and 'Cl' together; split everything else into
                # single characters (DOTALL so that newlines are kept too).
                tokens.extend(re.findall(r'Br|Cl|.', piece, flags=re.DOTALL))
        return tokens

    def add_special_cases(self, toks):
        """No-op: this tokenizer has no special cases to register."""
        pass

In [None]:
# Tokenizer
# `partial` pre-binds the special-token list to MolTokenizer; empty pre_rules /
# post_rules lists are passed explicitly.
# NOTE(review): n_cpus=6 is hard-coded — confirm it suits the runtime in use.
tok = Tokenizer(partial(MolTokenizer, special_tokens = special_tokens), n_cpus=6, pre_rules=[], post_rules=[])

Create a text databunch for language modeling:
- It takes as input the train and validation data
- Pass the custom tokenizer defined in the previous step
- Specify the column containing text data
- Define the batch size according to the GPU memory available

In [None]:
%%time
bs = 128  #batch size

# Build the language-model databunch from the augmented frames with the custom tokenizer.
# text_cols=0: presumably the first column of the frames holds the SMILES text — confirm.
data = TextLMDataBunch.from_df(path, train_aug, valid_aug, bs=bs, tokenizer=tok, chunksize=50000, text_cols=0, max_vocab=60000, include_bos=False)

In [None]:
# Save the databunch as '<name>_databunch'; the training section reloads it
# under the same name with load_data
data.save(f'{name}_databunch')

## Training the model

Load the databunch

In [None]:
bs = 128 # batch size
# Reload the databunch saved under `path` in the pre-processing section
data_lm = load_data(path, f'{name}_databunch', bs=bs)

Create a learner for language modeling:
- As the model is trained from scratch, use pretrained=False
- Pass the text databunch loaded in the previous step
- Drop_mult is a hyperparameter that can be tuned

In [None]:
# AWD_LSTM language model on the loaded databunch; pretrained=False because
# the model is trained from scratch
learner = language_model_learner(data_lm, AWD_LSTM, drop_mult=1, pretrained=False)

In [None]:
# Model architecture (as the cell's last expression, the model repr is shown as the output)
learner.model

SequentialRNN(
  (0): AWD_LSTM(
    (encoder): Embedding(80, 400, padding_idx=1)
    (encoder_dp): EmbeddingDropout(
      (emb): Embedding(80, 400, padding_idx=1)
    )
    (rnns): ModuleList(
      (0): WeightDropout(
        (module): LSTM(400, 1152, batch_first=True)
      )
      (1): WeightDropout(
        (module): LSTM(1152, 1152, batch_first=True)
      )
      (2): WeightDropout(
        (module): LSTM(1152, 400, batch_first=True)
      )
    )
    (input_dp): RNNDropout()
    (hidden_dps): ModuleList(
      (0): RNNDropout()
      (1): RNNDropout()
      (2): RNNDropout()
    )
  )
  (1): LinearDecoder(
    (decoder): Linear(in_features=400, out_features=80, bias=True)
    (output_dp): RNNDropout()
  )
)

Train the model using `fit_one_cycle`:
- the first argument is the number of epochs
- the second argument is the learning rate

In [None]:
# Base learning rate; 48 is presumably the reference batch size it was tuned for — confirm
lr = 3e-3
lr *= bs/48  # Scale learning rate by batch size

# Unfreeze the learner, then train for 10 epochs with the scaled learning rate
learner.unfreeze()
learner.fit_one_cycle(10, lr, moms=(0.8,0.7))

Save the weights and vocabulary of the trained model

In [None]:
# File stems for the saved weights ([0]) and the vocabulary ([1])
lm_fns = [f'{name}_wt', f'{name}_vocab']

# with_opt=False is passed so that the optimizer state is presumably left out of the file — confirm
learner.save(lm_fns[0], with_opt=False)
# The vocabulary is written as a .pkl file into the `models` directory (mdl_path)
learner.data.vocab.save(mdl_path/(lm_fns[1] + '.pkl'))