# Transfer Learning

Download the vocabulary and the encoder of the pre-trained model into a folder named "models".

In [8]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

import pickle

from rdkit import Chem
from rdkit.Chem import Draw
from rdkit.Chem.Draw import IPythonConsole
from rdkit import RDLogger 
RDLogger.DisableLog('rdApp.*') # switch off RDKit warning messages

from fastai import *
from fastai.text import *
from utils import *
from sklearn.model_selection import train_test_split

import sys

## Steps

1. load the vocabulary
2. initialize the tokenizer
3. load the data and data augmentation
4. Build the fastai databunch
5. create the classification/regression learner
6. training (fit_one_cycle)

1. load the vocabulary

In [22]:
# Parent folder of the "models" folder.
model_path = Path('../results/MSPM/')

# Location of the pickled vocabulary (a token list) inside the "models" folder.
vocab_file = f'{model_path}/models/ChemBL_atom_vocab.pkl'

# NOTE(review): pickle.load can execute arbitrary code; only load a vocabulary
# file that comes from a trusted source.
with open(vocab_file, 'rb') as fh:
    orig_itos = pickle.load(fh)

# Wrap the loaded token list in a fastai Vocab and report how many tokens it has.
vocab = Vocab(orig_itos)
print(f'Vocab Size: {len(vocab.itos)}')

Vocab Size: 80


2. initialize the tokenizer

In [10]:
# Tokenizer function: MolTokenizer with the special tokens bound in advance
# (both names are expected to come from the star imports above).
mol_tok_func = partial(MolTokenizer, special_tokens=special_tokens)

# No pre- or post-processing rules are applied (both lists are empty);
# tokenization is spread over 6 CPUs.
tok = Tokenizer(
    mol_tok_func,
    n_cpus=6,
    pre_rules=[],
    post_rules=[],
)

3. load the data

In [11]:
# Read the BBBP csv, print its (rows, columns) shape and display the first rows.
bbbp = pd.read_csv('../data/QSAR/bbbp.csv')
n_rows, n_cols = bbbp.shape
print((n_rows, n_cols))
bbbp.head()

(2039, 2)


Unnamed: 0,smiles,p_np
0,[Cl].CC(C)NCC(O)COc1cccc2ccccc12,1
1,C(=O)(OC(C)(C)C)CCCc1ccc(cc1)N(CCCl)CCCl,1
2,c12c3c(N4CCN(C)CC4)c(F)cc1c(c(C(O)=O)cn2C(C)CO...,1
3,C1CCN(CC1)Cc1cccc(c1)OCCCNC(=O)C,1
4,Cc1onc(c2ccccc2Cl)c1C(=O)N[C@H]3[C@H]4SC(C)(C)...,1


In [12]:
# First hold out 10% of all rows as the test set (fixed seed for reproducibility).
train_val, test = train_test_split(
    bbbp, test_size=0.1, shuffle=True, random_state=8
)

# Then hold out 10% of the remaining rows as the validation set.
train, val = train_test_split(
    train_val, test_size=0.1, shuffle=True, random_state=42
)

# Row counts of the three subsets, in the order train / test / val.
print(len(train), len(test), len(val))

1651 204 184


(optional) SMILES augmentation

In [13]:
def bbbp_smiles_augmentation(df, N_rounds):
    """Augment a BBBP dataframe with randomized SMILES, with a per-class count.

    For every row whose ``p_np`` label equals 1, ``N_rounds[0]`` randomized
    SMILES are generated with ``randomize_smiles``; for every row whose label
    equals 0, ``N_rounds[1]`` are generated. Rows with any other label are not
    augmented. The generated rows are placed in front of the original rows,
    the index is reset, and rows whose ``smiles`` string repeats an earlier
    one are dropped.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``smiles`` column and a ``p_np`` column. It is not
        modified. Any additional columns are kept for the original rows and
        are left as NaN for the generated rows.
    N_rounds : sequence of int
        ``N_rounds[0]`` is the number of randomized SMILES per positive
        (``p_np == 1``) molecule, ``N_rounds[1]`` the number per negative
        (``p_np == 0``) molecule.

    Returns
    -------
    pandas.DataFrame
        Generated plus original rows, de-duplicated on ``smiles``, with the
        same column order as ``df``.
    """
    aug_smiles, aug_labels = [], []

    # Iterate over the two columns directly instead of materialising a row
    # Series with df.iloc[i] several times per row.
    for smiles, label in zip(df['smiles'], df['p_np']):
        if label == 1:
            n_copies = N_rounds[0]
        elif label == 0:
            n_copies = N_rounds[1]
        else:
            continue
        for _ in range(n_copies):
            aug_smiles.append(randomize_smiles(smiles))
            aug_labels.append(label)

    # Generated rows go first so that, on a duplicate SMILES, the generated
    # row is the one kept (same precedence as the original implementation).
    frames = [df]
    if aug_smiles:
        frames.insert(0, pd.DataFrame({'smiles': aug_smiles, 'p_np': aug_labels}))

    # DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
    combined = pd.concat(frames, ignore_index=True)[list(df.columns)]
    return combined.drop_duplicates('smiles')

As shown above, the dataset is not balanced. For the training data, we generate 10 and 30 randomized SMILES for molecules belonging to the positive and negative classes, respectively. These numbers can be adjusted for other datasets.

In [15]:
# Training set: 10 randomized SMILES per positive molecule, 30 per negative one.
train_aug = bbbp_smiles_augmentation(train, N_rounds=[10, 30])

# Validation set: 5 randomized SMILES per molecule for both classes.
valid_aug = bbbp_smiles_augmentation(val, N_rounds=[5, 5])

4. Build the fastai databunch

In [24]:
# Batch size used for both the training and the validation loader.
bs = 128

# Build the classification databunch from the augmented frames, reusing the
# vocabulary loaded earlier and the tokenizer defined above.
qsar_db = TextClasDataBunch.from_df(
    model_path,
    train_aug,
    valid_aug,
    bs=bs,
    tokenizer=tok,
    chunksize=50000,
    text_cols='smiles',
    label_cols='p_np',
    vocab=vocab,
    max_vocab=60000,
    include_bos=False,
)

  return np.array(a, dtype=dtype, **kwargs)


  return array(a, dtype, copy=False, order=order)


5. training

In [25]:
# Classifier on top of an AWD_LSTM; pretrained=False because the encoder
# weights are loaded explicitly on the next line.
cls_learner = text_classifier_learner(
    qsar_db,
    AWD_LSTM,
    pretrained=False,
    drop_mult=0.1,
    callback_fns=AUROC,
)

# NOTE(review): the encoder name is spelled 'ChemBl' while the vocabulary file
# is spelled 'ChemBL' -- confirm the file name on case-sensitive file systems.
cls_learner.load_encoder('ChemBl_atom_encoder')

# Freeze the learner before the first training stage.
cls_learner.freeze()

In [None]:
# Momentum range shared by every stage, and the divisor that sets the lower
# bound of each learning-rate slice.
MOMS = (0.8, 0.7)
LR_DIVISOR = 2.6 ** 4

# Stage 1: train with the learner frozen as set up in the previous cell.
cls_learner.fit_one_cycle(4, 3e-2, moms=MOMS)

# Stage 2: freeze_to(-2), then train with a lower learning-rate slice.
cls_learner.freeze_to(-2)
cls_learner.fit_one_cycle(4, slice(5e-3 / LR_DIVISOR, 5e-3), moms=MOMS)

# Stage 3: freeze_to(-3), learning rates reduced by another factor of ten.
cls_learner.freeze_to(-3)
cls_learner.fit_one_cycle(4, slice(5e-4 / LR_DIVISOR, 5e-4), moms=MOMS)

# Stage 4: unfreeze everything and fine-tune for six epochs.
cls_learner.unfreeze()
cls_learner.fit_one_cycle(6, slice(5e-5 / LR_DIVISOR, 5e-5), moms=MOMS)

In [None]:
# Persist the fine-tuned classifier under the name 'bbbp_model'.
cls_learner.save('bbbp_model')

6. make predictions on the test set

see `03_QSAR_Classification.ipynb`