# Transfer Learning

In [18]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

import pickle

from rdkit import Chem
from rdkit.Chem import Draw
from rdkit.Chem.Draw import IPythonConsole

from fastai import *
from fastai.text import *
from utils import *

import sys

The downloaded folder (named "models") contains:
1. vocabulary: ChEMBL_LM_SPE_vocab.pkl
2. language model encoder (pretrained model weights): ChEMBL_spe_encoder.pth
3. SPE tokens: SPE_ChEMBL.txt (generated from ChEMBL25, see more [here](https://github.com/XinhaoLi74/SmilesPE/blob/master/Examples/train_SPE.ipynb))

**Note**: The `TextClasDataBunch` function needs a `path` as input (see more [here](https://fastai1.fast.ai/text.data.html#TextDataBunch.from_df)). The path should be the parent folder of the downloaded "models" folder, because `learner.load_encoder()` loads the model from `path/models/`.

1. load the vocabulary
2. initialize the tokenizer

In [2]:
# Root directory handed to fastai. It must be the PARENT of the downloaded
# "models" folder, because the files below are read from `{path}/models/`.
path = Path('../results/SPE_Pretrained/') # parent folder of "models"

In [4]:
# Read the pickled index-to-string token list shipped with the pretrained
# model and wrap it in a fastai Vocab.
# NOTE: pickle.load can execute arbitrary code -- only load files you trust.
vocab_file = f'{path}/models/ChEMBL_LM_SPE_vocab.pkl'
with open(vocab_file, 'rb') as fh:
    orig_itos = pickle.load(fh)

vocab = Vocab(orig_itos)

The [SmilesPE](https://github.com/XinhaoLi74/SmilesPE) package needs to be installed.

In [6]:
import codecs
from SmilesPE.tokenizer import *

# Open the SPE token file in a context manager so the handle is closed once
# the tokenizer has been built (the original left the file open for the life
# of the kernel).
# NOTE(review): this relies on SPE_Tokenizer reading the whole file in its
# constructor rather than lazily -- confirm against the installed SmilesPE.
with codecs.open(f'{path}/models/SPE_ChEMBL.txt') as spe_vob:
    spe = SPE_Tokenizer(spe_vob, exclusive_tokens=special_tokens)

# fastai Tokenizer that delegates to the SPE-based molecule tokenizer; the
# default pre/post text rules are disabled by passing empty lists.
tok = Tokenizer(partial(MolTokenizer_SPE, spe), n_cpus=6, pre_rules=[], post_rules=[])

The following steps should be the same as described in `05_Pretrained_Models.ipynb`. The encoder is named `ChEMBL_spe_encoder`.

In [8]:
# Work on a 500-row random subsample of bbbp.csv.
# A fixed random_state makes the subsample -- and therefore every downstream
# split and metric -- reproducible under Restart Kernel -> Run All.
bbbp = pd.read_csv('../data/QSAR/bbbp.csv').sample(n=500, random_state=42)
print(bbbp.shape)
bbbp.head()

(500, 2)


Unnamed: 0,smiles,p_np
1386,[C@]14([C@H]([C@H]3[C@](F)([C@@H](O)C1)[C@@]2(...,1
1049,C1=NC3=C([N]1CCNC(C(C2=CC=CC=C2)O)C)C(N(C)C(N3...,1
1101,C1=C(C(=CC(=C1)Cl)Cl)OCCCN(CC#C)C,1
1250,C1=C(C(OCC)=O)[N](C=N1)C(C)C2=CC=C(C=C2)F,1
645,CN(C1CCCC[C@H]1N2CCCC2)C(=O)Cc3ccc(Cl)c(Cl)c3,1


In [10]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the sampled rows as the test set ...
train, test = train_test_split(bbbp, test_size=0.1, shuffle=True, random_state=8)
# ... then hold out 10% of what remains as the validation set.
train, val = train_test_split(train, test_size=0.1, shuffle=True, random_state=42)

# Row counts, printed in the order: train, test, val.
print(*(len(split) for split in (train, test, val)))

405 50 45


In [12]:
bs = 128  # batch size

# Options for building the classification DataBunch from the train/val frames:
# the SMILES strings are the text column, `p_np` is the label column, and the
# tokenizer and vocabulary built earlier in this notebook are passed in.
databunch_kwargs = dict(
    bs=bs,
    tokenizer=tok,
    chunksize=50000,
    text_cols='smiles',
    label_cols='p_np',
    vocab=vocab,
    max_vocab=60000,
    include_bos=False,
)
qsar_db = TextClasDataBunch.from_df(path, train, val, **databunch_kwargs)

  return np.array(a, dtype=dtype, **kwargs)


  return array(a, dtype, copy=False, order=order)


In [16]:
# Build an AWD_LSTM text classifier with `pretrained=False`; the encoder
# weights are loaded explicitly from 'ChEMBL_spe_encoder' on the next line.
learner_kwargs = dict(pretrained=False, drop_mult=0.1, callback_fns=AUROC)
cls_learner = text_classifier_learner(qsar_db, AWD_LSTM, **learner_kwargs)
cls_learner.load_encoder('ChEMBL_spe_encoder')
# Start fully frozen; the training cell below unfreezes step by step.
cls_learner.freeze()

In [None]:
# Gradual-unfreezing schedule: one cycle with the learner as currently frozen,
# then three further stages, each changing what is frozen before training
# with a lower maximum learning rate.
moms = (0.8, 0.7)
lr_div = 2.6 ** 4  # divisor giving the lower bound of each learning-rate slice

cls_learner.fit_one_cycle(4, 3e-2, moms=moms)

# (unfreeze action, number of epochs, maximum learning rate)
stages = [
    (lambda: cls_learner.freeze_to(-2), 4, 5e-3),
    (lambda: cls_learner.freeze_to(-3), 4, 5e-4),
    (cls_learner.unfreeze, 6, 5e-5),
]
for unfreeze_step, n_epochs, max_lr in stages:
    unfreeze_step()
    cls_learner.fit_one_cycle(n_epochs, slice(max_lr / lr_div, max_lr), moms=moms)