In [6]:
# Import necessary libraries
import os
import sys
import torch
import random
import threading
import numpy as np
import pandas as pd
from pathlib import Path

# Set environment variables for GPU and device configuration
# os.environ['CUDA_VISIBLE_DEVICES'] = '5'  # Uncomment to set a different GPU
%env CUDA_VISIBLE_DEVICES=4
device = torch.device('cuda:4' if torch.cuda.is_available() else "cpu")

# Add custom paths to the Python environment
sys.path.append('./fastai1/')

# Disable RDKit warning messages
from rdkit import Chem
from rdkit.Chem import Draw
from rdkit.Chem.Draw import IPythonConsole
from rdkit import RDLogger
RDLogger.DisableLog('rdApp.*')

# Import FastAI modules
from fastai import *
from fastai.text import *
from fastai.vision import *
from fastai.imports import *

# Import PyTorch and related libraries
import torch.nn.functional as F
import torchvision

# Import Scikit-learn utilities
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split

# Display the current working directory
current_path = os.getcwd()
print(f"Current working directory: {current_path}")

# Enable interactive features for Jupyter Notebook
%reload_ext autoreload
%autoreload 2
%matplotlib inline


In [17]:
def random_seed(seed_value, use_cuda):
    np.random.seed(seed_value) # cpu vars
    torch.manual_seed(seed_value) # cpu  vars
    random.seed(seed_value) # Python
    if use_cuda:
        torch.cuda.manual_seed(seed_value)
        torch.cuda.manual_seed_all(seed_value) # gpu vars
        torch.backends.cudnn.deterministic = True  #needed
        torch.backends.cudnn.benchmark = False

In [18]:
def randomize_smiles(smiles):
    m = Chem.MolFromSmiles(smiles)
    ans = list(range(m.GetNumAtoms()))
    np.random.shuffle(ans)
    nm = Chem.RenumberAtoms(m,ans)
    return Chem.MolToSmiles(nm, canonical=False, isomericSmiles=True, kekuleSmiles=False)

In [19]:
def ee_smiles_augmentation(df, N_rounds, noise):
    '''
    noise: add gaussion noise to the label
    '''
    dist_aug = {col_name: [] for col_name in df}

    for i in range(df.shape[0]):
        for j in range(N_rounds):
            dist_aug['smiles'].append(randomize_smiles(df.iloc[i].smiles))
            dist_aug['Yield'].append(df.iloc[i]['Yield'] + np.random.normal(0,noise))
    print(len(dist_aug['smiles']))
    print(len(dist_aug['Yield']))
    #print(len(smiles))
    df_aug = pd.DataFrame.from_dict(dist_aug)
    df_aug = df_aug.append(df, ignore_index=True)
    return df_aug.drop_duplicates('smiles')

In [20]:
def test_smiles_augmentation(df, N_rounds):
    dist_aug = {col_name: [] for col_name in df}

    for i in range(df.shape[0]):
        for j in range(N_rounds):
            dist_aug['smiles'].append(randomize_smiles(df.iloc[i].smiles))
            dist_aug['Yield'].append(df.iloc[i]['Yield'])
    df_aug = pd.DataFrame.from_dict(dist_aug)

    return pd.DataFrame.from_dict(dist_aug)

In [21]:
random_seed(1234, True)

# Create a path to save the results
data_path = Path('./ulmfit/results')
name = 'regressor'
path = data_path / name
path.mkdir(exist_ok=True, parents=True)

In [1]:
import pandas as pd
df=pd.read_excel("./Data/Fine-tuning/866-reaction-yield-CV_1-20.xlsx)
df.shape


Unnamed: 0,Sl. No.,smiles,Yield,product
0,Yu_9_22,O=S(C1=CC(C(CC(C)C)(C#N)CC(C)C)=C(OC)C=C1)(N2[...,81,O=S(C1=CC(C(CC(C)C)(C#N)CC(C)C)=C(OC)C=C1)(N2[...
1,Li_3_28,O=C(N(C1=CC=CC=C1C(O)=O)C)CCC2=C(C)C(C)=CC=C2....,67,O=C(N(C1=CC=CC=C1C(O)=O)C)CCC2=C(C)C(C)=CC(/C=...
2,Yu_2_5,O=C(N(C1=CC=CC=C1F)C2=C(C#N)C=CC=C2)CCC3=C(F)C...,68,O=C(N(C1=CC=CC=C1F)C2=C(C#N)C=CC=C2)CCC3=C(F)C...
3,D_5_3,O=S(CCC1=CC=CC=C1)(OC2=C(C#N)C=CC=C2)=O.C=CP(O...,55,O=S(CCC1=CC=CC=C1)(OC2=C(C#N)C=C(/C=C/P(OCC)(O...
4,Yu_3_14,O=C(N(C1=C(C#N)C=CC=C1)C2=C(C#N)C=CC=C2)C(C)(C...,64,O=C(N(C1=C(C#N)C=CC=C1)C2=C(C#N)C=CC=C2)C(C)(C...
...,...,...,...,...
861,D_5_29,O=C(OC1=C(C#N)C=CC=C1)CCC2=CC=CC(/C(C)=C/C(OC)...,55,O=C(OC1=C(C#N)C=CC=C1)CCC2=CC(/C(C)=C/C(OC)=O)...
862,D_8_13,O=C(CC1=C(C)C=CC=C1)OC2=C(C#N)C=CC(OC)=C2.BrC#...,78,O=C(CC1=C(C)C=CC(C#C[Si](C(C)C)(C(C)C)C(C)C)=C...
863,Li_3_46,O=C(N(C1=CC=CC(F)=C1C(O)=O)C)CCC2=CC(F)=CC=C2....,45,O=C(N(C1=CC=CC(F)=C1C(O)=O)C)CCC2=CC(F)=CC(C3=...
864,Li_1_12,O=C(C1=C(C#N)C=CC=C1)N(C)CCC2=CC=C(Cl)C=C2.C=C...,59,O=C(C1=C(C#N)C=CC=C1)N(C)CCC2=CC=C(Cl)C(/C=C/C...


In [547]:
df=df[['smiles','Yield']]

In [548]:
# train=df.iloc[:605,:]
# train
random_seed(1234, True)

train_ , test = train_test_split(df, test_size=0.20, random_state=100)
train, valid = train_test_split(train_, test_size=0.125, random_state=0)
print(train.shape)
print(test.shape)
print(valid.shape)

(257, 2)
(74, 2)
(37, 2)


In [549]:
# Don't include the defalut specific token of fastai, only keep the padding token
BOS,EOS,FLD,UNK,PAD = 'xxbos','xxeos','xxfld','xxunk','xxpad'
TK_MAJ,TK_UP,TK_REP,TK_WREP = 'xxmaj','xxup','xxrep','xxwrep'
defaults.text_spec_tok = [PAD]

special_tokens = ['[BOS]', '[C@H]', '[C@@H]','[C@]', '[C@@]','[C-]','[C+]', '[c-]', '[c+]','[cH-]',
                   '[nH]', '[N+]', '[N-]', '[n+]', '[n-]' '[NH+]', '[NH2+]',
                   '[O-]', '[S+]', '[s+]', '[S-]', '[O+]', '[SH]', '[B-]','[BH2-]', '[BH3-]','[b-]',
                   '[PH]','[P+]', '[I+]',
                  '[Si]','[SiH2]', '[Se]','[SeH]', '[se]', '[Se+]', '[se+]','[te]','[te+]', '[Te]',
                  '[Pd+2]', '[Cs+]','[N@@+]','[Na+]','[OH-]','[N@]','[K+]','[F-]','[Rh]','[Ag+]','[Si]',
                  '[Pd]', '[Cs2+]', '[Cu]', '[Cu+2]', '[Ge]', '[Sb-]', '[Cl+]', '[Cl-]', '[Br-]', '[NH4+]',
                  '[P-]',
                  ]

class MolTokenizer(BaseTokenizer):
    def __init__(self, lang = 'en', special_tokens = special_tokens):
        self.lang = lang
        self.special_tokens = special_tokens

    def tokenizer(self, smiles):
        # add specific token '[BOS]' to represetences the start of SMILES
        smiles = '[BOS]' + smiles
        regex = '(\[[^\[\]]{1,10}\])'
        char_list = re.split(regex, smiles)
        tokens = []

        if self.special_tokens:
            for char in char_list:
                if char.startswith('['):
                    if char in special_tokens:
                        tokens.append(str(char))
                    else:
                        tokens.append('[UNK]')
                else:
                    chars = [unit for unit in char]
                    [tokens.append(i) for i in chars]

        if not self.special_tokens:
            for char in char_list:
                if char.startswith('['):
                    tokens.append(str(char))
                else:
                    chars = [unit for unit in char]
                    [tokens.append(i) for i in chars]

        #fix the 'Br' be splited into 'B' and 'r'
        if 'B' in tokens:
            for index, tok in enumerate(tokens):
                if tok == 'B':
                    if index < len(tokens)-1: # make sure 'B' is not the last character
                        if tokens[index+1] == 'r':
                            tokens[index: index+2] = [reduce(lambda i, j: i + j, tokens[index : index+2])]

        #fix the 'Cl' be splited into 'C' and 'l'
        if 'l' in tokens:
            for index, tok in enumerate(tokens):
                if tok == 'l':
                    if tokens[index-1] == 'C':
                            tokens[index-1: index+1] = [reduce(lambda i, j: i + j, tokens[index-1 : index+1])]
        return tokens

    def add_special_cases(self, toks):
        pass

In [550]:
bs = 128
tok = Tokenizer(partial(MolTokenizer, special_tokens = special_tokens), n_cpus=6, pre_rules=[], post_rules=[])

In [551]:
#np.warnings.filterwarnings('ignore', category=np.VisibleDeprecationWarning)
random_seed(1234, True)

In [552]:
lm_vocab = TextLMDataBunch.from_df(path, train, valid, bs=bs, tokenizer=tok,
                              chunksize=50000, text_cols=0,label_cols=1, max_vocab=60000, include_bos=False, min_freq=1, num_workers=0)
print(f'Vocab Size: {len(lm_vocab.vocab.itos)}')

  return np.array(a, dtype=dtype, **kwargs)


Vocab Size: 48


In [553]:
pretrained_model_path = Path('./Pretraining_weights_bias/SSP1/models/')
pretrained_fnames = ['C-H-activation_100_wt', 'C-H-activation_100_vocab']
fnames = [pretrained_model_path/f'{fn}.{ext}' for fn,ext in zip(pretrained_fnames, ['pth', 'pkl'])]

In [554]:

random_seed(1234, True)

data_clas = TextClasDataBunch.from_df(path, train, valid, bs=bs, tokenizer=tok,
                                          chunksize=50000, text_cols='smiles',label_cols='Yield',
                                          vocab=lm_vocab.vocab, max_vocab=60000, include_bos=False, min_freq=1, num_workers=0)

print(f'Vocab Size: {len(data_clas.vocab.itos)}')

Vocab Size: 48


Your valid set contained the following unknown labels, the corresponding items have been discarded.
45, 48, 90
  if getattr(ds, 'warn', False): warn(ds.warn)
  sort_idx = np.concatenate(np.random.permutation(ck_idx[1:])) if len(ck_idx) > 1 else np.array([],dtype=np.int)


In [555]:
random_seed(1234, True)

lm_learner = language_model_learner(lm_vocab, AWD_LSTM, drop_mult=0.0, wd=0.0, pretrained=False)
lm_learner = lm_learner.load_pretrained(*fnames)
lm_learner.freeze()
lm_learner.save_encoder(f'lm_encoder')

In [556]:
random_seed(1234, True)

reg_learner = text_classifier_learner(data_clas, AWD_LSTM, pretrained=False, drop_mult=0.0, wd=0.0, metrics = [r2_score, rmse])
reg_learner.load_encoder(f'lm_encoder')
reg_learner.freeze()

In [557]:
reg_learner.model

SequentialRNN(
  (0): MultiBatchEncoder(
    (module): AWD_LSTM(
      (encoder): Embedding(48, 400, padding_idx=1)
      (encoder_dp): EmbeddingDropout(
        (emb): Embedding(48, 400, padding_idx=1)
      )
      (rnns): ModuleList(
        (0): WeightDropout(
          (module): LSTM(400, 1152, batch_first=True)
        )
        (1): WeightDropout(
          (module): LSTM(1152, 1152, batch_first=True)
        )
        (2): WeightDropout(
          (module): LSTM(1152, 400, batch_first=True)
        )
      )
      (input_dp): RNNDropout()
      (hidden_dps): ModuleList(
        (0): RNNDropout()
        (1): RNNDropout()
        (2): RNNDropout()
      )
    )
  )
  (1): PoolingLinearClassifier(
    (layers): Sequential(
      (0): BatchNorm1d(1200, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (1): Linear(in_features=1200, out_features=50, bias=True)
      (2): ReLU(inplace=True)
      (3): BatchNorm1d(50, eps=1e-05, momentum=0.1, affine=True, track_run

In [558]:
reg_learner_trained=reg_learner
data = df

In [559]:
from numpy.ma.core import shape
def process_doc(learn, doc):
    xb, yb = learn.data.one_item(doc)
    #print(xb)
    return xb
def encode_doc(learn, doc):
    xb = process_doc(learn, doc)
    #print(xb)
    #print(xb.shape)
# Reset initializes the hidden state
    awd_lstm = learn.model[0]
    awd_lstm.reset()
    with torch.no_grad():
        out = awd_lstm.eval()(xb)
        #print(out[0][2][0])

    # Return raw output, for last RNN, on last token in sequence
    return out[0][2][0].cpu().max(0).values.detach().numpy()

In [560]:
decoder_output_app = []
train_sample_app = []
for i in range(0, len(data)):
    #print(i)
    arr1 = encode_doc(reg_learner_trained, data.iloc[i][0])
    arr1 = arr1.tolist()
    decoder_output_app.append(arr1)
    train_sample = data.iloc[i][0]
    train_sample_app.append(train_sample)

df_decoder_output = pd.DataFrame(decoder_output_app)
df_train_sample  = pd.DataFrame(train_sample_app, columns= ['smiles_label'])
df_decoder_output_train = pd.concat([df_train_sample, df_decoder_output], axis =1)
df_decoder_output_train

Unnamed: 0,smiles_label,0,1,2,3,4,5,6,7,8,...,390,391,392,393,394,395,396,397,398,399
0,N#CC(C=C(OC)C=C1)=C1OCCCC2=CC(C(F)(F)F)=CC=C2....,0.520122,0.109922,0.149098,0.053835,0.185784,0.102807,0.152670,0.126783,0.142995,...,0.122031,0.055867,0.016855,0.072544,0.085530,0.035136,0.097785,0.022894,0.018409,0.325049
1,O=C(C1=C(C#N)C=CC=C1)N(C)CCC2=CC(Cl)=CC=C2.C=C...,0.505961,0.110397,0.199684,0.016118,0.217537,0.038914,0.194987,0.197607,0.136213,...,0.109315,0.056969,0.020730,0.211507,0.033881,0.041205,0.110850,0.015501,0.039126,0.258439
2,O=C(N(C1=CC(OC)=CC=C1C#N)C2=C(C#N)C=CC(OC)=C2)...,0.531706,0.067528,0.310283,0.053984,0.274924,0.130943,0.224237,0.209065,0.153753,...,0.293596,0.117360,0.146163,0.281790,0.035990,0.090530,0.186175,0.122245,0.028403,0.228475
3,O=S(CC1=CC(F)=CC=C1)(OC2=C(C#N)C=CC=C2)=O.C=CC...,0.468523,0.134949,0.140643,0.120589,0.206129,0.023158,0.119368,0.197607,0.122283,...,0.081214,0.103506,0.098014,0.339903,0.018327,0.054632,0.042415,0.018676,0.030374,0.278195
4,O=C(C(F)OC1=C(C#N)C=CC=C1)N(C)C2=C(C)C=CC=C2.C...,0.489745,0.065481,0.222345,0.086288,0.227518,0.036697,0.184868,0.197607,0.145667,...,0.064315,0.056969,0.072992,0.340624,0.046433,0.056968,0.240506,0.253225,0.028981,0.174144
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
363,O=C(N(S(C1=CC=C([N+]([O-])=O)C=C1)(=O)=O)CCC2=...,0.483660,0.054629,0.192002,0.046049,0.200293,0.076255,0.156875,0.197607,0.170324,...,0.098686,0.114456,0.060442,0.240235,0.035990,0.049261,0.228916,0.188670,0.102952,0.275684
364,O=C(N(C1=CC=CC=C1C(O)=O)C)CCC2=CC=C(F)C=C2.C=C...,0.563075,0.098232,0.199243,0.034410,0.184122,0.064050,0.145034,0.197607,0.138005,...,0.095144,0.056969,0.074419,0.243414,0.086539,0.080915,0.154527,0.061614,0.038087,0.370405
365,O=C(N(C1=CC=CC(F)=C1C(O)=O)C)CCC2=CC(F)=CC=C2....,0.592294,0.065697,0.227285,0.021715,0.384089,0.072540,0.172149,0.197607,0.138005,...,0.072588,0.059334,0.028623,0.386085,0.039281,0.024825,0.151328,0.149173,0.031179,0.370405
366,CC([Si](CC1=CC=CC=C1)(OC2=C(C#N)C=CC=C2)C(C)C)...,0.471802,0.040194,0.169874,0.057898,0.135297,0.061402,0.118340,0.154823,0.103585,...,0.070318,0.051108,0.009272,0.203166,0.021368,0.041153,0.070438,0.057319,0.004590,0.316063


In [562]:
dfs = pd.concat([df_decoder_output_train, df], axis=1)
dfs

Unnamed: 0,smiles_label,0,1,2,3,4,5,6,7,8,...,396,397,398,399,smiles,Yield,mean+sigma,label_plus,mean-sigma,label_minus
0,N#CC(C=C(OC)C=C1)=C1OCCCC2=CC(C(F)(F)F)=CC=C2....,0.520122,0.109922,0.149098,0.053835,0.185784,0.102807,0.152670,0.126783,0.142995,...,0.097785,0.022894,0.018409,0.325049,N#CC(C=C(OC)C=C1)=C1OCCCC2=CC(C(F)(F)F)=CC=C2....,82,83.62843,0,64.713962,0
1,O=C(C1=C(C#N)C=CC=C1)N(C)CCC2=CC(Cl)=CC=C2.C=C...,0.505961,0.110397,0.199684,0.016118,0.217537,0.038914,0.194987,0.197607,0.136213,...,0.110850,0.015501,0.039126,0.258439,O=C(C1=C(C#N)C=CC=C1)N(C)CCC2=CC(Cl)=CC=C2.C=C...,78,83.62843,0,64.713962,0
2,O=C(N(C1=CC(OC)=CC=C1C#N)C2=C(C#N)C=CC(OC)=C2)...,0.531706,0.067528,0.310283,0.053984,0.274924,0.130943,0.224237,0.209065,0.153753,...,0.186175,0.122245,0.028403,0.228475,O=C(N(C1=CC(OC)=CC=C1C#N)C2=C(C#N)C=CC(OC)=C2)...,58,83.62843,0,64.713962,1
3,O=S(CC1=CC(F)=CC=C1)(OC2=C(C#N)C=CC=C2)=O.C=CC...,0.468523,0.134949,0.140643,0.120589,0.206129,0.023158,0.119368,0.197607,0.122283,...,0.042415,0.018676,0.030374,0.278195,O=S(CC1=CC(F)=CC=C1)(OC2=C(C#N)C=CC=C2)=O.C=CC...,76,83.62843,0,64.713962,0
4,O=C(C(F)OC1=C(C#N)C=CC=C1)N(C)C2=C(C)C=CC=C2.C...,0.489745,0.065481,0.222345,0.086288,0.227518,0.036697,0.184868,0.197607,0.145667,...,0.240506,0.253225,0.028981,0.174144,O=C(C(F)OC1=C(C#N)C=CC=C1)N(C)C2=C(C)C=CC=C2.C...,72,83.62843,0,64.713962,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
363,O=C(N(S(C1=CC=C([N+]([O-])=O)C=C1)(=O)=O)CCC2=...,0.483660,0.054629,0.192002,0.046049,0.200293,0.076255,0.156875,0.197607,0.170324,...,0.228916,0.188670,0.102952,0.275684,O=C(N(S(C1=CC=C([N+]([O-])=O)C=C1)(=O)=O)CCC2=...,72,83.62843,0,64.713962,0
364,O=C(N(C1=CC=CC=C1C(O)=O)C)CCC2=CC=C(F)C=C2.C=C...,0.563075,0.098232,0.199243,0.034410,0.184122,0.064050,0.145034,0.197607,0.138005,...,0.154527,0.061614,0.038087,0.370405,O=C(N(C1=CC=CC=C1C(O)=O)C)CCC2=CC=C(F)C=C2.C=C...,55,83.62843,0,64.713962,1
365,O=C(N(C1=CC=CC(F)=C1C(O)=O)C)CCC2=CC(F)=CC=C2....,0.592294,0.065697,0.227285,0.021715,0.384089,0.072540,0.172149,0.197607,0.138005,...,0.151328,0.149173,0.031179,0.370405,O=C(N(C1=CC=CC(F)=C1C(O)=O)C)CCC2=CC(F)=CC=C2....,45,83.62843,0,64.713962,1
366,CC([Si](CC1=CC=CC=C1)(OC2=C(C#N)C=CC=C2)C(C)C)...,0.471802,0.040194,0.169874,0.057898,0.135297,0.061402,0.118340,0.154823,0.103585,...,0.070438,0.057319,0.004590,0.316063,CC([Si](CC1=CC=CC=C1)(OC2=C(C#N)C=CC=C2)C(C)C)...,72,83.62843,0,64.713962,0


In [563]:
dfs.to_csv(".Data/dataset-encoding-vectors.csv")