In [1]:
import numpy as np
import pandas as pd
import multiprocessing
import pickle
import atomInSmiles

from tqdm import tqdm

In [28]:
# Read the train and validation splits from the random stratified split directory.
train_parquet = '/data/datasets/leash-BELKA/random_stratified_split/train.parquet'
valid_parquet = '/data/datasets/leash-BELKA/random_stratified_split/valid.parquet'
df_train = pd.read_parquet(train_parquet)
df_valid = pd.read_parquet(valid_parquet)

In [2]:
df_test = pd.read_parquet('/data/datasets/leash-BELKA/origin/test.parquet')

In [3]:
df_test.head(4)

Unnamed: 0,id,buildingblock1_smiles,buildingblock2_smiles,buildingblock3_smiles,molecule_smiles,protein_name
0,295246830,C#CCCC[C@H](NC(=O)OCC1c2ccccc2-c2ccccc21)C(=O)O,C=Cc1ccc(N)cc1,C=Cc1ccc(N)cc1,C#CCCC[C@H](Nc1nc(Nc2ccc(C=C)cc2)nc(Nc2ccc(C=C...,BRD4
1,295246831,C#CCCC[C@H](NC(=O)OCC1c2ccccc2-c2ccccc21)C(=O)O,C=Cc1ccc(N)cc1,C=Cc1ccc(N)cc1,C#CCCC[C@H](Nc1nc(Nc2ccc(C=C)cc2)nc(Nc2ccc(C=C...,HSA
2,295246832,C#CCCC[C@H](NC(=O)OCC1c2ccccc2-c2ccccc21)C(=O)O,C=Cc1ccc(N)cc1,C=Cc1ccc(N)cc1,C#CCCC[C@H](Nc1nc(Nc2ccc(C=C)cc2)nc(Nc2ccc(C=C...,sEH
3,295246833,C#CCCC[C@H](NC(=O)OCC1c2ccccc2-c2ccccc21)C(=O)O,C=Cc1ccc(N)cc1,CC(O)Cn1cnc2c(N)ncnc21,C#CCCC[C@H](Nc1nc(Nc2ccc(C=C)cc2)nc(Nc2ncnc3c2...,BRD4


In [4]:
# Load the vocabulary that maps each token string to its integer id
# (encode_smile looks tokens up in this dict).
# NOTE: pickle.load can execute arbitrary code; only load files from a trusted source.
with open('/data/datasets/leash-BELKA/AIS-Token-Dict.pickle', 'rb') as f:
    ais_tokens_dict = pickle.load(f)

In [5]:
ais_tokens_dict

{'#': 1,
 '(': 2,
 ')': 3,
 '-': 4,
 '/': 5,
 '1': 6,
 '2': 7,
 '3': 8,
 '4': 9,
 '5': 10,
 '6': 11,
 '7': 12,
 '8': 13,
 '9': 14,
 '=': 15,
 '[B;R;COO]': 16,
 '[Br;!R;C]': 17,
 '[C;!R;CCCC]': 18,
 '[C;!R;CCCN]': 19,
 '[C;!R;CCCO]': 20,
 '[C;!R;CCC]': 21,
 '[C;!R;CCCl]': 22,
 '[C;!R;CCFF]': 23,
 '[C;!R;CCOO]': 24,
 '[C;!R;CCO]': 25,
 '[C;!R;CC]': 26,
 '[C;!R;CClCl]': 27,
 '[C;!R;CFFF]': 28,
 '[C;!R;CNN]': 29,
 '[C;!R;CNO]': 30,
 '[C;!R;CNS]': 31,
 '[C;!R;CN]': 32,
 '[C;!R;COO]': 33,
 '[C;!R;COS]': 34,
 '[C;!R;CSi]': 35,
 '[C;!R;FFFO]': 36,
 '[C;!R;NNO]': 37,
 '[C;!R;NOO]': 38,
 '[C;R;CCCC]': 39,
 '[C;R;CCCF]': 40,
 '[C;R;CCCN]': 41,
 '[C;R;CCCO]': 42,
 '[C;R;CCCS]': 43,
 '[C;R;CCC]': 44,
 '[C;R;CCFF]': 45,
 '[C;R;CCN]': 46,
 '[C;R;CCOO]': 47,
 '[C;R;CCO]': 48,
 '[C;R;CCS]': 49,
 '[C;R;CNN]': 50,
 '[C;R;CNO]': 51,
 '[C;R;COO]': 52,
 '[C;R;FFOO]': 53,
 '[C;R;NNO]': 54,
 '[C;R;NNS]': 55,
 '[C;R;NOO]': 56,
 '[C;R;NOS]': 57,
 '[CH2;!R;CC]': 58,
 '[CH2;!R;CF]': 59,
 '[CH2;!R;CN]': 60,
 '[CH2

In [31]:
s = atomInSmiles.similarity('[s;R;CN]', '[o;R;CN]')
s

0.0

In [6]:
# Fixed length of every encoded sequence; shorter sequences are right-padded to it.
MAX_LEN = 150
# Id written into the padding positions.
PAD = 0
# Same value as the literal 221 that the ids in UNK_tokens are overwritten with
# in the "UNK = 221 test parquet" section.
UNK = 221

def remove_dy(smiles):
    """Return ``smiles`` with every literal ``[Dy]`` substring deleted."""
    return "".join(smiles.split("[Dy]"))

def tokenize(smiles):
    """Strip ``[Dy]`` from ``smiles``, encode it with atomInSmiles, and return
    the whitespace-separated pieces of the encoded string as a list."""
    return atomInSmiles.encode(remove_dy(smiles)).split()

def encode_smile(smiles):
    """Encode a SMILES string as a fixed-length vector of token ids.

    Each token from ``tokenize`` is looked up in ``ais_tokens_dict`` (a token
    missing from the dict raises ``KeyError``). The id list is right-padded
    with ``PAD`` to ``MAX_LEN`` entries and returned as a ``uint8`` array.
    An ``AssertionError`` is raised when there are more than ``MAX_LEN`` tokens.
    """
    ids = list(map(ais_tokens_dict.__getitem__, tokenize(smiles)))
    assert len(ids) <= MAX_LEN, "Max length overhead"
    padding = [PAD] * (MAX_LEN - len(ids))
    return np.array(ids + padding).astype(np.uint8)

In [7]:
smiles = 'NCC(=O)O'

print(encode_smile(smiles))

[116  60  33   2  15 124   3 130   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0]


In [8]:
smiles_test = df_test['molecule_smiles'].values
len(smiles_test)

1674896

In [9]:
# Load the list of token ids that are later overwritten with 221
# (see the "UNK = 221 test parquet" section).
# NOTE: pickle.load can execute arbitrary code; only load files from a trusted source.
with open('/data/datasets/leash-BELKA/AIS-UNK-Tokens.pickle', 'rb') as f:
    UNK_tokens = pickle.load(f)
UNK_tokens

[31, 50, 53, 64, 65, 80, 91, 98, 99, 137, 151, 160, 186, 193]

In [25]:
# UNK_tokens = [v + 1 for v in UNK_tokens]
# UNK_tokens

[31, 50, 53, 64, 65, 80, 91, 98, 99, 137, 151, 160, 186, 193]

In [27]:
# with open('/data/datasets/leash-BELKA/AIS-UNK-Tokens.pickle', 'wb') as f:
#     pickle.dump(UNK_tokens, f)

### Tokenize the test data with AIS

In [10]:
# Encode every test SMILES in parallel with encode_smile.
# pool.imap defaults to chunksize=1, which sends each SMILES to a worker as its
# own task; handing work out in larger chunks cuts the inter-process overhead.
# imap still yields results in input order, so the output rows are unchanged.
with multiprocessing.Pool(processes=32) as pool:
    test_ais_encoded = list(
        tqdm(
            pool.imap(encode_smile, smiles_test, chunksize=1024),
            total=len(smiles_test),
        )
    )

100%|██████████| 1674896/1674896 [01:37<00:00, 17207.26it/s]


In [11]:
test_ais_encoded = np.stack(test_ais_encoded)

In [13]:
test_ais_encoded.shape

(1674896, 150)

### Base test parquet, UNK = random noise vector

In [14]:
# Save the encoded test matrix as-is, one column per token position.
# The column count comes from MAX_LEN rather than a repeated literal 150, so
# the names always match the width produced by encode_smile.
test_base = pd.DataFrame(test_ais_encoded, columns=[f'token{i}' for i in range(MAX_LEN)])
test_base.to_parquet('/data/datasets/leash-BELKA/test_ais_tokenized_base.parquet')

### UNK = 221 test parquet

In [16]:
# Overwrite, in place, every id listed in UNK_tokens with the UNK id.
# A single vectorised membership mask replaces one full-array scan per token,
# and the UNK constant replaces the duplicated literal 221.
test_ais_encoded[np.isin(test_ais_encoded, UNK_tokens)] = UNK

In [20]:
np.any(test_ais_encoded == 0)

True

In [21]:
# Save the encoded test matrix after the UNK-token ids were overwritten.
# The column count comes from MAX_LEN rather than a repeated literal 150.
test_unk_221 = pd.DataFrame(test_ais_encoded, columns=[f'token{i}' for i in range(MAX_LEN)])
test_unk_221.to_parquet('/data/datasets/leash-BELKA/test_ais_tokenized_unk=221.parquet')

In [25]:
for i in range(1,10):
    print(i)

1
2
3
4
5
6
7
8
9


### Replace UNK tokens with their most similar known tokens

In [24]:
# Invert the vocabulary: integer id -> token string.
a = {token_id: token for token, token_id in ais_tokens_dict.items()}

In [26]:
# For each UNK token id, find the other id in 1..220 whose token scores the
# highest atomInSmiles.similarity against it. The stored value is
# (best_id, best_similarity); it stays (-1, 0) when no candidate scores above 0.
# NOTE(review): the recorded output of this search is (-1, 0) for every UNK id,
# i.e. no pair scored above 0 -- confirm that atomInSmiles.similarity is
# meaningful when given single tokens.
sim_result_dict = {}
for unk_id in UNK_tokens:
    unk_token = a[unk_id]
    best_sim, best_id = 0, -1
    for cand_id in range(1, 221):
        if cand_id == unk_id:
            continue
        cand_sim = atomInSmiles.similarity(unk_token, a[cand_id])
        # Strict '>' keeps the first candidate that reaches the maximum.
        if cand_sim > best_sim:
            best_sim, best_id = cand_sim, cand_id
    sim_result_dict[unk_id] = (best_id, best_sim)

In [27]:
sim_result_dict

{31: (-1, 0),
 50: (-1, 0),
 53: (-1, 0),
 64: (-1, 0),
 65: (-1, 0),
 80: (-1, 0),
 91: (-1, 0),
 98: (-1, 0),
 99: (-1, 0),
 137: (-1, 0),
 151: (-1, 0),
 160: (-1, 0),
 186: (-1, 0),
 193: (-1, 0)}

In [29]:
# Character-level vocabulary: maps a single SMILES character to an integer id.
# Ids run from 1 to 36; 0 is not assigned to any character.
ch_token_dict = {'l': 1, 'y': 2, '@': 3, '3': 4, 'H': 5, 'S': 6, 'F': 7, 'C': 8, 'r': 9, 's': 10, '/': 11, 'c': 12, 'o': 13,
                 '+': 14, 'I': 15, '5': 16, '(': 17, '2': 18, ')': 19, '9': 20, 'i': 21, '#': 22, '6': 23, '8': 24, '4': 25, '=': 26,
                 '1': 27, 'O': 28, '[': 29, 'D': 30, 'B': 31, ']': 32, 'N': 33, '7': 34, 'n': 35, '-': 36}

def tokenize_char(smiles):
    """Encode a SMILES string character by character as a fixed-length id vector.

    ``[Dy]`` is removed first, then each remaining character is looked up in
    ``ch_token_dict`` (a character missing from the dict raises ``KeyError``).
    The id list is right-padded with ``PAD`` to ``MAX_LEN`` entries and
    returned as a ``uint8`` array. An ``AssertionError`` is raised when the
    cleaned string has more than ``MAX_LEN`` characters.
    """
    cleaned = remove_dy(smiles)
    ids = list(map(ch_token_dict.__getitem__, cleaned))
    assert len(ids) <= MAX_LEN
    padding = [PAD] * (MAX_LEN - len(ids))
    return np.array(ids + padding).astype(np.uint8)

In [30]:
smiles = 'NCC(=O)O'

print(tokenize_char(smiles))

[33  8  8 17 26 28 19 28  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0]


In [31]:
# Character-encode every test SMILES in parallel with tokenize_char.
# pool.imap defaults to chunksize=1, which sends each SMILES to a worker as its
# own task; handing work out in larger chunks cuts the inter-process overhead.
# imap still yields results in input order, so the output rows are unchanged.
with multiprocessing.Pool(processes=32) as pool:
    test_ch_encoded = list(
        tqdm(
            pool.imap(tokenize_char, smiles_test, chunksize=1024),
            total=len(smiles_test),
        )
    )

100%|██████████| 1674896/1674896 [00:51<00:00, 32604.04it/s]


In [33]:
test_ch_encoded = np.stack(test_ch_encoded)

In [34]:
# Wrap the stacked test matrix in a DataFrame, one column per token position.
# The column count comes from MAX_LEN rather than a repeated literal 150, so
# the names always match the width produced by tokenize_char.
test_ch_encoded = pd.DataFrame(test_ch_encoded, columns=[f'token{i}' for i in range(MAX_LEN)])

In [35]:
test_ch_encoded.head(3)

Unnamed: 0,token0,token1,token2,token3,token4,token5,token6,token7,token8,token9,...,token140,token141,token142,token143,token144,token145,token146,token147,token148,token149
0,8,22,8,8,8,8,29,8,3,5,...,0,0,0,0,0,0,0,0,0,0
1,8,22,8,8,8,8,29,8,3,5,...,0,0,0,0,0,0,0,0,0,0
2,8,22,8,8,8,8,29,8,3,5,...,0,0,0,0,0,0,0,0,0,0


In [36]:
test_ch_encoded.to_parquet('/data/datasets/leash-BELKA/test_ch_tokenized.parquet')

In [37]:
# Character-encode every training SMILES in parallel and stack the per-molecule
# vectors into one 2-D array.
smiles_train = df_train['molecule_smiles'].values
print(len(smiles_train))

# pool.imap defaults to chunksize=1, which sends each SMILES to a worker as its
# own task; handing work out in larger chunks cuts the inter-process overhead.
# imap still yields results in input order, so the rows stay aligned with df_train.
with multiprocessing.Pool(processes=32) as pool:
    train_ch_encoded = list(
        tqdm(
            pool.imap(tokenize_char, smiles_train, chunksize=1024),
            total=len(smiles_train),
        )
    )
train_ch_encoded = np.stack(train_ch_encoded)

83653268


100%|██████████| 83653268/83653268 [42:59<00:00, 32424.14it/s]


In [38]:
# Wrap the encoded training matrix in a DataFrame and attach the three binding
# label columns from df_train (copied by position via .values).
# MAX_LEN replaces the repeated literal 150 so the column names track the encoder width.
train_ch_encoded = pd.DataFrame(train_ch_encoded, columns=[f'token{i}' for i in range(MAX_LEN)])
for label in ('binds_BRD4', 'binds_HSA', 'binds_sEH'):
    train_ch_encoded[label] = df_train[label].values

In [39]:
train_ch_encoded.head(4)

Unnamed: 0,token0,token1,token2,token3,token4,token5,token6,token7,token8,token9,...,token143,token144,token145,token146,token147,token148,token149,binds_BRD4,binds_HSA,binds_sEH
0,8,8,17,8,33,12,27,35,12,17,...,0,0,0,0,0,0,0,0,0,0
1,8,35,27,35,12,12,12,27,29,8,...,0,0,0,0,0,0,0,0,0,0
2,28,26,8,17,33,19,29,8,3,5,...,0,0,0,0,0,0,0,0,0,0
3,8,12,27,10,12,17,8,19,12,17,...,0,0,0,0,0,0,0,0,0,0


In [40]:
train_ch_encoded.to_parquet('/data/datasets/leash-BELKA/random_stratified_split/train_ch_tokenized.parquet')

In [41]:
# Character-encode every validation SMILES in parallel and stack the
# per-molecule vectors into one 2-D array.
valid_smiles = df_valid['molecule_smiles'].values
print(len(valid_smiles))

# pool.imap defaults to chunksize=1, which sends each SMILES to a worker as its
# own task; handing work out in larger chunks cuts the inter-process overhead.
# imap still yields results in input order, so the rows stay aligned with df_valid.
with multiprocessing.Pool(processes=32) as pool:
    valid_ch_encoded = list(
        tqdm(
            pool.imap(tokenize_char, valid_smiles, chunksize=1024),
            total=len(valid_smiles),
        )
    )
valid_ch_encoded = np.stack(valid_ch_encoded)

14762342


100%|██████████| 14762342/14762342 [07:47<00:00, 31589.81it/s]


In [43]:
# Wrap the encoded validation matrix in a DataFrame and attach the three binding
# label columns from df_valid (copied by position via .values).
# MAX_LEN replaces the repeated literal 150 so the column names track the encoder width.
valid_ch_encoded = pd.DataFrame(valid_ch_encoded, columns=[f'token{i}' for i in range(MAX_LEN)])
for label in ('binds_BRD4', 'binds_HSA', 'binds_sEH'):
    valid_ch_encoded[label] = df_valid[label].values

In [44]:
valid_ch_encoded.head(5)

Unnamed: 0,token0,token1,token2,token3,token4,token5,token6,token7,token8,token9,...,token143,token144,token145,token146,token147,token148,token149,binds_BRD4,binds_HSA,binds_sEH
0,8,28,12,27,12,12,18,12,17,12,...,0,0,0,0,0,0,0,0,0,0
1,8,6,6,8,8,33,12,27,35,12,...,0,0,0,0,0,0,0,0,0,0
2,8,28,12,27,12,12,12,12,17,36,...,0,0,0,0,0,0,0,0,0,0
3,8,28,8,17,26,28,19,12,27,10,...,0,0,0,0,0,0,0,0,0,0
4,8,12,27,12,12,17,31,9,19,12,...,0,0,0,0,0,0,0,0,0,0


In [45]:
valid_ch_encoded.to_parquet('/data/datasets/leash-BELKA/random_stratified_split/valid_ch_tokenized.parquet')

In [46]:
# Re-read the UNK token id list from the same pickle file that UNK_tokens was loaded from.
# NOTE: pickle.load can execute arbitrary code; only load files from a trusted source.
with open('/data/datasets/leash-BELKA/AIS-UNK-Tokens.pickle', 'rb') as f:
    unk = pickle.load(f)

In [47]:
unk

[31, 50, 53, 64, 65, 80, 91, 98, 99, 137, 151, 160, 186, 193]