# Imports

In [1]:
#Imports - general
import numpy as np

#Imports - RDKit
from rdkit.Chem import MolFromSmiles
from rdkit.Chem.AllChem import GetMorganFingerprintAsBitVect
from rdkit.DataStructs import DiceSimilarity

#Imports - additional
from os import listdir #for getting a list of files in a dir to process checkpoints

---

# Singletons

In [2]:
# Load the receptor list: every row of the file becomes a list of its
# whitespace-separated fields (the first field is used later as a file name).
with open("receptors_descending.txt", "r") as receptor_file:
    receptors = [row.strip().split() for row in receptor_file]

In [3]:
def read_ism_w_chembl(file_name):
    '''
    Parse a tab-separated .ism file.

    Every non-blank line is split on tabs; the first field is taken as the
    SMILES string and the second as the accompanying identifier. Any further
    fields are ignored. Blank lines (e.g. a trailing empty line) are skipped.

    Parameters:
        file_name: path of the .ism file to read.

    Returns:
        (mol_list, chembl_list): two parallel lists holding, for each parsed
        line, the first and the second field respectively.

    Raises:
        IndexError: if a non-blank line has fewer than two tab-separated fields.
    '''
    mol_list = []
    chembl_list = []
    with open(file_name, 'r') as f:
        for raw_line in f:
            stripped = raw_line.strip()
            if not stripped:
                # A blank line carries no record; indexing it would fail.
                continue
            fields = stripped.split('\t')
            smile = fields[0]
            chembl = fields[1]
            mol_list.append(smile)
            chembl_list.append(chembl)
    return mol_list, chembl_list

In [4]:
num_class = 73
ism_path = 'data/targets/'

# Parse the .ism file of each of the first `num_class` receptors; the file
# name is the first field of the receptor entry.
parsed_targets = [read_ism_w_chembl(ism_path + entry[0])
                  for entry in receptors[:num_class]]

# One list of SMILES and one list of ids per target, in receptor order.
mol_matrix = [target_smiles for target_smiles, _ in parsed_targets]
chembl_matrix = [target_ids for _, target_ids in parsed_targets]

In [5]:
def get_class_vectors_w_chembl(mol_matrix, chembl_matrix):
    '''
    Build a per-molecule record with a multi-hot class vector.

    Parameters:
        mol_matrix: list with one list of SMILES strings per class.
        chembl_matrix: list parallel to mol_matrix holding the id of every
            SMILES at the same position.

    Returns:
        dict mapping each distinct SMILES to [mol, labels, chembl_id], where
        mol is the result of MolFromSmiles, labels is a 0/1 list with one
        entry per class (1 where the SMILES occurs in that class) and
        chembl_id is the id found at the first occurrence of the SMILES.
    '''
    n_classes = len(mol_matrix)
    cv_dict = {}
    for class_idx, class_smiles in enumerate(mol_matrix):
        for position, smile in enumerate(class_smiles):
            record = cv_dict.get(smile)
            if record is None:
                # First time this SMILES is seen: create its record.
                first_id = chembl_matrix[class_idx][position]
                record = [MolFromSmiles(smile), [0] * n_classes, first_id]
                cv_dict[smile] = record
            record[1][class_idx] = 1
    return cv_dict

In [6]:
molecules = get_class_vectors_w_chembl(mol_matrix, chembl_matrix)

In [7]:
# Split the data into training and validation sets
# Unpack the molecule dictionary into four parallel arrays:
# the SMILES keys, and from each record the mol object, the class label
# vector and the id.
records = list(molecules.values())
smiles = np.array(list(molecules.keys()))
mols = np.array([record[0] for record in records])
labels = np.array([record[1] for record in records])
chembl_ids = np.array([record[2] for record in records])

### Select the singletons

In [8]:
# Find the distinct label vectors (rows of `labels`), the index of the first
# molecule carrying each of them, and how many molecules share it.
unique_labels, unique_indices, unique_counts = np.unique(
    labels, axis=0, return_index=True, return_counts=True)

# A singleton is a molecule whose label vector occurs exactly once.
occurs_once = unique_counts == 1
singular_indices = unique_indices[occurs_once]

smiles_os, mols_os, labels_os, chembl_os = (
    smiles[singular_indices],
    mols[singular_indices],
    labels[singular_indices],
    chembl_ids[singular_indices],
)

### Calculate similarity

In [9]:
def Calculate_DiceSimilarity(molecules):
    '''
    Compute the pairwise Dice similarity matrix of a collection of molecules.

    Each molecule is converted with GetMorganFingerprintAsBitVect(mol, 2) and
    every unordered pair of fingerprints is compared with DiceSimilarity.

    Parameters:
        molecules: iterable of molecule objects accepted by
            GetMorganFingerprintAsBitVect.

    Returns:
        Square float numpy array with 1.0 on the diagonal and the (symmetric)
        pairwise similarities elsewhere.
    '''
    fingerprints = [GetMorganFingerprintAsBitVect(mol, 2) for mol in molecules]
    count = len(fingerprints)

    # Start from the identity: every molecule is fully similar to itself.
    similarity_matrix = np.eye(count)
    for row in range(count):
        for col in range(row + 1, count):
            score = DiceSimilarity(fingerprints[row], fingerprints[col])
            # The measure is symmetric, so fill both triangles at once.
            similarity_matrix[row, col] = score
            similarity_matrix[col, row] = score

    return similarity_matrix

In [10]:
dice = Calculate_DiceSimilarity(mols_os)

### Save the resulting matrix to file

In [11]:
# Header row: the id column, an unnamed column for the running number, then
# one numbered column per molecule.
column_numbers = ''.join(',{}'.format(col + 1) for col in range(len(chembl_os)))
csv_lines = ['CHEMBL id,' + column_numbers + '\n']

# One row per molecule: id, running number, similarities rounded to 3 places.
for row_idx, similarities in enumerate(dice):
    prefix = '{},{},'.format(chembl_os[row_idx], row_idx + 1)
    values = ','.join(str(round(value, 3)) for value in similarities)
    csv_lines.append(prefix + values + '\n')

with open('test_save_1.csv','w') as csv_file:
    csv_file.writelines(csv_lines)

### Check how similar they are

In [87]:
# Histogram of pairwise similarities, counted per unordered pair.
# `dice` is symmetric, so every off-diagonal pair appears twice in the
# flattened matrix; each bin count is therefore halved.
n_mols = dice.shape[0]  # number of diagonal (self-similarity) entries

flattened = dice.flatten()
flat_sort = np.sort(flattened)[::-1]

tmp_dict1 = {}
points = [0,.1,.2,.3,.4,.5,.6,.7,.8,.9,1]
for s, e in zip(points[:-1], points[1:]):
    # Half-open bin [s, e), keyed by its lower edge.
    in_bin = (s <= flat_sort) & (flat_sort < e)
    tmp_dict1[s] = int(np.count_nonzero(in_bin))//2

# Entries equal to 1 include the diagonal, which is not a real pair:
# remove those n_mols entries before halving.
tmp_dict1[1] = (int(np.count_nonzero(flat_sort == 1)) - n_mols)//2

In [82]:
# Sanity check: the bins hold unordered pairs, so together they must add up to
# half of the off-diagonal entries of the similarity matrix, n*(n-1)/2.
n_mols = dice.shape[0]
check = sum(tmp_dict1.values())
check == (len(flat_sort) - n_mols)//2

True

In [83]:
len(flat_sort) # = 506**2

256036

In [88]:
tmp_dict1

{0: 4017,
 0.1: 48677,
 0.2: 55123,
 0.3: 15767,
 0.4: 2912,
 0.5: 658,
 0.6: 250,
 0.7: 202,
 0.8: 123,
 0.9: 25,
 1: 11}

In [89]:
len(flat_sort[np.where(flat_sort==0)])//2

33

## 0.75 <= x --> all go to the training set

In [17]:
pair_indices = np.where(0.75<=dice)
pair_indices

(array([  0,   1,   2, ..., 504, 505, 505]),
 array([  0,   1,   2, ..., 504, 397, 505]))

In [27]:
pair_indices[0].shape

(1018,)

In [49]:
np.array_equal(np.sort(pair_indices[0]), np.sort(pair_indices[1]))

True

### Remove the pairs along the diagonal.

In [21]:
# Keep only the index pairs that do not lie on the diagonal (row != column).
off_diagonal = [(row, col) for row, col in zip(*pair_indices) if row != col]
actual_pairs = np.array([
    [row for row, _ in off_diagonal],
    [col for _, col in off_diagonal],
])

In [23]:
actual_pairs.shape # that's okay, this is the doubled number

(2, 512)

In [44]:
np.unique(actual_pairs[0])

array([  3,  10,  15,  17,  18,  21,  23,  24,  26,  28,  46,  48,  62,
        63,  66,  68,  69,  70,  71,  82,  83,  84,  85,  88,  90,  97,
       104, 105, 106, 107, 111, 112, 113, 114, 122, 124, 127, 128, 129,
       131, 135, 137, 142, 144, 145, 146, 147, 149, 161, 165, 166, 174,
       175, 183, 186, 187, 189, 191, 196, 198, 201, 207, 208, 210, 211,
       212, 214, 215, 218, 219, 220, 221, 223, 224, 227, 230, 231, 232,
       234, 236, 241, 242, 243, 244, 248, 250, 251, 252, 257, 259, 260,
       261, 263, 267, 269, 271, 272, 274, 275, 281, 283, 287, 288, 289,
       292, 293, 294, 297, 298, 299, 300, 302, 303, 305, 310, 314, 315,
       318, 321, 322, 324, 325, 326, 327, 328, 332, 334, 335, 336, 337,
       339, 340, 342, 345, 348, 349, 350, 351, 352, 354, 355, 356, 358,
       359, 360, 361, 362, 363, 364, 365, 366, 367, 368, 369, 370, 371,
       372, 380, 381, 382, 384, 388, 389, 390, 391, 394, 395, 396, 397,
       398, 400, 401, 403, 408, 411, 412, 413, 414, 418, 420, 42

# Repetitions

In [90]:
# NOTE: this repeats the definition from the "Singletons" section so that the
# "Repetitions" section can be run on its own.
def read_ism_w_chembl(file_name):
    '''
    Parse a tab-separated .ism file.

    Every non-blank line is split on tabs; the first field is taken as the
    SMILES string and the second as the accompanying identifier. Any further
    fields are ignored. Blank lines (e.g. a trailing empty line) are skipped.

    Parameters:
        file_name: path of the .ism file to read.

    Returns:
        (mol_list, chembl_list): two parallel lists holding, for each parsed
        line, the first and the second field respectively.

    Raises:
        IndexError: if a non-blank line has fewer than two tab-separated fields.
    '''
    mol_list = []
    chembl_list = []
    with open(file_name, 'r') as f:
        for raw_line in f:
            stripped = raw_line.strip()
            if not stripped:
                # A blank line carries no record; indexing it would fail.
                continue
            fields = stripped.split('\t')
            smile = fields[0]
            chembl = fields[1]
            mol_list.append(smile)
            chembl_list.append(chembl)
    return mol_list, chembl_list

In [91]:
# Reload the receptor list for this section: every row of the file becomes a
# list of its whitespace-separated fields.
with open("receptors_descending.txt", "r") as receptor_file:
    receptors = [row.strip().split() for row in receptor_file]

In [92]:
num_class = 73
ism_path = 'data/targets/'

# Parse the .ism file of each of the first `num_class` receptors; the file
# name is the first field of the receptor entry.
parsed_targets = [read_ism_w_chembl(ism_path + entry[0])
                  for entry in receptors[:num_class]]

# One list of SMILES and one list of ids per target, in receptor order.
mol_matrix = [target_smiles for target_smiles, _ in parsed_targets]
chembl_matrix = [target_ids for _, target_ids in parsed_targets]

In [93]:
# For every target file, collect the SMILES strings that occur on more than
# one line. Result: {file name: {smile: [(line_number, chembl_id), ...]}}.
# Line numbers are 1-based. Targets without any repeated SMILES are left out.
class_dict = {}
# Iterate over the targets that were actually loaded instead of a hard-coded
# count, so a shorter receptor list cannot cause an IndexError.
for target_i, target in enumerate(mol_matrix):
    first_seen = {}  # smile -> (line_number, chembl_id) of its first occurrence
    repeated = {}    # smile -> all occurrences, only for smiles seen 2+ times
    for line_idx, smile in enumerate(target):
        occurrence = (line_idx + 1, chembl_matrix[target_i][line_idx])
        if smile not in first_seen:
            first_seen[smile] = occurrence
        elif smile in repeated:
            repeated[smile].append(occurrence)
        else:
            # Second occurrence: start the list with the remembered first one.
            repeated[smile] = [first_seen[smile], occurrence]
    if repeated:
        class_dict[receptors[target_i][0]] = repeated

In [94]:
# Invert class_dict: for every repeated SMILES, list the (file name, occurrences)
# pairs of all targets in which it is repeated, with targets in sorted order.
repetitions_dict = {}
for target_name, repeated_smiles in sorted(class_dict.items()):
    for smile, occurrences in repeated_smiles.items():
        repetitions_dict.setdefault(smile, []).append((target_name, occurrences))

In [41]:
# Write a plain-text report of the repeated molecules: a legend first, then
# one paragraph per SMILES listing each file and the lines where it occurs.
legend = "Format:\nSmile\n\tfile\n\t\tline_number -- CHEMBL_id\n\nLine numbers counted from 1, the same as when opening .ism files through Jupyter Notebook.\n\n\n"

with open('repeated_molecules.txt','w') as report:
    report.write(legend)

    for smile, per_target in sorted(repetitions_dict.items()):
        report.write("{}\n".format(smile))
        for target_name, occurrences in per_target:
            report.write('\t{}\n'.format(target_name))
            report.writelines(
                '\t\t{} -- {}\n'.format(line_number, chembl_id)
                for line_number, chembl_id in occurrences
            )
        report.write('\n')

In [18]:
# Print, per target file (sorted by name), every repeated SMILES followed by
# its list of (line_number, chembl_id) occurrences.
for target_name, repeated_smiles in sorted(class_dict.items()):
    print(target_name,'\n')
    for smile, occurrences in repeated_smiles.items():
        print(smile,'\n',occurrences,'\n')
    print('\n')

ADRA1A.ism 

COc1cccc(OC)c1OCCNC[C@@H]2C[S+]([O-])c3ccccc3O2 
 [(2288, 'CHEMBL2114139'), (2289, 'CHEMBL2115166')] 



ADRA2A.ism 

COc1cccc(OC)c1OCCNC[C@@H]2C[S+]([O-])c3ccccc3O2 
 [(632, 'CHEMBL2114139'), (633, 'CHEMBL2115166')] 



ADRA2B.ism 

COc1cccc(OC)c1OCCNC[C@@H]2C[S+]([O-])c3ccccc3O2 
 [(573, 'CHEMBL2114139'), (574, 'CHEMBL2115166')] 



ADRA2C.ism 

COc1cccc(OC)c1OCCNC[C@@H]2C[S+]([O-])c3ccccc3O2 
 [(595, 'CHEMBL2114139'), (596, 'CHEMBL2115166')] 



CHRM1.ism 

COc1ccc(cc1)[S+]([O-])c2ccc(cc2)C(=C)C3CCN(CC3)C4CCCCC4 
 [(570, 'CHEMBL2115128'), (588, 'CHEMBL2114068')] 

COc1ccc(cc1)[S+]([O-])c2ccc(cc2)C(=O)C3CCN(CC3)C4CCCCC4 
 [(589, 'CHEMBL2115127'), (598, 'CHEMBL2114067')] 

COc1ccc(cc1)[S+]([O-])c2ccc(cc2)C(C#N)N3CCN(CC3)C4CCCCC4 
 [(282, 'CHEMBL73341'), (564, 'CHEMBL2114066'), (567, 'CHEMBL2111540')] 

COc1ccc(cc1)[S+]([O-])c2ccc(cc2)C(C#N)C3CCN(CC3)C4CCCCC4 
 [(569, 'CHEMBL2115126'), (577, 'CHEMBL2114064')] 



CHRM2.ism 

COc1ccc(cc1)[S+]([O-])c2ccc(cc2)C(=C)C3CCN(CC3)C