In [5]:
#Imports - general
import random
random.seed(15)
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('whitegrid')
sns.set_context('paper')
from math import sqrt
from sklearn.model_selection import train_test_split
%matplotlib inline

#Imports - RDKit
from rdkit.Chem import MolFromSmiles
from rdkit.Chem.AllChem import GetMorganFingerprintAsBitVect
from rdkit.Chem.rdMolDescriptors import *

#Imports - additional
from os import listdir #for getting a list of files in a dir to process checkpoints
import time # for real-time timing the progress of the network

In [6]:
def read_ism(file_name):
    '''
    Parse an .ism file, returning a list of smiles of molecules.

    The SMILES string is taken from the first tab-separated column of every
    non-blank line; any further columns are ignored.

    Parameters
    ----------
    file_name : str
        Path of the .ism file to read.

    Returns
    -------
    list of str
        SMILES strings in file order. Duplicates are kept. Blank lines are
        skipped, so list positions count non-blank lines only.
    '''
    mol_list = []
    with open(file_name, 'r') as f:
        for line in f:
            # Strip after splitting so that a line without a tab does not
            # carry its line terminator into the SMILES string.
            smile = line.split('\t', 1)[0].strip()
            if smile:  # ignore blank lines instead of emitting '' / '\n'
                mol_list.append(smile)
    return mol_list

In [8]:
def get_class_vectors(mol_matrix):
    '''
    Build a multi-label class vector for every distinct SMILES string.

    Parameters
    ----------
    mol_matrix : list of list of str
        One list of SMILES strings per class; the position of a list in
        mol_matrix is the index of its class.

    Returns
    -------
    dict
        Maps each distinct SMILES string to a two-element list
        [mol, labels]: mol is the result of MolFromSmiles(smile), computed
        once per distinct string, and labels is a list of len(mol_matrix)
        integers holding 1 at the index of every class that contains the
        string and 0 elsewhere.

    NOTE(review): RDKit documents that MolFromSmiles returns None for a
    SMILES it cannot parse; such a value is stored here unchanged.
    '''
    cv_dict = {}
    for class_idx, class_smiles in enumerate(mol_matrix):
        for smile in class_smiles:
            entry = cv_dict.get(smile)
            if entry is None:
                # First time this SMILES is seen: parse it and start from an all-zero vector
                entry = [MolFromSmiles(smile), [0] * len(mol_matrix)]
                cv_dict[smile] = entry
            entry[1][class_idx] = 1
    return cv_dict

---

In [13]:
# Load the receptor list: each line is split on whitespace into a record whose
# first field is later used as the name of that receptor's .ism file.
with open("receptors_descending.txt", "r") as f:
    # Blank lines would produce empty records and break the later `line[0]`
    # lookups, so they are dropped here.
    receptors = [fields for fields in (row.split() for row in f) if fields]

In [14]:
num_class = 73  # first n most numerous classes - 2<=n<=73
ism_path = 'data/targets/'  # path to .ism files which represent classes and contain molecules

# One list of SMILES per selected receptor; the first field of each receptor
# record is the name of its .ism file inside ism_path.
mol_matrix = [read_ism(ism_path + record[0]) for record in receptors[:num_class]]

# Preview the first three SMILES of the first class
mol_matrix[0][:3]

['CN1CCN(CC1)C2=Cc3ccccc3C(=C(C)C)c4ccccc24',
 'CN1CCN(CC1)C2=Nc3cc(Cl)ccc3Nc4ccccc24',
 'CN1CCC(CC1)C2=Cc3cc(Cl)ccc3Cc4ccccc24']

# Singletons

In [15]:
# Map every distinct SMILES across the loaded classes to [mol, labels],
# where labels marks with 1 each class (index into mol_matrix) containing it.
molecules = get_class_vectors(mol_matrix)

In [16]:
# Unpack the molecule dictionary into three parallel arrays that share the
# dictionary's iteration order.
smiles = np.array(list(molecules))  # SMILES strings (the dict keys)
mols = np.array([entry[0] for entry in molecules.values()])  # molecule objects
labels = np.array([entry[1] for entry in molecules.values()])  # class-membership vectors

In [17]:
# Distinct label vectors (rows of `labels`), the index of the first row showing
# each of them, and how many rows share each of them.
unique_labels, unique_indices, unique_counts = np.unique(
    labels, axis=0, return_index=True, return_counts=True)

# Keep only the label vectors carried by exactly one molecule. The resulting
# indices follow np.unique's sorted order of label vectors, not the row order.
occurs_once = unique_counts == 1
singular_indices = unique_indices[occurs_once]

# Select those molecules from each of the parallel arrays
smiles_os, mols_os, labels_os = (
    smiles[singular_indices],
    mols[singular_indices],
    labels[singular_indices],
)

---

# Repetitions

In [50]:
# Reload the receptor list (same file as in the Singletons section): each line
# is split on whitespace into a record whose first field is later used as the
# name of that receptor's .ism file.
with open("receptors_descending.txt", "r") as f:
    # Blank lines would produce empty records and break the later
    # first-field lookups, so they are dropped here.
    receptors = [fields for fields in (row.split() for row in f) if fields]

In [51]:
num_class = 73  # number of receptor classes to load, taken from the top of the list
ism_path = 'data/targets/'  # directory holding one .ism file per class

# One list of SMILES per selected receptor; the first field of each receptor
# record is the name of its .ism file inside ism_path.
mol_matrix = [read_ism(ism_path + record[0]) for record in receptors[:num_class]]

# Preview the first three SMILES of the first class
mol_matrix[0][:3]

['CN1CCN(CC1)C2=Cc3ccccc3C(=C(C)C)c4ccccc24',
 'CN1CCN(CC1)C2=Nc3cc(Cl)ccc3Nc4ccccc24',
 'CN1CCC(CC1)C2=Cc3cc(Cl)ccc3Cc4ccccc24']

In [54]:
# For every loaded class, collect the SMILES strings that occur more than once
# in its .ism file. class_dict maps the receptor's file name to
# {smile: [1-based positions of every occurrence in that class's list]};
# classes without repeated SMILES are left out.
class_dict = {}
# Pair each receptor with its molecule list instead of hard-coding the number
# of classes, so this keeps working when num_class is changed.
for receptor, class_smiles in zip(receptors, mol_matrix):
    first_seen = {}  # smile -> position of its first occurrence
    repeats = {}     # smile -> positions of all occurrences (repeated smiles only)
    for position, smile in enumerate(class_smiles, start=1):
        if smile not in first_seen:
            first_seen[smile] = position
        else:
            # On the first repeat, seed the list with the original position
            repeats.setdefault(smile, [first_seen[smile]]).append(position)
    if repeats:
        class_dict[receptor[0]] = repeats

In [55]:
# Print, per receptor (alphabetical by file name), every repeated SMILES
# followed by the positions at which it occurs.
for receptor_name in sorted(class_dict):
    print(receptor_name, '\n')
    for smile, positions in class_dict[receptor_name].items():
        print(smile, '\n', positions, '\n')
    print('\n')

ADRA1A.ism 

COc1cccc(OC)c1OCCNC[C@@H]2C[S+]([O-])c3ccccc3O2 
 [2288, 2289] 



ADRA2A.ism 

COc1cccc(OC)c1OCCNC[C@@H]2C[S+]([O-])c3ccccc3O2 
 [632, 633] 



ADRA2B.ism 

COc1cccc(OC)c1OCCNC[C@@H]2C[S+]([O-])c3ccccc3O2 
 [573, 574] 



ADRA2C.ism 

COc1cccc(OC)c1OCCNC[C@@H]2C[S+]([O-])c3ccccc3O2 
 [595, 596] 



CHRM1.ism 

COc1ccc(cc1)[S+]([O-])c2ccc(cc2)C(C#N)C3CCN(CC3)C4CCCCC4 
 [569, 577] 

COc1ccc(cc1)[S+]([O-])c2ccc(cc2)C(=O)C3CCN(CC3)C4CCCCC4 
 [589, 598] 

COc1ccc(cc1)[S+]([O-])c2ccc(cc2)C(C#N)N3CCN(CC3)C4CCCCC4 
 [282, 564, 567] 

COc1ccc(cc1)[S+]([O-])c2ccc(cc2)C(=C)C3CCN(CC3)C4CCCCC4 
 [570, 588] 



CHRM2.ism 

COc1ccc(cc1)[S+]([O-])c2ccc(cc2)C(C#N)C3CCN(CC3)C4CCCCC4 
 [1561, 1571] 

COc1ccc(cc1)[S+]([O-])c2ccc(cc2)C(=O)C3CCN(CC3)C4CCCCC4 
 [1585, 1598] 

COc1ccc(cc1)[S+]([O-])c2ccc(cc2)C(C#N)N3CCN(CC3)C4CCCCC4 
 [1065, 1556, 1559] 

COc1ccc(cc1)[S+]([O-])c2ccc(cc2)C(=C)C3CCN(CC3)C4CCCCC4 
 [1562, 1584] 

[I-].C[N+](C)(C)C[C@@H]1C[S+]([O-])C(O1)(C2CCCCC2)C3CCCCC3 
 [696, 70