**Preprocessing of the PubChem Database of Substances**

The downloaded dataset comprises of a sequence of SMILES strings and some empty strings which are to be converted to MACCS fingerprints for the dataset.

In [1]:
import os

from tqdm import tqdm
from rdkit import Chem
from rdkit.Chem import MACCSkeys

In [2]:
def retrieve_smiles(l):
    """ Retrieves the smiles representation from a line in file """
    
    l = str(l)
    l = l.split("\\t")
    return [l[0].split("'")[1], l[1].split("\\n")[0]]

In [3]:
def get_smiles_from_file(f):
    """ Gets SMILES from a file 'f' """

    lines = f.readlines()

    # Create SMILES list
    smiles = []
    for i in range(len(lines)):
        smiles.append(retrieve_smiles(lines[i]))

    # Remove all empty strings
    smiles = list(filter(None, [smiles[i][1] for i in range(len(smiles))]))
    
    return smiles

In [4]:
smiles = []
for fname in os.listdir('data'):
    print ("Reading " + fname + "...")
    f = open('data/' + fname, 'rb')
    smiles += get_smiles_from_file(f)
    f.close()

len(smiles)

Reading 1.5m-2.0m.txt...
Reading 3.5m-4.0m.txt...
Reading 3.0m-3.5m.txt...


696409

In [5]:
# Convert all SMILES to MACCS Keys
maccs = []
for i in tqdm(range(len(smiles))):
    # Convert SMILES to Molecule object
    molecule = Chem.MolFromSmiles(smiles[i])
    try:
        # Get MACCS Key from Molecule object
        maccs_key = MACCSkeys.GenMACCSKeys(molecule)
        maccs.append(maccs_key)
    except:
        continue

100%|██████████| 696409/696409 [21:34<00:00, 537.77it/s]


In [6]:
for i in tqdm(range(len(maccs))):
    maccs[i] = maccs[i].ToBitString()

100%|██████████| 696403/696403 [00:01<00:00, 434247.70it/s]


In [7]:
print (maccs[0])
print ("Number of features =", len(maccs[0]))

# Check if size of all fingerprints is 167
count = 0
for fp in maccs:
    if len(fp) != 167:
        count += 1
        
assert count == 0

00000000000000000000000000000000000010000100000100000000000000000000000000010001110101001101100110000100010000100001111011100101110001011100001111111111001010111111110
Number of features = 167


In [11]:
# Save as a txt file
train_set = maccs[0: int(len(maccs)*0.8)]
test_set = maccs[int(len(maccs)*0.8): int(len(maccs))]

train = open('train_aae_10m', 'w')
test = open('test_aae_10m', 'w')

for i in tqdm(range(len(train_set))):
    train.write(str(i) + '\t' + train_set[i] + '\n')
    
for i in tqdm(range(len(test_set))):
    test.write(str(i) + '\t' + test_set[i] + '\n')
    
train.close()
test.close()

100%|██████████| 557122/557122 [00:00<00:00, 662560.91it/s]
100%|██████████| 139281/139281 [00:00<00:00, 654416.92it/s]


In [13]:
train = open('train_aae_10m')
test = open('test_aae_10m')

print ("Number of training samples = ", len(train.readlines()))
print ("Number of testing samples = ", len(test.readlines()))

train.close()
test.close()

Number of training samples =  557122
Number of testing samples =  139281
