**Preprocessing of the PubChem Database of Substances**

The downloaded dataset consists of a sequence of SMILES strings, some of which are empty. The empty strings are discarded and the remaining SMILES strings are converted to MACCS fingerprints to build the dataset.

In [None]:
import os

from tqdm import tqdm
from rdkit import Chem
from rdkit.Chem import MACCSkeys

In [None]:
def retrieve_smiles(l):
    """Parse one tab-separated line into ``[identifier, smiles]``.

    Args:
        l: One line of an input file, as ``bytes`` (files are read in
            binary mode) or as ``str``. The first tab-separated field is
            the identifier and the second is the SMILES string.

    Returns:
        A two-element list ``[identifier, smiles]``. ``smiles`` is the
        empty string when the line has no SMILES field.
    """
    # Decode the bytes instead of calling str() on them: str(bytes) yields the
    # repr ("b'...'"), which doubles every backslash and would therefore
    # corrupt SMILES that use "\" for double-bond stereochemistry.
    if isinstance(l, (bytes, bytearray)):
        l = l.decode("utf-8", errors="replace")
    else:
        l = str(l)

    # Drop the line terminator (LF or CRLF), then split on real tab characters.
    fields = l.rstrip("\r\n").split("\t")
    identifier = fields[0]
    smiles = fields[1] if len(fields) > 1 else ""
    return [identifier, smiles]

In [None]:
def get_smiles_from_file(f):
    """Return the non-empty SMILES strings found in the open file 'f'.

    Every line of 'f' is parsed with ``retrieve_smiles``; the second element
    of each parsed record is the SMILES string. Records whose SMILES string
    is empty are left out of the result.
    """

    # Parse every line into its record first
    records = [retrieve_smiles(raw_line) for raw_line in f.readlines()]

    # Keep only the SMILES column, discarding empty entries
    return [record[1] for record in records if record[1]]

In [None]:
# Collect the SMILES strings from every file in the 'data' directory.
smiles = []
# os.listdir returns entries in arbitrary order; sort them so that the order
# of the collected SMILES does not change from run to run.
files = sorted(os.listdir('data'))
for name in tqdm(files):
    path = os.path.join('data', name)
    # Skip anything that is not a regular file (e.g. sub-directories),
    # since it cannot be opened for reading.
    if not os.path.isfile(path):
        continue
    # Binary mode: the line parser is fed the raw bytes of each line.
    # The context manager closes the file even if parsing raises.
    with open(path, 'rb') as f:
        smiles += get_smiles_from_file(f)

len(smiles)

In [None]:
# Convert all SMILES to MACCS Keys
maccs = []
skipped = 0  # number of SMILES that could not be converted
for smi in tqdm(smiles):
    # Convert SMILES to Molecule object; RDKit returns None when the
    # SMILES string cannot be parsed.
    molecule = Chem.MolFromSmiles(smi)
    if molecule is None:
        skipped += 1
        continue
    try:
        # Get MACCS Key from Molecule object
        maccs_key = MACCSkeys.GenMACCSKeys(molecule)
    except Exception:
        # Best effort: drop molecules RDKit cannot fingerprint, but let
        # KeyboardInterrupt / SystemExit propagate.
        skipped += 1
        continue
    maccs.append(maccs_key)

print("Skipped", skipped, "SMILES that could not be converted")

In [None]:
# Replace every fingerprint object in 'maccs' with its bit-string form, in place
for idx, fingerprint in enumerate(tqdm(maccs)):
    maccs[idx] = fingerprint.ToBitString()

In [None]:
# Show a sample fingerprint together with its length
print (maccs[0])
print ("Number of features =", len(maccs[0]))

# Every fingerprint must be exactly 167 characters long; tally those that are not
count = sum(1 for fingerprint in maccs if len(fingerprint) != 167)

assert count == 0

In [None]:
# Save as a txt file: the first 80% of the fingerprints form the training set,
# the remainder the test set.
split_at = int(len(maccs)*0.8)
train_set = maccs[:split_at]
test_set = maccs[split_at:]

# Each output line is "<index>\t<fingerprint>\n", with the index restarting
# at 0 in each file. The context manager closes (and flushes) both files even
# if a write fails part-way through.
with open('train_aae_10m', 'w') as train, open('test_aae_10m', 'w') as test:
    for i, fp in enumerate(tqdm(train_set)):
        train.write(str(i) + '\t' + fp + '\n')

    for i, fp in enumerate(tqdm(test_set)):
        test.write(str(i) + '\t' + fp + '\n')

In [None]:
# Report how many samples (lines) were written to each output file.
# Lines are counted by streaming over the files, so the full contents are
# never held in memory; the context manager closes both files on exit.
with open('train_aae_10m') as train, open('test_aae_10m') as test:
    print ("Number of training samples = ", sum(1 for _ in train))
    print ("Number of testing samples = ", sum(1 for _ in test))