In [1]:
import os
import sys
import click
import concurrent.futures

from tqdm import tqdm
from rdkit import Chem
from rdkit.Chem import MACCSkeys

In [2]:
def convert_to_fingerprint(s):
    # Convert SMILES to Molecule object
    molecule = Chem.MolFromSmiles(s)
    try:
        # Get MACCS Key from Molecule object
        maccs_key = MACCSkeys.GenMACCSKeys(molecule)
        return maccs_key.ToBitString()
    except:
        return None

def retrieve_smiles(l):
    """ Retrieves the smiles representation from a line in file """
    
    l = str(l)
    l = l.split("\\t")
    return [l[0].split("'")[1], l[1].split("\\n")[0]]

def get_smiles_from_file(f):
    """ Gets SMILES from a file 'f' """

    lines = f.readlines()

    # Create SMILES list
    smiles = []
    for i in range(len(lines)):
        smiles.append(retrieve_smiles(lines[i]))

    # Remove all empty strings
    smiles = list(filter(None, [smiles[i][1] for i in range(len(smiles))]))
    
    return smiles

In [3]:
direc = 'data/data0'

print ("Preprocessing data from " + direc + " =>")
    
smiles = []
files = os.listdir(direc)
print ("Reading data..." )

with click.progressbar(list(range(len(files)))) as bar:
    for i in tqdm(range(len(files))):
        f = open(direc + "/" + files[i], 'rb')
        smiles += get_smiles_from_file(f)
        f.close()
        
print ("Converting SMILES to MACCS Keys...")
with concurrent.futures.ProcessPoolExecutor() as executor:
    with click.progressbar(smiles) as bar:
        # Convert all SMILES to MACCS Keys
        maccs = []
        for s, fp in zip(bar, executor.map(convert_to_fingerprint, bar)):
            maccs.append(fp)
        print ("")

# print ("Converting SMILES to MACCS Keys...")
# with concurrent.futures.ProcessPoolExecutor() as executor:
#     # Convert all SMILES to MACCS Keys
#     maccs = []
#     for s, fp in zip(smiles, executor.map(convert_to_fingerprint, smiles)):
#         maccs.append(fp)
#     print ("")
    
maccs = list(filter(None, maccs))

for fp in maccs:
    if not (type(fp) == str):
            raise ValueError('Something except a string in maccs.')

Preprocessing data from data/data0 =>
Reading data...


UnsupportedOperation: not writable

In [4]:
# Check if size of all fingerprints is 167
count = 0
for fp in maccs:
    if len(fp) != 167:
        count += 1

if count == 0:
    print ("All instances have length 167.")
else:
    print ("Data not uniform. Check lengths for instances.")

# Save as a txt file
print ("Saving files =>")
print ("")
train_set = maccs[0: int(len(maccs)*0.8)]
test_set = maccs[int(len(maccs)*0.8): int(len(maccs))]

train = open('train_aae_final', 'w')
test = open('test_aae_final', 'w')

print ("Saving train set...")
with click.progressbar(list(range(len(train_set)))) as bar:
    for i in bar:
        train.write(str(i) + '\t' + train_set[i] + '\n')
    print ("")

print ("Saving test set...")
with click.progressbar(list(range(len(test_set)))) as bar:
    for i in bar:
        test.write(str(i) + '\t' + test_set[i] + '\n')
    print ("")    

train.close()
test.close()
print ("Done.")

train = open('train_aae_final')
test = open('test_aae_final')

print ("")
print ("Number of training samples =", len(train.readlines()))
print ("Number of testing samples =", len(test.readlines()))
print ("")

train.close()
test.close()

All instances have length 167.
Saving files =>

Saving train set...


UnsupportedOperation: not writable