**Preprocessing of the PubChem Database of Substances**

The downloaded dataset comprises of a sequence of SMILES strings and some empty strings which are to be converted to MACCS fingerprints for the dataset.

In [1]:
from tqdm import tqdm
from rdkit import Chem
from rdkit.Chem import MACCSkeys

In [2]:
def retrieve_smiles(l):
    """ Retrieves the smiles representation from a line in file """
    
    l = str(l)
    l = l.split("\\t")
    return [l[0].split("'")[1], l[1].split("\\n")[0]]

In [3]:
f0 = open('1-100000.txt', 'rb')
f1 = open('100000-600000.txt', 'rb')
f2 = open('600000-1000000.txt', 'rb')

def get_smiles_from_file(f):
    """ Gets SMILES from a file 'f' """

    lines = f.readlines()

    # Create SMILES list
    smiles = []
    for i in range(len(lines)):
        smiles.append(retrieve_smiles(lines[i]))

    # Remove all empty strings
    smiles = list(filter(None, [smiles[i][1] for i in range(len(smiles))]))
    
    return smiles

# Concatenate all read SMILES into one list
smiles = get_smiles_from_file(f0) + get_smiles_from_file(f1) + get_smiles_from_file(f2)

# Close files
f0.close()
f1.close()
f2.close()

len(smiles)

427034

In [4]:
# Convert all SMILES to MACCS Keys
maccs = []
for i in tqdm(range(len(smiles))):
    # Convert SMILES to Molecule object
    molecule = Chem.MolFromSmiles(smiles[i])
    try:
        # Get MACCS Key from Molecule object
        maccs_key = MACCSkeys.GenMACCSKeys(molecule)
        maccs.append(maccs_key)
    except:
        continue

100%|██████████| 427034/427034 [16:30<00:00, 431.08it/s]


In [5]:
len(maccs)

426992

In [6]:
maccs[0]

<rdkit.DataStructs.cDataStructs.ExplicitBitVect at 0x7fc8c412a030>

In [7]:
for i in tqdm(range(len(maccs))):
    maccs[i] = maccs[i].ToBitString()

100%|██████████| 426992/426992 [00:00<00:00, 436004.07it/s]


In [8]:
maccs[0]

'00000000000000000000000001001000000000100011000000000100000000000100000010010101101111010111100111101100011101100100010011100111010111110111101110101011111111110111110'

In [9]:
# Save to disk
import pickle

dataset = open('maccs.pkl', 'wb')
pickle.dump(maccs, dataset)
dataset.close()

Some other calculations

In [11]:
len(maccs[0])

167

In [15]:
# Check if size of all fingerprints is 167
count = 0
for fp in maccs:
    if len(fp) != 167:
        count += 1
        
assert count == 0