In [17]:
import pandas as pd
import pyrfume
import os
from PIL import Image, ImageOps
import base64
from io import BytesIO

In [8]:
# Get SMILES from master molecule list
molecule_url = 'https://raw.githubusercontent.com/TravisGould/pyrfume-data/molecules_update/molecules/molecules.csv'
molecules = pd.read_csv(molecule_url, index_col=0)

print(molecules.shape)
molecules.head()

(9982, 4)


Unnamed: 0_level_0,MolecularWeight,IsomericSMILES,IUPACName,name
CID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
-35_CC(C)CC(C)(O)C1CCCS1,188.336,CC(C)CC(C)(O)C1CCCS1,,Isopropylmethyltetrahydrothiophenyl-ethanol
-34_CCCCCCCCC(=O)C(=O)OC(=C(C)C)C(O)CO,300.395,CCCCCCCCC(=O)C(=O)OC(=C(C)C)C(O)CO,,Isopropylideneglyceryl oxodecanoate
-33_CCCCC=COC(=O)C=CCCCCC,224.344,CCCCC=COC(=O)C=CCCCCC,,Hexenyl octenoate
-32_CCCC(CCOC)[SH]=C(O)OCC,222.35,CCCC(CCOC)[SH]=C(O)OCC,,Ethyl S-(1-methoxyhexan-3-yl)carbonothioate
-31_CCCCCCCCC(S)C=O,188.336,CCCCCCCCC(S)C=O,,Mercaptodecanal


In [18]:
# Calculate stucture for each unique SMILES
smiles = molecules[~molecules.index.duplicated()]['IsomericSMILES'].to_list()

im_dict = {}
for smi in set(smiles):
    try:
        image = pyrfume.odorants.smiles_to_image(smi, png=False, crop=True, padding=0, size=150)

        # Crop
        bbox = ImageOps.invert(image).getbbox()
        image = image.crop(bbox)

        # Convert to base64
        buffer = BytesIO()
        image.save(buffer, format='PNG')
        image = buffer.getvalue()
        image = base64.b64encode(image).decode("utf8")

        im_dict[smi] = image
    except:
        print(f'Could not create structure for {smi}')

Could not create structure for [NH4+].[NH4+].F[Si-2](F)(F)(F)(F)F
Could not create structure for F[Si-2](F)(F)(F)(F)F.[Na+].[Na+]


In [21]:
# Convert to dataframe
df = pd.DataFrame.from_dict(im_dict, orient='index', columns=['Image_base64'])
df.index.name = 'SMILES'

print(df.shape)
df.head()

(9967, 1)


Unnamed: 0_level_0,Image_base64
SMILES,Unnamed: 1_level_1
CC(C)C1=CC=CC=C1[N+](=O)[O-],iVBORw0KGgoAAAANSUhEUgAAAHkAAACKCAIAAAABnnKpAA...
CCCCOC(=O)C=CC1=CC=CC=C1,iVBORw0KGgoAAAANSUhEUgAAAIsAAAAtCAIAAAARPg9bAA...
CCCCC1=CC=C(S1)CC,iVBORw0KGgoAAAANSUhEUgAAAIoAAAAwCAIAAABBtHUgAA...
CCOCC1=CC=CC=C1O,iVBORw0KGgoAAAANSUhEUgAAAIsAAABTCAIAAAAgFmPqAA...
CC(=C)C(=O)O,iVBORw0KGgoAAAANSUhEUgAAAIsAAAByCAIAAADs5rV5AA...


In [22]:
# Write to disk
df.to_csv(os.path.join('../', 'static', 'structures.csv'))