### In this file, we will generate our E0 feature data

Instructions: 

You should start by writing a notebook to create the dataset. Note that if you use many molecules, using MOPAC could take a while. If you decide to optimize the geometry of all the molecules in the Solubility datasets, it will take at least 12 hours. Make sure to save the result so that you don't have to compute it multiple times!

In [1]:
# Pandas dataframe, apply function to whole dataframe
# Make sure it works on a small dataset. Save the results
# Create dataset associated smiles to energy. Then, run overnight on all data
# Once dataset ready, can move on to next step

import pandas as pd
import os

In [2]:
# Combine Datasets
def generate_combined_dataset(dir_path, fnames, out_name="", debug=False):
    """Stack several CSV files into one DataFrame and optionally save it.

    Parameters
    ----------
    dir_path : str
        Directory that contains the CSV files (a trailing separator is optional).
    fnames : iterable of str
        File names inside ``dir_path``, stacked in the given order.
    out_name : str, optional
        Path of the combined CSV to write. The default ``""`` skips saving.
        The name must end in ``.csv`` and an existing file is never
        overwritten; in both cases a message is printed instead.
    debug : bool, optional
        When True, print the shape of every file read and of the result.

    Returns
    -------
    pandas.DataFrame or None
        The stacked rows, each keeping the row index it had in its own file.
        ``None`` when ``fnames`` is empty (nothing is written in that case).
    """
    print("Outname is", out_name)

    # Read everything first and concatenate once, instead of re-copying the
    # growing frame on every iteration.
    frames = []
    for f in fnames:
        df = pd.read_csv(os.path.join(dir_path, f))
        frames.append(df)
        if debug:
            print("Added " + str(f) + " with dim", df.shape)

    if not frames:
        # Without this guard the code below would fail on a None object.
        print("ERROR: No input files were given, nothing to combine.")
        return None

    ## Here we do not check for duplicates as Professor Tristan in class informed me that there should not be any duplicate data between the sets
    combined = pd.concat(frames)

    if debug:
        print("Our combined dataset has", combined.shape[0], "rows with", combined.shape[1], "features.")

    if out_name != "":
        if not out_name.endswith(".csv"):
            print("ERROR: Output filename must end in .csv")
        elif os.path.exists(out_name):
            print("Combined file already exists at specified output path... rename or replace it first.")
        else:
            combined.to_csv(out_name, index=False)
    return combined
    
# Restriction: "It is forbidden to use any dataset other than A, B, C, D, F, G, H, I"
# So, we make sure we only use these datasets in our data preparation

solubility_dir = "../MLQC_HW/Data/Solubility/"

# Small two-file combination, handy for checking the pipeline quickly
preview_files = ["dataset-H.csv", "dataset-I.csv"]
generate_combined_dataset(solubility_dir, preview_files, "train_data_preview.csv")

# Every permitted dataset; this call is last so the cell displays its result
all_files = [
    "dataset-A.csv", "dataset-B.csv", "dataset-C.csv",
    "dataset-D.csv", "dataset-F.csv", "dataset-G.csv",
    "dataset-H.csv", "dataset-I.csv",
]
generate_combined_dataset(solubility_dir, all_files, "train_data.csv")

Outname is train_data_preview.csv
Combined file already exists at specified output path... rename or replace it first.
Outname is train_data.csv
Combined file already exists at specified output path... rename or replace it first.


Unnamed: 0,ID,Name,InChI,InChIKey,SMILES,Solubility
0,A-1,"1,2-dichlorobenzene",InChI=1S/C6H4Cl2/c7-5-3-1-2-4-6(5)8/h1-4H,RFFLAFLAYFXFSW-UHFFFAOYSA-N,Clc1ccccc1Cl,-3.177212
1,A-2,"1,2-dichlorobenzene",InChI=1S/C6H4Cl2/c7-5-3-1-2-4-6(5)8/h1-4H,RFFLAFLAYFXFSW-UHFFFAOYSA-N,Clc1ccccc1Cl,-3.180557
2,A-3,"N,N,N-trimethyloctadecan-1-aminium bromide",InChI=1S/C21H46N.BrH/c1-5-6-7-8-9-10-11-12-13-...,SZEMGTQCPRNXEG-UHFFFAOYSA-M,[Br-].CCCCCCCCCCCCCCCCCC[N+](C)(C)C,-3.616127
3,A-4,Benzo[cd]indol-2(1H)-one,InChI=1S/C11H7NO/c13-11-8-5-1-3-7-4-2-6-9(12-1...,GPYLCFQEKPUWLD-UHFFFAOYSA-N,O=C1Nc2cccc3cccc1c23,-3.254767
4,A-5,4-chlorobenzaldehyde,InChI=1S/C7H5ClO/c8-7-3-1-6(5-9)2-4-7/h1-5H,AVPYQKSLYISFPO-UHFFFAOYSA-N,Clc1ccc(C=O)cc1,-2.177078
...,...,...,...,...,...,...
89,I-90,trimethoprim,InChI=1S/C14H18N4O3/c1-19-10-5-8(6-11(20-2)12(...,IEDVJHCEMCRBQM-UHFFFAOYSA-N,c1(nc(c(cn1)Cc1cc(c(c(OC)c1)OC)OC)N)N,-2.950000
90,I-91,trimipramine,InChI=1S/C20H26N2/c1-16(14-21(2)3)15-22-19-10-...,ZSCDBOWYZJWBIY-UHFFFAOYSA-N,CC(CN(C)C)CN1c2ccccc2CCc2ccccc12,-4.790000
91,I-92,tryptamine,InChI=1S/C10H12N2/c11-6-5-8-7-12-10-4-2-1-3-9(...,APJYDQYYACXCRM-UHFFFAOYSA-N,NCCc1c[nH]c2ccccc12,-3.300000
92,I-93,verapamil,"InChI=1S/C27H38N2O4/c1-20(2)27(19-28,22-10-12-...",SGTNSNPWRIOYBX-UHFFFAOYSA-N,COc1ccc(CCN(C)CCCC(C#N)(C(C)C)c2ccc(OC)c(OC)c2...,-3.980000


In [3]:
from rdkit import Chem
from rdkit.Chem import AllChem
from ase import Atoms
from ase.io import read
from ase.calculators.mopac import MOPAC

from tqdm.notebook import tqdm_notebook

# If necessary, can add a batch parameter to start at a given index and to save batches of data at intervals
def calc_potential(from_df):
    """Add a ``Min_PE`` column holding the MOPAC potential energy of each molecule.

    For every entry of ``from_df['SMILES']`` the molecule is built with RDKit,
    given explicit hydrogens and 3D coordinates, written to ``init.xyz``,
    reloaded with ASE and evaluated with the MOPAC calculator.

    Parameters
    ----------
    from_df : pandas.DataFrame
        Must contain a ``SMILES`` column. The frame is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``from_df`` object with the new ``Min_PE`` column. Molecules
        whose calculation failed get a missing value; a summary of the
        failures is printed once all molecules have been processed.

    Notes
    -----
    The scratch files ``init.xyz`` and the ``TMP`` label are fixed names in the
    working directory, so two runs must not share a working directory.
    """
    # (SMILES, reason) for every molecule that could not be computed
    failed = []

    def get_mopac(smile):
        """Return the potential energy for one SMILES string, or None on failure."""
        try:
            # Create the rdkit molecule object in Cartesian space
            parsed = Chem.MolFromSmiles(smile)
            if parsed is None:
                raise ValueError("RDKit could not parse the SMILES string")
            mol = Chem.AddHs(parsed)
            # EmbedMolecule reports failure through a negative conformer id
            if AllChem.EmbedMolecule(mol) < 0:
                raise ValueError("RDKit could not generate 3D coordinates")

            # Save and reload the information into a mol object from ase (instead of rdkit)
            Chem.rdmolfiles.MolToXYZFile(mol, 'init.xyz')
            mol = read('init.xyz')

            # With this ase object, add quantum mechanic calculations
            # NOTE(review): no 1SCF keyword is passed, so MOPAC presumably
            # optimizes the geometry before reporting the energy - confirm.
            mol.calc = MOPAC(label='TMP', task='UHF BONDS GRADS')
            return mol.get_potential_energy()
        except Exception as err:
            # Best effort: one bad molecule must not abort the whole run, but
            # KeyboardInterrupt/SystemExit are no longer swallowed, so the job
            # can be stopped by the user.
            failed.append((smile, repr(err)))
            return None

    tqdm_notebook.pandas(desc="Quantum Calculations for Minimum Potential Energy (Molecule Number)")

    # Would in theory progress_apply each batch if we needed

    from_df['Min_PE'] = from_df['SMILES'].progress_apply(get_mopac)

    if failed:
        print("Could not compute Min_PE for", len(failed), "of", len(from_df), "molecules:")
        for smile, reason in failed:
            print("  ", smile, "->", reason)
    return from_df
    


# CAUTION TEAM ELECTRON
You only need to run your own code block here. I chunked it out so that each of us has a roughly even amount of data to churn through, letting us finish at 3x speed. Be advised that it may take ~30 hours to go through all of this data; for this reason, the results are saved to a CSV file every time one of the datasets is completed. That's why there are multiple lines of calculating and saving.

In [4]:
# Directory prefix of the per-dataset solubility CSVs; the cells below append
# the file name directly, so the trailing slash is required.
data_path = "../MLQC_HW/Data/Solubility/"

In [None]:
### RJ run this ###

# Will do C, D, F (I already did H and I while testing)

# C
# Create the output folder up front so the save below cannot fail on a
# missing directory after the long MOPAC run has finished.
os.makedirs('Data', exist_ok=True)
new_df = calc_potential(pd.read_csv(data_path + "dataset-C.csv"))

# Only keep our SMILES and Min_PE
new_df[["SMILES", "Min_PE"]].to_csv('Data/pe_data_C.csv', index=False)


In [None]:
### RJ run this ###
# D
# Create the output folder up front so the save below cannot fail on a
# missing directory after the long MOPAC run has finished.
os.makedirs('Data', exist_ok=True)
new_df = calc_potential(pd.read_csv(data_path + "dataset-D.csv"))
# Only the SMILES string and its computed energy are stored
new_df[["SMILES", "Min_PE"]].to_csv('Data/pe_data_D.csv', index=False)

In [None]:
# F
# Create the output folder up front so the save below cannot fail on a
# missing directory after the long MOPAC run has finished.
os.makedirs('Data', exist_ok=True)
new_df = calc_potential(pd.read_csv(data_path + "dataset-F.csv"))
# Only the SMILES string and its computed energy are stored
new_df[["SMILES", "Min_PE"]].to_csv('Data/pe_data_F.csv', index=False)

In [None]:
### RONAN RUN THIS ###

# This will do the computations on dataset A

# A
# Create the output folder up front so the save below cannot fail on a
# missing directory after the long MOPAC run has finished.
os.makedirs('Data', exist_ok=True)
new_df = calc_potential(pd.read_csv(data_path + "dataset-A.csv"))
# Only the SMILES string and its computed energy are stored
new_df[["SMILES", "Min_PE"]].to_csv('Data/pe_data_A.csv', index=False)

In [None]:
### JOSHUA RUN THIS ###

# This will do the computations on dataset B, G

# B
# Create the output folder up front so the save below cannot fail on a
# missing directory after the long MOPAC run has finished.
os.makedirs('Data', exist_ok=True)
new_df = calc_potential(pd.read_csv(data_path + "dataset-B.csv"))
# Only the SMILES string and its computed energy are stored
new_df[["SMILES", "Min_PE"]].to_csv('Data/pe_data_B.csv', index=False)

In [5]:
### JOSHUA RUN THIS ###
# G
# Create the output folder up front so the save below cannot fail on a
# missing directory after the long MOPAC run has finished.
os.makedirs('Data', exist_ok=True)
new_df = calc_potential(pd.read_csv(data_path + "dataset-G.csv"))
# Only the SMILES string and its computed energy are stored
new_df[["SMILES", "Min_PE"]].to_csv('Data/pe_data_G.csv', index=False)

  from pandas import Panel


Quantum Calculations for Minimum Potential Energy (Molecule Number):   0%|          | 0/1144 [00:00<?, ?it/s]