In [None]:
import numpy as np
from boom.datasets.SMILESDataset import *
import pandas as pd
import os
##IMPORTANT: SELFIES must be version 1 to be compatible with the pretrained IBM Regression Transformer!
import selfies as sf
import matplotlib.pyplot as plt
import random

In [2]:
def dataframe_wrapper(dataset, prop_name):
    """
    Wraps the dataset into a pandas dataframe.

    Parameters
    ----------
    dataset : sized, indexable object
        Must support ``len(dataset)`` and ``dataset[i]`` for ``i`` in
        ``range(len(dataset))``; each item must unpack into a
        ``(smiles, target)`` pair.
    prop_name : str
        Name given to the second (target) column.

    Returns
    -------
    pandas.DataFrame
        One row per dataset item, in dataset order, with the columns
        ``"smiles"`` and ``prop_name``. An empty dataset yields an empty
        frame that still carries both columns.
    """
    # Gather every row first and build the frame in a single call. Enlarging
    # a DataFrame with ``df.loc[i] = ...`` copies the data on every iteration,
    # which is quadratic in the number of samples.
    rows = []
    for i in range(len(dataset)):
        smiles, target = dataset[i]
        rows.append((smiles, target))
    return pd.DataFrame(rows, columns=["smiles", prop_name])

def _rt_output_stem(property_token_name):
    """
    Returns the './<directory>/<file prefix>' stem for a property token,
    or None when the token is not one of the supported kinds.
    """
    if property_token_name == 'den':
        return './10k_dft_density_OOD/10k_dft_density_OOD'
    if property_token_name == 'hof':
        return './10k_dft_hof_OOD/10k_dft_hof_OOD'
    if 'qm9' in property_token_name:
        return './' + property_token_name + '_OOD/' + property_token_name + '_OOD'
    return None


def _write_rt_file(outfile, df, property_token_name, data_min, data_span):
    """
    Writes one '<token>value|SELFIES' line per row of df to outfile.
    """
    # The context manager closes the file even if sf.encoder raises midway.
    with open(outfile, 'w') as handle:
        # Walk the two columns positionally. Looking rows up with
        # ``df.iloc[:, 1][index]`` is label-based and breaks (or reads the
        # wrong row) whenever the frame's index is not exactly 0..n-1.
        for smiles, value in zip(df.iloc[:, 0], df.iloc[:, 1]):
            if data_span:
                normed_value = (value - data_min) / data_span
            else:
                # Every property value is identical, so min-max scaling is
                # undefined; write 0.0 instead of dividing by zero.
                normed_value = 0.0
            selfies_string = sf.encoder(smiles)
            line = '<' + property_token_name + '>' + '%0.3f' % normed_value + '|' + selfies_string + '\n'
            handle.write(line)


def make_RT_inputs(train_df,iid_test_df,ood_test_df,property_token_name):
    """
    Takes in pandas dataframes and outputs a Regression Transformer input file for property prediction.

    For each of the three splits a text file is written with one line per
    molecule, formatted as ``<token>0.123|SELFIES``. The number is the
    property value min-max scaled with the minimum and maximum taken over
    all three dataframes together.

    Parameters
    ----------
    train_df, iid_test_df, ood_test_df : pandas.DataFrame
        Column 0 holds the SMILES strings and column 1 the property values.
        Columns are read by position, so their names do not matter.
    property_token_name : str
        Selects both the token written on every line and the output location:
        ``'den'``  -> ``./10k_dft_density_OOD/10k_dft_density_OOD_<split>.txt``
        ``'hof'``  -> ``./10k_dft_hof_OOD/10k_dft_hof_OOD_<split>.txt``
        any name containing ``'qm9'`` -> ``./<name>_OOD/<name>_OOD_<split>.txt``
        where ``<split>`` is ``train``, ``iid_test`` or ``ood_test``.
        Any other value writes nothing.

    Returns
    -------
    None

    Notes
    -----
    The output directory is created if it does not exist. For the ``qm9``
    tokens the path of the train file is printed before it is written.
    """
    #sf.set_semantic_constraints("hypervalent")
    frames = (train_df, iid_test_df, ood_test_df)
    data_min = min(min(frame.iloc[:, 1]) for frame in frames)
    data_max = max(max(frame.iloc[:, 1]) for frame in frames)
    data_span = data_max - data_min

    stem = _rt_output_stem(property_token_name)
    if stem is None:
        # Unsupported token: keep the historical behaviour of doing nothing.
        return

    os.makedirs(os.path.dirname(stem), exist_ok=True)

    splits = (
        ('train', train_df),
        ('iid_test', iid_test_df),
        ('ood_test', ood_test_df),
    )
    for split_name, split_df in splits:
        outfile = stem + '_' + split_name + '.txt'
        if split_name == 'train' and 'qm9' in property_token_name and property_token_name not in ('den', 'hof'):
            print(outfile)
        _write_rt_file(outfile, split_df, property_token_name, data_min, data_span)
    return

In [None]:
# Build the Regression Transformer input files for every benchmark task.
#
# Each job is:
#   (train dataset class, IID-test dataset class, OOD-test dataset class,
#    dataframe column name, RT property token, output directory,
#    message printed on completion)
# Jobs run in the listed order. One table plus one loop replaces ten
# copy-pasted stanzas, so a new task is a single added row.
RT_EXPORT_JOBS = [
    #10k Density
    (TrainDensityDataset, IDDensityDataset, OODDensityDataset,
     'density', 'den', './10k_dft_density_OOD', 'Done 10k Density'),
    #10k HoF
    (TrainHoFDataset, IDHoFDataset, OODHoFDataset,
     'hof', 'hof', './10k_dft_hof_OOD', 'Done 10k HoF!'),
    #QM9_alpha
    (TrainQM9_alphaDataset, IDQM9_alphaDataset, OODQM9_alphaDataset,
     'qm9_alpha', 'qm9_alpha', './qm9_alpha_OOD', 'Done QM9 Alpha!'),
    #QM9_cv
    (TrainQM9_cvDataset, IDQM9_cvDataset, OODQM9_cvDataset,
     'qm9_cv', 'qm9_cv', './qm9_cv_OOD', 'Done QM9 Cv!'),
    #QM9_gap
    (TrainQM9_gapDataset, IDQM9_gapDataset, OODQM9_gapDataset,
     'qm9_gap', 'qm9_gap', './qm9_gap_OOD', 'Done QM9 Gap!'),
    #QM9_homo
    (TrainQM9_homoDataset, IDQM9_homoDataset, OODQM9_homoDataset,
     'qm9_homo', 'qm9_homo', './qm9_homo_OOD', 'Done QM9 Homo!'),
    #QM9_lumo
    (TrainQM9_lumoDataset, IDQM9_lumoDataset, OODQM9_lumoDataset,
     'qm9_lumo', 'qm9_lumo', './qm9_lumo_OOD', 'Done QM9 Lumo!'),
    #QM9_mu
    (TrainQM9_muDataset, IDQM9_muDataset, OODQM9_muDataset,
     'qm9_mu', 'qm9_mu', './qm9_mu_OOD', 'Done QM9 Mu!'),
    #QM9_r2
    (TrainQM9_r2Dataset, IDQM9_r2Dataset, OODQM9_r2Dataset,
     'qm9_r2', 'qm9_r2', './qm9_r2_OOD', 'Done QM9 R2!'),
    #QM9_zpve
    (TrainQM9_zpveDataset, IDQM9_zpveDataset, OODQM9_zpveDataset,
     'qm9_zpve', 'qm9_zpve', './qm9_zpve_OOD', 'Done QM9 ZPVE!'),
]

# The loop runs at cell level and rebinds the same names the original stanzas
# used (train_dataset, train_df, ...), so after the cell finishes they hold
# the values of the last job, exactly as before.
for (train_cls, iid_cls, ood_cls,
     column_name, token_name, output_dir, done_message) in RT_EXPORT_JOBS:
    train_dataset = train_cls()
    iid_test_dataset = iid_cls()
    ood_test_dataset = ood_cls()
    train_df = dataframe_wrapper(train_dataset, column_name)
    iid_test_df = dataframe_wrapper(iid_test_dataset, column_name)
    ood_test_df = dataframe_wrapper(ood_test_dataset, column_name)
    os.makedirs(output_dir, exist_ok=True)
    make_RT_inputs(train_df, iid_test_df, ood_test_df, property_token_name=token_name)
    print(done_message)

Done 10k Density
Done 10k HoF!
./qm9_alpha_OOD/qm9_alpha_OOD_train.txt
Done QM9 Alpha!
./qm9_cv_OOD/qm9_cv_OOD_train.txt
Done QM9 Cv!
./qm9_gap_OOD/qm9_gap_OOD_train.txt
Done QM9 Gap!
./qm9_homo_OOD/qm9_homo_OOD_train.txt
Done QM9 Homo!
./qm9_lumo_OOD/qm9_lumo_OOD_train.txt
Done QM9 Lumo!
./qm9_mu_OOD/qm9_mu_OOD_train.txt
Done QM9 Mu!
./qm9_r2_OOD/qm9_r2_OOD_train.txt
Done QM9 R2!
./qm9_zpve_OOD/qm9_zpve_OOD_train.txt
Done QM9 ZPVE!
