Runs the fg transform tool 

For Oxidation Reaction (with no optimization)

In [None]:
"""
Date modified: 2024-02-16

Runs the state reaction tool, which runs a variety of reactions on datasets.

Inputs
    dataset_filepath      - starting db file that holds the starting reactants
    labelH1_filepath      - label H1, needed to find out which H to remove
    labelH2_filepath      - label H2, needed to find out which second hydrogen
                            to remove (if applicable)
    reactantout_filepath  - reactant output (init.xyz); each molecule serves
                            as a reactant
    productout_filepath   - product output (trans.xyz); each molecule serves
                            as the corresponding product of the reaction

Code
    statereaction         - the main code being executed; handles all types
                            of reactions being studied

Result
    number_trans          - number of products formed, returned for record
                            keeping (the real outputs are the init and trans
                            .xyz/.db files)
"""
from fgtransform import statereaction

# --- Run settings ---------------------------------------------------------
# Number of molecules to scan the dataset for reactant labels
n_molecules = [0, 10000]

# Available property of the dataset
available_properties = ['energy']

# Whether to perform an optimization on the product with the MMFF94 force
# field from RDKit
optimize = False

# --- Input dataset --------------------------------------------------------
dataset_filepath = 'data/datasets/QM9/qm9.db'

# --- H-type transformation: label files for the first and second H --------
labelH1_filepath = 'data/autolabel/Hqm910000labelpert2/labelpert2ver2.csv'
labelH2_filepath = 'data/autolabel/Hqm910000labelpert2/labelpert2ver2.csv'

# --- Output files ---------------------------------------------------------
reactantout_filepath = 'data/fgtransform/model1/oxidation/optimize/1alcsaldspert2/init.xyz'
productout_filepath = 'data/fgtransform/model1/oxidation/optimize/1alcsaldspert2/trans.xyz'

# --- Target labels --------------------------------------------------------
# pert3 note: remove all H's of CH3-CH2-C and evaluate O embs
# ---> CH2-CH2-C...O (pert 3)
# labelH1id = 17

# OH -> O for primary alcohol O embedding.
# pert 1 - 3, pert 2 - 9, pert 3 - 21, pert 4 - 69, pert 5 - 324 ...
# for pert 6 use pentanol!!
targetlabelH1 = 9

# pert 2 - 14, pert 2, pert 3, pert 4 ... onwards; just make sure that for
# pert 2 oxidation you add the extra one that is also allowed in the OR
# condition.
targetlabelH2 = 14

# Need to allow for branching at the alpha for depth = 2;
# still use two labelH for depth = 3 and above.
targetlabelH3 = 16

# --- Run ------------------------------------------------------------------
# Initialize the state reaction.
# NOTE: this rebinds the imported name `statereaction` to the object returned
# by `transform`; the import above must run again before `transform` can be
# called a second time.
statereaction = statereaction.transform(
    dataset_filepath,
    labelH1_filepath,
    targetlabelH1,
    reactantout_filepath,
    productout_filepath,
    n_molecules,
    available_properties,
)

# Run an oxidation; the return value is the number of products formed.
number_trans = statereaction.oxidalcs(
    labelH2_filepath,
    targetlabelH2,
    targetlabelH3,
    optimize,
)

In [None]:
# Show the value returned by the reaction run (number of products formed)
print(number_trans)

FGTransform Analysis

#Run extract embeddings on init and trans

In [2]:
# Extract embeddings from the init (reactant) and trans (product) datasets
from extractembeddings import allelementextract

# --- Trained model and extraction settings ---
model_filepath = 'data/trainedmodels/model1/best_model'
qm9 = False
start = 0
end = 644
n_features = 128
n_layers = [5, 6]
all_elements = False
element = [8]
available_properties = ['energy']
#label_file =  '../../data/labeldataset/%s/label%s%s.csv' %(element,element,number_inputs)

# --- Labelling options (default for `label`: True) ---
label = False
restrict_label = False
allowed_labels = [8, 7]
scratch_file = 'temp2.xyz'

add_header = False

# --- Input databases and the files the embeddings are saved to ---
init_dataset_filepath = 'data/fgtransform/model1/oxidation/optimize/1alcsaldspert2/init.db'
init_save_filepath = 'data/fgtransform/model1/oxidation/optimize/1alcsaldspert2/initO.csv'
trans_dataset_filepath = 'data/fgtransform/model1/oxidation/optimize/1alcsaldspert2/trans.db'
trans_save_filepath = 'data/fgtransform/model1/oxidation/optimize/1alcsaldspert2/transO.csv'

# Previous call without the label/add_header options, kept for reference:
#allelementextract.extract(qm9,init_dataset_filepath,model_filepath,init_save_filepath,start,end,n_features,n_layers,all_elements,element,available_properties=available_properties)

# Run the same extraction on init first, then on trans.
for _db_filepath, _csv_filepath in (
    (init_dataset_filepath, init_save_filepath),
    (trans_dataset_filepath, trans_save_filepath),
):
    allelementextract.extract(
        qm9,
        _db_filepath,
        model_filepath,
        _csv_filepath,
        start,
        end,
        n_features,
        n_layers,
        all_elements,
        element,
        available_properties=available_properties,
        label=label,
        add_header=add_header,
    )


  from .autonotebook import tqdm as notebook_tqdm


0


  properties[pname] = torch.FloatTensor(prop)


0


Transform with the average difference vector and use PCA on both

In [None]:
from tools.utils import utils_fgtransform, utils_dimred, utils_numericalfiletools

# Working directory of this transformation and the size of the data
fg_trans_fildir = 'data/fgtransform/model1/oxidation/optimize/1alcsaldspert5/'
n_molecules = 17
n_features = 128

# Step 1: calculate the average diff vector from the init and trans files
data_filepath1 = fg_trans_fildir + 'initO.csv'
data_filepath2 = fg_trans_fildir + 'transO.csv'
diff_mean = utils_fgtransform.vecdiffmean(
    data_filepath1,
    data_filepath2,
    n_features,
    n_molecules,
    fg_trans_fildir,
)

# Step 2: perform the transformation using the vector file and the init file.
# NOTE(review): 'Odiff.csv' is presumably written into fg_trans_fildir by
# vecdiffmean above - confirm.
vec_filepath = fg_trans_fildir + 'Odiff.csv'
save_filepath = fg_trans_fildir + 'artO.csv'
utils_fgtransform.add_vectomat(vec_filepath, data_filepath1, n_features, save_filepath)

# Step 3: compare the true trans file against the artificial one from step 2
true_trans_filepath = fg_trans_fildir + 'transO.csv'
art_trans_filepath = fg_trans_fildir + 'artO.csv'
start_trans_idx = 0
stack_qm9 = True
qm9embs_filepath = 'data/embs/model1-10000/layer5/Oembs/embs.csv'
# DO NOT SORT!
utils_fgtransform.nearest_trans(
    true_trans_filepath,
    art_trans_filepath,
    n_molecules,
    n_features,
    start_trans_idx,
    stack_qm9,
    qm9embs_filepath,
)




In [3]:
from tools.utils import utils_numericalfiletools

# Stack the init and trans files of this run into a single file, bothO.csv
fg_trans_fildir = 'data/fgtransform/model1/oxidation/optimize/1alcsaldspert2/'
data_filepath1 = fg_trans_fildir + 'initO.csv'
data_filepath2 = fg_trans_fildir + 'transO.csv'
save_filepath = fg_trans_fildir + 'bothO.csv'

# Row counts passed for each input file, and header handling
n_data1 = 644
n_data2 = 644
single_dim = False
skip_header1 = False
skip_header2 = False

utils_numericalfiletools.vstacktwofiles(
    data_filepath1,
    data_filepath2,
    save_filepath,
    n_data1,
    n_data2,
    single_dim,
    skip_header1,
    skip_header2,
)

In [4]:
# Stack embs and bothO (for PCA including the reaction embeddings, for an
# accurate representation, especially in the neighbor test).
# Relies on `fg_trans_fildir` and `utils_numericalfiletools` from an earlier cell.

data_filepath1 = fg_trans_fildir + 'bothO.csv'
data_filepath2 = 'data/embs/model1-10000/layer5/Oembs/embsnoheader.csv'
save_filepath = fg_trans_fildir + 'bothOembs.csv'

# Row counts passed for each input file, and header handling
n_data1 = 2 * 644
n_data2 = 13076
single_dim = False
skip_header1 = False
skip_header2 = False

utils_numericalfiletools.vstacktwofiles(
    data_filepath1,
    data_filepath2,
    save_filepath,
    n_data1,
    n_data2,
    single_dim,
    skip_header1,
    skip_header2,
)


Linear regression on transformation vector while setting the intercept of the solution to diff zeroth order


In [9]:
import numpy as np
from numpy import genfromtxt
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import math 
from sklearn.preprocessing import PolynomialFeatures

# Load the data into a NumPy array (genfromtxt returns an ndarray, not a
# pandas DataFrame).
xy_data = genfromtxt('data/fgtransform/model1/oxidation/optimize/1alcsaldspert3/lrfull/bothO.csv',delimiter=',',encoding='utf-8-sig')
#y_diff = genfromtxt('data/fgtransform/model1/oxidation/optimize/1alcsaldspert3/initOpcadiff.csv',delimiter=',',encoding='utf-8-sig')

#y_diff = y_diff[0:5]

# Split data into X and y: the first 291 rows are the regression inputs and
# all remaining rows are the targets; both use the first 128 columns.
X_train = xy_data[:291,0:128]
#X_test = xy_data[250:291,0:10]
#128 --> pka
#138 --> nmr
#138 onwards --> electron density
y_train = xy_data[291:,0:128]
#y_test = xy_data[541:582,0:10]

print(X_train.shape[0])

# Add a column of 1s to X for the intercept term. Because the column is
# prepended, row 0 of `coefficients` holds the intercept.
X_train = np.concatenate((np.ones((X_train.shape[0], 1)), X_train), axis=1)

# Calculate the coefficients by solving the least-squares problem directly.
# This yields the same solution as the normal equations
# inv(X^T X) X^T y when X has full column rank, but avoids forming and
# inverting X^T X explicitly, which is numerically fragile and fails outright
# when the design matrix is rank deficient.
coefficients = np.linalg.lstsq(X_train, y_train, rcond=None)[0]

use_train = True

if use_train:
    predictions = np.dot(X_train, coefficients)
    # Evaluate the model
    mse = mean_squared_error(y_train, predictions)
    r2 = r2_score(y_train, predictions)
else:
    # NOTE(review): X_test / y_test are not defined in this cell (their
    # assignments above are commented out), so this branch raises NameError
    # unless they are defined elsewhere. X_test would also need the leading
    # column of 1s to match `coefficients`.
    predictions = np.dot(X_test, coefficients)
    # Evaluate the model
    mse = mean_squared_error(y_test, predictions)
    r2 = r2_score(y_test, predictions)

# NOTE: the value printed under the 'mse' label is sqrt(mse), i.e. the RMSE.
print('mse',math.sqrt(mse))
print('r2',r2)

291
mse 0.1205900581696623
r2 0.8367096081716874


In [10]:
fg_trans_fildir = 'data/fgtransform/model1/oxidation/optimize/1alcsaldspert3/lrfull/'

# Save the regression predictions
np.savetxt(
    fname=fg_trans_fildir + 'artOlr.csv',
    X=predictions,
    delimiter=',',
)

# Save the y-intercept (first row of the coefficient matrix)
np.savetxt(
    fname=fg_trans_fildir + 'artOlrint.csv',
    X=coefficients[0],
    delimiter=',',
)

In [8]:
from tools.utils import utils_fgtransform

# Directory holding the linear-regression run and the size of the data
fg_trans_fildir = 'data/fgtransform/model1/oxidation/optimize/1alcsaldspert3/lrfull/'
n_molecules = 291
n_features = 128

# QM9 embeddings file passed along with the stacking flag
stack_qm9 = True
qm9embs_filepath = 'data/embs/model1-10000/layer5/Oembs/embsnoheader.csv'

# The true file is the stacked bothO.csv, read from index 291 onwards;
# the artificial file holds the linear-regression predictions.
true_trans_filepath = fg_trans_fildir + 'bothO.csv'
start_trans_idx = 291
art_trans_filepath = fg_trans_fildir + 'artOlr.csv'

# DO NOT SORT!
utils_fgtransform.nearest_trans(
    true_trans_filepath,
    art_trans_filepath,
    n_molecules,
    n_features,
    start_trans_idx,
    stack_qm9,
    qm9embs_filepath,
)



0
16
24
26
29
44
52
53
54
63
66
72
89
91
126
159
161
186
190
191
198
200
201
203
223
224
225
232
234
238
241
245
260
272
277
286
290
matched 87.62886597938144 %


Line Plot of the Transformation


In [None]:
from tools.utils import utils_fgtransform, utils_dimred, utils_numericalfiletools

fg_trans_fildir = 'data/fgtransform/model1/oxidation/1alcsaldspert3/'
n_features = 128
n_molecules = 68

# Step 1: stack initO with initOtransform; the result is saved as
# bothOtransform.csv. Only the first file has a header row skipped.
init_filepath = fg_trans_fildir + 'initO.csv'
inittrans_filepath = fg_trans_fildir + 'initOtransform.csv'
save_filepath = fg_trans_fildir + 'bothOtransform.csv'
n_data1 = n_data2 = n_molecules
utils_numericalfiletools.vstacktwofiles(
    init_filepath,
    inittrans_filepath,
    save_filepath,
    n_data1,
    n_data2,
    single_dim=False,
    skip_header1=1,
    skip_header2=0,
)

# Step 2: dimensionality reduction on the new file with init and
# inittransformed; the stacked file has twice as many rows.
n_molecules *= 2
data_filepath = fg_trans_fildir + 'bothOtransform.csv'
save_filepath = fg_trans_fildir + 'initOtransform.csv'
apply_filepath = fg_trans_fildir + 'bothOtransform.csv'
n_components = 128
skip_header = 0
scale_data = False
utils_dimred.pca(
    data_filepath,
    n_components,
    n_features,
    scale_data,
    save_filepath,
    n_molecules,
    skip_header,
    apply_filepath=apply_filepath,
)

# Step 3: make the vector plotting file from the PCA output (x1,y1,x2,y2).
# Half the data is init, half is trans after compilation.
# NOTE(review): 'initOtransformpca.csv' is presumably produced by the pca call
# above from its save_filepath - confirm.
data_filepath = fg_trans_fildir + 'initOtransformpca.csv'
save_filepath = fg_trans_fildir + 'initOtransformpcavecs.csv'
utils_fgtransform.scatters_to_vectors(data_filepath, save_filepath)