Chemical environment transformation tool

In [6]:
import pandas as pd
import numpy as np

'''
Last Updated: 2024-03-09

This code performs statereactions on labelled db files. It mimics a leaving group and an attaching group by
removing labelled atoms (the leaving group) and building a functional group in its place or somewhere else.

Dictionaries below have predefined settings to get algorithm going
    1) LEAVING_GROUP_LABELS - defines leaving groups to be removed, 
    2) BUILD_LABELS - defines labels of target atoms to build FG (and extract embedding of),
    3) FG_TO_BUILD - defines FG mol lines to attach to mol
    4) FG_BOND_TO_BUILD - fg mol bond lines to attach to mol 
    5) FG_ADD_ATOMS_BONDS - FG's number atoms and bonds to adjust mol file


    LEAVING_GROUP_LABELS    - dictionary of predefined labels of FG's to remove, 
                              these are found through the autolabel files and should be formatted as follows

                                [['labeldepth#_high',atom_label], ['labeldepth#_low',atom_label]]   
                                NOTE each item in this dictionary has to be organized from deepest to shallowest depthlabel

    BUILD_LABELS            - dictionary of predefined labels of targets to build FG off of,
                              these depend slightly on LEAVING_GROUP_LABELS, so be careful how they are defined
                              THEY ARE ALSO AN IMPORTANT CHECK IN THE ALGORITHM
                              THE ALGORITHM LOOKS FOR EXACTLY ONLY ONE BUILD SITE IN A MOLECULE 
                              TO AVOID SYMMETRY PROBLEMS (for now)
    FG_TO_BUILD             - dictionary of functional groups to attach after leaving group is removed
                              in mol lines format. This can be 'none', in case of oxidation leaving group
    FG_BOND_TO_BUILD        - dictionary of the bonding template for the functional group that is being attached
                              the bonding template to specify how the functional group attaches to the molecule (and within itself)
                              Since the functional group will always be attached at the END of mol file
                              the bonding template must follow the following key
                                'a' - number_atoms + 1
                                'b' - number_atoms + 2
                                'c' - number_atoms + 3
                                'd' - number_atoms + 4
                                'z' - bond_index (found through the algorithm looking for BUILD_LABELS)
                              thus having 'a' means that the attaching atom is the first one in the functional,
                              'b' is the second one, and so on... 'z' is reserved for the bond_index which will be found 
                              through the algorithm as it looks for index of the bond_labels to attach.                
    FG_ADD_ATOMS_BONDS      - a list that contains the number of atoms and bonds to add due to the 
                              addition of the functional group 
                              NOTE: DO NOT ACCOUNT FOR THE ATOMS REMOVED WITH THE LEAVING GROUP (NO SUBTRACTION); THAT HAS ALREADY BEEN ACCOUNTED FOR
                              JUST COUNT THE NUMBER OF ATOMS/BONDS IN FG

                              [#atoms_in_fg,#bonds_in_fg]
    COLORS_TO_KEEP          - this is important for the visualization part,
                              all colors need to be greyed out except for those associated with the 
                              functional group of the target build atom (also the atom that will be used for analysis)
    ATOMLABELS_REMOVE       - choose a leaving group to remove from the dictionary of LGs
                              the chosen LG list informs on which atoms you want to remove, those that correspond to environments
                              at specified depths (for the removal of LG)
                            
                              [['labeldepth1', label_atom1],['labeldepth2',label_atom2]]

                              ORGANIZED FROM DEEPEST DEPTH TO SHALLOWEST DEPTH (that is how an environment is narrowed in on)
                              see dictionary above
    ATOMLABELS_BUILD        - the target is usually predefined and depends on the LG chosen; choose the same
                              item in dictionary as the LG dictionary for building on the target index where the LG
                              left (i.e the site center of reaction)
                              this follows the format:

                                  ['labeldepth3',label#]
                                
    FG_TO_BUILD             - choose a functional group from the FG_TO_BUILD dictionary to build
                              (gives the mol lines of the FG which will be attached to END of xyz/mol)
    FG_BOND_TO_BUILD        - choose the corresponding bonding template from the FG_BOND_TO_BUILD
                              (gives a template for replacing alphabetical key with integers for bonding indices of mol file)
    ADD_ATOMS_BONDS         - list that contains number of atoms and bonds in the FG 
                               [#atoms_in_fg,#bonds_in_fg] 
    ONLY_ONE                - boolean to decide to remove ONLY one of each atomic environment found
                              this is necessary in certain cases for mimicking real reactions
                              (e.g. oxidation does not require removal of both H's on CH2)
    FG_ADD_DOUBLE_BOND      - boolean that decides whether to add a double bond, as a way to ensure a double bond is created
    ADD_DOUBLE_BOND_BW      - a list of the labels of the atomic environments to create the double bond between
'''



#MAIN_BUILD_STRUCTURE = {'ox_1alc2aldC_depth3'
    
#                        }


main_build_structure = 'C-[O-C-H-H-]-|-O-[C-H-]-'

# Example of labels.
# NOTE: each entry MUST be ordered from the highest depth to the lowest.
# NOTE: alkanes/alkenes and highly symmetric molecules are really problematic
# for the code. Choose FG's with little symmetry across the molecule; the code
# will only look for one build site on the molecule to work on.
LEAVING_GROUP_LABELS = {
    'nas_1amide2acid_depth3': [['labeldepth3', 67], ['labeldepth2', 53]],
    'ox_1alc2ald_depth3': [['labeldepth3', 49], ['labeldepth2', 42]],
    'eli_1alcalk_depth3': [['labeldepth3', 192], ['labeldepth2', 20], ['labeldepth1', 14]],
    'ox_alka2alke_depth2': [['labeldepth2', 60], ['labeldepth2', 38]],
}

# `build_labels` is appended to the two 'labeldepth2' build targets below but
# was never defined in this notebook, so running this cell in a fresh kernel
# raised NameError. If the name is already bound (e.g. set by hand earlier in
# the session) that value is kept; otherwise fall back to the labels shown in
# the recorded output of `print(ATOMLABELS_BUILD)` for 'ox_1alc2aldC_depth3'.
# NOTE(review): 'ox_alka2alkeC_depth2' shares the same list, exactly as in the
# original cell - confirm these labels are the intended ones for that reaction.
try:
    build_labels
except NameError:
    build_labels = [40, 83, 85, 102, 201, 307, 384, 496, 713, 826, 1408, 1422]

# The build label is where to build the FG,
# AND ALSO THE TARGET to extract embeddings for!
BUILD_LABELS = {
    'nas_1amide2acidC_depth3': ['labeldepth1', 24],
    'ox_1alc2aldC_depth3': ['labeldepth2'] + build_labels,
    'ox_1alc2aldO_depth3': ['labeldepth2', 41],
    'eli_1alc2alkC_depth3': ['labeldepth1', 23],
    'ox_alka2alkeC_depth2': ['labeldepth2'] + build_labels,
}

# Mol-file atom lines of each functional group that can be attached.
FG_TO_BUILD = {
    'none': '',
    'OH': '    0.0000    0.0000    0.0000 O   0  0  0  0  0  0  0  0  0  0  0  0\n    0.0000    0.0000    0.0000 H   0  0  0  0  0  0  0  0  0  0  0  0\n',
}

# Mol-file bond-line templates ('a', 'b', ... 'z' keys are described in the
# docstring at the top of this cell).
FG_BONDING = {
    'none': '',
    'OH': ' a z  1  0\n b a  1  0\n',
}

# [#atoms_in_fg, #bonds_in_fg] for each functional group.
FG_ADD_ATOMS_BONDS = {
    'none': [0, 0],
    'OH': [2, 2],
}

# This is purely for visualization.
# NOTE(review): some keys here do not match the spelling used in BUILD_LABELS
# ('ox_1alcsaldsO_depth3' vs 'ox_1alc2aldO_depth3', 'eli_1alcalkC_depth3' vs
# 'eli_1alc2alkC_depth3') - confirm which spelling is intended before relying
# on a shared key across the dictionaries.
COLORS_TO_KEEP = {
    'ox_1alc2aldC_depth3': ['#AFEEEE', '#1E90FF'],
    'ox_1alcsaldsO_depth3': ['#98FB98', '#FFE4E1'],
    'pri_amideC_depth3': ['#FFD700', '#66CDAA'],
    'eli_1alcalkC_depth3': ['#2F4F4F', '#008000'],
    'ox_alka2alkeC_depth2': ['#98FB98', '#8FBC8F'],
}

FG_ADD_DOUBLE_BOND = False
# Has to be at the same depth as the build label because that one will be the
# checking point; ONLY one of that one is allowed to exist in the molecule.
ADD_DOUBLE_BOND_BW = [23, 20]

# ---- Selection of the reaction to run ----
ATOMLABELS_REMOVE = LEAVING_GROUP_LABELS['ox_1alc2ald_depth3']
ATOMLABELS_BUILD = BUILD_LABELS['ox_1alc2aldC_depth3']
# NOTE: this rebinds FG_TO_BUILD from the dictionary above to the selected
# entry; later cells pass the selected value under this name.
FG_TO_BUILD = FG_TO_BUILD['none']
FG_BOND_TO_BUILD = FG_BONDING['none']
ADD_ATOMS_BONDS = FG_ADD_ATOMS_BONDS['none']
ONLY_ONE = True




In [7]:
# Show the selected build target: the depth-label name followed by its atom labels.
print(ATOMLABELS_BUILD)

['labeldepth2', 40, 83, 85, 102, 201, 307, 384, 496, 713, 826, 1408, 1422]


In [None]:

from fgtransform import statereaction
import pandas as pd
import numpy as np

'''
Choose a proper transformation/reaction using the dictionaries above and then adjust the configuration
here to get the reactant, removed, and product files

    ARGS:
        DATASET_FILEEPATH    - the db file that has been labelled with autolabel
        QM9                  - a boolean set according to whether QM9.db is being used, because in that case we
                               load all the properties using the QM9 method instead
                               of AtomsData, which is used for any other db dataset (more efficient)
        LABELS_FILEDIR       - in a specified directory, you should have all the autolabel files for the db at every depth, "0.csv", "1.csv"
                               all of these will be stacked horizontally and accessed with pandas to find specific atomic environments
        N_DEPTHS             - total depths available in LABELS_FILEDIR, i.e. how many label-depth files there are
        MOLECULES_RANGE      - the range of molecules to scan for reactions
        REACT_FILEPATH        - filepath that will contain reactants 
        REMOVE_FILEPATH      - filepath that will contain reactants without LG (without optimization)
        OUT_FILEPATH         - filepath that will contain products with MMFF94 optimization 
        AVAILABLE_PROPERTIES - making the db requires setting an available property even if property is missing
                               this can be anything as filler, long as you remember what the filler database property name is so that it can be accessed
        SKIP                 - list of indices to skip because they failed at geometry optimization 
    RETURNS
        statereaction        - initializing the fgtransform class that removes specified LG and adds specified FG
        statereaction.
        RemoveBuild_LGFG_
        SingleBuildSite      - takes in all the initial parameters of the reaction (control), and initializes the build parameters (build)
        statereaction.
        RemoveBuild_LGFG     - scans the db file for possible reactants that have the LG, performs the reactions
                               and writes reactions in the init, rem and out filepath
        number_trans         - number of molecule transformed in the removal
        build_indices_list   - a list of build_indices is returned, 1 per molecule, 
                               helps embedding extraction (next) to extract target build site
        initbuild_indices
        _list                - a list of build_indices before the removal of FG is returned (to avoid the likely mismatch)
                               helps embedding extraction tool (next) to extract target build site before LG was removed
'''      

# ---- Input dataset ----
DATASET_FILEEPATH = 'data/datasets/QM9/QM9.db'
QM9 = True

# ---- Autolabel files, one per depth ----
LABELS_FILEDIR = "data/autolabeldata/qm910000_allelem_depths0-5"
N_DEPTHS = 6

# Range of molecules to scan for reactions.
MOLECULES_RANGE = [0, 1000]

# ---- Output files ----
REACT_FILEPATH = 'data/fgtransform/model1/ox_1alc2ald_depth3/react.xyz'
REMOVE_FILEPATH = 'data/fgtransform/model1/ox_1alc2ald_depth3/del.xyz'
OUT_FILEPATH = 'data/fgtransform/model1/ox_1alc2ald_depth3/prod.xyz'

AVAILABLE_PROPERTIES = ['energy']

# Indices to skip. For ox_alka2alke_depth2 the list used was:
#SKIP = [685,698,699,700]
SKIP = []

# NOTE: the assignment below rebinds the imported module name `statereaction`
# to the created object, so the import at the top of this cell has to run
# again before this line can be re-executed.
statereaction = statereaction.RemoveBuild_LGFG_SingleBuildSite(
    DATASET_FILEEPATH,
    LABELS_FILEDIR,
    ATOMLABELS_BUILD,
    ATOMLABELS_REMOVE,
    REACT_FILEPATH,
    REMOVE_FILEPATH,
    OUT_FILEPATH,
    MOLECULES_RANGE,
    AVAILABLE_PROPERTIES,
    QM9,
    FG_TO_BUILD,
    FG_BOND_TO_BUILD,
    ADD_ATOMS_BONDS,
    N_DEPTHS,
    ONLY_ONE,
    FG_ADD_DOUBLE_BOND,
    ADD_DOUBLE_BOND_BW,
    SKIP,
)
number_trans, build_indices_list, initbuild_indices_list = statereaction.RemoveBuild_LGFG()

# Report the count first, then the pre-removal and post-removal build indices.
for _result in (number_trans, initbuild_indices_list, build_indices_list):
    print(_result)

# Persist both index lists; the embedding-extraction cells read them back.
np.savetxt(
    'data/fgtransform/model1/ox_1alc2ald_depth3/build_indices_list.csv',
    build_indices_list,
    delimiter=',',
)
np.savetxt(
    'data/fgtransform/model1/ox_1alc2ald_depth3/initbuild_indices_list.csv',
    initbuild_indices_list,
    delimiter=',',
)

FGTransform Analysis

#Run extract embeddings on init, rem, and trans

In [None]:
#Run extract embeddings on init and trans
from embeddings.extract_embs_frompretrainedmodel import extract_embs
import numpy as np

'''
Runs the extraction of embedding for the target site of reaction
(where the leaving group leaves and attaching FG attaches)

    ARGS:
        REACT_DATASET_FILEEPATH     - where the reactants init.db file is stored
        DEL_DATASET_FILEEPATH       - where the removed LGs rem.db file is stored
        PROD_DATASET_FILEEPATH      - where the optimized products trans.db file is stored
        REACTEMBS_SAVEFILEPATH      - where the reactant extracted embeddings will be stored
        DELEMBS_SAVEFILEPATH        - where the removed extracted embedding will be stored
        PRODEMBS_SAVEFILEPATH       - where the product extracted embedding will be stored
        MODEL_FILEPATH              - the pretrained schnet model to use to extract embeddings
        START                       - starting index of the db scanning for reactions
        END                         - ending index of the db scanning for reactions
        N_FEATURES                  - number of features in an embedding (depends on model trained)
        LAYERS                      - range of layers of embeddings to extract, [5,6], means layer 5 ONLY as 6 is not included
        QM9                         - a boolean to determine to load QM9 with QM9 method rather than AtomsData to avoid
                                      having to list all of QM9's properties in AtomsData
        AVAILABLE_PROPERTIES        - in case QM9 is False, then AtomsData will be used, in which case
                                      you have to list the available properties in the db file
        ADD_HEADER                  - a boolean whether to add a header to the embedding file, 
                                      NOTE KEEP THIS FALSE HERE BECAUSE THE LATER CODES 
                                      DO NOT EXPECT A HEADING!!
        LABEL                       - a boolean whether to label the extracted embeddings with manual labelling code
        DB_DATASET                  - selects a db dataset to extract embeddings for (reactant, removed, product)
        SAVEEMBS_FILEPATH           - selects a filepath name to save the target embs data (reactant, removed, product)       
        INDICES_TO_LABEL            - these define which atom indices to extract embeddings for in each molecule
                                        
                                        [[each_molecule_indices_to_extractembedding],...]
                                      NOTE that the indices would change if we remove a functional group, so there
                                      are two sets one for the reactants (initbuild_indices_list.csv), 
                                      and one for the removed and products (build_indices_list.csv)
    Returns:                              
        extract_embs                - calls the function that extracts the embeddings (and labels them)
                                      and saves the results in the filepath specified
'''



# ---- Databases produced by the transformation step ----
REACT_DATASET_FILEEPATH = 'data/fgtransform/model1/ox_1alc2ald_depth3/react.db'
DEL_DATASET_FILEEPATH = 'data/fgtransform/model1/ox_1alc2ald_depth3/del.db'
PROD_DATASET_FILEEPATH = 'data/fgtransform/model1/ox_1alc2ald_depth3/prod.db'

# ---- Where the target-site embeddings of each database are saved ----
REACTEMBS_SAVEFILEPATH = 'data/fgtransform/model1/ox_1alc2ald_depth3/rembs_Ctarget.csv'
DELEMBS_SAVEFILEPATH = 'data/fgtransform/model1/ox_1alc2ald_depth3/dembs_Ctarget.csv'
PRODEMBS_SAVEFILEPATH = 'data/fgtransform/model1/ox_1alc2ald_depth3/pembs_Ctarget.csv'

MODEL_FILEPATH = 'data/trainedmodels/model1/best_model'

START = 0
END = 48

N_FEATURES = 128

# NOTE: use -1 to get the initial embedding!
LAYERS = [5, 6]

QM9_TRUE = False
AVAILABLE_PROPERTIES = ['energy']
ADD_HEADER = False

# Use the indices list of atoms to label each molecule (generally, lists of lists).
LABEL = True

# This cell processes the reactants.
DB_DATASET = REACT_DATASET_FILEEPATH
SAVEEMBS_FILEPATH = REACTEMBS_SAVEFILEPATH

# Reactants use the build indices recorded before the leaving group was
# removed; any other dataset uses the indices recorded after the removal.
_indices_csv = (
    'data/fgtransform/model1/ox_1alc2ald_depth3/initbuild_indices_list.csv'
    if DB_DATASET == REACT_DATASET_FILEEPATH
    else 'data/fgtransform/model1/ox_1alc2ald_depth3/build_indices_list.csv'
)
# One single-element row per molecule.
INDICES_TO_LABEL = list(np.genfromtxt(_indices_csv, delimiter=',').reshape(-1, 1))

extract_embs(
    QM9_TRUE,
    DB_DATASET,
    MODEL_FILEPATH,
    SAVEEMBS_FILEPATH,
    START,
    END,
    N_FEATURES,
    LAYERS,
    ELEMENTS=[1, 6, 7, 8, 9],
    AVAILABLE_PROPERTIES=AVAILABLE_PROPERTIES,
    LABEL=LABEL,
    ADD_HEADER=ADD_HEADER,
    RESTRICT_LABEL=False,
    ALLOWED_LABELS=[],
    INDICES_TO_LABEL=INDICES_TO_LABEL,
)



Repeating for deleted LG molecules

In [None]:
# This cell processes the molecules with the leaving group removed.
DB_DATASET = DEL_DATASET_FILEEPATH
SAVEEMBS_FILEPATH = DELEMBS_SAVEFILEPATH

# Reactants use the build indices recorded before the leaving group was
# removed; any other dataset uses the indices recorded after the removal.
_indices_csv = (
    'data/fgtransform/model1/ox_1alc2ald_depth3/initbuild_indices_list.csv'
    if DB_DATASET == REACT_DATASET_FILEEPATH
    else 'data/fgtransform/model1/ox_1alc2ald_depth3/build_indices_list.csv'
)
# One single-element row per molecule.
INDICES_TO_LABEL = list(np.genfromtxt(_indices_csv, delimiter=',').reshape(-1, 1))

# Labelling is switched off for this dataset (LABEL=False).
extract_embs(
    QM9_TRUE,
    DB_DATASET,
    MODEL_FILEPATH,
    SAVEEMBS_FILEPATH,
    START,
    END,
    N_FEATURES,
    LAYERS,
    ELEMENTS=[1, 6, 7, 8, 9],
    AVAILABLE_PROPERTIES=AVAILABLE_PROPERTIES,
    LABEL=False,
    ADD_HEADER=False,
    RESTRICT_LABEL=False,
    ALLOWED_LABELS=[],
    INDICES_TO_LABEL=INDICES_TO_LABEL,
)


Repeat for optimized products

In [None]:
# This cell processes the optimized products.
DB_DATASET = PROD_DATASET_FILEEPATH
SAVEEMBS_FILEPATH = PRODEMBS_SAVEFILEPATH

# Reactants use the build indices recorded before the leaving group was
# removed; any other dataset uses the indices recorded after the removal.
_indices_csv = (
    'data/fgtransform/model1/ox_1alc2ald_depth3/initbuild_indices_list.csv'
    if DB_DATASET == REACT_DATASET_FILEEPATH
    else 'data/fgtransform/model1/ox_1alc2ald_depth3/build_indices_list.csv'
)
# One single-element row per molecule.
INDICES_TO_LABEL = list(np.genfromtxt(_indices_csv, delimiter=',').reshape(-1, 1))

extract_embs(
    QM9_TRUE,
    DB_DATASET,
    MODEL_FILEPATH,
    SAVEEMBS_FILEPATH,
    START,
    END,
    N_FEATURES,
    LAYERS,
    ELEMENTS=[1, 6, 7, 8, 9],
    AVAILABLE_PROPERTIES=AVAILABLE_PROPERTIES,
    LABEL=True,
    ADD_HEADER=False,
    RESTRICT_LABEL=False,
    ALLOWED_LABELS=[],
    INDICES_TO_LABEL=INDICES_TO_LABEL,
)


transform with average diff vector and use pca on both

In [1]:
from fgtransform.utils import utils
import numpy as np

'''
Neighbor Test Using Linear Analogy Vector (Transformed vs True Product Embeddings)

    ARGS:  
        REACTEMBS_SAVEFILEPATH      - reactant FG molecules target embs filepath
        DELEMBS_SAVEFILEPATH        - removed FG molecules target embs filepath
        PRODEMBS_SAVEFILEPATH       - product FG molecules target embs filepath
        FROM                        - choose a starting point embs (REACT-, REM-, or can be even PRODEMBS_SAVEFILEPATH)
        TO                          - choose an ending point embs, same as above 
        QM9EMBS_FILEPATH            - qm9 embeddings of the same element(s) at the target and 
                                      the same layers as the extracted embedding for the product above
                                      these will be stacked on top of the product embeddings so that
                                      the task of finding the matching neighbor is harder and includes part of
                                      the original set SchNet was trained on
        N_FEATURES                  - number of features in the atom-embeddings
        STACK_QM9                   - boolean to stack QM9 embeddings at the same layer (and elements) to make the task 
                                      of neighbor matching harder
        utils.neighbortest_
        linearembanalogy_meandiff   - calls on the function that finds the linear analogy by mean_diff between
                                      product and reactant embeddings, and then tests the linear analogy using the neighbor
                                      test. If the linear vector transforms the reactants (transformed_embs) right next to
                                      their true product_embs using this average mean vector, then linear analogy is successful
       REACTPROD_EMBSSTACKED_
       FILEPATH                     - the filepath that holds both reactants and products embeddings stacked on top of each other 
                                      important for the next step in constructing the vectors from reactant to product for visualization

    
    RETURNS:
        linear_analogy_vector       - the mean difference vector between product and reactant embeddings
        transformed_embs            - the transformed embeddings from reactants embs by using mean difference linear analogy
        percent_match               - number of matched neighbors between transformed embedding and true product embedding
        neighbor_idx_list           - a list that will hold the neighbor indices, if the neighbor idx matches the product embedding idx
                                      then there is a match, a 100% match looks like [0,1,2,3,4,...] because all neighbor idx are matching 
                                      their product embedding, a mismatch in this list will point to where the linear analogy failed.
                                      this works because product embeddings are ordered in the same manner as transformed embeddings
        reactprod_embsstacked       - this is important for the next step in visualization is to have one dataset that 
                                      has both the reactant and product embeddings stacked
'''

# NOTE(review): the paths in this cell point at 'ox_alka2alke_depth2', while
# the other cells of this notebook read and write 'ox_1alc2ald_depth3' -
# confirm which reaction directory is intended before running end to end.
REACTEMBS_SAVEFILEPATH = 'data/fgtransform/model1/ox_alka2alke_depth2/rembs_Ctarget.csv'
DELEMBS_SAVEFILEPATH = 'data/fgtransform/model1/ox_alka2alke_depth2/dembs_Ctarget.csv'
PRODEMBS_SAVEFILEPATH = 'data/fgtransform/model1/ox_alka2alke_depth2/pembs_Ctarget.csv'

# Start and end points of the analogy.
FROM = REACTEMBS_SAVEFILEPATH
TO = PRODEMBS_SAVEFILEPATH

QM9EMBS_FILEPATH = 'data/embs/model1-10000/Cembslayer6.csv'

N_FEATURES = 128

STACK_QM9 = True

# Optionally load another linear analogy vector, for fractal testing
# (you do this once for the article and that's it).
LA = []
#LA = np.genfromtxt('data/fgtransform/model1/ox_1alc2ald_depth3/r2p_la.csv',delimiter=',')

(
    linear_analogy_vector,
    transformed_embs,
    percent_match,
    neighbors_idx_list,
    reactprod_embsstacked,
) = utils.neighbortest_linearembanalogy_meandiff(
    FROM,
    TO,
    N_FEATURES,
    STACK_QM9,
    QM9EMBS_FILEPATH,
    LA,
)

#np.savetxt('data/fgtransform/model1/ox_1alc2ald_depth3/r2p_la.csv',linear_analogy_vector,delimiter=',')

print(neighbors_idx_list)
print(percent_match)

# Saved for the next step (PCA / visualization).
REACTPROD_EMBSSTACKED_FILEPATH = 'data/fgtransform/model1/ox_alka2alke_depth2/r2pembs_Ctarget.csv'
np.savetxt(REACTPROD_EMBSSTACKED_FILEPATH, reactprod_embsstacked, delimiter=',')



In [None]:
from fgtransform.utils import utils
from tools.utils import utils_dimred
import numpy as np

'''
Visualization of Linear Analogy Transformation

Uses PCA on the reactants + products + stacked qm9 embeddings (of the same layer and elements). 
Then plots the vectors between reactants to products in the PC basis

    Args:
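       QM9EMBS_FILEPATH             - filepath where qm9 embeddings (of the same elements and layer) are found;
                                      these are presumably the data the pca is fit on (confirm against utils_dimred.pca)
       REACTPROD_EMBSSTACKED        - the stacked reactant and product embeddings;
                                      this will be the data which we will apply our pca fit on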
       SAVE_FILEPATH                - the filepath to save the results of the pca projection on the apply data
                                      same place as the apply data but labelled with  'pca.csv'
       N_COMPONENTS                 - the number of components
       N_FEATURES                   - the number of features
       HEADER_EXISTS_FIT            - boolean, True if the fit data has a header (to be skipped)
       HEADER_EXISTS_APPLY          - boolean, True if the apply data has a header (to be skipped)
       SCALE_DATA                   - boolean to scale data using StandardScaler on the fit data
       PCA_FILEPATH                 - filepath for getting the pca embeddings of QM9 for the visualization of the plot
                                      so that it is not just vectors (need to be prepared beforehand)
       VEC_FILEPATH                 - the pca scatters of reactant to product have to be converted to 
                                      vectors x_r,y_r, x_p,y_p so that they can be plotted with connecting arrows
    Process:
        utils_dimred.pca            - a tool that runs pca fitting and applying
                                      and keeps the label on the data, saves pca filepath
                                      eigenvalues, eigenvectors, and covariance of the pca fit
        utils.connect_reactprod_
        scatters2vectors            - a tool that connects reactant and product scatters into vectors
                                      (x_r,y_r,x_p,y_p) that can be connected with arrows
    
    Returns:
        the pca files (pca on the apply data, eig, ev, cov, of the fit)
        the vectors (x_r,y_r,x_p,y_p) that describe reactant to product 

'''

# ---- Inputs ----
QM9EMBS_FILEPATH = 'data/embs/model1-10000/Cembslayer6.csv'
REACTPROD_EMBSSTACKED = 'data/fgtransform/model1/ox_1alc2ald_depth3/r2pembs_Ctarget.csv'

# ---- Dimensionality-reduction settings ----
N_COMPONENTS = 128
N_FEATURES = 128
HEADER_EXISTS_FIT = True
HEADER_EXISTS_APPLY = False
SCALE_DATA = False

# ---- Files used to build the vectors ----
PCA_FILEPATH = 'data/fgtransform/model1/ox_1alc2ald_depth3/r2pembs_Ctargetpca.csv'
VEC_FILEPATH = 'data/fgtransform/model1/ox_1alc2ald_depth3/r2pembs_Ctargetpcavecs.csv'

# REACTPROD_EMBSSTACKED is passed twice on purpose: once as data input and
# once as the last (save path) argument.
utils_dimred.pca(
    QM9EMBS_FILEPATH,
    N_COMPONENTS,
    N_FEATURES,
    SCALE_DATA,
    REACTPROD_EMBSSTACKED,
    HEADER_EXISTS_FIT,
    HEADER_EXISTS_APPLY,
    REACTPROD_EMBSSTACKED,
)

# Make the vector plotting file from the pca (x1,y1,x2,y2)
# (half the data is init, half is trans after compilation).
utils.connect_reactprod_scatters2vectors(PCA_FILEPATH, VEC_FILEPATH)



Preparing data for plots 

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

'''
functions that prepare the data for plotting

    convert_colorsgray_excepttwo        - function that converts all irrelevant FG color labels
                                          (not part of the reactant and product FG) to gray
    move_rows_to_end                    - moves the rows of all the reactant and product FG labelled embeddings
                                          to the end of the data so that they are plotted last
                                          (on top of everything)

'''

def convert_colorsgray_excepttwo(COLORS, KEEP1, KEEP2):
    '''
    Grays out every hex color in a list except for two that are kept as they are

        Args:
            COLORS          - the list of all color labels of atoms ('#RRGGBB' strings)
            KEEP1           - first hexcolor to leave untouched
            KEEP2           - second hexcolor to leave untouched

        Process:
            kept            - the two colors that must not be grayed out
            _to_gray_label  - helper that splits a hexcolor into r,g,b,
                              collapses them to a single luminance level
                              (0.2989*r + 0.587*g + 0.114*b, truncated to int)
                              and writes that level back as an upper-case hex gray
            shade           - each hexcolor in COLORS

        Returns:
            a list with one entry per input color, every entry written
            with a '0x' prefix instead of '#': kept colors keep their
            original characters, all others are replaced by their gray

    '''
    def _to_gray_label(hex_color):
        '''
        hex -> rgb -> gray level -> '0x'-prefixed upper-case hex gray
        '''
        digits = hex_color.lstrip('#')
        red, green, blue = (int(digits[pos:pos + 2], 16) for pos in (0, 2, 4))
        level = int(0.2989 * red + 0.587 * green + 0.114 * blue)
        gray_hex = "#{:02x}{:02x}{:02x}".format(level, level, level)
        return gray_hex.upper().replace('#', '0x')

    kept = (KEEP1, KEEP2)
    return [
        shade.replace('#', '0x') if shade in kept else _to_gray_label(shade)
        for shade in COLORS
    ]


def move_rows_to_end(DF, COLOR_COL_NAME, KEEP1, KEEP2):
    '''
    Move the rows of two labelled FG's to the end so that they are plotted last

        Args:
            DF                  - the dataframe to reorder (it is NOT modified)
            COLOR_COL_NAME      - the color column name
            KEEP1               - the first color label whose rows move to the end
            KEEP2               - the second color label whose rows move to the end

        Process:
            is_keep_color       - boolean mask, True where the row's color is
                                  KEEP1 or KEEP2; it is kept as a local Series
                                  rather than written into DF as a temporary
                                  column, so the caller's dataframe does not
                                  gain an extra 'is_keep_color' column

        Returns:
            df_combined         - a new dataframe with all the other rows first and
                                  the KEEP1/KEEP2 rows last; the relative order within
                                  each group and the original index labels are preserved

    '''
    is_keep_color = (DF[COLOR_COL_NAME] == KEEP1) | (DF[COLOR_COL_NAME] == KEEP2)
    df_combined = pd.concat([DF[~is_keep_color], DF[is_keep_color]])
    return df_combined




Plotting 

In [None]:
'''
Saving to files compatible with gnuplot for plotting           

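    VEC2VEC_DATA_FILEPATH   - csv file with the reactant to product vectors made in the previous step
    PCA_SCATTER_FILEPATH    - csv file with the pca embeddings of QM9 used as the scatter in the plot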
    

'''
# Input CSV files.
VEC2VEC_DATA_FILEPATH = 'data/fgtransform/model1/ox_1alc2ald_depth3/r2pembs_Ctargetpcavecs.csv'
PCA_SCATTER_FILEPATH = 'data/embs/model1-10000/Cembslayer6pca.csv'


def prepare_data_gnuplot():
    '''
    Placeholder, not implemented; the preparation is done by the
    top-level statements of this cell instead
    '''
    pass


vec2vec_data = pd.read_csv(VEC2VEC_DATA_FILEPATH)
pca_scatter_data = pd.read_csv(PCA_SCATTER_FILEPATH)

# Hexadecimal colors are read by position (column 136).
# NOTE(review): further down the same data is addressed by the column name
# 'hexcolor' - confirm that column 136 and 'hexcolor' are the same column.
colors = pca_scatter_data.iloc[:, 136].tolist()

# The two colors that stay un-grayed for this reaction.
_kept = COLORS_TO_KEEP['ox_1alc2aldC_depth3']

colors_gray = pd.Series(
    convert_colorsgray_excepttwo(colors, _kept[0], _kept[1]),
    name='graycolors',
)

# Append the gray colors as a new column, then push the rows of the two kept
# colors to the end so they are drawn last.
pca_scatter_data2 = move_rows_to_end(
    pd.concat([pca_scatter_data, colors_gray], axis=1),
    'hexcolor',
    _kept[0],
    _kept[1],
)

pca_scatter_data2.to_csv(
    'data/fgtransform/model1/ox_1alc2ald_depth3/Cembslayer6reordered.csv',
    index=False,
)






In [None]:
'''
Plotting the scatter and vectors here

'''

# Convert '#RRGGBB' hexadecimal colors to RGB tuples with components in [0, 1].
rgb_colors = [(int(color[1:3], 16) / 255, int(color[3:5], 16) / 255, int(color[5:7], 16) / 255) for color in colors]
plt.scatter(pca_scatter_data.iloc[:,0], pca_scatter_data.iloc[:,1], color=rgb_colors)

# Plot each vector.
# NOTE(review): quiver's third and fourth arguments are the arrow's direction
# components (U, V), not its end point. If the vector file stores the product
# coordinates (x_p, y_p), the call should pass x_p - x_r and y_p - y_r instead -
# confirm against what utils.connect_reactprod_scatters2vectors writes.
for _, row in vec2vec_data.iterrows():
    x_r, y_r, x_p, y_p = row
    plt.quiver(x_r, y_r, x_p, y_p, angles='xy', scale_units='xy', scale=1, color='r', width=0.005)

# Axis labels: the first plotted column goes on x, the second on y.
# (The original called plt.ylabel twice, so 'PC1' was overwritten by 'PC2'
# and the x axis was left unlabelled.)
plt.xlabel('PC1')
plt.ylabel('PC2')
# Set plot limits and aspect ratio
#plt.xlim(-20,20)
#plt.ylim(-10,15)
#plt.gca().set_aspect('equal', adjustable='box')
# Show the plot
plt.show()

In [None]:
'''
PERTURBATION TOOL --> UNDERSTANDING THE EMBEDDING SPACE AS 
                      NEIGHBORHOOD COMPOSITIONS
'''

