TRAINING PHASE I

In [None]:
from utils import utils_transferlearn
import numpy as np

# Seed NumPy's global random number generator before anything else runs.
np.random.seed(42)

# ---------------------------------------------------------------------------
# Step 1 - build the train/test helper from one CSV file.
#
# Per the original notes, constructing the helper initialises preprocessing of
# the training data and fits the scaler (not visible from this cell - see
# utils_transferlearn).
#
# Inputs:
#     TRAIN_DATA_FILEPATH - CSV containing both the x and the y columns
#     X_COLS_IDXS         - column indices of the x data
#     Y_COLS_IDXS         - column indices of the y data
#                           (both presumably [start, stop] pairs - TODO confirm
#                           how utils_transferlearn interprets them)
# ---------------------------------------------------------------------------

# Preset data files, keyed by a short experiment name.
direc_dict = {
    'qm9Oembsvselectrondensity': '../data/embs/embsdensityince/20240907oxygenfullspectrumwfullP/OembsVSfullocc250to5000NNmodel1/OembsVSfullocc.csv',
    'qm9Cembsvselectrondensity': '../data/embs/embsdensityince/20240910carbonwtotalED/CembsVSfullocc250to5000NNmodel1/CembsVSfullocc.csv',
    'iupacpkaembsvspka': '../data/embs/embspKA-stef/1layerNN-1L1NLbias258/601Oembs.csv',
    'qm9embeddingsvsae': '../data/embs/old/model1-10000/layer6/Cembslayer6-addedfromprevlayer.csv',
    'qm9embeddingsvsnmr': '../data/embs/embsQM9NMR/Cembslayer6-nmrgas.csv',
    # X_test = X_test[:-2] NOTE missing last two carbons got deleted, to match
    # it with 10000 QM9 molecules
    'qm9embeddingswcarbacid@6000': '../data/perturbation/targetall-useavg-subavg/qm9-5000-6000-all/mlp/Caewcarbacid/carbacidd1-p5.csv',
}

# Dataset selection for this run.
TRAIN_DATA_FILEPATH = direc_dict['qm9Cembsvselectrondensity']
X_COLS_IDXS = [0, 128]
Y_COLS_IDXS = [159, 160]

model = utils_transferlearn.mlp_train_and_test(
    TRAIN_DATA_FILEPATH,
    X_COLS_IDXS,
    Y_COLS_IDXS,
)

# ---------------------------------------------------------------------------
# Step 2 - train the MLP and save it.
#
#     HIDDEN_LAYER_SIZES  - tuple controlling the number of hidden layers and
#                           their sizes, e.g. (200, 200)
#     ACTIVATION          - activation function used between the layers;
#                           use 'identity' to just do linear regression
#     SAVE_MODEL_FILEPATH - where the trained model is written so the testing
#                           cell can load it later
# ---------------------------------------------------------------------------
HIDDEN_LAYER_SIZES = (200,) * 5
ACTIVATION = 'relu'
# NOTE(review): this path is under '20240910carbonwfullED', whereas the training
# data above lives under '20240910carbonwtotalED', and the file name says
# '200x6' while HIDDEN_LAYER_SIZES has five entries - confirm both are intended.
SAVE_MODEL_FILEPATH = '../data/embs/embsdensityince/20240910carbonwfullED/CembsVSfullocc250to5000NNmodel1/nnmodel200x6.pth'

model.train(HIDDEN_LAYER_SIZES, ACTIVATION, SAVE_MODEL_FILEPATH)



TESTING PHASE II

In [None]:
from utils import utils_transferlearn
import numpy as np

# Seed NumPy's global random number generator before anything else runs.
np.random.seed(42)

# ---------------------------------------------------------------------------
# Preset data files, keyed by a short experiment name.
#
# Single lookup table for this cell: it holds every entry needed for both the
# training-data reference and the external test sets, so there is only one
# place to edit a path.
# ---------------------------------------------------------------------------
direc_dict = {
    'qm9Oembsvselectrondensity': '../data/embs/embsdensityince/20240907oxygenfullspectrumwfullP/OembsVSfullocc250to5000NNmodel1/OembsVSfullocc.csv',
    'qm9Cembsvselectrondensity': '../data/embs/embsdensityince/20240910carbonwtotalED/CembsVSfullocc250to5000NNmodel1/CembsVSfullocc.csv',
    'qm9ED_perts': '../data/embs/embsdensityince/pertsmodel/0.csv',
    'qm9embsvselectrondensity': '../data/embs/embsdensity-ince/Cembs250-5000woccs.csv',
    'NMRstudy_cyclopentafuran': '../data/perturbation/NMRstudy_cyclopentafuran_BEST_0-2000/3.csv',
    'sym_alkene_H2ox_reactembs_perts': '../data/fgtransform/model1/sym_alkene_H2ox/rpert1.csv',
    'sym_alkene_H2ox_prodembs_perts': '../data/fgtransform/model1/sym_alkene_H2ox/ppert1.csv',
    'sym_alkene_H2ox_reactembs': '../data/fgtransform/model1/sym_alkene_H2ox/reactembsfull.csv',
    'sym_alkene_H2ox_prodembs': '../data/fgtransform/model1/sym_alkene_H2ox/prodembsfull.csv',
    'pr_alcohol_H2ox_reactembs_perts': '../data/fgtransform/model1/pr_alcohol_H2ox/r1.csv',
    'pr_alcohol_H2ox_prodembs_perts': '../data/fgtransform/model1/pr_alcohol_H2ox/p1.csv',
    'pr_alcohol_H2ox_reactembs': '../data/fgtransform/model1/pr_alcohol_H2ox/reactembsfull.csv',
    'pr_alcohol_H2ox_prodembs': '../data/fgtransform/model1/pr_alcohol_H2ox/prodembsfull.csv',
    'alcohol_H2ox_reactembs_perts': '../data/fgtransform/model1/alcohol_H2ox/r3.csv',
    'alcohol_H2ox_prodembs_perts': '../data/fgtransform/model1/alcohol_H2ox/p4.csv',
    'alcohol_H2ox_reactembs': '../data/fgtransform/alcohol_H2ox/totalrembs_Ctarget.csv',
    'alcohol_H2ox_prodembs': '../data/fgtransform/alcohol_H2ox/totalpembs_Ctarget.csv',
    'alkane_H2ox_prodembs_perts': '../data/fgtransform/model1/alkane_H2ox/p2.csv',
    'alkane_H2ox_reactembs_perts': '../data/fgtransform/model1/alkane_H2ox/r2.csv',
    'alkane_H2ox_prodembs': '../data/fgtransform/alkane_H2ox/totalpembs_Ctarget.csv',
    'alkane_H2ox_reactembs': '../data/fgtransform/alkane_H2ox/totalrembs_Ctarget.csv',
    'qm9embeddingsvsae': '../data/embs/old/model1-10000/layer6/Cembslayer6-addedfromprevlayer.csv',
    'qm9embeddingsvsnmr': '../data/embs/embsQM9NMR/Cembslayer6-nmrgas.csv',
    # X_test = X_test[:-2] NOTE missing last two carbons got deleted, to match
    # it with 10000 QM9 molecules
    'qm9embeddingswcarbacid@6000': '../data/perturbation/targetall-useavg-subavg/qm9-5000-6000-all/mlp/Caewcarbacid/carbacidd1-p5.csv',
    'iupacpkaembsvspka': '../data/embs/embspKA-stef/1layerNN-1L1NLbias258/601Oembs.csv',
}

# ---------------------------------------------------------------------------
# Step 1 - rebuild the train/test helper on the ORIGINAL training data.
#
# Per the original notes, constructing the helper initialises preprocessing of
# the training data and fits the scaler (not visible from this cell - see
# utils_transferlearn). Nothing is trained or saved in this cell.
#
#     TRAIN_DATA_FILEPATH - CSV containing both the x and the y columns
#     X_COLS_IDXS         - column indices of the x data (embedding vector)
#     Y_COLS_IDXS         - column indices of the y data
#                           (both presumably [start, stop] pairs - TODO confirm
#                           how utils_transferlearn interprets them)
# ---------------------------------------------------------------------------
TRAIN_DATA_FILEPATH = direc_dict['qm9Cembsvselectrondensity']
X_COLS_IDXS = [0, 128]
Y_COLS_IDXS = [159, 160]

model = utils_transferlearn.mlp_train_and_test(
    TRAIN_DATA_FILEPATH,
    X_COLS_IDXS,
    Y_COLS_IDXS,
)

# ---------------------------------------------------------------------------
# Step 2 - run a previously saved model on an outside test set.
#
# Descriptions below are carried over from the original notes; names are the
# variables actually passed to model.test.
#
#     TEST_DATA_FILEPATH      - input molecular-embedding file; use direc_dict
#                               for previously defined file paths
#     MODEL_FILEPATH          - saved model file to load for prediction
#     Y_EXISTS                - whether the test set carries a Y target; it
#                               may be a set with unknown Y because nothing is
#                               fitted here, the model is only applied
#     COL_TO_ISOLATE          - column name used to isolate a subset of
#                               embeddings
#     ROW_KEY                 - value in COL_TO_ISOLATE selecting the rows to
#                               keep
#     CENTER_INDICES_FILEPATH - presumably a file of predefined per-molecule
#                               indices to isolate for predictions; None here
#                               (TODO confirm against utils_transferlearn)
#     NUMBER_CENTERS          - number of molecules in the dataset
#     CENTERS, LABEL_EXISTS, LABEL_COL_IDXS
#                             - not described in the original notes
#                               (TODO document)
# ---------------------------------------------------------------------------
TEST_DATA_FILEPATH = direc_dict['qm9ED_perts']
Y_EXISTS = False
LABEL_EXISTS = False
COL_TO_ISOLATE = 'atomic_number'
ROW_KEY = 6
LABEL_COL_IDXS = [128, 139]
CENTER_INDICES_FILEPATH = None
CENTERS = (0, 1)
NUMBER_CENTERS = 186

# NOTE(review): the training cell saves its model to
# '.../20240910carbonwfullED/.../nnmodel200x6.pth', which is a different
# directory and file name from the one loaded here - confirm this is the
# model you intend to evaluate.
MODEL_FILEPATH = '../data/embs/embsdensityince/20240910carbonwtotalED/CembsVSfullocc250to5000NNmodel1/nnmodel.pth'

# NOTE(review): predictions come from the carbon ('qm9C...') training data but
# are written under an 'oxygen' directory with an 'Oembs...' file name -
# confirm the destination.
PREDICTIONS_FILEPATH = '../data/embs/embsdensityince/20240910oxygenwfullED/OembsVSfullocc250to5000NNmodel1/OembsvstotalEPRED.csv'

# All arguments are positional, so their order must match the signature of
# model.test exactly.
predictions = model.test(
    TEST_DATA_FILEPATH,
    MODEL_FILEPATH,
    CENTER_INDICES_FILEPATH,
    CENTERS,
    NUMBER_CENTERS,
    COL_TO_ISOLATE,
    Y_EXISTS,
    ROW_KEY,
    LABEL_EXISTS,
    LABEL_COL_IDXS,
)

# Quick sanity check on how many predictions came back.
print(len(predictions))

# Persist the predictions without the index column (predictions is presumably
# a pandas DataFrame - it exposes to_csv).
predictions.to_csv(PREDICTIONS_FILEPATH, index=False)