In [1]:
import pertreplicateveroncorr

'''
Perturbational replicate embedding fitting based on the neighborhood composition embedding feature space.

NOTE: constructing the neighborhood embedding features and fitting the replicated GNN embeddings
is done in one method. The neighborhood embedding features have to be updated for each perturbation,
so to avoid manually repeating code everything is combined into one function that constructs
the feature space, fits it, updates, and repeats. The boolean FIT controls whether a fit is performed
or whether only the representation is built (using a pretrained model).

    The method below is a two-step procedure (build features, then fit/apply) that is repeated
    for the chosen number of perturbations.

    Args for neighborhood feature construction:
        LOG_FILE            ---- string filepath where to output log info
        MOL_RANGE           ---- tuple containing start and end range of embedding space to replicate
        N_FEATURES          ---- integer number of features per embedding (dimensionality of embedding space)
        DEPTH               ---- integer specifying how many neighborhood layers deep to use for the neighborhood feature space embedding composition
        PERTS_RANGE         ---- controls how many times the model is reapplied; each application is based on the previous results,
                                 which leads to incorporating neighborhood information in layers.
                                 NOTE(review): passed below as a two-element list, presumably [start, end] of the
                                 perturbation iterations -- confirm against pertreplicateveroncorr
        OG_EMBS_FILEPATH    ---- string filepath where original embedding vectors to be trained for replication are located
        PERTS_FILEDIR       ---- string filedir where to place our perturbational replicates

    Args for fitting:
        INPUT_DIM           ---- number of dimensions per atom's neighborhood features
                                = N_FEATURES*NUMBER_ELEMENTS*DEPTH
        OUTPUT_DIM          ---- number of dimensions per atom-embedding from the pretrained GNN model
        NONLINEAR           ---- boolean to decide whether to make this nonlinear, by adding a nonlinear layer (tanh) on top of the initial layer
        SECOND_LINEAR       ---- boolean to decide whether to add an extra linear layer on top of the non-linear one
        NUMBER_EXTRALAYERS  ---- extra layers to add to test more hyperparameters
        EPOCHS              ---- max number of epochs for fitting
        INIT_LR             ---- initial learning rate for the neural network backpropagation
        BATCHING            ---- boolean to decide whether to batch for training and validation (found this to be useless for this study)
        TRAIN_BATCH_SIZE    ---- training batch size if batching is True
        VAL_BATCH_SIZE      ---- validation batch size if batching is True
        VALIDATION_SPLIT    ---- fraction of total data to use for validation
        PATIENCE            ---- epochs to wait with no improvement before halting training
        LR_PATIENCE         ---- epochs to wait with no improvement before lowering the learning rate
        LR_FACTOR           ---- factor to lower learning rate by
        FIT                 ---- whether to fit or just build neighborhood feature representation USING a pretrained model
        PRETRAINED_DIR      ---- the directory of pretrained perturbational models if not fitting but using a pretrained model
'''

# Args for neighborhood feature construction
LOG_FILE = 'log.txt'
MOL_RANGE = (250,5000)
N_FEATURES = 128
DEPTH = 1
PERTS_RANGE = [0,7]
OG_EMBS_FILEPATH = '../data/embs/model1-10000_old/layer6/embslayer6.csv'
PERTS_FILEDIR = '../data/embs/embsdensity-ince/perts_model/'


# FITTING/LOADING PARAMETERS
# NOTE: models with a large number of internal parameters were tested and found to be useless,
# so the model is kept at the size of the input neighborhood feature space and the output embedding only.

# Factor that multiplies the embedding width in INPUT_DIM; named after the
# NUMBER_ELEMENTS term of the documented INPUT_DIM formula.
NUMBER_ELEMENTS = 5
# Derived instead of hard-coded (was 128*5*DEPTH) so it cannot drift from N_FEATURES/DEPTH.
INPUT_DIM = N_FEATURES*NUMBER_ELEMENTS*DEPTH
OUTPUT_DIM = 128
NONLINEAR = True
SECOND_LINEAR = True
NUMBER_EXTRALAYERS = 0
EPOCHS = 10000
INIT_LR = 0.001
BATCHING = False
TRAIN_BATCH_SIZE = 200
VAL_BATCH_SIZE = 200
VALIDATION_SPLIT = 0.2
PATIENCE = 300
LR_PATIENCE = 100
LR_FACTOR = 0.5
# FIT is False: this run builds the representation using the pretrained models in PRETRAINED_DIR.
FIT = False
PRETRAINED_DIR = '../data/perturbation/0-2000mol_linear(128)-nonlinear-linear(128)_corr/'

# Running the perturbational replicates algorithm:
# 1) constructing nbrhood embeddings and
# 2) fitting with embedding space,
# 3) repeat until all perts applied
# Arguments are positional; their order must match the callee's signature, so keep it unchanged.
pertreplicateveroncorr.constructandfit_nbrlayeremb_vs_GNNemb(
    LOG_FILE,
    MOL_RANGE,
    N_FEATURES,
    DEPTH,
    PERTS_RANGE,
    OG_EMBS_FILEPATH,
    PERTS_FILEDIR,
    INPUT_DIM,
    OUTPUT_DIM,
    NONLINEAR,
    SECOND_LINEAR,
    NUMBER_EXTRALAYERS,
    EPOCHS,
    INIT_LR,
    BATCHING,
    TRAIN_BATCH_SIZE,
    VAL_BATCH_SIZE,
    VALIDATION_SPLIT,
    PATIENCE,
    LR_PATIENCE,
    LR_FACTOR,
    FIT,
    PRETRAINED_DIR,
)



  from .autonotebook import tqdm as notebook_tqdm


300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800
1900
2000
2100
2200
2300
2400
2500
2600
2700
2800
2900
3000
3100
3200
3300
3400
3500
3600
3700
3800
3900
4000
4100
4200
4300
4400
4500
4600
4700
4800
4900
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800
1900
2000
2100
2200
2300
2400
2500
2600
2700
2800
2900
3000
3100
3200
3300
3400
3500
3600
3700
3800
3900
4000
4100
4200
4300
4400
4500
4600
4700
4800
4900


https://scikit-learn.org/stable/modules/model_persistence.html#security-maintainability-limitations


nn_rmse:  0.6666248931595776
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800
1900
2000
2100
2200
2300
2400
2500
2600
2700
2800
2900
3000
3100
3200
3300
3400
3500
3600
3700
3800
3900
4000
4100
4200
4300
4400
4500
4600
4700
4800
4900


https://scikit-learn.org/stable/modules/model_persistence.html#security-maintainability-limitations


nn_rmse:  0.45613446616313225
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800
1900
2000
2100
2200
2300
2400
2500
2600
2700
2800
2900
3000
3100
3200
3300
3400
3500
3600
3700
3800
3900
4000
4100
4200
4300
4400
4500
4600
4700
4800
4900


https://scikit-learn.org/stable/modules/model_persistence.html#security-maintainability-limitations


nn_rmse:  0.3810654338349006
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800
1900
2000
2100
2200
2300
2400
2500
2600
2700
2800
2900
3000
3100
3200
3300
3400
3500
3600
3700
3800
3900
4000
4100
4200
4300
4400
4500
4600
4700
4800
4900


https://scikit-learn.org/stable/modules/model_persistence.html#security-maintainability-limitations


nn_rmse:  0.3551863381800801
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800
1900
2000
2100
2200
2300
2400
2500
2600
2700
2800
2900
3000
3100
3200
3300
3400
3500
3600
3700
3800
3900
4000
4100
4200
4300
4400
4500
4600
4700
4800
4900


https://scikit-learn.org/stable/modules/model_persistence.html#security-maintainability-limitations


nn_rmse:  0.3478919552915959


In [None]:
#TESTING IF THIS IS TRULY BETTER THAN THE MLP... CHECK WITH THE NEIGHBOR TEST.