In [4]:
from utils import utils_transferlearn
from direc_dict import direc_dict_train, direc_dict_test

'''
Initializes the preprocessing of the training data, fits the scaler, and trains the MLP.
 

ARGS:
    TRAIN_DATA_FILEPATH     - where the x and y data file is located (x and y live in one file; their columns are selected by the column indices below)
    X_COLS_IDXS             - the column indices of the x data, given as a [start, end] pair (ex. [0,128])
    Y_COLS_IDXS             - the column indices of the y data, given as a [start, end] pair (ex. [128,160])

                              
Returns:
    SAVED transfer learning model (as .joblib), written to SAVE_MODEL_FILEPATH, which can be used for testing

    
Train the MLP on the data

    HIDDEN_LAYER_SIZES      - tuple controlling the number of layers and the size of each layer (ex. (200,200))
    ACTIVATION              - which activation function to use between the layers
    USE_TRAIN_FOR_TEST      - boolean choosing whether the training or the testing dataset is used for later testing below,
                              to compare training vs generalization error (not set in this cell)
    SAVE_MODEL_FILEPATH     - filepath where the MLP model is saved for later testing
'''

# ---- Configuration ---------------------------------------------------------
# Training data file and the column index pairs for inputs (x) and targets (y)
TRAIN_DATA_FILEPATH = direc_dict_train['qm9Cembsvselectrondensity']
X_COLS_IDXS = [0, 128]
Y_COLS_IDXS = [128, 160]

# MLP hyperparameters and where the trained model is written
HIDDEN_LAYER_SIZES = (200, 200)
ACTIVATION = 'relu'  # use 'identity' to just do linear regression
SAVE_MODEL_FILEPATH = 'model.pth'

# ---- Run -------------------------------------------------------------------
# Set up the train/test helper on the training data, then train and save the MLP
model = utils_transferlearn.mlp_train_and_test(
    TRAIN_DATA_FILEPATH,
    X_COLS_IDXS,
    Y_COLS_IDXS,
)
model.train(HIDDEN_LAYER_SIZES, ACTIVATION, SAVE_MODEL_FILEPATH)



------------------------------------------------------------------------------
Model Parameters:
Hidden layer sizes: (200, 200)
Activation function: relu
Solver: adam
Learning rate: constant
Number of iterations: 66
Coefficients: [array([[ 0.14242492, -0.05998186,  0.05267286, ...,  0.07122121,
        -0.24930597,  0.1538223 ],
       [-0.10741874, -0.19395546, -0.06452414, ...,  0.1300351 ,
        -0.19357301,  0.04525733],
       [-0.00749985,  0.10783902,  0.04517314, ...,  0.12729087,
        -0.05979425,  0.14606378],
       ...,
       [ 0.07113627,  0.12168142,  0.02513176, ...,  0.07985682,
         0.06738495,  0.12531532],
       [ 0.10015858,  0.09549639, -0.02424084, ..., -0.09378142,
         0.13920644,  0.02562449],
       [ 0.0264411 ,  0.04887815,  0.10058643, ..., -0.0332292 ,
        -0.09088604,  0.02206737]]), array([[-0.15910021,  0.04021626,  0.09674286, ..., -0.07685791,
        -0.06925091, -0.06258229],
       [-0.29150382, -0.0080021 , -0.09417362, ..., -0.

In [8]:
from utils import utils_transferlearn
# Imported here too, so this cell does not rely on the training cell having
# been executed first in the same kernel.
from direc_dict import direc_dict_train, direc_dict_test

'''
Last modified: 2024/10/09

Re-initializes the preprocessing on the training data (fitting the scaler), then
loads the saved model and predicts on an outside test set.
 

ARGS:
    TRAIN_DATA_FILEPATH     - where the training x and y data file is located (one file; x and y columns are selected by the column indices below)
    X_COLS_IDXS             - the column indices of the x data, given as a [start, end] pair
    Y_COLS_IDXS             - the column indices of the y data, given as a [start, end] pair
    MODEL_FILEPATH          - filepath of the model saved by the training cell

                              
Returns:
    predictions             - the value returned by model.test for the test set; its length is printed below

    
Introduce outside test set for predictions

    Args:
        TEST_DATA_FILEPATH          ---- where the input molecular embedding file is located, use the direc_dict for previously defined filepaths
        X_COLS_IDXS                 ---- the range of columns representing the embeddings vector
        Y_COLS_IDXS                 ---- the range of columns representing the Y target, if it exists for this set, as it can be a test set with unknown Y
        Y_EXISTS                    ---- this could be a test set with unknown Y, thus Y does not have to exist; we are not fitting, just using the model
        LABEL_EXISTS                ---- presumably whether the test file carries label columns -- TODO confirm against utils_transferlearn
        LABEL_COL_IDXS              ---- presumably the [start, end] column pair of those labels -- TODO confirm against utils_transferlearn
        COL_TO_ISOLATE              ---- if you want to isolate embeddings based on a specific column name
        ROW_KEY_ISOLATE             ---- the key value to isolate the embeddings you want
        CENTER_INDICES_FILEPATH     ---- (previously documented as INDICES_FOR_NMR) predefined indices for each molecule
                                         [[mol1_index1,mol1_index2],[mol2_index1,mol2_index2,...],[mol3_index1,...]]
                                         that you want to isolate for predictions; None here -- TODO confirm the expected file format
        CENTERS                     ---- passed through to model.test; meaning not documented here -- TODO confirm
        NUMBER_CENTERS              ---- number of molecules in the dataset

'''

#Args
TRAIN_DATA_FILEPATH = direc_dict_train['qm9Cembsvselectrondensity']
TEST_DATA_FILEPATH = direc_dict_test['qm9Cembsvselectrondensity']
X_COLS_IDXS = [0,128]
Y_COLS_IDXS = [128,160]

# Target / label availability in the test file
Y_EXISTS = True
LABEL_EXISTS = False
LABEL_COL_IDXS = [128,139]

# Row selection: keep rows whose COL_TO_ISOLATE equals ROW_KEY_ISOLATE
COL_TO_ISOLATE = 'atomic_number'
ROW_KEY_ISOLATE = 6

CENTER_INDICES_FILEPATH = None
CENTERS = (0,1)
NUMBER_CENTERS = 186


MODEL_FILEPATH = 'model.pth'
#initialize model with the standard scaler used in training
model = utils_transferlearn.mlp_train_and_test(TRAIN_DATA_FILEPATH,X_COLS_IDXS,Y_COLS_IDXS)
#test model with testing dataset (arguments are positional; keep this order)
predictions = model.test(
    TEST_DATA_FILEPATH,
    MODEL_FILEPATH,
    CENTER_INDICES_FILEPATH,
    CENTERS,
    NUMBER_CENTERS,
    COL_TO_ISOLATE,
    Y_EXISTS,
    ROW_KEY_ISOLATE,
    LABEL_EXISTS,
    LABEL_COL_IDXS,
)

print(len(predictions))



RMSE: 0.017530618934089453
R^2: 0.7650212716234367
variance on True: 1.1419317997029024
Variance on Pred: 1.1404269103730187
22924


In [None]:

# Write the predictions from the testing cell to disk (index=False is passed through to to_csv)
PREDICTIONS_CSV_FILEPATH = 'Cembs2occspectrum_preds.csv'
predictions.to_csv(PREDICTIONS_CSV_FILEPATH, index=False)

In [5]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error, r2_score
import math

'''
(temp, for NMR)
Isolating a Y_true from another file and comparing it against `predictions`.

Rows are restricted to 250 <= mol_index < 5000 (the filter applied below).
NOTE(review): this header previously read "for C-NMR (gas) values at 5000-6000 of QM9",
which matches neither the filter nor the file being read -- confirm which is intended.

Requires `predictions` (produced by the testing cell) to already exist in the kernel.
'''


x_data = pd.read_csv('../data/embs/embsdensityince/20240910carbonwtotalED/CembsVSfullocc250to5000NNmodel1/CembsVSfullocc.csv', delimiter=',')

# Build the row filter once and reuse it for both the target and the label columns
in_range = (x_data['mol_index'] >= 250) & (x_data['mol_index'] < 5000)
selected_rows = x_data[in_range]
y_true = selected_rows.iloc[:,159:160]     # single target column (position 159)
fg_label = selected_rows.iloc[:,160:170]   # the ten columns following the target

print(len(y_true))
print(len(predictions))

# Fail early with a readable message instead of a shape error further down
if len(y_true) != len(predictions):
    raise ValueError(
        'y_true has {} rows but predictions has {}; both must cover the same rows to be compared'.format(
            len(y_true), len(predictions)
        )
    )

# y_true holds exactly one column, so score it against the same single prediction
# column that is paired with it in `out` below.
# NOTE(review): confirm that prediction column 0 is the one corresponding to this target.
mse = mean_squared_error(y_true.iloc[:,0:1], predictions.iloc[:,0:1])
print(math.sqrt(mse))

# Columns of `out`: prediction, true value, then the label columns
out = np.column_stack((predictions.iloc[:,0:1],y_true.iloc[:,0:1].values,fg_label))

22917


NameError: name 'predictions' is not defined

In [6]:

# Dump the stacked comparison array `out` to a comma-separated text file
np.savetxt(fname='Cembs2occs_predvstrue.csv', X=out, delimiter=',')