In [1]:
# Tabular / numerical stack and the scikit-learn MSE metric.
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error

# PyTorch core.
import torch
from torch import nn, optim, Tensor
# NOTE: the `DataLoader` bound here is rebound a few lines below by the
# `torch_geometric.loader` import of the same name, so every later use of
# `DataLoader` in this notebook refers to the torch_geometric class.
from torch.utils.data import Dataset, DataLoader
from torch.nn import MSELoss, Linear, ReLU, Dropout
import torch.nn.functional as F

# PyTorch Geometric (graph data containers, loader, GCN layer, pooling).
import torch_geometric
from torch_geometric.data import Data
from torch_geometric.loader import DataLoader
from torch_geometric.nn import GCNConv
import torch_geometric.transforms as T
# Alias map (easy to misread): gsp = global_add_pool, gmp = global_mean_pool,
# gap = global_max_pool.
from torch_geometric.nn import global_add_pool as gsp, global_mean_pool as gmp, global_max_pool as gap

# RDKit cheminformatics toolkit.
from rdkit import Chem, DataStructs
from rdkit.Chem import Descriptors as desc
from rdkit.Chem import AllChem, Crippen, Lipinski, Draw

# joblib persistence helpers plus the project-local `Environment` module
# (featurization helpers, metric helpers, model classes and validation loop).
from joblib import dump, load
from Environment import one_hot_encoding_unk, featurization_parameters, atom_features, process_single_SMILES, SMILES_data_process
from Environment import calculate_val_metrics, save_metrics_to_txt
from Environment import MT_FinGCN, MT_GCN, MT_GCN_non_SF, validation_GCN
# Run on the GPU when torch reports CUDA as available, otherwise on the CPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

In [2]:
# Load the persisted feature/target scalers and build the DataLoader that the
# model cells below run predictions on.
Metric_path = './utils/Regression_multi_task/'
scaler_x = load(Metric_path + 'Regression_multitask_x_scaler.joblib')
scaler_y = load(Metric_path + 'Regression_multitask_y_scaler.joblib')


# Input features: every column after the first is scaled with scaler_x.
file_path = './dataset/data_pre/Result_data_pre_include_fingerprints/'
x_pre = pd.read_csv(file_path + '07_Concatenated_data.csv')
X_pre = scaler_x.transform(x_pre.iloc[:,1:]) #11 solvent descriptors and 2065 fingerprints
X_pre = pd.DataFrame(X_pre)

# Targets: columns 4..9 of the experimental table are scaled with scaler_y.
y_pre = pd.read_csv('./dataset/data_pre/DATA_pre_use_to_predict_142.csv')
Y_pre = scaler_y.transform(y_pre.iloc[:,4:10])   #6 real experimental values
Y_pre = pd.DataFrame(Y_pre)

# pd.concat(axis=1) aligns on the index and pads missing rows with NaN, so a
# row-count mismatch between the two CSV files would silently produce a
# misaligned table. Fail loudly instead.
assert len(x_pre) == len(y_pre), (
    f"feature table has {len(x_pre)} rows but target table has {len(y_pre)} rows"
)

# Column layout: first column of x_pre (presumably the SMILES string consumed
# by SMILES_data_process -- confirm in Environment), then scaled targets, then
# scaled features.
combined_data_pre = pd.concat([x_pre.iloc[:,0], Y_pre, X_pre], axis=1)
print(combined_data_pre.shape)

feature_params = featurization_parameters()
pre_processed_data = SMILES_data_process(combined_data_pre)
# shuffle=False keeps the batches in the same order as the processed data.
pre_loader = DataLoader(pre_processed_data,batch_size=32,shuffle=False)

(142, 2083)


In [3]:
#MT_FinGCN
# Restore the saved MT_FinGCN weights, predict on pre_loader, write the
# predictions/targets (in original units) and the metrics to disk.
model = MT_FinGCN().to(DEVICE)
# optimizer / loss_function are not referenced again in this cell; they are
# kept only so the names stay defined exactly as before.
optimizer = optim.Adam(model.parameters(), lr=1e-3)
loss_function = MSELoss()
# map_location makes the checkpoint loadable on a CPU-only machine even when it
# was saved from a CUDA device (DEVICE falls back to "cpu" in that case).
model.load_state_dict(torch.load(Metric_path + 'Regression_MT_FinGCN_best_model.pt', map_location=DEVICE))
# Put the network in inference mode before predicting; this is a no-op if
# validation_GCN already switches to eval mode itself.
model.eval()

Y_pred_pre, Y_origin_pre, mask_pre, loss_pre = validation_GCN(model, pre_loader, DEVICE)

# Undo the target scaling so files and metrics are in the original units.
Y_origin_pre = scaler_y.inverse_transform(Y_origin_pre)
Y_pred_pre = scaler_y.inverse_transform(Y_pred_pre)

output_path = './Result/regressor/GCN/'
np.savetxt(output_path + 'Y_pred_MT_FinGCN.csv', Y_pred_pre, delimiter=',')
np.savetxt(output_path + 'Y_origin_pre.csv', Y_origin_pre, delimiter=',')

pre_metrics = calculate_val_metrics(Y_pred_pre, Y_origin_pre, mask_pre)
for key, value in pre_metrics.items():
    print(f"The test set of {key}: {value:.4f}")

save_metrics_to_txt(pre_metrics, output_path + 'pre_metrics_MT_FinGCN.txt')

validation finished!
validation_loss: 0.2108
The test set of RMSE_0: 35.5583
The test set of RMSE_1: 33.5476
The test set of RMSE_2: 20.1375
The test set of RMSE_3: 0.2155
The test set of RMSE_4: 0.1812


In [4]:
#MT_GCN
# Restore the saved MT_GCN weights, predict on pre_loader, write the
# predictions (in original units) and the metrics to disk.
model = MT_GCN().to(DEVICE)
# optimizer / loss_function are not referenced again in this cell; they are
# kept only so the names stay defined exactly as before.
optimizer = optim.Adam(model.parameters(), lr=1e-3)
loss_function = MSELoss()
# map_location makes the checkpoint loadable on a CPU-only machine even when it
# was saved from a CUDA device (DEVICE falls back to "cpu" in that case).
model.load_state_dict(torch.load(Metric_path + 'Regression_MT_GCN_best_model.pt', map_location=DEVICE))
# Put the network in inference mode before predicting; this is a no-op if
# validation_GCN already switches to eval mode itself.
model.eval()

Y_pred_pre, Y_origin_pre, mask_pre, loss_pre = validation_GCN(model, pre_loader, DEVICE)

# Undo the target scaling so files and metrics are in the original units.
Y_origin_pre = scaler_y.inverse_transform(Y_origin_pre)
Y_pred_pre = scaler_y.inverse_transform(Y_pred_pre)

output_path = './Result/regressor/GCN/'
np.savetxt(output_path + 'Y_pred_MT_GCN.csv', Y_pred_pre, delimiter=',')

pre_metrics = calculate_val_metrics(Y_pred_pre, Y_origin_pre, mask_pre)
for key, value in pre_metrics.items():
    print(f"The test set of {key}: {value:.4f}")

save_metrics_to_txt(pre_metrics, output_path + 'pre_metrics_MT_GCN.txt')

validation finished!
validation_loss: 0.3092
The test set of RMSE_0: 53.4722
The test set of RMSE_1: 39.9070
The test set of RMSE_2: 37.3870
The test set of RMSE_3: 0.2543
The test set of RMSE_4: 0.1636


In [5]:
#MT_GCN_non_SF
# Restore the saved MT_GCN_non_SF weights, predict on pre_loader, write the
# predictions (in original units) and the metrics to disk.
model = MT_GCN_non_SF().to(DEVICE)
# optimizer / loss_function are not referenced again in this cell; they are
# kept only so the names stay defined exactly as before.
optimizer = optim.Adam(model.parameters(), lr=1e-3)
loss_function = MSELoss()
# map_location makes the checkpoint loadable on a CPU-only machine even when it
# was saved from a CUDA device (DEVICE falls back to "cpu" in that case).
model.load_state_dict(torch.load(Metric_path + 'Regression_MT_GCN_non_SF_best_model.pt', map_location=DEVICE))
# Put the network in inference mode before predicting; this is a no-op if
# validation_GCN already switches to eval mode itself.
model.eval()

Y_pred_pre, Y_origin_pre, mask_pre, loss_pre = validation_GCN(model, pre_loader, DEVICE)

# Undo the target scaling so files and metrics are in the original units.
Y_origin_pre = scaler_y.inverse_transform(Y_origin_pre)
Y_pred_pre = scaler_y.inverse_transform(Y_pred_pre)

output_path = './Result/regressor/GCN/'
np.savetxt(output_path + 'Y_pred_MT_GCN_non_SF.csv', Y_pred_pre, delimiter=',')

pre_metrics = calculate_val_metrics(Y_pred_pre, Y_origin_pre, mask_pre)
for key, value in pre_metrics.items():
    print(f"The test set of {key}: {value:.4f}")

save_metrics_to_txt(pre_metrics, output_path + 'pre_metrics_MT_GCN_non_SF.txt')

validation finished!
validation_loss: 0.4114
The test set of RMSE_0: 64.7614
The test set of RMSE_1: 53.1163
The test set of RMSE_2: 41.0917
The test set of RMSE_3: 0.3196
The test set of RMSE_4: 0.1533
