In [8]:
import pandas as pd
import numpy as np
import random
import json

from sklearn.ensemble import RandomForestRegressor
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
import torch

import utils

from rdkit import rdBase
rdBase.DisableLog('rdApp.error') 

from dotenv import load_dotenv
load_dotenv()

import sys
import os
parent_dir = os.path.abspath(os.path.join(os.getcwd(), "../.."))
sys.path.append(parent_dir)

from model.model import GNNFingerprint3D

In [9]:
data_path = os.getenv("DATA_PATH")
models_path = os.getenv("MODELS_PATH")

columns = ["zpve", "mu", "energy_U0"]

In [10]:
def read_data(db_name, size):
    path = os.path.join(data_path, db_name)
    all_files = [ff for f in os.listdir(path) if os.path.isfile(ff := os.path.join(path, f))]

    random_files = random.sample(all_files, size)
    l = []
    for i, file in enumerate(random_files):
        with open(file, 'r') as f:
            data = json.load(f)
        rec = {"smiles": data["smiles"], "conf": data["atoms"]}
        for col in columns:
            rec[col] = data[col]
        l.append(rec)

    return pd.DataFrame(l)

In [11]:
def get_score(X_train, y_train, X_test, y_test):
    s_scaler = StandardScaler()
    X_train = s_scaler.fit_transform(X_train)
    X_test = s_scaler.transform(X_test)

    # PCA to 167 dim
    pca = PCA(n_components=167)
    X_train = pca.fit_transform(X_train)
    X_test = pca.transform(X_test)
    
    model = RandomForestRegressor()
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    
    print("Regression Metrics:")
    print(f"MAE:  {mean_absolute_error(y_test, y_pred):.4f}")
    print(f"RMSE: {np.sqrt(mean_squared_error(y_test, y_pred)):.4f}")
    print(f"R²:   {r2_score(y_test, y_pred):.4f}")

In [13]:
fingerprint_model_2D = GNNFingerprint3D(13, 5)
fingerprint_model_2D.load_state_dict(torch.load(os.path.join(os.getenv("MODELS_PATH"), "GNN_MORE_WEIGHT_3D.pth")))
fingerprint_model_2D = fingerprint_model_2D.to("cuda")
fingerprint_model_2D.eval()

fingerprint_model_3D = GNNFingerprint3D(13, 5)
fingerprint_model_3D.load_state_dict(torch.load(os.path.join(os.getenv("MODELS_PATH"), "FINAL_GNN.pth")))
fingerprint_model_3D = fingerprint_model_3D.to("cuda")
fingerprint_model_3D.eval()

with open(os.path.join(data_path, "means_and_stds.json")) as f:
    scaler = json.load(f)

  fingerprint_model_2D.load_state_dict(torch.load(os.path.join(os.getenv("MODELS_PATH"), "GNN_MORE_WEIGHT_3D.pth")))
  fingerprint_model_3D.load_state_dict(torch.load(os.path.join(os.getenv("MODELS_PATH"), "FINAL_GNN.pth")))


In [14]:
df = read_data("qm9_data_json", 20000)

In [16]:
for col in columns:
    X = df[["smiles", "conf"]]
    y = df[col]

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, shuffle=True)

    X_train = np.array(X_train)
    X_test = np.array(X_test)

    train_filtered = [(s, y) for s, y in zip(X_train, y_train) if getattr(utils, "is_valid_smiles")(s[0])]
    X_train, y_train = zip(*train_filtered) if train_filtered else ([], [])
    test_filtered = [(s, y) for s, y in zip(X_test, y_test) if getattr(utils, "is_valid_smiles")(s[0])]
    X_test, y_test = zip(*test_filtered) if test_filtered else ([], [])

    X_train, y_train = list(X_train), list(y_train)
    X_test, y_test = list(X_test), list(y_test)

    print("============================================")
    print(col)
    print("============================================")
    for fingerprint in ("ecfp", "maccs", "rdkit", "rdf", "random", "gnn_fp_2d", "gnn_fp_3d"):
        func_name = "smiles_to_" + fingerprint

        if "gnn_fp" in fingerprint:
            func_name = "smiles_to_3D"
            if fingerprint == "gnn_fp_2d":
                X_train_prep = np.array([getattr(utils, func_name)(smiles, fingerprint_model_2D, scaler).detach().cpu() for smiles in X_train])
                X_test_prep = np.array([getattr(utils, func_name)(smiles, fingerprint_model_2D, scaler).detach().cpu() for smiles in X_test])
            else:
                X_train_prep = np.array([getattr(utils, func_name)(smiles, fingerprint_model_3D, scaler).detach().cpu() for smiles in X_train])
                X_test_prep = np.array([getattr(utils, func_name)(smiles, fingerprint_model_3D, scaler).detach().cpu() for smiles in X_test])

            rows_all_nan_train = np.isnan(X_train_prep).all(axis=1)
            nan_indices_train = np.where(rows_all_nan_train)[0]
            rows_all_nan_test = np.isnan(X_test_prep).all(axis=1)
            nan_indices_test = np.where(rows_all_nan_test)[0]

            X_train_prep = np.delete(X_train_prep, nan_indices_train, axis=0)
            y_train_prep = np.delete(y_train, nan_indices_train, axis=0)
            X_test_prep = np.delete(X_test_prep, nan_indices_test, axis=0)
            y_test_prep = np.delete(y_test, nan_indices_test, axis=0)

        elif fingerprint == "rdf":
            X_train_prep = np.array([getattr(utils, func_name)(smiles) for smiles in X_train])
            X_test_prep = np.array([getattr(utils, func_name)(smiles) for smiles in X_test])

        else:
            X_train_prep = np.array([getattr(utils, func_name)(smiles[0]) for smiles in X_train])
            y_train_prep = y_train
            X_test_prep = np.array([getattr(utils, func_name)(smiles[0]) for smiles in X_test])
            y_test_prep = y_test

        print(fingerprint.upper())
        get_score(X_train_prep, y_train_prep, X_test_prep, y_test_prep)

zpve
ECFP
Regression Metrics:
MAE:  4193.8219
RMSE: 6440.2360
R²:   0.4431
MACCS
Regression Metrics:
MAE:  3781.4545
RMSE: 5802.8720
R²:   0.5479
RDKIT
Regression Metrics:
MAE:  4260.6341
RMSE: 6533.1617
R²:   0.4269
RDF
Regression Metrics:
MAE:  3136.0895
RMSE: 5218.1469
R²:   0.6344
RANDOM
Regression Metrics:
MAE:  5936.2329
RMSE: 8656.9265
R²:   -0.0063
GNN_FP_2D
Regression Metrics:
MAE:  3747.6181
RMSE: 5958.4773
R²:   0.5233
GNN_FP_3D
Regression Metrics:
MAE:  2732.8465
RMSE: 4053.1615
R²:   0.7794
mu
ECFP
Regression Metrics:
MAE:  0.2484
RMSE: 0.5711
R²:   0.1070
MACCS
Regression Metrics:
MAE:  0.2418
RMSE: 0.9331
R²:   -1.3837
RDKIT
Regression Metrics:
MAE:  0.2597
RMSE: 1.2089
R²:   -3.0013
RDF
Regression Metrics:
MAE:  0.2346
RMSE: 1.8606
R²:   -8.4784
RANDOM
Regression Metrics:
MAE:  0.2984
RMSE: 0.6111
R²:   -0.0225
GNN_FP_2D
Regression Metrics:
MAE:  0.2266
RMSE: 0.5105
R²:   0.2864
GNN_FP_3D
Regression Metrics:
MAE:  0.1598
RMSE: 0.3789
R²:   0.6069
energy_U0
ECFP
Regressi