In [1]:
import pandas as pd
import numpy as np
import random
import json

from sklearn.ensemble import RandomForestRegressor
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
import torch

import utils

from rdkit import rdBase
rdBase.DisableLog('rdApp.error') 

from dotenv import load_dotenv
load_dotenv()

import sys
import os
parent_dir = os.path.abspath(os.path.join(os.getcwd(), "../.."))
sys.path.append(parent_dir)

from model.model import GNNFingerprint3D

In [2]:
# Filesystem roots are read from the environment; each is None when its
# variable is not set.
data_path, models_path = (
    os.environ.get(env_key) for env_key in ("DATA_PATH", "MODELS_PATH")
)

# Regression targets: each one is evaluated against every fingerprint type.
columns = [
    "mu",
    "zpve",
    "rcA",
    "rcB",
    "rcC",
    "energy_U0",
    "Cv",
    "alpha",
]

In [3]:
def read_data(db_name, size, seed=None, base_dir=None, target_columns=None):
    """Load a random sample of per-record JSON files into a DataFrame.

    Parameters
    ----------
    db_name : str
        Directory (inside ``base_dir``) that holds one JSON file per record.
    size : int
        Number of files to sample without replacement.
    seed : int, optional
        Seed for a private ``random.Random``. The same seed over the same
        directory content yields the same sample. When ``None`` the global
        ``random`` state is used, exactly as before.
    base_dir : str, optional
        Root data directory. Defaults to the module-level ``data_path``.
    target_columns : sequence of str, optional
        JSON keys copied into the frame next to ``smiles``/``conf``.
        Defaults to the module-level ``columns``.

    Returns
    -------
    pandas.DataFrame
        One row per sampled file with the columns ``smiles``, ``conf``
        (the file's ``atoms`` entry) and every requested target column.

    Raises
    ------
    ValueError
        If ``size`` is negative or exceeds the number of available files.
    KeyError
        If a sampled file lacks ``smiles``, ``atoms`` or a target column.
    """
    root = os.path.join(data_path if base_dir is None else base_dir, db_name)
    wanted = columns if target_columns is None else target_columns

    # os.listdir returns entries in arbitrary order; sorting is what makes a
    # seeded draw repeatable across runs and machines.
    candidates = sorted(
        full for name in os.listdir(root)
        if os.path.isfile(full := os.path.join(root, name))
    )
    if not 0 <= size <= len(candidates):
        raise ValueError(
            f"cannot sample {size} files from {root!r}: {len(candidates)} available"
        )

    sampler = random if seed is None else random.Random(seed)

    records = []
    for file_path in sampler.sample(candidates, size):
        with open(file_path, "r", encoding="utf-8") as handle:
            payload = json.load(handle)
        record = {"smiles": payload["smiles"], "conf": payload["atoms"]}
        record.update((key, payload[key]) for key in wanted)
        records.append(record)

    return pd.DataFrame(records)

In [4]:
def get_score(X_train, y_train, X_test, y_test, n_components=167, random_state=None):
    """Fit scaler -> PCA -> random forest on the training data and report test metrics.

    Parameters
    ----------
    X_train, X_test : array-like of shape (n_samples, n_features)
        Feature matrices. The scaler and the PCA are fitted on ``X_train``
        only and then applied to ``X_test``.
    y_train, y_test : array-like of shape (n_samples,)
        Regression targets aligned with the rows of the feature matrices.
    n_components : int, default 167
        Requested PCA dimensionality. It is capped at
        ``min(n_train_samples, n_features)`` because PCA cannot extract more
        components than that and would raise otherwise.
    random_state : int, optional
        Forwarded to both ``PCA`` and ``RandomForestRegressor``. ``None``
        keeps the previous, unseeded behaviour.

    Returns
    -------
    dict
        ``{"mae": ..., "rmse": ..., "r2": ...}`` computed on the test set.
        The same three values are also printed.
    """
    feature_scaler = StandardScaler()
    X_train = feature_scaler.fit_transform(X_train)
    X_test = feature_scaler.transform(X_test)

    # Reduce every fingerprint to the same dimensionality, capped so that
    # narrow fingerprints or small samples do not make PCA raise.
    n_components = min(n_components, *X_train.shape)
    pca = PCA(n_components=n_components, random_state=random_state)
    X_train = pca.fit_transform(X_train)
    X_test = pca.transform(X_test)

    model = RandomForestRegressor(random_state=random_state)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    metrics = {
        "mae": mean_absolute_error(y_test, y_pred),
        "rmse": np.sqrt(mean_squared_error(y_test, y_pred)),
        "r2": r2_score(y_test, y_pred),
    }

    print("Regression Metrics:")
    print(f"MAE:  {metrics['mae']:.4f}")
    print(f"RMSE: {metrics['rmse']:.4f}")
    print(f"R²:   {metrics['r2']:.4f}")

    return metrics

In [5]:
# Use the GPU when one is available and fall back to the CPU otherwise, so the
# notebook also runs on machines without CUDA.
# NOTE(review): utils.smiles_to_3D must put its inputs on the same device as
# the model; confirm it does not hard-code "cuda".
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# NOTE(review): the meaning of the constructor arguments (13, 5) is defined in
# model/model.py and is not visible here; they must match the checkpoint.
fingerprint_model = GNNFingerprint3D(13, 5)

# map_location lets a checkpoint saved on a GPU load on any device.
# weights_only=True restricts unpickling to tensors and plain containers,
# which is all a state_dict needs, and avoids torch.load's unsafe-default warning.
weights_path = os.path.join(models_path, "GNN_MUCH_MORE_WEIGHT_3D.pth")
state_dict = torch.load(weights_path, map_location=device, weights_only=True)
fingerprint_model.load_state_dict(state_dict)
fingerprint_model = fingerprint_model.to(device)
fingerprint_model.eval()  # inference mode: disables dropout / batch-norm updates

# Loaded as-is and later handed to utils.smiles_to_3D together with the model;
# presumably per-feature normalisation statistics, going by the file name.
with open(os.path.join(data_path, "means_and_stds.json"), encoding="utf-8") as f:
    scaler = json.load(f)

  fingerprint_model.load_state_dict(torch.load(os.path.join(os.getenv("MODELS_PATH"), "GNN_MUCH_MORE_WEIGHT_3D.pth")))


In [6]:
# read_data draws its files with random.sample on the global RNG, so seed it
# first: otherwise every kernel restart evaluates a different 20000-molecule
# subset and the reported metrics cannot be reproduced.
# (Repeatability also assumes the directory listing order does not change.)
random.seed(42)
df = read_data("qm9_data_json", 20000)

In [7]:
# Fingerprints depend only on the molecule, never on the regression target,
# and the fixed random_state produces the same split for every target. The
# split, the SMILES validity filter and each featurisation are therefore done
# once and reused for all target columns instead of being redone per column.
X_train, X_test, targets_train, targets_test = train_test_split(
    df[["smiles", "conf"]], df[columns], test_size=0.2, random_state=42, shuffle=True
)

# Object arrays of shape (n, 2): column 0 is the SMILES string, column 1 the conformer.
X_train = np.array(X_train)
X_test = np.array(X_test)


def _valid_mask(rows):
    """Return a boolean mask of the rows whose SMILES passes utils.is_valid_smiles."""
    return np.array([bool(utils.is_valid_smiles(row[0])) for row in rows], dtype=bool)


def _featurize(rows, fingerprint):
    """Compute one fingerprint type for every row.

    Returns ``(features, keep)``. ``keep`` is a boolean mask over ``rows``
    marking the molecules that made it into ``features``; only the "3D"
    fingerprint drops rows (those whose vector is entirely NaN).
    """
    to_fingerprint = getattr(utils, "smiles_to_" + fingerprint)

    if fingerprint == "3D":
        # The learned fingerprint takes the whole (smiles, conf) row plus the model.
        # .numpy() avoids the slow element-wise conversion np.array performs on
        # a list of torch tensors.
        features = np.array([
            to_fingerprint(row, fingerprint_model, scaler).detach().cpu().numpy()
            for row in rows
        ])
        keep = ~np.isnan(features).all(axis=1)
        return features[keep], keep

    if fingerprint == "rdf":
        # The RDF fingerprint also takes the whole (smiles, conf) row.
        features = np.array([to_fingerprint(row) for row in rows])
    else:
        # Every other fingerprint is computed from the SMILES string alone.
        features = np.array([to_fingerprint(row[0]) for row in rows])
    return features, np.ones(len(features), dtype=bool)


train_valid = _valid_mask(X_train)
test_valid = _valid_mask(X_test)
X_train, X_test = X_train[train_valid], X_test[test_valid]
targets_train, targets_test = targets_train[train_valid], targets_test[test_valid]

# fingerprint name -> ((train_features, train_keep), (test_features, test_keep)).
# Filled lazily on the first target column so results still appear in
# column -> fingerprint order. All feature matrices stay in memory for the run.
# Note: the "random" baseline is consequently drawn once and shared by all targets.
feature_cache = {}

for col in columns:
    print("============================================")
    print(col)
    print("============================================")
    for fingerprint in ("ecfp", "maccs", "rdkit", "rdf", "random", "3D"):
        if fingerprint not in feature_cache:
            feature_cache[fingerprint] = (
                _featurize(X_train, fingerprint),
                _featurize(X_test, fingerprint),
            )
        (X_train_prep, keep_train), (X_test_prep, keep_test) = feature_cache[fingerprint]

        # Labels are masked per fingerprint rather than overwritten, so rows
        # dropped for one fingerprint never misalign the labels of another.
        y_train = targets_train[col].to_numpy()[keep_train]
        y_test = targets_test[col].to_numpy()[keep_test]

        print(fingerprint.upper())
        get_score(X_train_prep, y_train, X_test_prep, y_test)

mu
ECFP
Regression Metrics:
MAE:  0.2409
RMSE: 0.5009
R²:   0.1457
MACCS
Regression Metrics:
MAE:  0.2170
RMSE: 0.4275
R²:   0.3779
RDKIT
Regression Metrics:
MAE:  0.2359
RMSE: 0.4854
R²:   0.1977
RDF
Regression Metrics:
MAE:  0.2010
RMSE: 0.4516
R²:   0.3057
RANDOM
Regression Metrics:
MAE:  0.2904
RMSE: 0.5508
R²:   -0.0328
3D
Regression Metrics:
MAE:  0.1617
RMSE: 0.3626
R²:   0.5524
zpve
ECFP
Regression Metrics:
MAE:  4123.7674
RMSE: 6307.6398
R²:   0.4717
MACCS
Regression Metrics:
MAE:  3657.2585
RMSE: 5720.2776
R²:   0.5655
RDKIT
Regression Metrics:
MAE:  4273.4788
RMSE: 6521.6851
R²:   0.4353
RDF
Regression Metrics:
MAE:  3168.4178
RMSE: 5120.4873
R²:   0.6519
RANDOM
Regression Metrics:
MAE:  5888.8125
RMSE: 8705.1584
R²:   -0.0062
3D
Regression Metrics:
MAE:  2737.8402
RMSE: 4088.2374
R²:   0.7781
rcA
ECFP


KeyboardInterrupt: 