In [7]:
import pandas as pd
import numpy as np
import json
import random
from concurrent.futures import ThreadPoolExecutor
from functools import partial

from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
import torch

from rdkit import rdBase
rdBase.DisableLog('rdApp.error') 

from dotenv import load_dotenv
load_dotenv()

import sys
import os
parent_dir = os.path.abspath(os.path.join(os.getcwd(), "../.."))
sys.path.append(parent_dir)

from model.model import GNNFingerprint3D
from hypotheses.compare_with_other_fingerprints.utils import smiles_to_3D

In [8]:
data_path = os.getenv("DATA_PATH")
models_path = os.getenv("MODELS_PATH")

columns = ["mu", "zpve", "energy_U0"]

In [9]:
small_3D_impact_model = GNNFingerprint3D(13, 5)
small_3D_impact_model.load_state_dict(torch.load(os.path.join(models_path, "GNN_MORE_WEIGHT_3D.pth")))
small_3D_impact_model = small_3D_impact_model.to("cuda")
small_3D_impact_model.eval()

huge_3D_impact_model = GNNFingerprint3D(13, 5)
huge_3D_impact_model.load_state_dict(torch.load(os.path.join(models_path, "FINAL_GNN.pth")))
huge_3D_impact_model = huge_3D_impact_model.to("cuda")
huge_3D_impact_model.eval()

with open(os.path.join(data_path, "means_and_stds.json")) as f:
    scaler = json.load(f)

  small_3D_impact_model.load_state_dict(torch.load(os.path.join(models_path, "GNN_MORE_WEIGHT_3D.pth")))
  huge_3D_impact_model.load_state_dict(torch.load(os.path.join(models_path, "FINAL_GNN.pth")))


### 2D Task

In [10]:
def read_data(path, ki_threshold):
    df = pd.read_csv(path, sep=";")
    df['Activity'] = df['Standard Value'].apply(lambda x: 1 if x < ki_threshold else 0)

    df = df[['Smiles', 'Activity']].dropna()

    X_train, X_test, y_train, y_test = train_test_split(
        df['Smiles'], df['Activity'], test_size=0.2, random_state=42
    )

    return X_train.reset_index(drop=True), X_test.reset_index(drop=True), y_train.reset_index(drop=True), y_test.reset_index(drop=True)

In [11]:
def get_score(X_train, y_train, X_test, y_test, task = "c"):
    s_scaler = StandardScaler()
    X_train = s_scaler.fit_transform(X_train)
    X_test = s_scaler.transform(X_test)

    # PCA to 167 dim
    pca = PCA(n_components=167)
    X_train = pca.fit_transform(X_train)
    X_test = pca.transform(X_test)

    if task == "c":
        model = RandomForestClassifier()
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        print("Classification Report:")
        print(classification_report(y_test, y_pred))

    else:
        model = RandomForestRegressor()
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        
        print("Regression Metrics:")
        print(f"MAE:  {mean_absolute_error(y_test, y_pred):.4f}")
        print(f"RMSE: {np.sqrt(mean_squared_error(y_test, y_pred)):.4f}")
        print(f"R²:   {r2_score(y_test, y_pred):.4f}")

In [6]:
folder = os.path.join(data_path, "CHEMBL")
device = "cuda" if torch.cuda.is_available() else "cpu"

for db in os.listdir(folder):
    print("============================================")
    print(db)
    print("============================================")

    data = os.path.join(folder, db)
    X_train, X_test, y_train, y_test = read_data(data, 100)

    X_train, y_train = list(X_train), list(y_train)
    X_test, y_test = list(X_test), list(y_test)

    ### SMALL 3D IMPACT
    print("****************")
    print("SMALL 3D IMPACT")
    print("****************")
    X_train_small_3D = np.array([smiles_to_3D(smiles, small_3D_impact_model, scaler, False).detach().cpu() for smiles in X_train])
    X_test_small_3D = np.array([smiles_to_3D(smiles, small_3D_impact_model, scaler, False).detach().cpu() for smiles in X_test])

    rows_all_nan_train = np.isnan(X_train_small_3D).all(axis=1)
    nan_indices_train = np.where(rows_all_nan_train)[0]

    rows_all_nan_test = np.isnan(X_test_small_3D).all(axis=1)
    nan_indices_test = np.where(rows_all_nan_test)[0]

    X_train_small_3D = np.delete(X_train_small_3D, nan_indices_train, axis=0)
    y_train_small_3D = np.delete(y_train, nan_indices_train, axis=0)
    X_test_small_3D = np.delete(X_test_small_3D, nan_indices_test, axis=0)
    y_test_small_3D = np.delete(y_test, nan_indices_test, axis=0)

    get_score(X_train_small_3D, y_train_small_3D, X_test_small_3D, y_test_small_3D)

    ### HUGE 3D IMPACT
    print("****************")
    print("HUGE 3D IMPACT")
    print("*****************")

    X_train_huge_3D = np.array([smiles_to_3D(smiles, huge_3D_impact_model, scaler, False).detach().cpu() for smiles in X_train])
    X_test_huge_3D = np.array([smiles_to_3D(smiles, huge_3D_impact_model, scaler, False).detach().cpu() for smiles in X_test])

    rows_all_nan_train = np.isnan(X_train_huge_3D).all(axis=1)
    nan_indices_train = np.where(rows_all_nan_train)[0]

    rows_all_nan_test = np.isnan(X_test_huge_3D).all(axis=1)
    nan_indices_test = np.where(rows_all_nan_test)[0]

    X_train_huge_3D = np.delete(X_train_huge_3D, nan_indices_train, axis=0)
    y_train_huge_3D = np.delete(y_train, nan_indices_train, axis=0)
    X_test_huge_3D = np.delete(X_test_huge_3D, nan_indices_test, axis=0)
    y_test_huge_3D = np.delete(y_test, nan_indices_test, axis=0)

    get_score(X_train_huge_3D, y_train_huge_3D, X_test_huge_3D, y_test_huge_3D)

CHEMBL1833_5HT2B.csv
****************
SMALL 3D IMPACT
****************
Classification Report:
              precision    recall  f1-score   support

           0       0.84      0.98      0.90       408
           1       0.86      0.45      0.59       134

    accuracy                           0.85       542
   macro avg       0.85      0.71      0.75       542
weighted avg       0.85      0.85      0.83       542

****************
HUGE 3D IMPACT
*****************
Classification Report:
              precision    recall  f1-score   support

           0       0.81      0.99      0.89       408
           1       0.93      0.30      0.45       134

    accuracy                           0.82       542
   macro avg       0.87      0.65      0.67       542
weighted avg       0.84      0.82      0.78       542

CHEMBL214_5HT1A.csv
****************
SMALL 3D IMPACT
****************




Classification Report:
              precision    recall  f1-score   support

           0       0.74      0.69      0.71       549
           1       0.76      0.80      0.78       673

    accuracy                           0.75      1222
   macro avg       0.75      0.75      0.75      1222
weighted avg       0.75      0.75      0.75      1222

****************
HUGE 3D IMPACT
*****************




Classification Report:
              precision    recall  f1-score   support

           0       0.71      0.62      0.66       549
           1       0.72      0.79      0.75       673

    accuracy                           0.71      1222
   macro avg       0.71      0.70      0.71      1222
weighted avg       0.71      0.71      0.71      1222

CHEMBL224_5HT2A.csv
****************
SMALL 3D IMPACT
****************
Classification Report:
              precision    recall  f1-score   support

           0       0.80      0.94      0.87       704
           1       0.89      0.70      0.78       532

    accuracy                           0.83      1236
   macro avg       0.85      0.82      0.82      1236
weighted avg       0.84      0.83      0.83      1236

****************
HUGE 3D IMPACT
*****************
Classification Report:
              precision    recall  f1-score   support

           0       0.73      0.91      0.81       704
           1       0.83      0.55      0.66     

### 3D TASK

In [12]:
def read_data_3D(db_name, size):
    path = os.path.join(data_path, db_name)
    all_files = [ff for f in os.listdir(path) if os.path.isfile(ff := os.path.join(path, f))]

    random_files = random.sample(all_files, size)
    l = []
    for i, file in enumerate(random_files):
        with open(file, 'r') as f:
            data = json.load(f)
        rec = {"smiles": data["smiles"], "conf": data["atoms"], "homo": data["homo"], "lumo": data["lumo"]}
        for col in columns:
            rec[col] = data[col]
        l.append(rec)

    return pd.DataFrame(l)

In [13]:
df = read_data_3D("qm9_data_json", 20000)
device = "cuda" if torch.cuda.is_available() else "cpu"

for col in columns:
    print("============================================")
    print(col)
    print("============================================")

    X = df[["smiles", "conf", "homo", "lumo"]]
    y = df[col]

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, shuffle=True)

    X_train = np.array(X_train)
    X_test = np.array(X_test)
    
    ### SMALL 3D IMPACT
    print("****************")
    print("SMALL 3D IMPACT")
    print("****************")
    X_train_small_3D = np.array([smiles_to_3D(smiles, small_3D_impact_model, scaler).detach().cpu() for smiles in X_train])
    X_test_small_3D = np.array([smiles_to_3D(smiles, small_3D_impact_model, scaler).detach().cpu() for smiles in X_test])

    rows_all_nan_train = np.isnan(X_train_small_3D).all(axis=1)
    nan_indices_train = np.where(rows_all_nan_train)[0]

    rows_all_nan_test = np.isnan(X_test_small_3D).all(axis=1)
    nan_indices_test = np.where(rows_all_nan_test)[0]

    X_train_small_3D = np.delete(X_train_small_3D, nan_indices_train, axis=0)
    y_train_small_3D = np.delete(y_train, nan_indices_train, axis=0)
    X_test_small_3D = np.delete(X_test_small_3D, nan_indices_test, axis=0)
    y_test_small_3D = np.delete(y_test, nan_indices_test, axis=0)

    get_score(X_train_small_3D, y_train_small_3D, X_test_small_3D, y_test_small_3D, "r")

    ### HUGE 3D IMPACT
    print("****************")
    print("HUGE 3D IMPACT")
    print("*****************")

    X_train_huge_3D = np.array([smiles_to_3D(smiles, huge_3D_impact_model, scaler).detach().cpu() for smiles in X_train])
    X_test_huge_3D = np.array([smiles_to_3D(smiles, huge_3D_impact_model, scaler).detach().cpu() for smiles in X_test])

    rows_all_nan_train = np.isnan(X_train_huge_3D).all(axis=1)
    nan_indices_train = np.where(rows_all_nan_train)[0]

    rows_all_nan_test = np.isnan(X_test_huge_3D).all(axis=1)
    nan_indices_test = np.where(rows_all_nan_test)[0]

    X_train_huge_3D = np.delete(X_train_huge_3D, nan_indices_train, axis=0)
    y_train_huge_3D = np.delete(y_train, nan_indices_train, axis=0)
    X_test_huge_3D = np.delete(X_test_huge_3D, nan_indices_test, axis=0)
    y_test_huge_3D = np.delete(y_test, nan_indices_test, axis=0)

    get_score(X_train_huge_3D, y_train_huge_3D, X_test_huge_3D, y_test_huge_3D, "r")


mu
****************
SMALL 3D IMPACT
****************
Regression Metrics:
MAE:  0.2237
RMSE: 0.4979
R²:   0.2343
****************
HUGE 3D IMPACT
*****************
Regression Metrics:
MAE:  0.1634
RMSE: 0.3812
R²:   0.5512
zpve
****************
SMALL 3D IMPACT
****************
Regression Metrics:
MAE:  3655.8828
RMSE: 5839.9271
R²:   0.5289
****************
HUGE 3D IMPACT
*****************
Regression Metrics:
MAE:  2683.6034
RMSE: 4170.7000
R²:   0.7597
energy_U0
****************
SMALL 3D IMPACT
****************
Regression Metrics:
MAE:  367.7953
RMSE: 1437.1622
R²:   0.2875
****************
HUGE 3D IMPACT
*****************
Regression Metrics:
MAE:  182.1129
RMSE: 1017.1632
R²:   0.6431
