In [2]:
import pandas as pd
import numpy as np
import json
import random
from concurrent.futures import ThreadPoolExecutor
from functools import partial

from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
import torch

from rdkit import rdBase
rdBase.DisableLog('rdApp.error') 

from dotenv import load_dotenv
load_dotenv()

import sys
import os
parent_dir = os.path.abspath(os.path.join(os.getcwd(), "../.."))
sys.path.append(parent_dir)

from model.model import GNNFingerprint3D
from hypotheses.compare_with_other_fingerprints.utils import smiles_to_3D

In [3]:
def _require_env(name):
    """Return the value of environment variable ``name``.

    Raises:
        EnvironmentError: if the variable is unset or empty. Failing here gives
            a clear message instead of a later ``TypeError`` from
            ``os.path.join(None, ...)``.
    """
    value = os.getenv(name)
    if not value:
        raise EnvironmentError(
            f"Environment variable {name!r} is not set; define it in the shell "
            "or in the .env file read by load_dotenv()."
        )
    return value


# Root folders for the datasets and the trained model checkpoints.
data_path = _require_env("DATA_PATH")
models_path = _require_env("MODELS_PATH")

# NOTE(review): not referenced by the cells visible here; presumably the target
# property names that match the entries of means_and_stds.json - confirm.
columns = ["mu", "zpve", "energy_U0", "rcA", "rcB", "rcC", "Cv", "alpha"]

In [6]:
# Run on the GPU when one is present, otherwise fall back to the CPU so the
# notebook still executes on machines without CUDA.
device = "cuda" if torch.cuda.is_available() else "cpu"


def _load_fingerprint_model(weights_file):
    """Build a GNNFingerprint3D, load its weights and put it in eval mode.

    Args:
        weights_file: checkpoint file name, resolved relative to ``models_path``.

    Returns:
        The model moved to ``device`` and switched to evaluation mode.
    """
    # Constructor arguments (13, 5) are kept exactly as in the original cell;
    # their meaning is defined in model.model - TODO document here once confirmed.
    model = GNNFingerprint3D(13, 5)
    state_dict = torch.load(
        os.path.join(models_path, weights_file),
        map_location=device,  # checkpoints saved on a GPU must also load on CPU-only hosts
        weights_only=True,    # a state_dict only holds tensors; avoid arbitrary unpickling
    )
    model.load_state_dict(state_dict)
    model = model.to(device)
    model.eval()
    return model


small_3D_impact_model = _load_fingerprint_model("GNN_MORE_WEIGHT_3D.pth")
huge_3D_impact_model = _load_fingerprint_model("GNN_MUCH_MORE_WEIGHT_3D.pth")

# Statistics loaded from means_and_stds.json; passed unchanged to smiles_to_3D.
with open(os.path.join(data_path, "means_and_stds.json"), encoding="utf-8") as f:
    scaler = json.load(f)

  small_3D_impact_model.load_state_dict(torch.load(os.path.join(models_path, "GNN_MORE_WEIGHT_3D.pth")))
  huge_3D_impact_model.load_state_dict(torch.load(os.path.join(models_path, "GNN_MUCH_MORE_WEIGHT_3D.pth")))


### 2D Task

In [4]:
def read_data(path, ki_threshold):
    """Load a ';'-separated activity export and split it into train/test sets.

    A compound is labelled active (1) when its ``Standard Value`` is strictly
    below ``ki_threshold`` and inactive (0) otherwise.

    Rows with a missing SMILES string or a missing / non-numeric
    ``Standard Value`` are discarded *before* labelling. Labelling first would
    silently turn every missing measurement into an "inactive" example,
    because ``NaN < threshold`` evaluates to False.

    Args:
        path: path to the CSV file; must contain the columns ``Smiles`` and
            ``Standard Value``.
        ki_threshold: activity cut-off, in the same unit as ``Standard Value``.

    Returns:
        ``(X_train, X_test, y_train, y_test)`` as pandas Series with a fresh
        0..n-1 index; X holds SMILES strings, y holds the 0/1 labels.
    """
    df = pd.read_csv(path, sep=";")

    df = df[['Smiles', 'Standard Value']].copy()
    # Anything that is not a number becomes NaN and is dropped below.
    df['Standard Value'] = pd.to_numeric(df['Standard Value'], errors='coerce')
    df = df.dropna(subset=['Smiles', 'Standard Value'])

    df['Activity'] = (df['Standard Value'] < ki_threshold).astype(int)

    X_train, X_test, y_train, y_test = train_test_split(
        df['Smiles'], df['Activity'], test_size=0.2, random_state=42
    )

    return (
        X_train.reset_index(drop=True),
        X_test.reset_index(drop=True),
        y_train.reset_index(drop=True),
        y_test.reset_index(drop=True),
    )

In [5]:
def get_score(X_train, y_train, X_test, y_test, task = "c", n_components = 167, random_state = 42):
    """Fit a random forest on standardised, PCA-reduced features and print test metrics.

    Pipeline: StandardScaler -> PCA -> RandomForest. The scaler and the PCA are
    fitted on the training split only and then applied to the test split.

    Args:
        X_train, X_test: 2-D feature arrays (n_samples, n_features).
        y_train, y_test: targets aligned with the rows of X_train / X_test.
        task: ``"c"`` trains a classifier and prints a classification report;
            any other value trains a regressor and prints MAE, RMSE and R².
        n_components: requested PCA dimensionality (167 by default, as in the
            original code). It is clamped to ``min(n_samples, n_features)`` of
            the training split, because PCA cannot extract more components
            than that and would raise on small datasets.
        random_state: seed for PCA and the random forest so that repeated runs
            print the same metrics.

    Returns:
        None. The metrics are printed.
    """
    s_scaler = StandardScaler()
    X_train = s_scaler.fit_transform(X_train)
    X_test = s_scaler.transform(X_test)

    # PCA to (at most) `n_components` dim
    n_components = min(n_components, X_train.shape[0], X_train.shape[1])
    pca = PCA(n_components=n_components, random_state=random_state)
    X_train = pca.fit_transform(X_train)
    X_test = pca.transform(X_test)

    if task == "c":
        model = RandomForestClassifier(random_state=random_state)
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        print("Classification Report:")
        print(classification_report(y_test, y_pred))

    else:
        model = RandomForestRegressor(random_state=random_state)
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        print("Regression Metrics:")
        print(f"MAE:  {mean_absolute_error(y_test, y_pred):.4f}")
        print(f"RMSE: {np.sqrt(mean_squared_error(y_test, y_pred)):.4f}")
        print(f"R²:   {r2_score(y_test, y_pred):.4f}")

In [None]:
folder = os.path.join(data_path, "CHEMBL")
device = "cuda" if torch.cuda.is_available() else "cpu"

# Activity cut-off handed to read_data: values strictly below it are "active".
# NOTE(review): unit is whatever 'Standard Value' uses in the exports - confirm.
KI_THRESHOLD = 100


def _embed_smiles(smiles_list, model):
    """Return one fingerprint per SMILES string, stacked into a NumPy array."""
    return np.array([
        smiles_to_3D(smiles, model, scaler, False).detach().cpu().numpy()
        for smiles in smiles_list
    ])


def _drop_nan_rows(features, labels):
    """Remove every sample whose fingerprint contains NaN, with its label.

    The original cell dropped only rows that were NaN in every position
    (presumably how smiles_to_3D signals a molecule it could not embed -
    confirm). Rows with *any* NaN are dropped here, since PCA in get_score
    rejects NaN input anyway.
    """
    features = np.asarray(features)
    labels = np.asarray(labels)
    # Reduce over every axis except the sample axis.
    bad = np.isnan(features).any(axis=tuple(range(1, features.ndim)))
    return features[~bad], labels[~bad]


def _featurize(model, train_smiles, train_labels, test_smiles, test_labels):
    """Embed both splits with ``model`` and discard samples that failed to embed.

    Returns:
        ``(X_train, y_train, X_test, y_test)`` as NumPy arrays.
    """
    X_tr, y_tr = _drop_nan_rows(_embed_smiles(train_smiles, model), train_labels)
    X_te, y_te = _drop_nan_rows(_embed_smiles(test_smiles, model), test_labels)
    return X_tr, y_tr, X_te, y_te


# sorted() makes the report order reproducible; os.listdir order is arbitrary.
for db in sorted(os.listdir(folder)):
    data = os.path.join(folder, db)
    # Skip hidden entries and sub-directories (e.g. .ipynb_checkpoints).
    if db.startswith(".") or not os.path.isfile(data):
        continue

    print("============================================")
    print(db)
    print("============================================")

    X_train, X_test, y_train, y_test = read_data(data, KI_THRESHOLD)

    X_train, y_train = list(X_train), list(y_train)
    X_test, y_test = list(X_test), list(y_test)

    ### SMALL 3D IMPACT
    print("****************")
    print("SMALL 3D IMPACT")
    print("****************")
    X_train_small_3D, y_train_small_3D, X_test_small_3D, y_test_small_3D = _featurize(
        small_3D_impact_model, X_train, y_train, X_test, y_test
    )

    get_score(X_train_small_3D, y_train_small_3D, X_test_small_3D, y_test_small_3D)

    ### HUGE 3D IMPACT
    print("****************")
    print("HUGE 3D IMPACT")
    print("*****************")

    X_train_huge_3D, y_train_huge_3D, X_test_huge_3D, y_test_huge_3D = _featurize(
        huge_3D_impact_model, X_train, y_train, X_test, y_test
    )

    get_score(X_train_huge_3D, y_train_huge_3D, X_test_huge_3D, y_test_huge_3D)
