In [None]:
import pandas as pd
import numpy as np
import json
import random

from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
import torch

from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit import rdBase
rdBase.DisableLog('rdApp.error') 

from dotenv import load_dotenv
load_dotenv()

from mol_level_model.model import DescriptorContrastiveModel
from mol_level_model.utils import calculate_mean_and_std, approximate_homo_lumo

import sys
import os
parent_dir = os.path.abspath(os.path.join(os.getcwd(), "../.."))
sys.path.append(parent_dir)

from model.model import GNNFingerprint3D
from hypotheses.compare_with_other_fingerprints.utils import smiles_to_3D
from data_preprocessing.descriptors import calculate_descriptors_v2

In [21]:
data_path = os.getenv("DATA_PATH")
models_path = os.getenv("MODELS_PATH")

columns = ["mu", "zpve", "energy_U0", "rcA", "rcB", "rcC", "Cv", "alpha"]

In [None]:
from rdkit.Geometry import Point3D
from rdkit.Chem import Conformer

def attach_3D_coords_to_mol(mol, coords):
    """
    Assigns 3D coordinates (list of [atom_symbol, x, y, z]) to mol.
    """
    conf = Conformer(mol.GetNumAtoms())
    for i, (symbol, x, y, z) in enumerate(coords):
        conf.SetAtomPosition(i, Point3D(float(x), float(y), float(z)))

    mol.AddConformer(conf, assignId=True)
    return mol


In [42]:
def get_descriptor(smiles, model, scaler, device, conf=None, homo=None, lumo=None):
    mol = Chem.MolFromSmiles(smiles)
    try:
        mol = Chem.AddHs(mol)
    except:
        return torch.full((1024,), float('nan')).to("cuda")
     
    if conf:
        c = Conformer(mol.GetNumAtoms())
        for i, (symbol, x, y, z) in enumerate(conf):
            c.SetAtomPosition(i, Point3D(float(x), float(y), float(z)))
        mol.AddConformer(c, assignId=True)
    else:
        result = AllChem.EmbedMolecule(mol, randomSeed=42)
        if result != 0: return torch.full((1024,), float('nan')).to(device)
        mol = Chem.RemoveHs(mol)

    if not homo:
        homo, lumo = approximate_homo_lumo(mol)

    desc = calculate_descriptors_v2(smiles, mol, homo, lumo, False)
    desc.pop("SMILES", None)

    for key in desc:
        desc[key] = (desc[key] - scaler[key]['mean']) / scaler[key]['std']

    rec = torch.tensor(list(desc.values()), dtype=torch.float32).unsqueeze(0).to(device)
    pred = model(rec)
    return pred.squeeze(0)

### 2D TASK

In [5]:
def read_data(path, ki_threshold):
    df = pd.read_csv(path, sep=";")
    df['Activity'] = df['Standard Value'].apply(lambda x: 1 if x < ki_threshold else 0)

    df = df[['Smiles', 'Activity']].dropna()

    X_train, X_test, y_train, y_test = train_test_split(
        df['Smiles'], df['Activity'], test_size=0.2, random_state=42
    )

    return X_train.reset_index(drop=True), X_test.reset_index(drop=True), y_train.reset_index(drop=True), y_test.reset_index(drop=True)

In [4]:
def get_score(X_train, y_train, X_test, y_test, task = "c"):
    s_scaler = StandardScaler()
    X_train = s_scaler.fit_transform(X_train)
    X_test = s_scaler.transform(X_test)

    # PCA to 167 dim
    pca = PCA(n_components=167)
    X_train = pca.fit_transform(X_train)
    X_test = pca.transform(X_test)

    if task == "c":
        model = RandomForestClassifier()
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        print("Classification Report:")
        print(classification_report(y_test, y_pred))

    else:
        model = RandomForestRegressor()
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        
        print("Regression Metrics:")
        print(f"MAE:  {mean_absolute_error(y_test, y_pred):.4f}")
        print(f"RMSE: {np.sqrt(mean_squared_error(y_test, y_pred)):.4f}")
        print(f"R²:   {r2_score(y_test, y_pred):.4f}")

In [8]:
fingerprint_model = GNNFingerprint3D(13, 5)
fingerprint_model.load_state_dict(torch.load(os.path.join(models_path, "GNN_MUCH_MORE_WEIGHT_3D.pth")))
fingerprint_model = fingerprint_model.to("cuda")
fingerprint_model.eval()

descriptor_model = DescriptorContrastiveModel(input_dim=18)
descriptor_model.load_state_dict(torch.load(os.path.join(models_path, "DESCRIPTOR_LEVEL_MODEL.pth")))
descriptor_model = descriptor_model.to("cuda")
descriptor_model.eval()

with open(os.path.join(data_path, "means_and_stds.json")) as f:
    scaler = json.load(f)

desc_scaler = calculate_mean_and_std(os.path.join(data_path, "qm9_data"))

  fingerprint_model.load_state_dict(torch.load(os.path.join(models_path, "GNN_MUCH_MORE_WEIGHT_3D.pth")))
  descriptor_model.load_state_dict(torch.load(os.path.join(models_path, "DESCRIPTOR_LEVEL_MODEL.pth")))


In [None]:
folder = os.path.join(data_path, "CHEMBL")
device = "cuda" if torch.cuda.is_available() else "cpu"

for db in os.listdir(folder):
    print("============================================")
    print(db)
    print("============================================")

    data = os.path.join(folder, db)
    X_train, X_test, y_train, y_test = read_data(data, 100)

    X_train, y_train = list(X_train), list(y_train)
    X_test, y_test = list(X_test), list(y_test)
    
    ### Descriptor
    print("****************")
    print("Mol level model")
    print("****************")
    X_train_desc = np.array([get_descriptor(smiles, descriptor_model,desc_scaler, device).detach().cpu() for smiles in X_train])
    X_test_desc = np.array([get_descriptor(smiles, descriptor_model,desc_scaler, device).detach().cpu() for smiles in X_test])

    rows_all_nan_train = np.isnan(X_train_desc).all(axis=1)
    nan_indices_train = np.where(rows_all_nan_train)[0]

    rows_all_nan_test = np.isnan(X_test_desc).all(axis=1)
    nan_indices_test = np.where(rows_all_nan_test)[0]

    X_train_desc = np.delete(X_train_desc, nan_indices_train, axis=0)
    y_train_desc = np.delete(y_train, nan_indices_train, axis=0)
    X_test_desc = np.delete(X_test_desc, nan_indices_test, axis=0)
    y_test_desc = np.delete(y_test, nan_indices_test, axis=0)

    get_score(X_train_desc, y_train_desc, X_test_desc, y_test_desc)

    ### 3D
    print("****************")
    print("Atom level model")
    print("*****************")

    X_train_3D = np.array([smiles_to_3D(smiles, fingerprint_model, scaler, False).detach().cpu() for smiles in X_train])
    X_test_3D = np.array([smiles_to_3D(smiles, fingerprint_model, scaler, False).detach().cpu() for smiles in X_test])

    rows_all_nan_train = np.isnan(X_train_3D).all(axis=1)
    nan_indices_train = np.where(rows_all_nan_train)[0]

    rows_all_nan_test = np.isnan(X_test_3D).all(axis=1)
    nan_indices_test = np.where(rows_all_nan_test)[0]

    X_train_3D = np.delete(X_train_3D, nan_indices_train, axis=0)
    y_train_3D = np.delete(y_train, nan_indices_train, axis=0)
    X_test_3D = np.delete(X_test_3D, nan_indices_test, axis=0)
    y_test_3D = np.delete(y_test, nan_indices_test, axis=0)

    get_score(X_train_3D, y_train_3D, X_test_3D, y_test_3D)

CHEMBL1833_5HT2B.csv
****************
Mol level model
****************


100%|██████████| 2166/2166 [03:58<00:00,  9.07it/s]


Classification Report:
              precision    recall  f1-score   support

           0       0.84      0.96      0.90       408
           1       0.78      0.46      0.58       134

    accuracy                           0.83       542
   macro avg       0.81      0.71      0.74       542
weighted avg       0.83      0.83      0.82       542

****************
Atom level model
*****************
Classification Report:
              precision    recall  f1-score   support

           0       0.81      0.99      0.89       408
           1       0.91      0.31      0.46       134

    accuracy                           0.82       542
   macro avg       0.86      0.65      0.68       542
weighted avg       0.84      0.82      0.79       542

CHEMBL214_5HT1A.csv
****************
Mol level model
****************


100%|██████████| 4891/4891 [04:28<00:00, 18.25it/s]


KeyboardInterrupt: 

### 3D TASK

In [24]:
def read_data_3D(db_name, size):
    path = os.path.join(data_path, db_name)
    all_files = [ff for f in os.listdir(path) if os.path.isfile(ff := os.path.join(path, f))]

    random_files = random.sample(all_files, size)
    l = []
    for i, file in enumerate(random_files):
        with open(file, 'r') as f:
            data = json.load(f)
        rec = {"smiles": data["smiles"], "conf": data["atoms"], "homo": data["homo"], "lumo": data["lumo"]}
        for col in columns:
            rec[col] = data[col]
        l.append(rec)

    return pd.DataFrame(l)

In [45]:
#df = read_data_3D("qm9_data_json", 20000)
#device = "cuda" if torch.cuda.is_available() else "cpu"

for col in columns:
    print("============================================")
    print(col)
    print("============================================")

    X = df[["smiles", "conf", "homo", "lumo"]]
    y = df[col]

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, shuffle=True)

    X_train = np.array(X_train)
    X_test = np.array(X_test)
    
    ### Descriptor
    print("****************")
    print("Mol level model")
    print("****************")
    X_train_desc = np.array([get_descriptor(mol[0], descriptor_model, desc_scaler, device, mol[1], mol[2], mol[3]).detach().cpu() for mol in X_train])
    X_test_desc = np.array([get_descriptor(mol[0], descriptor_model, desc_scaler, device, mol[1], mol[2], mol[3]).detach().cpu() for mol in X_test])

    rows_all_nan_train = np.isnan(X_train_desc).all(axis=1)
    nan_indices_train = np.where(rows_all_nan_train)[0]

    rows_all_nan_test = np.isnan(X_test_desc).all(axis=1)
    nan_indices_test = np.where(rows_all_nan_test)[0]

    X_train_desc = np.delete(X_train_desc, nan_indices_train, axis=0)
    y_train_desc = np.delete(y_train, nan_indices_train, axis=0)
    X_test_desc = np.delete(X_test_desc, nan_indices_test, axis=0)
    y_test_desc = np.delete(y_test, nan_indices_test, axis=0)

    get_score(X_train_desc, y_train_desc, X_test_desc, y_test_desc, "r")

    ### 3D
    print("****************")
    print("Atom level model")
    print("*****************")

    X_train_3D = np.array([smiles_to_3D(smiles, fingerprint_model, scaler).detach().cpu() for smiles in X_train])
    X_test_3D = np.array([smiles_to_3D(smiles, fingerprint_model, scaler).detach().cpu() for smiles in X_test])

    rows_all_nan_train = np.isnan(X_train_3D).all(axis=1)
    nan_indices_train = np.where(rows_all_nan_train)[0]

    rows_all_nan_test = np.isnan(X_test_3D).all(axis=1)
    nan_indices_test = np.where(rows_all_nan_test)[0]

    X_train_3D = np.delete(X_train_3D, nan_indices_train, axis=0)
    y_train_3D = np.delete(y_train, nan_indices_train, axis=0)
    X_test_3D = np.delete(X_test_3D, nan_indices_test, axis=0)
    y_test_3D = np.delete(y_test, nan_indices_test, axis=0)

    get_score(X_train_3D, y_train_3D, X_test_3D, y_test_3D, "r")

mu
****************
Mol level model
****************
Regression Metrics:
MAE:  0.1097
RMSE: 0.5160
R²:   0.1175
****************
Atom level model
*****************
Regression Metrics:
MAE:  0.1666
RMSE: 0.3772
R²:   0.5286
zpve
****************
Mol level model
****************


KeyboardInterrupt: 