In [None]:
import pandas as pd

# --- File paths ---
diffsbdd_path = "data/diffsbdd/processed/vina_results/docking_scores_diffsbdd.csv"
hybridised_path = "docking_scores_hybridised.csv"

# --- Load CSVs ---
df_diffsbdd = pd.read_csv(diffsbdd_path)
df_hybrid = pd.read_csv(hybridised_path)

# --- Define helper to summarise docking scores ---
def summarise(df, label):
    mean_score = df["score"].mean()
    max_score = df["score"].max()  # highest value (least negative, weakest binder)
    min_score = df["score"].min()  # most negative, strongest binder
    return {
        "Method": label,
        "Mean": mean_score,
        "Max (weakest)": max_score,
        "Min (strongest)": min_score,
    }

# --- Summaries ---
summary = pd.DataFrame([
    summarise(df_diffsbdd, "DiffSBDD"),
    summarise(df_hybrid, "DiffSBDD-expanded-atom-types")
])

# --- Reference ligand docking score ---
reference_score = -6.76

print(summary)
print(f"\nReference ligand docking score: {reference_score}")


                         Method      Mean  Max (weakest)  Min (strongest)
0                      DiffSBDD -8.386911       21.92911        -14.50522
1  DiffSBDD-expanded-atom-types -9.420248       45.58155        -13.29585

Reference ligand docking score: -6.76


In [None]:
import os
import pandas as pd
import numpy as np

# -----------------------------
# paths you gave
# -----------------------------
DOCK_DIFF = "data/diffsbdd/processed/vina_results/docking_scores_diffsbdd.csv"
DOCK_HYBR = "data/hybridised_carbons/docking_scores_hybridised.csv"
REF_DOCK = -6.76

PROP_DIFF = "./data/diffsbdd/raw/molecular_properties_analysis.csv"
PROP_HYBR = "data/hybridised_carbons/raw/molecular_properties_analysis.csv"

# Optional PoseBusters CSV
POSEBUSTERS_CSV = None

METHOD_LABELS = {
    "DiffSBDD": "DiffSBDD",
    "DiffSBDD-expanded-atom-types": "Hybridised DiffSBDD",
    "diffsbdd": "DiffSBDD",
    "hybrid": "Hybridised DiffSBDD",
}

# -----------------------------
# helpers
# -----------------------------
def load_docking(path, label):
    df = pd.read_csv(path)
    if "score" not in df.columns:
        raise ValueError(f"'score' not found in {path}. Got columns: {list(df.columns)}")
    out = {
        "Method": label,
        "Docking mean": df["score"].mean(),
        "Docking max (weakest)": df["score"].max(),
        "Docking min (strongest)": df["score"].min(),
        "N docking": len(df)
    }
    return pd.DataFrame([out])

def normalise_method_name(raw, fallback="Unknown"):
    s = str(raw)
    for k, v in METHOD_LABELS.items():
        if k.lower() in s.lower():
            return v
    return fallback

def summarise_properties(path, default_method_name):
    if not os.path.exists(path):
        raise FileNotFoundError(path)
    df = pd.read_csv(path)

    method = default_method_name

    present = {
        "QED_mean": "qed_mean",
        "SA_mean": "sa_mean",
        "logP_mean": "logp_mean",
        "Lipinski_mean": "lipinski_violations_mean",
        "diversity": "diversity"
    }

    out = {"Method": method}
    for new, old in present.items():
        if old in df.columns:
            out[new] = df[old].mean()

    out["Total molecules (props)"] = int(df["original_molecules"].sum())
    return pd.DataFrame([out]), df

def load_posebusters_summary(csv_path=None):
    if csv_path and os.path.exists(csv_path):
        pb = pd.read_csv(csv_path)
        pb["Method"] = pb["Method"].apply(normalise_method_name)
        if "Pass Rate (%)" in pb.columns:
            pb["Pass Rate (%)"] = (
                pb["Pass Rate (%)"].astype(str).str.replace("%","", regex=False).astype(float)
            )
        return pb[["Method","Total Poses","Passed Poses","Failed Poses","Pass Rate (%)"]]

    fallback = pd.DataFrame({
        "Method": ["DiffSBDD", "Hybridised DiffSBDD"],
        "Total Poses": [11728, 11559],
        "Passed Poses": [6830, 5399],
        "Failed Poses": [4898, 6160],
        "Pass Rate (%)": [58.24, 46.71],
    })
    return fallback

def make_latex_table(df, caption="Docking, PoseBusters and property summary", label="tab:hybrid_overview"):
    cols = df.columns
    colspec = "l" + "r"*(len(cols)-1)
    header = " & ".join(cols) + r" \\"
    lines = [r"\begin{table}[h]",
             r"  \centering",
             rf"  \caption{{{caption}}}",
             rf"  \label{{{label}}}",
             rf"  \begin{tabular}{{{colspec}}}",
             r"    \toprule",
             "    " + header,
             r"    \midrule"]
    for _, row in df.iterrows():
        vals = []
        for c in cols:
            v = row[c]
            if isinstance(v, float):
                vals.append(f"{v:.3f}")
            else:
                vals.append(str(v))
        lines.append("    " + " & ".join(vals) + r" \\")
    lines += [r"    \bottomrule", r"  \end{tabular}", r"\end{table}"]
    return "\n".join(lines)

# -----------------------------
# run
# -----------------------------
dock_a = load_docking(DOCK_DIFF, "DiffSBDD")
dock_b = load_docking(DOCK_HYBR, "Hybridised DiffSBDD")
dock_summary = pd.concat([dock_a, dock_b], ignore_index=True)

print("Docking summary")
print(dock_summary.round(3).to_string(index=False))
print(f"\nReference ligand docking score: {REF_DOCK}")

prop_a, raw_a = summarise_properties(PROP_DIFF, "DiffSBDD")
prop_b, raw_b = summarise_properties(PROP_HYBR, "Hybridised DiffSBDD")
prop_summary = pd.concat([prop_a, prop_b], ignore_index=True)

print("\nProperty summary")
print(prop_summary.round(3).to_string(index=False))

pb = load_posebusters_summary(POSEBUSTERS_CSV)
print("\nPoseBusters summary")
print(pb.to_string(index=False))

# Merge into one table
prop_keep = ["Method","QED_mean","SA_mean","logP_mean","Lipinski_mean","diversity","Total molecules (props)"]
dock_keep = ["Method","Docking mean","Docking min (strongest)","Docking max (weakest)","N docking"]

merged = (
    pd.merge(dock_summary[dock_keep], prop_summary[prop_keep], on="Method", how="outer")
      .merge(pb, on="Method", how="outer")
)

col_order = [
    "Method",
    "Docking mean","Docking min (strongest)","Docking max (weakest)","N docking",
    "QED_mean","SA_mean","logP_mean","Lipinski_mean","diversity","Total molecules (props)",
    "Total Poses","Passed Poses","Failed Poses","Pass Rate (%)"
]
merged = merged[[c for c in col_order if c in merged.columns]]

print("\n=== Combined comparison table ===")
print(merged.round(3).to_string(index=False))

os.makedirs("analysis_outputs", exist_ok=True)
merged.to_csv("analysis_outputs/hybrid_vs_diffsbdd_summary.csv", index=False)
print("\nSaved CSV → analysis_outputs/hybrid_vs_diffsbdd_summary.csv")

latex_str = make_latex_table(
    merged,
    caption="Comparison of DiffSBDD and Hybridised DiffSBDD across docking, PoseBusters validity and molecular properties. Docking: more negative is better.",
    label="tab:hybrid_diffsbdd_comparison"
)
with open("analysis_outputs/hybrid_vs_diffsbdd_summary.tex","w") as f:
    f.write(latex_str)
print("Saved LaTeX → analysis_outputs/hybrid_vs_diffsbdd_summary.tex")


Docking summary
             Method  Docking mean  Docking max (weakest)  Docking min (strongest)  N docking
           DiffSBDD        -8.387                 21.929                  -14.505        100
Hybridised DiffSBDD        -9.420                 45.582                  -13.296        100

Reference ligand docking score: -6.76

Property summary
             Method  QED_mean  SA_mean  logP_mean  Lipinski_mean  diversity  Total molecules (props)
           DiffSBDD     0.471     1.11      1.047          0.384      0.785                    11997
Hybridised DiffSBDD     0.481     1.11      1.209          0.331      0.765                    11996

PoseBusters summary
             Method  Total Poses  Passed Poses  Failed Poses  Pass Rate (%)
           DiffSBDD        11728          6830          4898          58.24
Hybridised DiffSBDD        11559          5399          6160          46.71

=== Combined comparison table ===
             Method  Docking mean  Docking min (strongest)  D

NameError: name 'tabular' is not defined

In [18]:
# Robust SA score means per SDF, for two directories.
# Produces one CSV per method label: <method>_sa_means.csv
#
# Usage:
# method_dirs = {
#     "method1": "/path/to/method1_sdfs",
#     "method2": "/path/to/method2_sdfs",
# }
# results = compute_sa_means_for_methods(method_dirs)

import os
import glob
import math
import pandas as pd
from typing import Tuple, Dict

from rdkit import Chem
from rdkit import RDLogger
from rdkit.Chem import rdmolops

# quiet RDKit
RDLogger.DisableLog("rdApp.error")
RDLogger.DisableLog("rdApp.warning")

# Ensure sascorer.py is importable, place it next to your notebook if needed
try:
    from rdkit.Contrib.SA_Score import sascorer
except ModuleNotFoundError as e:
    raise ModuleNotFoundError(
        "No module named 'sascorer'. Place RDKit's Contrib/SA_Score/sascorer.py "
        "in the same directory as your notebook, or add its path to sys.path."
    ) from e


def sanitize_with_kekule_fallback(mol: Chem.Mol) -> Chem.Mol:
    """
    Try a robust sanitization sequence.
    1) Sanitize everything except kekulization.
    2) Try kekulize with backtracking.
    3) If kekulization fails, keep aromatic flags, set aromaticity, and proceed.

    Returns a sanitized molecule or raises on total failure.
    """
    # Step 1, sanitize without kekulization
    Chem.SanitizeMol(
        mol,
        sanitizeOps=Chem.SanitizeFlags.SANITIZE_ALL ^ Chem.SanitizeFlags.SANITIZE_KEKULIZE
    )
    # Step 2, attempt kekulization
    try:
        Chem.Kekulize(mol, clearAromaticFlags=True, maxBackTracks=200)
    except Exception:
        # Step 3, fallback, do not kekulize, just ensure aromaticity perception is consistent
        # Re-set aromaticity on the current resonance form
        rdmolops.SetAromaticity(mol)
    return mol


def safe_score_sa(m: Chem.Mol) -> float:
    """
    Compute SA with a couple of safety nets.
    1) Try directly.
    2) If that fails, round-trip through SMILES without forcing kekule and try again.
    """
    # First try, standard path
    try:
        return float(sascorer.calculateScore(m))
    except Exception:
        pass

    # Second chance, round-trip via non-kekule SMILES
    try:
        smi = Chem.MolToSmiles(m, kekuleSmiles=False)
        m2 = Chem.MolFromSmiles(smi, sanitize=True)
        if m2 is None:
            raise ValueError("MolFromSmiles returned None")
        # remove Hs, just in case
        m2 = Chem.RemoveHs(m2)
        return float(sascorer.calculateScore(m2))
    except Exception:
        # Give up, caller will handle
        raise


def sanitized_mols_from_sdf(sdf_path: str) -> Tuple[list, int, int]:
    """
    Read an SDF with sanitize=False, sanitize each molecule robustly.
    Returns (sanitized_mols, num_read, num_sanitized).
    """
    suppl = Chem.SDMolSupplier(sdf_path, removeHs=False, sanitize=False)
    sanitized = []
    num_read = 0
    for mol in suppl:
        if mol is None:
            continue
        num_read += 1
        try:
            # robust sanitize
            mol = sanitize_with_kekule_fallback(mol)
            # remove explicit Hs for SA score stability
            mol = Chem.RemoveHs(mol)
            sanitized.append(mol)
        except Exception:
            # skip molecules that never sanitize
            continue
    return sanitized, num_read, len(sanitized)


def mean_sa_for_sdf(sdf_path: str) -> Tuple[float, int, int, int]:
    """
    Compute mean SA for all sanitized molecules in an SDF.
    Returns (mean_SA, num_read, num_sanitized, num_scored).
    """
    mols, n_read, n_sanitized = sanitized_mols_from_sdf(sdf_path)
    scores = []
    for mol in mols:
        try:
            sa = safe_score_sa(mol)
            if sa is not None and math.isfinite(sa):
                scores.append(sa)
        except Exception:
            # skip if all rescue attempts failed
            continue

    if not scores:
        return float("nan"), n_read, n_sanitized, 0
    return float(sum(scores) / len(scores)), n_read, n_sanitized, len(scores)


def compute_sa_means_for_methods(method_dir_map: Dict[str, str],
                                 pattern: str = "*.sdf") -> Dict[str, pd.DataFrame]:
    """
    method_dir_map, dict like {"method1": "/path/to/dir1", "method2": "/path/to/dir2"}
    Writes <method>_sa_means.csv, returns {method_label: DataFrame}.
    """
    out = {}
    for method_label, dir_path in method_dir_map.items():
        sdf_files = sorted(glob.glob(os.path.join(dir_path, pattern)))
        rows = []
        for sdf in sdf_files:
            mean_sa, n_read, n_sanitized, n_scored = mean_sa_for_sdf(sdf)
            rows.append({
                "sdf_file": os.path.basename(sdf),
                "n_mols_read": n_read,
                "n_mols_sanitized": n_sanitized,
                "n_mols_scored": n_scored,
                "mean_SA": mean_sa,
            })
        df = pd.DataFrame(rows).sort_values("sdf_file").reset_index(drop=True)
        csv_name = f"{method_label}_sa_means.csv"
        df.to_csv(csv_name, index=False)
        print(f"Wrote {csv_name} with {len(df)} rows")
        out[method_label] = df
    return out


# Example run:
# method_dirs = {
#     "method1": "/absolute/path/to/method1",
#     "method2": "/absolute/path/to/method2",
# }
# results = compute_sa_means_for_methods(method_dirs)
# results["method1"].head(), results["method2"].head()


method_dirs = {
    "DiffSBDD": "data/diffsbdd/processed",
    "DiffSBDD_hybdrised": "data/hybridised_carbons/processed",
}
results = compute_sa_means_for_methods(method_dirs)
results["DiffSBDD"].head(), results["DiffSBDD_hybdrised"].head()


Wrote DiffSBDD_sa_means.csv with 100 rows
Wrote DiffSBDD_hybdrised_sa_means.csv with 100 rows


(                                            sdf_file  n_mols_read  \
 0  14gs-A-rec-20gs-cbd-lig-tt-min-0-pocket10_14gs...          100   
 1  1a2g-A-rec-4jmv-1ly-lig-tt-min-0-pocket10_1a2g...          100   
 2  1afs-A-rec-1afs-tes-lig-tt-min-0-pocket10_1afs...          100   
 3  1ai4-A-rec-1ai5-mnp-lig-tt-docked-0-pocket10_1...          100   
 4  1coy-A-rec-1coy-and-lig-tt-docked-0-pocket10_1...          100   
 
    n_mols_sanitized  n_mols_scored   mean_SA  
 0                99             99  3.696011  
 1                98             98  3.823407  
 2                99             99  4.763497  
 3                99             99  4.464382  
 4               100            100  5.065767  ,
                                             sdf_file  n_mols_read  \
 0  14gs-A-rec-20gs-cbd-lig-tt-min-0-pocket10_14gs...          100   
 1  1a2g-A-rec-4jmv-1ly-lig-tt-min-0-pocket10_1a2g...          100   
 2  1afs-A-rec-1afs-tes-lig-tt-min-0-pocket10_1afs...          100   
 3  1ai4-

In [19]:
df_sa_diffsbdd = results["DiffSBDD"]
df_sa_hybridised = results["DiffSBDD_hybdrised"]

df_sa_diffsbdd.head()
df_sa_hybridised.head()



Unnamed: 0,sdf_file,n_mols_read,n_mols_sanitized,n_mols_scored,mean_SA
0,14gs-A-rec-20gs-cbd-lig-tt-min-0-pocket10_14gs...,100,97,97,3.930862
1,1a2g-A-rec-4jmv-1ly-lig-tt-min-0-pocket10_1a2g...,100,100,100,3.947682
2,1afs-A-rec-1afs-tes-lig-tt-min-0-pocket10_1afs...,100,100,100,4.415763
3,1ai4-A-rec-1ai5-mnp-lig-tt-docked-0-pocket10_1...,100,99,99,3.74983
4,1coy-A-rec-1coy-and-lig-tt-docked-0-pocket10_1...,100,100,100,4.965173


In [20]:
df_diffsbdd_mean_sa = df_sa_diffsbdd["mean_SA"].mean()
df_hybridised_mean_sa = df_sa_hybridised["mean_SA"].mean()

print(f"DiffSBDD mean SA: {df_diffsbdd_mean_sa}")
print(f"Hybridised DiffSBDD mean SA: {df_hybridised_mean_sa}")



DiffSBDD mean SA: 4.395277311366575
Hybridised DiffSBDD mean SA: 4.436709130019721


In [26]:
# Mean LogP per SDF for two directories.
# Produces one CSV per method label: <method>_logp_means.csv
#
# Usage:
# method_dirs = {
#     "method1": "/path/to/method1_sdfs",
#     "method2": "/path/to/method2_sdfs",
# }
# results = compute_logp_means_for_methods(method_dirs)

import os
import glob
import math
import pandas as pd
from typing import Tuple, Dict

from rdkit import Chem
from rdkit import RDLogger
from rdkit.Chem import rdmolops
from rdkit.Chem import Crippen

# quiet RDKit chatter
RDLogger.DisableLog("rdApp.error")
RDLogger.DisableLog("rdApp.warning")


def sanitize_with_kekule_fallback(mol: Chem.Mol) -> Chem.Mol:
    """Robust sanitization with kekule fallback."""
    Chem.SanitizeMol(
        mol,
        sanitizeOps=Chem.SanitizeFlags.SANITIZE_ALL ^ Chem.SanitizeFlags.SANITIZE_KEKULIZE
    )
    try:
        Chem.Kekulize(mol, clearAromaticFlags=True, maxBackTracks=200)
    except Exception:
        rdmolops.SetAromaticity(mol)
    return mol


def safe_logp(m: Chem.Mol) -> float:
    """Compute LogP safely, retrying if needed."""
    try:
        return float(Crippen.MolLogP(m))
    except Exception:
        pass
    try:
        smi = Chem.MolToSmiles(m, kekuleSmiles=False)
        m2 = Chem.MolFromSmiles(smi, sanitize=True)
        if m2 is None:
            raise ValueError("MolFromSmiles returned None")
        m2 = Chem.RemoveHs(m2)
        return float(Crippen.MolLogP(m2))
    except Exception:
        raise


def sanitized_mols_from_sdf(sdf_path: str) -> Tuple[list, int, int]:
    """Read SDF with sanitize=False, sanitize each molecule robustly."""
    suppl = Chem.SDMolSupplier(sdf_path, removeHs=False, sanitize=False)
    sanitized = []
    num_read = 0
    for mol in suppl:
        if mol is None:
            continue
        num_read += 1
        try:
            mol = sanitize_with_kekule_fallback(mol)
            mol = Chem.RemoveHs(mol)
            sanitized.append(mol)
        except Exception:
            continue
    return sanitized, num_read, len(sanitized)


def mean_logp_for_sdf(sdf_path: str) -> Tuple[float, int, int, int]:
    """Compute mean LogP for all sanitized molecules in an SDF."""
    mols, n_read, n_sanitized = sanitized_mols_from_sdf(sdf_path)
    vals = []
    for mol in mols:
        try:
            lp = safe_logp(mol)
            if lp is not None and math.isfinite(lp):
                vals.append(lp)
        except Exception:
            continue
    if not vals:
        return float("nan"), n_read, n_sanitized, 0
    return float(sum(vals) / len(vals)), n_read, n_sanitized, len(vals)


def compute_logp_means_for_methods(method_dir_map: Dict[str, str],
                                   pattern: str = "*.sdf") -> Dict[str, pd.DataFrame]:
    """
    method_dir_map: {"method1": "/path/to/dir1", "method2": "/path/to/dir2"}.
    Writes <method>_logp_means.csv, returns {method_label: DataFrame}.
    """
    out = {}
    for method_label, dir_path in method_dir_map.items():
        sdf_files = sorted(glob.glob(os.path.join(dir_path, pattern)))
        rows = []
        for sdf in sdf_files:
            mean_logp, n_read, n_sanitized, n_scored = mean_logp_for_sdf(sdf)
            rows.append({
                "sdf_file": os.path.basename(sdf),
                "n_mols_read": n_read,
                "n_mols_sanitized": n_sanitized,
                "n_mols_scored": n_scored,
                "mean_LogP": mean_logp,
            })
        df = pd.DataFrame(rows).sort_values("sdf_file").reset_index(drop=True)
        csv_name = f"{method_label}_logp_means.csv"
        df.to_csv(csv_name, index=False)
        print(f"Wrote {csv_name} with {len(df)} rows")
        out[method_label] = df
    return out


# Example:
method_dirs = {
    "DiffSBDD": "data/diffsbdd/processed",
    "DiffSBDD_hybdrised": "data/hybridised_carbons/processed",
}
results = compute_logp_means_for_methods(method_dirs)
results["DiffSBDD"].head(), results["DiffSBDD_hybdrised"].head()


Wrote DiffSBDD_logp_means.csv with 100 rows
Wrote DiffSBDD_hybdrised_logp_means.csv with 100 rows


(                                            sdf_file  n_mols_read  \
 0  14gs-A-rec-20gs-cbd-lig-tt-min-0-pocket10_14gs...          100   
 1  1a2g-A-rec-4jmv-1ly-lig-tt-min-0-pocket10_1a2g...          100   
 2  1afs-A-rec-1afs-tes-lig-tt-min-0-pocket10_1afs...          100   
 3  1ai4-A-rec-1ai5-mnp-lig-tt-docked-0-pocket10_1...          100   
 4  1coy-A-rec-1coy-and-lig-tt-docked-0-pocket10_1...          100   
 
    n_mols_sanitized  n_mols_scored  mean_LogP  
 0                99             99   0.501517  
 1                98             98   0.704731  
 2                99             99   1.458549  
 3                99             99   1.768427  
 4               100            100   1.970298  ,
                                             sdf_file  n_mols_read  \
 0  14gs-A-rec-20gs-cbd-lig-tt-min-0-pocket10_14gs...          100   
 1  1a2g-A-rec-4jmv-1ly-lig-tt-min-0-pocket10_1a2g...          100   
 2  1afs-A-rec-1afs-tes-lig-tt-min-0-pocket10_1afs...          100   
 3 

In [27]:
log_p_mean_diffsbdd = results["DiffSBDD"]
log_p_mean_hybridised = results["DiffSBDD_hybdrised"]

log_p_mean_diffsbdd.head()
log_p_mean_hybridised.head()

mean_logp_diffsbdd = log_p_mean_diffsbdd["mean_LogP"].mean()
mean_logp_hybridised = log_p_mean_hybridised["mean_LogP"].mean()

print(f"DiffSBDD mean LogP: {mean_logp_diffsbdd}")
print(f"Hybridised DiffSBDD mean LogP: {mean_logp_hybridised}")


DiffSBDD mean LogP: 1.003825185511318
Hybridised DiffSBDD mean LogP: 1.156966096154806


In [28]:
# Compute the average number of rings across *all* molecules in all SDFs per method
#
# Usage:
# method_dirs = {
#     "method1": "/path/to/method1_sdfs",
#     "method2": "/path/to/method2_sdfs",
# }
# results = average_rings_for_methods(method_dirs)
# print(results)

import os
import glob
import pandas as pd
from typing import Dict

from rdkit import Chem
from rdkit import RDLogger
from rdkit.Chem import rdmolops

# quiet RDKit chatter
RDLogger.DisableLog("rdApp.error")
RDLogger.DisableLog("rdApp.warning")


def sanitize_with_kekule_fallback(mol: Chem.Mol) -> Chem.Mol:
    """Robust sanitization with kekule fallback."""
    Chem.SanitizeMol(
        mol,
        sanitizeOps=Chem.SanitizeFlags.SANITIZE_ALL ^ Chem.SanitizeFlags.SANITIZE_KEKULIZE
    )
    try:
        Chem.Kekulize(mol, clearAromaticFlags=True, maxBackTracks=200)
    except Exception:
        rdmolops.SetAromaticity(mol)
    return mol


def count_rings(m: Chem.Mol) -> int:
    """Return number of rings in a molecule."""
    return m.GetRingInfo().NumRings()


def average_rings_for_methods(method_dir_map: Dict[str, str],
                              pattern: str = "*.sdf") -> Dict[str, float]:
    """
    Compute the mean ring count across all molecules in all SDFs per method.
    Returns {method_label: mean_num_rings}.
    """
    results = {}
    for method_label, dir_path in method_dir_map.items():
        sdf_files = sorted(glob.glob(os.path.join(dir_path, pattern)))
        ring_counts = []
        for sdf in sdf_files:
            suppl = Chem.SDMolSupplier(sdf, removeHs=False, sanitize=False)
            for mol in suppl:
                if mol is None:
                    continue
                try:
                    mol = sanitize_with_kekule_fallback(mol)
                    mol = Chem.RemoveHs(mol)
                    rings = count_rings(mol)
                    ring_counts.append(rings)
                except Exception:
                    continue
        if ring_counts:
            avg_rings = sum(ring_counts) / len(ring_counts)
        else:
            avg_rings = float("nan")
        results[method_label] = avg_rings
    return results


# Example:
# method_dirs = {
#     "method1": "/absolute/path/to/method1",
#     "method2": "/absolute/path/to/method2",
# }
results = average_rings_for_methods(method_dirs)
print(results)   # {'method1': 2.84, 'method2': 3.12}


{'DiffSBDD': 2.3031080110780593, 'DiffSBDD_hybdrised': 2.6684580405126086}


In [1]:
from rdkit import Chem
from rdkit.Chem import SDMolSupplier

sup = SDMolSupplier("/Users/sanazkazeminia/Documents/mol_test_suite/data/other_models_data/Molsnapper_test/ligand.sdf")
mol = [m for m in sup]

smiles = Chem.MolToSmiles(mol[0])
print(smiles)



CN(C)C[C@H](O)COc1ccc(Nc2cc(Nc3c(F)cccc3F)ncn2)cc1
