# QSAR pipeline: reproducible exploratory SAR + ranking QSAR within applicability domain

Этот ноутбук строит воспроизводимый конвейер с честным разделением данных: `train_lit` (строки 0–89) и `experimental_holdout` (строки 90+). Если внешняя предсказательная сила слабая, вывод остаётся валидным за счёт статистического SAR-блока, y-randomization и Applicability Domain.

In [None]:
# Environment & logging
from pathlib import Path
import json
import logging
import random
import sys
import warnings

# One seed drives every stochastic step so that reruns are reproducible.
SEED = 42
random.seed(SEED)

# All outputs (log, config snapshot, CSVs, figures) go under ./artifacts
# relative to the current working directory.
ROOT = Path.cwd()
DATA_PATH = ROOT / "potok.csv"
ARTIFACTS = ROOT / "artifacts"
FIG_DIR = ARTIFACTS / "figures"
ARTIFACTS.mkdir(parents=True, exist_ok=True)
FIG_DIR.mkdir(parents=True, exist_ok=True)

log_path = ARTIFACTS / "run.log"
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s | %(levelname)s | %(message)s",
    handlers=[logging.FileHandler(log_path, mode="w", encoding="utf-8"), logging.StreamHandler(sys.stdout)],
    # basicConfig is a no-op when the root logger already has handlers (for
    # example when this cell is re-run); force=True replaces them so the log
    # file is really reopened and truncated for the new run.
    force=True,
)
logger = logging.getLogger("qsar")
logger.info("Run started")

# Run parameters; persisted as JSON so every artifact can be traced back to
# the settings that produced it.
config = {
    "seed": SEED,
    "data_path": str(DATA_PATH),
    "split_strategy": "fixed rows: train_df=0..89, exp_df=90+; GroupKFold by Murcko scaffold",
    "duplicate_policy": "median",
    "low_mic_quantile": 0.25,
    "cv_n_splits": 5,
    "fingerprint": {"radius": 2, "nBits": 2048},
    "fragment_min_train_presence": 5,
    "fragment_min_lowmic_presence": 3,
    "yrandomization_n": 50,
    "or_bootstrap_n": 300,
}
(ARTIFACTS / "run_config.json").write_text(json.dumps(config, indent=2, ensure_ascii=False), encoding="utf-8")
logger.info("Saved initial run_config.json")


In [None]:
# Dependency check (safe stop if RDKit is unavailable)
# Import the whole scientific stack in one guarded block: if anything is
# missing the run stops with a single explanatory message instead of failing
# later in the middle of the pipeline.
try:
    import numpy as np
    import pandas as pd
    import matplotlib.pyplot as plt
    import seaborn as sns
    from scipy import stats

    from sklearn.base import clone
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.feature_selection import VarianceThreshold, mutual_info_classif
    from sklearn.impute import SimpleImputer
    from sklearn.linear_model import LogisticRegression
    from sklearn.metrics import (
        balanced_accuracy_score,
        average_precision_score,
        precision_recall_curve,
        roc_auc_score,
        roc_curve,
    )
    from sklearn.model_selection import GroupKFold
    from sklearn.pipeline import Pipeline
    from sklearn.preprocessing import StandardScaler
    from sklearn.tree import DecisionTreeClassifier

    from rdkit import Chem
    from rdkit.Chem import Descriptors, AllChem
    from rdkit.Chem.Scaffolds import MurckoScaffold
    from rdkit.Chem.MolStandardize import rdMolStandardize
except Exception as e:
    # Any failure while importing (not only ImportError) is logged and turned
    # into SystemExit carrying the user-facing message.
    msg = (
        "RDKit и/или научный стек недоступны. Нужна локальная среда с RDKit, numpy, pandas, scipy, scikit-learn, matplotlib. "
        "В ноутбуке pip-установка не выполняется по правилам воспроизводимости."
    )
    logger.error(msg)
    logger.error(f"Import error: {e}")
    raise SystemExit(msg)

# Seed NumPy's legacy global RNG as well (the stdlib RNG is seeded earlier).
np.random.seed(SEED)
# NOTE(review): this silences every warning for the rest of the run, including
# scikit-learn convergence and undefined-metric warnings.
warnings.filterwarnings("ignore")
logger.info("Dependencies imported successfully")


## Data loading and fixed split

In [None]:
# Load the raw table, locate the SMILES and activity columns, derive pMIC and
# apply the fixed row-based split (rows 0..89 -> train_lit, 90+ -> holdout).
if not DATA_PATH.exists():
    logger.error(f"Missing data file: {DATA_PATH}")
    raise FileNotFoundError(DATA_PATH)

raw = pd.read_csv(DATA_PATH)
raw.columns = [c.strip() for c in raw.columns]
logger.info(f"Raw rows: {len(raw)}")

# First column (in file order) whose lower-cased name is a known alias.
smiles_col = next((c for c in raw.columns if c.lower() in ["smiles", "smile", "canonical_smiles"]), None)
activity_col = next((c for c in raw.columns if c.lower() in ["mic", "activity", "y", "target", "pmic"]), None)
if smiles_col is None or activity_col is None:
    raise ValueError("Expected columns for SMILES and activity (MIC)")

df = raw[[smiles_col, activity_col]].copy().rename(columns={smiles_col: "smiles", activity_col: "MIC"})
# row_id is the position in the raw file; the split below depends on it.
df["row_id"] = np.arange(len(df))
df["MIC"] = pd.to_numeric(df["MIC"], errors="coerce")
if df["MIC"].isna().any():
    raise ValueError("MIC contains non-numeric values")

if activity_col.lower() == "pmic":
    # The column is already on the -log10 scale. Treating it as raw MIC would
    # apply -log10 a second time (or fail the positivity check for negative
    # pMIC), so convert it back to MIC first.
    logger.info(f"Activity column '{activity_col}' is on the pMIC scale; back-transforming to MIC")
    df["MIC"] = np.power(10.0, -df["MIC"].astype(float))
    if not np.isfinite(df["MIC"]).all():
        raise ValueError("pMIC values are out of range for conversion to MIC")

if not (df["MIC"] > 0).all():
    raise ValueError("MIC must be > 0 for pMIC = -log10(MIC)")

df["pMIC"] = -np.log10(df["MIC"].astype(float))

train_lit = df[df["row_id"] < 90].copy()
experimental_holdout = df[df["row_id"] >= 90].copy()
assert len(train_lit) == min(90, len(df)), "train split mismatch"
assert set(train_lit["row_id"]).isdisjoint(set(experimental_holdout["row_id"])), "split leakage"
logger.info(f"Split sizes | train_df={len(train_lit)} | exp_df={len(experimental_holdout)}")


## Data curation (mandatory)

In [None]:
# Shared chooser that keeps the largest fragment of a multi-component
# structure. NOTE(review): it is constructed with default options; confirm
# whether an "organic-preferring" mode is wanted, as the checklist suggests.
lfc = rdMolStandardize.LargestFragmentChooser()

def standardize_smiles(smi):
    """Parse a SMILES, keep its largest fragment and return canonical SMILES.

    Parameters
    ----------
    smi : object
        Raw SMILES value; it is converted with ``str()`` before parsing.

    Returns
    -------
    tuple
        ``(canonical_smiles, "ok")`` on success, otherwise ``(None, reason)``
        where reason is ``"invalid_smiles"``, ``"empty_smiles"`` or
        ``"no_largest_fragment"``.
    """
    mol = Chem.MolFromSmiles(str(smi))
    if mol is None:
        return None, "invalid_smiles"
    # A molecule object with zero atoms is not None, so without this check an
    # empty structure would be curated as "ok" with an empty SMILES string.
    if mol.GetNumAtoms() == 0:
        return None, "empty_smiles"
    largest = lfc.choose(mol)
    if largest is None:
        return None, "no_largest_fragment"
    can = Chem.MolToSmiles(largest, canonical=True)
    return can, "ok"


def curate_block(block, block_name, duplicate_policy="median"):
    """Standardize one data split and resolve duplicate structures.

    Every row is standardized with ``standardize_smiles``; rows that fail are
    dropped. Rows sharing a standardized SMILES are merged according to
    ``duplicate_policy``.

    Parameters
    ----------
    block : pandas.DataFrame
        Must provide the columns ``row_id``, ``smiles``, ``MIC`` and ``pMIC``.
    block_name : str
        Label written to the ``block`` column of both outputs.
    duplicate_policy : {"median", "mean", "drop_conflicts"}
        ``median``/``mean`` aggregate MIC over duplicates; ``drop_conflicts``
        keeps a duplicated structure only if all its MIC values agree.

    Returns
    -------
    (curated, report) : tuple of pandas.DataFrame
        ``curated`` always has the columns ``smiles``, ``MIC``, ``pMIC`` and
        ``block`` (also when it is empty). ``report`` has one row per input
        row plus one ``duplicate_resolution`` row per duplicated input row.

    Raises
    ------
    ValueError
        If ``duplicate_policy`` is not one of the supported values.
    """
    # Validate up front so a misconfigured policy fails even when the data
    # happen to contain no duplicates.
    if duplicate_policy not in ("median", "mean", "drop_conflicts"):
        raise ValueError("duplicate_policy must be median|mean|drop_conflicts")

    curated_columns = ["smiles", "MIC", "pMIC", "block"]
    report_columns = [
        "block", "row_id", "original_smiles", "standardized_smiles",
        "MIC", "pMIC", "action", "reason",
    ]

    report_rows = []
    rows = []
    for _, r in block.iterrows():
        std_smi, status = standardize_smiles(r["smiles"])
        action = "keep" if status == "ok" else "drop"
        report_rows.append({
            "block": block_name,
            "row_id": int(r["row_id"]),
            "original_smiles": r["smiles"],
            "standardized_smiles": std_smi,
            "MIC": float(r["MIC"]),
            "pMIC": float(r["pMIC"]),
            "action": action,
            "reason": status,
        })
        if status == "ok":
            rows.append({"row_id": int(r["row_id"]), "smiles": std_smi, "MIC": float(r["MIC"])})

    clean = pd.DataFrame(rows)
    if clean.empty:
        # Keep the schema so callers can still index by column name.
        return pd.DataFrame(columns=curated_columns), pd.DataFrame(report_rows, columns=report_columns)

    agg_rows = []
    for smi, members in clean.groupby("smiles"):
        vals = members["MIC"].values.astype(float)
        if len(vals) == 1:
            mic = float(vals[0])
            agg_rows.append({"smiles": smi, "MIC": mic, "pMIC": float(-np.log10(mic))})
            continue

        # Duplicates "conflict" when their MIC values are not all identical.
        conflict = np.ptp(vals) > 1e-12
        mic = None
        if duplicate_policy == "median":
            mic = float(np.median(vals))
            reason = "duplicate_median"
        elif duplicate_policy == "mean":
            mic = float(np.mean(vals))
            reason = "duplicate_mean"
        elif conflict:
            reason = "duplicate_conflict_drop"
        else:
            mic = float(vals[0])
            reason = "duplicate_identical_keep"
        if mic is not None:
            agg_rows.append({"smiles": smi, "MIC": mic, "pMIC": float(-np.log10(mic))})

        # The group already carries its row ids; no need to re-filter `clean`.
        for rid in members["row_id"].tolist():
            report_rows.append({
                "block": block_name,
                "row_id": int(rid),
                "original_smiles": None,
                "standardized_smiles": smi,
                "MIC": None,
                "pMIC": None,
                "action": "duplicate_resolution",
                "reason": reason,
            })

    # Explicit columns keep this valid when every group was dropped as a
    # conflict and agg_rows is empty.
    curated = (
        pd.DataFrame(agg_rows, columns=["smiles", "MIC", "pMIC"])
        .drop_duplicates("smiles")
        .reset_index(drop=True)
    )
    curated["block"] = block_name
    return curated, pd.DataFrame(report_rows, columns=report_columns)

# Curate both splits independently, then remove from the holdout any structure
# that also occurs in train, and write the curation report and summary.
train_cur, rep_train = curate_block(train_lit, "train_lit", duplicate_policy=config["duplicate_policy"])
exp_cur, rep_exp = curate_block(experimental_holdout, "experimental_holdout", duplicate_policy=config["duplicate_policy"])

# An empty holdout (fewer than 91 input rows) may come back without any
# columns; give it the curated schema so the column look-ups below work.
if "smiles" not in exp_cur.columns:
    exp_cur = pd.DataFrame(columns=["smiles", "MIC", "pMIC", "block"])

# remove cross-split leakage: any SMILES that appears in train and experimental is removed from experimental
shared_smiles = set(train_cur["smiles"]).intersection(set(exp_cur["smiles"]))
removed_cross_split = int(len(shared_smiles))
if removed_cross_split:
    # Build all report rows first and concatenate once instead of growing the
    # frame inside the loop.
    cross_split_rows = pd.DataFrame([
        {
            "block": "experimental_holdout",
            "row_id": None,
            "original_smiles": None,
            "standardized_smiles": smi,
            "MIC": None,
            "pMIC": None,
            "action": "drop",
            "reason": "cross_split_duplicate_removed",
        }
        for smi in sorted(shared_smiles)
    ])
    rep_exp = pd.concat([rep_exp, cross_split_rows], ignore_index=True)
    exp_cur = exp_cur.loc[~exp_cur["smiles"].isin(shared_smiles)].reset_index(drop=True)

curation_report = pd.concat([rep_train, rep_exp], ignore_index=True)
curation_report.to_csv(ARTIFACTS / "curation_report.csv", index=False)

curation_summary = pd.DataFrame([
    {
        "train_curated_n": int(len(train_cur)),
        "experimental_curated_n": int(len(exp_cur)),
        "cross_split_duplicates_removed_from_experimental": removed_cross_split,
    }
])
curation_summary.to_csv(ARTIFACTS / "curation_summary.csv", index=False)

# Sanity checks on the curated training table and on split disjointness.
assert train_cur["smiles"].isna().sum() == 0, "NaN in curated smiles"
assert {"MIC", "pMIC"}.issubset(train_cur.columns), "train_cur must have MIC and pMIC"
assert np.isfinite(train_cur["MIC"]).all() and np.isfinite(train_cur["pMIC"]).all(), "Invalid activity values"
assert len(set(train_cur["smiles"]).intersection(set(exp_cur["smiles"]))) == 0, "cross-split leakage after curation"
logger.info(
    f"After curation | train_lit={len(train_cur)} | experimental_holdout={len(exp_cur)} | removed_cross_split={removed_cross_split}"
)


## Feature engineering, CV, Q², external tests, SAR statistics, y-randomization, AD

In [None]:
# Inverse QSAR (activity-conditioned SAR): descriptor stats, scaffold-aware classification,
# fragment enrichment, rule extraction, experimental holdout checks, y-randomization.

def murcko_scaffold(smi):
    """Return the Murcko scaffold SMILES used as the CV grouping key.

    Unparseable input maps to ``"INVALID"``; a structure whose scaffold SMILES
    is empty maps to ``"ACYCLIC"``.
    """
    parsed = Chem.MolFromSmiles(smi)
    if parsed is None:
        return "INVALID"
    # An empty scaffold string is falsy, so `or` substitutes the placeholder.
    return MurckoScaffold.MurckoScaffoldSmiles(mol=parsed) or "ACYCLIC"


def desc_dict(mol):
    """Compute the fixed descriptor panel for one RDKit molecule.

    Returns a dict keyed by descriptor name, in the order listed below; each
    value comes from the ``rdkit.Chem.Descriptors`` function of that name.
    """
    descriptor_names = (
        "MolWt",
        "MolLogP",
        "TPSA",
        "NumHDonors",
        "NumHAcceptors",
        "NumRotatableBonds",
        "RingCount",
        "HeavyAtomCount",
    )
    return {name: getattr(Descriptors, name)(mol) for name in descriptor_names}


def featurize(data):
    """Build the descriptor table and the Morgan-bit table for a curated split.

    ``data`` must provide ``smiles``, ``MIC`` and ``pMIC``. Rows whose SMILES
    cannot be parsed are skipped in both outputs, so the two returned frames
    stay row-aligned. Fingerprint radius and length are read from
    ``config["fingerprint"]``.

    Returns ``(desc, fp_df)``: descriptors plus ``smiles``/``MIC``/``pMIC``/
    ``scaffold`` columns, and one ``FP_<i>`` column per fingerprint bit
    (an empty frame when nothing could be featurized).
    """
    descriptor_records = []
    bit_vectors = []
    for _, record in data.iterrows():
        smi = record["smiles"]
        mol = Chem.MolFromSmiles(smi)
        if mol is not None:
            descriptor_records.append({
                **desc_dict(mol),
                "smiles": smi,
                "MIC": record["MIC"],
                "pMIC": record["pMIC"],
                "scaffold": murcko_scaffold(smi),
            })
            fingerprint = AllChem.GetMorganFingerprintAsBitVect(
                mol,
                radius=config["fingerprint"]["radius"],
                nBits=config["fingerprint"]["nBits"],
            )
            bit_vectors.append(np.array(fingerprint, dtype=int))

    desc = pd.DataFrame(descriptor_records)
    if len(bit_vectors):
        bit_names = [f"FP_{i}" for i in range(config["fingerprint"]["nBits"])]
        fp_df = pd.DataFrame(bit_vectors, columns=bit_names)
    else:
        fp_df = pd.DataFrame()
    return desc.reset_index(drop=True), fp_df.reset_index(drop=True)


def bh_fdr(pvals):
    """Benjamini-Hochberg adjusted p-values (q-values).

    Parameters
    ----------
    pvals : array-like of float
        Raw p-values. Non-finite entries (for example NaN from a test on a
        constant descriptor) are treated as "test not performed".

    Returns
    -------
    numpy.ndarray
        q-values in the input order, clipped to [0, 1]. Non-finite inputs give
        NaN at the same position and are not counted in the number of tests,
        so one undefined p-value no longer turns every q-value into NaN.
    """
    pvals = np.asarray(pvals, dtype=float)
    out = np.full(pvals.shape, np.nan, dtype=float)
    valid = np.isfinite(pvals)
    m = int(valid.sum())
    if m == 0:
        return out

    p = pvals[valid]
    order = np.argsort(p)
    scaled = p[order] * m / np.arange(1, m + 1)
    # Enforce monotonicity from the largest p-value downwards.
    scaled = np.minimum.accumulate(scaled[::-1])[::-1]
    q = np.empty(m, dtype=float)
    q[order] = np.clip(scaled, 0, 1)
    out[valid] = q
    return out


def cliffs_delta(x, y):
    """Cliff's delta effect size between two samples.

    Computed as (#pairs with x > y minus #pairs with x < y) divided by the
    total number of (x, y) pairs.
    """
    first = np.asarray(x)
    second = np.asarray(y)
    # All pairwise comparisons at once via broadcasting.
    wins = np.sum(first[:, None] > second[None, :])
    losses = np.sum(first[:, None] < second[None, :])
    n_pairs = len(first) * len(second)
    return (wins - losses) / n_pairs


# Featurize both splits; the holdout may be empty, in which case empty frames
# stand in for its tables.
train_desc, train_fp = featurize(train_cur)
if len(exp_cur):
    exp_desc, exp_fp = featurize(exp_cur)
else:
    exp_desc, exp_fp = pd.DataFrame(), pd.DataFrame()

required_activity_cols = {"MIC", "pMIC"}
assert required_activity_cols.issubset(train_desc.columns), "train_desc must have MIC and pMIC"
if len(exp_desc):
    assert required_activity_cols.issubset(exp_desc.columns), "exp_desc must have MIC and pMIC"

assert len(set(train_cur["smiles"]) & set(exp_cur["smiles"])) == 0, "cross-split leakage persists"

# Everything except identifiers and activity columns is a model feature.
non_feature_cols = ("smiles", "MIC", "pMIC", "scaffold")
desc_cols = [col for col in train_desc.columns if col not in non_feature_cols]

# low_MIC label from train only: the threshold is a train quantile and is
# applied unchanged to the holdout.
low_mic_threshold = float(np.quantile(train_desc["MIC"], config["low_mic_quantile"]))
train_desc["low_MIC"] = (train_desc["MIC"] <= low_mic_threshold).astype(int)
if len(exp_desc):
    exp_desc["low_MIC"] = (exp_desc["MIC"] <= low_mic_threshold).astype(int)

# Persist the derived threshold alongside the rest of the run configuration.
config["low_MIC_threshold"] = low_mic_threshold
config_json = json.dumps(config, indent=2, ensure_ascii=False)
(ARTIFACTS / "run_config.json").write_text(config_json, encoding="utf-8")

X_desc = train_desc[desc_cols].copy()
y_cls = train_desc["low_MIC"].values
groups = train_desc["scaffold"].values
observed_classes = np.sort(np.unique(y_cls))
assert np.array_equal(observed_classes, np.array([0, 1])), "Need both classes for classification"

# A) Descriptor stats (train only): rank correlations of each descriptor with
# pMIC, mutual information with the low_MIC label, and BH-adjusted q-values.
mi_vals = mutual_info_classif(X_desc, y_cls, random_state=SEED, discrete_features=False)
mi_map = {name: score for name, score in zip(desc_cols, mi_vals)}

pmic_values = train_desc["pMIC"]
rows = []
for name in desc_cols:
    column = train_desc[name]
    rho_res = stats.spearmanr(column, pmic_values, nan_policy="omit")
    tau_res = stats.kendalltau(column, pmic_values, nan_policy="omit")
    rows.append(
        dict(
            descriptor=name,
            spearman_rho=rho_res.correlation,
            spearman_p=rho_res.pvalue,
            kendall_tau=tau_res.correlation,
            kendall_p=tau_res.pvalue,
            mutual_information_low_MIC=float(mi_map.get(name, np.nan)),
        )
    )

cor_df = pd.DataFrame(rows)
cor_df = cor_df.assign(
    spearman_q=bh_fdr(cor_df["spearman_p"].values),
    kendall_q=bh_fdr(cor_df["kendall_p"].values),
)
cor_df = cor_df.sort_values("spearman_q")
cor_df.to_csv(ARTIFACTS / "descriptor_correlations.csv", index=False)

# Univariate evidence per descriptor (train only): Mann-Whitney U, Cliff's
# delta, ROC-AUC, a Youden-optimal cut-off and a bootstrap odds ratio.
# `rng` is also consumed later by the y-randomization step.
rng = np.random.default_rng(SEED)
uni_rows = []
for c in desc_cols:
    low_vals = train_desc.loc[train_desc["low_MIC"] == 1, c].values
    high_vals = train_desc.loc[train_desc["low_MIC"] == 0, c].values
    mw = stats.mannwhitneyu(low_vals, high_vals, alternative="two-sided")
    delta = cliffs_delta(low_vals, high_vals)
    # delta < 0: low-MIC compounds tend to have SMALLER descriptor values, so
    # the window has the form "descriptor <= threshold".
    window_operator = "<=" if delta < 0 else ">="

    scores = train_desc[c].values
    # roc_curve treats larger scores as "more positive". For a "<=" window the
    # scores are negated so the Youden-optimal cut matches the operator;
    # otherwise the best point sits at the uninformative ends of the curve.
    direction = -1.0 if window_operator == "<=" else 1.0
    fpr, tpr, thresholds = roc_curve(y_cls, direction * scores)
    youden = tpr - fpr
    # Index 0 is the synthetic "nothing predicted positive" point whose
    # threshold is a sentinel (inf or max+1), never a usable cut-off.
    best_idx = int(np.argmax(youden[1:])) + 1 if len(youden) > 1 else 0
    # Undo the negation; "+ 0.0" turns a possible -0.0 into 0.0.
    youden_threshold = float(direction * thresholds[best_idx]) + 0.0

    # Odds ratio per one standard deviation of the descriptor.
    x = train_desc[[c]].values.astype(float)
    x = (x - x.mean(axis=0)) / (x.std(axis=0) + 1e-12)
    try:
        lr = LogisticRegression(solver="liblinear", random_state=SEED)
        lr.fit(x, y_cls)
        beta = float(lr.coef_[0][0])
        or_point = float(np.exp(beta))

        boots = []
        for _ in range(int(config["or_bootstrap_n"])):
            idx = rng.choice(len(y_cls), size=len(y_cls), replace=True)
            yb = y_cls[idx]
            # A resample with a single class cannot be fitted.
            if len(np.unique(yb)) < 2:
                continue
            xb = x[idx]
            try:
                blr = LogisticRegression(solver="liblinear", random_state=SEED)
                blr.fit(xb, yb)
                boots.append(float(np.exp(blr.coef_[0][0])))
            except Exception:
                continue
        # Report a percentile CI only with a minimally useful number of fits.
        if len(boots) >= 20:
            or_low, or_high = np.percentile(boots, [2.5, 97.5])
        else:
            or_low, or_high = np.nan, np.nan
    except Exception as exc:
        # Best effort: keep the other statistics, but leave a trace in the log.
        logger.warning(f"Odds-ratio fit failed for {c}: {exc}")
        or_point, or_low, or_high = np.nan, np.nan, np.nan

    uni_rows.append({
        "descriptor": c,
        "mannwhitney_u": mw.statistic,
        "mannwhitney_p": mw.pvalue,
        "cliffs_delta": delta,
        # AUC on the raw (un-negated) scores: values below 0.5 indicate an
        # inverse association with low_MIC.
        "roc_auc": roc_auc_score(y_cls, scores),
        "youden_index": float(youden[best_idx]),
        "youden_threshold": youden_threshold,
        "window_operator": window_operator,
        "odds_ratio": or_point,
        "odds_ratio_ci_low": or_low,
        "odds_ratio_ci_high": or_high,
    })

uni_df = pd.DataFrame(uni_rows)
uni_df["mannwhitney_q"] = bh_fdr(uni_df["mannwhitney_p"].values)
uni_df = uni_df.sort_values("mannwhitney_q")
uni_df.to_csv(ARTIFACTS / "univariate_statistics.csv", index=False)

# Windows: descriptors passing q <= 0.1, or the top five if none pass.
important_windows = uni_df.loc[uni_df["mannwhitney_q"] <= 0.1].copy()
if important_windows.empty:
    important_windows = uni_df.head(min(5, len(uni_df))).copy()
descriptor_windows = important_windows[["descriptor", "youden_threshold", "window_operator", "roc_auc", "mannwhitney_q"]].rename(
    columns={
        "youden_threshold": "threshold",
        "window_operator": "rule",
        "mannwhitney_q": "q_value",
    }
)
descriptor_windows.to_csv(ARTIFACTS / "descriptor_windows.csv", index=False)

# Bar chart of the Spearman correlation of each descriptor with pMIC.
spearman_fig = plt.figure(figsize=(8, 4))
sns.barplot(data=cor_df, x="descriptor", y="spearman_rho", color="#4477aa")
plt.xticks(rotation=45, ha="right")
plt.title("Spearman correlation with pMIC (train)")
plt.tight_layout()
plt.savefig(FIG_DIR / "descriptor_spearman.png", dpi=220)
plt.close()

# Bar chart of univariate ROC-AUC; the dashed line marks chance level (0.5).
auc_fig = plt.figure(figsize=(8, 4))
sns.barplot(data=uni_df, x="descriptor", y="roc_auc", color="#66aa55")
plt.axhline(0.5, ls="--", color="k")
plt.xticks(rotation=45, ha="right")
plt.title("Univariate descriptor ROC-AUC for low_MIC")
plt.tight_layout()
plt.savefig(FIG_DIR / "descriptor_univariate_auc.png", dpi=220)
plt.close()

# B) Scaffold-aware classification (train only): each model is evaluated with
# GroupKFold so that compounds sharing a Murcko scaffold never straddle the
# train/test boundary of a fold.
models = {
    "LogReg_L2": Pipeline([
        ("imp", SimpleImputer(strategy="median")),
        ("sc", StandardScaler()),
        ("clf", LogisticRegression(penalty="l2", solver="liblinear", max_iter=5000, random_state=SEED)),
    ]),
    "LogReg_L1": Pipeline([
        ("imp", SimpleImputer(strategy="median")),
        ("sc", StandardScaler()),
        ("clf", LogisticRegression(penalty="l1", solver="saga", max_iter=5000, random_state=SEED)),
    ]),
    "RandomForest": RandomForestClassifier(n_estimators=500, random_state=SEED, n_jobs=-1, class_weight="balanced"),
}

# Cannot have more folds than distinct scaffolds.
n_splits = min(config["cv_n_splits"], len(np.unique(groups)))
if n_splits < 2:
    raise ValueError("GroupKFold needs at least 2 distinct scaffolds in train_lit")
cv = GroupKFold(n_splits=n_splits)
clf_rows = []
roc_curves = {}
pr_curves = {}
fold_metric_columns = ["fold", "roc_auc", "pr_auc", "balanced_accuracy"]

for name, model in models.items():
    fold_metrics = []
    # NaN marks samples that never received an out-of-fold prediction.
    oof_proba = np.full(len(train_desc), np.nan, dtype=float)

    for fold, (tr, te) in enumerate(cv.split(X_desc, y_cls, groups), start=1):
        Xtr, Xte = X_desc.iloc[tr], X_desc.iloc[te]
        ytr, yte = y_cls[tr], y_cls[te]
        # With few positives a scaffold-grouped fold can end up single-class.
        # Such a training fold cannot be fitted as a binary classifier.
        if len(np.unique(ytr)) < 2:
            logger.warning(f"{name} fold {fold}: single-class training fold skipped")
            continue
        m = clone(model)
        m.fit(Xtr, ytr)
        prob = m.predict_proba(Xte)[:, 1]
        pred = (prob >= 0.5).astype(int)

        oof_proba[te] = prob
        # ROC-AUC needs both classes in the test fold; average precision needs
        # at least one positive. Undefined metrics are stored as NaN, which the
        # pandas mean/std below skip.
        has_both_classes = len(np.unique(yte)) == 2
        has_positive = bool(np.any(yte == 1))
        fold_metrics.append({
            "fold": fold,
            "roc_auc": roc_auc_score(yte, prob) if has_both_classes else np.nan,
            "pr_auc": average_precision_score(yte, prob) if has_positive else np.nan,
            "balanced_accuracy": balanced_accuracy_score(yte, pred),
        })

    fold_df = pd.DataFrame(fold_metrics, columns=fold_metric_columns)
    clf_rows.append({
        "model": name,
        "roc_auc_mean": fold_df["roc_auc"].mean(),
        "roc_auc_std": fold_df["roc_auc"].std(ddof=1),
        "pr_auc_mean": fold_df["pr_auc"].mean(),
        "pr_auc_std": fold_df["pr_auc"].std(ddof=1),
        "balanced_accuracy_mean": fold_df["balanced_accuracy"].mean(),
        "balanced_accuracy_std": fold_df["balanced_accuracy"].std(ddof=1),
    })
    # Pooled out-of-fold curves use only samples that were actually scored.
    scored = np.isfinite(oof_proba)
    if len(np.unique(y_cls[scored])) == 2:
        roc_curves[name] = roc_curve(y_cls[scored], oof_proba[scored])
        pr_curves[name] = precision_recall_curve(y_cls[scored], oof_proba[scored])

classification_metrics = pd.DataFrame(clf_rows).sort_values("roc_auc_mean", ascending=False)
classification_metrics.to_csv(ARTIFACTS / "classification_metrics.csv", index=False)

plt.figure(figsize=(6, 5))
for name, (fpr, tpr, _) in roc_curves.items():
    plt.plot(fpr, tpr, label=name)
plt.plot([0, 1], [0, 1], "k--")
plt.xlabel("FPR")
plt.ylabel("TPR")
plt.title("OOF ROC curves (GroupKFold by scaffold)")
plt.legend()
plt.tight_layout(); plt.savefig(FIG_DIR / "classification_roc.png", dpi=220); plt.close()

plt.figure(figsize=(6, 5))
for name, (prec, rec, _) in pr_curves.items():
    plt.plot(rec, prec, label=name)
plt.xlabel("Recall")
plt.ylabel("Precision")
plt.title("OOF PR curves (GroupKFold by scaffold)")
plt.legend()
plt.tight_layout(); plt.savefig(FIG_DIR / "classification_pr.png", dpi=220); plt.close()

# C) Fragment enrichment on train: one-sided Fisher test per Morgan bit for
# over-representation among low_MIC compounds, followed by BH correction.
min_train_presence = config["fragment_min_train_presence"]
min_lowmic_presence = config["fragment_min_lowmic_presence"]

frag_rows = []
for col in train_fp.columns:
    bit = train_fp[col].values.astype(int)
    present_train = int(bit.sum())
    present_low = int(bit[train_desc["low_MIC"].values == 1].sum())
    # Only bits that are frequent enough overall and among low_MIC compounds
    # are tested.
    frequent_enough = present_train >= min_train_presence and present_low >= min_lowmic_presence
    if not frequent_enough:
        continue

    # 2x2 contingency table: bit present/absent vs low_MIC yes/no.
    on, off = bit == 1, bit == 0
    pos, neg = y_cls == 1, y_cls == 0
    a = int(np.sum(on & pos))
    b = int(np.sum(on & neg))
    c = int(np.sum(off & pos))
    d = int(np.sum(off & neg))
    _, p = stats.fisher_exact([[a, b], [c, d]], alternative="greater")

    # 0.5 is added to every cell so the ratio stays finite with empty cells.
    odds_ratio = ((a + 0.5) * (d + 0.5)) / ((b + 0.5) * (c + 0.5))
    # Hit rate among bit carriers relative to the overall low_MIC rate.
    enrichment_factor = (a / max(1, a + b)) / (np.sum(y_cls == 1) / len(y_cls))

    frag_rows.append(
        dict(
            fragment=col,
            a_low_present=a,
            b_nonlow_present=b,
            c_low_absent=c,
            d_nonlow_absent=d,
            odds_ratio=odds_ratio,
            enrichment_factor=enrichment_factor,
            fisher_p=p,
        )
    )

frag_df = pd.DataFrame(frag_rows)
if not len(frag_df):
    # Nothing passed the filters: keep a header-only table for the CSV.
    frag_df = pd.DataFrame(columns=["fragment", "fisher_p", "fisher_q", "odds_ratio", "enrichment_factor"])
else:
    frag_df["fisher_q"] = bh_fdr(frag_df["fisher_p"].values)
    frag_df = frag_df.sort_values(["fisher_q", "odds_ratio"], ascending=[True, False])
frag_df.to_csv(ARTIFACTS / "fragments_enrichment.csv", index=False)

top_frag = frag_df.head(15)
if len(top_frag):
    plt.figure(figsize=(8, 4))
    sns.barplot(data=top_frag, x="fragment", y="odds_ratio", color="#cc6677")
    plt.xticks(rotation=60, ha="right")
    plt.title("Top enriched Morgan bits in low_MIC")
    plt.tight_layout()
    plt.savefig(FIG_DIR / "fragments_top_enrichment.png", dpi=220)
    plt.close()

# D) Rule extraction with decision tree on top descriptors + top bits.
# Shallow trees (depth 2 and 3) are fitted on train; every leaf becomes one
# conjunctive rule with its support, precision, recall and lift.
top_desc = uni_df.sort_values("mannwhitney_q").head(min(4, len(uni_df)))["descriptor"].tolist()
top_bits = frag_df.head(min(6, len(frag_df)))["fragment"].tolist()
rule_features = top_desc + top_bits
rule_matrix = pd.concat([train_desc[top_desc], train_fp[top_bits]], axis=1) if len(rule_features) else pd.DataFrame()

rule_columns = ["tree_depth", "rule", "predicted_class", "support", "precision", "recall", "lift"]


def _leaf_rules(tree, depth, matrix, labels, feature_names):
    """Return one rule dict per populated leaf of a fitted decision tree."""
    children_left = tree.tree_.children_left
    children_right = tree.tree_.children_right
    feature = tree.tree_.feature
    threshold = tree.tree_.threshold
    # Leaf assignment of every training row, computed once per tree instead
    # of once per leaf.
    leaf_of_row = tree.apply(matrix)
    is_positive = labels == 1
    n_positive = np.sum(is_positive)
    p_act = np.mean(is_positive)
    collected = []

    def walk(node, conds):
        # In scikit-learn's tree arrays a leaf has identical child ids.
        if children_left[node] == children_right[node]:
            leaf_idx = leaf_of_row == node
            support = float(leaf_idx.mean())
            if support == 0:
                return
            tp = int(np.sum(is_positive & leaf_idx))
            pp = int(np.sum(leaf_idx))
            precision = tp / pp if pp else 0.0
            recall = tp / max(1, n_positive)
            lift = (precision / p_act) if p_act > 0 else np.nan
            pred = int(np.argmax(tree.tree_.value[node][0]))
            collected.append({
                "tree_depth": depth,
                "rule": " AND ".join(conds) if conds else "TRUE",
                "predicted_class": pred,
                "support": support,
                "precision": precision,
                "recall": recall,
                "lift": lift,
            })
        else:
            feat_name = feature_names[feature[node]]
            thr = threshold[node]
            walk(children_left[node], conds + [f"{feat_name} <= {thr:.4f}"])
            walk(children_right[node], conds + [f"{feat_name} > {thr:.4f}"])

    walk(0, [])
    return collected


rules_out = []
if len(rule_features):
    for depth in [2, 3]:
        tree = DecisionTreeClassifier(max_depth=depth, random_state=SEED, min_samples_leaf=3)
        tree.fit(rule_matrix, y_cls)
        rules_out.extend(_leaf_rules(tree, depth, rule_matrix, y_cls, rule_features))

# Explicit columns keep sort_values valid when no rule was produced.
rules_df = pd.DataFrame(rules_out, columns=rule_columns).sort_values(["lift", "precision"], ascending=False)
rules_df.to_csv(ARTIFACTS / "rules.csv", index=False)

# E) Experimental holdout evaluation (no fit on exp): for every holdout
# compound count how many train-derived descriptor windows, enriched bits and
# low-MIC tree rules it satisfies.
exp_checks = []
selected_windows = descriptor_windows.head(min(3, len(descriptor_windows)))


def _parse_rule(rule_text):
    """Split a rule string into (feature, operator, threshold) conditions."""
    parsed = []
    if rule_text == "TRUE":
        return parsed
    for cond in rule_text.split(" AND "):
        if " <= " in cond:
            f, t = cond.split(" <= ")
            parsed.append((f, "<=", float(t)))
        elif " > " in cond:
            f, t = cond.split(" > ")
            parsed.append((f, ">", float(t)))
    return parsed


# Only rules whose leaf predicts low_MIC (class 1) are counted. Each tree
# partitions feature space, so every compound satisfies exactly one leaf per
# tree; counting leaves of both classes would give the same constant for
# every compound.
low_mic_rules = []
if len(rules_df) and len(rule_features):
    low_mic_rules = [
        _parse_rule(rule_text)
        for rule_text in rules_df.loc[rules_df["predicted_class"] == 1, "rule"]
    ]

for idx, row in exp_desc.iterrows() if len(exp_desc) else []:
    w_hits = 0
    for _, w in selected_windows.iterrows():
        op = w["rule"]
        thr = float(w["threshold"])
        val = row[w["descriptor"]]
        ok = (val <= thr) if op == "<=" else (val >= thr)
        w_hits += int(ok)

    enriched_bits = int(exp_fp.loc[idx, top_bits].sum()) if len(top_bits) else 0

    triggered_rules = 0
    if low_mic_rules:
        # Descriptor and bit values of this compound, addressed by name.
        feature_values = pd.concat([exp_desc.loc[idx, top_desc], exp_fp.loc[idx, top_bits]])
        for conditions in low_mic_rules:
            passed = all(
                (float(feature_values[f]) <= t) if op == "<=" else (float(feature_values[f]) > t)
                for f, op, t in conditions
            )
            triggered_rules += int(passed)

    exp_checks.append({
        "smiles": row["smiles"],
        "MIC": row["MIC"],
        "pMIC": row["pMIC"],
        "descriptor_window_hits": w_hits,
        "enriched_fragment_hits": enriched_bits,
        "triggered_rules": triggered_rules,
    })

exp_checks_df = pd.DataFrame(exp_checks)
exp_checks_df.to_csv(ARTIFACTS / "experimental_rule_check.csv", index=False)

if len(exp_checks_df):
    plt.figure(figsize=(6, 4))
    plt.scatter(exp_checks_df["descriptor_window_hits"], exp_checks_df["enriched_fragment_hits"], alpha=0.8)
    plt.xlabel("Descriptor windows hit")
    plt.ylabel("Enriched fragments present")
    plt.title("Experimental holdout summary")
    plt.tight_layout(); plt.savefig(FIG_DIR / "experimental_summary.png", dpi=220); plt.close()

    plt.figure(figsize=(5, 5))
    plt.scatter(exp_checks_df["pMIC"], exp_checks_df["triggered_rules"], alpha=0.8)
    plt.xlabel("Observed pMIC")
    plt.ylabel("Triggered rules")
    plt.title("Experimental parity-style summary")
    plt.tight_layout(); plt.savefig(FIG_DIR / "experimental_parity.png", dpi=220); plt.close()

# F) y-randomization for classification (train only): the cross-validated AUC
# on the real labels is compared with the AUCs obtained after permuting them.
def cv_auc_for_labels(labels):
    """Mean GroupKFold ROC-AUC of the L2 logistic model for the given labels.

    Folds whose training or test part contains a single class are skipped,
    because neither the fit nor ROC-AUC is defined for them; this happens
    easily once labels are permuted. Returns NaN if no fold could be scored.
    """
    model = clone(models["LogReg_L2"])
    aucs = []
    for tr, te in cv.split(X_desc, labels, groups):
        if len(np.unique(labels[tr])) < 2 or len(np.unique(labels[te])) < 2:
            continue
        model.fit(X_desc.iloc[tr], labels[tr])
        prob = model.predict_proba(X_desc.iloc[te])[:, 1]
        aucs.append(roc_auc_score(labels[te], prob))
    return float(np.mean(aucs)) if aucs else float("nan")

real_auc = cv_auc_for_labels(y_cls)
yrand_rows = []
for i in range(config["yrandomization_n"]):
    ys = rng.permutation(y_cls)
    yrand_rows.append({"iteration": i + 1, "roc_auc": cv_auc_for_labels(ys), "label": "scrambled"})
yrand_rows.append({"iteration": 0, "roc_auc": real_auc, "label": "real"})

yrand_df = pd.DataFrame(yrand_rows)
yrand_df.to_csv(ARTIFACTS / "y_random_classification.csv", index=False)

# Unscorable permutations (NaN) are left out of the histogram and p-value.
scrambled_auc = yrand_df.loc[yrand_df["label"] == "scrambled", "roc_auc"].dropna()
if len(scrambled_auc) and np.isfinite(real_auc):
    # Empirical one-sided p-value with the usual +1 correction.
    n_at_least_real = int((scrambled_auc >= real_auc).sum())
    empirical_p = (1 + n_at_least_real) / (1 + len(scrambled_auc))
    logger.info(
        f"y-randomization | real_auc={real_auc:.3f} | scrambled_mean={scrambled_auc.mean():.3f} | empirical_p={empirical_p:.4f}"
    )

plt.figure(figsize=(6, 4))
plt.hist(scrambled_auc, bins=15, alpha=0.7, label="scrambled")
plt.axvline(real_auc, color="red", lw=2, label=f"real={real_auc:.3f}")
plt.xlabel("ROC-AUC")
plt.ylabel("Count")
plt.title("y-randomization (classification, GroupKFold)")
plt.legend()
plt.tight_layout(); plt.savefig(FIG_DIR / "y_random_classification.png", dpi=220); plt.close()

logger.info("Inverse QSAR activity-conditioned pipeline completed")
print("Artifacts generated in:", ARTIFACTS)
print("- descriptor_correlations.csv")
print("- univariate_statistics.csv")
print("- descriptor_windows.csv")
print("- classification_metrics.csv")
print("- fragments_enrichment.csv")
print("- rules.csv")
print("- experimental_rule_check.csv")
print("- y_random_classification.csv")


## Формула Q² (документация)

Используется только out-of-fold предсказание на `train_lit`:

\[
Q^2 = 1 - \frac{PRESS}{TSS},\quad PRESS = \sum_i (y_i - \hat y_{i,OOF})^2,\quad TSS = \sum_i (y_i - \bar y_{train\_lit})^2.
\]

Внешний `experimental_holdout` никогда не участвует в обучении/CV.


## Ограничения и риски

- **Source bias**: литература и эксперимент могут иметь систематические различия условий.
- **Small sample size**: нестабильность оценок и широкие доверительные интервалы.
- **Experimental variability**: шум MIC измерений влияет на верхний потолок качества.
- **Scaffold-empty issue**: для ациклических структур Murcko scaffold пустой, поэтому все такие молекулы получают одну общую метку `ACYCLIC` и попадают в одну группу GroupKFold, что ухудшает групповое разбиение.
- Рекомендации: расширить датасет, делать group split по источникам/сериям, добавить uncertainty и калибровку доверия.


## Как отвечать на критику «модель не работает»

Честная формулировка для защиты:

> Жёсткий scaffold-test действительно снижает метрики — это ожидаемо для реалистичной валидации. Именно поэтому работа не опирается только на одно число R²: добавлены статистические SAR-тесты, y-randomization и Applicability Domain (Williams plot). Итог: модель применима для ранжирования **внутри домена применимости**, а не как универсальный предиктор для любых структур.


## Checklist выполнения задач

- [x] Environment & logging + фиксированный seed + `artifacts/run.log`.
- [x] Без pip-установки RDKit в ноутбуке; безопасная остановка при отсутствии зависимостей.
- [x] Явный split: `train_lit=0..89`, `experimental_holdout=90+`.
- [x] Data curation: canonicalization, invalid removal, largest organic fragment, duplicate policy, `curation_report.csv`.
- [x] CV только на train_lit (GroupKFold по Murcko), OOF-метрики и Q².
- [x] External validation: experimental_holdout (если есть) + scaffold external test.
- [x] Statistical SAR evidence (Spearman/Kendall+FDR, Mann–Whitney + Cliff’s delta + bootstrap CI, Fisher design windows).
- [x] y-randomization (N=50) + CSV + PNG.
- [x] Applicability Domain: Williams plot + `in_domain` в predictions.
- [x] Feature importance: Ridge coefficients + RF permutation importance.
- [x] Overfitting flag: `delta_R2` и `suspicious_overfit`.
- [x] Артефакты и `validation_bullets_ru.txt`.


## Experimental rows start at 90

Экспериментальные строки начинаются с индекса 90. Если в данных меньше 90 строк, `experimental_holdout` пустой, и выполняется только scaffold external test внутри `train_lit`.
