# QSAR pipeline for antimicrobial peptidomimetics

## Experimental holdout (rows 90+)
Начиная со строки с индексом 90 (нумерация с нуля; строки 0–89 — литературный набор) находятся новые экспериментальные (синтезированные) соединения.
Они **не участвуют в обучении и CV**. Используются только как внешний экспериментальный тест (`test_exp`).


## Data curation (before features)
- Canonical SMILES via RDKit (`MolFromSmiles` -> `MolToSmiles(canonical=True)`).
- Invalid SMILES are removed.
- Salts/counter-ions are normalized with `LargestFragmentChooser` (keep parent fragment).
- Duplicates after standardization are resolved by configurable `duplicate_policy` and `tolerance`.


In [None]:
from pathlib import Path
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd

from rdkit import Chem, DataStructs
from rdkit.Chem import AllChem
from rdkit.Chem.Scaffolds import MurckoScaffold
from rdkit.Chem.MolStandardize import rdMolStandardize
from rdkit.ML.Descriptors import MoleculeDescriptors

from sklearn.model_selection import GroupKFold, GroupShuffleSplit, cross_val_predict
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error


In [None]:
# Input table and output locations
DATA_PATH = 'potok.csv'  # CSV that must provide the columns: smiles, activity

# Every generated file lives under this directory (created if missing).
ARTIFACTS_DIR = Path('artifacts')
ARTIFACTS_DIR.mkdir(parents=True, exist_ok=True)
CURATION_REPORT_PATH = ARTIFACTS_DIR.joinpath('curation_report.csv')
METRICS_SUMMARY_PATH = ARTIFACTS_DIR.joinpath('metrics_summary.csv')
PRED_EXP_PATH = ARTIFACTS_DIR.joinpath('predictions_experimental.csv')

# Modelling constants
LIT_CUTOFF = 90  # rows 0..89 form the literature set; rows 90+ are the external experimental set
N_SPLITS_CV = 5  # requested number of cross-validation folds
RANDOM_STATE = 42  # seed passed to the stochastic estimators


## 1) Load raw data

In [None]:
# Read the raw table as-is; required columns are validated later by curate_data.
df_raw = pd.read_csv(DATA_PATH)
print(f'Raw shape: {df_raw.shape}')
print(f'Columns: {df_raw.columns.tolist()}')
df_raw.head()


## 2) Data curation functions

In [None]:
def _standardize_smiles(smiles: str, lfc: rdMolStandardize.LargestFragmentChooser):
    """Return standardized canonical parent SMILES, reason, salt-flag."""
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return None, 'invalid_smiles', False

    n_frags_before = len(Chem.GetMolFrags(mol))
    parent = lfc.choose(mol)
    if parent is None:
        return None, 'standardization_failed', False

    standardized = Chem.MolToSmiles(parent, canonical=True)
    salt_norm = n_frags_before > 1
    reason = 'ok_salt_normalized' if salt_norm else 'ok'
    return standardized, reason, salt_norm


def curate_data(
    df: pd.DataFrame,
    smiles_col: str = 'smiles',
    activity_col: str = 'activity',
    duplicate_policy: str = 'median',
    tolerance: float = 1e-6,
    report_path: Path = Path('artifacts/curation_report.csv')
):
    """Curate a raw SMILES/activity table into a QSAR-ready table.

    Steps: coerce the activity column to numeric and drop rows lacking SMILES
    or activity; standardize every SMILES with ``_standardize_smiles``; drop
    rows whose standardization failed; collapse rows that share a standardized
    SMILES.

    Args:
        df: Raw input table.
        smiles_col: Name of the SMILES column.
        activity_col: Name of the activity column.
        duplicate_policy: How duplicate groups whose activities disagree are
            resolved: ``'median'``, ``'mean'`` or ``'drop_conflicts'``.
        tolerance: Maximum (max - min) activity spread within a duplicate
            group that is still treated as agreement.
        report_path: CSV file receiving the per-row curation report; missing
            parent directories are created.

    Returns:
        Tuple ``(curated_df, summary, report_df)``:
        - ``curated_df``: columns ``standardized_smiles``, ``activity`` and
          ``source_row_min`` (smallest 0-based position in ``df`` among the
          rows merged into the record), sorted by ``source_row_min``.
        - ``summary``: dict of counters and settings of this run.
        - ``report_df``: removed rows and every member of a duplicate group,
          each with a ``reason``.

    Raises:
        ValueError: If ``duplicate_policy`` is unknown or a required column is
            missing from ``df``.
    """
    allowed = {'median', 'mean', 'drop_conflicts'}
    if duplicate_policy not in allowed:
        raise ValueError(f'duplicate_policy must be one of {allowed}')

    required = {smiles_col, activity_col}
    missing = required - set(df.columns)
    if missing:
        raise ValueError(f'Missing required columns: {missing}')

    report_cols = ['original_smiles', 'standardized_smiles', 'reason', 'original_activity']
    std_cols = ['row_id', 'original_smiles', 'standardized_smiles',
                'original_activity', 'reason', 'salt_normalized']
    curated_cols = ['standardized_smiles', 'activity', 'source_row_min']

    work = df.copy().reset_index(drop=True)
    work[activity_col] = pd.to_numeric(work[activity_col], errors='coerce')
    # Deliberately no second reset_index: the index keeps the 0-based position
    # of each row in the input table, so `source_row_min` stays meaningful for
    # order-based decisions downstream even when rows are dropped here.
    work = work.dropna(subset=[smiles_col, activity_col])

    lfc = rdMolStandardize.LargestFragmentChooser()

    rows = []
    for row_id, raw_smiles, raw_activity in zip(work.index, work[smiles_col], work[activity_col]):
        std_smi, reason, salt_norm = _standardize_smiles(raw_smiles, lfc)
        rows.append({
            'row_id': int(row_id),
            'original_smiles': raw_smiles,
            'standardized_smiles': std_smi,
            'original_activity': float(raw_activity),
            'reason': reason,
            'salt_normalized': bool(salt_norm),
        })

    # Explicit columns keep the frame well-formed when no row survived dropna.
    std_df = pd.DataFrame(rows, columns=std_cols)
    invalid_mask = std_df['standardized_smiles'].isna()

    report_frames = []
    if invalid_mask.any():
        report_frames.append(std_df.loc[invalid_mask, report_cols])

    valid_df = std_df.loc[~invalid_mask].copy()

    grouped_rows = []
    duplicate_groups, duplicate_collapsed, conflicts_found = 0, 0, 0

    for ssmiles, grp in valid_df.groupby('standardized_smiles', sort=False):
        acts = grp['original_activity'].to_numpy()
        first_row = int(grp['row_id'].min())
        n = len(grp)
        if n == 1:
            grouped_rows.append({
                'standardized_smiles': ssmiles,
                'activity': float(acts[0]),
                'source_row_min': first_row,
            })
            continue

        duplicate_groups += 1
        duplicate_collapsed += (n - 1)
        conflict = (np.max(acts) - np.min(acts)) > tolerance
        if conflict:
            conflicts_found += 1

        if not conflict:
            # Values agree within tolerance; the median is a representative one.
            final_activity = float(np.median(acts))
            action = 'duplicates_equal_keep_one'
            keep_row = True
        elif duplicate_policy == 'median':
            final_activity = float(np.median(acts))
            action = 'duplicates_conflict_aggregated_median'
            keep_row = True
        elif duplicate_policy == 'mean':
            final_activity = float(np.mean(acts))
            action = 'duplicates_conflict_aggregated_mean'
            keep_row = True
        else:
            final_activity = np.nan
            action = 'duplicates_conflict_dropped'
            keep_row = False

        # Every member of a duplicate group is reported, kept or not.
        report_frames.append(pd.DataFrame({
            'original_smiles': grp['original_smiles'].values,
            'standardized_smiles': grp['standardized_smiles'].values,
            'reason': [action] * n,
            'original_activity': grp['original_activity'].values,
        }))

        if keep_row:
            grouped_rows.append({
                'standardized_smiles': ssmiles,
                'activity': final_activity,
                'source_row_min': first_row,
            })

    # Explicit columns: sorting an empty, column-less frame would raise KeyError.
    curated_df = (pd.DataFrame(grouped_rows, columns=curated_cols)
                  .sort_values('source_row_min')
                  .reset_index(drop=True))

    report_df = (pd.concat(report_frames, ignore_index=True)
                 if report_frames else
                 pd.DataFrame(columns=report_cols))
    report_path = Path(report_path)
    # The default path points into a directory that may not exist yet.
    report_path.parent.mkdir(parents=True, exist_ok=True)
    report_df.to_csv(report_path, index=False)

    summary = {
        'n_input_rows': int(len(df)),
        'n_after_numeric_and_notna': int(len(work)),
        'n_invalid_removed': int(invalid_mask.sum()),
        'n_salt_normalized': int(std_df['salt_normalized'].sum()),
        'n_duplicate_groups': int(duplicate_groups),
        'n_duplicates_collapsed': int(duplicate_collapsed),
        'n_activity_conflicts': int(conflicts_found),
        'duplicate_policy': duplicate_policy,
        'tolerance': tolerance,
        'n_output_rows': int(len(curated_df)),
        'report_path': str(report_path),
    }

    return curated_df, summary, report_df


In [None]:
# Curate the raw table; conflicting duplicates are aggregated by their median.
curated_df, curation_summary, curation_report = curate_data(
    df_raw,
    smiles_col='smiles', activity_col='activity',
    duplicate_policy='median', tolerance=1e-6,
    report_path=CURATION_REPORT_PATH,
)

print('=== Data curation summary ===')
for key in curation_summary:
    print(f'{key}: {curation_summary[key]}')

# Sanity checks before modelling: the target is later log10-transformed,
# so every activity has to be strictly positive.
assert not curated_df['standardized_smiles'].isna().any(), 'NaN in standardized_smiles after curation.'
if curated_df['activity'].le(0).any():
    raise ValueError('Found activity <= 0; cannot safely apply log10 transform for target.')
assert curated_df.shape[0] > 0, 'Curated dataset is empty.'


## 3) Experimental split: train_lit (0–89) vs test_exp (90+)

In [None]:
def split_literature_experimental(df: pd.DataFrame, cutoff: int = 90, source_col: str = 'source_row_min'):
    """Split a table into literature (< cutoff) and experimental (>= cutoff) parts.

    The reference index compared against ``cutoff`` is chosen in this order:

    1. ``source_col`` when the column exists and is fully numeric. In a table
       returned by ``curate_data`` this column records the input row each
       record came from, so the cutoff is applied to input rows, not to
       positions in the curated table. Positions shift as soon as curation
       removes or merges a row, which would move experimental compounds into
       the literature set.
    2. Column ``'№'`` when it is numeric and equal to the positional index.
    3. The positional index.

    Note: a record merged from rows on both sides of the cutoff keeps its
    smallest source row and therefore lands in the literature part.

    Args:
        df: Table to split.
        cutoff: First reference index belonging to the experimental part.
        source_col: Provenance column to prefer; pass ``None`` to ignore it.

    Returns:
        Tuple ``(train_lit, test_exp)``, both with a fresh 0-based index.
    """
    work = df.copy().reset_index(drop=True)
    positions = np.arange(len(work))

    idx_ref = None
    if source_col is not None and source_col in work.columns:
        source_idx = pd.to_numeric(work[source_col], errors='coerce')
        if source_idx.notna().all():
            idx_ref = source_idx.to_numpy()

    if idx_ref is None and '№' in work.columns:
        num_col = pd.to_numeric(work['№'], errors='coerce')
        if num_col.notna().all() and np.allclose(num_col.to_numpy(), positions):
            idx_ref = num_col.to_numpy()

    if idx_ref is None:
        idx_ref = positions

    # Plain NumPy masks: selection is by position, with no index alignment.
    train_mask = idx_ref < cutoff
    train_lit = work.loc[train_mask].reset_index(drop=True)
    test_exp = work.loc[~train_mask].reset_index(drop=True)
    return train_lit, test_exp

# Separate the literature set from the external experimental holdout.
train_lit, test_exp = split_literature_experimental(curated_df, cutoff=LIT_CUTOFF)
print(f'train_lit: {train_lit.shape}')
print(f'test_exp: {test_exp.shape}')


## 4) Feature generation

In [None]:
# RDKit descriptor names, in the column order of the descriptor matrix.
descriptor_names = [
    # size and lipophilicity
    'MolWt',
    'MolLogP',
    # hydrogen bonding and polarity
    'NumHDonors',
    'NumHAcceptors',
    'NumHeteroatoms',
    # flexibility and polar surface
    'NumRotatableBonds',
    'TPSA',
    # ring systems and saturation
    'NumAromaticRings',
    'RingCount',
    'FractionCSP3',
]

# One shared calculator that evaluates exactly the descriptors listed above.
calc = MoleculeDescriptors.MolecularDescriptorCalculator(descriptor_names)

def calc_descriptor_frame(mols):
    """Return a DataFrame of the configured descriptors, one row per molecule."""
    descriptor_values = list(map(calc.CalcDescriptors, mols))
    return pd.DataFrame(descriptor_values, columns=descriptor_names)

def morgan_fp_array(mol, radius=2, n_bits=2048):
    """Return the Morgan bit-vector fingerprint of ``mol`` as an int8 array of length ``n_bits``."""
    bit_vect = AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits=n_bits)
    # RDKit fills the pre-allocated array in place.
    bits = np.zeros(n_bits, dtype=np.int8)
    DataStructs.ConvertToNumpyArray(bit_vect, bits)
    return bits

def murcko_scaffold_smiles(mol):
    """Return the Murcko scaffold of ``mol`` as SMILES, with chirality excluded."""
    scaffold = MurckoScaffold.MurckoScaffoldSmiles(mol=mol, includeChirality=False)
    return scaffold


In [None]:
def build_model_table(df):
    """Attach molecule objects, the log10 target and Murcko scaffolds to a curated table.

    Rows whose ``standardized_smiles`` cannot be parsed are dropped. The
    result gains the columns ``mol``, ``y`` (log10 of ``activity``) and
    ``scaffold``.

    Raises:
        ValueError: If any remaining ``activity`` is not strictly positive.
    """
    parsed = df['standardized_smiles'].apply(Chem.MolFromSmiles)
    tbl = df.assign(mol=parsed)
    tbl = tbl.loc[tbl['mol'].notna()].reset_index(drop=True)

    if tbl['activity'].le(0).any():
        raise ValueError('activity must be > 0 for log transform')

    tbl['y'] = np.log10(tbl['activity'])
    tbl['scaffold'] = tbl['mol'].apply(murcko_scaffold_smiles)
    return tbl

# Model tables: the literature set is always built, while the experimental
# holdout may be empty and then falls back to an empty frame with the same columns.
train_tbl = build_model_table(train_lit)
if len(test_exp) > 0:
    test_exp_tbl = build_model_table(test_exp)
else:
    test_exp_tbl = pd.DataFrame(columns=train_tbl.columns)

# Training features, target and CV groups (one Murcko scaffold per molecule).
X_desc_train = calc_descriptor_frame(train_tbl['mol'])
X_fp_train = np.vstack(list(map(morgan_fp_array, train_tbl['mol'])))
y_train = train_tbl['y'].to_numpy()
groups_train = train_tbl['scaffold'].to_numpy()

# Experimental features stay None when there is no holdout to predict.
X_desc_exp = X_fp_exp = y_exp = None
if len(test_exp_tbl) > 0:
    X_desc_exp = calc_descriptor_frame(test_exp_tbl['mol'])
    X_fp_exp = np.vstack(list(map(morgan_fp_array, test_exp_tbl['mol'])))
    y_exp = test_exp_tbl['y'].to_numpy()


## 5) GroupKFold CV on train_lit + Q²(CV) from OOF predictions

In [None]:
def rmse(y_true, y_pred):
    """Return the root-mean-squared error as a plain Python float."""
    mse = mean_squared_error(y_true, y_pred)
    return float(np.sqrt(mse))

def q2_cv_from_oof(y_true, y_oof):
    """Compute Q²(CV) = 1 - PRESS/TSS from out-of-fold predictions.

    PRESS is the sum of squared differences between ``y_true`` and ``y_oof``;
    TSS is the sum of squared deviations of ``y_true`` from its own mean.

    Args:
        y_true: Observed targets (array-like, flattened to 1-D).
        y_oof: Out-of-fold predictions, paired with ``y_true`` by position.

    Returns:
        Q² as a float, or ``nan`` when the input is empty or ``y_true`` has
        zero variance.

    Raises:
        ValueError: If the two inputs do not hold the same number of values.
    """
    # Coerce to flat float arrays so lists work, pandas Series are paired by
    # position rather than by index label, and an (n, 1) prediction column
    # cannot silently broadcast against an (n,) target into an (n, n) matrix.
    y_true = np.asarray(y_true, dtype=float).ravel()
    y_oof = np.asarray(y_oof, dtype=float).ravel()
    if y_true.shape != y_oof.shape:
        raise ValueError(
            f'y_true and y_oof must have the same length, got {y_true.size} and {y_oof.size}'
        )
    if y_true.size == 0:
        return np.nan

    press = float(np.sum((y_true - y_oof) ** 2))
    tss = float(np.sum((y_true - y_true.mean()) ** 2))
    if tss == 0:
        return np.nan
    return 1.0 - (press / tss)

def evaluate_cv_groupkfold(model, X, y, groups, n_splits=5):
    """Cross-validate ``model`` with GroupKFold and score its out-of-fold predictions.

    The number of folds is capped at the number of distinct groups.

    Returns:
        Tuple ``(metrics, y_oof)``: a dict with keys ``'R2(CV)'``,
        ``'RMSE(CV)'``, ``'MAE(CV)'``, ``'Q2(CV)'`` and ``'n_splits'``, and
        the out-of-fold predictions.

    Raises:
        ValueError: If fewer than two distinct groups are available.
    """
    n_groups = pd.Series(groups).nunique()
    if n_groups < 2:
        raise ValueError('Need at least 2 unique scaffolds for GroupKFold.')

    folds = min(n_splits, n_groups)
    splitter = GroupKFold(n_splits=folds)
    oof_pred = cross_val_predict(model, X, y, cv=splitter, groups=groups, n_jobs=-1)

    metrics = {
        'R2(CV)': r2_score(y, oof_pred),
        'RMSE(CV)': rmse(y, oof_pred),
        'MAE(CV)': mean_absolute_error(y, oof_pred),
        'Q2(CV)': q2_cv_from_oof(y, oof_pred),
        'n_splits': folds,
    }
    return metrics, oof_pred


In [None]:
# Model registry: name -> unfitted pipeline (insertion order is the evaluation order).
models = {}
models['Descriptors+Ridge'] = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
    ('model', Ridge(alpha=1.0)),
])
models['MorganFP+RandomForest'] = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('model', RandomForestRegressor(
        n_estimators=500,
        min_samples_leaf=2,
        random_state=RANDOM_STATE,
        n_jobs=-1,
    )),
])

# Feature registry: name -> (train features, experimental features or None).
feature_sets = {}
feature_sets['Descriptors+Ridge'] = (X_desc_train, X_desc_exp)
feature_sets['MorganFP+RandomForest'] = (X_fp_train, X_fp_exp)

# Accumulators filled by the evaluation loop.
metrics_rows = []
exp_pred_rows = []


In [None]:
# Extra, optional reporting: hold out whole scaffolds inside train_lit.
# Skipped when the scaffold diversity is too low for a meaningful split.
def scaffold_external_split(tbl, test_size=0.2, random_state=42):
    """Return one scaffold-grouped (train, test) index split of ``tbl``.

    Returns ``(None, None)`` when ``tbl`` has fewer than three distinct
    values in its ``scaffold`` column.
    """
    scaffolds = tbl['scaffold'].to_numpy()
    if pd.Series(scaffolds).nunique() < 3:
        return None, None

    gss = GroupShuffleSplit(n_splits=1, test_size=test_size, random_state=random_state)
    # n_splits=1, so the first yielded pair is the only one.
    idx_train, idx_test = next(gss.split(tbl, groups=scaffolds))
    return idx_train, idx_test

# Use the notebook-wide seed so changing RANDOM_STATE also changes this split.
scaf_train_idx, scaf_test_idx = scaffold_external_split(train_tbl, random_state=RANDOM_STATE)


In [None]:
# Imported in this cell because it is the only place that needs it.
from sklearn.base import clone


def _take_rows(X, idx):
    """Select rows by integer position from a NumPy array or a pandas DataFrame."""
    return X[idx] if isinstance(X, np.ndarray) else X.iloc[idx]


def _holdout_metrics_row(dataset, model_name, y_true, y_pred):
    """Build one metrics-summary row for a held-out set (Q2 is CV-only, hence NaN)."""
    return {
        'dataset': dataset,
        'model': model_name,
        'R2': r2_score(y_true, y_pred),
        'RMSE': rmse(y_true, y_pred),
        'MAE': mean_absolute_error(y_true, y_pred),
        'Q2': np.nan,
        'n_samples': len(y_true),
    }


for model_name, model in models.items():
    X_train_all, X_exp = feature_sets[model_name]

    # 1) CV on train_lit only (GroupKFold by Murcko scaffold)
    cv_metrics, y_oof = evaluate_cv_groupkfold(
        model=model,
        X=X_train_all,
        y=y_train,
        groups=groups_train,
        n_splits=N_SPLITS_CV,
    )
    metrics_rows.append({
        'dataset': 'CV(train_lit)',
        'model': model_name,
        'R2': cv_metrics['R2(CV)'],
        'RMSE': cv_metrics['RMSE(CV)'],
        'MAE': cv_metrics['MAE(CV)'],
        'Q2': cv_metrics['Q2(CV)'],
        'n_samples': len(y_train),
    })

    # 2) Optional scaffold-external test inside train_lit
    if scaf_train_idx is not None and scaf_test_idx is not None:
        # Fit an unfitted copy: a plain assignment would alias the registry
        # entry, leaving it fitted on the scaffold subset only.
        model_scaf = clone(model)
        y_scaf_train, y_scaf_test = y_train[scaf_train_idx], y_train[scaf_test_idx]
        model_scaf.fit(_take_rows(X_train_all, scaf_train_idx), y_scaf_train)
        y_scaf_pred = model_scaf.predict(_take_rows(X_train_all, scaf_test_idx))
        metrics_rows.append(
            _holdout_metrics_row('external_test_scaffold', model_name, y_scaf_test, y_scaf_pred)
        )

    # 3) Train on all train_lit, predict external experimental test (rows 90+)
    model.fit(X_train_all, y_train)
    if X_exp is not None and len(test_exp_tbl) > 0:
        y_exp_pred = model.predict(X_exp)
        metrics_rows.append(
            _holdout_metrics_row('external_experimental_test', model_name, y_exp, y_exp_pred)
        )

        # Per-compound predictions are exported for the random-forest model only.
        if model_name == 'MorganFP+RandomForest':
            pred_table = test_exp_tbl[['standardized_smiles', 'y']].copy()
            pred_table['y_pred'] = y_exp_pred
            pred_table['residual'] = pred_table['y'] - pred_table['y_pred']
            pred_table = pred_table.rename(columns={'standardized_smiles': 'smiles', 'y': 'y_true'})
            exp_pred_rows.append(pred_table)


## 6) Save outputs

In [None]:
# Metrics table: one row per (dataset, model).
metrics_summary = pd.DataFrame(metrics_rows)
metrics_summary.to_csv(METRICS_SUMMARY_PATH, index=False)

# Experimental predictions; an empty table with the expected columns is
# written when no holdout predictions were collected.
predictions_experimental = (
    pd.concat(exp_pred_rows, ignore_index=True)
    if exp_pred_rows
    else pd.DataFrame(columns=['smiles', 'y_true', 'y_pred', 'residual'])
)
predictions_experimental.to_csv(PRED_EXP_PATH, index=False)

print(
    *(f'Saved: {path}' for path in (METRICS_SUMMARY_PATH, PRED_EXP_PATH, CURATION_REPORT_PATH)),
    sep='\n',
)
metrics_summary
