In [1]:
# Environment information
import sklearn
import rdkit
import pandas
import numpy

# Print one "<library>: <version>" line per dependency, in a fixed order,
# so the environment used for the run is recorded alongside the results.
for label, module in (
    ("scikit-learn", sklearn),
    ("RDKit", rdkit),
    ("pandas", pandas),
    ("numpy", numpy),
):
    print(f"{label}:", module.__version__)


scikit-learn: 1.6.1
RDKit: 2025.03.2
pandas: 2.2.3
numpy: 2.2.6


In [None]:
# This notebook reproduces the results in the manuscript:
# "Rapid and General Machine Learning Modeling of Coupling Reactions..."
# Author: Shinya Shiomi
# Python 3.10
# RDKit 2025.03.2

# ==========================
# External validation: prediction, metrics, and scatter plot
# (Train + Test + External)
# ==========================
import pandas as pd
import matplotlib.pyplot as plt

from scipy.stats import pearsonr
from sklearn.metrics import mean_absolute_error, r2_score

# ------------------------------------------------------------
# Prerequisites:
# - A trained model pipeline named `pipe` (e.g., Pipeline(...))
# - Arrays/Series already prepared from the same split:
#     y_train, y_test
#   and predictions:
#     y_pred_train = pipe.predict(X_train)
#     y_pred_test  = pipe.predict(X_test)
# ------------------------------------------------------------

# ---- load external dataset (same fingerprint layout as training sheet) ----
# Positional column layout of the external CSV. What the columns before
# TARGET_COL contain is not visible here — presumably identifiers/metadata;
# confirm against the data sheet.
TARGET_COL = 3        # column holding the experimental value used as y
FEATURE_START = 4     # first feature column
FEATURE_STOP = 4100   # exclusive end of the slice -> at most 4096 features

external_path = "ML_external.csv"  # <-- replace if needed
external = pd.read_csv(external_path)

X_ext = external.iloc[:, FEATURE_START:FEATURE_STOP]
y_ext = external.iloc[:, TARGET_COL]

# An `iloc` slice does not raise when its bounds run past the last column, so
# a CSV with fewer columns than expected silently yields a narrower feature
# matrix. When the fitted model reports how many features it was trained on,
# compare against that and fail early with a message naming the file and slice.
n_expected = getattr(pipe, "n_features_in_", None)
if n_expected is not None and X_ext.shape[1] != n_expected:
    raise ValueError(
        f"{external_path!r}: columns [{FEATURE_START}:{FEATURE_STOP}] give "
        f"{X_ext.shape[1]} features, but `pipe` was fitted on {n_expected}."
    )

# ---- predict on external dataset ----
y_pred_ext = pipe.predict(X_ext)


# ---- metrics (external) ----
# Agreement between experimental (y_ext) and predicted (y_pred_ext) values:
# Pearson correlation with its p-value, coefficient of determination, and
# mean absolute error.
ext_r, ext_p = pearsonr(y_ext, y_pred_ext)
ext_r2 = r2_score(y_ext, y_pred_ext)
ext_mae = mean_absolute_error(y_ext, y_pred_ext)

# Assemble the report first, then emit it in a single print call.
report_lines = (
    "=== External validation dataset ===",
    f"External: MAE = {ext_mae:.2f}",
    f"External: R²  = {ext_r2:.2f}",
    f"External: Pearson r = {ext_r:.3f} (p = {ext_p:.2e})",
)
print("\n".join(report_lines))

# ---- plot settings (publication-style) ----
# Applied through the global rcParams, so these font sizes also affect any
# figure drawn afterwards in the same session.
publication_fonts = {
    "font.size": 20,        # global font size
    "axes.titlesize": 24,
    "axes.labelsize": 22,
    "xtick.labelsize": 18,
    "ytick.labelsize": 18,
    "legend.fontsize": 18,
}
plt.rcParams.update(publication_fonts)

# Shared by both axis limits and the y = x reference line.
yield_range = (0, 100)

fig, ax = plt.subplots(figsize=(6.8, 6.8))

# One row per data set: (predicted -> x, experimental -> y, colour, alpha,
# marker size, legend label). Plotted in this order: Train (black),
# Test (orange), External (blue).
scatter_specs = (
    (y_pred_train, y_train, "black", 0.30, 45, "Train"),
    (y_pred_test, y_test, "#F28E2B", 0.85, 55, "Test"),
    (y_pred_ext, y_ext, "#4E79A7", 0.90, 65, "External"),
)
for predicted, experimental, colour, opacity, size, legend_label in scatter_specs:
    ax.scatter(
        predicted, experimental,
        color=colour, alpha=opacity, s=size,
        label=legend_label, edgecolor="none"
    )

# y = x reference line
ax.plot(yield_range, yield_range, "k--", lw=2)

ax.set_xlabel("Predicted yield (%)")
ax.set_ylabel("Experimental yield (%)")
ax.set_title(
    "Train / Test / External Validation\n"
    f"External: MAE = {ext_mae:.2f}, R² = {ext_r2:.2f}, r = {ext_r:.2f}"
)

ax.legend(frameon=True)
ax.grid(True, linestyle="--", alpha=0.5)

ax.set_xlim(*yield_range)
ax.set_ylim(*yield_range)

fig.tight_layout()
plt.show()
