In [None]:
import sys
sys.path.append('../')
from synrxn.io.io import save_df_gz

In [None]:
from typing import Union, Iterable, Optional, List
import pandas as pd
from pathlib import Path

def curate_reactions(
    data: Union[str, Path, pd.DataFrame],
    *,
    data_name: str,
    rxn_col: str,
    target_cols: Union[str, Iterable[str]],
    split_col: Optional[Union[str, Iterable[str]]] = None,
    r_id_col: str = "R-id",
    index_base: int = 0,
    index_zero_pad: Optional[int] = None,
    keep_other_columns: bool = False,
    inplace: bool = False,
    out_csv: Optional[Union[str, Path]] = None,
    encoding: str = "utf-8",
    check_unique_rid: bool = True,
    verbose: bool = False,
) -> pd.DataFrame:
    """Reduce a reaction table to the schema ``[R-id, rxn, *targets, *splits]``.

    Parameters
    ----------
    data
        CSV path (read with ``pd.read_csv(..., encoding=encoding)``) or a DataFrame.
    data_name
        Prefix of the generated ids: ``f"{data_name}_{row_number}"``.
    rxn_col
        Column holding the reaction strings. It is renamed to ``"rxn"``; a
        pre-existing ``"rxn"`` column is first renamed to ``"rxn_orig"``.
    target_cols, split_col
        Column name(s) that must exist and are kept after ``rxn``.
    r_id_col
        Name of the generated id column (an existing column of that name is
        overwritten).
    index_base
        Non-negative offset added to the 0-based row position.
    index_zero_pad
        If given, the row number is left-padded with zeros to this width.
    keep_other_columns
        Append all remaining columns after the core columns instead of dropping them.
    inplace
        When ``data`` is a DataFrame, rewrite that object and return it.
    out_csv
        Optional path; the result is written there with ``index=False``.
    check_unique_rid
        Raise ``ValueError`` if the generated ids are not unique.
    verbose
        Print progress messages.

    Returns
    -------
    pd.DataFrame
        The curated frame (the original object when ``inplace`` applies).

    Raises
    ------
    TypeError
        ``data`` is neither a path nor a DataFrame.
    ValueError
        Invalid ``index_base`` / ``index_zero_pad``, or duplicate ids.
    KeyError
        A required column is missing.
    """
    # Validate scalar options before touching `data`, so a bad argument can
    # never leave the caller's frame half-modified when inplace=True.
    if not isinstance(index_base, int) or index_base < 0:
        raise ValueError("index_base must be a non-negative integer")
    if index_zero_pad is not None and (
        not isinstance(index_zero_pad, int) or index_zero_pad <= 0
    ):
        raise ValueError("index_zero_pad must be a positive integer or None")

    if isinstance(data, (str, Path)):
        df_in = pd.read_csv(str(data), encoding=encoding)
        orig_is_df = False
    elif isinstance(data, pd.DataFrame):
        df_in = data if inplace else data.copy(deep=True)
        orig_is_df = True
    else:
        raise TypeError("`data` must be a file path or a pandas DataFrame")

    target_list: List[str] = (
        [target_cols] if isinstance(target_cols, str) else list(target_cols)
    )
    if split_col is None:
        split_list: List[str] = []
    elif isinstance(split_col, str):
        split_list = [split_col]
    else:
        split_list = list(split_col)

    expected_cols = [rxn_col] + target_list + split_list
    missing = [c for c in expected_cols if c not in df_in.columns]
    if missing:
        raise KeyError(f"Missing expected column(s) in input data: {missing}")

    # Ids are derived from the row position, so normalise the index first.
    df_in.reset_index(drop=True, inplace=True)

    idx_vals = (df_in.index + index_base).astype(int).astype(str)
    if index_zero_pad is not None:
        idx_vals = idx_vals.str.zfill(index_zero_pad)
    df_in[r_id_col] = data_name + "_" + idx_vals

    if rxn_col != "rxn":
        if "rxn" in df_in.columns:
            df_in.rename(columns={"rxn": "rxn_orig"}, inplace=True)
            if verbose:
                print("Renamed existing 'rxn' column to 'rxn_orig' to avoid collision.")
        df_in = df_in.rename(columns={rxn_col: "rxn"})

    df_in["rxn"] = df_in["rxn"].astype(str).str.strip()

    # dict.fromkeys de-duplicates while preserving order; a repeated name would
    # otherwise yield duplicated output columns.
    keep_cols = list(dict.fromkeys([r_id_col, "rxn"] + target_list + split_list))

    if keep_other_columns:
        other_cols = [c for c in df_in.columns if c not in keep_cols]
        result = df_in.loc[:, keep_cols + other_cols]
    else:
        result = df_in.loc[:, [c for c in keep_cols if c in df_in.columns]]

    if check_unique_rid and result[r_id_col].duplicated().any():
        dupes = result[result[r_id_col].duplicated(keep=False)][r_id_col].unique().tolist()
        raise ValueError(f"Non-unique R-id values produced (sample): {dupes[:10]}")

    if out_csv:
        result.to_csv(str(out_csv), index=False, encoding=encoding)
        if verbose:
            print(f"Wrote curated DataFrame to {out_csv}")

    if inplace and orig_is_df:
        # Rebuild the caller's object column by column so its identity is kept.
        orig_df = data  # type: ignore[assignment]
        orig_df.drop(columns=list(orig_df.columns), inplace=True)
        for col in result.columns:
            orig_df[col] = result[col].values
        return orig_df

    return result


# 1. B97xd3
https://www.nature.com/articles/s41597-020-0460-4#Sec2

https://doi.org/10.5281/zenodo.3715478

In [None]:
# Keep the URL and the DataFrame under separate names.
# NOTE(review): the URL fetches `b97d3.csv` while the dataset is labelled
# "b97xd3" throughout this notebook — confirm the intended file.
B97XD3_URL = "https://zenodo.org/records/3715478/files/b97d3.csv?download=1"
b97xd3 = pd.read_csv(B97XD3_URL)
b97xd3.head()

In [None]:
def combine(r, p):
    """Join reactant and product strings into ``reactants>>products``."""
    return "{}>>{}".format(r, p)

# Zipping the two columns avoids building one Series per row, which is what
# DataFrame.apply(axis=1) does.
b97xd3['rxn'] = [combine(r, p) for r, p in zip(b97xd3['rsmi'], b97xd3['psmi'])]


In [None]:
# Reduce to the shared schema (R-id, rxn, ea, dh) and persist it.
b97xd3 = curate_reactions(
    b97xd3,
    data_name="b97xd3", rxn_col="rxn", target_cols=['ea', 'dh'], split_col=None,
    index_base=1,               # ids start at b97xd3_1
    keep_other_columns=False,   # keep only the id, reaction and target columns
)

save_df_gz(b97xd3, '../Data/property/b97xd3.csv.gz')

In [None]:
# Sanity check: (rows, columns) of the curated table.
print(b97xd3.shape)

# 2. SnAR
https://pubs.rsc.org/en/content/articlelanding/2021/sc/d0sc04896h

https://www.rsc.org/suppdata/d0/sc/d0sc04896h/d0sc04896h2.zip

In [None]:
import os
import io
import csv
import zipfile
import tempfile
from typing import Optional
import requests
import pandas as pd

def fetch_snar_df(
    url: str = "https://www.rsc.org/suppdata/d0/sc/d0sc04896h/d0sc04896h2.zip",
    target_basename: str = "SNAR_reaction_dataset_SI.csv",
    timeout: int = 30,
) -> pd.DataFrame:
    """Download a ZIP archive and load one CSV member as a DataFrame.

    The member is the first archive entry whose basename (then, failing that,
    whose full path) contains ``target_basename`` case-insensitively. The
    encoding is guessed with ``chardet`` when it is importable, followed by a
    fixed fallback list; the delimiter is sniffed from the first 200 000 bytes
    and defaults to ``","``.

    Raises
    ------
    FileNotFoundError
        No archive member matches ``target_basename``.
    RuntimeError
        Every parsing attempt failed.
    """
    fallback_encodings = ["cp1252", "latin1", "iso-8859-1", "utf-8", "utf-16"]
    wanted = target_basename.lower()

    with tempfile.TemporaryDirectory() as workdir:
        archive_path = os.path.join(workdir, "archive.zip")

        # Stream the download to disk in 8 KiB blocks.
        with requests.get(url, stream=True, timeout=timeout) as response:
            response.raise_for_status()
            with open(archive_path, "wb") as sink:
                for block in response.iter_content(chunk_size=8192):
                    if block:
                        sink.write(block)

        with zipfile.ZipFile(archive_path, "r") as archive:
            members = archive.namelist()
            member = next(
                (m for m in members if wanted in os.path.basename(m).lower()), None
            )
            if member is None:
                member = next((m for m in members if wanted in m.lower()), None)
            if member is None:
                sample = members[:40]
                raise FileNotFoundError(
                    f"Could not find a file matching {target_basename!r} inside the ZIP. Sample entries: {sample}"
                )

            payload = archive.read(member)
            head = payload[:200_000]

            # chardet is optional; any failure simply means "no guess".
            try:
                import chardet  # type: ignore
                guessed = chardet.detect(head).get("encoding")
            except Exception:
                guessed = None

            trial_encodings = ([guessed] if guessed else []) + [
                enc for enc in fallback_encodings if enc != guessed
            ]

            # The first encoding for which sniffing succeeds decides the delimiter.
            separator = ","
            for codec in trial_encodings:
                try:
                    separator = csv.Sniffer().sniff(
                        head.decode(codec, errors="replace")
                    ).delimiter
                except Exception:
                    continue
                break

            # For each encoding try the C parser first, then the Python parser.
            last_error: Optional[Exception] = None
            parser_options = ({"engine": "c", "low_memory": False}, {"engine": "python"})
            for codec in trial_encodings:
                for options in parser_options:
                    try:
                        return pd.read_csv(
                            io.BytesIO(payload), encoding=codec, delimiter=separator, **options
                        )
                    except Exception as exc:
                        last_error = exc

            # Last resort: lossy UTF-8 decode.
            decoded = payload.decode("utf-8", errors="replace")
            try:
                return pd.read_csv(io.StringIO(decoded), delimiter=separator)
            except Exception as final_e:
                raise RuntimeError(
                    f"Failed to parse CSV inside ZIP with multiple encodings and fallbacks. Last parsing error: {last_error!r}. Final attempt error: {final_e!r}"
                ) from final_e


In [None]:
snar = fetch_snar_df()
# Expose the activation free energy under the short target name used elsewhere.
snar = snar.assign(ea=snar['Activation Free Energy (kcalmol-1)'])
snar = curate_reactions(
    snar,
    data_name="snar", rxn_col="Reaction SMILES", target_cols='ea', split_col=None,
    index_base=1,               # ids start at snar_1
    keep_other_columns=False,   # keep only the id, reaction and target columns
)
save_df_gz(snar, '../Data/property/snar.csv.gz')
print(snar.shape)

# 3. E2SN2
https://doi.org/10.1088/2632-2153/aba822

In [None]:
# Keep the URL, the raw table and the curated table under separate names.
E2SN2_URL = 'https://raw.githubusercontent.com/hesther/reactiondatabase/refs/heads/main/data/e2sn2.csv'

e2sn2_raw = pd.read_csv(E2SN2_URL)
e2sn2 = curate_reactions(
    e2sn2_raw,
    data_name="e2sn2",
    rxn_col="AAM",
    target_cols='ea',
    split_col=None,
    index_base=1,               # ids start at e2sn2_1
    keep_other_columns=False,   # keep only the id, reaction and target columns
)
save_df_gz(e2sn2, '../Data/property/e2sn2.csv.gz')
print(e2sn2.shape)
display(e2sn2.head(1))

# 4. Rad6re



In [None]:
# Keep the URL, the raw table and the curated table under separate names.
RAD6RE_URL = 'https://github.com/hesther/reactiondatabase/raw/refs/heads/main/data/rad6re.csv'

rad6re_raw = pd.read_csv(RAD6RE_URL)
rad6re = curate_reactions(
    rad6re_raw,
    data_name="rad6re",
    rxn_col="AAM",
    target_cols='dh',
    split_col=None,
    index_base=1,               # ids start at rad6re_1
    keep_other_columns=False,   # keep only the id, reaction and target columns
)
save_df_gz(rad6re, '../Data/property/rad6re.csv.gz')
print(rad6re.shape)
display(rad6re.head(1))

# 5. lograte

https://doi.org/10.1021/acs.jpca.7b07361



In [None]:
# Keep the URL, the raw table and the curated table under separate names.
LOGRATE_URL = 'https://raw.githubusercontent.com/hesther/reactiondatabase/refs/heads/main/data/lograte.csv'

lograte_raw = pd.read_csv(LOGRATE_URL)

lograte = curate_reactions(
    lograte_raw,
    data_name="lograte",
    rxn_col="AAM",
    target_cols='lograte',
    split_col=None,
    index_base=1,               # ids start at lograte_1
    keep_other_columns=False,   # keep only the id, reaction and target columns
)
save_df_gz(lograte, '../Data/property/lograte.csv.gz')
print(lograte.shape)
display(lograte.head(1))

# 6. Phosphatase

https://doi.org/10.1073/pnas.1423570112


In [None]:
import numpy as np
import pandas as pd
from typing import Any

def add_array_column(df: pd.DataFrame, arr: Any, col_name: str) -> pd.DataFrame:
    """Return a copy of ``df`` with ``arr`` attached as the column ``col_name``.

    * 0-d input is broadcast to every row.
    * 1-d input, or 2-d input with a single column, becomes one scalar per row.
    * Anything else becomes one (nested) list per row.

    Values are converted with ``ndarray.tolist()`` so that cells hold native
    Python scalars rather than NumPy scalar objects, which keeps the text
    representation of list cells plain (``[0, 1]``) when the frame is written out.

    Raises
    ------
    ValueError
        The number of rows in ``arr`` differs from ``len(df)``.
    """
    arr_np = np.asarray(arr)
    if arr_np.ndim == 0:
        values = [arr_np.item()] * len(df)
    elif arr_np.ndim == 1 or (arr_np.ndim == 2 and arr_np.shape[1] == 1):
        values = arr_np.ravel().tolist()
    else:
        values = [row.tolist() for row in arr_np]
    if len(df) != len(values):
        raise ValueError(f"Length mismatch: df has {len(df)} rows but array has {len(values)} rows")
    new_df = df.copy()
    new_df[col_name] = values
    return new_df

# Keep the URLs and the loaded tables under separate names.
PHOSPHATASE_URL = 'https://github.com/hesther/reactiondatabase/raw/refs/heads/main/data/phosphatase.csv'
PHOSPHATASE_ONEHOT_URL = 'https://github.com/hesther/reactiondatabase/raw/refs/heads/main/data/phosphatase_onehotenzyme.csv'

phosphatase_raw = pd.read_csv(PHOSPHATASE_URL)
po = pd.read_csv(PHOSPHATASE_ONEHOT_URL)

# Each row of the one-hot table becomes one list-valued cell of 'onehot'.
phosphatase = add_array_column(phosphatase_raw, po.values, col_name='onehot')

In [None]:
# 'onehot' is passed through `split_col` so that it survives curation.
phosphatase = curate_reactions(
    phosphatase,
    data_name="phosphatase", rxn_col="AAM", target_cols='Conversion', split_col='onehot',
    index_base=1,               # ids start at phosphatase_1
    keep_other_columns=False,   # keep only the id, reaction, target and onehot columns
)

save_df_gz(phosphatase, '../Data/property/phosphatase.csv.gz')
print(phosphatase.shape)
display(phosphatase.head(1))

# Chemprop

In [None]:
from pathlib import Path
import requests
import tarfile
import os
import re
from typing import List, Dict, Set

def _normalize_token(s: str) -> str:
    return re.sub(r'[^0-9a-z]', '_', s.lower())

def stream_extract_selected_from_targz_safe(
    url: str,
    targets: List[str],
    dest_dir: str = "extracted",
    timeout: int = 1000,
    max_no_progress: int = 100000,
    min_matches_per_target: int = 1,
) -> Dict[str, List[Path]]:
    """
    Stream-download a .tar.gz and extract only members matching tokens in `targets`.

    A member matches a target when the normalized target (see `_normalize_token`)
    occurs in the member's normalized path or basename.

    Behaviour:
      - duplicate member names are skipped;
      - members that would land outside `dest_dir` are skipped;
      - only permission bits of the archived mode are applied to extracted files;
      - iteration stops early when every target has at least
        `min_matches_per_target` recorded items, or when `max_no_progress`
        consecutive members produce no new match.

    Note: directories and the marker files written for non-regular members are
    recorded as matches as well, so they count towards the early-stop threshold.

    Returns a dict mapping normalized target -> list of extracted Paths.
    """
    dest_dir_path = Path(dest_dir)
    dest_dir_path.mkdir(parents=True, exist_ok=True)
    # Resolved once: it is the reference point for every containment check below.
    dest_resolved = dest_dir_path.resolve()

    norm_targets = [_normalize_token(t) for t in targets]
    target_map = {nt: t for nt, t in zip(norm_targets, targets)}  # norm -> original

    extracted_by_target: Dict[str, List[Path]] = {nt: [] for nt in norm_targets}
    seen_members: Set[str] = set()
    no_progress = 0
    total_processed = 0

    with requests.get(url, stream=True, timeout=timeout) as r:
        r.raise_for_status()
        r.raw.decode_content = True
        with tarfile.open(fileobj=r.raw, mode="r|gz") as tar:
            print("Opened remote tar.gz in streaming mode; iterating members...")
            for member in tar:
                total_processed += 1
                if member is None or not getattr(member, "name", None):
                    continue
                name = member.name
                # Skip duplicates (prevents repeated extraction/printing).
                if name in seen_members:
                    no_progress += 1
                    if no_progress >= max_no_progress:
                        print(f"No progress for {max_no_progress} members; stopping early.")
                        break
                    continue
                seen_members.add(name)
                no_progress += 1  # reset below if this member matches

                name_norm = _normalize_token(name)
                basename_norm = _normalize_token(Path(name).name)

                matched_tokens = [tok for tok in norm_targets if (tok in name_norm or tok in basename_norm)]
                if not matched_tokens:
                    if total_processed % 1000 == 0:
                        print(f"Processed {total_processed} members so far; still searching...")
                    if no_progress >= max_no_progress:
                        print(f"No progress for {max_no_progress} members; stopping early.")
                        break
                    continue

                # At least one target matched — reset the no-progress counter.
                no_progress = 0

                # Path-traversal guard. A plain string-prefix test would accept a
                # sibling such as "<dest>_evil/..."; relative_to() compares whole
                # path components and raises ValueError for anything outside.
                target_path = dest_dir_path / name
                target_resolved = target_path.resolve(strict=False)
                try:
                    target_resolved.relative_to(dest_resolved)
                except ValueError:
                    print(f"Skipping unsafe member path: {name}")
                    continue
                target_path.parent.mkdir(parents=True, exist_ok=True)

                if member.isdir():
                    target_path.mkdir(parents=True, exist_ok=True)
                    for tok in matched_tokens:
                        extracted_by_target[tok].append(target_path.resolve())
                    print(f"Matched directory: {name}")
                elif member.isreg():
                    f = tar.extractfile(member)
                    if f is None:
                        print(f"Warning: could not extract file member {name}")
                        continue
                    with open(target_path, "wb") as outfh:
                        while True:
                            chunk = f.read(1024 * 64)
                            if not chunk:
                                break
                            outfh.write(chunk)
                    try:
                        # Apply permission bits only; setuid/setgid/sticky bits
                        # from a downloaded archive are dropped.
                        os.chmod(target_path, member.mode & 0o777)
                    except OSError:
                        # Best effort: the file content is already written.
                        pass
                    for tok in matched_tokens:
                        extracted_by_target[tok].append(target_path.resolve())
                    print(f"Extracted -> {target_path}")
                else:
                    # Symlinks / special types are not materialised; a small
                    # marker file is written instead.
                    print(f"Skipping non-regular member: {name}")
                    with open(target_path, "w", encoding="utf-8") as outfh:
                        outfh.write(f"# skipped non-regular archive member: {name}\n")
                    for tok in matched_tokens:
                        extracted_by_target[tok].append(target_path.resolve())

                # Early stop once every target has enough recorded items.
                if all(len(extracted_by_target[tok]) >= min_matches_per_target for tok in norm_targets):
                    print("All requested targets have at least", min_matches_per_target, "matches. Stopping early.")
                    break

    # Final reporting
    print("Streaming extraction finished. Processed members:", total_processed)
    for nt in norm_targets:
        print(f"Target '{target_map[nt]}' -> extracted {len(extracted_by_target[nt])} item(s).")
    return extracted_by_target


In [None]:
# Archive of the Chemprop benchmark data; only the folders listed in `targets`
# are pulled out of it.
url = "https://zenodo.org/records/10078142/files/data.tar.gz?download=1"
targets = [
    "barriers_e2",
    "barriers_sn2",
    "barriers_cycloadd",
    "barriers_rdb7",
    # "barriers_rgd1",  # not extracted here: download it directly and place it in the folder
]
result = stream_extract_selected_from_targz_safe(url, targets, dest_dir="chemprop_zenodo")


In [None]:
from pathlib import Path
from typing import Iterable, Optional, Tuple, Dict, List, Union
import io
import pandas as pd


def _try_read_csv(path: Path, encodings: Optional[List[str]] = None, **pd_kwargs) -> pd.DataFrame:
    """Read ``path`` as CSV, trying each encoding in turn.

    If every encoding fails, one last attempt is made on a UTF-8 decode with
    undecodable bytes replaced. ``pd_kwargs`` are forwarded to ``pd.read_csv``.

    Raises
    ------
    RuntimeError
        All attempts failed; the message carries the last two errors.
    """
    if encodings is None:
        encodings = ["utf-8", "cp1252", "latin1", "iso-8859-1", "utf-16"]
    raw = path.read_bytes()
    last_exc = None
    for enc in encodings:
        try:
            return pd.read_csv(io.BytesIO(raw), encoding=enc, **pd_kwargs)
        except Exception as e:
            last_exc = e
    # Final tolerant fallback.
    try:
        txt = raw.decode("utf-8", errors="replace")
        return pd.read_csv(io.StringIO(txt), **pd_kwargs)
    except Exception as final_e:
        raise RuntimeError(
            f"Failed to read CSV {path!s}. Last error: {last_exc!r}; final attempt: {final_e!r}"
        ) from final_e

def combine_barriers_split(
    base_dir: str = "./extracted_zenodo_stream_safe/data/barriers_e2",
    patterns_split: Optional[List[tuple]] = None,
    encodings: Optional[List[str]] = None,
    verbose: bool = True,
    save: bool = False,
    out_csv: str = "barriers_e2_combined.csv",
    return_splits: bool = False,
) -> Union[pd.DataFrame, Tuple[pd.DataFrame, Dict[str, pd.DataFrame]]]:
    """
    Combine the CSVs under ``base_dir`` into one DataFrame with a 'split' column.

    Every ``*.csv`` found recursively is labelled by the first
    ``(token, label)`` pair of ``patterns_split`` whose token occurs in the
    lower-cased file name, or failing that in one of the file's first three
    parent directory names. Files without a label are skipped. A
    ``_source_file`` column records where each row came from.

    The 'split' column is an ordered categorical: ``train < val < test``,
    followed by any additional labels supplied through ``patterns_split`` in
    order of first appearance.

    - save=False -> no file is written (default); save=True writes ``out_csv``.
    - return_splits=True -> returns ``(combined_df, {label: df, ...})``.

    Raises
    ------
    FileNotFoundError
        ``base_dir`` does not exist.
    RuntimeError
        No CSV could be loaded and labelled.
    """
    base = Path(base_dir)
    if patterns_split is None:
        patterns_split = [
            ("train", "train"),
            ("val", "val"),
            ("validation", "val"),
            ("test", "test"),
        ]
    if encodings is None:
        encodings = ["utf-8", "cp1252", "latin1", "iso-8859-1", "utf-16"]

    if not base.exists():
        raise FileNotFoundError(f"Base directory does not exist: {base.resolve()}")

    csv_files = sorted(list(base.rglob("*.csv")))
    if verbose:
        print(f"Found {len(csv_files)} CSV file(s) under {base}")

    rows = []
    files_used = 0
    split_frames: Dict[str, List[pd.DataFrame]] = {"train": [], "val": [], "test": []}

    for p in csv_files:
        name_l = p.name.lower()
        assigned_split = None
        for token, label in patterns_split:
            if token in name_l:
                assigned_split = label
                break

        # Fall back to up to 3 parent directory names.
        if assigned_split is None:
            parents_to_check = list(p.parents)[:3]
            for parent in parents_to_check:
                part_l = parent.name.lower()
                for token, label in patterns_split:
                    if token in part_l:
                        assigned_split = label
                        break
                if assigned_split is not None:
                    break

        if assigned_split is None:
            if verbose:
                print(f"Skipping (no split token found): {p}")
            continue

        if verbose:
            print(f"Loading {p}  -> split='{assigned_split}'")
        try:
            df = _try_read_csv(p, encodings=encodings)
        except Exception as e:
            print(f"Failed to read {p}: {type(e).__name__}: {e}")
            continue

        df = df.copy()
        df["split"] = assigned_split
        try:
            df["_source_file"] = str(p.relative_to(base.parent))
        except Exception:
            df["_source_file"] = str(p)

        rows.append(df)
        split_frames.setdefault(assigned_split, []).append(df)
        files_used += 1

    if files_used == 0:
        raise RuntimeError("No CSV files were loaded/labelled. Check filenames and patterns_split.")

    combined = pd.concat(rows, ignore_index=True, sort=False)
    # A fixed category list would turn custom labels into NaN, so extend the
    # standard order with whatever other labels were actually assigned.
    standard_labels = ["train", "val", "test"]
    extra_labels = [lab for lab in combined["split"].unique() if lab not in standard_labels]
    combined["split"] = pd.Categorical(
        combined["split"], categories=standard_labels + extra_labels, ordered=True
    )

    if save:
        combined.to_csv(out_csv, index=False, encoding="utf-8")
        if verbose:
            print(f"Saved combined CSV to: {Path(out_csv).resolve()}")

    if verbose:
        print(f"Combined {files_used} file(s) -> {combined.shape[0]} rows, {combined.shape[1]} cols")

    if return_splits:
        # Merge the per-split lists into DataFrames (empty frame for unused labels).
        per_split = {k: (pd.concat(v, ignore_index=True, sort=False) if v else pd.DataFrame()) for k, v in split_frames.items()}
        return combined, per_split

    return combined

# 7. E2

In [None]:
# Merge the extracted split CSVs, then reduce them to the shared schema.
e2 = curate_reactions(
    combine_barriers_split('./chemprop_zenodo/data/barriers_e2'),
    data_name="e2",
    rxn_col="AAM",
    target_cols="ea",
    split_col="split",
    index_base=1,               # ids start at e2_1
    keep_other_columns=False,   # keep only the id, reaction, target and split columns
)

save_df_gz(e2, '../Data/property/e2.csv.gz')
print(e2.shape)
display(e2.head(1))

# 8. SN2

In [None]:
# Merge the extracted split CSVs, then reduce them to the shared schema.
sn2 = curate_reactions(
    combine_barriers_split('./chemprop_zenodo/data/barriers_sn2'),
    data_name="sn2",
    rxn_col="AAM",
    target_cols="ea",
    split_col="split",
    index_base=1,               # ids start at sn2_1
    keep_other_columns=False,   # keep only the id, reaction, target and split columns
)
save_df_gz(sn2, '../Data/property/sn2.csv.gz')
print(sn2.shape)
display(sn2.head(1))

# 9. RDB7

In [None]:
# Merge the extracted split CSVs, then reduce them to the shared schema.
rdb7 = curate_reactions(
    combine_barriers_split('./chemprop_zenodo/data/barriers_rdb7'),
    data_name="rdb7",
    rxn_col="smiles",
    target_cols="ea",
    split_col="split",
    index_base=1,               # ids start at rdb7_1
    keep_other_columns=False,   # keep only the id, reaction, target and split columns
)
save_df_gz(rdb7, '../Data/property/rdb7.csv.gz')
print(rdb7.shape)
display(rdb7.head(1))

# 10. Cycloadd

In [None]:
# Merge the extracted split CSVs, then reduce them to the shared schema.
cycloadd = curate_reactions(
    combine_barriers_split('./chemprop_zenodo/data/barriers_cycloadd'),
    data_name="cycloadd",
    rxn_col="rxn_smiles",
    target_cols=["G_act","G_r"],
    split_col="split",
    index_base=1,               # ids start at cycloadd_1
    keep_other_columns=False,   # keep only the id, reaction, target and split columns
)
save_df_gz(cycloadd, '../Data/property/cycloadd.csv.gz')
print(cycloadd.shape)
display(cycloadd.head(1))

# 11. Rgd1

In [None]:
# NOTE: "barriers_rgd1" is commented out of `targets` in the extraction cell, so
# ./chemprop_zenodo/data/barriers_rgd1 has to be populated by hand before this
# cell can run.
rgd1 = curate_reactions(
    combine_barriers_split('./chemprop_zenodo/data/barriers_rgd1'),
    data_name="rgd1",
    rxn_col="smiles",
    target_cols="ea",
    split_col="split",
    index_base=1,               # ids start at rgd1_1
    keep_other_columns=False,   # keep only the id, reaction, target and split columns
)

save_df_gz(rgd1, '../Data/property/rgd1.csv.gz')
print(rgd1.shape)
display(rgd1.head(1))

# 12. Suzuki-Miyaura Yields

In [None]:
import io, requests, pandas as pd
from joblib import Parallel, delayed
from tqdm.auto import tqdm

_RAW_BASE = "https://raw.githubusercontent.com/reymond-group/drfp/main/data/Suzuki-Miyaura/random_splits"
_STD = None

def _get_std():
    """Return the shared synkit ``Standardize`` instance, creating it on first use."""
    global _STD
    if _STD is not None:
        return _STD
    # Imported lazily so that synkit is only required once standardization runs.
    from synkit.Chem.Reaction.standardize import Standardize
    _STD = Standardize()
    return _STD

def _standardize_one(rxns, remove_aam=False):
    """Standardize one reaction string, falling back to the input on failure.

    ``fit`` is first called with ``remove_aam``; on ``TypeError`` it is retried
    without it. A list/tuple result is joined with ``"|"``, any other non-None
    result is converted with ``str``. When ``fit`` returns None, the first
    non-empty string among a few known attributes of the standardizer is used.

    Raises
    ------
    RuntimeError
        The standardizer could not be obtained.
    """
    try:
        standardizer = _get_std()
    except Exception as e:
        raise RuntimeError("Failed to import synkit.Standardize. Ensure synkit is installed.") from e
    try:
        try:
            fitted = standardizer.fit(rxns, remove_aam=remove_aam)
        except TypeError:
            fitted = standardizer.fit(rxns)
        if fitted is not None:
            if isinstance(fitted, (list, tuple)):
                return "|".join(str(item) for item in fitted)
            return str(fitted)
        for attr in ("canonical_rsmi", "rsmi", "canonical", "standardized"):
            value = getattr(standardizer, attr, None)
            if isinstance(value, str) and value:
                return value
        return rxns
    except Exception:
        # Deliberately best-effort: an unparsable reaction is passed through.
        return rxns

def _parse_text_to_df(txt):
    if "\\t" in txt and "\t" not in txt:
        txt = txt.replace("\\t", "\t")
    try:
        df = pd.read_csv(io.StringIO(txt), sep="\t", engine="python", dtype=str)
    except Exception:
        df = pd.read_csv(io.StringIO(txt), sep=r"\s+", engine="python", header=None, dtype=str)
    if len(df.columns) == 1:
        col0 = df.columns[0]
        sample_vals = df[col0].astype(str).iloc[:10].tolist()
        if any("\t" in s for s in sample_vals):
            df = df[col0].str.split("\t", expand=True)
        elif any("\\t" in s for s in sample_vals):
            fixed = df[col0].str.replace("\\t", "\t", regex=False)
            df = fixed.str.split("\t", expand=True)
        else:
            df = df[col0].str.split(r"\s+", expand=True)
    df.columns = [str(c).strip().replace("\\", "") for c in df.columns]
    col_lower = [c.lower() for c in df.columns]
    if "rxn" in col_lower and "y" in col_lower:
        rename_map = {}
        for c in df.columns:
            if c.lower() == "rxn":
                rename_map[c] = "rxn"
            if c.lower() == "y":
                rename_map[c] = "y"
        df = df.rename(columns=rename_map)
    else:
        if df.shape[1] >= 2:
            first_is_index = df.iloc[:,0].astype(str).str.match(r'^\s*\d+\s*$').all()
            second_looks_like_rxn = df.shape[1] >= 2 and df.iloc[:,1].astype(str).str.contains(r'[\.\~\[\]\/\\\=\#]').any()
            if first_is_index and second_looks_like_rxn and df.shape[1] >= 3:
                cols = list(df.columns)
                new_cols = ["suzuki_index","rxn","y"] + [f"col_{i}" for i in range(3, len(cols))]
                df.columns = new_cols
            else:
                new_cols = ["rxn", "y"] + [f"col_{i}" for i in range(2, df.shape[1])]
                df.columns = new_cols
    return df

def load_and_standardize(n_splits=10, n_jobs=-1, remove_aam=False, progress=True):
    """Download the Suzuki-Miyaura random-split TSVs and standardize the reactions.

    Files ``random_split_0.tsv`` .. ``random_split_{n_splits-1}.tsv`` are fetched
    from ``_RAW_BASE``; a file that cannot be downloaded is reported and
    skipped. The parsed frames are concatenated, ``y`` is renamed to ``yield``
    and the originating file name is kept as ``source``.

    Parameters
    ----------
    n_splits : number of split files to request.
    n_jobs : forwarded to ``joblib.Parallel``.
    remove_aam : forwarded to ``_standardize_one``.
    progress : show a tqdm bar that advances as tasks are handed to joblib.

    Returns
    -------
    pd.DataFrame with the columns ``R-id``, ``rxn`` and, when available,
    ``yield`` and ``source``.

    Raises
    ------
    RuntimeError
        Nothing could be downloaded, or no reaction column could be found.
    """
    dfs = []
    for i in range(n_splits):
        name = f"random_split_{i}.tsv"
        url = f"{_RAW_BASE}/{name}"
        try:
            r = requests.get(url, timeout=30)
            r.raise_for_status()
            txt = r.text
        except Exception as e:
            print(f"Warning: failed to download {url} -> {e}")
            continue
        df = _parse_text_to_df(txt)
        df["split_file"] = name
        if "y" in df.columns:
            df["y"] = pd.to_numeric(df["y"], errors="coerce")
        dfs.append(df)
    if not dfs:
        raise RuntimeError("No dataframes downloaded/parsed successfully.")
    combined = pd.concat(dfs, ignore_index=True, sort=False)
    combined = combined.loc[:, ~combined.columns.str.match(r"^Unnamed")]
    if "y" in combined.columns:
        combined = combined.rename(columns={"y": "yield"})
    if "split_file" in combined.columns:
        combined = combined.rename(columns={"split_file": "source"})
    if "suzuki_index" in combined.columns:
        combined["suzuki_index"] = pd.to_numeric(combined["suzuki_index"], errors="coerce").astype("Int64")
    else:
        # No explicit index column: use the first all-digit column, else a
        # running row number.
        candidate_idx = None
        for c in combined.columns:
            if combined[c].astype(str).str.match(r'^\s*\d+\s*$').all():
                candidate_idx = c
                break
        if candidate_idx is not None and candidate_idx != "rxn":
            combined["suzuki_index"] = pd.to_numeric(combined[candidate_idx], errors="coerce").astype("Int64")
        else:
            combined["suzuki_index"] = pd.RangeIndex(start=0, stop=len(combined), step=1)
    if "rxn" not in combined.columns:
        rxn_col = None
        for c in combined.columns:
            if combined[c].astype(str).str.contains(r'[\.\~\[\]\/\\\=\#]').any():
                rxn_col = c
                break
        if rxn_col is None:
            raise RuntimeError("Parsed data does not contain an 'rxn' column.")
        combined = combined.rename(columns={rxn_col: "rxn"})
    combined["rxn"] = combined["rxn"].astype(str)
    rxn_list = combined["rxn"].tolist()
    # Wrap the iterable that joblib consumes, so the bar advances as tasks are
    # dispatched rather than finishing before any work has started.
    if progress:
        rxn_iter = tqdm(rxn_list, desc="Scheduling standardization", leave=True)
    else:
        rxn_iter = rxn_list
    results = Parallel(n_jobs=n_jobs, prefer="processes")(
        delayed(_standardize_one)(r, remove_aam) for r in rxn_iter
    )
    combined["rxn"] = results
    combined["R-id"] = combined["suzuki_index"].apply(lambda v: f"Suzuki-Miyaura-{int(v)}" if pd.notna(v) else "")
    final_cols = ["R-id", "rxn"]
    if "yield" in combined.columns:
        final_cols.append("yield")
    if "source" in combined.columns:
        final_cols.append("source")
    df_out = combined[final_cols].reset_index(drop=True)
    return df_out

In [None]:
# Download, standardize (4 worker processes) and persist the Suzuki-Miyaura set.
df = load_and_standardize(n_splits=10, n_jobs=4, remove_aam=False)
save_df_gz(df, '../Data/property/suzuki_miyaura.csv.gz')

# 13. Buchwald Hartwig

In [None]:
import requests
import io
import pandas as pd

url = "https://github.com/reymond-group/drfp/raw/refs/heads/main/data/Dreher_and_Doyle_input_data.xlsx"
r = requests.get(url, timeout=30)
r.raise_for_status()
buf = io.BytesIO(r.content)

# list sheet names
xls = pd.ExcelFile(buf, engine="openpyxl")
print("sheets:", xls.sheet_names)

# Parse from the already-open workbook instead of handing the consumed buffer
# to pd.read_excel a second time.
df = xls.parse(xls.sheet_names[0])
print(df.head())


# 14. USPTO

In [None]:
import re
import pandas as pd
from joblib import Parallel, delayed
from tqdm.auto import tqdm

_STD = None

def _get_std():
    """Return the shared synkit ``Standardize`` instance, creating it on first use.

    NOTE: this redefines the helper of the same name from the Suzuki-Miyaura
    section of this notebook.
    """
    global _STD
    if _STD is not None:
        return _STD
    # Imported lazily so that synkit is only required once standardization runs.
    from synkit.Chem.Reaction.standardize import Standardize
    _STD = Standardize()
    return _STD

def _standardize_one(rxns, remove_aam=False):
    """Standardize one reaction string, falling back to the input on failure.

    ``fit`` is first called with ``remove_aam``; on ``TypeError`` it is retried
    without it. A list/tuple result is joined with ``"|"``, any other non-None
    result is converted with ``str``. When ``fit`` returns None, the first
    non-empty string among a few known attributes of the standardizer is used.

    NOTE: this redefines the helper of the same name from the Suzuki-Miyaura
    section of this notebook.

    Raises
    ------
    RuntimeError
        The standardizer could not be obtained.
    """
    try:
        standardizer = _get_std()
    except Exception as e:
        raise RuntimeError("Failed to import synkit.Standardize. Ensure synkit is installed.") from e
    try:
        try:
            fitted = standardizer.fit(rxns, remove_aam=remove_aam)
        except TypeError:
            fitted = standardizer.fit(rxns)
        if fitted is not None:
            if isinstance(fitted, (list, tuple)):
                return "|".join(str(item) for item in fitted)
            return str(fitted)
        for attr in ("canonical_rsmi", "rsmi", "canonical", "standardized"):
            value = getattr(standardizer, attr, None)
            if isinstance(value, str) and value:
                return value
        return rxns
    except Exception:
        # Deliberately best-effort: an unparsable reaction is passed through.
        return rxns

def _find_rxn_col(df):
    if "rxn" in df.columns:
        return "rxn"
    candidates = []
    pattern = re.compile(r'[\.\~\[\]\/\\\=\#]')  # chars typical in reaction SMILES
    for c in df.columns:
        try:
            s = df[c].astype(str)
            if s.str.contains(pattern).any():
                candidates.append(c)
        except Exception:
            continue
    return candidates[0] if candidates else None

def standardize_df_rxn(df, rxn_col=None, n_jobs=-1, remove_aam=False, progress=True):
    """Standardize the reaction column of ``df`` in parallel.

    The column is located with ``_find_rxn_col`` when ``rxn_col`` is None. The
    standardized strings are written back into ``df[rxn_col]``: the frame is
    modified in place and also returned.

    Parameters
    ----------
    n_jobs : forwarded to ``joblib.Parallel``.
    remove_aam : forwarded to ``_standardize_one``.
    progress : show a tqdm bar that advances as tasks are handed to joblib.

    Raises
    ------
    RuntimeError
        No reaction column could be located.
    """
    if rxn_col is None:
        rxn_col = _find_rxn_col(df)
    if rxn_col is None:
        raise RuntimeError("Could not locate an 'rxn' column. Provide rxn_col explicitly.")
    rxn_series = df[rxn_col].astype(str).tolist()
    # Wrap the iterable that joblib consumes, so the bar advances as tasks are
    # dispatched rather than finishing before any work has started.
    if progress:
        rxn_for_jobs = tqdm(rxn_series, desc="Scheduling standardization", leave=True)
    else:
        rxn_for_jobs = rxn_series
    results = Parallel(n_jobs=n_jobs, prefer="processes")(
        delayed(_standardize_one)(r, remove_aam) for r in rxn_for_jobs
    )
    df[rxn_col] = results
    return df


In [None]:

# Tag each half with its origin before stacking them.
uspto_yields_above = pd.read_csv(
    'https://github.com/reymond-group/drfp/raw/refs/heads/main/data/uspto_yields_above.csv'
).assign(Type='above')

uspto_yields_below = pd.read_csv(
    'https://github.com/reymond-group/drfp/raw/refs/heads/main/data/uspto_yields_below.csv'
).assign(Type='below')

# reset_index() keeps the fresh 0..n-1 row number as an extra "index" column.
uspto_yields = pd.concat(
    [uspto_yields_above, uspto_yields_below], axis=0, ignore_index=True
).reset_index()


uspto_yields = standardize_df_rxn(uspto_yields, n_jobs=-1, remove_aam=False, progress=True)


In [None]:
# Persist the standardized USPTO yield table.
save_df_gz(uspto_yields, '../Data/property/uspto_yield.csv.gz')