In [None]:
import sys
# Add the parent directory to the module search path (presumably so that
# project-local packages can be imported from this notebook — confirm layout).
sys.path.append('../')

# 1. USPTO 50k

In [None]:
import io
import zipfile
import requests
from typing import Optional, Dict, Iterable
import pandas as pd

# Maps a filename token (searched for in archive member names, or in the URL
# for single-file downloads) to the label written into the "split" column.
# Both "raw_val" and "raw_valid" map to "valid", so either spelling of the
# validation file is accepted.
DEFAULT_PATTERNS = {
    "raw_train": "train",
    "raw_val": "valid",
    "raw_valid": "valid",
    "raw_test": "test",
}

def _normalize_dropbox_url(url: str) -> str:
    """Force a Dropbox share link into direct-download form (``dl=1``).

    URLs that do not mention ``dropbox.com`` are returned unchanged. For
    Dropbox URLs the ``dl`` query parameter is set to ``1`` (added if absent).
    Only a parameter whose *name* is exactly ``dl`` is touched, so look-alike
    parameters such as ``xdl=0`` are preserved, and a ``#fragment`` stays at
    the end of the URL.

    Args:
        url: The URL to normalise.

    Returns:
        The URL with ``dl=1`` in its query string, or ``url`` itself when it
        is not a Dropbox URL.
    """
    from urllib.parse import urlsplit, urlunsplit

    if "dropbox.com" not in url:
        return url

    parts = urlsplit(url)
    # Work on the raw "name=value" pieces so every other parameter keeps its
    # exact original spelling and order (no re-encoding).
    params = [piece for piece in parts.query.split("&") if piece]
    has_dl = False
    for pos, piece in enumerate(params):
        if piece.partition("=")[0] == "dl":
            params[pos] = "dl=1"
            has_dl = True
    if not has_dl:
        params.append("dl=1")
    return urlunsplit(parts._replace(query="&".join(params)))

def _find_member_for_pattern(namelist: Iterable[str], token: str) -> Optional[str]:
    """Return the first archive member name that matches *token*.

    Matching is case-insensitive and done in two passes:

    1. a name ending in ``token``, ``token + ".csv"`` or ``token + ".txt"``;
    2. failing that, any name containing ``token``.

    Args:
        namelist: Member names to search (any iterable, including one-shot
            iterators).
        token: The filename token to look for.

    Returns:
        The matching name with its original casing, or ``None`` if nothing
        matches.
    """
    token_l = token.lower()
    suffixes = (token_l, token_l + ".csv", token_l + ".txt")

    # Materialise once: the search makes two passes, and a one-shot iterable
    # would already be exhausted when the substring pass starts.
    names = list(namelist)
    lowered = [nm.lower() for nm in names]

    for nm, low in zip(names, lowered):
        if low.endswith(suffixes):
            return nm
    for nm, low in zip(names, lowered):
        if token_l in low:
            return nm
    return None

def _infer_split_from_url(url: str, patterns: Dict[str, str], default: str = "train") -> str:
    """Return the label of the first pattern token contained in *url*, else *default*."""
    lower_url = url.lower()
    for token, split_label in patterns.items():
        if token.lower() in lower_url:
            return split_label
    return default


def _frame_from_lines(text: str) -> pd.DataFrame:
    """Build a one-column ("rxn") frame from the stripped, non-blank lines of *text*."""
    return pd.DataFrame({"rxn": [ln.strip() for ln in text.splitlines() if ln.strip()]})


def download_and_combine_raw_splits(
    url: str,
    patterns: Optional[Dict[str, str]] = None,
    *,
    encoding: str = "utf-8",
    treat_txt_as_lines: bool = True,
    save_csv: Optional[str] = None,
    timeout: int = 60,
) -> pd.DataFrame:
    """Download split files from *url* and stack them into one DataFrame.

    The payload is interpreted in this order:

    1. **ZIP archive** - for every ``token -> label`` entry in *patterns* the
       first matching member is loaded (``.csv`` members via ``pd.read_csv``,
       anything else as text) and tagged with ``split = label``. A member is
       loaded at most once, even when several tokens match it.
    2. **Plain CSV** - parsed with ``pd.read_csv``; if it has no ``split``
       column one is added, inferred from the URL (default ``"train"``).
    3. **Plain text** - if CSV parsing fails, every non-blank line becomes one
       ``rxn`` row, with the split inferred from the URL.

    Args:
        url: Location of the data; Dropbox share links are converted to
            direct-download form first.
        patterns: Mapping of filename token to split label. Defaults to
            ``DEFAULT_PATTERNS``.
        encoding: Text encoding used for every decoding step.
        treat_txt_as_lines: For non-CSV archive members, treat each line as
            one reaction (``True``) or parse the text as CSV (``False``).
        save_csv: Optional path; when given the combined frame is also
            written there (without the index).
        timeout: Timeout in seconds passed to ``requests.get``.

    Returns:
        The combined DataFrame, always containing a ``split`` column.

    Raises:
        requests.HTTPError: If the download returns an error status.
        RuntimeError: If the payload is a ZIP archive but no member matches
            any pattern token.
    """
    if patterns is None:
        patterns = DEFAULT_PATTERNS

    url = _normalize_dropbox_url(url)
    resp = requests.get(url, stream=True, timeout=timeout)
    resp.raise_for_status()
    raw = resp.content

    try:
        with zipfile.ZipFile(io.BytesIO(raw)) as zf:
            namelist = zf.namelist()
            dfs = []
            loaded_members = set()
            for token, split_label in patterns.items():
                member = _find_member_for_pattern(namelist, token)
                # Overlapping tokens (e.g. "raw_val" and "raw_valid") can both
                # resolve to the same file; load it only once so its rows are
                # not duplicated in the output.
                if not member or member in loaded_members:
                    continue
                loaded_members.add(member)
                with zf.open(member) as fh:
                    if member.lower().endswith(".csv"):
                        df = pd.read_csv(io.TextIOWrapper(fh, encoding=encoding))
                    else:
                        text = fh.read().decode(encoding, errors="replace")
                        if treat_txt_as_lines:
                            df = _frame_from_lines(text)
                        else:
                            df = pd.read_csv(io.StringIO(text))
                df["split"] = split_label
                dfs.append(df)
            if not dfs:
                raise RuntimeError(
                    f"No files matching {', '.join(patterns)} found inside ZIP."
                )
            combined = pd.concat(dfs, ignore_index=True, sort=False)
            if save_csv:
                combined.to_csv(save_csv, index=False)
            return combined
    except zipfile.BadZipFile:
        # Not a ZIP archive: fall through to the single-file handling below.
        pass

    # Best effort: any parsing failure means "not a CSV", so fall back to
    # line-per-reaction text. Only the parse itself is guarded, so that
    # errors while saving are no longer mistaken for a parse failure.
    try:
        df_all = pd.read_csv(io.BytesIO(raw), encoding=encoding)
    except Exception:
        df_all = _frame_from_lines(raw.decode(encoding, errors="replace"))

    if "split" not in df_all.columns:
        df_all["split"] = _infer_split_from_url(url, patterns)
    if save_csv:
        df_all.to_csv(save_csv, index=False)
    return df_all

In [None]:
import pandas as pd
from typing import Optional, Union

def curate_uspto_minimal(
    df_or_path: Union[pd.DataFrame, str],
    id_col: str = "id",
    rxn_col: str = "reactants>reagents>production",
    split_col: str = "split",
    zero_based_rid: bool = True,
    drop_original_rxn: bool = True,
    encoding: str = "utf-8",
) -> pd.DataFrame:
    """Reduce a raw USPTO table to the columns ``R-id``, ``aam``, ``split``, ``source``.

    Args:
        df_or_path: A DataFrame (copied, never modified) or a CSV path.
        id_col: Column holding the original record id; it becomes ``source``.
            When absent, ``source`` is the stringified original index.
        rxn_col: Column holding ``reactants>reagents>products`` strings.
        split_col: Column holding the split label; when absent the output
            ``split`` column is all ``None``.
        zero_based_rid: Number ``R-id`` values from 0 (``True``) or from 1.
        drop_original_rxn: Accepted for interface compatibility; not used by
            the current implementation (the output never contains *rxn_col*).
        encoding: Encoding used when *df_or_path* is a CSV path.

    Returns:
        A new DataFrame with a fresh 0..n-1 index. ``aam`` has the form
        ``reactants>>products`` (the reagent field is dropped) or ``None``
        when the cell is missing or both sides are empty.

    Raises:
        KeyError: If *rxn_col* is not present.
    """

    def _to_two_part_rxn(value):
        # Missing cells stay missing.
        if pd.isna(value):
            return None
        fields = [piece.strip() for piece in str(value).strip().split(">")]
        lhs = fields[0]
        if len(fields) >= 3:
            # Everything after the reagent field counts as product text.
            rhs = ">".join(piece for piece in fields[2:] if piece != "")
        elif len(fields) == 2:
            rhs = fields[1]
        else:
            rhs = ""
        if lhs == "" and rhs == "":
            return None
        return f"{lhs}>>{rhs}"

    if isinstance(df_or_path, str):
        frame = pd.read_csv(df_or_path, encoding=encoding)
    else:
        frame = df_or_path.copy()

    if id_col in frame.columns:
        frame = frame.rename(columns={id_col: "source"})
    else:
        frame["source"] = frame.index.astype(str)

    if rxn_col not in frame.columns:
        raise KeyError(f"Reaction column '{rxn_col}' not found.")

    offset = 0 if zero_based_rid else 1
    frame = frame.reset_index(drop=True)
    frame["R-id"] = [f"uspto_{pos + offset}" for pos in range(len(frame))]
    frame["aam"] = frame[rxn_col].apply(_to_two_part_rxn)

    if split_col in frame.columns:
        split_values = frame[split_col].astype(object)
    else:
        split_values = pd.Series([None] * len(frame), name="split")

    return pd.DataFrame({
        "R-id": frame["R-id"],
        "aam": frame["aam"],
        "split": split_values,
        "source": frame["source"],
    })

In [None]:
# Dropbox share link for the USPTO-50k data; the downloader converts `dl=0`
# into the direct-download form before fetching.
dropbox_url = "https://www.dropbox.com/scl/fo/df10x2546d7a0483tousa/AGhjiD7hSUY4AmQJd3DrUQE/USPTO_50K_data?dl=0&rlkey=n2s3kn34bnfkzkmii4jeb9woy&subfolder_nav_tracking=1"
# Fetch the payload and stack the raw split files into one frame with a
# "split" column; nothing is written to disk (save_csv=None).
df = download_and_combine_raw_splits(dropbox_url, save_csv=None)
# Sanity checks: overall size, rows per split, and a preview of the data.
print(df.shape)
print(df["split"].value_counts())
display(df.head())


In [None]:
import pandas as pd
import traceback
from typing import Tuple, Optional, Any, List, Dict
from joblib import Parallel, delayed


# Per-process cache of the standardiser / canonicaliser objects, filled lazily
# by `_create_worker_instances` so each worker builds them only once.
_WORKER_STD = None
_WORKER_CANON = None

def _create_worker_instances(std_factory=None, canon_factory=None):
    """Create or return cached Standardize/CanonRSMI instances inside a worker.

    Each object is built at most once per process and then reused. The two
    objects are handled independently: a supplied factory is used for its own
    object even when the other factory is missing, and an object that is
    already cached is never rebuilt.

    Args:
        std_factory: Zero-argument callable returning the standardiser. When
            ``None``, ``synkit``'s ``Standardize`` is imported and used.
        canon_factory: Zero-argument callable returning the canonicaliser.
            When ``None``, ``synkit``'s ``CanonRSMI`` is imported and used.

    Returns:
        Tuple ``(std, canon)`` of the cached instances.

    Raises:
        RuntimeError: If a default class is needed but cannot be imported.
    """
    global _WORKER_STD, _WORKER_CANON

    if _WORKER_STD is None:
        if std_factory is not None:
            _WORKER_STD = std_factory()
        else:
            # Lazy import: only pay for (and require) synkit when needed.
            try:
                from synkit.Chem.Reaction.standardize import Standardize
            except Exception as e:
                raise RuntimeError("Failed to import Standardize/CanonRSMI in worker: " + str(e)) from e
            _WORKER_STD = Standardize()

    if _WORKER_CANON is None:
        if canon_factory is not None:
            _WORKER_CANON = canon_factory()
        else:
            try:
                from synkit.Chem.Reaction.canon_rsmi import CanonRSMI
            except Exception as e:
                raise RuntimeError("Failed to import Standardize/CanonRSMI in worker: " + str(e)) from e
            _WORKER_CANON = CanonRSMI()

    return _WORKER_STD, _WORKER_CANON


def _canonicalise_value_worker_v2(idx, original_value, aam_col,
                                  std_factory, canon_factory):
    """
    Standardise and canonicalise one reaction string inside a worker process.

    Applies ``std.fit(value, remove_aam=False)`` (retrying as ``std.fit(value)``
    if the keyword is rejected with ``TypeError``) and then
    ``canon.canonicalise(...)``, preferring the ``canonical_rsmi`` attribute
    of the result.

    Args:
        idx: Opaque row identifier, echoed back unchanged in the result.
        original_value: The cell value to process.
        aam_col: Unused; kept so the call signature stays stable.
        std_factory: Optional factory for the standardiser (see
            ``_create_worker_instances``).
        canon_factory: Optional factory for the canonicaliser.

    Returns:
        Tuple ``(idx, canonical_string_or_None, error_or_None)``. This
        function never raises; every failure is reported in the third item.
    """
    try:
        std, canon = _create_worker_instances(std_factory=std_factory, canon_factory=canon_factory)
    except Exception as e:
        tb = traceback.format_exc(limit=6)
        return idx, None, f"worker init/import error: {e}\n{tb}"

    std_attrs = ("canonical_rsmi", "rsmi", "rxn", "reaction", "smiles")
    canon_attrs = ("canonical_rsmi", "canonical", "canonical_smiles", "rsmi", "r_smiles")

    def _result_to_text(obj: Any, attr_names: Tuple[str, ...]) -> Optional[str]:
        """Extract a non-empty string from a std/canon result, or None."""
        if obj is None:
            # A missing result is a failure; never turn it into the text "None".
            return None
        if isinstance(obj, str):
            return obj.strip() or None
        for attr in attr_names:
            val = getattr(obj, attr, None)
            if isinstance(val, str) and val.strip():
                return val.strip()
        # Last resort: str(obj) - but only when the type customises its text
        # form. The default "<... object at 0x...>" repr is not a reaction.
        cls = type(obj)
        if cls.__str__ is object.__str__ and cls.__repr__ is object.__repr__:
            return None
        return str(obj).strip() or None

    try:
        is_missing = (
            original_value is None
            or original_value is pd.NA
            or (isinstance(original_value, float) and pd.isna(original_value))
        )
        if is_missing:
            return idx, None, "empty_or_nan"
        s = str(original_value).strip()
        if not s:
            return idx, None, "empty_or_nan"

        # remove_aam=False keeps the atom-atom mapping through standardisation.
        try:
            try:
                fitted = std.fit(s, remove_aam=False)
            except TypeError:
                # NOTE(review): this retry assumes the TypeError came from an
                # API that lacks the keyword; a TypeError raised inside fit()
                # for another reason takes the same path - confirm acceptable.
                fitted = std.fit(s)
        except Exception as e:
            tb = traceback.format_exc(limit=6)
            return idx, None, f"std.fit error: {e}\n{tb}"

        cand = _result_to_text(fitted, std_attrs)
        if not cand:
            return idx, None, "empty_after_std"

        try:
            canon_out = canon.canonicalise(cand)
        except Exception as e:
            tb = traceback.format_exc(limit=6)
            return idx, None, f"canon.canonicalise error: {e}\n{tb}"

        canonical_string = _result_to_text(canon_out, canon_attrs)
        if not canonical_string:
            return idx, None, "empty_after_canonicalise"

        return idx, canonical_string, None

    except Exception as e:
        tb = traceback.format_exc(limit=6)
        return idx, None, f"unexpected error: {e}\n{tb}"


def fix_aam(
    df: pd.DataFrame,
    std=None,
    canon=None,
    aam_col: str = "aam",
    out_col: Optional[str] = None,
    overwrite: bool = True,
    show_progress: bool = False,
    stop_on_error: bool = False,
    n_jobs: int = 1,
    backend: str = "loky",
    std_factory=None,
    canon_factory=None,
    batch_size: int = 1,
) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """
    Canonicalise the reactions in ``df[aam_col]``.

    Every value goes through ``std.fit(value, remove_aam=False)`` (retried as
    ``std.fit(value)`` if the keyword is rejected with ``TypeError``) and then
    ``canon.canonicalise(...)``, preferring the result's ``canonical_rsmi``.
    Rows are processed by position, so any index (non-integer or with
    duplicate labels) is supported.

    Args:
        df: Input frame; it is copied, never modified.
        std: Standardiser used when ``n_jobs == 1`` (imported from synkit if
            ``None``). Ignored in the parallel path.
        canon: Canonicaliser used when ``n_jobs == 1`` (imported from synkit
            if ``None``). Ignored in the parallel path.
        aam_col: Column holding the reaction strings.
        out_col: Column receiving the result. Defaults to ``aam_col`` when
            ``overwrite`` is true, else ``f"{aam_col}_fixed"``.
        overwrite: When true and ``out_col`` differs from ``aam_col``, the
            result is moved into ``aam_col`` and ``out_col`` is dropped.
        show_progress: Accepted for interface compatibility; currently unused.
        stop_on_error: Raise ``RuntimeError`` on failure - at the first failing
            row in the serial path, after all rows in the parallel path.
        n_jobs: ``1`` runs serially in this process; any other value runs
            through ``joblib.Parallel``.
        backend: joblib backend for the parallel path.
        std_factory: Factory used by workers to build the standardiser.
        canon_factory: Factory used by workers to build the canonicaliser.
        batch_size: joblib ``batch_size`` for the parallel path.

    Returns:
        ``(out_df, errors_df)``. Failed rows hold ``None`` in the output
        column. ``errors_df`` always has the columns ``index``, ``R-id``,
        ``source``, ``original`` and ``error``, even when it is empty.

    Raises:
        KeyError: If ``aam_col`` is missing.
        RuntimeError: If synkit cannot be imported when needed, or on a row
            failure with ``stop_on_error=True``.
    """
    if out_col is None:
        out_col = aam_col if overwrite else f"{aam_col}_fixed"

    if aam_col not in df.columns:
        raise KeyError(f"Column '{aam_col}' not found in DataFrame")

    std_attrs = ("canonical_rsmi", "rsmi", "rxn", "reaction", "smiles")
    canon_attrs = ("canonical_rsmi", "canonical", "canonical_smiles", "rsmi", "r_smiles")
    error_columns = ["index", "R-id", "source", "original", "error"]

    def _result_to_text(obj, attr_names):
        """Extract a non-empty string from a std/canon result, or None."""
        if obj is None:
            # A missing result is a failure; never turn it into the text "None".
            return None
        if isinstance(obj, str):
            return obj.strip() or None
        for attr in attr_names:
            val = getattr(obj, attr, None)
            if isinstance(val, str) and val.strip():
                return val.strip()
        # Last resort: str(obj) - but only when the type customises its text
        # form. The default "<... object at 0x...>" repr is not a reaction.
        cls = type(obj)
        if cls.__str__ is object.__str__ and cls.__repr__ is object.__repr__:
            return None
        return str(obj).strip() or None

    def _canonicalise_one_serial(original_value):
        """Return ``(canonical_or_None, error_or_None)`` for one cell."""
        try:
            is_missing = (
                original_value is None
                or original_value is pd.NA
                or (isinstance(original_value, float) and pd.isna(original_value))
            )
            if is_missing:
                return None, "empty_or_nan"
            s = str(original_value).strip()
            if not s:
                return None, "empty_or_nan"

            # remove_aam=False keeps the atom-atom mapping.
            try:
                try:
                    fitted = std.fit(s, remove_aam=False)
                except TypeError:
                    fitted = std.fit(s)
            except Exception as e:
                tb = traceback.format_exc(limit=6)
                return None, f"std.fit error: {e}\n{tb}"

            cand = _result_to_text(fitted, std_attrs)
            if not cand:
                return None, "empty_after_std"

            try:
                canon_out = canon.canonicalise(cand)
            except Exception as e:
                tb = traceback.format_exc(limit=6)
                return None, f"canon.canonicalise error: {e}\n{tb}"

            canonical_string = _result_to_text(canon_out, canon_attrs)
            if not canonical_string:
                return None, "empty_after_canonicalise"
            return canonical_string, None
        except Exception as e:
            tb = traceback.format_exc(limit=6)
            return None, f"unexpected error: {e}\n{tb}"

    # Positional snapshots of everything needed per row; working by position
    # avoids label lookups that break on duplicate or non-integer indexes.
    index_labels = df.index.tolist()
    originals = df[aam_col].tolist()
    rids = df["R-id"].tolist() if "R-id" in df.columns else None
    sources = df["source"].tolist() if "source" in df.columns else None

    def _error_row(pos, err):
        return {
            "index": index_labels[pos],
            "R-id": rids[pos] if rids is not None else None,
            "source": sources[pos] if sources is not None else None,
            "original": originals[pos],
            "error": err,
        }

    errors: List[Dict] = []
    fixed_values: List[Optional[str]] = [None] * len(originals)

    if n_jobs == 1:
        # Serial path: use the supplied instances, importing only if needed.
        if std is None or canon is None:
            try:
                from synkit.Chem.Reaction.standardize import Standardize
                from synkit.Chem.Reaction.canon_rsmi import CanonRSMI
            except Exception as e:
                raise RuntimeError("Failed to import Standardize/CanonRSMI locally: " + str(e)) from e
            if std is None:
                std = Standardize()
            if canon is None:
                canon = CanonRSMI()

        for pos, original in enumerate(originals):
            fixed, err = _canonicalise_one_serial(original)
            fixed_values[pos] = fixed
            if err is not None:
                errors.append(_error_row(pos, err))
                if stop_on_error:
                    raise RuntimeError(f"Row {index_labels[pos]} failed: {err}; original={original}")
    else:
        # Parallel path: workers build their own std/canon objects. The row
        # position is sent as the identifier and echoed back by the worker.
        results = Parallel(n_jobs=n_jobs, backend=backend, batch_size=batch_size)(
            delayed(_canonicalise_value_worker_v2)(pos, original, aam_col, std_factory, canon_factory)
            for pos, original in enumerate(originals)
        )
        for pos, canonical_string, err in results:
            fixed_values[pos] = canonical_string
            if err is not None:
                errors.append(_error_row(pos, err))

    errors_df = pd.DataFrame(errors, columns=error_columns)

    if n_jobs != 1 and stop_on_error and not errors_df.empty:
        sample = errors_df.iloc[0].to_dict()
        raise RuntimeError(f"Errors occurred during parallel processing; sample error: {sample}")

    out_df = df.copy()
    # Assign positionally (plain object array) so no index alignment happens.
    out_df[out_col] = pd.Series(fixed_values, dtype=object).to_numpy()

    if overwrite and out_col != aam_col:
        out_df[aam_col] = out_df[out_col]
        out_df.drop(columns=[out_col], inplace=True)

    return out_df, errors_df


In [None]:
from synkit.Chem.Reaction.standardize import Standardize
from synkit.Chem.Reaction.canon_rsmi import CanonRSMI
std = Standardize()
canon = CanonRSMI()
# Reduce the raw frame to the R-id / aam / split / source columns.
df = curate_uspto_minimal(df)
# NOTE: with n_jobs != 1, fix_aam lets each worker build its own
# Standardize/CanonRSMI objects, so the `std`/`canon` instances passed here are
# not used on this call; `show_progress` is accepted but has no effect.
# `df.iloc[:,:]` selects every row and column.
df_fixed, errors_df = fix_aam(df.iloc[:,:], std=std, canon=canon, n_jobs=4, show_progress=True)


In [None]:
from synrxn.io.io import save_df_gz
# Persist the canonicalised USPTO-50k table (presumably as gzip-compressed
# CSV, judging by the helper name and target path — confirm in synrxn).
save_df_gz(df_fixed, '../Data/synthesis/uspto_50k.csv.gz')

# 2. USPTO MIT

In [None]:
import io
import zipfile
import requests
import re
from typing import Dict, Optional, List, Tuple
import pandas as pd

# Matches a whole reaction-centre token: one or more "N-N" pairs joined by ";".
RC_PATTERN = re.compile(r'^\d+-\d+(?:;\d+-\d+)*$')

def _parse_rc_string_to_tuples(rc: str, zero_index: bool = False) -> Optional[List[Tuple[int,int]]]:
    """Parse ``'15-19;6-15;6-8'`` into ``[(15, 19), (6, 15), (6, 8)]``.

    Pairs are separated by ``;`` and the two numbers of a pair by the first
    ``-``. With ``zero_index=True`` every number is decremented by one.

    Returns:
        The list of integer pairs, or ``None`` when *rc* is empty or any pair
        is malformed (no ``-``, or a side that is not an integer).
    """
    if not rc:
        return None

    shift = 1 if zero_index else 0
    pairs = []
    for chunk in rc.split(';'):
        left, dash, right = chunk.partition('-')
        if not dash:
            return None
        try:
            first, second = int(left), int(right)
        except ValueError:
            return None
        pairs.append((first - shift, second - shift))
    return pairs

def process_uspto_mt(
    url: str,
    files_map: Optional[Dict[str, str]] = None,
    *,
    encoding: str = "utf-8",
    strip_lines: bool = True,
    zero_index: bool = False,
    save_csv: Optional[str] = None,
    timeout: int = 30,
) -> pd.DataFrame:
    """
    Download the ZIP at `url`, read the split files and return a DataFrame with columns:
      - aam  : reaction string (line content without the trailing rc token)
      - split: the key of `files_map` the line came from ('train'|'valid'|'test' by default)
      - rc   : parsed list of (int,int) tuples (e.g. [(15,19),(6,15)]) or None

    Args:
      url: Location of the ZIP archive.
      files_map: Mapping of split label -> file name to look for in the archive.
        Defaults to train.txt / valid.txt / test.txt.
      encoding: Encoding used to decode each file (undecodable bytes are replaced).
      strip_lines: Strip surrounding whitespace from every line before parsing.
      zero_index: Set to True if you prefer rc indices to be zero-based.
      save_csv: Optional path; when given the result is also written there.
      timeout: Timeout in seconds passed to `requests.get`.

    Notes:
      - If the last whitespace-separated token of a line matches the RC pattern
        (N-N[;N-N...]), it is parsed into `rc` and removed from `aam`.
        Otherwise `rc` is None and `aam` is the whole line.
      - Blank lines are skipped.
      - A split whose file cannot be found is skipped with a printed warning.
      - Directory entries and macOS metadata entries (`__MACOSX/`, `._*`) are
        never selected by the fuzzy file-name matching.

    Raises:
      requests.HTTPError: If the download returns an error status.
      zipfile.BadZipFile: If the payload is not a ZIP archive.
      RuntimeError: If no rows were read for any requested split.
    """
    if files_map is None:
        files_map = {"train": "train.txt", "valid": "valid.txt", "test": "test.txt"}

    resp = requests.get(url, stream=True, timeout=timeout)
    resp.raise_for_status()
    zbytes = io.BytesIO(resp.content)

    def _is_data_member(name: str) -> bool:
        """True for real files: no directories, no macOS resource-fork entries."""
        if name.endswith("/") or name.startswith("__MACOSX/"):
            return False
        return not name.rsplit("/", 1)[-1].startswith("._")

    def _find_member(zf: zipfile.ZipFile, target: str) -> Optional[str]:
        """Resolve `target`: exact name, then name suffix, then substring."""
        all_names = zf.namelist()
        if target in all_names:
            return target
        low = target.lower()
        # Fuzzy matching only considers real data files; "._train.txt" would
        # otherwise satisfy the suffix test for "train.txt".
        candidates = [nm for nm in all_names if _is_data_member(nm)]
        for nm in candidates:
            if nm.lower().endswith(low):
                return nm
        for nm in candidates:
            if low in nm.lower():
                return nm
        return None

    rows = []
    with zipfile.ZipFile(zbytes) as zf:
        for split, desired_name in files_map.items():
            member = _find_member(zf, desired_name)
            if member is None:
                print(f"Warning: '{desired_name}' not found in archive; skipping split '{split}'.")
                continue
            with zf.open(member) as fh:
                text = fh.read().decode(encoding, errors="replace")

            for line in text.splitlines():
                if strip_lines:
                    line = line.strip()
                if not line:
                    continue

                # Split off the last whitespace-separated token; it is the
                # reaction-centre annotation only if it matches RC_PATTERN.
                parts = line.rsplit(None, 1)
                if len(parts) == 2 and RC_PATTERN.fullmatch(parts[1]):
                    rxn = parts[0]
                    rc_parsed = _parse_rc_string_to_tuples(parts[1], zero_index=zero_index)
                else:
                    rxn = line
                    rc_parsed = None

                rows.append({"aam": rxn, "split": split, "rc": rc_parsed})

    if not rows:
        raise RuntimeError("No data found in ZIP for any requested split files.")

    df = pd.DataFrame(rows)
    if save_csv:
        df.to_csv(save_csv, index=False)
    return df


In [None]:
# ZIP archive with the USPTO data from the nips17-rexgen repository.
url = "https://github.com/wengong-jin/nips17-rexgen/raw/refs/heads/master/USPTO/data.zip"
# Parse train/valid/test into one frame with the columns aam, split and rc.
df = process_uspto_mt(url)
# Sanity checks: overall size and a preview of the data.
print(df.shape)
display(df.head())

In [None]:
# Canonicalise the USPTO-MIT reactions. `std`/`canon` come from an earlier
# cell; with n_jobs != 1 fix_aam builds its own instances inside each worker,
# and `show_progress` is accepted but has no effect.
df_fixed, errors_df = fix_aam(df.iloc[:,:], std=std, canon=canon, n_jobs=6, show_progress=True)

In [None]:
# Persist the canonicalised USPTO-MIT table (`save_df_gz` is imported in an
# earlier cell).
save_df_gz(df_fixed, '../Data/synthesis/uspto_mit.csv.gz')

# 3. USPTO_500_MT

In [None]:
from typing import Dict, Optional
import io
import tarfile
import requests
import pandas as pd

def load_df_from_tar_url(
    url: str,
    sep: Optional[str] = None,
    names: Optional[list] = None,
    encoding: str = "utf-8",
    timeout: int = 60,
    max_in_memory_bytes: int = 300 * 1024 * 1024,
) -> Dict[str, pd.DataFrame]:
    """Stream a ``.tar.bz2`` archive from *url* and parse each file into a DataFrame.

    The archive is read as a forward-only stream, so it is never stored on
    disk or held in memory as a whole; members are handled one at a time.
    Loading is best effort: a member that cannot be read or parsed is
    skipped rather than aborting the whole download.

    Args:
        url: Location of the bzip2-compressed tar archive.
        sep: Field separator passed to ``pd.read_csv``; ``None`` requests
            delimiter sniffing with the python engine.
        names: Column names; when given the files are read without a header.
        encoding: Encoding used to decode each member (undecodable bytes are
            replaced).
        timeout: Timeout in seconds passed to ``requests.get``.
        max_in_memory_bytes: Members whose uncompressed size exceeds this many
            bytes are skipped with a warning instead of being loaded.

    Returns:
        Dict mapping each loaded member's path inside the archive to its
        DataFrame.

    Raises:
        requests.HTTPError: If the download returns an error status.
    """
    import warnings

    results: Dict[str, pd.DataFrame] = {}
    # The context manager guarantees the streamed connection is released even
    # if reading the archive fails part-way through.
    with requests.get(url, stream=True, timeout=timeout) as resp:
        resp.raise_for_status()
        # Have urllib3 undo any HTTP Content-Encoding so tarfile sees the
        # actual archive bytes.
        resp.raw.decode_content = True
        with tarfile.open(fileobj=resp.raw, mode="r|bz2") as tf:
            for member in tf:
                if not member.isreg():
                    continue
                member_name = member.name
                # Check the size from the tar header, before reading anything.
                if member.size > max_in_memory_bytes:
                    warnings.warn(
                        f"Skipping '{member_name}': {member.size} bytes exceeds "
                        f"max_in_memory_bytes={max_in_memory_bytes}."
                    )
                    continue
                fh = tf.extractfile(member)
                if fh is None:
                    continue
                try:
                    b = fh.read()
                except Exception:
                    continue
                text = b.decode(encoding, errors="replace")
                stream = io.StringIO(text)
                try:
                    if names is not None:
                        df = pd.read_csv(stream, header=None, names=names, sep=sep)
                    else:
                        if sep is None:
                            # NOTE(review): pandas documents `low_memory` as a
                            # C-engine option; if this call raises with the
                            # python engine, the header-less fallback below is
                            # what actually runs. Confirm before changing -
                            # callers that read column 0 may rely on it.
                            df = pd.read_csv(stream, sep=None, engine="python", low_memory=False)
                        else:
                            df = pd.read_csv(stream, sep=sep, low_memory=False)
                except Exception:
                    # Fallback: header-less, whitespace-separated read.
                    try:
                        stream.seek(0)
                        df = pd.read_csv(stream, header=None, names=names or ["col0"], sep=sep or r"\s+", engine="python")
                    except Exception:
                        continue
                results[member_name] = df
    return results


In [None]:
URL = "https://yzhang.hpc.nyu.edu/T5Chem/data/USPTO_500_MT.tar.bz2"

# Download the archive and parse every regular file inside it (no filtering
# by name happens here).
dfs = load_df_from_tar_url(URL)
# `dfs` maps each member path inside the archive to its parsed DataFrame.



In [48]:
from typing import Dict, Optional
import io
import tarfile
import requests
import pandas as pd
import numpy as np

def load_reagents_from_tar_url(
    url: str,
    sep: Optional[str] = None,
    names: Optional[list] = None,
    encoding: str = "utf-8",
    timeout: int = 60,
    max_in_memory_bytes: int = 300 * 1024 * 1024,
    member_prefix: str = "data/USPTO_500_MT/Reagents/",
) -> Dict[str, pd.DataFrame]:
    """Stream a ``.tar.bz2`` archive from *url* and load the files under *member_prefix*.

    The archive is read as a forward-only stream; only regular files whose
    path starts with *member_prefix* are read and parsed. Loading is best
    effort: a member that cannot be read or parsed is skipped.

    Args:
        url: Location of the bzip2-compressed tar archive.
        sep: Field separator passed to ``pd.read_csv``; ``None`` requests
            delimiter sniffing with the python engine.
        names: Column names; when given the files are read without a header.
        encoding: Encoding used to decode each member (undecodable bytes are
            replaced).
        timeout: Timeout in seconds passed to ``requests.get``.
        max_in_memory_bytes: Members whose uncompressed size exceeds this many
            bytes are skipped with a warning instead of being loaded.
        member_prefix: Path prefix selecting which archive members to load.
            Defaults to the reagent-prediction folder of USPTO_500_MT.

    Returns:
        Dict mapping each loaded member's path inside the archive to its
        DataFrame.

    Raises:
        requests.HTTPError: If the download returns an error status.
    """
    import warnings

    results: Dict[str, pd.DataFrame] = {}
    # The context manager guarantees the streamed connection is released even
    # if reading the archive fails part-way through.
    with requests.get(url, stream=True, timeout=timeout) as resp:
        resp.raise_for_status()
        # Have urllib3 undo any HTTP Content-Encoding so tarfile sees the
        # actual archive bytes.
        resp.raw.decode_content = True
        with tarfile.open(fileobj=resp.raw, mode="r|bz2") as tf:
            for member in tf:
                if not member.isreg():
                    continue
                member_name = member.name
                if not member_name.startswith(member_prefix):
                    continue
                # Check the size from the tar header, before reading anything.
                if member.size > max_in_memory_bytes:
                    warnings.warn(
                        f"Skipping '{member_name}': {member.size} bytes exceeds "
                        f"max_in_memory_bytes={max_in_memory_bytes}."
                    )
                    continue
                fh = tf.extractfile(member)
                if fh is None:
                    continue
                try:
                    b = fh.read()
                except Exception:
                    continue
                text = b.decode(encoding, errors="replace")
                stream = io.StringIO(text)
                try:
                    if names is not None:
                        df = pd.read_csv(stream, header=None, names=names, sep=sep)
                    else:
                        if sep is None:
                            # NOTE(review): pandas documents `low_memory` as a
                            # C-engine option; if this call raises with the
                            # python engine, the header-less fallback below is
                            # what actually runs. Confirm before changing -
                            # callers that read column 0 may rely on it.
                            df = pd.read_csv(stream, sep=None, engine="python", low_memory=False)
                        else:
                            df = pd.read_csv(stream, sep=sep, low_memory=False)
                except Exception:
                    # Fallback: header-less, whitespace-separated read.
                    try:
                        stream.seek(0)
                        df = pd.read_csv(stream, header=None, names=names or ["col0"], sep=sep or r"\s+", engine="python")
                    except Exception:
                        continue
                results[member_name] = df
    return results

def combine_reagents_dict(dfs: Dict[str, pd.DataFrame], prefix: str = "data/USPTO_500_MT/Reagents/") -> pd.DataFrame:
    """Pair up ``<split>.source`` / ``<split>.target`` frames into one table.

    Each key of *dfs* is reduced to a file name (by removing *prefix*, or by
    taking the last path component when the key does not start with it). The
    name is read as ``<split>.<role>...``; names without a dot are ignored.
    For every split, row *i* of the ``source`` frame is paired with row *i*
    of the ``target`` frame, using the first column of each (converted to
    string and stripped). When one side is missing or shorter, the gap is
    filled with NaN.

    Args:
        dfs: Mapping of archive member path to DataFrame.
        prefix: Path prefix removed from keys to obtain the file name.

    Returns:
        DataFrame with the columns ``rxn`` (from source), ``reagent`` (from
        target) and ``split``. It is empty, but still has these three
        columns, when nothing could be paired.
    """
    columns = ["rxn", "reagent", "split"]

    def _first_column(frame):
        """First column as stripped strings on a fresh 0..n-1 index, or None."""
        if frame is None:
            return None
        return frame.iloc[:, 0].astype(str).str.strip().reset_index(drop=True)

    by_split = {}
    for key, frame in dfs.items():
        name = key[len(prefix):] if key.startswith(prefix) else key.split("/")[-1]
        parts = name.split(".")
        if len(parts) < 2:
            continue
        split, role = parts[0], parts[1]
        by_split.setdefault(split, {})[role] = frame

    pieces = []
    for split, roles in by_split.items():
        src_s = _first_column(roles.get("source"))
        tgt_s = _first_column(roles.get("target"))
        n = max(
            len(src_s) if src_s is not None else 0,
            len(tgt_s) if tgt_s is not None else 0,
        )
        if n == 0:
            continue
        # Reindexing to the longer length pads the shorter side with NaN.
        positions = pd.RangeIndex(n)
        pieces.append(pd.DataFrame(
            {
                "rxn": src_s.reindex(positions) if src_s is not None else np.nan,
                "reagent": tgt_s.reindex(positions) if tgt_s is not None else np.nan,
                "split": split,
            },
            index=positions,
        ))

    if not pieces:
        # Keep the schema stable so callers can always select the columns.
        return pd.DataFrame(columns=columns)
    return pd.concat(pieces, ignore_index=True)[columns]


In [None]:
URL = "https://yzhang.hpc.nyu.edu/T5Chem/data/USPTO_500_MT.tar.bz2"
# Load only the archive members under the Reagents folder ...
data = load_reagents_from_tar_url(URL)
# ... and pair each split's source/target files into rxn / reagent / split rows.
results = combine_reagents_dict(data)

In [None]:
# Re-runs the combine step on the already downloaded `data` (same call as in
# the previous cell), which avoids repeating the download.
results = combine_reagents_dict(data)

In [None]:
# Number of combined rows per split.
results['split'].value_counts()