In [1]:
import sys
sys.path.append('../')
from synrxn.io.io import save_df_gz

# 1. USPTO 50k

In [None]:
from typing import Optional, Mapping, Any
import pandas as pd
import ast
import logging

logger = logging.getLogger(__name__)


def _extract_first_str(value: Any) -> Optional[str]:
    """Return the first non-empty string from a value that may be a string, list, tuple,
    or string-encoded list. Returns None for missing/NaN."""
    if value is None or (isinstance(value, float) and pd.isna(value)):
        return None
    if isinstance(value, (list, tuple)):
        for el in value:
            if el is None:
                continue
            s = str(el).strip()
            if s:
                return s
        return None
    if isinstance(value, str):
        s = value.strip()
        # try to parse python literal like "['a','b']"
        if (s.startswith("[") and s.endswith("]")) or (s.startswith("(") and s.endswith(")")):
            try:
                parsed = ast.literal_eval(s)
                if isinstance(parsed, (list, tuple)) and parsed:
                    return str(parsed[0]).strip() or None
            except Exception:
                pass
        return s or None
    # fallback
    s = str(value).strip()
    return s if s else None


def uspto_curate(
    df: pd.DataFrame,
    *,
    id_col: str = "uspto_index",
    id_prefix: str = "USPTO",
    class_col: str = "new_class",
    reactions_col: str = "reactions",
    split_col: str = "split",
    class_map: Optional[Mapping[int, str]] = None,
    default_split: str = "train",
) -> pd.DataFrame:
    """
    Reshape a USPTO-style table into the layout ["R-id", "rxn", "label", "split"].

    Parameters
    ----------
    df : pd.DataFrame
        Source table. Only `reactions_col` is mandatory.
    id_col : str
        Column supplying the identifier values; the dataframe index is used
        when this column does not exist.
    id_prefix : str
        Text placed before the identifier: f"{id_prefix}_{id_value}".
    class_col : str
        Column holding the class values that become `label`. When absent,
        `label` is filled with None.
    reactions_col : str
        Column holding the reactions; each cell is passed through
        `_extract_first_str` and stored as `rxn`.
    split_col : str
        Column holding split names, stored as strings in `split`.
    class_map : Mapping[int, str] or None
        Optional lookup applied to each class value (first by `int(value)`,
        then by the raw value; unmapped values pass through unchanged).
        Without a map, non-missing values are cast to int when possible.
    default_split : str
        Replacement for missing split entries, or for the whole column when
        `split_col` does not exist.

    Returns
    -------
    pd.DataFrame
        New dataframe with a fresh 0..n-1 index and columns
        ["R-id", "rxn", "label", "split"].

    Raises
    ------
    ValueError
        If `reactions_col` is not a column of `df`.
    """
    frame = df.copy()

    # The reaction column is the only hard requirement.
    if reactions_col not in frame.columns:
        raise ValueError(f"Input dataframe must contain a '{reactions_col}' column.")

    # Identifier tokens come from `id_col` when available, else from the index.
    id_source = frame[id_col] if id_col in frame.columns else frame.index
    r_ids = [f"{id_prefix}_{token}" for token in id_source.astype(str)]

    reactions = frame[reactions_col].apply(_extract_first_str)

    def _label_of(raw):
        # With a mapping: look up by int first, then by the raw value.
        if class_map is not None:
            try:
                return class_map.get(int(raw), class_map.get(raw, raw))
            except Exception:
                return class_map.get(raw, raw)
        # Without a mapping: missing -> None, castable -> int, otherwise untouched.
        try:
            return None if pd.isna(raw) else int(raw)
        except Exception:
            return raw

    if class_col in frame.columns:
        labels = frame[class_col].map(_label_of)
    else:
        labels = pd.Series([None] * len(frame), index=frame.index)

    if split_col in frame.columns:
        splits = frame[split_col].fillna(default_split).astype(str)
    else:
        splits = pd.Series([default_split] * len(frame), index=frame.index)

    ordered = ["R-id", "rxn", "label", "split"]
    curated = pd.DataFrame(
        dict(zip(ordered, (r_ids, reactions, labels, splits))),
        index=frame.index,
    )
    return curated[ordered].reset_index(drop=True)


In [None]:
# Raw USPTO-50k tables (gzip-compressed CSV) served from the SynCat repository.
uspto_b_url = "https://raw.githubusercontent.com/phuocchung123/SynCat/main/Data/raw/USPTO_50k_balanced.csv.gz"
uspto_u_url = "https://raw.githubusercontent.com/phuocchung123/SynCat/main/Data/raw/USPTO_50k_unbalanced.csv.gz"

# Load both raw tables (unbalanced first, then balanced) before curating.
uspto_u_raw = pd.read_csv(uspto_u_url, compression="gzip", low_memory=False)
uspto_b_raw = pd.read_csv(uspto_b_url, compression="gzip", low_memory=False)

# Curate with the default USPTO column names.
uspto_u = uspto_curate(uspto_u_raw)
uspto_b = uspto_curate(uspto_b_raw)

# Persist the curated tables next to the other classification datasets.
save_df_gz(uspto_u, '../Data/classification/uspto_50k_u.csv.gz')
save_df_gz(uspto_b, '../Data/classification/uspto_50k_b.csv.gz')

# 2. Schneider

In [None]:
from typing import Optional, Mapping, Any
import pandas as pd
import ast
import logging

logger = logging.getLogger(__name__)


def _extract_first_str(value: Any) -> Optional[str]:
    """
    Return the first non-empty string from a value that may be a string, list, tuple,
    or string-encoded list. Returns None for missing/NaN.
    """
    if value is None or (isinstance(value, float) and pd.isna(value)):
        return None
    if isinstance(value, (list, tuple)):
        for el in value:
            if el is None:
                continue
            s = str(el).strip()
            if s:
                return s
        return None
    if isinstance(value, str):
        s = value.strip()
        # try to parse python literal like "['a','b']"
        if (s.startswith("[") and s.endswith("]")) or (s.startswith("(") and s.endswith(")")):
            try:
                parsed = ast.literal_eval(s)
                if isinstance(parsed, (list, tuple)) and parsed:
                    return str(parsed[0]).strip() or None
            except Exception:
                pass
        return s or None
    # fallback
    s = str(value).strip()
    return s if s else None


def schneider_curate(
    df: pd.DataFrame,
    *,
    id_col: str = "schneider_index",
    id_prefix: str = "sch",
    rxn_col: str = "rxn",
    split_col: str = "split",
    y_col: str = "y",
    class_map: Optional[Mapping[int, str]] = None,
    default_split: str = "train",
) -> pd.DataFrame:
    """
    Reshape a Schneider-style table into the layout ["R-id", "rxn", "label", "split"].

    Parameters
    ----------
    df : pd.DataFrame
        Source table. Only `rxn_col` is mandatory.
    id_col : str
        Column supplying identifier values; the dataframe index is used when
        the column does not exist.
    id_prefix : str
        Text placed before the identifier: f"{id_prefix}_{id_value}".
    rxn_col : str
        Column holding reactions; each cell is passed through
        `_extract_first_str` and stored as `rxn`.
    split_col : str
        Column holding split names, stored as strings in `split`.
    y_col : str
        Column holding target values that become `label`. When absent,
        `label` is filled with None.
    class_map : Mapping[int, str] or None
        Optional lookup applied to each target (first by `int(value)`, then
        by the raw value; unmapped values pass through). Without a map,
        non-missing values are cast to int when possible.
    default_split : str
        Replacement for missing split entries, or for the whole column when
        `split_col` does not exist.

    Returns
    -------
    pd.DataFrame
        New dataframe with a fresh 0..n-1 index and columns
        ["R-id", "rxn", "label", "split"].

    Raises
    ------
    ValueError
        If `rxn_col` is not a column of `df`.
    """
    table = df.copy()
    columns = table.columns
    row_index = table.index

    # Identifiers: prefer the dedicated column, otherwise reuse the index.
    raw_ids = (table[id_col] if id_col in columns else row_index).astype(str)
    r_ids = [f"{id_prefix}_{raw_id}" for raw_id in raw_ids]

    # Reactions are mandatory.
    if rxn_col not in columns:
        raise ValueError(f"Input dataframe must contain a '{rxn_col}' column.")
    rxn_values = table[rxn_col].apply(_extract_first_str)

    # Labels: absent column -> None, else mapped or int-cast per element.
    if y_col not in columns:
        labels = pd.Series([None] * len(table), index=row_index)
    elif class_map is not None:

        def _translate(y):
            try:
                return class_map.get(int(y), class_map.get(y, y))
            except Exception:
                return class_map.get(y, y)

        labels = table[y_col].map(_translate)
    else:

        def _as_int(y):
            try:
                if pd.isna(y):
                    return None
                return int(y)
            except Exception:
                return y

        labels = table[y_col].map(_as_int)

    # Splits: fill gaps with the default and normalise to str.
    if split_col not in columns:
        splits = pd.Series([default_split] * len(table), index=row_index)
    else:
        splits = table[split_col].fillna(default_split).astype(str)

    curated = pd.DataFrame(
        {"R-id": r_ids, "rxn": rxn_values, "label": labels, "split": splits},
        index=row_index,
    )
    curated = curated[["R-id", "rxn", "label", "split"]]
    return curated.reset_index(drop=True)


In [None]:

# Raw Schneider-50k tables (gzip-compressed CSV) served from the SynCat repository.
schneider_b_url = "https://raw.githubusercontent.com/phuocchung123/SynCat/main/Data/raw/schneider50k_balanced.csv.gz"
schneider_u_url = "https://raw.githubusercontent.com/phuocchung123/SynCat/main/Data/raw/schneider50k_unbalanced.csv.gz"

# Load both raw tables (unbalanced first, then balanced) before curating.
schneider_u_raw = pd.read_csv(schneider_u_url, compression="gzip", low_memory=False)
schneider_b_raw = pd.read_csv(schneider_b_url, compression="gzip", low_memory=False)

# The balanced table is curated with rxn_col='reactions'; the unbalanced one
# uses the default 'rxn'.
schneider_u = schneider_curate(schneider_u_raw)
schneider_b = schneider_curate(schneider_b_raw, rxn_col='reactions')

# Persist the curated tables.
save_df_gz(schneider_u, '../Data/classification/schneider_u.csv.gz')
save_df_gz(schneider_b, '../Data/classification/schneider_b.csv.gz')

# 3. TPL

In [None]:
from typing import Optional, Mapping, Any
import pandas as pd
import ast
import logging

logger = logging.getLogger(__name__)


def _extract_first_str(value: Any) -> Optional[str]:
    """Return the first non-empty string from value which may be:
       - a string
       - a list/tuple of strings
       - a stringified python list "['a','b']"
       - None / NaN -> returns None
    """
    if value is None or (isinstance(value, float) and pd.isna(value)):
        return None
    if isinstance(value, (list, tuple)):
        for el in value:
            if el is None:
                continue
            s = str(el).strip()
            if s:
                return s
        return None
    if isinstance(value, str):
        s = value.strip()
        # try to parse stringified list/tuple like "['a','b']"
        if (s.startswith("[") and s.endswith("]")) or (s.startswith("(") and s.endswith(")")):
            try:
                parsed = ast.literal_eval(s)
                if isinstance(parsed, (list, tuple)) and parsed:
                    return str(parsed[0]).strip() or None
            except Exception:
                # fall back to raw string
                pass
        return s or None
    # fallback coercion
    s = str(value).strip()
    return s if s else None


def rxnclass_curate(
    df: pd.DataFrame,
    *,
    id_col: Optional[str] = None,
    id_prefix: str = "RXN",
    rxn_col: str = "rxn",
    class_col: str = "rxn_class",
    split_col: str = "split",
    class_map: Optional[Mapping[int, str]] = None,
    default_split: str = "train",
    zero_pad: Optional[int] = None,
    keep_orig_index: bool = False,
) -> pd.DataFrame:
    """
    Curate dataframe where `rxn_class` should be converted to `label`.

    Returns DataFrame with columns in this exact order:
      ["R-id", "rxn", "label", "split"]
    preceded by "orig_index" when `keep_orig_index` is True.

    Parameters
    ----------
    df : pd.DataFrame
        Input DataFrame; only `rxn_col` is mandatory.
    id_col :
        Optional column to use for id values. If None or not present, df.index is used.
    id_prefix :
        Prefix for R-id, final form is f"{id_prefix}_{id_value}" (id_value stringified).
    rxn_col :
        Column name holding reaction strings (list-like or string). Extracted to `rxn`
        through `_extract_first_str`.
    class_col :
        Column name holding class identifiers; will be converted to `label`.
        When absent, `label` is filled with None.
    split_col :
        Column name for dataset split (kept as `split`). Missing entries are filled with `default_split`.
    class_map :
        Optional mapping {int: str} to translate numeric class -> label (lookup by
        `int(value)` first, then by the raw value; unmapped values pass through).
        If None, attempt to cast to int else keep raw.
    default_split :
        String to fill `split` when missing.
    zero_pad :
        If provided (e.g. 6), zero-pad the numeric id portion: RXN_000123.
        Ids that `int()` cannot parse are left unchanged.
    keep_orig_index :
        If True, include a leading column "orig_index" with the original dataframe
        index to aid traceability. (Previously this flag was accepted but the
        column was dropped again before returning.)

    Returns
    -------
    pd.DataFrame
        Curated dataframe with a fresh 0..n-1 index and ordered columns
        ["R-id","rxn","label","split"] (plus leading "orig_index" on request).

    Raises
    ------
    ValueError
        If `rxn_col` is not a column of `df`.
    """
    df_in = df.copy()

    # Validate presence of rxn column
    if rxn_col not in df_in.columns:
        raise ValueError(f"Input dataframe must contain a '{rxn_col}' column.")

    # Build id values
    if id_col and id_col in df_in.columns:
        id_vals = df_in[id_col].astype(str)
    else:
        id_vals = df_in.index.astype(str)

    # Optionally zero-pad numeric ids
    def _maybe_pad(v: str) -> str:
        if zero_pad is None:
            return v
        try:
            return str(int(v)).zfill(zero_pad)
        except Exception:
            return v  # cannot pad non-numeric ids

    R_ids = [f"{id_prefix}_{_maybe_pad(v)}" for v in id_vals]

    # Extract rxn
    rxn_series = df_in[rxn_col].apply(_extract_first_str)

    # Convert rxn_class -> label
    if class_col in df_in.columns:
        raw_cls = df_in[class_col]

        if class_map is not None:
            def _map_fn(x):
                try:
                    return class_map.get(int(x), class_map.get(x, x))
                except Exception:
                    return class_map.get(x, x)
            label_series = raw_cls.map(_map_fn)
        else:
            # try cast to int, otherwise keep raw
            def _to_int_or_pass(x):
                try:
                    if pd.isna(x):
                        return None
                    return int(x)
                except Exception:
                    return x
            label_series = raw_cls.map(_to_int_or_pass)
    else:
        label_series = pd.Series([None] * len(df_in), index=df_in.index)

    # Prepare split
    if split_col in df_in.columns:
        split_series = df_in[split_col].fillna(default_split).astype(str)
    else:
        split_series = pd.Series([default_split] * len(df_in), index=df_in.index)

    # Compose final DataFrame
    out = pd.DataFrame(
        {
            "R-id": R_ids,
            "rxn": rxn_series,
            "label": label_series,
            "split": split_series,
        },
        index=df_in.index,
    )

    final_cols = ["R-id", "rxn", "label", "split"]
    if keep_orig_index:
        # Capture the index before reset_index(drop=True) discards it, and keep
        # the column in the final selection.
        out.insert(0, "orig_index", df_in.index)
        final_cols = ["orig_index"] + final_cols

    return out[final_cols].reset_index(drop=True)


In [41]:
# Raw USPTO-TPL tables (gzip-compressed CSV) served from the SynCat repository.
tpl_b_url = "https://raw.githubusercontent.com/phuocchung123/SynCat/main/Data/raw/USPTO_TPL_balanced.csv.gz"
tpl_u_url = "https://raw.githubusercontent.com/phuocchung123/SynCat/main/Data/raw/USPTO_TPL_unbalanced.csv.gz"

# Ids are built from the row index, zero-padded to 6 digits: tpl_000123.
tpl_u_raw = pd.read_csv(tpl_u_url, compression="gzip", low_memory=False)
tpl_u = rxnclass_curate(tpl_u_raw, id_prefix="tpl", zero_pad=6, class_map=None)

tpl_b_raw = pd.read_csv(tpl_b_url, compression="gzip", low_memory=False)
tpl_b = rxnclass_curate(tpl_b_raw, id_prefix="tpl", zero_pad=6, class_map=None)

# Save the curated TPL frames. The previous version wrote schneider_u /
# schneider_b here, so the tpl_*.csv.gz files contained Schneider data.
save_df_gz(tpl_u, '../Data/classification/tpl_u.csv.gz')
save_df_gz(tpl_b, '../Data/classification/tpl_b.csv.gz')

# 4. SynTemp

In [44]:
from typing import Optional, Any, List, Sequence
import pandas as pd
import ast


def _extract_first_str(value: Any) -> Optional[str]:
    """Return first non-empty string from value which may be a str, list/tuple,
    or string-encoded list. Return None for missing/NaN."""
    if value is None or (isinstance(value, float) and pd.isna(value)):
        return None
    if isinstance(value, (list, tuple)):
        for el in value:
            if el is None:
                continue
            s = str(el).strip()
            if s:
                return s
        return None
    if isinstance(value, str):
        s = value.strip()
        # try parse python literal list/tuple e.g. "['a','b']"
        if (s.startswith("[") and s.endswith("]")) or (s.startswith("(") and s.endswith(")")):
            try:
                parsed = ast.literal_eval(s)
                if isinstance(parsed, (list, tuple)) and parsed:
                    return str(parsed[0]).strip() or None
            except Exception:
                pass
        return s or None
    s = str(value).strip()
    return s if s else None


def _first_present_column(df: pd.DataFrame, candidates: Sequence[str]) -> Optional[str]:
    """Return the first candidate that exists in df.columns (case-sensitive)."""
    for c in candidates:
        if c in df.columns:
            return c
    return None


def _try_int(v: Any) -> Optional[int]:
    """Attempt to cast to int; return None if NaN or casting fails."""
    if v is None or (isinstance(v, float) and pd.isna(v)):
        return None
    try:
        return int(v)
    except Exception:
        try:
            # sometimes values are string numbers with spaces
            s = str(v).strip()
            return int(s)
        except Exception:
            return None


def syntemp_curate(
    df: pd.DataFrame,
    *,
    id_col_candidates: Optional[List[str]] = None,
    rsmicol_candidates: Optional[List[str]] = None,
    newr0_candidates: Optional[List[str]] = None,
    newr1_candidates: Optional[List[str]] = None,
    newr2_candidates: Optional[List[str]] = None,
    id_prefix: str = "SYN",
    zero_pad: Optional[int] = None,
    keep_orig_index: bool = False,
) -> pd.DataFrame:
    """
    Curate a 'syntemp' DataFrame into columns:
      ["R-id", "rxn", "label_0", "label_1", "label_2"]

    Parameters
    ----------
    df
        Input DataFrame. A reaction column is mandatory; id and label columns are optional.
    id_col_candidates
        Ordered candidate column names for the id (default: ['R_ID', 'R-ID', 'R-id']).
        When none is present, df.index is used.
    rsmicol_candidates
        Ordered candidate names for the reaction column
        (default: ['RSMI', 'RsmI', 'R_smI', 'rxn', 'reaction']). When none is
        present, any column whose name equals "rsmi" ignoring case is used.
    newr0_candidates, newr1_candidates, newr2_candidates
        Ordered candidate names for the three label columns
        (defaults: ['New_R<k>', 'NewR<k>', 'New_R_<k>', 'New<k>']). A label whose
        column is not found is filled with None.
    id_prefix
        Prefix for the produced R-id e.g. "SYN" -> "SYN_123".
    zero_pad
        If provided, zero-pad numeric id portion to this width (ids that `int()`
        cannot parse are left unchanged).
    keep_orig_index
        If True, include a leading "orig_index" column with the original dataframe
        index. (Previously this flag was accepted but the column was dropped again
        before returning.)

    Returns
    -------
    pd.DataFrame
        Curated DataFrame with a fresh 0..n-1 index and ordered columns
        ["R-id","rxn","label_0","label_1","label_2"] (preceded by "orig_index"
        when requested).

    Raises
    ------
    ValueError
        If no reaction column can be located.
    """
    df_in = df.copy()

    # default candidate lists (order = priority)
    if id_col_candidates is None:
        id_col_candidates = ["R_ID", "R-ID", "R-id"]
    if rsmicol_candidates is None:
        rsmicol_candidates = ["RSMI", "RsmI", "R_smI", "rxn", "reaction"]
    if newr0_candidates is None:
        newr0_candidates = ["New_R0", "NewR0", "New_R_0", "New0"]
    if newr1_candidates is None:
        newr1_candidates = ["New_R1", "NewR1", "New_R_1", "New1"]
    if newr2_candidates is None:
        newr2_candidates = ["New_R2", "NewR2", "New_R_2", "New2"]

    # pick columns that exist
    id_col = _first_present_column(df_in, id_col_candidates)
    rsmicol = _first_present_column(df_in, rsmicol_candidates)
    c_newr0 = _first_present_column(df_in, newr0_candidates)
    c_newr1 = _first_present_column(df_in, newr1_candidates)
    c_newr2 = _first_present_column(df_in, newr2_candidates)

    if rsmicol is None:
        # Case-insensitive fallback. str() guards against non-string column
        # labels, which have no .lower().
        rsmicol = next((c for c in df_in.columns if str(c).lower() == "rsmi"), None)
    if rsmicol is None:
        raise ValueError("Could not find reaction column (candidates: {}).".format(rsmicol_candidates))

    # prepare id values (use provided id col or index)
    if id_col is not None:
        id_vals = df_in[id_col].astype(str)
    else:
        id_vals = df_in.index.astype(str)

    # optional zero-pad numeric ids
    def _maybe_pad_val(v: str) -> str:
        if zero_pad is None:
            return v
        try:
            return str(int(v)).zfill(zero_pad)
        except Exception:
            return v

    R_ids = [f"{id_prefix}_{_maybe_pad_val(v)}" for v in id_vals]

    rxn_series = df_in[rsmicol].apply(_extract_first_str)

    def _get_label_series(col_name: Optional[str]) -> pd.Series:
        # Missing column -> all None.
        if col_name is None or col_name not in df_in.columns:
            return pd.Series([None] * len(df_in), index=df_in.index)
        # Take the first element of list-like / stringified-list cells, then
        # cast via _try_int; whatever _try_int returns is kept as the label.
        extracted = df_in[col_name].map(_extract_first_str)
        return extracted.map(_try_int)

    label_0 = _get_label_series(c_newr0)
    label_1 = _get_label_series(c_newr1)
    label_2 = _get_label_series(c_newr2)

    # build output
    out = pd.DataFrame(
        {
            "R-id": R_ids,
            "rxn": rxn_series,
            "label_0": label_0.values,
            "label_1": label_1.values,
            "label_2": label_2.values,
        },
        index=df_in.index,
    )

    # ensure exact ordering
    final_cols = ["R-id", "rxn", "label_0", "label_1", "label_2"]
    if keep_orig_index:
        # Capture the index before reset_index(drop=True) discards it, and keep
        # the column in the final selection.
        out.insert(0, "orig_index", df_in.index)
        final_cols = ["orig_index"] + final_cols

    return out[final_cols].reset_index(drop=True)


In [53]:
# Raw SynTemp cluster table (gzip-compressed CSV) served from the SynCat repository.
syntemp_url = "https://raw.githubusercontent.com/phuocchung123/SynCat/main/Data/raw/Syntemp_cluster.csv.gz"

syntemp_raw = pd.read_csv(syntemp_url, compression="gzip", low_memory=False)

# Curate with the 'syntemp' id prefix, requesting the original index, then persist.
syntemp = syntemp_curate(syntemp_raw, id_prefix='syntemp', keep_orig_index=True)
save_df_gz(syntemp, '../Data/classification/syntemp.csv.gz')


# 5. ECREACT

In [3]:
from typing import Optional, Sequence, Any
import pandas as pd
import ast


def _extract_first_str(value: Any) -> Optional[str]:
    """Return the first non-empty string from value which may be:
       - a plain string
       - a list/tuple of strings
       - a stringified python list "['a','b']"
       - None / NaN -> returns None
    """
    if value is None or (isinstance(value, float) and pd.isna(value)):
        return None
    if isinstance(value, (list, tuple)):
        for el in value:
            if el is None:
                continue
            s = str(el).strip()
            if s:
                return s
        return None
    if isinstance(value, str):
        s = value.strip()
        # try to parse python literal list/tuple e.g. "['a','b']"
        if (s.startswith("[") and s.endswith("]")) or (s.startswith("(") and s.endswith(")")):
            try:
                parsed = ast.literal_eval(s)
                if isinstance(parsed, (list, tuple)) and parsed:
                    return str(parsed[0]).strip() or None
            except Exception:
                pass
        return s or None
    s = str(value).strip()
    return s if s else None


def _first_present_column(df: pd.DataFrame, candidates: Sequence[str]) -> Optional[str]:
    """Return the first candidate column name that exists in df.columns."""
    for c in candidates:
        if c in df.columns:
            return c
    return None


def _try_int(v: Any) -> Optional[int]:
    """Try to cast v to int; return None for NaN or on failure."""
    if v is None or (isinstance(v, float) and pd.isna(v)):
        return None
    try:
        return int(v)
    except Exception:
        try:
            return int(str(v).strip())
        except Exception:
            return None


def claire_curate(
    df: pd.DataFrame,
    *,
    id_col_candidates: Optional[Sequence[str]] = None,
    rxn_col_candidates: Optional[Sequence[str]] = None,
    ec1_col_candidates: Optional[Sequence[str]] = None,
    ec2_col_candidates: Optional[Sequence[str]] = None,
    ec3_col_candidates: Optional[Sequence[str]] = None,
    id_prefix: str = "ecreact",
    zero_pad: Optional[int] = None,
    default_split: str = "train",
    keep_orig_index: bool = False,
) -> pd.DataFrame:
    """
    Curate Claire-style dataframe.

    Transformations performed:
      - Build R-id as f"{id_prefix}_{id_value}" where id_value comes from the first
        present id candidate column, or df.index if none is found.
      - reaction column -> rxn (each cell passed through `_extract_first_str`)
      - ec1/ec2/ec3 taken from the first present candidate column each, passed through
        `_extract_first_str` and then `_try_int`; a missing column yields None
      - keep `split` column as strings (filled with default_split when missing)

    Output columns (in this order):
      ["R-id", "rxn", "ec1", "ec2", "ec3", "split"]
    preceded by "orig_index" when `keep_orig_index` is True.

    Parameters
    ----------
    df : pd.DataFrame
        Input dataframe. A reaction column is mandatory; everything else is optional.
    id_col_candidates : sequence[str], optional
        Candidate column names to use as the id. Defaults to ['index', 'Index'],
        then falls back to df.index.
    rxn_col_candidates : sequence[str], optional
        Candidate names for reaction smiles column. Defaults to ['rxn_smiles', 'rxn', 'reaction'].
    ec1_col_candidates/ec2_col_candidates/ec3_col_candidates : sequence[str], optional
        Candidate names for the enzyme class columns. Defaults to
        ['ec<k>_encode', 'ec<k>encode', 'ec<k>'] in that priority order.
    id_prefix : str
        Prefix for generated R-id (default 'ecreact').
    zero_pad : int or None
        If provided and the id value is parseable by `int()`, zero-pad to this width.
    default_split : str
        Value used to fill missing split entries.
    keep_orig_index : bool
        If True, an 'orig_index' column with the original df.index is included at start.
        (Previously this flag was accepted but the column was dropped again before returning.)

    Returns
    -------
    pd.DataFrame
        Curated DataFrame with a fresh 0..n-1 index and columns
        ["R-id", "rxn", "ec1", "ec2", "ec3", "split"] (plus leading "orig_index" on request).

    Raises
    ------
    ValueError
        If none of `rxn_col_candidates` is a column of `df`.
    """
    df_in = df.copy()

    # sensible defaults (order = priority)
    if id_col_candidates is None:
        id_col_candidates = ["index", "Index"]
    if rxn_col_candidates is None:
        rxn_col_candidates = ["rxn_smiles", "rxn", "reaction"]
    if ec1_col_candidates is None:
        ec1_col_candidates = ["ec1_encode", "ec1encode", "ec1"]
    if ec2_col_candidates is None:
        ec2_col_candidates = ["ec2_encode", "ec2encode", "ec2"]
    if ec3_col_candidates is None:
        ec3_col_candidates = ["ec3_encode", "ec3encode", "ec3"]

    # pick id source
    id_col = _first_present_column(df_in, id_col_candidates)
    if id_col is not None:
        id_vals = df_in[id_col].astype(str)
    else:
        id_vals = df_in.index.astype(str)

    # optional zero-pad numeric ids
    def _maybe_pad(v: str) -> str:
        if zero_pad is None:
            return v
        try:
            return str(int(v)).zfill(zero_pad)
        except Exception:
            return v

    R_ids = [f"{id_prefix}_{_maybe_pad(v)}" for v in id_vals]

    # rxn extraction
    rxn_col = _first_present_column(df_in, rxn_col_candidates)
    if rxn_col is None:
        raise ValueError(f"Could not find a reaction column (candidates: {rxn_col_candidates}).")
    rxn_series = df_in[rxn_col].apply(_extract_first_str)

    # ec columns mapping: prefer ec?_encode then ec?
    ec1_col = _first_present_column(df_in, ec1_col_candidates)
    ec2_col = _first_present_column(df_in, ec2_col_candidates)
    ec3_col = _first_present_column(df_in, ec3_col_candidates)

    def _get_ec_series(col_name: Optional[str]) -> pd.Series:
        # Missing column -> all None.
        if col_name is None or col_name not in df_in.columns:
            return pd.Series([None] * len(df_in), index=df_in.index)
        # Take the first element of list-like / stringified-list cells, then
        # cast via _try_int; whatever _try_int returns is kept.
        extracted = df_in[col_name].map(_extract_first_str)
        return extracted.map(_try_int)

    ec1_series = _get_ec_series(ec1_col)
    ec2_series = _get_ec_series(ec2_col)
    ec3_series = _get_ec_series(ec3_col)

    # split: keep as-is but fill missing with default_split
    if "split" in df_in.columns:
        split_series = df_in["split"].fillna(default_split).astype(str)
    else:
        split_series = pd.Series([default_split] * len(df_in), index=df_in.index)

    # assemble output
    out = pd.DataFrame(
        {
            "R-id": R_ids,
            "rxn": rxn_series,
            "ec1": ec1_series.values,
            "ec2": ec2_series.values,
            "ec3": ec3_series.values,
            "split": split_series.values,
        },
        index=df_in.index,
    )

    # ensure ordering and reset index
    final_cols = ["R-id", "rxn", "ec1", "ec2", "ec3", "split"]
    if keep_orig_index:
        # Capture the index before reset_index(drop=True) discards it, and keep
        # the column in the final selection.
        out.insert(0, "orig_index", df_in.index)
        final_cols = ["orig_index"] + final_cols

    return out[final_cols].reset_index(drop=True)


In [4]:
# Raw Claire/ECREACT table (gzip-compressed CSV) served from the SynCat repository.
claire_url = "https://raw.githubusercontent.com/phuocchung123/SynCat/main/Data/raw/claire_full.csv.gz"

claire_raw = pd.read_csv(claire_url, compression="gzip", low_memory=False)

# Curate with the default settings and persist as the ECREACT dataset.
claire = claire_curate(claire_raw)
save_df_gz(claire, '../Data/classification/ecreact.csv.gz')