In [10]:
import pandas as pd
import numpy as np
import os

# Folder scanned for the ZINC "df_*.csv" inputs ("." = the notebook's working directory).
ZINC_DIR = "."  # ZINC_cat_norm (this folder)
# Folder holding the same-named CSVs that get merged with the ZINC ones.
PREPROCESSED_DIR = "../preprocessed_reactions_no_unspec_no_intra_unnorm"
# Folder the merged CSVs are written to.
# NOTE(review): this is the same path as ZINC_DIR and outputs reuse the input filenames,
# so writing the results replaces the ZINC inputs — confirm this is intended.
OUT_DIR = "."  # output folder (currently the working directory, not a subfolder)

# Columns to exclude from min-max normalization (identifiers, labels, categorical)
COLS_EXCLUDE = ["Reactant_SMILES", "Atom_nº", "Selectivity", "Reactive Atom", "DOI", "source"]

# Make sure the output folder exists before anything is written to it.
os.makedirs(OUT_DIR, exist_ok=True)


def minmax_normalize(df, cols_exclude):
    """Min-max scale the numeric columns of ``df`` to [0, 1].

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is not modified; a copy is scaled and returned.
    cols_exclude : iterable of str
        Column names that must be left untouched even if they are numeric
        (identifiers, labels, categorical codes, ...).

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` in which every non-excluded numeric column is mapped to
        ``(x - min) / (max - min)``. Columns without any spread (constant, or
        with no valid values at all) are set to 0.0. Missing values stay
        missing in every case.
    """
    out = df.copy()
    excluded = set(cols_exclude)
    # The set of numeric columns does not change while scaling, so compute it once
    # instead of re-running select_dtypes for every column.
    numeric_cols = out.select_dtypes(include=[np.number]).columns
    for col in numeric_cols:
        if col in excluded:
            continue
        mn, mx = out[col].min(), out[col].max()
        if mx > mn:
            out[col] = (out[col] - mn) / (mx - mn)
        else:
            # Constant (or all-missing) column: map observed values to 0.0 but keep
            # NaN where the value was missing, rather than fabricating zeros.
            out[col] = np.where(out[col].isna(), np.nan, 0.0)
    return out

In [11]:
# Pair the two folders by filename: only "df_*.csv" files present in both are processed
zinc_files = list(
    filter(
        lambda fname: fname.startswith("df_") and fname.endswith(".csv"),
        os.listdir(ZINC_DIR),
    )
)
preprocessed_files = set(os.listdir(PREPROCESSED_DIR))
common = list(filter(preprocessed_files.__contains__, zinc_files))
print(f"Matching dataframes: {sorted(common)}")

Matching dataframes: ['df_bde.csv', 'df_custom.csv', 'df_en1.csv', 'df_en1_ohe.csv', 'df_gas.csv', 'df_rdkVbur.csv', 'df_xtb.csv']


In [12]:
# For every CSV present in both folders: stack the two sources, drop duplicated
# (Reactant_SMILES, Atom_nº) rows, min-max normalize and write the result.
for name in sorted(common):
    df_zinc = pd.read_csv(os.path.join(ZINC_DIR, name), index_col=0, low_memory=False)
    df_pre = pd.read_csv(os.path.join(PREPROCESSED_DIR, name), index_col=0, low_memory=False)

    # Tag every row with the folder it came from
    df_zinc["source"] = "ZINC_cat_norm"
    df_pre["source"] = "preprocessed_reactions_no_unspec_no_intra_unnorm"

    # Union of columns: ZINC columns first, then the ones only the preprocessed file has
    all_cols = list(df_zinc.columns) + [c for c in df_pre.columns if c not in df_zinc.columns]
    # Let concat do the outer-join alignment itself. Padding the frames with pd.NA first
    # creates object-dtype columns, which can turn a numeric column that exists in only one
    # source into object dtype (and so exclude it from normalization), depending on the
    # pandas version. Native alignment fills the gaps with NaN and keeps the column numeric.
    df_merged = pd.concat([df_zinc, df_pre], axis=0, ignore_index=True, sort=False)[all_cols]

    # Remove duplicate (Reactant_SMILES, Atom_nº), keep first (ZINC rows come first)
    key_cols = [c for c in ["Reactant_SMILES", "Atom_nº"] if c in df_merged.columns]
    if key_cols:
        n_before = len(df_merged)
        df_merged = df_merged.drop_duplicates(subset=key_cols, keep="first")
        n_dropped = n_before - len(df_merged)
        if n_dropped:
            print(f"  Dropped {n_dropped} duplicate (SMILES, Atom_nº) rows")

    # Min-max normalize numeric columns to [0, 1]
    df_merged = minmax_normalize(df_merged, COLS_EXCLUDE)

    # NOTE(review): when OUT_DIR and ZINC_DIR resolve to the same folder, this write
    # replaces the ZINC input that was just read, so a re-run starts from merged data.
    out_path = os.path.join(OUT_DIR, name)
    df_merged.to_csv(out_path)
    print(f"{name}: -> {len(df_merged)} rows, {len(all_cols)} cols (deduped, min-max normalized)")

  Dropped 3823 duplicate (SMILES, Atom_nº) rows
df_bde.csv: -> 281411 rows, 12 cols (deduped, min-max normalized)
  Dropped 3823 duplicate (SMILES, Atom_nº) rows
df_custom.csv: -> 281267 rows, 11 cols (deduped, min-max normalized)
  Dropped 3823 duplicate (SMILES, Atom_nº) rows
df_en1.csv: -> 281417 rows, 14 cols (deduped, min-max normalized)
  Dropped 3823 duplicate (SMILES, Atom_nº) rows
df_en1_ohe.csv: -> 281417 rows, 55 cols (deduped, min-max normalized)
  Dropped 3823 duplicate (SMILES, Atom_nº) rows
df_gas.csv: -> 281402 rows, 10 cols (deduped, min-max normalized)
  Dropped 3823 duplicate (SMILES, Atom_nº) rows
df_rdkVbur.csv: -> 281283 rows, 14 cols (deduped, min-max normalized)
  Dropped 3823 duplicate (SMILES, Atom_nº) rows
df_xtb.csv: -> 281288 rows, 42 cols (deduped, min-max normalized)
