# Configuration-Based Transformation Workflow

This notebook demonstrates how to use the configuration system for end-to-end analyser alignment workflows.
In the previous notebooks we already used the config to look up file paths. Here we show that the API can also take the config directly, so that datasets are referred to by logical name rather than by hardcoded path.

**Why use configuration-based workflows?**
- **Reproducibility**: All paths and parameters centralized in config files
- **Portability**: Easy to switch between development/production environments
- **Maintainability**: Update paths once, apply everywhere
- **Collaboration**: Team members use same logical dataset names

In [None]:
import sys
import os
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
from glob import glob
from tqdm.auto import tqdm

# PDF-compatible fonts: fonttype 42 makes matplotlib embed TrueType fonts in
# PDF/PS output, so text in saved figures stays editable
matplotlib.rcParams["pdf.fonttype"] = 42
matplotlib.rcParams["ps.fonttype"] = 42

# Scientific plot style
# NOTE(review): scienceplots is not referenced by name anywhere below; it is
# presumably imported only so that the "science" and "nature" styles are
# registered before plt.style.use is called - keep this import above that call.
import scienceplots

plt.style.use(["science", "nature"])

# Colourblind-friendly palette
SEABORN_PALETTE = "colorblind"
seaborn_colors = sns.color_palette(SEABORN_PALETTE)

# Put the directory two levels above the current working directory first on
# sys.path so that the `sysmexcbctools` imports that follow can be resolved.
# This depends on the directory the notebook kernel was started in
# (presumably two levels below the repository root - confirm).
sys.path.insert(0, str(Path.cwd().parent.parent))

from sysmexcbctools.transfer.sysmexalign import (
    FlowTransformer,
    ImpedanceTransformer,
    XNSampleTransformer,
)
from sysmexcbctools.transfer.config import ConfigLoader, get_config_loader
from sysmexcbctools.transfer.utils import DataLoader, load_centile_samples

print("Imports successful!")

## 1. Load Configuration System

The configuration system uses `config/data_paths.yaml` to map logical dataset names to actual file paths. (As before, this notebook uses our own files; you will need to create your own config for your datasets.)

In [None]:
# Point the loader at the YAML file that maps logical dataset names to paths
config_path = "../../sysmexcbctools/transfer/config/data_paths.yaml"
config = get_config_loader(config_path)

print("Configuration loaded successfully!")
print(f"Config file: {config_path}")
print(f"\nAvailable datasets:")

# Report every configured dataset per category and flag paths missing on disk
datasets = config.list_datasets()
for category in ("raw", "processed"):
    print(f"\n{category.upper()}:")
    if category not in datasets:
        # Category header is still printed; there is simply nothing under it
        continue
    for dataset_name in datasets[category].keys():
        path = config.get_dataset_path(category, dataset_name)
        exists = os.path.exists(path)
        if exists:
            status = "✓"
        else:
            status = "✗"
        print(f"  {status} {dataset_name}: {path}")

## 2. Load Sample Numbers Using Config

Instead of hardcoding paths to sample number files, we use logical names from the config.

In [None]:
# Sample numbers are looked up by logical name through the same config file
print("Loading sample numbers from config...\n")

# Source: INTERVAL baseline analyser 36
source_samples = load_centile_samples("interval_baseline_36", config_path)
n_source_samples = len(source_samples)
print(f"Source samples (interval_baseline_36): {n_source_samples:,} samples")

# Target: INTERVAL baseline analyser 41
target_samples = load_centile_samples("interval_baseline_41", config_path)
n_target_samples = len(target_samples)
print(f"Target samples (interval_baseline_41): {n_target_samples:,} samples")

# Show one entry so the reader can see what a sample number looks like
first_sample = source_samples[0]
print(f"\nSample number format: {first_sample} (type: {type(first_sample)})")

## 3. Flow Cytometry Transformation Using Config

Let's transform RET (reticulocyte) flow cytometry data using the config-based approach.

In [None]:
# Logical dataset names (looked up under "raw" in the config) and the channel
source_dataset_name, target_dataset_name = "interval_36", "interval_41"
channel = "RET"

# Let the config translate the logical names into actual locations
source_path = config.get_dataset_path("raw", source_dataset_name)
target_path = config.get_dataset_path("raw", target_dataset_name)

# Echo the resolved setup, one line per print
summary_lines = [
    f"Source dataset: {source_dataset_name}",
    f"  Path: {source_path}",
    f"\nTarget dataset: {target_dataset_name}",
    f"  Path: {target_path}",
    f"\nChannel: {channel}",
]
for summary_line in summary_lines:
    print(summary_line)

In [None]:
# Build a FlowTransformer for the selected channel and fit source -> target
print(f"\nCreating FlowTransformer for {channel} channel...\n")

flow_settings = {
    "channel": channel,
    "n_components": 20,
    "max_samples": 50000,
    "save_fitted_data": True,  # Save data for visualization
}
transformer = FlowTransformer(**flow_settings)

# Collect this channel's CSV files under SCT/ in a stable (sorted) order
source_files = glob(f"{source_path}/SCT/{channel}_*.116.csv")
source_files.sort()
target_files = glob(f"{target_path}/SCT/{channel}_*.116.csv")
target_files.sort()

print(f"Source files: {len(source_files)}")
print(f"Target files: {len(target_files)}")

# Fit, passing the sample numbers loaded earlier for each side
print(f"\nFitting transformer...")
fit_inputs = dict(
    source_files=source_files,
    target_files=target_files,
    source_sample_nos=source_samples,
    target_sample_nos=target_samples,
)
transformer.fit(**fit_inputs)

print("\nTransformer fitted successfully!")

In [None]:
# Persist the fitted transformer so it can be loaded later without refitting
output_dir = "../outputs/transformers"
os.makedirs(output_dir, exist_ok=True)  # no error if the directory is already there

# The file name records the channel and both logical dataset names
transformer_path = f"{output_dir}/flow_{channel}_{source_dataset_name}_to_{target_dataset_name}.pkl"
transformer.save(transformer_path)

print(f"Transformer saved to: {transformer_path}")

## 4. Batch Processing Multiple Flow Cytometry Channels

Process all flow cytometry channels (RET, WDF, WNR, PLTF) in a single workflow.

In [None]:
# Candidate flow cytometry channels
channels = ["RET", "WDF", "WNR", "PLTF"]

# Keep only the channels that have files on BOTH the source and target side
available_channels = []
for ch in channels:
    source_ch_files = glob(f"{source_path}/SCT/{ch}_*.116.csv")
    target_ch_files = glob(f"{target_path}/SCT/{ch}_*.116.csv")

    if not source_ch_files or not target_ch_files:
        print(f"✗ {ch}: No data available")
        continue

    available_channels.append(ch)
    n_source_ch, n_target_ch = len(source_ch_files), len(target_ch_files)
    print(f"✓ {ch}: {n_source_ch} source files, {n_target_ch} target files")

print(f"\nWill process {len(available_channels)} channels: {available_channels}")

In [None]:
# Batch fit transformers for all channels
transformers = {}

# The loop uses its own names (batch_channel, batch_source_files,
# batch_target_files) so that it does not overwrite the notebook-level
# `channel`, `source_files` and `target_files` set in the single-channel
# section. Otherwise, re-running those earlier cells after this one would
# silently operate on the last channel processed here.
# NOTE(review): a channel already fitted in the single-channel section is fitted
# again here with save_fitted_data=False and saved under the same file name,
# which replaces the file written there - confirm that this is intended.
for batch_channel in tqdm(available_channels, desc="Fitting transformers"):
    print(f"\n{'='*60}")
    print(f"Processing channel: {batch_channel}")
    print("=" * 60)

    # Create transformer
    trans = FlowTransformer(
        channel=batch_channel,
        n_components=20,
        max_samples=50000,
        save_fitted_data=False,  # Don't save data for batch processing
    )

    # Get file lists (sorted so the order is stable between runs)
    batch_source_files = sorted(glob(f"{source_path}/SCT/{batch_channel}_*.116.csv"))
    batch_target_files = sorted(glob(f"{target_path}/SCT/{batch_channel}_*.116.csv"))

    # Fit
    trans.fit(
        source_files=batch_source_files,
        target_files=batch_target_files,
        source_sample_nos=source_samples,
        target_sample_nos=target_samples,
    )

    # Save under the same naming scheme as the single-channel section
    save_path = f"{output_dir}/flow_{batch_channel}_{source_dataset_name}_to_{target_dataset_name}.pkl"
    trans.save(save_path)

    # Keep the fitted object in memory, keyed by channel name
    transformers[batch_channel] = trans
    print(f"✓ {batch_channel} transformer saved")

print(f"\n{'='*60}")
print(f"Batch processing complete! Fitted {len(transformers)} transformers.")
print("=" * 60)

## 5. Impedance Transformation Using Config

Now let's transform impedance data (RBC and PLT histograms).

In [None]:
# Impedance data is read from OutputData.csv inside each dataset directory
print("Loading impedance data (OutputData.csv)...\n")

source_impedance_path = f"{source_path}/OutputData.csv"
target_impedance_path = f"{target_path}/OutputData.csv"

print(f"Source: {source_impedance_path}")
print(f"Target: {target_impedance_path}")

# Both files are required; later cells check the frames against None
impedance_files_present = os.path.exists(source_impedance_path) and os.path.exists(
    target_impedance_path
)
if not impedance_files_present:
    print("\n✗ Impedance files not found - skipping impedance transformation")
    source_impedance_df = None
    target_impedance_df = None
else:
    print("\n✓ Both impedance files found")

    # low_memory=False reads each file in one pass rather than in chunks
    source_impedance_df = pd.read_csv(source_impedance_path, low_memory=False)
    target_impedance_df = pd.read_csv(target_impedance_path, low_memory=False)

    print(f"Source shape: {source_impedance_df.shape}")
    print(f"Target shape: {target_impedance_df.shape}")

In [None]:
# Fit ImpedanceTransformer (only when the impedance files were loaded above)
if source_impedance_df is None:
    print("\nSkipping impedance transformation (no data)")
else:
    print("\nCreating ImpedanceTransformer...\n")

    # Only gmm_sample_size is set; n_components and max_samples are not passed
    impedance_transformer = ImpedanceTransformer(gmm_sample_size=50000)

    # Fit source -> target, passing the sample numbers loaded earlier
    impedance_fit_inputs = dict(
        source_df=source_impedance_df,
        target_df=target_impedance_df,
        source_sample_nos=source_samples,
        target_sample_nos=target_samples,
    )
    impedance_transformer.fit(**impedance_fit_inputs)

    # Save next to the flow transformers, named after both datasets
    impedance_path = f"{output_dir}/impedance_{source_dataset_name}_to_{target_dataset_name}.pkl"
    impedance_transformer.save(impedance_path)

    print(f"\nImpedance transformer saved to: {impedance_path}")

## 6. XN_SAMPLE Transformation Using Config

Finally, let's transform the tabular XN_SAMPLE data.

In [None]:
# Tabular data is read from XN_SAMPLE.csv inside each dataset directory
print("Loading XN_SAMPLE data...\n")

source_xnsample_path = f"{source_path}/XN_SAMPLE.csv"
target_xnsample_path = f"{target_path}/XN_SAMPLE.csv"

print(f"Source: {source_xnsample_path}")
print(f"Target: {target_xnsample_path}")

# Both files are required; later cells check the frames against None
xnsample_files_present = os.path.exists(source_xnsample_path) and os.path.exists(
    target_xnsample_path
)
if not xnsample_files_present:
    print("\n✗ XN_SAMPLE files not found - skipping XN_SAMPLE transformation")
    source_xnsample_df = None
    target_xnsample_df = None
else:
    print("\n✓ Both XN_SAMPLE files found")

    # low_memory=False reads each file in one pass rather than in chunks
    source_xnsample_df = pd.read_csv(source_xnsample_path, low_memory=False)
    target_xnsample_df = pd.read_csv(target_xnsample_path, low_memory=False)

    print(f"Source shape: {source_xnsample_df.shape}")
    print(f"Target shape: {target_xnsample_df.shape}")

In [None]:
# Choose the XN_SAMPLE columns to align (only when the files were loaded)
if source_xnsample_df is not None:
    # Candidate columns, in the order they will be handed to the transformer
    fbc_columns = [
        # Red cell parameters
        "RBC(10^6/uL)", "HGB(g/dL)", "HCT(%)", "MCV(fL)",
        "MCH(pg)", "MCHC(g/dL)", "RDW-SD(fL)", "RDW-CV(%)",
        # White cell count and differential (absolute)
        "WBC(10^3/uL)", "NEUT#(10^3/uL)", "LYMPH#(10^3/uL)",
        "MONO#(10^3/uL)", "EO#(10^3/uL)", "BASO#(10^3/uL)",
        # White cell differential (percentage)
        "NEUT%(%)", "LYMPH%(%)", "MONO%(%)", "EO%(%)", "BASO%(%)",
        # Platelet parameters
        "PLT(10^3/uL)", "MPV(fL)", "PCT(%)", "PDW(fL)",
    ]

    # A column is usable only if it is present in both frames
    source_cols = set(source_xnsample_df.columns)
    target_cols = set(target_xnsample_df.columns)
    shared_cols = source_cols & target_cols
    columns_to_transform = [col for col in fbc_columns if col in shared_cols]

    n_selected, n_candidates = len(columns_to_transform), len(fbc_columns)
    print(f"Columns to transform: {n_selected} / {n_candidates}")
    print(f"Available columns: {sorted(columns_to_transform)}")

In [None]:
# Fit XNSampleTransformer (only when the XN_SAMPLE files were loaded above)
if source_xnsample_df is None:
    print("\nSkipping XN_SAMPLE transformation (no data)")
else:
    print("\nCreating XNSampleTransformer...\n")

    xnsample_transformer = XNSampleTransformer(columns=columns_to_transform)

    # Fit source -> target, passing the sample numbers loaded earlier
    xnsample_fit_inputs = dict(
        source_df=source_xnsample_df,
        target_df=target_xnsample_df,
        source_sample_nos=source_samples,
        target_sample_nos=target_samples,
    )
    xnsample_transformer.fit(**xnsample_fit_inputs)

    # Save next to the other transformers, named after both datasets
    xnsample_path = f"{output_dir}/xnsample_{source_dataset_name}_to_{target_dataset_name}.pkl"
    xnsample_transformer.save(xnsample_path)

    print(f"\nXN_SAMPLE transformer saved to: {xnsample_path}")

## 7. Summary of Created Transformers

Let's summarize all the transformation models we've created.

In [None]:
# Tabulate every .pkl in the output directory with its size on disk
print("Saved Transformation Models:")
print("=" * 80)

transformer_files = sorted(glob(f"{output_dir}/*.pkl"))
bytes_per_mb = 1024 * 1024

# Print one row per file and keep a running byte total for the footer
total_bytes = 0
for tf_file in transformer_files:
    filename = os.path.basename(tf_file)
    file_bytes = os.path.getsize(tf_file)
    total_bytes += file_bytes
    size_mb = file_bytes / bytes_per_mb
    print(f"  {filename:<60} ({size_mb:>6.2f} MB)")

print("=" * 80)
print(f"Total transformers: {len(transformer_files)}")
total_size = total_bytes / bytes_per_mb
print(f"Total size: {total_size:.2f} MB")

## 8. Loading and Reusing Transformers

Once transformers are saved, you can reload them without refitting.

In [None]:
# Round-trip check: read back each kind of transformer saved above
print("Testing transformer loading...\n")

# Flow: use the first channel that had data
if available_channels:
    test_channel = available_channels[0]
    test_path = f"{output_dir}/flow_{test_channel}_{source_dataset_name}_to_{target_dataset_name}.pkl"

    print(f"Loading {test_channel} transformer from: {test_path}")
    loaded_flow = FlowTransformer.load(test_path)
    print(f"✓ Loaded successfully! Channel: {loaded_flow.channel}")

# Impedance: only saved when the impedance files were found
if source_impedance_df is not None:
    test_path = f"{output_dir}/impedance_{source_dataset_name}_to_{target_dataset_name}.pkl"
    print(f"\nLoading impedance transformer from: {test_path}")
    loaded_impedance = ImpedanceTransformer.load(test_path)
    print(f"✓ Loaded successfully! GMMs fitted: {loaded_impedance.is_fitted_}")

# XN_SAMPLE: only saved when the XN_SAMPLE files were found
if source_xnsample_df is not None:
    test_path = f"{output_dir}/xnsample_{source_dataset_name}_to_{target_dataset_name}.pkl"
    print(f"\nLoading XN_SAMPLE transformer from: {test_path}")
    loaded_xnsample = XNSampleTransformer.load(test_path)
    print(f"✓ Loaded successfully! Columns: {len(loaded_xnsample.columns)}")

## 9. Applying Transformations to New Data

Once transformers are fitted and saved, you can apply them to new data from the same source analyser.

In [None]:
# Apply the reloaded XN_SAMPLE transformer to a small slice of the source data
if source_xnsample_df is not None:
    print("Example: Transforming a subset of XN_SAMPLE data\n")

    # First 1000 rows, copied so the loaded source frame is left untouched
    test_subset = source_xnsample_df.head(1000).copy()
    print(f"Test subset: {test_subset.shape}")

    transformed_subset = loaded_xnsample.transform(test_subset)
    print(f"Transformed subset: {transformed_subset.shape}")

    # Compare one column before and after
    example_col = columns_to_transform[0]
    print(f"\nExample transformation for {example_col}:")

    # Coerce to numeric and drop anything that could not be parsed
    original_numeric = pd.to_numeric(test_subset[example_col], errors="coerce")
    original_vals = original_numeric.dropna()
    transformed_numeric = pd.to_numeric(transformed_subset[example_col], errors="coerce")
    transformed_vals = transformed_numeric.dropna()

    original_mean, original_std = original_vals.mean(), original_vals.std()
    print(f"  Original:    mean={original_mean:.4f}, std={original_std:.4f}")

    transformed_mean, transformed_std = transformed_vals.mean(), transformed_vals.std()
    print(f"  Transformed: mean={transformed_mean:.4f}, std={transformed_std:.4f}")

## 10. Production Workflow Recommendations

**Best practices for production pipelines:**

### Configuration Management
1. **Use version control** for `data_paths.yaml`
2. **Separate configs** for development/staging/production
3. **Document dataset names** and their purposes

### Transformer Management
1. **Save all transformers** with descriptive names including dataset names and dates
2. **Version transformers** when refitting (e.g., `flow_RET_36_to_41_v2.pkl`)
3. **Document transformation parameters** (n_components, max_samples, etc.)

### Data Processing
1. **Batch process** multiple channels/files to save time
2. **Use sample numbers** to select representative subsets for fitting
3. **Validate transformations** before using in production
4. **Log all operations** including file paths, parameters, and timestamps

## 11. Example: Automated Batch Pipeline

Here's a complete example of an automated pipeline that could be used in production.

In [None]:
def create_transformation_pipeline(
    config_path,
    source_dataset,
    target_dataset,
    source_samples_name,
    target_samples_name,
    output_dir,
    flow_channels=("RET", "WDF", "WNR", "PLTF"),
    process_impedance=True,
    process_xnsample=True,
    force_refit=False,
    xnsample_columns=None,
):
    """
    Fit and save source-to-target transformers for one pair of datasets.

    For each requested flow channel, and optionally for the impedance and
    XN_SAMPLE data, a transformer is fitted and saved under ``output_dir``.
    A transformer whose output file already exists is reused rather than
    refitted unless ``force_refit`` is True. Inputs that cannot be found on
    disk are reported and skipped; they do not raise.

    Parameters
    ----------
    config_path : str
        Path to data_paths.yaml
    source_dataset : str
        Source dataset name (e.g., 'interval_36')
    target_dataset : str
        Target dataset name (e.g., 'interval_41')
    source_samples_name : str
        Name of source sample number file (e.g., 'interval_baseline_36')
    target_samples_name : str
        Name of target sample number file (e.g., 'interval_baseline_41')
    output_dir : str
        Directory to save transformers (created if missing)
    flow_channels : iterable of str
        Flow cytometry channels to process. The default is an immutable
        tuple so that it cannot be modified between calls.
    process_impedance : bool
        Whether to process impedance data
    process_xnsample : bool
        Whether to process XN_SAMPLE data
    force_refit : bool
        If True, refit even if transformer exists
    xnsample_columns : iterable of str, optional
        Candidate XN_SAMPLE columns. Only those present in both the source
        and target files are used. If None, a built-in list of nine columns
        is used.

    Returns
    -------
    dict
        Maps 'flow_<channel>', 'impedance' and 'xnsample' to the path of the
        corresponding saved transformer. Entries are present only for
        transformers that were fitted in this call or already existed.
    """
    print("=" * 80)
    print("AUTOMATED TRANSFORMATION PIPELINE")
    print("=" * 80)

    # Load config
    config = get_config_loader(config_path)
    print(f"\n✓ Config loaded: {config_path}")

    # Get paths
    source_path = config.get_dataset_path("raw", source_dataset)
    target_path = config.get_dataset_path("raw", target_dataset)
    print(f"✓ Source path: {source_path}")
    print(f"✓ Target path: {target_path}")

    # Load samples - pass config_path string, not config object
    source_samples = load_centile_samples(source_samples_name, config_path)
    target_samples = load_centile_samples(target_samples_name, config_path)
    print(f"✓ Source samples: {len(source_samples):,}")
    print(f"✓ Target samples: {len(target_samples):,}")

    # Create output directory
    os.makedirs(output_dir, exist_ok=True)
    print(f"✓ Output directory: {output_dir}")

    transformer_paths = {}

    # Process flow cytometry channels
    print(f"\n{'='*80}")
    print("FLOW CYTOMETRY CHANNELS")
    print("=" * 80)

    for channel in flow_channels:
        save_path = (
            f"{output_dir}/flow_{channel}_{source_dataset}_to_{target_dataset}.pkl"
        )

        # Reuse a previously saved transformer unless a refit was requested
        if os.path.exists(save_path) and not force_refit:
            print(f"\n✓ {channel}: Transformer already exists (skipping)")
            transformer_paths[f"flow_{channel}"] = save_path
            continue

        print(f"\n→ Processing {channel}...")

        # Get files
        source_files = sorted(glob(f"{source_path}/SCT/{channel}_*.116.csv"))
        target_files = sorted(glob(f"{target_path}/SCT/{channel}_*.116.csv"))

        # A channel needs files on both sides; otherwise nothing is recorded
        if len(source_files) == 0 or len(target_files) == 0:
            print(f"  ✗ No data found - skipping")
            continue

        print(f"  Files: {len(source_files)} source, {len(target_files)} target")

        # Fit transformer. Keyword arguments are used so the call does not
        # depend on the positional order of fit().
        trans = FlowTransformer(channel=channel, n_components=20, max_samples=50000)
        trans.fit(
            source_files=source_files,
            target_files=target_files,
            source_sample_nos=source_samples,
            target_sample_nos=target_samples,
        )
        trans.save(save_path)

        transformer_paths[f"flow_{channel}"] = save_path
        print(f"  ✓ Saved: {save_path}")

    # Process impedance
    if process_impedance:
        print(f"\n{'='*80}")
        print("IMPEDANCE DATA")
        print("=" * 80)

        save_path = f"{output_dir}/impedance_{source_dataset}_to_{target_dataset}.pkl"

        if os.path.exists(save_path) and not force_refit:
            print(f"\n✓ Impedance transformer already exists (skipping)")
            transformer_paths["impedance"] = save_path
        else:
            source_imp = f"{source_path}/OutputData.csv"
            target_imp = f"{target_path}/OutputData.csv"

            if os.path.exists(source_imp) and os.path.exists(target_imp):
                print(f"\n→ Processing impedance data...")

                source_df = pd.read_csv(source_imp, low_memory=False)
                target_df = pd.read_csv(target_imp, low_memory=False)

                # Same constructor arguments as the standalone impedance cell
                # in this notebook, which does not pass n_components. Both
                # write to the same file name, so they should build the same
                # model.
                # NOTE(review): confirm whether ImpedanceTransformer accepts
                # n_components at all.
                trans = ImpedanceTransformer(gmm_sample_size=50000)
                trans.fit(
                    source_df=source_df,
                    target_df=target_df,
                    source_sample_nos=source_samples,
                    target_sample_nos=target_samples,
                )
                trans.save(save_path)

                transformer_paths["impedance"] = save_path
                print(f"  ✓ Saved: {save_path}")
            else:
                print(f"\n✗ Impedance data not found - skipping")

    # Process XN_SAMPLE
    if process_xnsample:
        print(f"\n{'='*80}")
        print("XN_SAMPLE DATA")
        print("=" * 80)

        save_path = f"{output_dir}/xnsample_{source_dataset}_to_{target_dataset}.pkl"

        if os.path.exists(save_path) and not force_refit:
            print(f"\n✓ XN_SAMPLE transformer already exists (skipping)")
            transformer_paths["xnsample"] = save_path
        else:
            source_xn = f"{source_path}/XN_SAMPLE.csv"
            target_xn = f"{target_path}/XN_SAMPLE.csv"

            if os.path.exists(source_xn) and os.path.exists(target_xn):
                print(f"\n→ Processing XN_SAMPLE data...")

                source_df = pd.read_csv(source_xn, low_memory=False)
                target_df = pd.read_csv(target_xn, low_memory=False)

                # Candidate columns: caller-supplied, or the built-in default
                if xnsample_columns is None:
                    fbc_cols = [
                        "RBC(10^6/uL)",
                        "HGB(g/dL)",
                        "HCT(%)",
                        "MCV(fL)",
                        "MCH(pg)",
                        "MCHC(g/dL)",
                        "WBC(10^3/uL)",
                        "PLT(10^3/uL)",
                        "MPV(fL)",
                    ]
                else:
                    fbc_cols = list(xnsample_columns)

                # Keep only the candidates present in both files
                available_cols = [
                    c
                    for c in fbc_cols
                    if c in source_df.columns and c in target_df.columns
                ]

                trans = XNSampleTransformer(columns=available_cols)
                trans.fit(
                    source_df=source_df,
                    target_df=target_df,
                    source_sample_nos=source_samples,
                    target_sample_nos=target_samples,
                )
                trans.save(save_path)

                transformer_paths["xnsample"] = save_path
                print(f"  ✓ Saved: {save_path}")
            else:
                print(f"\n✗ XN_SAMPLE data not found - skipping")

    # The count below includes transformers that already existed and were reused
    print(f"\n{'='*80}")
    print(f"PIPELINE COMPLETE")
    print(f"Created {len(transformer_paths)} transformers")
    print("=" * 80)

    return transformer_paths


print("Pipeline function defined! Ready to use.")

In [None]:
# Demonstrate the pipeline on one dataset pair, restricted to a single channel
print("Example: Running automated pipeline\n")

pipeline_settings = dict(
    config_path=config_path,
    source_dataset="interval_36",
    target_dataset="interval_41",
    source_samples_name="interval_baseline_36",
    target_samples_name="interval_baseline_41",
    output_dir="../outputs/transformers",
    flow_channels=["RET"],  # Just RET for demo
    process_impedance=True,
    process_xnsample=True,
    force_refit=False,  # reuse any transformer files already on disk
)
results = create_transformation_pipeline(**pipeline_settings)

# Show which transformer files the pipeline reported
print("\nCreated transformers:")
for name in results:
    path = results[name]
    print(f"  {name}: {path}")