# Advanced Configuration for XNSampleProcessor

This notebook provides a deep dive into all processing parameters and advanced features of the `XNSampleProcessor` class.

## Topics Covered

1. All Processing Parameters Explained
2. keep_drop_rows Mode - Preview Before Dropping
3. std_threshold Tuning
4. make_dummy_marks Parameter
5. Memory Optimization Settings
6. Correlation Analysis Deep Dive
7. Custom Cleaning Workflows
8. Performance Tuning
9. Integration with Other Modules

In [None]:
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import sys

# PDF-compatible fonts: font type 42 embeds text as TrueType in PDF/PS output
# so it stays editable in vector graphics tools
matplotlib.rcParams["pdf.fonttype"] = 42
matplotlib.rcParams["ps.fonttype"] = 42

# Scientific plot style
# NOTE(review): scienceplots is never referenced by name; it is presumably
# imported for the side effect of registering the "science"/"nature" styles
# used below - confirm against the installed scienceplots version.
import scienceplots

plt.style.use(["science", "nature"])

# Colourblind-friendly palette, kept as a list of colours for later plots
SEABORN_PALETTE = "colorblind"
seaborn_colors = sns.color_palette(SEABORN_PALETTE)

from sysmexcbctools.data import XNSampleProcessor

# For nice display: show up to 50 columns and allow wide output before wrapping
pd.set_option("display.max_columns", 50)
pd.set_option("display.width", 1000)

print("Setup complete!")

## 1. All Processing Parameters Explained

Let's review every parameter available in `XNSampleProcessor` and what they do.

In [None]:
# Instantiate without arguments so each attribute read below shows the value
# the processor uses when nothing is overridden
default_processor = XNSampleProcessor()

print("XNSampleProcessor Parameters:")
print("=" * 70)
print()

# Attribute name -> one-line description, in the order they are displayed
param_descriptions = {
    "config_path": "Path to YAML config file (overrides other params)",
    "remove_clotintube": "Remove samples with clot indicators (turbidity, agglutination, PLT clumps)",
    "remove_multimeasurementsamples": "Handle multiple measurements per sample",
    "remove_correlated": "Remove highly correlated columns (NOT RECOMMENDED)",
    "std_threshold": "SD threshold for comparing multiple measurements",
    "keep_drop_rows": "Keep rows but mark for dropping (preview mode)",
    "make_dummy_marks": "One-hot encode data mark fields (/M columns)",
    "use_memory_optimized": "Use memory-efficient processing for large datasets",
    "enable_memory_monitoring": "Log memory usage throughout processing",
    "correlation_sample_size": "Max rows for correlation analysis",
    "chunk_size": "Chunk size for processing multiple measurements",
    "force_dask": "Force Dask usage (for testing)",
    "output_dir": "Directory for output files",
    "output_prefix": "Prefix for output filenames",
    "log_to_file": "Save logs and diagnostic files",
    "verbose": "Verbosity level (0=silent, 1=info, 2=debug)",
}

# (name, current value, description) triples read off the default instance
params = [
    (attr_name, getattr(default_processor, attr_name), summary)
    for attr_name, summary in param_descriptions.items()
]

for param_name, default_value, description in params:
    # !s stringifies first so None/bool values accept the width specifier
    print(f"{param_name:30s} = {default_value!s:15s}  # {description}")

## 2. keep_drop_rows Mode - Preview Before Dropping

Sometimes you want to see what would be removed without actually removing it. With `keep_drop_rows=True`, the processor adds indicator columns that flag the affected rows instead of dropping them (thanks to Andrew Gibbs for this feature).

In [None]:
# Run the processor in preview mode (keep_drop_rows=True)
processor_preview = XNSampleProcessor(keep_drop_rows=True, verbose=1)

preview_source = "../data/temp/XN_SAMPLE_part1.csv"
df_preview = processor_preview.process_files(preview_source, save_output=False)

print(f"\nDataframe shape: {df_preview.shape}")
print("\nColumns starting with 'drop_':")
# Collect the per-reason indicator columns by their "drop_" prefix
drop_cols = list(filter(lambda name: name.startswith("drop_"), df_preview.columns))
print(drop_cols)

In [None]:
# Analyse what would be dropped: one line per drop reason, then the overall total
if drop_cols:
    print("Samples marked for dropping:")
    print("=" * 60)

    # An empty frame would otherwise divide by zero in the percentages
    n_rows = len(df_preview)

    for col in drop_cols:
        # int(): the column sum may be a float, which the "d" format rejects
        n_dropped = int(df_preview[col].sum())
        pct = n_dropped / n_rows * 100 if n_rows else 0.0
        print(f"{col:30s}: {n_dropped:6d} ({pct:5.2f}%)")

    # Overall drop column (union of all drop reasons)
    if "drop" in df_preview.columns:
        # Build an explicit boolean mask: "~" is only a logical NOT on booleans,
        # and on a 0/1 integer column it would yield -1/-2 instead
        drop_mask = df_preview["drop"].eq(True)

        total_drop = int(drop_mask.sum())
        pct_total = total_drop / n_rows * 100 if n_rows else 0.0
        print("\n" + "=" * 60)
        print(f"{'TOTAL (union)':30s}: {total_drop:6d} ({pct_total:5.2f}%)")

        # Samples that would be kept
        n_kept = int((~drop_mask).sum())
        pct_kept = n_kept / n_rows * 100 if n_rows else 0.0
        print(f"{'Samples kept':30s}: {n_kept:6d} ({pct_kept:5.2f}%)")

In [None]:
# Horizontal bar chart of how many rows each drop reason flags
if drop_cols:
    # Ascending sort puts the most frequent reason at the top of the chart
    drop_counts = df_preview[drop_cols].sum().sort_values(ascending=True)

    fig, ax = plt.subplots(figsize=(4.5, 2.2))
    drop_counts.plot.barh(ax=ax, color="salmon")
    ax.set_xlabel("Number of samples")
    ax.set_title("Samples Marked for Dropping by Reason")
    fig.tight_layout()
    plt.show()

In [None]:
# Example: Inspect samples that would be dropped due to PLT clumping
clump_flag_col = "drop_IP SUS(PLT)PLT Clumps?"
mpv_col = "MPV(fL)"

if clump_flag_col in df_preview.columns:
    # .eq(True) selects flagged rows and treats missing flags as "not flagged"
    clumped_samples = df_preview[df_preview[clump_flag_col].eq(True)]

    print(f"\nFound {len(clumped_samples)} samples with PLT clumping indicators")

    if len(clumped_samples) > 0:
        # Show a few examples, restricted to the columns this export contains
        print("\nExample clumped samples:")
        display_cols = ["Sample No.", "Date", "WBC(10^3/uL)", "PLT(10^3/uL)"]
        display_cols = [c for c in display_cols if c in clumped_samples.columns]
        print(clumped_samples[display_cols].head())

    # Guard the plotted column the same way as the display columns above, so a
    # file without MPV skips the plot instead of raising
    if mpv_col in df_preview.columns:
        print("\nClumping samples platelet volume distribution difference:")
        sns.displot(
            data=df_preview,
            x=mpv_col,
            hue=clump_flag_col,
            kind="kde",
            fill=True,
            common_norm=False,  # normalise each group separately
        )
        plt.show()
    else:
        print(f"\nColumn {mpv_col!r} not found - skipping distribution plot")

## 3. std_threshold Tuning

The `std_threshold` parameter controls how closely repeated measurements of the same sample must agree with each other to be considered correct and retained. (This step may be unnecessary if you take your samples directly from an electronic health records system, since those results should already have been checked by haematology lab staff before being entered.) Let's explore its impact.

In [None]:
# Sweep std_threshold and record how much data each setting retains
thresholds = [0.5, 1.0, 1.5, 2.0]
results = {}

print("Processing with different std_threshold values...\n")

for threshold in thresholds:
    # verbose=0 keeps the sweep quiet
    processor = XNSampleProcessor(std_threshold=threshold, verbose=0)

    df = processor.process_files("../data/temp/XN_SAMPLE_part1.csv", save_output=False)

    results[threshold] = dict(
        n_rows=len(df),
        n_samples=df["Sample No."].nunique(),
        dataframe=df,
    )

# Tabulate every run against the threshold=1.0 run
table_rule = "=" * 70
print("Impact of std_threshold on data retention:")
print(table_rule)
print(f"{'Threshold':>12s} {'Rows':>10s} {'Samples':>10s} {'Change vs 1.0':>15s}")
print(table_rule)

baseline_rows = results[1.0]["n_rows"]

# results was filled in thresholds order, so this walks the same sequence
for threshold, r in results.items():
    change = r["n_rows"] - baseline_rows
    if baseline_rows > 0:
        change_pct = (change / baseline_rows) * 100
    else:
        # No baseline rows: report 0 rather than dividing by zero
        change_pct = 0
    print(
        f"{threshold:12.1f} {r['n_rows']:10d} {r['n_samples']:10d} {change:+6d} ({change_pct:+5.2f}%)"
    )

In [None]:
# Visualise the impact of std_threshold: absolute row counts (left) and the
# percentage change relative to the threshold=1.0 run (right)
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(4.5, 2.2))

# Plot 1: Number of rows retained
rows_retained = [results[t]["n_rows"] for t in thresholds]
ax1.plot(thresholds, rows_retained, marker="o", linewidth=2, markersize=8)
ax1.set_xlabel("std_threshold")
ax1.set_ylabel("Number of rows retained")
ax1.set_title("Impact of std_threshold on Data Retention")
ax1.grid(True, alpha=0.3)
ax1.axvline(x=1.0, color="red", linestyle="--", alpha=0.5, label="Default (1.0)")
ax1.legend()

# Plot 2: Percentage change vs default
# Same zero guard as the printed comparison table: if the baseline run kept no
# rows, report 0% instead of raising ZeroDivisionError
pct_changes = [
    (results[t]["n_rows"] - baseline_rows) / baseline_rows * 100
    if baseline_rows > 0
    else 0.0
    for t in thresholds
]
ax2.bar(
    thresholds,
    pct_changes,
    width=0.3,
    alpha=0.7,
    # Losses relative to the baseline in salmon, gains (or no change) in blue
    color=["salmon" if x < 0 else "lightblue" for x in pct_changes],
)
ax2.set_xlabel("std_threshold")
ax2.set_ylabel("% Change vs default (1.0)")
ax2.set_title("Relative Change in Data Retention")
ax2.axhline(y=0, color="red", linestyle="--", alpha=0.5)
ax2.grid(True, alpha=0.3, axis="y")

plt.tight_layout()
plt.show()

## 4. make_dummy_marks Parameter

Data mark columns (ending in `/M`) contain quality indicators. By default they're kept as strings, but you can one-hot encode them.

In [None]:
# Baseline run: make_dummy_marks=False leaves the mark columns un-encoded
processor_no_dummy = XNSampleProcessor(make_dummy_marks=False, verbose=0)

no_dummy_source = "../data/temp/XN_SAMPLE_part1.csv"
df_no_dummy = processor_no_dummy.process_files(no_dummy_source, save_output=False)

# Data mark columns are identified by their "/M" suffix
mark_cols_no_dummy = [name for name in df_no_dummy.columns if name.endswith("/M")]

print("WITHOUT make_dummy_marks:")
print(f"  Shape: {df_no_dummy.shape}")
print(f"  Columns ending in /M: {len(mark_cols_no_dummy)}")
print("\nExample /M column values:")
if mark_cols_no_dummy:
    # Use the first mark column as a representative example
    example_col = mark_cols_no_dummy[0]
    top_values = df_no_dummy[example_col].value_counts().head()
    print(f"  {example_col}: {top_values}")

In [None]:
# Same input, this time with the mark columns one-hot encoded
processor_dummy = XNSampleProcessor(make_dummy_marks=True, verbose=0)

dummy_source = "../data/temp/XN_SAMPLE_part1.csv"
df_dummy = processor_dummy.process_files(dummy_source, save_output=False)

extra_columns = df_dummy.shape[1] - df_no_dummy.shape[1]
print("\nWITH make_dummy_marks:")
print(f"  Shape: {df_dummy.shape}")
print(f"  Additional columns created: {extra_columns}")

# Find dummy columns: those that extend the first mark column's name
if mark_cols_no_dummy:
    base_name = mark_cols_no_dummy[0]
    dummy_cols = []
    for candidate in df_dummy.columns:
        if candidate != base_name and candidate.startswith(base_name):
            dummy_cols.append(candidate)
    if dummy_cols:
        print(f"\nExample: {base_name} was expanded to:")
        # Show first 5
        print(*(f"  - {name}" for name in dummy_cols[:5]), sep="\n")

### When to use make_dummy_marks?

- **`True`**: When feeding data to ML models that need numeric features
- **`False`** (default): When you want to preserve the original mark information, or when you'll filter/analyze marks manually

**Note**: One-hot encoding can significantly increase the number of columns if there are many unique mark values.

## 5. Memory Optimization Settings

For large datasets, memory-optimized processing can be crucial.

In [None]:
import time

# Test standard vs memory-optimized processing
test_file = "../data/temp/XN_SAMPLE_part1.csv"

print("Comparing standard vs memory-optimized processing...\n")

# Standard processing
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# intervals; time.time() can jump if the system clock is adjusted mid-run
start = time.perf_counter()
processor_standard = XNSampleProcessor(
    use_memory_optimized=False, enable_memory_monitoring=True, verbose=1
)
df_standard = processor_standard.process_files(test_file, save_output=False)
# Elapsed seconds, including processor construction
time_standard = time.perf_counter() - start

print(f"\n{'='*70}")
print("Standard processing complete")
print(f"Time: {time_standard:.2f}s")
print(f"Result shape: {df_standard.shape}")

In [None]:
# Memory-optimized processing, timed the same way as the standard run
# perf_counter() is a monotonic clock meant for measuring intervals
start = time.perf_counter()
processor_optimized = XNSampleProcessor(
    use_memory_optimized=True, enable_memory_monitoring=True, chunk_size=1000, verbose=1
)
df_optimized = processor_optimized.process_files(test_file, save_output=False)
# Elapsed seconds, including processor construction
time_optimized = time.perf_counter() - start

print(f"\n{'='*70}")
print("Memory-optimized processing complete")
print(f"Time: {time_optimized:.2f}s")
print(f"Result shape: {df_optimized.shape}")

# Report the ratio in whichever direction is >= 1; skip it when either
# duration is zero so the division cannot fail
if time_standard > 0 and time_optimized > 0:
    if time_optimized < time_standard:
        print(f"\nSpeedup: {time_standard/time_optimized:.2f}x")
    else:
        print(f"\nSlowdown: {time_optimized/time_standard:.2f}x")
else:
    print("\nSpeedup: n/a (elapsed time too small to measure)")

### Memory Optimization Parameters:

- **use_memory_optimized** (default=True): Automatically uses chunked processing for large datasets
- **chunk_size** (default=1000): Number of sample IDs to process at once. Lower = less memory, but more overhead
- **correlation_sample_size** (default=50000): Subsample for correlation analysis to save memory
- **force_dask** (default=False): Force Dask usage for distributed processing (experimental)

**When to adjust**: For datasets >100k rows, consider decreasing chunk_size to 500. For datasets >1M rows, try force_dask=True.

## 6. Correlation Analysis Deep Dive

The processor analyzes correlations between all columns and standard FBC parameters. Let's understand this better.

In [None]:
# Run the processor with file logging on so the correlation table is written
corr_output_dir = "./advanced_output"
corr_dataset_name = "correlation_demo"

processor = XNSampleProcessor(
    remove_correlated=False,  # analyse only, keep every column
    log_to_file=True,  # write the correlation analysis to disk
    output_dir=corr_output_dir,
    verbose=1,
)

df = processor.process_files(
    "../data/temp/XN_SAMPLE_part1.csv",
    dataset_name=corr_dataset_name,
    save_output=False,
)

# Expected location: <output_dir>/<dataset_name>_correlated_columns.csv
correlation_file = Path(corr_output_dir) / f"{corr_dataset_name}_correlated_columns.csv"
if not correlation_file.exists():
    print("\nNo correlation file created (log_to_file=False or no correlations found)")
else:
    print(f"\n✓ Correlation analysis saved to: {correlation_file}")
    df_corr = pd.read_csv(correlation_file)
    print(f"\nShape: {df_corr.shape}")
    print("\nFirst few rows:")
    print(df_corr.head(10))

In [None]:
# Visualise correlation patterns: histogram of |r| plus a bar chart of the
# strongest feature / FBC-parameter pairs
if correlation_file.exists():
    df_corr = pd.read_csv(correlation_file)

    # Rename columns for easier access
    df_corr = df_corr.rename(
        columns={
            "Standard FBC feature": "FBC_Parameter",
            "Correlated features": "Feature",
            "Correlation strength": "Correlation",
        }
    )

    # |r| cut-off used for the marker lines and the "highly correlated" filter
    corr_threshold = 0.8

    def latex_escape(text):
        """Escape *text* so it can be used as a LaTeX-rendered plot label.

        Unit suffixes such as ``(10^3/uL)`` are converted to a math-mode
        superscript; every other LaTeX special character is replaced by its
        escaped form. The work is done in a single pass so that characters
        introduced by one replacement (``$``, ``^``, ``{}``) are never
        re-escaped by a later one.

        NOTE(review): assumes the active matplotlib style renders text through
        LaTeX (text.usetex) - confirm for the styles in use.
        """
        import re

        specials = {
            # The trailing {} ends the command name so it cannot swallow the
            # characters that follow it
            "\\": r"\textbackslash{}",
            "~": r"\textasciitilde{}",
            "^": r"\textasciicircum{}",
            "&": r"\&",
            "%": r"\%",
            "#": r"\#",
            "_": r"\_",
            "{": r"\{",
            "}": r"\}",
            "$": r"\$",
        }

        # Alternative 1: a "(10^<digits>/<unit>)" suffix; alternative 2: one special char
        token = re.compile(r"\(10\^(\d+)/(\w+)\)|([\\&%#_{}~^$])")

        def _convert(match):
            special_char = match.group(3)
            if special_char is not None:
                return specials[special_char]
            exponent = match.group(1)
            # \w can match "_", which still needs escaping inside the unit
            unit = match.group(2).replace("_", specials["_"])
            # Braces keep multi-digit exponents fully superscripted
            return "(10$^{" + exponent + "}$/" + unit + ")"

        return token.sub(_convert, text)

    # Plot correlation distribution
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(4.5, 2.2))

    # Histogram of correlation values
    ax1.hist(df_corr["Correlation"].abs(), bins=50, edgecolor="black", alpha=0.7)
    ax1.axvline(
        x=corr_threshold,
        color="red",
        linestyle="--",
        label=f"Threshold ({corr_threshold})",
    )
    ax1.set_xlabel("Absolute Correlation")
    ax1.set_ylabel("Frequency")
    ax1.set_title("Distribution of Correlations with FBC Parameters")
    ax1.legend()
    ax1.grid(True, alpha=0.3)

    # Highly correlated features
    high_corr = df_corr[df_corr["Correlation"].abs() >= corr_threshold].sort_values(
        "Correlation", ascending=False
    )
    if len(high_corr) > 0:
        top_n = min(15, len(high_corr))
        high_corr_top = high_corr.head(top_n)

        # Escape LaTeX special characters in labels
        labels = [
            latex_escape(f"{feature} vs {fbc_parameter}")
            for feature, fbc_parameter in zip(
                high_corr_top["Feature"], high_corr_top["FBC_Parameter"]
            )
        ]
        ax2.barh(range(len(labels)), high_corr_top["Correlation"].values, alpha=0.7)
        ax2.set_yticks(range(len(labels)))
        ax2.set_yticklabels(labels, fontsize=8)
        ax2.set_xlabel("Correlation")
        # Raw f-string: "\g" is an invalid escape in a normal string literal.
        # \geq is used instead of the Unicode sign for LaTeX compatibility.
        ax2.set_title(
            rf"Top {top_n} Highly Correlated Features ($|r| \geq {corr_threshold}$)"
        )
        ax2.axvline(x=corr_threshold, color="red", linestyle="--", alpha=0.5)
        ax2.grid(True, alpha=0.3, axis="x")
    else:
        # Nothing passes the cut-off: say so in the otherwise empty panel
        ax2.text(
            0.5,
            0.5,
            rf"No features with $|r| \geq {corr_threshold}$",
            ha="center",
            va="center",
            transform=ax2.transAxes,
        )

    plt.tight_layout()
    plt.show()

    print(f"\nHighly correlated features (|r| >= {corr_threshold}): {len(high_corr)}")

### Should you use remove_correlated=True?

**Generally NO**, for these reasons:

1. **Information loss**: Highly correlated features may still contain unique information
2. **Context-dependent**: Correlation in one dataset may not hold in another
3. **Domain knowledge**: Some "redundant" features may be clinically important

**When to consider it**:
- You have extreme dimensionality issues (thousands of features)
- You've manually reviewed the correlation analysis and agree with the removal
- You're doing exploratory analysis where interpretability > completeness
- You plan to use the features as inputs to an ML model and want to analyse feature importances (importance is usually split among correlated features, which makes individual scores harder to interpret)

## 7. Custom Cleaning Workflows

You can combine the processor with custom preprocessing steps for specialized workflows.

In [None]:
# Example: Add custom filtering before processing

# Step 1: Load raw data
df_raw = pd.read_csv("../data/temp/XN_SAMPLE_part1.csv", low_memory=False)
print(f"Raw data: {df_raw.shape}")

# Step 2: Apply custom filters
# Example: Keep only samples from a specific date range
# errors="coerce" turns unparseable dates into NaT, which the mask excludes
df_raw["Date"] = pd.to_datetime(df_raw["Date"], errors="coerce")
# Half-open upper bound: "<= 2019-12-31" compares against midnight and would
# drop any row carrying a time of day on the last day of the range
date_mask = (df_raw["Date"] >= "2012-01-01") & (df_raw["Date"] < "2020-01-01")
df_filtered = df_raw[date_mask].copy()
print(f"After date filter: {df_filtered.shape}")

# Step 3: Save temporarily (process_files is given a file path here)
temp_file = "../data/temp_custom_filtered.csv"
try:
    df_filtered.to_csv(temp_file, index=False)

    # Step 4: Process with XNSampleProcessor
    processor = XNSampleProcessor(verbose=0)
    df_clean = processor.process_files(temp_file, save_output=False)
    print(f"After cleaning: {df_clean.shape}")
finally:
    # Cleanup runs even if writing or processing fails, so no stale temp file
    # is left next to the real data
    temp_path = Path(temp_file)
    if temp_path.exists():
        temp_path.unlink()

In [None]:
# Example: Post-processing after cleaning

# Standard cleaning pass
processor = XNSampleProcessor(verbose=0)
df = processor.process_files("../data/temp/XN_SAMPLE_part1.csv", save_output=False)

print(f"After standard cleaning: {df.shape}")

# Column names used by the derived features below
wbc_col = "WBC(10^3/uL)"
lymph_col = "LYMPH#(10^3/uL)"
rbc_col = "RBC(10^6/uL)"
hgb_col = "HGB(g/dL)"

# Derived feature: lymphocyte count divided by total WBC count
if {wbc_col, lymph_col}.issubset(df.columns):
    df["LYMPH_RATIO"] = df[lymph_col] / df[wbc_col]
    print("Added LYMPH_RATIO feature")

# Derived feature: HGB / RBC * 10
if {rbc_col, hgb_col}.issubset(df.columns):
    df["MCH_calculated"] = df[hgb_col] / df[rbc_col] * 10
    print("Added MCH_calculated feature")

# Custom filtering: keep WBC values between the 0.1st and 99.9th percentiles
if wbc_col in df.columns:
    wbc_before = len(df)
    wbc_low = df[wbc_col].quantile(0.001)
    wbc_high = df[wbc_col].quantile(0.999)
    # between() is inclusive at both ends
    df = df[df[wbc_col].between(wbc_low, wbc_high)]
    print(f"Removed {wbc_before - len(df)} extreme WBC outliers")

print(f"\nFinal shape: {df.shape}")

## 8. Performance Tuning

Tips for processing very large datasets efficiently.

In [None]:
# Suggested processor settings per dataset size, printed as a reference sheet

print("Recommended configurations for different dataset sizes:\n")

# One entry per size tier: display name, constructor keyword values, advice
configs = [
    dict(
        name="Small (<50k rows)",
        params=dict(
            use_memory_optimized=False,
            enable_memory_monitoring=False,
            verbose=1,
        ),
        notes="Standard processing is fine. No need for optimization.",
    ),
    dict(
        name="Medium (50k-200k rows)",
        params=dict(
            use_memory_optimized=True,
            chunk_size=1000,
            correlation_sample_size=50000,
            enable_memory_monitoring=True,
            verbose=1,
        ),
        notes="Default settings work well. Monitor memory if issues arise.",
    ),
    dict(
        name="Large (200k-1M rows)",
        params=dict(
            use_memory_optimized=True,
            chunk_size=500,
            correlation_sample_size=30000,
            enable_memory_monitoring=True,
            verbose=1,
        ),
        notes="Reduce chunk size and correlation sample size. Consider processing subsets separately.",
    ),
    dict(
        name="Very Large (>1M rows)",
        params=dict(
            use_memory_optimized=True,
            chunk_size=250,
            correlation_sample_size=10000,
            enable_memory_monitoring=True,
            force_dask=True,
            verbose=1,
        ),
        notes="Consider splitting into multiple files. Use force_dask if available. Process on high-memory machine.",
    ),
]

section_rule = "=" * 70

for config in configs:
    # Assemble the whole section first, then emit it with a single print
    section_lines = [
        f"\n{section_rule}",
        f"Configuration: {config['name']}",
        section_rule,
        "\nParameters:",
    ]
    section_lines.extend(
        f"  {key:30s} = {setting}" for key, setting in config["params"].items()
    )
    section_lines.append(f"\nNotes: {config['notes']}")
    print("\n".join(section_lines))

## 9. Integration with Other Modules

How to prepare cleaned data for use with other SysmexCBCTools modules.

In [None]:
# Produce the cleaned frame that the downstream examples start from
cleaning_options = dict(
    remove_clotintube=True,
    remove_multimeasurementsamples=True,
    std_threshold=1.0,
    verbose=1,
)
processor = XNSampleProcessor(**cleaning_options)

downstream_source = "../data/temp/XN_SAMPLE_part1.csv"
# save_output=False: keep the result in memory only
df_clean = processor.process_files(downstream_source, save_output=False)

print(f"Cleaned data shape: {df_clean.shape}")

### For Transfer Module (Cross-Analyser Alignment)

The transfer module works with both XN_SAMPLE.csv files and raw flow cytometry data. After cleaning:

```python
# Save cleaned data for transfer module
df_clean.to_csv('cleaned_source.csv', index=False)

# Then use with transfer module
# from sysmexcbctools.transfer import ...
# transform_xnsample(
#     source_dataset='cleaned_source.csv',
#     target_dataset='cleaned_target.csv',
#     ...
# )
```

### For Correction Module (GAM-based Domain Correction)

The correction module expects clean, numeric tabular data:

```python
# Ensure numeric columns for GAM fitting
# Select FBC parameters
fbc_cols = ['WBC(10^3/uL)', 'RBC(10^6/uL)', 'HGB(g/dL)', 'PLT(10^3/uL)']
df_for_gam = df_clean[fbc_cols].dropna()

# Add covariates to correct for
df_for_gam['sample_age_days'] = ...  # Calculate from dates
df_for_gam['day_of_week'] = ...      # Extract from dates

# Save for correction module
# df_for_gam.to_csv('for_gam_correction.csv', index=False)
```

### For Dis-AE 2 Module (Domain-Invariant Learning)

The Dis-AE 2 module requires numeric features and integer-encoded labels:

```python
# Select numeric features (FBC parameters + others)
feature_cols = ['WBC(10^3/uL)', 'RBC(10^6/uL)', 'HGB(g/dL)', 'HCT(%)',
                'PLT(10^3/uL)', 'MCV(fL)', 'MCH(pg)', 'MCHC(g/dL)']
X = df_clean[feature_cols].dropna()

# Prepare task labels (e.g., binary classification)
# y_task = ...

# Prepare domain labels (e.g., analyser ID, site, batch)
# y_domain = df_clean['Analyzer ID'].map({'analyzer1': 0, 'analyzer2': 1, ...})

# Then use with Dis-AE 2
# from sysmexcbctools.disae2.disae2 import DisAE
# model = DisAE(input_dim=len(feature_cols), ...)
# model.fit(X.values, y_task, y_domain)
```