# Diagnostics Interactive Inspector

**Purpose:** Hover over individual data points to inspect their EXFOR metadata — Entry ID, author, year, full REACTION string, reaction type, etc. This helps identify suspicious clusters and understand data provenance.

**Prerequisite:** Run ingestion with the `--diagnostics` flag to extract the additional metadata columns:

```bash
python scripts/ingest_exfor.py --x4-db data/x4sqlite1.db --test-subset --diagnostics
```

This adds `Author`, `Year`, `ReactionType`, `FullCode`, and `NDataPoints` columns to the Parquet output.

In [None]:
import pandas as pd
import numpy as np
from pathlib import Path
import plotly.graph_objects as go
import plotly.express as px
import ipywidgets as widgets
from IPython.display import display, HTML

# ── Load Parquet ───────────────────────────────────────────────────────────
# Candidate locations are probed in order; the first one that exists is used.
_PARQUET_CANDIDATES = (
    Path('../data/exfor_processed.parquet'),
    Path('data/exfor_processed.parquet'),
    Path('exfor_processed.parquet'),
)
PARQUET_PATH = next(
    (candidate for candidate in _PARQUET_CANDIDATES if candidate.exists()),
    _PARQUET_CANDIDATES[0],
)
if not PARQUET_PATH.exists():
    # Fail early with the list of probed locations rather than letting the
    # reader raise a generic error for just the first candidate.
    _searched = '\n  '.join(str(p.resolve()) for p in _PARQUET_CANDIDATES)
    raise FileNotFoundError(
        "exfor_processed.parquet not found. Looked in:\n  " + _searched +
        "\nRun the ingestion script (with --diagnostics) to create it."
    )

print(f"Loading: {PARQUET_PATH.resolve()}")
df = pd.read_parquet(PARQUET_PATH)
print(f"Loaded {len(df):,} data points  |  Columns: {list(df.columns)}")

# ── Detect diagnostic columns ──────────────────────────────────────────────
# These columns are only written when ingestion ran with --diagnostics.
DIAGNOSTIC_COLS = ['Author', 'Year', 'ReactionType', 'NDataPoints', 'FullCode']
available_diag = [c for c in DIAGNOSTIC_COLS if c in df.columns]
missing_diag = [c for c in DIAGNOSTIC_COLS if c not in df.columns]

if available_diag:
    print(f"\n\u2705 Diagnostic columns found: {available_diag}")
else:
    print("\n\u26a0\ufe0f  No diagnostic columns found!")
    print("   Re-run ingestion with --diagnostics flag to enable hover metadata.")
    print("   The notebook will still work, but hover info will be limited.")

if missing_diag:
    # Absent columns are left out of the hover text entirely.
    print(f"   Missing (not shown in hover): {missing_diag}")

In [None]:
# ══════════════════════════════════════════════════════════════════════════════
#  Element symbols & MT code descriptions
#  (originally mirrored from ThresholdExplorer; the symbol table here is
#  extended through Z=118)
# ══════════════════════════════════════════════════════════════════════════════

# Atomic number -> element symbol.
_SYMBOLS = {
    1: 'H', 2: 'He', 3: 'Li', 4: 'Be', 5: 'B', 6: 'C', 7: 'N', 8: 'O',
    9: 'F', 10: 'Ne', 11: 'Na', 12: 'Mg', 13: 'Al', 14: 'Si', 15: 'P',
    16: 'S', 17: 'Cl', 18: 'Ar', 19: 'K', 20: 'Ca', 21: 'Sc', 22: 'Ti',
    23: 'V', 24: 'Cr', 25: 'Mn', 26: 'Fe', 27: 'Co', 28: 'Ni', 29: 'Cu',
    30: 'Zn', 31: 'Ga', 32: 'Ge', 33: 'As', 34: 'Se', 35: 'Br', 36: 'Kr',
    37: 'Rb', 38: 'Sr', 39: 'Y', 40: 'Zr', 41: 'Nb', 42: 'Mo', 43: 'Tc',
    44: 'Ru', 45: 'Rh', 46: 'Pd', 47: 'Ag', 48: 'Cd', 49: 'In', 50: 'Sn',
    51: 'Sb', 52: 'Te', 53: 'I', 54: 'Xe', 55: 'Cs', 56: 'Ba', 57: 'La',
    58: 'Ce', 59: 'Pr', 60: 'Nd', 61: 'Pm', 62: 'Sm', 63: 'Eu', 64: 'Gd',
    65: 'Tb', 66: 'Dy', 67: 'Ho', 68: 'Er', 69: 'Tm', 70: 'Yb', 71: 'Lu',
    72: 'Hf', 73: 'Ta', 74: 'W', 75: 'Re', 76: 'Os', 77: 'Ir', 78: 'Pt',
    79: 'Au', 80: 'Hg', 81: 'Tl', 82: 'Pb', 83: 'Bi', 84: 'Po', 85: 'At',
    86: 'Rn', 87: 'Fr', 88: 'Ra', 89: 'Ac', 90: 'Th', 91: 'Pa', 92: 'U',
    93: 'Np', 94: 'Pu', 95: 'Am', 96: 'Cm', 97: 'Bk', 98: 'Cf', 99: 'Es',
    100: 'Fm', 101: 'Md', 102: 'No', 103: 'Lr', 104: 'Rf', 105: 'Db',
    106: 'Sg', 107: 'Bh', 108: 'Hs', 109: 'Mt', 110: 'Ds', 111: 'Rg',
    112: 'Cn', 113: 'Nh', 114: 'Fl', 115: 'Mc', 116: 'Lv', 117: 'Ts',
    118: 'Og',
}

# MT reaction number -> short human-readable reaction name.
_MT_NAMES = {
    1: 'Total', 2: 'Elastic', 4: 'Inelastic', 16: '(n,2n)',
    17: '(n,3n)', 18: 'Fission', 102: '(n,\u03b3) Capture',
    103: '(n,p)', 104: '(n,d)', 105: '(n,t)', 106: '(n,3He)',
    107: '(n,\u03b1)',
}


def isotope_str(Z, A):
    """Return an isotope label of the form ``'<symbol>-<A>'``, e.g. ``'U-235'``.

    Z and A may be any value accepted by ``int()`` (Python ints, floats,
    NumPy scalars). An atomic number without a known symbol is rendered as
    ``'Z<number>'``, using the integer form so that ``120.0`` yields
    ``'Z120-...'`` rather than ``'Z120.0-...'``.
    """
    z_num = int(Z)
    symbol = _SYMBOLS.get(z_num, f'Z{z_num}')
    return f"{symbol}-{int(A)}"


def mt_str(mt):
    """Return the reaction name for an MT number, or ``'MT-<n>'`` if unknown."""
    mt_num = int(mt)
    return _MT_NAMES.get(mt_num, f'MT-{mt_num}')


# ── Dropdown widgets, seeded from the elements present in the data ──────────
z_values = sorted(df['Z'].unique())
# Label "Z (symbol)" -> integer Z; '?' stands in for an unknown symbol.
z_options = dict(
    (f"{z_int} ({_SYMBOLS.get(z_int, '?')})", z_int)
    for z_int in map(int, z_values)
)

# A and MT start without options; the cascade callbacks fill them in.
w_z = widgets.Dropdown(description='Element (Z):', options=z_options)
w_a = widgets.Dropdown(description='Isotope (A):')
w_mt = widgets.Dropdown(description='Reaction (MT):')
w_color = widgets.Dropdown(
    description='Coloring:',
    options=['Uniform', 'Color by Entry', 'Color by data_type'],
    value='Uniform',
)
# Output area that the plot callback renders into.
output = widgets.Output()


def update_a(*args):
    """Cascade: Z changed -> repopulate the A dropdown, then refresh MT.

    Accepts and ignores the change payload passed by ``observe`` so it can
    also be called directly with no arguments.
    """
    z_val = w_z.value
    a_values = sorted(df.loc[df['Z'] == z_val, 'A'].unique())
    w_a.options = {str(int(a)): int(a) for a in a_values}
    if a_values:
        w_a.value = int(a_values[0])
    # Value observers only fire when the selected value actually changes.
    # If the new element's first A equals the previously selected A, the
    # A -> MT observer stays silent and the MT list would keep showing the
    # reactions of the old (Z, A). Refresh it explicitly; a second call in
    # the common case is harmless.
    update_mt()


def update_mt(*args):
    """Cascade: repopulate the MT dropdown for the selected (Z, A).

    Does nothing while either Z or A is unselected. Fission (MT 18) is
    preselected when the isotope has it; otherwise the lowest MT is.
    """
    selected_z = w_z.value
    selected_a = w_a.value
    if selected_z is not None and selected_a is not None:
        in_isotope = (df['Z'] == selected_z) & (df['A'] == selected_a)
        mt_values = sorted(df.loc[in_isotope, 'MT'].unique())
        w_mt.options = {
            f"{int(mt)} {mt_str(mt)}": int(mt) for mt in mt_values
        }
        if 18 in mt_values:
            # Prefer fission as the default view
            w_mt.value = 18
        elif mt_values:
            w_mt.value = int(mt_values[0])


# Chain the dropdowns: Z drives A, and A drives MT
w_z.observe(update_a, 'value')
w_a.observe(update_mt, 'value')

# Populate A and MT for the initially selected element
update_a()
update_mt()

display(widgets.HBox(children=[w_z, w_a, w_mt, w_color]))

In [None]:
# ══════════════════════════════════════════════════════════════════════════════
#  Diagnostic Scatter Plot (Plotly WebGL)
# ══════════════════════════════════════════════════════════════════════════════

# Columns offered in the hover box, in display order:
# (column name, label shown to the user, d3 format suffix).
_HOVER_FIELDS = (
    ('Entry', 'Entry', ''),
    ('Author', 'Author', ''),
    ('Year', 'Year', ''),
    ('FullCode', 'FullCode', ''),
    ('ReactionType', 'ReactionType', ''),
    ('NDataPoints', 'N pts in dataset', ''),
    ('sf6', 'sf6', ''),
    ('sf8', 'sf8', ''),
    ('data_type', 'data_type', ''),
    ('z_score', 'z_score', ':.2f'),
)

# Coloring dropdown value -> column used to color the points.
_COLOR_MODE_COLUMNS = {
    'Color by Entry': 'Entry',
    'Color by data_type': 'data_type',
}


def _hover_spec(df_group):
    """Return (hover_cols, hover_template) for the columns present in df_group."""
    present = [field for field in _HOVER_FIELDS if field[0] in df_group.columns]
    hover_cols = [col for col, _, _ in present]
    lines = ['Energy: %{x:.4e} eV', 'XS: %{y:.4e} b']
    lines += [
        f"{label}: %{{customdata[{pos}]{fmt}}}"
        for pos, (_, label, fmt) in enumerate(present)
    ]
    return hover_cols, '<br>'.join(lines) + '<extra></extra>'


def _hover_values(df_group, hover_cols):
    """Return a frame of hover_cols with missing values replaced for display."""
    filled = {}
    for col in hover_cols:
        series = df_group[col]
        dtype = series.dtype
        is_number = (
            pd.api.types.is_numeric_dtype(dtype)
            and not pd.api.types.is_bool_dtype(dtype)
        )
        if is_number:
            # Numeric gaps become the sentinel -1
            filled[col] = series.fillna(-1).to_numpy()
        else:
            # Go through object dtype first: filling a categorical with a
            # value that is not one of its categories raises.
            as_object = series.astype(object)
            filled[col] = as_object.where(series.notna(), 'N/A').to_numpy()
    return pd.DataFrame(filled, index=df_group.index, columns=hover_cols)


def create_diagnostic_scatter(df_group, title, color_mode='Uniform'):
    """
    Create an interactive log-log Plotly scatter plot with hover metadata.

    Parameters
    ----------
    df_group : pandas.DataFrame
        Rows to plot. Must contain 'Energy' and 'CrossSection'; any of the
        columns listed in ``_HOVER_FIELDS`` that are present are shown in the
        hover box, and absent ones are simply left out.
    title : str
        Figure title.
    color_mode : str
        'Uniform', 'Color by Entry' or 'Color by data_type'. A color mode
        whose column is missing from ``df_group`` falls back to 'Uniform'.

    Returns
    -------
    plotly.graph_objects.Figure
        WebGL scatter figure, or an annotated empty figure when ``df_group``
        has no rows.
    """
    if df_group.empty:
        fig = go.Figure()
        fig.add_annotation(text="No data for this selection", showarrow=False,
                           font=dict(size=20))
        fig.update_layout(title=title, width=1000, height=600)
        return fig

    hover_cols, hover_template = _hover_spec(df_group)
    hover_values = _hover_values(df_group, hover_cols)

    color_col = _COLOR_MODE_COLUMNS.get(color_mode)
    if color_col is not None and color_col in df_group.columns:
        # Attach the filled hover values under private column names so the
        # hover box shows the same 'N/A' / -1 placeholders as the uniform
        # mode, without touching the column used for coloring.
        hover_keys = [f'_hover_{pos}' for pos in range(len(hover_cols))]
        plot_df = df_group.assign(**{
            key: hover_values[col].to_numpy()
            for key, col in zip(hover_keys, hover_cols)
        })
        fig = px.scatter(
            plot_df, x='Energy', y='CrossSection',
            color=color_col,
            log_x=True, log_y=True,
            title=title,
            render_mode='webgl',
            custom_data=hover_keys or None,
        )
        fig.update_traces(hovertemplate=hover_template, marker=dict(size=4))
        # A legend with too many entries is unreadable; hide it.
        if color_col == 'Entry' and df_group['Entry'].nunique() > 30:
            fig.update_layout(showlegend=False)
    else:
        # Uniform coloring: a single Scattergl trace
        fig = go.Figure()
        fig.add_trace(go.Scattergl(
            x=df_group['Energy'],
            y=df_group['CrossSection'],
            mode='markers',
            marker=dict(size=3, color='#2E8B8B', opacity=0.6),
            customdata=hover_values.to_numpy() if hover_cols else None,
            hovertemplate=hover_template,
            name='data',
        ))
        fig.update_xaxes(type='log')
        fig.update_yaxes(type='log')
        fig.update_layout(title=title)

    # Same axis titles and layout regardless of color mode
    fig.update_xaxes(title='Energy (eV)')
    fig.update_yaxes(title='Cross Section (b)')
    fig.update_layout(
        width=1100, height=650,
        hoverlabel=dict(font_size=12, font_family='monospace'),
        template='plotly_white',
    )
    return fig

In [None]:
# ══════════════════════════════════════════════════════════════════════════════
#  Wire widgets to plot
# ══════════════════════════════════════════════════════════════════════════════

def _print_group_summary(df_group):
    """Print entry, author, year and data_type statistics for df_group."""
    cols = df_group.columns
    entry_count = df_group['Entry'].nunique() if 'Entry' in cols else '?'
    print(f"\n{len(df_group):,} points from {entry_count} unique entries")

    if 'Author' in cols:
        print("\nTop authors:")
        for name, n_pts in df_group['Author'].value_counts().head(5).items():
            print(f"  {name}: {n_pts:,} points")

    if 'Year' in cols:
        years = df_group['Year']
        if years.notna().any():
            print(f"Year range: {int(years.min())} - {int(years.max())}")

    if 'data_type' in cols:
        print("\ndata_type breakdown:")
        for kind, n_pts in df_group['data_type'].value_counts().items():
            print(f"  {kind}: {n_pts:,}")


def on_change(*args):
    """Redraw the scatter plot and summary for the current widget selection.

    Accepts and ignores the change payload passed by ``observe`` so it can
    also be called directly. Everything is rendered into ``output``.
    """
    output.clear_output(wait=True)
    with output:
        z_val = w_z.value
        a_val = w_a.value
        mt_val = w_mt.value
        if z_val is None or a_val is None or mt_val is None:
            print("Select Z, A, and MT above.")
        else:
            selected = (
                (df['Z'] == z_val) & (df['A'] == a_val) & (df['MT'] == mt_val)
            )
            df_group = df[selected].copy()

            title = (
                f"{isotope_str(z_val, a_val)}  {mt_str(mt_val)}"
                f"  ({len(df_group):,} points)"
            )
            create_diagnostic_scatter(
                df_group, title, color_mode=w_color.value
            ).show()

            # Summary stats, only when there is something to summarise
            if len(df_group) > 0:
                _print_group_summary(df_group)


# Any change to a dropdown redraws the plot
w_z.observe(on_change, 'value')
w_a.observe(on_change, 'value')
w_mt.observe(on_change, 'value')
w_color.observe(on_change, 'value')

display(output)

# Draw the plot for the initial selection
on_change()

In [None]:
# ══════════════════════════════════════════════════════════════════════════════
#  Color by Entry - visually identify dataset clusters
# ══════════════════════════════════════════════════════════════════════════════
# Select "Color by Entry" in the dropdown above to visually distinguish which
# datasets form suspicious clusters. Each entry gets a unique color.
#
# Alternatively, run this cell to generate a standalone color-by-entry plot
# for the current selection:

z_val, a_val, mt_val = w_z.value, w_a.value, w_mt.value
if any(v is None for v in (z_val, a_val, mt_val)):
    print("Select Z, A, MT above first.")
else:
    mask = (df['Z'] == z_val) & (df['A'] == a_val) & (df['MT'] == mt_val)
    df_group = df[mask].copy()

    iso = isotope_str(z_val, a_val)
    rxn = mt_str(mt_val)

    if 'Entry' not in df_group.columns:
        print(f"{iso} {rxn}: no 'Entry' column in the data; cannot list entries.")
    else:
        # Show entries with most data points. A categorical Entry column
        # reports unused categories with a count of 0; drop those so they
        # are neither counted nor looked up below.
        entry_counts = df_group['Entry'].value_counts()
        entry_counts = entry_counts[entry_counts > 0]
        print(f"{iso} {rxn}: {len(entry_counts)} unique entries")
        print("\nTop 15 entries by point count:")
        print("-" * 70)

        # First row of every entry, computed once instead of re-filtering
        # the frame for each printed entry.
        first_rows = (
            df_group.drop_duplicates(subset='Entry', keep='first')
            .set_index('Entry')
        )
        for entry, cnt in entry_counts.head(15).items():
            row = first_rows.loc[entry]
            # Absent columns and missing values are both shown as 'N/A'
            meta = {
                col: (
                    'N/A'
                    if col not in row.index or pd.isna(row[col])
                    else row[col]
                )
                for col in ('Author', 'Year', 'ReactionType', 'FullCode')
            }
            year = meta['Year'] if meta['Year'] == 'N/A' else int(meta['Year'])
            # str() keeps the width format valid for non-string entry IDs
            print(
                f"  {str(entry):20s}  {cnt:5d} pts  "
                f"{meta['Author']} ({year})  "
                f"{meta['ReactionType']}  {meta['FullCode']}"
            )

## Investigation Tips

When inspecting data points, look for these patterns:

1. **Single author/year producing outlier clusters** — a single experiment that disagrees with the consensus may have systematic errors
2. **Small NDataPoints (< 5)** — may indicate low-quality digitizations or test measurements
3. **Unexpected FullCode modifiers** — check if the EXFOR REACTION string reveals modifiers the metadata filter missed (ratio, relative, etc.)
4. **Old measurements (pre-1970)** — may use obsolete experimental techniques with larger systematic uncertainties
5. **ReactionType ≠ CS** — if diagnostics shows a ReactionType other than 'CS' (cross section), the point may be a different observable
6. **data_type = 'pure_CS'** vs other — points classified as pure cross-section data should be most reliable

### Workflow
1. Use **Uniform** coloring to see the overall energy-XS distribution
2. Switch to **Color by Entry** to identify which datasets form suspicious clusters
3. Hover over outlier points to read their metadata
4. Note the Entry IDs, then look them up in EXFOR: `https://www-nds.iaea.org/exfor/servlet/X4sGetSubent?subID=<ENTRY>`
5. Use **Color by data_type** to check if non-pure data leaked through the filter