# NonBDNA Finder — Lightweight Comparative Notebook

## Purpose
High-performance detection of Non-B DNA structural motifs with **lightweight, comparative-only output**.
All heavy per-file visualisations (linear tracks, KDE, violin, pie, network, UpSet) are **skipped**.

## What this notebook produces
| Output | Description |
|--------|-------------|
| **Per-file summary table** | Sequences, bp, GC%, motifs, density/kb, coverage% |
| **Class statistics table** | Count, mean length/score, density/kb, coverage% |
| **Subclass statistics table** | Count, mean length/score, density/kb, coverage% (top 30) |
| **Cross-file class distribution** | Bar chart: motif counts per class across all files |
| **Cross-file density bar chart** | Motifs/kb comparison across files |
| **Cross-file coverage bar chart** | Coverage% comparison across files |
| **Class density heatmap** | Files × Classes (motifs/kb) |
| **Class coverage heatmap** | Files × Classes (coverage%) |
| **Hybrid & Cluster comparison** | Counts across files |
| **GFF feature × class heatmap** | When GFF annotation files are present |
| **Global comprehensive stats** | Table of 28 genome-wide structural metrics |

## Performance strategy
- **ProcessPoolExecutor** for all sequence-level parallelism (bypasses Python GIL)
- **All but one of the available CPU cores** are used as worker processes (`os.cpu_count() - 1`, minimum 1)
- **Parquet streaming** keeps RAM low for large genomes
- **Vectorised NumPy/Pandas** operations throughout
- **Explicit GC** (`gc.collect()`) after each file to free memory promptly

## How to run
1. Edit `FASTA_INPUT` and `OUTPUT_DIR` in **Cell 1**
2. Run **Cell 1** (setup)
3. Run **Cell 2** (detection + comparative outputs)


In [None]:
# =============================================================================
# CELL 1 * SETUP -- imports, configuration, helpers
# Edit FASTA_INPUT and OUTPUT_DIR, then run this cell before Cell 2.
# =============================================================================

import sys, os, importlib, glob, gc, time, datetime, re, warnings
import concurrent.futures
from pathlib import Path
from collections import defaultdict
warnings.filterwarnings('ignore')

# -- Make the working directory importable ------------------------------------
# The project-local `Utilities` package is imported further down; putting the
# current working directory on sys.path lets it resolve (presumably the notebook
# is launched from the repository root -- confirm for your setup).
_REPO_ROOT = os.path.abspath(os.getcwd())
if _REPO_ROOT not in sys.path:
    sys.path.insert(0, _REPO_ROOT)

# -- Auto-install missing packages -------------------------------------------
# `import importlib` alone does not guarantee that the `importlib.util`
# submodule is loaded, so import it explicitly before calling find_spec().
import importlib.util

# (import name, pip requirement specifier)
_REQUIRED = [
    ('psutil',    'psutil>=5.8'),
    ('pandas',    'pandas>=1.3'),
    ('numpy',     'numpy>=1.21'),
    ('matplotlib','matplotlib>=3.5'),
    ('seaborn',   'seaborn>=0.11'),
    ('openpyxl',  'openpyxl>=3.0'),
    ('tqdm',      'tqdm>=4.64'),
    ('pyarrow',   'pyarrow>=10.0'),
]
# Only the presence of the module is checked, not its installed version.
_miss = [p for m, p in _REQUIRED if importlib.util.find_spec(m) is None]
if _miss:
    import subprocess
    subprocess.check_call([sys.executable, '-m', 'pip', 'install', *_miss, '-q'])
    # Packages installed after interpreter start-up may be invisible to the
    # import system's cached directory listings until the caches are reset.
    importlib.invalidate_caches()

import pandas as pd
import numpy as np
import matplotlib; matplotlib.use('Agg')
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm.auto import tqdm
from IPython.display import display, HTML, Image
sns.set_theme(style='whitegrid')

# -- Optional fast FASTA parser -----------------------------------------------
# Preference order: pyfastx, then Biopython's SeqIO, then the built-in parser
# in _stream_fasta(). Both flags are always defined so that later code can test
# either one without risking a NameError; Biopython is only probed when pyfastx
# is unavailable.
_HAS_PYFASTX = False
_HAS_SEQIO = False
try:
    import pyfastx as _pyfastx
    _HAS_PYFASTX = True
except ImportError:
    try:
        from Bio import SeqIO as _SeqIO
        _HAS_SEQIO = True
    except ImportError:
        pass

# -- USER CONFIGURATION -------------------------------------------------------
# FASTA_INPUT is expanded with glob.glob() by _resolve(), so relative patterns
# are matched against the current working directory.
FASTA_INPUT        = ['*.fna', '*.fasta']  # path, wildcard, or list
# Each run writes into a UTC-timestamped sub-directory of OUTPUT_DIR.
OUTPUT_DIR         = 'notebook_reports_lw' # output directory
# Passed through unchanged to the detection worker for every sequence.
ENABLED_CLASSES    = None                  # None = all; e.g. ['G-Quadruplex','Z-DNA']
# Any falsy value (None or 0) falls back to SystemResourceInspector's budget.
RAM_OVERRIDE_BYTES = None                  # None = auto

# -- Large-chromosome chunking ------------------------------------------------
# Converted to base pairs further down using 1 Mb = 1,000,000 bp and
# 1 kb = 1,000 bp, then handed to the detection worker.
LARGE_CHR_THRESHOLD_MB  = 50  # split chromosomes larger than this (Mb)
GENOME_CHUNK_SIZE_MB    = 2   # sub-chunk size (Mb)
GENOME_CHUNK_OVERLAP_KB = 5   # overlap between sub-chunks (kb)

# -- GPU detection ------------------------------------------------------------
def _detect_gpu():
    """Probe for a usable GPU backend and return ``(backend, device_name)``.

    PyTorch is tried first: if it imports and reports CUDA as available the
    result is ``('cuda', <name of device 0>)``. Otherwise CuPy is tried: if it
    imports and can build a tiny array the result is ``('cupy', 'CUDA GPU')``.
    Any exception raised while probing a library is swallowed on purpose so
    that detection is best-effort. ``(None, None)`` means no backend was found.
    """
    try:
        torch_mod = importlib.import_module('torch')
        if torch_mod.cuda.is_available():
            return 'cuda', torch_mod.cuda.get_device_name(0)
    except Exception:
        pass

    try:
        cupy_mod = importlib.import_module('cupy')
        cupy_mod.array([1])  # smoke test: fails when no device is usable
        return 'cupy', 'CUDA GPU'
    except Exception:
        pass

    return None, None

# Run the probe once at setup time and report it in the start-up banner.
# When no backend is found both values are None and "none (CPU)" is printed.
GPU_BACKEND, GPU_NAME = _detect_gpu()
print(f'\u2705 Deps OK | Python {sys.version.split()[0]} | '
      f'GPU: {GPU_BACKEND + "(" + GPU_NAME + ")" if GPU_BACKEND else "none (CPU)"}')

# -- Resolve FASTA files ------------------------------------------------------
def _resolve(inp):
    """Expand *inp* into a sorted list of unique absolute file paths.

    *inp* is either one path/glob pattern (a string) or an iterable of them.
    Each entry is expanded with ``glob.glob``; an entry that matches nothing
    but names an existing file is used literally. Results are resolved to
    absolute paths, de-duplicated and returned in sorted order.
    """
    patterns = [inp] if isinstance(inp, str) else list(inp)
    resolved = set()
    for pattern in patterns:
        matches = glob.glob(pattern)
        if not matches and os.path.isfile(pattern):
            matches = [pattern]
        for match in matches:
            resolved.add(str(Path(match).resolve()))
    return sorted(resolved)

def _seq_lengths(p):
    """Return the length of every record in FASTA file *p*, in file order.

    Sequence lines are stripped of surrounding whitespace before counting.
    A record with a header but no sequence contributes ``0`` instead of being
    dropped, so ``len(result)`` is the true number of records. Sequence text
    that appears before the first header is counted as one extra leading
    record. An empty file yields ``[]``.
    """
    lengths = []
    current = 0
    in_record = False  # True once at least one '>' header has been seen
    with open(p) as fh:
        for ln in fh:
            s = ln.strip()
            if s.startswith('>'):
                # Close the previous record, even when it was empty.
                if in_record or current:
                    lengths.append(current)
                in_record, current = True, 0
            else:
                current += len(s)
    if in_record or current:
        lengths.append(current)
    return lengths

def _stream_fasta(fasta_path):
    """Yield ``(name, sequence)`` pairs from *fasta_path* one record at a time.

    The parser is chosen from the module-level flags: pyfastx when available,
    otherwise Biopython's SeqIO, otherwise a small built-in line parser.

    Built-in parser rules:
      * the record name is the first whitespace-delimited token after ``>``
        (an empty string for a bare ``>`` header);
      * sequence lines are stripped of surrounding whitespace and blank lines
        are ignored, matching how ``_seq_lengths`` and ``_gc_and_length``
        count bases;
      * text before the first header is discarded.
    """
    if _HAS_PYFASTX:
        # Without an index pyfastx iterates plain (name, sequence) tuples
        # rather than Sequence objects, so unpack instead of using attributes.
        for name, seq in _pyfastx.Fasta(str(fasta_path), build_index=False):
            yield name, seq
    elif _HAS_SEQIO:
        with open(fasta_path) as fh:
            for rec in _SeqIO.parse(fh, 'fasta'):
                yield rec.id, str(rec.seq)
    else:
        name, parts = None, []
        with open(fasta_path) as fh:
            for ln in fh:
                s = ln.strip()
                if not s:
                    continue
                if s.startswith('>'):
                    if name is not None:
                        yield name, ''.join(parts)
                    fields = s[1:].split()
                    name = fields[0] if fields else ''
                    parts = []
                elif name is not None:
                    parts.append(s)
        if name is not None:
            yield name, ''.join(parts)

# Resolve the user's patterns to absolute paths; abort early when nothing matches.
FASTA_FILES = _resolve(FASTA_INPUT)
if not FASTA_FILES:
    raise FileNotFoundError(f'No FASTA files found for: {FASTA_INPUT}')

# Classify each file by its record lengths:
#   'single'      -- exactly one record
#   'multi_equal' -- several records that all have the same length
#   'multi'       -- anything else (including a file with no records)
FILE_TYPES = {}
for fp in FASTA_FILES:
    ls = _seq_lengths(fp)
    FILE_TYPES[fp] = ('single' if len(ls) == 1 else
                      'multi_equal' if len(set(ls)) == 1 else 'multi')

# Pair each FASTA with an annotation file that sits next to it and shares its
# stem; '.gff3' is tried before '.gff' and the first existing one wins.
GFF_MAP = {}
for fp in FASTA_FILES:
    stem, parent = Path(fp).stem, Path(fp).parent
    for ext in ('.gff3', '.gff'):
        cand = parent / (stem + ext)
        if cand.exists(): GFF_MAP[fp] = str(cand); break

# Echo the resolved inputs with their type and any paired GFF file.
print(f'\n\U0001f4c2 Input files: {len(FASTA_FILES)}')
for fp in FASTA_FILES:
    gff_tag = f'  +GFF: {Path(GFF_MAP[fp]).name}' if fp in GFF_MAP else ''
    print(f'   [{FILE_TYPES[fp]:12s}]  {Path(fp).name}{gff_tag}')

# -- Adaptive resource planning -----------------------------------------------
from Utilities.system_resource_inspector import SystemResourceInspector
from Utilities.adaptive_chunk_planner    import AdaptiveChunkPlanner
from Utilities.nonbscanner               import analyze_sequence as _nbf_analyze
from Utilities.utilities                 import (
    read_fasta_file,
    compute_comprehensive_genome_stats,
)

# Ask the project's planner for chunk size / overlap / execution mode, based on
# the RAM budget, the inspector's CPU count and the total on-disk input size
# (floored at 1,000 bytes so the planner never sees zero).
_insp   = SystemResourceInspector()
_budget = RAM_OVERRIDE_BYTES or _insp.get_memory_budget()
_cpus   = _insp.get_cpu_count()
_total  = max(sum(os.path.getsize(f) for f in FASTA_FILES if os.path.exists(f)), 1_000)
_plan   = AdaptiveChunkPlanner().plan(_total, _budget, _cpus)
CHUNK_SIZE, CHUNK_OVERLAP = _plan['chunk_size'], _plan['overlap']

# Worker count is one less than os.cpu_count(), never below 1; when the core
# count cannot be determined, 2 is assumed (giving a single worker).
# Note that `_cpus` above only feeds the planner, not this value.
N_WORKERS = max(1, (os.cpu_count() or 2) - 1)
EXEC_MODE = _plan['mode']

# Derived chunking parameters (Mb / kb -> bp)
_LARGE_CHR_THRESHOLD  = LARGE_CHR_THRESHOLD_MB  * 1_000_000
_GENOME_CHUNK_SIZE    = GENOME_CHUNK_SIZE_MB     * 1_000_000
_GENOME_CHUNK_OVERLAP = GENOME_CHUNK_OVERLAP_KB  * 1_000

# Timestamp is UTC; output directory names include 'Z' suffix to reflect this.
# Second-level resolution: two runs started within the same second would share
# a directory (exist_ok=True allows that silently).
_RUN_TS = datetime.datetime.now(datetime.timezone.utc).strftime('%Y%m%dT%H%M%SZ')
_BASE   = Path(OUTPUT_DIR) / _RUN_TS
_BASE.mkdir(parents=True, exist_ok=True)

# Summarise the effective run configuration.
print(f'\u2699\ufe0f  RAM {_budget/1e9:.2f} GB | '
      f'chunk={CHUNK_SIZE:,} overlap={CHUNK_OVERLAP:,} '
      f'workers={N_WORKERS} mode={EXEC_MODE}')
_fasta_backend = 'pyfastx' if _HAS_PYFASTX else ('SeqIO' if _HAS_SEQIO else 'built-in')
print(f'   FASTA stream: {_fasta_backend} | '
      f'large-chr chunk: {GENOME_CHUNK_SIZE_MB} Mb + {GENOME_CHUNK_OVERLAP_KB} kb overlap')
print(f'\U0001f4c2 Run output: {_BASE}')

# -- Shared helpers -----------------------------------------------------------
def _savefig(fig, path):
    """Write *fig* to *path* as a 120-dpi PNG, close it, and show it inline.

    The figure is closed straight after saving so matplotlib does not keep it
    alive; the notebook output is the saved image file rather than the live
    figure object.
    """
    target = str(path)
    fig.savefig(target, dpi=120, bbox_inches='tight')
    plt.close(fig)
    display(Image(target))

def _safe_fname(s):
    """Return ``str(s)`` with every character that is not a word character
    (``\\w``) or a hyphen replaced by an underscore."""
    text = str(s)
    unsafe = re.compile(r'[^\w\-]')
    return unsafe.sub('_', text)

def _gc_and_length(fasta_path):
    """Return ``(gc_percent, total_bases)`` for a FASTA file.

    Header lines (starting with ``>``) and blank lines are skipped. Every
    other line is stripped and upper-cased; G and C characters are counted
    against the total number of characters. The percentage is rounded to two
    decimals and is ``0.0`` when the file holds no sequence characters.
    """
    gc_bases = 0
    n_bases = 0
    with open(fasta_path) as handle:
        for raw in handle:
            line = raw.strip()
            if line and not line.startswith('>'):
                seq = line.upper()
                gc_bases += seq.count('G') + seq.count('C')
                n_bases += len(seq)
    if n_bases:
        pct = round(gc_bases / n_bases * 100, 2)
    else:
        pct = 0.0
    return pct, n_bases

def _merge_coverage(intervals, cap=None):
    """Return the number of positions covered by the union of *intervals*.

    Parameters
    ----------
    intervals : sequence of (start, end) pairs or an (n, 2) array
        Each pair contributes ``end - start`` positions; pairs whose end is
        not greater than their start are ignored. Overlapping or touching
        pairs are merged so no position is counted twice.
    cap : int, optional
        When given, every end coordinate is clipped to this value first.

    Returns
    -------
    int
        Total covered length (0 for empty input).
    """
    if len(intervals) == 0:
        return 0
    iv = np.array(intervals, dtype=np.int64)  # copy: the caller's data is never modified
    if cap is not None:
        iv[:, 1] = np.minimum(iv[:, 1], cap)
    iv = iv[iv[:, 1] > iv[:, 0]]
    if len(iv) == 0:
        return 0
    iv = iv[np.argsort(iv[:, 0], kind='stable')]

    starts = iv[:, 0]
    # Furthest end reached by any interval up to and including each row.
    reach = np.maximum.accumulate(iv[:, 1])
    # A new merged run begins wherever a start lies beyond everything seen so far.
    first = np.concatenate(([0], np.flatnonzero(starts[1:] > reach[:-1]) + 1))
    last = np.concatenate((first[1:] - 1, [len(iv) - 1]))
    return int((reach[last] - starts[first]).sum())

def _coverage(df, seq_lengths_dict):
    """Return the percentage of all sequence bases covered by motifs in *df*.

    Motif intervals (``Start``/``End``) are merged per ``Sequence_Name`` and
    clipped to that sequence's length from *seq_lengths_dict* (a name missing
    from the dict is clipped to 0, i.e. contributes nothing). The result is
    rounded to two decimals; it is ``0.0`` when *df* is empty, the dict is
    empty, or the summed sequence length is zero.
    """
    if df.empty or not seq_lengths_dict:
        return 0.0
    genome_bp = sum(seq_lengths_dict.values())
    if genome_bp == 0:
        return 0.0

    covered_bp = 0
    for seq_name, seq_motifs in df.groupby('Sequence_Name'):
        spans = seq_motifs[['Start', 'End']].values
        covered_bp += _merge_coverage(spans, cap=seq_lengths_dict.get(seq_name, 0))
    return round(covered_bp / genome_bp * 100, 2)

def _group_density_coverage(df, all_results, key):
    """Shared implementation: density and coverage per distinct value of *key*.

    Parameters
    ----------
    df : pandas.DataFrame
        Motif table with columns *key*, ``Source_File``, ``Sequence_Name``,
        ``Start`` and ``End``.
    all_results : dict
        Per-file result dicts; each value supplies ``'path'`` and
        ``'seq_lengths'`` (sequence name -> length).
    key : str
        Column to group by (``'Class'`` or ``'Subclass'``).

    Returns
    -------
    dict
        ``{value: {'Density_per_kb': float, 'Coverage_pct': float}}``; empty
        when *df* is empty or the summed sequence length is zero. Both figures
        use the combined length of all files as the denominator.
    """
    total_bp = sum(sum(r['seq_lengths'].values()) for r in all_results.values())
    if total_bp == 0 or df.empty:
        return {}

    # Motifs are matched to files by base name (the Source_File column), so
    # compute each file's name and length map once rather than per group.
    files = [(Path(res['path']).name, res['seq_lengths'])
             for res in all_results.values()]

    out = {}
    for label, grp in df.groupby(key):
        cov_bp = 0
        for file_name, seq_lengths in files:
            sub = grp[grp['Source_File'] == file_name]
            if sub.empty:
                continue
            for sn, sg in sub.groupby('Sequence_Name'):
                cov_bp += _merge_coverage(sg[['Start', 'End']].values,
                                          cap=seq_lengths.get(sn, 0))
        out[label] = {'Density_per_kb': round(len(grp) / total_bp * 1000, 4),
                      'Coverage_pct':   round(cov_bp / total_bp * 100, 3)}
    return out


def _class_density_coverage(df, all_results):
    """Return ``{class: {'Density_per_kb', 'Coverage_pct'}}`` across all files."""
    return _group_density_coverage(df, all_results, 'Class')


def _subclass_density_coverage(df, all_results):
    """Return ``{subclass: {'Density_per_kb', 'Coverage_pct'}}`` across all files."""
    return _group_density_coverage(df, all_results, 'Subclass')

def _parse_gff(gff_path):
    """Read the feature rows of a GFF file into a DataFrame.

    Returns a frame with columns ``Seq``, ``GFF_Type``, ``GFF_Start`` and
    ``GFF_End`` (taken from columns 1, 3, 4 and 5 of each feature line).
    The frame is empty, but still has those columns, when no feature row
    is found.

    Skipped input:
      * comment / directive lines starting with ``#``;
      * everything from a ``##FASTA`` directive onwards (embedded sequence);
      * lines with fewer than nine tab-separated fields;
      * lines whose start or end is not an integer. This function is called
        after a file's detection has already finished, so one malformed
        annotation row should not abort the whole run.
    """
    columns = ['Seq', 'GFF_Type', 'GFF_Start', 'GFF_End']
    rows = []
    with open(gff_path) as fh:
        for ln in fh:
            if ln.startswith('##FASTA'):
                break
            if ln.startswith('#'):
                continue
            p = ln.strip().split('\t')
            if len(p) < 9:
                continue
            try:
                start, end = int(p[3]), int(p[4])
            except ValueError:
                continue
            rows.append((p[0], p[2], start, end))
    return pd.DataFrame(rows, columns=columns)


In [None]:
# =============================================================================
# CELL 2 * ANALYSIS -- parallel detection + comparative outputs only
# Run Cell 1 first.
# =============================================================================

# Reference point for the wall-clock timings printed during and after the run.
_WALL_START = time.perf_counter()

# _mem_mb() returns the *current* resident set size of this (parent) process
# in MB (bytes / 1e6). Without psutil it returns NaN so the progress messages
# still format.
try:
    import psutil as _psutil
    _proc = _psutil.Process()
    def _mem_mb(): return _proc.memory_info().rss / 1e6
except ImportError:
    def _mem_mb(): return float('nan')

# -- Detection: ProcessPoolExecutor for maximum parallelism -------------------
from concurrent.futures import ProcessPoolExecutor, as_completed
from Utilities.genome_worker import process_chromosome

# Per-file results keyed by the FASTA stem (file name without its extension).
# NOTE(review): two inputs that share a stem (e.g. in different directories)
# would overwrite each other here and in the `_BASE / stem` output folder.
RESULTS_BY_FILE = {}
# Motifs tagged with the GFF features that contain them, keyed by the same
# stem; only filled for files with a paired GFF and at least one motif.
GFF_RESULTS     = {}

for fasta_path in tqdm(FASTA_FILES, desc='Files', unit='file'):
    stem  = Path(fasta_path).stem
    ftype = FILE_TYPES[fasta_path]
    fdir  = _BASE / stem
    fdir.mkdir(parents=True, exist_ok=True)
    _parquet_dir = str(fdir / '_parquet')
    Path(_parquet_dir).mkdir(exist_ok=True)
    tqdm.write(f'\n\u2500\u2500 {stem}  [{ftype}] \u2500\u2500')

    # All records of the file are materialised here, so every sequence of the
    # file is held in the parent process at once.
    _seq_items = list(_stream_fasta(fasta_path))
    # Sequence name -> length; records with duplicate names collapse to the last.
    sl_map = {sn: len(sq) for sn, sq in _seq_items}

    # One positional-argument tuple per sequence for process_chromosome.
    _worker_args = [
        (
            sn, sq,
            Path(fasta_path).name, ftype,
            _LARGE_CHR_THRESHOLD,
            _GENOME_CHUNK_SIZE,
            _GENOME_CHUNK_OVERLAP,
            ENABLED_CLASSES,
            _parquet_dir,
            CHUNK_SIZE, CHUNK_OVERLAP,
        )
        for sn, sq in _seq_items
    ]
    # NOTE(review): this drops only the list object; the sequence strings stay
    # alive because `_worker_args` still references them.
    del _seq_items; gc.collect()

    _t0 = time.perf_counter()
    _parquet_paths = []
    _total_motifs  = 0

    with ProcessPoolExecutor(max_workers=N_WORKERS) as _pool:
        # NOTE: sequence strings are pickled per worker — for very large chromosomes
        # (>100 Mb) this increases peak RSS. The genome_worker writes results to
        # Parquet on disk, so the parent process only holds metadata in memory.
        _futs = {_pool.submit(process_chromosome, a): a[0] for a in _worker_args}
        for _fut in tqdm(as_completed(_futs), total=len(_futs),
                         desc=f'  seqs({stem})', leave=False):
            _sn = _futs[_fut]
            try:
                # The worker result is unpacked as
                # (sequence name, parquet path or falsy, motif count, seconds).
                _sn_r, _ppath, _n, _t = _fut.result()
                if _ppath:
                    _parquet_paths.append(_ppath)
                _total_motifs += _n
                tqdm.write(f'  \u25b8 {_sn_r[:55]}  \u2192 {_n:,} motifs ({_t:.1f}s)')
            except Exception as _e:
                # Best-effort: a failing sequence is reported and skipped so the
                # remaining sequences and files still complete.
                tqdm.write(f'  \u26a0\ufe0f  {_sn[:55]}  \u2192 skipped ({_e})')

    _elapsed = time.perf_counter() - _t0
    tqdm.write(f'  \u2705 {_total_motifs:,} motifs in {_elapsed:.1f}s | RAM {_mem_mb():.0f} MB')

    # Gather the per-sequence Parquet outputs into one DataFrame for this file.
    if _parquet_paths:
        # NOTE(review): `_pq_local` is not referenced anywhere in the visible code.
        import pyarrow.parquet as _pq_local
        df = pd.concat([pd.read_parquet(p) for p in _parquet_paths], ignore_index=True)
    else:
        df = pd.DataFrame()

    # Guarantee the columns that the downstream tables and charts rely on.
    for _col, _dflt in [('Class', 'Unknown'), ('Subclass', 'Other'),
                         ('Start', 0), ('End', 0), ('Length', 0),
                         ('Score', 0.0), ('Strand', '+'), ('Sequence_Name', '')]:
        if _col not in df.columns: df[_col] = _dflt
    if not df.empty:
        # Back-fill Length from End - Start (never negative) where it is 0.
        df['Length'] = np.where(df['Length'] == 0,
                                (df['End'] - df['Start']).clip(lower=0), df['Length'])
    df['Source_File'] = Path(fasta_path).name
    df['File_Type']   = ftype

    if not df.empty:
        df.to_parquet(str(fdir / 'motifs.parquet'), index=False)
        df.to_csv(str(fdir / 'motifs.csv'), encoding='utf-8-sig', index=False)

    # GFF region tagging (if available)
    # A motif is kept for every feature on the same sequence that fully
    # contains it (Start >= GFF_Start and End <= GFF_End).
    # NOTE(review): the inner merge first builds every motif x feature pair per
    # sequence before filtering, which can need a very large amount of memory
    # on big annotated genomes. It also assumes motif and GFF coordinates use
    # the same convention -- confirm against the worker's output.
    if fasta_path in GFF_MAP:
        _gff_raw = _parse_gff(GFF_MAP[fasta_path])
        if not _gff_raw.empty and not df.empty:
            _gff_tagged = df.merge(
                _gff_raw, left_on='Sequence_Name', right_on='Seq', how='inner')
            _gff_tagged = _gff_tagged[
                (_gff_tagged['Start'] >= _gff_tagged['GFF_Start']) &
                (_gff_tagged['End']   <= _gff_tagged['GFF_End'])
            ]
            GFF_RESULTS[stem] = {'df': _gff_tagged, 'path': GFF_MAP[fasta_path]}

    RESULTS_BY_FILE[stem] = {
        'df': df, 'folder': fdir, 'file_type': ftype,
        'path': fasta_path, 'seq_lengths': sl_map,
    }
    gc.collect()

_wall_detect = time.perf_counter() - _WALL_START
print(f'\n\u2705 Detection complete -- {len(RESULTS_BY_FILE)} file(s) in {_wall_detect:.1f}s | '
      f'RAM {_mem_mb():.0f} MB')

# =============================================================================
# STATISTICS & COMPARATIVE TABLES
# =============================================================================

# Combine the non-empty per-file motif tables into one master table; the
# `_master` folder receives every cross-file output.
_dfs        = [r['df'] for r in RESULTS_BY_FILE.values() if not r['df'].empty]
_master_df  = pd.concat(_dfs, ignore_index=True) if _dfs else pd.DataFrame()
_master_dir = _BASE / '_master'; _master_dir.mkdir(exist_ok=True)

_gff_dfs = [v['df'] for v in GFF_RESULTS.values() if not v['df'].empty]
_gff_df  = pd.concat(_gff_dfs, ignore_index=True) if _gff_dfs else pd.DataFrame()

# Table name -> DataFrame; the name doubles as the CSV file stem and, title-
# cased, as the heading shown in the notebook.
_tables = {}

if not _master_df.empty:
    # Table 1: Per-file summary
    # Total_bp and GC_Percent come from re-reading the FASTA file, whereas
    # Sequences and Coverage_pct use the name -> length map built at detection.
    _pf_rows = []
    for stem, res in RESULTS_BY_FILE.items():
        df_r = res['df']
        gc_pct, seq_len = _gc_and_length(res['path'])
        n = len(df_r)
        _pf_rows.append({
            'File':           Path(res['path']).name,
            'File_Type':      res['file_type'],
            'Sequences':      len(res['seq_lengths']),
            'Total_bp':       seq_len,
            'GC_Percent':     gc_pct,
            'Total_Motifs':   n,
            'Classes':        df_r['Class'].nunique()    if not df_r.empty else 0,
            'Subclasses':     df_r['Subclass'].nunique() if not df_r.empty else 0,
            'Hybrids':        int((df_r['Class'] == 'Hybrid').sum())             if not df_r.empty else 0,
            'Clusters':       int((df_r['Class'] == 'Non-B_DNA_Clusters').sum()) if not df_r.empty else 0,
            'Density_per_kb': round(n / seq_len * 1000, 4) if seq_len else 0.0,
            'Coverage_pct':   _coverage(df_r, res['seq_lengths']),
        })
    _tables['1_per_file_summary'] = pd.DataFrame(_pf_rows)

    # Table 2: Class statistics
    # Density and coverage are relative to the combined length of all files.
    _cls_dc = _class_density_coverage(_master_df, RESULTS_BY_FILE)
    _tables['2_class_statistics'] = (
        _master_df.groupby('Class')
        .agg(Total_Count=('Class', 'count'),
             Mean_Length=('Length', 'mean'),
             Mean_Score=('Score', 'mean'))
        .round(3).reset_index()
        .assign(
            Density_per_kb=lambda d: d['Class'].map(
                lambda c: _cls_dc.get(c, {}).get('Density_per_kb', 0)),
            Coverage_pct=lambda d: d['Class'].map(
                lambda c: _cls_dc.get(c, {}).get('Coverage_pct', 0)),
        )
        .sort_values('Total_Count', ascending=False)
        [['Class', 'Total_Count', 'Mean_Length', 'Mean_Score', 'Density_per_kb', 'Coverage_pct']]
    )

    # Table 3: Subclass statistics (top 30 by count)
    _sc_dc = _subclass_density_coverage(_master_df, RESULTS_BY_FILE)
    _tables['3_subclass_statistics'] = (
        _master_df.groupby('Subclass')
        .agg(Total_Count=('Subclass', 'count'),
             Mean_Length=('Length', 'mean'),
             Mean_Score=('Score', 'mean'))
        .round(3).reset_index()
        .assign(
            Density_per_kb=lambda d: d['Subclass'].map(
                lambda s: _sc_dc.get(s, {}).get('Density_per_kb', 0)),
            Coverage_pct=lambda d: d['Subclass'].map(
                lambda s: _sc_dc.get(s, {}).get('Coverage_pct', 0)),
        )
        .sort_values('Total_Count', ascending=False)
        .head(30)
        [['Subclass', 'Total_Count', 'Mean_Length', 'Mean_Score', 'Density_per_kb', 'Coverage_pct']]
    )

    # Table 4: File x Class density pivot
    # Here the denominator is the length of the individual file (at least 1),
    # looked up through the stem of the Source_File name.
    _dens_rows = []
    for (fname, cls), grp in _master_df.groupby(['Source_File', 'Class']):
        stem_k = Path(fname).stem
        res    = RESULTS_BY_FILE.get(stem_k, {})
        slen   = max(sum(res.get('seq_lengths', {1: 1}).values()), 1)
        _dens_rows.append({'Source_File': fname, 'Class': cls,
                            'Density_per_kb': round(len(grp) / slen * 1000, 4)})
    _tables['4_class_density_pivot'] = (
        pd.DataFrame(_dens_rows)
        .pivot_table(index='Source_File', columns='Class',
                     values='Density_per_kb', fill_value=0)
        .reset_index()
    )

    # Table 5: File x Class coverage pivot
    # Intervals are merged per sequence and clipped to the sequence length;
    # the percentage is relative to the individual file's total length.
    _cov_rows = []
    for fname in _master_df['Source_File'].unique():
        stem_k = Path(fname).stem
        res    = RESULTS_BY_FILE.get(stem_k)
        if not res: continue
        sub_file = _master_df[_master_df['Source_File'] == fname]
        slen     = max(sum(res['seq_lengths'].values()), 1)
        for cls, grp in sub_file.groupby('Class'):
            cov = sum(
                _merge_coverage(sg[['Start', 'End']].values,
                                cap=res['seq_lengths'].get(sn, 0))
                for sn, sg in grp.groupby('Sequence_Name')
            )
            _cov_rows.append({'Source_File': fname, 'Class': cls,
                               'Coverage_pct': round(cov / slen * 100, 3)})
    _tables['5_class_coverage_pivot'] = (
        pd.DataFrame(_cov_rows)
        .pivot_table(index='Source_File', columns='Class',
                     values='Coverage_pct', fill_value=0)
        .reset_index()
    )

# Table 6 (only with GFF data): motif count per (feature type, class) pair.
if not _gff_df.empty:
    _tables['6_gff_motifs_per_feature'] = (
        _gff_df.groupby(['GFF_Type', 'Class']).size().reset_index(name='Count')
        .sort_values('Count', ascending=False)
    )

# -- Export tables to CSV -----------------------------------------------------
# CSVs are written as 'utf-8-sig' (UTF-8 with a byte-order mark).
if not _master_df.empty:
    _master_df.to_parquet(str(_master_dir / 'master_motifs.parquet'), index=False)
    _master_df.to_csv(str(_master_dir / 'master_motifs.csv'), encoding='utf-8-sig', index=False)
if not _gff_df.empty:
    _gff_df.to_csv(str(_master_dir / 'gff_region_motifs_all.csv'),
                   encoding='utf-8-sig', index=False)
for tname, tdf in _tables.items():
    tdf.to_csv(str(_master_dir / f'{tname}.csv'), encoding='utf-8-sig', index=False)

# =============================================================================
# COMPARATIVE VISUALISATIONS (final summary charts only)
# =============================================================================

# Every chart below is saved as `comparative_*.png` in the master folder and
# shown inline through _savefig(); the download section later globs for that
# file-name prefix.
print('\n' + '=' * 70)
print('COMPARATIVE VISUALISATIONS')
print('=' * 70)

if not _master_df.empty:
    _pf_summary = _tables.get('1_per_file_summary', pd.DataFrame())
    _n_files    = len(RESULTS_BY_FILE)

    # 1. Global class distribution (horizontal bar)
    # Series are reversed so the largest class is drawn at the top.
    cc = _master_df['Class'].value_counts()
    fig, ax = plt.subplots(figsize=(8, max(3, len(cc) * 0.45)))
    ax.barh(cc.index[::-1], cc.values[::-1], color='steelblue')
    ax.set_xlabel('Motif Count')
    ax.set_title('Global Class Distribution')
    for i, v in enumerate(cc.values[::-1]):
        ax.text(v + 0.3, i, str(v), va='center', fontsize=8)
    plt.tight_layout()
    _savefig(fig, _master_dir / 'comparative_class_distribution.png')

    # 2. Global class density (motifs/kb)
    # Rows keep the order of table 2 (sorted by Total_Count, not by density).
    cls_stat = _tables['2_class_statistics'].set_index('Class')['Density_per_kb']
    fig, ax = plt.subplots(figsize=(8, max(3, len(cls_stat) * 0.45)))
    ax.barh(cls_stat.index[::-1], cls_stat.values[::-1], color='teal')
    ax.set_xlabel('Motifs per kb')
    ax.set_title('Global Class Density (motifs/kb)')
    for i, v in enumerate(cls_stat.values[::-1]):
        ax.text(v, i, f'{v:.4f}', va='center', fontsize=8)
    plt.tight_layout()
    _savefig(fig, _master_dir / 'comparative_class_density.png')

    # 3. Global class coverage (%)
    cls_cov_s = _tables['2_class_statistics'].set_index('Class')['Coverage_pct']
    fig, ax = plt.subplots(figsize=(8, max(3, len(cls_cov_s) * 0.45)))
    ax.barh(cls_cov_s.index[::-1], cls_cov_s.values[::-1], color='mediumseagreen')
    ax.set_xlabel('Coverage (%)')
    ax.set_title('Global Class Coverage (%)')
    for i, v in enumerate(cls_cov_s.values[::-1]):
        ax.text(v, i, f'{v:.3f}%', va='center', fontsize=8)
    plt.tight_layout()
    _savefig(fig, _master_dir / 'comparative_class_coverage.png')

    # 4. Top-20 subclass distribution
    sc_all = _master_df['Subclass'].value_counts().head(20)
    fig, ax = plt.subplots(figsize=(8, max(4, len(sc_all) * 0.4)))
    ax.barh(sc_all.index[::-1], sc_all.values[::-1], color='darkorange')
    ax.set_xlabel('Count')
    ax.set_title('Global Subclass Distribution (top 20)')
    plt.tight_layout()
    _savefig(fig, _master_dir / 'comparative_subclass_distribution.png')

    # 5. File-level density & coverage comparison
    # One bar chart per metric, files sorted descending; x labels are file
    # stems truncated to 25 characters.
    if not _pf_summary.empty:
        for col, label, color, title, fn in [
            ('Density_per_kb', 'Motifs per kb', 'steelblue',
             'Motif Density Across Files', 'comparative_density_across_files.png'),
            ('Coverage_pct', 'Coverage (%)', 'mediumseagreen',
             'Non-B DNA Coverage Across Files', 'comparative_coverage_across_files.png'),
        ]:
            _pfs = _pf_summary.sort_values(col, ascending=False)
            fig, ax = plt.subplots(figsize=(max(6, len(_pfs) * 1.4), 4))
            bars = ax.bar(
                _pfs['File'].apply(lambda x: Path(x).stem[:25]),
                _pfs[col], color=color,
            )
            ax.bar_label(bars,
                         fmt='%.3f' if col == 'Density_per_kb' else '%.1f%%',
                         padding=2, fontsize=8)
            ax.set_ylabel(label); ax.set_title(title)
            if col == 'Coverage_pct': ax.set_ylim(0, 100)
            plt.xticks(rotation=30, ha='right'); plt.tight_layout()
            _savefig(fig, _master_dir / fn)

    # 6. Class density heatmap (files x classes)
    if '4_class_density_pivot' in _tables and not _tables['4_class_density_pivot'].empty:
        _dp = _tables['4_class_density_pivot'].set_index('Source_File')
        if not _dp.empty:
            fig, ax = plt.subplots(figsize=(max(10, len(_dp.columns) * 1.2),
                                            max(4,  len(_dp) * 0.7)))
            sns.heatmap(_dp, annot=True, fmt='.3f', cmap='YlOrRd', ax=ax,
                        linewidths=0.4, cbar_kws={'label': 'Motifs per kb'})
            ax.set_title('Class Density Heatmap (motifs/kb) -- Files x Classes')
            ax.set_xlabel('Non-B Class'); ax.set_ylabel('File')
            plt.tight_layout()
            _savefig(fig, _master_dir / 'comparative_class_density_heatmap.png')

    # 7. Class coverage heatmap (files x classes)
    if '5_class_coverage_pivot' in _tables and not _tables['5_class_coverage_pivot'].empty:
        _cp = _tables['5_class_coverage_pivot'].set_index('Source_File')
        if not _cp.empty:
            fig, ax = plt.subplots(figsize=(max(10, len(_cp.columns) * 1.2),
                                            max(4,  len(_cp) * 0.7)))
            sns.heatmap(_cp, annot=True, fmt='.3f', cmap='Blues', ax=ax,
                        linewidths=0.4, cbar_kws={'label': 'Coverage %'})
            ax.set_title('Class Coverage Heatmap (%) -- Files x Classes')
            ax.set_xlabel('Non-B Class'); ax.set_ylabel('File')
            plt.tight_layout()
            _savefig(fig, _master_dir / 'comparative_class_coverage_heatmap.png')

    # 8. Multi-file class count comparison (grouped bar; only when >= 2 files)
    # `_n_files` counts all processed files, including any without motifs.
    if _n_files >= 2:
        _cls_pivot = _master_df.groupby(['Source_File', 'Class']).size().unstack(fill_value=0)
        fig, ax = plt.subplots(figsize=(max(10, len(_cls_pivot) * 1.4),
                                        max(4, len(_cls_pivot.columns) * 0.5)))
        _cls_pivot.plot(kind='bar', ax=ax, colormap='tab20', width=0.8)
        ax.set_xlabel('File'); ax.set_ylabel('Motif Count')
        ax.set_title('Class Distribution -- All Files Comparison')
        ax.legend(title='Class', bbox_to_anchor=(1, 1), fontsize=8)
        plt.xticks(rotation=30, ha='right'); plt.tight_layout()
        _savefig(fig, _master_dir / 'comparative_all_files_class_comparison.png')

        # Top-20 subclass comparison across files
        # Columns follow the global top-20 ranking, most frequent first.
        _top_subs  = _master_df['Subclass'].value_counts().head(20).index
        _sub_pivot = _master_df.groupby(['Source_File', 'Subclass']).size().unstack(fill_value=0)
        _sub_pivot = _sub_pivot[[c for c in _top_subs if c in _sub_pivot.columns]]
        if not _sub_pivot.empty:
            fig, ax = plt.subplots(figsize=(max(10, len(_sub_pivot) * 1.4),
                                            max(4, len(_sub_pivot.columns) * 0.4)))
            _sub_pivot.plot(kind='bar', ax=ax, colormap='tab20', width=0.8)
            ax.set_xlabel('File'); ax.set_ylabel('Motif Count')
            ax.set_title('Subclass Distribution -- All Files (top 20)')
            ax.legend(title='Subclass', bbox_to_anchor=(1, 1), fontsize=7)
            plt.xticks(rotation=30, ha='right'); plt.tight_layout()
            _savefig(fig, _master_dir / 'comparative_all_files_subclass_comparison.png')

    # 9. Hybrid & Cluster comparison
    # Drawn only when at least one file has a Hybrid or cluster motif; the two
    # series sit side by side, each bar `w` wide.
    if not _pf_summary.empty and (
        _pf_summary['Hybrids'].sum() > 0 or _pf_summary['Clusters'].sum() > 0
    ):
        x = np.arange(len(_pf_summary)); w = 0.35
        labels = _pf_summary['File'].apply(lambda x: Path(x).stem[:20])
        fig, ax = plt.subplots(figsize=(max(7, len(_pf_summary) * 1.5), 4))
        ax.bar(x - w / 2, _pf_summary['Hybrids'],  w, label='Hybrids',  color='tomato')
        ax.bar(x + w / 2, _pf_summary['Clusters'], w, label='Clusters', color='mediumpurple')
        ax.set_xticks(x); ax.set_xticklabels(labels, rotation=30, ha='right')
        ax.set_ylabel('Count'); ax.set_title('Hybrid & Cluster Motifs Across Files')
        ax.legend(); plt.tight_layout()
        _savefig(fig, _master_dir / 'comparative_hybrid_cluster.png')

    # 10. GFF feature x class heatmap (if GFF data available)
    # Cells are integer counts, hence the 'd' annotation format.
    if not _gff_df.empty:
        _piv = _gff_df.groupby(['GFF_Type', 'Class']).size().unstack(fill_value=0)
        fig, ax = plt.subplots(figsize=(max(10, len(_piv.columns) * 1.2),
                                        max(4,  len(_piv) * 0.6)))
        sns.heatmap(_piv, annot=True, fmt='d', cmap='YlOrRd', ax=ax,
                    linewidths=0.4, cbar_kws={'label': 'Motif count'})
        ax.set_title('GFF Feature Type x Non-B Class Heatmap')
        ax.set_xlabel('Non-B Class'); ax.set_ylabel('GFF Feature Type')
        plt.tight_layout()
        _savefig(fig, _master_dir / 'comparative_gff_class_heatmap.png')

# =============================================================================
# SUMMARY TABLES (displayed inline)
# =============================================================================

# Show every table built above inline, under a heading derived from its key
# (underscores become spaces, then title-cased).
print('\n' + '=' * 70)
print('SUMMARY TABLES')
print('=' * 70)

for tname, tdf in _tables.items():
    print(f'\n--- {tname.replace("_", " ").title()} ---')
    display(tdf)

# =============================================================================
# GLOBAL COMPREHENSIVE GENOME STATISTICS
# =============================================================================

# Genome-wide metrics over the motifs of all files combined. The master table
# is converted to a list of per-motif dicts and passed, together with the
# summed sequence length of every file (at least 1), to the project's
# compute_comprehensive_genome_stats().
# NOTE(review): to_dict('records') builds one Python dict per motif, which may
# use a lot of memory for very large motif tables.
if not _master_df.empty:
    _all_motifs_global = _master_df.to_dict('records')
    _total_bp_global   = max(
        sum(sum(r['seq_lengths'].values()) for r in RESULTS_BY_FILE.values()), 1)
    try:
        gstats = compute_comprehensive_genome_stats(_all_motifs_global, _total_bp_global)
        # (display label, formatted value) pairs; a key missing from `gstats`
        # raises KeyError, which the handler below reports instead of raising.
        _gsg_rows = [
            ('Genome Length',                    f"{gstats['genome_length']:,} bp"),
            ('Motifs (excl. Hybrid/Cluster)',     f"{gstats['n_motifs']:,}"),
            ('Motifs (incl. Hybrid/Cluster)',     f"{gstats['n_motifs_all']:,}"),
            ('Motif Classes',                     str(gstats['n_classes'])),
            ('Motif Density',                     f"{gstats['density_per_kb']:.4f} / kb"),
            ('Total Covered Bases',               f"{gstats['total_covered_bases']:,} bp"),
            ('Coverage Fraction',                 f"{gstats['coverage_fraction']:.6f}"),
            ('Coverage (%)',                      f"{gstats['coverage_pct']:.4f}%"),
            ('Raw Occupancy',                     f"{gstats['raw_occupancy_bp']:,} bp"),
            ('Normalized Occupancy (SLI)',        f"{gstats['normalized_occupancy']:.6f}"),
            ('Mean Overlap Depth',                f"{gstats['mean_overlap_depth']:.4f}"),
            ('SLI',                               f"{gstats['sli']:.6f}"),
            ('Structural Intensity',              f"{gstats['structural_intensity']:.6f}"),
            ('Weighted Structural Coverage',      f"{gstats['weighted_structural_coverage']:.6f}"),
            ('Mean Inter-Motif Distance',         f"{gstats['mean_inter_motif_distance']:.2f} bp"),
            ('CV (Clustering Coefficient)',       f"{gstats['cv_spatial_clustering']:.4f}"),
            (f"Max Local Density (W={gstats['window_size']:,} bp)",
                                                  f"{gstats['max_local_density']:.6f}"),
            ('Max Class Diversity',               str(gstats['max_class_diversity_window'])),
            ('Max Cluster Score',                 f"{gstats['max_cluster_score']:.6f}"),
            ('Hybrid Regions',                    f"{gstats['hybrid_count']:,}"),
            ('Hybrid Coverage',                   f"{gstats['hybrid_coverage_pct']:.4f}%"),
            ('Cluster Regions',                   f"{gstats['cluster_count']:,}"),
            ('Cluster Coverage',                  f"{gstats['cluster_coverage_pct']:.4f}%"),
            ('Mean Overlap Fraction',             f"{gstats['mean_overlap_fraction']:.4f}"),
            ('Simpson Diversity Index (D)',       f"{gstats['simpson_diversity_index']:.4f}"),
            ('Effective Class Number (Neff)',     f"{gstats['effective_class_number']:.4f}"),
            ('SCI (Structural Complexity Index)', f"{gstats['sci']:.4f}"),
            ('Structural Dominance Ratio',        f"{gstats['dominance_ratio']:.4f}"),
        ]
        _gsg_df = pd.DataFrame(_gsg_rows, columns=['Metric', 'Value'])
        print('\n' + '=' * 70)
        print('GLOBAL COMPREHENSIVE GENOME STATISTICS')
        print('=' * 70)
        display(_gsg_df)
        _gsg_df.to_csv(str(_master_dir / 'global_comprehensive_genome_stats.csv'),
                       encoding='utf-8-sig', index=False)
    except Exception as _gse:
        # Best-effort section: a failure here must not block the download
        # links and final summary that follow.
        print(f'  \u26a0\ufe0f  Global comprehensive stats failed: {_gse}')

# =============================================================================
# DOWNLOAD LINKS
# =============================================================================

import base64

# File extension (without the dot) -> MIME type used in the download data URIs.
# Extensions not listed here fall back to 'application/octet-stream' in _dl().
_MIME = {
    'csv':     'text/csv',
    'xlsx':    'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
    'parquet': 'application/octet-stream',
    'png':     'image/png',
}

def _dl(path, label):
    """Return an HTML ``<a>`` download link that embeds the file at *path*.

    The whole file is read and base64-encoded into a ``data:`` URI, so the
    link works without a file server. The MIME type is looked up from the
    file extension in ``_MIME`` (default ``application/octet-stream``);
    *label* becomes the visible link text.

    NOTE: the file content ends up inside the notebook output, so the output
    grows with the size of every linked file.
    """
    src = Path(path)
    payload = base64.b64encode(src.read_bytes()).decode()
    mime = _MIME.get(src.suffix.lstrip('.'), 'application/octet-stream')
    anchor_style = ('margin:2px 6px;padding:3px 8px;border:1px solid #aaa;'
                    'border-radius:4px;text-decoration:none;')
    return (f'<a href="data:{mime};base64,{payload}" download="{src.name}" '
            f'style="{anchor_style}">{label}</a>')

# Build one HTML block with download links for the master outputs, the
# statistics tables and the comparative charts. Only files that actually exist
# in the master folder are linked; each link embeds its file via _dl().
_html = ['<h2>\U0001f4e5 Downloads</h2><h3>Master Outputs</h3><div>']
for fmt, fn in [('CSV', 'master_motifs.csv'), ('Parquet', 'master_motifs.parquet')]:
    p = _master_dir / fn
    if p.exists(): _html.append(_dl(str(p), f'Master {fmt}'))
if (_master_dir / 'global_comprehensive_genome_stats.csv').exists():
    _html.append(_dl(str(_master_dir / 'global_comprehensive_genome_stats.csv'),
                     'Global Comprehensive Stats'))
_html.append('</div><h3>Statistics Tables</h3><div>')
for tn in _tables:
    p = _master_dir / f'{tn}.csv'
    if p.exists(): _html.append(_dl(str(p), tn.replace('_', ' ').title()))
_html.append('</div><h3>Comparative Charts (PNG)</h3><div>')
for fn in sorted(_master_dir.glob('comparative_*.png')):
    _html.append(_dl(str(fn), fn.stem.replace('_', ' ').title()))
_html.append('</div>')
display(HTML('\n'.join(_html)))

_wall_total = time.perf_counter() - _WALL_START
print(f'\n\u2705 All outputs saved to: {_BASE}')
# _mem_mb() samples the resident set size at this moment, not a high-water
# mark, so the figure is labelled as the current value rather than a peak.
print(f'   Total wall time: {_wall_total:.1f}s | Current RAM: {_mem_mb():.0f} MB')
