# NonBDNA Finder — Analysis Notebook

## Overview
Detects and analyses **Non-B DNA structural motifs** in one or more FASTA files.  
The notebook is structured as **2 cells**:

| Cell | Purpose |
|------|---------|
| **Cell 1 · Setup** | Imports, user config, helpers, engine initialisation |
| **Cell 2 · Analysis** | Detection, all statistics (class/subclass densities & coverages), plots, downloads |

Run **Cell 1 first**, then **Cell 2**.

---

## Detectors — 9 classes, 23+ subclasses

| Class | Key Subclasses |
|---|---|
| Curved DNA | Curved, Bent |
| Slipped DNA | Direct-repeat, Mirror |
| Cruciform | Cruciform |
| R-Loop | R-loop, G-rich, C-rich |
| Triplex | H-DNA, R·R·Y, Y·R·Y |
| G-Quadruplex | G4, Parallel, Anti-parallel… |
| i-Motif | iM-Canonical, iM-Partial, iM-C-rich |
| Z-DNA | ZH-score, CG-repeat |
| A-philic DNA | A-philic |

---

## Statistics produced

| Table | Columns |
|---|---|
| **Per-file summary** | Sequences, bp, GC%, Motifs, Classes, Subclasses, Density/kb, Coverage% |
| **Class statistics** | Count, Mean Length, Mean Score, **Density/kb**, **Coverage%** |
| **Subclass statistics** | Count, Mean Length, Mean Score, **Density/kb**, **Coverage%** |
| **File × Class pivot** | Motif counts per file per class |
| **Class density pivot** | Density (motifs/kb) per file per class |
| **Class coverage pivot** | Coverage (%) per file per class |

## Plots produced
- Class distribution (per-file + global)
- Subclass distribution (per-file + global)
- Density per class & per subclass
- Coverage per class & per subclass
- Density / coverage comparison across files
- Hybrid & Cluster breakdown
- Sequence-level density & coverage (multi-sequence files)
- Positional distribution (equal-length multiFASTA)


In [None]:
# ═══════════════════════════════════════════════════════════════════════════════
# CELL 1 · SETUP — imports, configuration, helpers
# Edit FASTA_INPUT and OUTPUT_DIR, then run this cell before Cell 2.
# ═══════════════════════════════════════════════════════════════════════════════

import sys, os, importlib, glob, gc, time, datetime, re, warnings
import concurrent.futures
from pathlib import Path
from collections import defaultdict
warnings.filterwarnings('ignore')

# Put the kernel's working directory on sys.path so that project-local
# packages imported further down (e.g. `Utilities`) can be resolved from it.
_REPO_ROOT = os.path.abspath(os.getcwd())
if _REPO_ROOT not in sys.path:
    sys.path.insert(0, _REPO_ROOT)

# ── Auto-install missing packages ─────────────────────────────────────────────
# `import importlib` on its own does not guarantee that the `importlib.util`
# submodule has been loaded, so import it explicitly before calling find_spec.
import importlib.util

# (importable module name, pip requirement specifier) pairs.
_REQUIRED = [
    ('psutil', 'psutil>=5.8'),
    ('pandas', 'pandas>=1.3'),
    ('numpy', 'numpy>=1.21'),
    ('matplotlib', 'matplotlib>=3.5'),
    ('seaborn', 'seaborn>=0.11'),
    ('openpyxl', 'openpyxl>=3.0'),
    ('tqdm', 'tqdm>=4.64'),
]
# Requirement specifiers for every module that cannot currently be located.
_miss = [p for m, p in _REQUIRED if importlib.util.find_spec(m) is None]
if _miss:
    import subprocess
    # Install with the same interpreter that runs this kernel; raises
    # CalledProcessError if pip exits with a non-zero status.
    subprocess.check_call([sys.executable, '-m', 'pip', 'install', *_miss, '-q'])

import pandas as pd
import numpy as np
import matplotlib; matplotlib.use('Agg')
import matplotlib.pyplot as plt
import matplotlib.ticker as mticker
import seaborn as sns
from tqdm.auto import tqdm
from IPython.display import display, HTML, Image
sns.set_theme(style='whitegrid')

# ── USER CONFIGURATION ────────────────────────────────────────────────────────
# Input selection: one path/glob string or a list of them; each entry is
# expanded with glob by `_resolve` further down in this cell.
FASTA_INPUT        = ['*.fna', '*.fasta']   # path, wildcard, or list
# Root folder for reports; every run writes into a UTC-timestamped subfolder of it.
OUTPUT_DIR         = 'notebook_reports'
# Forwarded unchanged to the scanner as its `enabled_classes` argument.
ENABLED_CLASSES    = None                   # None = all; e.g. ['G-Quadruplex','Z-DNA']
# Memory budget in bytes; any falsy value falls back to the resource inspector's budget.
RAM_OVERRIDE_BYTES = None                   # None = auto

# ── GPU detection ─────────────────────────────────────────────────────────────
def _detect_gpu():
    """Probe for a usable GPU library and report what was found.

    torch is tried first, then cupy. Any failure while probing a library
    (missing package, driver error, ...) is treated as "not available" and
    the next candidate is tried.

    Returns:
        ('cuda', <device name>) when torch reports CUDA as available,
        ('cupy', 'CUDA GPU') when a cupy array can be created,
        (None, None) otherwise.
    """
    # Candidate 1: torch with a CUDA device.
    try:
        torch_mod = importlib.import_module('torch')
        if torch_mod.cuda.is_available():
            return 'cuda', torch_mod.cuda.get_device_name(0)
    except Exception:
        pass

    # Candidate 2: cupy; creating a tiny array is the availability check.
    try:
        cupy_mod = importlib.import_module('cupy')
        cupy_mod.array([1])
        return 'cupy', 'CUDA GPU'
    except Exception:
        pass

    return None, None

# Probe once at start-up and report interpreter version plus GPU status on one line.
GPU_BACKEND, GPU_NAME = _detect_gpu()
print(
    f'\u2705 Deps OK | Python {sys.version.split()[0]} | '
    f'GPU: {GPU_BACKEND+"("+GPU_NAME+")" if GPU_BACKEND else "none (CPU)"}'
)

# ── Resolve FASTA files & classify types ─────────────────────────────────────
def _resolve(inp):
    """Expand a path/glob (or an iterable of them) into concrete file paths.

    Each entry is globbed; an entry that matches nothing but names an
    existing file literally is kept as-is. The result is de-duplicated on
    the fully resolved path and returned as a sorted list of strings.
    """
    patterns = [inp] if isinstance(inp, str) else list(inp)
    resolved = set()
    for pattern in patterns:
        matches = glob.glob(pattern)
        if not matches and os.path.isfile(pattern):
            # Literal filename containing glob metacharacters.
            matches = [pattern]
        for match in matches:
            resolved.add(str(Path(match).resolve()))
    return sorted(resolved)

def _seq_lengths(p):
    """Return the residue count of each record in the FASTA file at `p`.

    Lines are stripped before counting. A record is only emitted when it
    accumulated at least one character, so header-only records do not
    appear in the result.
    """
    lengths = []
    running = 0
    with open(p) as handle:
        for raw in handle:
            text = raw.strip()
            if not text.startswith('>'):
                running += len(text)
                continue
            # New header: flush the record collected so far, if any.
            if running:
                lengths.append(running)
                running = 0
    if running:
        lengths.append(running)
    return lengths

# Expand the configured input into concrete FASTA paths; stop early if nothing matched.
FASTA_FILES = _resolve(FASTA_INPUT)
if len(FASTA_FILES) == 0:
    raise FileNotFoundError(f'No FASTA files found for: {FASTA_INPUT}')

# Label every file by its record layout: exactly one record, several records
# that all share one length, or anything else.
FILE_TYPES = {}
for fp in FASTA_FILES:
    ls = _seq_lengths(fp)
    if len(ls) == 1:
        FILE_TYPES[fp] = 'single'
    elif len(set(ls)) == 1:
        FILE_TYPES[fp] = 'multi_equal'
    else:
        FILE_TYPES[fp] = 'multi'

# Pair each FASTA with a same-stem annotation file in the same folder,
# preferring .gff3 over .gff.
GFF_MAP = {}
for fp in FASTA_FILES:
    fasta = Path(fp)
    candidates = (fasta.parent / (fasta.stem + ext) for ext in ('.gff3', '.gff'))
    found = next((cand for cand in candidates if cand.exists()), None)
    if found is not None:
        GFF_MAP[fp] = str(found)

# Echo the resolved inputs with their layout label and any paired GFF.
print(f'\n\U0001f4c2 Input files: {len(FASTA_FILES)}')
for fp in FASTA_FILES:
    gff_tag = ''
    if fp in GFF_MAP:
        gff_tag = f'  +GFF: {Path(GFF_MAP[fp]).name}'
    print(f'   [{FILE_TYPES[fp]:12s}]  {Path(fp).name}{gff_tag}')

# ── Adaptive resource planning ────────────────────────────────────────────────
from Utilities.system_resource_inspector import SystemResourceInspector
from Utilities.adaptive_chunk_planner    import AdaptiveChunkPlanner
from Utilities.nonbscanner               import analyze_sequence as _nbf_analyze
from Utilities.utilities                 import read_fasta_file

# Ask the inspector for the machine's memory budget and CPU count; a truthy
# RAM_OVERRIDE_BYTES takes precedence over the inspected budget.
_insp = SystemResourceInspector()
_budget = RAM_OVERRIDE_BYTES or _insp.get_memory_budget()
_cpus = _insp.get_cpu_count()

# Combined on-disk size of the inputs, floored at 1,000 bytes.
_total = max(1_000, sum(os.path.getsize(f) for f in FASTA_FILES if os.path.exists(f)))

# Let the planner derive chunking and worker settings from size, RAM and CPUs.
_plan = AdaptiveChunkPlanner().plan(_total, _budget, _cpus)
CHUNK_SIZE = _plan['chunk_size']
CHUNK_OVERLAP = _plan['overlap']
N_WORKERS = _plan['workers']
EXEC_MODE = _plan['mode']
if GPU_BACKEND:
    # With a GPU backend present, double the workers but never exceed the CPU count.
    N_WORKERS = min(N_WORKERS * 2, os.cpu_count() or 4)

# Every run writes into its own UTC-timestamped folder under OUTPUT_DIR.
_RUN_TS = datetime.datetime.now(datetime.timezone.utc).strftime('%Y%m%d_%H%M%S')
_BASE = Path(OUTPUT_DIR) / _RUN_TS
_BASE.mkdir(parents=True, exist_ok=True)
print(
    f'\u2699\ufe0f  RAM {_budget/1e9:.2f}GB | chunk={CHUNK_SIZE:,} overlap={CHUNK_OVERLAP:,} '
    f'workers={N_WORKERS} mode={EXEC_MODE}'
)
print(f'\U0001f4c2 Run output: {_BASE}')

# ── Core helpers ──────────────────────────────────────────────────────────────
def _scan(name, seq):
    """Run the motif scanner on one sequence using the notebook-wide settings.

    Chunking is always enabled with the planned chunk size and overlap;
    parallel chunk processing is requested only when the planned execution
    mode is 'hybrid'. Returns whatever the scanner returns for the sequence.
    """
    settings = dict(
        use_chunking=True,
        chunk_size=CHUNK_SIZE,
        chunk_overlap=CHUNK_OVERLAP,
        use_parallel_chunks=(EXEC_MODE == 'hybrid'),
        enabled_classes=ENABLED_CLASSES,
    )
    return _nbf_analyze(sequence=seq, sequence_name=name, **settings)

def _savefig(fig, path, show=True):
    """Write `fig` to `path` at 150 dpi, close it, and optionally display the saved image inline."""
    target = str(path)
    fig.savefig(target, dpi=150, bbox_inches='tight')
    plt.close(fig)
    if not show:
        return
    # Display the file that was just written.
    display(Image(target))

def _safe_fname(s):
    """Return `str(s)` with every character other than word characters and '-' replaced by '_'."""
    text = str(s)
    return re.sub(r'[^\w\-]', '_', text)

def _parse_species_region(stem):
    """Split `stem` at its first underscore into (prefix, remainder).

    When there is no underscore the whole stem is returned as the prefix
    and the remainder is the literal 'unknown'.
    """
    head, sep, tail = stem.partition('_')
    if not sep:
        return stem, 'unknown'
    return head, tail

def _gc_and_length(fasta_path):
    """Return (GC percentage rounded to 2 decimals, total residue count) for a FASTA file.

    Header lines and blank lines are ignored. Counting is case-insensitive,
    and every remaining character contributes to the total. A file with no
    sequence characters yields (0.0, 0).
    """
    gc_count = 0
    n_chars = 0
    with open(fasta_path) as handle:
        for raw in handle:
            text = raw.strip()
            if text and not text.startswith('>'):
                upper = text.upper()
                gc_count += upper.count('G') + upper.count('C')
                n_chars += len(upper)
    if not n_chars:
        return 0.0, n_chars
    return round(gc_count / n_chars * 100, 2), n_chars

def _merge_coverage(intervals, cap=None):
    """Return the number of positions covered by the union of `intervals`.

    Args:
        intervals: sequence of (start, end) pairs; coerced to an integer array.
        cap: optional upper bound applied to every end coordinate first.

    Intervals whose (possibly capped) end is not greater than their start are
    discarded. Overlapping or touching intervals are merged before summing.
    Returns 0 when nothing remains.
    """
    if len(intervals) == 0:
        return 0
    # np.array copies, so the caller's data is never modified by the capping below.
    spans = np.array(intervals, dtype=int)
    if cap is not None:
        spans[:, 1] = np.minimum(spans[:, 1], cap)
    spans = spans[spans[:, 1] > spans[:, 0]]
    if len(spans) == 0:
        return 0
    spans = spans[np.argsort(spans[:, 0])]

    # Sweep left to right, extending the current run or closing it out.
    total = 0
    run_start, run_end = spans[0]
    for lo, hi in spans[1:]:
        if lo > run_end:
            total += run_end - run_start
            run_start, run_end = lo, hi
        elif hi > run_end:
            run_end = hi
    return total + run_end - run_start

def _coverage(df, seq_lengths_dict):
    """Return the percentage (2 decimals) of all bases covered by at least one motif.

    Coverage is merged per sequence name, with interval ends capped at that
    sequence's length (0 for names missing from `seq_lengths_dict`), then
    divided by the summed length of all sequences. Returns 0.0 for an empty
    frame, an empty length map, or a total length of zero.
    """
    if df.empty or not seq_lengths_dict:
        return 0.0
    genome_bp = sum(seq_lengths_dict.values())
    if genome_bp == 0:
        return 0.0
    covered_bp = 0
    for seq_name, hits in df.groupby('Sequence_Name'):
        limit = seq_lengths_dict.get(seq_name, 0)
        covered_bp += _merge_coverage(hits[['Start', 'End']].values, cap=limit)
    return round(covered_bp / genome_bp * 100, 2)

def _group_density_coverage(df, all_results, column):
    """Shared implementation: density and coverage per distinct value of `column`.

    Args:
        df: motif table with `column`, 'Source_File', 'Sequence_Name',
            'Start' and 'End' columns.
        all_results: mapping whose values each provide 'path' (the FASTA path)
            and 'seq_lengths' (sequence name -> length).
        column: name of the column to group by.

    Returns:
        dict: group value -> {'Density_per_kb', 'Coverage_pct'}, both relative
        to the summed length of every sequence in `all_results`. Empty when
        that total is zero or `df` is empty.
    """
    total_bp = sum(sum(r['seq_lengths'].values()) for r in all_results.values())
    if total_bp == 0 or df.empty:
        return {}
    # Resolve each file's display name once instead of once per group.
    files = [(Path(res['path']).name, res['seq_lengths'])
             for res in all_results.values()]
    out = {}
    for key, grp in df.groupby(column):
        cov_bp = 0
        for file_name, seq_lengths in files:
            sub = grp[grp['Source_File'] == file_name]
            if sub.empty:
                continue
            # Merge within each sequence so overlapping motifs are not double counted.
            for sn, sg in sub.groupby('Sequence_Name'):
                cov_bp += _merge_coverage(sg[['Start', 'End']].values,
                                          cap=seq_lengths.get(sn, 0))
        out[key] = {'Density_per_kb': round(len(grp) / total_bp * 1000, 4),
                    'Coverage_pct':   round(cov_bp / total_bp * 100, 3)}
    return out


def _class_density_coverage(df, all_results):
    """Return dict: class -> {Density_per_kb, Coverage_pct} across all files."""
    return _group_density_coverage(df, all_results, 'Class')


def _subclass_density_coverage(df, all_results):
    """Return dict: subclass -> {Density_per_kb, Coverage_pct} across all files."""
    return _group_density_coverage(df, all_results, 'Subclass')

def _parse_gff(gff_path):
    """Read a tab-separated GFF file into a list of feature dicts.

    Comment lines, blank lines and lines with fewer than 8 columns are
    skipped, as are lines whose start/end columns are not integers. The
    start coordinate is shifted down by one and clamped at 0; the end is
    kept as given. The ninth column, when present, is stored under 'attrs'.
    """
    records = []
    with open(gff_path) as handle:
        for raw in handle:
            if raw.startswith('#') or not raw.strip():
                continue
            cols = raw.rstrip('\n').split('\t')
            if len(cols) < 8:
                continue
            try:
                begin = max(int(cols[3]) - 1, 0)
                finish = int(cols[4])
            except ValueError:
                # Non-numeric coordinates: drop the line.
                continue
            records.append({
                'seqid': cols[0],
                'type': cols[2],
                'start': begin,
                'end': finish,
                'strand': cols[6],
                'attrs': cols[8] if len(cols) > 8 else '',
            })
    return records

print('\u2705 Engine Ready')


In [None]:
# ═══════════════════════════════════════════════════════════════════════════════
# CELL 2 · ANALYSIS — detection, all statistics, plots, downloads
# Run Cell 1 first.
# ═══════════════════════════════════════════════════════════════════════════════

# Accumulators filled by the per-file loop below and consumed by the
# master-table and global-plot sections later in this cell.
# NOTE(review): both dicts (and each file's output folder) are keyed by the
# file stem, so two inputs sharing a stem (e.g. x.fna and x.fasta, both of
# which the default FASTA_INPUT patterns can match) would overwrite each
# other — confirm inputs have unique stems.
RESULTS_BY_FILE = {}   # stem -> {df, folder, file_type, path, seq_lengths}
GFF_RESULTS     = {}   # stem -> {region_df, gff_path, folder}

# ─────────────────────────────────────────────────────────────────────────────
# A. PER-FILE DETECTION
# For every FASTA file: scan all sequences, build the motif table, save it,
# draw the per-file plots, and (when a GFF is paired) re-scan each annotated
# region.
# ─────────────────────────────────────────────────────────────────────────────
for fasta_path in tqdm(FASTA_FILES, desc='Files', unit='file'):
    stem    = Path(fasta_path).stem
    ftype   = FILE_TYPES[fasta_path]
    fdir    = _BASE / stem
    fdir.mkdir(parents=True, exist_ok=True)
    tqdm.write(f'\n\u2500\u2500 {stem}  [{ftype}] \u2500\u2500')

    seqs = read_fasta_file(fasta_path)
    if not seqs:
        # Nothing to scan: the file gets no entry in RESULTS_BY_FILE.
        tqdm.write('  \u26a0\ufe0f  No sequences — skipping.')
        continue

    sl_map = {sn: len(sq) for sn, sq in seqs.items()}  # seq_name -> length

    # Parallel motif scanning: one task per sequence, results collected in
    # completion order. An exception raised by any scan propagates out of
    # fut.result() and aborts the run.
    motifs_file, t0 = [], time.perf_counter()
    with concurrent.futures.ThreadPoolExecutor(max_workers=N_WORKERS) as pool:
        futs = {pool.submit(_scan, sn, sq): sn for sn, sq in seqs.items()}
        for fut in tqdm(concurrent.futures.as_completed(futs),
                        total=len(futs), desc=f'  seqs({stem})', leave=False):
            sn  = futs[fut]; res = fut.result()
            tqdm.write(f'  \u25b8 {sn[:55]}  \u2192 {len(res):,} motifs')
            motifs_file.extend(res)
    tqdm.write(f'  \u2705 {len(motifs_file):,} motifs in {time.perf_counter()-t0:.1f}s')
    gc.collect()

    # Build DataFrame; guarantee the columns used downstream exist even when
    # the scanner omitted them or returned nothing.
    df = pd.DataFrame(motifs_file) if motifs_file else pd.DataFrame()
    for col, dflt in [('Class','Unknown'),('Subclass','Other'),('Start',0),
                      ('End',0),('Length',0),('Score',0.0),('Strand','+'),('Sequence_Name','')]:
        if col not in df.columns: df[col] = dflt
    if not df.empty:
        # Back-fill a zero Length from End-Start (never negative).
        df['Length'] = np.where(df['Length']==0,
                                (df['End']-df['Start']).clip(lower=0), df['Length'])
    df['Source_File'] = Path(fasta_path).name
    df['File_Type']   = ftype

    # Save CSV + Excel
    if not df.empty:
        df.to_csv(str(fdir/'motifs.csv'), encoding='utf-8-sig', index=False)
        df.to_excel(str(fdir/'motifs.xlsx'), index=False)

    # ── Per-file inline plots ─────────────────────────────────────────────────
    if not df.empty:
        # Denominator for every density/coverage figure of this file; floored
        # at 1 to avoid division by zero.
        total_bp = max(sum(sl_map.values()), 1)

        # 1. Class distribution
        cc = df['Class'].value_counts()
        fig, ax = plt.subplots(figsize=(8, max(3, len(cc)*0.45)))
        # Reversed so the most frequent class ends up at the top of the chart.
        ax.barh(cc.index[::-1], cc.values[::-1], color='steelblue')
        ax.set_xlabel('Motif Count')
        ax.set_title(f'{stem} [{ftype}] — Class Distribution')
        for i,v in enumerate(cc.values[::-1]): ax.text(v+0.3, i, str(v), va='center', fontsize=8)
        plt.tight_layout(); _savefig(fig, fdir/'class_distribution.png')

        # 2. Subclass distribution (top 30)
        sc = df['Subclass'].value_counts().head(30)
        fig, ax = plt.subplots(figsize=(8, max(3, len(sc)*0.4)))
        ax.barh(sc.index[::-1], sc.values[::-1], color='darkorange')
        ax.set_xlabel('Motif Count')
        ax.set_title(f'{stem} — Subclass Distribution (top 30)')
        plt.tight_layout(); _savefig(fig, fdir/'subclass_distribution.png')

        # 3. Class density (motifs/kb) = motif count / total_bp * 1000
        cls_dens = df.groupby('Class').apply(
            lambda g: round(len(g)/total_bp*1000, 4)).sort_values(ascending=False)
        fig, ax = plt.subplots(figsize=(8, max(3, len(cls_dens)*0.45)))
        ax.barh(cls_dens.index[::-1], cls_dens.values[::-1], color='teal')
        ax.set_xlabel('Motifs per kb'); ax.set_title(f'{stem} — Class Density (motifs/kb)')
        for i,v in enumerate(cls_dens.values[::-1]): ax.text(v, i, f'{v:.4f}', va='center', fontsize=8)
        plt.tight_layout(); _savefig(fig, fdir/'class_density.png')

        # 4. Class coverage (%): merged per sequence, summed, divided by total_bp
        cls_cov = {}
        for cls, grp in df.groupby('Class'):
            cov = sum(_merge_coverage(sg[['Start','End']].values, cap=sl_map.get(sn,0))
                      for sn, sg in grp.groupby('Sequence_Name'))
            cls_cov[cls] = round(cov/total_bp*100, 3)
        _cov_s = pd.Series(cls_cov).sort_values(ascending=False)
        fig, ax = plt.subplots(figsize=(8, max(3, len(_cov_s)*0.45)))
        ax.barh(_cov_s.index[::-1], _cov_s.values[::-1], color='mediumseagreen')
        ax.set_xlabel('Coverage (%)'); ax.set_title(f'{stem} — Class Coverage (%)')
        # Leave headroom for the value labels, but never extend past 100 %.
        ax.set_xlim(0, min(100, _cov_s.max()*1.15+0.5))
        for i,v in enumerate(_cov_s.values[::-1]): ax.text(v, i, f'{v:.3f}%', va='center', fontsize=8)
        plt.tight_layout(); _savefig(fig, fdir/'class_coverage.png')

        # 5. Subclass density (top 20)
        sc_dens = df.groupby('Subclass').apply(
            lambda g: round(len(g)/total_bp*1000, 4)).nlargest(20)
        fig, ax = plt.subplots(figsize=(8, max(3, len(sc_dens)*0.4)))
        ax.barh(sc_dens.index[::-1], sc_dens.values[::-1], color='coral')
        ax.set_xlabel('Motifs per kb'); ax.set_title(f'{stem} — Subclass Density (top 20, motifs/kb)')
        plt.tight_layout(); _savefig(fig, fdir/'subclass_density.png')

        # 6. Subclass coverage (top 20 by coverage)
        sc_cov = {}
        for sc_name, grp in df.groupby('Subclass'):
            cov = sum(_merge_coverage(sg[['Start','End']].values, cap=sl_map.get(sn,0))
                      for sn, sg in grp.groupby('Sequence_Name'))
            sc_cov[sc_name] = round(cov/total_bp*100, 3)
        _scov_s = pd.Series(sc_cov).nlargest(20)
        fig, ax = plt.subplots(figsize=(8, max(3, len(_scov_s)*0.4)))
        ax.barh(_scov_s.index[::-1], _scov_s.values[::-1], color='orchid')
        ax.set_xlabel('Coverage (%)'); ax.set_title(f'{stem} — Subclass Coverage (top 20, %)')
        plt.tight_layout(); _savefig(fig, fdir/'subclass_coverage.png')

        # 7. Hybrid & Cluster breakdown (only drawn when such motifs exist)
        _special = df[df['Class'].isin(['Hybrid','Non-B_DNA_Clusters'])]
        if not _special.empty:
            sp_cnt = _special['Class'].value_counts()
            fig, ax = plt.subplots(figsize=(6,3))
            # NOTE(review): colours follow count order, not class identity, so
            # the same class may be drawn in different colours across files.
            ax.bar(sp_cnt.index, sp_cnt.values, color=['tomato','mediumpurple'])
            ax.set_ylabel('Count'); ax.set_title(f'{stem} — Hybrid & Cluster Motifs')
            for i,v in enumerate(sp_cnt.values): ax.text(i, v+0.2, str(v), ha='center', fontsize=9)
            plt.tight_layout(); _savefig(fig, fdir/'hybrid_cluster_breakdown.png')

        # 8. Sequence-level density & coverage (multi-seq)
        if ftype in ('multi','multi_equal'):
            _rows_d, _rows_c = [], []
            for sn, sq_len in sl_map.items():
                sub = df[df['Sequence_Name']==sn]
                n   = len(sub)
                cov = (_merge_coverage(sub[['Start','End']].values, cap=sq_len)
                       if not sub.empty else 0)
                # Labels are truncated to 40 characters for the plot axis.
                _rows_d.append({'Sequence':sn[:40],'Density_per_kb':round(n/sq_len*1000,4) if sq_len else 0})
                _rows_c.append({'Sequence':sn[:40],'Coverage_pct':round(cov/sq_len*100,2) if sq_len else 0})

            _dd = pd.DataFrame(_rows_d).sort_values('Density_per_kb',ascending=False).head(40)
            fig, ax = plt.subplots(figsize=(9, max(4, len(_dd)*0.35)))
            ax.barh(_dd['Sequence'][::-1], _dd['Density_per_kb'][::-1], color='teal')
            ax.set_xlabel('Motifs per kb'); ax.set_title(f'{stem} — Motif Density by Sequence (top 40)')
            plt.tight_layout(); _savefig(fig, fdir/'motif_density_by_sequence.png')

            _cd = pd.DataFrame(_rows_c).sort_values('Coverage_pct',ascending=False).head(40)
            fig, ax = plt.subplots(figsize=(9, max(4, len(_cd)*0.35)))
            ax.barh(_cd['Sequence'][::-1], _cd['Coverage_pct'][::-1], color='mediumseagreen')
            ax.set_xlabel('Coverage (%)'); ax.set_title(f'{stem} — Non-B DNA Coverage by Sequence (top 40)')
            ax.set_xlim(0,100); plt.tight_layout(); _savefig(fig, fdir/'sequence_coverage.png')

        # 9. Positional distribution (equal-length multiFASTA): histogram of
        #    motif start positions pooled over all sequences, one figure per
        #    class. These figures are saved but not displayed inline.
        if ftype == 'multi_equal':
            seq_len_val = list(sl_map.values())[0]
            for cls in df['Class'].unique():
                starts = df[df['Class']==cls]['Start'].dropna().astype(int)
                starts = starts[starts < seq_len_val]
                if starts.empty: continue
                fig, ax = plt.subplots(figsize=(10,3))
                ax.hist(starts, bins=min(100,seq_len_val), color='steelblue', alpha=0.8, edgecolor='none')
                ax.set_xlabel('Position (bp)'); ax.set_ylabel('Frequency')
                ax.set_title(f'{stem} — {cls} Positional Distribution (n={len(starts):,})')
                ax.xaxis.set_major_formatter(mticker.FuncFormatter(lambda x,_: f'{int(x):,}'))
                plt.tight_layout(); _savefig(fig, fdir/f'positional_dist_{_safe_fname(cls)}.png', show=False)

        tqdm.write(f'  \U0001f4ca Plots saved: {fdir}')

    # Recorded even when df is empty so that the file's length still counts
    # towards the global totals.
    RESULTS_BY_FILE[stem] = {
        'df':df, 'folder':fdir, 'file_type':ftype,
        'path':fasta_path, 'seq_lengths':sl_map
    }

    # GFF region analysis: slice every annotated feature out of its sequence
    # and scan the slice on its own, grouped by feature type.
    if fasta_path in GFF_MAP:
        gff_path = GFF_MAP[fasta_path]
        tqdm.write(f'  \U0001f4cb GFF: {Path(gff_path).name}')
        features = _parse_gff(gff_path)
        gff_dir  = fdir/'gff_regions'; gff_dir.mkdir(exist_ok=True)
        region_rows = []
        for ftype_gff in tqdm(sorted({f['type'] for f in features}),
                              desc=f'  GFF({stem})', leave=False):
            type_feats  = [f for f in features if f['type']==ftype_gff]
            type_motifs = []
            for feat in type_feats:
                sid = feat['seqid']
                # Features whose seqid is not a key of the FASTA records are ignored.
                if sid not in seqs: continue
                rseq = seqs[sid][feat['start']:feat['end']]
                # Regions shorter than 12 residues are not scanned.
                if len(rseq) < 12: continue
                rname = f"{sid}:{ftype_gff}:{feat['start']}-{feat['end']}({feat['strand']})"
                # NOTE(review): motifs come from scanning the sliced region, so
                # their Start/End are presumably region-relative — confirm.
                for m in _scan(rname, rseq):
                    m.update({'GFF_Type':ftype_gff,'GFF_SeqID':sid,
                               'GFF_Start':feat['start'],'GFF_End':feat['end'],
                               'GFF_Strand':feat['strand'],
                               'GFF_Attrs':feat['attrs'][:80]})
                    type_motifs.append(m)
            region_rows.extend(type_motifs)
            gc.collect()
        gff_df = pd.DataFrame(region_rows) if region_rows else pd.DataFrame()
        # Same column guarantee as for the main motif table.
        for col, dflt in [('Class','Unknown'),('Subclass','Other'),('Start',0),
                          ('End',0),('Length',0),('Score',0.0),
                          ('GFF_Type',''),('GFF_SeqID',''),('GFF_Start',0),
                          ('GFF_End',0),('GFF_Strand','+'),('GFF_Attrs','')]:
            if col not in gff_df.columns: gff_df[col] = dflt
        if not gff_df.empty:
            gff_df.to_csv(str(gff_dir/'gff_region_motifs.csv'), encoding='utf-8-sig', index=False)
            # Grouped bars: one group per feature type, one bar per motif class.
            pivot = gff_df.groupby(['GFF_Type','Class']).size().unstack(fill_value=0)
            fig, ax = plt.subplots(figsize=(max(8,len(pivot)*1.4), 5))
            pivot.plot(kind='bar', ax=ax, colormap='tab20', width=0.8)
            ax.set_xlabel('GFF Feature Type'); ax.set_ylabel('Motif Count')
            ax.set_title(f'{stem} — Motifs per GFF Feature Type')
            ax.legend(title='Class', bbox_to_anchor=(1,1)); plt.tight_layout()
            _savefig(fig, gff_dir/'gff_motifs_by_type.png', show=False)
        GFF_RESULTS[stem] = {'region_df':gff_df,'gff_path':gff_path,'folder':gff_dir}
        tqdm.write(f'  \u2705 GFF: {len(gff_df):,} region motifs')

print(f'\n\u2705 Detection complete — {len(RESULTS_BY_FILE)} file(s) '
      f'({len(GFF_RESULTS)} with GFF)')

# ─────────────────────────────────────────────────────────────────────────────
# B. MASTER TABLES & GLOBAL STATISTICS
# ─────────────────────────────────────────────────────────────────────────────
# Stack the non-empty per-file motif tables into one master frame.
_dfs = [res['df'] for res in RESULTS_BY_FILE.values() if not res['df'].empty]
if _dfs:
    _master_df = pd.concat(_dfs, ignore_index=True)
else:
    _master_df = pd.DataFrame()

# Folder that receives every cross-file table and plot.
_master_dir = _BASE / '_master'
_master_dir.mkdir(exist_ok=True)

# Same stacking for the GFF region scans.
_gdfs = [entry['region_df'] for entry in GFF_RESULTS.values() if not entry['region_df'].empty]
if _gdfs:
    _gff_df = pd.concat(_gdfs, ignore_index=True)
else:
    _gff_df = pd.DataFrame()

# Table name -> DataFrame; filled by the statements that follow.
_tables = {}

# Build the cross-file summary tables. Skipped entirely when no file produced
# any motif.
if not _master_df.empty:
    # NOTE(review): total_bp_all is not referenced again in the visible part
    # of this cell — confirm whether it is still needed.
    total_bp_all = max(sum(sum(r['seq_lengths'].values())
                           for r in RESULTS_BY_FILE.values()), 1)

    # Table 1: Global class distribution (file × class counts)
    _tables['1_global_class_distribution'] = (
        _master_df.groupby(['Source_File','File_Type','Class'])
        .size().reset_index(name='Count'))

    # Table 2: Per-file summary (one row per scanned file, including files
    # that yielded no motifs)
    _pf_rows = []
    for stem, res in RESULTS_BY_FILE.items():
        df, fp, ftype_v, sl = res['df'], res['path'], res['file_type'], res['seq_lengths']
        # Total_bp and Density_per_kb use the length counted straight from the
        # file, while Coverage_pct uses the per-sequence lengths in `sl`.
        gc_pct, seq_len = _gc_and_length(fp)
        n = len(df)
        _pf_rows.append({
            'File':           Path(fp).name,
            'File_Type':      ftype_v,
            'Sequences':      len(sl),
            'Total_bp':       seq_len,
            'GC_Percent':     gc_pct,
            'Total_Motifs':   n,
            'Classes':        df['Class'].nunique()    if not df.empty else 0,
            'Subclasses':     df['Subclass'].nunique() if not df.empty else 0,
            'Hybrids':        int((df['Class']=='Hybrid').sum())             if not df.empty else 0,
            'Clusters':       int((df['Class']=='Non-B_DNA_Clusters').sum()) if not df.empty else 0,
            'Density_per_kb': round(n/seq_len*1000,4) if seq_len else 0.0,
            'Coverage_pct':   _coverage(df, sl),
        })
    _tables['2_per_file_summary'] = pd.DataFrame(_pf_rows)

    # Table 3: Class statistics WITH density and coverage
    # Count / mean length / mean score come from the groupby; density and
    # coverage are looked up per class from the helper's result (0 if absent).
    _cls_dc = _class_density_coverage(_master_df, RESULTS_BY_FILE)
    _tables['3_class_statistics'] = (
        _master_df.groupby('Class')
        .agg(Total_Count=('Class','count'),
             Mean_Length=('Length','mean'),
             Mean_Score=('Score','mean'))
        .round(3).reset_index()
        .assign(Density_per_kb=lambda d: d['Class'].map(
                    lambda c: _cls_dc.get(c,{}).get('Density_per_kb',0)),
                Coverage_pct=lambda d: d['Class'].map(
                    lambda c: _cls_dc.get(c,{}).get('Coverage_pct',0)))
        .sort_values('Total_Count', ascending=False)
        [['Class','Total_Count','Mean_Length','Mean_Score','Density_per_kb','Coverage_pct']]
    )

    # Table 4: File × Class pivot (counts)
    _tables['4_file_class_pivot'] = (
        _master_df.groupby(['Source_File','Class'])
        .size().unstack(fill_value=0).reset_index())

    # Table 5: Subclass statistics WITH density and coverage
    # Same construction as Table 3, grouped by Subclass instead of Class.
    _sc_dc = _subclass_density_coverage(_master_df, RESULTS_BY_FILE)
    _tables['5_subclass_statistics'] = (
        _master_df.groupby('Subclass')
        .agg(Total_Count=('Subclass','count'),
             Mean_Length=('Length','mean'),
             Mean_Score=('Score','mean'))
        .round(3).reset_index()
        .assign(Density_per_kb=lambda d: d['Subclass'].map(
                    lambda s: _sc_dc.get(s,{}).get('Density_per_kb',0)),
                Coverage_pct=lambda d: d['Subclass'].map(
                    lambda s: _sc_dc.get(s,{}).get('Coverage_pct',0)))
        .sort_values('Total_Count', ascending=False)
        [['Subclass','Total_Count','Mean_Length','Mean_Score','Density_per_kb','Coverage_pct']]
    )

    # Table 6: Class density pivot (density per file per class)
    # Density is relative to the owning file's total length, not the global total.
    _dens_rows_t = []
    for (fname, cls), grp in _master_df.groupby(['Source_File','Class']):
        # Results are keyed by stem, so map the file name back to its stem.
        stem = Path(fname).stem
        res  = RESULTS_BY_FILE.get(stem, {})
        # Falls back to a length of 1 when the file has no recorded entry.
        slen = max(sum(res.get('seq_lengths',{1:1}).values()),1)
        _dens_rows_t.append({'Source_File':fname,'Class':cls,
                              'Density_per_kb':round(len(grp)/slen*1000,4)})
    _tables['6_class_density_pivot'] = (
        pd.DataFrame(_dens_rows_t)
        .pivot_table(index='Source_File',columns='Class',
                     values='Density_per_kb',fill_value=0).reset_index())

    # Table 7: Class coverage pivot (coverage % per file per class)
    _cov_rows_t = []
    for fname in _master_df['Source_File'].unique():
        stem = Path(fname).stem
        res  = RESULTS_BY_FILE.get(stem)
        if not res: continue
        sub_file = _master_df[_master_df['Source_File']==fname]
        slen = max(sum(res['seq_lengths'].values()),1)
        for cls, grp in sub_file.groupby('Class'):
            # Merge overlapping motifs within each sequence before summing.
            cov = sum(_merge_coverage(sg[['Start','End']].values, cap=res['seq_lengths'].get(sn,0))
                      for sn, sg in grp.groupby('Sequence_Name'))
            _cov_rows_t.append({'Source_File':fname,'Class':cls,
                                 'Coverage_pct':round(cov/slen*100,3)})
    _tables['7_class_coverage_pivot'] = (
        pd.DataFrame(_cov_rows_t)
        .pivot_table(index='Source_File',columns='Class',
                     values='Coverage_pct',fill_value=0).reset_index())

    # Equal-length positional table: how often each start position occurs per
    # file and class, most frequent first. Only built from 'multi_equal' files.
    _eq_dfs = [r['df'] for r in RESULTS_BY_FILE.values()
               if r['file_type']=='multi_equal' and not r['df'].empty]
    if _eq_dfs:
        _eq = pd.concat(_eq_dfs, ignore_index=True)
        _tables['8_equal_length_positional'] = (
            _eq.groupby(['Source_File','Class','Start'])
            .size().reset_index(name='Frequency')
            .sort_values(['Source_File','Class','Frequency'], ascending=[True,True,False]))

# GFF-derived tables; only built when at least one region scan produced motifs.
if not _gff_df.empty:
    # Motif count for every (feature type, class) pair, largest first.
    _gff_counts = _gff_df.groupby(['GFF_Type', 'Class']).size().reset_index(name='Count')
    _tables['9_gff_motifs_per_feature'] = _gff_counts.sort_values('Count', ascending=False)

    # Per feature type: motif total, number of distinct classes and mean
    # region length (region length is floored at 1), rounded to 2 decimals.
    _gff_region_len = (_gff_df['GFF_End'] - _gff_df['GFF_Start']).clip(lower=1)
    _gff_by_type = (
        _gff_df.assign(Region_Len=_gff_region_len)
        .groupby('GFF_Type')
        .agg(
            Total_Motifs=('Class', 'count'),
            Unique_Classes=('Class', 'nunique'),
            Mean_Region_Len=('Region_Len', 'mean'),
        )
    )
    _tables['10_gff_density_per_feature'] = (
        _gff_by_type.round(2).reset_index().sort_values('Total_Motifs', ascending=False)
    )

# Write the master motif table (CSV and Excel), the combined GFF region table
# (CSV), and then every summary table as its own CSV in the master folder.
if not _master_df.empty:
    _master_df.to_csv(str(_master_dir / 'master_motifs.csv'), index=False, encoding='utf-8-sig')
    _master_df.to_excel(str(_master_dir / 'master_motifs.xlsx'), index=False)

if not _gff_df.empty:
    _gff_df.to_csv(str(_master_dir / 'gff_region_motifs_all.csv'), index=False, encoding='utf-8-sig')

for tname in _tables:
    _tables[tname].to_csv(str(_master_dir / f'{tname}.csv'), index=False, encoding='utf-8-sig')

# ─────────────────────────────────────────────────────────────────────────────
# C. GLOBAL SUMMARY PLOTS (all inline)
# ─────────────────────────────────────────────────────────────────────────────
if not _master_df.empty:
    _pf_summary = _tables.get('2_per_file_summary', pd.DataFrame())
    _n_files    = len(RESULTS_BY_FILE)

    # (a) Global class distribution
    # value_counts() sorts descending; every series below is reversed before
    # plotting so that the largest entry ends up as the top bar.
    cc = _master_df['Class'].value_counts()
    fig, ax = plt.subplots(figsize=(8, max(3, len(cc)*0.45)))
    ax.barh(cc.index[::-1], cc.values[::-1], color='steelblue')
    ax.set_title('Global Class Distribution')
    ax.set_xlabel('Count')
    # Print the raw count slightly to the right of each bar end.
    for i, v in enumerate(cc.values[::-1]):
        ax.text(v+0.3, i, str(v), va='center', fontsize=8)
    plt.tight_layout()
    _savefig(fig, _master_dir/'global_class_distribution.png')

    # (b) Global class density (motifs/kb)
    cls_stat = _tables['3_class_statistics'].set_index('Class')['Density_per_kb']
    fig, ax = plt.subplots(figsize=(8, max(3, len(cls_stat)*0.45)))
    ax.barh(cls_stat.index[::-1], cls_stat.values[::-1], color='teal')
    ax.set_title('Global Class Density (motifs/kb)')
    ax.set_xlabel('Motifs per kb')
    # Label each bar with its density, four decimals.
    for i, v in enumerate(cls_stat.values[::-1]):
        ax.text(v, i, f'{v:.4f}', va='center', fontsize=8)
    plt.tight_layout()
    _savefig(fig, _master_dir/'global_class_density.png')

    # (c) Global class coverage (%)
    cls_cov_s = _tables['3_class_statistics'].set_index('Class')['Coverage_pct']
    fig, ax = plt.subplots(figsize=(8, max(3, len(cls_cov_s)*0.45)))
    ax.barh(cls_cov_s.index[::-1], cls_cov_s.values[::-1], color='mediumseagreen')
    ax.set_title('Global Class Coverage (%)')
    ax.set_xlabel('Coverage (%)')
    # Label each bar with its coverage percentage, three decimals.
    for i, v in enumerate(cls_cov_s.values[::-1]):
        ax.text(v, i, f'{v:.3f}%', va='center', fontsize=8)
    plt.tight_layout()
    _savefig(fig, _master_dir/'global_class_coverage.png')

    # (d) Global subclass distribution (top 30)
    sc_all = _master_df['Subclass'].value_counts().head(30)
    fig, ax = plt.subplots(figsize=(8, max(4, len(sc_all)*0.4)))
    ax.barh(sc_all.index[::-1], sc_all.values[::-1], color='darkorange')
    ax.set_title('Global Subclass Distribution (top 30)')
    ax.set_xlabel('Count')
    plt.tight_layout()
    _savefig(fig, _master_dir/'global_subclass_distribution.png')

    # (e) Global subclass density (top 20)
    sc_dens_all = (
        _tables['5_subclass_statistics']
        .set_index('Subclass')['Density_per_kb']
        .nlargest(20)
    )
    fig, ax = plt.subplots(figsize=(8, max(3, len(sc_dens_all)*0.4)))
    ax.barh(sc_dens_all.index[::-1], sc_dens_all.values[::-1], color='coral')
    ax.set_title('Global Subclass Density (top 20, motifs/kb)')
    ax.set_xlabel('Motifs per kb')
    plt.tight_layout()
    _savefig(fig, _master_dir/'global_subclass_density.png')

    # (f) Global subclass coverage (top 20)
    sc_cov_all = (
        _tables['5_subclass_statistics']
        .set_index('Subclass')['Coverage_pct']
        .nlargest(20)
    )
    fig, ax = plt.subplots(figsize=(8, max(3, len(sc_cov_all)*0.4)))
    ax.barh(sc_cov_all.index[::-1], sc_cov_all.values[::-1], color='orchid')
    ax.set_title('Global Subclass Coverage (top 20, %)')
    ax.set_xlabel('Coverage (%)')
    plt.tight_layout()
    _savefig(fig, _master_dir/'global_subclass_coverage.png')

    # (g) File-level density & coverage comparison
    # One bar chart per metric, files ordered from the highest value down.
    if not _pf_summary.empty:
        for col, label, color, title in (
            ('Density_per_kb', 'Motifs per kb', 'steelblue',
             'Motif Density Comparison Across Files'),
            ('Coverage_pct', 'Coverage (%)', 'mediumseagreen',
             'Non-B DNA Coverage Comparison Across Files'),
        ):
            _pfs = _pf_summary.sort_values(col, ascending=False)
            fig, ax = plt.subplots(figsize=(max(6, len(_pfs)*1.4), 4))
            # Tick labels are the file stems, cut to 25 characters.
            bars = ax.bar(
                _pfs['File'].apply(lambda f: Path(f).stem[:25]),
                _pfs[col],
                color=color,
            )
            # Densities get three decimals, coverage one decimal plus a % sign.
            if col == 'Density_per_kb':
                ax.bar_label(bars, fmt='%.3f', padding=2, fontsize=8)
            else:
                ax.bar_label(bars, fmt='%.1f%%', padding=2, fontsize=8)
            ax.set_title(title)
            ax.set_ylabel(label)
            if col == 'Coverage_pct':
                # Coverage is a percentage, so pin the axis to the full 0-100 range.
                ax.set_ylim(0, 100)
            plt.xticks(rotation=30, ha='right')
            plt.tight_layout()
            _savefig(fig, _master_dir/f'{"density" if col=="Density_per_kb" else "coverage"}_comparison.png')

    # (h) Class density heatmap (files × classes)
    # Drawn only when the pivot table exists and has content; rows are files.
    if '6_class_density_pivot' in _tables and not _tables['6_class_density_pivot'].empty:
        _dens_piv = _tables['6_class_density_pivot'].set_index('Source_File')
        if not _dens_piv.empty:
            # Figure grows with the number of class columns and file rows.
            fig, ax = plt.subplots(
                figsize=(
                    max(10, len(_dens_piv.columns)*1.2),
                    max(4,  len(_dens_piv)*0.7),
                )
            )
            sns.heatmap(
                _dens_piv,
                ax=ax,
                cmap='YlOrRd',
                annot=True,
                fmt='.3f',
                linewidths=0.4,
                cbar_kws={'label':'Motifs per kb'},
            )
            ax.set_xlabel('Non-B Class')
            ax.set_ylabel('File')
            ax.set_title('Class Density Heatmap (motifs/kb) — Files × Classes')
            plt.tight_layout()
            _savefig(fig, _master_dir/'class_density_heatmap.png')

    # (i) Class coverage heatmap (files × classes)
    # Same layout as the density heatmap, fed from the coverage pivot.
    if '7_class_coverage_pivot' in _tables and not _tables['7_class_coverage_pivot'].empty:
        _cov_piv = _tables['7_class_coverage_pivot'].set_index('Source_File')
        if not _cov_piv.empty:
            fig, ax = plt.subplots(
                figsize=(
                    max(10, len(_cov_piv.columns)*1.2),
                    max(4,  len(_cov_piv)*0.7),
                )
            )
            sns.heatmap(
                _cov_piv,
                ax=ax,
                cmap='Blues',
                annot=True,
                fmt='.3f',
                linewidths=0.4,
                cbar_kws={'label':'Coverage %'},
            )
            ax.set_xlabel('Non-B Class')
            ax.set_ylabel('File')
            ax.set_title('Class Coverage Heatmap (%) — Files × Classes')
            plt.tight_layout()
            _savefig(fig, _master_dir/'class_coverage_heatmap.png')

    # (j) Hybrid & Cluster comparison
    # Grouped bars per file; skipped when the summary is empty or when both
    # the Hybrids and the Clusters columns sum to zero.
    if not _pf_summary.empty and any(
        _pf_summary[c].sum() > 0 for c in ('Hybrids', 'Clusters')
    ):
        w = 0.35
        x = np.arange(len(_pf_summary))
        # Tick labels are the file stems, cut to 20 characters.
        labels = _pf_summary['File'].apply(lambda f: Path(f).stem[:20])
        fig, ax = plt.subplots(figsize=(max(7, len(_pf_summary)*1.5), 4))
        # The two series sit side by side, half a bar width either side of x.
        ax.bar(x-w/2, _pf_summary['Hybrids'],  w, label='Hybrids',  color='tomato')
        ax.bar(x+w/2, _pf_summary['Clusters'], w, label='Clusters', color='mediumpurple')
        ax.set_xticks(x)
        ax.set_xticklabels(labels, rotation=30, ha='right')
        ax.set_title('Hybrid & Cluster Motifs Across Files')
        ax.set_ylabel('Count')
        ax.legend()
        plt.tight_layout()
        _savefig(fig, _master_dir/'hybrid_cluster_comparison.png')

    # (k) GFF heatmap
    # Motif counts cross-tabulated by GFF feature type (rows) and class (columns).
    if not _gff_df.empty and '9_gff_motifs_per_feature' in _tables:
        _piv = (
            _gff_df
            .groupby(['GFF_Type','Class'])
            .size()
            .unstack(fill_value=0)
        )
        fig2, ax2 = plt.subplots(
            figsize=(max(10,len(_piv.columns)*1.2), max(4,len(_piv)*0.6))
        )
        sns.heatmap(
            _piv,
            ax=ax2,
            cmap='YlOrRd',
            annot=True,
            fmt='d',
            linewidths=0.4,
            cbar_kws={'label':'Motif count'},
        )
        ax2.set_xlabel('Non-B Class')
        ax2.set_ylabel('GFF Feature Type')
        ax2.set_title('GFF Feature Type × Non-B Class Heatmap')
        plt.tight_layout()
        _savefig(fig2, _master_dir/'gff_class_heatmap.png')

# Display all statistics tables
# A banner first, then every entry of _tables under its own ruled heading.
print('\n' + '='*70, 'ALL STATISTICS TABLES', '='*70, sep='\n')
for tname, tdf in _tables.items():
    # Heading text is the table key with underscores as spaces, upper-cased.
    print(
        f"\n{'─'*60}",
        f"  {tname.replace('_',' ').upper()}",
        f"{'─'*60}",
        sep='\n',
    )
    display(tdf)

# ─────────────────────────────────────────────────────────────────────────────
# D. COMPARATIVE ANALYSIS
# ─────────────────────────────────────────────────────────────────────────────
# Column order of the per-file comparison table. It is spelled out so that the
# table keeps its schema (and the 'Species' / 'Region' lookups below keep
# working) even when RESULTS_BY_FILE is empty and no row is produced.
_comp_cols = [
    'Stem', 'Species', 'Region',
    'Total_Motifs', 'Seq_Length_bp', 'Density_per_kb',
    'Coverage_pct', 'GC_Percent',
    'Mean_Motif_Length', 'Median_Motif_Length',
    'Unique_Classes', 'Unique_Subclasses',
    'Hybrids', 'Clusters',
]

# Build one summary row per analysed file.
_comp_rows = []
for stem, res in RESULTS_BY_FILE.items():
    # Species and region labels are derived from the file stem by the helper.
    species, region = _parse_species_region(stem)
    df, fp, sl = res['df'], res['path'], res['seq_lengths']
    gc_pct, seq_len = _gc_and_length(fp)
    n = len(df)
    has_motifs = not df.empty
    _comp_rows.append({
        'Stem': stem, 'Species': species, 'Region': region,
        'Total_Motifs': n, 'Seq_Length_bp': seq_len,
        # Motifs per 1000 bp; a zero/missing length yields 0.0 instead of dividing.
        'Density_per_kb': round(n/seq_len*1000, 4) if seq_len else 0.0,
        # NOTE(review): _coverage is defined outside this section; presumably it
        # returns a percentage (the plots below pin this column to 0-100) — confirm.
        'Coverage_pct': _coverage(df, sl), 'GC_Percent': gc_pct,
        # Column-based statistics are only computed when the frame has rows.
        'Mean_Motif_Length':   round(df['Length'].mean(), 2)   if has_motifs else 0.0,
        'Median_Motif_Length': round(df['Length'].median(), 2) if has_motifs else 0.0,
        'Unique_Classes':    df['Class'].nunique()    if has_motifs else 0,
        'Unique_Subclasses': df['Subclass'].nunique() if has_motifs else 0,
        'Hybrids':  int((df['Class']=='Hybrid').sum())             if has_motifs else 0,
        'Clusters': int((df['Class']=='Non-B_DNA_Clusters').sum()) if has_motifs else 0,
    })

# Passing columns= gives an empty-but-well-formed frame when there are no rows;
# without it the 'Species' lookup below raises KeyError on an empty run.
_comp_df      = pd.DataFrame(_comp_rows, columns=_comp_cols)
_species_list = sorted(_comp_df['Species'].unique())
_region_list  = sorted(_comp_df['Region'].unique())

# Persist the table as CSV and Excel; parents=True tolerates a missing _BASE.
_cmp_dir = _BASE / '_comparisons'
_cmp_dir.mkdir(parents=True, exist_ok=True)
_comp_df.to_csv(str(_cmp_dir/'all_comparisons_summary.csv'), encoding='utf-8-sig', index=False)
_comp_df.to_excel(str(_cmp_dir/'all_comparisons_summary.xlsx'), index=False)
print(f'\nSpecies: {_species_list}  |  Regions: {_region_list}')

# Multi-file comparative plots
# Only meaningful with at least two files and a non-empty master table.
if len(RESULTS_BY_FILE) >= 2 and not _master_df.empty:
    # Class comparison: one bar group per file, one bar per class.
    _cls_pivot = (
        _master_df
        .groupby(['Source_File','Class'])
        .size()
        .unstack(fill_value=0)
    )
    fig, ax = plt.subplots(
        figsize=(max(10,len(_cls_pivot)*1.4), max(4,len(_cls_pivot.columns)*0.5))
    )
    _cls_pivot.plot(kind='bar', ax=ax, colormap='tab20', width=0.8)
    ax.set_title('Class Distribution — All Files Comparison')
    ax.set_xlabel('File')
    ax.set_ylabel('Motif Count')
    ax.legend(title='Class', bbox_to_anchor=(1,1), fontsize=8)
    plt.xticks(rotation=30, ha='right')
    plt.tight_layout()
    _savefig(fig, _cmp_dir/'all_files_class_comparison.png')

    # Subclass comparison (top 20)
    # Columns are restricted to the 20 globally most frequent subclasses,
    # kept in that frequency order.
    _top_subs = _master_df['Subclass'].value_counts().head(20).index
    _sub_pivot = (
        _master_df
        .groupby(['Source_File','Subclass'])
        .size()
        .unstack(fill_value=0)
    )
    _sub_pivot = _sub_pivot[[c for c in _top_subs if c in _sub_pivot.columns]]
    if not _sub_pivot.empty:
        fig, ax = plt.subplots(
            figsize=(max(10,len(_sub_pivot)*1.4), max(4,len(_sub_pivot.columns)*0.4))
        )
        _sub_pivot.plot(kind='bar', ax=ax, colormap='tab20', width=0.8)
        ax.set_title('Subclass Distribution — All Files (top 20)')
        ax.set_xlabel('File')
        ax.set_ylabel('Motif Count')
        ax.legend(title='Subclass', bbox_to_anchor=(1,1), fontsize=7)
        plt.xticks(rotation=30, ha='right')
        plt.tight_layout()
        _savefig(fig, _cmp_dir/'all_files_subclass_comparison.png')

# Within-species comparisons
# Each species gets its own folder under _cmp_dir. Species with fewer than two
# files are only reported; the others get per-region plots and a summary CSV.
_sep = '\u2550'*60
for species in _species_list:
    sp_rows = _comp_df.loc[_comp_df['Species'] == species].copy()
    sp_stems = sp_rows['Stem'].tolist()
    sp_dir = _cmp_dir/_safe_fname(species)
    sp_dir.mkdir(exist_ok=True)
    if len(sp_stems) < 2:
        print(f"\n\u26a0  '{species}' — single region, skipping within-species plots.")
        continue
    print(f'\n{_sep}\nWithin-species: {species}\n{_sep}')

    # Class + subclass by region: grouped bars, one group per region.
    for attr, label, fname, colors in (
        ('Class',    'Class',    'class_by_region.png',    'tab20'),
        ('Subclass', 'Subclass', 'subclass_by_region.png', 'tab20'),
    ):
        # Region -> value counts of `attr`; a file without motifs contributes
        # an empty series so its region still shows up as a row.
        _by_reg = {}
        for st in sp_stems:
            reg = sp_rows.loc[sp_rows['Stem']==st, 'Region'].values[0]
            df = RESULTS_BY_FILE[st]['df']
            _by_reg[reg] = df[attr].value_counts() if not df.empty else pd.Series(dtype=int)
        _all_v = sorted({c for s in _by_reg.values() for c in s.index})
        if not _all_v:
            continue
        # Rows are regions, columns the sorted category names; gaps become 0.
        _mat = pd.DataFrame(
            {r: s.reindex(_all_v, fill_value=0) for r, s in _by_reg.items()}
        ).T
        fig, ax = plt.subplots(figsize=(max(8,len(_all_v)*1.2),4))
        _mat.plot(kind='bar', ax=ax, colormap=colors, width=0.8)
        ax.set_title(f'{species} — {label} Distribution by Region')
        ax.set_xlabel('Region')
        ax.set_ylabel('Motif Count')
        ax.legend(title=label, bbox_to_anchor=(1,1), fontsize=7)
        plt.xticks(rotation=30, ha='right')
        plt.tight_layout()
        _savefig(fig, sp_dir/fname, show=False)

    # Density, coverage and GC by region: one simple bar chart per metric.
    for col, label, color, fmt, ylim, fn in (
        ('Density_per_kb', 'Motifs per kb', 'steelblue',      '%.3f',   None,    'density_by_region.png'),
        ('Coverage_pct',   'Coverage (%)',  'mediumseagreen', '%.1f%%', (0,100), 'coverage_by_region.png'),
        ('GC_Percent',     'GC (%)',        'goldenrod',      '%.1f%%', (0,100), 'gc_by_region.png'),
    ):
        fig, ax = plt.subplots(figsize=(max(6,len(sp_rows)*1.2),4))
        bars = ax.bar(sp_rows['Region'], sp_rows[col], color=color)
        ax.bar_label(bars, fmt=fmt, padding=2)
        ax.set_title(f'{species} — {label} by Region')
        ax.set_xlabel('Region')
        ax.set_ylabel(label)
        if ylim:
            ax.set_ylim(*ylim)
        plt.xticks(rotation=30, ha='right')
        plt.tight_layout()
        _savefig(fig, sp_dir/fn, show=False)

    # Motif length boxplot: gather the Length column of every file, tagged
    # with the file's region.
    _len_parts = []
    for st in sp_stems:
        df = RESULTS_BY_FILE[st]['df']
        reg = sp_rows.loc[sp_rows['Stem']==st, 'Region'].values[0]
        if df.empty or 'Length' not in df.columns:
            continue
        tmp = df[['Length']].copy()
        tmp['Region'] = reg
        _len_parts.append(tmp)
    if _len_parts:
        _len_df = pd.concat(_len_parts)
        # Only strictly positive lengths are plotted.
        _len_df = _len_df[_len_df['Length'] > 0]
        if not _len_df.empty:
            fig, ax = plt.subplots(figsize=(max(8,len(sp_stems)*2),4))
            sns.boxplot(data=_len_df, x='Region', y='Length', ax=ax, palette='Set2')
            ax.set_title(f'{species} — Motif Length Distribution by Region')
            plt.xticks(rotation=30, ha='right')
            plt.tight_layout()
            _savefig(fig, sp_dir/'length_by_region.png', show=False)

    # Per-region summary table for this species, saved and displayed.
    _summary_cols = [
        'Total_Motifs', 'Seq_Length_bp', 'Density_per_kb', 'Coverage_pct',
        'GC_Percent', 'Mean_Motif_Length', 'Median_Motif_Length',
        'Unique_Classes', 'Unique_Subclasses', 'Hybrids', 'Clusters',
    ]
    _sp_summary = sp_rows.set_index('Region')[_summary_cols]
    _sp_summary.to_csv(str(sp_dir/'within_species_summary.csv'), encoding='utf-8-sig')
    print(f'{species} Summary:')
    display(_sp_summary)

# Cross-species
# Species × region heatmaps for density, coverage and GC; needs two or more species.
if len(_species_list) >= 2:
    _xs_dir = _cmp_dir/'_cross_species'
    _xs_dir.mkdir(exist_ok=True)
    print(f'\n{_sep}\nCross-species: {_species_list}\n{_sep}')
    # Every region seen for any listed species; only used to size the figures.
    _all_regs = sorted({
        r
        for sp in _species_list
        for r in _comp_df[_comp_df['Species']==sp]['Region']
    })
    for col, label, cmap, fmt, fn in (
        ('Density_per_kb', 'Motifs/kb',  'YlOrRd', '.3f', 'cross_species_density_heatmap.png'),
        ('Coverage_pct',   'Coverage %', 'Blues',  '.1f', 'cross_species_coverage_heatmap.png'),
        ('GC_Percent',     'GC %',       'YlGn',   '.1f', 'cross_species_gc_heatmap.png'),
    ):
        # Several files sharing a species/region pair are averaged into one cell.
        _piv = _comp_df.pivot_table(
            index='Species', columns='Region', values=col, aggfunc='mean'
        )
        if not _piv.empty:
            fig, ax = plt.subplots(
                figsize=(max(8,len(_all_regs)*1.4), max(4,len(_species_list)*0.8))
            )
            sns.heatmap(
                _piv,
                ax=ax,
                cmap=cmap,
                annot=True,
                fmt=fmt,
                linewidths=0.4,
                cbar_kws={'label':label},
            )
            ax.set_title(f'Cross-Species — {label} Heatmap')
            ax.set_xlabel('Region')
            ax.set_ylabel('Species')
            plt.tight_layout()
            _savefig(fig, _xs_dir/fn, show=False)
    # Full comparison table, ordered by species then region.
    _xs_summary = _comp_df.sort_values(['Species','Region'])
    _xs_summary.to_csv(str(_xs_dir/'cross_species_summary.csv'), encoding='utf-8-sig', index=False)
    print('Cross-Species Summary:')
    display(_xs_summary)

# ─────────────────────────────────────────────────────────────────────────────
# E. DOWNLOAD LINKS
# ─────────────────────────────────────────────────────────────────────────────
import base64

# MIME type per (lower-case) file extension; anything else is served as
# a generic binary download.
_MIME = {'csv':'text/csv',
         'xlsx':'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
         'png':'image/png'}

def _dl(path, label):
    """Return an HTML ``<a>`` download link that embeds the file at *path*.

    The file content is base64-encoded into a ``data:`` URI, so the link
    works without the file being served from anywhere.

    Args:
        path: Location of the file to embed (str or path-like).
        label: Visible link text; treated as plain text and HTML-escaped.

    Returns:
        The anchor element as a string.

    Raises:
        OSError: If the file cannot be read.
    """
    # Local import keeps the helper self-contained within the notebook cell.
    from html import escape

    src = Path(path)
    b64 = base64.b64encode(src.read_bytes()).decode('ascii')
    # Lower-case the extension so that e.g. ".PNG" still maps to image/png.
    mime = _MIME.get(src.suffix.lstrip('.').lower(), 'application/octet-stream')
    # File names and labels may contain &, < or quotes; escape them so they
    # cannot break out of the attribute or the element text.
    fname = escape(src.name, quote=True)
    text = escape(str(label))
    return (f'<a href="data:{mime};base64,{b64}" download="{fname}" '
            f'style="margin:2px 6px;padding:3px 8px;border:1px solid #aaa;'
            f'border-radius:4px;text-decoration:none;">{text}</a>')

# Assemble one HTML fragment with download links for every output file that
# exists on disk, grouped as: master outputs, statistics tables, comparative
# summaries, then one collapsible section per analysed file.
from html import escape as _html_escape

_html = ['<h2>\U0001f4e5 Downloads</h2><h3>Master Outputs</h3><div>']
for fmt, fn in [('CSV','master_motifs.csv'),('Excel','master_motifs.xlsx')]:
    p = _master_dir/fn
    if p.exists():
        _html.append(_dl(str(p), f'Master {fmt}'))
if (_master_dir/'gff_region_motifs_all.csv').exists():
    _html.append(_dl(str(_master_dir/'gff_region_motifs_all.csv'),'GFF Regions CSV'))

_html.append('</div><h3>Statistics Tables</h3><div>')
for tn in _tables:
    # A link is added for each table key that has a matching <key>.csv file.
    p = _master_dir/f'{tn}.csv'
    if p.exists():
        _html.append(_dl(str(p), tn.replace('_',' ').title()))

_html.append('</div><h3>Comparative Analysis</h3><div>')
for fn in ['all_comparisons_summary.csv','all_comparisons_summary.xlsx']:
    p = _cmp_dir/fn
    if p.exists():
        _html.append(_dl(str(p), fn))

_html.append('</div><h3>Per-File Outputs</h3>')
for stem, res in RESULTS_BY_FILE.items():
    # The stem comes from a user-supplied file name, so it is escaped before
    # being written into the markup; link labels are left to _dl.
    _html.append(f'<details style="margin:4px 0"><summary><b>{_html_escape(str(stem))}</b> '
                 f'<em>[{_html_escape(str(res["file_type"]))}]</em></summary><div style="margin:4px 12px">')
    for fmt, fn in [('CSV','motifs.csv'),('Excel','motifs.xlsx')]:
        p = res['folder']/fn
        if p.exists():
            _html.append(_dl(str(p), fmt))
    # Every PNG in the file's output folder, in sorted order.
    for fn in sorted(res['folder'].glob('*.png')):
        _html.append(_dl(str(fn), fn.stem.replace('_',' ').title()))
    _html.append('</div></details>')

display(HTML('\n'.join(_html)))
print(f'\n\u2705 All outputs saved to: {_BASE}')
