In [None]:
# ═══════════════════════════════════════════════════════════════════════════════
# CELL 1 — Setup, Configuration & File-Type Detection
# Run once. Edit FASTA_INPUT and OUTPUT_DIR to match your environment.
# ═══════════════════════════════════════════════════════════════════════════════

import sys, os, importlib, glob
from pathlib import Path

# Put the current working directory (expected to be the repository root) at
# the front of the import path so project-local packages can be imported.
_REPO_ROOT = os.path.abspath(os.curdir)
if sys.path.count(_REPO_ROOT) == 0:
    sys.path[:0] = [_REPO_ROOT]

# ── Install missing packages ──────────────────────────────────────────────────
# `import importlib` alone does not guarantee that the `importlib.util`
# submodule is loaded, so import it explicitly before calling find_spec().
import importlib.util

# Each entry is (importable module name, pip requirement specifier).
_REQUIRED = [
    ('psutil',     'psutil>=5.8.0'),
    ('pandas',     'pandas>=1.3.0'),
    ('numpy',      'numpy>=1.21.0'),
    ('matplotlib', 'matplotlib>=3.5.0'),
    ('seaborn',    'seaborn>=0.11.0'),
    ('openpyxl',   'openpyxl>=3.0.0'),
]
# Only the requirement strings of modules that cannot be located are collected.
_missing = [pkg for mod, pkg in _REQUIRED if importlib.util.find_spec(mod) is None]
if _missing:
    import subprocess
    # Use the running interpreter so packages land in this kernel's environment.
    subprocess.check_call([sys.executable, '-m', 'pip', 'install', *_missing, '-q'])
    # Finders cache directory listings; refresh them so the packages installed
    # a moment ago can be imported without restarting the interpreter.
    importlib.invalidate_caches()

import pandas as pd, numpy as np

# ── User Configuration ────────────────────────────────────────────────────────
# FASTA_INPUT: single path, wildcard, or list  (e.g. '*.fna', ['a.fasta','b.fa']).
#              Relative entries are matched against the current working directory.
FASTA_INPUT        = ['*.fna', '*.fasta']
# OUTPUT_DIR: parent folder; each run writes into its own timestamped subfolder.
OUTPUT_DIR         = 'notebook_reports'
# ENABLED_CLASSES is forwarded unchanged as `enabled_classes` to the analyser.
ENABLED_CLASSES    = None   # None = all 9 detectors
# RAM_OVERRIDE_BYTES: any falsy value (None or 0) falls back to the detected budget.
RAM_OVERRIDE_BYTES = None   # None = auto-detect
# Export toggles: apply to both the per-file outputs and the master outputs.
EXPORT_CSV         = True
EXPORT_BED         = True
EXPORT_JSON        = True
EXPORT_EXCEL       = True

# ── Resolve input files ───────────────────────────────────────────────────────
def _resolve_inputs(inp):
    """Expand FASTA input specifications into a sorted list of absolute paths.

    Args:
        inp: A single path or glob pattern (``str`` or ``os.PathLike``), or an
            iterable of such values.

    Returns:
        Sorted list of unique, fully resolved paths (as ``str``) of the regular
        files that matched. Directories whose names happen to match a pattern
        are ignored. The list is empty when nothing matched.
    """
    # A lone Path must be wrapped like a lone string; iterating it would fail.
    specs = [inp] if isinstance(inp, (str, os.PathLike)) else list(inp)
    found = set()
    for spec in specs:
        pattern = os.fspath(spec)
        # glob() yields nothing for a literal file name that contains glob
        # metacharacters (e.g. "reads[1].fa"), so fall back to the name itself.
        candidates = glob.glob(pattern) or [pattern]
        found.update(
            str(Path(candidate).resolve())
            for candidate in candidates
            if os.path.isfile(candidate)
        )
    return sorted(found)

# Expand the configured inputs once; every later cell works from this list.
FASTA_FILES = _resolve_inputs(FASTA_INPUT)
if not FASTA_FILES:
    # Fail fast: nothing downstream can run without at least one input file.
    _no_match_msg = f'No FASTA files found matching: {FASTA_INPUT}'
    raise FileNotFoundError(_no_match_msg)

# ── File-type detection (single / multi / multi_equal) ───────────────────────
def _get_seq_lengths(fasta_path):
    """Return the length of every non-empty record in a FASTA file.

    Only line lengths are needed, so undecodable bytes (typically in header
    lines) are replaced instead of aborting the scan with a decode error.

    Args:
        fasta_path: Path of the FASTA file to scan.

    Returns:
        List of sequence lengths in file order. Whitespace at either end of a
        line is not counted, and records with no sequence characters are
        skipped.
    """
    lengths = []
    current = 0
    with open(fasta_path, errors='replace') as handle:
        for raw_line in handle:
            text = raw_line.strip()
            if text.startswith('>'):
                # A header closes the previous record (if it had any residues).
                if current:
                    lengths.append(current)
                current = 0
            else:
                current += len(text)
    # The final record has no header after it, so flush it here.
    if current:
        lengths.append(current)
    return lengths

# Label each input by its record count and whether all records share a length.
# NOTE: a file with no countable records has zero distinct lengths and
# therefore falls through to 'multi'.
FILE_TYPES = {}   # abs path -> 'single' | 'multi' | 'multi_equal'
for fp in FASTA_FILES:
    lens = _get_seq_lengths(fp)
    n    = len(lens)
    FILE_TYPES[fp] = (
        'single' if n == 1
        else 'multi_equal' if len(set(lens)) == 1
        else 'multi'
    )

# ── Summary ───────────────────────────────────────────────────────────────────
print(f'\u2705 Dependencies OK | Python {sys.version.split()[0]} | '
      f'pandas {pd.__version__} | numpy {np.__version__}')
print(f'\n\U0001f4c2 FASTA files detected: {len(FASTA_FILES)}')
# FILE_TYPES was filled in FASTA_FILES order, so its keys list the same paths.
for fp in FILE_TYPES:
    print(f'   [{FILE_TYPES[fp]:12s}]  {Path(fp).name}')
print(f'\n\U0001f4c1 Output directory: {OUTPUT_DIR}')


In [None]:
# ═══════════════════════════════════════════════════════════════════════════════
# CELL 2 — Adaptive Resource Planning, Motif Analysis & Per-File Output
# ═══════════════════════════════════════════════════════════════════════════════

import gc, time, datetime, warnings
import matplotlib; matplotlib.use('Agg')
import matplotlib.pyplot as plt
import seaborn as sns
warnings.filterwarnings('ignore')

from Utilities.system_resource_inspector import SystemResourceInspector
from Utilities.adaptive_chunk_planner    import AdaptiveChunkPlanner
from Utilities.nonbscanner               import analyze_sequence as _nbf_analyze
from Utilities.utilities                 import (
    read_fasta_file, export_to_csv, export_to_bed,
    export_to_json, export_to_excel,
)

# ── Adaptive resource plan ────────────────────────────────────────────────────
_insp = SystemResourceInspector()
# A falsy override (None or 0) defers to the inspector's own memory budget.
_budget = RAM_OVERRIDE_BYTES if RAM_OVERRIDE_BYTES else _insp.get_memory_budget()
_cpus = _insp.get_cpu_count()
# The combined on-disk size of the inputs that still exist (floored at 1,000)
# is what gets passed to the planner as `genome_length`.
_total_sz = max(
    1_000,
    sum(map(os.path.getsize, filter(os.path.exists, FASTA_FILES))),
)
_plan = AdaptiveChunkPlanner().plan(
    cpu_count=_cpus, ram_budget=_budget, genome_length=_total_sz)

# Unpack the plan entries into the module-level settings used by the analysis.
CHUNK_SIZE, CHUNK_OVERLAP, N_WORKERS, EXEC_MODE = (
    _plan[key] for key in ('chunk_size', 'overlap', 'workers', 'mode')
)

print(f'\u2699\ufe0f  RAM {_budget/1e9:.2f} GB | chunk={CHUNK_SIZE:,} '
      f'overlap={CHUNK_OVERLAP:,} workers={N_WORKERS} mode={EXEC_MODE}')

# ── Timestamped run folder ────────────────────────────────────────────────────
# Each run gets its own OUTPUT_DIR/<YYYYmmdd_HHMMSS> folder.
_RUN_TS = f'{datetime.datetime.now():%Y%m%d_%H%M%S}'
_BASE   = Path(OUTPUT_DIR, _RUN_TS)
os.makedirs(_BASE, exist_ok=True)
print(f'\U0001f4c2 Run output: {_BASE}')

# ── Per-file analysis loop ────────────────────────────────────────────────────
def _unique_result_key(fasta_path, taken):
    """Pick the result key / output-folder name for one input file.

    Args:
        fasta_path: Path of the FASTA file being analysed.
        taken: Container of keys already in use (must support ``in``).

    Returns:
        The file's stem when it is still free; otherwise the full file name
        (stem plus extension); otherwise that name with a numeric suffix.
    """
    path = Path(fasta_path)
    for candidate in (path.stem, path.name):
        if candidate not in taken:
            return candidate
    serial = 2
    while f'{path.name}_{serial}' in taken:
        serial += 1
    return f'{path.name}_{serial}'


RESULTS_BY_FILE = {}   # key (normally the file stem) -> {df, folder, file_type, path}
sns.set_theme(style='whitegrid')

for fasta_path in FASTA_FILES:
    # Inputs such as 'x.fna' and 'x.fasta' share a stem; a unique key keeps
    # them from overwriting each other's folder and RESULTS_BY_FILE entry.
    stem     = _unique_result_key(fasta_path, RESULTS_BY_FILE)
    ftype    = FILE_TYPES[fasta_path]
    file_dir = _BASE / stem
    file_dir.mkdir(parents=True, exist_ok=True)

    print(f'\n\u2500\u2500 {stem}  [{ftype}] \u2500\u2500')

    seqs = read_fasta_file(fasta_path)
    if not seqs:
        print(f'  \u26a0\ufe0f  No sequences found in {Path(fasta_path).name} \u2014 skipping.')
        continue

    # Analyse every sequence of the file and pool the motifs it yields.
    motifs_file, t0 = [], time.perf_counter()
    for sname, seq in seqs.items():
        print(f'  \u25b8 {sname[:60]}  ({len(seq):,} bp)', end='', flush=True)
        mots = _nbf_analyze(
            sequence=seq, sequence_name=sname,
            use_chunking=True,
            chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP,
            use_parallel_chunks=(EXEC_MODE == 'hybrid'),
            enabled_classes=ENABLED_CLASSES,
        )
        print(f' \u2192 {len(mots):,} motifs')
        motifs_file.extend(mots)
        gc.collect()

    elapsed = time.perf_counter() - t0
    print(f'  \u2705 {len(motifs_file):,} motifs in {elapsed:.1f}s')

    # Build per-file DataFrame; columns absent from the motif records are
    # added with a default so later cells can rely on them.
    df = pd.DataFrame(motifs_file) if motifs_file else pd.DataFrame()
    for col, default in [
        ('Class','Unknown'), ('Subclass','Other'), ('Start',0),
        ('End',0), ('Length',0), ('Score',0.0), ('Strand','+'), ('Sequence_Name',''),
    ]:
        if col not in df.columns:
            df[col] = default
    if not df.empty and 'Length' in df.columns:
        # Derive zero lengths from the coordinates, never going below 0.
        m = df['Length'] == 0
        df.loc[m, 'Length'] = (df.loc[m,'End'] - df.loc[m,'Start']).clip(lower=0)
    df['Source_File'] = Path(fasta_path).name
    df['File_Type']   = ftype

    # Per-file exports
    if not df.empty:
        rows = df.to_dict(orient='records')
        if EXPORT_CSV:   export_to_csv(rows,   filename=str(file_dir / 'motifs.csv'))
        if EXPORT_BED:   export_to_bed(rows,   filename=str(file_dir / 'motifs.bed'))
        if EXPORT_JSON:  export_to_json(rows,  filename=str(file_dir / 'motifs.json'))
        if EXPORT_EXCEL: export_to_excel(rows, filename=str(file_dir / 'motifs.xlsx'))

    # Per-file class distribution plot
    if not df.empty:
        cc = df['Class'].value_counts()
        fig, ax = plt.subplots(figsize=(8, 3))
        try:
            # Reversed so the most frequent class is drawn at the top.
            ax.barh(cc.index[::-1], cc.values[::-1])
            ax.set_xlabel('Motif Count')
            ax.set_title(f'{stem}  [{ftype}] — Class Distribution')
            plt.tight_layout()
            fig.savefig(str(file_dir / 'class_distribution.png'), dpi=150)
        finally:
            # Release the figure even when drawing or saving fails.
            plt.close(fig)

    RESULTS_BY_FILE[stem] = {
        'df': df, 'folder': file_dir,
        'file_type': ftype, 'path': fasta_path,
    }
    print(f'  \U0001f4c1 Output folder: {file_dir}')

print(f'\n\u2705 Analysis complete \u2014 {len(RESULTS_BY_FILE)} file(s) processed.')


In [None]:
# ═══════════════════════════════════════════════════════════════════════════════
# CELL 3 — Comprehensive Master Tables & Downloads
# ═══════════════════════════════════════════════════════════════════════════════

import base64
from IPython.display import display, HTML, Image

# ── Combine all results into master DataFrame ─────────────────────────────────
# Files that produced no motifs are left out of the concatenation.
_dfs = [
    frame
    for frame in (res['df'] for res in RESULTS_BY_FILE.values())
    if not frame.empty
]
_master_df = pd.DataFrame() if not _dfs else pd.concat(_dfs, ignore_index=True)

# Cross-file outputs go into a dedicated folder inside the run directory.
_master_dir = _BASE.joinpath('_master')
_master_dir.mkdir(exist_ok=True)

# ── Comprehensive master tables ───────────────────────────────────────────────
# _tables maps a table name to its DataFrame; each entry is written to
# <_master_dir>/<name>.csv and displayed below. It stays empty when no file
# produced any motif, in which case nothing in this section is exported.
_tables = {}
if not _master_df.empty:
    # 1. Global class distribution across all files
    #    (motif count per Source_File / File_Type / Class combination)
    _tables['1_global_class_distribution'] = (
        _master_df.groupby(['Source_File', 'File_Type', 'Class'])
        .size().reset_index(name='Count')
    )
    # 2. Per-file summary (file type, sequence count, motif count, unique classes)
    #    NOTE: 'Sequences' is the number of distinct Sequence_Name values among
    #    the motif rows, so sequences that yielded no motif are not counted.
    _tables['2_per_file_summary'] = pd.DataFrame([
        {
            'File':         Path(r['path']).name,
            'File_Type':    r['file_type'],
            'Sequences':    r['df']['Sequence_Name'].nunique() if not r['df'].empty else 0,
            'Total_Motifs': len(r['df']),
            'Classes':      r['df']['Class'].nunique() if not r['df'].empty else 0,
        }
        for r in RESULTS_BY_FILE.values()
    ])
    # 3. Class-level statistics (count, mean length, mean score),
    #    rounded to 3 decimals and ordered by descending count
    _tables['3_class_statistics'] = (
        _master_df.groupby('Class')
        .agg(
            Total_Count  = ('Class',  'count'),
            Mean_Length  = ('Length', 'mean'),
            Mean_Score   = ('Score',  'mean'),
        )
        .round(3).reset_index().sort_values('Total_Count', ascending=False)
    )
    # 4. Cross-file x class pivot table
    #    (one row per Source_File, one column per Class, 0 where absent)
    _tables['4_file_class_pivot'] = (
        _master_df.groupby(['Source_File', 'Class'])
        .size().unstack(fill_value=0).reset_index()
    )
    # 5. Equal-length multi-sequence positional summary (if applicable):
    #    how often each Start position occurs per file and class, restricted
    #    to files classified as 'multi_equal'
    _eq_dfs = [r['df'] for r in RESULTS_BY_FILE.values()
               if r['file_type'] == 'multi_equal' and not r['df'].empty]
    if _eq_dfs:
        _eq_df = pd.concat(_eq_dfs, ignore_index=True)
        _tables['5_equal_length_positional'] = (
            _eq_df.groupby(['Source_File', 'Class', 'Start'])
            .size().reset_index(name='Frequency')
            .sort_values(['Source_File', 'Class', 'Frequency'],
                         ascending=[True, True, False])
        )

    # Export master motifs in all enabled formats
    _rows = _master_df.to_dict(orient='records')
    if EXPORT_CSV:   export_to_csv(_rows,   filename=str(_master_dir / 'master_motifs.csv'))
    if EXPORT_BED:   export_to_bed(_rows,   filename=str(_master_dir / 'master_motifs.bed'))
    if EXPORT_JSON:  export_to_json(_rows,  filename=str(_master_dir / 'master_motifs.json'))
    if EXPORT_EXCEL: export_to_excel(_rows, filename=str(_master_dir / 'master_motifs.xlsx'))

    # Export each summary table as CSV (written regardless of EXPORT_CSV)
    for tname, tdf in _tables.items():
        tdf.to_csv(str(_master_dir / f'{tname}.csv'), index=False)

    # Master summary plot: class bar chart + file-type pie, plus a per-file
    # bar chart as a third panel when more than one file was processed
    _cc   = _master_df['Class'].value_counts()
    _ft   = _master_df.groupby('File_Type').size()
    _ncol = 3 if len(RESULTS_BY_FILE) > 1 else 2
    fig, axes = plt.subplots(1, _ncol, figsize=(6 * _ncol, 4))

    # Reversed so the most frequent class is drawn at the top of the chart
    axes[0].barh(_cc.index[::-1], _cc.values[::-1], color='steelblue')
    axes[0].set_xlabel('Motif Count')
    axes[0].set_title('Global Class Distribution')

    axes[1].pie(_ft.values, labels=_ft.index, autopct='%1.1f%%', startangle=90)
    axes[1].set_title('Motifs by File Type')

    if _ncol == 3:
        _pf = _master_df.groupby('Source_File').size().sort_values(ascending=False)
        # Labels are file stems truncated to 25 characters to keep the axis readable
        axes[2].barh([Path(n).stem[:25] for n in _pf.index[::-1]], _pf.values[::-1],
                     color='coral')
        axes[2].set_xlabel('Motif Count')
        axes[2].set_title('Motifs per File')

    plt.tight_layout()
    _plot_path = str(_master_dir / 'master_summary.png')
    fig.savefig(_plot_path, dpi=150)
    plt.close(fig)
    # The figure is closed above, so show the saved PNG instead
    display(Image(_plot_path))

# ── Display master tables ─────────────────────────────────────────────────────
# Each table is preceded by a banner with its name in upper case
for tname, tdf in _tables.items():
    print(f"\n{'='*60}\n{tname.replace('_',' ').upper()}\n{'='*60}")
    display(tdf)

# ── Download links (base64 data URIs — work in any Jupyter environment) ───────
# File extension (lower case, no dot) -> MIME type used in the data URI.
_MIME = {
    'csv':  'text/csv',
    'bed':  'text/plain',
    'json': 'application/json',
    'xlsx': 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
    'png':  'image/png',
    'html': 'text/html',
}

def _dl_link(path, label):
    """Build an HTML download anchor that embeds a file as a base64 data URI.

    Args:
        path: File to embed. Its whole content is read into memory.
        label: Link text. Treated as plain text and HTML-escaped.

    Returns:
        A string holding one ``<a>`` element whose ``href`` is the data URI
        and whose ``download`` attribute is the file's name.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    import html  # local import keeps this helper self-contained

    target = Path(path)
    with open(target, 'rb') as fh:
        b64 = base64.b64encode(fh.read()).decode('ascii')
    # Match the extension case-insensitively so 'REPORT.CSV' is still text/csv.
    ext  = target.suffix.lstrip('.').lower()
    mime = _MIME.get(ext, 'application/octet-stream')
    # File names and labels may contain '"', '<' or '&'; escape them so they
    # cannot break out of the attribute or inject markup.
    safe_name  = html.escape(target.name, quote=True)
    safe_label = html.escape(str(label))
    return (f'<a href="data:{mime};base64,{b64}" '
            f'download="{safe_name}" '
            f'style="margin:2px 6px;padding:3px 8px;border:1px solid #aaa;'
            f'border-radius:4px;text-decoration:none;">'
            f'{safe_label}</a>')

import html  # stdlib; escapes file-derived text before it is embedded in markup

# HTML fragments are collected here and joined into one panel at the end.
_html_parts = ['<h2>\U0001f4e5 Downloads</h2>']

# Master outputs: only formats that were actually written get a link.
_html_parts.append('<h3>Master Outputs</h3><div>')
for fmt, fname in [
    ('CSV',   'master_motifs.csv'),
    ('BED',   'master_motifs.bed'),
    ('JSON',  'master_motifs.json'),
    ('Excel', 'master_motifs.xlsx'),
]:
    p = _master_dir / fname
    if p.exists():
        _html_parts.append(_dl_link(str(p), f'Master {fmt}'))
_html_parts.append('</div>')

# Summary tables: one CSV link per entry of _tables.
_html_parts.append('<h3>Summary Tables</h3><div>')
for tname in _tables:
    p = _master_dir / f'{tname}.csv'
    if p.exists():
        _html_parts.append(_dl_link(str(p), tname.replace('_', ' ').title()))
_html_parts.append('</div>')

# Per-file outputs: one collapsible section per analysed file.
_html_parts.append('<h3>Per-File Outputs</h3>')
for stem, res in RESULTS_BY_FILE.items():
    # The heading text comes from file names, which may contain '<', '&' or
    # quotes; escape it so it is rendered as text rather than parsed as markup.
    _safe_stem  = html.escape(str(stem))
    _safe_ftype = html.escape(str(res['file_type']))
    _html_parts.append(
        f'<details style="margin:4px 0"><summary>'
        f'<b>{_safe_stem}</b> &nbsp;<em>[{_safe_ftype}]</em></summary>'
        f'<div style="margin:4px 12px">'
    )
    for fmt, fname in [
        ('CSV', 'motifs.csv'), ('BED', 'motifs.bed'),
        ('JSON', 'motifs.json'), ('Excel', 'motifs.xlsx'),
    ]:
        p = res['folder'] / fname
        if p.exists():
            _html_parts.append(_dl_link(str(p), fmt))
    _img = res['folder'] / 'class_distribution.png'
    if _img.exists():
        _html_parts.append(_dl_link(str(_img), 'Plot'))
    _html_parts.append('</div></details>')

display(HTML('\n'.join(_html_parts)))
print(f'\n\u2705 All outputs saved in: {_BASE}')
