In [3]:
# === SETUP & CONFIG (Using References folder) ===

from pathlib import Path
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
import re, gzip, json, subprocess, sys

print("="*60)
print("OPTIMIZED MODE: P. cubensis References from Files")
print("="*60)
print("Reading references from: 03_Sequences_Paper/References/")
print("="*60 + "\n")

# --- Project roots ---
BASE = Path.cwd().resolve()
DATA_CANDIDATES = [
    BASE / "03_Sequences_Paper",
    BASE.parent / "03_Sequences_Paper",
]
for candidate in DATA_CANDIDATES:
    if candidate.exists():
        DATA = candidate
        break
else:
    raise FileNotFoundError("Unable to locate the 03_Sequences_Paper directory relative to this notebook.")

def resolve_dir(parent: Path, name: str) -> Path:
    """Resolve directory with optional whitespace handling"""
    direct = parent / name
    if direct.exists():
        return direct
    normalized = [child for child in parent.iterdir() if child.name.strip() == name]
    if normalized:
        return normalized[0]
    raise FileNotFoundError(f'Expected directory "{name}" under {parent}')

# --- Input folders ---
SCAFF_DIR = resolve_dir(DATA, "Assembly_scaffolds")
REF_SOURCE_DIR = resolve_dir(DATA, "References")  # Your reference FASTA files

# --- Output folders ---
REF_DIR = BASE / "reference_panels"
OUT_DIR = BASE / "cds_extraction_outputs"
REF_DIR.mkdir(parents=True, exist_ok=True)
OUT_DIR.mkdir(parents=True, exist_ok=True)

# --- Enzymes we care about ---
ENZYMES = ("PsiD", "PsiK", "PsiM", "PsiH")

# --- Build reference panels from your FASTA files ---
print("Building reference panels from FASTA files...")

REFS = {}
for enzyme in ENZYMES:
    # Create output directory for this enzyme
    enzyme_dir = REF_DIR / f"Proteins_{enzyme}"
    enzyme_dir.mkdir(parents=True, exist_ok=True)

    # Look for reference file for this enzyme
    ref_file = REF_SOURCE_DIR / f"{enzyme}_Psilocybe_cubensis.fasta"

    if not ref_file.exists():
        print(f"WARNING: Reference file not found: {ref_file}")
        continue

    # Read the reference FASTA file
    records = list(SeqIO.parse(ref_file, "fasta"))

    if not records:
        print(f"WARNING: No sequences found in {ref_file}")
        continue

    # Write to reference panel
    output_fasta = enzyme_dir / f"{enzyme}_refs.faa"
    SeqIO.write(records, output_fasta, "fasta")

    REFS[enzyme] = output_fasta
    print(f"  {enzyme}: {len(records)} sequence(s) -> {output_fasta}")

print(f"\nReference panels created: {len(REFS)}/{len(ENZYMES)} enzymes")
print("="*60 + "\n")

# Verify all enzymes have references
missing = set(ENZYMES) - set(REFS.keys())
if missing:
    raise FileNotFoundError(f"Missing references for: {', '.join(missing)}")

print("Setup complete! Ready to extract genes.")


OPTIMIZED MODE: P. cubensis References from Files
Reading references from: 03_Sequences_Paper/References/

Building reference panels from FASTA files...
  PsiD: 1 sequence(s) -> /Users/felix/Documents/20251015_Meeting_PsiloProteins_Felix/05_codex/reference_panels/Proteins_PsiD/PsiD_refs.faa
  PsiK: 1 sequence(s) -> /Users/felix/Documents/20251015_Meeting_PsiloProteins_Felix/05_codex/reference_panels/Proteins_PsiK/PsiK_refs.faa
  PsiM: 1 sequence(s) -> /Users/felix/Documents/20251015_Meeting_PsiloProteins_Felix/05_codex/reference_panels/Proteins_PsiM/PsiM_refs.faa
  PsiH: 1 sequence(s) -> /Users/felix/Documents/20251015_Meeting_PsiloProteins_Felix/05_codex/reference_panels/Proteins_PsiH/PsiH_refs.faa

Reference panels created: 4/4 enzymes

Setup complete! Ready to extract genes.


In [14]:
# === CELL 2: Single-Species Extraction (OPTIMIZED) ===

from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
import time

try:
    from tqdm.auto import tqdm
    TQDM_AVAILABLE = True
except ImportError:
    TQDM_AVAILABLE = False
    print("Warning: tqdm not installed. Progress bars disabled. Install with: pip install tqdm")

# --- OPTIMIZED PARAMETERS FOR SPEED ---
BESTN   = 1     # Best hit only (3x faster than bestn=3)
PERCENT = 30    # Minimal percent identity cutoff

# Optional soft length bands (AA) to prefer intact ORFs
AA_BANDS = {
    "PsiD": (350, 600),   # TDC typically ~480 aa
    "PsiK": (250, 420),   # kinase
    "PsiM": (180, 320),   # methyltransferase
    "PsiH": (400, 650),   # P450 ~510 aa
}

print(f"Extraction parameters: BESTN={BESTN}, PERCENT={PERCENT}")
print(f"Single reference mode + bestn=1 = ~3-4x faster!\n")

def run(cmd):
    p = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
    if p.returncode != 0:
        raise RuntimeError(f"Command failed: {' '.join(map(str,cmd))}\nSTDERR:\n{p.stderr}")
    return p.stdout

def index_fasta_dict(fasta_path):
    return {rec.id: str(rec.seq).upper().replace("-", "") for rec in SeqIO.parse(fasta_path, "fasta")}

def reverse_complement(seq):
    return str(Seq(seq).reverse_complement())

def translate_orf(seq):
    prot = str(Seq(seq).translate(to_stop=False))
    in_frame = (len(seq) % 3 == 0)
    has_internal_stop = "*" in prot[:-1]
    return prot, in_frame, has_internal_stop

def parse_exonerate_gff(gff_text):
    """
    Parse exonerate --showtargetgff output.
    Returns list of hits dicts with keys: seqid,strand,score,exons(list of (s,e,phase))
    """
    lines = [ln for ln in gff_text.splitlines() if ln and not ln.startswith("#")]
    hits, current = [], None
    for ln in lines:
        if "\t" not in ln: 
            continue
        seqid, source, feature, start, end, score, strand, phase, attrs = ln.split("\t")
        start, end = int(start), int(end)
        score = 0.0 if score == "." else float(score)
        f = feature.lower()
        if f == "gene":
            if current and current.get("exons"):
                hits.append(current)
            current = {"seqid": seqid, "strand": strand, "score": score, "exons": [], "attrs": attrs}
        elif f == "cds":
            if current is None:
                current = {"seqid": seqid, "strand": strand, "score": score, "exons": [], "attrs": attrs}
            ph = 0 if phase == "." else int(phase)
            current["exons"].append((start, end, ph))
    if current and current.get("exons"):
        hits.append(current)
    for h in hits:
        h["exons"].sort(key=lambda x: x[0])
    return hits

def stitch_cds(genome_dict, hit):
    seq_parts = [genome_dict[hit["seqid"]][s-1:e] for (s,e,ph) in hit["exons"]]
    cds = "".join(seq_parts)
    if hit["strand"] == "-":
        cds = reverse_complement(cds)
    # trim to multiple of 3 if needed
    if len(cds) % 3 != 0:
        cds = cds[:len(cds) - (len(cds) % 3)]
    return cds

def exonerate_protein2genome(ref_faa, scaff_fa, bestn=BESTN, minpct=PERCENT):
    return run([
        "exonerate", "--model", "protein2genome",
        "--showtargetgff", "yes", "--showalignment", "no", "--showvulgar", "no",
        "--bestn", str(bestn), "--percent", str(minpct),
        str(ref_faa), str(scaff_fa)
    ])

def score_candidate(enzyme, prot, in_frame, has_stop, score_raw, nt_len):
    """Lower is better; prefer in-frame, no stops, within AA band, higher score, longer."""
    aa_len = len(prot.rstrip("*"))
    band_pen = 0
    if AA_BANDS.get(enzyme):
        lo, hi = AA_BANDS[enzyme]
        if not (lo <= aa_len <= hi):
            band_pen = 1
    return (
        int(has_stop),          # 0 preferred
        int(not in_frame),      # 0 preferred
        band_pen,               # 0 preferred
        -score_raw,             # higher exonerate score preferred
        -nt_len,                # longer CDS preferred
    )

def run_species(scaff_fa_path, out_dir=OUT_DIR, pbar=None, verbose=True):
    """Extract PsiD/K/M/H for one species FASTA; returns a summary dict."""
    species = Path(scaff_fa_path).stem
    genome = index_fasta_dict(scaff_fa_path)
    species_out = out_dir / species
    species_out.mkdir(parents=True, exist_ok=True)
    summary = {"species": species, "results": []}

    # Count scaffolds for info
    n_scaffolds = len(genome)
    if verbose and pbar is None:
        print(f"[{species}] Processing genome with {n_scaffolds:,} scaffolds...")

    # Create enzyme-level progress bar if tqdm available
    enzyme_iter = REFS.items()
    if TQDM_AVAILABLE and pbar is None:  # only show enzyme bar in single-species mode
        enzyme_iter = tqdm(enzyme_iter, desc=f"{species}", leave=False, total=len(REFS))

    for enzyme, ref_path in enzyme_iter:
        if verbose and pbar is None:
            print(f"  [{species}] Processing {enzyme}...", end=" ", flush=True)
        
        start_time = time.time()
        candidates = []
        
        # Single reference mode - ref_path is a Path object, not a list
        if not ref_path.exists():
            if verbose and pbar is None:
                print(f"ERROR: Reference not found")
            summary["results"].append({
                "enzyme": enzyme,
                "status": "ERROR",
                "error": f"Reference file not found: {ref_path}"
            })
            continue
        
        gff_text = exonerate_protein2genome(ref_path, scaff_fa_path, bestn=BESTN, minpct=PERCENT)
        for h in parse_exonerate_gff(gff_text):
            cds = stitch_cds(genome, h)
            if not cds:
                continue
            prot, in_frame, has_stop = translate_orf(cds)
            cand = {
                "enzyme": enzyme, "score": h["score"], "nt_len": len(cds),
                "in_frame": in_frame, "has_stop": has_stop, "blocks": len(h["exons"]),
                "strand": h["strand"], "seqid": h["seqid"], "cds": cds, "prot": prot
            }
            candidates.append(cand)
        
        elapsed = time.time() - start_time
        if verbose and pbar is None:
            print(f"done ({elapsed:.1f}s)", flush=True)

        if not candidates:
            summary["results"].append({"enzyme": enzyme, "status": "NO_HIT"})
            continue

        # pick the best candidate (with bestn=1, usually only 1 candidate anyway)
        best = sorted(
            candidates,
            key=lambda c: score_candidate(enzyme, c["prot"], c["in_frame"], c["has_stop"], c["score"], c["nt_len"])
        )[0]

        # write outputs
        cds_rec  = SeqRecord(Seq(best["cds"]), id=f"{species}|{enzyme}", description=f"score={best['score']};blocks={best['blocks']};strand={best['strand']}")
        prot_rec = SeqRecord(Seq(best["prot"]), id=f"{species}|{enzyme}", description=f"in_frame={best['in_frame']};has_stop={best['has_stop']}")
        SeqIO.write([cds_rec],  species_out / f"{species}_{enzyme}.cds.fa",  "fasta")
        SeqIO.write([prot_rec], species_out / f"{species}_{enzyme}.prot.fa", "fasta")

        status = "OK" if (best["in_frame"] and not best["has_stop"]) else "CHECK_ORF"
        summary["results"].append({
            "enzyme": enzyme, "status": status, "score": best["score"],
            "nt_len": best["nt_len"], "blocks": best["blocks"], "strand": best["strand"], "seqid": best["seqid"]
        })

    # write per-species JSON summary
    with open((species_out / f"{species}_summary.json"), "w") as fh:
        json.dump(summary, fh, indent=2)
    
    # Update batch progress bar if provided
    if pbar is not None:
        pbar.update(1)
    else:
        print(f"[{species}] Done! Output → {species_out}")
    
    return summary

# === EXAMPLE: run one species ===
print("="*60)
print("TESTING: Single Species Extraction")
print("="*60 + "\n")

one_species = sorted(SCAFF_DIR.glob("Psilocybe_baeocystis*.fa*"))[0]
print(f"Test species: {one_species.name}\n")
test_summary = run_species(one_species, OUT_DIR)

# === VALIDATION: Compare with Example Outputs ===
print("\n" + "="*60)
print("VALIDATION: Comparing with Example Outputs")
print("="*60 + "\n")

EXAMPLE_DIR = BASE.parent / "02_ColabResults_Example"
if EXAMPLE_DIR.exists():
    species_name = one_species.stem
    our_out = OUT_DIR / species_name
    
    # Only PsiD has example output
    enzyme = "PsiD"
    
    # Compare CDS
    our_cds = our_out / f"{species_name}_{enzyme}.cds.fa"
    example_cds = EXAMPLE_DIR / "P_baeocystis_PsiD.cds.fa"
    
    # Compare Protein
    our_prot = our_out / f"{species_name}_{enzyme}.prot.fa"
    example_prot = EXAMPLE_DIR / "P_baeocystis_PsiD.prot.fa"
    
    if our_cds.exists() and example_cds.exists():
        our_cds_seq = str(list(SeqIO.parse(our_cds, "fasta"))[0].seq)
        example_cds_seq = str(list(SeqIO.parse(example_cds, "fasta"))[0].seq)
        
        print(f"📋 {enzyme} CDS Comparison:")
        print(f"   Our length:     {len(our_cds_seq):,} bp")
        print(f"   Example length: {len(example_cds_seq):,} bp")
        
        if our_cds_seq == example_cds_seq:
            print(f"   ✅ PERFECT MATCH - CDS sequences identical!\n")
        else:
            # Check if they're close
            from difflib import SequenceMatcher
            similarity = SequenceMatcher(None, our_cds_seq, example_cds_seq).ratio()
            print(f"   ⚠️  Sequences differ (similarity: {similarity*100:.1f}%)")
            
            # Show first difference
            for i, (a, b) in enumerate(zip(our_cds_seq, example_cds_seq)):
                if a != b:
                    print(f"   First difference at position {i}: {a} vs {b}\n")
                    break
    
    if our_prot.exists() and example_prot.exists():
        our_prot_seq = str(list(SeqIO.parse(our_prot, "fasta"))[0].seq)
        example_prot_seq = str(list(SeqIO.parse(example_prot, "fasta"))[0].seq)
        
        print(f"📋 {enzyme} Protein Comparison:")
        print(f"   Our length:     {len(our_prot_seq)} aa")
        print(f"   Example length: {len(example_prot_seq)} aa")
        
        if our_prot_seq == example_prot_seq:
            print(f"   ✅ PERFECT MATCH - Protein sequences identical!")
        else:
            similarity = SequenceMatcher(None, our_prot_seq, example_prot_seq).ratio()
            print(f"   ⚠️  Sequences differ (similarity: {similarity*100:.1f}%)")
            
            # Show first difference
            for i, (a, b) in enumerate(zip(our_prot_seq, example_prot_seq)):
                if a != b:
                    print(f"   First difference at position {i}: {a} vs {b}")
                    break
    
    print("\n" + "="*60)
    print("Validation complete!")
    print("="*60 + "\n")
else:
    print(f"⚠️  Example directory not found: {EXAMPLE_DIR}")
    print("   Skipping validation comparison.\n")


Extraction parameters: BESTN=1, PERCENT=30
Single reference mode + bestn=1 = ~3-4x faster!

TESTING: Single Species Extraction

Test species: Psilocybe_baeocystis_WTU-F-011245.scaffolds.fasta

[Psilocybe_baeocystis_WTU-F-011245.scaffolds] Processing genome with 111,256 scaffolds...


Psilocybe_baeocystis_WTU-F-011245.scaffolds:   0%|          | 0/4 [00:00<?, ?it/s]

  [Psilocybe_baeocystis_WTU-F-011245.scaffolds] Processing PsiD... done (4.0s)
  [Psilocybe_baeocystis_WTU-F-011245.scaffolds] Processing PsiK... done (3.8s)
  [Psilocybe_baeocystis_WTU-F-011245.scaffolds] Processing PsiM... done (3.7s)
  [Psilocybe_baeocystis_WTU-F-011245.scaffolds] Processing PsiH... done (5.4s)
[Psilocybe_baeocystis_WTU-F-011245.scaffolds] Done! Output → /Users/felix/Documents/20251015_Meeting_PsiloProteins_Felix/05_codex/cds_extraction_outputs/Psilocybe_baeocystis_WTU-F-011245.scaffolds

VALIDATION: Comparing with Example Outputs

📋 PsiD CDS Comparison:
   Our length:     618 bp
   Example length: 618 bp
   ✅ PERFECT MATCH - CDS sequences identical!

📋 PsiD Protein Comparison:
   Our length:     206 aa
   Example length: 206 aa
   ✅ PERFECT MATCH - Protein sequences identical!

Validation complete!



In [15]:
# === CELL 3: Batch Processing with Parallel Execution ===

import concurrent.futures as cf
import os
import csv
from datetime import datetime
from collections import Counter, defaultdict

all_scaff = sorted(SCAFF_DIR.glob("*.fa*"))
print(f"Total species to process: {len(all_scaff)}")

# ThreadPoolExecutor works in Jupyter (unlike ProcessPoolExecutor)
# Good for I/O-bound tasks like running exonerate subprocesses
max_workers = min(12, os.cpu_count() or 12)  # Adjust based on your system
print(f"Using {max_workers} parallel threads")
print("="*60 + "\n")

def process_one_species(scaff_path):
    """Worker function to process a single species"""
    try:
        return run_species(scaff_path, OUT_DIR, pbar=None, verbose=False)
    except Exception as e:
        import traceback
        return {
            "species": Path(scaff_path).stem,
            "error": str(e),
            "traceback": traceback.format_exc()
        }

started = datetime.now()
summaries = []

# Create progress bar
if TQDM_AVAILABLE:
    pbar = tqdm(total=len(all_scaff), desc="Processing species", unit="species")
else:
    pbar = None
    print(f"Processing {len(all_scaff)} species in parallel...")

# Parallel processing with ThreadPoolExecutor
with cf.ThreadPoolExecutor(max_workers=max_workers) as executor:
    # Submit all tasks
    future_to_species = {
        executor.submit(process_one_species, scaff): scaff.stem 
        for scaff in all_scaff
    }
    
    # Collect results as they complete
    for future in cf.as_completed(future_to_species):
        species_name = future_to_species[future]
        try:
            result = future.result()
            summaries.append(result)
            
            # Check for errors in result
            if "error" in result:
                print(f"\n✗ ERROR in {species_name}: {result['error']}")
            
            # Update progress
            if pbar:
                pbar.update(1)
            else:
                print(f"  [{len(summaries)}/{len(all_scaff)}] {species_name} - Done")
                
        except Exception as exc:
            print(f"\n✗ EXCEPTION in {species_name}: {exc}")
            summaries.append({
                "species": species_name,
                "error": str(exc)
            })
            if pbar:
                pbar.update(1)

if pbar:
    pbar.close()

elapsed = datetime.now() - started
print(f"\n{'='*60}")
print(f"Batch processing complete!")
print(f"Total time: {elapsed}")
print(f"Processed: {len(summaries)} species")
print(f"Average: {elapsed.total_seconds() / len(summaries):.1f} seconds per species")
print(f"{'='*60}\n")

# === Generate QC Reports ===
print("Generating QC reports...")

# Write extraction log
log_file = OUT_DIR / "extraction_log.json"
with open(log_file, "w") as fh:
    json.dump({
        "timestamp": started.isoformat(),
        "elapsed": str(elapsed),
        "total_species": len(all_scaff),
        "parallel_workers": max_workers,
        "summaries": summaries
    }, fh, indent=2)
print(f"  ✓ Extraction log: {log_file}")

# Collect problems for QC reports
problems = []
species_problem_counts = defaultdict(lambda: {"CHECK_ORF": 0, "NO_HIT": 0, "ERROR": 0})

for summary in summaries:
    species = summary.get("species", "unknown")
    
    # Handle species-level errors
    if "error" in summary:
        problems.append({
            "species": species,
            "enzyme": "ALL",
            "status": "ERROR",
            "details": summary["error"]
        })
        species_problem_counts[species]["ERROR"] += 1
        continue
    
    # Check each enzyme result
    for result in summary.get("results", []):
        enzyme = result.get("enzyme", "unknown")
        status = result.get("status", "UNKNOWN")
        
        if status in ["CHECK_ORF", "NO_HIT", "ERROR"]:
            details = result.get("error", f"Status: {status}")
            problems.append({
                "species": species,
                "enzyme": enzyme,
                "status": status,
                "details": details
            })
            species_problem_counts[species][status] += 1

# Write problems report
if problems:
    problems_tsv = OUT_DIR / "problems_report.tsv"
    with open(problems_tsv, "w", newline="") as fh:
        writer = csv.DictWriter(fh, fieldnames=["species", "enzyme", "status", "details"], delimiter="\t")
        writer.writeheader()
        writer.writerows(problems)
    print(f"  ✓ Problems report: {problems_tsv} ({len(problems)} issues)")
else:
    print("  ✓ No problems found - all extractions OK!")

# Write species problem summary
if species_problem_counts:
    summary_tsv = OUT_DIR / "species_problem_summary.tsv"
    with open(summary_tsv, "w", newline="") as fh:
        writer = csv.writer(fh, delimiter="\t")
        writer.writerow(["species", "total_problems", "CHECK_ORF", "NO_HIT", "ERROR"])
        for species in sorted(species_problem_counts.keys()):
            counts = species_problem_counts[species]
            total = sum(counts.values())
            writer.writerow([species, total, counts["CHECK_ORF"], counts["NO_HIT"], counts["ERROR"]])
    print(f"  ✓ Species summary: {summary_tsv} ({len(species_problem_counts)} species with issues)")

print(f"\n{'='*60}")
print("QC Reports Complete!")
print(f"{'='*60}")


Total species to process: 71
Using 12 parallel threads



Processing species:   0%|          | 0/71 [00:00<?, ?species/s]

Psilocybe_angustipleurocystidiata_ISOTYPE_NY-761597.scaffolds:   0%|          | 0/4 [00:00<?, ?it/s]

Psilocybe_argentina_SFSU-F-029894.scaffolds:   0%|          | 0/4 [00:00<?, ?it/s]

Psilocybe_azurescens_WTU-F19095-HOLOTYPE.scaffolds:   0%|          | 0/4 [00:00<?, ?it/s]

Psilocybe_argentipes_NY-1595850.scaffolds:   0%|          | 0/4 [00:00<?, ?it/s]

Psilocybe_azurescens_Lg-bg-ut.scaffolds:   0%|          | 0/4 [00:00<?, ?it/s]

Psilocybe_bohemica_SFSU-F-029930.scaffolds:   0%|          | 0/4 [00:01<?, ?it/s]

Psilocybe_arcana_ISOTYPE_SFSU-F-000737.scaffolds:   0%|          | 0/4 [00:00<?, ?it/s]

Psilocybe_baeocystis_WTU-F-011245.scaffolds:   0%|          | 0/4 [00:00<?, ?it/s]

Psilocybe_aztecorum_var_bonetii_NY-1595856.scaffolds:   0%|          | 0/4 [00:00<?, ?it/s]

Psilocybe_aztecorum_SFSU-F-029933.scaffolds:   0%|          | 0/4 [00:00<?, ?it/s]

Psilocybe_acutissima_GAM00011063.scaffolds:   0%|          | 0/4 [00:00<?, ?it/s]

Psilocybe_apelliculosa_UBC-F17545.scaffolds:   0%|          | 0/4 [00:00<?, ?it/s]

[Psilocybe_azurescens_WTU-F19095-HOLOTYPE.scaffolds] Done! Output → /Users/felix/Documents/20251015_Meeting_PsiloProteins_Felix/05_codex/cds_extraction_outputs/Psilocybe_azurescens_WTU-F19095-HOLOTYPE.scaffolds


Psilocybe_caerulescens_NY-1920304.scaffolds:   0%|          | 0/4 [00:00<?, ?it/s]

[Psilocybe_argentipes_NY-1595850.scaffolds] Done! Output → /Users/felix/Documents/20251015_Meeting_PsiloProteins_Felix/05_codex/cds_extraction_outputs/Psilocybe_argentipes_NY-1595850.scaffolds
[Psilocybe_argentina_SFSU-F-029894.scaffolds] Done! Output → /Users/felix/Documents/20251015_Meeting_PsiloProteins_Felix/05_codex/cds_extraction_outputs/Psilocybe_argentina_SFSU-F-029894.scaffolds
[Psilocybe_angustipleurocystidiata_ISOTYPE_NY-761597.scaffolds] Done! Output → /Users/felix/Documents/20251015_Meeting_PsiloProteins_Felix/05_codex/cds_extraction_outputs/Psilocybe_angustipleurocystidiata_ISOTYPE_NY-761597.scaffolds


Psilocybe_caerulipes_PUL00030154.scaffolds:   0%|          | 0/4 [00:00<?, ?it/s]

Psilocybe_callosa_NY-1595861.scaffolds:   0%|          | 0/4 [00:00<?, ?it/s]

[Psilocybe_bohemica_SFSU-F-029930.scaffolds] Done! Output → /Users/felix/Documents/20251015_Meeting_PsiloProteins_Felix/05_codex/cds_extraction_outputs/Psilocybe_bohemica_SFSU-F-029930.scaffolds
[Psilocybe_arcana_ISOTYPE_SFSU-F-000737.scaffolds] Done! Output → /Users/felix/Documents/20251015_Meeting_PsiloProteins_Felix/05_codex/cds_extraction_outputs/Psilocybe_arcana_ISOTYPE_SFSU-F-000737.scaffolds
[Psilocybe_baeocystis_WTU-F-011245.scaffolds] Done! Output → /Users/felix/Documents/20251015_Meeting_PsiloProteins_Felix/05_codex/cds_extraction_outputs/Psilocybe_baeocystis_WTU-F-011245.scaffolds


Psilocybe_chionophila_OSC-113991.scaffolds:   0%|          | 0/4 [00:00<?, ?it/s]

[Psilocybe_azurescens_Lg-bg-ut.scaffolds] Done! Output → /Users/felix/Documents/20251015_Meeting_PsiloProteins_Felix/05_codex/cds_extraction_outputs/Psilocybe_azurescens_Lg-bg-ut.scaffolds


Psilocybe_columbiana_ISOTYPE_NY-761607.scaffolds:   0%|          | 0/4 [00:00<?, ?it/s]

[Psilocybe_aztecorum_SFSU-F-029933.scaffolds] Done! Output → /Users/felix/Documents/20251015_Meeting_PsiloProteins_Felix/05_codex/cds_extraction_outputs/Psilocybe_aztecorum_SFSU-F-029933.scaffolds


Psilocybe_clavata_ISOTYPE_NY-761604.scaffolds:   0%|          | 0/4 [00:00<?, ?it/s]

Psilocybe_cubensis_IBUG-4367.scaffolds:   0%|          | 0/4 [00:00<?, ?it/s]

Psilocybe_caerulescens_var_mazatecorum_SFSU-F-029971.scaffolds:   0%|          | 0/4 [00:00<?, ?it/s]

Psilocybe_congolensis_ISOTYPE_NY-1652567.scaffolds:   0%|          | 0/4 [00:00<?, ?it/s]

[Psilocybe_caerulescens_NY-1920304.scaffolds] Done! Output → /Users/felix/Documents/20251015_Meeting_PsiloProteins_Felix/05_codex/cds_extraction_outputs/Psilocybe_caerulescens_NY-1920304.scaffolds
[Psilocybe_aztecorum_var_bonetii_NY-1595856.scaffolds] Done! Output → /Users/felix/Documents/20251015_Meeting_PsiloProteins_Felix/05_codex/cds_extraction_outputs/Psilocybe_aztecorum_var_bonetii_NY-1595856.scaffolds


Psilocybe_cyanescens_WTU-F-011306.scaffolds:   0%|          | 0/4 [00:00<?, ?it/s]

Psilocybe_cyanofibrillosa_ISOTYPE_NY-761605.scaffolds:   0%|          | 0/4 [00:00<?, ?it/s]

[Psilocybe_cubensis_IBUG-4367.scaffolds] Done! Output → /Users/felix/Documents/20251015_Meeting_PsiloProteins_Felix/05_codex/cds_extraction_outputs/Psilocybe_cubensis_IBUG-4367.scaffolds


Psilocybe_fagicola_var_mesocystidiata_NY-761608.scaffolds:   0%|          | 0/4 [00:00<?, ?it/s]

[Psilocybe_columbiana_ISOTYPE_NY-761607.scaffolds] Done! Output → /Users/felix/Documents/20251015_Meeting_PsiloProteins_Felix/05_codex/cds_extraction_outputs/Psilocybe_columbiana_ISOTYPE_NY-761607.scaffolds


Psilocybe_fimetaria_UBC-F30923.scaffolds:   0%|          | 0/4 [00:00<?, ?it/s]

[Psilocybe_caerulipes_PUL00030154.scaffolds] Done! Output → /Users/felix/Documents/20251015_Meeting_PsiloProteins_Felix/05_codex/cds_extraction_outputs/Psilocybe_caerulipes_PUL00030154.scaffolds
[Psilocybe_cyanescens_WTU-F-011306.scaffolds] Done! Output → /Users/felix/Documents/20251015_Meeting_PsiloProteins_Felix/05_codex/cds_extraction_outputs/Psilocybe_cyanescens_WTU-F-011306.scaffolds
[Psilocybe_chionophila_OSC-113991.scaffolds] Done! Output → /Users/felix/Documents/20251015_Meeting_PsiloProteins_Felix/05_codex/cds_extraction_outputs/Psilocybe_chionophila_OSC-113991.scaffolds


Psilocybe_galindii_ISOTYPE_NY-761609.scaffolds:   0%|          | 0/4 [00:00<?, ?it/s]

Psilocybe_fuliginosa_NY-1901148.scaffolds:   0%|          | 0/4 [00:00<?, ?it/s]

[Psilocybe_callosa_NY-1595861.scaffolds] Done! Output → /Users/felix/Documents/20251015_Meeting_PsiloProteins_Felix/05_codex/cds_extraction_outputs/Psilocybe_callosa_NY-1595861.scaffolds


Psilocybe_guilartensis_PARATYPE_CFMR-PR-5680.scaffolds:   0%|          | 0/4 [00:00<?, ?it/s]

Psilocybe_heimii_ISOTYPE_NY-761610.scaffolds:   0%|          | 0/4 [00:00<?, ?it/s]

[Psilocybe_acutissima_GAM00011063.scaffolds] Done! Output → /Users/felix/Documents/20251015_Meeting_PsiloProteins_Felix/05_codex/cds_extraction_outputs/Psilocybe_acutissima_GAM00011063.scaffolds


Psilocybe_hoogshageni_SFSU-F-029980.scaffolds:   0%|          | 0/4 [00:00<?, ?it/s]

[Psilocybe_apelliculosa_UBC-F17545.scaffolds] Done! Output → /Users/felix/Documents/20251015_Meeting_PsiloProteins_Felix/05_codex/cds_extraction_outputs/Psilocybe_apelliculosa_UBC-F17545.scaffolds
[Psilocybe_cyanofibrillosa_ISOTYPE_NY-761605.scaffolds] Done! Output → /Users/felix/Documents/20251015_Meeting_PsiloProteins_Felix/05_codex/cds_extraction_outputs/Psilocybe_cyanofibrillosa_ISOTYPE_NY-761605.scaffolds


Psilocybe_hoogshagenii_var_convexa_ISOTYPE_NY-761612.scaffolds:   0%|          | 0/4 [00:00<?, ?it/s]

Psilocybe_hopii_ISOTYPE_XAL-#.scaffolds:   0%|          | 0/4 [00:00<?, ?it/s]


✗ ERROR in Psilocybe_fuliginosa_NY-1901148.scaffolds: Command failed: exonerate --model protein2genome --showtargetgff yes --showalignment no --showvulgar no --bestn 1 --percent 30 /Users/felix/Documents/20251015_Meeting_PsiloProteins_Felix/05_codex/reference_panels/Proteins_PsiM/PsiM_refs.faa /Users/felix/Documents/20251015_Meeting_PsiloProteins_Felix/03_Sequences_Paper/Assembly_scaffolds/Psilocybe_fuliginosa_NY-1901148.scaffolds.fasta
STDERR:



Psilocybe_laticystis_UBC-F16759.scaffolds:   0%|          | 0/4 [00:00<?, ?it/s]

[Psilocybe_fagicola_var_mesocystidiata_NY-761608.scaffolds] Done! Output → /Users/felix/Documents/20251015_Meeting_PsiloProteins_Felix/05_codex/cds_extraction_outputs/Psilocybe_fagicola_var_mesocystidiata_NY-761608.scaffolds


Psilocybe_lazoi_ISOTYPE_NY-761614.scaffolds:   0%|          | 0/4 [00:00<?, ?it/s]

[Psilocybe_clavata_ISOTYPE_NY-761604.scaffolds] Done! Output → /Users/felix/Documents/20251015_Meeting_PsiloProteins_Felix/05_codex/cds_extraction_outputs/Psilocybe_clavata_ISOTYPE_NY-761604.scaffolds


Psilocybe_liniformans_var_americana_NY-1797145.scaffolds:   0%|          | 0/4 [00:00<?, ?it/s]

[Psilocybe_heimii_ISOTYPE_NY-761610.scaffolds] Done! Output → /Users/felix/Documents/20251015_Meeting_PsiloProteins_Felix/05_codex/cds_extraction_outputs/Psilocybe_heimii_ISOTYPE_NY-761610.scaffolds


Psilocybe_magica_OSC-111954.scaffolds:   0%|          | 0/4 [00:00<?, ?it/s]

[Psilocybe_fimetaria_UBC-F30923.scaffolds] Done! Output → /Users/felix/Documents/20251015_Meeting_PsiloProteins_Felix/05_codex/cds_extraction_outputs/Psilocybe_fimetaria_UBC-F30923.scaffolds


Psilocybe_mexicana_IBUG-13593.scaffolds:   0%|          | 0/4 [00:00<?, ?it/s]

[Psilocybe_galindii_ISOTYPE_NY-761609.scaffolds] Done! Output → /Users/felix/Documents/20251015_Meeting_PsiloProteins_Felix/05_codex/cds_extraction_outputs/Psilocybe_galindii_ISOTYPE_NY-761609.scaffolds


Psilocybe_montana_OSC-113978.scaffolds:   0%|          | 0/4 [00:00<?, ?it/s]

[Psilocybe_hoogshageni_SFSU-F-029980.scaffolds] Done! Output → /Users/felix/Documents/20251015_Meeting_PsiloProteins_Felix/05_codex/cds_extraction_outputs/Psilocybe_hoogshageni_SFSU-F-029980.scaffolds


Psilocybe_moravica_ISOTYPE_SFSU-F-000732.scaffolds:   0%|          | 0/4 [00:00<?, ?it/s]

[Psilocybe_hopii_ISOTYPE_XAL-#.scaffolds] Done! Output → /Users/felix/Documents/20251015_Meeting_PsiloProteins_Felix/05_codex/cds_extraction_outputs/Psilocybe_hopii_ISOTYPE_XAL-#.scaffolds


Psilocybe_muliercula_GAM00011071.scaffolds:   0%|          | 0/4 [00:00<?, ?it/s]

[Psilocybe_guilartensis_PARATYPE_CFMR-PR-5680.scaffolds] Done! Output → /Users/felix/Documents/20251015_Meeting_PsiloProteins_Felix/05_codex/cds_extraction_outputs/Psilocybe_guilartensis_PARATYPE_CFMR-PR-5680.scaffolds


Psilocybe_ovoideocystidiata_ISOTYPE_XAL-51B.scaffolds:   0%|          | 0/4 [00:00<?, ?it/s]

[Psilocybe_congolensis_ISOTYPE_NY-1652567.scaffolds] Done! Output → /Users/felix/Documents/20251015_Meeting_PsiloProteins_Felix/05_codex/cds_extraction_outputs/Psilocybe_congolensis_ISOTYPE_NY-1652567.scaffolds
[Psilocybe_mexicana_IBUG-13593.scaffolds] Done! Output → /Users/felix/Documents/20251015_Meeting_PsiloProteins_Felix/05_codex/cds_extraction_outputs/Psilocybe_mexicana_IBUG-13593.scaffolds


Psilocybe_pelliculosa_WTU-F-012331.scaffolds:   0%|          | 0/4 [00:00<?, ?it/s]


✗ ERROR in Psilocybe_moravica_ISOTYPE_SFSU-F-000732.scaffolds: Command failed: exonerate --model protein2genome --showtargetgff yes --showalignment no --showvulgar no --bestn 1 --percent 30 /Users/felix/Documents/20251015_Meeting_PsiloProteins_Felix/05_codex/reference_panels/Proteins_PsiK/PsiK_refs.faa /Users/felix/Documents/20251015_Meeting_PsiloProteins_Felix/03_Sequences_Paper/Assembly_scaffolds/Psilocybe_moravica_ISOTYPE_SFSU-F-000732.scaffolds.fasta
STDERR:



Psilocybe_phyllogena_OSC-114015.scaffolds:   0%|          | 0/4 [00:00<?, ?it/s]

Psilocybe_pleurocystidiosa_ISOTYPE_NY-761619.scaffolds:   0%|          | 0/4 [00:00<?, ?it/s]

[Psilocybe_laticystis_UBC-F16759.scaffolds] Done! Output → /Users/felix/Documents/20251015_Meeting_PsiloProteins_Felix/05_codex/cds_extraction_outputs/Psilocybe_laticystis_UBC-F16759.scaffolds


Psilocybe_polytrichophila_NY-1901129.scaffolds:   0%|          | 0/4 [00:00<?, ?it/s]

[Psilocybe_hoogshagenii_var_convexa_ISOTYPE_NY-761612.scaffolds] Done! Output → /Users/felix/Documents/20251015_Meeting_PsiloProteins_Felix/05_codex/cds_extraction_outputs/Psilocybe_hoogshagenii_var_convexa_ISOTYPE_NY-761612.scaffolds


Psilocybe_portoricensis_CFMR-PR-4572.scaffolds:   0%|          | 0/4 [00:00<?, ?it/s]

[Psilocybe_lazoi_ISOTYPE_NY-761614.scaffolds] Done! Output → /Users/felix/Documents/20251015_Meeting_PsiloProteins_Felix/05_codex/cds_extraction_outputs/Psilocybe_lazoi_ISOTYPE_NY-761614.scaffolds
[Psilocybe_montana_OSC-113978.scaffolds] Done! Output → /Users/felix/Documents/20251015_Meeting_PsiloProteins_Felix/05_codex/cds_extraction_outputs/Psilocybe_montana_OSC-113978.scaffolds

✗ ERROR in Psilocybe_polytrichophila_NY-1901129.scaffolds: Command failed: exonerate --model protein2genome --showtargetgff yes --showalignment no --showvulgar no --bestn 1 --percent 30 /Users/felix/Documents/20251015_Meeting_PsiloProteins_Felix/05_codex/reference_panels/Proteins_PsiK/PsiK_refs.faa /Users/felix/Documents/20251015_Meeting_PsiloProteins_Felix/03_Sequences_Paper/Assembly_scaffolds/Psilocybe_polytrichophila_NY-1901129.scaffolds.fasta
STDERR:



Psilocybe_quebecensis_NY-1901130.scaffolds:   0%|          | 0/4 [00:00<?, ?it/s]

Psilocybe_rhomboidospora_CMMF003424.scaffolds:   0%|          | 0/4 [00:00<?, ?it/s]

[Psilocybe_magica_OSC-111954.scaffolds] Done! Output → /Users/felix/Documents/20251015_Meeting_PsiloProteins_Felix/05_codex/cds_extraction_outputs/Psilocybe_magica_OSC-111954.scaffolds


Psilocybe_samuiensis_WTU-F-055014.scaffolds:   0%|          | 0/4 [00:00<?, ?it/s]

Psilocybe_sabulosa_UBC-F13505.scaffolds:   0%|          | 0/4 [00:00<?, ?it/s]

[Psilocybe_ovoideocystidiata_ISOTYPE_XAL-51B.scaffolds] Done! Output → /Users/felix/Documents/20251015_Meeting_PsiloProteins_Felix/05_codex/cds_extraction_outputs/Psilocybe_ovoideocystidiata_ISOTYPE_XAL-51B.scaffolds
[Psilocybe_phyllogena_OSC-114015.scaffolds] Done! Output → /Users/felix/Documents/20251015_Meeting_PsiloProteins_Felix/05_codex/cds_extraction_outputs/Psilocybe_phyllogena_OSC-114015.scaffolds
[Psilocybe_samuiensis_WTU-F-055014.scaffolds] Done! Output → /Users/felix/Documents/20251015_Meeting_PsiloProteins_Felix/05_codex/cds_extraction_outputs/Psilocybe_samuiensis_WTU-F-055014.scaffolds


Psilocybe_semilanceata_SFSU-F-029972.scaffolds:   0%|          | 0/4 [00:00<?, ?it/s]

[Psilocybe_pelliculosa_WTU-F-012331.scaffolds] Done! Output → /Users/felix/Documents/20251015_Meeting_PsiloProteins_Felix/05_codex/cds_extraction_outputs/Psilocybe_pelliculosa_WTU-F-012331.scaffolds


Psilocybe_silvatica_VPI-F-0003693.scaffolds:   0%|          | 0/4 [00:00<?, ?it/s]

Psilocybe_singeri_ISOTYPE_NY-761622.scaffolds:   0%|          | 0/4 [00:00<?, ?it/s]

Psilocybe_strictipes_WTU-F-011411.scaffolds:   0%|          | 0/4 [00:00<?, ?it/s]

[Psilocybe_pleurocystidiosa_ISOTYPE_NY-761619.scaffolds] Done! Output → /Users/felix/Documents/20251015_Meeting_PsiloProteins_Felix/05_codex/cds_extraction_outputs/Psilocybe_pleurocystidiosa_ISOTYPE_NY-761619.scaffolds


Psilocybe_stuntzii_WTU-F-011520.scaffolds:   0%|          | 0/4 [00:00<?, ?it/s]

[Psilocybe_liniformans_var_americana_NY-1797145.scaffolds] Done! Output → /Users/felix/Documents/20251015_Meeting_PsiloProteins_Felix/05_codex/cds_extraction_outputs/Psilocybe_liniformans_var_americana_NY-1797145.scaffolds
[Psilocybe_rhomboidospora_CMMF003424.scaffolds] Done! Output → /Users/felix/Documents/20251015_Meeting_PsiloProteins_Felix/05_codex/cds_extraction_outputs/Psilocybe_rhomboidospora_CMMF003424.scaffolds


Psilocybe_subcubensis_SFSU-29974.scaffolds:   0%|          | 0/4 [00:00<?, ?it/s]

Psilocybe_subcoprophila_UBC-F977.scaffolds:   0%|          | 0/4 [00:00<?, ?it/s]

[Psilocybe_sabulosa_UBC-F13505.scaffolds] Done! Output → /Users/felix/Documents/20251015_Meeting_PsiloProteins_Felix/05_codex/cds_extraction_outputs/Psilocybe_sabulosa_UBC-F13505.scaffolds


Psilocybe_subfimetaria_SFSU-F-029945.scaffolds:   0%|          | 0/4 [00:00<?, ?it/s]

[Psilocybe_stuntzii_WTU-F-011520.scaffolds] Done! Output → /Users/felix/Documents/20251015_Meeting_PsiloProteins_Felix/05_codex/cds_extraction_outputs/Psilocybe_stuntzii_WTU-F-011520.scaffolds


Psilocybe_subhoogshagenii_NY-915004.scaffolds:   0%|          | 0/4 [00:00<?, ?it/s]

[Psilocybe_subcubensis_SFSU-29974.scaffolds] Done! Output → /Users/felix/Documents/20251015_Meeting_PsiloProteins_Felix/05_codex/cds_extraction_outputs/Psilocybe_subcubensis_SFSU-29974.scaffolds


Psilocybe_subpsilocybioides_HOLOTYPE_CFMR-PR-5689.scaffolds:   0%|          | 0/4 [00:00<?, ?it/s]

[Psilocybe_strictipes_WTU-F-011411.scaffolds] Done! Output → /Users/felix/Documents/20251015_Meeting_PsiloProteins_Felix/05_codex/cds_extraction_outputs/Psilocybe_strictipes_WTU-F-011411.scaffolds


Psilocybe_subviscida_VPI-F-0003697.scaffolds:   0%|          | 0/4 [00:00<?, ?it/s]

[Psilocybe_portoricensis_CFMR-PR-4572.scaffolds] Done! Output → /Users/felix/Documents/20251015_Meeting_PsiloProteins_Felix/05_codex/cds_extraction_outputs/Psilocybe_portoricensis_CFMR-PR-4572.scaffolds


Psilocybe_subyungensis_ISOTYPE_NY-1197500.scaffolds:   0%|          | 0/4 [00:00<?, ?it/s]

[Psilocybe_silvatica_VPI-F-0003693.scaffolds] Done! Output → /Users/felix/Documents/20251015_Meeting_PsiloProteins_Felix/05_codex/cds_extraction_outputs/Psilocybe_silvatica_VPI-F-0003693.scaffolds


Psilocybe_tampanensis_UBC-F10177.scaffolds:   0%|          | 0/4 [00:00<?, ?it/s]

[Psilocybe_singeri_ISOTYPE_NY-761622.scaffolds] Done! Output → /Users/felix/Documents/20251015_Meeting_PsiloProteins_Felix/05_codex/cds_extraction_outputs/Psilocybe_singeri_ISOTYPE_NY-761622.scaffolds
[Psilocybe_semilanceata_SFSU-F-029972.scaffolds] Done! Output → /Users/felix/Documents/20251015_Meeting_PsiloProteins_Felix/05_codex/cds_extraction_outputs/Psilocybe_semilanceata_SFSU-F-029972.scaffolds


Psilocybe_tuberosa_WTU-F-011378.scaffolds:   0%|          | 0/4 [00:00<?, ?it/s]

Psilocybe_washingtonensis_WTU-F-055019.scaffolds:   0%|          | 0/4 [00:00<?, ?it/s]

[Psilocybe_subhoogshagenii_NY-915004.scaffolds] Done! Output → /Users/felix/Documents/20251015_Meeting_PsiloProteins_Felix/05_codex/cds_extraction_outputs/Psilocybe_subhoogshagenii_NY-915004.scaffolds


Psilocybe_weilii_ISOTYPE_WTU-F-063525.scaffolds:   0%|          | 0/4 [00:00<?, ?it/s]

[Psilocybe_subfimetaria_SFSU-F-029945.scaffolds] Done! Output → /Users/felix/Documents/20251015_Meeting_PsiloProteins_Felix/05_codex/cds_extraction_outputs/Psilocybe_subfimetaria_SFSU-F-029945.scaffolds
[Psilocybe_tampanensis_UBC-F10177.scaffolds] Done! Output → /Users/felix/Documents/20251015_Meeting_PsiloProteins_Felix/05_codex/cds_extraction_outputs/Psilocybe_tampanensis_UBC-F10177.scaffolds


Psilocybe_xalapensis_ISOTYPE_NY-761630.scaffolds:   0%|          | 0/4 [00:00<?, ?it/s]

Psilocybe_wrightii_ISOTYPE_NY-761629.scaffolds:   0%|          | 0/4 [00:00<?, ?it/s]

[Psilocybe_muliercula_GAM00011071.scaffolds] Done! Output → /Users/felix/Documents/20251015_Meeting_PsiloProteins_Felix/05_codex/cds_extraction_outputs/Psilocybe_muliercula_GAM00011071.scaffolds


Psilocybe_yungensis_SFSU-29944.scaffolds:   0%|          | 0/4 [00:00<?, ?it/s]

[Psilocybe_weilii_ISOTYPE_WTU-F-063525.scaffolds] Done! Output → /Users/felix/Documents/20251015_Meeting_PsiloProteins_Felix/05_codex/cds_extraction_outputs/Psilocybe_weilii_ISOTYPE_WTU-F-063525.scaffolds


Psilocybe_zapotecorum_FFCL689.scaffolds:   0%|          | 0/4 [00:00<?, ?it/s]

[Psilocybe_subpsilocybioides_HOLOTYPE_CFMR-PR-5689.scaffolds] Done! Output → /Users/felix/Documents/20251015_Meeting_PsiloProteins_Felix/05_codex/cds_extraction_outputs/Psilocybe_subpsilocybioides_HOLOTYPE_CFMR-PR-5689.scaffolds
[Psilocybe_subviscida_VPI-F-0003697.scaffolds] Done! Output → /Users/felix/Documents/20251015_Meeting_PsiloProteins_Felix/05_codex/cds_extraction_outputs/Psilocybe_subviscida_VPI-F-0003697.scaffolds


Psilocybe_zapotecorum_GAM00011076.scaffolds:   0%|          | 0/4 [00:00<?, ?it/s]

[Psilocybe_yungensis_SFSU-29944.scaffolds] Done! Output → /Users/felix/Documents/20251015_Meeting_PsiloProteins_Felix/05_codex/cds_extraction_outputs/Psilocybe_yungensis_SFSU-29944.scaffolds
[Psilocybe_quebecensis_NY-1901130.scaffolds] Done! Output → /Users/felix/Documents/20251015_Meeting_PsiloProteins_Felix/05_codex/cds_extraction_outputs/Psilocybe_quebecensis_NY-1901130.scaffolds
[Psilocybe_zapotecorum_FFCL689.scaffolds] Done! Output → /Users/felix/Documents/20251015_Meeting_PsiloProteins_Felix/05_codex/cds_extraction_outputs/Psilocybe_zapotecorum_FFCL689.scaffolds
[Psilocybe_xalapensis_ISOTYPE_NY-761630.scaffolds] Done! Output → /Users/felix/Documents/20251015_Meeting_PsiloProteins_Felix/05_codex/cds_extraction_outputs/Psilocybe_xalapensis_ISOTYPE_NY-761630.scaffolds
[Psilocybe_zapotecorum_GAM00011076.scaffolds] Done! Output → /Users/felix/Documents/20251015_Meeting_PsiloProteins_Felix/05_codex/cds_extraction_outputs/Psilocybe_zapotecorum_GAM00011076.scaffolds
[Psilocybe_caerulesce

In [4]:
# === CELL 4: Build MAFFT alignments & intron summary ===
import json
import shutil
import subprocess
import multiprocessing
from collections import defaultdict

import pandas as pd
from Bio import SeqIO, AlignIO

ALIGN_DIR = BASE / "mafft_alignments"
ALIGN_DIR.mkdir(parents=True, exist_ok=True)

mafft_bin = shutil.which("mafft")
if mafft_bin is None:
    raise FileNotFoundError("MAFFT executable not found. Install MAFFT and ensure it is on PATH before running this cell.")

alignment_inputs = defaultdict(list)
summary_rows = []

for species_dir in sorted(OUT_DIR.iterdir()):
    if not species_dir.is_dir():
        continue
    summary_path = species_dir / f"{species_dir.name}_summary.json"
    if not summary_path.exists():
        continue

    with open(summary_path) as handle:
        species_summary = json.load(handle)

    species_name = species_summary.get("species", species_dir.name)
    for result in species_summary.get("results", []):
        enzyme = result.get("enzyme")
        if not enzyme:
            continue

        status = result.get("status", "UNKNOWN")
        blocks = result.get("blocks") or 0
        intron_count = max(blocks - 1, 0)

        prot_path = species_dir / f"{species_dir.name}_{enzyme}.prot.fa"
        cds_path = species_dir / f"{species_dir.name}_{enzyme}.cds.fa"

        row = {
            "species": species_name,
            "enzyme": enzyme,
            "status": status,
            "nt_len": result.get("nt_len"),
            "aa_len": None,
            "blocks": blocks,
            "introns": intron_count,
            "strand": result.get("strand"),
            "seqid": result.get("seqid"),
            "prot_path": str(prot_path if prot_path.exists() else ""),
            "cds_path": str(cds_path if cds_path.exists() else ""),
        }

        if status in {"NO_HIT", "ERROR"} or not prot_path.exists():
            summary_rows.append(row)
            continue

        record = next(SeqIO.parse(prot_path, "fasta"))
        record.id = f"{species_name}|{enzyme}"
        record.description = f"status={status};introns={intron_count};nt_len={result.get('nt_len')}"
        row["aa_len"] = len(record.seq)

        alignment_inputs[enzyme].append(record)
        summary_rows.append(row)

# add reference sequences so each alignment is anchored to Ps. cubensis
for enzyme, ref_path in REFS.items():
    ref_records = list(SeqIO.parse(ref_path, "fasta"))
    for rec in ref_records:
        rec.id = f"Reference|{rec.id}"
        rec.description = rec.description or "reference"
    alignment_inputs[enzyme] = ref_records + alignment_inputs[enzyme]

ALIGNMENTS = {}
ALIGNMENT_PATHS = {}
ALIGNMENT_TABLES = {}
ALIGNMENT_LONG = {}
threads = max(1, (multiprocessing.cpu_count() or 1) - 1)

for enzyme, records in alignment_inputs.items():
    if not records:
        continue

    input_fasta = ALIGN_DIR / f"{enzyme}_mafft_input.faa"
    output_fasta = ALIGN_DIR / f"{enzyme}_mafft_aligned.faa"

    SeqIO.write(records, input_fasta, "fasta")
    cmd = [mafft_bin, "--auto", "--thread", str(threads), str(input_fasta)]
    run = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
    if run.returncode != 0:
        raise RuntimeError(f"MAFFT failed for {enzyme}:\n{run.stderr}")

    output_fasta.write_text(run.stdout)
    alignment = AlignIO.read(output_fasta, "fasta")
    ALIGNMENTS[enzyme] = alignment
    ALIGNMENT_PATHS[enzyme] = output_fasta

    table = pd.DataFrame([list(rec.seq) for rec in alignment], index=[rec.id for rec in alignment])
    table.columns = range(1, table.shape[1] + 1)
    ALIGNMENT_TABLES[enzyme] = table

    long_df = table.reset_index().melt(id_vars="index", var_name="position", value_name="residue")
    ALIGNMENT_LONG[enzyme] = long_df.rename(columns={"index": "sequence_id"})

alignment_metadata = pd.DataFrame(summary_rows)
alignment_metadata.to_csv(ALIGN_DIR / "alignment_metadata.tsv", sep="\t", index=False)

print(f"MAFFT alignments written to {ALIGN_DIR}")
print("Enzymes aligned:", ", ".join(sorted(ALIGNMENTS)))


MAFFT alignments written to /Users/felix/Documents/20251015_Meeting_PsiloProteins_Felix/05_codex/mafft_alignments
Enzymes aligned: PsiD, PsiH, PsiK, PsiM


In [8]:
# === CELL 5: Plotly amino-acid alignment explorer ===
import plotly.graph_objects as go
from plotly.colors import qualitative

try:
    import ipywidgets as widgets
    HAVE_WIDGETS = True
except ImportError:
    HAVE_WIDGETS = False
    print("ipywidgets not found; install it for the dropdown interface (pip install ipywidgets)")

BASE_PALETTE = (
    qualitative.Dark24
    + qualitative.Light24
    + qualitative.Set3
    + qualitative.Bold
)

def plot_alignment(enzyme: str) -> go.Figure:
    if enzyme not in ALIGNMENT_TABLES:
        raise ValueError(f"No alignment prepared for {enzyme}")

    table = ALIGNMENT_TABLES[enzyme].copy()
    table.index = table.index.map(str)
    seq_ids = table.index.tolist()
    positions = table.columns.astype(int).tolist()
    residues = table.astype(str).values

    uniques = sorted({res for row in residues for res in row})
    if "-" in uniques:
        uniques.remove("-")
        uniques.append("-")

    color_lookup = {
        symbol: BASE_PALETTE[i % len(BASE_PALETTE)]
        for i, symbol in enumerate(uniques)
    }
    color_lookup.setdefault("-", "#e0e0e0")

    codes = {symbol: i for i, symbol in enumerate(uniques)}
    z = [[codes[aa] for aa in row] for row in residues]
    colorscale = [
        (0 if len(uniques) == 1 else i / (len(uniques) - 1), color_lookup[symbol])
        for i, symbol in enumerate(uniques)
    ]
    colorbar_ticks = list(codes.values())
    colorbar_labels = list(codes.keys())

    hover = [
        [
            f"{seq_ids[r]}<br>Pos {positions[c]}<br>Residue: {residues[r][c]}"
            for c in range(len(positions))
        ]
        for r in range(len(seq_ids))
    ]

    meta = alignment_metadata[alignment_metadata["enzyme"] == enzyme].copy()
    meta.index = meta["species"] + "|" + meta["enzyme"]
    intron_text = [f"{meta.loc[s,'introns']} introns"
                   if s in meta.index else "reference"
                   for s in seq_ids]

    fig = go.Figure(
        data=[
            go.Heatmap(
                z=z,
                x=positions,
                y=seq_ids,
                colorscale=colorscale,
                zmin=0,
                zmax=len(uniques) - 1 if uniques else 1,
                colorbar=dict(
                    title="Residue",
                    tickvals=colorbar_ticks,
                    ticktext=colorbar_labels,
                ),
                hoverinfo="text",
                text=hover,
            ),
            go.Scatter(
                x=[positions[-1] + 5] * len(seq_ids),
                y=seq_ids,
                mode="text",
                text=intron_text,
                name="Introns",
                showlegend=False,
                textfont=dict(size=10, color="#303030"),
                hoverinfo="skip",
            ),
        ]
    )

    reference_rows = [sid for sid in seq_ids if sid.startswith("Reference|")]
    for ref_id in reference_rows:
        fig.add_hrect(
            y0=ref_id,
            y1=ref_id,
            yref="y",
            line_width=1.5,
            line_color="#222222",
            opacity=0.15,
        )

    fig.update_layout(
        title=f"{enzyme}: MAFFT amino-acid alignment ({len(seq_ids)} sequences)",
        xaxis=dict(title="Alignment position"),
        yaxis=dict(title="Sequence", automargin=True),
        margin=dict(l=150, r=180, t=60, b=60),
        height=400 + 20 * len(seq_ids),
    )
    return fig

def display_alignment(enzyme: str):
    fig = plot_alignment(enzyme)
    fig.show()

if HAVE_WIDGETS:
    widgets.interact(display_alignment, enzyme=sorted(ALIGNMENT_TABLES.keys()))
else:
    display_alignment(sorted(ALIGNMENT_TABLES.keys())[0])


interactive(children=(Dropdown(description='enzyme', options=('PsiD', 'PsiH', 'PsiK', 'PsiM'), value='PsiD'), …

In [15]:
# === CELL 5 (rev): Publication-style Plotly heatmap ===
import plotly.graph_objects as go
from plotly.colors import hex_to_rgb
try:
    import ipywidgets as widgets
    HAVE_WIDGETS = True
except ImportError:
    HAVE_WIDGETS = False
    print("ipywidgets not installed; the dropdown selector will be unavailable.")

# Residue categories inspired by biochemical properties
PROPERTY_MAP = {
    "Hydrophobic": list("AILMVFWP"),
    "Polar": list("STNQCYG"),
    "Positive": list("KRH"),
    "Negative": list("DE"),
    "Aromatic": list("FYWH"),
    "Special": ["G", "P"],
    "Stop/Other": ["*", "X", "U"],
}
CATEGORY_COLORS = {
    "Hydrophobic": "#1b9e77",
    "Polar": "#66a61e",
    "Positive": "#386cb0",
    "Negative": "#ef3b2c",
    "Aromatic": "#a6761d",
    "Special": "#7570b3",
    "Stop/Other": "#b8b8b8",
    "Gap": "#e6e6e6",
}
RESIDUE_TO_COLOR = {}
for category, residues in PROPERTY_MAP.items():
    for res in residues:
        RESIDUE_TO_COLOR.setdefault(res, CATEGORY_COLORS[category])
RESIDUE_TO_COLOR["-"] = CATEGORY_COLORS["Gap"]

def _build_colorscale(symbols):
    palette = []
    for i, symbol in enumerate(symbols):
        color = RESIDUE_TO_COLOR.get(symbol, CATEGORY_COLORS["Stop/Other"])
        if len(symbols) == 1:
            palette.append((0.0, color))
        else:
            palette.append((i / (len(symbols) - 1), color))
    return palette

def _make_hover(seq_ids, positions, residues, metadata, enzyme):
    hover = []
    for r, seq in enumerate(seq_ids):
        rows = []
        introns = "reference"
        if seq in metadata.index:
            introns = f"{metadata.loc[seq, 'introns']} introns"
        for c, pos in enumerate(positions):
            residue = residues[r][c]
            rows.append(
                f"<b>{seq}</b><br>Position {pos}<br>Residue: {residue}<br>{introns}"
            )
        hover.append(rows)
    return hover

def _add_property_legend(fig):
    for i, (category, color) in enumerate(CATEGORY_COLORS.items()):
        if category == "Gap":
            continue
        fig.add_trace(
            go.Scatter(
                x=[None],
                y=[None],
                mode="markers",
                marker=dict(size=12, symbol="square", color=color),
                legendgroup="residue_legend",
                showlegend=True,
                name=category,
            )
        )

def plot_alignment_pretty(enzyme: str) -> go.Figure:
    if enzyme not in ALIGNMENT_TABLES:
        raise ValueError(f"No MAFFT alignment found for {enzyme}")

    table = ALIGNMENT_TABLES[enzyme].copy()
    seq_ids = table.index.map(str).tolist()
    positions = table.columns.astype(int).tolist()
    residues = table.astype(str).values

    symbols = sorted({res for row in residues for res in row})
    if "-" in symbols:
        symbols.remove("-")
        symbols.append("-")
    colorscale = _build_colorscale(symbols)
    symbol_to_idx = {sym: idx for idx, sym in enumerate(symbols)}
    z = [[symbol_to_idx.get(res, symbol_to_idx[symbols[-1]]) for res in row] for row in residues]

    meta = alignment_metadata[alignment_metadata["enzyme"] == enzyme].copy()
    meta.index = meta["species"] + "|" + meta["enzyme"]
    hover = _make_hover(seq_ids, positions, residues, meta, enzyme)

    fig = go.Figure(
        go.Heatmap(
            z=z,
            x=positions,
            y=seq_ids,
            colorscale=colorscale,
            zmin=0,
            zmax=len(symbols) - 1 if symbols else 1,
            hoverinfo="text",
            text=hover,
            showscale=False,
        )
    )

    annotations = []
    for seq in seq_ids:
        introns = "reference"
        if seq in meta.index:
            introns = f"{meta.loc[seq, 'introns']} introns"
        annotations.append(
            dict(
                x=positions[-1] + max(positions) * 0.02,
                y=seq,
                text=introns,
                showarrow=False,
                font=dict(size=12, color="#4a4a4a"),
                xref="x",
                yref="y",
            )
        )
    fig.update_layout(annotations=annotations)

    reference_rows = [sid for sid in seq_ids if sid.startswith("Reference|")]
    for ref_id in reference_rows:
        fig.add_shape(
            type="rect",
            xref="paper",
            yref="y",
            x0=0,
            x1=1,
            y0=ref_id,
            y1=ref_id,
            line=dict(color="#2f2f2f", width=1.5),
            fillcolor="rgba(0,0,0,0)",
        )

    _add_property_legend(fig)
    fig.update_layout(
        title=dict(
            text=f"<b>{enzyme}</b> MAFFT alignment (amino acids)",
            x=0.02,
            y=0.98,
            xanchor="left",
            font=dict(family="Helvetica Neue", size=22, color="#222222"),
        ),
        template="simple_white",
        width=1700,  # widened figure
        height=480 + 24 * len(seq_ids),
        margin=dict(l=200, r=260, t=90, b=70),
        font=dict(family="Helvetica Neue", size=14, color="#222222"),
        legend=dict(
            title="Residue class",
            orientation="h",
            yanchor="bottom",
            y=1.02,
            xanchor="right",
            x=0.98,
            font=dict(size=14),
        ),
    )
    fig.update_xaxes(
        title="Alignment position",
        showgrid=False,
        zeroline=False,
        ticks="outside",
        tickcolor="#8a8a8a",
        title_font=dict(size=16),
    )
    fig.update_yaxes(
        title="Sequence",
        autorange="reversed",
        showgrid=False,
        ticks="outside",
        tickcolor="#8a8a8a",
        title_font=dict(size=16),
    )
    return fig

def display_alignment_pretty(enzyme: str):
    fig = plot_alignment_pretty(enzyme)
    fig.show(renderer="notebook")

if HAVE_WIDGETS:
    widgets.interact(display_alignment_pretty, enzyme=sorted(ALIGNMENT_TABLES.keys()))
else:
    display_alignment_pretty(sorted(ALIGNMENT_TABLES.keys())[0])


interactive(children=(Dropdown(description='enzyme', options=('PsiD', 'PsiH', 'PsiK', 'PsiM'), value='PsiD'), …

In [16]:
from ipywidgets.embed import embed_minimal_html
from IPython.display import display, clear_output

# Build the widget
enzyme_selector = widgets.Dropdown(
    options=sorted(ALIGNMENT_TABLES.keys()),
    description="Enzyme",
    layout=widgets.Layout(width="220px"),
)
plot_output = widgets.Output()

def update_plot(change):
    fig = plot_alignment_pretty(change["new"])
    with plot_output:
        clear_output(wait=True)
        fig.show(renderer="notebook")

enzyme_selector.observe(update_plot, names="value")
update_plot({"new": enzyme_selector.value})

alignment_viewer = widgets.VBox([enzyme_selector, plot_output])
display(alignment_viewer)

# Export the entire dropdown + plot
export_path = ALIGN_DIR / "alignment_viewer.html"
embed_minimal_html(export_path, alignment_viewer, title="Psilocybe Alignment Explorer")
print("Saved interactive viewer to", export_path)


VBox(children=(Dropdown(description='Enzyme', layout=Layout(width='220px'), options=('PsiD', 'PsiH', 'PsiK', '…

Saved interactive viewer to /Users/felix/Documents/20251015_Meeting_PsiloProteins_Felix/05_codex/mafft_alignments/alignment_viewer.html


In [18]:
# === CELL 6: MAFFT nucleotide alignments (CDS) ===
from collections import defaultdict
from pathlib import Path
from Bio import SeqIO, AlignIO
from Bio.Seq import Seq

NUC_ALIGN_DIR = ALIGN_DIR / "nucleotide_alignments"
NUC_ALIGN_DIR.mkdir(parents=True, exist_ok=True)

REFERENCE_KEYWORD = "Psilocybe_cubensis"

nuc_inputs = defaultdict(list)
meta_with_paths = alignment_metadata.dropna(subset=["cds_path"])

for _, row in meta_with_paths.iterrows():
    if row["status"] in {"NO_HIT", "ERROR"}:
        continue
    cds_path = Path(row["cds_path"])
    if not cds_path.exists():
        continue

    record = next(SeqIO.parse(str(cds_path), "fasta"))
    record.seq = Seq(str(record.seq).upper())
    record.id = f"{row['species']}|{row['enzyme']}"
    record.description = (
        f"status={row['status']};introns={row['introns']};nt_len={row['nt_len']}"
    )
    nuc_inputs[row["enzyme"]].append(record)

ALIGNMENTS_NT = {}
ALIGNMENT_PATHS_NT = {}
ALIGNMENT_TABLES_NT = {}
ALIGNMENT_LONG_NT = {}

for enzyme, records in nuc_inputs.items():
    if not records:
        continue

    ref_records = [
        rec for rec in records
        if REFERENCE_KEYWORD.lower() in rec.id.lower()
    ]
    other_records = [
        rec for rec in records
        if REFERENCE_KEYWORD.lower() not in rec.id.lower()
    ]
    other_records.sort(key=lambda r: r.id)
    ordered_records = ref_records + other_records

    input_fasta = NUC_ALIGN_DIR / f"{enzyme}_cds_input.fna"
    output_fasta = NUC_ALIGN_DIR / f"{enzyme}_cds_aligned.fna"

    SeqIO.write(ordered_records, input_fasta, "fasta")
    cmd = [mafft_bin, "--auto", "--thread", str(threads), str(input_fasta)]
    run = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
    if run.returncode != 0:
        raise RuntimeError(f"MAFFT failed for {enzyme} (nucleotide):\n{run.stderr}")

    output_fasta.write_text(run.stdout)

    alignment = AlignIO.read(output_fasta, "fasta")
    ALIGNMENTS_NT[enzyme] = alignment
    ALIGNMENT_PATHS_NT[enzyme] = output_fasta

    table = pd.DataFrame(
        [list(str(rec.seq)) for rec in alignment],
        index=[rec.id for rec in alignment],
    )
    table.columns = range(1, table.shape[1] + 1)
    ALIGNMENT_TABLES_NT[enzyme] = table

    long_df = table.reset_index().melt(
        id_vars="index", var_name="position", value_name="base"
    )
    ALIGNMENT_LONG_NT[enzyme] = long_df.rename(columns={"index": "sequence_id"})

print("Nucleotide MAFFT alignments stored in", NUC_ALIGN_DIR)
print("Enzymes aligned (nt):", ", ".join(sorted(ALIGNMENTS_NT)))


Nucleotide MAFFT alignments stored in /Users/felix/Documents/20251015_Meeting_PsiloProteins_Felix/05_codex/mafft_alignments/nucleotide_alignments
Enzymes aligned (nt): PsiD, PsiH, PsiK, PsiM


In [22]:
# === CELL 7: Plotly nucleotide alignment explorer ===
import plotly.graph_objects as go

try:
    widgets  # reuse existing import if already performed
    HAVE_WIDGETS = HAVE_WIDGETS and True
except (NameError, UnboundLocalError):
    try:
        import ipywidgets as widgets
        HAVE_WIDGETS = True
    except ImportError:
        HAVE_WIDGETS = False
        print("ipywidgets not installed; the dropdown selector will be unavailable.")

NUC_COLOR_MAP = {
    "A": "#1b9e77",
    "C": "#d95f02",
    "G": "#7570b3",
    "T": "#e7298a",
    "U": "#e7298a",
    "N": "#555555",
    "-": "#e6e6e6",
    "R": "#a6761d",  # A/G
    "Y": "#a6761d",  # C/T
    "S": "#66a61e",  # G/C
    "W": "#66a61e",  # A/T
    "K": "#386cb0",  # G/T
    "M": "#386cb0",  # A/C
    "B": "#bdbdbd",  # C/G/T
    "D": "#bdbdbd",  # A/G/T
    "H": "#bdbdbd",  # A/C/T
    "V": "#bdbdbd",  # A/C/G
    "X": "#bdbdbd",
}
DEFAULT_NUC_COLOR = "#bdbdbd"
REFERENCE_ID_PREFIX = "Reference|"

def _nuc_colorscale(symbols):
    symbols = list(symbols)
    if not symbols:
        return [(0.0, "#ffffff"), (1.0, "#ffffff")]
    if len(symbols) == 1:
        col = NUC_COLOR_MAP.get(symbols[0], DEFAULT_NUC_COLOR)
        return [(0.0, col), (1.0, col)]
    return [
        (i / (len(symbols) - 1), NUC_COLOR_MAP.get(sym, DEFAULT_NUC_COLOR))
        for i, sym in enumerate(symbols)
    ]

def _nuc_hover(seq_ids, positions, bases, metadata):
    hover = []
    for r, seq in enumerate(seq_ids):
        introns = "reference"
        if seq in metadata.index:
            introns = f"{metadata.loc[seq, 'introns']} introns"
        row_hover = [
            f"<b>{seq}</b><br>Position {pos}<br>Base: {bases[r][c]}<br>{introns}"
            for c, pos in enumerate(positions)
        ]
        hover.append(row_hover)
    return hover

def _add_nuc_legend(fig):
    for sym, color in NUC_COLOR_MAP.items():
        fig.add_trace(
            go.Scatter(
                x=[None],
                y=[None],
                mode="markers",
                marker=dict(size=12, symbol="square", color=color),
                legendgroup="nt",
                showlegend=True,
                name=sym,
            )
        )

def plot_nt_alignment_pretty(enzyme: str) -> go.Figure:
    if enzyme not in ALIGNMENT_TABLES_NT:
        raise ValueError(f"No nucleotide alignment found for {enzyme}")

    table = ALIGNMENT_TABLES_NT[enzyme].copy()
    seq_ids = table.index.astype(str).tolist()
    positions = table.columns.astype(int).tolist()
    bases = table.applymap(lambda x: str(x).upper()).values

    symbols = sorted({b for row in bases for b in row})
    if "-" in symbols:
        symbols.remove("-")
        symbols.append("-")
    colorscale = _nuc_colorscale(symbols)
    symbol_to_idx = {sym: idx for idx, sym in enumerate(symbols)}
    z = [[symbol_to_idx.get(b, symbol_to_idx[symbols[-1]]) for b in row] for row in bases]

    meta = alignment_metadata[alignment_metadata["enzyme"] == enzyme].copy()
    meta.index = meta["species"] + "|" + meta["enzyme"]
    hover = _nuc_hover(seq_ids, positions, bases, meta)

    fig = go.Figure(
        go.Heatmap(
            z=z,
            x=positions,
            y=seq_ids,
            colorscale=colorscale,
            zmin=0,
            zmax=len(symbols) - 1 if symbols else 1,
            hoverinfo="text",
            text=hover,
            showscale=False,
        )
    )

    annotations = []
    x_offset = positions[-1] + positions[-1] * 0.02
    for seq in seq_ids:
        introns = "reference"
        if seq in meta.index:
            introns = f"{meta.loc[seq, 'introns']} introns"
        annotations.append(
            dict(
                x=x_offset,
                y=seq,
                text=introns,
                showarrow=False,
                font=dict(size=12, color="#4a4a4a"),
                xref="x",
                yref="y",
            )
        )
    fig.update_layout(annotations=annotations)

    reference_ids = [sid for sid in seq_ids if sid.startswith(REFERENCE_ID_PREFIX)]
    for ref_id in reference_ids:
        idx = seq_ids.index(ref_id)
        fig.add_hrect(
            xref="paper",
            yref="y",
            x0=0,
            x1=1,
            y0=idx - 0.5,
            y1=idx + 0.5,
            line_width=1.5,
            line_color="#2f2f2f",
            fillcolor="rgba(0,0,0,0)",
        )

    _add_nuc_legend(fig)
    fig.update_layout(
        title=dict(
            text=f"<b>{enzyme}</b> MAFFT alignment (nucleotide CDS)",
            x=0.02,
            y=0.98,
            xanchor="left",
            font=dict(family="Helvetica Neue", size=22, color="#222222"),
        ),
        template="simple_white",
        width=1700,
        height=480 + 24 * len(seq_ids),
        margin=dict(l=200, r=260, t=90, b=70),
        font=dict(family="Helvetica Neue", size=14, color="#222222"),
        legend=dict(
            title="Nucleotide",
            orientation="h",
            yanchor="bottom",
            y=1.02,
            xanchor="right",
            x=0.98,
            font=dict(size=14),
        ),
    )
    fig.update_xaxes(
        title="Alignment position",
        showgrid=False,
        zeroline=False,
        ticks="outside",
        tickcolor="#8a8a8a",
        title_font=dict(size=16),
    )
    fig.update_yaxes(
        title="Sequence",
        autorange="reversed",
        showgrid=False,
        ticks="outside",
        tickcolor="#8a8a8a",
        title_font=dict(size=16),
    )
    return fig

def display_nt_alignment(enzyme: str):
    fig = plot_nt_alignment_pretty(enzyme)
    fig.show(renderer="notebook")

if HAVE_WIDGETS:
    widgets.interact(display_nt_alignment, enzyme=sorted(ALIGNMENT_TABLES_NT.keys()))
else:
    display_nt_alignment(sorted(ALIGNMENT_TABLES_NT.keys())[0])


interactive(children=(Dropdown(description='enzyme', options=('PsiD', 'PsiH', 'PsiK', 'PsiM'), value='PsiD'), …