<a href="https://colab.research.google.com/github/RobBurnap/Bioinformatics-MICR4203-MICR5203/blob/main/%20%20%20%20notebooks/Species_Tree_Diversity_v1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>


# BIOINFO4/5203 —
species diveristy:

##A. Mount Google Drive, Import Coding Libraries Necessary for Running Subsequent Code

In [11]:

# Install FIRST, then import
%pip install -q biopython       # Install the Biopython package quietly (-q suppresses most output) so we can work with biological sequence files

from google.colab import drive  # Import the module that lets Colab interact with Google Drive
drive.mount('/content/drive')   # Mount your Google Drive so it appears in Colab's file system under /content/drive

import os, pandas as pd          # Import 'os' for file/directory operations, and pandas for working with data tables
from Bio import SeqIO            # Import SeqIO from Biopython for reading/writing biological sequence files (FASTA, GenBank, etc.)
import matplotlib.pyplot as plt  # Import Matplotlib's plotting library to create figures and graphs

print("✅ Dependencies installed & Drive mounted.")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
✅ Dependencies installed & Drive mounted.



## B. Course folders: Define the course folders for places to load data to be processed and output to be saved

Edit only `LECTURE_CODE` and `TOPIC` if needed. All inputs will live in `Data/LECTURE_TOPIC` and outputs in `Outputs/LECTURE_TOPIC`.


In [13]:

# --- Course folder config (customize LECTURE_CODE/TOPIC only) ---
COURSE_DIR   = "/content/drive/MyDrive/Teaching/BIOINFO4-5203-F25"
LECTURE_CODE = "L0-species"            # change per week (e.g., L02, L03, ...)
TOPIC        = "diveristy"    # short slug for the exercise

# Derived paths (do not change)
DATA_DIR   = f"{COURSE_DIR}/Data/{LECTURE_CODE}_{TOPIC}"
OUTPUT_DIR = f"{COURSE_DIR}/Outputs/{LECTURE_CODE}_{TOPIC}"

# Create folder structure if missing
for p in [f"{COURSE_DIR}/Data", f"{COURSE_DIR}/Outputs", f"{COURSE_DIR}/Notebooks", DATA_DIR, OUTPUT_DIR]:
    os.makedirs(p, exist_ok=True)

print("📁 COURSE_DIR :", COURSE_DIR)
print("📁 DATA_DIR   :", DATA_DIR)
print("📁 OUTPUT_DIR :", OUTPUT_DIR)


📁 COURSE_DIR : /content/drive/MyDrive/Teaching/BIOINFO4-5203-F25
📁 DATA_DIR   : /content/drive/MyDrive/Teaching/BIOINFO4-5203-F25/Data/L0-species_diveristy
📁 OUTPUT_DIR : /content/drive/MyDrive/Teaching/BIOINFO4-5203-F25/Outputs/L0-species_diveristy


##C.



In [14]:
# --- BLASTP vs NR with taxonomy filter from Data/L0-species_diversity ---
from Bio import Entrez, SeqIO
from Bio.Blast import NCBIWWW, NCBIXML
from pathlib import Path
import io, csv, re, time, sys

# 0) REQUIRED: your email (API key optional)
Entrez.email = "you@university.edu"      # <- change me
# Entrez.api_key = "YOUR_NCBI_API_KEY"   # <- optional but helpful

# 1) Resolve paths (use your course vars if present; else fall back to the Data folder in your screenshot)
if 'DATA_DIR' in globals():
    DATA_DIR = Path(DATA_DIR)
else:
    DATA_DIR = Path("/content/drive/MyDrive/Teaching/BIOINFO4-5203-F25/Data/L0-species_diversity")

if 'OUTPUT_DIR' in globals():
    OUTPUT_DIR = Path(OUTPUT_DIR)
else:
    OUTPUT_DIR = DATA_DIR   # keep outputs next to inputs if course OUTPUT_DIR not set

OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# 2) Pick the protein FASTA (your file is query_proteins.fasta)
from pathlib import Path
from Bio import SeqIO

DATA_DIR = Path("/content/drive/MyDrive/Teaching/BIOINFO4-5203-F25/Data/L0-species_diversity")
FASTA_PATH = DATA_DIR / "query_proteins.fasta"   # direct reference

rec = next(SeqIO.parse(str(FASTA_PATH), "fasta"))
seq = str(rec.seq).upper().replace("*","")
# 3) Load TaxIDs (prefer Data folder; yours is here)
def load_taxids():
    for p in (DATA_DIR/"taxids.txt", OUTPUT_DIR/"taxids.txt"):
        if p.exists():
            ids = [l.strip() for l in p.read_text().splitlines() if l.strip()]
            ids = [t for t in ids if t.isdigit()]
            if ids:
                print(f"🧬 Loaded {len(ids)} TaxIDs from {p}")
                return ids
    print("⚠️ No taxids.txt found. Running without taxonomy filter.")
    return []

taxids = load_taxids()

def make_query_chunks(ids, field="ORGN", chunk_size=30):
    """field='ORGN' is safest for BLAST. Use 'ORGN:exp' to include descendants (may not be supported for all cases)."""
    if not ids: return [None]
    chunks = []
    for i in range(0, len(ids), chunk_size):
        sub = ids[i:i+chunk_size]
        q = " OR ".join([f"txid{t}[{field}]" for t in sub])
        chunks.append(q)
    print(f"🔎 Built {len(chunks)} ENTREZ_QUERY chunk(s) with [{field}]")
    return chunks

chunks = make_query_chunks(taxids, field="ORGN", chunk_size=30)

# 4) Minimal BLASTP runner
def run_blastp(query_seq, entrez_query=None, evalue=1e-5, hitlist=100):
    prev = (entrez_query[:160] + " …") if (entrez_query and len(entrez_query)>160) else (entrez_query or "(none)")
    print(f"⏳ BLASTP nr  E={evalue}  hits={hitlist}\n   ENTREZ_QUERY: {prev}")
    h = NCBIWWW.qblast(
        program="blastp", database="nr", sequence=query_seq,
        expect=evalue, entrez_query=entrez_query,
        hitlist_size=hitlist, descriptions=hitlist, alignments=hitlist
    )
    xml = h.read(); h.close()
    rec = NCBIXML.read(io.StringIO(xml))
    return rec, xml

def extract_accession(hit_id, hit_def, accession_attr):
    if accession_attr:
        return accession_attr
    m = re.search(r"([A-Z]{1,3}_?\d+\.\d+)", hit_id or "")
    if m: return m.group(1)
    m = re.search(r"([A-Z]{1,3}_?\d+\.\d+)", hit_def or "")
    if m: return m.group(1)
    return (hit_id or "unknown")

# 5) Try chunks with [ORGN]; if still nothing, optional retry with [ORGN:exp]; final fallback is no filter
all_alignments, xml_paths = [], []
for i, EQ in enumerate(chunks, 1):
    try:
        rec_xml, xml = run_blastp(seq, entrez_query=EQ, evalue=1e-5, hitlist=100)
        if rec_xml.alignments:
            path = OUTPUT_DIR / (f"blastp_nr_chunk{i}_ORGN.xml" if EQ else "blastp_nr_noFilter.xml")
            path.write_text(xml); xml_paths.append(path)
            all_alignments.extend(rec_xml.alignments)
            print(f"✅ Hits in chunk {i}: {len(rec_xml.alignments)}; saved {path.name}")
            # (keep going to gather from all chunks; comment the next line if you only want the first)
            # break
        else:
            print(f"— No hits for chunk {i}")
    except Exception as e:
        # Most common message here: "... is not supported" if the field syntax is wrong
        print(f"⚠️ Chunk {i} failed: {e}")
    time.sleep(0.25)

if not all_alignments and taxids:
    print("🟡 Retrying with descendant expansion [ORGN:exp] (may not be supported for all cases)…")
    for i, EQ in enumerate(make_query_chunks(taxids, field="ORGN:exp", chunk_size=20), 1):
        try:
            rec_xml, xml = run_blastp(seq, entrez_query=EQ, evalue=1e-5, hitlist=100)
            if rec_xml.alignments:
                path = OUTPUT_DIR / f"blastp_nr_chunk{i}_ORGNexp.xml"
                path.write_text(xml); xml_paths.append(path)
                all_alignments.extend(rec_xml.alignments)
                print(f"✅ Hits in chunk {i} (exp): {len(rec_xml.alignments)}; saved {path.name}")
            else:
                print(f"— No hits for chunk {i} (exp)")
        except Exception as e:
            print(f"⚠️ Chunk {i} (exp) failed: {e}")
        time.sleep(0.25)

if not all_alignments:
    print("🟡 Final fallback: no taxonomy filter")
    try:
        rec_xml, xml = run_blastp(seq, entrez_query=None, evalue=1e-3, hitlist=100)
        if rec_xml.alignments:
            p = OUTPUT_DIR / "blastp_nr_fallback.xml"
            p.write_text(xml); xml_paths.append(p)
            all_alignments.extend(rec_xml.alignments)
            print(f"✅ Fallback hits: {len(rec_xml.alignments)}; saved {p.name}")
        else:
            print("🔴 Still no hits.")
    except Exception as e:
        print(f"🔴 Fallback failed: {e}")

# 6) Save a compact TSV of the best HSP per hit
tsv_path = OUTPUT_DIR / "blastp_summary.tsv"
with open(tsv_path, "w", newline="") as f:
    w = csv.writer(f, delimiter="\t")
    w.writerow(["accession","title","length","evalue","identity","align_len","pct_identity","q_start","q_end","s_start","s_end"])
    for aln in all_alignments:
        best = sorted(aln.hsps, key=lambda h: (h.expect, -h.identities))[0]
        acc = extract_accession(aln.hit_id, aln.hit_def, getattr(aln, "accession", None))
        aln_len = best.align_length
        pct_id = round(100.0 * best.identities / aln_len, 2) if aln_len else ""
        q0, q1 = best.query_start, best.query_end
        s0, s1 = best.sbjct_start, best.sbjct_end
        w.writerow([acc, aln.title, aln.length, best.expect, best.identities, aln_len, pct_id, min(q0,q1), max(q0,q1), min(s0,s1), max(s0,s1)])

print(f"\n📦 Saved {len(all_alignments)} alignments across {len(xml_paths)} XML file(s).")
print("💾 XML files :", ", ".join(p.name for p in xml_paths) if xml_paths else "(none)")
print("📑 Summary   :", tsv_path)

🧬 Loaded 107 TaxIDs from /content/drive/MyDrive/Teaching/BIOINFO4-5203-F25/Data/L0-species_diversity/taxids.txt
🔎 Built 4 ENTREZ_QUERY chunk(s) with [ORGN]
⏳ BLASTP nr  E=1e-05  hits=100
   ENTREZ_QUERY: txid10090[ORGN] OR txid102285[ORGN] OR txid105358[ORGN] OR txid1117[ORGN] OR txid115547[ORGN] OR txid1179[ORGN] OR txid1408545[ORGN] OR txid1416614[ORGN] OR txi …


KeyboardInterrupt: 

 multi-FASTA of top hits per TaxID (one or more sequences per taxon). The cell below:
	•	uses your existing folders (Data/L0-species_diversity for input; writes to the same folder unless OUTPUT_DIR is already set),
	•	reads query_proteins.fasta and taxids.txt,
	•	for each TaxID, runs a separate BLAST restricted to that taxon (txid####[ORGN]), so we can attribute hits unambiguously,
	•	grabs the top N accessions from each BLAST,
	•	fetches their protein FASTA sequences,
	•	writes:
	•	per_taxid_top_hits.fasta (all sequences, grouped by taxid in headers),
	•	per_taxid_hits.tsv (who came from which taxid, evalue, %id, etc.),
	•	optional one FASTA per taxid (toggle with WRITE_SPLIT_FASTA).

In [None]:
# --- BLAST (per TaxID) -> collect top protein hits -> write multi-FASTA + TSV ---
from Bio import Entrez, SeqIO
from Bio.Blast import NCBIWWW, NCBIXML
from pathlib import Path
import io, csv, re, time, sys

# ==== config you may tweak ====
Entrez.email = "you@university.edu"   # <-- REQUIRED
# Entrez.api_key = "YOUR_NCBI_API_KEY"  # optional, faster/higher limits

TOP_HITS_PER_TAXID = 3        # how many sequences to keep per taxid
EVALUE              = 1e-5
HITLIST_SIZE        = max(50, TOP_HITS_PER_TAXID*10)  # ask for more, then trim
WRITE_SPLIT_FASTA   = False   # also write one FASTA per taxid
SLEEP_BETWEEN_CALLS = 0.3     # be kind to NCBI :)

# ==== paths (use your course vars if present) ====
if 'DATA_DIR' in globals():
    DATA_DIR = Path(DATA_DIR)
else:
    DATA_DIR = Path("/content/drive/MyDrive/Teaching/BIOINFO4-5203-F25/Data/L0-species_diversity")

if 'OUTPUT_DIR' in globals():
    OUTPUT_DIR = Path(OUTPUT_DIR)
else:
    OUTPUT_DIR = DATA_DIR

OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# Files we expect from your screenshot:
PROT_FASTA = DATA_DIR / "query_proteins.fasta"
TAXIDS_TXT = DATA_DIR / "taxids.txt"

# ==== load query; auto-detect protein vs nucleotide ====
def load_query():
    if PROT_FASTA.exists():
        rec = next(SeqIO.parse(str(PROT_FASTA), "fasta"))
        seq = str(rec.seq).upper().replace("*","")
        letters = [c for c in seq if c.isalpha()]
        dna_frac = sum(c in set("ACGTN") for c in letters) / max(1,len(letters))
        if dna_frac >= 0.85:
            # looks like nucleotide despite filename; fallback to query.fasta if present
            alt = DATA_DIR / "query.fasta"
            if alt.exists():
                r2 = next(SeqIO.parse(str(alt), "fasta"))
                return ("blastx", str(r2.seq).upper(), f"{alt.name}:{r2.id}")
            # else: we'll still do blastx on given sequence
            return ("blastx", seq, f"{PROT_FASTA.name}:{rec.id}")
        else:
            return ("blastp", seq, f"{PROT_FASTA.name}:{rec.id}")
    else:
        # if protein file missing, try nucleotide and use blastx
        alt = DATA_DIR / "query.fasta"
        if alt.exists():
            r2 = next(SeqIO.parse(str(alt), "fasta"))
            return ("blastx", str(r2.seq).upper(), f"{alt.name}:{r2.id}")
        raise FileNotFoundError(f"Could not find {PROT_FASTA} or a fallback nucleotide FASTA at {DATA_DIR/'query.fasta'}")

mode, query_seq, query_label = load_query()
print(f"📄 Query source: {query_label}")
print(f"🧪 Mode chosen: {mode.upper()} vs nr  | length={len(query_seq)}")

# ==== load TaxIDs ====
if not TAXIDS_TXT.exists():
    raise FileNotFoundError(f"taxids.txt not found at {TAXIDS_TXT}")
taxids = [t.strip() for t in TAXIDS_TXT.read_text().splitlines() if t.strip().isdigit()]
if not taxids:
    raise ValueError("taxids.txt is empty or contains no numeric TaxIDs.")
print(f"🧬 Loaded {len(taxids)} TaxIDs")

# ==== helpers ====
def run_single_blast(seq, taxid, program="blastp", evalue=EVALUE, hitlist=HITLIST_SIZE):
    q = f"txid{taxid}[ORGN]"  # exact taxon; robust for BLAST
    prev = q if len(q) < 160 else (q[:157] + " …")
    print(f"⏳ {program.upper()} vs nr | taxid={taxid} | E={evalue} | hits={hitlist}\n   ENTREZ_QUERY: {prev}")
    h = NCBIWWW.qblast(program=program, database="nr", sequence=seq,
                       expect=evalue, entrez_query=q,
                       hitlist_size=hitlist, descriptions=hitlist, alignments=hitlist)
    xml = h.read(); h.close()
    rec = NCBIXML.read(io.StringIO(xml))
    return rec, xml

def extract_accession(hit_id, hit_def, accession_attr):
    if accession_attr: return accession_attr
    m = re.search(r"([A-Z]{1,3}_?\d+\.\d+)", hit_id or "")
    if m: return m.group(1)
    m = re.search(r"([A-Z]{1,3}_?\d+\.\d+)", hit_def or "")
    if m: return m.group(1)
    return (hit_id or "unknown")

def fetch_protein_fasta(accessions):
    """Return dict acc->FASTA text (single-record each)."""
    out = {}
    batch = list(accessions)
    while batch:
        # fetch in small batches to be polite
        chunk = batch[:50]; batch = batch[50:]
        try:
            h = Entrez.efetch(db="protein", id=",".join(chunk), rettype="fasta", retmode="text")
            txt = h.read(); h.close()
            # split on records
            parts = [t for t in txt.strip().split(">") if t]
            for rec_txt in parts:
                header, *seq_lines = rec_txt.splitlines()
                acc = header.split()[0]
                out[acc] = ">" + header + "\n" + "\n".join(seq_lines) + "\n"
        except Exception as e:
            sys.stderr.write(f"[warn] efetch failed for {chunk}: {e}\n")
        time.sleep(SLEEP_BETWEEN_CALLS)
    return out

# ==== main loop: BLAST per TaxID -> keep top N -> fetch FASTA ====
all_rows = []
per_taxid_fastas = {}   # taxid -> list of FASTA strings
xml_paths = []

for i, tid in enumerate(taxids, 1):
    try:
        record, xml = run_single_blast(query_seq, tid, program=("blastp" if mode=="blastp" else "blastx"))
        # save each XML (handy for grading/debug)
        xml_file = OUTPUT_DIR / f"{mode}_nr_taxid{tid}.xml"
        xml_file.write_text(xml); xml_paths.append(xml_file)

        if not record.alignments:
            print(f"— No hits for taxid {tid}")
            continue

        # pick top-N by (evalue, -identities)
        hsps = []
        for aln in record.alignments:
            best = sorted(aln.hsps, key=lambda h: (h.expect, -h.identities))[0]
            acc = extract_accession(aln.hit_id, aln.hit_def, getattr(aln, "accession", None))
            pct_id = 100.0 * best.identities / best.align_length if best.align_length else 0.0
            hsps.append((aln, best, acc, pct_id))
        hsps.sort(key=lambda t: (t[1].expect, -t[3]))
        keep = hsps[:TOP_HITS_PER_TAXID]

        # fetch FASTAs for these accessions
        accs = [acc for _,_,acc,_ in keep]
        acc_to_fa = fetch_protein_fasta(accs)

        # store
        per_taxid_fastas.setdefault(tid, [])
        for aln, best, acc, pct in keep:
            fa = acc_to_fa.get(acc)
            if not fa:
                continue
            # prepend taxid info to header for grouping
            # keep original header after a pipe
            lines = fa.strip().splitlines()
            header = lines[0][1:]  # drop '>'
            seq = "\n".join(lines[1:])
            new_header = f">taxid:{tid}|acc:{acc}|e:{best.expect:.2e}|pid:{pct:.2f}|len:{best.align_length} {header}"
            per_taxid_fastas[tid].append(new_header + "\n" + seq + "\n")

            all_rows.append([
                tid, acc, aln.title, aln.length, best.expect,
                best.identities, best.align_length, round(pct,2),
                min(best.query_start,best.query_end), max(best.query_start,best.query_end),
                min(best.sbjct_start,best.sbjct_end), max(best.sbjct_start,best.sbjct_end)
            ])
        print(f"✅ taxid {tid}: kept {len(per_taxid_fastas[tid])} sequences")
    except Exception as e:
        print(f"⚠️ taxid {tid} failed: {e}")
    time.sleep(SLEEP_BETWEEN_CALLS)

# ==== write combined multi-FASTA + per-taxid FASTAs ====
multi_fa = OUTPUT_DIR / "per_taxid_top_hits.fasta"
with open(multi_fa, "w") as fh:
    for tid in taxids:
        for fa in per_taxid_fastas.get(tid, []):
            fh.write(fa)
print(f"💾 Multi-FASTA: {multi_fa}")

if WRITE_SPLIT_FASTA:
    for tid, fas in per_taxid_fastas.items():
        p = OUTPUT_DIR / f"taxid_{tid}_top_hits.fasta"
        with open(p, "w") as fh:
            for fa in fas: fh.write(fa)
    print("💾 Also wrote per-taxid FASTAs")

# ==== write table ====
tsv = OUTPUT_DIR / "per_taxid_hits.tsv"
with open(tsv, "w", newline="") as f:
    w = csv.writer(f, delimiter="\t")
    w.writerow(["taxid","accession","title","subject_length","evalue","identities","align_len","pct_identity","q_start","q_end","s_start","s_end"])
    w.writerows(all_rows)
print(f"📑 Table: {tsv}")

print(f"🗂 XML files saved: {len(xml_paths)}")

📄 Query source: query_proteins.fasta:unknown_seq
🧪 Mode chosen: BLASTP vs nr  | length=475
🧬 Loaded 107 TaxIDs
⏳ BLASTP vs nr | taxid=10090 | E=1e-05 | hits=50
   ENTREZ_QUERY: txid10090[ORGN]
✅ taxid 10090: kept 0 sequences
⏳ BLASTP vs nr | taxid=102285 | E=1e-05 | hits=50
   ENTREZ_QUERY: txid102285[ORGN]
✅ taxid 102285: kept 0 sequences
⏳ BLASTP vs nr | taxid=105358 | E=1e-05 | hits=50
   ENTREZ_QUERY: txid105358[ORGN]
— No hits for taxid 105358
⏳ BLASTP vs nr | taxid=1117 | E=1e-05 | hits=50
   ENTREZ_QUERY: txid1117[ORGN]
✅ taxid 1117: kept 0 sequences
⏳ BLASTP vs nr | taxid=115547 | E=1e-05 | hits=50
   ENTREZ_QUERY: txid115547[ORGN]
✅ taxid 115547: kept 0 sequences
⏳ BLASTP vs nr | taxid=1179 | E=1e-05 | hits=50
   ENTREZ_QUERY: txid1179[ORGN]
✅ taxid 1179: kept 0 sequences
⏳ BLASTP vs nr | taxid=1408545 | E=1e-05 | hits=50
   ENTREZ_QUERY: txid1408545[ORGN]
✅ taxid 1408545: kept 0 sequences
⏳ BLASTP vs nr | taxid=1416614 | E=1e-05 | hits=50
   ENTREZ_QUERY: txid1416614[ORGN]




✅ taxid 1416614: kept 0 sequences
⏳ BLASTP vs nr | taxid=1552121 | E=1e-05 | hits=50
   ENTREZ_QUERY: txid1552121[ORGN]




✅ taxid 1552121: kept 0 sequences
⏳ BLASTP vs nr | taxid=159855 | E=1e-05 | hits=50
   ENTREZ_QUERY: txid159855[ORGN]




✅ taxid 159855: kept 0 sequences
⏳ BLASTP vs nr | taxid=1644118 | E=1e-05 | hits=50
   ENTREZ_QUERY: txid1644118[ORGN]




✅ taxid 1644118: kept 0 sequences
⏳ BLASTP vs nr | taxid=1673428 | E=1e-05 | hits=50
   ENTREZ_QUERY: txid1673428[ORGN]




— No hits for taxid 1673428
⏳ BLASTP vs nr | taxid=172827 | E=1e-05 | hits=50
   ENTREZ_QUERY: txid172827[ORGN]




✅ taxid 172827: kept 0 sequences
⏳ BLASTP vs nr | taxid=1737569 | E=1e-05 | hits=50
   ENTREZ_QUERY: txid1737569[ORGN]
✅ taxid 1737569: kept 0 sequences
⏳ BLASTP vs nr | taxid=1752064 | E=1e-05 | hits=50
   ENTREZ_QUERY: txid1752064[ORGN]
✅ taxid 1752064: kept 0 sequences
⏳ BLASTP vs nr | taxid=1761908 | E=1e-05 | hits=50
   ENTREZ_QUERY: txid1761908[ORGN]
✅ taxid 1761908: kept 0 sequences
⏳ BLASTP vs nr | taxid=186192 | E=1e-05 | hits=50
   ENTREZ_QUERY: txid186192[ORGN]
✅ taxid 186192: kept 0 sequences
⏳ BLASTP vs nr | taxid=187137 | E=1e-05 | hits=50
   ENTREZ_QUERY: txid187137[ORGN]
✅ taxid 187137: kept 0 sequences
⏳ BLASTP vs nr | taxid=1874737 | E=1e-05 | hits=50
   ENTREZ_QUERY: txid1874737[ORGN]
✅ taxid 1874737: kept 0 sequences
⏳ BLASTP vs nr | taxid=1904752 | E=1e-05 | hits=50
   ENTREZ_QUERY: txid1904752[ORGN]
✅ taxid 1904752: kept 0 sequences
⏳ BLASTP vs nr | taxid=1906666 | E=1e-05 | hits=50
   ENTREZ_QUERY: txid1906666[ORGN]
✅ taxid 1906666: kept 0 sequences
⏳ BLASTP vs n