<a href="https://colab.research.google.com/github/RobBurnap/Bioinformatics-MICR4203-MICR5203/blob/main/notebooks/L02_BLASTp_versus_Species_Tree_Diversity_v1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>


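# BIOINFO4/5203 — Species diversity BLASTp

Need two files in your `Data/<LECTURE_CODE>_<TOPIC>` folder (see section B): `query_protein.fasta` (or `query.fasta`) and `taxids.txt`.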

## A. Mount Google Drive and import the libraries needed to run the subsequent code

In [1]:

# Setup cell: install Biopython, mount Google Drive, then import the libraries used in this notebook.
# Run this cell once per Colab session before any of the cells below.
# Install FIRST, then import
%pip install -q biopython       # Install the Biopython package quietly (-q suppresses most output) so we can work with biological sequence files

from google.colab import drive  # Import the module that lets Colab interact with Google Drive
drive.mount('/content/drive')   # Mount your Google Drive so it appears in Colab's file system under /content/drive

import os, pandas as pd          # Import 'os' for file/directory operations, and pandas for working with data tables
from Bio import SeqIO            # Import SeqIO from Biopython for reading/writing biological sequence files (FASTA, GenBank, etc.)
import matplotlib.pyplot as plt  # Import Matplotlib's plotting library to create figures and graphs

# Confirmation message printed once the install, mount and imports above have all completed
print("✅ Dependencies installed & Drive mounted.")


[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/3.3 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.2/3.3 MB[0m [31m6.7 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━[0m [32m1.9/3.3 MB[0m [31m26.9 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m3.3/3.3 MB[0m [31m34.7 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.3/3.3 MB[0m [31m26.7 MB/s[0m eta [36m0:00:00[0m
[?25hMounted at /content/drive
✅ Dependencies installed & Drive mounted.



## B. Course folders: define where input data are loaded from and where outputs are saved

Edit only `LECTURE_CODE` and `TOPIC` if needed. All inputs will live in `Data/<LECTURE_CODE>_<TOPIC>` and outputs in `Outputs/<LECTURE_CODE>_<TOPIC>`.


In [13]:

# --- Course folder config (customize LECTURE_CODE/TOPIC only) ---
COURSE_DIR   = "/content/drive/MyDrive/Teaching/BIOINFO4-5203-F25"
LECTURE_CODE = "L02-BLASTp"            # change per week (e.g., L02, L03, ...)
TOPIC        = "diversity"    # short slug for the exercise

# Derived paths (do not change): one lecture-specific folder for inputs, one for outputs
DATA_DIR   = f"{COURSE_DIR}/Data/{LECTURE_CODE}_{TOPIC}"
OUTPUT_DIR = f"{COURSE_DIR}/Outputs/{LECTURE_CODE}_{TOPIC}"

# Make sure the shared course folders and both lecture folders exist
# (exist_ok=True means re-running this cell is harmless)
for p in (
    f"{COURSE_DIR}/Data",
    f"{COURSE_DIR}/Outputs",
    f"{COURSE_DIR}/Notebooks",
    DATA_DIR,
    OUTPUT_DIR,
):
    os.makedirs(p, exist_ok=True)

# Echo the resolved locations so they can be checked at a glance
for label, folder in (
    ("📁 COURSE_DIR :", COURSE_DIR),
    ("📁 DATA_DIR   :", DATA_DIR),
    ("📁 OUTPUT_DIR :", OUTPUT_DIR),
):
    print(label, folder)


📁 COURSE_DIR : /content/drive/MyDrive/Teaching/BIOINFO4-5203-F25
📁 DATA_DIR   : /content/drive/MyDrive/Teaching/BIOINFO4-5203-F25/Data/L02-BLASTp_diversity
📁 OUTPUT_DIR : /content/drive/MyDrive/Teaching/BIOINFO4-5203-F25/Outputs/L02-BLASTp_diversity


## C. BLAST per TaxID and collect the top hits



This section builds a multi-FASTA of top hits per TaxID (one or more sequences per taxon). The cell below:
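	•	uses your existing folders (DATA_DIR from section B for input, falling back to Data/L0-species_diversity if it is not defined; writes to OUTPUT_DIR if it is set, otherwise to the input folder),
	•	reads query_protein.fasta (or query.fasta if that file is absent) and taxids.txt (one numeric TaxID per line),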
	•	for each TaxID, runs a separate BLAST restricted to that taxon (txid####[ORGN]), so we can attribute hits unambiguously,
	•	grabs the top N accessions from each BLAST,
	•	fetches their protein FASTA sequences,
	•	writes:
	•	per_taxid_top_hits.fasta (all sequences, grouped by taxid in headers),
	•	per_taxid_hits.tsv (who came from which taxid, evalue, %id, etc.),
	•	optional one FASTA per taxid (toggle with WRITE_SPLIT_FASTA).

In [None]:
# --- BLAST (per TaxID) -> collect top protein hits -> write multi-FASTA + TSV (adaptive search, matrix+gapcosts safe) ---
from Bio import Entrez, SeqIO
from Bio.Blast import NCBIWWW, NCBIXML
from pathlib import Path
import io, csv, re, time, sys

# ==== required ====
# Contact address used by Biopython's Entrez module; replace the placeholder with your own.
Entrez.email = "you@university.edu"   # <-- set your email

# ==== Algorithm 'knobs' you can tweak ====
# Number of hits (lowest E-value first) kept for each TaxID.
TOP_HITS_PER_TAXID = 1
# E-value thresholds tried in order until a search returns at least one alignment.
EVALUE_STEPS       = [1e-5, 1e-2, 0.1]          # strict -> moderate -> permissive
# Passed to qblast as hitlist_size, descriptions and alignments.
HITLIST_SIZE       = max(200, TOP_HITS_PER_TAXID*10)
WRITE_SPLIT_FASTA  = False # if True, also writes one FASTA file per taxid (in addition to combined multi sequence file)
# Pause (in seconds, handed to time.sleep) after each efetch batch and each TaxID.
SLEEP_BETWEEN_CALLS = 0.3
# 'auto' lets the query loader pick the program from the sequence; 'blastp'/'blastx' override it.
FORCE_MODE = 'auto'                              # 'auto' | 'blastp' | 'blastx'

# Matrix + gap-costs mapping (what NCBI expects)
# Values are sent verbatim as qblast's gapcosts argument.
# NOTE(review): presumably "<existence> <extension>" penalties — confirm against the NCBI BLAST docs.
MATRIX_DEFAULT = "BLOSUM62"
GAPCOSTS_BY_MATRIX = {
    "BLOSUM62": "11 1",
    "BLOSUM45": "15 2",
    "PAM30":    "9 1",
    "PAM70":    "10 1",
}

# ==== paths ====
# Reuse DATA_DIR / OUTPUT_DIR when an earlier cell defined them; otherwise fall back to
# the default data folder, and write outputs next to the inputs.
DATA_DIR = Path(globals().get(
    'DATA_DIR',
    "/content/drive/MyDrive/Teaching/BIOINFO4-5203-F25/Data/L0-species_diversity",
))
OUTPUT_DIR = Path(globals().get('OUTPUT_DIR', DATA_DIR))
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

PROT_FASTA = DATA_DIR / "query_protein.fasta"
NUC_FASTA  = DATA_DIR / "query.fasta"
TAXIDS_TXT = DATA_DIR / "taxids.txt"

# ---- basic file checks ----
# At least one query file must be present, and the TaxID list is always required.
if not any(candidate.exists() for candidate in (PROT_FASTA, NUC_FASTA)):
    raise FileNotFoundError(f"Neither {PROT_FASTA} nor {NUC_FASTA} found in {DATA_DIR}.")
if not TAXIDS_TXT.exists():
    raise FileNotFoundError(f"{TAXIDS_TXT} not found in {DATA_DIR}.")

# ==== helpers: accession parsing & normalization ====
def extract_accession(hit_id, hit_def, accession_attr):
    """Pick the best available accession string for a BLAST hit.

    Args:
        hit_id: The hit's identifier string (may be None or empty).
        hit_def: The hit's description line (may be None or empty).
        accession_attr: An explicit accession value, if the alignment carried one.

    Returns:
        The stripped ``accession_attr`` when it is non-blank; otherwise the first
        accession-like token (1-5 capital letters, optional underscore, digits,
        optional ``.version``) found in ``hit_id`` and then ``hit_def``; otherwise
        the stripped ``hit_id``; and ``"unknown"`` when nothing usable is left.
    """
    # Strip before testing so a whitespace-only attribute falls through to parsing
    # instead of being returned as an empty accession.
    explicit = (accession_attr or "").strip()
    if explicit:
        return explicit
    for field in (hit_id, hit_def):
        m = re.search(r"([A-Z]{1,5}_?\d+(?:\.\d+)?)", field or "")
        if m:
            return m.group(1)
    # A blank hit_id is reported as "unknown" so callers can filter it out.
    return (hit_id or "").strip() or "unknown"

def _norm_keys(acc_token: str):
    acc_token = acc_token.strip()
    base = acc_token.split(".", 1)[0]
    return {acc_token, base} if "." in acc_token else {acc_token, f"{base}.1"}

def _split_fasta_text(txt):
    """Split multi-FASTA text into a list of ``(header, sequence_lines)`` pairs.

    A record starts only at a line that begins with ``>``, so a ``>`` inside a
    description does not split the record. Text before the first header and
    blank lines are ignored. ``header`` excludes the leading ``>``.
    """
    records = []
    header, seq_lines = None, []
    for line in txt.splitlines():
        if line.startswith(">"):
            if header is not None:
                records.append((header, seq_lines))
            header, seq_lines = line[1:], []
        elif header is not None and line.strip():
            seq_lines.append(line.strip())
    if header is not None:
        records.append((header, seq_lines))
    return records


def fetch_protein_fasta(accessions):
    """Download protein FASTA records for the given accessions via Entrez efetch.

    Accessions are de-duplicated, blanks and ``"unknown"`` are dropped, and the
    rest are requested in batches of 50 with a pause of ``SLEEP_BETWEEN_CALLS``
    seconds after each batch. A failed batch is reported on stderr and skipped
    (best effort); it does not abort the remaining batches.

    Args:
        accessions: Iterable of accession strings (None/empty entries allowed).

    Returns:
        dict mapping every key from ``_norm_keys(first header word)`` to the
        record's FASTA text (``">header\\nSEQ...\\n"``).
    """
    out = {}
    pending = list({a for a in accessions if a and a != "unknown"})
    for start in range(0, len(pending), 50):
        chunk = pending[start:start + 50]
        try:
            handle = Entrez.efetch(db="protein", id=",".join(chunk), rettype="fasta", retmode="text")
            try:
                txt = handle.read()
            finally:
                # Close the connection even when reading the response fails.
                handle.close()
            for header, seq_lines in _split_fasta_text(txt):
                words = header.split()
                if not words:
                    # A header with no identifier cannot be indexed; skip just this record.
                    continue
                fasta_txt = ">" + header + "\n" + "\n".join(seq_lines) + "\n"
                for k in _norm_keys(words[0]):
                    out[k] = fasta_txt
        except Exception as e:
            sys.stderr.write(f"[warn] efetch failed for {chunk}: {e}\n")
        time.sleep(SLEEP_BETWEEN_CALLS)
    return out

# ==== robust query loader/detector ====
# Nucleotide letters, including ambiguity codes and the gap character.
DNA_ALPHABET = set("ACGTNUWSMKRYBDHV-")


def detect_is_protein(seq_upper: str) -> bool:
    """Report whether an upper-cased sequence contains any non-nucleotide symbol.

    Only alphabetic characters and ``*`` are inspected; digits, gaps and other
    punctuation are ignored. The sequence is called protein as soon as one
    inspected character falls outside ``DNA_ALPHABET``.
    """
    for symbol in seq_upper:
        if not (symbol.isalpha() or symbol == '*'):
            continue
        if symbol not in DNA_ALPHABET:
            return True
    return False

def load_query():
    """Load the first query sequence and decide which BLAST program to run.

    ``PROT_FASTA`` is used when it exists, otherwise ``NUC_FASTA``. Only the
    first record of the file is read. The program is chosen from the sequence
    content via ``detect_is_protein`` unless ``FORCE_MODE`` is ``'blastp'`` or
    ``'blastx'``, in which case that value wins.

    Returns:
        tuple ``(program, sequence, label)`` where ``program`` is ``'blastp'``
        or ``'blastx'``, ``sequence`` is upper-cased with ``*`` removed, and
        ``label`` is ``"<file name>:<record id>"``.

    Raises:
        ValueError: If the chosen FASTA file contains no records.
    """
    source = PROT_FASTA if PROT_FASTA.exists() else NUC_FASTA
    rec = next(SeqIO.parse(str(source), "fasta"), None)
    if rec is None:
        # Clear error instead of a bare StopIteration from next().
        raise ValueError(f"No FASTA records found in {source}.")
    seq = str(rec.seq).upper()

    detected = 'blastp' if detect_is_protein(seq) else 'blastx'
    decided = FORCE_MODE if FORCE_MODE in ('blastp', 'blastx') else detected

    # Warn only when the content really looked like protein; a forced mode alone
    # must not claim that a protein sequence was detected.
    if source is NUC_FASTA and detected == 'blastp' and decided == 'blastp':
        print("⚠️  Detected protein sequence in 'query.fasta'; using BLASTP.")
    return decided, seq.replace("*",""), f"{source.name}:{rec.id}"

# Load the query once; `mode` is the BLAST program name used for every TaxID below.
mode, query_seq, query_label = load_query()
print(f"📄 Query source: {query_label}")
print(f"🧪 Mode chosen: {mode.upper()} vs nr  | length={len(query_seq)}")

# ==== load TaxIDs ====
# utf-8-sig drops a leading byte-order mark, which would otherwise make the first
# line non-numeric and silently lose that TaxID.
_taxid_lines = [ln.strip() for ln in TAXIDS_TXT.read_text(encoding="utf-8-sig").splitlines()]
_numeric_lines = [ln for ln in _taxid_lines if ln.isascii() and ln.isdigit()]
_skipped_lines = [ln for ln in _taxid_lines if ln and not (ln.isascii() and ln.isdigit())]
# De-duplicate while keeping first-seen order, so a repeated TaxID is not searched
# and written to the outputs more than once.
taxids = list(dict.fromkeys(_numeric_lines))
if not taxids: raise ValueError("taxids.txt is empty or contains no numeric TaxIDs.")
if _skipped_lines:
    print(f"⚠️  Ignored {len(_skipped_lines)} non-numeric line(s) in taxids.txt, e.g. {_skipped_lines[:3]}")
if len(taxids) < len(_numeric_lines):
    print(f"ℹ️  Removed {len(_numeric_lines) - len(taxids)} duplicate TaxID(s)")
print(f"🧬 Loaded {len(taxids)} TaxIDs")

# ==== ADAPTIVE BLAST (matrix + gapcosts always paired) ====
def run_single_blast_adaptive(seq, taxid, program="blastp",
                              hitlist=HITLIST_SIZE, matrix=MATRIX_DEFAULT):
    """Search nr for one TaxID, relaxing the E-value until something is found.

    Each value in ``EVALUE_STEPS`` is tried in order with the Entrez filter
    ``txid<taxid>[ORGN]``. The first search that returns at least one alignment
    ends the loop. If the ``1e-2`` step finds nothing while the matrix is
    BLOSUM62, later steps use BLOSUM45. Gap costs are looked up in
    ``GAPCOSTS_BY_MATRIX`` for the matrix in use (BLOSUM62's entry is the
    fallback for a matrix missing from the table).

    Args:
        seq: Query sequence string.
        taxid: Numeric TaxID (string or int) used in the Entrez filter.
        program: BLAST program name passed to qblast.
        hitlist: Value used for hitlist_size, descriptions and alignments.
        matrix: Scoring matrix for the first attempt.

    Returns:
        tuple ``(record, xml, evalue, matrix)`` for the successful search, or
        for the last attempt when no search produced alignments. ``evalue`` and
        ``matrix`` are the settings that search actually ran with.

    Raises:
        ValueError: If ``EVALUE_STEPS`` is empty.
    """
    if not EVALUE_STEPS:
        # Without this guard the final return would hit an unbound loop variable.
        raise ValueError("EVALUE_STEPS must contain at least one E-value threshold.")
    q = f"txid{taxid}[ORGN]"
    last_xml, last_rec = None, None
    cur_matrix = matrix
    for e in EVALUE_STEPS:
        # Remember the matrix this search runs with; cur_matrix may be switched
        # afterwards for the *next* step and must not be reported for this one.
        used_matrix = cur_matrix
        gapcosts = GAPCOSTS_BY_MATRIX.get(cur_matrix, GAPCOSTS_BY_MATRIX["BLOSUM62"])
        print(f"⏳ {program.upper()} vs nr | taxid={taxid} | E={e} | hits={hitlist} | matrix={cur_matrix} | gaps={gapcosts}\n   ENTREZ_QUERY: {q}")
        h = NCBIWWW.qblast(program=program, database="nr", sequence=seq,
                           expect=e, entrez_query=q,
                           hitlist_size=hitlist, descriptions=hitlist,
                           alignments=hitlist, matrix_name=cur_matrix,
                           gapcosts=gapcosts)
        try:
            xml = h.read()
        finally:
            # Release the result handle even if reading it fails.
            h.close()
        try:
            rec = NCBIXML.read(io.StringIO(xml))
        except Exception:
            # read() rejected the document; fall back to the first record parse() yields, if any.
            parser = NCBIXML.parse(io.StringIO(xml))
            rec = next(parser, None)
        last_xml, last_rec = xml, rec
        if rec and rec.alignments:
            return rec, xml, e, used_matrix
        # One relaxation step: switch to BLOSUM45 (with its proper gapcosts) after the moderate E
        if e == 1e-2 and cur_matrix == "BLOSUM62":
            cur_matrix = "BLOSUM45"
    return last_rec, last_xml, e, used_matrix

# ==== main loop -> XML + top hits -> fetch FASTA ====
# For every TaxID: run the adaptive BLAST, save the raw XML, rank the hits,
# download FASTA for the best ones, and collect FASTA text plus table rows.
all_rows = []            # one row per kept hit, written to the TSV later
per_taxid_fastas = {}    # taxid -> list of FASTA records with annotated headers
xml_paths = []           # raw BLAST XML files written so far

for i, tid in enumerate(taxids, 1):
    try:
        record, xml, e_used, m_used = run_single_blast_adaptive(
            query_seq, tid, program=("blastp" if mode=="blastp" else "blastx"))
        print(f"   ↳ used E={e_used}, matrix={m_used}")
        xml_file = OUTPUT_DIR / f"{mode}_nr_taxid{tid}.xml"
        xml_file.write_text(xml); xml_paths.append(xml_file)

        if not record or not record.alignments:
            print(f"— No hits for taxid {tid}")
            continue

        # Represent each alignment by its best HSP (lowest E-value, then most identities).
        hsps = []
        for aln in record.alignments:
            if not aln.hsps:
                # Nothing to rank; skipping avoids failing the whole TaxID on one empty hit.
                continue
            best = min(aln.hsps, key=lambda h: (h.expect, -h.identities))
            acc = extract_accession(aln.hit_id, aln.hit_def, getattr(aln, "accession", None))
            pct_id = 100.0 * best.identities / best.align_length if best.align_length else 0.0
            hsps.append((aln, best, acc, pct_id))
        # Rank hits by E-value, breaking ties with higher percent identity.
        hsps.sort(key=lambda t: (t[1].expect, -t[3]))
        keep = hsps[:TOP_HITS_PER_TAXID]

        accs = [acc for _,_,acc,_ in keep]
        acc_to_fa = fetch_protein_fasta(accs)

        per_taxid_fastas.setdefault(tid, [])
        kept_now = 0
        for aln, best, acc, pct in keep:
            # Try every normalized spelling of the accession until one was fetched.
            fa = next((acc_to_fa[k] for k in _norm_keys(acc) if acc_to_fa.get(k)), None)
            if not fa:
                sys.stderr.write(f"[miss] No FASTA for {acc} (taxid {tid})\n")
                continue
            lines = fa.strip().splitlines()
            header = lines[0][1:]
            seq = "\n".join(lines[1:])
            # Prefix the original header with provenance so each record is self-describing.
            new_header = f">taxid:{tid}|acc:{acc}|e:{best.expect:.2e}|pid:{pct:.2f}|len:{best.align_length} {header}"
            per_taxid_fastas[tid].append(new_header + "\n" + seq + "\n"); kept_now += 1

            all_rows.append([
                tid, acc, aln.title, aln.length, best.expect,
                best.identities, best.align_length, round(pct,2),
                min(best.query_start,best.query_end), max(best.query_start,best.query_end),
                min(best.sbjct_start,best.sbjct_end), max(best.sbjct_start,best.sbjct_end)
            ])
        print(f"✅ taxid {tid}: kept {kept_now} sequences")
    except Exception as e:
        # Best effort: report the failure and move on to the next TaxID.
        print(f"⚠️ taxid {tid} failed: {e}")
    finally:
        # Pause after every TaxID, including the "no hits" path that leaves via `continue`.
        time.sleep(SLEEP_BETWEEN_CALLS)

# ==== write combined multi-FASTA + per-taxid FASTAs ====
# Records are emitted in the order the TaxIDs were listed; TaxIDs without hits add nothing.
multi_fa = OUTPUT_DIR / "per_taxid_top_hits.fasta"
with open(multi_fa, "w") as fh:
    fh.writelines(
        record_txt
        for tid in taxids
        for record_txt in per_taxid_fastas.get(tid, [])
    )
print(f"💾 Multi-FASTA: {multi_fa}")

if WRITE_SPLIT_FASTA:
    # Optional: one extra FASTA file per TaxID that was processed.
    for tid, fas in per_taxid_fastas.items():
        split_path = OUTPUT_DIR / f"taxid_{tid}_top_hits.fasta"
        with open(split_path, "w") as fh:
            fh.writelines(fas)
    print("💾 Also wrote per-taxid FASTAs")

# ==== write table ====
# Tab-separated summary: a header row followed by one row per kept hit.
tsv = OUTPUT_DIR / "per_taxid_hits.tsv"
column_names = [
    "taxid", "accession", "title", "subject_length", "evalue", "identities",
    "align_len", "pct_identity", "q_start", "q_end", "s_start", "s_end",
]
with open(tsv, "w", newline="") as table_file:
    table_writer = csv.writer(table_file, delimiter="\t")
    table_writer.writerows([column_names, *all_rows])
print(f"📑 Table: {tsv}")
print(f"🗂 XML files saved: {len(xml_paths)}")

📄 Query source: query_protein.fasta:Nqo13-mod
🧪 Mode chosen: BLASTP vs nr  | length=481
🧬 Loaded 100 TaxIDs
⏳ BLASTP vs nr | taxid=562 | E=1e-05 | hits=200 | matrix=BLOSUM62 | gaps=11 1
   ENTREZ_QUERY: txid562[ORGN]
