<a href="https://colab.research.google.com/github/RobBurnap/Bioinformatics-MICR4203-MICR5203/blob/main/notebooks/L02%E2%80%93BLASTn_Mini%E2%80%91API.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>


# BIOINFO4/5203 — Week 2 Exercise (Foundations)

**Goals for today**
- Mount Google Drive and create your course folders
- Load a small FASTA file
- Compute simple sequence statistics
- Save a plot and a summary text into your `Outputs/` folder

> **Deliverables to Canvas:** the executed notebook (`.ipynb`) and a PDF export with outputs visible.


## Cell 1 — Setup Google Drive + Week Folders

In [None]:
# --- L02 SETUP: mount Google Drive and make week folders ---
%pip -q install biopython

from google.colab import drive
drive.mount('/content/drive')

import os

# Your course root in Drive (change only if you chose a different root in L01)
COURSE_DIR   = "/content/drive/MyDrive/Teaching/BIOINFO4-5203-F25"

# Week label for L02 (used to create subfolders)
LECTURE_CODE = "L02_databases_formats"
TOPIC        = "blastn_seed"

# Per-week data/output folders
DATA_DIR    = f"{COURSE_DIR}/Data/{LECTURE_CODE}_{TOPIC}"
OUTPUT_DIR  = f"{COURSE_DIR}/Outputs/{LECTURE_CODE}_{TOPIC}"
for p in [COURSE_DIR, f"{COURSE_DIR}/Data", f"{COURSE_DIR}/Outputs", DATA_DIR, OUTPUT_DIR]:
    os.makedirs(p, exist_ok=True)

print("✅ Drive mounted.")
print("📁 COURSE_DIR :", COURSE_DIR)
print("📁 DATA_DIR   :", DATA_DIR)
print("📁 OUTPUT_DIR :", OUTPUT_DIR)


TESTING

In [None]:
# --- Robust BLASTn cascade: refseq_rna -> refseq_genomic -> nt (mega then sensitive) ---
from Bio import Entrez, SeqIO
from Bio.Blast import NCBIWWW, NCBIXML
import io, os, time

# Contact e-mail registered with Biopython's Entrez module.
Entrez.email = "rob.burnap@okstate.edu"  # <-- set this

# FASTA_PATH and OUTPUT_DIR must already exist as notebook globals.
assert all(name in globals() for name in ("FASTA_PATH", "OUTPUT_DIR")), "Run setup first."
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Only the first record in the FASTA file is used as the BLAST query.
rec = next(SeqIO.parse(FASTA_PATH, "fasta"))
# Upper-case the sequence and delete any embedded spaces or newlines.
query_seq = str(rec.seq).upper().translate(str.maketrans("", "", " \n"))
print(f"🔎 Query: {rec.id} (len={len(query_seq)} nt)")
if not len(query_seq) >= 30:
    print("⚠️ Query is short (<30 nt). Consider using a longer region for BLASTn.")

def run_qblast(db, seq, megablast=True, expect=1e-5, extra_query=None):
    """Run one remote BLASTn search and return ``(parsed_record, raw_xml)``.

    Args:
        db: Name of the NCBI database to search (e.g. ``"nt"``).
        seq: Query nucleotide sequence as a plain string.
        megablast: Passed through to ``NCBIWWW.qblast`` as ``megablast``.
        expect: E-value cutoff passed through as ``expect``.
        extra_query: Optional Entrez query string passed as ``entrez_query``.

    Returns:
        A tuple of the record parsed by ``NCBIXML.read`` and the raw XML text,
        so callers can both inspect hits and save the XML unchanged.

    Raises:
        Whatever ``NCBIWWW.qblast`` or ``NCBIXML.read`` raise; callers in this
        notebook catch these per attempt.
    """
    print(f"⏳ Trying BLASTn vs {db}  (megablast={megablast}, E={expect})")
    h = NCBIWWW.qblast(
        program="blastn",
        database=db,
        sequence=seq,
        expect=expect,
        megablast=megablast,
        filter="L",
        entrez_query=extra_query
    )
    # Close the result handle even if reading it fails part-way.
    try:
        xml = h.read()
    finally:
        h.close()
    # Parse from an in-memory copy so the raw XML can also be returned as-is.
    blast_record = NCBIXML.read(io.StringIO(xml))
    return blast_record, xml

# Search plan, tried in order: (database, megablast?, E-value cutoff, Entrez filter).
attempts = [
    ("refseq_rna",       True,  1e-5, "srcdb_refseq[PROP] AND biomol_mrna[PROP]"),
    ("refseq_genomic",   True,  1e-5, None),
    ("nt",               True,  1e-5, None),
    ("nt",               False, 1e-3, None),  # more sensitive
]

# Filled in by the first attempt that returns at least one alignment.
record = xml_used = db_used = None

for db, mega, ecut, q in attempts:
    try:
        r, xml = run_qblast(db, query_seq, megablast=mega, expect=ecut, extra_query=q)
    except Exception as e:
        # A failed attempt is reported and the cascade moves on to the next one.
        print(f"⚠️ BLAST attempt failed for {db}: {e}")
    else:
        if r.alignments:
            db_used = f"{db} (megablast={mega}, E={ecut})"
            record, xml_used = r, xml
            print(f"✅ Hits found in: {db_used}")
            break
        print(f"— No hits in {db} (megablast={mega})")
    # Short pause before the next attempt.
    time.sleep(0.5)

if not (record is not None and record.alignments):
    raise AssertionError("No BLASTn hits in refseq_rna/refseq_genomic/nt. Try a longer or different query.")

# Save XML named for the database used
# Spaces become underscores; '=', ',', '(' and ')' are dropped from the label.
_label_to_filename = str.maketrans({" ": "_", "=": None, ",": None, "(": None, ")": None})
xml_basename = f"blastn_{db_used.translate(_label_to_filename)}.xml"
xml_path = os.path.join(OUTPUT_DIR, xml_basename)
with open(xml_path, "w") as f:
    f.write(xml_used)
print("💾 Saved BLAST XML:", xml_path)

# Expose for downstream cells
BLAST_XML_PATH, BLAST_DB_USED = xml_path, db_used

🔎 Query: unknown_seq (len=1428 nt)
⏳ Trying BLASTn vs refseq_rna  (megablast=True, E=1e-05)
— No hits in refseq_rna (megablast=True)
⏳ Trying BLASTn vs refseq_genomic  (megablast=True, E=1e-05)





## Cell 2 — Put your unknown DNA FASTA into DATA_DIR
This cell verifies a FASTA is present and, if not, lets students upload directly into the right folder


## Cell 4 — (Optional) Quick visualization: identity distribution


In [None]:
# --- Quick plot of % identity for the top hits ---
import pandas as pd
import matplotlib.pyplot as plt

# Locate the hits-summary CSV. `csv_out` is used when an earlier cell defined
# it; otherwise fall back to `CSV_OUT` (the name the XML-summary cell assigns),
# and finally to that cell's default file name inside OUTPUT_DIR. Without this
# the cell stops with a NameError when `csv_out` was never set.
_csv_path = globals().get("csv_out") or globals().get("CSV_OUT")
if not _csv_path:
    _csv_path = f"{OUTPUT_DIR}/blastn_results_summary.csv"

df = pd.read_csv(_csv_path)

# One bar per hit, in the row order of the CSV.
plt.figure()
df["pct_identity"].plot(kind="bar")
plt.title("% identity of BLASTn top hits")
plt.xlabel("Hit index")
plt.ylabel("% identity")
plt.tight_layout()

png_path = f"{OUTPUT_DIR}/blastn_identity_plot.png"
plt.savefig(png_path, dpi=150)
print("💾 Saved plot:", png_path)

## Cell 5 — Canvas summary (simple key/value printout)

In [None]:
# --- Summary for Canvas auto-grading / quick check ---
# `fasta_out` / `csv_out` are used when an earlier cell defined them. When they
# are missing, fall back to the names other cells in this notebook assign
# (CDS_NA_FASTA, CSV_OUT) and finally to "NA", so the summary is still written
# instead of the cell stopping with a NameError.
# NOTE(review): confirm CDS_NA_FASTA is the intended "hits FASTA" for grading.
_nb = globals()
_hits_fasta = _nb.get("fasta_out") or _nb.get("CDS_NA_FASTA") or "NA"
_hits_csv = _nb.get("csv_out") or _nb.get("CSV_OUT") or "NA"

summary = {
    "LECTURE": LECTURE_CODE,
    "TOPIC": TOPIC,
    "QUERY_FASTA": os.path.basename(FASTA_PATH),
    "HITS_FASTA": os.path.basename(_hits_fasta),
    "HITS_CSV": os.path.basename(_hits_csv)
}

# Same KEY=value lines go to the screen and to summary.txt.
print("=== L02 SUMMARY ===")
for k,v in summary.items():
    print(f"{k}={v}")

with open(f"{OUTPUT_DIR}/summary.txt","w") as f:
    for k,v in summary.items():
        f.write(f"{k}={v}\n")
print("💾 Wrote", f"{OUTPUT_DIR}/summary.txt")

Now translate the sequence by running this code:

## E. Results summary (copy into Canvas if requested)

In [None]:

# Write a KEY=value summary file, then echo the same fields to the screen.
# NOTE(review): `records` and `fasta_path` are not assigned in the cells shown
# here — presumably they come from an earlier cell; confirm before running.
summary_path = f"{OUTPUT_DIR}/summary.txt"
with open(summary_path, "w") as f:
    # print(..., file=f) appends the trailing newline for each line.
    print(f"LECTURE={LECTURE_CODE}", file=f)
    print(f"TOPIC={TOPIC}", file=f)
    print(f"N_records={len(records)}", file=f)
    print(f"FASTA={os.path.basename(fasta_path)}", file=f)
print("📝 Saved summary ->", summary_path)

print("=== SUMMARY ===")
for label, value in (
    ("LECTURE=", LECTURE_CODE),
    ("TOPIC=", TOPIC),
    ("N_records=", len(records)),
    ("FASTA=", os.path.basename(fasta_path)),
):
    print(label, value)



## F. Export & submit
- **File → Print → Save as PDF**, then upload the PDF and `.ipynb` to Canvas.  
- Ensure your `Outputs/` folder contains: `seq_summary.csv`, `lengths_barplot.png`, and `summary.txt`.


In [None]:
%pip install biopython

In [None]:
import os

# Look for a DNA FASTA in DATA_DIR
cands = [f for f in os.listdir(DATA_DIR) if f.lower().endswith((".fa",".fasta",".fna"))]

# Stop with an actionable message instead of a bare IndexError on cands[0].
if not cands:
    raise FileNotFoundError(
        f"No FASTA file (.fa/.fasta/.fna) found in {DATA_DIR}. "
        "Upload your unknown DNA FASTA into that folder, then re-run this cell."
    )

# Use the newest FASTA
cands.sort(key=lambda f: os.path.getmtime(f"{DATA_DIR}/{f}"), reverse=True)
FASTA_PATH = f"{DATA_DIR}/{cands[0]}"
print("📄 Using FASTA:", FASTA_PATH)

In [None]:
from Bio.Blast import NCBIXML
import io

# Use the in-memory BLAST XML when an earlier cell produced it. Otherwise
# (e.g. after a runtime restart) reload it from disk: the path recorded in
# `xml_path` when available, else the default file name in OUTPUT_DIR.
if "blast_xml" not in globals():
    _saved_xml = globals().get("xml_path") or f"{OUTPUT_DIR}/blastn_results.xml"
    with open(_saved_xml, "r") as f:
        blast_xml = f.read()

# NCBIXML.read parses the single record; it is fed from an in-memory text buffer.
record = NCBIXML.read(io.StringIO(blast_xml))
assert record.alignments, "No BLAST hits." # Add a check for alignments

In [None]:
# --- BLASTn mini-API: find similar DNA and download top hits ---
from Bio import Entrez, SeqIO
from Bio.Blast import NCBIWWW, NCBIXML
import os, csv, time

# REQUIRED by NCBI: set your email (students: put YOUR email)
Entrez.email = "your_email@university.edu"  # <-- change me
API_KEY = None  # optional: paste NCBI API key to raise rate limits

# Read first record as query
rec = next(SeqIO.parse(FASTA_PATH, "fasta"))
query_seq = str(rec.seq)
print(f"🔎 Query record: {rec.id} (len={len(rec)} nt)")

# Run BLASTn vs nt (fast & familiar for L02)
_qblast_kwargs = {
    "program": "blastn",
    "database": "nt",
    "sequence": query_seq,
    "expect": 1e-5,
    "megablast": True,
    "filter": "L",           # low-complexity filter
    "entrez_query": None,    # no organism filter in L02
}
print("⏳ Running BLASTn vs nt …")
blast_handle = NCBIWWW.qblast(**_qblast_kwargs)
blast_xml = blast_handle.read()
blast_handle.close()

# Save XML (reproducibility)
xml_path = f"{OUTPUT_DIR}/blastn_results.xml"
with open(xml_path, "w") as f:
    f.write(blast_xml)
print("💾 Saved BLAST XML:", xml_path)

Process the BLAST XML into an easy-to-read table

In [None]:
# --- Summarize BLAST XML to a tidy table + CSV ---
from Bio.Blast import NCBIXML
import pandas as pd, os, io

# Prefer the XML path recorded by the most recent BLAST cell; otherwise derive
# the default location from OUTPUT_DIR rather than hard-coding one Drive layout
# (with the default COURSE_DIR this resolves to the same file as before).
if "xml_path" in globals():
    XML_PATH = xml_path
else:
    XML_PATH = os.path.join(OUTPUT_DIR, "blastn_results.xml")
assert os.path.exists(XML_PATH), "XML file not found. Check the path."

with open(XML_PATH) as f:
    blast_record = NCBIXML.read(f)  # one query -> one record

# Fixed column list so an empty hit list still yields a sortable, savable table.
SUMMARY_COLUMNS = [
    "accession", "title", "length_nt", "bitscore", "evalue", "identities",
    "align_len", "pct_identity", "q_start", "q_end", "s_start", "s_end", "gaps",
]

rows = []
for aln in blast_record.alignments:
    # take the first HSP per alignment for summary
    h = aln.hsps[0]
    rows.append({
        "accession": aln.accession,
        "title": aln.title,
        "length_nt": aln.length,
        "bitscore": h.bits,
        "evalue": h.expect,
        "identities": h.identities,
        "align_len": h.align_length,
        "pct_identity": round(100.0*h.identities/h.align_length, 2),
        "q_start": h.query_start,
        "q_end": h.query_end,
        "s_start": h.sbjct_start,
        "s_end": h.sbjct_end,
        "gaps": getattr(h, "gaps", None)
    })

df = pd.DataFrame(rows, columns=SUMMARY_COLUMNS)
# Best hits first: lowest E-value, ties broken by highest bit score.
df_sorted = df.sort_values(["evalue","bitscore"], ascending=[True, False]).reset_index(drop=True)
display(df_sorted.head(20))

# Save CSV next to the XML
CSV_OUT = os.path.join(os.path.dirname(XML_PATH), "blastn_results_summary.csv")
df_sorted.to_csv(CSV_OUT, index=False)
print("💾 Saved:", CSV_OUT)

Filter to de-duplicate the hits and keep only the top hit per organism (TaxID)

In [None]:
# --- Export gene-level ORFs (CDS) for NR hits: mRNA -> CDS, genome -> overlap CDS ---
from Bio.Blast import NCBIXML
from Bio import Entrez, SeqIO
from Bio.SeqFeature import SeqFeature, FeatureLocation
import pandas as pd, os, io, time, json, re

# REQUIRED by NCBI
Entrez.email = "your_email@university.edu"   # <-- set this
API_KEY = None  # optional

# Inputs from earlier cells
assert 'OUTPUT_DIR' in globals(), "Set OUTPUT_DIR earlier."
if 'xml_path' in globals():
    XML_PATH = xml_path  # support either var name
assert os.path.exists(XML_PATH), f"Missing XML at: {XML_PATH}"

# Output paths
CDS_NA_FASTA = os.path.join(OUTPUT_DIR, "nr_hits_CDS_nucleotide.fasta")
CDS_AA_FASTA = os.path.join(OUTPUT_DIR, "nr_hits_CDS_protein.fasta")
NR_SUMMARY   = os.path.join(OUTPUT_DIR, "nr_hits_summary_genes.csv")

# 1) Parse BLAST XML and rank hits
with open(XML_PATH) as f:
    rec = NCBIXML.read(f)


def _hit_row(alignment):
    """Summarize one alignment by its first HSP as a plain dict."""
    first = alignment.hsps[0]
    # Store subject coordinates low-to-high regardless of their order in the HSP.
    lo, hi = sorted((first.sbjct_start, first.sbjct_end))
    return {
        "accession": alignment.accession,
        "title": alignment.title,
        "bitscore": first.bits,
        "evalue": first.expect,
        "pct_identity": round(100.0*first.identities/first.align_length, 2),
        "align_len": first.align_length,
        "s_start": lo,
        "s_end":   hi
    }


rows = [_hit_row(alignment) for alignment in rec.alignments]

# Rank: lowest E-value first, then highest bit score, identity and length.
df = pd.DataFrame(rows)
df = df.sort_values(
    ["evalue","bitscore","pct_identity","align_len"],
    ascending=[True, False, False, False]
)
df = df.reset_index(drop=True)

# 2) De-duplicate by accession; optional: keep one per organism TaxID
def esummary_json(db, ids):
    """Request ESummary for *ids* in *db* and return the decoded JSON object.

    *ids* is an iterable of strings; they are sent as one comma-separated
    ``id`` parameter. The notebook-level ``API_KEY`` is added when it is set.
    """
    request = {"db": db, "id": ",".join(ids), "retmode": "json"}
    if API_KEY:
        request["api_key"] = API_KEY
    with Entrez.esummary(**request) as handle:
        payload = json.load(handle)
    return payload

# Keep one row per accession; rows are already ranked, so the first one wins.
df = df.drop_duplicates(subset=["accession"], keep="first")
df = df.reset_index(drop=True)

# Add TaxID/Organism/Biomol using ESummary (batched for speed)
acc_list = df["accession"].tolist()
meta = esummary_json("nuccore", acc_list)

# Index the per-record summaries by the UIDs listed in the response.
uid_map = {}
for uid in meta["result"]["uids"]:
    uid_map[uid] = meta["result"][uid]
# ESummary may map accessions to UIDs; build acc->(taxid, organism, biomol, slen, title)
def lookup_by_acc(acc):
    """Return ESummary metadata for accession *acc* from the global ``uid_map``.

    A record matches when *acc* equals its ``caption``, its
    ``accessionversion``, or its ``accessionversion`` with the ``.N`` version
    suffix removed. Matching is exact: a prefix test would let a short
    accession pick up a longer one that merely starts with the same
    characters, and would let an empty accession match every record.

    Args:
        acc: Accession string, with or without a version suffix.

    Returns:
        A dict with keys ``uid``, ``taxid``, ``organism``, ``biomol``,
        ``slen`` and ``esummary_title``. When nothing matches, the string
        fields are empty and ``slen`` is 0.
    """
    if acc:
        for uid, recj in uid_map.items():
            caption = recj.get("caption", "")
            acc_ver = recj.get("accessionversion", "")
            if acc in (caption, acc_ver) or acc_ver.split(".")[0] == acc:
                return {
                    "uid": uid,
                    "taxid": str(recj.get("taxid","")),
                    "organism": recj.get("organism",""),
                    "biomol": recj.get("biomol",""),        # "mRNA", "genomic", etc.
                    # A missing or blank length is reported as 0 instead of raising.
                    "slen": int(recj.get("slen") or 0),
                    "esummary_title": recj.get("title","")
                }
    # fallback
    return {"uid":"", "taxid":"", "organism":"", "biomol":"", "slen":0, "esummary_title":""}

# Look up metadata for each accession, in the same order as the rows of df,
# then attach it column-wise (both frames share the default 0..n-1 index).
meta_rows = [lookup_by_acc(acc) for acc in acc_list]
meta_df = pd.DataFrame(meta_rows)
df = pd.concat([df, meta_df], axis=1)

# Optional: keep one best per organism (TaxID) to promote diversity
ENFORCE_ONE_PER_TAXON = True
if ENFORCE_ONE_PER_TAXON:
    df = df.sort_values(["evalue","bitscore","pct_identity","align_len"],
                        ascending=[True, False, False, False])
    # Hits whose metadata lookup failed carry an empty taxid. They are not the
    # same organism, so they must not be collapsed into a single row; only
    # repeats of a known taxid are dropped.
    taxid_known = df["taxid"].astype(str).str.strip() != ""
    repeat_taxon = df.duplicated(subset=["taxid"], keep="first")
    df = df[~(taxid_known & repeat_taxon)].reset_index(drop=True)

# 3) Helpers to fetch sequences
def efetch_text(db, acc, rettype, **kw):
    """Fetch record *acc* from *db* with EFetch and return the response body.

    The request always uses ``retmode="text"``. The notebook-level ``API_KEY``
    is added when it is set, and any extra keyword arguments in *kw* are
    applied last, so they override the defaults.
    """
    request = {
        "db": db,
        "id": acc,
        "rettype": rettype,
        "retmode": "text",
        **({"api_key": API_KEY} if API_KEY else {}),
        **kw,
    }
    with Entrez.efetch(**request) as handle:
        body = handle.read()
    return body

def fetch_genbank(acc):
    """Download the GenBank record for *acc* from nuccore and parse it.

    Returns the single record that ``SeqIO.read`` parses from the text.
    """
    genbank_text = efetch_text("nuccore", acc, "gb")
    buffer = io.StringIO(genbank_text)
    return SeqIO.read(buffer, "genbank")

def fetch_cds_fastas(acc):
    """Return ``(cds_nucleotide_fasta, cds_protein_fasta)`` text for *acc*.

    Both are fetched from nuccore, nucleotide first, using the
    ``fasta_cds_na`` and ``fasta_cds_aa`` return types.
    """
    # For mRNA records, this typically yields one CDS sequence.
    cds_na, cds_aa = (
        efetch_text("nuccore", acc, rettype)
        for rettype in ("fasta_cds_na", "fasta_cds_aa")
    )
    return cds_na, cds_aa

# 4) Build CDS FASTAs (match the CDS overlapping the BLAST HSP when genomic)
# Accumulators: FASTA text chunks (nucleotide / protein) and one summary row per hit.
na_out = []
aa_out = []
summary_rows = []

for _, r in df.iterrows():
    acc = r["accession"]
    biomol = (r["biomol"] or "").lower()
    s_start, s_end = int(r["s_start"]), int(r["s_end"])

    # `used` records how the CDS for this hit was obtained (or why it was not).
    try:
        if biomol == "mrna":
            # Directly fetch CDS for mRNA (usually one coding sequence)
            cds_na, cds_aa = fetch_cds_fastas(acc)
            if cds_na.strip(): na_out.append(cds_na)
            if cds_aa.strip(): aa_out.append(cds_aa)
            # Only claim a CDS when the fetch actually returned sequence text.
            used = "mRNA_CDS" if (cds_na.strip() or cds_aa.strip()) else "no_CDS_found"
        else:
            # Genomic/chromosome: fetch GenBank and pick CDS overlapping HSP coords
            gb = fetch_genbank(acc)
            cds_feats = [feat for feat in gb.features if feat.type == "CDS"]

            chosen = None
            picked_by = "overlap"
            for feat in cds_feats:
                # Biopython locations are 0-based with an exclusive end; the
                # BLAST subject coordinates are 1-based and inclusive.
                f_start = int(feat.location.start) + 1
                f_end   = int(feat.location.end)
                if f_start <= s_end and f_end >= s_start:
                    chosen = feat
                    break
            if chosen is None and cds_feats:
                # Fallback: take the longest CDS (least-bad choice), and label
                # it as a fallback so the summary does not claim an overlap.
                chosen = max(cds_feats, key=lambda f: int(f.location.end)-int(f.location.start))
                picked_by = "longest_fallback"

            if chosen is not None:
                seq = chosen.extract(gb.seq)  # strand handled by Biopython
                # Build headers
                locus = chosen.qualifiers.get("locus_tag", [""])[0]
                gene  = chosen.qualifiers.get("gene", [""])[0]
                prod  = chosen.qualifiers.get("product", [""])[0]
                prot  = chosen.qualifiers.get("protein_id", [""])[0]
                hdr = f">{acc}|CDS|locus={locus}|gene={gene}|prot={prot}|product={prod}"
                na_out.append(f"{hdr}\n{str(seq)}\n")
                # Protein translation if available
                transl = chosen.qualifiers.get("translation", [""])
                if transl and transl[0]:
                    aa_out.append(f"{hdr}\n{transl[0]}\n")
                used = f"genomic_CDS_{picked_by}:{locus or gene or prot}"
            else:
                used = "no_CDS_found"
    except Exception as e:
        # Best effort: one failing hit is recorded in the summary, not fatal.
        used = f"error:{e}"

    summary_rows.append({
        "accession": acc,
        "organism": r.get("organism",""),
        "biomol": r.get("biomol",""),
        "pct_identity": r["pct_identity"],
        "evalue": r["evalue"],
        "bitscore": r["bitscore"],
        "align_len": r["align_len"],
        "strategy": used
    })
    # Pause between hits; shorter when an API key is configured.
    time.sleep(0.25 if not API_KEY else 0.1)

# 5) Write outputs
# Each FASTA file is the concatenation of its collected text chunks.
for out_path, chunks in ((CDS_NA_FASTA, na_out), (CDS_AA_FASTA, aa_out)):
    with open(out_path, "w") as f:
        f.writelines(chunks)

# One CSV row per processed hit.
summary_table = pd.DataFrame(summary_rows)
summary_table.to_csv(NR_SUMMARY, index=False)

print("✅ Wrote:")
for label, written_path in (
    ("  • CDS nucleotide FASTA :", CDS_NA_FASTA),
    ("  • CDS protein FASTA    :", CDS_AA_FASTA),
    ("  • NR gene summary CSV  :", NR_SUMMARY),
):
    print(label, written_path)