<a href="https://colab.research.google.com/github/RobBurnap/Bioinformatics-MICR4203-MICR5203/blob/main/notebooks/L02%E2%80%93BLASTn_Mini%E2%80%91API.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>


# BIOINFO4/5203 — Week 2 Exercise (Foundations)

**Goals for today**
- Mount Google Drive and create your course folders
- Load a small FASTA file
- Compute simple sequence statistics
- Save a plot and a summary text into your `Outputs/` folder

> **Deliverables to Canvas:** the executed notebook (`.ipynb`) and a PDF export with outputs visible.


##Cell 1 — Setup Google Drive + Week Folders

In [1]:
# --- L02 SETUP: mount Google Drive and make week folders ---
%pip -q install biopython

from google.colab import drive
drive.mount('/content/drive')

import os

# Your course root in Drive (change only if you chose a different root in L01)
COURSE_DIR   = "/content/drive/MyDrive/Teaching/BIOINFO4-5203-F25"

# Week label for L02 (used to create subfolders)
LECTURE_CODE = "L02_databases_formats"
TOPIC        = "blastn_seed"

# Per-week data/output folders
DATA_DIR    = f"{COURSE_DIR}/Data/{LECTURE_CODE}_{TOPIC}"
OUTPUT_DIR  = f"{COURSE_DIR}/Outputs/{LECTURE_CODE}_{TOPIC}"
for p in [COURSE_DIR, f"{COURSE_DIR}/Data", f"{COURSE_DIR}/Outputs", DATA_DIR, OUTPUT_DIR]:
    os.makedirs(p, exist_ok=True)

print("✅ Drive mounted.")
print("📁 COURSE_DIR :", COURSE_DIR)
print("📁 DATA_DIR   :", DATA_DIR)
print("📁 OUTPUT_DIR :", OUTPUT_DIR)


[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/3.3 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.3/3.3 MB[0m [31m9.9 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━[0m [32m2.9/3.3 MB[0m [31m42.0 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.3/3.3 MB[0m [31m32.6 MB/s[0m eta [36m0:00:00[0m
[?25hMounted at /content/drive
✅ Drive mounted.
📁 COURSE_DIR : /content/drive/MyDrive/Teaching/BIOINFO4-5203-F25
📁 DATA_DIR   : /content/drive/MyDrive/Teaching/BIOINFO4-5203-F25/Data/L02_databases_formats_blastn_seed
📁 OUTPUT_DIR : /content/drive/MyDrive/Teaching/BIOINFO4-5203-F25/Outputs/L02_databases_formats_blastn_seed


TESTING

In [2]:
# --- BLASTn vs RefSeq mRNA: avoids genomes/chromosomes ---
from Bio import Entrez, SeqIO
from Bio.Blast import NCBIWWW, NCBIXML
import os, csv, time

Entrez.email = "your_email@university.edu"   # <-- set to student's email

# Use the same FASTA_PATH you already selected
rec = next(SeqIO.parse(FASTA_PATH, "fasta"))
query_seq = str(rec.seq)
print(f"🔎 Query: {rec.id} (len={len(rec)} nt)")

print("⏳ BLASTn vs refseq_rna (mRNAs only)…")
blast_handle = NCBIWWW.qblast(
    program="blastn",
    database="refseq_rna",              # <— key change: transcripts only
    sequence=query_seq,
    expect=1e-5,
    megablast=True,
    filter="L",
    entrez_query="srcdb_refseq[PROP] AND biomol_mrna[PROP]"  # reinforce mRNA-only
)
blast_xml = blast_handle.read()
blast_handle.close()

xml_path = f"{OUTPUT_DIR}/blastn_refseqrna.xml"
with open(xml_path, "w") as f:
    f.write(blast_xml)
print("💾 Saved BLAST XML:", xml_path)

record = NCBIXML.read(__import__("io").StringIO(blast_xml))
assert record.alignments, "No BLASTn hits in refseq_rna."

NameError: name 'FASTA_PATH' is not defined


## Cell 2 — Put your unknown DNA FASTA into DATA_DIR
This cell verifies a FASTA is present and, if not, lets students upload directly into the right folder

In [None]:
# --- Data check (and optional upload) ---
import os
%pip install -q biopython       # Install the Biopython package quietly (-q suppresses most output) so we can work with biological sequence files

from google.colab import files
from shutil import move

# Look for a DNA FASTA in DATA_DIR
cands = [f for f in os.listdir(DATA_DIR) if f.lower().endswith((".fa",".fasta",".fna"))]

if not cands:
    print("⚠️ No FASTA found in Data/. Use the picker to upload one now.")
    uploaded = files.upload()  # Choose a local .fa/.fasta/.fna
    for name in uploaded.keys():
        move(name, f"{DATA_DIR}/{name}")
    cands = [f for f in os.listdir(DATA_DIR) if f.lower().endswith((".fa",".fasta",".fna"))]

assert cands, "No FASTA in Data/. Please upload a .fa/.fasta/.fna and re-run this cell."

# Use the newest FASTA
cands.sort(key=lambda f: os.path.getmtime(f"{DATA_DIR}/{f}"), reverse=True)
FASTA_PATH = f"{DATA_DIR}/{cands[0]}"
print("📄 Using FASTA:", FASTA_PATH)

📄 Using FASTA: /content/drive/MyDrive/Teaching/BIOINFO4-5203-F25/Data/L02_databases_formats_blastn_seed/unknown_seq_DNA.fasta


In [4]:
# --- Fetch GenBank + CDS nucleotide and protein for top hits ---
from Bio import Entrez
import csv, time

top_n = 10
hits = []
for aln in record.alignments[:top_n]:
    hsp = aln.hsps[0]
    hits.append({
        "accession": aln.accession,
        "title": aln.title,
        "hit_length_nt": aln.length,
        "align_len": hsp.align_length,
        "identities": hsp.identities,
        "pct_identity": round(100.0 * hsp.identities / hsp.align_length, 2)
    })

gb_dir   = f"{OUTPUT_DIR}/genbank"
cds_na   = f"{OUTPUT_DIR}/cds_nucleotide.fasta"
cds_aa   = f"{OUTPUT_DIR}/cds_protein.fasta"
summary  = f"{OUTPUT_DIR}/hits_summary.csv"
os.makedirs(gb_dir, exist_ok=True)

def efetch_text(db, acc, rettype, retmode="text"):
    with Entrez.efetch(db=db, id=acc, rettype=rettype, retmode=retmode) as h:
        return h.read()

with open(summary, "w", newline="") as cs, \
     open(cds_na, "w") as fa_na, \
     open(cds_aa, "w") as fa_aa:
    w = csv.DictWriter(cs, fieldnames=list(hits[0].keys()))
    w.writeheader()
    for i, h in enumerate(hits, 1):
        acc = h["accession"]
        print(f"  ▶ {i:02d}/{len(hits)} {acc} ({h['pct_identity']}% id)")

        # 1) GenBank (rich annotation)
        try:
            gb_txt = efetch_text("nuccore", acc, "gb")
            gb_path = f"{gb_dir}/{acc}.gb"
            with open(gb_path, "w") as f:
                f.write(gb_txt)
            h["genbank_path"] = gb_path
        except Exception as e:
            h["genbank_path"] = ""
            print("    ⚠️ GenBank fetch failed:", e)

        # 2) CDS nucleotide (for ORF)
        try:
            fa_txt = efetch_text("nuccore", acc, "fasta_cds_na")
            fa_na.write(fa_txt)
        except Exception as e:
            print("    ⚠️ CDS-NA fetch failed:", e)

        # 3) CDS protein (translation)
        try:
            fa_txt = efetch_text("nuccore", acc, "fasta_cds_aa")
            fa_aa.write(fa_txt)
        except Exception as e:
            print("    ⚠️ CDS-AA fetch failed:", e)

        w.writerow(h)
        time.sleep(0.34)  # be polite to NCBI

print("💾 Summary CSV:", summary)
print("💾 CDS nucleotide FASTA:", cds_na)
print("💾 CDS protein FASTA:", cds_aa)
print("💾 GenBank files in:", gb_dir)

NameError: name 'record' is not defined

##Cell 3 — BLASTn Mini‑API (download top similar DNA sequences)
	•	Runs BLASTn at NCBI (database nt for L02 simplicity)
	•	Saves: raw BLAST XML, a FASTA of top hits, and a CSV summary

In [None]:
# --- BLASTn mini-API: find similar DNA and download top hits ---
from Bio import Entrez, SeqIO
from Bio.Blast import NCBIWWW, NCBIXML
import os, csv, time

# REQUIRED by NCBI: set your email (students: put YOUR email)
Entrez.email = "your_email@university.edu"  # <-- change me
API_KEY = None  # optional: paste NCBI API key to raise rate limits

# Read first record as query
rec = next(SeqIO.parse(FASTA_PATH, "fasta"))
query_seq = str(rec.seq)
print(f"🔎 Query record: {rec.id} (len={len(rec)} nt)")

# Run BLASTn vs nt (fast & familiar for L02)
print("⏳ Running BLASTn vs nt …")
blast_handle = NCBIWWW.qblast(
    program="blastn",
    database="nt",
    sequence=query_seq,
    expect=1e-5,
    megablast=True,
    filter="L",                # low-complexity filter
    entrez_query=None          # no organism filter in L02
)
blast_xml = blast_handle.read()
blast_handle.close()

# Save XML (reproducibility)
xml_path = f"{OUTPUT_DIR}/blastn_results.xml"
with open(xml_path, "w") as f:
    f.write(blast_xml)
print("💾 Saved BLAST XML:", xml_path)

# Parse top hits
record = NCBIXML.read(__import__("io").StringIO(blast_xml))
assert record.alignments, "BLAST returned no hits. Try a different sequence."

top_n = 10
hits = []
for aln in record.alignments[:top_n]:
    hsp = aln.hsps[0]
    hits.append({
        "title": aln.title,
        "accession": aln.accession,
        "length": aln.length,
        "identity": hsp.identities,
        "align_len": hsp.align_length,
        "pct_identity": round(100.0 * hsp.identities / hsp.align_length, 2)
    })

# Fetch FASTA for each hit
def efetch_fasta(acc):
    params = dict(db="nuccore", id=acc, rettype="fasta", retmode="text")
    if API_KEY: params["api_key"] = API_KEY
    with Entrez.efetch(**params) as h:
        return h.read()

fasta_out = f"{OUTPUT_DIR}/blastn_top_hits.fasta"
csv_out   = f"{OUTPUT_DIR}/blastn_top_hits_summary.csv"

with open(fasta_out, "w") as fa, open(csv_out, "w", newline="") as cs:
    writer = csv.DictWriter(cs, fieldnames=list(hits[0].keys()))
    writer.writeheader()
    for i, h in enumerate(hits, 1):
        try:
            fa.write(efetch_fasta(h["accession"]))
            writer.writerow(h)
            print(f"  ✅ {i:02d}/{len(hits)} {h['accession']}  {h['pct_identity']}% id")
        except Exception as e:
            print(f"  ⚠️  Failed {h['accession']}: {e}")
        time.sleep(0.34 if not API_KEY else 0.1)  # be polite to NCBI

print(f"\n💾 FASTA of hits   : {fasta_out}")
print(f"💾 Hits summary CSV: {csv_out}")

🔎 Query record: unknown_seq (len=1428 nt)
⏳ Running BLASTn vs nt …
💾 Saved BLAST XML: /content/drive/MyDrive/Teaching/BIOINFO4-5203-F25/Outputs/L02_databases_formats_blastn_seed/blastn_results.xml
  ✅ 01/10 AP012277  100.0% id
  ✅ 02/10 CP012832  100.0% id
  ✅ 03/10 CP129344  100.0% id
  ✅ 04/10 CP094998  100.0% id
  ✅ 05/10 CP003265  100.0% id
  ✅ 06/10 AP012205  100.0% id
  ✅ 07/10 CP028094  100.0% id
  ✅ 08/10 BA000022  100.0% id
  ✅ 09/10 CP073017  100.0% id
  ✅ 10/10 AP012278  100.0% id

💾 FASTA of hits   : /content/drive/MyDrive/Teaching/BIOINFO4-5203-F25/Outputs/L02_databases_formats_blastn_seed/blastn_top_hits.fasta
💾 Hits summary CSV: /content/drive/MyDrive/Teaching/BIOINFO4-5203-F25/Outputs/L02_databases_formats_blastn_seed/blastn_top_hits_summary.csv



## Cell 4 — (Optional) Quick visualization: identity distribution


In [None]:
# --- Quick plot of % identity for the top hits ---
import pandas as pd
import matplotlib.pyplot as plt

df = pd.read_csv(csv_out)
plt.figure()
df["pct_identity"].plot(kind="bar")
plt.title("% identity of BLASTn top hits")
plt.xlabel("Hit index")
plt.ylabel("% identity")
plt.tight_layout()

png_path = f"{OUTPUT_DIR}/blastn_identity_plot.png"
plt.savefig(png_path, dpi=150)
print("💾 Saved plot:", png_path)

Translating sequences to protein...

>unknown_seq
MYEKLQPPSVGSKITFVAGKPVVPNDPIIPYIRGDGTGVDIWPATELVINAAIAKAYGGERKINWFKVYAGDEACELYGTYQYLPEDTLTAIKEYGVAIKGPLTTPVGGGIRSLNVALRQIFDLYTCVRPCRYYPGTPSPHKTPEKLDIIVYRENTEDIYLGIEWAEGTEGAKKLIAYLNDELIPTTPALGKKQIRLDSGIGIKPISKTGSQRLVRRAILHALRLPKAKQMVTLVHKGNIMKFTEGAFRDWGYELATTEFRAECVTERESWILGNKESNPDLTIEANAHMIDPGYDTLTEEKQAVIKQEVEQVLNSIWESHGNGQWKEKVMVNDRIADSIFQQIQTRPDEYSILATMNLNGDYLSDAAAAVVGGLGMGPGANIGDSAAIFEATHGTAPKHAGLDRINPGSVILSGVMMLEFMGWQEAADLIKKGIGAAIANREVTYDLARLMEPKVDKPLKCSEFAQAIVSHFDD

✅ Translations saved to: /content/drive/MyDrive/Teaching/BIOINFO4-5203-F25/Outputs/L01_foundations/translated_proteins.fasta


## Cell 5 — Canvas summary (simple key/value printout)

In [None]:
# --- Summary for Canvas auto-grading / quick check ---
summary = {
    "LECTURE": LECTURE_CODE,
    "TOPIC": TOPIC,
    "QUERY_FASTA": os.path.basename(FASTA_PATH),
    "HITS_FASTA": os.path.basename(fasta_out),
    "HITS_CSV": os.path.basename(csv_out)
}
print("=== L02 SUMMARY ===")
for k,v in summary.items():
    print(f"{k}={v}")

with open(f"{OUTPUT_DIR}/summary.txt","w") as f:
    for k,v in summary.items():
        f.write(f"{k}={v}\n")
print("💾 Wrote", f"{OUTPUT_DIR}/summary.txt")

💾 Saved CSV -> /content/drive/MyDrive/Teaching/BIOINFO4-5203-F25/Outputs/L01_foundations/seq_summary.csv
🖼️ Saved PNG -> /content/drive/MyDrive/Teaching/BIOINFO4-5203-F25/Outputs/L01_foundations/lengths_barplot.png
📦 Output dir listing: ['lengths_barplot.png', 'summary.txt', 'seq_summary.csv', 'translated_proteins.fasta']


Now translate the sequence by running this code:

## E. Results summary (copy into Canvas if requested)

In [None]:

summary_path = f"{OUTPUT_DIR}/summary.txt"
with open(summary_path, "w") as f:
    f.write(f"LECTURE={LECTURE_CODE}\n")
    f.write(f"TOPIC={TOPIC}\n")
    f.write(f"N_records={len(records)}\n")
    f.write(f"FASTA={os.path.basename(fasta_path)}\n")
print("📝 Saved summary ->", summary_path)

print("=== SUMMARY ===")
print("LECTURE=", LECTURE_CODE)
print("TOPIC=", TOPIC)
print("N_records=", len(records))
print("FASTA=", os.path.basename(fasta_path))


📝 Saved summary -> /content/drive/MyDrive/Teaching/BIOINFO4-5203-F25/Outputs/L01_foundations/summary.txt
=== SUMMARY ===
LECTURE= L01
TOPIC= foundations
N_records= 1
FASTA= unknown_seq_DNA.fasta



## F. Export & submit
- **File → Print → Save as PDF**, then upload the PDF and `.ipynb` to Canvas.  
- Ensure your `Outputs/` folder contains: `seq_summary.csv`, `lengths_barplot.png`, and `summary.txt`.


In [None]:
%pip install biopython

Collecting biopython
  Downloading biopython-1.85-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (13 kB)
Downloading biopython-1.85-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.3/3.3 MB[0m [31m25.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: biopython
Successfully installed biopython-1.85


In [3]:
import os

# Look for a DNA FASTA in DATA_DIR
cands = [f for f in os.listdir(DATA_DIR) if f.lower().endswith((".fa",".fasta",".fna"))]

# Use the newest FASTA
cands.sort(key=lambda f: os.path.getmtime(f"{DATA_DIR}/{f}"), reverse=True)
FASTA_PATH = f"{DATA_DIR}/{cands[0]}"
print("📄 Using FASTA:", FASTA_PATH)

📄 Using FASTA: /content/drive/MyDrive/Teaching/BIOINFO4-5203-F25/Data/L02_databases_formats_blastn_seed/unknown_seq_DNA.fasta


In [5]:
from Bio.Blast import NCBIXML
import io

# Assuming blast_xml is available from a previous cell
# If not, you would need to load it from the saved XML file
# xml_path = f"{OUTPUT_DIR}/blastn_refseqrna.xml" # Or blastn_results.xml depending on which blast run you're using
# with open(xml_path, "r") as f:
#     blast_xml = f.read()

record = NCBIXML.read(io.StringIO(blast_xml))
assert record.alignments, "No BLAST hits." # Add a check for alignments

NameError: name 'blast_xml' is not defined

In [6]:
# --- BLASTn mini-API: find similar DNA and download top hits ---
from Bio import Entrez, SeqIO
from Bio.Blast import NCBIWWW, NCBIXML
import os, csv, time

# REQUIRED by NCBI: set your email (students: put YOUR email)
Entrez.email = "your_email@university.edu"  # <-- change me
API_KEY = None  # optional: paste NCBI API key to raise rate limits

# Read first record as query
rec = next(SeqIO.parse(FASTA_PATH, "fasta"))
query_seq = str(rec.seq)
print(f"🔎 Query record: {rec.id} (len={len(rec)} nt)")

# Run BLASTn vs nt (fast & familiar for L02)
print("⏳ Running BLASTn vs nt …")
blast_handle = NCBIWWW.qblast(
    program="blastn",
    database="nt",
    sequence=query_seq,
    expect=1e-5,
    megablast=True,
    filter="L",                # low-complexity filter
    entrez_query=None          # no organism filter in L02
)
blast_xml = blast_handle.read()
blast_handle.close()

# Save XML (reproducibility)
xml_path = f"{OUTPUT_DIR}/blastn_results.xml"
with open(xml_path, "w") as f:
    f.write(blast_xml)
print("💾 Saved BLAST XML:", xml_path)

🔎 Query record: unknown_seq (len=1428 nt)
⏳ Running BLASTn vs nt …
💾 Saved BLAST XML: /content/drive/MyDrive/Teaching/BIOINFO4-5203-F25/Outputs/L02_databases_formats_blastn_seed/blastn_results.xml


Process XML to easy to read docs

In [7]:
# --- Summarize BLAST XML to a tidy table + CSV ---
from Bio.Blast import NCBIXML
import pandas as pd, os, io

XML_PATH = "/content/drive/MyDrive/Teaching/BIOINFO4-5203-F25/Outputs/L02_databases_formats_blastn_seed/blastn_results.xml"
assert os.path.exists(XML_PATH), "XML file not found. Check the path."

with open(XML_PATH) as f:
    blast_record = NCBIXML.read(f)  # one query -> one record

rows = []
for aln in blast_record.alignments:
    # take the first HSP per alignment for summary
    h = aln.hsps[0]
    rows.append({
        "accession": aln.accession,
        "title": aln.title,
        "length_nt": aln.length,
        "bitscore": h.bits,
        "evalue": h.expect,
        "identities": h.identities,
        "align_len": h.align_length,
        "pct_identity": round(100.0*h.identities/h.align_length, 2),
        "q_start": h.query_start,
        "q_end": h.query_end,
        "s_start": h.sbjct_start,
        "s_end": h.sbjct_end,
        "gaps": h.gaps if hasattr(h, "gaps") else None
    })

df = pd.DataFrame(rows)
df_sorted = df.sort_values(["evalue","bitscore"], ascending=[True, False]).reset_index(drop=True)
display(df_sorted.head(20))

# Save CSV next to the XML
CSV_OUT = os.path.join(os.path.dirname(XML_PATH), "blastn_results_summary.csv")
df_sorted.to_csv(CSV_OUT, index=False)
print("💾 Saved:", CSV_OUT)

Unnamed: 0,accession,title,length_nt,bitscore,evalue,identities,align_len,pct_identity,q_start,q_end,s_start,s_end,gaps
0,AP012277,gi|359273400|dbj|AP012277.1| Synechocystis sp....,3570103,2638.14,0.0,1428,1428,100.0,1,1428,283048,284475,0
1,CP012832,gi|939195038|gb|CP012832.1| Synechocystis sp. ...,3569196,2638.14,0.0,1428,1428,100.0,1,1428,283060,284487,0
2,CP129344,gi|2890936204|gb|CP129344.1| Synechocystis sp....,3569582,2638.14,0.0,1428,1428,100.0,1,1428,283061,284488,0
3,CP094998,gi|2221391174|gb|CP094998.1| Synechocystis sp....,3569929,2638.14,0.0,1428,1428,100.0,1,1428,283060,284487,0
4,CP003265,gi|451779298|gb|CP003265.1| Synechocystis sp. ...,3569561,2638.14,0.0,1428,1428,100.0,1,1428,283060,284487,0
5,AP012205,gi|339272262|dbj|AP012205.1| Synechocystis sp....,3571103,2638.14,0.0,1428,1428,100.0,1,1428,283060,284487,0
6,CP028094,gi|1370130508|gb|CP028094.1| Synechocystis sp....,3574310,2638.14,0.0,1428,1428,100.0,1,1428,283060,284487,0
7,BA000022,gi|47118304|dbj|BA000022.2| Synechocystis sp. ...,3573470,2638.14,0.0,1428,1428,100.0,1,1428,283060,284487,0
8,CP073017,gi|2051288975|gb|CP073017.1| Synechocystis sp....,3571181,2638.14,0.0,1428,1428,100.0,1,1428,283059,284486,0
9,AP012278,gi|359276570|dbj|AP012278.1| Synechocystis sp....,3570114,2638.14,0.0,1428,1428,100.0,1,1428,283060,284487,0


💾 Saved: /content/drive/MyDrive/Teaching/BIOINFO4-5203-F25/Outputs/L02_databases_formats_blastn_seed/blastn_results_summary.csv


filter to de-duplicate and save top hit only

In [8]:
# --- Non-redundant BLAST hits + FASTA export (unique by accession; optional by TaxID) ---
from Bio.Blast import NCBIXML
from Bio import Entrez
import pandas as pd, os, io, csv, time, re

# REQUIRED by NCBI
Entrez.email = "your_email@university.edu"  # <-- change me
API_KEY = None  # optional API key

# 1) Point to your BLAST XML and output folder
XML_PATH = XML_PATH if 'XML_PATH' in globals() else "/content/drive/MyDrive/Teaching/BIOINFO4-5203-F25/Outputs/L02_databases_formats_blastn_seed/blastn_results.xml"
assert os.path.exists(XML_PATH), f"Missing XML at: {XML_PATH}"

TOP_HIT_FASTA = os.path.join(OUTPUT_DIR, "top_hit.fasta")
NR_FASTA      = os.path.join(OUTPUT_DIR, "nr_hits.fasta")
NR_SUMMARY    = os.path.join(OUTPUT_DIR, "nr_hits_summary.csv")

# 2) Parse BLAST XML and collect a ranked hit list
with open(XML_PATH) as f:
    rec = NCBIXML.read(f)

def hit_rows(rec):
    rows = []
    for aln in rec.alignments:
        hsp = aln.hsps[0]
        rows.append({
            "accession": aln.accession,
            "title": aln.title,
            "length_nt": aln.length,
            "bitscore": hsp.bits,
            "evalue": hsp.expect,
            "identities": hsp.identities,
            "align_len": hsp.align_length,
            "pct_identity": round(100.0*hsp.identities/hsp.align_length, 2),
            "q_start": hsp.query_start, "q_end": hsp.query_end,
            "s_start": hsp.sbjct_start, "s_end": hsp.sbjct_end
        })
    df = pd.DataFrame(rows)
    if df.empty:
        raise ValueError("No BLAST alignments found.")
    # Rank: lowest E, then highest bitscore, then highest %ID, then longest align_len
    df = df.sort_values(["evalue","bitscore","pct_identity","align_len"],
                        ascending=[True, False, False, False]).reset_index(drop=True)
    return df

df = hit_rows(rec)

# 3) De-duplicate by accession (simple redundancy removal)
df_acc = df.drop_duplicates(subset=["accession"], keep="first").reset_index(drop=True)

# 4) Optional: de-duplicate by organism (TaxID) to ensure taxonomic diversity
#    We'll resolve TaxID via ESummary for each accession.
def esummary_one_acc(acc):
    params = dict(db="nuccore", id=acc, retmode="json")
    if API_KEY: params["api_key"] = API_KEY
    with Entrez.esummary(**params) as h:
        res = h.read()
    # Parse minimal fields without full JSON lib (ESummary returns JSON-ish text in many cases)
    import json
    data = json.loads(res)
    try:
        uid = next(iter(data["result"]["uids"]))
        item = data["result"][uid]
        taxid = str(item.get("taxid", "")) if isinstance(item.get("taxid", ""), (int,str)) else ""
        org   = item.get("organism", "")
    except Exception:
        taxid, org = "", ""
    return taxid, org

# Toggle this to True if you want "one best per organism"
ENFORCE_ONE_PER_TAXON = True

if ENFORCE_ONE_PER_TAXON:
    tax_rows = []
    for acc in df_acc["accession"].tolist():
        try:
            taxid, org = esummary_one_acc(acc)
        except Exception as e:
            taxid, org = "", ""
        tax_rows.append({"accession": acc, "taxid": taxid, "organism": org})
        time.sleep(0.25 if not API_KEY else 0.1)

    tax_df = pd.DataFrame(tax_rows)
    df_acc = df_acc.merge(tax_df, on="accession", how="left")

    # Keep only the top (already ranked) hit per TaxID
    # If TaxID missing, treat accession itself as its own group so we don't drop it.
    df_acc["tax_group"] = df_acc["taxid"].where(df_acc["taxid"].astype(bool), df_acc["accession"])
    df_nr = df_acc.drop_duplicates(subset=["tax_group"], keep="first").reset_index(drop=True)
else:
    df_nr = df_acc.copy()
    df_nr["taxid"] = ""
    df_nr["organism"] = ""
    df_nr["tax_group"] = ""

# 5) Export FASTA for top hit and for non-redundant set
from Bio import Entrez

def efetch_fasta(acc):
    params = dict(db="nuccore", id=acc, rettype="fasta", retmode="text")
    if API_KEY: params["api_key"] = API_KEY
    with Entrez.efetch(**params) as h:
        return h.read()

# (a) Single top hit FASTA
top_acc = df_nr.iloc[0]["accession"]
top_fa  = efetch_fasta(top_acc)
with open(TOP_HIT_FASTA, "w") as f:
    f.write(top_fa)

# (b) Non-redundant set FASTA (accession-unique or one-per-taxon)
with open(NR_FASTA, "w") as f:
    for acc in df_nr["accession"]:
        try:
            f.write(efetch_fasta(acc))
            time.sleep(0.25 if not API_KEY else 0.1)
        except Exception as e:
            print(f"⚠️ Failed to fetch {acc}: {e}")

# 6) Save a clean summary CSV
cols = ["accession","organism","taxid","pct_identity","bitscore","evalue",
        "align_len","length_nt","title"]
df_nr.to_csv(NR_SUMMARY, index=False, columns=[c for c in cols if c in df_nr.columns])

print("✅ De-dup complete.")
print("  • Top hit FASTA     :", TOP_HIT_FASTA)
print("  • NR hits FASTA     :", NR_FASTA)
print("  • NR hits summary   :", NR_SUMMARY)
print("  • NR count          :", len(df_nr))

✅ De-dup complete.
  • Top hit FASTA     : /content/drive/MyDrive/Teaching/BIOINFO4-5203-F25/Outputs/L02_databases_formats_blastn_seed/top_hit.fasta
  • NR hits FASTA     : /content/drive/MyDrive/Teaching/BIOINFO4-5203-F25/Outputs/L02_databases_formats_blastn_seed/nr_hits.fasta
  • NR hits summary   : /content/drive/MyDrive/Teaching/BIOINFO4-5203-F25/Outputs/L02_databases_formats_blastn_seed/nr_hits_summary.csv
  • NR count          : 42
