<a href="https://colab.research.google.com/github/RobBurnap/Bioinformatics-MICR4203-MICR5203/blob/main/BLASTp_taxon_directed_v3_scratch.ipynb/NDH-1%20/notebooks/BLASTp_taxon_directed_v1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Cell 0: Mount Drive and define project folders

In [None]:
!fusermount -u /content/drive 2>/dev/null || true
!rm -rf /content/drive 2>/dev/null || true
!mkdir -p /content/drive

In [4]:
# Cell 0: Mount Drive and define project folders
# This notebook uses Google Drive as a *temporary scratch workspace*.
# Put your inputs (query FASTA + taxid list) in:  DATA_DIR
# Outputs will be written to:                   OUTPUT_DIR
#
# Recommended structure in Drive:
#   MyDrive/_Scratch/NDH1_BLASTp_taxon_directed/
#       Data/
#       Outputs/

from google.colab import drive
import os

# Attach Google Drive at /content/drive. Re-running this is harmless; the call
# below does not force a remount.
drive.mount("/content/drive", force_remount=False)

# --- Scratch workspace (edit the path if you want a different folder) ---
PROJECT_ROOT = "/content/drive/MyDrive/_Scratch/NDH1_BLASTp_taxon_directed"

# Inputs are read from Data/, results are written to Outputs/.
DATA_DIR = PROJECT_ROOT + "/Data"
OUTPUT_DIR = PROJECT_ROOT + "/Outputs"

# Make sure both folders exist before any later cell touches them.
for _folder in (DATA_DIR, OUTPUT_DIR):
    os.makedirs(_folder, exist_ok=True)

# Echo the resolved locations (names padded so the colons line up).
for _name, _path in (
    ("PROJECT_ROOT", PROJECT_ROOT),
    ("DATA_DIR", DATA_DIR),
    ("OUTPUT_DIR", OUTPUT_DIR),
):
    print(f"{_name:<12}:", _path)


Mounted at /content/drive
PROJECT_ROOT: /content/drive/MyDrive/_Scratch/NDH1_BLASTp_taxon_directed
DATA_DIR    : /content/drive/MyDrive/_Scratch/NDH1_BLASTp_taxon_directed/Data
OUTPUT_DIR  : /content/drive/MyDrive/_Scratch/NDH1_BLASTp_taxon_directed/Outputs


## Cell 1: Install dependencies (BLAST+, Biopython, pandas, requests)

In [5]:
!apt-get update -qq
!apt-get install -y ncbi-blast+ -qq

!pip install biopython pandas -q
!apt-get install -y ncbi-blast+ -qq
!pip install biopython pandas requests -q

W: Skipping acquire of configured file 'main/source/Sources' as repository 'https://r2u.stat.illinois.edu/ubuntu jammy InRelease' does not seem to provide it (sources.list entry misspelt?)


## Cell 2: Configuration (paths, NCBI e-mail, input files)

In [12]:
# Cell 2: Configuration (paths, email, input files)

import os
import pandas as pd
from Bio import Entrez, SeqIO

# REQUIRED by NCBI — use your real e-mail.
# To run under your own address without editing this cell, set the NCBI_EMAIL
# environment variable first (e.g. `%env NCBI_EMAIL=you@example.org`).
# When it is unset or empty, the address below is used.
Entrez.email = os.environ.get("NCBI_EMAIL") or "rob.burnap@okstate.edu"

# ===== Choose the subunit you are working on =====
# You will run the notebook one subunit at a time (simple + reproducible).
SUBUNIT = "NdhC"   # e.g., "NdhA", "NdhB", "NdhF", ...

# ===== INPUT FILES (place these in DATA_DIR) =====
# Query FASTA should contain ONE protein sequence (the query).
QUERY_FASTA = f"{DATA_DIR}/{SUBUNIT}_Telong.fas"

# TaxID list is a 2-column, TAB-delimited file: Label <tab> TaxID
# Example header line allowed:  Header<TAB>TaxID
TAXID_LIST  = f"{DATA_DIR}/ndh_project_taxids.txt"

# ===== OUTPUT FILES (written to OUTPUT_DIR) =====
SUMMARY_CSV = f"{OUTPUT_DIR}/{SUBUNIT}_top_hits_by_taxid2.csv"
MSA_FASTA   = f"{OUTPUT_DIR}/{SUBUNIT}_top_hits_by_taxid2.faa"

# Echo the resolved configuration so the run is self-documenting.
print("SUBUNIT           :", SUBUNIT)
print("Query FASTA       :", QUERY_FASTA)
print("TaxID List        :", TAXID_LIST)
print("Summary CSV       :", SUMMARY_CSV)
print("Output FASTA      :", MSA_FASTA)


SUBUNIT           : NdhC
Query FASTA       : /content/drive/MyDrive/_Scratch/NDH1_BLASTp_taxon_directed/Data/NdhC_Telong.fas
TaxID List        : /content/drive/MyDrive/_Scratch/NDH1_BLASTp_taxon_directed/Data/ndh_project_taxids.txt
Summary CSV       : /content/drive/MyDrive/_Scratch/NDH1_BLASTp_taxon_directed/Outputs/NdhC_top_hits_by_taxid2.csv
Output FASTA      : /content/drive/MyDrive/_Scratch/NDH1_BLASTp_taxon_directed/Outputs/NdhC_top_hits_by_taxid2.faa


# Cell 3: Set BLAST parameters, then load the TaxID table

In [13]:
# BLAST parameters (tune these if needed)

# Search settings
BLAST_PROGRAM = "blastp"
BLAST_DB = "nr"        # NCBI 'nr', searched remotely
EVALUE = 1e-5
HITLIST_SIZE = 10      # number of hits requested per search
MAX_HSPS = 1

# Courtesy pause, in seconds, between consecutive NCBI requests
NCBI_SLEEP = 2

# Echo every setting (names padded to 14 columns so the colons line up)
for _name, _value in (
    ("BLAST_PROGRAM", BLAST_PROGRAM),
    ("BLAST_DB", BLAST_DB),
    ("EVALUE", EVALUE),
    ("HITLIST_SIZE", HITLIST_SIZE),
    ("MAX_HSPS", MAX_HSPS),
    ("NCBI_SLEEP", NCBI_SLEEP),
):
    print(f"{_name:<14}:", _value)


BLAST_PROGRAM : blastp
BLAST_DB      : nr
EVALUE        : 1e-05
HITLIST_SIZE  : 10
MAX_HSPS      : 1
NCBI_SLEEP    : 2


In [15]:
# Cell 3: Load TaxID table (Header <tab> TaxID)

import csv

def load_taxids(taxid_file):
    """
    Read a two-column, TAB-delimited file of ``Label<TAB>TaxID`` rows.

    Rows with fewer than two columns are ignored, a literal
    ``Header<TAB>TaxID`` line (case-insensitive) is skipped silently, and any
    other row whose second column is not all digits is reported and skipped.
    Surrounding whitespace is stripped from both columns.

    The file is decoded as UTF-8, and a leading byte-order mark (as written by
    some Windows editors and spreadsheets) is discarded so that it cannot end
    up glued to the first label.

    Parameters
    ----------
    taxid_file : str or path-like
        Location of the TaxID list.

    Returns
    -------
    (labels, taxids) : tuple of two lists of str
        Parallel lists; ``taxids[i]`` belongs to ``labels[i]``.
    """
    labels, taxids = [], []

    # "utf-8-sig" is plain UTF-8 that also swallows an optional BOM;
    # newline="" is how the csv module expects its input file to be opened.
    with open(taxid_file, "r", encoding="utf-8-sig", newline="") as fh:
        for row in csv.reader(fh, delimiter="\t"):
            if len(row) < 2:
                continue

            label = row[0].strip()
            taxid = row[1].strip()

            # Skip header line
            if label.lower() == "header" and taxid.lower() == "taxid":
                continue

            if not taxid.isdigit():
                print("Skipping bad line:", row)
                continue

            labels.append(label)
            taxids.append(taxid)

    return labels, taxids

# Load the project's TaxID table.
labels, taxids = load_taxids(TAXID_LIST)

# Report how many entries were read and preview the first ten pairs.
print(f"Loaded {len(labels)} taxonomic labels/taxids.")
print(list(zip(labels[:10], taxids[:10])))

Loaded 117 taxonomic labels/taxids.
[('A1.Candidatus', '3018267'), ('A10.Thermoplasmatales', '261391'), ('A11.Cuniculiplasma', '1673428'), ('A12.Candidatus', '2608793'), ('A13.Thermoplasma', '1973142'), ('A14.Candidatus', '2026803'), ('A15.Thermoplasma', '273116'), ('A16.Thermoplasma', '50339'), ('A17.Candidatus', '3107143'), ('A18.Thermoplasmatales', '667138')]


# Cell 4: BLAST + Entrez helpers

In [16]:
# Cell 4: BLAST + Entrez helper functions (upgraded for iTOL metadata)

import time, re
import requests
from Bio import Entrez, SeqIO
from Bio.Blast import NCBIWWW, NCBIXML

def run_taxid_blastp(query_seq, taxid, hitlist_size=10, expect=1e-5):
    """
    Remote blastp against nr restricted to a TaxID.

    Parameters
    ----------
    query_seq : str
        Protein query sequence passed to the BLAST service.
    taxid : str or int
        NCBI TaxID; the search is limited with the Entrez query
        ``txid<taxid>[ORGN]``.
    hitlist_size : int
        Number of hits requested from the service.
    expect : float
        E-value threshold passed to the service.

    Returns
    -------
    (alignment, stats) for the first alignment listed in the BLAST report,
    where ``stats`` holds ``bitscore``, ``evalue``, ``identity``,
    ``align_length``, ``pident`` and ``title`` taken from that alignment's
    first HSP. Returns ``(None, None)`` when the report contains no
    alignments, or when the first alignment carries no HSPs.
    """
    entrez_query = f"txid{taxid}[ORGN]"
    handle = NCBIWWW.qblast(
        program="blastp",
        database="nr",
        sequence=query_seq,
        entrez_query=entrez_query,
        hitlist_size=hitlist_size,
        expect=expect,
        format_type="XML"
    )
    # Always release the result handle, even if the XML cannot be parsed.
    try:
        blast_record = NCBIXML.read(handle)
    finally:
        handle.close()

    if not blast_record.alignments:
        return None, None

    align = blast_record.alignments[0]
    if not align.hsps:
        # No HSP to take statistics from; report as "no usable hit".
        return None, None
    hsp = align.hsps[0]

    stats = {
        "bitscore": hsp.bits,
        "evalue": hsp.expect,
        "identity": hsp.identities,
        "align_length": hsp.align_length,
        # max(1, ...) guards the division against a zero-length alignment.
        "pident": 100*hsp.identities/max(1, hsp.align_length),
        "title": align.title,
    }
    return align, stats

def fetch_full_protein_fasta(accession):
    """
    Download a protein from the Entrez "protein" database as FASTA.

    Returns the first record parsed from the response, or None when the
    response contains no FASTA record. Errors raised by the request or the
    parser propagate to the caller; the handle is closed in either case.
    """
    handle = Entrez.efetch(db="protein", id=accession, rettype="fasta", retmode="text")
    try:
        # Only the first record is used, so stop parsing after it.
        return next(SeqIO.parse(handle, "fasta"), None)
    finally:
        handle.close()

def fetch_gb_record(accession):
    """
    Download a protein from the Entrez "protein" database in GenBank format.

    GenBank record is best for organism name and db_xref (taxon, UniProt).
    Returns the single parsed record. Errors raised by the request or by
    SeqIO.read propagate to the caller; the handle is closed in either case.
    """
    handle = Entrez.efetch(db="protein", id=accession, rettype="gb", retmode="text")
    try:
        return SeqIO.read(handle, "genbank")
    finally:
        handle.close()

def parse_taxid_from_gb(gb_record):
    """
    Return the TaxID found on a "source" feature of a GenBank record.

    Scans the db_xref qualifiers of every "source" feature, in order, and
    returns the digits of the first ``taxon:<digits>`` entry as a string.
    Returns "" when no such entry exists.
    """
    taxon_pattern = re.compile(r"taxon:(\d+)")
    source_features = (f for f in gb_record.features if f.type == "source")
    for source in source_features:
        for xref in source.qualifiers.get("db_xref", []):
            found = taxon_pattern.search(xref)
            if found is not None:
                return found.group(1)
    return ""

def parse_uniprot_from_gb(gb_record):
    """
    Collect UniProt accessions referenced by the CDS features of a record.

    Every db_xref qualifier of every "CDS" feature is matched against
    ``UniProtKB[/<section>]:<accession>``. The distinct accessions are
    returned sorted and joined with ';' (an empty string when none match).
    """
    uniprot_pattern = re.compile(r"UniProtKB(?:/[^:]+)?:([A-Z0-9]{6,10})")
    accessions = set()
    for feature in gb_record.features:
        if feature.type == "CDS":
            for xref in feature.qualifiers.get("db_xref", []):
                found = uniprot_pattern.search(xref)
                if found:
                    accessions.add(found.group(1))
    return ";".join(sorted(accessions))

def uniprot_map_refseq_to_uniprot(refseq_id, timeout=20):
    """
    Fallback: map RefSeq protein (WP_/YP_/NP_) to UniProt using UniProt ID mapping API.
    Returns UniProt accession or "".

    Submits one mapping job (RefSeq_Protein -> UniProtKB), polls its status,
    then reads the results and returns the primaryAccession of the first
    result. ``timeout`` is passed to every HTTP request. Any exception raised
    along the way is swallowed and "" is returned, so a failed lookup is
    indistinguishable from "no mapping found".
    """
    try:
        # Submit the mapping job; the response JSON carries the job id.
        submit = requests.post(
            "https://rest.uniprot.org/idmapping/run",
            data={"from": "RefSeq_Protein", "to": "UniProtKB", "ids": refseq_id},
            timeout=timeout
        )
        submit.raise_for_status()
        job = submit.json()["jobId"]

        # poll: at most 30 status checks, sleeping 1 s after each unfinished one
        for _ in range(30):
            status = requests.get(f"https://rest.uniprot.org/idmapping/status/{job}", timeout=timeout)
            status.raise_for_status()
            js = status.json()
            # Stop when the job reports FINISHED or the reply has no jobStatus key.
            # NOTE(review): presumably a missing jobStatus means the status URL
            # already answered with the results — confirm against the API docs.
            if js.get("jobStatus") in (None, "FINISHED"):
                break
            time.sleep(1)

        # The results are requested even if the polling loop ran out without
        # seeing a finished job.
        res = requests.get(
            f"https://rest.uniprot.org/idmapping/uniprotkb/results/{job}",
            params={"format": "json"},
            timeout=timeout
        )
        res.raise_for_status()
        data = res.json()
        results = data.get("results", [])
        if results:
            # Only the first mapping is used.
            return results[0]["to"]["primaryAccession"]
    except Exception:
        # Best-effort lookup: any failure is reported as "no mapping".
        return ""
    return ""

# Cell 5: Main loop — BLAST each TaxID, then fetch the top hit's sequence and metadata

In [None]:
# Cell 5: Main loop (BLAST per TaxID, fetch sequences + metadata, write iTOL files)

import pandas as pd
from Bio.SeqRecord import SeqRecord
import os, time, re

# Read the query sequence. The first record in QUERY_FASTA is the query;
# an empty file is reported clearly instead of failing with an IndexError.
query_records = list(SeqIO.parse(QUERY_FASTA, "fasta"))
if not query_records:
    raise ValueError(f"No FASTA records found in query file: {QUERY_FASTA}")
if len(query_records) > 1:
    print(f"Warning: {len(query_records)} sequences in {QUERY_FASTA}; only the first is used as the query.")
query_record = query_records[0]

# Accumulators filled by the main loop
summary_rows = []   # one dict per TaxID (hits, misses and errors alike)
seq_out = []        # SeqRecord objects destined for the output FASTA

# iTOL dataset outputs
ITOL_LABELS = f"{OUTPUT_DIR}/{SUBUNIT}_iTOL_LABELS.tsv"
ITOL_POPUP  = f"{OUTPUT_DIR}/{SUBUNIT}_iTOL_POPUP.tsv"

# We'll build these as we go (keyed by NCBI accession)
itol_labels_rows = []   # (acc, display_label)
itol_popup_rows  = []   # (acc, popup_text)

def make_display_label(label, organism, acc, taxid_requested):
    """
    Build the leaf label that iTOL displays for one hit.

    The label is "organism | accession | group label | txid:<requested TaxID>";
    an empty organism is shown as "Unknown_organism". Change the layout here
    if you prefer a different label.
    """
    shown_organism = organism or "Unknown_organism"
    fields = (
        f"{shown_organism}",
        f"{acc}",
        f"{label}",
        f"txid:{taxid_requested}",
    )
    return " | ".join(fields)

def make_popup_text(label, taxid_requested, acc, stats, organism, descr, taxid_hit, uniprot):
    """
    Assemble the multi-line popup text for one hit.

    The text lists the identifying metadata, a blank line, then the BLAST
    statistics read from ``stats`` (missing keys render as empty values).
    The percent identity is shown with two decimals when it is numeric and
    verbatim otherwise. Lines are joined with newline characters.
    """
    pident = stats.get("pident", None)
    if isinstance(pident, (int, float)):
        pident_line = f"BLAST %ident: {pident:.2f}"
    else:
        pident_line = f"BLAST %ident: {stats.get('pident','')}"

    metadata_lines = [
        f"Group label: {label}",
        f"Requested TaxID: {taxid_requested}",
        f"NCBI accession: {acc}",
        f"Organism: {organism}",
        f"Description: {descr}",
        f"TaxID (from GenBank): {taxid_hit}",
        f"UniProt: {uniprot}",
    ]
    blast_lines = [
        f"BLAST bitscore: {stats.get('bitscore','')}",
        f"BLAST evalue: {stats.get('evalue','')}",
        pident_line,
        f"BLAST title: {stats.get('title','')}",
    ]
    # One empty line separates the metadata from the BLAST statistics.
    return "\n".join(metadata_lines + [""] + blast_lines)

# Run across list with real-time reporting.
#
# The FASTA and iTOL outputs are keyed by NCBI accession, so each accession
# may appear in them only once. Different TaxIDs can return the same top hit;
# this set tracks which accessions have already been collected.
seen_accessions = {rec.id for rec in seq_out}

for label, taxid in zip(labels, taxids):
    # flush so the progress line is visible while the (slow) BLAST call runs
    print(f"TaxID {taxid}  ({label}) ... ", end="", flush=True)

    try:
        align, stats = run_taxid_blastp(str(query_record.seq), taxid, hitlist_size=HITLIST_SIZE, expect=EVALUE)
    except Exception as e:
        print("BLAST error:", e)
        summary_rows.append({
            "label": label,
            "taxid": taxid,
            "status": "blast_error",
            "error": str(e)
        })
        time.sleep(NCBI_SLEEP)
        continue

    if align is None:
        print("no hits")
        summary_rows.append({
            "label": label,
            "taxid": taxid,
            "status": "no_hits"
        })
        time.sleep(NCBI_SLEEP)
        continue

    # Accession from Biopython alignment object (works well)
    acc = align.accession
    print("hit:", acc)

    # Fetch FASTA sequence. A failed request must not abort the whole run
    # (results are only written after the loop), so errors are downgraded to
    # the same "fetch_fasta_failed" outcome as an empty response.
    try:
        full = fetch_full_protein_fasta(acc)
    except Exception as e:
        print(f"  (sequence fetch error: {type(e).__name__}: {e})")
        full = None

    if full is None:
        print("  Failed to fetch sequence.")
        summary_rows.append({
            "label": label,
            "taxid": taxid,
            "ncbi_accession": acc,
            "status": "fetch_fasta_failed"
        })
        time.sleep(NCBI_SLEEP)
        continue

    # Fetch GenBank record for metadata
    organism = ""
    descr = ""
    taxid_hit = ""
    uniprot = ""

    try:
        gb = fetch_gb_record(acc)
        organism = gb.annotations.get("organism", "")
        descr = gb.description
        taxid_hit = parse_taxid_from_gb(gb)
        uniprot = parse_uniprot_from_gb(gb)

        # UniProt fallback mapping if missing and looks like RefSeq
        if (not uniprot) and re.match(r"^[A-Z]{2}_\d+", acc):
            unip_fb = uniprot_map_refseq_to_uniprot(acc)
            if unip_fb:
                uniprot = unip_fb
    except Exception as e:
        # keep going; we still have the sequence + BLAST stats
        print(f"  (metadata warning: {type(e).__name__})")

    # Summary table row (always recorded, one per TaxID)
    summary_rows.append({
        "label": label,
        "taxid": taxid,
        "ncbi_accession": acc,
        "taxid_hit": taxid_hit,
        "organism": organism,
        "description": descr,
        "uniprot_accession": uniprot,
        **stats,
        "status": "hit"
    })

    if acc in seen_accessions:
        # Same protein was already the top hit for an earlier TaxID: writing
        # it again would create duplicate FASTA IDs / iTOL keys.
        print(f"  (accession {acc} already collected for an earlier TaxID; not repeated in FASTA/iTOL files)")
    else:
        seen_accessions.add(acc)

        # Write FASTA with a rich header (kept for your records)
        # IMPORTANT: iTOL/tree key should remain the accession, so we do NOT use this header as the ID.
        # We'll store accession as the FASTA ID, and put details in description.
        rich_desc = f"{label}|taxid:{taxid}|org:{organism}|uniprot:{uniprot}"
        out_rec = SeqRecord(full.seq, id=acc, description=rich_desc)
        seq_out.append(out_rec)

        # iTOL datasets
        display_label = make_display_label(label, organism, acc, taxid)
        popup_text = make_popup_text(label, taxid, acc, stats, organism, descr, taxid_hit, uniprot)

        itol_labels_rows.append((acc, display_label))
        itol_popup_rows.append((acc, popup_text))

    time.sleep(NCBI_SLEEP)

# ---- Write outputs ----
import html  # stdlib; escapes popup text, which iTOL renders as HTML

if seq_out:
    SeqIO.write(seq_out, MSA_FASTA, "fasta")
    print("FASTA saved to:", MSA_FASTA)

if summary_rows:
    df = pd.DataFrame(summary_rows)
    df.to_csv(SUMMARY_CSV, index=False)
    print("Summary CSV saved to:", SUMMARY_CSV)

# ---- Write iTOL datasets (LABELS + POPUP) ----
# LABELS format: https://itol.embl.de/help.cgi#labels
# Each data row is: NODE_ID <tab> LABEL
if itol_labels_rows:
    with open(ITOL_LABELS, "w", encoding="utf-8") as f:
        f.write("LABELS\nSEPARATOR TAB\nDATA\n")
        for acc, lab in itol_labels_rows:
            # A stray tab inside the label would be read as an extra field.
            f.write(f"{acc}\t{lab.replace(chr(9), ' ')}\n")
    print("iTOL LABELS saved to:", ITOL_LABELS)

# POPUP format: https://itol.embl.de/help.cgi#popup
# Each data row is: NODE_ID <tab> POPUP_TITLE <tab> POPUP_CONTENT
# POPUP_CONTENT is HTML, so line breaks are written as <br> and the text
# itself is HTML-escaped. The accession doubles as the popup title.
if itol_popup_rows:
    with open(ITOL_POPUP, "w", encoding="utf-8") as f:
        f.write("POPUP_INFO\nSEPARATOR TAB\nDATA\n")
        for acc, text in itol_popup_rows:
            content = html.escape(text.replace("\t", " "), quote=False).replace("\n", "<br>")
            f.write(f"{acc}\t{acc}\t{content}\n")
    print("iTOL POPUP_INFO saved to:", ITOL_POPUP)

TaxID 3018267  (A1.Candidatus) ... hit: HVA22284
TaxID 261391  (A10.Thermoplasmatales) ... hit: EQB67953
TaxID 1673428  (A11.Cuniculiplasma) ... hit: WP_148690108
TaxID 2608793  (A12.Candidatus) ... hit: CAD6492525
TaxID 1973142  (A13.Thermoplasma) ... hit: WP_237265325
TaxID 2026803  (A14.Candidatus) ... hit: MBS3164726
TaxID 273116  (A15.Thermoplasma) ... hit: WP_241760266
TaxID 50339  (A16.Thermoplasma) ... hit: WP_241760266
TaxID 3107143  (A17.Candidatus) ... hit: MFQ6105625
TaxID 667138  (A18.Thermoplasmatales) ... hit: EQB65936
TaxID 2823368  (A19.Candidatus) ... hit: MBX8631139
TaxID 2032688  (A2.Candidatus) ... hit: HEY9206500
TaxID 1904752  (A20.Halobacteriales) ... hit: MFT4890955
TaxID 3073602  (A21.Oxyplasma) ... hit: WP_393970915
TaxID 2026795  (A22.Nitrososphaerota) ... hit: MGE5333765
TaxID 1906667  (A23.Methanomassiliicoccales) ... hit: MDD1768528
TaxID 1945595  (A24.Methanosarcinaceae) ... hit: MCD4702773
TaxID 2026739  (A25.Methanobacteriota) ... hit: HDH28661
TaxID 2

In [None]:
# Quick sanity check: confirm your input files are in DATA_DIR
import os

print("DATA_DIR exists?  ", os.path.isdir(DATA_DIR), DATA_DIR)
print("QUERY_FASTA exists?", os.path.exists(QUERY_FASTA), QUERY_FASTA)
print("TAXID_LIST exists? ", os.path.exists(TAXID_LIST), TAXID_LIST)

!ls -lah "{DATA_DIR}"
