<a href="https://colab.research.google.com/github/RobBurnap/Bioinformatics-MICR4203-MICR5203/blob/main/%20%20%20%20NDH-1%20/notebooks/BLASTp_taxon_directed_v1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Cell 0: Mount Drive and define project folders

In [31]:
from google.colab import drive

# Attach Google Drive to the Colab filesystem at /content/drive.
drive.mount('/content/drive')

# Root folder of the project on Drive; edit this if the folder is renamed.
PROJECT_ROOT = "/content/drive/MyDrive/Research/Gunner_Collaboration/Data/BLASTp_taxon-directed"

# Sub-folders derived from the project root.
# NOTE(review): the configuration cell further down assigns DATA_DIR and
# OUTPUT_DIR again with different values, so the two printed here are
# superseded once that cell runs — confirm which layout is intended.
DATA_DIR    = PROJECT_ROOT + "/Data"
OUTPUT_DIR  = PROJECT_ROOT + "/Outputs"

for _caption, _value in (
    ("DATA_DIR   :", DATA_DIR),
    ("OUTPUT_DIR :", OUTPUT_DIR),
):
    print(_caption, _value)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
DATA_DIR   : /content/drive/MyDrive/Research/Gunner_Collaboration/Data/BLASTp_taxon-directed/Data
OUTPUT_DIR : /content/drive/MyDrive/Research/Gunner_Collaboration/Data/BLASTp_taxon-directed/Outputs


## Cell 1: Install dependencies (BLAST+, Biopython, pandas)

In [32]:
!apt-get update -qq
!apt-get install -y ncbi-blast+ -qq

!pip install biopython pandas -q
!apt-get install -y ncbi-blast+ -qq
!pip install biopython pandas requests -q

W: Skipping acquire of configured file 'main/source/Sources' as repository 'https://r2u.stat.illinois.edu/ubuntu jammy InRelease' does not seem to provide it (sources.list entry misspelt?)


## Cell 2: Configuration: paths, NCBI email, input files

In [41]:
# Cell 2: Configuration (paths, email, input files)

import os
import pandas as pd
from Bio import Entrez, SeqIO

# Contact address attached to Entrez requests (REQUIRED by NCBI).
Entrez.email = "rob.burnap@okstate.edu"

# ===== PROJECT DIRECTORIES =====
# These assignments replace any DATA_DIR / OUTPUT_DIR set by an earlier cell.
DATA_DIR   = "/content/drive/MyDrive/Research/Gunner_Collaboration/Data/BLASTp_taxon-directed"
OUTPUT_DIR = "/content/drive/MyDrive/Research/Gunner_Collaboration/Outputs/BLASTp_taxon-directed"

# ===== INPUT FILES =====
QUERY_FASTA = DATA_DIR + "/NdhA_Telong.fas"
TAXID_LIST  = DATA_DIR + "/ndh_project_taxids.txt"

# ===== OUTPUT FILES =====
# NOTE(review): the query file is named "NdhA" while both outputs are named
# "ndhF" — confirm the output names are intended for this query.
SUMMARY_CSV = OUTPUT_DIR + "/ndhF_top_hits_by_taxid.csv"
MSA_FASTA   = OUTPUT_DIR + "/ndhF_top_hits_by_taxid.faa"

# Echo the resolved paths so a wrong folder is visible before any work starts.
for _caption, _resolved in (
    ("Query FASTA       :", QUERY_FASTA),
    ("TaxID List        :", TAXID_LIST),
    ("Summary CSV       :", SUMMARY_CSV),
    ("Output FASTA      :", MSA_FASTA),
):
    print(_caption, _resolved)

Query FASTA       : /content/drive/MyDrive/Research/Gunner_Collaboration/Data/BLASTp_taxon-directed/NdhA_Telong.fas
TaxID List        : /content/drive/MyDrive/Research/Gunner_Collaboration/Data/BLASTp_taxon-directed/ndh_project_taxids.txt
Summary CSV       : /content/drive/MyDrive/Research/Gunner_Collaboration/Outputs/BLASTp_taxon-directed/ndhF_top_hits_by_taxid.csv
Output FASTA      : /content/drive/MyDrive/Research/Gunner_Collaboration/Outputs/BLASTp_taxon-directed/ndhF_top_hits_by_taxid.faa


# Cell 3: Load TaxID table

In [42]:
# Cell 3: Load TaxID table (Header <tab> TaxID)

import csv

def load_taxids(taxid_file):
    """Read a tab-separated ``label<TAB>taxid`` table.

    Rows with fewer than two columns are ignored, as is a header row whose
    first two cells are ``Header`` and ``TaxID`` (case-insensitive). Rows
    whose second column is not made up of digits are reported on stdout and
    skipped. Surrounding whitespace is stripped from both columns.

    Args:
        taxid_file: Path of the text file to read.

    Returns:
        A ``(labels, taxids)`` tuple of two index-aligned lists of strings.

    Raises:
        OSError: If the file cannot be opened.
    """
    labels, taxids = [], []

    # "utf-8-sig" decodes plain UTF-8 and also drops a leading byte-order mark
    # (added by some editors), which would otherwise be glued onto the first
    # label. newline="" is what the csv module asks for when reading files.
    with open(taxid_file, "r", encoding="utf-8-sig", newline="") as fh:
        for row in csv.reader(fh, delimiter="\t"):
            if len(row) < 2:
                continue

            label, taxid = row[0].strip(), row[1].strip()

            # Skip header line
            if label.lower() == "header" and taxid.lower() == "taxid":
                continue

            if not taxid.isdigit():
                print("Skipping bad line:", row)
                continue

            labels.append(label)
            taxids.append(taxid)

    return labels, taxids

# Parse the TaxID table once; the two lists are index-aligned.
labels, taxids = load_taxids(TAXID_LIST)

print(f"Loaded {len(labels)} taxonomic labels/taxids.")
# Preview only the first ten (label, taxid) pairs.
print(list(zip(labels[:10], taxids[:10])))

Loaded 117 taxonomic labels/taxids.
[('A1.Candidatus', '3018267'), ('A10.Thermoplasmatales', '261391'), ('A11.Cuniculiplasma', '1673428'), ('A12.Candidatus', '2608793'), ('A13.Thermoplasma', '1973142'), ('A14.Candidatus', '2026803'), ('A15.Thermoplasma', '273116'), ('A16.Thermoplasma', '50339'), ('A17.Candidatus', '3107143'), ('A18.Thermoplasmatales', '667138')]


# Cell 4 — BLAST + Entrez helpers

In [43]:
# Cell 4: BLAST + Entrez helper functions

import time
import requests
from Bio.Blast import NCBIWWW, NCBIXML

def run_taxid_blastp(query_seq, taxid, hitlist_size=10, expect=1e-5):
    """Run a remote blastp search against ``nr`` restricted to one taxon.

    The search is limited with the Entrez query ``txid<taxid>[ORGN]``. Only
    the first alignment in the parsed report and its first HSP are used.
    NOTE(review): this presumably corresponds to the top-scoring hit —
    confirm the ordering guarantee against the BLAST/Biopython documentation.

    Args:
        query_seq: Protein query sequence passed to ``NCBIWWW.qblast``.
        taxid: NCBI taxonomy ID used to build the Entrez restriction.
        hitlist_size: Forwarded to ``qblast`` as ``hitlist_size``.
        expect: Forwarded to ``qblast`` as ``expect``.

    Returns:
        ``(alignment, stats)`` for the first hit, where ``stats`` is a dict
        with keys ``bitscore``, ``evalue``, ``identity``, ``align_length``,
        ``pident`` and ``title``; or ``(None, None)`` when the report has no
        alignments or the first alignment has no HSPs.
    """
    handle = NCBIWWW.qblast(
        program="blastp",
        database="nr",
        sequence=query_seq,
        entrez_query=f"txid{taxid}[ORGN]",
        hitlist_size=hitlist_size,
        expect=expect,
        format_type="XML",
    )
    # Close the handle even when the XML cannot be parsed.
    try:
        blast_record = NCBIXML.read(handle)
    finally:
        handle.close()

    if not blast_record.alignments:
        return None, None

    align = blast_record.alignments[0]
    if not align.hsps:
        # Nothing to compute statistics from; treat like "no hits".
        return None, None
    hsp = align.hsps[0]

    # Guard the percentage against a zero-length alignment.
    pident = 100 * hsp.identities / hsp.align_length if hsp.align_length else 0.0

    stats = {
        "bitscore": hsp.bits,
        "evalue": hsp.expect,
        "identity": hsp.identities,
        "align_length": hsp.align_length,
        "pident": pident,
        "title": align.title,
    }

    return align, stats


def fetch_full_protein_fasta(accession):
    """Fetch a protein record from NCBI Entrez as FASTA.

    The lookup is best-effort: any ordinary error is reported on stdout and
    turned into a ``None`` result so the caller can skip this accession.
    ``KeyboardInterrupt`` is deliberately not caught, so a long run can still
    be stopped by the user.

    Args:
        accession: Identifier passed as ``id`` to ``Entrez.efetch`` on the
            ``protein`` database.

    Returns:
        The first record parsed from the response, or ``None`` if the request
        failed or the response contained no FASTA record.
    """
    try:
        handle = Entrez.efetch(
            db="protein", id=accession,
            rettype="fasta", retmode="text"
        )
        # Close the handle even if parsing fails.
        try:
            # Only the first record is returned, so stop parsing after it.
            return next(SeqIO.parse(handle, "fasta"), None)
        finally:
            handle.close()
    except Exception as exc:
        print(f"  efetch failed for {accession}: {exc}")
    return None


def try_uniprot_mapping(accession):
    """Look up a UniProtKB accession for ``accession`` (best-effort).

    Sends ``accession`` as a free-text query to the UniProtKB search endpoint
    and reads the first data row of the TSV response.
    NOTE(review): a free-text query is not a strict cross-reference lookup,
    so the first row may not be the same protein — confirm before relying on
    this column.

    Args:
        accession: Text to search for (the caller passes an NCBI accession).

    Returns:
        The first column of the first result row, or ``""`` when the request
        fails, the server returns an error status, or there are no results.
    """
    try:
        response = requests.get(
            "https://rest.uniprot.org/uniprotkb/search",
            # Passing params lets requests URL-encode the accession.
            # size=1: only the first row is used, so ask for a single entry.
            params={
                "query": accession,
                "format": "tsv",
                "fields": "accession",
                "size": 1,
            },
            timeout=10,
        )
        # Without this, the body of an error response could be mistaken for
        # a result table.
        response.raise_for_status()
    except Exception as exc:
        print(f"  UniProt lookup failed for {accession}: {exc}")
        return ""

    # Line 0 is the TSV header; line 1 is the first hit, if any.
    lines = response.text.strip().splitlines()
    if len(lines) >= 2:
        return lines[1].split("\t")[0]
    return ""

# Cell 5: Main loop: BLAST each TaxID, fetch sequence

In [None]:
# Cell 5: Main loop (run BLAST, fetch sequences, save outputs)

from Bio.SeqRecord import SeqRecord

# Load the query; only the first record of the FASTA file is used.
with open(QUERY_FASTA) as _query_fh:
    query_record = next(SeqIO.parse(_query_fh, "fasta"), None)
if query_record is None:
    raise ValueError(f"No FASTA records found in {QUERY_FASTA}")
query_seq = str(query_record.seq)

summary_rows = []   # one dict per successful hit  -> summary CSV
seq_out = []        # one SeqRecord per successful hit -> output FASTA

# The loop makes one remote BLAST request per TaxID and can run for a long
# time. The try/finally guarantees that whatever was collected so far is
# written to disk even if the run is interrupted or fails part-way through.
try:
    for _idx, (label, taxid) in enumerate(zip(labels, taxids)):
        # NCBI courtesy wait between requests. Doing it at the top of the
        # iteration applies it after failures and "no hits" as well, not
        # only after a fully successful iteration.
        if _idx:
            time.sleep(2)

        print(f"TaxID {taxid}  ({label}) ... ", end="")

        try:
            align, stats = run_taxid_blastp(query_seq, taxid)
        except Exception as e:
            print("BLAST error:", e)
            continue

        if align is None:
            print("no hits")
            continue

        acc = align.accession
        print("hit:", acc)

        full = fetch_full_protein_fasta(acc)
        if full is None:
            print("  Failed to fetch sequence.")
            continue

        uniprot = try_uniprot_mapping(acc)

        # ----- FASTA header format Option A -----
        header = f"{label}|{acc}|taxid:{taxid}"
        seq_out.append(SeqRecord(full.seq, id=header, description=""))

        # Record for summary CSV
        summary_rows.append({
            "label": label,
            "taxid": taxid,
            "ncbi_accession": acc,
            "uniprot_accession": uniprot,
            **stats
        })
finally:
    # Write outputs (possibly partial). Create the destination folders first
    # so the writes do not fail on a missing directory.
    for _target in (MSA_FASTA, SUMMARY_CSV):
        os.makedirs(os.path.dirname(_target) or ".", exist_ok=True)

    if seq_out:
        SeqIO.write(seq_out, MSA_FASTA, "fasta")
        print("FASTA saved to:", MSA_FASTA)

    if summary_rows:
        df = pd.DataFrame(summary_rows)
        df.to_csv(SUMMARY_CSV, index=False)
        print("Summary CSV saved to:", SUMMARY_CSV)

TaxID 3018267  (A1.Candidatus) ... hit: HVA22291
TaxID 261391  (A10.Thermoplasmatales) ... hit: EQB67957
TaxID 1673428  (A11.Cuniculiplasma) ... hit: WMT45016
TaxID 2608793  (A12.Candidatus) ... hit: CAD6492513
TaxID 1973142  (A13.Thermoplasma) ... hit: WP_297025968
TaxID 2026803  (A14.Candidatus) ... hit: MAG63130
TaxID 273116  (A15.Thermoplasma) ... hit: WP_010917370
TaxID 50339  (A16.Thermoplasma) ... hit: WP_010917370
TaxID 3107143  (A17.Candidatus) ... hit: MEE8402932
TaxID 667138  (A18.Thermoplasmatales) ... hit: EQB65932
TaxID 2823368  (A19.Candidatus) ... hit: MBX8631143
TaxID 2032688  (A2.Candidatus) ... hit: HMB46486
TaxID 1904752  (A20.Halobacteriales) ... hit: MCA1819543
TaxID 3073602  (A21.Oxyplasma) ... hit: WP_393970911
TaxID 2026795  (A22.Nitrososphaerota) ... hit: MGE5335303
TaxID 1906667  (A23.Methanomassiliicoccales) ... hit: TET90366
TaxID 1945595  (A24.Methanosarcinaceae) ... hit: MDD2439728
TaxID 2026739  (A25.Methanobacteriota) ... hit: MBU4340117
TaxID 2026739  

In [40]:
!ls -l "/content/drive/MyDrive/Research/Gunner_Collaboration/Data/BLASTp_taxon-directed"

total 5
-rw------- 1 root root  482 Nov 21 02:55 NdhA_Telong.fas
-rw------- 1 root root 2732 Nov 21 15:31 ndh_project_taxids.txt
-rw------- 1 root root  651 Aug 26 21:42 query_protein.fasta
