In [1]:
import os
from Bio import SeqIO

In [2]:
# Directories
input_dir = "./"                 # current directory
output_dir = "FASTAS_aligned"    # output directory

os.makedirs(output_dir, exist_ok=True)

# outfmt6 extension
outfmt_ext = ".outfmt6"

for file in os.listdir(input_dir):
    if file.endswith("_nr.fasta"):

        fasta_path = os.path.join(input_dir, file)
        base = file.replace(".fasta", "")
        outfmt_path = os.path.join(input_dir, base + outfmt_ext)

        if not os.path.exists(outfmt_path):
            print(f"⚠ outfmt6 file not found for {file}")
            continue

        print(f"Processing: {file}")

        # -----------------------------
        # Read aligned query IDs
        # -----------------------------
        aligned_ids = set()

        with open(outfmt_path) as f:
            for line in f:
                if line.strip():
                    qid = line.split("\t")[0]
                    aligned_ids.add(qid)

        if not aligned_ids:
            print(f"  No alignments found in {outfmt_path}")
            continue

        # -----------------------------
        # Filter FASTA
        # -----------------------------
        records = []
        for record in SeqIO.parse(fasta_path, "fasta"):
            if record.id in aligned_ids:
                records.append(record)

        # -----------------------------
        # Write output
        # -----------------------------
        out_fasta = os.path.join(
            output_dir,
            file.replace("_nr.fasta", "_aligned.fasta")
        )

        SeqIO.write(records, out_fasta, "fasta")

        print(f"  -> {len(records)} sequences saved to {out_fasta}")

print("\n✅ Finished.")

Processing: kpsT_merged_nr.fasta
  -> 16283 sequences saved to FASTAS_aligned/kpsT_merged_aligned.fasta
Processing: nanA_merged_nr.fasta
  -> 102021 sequences saved to FASTAS_aligned/nanA_merged_aligned.fasta
Processing: nanR_merged_nr.fasta
  -> 9482 sequences saved to FASTAS_aligned/nanR_merged_aligned.fasta
Processing: nanE_merged_nr.fasta
  -> 39141 sequences saved to FASTAS_aligned/nanE_merged_aligned.fasta
Processing: cpsK_merged_nr.fasta
  -> 33 sequences saved to FASTAS_aligned/cpsK_merged_aligned.fasta
Processing: neuA_merged_nr.fasta
  -> 53332 sequences saved to FASTAS_aligned/neuA_merged_aligned.fasta
Processing: nanC_merged_nr.fasta
  -> 1760 sequences saved to FASTAS_aligned/nanC_merged_aligned.fasta
Processing: nanT_merged_nr.fasta
  -> 12762 sequences saved to FASTAS_aligned/nanT_merged_aligned.fasta
Processing: siaQ_merged_nr.fasta
  -> 320875 sequences saved to FASTAS_aligned/siaQ_merged_aligned.fasta
Processing: nanM_merged_nr.fasta
  -> 5888 sequences saved to FASTA