In [6]:
import os
import subprocess
from pathlib import Path

from Bio import SeqIO, Seq

from local.constants import WORKSPACE_ROOT

In [7]:
# Identifier used to build every input/output file name in this notebook.
NAME = "ab48"

# Input assembly FASTA for this sample.
assembly = WORKSPACE_ROOT / "data" / "assembly" / f"{NAME}.fna"

# Directory that receives the ORF-prediction outputs; created on first run.
orf_prediction = WORKSPACE_ROOT / "data" / "orf_prediction"
orf_prediction.mkdir(parents=True, exist_ok=True)

In [8]:
# Run Prodigal on the assembly unless its protein FASTA already exists.
# Writes {NAME}.gff (passed to -o, with -f gff) and {NAME}.prodigal.faa
# (passed to -a) into `orf_prediction`.
prodigal_out = orf_prediction/f"{NAME}.prodigal.faa"
if prodigal_out.exists():
    print("ORF prediction already done")
else:
    # Write the proteins to a temporary name and rename only on success, so a
    # failed or interrupted run cannot leave a partial file that the
    # `exists()` guard above would mistake for a finished prediction.
    partial_out = prodigal_out.with_name(prodigal_out.name + ".partial")
    # An argument list (no shell) keeps paths with spaces or shell
    # metacharacters intact; check=True raises CalledProcessError on a
    # non-zero exit status instead of silently ignoring it.
    subprocess.run(
        [
            "prodigal",
            "-f", "gff",
            "-i", str(assembly),
            "-o", str(orf_prediction/f"{NAME}.gff"),
            "-a", str(partial_out),
        ],
        check=True,
    )
    partial_out.replace(prodigal_out)

-------------------------------------
PRODIGAL v2.6.3 [February, 2016]         
Univ of Tenn / Oak Ridge National Lab
Doug Hyatt, Loren Hauser, et al.     
-------------------------------------
Request:  Single Genome, Phase:  Training
Reading in the sequence(s) to train...4799092 bp seq created, 51.65 pct GC
Locating all potential starts and stops...228420 nodes
Looking for GC bias in different frames...frame bias scores: 2.12 0.16 0.72
Building initial set of genes to train from...done!
Creating coding model and scoring nodes...done!
Examining upstream regions and training starts...done!
-------------------------------------
Request:  Single Genome, Phase:  Gene Finding
Finding genes in sequence #1 (4747469 bp)...done!
Finding genes in sequence #2 (51599 bp)...done!


In [9]:
# Console log printed by Prodigal for the run in the cell above, pasted
# verbatim as a string. Nothing in the visible cells reads OUTPUT.
# NOTE(review): presumably kept as a record of the run for when the cached
# branch skips Prodigal and prints no log -- confirm before removing.
OUTPUT = """
-------------------------------------
PRODIGAL v2.6.3 [February, 2016]         
Univ of Tenn / Oak Ridge National Lab
Doug Hyatt, Loren Hauser, et al.     
-------------------------------------
Request:  Single Genome, Phase:  Training
Reading in the sequence(s) to train...4799092 bp seq created, 51.65 pct GC
Locating all potential starts and stops...228420 nodes
Looking for GC bias in different frames...frame bias scores: 2.12 0.16 0.72
Building initial set of genes to train from...done!
Creating coding model and scoring nodes...done!
Examining upstream regions and training starts...done!
-------------------------------------
Request:  Single Genome, Phase:  Gene Finding
Finding genes in sequence #1 (4747469 bp)...done!
Finding genes in sequence #2 (51599 bp)...done!
"""

In [10]:
def _cleaned_records(fasta_path):
    """Yield records from `fasta_path` with '*' removed and a bare-ID header.

    Every '*' character is stripped from the sequence, and the description is
    set to the record ID so the written FASTA header carries only the ID.
    """
    for record in SeqIO.parse(fasta_path, "fasta"):
        record.seq = Seq.Seq(str(record.seq).replace("*", ""))
        record.description = record.id
        yield record


# Write the cleaned proteins to {NAME}.faa next to the raw Prodigal output.
with open(orf_prediction/f"{NAME}.faa", "w") as handle:
    SeqIO.write(_cleaned_records(prodigal_out), handle, "fasta")