In [None]:
# Ch06-2 - Genome Annotation

In [None]:
# Example wget command for downloading the E. coli Genbank file

In [None]:
! wget ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/005/845/GCF_000005845.2_ASM584v2/GCF_000005845.2_ASM584v2_genomic.gbff.gz

In [None]:
# Install Prodigal to annotate the E. coli genome #

In [None]:
# Installs Prodigal through Homebrew (`brew` must be available on PATH).
! brew install prodigal

In [None]:
# Alternative prodigal install options:
# Same Homebrew install, launched through `arch -arm64`
# (presumably intended for Apple Silicon Macs - confirm before relying on it).
! arch -arm64 brew install prodigal 

In [None]:
# Or alternatively: 
! conda install bioconda::prodigal 

In [None]:
# Move over the E. coli reference genome file from Chapter 05

In [None]:
! cp ../Ch05/ecoli_genome/ecoli_reference.fasta input/ 

In [None]:
# Or alternatively, download it again

In [None]:
! wget https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/005/845/GCF_000005845.2_ASM584v2/GCF_000005845.2_ASM584v2_genomic.fna.gz 
! gunzip GCF_000005845.2_ASM584v2_genomic.fna.gz 

In [None]:
# 1. Run Prodigal on the E. coli genome # 

In [None]:
# Import libraries
import subprocess
import os

In [None]:
# Function to run Prodigal
def run_prodigal(input_fasta, output_gbk, output_proteins, output_cds):
    """
    Run Prodigal on a given FASTA file to predict genes.

    Parameters:
        input_fasta (str): Path to the input FASTA file.
        output_gbk (str): Path to the output GenBank file for gene predictions.
        output_proteins (str): Path to the output FASTA file for predicted proteins.
        output_cds (str): Path to the output FASTA file for predicted CDS.

    Raises:
        FileNotFoundError: If ``input_fasta`` does not exist.

    Notes:
        Problems with Prodigal itself (a non-zero exit status, or the
        executable not being found) are reported with ``print`` and are not
        raised, so the notebook keeps running.
    """
    # Ensure input file exists
    if not os.path.exists(input_fasta):
        raise FileNotFoundError(f"Input FASTA file not found: {input_fasta}")
    # Create the folders that will hold the results, so the run does not fail
    # merely because e.g. output/ has not been created yet.
    for output_path in (output_gbk, output_proteins, output_cds):
        output_dir = os.path.dirname(output_path)
        if output_dir:  # a bare file name has no directory part to create
            os.makedirs(output_dir, exist_ok=True)
    # Construct Prodigal command (a list, so no shell quoting is involved)
    command = [
        "prodigal",
        "-i", input_fasta,       # Input FASTA file
        "-o", output_gbk,        # Output GenBank file
        "-a", output_proteins,   # Output proteins FASTA file
        "-d", output_cds,        # Output CDS FASTA file
        "-p", "single"           # Mode (meta for metagenomes, single for single genome)
    ]
    # Run the Prodigal command
    try:
        print("Running Prodigal...")
        # check=True turns a non-zero exit status into CalledProcessError
        subprocess.run(command, check=True)
        print("Prodigal run completed.")
    except subprocess.CalledProcessError as e:
        print(f"Error running Prodigal: {e}")
    except FileNotFoundError:
        # Raised by subprocess when the "prodigal" executable cannot be found
        print("Prodigal is not installed or not in your PATH.")

In [None]:
# Main function to execute Prodigal on our files
if __name__ == "__main__":
    # Genome to annotate
    input_fasta = "input/ecoli_reference.fasta"
    # Prodigal result files: gene predictions, predicted proteins, predicted CDS
    output_gbk, output_proteins, output_cds = (
        "output/ecoli_genes.gbk",
        "output/ecoli_proteins.faa",
        "output/ecoli_cds.fna",
    )
    # Hand every path to Prodigal by keyword
    run_prodigal(
        input_fasta=input_fasta,
        output_gbk=output_gbk,
        output_proteins=output_proteins,
        output_cds=output_cds,
    )

In [None]:
# 2. Combine the Prodigal output with the genome sequence to create a GenBank file

In [None]:
# Import libraries
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.SeqFeature import SeqFeature, FeatureLocation

In [None]:
# Function to parse the Prodigal header
def parse_prodigal_header(header):
    """Parse Prodigal's FASTA header to extract gene information.

    The header may be given with or without the leading '>' (a raw FASTA
    line has it; a Biopython ``record.description`` does not).

    Parameters:
        header (str): Header of the form ``id # start # end # strand # ...``.

    Returns:
        dict: ``seqid`` (str), ``start`` (int), ``end`` (int) as written in
        the header, and ``strand`` (1 when the strand field is '1', else -1).

    Raises:
        ValueError: If the header has fewer than four ' # '-separated fields
            or the start/end fields are not integers.
    """
    # Example header: >NODE_1_1 # 1 # 951 # 1 # ID=1_1;partial=00;start_type=ATG
    parts = header.split(' # ')
    if len(parts) < 4:
        raise ValueError(
            f"Not a Prodigal header (expected 'id # start # end # strand ...'): {header!r}"
        )
    seqid = parts[0].strip()
    # Only drop a '>' that is really there; slicing unconditionally would eat
    # the first character of the id when the marker is already removed.
    if seqid.startswith('>'):
        seqid = seqid[1:]
    start = int(parts[1])
    end = int(parts[2])
    strand = 1 if parts[3].strip() == '1' else -1
    return {
        'seqid': seqid,
        'start': start,
        'end': end,
        'strand': strand
    }

In [None]:
# Function to write out the updated Genbank file
def _translate_cds(cds_seq):
    """Translate one Prodigal CDS with the bacterial code (table 11), without a stop symbol."""
    try:
        # cds=True treats the sequence as a complete coding sequence: the
        # start codon (including alternative starts) is read as M and the
        # terminal stop codon is left out of the protein.
        return str(cds_seq.translate(table=11, cds=True))
    except ValueError:
        # Biopython signals "not a complete CDS" with TranslationError, a
        # ValueError subclass (e.g. genes running off the sequence edge).
        # Translate as-is and drop only a trailing stop symbol.
        return str(cds_seq.translate(table=11)).rstrip("*")


def create_genbank(genome_fasta, prodigal_fna, output_gb):
    """Create GenBank file from genome FASTA and Prodigal predictions.

    Parameters:
        genome_fasta (str): Genome FASTA file; only its first record is used.
        prodigal_fna (str): Prodigal CDS FASTA file (nucleotide sequences with
            Prodigal-style headers).
        output_gb (str): Path of the GenBank file to write.

    Raises:
        ValueError: If ``genome_fasta`` contains no sequence records.

    Prints the number of CDS features written.
    """
    # Read genome sequence (first record only); fail with a clear message
    # instead of a bare StopIteration when the file holds no records.
    genome_record = next(SeqIO.parse(genome_fasta, "fasta"), None)
    if genome_record is None:
        raise ValueError(f"No sequences found in genome FASTA: {genome_fasta}")
    # Create new SeqRecord for GenBank
    gb_record = SeqRecord(
        seq=genome_record.seq,
        id=genome_record.id,
        name=genome_record.id,
        description="Generated from Prodigal predictions"
    )
    # Add required GenBank annotations
    gb_record.annotations["molecule_type"] = "DNA"
    # NOTE(review): topology is hard-coded to "linear"; confirm this is the
    # intended value for the genome being annotated.
    gb_record.annotations["topology"] = "linear"
    gb_record.annotations["data_file_division"] = "BCT"
    gb_record.annotations["source"] = "Escherichia coli"
    gb_record.annotations["organism"] = "Escherichia coli"
    gb_record.annotations["taxonomy"] = ['Bacteria', 'Proteobacteria', 'Gammaproteobacteria',
                                       'Enterobacterales', 'Enterobacteriaceae', 'Escherichia']
    # Add features from Prodigal predictions
    feature_count = 0
    for record in SeqIO.parse(prodigal_fna, "fasta"):
        # Parse Prodigal header (coordinates and strand of this CDS)
        gene_info = parse_prodigal_header(record.description)
        # Create feature
        feature = SeqFeature(
            location=FeatureLocation(
                gene_info['start'] - 1,  # Convert to 0-based indexing
                gene_info['end'],
                strand=gene_info['strand']
            ),
            type="CDS",
            qualifiers={
                "locus_tag": f"CDS_{feature_count+1}",
                "transl_table": [11],  # genetic code used for the translation below
                "translation": _translate_cds(record.seq),
                "product": "hypothetical protein",
                "note": ["Predicted by Prodigal"]
            }
        )
        # Add feature to record
        gb_record.features.append(feature)
        feature_count += 1
    # Write GenBank file
    SeqIO.write(gb_record, output_gb, "genbank")
    print(f"Created GenBank file with {feature_count} features")

In [None]:
# Main function to combine Prodigal output with Genome sequence
def main():
    """Merge the E. coli genome FASTA and Prodigal's CDS predictions into one GenBank file."""
    genome_path = "input/ecoli_reference.fasta"          # E. coli genome sequence
    cds_path = "output/ecoli_cds.fna"                    # CDS file written by Prodigal
    combined_path = "output/ecoli_prodigal_combined.gb"  # combined GenBank output
    create_genbank(genome_fasta=genome_path, prodigal_fna=cds_path, output_gb=combined_path)


if __name__ == "__main__":
    main()

In [None]:
## 3. (Optional Exercise) Parse a Genbank file to Extract Annotations ##

In [None]:
from Bio import SeqIO
def annotate_ecoli(genbank_file, output_file):
    """
    Extract gene and CDS information from a GenBank file into a tab-separated table.

    Parameters:
        genbank_file (str): Path to the GenBank file.
        output_file (str): Path to save the annotation output.

    Output columns: Type, ID, Start, End, Strand, Details.
        * "Gene" rows use the ``gene`` qualifier as ID and list the locus tag.
        * "CDS" rows use ``protein_id`` as ID, falling back to ``locus_tag``
          when the file has no protein ids (as in a file built from Prodigal
          predictions), so rows stay identifiable.
        * Start/End are the feature location's start and end as integers;
          Start is 0-based, so it is one less than the coordinate printed in
          the GenBank file.
        * Strand is "+", "-", or "." when the location carries no single strand.
    """
    strand_symbols = {1: "+", -1: "-"}

    def first_value(qualifiers, key, default="unknown"):
        # Qualifier values are lists; tolerate a missing key or an empty list.
        values = qualifiers.get(key)
        return values[0] if values else default

    rows = []

    print(f"Reading GenBank file: {genbank_file}")
    for record in SeqIO.parse(genbank_file, "genbank"):
        print(f"Processing record: {record.id}")

        for feature in record.features:
            if feature.type not in ("gene", "CDS"):
                continue  # only genes and coding sequences are reported
            qualifiers = feature.qualifiers
            start = int(feature.location.start)
            end = int(feature.location.end)
            strand = strand_symbols.get(feature.location.strand, ".")
            locus_tag = first_value(qualifiers, "locus_tag")
            if feature.type == "gene":
                label = "Gene"
                feature_id = first_value(qualifiers, "gene")
                details = f"Locus: {locus_tag}"
            else:
                label = "CDS"
                feature_id = first_value(qualifiers, "protein_id", default=locus_tag)
                gene_name = first_value(qualifiers, "gene")
                product = first_value(qualifiers, "product")
                details = f"Gene: {gene_name}, Product: {product}"
            rows.append((label, feature_id, start, end, strand, details))

    # Write annotations to output file
    print(f"Writing annotations to: {output_file}")
    with open(output_file, "w") as out:
        out.write("Type\tID\tStart\tEnd\tStrand\tDetails\n")
        for row in rows:
            out.write("\t".join(str(field) for field in row) + "\n")

    print("Annotation completed!")

# Main Usage
# To annotate the public GenBank file instead, point genbank_file at:
#   "ecoli_genome/GCF_000005845.2_ASM584v2_genomic.gbff"
genbank_file = "output/ecoli_prodigal_combined.gb"  # GenBank file built from the Prodigal predictions
output_file = "output/ecoli_annotations.txt"        # where the annotation table is written
annotate_ecoli(genbank_file=genbank_file, output_file=output_file)

In [None]:
# 4. Genome Browsers

In [None]:
# Install IGV
# Installs IGV through Homebrew; the following cell lists alternative install commands.
! brew install igv

In [None]:
# Alternative IGV installation commands
# brew install --cask igv
# OR
# brew install homebrew/cask/igv 
# OR
#arch -arm64 brew install --cask igv 
# OR use conda:
#conda install bioconda::igv  
# Note: Using conda may require a Java update! After a Homebrew install, run `brew link igv` (next cell).

In [None]:
# Create IGV symlinks
# NOTE(review): only relevant when IGV was installed with Homebrew;
# presumably not needed after a conda install - confirm.
! brew link igv

In [None]:
# Load the Prodigal-annotated genome in IGV
# Passes the combined GenBank file to the `igv` command.
# NOTE(review): presumably this keeps the cell busy until the IGV window is closed - confirm.
! igv output/ecoli_prodigal_combined.gb

In [None]:
## End of Notebook ##