In [None]:
# Ch06-4 Protein Domain Annotation

In [None]:
# Install HMMER (this uses Homebrew; substitute your platform's package manager if needed)
! brew install hmmer
# Download the Pfam-A HMM database
! mkdir -p pfam
! wget ftp://ftp.ebi.ac.uk/pub/databases/Pfam/current_release/Pfam-A.hmm.gz
! gunzip Pfam-A.hmm.gz
! mv Pfam-A.hmm pfam/
# Index the HMM database.
# Each `!` line runs in its own subshell, so a `! cd pfam` would not carry over
# to the following line; the database path is therefore passed explicitly.
# -f overwrites index files left behind by a previous run of this cell.
! hmmpress -f pfam/Pfam-A.hmm

In [None]:
# Code to make a smaller file with only 10 entries from the E. coli protein fasta (.faa) file
from Bio import SeqIO
# File locations for the subsetting step
# Source: the E. coli protein FASTA file
input_fasta = "output/ecoli_proteins.faa"
# Destination: receives the first 10 entries of the source file
output_fasta = "output/ecoli_proteins.top10.faa"
# Function to select the first N entries from a FASTA file (Default is 10)
def select_first_n_entries(input_file, output_file, n=10):
    """
    Copies the first N entries of a FASTA file into a new FASTA file.

    Reading stops as soon as N records have been taken, so the remainder of
    a large input file is never parsed. If the input holds fewer than N
    records, all of them are written.

    Parameters:
        input_file (str): Path to the input FASTA file.
        output_file (str): Path to the output FASTA file (opened in "w" mode,
            so an existing file is overwritten).
        n (int): Maximum number of entries to copy. Values of 0 or less
            select no entries.
    Returns:
        None
    """
    from itertools import islice  # local import keeps this definition self-contained

    with open(input_file, "r") as infile, open(output_file, "w") as outfile:
        records = SeqIO.parse(infile, "fasta")
        # islice stops pulling from the parser after n records; max() guards
        # against a negative n, which islice rejects with ValueError.
        limited_records = islice(records, max(n, 0))
        SeqIO.write(limited_records, outfile, "fasta")
    print(f"First {n} entries written to {output_file}")

# Write the 10-entry subset of the E. coli faa file
select_first_n_entries(input_file=input_fasta, output_file=output_fasta, n=10)

In [None]:
# Import Libraries
import os
import subprocess
from Bio import SeqIO

In [None]:
# Paths for the HMMER search and annotation steps
# Input protein FASTA file (from Prodigal annotation)
fasta_file = "output/ecoli_proteins.top10.faa"
# Pfam-A HMM database
pfam_hmm_db = "pfam/Pfam-A.hmm"
# HMMER domain table output file for E. coli
output_domtblout = "output/ecoli.pfam_domains.top10.out"

In [None]:
# Function to run hmmsearch
def run_hmmsearch(input_fasta, hmm_db, output_file):
    """
    Searches a protein FASTA file against a Pfam HMM database with hmmsearch.

    The command line is printed before it runs. Because the call uses
    check=True, a non-zero exit status raises subprocess.CalledProcessError.

    Parameters:
        input_fasta (str): Path to the input protein FASTA file.
        hmm_db (str): Path to the Pfam-A HMM database.
        output_file (str): Path passed to --domtblout for the domain table.
    Returns:
        None
    """
    # Order: executable, domain-table option and its path, HMM database, FASTA file
    command = ["hmmsearch", "--domtblout", output_file, hmm_db, input_fasta]
    command_text = " ".join(command)
    print(f"Running HMMER with command: {command_text}")
    subprocess.run(command, check=True)
    print(f"HMMER search completed. Results saved to {output_file}")

In [None]:
# Function to parse hmmsearch output
def parse_hmmsearch_output(domtbl_file):
    """
    Reads an hmmsearch domain table file into a list of annotation tuples.

    Lines that start with "#" and blank lines are ignored. Every remaining
    line is split on whitespace and three of its columns are kept.

    Parameters:
        domtbl_file (str): Path to the domain table output file.

    Returns:
        list: One (sequence name, domain name, e-value) tuple per data line,
        in file order, with the e-value converted to float.
    """
    with open(domtbl_file, "r") as handle:
        rows = [
            raw_line.split()
            for raw_line in handle
            if raw_line.strip() and not raw_line.startswith("#")
        ]
    # Columns 1, 4 and 7 (1-based) supply the sequence name, the domain name
    # and the E-value.
    # NOTE(review): in HMMER's domtblout layout column 7 appears to be the
    # full-sequence E-value rather than the per-domain one; confirm that this
    # is the intended value.
    return [(cols[0], cols[3], float(cols[6])) for cols in rows]

In [None]:
# Function to annotate FASTA file with Pfam domains
def annotate_fasta_with_pfam(fasta_file, annotations, annotated_file="annotated_protein.fasta"):
    """
    Writes a copy of a FASTA file whose headers list each sequence's Pfam domains.

    For every record whose id has at least one annotation, the text
    " | Domains: <domain> (E-value: <e-value>), ..." is appended to the
    record description. Records without annotations are written unchanged.

    Parameters:
        fasta_file (str): Path to the input FASTA file.
        annotations (list): (sequence id, domain name, e-value) tuples.
        annotated_file (str): Path of the annotated FASTA file to write.
            Defaults to "annotated_protein.fasta" in the working directory.

    Returns:
        None
    """
    # Group the formatted domain labels by sequence id, keeping input order
    seq_annotations = {}
    for query, domain, e_value in annotations:
        seq_annotations.setdefault(query, []).append(f"{domain} (E-value: {e_value:.2e})")

    with open(annotated_file, "w") as output:
        for record in SeqIO.parse(fasta_file, "fasta"):
            domains = seq_annotations.get(record.id)
            # Only touch the header when there is something to report
            if domains:
                record.description += " | Domains: " + ", ".join(domains)
            SeqIO.write(record, output, "fasta")
    print(f"Annotated FASTA file saved to {annotated_file}")

In [None]:
# Main execution
if __name__ == "__main__":
    # Step 1: run hmmpress when the database's ".h3f" file is not on disk
    if not os.path.exists(f"{pfam_hmm_db}.h3f"):
        print("Indexing Pfam HMM database...")
        subprocess.run(["hmmpress", pfam_hmm_db], check=True)

    # Step 2: search the protein FASTA file against the Pfam database
    run_hmmsearch(input_fasta=fasta_file, hmm_db=pfam_hmm_db, output_file=output_domtblout)

    # Step 3: load the domain hits from the table written by the search
    pfam_annotations = parse_hmmsearch_output(domtbl_file=output_domtblout)

    # Step 4: write the FASTA copy annotated with the Pfam domains
    annotate_fasta_with_pfam(fasta_file=fasta_file, annotations=pfam_annotations)

In [None]:
# Clean up by moving the HMMER-annotated protein FASTA file into our output directory.
# The trailing slash makes mv fail if "output" is not a directory, instead of
# silently renaming the file to "output".
! mv annotated_protein.fasta output/

In [None]:
## End of Notebook ##