In [None]:
# Ch06-4 Protein Domain annotation

In [None]:
# Install hmmer
! brew install hmmer
# Download the PFAM database
! mkdir -p pfam
! wget ftp://ftp.ebi.ac.uk/pub/databases/Pfam/current_release/Pfam-A.hmm.gz
! gunzip Pfam-A.hmm.gz
! mv Pfam-A.hmm pfam/
! cd pfam
! hmmpress Pfam-A.hmm  # Index the HMM database
! cd ..

In [2]:
# Take only the first 10 entries from the E. coli protein fasta (.faa) file
from Bio import SeqIO

# Define input and output file paths
input_fasta = "output/ecoli_proteins.faa"  # Input protein FASTA file
output_fasta = "output/ecoli_proteins.top10.faa"  # Output file for the first 10 entries

# Function to select the first N entries from a FASTA file
def select_first_n_entries(input_file, output_file, n=10):
    """
    Selects the first N entries from a FASTA file and writes them to a new file.

    Parameters:
        input_file (str): Path to the input FASTA file.
        output_file (str): Path to the output FASTA file.
        n (int): Number of entries to select.

    Returns:
        None
    """
    # Read and write the first N entries
    with open(input_file, "r") as infile, open(output_file, "w") as outfile:
        records = SeqIO.parse(infile, "fasta")
        limited_records = (record for i, record in enumerate(records) if i < n)
        SeqIO.write(limited_records, outfile, "fasta")
    print(f"First {n} entries written to {output_file}")

# Call the function to select the first 10 entries
select_first_n_entries(input_fasta, output_fasta, n=10)


First 10 entries written to output/ecoli_proteins.top10.faa


In [4]:
import os
import subprocess
from Bio import SeqIO

# Define file paths
fasta_file = "output/ecoli_proteins.top10.faa"  # Input protein FASTA file (from Prodigal annotation)
pfam_hmm_db = "pfam/Pfam-A.hmm"          # Path to the Pfam-A HMM database
output_domtblout = "output/ecoli.pfam_domains.top10.out"  # HMMER domain table output file for E. coli

# Function to run hmmsearch
def run_hmmsearch(input_fasta, hmm_db, output_file):
    """
    Runs HMMER's hmmsearch to find Pfam domains in the input FASTA file.

    Parameters:
        input_fasta (str): Path to the input protein FASTA file.
        hmm_db (str): Path to the Pfam-A HMM database.
        output_file (str): Path to the output domain table file.

    Returns:
        None
    """
    command = [
        "hmmsearch",  # HMMER executable
        "--domtblout", output_file,  # Domain table output
        hmm_db,  # Pfam HMM database
        input_fasta  # Input protein FASTA file
    ]
    print(f"Running HMMER with command: {' '.join(command)}")
    subprocess.run(command, check=True)
    print(f"HMMER search completed. Results saved to {output_file}")

# Function to parse hmmsearch output
def parse_hmmsearch_output(domtbl_file):
    """
    Parses the hmmsearch domain table output to extract domain annotations.

    Parameters:
        domtbl_file (str): Path to the domain table output file.

    Returns:
        list: A list of domain annotations (query name, domain name, e-value).
    """
    annotations = []
    with open(domtbl_file, "r") as file:
        for line in file:
            if line.startswith("#") or not line.strip():
                continue
            fields = line.split()
            query_name = fields[0]  # Query sequence name
            domain_name = fields[3]  # Domain name (Pfam ID)
            e_value = float(fields[6])  # E-value of the match
            annotations.append((query_name, domain_name, e_value))
    return annotations

# Function to annotate FASTA file with Pfam domains
def annotate_fasta_with_pfam(fasta_file, annotations):
    """
    Annotates sequences in a FASTA file with Pfam domain information.

    Parameters:
        fasta_file (str): Path to the input FASTA file.
        annotations (list): List of domain annotations.

    Returns:
        None
    """
    annotated_file = "annotated_protein.fasta"
    seq_annotations = {query: [] for query, _, _ in annotations}
    for query, domain, e_value in annotations:
        seq_annotations[query].append(f"{domain} (E-value: {e_value:.2e})")
    
    with open(annotated_file, "w") as output:
        for record in SeqIO.parse(fasta_file, "fasta"):
            domains = seq_annotations.get(record.id, [])
            record.description += " | Domains: " + ", ".join(domains) if domains else ""
            SeqIO.write(record, output, "fasta")
    print(f"Annotated FASTA file saved to {annotated_file}")

# Main execution
if __name__ == "__main__":
    # Ensure Pfam database is indexed
    if not os.path.exists(f"{pfam_hmm_db}.h3f"):
        print("Indexing Pfam HMM database...")
        subprocess.run(["hmmpress", pfam_hmm_db], check=True)

    # Run hmmsearch
    run_hmmsearch(fasta_file, pfam_hmm_db, output_domtblout)

    # Parse the hmmsearch output
    pfam_annotations = parse_hmmsearch_output(output_domtblout)

    # Annotate the input FASTA file with Pfam domains
    annotate_fasta_with_pfam(fasta_file, pfam_annotations)


Running HMMER with command: hmmsearch --domtblout output/ecoli.pfam_domains.top10.out pfam/Pfam-A.hmm output/ecoli_proteins.top10.faa
# hmmsearch :: search profile(s) against a sequence database
# HMMER 3.4 (Aug 2023); http://hmmer.org/
# Copyright (C) 2023 Howard Hughes Medical Institute.
# Freely distributed under the BSD open source license.
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# query HMM file:                  pfam/Pfam-A.hmm
# target sequence database:        output/ecoli_proteins.top10.faa
# per-dom hits tabular output:     output/ecoli.pfam_domains.top10.out
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

Query:       1-cysPrx_C  [M=41]
Accession:   PF10417.14
Description: C-terminal domain of 1-Cys peroxiredoxin
Scores for complete sequences (score includes all domains):
   --- full sequence ---   --- best 1 domain ---    -#dom-
    E-value  score  bias    E-value  score  bias    exp  N  Sequence Description
    -

Passed Vit filter:                         0  (0); expected 0.0 (0.001)
Passed Fwd filter:                         0  (0); expected 0.0 (1e-05)
Initial search space (Z):                 10  [actual number of targets]
Domain search space  (domZ):               0  [number of targets reported over threshold]
# CPU time: 0.00u 0.00s 00:00:00.00 Elapsed: 00:00:00.00
# Mc/sec: 2169.81
//
Query:       Anamorsin_N  [M=163]
Accession:   PF20922.3
Description: Anamorsin, N-terminal
Scores for complete sequences (score includes all domains):
   --- full sequence ---   --- best 1 domain ---    -#dom-
    E-value  score  bias    E-value  score  bias    exp  N  Sequence Description
    ------- ------ -----    ------- ------ -----   ---- --  -------- -----------

   [No hits detected that satisfy reporting thresholds]


Domain annotation for each sequence (and alignments):

   [No targets detected that satisfy reporting thresholds]


Internal pipeline statistics summary:
-----------------------------

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



.0 (1e-05)
Initial search space (Z):                 10  [actual number of targets]
Domain search space  (domZ):               0  [number of targets reported over threshold]
# CPU time: 0.00u 0.00s 00:00:00.00 Elapsed: 00:00:00.00
# Mc/sec: 2029.09
//
Query:       NOMO_5th  [M=89]
Accession:   PF23194.1
Description: NOMO fifth transthyretin-like domain
Scores for complete sequences (score includes all domains):
   --- full sequence ---   --- best 1 domain ---    -#dom-
    E-value  score  bias    E-value  score  bias    exp  N  Sequence Description
    ------- ------ -----    ------- ------ -----   ---- --  -------- -----------

   [No hits detected that satisfy reporting thresholds]


Domain annotation for each sequence (and alignments):

   [No targets detected that satisfy reporting thresholds]


Internal pipeline statistics summary:
-------------------------------------
Query model(s):                            1  (89 nodes)
Target sequences:                         10  (3131 resi

Passed Fwd filter:                         0  (0); expected 0.0 (1e-05)
Initial search space (Z):                 10  [actual number of targets]
Domain search space  (domZ):               0  [number of targets reported over threshold]
# CPU time: 0.00u 0.00s 00:00:00.00 Elapsed: 00:00:00.00
# Mc/sec: 1829.68
//
Query:       PEP-utilizers_C  [M=293]
Accession:   PF02896.24
Description: PEP-utilising enzyme, PEP-binding domain
Scores for complete sequences (score includes all domains):
   --- full sequence ---   --- best 1 domain ---    -#dom-
    E-value  score  bias    E-value  score  bias    exp  N  Sequence Description
    ------- ------ -----    ------- ------ -----   ---- --  -------- -----------

   [No hits detected that satisfy reporting thresholds]


Domain annotation for each sequence (and alignments):

   [No targets detected that satisfy reporting thresholds]


Internal pipeline statistics summary:
-------------------------------------
Query model(s):                        

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [None]:
# Clean up by moving the hmmer annotated protein fasta output into our output directory
! mv annotated_protein.fasta output 