In [None]:
# Ch12-4 Gene Discovery

In [None]:
# Use the %pip magic (not `! pip`) so packages are installed into the
# environment of the running kernel rather than whatever `pip` is on PATH.
%pip install biopython pandas numpy scikit-learn networkx matplotlib goatools

In [None]:
# Create a directory for the data
! mkdir -p genomics_data

# Each `!` line runs in its own subshell, so a `! cd genomics_data` would not
# carry over to the following lines. Every command therefore spells out the
# genomics_data/ path. Each archive is saved under a .gz name so that gunzip
# can find it and leave the decompressed .fasta file behind.

# Download target genomes (pathogenic bacteria)
# E. coli O157:H7 (pathogenic strain)
! wget -O genomics_data/target_organism1.fasta.gz "https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/008/865/GCF_000008865.2_ASM886v2/GCF_000008865.2_ASM886v2_protein.faa.gz"
! gunzip -f genomics_data/target_organism1.fasta.gz

# Vibrio cholerae (cholera pathogen)
! wget -O genomics_data/target_organism2.fasta.gz "https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/006/745/GCF_000006745.1_ASM674v1/GCF_000006745.1_ASM674v1_protein.faa.gz"
! gunzip -f genomics_data/target_organism2.fasta.gz

# Download reference genomes (non-pathogenic relatives)
# E. coli K-12 (non-pathogenic lab strain)
! wget -O genomics_data/reference_organism1.fasta.gz "https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/005/845/GCF_000005845.2_ASM584v2/GCF_000005845.2_ASM584v2_protein.faa.gz"
! gunzip -f genomics_data/reference_organism1.fasta.gz

# Vibrio natriegens (non-pathogenic Vibrio)
! wget -O genomics_data/reference_organism2.fasta.gz "https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/001/456/255/GCF_001456255.1_ASM145625v1/GCF_001456255.1_ASM145625v1_protein.faa.gz"
! gunzip -f genomics_data/reference_organism2.fasta.gz

# Bacillus subtilis (different genus as outgroup)
! wget -O genomics_data/reference_organism3.fasta.gz "https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/009/045/GCF_000009045.1_ASM904v1/GCF_000009045.1_ASM904v1_protein.faa.gz"
! gunzip -f genomics_data/reference_organism3.fasta.gz

! echo "Downloaded and prepared example FASTA files for comparative genomics analysis"

In [9]:
#!/usr/bin/env python3
"""
Subtractive Comparative Genomics Pipeline

This script demonstrates a workflow for identifying unique genes in a target organism
by comparing against reference genomes. The pipeline includes:
1. Genome sequence retrieval
2. Gene prediction
3. Sequence similarity comparison
4. Filtering for unique genes
5. Functional annotation of candidate genes

Requirements:
- Biopython
- BLAST+ command line tools
- Prodigal (for prokaryotic gene prediction)
- HMMER and Pfam database (for functional annotation)
"""

import os
import subprocess
from Bio import SeqIO
from Bio.Blast.Applications import NcbiblastpCommandline
import pandas as pd
import argparse

In [12]:
def setup_directories():
    """Make sure every working folder of the pipeline exists.

    Folders are created relative to the current working directory; folders
    that already exist are left untouched.

    Returns:
        list[str]: The folder names in a fixed order: genomes, predictions,
        blast_results, unique_genes, annotations.
    """
    stage_folders = (
        'genomes',
        'predictions',
        'blast_results',
        'unique_genes',
        'annotations',
    )
    for folder in stage_folders:
        os.makedirs(folder, exist_ok=True)
    return list(stage_folders)

def download_genome(accession, output_dir):
    """Fetch a nucleotide FASTA record from NCBI, reusing a local copy if present.

    The record is first written to ``<output_file>.part`` and only renamed to
    its final name once the download has finished, so an interrupted download
    is never mistaken for a cached genome on the next run.

    Args:
        accession: Identifier passed to ``Entrez.efetch`` as ``id``
            (``db="nucleotide"``).
        output_dir: Directory that receives ``<accession>.fasta``. It is not
            created here and must already exist.

    Returns:
        str: ``f"{output_dir}/{accession}.fasta"``.

    Raises:
        Whatever ``Entrez.efetch`` or the file operations raise; the temporary
        ``.part`` file is removed before the exception propagates.
    """
    output_file = f"{output_dir}/{accession}.fasta"
    if os.path.exists(output_file):
        print(f"Genome {accession} already exists at {output_file}")
        return output_file

    print(f"Downloading {accession}...")
    # Imported here so the cached branch above works without Biopython's
    # Entrez module being touched.
    from Bio import Entrez
    # Keep an address the user has already configured; only fall back to the
    # placeholder when none is set.
    if not Entrez.email:
        Entrez.email = "your.email@example.com"

    partial_file = f"{output_file}.part"
    handle = Entrez.efetch(db="nucleotide", id=accession, rettype="fasta", retmode="text")
    try:
        with open(partial_file, 'w') as out_f:
            out_f.write(handle.read())
        # Atomic rename: output_file only ever appears fully written.
        os.replace(partial_file, output_file)
    except BaseException:
        if os.path.exists(partial_file):
            os.remove(partial_file)
        raise
    finally:
        # Always release the network handle, even when writing fails.
        handle.close()

    print(f"Downloaded {accession} to {output_file}")
    return output_file

def predict_genes(genome_file, output_dir, organism_type="prokaryote"):
    """Predict protein-coding genes for a genome, reusing earlier results if present.

    For ``organism_type == "prokaryote"`` Prodigal is run and writes the
    translated proteins to ``<output_dir>/<name>_proteins.faa`` and the gene
    coordinates to ``<output_dir>/<name>_genes.gff``, where ``<name>`` is the
    part of the genome file name before its first dot. For any other organism
    type only a hint is printed and no file is produced.

    Args:
        genome_file: Path to the genome FASTA file.
        output_dir: Directory for the prediction files (must already exist).
        organism_type: ``"prokaryote"`` to run Prodigal; any other value skips
            prediction.

    Returns:
        str: Path of the protein FASTA file. For non-prokaryotes this file is
        not created by this function.

    Raises:
        subprocess.CalledProcessError: If Prodigal exits with a non-zero status.
    """
    genome_name = os.path.basename(genome_file).split('.')[0]
    output_prefix = f"{output_dir}/{genome_name}"
    protein_file = f"{output_prefix}_proteins.faa"

    if os.path.exists(protein_file):
        print(f"Gene predictions already exist for {genome_name}")
        return protein_file

    print(f"Predicting genes for {genome_name}...")
    if organism_type == "prokaryote":
        # Argument list instead of a shell string: paths containing spaces or
        # shell metacharacters are passed to Prodigal verbatim.
        cmd = [
            "prodigal",
            "-i", genome_file,
            "-a", protein_file,
            "-o", f"{output_prefix}_genes.gff",
            "-f", "gff",
        ]
        try:
            subprocess.run(cmd, check=True)
        except BaseException:
            # A half-written protein file would be treated as a finished
            # prediction on the next call, so drop it before re-raising.
            if os.path.exists(protein_file):
                os.remove(protein_file)
            raise
        print(f"Gene prediction complete for {genome_name}")
    else:
        # For eukaryotes, more complex gene prediction tools would be used;
        # nothing is run here, so no completion message is printed.
        print("Eukaryotic gene prediction requires tools like Augustus or MAKER")

    return protein_file

def create_blast_database(protein_file):
    """Build a protein BLAST database from a FASTA file with ``makeblastdb``.

    The database is written next to the input and uses the FASTA path itself
    as its name.

    Args:
        protein_file: Path to the protein FASTA file.

    Returns:
        str: The database name, identical to ``protein_file``.

    Raises:
        subprocess.CalledProcessError: If ``makeblastdb`` exits with a
            non-zero status.
    """
    db_name = protein_file
    # Argument list (no shell) so the path is never re-parsed by a shell.
    cmd = ["makeblastdb", "-in", protein_file, "-dbtype", "prot", "-out", db_name]
    subprocess.run(cmd, check=True)
    return db_name

def run_blast_comparison(query_proteins, subject_db, output_file, evalue=1e-5):
    """Run ``blastp`` for a protein set against a database, reusing earlier results.

    ``blastp`` is invoked directly through ``subprocess`` (like the other
    external tools in this pipeline) instead of through Biopython's
    command-line wrapper. Results are written in tabular format with the
    columns: qseqid sseqid pident length mismatch gapopen qstart qend sstart
    send evalue bitscore.

    Args:
        query_proteins: Path to the query protein FASTA file.
        subject_db: Name of the BLAST protein database to search.
        output_file: Path of the tabular result file. If it already exists
            the search is skipped.
        evalue: E-value cutoff passed to ``blastp``.

    Returns:
        str: ``output_file``.

    Raises:
        subprocess.CalledProcessError: If ``blastp`` exits with a non-zero
            status.
    """
    if os.path.exists(output_file):
        print(f"BLAST results already exist at {output_file}")
        return output_file

    print(f"Running BLAST comparison: {query_proteins} vs {subject_db}")
    cmd = [
        "blastp",
        "-query", query_proteins,
        "-db", subject_db,
        "-out", output_file,
        "-outfmt", "6 qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore",
        "-evalue", str(evalue),
        # Only the first reported subject per query is kept.
        "-max_target_seqs", "1",
    ]
    try:
        subprocess.run(cmd, check=True)
    except BaseException:
        # Do not leave a truncated result behind; it would be picked up as a
        # finished comparison by the existence check above.
        if os.path.exists(output_file):
            os.remove(output_file)
        raise
    print(f"BLAST comparison complete. Results saved to {output_file}")
    return output_file

def identify_unique_genes(target_protein_file, blast_results_files, output_dir, identity_threshold=30, coverage_threshold=50):
    """Identify target proteins that have no significant hit in any reference.

    A BLAST line counts as a significant hit when its percent identity is at
    least ``identity_threshold`` and its coverage is at least
    ``coverage_threshold``. Coverage is the alignment length (column 4)
    divided by the query protein length, times 100; because the alignment
    length includes gap columns it can exceed 100.

    Args:
        target_protein_file: Protein FASTA file of the target organism.
        blast_results_files: Iterable of tab-separated BLAST result files whose
            first four columns are qseqid, sseqid, pident and length.
        output_dir: Directory for the output FASTA file (must already exist).
        identity_threshold: Minimum percent identity of a significant hit.
        coverage_threshold: Minimum coverage (percent) of a significant hit.

    Returns:
        tuple: ``(output_file, unique_proteins)`` where ``output_file`` is
        ``<output_dir>/<target_name>_unique_proteins.faa`` and
        ``unique_proteins`` is the set of protein IDs without significant hits.

    Raises:
        KeyError: If a BLAST line names a query that is not in
            ``target_protein_file``.
    """
    # Use the whole file stem minus a trailing "_proteins" as the target
    # name. Splitting on the first underscore would turn
    # "NC_000962_proteins.faa" into just "NC" and make different targets
    # overwrite each other's output.
    stem = os.path.splitext(os.path.basename(target_protein_file))[0]
    suffix = "_proteins"
    if stem.endswith(suffix) and len(stem) > len(suffix):
        target_name = stem[:-len(suffix)]
    else:
        target_name = stem

    # All proteins of the target organism, keyed by ID (keeps FASTA order).
    target_proteins = {
        record.id: record for record in SeqIO.parse(target_protein_file, "fasta")
    }

    # Proteins with at least one significant hit in any reference genome.
    proteins_with_hits = set()
    for blast_file in blast_results_files:
        with open(blast_file, 'r') as f:
            for line in f:
                line = line.strip()
                # Tolerate blank lines and '#' comment lines (e.g. outfmt 7).
                if not line or line.startswith('#'):
                    continue
                parts = line.split('\t')
                query_id = parts[0]
                identity = float(parts[2])
                alignment_length = float(parts[3])
                query_length = len(target_proteins[query_id].seq)
                coverage = (alignment_length / query_length) * 100

                if identity >= identity_threshold and coverage >= coverage_threshold:
                    proteins_with_hits.add(query_id)

    # Proteins without significant hits are the unique genes.
    unique_proteins = set(target_proteins.keys()) - proteins_with_hits

    # Write in the order of the input FASTA so repeated runs produce the same
    # file (iterating the set directly gives an arbitrary order).
    unique_records = [
        record for protein_id, record in target_proteins.items()
        if protein_id in unique_proteins
    ]
    output_file = f"{output_dir}/{target_name}_unique_proteins.faa"
    with open(output_file, 'w') as out_f:
        SeqIO.write(unique_records, out_f, "fasta")

    print(f"Identified {len(unique_proteins)} unique proteins in {target_name}")
    print(f"Unique proteins saved to {output_file}")

    return output_file, unique_proteins

def annotate_unique_genes(unique_proteins_file, output_dir):
    """Collect Pfam domain annotations for the unique proteins.

    This function does not run HMMER itself. It computes the path where the
    ``hmmscan --domtblout`` table is expected and, if that file already
    exists, reads the domain assignments from it. Otherwise it reports that
    annotation was skipped and prints the command that would create the table.

    Args:
        unique_proteins_file: FASTA file of unique proteins. A trailing
            ``_unique_proteins`` in its stem is dropped to form the output name.
        output_dir: Directory in which the annotation table is expected.

    Returns:
        tuple: ``(hmmer_output, annotations)`` where ``hmmer_output`` is
        ``<output_dir>/<name>_pfam_annotations.txt`` and ``annotations`` maps a
        protein ID to the list of domain names found for it (empty when no
        table exists).
    """
    # Drop only the known "_unique_proteins" suffix. Splitting on the first
    # underscore would collapse "NC_000962_unique_proteins.faa" to "NC".
    stem = os.path.splitext(os.path.basename(unique_proteins_file))[0]
    suffix = "_unique_proteins"
    if stem.endswith(suffix) and len(stem) > len(suffix):
        output_name = stem[:-len(suffix)]
    else:
        output_name = stem
    hmmer_output = f"{output_dir}/{output_name}_pfam_annotations.txt"

    # Command that produces hmmer_output. It is deliberately not executed
    # here (it needs HMMER and a local Pfam-A database).
    cmd = ["hmmscan", "--domtblout", hmmer_output, "pfam/Pfam-A.hmm", unique_proteins_file]

    annotations = {}
    if os.path.exists(hmmer_output):
        # Assumes the file is hmmscan --domtblout output: whitespace-separated,
        # '#' comment lines, column 1 = domain (target) name and column 4 =
        # protein (query) name -- confirm against the HMMER version in use.
        with open(hmmer_output, 'r') as table:
            for line in table:
                if not line.strip() or line.startswith('#'):
                    continue
                fields = line.split()
                if len(fields) < 4:
                    continue
                domain_name, protein_id = fields[0], fields[3]
                annotations.setdefault(protein_id, []).append(domain_name)
        print(f"Functional annotation complete. Results saved to {hmmer_output}")
    else:
        print(f"Functional annotation skipped: {hmmer_output} does not exist.")
        print(f"To create it, run: {' '.join(cmd)}")

    return hmmer_output, annotations



In [13]:
def main():
    """Run the full subtractive-genomics demonstration.

    Downloads one target and three reference genomes, predicts their proteins,
    BLASTs the target proteins against each reference, extracts the target
    proteins without significant hits, calls the annotation step and prints a
    summary of the results.
    """
    # Example usage with Mycobacterium tuberculosis as target and related species as references

    # Step 1: working folders (order is fixed by setup_directories)
    stage_dirs = setup_directories()
    genomes_dir = stage_dirs[0]
    predictions_dir = stage_dirs[1]
    blast_dir = stage_dirs[2]
    unique_dir = stage_dirs[3]
    annotations_dir = stage_dirs[4]

    # Step 2: organisms to compare
    target_accession = "NC_000962"  # M. tuberculosis H37Rv
    reference_accessions = [
        "NC_002945",  # M. bovis
        # NOTE(review): labelled "M. avium", but the recorded run shows a
        # 4.37 Mb / 65.64 % GC genome, almost identical to the M. bovis entry
        # above -- confirm that this accession is the intended organism.
        "NC_008769",  # M. avium
        "NC_002677"   # M. leprae
    ]

    # Step 3: genome download (skipped for files that are already present)
    target_genome = download_genome(target_accession, genomes_dir)
    reference_genomes = []
    for accession in reference_accessions:
        reference_genomes.append(download_genome(accession, genomes_dir))

    # Step 4: gene prediction
    target_proteins = predict_genes(target_genome, predictions_dir)
    reference_proteins = []
    for genome_path in reference_genomes:
        reference_proteins.append(predict_genes(genome_path, predictions_dir))

    # Step 5: one BLAST database per reference proteome
    reference_dbs = []
    for proteome in reference_proteins:
        reference_dbs.append(create_blast_database(proteome))

    # Step 6: target proteins vs. every reference database
    target_name = os.path.basename(target_genome).split('.')[0]
    blast_results = []
    for ref_genome, db in zip(reference_genomes, reference_dbs):
        ref_name = os.path.basename(ref_genome).split('.')[0]
        result_path = f"{blast_dir}/{target_name}_vs_{ref_name}.blast"
        blast_results.append(run_blast_comparison(target_proteins, db, result_path))

    # Step 7: proteins without a significant hit in any reference
    unique_genes_file, unique_gene_ids = identify_unique_genes(
        target_proteins, blast_results, unique_dir
    )

    # Step 8: functional annotation of the unique proteins
    annotation_file, annotations = annotate_unique_genes(unique_genes_file, annotations_dir)

    # Step 9: summary report
    total_predicted = sum(1 for _ in SeqIO.parse(target_proteins, 'fasta'))
    print("\nSubtractive Comparative Genomics Results:")
    print(f"Target organism: {target_accession}")
    print(f"Reference organisms: {', '.join(reference_accessions)}")
    print(f"Total predicted proteins in target: {total_predicted}")
    print(f"Number of unique proteins identified: {len(unique_gene_ids)}")
    print(f"Unique protein sequences saved to: {unique_genes_file}")
    print(f"Functional annotations saved to: {annotation_file}")

    # In a complete implementation, additional analyses could include:
    # - GO term enrichment
    # - Pathway analysis
    # - Structural prediction
    # - Phylogenetic analysis of unique genes

if __name__ == "__main__":
    main()

Downloading NC_000962...
Downloaded NC_000962 to genomes/NC_000962.fasta
Downloading NC_002945...
Downloaded NC_002945 to genomes/NC_002945.fasta
Downloading NC_008769...
Downloaded NC_008769 to genomes/NC_008769.fasta
Downloading NC_002677...
Downloaded NC_002677 to genomes/NC_002677.fasta
Predicting genes for NC_000962...


-------------------------------------
PRODIGAL v2.6.3 [February, 2016]         
Univ of Tenn / Oak Ridge National Lab
Doug Hyatt, Loren Hauser, et al.     
-------------------------------------
Request:  Single Genome, Phase:  Training
Reading in the sequence(s) to train...4411532 bp seq created, 65.61 pct GC
301350 nodes potential starts and stops...
Looking for GC bias in different frames...frame bias scores: 0.43 0.18 2.39
done!ing initial set of genes to train from...
done!ing coding model and scoring nodes...
done!ning upstream regions and training starts...
-------------------------------------
Request:  Single Genome, Phase:  Gene Finding
done!ng genes in sequence #1 (4411532 bp)...
-------------------------------------
PRODIGAL v2.6.3 [February, 2016]         
Univ of Tenn / Oak Ridge National Lab
Doug Hyatt, Loren Hauser, et al.     
-------------------------------------
Request:  Single Genome, Phase:  Training
Reading in the sequence(s) to train...

Gene prediction complete for NC_000962
Predicting genes for NC_002945...


4349904 bp seq created, 65.63 pct GC
297423 nodes potential starts and stops...
Looking for GC bias in different frames...frame bias scores: 0.43 0.18 2.39
done!ing initial set of genes to train from...
Creating coding model and scoring nodes...done!
done!ning upstream regions and training starts...
-------------------------------------
Request:  Single Genome, Phase:  Gene Finding
done!ng genes in sequence #1 (4349904 bp)...
-------------------------------------
PRODIGAL v2.6.3 [February, 2016]         
Univ of Tenn / Oak Ridge National Lab
Doug Hyatt, Loren Hauser, et al.     
-------------------------------------
Request:  Single Genome, Phase:  Training
Reading in the sequence(s) to train...

Gene prediction complete for NC_002945
Predicting genes for NC_008769...


4374522 bp seq created, 65.64 pct GC
299136 nodes potential starts and stops...
Looking for GC bias in different frames...frame bias scores: 0.44 0.18 2.38
done!ing initial set of genes to train from...
Creating coding model and scoring nodes...done!
done!ning upstream regions and training starts...
-------------------------------------
Request:  Single Genome, Phase:  Gene Finding
done!ng genes in sequence #1 (4374522 bp)...
-------------------------------------
PRODIGAL v2.6.3 [February, 2016]         
Univ of Tenn / Oak Ridge National Lab
Doug Hyatt, Loren Hauser, et al.     
-------------------------------------
Request:  Single Genome, Phase:  Training
Reading in the sequence(s) to train...

Gene prediction complete for NC_008769
Predicting genes for NC_002677...


3268203 bp seq created, 57.80 pct GC
199519 nodes potential starts and stops...
Looking for GC bias in different frames...frame bias scores: 0.83 0.27 1.90
done!ing initial set of genes to train from...
Creating coding model and scoring nodes...done!
done!ning upstream regions and training starts...
-------------------------------------
Request:  Single Genome, Phase:  Gene Finding
done!ng genes in sequence #1 (3268203 bp)...


Gene prediction complete for NC_002677


Building a new DB, current time: 03/23/2025 13:38:17
New DB name:   /Users/shanebrubaker/work/CookBook/Ch12/predictions/NC_002945_proteins.faa
New DB title:  predictions/NC_002945_proteins.faa
Sequence type: Protein
Keep MBits: T
Maximum file size: 3000000000B
Adding sequences from FASTA; added 4013 sequences in 0.042331 seconds.




Building a new DB, current time: 03/23/2025 13:38:17
New DB name:   /Users/shanebrubaker/work/CookBook/Ch12/predictions/NC_008769_proteins.faa
New DB title:  predictions/NC_008769_proteins.faa
Sequence type: Protein
Keep MBits: T
Maximum file size: 3000000000B
Adding sequences from FASTA; added 4027 sequences in 0.0401521 seconds.




Building a new DB, current time: 03/23/2025 13:38:18
New DB name:   /Users/shanebrubaker/work/CookBook/Ch12/predictions/NC_002677_proteins.faa
New DB title:  predictions/NC_002677_proteins.faa
Sequence type: Protein
Keep MBits: T
Maximum file size: 3000000000B
Adding sequences from FAS