In [None]:
# Ch10-2 - Aligning genetic data [Updated]

In [None]:
# Install packages
# Each line starting with `!` is passed by IPython to the system shell.
# These three commands use Homebrew (`brew`) to install the external
# command-line tools named trimal, mafft and muscle, which the later cells
# of this notebook invoke.
! brew install trimal
! brew install mafft
! brew install muscle

In [None]:
# 8.  Run MAFFT to align the genomes
#     Note- this takes about 30-60 minutes to run
#
# Reads sample.fasta, runs the external `mafft` program on it and, only if
# MAFFT exits successfully, writes the alignment it printed on stdout to
# align.fasta in the current working directory.
import subprocess
from Bio.Align.Applications import MafftCommandline
# NOTE(review): Biopython's command-line wrappers (Bio.Align.Applications) are
# deprecated in recent releases -- confirm the installed version still ships them.

# Define the MAFFT command; the keyword arguments are rendered as mafft options
# when the object is converted to a string below.
mafft_cline = MafftCommandline(input="sample.fasta", ep=0.123, reorder=True, maxiterate=1000, localpair=True)
# Print the command (for debugging purposes)
print("Running MAFFT with command:", mafft_cline)
# Run MAFFT using subprocess
process = subprocess.run(
    str(mafft_cline),  # Convert command to string
    shell=True,        # Run in shell environment
    capture_output=True,  # Capture stdout (the alignment) and stderr (progress/errors)
    text=True  # Ensure output is captured as text (string)
)
# Check for errors
if process.returncode != 0:
    # Nothing is written on failure, so a stale or missing align.fasta is not
    # mistaken for a fresh result.
    print("Error running MAFFT:", process.stderr)
else:
    # Save the aligned output to a file
    with open("align.fasta", "w") as w:
        w.write(process.stdout)
    # Report success only on this branch (previously printed even after a failure).
    print("Alignment completed and saved to align.fasta")

In [None]:
# Once the above is completed, you should see the file align.fasta in your working directory

In [None]:
# 9.  Use TrimAl to trim sequences
#
# Runs the external `trimal` program on align.fasta and asks it to write the
# trimmed alignment to trim.fasta. The exit status is checked so that a
# failure is reported in the notebook instead of being silently discarded.
import os
import subprocess

# Same arguments as before, passed as a list so no shell parsing is involved.
trimal_cmd = ["trimal", "-automated1", "-in", "align.fasta", "-out", "trim.fasta", "-fasta"]
print("Running TrimAl with command:", " ".join(trimal_cmd))

try:
    trimal_result = subprocess.run(trimal_cmd, capture_output=True, text=True)
except FileNotFoundError:
    # Raised by subprocess when the executable itself cannot be found.
    print("Error running TrimAl: 'trimal' executable not found on PATH")
else:
    if trimal_result.returncode != 0:
        print("Error running TrimAl:", trimal_result.stderr)
    else:
        print("Trimming completed and saved to trim.fasta")

In [None]:
# You should see the file trim.fasta as the output

In [None]:
# 10.  Run MUSCLE to align the proteins (This uses MUSCLE V5)
#
# For every gene name in my_genes, aligns <gene>_P.fasta with the external
# `muscle` program and asks it to write <gene>_P_align.fasta. Genes whose
# input file is missing are reported and skipped.
import subprocess
import os

my_genes = ['NP', 'L', 'VP35', 'VP40']
for gene in my_genes:
    input_file = f"{gene}_P.fasta"
    output_file = f"{gene}_P_align.fasta"

    # Guard clause: nothing to align when the protein FASTA is absent.
    if not os.path.exists(input_file):
        print(f"Error: Input file '{input_file}' not found.")
        continue

    # Command line in the MUSCLE v5+ syntax (-align / -output).
    muscle_cmd = f"muscle -align {input_file} -output {output_file}"
    print(f"Running MUSCLE with command: {muscle_cmd}")

    # Execute through the shell, capturing stdout/stderr as text.
    process = subprocess.run(
        muscle_cmd,
        shell=True,
        capture_output=True,
        text=True,
    )

    # Report the outcome for this gene.
    if process.returncode == 0:
        print(f"Alignment completed and saved to {output_file}")
    else:
        print("Error running MUSCLE:", process.stderr)

In [None]:
# You should see four files as the output: NP_P_align.fasta, L_P_align.fasta, VP35_P_align.fasta, VP40_P_align.fasta

In [None]:
# 11.  Align genes by back-translation
#
# For every gene in my_genes (defined in the MUSCLE cell), reads the unaligned
# nucleotide sequences from <gene>.fasta and the aligned proteins from
# <gene>_P_align.fasta, then writes <gene>_align.fasta in which each protein
# gap '-' becomes '---' and each residue becomes the next three nucleotides
# of the sequence with the same record id.
import os
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord


def _back_translate(aligned_protein, coding_seq):
    """Thread a nucleotide string through one gapped protein alignment row.

    Parameters:
        aligned_protein: str, one row of the protein alignment ('-' marks a gap).
        coding_seq: str, the unaligned nucleotide sequence for that record.

    Returns:
        (aligned, used): the gapped nucleotide string, and the number of
        nucleotides the row required (3 per non-gap character).
    """
    pieces = []
    pos = 0
    for residue in aligned_protein:
        if residue == '-':
            pieces.append('---')
        else:
            # Slicing past the end yields a short/empty string rather than an
            # error; the caller compares `used` with the input length.
            pieces.append(coding_seq[pos:pos + 3])
            pos += 3
    # Join once instead of growing a string character by character.
    return ''.join(pieces), pos


for gene in my_genes:
    gene_file = '%s.fasta' % gene
    protein_file = '%s_P_align.fasta' % gene

    # The MUSCLE cell only prints an error when it cannot produce an alignment,
    # so an input may legitimately be absent here; report and move on.
    missing = [path for path in (gene_file, protein_file) if not os.path.exists(path)]
    if missing:
        print(f"Error: Input file(s) not found for gene '{gene}': {', '.join(missing)}")
        continue

    # Map record id -> unaligned nucleotide sequence.
    gene_seqs = {rec.id: rec.seq for rec in SeqIO.parse(gene_file, 'fasta')}

    al_genes = []
    for protein in SeqIO.parse(protein_file, 'fasta'):
        my_id = protein.id
        if my_id not in gene_seqs:
            # Protein and nucleotide records are matched purely by id.
            raise KeyError(
                f"Sequence id '{my_id}' from {protein_file} not found in {gene_file}"
            )
        coding = str(gene_seqs[my_id])
        seq, used = _back_translate(str(protein.seq), coding)
        if used > len(coding):
            # The aligned row would come out shorter than the others.
            print(
                f"Warning: '{my_id}' in {gene_file} has {len(coding)} nucleotides "
                f"but the protein alignment needs {used}; output row is truncated."
            )
        # description='' keeps SeqRecord's default placeholder description
        # ("<unknown description>") out of the FASTA header line.
        al_genes.append(SeqRecord(Seq(seq), id=my_id, description=''))
    SeqIO.write(al_genes, '%s_align.fasta' % gene, 'fasta')

In [None]:
# You should see 4 output files:  NP_align.fasta, L_align.fasta, VP35_align.fasta, VP40_align.fasta

In [None]:
## End of Notebook ##