In [None]:
# Ch05-3 Alignment 

In [None]:
# Create a directory to store the genome
! mkdir -p data/ecoli_genome
# Download the E. coli reference genome (GCF_000005845.2).
# Every `!` line runs in its own subshell, so a `! cd ...` does not carry over to
# the commands that follow. Explicit paths are used instead so the file ends up
# where main() looks for it: data/ecoli_genome/ecoli_reference.fasta
! wget -O data/ecoli_genome/ecoli_reference.fasta.gz "https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/005/845/GCF_000005845.2_ASM584v2/GCF_000005845.2_ASM584v2_genomic.fna.gz"
# Uncompress the file (-f replaces an existing copy so this cell can be re-run)
! gunzip -f data/ecoli_genome/ecoli_reference.fasta.gz

In [None]:
# Download the E. coli sequencing reads
# `! cd ..` has no lasting effect (each `!` line is a separate subshell), so the
# output directory is given relative to the notebook's working directory. This
# matches the data/ecoli_reads/SRR31783077_{1,2}.fastq paths used by main().
! mkdir -p data/ecoli_reads
! fasterq-dump --split-files --outdir data/ecoli_reads SRR31783077

In [None]:
# Import Libraries
import subprocess
import os

In [None]:
def index_reference_genome(reference_fasta):
    """
    Build the BWA index for a reference FASTA file.

    Runs ``bwa index <reference_fasta>`` and prints a progress message
    before and after the command.

    Args:
        reference_fasta: Path to the reference genome, passed unchanged to
            ``bwa index``.

    Raises:
        subprocess.CalledProcessError: If ``bwa index`` exits with a
            non-zero status.
    """
    print("Indexing the reference genome with BWA...")
    # check=True turns a non-zero exit status into CalledProcessError.
    subprocess.run(["bwa", "index", reference_fasta], check=True)
    print("Reference genome indexing complete.\n")

In [None]:
def align_fastq_to_reference(reference_fasta, fastq_file1, fastq_file2, output_sam, threads=4):
    """
    Align a pair of FASTQ files against a reference with ``bwa mem``.

    The command's standard output is redirected into ``output_sam``, which
    is opened for writing (truncating any existing file) before the command
    starts.

    Args:
        reference_fasta: Path to the reference genome given to ``bwa mem``.
        fastq_file1: First FASTQ file of the pair.
        fastq_file2: Second FASTQ file of the pair.
        output_sam: Path of the SAM file to write.
        threads: Value passed to ``bwa mem -t`` (default 4).

    Raises:
        subprocess.CalledProcessError: If ``bwa mem`` exits with a non-zero
            status.
    """
    print("Performing alignment with BWA-MEM...")

    # Options first, then the positional arguments: reference, read 1, read 2.
    bwa_mem = ["bwa", "mem", "-t", str(threads)]
    bwa_mem += [reference_fasta, fastq_file1, fastq_file2]

    # bwa mem writes its alignments to stdout; send them straight to the file.
    with open(output_sam, "w") as sam_handle:
        subprocess.run(bwa_mem, stdout=sam_handle, check=True)

    print(f"Alignment complete. SAM file saved to: {output_sam}\n")

In [None]:
def convert_sam_to_sorted_bam(sam_file, bam_file, threads=4):
    """
    Sort a SAM file into a BAM file and index the result with Samtools.

    Runs ``samtools sort`` followed by ``samtools index``; the second
    command is only reached if the first one succeeds.

    Args:
        sam_file: Path of the input SAM file.
        bam_file: Path of the sorted BAM file to write (``samtools sort -o``).
        threads: Value passed to ``samtools sort -@`` (default 4).

    Raises:
        subprocess.CalledProcessError: If either Samtools command exits with
            a non-zero status.
    """
    print("Converting SAM to sorted BAM using Samtools...")

    samtools_steps = (
        ["samtools", "sort", "-@", str(threads), "-o", bam_file, sam_file],
        ["samtools", "index", bam_file],
    )
    # Run in order; check=True aborts the sequence on the first failure.
    for step in samtools_steps:
        subprocess.run(step, check=True)

    print(f"Sorted BAM file saved to: {bam_file}\n")

In [None]:
def main():
    """
    Run the alignment workflow end to end.

    Steps, in order: index the reference genome, align the paired-end reads
    to it, then convert the resulting SAM file to a sorted BAM. All input
    and output paths are fixed inside this function.

    Errors raised by any step are reported with ``print`` and are not
    propagated to the caller.
    """
    n_threads = 4

    # Inputs: reference genome and the two paired-end FASTQ files.
    genome = "data/ecoli_genome/ecoli_reference.fasta"
    reads_1 = "data/ecoli_reads/SRR31783077_1.fastq"
    reads_2 = "data/ecoli_reads/SRR31783077_2.fastq"

    # Outputs: the raw alignment (SAM) and its sorted BAM.
    sam_path = "data/output/aligned_reads.sam"
    bam_path = "data/output/aligned_reads_sorted.bam"

    # Both outputs share one directory; make sure it exists before running.
    os.makedirs(os.path.dirname(sam_path), exist_ok=True)

    try:
        # Step 1: index the reference genome
        index_reference_genome(genome)
        # Step 2: align the reads to the reference genome
        align_fastq_to_reference(genome, reads_1, reads_2, sam_path, threads=n_threads)
        # Step 3: convert SAM to sorted BAM
        convert_sam_to_sorted_bam(sam_path, bam_path, threads=n_threads)
    except subprocess.CalledProcessError as err:
        # One of the external tools exited with a non-zero status.
        print(f"Error occurred during execution: {err}")
    except Exception as err:
        print(f"Unexpected error: {err}")


if __name__ == "__main__":
    main()
