In [None]:
# Ch05-3 Alignment 

In [None]:
# Create a directory to store the genome
! mkdir -p data/ecoli_genome
! cd data/ecoli_genome

# Download the E. coli reference genome (GCF_000005845.2)
!wget -O ecoli_reference.fasta.gz "https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/005/845/GCF_000005845.2_ASM584v2/GCF_000005845.2_ASM584v2_genomic.fna.gz"

# Uncompress the file
! gunzip ecoli_reference.fasta.gz


In [None]:
# Download the E. coli sequencing reads
! cd ..
! fasterq-dump --split-files --outdir ./ecoli_reads SRR31783077
! cd ..

In [2]:
# Import Libraries
import subprocess
import os

In [3]:
def index_reference_genome(reference_fasta):
    """
    Build the BWA index for a reference FASTA file.

    Runs ``bwa index <reference_fasta>`` as a subprocess and prints a progress
    message before and after.

    Args:
        reference_fasta: Path to the reference genome FASTA passed to BWA.

    Raises:
        subprocess.CalledProcessError: If ``bwa index`` exits with a non-zero
            status.
    """
    print("Indexing the reference genome with BWA...")
    bwa_index = ["bwa", "index"]
    bwa_index.append(reference_fasta)
    subprocess.run(bwa_index, check=True)
    print("Reference genome indexing complete.\n")

In [4]:
def align_fastq_to_reference(reference_fasta, fastq_file1, fastq_file2, output_sam, threads=4):
    """
    Align a pair of FASTQ files to a reference with BWA-MEM.

    Runs ``bwa mem -t <threads> <reference> <fastq1> <fastq2>`` and redirects
    the command's standard output into ``output_sam``.

    Args:
        reference_fasta: Path to the reference genome given to ``bwa mem``.
        fastq_file1: Path to the first FASTQ file.
        fastq_file2: Path to the second FASTQ file (paired-end mate).
        output_sam: Path of the SAM file to write; opened in text write mode,
            so an existing file is truncated.
        threads: Value passed to the ``-t`` option of ``bwa mem``.

    Raises:
        subprocess.CalledProcessError: If ``bwa mem`` exits with a non-zero
            status.
    """
    print("Performing alignment with BWA-MEM...")

    # Options first, then the positional inputs in the order BWA receives them.
    bwa_mem = ["bwa", "mem", "-t", str(threads)]
    bwa_mem += [reference_fasta, fastq_file1, fastq_file2]

    # BWA writes the alignments to stdout, so stream them straight to the file.
    with open(output_sam, "w") as sam_handle:
        subprocess.run(bwa_mem, stdout=sam_handle, check=True)

    print(f"Alignment complete. SAM file saved to: {output_sam}\n")

In [5]:
def convert_sam_to_sorted_bam(sam_file, bam_file, threads=4):
    """
    Sort a SAM file into a BAM file and index the result with Samtools.

    Runs ``samtools sort -@ <threads> -o <bam_file> <sam_file>`` followed by
    ``samtools index <bam_file>``. The index step only runs if sorting
    succeeded.

    Args:
        sam_file: Path to the input SAM file.
        bam_file: Path of the sorted BAM file to write.
        threads: Value passed to the ``-@`` option of ``samtools sort``.

    Raises:
        subprocess.CalledProcessError: If either Samtools command exits with a
            non-zero status.
    """
    print("Converting SAM to sorted BAM using Samtools...")

    samtools_steps = (
        ["samtools", "sort", "-@", str(threads), "-o", bam_file, sam_file],
        ["samtools", "index", bam_file],
    )
    # check=True raises on the first failure, so later steps are skipped.
    for step in samtools_steps:
        subprocess.run(step, check=True)

    print(f"Sorted BAM file saved to: {bam_file}\n")

In [7]:
def main():
    """
    Run the alignment workflow end to end: index, align, then sort and index.

    Uses fixed input paths under ``data/`` and writes results to
    ``data/output``. Any exception raised by a step is caught and reported
    with ``print`` rather than propagated, and the remaining steps are skipped.
    """
    # Inputs: reference genome and the two paired-end FASTQ files
    genome = "data/ecoli_genome/ecoli_reference.fasta"
    reads_r1 = "data/ecoli_reads/SRR31783077_1.fastq"
    reads_r2 = "data/ecoli_reads/SRR31783077_2.fastq"

    # Outputs: raw alignments and the sorted BAM
    sam_path = "data/output/aligned_reads.sam"
    bam_path = "data/output/aligned_reads_sorted.bam"

    # Make sure the output directory exists before any tool writes to it
    os.makedirs("data/output", exist_ok=True)

    try:
        # 1) index the reference, 2) align the reads, 3) sort + index the BAM
        index_reference_genome(genome)
        align_fastq_to_reference(genome, reads_r1, reads_r2, sam_path, threads=4)
        convert_sam_to_sorted_bam(sam_path, bam_path, threads=4)
    except subprocess.CalledProcessError as err:
        # One of the external tools exited with a non-zero status
        print(f"Error occurred during execution: {err}")
    except Exception as err:
        # Anything else raised while running the steps
        print(f"Unexpected error: {err}")


# Run the workflow when this cell/script is executed directly
if __name__ == "__main__":
    main()


Indexing the reference genome with BWA...


[bwa_index] Pack FASTA... 0.03 sec
[bwa_index] Construct BWT for the packed sequence...
[bwa_index] 0.57 seconds elapse.
[bwa_index] Update BWT... 0.01 sec
[bwa_index] Pack forward-only FASTA... 0.01 sec
[bwa_index] Construct SA from BWT and Occ... 0.16 sec
[main] Version: 0.7.18-r1243-dirty
[main] CMD: bwa index data/ecoli_genome/ecoli_reference.fasta
[main] Real time: 0.794 sec; CPU: 0.793 sec
[M::bwa_idx_load_from_disk] read 0 ALT contigs


Reference genome indexing complete.

Performing alignment with BWA-MEM...


[M::process] read 273342 sequences (40000215 bp)...
[M::process] read 273370 sequences (40000269 bp)...
[M::mem_pestat] # candidate unique pairs for (FF, FR, RF, RR): (2, 104168, 13, 0)
[M::mem_pestat] skip orientation FF as there are not enough pairs
[M::mem_pestat] analyzing insert size distribution for orientation FR...
[M::mem_pestat] (25, 50, 75) percentile: (252, 360, 464)
[M::mem_pestat] low and high boundaries for computing mean and std.dev: (1, 888)
[M::mem_pestat] mean and std.dev: (363.18, 158.99)
[M::mem_pestat] low and high boundaries for proper pairs: (1, 1100)
[M::mem_pestat] analyzing insert size distribution for orientation RF...
[M::mem_pestat] (25, 50, 75) percentile: (55, 101, 1338)
[M::mem_pestat] low and high boundaries for computing mean and std.dev: (1, 3904)
[M::mem_pestat] mean and std.dev: (419.85, 600.79)
[M::mem_pestat] low and high boundaries for proper pairs: (1, 5187)
[M::mem_pestat] skip orientation RR as there are not enough pairs
[M::mem_pestat] skip 

Alignment complete. SAM file saved to: data/output/aligned_reads.sam

Converting SAM to sorted BAM using Samtools...


[bam_sort_core] merging from 0 files and 4 in-memory blocks...


Sorted BAM file saved to: data/output/aligned_reads_sorted.bam

