In [None]:
# Ch05-2 - Tools for Sequence Manipulation

In [None]:
# Get sample data
! wget -O sample.fasta "https://www.ncbi.nlm.nih.gov/sviewer/viewer.cgi?db=nuccore&id=NM_001200.1&report=fasta"
! mv sample.fasta data/
! wget -O cds_sequence.fasta "https://www.ncbi.nlm.nih.gov/sviewer/viewer.cgi?db=nuccore&id=NM_000518.5&report=fasta_cds_na&retmode=text"
! mv cds_sequence.fasta data/ 

In [43]:
# Install SRATools 
! curl --output sratoolkit.tar.gz https://ftp-trace.ncbi.nlm.nih.gov/sra/sdk/current/sratoolkit.current-mac64.tar.gz
! gunzip sratoolkit.tar.gz  
! tar -xvf sratoolkit.tar
! rm sratoolkit.tar

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 84.8M  100 84.8M    0     0  17.3M      0  0:00:04  0:00:04 --:--:-- 20.2M
x sratoolkit.3.1.1-mac-x86_64/
x sratoolkit.3.1.1-mac-x86_64/README.md
x sratoolkit.3.1.1-mac-x86_64/README-vdb-config
x sratoolkit.3.1.1-mac-x86_64/schema/
x sratoolkit.3.1.1-mac-x86_64/schema/vdb/
x sratoolkit.3.1.1-mac-x86_64/schema/vdb/vdb.vschema
x sratoolkit.3.1.1-mac-x86_64/schema/vdb/built-in.vschema
x sratoolkit.3.1.1-mac-x86_64/schema/insdc/
x sratoolkit.3.1.1-mac-x86_64/schema/insdc/insdc.vschema
x sratoolkit.3.1.1-mac-x86_64/schema/insdc/sra.vschema
x sratoolkit.3.1.1-mac-x86_64/schema/insdc/seq.vschema
x sratoolkit.3.1.1-mac-x86_64/schema/sra/
x sratoolkit.3.1.1-mac-x86_64/schema/sra/abi.vschema
x sratoolkit.3.1.1-mac-x86_64/schema/sra/pevents.vschema
x sratoolkit.3.1.1-mac-x86_64/schema/sra/generic-fastq.vschema
x sratoolkit.3.1.1-mac-x86_

In [44]:
# Download Sample fastq files
! sratoolkit.3.1.1-mac-x86_64/bin/fasterq-dump SRR000001
! mv *.fastq data/

spots read      : 470,985
reads read      : 1,883,940
reads written   : 707,026
reads 0-length  : 468,635
technical reads : 708,279


In [None]:
# Import Modules
from Bio.Seq import Seq  # BioPython Seq module
from Bio import SeqIO

In [None]:
# Read in our Sequences
# Define the input FASTA files
fasta_file1 = "data/sample.fasta"
fasta_file2 = "data/cds_sequence.fasta"
# Initialize an empty list to hold sequences
sample_sequence = []
cds_sequence = []
# Function to read a FASTA file and append sequences as strings
def read_fasta(file_path, seq_list):
    with open(file_path, "r") as handle:
        for record in SeqIO.parse(handle, "fasta"):
            seq_list.append(str(record.seq))
# Read sequences from both files
read_fasta(fasta_file1, sample_sequence)
read_fasta(fasta_file2, cds_sequence)
# Convert the sequences to strings
sample_seq_str = " ".join(sample_sequence)
cds_seq_str = " ".join(cds_sequence)
# Create a Seq object for each sequence 
dna_seq = Seq(sample_seq_str)
cds_seq = Seq(cds_seq_str)

In [None]:
# Complement the sample sequence
print("Complement:", dna_seq.complement())
# Reverse complement the sample sequence
print("Reverse Complement:", dna_seq.reverse_complement())
# Transcribe the CDS sequence to RNA
print("Transcription (DNA to RNA):", cds_seq.transcribe())
# Translate the CDS sequence from DNA to protein
print("Protein Translation (DNA to Protein):", cds_seq.translate(to_stop=True))

In [None]:
# Read Trimming 

In [45]:
from Bio import SeqIO
from Bio.SeqRecord import SeqRecord
from Bio.Seq import Seq

def trim_low_quality_bases(record, quality_threshold):
    """
    Trim low-quality bases from the 3' end of a read based on a Phred quality score.
    """
    qualities = record.letter_annotations["phred_quality"]
    trimmed_index = len(qualities)  # Default to full length
    
    # Find the position where quality drops below threshold
    for i in range(len(qualities) - 1, -1, -1):  # Iterate from the end to the beginning
        if qualities[i] >= quality_threshold:
            break
        trimmed_index = i
    
    # Create a new SeqRecord with the trimmed sequence and quality scores
    trimmed_seq = record.seq[:trimmed_index]
    trimmed_qual = qualities[:trimmed_index]
    
    trimmed_record = SeqRecord(
        Seq(str(trimmed_seq)),
        id=record.id,
        description=record.description,
        letter_annotations={"phred_quality": trimmed_qual}
    )
    
    return trimmed_record

def remove_adapter(record, adapter_seq):
    """
    Remove a known adapter sequence from the read if found.
    """
    seq_str = str(record.seq)
    adapter_position = seq_str.find(adapter_seq)
    
    if adapter_position != -1:
        # Trim the sequence at the adapter position
        trimmed_seq = record.seq[:adapter_position]
        trimmed_qual = record.letter_annotations["phred_quality"][:adapter_position]
        record = SeqRecord(
            Seq(str(trimmed_seq)),
            id=record.id,
            description=record.description,
            letter_annotations={"phred_quality": trimmed_qual}
        )
    return record

def process_fastq(input_fastq, output_fastq, quality_threshold=20, adapter_seq=None):
    """
    Process a FASTQ file to trim low-quality bases and optionally remove adapter sequences.
    """
    with open(input_fastq, "r") as input_handle, open(output_fastq, "w") as output_handle:
        for record in SeqIO.parse(input_handle, "fastq"):
            # Step 1: Trim low-quality bases
            record = trim_low_quality_bases(record, quality_threshold)
            
            # Step 2: Remove adapter sequence if provided
            if adapter_seq:
                record = remove_adapter(record, adapter_seq)
            
            # Write the processed record only if it has a sequence left
            if len(record.seq) > 0:
                SeqIO.write(record, output_handle, "fastq")
    print(f"Processing complete. Trimmed reads saved to {output_fastq}")

# Example usage
input_fastq = "data/SRR000001_1.fastq"  # Replace with your input FASTQ file
output_fastq = "data/processed_reads.fastq"  # Output file path
quality_threshold = 30  # Minimum Phred quality score
adapter_sequence = "AGATCGGAAGAGC"  # Replace with your known adapter sequence

process_fastq(input_fastq, output_fastq, quality_threshold, adapter_sequence)


Processing complete. Trimmed reads saved to data/processed_reads.fastq
