In [None]:
# Ch08-2 Using the Sequence Read Archive (SRA; formerly the Short Read Archive)

In [None]:
# To get the following code to run, fasterq-dump must already be in your PATH.
#  If you have not already done so, install it and add it to your PATH.
#  Then close your notebooks and restart Jupyter Notebook from a terminal where
#   fasterq-dump is visible in your PATH.

In [None]:
# Refer to Ch 5 Recipe 2 for sra tools install
# Run the commands below in your terminal (not in this notebook) to make sure fasterq-dump
#   is in your PATH and to add it to your zshrc file
#   (check that your path is correct, it may not be the same as below).
# They are kept commented out here because they are shell commands, not Python:
#   executing them in this cell would raise a SyntaxError and stop "Run All".
#
#     echo 'export PATH=$PATH:~/Software/sratoolkit.3.1.1-mac-x86_64/bin' >> ~/.zshrc
#     source ~/.zshrc
#
# Check that fasterq-dump is working:
#
#     fasterq-dump -h
#
# Then restart your jupyter notebook

In [None]:
# Install pysradb
! pip install pysradb

In [None]:
# Import Libraries
import os
import subprocess
from pysradb.sraweb import SRAweb

In [None]:
def fetch_sra_metadata(sra_accession):
    """
    Look up the metadata of an SRA accession through pysradb.

    Parameters:
        sra_accession (str): SRA study or run accession (e.g., SRP, SRX, SRA, or ERR).

    Returns:
        DataFrame: Metadata table for the accession, requested with detailed=True.
    """
    # A new SRAweb client is created for every call; the query result is returned as-is.
    return SRAweb().sra_metadata(sra_accession, detailed=True)

In [None]:
def download_sra_run(run_accession, output_dir="sra_data"):
    """
    Download SRA run data and convert it to FASTQ using fasterq-dump.

    Failures are reported on stdout instead of being raised. This covers both
    a non-zero exit status from fasterq-dump and the fasterq-dump executable
    not being found on PATH.

    Parameters:
        run_accession (str): The specific SRA run accession (e.g., SRR12345678).
        output_dir (str): Directory to save the downloaded data. It is created,
            including any missing parent directories, if it does not exist.

    Returns:
        None
    """
    # exist_ok=True replaces the exists()-then-makedirs() pair, which could fail
    # if the directory appeared between the check and the creation.
    os.makedirs(output_dir, exist_ok=True)

    print(f"Downloading SRA run {run_accession}...")
    try:
        # Command to download and convert to FASTQ
        subprocess.run(
            ["fasterq-dump", run_accession, "--outdir", output_dir, "--split-files"],
            check=True,
        )
    except FileNotFoundError:
        # subprocess raises this when the executable itself cannot be located.
        print(
            f"Error downloading {run_accession}: fasterq-dump was not found. "
            "Install the SRA Toolkit and make sure fasterq-dump is in your PATH."
        )
    except subprocess.CalledProcessError as e:
        # check=True turns a non-zero exit status into this exception.
        print(f"Error downloading {run_accession}: {e}")
    else:
        # Only reached when fasterq-dump exited with status 0.
        print(f"Download complete. Files saved in {output_dir}")

In [None]:
def main():
    """Print the metadata of a small test accession and download its first run."""
    sra_accession = "SRR536546"  # Small test dataset

    # Fetch metadata
    run_table = fetch_sra_metadata(sra_accession)
    print("Metadata for the accession:")
    print(run_table)

    # Nothing to download when the metadata table has no rows
    if run_table.empty:
        print("No runs found for this accession.")
        return

    # Download the first run as an example
    download_sra_run(run_table["run_accession"].iloc[0])


if __name__ == "__main__":
    main()

In [None]:
# Move folder to output
! mv sra_data output/

In [None]:
# Use BLAST #

In [5]:
# Query using BLAST via the NCBI API
from Bio.Blast import NCBIWWW
from Bio.Blast import NCBIXML

# Define a sample FASTA sequence (header line followed by the nucleotide sequence)
query_sequence = ">test_query\nATGGCCATTGTAATCATGTTCTAATAGTGTTCA"

# Submit the query to NCBI BLAST (nucleotide BLAST: blastn) against the "nt" database
result_handle = NCBIWWW.qblast("blastn", "nt", query_sequence)

# Save the results to a file; the handle is closed even if writing fails
try:
    with open("blast_result.xml", "w") as out_file:
        out_file.write(result_handle.read())
finally:
    result_handle.close()

print("BLAST search completed! Results saved in 'blast_result.xml'")

BLAST search completed! Results saved in 'blast_result.xml'


In [6]:
# Parse the BLAST output #
# Load the BLAST record stored in the XML results file
with open("blast_result.xml") as result_file:
    blast_records = NCBIXML.read(result_file)

# Print the first five alignments, with the score and E-value of each of their HSPs
top_hits = blast_records.alignments[:5]
for hit in top_hits:
    print(f"Hit: {hit.title}")
    for segment in hit.hsps:
        print(f"  Score: {segment.score}, E-value: {segment.expect}")

Hit: gi|1338838386|ref|XM_023806334.1| PREDICTED: Paramormyrops kingsleyae T-box transcription factor TBX5-like (LOC111840956), transcript variant X2, mRNA
  Score: 45.0, E-value: 1.12623
Hit: gi|1338838384|ref|XM_023806333.1| PREDICTED: Paramormyrops kingsleyae T-box transcription factor TBX5-like (LOC111840956), transcript variant X1, mRNA
  Score: 45.0, E-value: 1.12623
Hit: gi|2647104289|gb|CP141595.1| Rossellomorea aquimaris strain Rossellomorea aquimaris S-2 chromosome, complete genome
  Score: 44.0, E-value: 1.12623
Hit: gi|1190964948|ref|XR_002333164.1| PREDICTED: Arabidopsis lyrata subsp. lyrata uncharacterized LOC110229766 (LOC110229766), ncRNA
  Score: 43.0, E-value: 3.93094
Hit: gi|891573148|ref|XM_013165470.1| Schizosaccharomyces cryophilus OY26 RNA polymerase II associated Paf1 complex (SPOG_02808), mRNA
  Score: 42.0, E-value: 3.93094


In [None]:
## End of Notebook ##