# SRA Testing Module
For scratch testing of the SRA module.

In [1]:
# Enable autoreload so that modified files can be re-loaded
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
%pwd

'C:\\Users\\Tyler\\git\\pynome\\tests'

In [60]:
# Add the path of the pynome project.
import os
import csv
import json
import sys
sys.path.insert(0, os.path.abspath('..'))

In [61]:
from pynome.sra import (
    build_sra_query_string,
    run_sra_query,
    fetch_sra_info,
    parse_sra_query_response,
    get_SRA_accession,
    chunk_accession_id, build_sra_path
)

In [62]:
# print(fetch_result['EXPERIMENT_PACKAGE_SET']['EXPERIMENT_PACKAGE']['EXPERIMENT']['@accession'])
# print(fetch_result['EXPERIMENT_PACKAGE_SET']['EXPERIMENT_PACKAGE']['SUBMISSION']['@accession'])
# print(fetch_result['EXPERIMENT_PACKAGE_SET']['EXPERIMENT_PACKAGE']['STUDY']['@accession'])
# print(fetch_result['EXPERIMENT_PACKAGE_SET']['EXPERIMENT_PACKAGE']['SAMPLE']['@accession'])

In [77]:
def download_sra_json(taxonomy_id_list, base_download_path=None):
    """
    Downloads the SRA metadata for each ID found in the
    `taxonomy_id_list`. Every fetched record is written as
    `<accession>.sra.json` inside the directory that `build_sra_path`
    produces for that accession.

    :param taxonomy_id_list:
        A list of taxonomy identification values.
    :param base_download_path:
        The base location where the SRA accession number folders
        will be placed. When `None` (the default), the path returned
        by `build_sra_path` is used unchanged.
    :return:
        A dictionary keyed by taxonomy ID. The value is `True` when
        at least one metadata file was written for that ID, and
        `False` when the query produced nothing to download.
        Errors raised by the query, fetch or file-system calls are
        not caught here and propagate to the caller.
    """

    # Create the output status dictionary to track whether a given
    # taxonomy ID was downloaded successfully or not.
    status_list = dict()

    # For each of the taxonomy ID numbers provided.
    for taxonomy_id in taxonomy_id_list:

        # Start pessimistic; `setdefault` keeps an earlier success if
        # the same taxonomy ID appears more than once in the input.
        status_list.setdefault(taxonomy_id, False)

        # Generate the corresponding query.
        query = build_sra_query_string(taxonomy_id)
        print("\nQUERY: \n", query)

        # Run the query.
        query_response = run_sra_query(query)
        print("\nRESPONSE: \n", query_response)

        # Parse the response, get the list of SRA identification
        # numbers so that the corresponding metadata can be
        # downloaded.
        fetch_id_list = parse_sra_query_response(query_response)
        print("\nFETCHED ID LIST: \n", fetch_id_list)

        # Nothing was found for this taxonomy ID.
        if fetch_id_list is None:
            continue

        # NOTE(review): a search with exactly one hit carries a bare
        # string rather than a list in the raw response. If the parser
        # passes that through, iterating it would walk single
        # characters, so wrap it defensively. Confirm against
        # `parse_sra_query_response`.
        if isinstance(fetch_id_list, str):
            fetch_id_list = [fetch_id_list]

        # Iterate through the fetch ID numbers.
        for fetch_id in fetch_id_list:

            # Get the desired *.json file associated with
            # the current fetch ID.
            print('FETCH ID', fetch_id)
            fetch_result = fetch_sra_info(fetch_id)

            # Get the ERR or SRR from the fetched result.
            SRA_accession_list = get_SRA_accession(fetch_result)
            print("\nSRA accession:\n", SRA_accession_list)

            # Same single-value guard as above, for the accessions.
            if isinstance(SRA_accession_list, str):
                SRA_accession_list = [SRA_accession_list]

            for sra_id in SRA_accession_list:
                print('sra_id', sra_id)

                # Create the broken up path, rooted at the requested
                # base directory when one was given.
                path = build_sra_path(sra_id)
                if base_download_path is not None:
                    path = os.path.join(base_download_path, path)
                print(path)

                # Create this path if it does not exist. `exist_ok`
                # avoids the race between checking and creating.
                os.makedirs(path, exist_ok=True)

                # Write the file.
                file_name = sra_id + '.sra.json'
                with open(os.path.join(path, file_name), 'w') as nfp:
                    json.dump(fetch_result, nfp)

                print("\nFILE WRITTEN AT:\n", path, file_name)
                status_list[taxonomy_id] = True

    return status_list

In [78]:
# Smoke-test the downloader with a single taxonomy ID.
download_sra_json(['1094985'])


QUERY: 
 txid1094985[Organism:noexp]+AND+biomol+rna[Properties]+AND+platform+illumina[Properties]+AND+100:1000[ReadLength]+AND+"paired"[Layout]

RESPONSE: 
 OrderedDict([('eSearchResult', OrderedDict([('Count', '1'), ('RetMax', '1'), ('RetStart', '0'), ('IdList', OrderedDict([('Id', '244092')])), ('TranslationSet', None), ('TranslationStack', OrderedDict([('TermSet', [OrderedDict([('Term', 'txid1094985[Organism:noexp]'), ('Field', 'Organism'), ('Count', '3'), ('Explode', 'N')]), OrderedDict([('Term', 'biomol rna[Properties]'), ('Field', 'Properties'), ('Count', '779724'), ('Explode', 'N')]), OrderedDict([('Term', 'platform illumina[Properties]'), ('Field', 'Properties'), ('Count', '3177818'), ('Explode', 'N')]), OrderedDict([('Term', '00000000100[ReadLength]'), ('Field', 'ReadLength'), ('Count', '0'), ('Explode', 'N')]), OrderedDict([('Term', '00000001000[ReadLength]'), ('Field', 'ReadLength'), ('Count', '0'), ('Explode', 'N')]), OrderedDict([('Term', '"paired"[Layout]'), ('Field', 'L

In [8]:
# Taxonomy IDs used to exercise `download_sra_json`.
tax_id_list = [
    1290391,
    999810,
    1124627,
    1356009,
    1094988,
    1094985,
    1094992,
    1094990,
    1094989,
    294748,
]

In [9]:
# Fetch the SRA record for a single ID and keep it for inspection.
test_fetch_result = fetch_sra_info('149157')

In [10]:
# Fetch a second SRA record, by a different ID, and keep it for inspection.
test_fetch_result2 = fetch_sra_info('147979')

In [11]:
# RUN is indexed with [0] here, so this record holds a sequence of runs;
# read the accession of the first one.
test_fetch_result2['EXPERIMENT_PACKAGE_SET']['EXPERIMENT_PACKAGE']['RUN_SET']['RUN'][0]['@accession']

'SRR489777'

In [12]:
# test_fetch_result['EXPERIMENT_PACKAGE_SET']['EXPERIMENT_PACKAGE']['RUN_SET']['RUN']['@accession']

'SRR447617'

In [14]:
# Run the downloader over every taxonomy ID in `tax_id_list`.
download_sra_json(tax_id_list)


QUERY: 
 txid1290391[Organism:noexp]+AND+biomol+rna[Properties]+AND+platform+illumina[Properties]+AND+100:1000[ReadLength]+AND+"paired"[Layout]

RESPONSE: 
 OrderedDict([('eSearchResult', OrderedDict([('Count', '0'), ('RetMax', '0'), ('RetStart', '0'), ('IdList', None), ('TranslationSet', None), ('TranslationStack', OrderedDict([('TermSet', [OrderedDict([('Term', 'txid1290391[Organism:noexp]'), ('Field', 'Organism'), ('Count', '1'), ('Explode', 'N')]), OrderedDict([('Term', 'biomol rna[Properties]'), ('Field', 'Properties'), ('Count', '779724'), ('Explode', 'N')]), OrderedDict([('Term', 'platform illumina[Properties]'), ('Field', 'Properties'), ('Count', '3177818'), ('Explode', 'N')]), OrderedDict([('Term', '00000000100[ReadLength]'), ('Field', 'ReadLength'), ('Count', '0'), ('Explode', 'N')]), OrderedDict([('Term', '00000001000[ReadLength]'), ('Field', 'ReadLength'), ('Count', '0'), ('Explode', 'N')]), OrderedDict([('Term', '"paired"[Layout]'), ('Field', 'Layout'), ('Count', '2367418


SRA accession:
 ['SRR567695', 'SRR567711', 'SRR567719', 'SRR567746']
sra_id SRR567695
Chunked ID
 ['SRR', '95']
sra_id SRR567711
Chunked ID
 ['SRR', '11']
sra_id SRR567719
Chunked ID
 ['SRR', '19']
sra_id SRR567746
Chunked ID
 ['SRR', '46']

QUERY: 
 txid1094990[Organism:noexp]+AND+biomol+rna[Properties]+AND+platform+illumina[Properties]+AND+100:1000[ReadLength]+AND+"paired"[Layout]

RESPONSE: 
 OrderedDict([('eSearchResult', OrderedDict([('Count', '6'), ('RetMax', '6'), ('RetStart', '0'), ('IdList', OrderedDict([('Id', ['149153', '149151', '149146', '147978', '147970', '147969'])])), ('TranslationSet', None), ('TranslationStack', OrderedDict([('TermSet', [OrderedDict([('Term', 'txid1094990[Organism:noexp]'), ('Field', 'Organism'), ('Count', '8'), ('Explode', 'N')]), OrderedDict([('Term', 'biomol rna[Properties]'), ('Field', 'Properties'), ('Count', '779724'), ('Explode', 'N')]), OrderedDict([('Term', 'platform illumina[Properties]'), ('Field', 'Properties'), ('Count', '3177818'), ('E

In [31]:
# Split the digits of an accession number into a list of single characters.
test_str = [digit for digit in '447617']

In [36]:
# Show the first two characters of the split-up digits.
test_str[:2]


['4', '4']

In [41]:
# Walk the character list two items at a time, printing each start
# offset followed by the two-item slice that begins there.
chunk_size = 2
for offset in range(0, len(test_str), chunk_size):
    print(offset)
    print(test_str[offset:offset + chunk_size])

0
['4', '4']
2
['7', '6']
4
['1', '7']
