In [150]:
import sys
import re
import itertools
import synapseclient
from synapseclient import Activity

In [137]:
syn = synapseclient.login()

Welcome, James Eddy!



## FASTQs, BAMs, counts

In [51]:
def get_star_inputs(syn, star_input_folder_id):
    """
    Collect full list of input BAM Synapse entities for the
    current study (most of this data in the AMP-AD knowledge
    portal is not annotated, so can't directly filter with
    query). Note: inputs can also include FASTQs with
    unmapped reads.
    """
    query = ('select id, name from file where parentId=="{}"' 
             .format(star_input_folder_id))
    return list(syn.chunkedQuery(query))

In [52]:
def get_star_out_r1_fastqs(syn, star_out_fastq_folder_id):
    """
    Collect the R1 FASTQ Synapse entities generated by the STAR
    reprocessing activity for the current study.
    """
    fastq_query = ('select id, name from file where parentId=="{}" '
                   'and read=="R1"'
                   .format(star_out_fastq_folder_id))
    return syn.chunkedQuery(fastq_query)

In [53]:
def get_sample_star_inputs(input_list, sample_id):
    """
    Collect Synapse IDs of any input files matching the sample ID.
    """
    return [entity['file.id'] for entity in input_list
            if re.search(sample_id, entity['file.name'])]

In [97]:
def get_sample_r2_fastq(syn, star_out_fastq_folder_id, sample_id):
    """
    Collect Synapse ID for R2 FASTQ generated by STAR and
    matching the sample ID.
    """
    fastq_query = ('select id, name from file where parentId=="{}" '
                    'and specimenID=="{}" and read=="R2"'
                    .format(star_out_fastq_folder_id, sample_id))
    try:
        return syn.chunkedQuery(fastq_query).next()
    except StopIteration:
        return

In [70]:
def get_sample_counts(syn, star_out_count_folder_id, sample_id):
    """
    Collect Synapse ID for counts generated by STAR and
    matching the sample ID.
    """
    counts_query = ('select id, name from file where parentId=="{}" '
                    'and specimenID=="{}"'
                    .format(star_out_counts_folder_id, sample_id))
    return syn.chunkedQuery(counts_query).next()['file.id']

In [73]:
def get_sample_bam(syn, star_out_bam_folder_id, sample_id):
    """
    Collect Synapse ID for BAM file generated by STAR and
    matching the sample ID.
    """
    bam_query = ('select id, name from file where parentId=="{}" '
                    'and specimenID=="{}"'
                    .format(star_out_bam_folder_id, sample_id))
    return syn.chunkedQuery(bam_query).next()['file.id']

In [158]:
def set_star_output_provenance(syn, star_out_fastq_folder_id, 
                               star_input_list, star_script_id):
    # get R1 FASTQs on Synapse for all study samples
    star_out_r1_fastqs = get_star_out_r1_fastqs(syn, out_fastq_folder_id)
    
    for sample_r1_fastq in itertools.islice(star_out_r1_fastqs, 0, 2):
        # parse sample ID from R1 FASTQ filename
        sample_id = sample_r1_fastq['file.name'].split('.')[0]
        print("Collecting inputs/outputs for R1 FASTQ ID: '{}' (sample ID: '{}')"
              .format(sample_r1_fastq['file.id'], sample_id))

        # retrieve Synapse IDs for matched BAM input files
        # based on the current sample ID
        print("=> collecting matched input files from '{}'".format(star_input_folder_id))
        sample_star_input_ids = get_sample_star_inputs(input_list, sample_id)

        # build activity for current sample inputs
        print("=> building activity for script '{}'".format(star_script_id))
        sample_star_activity = Activity(
            used=sample_star_input_ids, 
            executed=star_script_id
        )
        print("\tActivity: used={}, executed={}".format(
              [u['reference']['targetId'] for u in sample_star_activity['used']
               if not u['wasExecuted']],
              [u['reference']['targetId'] for u in sample_star_activity['used']
               if u['wasExecuted']])
             )

        # retrieve Synapse IDs for matched R2 FASTQ, reprocessed BAM,
        # and counts file based on current sample ID
        print("=> getting matched R2 FASTQ from '{}'".format(star_out_fastq_folder_id))
        sample_r2_fastq = get_sample_r2_fastq(syn, star_out_fastq_folder_id, sample_id)

        print("=> getting matched counts from '{}'".format(star_out_counts_folder_id))
        sample_counts = get_sample_counts(syn, star_out_counts_folder_id, sample_id)

        print("=> getting matched BAM from '{}'".format(star_out_bam_folder_id))
        sample_bam = get_sample_bam(syn, star_out_bam_folder_id, sample_id)

        # combine sample output IDs
        if sample_r2_fastq:
            sample_star_output_ids = [
                sample_r1_fastq['file.id'], 
                sample_r2_fastq['file.id'], 
                sample_counts, 
                sample_bam,
            ]
        else:
            sample_star_output_ids = [
                sample_r1_fastq['file.id'], 
                sample_counts, 
                sample_bam,
            ]

        # set provenance for all sample outputs
        print("=> setting provenance for the following outputs")
        for output_id in sample_star_output_ids:
            print("\tOutput ID: '{}'".format(output_id))

### MSSM

In [54]:
# site-contributed Synapse BAMs and unmapped FASTQs for MSSM
star_input_folder_id = 'syn7416949'
star_input_list = get_star_inputs(syn, star_input_folder_id)

In [159]:
# reprocessing output folders on Synapse for MSSM study
star_out_fastq_folder_id = 'syn8612191'
star_out_counts_folder_id = 'syn8672879'
star_out_bam_folder_id = 'syn8540822'

# Synapse ID for STAR reprocessing script for MSSM samples
star_script_id = 'run_star_mssm.sh'

# do things
set_star_output_provenance(syn, star_out_bam_folder_id, star_input_list, star_script_id)

Collecting inputs/outputs for R1 FASTQ ID: 'syn8613453' (sample ID: 'hB_RNA_12838')
=> collecting matched input files from 'syn7416949'
=> building activity for script 'run_star_mssm.sh'
	Activity: used=[u'syn5518922', u'syn5519399'], executed=['run_star_mssm.sh']
=> getting matched R2 FASTQ from 'syn8540822'
=> getting matched counts from 'syn8672879'
=> getting matched BAM from 'syn8540822'
=> setting provenance for the following outputs
	Output ID: 'syn8613453'
	Output ID: 'syn8686958'
	Output ID: 'syn8596856'
Collecting inputs/outputs for R1 FASTQ ID: 'syn8613635' (sample ID: 'BM_22_154')
=> collecting matched input files from 'syn7416949'
=> building activity for script 'run_star_mssm.sh'
	Activity: used=[u'syn4055855', u'syn4055856'], executed=['run_star_mssm.sh']
=> getting matched R2 FASTQ from 'syn8540822'
=> getting matched counts from 'syn8672879'
=> getting matched BAM from 'syn8540822'
=> setting provenance for the following outputs
	Output ID: 'syn8613635'
	Output ID: 'sy

### ROSMAP

In [None]:
# site-contributed Synapse BAMs for ROSMAP
star_input_folder_id = 'syn4164376'
star_input_list = get_star_inputs(syn, star_input_folder_id)

In [None]:
# reprocessing output folders on Synapse for ROSMAP study
star_out_fastq_folder_id = 'syn8612097'
star_out_counts_folder_id = 'syn8683042'
star_out_bam_folder_id = 'syn8540863'

# Synapse ID for STAR reprocessing script for ROSMAP samples
star_script_id = 'run_star_rosmap.sh'

set_star_output_provenance(syn, star_out_bam_folder_id, star_input_list, star_script_id)

### Mayo_TCX

In [None]:
# site-contributed Synapse BAMs for Mayo_TCX
star_input_folder_id = 'syn4894912'
star_input_list = get_star_inputs(syn, star_input_folder_id)

In [None]:
# reprocessing output folders on Synapse for Mayo_TCX study
star_out_fastq_folder_id = 'syn8612203'
star_out_counts_folder_id = 'syn8672882'
star_out_bam_folder_id = 'syn8540820'

# Synapse ID for STAR reprocessing script for Mayo_TCX samples
star_script_id = 'run_star_mayo.sh'

set_star_output_provenance(syn, star_out_bam_folder_id, star_input_list, star_script_id)

### Mayo_CBE

In [None]:
# site-contributed Synapse BAMs for Mayo_CBE
star_input_folder_id = 'syn5049322'
star_input_list = get_star_inputs(syn, star_input_folder_id)

In [None]:
# reprocessing output folders on Synapse for Mayo_CBE study
star_out_fastq_folder_id = 'syn8612213'
star_out_counts_folder_id = 'syn8672881'
star_out_bam_folder_id = 'syn8540821'

# Synapse ID for STAR reprocessing script for Mayo_CBE samples
star_script_id = 'run_star_mayo.sh'

set_star_output_provenance(syn, star_out_bam_folder_id, star_input_list, star_script_id)

### Mayo (failed QC)

In [None]:
'syn9688855'

## Picard alignment metrics

In [None]:
picard_out_folder_id = 'syn8683000' # ROSMAP
picard_out_folder_id = 'syn8673913' # MSSM
picard_out_folder_id = 'syn8673915' # Mayo_TCX
picard_out_folder_id = 'syn8663914' # Mayo_CBE

## Combined counts

In [None]:
study_counts_id = 'syn8691134.1' # ROSMAP
study_counts_id = 'syn8691099.1' # MSSM
study_counts_id = 'syn8690799.1' # Mayo_TCX
study_counts_id = 'syn8690904.1' # Mayo_CBE

## Combined metrics

In [None]:
study_counts_id = 'syn8698240.2' # ROSMAP
study_counts_id = 'syn8698270.1' # MSSM
study_counts_id = 'syn8698211.1' # Mayo_TCX
study_counts_id = 'syn8698214.1' # Mayo_CBE