In [1]:
import sys
import re
import itertools
import synapseclient
from synapseclient import Activity

In [2]:
# Log in to Synapse; `syn` is the client handle passed to every helper below.
syn = synapseclient.login()

Welcome, James Eddy!



## FASTQs, BAMs, counts

In [3]:
def get_star_inputs(syn, star_input_folder_id):
    """
    List every file record found directly under the STAR input folder.

    Input data in the AMP-AD knowledge portal is mostly unannotated,
    so the whole folder is fetched here and matched to samples later
    by filename instead of being filtered in the query. The folder
    holds input BAMs and may also hold FASTQs with unmapped reads.

    Args:
        syn: Synapse client exposing ``chunkedQuery``.
        star_input_folder_id: Synapse ID of the folder to list.

    Returns:
        A list with one query result record per file (the query
        selects ``id`` and ``name``).
    """
    query_template = 'select id, name from file where parentId=="{}"'
    input_records = syn.chunkedQuery(query_template.format(star_input_folder_id))
    # materialize the lazy query results so they can be scanned repeatedly
    return [record for record in input_records]

In [4]:
def get_star_out_r1_fastqs(syn, star_out_fastq_folder_id):
    """
    Query the STAR output FASTQ folder for files annotated as read R1.

    Args:
        syn: Synapse client exposing ``chunkedQuery``.
        star_out_fastq_folder_id: Synapse ID of the FASTQ output folder.

    Returns:
        Whatever ``syn.chunkedQuery`` returns for the query, passed
        through unchanged (it is not materialized into a list here).
    """
    r1_query = (
        'select id, name from file where parentId=="{}" and read=="R1"'
    ).format(star_out_fastq_folder_id)
    r1_results = syn.chunkedQuery(r1_query)
    return r1_results

In [5]:
def get_sample_star_inputs(input_list, sample_id):
    """
    Collect Synapse IDs of any input files whose name contains the
    sample ID.

    The sample ID is matched as literal text (regex metacharacters
    such as '.' or '(' are escaped), anywhere in the filename. The
    match is an unanchored substring match, so a sample ID that is a
    substring of another sample's filename will also match that file.

    Args:
        input_list: iterable of records with 'file.id' and 'file.name'
            keys (e.g. the list returned by get_star_inputs).
        sample_id: sample identifier to look for in each filename.

    Returns:
        List of 'file.id' values for matching records, in input order;
        empty if nothing matches.
    """
    # escape so the ID is treated as plain text, not as a pattern
    sample_pattern = re.compile(re.escape(sample_id))
    return [entity['file.id'] for entity in input_list
            if sample_pattern.search(entity['file.name'])]

In [6]:
def get_sample_r2_fastq(syn, star_out_fastq_folder_id, sample_id):
    """
    Fetch the R2 FASTQ record generated by STAR for the sample, if any.

    Args:
        syn: Synapse client exposing ``chunkedQuery``.
        star_out_fastq_folder_id: Synapse ID of the FASTQ output folder.
        sample_id: value the file's ``specimenID`` annotation must equal.

    Returns:
        The first query result record (the query selects ``id`` and
        ``name``), or None when the query yields no rows.
    """
    fastq_query = ('select id, name from file where parentId=="{}" '
                   'and specimenID=="{}" and read=="R2"'
                   .format(star_out_fastq_folder_id, sample_id))
    # built-in next() with a default works on Python 2 and 3 iterators and
    # replaces the explicit StopIteration handling around the .next() method
    return next(iter(syn.chunkedQuery(fastq_query)), None)

In [7]:
def get_sample_counts(syn, star_out_count_folder_id, sample_id):
    """
    Fetch the Synapse ID of the counts file generated by STAR for the
    sample.

    Args:
        syn: Synapse client exposing ``chunkedQuery``.
        star_out_count_folder_id: Synapse ID of the counts output folder.
        sample_id: value the file's ``specimenID`` annotation must equal.

    Returns:
        The 'file.id' of the first query result record.

    Raises:
        StopIteration: if the query yields no rows.
    """
    # use the folder ID passed in; the body previously read a differently
    # spelled notebook global and ignored this parameter
    counts_query = ('select id, name from file where parentId=="{}" '
                    'and specimenID=="{}"'
                    .format(star_out_count_folder_id, sample_id))
    # built-in next() works on both Python 2 and Python 3 iterators
    return next(iter(syn.chunkedQuery(counts_query)))['file.id']

In [8]:
def get_sample_bam(syn, star_out_bam_folder_id, sample_id):
    """
    Fetch the Synapse ID of the BAM file generated by STAR for the
    sample.

    Args:
        syn: Synapse client exposing ``chunkedQuery``.
        star_out_bam_folder_id: Synapse ID of the BAM output folder.
        sample_id: value the file's ``specimenID`` annotation must equal.

    Returns:
        The 'file.id' of the first query result record.

    Raises:
        StopIteration: if the query yields no rows.
    """
    bam_query = ('select id, name from file where parentId=="{}" '
                 'and specimenID=="{}"'
                 .format(star_out_bam_folder_id, sample_id))
    # built-in next() works on both Python 2 and Python 3 iterators,
    # unlike the Python 2-only .next() method
    return next(iter(syn.chunkedQuery(bam_query)))['file.id']

In [23]:
def set_star_output_provenance(syn, star_out_fastq_folder_id, 
                               star_input_list, star_script_id,
                               input_folder_id=None,
                               counts_folder_id=None,
                               bam_folder_id=None,
                               max_samples=2):
    """
    Dry run of provenance assignment for STAR reprocessing outputs.

    For each R1 FASTQ in the output FASTQ folder, parse the sample ID
    from the filename, find the matching input files, build an
    ``Activity`` from those inputs and the script, look up the sample's
    other outputs (optional R2 FASTQ, counts, BAM) and print the
    output IDs the activity would apply to. Nothing is written back
    to Synapse by this function; it only prints.

    Args:
        syn: Synapse client exposing ``chunkedQuery``.
        star_out_fastq_folder_id: Synapse ID of the STAR FASTQ output
            folder.
        star_input_list: records with 'file.id' / 'file.name' keys, as
            returned by get_star_inputs.
        star_script_id: reference to the executed script, passed to
            ``Activity(executed=...)``.
        input_folder_id: Synapse ID of the input folder (used only in
            the progress message). Defaults to the notebook-level
            ``star_input_folder_id``.
        counts_folder_id: Synapse ID of the counts output folder.
            Defaults to the notebook-level ``star_out_counts_folder_id``.
        bam_folder_id: Synapse ID of the BAM output folder. Defaults
            to the notebook-level ``star_out_bam_folder_id``.
        max_samples: maximum number of R1 FASTQ records to process
            (default 2, the previously hard-coded limit); pass None to
            process all of them.

    Raises:
        StopIteration: propagated from get_sample_counts /
            get_sample_bam when a sample has no matching file.
    """
    # Fall back to the notebook-level variables this function has always
    # read, so existing four-argument calls keep working.
    if input_folder_id is None:
        input_folder_id = star_input_folder_id
    if counts_folder_id is None:
        counts_folder_id = star_out_counts_folder_id
    if bam_folder_id is None:
        bam_folder_id = star_out_bam_folder_id

    # get R1 FASTQs on Synapse for all study samples
    star_out_r1_fastqs = get_star_out_r1_fastqs(syn, star_out_fastq_folder_id)

    for sample_r1_fastq in itertools.islice(star_out_r1_fastqs, 0, max_samples):
        # parse sample ID from R1 FASTQ filename (text before the first '.')
        sample_id = sample_r1_fastq['file.name'].split('.')[0]
        print("Collecting inputs/outputs for R1 FASTQ ID: '{}' (sample ID: '{}')"
              .format(sample_r1_fastq['file.id'], sample_id))

        # retrieve Synapse IDs for matched BAM input files
        # based on the current sample ID
        print("=> collecting matched input files from '{}'".format(input_folder_id))
        sample_star_input_ids = get_sample_star_inputs(star_input_list, sample_id)

        # build activity for current sample inputs
        print("=> building activity for script '{}'".format(star_script_id))
        sample_star_activity = Activity(
            used=sample_star_input_ids,
            executed=star_script_id
        )
        # the activity's 'used' list holds both inputs and the executed
        # script; 'wasExecuted' tells them apart
        print("\tActivity: used={}, executed={}".format(
              [u['reference']['targetId'] for u in sample_star_activity['used']
               if not u['wasExecuted']],
              [u['reference']['targetId'] for u in sample_star_activity['used']
               if u['wasExecuted']])
             )

        # retrieve Synapse IDs for matched R2 FASTQ, reprocessed BAM,
        # and counts file based on current sample ID
        print("=> getting matched R2 FASTQ from '{}'".format(star_out_fastq_folder_id))
        sample_r2_fastq = get_sample_r2_fastq(syn, star_out_fastq_folder_id, sample_id)

        print("=> getting matched counts from '{}'".format(counts_folder_id))
        sample_counts = get_sample_counts(syn, counts_folder_id, sample_id)

        print("=> getting matched BAM from '{}'".format(bam_folder_id))
        sample_bam = get_sample_bam(syn, bam_folder_id, sample_id)

        # combine sample output IDs; the R2 FASTQ is optional
        sample_star_output_ids = [sample_r1_fastq['file.id']]
        if sample_r2_fastq:
            sample_star_output_ids.append(sample_r2_fastq['file.id'])
        sample_star_output_ids.extend([sample_counts, sample_bam])

        # report the outputs the activity would be attached to
        print("=> setting provenance for the following outputs")
        for output_id in sample_star_output_ids:
            print("\tOutput ID: '{}'".format(output_id))

### MSSM

In [16]:
# site-contributed Synapse BAMs and unmapped FASTQs for MSSM
# NOTE: `star_input_folder_id` is also read as a notebook-level variable inside
# set_star_output_provenance(), so run this cell before that call for MSSM.
star_input_folder_id = 'syn7416949'
star_input_list = get_star_inputs(syn, star_input_folder_id)

In [24]:
# reprocessing output folders on Synapse for MSSM study
# NOTE: the counts and BAM folder IDs are not passed as arguments below;
# set_star_output_provenance() reads them as notebook-level variables.
star_out_fastq_folder_id = 'syn8612191'
star_out_counts_folder_id = 'syn8672879'
star_out_bam_folder_id = 'syn8540822'

# STAR reprocessing script for MSSM samples
# NOTE(review): this is a script filename, not a 'syn...' ID — confirm what
# Activity(executed=...) is expected to reference.
star_script_id = 'run_star_mssm.sh'

# dry run: print the inputs/outputs that provenance would link for MSSM
set_star_output_provenance(syn, star_out_fastq_folder_id, star_input_list, star_script_id)

Collecting inputs/outputs for R1 FASTQ ID: 'syn8620246' (sample ID: 'BM_10_546')
=> collecting matched input files from 'syn7416949'
=> building activity for script 'run_star_mssm.sh'
	Activity: used=[u'syn4055333', u'syn4055334'], executed=['run_star_mssm.sh']
=> getting matched R2 FASTQ from 'syn8612191'
=> getting matched counts from 'syn8672879'
=> getting matched BAM from 'syn8540822'
=> setting provenance for the following outputs
	Output ID: 'syn8620246'
	Output ID: 'syn8686802'
	Output ID: 'syn8593437'
Collecting inputs/outputs for R1 FASTQ ID: 'syn8669905' (sample ID: 'BM_10_548')
=> collecting matched input files from 'syn7416949'
=> building activity for script 'run_star_mssm.sh'
	Activity: used=[u'syn4055335', u'syn4055336'], executed=['run_star_mssm.sh']
=> getting matched R2 FASTQ from 'syn8612191'
=> getting matched counts from 'syn8672879'
=> getting matched BAM from 'syn8540822'
=> setting provenance for the following outputs
	Output ID: 'syn8669905'
	Output ID: 'syn86

### ROSMAP

In [27]:
# site-contributed Synapse BAMs for ROSMAP
# NOTE: overwrites the `star_input_folder_id` / `star_input_list` set for the
# previous study; set_star_output_provenance() reads the folder ID as a
# notebook-level variable.
star_input_folder_id = 'syn4164376'
star_input_list = get_star_inputs(syn, star_input_folder_id)

In [28]:
# reprocessing output folders on Synapse for ROSMAP study
# NOTE: the counts and BAM folder IDs are read as notebook-level variables by
# set_star_output_provenance(), not passed as arguments.
star_out_fastq_folder_id = 'syn8612097'
star_out_counts_folder_id = 'syn8683042'
star_out_bam_folder_id = 'syn8540863'

# STAR reprocessing script for ROSMAP samples
# NOTE(review): script filename rather than a 'syn...' ID — confirm.
star_script_id = 'run_star_rosmap.sh'

# dry run for ROSMAP
# NOTE: the recorded output of this cell ends in StopIteration right after
# "getting matched counts", i.e. the counts lookup returned no rows for the
# first sample.
set_star_output_provenance(syn, star_out_fastq_folder_id, star_input_list, star_script_id)

Collecting inputs/outputs for R1 FASTQ ID: 'syn8613921' (sample ID: '108_120418')
=> collecting matched input files from 'syn4164376'
=> building activity for script 'run_star_rosmap.sh'
	Activity: used=[u'syn4212548'], executed=['run_star_rosmap.sh']
=> getting matched R2 FASTQ from 'syn8612097'
=> getting matched counts from 'syn8683042'


StopIteration: 

### Mayo_TCX

In [29]:
# site-contributed Synapse BAMs for Mayo_TCX
# NOTE: overwrites the `star_input_folder_id` / `star_input_list` set for the
# previous study; set_star_output_provenance() reads the folder ID as a
# notebook-level variable.
star_input_folder_id = 'syn4894912'
star_input_list = get_star_inputs(syn, star_input_folder_id)

In [31]:
# reprocessing output folders on Synapse for Mayo_TCX study
# NOTE: the counts and BAM folder IDs are read as notebook-level variables by
# set_star_output_provenance(), not passed as arguments.
star_out_fastq_folder_id = 'syn8612203'
star_out_counts_folder_id = 'syn8672882'
star_out_bam_folder_id = 'syn8540820'

# STAR reprocessing script for Mayo_TCX samples
# NOTE(review): script filename rather than a 'syn...' ID — confirm.
star_script_id = 'run_star_mayo.sh'

# dry run: print the inputs/outputs that provenance would link for Mayo_TCX
set_star_output_provenance(syn, star_out_fastq_folder_id, star_input_list, star_script_id)

Collecting inputs/outputs for R1 FASTQ ID: 'syn8619777' (sample ID: '1005_TCX')
=> collecting matched input files from 'syn4894912'
=> building activity for script 'run_star_mayo.sh'
	Activity: used=[u'syn4898414'], executed=['run_star_mayo.sh']
=> getting matched R2 FASTQ from 'syn8612203'
=> getting matched counts from 'syn8672882'
=> getting matched BAM from 'syn8540820'
=> setting provenance for the following outputs
	Output ID: 'syn8619777'
	Output ID: 'syn8621132'
	Output ID: 'syn8685924'
	Output ID: 'syn8620557'
Collecting inputs/outputs for R1 FASTQ ID: 'syn8619364' (sample ID: '1010_TCX')
=> collecting matched input files from 'syn4894912'
=> building activity for script 'run_star_mayo.sh'
	Activity: used=[u'syn4899441'], executed=['run_star_mayo.sh']
=> getting matched R2 FASTQ from 'syn8612203'
=> getting matched counts from 'syn8672882'
=> getting matched BAM from 'syn8540820'
=> setting provenance for the following outputs
	Output ID: 'syn8619364'
	Output ID: 'syn8620580'


### Mayo_CBE

In [32]:
# site-contributed Synapse BAMs for Mayo_CBE
# NOTE: overwrites the `star_input_folder_id` / `star_input_list` set for the
# previous study; set_star_output_provenance() reads the folder ID as a
# notebook-level variable.
star_input_folder_id = 'syn5049322'
star_input_list = get_star_inputs(syn, star_input_folder_id)

In [36]:
# preview the first five input records (each has 'file.id' and 'file.name')
star_input_list[0:5]

[{u'file.id': u'syn5196750',
  u'file.name': u'1000_CER.FCC7KUNACXX_L5IATCACG.snap.bam'},
 {u'file.id': u'syn5051379',
  u'file.name': u'1010_CER.FCC7DJAACXX_L5ITTAGGC.snap.bam'},
 {u'file.id': u'syn5051388',
  u'file.name': u'1015_CER.FCC7983ACXX_L5IACTTGA.snap.bam'},
 {u'file.id': u'syn5051449',
  u'file.name': u'1019_CER.FCC7BR1ACXX_L7IGTGGCC.snap.bam'},
 {u'file.id': u'syn5051608',
  u'file.name': u'1027_CER.FCC7BR1ACXX_L2IGATCAG.snap.bam'}]

In [35]:
# reprocessing output folders on Synapse for Mayo_CBE study
# NOTE: the counts and BAM folder IDs are read as notebook-level variables by
# set_star_output_provenance(), not passed as arguments.
star_out_fastq_folder_id = 'syn8612213'
star_out_counts_folder_id = 'syn8672881'
star_out_bam_folder_id = 'syn8540821'

# STAR reprocessing script for Mayo_CBE samples (same script name as Mayo_TCX)
# NOTE(review): script filename rather than a 'syn...' ID — confirm.
star_script_id = 'run_star_mayo.sh'

# dry run for Mayo_CBE
# NOTE: no output is recorded for this cell in the saved notebook.
set_star_output_provenance(syn, star_out_fastq_folder_id, star_input_list, star_script_id)

### Mayo (failed QC)

In [None]:
# NOTE(review): bare Synapse ID, not assigned to any variable or used below —
# presumably a placeholder for the Mayo failed-QC data; confirm and wire up.
'syn9688855'

## Picard alignment metrics

In [40]:
def get_picard_out_metrics(syn, picard_out_metrics_folder_id):
    """
    Query the Picard output folder for its metrics files.

    Args:
        syn: Synapse client exposing ``chunkedQuery``.
        picard_out_metrics_folder_id: Synapse ID of the folder holding
            the Picard metrics tables.

    Returns:
        Whatever ``syn.chunkedQuery`` returns for the query, passed
        through unchanged (it is not materialized into a list here).
    """
    folder_query_template = 'select id, name from file where parentId=="{}"'
    metrics_results = syn.chunkedQuery(
        folder_query_template.format(picard_out_metrics_folder_id)
    )
    return metrics_results

In [41]:
def get_sample_picard_input(syn, picard_input_folder_id, sample_id):
    """
    Fetch the Synapse ID of the input BAM file for the sample.

    Args:
        syn: Synapse client exposing ``chunkedQuery``.
        picard_input_folder_id: Synapse ID of the folder holding the
            BAM files Picard was run on.
        sample_id: value the file's ``specimenID`` annotation must equal.

    Returns:
        The 'file.id' of the first query result record.

    Raises:
        StopIteration: if the query yields no rows.
    """
    bam_query = ('select id, name from file where parentId=="{}" '
                 'and specimenID=="{}"'
                 .format(picard_input_folder_id, sample_id))
    # built-in next() works on both Python 2 and Python 3 iterators,
    # unlike the Python 2-only .next() method
    return next(iter(syn.chunkedQuery(bam_query)))['file.id']

In [45]:
def set_picard_output_provenance(syn, picard_out_folder_id, 
                                 picard_input_folder_id, picard_script_id,
                                 max_samples=2):
    """
    Dry run of provenance assignment for Picard metrics outputs.

    For each metrics file in the Picard output folder, parse the
    sample ID from the filename, look up the matching input BAM,
    build an ``Activity`` from that BAM and the script, and print the
    output ID the activity would apply to. Nothing is written back to
    Synapse by this function; it only prints.

    Args:
        syn: Synapse client exposing ``chunkedQuery``.
        picard_out_folder_id: Synapse ID of the Picard metrics folder.
        picard_input_folder_id: Synapse ID of the folder holding the
            BAM files Picard was run on.
        picard_script_id: reference to the executed script, passed to
            ``Activity(executed=...)``.
        max_samples: maximum number of metrics records to process
            (default 2, the previously hard-coded limit); pass None to
            process all of them.

    Raises:
        StopIteration: propagated from get_sample_picard_input when a
            sample has no matching BAM.
    """
    picard_suffix = '_picard'

    # get metrics on Synapse for all study samples
    picard_out_metrics = get_picard_out_metrics(syn, picard_out_folder_id)

    for sample_metrics in itertools.islice(picard_out_metrics, 0, max_samples):
        # parse sample ID from metrics filename (text before the first '.')
        sample_id = sample_metrics['file.name'].split('.')[0]
        # drop a trailing '_picard' only; str.strip('_picard') would remove
        # any of the characters _ p i c a r d from BOTH ends and could eat
        # into the sample ID itself
        if sample_id.endswith(picard_suffix):
            sample_id = sample_id[:-len(picard_suffix)]
        print("Collecting inputs/outputs for metrics ID: '{}' (sample ID: '{}')"
              .format(sample_metrics['file.id'], sample_id))

        # retrieve Synapse ID for matched input BAM file
        print("=> getting matched BAM from '{}'".format(picard_input_folder_id))
        sample_bam = get_sample_picard_input(syn, picard_input_folder_id, sample_id)

        # build activity for current sample inputs
        print("=> building activity for script '{}'".format(picard_script_id))
        sample_picard_activity = Activity(
            used=sample_bam,
            executed=picard_script_id
        )
        # the activity's 'used' list holds both inputs and the executed
        # script; 'wasExecuted' tells them apart
        print("\tActivity: used={}, executed={}".format(
              [u['reference']['targetId'] for u in sample_picard_activity['used']
               if not u['wasExecuted']],
              [u['reference']['targetId'] for u in sample_picard_activity['used']
               if u['wasExecuted']])
             )

        # report the output the activity would be attached to
        print("=> setting provenance for the following outputs")
        print("\tOutput ID: '{}'".format(sample_metrics['file.id']))

In [46]:
# Picard metrics folder for MSSM
picard_out_folder_id = 'syn8673913' # MSSM
# Picard input folder: same ID as the MSSM STAR output BAM folder set earlier
picard_input_folder_id = 'syn8540822'
# NOTE(review): script filename rather than a 'syn...' ID — confirm.
picard_script_id = 'run_picard.sh'

# dry run: print the inputs/outputs that provenance would link for MSSM
set_picard_output_provenance(syn, picard_out_folder_id, 
                             picard_input_folder_id, picard_script_id)

Collecting inputs/outputs for metrics ID: 'syn8683105' (sample ID: 'BM_10_546')
=> getting matched BAM from 'syn8540822'
=> building activity for script 'run_picard.sh'
	Activity: used=[u'syn8593437'], executed=['run_picard.sh']
=> setting provenance for the following outputs
	Output ID: 'syn8683105'
Collecting inputs/outputs for metrics ID: 'syn8683891' (sample ID: 'BM_10_548')
=> getting matched BAM from 'syn8540822'
=> building activity for script 'run_picard.sh'
	Activity: used=[u'syn8604616'], executed=['run_picard.sh']
=> setting provenance for the following outputs
	Output ID: 'syn8683891'


In [47]:
# Picard metrics folder for ROSMAP
picard_out_folder_id = 'syn8683000' # ROSMAP
# Picard input folder: same ID as the ROSMAP STAR output BAM folder set earlier
picard_input_folder_id = 'syn8540863'
# NOTE(review): script filename rather than a 'syn...' ID — confirm.
picard_script_id = 'run_picard.sh'

# dry run for ROSMAP
# NOTE: the recorded output of this cell ends in StopIteration right after
# "getting matched BAM", i.e. the BAM lookup returned no rows for the parsed
# sample ID.
set_picard_output_provenance(syn, picard_out_folder_id, 
                             picard_input_folder_id, picard_script_id)

Collecting inputs/outputs for metrics ID: 'syn8685374' (sample ID: '01_120405')
=> getting matched BAM from 'syn8540863'


StopIteration: 

In [48]:
# Picard metrics folder for Mayo_TCX
picard_out_folder_id = 'syn8673915' # Mayo_TCX
# Picard input folder: same ID as the Mayo_TCX STAR output BAM folder set earlier
picard_input_folder_id = 'syn8540820'
# NOTE(review): script filename rather than a 'syn...' ID — confirm.
picard_script_id = 'run_picard.sh'

# dry run: print the inputs/outputs that provenance would link for Mayo_TCX
set_picard_output_provenance(syn, picard_out_folder_id, 
                             picard_input_folder_id, picard_script_id)

Collecting inputs/outputs for metrics ID: 'syn8683162' (sample ID: '1005_TCX')
=> getting matched BAM from 'syn8540820'
=> building activity for script 'run_picard.sh'
	Activity: used=[u'syn8620557'], executed=['run_picard.sh']
=> setting provenance for the following outputs
	Output ID: 'syn8683162'
Collecting inputs/outputs for metrics ID: 'syn8682855' (sample ID: '1010_TCX')
=> getting matched BAM from 'syn8540820'
=> building activity for script 'run_picard.sh'
	Activity: used=[u'syn8570047'], executed=['run_picard.sh']
=> setting provenance for the following outputs
	Output ID: 'syn8682855'


In [51]:
# Picard metrics folder for Mayo_CBE
picard_out_folder_id = 'syn8673914' # Mayo_CBE
# Picard input folder: same ID as the Mayo_CBE STAR output BAM folder set earlier
picard_input_folder_id = 'syn8540821'
# NOTE(review): script filename rather than a 'syn...' ID — confirm.
picard_script_id = 'run_picard.sh'

# dry run: print the inputs/outputs that provenance would link for Mayo_CBE
set_picard_output_provenance(syn, picard_out_folder_id, 
                             picard_input_folder_id, picard_script_id)

Collecting inputs/outputs for metrics ID: 'syn8684675' (sample ID: '1000_CER')
=> getting matched BAM from 'syn8540821'
=> building activity for script 'run_picard.sh'
	Activity: used=[u'syn8545498'], executed=['run_picard.sh']
=> setting provenance for the following outputs
	Output ID: 'syn8684675'
Collecting inputs/outputs for metrics ID: 'syn8684470' (sample ID: '1010_CER')
=> getting matched BAM from 'syn8540821'
=> building activity for script 'run_picard.sh'
	Activity: used=[u'syn8541791'], executed=['run_picard.sh']
=> setting provenance for the following outputs
	Output ID: 'syn8684470'


## Combined counts

In [None]:
# Versioned Synapse IDs ('synNNN.V') of the combined counts file per study.
# NOTE: each assignment overwrites the previous one, so only the Mayo_CBE
# value survives if this cell is run as-is; this cell has not been executed.
study_counts_id = 'syn8691134.1' # ROSMAP
study_counts_id = 'syn8691099.1' # MSSM
study_counts_id = 'syn8690799.1' # Mayo_TCX
study_counts_id = 'syn8690904.1' # Mayo_CBE

## Combined metrics

In [None]:
# Versioned Synapse IDs ('synNNN.V') listed under "Combined metrics".
# NOTE(review): these reuse the name `study_counts_id` from the combined
# counts cell — presumably a separate metrics variable was intended; confirm.
# NOTE: each assignment overwrites the previous one, so only the Mayo_CBE
# value survives if this cell is run as-is; this cell has not been executed.
study_counts_id = 'syn8698240.2' # ROSMAP
study_counts_id = 'syn8698270.1' # MSSM
study_counts_id = 'syn8698211.1' # Mayo_TCX
study_counts_id = 'syn8698214.1' # Mayo_CBE