# load library

In [1]:
# auto reloading of local scripts under dev
import pprint
%load_ext autoreload
%autoreload 2

In [13]:
# standard-library modules, plus dateutil and pandas
import http
import re
import os
import sys
import datetime
import pprint
from dateutil import parser
import pandas as pd

In [3]:
# Google Cloud and FISS
from firecloud import api as fapi

from google.cloud import storage
storage_client = storage.Client()

In [62]:
# Put the parent directory on sys.path (only once) so the local `src` package can be imported.
module_path = os.path.abspath(os.pardir)
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)

from src.gcs_utils import GcsPath

from src.terra.table_utils import *
from src.terra.submission.submission_utils import verify_before_submit

# load data

In [5]:
# primary_namespace = 'production-long-reads'
# primary_workspace = 'broad-gp-pacbio'
# root_data_type = 'sample'

In [6]:
# Workspace coordinates and root entity type used by the cells below.
primary_namespace = "broad-firecloud-dsde-methods"
primary_workspace = "HGSVC2-unified"
root_data_type = "ccs-flowcell"

In [7]:
# Fetch the root table for the configured namespace / workspace / entity type.
flowcell_table = fetch_existing_root_table(
    ns=primary_namespace,
    ws=primary_workspace,
    etype=root_data_type,
)

In [8]:
# Columns listed as GCS locations; they are cast to plain strings further down.
gcs_locations = [
    'aligned_bai', 'aligned_bam', 'aligned_pbi',
    'ccs_bam', 'ccs_pbi', 'ccs_report',
    'fingerprint_details', 'fingerprint_metrics',
    'fq',
    'gcs_input_dir',
    'input_bam', 'input_pbi',
    'subreads_bam', 'subreads_pbi',
]

In [None]:
# Identifier columns, grouped by where the identifier comes from.
lab_identity = [
    'bio_sample',
    'description',
    'well_sample',
]
sequencer_identity = [
    'flowcell_id',
    'movie_name',
    'well_name',
]
terra_identity = [
    'sample',
]

In [None]:
# Column groups of the flowcell table, each paired with the dtype it is cast to below.
categorical_columns = dict(
    type='category',
    columns=['application', 'experiment_type', 'instrument', 'workspace'],
)

# 'timezone' is the zone parsed timestamps are converted into.
date_time_columns = dict(
    type='datetime64',
    timezone=datetime.timezone.utc,
    columns=['created_at'],
)

boolean_columns = dict(
    type='bool',
    columns=['is_ccs', 'is_corrected', 'is_isoseq'],
)

# 'Int64' (capital I) is the pandas nullable integer dtype.
int_type_columns = dict(
    type='Int64',
    columns=[
        'aligned_num_bases', 'aligned_num_reads', 'aligned_read_length_N50',
        'ccs_zmws_fail_filters', 'ccs_zmws_input', 'ccs_zmws_pass_filters', 'ccs_zmws_shortcut_filters',
        'insert_size',
        'num_bases', 'num_reads',
        'num_reads_Q10', 'num_reads_Q12', 'num_reads_Q15', 'num_reads_Q5', 'num_reads_Q7',
        'num_records',
        'total_length',
    ],
)

float_type_columns = dict(
    type='float64',
    columns=[
        'lod_expected_sample',
        'aligned_est_fold_cov', 'raw_est_fold_cov',
        'aligned_frac_bases',
        'aligned_read_length_mean', 'aligned_read_length_median', 'aligned_read_length_stdev',
        'average_identity', 'median_identity',
        'ccs_zmws_fail_filters_pct', 'ccs_zmws_pass_filters_pct', 'ccs_zmws_shortcut_filters_pct',
        'polymerase_read_length_N50', 'polymerase_read_length_mean',
        'read_length_N50', 'read_length_mean', 'read_length_median', 'read_length_stdev',
        'read_qual_mean', 'read_qual_median',
        'subread_read_length_N50', 'subread_read_length_mean',
    ],
)

# Every GCS-location and identifier column is handled as a plain string.
string_type_columns = dict(
    type='str',
    columns=[*gcs_locations, *terra_identity, *lab_identity, *sequencer_identity],
)

In [None]:
# Only the exact spellings 'TRUE', 'True' and 'true' count as True; anything else becomes False.
for col in boolean_columns['columns']:
    is_true = flowcell_table[col].apply(lambda cell: cell in ('TRUE', 'True', 'true'))
    flowcell_table[col] = is_true.astype(boolean_columns['type'])

In [None]:
# Cast each listed column to the configured categorical dtype, in place.
for col in categorical_columns['columns']:
    as_category = flowcell_table[col].astype(categorical_columns['type'])
    flowcell_table[col] = as_category

In [None]:
# Cast each listed column to the configured string dtype, in place.
# NOTE(review): with 'str', missing cells presumably end up as literal text ('nan' / 'None') — confirm this is intended.
for col in string_type_columns['columns']:
    as_text = flowcell_table[col].astype(string_type_columns['type'])
    flowcell_table[col] = as_text

In [None]:
def convert_to_float(e) -> "float | None":
    """Parse one table cell into a float, mapping missing values to None.

    :param e: raw cell value. Normally a string; None and numeric values are accepted as well.
    :return: the parsed float, or None when the cell is None, blank, a NaN, or spells
             'nan' / 'none' (case-insensitive, surrounding whitespace ignored).
             A genuine zero is returned as 0.0, not as None.
    :raises ValueError: the text is not a number (e.g. 'abc'); the offending value is printed first.
    :raises TypeError: the value is of a type float() cannot handle; the value is printed first.
    """
    import math  # local import keeps this cell self-contained

    if e is None:
        return None
    if isinstance(e, str):
        e = e.strip()
        if not e or e.lower() in ('nan', 'none'):
            return None
    try:
        value = float(e)
    except (TypeError, ValueError):
        # float() raises ValueError for unparsable text and TypeError for unsupported types;
        # show the culprit before propagating so the bad cell can be located.
        print(e)
        raise
    # Covers float NaN inputs and spellings such as '+nan' that float() accepts.
    return None if math.isnan(value) else value


def convert_to_int(e) -> "int | None":
    """Parse one table cell into an int (rounded with the built-in round), or None when missing.

    :param e: raw cell value, interpreted by convert_to_float.
    :return: rounded integer, or None when convert_to_float reports a missing value.
    """
    f = convert_to_float(e)
    # Compare against None explicitly: a plain truthiness test would turn a real 0 into "missing".
    return None if f is None else round(f)

In [None]:
# Convert each integer column; on a ValueError, print the column name before re-raising.
for col in int_type_columns['columns']:
    try:
        as_ints = flowcell_table[col].apply(convert_to_int)
        flowcell_table[col] = as_ints.astype(int_type_columns['type'])
    except ValueError:
        print(col)
        raise

In [None]:
# Convert each float column; on a ValueError, print the column name before re-raising.
for col in float_type_columns['columns']:
    try:
        as_floats = flowcell_table[col].apply(convert_to_float)
        flowcell_table[col] = as_floats.astype(float_type_columns['type'])
    except ValueError:
        print(col)
        raise

In [None]:
def convert_date_time(s):
    """Parse an ISO-8601 string into a pandas Timestamp in the configured timezone.

    :param s: ISO-8601 formatted date/time string.
    :return: timezone-aware Timestamp converted to date_time_columns['timezone'].
             When the string cannot be parsed, or the value does not fit a Timestamp,
             the smallest representable Timestamp is returned as a sentinel.
    """
    target_tz = date_time_columns['timezone']
    try:
        t = parser.isoparse(s).astimezone(tz=target_tz)
        return pd.to_datetime(t)
    except (ValueError, pd.errors.OutOfBoundsDatetime):
        # The sentinel must be timezone-aware like every successful result; a naive
        # pd.Timestamp.min would leave the column with mixed aware/naive values.
        return pd.Timestamp.min.tz_localize('UTC').tz_convert(target_tz)


# convert_date_time already returns a Timestamp, so no second pd.to_datetime pass is needed.
for n in date_time_columns['columns']:
    flowcell_table[n] = flowcell_table[n].apply(convert_date_time)

# PLAY

In [None]:
# The hgsvc2_* names this cell used are not defined anywhere in the notebook, so it failed
# with NameError on a fresh kernel. It now uses the workspace constants and table loaded above.
# NOTE(review): assumes flowcell_table has a 'ccs-flowcell' column holding the entity names — confirm.
verify_before_submit(primary_namespace, primary_workspace,
                     workflow_name='Dummy',
                     etype='ccs-flowcell', enames=flowcell_table['ccs-flowcell'][0:4].tolist(),
                     expression='this.ccs-flowcells',
                     use_callcache=True)

In [10]:
# Fetch the 'Dummy' configuration; primary_namespace is passed both as the first and the third argument.
response = fapi.get_workspace_config(
    primary_namespace,
    primary_workspace,
    primary_namespace,
    'Dummy',
)
response.ok

True

In [11]:
response.json()

{'deleted': False,
 'inputs': {'Dummy.bai': 'this.aligned_bai', 'Dummy.bam': 'this.aligned_bam'},
 'methodConfigVersion': 3,
 'methodRepoMethod': {'methodUri': 'dockstore://github.com%2Fbroadinstitute%2Flong-read-pipelines%2FDummy/sh_dummy',
  'sourceRepo': 'dockstore',
  'methodPath': 'github.com/broadinstitute/long-read-pipelines/Dummy',
  'methodVersion': 'sh_dummy'},
 'name': 'Dummy',
 'namespace': 'broad-firecloud-dsde-methods',
 'outputs': {},
 'prerequisites': {},
 'rootEntityType': 'ccs-flowcell'}

In [14]:
pprint.pp(response.json())

{'deleted': False,
 'inputs': {'Dummy.bai': 'this.aligned_bai', 'Dummy.bam': 'this.aligned_bam'},
 'methodConfigVersion': 3,
 'methodRepoMethod': {'methodUri': 'dockstore://github.com%2Fbroadinstitute%2Flong-read-pipelines%2FDummy/sh_dummy',
                      'sourceRepo': 'dockstore',
                      'methodPath': 'github.com/broadinstitute/long-read-pipelines/Dummy',
                      'methodVersion': 'sh_dummy'},
 'name': 'Dummy',
 'namespace': 'broad-firecloud-dsde-methods',
 'outputs': {},
 'prerequisites': {},
 'rootEntityType': 'ccs-flowcell'}


In [15]:
import copy

In [16]:
to_be_updated = copy.deepcopy(response.json())

In [17]:
to_be_updated['rootEntityType'] = 'clr-flowcell'

In [18]:
pprint.pp(to_be_updated)

{'deleted': False,
 'inputs': {'Dummy.bai': 'this.aligned_bai', 'Dummy.bam': 'this.aligned_bam'},
 'methodConfigVersion': 3,
 'methodRepoMethod': {'methodUri': 'dockstore://github.com%2Fbroadinstitute%2Flong-read-pipelines%2FDummy/sh_dummy',
                      'sourceRepo': 'dockstore',
                      'methodPath': 'github.com/broadinstitute/long-read-pipelines/Dummy',
                      'methodVersion': 'sh_dummy'},
 'name': 'Dummy',
 'namespace': 'broad-firecloud-dsde-methods',
 'outputs': {},
 'prerequisites': {},
 'rootEntityType': 'clr-flowcell'}


In [19]:
import json

In [24]:
to_be_updated.pop('deleted')
to_be_updated.pop('methodConfigVersion')

3

In [33]:
to_be_updated['methodConfigVersion'] = 4
to_be_updated['deleted'] = False

In [34]:
# Send the edited configuration back under the same name and show whether the request succeeded.
response = fapi.update_workspace_config(
    primary_namespace,
    primary_workspace,
    primary_namespace,
    configname='Dummy',
    body=to_be_updated,
)
response.ok

True

In [32]:
response.json()

{'400 Bad Request': "The request content was malformed:\nObject is missing required member 'deleted'",
 'timestamp': 1641525394720}

In [27]:
type(response.json())

dict

In [63]:
from src.terra.submission.submission_utils import *

In [70]:
# primary_namespace = 'production-long-reads'
# primary_workspace = 'broad-gp-pacbio'
# root_data_type = 'sample'
# Ask which of these three 'sample' entities are reported by analyzable_entities for 'PBFlowcell'.
analyzable_entities(
    'production-long-reads',
    'broad-gp-pacbio',
    'PBFlowcell',
    'sample',
    [
        '3894997c-6eab-497f-a8a9-0417ce985e6a',
        '66200c03-2e0c-4e45-a037-855e48a204a7',
        'a7013a6d-c784-4d52-b339-be42e28c7f4b',
    ],
)

['a7013a6d-c784-4d52-b339-be42e28c7f4b']

In [39]:
# Look up a single submission by its id and show whether the request succeeded.
response = fapi.get_submission(
    'production-long-reads',
    'broad-gp-pacbio',
    '5a9e1602-2652-40cf-9adb-08e5882a0bad',
)
response.ok

True

In [41]:
pprint.pp(response.json())

{'cost': 0.0,
 'deleteIntermediateOutputFiles': False,
 'memoryRetryMultiplier': 1.0,
 'methodConfigurationName': 'PBFlowcell',
 'methodConfigurationNamespace': 'production-long-reads',
 'status': 'Submitted',
 'submissionDate': '2022-01-07T08:00:28.933Z',
 'submissionEntity': {'entityType': 'sample',
                      'entityName': '66200c03-2e0c-4e45-a037-855e48a204a7'},
 'submissionId': '5a9e1602-2652-40cf-9adb-08e5882a0bad',
 'submitter': 'kiran@broadinstitute.org',
 'useCallCache': True,
 'useReferenceDisks': False,
 'workflows': [{'cost': 0.0,
                'inputResolutions': [{'inputName': 'PBFlowcell.LB',
                                      'value': 'SM-LNN16'},
                                     {'inputName': 'PBFlowcell.SM',
                                      'value': '1-01335'},
                                     {'inputName': 'PBFlowcell.bam',
                                      'value': 'gs://broad-gp-pacbio/r64271e_20220103_194247/2_B01/m64271e_220105_06

In [44]:
# List all submissions of the workspace; raise on a non-OK HTTP response.
# NOTE(review): FireCloudServerError is not imported explicitly in this notebook —
# presumably it arrives through one of the star imports; verify.
response = fapi.list_submissions('production-long-reads', 'broad-gp-pacbio')
if not response.ok:
    raise FireCloudServerError(response.status_code, response.text)

jobs = response.json()

# Keep only 'PBFlowcell' submissions that were launched on a 'sample' entity.
filtered_down_jobs = list(filter(
    lambda job: job['methodConfigurationName'] == 'PBFlowcell'
    and job['submissionEntity']['entityType'] == 'sample',
    jobs,
))

In [45]:
filtered_down_jobs

[{'deleteIntermediateOutputFiles': False,
  'methodConfigurationDeleted': False,
  'methodConfigurationName': 'PBFlowcell',
  'methodConfigurationNamespace': 'production-long-reads',
  'status': 'Aborted',
  'submissionDate': '2021-12-29T02:08:18.112Z',
  'submissionEntity': {'entityType': 'sample',
   'entityName': 'e5f4e00c-29b6-488d-a02c-1d3a179f8ea6'},
  'submissionId': '00d3f478-4995-469c-9fe8-17d3c7ef488a',
  'submitter': 'shuang@broadinstitute.org',
  'useCallCache': True,
  'workflowStatuses': {'Aborted': 1}},
 {'deleteIntermediateOutputFiles': False,
  'methodConfigurationDeleted': False,
  'methodConfigurationName': 'PBFlowcell',
  'methodConfigurationNamespace': 'production-long-reads',
  'status': 'Done',
  'submissionDate': '2022-01-03T06:52:21.251Z',
  'submissionEntity': {'entityType': 'sample',
   'entityName': '3d6db72c-33c6-44b2-a5fc-1a1fd6924fef'},
  'submissionId': '033a9ebb-d7d7-4064-9bfa-527e19221f26',
  'submitter': 'kiran@broadinstitute.org',
  'useCallCache': T

In [54]:
def __no_success_analysis(submission_metadata: dict) -> bool:
    """Tell whether a submission is known to have ended without a successful workflow.

    :param submission_metadata: one submission record; must carry 'status' and, for the
        'Submitted' and 'Done' statuses, a 'workflowStatuses' mapping keyed by workflow status.
    :return: True when the submission is 'Submitted' with a 'Failed' workflow and none 'Running',
        or is 'Done' without any 'Succeeded' workflow. False in every other case.
    :raises KeyError: a required key is missing from the record.
    """
    status = submission_metadata['status']

    if status == 'Submitted':
        workflow_statuses = submission_metadata['workflowStatuses']
        if 'Running' in workflow_statuses:
            return False
        if 'Failed' in workflow_statuses:
            return True
        # Neither running nor failed: previously an implicit None, now an explicit falsy bool.
        return False

    if status == 'Done':
        return 'Succeeded' not in submission_metadata['workflowStatuses']

    # Any other status also used to fall through to None. Returning False keeps the
    # same truthiness while honouring the declared bool return type.
    # NOTE(review): confirm False is the intended answer for statuses such as 'Aborted'.
    return False

In [None]:
['a7013a6d-c784-4d52-b339-be42e28c7f4b',
 '66200c03-2e0c-4e45-a037-855e48a204a7',
 '3894997c-6eab-497f-a8a9-0417ce985e6a']

In [60]:
# Print the verdict for every filtered submission that ran on this one entity.
target_entity = '3894997c-6eab-497f-a8a9-0417ce985e6a'
for job in filtered_down_jobs:
    if job['submissionEntity']['entityName'] != target_entity:
        continue
    print(__no_success_analysis(job))

False
