### Removing data from Terra.bio

#### Library loading and function definition

In [None]:
### autoreload ### 
%load_ext autoreload
%autoreload 2

In [2]:
### Load necessary libraries ###
from genepy import terra
import dalmatian as dm
import pandas as pd

In [4]:
# removing things from old failed workflows
def removeFromFailedWorkflows(
    workspaceid: str,
    maxtime: str = "2020-06-10",
    force_remove: list = None,
    dryrun: bool = True,
) -> None:
    """
    Deletes the files of failed/aborted workflows of a Terra workspace.

    Walks over every submission of the workspace and, for each workflow whose
    status is "Failed" or "Aborted" (or for every workflow of a submission
    listed in `force_remove`), calls `terra.deleteJob` on it.

    Can be very long

    parameters
    ----------
    workspaceid: str
        str the workspace name

    maxtime: str
        str date format (eg. 2020-06-10). Only submissions submitted strictly
        after this date are processed; submissions on or before it are left
        untouched.

    force_remove: list
        list[str of submissionId] removes every workflow of these submissions
        even if not failed. Defaults to no forced submission.

    dryrun: bool
        bool forwarded to `terra.deleteJob`; True (the default) is meant to
        only print the commands instead of executing them

    Returns
    -------
    None
    """
    # None instead of a mutable `[]` default; a set gives O(1) membership tests.
    forced = set(force_remove or ())
    # Loop-invariant: parse the cutoff date only once.
    cutoff = pd.to_datetime(maxtime)

    wm = dm.WorkspaceManager(workspaceid)
    for submission in wm.list_submissions():
        submission_id = submission["submissionId"]
        is_forced = submission_id in forced

        # presumably `workflowStatuses` maps a status name to a workflow count;
        # a submission with no counted workflow has nothing to clean unless it
        # is forced. Looking at all values (rather than only the first key)
        # also avoids an IndexError when the mapping is empty.
        statuses = submission.get("workflowStatuses") or {}
        has_workflows = any(count > 0 for count in statuses.values())
        if not (has_workflows or is_forced):
            continue

        # Drop the timezone so the date can be compared with the naive cutoff.
        submitted = pd.to_datetime(submission["submissionDate"]).tz_localize(None)
        if not submitted > cutoff:
            continue

        for workflow in wm.get_submission(submission_id)["workflows"]:
            status = workflow["status"]
            if status not in ("Failed", "Aborted") and not is_forced:
                continue
            print(status, submission_id)
            try:
                terra.deleteJob(
                    workspaceid,
                    "submissions/" + submission_id,
                    workflow["workflowId"],
                    dryrun=dryrun,
                )
            # best effort: a failure on one workflow must not stop the cleanup
            # (e.g. the workflow was not even run)
            except Exception as e:
                # Nothing left to delete for this workflow: not worth reporting.
                # Substring match so small changes in how the error is wrapped
                # do not defeat the check.
                if "One or more URLs matched no objects" in str(e):
                    continue
                print(e)


#### Workspace cleaning

In [5]:
# Terra workspace to clean, written as "<namespace>/<workspace name>"
TERRA_WS = "whitelabgx/bulkRNAseq"

In [6]:
# Build the list of submission ids to purge: every submission of the workspace
# except the ones explicitly listed as "keep".
submission_ids_all = [
    entry['submissionId']
    for entry in dm.WorkspaceManager(TERRA_WS).list_submissions()
]
print(len(submission_ids_all))

submission_ids_keep = [
    '0a72fe8e-f3e8-4dba-8cd6-5aefe5b1b5d3',
    '5f2e2227-78e0-46ff-9682-d8138eff8a89',
]
print(len(submission_ids_keep))

submission_ids_remove = list(
    filter(lambda sid: sid not in submission_ids_keep, submission_ids_all)
)
print(len(submission_ids_remove))
submission_ids_remove

21
2
19


['210d963d-c404-40a4-afa5-7ff21b2170be',
 '2386958e-281c-4b7e-9355-9a049478121e',
 '44f31035-0bf5-4351-9b09-fc3e1a7b7a4b',
 '481f0380-a539-49a5-b9c0-ed7a9e78c03a',
 '4a3d0d4a-748c-42bb-8435-03e9b5686c39',
 '4b9becc2-4ed1-453d-b1e9-6689c6c72287',
 '5b4a9fd3-560f-440e-bce5-1acdb19fa0cf',
 '5f885c77-56ff-401f-94fe-7d0086ae25a4',
 '612a8eda-7cfe-4199-9432-85f089cbb083',
 '77f3841b-0a77-4779-9a19-a3d7ef6bbb04',
 '864cab14-03ef-4794-951f-5f9c755ab88b',
 '8bdaaf79-e451-4c4c-a065-371b8f2bbf1b',
 '8d0c8eb1-9d99-4740-b5a6-02dec144f081',
 '9e5a1273-32bb-4a5c-8cd7-e6c3a4d0210d',
 'a0c11535-31a6-4d55-a611-1b3b101c253f',
 'c44e1b81-23fc-402a-966e-26304bd034a6',
 'e4dc2e4f-50e8-4d3f-bd54-3fecee106335',
 'ea8ce43f-c84e-4af2-8a57-244131c2cd00',
 'fbb20465-64e4-4048-8dfe-5f01162f317f']

In [20]:
# Remove the jobs from the workspace (dryrun=False: deletion is requested for real).
# For a preview, make the same call without `dryrun` (it defaults to True):
#   removeFromFailedWorkflows(TERRA_WS, maxtime="2023-04-19", force_remove=submission_ids_remove)
removeFromFailedWorkflows(
    TERRA_WS,
    maxtime="2023-04-19",
    force_remove=submission_ids_remove,
    dryrun=False,
)

Aborted 210d963d-c404-40a4-afa5-7ff21b2170be
{'gs://fc-fe01b45c-fa1e-40e1-aa9c-01f059d7ad33/submissions/210d963d-c404-40a4-afa5-7ff21b2170be/RNA_preprocessing_pipeline/0e516e65-da36-4b08-af0c-cd61158b8388/call-star/star_workflow/1fc78865-7c02-4835-a4fa-03dee3fc7036/call-star/cacheCopy/star_out/RSeq-01-001.Aligned.sortedByCoord.out.bam', 'gs://fc-fe01b45c-fa1e-40e1-aa9c-01f059d7ad33/submissions/210d963d-c404-40a4-afa5-7ff21b2170be/RNA_preprocessing_pipeline/0e516e65-da36-4b08-af0c-cd61158b8388/call-star/star_workflow/1fc78865-7c02-4835-a4fa-03dee3fc7036/call-star/cacheCopy/star_out/RSeq-01-001.SJ.out.tab.gz', 'gs://fc-fe01b45c-fa1e-40e1-aa9c-01f059d7ad33/submissions/210d963d-c404-40a4-afa5-7ff21b2170be/RNA_preprocessing_pipeline/0e516e65-da36-4b08-af0c-cd61158b8388/call-star/star_workflow/1fc78865-7c02-4835-a4fa-03dee3fc7036/call-star/cacheCopy/rc', 'gs://fc-fe01b45c-fa1e-40e1-aa9c-01f059d7ad33/submissions/210d963d-c404-40a4-afa5-7ff21b2170be/RNA_preprocessing_pipeline/0e516e65-da36-4b0