In [1]:
import os
import matplotlib.pyplot as plt
import sys
sys.path.insert(0, os.path.abspath(os.path.join(os.getcwd(), '../src')))

import asyncio
import nest_asyncio
nest_asyncio.apply()

from pubmetric.network import create_citation_network 
from pubmetric.workflow import *
from pubmetric.metrics import *
from pubmetric.pckg_dev import generate_random_workflow

##### Loading graph

In [2]:
path_to_data = '../out_20240801231111' 
loaded_graph = asyncio.run(create_citation_network(inpath=path_to_data, load_graph=True))

2024-08-06 13:48:17 - Graph loaded from ../out_20240801231111


##### Testing that loading one Workflomics produced CWL workflow works
These have urls to containers, which is needed for the cwl_utils.parser function load_document_by_uri to work 

In [3]:
cwl_file_path = "../workflows/workflomics/candidate_workflow_23.cwl" # loading one of the APE generated workflows 
workflow = parse_cwl_workflows(loaded_graph, cwl_file_path)

Generating a random workflow of the same structure. Each radnom tool is picked from a pool of tools with a siilar degree (using a range of +-50)

In [4]:
random_workflow = generate_random_workflow(graph=loaded_graph, workflow=workflow)

Checking that they look the same 

In [5]:
print(json.dumps(workflow, indent=4)) # 
print(json.dumps(random_workflow, indent=4)) 

{
    "edges": [
        [
            "XTandem_01",
            "ProteinProphet_02"
        ],
        [
            "ProteinProphet_02",
            "StPeter_04"
        ],
        [
            "XTandem_03",
            "StPeter_04"
        ]
    ],
    "steps": {
        "ProteinProphet_02": "14632076",
        "StPeter_04": "29400476",
        "XTandem_01": "14976030",
        "XTandem_03": "14976030"
    },
    "pmid_edges": [
        [
            "14976030",
            "14632076"
        ],
        [
            "14632076",
            "29400476"
        ],
        [
            "14976030",
            "29400476"
        ]
    ]
}
{
    "edges": [
        [
            "PeptideProphet_01",
            "ProteinProphet_02"
        ],
        [
            "ProteinProphet_02",
            "ProteinInfer_04"
        ],
        [
            "PeptideProphet_03",
            "ProteinInfer_04"
        ]
    ],
    "steps": {
        "PeptideProphet_01": "12403597",
        "ProteinPro

##### Test that loading one APE generated workflow (without uri) works

In [6]:
cwl_file_path = "../workflows/APE/candidate_workflow_23.cwl" # loading one of the APE generated workflows 
undoc_workflow = parse_undocumented_workflows(loaded_graph, cwl_file_path)
print(json.dumps(undoc_workflow, indent=4)) 

{
    "edges": [
        [
            "CrosstalkDB_01",
            "CrosstalkDB_02"
        ],
        [
            "CrosstalkDB_02",
            "MSiReader_03"
        ],
        [
            "MSiReader_03",
            "ComPIL_04"
        ],
        [
            "ComPIL_04",
            "isobar_05"
        ]
    ],
    "steps": {
        "CrosstalkDB_01": "24741113",
        "CrosstalkDB_02": "24741113",
        "MSiReader_03": "23536269",
        "ComPIL_04": "30525664",
        "isobar_05": "21526793"
    },
    "pmid_edges": [
        [
            "24741113",
            "24741113"
        ],
        [
            "24741113",
            "23536269"
        ],
        [
            "23536269",
            "30525664"
        ],
        [
            "30525664",
            "21526793"
        ]
    ]
}


#### Generating and saving a random dataset

The random dataset will be based on 1000 APE generated workflows in the proteomics domain. Out of these 1000 workflows, only the ones with at most one undefined pmid in the graph will be used. For each of these a randomly generated conterpart will be generated. 

In [24]:
# Because of a problem with the naming in APE the 8 and 9s are skipped, thus the id of the 1000th workflow is 1750. 
random_dataset = []
ape_tools = []
random_tools = []

ape_edges = []
random_edges = []

ape_worfklows = []

for i in range(1,1751):
    if '8' in str(i) or '9' in str(i):
        continue
    cwl_file_path = f"../workflows/APE/candidate_workflow_{i}.cwl" #
    workflow = parse_undocumented_workflows(loaded_graph, cwl_file_path)

    if workflow['edges'] in ape_worfklows:
        print('Repetition')
        continue
    else:
        ape_worfklows.append(workflow['edges'])

    workflow_pmids = workflow['steps'].values()
    if len([pmid for pmid in workflow_pmids if not pmid]) <=1:
        random_workflow = generate_random_workflow(graph=loaded_graph, workflow=workflow)
        random_workflow_pmids = random_workflow['steps'].values()

        ape_tools += workflow_pmids
        random_tools += random_workflow_pmids
        ape_edges += [(edge[0], edge[1]) for edge in workflow['edges']]
        random_edges += [(edge[0], edge[1]) for edge in random_workflow['edges']]

        

        # To save the dataset
        random_dataset.append( {
            'APE_workflow': workflow,
            'random_workflow': random_workflow
        })


Repetition
Repetition
Repetition
Repetition


In [25]:
unique_ape_tools = np.unique([t for t in ape_tools if t])
unique_random_tools = np.unique([t for t in random_tools if t])

unique_ape_edges = np.unique([t for t in ape_edges if t])
unique_random_edges = np.unique([t for t in random_edges if t])

In [26]:
print(len(random_dataset))

669


Saving the dataset

In [27]:
with open("../data/random_dataset.json", 'w') as f:
    json.dump(random_dataset, f)

Stats on the random data set

In [28]:
print(len(unique_random_tools))
print(len(unique_ape_tools))
print(len(unique_random_edges))
print(len(unique_ape_edges))

41
96
146
286
