In [30]:
# More information on:
# https://github.com/ResearchObject/ro-crate-py; https://about.workflowhub.eu/Workflow-RO-Crate/

# Import modules
from rocrate.rocrate import ROCrate
from rocrate.model.file import File
from rocrate.model.computationalworkflow import ComputationalWorkflow
from rocrate.model.computerlanguage import ComputerLanguage
from rocrate.model.person import Person
from rocrate.model.contextentity import ContextEntity
from rocrate.model.data_entity import DataEntity

# Initialize the RO-Crate object that the entities defined below are added to
crate = ROCrate()

In [31]:
# Get the current date; it is written as the "dateCreated" of the main workflow

from datetime import date
today = date.today()

## Main Workflow file

In [32]:
# Register the top-level Nextflow script (action.nf) in the crate as a
# ComputationalWorkflow entity. "dateCreated" is today's date in ISO format.

MainWorkflow = crate.add(
    ComputationalWorkflow(
        crate,
        "action.nf",
        properties={
            "@id": "action.nf",
            "@type": ["File", "SoftwareSourceCode", "ComputationalWorkflow"],
            "name": "NTR-ACTION Data-analysis workflow",
            "dateCreated": today.isoformat(),
            "input": "",
            "description": "Use of multi-omics data (Metabolomics + DNA Methylation) to study CBCL data",
            "output": "",
            "license": "https://opensource.org/licenses/MIT",
            "url": "https://github.com/Xomics/ACTIONdemonstrator_workflow",
            "version": "1.0.0",
        },
    )
)

## Contextual information

In [33]:
# Describe Nextflow as a ComputerLanguage context entity.
# Its @id is the Workflow RO-Crate identifier for Nextflow.

nextflow_id = "https://w3id.org/workflowhub/workflow-ro-crate#nextflow"

Nextflow = crate.add(
    ComputerLanguage(
        crate,
        nextflow_id,
        properties={
            "@id": nextflow_id,
            "@type": "ComputerLanguage",
            "name": "Nextflow",
            # identifier and url both point at the Nextflow home page
            "identifier": {"@id": "https://www.nextflow.io/"},
            "url": {"@id": "https://www.nextflow.io/"},
        },
    )
)

In [34]:
# Define persons (authors)

# ORCID iDs; each one is used as the identifier of the matching Person entity
Anna_Niehues_id = "https://orcid.org/0000-0002-9839-5439"
Casper_de_Visser_id = "https://orcid.org/0000-0002-2812-5898"
Fiona_Hagenbeek_id = "https://orcid.org/0000-0002-8773-0430"
Naama_Karu_id = "https://orcid.org/0000-0001-8005-0726"
Alida_Kindt_id = "https://orcid.org/0000-0001-6551-6030"
Purva_Kulkarni_id = "https://orcid.org/0000-0002-4681-4582"
Rene_Pool_id = "https://orcid.org/0000-0001-5579-0933"
Dorret_Boomsma_id = "https://orcid.org/0000-0002-7099-7972"
Jenny_van_Dongen_id = "https://orcid.org/0000-0003-2063-8741"
Alain_van_Gool_id = "https://orcid.org/0000-0003-0010-5286"
PeterBram_t_Hoen_id = "https://orcid.org/0000-0003-4450-3112"

# Affiliation names shared by several authors
_RADBOUDUMC = "Radboud university medical center"
_VU_AMSTERDAM = "Vrije Universiteit Amsterdam"
_LEIDEN_UNIVERSITY = "Leiden University"


def _add_person(orcid_id, name, affiliation):
    """Create a Person entity identified by ``orcid_id`` and add it to the crate.

    Args:
        orcid_id: ORCID URL passed to ``Person`` as the entity identifier.
        name: Value stored under the "name" property.
        affiliation: Value stored under the "affiliation" property.

    Returns:
        The value returned by ``crate.add`` for the new entity.
    """
    return crate.add(Person(crate, orcid_id, properties={
        "name": name,
        "affiliation": affiliation
    }))


Anna_Niehues = _add_person(Anna_Niehues_id, "Anna Niehues", _RADBOUDUMC)
Casper_de_Visser = _add_person(Casper_de_Visser_id, "Casper de Visser", _RADBOUDUMC)
# Fix: the ORCID identifier was previously not passed for this author
Fiona_Hagenbeek = _add_person(Fiona_Hagenbeek_id, "Fiona A. Hagenbeek", _VU_AMSTERDAM)
Naama_Karu = _add_person(Naama_Karu_id, "Naama Karu", _LEIDEN_UNIVERSITY)
Alida_Kindt = _add_person(Alida_Kindt_id, "Alida S.D. Kindt", _LEIDEN_UNIVERSITY)
Purva_Kulkarni = _add_person(Purva_Kulkarni_id, "Purva Kulkarni", _RADBOUDUMC)
Rene_Pool = _add_person(Rene_Pool_id, "René Pool", _VU_AMSTERDAM)
Dorret_Boomsma = _add_person(Dorret_Boomsma_id, "Dorret I. Boomsma", _VU_AMSTERDAM)
Jenny_van_Dongen = _add_person(Jenny_van_Dongen_id, "Jenny van Dongen", _VU_AMSTERDAM)
Alain_van_Gool = _add_person(Alain_van_Gool_id, "Alain J. van Gool", _RADBOUDUMC)
PeterBram_t_Hoen = _add_person(PeterBram_t_Hoen_id, "Peter A.C. 't Hoen", _RADBOUDUMC)

In [35]:
# Define the X-omics organization as a context entity of type Organization.
# TODO: Can another url identifier be used here?

x_omics_id = "https://x-omics.nl/"

x_omics = crate.add(ContextEntity(crate, x_omics_id, properties={
    "@type": "Organization",
    # Fix: spelling of "initiative" in the emitted metadata
    "name": "The Netherlands X-omics initiative",
    "url": x_omics_id
}))

## ISA files

In [36]:
# Define ISA files

# Define data entity IDs
investigation_id = "Synthetic_data/i_investigation.txt"
study_id = "Synthetic_data/s_study.txt"
assay_epigenomics_id = "Synthetic_data/a_assay_methylation.txt"
assay_amines_id = "Synthetic_data/a_assay_metabolomics_amines.txt"
assay_oa_id = "Synthetic_data/a_assay_metabolomics_OA.txt"
assay_steroids_id = "Synthetic_data/a_assay_metabolomics_steroids.txt"

# Ontology terms describing the role of each ISA file
_ISA_INVESTIGATION = "http://purl.obolibrary.org/obo/OBI_0000066"  # investigation
_ISA_STUDY = "http://purl.obolibrary.org/obo/NCIT_C63536"  # study
_ISA_ASSAY = "http://purl.obolibrary.org/obo/OBI_0000070"  # assay


def _add_isa_file(file_id, name, isa_type_iri):
    """Add one ISA-Tab file to the crate.

    Args:
        file_id: Relative path of the file; used as source, dest_path and @id.
        name: Value stored under the "name" property.
        isa_type_iri: Ontology IRI for the specific ISA role of the file.

    Returns:
        The value returned by ``crate.add`` for the new entity.
    """
    return crate.add(File(crate, file_id, dest_path=file_id, properties={
        "@id": file_id,
        "@type": "FormalParameter",
        "name": name,
        #"valueRequired": true,
        # Fix: a dict literal keeps only the last value of a repeated "@id"
        # key, so multiple terms must be a list of separate references.
        "additionalType": [
            {"@id": "http://purl.obolibrary.org/obo/NCIT_C165216"},  # Experiment Metadata
            {"@id": isa_type_iri},
        ],
        "format": {"@id": "http://edamontology.org/format_3687"}  # ISA-TAB
    }))


investigation = _add_isa_file(investigation_id, "Investigation file", _ISA_INVESTIGATION)
study = _add_isa_file(study_id, "Study file", _ISA_STUDY)
assay_epigenomics = _add_isa_file(assay_epigenomics_id, "Assay epigenomics file", _ISA_ASSAY)
assay_amines = _add_isa_file(assay_amines_id, "Assay amines file", _ISA_ASSAY)
assay_oa = _add_isa_file(assay_oa_id, "Assay organic acids file", _ISA_ASSAY)
assay_steroids = _add_isa_file(assay_steroids_id, "Assay steroids file", _ISA_ASSAY)

## Sub-workflows with input/output files

### Analyze missing values

In [37]:
# Define entity IDs
missing_data_heatmap_id = "modules/heatmap_missingness.nf"
# Separate name for the id string, so that `missing_data_script` only ever
# refers to the entity created below.
missing_data_script_id = "bin/heatmap_missingness.R"

# Ontology terms shared by the module and its script. Kept as plain strings so
# that each entity gets its own reference dicts.
_missing_data_types = (
    "http://purl.obolibrary.org/obo/NCIT_C142610",  # Missing Data
    "http://semanticscience.org/resource/SIO_000449",  # plot
)


# Heatmap missing data points
missing_data_heatmap = crate.add(ComputationalWorkflow(crate, missing_data_heatmap_id, dest_path=missing_data_heatmap_id, properties={
    "@id": missing_data_heatmap_id,
    "@type": ["File", "SoftwareSourceCode", "ComputationalWorkflow"],
    "name": "Heatmap missingness",
    # Fix: a dict literal keeps only the last value of a repeated "@id" key,
    # so the four inputs are listed as separate references.
    "input": [
        {"@id": "epigenomics_values"},
        {"@id": "metabolomics_values"},
        {"@id": "behavioral_data"},
        {"@id": "phenotype_covariates"},
    ],
    #"output": {} ,
    "hasPart": [
        {"@id": missing_data_script_id},
    ],
    "license": "https://opensource.org/licenses/MIT",
    "additionalType": [{"@id": type_iri} for type_iri in _missing_data_types]
}))


missing_data_script = crate.add(File(crate, missing_data_script_id, dest_path=missing_data_script_id, properties={
    "@id": missing_data_script_id,
    "@type": ["File", "SoftwareSourceCode"],
    "name": "Heatmap NA values script",
    "programmingLanguage": {"@id": "http://edamontology.org/format_3999"}, #Rscript
    "additionalType": [{"@id": type_iri} for type_iri in _missing_data_types]
}))

### Epigenomics pre-processing

In [38]:
# Define entity IDs
epi_preprocessing_id = "modules/epigenetics_preprocessing.nf"
epi_annotation_script_id = "bin/epigenomics_annotation.R"
epi_filtering_script_id = "bin/epigenomics_filtering.R"
epi_imputation_script_id = "bin/epigenomics_imputation.R"
epi_covariates_correction_script_id = "bin/CovariateCorrection.R"
epi_subset_features_script_id = "bin/sort_cols_sd.R"
epi_scaling_script_id = "bin/epigenomics_scaling.R"


# Define epigenomics pre-processing sub-workflow.
# "hasPart" lists the scripts that are registered as File entities below.
epi_preprocessing = crate.add(ComputationalWorkflow(crate, epi_preprocessing_id, dest_path=epi_preprocessing_id, properties={
    "@id": epi_preprocessing_id,
    "@type": ["File", "SoftwareSourceCode", "ComputationalWorkflow"],
    "name": "Epigenetics preprocessing",
    #"input": {"@id": "epigenomics_values"}, #Add later
    "output": {"@id": "epigenomics_preprocessed_data"},
    "hasPart": [
        {"@id": epi_annotation_script_id},
        {"@id": epi_filtering_script_id},
        {"@id": epi_imputation_script_id},
        {"@id": epi_covariates_correction_script_id},
        {"@id": epi_subset_features_script_id},
        {"@id": epi_scaling_script_id}
    ],
    "license": "https://opensource.org/licenses/MIT",
    # Fix: a dict literal keeps only the last value of a repeated "@id" key
    # (here only "data transformation" survived), so the terms are given as
    # a list of separate references.
    "additionalType": [
        {"@id": "http://edamontology.org/operation_0226"},  # annotation
        {"@id": "http://purl.obolibrary.org/obo/MS_1001486"},  # filtering
        {"@id": "http://edamontology.org/operation_3557"},  # imputation
        {"@id": "http://purl.obolibrary.org/obo/OBI_0200185"},  # scaling
        {"@id": "http://semanticscience.org/resource/SIO_000594"},  # data transformation
    ]
}))


# Scripts: one File entity per R script, each tagged with the operation it performs
epi_annotation_script = crate.add(File(crate, epi_annotation_script_id, dest_path=epi_annotation_script_id, properties={
    "@id": epi_annotation_script_id,
    "@type": ["File", "SoftwareSourceCode"],
    "name": "Epigenomics annotation script",
    "programmingLanguage": {"@id": "http://edamontology.org/format_3999"}, #Rscript
    "additionalType": {"@id": "http://edamontology.org/operation_0226"} #annotation
}))

epi_filtering_script = crate.add(File(crate, epi_filtering_script_id, dest_path=epi_filtering_script_id, properties={
    "@id": epi_filtering_script_id,
    "@type": ["File", "SoftwareSourceCode"],
    "name": "Epigenomics filtering script",
    "programmingLanguage": {"@id": "http://edamontology.org/format_3999"}, #Rscript
    "additionalType": {"@id": "http://purl.obolibrary.org/obo/MS_1001486"} #filtering
}))

epi_imputation_script = crate.add(File(crate, epi_imputation_script_id, dest_path=epi_imputation_script_id, properties={
    "@id": epi_imputation_script_id,
    "@type": ["File", "SoftwareSourceCode"],
    "name": "Epigenomics imputation script",
    "programmingLanguage": {"@id": "http://edamontology.org/format_3999"}, #Rscript
    "additionalType": {"@id": "http://edamontology.org/operation_3557"} #imputation
}))

epi_covariates_correction_script = crate.add(File(crate, epi_covariates_correction_script_id, dest_path=epi_covariates_correction_script_id, properties={
    "@id": epi_covariates_correction_script_id,
    "@type": ["File", "SoftwareSourceCode"],
    "name": "Epigenomics covariates correction script",
    "programmingLanguage": {"@id": "http://edamontology.org/format_3999"}, #Rscript
    "additionalType": {"@id": "http://semanticscience.org/resource/SIO_000594"} #data transformation #TODO find more specific term
}))

epi_subset_features_script = crate.add(File(crate, epi_subset_features_script_id, dest_path=epi_subset_features_script_id, properties={
    "@id": epi_subset_features_script_id,
    "@type": ["File", "SoftwareSourceCode"],
    "name": "Epigenomics subset features script",
    "programmingLanguage": {"@id": "http://edamontology.org/format_3999"}, #Rscript
    "additionalType": {"@id": "http://semanticscience.org/resource/SIO_000594"} #data transformation #TODO find more specific term
}))

epi_scaling_script = crate.add(File(crate, epi_scaling_script_id, dest_path=epi_scaling_script_id, properties={
    "@id": epi_scaling_script_id,
    "@type": ["File", "SoftwareSourceCode"],
    "name": "Epigenomics scaling script",
    "programmingLanguage": {"@id": "http://edamontology.org/format_3999"}, #Rscript
    "additionalType": {"@id": "http://purl.obolibrary.org/obo/OBI_0200185"} #scaling
}))

In [39]:
# Define epigenomics data entities

# Define data entity IDs
epigenomics_data_id = "Synthetic_data/synthetic_epigenomics.csv"
epigenomics_meta_id = "Synthetic_data/synthetic_epigenomics_meta.csv"
# Same @id that the sub-workflow definitions use to reference this parameter.
# Used for both the constructor identifier and the "@id" property so the two
# cannot disagree.
epigenomics_preprocessed_id = "epigenomics_preprocessed_data"

epigenomics_data = crate.add(File(crate, epigenomics_data_id, dest_path=epigenomics_data_id, properties={
    "@id": epigenomics_data_id,
    "@type": "FormalParameter",
    "name": "epigenomics_data",
    #"valueRequired": true,
    "additionalType": {"@id": "http://purl.obolibrary.org/obo/NCIT_C153195"}, #epigenome
    "format": {"@id": "http://edamontology.org/format_3752"} #csv
}))

epigenomics_meta = crate.add(File(crate, epigenomics_meta_id, dest_path=epigenomics_meta_id, properties={
    "@id": epigenomics_meta_id,
    "@type": "FormalParameter",
    # Fix: was "epigenomics_data", copied from the entity above
    "name": "epigenomics_meta",
    #"valueRequired": true,
    "additionalType": {"@id": "http://purl.obolibrary.org/obo/NCIT_C52095"}, #metadata
    "format": {"@id": "http://edamontology.org/format_3752"} #csv
}))

epigenomics_preprocessed = crate.add(DataEntity(crate, epigenomics_preprocessed_id, properties={
    "@id": epigenomics_preprocessed_id,
    "@type": "FormalParameter",
    "name": "epigenomics_preprocessed",
    #"valueRequired": true,
    # Fix: a dict literal keeps only the last value of a repeated "@id" key,
    # so both terms are given as separate references.
    "additionalType": [
        {"@id": "http://purl.obolibrary.org/obo/NCIT_C153195"},  # epigenome
        {"@id": "http://www.ebi.ac.uk/efo/EFO_0004096"},  # processed array data file
    ],
    "format": {"@id": "http://edamontology.org/format_3752"} #csv
}))

# Fix: link the inputs as {"@id": ...} references, matching how the other
# workflow inputs/outputs in this notebook are written, instead of bare strings.
epi_preprocessing["input"] = [
    {"@id": epigenomics_data_id},
    {"@id": epigenomics_meta_id},
]

## Check if Methylation EPIC file is present, if not download from url

In [40]:
import os.path
import zipfile
from io import BytesIO
from os import path

import requests

url_methylationEPIC = 'https://webdata.illumina.com/downloads/productfiles/methylationEPIC/infinium-methylationepic-v-1-0-b4-manifest-file-csv.zip'

# Only download when the extracted manifest is not already on disk
if not path.exists("EPIC_annotation/raw/MethylationEPIC_v-1-0_B4.csv"):
    req = requests.get(url_methylationEPIC, timeout=60)
    # Fail here on an HTTP error instead of trying to unzip an error page
    req.raise_for_status()

    # Extract the archive straight from memory into the annotation folder.
    # Fix: the archive gets its own name (the module name `zipfile` was
    # previously rebound to the ZipFile instance) and is closed after use.
    with zipfile.ZipFile(BytesIO(req.content)) as manifest_archive:
        manifest_archive.extractall('EPIC_annotation/raw/')

In [41]:
# EPIC annotation files used for epigenomics data

# Define data entity IDs
EPIC_MOESM1_id = "EPIC_annotation/raw/13059_2016_1066_MOESM1_ESM.csv"
EPIC_MOESM4_id = "EPIC_annotation/raw/13059_2016_1066_MOESM4_ESM.csv"
EPIC_MOESM5_id = "EPIC_annotation/raw/13059_2016_1066_MOESM5_ESM.csv"
Methylation_EPIC_id = "EPIC_annotation/raw/MethylationEPIC_v-1-0_B4.csv"
Annotation_EPIC_id = "EPIC_annotation/anno_epic_072017.RData"

# Fix: every entity in this cell was named "epigenomics_preprocessed"
# (copied from another entity). Each now carries a name derived from its own
# variable, like the other data entities in this notebook.

Epic_MOESM1 = crate.add(File(crate, EPIC_MOESM1_id, dest_path=EPIC_MOESM1_id, properties={
    "@id": EPIC_MOESM1_id,
    "@type": "FormalParameter",
    "name": "EPIC_MOESM1",
    #"valueRequired": true,
    "additionalType": {"@id": "http://purl.obolibrary.org/obo/NCIT_C43523" #probe #TODO: find better term(s)
                      },
    "format": {"@id": "http://edamontology.org/format_3752"} #csv
}))

Epic_MOESM4 = crate.add(File(crate, EPIC_MOESM4_id, dest_path=EPIC_MOESM4_id, properties={
    "@id": EPIC_MOESM4_id,
    "@type": "FormalParameter",
    "name": "EPIC_MOESM4",
    #"valueRequired": true,
    "additionalType": {"@id": "http://purl.obolibrary.org/obo/NCIT_C43523" #probe #TODO: find better term(s)
                      },
    "format": {"@id": "http://edamontology.org/format_3752"} #csv
}))

Epic_MOESM5 = crate.add(File(crate, EPIC_MOESM5_id, dest_path=EPIC_MOESM5_id, properties={
    "@id": EPIC_MOESM5_id,
    "@type": "FormalParameter",
    "name": "EPIC_MOESM5",
    #"valueRequired": true,
    "additionalType": {"@id": "http://purl.obolibrary.org/obo/NCIT_C43523" #probe #TODO: find better term(s)
                      },
    "format": {"@id": "http://edamontology.org/format_3752"} #csv
}))

Methylation_EPIC = crate.add(File(crate, Methylation_EPIC_id, dest_path=Methylation_EPIC_id, properties={
    "@id": Methylation_EPIC_id,
    "@type": "FormalParameter",
    "name": "Methylation_EPIC",
    #"valueRequired": true,
    "additionalType": {"@id": "http://purl.obolibrary.org/obo/OBI_0002131" #Illumina Infinium MethylationEPIC BeadChip
                            #TODO: find better term(s)
                      },
    "format": {"@id": "http://edamontology.org/format_3752"} #csv
}))

Annotation_EPIC = crate.add(File(crate, Annotation_EPIC_id, dest_path=Annotation_EPIC_id, properties={
    "@id": Annotation_EPIC_id,
    "@type": "FormalParameter",
    "name": "Annotation_EPIC",
    #"valueRequired": true,
    "additionalType": { "@id": "http://edamontology.org/operation_0226" #annotation
                      },
    #"format":  #TODO add alternative to .RData
}))


# Fix: attach the annotation files to the annotation script as {"@id": ...}
# references, matching the "hasPart" lists used elsewhere in this notebook,
# instead of bare strings.
epi_annotation_script["hasPart"] = [
    {"@id": annotation_file_id}
    for annotation_file_id in (
        EPIC_MOESM1_id,
        EPIC_MOESM4_id,
        EPIC_MOESM5_id,
        Methylation_EPIC_id,
        Annotation_EPIC_id,
    )
]

### Metabolomics pre-processing

In [42]:
# Define data entity IDs
mtblmcs_preprocessing_id = "modules/metabolomics_preprocessing.nf"
mtblmcs_filtering_script_id = "bin/metabolomics_filter.Rmd"
mtblmcs_normalization_script_id = "bin/metabolomics_normalization.R"
mtblmcs_scaling_script_id = "bin/metabolomics_scaling.R"
mtblmcs_concatenate_script_id = "bin/concatenate_MAF.R"


# Define metabolomics pre-processing sub-workflow.
# "hasPart" lists the scripts that are registered as File entities below.
mtblmcs_preprocessing = crate.add(ComputationalWorkflow(crate, mtblmcs_preprocessing_id, dest_path=mtblmcs_preprocessing_id, properties={
    "@id": mtblmcs_preprocessing_id,
    "@type": ["File", "SoftwareSourceCode", "ComputationalWorkflow"],
    "name": "Metabolomics preprocessing",
    #"input": #Add later
    "output": {"@id": "metabolomics_preprocessed_data"},
    "hasPart": [
        {"@id": mtblmcs_filtering_script_id},
        {"@id": mtblmcs_normalization_script_id},
        {"@id": mtblmcs_scaling_script_id},
        {"@id": mtblmcs_concatenate_script_id}
    ],
    "license": "https://opensource.org/licenses/MIT",
    # Fix: a dict literal keeps only the last value of a repeated "@id" key,
    # so both terms are given as separate references.
    "additionalType": [
        {"@id": "http://purl.obolibrary.org/obo/MS_1001486"},  # filtering
        {"@id": "http://purl.obolibrary.org/obo/OBI_0200169"},  # normalization
    ]
}))


# Define scripts: one File entity per script, tagged with the operation it performs
mtblmcs_filtering_script = crate.add(File(crate, mtblmcs_filtering_script_id, dest_path=mtblmcs_filtering_script_id, properties={
    "@id": mtblmcs_filtering_script_id,
    "@type": ["File", "SoftwareSourceCode"],
    "name": "Metabolomics filtering script",
    "programmingLanguage": {"@id": "http://edamontology.org/format_4000"}, #R markdown
    "additionalType": {"@id": "http://purl.obolibrary.org/obo/MS_1001486"} #filtering
}))


mtblmcs_normalization_script = crate.add(File(crate, mtblmcs_normalization_script_id, dest_path=mtblmcs_normalization_script_id, properties={
    "@id": mtblmcs_normalization_script_id,
    "@type": ["File", "SoftwareSourceCode"],
    "name": "Metabolomics normalization script",
    "programmingLanguage": {"@id": "http://edamontology.org/format_3999"}, #Rscript
    "additionalType": {"@id": "http://purl.obolibrary.org/obo/OBI_0200169"} #normalization data transformation
}))

mtblmcs_scaling_script = crate.add(File(crate, mtblmcs_scaling_script_id, dest_path=mtblmcs_scaling_script_id, properties={
    "@id": mtblmcs_scaling_script_id,
    "@type": ["File", "SoftwareSourceCode"],
    "name": "Metabolomics scaling script",
    "programmingLanguage": {"@id": "http://edamontology.org/format_3999"}, #Rscript
    "additionalType": {"@id": "http://purl.obolibrary.org/obo/OBI_0200037"} #pareto scaling
}))

mtblmcs_concatenate_script = crate.add(File(crate, mtblmcs_concatenate_script_id, dest_path=mtblmcs_concatenate_script_id, properties={
    "@id": mtblmcs_concatenate_script_id,
    "@type": ["File", "SoftwareSourceCode"],
    "name": "Concatenate MAFs script",
    "programmingLanguage": {"@id": "http://edamontology.org/format_3999"}, #Rscript
    "additionalType": {"@id": "http://purl.obolibrary.org/obo/OBI_0002566"} #file merge
}))



In [43]:
# Define metabolomics data entities

metabolomics_data_id = "Synthetic_data/synthetic_metabolomics.csv"
amines_data_id = "Synthetic_data/amines_MAF.tsv"
OA_data_id = "Synthetic_data/OA_MAF.tsv"
steroids_data_id = "Synthetic_data/steroids_MAF.tsv"
# Same @id that the sub-workflow definitions use to reference this parameter.
# Used for both the constructor identifier and the "@id" property so the two
# cannot disagree.
metabolomics_preprocessed_id = "metabolomics_preprocessed_data"

# Fix applied to every "additionalType" below: a dict literal keeps only the
# last value of a repeated "@id" key, so multiple terms are given as a list of
# separate references.

metabolomics_data = crate.add(File(crate, metabolomics_data_id, dest_path=metabolomics_data_id, properties={
    "@id": metabolomics_data_id,
    "@type": "FormalParameter",
    "name": "metabolomics_data",
    #"valueRequired": true,
    "additionalType": [
        {"@id": "http://purl.obolibrary.org/obo/MS_1003084"},  # processed data file (not raw)
        {"@id": "http://purl.obolibrary.org/obo/CHEBI_32952"},  # amine
        {"@id": "http://purl.obolibrary.org/obo/CHEBI_64709"},  # organic acid
        {"@id": "http://purl.obolibrary.org/obo/CHEBI_35341"},  # steroid
    ],
    # NOTE(review): tagged as tsv although the file name ends in .csv - confirm
    "format": {"@id": "http://purl.obolibrary.org/obo/MS_1000914"} #tsv
}))

# Fix: the three MAF entities were all named "metabolomics_data" (copied from
# the entity above). Each now carries a name derived from its own variable.
amines_data = crate.add(File(crate, amines_data_id, dest_path=amines_data_id, properties={
    "@id": amines_data_id,
    "@type": "FormalParameter",
    "name": "amines_data",
    #"valueRequired": true,
    "additionalType": [
        {"@id": "http://purl.obolibrary.org/obo/MS_1003084"},  # processed data file (not raw)
        {"@id": "http://purl.obolibrary.org/obo/CHEBI_32952"},  # amine
    ],
    "format": {"@id": "http://purl.obolibrary.org/obo/MS_1000914"} #tsv
}))

OA_data = crate.add(File(crate, OA_data_id, dest_path=OA_data_id, properties={
    "@id": OA_data_id,
    "@type": "FormalParameter",
    "name": "OA_data",
    #"valueRequired": true,
    "additionalType": [
        {"@id": "http://purl.obolibrary.org/obo/MS_1003084"},  # processed data file (not raw)
        {"@id": "http://purl.obolibrary.org/obo/CHEBI_64709"},  # organic acid
    ],
    "format": {"@id": "http://purl.obolibrary.org/obo/MS_1000914"} #tsv
}))

steroids_data = crate.add(File(crate, steroids_data_id, dest_path=steroids_data_id, properties={
    "@id": steroids_data_id,
    "@type": "FormalParameter",
    "name": "steroids_data",
    #"valueRequired": true,
    "additionalType": [
        {"@id": "http://purl.obolibrary.org/obo/MS_1003084"},  # processed data file (not raw)
        {"@id": "http://purl.obolibrary.org/obo/CHEBI_35341"},  # steroid
    ],
    "format": {"@id": "http://purl.obolibrary.org/obo/MS_1000914"} #tsv
}))

# Fix: this entity was created but never added to the crate, although the
# metabolomics sub-workflow references it as its output. It is now added in
# the same way as `epigenomics_preprocessed`.
metabolomics_preprocessed = crate.add(DataEntity(crate, metabolomics_preprocessed_id, properties={
    "@id": metabolomics_preprocessed_id,
    "@type": "FormalParameter",
    "name": "metabolomics_preprocessed",
    #"valueRequired": true,
    "additionalType": [
        {"@id": "http://purl.obolibrary.org/obo/MS_1003084"},  # processed data file (not raw)
        {"@id": "http://purl.obolibrary.org/obo/OBI_0000451"},  # normalized data set
    ],
    "format": {"@id": "http://purl.obolibrary.org/obo/MS_1000914"} #tsv
}))

# Fix: link the inputs as {"@id": ...} references, matching how the other
# workflow inputs/outputs in this notebook are written, instead of bare strings.
mtblmcs_preprocessing["input"] = [
    {"@id": input_id}
    for input_id in (metabolomics_data_id, amines_data_id, OA_data_id, steroids_data_id)
]

### Phenotypes preparation

In [44]:
# Define data entity IDs
cbcl_imputation_mca_id = "modules/CBCL_MCA.nf"
cbcl_imputation_mca_script_id = "bin/CBCL_filter_impute_MCA.Rmd"

# Ontology terms shared by the module and its script. Kept as plain strings so
# that each entity gets its own reference dicts.
_cbcl_types = (
    "http://edamontology.org/operation_3557",  # imputation
    "http://purl.enanomapper.org/onto/ENM_8000003",  # Unsupervised learning
)


# Define behavioral data pre-processing sub-workflow
cbcl_imputation_mca = crate.add(ComputationalWorkflow(crate, cbcl_imputation_mca_id, dest_path=cbcl_imputation_mca_id, properties={
    "@id": cbcl_imputation_mca_id,
    "@type": ["File", "SoftwareSourceCode", "ComputationalWorkflow"],
    "name": "CBCL imputation",
    "input": {"@id": "behavioral_data"},
    "output": {"@id": "behavioral_data"},
    "hasPart": [
        {"@id": cbcl_imputation_mca_script_id }
    ],
    "license": "https://opensource.org/licenses/MIT",
    # Fix: a dict literal keeps only the last value of a repeated "@id" key,
    # so both terms are given as separate references.
    "additionalType": [{"@id": type_iri} for type_iri in _cbcl_types]
}))


# Define script
cbcl_imputation_mca_script = crate.add(File(crate, cbcl_imputation_mca_script_id, dest_path=cbcl_imputation_mca_script_id , properties={
    "@id": cbcl_imputation_mca_script_id ,
    "@type": ["File", "SoftwareSourceCode"],
    "name": "Filter, impute CBCL data and MCA",
    "programmingLanguage": {"@id": "http://edamontology.org/format_4000"}, #R markdown
    "additionalType": [{"@id": type_iri} for type_iri in _cbcl_types]
}))

In [45]:
# Phenotypic input files: covariates and the CBCL behavioural data.

phenotype_covariates_id = "Synthetic_data/synthetic_phenotype_covariates_data.csv"
behavioral_data_id = "Synthetic_data/synthetic_cbcl_data.csv"

# (file id, name, ontology term) for each phenotypic input, in creation order
_phenotype_inputs = [
    (phenotype_covariates_id, "phenotype_covariates", "http://purl.obolibrary.org/obo/NCIT_C16977"),  # Phenotype
    (behavioral_data_id, "behavioral_data", "http://www.ebi.ac.uk/efo/EFO_0005661"),  # CBCL assessment
]

# Both files are csv and are typed as FormalParameter; only id, name and
# ontology term differ.
phenotype_covariates, behavioral_data = [
    crate.add(File(crate, file_id, dest_path=file_id, properties={
        "@id": file_id,
        "@type": "FormalParameter",
        "name": label,
        #"valueRequired": true,
        "additionalType": {"@id": type_iri},
        "format": {"@id": "http://edamontology.org/format_3752"},  # csv
    }))
    for file_id, label, type_iri in _phenotype_inputs
]

### Map omics files 


In [46]:
# Define data entity IDs
id_mapping_id = "modules/map_IDs.nf"
id_file_id = "Synthetic_data/ACTIONdemonstrator_XOmics_IDs_synthetic.csv"
id_mapping_script_id = "bin/map_IDs.py"

# File with the sample identifiers
id_file = crate.add(File(crate, id_file_id, dest_path=id_file_id, properties={
    "@id": id_file_id,
    "@type": "FormalParameter",
    # Fix: was "behavioral_data", copied from another entity
    "name": "id_file",
    #"valueRequired": true,
    "additionalType": { "@id": "http://purl.enanomapper.org/onto/ENM_9000071" #sample identifier
                        },
    "format": {"@id": "http://edamontology.org/format_3752"} #csv
}))

# Define module of sample identifier mapping
id_mapping = crate.add(ComputationalWorkflow(crate, id_mapping_id, dest_path=id_mapping_id, properties={
    "@id": id_mapping_id,
    "@type": ["File", "SoftwareSourceCode", "ComputationalWorkflow"],
    "name": "Sample ID mapping",
    # Fix: a dict literal keeps only the last value of a repeated "@id" key,
    # so inputs and outputs are listed as separate references.
    "input": [
        {"@id": "metabolomics_preprocessed_data"},
        {"@id": "epigenomics_preprocessed_data"},
        {"@id": id_file_id},
    ],
    "output": [
        {"@id": "metabolomics_preprocessed_data"},
        {"@id": "epigenomics_preprocessed_data"},
    ],
    "hasPart": [
        {"@id": id_mapping_script_id}
    ],
    "license": "https://opensource.org/licenses/MIT",
    "additionalType": {"@id": "http://edamontology.org/operation_3282"} #ID mapping
}))


# ID mapping script
id_mapping_script = crate.add(File(crate, id_mapping_script_id, dest_path=id_mapping_script_id, properties={
    "@id": id_mapping_script_id,
    "@type": ["File", "SoftwareSourceCode"],
    "name": "ID mapping script",
    "programmingLanguage": {"@id": "http://edamontology.org/format_3996"}, #Python script
    "additionalType": {"@id": "http://edamontology.org/operation_3282"} #ID mapping
}))

### Principal Component Analysis

In [47]:
# Identifiers of the PCA module and the R script it runs
pca_id = "modules/pca.nf"
pca_script_id = "bin/pca.R"


# PCA sub-workflow: takes the processed omics data, produces the PCA report,
# and has the R script as its only part.
pca = crate.add(
    ComputationalWorkflow(
        crate,
        pca_id,
        dest_path=pca_id,
        properties={
            "@id": pca_id,
            "@type": ["File", "SoftwareSourceCode", "ComputationalWorkflow"],
            "name": "Principal Component Analysis",
            "input": {"@id": "processed_omics_data"},
            "output": {"@id": "pca_report"},
            "hasPart": [{"@id": pca_script_id}],
            "license": "https://opensource.org/licenses/MIT",
            "additionalType": {"@id": "http://purl.obolibrary.org/obo/NCIT_C49291"},  # PCA
        },
    )
)


# R script executed by the PCA sub-workflow
pca_script = crate.add(
    File(
        crate,
        pca_script_id,
        dest_path=pca_script_id,
        properties={
            "@id": pca_script_id,
            "@type": ["File", "SoftwareSourceCode"],
            "name": "PCA script",
            "programmingLanguage": {"@id": "http://edamontology.org/format_3999"},  # Rscript
            "additionalType": {"@id": "http://purl.obolibrary.org/obo/NCIT_C49291"},  # PCA
        },
    )
)

In [48]:
# Define PCA input/output entities

# The @id values that the PCA, SNF and MOFA sub-workflows reference as
# input/output. Each is used for both the constructor identifier and the
# "@id" property so the two cannot disagree.
processed_omics_data_id = "processed_omics_data"
pca_report_id = "pca_report"

# Fix: both entities below were created but never added to the crate, although
# sub-workflows reference them. They are now added in the same way as
# `snf_report`.

# Previously constructed with identifier "omics_data.csv" - presumably the
# file name the workflow writes; confirm if that name should be recorded.
processed_omics_data = crate.add(DataEntity(crate, processed_omics_data_id, properties={
    "@id": processed_omics_data_id,
    "@type": "FormalParameter",
    "name": "processed_omics_data",
    #"valueRequired": true,
    # Fix: a dict literal keeps only the last value of a repeated "@id" key,
    # so both terms are given as separate references.
    "additionalType": [
        {"@id": "http://purl.obolibrary.org/obo/MS_1003084"},  # processed data file
        {"@id": "http://edamontology.org/topic_3391"},  # omics
    ],
    "format": {"@id": "http://edamontology.org/format_3752"} #csv
}))

#TODO What should be output here? .pdf file of plots?

# Previously constructed with identifier "pca.pdf" - presumably the file name
# of the report; confirm if that name should be recorded.
pca_report = crate.add(DataEntity(crate, pca_report_id, properties={
    "@id": pca_report_id,
    "@type": "FormalParameter",
    "name": "pca_report",
    #"valueRequired": true,
    "additionalType": {"@id": "http://edamontology.org/data_2884" #plot (multiple)
                         },
    "format": {"@id": "http://edamontology.org/format_3508"} #pdf
}))



### Similarity Network Fusion

In [49]:
# Define data entity IDs
snf_id = "modules/snf.nf"
snf_script_id = "bin/perform_snf.py"
snf_analysis_script_id = "bin/snf_analysis.ipynb"
snf_gee_script_id = "bin/snf_gee_analysis.ipynb"

# Fix applied to every multi-term "additionalType" below: a dict literal keeps
# only the last value of a repeated "@id" key, so the terms are given as a
# list of separate references.

# Define SNF sub-workflow
snf = crate.add(ComputationalWorkflow(crate, snf_id, dest_path=snf_id, properties={
    "@id": snf_id,
    "@type": ["File", "SoftwareSourceCode", "ComputationalWorkflow"],
    "name": "Similarity Network Fusion",
    "input": {"@id": "processed_omics_data"},
    "output": {"@id": "snf_report"},
    "hasPart": [
        {"@id": snf_script_id},
        {"@id": snf_analysis_script_id},
        {"@id": snf_gee_script_id}
    ],
    "license": "https://opensource.org/licenses/MIT",
    "additionalType": [
        {"@id": "http://edamontology.org/operation_3432"},  # Clustering
        {"@id": "http://purl.enanomapper.org/onto/ENM_8000003"},  # Unsupervised learning
    ]
}))


# SNF scripts
snf_script = crate.add(File(crate, snf_script_id, dest_path=snf_script_id, properties={
    "@id": snf_script_id,
    "@type": ["File", "SoftwareSourceCode"],
    "name": "SNF script",
    "programmingLanguage": {"@id": "http://edamontology.org/format_3996"}, #Python script
    "additionalType": [
        {"@id": "http://purl.enanomapper.org/onto/ENM_8000003"},  # Unsupervised learning
        {"@id": "http://edamontology.org/operation_3432"},  # Clustering
    ]
}))

snf_analysis_script = crate.add(File(crate, snf_analysis_script_id, dest_path=snf_analysis_script_id, properties={
    "@id": snf_analysis_script_id,
    "@type": ["File", "SoftwareSourceCode"],
    "name": "SNF downstream analysis script",
    "programmingLanguage": {"@id": "http://edamontology.org/format_3996"}, #Python script
    "additionalType": [
        {"@id": "http://purl.enanomapper.org/onto/ENM_8000003"},  # Unsupervised learning
        {"@id": "http://edamontology.org/operation_3432"},  # Clustering
        {"@id": "http://semanticscience.org/resource/SIO_000449"},  # plot
    ]
}))

snf_gee_script = crate.add(File(crate, snf_gee_script_id, dest_path=snf_gee_script_id, properties={
    "@id": snf_gee_script_id,
    "@type": ["File", "SoftwareSourceCode"],
    "name": "SNF GEE models",
    "programmingLanguage": {"@id": "http://edamontology.org/format_3996"}, #Python script
    "additionalType": {"@id": "http://purl.enanomapper.org/onto/ENM_8000003" #Unsupervised learning
                         }
}))

In [50]:
# SNF output entity: the report referenced as "output" by the SNF sub-workflow.

snf_report = crate.add(
    DataEntity(
        crate,
        "snf_report",
        properties={
            "@id": "snf_report",
            "@type": "FormalParameter",
            "name": "snf_report",
            #"valueRequired": true,
            "additionalType": {"@id": "http://semanticscience.org/resource/SIO_000449"},  # plot
            "format": {"@id": "http://edamontology.org/format_3508"},  # pdf
        },
    )
)

#TODO add snf matrix

### Multi-Omics Factor Analysis

In [51]:
# Define data entity IDs
# Crate-relative paths, used both as "@id" and as destination path of each file.
mofa_id = "modules/mofa.nf"
mofa_script_id = "bin/mofa.R"
mofa_analysis_script_id = "bin/MOFA_downstream_analysis_report.Rmd"
mofa_gee_script_id = "bin/MOFA_downstream_analysis_report_gee.Rmd"


# Define MOFA module: a Nextflow sub-workflow whose parts are the three MOFA scripts.
# "additionalType" is a list of references; a dict literal with two "@id" keys
# would keep only the last one and drop the "Unsupervised learning" term.
mofa = crate.add(ComputationalWorkflow(crate, mofa_id, dest_path=mofa_id, properties={
    "@id": mofa_id,
    "@type": ["File", "SoftwareSourceCode", "ComputationalWorkflow"],
    "name": "Multi-Omics Factor Analysis",
    "programmingLanguage": {"@id": "https://w3id.org/workflowhub/workflow-ro-crate#nextflow"},
    "input": {"@id": "processed_omics_data"},
    "output": {"@id": "mofa_model"},
    "hasPart": [
        {"@id": mofa_script_id},
        {"@id": mofa_analysis_script_id},
        {"@id": mofa_gee_script_id}
    ],
    "license": "https://opensource.org/licenses/MIT",
    "additionalType": [
        {"@id": "http://purl.enanomapper.org/onto/ENM_8000003"},  # Unsupervised learning
        {"@id": "http://edamontology.org/topic_3474"},  # Machine learning
    ],
}))


# MOFA script
# Register the MOFA R script as a File / SoftwareSourceCode data entity.
# Both annotations are listed explicitly; repeating "@id" inside one dict
# literal would keep only the last value.
mofa_script = crate.add(File(crate, mofa_script_id, dest_path=mofa_script_id, properties={
    "@id": mofa_script_id,
    "@type": ["File", "SoftwareSourceCode"],
    "name": "MOFA script",
    "programmingLanguage": {"@id": "http://edamontology.org/format_3999"},  # Rscript
    "additionalType": [
        {"@id": "http://purl.enanomapper.org/onto/ENM_8000003"},  # Unsupervised learning
        {"@id": "http://edamontology.org/topic_3474"},  # Machine learning
    ],
}))

# MOFA downstream analysis script
# Register the MOFA downstream analysis report (R Markdown) as a
# File / SoftwareSourceCode data entity. Annotations are a list of references
# so that neither is lost to a duplicate "@id" key.
mofa_analysis_script = crate.add(File(crate, mofa_analysis_script_id, dest_path=mofa_analysis_script_id, properties={
    "@id": mofa_analysis_script_id,
    "@type": ["File", "SoftwareSourceCode"],
    "name": "MOFA downstream analysis script",
    "programmingLanguage": {"@id": "http://edamontology.org/format_4000"},  # R markdown
    "additionalType": [
        {"@id": "http://purl.enanomapper.org/onto/ENM_8000003"},  # Unsupervised learning
        {"@id": "http://edamontology.org/topic_3474"},  # Machine learning
    ],
}))

# MOFA GEE script
# Register the MOFA downstream analysis report with GEE (R Markdown) as a
# File / SoftwareSourceCode data entity. Annotations are a list of references
# so that neither is lost to a duplicate "@id" key.
mofa_analysis_gee_script = crate.add(File(crate, mofa_gee_script_id, dest_path=mofa_gee_script_id, properties={
    "@id": mofa_gee_script_id,
    "@type": ["File", "SoftwareSourceCode"],
    "name": "MOFA downstream analysis script with GEE",
    "programmingLanguage": {"@id": "http://edamontology.org/format_4000"},  # R markdown
    "additionalType": [
        {"@id": "http://purl.enanomapper.org/onto/ENM_8000003"},  # Unsupervised learning
        {"@id": "http://edamontology.org/topic_3474"},  # Machine learning
    ],
}))

In [52]:
# Define MOFA output model

# Describe the MOFA model as a FormalParameter entity with id "mofa_model"
# (the id referenced by the "output" of the MOFA module).
mofa_model = crate.add(
    DataEntity(
        crate,
        "mofa_model",
        properties={
            "@id": "mofa_model",
            "@type": "FormalParameter",
            "name": "mofa_model",
            # "valueRequired": true,   (left disabled, as in the original cell)
            # statistical model
            "additionalType": {"@id": "http://purl.obolibrary.org/obo/STATO_0000107"},
            # HDF5
            "format": {"@id": "http://edamontology.org/format_3590"},
        },
    )
)

## Data entities on Root directory

In [53]:
from rocrate.model.data_entity import DataEntity

# Add nextflow.config from the working directory to the crate root.
nextflow_config = crate.add_file(
    "nextflow.config",
    properties={
        "@id": "nextflow.config",
        "@type": "FormalParameter",
        "name": "Nextflow configuration file",
        # "valueRequired": true,   (left disabled, as in the original cell)
        # configuration file
        "additionalType": {"@id": "http://purl.obolibrary.org/obo/ONTOAVIDA_00000001"},
        # JSON
        "format": {"@id": "http://edamontology.org/format_3464"},
    },
)

# Add dre.config from the working directory to the crate root.
# NOTE(review): "name" is identical to the one used for nextflow.config;
# confirm whether a more specific label is wanted.
dre_config = crate.add_file(
    "dre.config",
    properties={
        "@id": "dre.config",
        "@type": "FormalParameter",
        "name": "Nextflow configuration file",
        # "valueRequired": true,   (left disabled, as in the original cell)
        # configuration file
        "additionalType": {"@id": "http://purl.obolibrary.org/obo/ONTOAVIDA_00000001"},
        # JSON
        "format": {"@id": "http://edamontology.org/format_3464"},
    },
)

# Add README.md to the crate root.
# The name previously read "Nextflow configuration file", copy-pasted from the
# config entries above; it now describes the README itself.
readme = crate.add_file("README.md", properties={
    "@id": "README.md",
    "@type": "FormalParameter",
    "name": "README",
    #"valueRequired": true,
    "additionalType": {"@id": "http://purl.obolibrary.org/obo/OMIT_00055391"},  # Documentation
    #"format": {} #TODO: markdown ontology term needed here.
})

# Add the workflow documentation to the crate.
# The file is read from "Documentation.md" but described under the "@id"
# "ACTION_documentation.md"; dest_path makes the payload land at the path the
# metadata refers to, so the two no longer disagree.
# NOTE(review): if the intended crate path is "Documentation.md" instead,
# drop dest_path and change "@id" to match.
action_documentation = crate.add_file("Documentation.md", dest_path="ACTION_documentation.md", properties={
    "@id": "ACTION_documentation.md",
    "@type": "FormalParameter",
    "name": "Documentation on Workflow",
    #"valueRequired": true,
    "additionalType": {"@id": "http://purl.obolibrary.org/obo/OMIT_00055391"},  # Documentation
    #"format": {} #TODO: markdown ontology term needed here.
})

# Define the diagram that provides an overview of the main workflow

# Add the flowchart image and mark it as being about the main workflow file.
diagram = crate.add_file(
    "flowchart.png",
    properties={
        "@id": "flowchart.png",
        "@type": ["File", "ImageObject"],
        "name": "Workflow overview",
        "about": {"@id": "action.nf"},
    },
)

## Add entities to the main workflow entity

In [54]:
# Add entities/attributes to the workflow

# Attach the entities created in earlier cells to the main workflow entity.
MainWorkflow["author"] = [
    Anna_Niehues,
    Casper_de_Visser,
    Fiona_Hagenbeek,
    Naama_Karu,
    Alida_Kindt,
    Purva_Kulkarni,
    Rene_Pool,
    Dorret_Boomsma,
    Jenny_van_Dongen,
    Alain_van_Gool,
    PeterBram_t_Hoen,
]
MainWorkflow["programmingLanguage"] = Nextflow
MainWorkflow["image"] = diagram
MainWorkflow["config"] = [nextflow_config, dre_config]
MainWorkflow["sdPublisher"] = x_omics
# Sub-workflow modules that make up the main workflow.
MainWorkflow["hasPart"] = [
    missing_data_heatmap,
    epi_preprocessing,
    mtblmcs_preprocessing,
    cbcl_imputation_mca,
    id_mapping,
    pca,
    snf,
    mofa,
]

## Add publications

In [55]:
# Rio Journal abstract

# Contextual entity for the RIO Journal abstract, identified by its DOI.
# Fixes: "@type" used the misspelled "ScholartlyArtcile" (schema.org term is
# "ScholarlyArticle"), and "dateCreated" was "25-08-2022", which is not an
# ISO 8601 date; it is now written as 2022-08-25.
RIO_abstract = crate.add(ContextEntity(crate, "https://doi.org/10.3897/rio.8.e94042", properties={
    "@id": "https://doi.org/10.3897/rio.8.e94042",
    "@type": ["ScholarlyArticle", "CreativeWork"],
    "name": "A Multi-omics Data Analysis Workflow Packaged as a FAIR Digital Object",
    "dateCreated": "2022-08-25",
    "keywords": ["Multi-omics", "Metabolomics", "Epigenomics", "Behavioral data", "FAIR"],
}))

In [56]:
# Add information / link entities to the crate

# Root-level metadata of the crate: main entity, title, authors, license,
# keywords, publication date and description.
crate.mainEntity = MainWorkflow
crate.name = "X-omics ACTIONdemonstrator analysis workflow"
crate.author = [
    Anna_Niehues,
    Casper_de_Visser,
    Fiona_Hagenbeek,
    Naama_Karu,
    Alida_Kindt,
    Purva_Kulkarni,
    Rene_Pool,
    Dorret_Boomsma,
    Jenny_van_Dongen,
    Alain_van_Gool,
    PeterBram_t_Hoen,
]
crate.license = "https://opensource.org/licenses/MIT"
crate.keywords = [
    "Multi-omics",
    "Metabolomics",
    "Epigenomics",
    "Behavioral data",
    "FAIR",
]
crate.datePublished = str(today)
# NOTE(review): the description reads "analyze to a multi-omics data set" and
# leaves the "(" after "ACTION Biomarker Study" unclosed; consider correcting the text.
crate.description = "This workflow is designed to analyze to a multi-omics data set that comprises genome-wide DNA methylation profiles, targeted metabolomics, and behavioral data of two cohorts that participated in the ACTION Biomarker Study (ACTION, Aggression in Children: Unraveling gene-environment interplay to inform Treatment and InterventiON strategies. (Boomsma 2015, Bartels 2018, Hagenbeek 2020, van Dongen 2021, Hagenbeek 2022). The ACTION-NTR cohort consists of twins that are either longitudinally concordant or discordant for childhood aggression. The ACTION-Curium-LUMC cohort consists of children referred to the Dutch LUMC Curium academic center for child and youth psychiatry. With the joint analysis of multi-omics data and behavioral data, we aim to identify substructures in the ACTION-NTR cohort and link them to aggressive behavior. First, the individuals are clustered using Similarity Network Fusion (SNF, Wang 2014), and latent feature dimensions are uncovered using different unsupervised methods including Multi-Omics Factor Analysis (MOFA) (Argelaguet 2018) and Multiple Correspondence Analysis (MCA, Lê 2008, Husson 2017). In a second step, we determine correlations between -omics and phenotype dimensions, and use them to explain the subgroups of individuals from the ACTION-NTR cohort. In order to validate the results, we project data of the ACTION-Curium-LUMC cohort onto the latent dimensions and determine if correlations between omics and phenotype data can be reproduced."

In [57]:
# Save to JSON-LD

#crate.write("exp_crate")

In [58]:
# Save to ZIP-file

# Write the crate to a ZIP archive; the path is relative to the current working directory.
crate.write_zip("ro-crate/exp_crate.zip")

'/ro-crate_container/volume/ACTIONdemonstrator_workflow/ro-crate/exp_crate.zip'