
Merge pull request #145 from jhiemstrawisc/add-spras-image
Add new SPRAS image and handle unpacked singularity images
agitter committed Jul 1, 2024
2 parents ad4da94 + 6cee79f commit 0557289
Showing 28 changed files with 444 additions and 29 deletions.
12 changes: 11 additions & 1 deletion .github/workflows/test-spras.yml
@@ -84,6 +84,7 @@ jobs:
           docker pull reedcompbio/allpairs:v2
           docker pull reedcompbio/domino:latest
           docker pull reedcompbio/py4cytoscape:v2
+          docker pull reedcompbio/spras:v0.1.0
       - name: Build Omics Integrator 1 Docker image
         uses: docker/build-push-action@v1
         with:
@@ -156,6 +157,15 @@ jobs:
           tags: v2
           cache_froms: reedcompbio/py4cytoscape:latest
           push: false
+      - name: Build SPRAS Docker image
+        uses: docker/build-push-action@v1
+        with:
+          path: .
+          dockerfile: docker-wrappers/SPRAS/Dockerfile
+          repository: reedcompbio/spras
+          tags: v0.1.0
+          cache_froms: reedcompbio/spras:v0.1.0
+          push: false
 
   # Run pre-commit checks on source files
   pre-commit:
@@ -167,6 +177,6 @@ jobs:
       - name: Setup Python
         uses: actions/setup-python@v4
         with:
-          python-version: '3.8' # Match this to the version specified in environment.yml
+          python-version: '3.11' # Match this to the version specified in environment.yml
       - name: Run pre-commit checks
         uses: pre-commit/action@v3.0.0
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
@@ -3,7 +3,7 @@
 # See https://pre-commit.com/ for documentation
 default_language_version:
   # Match this to the version specified in environment.yml
-  python: python3.8
+  python: python3.11
 repos:
   - repo: https://github.com/pre-commit/pre-commit-hooks
     rev: v4.4.0 # Use the ref you want to point at
2 changes: 1 addition & 1 deletion Snakefile
@@ -219,7 +219,7 @@ rule reconstruct:
         # Create a copy so that the updates are not written to the parameters logfile
         params = reconstruction_params(wildcards.algorithm, wildcards.params).copy()
         # Add the input files
-        params.update(dict(zip(runner.get_required_inputs(wildcards.algorithm), *{input})))
+        params.update(dict(zip(runner.get_required_inputs(wildcards.algorithm), *{input}, strict=True)))
         # Add the output file
         # All run functions can accept a relative path to the output file that should be written that is called 'output_file'
         params['output_file'] = output.pathway_file
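For context, the new `strict=True` flag (Python 3.10+) makes `zip` raise a `ValueError` when the algorithm's required inputs and the rule's input files differ in length, rather than silently truncating. A minimal illustration with hypothetical names:

```python
# zip(strict=True) raises instead of silently dropping unmatched items
required = ["network_file", "prize_file"]  # hypothetical required inputs
provided = ["input/network.txt"]           # one input file is missing
try:
    params = dict(zip(required, provided, strict=True))
except ValueError as err:
    print(err)  # zip() argument 2 is shorter than argument 1
```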
8 changes: 8 additions & 0 deletions config/config.yaml
@@ -7,6 +7,14 @@ hash_length: 7
 # 'singularity'. If container_framework is not specified, SPRAS will default to docker.
 container_framework: docker
 
+# Only used if container_framework is set to singularity. When true, SPRAS unpacks the
+# singularity containers to the local filesystem. This is useful when PRM containers need
+# to run inside another container, such as in an HTCondor/OSPool environment.
+# NOTE: the unpacked containers take up space on the local filesystem, and that space
+# persists after the workflow is complete. To clean up the unpacked containers, the user
+# must delete them manually.
+unpack_singularity: false
+
 # Allow the user to configure which container registry containers should be pulled from
 # Note that this assumes container names are consistent across registries, and that the
 # registry being passed doesn't require authentication for pull actions
16 changes: 16 additions & 0 deletions docker-wrappers/SPRAS/Dockerfile
@@ -0,0 +1,16 @@
FROM almalinux:9

RUN dnf install -y epel-release

# gcc/g++ are required for building several of the packages if you're using Apple Silicon
RUN dnf update -y && \
dnf install -y gcc gcc-c++ \
python3.11 python3.11-pip python3.11-devel \
docker apptainer

COPY / /spras/
RUN chmod -R 777 /spras
WORKDIR /spras

# Install spras into the container
RUN pip3.11 install .
81 changes: 81 additions & 0 deletions docker-wrappers/SPRAS/README.md
@@ -0,0 +1,81 @@
# SPRAS Docker image

## Building

A Docker image for SPRAS is available on [DockerHub](https://hub.docker.com/repository/docker/reedcompbio/spras).
This image comes bundled with all of the necessary software packages to run SPRAS, and can be used for execution in distributed environments (like HTCondor).

To create the Docker image, make sure you are in this repository's root directory, and from your terminal run:

```
docker build -t <project name>/<image name>:<tag name> -f docker-wrappers/SPRAS/Dockerfile .
```

For example, to build this image with the intent of pushing it to DockerHub as reedcompbio/spras:v0.1.0, you'd run:
```
docker build -t reedcompbio/spras:v0.1.0 -f docker-wrappers/SPRAS/Dockerfile .
```
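As a quick sanity check after building, you can confirm that the image's environment resolved correctly; `snakemake` should land on the container's `PATH` once SPRAS is installed with `pip` (adjust the tag to whatever you built):

```
docker run --rm reedcompbio/spras:v0.1.0 snakemake --version
```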

This will copy the entire SPRAS repository into the container and install SPRAS with `pip`. As such, any changes you've made to the current SPRAS repository will be reflected in the version of SPRAS installed in the container. Since SPRAS
is installed with `pip`, you can also request the optional development modules. If you're using the container for development and want the optional `pre-commit` and `pytest` packages, along with a
`spras` package that picks up source changes without re-installation, change the
`pip` installation line to:

```
pip install -e .[dev]
```

This causes changes to the `spras` source code to update the installed package without re-installing.
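In the Dockerfile included here, that corresponds to changing the final line to something like:

```
RUN pip3.11 install -e ".[dev]"
```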

**Note:** This image will build for the platform native to your system (i.e. amd64 or arm64). If you need to run the image in a remote environment like HTCondor that is almost certainly `amd64` while you're building on Apple Silicon, it is recommended to either modify the Dockerfile to pin the platform:

```
FROM --platform=linux/amd64 almalinux:9
```

Or temporarily override your system's default during the build by prepending your build command with:
```
DOCKER_DEFAULT_PLATFORM=linux/amd64
```

For example, to build reedcompbio/spras:v0.1.0 on Apple Silicon as a linux/amd64 container, you'd run:
```
DOCKER_DEFAULT_PLATFORM=linux/amd64 docker build -t reedcompbio/spras:v0.1.0 -f docker-wrappers/SPRAS/Dockerfile .
```

## Testing

The folder `docker-wrappers/SPRAS` also contains several files that can be used to test this container on HTCondor. To test the `spras` container
in this environment, first log in to an HTCondor Access Point (AP). Then, from the AP, clone this repo:

```
git clone https://github.com/Reed-CompBio/spras.git
```

When you're ready to run SPRAS as an HTCondor workflow, navigate to the `spras/docker-wrappers/SPRAS` directory and create the `logs/` directory. Then run
`condor_submit spras.sub`, which submits SPRAS to HTCondor as a single job with as many cores as indicated by the `NUM_PROCS` line in `spras.sub`, using
the value of `EXAMPLE_CONFIG` as the SPRAS configuration file. You can alter the configuration file to test various workflows, but you should leave
`unpack_singularity: true` set, otherwise the job is likely to fail. By default, `example_config.yaml` runs everything except `cytoscape`, which
appears to fail periodically in HTCondor.
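Concretely, the submission steps from the AP look something like this, assuming you cloned into `spras/`:

```
cd spras/docker-wrappers/SPRAS
mkdir logs
condor_submit spras.sub
```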

To monitor the state of the job, you can run `condor_q` for a snapshot of how the job is doing, or `condor_watch_q` if you want real-time updates.
Upon completion, the `output` directory from the workflow should be returned as `spras/docker-wrappers/SPRAS/output`, along with several files containing the
workflow's logging information (anything matching `logs/spras_*` that ends in `.out`, `.err`, or `.log`). If the job was unsuccessful, these files should
contain useful debugging clues about what may have gone wrong.

**Note**: If you want to run the workflow with a different version of SPRAS, or one that contains development updates you've made, rebuild this image against
the version of SPRAS you want to test, and push the image to your image repository. To use that container in the workflow, change the `container_image` line of
`spras.sub` to point to the new image.
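For example, if you pushed your rebuilt image to your own DockerHub account, the relevant line in `spras.sub` might look like the following (the image name here is hypothetical):

```
container_image = docker://docker.io/<your-username>/spras:dev
```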

**Note**: In some cases, especially if you're encountering an error like `/srv//spras.sh: line 10: snakemake: command not found`, it may be necessary to convert
the SPRAS image to a `.sif` container image before running it somewhere like the OSPool. To do this, run:
```
apptainer build spras.sif docker://reedcompbio/spras:v0.1.0
```
to produce the file `spras.sif`. Then, substitute this value as the `container_image` in the submit file.
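In that case, the corresponding submit file line would point at the local `.sif` file instead, e.g.:

```
container_image = spras.sif
```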

## Versions

The version of this image matches the version of the `spras` package installed within it.
- v0.1.0: Created an image with SPRAS as an installed Python module. This makes SPRAS runnable anywhere with Docker/Singularity. Note that the Snakefile should be
runnable from any directory within the container.
151 changes: 151 additions & 0 deletions docker-wrappers/SPRAS/example_config.yaml
@@ -0,0 +1,151 @@
# Global workflow control

# The length of the hash used to identify a parameter combination
hash_length: 7

# Specify the container framework. Current supported versions include 'docker' and
# 'singularity'. If container_framework is not specified, SPRAS will default to docker.
container_framework: singularity

# Unpack singularity. See config/config.yaml for details.
unpack_singularity: true

# Allow the user to configure which container registry containers should be pulled from
# Note that this assumes container names are consistent across registries, and that the
# registry being passed doesn't require authentication for pull actions
container_registry:
  base_url: docker.io
  # The owner or project of the registry
  # For example, "reedcompbio" if the image is available as docker.io/reedcompbio/allpairs
  owner: reedcompbio

# This list of algorithms should be generated by a script which checks the filesystem for installs.
# It shouldn't be changed by mere mortals. (Alternatively, we could add a path to the executable for each
# algorithm in the list to reduce the number of assumptions of the program at the cost of making the
# config a little more involved.)
# Each algorithm has an 'include' parameter. By toggling 'include' to true/false the user can change
# which algorithms are run in a given experiment.
#
# Algorithm-specific parameters are embedded in lists so that users can specify multiple. If multiple
# parameters are specified then the algorithm will be run as many times as needed to cover all parameter
# combinations. For instance, if we have the following:
# - name: "myAlg"
#   params:
#     include: true
#     a: [1,2]
#     b: [0.5,0.75]
#
# then myAlg will be run on (a=1,b=0.5), (a=1,b=0.75), (a=2,b=0.5), and (a=2,b=0.75). Pretty neat, but be
# careful: too many parameters might make your runs take a long time.

algorithms:
  - name: "pathlinker"
    params:
      include: false
      run1:
        k: range(100,201,100)

  - name: "omicsintegrator1"
    params:
      include: true
      run1:
        r: [5]
        b: [5, 6]
        w: np.linspace(0,5,2)
        g: [3]
        d: [10]

  - name: "omicsintegrator2"
    params:
      include: true
      run1:
        b: [4]
        g: [0]
      run2:
        b: [2]
        g: [3]

  - name: "meo"
    params:
      include: true
      run1:
        max_path_length: [3]
        local_search: ["Yes"]
        rand_restarts: [10]

  - name: "mincostflow"
    params:
      include: true
      run1:
        flow: [1] # The flow must be an int
        capacity: [1]

  - name: "allpairs"
    params:
      include: true

  - name: "domino"
    params:
      include: true
      run1:
        slice_threshold: [0.3]
        module_threshold: [0.05]


# Here we specify which pathways to run and other file location information.
# DataLoader.py can currently only load a single dataset
# Assume that if a dataset label does not change, the lists of associated input files do not change
datasets:
  -
    label: data0
    node_files: ["node-prizes.txt", "sources.txt", "targets.txt"]
    # DataLoader.py can currently only load a single edge file, which is the primary network
    edge_files: ["network.txt"]
    # Placeholder
    other_files: []
    # Relative path from the spras directory
    data_dir: "input"
  # -
  #   label: data1
  #   # Reuse some of the same sources file as 'data0' but different network and targets
  #   node_files: ["node-prizes.txt", "sources.txt", "alternative-targets.txt"]
  #   edge_files: ["alternative-network.txt"]
  #   other_files: []
  #   # Relative path from the spras directory
  #   data_dir: "input"

# If we want to reconstruct then we should set run to true.
# TODO: if include is true above but run is false here, algs are not run.
# is this the behavior we want?
reconstruction_settings:

  # Set where everything is saved
  locations:

    # Place the save path here
    # TODO move to global
    reconstruction_dir: "output"

  run: true

analysis:
  # Create one summary per pathway file and a single summary table for all pathways for each dataset
  summary:
    include: true
  # Create output files for each pathway that can be visualized with GraphSpace
  graphspace:
    include: true
  # Create Cytoscape session file with all pathway graphs for each dataset
  cytoscape:
    include: false
  # Machine learning analysis (e.g. clustering) of the pathway output files for each dataset
  ml:
    include: true
    # Specify how many principal components to calculate
    components: 2
    # Boolean to show the labels on the PCA graph
    labels: true
    # 'ward', 'complete', 'average', 'single'
    # if linkage: ward, must use metric: euclidean
    linkage: 'ward'
    # 'euclidean', 'manhattan', 'cosine'
    metric: 'euclidean'
10 changes: 10 additions & 0 deletions docker-wrappers/SPRAS/spras.sh
@@ -0,0 +1,10 @@
#!/bin/bash

# Fail early if there's an issue
set -e

# When .cache files are created, they need to know where HOME is to write there.
# In this case, that should be the HTCondor scratch dir the job is executing in.
export HOME=$(pwd)

snakemake "$@"
