diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..8017ac4 --- /dev/null +++ b/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2021 Alejandro Barrera + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. \ No newline at end of file diff --git a/README.md b/README.md index 2cbc1a8..67a277a 100644 --- a/README.md +++ b/README.md @@ -1,12 +1,12 @@ # ggr-cwl-ipynb-gen Jupyter notebook generator to download and execute the processing files for GGR related datasets. At this point, is not intented to cover all use cases, but to serve as a quick generator of all -related files and scripts to pre-process sequences generated at the [Duke-GCB Sequencing Core](https://genome.duke.edu/cores-and-services/sequencing-and-genomic-technologies) in [HARDAC](https://genome.duke.edu/cores-and-services/computational-solutions/compute-environments-genomics). 
+related files and scripts to pre-process genomic data generated at the [Duke-GCB Sequencing Core](https://genome.duke.edu/cores-and-services/sequencing-and-genomic-technologies) in [HARDAC](https://genome.duke.edu/cores-and-services/computational-solutions/compute-environments-genomics). Example of usage: ``` $ python ggr-cwl-ipynb-gen.py \ - --conf examples/conf.yaml \ + --root-dir /path/to/rootdir \ --metadata examples/Hong_3979_170316B1.xlsx \ --out /path/to/output_dir \ --force @@ -16,33 +16,26 @@ The information in the example metadata and configuration file should reveal wha For a full list of options: ``` $ python ggr-cwl-ipynb-gen.py -h -usage: Generator of Jupyter notebooks to execute CWL pre-processing pipelines - [-h] -o OUT -c CONF_FILE -m METADATA [-f] [-n] [--metadata-sep SEP] - [--project-name PROJECT_NAME] [--data-from {sftp,miseq,other,dukeds}] +usage: Generator of Jupyter notebooks to execute CWL pre-processing pipelines [-h] -o OUT -m METADATA [-f] [-n] [--metadata-sep SEP] [--project-name PROJECT_NAME] + [--data-from {sftp,miseq,other,dukeds,local}] [-c CONF_FILE] [-u USER] [-e USER_DUKE_EMAIL] [-r ROOT_DIR] optional arguments: -h, --help show this help message and exit -o OUT, --out OUT Jupyter notebook output file name - -c CONF_FILE, --conf-file CONF_FILE - YAML configuration file (see examples) -m METADATA, --metadata METADATA Metadata file with samples information -f, --force Force to overwrite output file - -n, --no-upload Avoids uploading generated data to database when - specified - --metadata-sep SEP Separator for metadata file (when different than Excel - spread sheet) + -n, --no-upload Avoids uploading generated data to database when specified + --metadata-sep SEP Separator for metadata file (when different than Excel spread sheet) --project-name PROJECT_NAME - Project name (by default, basename of metadata file - name) - --data-from {sftp,miseq,other,dukeds} - Choices: sftp, miseq, other, dukeds + Project name (by default, basename 
of metadata file name) + --data-from {sftp,miseq,other,dukeds,local} + Choices: sftp, miseq, other, dukeds, local + -c CONF_FILE, --conf-file CONF_FILE + YAML configuration file (see examples) + -u USER, --user USER HARDAC User used in SLURM (default: ${USER}) + -e USER_DUKE_EMAIL, --user-duke-email USER_DUKE_EMAIL + Email(s) notified when execution is finished (default: ${USER}@duke.edu) + -r ROOT_DIR, --root-dir ROOT_DIR + Root directory where all subfolders and files will be created (semi-required: either defined here or in conf-file) ``` - -### Dependencies -- jinja2 >=2.8 -- nbformat >=4.0.1 -- numpy >=1.10.4 -- pandas >=0.17.1 -- xlrd >=1.0.0 -- ruamel >=0.11.11 diff --git a/chipdb_upload/__init__.py b/chipdb_upload/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/data_upload.py b/chipdb_upload/data_upload.py similarity index 99% rename from data_upload.py rename to chipdb_upload/data_upload.py index 91116fd..47a875f 100644 --- a/data_upload.py +++ b/chipdb_upload/data_upload.py @@ -1,10 +1,10 @@ +#!/usr/bin/env python from pymongo import MongoClient import datetime import os, csv import argparse import pandas as pd import base64 -import consts import logging # Python script and command line tool for compiling fingerprint and QC data from ChIP-seq diff --git a/ggr_cwl_ipynb_gen/__init__.py b/ggr_cwl_ipynb_gen/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/consts.py b/ggr_cwl_ipynb_gen/consts.py similarity index 97% rename from consts.py rename to ggr_cwl_ipynb_gen/consts.py index 32518da..03fc3a7 100644 --- a/consts.py +++ b/ggr_cwl_ipynb_gen/consts.py @@ -53,3 +53,6 @@ qc_script_dir = '/data/reddylab/software/cwl/bin' data_upload_script = '/data/reddylab/Darryl/GitHub/reddylab/csv_to_mongo.py' HOST_FOR_TUNNELED_DOWNLOAD = "Hardac-xfer.genome.duke.edu" + +# Package constants +PACKAGE_NAME = "ggr_cwl_ipynb_gen" diff --git a/ggr-cwl-ipynb-gen.py b/ggr_cwl_ipynb_gen/ggr-cwl-ipynb-gen.py similarity index 95% rename from 
ggr-cwl-ipynb-gen.py rename to ggr_cwl_ipynb_gen/ggr-cwl-ipynb-gen.py index 6afc68d..2de202e 100644 --- a/ggr-cwl-ipynb-gen.py +++ b/ggr_cwl_ipynb_gen/ggr-cwl-ipynb-gen.py @@ -1,27 +1,34 @@ +#!/usr/bin/env python import argparse import nbformat import nbformat.v3 as nbf import sys import os import pandas as pd -from jinja2 import FileSystemLoader +from jinja2 import FileSystemLoader, PackageLoader from xlrd import XLRDError import ruamel.yaml -import consts +import ggr_cwl_ipynb_gen.consts as consts import jinja2 import inspect -import glob +from jinja2.exceptions import TemplateNotFound import numpy as np encoding = sys.getfilesystemencoding() -EXEC_DIR = os.path.dirname(unicode(__file__, encoding)) +EXEC_DIR = os.path.dirname(str(__file__)) def render(tpl_path, context): path, filename = os.path.split(tpl_path) - return jinja2.Environment( - loader=FileSystemLoader(os.path.join(EXEC_DIR, "templates")) - ).get_template(filename).render(context) + try: + jinja_rendered = jinja2.Environment( + loader=FileSystemLoader(os.path.join(EXEC_DIR, "templates")) + ).get_template(filename).render(context) + except TemplateNotFound: + jinja_rendered = jinja2.Environment( + loader=PackageLoader(consts.PACKAGE_NAME, "templates") + ).get_template(filename).render(context) + return jinja_rendered class Cell(object): @@ -209,7 +216,7 @@ def merge_fastq_files(conf_args, lib_type, metadata_filename=None, num_samples=N array="0-%d%%20" % (num_samples-1), partition=",".join(consts.slurm_partitions), script_output="%s/%s_%s_%%a.out" % (logs_dir, conf_args['project_name'], - inspect.stack()[0][3]),) + inspect.stack()[0][3]), ) cells.extend(execute_cell.to_list()) return cells @@ -432,7 +439,7 @@ def data_upload(conf_args, lib_type, pipeline_type): depends_on=True, prolog=["source %s alex" % consts.conda_activate], partition=",".join(consts.slurm_partitions), - description="### Upload ChIP-seq to web-application") + description="### Upload ChIP-seq to web-application") 
cells.extend(execute_cell.to_list()) return cells @@ -566,10 +573,11 @@ def get_samples_by_library_type(metadata_file, sep='\t'): :return: generator of panda's dataframe """ try: - md = pd.read_excel(metadata_file, + md = pd.read_excel(metadata_file.name, true_values=['Yes', 'Y', 'yes', 'y', 1], false_values=['No', 'N', 'no', 'n', 0]) except XLRDError: + print (XLRDError) md = pd.read_csv(metadata_file.name, true_values=['Yes', 'Y', 'yes', 'y', 1], false_values=['No', 'N', 'no', 'n', 0], sep=sep) @@ -592,8 +600,8 @@ def init_conf_args(args, conf_args[r] = args[r] if (r in args and args[r]) else conf_args[r] try: assert conf_args[r] is not None - except AssertionError, e: - print "[ERROR]", r, "not defined" + except AssertionError as e: + print("[ERROR]", r, "not defined") raise for o in optional_args: conf_args[o] = args[o] if (o in args and args[o]) else (conf_args[o] if o in conf_args else None) @@ -606,7 +614,8 @@ def init_conf_args(args, def main(): parser = argparse.ArgumentParser('Generator of Jupyter notebooks to execute CWL pre-processing pipelines') parser.add_argument('-o', '--out', required=True, type=str, help='Jupyter notebook output file name') - parser.add_argument('-m', '--metadata', required=True, type=file, help='Metadata file with samples information') + parser.add_argument('-m', '--metadata', required=True, type=argparse.FileType('r'), + help='Metadata file with samples information') parser.add_argument('-f', '--force', action='store_true', help='Force to overwrite output file') parser.add_argument('-n', '--no-upload', action='store_false', help='Avoids uploading generated data to database when specified') @@ -617,14 +626,16 @@ def main(): parser.add_argument('--data-from', required=False, choices=consts.data_sources, default=consts.data_sources[0], help='Choices: %s' % (', '.join(consts.data_sources))) - parser.add_argument('-c', '--conf-file', required=False, type=file, help='YAML configuration file (see examples)') + parser.add_argument('-c', 
'--conf-file', required=False, type=argparse.FileType('r'), help='YAML configuration file (see examples)') parser.add_argument('-u', '--user', required=False, help='HARDAC User used in SLURM (default: ${USER})') parser.add_argument('-e', '--user-duke-email', required=False, help='Email(s) notified when execution is finished (default: ${USER}@duke.edu)') parser.add_argument('-r', '--root-dir', required=False, help='Root directory where all subfolders and files will be created ' - '(semi-required: either defined here or in conf-file') + '(semi-required: either defined here or in conf-file)') + parser.add_argument('-v', '--version', required=False, + help='Print version of the program and exit') args = parser.parse_args() @@ -638,7 +649,7 @@ def main(): outfile = args.out if os.path.isfile(outfile) and not args.force: - print outfile, "is an existing file. Please use -f or --force to overwrite the contents" + print(outfile, "is an existing file. Please use -f or --force to overwrite the contents") sys.exit(1) conf_args['upload'] = args.no_upload diff --git a/templates/cwl_json_gen.j2 b/ggr_cwl_ipynb_gen/templates/cwl_json_gen.j2 similarity index 100% rename from templates/cwl_json_gen.j2 rename to ggr_cwl_ipynb_gen/templates/cwl_json_gen.j2 diff --git a/templates/cwl_slurm_array_gen.j2 b/ggr_cwl_ipynb_gen/templates/cwl_slurm_array_gen.j2 similarity index 100% rename from templates/cwl_slurm_array_gen.j2 rename to ggr_cwl_ipynb_gen/templates/cwl_slurm_array_gen.j2 diff --git a/templates/data_upload.j2 b/ggr_cwl_ipynb_gen/templates/data_upload.j2 similarity index 100% rename from templates/data_upload.j2 rename to ggr_cwl_ipynb_gen/templates/data_upload.j2 diff --git a/templates/download_fastq_files.j2 b/ggr_cwl_ipynb_gen/templates/download_fastq_files.j2 similarity index 100% rename from templates/download_fastq_files.j2 rename to ggr_cwl_ipynb_gen/templates/download_fastq_files.j2 diff --git a/templates/generate_plots.j2 b/ggr_cwl_ipynb_gen/templates/generate_plots.j2 
similarity index 100% rename from templates/generate_plots.j2 rename to ggr_cwl_ipynb_gen/templates/generate_plots.j2 diff --git a/templates/generate_qc_cell.j2 b/ggr_cwl_ipynb_gen/templates/generate_qc_cell.j2 similarity index 100% rename from templates/generate_qc_cell.j2 rename to ggr_cwl_ipynb_gen/templates/generate_qc_cell.j2 diff --git a/templates/merge_lanes_fastq.j2 b/ggr_cwl_ipynb_gen/templates/merge_lanes_fastq.j2 similarity index 100% rename from templates/merge_lanes_fastq.j2 rename to ggr_cwl_ipynb_gen/templates/merge_lanes_fastq.j2 diff --git a/templates/ungzip_fastq_files.j2 b/ggr_cwl_ipynb_gen/templates/ungzip_fastq_files.j2 similarity index 100% rename from templates/ungzip_fastq_files.j2 rename to ggr_cwl_ipynb_gen/templates/ungzip_fastq_files.j2 diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..b5a3c46 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,6 @@ +[build-system] +requires = [ + "setuptools>=42", + "wheel" +] +build-backend = "setuptools.build_meta" \ No newline at end of file diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..e0a34ba --- /dev/null +++ b/requirements.txt @@ -0,0 +1,7 @@ +jinja2 >=2.8 +nbformat >=4.0.1 +numpy >=1.10.4 +pandas >=0.17.1 +xlrd >=1.0.0 +ruamel.yaml >=0.11.11 +pymongo >=3.4.0 diff --git a/setup.cfg b/setup.cfg new file mode 100644 index 0000000..4b62c45 --- /dev/null +++ b/setup.cfg @@ -0,0 +1,26 @@ +[metadata] +name = ggr-cwl-ipynb-gen +version = 0.5.0 +author = Alejandro Barrera +author_email = alejandro.barrera@duke.edu +description = IPython notebook generator for GGR CWL processing pipelines of genomic data +long_description = file: README.md +long_description_content_type = text/markdown +url = https://github.com/ReddyLab/ggr-cwl-ipynb-gen +project_urls = + Bug Tracker = https://github.com/ReddyLab/ggr-cwl-ipynb-gen/issues + Source = https://github.com/ReddyLab/ggr-cwl-ipynb-gen/ + +classifiers = + Development Status :: 3 - Alpha + Intended 
Audience :: Science/Research + Topic :: Scientific/Engineering :: Bio-Informatics + License :: OSI Approved :: MIT License + Programming Language :: Python :: 3 + Programming Language :: Python :: 3.6 + Programming Language :: Python :: 3.7 + Programming Language :: Python :: 3.8 + Programming Language :: Python :: 3.9 + +[options] +packages = find: diff --git a/setup.py b/setup.py new file mode 100644 index 0000000..7cf4142 --- /dev/null +++ b/setup.py @@ -0,0 +1,47 @@ +#!/usr/bin/env python3 +"""Description: +Setup script for ggr-cwl-ipynb-gen +IPython Notebook generator for processing genomic data from GGR project, in CWL + +This code is free software; you can redistribute it and/or modify it +under the terms of the MIT License (see the file LICENSE included with +the distribution). +""" +import sys +if sys.version_info[0] == 2: + sys.exit("Sorry, Python 2 is not supported anymore. Please check an old branch (pre 2021)") +from setuptools import setup, find_packages + +setup( + long_description_content_type='text/markdown', + url='https://github.com/alexbarrera/ggr-cwl-ipynb-gen', + keywords='cwl, bioinformatics, development', + scripts=[ + "ggr_cwl_ipynb_gen/ggr-cwl-ipynb-gen.py", + "chipdb_upload/data_upload.py", + ], + include_package_data=True, + py_modules=["ggr_cwl_ipynb_gen"], + python_requires='>=3.1', + install_requires=[ + 'jinja2 >=2.8', + 'nbformat >=4.0.1', + 'numpy >=1.10.4', + 'pandas >=0.17.1', + 'xlrd >=1.0.0', + 'ruamel.yaml >=0.11.11', + 'setuptools', + 'pymongo >=3.4.0' + ], # Optional + data_files=[ + ('templates', ['ggr_cwl_ipynb_gen/templates/cwl_json_gen.j2']), + ('templates', ['ggr_cwl_ipynb_gen/templates/cwl_slurm_array_gen.j2']), + ('templates', ['ggr_cwl_ipynb_gen/templates/data_upload.j2']), + ('templates', ['ggr_cwl_ipynb_gen/templates/download_fastq_files.j2']), + ('templates', ['ggr_cwl_ipynb_gen/templates/generate_plots.j2']), + ('templates', ['ggr_cwl_ipynb_gen/templates/generate_qc_cell.j2']), + ('templates', 
['ggr_cwl_ipynb_gen/templates/merge_lanes_fastq.j2']), + ('templates', ['ggr_cwl_ipynb_gen/templates/ungzip_fastq_files.j2']), + ] + +) \ No newline at end of file