ReddyLab · alexbarrera · Mar 15, 2021 · Mar 8, 2021 · Mar 9, 2021 · Mar 9, 2021
diff --git a/LICENSE b/LICENSE
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2021 Alejandro Barrera
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/README.md b/README.md
@@ -1,12 +1,12 @@
 # ggr-cwl-ipynb-gen
 Jupyter notebook generator to download and execute the processing files for GGR related datasets. 
 At this point, it is not intended to cover all use cases, but to serve as a quick generator of all 
-related files and scripts to pre-process sequences generated at the [Duke-GCB Sequencing Core](https://genome.duke.edu/cores-and-services/sequencing-and-genomic-technologies) in [HARDAC](https://genome.duke.edu/cores-and-services/computational-solutions/compute-environments-genomics).
+related files and scripts to pre-process genomic data generated at the [Duke-GCB Sequencing Core](https://genome.duke.edu/cores-and-services/sequencing-and-genomic-technologies) in [HARDAC](https://genome.duke.edu/cores-and-services/computational-solutions/compute-environments-genomics).
 
 Example of usage:
 ```
 $ python ggr-cwl-ipynb-gen.py \
-  --conf examples/conf.yaml \
+  --root-dir /path/to/rootdir \
   --metadata examples/Hong_3979_170316B1.xlsx \
   --out /path/to/output_dir \
   --force
@@ -16,33 +16,26 @@ The information in the example metadata and configuration file should reveal wha
 For a full list of options:
 ```
 $ python ggr-cwl-ipynb-gen.py -h
-usage: Generator of Jupyter notebooks to execute CWL pre-processing pipelines
-       [-h] -o OUT -c CONF_FILE -m METADATA [-f] [-n] [--metadata-sep SEP]
-       [--project-name PROJECT_NAME] [--data-from {sftp,miseq,other,dukeds}]
+usage: Generator of Jupyter notebooks to execute CWL pre-processing pipelines [-h] -o OUT -m METADATA [-f] [-n] [--metadata-sep SEP] [--project-name PROJECT_NAME]
+                                                                              [--data-from {sftp,miseq,other,dukeds,local}] [-c CONF_FILE] [-u USER] [-e USER_DUKE_EMAIL] [-r ROOT_DIR]
 
 optional arguments:
   -h, --help            show this help message and exit
   -o OUT, --out OUT     Jupyter notebook output file name
-  -c CONF_FILE, --conf-file CONF_FILE
-                        YAML configuration file (see examples)
   -m METADATA, --metadata METADATA
                         Metadata file with samples information
   -f, --force           Force to overwrite output file
-  -n, --no-upload       Avoids uploading generated data to database when
-                        specified
-  --metadata-sep SEP    Separator for metadata file (when different than Excel
-                        spread sheet)
+  -n, --no-upload       Avoids uploading generated data to database when specified
+  --metadata-sep SEP    Separator for metadata file (when different than Excel spread sheet)
   --project-name PROJECT_NAME
-                        Project name (by default, basename of metadata file
-                        name)
-  --data-from {sftp,miseq,other,dukeds}
-                        Choices: sftp, miseq, other, dukeds
+                        Project name (by default, basename of metadata file name)
+  --data-from {sftp,miseq,other,dukeds,local}
+                        Choices: sftp, miseq, other, dukeds, local
+  -c CONF_FILE, --conf-file CONF_FILE
+                        YAML configuration file (see examples)
+  -u USER, --user USER  HARDAC User used in SLURM (default: ${USER})
+  -e USER_DUKE_EMAIL, --user-duke-email USER_DUKE_EMAIL
+                        Email(s) notified when execution is finished (default: ${USER}@duke.edu)
+  -r ROOT_DIR, --root-dir ROOT_DIR
+                        Root directory where all subfolders and files will be created (semi-required: either defined here or in conf-file)
 ```
-
-### Dependencies
-- jinja2 >=2.8
-- nbformat >=4.0.1
-- numpy >=1.10.4
-- pandas >=0.17.1
-- xlrd >=1.0.0
-- ruamel >=0.11.11
diff --git a/chipdb_upload/__init__.py b/chipdb_upload/__init__.py
diff --git a/data_upload.py → chipdb_upload/data_upload.py b/data_upload.py → chipdb_upload/data_upload.py
@@ -1,10 +1,10 @@
+#!/usr/bin/env python
 from pymongo import MongoClient
 import datetime
 import os, csv
 import argparse
 import pandas as pd
 import base64
-import consts
 import logging
 
 # Python script and command line tool for compiling fingerprint and QC data from ChIP-seq

diff --git a/ggr_cwl_ipynb_gen/__init__.py b/ggr_cwl_ipynb_gen/__init__.py
diff --git a/consts.py → ggr_cwl_ipynb_gen/consts.py b/consts.py → ggr_cwl_ipynb_gen/consts.py
@@ -53,3 +53,6 @@
 qc_script_dir = '/data/reddylab/software/cwl/bin'
 data_upload_script = '/data/reddylab/Darryl/GitHub/reddylab/csv_to_mongo.py'
 HOST_FOR_TUNNELED_DOWNLOAD = "Hardac-xfer.genome.duke.edu"
+
+# Package constants
+PACKAGE_NAME = "ggr_cwl_ipynb_gen"
diff --git a/ggr-cwl-ipynb-gen.py → ggr_cwl_ipynb_gen/ggr-cwl-ipynb-gen.py b/ggr-cwl-ipynb-gen.py → ggr_cwl_ipynb_gen/ggr-cwl-ipynb-gen.py
@@ -1,27 +1,34 @@
+#!/usr/bin/env python
 import argparse
 import nbformat
 import nbformat.v3 as nbf
 import sys
 import os
 import pandas as pd
-from jinja2 import FileSystemLoader
+from jinja2 import FileSystemLoader, PackageLoader
 from xlrd import XLRDError
 import ruamel.yaml
-import consts
+import ggr_cwl_ipynb_gen.consts as consts
 import jinja2
 import inspect
-import glob
+from jinja2.exceptions import TemplateNotFound
 import numpy as np
 
 encoding = sys.getfilesystemencoding()
-EXEC_DIR = os.path.dirname(unicode(__file__, encoding))
+EXEC_DIR = os.path.dirname(str(__file__))
 
 
 def render(tpl_path, context):
     path, filename = os.path.split(tpl_path)
-    return jinja2.Environment(
-        loader=FileSystemLoader(os.path.join(EXEC_DIR, "templates"))
-    ).get_template(filename).render(context)
+    try:
+        jinja_rendered = jinja2.Environment(
+            loader=FileSystemLoader(os.path.join(EXEC_DIR, "templates"))
+        ).get_template(filename).render(context)
+    except TemplateNotFound:
+        jinja_rendered = jinja2.Environment(
+            loader=PackageLoader(consts.PACKAGE_NAME, "templates")
+        ).get_template(filename).render(context)
+    return jinja_rendered
 
 
 class Cell(object):
@@ -209,7 +216,7 @@ def merge_fastq_files(conf_args, lib_type, metadata_filename=None, num_samples=N
                               array="0-%d%%20" % (num_samples-1),
                               partition=",".join(consts.slurm_partitions),
                               script_output="%s/%s_%s_%%a.out" % (logs_dir, conf_args['project_name'],
-                                                                  inspect.stack()[0][3]),)
+                                                                  inspect.stack()[0][3]), )
     cells.extend(execute_cell.to_list())
 
     return cells
@@ -432,7 +439,7 @@ def data_upload(conf_args, lib_type, pipeline_type):
                               depends_on=True,
                               prolog=["source %s alex" % consts.conda_activate],
                               partition=",".join(consts.slurm_partitions),
-                            description="### Upload ChIP-seq to web-application")
+                              description="### Upload ChIP-seq to web-application")
     cells.extend(execute_cell.to_list())
 
     return cells
@@ -566,10 +573,11 @@ def get_samples_by_library_type(metadata_file, sep='\t'):
     :return: generator of panda's dataframe
     """
     try:
-        md = pd.read_excel(metadata_file,
+        md = pd.read_excel(metadata_file.name,
                            true_values=['Yes', 'Y', 'yes', 'y', 1],
                            false_values=['No', 'N', 'no', 'n', 0])
     except XLRDError:
+        print (XLRDError)
         md = pd.read_csv(metadata_file.name,
                          true_values=['Yes', 'Y', 'yes', 'y', 1],
                          false_values=['No', 'N', 'no', 'n', 0], sep=sep)
@@ -592,8 +600,8 @@ def init_conf_args(args,
         conf_args[r] = args[r] if (r in args and args[r]) else conf_args[r]
         try:
             assert conf_args[r] is not None
-        except AssertionError, e:
-            print "[ERROR]", r, "not defined"
+        except AssertionError as e:
+            print("[ERROR]", r, "not defined")
             raise
     for o in optional_args:
         conf_args[o] = args[o] if (o in args and args[o]) else (conf_args[o] if o in conf_args else None)
@@ -606,7 +614,8 @@ def init_conf_args(args,
 def main():
     parser = argparse.ArgumentParser('Generator of Jupyter notebooks to execute CWL pre-processing pipelines')
     parser.add_argument('-o', '--out', required=True, type=str, help='Jupyter notebook output file name')
-    parser.add_argument('-m', '--metadata', required=True, type=file, help='Metadata file with samples information')
+    parser.add_argument('-m', '--metadata', required=True, type=argparse.FileType('r'),
+                        help='Metadata file with samples information')
     parser.add_argument('-f', '--force', action='store_true', help='Force to overwrite output file')
     parser.add_argument('-n', '--no-upload', action='store_false', 
                         help='Avoids uploading generated data to database when specified')
@@ -617,14 +626,16 @@ def main():
     parser.add_argument('--data-from', required=False, choices=consts.data_sources,
                         default=consts.data_sources[0],
                         help='Choices: %s' % (', '.join(consts.data_sources)))
-    parser.add_argument('-c', '--conf-file', required=False, type=file, help='YAML configuration file (see examples)')
+    parser.add_argument('-c', '--conf-file', required=False, type=argparse.FileType('r'), help='YAML configuration file (see examples)')
     parser.add_argument('-u', '--user', required=False,
                         help='HARDAC User used in SLURM (default: ${USER})')
     parser.add_argument('-e', '--user-duke-email', required=False,
                         help='Email(s) notified when execution is finished (default: ${USER}@duke.edu)')
     parser.add_argument('-r', '--root-dir', required=False,
                         help='Root directory where all subfolders and files will be created '
-                             '(semi-required: either defined here or in conf-file')
+                             '(semi-required: either defined here or in conf-file)')
+    parser.add_argument('-v', '--version', required=False,
+                        help='Print version of the program and exit')
 
     args = parser.parse_args()
 
@@ -638,7 +649,7 @@ def main():
         outfile = args.out
 
     if os.path.isfile(outfile) and not args.force:
-        print outfile, "is an existing file. Please use -f or --force to overwrite the contents"
+        print(outfile, "is an existing file. Please use -f or --force to overwrite the contents")
         sys.exit(1)
 
     conf_args['upload'] = args.no_upload

diff --git a/templates/cwl_json_gen.j2 → ggr_cwl_ipynb_gen/templates/cwl_json_gen.j2 b/templates/cwl_json_gen.j2 → ggr_cwl_ipynb_gen/templates/cwl_json_gen.j2
diff --git a/templates/cwl_slurm_array_gen.j2 → ...pynb_gen/templates/cwl_slurm_array_gen.j2 b/templates/cwl_slurm_array_gen.j2 → ...pynb_gen/templates/cwl_slurm_array_gen.j2
diff --git a/templates/data_upload.j2 → ggr_cwl_ipynb_gen/templates/data_upload.j2 b/templates/data_upload.j2 → ggr_cwl_ipynb_gen/templates/data_upload.j2
diff --git a/templates/download_fastq_files.j2 → ...ynb_gen/templates/download_fastq_files.j2 b/templates/download_fastq_files.j2 → ...ynb_gen/templates/download_fastq_files.j2
diff --git a/templates/generate_plots.j2 → ...cwl_ipynb_gen/templates/generate_plots.j2 b/templates/generate_plots.j2 → ...cwl_ipynb_gen/templates/generate_plots.j2
diff --git a/templates/generate_qc_cell.j2 → ...l_ipynb_gen/templates/generate_qc_cell.j2 b/templates/generate_qc_cell.j2 → ...l_ipynb_gen/templates/generate_qc_cell.j2
diff --git a/templates/merge_lanes_fastq.j2 → ..._ipynb_gen/templates/merge_lanes_fastq.j2 b/templates/merge_lanes_fastq.j2 → ..._ipynb_gen/templates/merge_lanes_fastq.j2
diff --git a/templates/ungzip_fastq_files.j2 → ...ipynb_gen/templates/ungzip_fastq_files.j2 b/templates/ungzip_fastq_files.j2 → ...ipynb_gen/templates/ungzip_fastq_files.j2
diff --git a/pyproject.toml b/pyproject.toml
@@ -0,0 +1,6 @@
+[build-system]
+requires = [
+    "setuptools>=42",
+    "wheel"
+]
+build-backend = "setuptools.build_meta"
diff --git a/requirements.txt b/requirements.txt
@@ -0,0 +1,7 @@
+jinja2 >=2.8
+nbformat >=4.0.1
+numpy >=1.10.4
+pandas >=0.17.1
+xlrd >=1.0.0
+ruamel.yaml >=0.11.11
+pymongo >=3.4.0
diff --git a/setup.cfg b/setup.cfg
@@ -0,0 +1,26 @@
+[metadata]
+name = ggr-cwl-ipynb-gen
+version = 0.5.0
+author = Alejandro Barrera
+author_email = alejandro.barrera@duke.edu
+description = IPython notebook generator for GGR CWL processing pipelines of genomic data
+long_description = file: README.md
+long_description_content_type = text/markdown
+url = https://github.com/ReddyLab/ggr-cwl-ipynb-gen
+project_urls =
+    Bug Tracker = https://github.com/ReddyLab/ggr-cwl-ipynb-gen/issues
+    Source = https://github.com/ReddyLab/ggr-cwl-ipynb-gen/
+
+classifiers =
+    Development Status :: 3 - Alpha
+    Intended Audience :: Science/Research
+    Topic :: Scientific/Engineering :: Bio-Informatics
+    License :: OSI Approved :: MIT License
+    Programming Language :: Python :: 3
+    Programming Language :: Python :: 3.6
+    Programming Language :: Python :: 3.7
+    Programming Language :: Python :: 3.8
+    Programming Language :: Python :: 3.9
+
+[options]
+packages = find:
diff --git a/setup.py b/setup.py
@@ -0,0 +1,47 @@
+#!/usr/bin/env python3
+"""Description:
+Setup script for ggr-cwl-ipynb-gen
+IPython Notebook generator for processing genomic data from GGR project, in CWL
+
+This code is free software; you can redistribute it and/or modify it
+under the terms of the MIT License (see the file LICENSE included with
+the distribution).
+"""
+import sys
+if sys.version_info[0] == 2:
+    sys.exit("Sorry, Python 2 is not supported anymore. Please check and old branch (pre 2021)")
+from setuptools import setup, find_packages
+
+setup(
+    long_description_content_type='text/markdown',
+    url='https://github.com/alexbarrera/ggr-cwl-ipynb-gen',
+    keywords='cwl, bioinformatics, development',
+    scripts=[
+        "ggr_cwl_ipynb_gen/ggr-cwl-ipynb-gen.py",
+        "chipdb_upload/data_upload.py",
+    ],
+    include_package_data=True,
+    py_modules=["ggr_cwl_ipynb_gen"],
+    python_requires='>=3.1',
+    install_requires=[
+        'jinja2 >=2.8',
+        'nbformat >=4.0.1',
+        'numpy >=1.10.4',
+        'pandas >=0.17.1',
+        'xlrd >=1.0.0',
+        'ruamel.yaml >=0.11.11',
+        'setuptools',
+        'pymongo >=3.4.0'
+    ],  # Optional
+    data_files=[
+        ('templates', ['ggr_cwl_ipynb_gen/templates/cwl_json_gen.j2']),
+        ('templates', ['ggr_cwl_ipynb_gen/templates/cwl_slurm_array_gen.j2']),
+        ('templates', ['ggr_cwl_ipynb_gen/templates/data_upload.j2']),
+        ('templates', ['ggr_cwl_ipynb_gen/templates/download_fastq_files.j2']),
+        ('templates', ['ggr_cwl_ipynb_gen/templates/generate_plots.j2']),
+        ('templates', ['ggr_cwl_ipynb_gen/templates/generate_qc_cell.j2']),
+        ('templates', ['ggr_cwl_ipynb_gen/templates/merge_lanes_fastq.j2']),
+        ('templates', ['ggr_cwl_ipynb_gen/templates/ungzip_fastq_files.j2']),
+    ]
+
+)