From b504e74198109292104b0f5c89c0f763867244ef Mon Sep 17 00:00:00 2001 From: Alejandro Barrera Date: Mon, 8 Mar 2021 15:45:15 -0500 Subject: [PATCH 1/6] Upgrade code to python 3 --- ggr-cwl-ipynb-gen.py | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/ggr-cwl-ipynb-gen.py b/ggr-cwl-ipynb-gen.py index 6afc68d..18ce00b 100644 --- a/ggr-cwl-ipynb-gen.py +++ b/ggr-cwl-ipynb-gen.py @@ -10,11 +10,10 @@ import consts import jinja2 import inspect -import glob import numpy as np encoding = sys.getfilesystemencoding() -EXEC_DIR = os.path.dirname(unicode(__file__, encoding)) +EXEC_DIR = os.path.dirname(str(__file__)) def render(tpl_path, context): @@ -566,10 +565,11 @@ def get_samples_by_library_type(metadata_file, sep='\t'): :return: generator of panda's dataframe """ try: - md = pd.read_excel(metadata_file, + md = pd.read_excel(metadata_file.name, true_values=['Yes', 'Y', 'yes', 'y', 1], false_values=['No', 'N', 'no', 'n', 0]) except XLRDError: + print (XLRDError) md = pd.read_csv(metadata_file.name, true_values=['Yes', 'Y', 'yes', 'y', 1], false_values=['No', 'N', 'no', 'n', 0], sep=sep) @@ -592,8 +592,8 @@ def init_conf_args(args, conf_args[r] = args[r] if (r in args and args[r]) else conf_args[r] try: assert conf_args[r] is not None - except AssertionError, e: - print "[ERROR]", r, "not defined" + except AssertionError as e: + print("[ERROR]", r, "not defined") raise for o in optional_args: conf_args[o] = args[o] if (o in args and args[o]) else (conf_args[o] if o in conf_args else None) @@ -606,7 +606,8 @@ def init_conf_args(args, def main(): parser = argparse.ArgumentParser('Generator of Jupyter notebooks to execute CWL pre-processing pipelines') parser.add_argument('-o', '--out', required=True, type=str, help='Jupyter notebook output file name') - parser.add_argument('-m', '--metadata', required=True, type=file, help='Metadata file with samples information') + parser.add_argument('-m', '--metadata', required=True, type=argparse.FileType('r'), + help='Metadata file with samples information') parser.add_argument('-f', '--force', action='store_true', help='Force to overwrite output file') parser.add_argument('-n', '--no-upload', action='store_false', help='Avoids uploading generated data to database when specified') @@ -617,7 +618,7 @@ def main(): parser.add_argument('--data-from', required=False, choices=consts.data_sources, default=consts.data_sources[0], help='Choices: %s' % (', '.join(consts.data_sources))) - parser.add_argument('-c', '--conf-file', required=False, type=file, help='YAML configuration file (see examples)') + parser.add_argument('-c', '--conf-file', required=False, type=argparse.FileType('r'), help='YAML configuration file (see examples)') parser.add_argument('-u', '--user', required=False, help='HARDAC User used in SLURM (default: ${USER})') parser.add_argument('-e', '--user-duke-email', required=False, @@ -638,7 +639,7 @@ def main(): outfile = args.out if os.path.isfile(outfile) and not args.force: - print outfile, "is an existing file. Please use -f or --force to overwrite the contents" + print(outfile, "is an existing file. 
Please use -f or --force to overwrite the contents") sys.exit(1) conf_args['upload'] = args.no_upload From 2613d3a91be5d6a32caf60193ac0677ec76167a6 Mon Sep 17 00:00:00 2001 From: Alejandro Barrera Date: Tue, 9 Mar 2021 10:29:37 -0500 Subject: [PATCH 2/6] Moderate refactor needed to create an installable package --- LICENSE | 21 + MANIFEST.in | 1 + README.md | 39 +- VERSION.py | 1 + chipdb_upload/__init__.py | 0 chipdb_upload/data_upload.py | 257 +++++++ data_upload.py | 258 +------ ggr-cwl-ipynb-gen.py | 654 +---------------- ggr_cwl_ipynb_gen/__init__.py | 2 + consts.py => ggr_cwl_ipynb_gen/consts.py | 3 + ggr_cwl_ipynb_gen/ggr-cwl-ipynb-gen.py | 661 ++++++++++++++++++ .../templates}/cwl_json_gen.j2 | 0 .../templates}/cwl_slurm_array_gen.j2 | 0 .../templates}/data_upload.j2 | 0 .../templates}/download_fastq_files.j2 | 0 .../templates}/generate_plots.j2 | 0 .../templates}/generate_qc_cell.j2 | 0 .../templates}/merge_lanes_fastq.j2 | 0 .../templates}/ungzip_fastq_files.j2 | 0 pyproject.toml | 6 + requirements.txt | 9 + setup.cfg | 18 + setup.py | 76 ++ 23 files changed, 1073 insertions(+), 933 deletions(-) create mode 100644 LICENSE create mode 100644 MANIFEST.in create mode 100644 VERSION.py create mode 100644 chipdb_upload/__init__.py create mode 100644 chipdb_upload/data_upload.py mode change 100644 => 120000 data_upload.py mode change 100644 => 120000 ggr-cwl-ipynb-gen.py create mode 100644 ggr_cwl_ipynb_gen/__init__.py rename consts.py => ggr_cwl_ipynb_gen/consts.py (97%) create mode 100644 ggr_cwl_ipynb_gen/ggr-cwl-ipynb-gen.py rename {templates => ggr_cwl_ipynb_gen/templates}/cwl_json_gen.j2 (100%) rename {templates => ggr_cwl_ipynb_gen/templates}/cwl_slurm_array_gen.j2 (100%) rename {templates => ggr_cwl_ipynb_gen/templates}/data_upload.j2 (100%) rename {templates => ggr_cwl_ipynb_gen/templates}/download_fastq_files.j2 (100%) rename {templates => ggr_cwl_ipynb_gen/templates}/generate_plots.j2 (100%) rename {templates => ggr_cwl_ipynb_gen/templates}/generate_qc_cell.j2 (100%) rename {templates => ggr_cwl_ipynb_gen/templates}/merge_lanes_fastq.j2 (100%) rename {templates => ggr_cwl_ipynb_gen/templates}/ungzip_fastq_files.j2 (100%) create mode 100644 pyproject.toml create mode 100644 requirements.txt create mode 100644 setup.cfg create mode 100644 setup.py diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..8017ac4 --- /dev/null +++ b/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2021 Alejandro Barrera + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
\ No newline at end of file diff --git a/MANIFEST.in b/MANIFEST.in new file mode 100644 index 0000000..903bcc4 --- /dev/null +++ b/MANIFEST.in @@ -0,0 +1 @@ +include templates/* \ No newline at end of file diff --git a/README.md b/README.md index 2cbc1a8..67a277a 100644 --- a/README.md +++ b/README.md @@ -1,12 +1,12 @@ # ggr-cwl-ipynb-gen Jupyter notebook generator to download and execute the processing files for GGR related datasets. At this point, is not intented to cover all use cases, but to serve as a quick generator of all -related files and scripts to pre-process sequences generated at the [Duke-GCB Sequencing Core](https://genome.duke.edu/cores-and-services/sequencing-and-genomic-technologies) in [HARDAC](https://genome.duke.edu/cores-and-services/computational-solutions/compute-environments-genomics). +related files and scripts to pre-process genomic data generated at the [Duke-GCB Sequencing Core](https://genome.duke.edu/cores-and-services/sequencing-and-genomic-technologies) in [HARDAC](https://genome.duke.edu/cores-and-services/computational-solutions/compute-environments-genomics). Example of usage: ``` $ python ggr-cwl-ipynb-gen.py \ - --conf examples/conf.yaml \ + --root-dir /path/to/rootdir \ --metadata examples/Hong_3979_170316B1.xlsx \ --out /path/to/output_dir \ --force @@ -16,33 +16,26 @@ The information in the example metadata and configuration file should reveal wha For a full list of options: ``` $ python ggr-cwl-ipynb-gen.py -h -usage: Generator of Jupyter notebooks to execute CWL pre-processing pipelines - [-h] -o OUT -c CONF_FILE -m METADATA [-f] [-n] [--metadata-sep SEP] - [--project-name PROJECT_NAME] [--data-from {sftp,miseq,other,dukeds}] +usage: Generator of Jupyter notebooks to execute CWL pre-processing pipelines [-h] -o OUT -m METADATA [-f] [-n] [--metadata-sep SEP] [--project-name PROJECT_NAME] + [--data-from {sftp,miseq,other,dukeds,local}] [-c CONF_FILE] [-u USER] [-e USER_DUKE_EMAIL] [-r ROOT_DIR] optional arguments: -h, --help show this help message and exit -o OUT, --out OUT Jupyter notebook output file name - -c CONF_FILE, --conf-file CONF_FILE - YAML configuration file (see examples) -m METADATA, --metadata METADATA Metadata file with samples information -f, --force Force to overwrite output file - -n, --no-upload Avoids uploading generated data to database when - specified - --metadata-sep SEP Separator for metadata file (when different than Excel - spread sheet) + -n, --no-upload Avoids uploading generated data to database when specified + --metadata-sep SEP Separator for metadata file (when different than Excel spread sheet) --project-name PROJECT_NAME - Project name (by default, basename of metadata file - name) - --data-from {sftp,miseq,other,dukeds} - Choices: sftp, miseq, other, dukeds + Project name (by default, basename of metadata file name) + --data-from {sftp,miseq,other,dukeds,local} + Choices: sftp, miseq, other, dukeds, local + -c CONF_FILE, --conf-file CONF_FILE + YAML configuration file (see examples) + -u USER, --user USER HARDAC User used in SLURM (default: ${USER}) + -e USER_DUKE_EMAIL, --user-duke-email USER_DUKE_EMAIL + Email(s) notified when execution is finished (default: ${USER}@duke.edu) + -r ROOT_DIR, --root-dir ROOT_DIR + Root directory where all subfolders and files will be created (semi-required: either defined here or in conf-file) ``` - -### Dependencies -- jinja2 >=2.8 -- nbformat >=4.0.1 -- numpy >=1.10.4 -- pandas >=0.17.1 -- xlrd >=1.0.0 -- ruamel >=0.11.11 diff --git a/VERSION.py b/VERSION.py new file mode 
100644 index 0000000..f1c763a --- /dev/null +++ b/VERSION.py @@ -0,0 +1 @@ +VERSION = '0.5.0' \ No newline at end of file diff --git a/chipdb_upload/__init__.py b/chipdb_upload/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/chipdb_upload/data_upload.py b/chipdb_upload/data_upload.py new file mode 100644 index 0000000..47a875f --- /dev/null +++ b/chipdb_upload/data_upload.py @@ -0,0 +1,257 @@ +#!/usr/bin/env python +from pymongo import MongoClient +import datetime +import os, csv +import argparse +import pandas as pd +import base64 +import logging + +# Python script and command line tool for compiling fingerprint and QC data from ChIP-seq +# experiments. Make sure to activate the 'alex' virtual environment from miniconda using +# `source /data/reddylab/software/miniconda2/bin/activate alex` command from HARDAC. To +# run full workflow, run the `countFactors_standard.sh` that outputs data directories +# then run this script on those outputs. + +# VERSION 1.0 Notes: pandas.dataframe.set_value() method is deprecated and will be removed +# in later iterations. + + +CWD = os.getcwd() + "/" +OUT_DIR = CWD + "QC_summary/" + + +def pretty_print(df): + with pd.option_context('display.max_rows', None, 'display.max_columns', len(df)): + print(df) + + +def stringFormat(string): + return string.strip().lower().replace(" ", "_").replace('-', '_').replace("%", "percent") + + +def read_file_base64(in_file): + """ + Helper function that reads file into binary + :param in_file: Absolute path to file + :return: The file contents as a string + """ + try: + with open(in_file, 'rb') as f: + return base64.b64encode(f.read()) + # Exception for symlinks + except IOError: + with open(os.readlink(in_file), 'rb') as f: + return base64.b64encode(f.read()) + + +def read_metadata(in_file): + """ + Helper function that reads a metadata file and returns a dictionary of values + :param in_file: The full metadata file path as a string + :return: A dictionary of the files' attributes + """ + attr = {} + # Read a 2-line tab-delimited file with header and contents + with open(in_file, 'rb') as f: + reader = csv.reader(f, delimiter='\t') + header = [stringFormat(ele) for ele in next(reader)] + contents = [stringFormat(ele) for ele in next(reader)] + attr = dict(zip(header, contents)) + + return attr + + +def standardize_header(arr): + """Returns a dataframe header as list, standardized to + QC naming convention + :param arr: A list of strings representing header + :return: Standardized column names, list of strings + """ + header_dict = {"sample": "sample", "raw": "reads_sequenced", + "reads_sequenced": "reads_sequenced", "reads after trimming": "reads_after_trimming", + "trimmed": "reads_after_trimming", "mapped": "reads_mapped", + "reads_mapped": "reads_mapped", "percentage_unique": "percent_unique", + "%reads unique": "percent_unique", + "percentage_unique_mapped_and_filtered": "percent_unique_mapped_filtered", + "%reads mapped after de-dup & filtering": "percent_unique_mapped_filtered", + "reads in peaks": "reads_in_peaks", "in_peaks": "reads_in_peaks", + "percent_in_peaks": "percent_in_peaks", "% reads in peaks": "percent_in_peaks", + "broadpeak_count": "broad_peak_count", "narrowpeak_count": "narrow_peak_count", + "nrf": "nrf", "pbc": "pbc_one", "nsc": "nsc", "rsc": "rsc", "comment": "comment"} + elements = [] + useColumns = [] + for i, ele in enumerate(arr): + if ele.lower() in header_dict.keys(): + elements.append(header_dict[ele.lower()]) + useColumns.append(i) + return elements, useColumns + + 
+def process_directory(in_dir): + """ + Processes data in directory, returns as Pandas dataframe + :param in_dir: Input data directory, String + :return: A Pandas dataframe containing fingerprint data, QCs, and images + """ + qc_file = "" + fingerprint_qc_arr = [] + spp_data_arr = [] + images = [] + metadata_files = [] + # Separate files into appropriate lists + for filename in os.listdir(in_dir): + # Append the file path + file_path = os.path.join(in_dir, filename) + if os.stat(file_path).st_size != 0: + if filename.lower().endswith('_metadata.txt'): # Find metadata + metadata_files.append(file_path) + elif filename.endswith('_QCmetrics.txt'): # If fingerprint QC file, add to array + fingerprint_qc_arr.append(file_path) + elif filename.lower() == 'qc.csv' or filename.lower() == 'qc.txt' \ + or filename.lower() == 'chip_seq_summary_iter0.tsv': # If lab-computed QC file, set var + qc_file = file_path + elif filename.endswith(".png") or filename.endswith(".pdf"): + images.append(file_path) + elif filename.endswith('.cross_corr.txt'): # If cross corr data, add to array + spp_data_arr.append(file_path) + + # Raise error if QC file was not found. + if not os.path.isfile(qc_file): + logging.error("QC file was not found in the data directory (i.e. qc.csv, qc.txt)") + raise IOError("QC file was not found in the data directory (i.e. qc.csv, qc.txt)") + + # Process QC file into a dataframe + try: + with open(qc_file, 'rb') as f: + # Find delimiter using Sniffer class + dialect = csv.Sniffer().sniff(f.readline(), ['\t', ',']) + reader = csv.reader(f, delimiter=dialect.delimiter) + f.seek(0) + column_names = standardize_header(next(reader)) + # Read data into Pandas dataframe + df = pd.read_csv(f, delimiter=dialect.delimiter, header=None, + names=column_names[0], usecols=column_names[1], engine='python') + # Catch if the filename is not an actual file, but a symlink + except IOError: + with open(os.readlink(qc_file), 'rb') as f: + # Find delimiter using Sniffer class + dialect = csv.Sniffer().sniff(f.readline(), ['\t', ',']) + reader = csv.reader(f, delimiter=dialect.delimiter) + f.seek(0) + column_names = standardize_header(next(reader)) + # Read data into Pandas dataframe + df = pd.read_csv(f, delimiter=dialect.delimiter, header=None, + names=column_names[0], usecols=column_names[1], engine='python') + + # Index the dataframe by sample + df.set_index('sample', inplace=True) + + # If there are fingerprint files, add to array + if fingerprint_qc_arr: + # Add fingerprint data to dataframe + fp_df = pd.DataFrame() + for filename in fingerprint_qc_arr: + if os.stat(filename).st_size != 0: + with open(filename, 'rb') as f: + reader = csv.reader(f, delimiter='\t') + header = [stringFormat(ele) for ele in next(reader)] + new_fp_df = pd.read_csv(f, delimiter='\t', header=None, + names=header, engine='python') + fp_df = fp_df.append(new_fp_df) + fp_df.drop_duplicates(subset='sample', keep='last', inplace=True) + fp_df.set_index('sample', inplace=True) + df = df.merge(fp_df, left_index=True, right_index=True, how='outer') + + # Add fingerprint images and metadata information + for sample in df.index.values: # Index is sample name + fp_image = '' + spp_image = '' + metadata_file = '' + for filename in images: + if filename.endswith('.png') and sample in filename: + fp_image = filename + elif filename.endswith('.pdf') and sample in filename: + spp_image = filename + for filename in metadata_files: + if sample in filename: + metadata_file = filename + if fp_image: + df.set_value(sample, 'fp_image', 
read_file_base64(fp_image)) + if spp_image: + df.set_value(sample, 'spp_image', read_file_base64(spp_image)) + if metadata_file: + # Read in all metadata attributes into df + for key, value in read_metadata(metadata_file).iteritems(): + df.set_value(sample, key, value) + # Set flowcell name to base directory + df.set_value(sample, 'flowcell', os.path.basename(in_dir)) + + return df + + +def main(): + parser = argparse.ArgumentParser('Generates QC metric summary file for available ChIP-seq samples') + parser.add_argument('-i', '--in_dirs', required=True, nargs='+', + help='Directory(ies)for fingerprint data') + parser.add_argument('-u', '--uri', required=True, + help='URI for database upload') + parser.add_argument('-d', '--database', required=True, + help='Database name for upload') + parser.add_argument('-c', '--collection', required=True, + help='Collection name for database') + parser.add_argument('-o', '--output', required=True, help="Filename for output log") + args = parser.parse_args() + + logging.basicConfig(filename=args.output, level=logging.DEBUG) + + + # Process each given data directory + df = pd.DataFrame() + for i in range(len(args.in_dirs)): + if os.path.isdir(args.in_dirs[i]): + new_df = process_directory(args.in_dirs[i]) + df = df.append(new_df) + factor_names = [row.split('.')[0] for row in df.index.values] + df.rename(columns={'diff._enrichment':'diff_enrichment'}, inplace=True) + + # Convert Pandas dataframe into list of dictionaries + data = df.to_dict(orient='index') + + # Insert documents (list of dicts) to web-application database + uri = args.uri + client = MongoClient(uri) + sample_coll = client[args.database][args.collection] + flowcell_coll = client[args.database]["flowcell"] + + # Initialize a flowcell data + flowcell_name = "" + flowcell_data = {"samples": []} + + # For each sample, replace if it exists, otherwise insert (upsert) + for sample_name in data: + # Set sample data + sample = data[sample_name] + sample['sample'] = sample_name + sample['last_modified'] = datetime.datetime.utcnow() + logging.info("Uploading sample: %s" % sample_name) + sample_coll.replace_one({'sample': sample_name}, sample, upsert=True) + + # Set flowcell data + flowcell_name = sample['flowcell'] + flowcell_data['name'] = flowcell_name + flowcell_data['date'] = sample['timestamp'] + flowcell_data['samples'].append(sample_name) + + # Upsert the flowcell + logging.info("Uploading flowcell: %s" % flowcell_data) + flowcell_coll.replace_one({'name': flowcell_name}, flowcell_data, upsert=True) + + logging.info("Data upload terminated successfully") + + + return + + +if __name__ == '__main__': + main() diff --git a/data_upload.py b/data_upload.py deleted file mode 100644 index 91116fd..0000000 --- a/data_upload.py +++ /dev/null @@ -1,257 +0,0 @@ -from pymongo import MongoClient -import datetime -import os, csv -import argparse -import pandas as pd -import base64 -import consts -import logging - -# Python script and command line tool for compiling fingerprint and QC data from ChIP-seq -# experiments. Make sure to activate the 'alex' virtual environment from miniconda using -# `source /data/reddylab/software/miniconda2/bin/activate alex` command from HARDAC. To -# run full workflow, run the `countFactors_standard.sh` that outputs data directories -# then run this script on those outputs. - -# VERSION 1.0 Notes: pandas.dataframe.set_value() method is deprecated and will be removed -# in later iterations. 
- - -CWD = os.getcwd() + "/" -OUT_DIR = CWD + "QC_summary/" - - -def pretty_print(df): - with pd.option_context('display.max_rows', None, 'display.max_columns', len(df)): - print(df) - - -def stringFormat(string): - return string.strip().lower().replace(" ", "_").replace('-', '_').replace("%", "percent") - - -def read_file_base64(in_file): - """ - Helper function that reads file into binary - :param in_file: Absolute path to file - :return: The file contents as a string - """ - try: - with open(in_file, 'rb') as f: - return base64.b64encode(f.read()) - # Exception for symlinks - except IOError: - with open(os.readlink(in_file), 'rb') as f: - return base64.b64encode(f.read()) - - -def read_metadata(in_file): - """ - Helper function that reads a metadata file and returns a dictionary of values - :param in_file: The full metadata file path as a string - :return: A dictionary of the files' attributes - """ - attr = {} - # Read a 2-line tab-delimited file with header and contents - with open(in_file, 'rb') as f: - reader = csv.reader(f, delimiter='\t') - header = [stringFormat(ele) for ele in next(reader)] - contents = [stringFormat(ele) for ele in next(reader)] - attr = dict(zip(header, contents)) - - return attr - - -def standardize_header(arr): - """Returns a dataframe header as list, standardized to - QC naming convention - :param arr: A list of strings representing header - :return: Standardized column names, list of strings - """ - header_dict = {"sample": "sample", "raw": "reads_sequenced", - "reads_sequenced": "reads_sequenced", "reads after trimming": "reads_after_trimming", - "trimmed": "reads_after_trimming", "mapped": "reads_mapped", - "reads_mapped": "reads_mapped", "percentage_unique": "percent_unique", - "%reads unique": "percent_unique", - "percentage_unique_mapped_and_filtered": "percent_unique_mapped_filtered", - "%reads mapped after de-dup & filtering": "percent_unique_mapped_filtered", - "reads in peaks": "reads_in_peaks", "in_peaks": "reads_in_peaks", - "percent_in_peaks": "percent_in_peaks", "% reads in peaks": "percent_in_peaks", - "broadpeak_count": "broad_peak_count", "narrowpeak_count": "narrow_peak_count", - "nrf": "nrf", "pbc": "pbc_one", "nsc": "nsc", "rsc": "rsc", "comment": "comment"} - elements = [] - useColumns = [] - for i, ele in enumerate(arr): - if ele.lower() in header_dict.keys(): - elements.append(header_dict[ele.lower()]) - useColumns.append(i) - return elements, useColumns - - -def process_directory(in_dir): - """ - Processes data in directory, returns as Pandas dataframe - :param in_dir: Input data directory, String - :return: A Pandas dataframe containing fingerprint data, QCs, and images - """ - qc_file = "" - fingerprint_qc_arr = [] - spp_data_arr = [] - images = [] - metadata_files = [] - # Separate files into appropriate lists - for filename in os.listdir(in_dir): - # Append the file path - file_path = os.path.join(in_dir, filename) - if os.stat(file_path).st_size != 0: - if filename.lower().endswith('_metadata.txt'): # Find metadata - metadata_files.append(file_path) - elif filename.endswith('_QCmetrics.txt'): # If fingerprint QC file, add to array - fingerprint_qc_arr.append(file_path) - elif filename.lower() == 'qc.csv' or filename.lower() == 'qc.txt' \ - or filename.lower() == 'chip_seq_summary_iter0.tsv': # If lab-computed QC file, set var - qc_file = file_path - elif filename.endswith(".png") or filename.endswith(".pdf"): - images.append(file_path) - elif filename.endswith('.cross_corr.txt'): # If cross corr data, add to array - 
spp_data_arr.append(file_path) - - # Raise error if QC file was not found. - if not os.path.isfile(qc_file): - logging.error("QC file was not found in the data directory (i.e. qc.csv, qc.txt)") - raise IOError("QC file was not found in the data directory (i.e. qc.csv, qc.txt)") - - # Process QC file into a dataframe - try: - with open(qc_file, 'rb') as f: - # Find delimiter using Sniffer class - dialect = csv.Sniffer().sniff(f.readline(), ['\t', ',']) - reader = csv.reader(f, delimiter=dialect.delimiter) - f.seek(0) - column_names = standardize_header(next(reader)) - # Read data into Pandas dataframe - df = pd.read_csv(f, delimiter=dialect.delimiter, header=None, - names=column_names[0], usecols=column_names[1], engine='python') - # Catch if the filename is not an actual file, but a symlink - except IOError: - with open(os.readlink(qc_file), 'rb') as f: - # Find delimiter using Sniffer class - dialect = csv.Sniffer().sniff(f.readline(), ['\t', ',']) - reader = csv.reader(f, delimiter=dialect.delimiter) - f.seek(0) - column_names = standardize_header(next(reader)) - # Read data into Pandas dataframe - df = pd.read_csv(f, delimiter=dialect.delimiter, header=None, - names=column_names[0], usecols=column_names[1], engine='python') - - # Index the dataframe by sample - df.set_index('sample', inplace=True) - - # If there are fingerprint files, add to array - if fingerprint_qc_arr: - # Add fingerprint data to dataframe - fp_df = pd.DataFrame() - for filename in fingerprint_qc_arr: - if os.stat(filename).st_size != 0: - with open(filename, 'rb') as f: - reader = csv.reader(f, delimiter='\t') - header = [stringFormat(ele) for ele in next(reader)] - new_fp_df = pd.read_csv(f, delimiter='\t', header=None, - names=header, engine='python') - fp_df = fp_df.append(new_fp_df) - fp_df.drop_duplicates(subset='sample', keep='last', inplace=True) - fp_df.set_index('sample', inplace=True) - df = df.merge(fp_df, left_index=True, right_index=True, how='outer') - - # Add fingerprint images and metadata information - for sample in df.index.values: # Index is sample name - fp_image = '' - spp_image = '' - metadata_file = '' - for filename in images: - if filename.endswith('.png') and sample in filename: - fp_image = filename - elif filename.endswith('.pdf') and sample in filename: - spp_image = filename - for filename in metadata_files: - if sample in filename: - metadata_file = filename - if fp_image: - df.set_value(sample, 'fp_image', read_file_base64(fp_image)) - if spp_image: - df.set_value(sample, 'spp_image', read_file_base64(spp_image)) - if metadata_file: - # Read in all metadata attributes into df - for key, value in read_metadata(metadata_file).iteritems(): - df.set_value(sample, key, value) - # Set flowcell name to base directory - df.set_value(sample, 'flowcell', os.path.basename(in_dir)) - - return df - - -def main(): - parser = argparse.ArgumentParser('Generates QC metric summary file for available ChIP-seq samples') - parser.add_argument('-i', '--in_dirs', required=True, nargs='+', - help='Directory(ies)for fingerprint data') - parser.add_argument('-u', '--uri', required=True, - help='URI for database upload') - parser.add_argument('-d', '--database', required=True, - help='Database name for upload') - parser.add_argument('-c', '--collection', required=True, - help='Collection name for database') - parser.add_argument('-o', '--output', required=True, help="Filename for output log") - args = parser.parse_args() - - logging.basicConfig(filename=args.output, level=logging.DEBUG) - - - # Process each 
given data directory - df = pd.DataFrame() - for i in range(len(args.in_dirs)): - if os.path.isdir(args.in_dirs[i]): - new_df = process_directory(args.in_dirs[i]) - df = df.append(new_df) - factor_names = [row.split('.')[0] for row in df.index.values] - df.rename(columns={'diff._enrichment':'diff_enrichment'}, inplace=True) - - # Convert Pandas dataframe into list of dictionaries - data = df.to_dict(orient='index') - - # Insert documents (list of dicts) to web-application database - uri = args.uri - client = MongoClient(uri) - sample_coll = client[args.database][args.collection] - flowcell_coll = client[args.database]["flowcell"] - - # Initialize a flowcell data - flowcell_name = "" - flowcell_data = {"samples": []} - - # For each sample, replace if it exists, otherwise insert (upsert) - for sample_name in data: - # Set sample data - sample = data[sample_name] - sample['sample'] = sample_name - sample['last_modified'] = datetime.datetime.utcnow() - logging.info("Uploading sample: %s" % sample_name) - sample_coll.replace_one({'sample': sample_name}, sample, upsert=True) - - # Set flowcell data - flowcell_name = sample['flowcell'] - flowcell_data['name'] = flowcell_name - flowcell_data['date'] = sample['timestamp'] - flowcell_data['samples'].append(sample_name) - - # Upsert the flowcell - logging.info("Uploading flowcell: %s" % flowcell_data) - flowcell_coll.replace_one({'name': flowcell_name}, flowcell_data, upsert=True) - - logging.info("Data upload terminated successfully") - - - return - - -if __name__ == '__main__': - main() diff --git a/data_upload.py b/data_upload.py new file mode 120000 index 0000000..9aae8f7 --- /dev/null +++ b/data_upload.py @@ -0,0 +1 @@ +chipdb_upload/data_upload.py \ No newline at end of file diff --git a/ggr-cwl-ipynb-gen.py b/ggr-cwl-ipynb-gen.py deleted file mode 100644 index 18ce00b..0000000 --- a/ggr-cwl-ipynb-gen.py +++ /dev/null @@ -1,653 +0,0 @@ -import argparse -import nbformat -import nbformat.v3 as nbf -import sys -import os -import pandas as pd -from jinja2 import FileSystemLoader -from xlrd import XLRDError -import ruamel.yaml -import consts -import jinja2 -import inspect -import numpy as np - -encoding = sys.getfilesystemencoding() -EXEC_DIR = os.path.dirname(str(__file__)) - - -def render(tpl_path, context): - path, filename = os.path.split(tpl_path) - return jinja2.Environment( - loader=FileSystemLoader(os.path.join(EXEC_DIR, "templates")) - ).get_template(filename).render(context) - - -class Cell(object): - def __init__(self, contents, description=None): - self.contents = contents - self.description = description - if type(self.description) is not list: - self.description = [self.description] - self.header = [] - # self.header_inputs = [] - # self.header_outputs = [] - - def writefile_to(self, dest): - self.header = ["%%%%writefile %s" % dest] - - def to_list(self): - cells = [] - if self.description: - cells.append(nbf.new_text_cell('markdown', source=self.description)) - if self.contents: - cells.append(nbf.new_code_cell(input=self.header + self.contents)) - return cells - - -class CellSbatch(Cell): - def __init__(self, script_output=None, depends_on=False, mem=None, - cpus=None, partition=None, wrap_command=None, array=None, - prolog=list(), **kwargs): - super(CellSbatch, self).__init__(**kwargs) - - content_prolog = ['sbatch'] - if script_output: - content_prolog.extend(['-o', script_output, '\\\n']) - if partition: - content_prolog.extend(['-p', partition, '\\\n']) - if mem: - content_prolog.extend(['--mem', str(mem), '\\\n']) - if cpus: - 
content_prolog.extend(['-c', str(cpus), '\\\n']) - if depends_on: - content_prolog.extend(['--depend', 'afterok:$1', '\\\n']) - if array is not None: - content_prolog.extend(['--array', array, '\\\n']) - if wrap_command: - content_prolog.append('--wrap="%s' % wrap_command) - self.contents.append('"') - self.contents = content_prolog + self.contents - self.contents = prolog + [' '.join(self.contents)] - - self.header = ["%%script"] - self.header.append('--out blocking_job_str') - self.header.append("bash") - - if depends_on: - self.header.append('-s "$blocking_job"') - self.header = [' '.join(self.header)] - - def to_list(self): - cells = super(CellSbatch, self).to_list() - - # We need to add an extra code cell to compute the SLURM job id - extra_cell = Cell( - contents=["import re", "blocking_job = re.match('Submitted batch job (\d+).*', blocking_job_str).group(1)"], - description="Extract blocking job id" - ) - cells.extend(extra_cell.to_list()) - return cells - - -def save_metadata(samples_df, conf_args, lib_type): - cells = [] - cell_mkdir = Cell(contents=["%%bash", - "mkdir -p %s/data/%s/metadata" % (conf_args['root_dir'], lib_type), - "mkdir -p %s/data/%s/raw_reads" % (conf_args['root_dir'], lib_type), - "mkdir -p %s/data/%s/processed_raw_reads" % (conf_args['root_dir'], lib_type), - "mkdir -p %s/processing/%s/scripts" % (conf_args['root_dir'], lib_type), - "mkdir -p %s/processing/%s/jsons" % (conf_args['root_dir'], lib_type), - "mkdir -p %s/processing/%s/logs" % (conf_args['root_dir'], lib_type) - ], - description=["# %s - %s" % (conf_args['project_name'], lib_type), - consts.notebook_blurb, - "#### Create necessary folder(s)"]) - cells.extend(cell_mkdir.to_list()) - - - outfile = "%s/data/%s/metadata/%s_download_metadata.%s.txt" % \ - (conf_args['root_dir'], lib_type, lib_type, - conf_args['project_name']) - contents = ["%%%%writefile %s" % - outfile, samples_df.to_csv(index=False, - sep=conf_args['sep'], - encoding='utf-8', - header=[x.capitalize() for x in samples_df.columns.values])] - cell = Cell(contents=contents, description="Save metadata file") - cells.extend(cell.to_list()) - - return cells, outfile - - -def download_fastq_files(conf_args, lib_type, metadata_fn=None): - cells = [] - - download_fn = "%s/processing/%s/scripts/download_%s.sh" % (conf_args['root_dir'], lib_type, - conf_args['project_name']) - context = { - 'output_fn': download_fn, - 'project_name': conf_args['project_name'], - 'metadata_filename': metadata_fn, - 'root_dir': conf_args['root_dir'], - 'user': conf_args['user'], - 'lib_type': lib_type, - 'data_source': conf_args['data_from'], - 'consts': consts - } - contents = [render('templates/download_fastq_files.j2', context)] - - cell_write_dw_file = Cell(contents=contents, - description=["#### Download FASTQ from %s" % conf_args['data_from'], - "Create file to download FASTQ files"]) - cells.extend(cell_write_dw_file.to_list()) - - logs_dir = "%s/processing/%s/logs" % (conf_args['root_dir'], lib_type) - execute_cell = CellSbatch(contents=list(), - partition=",".join(consts.slurm_partitions), - wrap_command="ssh %s@%s 'sh %s'" % (conf_args['user'], - consts.HOST_FOR_TUNNELED_DOWNLOAD, - download_fn), - description="Execute file to download files", - script_output="%s/%s_%s.out" % (logs_dir, conf_args['project_name'], - inspect.stack()[0][3])) - cells.extend(execute_cell.to_list()) - - return cells - - -def ungzip_fastq_files(conf_args, lib_type, metadata_filename=None, num_samples=None): - cells = [] - ungzip_fn = "%s/processing/%s/scripts/ungzip_%s.sh" % 
(conf_args['root_dir'], lib_type, conf_args['project_name']) - context = { - 'output_fn' : ungzip_fn, - 'metadata_filename': metadata_filename, - 'project_name': conf_args['project_name'], - 'root_dir': conf_args['root_dir'], - 'lib_type': lib_type, - 'num_samples': num_samples - } - contents = [render('templates/ungzip_fastq_files.j2', context)] - - cell_write_dw_file = Cell(contents=contents, description="#### Ungzip FASTQ files") - cells.extend(cell_write_dw_file.to_list()) - - logs_dir = "%s/processing/%s/logs" % (conf_args['root_dir'], lib_type) - execute_cell = CellSbatch(contents=[ungzip_fn], - description="Execute file to ungzip FASTQ files", - depends_on=True, - partition=",".join(consts.slurm_partitions), - array="0-%d%%20" % (num_samples - 1), - script_output="%s/%s_%s_%%a.out" % (logs_dir, conf_args['project_name'], - inspect.stack()[0][3])) - cells.extend(execute_cell.to_list()) - - return cells - - -def merge_fastq_files(conf_args, lib_type, metadata_filename=None, num_samples=None): - cells = [] - merge_fn = "%s/processing/%s/scripts/merge_lanes_%s.sh" % (conf_args['root_dir'], lib_type, conf_args['project_name']) - context = { - 'output_fn' : merge_fn, - 'metadata_filename': metadata_filename, - 'project_name': conf_args['project_name'], - 'root_dir': conf_args['root_dir'], - 'lib_type': lib_type, - 'num_samples': num_samples - } - contents = [render('templates/merge_lanes_fastq.j2', context)] - - cell_write_dw_file = Cell(contents=contents, description="#### Merge lanes of FASTQ files") - cells.extend(cell_write_dw_file.to_list()) - - logs_dir = "%s/processing/%s/logs" % (conf_args['root_dir'], lib_type) - execute_cell = CellSbatch(contents=[merge_fn], - description="Execute file to merge lanes of FASTQ files", - depends_on=True, - array="0-%d%%20" % (num_samples-1), - partition=",".join(consts.slurm_partitions), - script_output="%s/%s_%s_%%a.out" % (logs_dir, conf_args['project_name'], - inspect.stack()[0][3]),) - cells.extend(execute_cell.to_list()) - - return cells - - -def cwl_json_gen(conf_args, lib_type, metadata_filename): - func_name = inspect.stack()[0][3] - cells = [] - output_fn = "%s/processing/%s/scripts/%s_%s.sh" % (conf_args['root_dir'], - lib_type, - func_name, - conf_args['project_name']) - context = { - 'output_fn' : output_fn, - 'metadata_filename': metadata_filename, - 'project_name': conf_args['project_name'], - 'root_dir': conf_args['root_dir'], - 'lib_type': lib_type, - 'star_genome': consts.star_genome, - 'mem': consts.mem[lib_type.lower()], - 'nthreads': consts.nthreads[lib_type.lower()], - 'separate_jsons': consts.separate_jsons - } - contents = [render('templates/%s.j2' % func_name, context)] - - cell_write_dw_file = Cell(contents=contents, description="#### Create JSON files for CWL pipeline files") - cells.extend(cell_write_dw_file.to_list()) - - logs_dir = "%s/processing/%s/logs" % (conf_args['root_dir'], lib_type) - execute_cell = CellSbatch(contents=[output_fn], - description="Execute file to create JSON files", - depends_on=True, - partition=",".join(consts.slurm_partitions), - prolog=["source %s %s" % (consts.conda_activate, - consts.conda_environment)], - script_output="%s/%s_%s.out" % (logs_dir, - conf_args['project_name'], - inspect.stack()[0][3])) - cells.extend(execute_cell.to_list()) - return cells - - -def cwl_slurm_array_gen(conf_args, lib_type, metadata_filename, pipeline_type, n_samples): - func_name = inspect.stack()[0][3] - cells = [] - output_fn = "%s/processing/%s/scripts/%s-%s.sh" % (conf_args['root_dir'], - lib_type, - 
conf_args['project_name'], - pipeline_type) - metadata_basename = os.path.splitext(os.path.basename(metadata_filename))[0] - context = { - 'output_fn' : output_fn, - 'metadata_basename': metadata_basename, - 'project_name': conf_args['project_name'], - 'root_dir': conf_args['root_dir'], - 'user_duke_email': conf_args['user_duke_email'], - 'lib_type': lib_type, - 'mem': consts.mem[lib_type.lower()], - 'nthreads': consts.nthreads[lib_type.lower()], - 'pipeline_type': pipeline_type, - 'consts': consts - } - contents = [render('templates/%s.j2' % func_name, context)] - - cell_write_dw_file = Cell(contents=contents, description="#### Create SLURM array master bash file for %s samples" % pipeline_type) - cells.extend(cell_write_dw_file.to_list()) - - execute_cell = CellSbatch(contents=[output_fn], - description="Execute SLURM array master file", - depends_on=True, - array="0-%d%%20" % (n_samples - 1), - prolog=["source %s %s" % (consts.conda_activate, - consts.conda_environment)], - partition=",".join(consts.slurm_partitions)) - cells.extend(execute_cell.to_list()) - - return cells - - -def generate_qc_cell(conf_args, lib_type, pipeline_type): - func_name = inspect.stack()[0][3] - cells = [] - - # Python program has no 'se' or 'pe' abbreviation - end_type = pipeline_type.split("-")[0] - if end_type == "se": - end_type = "single_end" - elif end_type == "pe": - end_type = "paired_end" - else: - return CellSbatch(contents=[""]) - - - output_fn = '%s/processing/%s/scripts/%s_%s-%s.sh' % (conf_args["root_dir"], - lib_type, - func_name, - conf_args["project_name"], - pipeline_type) - qc_type = lib_type.replace("_", "") - context = { - 'output_fn': output_fn, - "conda_activate": consts.conda_activate, - 'root_dir': conf_args["root_dir"], - "library_type": lib_type, - "project_name": conf_args["project_name"], - "pipeline_type": pipeline_type, - "qc_script_dir": consts.qc_script_dir, - "qc_type": qc_type, - "end_type": end_type - } - contents = [render('templates/%s.j2' % func_name, context)] - - cell_write_dw_file = Cell(contents=contents, description="#### Create QC generating script") - cells.extend(cell_write_dw_file.to_list()) - - execute_cell = CellSbatch(contents=[output_fn], - depends_on=True, - partition=",".join(consts.slurm_partitions), - description="Generate QCs for %s-%s" % (conf_args["project_name"], pipeline_type)) - - cells.extend(execute_cell.to_list()) - - return cells - - -def generate_plots(conf_args, metadata_file, lib_type, pipeline_type, n_samples): - """ - Generates cell for creating fingerprint data - :param conf_args: Dictionary containing data about directories, project name, etc. 
- :param metadata_file: File path to metadata - :param lib_type: Type of assay (RNA, ChIP, ATAC) - :param pipeline_type: Type of sequencing pipeline (end, control) - :return: - """ - func_name = inspect.stack()[0][3] - cells = [] - # Current iteration of web-application only accepts ChIP samples - if lib_type != "chip_seq": - return [] - - input_directory = "{}/processing/{}/{}-{}".format(conf_args['root_dir'], - lib_type, - conf_args['project_name'], - pipeline_type) - output_directory = input_directory - - output_fn = '%s/processing/%s/scripts/generate_plot.%s-%s.sh' % (conf_args["root_dir"], - lib_type, - conf_args["project_name"], - pipeline_type) - - context = { - 'output_fn': output_fn, - 'env_activate': consts.conda_activate, - 'root_dir': conf_args['root_dir'], - 'lib_type': lib_type, - 'project_name': conf_args['project_name'], - 'pipeline_type': pipeline_type, - 'metadata_file': metadata_file, - 'input_dir': input_directory, - 'output_dir': output_directory - } - contents = [render('templates/%s.j2' % func_name, context)] - cell_write_dw_file = Cell(contents=contents, description="#### Create plot generating script") - cells.extend(cell_write_dw_file.to_list()) - - - execute_cell = CellSbatch(contents=[output_fn], - depends_on=True, - array="0-%d%%5" % (n_samples - 1), - prolog=["source %s %s" % (consts.conda_activate, consts.conda_environment)], - partition=",".join(consts.slurm_partitions), - description="Generate plots and data for website") - cells.extend(execute_cell.to_list()) - - return cells - - -def data_upload(conf_args, lib_type, pipeline_type): - """ - Function for generating a cell that uploads notebook generated data - to database. Can be avoided with usage of tag "-n". - """ - func_name = inspect.stack()[0][3] - cells = [] - - # Only upload data to web-app if it is ChIP-seq - if lib_type != "chip_seq" or not conf_args["upload"]: - return [] - - output_fn = '%s/processing/%s/scripts/%s_%s-%s.sh' % (conf_args["root_dir"], - lib_type, - func_name, - conf_args["project_name"], - pipeline_type) - - script_dir = os.path.dirname(os.path.realpath(__file__)) - data_dir = "{}/processing/chip_seq/{}-{}".format(conf_args['root_dir'], - conf_args['project_name'], pipeline_type) - - context = { - 'output_fn': output_fn, - 'root_dir': conf_args['root_dir'], - 'pipeline_type': pipeline_type, - 'library_type': lib_type, - 'project_name': conf_args['project_name'], - 'script_dir': script_dir, - 'conda_activate': consts.conda_activate, - 'data_dir': data_dir, - 'uri': conf_args['uri'] if 'uri' in conf_args else None, - 'database': conf_args['database'] if 'database' in conf_args else None, - 'collection': conf_args['collection'] if 'collection' in conf_args else None - } - - contents = [render('templates/%s.j2' % func_name, context)] - cell_write_dw_file = Cell(contents=contents, description="#### Create data upload script") - cells.extend(cell_write_dw_file.to_list()) - - execute_cell = CellSbatch(contents=[output_fn], - depends_on=True, - prolog=["source %s alex" % consts.conda_activate], - partition=",".join(consts.slurm_partitions), - description="### Upload ChIP-seq to web-application") - cells.extend(execute_cell.to_list()) - - return cells - - -def get_pipeline_types(samples_df): - lib_type = samples_df['library type'].iloc[0].lower().replace('-', '_') - if lib_type == consts.library_type_chip_seq: - for seq_end in consts.seq_ends: - for with_control in consts.with_controls: - samples_filter = samples_df['paired-end or single-end'].str.lower() == seq_end - if with_control: - 
samples_filter = samples_filter & (~samples_df['control'].isnull()) - pipeline_type = '-'.join([seq_end, with_control]) - else: - samples_filter = samples_filter & (samples_df['control'].isnull()) - pipeline_type = '-'.join([seq_end]) - yield pipeline_type, np.sum(samples_filter) - if lib_type == consts.library_type_rna_seq: - for seq_end in consts.seq_ends: - for strandness in consts.strandnesses: - samples_filter = \ - (samples_df['paired-end or single-end'].str.lower() == seq_end) \ - & (samples_df['strand specificity'].str.lower() == strandness) - if consts.with_sjdb: - pipeline_type = '-'.join([seq_end, strandness, 'with-sjdb']) - else: - pipeline_type = '-'.join([seq_end, strandness]) - yield pipeline_type, np.sum(samples_filter) - if lib_type == consts.library_type_atac_seq: - for seq_end in consts.seq_ends: - for with_blacklist_removal in consts.blacklist_removal: - samples_filter = (samples_df['paired-end or single-end'].str.lower() == seq_end) - if with_blacklist_removal: - pipeline_type = '-'.join([seq_end, with_blacklist_removal]) - samples_filter = samples_filter & (~samples_df['blacklist removal'].isnull()) - else: - pipeline_type = '-'.join([seq_end]) - samples_filter = samples_filter & (samples_df['blacklist removal'].isnull()) - yield pipeline_type, np.sum(samples_filter) - if lib_type == consts.library_type_starr_seq: - for seq_end in consts.seq_ends: - for with_umis in consts.with_umis: - samples_filter = (samples_df['paired-end or single-end'].str.lower() == seq_end) - if with_umis: - pipeline_type = '-'.join([seq_end, with_umis]) - if 'umis' in samples_df.columns: - samples_filter = samples_filter & samples_df['umis'] - else: - pipeline_type = '-'.join([seq_end]) - if 'umis' in samples_df.columns: - samples_filter = samples_filter & ~samples_df['umis'] - yield pipeline_type, np.sum(samples_filter) - - -def data_acquisition_cells(conf_args, lib_type, metadata_file, nsamples): - cells = [] - if conf_args['data_from'] != consts.DATA_SOURCES_LOCAL: - cells.extend(download_fastq_files(conf_args, - lib_type, - metadata_fn=metadata_file)) - cells.extend(merge_fastq_files(conf_args, - lib_type, - metadata_filename=metadata_file, - num_samples=nsamples)) - else: - download_fn = "%s/data/%s/processed_raw_reads/%s" % ( - conf_args['root_dir'], lib_type, - conf_args['project_name']) - warning_cell = Cell(contents=None, - description=["### FASTQ files already available locally!!", - "Please, make sure the FASTQ files are correctly named, decompressed and located/symlinked in:", - "", "**", download_fn, "**"]) - cells.extend(warning_cell.to_list()) - - return cells - - -def create_cells(samples_df, conf_args=None): - """ - Master function to write all code and text for the notebook. 
- - Conceptually, there are a number of things that have to happen: - - save metadata txt file - - download FASTQ.gz files from sequencing core - - uncompress FASTQ.gz files - - rename and move FASTQ files - - create JSONs files for cwltool - - execute cwltool master file - """ - lib_type = samples_df.iloc[0]['library type'].lower().replace('-', '_') - num_samples = samples_df.shape[0] - cells = [] - - cc, metadata_file = save_metadata(samples_df, conf_args, lib_type) - cells.extend(cc) - - cells.extend(data_acquisition_cells(conf_args, lib_type, metadata_file, num_samples)) - cells.extend(cwl_json_gen(conf_args, lib_type, metadata_filename=metadata_file)) - for pipeline_type, n in get_pipeline_types(samples_df): - if n > 0: - cells.extend(cwl_slurm_array_gen(conf_args, lib_type, metadata_filename=metadata_file, - pipeline_type=pipeline_type, n_samples=n)) - cells.extend(generate_qc_cell(conf_args, lib_type, pipeline_type=pipeline_type)) - cells.extend(generate_plots(conf_args, metadata_file=metadata_file, - lib_type=lib_type, pipeline_type=pipeline_type, n_samples=n)) - cells.extend(data_upload(conf_args, lib_type, pipeline_type)) - - return cells - - -def make_notebook(outfile, metadata, conf_args=None): - """Create notebook with parsed contents from metadata""" - nb = nbf.new_notebook() - - cells = [] - # Create a notebook by Library type existing in the metadata file - for samples_df in get_samples_by_library_type(metadata, conf_args['sep']): - cells.extend(create_cells(samples_df, conf_args=conf_args)) - - nb['worksheets'].append(nbf.new_worksheet(cells=cells)) - - with open(outfile, 'w') as _: - nbformat.write(nb, _) - - -def get_samples_by_library_type(metadata_file, sep='\t'): - """ - Parse a metadata file (either a spreadsheet or a tab-delimited file. 
- - :return: generator of panda's dataframe - """ - try: - md = pd.read_excel(metadata_file.name, - true_values=['Yes', 'Y', 'yes', 'y', 1], - false_values=['No', 'N', 'no', 'n', 0]) - except XLRDError: - print (XLRDError) - md = pd.read_csv(metadata_file.name, - true_values=['Yes', 'Y', 'yes', 'y', 1], - false_values=['No', 'N', 'no', 'n', 0], sep=sep) - - md.columns = [x.lower() for x in md.columns] - named_cols = [c for c in md.columns if not c.startswith('unnamed: ')] - lib_types_found = set(md['library type'][~pd.isnull(md['library type'])]) - - for lt in lib_types_found: - yield md.loc[md['library type'] == lt, named_cols] - - -def init_conf_args(args, - required_args = ['root_dir'], - optional_args = ['user', 'sep', 'user_duke_email', 'project_name']): - conf_args = {} - if args['conf_file']: - conf_args = ruamel.yaml.load(args['conf_file'], Loader=ruamel.yaml.Loader) - for r in required_args: - conf_args[r] = args[r] if (r in args and args[r]) else conf_args[r] - try: - assert conf_args[r] is not None - except AssertionError as e: - print("[ERROR]", r, "not defined") - raise - for o in optional_args: - conf_args[o] = args[o] if (o in args and args[o]) else (conf_args[o] if o in conf_args else None) - conf_args['user'] = conf_args['user'] or os.environ['USER'] - conf_args['user_duke_email'] = conf_args['user_duke_email'] or "%s@duke.edu" % conf_args['user'] - conf_args['project_name'] = conf_args['project_name'] or os.path.splitext(os.path.basename(args['metadata'].name))[0] - - return conf_args - -def main(): - parser = argparse.ArgumentParser('Generator of Jupyter notebooks to execute CWL pre-processing pipelines') - parser.add_argument('-o', '--out', required=True, type=str, help='Jupyter notebook output file name') - parser.add_argument('-m', '--metadata', required=True, type=argparse.FileType('r'), - help='Metadata file with samples information') - parser.add_argument('-f', '--force', action='store_true', help='Force to overwrite output file') - parser.add_argument('-n', '--no-upload', action='store_false', - help='Avoids uploading generated data to database when specified') - parser.add_argument('--metadata-sep', dest='sep', required=False, type=str, default='\t', - help='Separator for metadata file (when different than Excel spread sheet)') - parser.add_argument('--project-name', required=False, type=str, - help='Project name (by default, basename of metadata file name)') - parser.add_argument('--data-from', required=False, choices=consts.data_sources, - default=consts.data_sources[0], - help='Choices: %s' % (', '.join(consts.data_sources))) - parser.add_argument('-c', '--conf-file', required=False, type=argparse.FileType('r'), help='YAML configuration file (see examples)') - parser.add_argument('-u', '--user', required=False, - help='HARDAC User used in SLURM (default: ${USER})') - parser.add_argument('-e', '--user-duke-email', required=False, - help='Email(s) notified when execution is finished (default: ${USER}@duke.edu)') - parser.add_argument('-r', '--root-dir', required=False, - help='Root directory where all subfolders and files will be created ' - '(semi-required: either defined here or in conf-file') - - args = parser.parse_args() - - conf_args = init_conf_args(vars(args)) - - outfile = "%s.ipynb" % conf_args['project_name'] - - if os.path.isdir(args.out): - outfile = os.path.join(args.out, outfile) - else: - outfile = args.out - - if os.path.isfile(outfile) and not args.force: - print(outfile, "is an existing file. 
Please use -f or --force to overwrite the contents") - sys.exit(1) - - conf_args['upload'] = args.no_upload - conf_args['data_from'] = args.data_from - make_notebook(outfile, - args.metadata, - conf_args=conf_args) - - -if __name__ == '__main__': - main() diff --git a/ggr-cwl-ipynb-gen.py b/ggr-cwl-ipynb-gen.py new file mode 120000 index 0000000..e27d141 --- /dev/null +++ b/ggr-cwl-ipynb-gen.py @@ -0,0 +1 @@ +ggr_cwl_ipynb_gen/ggr-cwl-ipynb-gen.py \ No newline at end of file diff --git a/ggr_cwl_ipynb_gen/__init__.py b/ggr_cwl_ipynb_gen/__init__.py new file mode 100644 index 0000000..a75ddec --- /dev/null +++ b/ggr_cwl_ipynb_gen/__init__.py @@ -0,0 +1,2 @@ +if __name__ == '__main__': + ggr-main() diff --git a/consts.py b/ggr_cwl_ipynb_gen/consts.py similarity index 97% rename from consts.py rename to ggr_cwl_ipynb_gen/consts.py index 32518da..03fc3a7 100644 --- a/consts.py +++ b/ggr_cwl_ipynb_gen/consts.py @@ -53,3 +53,6 @@ qc_script_dir = '/data/reddylab/software/cwl/bin' data_upload_script = '/data/reddylab/Darryl/GitHub/reddylab/csv_to_mongo.py' HOST_FOR_TUNNELED_DOWNLOAD = "Hardac-xfer.genome.duke.edu" + +# Package constants +PACKAGE_NAME = "ggr_cwl_ipynb_gen" diff --git a/ggr_cwl_ipynb_gen/ggr-cwl-ipynb-gen.py b/ggr_cwl_ipynb_gen/ggr-cwl-ipynb-gen.py new file mode 100644 index 0000000..1cc9fde --- /dev/null +++ b/ggr_cwl_ipynb_gen/ggr-cwl-ipynb-gen.py @@ -0,0 +1,661 @@ +#!/usr/bin/env python +import argparse +import nbformat +import nbformat.v3 as nbf +import sys +import os +import pandas as pd +from jinja2 import FileSystemLoader, PackageLoader +from xlrd import XLRDError +import ruamel.yaml +import ggr_cwl_ipynb_gen.consts as consts +import jinja2 +import inspect +from jinja2.exceptions import TemplateNotFound +import numpy as np + +encoding = sys.getfilesystemencoding() +EXEC_DIR = os.path.dirname(str(__file__)) + + +def render(tpl_path, context): + path, filename = os.path.split(tpl_path) + try: + jinja_rendered = jinja2.Environment( + loader=FileSystemLoader(os.path.join(EXEC_DIR, "templates")) + ).get_template(filename).render(context) + except TemplateNotFound: + jinja_rendered = jinja2.Environment( + loader=PackageLoader(consts.PACKAGE_NAME, "templates") + ).get_template(filename).render(context) + return jinja_rendered + + +class Cell(object): + def __init__(self, contents, description=None): + self.contents = contents + self.description = description + if type(self.description) is not list: + self.description = [self.description] + self.header = [] + # self.header_inputs = [] + # self.header_outputs = [] + + def writefile_to(self, dest): + self.header = ["%%%%writefile %s" % dest] + + def to_list(self): + cells = [] + if self.description: + cells.append(nbf.new_text_cell('markdown', source=self.description)) + if self.contents: + cells.append(nbf.new_code_cell(input=self.header + self.contents)) + return cells + + +class CellSbatch(Cell): + def __init__(self, script_output=None, depends_on=False, mem=None, + cpus=None, partition=None, wrap_command=None, array=None, + prolog=list(), **kwargs): + super(CellSbatch, self).__init__(**kwargs) + + content_prolog = ['sbatch'] + if script_output: + content_prolog.extend(['-o', script_output, '\\\n']) + if partition: + content_prolog.extend(['-p', partition, '\\\n']) + if mem: + content_prolog.extend(['--mem', str(mem), '\\\n']) + if cpus: + content_prolog.extend(['-c', str(cpus), '\\\n']) + if depends_on: + content_prolog.extend(['--depend', 'afterok:$1', '\\\n']) + if array is not None: + content_prolog.extend(['--array', array, 
'\\\n']) + if wrap_command: + content_prolog.append('--wrap="%s' % wrap_command) + self.contents.append('"') + self.contents = content_prolog + self.contents + self.contents = prolog + [' '.join(self.contents)] + + self.header = ["%%script"] + self.header.append('--out blocking_job_str') + self.header.append("bash") + + if depends_on: + self.header.append('-s "$blocking_job"') + self.header = [' '.join(self.header)] + + def to_list(self): + cells = super(CellSbatch, self).to_list() + + # We need to add an extra code cell to compute the SLURM job id + extra_cell = Cell( + contents=["import re", "blocking_job = re.match('Submitted batch job (\d+).*', blocking_job_str).group(1)"], + description="Extract blocking job id" + ) + cells.extend(extra_cell.to_list()) + return cells + + +def save_metadata(samples_df, conf_args, lib_type): + cells = [] + cell_mkdir = Cell(contents=["%%bash", + "mkdir -p %s/data/%s/metadata" % (conf_args['root_dir'], lib_type), + "mkdir -p %s/data/%s/raw_reads" % (conf_args['root_dir'], lib_type), + "mkdir -p %s/data/%s/processed_raw_reads" % (conf_args['root_dir'], lib_type), + "mkdir -p %s/processing/%s/scripts" % (conf_args['root_dir'], lib_type), + "mkdir -p %s/processing/%s/jsons" % (conf_args['root_dir'], lib_type), + "mkdir -p %s/processing/%s/logs" % (conf_args['root_dir'], lib_type) + ], + description=["# %s - %s" % (conf_args['project_name'], lib_type), + consts.notebook_blurb, + "#### Create necessary folder(s)"]) + cells.extend(cell_mkdir.to_list()) + + + outfile = "%s/data/%s/metadata/%s_download_metadata.%s.txt" % \ + (conf_args['root_dir'], lib_type, lib_type, + conf_args['project_name']) + contents = ["%%%%writefile %s" % + outfile, samples_df.to_csv(index=False, + sep=conf_args['sep'], + encoding='utf-8', + header=[x.capitalize() for x in samples_df.columns.values])] + cell = Cell(contents=contents, description="Save metadata file") + cells.extend(cell.to_list()) + + return cells, outfile + + +def download_fastq_files(conf_args, lib_type, metadata_fn=None): + cells = [] + + download_fn = "%s/processing/%s/scripts/download_%s.sh" % (conf_args['root_dir'], lib_type, + conf_args['project_name']) + context = { + 'output_fn': download_fn, + 'project_name': conf_args['project_name'], + 'metadata_filename': metadata_fn, + 'root_dir': conf_args['root_dir'], + 'user': conf_args['user'], + 'lib_type': lib_type, + 'data_source': conf_args['data_from'], + 'consts': consts + } + contents = [render('templates/download_fastq_files.j2', context)] + + cell_write_dw_file = Cell(contents=contents, + description=["#### Download FASTQ from %s" % conf_args['data_from'], + "Create file to download FASTQ files"]) + cells.extend(cell_write_dw_file.to_list()) + + logs_dir = "%s/processing/%s/logs" % (conf_args['root_dir'], lib_type) + execute_cell = CellSbatch(contents=list(), + partition=",".join(consts.slurm_partitions), + wrap_command="ssh %s@%s 'sh %s'" % (conf_args['user'], + consts.HOST_FOR_TUNNELED_DOWNLOAD, + download_fn), + description="Execute file to download files", + script_output="%s/%s_%s.out" % (logs_dir, conf_args['project_name'], + inspect.stack()[0][3])) + cells.extend(execute_cell.to_list()) + + return cells + + +def ungzip_fastq_files(conf_args, lib_type, metadata_filename=None, num_samples=None): + cells = [] + ungzip_fn = "%s/processing/%s/scripts/ungzip_%s.sh" % (conf_args['root_dir'], lib_type, conf_args['project_name']) + context = { + 'output_fn' : ungzip_fn, + 'metadata_filename': metadata_filename, + 'project_name': conf_args['project_name'], + 
'root_dir': conf_args['root_dir'], + 'lib_type': lib_type, + 'num_samples': num_samples + } + contents = [render('templates/ungzip_fastq_files.j2', context)] + + cell_write_dw_file = Cell(contents=contents, description="#### Ungzip FASTQ files") + cells.extend(cell_write_dw_file.to_list()) + + logs_dir = "%s/processing/%s/logs" % (conf_args['root_dir'], lib_type) + execute_cell = CellSbatch(contents=[ungzip_fn], + description="Execute file to ungzip FASTQ files", + depends_on=True, + partition=",".join(consts.slurm_partitions), + array="0-%d%%20" % (num_samples - 1), + script_output="%s/%s_%s_%%a.out" % (logs_dir, conf_args['project_name'], + inspect.stack()[0][3])) + cells.extend(execute_cell.to_list()) + + return cells + + +def merge_fastq_files(conf_args, lib_type, metadata_filename=None, num_samples=None): + cells = [] + merge_fn = "%s/processing/%s/scripts/merge_lanes_%s.sh" % (conf_args['root_dir'], lib_type, conf_args['project_name']) + context = { + 'output_fn' : merge_fn, + 'metadata_filename': metadata_filename, + 'project_name': conf_args['project_name'], + 'root_dir': conf_args['root_dir'], + 'lib_type': lib_type, + 'num_samples': num_samples + } + contents = [render('templates/merge_lanes_fastq.j2', context)] + + cell_write_dw_file = Cell(contents=contents, description="#### Merge lanes of FASTQ files") + cells.extend(cell_write_dw_file.to_list()) + + logs_dir = "%s/processing/%s/logs" % (conf_args['root_dir'], lib_type) + execute_cell = CellSbatch(contents=[merge_fn], + description="Execute file to merge lanes of FASTQ files", + depends_on=True, + array="0-%d%%20" % (num_samples-1), + partition=",".join(consts.slurm_partitions), + script_output="%s/%s_%s_%%a.out" % (logs_dir, conf_args['project_name'], + inspect.stack()[0][3]), ) + cells.extend(execute_cell.to_list()) + + return cells + + +def cwl_json_gen(conf_args, lib_type, metadata_filename): + func_name = inspect.stack()[0][3] + cells = [] + output_fn = "%s/processing/%s/scripts/%s_%s.sh" % (conf_args['root_dir'], + lib_type, + func_name, + conf_args['project_name']) + context = { + 'output_fn' : output_fn, + 'metadata_filename': metadata_filename, + 'project_name': conf_args['project_name'], + 'root_dir': conf_args['root_dir'], + 'lib_type': lib_type, + 'star_genome': consts.star_genome, + 'mem': consts.mem[lib_type.lower()], + 'nthreads': consts.nthreads[lib_type.lower()], + 'separate_jsons': consts.separate_jsons + } + contents = [render('templates/%s.j2' % func_name, context)] + + cell_write_dw_file = Cell(contents=contents, description="#### Create JSON files for CWL pipeline files") + cells.extend(cell_write_dw_file.to_list()) + + logs_dir = "%s/processing/%s/logs" % (conf_args['root_dir'], lib_type) + execute_cell = CellSbatch(contents=[output_fn], + description="Execute file to create JSON files", + depends_on=True, + partition=",".join(consts.slurm_partitions), + prolog=["source %s %s" % (consts.conda_activate, + consts.conda_environment)], + script_output="%s/%s_%s.out" % (logs_dir, + conf_args['project_name'], + inspect.stack()[0][3])) + cells.extend(execute_cell.to_list()) + return cells + + +def cwl_slurm_array_gen(conf_args, lib_type, metadata_filename, pipeline_type, n_samples): + func_name = inspect.stack()[0][3] + cells = [] + output_fn = "%s/processing/%s/scripts/%s-%s.sh" % (conf_args['root_dir'], + lib_type, + conf_args['project_name'], + pipeline_type) + metadata_basename = os.path.splitext(os.path.basename(metadata_filename))[0] + context = { + 'output_fn' : output_fn, + 'metadata_basename': 
metadata_basename, + 'project_name': conf_args['project_name'], + 'root_dir': conf_args['root_dir'], + 'user_duke_email': conf_args['user_duke_email'], + 'lib_type': lib_type, + 'mem': consts.mem[lib_type.lower()], + 'nthreads': consts.nthreads[lib_type.lower()], + 'pipeline_type': pipeline_type, + 'consts': consts + } + contents = [render('templates/%s.j2' % func_name, context)] + + cell_write_dw_file = Cell(contents=contents, description="#### Create SLURM array master bash file for %s samples" % pipeline_type) + cells.extend(cell_write_dw_file.to_list()) + + execute_cell = CellSbatch(contents=[output_fn], + description="Execute SLURM array master file", + depends_on=True, + array="0-%d%%20" % (n_samples - 1), + prolog=["source %s %s" % (consts.conda_activate, + consts.conda_environment)], + partition=",".join(consts.slurm_partitions)) + cells.extend(execute_cell.to_list()) + + return cells + + +def generate_qc_cell(conf_args, lib_type, pipeline_type): + func_name = inspect.stack()[0][3] + cells = [] + + # Python program has no 'se' or 'pe' abbreviation + end_type = pipeline_type.split("-")[0] + if end_type == "se": + end_type = "single_end" + elif end_type == "pe": + end_type = "paired_end" + else: + return CellSbatch(contents=[""]) + + + output_fn = '%s/processing/%s/scripts/%s_%s-%s.sh' % (conf_args["root_dir"], + lib_type, + func_name, + conf_args["project_name"], + pipeline_type) + qc_type = lib_type.replace("_", "") + context = { + 'output_fn': output_fn, + "conda_activate": consts.conda_activate, + 'root_dir': conf_args["root_dir"], + "library_type": lib_type, + "project_name": conf_args["project_name"], + "pipeline_type": pipeline_type, + "qc_script_dir": consts.qc_script_dir, + "qc_type": qc_type, + "end_type": end_type + } + contents = [render('templates/%s.j2' % func_name, context)] + + cell_write_dw_file = Cell(contents=contents, description="#### Create QC generating script") + cells.extend(cell_write_dw_file.to_list()) + + execute_cell = CellSbatch(contents=[output_fn], + depends_on=True, + partition=",".join(consts.slurm_partitions), + description="Generate QCs for %s-%s" % (conf_args["project_name"], pipeline_type)) + + cells.extend(execute_cell.to_list()) + + return cells + + +def generate_plots(conf_args, metadata_file, lib_type, pipeline_type, n_samples): + """ + Generates cell for creating fingerprint data + :param conf_args: Dictionary containing data about directories, project name, etc. 
+ :param metadata_file: File path to metadata + :param lib_type: Type of assay (RNA, ChIP, ATAC) + :param pipeline_type: Type of sequencing pipeline (end, control) + :return: + """ + func_name = inspect.stack()[0][3] + cells = [] + # Current iteration of web-application only accepts ChIP samples + if lib_type != "chip_seq": + return [] + + input_directory = "{}/processing/{}/{}-{}".format(conf_args['root_dir'], + lib_type, + conf_args['project_name'], + pipeline_type) + output_directory = input_directory + + output_fn = '%s/processing/%s/scripts/generate_plot.%s-%s.sh' % (conf_args["root_dir"], + lib_type, + conf_args["project_name"], + pipeline_type) + + context = { + 'output_fn': output_fn, + 'env_activate': consts.conda_activate, + 'root_dir': conf_args['root_dir'], + 'lib_type': lib_type, + 'project_name': conf_args['project_name'], + 'pipeline_type': pipeline_type, + 'metadata_file': metadata_file, + 'input_dir': input_directory, + 'output_dir': output_directory + } + contents = [render('templates/%s.j2' % func_name, context)] + cell_write_dw_file = Cell(contents=contents, description="#### Create plot generating script") + cells.extend(cell_write_dw_file.to_list()) + + + execute_cell = CellSbatch(contents=[output_fn], + depends_on=True, + array="0-%d%%5" % (n_samples - 1), + prolog=["source %s %s" % (consts.conda_activate, consts.conda_environment)], + partition=",".join(consts.slurm_partitions), + description="Generate plots and data for website") + cells.extend(execute_cell.to_list()) + + return cells + + +def data_upload(conf_args, lib_type, pipeline_type): + """ + Function for generating a cell that uploads notebook generated data + to database. Can be avoided with usage of tag "-n". + """ + func_name = inspect.stack()[0][3] + cells = [] + + # Only upload data to web-app if it is ChIP-seq + if lib_type != "chip_seq" or not conf_args["upload"]: + return [] + + output_fn = '%s/processing/%s/scripts/%s_%s-%s.sh' % (conf_args["root_dir"], + lib_type, + func_name, + conf_args["project_name"], + pipeline_type) + + script_dir = os.path.dirname(os.path.realpath(__file__)) + data_dir = "{}/processing/chip_seq/{}-{}".format(conf_args['root_dir'], + conf_args['project_name'], pipeline_type) + + context = { + 'output_fn': output_fn, + 'root_dir': conf_args['root_dir'], + 'pipeline_type': pipeline_type, + 'library_type': lib_type, + 'project_name': conf_args['project_name'], + 'script_dir': script_dir, + 'conda_activate': consts.conda_activate, + 'data_dir': data_dir, + 'uri': conf_args['uri'] if 'uri' in conf_args else None, + 'database': conf_args['database'] if 'database' in conf_args else None, + 'collection': conf_args['collection'] if 'collection' in conf_args else None + } + + contents = [render('templates/%s.j2' % func_name, context)] + cell_write_dw_file = Cell(contents=contents, description="#### Create data upload script") + cells.extend(cell_write_dw_file.to_list()) + + execute_cell = CellSbatch(contents=[output_fn], + depends_on=True, + prolog=["source %s alex" % consts.conda_activate], + partition=",".join(consts.slurm_partitions), + description="### Upload ChIP-seq to web-application") + cells.extend(execute_cell.to_list()) + + return cells + + +def get_pipeline_types(samples_df): + lib_type = samples_df['library type'].iloc[0].lower().replace('-', '_') + if lib_type == consts.library_type_chip_seq: + for seq_end in consts.seq_ends: + for with_control in consts.with_controls: + samples_filter = samples_df['paired-end or single-end'].str.lower() == seq_end + if with_control: + 
samples_filter = samples_filter & (~samples_df['control'].isnull()) + pipeline_type = '-'.join([seq_end, with_control]) + else: + samples_filter = samples_filter & (samples_df['control'].isnull()) + pipeline_type = '-'.join([seq_end]) + yield pipeline_type, np.sum(samples_filter) + if lib_type == consts.library_type_rna_seq: + for seq_end in consts.seq_ends: + for strandness in consts.strandnesses: + samples_filter = \ + (samples_df['paired-end or single-end'].str.lower() == seq_end) \ + & (samples_df['strand specificity'].str.lower() == strandness) + if consts.with_sjdb: + pipeline_type = '-'.join([seq_end, strandness, 'with-sjdb']) + else: + pipeline_type = '-'.join([seq_end, strandness]) + yield pipeline_type, np.sum(samples_filter) + if lib_type == consts.library_type_atac_seq: + for seq_end in consts.seq_ends: + for with_blacklist_removal in consts.blacklist_removal: + samples_filter = (samples_df['paired-end or single-end'].str.lower() == seq_end) + if with_blacklist_removal: + pipeline_type = '-'.join([seq_end, with_blacklist_removal]) + samples_filter = samples_filter & (~samples_df['blacklist removal'].isnull()) + else: + pipeline_type = '-'.join([seq_end]) + samples_filter = samples_filter & (samples_df['blacklist removal'].isnull()) + yield pipeline_type, np.sum(samples_filter) + if lib_type == consts.library_type_starr_seq: + for seq_end in consts.seq_ends: + for with_umis in consts.with_umis: + samples_filter = (samples_df['paired-end or single-end'].str.lower() == seq_end) + if with_umis: + pipeline_type = '-'.join([seq_end, with_umis]) + if 'umis' in samples_df.columns: + samples_filter = samples_filter & samples_df['umis'] + else: + pipeline_type = '-'.join([seq_end]) + if 'umis' in samples_df.columns: + samples_filter = samples_filter & ~samples_df['umis'] + yield pipeline_type, np.sum(samples_filter) + + +def data_acquisition_cells(conf_args, lib_type, metadata_file, nsamples): + cells = [] + if conf_args['data_from'] != consts.DATA_SOURCES_LOCAL: + cells.extend(download_fastq_files(conf_args, + lib_type, + metadata_fn=metadata_file)) + cells.extend(merge_fastq_files(conf_args, + lib_type, + metadata_filename=metadata_file, + num_samples=nsamples)) + else: + download_fn = "%s/data/%s/processed_raw_reads/%s" % ( + conf_args['root_dir'], lib_type, + conf_args['project_name']) + warning_cell = Cell(contents=None, + description=["### FASTQ files already available locally!!", + "Please, make sure the FASTQ files are correctly named, decompressed and located/symlinked in:", + "", "**", download_fn, "**"]) + cells.extend(warning_cell.to_list()) + + return cells + + +def create_cells(samples_df, conf_args=None): + """ + Master function to write all code and text for the notebook. 
+
+    Conceptually, there are a number of things that have to happen:
+     - save metadata txt file
+     - download FASTQ.gz files from sequencing core
+     - uncompress FASTQ.gz files
+     - rename and move FASTQ files
+     - create JSON files for cwltool
+     - execute cwltool master file
+    """
+    lib_type = samples_df.iloc[0]['library type'].lower().replace('-', '_')
+    num_samples = samples_df.shape[0]
+    cells = []
+
+    cc, metadata_file = save_metadata(samples_df, conf_args, lib_type)
+    cells.extend(cc)
+
+    cells.extend(data_acquisition_cells(conf_args, lib_type, metadata_file, num_samples))
+    cells.extend(cwl_json_gen(conf_args, lib_type, metadata_filename=metadata_file))
+    for pipeline_type, n in get_pipeline_types(samples_df):
+        if n > 0:
+            cells.extend(cwl_slurm_array_gen(conf_args, lib_type, metadata_filename=metadata_file,
+                                             pipeline_type=pipeline_type, n_samples=n))
+            cells.extend(generate_qc_cell(conf_args, lib_type, pipeline_type=pipeline_type))
+            cells.extend(generate_plots(conf_args, metadata_file=metadata_file,
+                                        lib_type=lib_type, pipeline_type=pipeline_type, n_samples=n))
+            cells.extend(data_upload(conf_args, lib_type, pipeline_type))
+
+    return cells
+
+
+def make_notebook(outfile, metadata, conf_args=None):
+    """Create notebook with parsed contents from metadata"""
+    nb = nbf.new_notebook()
+
+    cells = []
+    # Create a notebook by library type existing in the metadata file
+    for samples_df in get_samples_by_library_type(metadata, conf_args['sep']):
+        cells.extend(create_cells(samples_df, conf_args=conf_args))
+
+    nb['worksheets'].append(nbf.new_worksheet(cells=cells))
+
+    with open(outfile, 'w') as _:
+        nbformat.write(nb, _)
+
+
+def get_samples_by_library_type(metadata_file, sep='\t'):
+    """
+    Parse a metadata file (either a spreadsheet or a tab-delimited file).
+
+    :return: generator of pandas DataFrames
+    """
+    try:
+        md = pd.read_excel(metadata_file.name,
+                           true_values=['Yes', 'Y', 'yes', 'y', 1],
+                           false_values=['No', 'N', 'no', 'n', 0])
+    except XLRDError as e:
+        print(e)
+        md = pd.read_csv(metadata_file.name,
+                         true_values=['Yes', 'Y', 'yes', 'y', 1],
+                         false_values=['No', 'N', 'no', 'n', 0], sep=sep)
+
+    md.columns = [x.lower() for x in md.columns]
+    named_cols = [c for c in md.columns if not c.startswith('unnamed: ')]
+    lib_types_found = set(md['library type'][~pd.isnull(md['library type'])])
+
+    for lt in lib_types_found:
+        yield md.loc[md['library type'] == lt, named_cols]
+
+
+def init_conf_args(args,
+                   required_args=['root_dir'],
+                   optional_args=['user', 'sep', 'user_duke_email', 'project_name']):
+    conf_args = {}
+    if args['conf_file']:
+        conf_args = ruamel.yaml.load(args['conf_file'], Loader=ruamel.yaml.Loader)
+    for r in required_args:
+        conf_args[r] = args[r] if (r in args and args[r]) else conf_args[r]
+        try:
+            assert conf_args[r] is not None
+        except AssertionError:
+            print("[ERROR]", r, "not defined")
+            raise
+    for o in optional_args:
+        conf_args[o] = args[o] if (o in args and args[o]) else (conf_args[o] if o in conf_args else None)
+    conf_args['user'] = conf_args['user'] or os.environ['USER']
+    conf_args['user_duke_email'] = conf_args['user_duke_email'] or "%s@duke.edu" % conf_args['user']
+    conf_args['project_name'] = conf_args['project_name'] or os.path.splitext(os.path.basename(args['metadata'].name))[0]
+
+    return conf_args
+
+def main():
+    parser = argparse.ArgumentParser('Generator of Jupyter notebooks to execute CWL pre-processing pipelines')
+    parser.add_argument('-o', '--out', required=True, type=str, help='Jupyter notebook output file name')
+    parser.add_argument('-m', '--metadata', required=True, type=argparse.FileType('r'),
+                        help='Metadata file with samples information')
+    parser.add_argument('-f', '--force', action='store_true', help='Force to overwrite output file')
+    parser.add_argument('-n', '--no-upload', action='store_false',
+                        help='Avoids uploading generated data to database when specified')
+    parser.add_argument('--metadata-sep', dest='sep', required=False, type=str, default='\t',
+                        help='Separator for metadata file (when different from an Excel spreadsheet)')
+    parser.add_argument('--project-name', required=False, type=str,
+                        help='Project name (by default, basename of metadata file name)')
+    parser.add_argument('--data-from', required=False, choices=consts.data_sources,
+                        default=consts.data_sources[0],
+                        help='Choices: %s' % (', '.join(consts.data_sources)))
+    parser.add_argument('-c', '--conf-file', required=False, type=argparse.FileType('r'), help='YAML configuration file (see examples)')
+    parser.add_argument('-u', '--user', required=False,
+                        help='HARDAC User used in SLURM (default: ${USER})')
+    parser.add_argument('-e', '--user-duke-email', required=False,
+                        help='Email(s) notified when execution is finished (default: ${USER}@duke.edu)')
+    parser.add_argument('-r', '--root-dir', required=False,
+                        help='Root directory where all subfolders and files will be created '
+                             '(semi-required: either defined here or in conf-file)')
+
+    args = parser.parse_args()
+
+    conf_args = init_conf_args(vars(args))
+
+    outfile = "%s.ipynb" % conf_args['project_name']
+
+    if os.path.isdir(args.out):
+        outfile = os.path.join(args.out, outfile)
+    else:
+        outfile = args.out
+
+    if os.path.isfile(outfile) and not args.force:
+        print(outfile, "is an existing file.
Please use -f or --force to overwrite the contents") + sys.exit(1) + + conf_args['upload'] = args.no_upload + conf_args['data_from'] = args.data_from + make_notebook(outfile, + args.metadata, + conf_args=conf_args) + + +if __name__ == '__main__': + main() diff --git a/templates/cwl_json_gen.j2 b/ggr_cwl_ipynb_gen/templates/cwl_json_gen.j2 similarity index 100% rename from templates/cwl_json_gen.j2 rename to ggr_cwl_ipynb_gen/templates/cwl_json_gen.j2 diff --git a/templates/cwl_slurm_array_gen.j2 b/ggr_cwl_ipynb_gen/templates/cwl_slurm_array_gen.j2 similarity index 100% rename from templates/cwl_slurm_array_gen.j2 rename to ggr_cwl_ipynb_gen/templates/cwl_slurm_array_gen.j2 diff --git a/templates/data_upload.j2 b/ggr_cwl_ipynb_gen/templates/data_upload.j2 similarity index 100% rename from templates/data_upload.j2 rename to ggr_cwl_ipynb_gen/templates/data_upload.j2 diff --git a/templates/download_fastq_files.j2 b/ggr_cwl_ipynb_gen/templates/download_fastq_files.j2 similarity index 100% rename from templates/download_fastq_files.j2 rename to ggr_cwl_ipynb_gen/templates/download_fastq_files.j2 diff --git a/templates/generate_plots.j2 b/ggr_cwl_ipynb_gen/templates/generate_plots.j2 similarity index 100% rename from templates/generate_plots.j2 rename to ggr_cwl_ipynb_gen/templates/generate_plots.j2 diff --git a/templates/generate_qc_cell.j2 b/ggr_cwl_ipynb_gen/templates/generate_qc_cell.j2 similarity index 100% rename from templates/generate_qc_cell.j2 rename to ggr_cwl_ipynb_gen/templates/generate_qc_cell.j2 diff --git a/templates/merge_lanes_fastq.j2 b/ggr_cwl_ipynb_gen/templates/merge_lanes_fastq.j2 similarity index 100% rename from templates/merge_lanes_fastq.j2 rename to ggr_cwl_ipynb_gen/templates/merge_lanes_fastq.j2 diff --git a/templates/ungzip_fastq_files.j2 b/ggr_cwl_ipynb_gen/templates/ungzip_fastq_files.j2 similarity index 100% rename from templates/ungzip_fastq_files.j2 rename to ggr_cwl_ipynb_gen/templates/ungzip_fastq_files.j2 diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..b5a3c46 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,6 @@ +[build-system] +requires = [ + "setuptools>=42", + "wheel" +] +build-backend = "setuptools.build_meta" \ No newline at end of file diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..c434e65 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,9 @@ +jinja2 >=2.8 +nbformat >=4.0.1 +numpy >=1.10.4 +pandas >=0.17.1 +xlrd >=1.0.0 +ruamel.yaml >=0.11.11 + +setuptools +pymongo \ No newline at end of file diff --git a/setup.cfg b/setup.cfg new file mode 100644 index 0000000..5b06ed2 --- /dev/null +++ b/setup.cfg @@ -0,0 +1,18 @@ +[metadata] +name = ggr-cwl-ipynb-gen-alexbarrera +version = 0.5.0 +author = Alejandro Barrera +author_email = alejandro.barrera@duke.edu +description = IPython notebook generator for GGR CWL processing pipelines of genomic data +long_description = file: README.md +long_description_content_type = text/markdown +url = https://github.com/alexbarrera/ggr-cwl-ipynb-gen +project_urls = + Bug Tracker = https://github.com/alexbarrera/ggr-cwl-ipynb-gen/issues +classifiers = + Programming Language :: Python :: 3 + License :: OSI Approved :: MIT License + Operating System :: OS Independent + +[options] +packages = find: diff --git a/setup.py b/setup.py new file mode 100644 index 0000000..27ce028 --- /dev/null +++ b/setup.py @@ -0,0 +1,76 @@ +#!/usr/bin/env python3 +"""Description: +Setup script for ggr-cwl-ipynb-gen +IPython Notebook generator for processing genomic data from GGR 
project, in CWL + +This code is free software; you can redistribute it and/or modify it +under the terms of the BSD License (see the file LICENSE included with +the distribution). +""" +from setuptools import setup, find_packages +import pathlib + +here = pathlib.Path(__file__).parent.resolve() + +# Get the long description from the README file +long_description = (here / 'README.md').read_text(encoding='utf-8') + +# Load version as VERSION environmental variable +exec(open("VERSION.py").read()) + +setup( + name='ggr_cwl_ipynb_gen', + version=VERSION, + description='IPython notebook generator for GGR CWL processing pipelines of genomic data', + long_description=long_description, # Optional + long_description_content_type='text/markdown', + url='https://github.com/alexbarrera/ggr-cwl-ipynb-gen', + author='Alejandro Barrera', + author_email='alejandro.barrera@duke.edu', + classifiers=[ + 'Development Status :: 3 - Alpha', + 'Intended Audience :: Science/Research', + 'Topic :: Scientific/Engineering :: Bio-Informatics', + 'License :: OSI Approved :: MIT License', + 'Programming Language :: Python :: 2', + 'Programming Language :: Python :: 2.7', + 'Programming Language :: Python :: 3', + 'Programming Language :: Python :: 3.6', + 'Programming Language :: Python :: 3.7', + 'Programming Language :: Python :: 3.8', + 'Programming Language :: Python :: 3.9' + ], + keywords='cwl, bioinformatics, development', + scripts=[ + "ggr_cwl_ipynb_gen/ggr-cwl-ipynb-gen.py", + "chipdb_upload/data_upload.py", + ], + include_package_data=True, + py_modules=["ggr_cwl_ipynb_gen"], + python_requires='>=2.7, <4', + install_requires=[ + 'jinja2 >=2.8', + 'nbformat >=4.0.1', + 'numpy >=1.10.4', + 'pandas >=0.17.1', + 'xlrd >=1.0.0', + 'ruamel.yaml >=0.11.11', + 'setuptools', + 'pymongo' + ], # Optional + data_files=[ + ('templates', ['ggr_cwl_ipynb_gen/templates/cwl_json_gen.j2']), + ('templates', ['ggr_cwl_ipynb_gen/templates/cwl_slurm_array_gen.j2']), + ('templates', ['ggr_cwl_ipynb_gen/templates/data_upload.j2']), + ('templates', ['ggr_cwl_ipynb_gen/templates/download_fastq_files.j2']), + ('templates', ['ggr_cwl_ipynb_gen/templates/generate_plots.j2']), + ('templates', ['ggr_cwl_ipynb_gen/templates/generate_qc_cell.j2']), + ('templates', ['ggr_cwl_ipynb_gen/templates/merge_lanes_fastq.j2']), + ('templates', ['ggr_cwl_ipynb_gen/templates/ungzip_fastq_files.j2']), + ], # Optional + project_urls={ + 'Bug Reports': 'https://github.com/alexbarrera/ggr-cwl-ipynb-gen/issues', + 'Source': 'https://github.com/alexbarrera/ggr-cwl-ipynb-gen/', + } + +) \ No newline at end of file From 3edda0877cd11b69cdacd7b3683e23e70ab6f3b4 Mon Sep 17 00:00:00 2001 From: Alejandro Barrera Date: Tue, 9 Mar 2021 10:49:33 -0500 Subject: [PATCH 3/6] Drop attempt to support Python 2 --- setup.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/setup.py b/setup.py index 27ce028..05751ca 100644 --- a/setup.py +++ b/setup.py @@ -7,6 +7,9 @@ under the terms of the BSD License (see the file LICENSE included with the distribution). """ +import sys +if sys.version_info[0] == 2: + sys.exit("Sorry, Python 2 is not supported anymore. 
Please check and old branch (pre 2021)") from setuptools import setup, find_packages import pathlib @@ -32,8 +35,6 @@ 'Intended Audience :: Science/Research', 'Topic :: Scientific/Engineering :: Bio-Informatics', 'License :: OSI Approved :: MIT License', - 'Programming Language :: Python :: 2', - 'Programming Language :: Python :: 2.7', 'Programming Language :: Python :: 3', 'Programming Language :: Python :: 3.6', 'Programming Language :: Python :: 3.7', @@ -47,7 +48,7 @@ ], include_package_data=True, py_modules=["ggr_cwl_ipynb_gen"], - python_requires='>=2.7, <4', + python_requires='>=3.1', install_requires=[ 'jinja2 >=2.8', 'nbformat >=4.0.1', From a1b068447c1988aeafd4ce0c3b05b14066d2e1b2 Mon Sep 17 00:00:00 2001 From: Alejandro Barrera Date: Mon, 15 Mar 2021 17:22:00 -0400 Subject: [PATCH 4/6] Resolve review comments from Thomas --- MANIFEST.in | 1 - VERSION.py | 1 - ggr_cwl_ipynb_gen/__init__.py | 2 -- ggr_cwl_ipynb_gen/ggr-cwl-ipynb-gen.py | 2 ++ requirements.txt | 3 +-- setup.cfg | 18 +++++++++++---- setup.py | 32 +------------------------- 7 files changed, 17 insertions(+), 42 deletions(-) delete mode 100644 MANIFEST.in delete mode 100644 VERSION.py diff --git a/MANIFEST.in b/MANIFEST.in deleted file mode 100644 index 903bcc4..0000000 --- a/MANIFEST.in +++ /dev/null @@ -1 +0,0 @@ -include templates/* \ No newline at end of file diff --git a/VERSION.py b/VERSION.py deleted file mode 100644 index f1c763a..0000000 --- a/VERSION.py +++ /dev/null @@ -1 +0,0 @@ -VERSION = '0.5.0' \ No newline at end of file diff --git a/ggr_cwl_ipynb_gen/__init__.py b/ggr_cwl_ipynb_gen/__init__.py index a75ddec..e69de29 100644 --- a/ggr_cwl_ipynb_gen/__init__.py +++ b/ggr_cwl_ipynb_gen/__init__.py @@ -1,2 +0,0 @@ -if __name__ == '__main__': - ggr-main() diff --git a/ggr_cwl_ipynb_gen/ggr-cwl-ipynb-gen.py b/ggr_cwl_ipynb_gen/ggr-cwl-ipynb-gen.py index 1cc9fde..2de202e 100644 --- a/ggr_cwl_ipynb_gen/ggr-cwl-ipynb-gen.py +++ b/ggr_cwl_ipynb_gen/ggr-cwl-ipynb-gen.py @@ -634,6 +634,8 @@ def main(): parser.add_argument('-r', '--root-dir', required=False, help='Root directory where all subfolders and files will be created ' '(semi-required: either defined here or in conf-file)') + parser.add_argument('-v', '--version', required=False, + help='Print version of the program and exit') args = parser.parse_args() diff --git a/requirements.txt b/requirements.txt index c434e65..48a30d4 100644 --- a/requirements.txt +++ b/requirements.txt @@ -5,5 +5,4 @@ pandas >=0.17.1 xlrd >=1.0.0 ruamel.yaml >=0.11.11 -setuptools -pymongo \ No newline at end of file +pymongo diff --git a/setup.cfg b/setup.cfg index 5b06ed2..4b62c45 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,18 +1,26 @@ [metadata] -name = ggr-cwl-ipynb-gen-alexbarrera +name = ggr-cwl-ipynb-gen version = 0.5.0 author = Alejandro Barrera author_email = alejandro.barrera@duke.edu description = IPython notebook generator for GGR CWL processing pipelines of genomic data long_description = file: README.md long_description_content_type = text/markdown -url = https://github.com/alexbarrera/ggr-cwl-ipynb-gen +url = https://github.com/ReddyLab/ggr-cwl-ipynb-gen project_urls = - Bug Tracker = https://github.com/alexbarrera/ggr-cwl-ipynb-gen/issues + Bug Tracker = https://github.com/ReddyLab/ggr-cwl-ipynb-gen/issues + Source = https://github.com/ReddyLab/ggr-cwl-ipynb-gen/ + classifiers = - Programming Language :: Python :: 3 + Development Status :: 3 - Alpha + Intended Audience :: Science/Research + Topic :: Scientific/Engineering :: Bio-Informatics License :: OSI Approved :: 
MIT License - Operating System :: OS Independent + Programming Language :: Python :: 3 + Programming Language :: Python :: 3.6 + Programming Language :: Python :: 3.7 + Programming Language :: Python :: 3.8 + Programming Language :: Python :: 3.9 [options] packages = find: diff --git a/setup.py b/setup.py index 05751ca..a5d1e4e 100644 --- a/setup.py +++ b/setup.py @@ -11,36 +11,10 @@ if sys.version_info[0] == 2: sys.exit("Sorry, Python 2 is not supported anymore. Please check and old branch (pre 2021)") from setuptools import setup, find_packages -import pathlib - -here = pathlib.Path(__file__).parent.resolve() - -# Get the long description from the README file -long_description = (here / 'README.md').read_text(encoding='utf-8') - -# Load version as VERSION environmental variable -exec(open("VERSION.py").read()) setup( - name='ggr_cwl_ipynb_gen', - version=VERSION, - description='IPython notebook generator for GGR CWL processing pipelines of genomic data', - long_description=long_description, # Optional long_description_content_type='text/markdown', url='https://github.com/alexbarrera/ggr-cwl-ipynb-gen', - author='Alejandro Barrera', - author_email='alejandro.barrera@duke.edu', - classifiers=[ - 'Development Status :: 3 - Alpha', - 'Intended Audience :: Science/Research', - 'Topic :: Scientific/Engineering :: Bio-Informatics', - 'License :: OSI Approved :: MIT License', - 'Programming Language :: Python :: 3', - 'Programming Language :: Python :: 3.6', - 'Programming Language :: Python :: 3.7', - 'Programming Language :: Python :: 3.8', - 'Programming Language :: Python :: 3.9' - ], keywords='cwl, bioinformatics, development', scripts=[ "ggr_cwl_ipynb_gen/ggr-cwl-ipynb-gen.py", @@ -68,10 +42,6 @@ ('templates', ['ggr_cwl_ipynb_gen/templates/generate_qc_cell.j2']), ('templates', ['ggr_cwl_ipynb_gen/templates/merge_lanes_fastq.j2']), ('templates', ['ggr_cwl_ipynb_gen/templates/ungzip_fastq_files.j2']), - ], # Optional - project_urls={ - 'Bug Reports': 'https://github.com/alexbarrera/ggr-cwl-ipynb-gen/issues', - 'Source': 'https://github.com/alexbarrera/ggr-cwl-ipynb-gen/', - } + ] ) \ No newline at end of file From c1e5e23db5ef21d54aa47e0d45dfa85af1236206 Mon Sep 17 00:00:00 2001 From: Alejandro Barrera Date: Mon, 15 Mar 2021 17:38:22 -0400 Subject: [PATCH 5/6] Add version to pymongo requirement --- requirements.txt | 3 +-- setup.py | 2 +- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/requirements.txt b/requirements.txt index 48a30d4..e0a34ba 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,5 +4,4 @@ numpy >=1.10.4 pandas >=0.17.1 xlrd >=1.0.0 ruamel.yaml >=0.11.11 - -pymongo +pymongo >=3.4.0 diff --git a/setup.py b/setup.py index a5d1e4e..7cf4142 100644 --- a/setup.py +++ b/setup.py @@ -31,7 +31,7 @@ 'xlrd >=1.0.0', 'ruamel.yaml >=0.11.11', 'setuptools', - 'pymongo' + 'pymongo >=3.4.0' ], # Optional data_files=[ ('templates', ['ggr_cwl_ipynb_gen/templates/cwl_json_gen.j2']), From b777788549c3dd3872c1630debf05c0333daaf34 Mon Sep 17 00:00:00 2001 From: Alejandro Barrera Date: Mon, 15 Mar 2021 17:41:48 -0400 Subject: [PATCH 6/6] Add version to pymongo requirement --- data_upload.py | 1 - ggr-cwl-ipynb-gen.py | 1 - 2 files changed, 2 deletions(-) delete mode 120000 data_upload.py delete mode 120000 ggr-cwl-ipynb-gen.py diff --git a/data_upload.py b/data_upload.py deleted file mode 120000 index 9aae8f7..0000000 --- a/data_upload.py +++ /dev/null @@ -1 +0,0 @@ -chipdb_upload/data_upload.py \ No newline at end of file diff --git a/ggr-cwl-ipynb-gen.py 
b/ggr-cwl-ipynb-gen.py
deleted file mode 120000
index e27d141..0000000
--- a/ggr-cwl-ipynb-gen.py
+++ /dev/null
@@ -1 +0,0 @@
-ggr_cwl_ipynb_gen/ggr-cwl-ipynb-gen.py
\ No newline at end of file