From b504e74198109292104b0f5c89c0f763867244ef Mon Sep 17 00:00:00 2001 From: Alejandro Barrera Date: Mon, 8 Mar 2021 15:45:15 -0500 Subject: [PATCH 1/6] Upgrade code to python 3 --- ggr-cwl-ipynb-gen.py | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/ggr-cwl-ipynb-gen.py b/ggr-cwl-ipynb-gen.py index 6afc68d..18ce00b 100644 --- a/ggr-cwl-ipynb-gen.py +++ b/ggr-cwl-ipynb-gen.py @@ -10,11 +10,10 @@ import consts import jinja2 import inspect -import glob import numpy as np encoding = sys.getfilesystemencoding() -EXEC_DIR = os.path.dirname(unicode(__file__, encoding)) +EXEC_DIR = os.path.dirname(str(__file__)) def render(tpl_path, context): @@ -566,10 +565,11 @@ def get_samples_by_library_type(metadata_file, sep='\t'): :return: generator of panda's dataframe """ try: - md = pd.read_excel(metadata_file, + md = pd.read_excel(metadata_file.name, true_values=['Yes', 'Y', 'yes', 'y', 1], false_values=['No', 'N', 'no', 'n', 0]) except XLRDError: + print (XLRDError) md = pd.read_csv(metadata_file.name, true_values=['Yes', 'Y', 'yes', 'y', 1], false_values=['No', 'N', 'no', 'n', 0], sep=sep) @@ -592,8 +592,8 @@ def init_conf_args(args, conf_args[r] = args[r] if (r in args and args[r]) else conf_args[r] try: assert conf_args[r] is not None - except AssertionError, e: - print "[ERROR]", r, "not defined" + except AssertionError as e: + print("[ERROR]", r, "not defined") raise for o in optional_args: conf_args[o] = args[o] if (o in args and args[o]) else (conf_args[o] if o in conf_args else None) @@ -606,7 +606,8 @@ def init_conf_args(args, def main(): parser = argparse.ArgumentParser('Generator of Jupyter notebooks to execute CWL pre-processing pipelines') parser.add_argument('-o', '--out', required=True, type=str, help='Jupyter notebook output file name') - parser.add_argument('-m', '--metadata', required=True, type=file, help='Metadata file with samples information') + parser.add_argument('-m', '--metadata', required=True, type=argparse.FileType('r'), + help='Metadata file with samples information') parser.add_argument('-f', '--force', action='store_true', help='Force to overwrite output file') parser.add_argument('-n', '--no-upload', action='store_false', help='Avoids uploading generated data to database when specified') @@ -617,7 +618,7 @@ def main(): parser.add_argument('--data-from', required=False, choices=consts.data_sources, default=consts.data_sources[0], help='Choices: %s' % (', '.join(consts.data_sources))) - parser.add_argument('-c', '--conf-file', required=False, type=file, help='YAML configuration file (see examples)') + parser.add_argument('-c', '--conf-file', required=False, type=argparse.FileType('r'), help='YAML configuration file (see examples)') parser.add_argument('-u', '--user', required=False, help='HARDAC User used in SLURM (default: ${USER})') parser.add_argument('-e', '--user-duke-email', required=False, @@ -638,7 +639,7 @@ def main(): outfile = args.out if os.path.isfile(outfile) and not args.force: - print outfile, "is an existing file. Please use -f or --force to overwrite the contents" + print(outfile, "is an existing file. 
Please use -f or --force to overwrite the contents") sys.exit(1) conf_args['upload'] = args.no_upload From 2613d3a91be5d6a32caf60193ac0677ec76167a6 Mon Sep 17 00:00:00 2001 From: Alejandro Barrera Date: Tue, 9 Mar 2021 10:29:37 -0500 Subject: [PATCH 2/6] Moderate refactor needed to create an installable package --- LICENSE | 21 + MANIFEST.in | 1 + README.md | 39 +- VERSION.py | 1 + chipdb_upload/__init__.py | 0 chipdb_upload/data_upload.py | 257 +++++++ data_upload.py | 258 +------ ggr-cwl-ipynb-gen.py | 654 +---------------- ggr_cwl_ipynb_gen/__init__.py | 2 + consts.py => ggr_cwl_ipynb_gen/consts.py | 3 + ggr_cwl_ipynb_gen/ggr-cwl-ipynb-gen.py | 661 ++++++++++++++++++ .../templates}/cwl_json_gen.j2 | 0 .../templates}/cwl_slurm_array_gen.j2 | 0 .../templates}/data_upload.j2 | 0 .../templates}/download_fastq_files.j2 | 0 .../templates}/generate_plots.j2 | 0 .../templates}/generate_qc_cell.j2 | 0 .../templates}/merge_lanes_fastq.j2 | 0 .../templates}/ungzip_fastq_files.j2 | 0 pyproject.toml | 6 + requirements.txt | 9 + setup.cfg | 18 + setup.py | 76 ++ 23 files changed, 1073 insertions(+), 933 deletions(-) create mode 100644 LICENSE create mode 100644 MANIFEST.in create mode 100644 VERSION.py create mode 100644 chipdb_upload/__init__.py create mode 100644 chipdb_upload/data_upload.py mode change 100644 => 120000 data_upload.py mode change 100644 => 120000 ggr-cwl-ipynb-gen.py create mode 100644 ggr_cwl_ipynb_gen/__init__.py rename consts.py => ggr_cwl_ipynb_gen/consts.py (97%) create mode 100644 ggr_cwl_ipynb_gen/ggr-cwl-ipynb-gen.py rename {templates => ggr_cwl_ipynb_gen/templates}/cwl_json_gen.j2 (100%) rename {templates => ggr_cwl_ipynb_gen/templates}/cwl_slurm_array_gen.j2 (100%) rename {templates => ggr_cwl_ipynb_gen/templates}/data_upload.j2 (100%) rename {templates => ggr_cwl_ipynb_gen/templates}/download_fastq_files.j2 (100%) rename {templates => ggr_cwl_ipynb_gen/templates}/generate_plots.j2 (100%) rename {templates => ggr_cwl_ipynb_gen/templates}/generate_qc_cell.j2 (100%) rename {templates => ggr_cwl_ipynb_gen/templates}/merge_lanes_fastq.j2 (100%) rename {templates => ggr_cwl_ipynb_gen/templates}/ungzip_fastq_files.j2 (100%) create mode 100644 pyproject.toml create mode 100644 requirements.txt create mode 100644 setup.cfg create mode 100644 setup.py diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..8017ac4 --- /dev/null +++ b/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2021 Alejandro Barrera + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
\ No newline at end of file diff --git a/MANIFEST.in b/MANIFEST.in new file mode 100644 index 0000000..903bcc4 --- /dev/null +++ b/MANIFEST.in @@ -0,0 +1 @@ +include templates/* \ No newline at end of file diff --git a/README.md b/README.md index 2cbc1a8..67a277a 100644 --- a/README.md +++ b/README.md @@ -1,12 +1,12 @@ # ggr-cwl-ipynb-gen Jupyter notebook generator to download and execute the processing files for GGR related datasets. At this point, is not intented to cover all use cases, but to serve as a quick generator of all -related files and scripts to pre-process sequences generated at the [Duke-GCB Sequencing Core](https://genome.duke.edu/cores-and-services/sequencing-and-genomic-technologies) in [HARDAC](https://genome.duke.edu/cores-and-services/computational-solutions/compute-environments-genomics). +related files and scripts to pre-process genomic data generated at the [Duke-GCB Sequencing Core](https://genome.duke.edu/cores-and-services/sequencing-and-genomic-technologies) in [HARDAC](https://genome.duke.edu/cores-and-services/computational-solutions/compute-environments-genomics). Example of usage: ``` $ python ggr-cwl-ipynb-gen.py \ - --conf examples/conf.yaml \ + --root-dir /path/to/rootdir \ --metadata examples/Hong_3979_170316B1.xlsx \ --out /path/to/output_dir \ --force @@ -16,33 +16,26 @@ The information in the example metadata and configuration file should reveal wha For a full list of options: ``` $ python ggr-cwl-ipynb-gen.py -h -usage: Generator of Jupyter notebooks to execute CWL pre-processing pipelines - [-h] -o OUT -c CONF_FILE -m METADATA [-f] [-n] [--metadata-sep SEP] - [--project-name PROJECT_NAME] [--data-from {sftp,miseq,other,dukeds}] +usage: Generator of Jupyter notebooks to execute CWL pre-processing pipelines [-h] -o OUT -m METADATA [-f] [-n] [--metadata-sep SEP] [--project-name PROJECT_NAME] + [--data-from {sftp,miseq,other,dukeds,local}] [-c CONF_FILE] [-u USER] [-e USER_DUKE_EMAIL] [-r ROOT_DIR] optional arguments: -h, --help show this help message and exit -o OUT, --out OUT Jupyter notebook output file name - -c CONF_FILE, --conf-file CONF_FILE - YAML configuration file (see examples) -m METADATA, --metadata METADATA Metadata file with samples information -f, --force Force to overwrite output file - -n, --no-upload Avoids uploading generated data to database when - specified - --metadata-sep SEP Separator for metadata file (when different than Excel - spread sheet) + -n, --no-upload Avoids uploading generated data to database when specified + --metadata-sep SEP Separator for metadata file (when different than Excel spread sheet) --project-name PROJECT_NAME - Project name (by default, basename of metadata file - name) - --data-from {sftp,miseq,other,dukeds} - Choices: sftp, miseq, other, dukeds + Project name (by default, basename of metadata file name) + --data-from {sftp,miseq,other,dukeds,local} + Choices: sftp, miseq, other, dukeds, local + -c CONF_FILE, --conf-file CONF_FILE + YAML configuration file (see examples) + -u USER, --user USER HARDAC User used in SLURM (default: ${USER}) + -e USER_DUKE_EMAIL, --user-duke-email USER_DUKE_EMAIL + Email(s) notified when execution is finished (default: ${USER}@duke.edu) + -r ROOT_DIR, --root-dir ROOT_DIR + Root directory where all subfolders and files will be created (semi-required: either defined here or in conf-file) ``` - -### Dependencies -- jinja2 >=2.8 -- nbformat >=4.0.1 -- numpy >=1.10.4 -- pandas >=0.17.1 -- xlrd >=1.0.0 -- ruamel >=0.11.11 diff --git a/VERSION.py b/VERSION.py new file mode 
100644 index 0000000..f1c763a --- /dev/null +++ b/VERSION.py @@ -0,0 +1 @@ +VERSION = '0.5.0' \ No newline at end of file diff --git a/chipdb_upload/__init__.py b/chipdb_upload/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/chipdb_upload/data_upload.py b/chipdb_upload/data_upload.py new file mode 100644 index 0000000..47a875f --- /dev/null +++ b/chipdb_upload/data_upload.py @@ -0,0 +1,257 @@ +#!/usr/bin/env python +from pymongo import MongoClient +import datetime +import os, csv +import argparse +import pandas as pd +import base64 +import logging + +# Python script and command line tool for compiling fingerprint and QC data from ChIP-seq +# experiments. Make sure to activate the 'alex' virtual environment from miniconda using +# `source /data/reddylab/software/miniconda2/bin/activate alex` command from HARDAC. To +# run full workflow, run the `countFactors_standard.sh` that outputs data directories +# then run this script on those outputs. + +# VERSION 1.0 Notes: pandas.dataframe.set_value() method is deprecated and will be removed +# in later iterations. + + +CWD = os.getcwd() + "/" +OUT_DIR = CWD + "QC_summary/" + + +def pretty_print(df): + with pd.option_context('display.max_rows', None, 'display.max_columns', len(df)): + print(df) + + +def stringFormat(string): + return string.strip().lower().replace(" ", "_").replace('-', '_').replace("%", "percent") + + +def read_file_base64(in_file): + """ + Helper function that reads file into binary + :param in_file: Absolute path to file + :return: The file contents as a string + """ + try: + with open(in_file, 'rb') as f: + return base64.b64encode(f.read()) + # Exception for symlinks + except IOError: + with open(os.readlink(in_file), 'rb') as f: + return base64.b64encode(f.read()) + + +def read_metadata(in_file): + """ + Helper function that reads a metadata file and returns a dictionary of values + :param in_file: The full metadata file path as a string + :return: A dictionary of the files' attributes + """ + attr = {} + # Read a 2-line tab-delimited file with header and contents + with open(in_file, 'rb') as f: + reader = csv.reader(f, delimiter='\t') + header = [stringFormat(ele) for ele in next(reader)] + contents = [stringFormat(ele) for ele in next(reader)] + attr = dict(zip(header, contents)) + + return attr + + +def standardize_header(arr): + """Returns a dataframe header as list, standardized to + QC naming convention + :param arr: A list of strings representing header + :return: Standardized column names, list of strings + """ + header_dict = {"sample": "sample", "raw": "reads_sequenced", + "reads_sequenced": "reads_sequenced", "reads after trimming": "reads_after_trimming", + "trimmed": "reads_after_trimming", "mapped": "reads_mapped", + "reads_mapped": "reads_mapped", "percentage_unique": "percent_unique", + "%reads unique": "percent_unique", + "percentage_unique_mapped_and_filtered": "percent_unique_mapped_filtered", + "%reads mapped after de-dup & filtering": "percent_unique_mapped_filtered", + "reads in peaks": "reads_in_peaks", "in_peaks": "reads_in_peaks", + "percent_in_peaks": "percent_in_peaks", "% reads in peaks": "percent_in_peaks", + "broadpeak_count": "broad_peak_count", "narrowpeak_count": "narrow_peak_count", + "nrf": "nrf", "pbc": "pbc_one", "nsc": "nsc", "rsc": "rsc", "comment": "comment"} + elements = [] + useColumns = [] + for i, ele in enumerate(arr): + if ele.lower() in header_dict.keys(): + elements.append(header_dict[ele.lower()]) + useColumns.append(i) + return elements, useColumns + + 
+def process_directory(in_dir): + """ + Processes data in directory, returns as Pandas dataframe + :param in_dir: Input data directory, String + :return: A Pandas dataframe containing fingerprint data, QCs, and images + """ + qc_file = "" + fingerprint_qc_arr = [] + spp_data_arr = [] + images = [] + metadata_files = [] + # Separate files into appropriate lists + for filename in os.listdir(in_dir): + # Append the file path + file_path = os.path.join(in_dir, filename) + if os.stat(file_path).st_size != 0: + if filename.lower().endswith('_metadata.txt'): # Find metadata + metadata_files.append(file_path) + elif filename.endswith('_QCmetrics.txt'): # If fingerprint QC file, add to array + fingerprint_qc_arr.append(file_path) + elif filename.lower() == 'qc.csv' or filename.lower() == 'qc.txt' \ + or filename.lower() == 'chip_seq_summary_iter0.tsv': # If lab-computed QC file, set var + qc_file = file_path + elif filename.endswith(".png") or filename.endswith(".pdf"): + images.append(file_path) + elif filename.endswith('.cross_corr.txt'): # If cross corr data, add to array + spp_data_arr.append(file_path) + + # Raise error if QC file was not found. + if not os.path.isfile(qc_file): + logging.error("QC file was not found in the data directory (i.e. qc.csv, qc.txt)") + raise IOError("QC file was not found in the data directory (i.e. qc.csv, qc.txt)") + + # Process QC file into a dataframe + try: + with open(qc_file, 'rb') as f: + # Find delimiter using Sniffer class + dialect = csv.Sniffer().sniff(f.readline(), ['\t', ',']) + reader = csv.reader(f, delimiter=dialect.delimiter) + f.seek(0) + column_names = standardize_header(next(reader)) + # Read data into Pandas dataframe + df = pd.read_csv(f, delimiter=dialect.delimiter, header=None, + names=column_names[0], usecols=column_names[1], engine='python') + # Catch if the filename is not an actual file, but a symlink + except IOError: + with open(os.readlink(qc_file), 'rb') as f: + # Find delimiter using Sniffer class + dialect = csv.Sniffer().sniff(f.readline(), ['\t', ',']) + reader = csv.reader(f, delimiter=dialect.delimiter) + f.seek(0) + column_names = standardize_header(next(reader)) + # Read data into Pandas dataframe + df = pd.read_csv(f, delimiter=dialect.delimiter, header=None, + names=column_names[0], usecols=column_names[1], engine='python') + + # Index the dataframe by sample + df.set_index('sample', inplace=True) + + # If there are fingerprint files, add to array + if fingerprint_qc_arr: + # Add fingerprint data to dataframe + fp_df = pd.DataFrame() + for filename in fingerprint_qc_arr: + if os.stat(filename).st_size != 0: + with open(filename, 'rb') as f: + reader = csv.reader(f, delimiter='\t') + header = [stringFormat(ele) for ele in next(reader)] + new_fp_df = pd.read_csv(f, delimiter='\t', header=None, + names=header, engine='python') + fp_df = fp_df.append(new_fp_df) + fp_df.drop_duplicates(subset='sample', keep='last', inplace=True) + fp_df.set_index('sample', inplace=True) + df = df.merge(fp_df, left_index=True, right_index=True, how='outer') + + # Add fingerprint images and metadata information + for sample in df.index.values: # Index is sample name + fp_image = '' + spp_image = '' + metadata_file = '' + for filename in images: + if filename.endswith('.png') and sample in filename: + fp_image = filename + elif filename.endswith('.pdf') and sample in filename: + spp_image = filename + for filename in metadata_files: + if sample in filename: + metadata_file = filename + if fp_image: + df.set_value(sample, 'fp_image', 
read_file_base64(fp_image)) + if spp_image: + df.set_value(sample, 'spp_image', read_file_base64(spp_image)) + if metadata_file: + # Read in all metadata attributes into df + for key, value in read_metadata(metadata_file).iteritems(): + df.set_value(sample, key, value) + # Set flowcell name to base directory + df.set_value(sample, 'flowcell', os.path.basename(in_dir)) + + return df + + +def main(): + parser = argparse.ArgumentParser('Generates QC metric summary file for available ChIP-seq samples') + parser.add_argument('-i', '--in_dirs', required=True, nargs='+', + help='Directory(ies)for fingerprint data') + parser.add_argument('-u', '--uri', required=True, + help='URI for database upload') + parser.add_argument('-d', '--database', required=True, + help='Database name for upload') + parser.add_argument('-c', '--collection', required=True, + help='Collection name for database') + parser.add_argument('-o', '--output', required=True, help="Filename for output log") + args = parser.parse_args() + + logging.basicConfig(filename=args.output, level=logging.DEBUG) + + + # Process each given data directory + df = pd.DataFrame() + for i in range(len(args.in_dirs)): + if os.path.isdir(args.in_dirs[i]): + new_df = process_directory(args.in_dirs[i]) + df = df.append(new_df) + factor_names = [row.split('.')[0] for row in df.index.values] + df.rename(columns={'diff._enrichment':'diff_enrichment'}, inplace=True) + + # Convert Pandas dataframe into list of dictionaries + data = df.to_dict(orient='index') + + # Insert documents (list of dicts) to web-application database + uri = args.uri + client = MongoClient(uri) + sample_coll = client[args.database][args.collection] + flowcell_coll = client[args.database]["flowcell"] + + # Initialize a flowcell data + flowcell_name = "" + flowcell_data = {"samples": []} + + # For each sample, replace if it exists, otherwise insert (upsert) + for sample_name in data: + # Set sample data + sample = data[sample_name] + sample['sample'] = sample_name + sample['last_modified'] = datetime.datetime.utcnow() + logging.info("Uploading sample: %s" % sample_name) + sample_coll.replace_one({'sample': sample_name}, sample, upsert=True) + + # Set flowcell data + flowcell_name = sample['flowcell'] + flowcell_data['name'] = flowcell_name + flowcell_data['date'] = sample['timestamp'] + flowcell_data['samples'].append(sample_name) + + # Upsert the flowcell + logging.info("Uploading flowcell: %s" % flowcell_data) + flowcell_coll.replace_one({'name': flowcell_name}, flowcell_data, upsert=True) + + logging.info("Data upload terminated successfully") + + + return + + +if __name__ == '__main__': + main() diff --git a/data_upload.py b/data_upload.py deleted file mode 100644 index 91116fd..0000000 --- a/data_upload.py +++ /dev/null @@ -1,257 +0,0 @@ -from pymongo import MongoClient -import datetime -import os, csv -import argparse -import pandas as pd -import base64 -import consts -import logging - -# Python script and command line tool for compiling fingerprint and QC data from ChIP-seq -# experiments. Make sure to activate the 'alex' virtual environment from miniconda using -# `source /data/reddylab/software/miniconda2/bin/activate alex` command from HARDAC. To -# run full workflow, run the `countFactors_standard.sh` that outputs data directories -# then run this script on those outputs. - -# VERSION 1.0 Notes: pandas.dataframe.set_value() method is deprecated and will be removed -# in later iterations. 
- - -CWD = os.getcwd() + "/" -OUT_DIR = CWD + "QC_summary/" - - -def pretty_print(df): - with pd.option_context('display.max_rows', None, 'display.max_columns', len(df)): - print(df) - - -def stringFormat(string): - return string.strip().lower().replace(" ", "_").replace('-', '_').replace("%", "percent") - - -def read_file_base64(in_file): - """ - Helper function that reads file into binary - :param in_file: Absolute path to file - :return: The file contents as a string - """ - try: - with open(in_file, 'rb') as f: - return base64.b64encode(f.read()) - # Exception for symlinks - except IOError: - with open(os.readlink(in_file), 'rb') as f: - return base64.b64encode(f.read()) - - -def read_metadata(in_file): - """ - Helper function that reads a metadata file and returns a dictionary of values - :param in_file: The full metadata file path as a string - :return: A dictionary of the files' attributes - """ - attr = {} - # Read a 2-line tab-delimited file with header and contents - with open(in_file, 'rb') as f: - reader = csv.reader(f, delimiter='\t') - header = [stringFormat(ele) for ele in next(reader)] - contents = [stringFormat(ele) for ele in next(reader)] - attr = dict(zip(header, contents)) - - return attr - - -def standardize_header(arr): - """Returns a dataframe header as list, standardized to - QC naming convention - :param arr: A list of strings representing header - :return: Standardized column names, list of strings - """ - header_dict = {"sample": "sample", "raw": "reads_sequenced", - "reads_sequenced": "reads_sequenced", "reads after trimming": "reads_after_trimming", - "trimmed": "reads_after_trimming", "mapped": "reads_mapped", - "reads_mapped": "reads_mapped", "percentage_unique": "percent_unique", - "%reads unique": "percent_unique", - "percentage_unique_mapped_and_filtered": "percent_unique_mapped_filtered", - "%reads mapped after de-dup & filtering": "percent_unique_mapped_filtered", - "reads in peaks": "reads_in_peaks", "in_peaks": "reads_in_peaks", - "percent_in_peaks": "percent_in_peaks", "% reads in peaks": "percent_in_peaks", - "broadpeak_count": "broad_peak_count", "narrowpeak_count": "narrow_peak_count", - "nrf": "nrf", "pbc": "pbc_one", "nsc": "nsc", "rsc": "rsc", "comment": "comment"} - elements = [] - useColumns = [] - for i, ele in enumerate(arr): - if ele.lower() in header_dict.keys(): - elements.append(header_dict[ele.lower()]) - useColumns.append(i) - return elements, useColumns - - -def process_directory(in_dir): - """ - Processes data in directory, returns as Pandas dataframe - :param in_dir: Input data directory, String - :return: A Pandas dataframe containing fingerprint data, QCs, and images - """ - qc_file = "" - fingerprint_qc_arr = [] - spp_data_arr = [] - images = [] - metadata_files = [] - # Separate files into appropriate lists - for filename in os.listdir(in_dir): - # Append the file path - file_path = os.path.join(in_dir, filename) - if os.stat(file_path).st_size != 0: - if filename.lower().endswith('_metadata.txt'): # Find metadata - metadata_files.append(file_path) - elif filename.endswith('_QCmetrics.txt'): # If fingerprint QC file, add to array - fingerprint_qc_arr.append(file_path) - elif filename.lower() == 'qc.csv' or filename.lower() == 'qc.txt' \ - or filename.lower() == 'chip_seq_summary_iter0.tsv': # If lab-computed QC file, set var - qc_file = file_path - elif filename.endswith(".png") or filename.endswith(".pdf"): - images.append(file_path) - elif filename.endswith('.cross_corr.txt'): # If cross corr data, add to array - 
spp_data_arr.append(file_path) - - # Raise error if QC file was not found. - if not os.path.isfile(qc_file): - logging.error("QC file was not found in the data directory (i.e. qc.csv, qc.txt)") - raise IOError("QC file was not found in the data directory (i.e. qc.csv, qc.txt)") - - # Process QC file into a dataframe - try: - with open(qc_file, 'rb') as f: - # Find delimiter using Sniffer class - dialect = csv.Sniffer().sniff(f.readline(), ['\t', ',']) - reader = csv.reader(f, delimiter=dialect.delimiter) - f.seek(0) - column_names = standardize_header(next(reader)) - # Read data into Pandas dataframe - df = pd.read_csv(f, delimiter=dialect.delimiter, header=None, - names=column_names[0], usecols=column_names[1], engine='python') - # Catch if the filename is not an actual file, but a symlink - except IOError: - with open(os.readlink(qc_file), 'rb') as f: - # Find delimiter using Sniffer class - dialect = csv.Sniffer().sniff(f.readline(), ['\t', ',']) - reader = csv.reader(f, delimiter=dialect.delimiter) - f.seek(0) - column_names = standardize_header(next(reader)) - # Read data into Pandas dataframe - df = pd.read_csv(f, delimiter=dialect.delimiter, header=None, - names=column_names[0], usecols=column_names[1], engine='python') - - # Index the dataframe by sample - df.set_index('sample', inplace=True) - - # If there are fingerprint files, add to array - if fingerprint_qc_arr: - # Add fingerprint data to dataframe - fp_df = pd.DataFrame() - for filename in fingerprint_qc_arr: - if os.stat(filename).st_size != 0: - with open(filename, 'rb') as f: - reader = csv.reader(f, delimiter='\t') - header = [stringFormat(ele) for ele in next(reader)] - new_fp_df = pd.read_csv(f, delimiter='\t', header=None, - names=header, engine='python') - fp_df = fp_df.append(new_fp_df) - fp_df.drop_duplicates(subset='sample', keep='last', inplace=True) - fp_df.set_index('sample', inplace=True) - df = df.merge(fp_df, left_index=True, right_index=True, how='outer') - - # Add fingerprint images and metadata information - for sample in df.index.values: # Index is sample name - fp_image = '' - spp_image = '' - metadata_file = '' - for filename in images: - if filename.endswith('.png') and sample in filename: - fp_image = filename - elif filename.endswith('.pdf') and sample in filename: - spp_image = filename - for filename in metadata_files: - if sample in filename: - metadata_file = filename - if fp_image: - df.set_value(sample, 'fp_image', read_file_base64(fp_image)) - if spp_image: - df.set_value(sample, 'spp_image', read_file_base64(spp_image)) - if metadata_file: - # Read in all metadata attributes into df - for key, value in read_metadata(metadata_file).iteritems(): - df.set_value(sample, key, value) - # Set flowcell name to base directory - df.set_value(sample, 'flowcell', os.path.basename(in_dir)) - - return df - - -def main(): - parser = argparse.ArgumentParser('Generates QC metric summary file for available ChIP-seq samples') - parser.add_argument('-i', '--in_dirs', required=True, nargs='+', - help='Directory(ies)for fingerprint data') - parser.add_argument('-u', '--uri', required=True, - help='URI for database upload') - parser.add_argument('-d', '--database', required=True, - help='Database name for upload') - parser.add_argument('-c', '--collection', required=True, - help='Collection name for database') - parser.add_argument('-o', '--output', required=True, help="Filename for output log") - args = parser.parse_args() - - logging.basicConfig(filename=args.output, level=logging.DEBUG) - - - # Process each 
given data directory - df = pd.DataFrame() - for i in range(len(args.in_dirs)): - if os.path.isdir(args.in_dirs[i]): - new_df = process_directory(args.in_dirs[i]) - df = df.append(new_df) - factor_names = [row.split('.')[0] for row in df.index.values] - df.rename(columns={'diff._enrichment':'diff_enrichment'}, inplace=True) - - # Convert Pandas dataframe into list of dictionaries - data = df.to_dict(orient='index') - - # Insert documents (list of dicts) to web-application database - uri = args.uri - client = MongoClient(uri) - sample_coll = client[args.database][args.collection] - flowcell_coll = client[args.database]["flowcell"] - - # Initialize a flowcell data - flowcell_name = "" - flowcell_data = {"samples": []} - - # For each sample, replace if it exists, otherwise insert (upsert) - for sample_name in data: - # Set sample data - sample = data[sample_name] - sample['sample'] = sample_name - sample['last_modified'] = datetime.datetime.utcnow() - logging.info("Uploading sample: %s" % sample_name) - sample_coll.replace_one({'sample': sample_name}, sample, upsert=True) - - # Set flowcell data - flowcell_name = sample['flowcell'] - flowcell_data['name'] = flowcell_name - flowcell_data['date'] = sample['timestamp'] - flowcell_data['samples'].append(sample_name) - - # Upsert the flowcell - logging.info("Uploading flowcell: %s" % flowcell_data) - flowcell_coll.replace_one({'name': flowcell_name}, flowcell_data, upsert=True) - - logging.info("Data upload terminated successfully") - - - return - - -if __name__ == '__main__': - main() diff --git a/data_upload.py b/data_upload.py new file mode 120000 index 0000000..9aae8f7 --- /dev/null +++ b/data_upload.py @@ -0,0 +1 @@ +chipdb_upload/data_upload.py \ No newline at end of file diff --git a/ggr-cwl-ipynb-gen.py b/ggr-cwl-ipynb-gen.py deleted file mode 100644 index 18ce00b..0000000 --- a/ggr-cwl-ipynb-gen.py +++ /dev/null @@ -1,653 +0,0 @@ -import argparse -import nbformat -import nbformat.v3 as nbf -import sys -import os -import pandas as pd -from jinja2 import FileSystemLoader -from xlrd import XLRDError -import ruamel.yaml -import consts -import jinja2 -import inspect -import numpy as np - -encoding = sys.getfilesystemencoding() -EXEC_DIR = os.path.dirname(str(__file__)) - - -def render(tpl_path, context): - path, filename = os.path.split(tpl_path) - return jinja2.Environment( - loader=FileSystemLoader(os.path.join(EXEC_DIR, "templates")) - ).get_template(filename).render(context) - - -class Cell(object): - def __init__(self, contents, description=None): - self.contents = contents - self.description = description - if type(self.description) is not list: - self.description = [self.description] - self.header = [] - # self.header_inputs = [] - # self.header_outputs = [] - - def writefile_to(self, dest): - self.header = ["%%%%writefile %s" % dest] - - def to_list(self): - cells = [] - if self.description: - cells.append(nbf.new_text_cell('markdown', source=self.description)) - if self.contents: - cells.append(nbf.new_code_cell(input=self.header + self.contents)) - return cells - - -class CellSbatch(Cell): - def __init__(self, script_output=None, depends_on=False, mem=None, - cpus=None, partition=None, wrap_command=None, array=None, - prolog=list(), **kwargs): - super(CellSbatch, self).__init__(**kwargs) - - content_prolog = ['sbatch'] - if script_output: - content_prolog.extend(['-o', script_output, '\\\n']) - if partition: - content_prolog.extend(['-p', partition, '\\\n']) - if mem: - content_prolog.extend(['--mem', str(mem), '\\\n']) - if cpus: - 
content_prolog.extend(['-c', str(cpus), '\\\n']) - if depends_on: - content_prolog.extend(['--depend', 'afterok:$1', '\\\n']) - if array is not None: - content_prolog.extend(['--array', array, '\\\n']) - if wrap_command: - content_prolog.append('--wrap="%s' % wrap_command) - self.contents.append('"') - self.contents = content_prolog + self.contents - self.contents = prolog + [' '.join(self.contents)] - - self.header = ["%%script"] - self.header.append('--out blocking_job_str') - self.header.append("bash") - - if depends_on: - self.header.append('-s "$blocking_job"') - self.header = [' '.join(self.header)] - - def to_list(self): - cells = super(CellSbatch, self).to_list() - - # We need to add an extra code cell to compute the SLURM job id - extra_cell = Cell( - contents=["import re", "blocking_job = re.match('Submitted batch job (\d+).*', blocking_job_str).group(1)"], - description="Extract blocking job id" - ) - cells.extend(extra_cell.to_list()) - return cells - - -def save_metadata(samples_df, conf_args, lib_type): - cells = [] - cell_mkdir = Cell(contents=["%%bash", - "mkdir -p %s/data/%s/metadata" % (conf_args['root_dir'], lib_type), - "mkdir -p %s/data/%s/raw_reads" % (conf_args['root_dir'], lib_type), - "mkdir -p %s/data/%s/processed_raw_reads" % (conf_args['root_dir'], lib_type), - "mkdir -p %s/processing/%s/scripts" % (conf_args['root_dir'], lib_type), - "mkdir -p %s/processing/%s/jsons" % (conf_args['root_dir'], lib_type), - "mkdir -p %s/processing/%s/logs" % (conf_args['root_dir'], lib_type) - ], - description=["# %s - %s" % (conf_args['project_name'], lib_type), - consts.notebook_blurb, - "#### Create necessary folder(s)"]) - cells.extend(cell_mkdir.to_list()) - - - outfile = "%s/data/%s/metadata/%s_download_metadata.%s.txt" % \ - (conf_args['root_dir'], lib_type, lib_type, - conf_args['project_name']) - contents = ["%%%%writefile %s" % - outfile, samples_df.to_csv(index=False, - sep=conf_args['sep'], - encoding='utf-8', - header=[x.capitalize() for x in samples_df.columns.values])] - cell = Cell(contents=contents, description="Save metadata file") - cells.extend(cell.to_list()) - - return cells, outfile - - -def download_fastq_files(conf_args, lib_type, metadata_fn=None): - cells = [] - - download_fn = "%s/processing/%s/scripts/download_%s.sh" % (conf_args['root_dir'], lib_type, - conf_args['project_name']) - context = { - 'output_fn': download_fn, - 'project_name': conf_args['project_name'], - 'metadata_filename': metadata_fn, - 'root_dir': conf_args['root_dir'], - 'user': conf_args['user'], - 'lib_type': lib_type, - 'data_source': conf_args['data_from'], - 'consts': consts - } - contents = [render('templates/download_fastq_files.j2', context)] - - cell_write_dw_file = Cell(contents=contents, - description=["#### Download FASTQ from %s" % conf_args['data_from'], - "Create file to download FASTQ files"]) - cells.extend(cell_write_dw_file.to_list()) - - logs_dir = "%s/processing/%s/logs" % (conf_args['root_dir'], lib_type) - execute_cell = CellSbatch(contents=list(), - partition=",".join(consts.slurm_partitions), - wrap_command="ssh %s@%s 'sh %s'" % (conf_args['user'], - consts.HOST_FOR_TUNNELED_DOWNLOAD, - download_fn), - description="Execute file to download files", - script_output="%s/%s_%s.out" % (logs_dir, conf_args['project_name'], - inspect.stack()[0][3])) - cells.extend(execute_cell.to_list()) - - return cells - - -def ungzip_fastq_files(conf_args, lib_type, metadata_filename=None, num_samples=None): - cells = [] - ungzip_fn = "%s/processing/%s/scripts/ungzip_%s.sh" % 
(conf_args['root_dir'], lib_type, conf_args['project_name']) - context = { - 'output_fn' : ungzip_fn, - 'metadata_filename': metadata_filename, - 'project_name': conf_args['project_name'], - 'root_dir': conf_args['root_dir'], - 'lib_type': lib_type, - 'num_samples': num_samples - } - contents = [render('templates/ungzip_fastq_files.j2', context)] - - cell_write_dw_file = Cell(contents=contents, description="#### Ungzip FASTQ files") - cells.extend(cell_write_dw_file.to_list()) - - logs_dir = "%s/processing/%s/logs" % (conf_args['root_dir'], lib_type) - execute_cell = CellSbatch(contents=[ungzip_fn], - description="Execute file to ungzip FASTQ files", - depends_on=True, - partition=",".join(consts.slurm_partitions), - array="0-%d%%20" % (num_samples - 1), - script_output="%s/%s_%s_%%a.out" % (logs_dir, conf_args['project_name'], - inspect.stack()[0][3])) - cells.extend(execute_cell.to_list()) - - return cells - - -def merge_fastq_files(conf_args, lib_type, metadata_filename=None, num_samples=None): - cells = [] - merge_fn = "%s/processing/%s/scripts/merge_lanes_%s.sh" % (conf_args['root_dir'], lib_type, conf_args['project_name']) - context = { - 'output_fn' : merge_fn, - 'metadata_filename': metadata_filename, - 'project_name': conf_args['project_name'], - 'root_dir': conf_args['root_dir'], - 'lib_type': lib_type, - 'num_samples': num_samples - } - contents = [render('templates/merge_lanes_fastq.j2', context)] - - cell_write_dw_file = Cell(contents=contents, description="#### Merge lanes of FASTQ files") - cells.extend(cell_write_dw_file.to_list()) - - logs_dir = "%s/processing/%s/logs" % (conf_args['root_dir'], lib_type) - execute_cell = CellSbatch(contents=[merge_fn], - description="Execute file to merge lanes of FASTQ files", - depends_on=True, - array="0-%d%%20" % (num_samples-1), - partition=",".join(consts.slurm_partitions), - script_output="%s/%s_%s_%%a.out" % (logs_dir, conf_args['project_name'], - inspect.stack()[0][3]),) - cells.extend(execute_cell.to_list()) - - return cells - - -def cwl_json_gen(conf_args, lib_type, metadata_filename): - func_name = inspect.stack()[0][3] - cells = [] - output_fn = "%s/processing/%s/scripts/%s_%s.sh" % (conf_args['root_dir'], - lib_type, - func_name, - conf_args['project_name']) - context = { - 'output_fn' : output_fn, - 'metadata_filename': metadata_filename, - 'project_name': conf_args['project_name'], - 'root_dir': conf_args['root_dir'], - 'lib_type': lib_type, - 'star_genome': consts.star_genome, - 'mem': consts.mem[lib_type.lower()], - 'nthreads': consts.nthreads[lib_type.lower()], - 'separate_jsons': consts.separate_jsons - } - contents = [render('templates/%s.j2' % func_name, context)] - - cell_write_dw_file = Cell(contents=contents, description="#### Create JSON files for CWL pipeline files") - cells.extend(cell_write_dw_file.to_list()) - - logs_dir = "%s/processing/%s/logs" % (conf_args['root_dir'], lib_type) - execute_cell = CellSbatch(contents=[output_fn], - description="Execute file to create JSON files", - depends_on=True, - partition=",".join(consts.slurm_partitions), - prolog=["source %s %s" % (consts.conda_activate, - consts.conda_environment)], - script_output="%s/%s_%s.out" % (logs_dir, - conf_args['project_name'], - inspect.stack()[0][3])) - cells.extend(execute_cell.to_list()) - return cells - - -def cwl_slurm_array_gen(conf_args, lib_type, metadata_filename, pipeline_type, n_samples): - func_name = inspect.stack()[0][3] - cells = [] - output_fn = "%s/processing/%s/scripts/%s-%s.sh" % (conf_args['root_dir'], - lib_type, - 
conf_args['project_name'], - pipeline_type) - metadata_basename = os.path.splitext(os.path.basename(metadata_filename))[0] - context = { - 'output_fn' : output_fn, - 'metadata_basename': metadata_basename, - 'project_name': conf_args['project_name'], - 'root_dir': conf_args['root_dir'], - 'user_duke_email': conf_args['user_duke_email'], - 'lib_type': lib_type, - 'mem': consts.mem[lib_type.lower()], - 'nthreads': consts.nthreads[lib_type.lower()], - 'pipeline_type': pipeline_type, - 'consts': consts - } - contents = [render('templates/%s.j2' % func_name, context)] - - cell_write_dw_file = Cell(contents=contents, description="#### Create SLURM array master bash file for %s samples" % pipeline_type) - cells.extend(cell_write_dw_file.to_list()) - - execute_cell = CellSbatch(contents=[output_fn], - description="Execute SLURM array master file", - depends_on=True, - array="0-%d%%20" % (n_samples - 1), - prolog=["source %s %s" % (consts.conda_activate, - consts.conda_environment)], - partition=",".join(consts.slurm_partitions)) - cells.extend(execute_cell.to_list()) - - return cells - - -def generate_qc_cell(conf_args, lib_type, pipeline_type): - func_name = inspect.stack()[0][3] - cells = [] - - # Python program has no 'se' or 'pe' abbreviation - end_type = pipeline_type.split("-")[0] - if end_type == "se": - end_type = "single_end" - elif end_type == "pe": - end_type = "paired_end" - else: - return CellSbatch(contents=[""]) - - - output_fn = '%s/processing/%s/scripts/%s_%s-%s.sh' % (conf_args["root_dir"], - lib_type, - func_name, - conf_args["project_name"], - pipeline_type) - qc_type = lib_type.replace("_", "") - context = { - 'output_fn': output_fn, - "conda_activate": consts.conda_activate, - 'root_dir': conf_args["root_dir"], - "library_type": lib_type, - "project_name": conf_args["project_name"], - "pipeline_type": pipeline_type, - "qc_script_dir": consts.qc_script_dir, - "qc_type": qc_type, - "end_type": end_type - } - contents = [render('templates/%s.j2' % func_name, context)] - - cell_write_dw_file = Cell(contents=contents, description="#### Create QC generating script") - cells.extend(cell_write_dw_file.to_list()) - - execute_cell = CellSbatch(contents=[output_fn], - depends_on=True, - partition=",".join(consts.slurm_partitions), - description="Generate QCs for %s-%s" % (conf_args["project_name"], pipeline_type)) - - cells.extend(execute_cell.to_list()) - - return cells - - -def generate_plots(conf_args, metadata_file, lib_type, pipeline_type, n_samples): - """ - Generates cell for creating fingerprint data - :param conf_args: Dictionary containing data about directories, project name, etc. 
- :param metadata_file: File path to metadata - :param lib_type: Type of assay (RNA, ChIP, ATAC) - :param pipeline_type: Type of sequencing pipeline (end, control) - :return: - """ - func_name = inspect.stack()[0][3] - cells = [] - # Current iteration of web-application only accepts ChIP samples - if lib_type != "chip_seq": - return [] - - input_directory = "{}/processing/{}/{}-{}".format(conf_args['root_dir'], - lib_type, - conf_args['project_name'], - pipeline_type) - output_directory = input_directory - - output_fn = '%s/processing/%s/scripts/generate_plot.%s-%s.sh' % (conf_args["root_dir"], - lib_type, - conf_args["project_name"], - pipeline_type) - - context = { - 'output_fn': output_fn, - 'env_activate': consts.conda_activate, - 'root_dir': conf_args['root_dir'], - 'lib_type': lib_type, - 'project_name': conf_args['project_name'], - 'pipeline_type': pipeline_type, - 'metadata_file': metadata_file, - 'input_dir': input_directory, - 'output_dir': output_directory - } - contents = [render('templates/%s.j2' % func_name, context)] - cell_write_dw_file = Cell(contents=contents, description="#### Create plot generating script") - cells.extend(cell_write_dw_file.to_list()) - - - execute_cell = CellSbatch(contents=[output_fn], - depends_on=True, - array="0-%d%%5" % (n_samples - 1), - prolog=["source %s %s" % (consts.conda_activate, consts.conda_environment)], - partition=",".join(consts.slurm_partitions), - description="Generate plots and data for website") - cells.extend(execute_cell.to_list()) - - return cells - - -def data_upload(conf_args, lib_type, pipeline_type): - """ - Function for generating a cell that uploads notebook generated data - to database. Can be avoided with usage of tag "-n". - """ - func_name = inspect.stack()[0][3] - cells = [] - - # Only upload data to web-app if it is ChIP-seq - if lib_type != "chip_seq" or not conf_args["upload"]: - return [] - - output_fn = '%s/processing/%s/scripts/%s_%s-%s.sh' % (conf_args["root_dir"], - lib_type, - func_name, - conf_args["project_name"], - pipeline_type) - - script_dir = os.path.dirname(os.path.realpath(__file__)) - data_dir = "{}/processing/chip_seq/{}-{}".format(conf_args['root_dir'], - conf_args['project_name'], pipeline_type) - - context = { - 'output_fn': output_fn, - 'root_dir': conf_args['root_dir'], - 'pipeline_type': pipeline_type, - 'library_type': lib_type, - 'project_name': conf_args['project_name'], - 'script_dir': script_dir, - 'conda_activate': consts.conda_activate, - 'data_dir': data_dir, - 'uri': conf_args['uri'] if 'uri' in conf_args else None, - 'database': conf_args['database'] if 'database' in conf_args else None, - 'collection': conf_args['collection'] if 'collection' in conf_args else None - } - - contents = [render('templates/%s.j2' % func_name, context)] - cell_write_dw_file = Cell(contents=contents, description="#### Create data upload script") - cells.extend(cell_write_dw_file.to_list()) - - execute_cell = CellSbatch(contents=[output_fn], - depends_on=True, - prolog=["source %s alex" % consts.conda_activate], - partition=",".join(consts.slurm_partitions), - description="### Upload ChIP-seq to web-application") - cells.extend(execute_cell.to_list()) - - return cells - - -def get_pipeline_types(samples_df): - lib_type = samples_df['library type'].iloc[0].lower().replace('-', '_') - if lib_type == consts.library_type_chip_seq: - for seq_end in consts.seq_ends: - for with_control in consts.with_controls: - samples_filter = samples_df['paired-end or single-end'].str.lower() == seq_end - if with_control: - 
samples_filter = samples_filter & (~samples_df['control'].isnull()) - pipeline_type = '-'.join([seq_end, with_control]) - else: - samples_filter = samples_filter & (samples_df['control'].isnull()) - pipeline_type = '-'.join([seq_end]) - yield pipeline_type, np.sum(samples_filter) - if lib_type == consts.library_type_rna_seq: - for seq_end in consts.seq_ends: - for strandness in consts.strandnesses: - samples_filter = \ - (samples_df['paired-end or single-end'].str.lower() == seq_end) \ - & (samples_df['strand specificity'].str.lower() == strandness) - if consts.with_sjdb: - pipeline_type = '-'.join([seq_end, strandness, 'with-sjdb']) - else: - pipeline_type = '-'.join([seq_end, strandness]) - yield pipeline_type, np.sum(samples_filter) - if lib_type == consts.library_type_atac_seq: - for seq_end in consts.seq_ends: - for with_blacklist_removal in consts.blacklist_removal: - samples_filter = (samples_df['paired-end or single-end'].str.lower() == seq_end) - if with_blacklist_removal: - pipeline_type = '-'.join([seq_end, with_blacklist_removal]) - samples_filter = samples_filter & (~samples_df['blacklist removal'].isnull()) - else: - pipeline_type = '-'.join([seq_end]) - samples_filter = samples_filter & (samples_df['blacklist removal'].isnull()) - yield pipeline_type, np.sum(samples_filter) - if lib_type == consts.library_type_starr_seq: - for seq_end in consts.seq_ends: - for with_umis in consts.with_umis: - samples_filter = (samples_df['paired-end or single-end'].str.lower() == seq_end) - if with_umis: - pipeline_type = '-'.join([seq_end, with_umis]) - if 'umis' in samples_df.columns: - samples_filter = samples_filter & samples_df['umis'] - else: - pipeline_type = '-'.join([seq_end]) - if 'umis' in samples_df.columns: - samples_filter = samples_filter & ~samples_df['umis'] - yield pipeline_type, np.sum(samples_filter) - - -def data_acquisition_cells(conf_args, lib_type, metadata_file, nsamples): - cells = [] - if conf_args['data_from'] != consts.DATA_SOURCES_LOCAL: - cells.extend(download_fastq_files(conf_args, - lib_type, - metadata_fn=metadata_file)) - cells.extend(merge_fastq_files(conf_args, - lib_type, - metadata_filename=metadata_file, - num_samples=nsamples)) - else: - download_fn = "%s/data/%s/processed_raw_reads/%s" % ( - conf_args['root_dir'], lib_type, - conf_args['project_name']) - warning_cell = Cell(contents=None, - description=["### FASTQ files already available locally!!", - "Please, make sure the FASTQ files are correctly named, decompressed and located/symlinked in:", - "", "**", download_fn, "**"]) - cells.extend(warning_cell.to_list()) - - return cells - - -def create_cells(samples_df, conf_args=None): - """ - Master function to write all code and text for the notebook. 
- - Conceptually, there are a number of things that have to happen: - - save metadata txt file - - download FASTQ.gz files from sequencing core - - uncompress FASTQ.gz files - - rename and move FASTQ files - - create JSONs files for cwltool - - execute cwltool master file - """ - lib_type = samples_df.iloc[0]['library type'].lower().replace('-', '_') - num_samples = samples_df.shape[0] - cells = [] - - cc, metadata_file = save_metadata(samples_df, conf_args, lib_type) - cells.extend(cc) - - cells.extend(data_acquisition_cells(conf_args, lib_type, metadata_file, num_samples)) - cells.extend(cwl_json_gen(conf_args, lib_type, metadata_filename=metadata_file)) - for pipeline_type, n in get_pipeline_types(samples_df): - if n > 0: - cells.extend(cwl_slurm_array_gen(conf_args, lib_type, metadata_filename=metadata_file, - pipeline_type=pipeline_type, n_samples=n)) - cells.extend(generate_qc_cell(conf_args, lib_type, pipeline_type=pipeline_type)) - cells.extend(generate_plots(conf_args, metadata_file=metadata_file, - lib_type=lib_type, pipeline_type=pipeline_type, n_samples=n)) - cells.extend(data_upload(conf_args, lib_type, pipeline_type)) - - return cells - - -def make_notebook(outfile, metadata, conf_args=None): - """Create notebook with parsed contents from metadata""" - nb = nbf.new_notebook() - - cells = [] - # Create a notebook by Library type existing in the metadata file - for samples_df in get_samples_by_library_type(metadata, conf_args['sep']): - cells.extend(create_cells(samples_df, conf_args=conf_args)) - - nb['worksheets'].append(nbf.new_worksheet(cells=cells)) - - with open(outfile, 'w') as _: - nbformat.write(nb, _) - - -def get_samples_by_library_type(metadata_file, sep='\t'): - """ - Parse a metadata file (either a spreadsheet or a tab-delimited file. 
- - :return: generator of panda's dataframe - """ - try: - md = pd.read_excel(metadata_file.name, - true_values=['Yes', 'Y', 'yes', 'y', 1], - false_values=['No', 'N', 'no', 'n', 0]) - except XLRDError: - print (XLRDError) - md = pd.read_csv(metadata_file.name, - true_values=['Yes', 'Y', 'yes', 'y', 1], - false_values=['No', 'N', 'no', 'n', 0], sep=sep) - - md.columns = [x.lower() for x in md.columns] - named_cols = [c for c in md.columns if not c.startswith('unnamed: ')] - lib_types_found = set(md['library type'][~pd.isnull(md['library type'])]) - - for lt in lib_types_found: - yield md.loc[md['library type'] == lt, named_cols] - - -def init_conf_args(args, - required_args = ['root_dir'], - optional_args = ['user', 'sep', 'user_duke_email', 'project_name']): - conf_args = {} - if args['conf_file']: - conf_args = ruamel.yaml.load(args['conf_file'], Loader=ruamel.yaml.Loader) - for r in required_args: - conf_args[r] = args[r] if (r in args and args[r]) else conf_args[r] - try: - assert conf_args[r] is not None - except AssertionError as e: - print("[ERROR]", r, "not defined") - raise - for o in optional_args: - conf_args[o] = args[o] if (o in args and args[o]) else (conf_args[o] if o in conf_args else None) - conf_args['user'] = conf_args['user'] or os.environ['USER'] - conf_args['user_duke_email'] = conf_args['user_duke_email'] or "%s@duke.edu" % conf_args['user'] - conf_args['project_name'] = conf_args['project_name'] or os.path.splitext(os.path.basename(args['metadata'].name))[0] - - return conf_args - -def main(): - parser = argparse.ArgumentParser('Generator of Jupyter notebooks to execute CWL pre-processing pipelines') - parser.add_argument('-o', '--out', required=True, type=str, help='Jupyter notebook output file name') - parser.add_argument('-m', '--metadata', required=True, type=argparse.FileType('r'), - help='Metadata file with samples information') - parser.add_argument('-f', '--force', action='store_true', help='Force to overwrite output file') - parser.add_argument('-n', '--no-upload', action='store_false', - help='Avoids uploading generated data to database when specified') - parser.add_argument('--metadata-sep', dest='sep', required=False, type=str, default='\t', - help='Separator for metadata file (when different than Excel spread sheet)') - parser.add_argument('--project-name', required=False, type=str, - help='Project name (by default, basename of metadata file name)') - parser.add_argument('--data-from', required=False, choices=consts.data_sources, - default=consts.data_sources[0], - help='Choices: %s' % (', '.join(consts.data_sources))) - parser.add_argument('-c', '--conf-file', required=False, type=argparse.FileType('r'), help='YAML configuration file (see examples)') - parser.add_argument('-u', '--user', required=False, - help='HARDAC User used in SLURM (default: ${USER})') - parser.add_argument('-e', '--user-duke-email', required=False, - help='Email(s) notified when execution is finished (default: ${USER}@duke.edu)') - parser.add_argument('-r', '--root-dir', required=False, - help='Root directory where all subfolders and files will be created ' - '(semi-required: either defined here or in conf-file') - - args = parser.parse_args() - - conf_args = init_conf_args(vars(args)) - - outfile = "%s.ipynb" % conf_args['project_name'] - - if os.path.isdir(args.out): - outfile = os.path.join(args.out, outfile) - else: - outfile = args.out - - if os.path.isfile(outfile) and not args.force: - print(outfile, "is an existing file. 
Please use -f or --force to overwrite the contents") - sys.exit(1) - - conf_args['upload'] = args.no_upload - conf_args['data_from'] = args.data_from - make_notebook(outfile, - args.metadata, - conf_args=conf_args) - - -if __name__ == '__main__': - main() diff --git a/ggr-cwl-ipynb-gen.py b/ggr-cwl-ipynb-gen.py new file mode 120000 index 0000000..e27d141 --- /dev/null +++ b/ggr-cwl-ipynb-gen.py @@ -0,0 +1 @@ +ggr_cwl_ipynb_gen/ggr-cwl-ipynb-gen.py \ No newline at end of file diff --git a/ggr_cwl_ipynb_gen/__init__.py b/ggr_cwl_ipynb_gen/__init__.py new file mode 100644 index 0000000..a75ddec --- /dev/null +++ b/ggr_cwl_ipynb_gen/__init__.py @@ -0,0 +1,2 @@ +if __name__ == '__main__': + ggr-main() diff --git a/consts.py b/ggr_cwl_ipynb_gen/consts.py similarity index 97% rename from consts.py rename to ggr_cwl_ipynb_gen/consts.py index 32518da..03fc3a7 100644 --- a/consts.py +++ b/ggr_cwl_ipynb_gen/consts.py @@ -53,3 +53,6 @@ qc_script_dir = '/data/reddylab/software/cwl/bin' data_upload_script = '/data/reddylab/Darryl/GitHub/reddylab/csv_to_mongo.py' HOST_FOR_TUNNELED_DOWNLOAD = "Hardac-xfer.genome.duke.edu" + +# Package constants +PACKAGE_NAME = "ggr_cwl_ipynb_gen" diff --git a/ggr_cwl_ipynb_gen/ggr-cwl-ipynb-gen.py b/ggr_cwl_ipynb_gen/ggr-cwl-ipynb-gen.py new file mode 100644 index 0000000..1cc9fde --- /dev/null +++ b/ggr_cwl_ipynb_gen/ggr-cwl-ipynb-gen.py @@ -0,0 +1,661 @@ +#!/usr/bin/env python +import argparse +import nbformat +import nbformat.v3 as nbf +import sys +import os +import pandas as pd +from jinja2 import FileSystemLoader, PackageLoader +from xlrd import XLRDError +import ruamel.yaml +import ggr_cwl_ipynb_gen.consts as consts +import jinja2 +import inspect +from jinja2.exceptions import TemplateNotFound +import numpy as np + +encoding = sys.getfilesystemencoding() +EXEC_DIR = os.path.dirname(str(__file__)) + + +def render(tpl_path, context): + path, filename = os.path.split(tpl_path) + try: + jinja_rendered = jinja2.Environment( + loader=FileSystemLoader(os.path.join(EXEC_DIR, "templates")) + ).get_template(filename).render(context) + except TemplateNotFound: + jinja_rendered = jinja2.Environment( + loader=PackageLoader(consts.PACKAGE_NAME, "templates") + ).get_template(filename).render(context) + return jinja_rendered + + +class Cell(object): + def __init__(self, contents, description=None): + self.contents = contents + self.description = description + if type(self.description) is not list: + self.description = [self.description] + self.header = [] + # self.header_inputs = [] + # self.header_outputs = [] + + def writefile_to(self, dest): + self.header = ["%%%%writefile %s" % dest] + + def to_list(self): + cells = [] + if self.description: + cells.append(nbf.new_text_cell('markdown', source=self.description)) + if self.contents: + cells.append(nbf.new_code_cell(input=self.header + self.contents)) + return cells + + +class CellSbatch(Cell): + def __init__(self, script_output=None, depends_on=False, mem=None, + cpus=None, partition=None, wrap_command=None, array=None, + prolog=list(), **kwargs): + super(CellSbatch, self).__init__(**kwargs) + + content_prolog = ['sbatch'] + if script_output: + content_prolog.extend(['-o', script_output, '\\\n']) + if partition: + content_prolog.extend(['-p', partition, '\\\n']) + if mem: + content_prolog.extend(['--mem', str(mem), '\\\n']) + if cpus: + content_prolog.extend(['-c', str(cpus), '\\\n']) + if depends_on: + content_prolog.extend(['--depend', 'afterok:$1', '\\\n']) + if array is not None: + content_prolog.extend(['--array', array, 
'\\\n']) + if wrap_command: + content_prolog.append('--wrap="%s' % wrap_command) + self.contents.append('"') + self.contents = content_prolog + self.contents + self.contents = prolog + [' '.join(self.contents)] + + self.header = ["%%script"] + self.header.append('--out blocking_job_str') + self.header.append("bash") + + if depends_on: + self.header.append('-s "$blocking_job"') + self.header = [' '.join(self.header)] + + def to_list(self): + cells = super(CellSbatch, self).to_list() + + # We need to add an extra code cell to compute the SLURM job id + extra_cell = Cell( + contents=["import re", "blocking_job = re.match('Submitted batch job (\d+).*', blocking_job_str).group(1)"], + description="Extract blocking job id" + ) + cells.extend(extra_cell.to_list()) + return cells + + +def save_metadata(samples_df, conf_args, lib_type): + cells = [] + cell_mkdir = Cell(contents=["%%bash", + "mkdir -p %s/data/%s/metadata" % (conf_args['root_dir'], lib_type), + "mkdir -p %s/data/%s/raw_reads" % (conf_args['root_dir'], lib_type), + "mkdir -p %s/data/%s/processed_raw_reads" % (conf_args['root_dir'], lib_type), + "mkdir -p %s/processing/%s/scripts" % (conf_args['root_dir'], lib_type), + "mkdir -p %s/processing/%s/jsons" % (conf_args['root_dir'], lib_type), + "mkdir -p %s/processing/%s/logs" % (conf_args['root_dir'], lib_type) + ], + description=["# %s - %s" % (conf_args['project_name'], lib_type), + consts.notebook_blurb, + "#### Create necessary folder(s)"]) + cells.extend(cell_mkdir.to_list()) + + + outfile = "%s/data/%s/metadata/%s_download_metadata.%s.txt" % \ + (conf_args['root_dir'], lib_type, lib_type, + conf_args['project_name']) + contents = ["%%%%writefile %s" % + outfile, samples_df.to_csv(index=False, + sep=conf_args['sep'], + encoding='utf-8', + header=[x.capitalize() for x in samples_df.columns.values])] + cell = Cell(contents=contents, description="Save metadata file") + cells.extend(cell.to_list()) + + return cells, outfile + + +def download_fastq_files(conf_args, lib_type, metadata_fn=None): + cells = [] + + download_fn = "%s/processing/%s/scripts/download_%s.sh" % (conf_args['root_dir'], lib_type, + conf_args['project_name']) + context = { + 'output_fn': download_fn, + 'project_name': conf_args['project_name'], + 'metadata_filename': metadata_fn, + 'root_dir': conf_args['root_dir'], + 'user': conf_args['user'], + 'lib_type': lib_type, + 'data_source': conf_args['data_from'], + 'consts': consts + } + contents = [render('templates/download_fastq_files.j2', context)] + + cell_write_dw_file = Cell(contents=contents, + description=["#### Download FASTQ from %s" % conf_args['data_from'], + "Create file to download FASTQ files"]) + cells.extend(cell_write_dw_file.to_list()) + + logs_dir = "%s/processing/%s/logs" % (conf_args['root_dir'], lib_type) + execute_cell = CellSbatch(contents=list(), + partition=",".join(consts.slurm_partitions), + wrap_command="ssh %s@%s 'sh %s'" % (conf_args['user'], + consts.HOST_FOR_TUNNELED_DOWNLOAD, + download_fn), + description="Execute file to download files", + script_output="%s/%s_%s.out" % (logs_dir, conf_args['project_name'], + inspect.stack()[0][3])) + cells.extend(execute_cell.to_list()) + + return cells + + +def ungzip_fastq_files(conf_args, lib_type, metadata_filename=None, num_samples=None): + cells = [] + ungzip_fn = "%s/processing/%s/scripts/ungzip_%s.sh" % (conf_args['root_dir'], lib_type, conf_args['project_name']) + context = { + 'output_fn' : ungzip_fn, + 'metadata_filename': metadata_filename, + 'project_name': conf_args['project_name'], + 
'root_dir': conf_args['root_dir'], + 'lib_type': lib_type, + 'num_samples': num_samples + } + contents = [render('templates/ungzip_fastq_files.j2', context)] + + cell_write_dw_file = Cell(contents=contents, description="#### Ungzip FASTQ files") + cells.extend(cell_write_dw_file.to_list()) + + logs_dir = "%s/processing/%s/logs" % (conf_args['root_dir'], lib_type) + execute_cell = CellSbatch(contents=[ungzip_fn], + description="Execute file to ungzip FASTQ files", + depends_on=True, + partition=",".join(consts.slurm_partitions), + array="0-%d%%20" % (num_samples - 1), + script_output="%s/%s_%s_%%a.out" % (logs_dir, conf_args['project_name'], + inspect.stack()[0][3])) + cells.extend(execute_cell.to_list()) + + return cells + + +def merge_fastq_files(conf_args, lib_type, metadata_filename=None, num_samples=None): + cells = [] + merge_fn = "%s/processing/%s/scripts/merge_lanes_%s.sh" % (conf_args['root_dir'], lib_type, conf_args['project_name']) + context = { + 'output_fn' : merge_fn, + 'metadata_filename': metadata_filename, + 'project_name': conf_args['project_name'], + 'root_dir': conf_args['root_dir'], + 'lib_type': lib_type, + 'num_samples': num_samples + } + contents = [render('templates/merge_lanes_fastq.j2', context)] + + cell_write_dw_file = Cell(contents=contents, description="#### Merge lanes of FASTQ files") + cells.extend(cell_write_dw_file.to_list()) + + logs_dir = "%s/processing/%s/logs" % (conf_args['root_dir'], lib_type) + execute_cell = CellSbatch(contents=[merge_fn], + description="Execute file to merge lanes of FASTQ files", + depends_on=True, + array="0-%d%%20" % (num_samples-1), + partition=",".join(consts.slurm_partitions), + script_output="%s/%s_%s_%%a.out" % (logs_dir, conf_args['project_name'], + inspect.stack()[0][3]), ) + cells.extend(execute_cell.to_list()) + + return cells + + +def cwl_json_gen(conf_args, lib_type, metadata_filename): + func_name = inspect.stack()[0][3] + cells = [] + output_fn = "%s/processing/%s/scripts/%s_%s.sh" % (conf_args['root_dir'], + lib_type, + func_name, + conf_args['project_name']) + context = { + 'output_fn' : output_fn, + 'metadata_filename': metadata_filename, + 'project_name': conf_args['project_name'], + 'root_dir': conf_args['root_dir'], + 'lib_type': lib_type, + 'star_genome': consts.star_genome, + 'mem': consts.mem[lib_type.lower()], + 'nthreads': consts.nthreads[lib_type.lower()], + 'separate_jsons': consts.separate_jsons + } + contents = [render('templates/%s.j2' % func_name, context)] + + cell_write_dw_file = Cell(contents=contents, description="#### Create JSON files for CWL pipeline files") + cells.extend(cell_write_dw_file.to_list()) + + logs_dir = "%s/processing/%s/logs" % (conf_args['root_dir'], lib_type) + execute_cell = CellSbatch(contents=[output_fn], + description="Execute file to create JSON files", + depends_on=True, + partition=",".join(consts.slurm_partitions), + prolog=["source %s %s" % (consts.conda_activate, + consts.conda_environment)], + script_output="%s/%s_%s.out" % (logs_dir, + conf_args['project_name'], + inspect.stack()[0][3])) + cells.extend(execute_cell.to_list()) + return cells + + +def cwl_slurm_array_gen(conf_args, lib_type, metadata_filename, pipeline_type, n_samples): + func_name = inspect.stack()[0][3] + cells = [] + output_fn = "%s/processing/%s/scripts/%s-%s.sh" % (conf_args['root_dir'], + lib_type, + conf_args['project_name'], + pipeline_type) + metadata_basename = os.path.splitext(os.path.basename(metadata_filename))[0] + context = { + 'output_fn' : output_fn, + 'metadata_basename': 
metadata_basename, + 'project_name': conf_args['project_name'], + 'root_dir': conf_args['root_dir'], + 'user_duke_email': conf_args['user_duke_email'], + 'lib_type': lib_type, + 'mem': consts.mem[lib_type.lower()], + 'nthreads': consts.nthreads[lib_type.lower()], + 'pipeline_type': pipeline_type, + 'consts': consts + } + contents = [render('templates/%s.j2' % func_name, context)] + + cell_write_dw_file = Cell(contents=contents, description="#### Create SLURM array master bash file for %s samples" % pipeline_type) + cells.extend(cell_write_dw_file.to_list()) + + execute_cell = CellSbatch(contents=[output_fn], + description="Execute SLURM array master file", + depends_on=True, + array="0-%d%%20" % (n_samples - 1), + prolog=["source %s %s" % (consts.conda_activate, + consts.conda_environment)], + partition=",".join(consts.slurm_partitions)) + cells.extend(execute_cell.to_list()) + + return cells + + +def generate_qc_cell(conf_args, lib_type, pipeline_type): + func_name = inspect.stack()[0][3] + cells = [] + + # Python program has no 'se' or 'pe' abbreviation + end_type = pipeline_type.split("-")[0] + if end_type == "se": + end_type = "single_end" + elif end_type == "pe": + end_type = "paired_end" + else: + return CellSbatch(contents=[""]) + + + output_fn = '%s/processing/%s/scripts/%s_%s-%s.sh' % (conf_args["root_dir"], + lib_type, + func_name, + conf_args["project_name"], + pipeline_type) + qc_type = lib_type.replace("_", "") + context = { + 'output_fn': output_fn, + "conda_activate": consts.conda_activate, + 'root_dir': conf_args["root_dir"], + "library_type": lib_type, + "project_name": conf_args["project_name"], + "pipeline_type": pipeline_type, + "qc_script_dir": consts.qc_script_dir, + "qc_type": qc_type, + "end_type": end_type + } + contents = [render('templates/%s.j2' % func_name, context)] + + cell_write_dw_file = Cell(contents=contents, description="#### Create QC generating script") + cells.extend(cell_write_dw_file.to_list()) + + execute_cell = CellSbatch(contents=[output_fn], + depends_on=True, + partition=",".join(consts.slurm_partitions), + description="Generate QCs for %s-%s" % (conf_args["project_name"], pipeline_type)) + + cells.extend(execute_cell.to_list()) + + return cells + + +def generate_plots(conf_args, metadata_file, lib_type, pipeline_type, n_samples): + """ + Generates cell for creating fingerprint data + :param conf_args: Dictionary containing data about directories, project name, etc. 
+ :param metadata_file: File path to metadata + :param lib_type: Type of assay (RNA, ChIP, ATAC) + :param pipeline_type: Type of sequencing pipeline (end, control) + :return: + """ + func_name = inspect.stack()[0][3] + cells = [] + # Current iteration of web-application only accepts ChIP samples + if lib_type != "chip_seq": + return [] + + input_directory = "{}/processing/{}/{}-{}".format(conf_args['root_dir'], + lib_type, + conf_args['project_name'], + pipeline_type) + output_directory = input_directory + + output_fn = '%s/processing/%s/scripts/generate_plot.%s-%s.sh' % (conf_args["root_dir"], + lib_type, + conf_args["project_name"], + pipeline_type) + + context = { + 'output_fn': output_fn, + 'env_activate': consts.conda_activate, + 'root_dir': conf_args['root_dir'], + 'lib_type': lib_type, + 'project_name': conf_args['project_name'], + 'pipeline_type': pipeline_type, + 'metadata_file': metadata_file, + 'input_dir': input_directory, + 'output_dir': output_directory + } + contents = [render('templates/%s.j2' % func_name, context)] + cell_write_dw_file = Cell(contents=contents, description="#### Create plot generating script") + cells.extend(cell_write_dw_file.to_list()) + + + execute_cell = CellSbatch(contents=[output_fn], + depends_on=True, + array="0-%d%%5" % (n_samples - 1), + prolog=["source %s %s" % (consts.conda_activate, consts.conda_environment)], + partition=",".join(consts.slurm_partitions), + description="Generate plots and data for website") + cells.extend(execute_cell.to_list()) + + return cells + + +def data_upload(conf_args, lib_type, pipeline_type): + """ + Function for generating a cell that uploads notebook generated data + to database. Can be avoided with usage of tag "-n". + """ + func_name = inspect.stack()[0][3] + cells = [] + + # Only upload data to web-app if it is ChIP-seq + if lib_type != "chip_seq" or not conf_args["upload"]: + return [] + + output_fn = '%s/processing/%s/scripts/%s_%s-%s.sh' % (conf_args["root_dir"], + lib_type, + func_name, + conf_args["project_name"], + pipeline_type) + + script_dir = os.path.dirname(os.path.realpath(__file__)) + data_dir = "{}/processing/chip_seq/{}-{}".format(conf_args['root_dir'], + conf_args['project_name'], pipeline_type) + + context = { + 'output_fn': output_fn, + 'root_dir': conf_args['root_dir'], + 'pipeline_type': pipeline_type, + 'library_type': lib_type, + 'project_name': conf_args['project_name'], + 'script_dir': script_dir, + 'conda_activate': consts.conda_activate, + 'data_dir': data_dir, + 'uri': conf_args['uri'] if 'uri' in conf_args else None, + 'database': conf_args['database'] if 'database' in conf_args else None, + 'collection': conf_args['collection'] if 'collection' in conf_args else None + } + + contents = [render('templates/%s.j2' % func_name, context)] + cell_write_dw_file = Cell(contents=contents, description="#### Create data upload script") + cells.extend(cell_write_dw_file.to_list()) + + execute_cell = CellSbatch(contents=[output_fn], + depends_on=True, + prolog=["source %s alex" % consts.conda_activate], + partition=",".join(consts.slurm_partitions), + description="### Upload ChIP-seq to web-application") + cells.extend(execute_cell.to_list()) + + return cells + + +def get_pipeline_types(samples_df): + lib_type = samples_df['library type'].iloc[0].lower().replace('-', '_') + if lib_type == consts.library_type_chip_seq: + for seq_end in consts.seq_ends: + for with_control in consts.with_controls: + samples_filter = samples_df['paired-end or single-end'].str.lower() == seq_end + if with_control: + 
samples_filter = samples_filter & (~samples_df['control'].isnull()) + pipeline_type = '-'.join([seq_end, with_control]) + else: + samples_filter = samples_filter & (samples_df['control'].isnull()) + pipeline_type = '-'.join([seq_end]) + yield pipeline_type, np.sum(samples_filter) + if lib_type == consts.library_type_rna_seq: + for seq_end in consts.seq_ends: + for strandness in consts.strandnesses: + samples_filter = \ + (samples_df['paired-end or single-end'].str.lower() == seq_end) \ + & (samples_df['strand specificity'].str.lower() == strandness) + if consts.with_sjdb: + pipeline_type = '-'.join([seq_end, strandness, 'with-sjdb']) + else: + pipeline_type = '-'.join([seq_end, strandness]) + yield pipeline_type, np.sum(samples_filter) + if lib_type == consts.library_type_atac_seq: + for seq_end in consts.seq_ends: + for with_blacklist_removal in consts.blacklist_removal: + samples_filter = (samples_df['paired-end or single-end'].str.lower() == seq_end) + if with_blacklist_removal: + pipeline_type = '-'.join([seq_end, with_blacklist_removal]) + samples_filter = samples_filter & (~samples_df['blacklist removal'].isnull()) + else: + pipeline_type = '-'.join([seq_end]) + samples_filter = samples_filter & (samples_df['blacklist removal'].isnull()) + yield pipeline_type, np.sum(samples_filter) + if lib_type == consts.library_type_starr_seq: + for seq_end in consts.seq_ends: + for with_umis in consts.with_umis: + samples_filter = (samples_df['paired-end or single-end'].str.lower() == seq_end) + if with_umis: + pipeline_type = '-'.join([seq_end, with_umis]) + if 'umis' in samples_df.columns: + samples_filter = samples_filter & samples_df['umis'] + else: + pipeline_type = '-'.join([seq_end]) + if 'umis' in samples_df.columns: + samples_filter = samples_filter & ~samples_df['umis'] + yield pipeline_type, np.sum(samples_filter) + + +def data_acquisition_cells(conf_args, lib_type, metadata_file, nsamples): + cells = [] + if conf_args['data_from'] != consts.DATA_SOURCES_LOCAL: + cells.extend(download_fastq_files(conf_args, + lib_type, + metadata_fn=metadata_file)) + cells.extend(merge_fastq_files(conf_args, + lib_type, + metadata_filename=metadata_file, + num_samples=nsamples)) + else: + download_fn = "%s/data/%s/processed_raw_reads/%s" % ( + conf_args['root_dir'], lib_type, + conf_args['project_name']) + warning_cell = Cell(contents=None, + description=["### FASTQ files already available locally!!", + "Please, make sure the FASTQ files are correctly named, decompressed and located/symlinked in:", + "", "**", download_fn, "**"]) + cells.extend(warning_cell.to_list()) + + return cells + + +def create_cells(samples_df, conf_args=None): + """ + Master function to write all code and text for the notebook. 
+
+    Conceptually, there are a number of things that have to happen:
+     - save metadata txt file
+     - download FASTQ.gz files from sequencing core
+     - uncompress FASTQ.gz files
+     - rename and move FASTQ files
+     - create JSON files for cwltool
+     - execute cwltool master file
+    """
+    lib_type = samples_df.iloc[0]['library type'].lower().replace('-', '_')
+    num_samples = samples_df.shape[0]
+    cells = []
+
+    cc, metadata_file = save_metadata(samples_df, conf_args, lib_type)
+    cells.extend(cc)
+
+    cells.extend(data_acquisition_cells(conf_args, lib_type, metadata_file, num_samples))
+    cells.extend(cwl_json_gen(conf_args, lib_type, metadata_filename=metadata_file))
+    for pipeline_type, n in get_pipeline_types(samples_df):
+        if n > 0:
+            cells.extend(cwl_slurm_array_gen(conf_args, lib_type, metadata_filename=metadata_file,
+                                             pipeline_type=pipeline_type, n_samples=n))
+            cells.extend(generate_qc_cell(conf_args, lib_type, pipeline_type=pipeline_type))
+            cells.extend(generate_plots(conf_args, metadata_file=metadata_file,
+                                        lib_type=lib_type, pipeline_type=pipeline_type, n_samples=n))
+            cells.extend(data_upload(conf_args, lib_type, pipeline_type))
+
+    return cells
+
+
+def make_notebook(outfile, metadata, conf_args=None):
+    """Create notebook with parsed contents from metadata"""
+    nb = nbf.new_notebook()
+
+    cells = []
+    # Create a notebook by library type existing in the metadata file
+    for samples_df in get_samples_by_library_type(metadata, conf_args['sep']):
+        cells.extend(create_cells(samples_df, conf_args=conf_args))
+
+    nb['worksheets'].append(nbf.new_worksheet(cells=cells))
+
+    with open(outfile, 'w') as _:
+        nbformat.write(nb, _)
+
+
+def get_samples_by_library_type(metadata_file, sep='\t'):
+    """
+    Parse a metadata file (either a spreadsheet or a tab-delimited file).
+
+    :return: generator of pandas DataFrames
+    """
+    try:
+        md = pd.read_excel(metadata_file.name,
+                           true_values=['Yes', 'Y', 'yes', 'y', 1],
+                           false_values=['No', 'N', 'no', 'n', 0])
+    except XLRDError as e:
+        print(e)
+        md = pd.read_csv(metadata_file.name,
+                         true_values=['Yes', 'Y', 'yes', 'y', 1],
+                         false_values=['No', 'N', 'no', 'n', 0], sep=sep)
+
+    md.columns = [x.lower() for x in md.columns]
+    named_cols = [c for c in md.columns if not c.startswith('unnamed: ')]
+    lib_types_found = set(md['library type'][~pd.isnull(md['library type'])])
+
+    for lt in lib_types_found:
+        yield md.loc[md['library type'] == lt, named_cols]
+
+
+def init_conf_args(args,
+                   required_args=['root_dir'],
+                   optional_args=['user', 'sep', 'user_duke_email', 'project_name']):
+    conf_args = {}
+    if args['conf_file']:
+        conf_args = ruamel.yaml.load(args['conf_file'], Loader=ruamel.yaml.Loader)
+    for r in required_args:
+        conf_args[r] = args[r] if (r in args and args[r]) else conf_args[r]
+        try:
+            assert conf_args[r] is not None
+        except AssertionError:
+            print("[ERROR]", r, "not defined")
+            raise
+    for o in optional_args:
+        conf_args[o] = args[o] if (o in args and args[o]) else (conf_args[o] if o in conf_args else None)
+    conf_args['user'] = conf_args['user'] or os.environ['USER']
+    conf_args['user_duke_email'] = conf_args['user_duke_email'] or "%s@duke.edu" % conf_args['user']
+    conf_args['project_name'] = conf_args['project_name'] or os.path.splitext(os.path.basename(args['metadata'].name))[0]
+
+    return conf_args
+
+def main():
+    parser = argparse.ArgumentParser('Generator of Jupyter notebooks to execute CWL pre-processing pipelines')
+    parser.add_argument('-o', '--out', required=True, type=str, help='Jupyter notebook output file name')
+    parser.add_argument('-m', '--metadata', required=True, type=argparse.FileType('r'),
+                        help='Metadata file with samples information')
+    parser.add_argument('-f', '--force', action='store_true', help='Force to overwrite output file')
+    parser.add_argument('-n', '--no-upload', action='store_false',
+                        help='Avoids uploading generated data to database when specified')
+    parser.add_argument('--metadata-sep', dest='sep', required=False, type=str, default='\t',
+                        help='Separator for metadata file (when different from an Excel spreadsheet)')
+    parser.add_argument('--project-name', required=False, type=str,
+                        help='Project name (by default, basename of metadata file name)')
+    parser.add_argument('--data-from', required=False, choices=consts.data_sources,
+                        default=consts.data_sources[0],
+                        help='Choices: %s' % (', '.join(consts.data_sources)))
+    parser.add_argument('-c', '--conf-file', required=False, type=argparse.FileType('r'), help='YAML configuration file (see examples)')
+    parser.add_argument('-u', '--user', required=False,
+                        help='HARDAC User used in SLURM (default: ${USER})')
+    parser.add_argument('-e', '--user-duke-email', required=False,
+                        help='Email(s) notified when execution is finished (default: ${USER}@duke.edu)')
+    parser.add_argument('-r', '--root-dir', required=False,
+                        help='Root directory where all subfolders and files will be created '
+                             '(semi-required: either defined here or in conf-file)')
+
+    args = parser.parse_args()
+
+    conf_args = init_conf_args(vars(args))
+
+    outfile = "%s.ipynb" % conf_args['project_name']
+
+    if os.path.isdir(args.out):
+        outfile = os.path.join(args.out, outfile)
+    else:
+        outfile = args.out
+
+    if os.path.isfile(outfile) and not args.force:
+        print(outfile, "is an existing file.
Please use -f or --force to overwrite the contents") + sys.exit(1) + + conf_args['upload'] = args.no_upload + conf_args['data_from'] = args.data_from + make_notebook(outfile, + args.metadata, + conf_args=conf_args) + + +if __name__ == '__main__': + main() diff --git a/templates/cwl_json_gen.j2 b/ggr_cwl_ipynb_gen/templates/cwl_json_gen.j2 similarity index 100% rename from templates/cwl_json_gen.j2 rename to ggr_cwl_ipynb_gen/templates/cwl_json_gen.j2 diff --git a/templates/cwl_slurm_array_gen.j2 b/ggr_cwl_ipynb_gen/templates/cwl_slurm_array_gen.j2 similarity index 100% rename from templates/cwl_slurm_array_gen.j2 rename to ggr_cwl_ipynb_gen/templates/cwl_slurm_array_gen.j2 diff --git a/templates/data_upload.j2 b/ggr_cwl_ipynb_gen/templates/data_upload.j2 similarity index 100% rename from templates/data_upload.j2 rename to ggr_cwl_ipynb_gen/templates/data_upload.j2 diff --git a/templates/download_fastq_files.j2 b/ggr_cwl_ipynb_gen/templates/download_fastq_files.j2 similarity index 100% rename from templates/download_fastq_files.j2 rename to ggr_cwl_ipynb_gen/templates/download_fastq_files.j2 diff --git a/templates/generate_plots.j2 b/ggr_cwl_ipynb_gen/templates/generate_plots.j2 similarity index 100% rename from templates/generate_plots.j2 rename to ggr_cwl_ipynb_gen/templates/generate_plots.j2 diff --git a/templates/generate_qc_cell.j2 b/ggr_cwl_ipynb_gen/templates/generate_qc_cell.j2 similarity index 100% rename from templates/generate_qc_cell.j2 rename to ggr_cwl_ipynb_gen/templates/generate_qc_cell.j2 diff --git a/templates/merge_lanes_fastq.j2 b/ggr_cwl_ipynb_gen/templates/merge_lanes_fastq.j2 similarity index 100% rename from templates/merge_lanes_fastq.j2 rename to ggr_cwl_ipynb_gen/templates/merge_lanes_fastq.j2 diff --git a/templates/ungzip_fastq_files.j2 b/ggr_cwl_ipynb_gen/templates/ungzip_fastq_files.j2 similarity index 100% rename from templates/ungzip_fastq_files.j2 rename to ggr_cwl_ipynb_gen/templates/ungzip_fastq_files.j2 diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..b5a3c46 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,6 @@ +[build-system] +requires = [ + "setuptools>=42", + "wheel" +] +build-backend = "setuptools.build_meta" \ No newline at end of file diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..c434e65 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,9 @@ +jinja2 >=2.8 +nbformat >=4.0.1 +numpy >=1.10.4 +pandas >=0.17.1 +xlrd >=1.0.0 +ruamel.yaml >=0.11.11 + +setuptools +pymongo \ No newline at end of file diff --git a/setup.cfg b/setup.cfg new file mode 100644 index 0000000..5b06ed2 --- /dev/null +++ b/setup.cfg @@ -0,0 +1,18 @@ +[metadata] +name = ggr-cwl-ipynb-gen-alexbarrera +version = 0.5.0 +author = Alejandro Barrera +author_email = alejandro.barrera@duke.edu +description = IPython notebook generator for GGR CWL processing pipelines of genomic data +long_description = file: README.md +long_description_content_type = text/markdown +url = https://github.com/alexbarrera/ggr-cwl-ipynb-gen +project_urls = + Bug Tracker = https://github.com/alexbarrera/ggr-cwl-ipynb-gen/issues +classifiers = + Programming Language :: Python :: 3 + License :: OSI Approved :: MIT License + Operating System :: OS Independent + +[options] +packages = find: diff --git a/setup.py b/setup.py new file mode 100644 index 0000000..27ce028 --- /dev/null +++ b/setup.py @@ -0,0 +1,76 @@ +#!/usr/bin/env python3 +"""Description: +Setup script for ggr-cwl-ipynb-gen +IPython Notebook generator for processing genomic data from GGR 
project, in CWL + +This code is free software; you can redistribute it and/or modify it +under the terms of the BSD License (see the file LICENSE included with +the distribution). +""" +from setuptools import setup, find_packages +import pathlib + +here = pathlib.Path(__file__).parent.resolve() + +# Get the long description from the README file +long_description = (here / 'README.md').read_text(encoding='utf-8') + +# Load version as VERSION environmental variable +exec(open("VERSION.py").read()) + +setup( + name='ggr_cwl_ipynb_gen', + version=VERSION, + description='IPython notebook generator for GGR CWL processing pipelines of genomic data', + long_description=long_description, # Optional + long_description_content_type='text/markdown', + url='https://github.com/alexbarrera/ggr-cwl-ipynb-gen', + author='Alejandro Barrera', + author_email='alejandro.barrera@duke.edu', + classifiers=[ + 'Development Status :: 3 - Alpha', + 'Intended Audience :: Science/Research', + 'Topic :: Scientific/Engineering :: Bio-Informatics', + 'License :: OSI Approved :: MIT License', + 'Programming Language :: Python :: 2', + 'Programming Language :: Python :: 2.7', + 'Programming Language :: Python :: 3', + 'Programming Language :: Python :: 3.6', + 'Programming Language :: Python :: 3.7', + 'Programming Language :: Python :: 3.8', + 'Programming Language :: Python :: 3.9' + ], + keywords='cwl, bioinformatics, development', + scripts=[ + "ggr_cwl_ipynb_gen/ggr-cwl-ipynb-gen.py", + "chipdb_upload/data_upload.py", + ], + include_package_data=True, + py_modules=["ggr_cwl_ipynb_gen"], + python_requires='>=2.7, <4', + install_requires=[ + 'jinja2 >=2.8', + 'nbformat >=4.0.1', + 'numpy >=1.10.4', + 'pandas >=0.17.1', + 'xlrd >=1.0.0', + 'ruamel.yaml >=0.11.11', + 'setuptools', + 'pymongo' + ], # Optional + data_files=[ + ('templates', ['ggr_cwl_ipynb_gen/templates/cwl_json_gen.j2']), + ('templates', ['ggr_cwl_ipynb_gen/templates/cwl_slurm_array_gen.j2']), + ('templates', ['ggr_cwl_ipynb_gen/templates/data_upload.j2']), + ('templates', ['ggr_cwl_ipynb_gen/templates/download_fastq_files.j2']), + ('templates', ['ggr_cwl_ipynb_gen/templates/generate_plots.j2']), + ('templates', ['ggr_cwl_ipynb_gen/templates/generate_qc_cell.j2']), + ('templates', ['ggr_cwl_ipynb_gen/templates/merge_lanes_fastq.j2']), + ('templates', ['ggr_cwl_ipynb_gen/templates/ungzip_fastq_files.j2']), + ], # Optional + project_urls={ + 'Bug Reports': 'https://github.com/alexbarrera/ggr-cwl-ipynb-gen/issues', + 'Source': 'https://github.com/alexbarrera/ggr-cwl-ipynb-gen/', + } + +) \ No newline at end of file From 3edda0877cd11b69cdacd7b3683e23e70ab6f3b4 Mon Sep 17 00:00:00 2001 From: Alejandro Barrera Date: Tue, 9 Mar 2021 10:49:33 -0500 Subject: [PATCH 3/6] Drop attempt to support Python 2 --- setup.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/setup.py b/setup.py index 27ce028..05751ca 100644 --- a/setup.py +++ b/setup.py @@ -7,6 +7,9 @@ under the terms of the BSD License (see the file LICENSE included with the distribution). """ +import sys +if sys.version_info[0] == 2: + sys.exit("Sorry, Python 2 is not supported anymore. 
Please check and old branch (pre 2021)") from setuptools import setup, find_packages import pathlib @@ -32,8 +35,6 @@ 'Intended Audience :: Science/Research', 'Topic :: Scientific/Engineering :: Bio-Informatics', 'License :: OSI Approved :: MIT License', - 'Programming Language :: Python :: 2', - 'Programming Language :: Python :: 2.7', 'Programming Language :: Python :: 3', 'Programming Language :: Python :: 3.6', 'Programming Language :: Python :: 3.7', @@ -47,7 +48,7 @@ ], include_package_data=True, py_modules=["ggr_cwl_ipynb_gen"], - python_requires='>=2.7, <4', + python_requires='>=3.1', install_requires=[ 'jinja2 >=2.8', 'nbformat >=4.0.1', From a1b068447c1988aeafd4ce0c3b05b14066d2e1b2 Mon Sep 17 00:00:00 2001 From: Alejandro Barrera Date: Mon, 15 Mar 2021 17:22:00 -0400 Subject: [PATCH 4/6] Resolve review comments from Thomas --- MANIFEST.in | 1 - VERSION.py | 1 - ggr_cwl_ipynb_gen/__init__.py | 2 -- ggr_cwl_ipynb_gen/ggr-cwl-ipynb-gen.py | 2 ++ requirements.txt | 3 +-- setup.cfg | 18 +++++++++++---- setup.py | 32 +------------------------- 7 files changed, 17 insertions(+), 42 deletions(-) delete mode 100644 MANIFEST.in delete mode 100644 VERSION.py diff --git a/MANIFEST.in b/MANIFEST.in deleted file mode 100644 index 903bcc4..0000000 --- a/MANIFEST.in +++ /dev/null @@ -1 +0,0 @@ -include templates/* \ No newline at end of file diff --git a/VERSION.py b/VERSION.py deleted file mode 100644 index f1c763a..0000000 --- a/VERSION.py +++ /dev/null @@ -1 +0,0 @@ -VERSION = '0.5.0' \ No newline at end of file diff --git a/ggr_cwl_ipynb_gen/__init__.py b/ggr_cwl_ipynb_gen/__init__.py index a75ddec..e69de29 100644 --- a/ggr_cwl_ipynb_gen/__init__.py +++ b/ggr_cwl_ipynb_gen/__init__.py @@ -1,2 +0,0 @@ -if __name__ == '__main__': - ggr-main() diff --git a/ggr_cwl_ipynb_gen/ggr-cwl-ipynb-gen.py b/ggr_cwl_ipynb_gen/ggr-cwl-ipynb-gen.py index 1cc9fde..2de202e 100644 --- a/ggr_cwl_ipynb_gen/ggr-cwl-ipynb-gen.py +++ b/ggr_cwl_ipynb_gen/ggr-cwl-ipynb-gen.py @@ -634,6 +634,8 @@ def main(): parser.add_argument('-r', '--root-dir', required=False, help='Root directory where all subfolders and files will be created ' '(semi-required: either defined here or in conf-file)') + parser.add_argument('-v', '--version', required=False, + help='Print version of the program and exit') args = parser.parse_args() diff --git a/requirements.txt b/requirements.txt index c434e65..48a30d4 100644 --- a/requirements.txt +++ b/requirements.txt @@ -5,5 +5,4 @@ pandas >=0.17.1 xlrd >=1.0.0 ruamel.yaml >=0.11.11 -setuptools -pymongo \ No newline at end of file +pymongo diff --git a/setup.cfg b/setup.cfg index 5b06ed2..4b62c45 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,18 +1,26 @@ [metadata] -name = ggr-cwl-ipynb-gen-alexbarrera +name = ggr-cwl-ipynb-gen version = 0.5.0 author = Alejandro Barrera author_email = alejandro.barrera@duke.edu description = IPython notebook generator for GGR CWL processing pipelines of genomic data long_description = file: README.md long_description_content_type = text/markdown -url = https://github.com/alexbarrera/ggr-cwl-ipynb-gen +url = https://github.com/ReddyLab/ggr-cwl-ipynb-gen project_urls = - Bug Tracker = https://github.com/alexbarrera/ggr-cwl-ipynb-gen/issues + Bug Tracker = https://github.com/ReddyLab/ggr-cwl-ipynb-gen/issues + Source = https://github.com/ReddyLab/ggr-cwl-ipynb-gen/ + classifiers = - Programming Language :: Python :: 3 + Development Status :: 3 - Alpha + Intended Audience :: Science/Research + Topic :: Scientific/Engineering :: Bio-Informatics License :: OSI Approved :: 
MIT License - Operating System :: OS Independent + Programming Language :: Python :: 3 + Programming Language :: Python :: 3.6 + Programming Language :: Python :: 3.7 + Programming Language :: Python :: 3.8 + Programming Language :: Python :: 3.9 [options] packages = find: diff --git a/setup.py b/setup.py index 05751ca..a5d1e4e 100644 --- a/setup.py +++ b/setup.py @@ -11,36 +11,10 @@ if sys.version_info[0] == 2: sys.exit("Sorry, Python 2 is not supported anymore. Please check and old branch (pre 2021)") from setuptools import setup, find_packages -import pathlib - -here = pathlib.Path(__file__).parent.resolve() - -# Get the long description from the README file -long_description = (here / 'README.md').read_text(encoding='utf-8') - -# Load version as VERSION environmental variable -exec(open("VERSION.py").read()) setup( - name='ggr_cwl_ipynb_gen', - version=VERSION, - description='IPython notebook generator for GGR CWL processing pipelines of genomic data', - long_description=long_description, # Optional long_description_content_type='text/markdown', url='https://github.com/alexbarrera/ggr-cwl-ipynb-gen', - author='Alejandro Barrera', - author_email='alejandro.barrera@duke.edu', - classifiers=[ - 'Development Status :: 3 - Alpha', - 'Intended Audience :: Science/Research', - 'Topic :: Scientific/Engineering :: Bio-Informatics', - 'License :: OSI Approved :: MIT License', - 'Programming Language :: Python :: 3', - 'Programming Language :: Python :: 3.6', - 'Programming Language :: Python :: 3.7', - 'Programming Language :: Python :: 3.8', - 'Programming Language :: Python :: 3.9' - ], keywords='cwl, bioinformatics, development', scripts=[ "ggr_cwl_ipynb_gen/ggr-cwl-ipynb-gen.py", @@ -68,10 +42,6 @@ ('templates', ['ggr_cwl_ipynb_gen/templates/generate_qc_cell.j2']), ('templates', ['ggr_cwl_ipynb_gen/templates/merge_lanes_fastq.j2']), ('templates', ['ggr_cwl_ipynb_gen/templates/ungzip_fastq_files.j2']), - ], # Optional - project_urls={ - 'Bug Reports': 'https://github.com/alexbarrera/ggr-cwl-ipynb-gen/issues', - 'Source': 'https://github.com/alexbarrera/ggr-cwl-ipynb-gen/', - } + ] ) \ No newline at end of file From c1e5e23db5ef21d54aa47e0d45dfa85af1236206 Mon Sep 17 00:00:00 2001 From: Alejandro Barrera Date: Mon, 15 Mar 2021 17:38:22 -0400 Subject: [PATCH 5/6] Add version to pymongo requirement --- requirements.txt | 3 +-- setup.py | 2 +- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/requirements.txt b/requirements.txt index 48a30d4..e0a34ba 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,5 +4,4 @@ numpy >=1.10.4 pandas >=0.17.1 xlrd >=1.0.0 ruamel.yaml >=0.11.11 - -pymongo +pymongo >=3.4.0 diff --git a/setup.py b/setup.py index a5d1e4e..7cf4142 100644 --- a/setup.py +++ b/setup.py @@ -31,7 +31,7 @@ 'xlrd >=1.0.0', 'ruamel.yaml >=0.11.11', 'setuptools', - 'pymongo' + 'pymongo >=3.4.0' ], # Optional data_files=[ ('templates', ['ggr_cwl_ipynb_gen/templates/cwl_json_gen.j2']), From b777788549c3dd3872c1630debf05c0333daaf34 Mon Sep 17 00:00:00 2001 From: Alejandro Barrera Date: Mon, 15 Mar 2021 17:41:48 -0400 Subject: [PATCH 6/6] Add version to pymongo requirement --- data_upload.py | 1 - ggr-cwl-ipynb-gen.py | 1 - 2 files changed, 2 deletions(-) delete mode 120000 data_upload.py delete mode 120000 ggr-cwl-ipynb-gen.py diff --git a/data_upload.py b/data_upload.py deleted file mode 120000 index 9aae8f7..0000000 --- a/data_upload.py +++ /dev/null @@ -1 +0,0 @@ -chipdb_upload/data_upload.py \ No newline at end of file diff --git a/ggr-cwl-ipynb-gen.py 
b/ggr-cwl-ipynb-gen.py
deleted file mode 120000
index e27d141..0000000
--- a/ggr-cwl-ipynb-gen.py
+++ /dev/null
@@ -1 +0,0 @@
-ggr_cwl_ipynb_gen/ggr-cwl-ipynb-gen.py
\ No newline at end of file