From ee0583255b42de359346bd046c212179fd2af714 Mon Sep 17 00:00:00 2001 From: Shifu Chen Date: Sat, 28 Oct 2017 22:08:17 +0800 Subject: [PATCH] enable gzip output mode: gzip input --> gzip output; non-gzip input --> non-gzip output; non-gzip input + --gzip (-z) option --> gzip output. Related issues: https://github.com/OpenGene/AfterQC/issues/26 https://github.com/OpenGene/AfterQC/issues/14 --- after.py | 4 ++++ fastq.py | 11 +++++++++-- preprocesser.py | 45 +++++++++++++++++++++++++-------------------- 3 files changed, 38 insertions(+), 22 deletions(-) diff --git a/after.py b/after.py index b33bfc1..58ca8f2 100755 --- a/after.py +++ b/after.py @@ -86,6 +86,10 @@ def parseCommand(): help = "set the qual num to 0 for mismatched base pairs in overlapped areas to mask them out") parser.add_option("", "--no_overlap", dest = "no_overlap", action='store_true', default = False, help = "disable overlap analysis (usually much faster with this option)") + parser.add_option("-z", "--gzip", dest = "gzip", action='store_true', default = False, + help = "force gzip compression for output, even the input is not gzip compressed") + parser.add_option("", "--compression", dest = "compression", type = "int", default = 2, + help = "set compression level (0~9) for gzip output, default is 2 (0 = best speed, 9 = best compression).") return parser.parse_args() def matchFlag(filename, flag): diff --git a/fastq.py b/fastq.py index e27a59a..4623617 100644 --- a/fastq.py +++ b/fastq.py @@ -60,10 +60,12 @@ class Writer: __file = None - def __init__(self, fname): + def __init__(self, fname, force_gzip = False, gzip_compression = 2): self.filename = fname + if not self.filename.endswith(".gz") and force_gzip: + self.filename = self.filename + ".gz" if self.filename.endswith(".gz"): - self.__file = gzip.open(self.filename, "w") + self.__file = gzip.open(self.filename, "w", compresslevel = gzip_compression) elif self.filename.endswith(".bz2"): print("ERROR: Write bzip2 stream is not supported") sys.exit(1) @@ -81,6
+83,11 @@ def __del__(self): def flush(self): if self.__file !=None: self.__file.flush() + + def close(self): + if self.__file !=None: + self.__file.flush() + self.__file.close() def writeLines(self, lines): if self.__file == None: diff --git a/preprocesser.py b/preprocesser.py index 7128da3..4c226af 100755 --- a/preprocesser.py +++ b/preprocesser.py @@ -314,17 +314,22 @@ def run(self): if self.options.store_overlap and self.options.read2_file != None and (not os.path.exists(overlap_dir)): os.makedirs(overlap_dir) + + gzip_out = self.options.gzip + gzip_comp = self.options.compression; + if not gzip_out and self.options.read1_file.endswith(".gz"): + gzip_out = True good_read1_file = None bad_read1_file = None overlap_read1_file = None if not self.options.qc_only: - good_read1_file = fastq.Writer(os.path.join(good_dir, getMainName(self.options.read1_file)+".good.fq")) - bad_read1_file = fastq.Writer(os.path.join(bad_dir, getMainName(self.options.read1_file)+".bad.fq")) + good_read1_file = fastq.Writer(os.path.join(good_dir, getMainName(self.options.read1_file)+".good.fq"), gzip_out, gzip_comp) + bad_read1_file = fastq.Writer(os.path.join(bad_dir, getMainName(self.options.read1_file)+".bad.fq"), gzip_out, gzip_comp) overlap_read1_file = None if self.options.store_overlap: - overlap_read1_file = fastq.Writer(os.path.join(overlap_dir, getMainName(self.options.read1_file)+".overlap.fq")) + overlap_read1_file = fastq.Writer(os.path.join(overlap_dir, getMainName(self.options.read1_file)+".overlap.fq"), gzip_out, gzip_comp) #other files are optional read2_file = None @@ -346,24 +351,24 @@ def run(self): if self.options.read2_file != None: read2_file = fastq.Reader(self.options.read2_file) if not self.options.qc_only: - good_read2_file = fastq.Writer(os.path.join(good_dir, getMainName(self.options.read2_file)+".good.fq")) - bad_read2_file = fastq.Writer(os.path.join(bad_dir, getMainName(self.options.read2_file)+".bad.fq")) + good_read2_file = 
fastq.Writer(os.path.join(good_dir, getMainName(self.options.read2_file)+".good.fq"), gzip_out, gzip_comp) + bad_read2_file = fastq.Writer(os.path.join(bad_dir, getMainName(self.options.read2_file)+".bad.fq"), gzip_out, gzip_comp) if self.options.store_overlap and self.options.read2_file != None: - overlap_read2_file = fastq.Writer(os.path.join(overlap_dir, getMainName(self.options.read2_file)+".overlap.fq")) + overlap_read2_file = fastq.Writer(os.path.join(overlap_dir, getMainName(self.options.read2_file)+".overlap.fq"), gzip_out, gzip_comp) if self.options.index1_file != None: index1_file = fastq.Reader(self.options.index1_file) if not self.options.qc_only: - good_index1_file = fastq.Writer(os.path.join(good_dir, getMainName(self.options.index1_file)+".good.fq")) - bad_index1_file = fastq.Writer(os.path.join(bad_dir, getMainName(self.options.index1_file)+".bad.fq")) + good_index1_file = fastq.Writer(os.path.join(good_dir, getMainName(self.options.index1_file)+".good.fq"), gzip_out, gzip_comp) + bad_index1_file = fastq.Writer(os.path.join(bad_dir, getMainName(self.options.index1_file)+".bad.fq"), gzip_out, gzip_comp) if self.options.store_overlap and self.options.read2_file != None: - overlap_index1_file = fastq.Writer(os.path.join(overlap_dir, getMainName(self.options.index1_file)+".overlap.fq")) + overlap_index1_file = fastq.Writer(os.path.join(overlap_dir, getMainName(self.options.index1_file)+".overlap.fq"), gzip_out, gzip_comp) if self.options.index2_file != None: index2_file = fastq.Reader(self.options.index2_file) if not self.options.qc_only: - good_index2_file = fastq.Writer(os.path.join(good_dir, getMainName(self.options.index2_file)+".good.fq")) - bad_index2_file = fastq.Writer(os.path.join(bad_dir, getMainName(self.options.index2_file)+".bad.fq")) + good_index2_file = fastq.Writer(os.path.join(good_dir, getMainName(self.options.index2_file)+".good.fq"), gzip_out, gzip_comp) + bad_index2_file = fastq.Writer(os.path.join(bad_dir, 
getMainName(self.options.index2_file)+".bad.fq"), gzip_out, gzip_comp) if self.options.store_overlap and self.options.read2_file != None: - overlap_index2_file = fastq.Writer(os.path.join(overlap_dir, getMainName(self.options.index2_file)+".overlap.fq")) + overlap_index2_file = fastq.Writer(os.path.join(overlap_dir, getMainName(self.options.index2_file)+".overlap.fq"), gzip_out, gzip_comp) r1 = None r2 = None @@ -633,17 +638,17 @@ def run(self): #close all files if not self.options.qc_only: - good_read1_file.flush() - bad_read1_file.flush() + good_read1_file.close() + bad_read1_file.close() if self.options.read2_file != None: - good_read2_file.flush() - bad_read2_file.flush() + good_read2_file.close() + bad_read2_file.close() if self.options.index1_file != None: - good_index1_file.flush() - bad_index1_file.flush() + good_index1_file.close() + bad_index1_file.close() if self.options.index2_file != None: - good_index2_file.flush() - bad_index2_file.flush() + good_index2_file.close() + bad_index2_file.close() # print stat numbers BAD_READS = TOTAL_READS - GOOD_READS