Update version to 0.1.9
rdalbanus committed Sep 17, 2019
1 parent 6cd7de0 commit 20c3cca
Showing 21 changed files with 1,853 additions and 240 deletions.
26 changes: 20 additions & 6 deletions README.rst
@@ -7,15 +7,24 @@ A toolkit for working with ATAC-seq data.
What's in the box?
==================

Programs we've found useful in ATAC-seq pipelines
-------------------------------------------------
Programs we've found useful in analyzing ATAC-seq data
------------------------------------------------------

* ``trim_adapters``: based on Jason Buenrostro's utility for trimming
Illumina adapters by aligning paired reads to each other.
* ``make_cut_matrix``: useful in conjunction with CENTIPEDE, and in
generating plots of transcription factor binding sites.
* ``plot_aggregate_matrix.R``: generates plots for motifs given the
* ``make_midpoint_matrix``: useful in conjunction with CENTIPEDE, and in
generating plots of transcription factor binding sites.
* ``measure_signal``: measures size and position of ATAC-seq fragments
overlapping genomic features.
* ``measure_features``: given a bigWig file of coverage counts and a BED
file of features, calculates a requested statistic for each feature.
* ``plot_aggregate_cut_matrix.R``: generates plots for motifs given the
aggregate output of `make_cut_matrix`
* ``plot_aggregate_midpoint_matrix.R``: generates plots for motifs given the
aggregate output of `make_midpoint_matrix`
* ``plot_signal.R``: plots the output of `measure_signal`
* ``trim_adapters``: based on Jason Buenrostro's utility for trimming
Illumina adapters by aligning paired reads to each other.

A Python library you can use in your own tools for processing ATAC-seq data
---------------------------------------------------------------------------
@@ -31,7 +40,8 @@ you might find your pipeline can be simplified too.
Requirements
============

* Python. We've run it successfully under versions 2.7.10 and 3.4.3.
* Python. We've run it successfully under versions 2.7 and 3.5.
* pyBigWig
* pysam
* python-levenshtein
* sexpdata
@@ -44,6 +54,10 @@ At the command line::
git clone https://github.com/ParkerLab/atactk
pip install ./atactk

Or in one step, if you don't want a local copy::

pip install git+https://github.com/ParkerLab/atactk

Documentation
=============

4 changes: 2 additions & 2 deletions atactk/__init__.py
@@ -1,12 +1,12 @@
#
# atactk: ATAC-seq toolkit
#
# Copyright 2015 The Parker Lab at the University of Michigan
# Copyright 2015 Stephen Parker
#
# Licensed under Version 3 of the GPL or any later version
#


__author__ = 'The Parker Lab'
__email__ = 'parkerlab-software@umich.edu'
__version__ = '0.1.6'
__version__ = '0.1.9'
52 changes: 41 additions & 11 deletions atactk/command.py
@@ -1,7 +1,7 @@
#
# atactk: ATAC-seq toolkit
#
# Copyright 2015 The Parker Lab at the University of Michigan
# Copyright 2015 Stephen Parker
#
# Licensed under Version 3 of the GPL or any later version
#
@@ -14,10 +14,13 @@
from __future__ import print_function

import argparse
import logging
import sys

import sexpdata

import atactk


def check_bins_for_overlap(bins):
"""
@@ -39,28 +42,51 @@ def check_bins_for_overlap(bins):
for bin in bins:
start, end, resolution = bin
if start <= last_end:
raise argparse.ArgumentTypeError("Bin %d-%d overlaps %d-%d." % (start, end, last_start, last_end))
raise argparse.ArgumentTypeError("Bin {:d}-{:d} overlaps {:d}-{:d}.".format(start, end, last_start, last_end))
last_start, last_end = start, end


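Editor's note: the overlap check in this hunk can be exercised standalone. The sketch below is not the installed package; the seed values of `last_start` and `last_end` fall outside the hunk and are assumed to be zero here.

```python
import argparse

def check_bins_for_overlap(bins):
    """Raise if any (start, end, resolution) bin overlaps its predecessor."""
    last_start, last_end = 0, 0  # assumed seed values; not shown in the hunk
    for start, end, resolution in bins:
        if start <= last_end:
            raise argparse.ArgumentTypeError(
                "Bin {:d}-{:d} overlaps {:d}-{:d}.".format(start, end, last_start, last_end))
        last_start, last_end = start, end

check_bins_for_overlap([(36, 149, 1), (150, 224, 2), (225, 324, 2)])  # no overlap: passes
```

Passing `[(36, 150, 1), (150, 224, 2)]` instead raises `ArgumentTypeError`, because the second bin's start (150) is not past the previous bin's end (150).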
def check_bin_resolutions(bins, extension):
"""
Check the resolutions of a flattened list of bins.
Look for inconsistent resolutions, which tend to give suboptimal
results, or resolutions that don't divide the extended region
evenly.
"""
logger = logging.getLogger("{}.{}".format(__name__, check_bin_resolutions.__name__))

resolutions = set(bin[2] for bin in bins)
if len(resolutions) > 1:
logger.warn("""We've found we usually get better results using the same resolution in each bin.""")

for bin in bins:
start, end, resolution = bin
if extension % resolution != 0:
logger.warn("Bin {:d}-{:d} resolution {:d} is not a divisor of extension {:d}".format(start, end, resolution, extension))


def parse_bins(bins_string):
"""
Parse the string representing template size groups.
Parse the string representing fragment size groups.
The bins are specified as a list of groups, each group comprising
one or more bins, and ending with a resolution value, which
controls how many individual cuts in the extended region around
the feature are aggregated. Within the feature itself, we always
count the cut points for each base. A complete example:
count the cut points for each base.
(36-149 1) (150-224 225-324 2) (325-400 5)
We recommend using the same resolution in all bins. The following
example only shows different resolutions to be thorough.
Assume a --bins value of '(36-149 1) (150-224 225-324 2) (325-400 5)'.
With a resolution of 1, every base in the extended region
around motifs overlapping templates of length 36-149 would be
around motifs overlapping fragments of length 36-149 would be
scored independently; each base's cut count would be added to
the matrix.
The second group, for templates of length 150-224 or 225-324,
The second group, for fragments of length 150-224 or 225-324,
with a resolution of 2, would result in every two bases in the
extended region around motifs being added together. Then the
aggregate scores of the two bins in the group would be summed,
@@ -72,7 +98,7 @@ def parse_bins(bins_string):
To illustrate, assume these settings and an imaginary motif 5
bases long, with a 10-base extended region on either side, and
for the sake of example pretend that each template length bin
for the sake of example pretend that each fragment length bin
had the same counts of cut points around the motif, shown
here::
@@ -120,7 +146,7 @@ def parse_bins(bins_string):
if resolution < 1:
raise ValueError
except ValueError:
raise argparse.ArgumentTypeError("Resolution in bin group %s is not a positive integer." % g)
raise argparse.ArgumentTypeError("Resolution in bin group {} is not a positive integer.".format(g))

for i, bin_string in enumerate(bin_group):
bin_string = bin_string.value()
@@ -131,14 +157,18 @@ def parse_bins(bins_string):
start, end = [int(s) for s in bin]
if start > end:
start, end = end, start
print("Bin %s specified backward; corrected to %d-%d" % (bin_string, start, end), file=sys.stderr)
print("Bin {} specified backward; corrected to {:d}-{:d}".format(bin_string, start, end), file=sys.stderr)

group.append((start, end, resolution))
except ValueError:
raise argparse.ArgumentTypeError("Bin %s in group %s is malformed." % (i, g))
raise argparse.ArgumentTypeError("Bin {} in group {} is malformed.".format(i, g))
groups.append(group)

# flatten groups to just a list of bins, sort, check for overlaps
bins = sorted([b for bins in groups for b in bins])
check_bins_for_overlap(bins)
return groups


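Editor's note: the docstring's example bins parse into groups of `(start, end, resolution)` tuples. The hypothetical stand-in below uses a regex instead of `sexpdata`, only to show the expected shape of the result; the real `parse_bins` also validates, warns, and corrects backward bins.

```python
import re

def parse_bins_sketch(bins_string):
    # Hypothetical stand-in for parse_bins, without sexpdata.
    groups = []
    for group in re.findall(r'\(([^)]*)\)', bins_string):
        tokens = group.split()
        resolution = int(tokens[-1])           # last token of each group is the resolution
        bins = []
        for token in tokens[:-1]:              # remaining tokens are start-end ranges
            start, end = sorted(int(s) for s in token.split('-'))
            bins.append((start, end, resolution))
        groups.append(bins)
    return groups

print(parse_bins_sketch('(36-149 1) (150-224 225-324 2) (325-400 5)'))
# [[(36, 149, 1)], [(150, 224, 2), (225, 324, 2)], [(325, 400, 5)]]
```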
def version(program_name=''):
return program_name + ' ' + atactk.__version__
59 changes: 32 additions & 27 deletions atactk/data.py
@@ -1,7 +1,7 @@
#
# atactk: ATAC-seq toolkit
#
# Copyright 2015 The Parker Lab at the University of Michigan
# Copyright 2015 Stephen Parker
#
# Licensed under Version 3 of the GPL or any later version
#
@@ -11,8 +11,6 @@
Code for reading and manipulating data commonly used in ATAC-seq pipelines.
"""

from __future__ import print_function

import csv
import gzip
import pysam
@@ -80,15 +78,12 @@ def __init__(self, reference=None, start=None, end=None, name=None, score=0, str
self.feature_end = int(end)

# optional BED fields
self.name = name
self.score = float(score)
self.strand = strand
self.name = name or None
self.score = score and float(score) or None
self.strand = strand or None

# region adjustments
self.extension = int(extension)
self.is_reverse = strand == '-'
self.region_start = self.feature_start - self.extension
self.region_end = self.feature_end + self.extension

def __str__(self):
return '\t'.join(str(attribute or '') for attribute in [
@@ -101,13 +96,30 @@ def __str__(self):
self.extension,
])

@property
def center(self):
return self.feature_start + int(round(self.feature_length / 2.0))

@property
def feature_length(self):
return self.feature_end - self.feature_start

@property
def is_reverse(self):
return self.strand == '-'

@property
def region_end(self):
return self.center + self.extension

@property
def region_length(self):
return self.region_end - self.region_start
return self.extension * 2

@property
def region_start(self):
return self.center - self.extension


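Editor's note: this hunk replaces stored region bounds with properties derived from the feature's center, so `region_length` is always `extension * 2` regardless of feature length. A minimal sketch of just those properties, with made-up coordinates:

```python
class FeatureSketch:
    # Minimal stand-in for ExtendedFeature, keeping only the derived
    # properties shown in the diff; the coordinates below are invented.
    def __init__(self, feature_start, feature_end, extension):
        self.feature_start = feature_start
        self.feature_end = feature_end
        self.extension = extension

    @property
    def feature_length(self):
        return self.feature_end - self.feature_start

    @property
    def center(self):
        return self.feature_start + int(round(self.feature_length / 2.0))

    @property
    def region_start(self):
        return self.center - self.extension

    @property
    def region_end(self):
        return self.center + self.extension

    @property
    def region_length(self):
        return self.extension * 2

f = FeatureSketch(100, 104, extension=10)
print(f.center, f.region_start, f.region_end, f.region_length)  # 102 92 112 20
```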
def complement(seq):
"""
@@ -175,8 +187,9 @@ def open_maybe_gzipped(filename):

def count_features(filename):
count = 0
for line in open_maybe_gzipped(filename):
count += 1
with open_maybe_gzipped(filename) as f:
for line in f:
count += 1
return count


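Editor's note: the change above wraps the handle in `with` so the file is closed promptly. `open_maybe_gzipped` itself is defined outside this hunk; the sketch below assumes it sniffs the two-byte gzip magic number, which may differ from the real implementation.

```python
import gzip
import os
import tempfile

def open_maybe_gzipped(filename):
    # Assumed behavior: pick the opener by sniffing the gzip magic bytes.
    with open(filename, 'rb') as f:
        magic = f.read(2)
    if magic == b'\x1f\x8b':
        return gzip.open(filename, 'rt')
    return open(filename, 'rt')

def count_features(filename):
    count = 0
    with open_maybe_gzipped(filename) as f:
        for line in f:
            count += 1
    return count

# Usage: count lines in a plain and a gzipped BED-like file.
with tempfile.TemporaryDirectory() as d:
    plain = os.path.join(d, 'a.bed')
    with open(plain, 'w') as f:
        f.write('chr1\t0\t100\nchr1\t200\t300\n')
    packed = os.path.join(d, 'a.bed.gz')
    with gzip.open(packed, 'wt') as f:
        f.write('chr1\t0\t100\nchr1\t200\t300\n')
    print(count_features(plain), count_features(packed))  # 2 2
```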
@@ -203,32 +216,24 @@ def read_features(filename, extension=100, feature_class=ExtendedFeature):
An :class:`ExtendedFeature` instance for each row of the file.
"""

if filename == '-':
source = sys.stdin
else:
source = open_maybe_gzipped(filename)
reader = csv.DictReader(source, fieldnames=FEATURE_FIELDNAMES, restkey='extra_fields', dialect='excel-tab')
for row in reader:
if 'extra_fields' in row:
del row['extra_fields']
yield feature_class(extension=extension, **row)
with filename == '-' and sys.stdin or open_maybe_gzipped(filename) as source:
reader = csv.DictReader(source, fieldnames=FEATURE_FIELDNAMES, restkey='extra_fields', dialect='excel-tab')


ALIGNMENT_FILE_CACHE = {}
for row in reader:
if 'extra_fields' in row:
del row['extra_fields']
yield feature_class(extension=extension, **row)


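Editor's note: the rewritten `read_features` relies on `csv.DictReader`'s `restkey` to collect trailing BED columns so they can be dropped. A small demonstration — the `FEATURE_FIELDNAMES` list here is hypothetical, since the real one is defined elsewhere in `atactk.data`:

```python
import csv
import io

# Hypothetical field list for illustration only.
FEATURE_FIELDNAMES = ['reference', 'start', 'end', 'name', 'score', 'strand']

bed = 'chr1\t100\t200\tpeak1\t17\t+\textra1\textra2\n'
reader = csv.DictReader(io.StringIO(bed), fieldnames=FEATURE_FIELDNAMES,
                        restkey='extra_fields', dialect='excel-tab')
for row in reader:
    # Columns beyond the declared fieldnames land under 'extra_fields';
    # read_features drops them before constructing the feature.
    row.pop('extra_fields', None)
    print(row)
```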
def open_alignment_file(alignment_filename):
if alignment_filename in ALIGNMENT_FILE_CACHE:
return ALIGNMENT_FILE_CACHE[alignment_filename]

alignment_file = pysam.AlignmentFile(alignment_filename, 'rb')
try:
alignment_file.check_index()
except AttributeError:
raise AttributeError('The alignments file {} is not in BAM format. Please supply an indexed BAM file.'.format(alignment_filename))
except ValueError:
raise ValueError('The alignment file {} is not usable. Please supply an indexed BAM file.'.format(alignment_filename))
ALIGNMENT_FILE_CACHE[alignment_filename] = alignment_file

return alignment_file


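Editor's note: `open_alignment_file` memoizes handles in `ALIGNMENT_FILE_CACHE` so repeated lookups reuse a single `pysam.AlignmentFile`. The same pattern, sketched with an injectable opener in place of pysam so it runs without a BAM file on hand:

```python
FILE_CACHE = {}

def open_cached(filename, opener):
    # Memoizing open: each filename is opened at most once per process.
    if filename in FILE_CACHE:
        return FILE_CACHE[filename]
    handle = opener(filename)
    FILE_CACHE[filename] = handle
    return handle

calls = []
def fake_opener(name):
    # Stand-in for pysam.AlignmentFile(name, 'rb'); records each real open.
    calls.append(name)
    return object()

a = open_cached('sample.bam', fake_opener)
b = open_cached('sample.bam', fake_opener)
print(a is b, len(calls))  # True 1
```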
7 changes: 7 additions & 0 deletions atactk/metrics/__init__.py
@@ -0,0 +1,7 @@
#
# atactk: ATAC-seq toolkit
#
# Copyright 2015 Stephen Parker
#
# Licensed under Version 3 of the GPL or any later version
#
36 changes: 36 additions & 0 deletions atactk/metrics/common.py
@@ -0,0 +1,36 @@
#
# atactk: ATAC-seq toolkit
#
# Copyright 2015 Stephen Parker
#
# Licensed under Version 3 of the GPL or any later version
#


import atactk.data
import atactk.util


DEFAULT_CUT_POINT_OFFSET = 4


def reduce_scores(scores, resolution):
"""
Reduce a sequence of scores by summing every `resolution` values.
Called with `scores` of [0, 1, 1, 4, 2], you'd get the following
results at various resolutions:
========== ======
Resolution Result
========== ======
1 [0, 1, 1, 4, 2]
2 [1, 5, 2]
3 [2, 6]
4 [6, 2]
10 [8]
========== ======
"""
if resolution == 1:
return scores
return [sum(chunk) for chunk in atactk.util.partition(resolution, scores)]
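Editor's note: the docstring table above can be checked directly. `atactk.util.partition` is not shown in this diff; the chunking below is an assumed equivalent that splits a sequence into runs of `resolution` values, the last run possibly shorter.

```python
def partition(n, seq):
    # Assumed stand-in for atactk.util.partition: chunks of n, last may be short.
    return [seq[i:i + n] for i in range(0, len(seq), n)]

def reduce_scores(scores, resolution):
    if resolution == 1:
        return scores
    return [sum(chunk) for chunk in partition(resolution, scores)]

for resolution in (1, 2, 3, 4, 10):
    print(resolution, reduce_scores([0, 1, 1, 4, 2], resolution))
# 1 [0, 1, 1, 4, 2]
# 2 [1, 5, 2]
# 3 [2, 6]
# 4 [6, 2]
# 10 [8]
```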