Addresses #3. Only the parser for AMOS file is done. Now can use it t…

…o make an executable that will pull quality data from the input fastq files and join them with AMOS RED sequences
VDBWRAIR · Apr 3, 2015 · 26fac61 · 26fac61
1 parent 235fc86
commit 26fac61
Show file tree

Hide file tree

Showing 5 changed files with 508 additions and 0 deletions.
diff --git a/CHANGELOG.rst b/CHANGELOG.rst
@@ -9,3 +9,4 @@ Version 0.1.0
 * Added rename_fasta that can rename fasta sequence identifiers based
   on a input rename file
 * Added travis, coveralls, readthedocs
+* Added amos file parser that is specific to Ray assembler amos format
diff --git a/bio_pieces/amos.py b/bio_pieces/amos.py
@@ -0,0 +1,187 @@
+import re
+import importlib
+
+def splitline(strvalue, delimiter, converter):
+    '''
+    Split a string on delimiter into 2 pieces and convert the
+    right piece using converter
+
+    :param str strvalue: String to split
+    :param str delimiter: string delimiter to split with
+    :param func converter: function to convert right piece of split with
+    :return: tuple(left, converter(right))
+    '''
+    left, right = strvalue.split(delimiter)
+    return (left, converter(right))
+
+class AMOS(object):
+    def __init__(self, file_handle):
+        self.reds = {}
+        self.ctgs = {}
+        self._parse_file(file_handle)
+
+    def _parse_file(self, file_handle):
+        '''
+        Iterate over file_handle and parse the AMOS contents
+        '''
+        hdr = re.compile('^\{([A-Z]{3})$')
+        # Build this up to be parsed
+        curstr = ''
+        curtype = None
+        for line in file_handle:
+            m = hdr.match(line)
+            # New type found
+            if m:
+                _typename = m.group(1)
+                # First iteration
+                if curtype is None:
+                    curstr = ''
+                    curtype = getattr(importlib.import_module(__name__), _typename)
+                elif _typename != 'TLE':
+                    # Append parsed curtype to the correct list
+                    # either self.reds or self.ctgs
+                    p = curtype.parse(curstr)
+                    classname = curtype.__name__.lower()
+                    getattr(self, classname + 's')[p.iid] = p
+                    # Get new type and reset curstr
+                    curtype = getattr(importlib.import_module(__name__), _typename)
+                    curstr = ''
+            curstr += line
+        # Catch the last CTG
+        p = curtype.parse(curstr)
+        self.ctgs[p.iid] = p
+
+class CTG(object):
+    def __init__(self, iid, eid, com, seq, qlt, tlelist):
+        self.iid = iid
+        self.eid = eid
+        self.com = com
+        self.seq = seq
+        self.qlt = qlt
+        self.tlelist = tlelist
+
+    @classmethod
+    def parse(klass, ctgstr):
+        '''
+        Parse an AMOS CTG block and return a CTG instance
+
+        {CTG
+        iid:1
+        eid:contig-000001
+        com:
+        Ray software bla bla
+        .
+        seq:
+        ATGC
+        .
+        qlt:
+        DDDD
+        .
+        {TLE
+        src:1
+        off:1
+        clr:0,4
+        }
+        }
+        
+        :param str ctgstr: CTG block string
+        :param dict reddict: Dictionary of iid -> RED instances
+        :return: CTG instance
+        '''
+        lines = ctgstr.splitlines()
+        iid = splitline(lines[1], ':', int)[1]
+        eid = splitline(lines[2], ':', str)[1]
+        com = lines[4]
+        seq = lines[7]
+        qlt = lines[10]
+        # Lines 12 -> end-1 are TLE blocks
+        tlelist = []
+        for i in range(12, len(lines)-1, 5):
+            # The \n gets stripped from the splitlines above
+            tlestr = '\n'.join(lines[i:i+5])
+            tle = TLE.parse(tlestr)
+            tlelist.append(tle)
+        return CTG(iid, eid, com, seq, qlt, tlelist)
+
+class TLE(object):
+    def __init__(self, src, off, clr):
+        self.src = src
+        self.off = off
+        self.clr = clr
+
+    @classmethod
+    def parse(klass, tlestr):
+        '''
+        Parse an AMOS TLE block and return a TLE instance
+
+        {TLE
+        src:1
+        off:1
+        clr:0,10
+        }
+
+        :param str tlestr: TLE block string
+        :return: TLE instance
+        '''
+        lines = tlestr.splitlines()
+        l = len(lines)
+        if l != 5:
+            raise ValueError('Got {0} lines instead of 5'.format(l))
+        src = splitline(lines[1], ':', int)[1]
+        off = splitline(lines[2], ':', int)[1]
+        clr = splitline(lines[3], ':', str)[1]
+        return TLE(src, off, clr)
+
+class RED(object):
+    class MismatchedSequenceError(Exception):
+        pass
+
+    def __init__(self, iid, eid, seq, qlt):
+        self.iid = iid
+        self.eid = eid
+        self.seq = seq
+        self.qlt = qlt
+
+    @classmethod
+    def parse(klass, redstr):
+        '''
+        Parse an AMOS RED block and return a RED instance
+
+        {RED
+        iid:1
+        eid:1
+        seq:
+        ATGC
+        .
+        qlt:
+        DDDD
+        .
+        }
+
+        :param str redstr: RED block string
+        :return: RED instance
+        '''
+        lines = redstr.splitlines()
+        l = len(lines)
+        if l != 10:
+            raise ValueError("Got {0} lines instead of 10".format(l))
+        iid = splitline(lines[1], ':', int)[1]
+        eid = splitline(lines[2], ':', int)[1]
+        seq = lines[4]
+        qlt = lines[7]
+        return RED(iid, eid, seq, qlt)
+
+    def set_from_seqrec(self, seqrec):
+        '''
+        Set seq, qlt based on a Bio.SeqRecord.SeqRecord
+    
+        :param Bio.SeqRecord.SeqRecord seqrec: SeqRecord to pull id, seq and qual from
+        '''
+        if str(seqrec.seq) != self.seq:
+            raise RED.MismatchedSequenceError(
+                "RED sequence and SeqRecord sequence do not match"
+            )
+        if 'phred_quality' not in seqrec._per_letter_annotations:
+            raise ValueError("Given SeqRecord is missing phred_quality")
+        self.eid = seqrec.id
+        self.qlt = seqrec._per_letter_annotations['phred_quality']
diff --git a/docs/amos.rst b/docs/amos.rst
@@ -0,0 +1,130 @@
+====
+AMOS
+====
+
+AMOS is a file format that is similar to any assembly file format such as ACE or SAM.
+It contains information about each read that is used to assemble each contig.
+
+The format is broken into different message blocks. For the Ray assembler, it 
+produces an AMOS file that is broken into 3 types of message blocks
+
+* RED
+
+    .. code-block:: none
+
+        {RED
+        iid:\d+
+        eid:\d+
+        seq:
+        [ATGC]+
+        .
+        qlt:
+        [A-Z]+
+        }
+
+    iid
+      Integer identifier
+    eid
+      Same as iid?
+    seq
+      Sequence data
+    qlt
+      Should be quality, but is only a series of D's from Ray assembler
+
+* TLE
+
+    .. code-block:: none
+
+        {TLE
+        src:\d+
+        off:\d+
+        clr:\d+,\d+
+        }
+
+    src
+      RED iid that was used
+    off
+      One would think offset, but unsure what it actually means
+    clr
+      Not sure what this is either
+
+* CTG
+
+    .. code-block:: none
+
+        {CTG
+        iid:\d+
+        eid:\w+
+        com:
+        .*$
+        .
+        seq:
+        [ATGC]+
+        .
+        qlt:
+        [A-Z]+
+        .
+        {TLE
+        ...
+        }
+        }
+
+    iid
+      integer id of contig
+    eid
+      contig name
+    com
+      Communication software that generated this contig
+    seq
+      Contig sequence data
+    qlt
+      Supposed to be contig quality data, but for Ray it only produces D's
+    TLE
+      0 or more TLE blocks that represent RED sequences that compose the contig
+
+Parsing
+-------
+
+bio_pieces contains an interface to parse a given file handle that has been opened
+on an AMOS file.
+
+To read in the AMOS file you simply do the following
+
+.. code-block:: python
+
+    from bio_pieces import amos
+    a = None
+    with open('AMOS.afg') as fh:
+        a = amos.AMOS(fh)
+
+CTG
+^^^
+
+To get information about the contigs(CTG) you can access the ``.ctgs`` attribute.
+The contigs are indexed based on their iid so to get the sequence of contig iid 1 
+you would do the following:
+
+.. code-block:: python
+
+    ctg = a.ctgs[1]
+    seq = ctg.seq
+
+To retrieve all the reads(RED) that belong to a specific contig:
+
+.. code-block:: python
+
+    reads = []
+    for tle in ctg.tlelist:
+        reads.append(a.reds[tle.src])
+
+RED
+^^^
+
+To get information about the reads(RED) you can access the ``.reds`` attribute.
+The reds are indexed based on their iid so to get the sequence of red iid 1 you
+would do the following:
+
+.. code-block:: python
+
+    red = a.reds[1]
+    seq = red.seq
diff --git a/docs/index.rst b/docs/index.rst
@@ -13,6 +13,7 @@ Contents:
 
     installation
     scripts/index
+    amos
     changelog
 
 Indices and tables