Skip to content

Commit

Permalink
docs and -S flag fix
Browse files Browse the repository at this point in the history
  • Loading branch information
makeartandgames committed Jul 2, 2015
1 parent 60f67a0 commit 8d56e4e
Showing 1 changed file with 21 additions and 3 deletions.
24 changes: 21 additions & 3 deletions bio_pieces/group_references.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
'''
Usage: parse_contigs <samfile> --outdir <DIR>
Usage: group_references <samfile> --outdir <DIR>
Options:
--outdir=<DIR>,-o=<DIR> output directory [Default: parse_contigs_out]
--outdir=<DIR>,-o=<DIR> output directory [Default: group_references_out]
Create separate fastq files for each reference in a samfile.
'''
Expand All @@ -20,19 +20,34 @@

sam_columns = ["QNAME", "FLAG", "RNAME", "POS", "MAPQ", "CIGAR", "RNEXT", "PNEXT", "TLEN", "SEQ", "QUAL"]
def samview_to_df(rawtext):
'''
:param str rawtext: text of .sam file or output of `samtools view`
:return: pandas.DataFrame
'''
samtext = '\n'.join( fixline(row) for row in str(rawtext).split('\n') )
as_bytes = BytesIO(samtext)
#Write test to ensure handles varaibles #columns
return pd.read_csv(as_bytes, names=sam_columns, usecols=sam_columns, delimiter='\t', header=None, squeeze=True)

def fixline(row):
'''
Fix lines of `samtools view` which are unmapped so they can be easily parsed by pandas.
:row str row: a line from `samtools view`
:return: fixed line
'''
newrow = []
cols = row.split('\t')[:len(sam_columns)]
if len(cols) > 1 and cols[2] == '*':
cols[2] = 'unmapped'
return '\t'.join(cols)

def get_seqs_by_ctg(outdir, rawtext):
'''
Splits a given sam view into seperate fastq files, grouped by references (the first column RNAME), and saves to outdir.
:param str outdir: directory result fastqs are saved to
:param str rawtext: output of `samtools view`
:return: None
'''
sam_df = samview_to_df(rawtext)
contig_groups = sam_df.groupby('RNAME')
fastq = "@{0}\n{1}\n+\n{2}".format
Expand All @@ -43,6 +58,9 @@ def get_seqs_by_ctg(outdir, rawtext):
out.write('\n')

def main():
'''
Call `samtools view` on the input file and split into fastqs by RNAME column.
'''
raw_args = docopt(__doc__)
scheme = Schema({
'<samfile>' : str,
Expand All @@ -52,7 +70,7 @@ def main():
if not os.path.exists(outdir):
os.mkdir(outdir)
infile = parsed_args['<samfile>']
view = open(infile).read() if infile.endswith('.sam') else str(sh.samtools('view', infile))
view = str(sh.samtools('view', infile, S=True)) if infile.endswith('.sam') else str(sh.samtools('view', infile))
get_seqs_by_ctg(outdir, view)
return 0

Expand Down

0 comments on commit 8d56e4e

Please sign in to comment.