shapemapper-txome

#!/usr/bin/env python

import datetime
# Record the start time here, ahead of the remaining imports; it is
# subtracted from the end time at the bottom of the script to report the
# total turnaround time.
start_time = datetime.datetime.now()

import sys, os

from scripts.logger import Logger
from scripts.job_wrapper import stage
from scripts.shapemapper_wrapper import fmt_shapemapper_cmds
from scripts.parse_args import parse_args
from scripts.util import timestamp, makedirs
from scripts.globals import god

# Directory containing this script (symlinks resolved); the helper programs
# under scripts/ are invoked relative to it.
THIS_DIR = os.path.dirname(os.path.realpath(__file__))

# Select the argument list handed to parse_args(): either the real command
# line, or a canned argument set when --test is given. With --test, an
# additional --sam or --bam switches the bundled inputs from FASTQ to
# SAM or BAM files.
cli_args = sys.argv[1:]
if '--test' in cli_args:
    if '--sam' in cli_args:
        test_modified = 'test_data/sam/modified.sam'
        test_untreated = 'test_data/sam/untreated.sam'
    elif '--bam' in cli_args:
        test_modified = 'test_data/bam/modified.bam'
        test_untreated = 'test_data/bam/untreated.bam'
    else:
        test_modified = 'test_data/modified_10k_fastq'
        test_untreated = 'test_data/untreated_10k_fastq'
    args = [
        '--paired',
        '--out', 'test',
        '--target',
        'test_data/16S.fa',
        'test_data/23S.fa',
        'test_data/TPP.fa',
        # the ShapeMapper options travel as one single argument
        '--shapemapper-args', '--random-primer-len 9',
        '--modified', test_modified,
        '--untreated', test_untreated,
    ]
else:
    args = cli_args

# Parse the command line (or the canned --test arguments built above).
p = parse_args(args)

# Create the output directory before anything is opened inside it, so that
# setting up the log file below does not rely on Logger creating the parent
# directory itself.
# NOTE(review): Logger's implementation is not visible from this file;
# creating the directory first is safe either way.
makedirs(p.out)

outlog_path = p.out + "/" + "kallisto-txome_log.txt"
outlog = Logger(outlog_path,
                sys.stdout)
# override stdout and stderr globally to redirect through logger
sys.stdout = outlog
sys.stderr = outlog

# Banner: a row of '#' as wide as the start message, then the run details.
s = "Started kallisto-txome at {}".format(timestamp())
print('#' * len(s))
print(s)
print("arguments = {}".format(p))
print("Will write all output to {}.".format(p.out))

# parameters used in job_wrapper.py functions
god.platform = p.platform
god.max_jobs = p.max_jobs
god.bsub_opts = "-n{} -R span[hosts=1]".format(p.nproc)

# -----------------------------------------------------------------------------
# check fasta target names for characters that don't play nicely with bowtie2
# and/or bbmerge
fasta_check_dir = p.out + "/fasta_check"
# Quote each target path so that names containing spaces or shell
# metacharacters are passed through as single arguments (the target
# filtering stage further down quotes the same paths).
cmd = (
    '{THIS_DIR}/scripts/check_fasta_names.py '
    '--target {target} '
).format(THIS_DIR=THIS_DIR,
         target=' '.join('"{}"'.format(f) for f in p.target))
stage(name="checking fasta target names",
      dir=fasta_check_dir,
      done=fasta_check_dir + "/fasta_check_done",
      cmd=cmd)

# Output folders of the pseudomapping and BAM-to-SAM conversion stages.
kallisto_dir = p.out + "/kallisto_pseudomap"
bam_to_sam_dir = p.out + "/convert_sam"

samples = ['modified','untreated']
# Per-sample alignment file locations used by the stages below. Which of the
# two tables gets filled, and with what, depends on the input file type.
sam_paths = {}
bam_paths = {}
id_file = None
if p.filetype == 'sam':
    # SAM input: the first file given for each sample is used directly
    sam_paths.update((name, p.input_files[name][0]) for name in samples)
elif p.filetype == 'bam':
    # BAM input: the first file given is the BAM; its SAM conversion will
    # be written under the conversion folder
    for name in samples:
        bam_paths[name] = p.input_files[name][0]
        sam_paths[name] = bam_to_sam_dir + "/" + name + "/converted.sam"
elif p.filetype == 'fastq':
    # FASTQ input: the BAM path lies under the pseudomapping folder and the
    # SAM path under the conversion folder
    for name in samples:
        bam_paths[name] = kallisto_dir + "/" + name + "/pseudoalignments.bam"
        sam_paths[name] = bam_to_sam_dir + "/" + name + "/pseudoalignments.sam"

if p.filetype == 'fastq':
    # -----------------------------------------------------------------------------
    # generate kallisto index
    index_dir = p.out + "/kallisto_index"
    # Quote the index path and every target path so that names containing
    # spaces or shell metacharacters reach kallisto intact; the quant
    # command below already quotes the same index path.
    cmd = 'kallisto index -i "{index_dir}/index" {target}'.format(
        index_dir=index_dir,
        target=' '.join('"{}"'.format(f) for f in p.target))
    stage(name="kallisto index building",
          dir=index_dir,
          done=index_dir + "/kallisto_index_building_done",
          cmd=cmd)

    # -----------------------------------------------------------------------------
    # run pseudomapping
    if p.paired:
        # paired input: no extra flags are passed
        extra_flags = ''
    else:
        # unpaired input: pass --single together with the fragment length
        # and sd taken from the parsed arguments
        extra_flags = (
            '--single --fragment-length {fragment_length} --sd {fragment_sd}'
        ).format(fragment_length=p.fragment_length,
                 fragment_sd=p.fragment_sd)
    cmds = []
    for sample in p.input_files.keys():
        if not p.input_files[sample]:
            # no input files were given for this sample
            continue
        cmd = ('kallisto quant -i "{index_dir}/index" --pseudobam --plaintext '
               '--threads {threads} '
               '-o "{out_dir}/{sample}" {inputs} {extra_flags}')
        cmd = cmd.format(out_dir=kallisto_dir,
                         index_dir=index_dir,
                         sample=sample,
                         inputs=' '.join('"{}"'.format(f)
                                         for f in p.input_files[sample]),
                         extra_flags=extra_flags,
                         threads=p.nproc)
        cmds.append(cmd)

    stage(name="kallisto pseudomapping to targets",
          dir=kallisto_dir,
          done=kallisto_dir + "/kallisto_pseudomapping_done",
          cmds=cmds)


if p.filetype in ['bam','fastq']:
    # -----------------------------------------------------------------------------
    # convert BAM to SAM for simpler parsing
    cmds = []
    for sample in p.input_files.keys():
        if len(p.input_files[sample]) == 0:
            # no input files were given for this sample
            continue
        # Build the command in a single formatting pass. Substituted values
        # are never re-parsed as format templates, so a '{' or '}' inside an
        # input or output path cannot raise or be misinterpreted.
        cmd = (
            'mkdir -p "{dir}/{sample}" '
            '&& '
            'samtools view -h -o "{output}" "{input}"'
        ).format(
            dir=bam_to_sam_dir,
            sample=sample,
            output=sam_paths[sample],
            input=bam_paths[sample],
        )
        cmds.append(cmd)

    stage(name="SAM conversion",
          dir=bam_to_sam_dir,
          done=bam_to_sam_dir + "/sam_conversion_done",
          cmds=cmds)

    sam_dir = bam_to_sam_dir


# -------------------------------------------------------------------------------------
# handle reads mapping to multiple targets and exclude headers and unmapped reads
processed_sam_dir = p.out + "/multimapper_processed_sam"
cmds = []
for sample in p.input_files.keys():
    if len(p.input_files[sample]) == 0:
        # no input files were given for this sample
        continue
    # Build the command in a single formatting pass. The input SAM path may
    # be user-supplied; substituting it once means a '{' or '}' in that path
    # is never re-parsed as a format field.
    cmd = (
        'mkdir -p "{dir}/{sample}" '
        '&& '
        '{THIS_DIR}/scripts/filter_multimappers.py '
        '--in "{input}" '
        '--out "{dir}/{sample}/processed.sam" '
        '--multimapper-mode {multimapper_mode}'
    ).format(
        THIS_DIR=THIS_DIR,
        dir=processed_sam_dir,
        sample=sample,
        input=sam_paths[sample],
        multimapper_mode=p.multimapper_mode,
    )
    cmds.append(cmd)

stage(name="multimapper handling",
      dir=processed_sam_dir,
      done=processed_sam_dir + "/multimapper_handling_done",
      cmds=cmds)


# -----------------------------------------------------------------------------
# sort SAM files by pseudomapped target
sorted_sam_dir = p.out + "/target_sorted_sam"
cmds = []
for sample in samples:
    if len(p.input_files[sample]) == 0:
        # no input files were given for this sample
        continue
    # per-sample output folder; also passed to sort via -T
    sample_dir = sorted_sam_dir + "/" + sample
    cmd = ' '.join([
        'mkdir -p "{}"'.format(sample_dir),
        '&&',
        'sort --stable',
        '-k 3,3',
        '-S 20%',
        '-T "{}"'.format(sample_dir),
        '-o "{}/sorted.sam"'.format(sample_dir),
        '"{}/{}/processed.sam"'.format(processed_sam_dir, sample),
    ])
    cmds.append(cmd)

stage(name="SAM sorting",
      dir=sorted_sam_dir,
      done=sorted_sam_dir + "/sam_sorting_done",
      cmds=cmds)


# --------------------------------------------------------------------------------
# identify transcripts above some potential coverage or total read count threshold
select_dir = p.out + "/select_targets"
id_file = select_dir + "/selected_ids.txt"

if p.filetype == 'fastq':
    # FASTQ input: select from the abundance.tsv file in each sample's
    # folder under the pseudomapping directory, skipping samples with no
    # input files
    abundance_args = ''.join(
        '"{}/{}/abundance.tsv" '.format(kallisto_dir, name)
        for name in p.input_files.keys()
        if len(p.input_files[name]) != 0
    )
    cmd = ' '.join([
        THIS_DIR + '/scripts/select_targets.py',
        '--in ' + abundance_args,
        '--out "{}"'.format(id_file),
        '--min-reads {}'.format(p.min_reads),
        '--min-mean-coverage {}'.format(p.min_mean_coverage),
        '--frag-len {}'.format(p.fragment_length),
    ]) + ' '

elif p.filetype in ['sam','bam']:
    # SAM/BAM input: select by read count from the target-sorted SAM files
    sorted_sam_args = ''.join(
        '"{}/{}/sorted.sam" '.format(sorted_sam_dir, name)
        for name in samples
        if len(p.input_files[name]) != 0
    )
    cmd = ' '.join([
        THIS_DIR + '/scripts/select_targets_from_sam.py',
        '--in ' + sorted_sam_args,
        '--out "{}"'.format(id_file),
        '--min-reads {}'.format(p.min_reads),
    ]) + ' '


stage(name="target coverage selection",
      dir=select_dir,
      done=select_dir + "/target_coverage_selection_done",
      cmd=cmd)


# -----------------------------------------------------------------------------
# generate fasta with selected transcripts only,
# folder hierarchy of individual fasta files and a file indexing
# those paths by target name
filter_dir = p.out + "/filter_targets"
fasta_locations_file = filter_dir + "/fasta_locations.txt"

# Assemble the command from its individual arguments; they are joined with
# single spaces and a trailing space is kept at the end.
cmd_parts = [
    THIS_DIR + '/scripts/filter_targets.py',
    '--target ' + ' '.join('"{}"'.format(path) for path in p.target),
    '--out-dir "{}/single_seqs"'.format(filter_dir),
    '--out-fasta-paths "{}"'.format(fasta_locations_file),
    '--max-files-per-folder {}'.format(p.max_files_per_folder),
]
if id_file is not None:
    # restrict the output to the ids chosen by the selection stage
    cmd_parts.append('--id-file "{}"'.format(id_file))
cmd = ' '.join(cmd_parts) + ' '

stage(name="target filtering",
      dir=filter_dir,
      done=filter_dir + "/target_filtering_done",
      cmd=cmd)


# -----------------------------------------------------------------------------
# split sam file(s) into FASTQ or pairs of FASTQ files for each selected target transcript
fastq_dir = p.out + "/fastq_by_target"
cmds = []
paired_arg = '--paired' if p.paired else '--unpaired'
for sample in samples:
    if len(p.input_files[sample]) == 0:
        # no input files were given for this sample
        continue
    # Build the command in a single formatting pass. The id file path is
    # derived from the output folder; substituting it once means a '{' or
    # '}' in that path is never re-parsed as a format field.
    cmd = (
        'mkdir -p "{dir}/{sample}" '
        '&& '
        '{THIS_DIR}/scripts/split_sam_to_fastqs.py '
        '--in "{sorted_sam_dir}/{sample}/sorted.sam" '
        '--selected-target-ids "{id_file}" '
        '--out "{dir}/{sample}" '
        '{paired_arg} '
        '--max-files-per-folder {max_files_per_folder}'
    ).format(
        THIS_DIR=THIS_DIR,
        dir=fastq_dir,
        sample=sample,
        sorted_sam_dir=sorted_sam_dir,
        id_file=id_file,
        paired_arg=paired_arg,
        max_files_per_folder=p.max_files_per_folder,
    )
    cmds.append(cmd)

stage(name="SAM to FASTQ conversion and splitting",
      dir=fastq_dir,
      done=fastq_dir + "/fastq_conversion_done",
      cmds=cmds)

# -----------------------------------------------------------------------------
# run shapemapper on each batch of reads and associated target sequence
shapemapper_dir = p.out + "/shapemapper"

# The parsed arguments may lack a shapemapper_args attribute; fall back to an
# empty string in that case.
shapemapper_args = getattr(p, 'shapemapper_args', '')

# fmt_shapemapper_cmds returns the commands to run together with the
# directories that are handed to stage() alongside them.
cmds, dirs = fmt_shapemapper_cmds(
    dir=shapemapper_dir,
    fasta_locations_file=fasta_locations_file,
    input_files=p.input_files,
    fastq_dir=fastq_dir,
    paired=p.paired,
    nproc=p.nproc,
    shapemapper_args=shapemapper_args,
    max_files_per_folder=p.max_files_per_folder,
)

stage(name="ShapeMapper",
      dirs=dirs,
      done=shapemapper_dir + "/shapemapper_done",
      cmds=cmds)

# -----------------------------------------------------------------------------
# report the total wall-clock time elapsed since start_time
end_time = datetime.datetime.now()
delta = end_time - start_time
# timedelta.seconds holds only the part of the duration below one day
# (0-86399), so fold the whole days back in; otherwise a run lasting 24 hours
# or more would under-report its hours.
elapsed_seconds = delta.days * 86400 + delta.seconds
hours, remain = divmod(elapsed_seconds, 3600)
minutes, seconds = divmod(remain, 60)
print("\nkallisto-txome completed. Total turnaround time {} hours, {} minutes, {} seconds".format(
    hours, minutes, seconds))