# OP-WORKFLOW-CAGEscan-short-reads-v2.0

## Imports

In [1]:
%matplotlib inline

In [2]:
import subprocess, os, shutil
import signal
import tempfile
from collections import defaultdict
from Bio import SeqIO
from itertools import izip

## Custom functions

In [3]:
def remove_extension(x):
    """Return the part of ``x`` before its first '.'.

    Every dot-separated suffix is dropped, so 'a.fastq.gz' becomes 'a'.
    A string without any '.' is returned unchanged.
    """
    stem, _, _ = x.partition('.')
    return stem

## Parameters

In [4]:
# Executable name (or path) of each external tool used by the pipeline.
# The commands are run through a shell, so bare names are resolved via PATH.
samtools_path = 'samtools'
bwa_path = 'bwa'
tagdust2_path = 'tagdust'
syncpairs = 'syncpairs'
paired_bam_to_bed12 = 'pairedBamToBed12'
umicountFP = 'umicountFP'

In [5]:
# Reference genome FASTA, relative to the working directory. It is substituted
# for {ref_genome} in the `bwa aln` and `bwa sampe` command templates.
# NOTE(review): presumably a bwa index must already exist next to it - confirm.
ref_genome = './GRCh38.fa'

In [6]:
# Maps the tool placeholders used in the command templates ({bwa}, {tagdust}, ...)
# to the executables configured in the parameter cell.
softwares = dict(
    bwa=bwa_path,
    tagdust=tagdust2_path,
    syncpairs=syncpairs,
    samtools=samtools_path,
    pairedBamToBed12=paired_bam_to_bed12,
    umicountFP=umicountFP,
)

In [7]:
# One sub-folder of 'output' per pipeline stage. Each name doubles as the
# placeholder that the command templates use for that stage's output prefix.
output_folders = [
    # read extraction and cleaning
    'tagdust_r1',
    'unzip_r2',
    'extracted_r1',
    'extracted_r2',
    'cleaned_reads',
    'cleaned_r1',
    'cleaned_r2',
    # alignment
    'r1_sai',
    'r2_sai',
    'sampe',
    'sam_to_bam',
    'genome_mapped',
    # pair filtering and fragment output
    'properly_paired',
    'cagescan_pairs',
    'cagescan_fragments',
]

In [8]:
# Create output/<stage> for every pipeline stage.
# os.makedirs raises OSError when the target already exists, so existing
# folders are skipped; this keeps the cell safe to re-run (Restart & Run All).
for folder in output_folders:
    folder_path = os.path.join('output', folder)
    if not os.path.isdir(folder_path):
        os.makedirs(folder_path)

In [9]:
def get_args(read1, read2, ref_genome, output_folders):
    """Build the placeholder values for one read pair.

    Parameters
    ----------
    read1, read2 : str
        Paths of the two input read files; stored under 'r1_input' / 'r2_input'.
    ref_genome : str
        Path of the reference genome; stored under 'ref_genome'.
    output_folders : iterable of str
        Stage names. Each one becomes a key whose value is
        ``output/<stage>/<sample>``, where ``<sample>`` is the base name of
        ``read1`` with everything from its first '.' removed.

    Returns
    -------
    dict
        The three input entries plus one output prefix per stage. Outputs for
        both mates are named after read 1. A stage name that collides with an
        input key overrides that input entry.
    """
    sample_name = remove_extension(os.path.basename(read1))

    template_values = {
        'r1_input': read1,
        'r2_input': read2,
        'ref_genome': ref_genome,
    }

    # Added after the fixed keys so that, on a name collision, the output path wins.
    for stage in output_folders:
        template_values[stage] = os.path.join('output', stage, sample_name)

    return template_values

In [10]:
# Shell command templates, executed in list order for every read pair.
# Brace placeholders are filled with str.format: tool names come from
# `softwares`, input paths and per-stage output prefixes come from get_args().
# Each step reads files written by earlier steps, so the order matters.
cmds = [
    
    # Run tagdust on read 1 with the read architecture given by the -1/-2/-3 blocks.
    # NOTE(review): F:/S:/R: look like TagDust2 fingerprint / spacer / read segments
    # and -t8 like a thread count - confirm against the TagDust2 manual.
    '{tagdust} -t8 -o {tagdust_r1} -1 F:NNNNNNNN -2 S:TATAGGG -3 R:N {r1_input}',
    
    # Decompress read 2 into a plain .fq file.
    'gunzip -c {r2_input} > {unzip_r2}.fq',
        
    # syncpairs takes the two FASTQ files above and writes two new ones.
    # NOTE(review): presumably keeps only reads present in both mates - confirm.
    '{syncpairs} {tagdust_r1}.fq {unzip_r2}.fq {extracted_r1}.fq {extracted_r2}.fq',
    
    # Second tagdust pass on the pair. SimpleArchitecture.txt and
    # ercc_and_hg38_rRNA.fa are resolved relative to the working directory.
    # NOTE(review): presumably filters reads matching the -ref sequences - confirm.
    '{tagdust} -arch SimpleArchitecture.txt -ref ercc_and_hg38_rRNA.fa -o {cleaned_reads} {extracted_r1}.fq {extracted_r2}.fq',
    
    # Copy the _READ1 / _READ2 files from the previous step into per-mate folders.
    'cp {cleaned_reads}_READ1.fq {cleaned_r1}.fq',
    
    'cp {cleaned_reads}_READ2.fq {cleaned_r2}.fq',
    
    # Align each mate separately against the reference genome (.sai output).
    '{bwa} aln {ref_genome} {cleaned_r1}.fq > {r1_sai}.sai',
    
    '{bwa} aln {ref_genome} {cleaned_r2}.fq > {r2_sai}.sai',
    
    # Combine both .sai files with their FASTQ files into one paired-end SAM file.
    # NOTE(review): per the bwa manual, -a is the maximum insert size and -c the
    # chimeric-rate prior - confirm these values are intended.
    '{bwa} sampe -a 2000000 -c 0.00001 {ref_genome} {r1_sai}.sai {r2_sai}.sai {cleaned_r1}.fq {cleaned_r2}.fq > {sampe}.sam',
    
    # Convert the SAM file to BAM.
    '{samtools} view -bSo  {sam_to_bam}.bam {sampe}.sam',
    
    # Sort with -n; the second positional argument is an output prefix, and the
    # next step reads {genome_mapped}.bam.
    # NOTE(review): this is the legacy `samtools sort <in.bam> <out.prefix>` form;
    # newer samtools releases appear to require -o instead - confirm the installed version.
    '{samtools} sort -n {sam_to_bam}.bam {genome_mapped}',
    
    # Keep alignments with flag 0x0002 set and flag 0x0100 unset.
    # NOTE(review): in the SAM spec these are "properly paired" and "secondary
    # alignment" - confirm.
    '{samtools} view -f 0x0002 -F 0x0100 -bo {properly_paired}.bam {genome_mapped}.bam',
    
    # Convert the filtered BAM to a BED file with pairedBamToBed12.
    '{pairedBamToBed12} -i {properly_paired}.bam > {cagescan_pairs}.bed',
    
    # Final step: umicountFP reads the pairs BED and writes the fragments BED.
    '{umicountFP} -f {cagescan_pairs}.bed > {cagescan_fragments}.bed'
    
]

In [11]:
# List the files directly inside ./test_data/ (first os.walk entry only, no recursion).
# The builtin next() replaces the generator's .next() method, which exists only
# on Python 2; next() behaves the same on Python 2.6+ and Python 3.
root, folders, files = next(os.walk('./test_data/'))

# Split files into mates by the 'R1' / 'R2' substring and keep the first 10 of
# each after sorting by path.
# NOTE(review): mates are later paired purely by position in these sorted lists,
# so a missing mate or a sample name containing 'R1'/'R2' would misalign pairs - verify inputs.
reads1 = sorted(os.path.join(root, f) for f in files if 'R1' in f)[:10]
reads2 = sorted(os.path.join(root, f) for f in files if 'R2' in f)[:10]

In [12]:
def _restore_sigpipe():
    """Reset SIGPIPE to its default action in the child process.

    Python ignores SIGPIPE and child processes inherit that setting; restoring
    the default lets shell commands terminate normally on a broken pipe.
    """
    signal.signal(signal.SIGPIPE, signal.SIG_DFL)


# Run every pipeline step, in order, for each (read 1, read 2) pair.
for read1, read2 in zip(reads1, reads2):
    args = get_args(read1, read2, ref_genome, output_folders)
    args = dict(args, **softwares)

    for cmd in cmds:
        # Format once so the echoed command is exactly the one executed.
        command = cmd.format(**args)
        # Single-argument print(...) works on both Python 2 and Python 3.
        print(command)
        returncode = subprocess.call(command, preexec_fn=_restore_sigpipe, shell=True)

        # Each step consumes the previous step's output, so continuing after a
        # failure would only produce empty or misleading files for this sample.
        if returncode != 0:
            print('Command failed with exit status {0}; skipping remaining steps for {1}'.format(returncode, read1))
            break

tagdust -t8 -o output/tagdust_r1/100_S100_L001_R1_001 -1 F:NNNNNNNN -2 S:TATAGGG -3 R:N ./test_data/100_S100_L001_R1_001.fastq.gz
gunzip -c ./test_data/100_S100_L001_R2_001.fastq.gz > output/unzip_r2/100_S100_L001_R1_001.fq
syncpairs output/tagdust_r1/100_S100_L001_R1_001.fq output/unzip_r2/100_S100_L001_R1_001.fq output/extracted_r1/100_S100_L001_R1_001.fq output/extracted_r2/100_S100_L001_R1_001.fq
tagdust -arch SimpleArchitecture.txt -ref ercc_and_hg38_rRNA.fa -o output/cleaned_reads/100_S100_L001_R1_001 output/extracted_r1/100_S100_L001_R1_001.fq output/extracted_r2/100_S100_L001_R1_001.fq
cp output/cleaned_reads/100_S100_L001_R1_001_READ1.fq output/cleaned_r1/100_S100_L001_R1_001.fq
cp output/cleaned_reads/100_S100_L001_R1_001_READ2.fq output/cleaned_r2/100_S100_L001_R1_001.fq
bwa aln ./GRCh38.fa output/cleaned_r1/100_S100_L001_R1_001.fq > output/r1_sai/100_S100_L001_R1_001.sai
bwa aln ./GRCh38.fa output/cleaned_r2/100_S100_L001_R1_001.fq > output/r2_sai/100_S100_L001_R1_001.sai
b