We need to get from a mapped BAM file to single fast5 files:

Input is:
* mapped bam file.
* multi-fast5 files of the reads (fastqs) that were mapped to the reference

Output:
* folder with single fast5s that are mapped

What we need to do:
* get the IDs of all mapped reads
* extract them in batches
* make them single fast5s

In [None]:
pip install ipyparallel

In [4]:
import os
from joblib import Parallel, delayed


In [None]:
def fast5s_to_fastq(dir_):
    """Write the basecalled reads of all fast5 files in ``dir_`` to one FASTQ file.

    The output file is ``<dir_>/<basename of dir_>.fastq``; an existing file
    is overwritten. Records are streamed to disk one read at a time.

    Args:
        dir_: Directory that directly contains the ``.fast5`` files.

    Returns:
        A status string naming the FASTQ file and the elapsed seconds.
    """
    # Imported locally because no visible notebook cell imports ``time``.
    import time

    print(dir_)
    start = time.time()
    fastq_fn = os.path.join(dir_, os.path.basename(dir_) + '.fastq')
    fast5s = (os.path.join(dir_, x) for x in os.listdir(dir_) if x.endswith('.fast5'))
    # Mode 'w' creates the file or truncates an existing one, which replaces
    # the former "truncate if it exists, then append" sequence.
    with open(fastq_fn, mode='w') as fastq_fh:
        for fast5_fn in fast5s:
            with get_fast5_file(fast5_fn, mode='r') as f5:
                with Basecall1DTools(f5) as basecall:
                    n1, s1, q1 = basecall.get_called_sequence('template')
                    # FASTQ header lines start with '@', exactly as the later
                    # definition of this function in this notebook writes them.
                    print('@%s' % n1, file=fastq_fh)
                    print(s1, file=fastq_fh)
                    print('+', file=fastq_fh)
                    print(q1, file=fastq_fh)
    stop = time.time()
    string = '%s done' % fastq_fn + ': Done in {:.2f}'.format(stop - start)
    print(string)
    return string

In [2]:
def fast5s_to_fastq(dir_):
    """Collect the basecalled reads of all fast5 files in ``dir_`` and write one FASTQ file.

    All reads are extracted into memory first and then written in a single
    pass to ``<dir_>/<basename of dir_>.fastq``; an existing file is
    overwritten.

    Args:
        dir_: Directory that directly contains the ``.fast5`` files.

    Returns:
        A status string naming the FASTQ file and the elapsed seconds.
    """
    # Imported locally because no visible notebook cell imports ``time``.
    import time

    print(dir_)
    start = time.time()
    fastq_fn = os.path.join(dir_, os.path.basename(dir_) + '.fastq')
    fast5s = (os.path.join(dir_, x) for x in os.listdir(dir_) if x.endswith('.fast5'))

    # One (name, sequence, quality) tuple per read instead of three parallel lists.
    records = []
    for fast5_fn in fast5s:
        with get_fast5_file(fast5_fn, mode='r') as f5:
            with Basecall1DTools(f5) as basecall:
                records.append(basecall.get_called_sequence('template'))

    # Mode 'w' creates the file or truncates an existing one, which replaces
    # the former "truncate if it exists, then append" sequence.
    with open(fastq_fn, mode='w') as fastq_fh:
        for n1, s1, q1 in records:
            print('@%s' % n1, file=fastq_fh)
            print(s1, file=fastq_fh)
            print('+', file=fastq_fh)
            print(q1, file=fastq_fh)
    stop = time.time()
    string = '%s done' % fastq_fn + ': Done in {:.2f}'.format(stop - start)
    print(string)
    return string

In [6]:
!pwd

/home/jamila/jamila_Storage/scripts/multi_to_single_fast5_&_RNA_expression_mapping


### INPUTS

### *****Define directories before running on different samples*****

In [None]:
### Define the input/output directories and the worker count here.
# Directory searched for the mapped BAM file (a file name ending in 'sorted.bam').
BAMIN_DIR = '../../analyses/mapping/infected_leaves/infected_leaves_2/'
# Input directory handed to multi_to_single_fast5 (-i).
FAST5IN_DIR = '../../data/genomic_data/infected_leaves/workspace_fast5/infected_leaves_2_fast5_out/'
# Directory whose sub-directories are scanned for .fast5 files and converted to FASTQ.
FAST5singleIN_DIR = '../../analyses/single_fast5s/infected_leaves/infected_leaves_2_fast5_single_fast5/'
# Destination of the fast5 files of mapped reads; a 'tmp' sub-directory is created inside it.
OUT_DIR = '../../analyses/single_fast5s/infected_leaves/infected_leaves_2_mapped_single_fast5/'
# Worker count used for the multiprocessing Pool and the joblib Parallel runs.
n_threads = 25

### *****Define directories before running on different samples*****

In [None]:
# Resolve every configured directory to an absolute path.
BAMIN_DIR, FAST5IN_DIR, OUT_DIR, FAST5singleIN_DIR = (
    os.path.abspath(path)
    for path in (BAMIN_DIR, FAST5IN_DIR, OUT_DIR, FAST5singleIN_DIR)
)

In [None]:
# Collect every sub-directory of FAST5singleIN_DIR and tally the .fast5 files inside them.
dirs = []
single_fast5_count = 0
for entry in os.listdir(FAST5singleIN_DIR):
    subdir = os.path.join(FAST5singleIN_DIR, entry)
    if not os.path.isdir(subdir):
        continue
    dirs.append(subdir)
    single_fast5_count += sum(1 for name in os.listdir(subdir) if name.endswith('.fast5'))

In [None]:
print(single_fast5_count)

In [None]:
%%time
print(single_fast5_count)

In [None]:
%%timeit -n10
fast5s_to_fastq(dirs[100])

In [None]:
len(dirs)

In [None]:
from multiprocessing import Pool

In [None]:
# Run fast5s_to_fastq on every directory in `dirs` using a pool of n_threads
# worker processes; the returned status strings are discarded.
with Pool(processes=n_threads) as pool:
    pool.map(fast5s_to_fastq, dirs)

In [None]:
%%time
# Same conversion via joblib: n_threads jobs, with prefer='threads' passed to
# Parallel; each returned status string is printed.
for  x in (Parallel(n_jobs=n_threads, prefer='threads')(delayed(fast5s_to_fastq)(x) for x in dirs)):
    print(x)

In [None]:
# Path of the combined FASTQ: <FAST5singleIN_DIR>/<basename of FAST5singleIN_DIR>.fastq
all_fastq_fn = os.path.join(FAST5singleIN_DIR, os.path.basename(FAST5singleIN_DIR)+ '.fastq')

In [None]:
# Concatenate the per-directory FASTQ files into the combined FASTQ file.
# NOTE(review): only the first directory (dirs[:1]) is merged, as in the
# original cell; presumably a trial run. Iterate over `dirs` to merge everything.
with open(all_fastq_fn, mode='w') as all_fastq_fh:
    for dir_ in dirs[:1]:
        fn = os.path.join(dir_, os.path.basename(dir_) + '.fastq')
        # The per-directory file is opened inside the loop, and its lines go
        # to the combined handle rather than back to the read-only input handle.
        with open(fn, mode='r') as fh:
            for line in fh:
                # print() re-adds the newline removed by rstrip().
                print(line.rstrip(), file=all_fastq_fh)

In [None]:
## Pick the BAM file by naming convention (file name ends in 'sorted.bam').
sorted_bams = sorted(x for x in os.listdir(BAMIN_DIR) if x.endswith('sorted.bam'))
if not sorted_bams:
    # Fail with a clear message instead of an IndexError on an empty list.
    raise FileNotFoundError("No file ending in 'sorted.bam' found in %s" % BAMIN_DIR)
# Sorting makes the choice reproducible when several files match,
# because os.listdir returns entries in arbitrary order.
inbam_fn = os.path.join(BAMIN_DIR, sorted_bams[0])

In [None]:
inbam_fn

In [None]:
## File name of the read ID list, derived from the BAM file name.
# Only the trailing extension is swapped; str.replace would also rewrite a
# '.bam' that occurs elsewhere in the path (e.g. in a directory name).
mappedids_fn = os.path.splitext(inbam_fn)[0] + '.mappedids.txt'
# Command that generates the list (kept disabled, as in the original cell):
#!samtools  view -F 4  {inbam_fn} | cut -f 1 | sort | uniq > {mappedids_fn}

In [None]:
!head {mappedids_fn}

In [None]:
### Create the temporary folder (OUT_DIR/tmp) that holds the batches of fast5.
TMPOUT_DIR = os.path.join(OUT_DIR, 'tmp')
# exist_ok=True replaces the separate existence check and makes re-running the cell safe.
os.makedirs(TMPOUT_DIR, exist_ok=True)

In [None]:
# Load the mapped read IDs: one entry per line, trailing whitespace removed.
with open(mappedids_fn) as ids_fh:
    mapped_reads = [raw.rstrip() for raw in ids_fh]

In [None]:
mapped_reads = set(mapped_reads)

In [None]:
### go from multi-read fast5s of mapped reads to single-read fast5s
!multi_to_single_fast5 -i {FAST5IN_DIR} -s {TMPOUT_DIR} -t 10 --recursive
!touch {OUT_DIR}/single_fast5.done

In [None]:
# Move the fast5 files whose read ID is in mapped_reads from the tmp dir to OUT_DIR.
suffix = '.fast5'
count = 0
for directory in os.listdir(TMPOUT_DIR):
    directory = os.path.join(TMPOUT_DIR, directory)
    # Skip anything in the tmp dir that is not a directory.
    if not os.path.isdir(directory):
        continue
    print(directory)
    for fast5_file in os.listdir(directory):
        if not fast5_file.endswith(suffix):
            continue
        # Slice the suffix off. str.rstrip('.fast5') strips a *set of
        # characters*, so it would also eat trailing 'f', 'a', 's', 't', '5'
        # or '.' characters of the read ID and the lookup below would miss.
        read_id = fast5_file[:-len(suffix)]
        if read_id in mapped_reads:
            count += 1
            # Move the file by renaming it to its new absolute path.
            os.replace(os.path.join(directory, fast5_file),
                       os.path.join(OUT_DIR, fast5_file))
    # Running total of moved files after each sub-directory.
    print(count)

In [None]:
count == len(mapped_reads)

In [None]:
# Print every entry of OUT_DIR that is not a .fast5 file.
for entry in os.listdir(OUT_DIR):
    if entry.endswith('.fast5'):
        continue
    print(entry)

In [None]:
#remove tmp folder
!rm -r {TMPOUT_DIR}


In [None]:
len(mapped_reads)