We need to get from mapped bam file to single fast5s:

Input is:
* mapped bam file.
* multifast5 of fastqs mapped to the reference

Output:
* folder with single fast5s that are mapped

What we need to do:
* get ID of all mapped fastqs
* extract them in batches
* make them single fast5s

In [43]:
import os
from joblib import Parallel, delayed
import time
from ont_fast5_api.fast5_interface import get_fast5_file
from ont_fast5_api.analysis_tools.basecall_1d import Basecall1DTools

In [41]:
def fast5s_to_fastq(dir_):
    print(dir_)
    start = time.time()
    plus = '+'
    fastq_fn = os.path.join(os.path.join(dir_), os.path.basename(dir_) + '.fastq')
    fast5s = (os.path.join(dir_ ,x) for x in os.listdir(dir_) if x.endswith('.fast5') )
    if os.path.exists(fastq_fn):
        with open(fastq_fn, 'w'):
            pass
    n  = []
    s  = []
    q  = []
    for fast5_fn in fast5s:
        with get_fast5_file(fast5_fn, mode='r') as f5:
            with Basecall1DTools(f5) as basecall:
                n1, s1, q1 = basecall.get_called_sequence('template')
                n.append(n1)
                s.append(s1)
                q.append(q1)
    
    with open(fastq_fn, mode='a+') as fastq_fh:
        for (n1, s1, q1) in zip(n, s, q):
            print('@%s' % n1, file=fastq_fh)
            print(s1, file=fastq_fh)
            print(plus, file=fastq_fh)
            print(q1, file=fastq_fh)
    string = '%s done' % fastq_fn
    stop = time.time()
    string = string + ': Done in {:.2f}'.format(stop - start)
    print(string)
    return string

### INPUTS

### *****Define directories before running on different sampes*****

In [8]:
###define input directories here 
BAMIN_DIR = '../../analyses/mapping/infected_leaves/infected_leaves_2/'
FAST5IN_DIR = '../../data/genomic_data/infected_leaves/workspace_fast5/infected_leaves_2_fast5_out/'
FAST5singleIN_DIR = '../../analyses/single_fast5s/infected_leaves/infected_leaves_2_fast5_single_fast5/'
OUT_DIR = '../../analyses/single_fast5s/infected_leaves/infected_leaves_2_mapped_single_fast5/'
n_threads = 25

### *****Define directories before running on different sampes*****

In [9]:
BAMIN_DIR = os.path.abspath(BAMIN_DIR)
FAST5IN_DIR = os.path.abspath(FAST5IN_DIR)
OUT_DIR = os.path.abspath(OUT_DIR)
FAST5singleIN_DIR = os.path.abspath(FAST5singleIN_DIR)

In [52]:
single_fast5_count = 0
dirs = []
for direcotry in (os.path.join(FAST5singleIN_DIR, x) for x in os.listdir(FAST5singleIN_DIR) if os.path.isdir(os.path.join(FAST5singleIN_DIR, x))):
    dirs.append(direcotry)
    fast5s = [os.path.join(direcotry ,x) for x in os.listdir(direcotry) if x.endswith('.fast5')]
    single_fast5_count += len(fast5s)

In [37]:
all_fastq_fn = os.path.join(FAST5singleIN_DIR, os.path.basename(FAST5singleIN_DIR)+ '.fastq')
with open(all_fastq_fn, mode='w') as all_fastq_fh:
    for dir_ in dirs[:1]:
        fn = os.path.join(os.path.join(dir_), os.path.basename(dir_) + '.fastq')
    with open(fn, mode = 'r') as fh:
        for line in fh:
            line.rstrip()
            print(line, file=fh)

FileNotFoundError: [Errno 2] No such file or directory: '/home/jamila/jamila_Storage/analyses/single_fast5s/infected_leaves/infected_leaves_2_fast5_single_fast5/0/0.fastq'

In [53]:
print(single_fast5_count)

764289


In [54]:
dirs.sort()

In [55]:
dirs

['/home/jamila/jamila_Storage/analyses/single_fast5s/infected_leaves/infected_leaves_2_fast5_single_fast5/0',
 '/home/jamila/jamila_Storage/analyses/single_fast5s/infected_leaves/infected_leaves_2_fast5_single_fast5/1',
 '/home/jamila/jamila_Storage/analyses/single_fast5s/infected_leaves/infected_leaves_2_fast5_single_fast5/10',
 '/home/jamila/jamila_Storage/analyses/single_fast5s/infected_leaves/infected_leaves_2_fast5_single_fast5/100',
 '/home/jamila/jamila_Storage/analyses/single_fast5s/infected_leaves/infected_leaves_2_fast5_single_fast5/101',
 '/home/jamila/jamila_Storage/analyses/single_fast5s/infected_leaves/infected_leaves_2_fast5_single_fast5/102',
 '/home/jamila/jamila_Storage/analyses/single_fast5s/infected_leaves/infected_leaves_2_fast5_single_fast5/103',
 '/home/jamila/jamila_Storage/analyses/single_fast5s/infected_leaves/infected_leaves_2_fast5_single_fast5/104',
 '/home/jamila/jamila_Storage/analyses/single_fast5s/infected_leaves/infected_leaves_2_fast5_single_fast5/105

In [49]:
from multiprocessing import Pool, Process

In [60]:
for i in range(len(dirs)):
    print(dirs[i])

/home/jamila/jamila_Storage/analyses/single_fast5s/infected_leaves/infected_leaves_2_fast5_single_fast5/0
/home/jamila/jamila_Storage/analyses/single_fast5s/infected_leaves/infected_leaves_2_fast5_single_fast5/1
/home/jamila/jamila_Storage/analyses/single_fast5s/infected_leaves/infected_leaves_2_fast5_single_fast5/10
/home/jamila/jamila_Storage/analyses/single_fast5s/infected_leaves/infected_leaves_2_fast5_single_fast5/100
/home/jamila/jamila_Storage/analyses/single_fast5s/infected_leaves/infected_leaves_2_fast5_single_fast5/101
/home/jamila/jamila_Storage/analyses/single_fast5s/infected_leaves/infected_leaves_2_fast5_single_fast5/102
/home/jamila/jamila_Storage/analyses/single_fast5s/infected_leaves/infected_leaves_2_fast5_single_fast5/103
/home/jamila/jamila_Storage/analyses/single_fast5s/infected_leaves/infected_leaves_2_fast5_single_fast5/104
/home/jamila/jamila_Storage/analyses/single_fast5s/infected_leaves/infected_leaves_2_fast5_single_fast5/105
/home/jamila/jamila_Storage/analy

In [62]:
for i in range(len(dirs)):
    print(dirs[i])
        #p = Process(target=fast5s_to_fastq, args=(dirs[i]))
        #p.start()
        #p.join()

/home/jamila/jamila_Storage/analyses/single_fast5s/infected_leaves/infected_leaves_2_fast5_single_fast5/0
/home/jamila/jamila_Storage/analyses/single_fast5s/infected_leaves/infected_leaves_2_fast5_single_fast5/1
/home/jamila/jamila_Storage/analyses/single_fast5s/infected_leaves/infected_leaves_2_fast5_single_fast5/10
/home/jamila/jamila_Storage/analyses/single_fast5s/infected_leaves/infected_leaves_2_fast5_single_fast5/100
/home/jamila/jamila_Storage/analyses/single_fast5s/infected_leaves/infected_leaves_2_fast5_single_fast5/101
/home/jamila/jamila_Storage/analyses/single_fast5s/infected_leaves/infected_leaves_2_fast5_single_fast5/102
/home/jamila/jamila_Storage/analyses/single_fast5s/infected_leaves/infected_leaves_2_fast5_single_fast5/103
/home/jamila/jamila_Storage/analyses/single_fast5s/infected_leaves/infected_leaves_2_fast5_single_fast5/104
/home/jamila/jamila_Storage/analyses/single_fast5s/infected_leaves/infected_leaves_2_fast5_single_fast5/105
/home/jamila/jamila_Storage/analy

In [47]:
with Pool(processes=n_threads) as pool:
    pool.map(fast5s_to_fastq, dirs)

In [None]:
all_fastq_fn = os.path.join(FAST5singleIN_DIR, os.path.basename(FAST5singleIN_DIR)+ '.fastq')

In [None]:
with open(all_fastq_fn, mode='w') as all_fastq_fh:
    for dir_ in dirs[:1]:
        fn = os.path.join(os.path.join(dir_), os.path.basename(dir_) + '.fastq')
    with open(fn, mode = 'r') as fh:
        for line in fh:
            line.rstrip()
            print(line, file=fh)

In [63]:
fastqids_fn = inbam_fn.replace('.bam', '.fastqids.txt.tmp')

In [28]:
fastq_fn = '../../data/genomic_data/infected_leaves/all_fastq/infected_leaves_2.all.fastq'
!grep '^@' {fastq_fn} | cut -d ' ' -f 1 > {fastqids_fn}

In [12]:
##catch the bam based on naming convention
inbam_fn = [os.path.join(BAMIN_DIR, x) for x in os.listdir(BAMIN_DIR) if x.endswith('sorted.bam')][0]

In [13]:
##generated the read ID list
mappedids_fn = inbam_fn.replace('.bam', '.mappedids.txt')
#!samtools  view -F 4  {inbam_fn} | cut -f 1 | sort | uniq > {mappedids_fn}

In [22]:
!samtools  view -F 4  {inbam_fn} | cut -f 1 | sort | uniq | wc -l

22879


In [None]:
### generate a temporary folder that holds the batches of fast5
TMPOUT_DIR = os.path.join(OUT_DIR, 'tmp')
if not os.path.exists(TMPOUT_DIR):
    os.makedirs(TMPOUT_DIR)

In [15]:
#read in the mapped reads ids
mapped_reads = []
with open(mappedids_fn) as fh:
    for line in fh:
        mapped_reads.append(line.rstrip())

In [30]:
mapped_reads = set(mapped_reads)

In [29]:
fastq_reads = []
with open(fastqids_fn) as fh:
    for line in fh:
        fastq_reads.append(line.rstrip().replace('@', ''))

In [32]:
fastq_reads = set(fastq_reads)

In [34]:
fastq_reads

{'0db34f18-70a2-4ce1-9bd8-fa8953c0d64a',
 '4d68df89-03d4-4ace-b9b3-5162900686e2',
 'fca222e2-a0ff-4e3f-8bbf-9a54861ac21c',
 '3d2d533c-24a9-4415-934f-6ec490521173',
 '250fc75b-6c44-4143-bfca-a57e3ac19028',
 '6db5783e-78d2-4a53-9bbb-a2543201ee9b',
 '686ebf46-69c4-4c1d-acc5-fae6e9907ca8',
 '37504b43-e43c-42b2-845f-841924c0c0ce',
 'dcddf48c-505f-445e-9c7f-75be0d7b2963',
 'f83fad03-7925-4e67-9306-04755ff2e068',
 'e5198e80-32d3-4a82-aa3e-2d36ef6ecf38',
 'd27485fa-cc0f-4d9d-b2b1-2a68d45d1b68',
 'e523a0b9-47e1-4685-a1ad-e97ada6f438a',
 '2dba3c80-1f32-4b1f-886e-62072ce915f8',
 '028571ac-f232-4cdf-aba5-e1ba0e01b639',
 '751c8827-ae22-448f-8dd2-440eb7d7ced1',
 '85e87569-f13d-4168-9b38-68f6a8744c7a',
 '7122aa9d-1da8-4c91-b060-b3b17f93d339',
 '279d3b79-40af-4d47-a525-f941cb2c9d69',
 'cec087d7-8aa4-4d4d-8d51-e2c2854b0425',
 'c4935e73-7d1f-468e-9eb9-18332221f31c',
 'ea6ff671-f0da-44e1-8437-d3dd68fba5ad',
 'a45611f8-ac92-4b08-8dc1-9b30a98abf50',
 '74c84ad1-ae37-4ade-bc03-3058b0487aac',
 '992a9798-b64a-

In [33]:
#move fast5s you want from tmp to out dir
count = 0
TMPOUT_DIR = FAST5singleIN_DIR
for directory in os.listdir(TMPOUT_DIR):
    directory = os.path.join(TMPOUT_DIR, directory)
    #check if path is directory
    if os.path.isdir(directory):
        #print(directory)
        fast5s = [fn for fn in os.listdir(directory) if fn.endswith('.fast5')]
        for fast5_file in fast5s:
            if fast5_file.rstrip('.fast5') in fastq_reads:
                count = count + 1
                #move the files by renaming absolute path
                #old_fn = os.path.join(directory, fast5_file)
                #new_fn = os.path.join(OUT_DIR, fast5_file)
                #os.replace(old_fn, new_fn)
        #count = count + len(fast5s)
        print(count)
        
    #print(os.path.isfile(x))

3244
6484
9723
12969
16216
19471
22679
25934
29161
32360
35557
38822
42069
45324
48557
51817
55065
58301
61536
64781
68042
71317
74555
77803
81056
84295
87589
90861
94068
97322
100610
103816
107024
110273
113543
116791
120037
123292
126527
129769
133008
136265
139505
142748
145967
149189
152425
155626
158881
162132
165398
168659
171868
175118
177492
180738
183986
187284
190512
193768
196996
200262
203540
206793
210021
213295
216550
219856
223104
226354
229614
232900
236110
239352
242616
245853
249055
252327
255571
258823
262042
265331
268604
271867
275105
278401
281621
284828
288101
291323
294556
297805
301101
304342
307623
310877
314089
317359
320585
323832
327109
330368
333619
336868
340130
343394
346642
349895
353135
356394
359602
362840
366107
369397
372649
375938
379188
382424
385630
388855
392124
395365
398614
401902
405167
408436
411703
414962
418201
421419
424711
427943
431172
434430
437710
440987
444260
447513
450768
454000
457249
460492
463743
466969
470235
473432
476703
4799

In [17]:
len(mapped_reads)

22879

In [None]:
###go from multies fast5 of mapped reads to single fast5s
!multi_to_single_fast5 -i {FAST5IN_DIR} -s {TMPOUT_DIR} -t 10 --recursive
!touch {OUT_DIR}/single_fast5.done

In [18]:
#move fast5s you want from tmp to out dir
count = 0
TMPOUT_DIR = FAST5singleIN_DIR
for directory in os.listdir(TMPOUT_DIR):
    directory = os.path.join(TMPOUT_DIR, directory)
    #check if path is directory
    if os.path.isdir(directory):
        print(directory)
        fast5s = [fn for fn in os.listdir(directory) if fn.endswith('.fast5')]
        for fast5_file in fast5s:
            if fast5_file.rstrip('.fast5') in mapped_reads:
                count = count + 1
                #move the files by renaming absolute path
                #old_fn = os.path.join(directory, fast5_file)
                #new_fn = os.path.join(OUT_DIR, fast5_file)
                #os.replace(old_fn, new_fn)
        #count = count + len(fast5s)
        print(count)
        
    #print(os.path.isfile(x))

/home/jamila/jamila_Storage/analyses/single_fast5s/infected_leaves/infected_leaves_2_fast5_single_fast5/113
85
/home/jamila/jamila_Storage/analyses/single_fast5s/infected_leaves/infected_leaves_2_fast5_single_fast5/137
180
/home/jamila/jamila_Storage/analyses/single_fast5s/infected_leaves/infected_leaves_2_fast5_single_fast5/146
273
/home/jamila/jamila_Storage/analyses/single_fast5s/infected_leaves/infected_leaves_2_fast5_single_fast5/188
376
/home/jamila/jamila_Storage/analyses/single_fast5s/infected_leaves/infected_leaves_2_fast5_single_fast5/20
470
/home/jamila/jamila_Storage/analyses/single_fast5s/infected_leaves/infected_leaves_2_fast5_single_fast5/155
564
/home/jamila/jamila_Storage/analyses/single_fast5s/infected_leaves/infected_leaves_2_fast5_single_fast5/6
651
/home/jamila/jamila_Storage/analyses/single_fast5s/infected_leaves/infected_leaves_2_fast5_single_fast5/36
748
/home/jamila/jamila_Storage/analyses/single_fast5s/infected_leaves/infected_leaves_2_fast5_single_fast5/127
8

10899
/home/jamila/jamila_Storage/analyses/single_fast5s/infected_leaves/infected_leaves_2_fast5_single_fast5/40
10989
/home/jamila/jamila_Storage/analyses/single_fast5s/infected_leaves/infected_leaves_2_fast5_single_fast5/121
11083
/home/jamila/jamila_Storage/analyses/single_fast5s/infected_leaves/infected_leaves_2_fast5_single_fast5/23
11174
/home/jamila/jamila_Storage/analyses/single_fast5s/infected_leaves/infected_leaves_2_fast5_single_fast5/9
11268
/home/jamila/jamila_Storage/analyses/single_fast5s/infected_leaves/infected_leaves_2_fast5_single_fast5/172
11365
/home/jamila/jamila_Storage/analyses/single_fast5s/infected_leaves/infected_leaves_2_fast5_single_fast5/152
11460
/home/jamila/jamila_Storage/analyses/single_fast5s/infected_leaves/infected_leaves_2_fast5_single_fast5/156
11559
/home/jamila/jamila_Storage/analyses/single_fast5s/infected_leaves/infected_leaves_2_fast5_single_fast5/174
11657
/home/jamila/jamila_Storage/analyses/single_fast5s/infected_leaves/infected_leaves_2_f

In [19]:
count == len(mapped_reads)

False

In [None]:
for x in os.listdir(OUT_DIR):
    if not x.endswith('.fast5'):
        print(x)

In [None]:
#remove tmp folder
!rm -r {TMPOUT_DIR}


In [None]:
len(mapped_reads)