-
Notifications
You must be signed in to change notification settings - Fork 1
/
verify.py
749 lines (642 loc) · 28.5 KB
/
verify.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
import collections
import copy
import csv
import os
import random
import subprocess
import sys, re, statistics, itertools as its, multiprocessing as mp, numpy as np, scipy.sparse as sp, \
matplotlib.pyplot as pit
import argparse, math
import editdistance
import matplotlib.pyplot as plt
import scipy.interpolate
import build_matrix as bm
import identify_strain
import strain_finder as st_find
import identify_strain as idv
def new_verify_misp(ref_file, support_matrix_info, sub_read_file, new_strain_file, candidate_sam,stage=0):
"""
:param ref_file: original ref fa
:param support_matrix_info: other SRR sam
:param sub_read_file: subbed_read
:param new_strain_file: generated strain fa
:param candidate_sam: sub_read_candidates
:return: rejected reads, updated misp_conflict, no_candidate to fill
if multiple reads contain same misp,the misp will be accpeted.
if one read contains rejected misp, the read will be rejected, removed from misp_reads
misp with no read support will also be removed from misPs, misp_reads and misp_source
"""
ref = ""
with open(new_strain_file, "r") as f:
for line in f:
if line[0] != ">":
temp = line.replace(" ", "").replace("\n", "").replace("\d", "").upper()
ref += re.sub('\d', "", temp)
unvarified_ref = ref
ori_ref = ""
with open(ref_file, "r") as f:
for line in f:
if line[0] != ">":
temp = line.replace(" ", "").replace("\n", "").replace("\d", "").upper()
ori_ref += re.sub('\d', "", temp)
strain = int(re.search('[0-9]+', os.path.basename(sub_read_file))[0])
test_misp = {}
subbed_read = []
with open(sub_read_file, "r") as subf:
for line in subf:
fields = line.strip().split(" ")
subbed_read.append(fields)
original_subbed_read = copy.deepcopy(subbed_read)
misPs, misPs_source, misP_reads,new_subbed_read = idv.get_misp(ori_ref, subbed_read)
subbed_read = new_subbed_read
total_misps = sum([len(x) for k, x in misPs.items()])
print(total_misps, "misps")
# print(sorted(misPs.items(),key=lambda x:x[0])[:10])
read_freq = {}
candidate_read = []
with open(candidate_sam, "r") as canf:
for line in canf:
sline = line.strip().split(" ")
candidate_read.append(sline)
read_freq.update({sline[st_find.read_field]: int(sline[st_find.freq_field])})
candidate_read = idv.fix_s_pos(candidate_read)
#initial_matrix_info = bm.matrix_from_readlist(bm.read_sam(support_matrix), 0, marked_id=set({}), target="raw")
#raw_matrix_info = bm.build_insertion(initial_matrix_info, 0)
# not using insertion now,later change to raw_matrix_info.real_narrowed_matrix
raw_matrix_info = support_matrix_info
matrix = support_matrix_info.narrowed_matrix
#raw_matrix_info.narrowed_read = fix_s_pos(raw_matrix_info.narrowed_read)
found_segments = set({})
rejected_read = []
# accepted_misps = set({})
accepted_misps = {}
rejected_misps = {}
def verify_misp(ref_file, support_matrix_info, sub_read_file, new_strain_file, candidate_sam,stage=0):
"""
:param ref_file: original ref fa
:param support_matrix_info: other SRR sam
:param sub_read_file: subbed_read
:param new_strain_file: generated strain fa
:param candidate_sam: sub_read_candidates
:return: rejected reads, updated misp_conflict, no_candidate to fill
if multiple reads contain same misp,the misp will be accpeted.
if one read contains rejected misp, the read will be rejected, removed from misp_reads
misp with no read support will also be removed from misPs, misp_reads and misp_source
"""
ref = ""
with open(new_strain_file, "r") as f:
for line in f:
if line[0] != ">":
temp = line.replace(" ", "").replace("\n", "").replace("\d", "").upper()
ref += re.sub('\d', "", temp)
unvarified_ref = ref
ori_ref = ""
with open(ref_file, "r") as f:
for line in f:
if line[0] != ">":
temp = line.replace(" ", "").replace("\n", "").replace("\d", "").upper()
ori_ref += re.sub('\d', "", temp)
strain = int(re.search('[0-9]+', os.path.basename(sub_read_file))[0])
test_misp = {}
subbed_read = []
with open(sub_read_file, "r") as subf:
for line in subf:
fields = line.strip().split(" ")
subbed_read.append(fields)
original_subbed_read = copy.deepcopy(subbed_read)
misPs, misPs_source, misP_reads,new_subbed_read = idv.get_misp(ori_ref, subbed_read)
total_misps = sum([len(x) for k, x in misPs.items()])
print(total_misps, "misps")
# print(sorted(misPs.items(),key=lambda x:x[0])[:10])
read_freq = {}
candidate_read = []
with open(candidate_sam, "r") as canf:
for line in canf:
sline = line.strip().split(" ")
candidate_read.append(sline)
read_freq.update({sline[st_find.read_field]: int(sline[st_find.freq_field])})
candidate_read = idv.fix_s_pos(candidate_read)
inserted_pos = {}
i = 0
while (i < len(new_subbed_read)):
temp_read = new_subbed_read[i]
new_subbed_read,candidate_read,covered_pos,inserted_pos = identify_strain.fix_insertion_pos(new_subbed_read[i],new_subbed_read,candidate_read,{},inserted_pos)
i += 1
misp_fixed_subbed_read = []
fix_to_ori_mPos = {}
i = 0
while (i < len(new_subbed_read)):
temp_read = new_subbed_read[i]
misp_fixed_read = copy.deepcopy(temp_read[:st_find.misp_field])
misp_fixed_read.append([])
for misp in temp_read[st_find.misp_field]:
pos = list(misp.keys())[0]
ori_pos = pos
change = misp[pos]
for insPos in inserted_pos.keys():
if pos >= insPos:
pos += inserted_pos[insPos]
misp_fixed_read[st_find.misp_field].append({pos:change})
fix_to_ori_mPos.update({pos:ori_pos})
print(misp_fixed_read)
i += 1
misp_fixed_subbed_read.append(misp_fixed_read)
subbed_read = misp_fixed_subbed_read
# fix insertion pos of subbed read
#initial_matrix_info = bm.matrix_from_readlist(bm.read_sam(support_matrix), 0, marked_id=set({}), target="raw")
#raw_matrix_info = bm.build_insertion(initial_matrix_info, 0)
# not using insertion now,later change to raw_matrix_info.real_narrowed_matrix
raw_matrix_info = support_matrix_info
matrix = support_matrix_info.narrowed_matrix
#raw_matrix_info.narrowed_read = fix_s_pos(raw_matrix_info.narrowed_read)
found_segments = set({})
rejected_read = []
# accepted_misps = set({})
accepted_misps = {}
rejected_misps = {}
for ir,read in enumerate(subbed_read):
#if read_freq[read[st_find.read_field]] > 1:
# rejected = False
#else:
# rejected = True
rejected = False
tmp_accepted_misps = {}
for misp in read[st_find.misp_field]:
pos = list(misp.keys())[0]
change = misp[pos][0]
if pos in accepted_misps.keys() :
accepted_misps.update({pos:(accepted_misps[pos][0],accepted_misps[pos][1]+read_freq[read[st_find.read_field]],accepted_misps[pos][2])})
print(pos,accepted_misps[pos],"already accpted, add frequency, continue")
continue
if pos in rejected_misps.keys():
if read_freq[read[st_find.read_field]] == 1 and change in set(rejected_misps[pos]):
rejected_read.append(read)
print(read,pos,change,"already rejected, continue",rejected_misps[pos])
continue
if pos > matrix.shape[1]:
print( misp,"out of range")
print(matrix.shape)
exit()
tmp = np.squeeze(matrix.getcol(pos).toarray())
tmp_count = np.bincount(tmp)[1:]
changed_base = change.split("|")[1]
support_index = np.where(tmp == st_find.dict_tonu[changed_base])
support_count = support_index[0].shape[0]
visited_sindex = set({})
if support_count > 0:
for sp in support_index[0]:
temp_read = raw_matrix_info.narrowed_read[sp]
read_index = int(temp_read[st_find.index_field]) - 1
curr = pos
read_end = read_index + len(temp_read[st_find.read_field])
reached = False
l_reached = False
l_window = 0
r_reached = False
r_window = 0
while curr - 1 >= read_index:
try:
if ref[curr - 1] == temp_read[st_find.read_field][curr - 1 - read_index]:
l_window += 1
else:
l_reached = True
break
except:
print(curr, temp_read, len(ref))
raise "index error"
curr -= 1
curr_l_line = sp - 1
if curr_l_line >= 0 and curr - 1 <= raw_matrix_info.narrowed_read[curr_l_line][
st_find.index_field] - 1 + len(
raw_matrix_info.narrowed_read[curr_l_line][st_find.read_field]):
curr_l_read = raw_matrix_info.narrowed_read[curr_l_line]
# print(curr_l_read)
else:
l_reached = True
# while not l_reached and curr_l_line >= 0:
for curr_l_line in range(sp - 1, 0, -1):
if l_reached:
break
if curr_l_line in visited_sindex:
continue
else:
visited_sindex.add(curr_l_line)
curr_l_read = raw_matrix_info.narrowed_read[curr_l_line]
if curr - 1 < int(curr_l_read[st_find.index_field]) - 1 or curr - 1 >= int(
curr_l_read[st_find.index_field]) - 1 + len(curr_l_read[st_find.read_field]):
if curr_l_line - 1 < 0:
l_reached = True
break
if curr - 1 > int(curr_l_read[st_find.index_field]) - 1 + len(
curr_l_read[st_find.read_field]):
l_reached = True
break
# if current read does not support the current misp, break
if int(curr_l_read[st_find.index_field]) - 1 <= pos <= int(
curr_l_read[st_find.index_field]) - 1 + len(curr_l_read[st_find.read_field]) - 1 and \
curr_l_read[st_find.read_field][
pos - (int(curr_l_read[st_find.index_field]) - 1)] != change.split("|")[1]:
continue
continue
try:
if ref[curr - 1] == curr_l_read[st_find.read_field][
curr - 1 - (int(curr_l_read[st_find.index_field]) - 1)]:
l_window += 1
else:
l_reached = True
break
except:
print(curr - 1, curr - 1 - (int(curr_l_read[st_find.index_field]) - 1), curr_l_line,
curr_l_read, raw_matrix_info.narrowed_read[curr_l_line])
raise "index errror"
curr -= 1
l_seg = ref[curr:pos]
curr = pos
while curr + 1 <= read_index + len(temp_read[st_find.read_field]) - 1:
if ref[curr + 1] == temp_read[st_find.read_field][curr + 1 - read_index]:
r_window += 1
else:
r_reached = False
break
curr += 1
curr_r_line = sp + 1
if curr_r_line < raw_matrix_info.narrowed_matrix.shape[0] and \
raw_matrix_info.narrowed_read[curr_r_line][st_find.index_field] - 1 <= curr + 1:
curr_r_read = raw_matrix_info.narrowed_read[curr_r_line]
else:
r_reached = True
# while not r_reached and curr_r_line <= raw_matrix_info.narrowed_matrix.shape[0]:
for curr_r_line in range(sp + 1, raw_matrix_info.narrowed_matrix.shape[0]):
if r_reached:
break
if curr_r_line in visited_sindex:
continue
else:
visited_sindex.add(curr_r_line)
curr_r_read = raw_matrix_info.narrowed_read[curr_r_line]
if curr + 1 > int(curr_r_read[st_find.index_field]) - 1 + len(
curr_r_read[st_find.read_field]) - 1 or \
curr + 1 < int(curr_r_read[st_find.index_field]) - 1:
if curr_r_line + 1 >= raw_matrix_info.narrowed_matrix.shape[0]:
r_reached = True
break
if curr + 1 < int(curr_r_read[st_find.index_field]) - 1:
r_reached = True
break
# if current read does not support the current misp, break
if int(curr_r_read[st_find.index_field]) - 1 <= pos <= int(
curr_r_read[st_find.index_field]) - 1 + len(curr_r_read[st_find.read_field]) - 1 and \
curr_r_read[st_find.read_field][
pos - (int(curr_r_read[st_find.index_field]) - 1)] != changed_base:
continue
# print(curr_r_read)
continue
if ref[curr + 1] == curr_r_read[st_find.read_field][
curr + 1 - (int(curr_r_read[st_find.index_field]) - 1)]:
r_window += 1
else:
r_reached = True
break
curr += 1
seg = l_seg + ref[pos:curr + 1]
window = l_window + r_window + 1
if seg not in found_segments:
found_segments.add(seg)
print("target misp", pos, read_freq.get(read[st_find.read_field]))
print("seg_length", len(seg))
'''if len(seg)<50 and read_freq.get(read[st_find.read_field]) == 1 :#and len(misP_reads[pos][changed_base])==1:
rejected_read.append(read)
#if misP_reads[pos][changed_base][0][st_find.read_field] == read[st_find.read_field]:
# misP_reads[pos].pop(changed_base)
# misPs[pos].remove(change)
rejected = True
if pos not in rejected_misps.keys():
rejected_misps.update({pos:[change]})
else:
rejected_misps.get(pos).append(change)
break'''
if window < 50 and read_freq[read[st_find.read_field]]==1:
rejected = True
break
#if not rejected:
if rejected: # and len(misP_reads[pos][changed_base])==1:
rejected_read.append(read)
# if misP_reads[pos][changed_base][0][st_find.read_field] == read[st_find.read_field]:
# misP_reads[pos].pop(changed_base)
# misPs[pos].remove(change)
rejected = True
if pos not in rejected_misps.keys():
rejected_misps.update({pos: [change]})
else:
rejected_misps.get(pos).append(change)
break
else:
tmp_accepted_misps.update({pos: (change, read_freq[read[st_find.read_field]], window)})
else:
if read_freq.get(read[st_find.read_field]) == 1:# and len(misP_reads[pos][changed_base]) < 2:
#rejected_read.append(read)
rejected_read.append(original_subbed_read[ir])
rejected = True
#if misP_reads[pos][changed_base][0][st_find.read_field] == read[st_find.read_field]:
# misP_reads[pos].pop(changed_base)
# misPs[pos].remove(change)
if pos not in rejected_misps.keys():
rejected_misps.update({pos: [change]})
else:
rejected_misps.get(pos).append(change)
break
else:
tmp_accepted_misps.update({pos: (change, read_freq[read[st_find.read_field]], 0)})
print("accept?",read,tmp_accepted_misps,len(tmp_accepted_misps),len(read[st_find.misp_field]))
if len(tmp_accepted_misps) == len(read[st_find.misp_field]):
for kpos in tmp_accepted_misps.keys():
if kpos not in accepted_misps:
accepted_misps.update({kpos:tmp_accepted_misps[kpos]})
restored_accepted_misp = {}
for aMisp in accepted_misps:
assert fix_to_ori_mPos[aMisp] in misPs.keys()
restored_accepted_misp.update({fix_to_ori_mPos[aMisp]:accepted_misps[aMisp]})
fixed_accepted_misp = accepted_misps
accepted_misps = restored_accepted_misp
if len(rejected_read) > 0:
print(len(rejected_read), "reads rejected")
rejected_set = set([x[st_find.read_field] for x in rejected_read])
d_info = {}
'''if rejected read has more than 1 misp, check if all misp should be reversed'''
for ir, read in enumerate(new_subbed_read):
start = int(read[st_find.index_field]) - 1
cigar = read[st_find.cigar_field]
#if re.search('^[0-9]+S', cigar) is not None:
# assert start != int(original_subbed_read[ir][st_find.index_field]) - 1
if read[st_find.read_field] in rejected_set:
#print("revert", read)
for tmp_misp in read[st_find.misp_field]:
tmp_misp_pos = list(tmp_misp.keys())[0]
if tmp_misp_pos not in accepted_misps:
#assert ref[tmp_misp_pos] != ori_ref[tmp_misp_pos]
ref = ref[:tmp_misp_pos] + ori_ref[tmp_misp_pos] + ref[tmp_misp_pos+1:]
idv.write_seq(ref,"verified_"+new_strain_file)
#ref = ref[:start] + ori_ref[start:start + len(read[st_find.read_field])] + ref[start + len(read[st_find.read_field]):]
# ref[start:start+len(read[3])] = ori_ref[start:start+len(read[3])]
# print(pos,change,support_count,tmp_count)
# remove rejected reads from subbed_reads and file
verified_subbed_read = []
for read in original_subbed_read:
if read[st_find.read_field] not in rejected_set:
verified_subbed_read.append(read)
print(len(subbed_read), "reads originally in ", os.path.basename(sub_read_file))
subbed_read = verified_subbed_read
print(len(rejected_set), "reads reverted, removed from", os.path.basename(sub_read_file), "now ", len(subbed_read))
subbed_read_set = set([x[st_find.read_field] for x in subbed_read])
subprocess.run("rm -r batch_*",stdout=subprocess.PIPE, stderr=subprocess.PIPE, universal_newlines=True, shell=True)
#new_misp_conflict,no_fill = curr_gap_reads(ref, strain, subbed_read_set, candidate_read,
# raw_matrix_info.narrowed_matrix, read_freq,misp_conflict)
#misp_conflict = new_misp_conflict
'''after removing rejected reads, go through accepted_misp again
writing restored index to rejected and subbed_read file'''
if stage > 0:
idv.write_sam(rejected_read, "stage_"+str(stage)+"_rejected_" + os.path.basename(sub_read_file), "w+", False)
idv.write_sam(verified_subbed_read, "stage_" + str(stage) + "_verified_"+sub_read_file, restore_pos=False)
idv.write_seq(ref,"stage_"+str(stage)+"_verified_"+new_strain_file)
idv.write_seq(ref,new_strain_file)
idv.write_sam(rejected_read, "rejected_" + os.path.basename(sub_read_file), "a+",True)
idv.write_sam(subbed_read, sub_read_file, restore_pos=True)
# if reach_end:
# update misp total freq
else:
print("all reads are already verified",len(rejected_read),"reads rejected")
if ref != unvarified_ref:
idv.write_seq(ref,"error_ref_"+new_strain_file)
'''
with open("misp_" + os.path.basename(sub_read_file), "w+") as mf:
for misp_pos in sorted(list(accepted_misps.keys())):
val = accepted_misps[misp_pos]
bases = val[0].split("|")
print(str(misp_pos+1), "&", bases[0], "&", bases[1], "&", val[1], "&", val[2], "\\\\\n\hline")
mf.write(str(misp_pos+1) + " & " + str(bases[0]) + " & " + bases[1] + "&" + str(val[1]) + " & " + str(
val[2]) + "\\\\\n\hline\n")'''
# return 0
return subbed_read,accepted_misps
print()
cvg_list = []
for i in range(raw_matrix_info.real_narrowed_matrix.shape[1]):
tmp = np.squeeze(raw_matrix_info.real_narrowed_matrix.getcol(i).toarray())
tmp_count = np.bincount(tmp)[1:]
cvg_list.append(sum(tmp_count))
with open("cvg.txt", "w+") as near_rncf:
for i, v in enumerate(cvg_list):
near_rncf.write(str(i) + ": " + str(v) + ", ")
# facilitate verify_misp to stop at full verified reads
def fac_verify_misp(ref_file, support_samfile, sub_read_file, new_strain_file, candidate_sam):
strain = int(re.search('[0-9]+', os.path.basename(sub_read_file))[0])
with open("rejected_" + os.path.basename(sub_read_file), "w+") as rejf:
rejf.write("")
initial_matrix_info = bm.matrix_from_readlist(bm.read_sam(support_samfile), 0, marked_id=set({}), target="raw")
matrix_info = initial_matrix_info
#verified_read, accpeted_misps = verify_misp(ref_file, matrix_info, sub_read_file, new_strain_file, candidate_sam)
initial_sub_read = bm.read_sam(sub_read_file)
tmp_rejected = []
stage = 1
tmp_verified_read, tmp_accepted_misps = verify_misp(ref_file, matrix_info, sub_read_file, new_strain_file, candidate_sam,stage)
misp_conflict = {}
verified_misp,verified_misp_source,verified_misp_read, tmp_reads = idv.get_misp(idv.get_ref_seq(ref_file), tmp_verified_read,
fix_pos=False)
for k in verified_misp.keys():
misp_conflict.update({k:[]})
strain = int(re.search('[0-9]+', os.path.basename(sub_read_file))[0])
read_freq = {}
candidate_read = []
with open(candidate_sam, "r") as canf:
for line in canf:
sline = line.strip().split(" ")
candidate_read.append(sline)
read_freq.update({sline[st_find.read_field]: int(sline[st_find.freq_field])})
fixed_candidate_read = idv.fix_s_pos(candidate_read)
for ir,read in enumerate(candidate_read):
if re.search('^[0-9]+S',read[st_find.cigar_field]) is not None:
if int(read[st_find.index_field]) == int(fixed_candidate_read[ir][st_find.index_field]):
print(read,candidate_read[ir])
raise "didin't fix s begin"
candidate_read = fixed_candidate_read
one_iter = True
if not one_iter and len(tmp_verified_read) < len(initial_sub_read):
print("run curr_gap_read")
exit()
verified_read = []
tmp_subbed_read, misp_conflict = curr_gap_reads(idv.get_ref_seq(new_strain_file), strain, tmp_verified_read,
candidate_read, matrix_info.narrowed_matrix, read_freq,
misp_conflict, stage)
if idv.readlist_comapre(tmp_subbed_read,tmp_verified_read) == 0:
print("no new reads found, terminate")
verified_read = tmp_subbed_read
else:
verified_read = tmp_verified_read
#keep a global visited read set?
while not one_iter and len(verified_read) != len(tmp_verified_read):
print("iterateive curr gap read")
break
verified_read = tmp_verified_read
accepted_misps = tmp_accepted_misps
stage += 1
tmp_verified_read, tmp_accepted_misps = verify_misp(ref_file, matrix_info, sub_read_file, new_strain_file,
candidate_sam,stage)
test_tmp_verified_read = bm.read_sam(sub_read_file)
readlist_comapre(tmp_verified_read,verified_read)
if len(tmp_verified_read) != len(test_tmp_verified_read):
readlist_comapre(tmp_verified_read,test_tmp_verified_read)
raise "verify_misp introduce new reads or remove reads or write reads error"
curr_verified_read = copy.deepcopy(tmp_verified_read)
tmp_subbed_read,misp_conflict = curr_gap_reads(get_ref_seq(new_strain_file), strain, curr_verified_read,
candidate_read, matrix_info.narrowed_matrix, read_freq,
misp_conflict, stage)
if readlist_comapre(tmp_subbed_read,tmp_verified_read) == 0:
print("no new found reads, finish verification")
break
if len(tmp_subbed_read) < len(tmp_verified_read):
raise "curr_gap_reads removes verirified reads"
other_SRR_command = os.path.dirname(__file__) + "/get_combined_extract.sh final_strain_"+str(strain)+"_reference.fa"
other_SRR_proc = subprocess.run(other_SRR_command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, universal_newlines=True, shell=True)
subprocess.run("cp combine_final_strain_"+str(strain)+"_reference.fa_extract.sam stage_"+str(stage)+"_combine_final_strain_"+str(strain)+"_reference.fa_extract.sam",stdout=subprocess.PIPE, stderr=subprocess.PIPE, universal_newlines=True, shell=True)
#write misp
verified_read = tmp_verified_read
accepted_misps = tmp_accepted_misps
with open("misp_" + os.path.basename(sub_read_file), "w+") as mf:
for misp_pos in sorted(list(accepted_misps.keys())):
val = accepted_misps[misp_pos]
bases = val[0].split("|")
print(str(misp_pos + 1), "&", bases[0], "&", bases[1], "&", val[1], "&", val[2], "\\\\\n\hline")
mf.write(str(misp_pos + 1) + " & " + str(bases[0]) + " & " + bases[1] + "&" + str(val[1]) + " & " + str(
val[2]) + "\\\\\n\hline\n")
def curr_gap_reads(ref, strain, subbed_read, candidate_read, matrix, read_freq, misp_conflict, stage):
"""
Fill more candidate reads after rejecting reads
:param stage:
:param ref: reference sequence
:param strain: strain number
:param subbed_read: set of remained subbed reads
:param candidate_read: fixed position
:param matrix: matrix of other SRR samples
:param read_freq: frequency of each read
:param misp_conflict: misp that causes gaps when placed together, format {pos:{pos_misp:[changed_base]}}
:return:
"""
if os.path.exists("half_real_r1.fastq") and os.path.exists("half_real_r2.fastq"):
readfile1 = "half_real_R1.fastq"
readfile2 = "half_real_R2.fastq"
else:
if os.path.exists("half_real.fastq"):
readfile1 = "half_real.fastq"
readfile2 = "none"
else:
raise "Error: reduced version reads error"
prev_readset = set({})
covered_pos = {}
for i in range(0, strain + 1):
print("getting replacement reads in subbed_reads_" + str(i))
with open("subbed_reads_" + str(i) + ".sam", "r") as subf:
for line in subf:
sline = line.strip().split(" ")
prev_readset.add(sline[st_find.read_field])
with open("rejected_subbed_reads_" + str(i) + ".sam", "r") as rejf:
for line in rejf:
fields = line.strip().split(" ")
prev_readset.add(fields[st_find.read_field])
print(len(prev_readset))
readlist = candidate_read
file_subbed_read = bm.read_sam("subbed_reads_" + str(strain) + ".sam")
fixed_subbed_read = idv.fix_s_pos(file_subbed_read)
file_subbed_read = fixed_subbed_read
assert idv.readlist_comapre(file_subbed_read,subbed_read)== 0
for isr,sread in enumerate(file_subbed_read):
if int(sread[st_find.index_field]) != int(subbed_read[isr][st_find.index_field]):
print(sread, subbed_read[isr])
raise "index error in file_subbed_read"
start = int(sread[st_find.index_field])-1
for ib, base in enumerate(sread[st_find.read_field]):
covered_pos.update({start + ib: base})
#misPs = []
subbed_read_set = set([x[st_find.read_field] for x in subbed_read])
new_subbed_read = copy.deepcopy(subbed_read)
subbed_read = new_subbed_read
#for ir, read in enumerate(readlist):
#if read[st_find.read_field] in subbed_read_set:
#subbed_read.append(read)
#misPs.extend(read[st_find.misp_field])
original_readlist = copy.deepcopy(readlist)
misPs, misP_source, misP_reads, new_readlist = idv.get_misp(ref, readlist, fix_pos=False)
for tmp_read in new_readlist:
if len(tmp_read[st_find.misp_field]) > 70 :
print(tmp_read)
raise "position error"
readlist = new_readlist
count = 0
for batch, read in enumerate(readlist):
count += 1
if read[st_find.read_field] in prev_readset:
# batch += 1
continue
if read[st_find.read_field] in subbed_read_set:
continue
read_index = int(read[st_find.index_field]) - 1
#read_misPs = [list(x.items())[0][0] for x in read[st_find.misp_field]]
print("curr read", batch, read)
if idv.if_clash(read, covered_pos) or idv.if_reject(read,matrix,read_freq,misP_reads):
continue
temp_ref = ref[:read_index] + read[st_find.read_field] + ref[read_index + len(read[st_find.read_field]):]
# print(editdistance.eval(temp_ref,ref))
gap_list = identify_strain.verify_seq_support(temp_ref,batch,readfile1,readfile2)
if len(gap_list) == 0:
subbed_read.append(candidate_read[batch])
subbed_read_set.add(read[st_find.read_field])
read_start = int(read[2]) - 1
for ind, base in enumerate(read[st_find.read_field]):
if read_start + ind in covered_pos.keys():
if covered_pos[read_start + ind] != base:
print("clash happened for current read", ind, covered_pos[read_start + ind], base)
exit()
else:
covered_pos.update({read_start + ind: base})
ref = temp_ref
print(batch, "no gaps for ")
else:
#construct relational gaps
print("gaps at ",gap_list)
with open("gaps.txt", "a+") as gf:
gf.write("gaps for read," + str(read) + " is " + str(gap_list) + "\n")
for tmp_read in subbed_read:
tmp_read_index= int(tmp_read[st_find.index_field]) -1
if abs(tmp_read_index-read_index) <= 300:
gf.write(str(tmp_read))
gf.write("\n")
# print(subbed_read,len(readlist))
# batch += 1
if len(subbed_read) > len(file_subbed_read):
idv.\
write_result(strain, ref, subbed_read, stage, restore_pos=True,curr=True)
else:
print("no more reads can fit")
strain += 1
# remove temp files
verify_sub_command = "rm" + " " + "-r" + " " + "./batch_*"
verify_proc = subprocess.run(verify_sub_command,
stdout=subprocess.PIPE, stderr=subprocess.PIPE, universal_newlines=True,
shell=True)
return subbed_read,misp_conflict
def verify_strain(strain,ref_file,candidate_sam,option,other_read_files):
backup_command = "cp subbed_reads_" + str(strain) + ".sam subbed_reads_" + str(
strain) + ".sam.original; cp final_strain_" + str(strain) \
+ "_reference.fa final_strain_" + str(strain) + "_reference.fa.original"
backup_proc = subprocess.run(backup_command, stdout=subprocess.PIPE, stderr=subprocess.PIPE,
universal_newlines=True, shell=True)
other_SRR_command = os.path.dirname(__file__) + "/combine_align.sh "+ option+ " final_strain_" + str(
strain) + "_reference.fa"
for rf in other_read_files:
other_SRR_command += " " + rf
other_SRR_proc = subprocess.run(other_SRR_command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, universal_newlines=True, shell=True)
fac_verify_misp(ref_file, "combine_final_strain_" + str(strain) + "_reference.fa_extract.sam",
"subbed_reads_" + str(strain) + ".sam", "final_strain_" + str(strain) + "_reference.fa",
candidate_sam)
subprocess.run("rm -r batch_* *_out", stdout=subprocess.PIPE, stderr=subprocess.PIPE, universal_newlines=True,
shell=True)
if __name__ == "__main__":
verify_strain(int(sys.argv[1]),sys.argv[2],sys.argv[3],sys.argv[4],sys.argv[5:])