Skip to content

Commit

Permalink
attempted new logic for merge_breakpoints
Browse files Browse the repository at this point in the history
  • Loading branch information
dpannguyen committed Mar 16, 2022
1 parent 3a59bc1 commit 6c7261d
Showing 1 changed file with 58 additions and 31 deletions.
89 changes: 58 additions & 31 deletions openrdp/scripts/bootscan.py
Original file line number Diff line number Diff line change
Expand Up @@ -116,7 +116,12 @@ def scan(self, i):
# Shuffle columns with replacement
rep_window = window[:, np.random.randint(0, window.shape[1], window.shape[1])]
dist_mat = squareform(pdist(rep_window, jc_distance))

"""
To be looked into for optimization
"""
dists.append(dist_mat)

return dists

def do_scanning_phase(self, align):
Expand All @@ -126,7 +131,6 @@ def do_scanning_phase(self, align):
"""
with multiprocessing.Pool() as p:
all_dists = p.map(self.scan, range(0, align.shape[1], self.step_size))

return all_dists

def execute(self, triplet):
Expand All @@ -139,6 +143,10 @@ def execute(self, triplet):
ab_support = []
bc_support = []
ac_support = []

"""
To be looked into for optimization
"""
for dists in self.dists:
supports = []
for dist_mat in dists:
Expand Down Expand Up @@ -185,19 +193,20 @@ def execute(self, triplet):
# Find p-value for regions
for recomb_candidate, event in possible_regions:
n = event[1] - event[0]
l = self.align.shape[1]

# m is the proportion of nts in common between either A or B and C in the recombinant region
recomb_region_cand = triplet.sequences[recomb_candidate, event[0]: event[1]]
other_seqs = triplet.sequences[trps[:recomb_candidate] + trps[recomb_candidate+1:], event[0]: event[1]]
m = np.sum(np.any(recomb_region_cand == other_seqs, axis=0))
if n > 0:
l = self.align.shape[1]

# p is the proportion of nts in common between either A or B and C in the entire sequence
recomb_region_cand = triplet.sequences[recomb_candidate, :]
other_seqs = triplet.sequences[trps[:recomb_candidate] + trps[recomb_candidate + 1:], :]
p = np.sum(np.any(recomb_region_cand == other_seqs, axis=0)) / l
# m is the proportion of nts in common between either A or B and C in the recombinant region
recomb_region_cand = triplet.sequences[recomb_candidate, event[0]: event[1]]
other_seqs = triplet.sequences[trps[:recomb_candidate] + trps[recomb_candidate+1:], event[0]: event[1]]
m = np.sum(np.any(recomb_region_cand == other_seqs, axis=0))

# p is the proportion of nts in common between either A or B and C in the entire sequence
recomb_region_cand = triplet.sequences[recomb_candidate, :]
other_seqs = triplet.sequences[trps[:recomb_candidate] + trps[recomb_candidate + 1:], :]
p = np.sum(np.any(recomb_region_cand == other_seqs, axis=0)) / l

if n > 0:
# Calculate p_value
val = 0
log_n_fact = np.sum(np.log(np.arange(1, n + 1))) # Convert to log space to prevent integer overflow
Expand All @@ -209,12 +218,13 @@ def execute(self, triplet):
(log_n_fact - (log_i_fact + log_ni_fact)) + np.log(p ** n) + np.log((1 - p) ** (n - i)))

# Get potential recombinant and the parents
trp_names = copy.copy(triplet.names)
for i, name in enumerate(trp_names):
if i == recomb_candidate:
rec_name = trp_names.pop(i)
parents = trp_names
if val != 0.0:
if val != 0.0:
trp_names = copy.copy(triplet.names)
for i, name in enumerate(trp_names):
if i == recomb_candidate:
# rec_name = trp_names.pop(i)
rec_name = name
parents = tuple(trp_names[:i] + trp_names[i + 1:])
self.raw_results.append((rec_name, parents, *event, val))

return
Expand All @@ -240,19 +250,36 @@ def merge_breakpoints(self):
# Merge any locations that overlap - eg [1, 5] and [3, 7] would become [1, 7]
for key in results_dict:
merged_regions = []
for region in results_dict[key]:
region = list(region)
old_regions = list(results_dict[key])
for region2 in old_regions:
start = region[0]
end = region[1]
start2 = region2[0]
end2 = region2[1]
if start <= start2 <= end or start <= end2 <= end:
region[0] = min(start,start2)
region[1] = max(end, end2)
results_dict[key].remove(region2)
merged_regions.append(region)

regions = list(sorted(results_dict[key], reverse = True))
while len(regions) != 0:
region = regions.pop()
merged = list(region)
if len(regions) == 0:
merged_regions.append(merged)
break
next_region = regions.pop()
while region[1] >= next_region[0]:
merged[1] = max(region[1], next_region[1])
if len(regions) == 0: break
next_region = regions.pop()
merged_regions.append(merged)
regions.append(next_region)

# for region in results_dict[key]:
# region = list(region)
# old_regions = list(results_dict[key])
# for region2 in old_regions:
# start = region[0]
# end = region[1]
# start2 = region2[0]
# end2 = region2[1]
# if start <= start2 <= end or start <= end2 <= end:
# region[0] = min(start,start2)
# region[1] = max(end, end2)
# results_dict[key].remove(region2)
# merged_regions.append(region)


# Output the results
for region in merged_regions:
Expand All @@ -263,4 +290,4 @@ def merge_breakpoints(self):
p_value = region[2]
if p_value < self.max_pvalue:
results.append((rec_name, parents, start, end, p_value))
return results
return results

0 comments on commit 6c7261d

Please sign in to comment.