In [9]:
# imports
import multiprocessing
import os

import AmpliPy
import pysam

# Source alignment file that the cells below open through AmpliPy.
MAIN_EXAMPLE_INPUT_FILE = "./example/example_untrimmed_sorted.bam"

# Benchmark results: each run appends one [implementation, process count,
# runtime in seconds] row, which the final cell turns into a table.
data = list()

In [10]:
# Count every read in the example input file.
in_align, _ = AmpliPy.create_AlignmentFile_objects(MAIN_EXAMPLE_INPUT_FILE)
try:
	# Stream through the file once; only the tally is kept.
	count = sum(1 for _read in in_align)
finally:
	# Release the file handle instead of leaving it open for the rest of the session.
	in_align.close()

print("total reads", str(count)) # last count was 201600

total reads 201600


In [11]:
# simple function to take time
import time
def func_to_run(read, delay=1):
	"""Stand-in for per-read work: block for a fixed amount of time.

	Args:
		read: The item being "processed". It is accepted so the function can be
			called once per read, but its contents are never inspected.
		delay: Seconds to sleep. Defaults to 1, which is what existing callers
			get; pass a smaller value to make a full re-run of the benchmarks
			cheaper.

	Returns:
		None.
	"""
	time.sleep(delay)

In [12]:

# Build a small benchmark input by copying the first reads of the example file.
new_input_file = "intermediary.bam"
num_reads_to_copy = 100

# A fresh output file is needed on every run. On the very first run there is
# nothing to delete yet, so only remove the file when it actually exists.
if os.path.exists(new_input_file):
	os.remove(new_input_file)

in_align, out_align = AmpliPy.create_AlignmentFile_objects(MAIN_EXAMPLE_INPUT_FILE, new_input_file)
try:
	for i, read in enumerate(in_align):
		if i >= num_reads_to_copy:
			break
		out_align.write(read)
finally:
	# Close both handles even if copying fails, so the output is flushed and
	# the next cell can re-open it.
	out_align.close()
	in_align.close()

In [13]:
# Re-open the newly created file and count its reads, to check that the
# expected number of reads was written.
inter_align, _ = AmpliPy.create_AlignmentFile_objects(new_input_file)
try:
	count = sum(1 for _read in inter_align)
finally:
	# Release the file handle; later cells open the file again themselves.
	inter_align.close()

print("count: ", str(count)) # count was 100 on last run


count:  100


In [14]:
# Baseline: handle every read one after another in a single process.
inter_align, _ = AmpliPy.create_AlignmentFile_objects(new_input_file)

try:
	# perf_counter is monotonic, so the measurement is not affected by
	# system clock adjustments the way time.time() can be.
	linear_start = time.perf_counter()
	for read in inter_align:
		func_to_run(read)
	linear_end = time.perf_counter()
finally:
	inter_align.close()

linear_time = linear_end - linear_start

print("total time for linear approach: ", str(linear_time)) # took ~100.11s on last run (100 reads, 1s sleep each)
# NOTE: re-running this cell appends another "linear" row to `data`.
data.append(["linear", 1, linear_time])

total time for linear approach:  100.10854363441467


In [23]:
# Multiprocessing benchmark: a pool of worker processes pulls reads from a
# shared queue and runs func_to_run on each one.
import multiprocessing as mp
inter_align,_ = AmpliPy.create_AlignmentFile_objects(new_input_file)
ld = mp.JoinableQueue()

def worker():
	"""Consume items from the shared queue `ld` until a None sentinel arrives.

	Every item taken from the queue is acknowledged with task_done(), even when
	func_to_run raises, so that ld.join() in the parent can never wait forever
	on an item that will not be acknowledged.
	"""
	# NOTE(review): the worker reaches `ld` and `func_to_run` as globals, which
	# presumably requires the "fork" start method -- confirm before running on
	# platforms that default to "spawn".
	while True:
		read = ld.get()
		try:
			if read is None:
				# Sentinel: no more work, let this process exit.
				break
			func_to_run(read)
		finally:
			ld.task_done()

if __name__ == '__main__':
	# perf_counter is monotonic, so the measurement is not affected by
	# system clock adjustments the way time.time() can be.
	multi_start = time.perf_counter()

	# Edit this value to benchmark a different pool size; every run of the
	# cell appends one more row to `data`.
	num_processes = 15
	processes = []
	for _ in range(num_processes):
		p = mp.Process(target=worker)
		p.start()
		processes.append(p)

	try:
		for read in inter_align:
			# Only a plain dict of selected read attributes is sent to the
			# workers, not the read object itself.
			fake_read = {
				'cigartuples': read.cigartuples,
				'is_paired': read.is_paired,
				'is_reverse': read.is_reverse,
				'query_alignment_qualities': read.query_alignment_qualities,
				'query_alignment_start': read.query_alignment_start,
				'query_length': read.query_length,
				'reference_end': read.reference_end,
				'reference_start': read.reference_start,
				'template_length': read.template_length
			}
			ld.put(fake_read)
	finally:
		inter_align.close()
		# One sentinel per worker, queued after all real work, so that every
		# worker terminates instead of blocking on ld.get() forever.
		for _ in processes:
			ld.put(None)

	ld.close()
	# Returns once every queued item (reads and sentinels) has been acknowledged.
	ld.join()
	multi_end = time.perf_counter()
	multi_time = multi_end - multi_start
	data.append(["multi", num_processes, multi_time])

	# Reap the workers so repeated runs of this cell do not accumulate
	# leftover child processes.
	for p in processes:
		p.join()

In [24]:
import pandas as pd
# Tabulate the collected benchmark rows; the bare name on the last line lets
# the notebook render the table as the cell's output.
runtime_columns = ["Impl", "Processes", "Runtime"]
runtime_table = pd.DataFrame(data, columns=runtime_columns)
runtime_table

Unnamed: 0,Impl,Processes,Runtime
0,linear,1,100.108544
1,multi,1,100.119531
2,multi,2,50.069536
3,multi,4,25.045139
4,multi,8,13.13637
5,multi,12,9.064442
6,multi,15,7.06068
