updated README and added multiprocessing
sean-la committed Jun 22, 2018
1 parent 92b07d9 commit 137d903
Showing 3 changed files with 58 additions and 31 deletions.
2 changes: 2 additions & 0 deletions README.md
@@ -2,6 +2,8 @@
 
 PRINCE estimates Variable Number Tandem Repeats (VNTR) copy number from raw next generation sequencing (NGS) data.
 
+## Change History
+* \[insert date here\] **Version 1.1** - Refactored file handling and added multiprocessing.
 ## Build status
 
 [![Build Status](https://travis-ci.org/WGS-TB/PythonPRINCE.svg?branch=master)](https://travis-ci.org/WGS-TB/PythonPRINCE)
12 changes: 9 additions & 3 deletions bin/prince
@@ -11,13 +11,17 @@ from prince.query_sample import test_target
 
 DEFAULT_K = 9
 DEFAULT_BOOST_OUTPUT = resource_filename('prince.resources', 'training_data.txt')
+MAJOR_VERSION_K=1
+MINOR_VERSION_K=1
 
 def main():
-    parser = argparse.ArgumentParser(description='Prince Options.')
+    parser = argparse.ArgumentParser(description='Prince (Version %d.%d) Options.'\
+        % (MAJOR_VERSION_K,MINOR_VERSION_K))
 
     parser.add_argument('-bo', '--boost_output', default=DEFAULT_BOOST_OUTPUT,
-        help="output file for training data / training data used to predict copy numbers for queries")
+        help="output file for training data / training data used to predict \
+            copy numbers for queries")
-    parser.add_argument('-to', '--target_output', default="results/predictions.csv",
+    parser.add_argument('-to', '--target_output', default="results/predictions.csv",\
         help="output file for query copy number predictions")
     parser.add_argument('-tmp','--templates', default="templates.fasta",
         help="VNTR templates. Default is for M.TB")
@@ -29,6 +33,8 @@ def main():
         help="Kmer size used during read recruitment.")
     parser.add_argument('-cn', '--copynumber', default=1,type=int,
         help="Copy number for training genome.")
+    parser.add_argument('-np', '--num_procs', default=1,type=int,
+        help="Number of cores for parallel processing.")
 
     prince_options = parser.parse_args()

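For context, a minimal sketch (not part of the commit) of how the new `-np`/`--num_procs` option behaves once parsed; the flag, default, and help string come from the diff above, while the sample argument lists are hypothetical:

```python
import argparse

# Minimal reproduction of the option added in bin/prince.
parser = argparse.ArgumentParser(description='Prince (Version 1.1) Options.')
parser.add_argument('-np', '--num_procs', default=1, type=int,
                    help="Number of cores for parallel processing.")

# Hypothetical command line: `prince -np 4`
opts = parser.parse_args(['-np', '4'])
print(opts.num_procs)  # 4 (an int, so it can be passed straight to mp.Pool)

# With no flag given, the default keeps the old single-process behaviour.
opts = parser.parse_args([])
print(opts.num_procs)  # 1
```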
75 changes: 47 additions & 28 deletions prince/query_sample.py
@@ -2,34 +2,53 @@
 from prince.predict import get_data, get_equations, get_copy_number
 from prince.match_score import compute_match_score
 import time
+import multiprocessing as mp
 
+def partition(lst, n):
+    division = len(lst) / float(n)
+    return [ lst[int(round(division * i)): int(round(division * (i + 1)))] for i in range(n) ]
+
+def multiple_targetMatchScore(opts, queries, templates, templateKmers):
+    match_scores = []
+    for query in queries:
+        targetFileName = query.split("/")[-1]
+        print("Querying %s" % targetFileName)
+        start_time = time.time()
+        match_score = compute_match_score(query, templates, templateKmers, opts.k)
+        match_scores.append( (targetFileName,match_score) )
+        print("Done with %s in time %s" % (targetFileName,str(time.time()-start_time)))
+    return match_scores
+
 def test_target(opts, templates,templateNames, templateKmers):
+    # Get the query paths
+    queries = []
     with open(opts.target_file) as file:
-        query = file.readline().strip("\n")
-        while query:
-            start_time = time.time()
-            targetFileName = query.split("/")[-1] #CHANGE
-            print("\nQuerying %s" % targetFileName)
-
-            targetMatchScore = compute_match_score(query, templates, templateKmers, opts.k)
-
-            data = get_data(opts.boost_output)
-            equations = get_equations(data)
-            predictions = []
-
-            # Write target predictions to text file
-            with open(opts.target_output, "a+") as f:
-                if f.readline() == "":
-                    f.write("Templates,")
-                    f.write(",".join(templateNames))
-                    f.write("\n")
-                f.write(targetFileName)
-                for t, ms in enumerate(targetMatchScore):
-                    slope, intercept = equations[t]
-                    y_predict = get_copy_number(ms, slope, intercept)
-                    f.write("," + "{:.2f}".format(y_predict))
-
-                f.write("\n")
-                print("Done with %s" % targetFileName)
-                print(time.time()-start_time)
-            query = file.readline().strip("\n")
+        for line in file:
+            queries.append( line.rstrip("\n") )
+    # Find match scores
+    if opts.num_procs > 1:
+        queries_partition = partition(queries,opts.num_procs)
+        pool = mp.Pool(processes=opts.num_procs)
+        # Run analyses in multiple processes
+        results = [pool.apply_async(multiple_targetMatchScore,(opts,queries,templates,templateKmers))
+                   for queries in queries_partition]
+        match_scores_list = [p.get() for p in results]
+        match_scores = []
+        for lst in match_scores_list:
+            match_scores += lst
+    else:
+        match_scores = multiple_targetMatchScore(opts,queries,templates,templateKmers)
+    # Write results
+    data = get_data(opts.boost_output)
+    equations = get_equations(data)
+    with open(opts.target_output,'w') as file:
+        file.write("Templates,")
+        file.write(",".join(templateNames))
+        file.write('\n')
+        for (targetFileName, targetMatchScore) in match_scores:
+            file.write(targetFileName)
+            for t, ms in enumerate(targetMatchScore):
+                slope, intercept = equations[t]
+                y_predict = get_copy_number(ms, slope, intercept)
+                file.write("," + "{:.2f}".format(y_predict))
+            file.write('\n')
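To make the new control flow concrete, here is a self-contained sketch (not part of the commit) of the chunk-and-fan-out pattern `test_target` now follows. `partition` is copied from the diff; `fake_match_scores` is a hypothetical stand-in for `multiple_targetMatchScore` that skips the real k-mer match-score computation:

```python
import multiprocessing as mp

def partition(lst, n):
    # Same rounding-based split as in prince/query_sample.py: chunk sizes
    # can differ by one when len(lst) is not divisible by n.
    division = len(lst) / float(n)
    return [lst[int(round(division * i)): int(round(division * (i + 1)))] for i in range(n)]

def fake_match_scores(queries):
    # Hypothetical stand-in for multiple_targetMatchScore: returns
    # (file name, match score) pairs without doing any real work.
    return [(query.split("/")[-1], 0.0) for query in queries]

if __name__ == "__main__":
    queries = ["data/sample%d.fastq" % i for i in range(10)]  # hypothetical paths
    chunks = partition(queries, 3)
    print([len(c) for c in chunks])  # [3, 4, 3]

    # Fan out one async task per chunk, then gather and flatten the results,
    # mirroring the num_procs > 1 branch of test_target.
    pool = mp.Pool(processes=3)
    results = [pool.apply_async(fake_match_scores, (chunk,)) for chunk in chunks]
    match_scores = []
    for result in results:
        match_scores += result.get()
    pool.close()
    pool.join()
    print(len(match_scores))  # 10: every query scored exactly once
```

One design note: in the committed list comprehension, the loop variable `queries` shadows the outer `queries` list; the sketch names each chunk `chunk` to keep the two apart.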
