updated README and added multiprocessing
sean-la committed Jun 22, 2018
1 parent 92b07d9 commit 137d903
Showing 3 changed files with 58 additions and 31 deletions.
2 changes: 2 additions & 0 deletions README.md
@@ -2,6 +2,8 @@
 
 PRINCE estimates Variable Number Tandem Repeats (VNTR) copy number from raw next generation sequencing (NGS) data.
 
+## Change History
+* \[insert date here\] **Version 1.1** - Refactored file handling and added multiprocessing.
 ## Build status
 
 [![Build Status](https://travis-ci.org/WGS-TB/PythonPRINCE.svg?branch=master)](https://travis-ci.org/WGS-TB/PythonPRINCE)
12 changes: 9 additions & 3 deletions bin/prince
@@ -11,13 +11,17 @@ from prince.query_sample import test_target
 
 DEFAULT_K = 9
 DEFAULT_BOOST_OUTPUT = resource_filename('prince.resources', 'training_data.txt')
+MAJOR_VERSION_K=1
+MINOR_VERSION_K=1
 
 def main():
-    parser = argparse.ArgumentParser(description='Prince Options.')
+    parser = argparse.ArgumentParser(description='Prince (Version %d.%d) Options.'\
+        % (MAJOR_VERSION_K,MINOR_VERSION_K))
 
     parser.add_argument('-bo', '--boost_output', default=DEFAULT_BOOST_OUTPUT,
-        help="output file for training data / training data used to predict copy numbers for queries")
+        help="output file for training data / training data used to predict \
+            copy numbers for queries")
-    parser.add_argument('-to', '--target_output', default="results/predictions.csv",
+    parser.add_argument('-to', '--target_output', default="results/predictions.csv",\
         help="output file for query copy number predictions")
     parser.add_argument('-tmp','--templates', default="templates.fasta",
         help="VNTR templates. Default is for M.TB")
@@ -29,6 +33,8 @@ def main():
         help="Kmer size used during read recruitment.")
     parser.add_argument('-cn', '--copynumber', default=1,type=int,
         help="Copy number for training genome.")
+    parser.add_argument('-np', '--num_procs', default=1,type=int,
+        help="Number of cores for parallel processing.")
 
     prince_options = parser.parse_args()

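For context, a minimal sketch (not part of the commit) of how the new `-np`/`--num_procs` option behaves once parsed; the flag, default, and help string come from the diff above, while the sample argument lists are hypothetical:

```python
import argparse

# Minimal reproduction of the option added in bin/prince.
parser = argparse.ArgumentParser(description='Prince (Version 1.1) Options.')
parser.add_argument('-np', '--num_procs', default=1, type=int,
                    help="Number of cores for parallel processing.")

# Hypothetical command line: `prince -np 4`
opts = parser.parse_args(['-np', '4'])
print(opts.num_procs)  # 4 (an int, so it can be passed straight to mp.Pool)

# With no flag given, the default keeps the old single-process behaviour.
opts = parser.parse_args([])
print(opts.num_procs)  # 1
```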
75 changes: 47 additions & 28 deletions prince/query_sample.py
@@ -2,34 +2,53 @@
 from prince.predict import get_data, get_equations, get_copy_number
 from prince.match_score import compute_match_score
 import time
+import multiprocessing as mp
 
+def partition(lst, n):
+    division = len(lst) / float(n)
+    return [ lst[int(round(division * i)): int(round(division * (i + 1)))] for i in range(n) ]
+
+def multiple_targetMatchScore(opts, queries, templates, templateKmers):
+    match_scores = []
+    for query in queries:
+        targetFileName = query.split("/")[-1]
+        print("Querying %s" % targetFileName)
+        start_time = time.time()
+        match_score = compute_match_score(query, templates, templateKmers, opts.k)
+        match_scores.append( (targetFileName,match_score) )
+        print("Done with %s in time %s" % (targetFileName,str(time.time()-start_time)))
+    return match_scores
+
 def test_target(opts, templates,templateNames, templateKmers):
+    # Get the query paths
+    queries = []
     with open(opts.target_file) as file:
-        query = file.readline().strip("\n")
-        while query:
-            start_time = time.time()
-            targetFileName = query.split("/")[-1] #CHANGE
-            print("\nQuerying %s" % targetFileName)
-
-            targetMatchScore = compute_match_score(query, templates, templateKmers, opts.k)
-
-            data = get_data(opts.boost_output)
-            equations = get_equations(data)
-            predictions = []
-
-            # Write target predictions to text file
-            with open(opts.target_output, "a+") as f:
-                if f.readline() == "":
-                    f.write("Templates,")
-                    f.write(",".join(templateNames))
-                    f.write("\n")
-                f.write(targetFileName)
-                for t, ms in enumerate(targetMatchScore):
-                    slope, intercept = equations[t]
-                    y_predict = get_copy_number(ms, slope, intercept)
-                    f.write("," + "{:.2f}".format(y_predict))
-
-                f.write("\n")
-                print("Done with %s" % targetFileName)
-                print(time.time()-start_time)
-            query = file.readline().strip("\n")
+        for line in file:
+            queries.append( line.rstrip("\n") )
+    # Find match scores
+    if opts.num_procs > 1:
+        queries_partition = partition(queries,opts.num_procs)
+        pool = mp.Pool(processes=opts.num_procs)
+        # Run analyses in multiple processes
+        results = [pool.apply_async(multiple_targetMatchScore,(opts,queries,templates,templateKmers))
+                   for queries in queries_partition]
+        match_scores_list = [p.get() for p in results]
+        match_scores = []
+        for lst in match_scores_list:
+            match_scores += lst
+    else:
+        match_scores = multiple_targetMatchScore(opts,queries,templates,templateKmers)
+    # Write results
+    data = get_data(opts.boost_output)
+    equations = get_equations(data)
+    with open(opts.target_output,'w') as file:
+        file.write("Templates,")
+        file.write(",".join(templateNames))
+        file.write('\n')
+        for (targetFileName, targetMatchScore) in match_scores:
+            file.write(targetFileName)
+            for t, ms in enumerate(targetMatchScore):
+                slope, intercept = equations[t]
+                y_predict = get_copy_number(ms, slope, intercept)
+                file.write("," + "{:.2f}".format(y_predict))
+            file.write('\n')
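To make the new control flow concrete, here is a self-contained sketch (not part of the commit) of the chunk-and-fan-out pattern `test_target` now follows. `partition` is copied from the diff; `fake_match_scores` is a hypothetical stand-in for `multiple_targetMatchScore` that skips the real k-mer match-score computation:

```python
import multiprocessing as mp

def partition(lst, n):
    # Same rounding-based split as in prince/query_sample.py: chunk sizes
    # can differ by one when len(lst) is not divisible by n.
    division = len(lst) / float(n)
    return [lst[int(round(division * i)): int(round(division * (i + 1)))] for i in range(n)]

def fake_match_scores(queries):
    # Hypothetical stand-in for multiple_targetMatchScore: returns
    # (file name, match score) pairs without doing any real work.
    return [(query.split("/")[-1], 0.0) for query in queries]

if __name__ == "__main__":
    queries = ["data/sample%d.fastq" % i for i in range(10)]  # hypothetical paths
    chunks = partition(queries, 3)
    print([len(c) for c in chunks])  # [3, 4, 3]

    # Fan out one async task per chunk, then gather and flatten the results,
    # mirroring the num_procs > 1 branch of test_target.
    pool = mp.Pool(processes=3)
    results = [pool.apply_async(fake_match_scores, (chunk,)) for chunk in chunks]
    match_scores = []
    for result in results:
        match_scores += result.get()
    pool.close()
    pool.join()
    print(len(match_scores))  # 10: every query scored exactly once
```

One design note: in the committed list comprehension, the loop variable `queries` shadows the outer `queries` list; the sketch names each chunk `chunk` to keep the two apart.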
