Cleanup and fix examples.py

SamStudio8 · Jul 8, 2015 · d50787f · d50787f
1 parent 6915d8f
commit d50787f
Show file tree

Hide file tree

Showing 2 changed files with 191 additions and 132 deletions.
diff --git a/example.py b/example.py
@@ -1,151 +1,177 @@
 from goldilocks.goldilocks import Goldilocks
-from goldilocks.strategies import VariantCounterStrategy, GCRatioStrategy, NucleotideCounterStrategy, KMerCounterStrategy
+from goldilocks.strategies import PositionCounterStrategy, GCRatioStrategy, NucleotideCounterStrategy, KMerCounterStrategy, ReferenceConsensusStrategy
 
-#TODO Methods may take a list of locations or may need to actually analyze
-#     a proper genomic sequence
-"""Execute Goldilocks search."""
-data = {"ONE": {1: [1,2,5]}}
-g = Goldilocks(VariantCounterStrategy(), data, is_seq=False, stride=1, length=3)
-
-candidates = g._filter("max", actual_distance=1)
+#########################################
+"""Read a pair of 1-indexed base position lists and output all regions falling
+within 2 of the maximum count of positions in regions across both, in a table."""
+data = {
+        "my_positions": {
+            1: [1,2,5,10,15,15,18,25,30,50,51,52,53,54,55,100]
+        },
+        "my_other_positions": {
+            1: [1,3,5,7,9,12,15,21,25,51,53,59,91,92,93,95,99,100]
+        }
+}
+g = Goldilocks(PositionCounterStrategy(), data, is_pos=True, length=10, stride=1)
 
-print candidates
+g.query("max", actual_distance=2).export_meta(sep="\t", group="total")
 
 #########################################
-data = {"ONE": {1: "CCCGGGAGATTT"}}
+"""Read a short sequence, census GC-ratio and output the top 5 regions as FASTA."""
+data = {
+        "my_sequence": {
+            1: "ACCGAGAGATTT"
+        }
+}
 g = Goldilocks(GCRatioStrategy(), data, 3, 1)
 
-candidates = g._filter("max", actual_distance=1)
-
-print candidates
-
-candidates.export_fasta(["ONE"])
+g.query("max", limit=5).export_fasta()
 
 #########################################
-data = {"ONE": {1: "AAACCCGGGCCCGGGAGAAAAAAA"}}
-g = Goldilocks(KMerCounterStrategy(["AAA", "CCC"]), data, 6, 1)
-
-candidates = g._filter("max", actual_distance=1, track="AAA")
+"""Read a short sequence and census the appearance of the "AAA" and "CCC" motif.
+Output a table of regions with the most occurrences of CCC (and at least one)
+and another table of regions featuring the most appearances of both motifs.
+Output only the maximum region (actual_distance = 0) displaying both motifs to
+FASTA."""
+data = {
+        "my_sequence": {
+            1: "CCCAAACCCGGGCCCGGGAGAAACCC"
+        }
+}
+g = Goldilocks(KMerCounterStrategy(["AAA", "CCC"]), data, 9, 1)
 
-print candidates
+g.query("max", track="CCC", gmin=1).export_meta(sep="\t")
+g.query("max", group="total").export_meta(sep="\t", group="total", track="default")
 
-candidates.export_fasta("ONE")
-g.export_meta("ONE", sep="\t")
+g.query("max", group="total", actual_distance=0).export_fasta()
 
 #########################################
+"""Read two samples of three short chromosomes and search for 'N' nucleotides.
+List and export a FASTA of regions that contain at least one N, sorted by number
+of N's appearing across both samples. Below, an example of complex filtering."""
 data = {
-    "ONE": {
+    "sample_one": {
         1: "ANAGGGANACAN",
         2: "ANAGGGANACAN",
-        3: "ANANNNANACAN"
+        3: "ANANNNANACAN",
+        4: "NNNNAANNAANN"
     },
-    "TWO": {
+    "sample_two": {
         1: "ANAGGGANACAN",
         2: "ANAGGGANACAN",
-        3: "ANANNNANACAN"
+        3: "ANANNNANACAN",
+        4: "NNNANNAANNAA"
     }
 }
 g = Goldilocks(NucleotideCounterStrategy(["N"]), data, 3, 1)
-print g.groups
-candidates = g._filter("max")
 
-candidates.export_fasta(["ONE", "TWO"], filename="example", divide=True)
-candidates.export_fasta()
+g_max = g.query("max", gmin=1)
+g_max.export_meta(sep="\t")
+g_max.export_fasta()
 
-candidates = g._filter("min", limit=0,
+g.query("min",
+        gmin = 1,
         exclusions={
-            # Filter any region with a starting position <= 3 AND an ending postion >= 4
-            0: {
-                "start_lte": 3,
-                "end_gte": 4
-            },
+            # Filter any region with a starting position <= 3 or >= 10
+            "start_lte": 3,
+            "start_gte": 10,
 
-            # Filter anything from Chrm1
+            # Filter any regions on Chr1
             1: {
                 "chr": True
             },
 
-            # Filter any region with an ending postion >= 9
+            # Filter NO regions on Chr2
+            # NOTE: This also prevents the superexclusions above being applied.
+            2: {
+                "chr": False
+            },
+
+            # Filter any region on Chr3 with an ending postion >= 9
             3: {
-                "start_gte": 9
+                "start_lte": 5 # NOTE: This overrides the start_lte applied above
             }
-        }, use_and=True)
-print candidates
-candidates.export_fasta("ONE")
+        }, use_chrom=True).export_meta(sep="\t")
 
 #########################################
-data = {
-    "ONE": {
-        1: "ANAGGGANACANANAGGGANACANANAGGGANACANANAGGGANACANANAGGGACGCGCGCGGGGANACAN",
-        2: "ANAGGCGCGCNANAGGGANACGCGGGGCCCGACANANAGGGANACANANAGGGACGCGCGCGCGCCCGACAN",
-        3: "ANAGGCGCGCNANAGGGANACGCGGGGCCCGACANANAGGGANACANANAGGGACGCGCGCGCGCCCGACAN",
-        4: "GCGCGCGCGCGCGCGCGGGGGGGGGCGCCGCCNNNNNNNNNNNNNNNNGCGCGCGCGCGCGCGNNNNNNNNN"
-    }
-}
-g = Goldilocks(GCRatioStrategy(), data, 3, 1)
-
-candidates = g._filter("min")
-
-print candidates
-g.plot("ONE")
-g.profile("ONE", bins=[0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0])
+"""Read in four simple chromosomes from one sample and census the GC ratio.
+Plot both a scatter plot of all censused regions over both of the provided
+samples with position over the x-axis and value on the y-axis.
+Produce a second plot drawing a panel with a line graph for each chromosome
+with the same axes but data from one sample only.
+For the combined result of both samples and chromosomes, organise the result
+of the census for each region into desired bins and plot the result as a histogram.
+Repeat the process for the my_sequence sample and produce a panelled histogram
+for each chromosome."""
 
-#########################################
 data = {
-    "ONE": {
-        1: "GGCCNNNNNNNNNNNNNNNNNNNNNNNNGGGNNNNNNNNNCCCNNNNNNNNNNCCGGSSCCNSNDNSSSCCC"*1500,
-        2: "ANAGGCGCGCNANAGGGANACGCGGGGCCCGACANANAGGGANACANANAGGGACGCGCGCGCGCCCGACAN"*1500,
-        3: "ANAGGCGCGCNANAGGGANACGCGGGGCCCGACANANAGGGANACANANAGGGACGCGCGCGCGCCCGACAN"*1500,
-        4: "GCGCGCGCGCGCGCGCGGGGGGGGGCGCCGCCNNNNNNNNNNNNNNNNGCGCGCGCGCGCGCGNNNNNNNNN"*1500
+    "my_sequence": {
+        1: "ANAGGGANACANANAGGGANACANANAGGGANACANANAGGGANACANANAGGGACGCGCGCGGGGANACAN"*500,
+        2: "ANAGGCGCGCNANAGGGANACGCGGGGCCCGACANANAGGGANACANANAGGGACGCGCGCGCGCCCGACAN"*500,
+        3: "ANAGGCGCGCNANAGGGANACGCGGGGCCCGACANANAGGGANACANANAGGGACGCGCGCGCGCCCGACAN"*500,
+        4: "GCGCGCGCGCGCGCGCGGGGGGGGGCGCCGCCNNNNNNNNNNNNNNNNGCGCGCGCGCGCGCGNNNNNNNNN"*500
+    },
+    "my_same_sequence": {
+        1: "ANAGGGANACANANAGGGANACANANAGGGANACANANAGGGANACANANAGGGACGCGCGCGGGGANACAN"*500,
+        2: "ANAGGCGCGCNANAGGGANACGCGGGGCCCGACANANAGGGANACANANAGGGACGCGCGCGCGCCCGACAN"*500,
+        3: "ANAGGCGCGCNANAGGGANACGCGGGGCCCGACANANAGGGANACANANAGGGACGCGCGCGCGCCCGACAN"*500,
+        4: "GCGCGCGCGCGCGCGCGGGGGGGGGCGCCGCCNNNNNNNNNNNNNNNNGCGCGCGCGCGCGCGNNNNNNNNN"*500
     }
 }
-g = Goldilocks(GCRatioStrategy(), data, 100, 10)
+g = Goldilocks(GCRatioStrategy(), data, 50, 10)
 
-candidates = g._filter("min")
-
-print candidates
-g.plot("ONE")
-g.profile("ONE", bins=[0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0])
+g.plot()
+g.plot("my_sequence")
 g.profile(bins=[0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0])
-import sys
-sys.exit()
+g.profile("my_sequence", bins=[0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0])
 
 #########################################
-from Mountain.IO import fastareaders
-f = fastareaders.FASTA_Library("/store/sanger/ref/hs37d5.fa1")
+"""Read a set of simple chromosomes from two samples and tabulate the top 10% of
+regions demonstrating the worst consensus to the given reference over both samples.
+Plot the lack of consensus as line graphs for each chromosome, for each sample,
+then over all chromosomes for all samples on one graph."""
+
 data = {
-    "ONE": {
-        1: f.get_next().seq,
-     }
+    "first_sample": {
+        1: "NNNAANNNNNCCCCCNNNNNGGGGGNNNNNTTTTTNNNNNAAAAANNNNNCCCCCNNNNNGGGGGNNNNNTTTTTNNNNN",
+        2: "NNNNNCCCCCNNNNNTTTTTNNNNNAAAAANNNNNGGGGGNNNNNCCCCCNNNNNTTTTTNNNNNAAAAANNNNNGGGGN"
+    },
+    "second_sample": {
+        1: "NNNNNNNNNNCCCCCCCCCCNNNNNNNNNNTTTTTTTTTTNNNNNNNNNNCCCCCCCCCCNNNNNNNNNNTTTTTTTTTT",
+        2: "NNCCCCCCCCNNNNNNNNNNAAAAAAAAAANNNNNNNNNNCCCCCCCCCCNNNNNNNNNNAAAAAAAAAANNNNNNNNNN"
+    }
+}
+ref = {
+    1: "AAAAAAAAAACCCCCCCCCCGGGGGGGGGGTTTTTTTTTTAAAAAAAAAACCCCCCCCCCGGGGGGGGGGTTTTTTTTTT",
+    2: "CCCCCCCCCCTTTTTTTTTTAAAAAAAAAAGGGGGGGGGGCCCCCCCCCCTTTTTTTTTTAAAAAAAAAAGGGGGGGGGG"
 }
 
-# Will assume that files follow the recommendation that sequence lines
-# are no longer than 80 characters
-g = Goldilocks(GCRatioStrategy(), data, 1000000, 500000)
-
-# Avoid human leukocyte antigen loci on chr6
-candidates = g._filter("max", percentile_distance=10, limit=10, exclusions={"chr": [6]})
-
-print candidates
+g = Goldilocks(ReferenceConsensusStrategy(reference=ref, polarity=-1), data, stride=10, length=10)
+g.query("max", percentile_distance=10).export_meta(group="total", track="default")
 
+g.plot("first_sample")
+g.plot("second_sample")
 g.plot()
-g.profile(bins=[0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0])
 
 #########################################
-from Mountain.IO import fastareaders
-f = fastareaders.FASTA_Library("/store/sanger/ref/hs37d5.fa1")
-f.get_next()
+"""Read a pair of 1-indexed base position lists from two samples. Sort regions
+with the least number of marked positions on Sample 1 and subsort by max marked
+positions in Sample 2."""
+
 data = {
-    "ONE": {
-        1: f.get_next().seq,
-     }
+        "my_positions": {
+            1: [1,2,3,4,5,6,7,8,9,10,
+                11,13,15,17,19,
+                21,
+                31,39,
+                41]
+        },
+        "other_positions": {
+            1: [21,22,23,24,25,26,27,28,
+                31,33,39,
+                41,42,43,44,45,46,47,48,49,50]
+        }
 }
+g = Goldilocks(PositionCounterStrategy(), data, is_pos=True, length=10, stride=5)
 
-# Will assume that files follow the recommendation that sequence lines
-# are no longer than 80 characters
-g = Goldilocks(KMerCounterStrategy(["AAA", "TTT", "GATTACA"]), data, 1000000, 500000)
-candidates = g._filter("max", track="AAA")
-
-print candidates
-
-g.plot(track="AAA")
+g.query("max", group="my_positions").query("max", group="other_positions").export_meta(sep="\t")