-
Notifications
You must be signed in to change notification settings - Fork 16
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
6915d8f
commit d50787f
Showing
2 changed files
with
191 additions
and
132 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,151 +1,177 @@ | ||
from goldilocks.goldilocks import Goldilocks | ||
from goldilocks.strategies import VariantCounterStrategy, GCRatioStrategy, NucleotideCounterStrategy, KMerCounterStrategy | ||
from goldilocks.strategies import PositionCounterStrategy, GCRatioStrategy, NucleotideCounterStrategy, KMerCounterStrategy, ReferenceConsensusStrategy | ||
|
||
#TODO Methods may take a list of locations or may need to actually analyze | ||
# a proper genomic sequence | ||
"""Execute Goldilocks search.""" | ||
data = {"ONE": {1: [1,2,5]}} | ||
g = Goldilocks(VariantCounterStrategy(), data, is_seq=False, stride=1, length=3) | ||
|
||
candidates = g._filter("max", actual_distance=1) | ||
######################################### | ||
"""Read a pair of 1-indexed base position lists and output all regions falling | ||
within 2 of the maximum count of positions in regions across both, in a table.""" | ||
data = { | ||
"my_positions": { | ||
1: [1,2,5,10,15,15,18,25,30,50,51,52,53,54,55,100] | ||
}, | ||
"my_other_positions": { | ||
1: [1,3,5,7,9,12,15,21,25,51,53,59,91,92,93,95,99,100] | ||
} | ||
} | ||
g = Goldilocks(PositionCounterStrategy(), data, is_pos=True, length=10, stride=1) | ||
|
||
print candidates | ||
g.query("max", actual_distance=2).export_meta(sep="\t", group="total") | ||
|
||
######################################### | ||
data = {"ONE": {1: "CCCGGGAGATTT"}} | ||
"""Read a short sequence, census GC-ratio and output the top 5 regions as FASTA.""" | ||
data = { | ||
"my_sequence": { | ||
1: "ACCGAGAGATTT" | ||
} | ||
} | ||
g = Goldilocks(GCRatioStrategy(), data, 3, 1) | ||
|
||
candidates = g._filter("max", actual_distance=1) | ||
|
||
print candidates | ||
|
||
candidates.export_fasta(["ONE"]) | ||
g.query("max", limit=5).export_fasta() | ||
|
||
######################################### | ||
data = {"ONE": {1: "AAACCCGGGCCCGGGAGAAAAAAA"}} | ||
g = Goldilocks(KMerCounterStrategy(["AAA", "CCC"]), data, 6, 1) | ||
|
||
candidates = g._filter("max", actual_distance=1, track="AAA") | ||
"""Read a short sequence and census the appearance of the "AAA" and "CCC" motif. | ||
Output a table of regions with the most occurrences of CCC (and at least one) | ||
and another table of regions featuring the most appearances of both motifs. | ||
Output only the maximum region (actual_distance = 0) displaying both motifs to | ||
FASTA.""" | ||
data = { | ||
"my_sequence": { | ||
1: "CCCAAACCCGGGCCCGGGAGAAACCC" | ||
} | ||
} | ||
g = Goldilocks(KMerCounterStrategy(["AAA", "CCC"]), data, 9, 1) | ||
|
||
print candidates | ||
g.query("max", track="CCC", gmin=1).export_meta(sep="\t") | ||
g.query("max", group="total").export_meta(sep="\t", group="total", track="default") | ||
|
||
candidates.export_fasta("ONE") | ||
g.export_meta("ONE", sep="\t") | ||
g.query("max", group="total", actual_distance=0).export_fasta() | ||
|
||
######################################### | ||
"""Read two samples of three short chromosomes and search for 'N' nucleotides. | ||
List and export a FASTA of regions that contain at least one N, sorted by number | ||
of N's appearing across both samples. Below, an example of complex filtering.""" | ||
data = { | ||
"ONE": { | ||
"sample_one": { | ||
1: "ANAGGGANACAN", | ||
2: "ANAGGGANACAN", | ||
3: "ANANNNANACAN" | ||
3: "ANANNNANACAN", | ||
4: "NNNNAANNAANN" | ||
}, | ||
"TWO": { | ||
"sample_two": { | ||
1: "ANAGGGANACAN", | ||
2: "ANAGGGANACAN", | ||
3: "ANANNNANACAN" | ||
3: "ANANNNANACAN", | ||
4: "NNNANNAANNAA" | ||
} | ||
} | ||
g = Goldilocks(NucleotideCounterStrategy(["N"]), data, 3, 1) | ||
print g.groups | ||
candidates = g._filter("max") | ||
|
||
candidates.export_fasta(["ONE", "TWO"], filename="example", divide=True) | ||
candidates.export_fasta() | ||
g_max = g.query("max", gmin=1) | ||
g_max.export_meta(sep="\t") | ||
g_max.export_fasta() | ||
|
||
candidates = g._filter("min", limit=0, | ||
g.query("min", | ||
gmin = 1, | ||
exclusions={ | ||
# Filter any region with a starting position <= 3 AND an ending postion >= 4 | ||
0: { | ||
"start_lte": 3, | ||
"end_gte": 4 | ||
}, | ||
# Filter any region with a starting position <= 3 or >= 10 | ||
"start_lte": 3, | ||
"start_gte": 10, | ||
|
||
# Filter anything from Chrm1 | ||
# Filter any regions on Chr1 | ||
1: { | ||
"chr": True | ||
}, | ||
|
||
# Filter any region with an ending postion >= 9 | ||
# Filter NO regions on Chr2 | ||
# NOTE: This also prevents the superexclusions above being applied. | ||
2: { | ||
"chr": False | ||
}, | ||
|
||
# Filter any region on Chr3 with an ending postion >= 9 | ||
3: { | ||
"start_gte": 9 | ||
"start_lte": 5 # NOTE: This overrides the start_lte applied above | ||
} | ||
}, use_and=True) | ||
print candidates | ||
candidates.export_fasta("ONE") | ||
}, use_chrom=True).export_meta(sep="\t") | ||
|
||
######################################### | ||
data = { | ||
"ONE": { | ||
1: "ANAGGGANACANANAGGGANACANANAGGGANACANANAGGGANACANANAGGGACGCGCGCGGGGANACAN", | ||
2: "ANAGGCGCGCNANAGGGANACGCGGGGCCCGACANANAGGGANACANANAGGGACGCGCGCGCGCCCGACAN", | ||
3: "ANAGGCGCGCNANAGGGANACGCGGGGCCCGACANANAGGGANACANANAGGGACGCGCGCGCGCCCGACAN", | ||
4: "GCGCGCGCGCGCGCGCGGGGGGGGGCGCCGCCNNNNNNNNNNNNNNNNGCGCGCGCGCGCGCGNNNNNNNNN" | ||
} | ||
} | ||
g = Goldilocks(GCRatioStrategy(), data, 3, 1) | ||
|
||
candidates = g._filter("min") | ||
|
||
print candidates | ||
g.plot("ONE") | ||
g.profile("ONE", bins=[0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]) | ||
"""Read in four simple chromosomes from one sample and census the GC ratio. | ||
Plot both a scatter plot of all censused regions over both of the provided | ||
samples with position over the x-axis and value on the y-axis. | ||
Produce a second plot drawing a panel with a line graph for each chromosome | ||
with the same axes but data from one sample only. | ||
For the combined result of both samples and chromosomes, organise the result | ||
of the census for each region into desired bins and plot the result as a histogram. | ||
Repeat the process for the my_sequence sample and produce a panelled histogram | ||
for each chromosome.""" | ||
|
||
######################################### | ||
data = { | ||
"ONE": { | ||
1: "GGCCNNNNNNNNNNNNNNNNNNNNNNNNGGGNNNNNNNNNCCCNNNNNNNNNNCCGGSSCCNSNDNSSSCCC"*1500, | ||
2: "ANAGGCGCGCNANAGGGANACGCGGGGCCCGACANANAGGGANACANANAGGGACGCGCGCGCGCCCGACAN"*1500, | ||
3: "ANAGGCGCGCNANAGGGANACGCGGGGCCCGACANANAGGGANACANANAGGGACGCGCGCGCGCCCGACAN"*1500, | ||
4: "GCGCGCGCGCGCGCGCGGGGGGGGGCGCCGCCNNNNNNNNNNNNNNNNGCGCGCGCGCGCGCGNNNNNNNNN"*1500 | ||
"my_sequence": { | ||
1: "ANAGGGANACANANAGGGANACANANAGGGANACANANAGGGANACANANAGGGACGCGCGCGGGGANACAN"*500, | ||
2: "ANAGGCGCGCNANAGGGANACGCGGGGCCCGACANANAGGGANACANANAGGGACGCGCGCGCGCCCGACAN"*500, | ||
3: "ANAGGCGCGCNANAGGGANACGCGGGGCCCGACANANAGGGANACANANAGGGACGCGCGCGCGCCCGACAN"*500, | ||
4: "GCGCGCGCGCGCGCGCGGGGGGGGGCGCCGCCNNNNNNNNNNNNNNNNGCGCGCGCGCGCGCGNNNNNNNNN"*500 | ||
}, | ||
"my_same_sequence": { | ||
1: "ANAGGGANACANANAGGGANACANANAGGGANACANANAGGGANACANANAGGGACGCGCGCGGGGANACAN"*500, | ||
2: "ANAGGCGCGCNANAGGGANACGCGGGGCCCGACANANAGGGANACANANAGGGACGCGCGCGCGCCCGACAN"*500, | ||
3: "ANAGGCGCGCNANAGGGANACGCGGGGCCCGACANANAGGGANACANANAGGGACGCGCGCGCGCCCGACAN"*500, | ||
4: "GCGCGCGCGCGCGCGCGGGGGGGGGCGCCGCCNNNNNNNNNNNNNNNNGCGCGCGCGCGCGCGNNNNNNNNN"*500 | ||
} | ||
} | ||
g = Goldilocks(GCRatioStrategy(), data, 100, 10) | ||
g = Goldilocks(GCRatioStrategy(), data, 50, 10) | ||
|
||
candidates = g._filter("min") | ||
|
||
print candidates | ||
g.plot("ONE") | ||
g.profile("ONE", bins=[0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]) | ||
g.plot() | ||
g.plot("my_sequence") | ||
g.profile(bins=[0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]) | ||
import sys | ||
sys.exit() | ||
g.profile("my_sequence", bins=[0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]) | ||
|
||
######################################### | ||
from Mountain.IO import fastareaders | ||
f = fastareaders.FASTA_Library("/store/sanger/ref/hs37d5.fa1") | ||
"""Read a set of simple chromosomes from two samples and tabulate the top 10% of | ||
regions demonstrating the worst consensus to the given reference over both samples. | ||
Plot the lack of consensus as line graphs for each chromosome, for each sample, | ||
then over all chromosomes for all samples on one graph.""" | ||
|
||
data = { | ||
"ONE": { | ||
1: f.get_next().seq, | ||
} | ||
"first_sample": { | ||
1: "NNNAANNNNNCCCCCNNNNNGGGGGNNNNNTTTTTNNNNNAAAAANNNNNCCCCCNNNNNGGGGGNNNNNTTTTTNNNNN", | ||
2: "NNNNNCCCCCNNNNNTTTTTNNNNNAAAAANNNNNGGGGGNNNNNCCCCCNNNNNTTTTTNNNNNAAAAANNNNNGGGGN" | ||
}, | ||
"second_sample": { | ||
1: "NNNNNNNNNNCCCCCCCCCCNNNNNNNNNNTTTTTTTTTTNNNNNNNNNNCCCCCCCCCCNNNNNNNNNNTTTTTTTTTT", | ||
2: "NNCCCCCCCCNNNNNNNNNNAAAAAAAAAANNNNNNNNNNCCCCCCCCCCNNNNNNNNNNAAAAAAAAAANNNNNNNNNN" | ||
} | ||
} | ||
ref = { | ||
1: "AAAAAAAAAACCCCCCCCCCGGGGGGGGGGTTTTTTTTTTAAAAAAAAAACCCCCCCCCCGGGGGGGGGGTTTTTTTTTT", | ||
2: "CCCCCCCCCCTTTTTTTTTTAAAAAAAAAAGGGGGGGGGGCCCCCCCCCCTTTTTTTTTTAAAAAAAAAAGGGGGGGGGG" | ||
} | ||
|
||
# Will assume that files follow the recommendation that sequence lines | ||
# are no longer than 80 characters | ||
g = Goldilocks(GCRatioStrategy(), data, 1000000, 500000) | ||
|
||
# Avoid human leukocyte antigen loci on chr6 | ||
candidates = g._filter("max", percentile_distance=10, limit=10, exclusions={"chr": [6]}) | ||
|
||
print candidates | ||
g = Goldilocks(ReferenceConsensusStrategy(reference=ref, polarity=-1), data, stride=10, length=10) | ||
g.query("max", percentile_distance=10).export_meta(group="total", track="default") | ||
|
||
g.plot("first_sample") | ||
g.plot("second_sample") | ||
g.plot() | ||
g.profile(bins=[0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]) | ||
|
||
######################################### | ||
from Mountain.IO import fastareaders | ||
f = fastareaders.FASTA_Library("/store/sanger/ref/hs37d5.fa1") | ||
f.get_next() | ||
"""Read a pair of 1-indexed base position lists from two samples. Sort regions | ||
with the least number of marked positions on Sample 1 and subsort by max marked | ||
positions in Sample 2.""" | ||
|
||
data = { | ||
"ONE": { | ||
1: f.get_next().seq, | ||
} | ||
"my_positions": { | ||
1: [1,2,3,4,5,6,7,8,9,10, | ||
11,13,15,17,19, | ||
21, | ||
31,39, | ||
41] | ||
}, | ||
"other_positions": { | ||
1: [21,22,23,24,25,26,27,28, | ||
31,33,39, | ||
41,42,43,44,45,46,47,48,49,50] | ||
} | ||
} | ||
g = Goldilocks(PositionCounterStrategy(), data, is_pos=True, length=10, stride=5) | ||
|
||
# Will assume that files follow the recommendation that sequence lines | ||
# are no longer than 80 characters | ||
g = Goldilocks(KMerCounterStrategy(["AAA", "TTT", "GATTACA"]), data, 1000000, 500000) | ||
candidates = g._filter("max", track="AAA") | ||
|
||
print candidates | ||
|
||
g.plot(track="AAA") | ||
g.query("max", group="my_positions").query("max", group="other_positions").export_meta(sep="\t") |
Oops, something went wrong.