diff --git a/docs/advanced.rst b/docs/advanced.rst index 38c5fcd..8715e50 100644 --- a/docs/advanced.rst +++ b/docs/advanced.rst @@ -1,6 +1,6 @@ -============== -Advanced Usage -============== +====================== +Advanced Package Usage +====================== The following assumes basic Python programming experience (and that you have installed Goldilocks and familiarised yourself diff --git a/docs/examples.rst b/docs/examples.rst deleted file mode 100644 index f09841c..0000000 --- a/docs/examples.rst +++ /dev/null @@ -1,224 +0,0 @@ -=========== -Basic Usage -=========== - -The following example assumes basic Python programming experience (and -that you have installed Goldilocks), skip to the -end if you think you know what you're doing. - -Importing ---------- - -To use Goldilocks you will need to import the :class:`goldilocks.goldilocks.Goldilocks` -class and your desired census strategy (e.g. NucleotideCounterStrategy) from -:mod:`goldilocks.strategies` to your script: :: - - from goldilocks.goldilocks import Goldilocks - from goldilocks.strategies import NucleotideCounterStrategy - - -Providing Sequence Data ------------------------ - -The :class:`goldilocks.goldilocks.Goldilocks` class expects you to provide -sequence data in the following format: :: - - sequence_data = { - "sample_name_or_identifier": { - "chr_name_or_number": "my_actual_sequence", - } - } - -For example: :: - - sequence_data = { - "my_sample": { - 2: "NANANANANA", - "one": "CATCANCAT", - "X": "GATTACAGATTACAN" - }, - "my_other_sample": { - 2: "GANGANGAN", - "one": "TATANTATA", - "X": "GATTACAGATTACAN" - } - } - -The sequences are stored in a nested structure of Python dictionaries, each -key of the `sequence_data` dictionary represents the name or an otherwise unique -identifier for a particular sample (e.g. "my_sample", "my_other_sample"), the -value is a dictionary whose own keys represent chromosome names or numbers [#]_ -and the corresponding values are the sequences themselves as a string [#]_. -Regardless of how the chromosomes are identified, they must match across samples -if one wishes to make comparisons across samples. - -.. [#] Goldilocks has no preference for use of numbers or strings for chromosome names but it would be sensible to use numbers where possible for cases where you might wish to sort by chromosome. -.. [#] In future it is planned that sequences may be further nested in a dictionary to fully support polyploid species. - - -Conducting a Census -------------------- - -Once you have organised your sequence data in to the appropriate structure, you -may conduct the census with Goldilocks by passing your strategy (e.g. NucleotideCounterStrategy) -and sequence data to the imported :class:`goldilocks.goldilocks.Goldilocks` class: :: - - g = Goldilocks(NucleotideCounterStrategy(["N"]), sequence_data, length=3, stride=1) - -Make sure you add the brackets after the name of the imported strategy, this -'creates' a usuable strategy for Goldilocks to work with. - -For each chromosome (i.e. 'one', 'X' and 2) Goldilocks will split each sequence -from the corresponding chromosome in each of the two example samples in to triplets -of bases (as our specified region length is 3) with an offset of 1 (as our stride is 1). -For example, chromosome `"one"` of `"my_sample"` will be split as follows: :: - - CAT - ATC - TCA - CAN - ANC - NCA - CAT - -In our example, the NucleotideCounterStrategy will then count the number of N bases that -appear in each split, for each sample, for each chromosome. - - -Getting the Regions -------------------- - -Once the census is complete, you can extract all of the censused regions directly -from your Goldilocks object. The example below demonstrates the format of the -returned regions dictionary for the example data above: :: - - > g.regions - { - 0: { - 'chr': 2, - 'ichr': 0, - 'pos_end': 3, - 'pos_start': 1, - 'group_counts': { - 'my_sample': {'default': 2}, - 'my_other_sample': {'default': 1}, - 'total': {'default': 3} - }, - } - - ... - - 27: { - 'chr': 'one', - 'ichr': 6, - 'pos_end': 9, - 'pos_start': 7, - 'group_counts': { - 'my_sample': {'default': 0}, - 'my_other_sample': {'default': 0}, - 'total': {'default': 0} - }, - } - } - - -The returned structure is a dictionary whose keys represent the `id` of each region, -with values corresponding to a dictionary of metadata for that particular `id`. -The `id` is assigned incrementally (starting at 0) as each region is encountered -by Goldilocks during the census and isn't particularly important. - -Each region dictionary has the following metadata [#]_: - -============ ===== -Key Value -============ ===== -id A unique id assigned to the region by Goldilocks -chr The chromosome the region appeared on (as found in the input data) -ichr This region is the `ichr-th` to appear on this chromosome (0-indexed) -pos_start The 1-indexed base of the sequence where the region begins (inclusive) -pos_end The 1-indexed base of the sequence where the region ends (inclusive) -============ ===== - -.. [#] Goldilocks used to feature a group_counts dictionary as part of the region - metadata as shown in the example above, this was removed as it duplicated - data stored in the group_counts variable in the Goldilocks object needlessly. - It has not been removed in the example output above as it helps explain - what regions represent. - - -In the example output above, the first (0th) censused region appears on -chromosome 2 [#]_ and includes bases 1-3. It is the first (0th) region to appear on this -chromosome and over those three bases, the corresponding subsequence for `"my_sample"` -contained 2 N bases and the corresponding subsequence for `"my_other_sample"` contained -1. In total, over both samples, on chromosome 2, over bases 1-3, 3 N bases appeared. - -The last region, region 27 (28th) appears on chromosome `"one"` [#]_ and includes -bases 7-9. It is the seventh (6th by 0-index) found on this chromosome and over -those three bases neither of the two samples contained an N base. - -.. [#] As numbers are ordered before strings like "one" and "X" in Python. -.. [#] As "X" is ordered before "one" in Python. - - -Sorting Regions ---------------- - -Following a census, Goldilocks allows you to sort the regions found by four -mathematical operations: `max`, `min`, `mean` and `median`. :: - - regions_max = g.query("max") - regions_min = g.query("min") - regions_mean = g.query("mean") - regions_median = g.query("median") - -The data is returned in a special list:, a :class:`goldilocks.goldilocks.CandidateList` -which defines a table-based representation should a user wish to print the list: :: - - > print(regions_max) - ID VAL CHR POSITIONS (INC.) - 0 {'default': 3} 2 1 - 3 - 2 {'default': 3} 2 3 - 5 - 4 {'default': 3} 2 5 - 7 - 6 {'default': 3} 2 7 - 9 - 1 {'default': 2} 2 2 - 4 - ... - 18 {'default': 0} X 11 - 13 - 19 {'default': 0} X 12 - 14 - 21 {'default': 0} one 1 - 3 - 22 {'default': 0} one 2 - 4 - 27 {'default': 0} one 7 - 9 - - -Note the regions in the `regions_max` CandidateList are now sorted by the number -of N bases that appeared. Ties are currently resolved by the region that was seen -first (has the lowest `id`). - - -Full Example ------------- - -Census an example sequence for appearance of 'N' bases: :: - - from goldilocks.goldilocks import Goldilocks - from goldilocks.strategies import NucleotideCounterStrategy - - sequence_data = { - "my_sample": { - 2: "NANANANANA", - "one": "CATCANCAT", - "X": "GATTACAGATTACAN" - }, - "my_other_sample": { - 2: "GANGANGAN", - "one": "TATANTATA", - "X": "GATTACAGATTACAN" - } - } - - g = Goldilocks(NucleotideCounterStrategy(["N"]), sequence_data, length=3, stride=1) - - regions_max_n_bases = g.query("max") - regions_min_n_bases = g.query("min") - regions_median_n_bases = g.query("min") - regions_mean_n_bases = g.query("min") - diff --git a/docs/index.rst b/docs/index.rst index 3f225d4..f795226 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -13,7 +13,8 @@ Contents: readme installation - examples + cli + basic advanced exporting plotting diff --git a/goldilocks/goldilocks.py b/goldilocks/goldilocks.py index 486cd90..8e2ccd5 100644 --- a/goldilocks/goldilocks.py +++ b/goldilocks/goldilocks.py @@ -193,7 +193,6 @@ def __init__(self, strategy, data, length, stride, is_pos=False, is_faidx=False, # Read data self.groups = data self.max_chr_max_len = None - self.fa_idx = {} for group in self.groups: if self.IS_FAI: @@ -201,6 +200,7 @@ def __init__(self, strategy, data, length, stride, is_pos=False, is_faidx=False, #TODO Close f #TODO Assume for now records in each input faidx are ordered zipwise for i, line in enumerate(open(self.groups[group]["idx"])): + i += 1 self.groups[group]["seq"][i] = {} fields = line.strip().split("\t") @@ -297,7 +297,6 @@ def census_slide(work_q): i = work_block["i"] zeropos_start = work_block["s0"] - #onepos_end = work_block["e1"] track = work_block["t"] track_id = work_block["tid"] group = work_block["g"] @@ -362,7 +361,6 @@ def census_slide(work_q): wwork_block = { "i": region_i, "s0": zeropos_start, - #"e1": onepos_end, "t": track, "tid": track_id, "g": group, @@ -378,7 +376,6 @@ def census_slide(work_q): for _ in range(self.PROCESSES): p = Process(target=census_slide, args=(work_queue,)) - #p.daemon = True processes.append(p) for p in processes: diff --git a/tests/dat/my_other_sequence.fa b/tests/dat/my_other_sequence.fa new file mode 100644 index 0000000..77a8a92 --- /dev/null +++ b/tests/dat/my_other_sequence.fa @@ -0,0 +1,4 @@ +>hoothoot +AAACCCNNNTTTAAACCCGGGNNNAAACCCGGGTTTNNNCCCGGGTTT +>meowmeow +GCGTNANNGANGGCTANTCTANAGCNNTTTCTNTNNGCANCANTTGNN diff --git a/tests/dat/my_other_sequence.fa.fai b/tests/dat/my_other_sequence.fa.fai new file mode 100644 index 0000000..5b8c6fc --- /dev/null +++ b/tests/dat/my_other_sequence.fa.fai @@ -0,0 +1,2 @@ +hoothoot 48 10 48 49 +meowmeow 48 69 48 49 diff --git a/tests/dat/my_sequence.fa b/tests/dat/my_sequence.fa new file mode 100644 index 0000000..77a8a92 --- /dev/null +++ b/tests/dat/my_sequence.fa @@ -0,0 +1,4 @@ +>hoothoot +AAACCCNNNTTTAAACCCGGGNNNAAACCCGGGTTTNNNCCCGGGTTT +>meowmeow +GCGTNANNGANGGCTANTCTANAGCNNTTTCTNTNNGCANCANTTGNN diff --git a/tests/dat/my_sequence.fa.fai b/tests/dat/my_sequence.fa.fai new file mode 100644 index 0000000..5b8c6fc --- /dev/null +++ b/tests/dat/my_sequence.fa.fai @@ -0,0 +1,2 @@ +hoothoot 48 10 48 49 +meowmeow 48 69 48 49 diff --git a/tests/test_regression.py b/tests/test_regression.py index 5d696a8..16ca450 100644 --- a/tests/test_regression.py +++ b/tests/test_regression.py @@ -22,6 +22,14 @@ 2: "GCGTNANNGANGGCTANTCTANAGCNNTTTCTNTNNGCANCANTTGNN", } } +DATA_FAI = { + "GroupOne": { + "idx": "tests/dat/my_sequence.fa.fai" + }, + "GroupTwo": { + "idx": "tests/dat/my_other_sequence.fa.fai" + } +} STRIDE = 1 LENGTH = 3 @@ -244,5 +252,221 @@ def test_sort_min(self): number_comparisons += 1 self.assertEqual(self.EXPECTED_REGION_COUNT, number_comparisons) + +class TestGoldilocksRegression_NCounter_FASTA(unittest.TestCase): + +# GroupOne +# CHR 1 +# +# 1 |======***============***============***=========| 48 +# AAACCCNNNTTTAAACCCGGGNNNAAACCCGGGTTTNNNCCCGGGTTT +# |0| |1| |1| |0| |0| |2| |0| |0| |0| |3| |0| |0| +# |0| |2| |0| |0| |0| |3| |0| |0| |0| |2| |0| |0| +# |0| |3| |0| |0| |0| |2| |0| |0| |1| |1| |0| +# |0| |2| |0| |0| |1| |1| |0| |0| |2| |0| |0| +# + +# GroupOne +# CHR 2 +# +# 1 |====*=**==*=====*====*===**=====*=**===*==*===**| 48 +# GCGTNANNGANGGCTANTCTANAGCNNTTTCTNTNNGCANCANTTGNN +# |0| |2| |1| |0| |1| |1| |2| |0| |2| |0| |1| |1| +# |0| |2| |1| |0| |0| |1| |2| |0| |2| |1| |1| |2| +# |1| |2| |1| |1| |0| |0| |1| |1| |2| |1| |1| +# |1| |1| |0| |1| |1| |1| |0| |1| |1| |1| |0| +# + @classmethod + def setUpClass(cls): + cls.g = Goldilocks(NucleotideCounterStrategy(["N"]), DATA_FAI, length=LENGTH, stride=STRIDE, is_faidx=True) + cls.EXPECTED_REGIONS = { + 1: { + 0: 0, + 1: 0, + 2: 0, + 3: 0, + 4: 1, + 5: 2, + 6: 3, + 7: 2, + 8: 1, + 9: 0, + 10: 0, + 11: 0, + 12: 0, + 13: 0, + 14: 0, + 15: 0, + 16: 0, + 17: 0, + 18: 0, + 19: 1, + 20: 2, + 21: 3, + 22: 2, + 23: 1, + 24: 0, + 25: 0, + 26: 0, + 27: 0, + 28: 0, + 29: 0, + 30: 0, + 31: 0, + 32: 0, + 33: 0, + 34: 1, + 35: 2, + 36: 3, + 37: 2, + 38: 1, + 39: 0, + 40: 0, + 41: 0, + 42: 0, + 43: 0, + 44: 0, + 45: 0, + }, + 2: { + 0: 0, + 1: 0, + 2: 1, + 3: 1, + 4: 2, + 5: 2, + 6: 2, + 7: 1, + 8: 1, + 9: 1, + 10: 1, + 11: 0, + 12: 0, + 13: 0, + 14: 1, + 15: 1, + 16: 1, + 17: 0, + 18: 0, + 19: 1, + 20: 1, + 21: 1, + 22: 0, + 23: 1, + 24: 2, + 25: 2, + 26: 1, + 27: 0, + 28: 0, + 29: 0, + 30: 1, + 31: 1, + 32: 2, + 33: 2, + 34: 2, + 35: 1, + 36: 0, + 37: 1, + 38: 1, + 39: 1, + 40: 1, + 41: 1, + 42: 1, + 43: 0, + 44: 1, + 45: 2, + } + } + + cls.EXPECTED_REGION_TOTALS = [] + for j in cls.EXPECTED_REGIONS: + for i in cls.EXPECTED_REGIONS[j]: + cls.EXPECTED_REGION_TOTALS.append(cls.EXPECTED_REGIONS[j][i]*2) + + cls.EXPECTED_BUCKETS = { + 0: [0, 1, 2, 3, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 24, 25, 26, + 27, 28, 29, 30, 31, 32, 33, 39, 40, 41, 42, 43, 44, 45, + 46, 47, 57, 58, 59, 63, 64, 68, 73, 74, 75, 82, 89], + 1: [4, 8, 19, 23, 34, 38, + 48, 49, 53, 54, 55, 56, 60, 61, 62, 65, 66, 67, 69, + 72, 76, 77, 81, 83, 84, 85, 86, 87, 88, 90], + 2: [5, 7, 20, 22, 35, 37, + 50, 51, 52, 70, 71, 78, 79, 80, 91], + 3: [6, 21, 36], + } + cls.EXPECTED_REGION_COUNT = len(cls.EXPECTED_REGIONS[1]) + len(cls.EXPECTED_REGIONS[2]) + + def test_number_regions(self): + self.assertEqual(self.EXPECTED_REGION_COUNT, len(self.g.regions.items())) + + # TODO Technically these check the metadata is correct and not the *actual* regions + def test_region_lengths(self): + number_comparisons = 0 + # Ensure ALL meet LENGTH + for region_i, region_data in self.g.regions.items(): + number_comparisons += 1 + self.assertEqual(LENGTH, len(range(region_data["pos_start"], region_data["pos_end"])) + 1) + self.assertEqual(self.EXPECTED_REGION_COUNT, number_comparisons) + + # TODO Technically these check the metadata is correct and not the *actual* regions + def test_region_stride(self): + number_comparisons = 0 + # Ensure regions begin at right STRIDE + for region_i, region_data in self.g.regions.items(): + number_comparisons += 1 + expected_start = 1 + (region_data["ichr"] * STRIDE) + self.assertEqual(expected_start, region_data["pos_start"]) + + # -1 as the region includes the start element + expected_end = (expected_start - 1) + LENGTH + self.assertEqual(expected_end, region_data["pos_end"]) + self.assertEqual(self.EXPECTED_REGION_COUNT, number_comparisons) + +############################################################################### +# \/ NOTE Tests below depend only on GroupOne \/ # +############################################################################### + + def test_content_counters(self): + number_comparisons = 0 + for region_i, region_data in self.g.regions.items(): + number_comparisons += 1 + self.assertEqual(self.EXPECTED_REGIONS[region_data["chr"]][region_data["ichr"]], self.g.counter_matrix[self.g._get_group_id("GroupOne"), self.g._get_track_id("N"), region_i]) + self.assertEqual(self.EXPECTED_REGION_COUNT, number_comparisons) + + def test_number_buckets(self): + self.assertEqual(len(self.EXPECTED_BUCKETS), len(self.g.group_buckets["GroupOne"]["N"])) + total_in_bucket = 0 + total_in_bucket_g = 0 + + for b in self.EXPECTED_BUCKETS: + total_in_bucket += len(self.EXPECTED_BUCKETS[b]) + for b in self.g.group_buckets["GroupOne"]["N"]: + total_in_bucket_g += len(self.g.group_buckets["GroupOne"]["N"][b]) + + self.assertEqual(total_in_bucket, total_in_bucket_g) + + def test_content_buckets(self): + number_comparisons = 0 + for bucket, content in self.g.group_buckets["GroupOne"]["N"].items(): + number_comparisons += 1 + self.assertEqual(sorted(self.EXPECTED_BUCKETS[bucket]), sorted(content)) + self.assertEqual(len(self.EXPECTED_BUCKETS), number_comparisons) + +############################################################################### +# /\ NOTE Tests above depend only on GroupOne /\ # +############################################################################### + + def test_sort_min(self): + number_comparisons = 0 + regions_min = self.g.query("min").candidates + + # Merge sort required for stability + EXPECTED_MIN_REGION_ORDER = np.argsort(self.EXPECTED_REGION_TOTALS, kind="mergesort") + + for i, region in enumerate(regions_min): + self.assertEquals(EXPECTED_MIN_REGION_ORDER[i], region['id']) + number_comparisons += 1 + self.assertEqual(self.EXPECTED_REGION_COUNT, number_comparisons) + if __name__ == '__main__': unittest.main()