## Summary of BioNumPy Usage for Computing GC Content Inside Genes
BioNumPy simplifies classic bioinformatics tasks such as computing GC content within genes using sequence and interval arrays.

1. Reading Sequence and Interval Data:
Sequences are read from FASTA files into sequence arrays.
Gene regions are read from BED files into interval arrays.

2. Extracting Gene Parts:
selected_seq = sequence[intervals]

3. Counting Nucleotides:
nn_counts = {nn: (selected_seq == nn).sum() for nn in "ACTG"}

4. GC Content Calculation:
Inside genes: gc_count = sum([nn_counts[nn] for nn in "GC"]), then gc_content = gc_count / sum(nn_counts.values())
Outside genes: gc_outside = get_gc_content(chr1_sequence, ~get_boolean_mask(genes, len(chr1_sequence)))

In [7]:
from typing import Tuple

import bionumpy as bnp
from bionumpy.arithmetics import get_boolean_mask


def analyze_within_chromosome(seq_filename: str, genes_filename: str):
    """Report the GC content inside and outside the gene regions of one sequence.

    Only the first record of the sequence file is analysed; the whole
    interval file is used as the set of gene regions.

    Args:
        seq_filename: Path of the sequence file opened with ``bnp.open``.
        genes_filename: Path of the interval file opened with ``bnp.open``.

    Returns:
        The pair ``(gc_inside, gc_outside)`` produced by
        ``gc_inside_and_outside``.
    """
    # NOTE(review): presumably every interval refers to that first record —
    # nothing here filters the intervals by chromosome; confirm with the data.
    all_entries = bnp.open(seq_filename).read()
    first_entry = all_entries[0]
    gene_intervals = bnp.open(genes_filename).read()

    print("Gene-regions: ", gene_intervals)
    print("Fasta: ", first_entry)

    inside, outside = gc_inside_and_outside(first_entry.sequence, gene_intervals)
    print(f"GC inside: {inside:.2f}, GC outside: {outside:.2f}")
    return inside, outside


def gc_inside_and_outside(chr1_sequence: bnp.EncodedArray, genes: bnp.Interval) -> Tuple[float, float]:
    """Compute the GC fraction inside and outside the given gene intervals.

    Args:
        chr1_sequence: The sequence to analyse.
        genes: Intervals marking the gene regions on that sequence.

    Returns:
        ``(gc_inside, gc_outside)``. Both values come from ``get_gc_content``,
        which returns a ratio obtained by true division, so they are
        fractions rather than integer counts.
    """
    gc_inside = get_gc_content(chr1_sequence, genes)
    # Build a mask as long as the sequence from the gene intervals and invert
    # it, so that indexing selects the positions not covered by any gene.
    outside_mask = ~get_boolean_mask(genes, len(chr1_sequence))
    gc_outside = get_gc_content(chr1_sequence, outside_mask)
    return gc_inside, gc_outside


def get_gc_content(sequence, intervals):
    """Return the fraction of G and C among the A/C/T/G symbols selected.

    Args:
        sequence: Indexable sequence whose elements can be compared with
            single-character strings.
        intervals: Index passed to ``sequence[...]`` to pick the region of
            interest (the callers in this file pass intervals or a boolean
            mask).

    Returns:
        ``(#G + #C) / (#A + #C + #T + #G)`` over the selected symbols.
        Symbols other than A, C, T and G do not contribute to either count.
        The division is not guarded against a selection that contains none
        of the four letters.
    """
    region = sequence[intervals]

    per_base = {}
    for base in "ACTG":
        per_base[base] = (region == base).sum()

    total = sum(per_base.values())
    gc_total = sum([per_base["G"], per_base["C"]])
    return gc_total / total


if __name__ == '__main__':
    # Local test fixtures: a one-chromosome sequence file and the matching
    # gene-interval file.
    fasta_path = "C:/Users/admin/OneDrive/Desktop/gc_test_onechr.fa"
    bed_path = "C:/Users/admin/OneDrive/Desktop/gc_bedtest_onechr.bed"

    result = analyze_within_chromosome(fasta_path, bed_path)
    # Expected (inside, outside) GC fractions for the fixtures above.
    assert result == (0.6, 0.2)

Gene-regions:  

ParsingException: Error when parsing field start from Interval