## Filtering FASTQ reads

In [1]:
pip install bionumpy


Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [2]:
import bionumpy as bnp

def test(file="C:\\Users\\admin\\Downloads\\reads_1.fq.gz", out_filename="C:\\Users\\admin\\Downloads\\reads_1_filtered.fq.gz", min_base_quality=1, min_mean_quality=10):
    """Filter FASTQ reads on base quality and write the surviving reads to a new file.

    The input is processed chunk by chunk. A read is kept only when BOTH hold:
      * its lowest base quality is strictly greater than ``min_base_quality``
      * its mean base quality is strictly greater than ``min_mean_quality``

    For every chunk a line ``Filtering reads: <before> -> <after>`` is printed.

    Args:
        file: Path of the FASTQ file to read.
        out_filename: Path of the filtered FASTQ file to write.
        min_base_quality: Exclusive lower bound on the per-read minimum quality.
        min_mean_quality: Exclusive lower bound on the per-read mean quality.

    NOTE(review): the default paths point at one user's Downloads folder; pass
    explicit paths (or move them to a config cell) when running elsewhere.
    """
    # The input is opened first so that a missing/unreadable input fails before
    # the output file is created. Both handles are closed when the block exits,
    # including on error.
    with bnp.open(file) as reader, bnp.open(out_filename, 'w') as out_file:
        for reads in reader.read_chunks():
            # One boolean per read: reduce the quality values along the last axis.
            lowest_quality_mask = reads.quality.min(axis=-1) > min_base_quality
            mean_quality_mask = reads.quality.mean(axis=-1) > min_mean_quality
            mask = lowest_quality_mask & mean_quality_mask
            print(f'Filtering reads: {len(reads)} -> {mask.sum()}')
            out_file.write(reads[mask])

# Entry point: run the FASTQ filter with its default input/output paths
# when this code is executed as the main module.
if __name__ == "__main__":
    test()


Filtering reads: 19032 -> 16838
Filtering reads: 19032 -> 16779
Filtering reads: 19033 -> 16783
Filtering reads: 19033 -> 16785
Filtering reads: 19031 -> 16760
Filtering reads: 5010 -> 4420


## Working with BAM-files

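This example uses BioNumPy (`bionumpy`) together with `ragged_slice` from `npstructures` to read a BAM file, select the alignments whose first CIGAR operation is a soft clip, and compare the base counts in those soft-clipped prefixes with the base counts over all reads.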

In [3]:
import bionumpy as bnp
from npstructures import ragged_slice

def test_bamquality(file="C:\\Users\\admin\\Downloads\\alignments.bam"):
    """Print base counts for soft-clipped read starts and for whole reads in a BAM file.

    Two count tables are printed, in this order:
      1. bases inside the leading soft-clipped part of alignments whose first
         CIGAR operation is a soft clip
      2. bases over the full sequences of all alignments

    Args:
        file: Path of the BAM file to read. Defaults to the path that was
            previously hard-coded in the body, so existing no-argument calls
            behave as before.

    NOTE(review): the default path points at one user's Downloads folder; pass
    an explicit path when running elsewhere.
    """
    # The reader is used as a context manager so the file handle is closed even on error.
    # All work stays inside the block so nothing touches the reader after it is closed.
    with bnp.open(file) as reader:
        # Read every alignment in the file
        alignments = reader.read()

        # First CIGAR operation of each alignment
        start_cigar = alignments.cigar_op[..., 0]

        # Keep the alignments that start with a soft clip.
        # NOTE(review): the SAM spec writes the soft-clip op as uppercase "S";
        # confirm that comparing against lowercase "s" matches it in bionumpy.
        start_clipped_alignments = alignments[start_cigar == "s"]

        # Length of that first (soft-clip) operation = number of clipped bases
        n_clipped_bases = start_clipped_alignments.cigar_length[..., 0]

        # Slice each sequence up to its clip length to get the clipped bases
        clipped_bases = ragged_slice(start_clipped_alignments.sequence, ends=n_clipped_bases)

        # Count bases in soft-clipped regions
        print(bnp.count_encoded(clipped_bases.ravel()))

        # Count bases in whole reads
        print(bnp.count_encoded(alignments.sequence.ravel()))

# Entry point: run the BAM soft-clip base count on the default alignments file
# when this code is executed as the main module.
if __name__ == "__main__":
    test_bamquality()


=: 0
A: 0
C: 0
M: 0
G: 0
R: 0
S: 0
V: 0
T: 0
W: 0
Y: 0
H: 0
K: 0
D: 0
B: 0
N: 0
=: 0
A: 19
C: 8
M: 0
G: 10
R: 0
S: 0
V: 0
T: 12
W: 0
Y: 0
H: 0
K: 0
D: 0
B: 0
N: 0
