In [38]:
import gzip
from Bio import SeqIO
import subprocess, pathlib

## Filter sequencing data

To filter sequencing data, we use the software `fastp`, which is executed from the command line. Here, we show an example code that is executed in the command line. We assume that there is a mamba environment installed with the name 'fastp'. Instructions for this environment can be found in the README.md file in the first page of this repository. If `conda` is used instead of `mamba`, simply exchange the code below.

Here we have a sample dataset for the mapping step, where read 1 contains the sequenced mutated promoter variants, and read 2 contains the corresponding barcodes. We filter the reads for quality and to not contain any unidentified bases, as well as trimming extra bases from the reads (e.g. read 1 is 171 bases, with the promoter variants being in the first 160). The filtered reads are then stored and we can investigate them later.

In [37]:
filtering_code = """#!/usr/bin/env bash
READ1="../data/sequencing_reads/mapping/read1_mapping.fastq.gz"
READ2="../data/sequencing_reads/mapping/read2_mapping.fastq.gz"
OUT1="../data/filtered_sequencing/mapping/read1_mapping_filtered.fastq.gz"
OUT2="../data/filtered_sequencing/mapping/read2_mapping_filtered.fastq.gz"

mamba run -n fastp fastp \
       --in1 "$READ1" --in2 "$READ2" \
       --out1 "$OUT1" --out2 "$OUT2" \
       --trim_tail1 11 --trim_tail2 11 \
       --verbose --disable_length_filtering \
       --thread 6 -q 20 --n_base_limit 0 --unqualified_percent_limit 10
"""

with open("filtering_example.sh", "w") as f:
    f.write(filtering_code)

subprocess.run(['chmod', '+x', './filtering_example.sh'], text=True)
subprocess.run(['./filtering_example.sh'], check=True)

[10:07:47] start to load data of read1 
[10:07:47] start to load data of read2 
[10:07:49] Read2: loaded 1M reads 
[10:07:49] Read2: loading completed with 1001 packs 
[10:07:49] Read1: loaded 1M reads 
[10:07:49] Read1: loading completed with 1001 packs 
[10:07:50] thread 2 data processing completed 
[10:07:50] thread 2 finished 
[10:07:50] thread 5 data processing completed 
[10:07:50] thread 5 finished 
[10:07:50] thread 4 data processing completed 
[10:07:50] thread 4 finished 
[10:07:50] thread 3 data processing completed 
[10:07:50] thread 3 finished 
[10:07:50] thread 1 data processing completed 
[10:07:50] thread 1 finished 
[10:07:50] thread 6 data processing completed 
[10:07:50] thread 6 finished 
[10:07:50] ../data/filtered_sequencing/mapping/read2_mapping_filtered.fastq.gz writer finished 
[10:07:50] ../data/filtered_sequencing/mapping/read1_mapping_filtered.fastq.gz writer finished 
[10:07:50] start to generate reports
 
Read1 before filtering:
total reads: 1000000
total 

CompletedProcess(args=['./filtering_example.sh'], returncode=0)

## Extracting promoter variants and barcodes from sequencing files

First we have to import the sequencing data, which is in the compressed `gzip` format. We decompress the data and import the `fastq` files, then extract the sequence from each record.

In [39]:
def import_fastq_gz(filename):
    """
    Imports a gzipped FASTQ file and returns a generator of SeqRecord objects.

    Args:
        filename (str): The path to the gzipped FASTQ file.

    Yields:
        SeqRecord: A SeqRecord object representing a single read.
    """
    with gzip.open(filename, "rt") as handle:
        for record in SeqIO.parse(handle, "fastq"):
            yield record



In [42]:
# Get promoters
file_path = "../data/filtered_sequencing/mapping/read1_mapping_filtered.fastq.gz"
records = import_fastq_gz(file_path)

promoters = [r.seq for r in records]

# Get barcodes
file_path = "../data/filtered_sequencing/mapping/read2_mapping_filtered.fastq.gz"
records = import_fastq_gz(file_path)

barcodes = [r.seq for r in records]

In [43]:
promoters

[Seq('GCCACGTTGATTATTTGCACGGCGTCACACTTTGCTATGCCATAGCATATTTAT...GCC'),
 Seq('GTCAGCTAGGATCCGCTACTCTTATGGATAAATATGCTATGGCATAGCAAAGTG...TTT'),
 Seq('CAGCGGCGCGCAGATATAGCCTTAATGGCGGCTGCGGTGCCTGTAAGGTTCTTG...ACG'),
 Seq('GCGTTTTCATCTCGTTGATGCCCCTTGTCTGTTTGATAATACGTACAATGGCTA...TAG'),
 Seq('GCGTTTTCATCTCGTTGATTCGCATTGTCTGATTGCTAATGCGCACATTGGGTA...CAG'),
 Seq('GGATGTGACTTCGGCAGACATTCGGAATATTGTCACTCGCCAGTTAGGAAATTG...AGC'),
 Seq('GTCTCGGTTTTCTCCCAGTGGCGGAGGTCTCTGAGGAAGATGGTAGCCCAAAGA...TGA'),
 Seq('CACTGCTTGGGAGTATCCCCGGGTTGATTCACCCGTTCTGGGTGCAAACCCGCG...TTT'),
 Seq('CAACCCGGATATGAACTGCATCTATTACAAATAAAGAAATTAAGCTGGAACGCA...ATC'),
 Seq('GCTGTCGCGAGTCACAGATGGAGAACTTACGATTGAGGCTGCAAACATGCGTCA...GAC'),
 Seq('GTCCTACTTGGAGTATGTGATGCAACTGTAACTCATTGTTTTGAACTAAAAATA...ATC'),
 Seq('CCTTCGGTGAAAGCTAACGACGATGCCAGTTTATTTTGTTTCGGAAATCGAACA...TCA'),
 Seq('GGCTTCCCTATCGGGAGGCCGTTTATGTGCCTCTCACTCCTCGAATACTTTTCA...CGA'),
 Seq('GCCTGCATAACAATGTCCTGGGATAAGTTTTATTGTCACGGAACACGAACGCCA...CTG'),
 Seq('CTACAAAAGCAATA