In [14]:
from collections import defaultdict
from Bio import Align
from Bio import SeqIO
import matplotlib_inline.backend_inline
import os
import pandas as pd
import RCUtils

# Directory containing the per-barcode FASTQ files; joined with a file name
# for each displayBarcodes() call below.
fastQBaseDir = "../Q8/barcode-debug/Q8a_fast_0-fastq/"
# Sequence tested with `in` against each read's `seq` to decide whether the
# read counts as an OC43 read.
OC43Primer = "CTAGTGCAGGATCGCGTAGTAG"
# Records parsed from ONTBarcodes.fasta, materialised as a list so they can be
# iterated more than once; passed to RCUtils.computePrimerHits as the
# candidate primers and used to order the result table.
ONTBarcodes = list(SeqIO.parse("ONTBarcodes.fasta", format="fasta"))

def displayBarcodes(fastQFile):
    """Summarise which barcode primers appear in the OC43 reads of one FASTQ file.

    Every read yielded by ``RCUtils.readFastQ(fastQFile)`` is counted. Reads
    whose ``seq`` contains ``OC43Primer`` are additionally passed to
    ``RCUtils.computePrimerHits`` together with ``ONTBarcodes`` (overlaps
    allowed, match threshold 0.80), and each returned hit increments a
    per-primer-id tally.

    Output (nothing is returned):
      * a "Processing ..." line and a "Reads: ..., OC43Reads: ..." line on stdout;
      * a DataFrame indexed by primer id with a "Count" column, restricted to
        primers with at least one hit and sorted by count in descending order,
        rendered via ``display``.

    Args:
        fastQFile: path handed unchanged to ``RCUtils.readFastQ``.
    """
    print(f"Processing {fastQFile}")

    reads = 0
    OC43Reads = 0
    hitsPerPrimer = defaultdict(int)

    for record in RCUtils.readFastQ(fastQFile):
        reads += 1
        # Only reads carrying the OC43 primer are searched for barcodes.
        if OC43Primer not in record.seq:
            continue
        OC43Reads += 1
        for match in RCUtils.computePrimerHits(
            record, ONTBarcodes, allowOverlaps=True, matchThreshold=0.80
        ):
            hitsPerPrimer[match.primer.id] += 1

    print(f"Reads: {reads}, OC43Reads: {OC43Reads}")

    # Walk ONTBarcodes (not the tally) so rows start out in FASTA order and
    # primers without any hit are left out.
    detected = [
        (barcode.id, hitsPerPrimer[barcode.id])
        for barcode in ONTBarcodes
        if hitsPerPrimer.get(barcode.id, 0) > 0
    ]
    table = {
        "Primer": [primerId for primerId, _ in detected],
        "Count": [count for _, count in detected],
    }

    summary = (
        pd.DataFrame(data=table)
        .set_index(["Primer"])
        .sort_values("Count", ascending=False)
    )
    display(summary)

# Report barcode hits for each demultiplexed FASTQ file of interest, in order.
for fastQName in ("Q8a_fast_0_barcode15.fastq.gz", "Q8a_fast_0_barcode18.fastq.gz"):
    displayBarcodes(os.path.join(fastQBaseDir, fastQName))


Processing ../Q8/barcode-debug/Q8a_fast_0-fastq/Q8a_fast_0_barcode15.fastq.gz
Reads: 98310, OC43Reads: 4


Unnamed: 0_level_0,Count
Primer,Unnamed: 1_level_1
NB15f,3
NB15r,1
NB18r,1


Processing ../Q8/barcode-debug/Q8a_fast_0-fastq/Q8a_fast_0_barcode18.fastq.gz
Reads: 9239, OC43Reads: 1109


Unnamed: 0_level_0,Count
Primer,Unnamed: 1_level_1
NB18f,1071
NB18r,354
NB16r,4
NB21r,4
NB05r,2
NB13r,2
NB15r,2
NB01r,1
NB04r,1
NB06f,1
