# Assay Binding analysis
Analyze how well my PCR primers and probes match the sequences I've found in my samples.

## Initialization, configuration and utility functions

In [19]:
from Bio.Seq import Seq
from Bio import SeqIO
from Bio import Align

import RCUtils

# Primer set under analysis. Only Enterovirus primers for now, but this could
# be expanded to other species.
primers = RCUtils.readPrimers("qPCRPrimers.fasta")
print("Read %i primers" % len(primers))

# Local pairwise aligner used to display imperfect primer matches:
# a match scores 1, a mismatch scores 0 and gaps score -1.
aligner = Align.PairwiseAligner(
    mode='local',
    match_score=1,
    mismatch_score=0,
    gap_score=-1,
)

def printSeqBinding(path):
    """Print how each primer hit binds to the FASTQ record stored at `path`.

    One summary line (primer id, primer length, match percentage) is printed
    for every hit returned by RCUtils.computePrimerHits. For hits whose match
    ratio `hit.mr` is below 1, the best local alignment of the primer against
    the record is printed as well, preceded by notes about any primer bases
    left unaligned at the start or end of the primer.

    Uses the notebook-level `primers` list and `aligner`.

    Args:
        path: path of a FASTQ file, read with SeqIO.read.
    """
    # TODO: Try to print a semi-global alignment. Can use global with end_gap_score=0
    record = SeqIO.read(path, "fastq")
    for hit in RCUtils.computePrimerHits(record, primers, allowOverlaps=True):
        primerLen = len(hit.primer.seq)
        print("%s len=%d match=%d%%" % (hit.primer.id, primerLen, 100*hit.mr))

        # Only imperfect matches get a detailed alignment.
        if not hit.mr < 1:
            continue

        strand = "-" if hit.rev else "+"
        best = aligner.align(record.seq, hit.primer.seq, strand=strand)[0]
        # Row 0 holds coordinates in the record (target), row 1 in the primer (query).
        # NOTE(review): for strand="-" Biopython may report the query coordinates
        # in decreasing order -- TODO confirm the offsets below hold for reverse hits.
        targetCoords = best.coordinates[0]
        queryCoords = best.coordinates[1]

        # Primer bases left out of the alignment at its start.
        skippedAtStart = queryCoords[0]
        if skippedAtStart > 0:
            # The alignment begins at the very first base of the record, so the
            # missing primer bases would lie before the sequence starts.
            startMsg = ("  Primer falls %d bases off the start of the sequence"
                        if targetCoords[0] == 0
                        else "  Primer mismatch in first %d bases")
            print(startMsg % skippedAtStart)

        # Primer bases left out of the alignment at its end.
        skippedAtEnd = primerLen - queryCoords[-1]
        if skippedAtEnd > 0:
            # The alignment runs to the last base of the record, so the missing
            # primer bases would lie past the end of the sequence.
            endMsg = ("  Primer falls %d bases off the end of the sequence"
                      if targetCoords[-1] == len(record)
                      else "  Primer mismatch in the last %d bases")
            print(endMsg % skippedAtEnd)

        print(best)

Read 17 primers


## Summary


In [33]:
import glob
from collections import defaultdict
import pandas as pd

# Show a table of primer match scores for each sequence.
# Rows are record ids, columns are primer names with the "-suffix" removed,
# and each value is the product of the match ratios of all hits sharing that
# name, expressed as a whole-number percentage.
table = dict()
# glob returns paths in arbitrary order; sort them so the row order is
# reproducible from run to run.
for path in sorted(glob.glob("myseqs/*.fastq")):
    record = SeqIO.read(path, "fastq")
    hits = RCUtils.computePrimerHits(record, primers, allowOverlaps=True)
    scores = dict()
    for hit in hits:
        # Get the primer name without the suffix
        pname = hit.primer.id.split("-")[0]
        # Multiply the ratios of all hits that share this name.
        scores[pname] = scores.get(pname, 1) * hit.mr

    table[record.id] = scores

df = pd.DataFrame.from_dict(table, orient='index')
# A sequence with no hit for some primer name leaves NaN in that cell, and a
# plain astype(int) would raise on it. The nullable Int64 dtype keeps such
# cells as <NA> instead.
df = (df * 100).round(0).astype("Int64")
df

Unnamed: 0,ENTng,ENTrc,HRVma
S45-51-RC1,100,100,80
S44-RC1,100,95,85
S28-RC1,95,100,95


Overall we see HRVma is a pretty poor match for everything except S28, which matches the qPCR experimental results. ENTng and ENTrc are generally both good.

## S28 - Rhinovirus A-23

In [2]:
# Print primer binding details for the S28 sequence file.
printSeqBinding("myseqs/S28-RVA-23.fastq")

ENTng-f.1 len=19 match=100%
ENTng-p.6 len=18 match=100%
ENTrc-f1 len=21 match=100%
ENTrc-r len=21 match=100%
HRVma-f len=19 match=100%
HRVma-p len=18 match=100%
ENTng-r len=22 match=95%
  Primer falls 1 bases off the end of the sequence
target          371 CTACTTTGGGTGTCCGTGTTT 392
                  0 |||||||||||||||||||||  21
query             0 CTACTTTGGGTGTCCGTGTTT  21

HRVma-r len=20 match=95%
target          359 GGATGGGACCAACTACTTTG 379
                  0 ||||||||||.|||||||||  20
query             0 GGATGGGACCGACTACTTTG  20



## S44 - Rhinovirus A-56

In [3]:
# Print primer binding details for the S44 sequence file.
printSeqBinding("myseqs/S44-RVA-56.fastq")

ENTng-f.1 len=19 match=100%
ENTng-r len=22 match=100%
ENTng-p.6 len=18 match=100%
ENTrc-r len=21 match=100%
HRVma-p len=18 match=100%
ENTrc-f1 len=21 match=95%
target           23 CAAGCACTTCTGTCTCCCCGG 44
                  0 |||||||||||||.||||||| 21
query             0 CAAGCACTTCTGTTTCCCCGG 21

HRVma-r len=20 match=95%
target          383 GGATGGGACCAACTACTTTG 403
                  0 ||||||||||.|||||||||  20
query             0 GGATGGGACCGACTACTTTG  20

HRVma-f len=19 match=89%
target          259 TTGACAAGGTGTGAAGAGC 278
                  0 |.||||.||||||||||||  19
query             0 TGGACAGGGTGTGAAGAGC  19



## S45-51 - Rhinovirus C-1

In [18]:
# Print primer binding details for the S45-51 sequence file.
printSeqBinding("myseqs/S45-51-RVC-1.fastq")

ENTng-f.1 len=19 match=100%
ENTng-r len=22 match=100%
ENTng-p.4 len=18 match=100%
ENTrc-f1 len=21 match=100%
ENTrc-r len=21 match=100%
HRVma-p len=18 match=100%
HRVma-r len=20 match=90%
target          363 GGATGGAACCAACTACTTTG 383
                  0 ||||||.|||.|||||||||  20
query             0 GGATGGGACCGACTACTTTG  20

HRVma-f len=19 match=89%
  Primer mismatch in the last 1 bases
target          239 TGGACAAGGTGTGAAGAG 257
                  0 ||||||.|||||||||||  18
query             0 TGGACAGGGTGTGAAGAG  18

