# Assay Binding analysis
Analyze how well my PCR primers and probes match the sequences I've found in my samples.

## Initialization, configuration and utility functions

In [2]:
from Bio.Seq import Seq
from Bio import SeqIO
from Bio import Align

%load_ext autoreload
%autoreload 1
%aimport RCUtils

# For now this is just Enterovirus primers, but could expand it to other species
primers = RCUtils.readPrimers("qPCRPrimers.fasta", display=True)

aligner = Align.PairwiseAligner(mode='local', match_score=1, mismatch_score=0, gap_score=-1)

def printSeqBinding(path, format="fastq"):
    # TODO: Try to print a semi-global alignment. Can use global with end_gap_score=0    
    record = SeqIO.read(path, format)
    hits = RCUtils.computePrimerHits(record, primers, allowOverlaps=True)
    for hit in sorted(hits, key=lambda hit: hit.primer.id):
        print ("%s len=%d match=%d%% [%d:%d]" % (hit.primer.id, len(hit.primer.seq), 100*hit.mr, hit.start, hit.end))
        if hit.mr < 1:
            a = aligner.align(record.seq, hit.primer.seq, strand="-" if hit.rev else "+")[0]
            if a.coordinates[1][0] > 0:
                if a.coordinates[0][0] == 0:
                    print ("  Primer falls %d bases off the start of the sequence" % a.coordinates[1][0])
                else:
                    print ("  Primer mismatch in first %d bases" % a.coordinates[1][0])
            pt = len(hit.primer.seq) - a.coordinates[1][-1]
            if pt > 0:
                if a.coordinates[0][-1] == len(record):
                    print ("  Primer falls %d bases off the end of the sequence" % pt)
                else:
                    print ("  Primer mismatch in the last %d bases" % pt)
            print(a)

Reading primers: qPCRPrimers.fasta
  ENTng-f (2 variations)
  ENTng-r
  ENTng-p (8 variations)
  ENTrc-f1
  ENTrc-f2
  ENTrc-r
  HRVma-f
  HRVma-r
  HRVma-p
  HRVkaV-fo (2 variations)
  HRVkaV-fi
  HRVkaV-r (768 variations)
  HRVka5-f
  HRVka5-ro
  HRVka5-ri
Read 791 primers


## Summary


In [2]:
import glob
from collections import defaultdict
import pandas as pd

# Show a table of primer match scores for each sequence
table = dict()
for path in glob.glob("myseqs/*.fastq"):
    record = SeqIO.read(path, "fastq")
    hits = RCUtils.computePrimerHits(record, primers, allowOverlaps=True)
    scores = dict()
    for hit in hits:
        # Get the primer name without the suffix
        pname = hit.primer.id.split("-")[0]
        if pname in scores:
            scores[pname] *= hit.mr
        else:
            scores[pname] = hit.mr
        
    table[record.id] = scores

df = pd.DataFrame.from_dict(table, orient='index')
df *= 100
df = df.round(0).astype(int)
df

Unnamed: 0,ENTrc,HRVka5,HRVma,HRVkaV,ENTng
S44-RVA-56,95,95,85,100,100
S48-RVC-1,100,100,80,100,100
S28-RVA-23,100,96,95,95,95


Overall we see HRVMa is a pretty poor match for everything except S28, which matches qPCR experimental results. ENTng and ENTrc are generally both good.

# S28 - Rhinovirus A-23

In [6]:
printSeqBinding("myseqs/S28-RVA-23.fastq")

ENTng-f.1 len=19 match=100% [279:298]
ENTng-p.6 len=18 match=100% [361:379]
ENTng-r len=22 match=95% [371:392]
  Primer falls 1 bases off the end of the sequence
target          371 CTACTTTGGGTGTCCGTGTTT 392
                  0 |||||||||||||||||||||  21
query             0 CTACTTTGGGTGTCCGTGTTT  21

ENTrc-f1 len=21 match=100% [1:22]
ENTrc-r len=21 match=100% [370:391]
HRVka5-f len=21 match=100% [1:22]
HRVka5-ri len=18 match=100% [273:291]
HRVka5-ro len=23 match=96% [370:392]
  Primer mismatch in first 23 bases
  Primer falls 22 bases off the end of the sequence
target          370 ACTACTTTGGGTGTCCGTGTTT 392
                  0 ||||||||||||||||||||||  22
query            23 ACTACTTTGGGTGTCCGTGTTT   1

HRVkaV-fi len=22 match=95% [371:392]
  Primer falls 1 bases off the end of the sequence
target          371 CTACTTTGGGTGTCCGTGTTT 392
                  0 |||||||||||||||||||||  21
query             0 CTACTTTGGGTGTCCGTGTTT  21

HRVkaV-fo.1 len=20 match=100% [278:298]
HRVma-f len=19 match=10

## S44 - Rhinovirus A-56

In [7]:
printSeqBinding("myseqs/S44-RVA-56.fastq")

ENTng-f.1 len=19 match=100% [303:322]
ENTng-p.6 len=18 match=100% [385:403]
ENTng-r len=22 match=100% [395:417]
ENTrc-f1 len=21 match=95% [23:44]
target           23 CAAGCACTTCTGTCTCCCCGG 44
                  0 |||||||||||||.||||||| 21
query             0 CAAGCACTTCTGTTTCCCCGG 21

ENTrc-r len=21 match=100% [394:415]
HRVka5-f len=21 match=95% [23:44]
target           23 CAAGCACTTCTGTCTCCCCGG 44
                  0 |||||||||||||.||||||| 21
query             0 CAAGCACTTCTGTTTCCCCGG 21

HRVka5-ri len=18 match=100% [297:315]
HRVka5-ro len=23 match=100% [394:417]
HRVkaV-fi len=22 match=100% [395:417]
HRVkaV-fo.1 len=20 match=100% [302:322]
HRVma-f len=19 match=89% [259:278]
target          259 TTGACAAGGTGTGAAGAGC 278
                  0 |.||||.||||||||||||  19
query             0 TGGACAGGGTGTGAAGAGC  19

HRVma-p len=18 match=100% [297:315]
HRVma-r len=20 match=95% [383:403]
target          383 GGATGGGACCAACTACTTTG 403
                  0 ||||||||||.|||||||||  20
query             0 GGATGGGAC

## S48-RC-1 - Rhinovirus C-1

In [5]:
printSeqBinding("myseqs/S48-RVC-1.fastq")

ENTng-f.1 len=19 match=100% [283:302]
ENTng-p.4 len=18 match=100% [365:383]
ENTng-r len=22 match=100% [375:397]
ENTrc-f1 len=21 match=100% [1:22]
ENTrc-r len=21 match=100% [374:395]
HRVka5-f len=21 match=100% [1:22]
HRVka5-ri len=18 match=100% [277:295]
HRVka5-ro len=23 match=100% [374:397]
HRVkaV-fi len=22 match=100% [375:397]
HRVkaV-fo.1 len=20 match=100% [282:302]
HRVma-f len=19 match=89% [239:257]
  Primer mismatch in the last 1 bases
target          239 TGGACAAGGTGTGAAGAG 257
                  0 ||||||.|||||||||||  18
query             0 TGGACAGGGTGTGAAGAG  18

HRVma-p len=18 match=100% [277:295]
HRVma-r len=20 match=90% [363:383]
target          363 GGATGGAACCAACTACTTTG 383
                  0 ||||||.|||.|||||||||  20
query             0 GGATGGGACCGACTACTTTG  20



Here we see the critical final base of the HRVMA-f primer mismatches our S48 RVC-1 sequence, and there are three other mismatches. This explains why I couldn't get S48 to test positive via HRV qPCR. However, it does seem to be a perfect match for ENTng, so I still don't know why I had so much trouble with that assay being unreliable for this sample.

## RefSeq Rhinovirus C-1

Test against the full C-1 genome since earlier primers and probes weren't designed for Rhinovirus C.

In [3]:
printSeqBinding("refseq/Rhinovirus-C1.gb", "gb")

ENTng-f.1 len=19 match=100% [309:328]
ENTng-p.4 len=18 match=100% [391:409]
ENTng-r len=22 match=100% [401:423]
ENTrc-f1 len=21 match=100% [27:48]
ENTrc-r len=21 match=100% [400:421]
HRVka5-f len=21 match=100% [27:48]
HRVka5-ri len=18 match=100% [303:321]
HRVka5-ro len=23 match=100% [400:423]
HRVkaV-fi len=22 match=100% [401:423]
HRVkaV-fo.1 len=20 match=100% [308:328]
HRVkaV-r.102 len=23 match=100% [907:930]
HRVma-f len=19 match=95% [265:284]
target          265 TGGACAAGGTGTGAAGAGC 284
                  0 ||||||.||||||||||||  19
query             0 TGGACAGGGTGTGAAGAGC  19

HRVma-p len=18 match=100% [303:321]
HRVma-r len=20 match=90% [389:409]
target          389 GGATGGAACCAACTACTTTG 409
                  0 ||||||.|||.|||||||||  20
query             0 GGATGGGACCGACTACTTTG  20



The KRV ka primer sets match perfectly.