# Assay Binding analysis
Analyze how well my PCR primers and probes match the sequences I've found in my samples.

## Initialization, configuration and utility functions

In [1]:
# Auto-reload RCUtils before each cell runs so edits to the module take effect
# without restarting the kernel (mode 1 reloads only modules named via %aimport).
%load_ext autoreload
%autoreload 1
%aimport RCUtils

In [10]:
from Bio.Seq import Seq
from Bio import SeqIO
from Bio import Align
import RCUtils
import glob
from pathlib import Path
from collections import defaultdict
import pandas as pd

# Primer/probe set that printSeqBinding and summaryTable use by default.
# For now this is just Enterovirus primers, but could expand it to other species
# NOTE(review): display=True appears to be what prints the per-primer listing
# shown in this cell's output - confirm in RCUtils.readPrimers.
primers = RCUtils.readPrimers("qPCRPrimers.fasta", display=True)

# Shared aligner; printSeqBinding uses it to show where imperfect hits mismatch.
aligner = RCUtils.getPrimerAligner()

def printSeqBinding(path, format="fastq", primers=primers):
    """Print every primer hit found in one sequence file.

    One summary line is printed per hit, sorted by primer id: the primer id,
    a " (rev)" marker for reverse hits, the primer length, the match
    percentage, and the 1-based inclusive [start:end] range on the target.
    For imperfect hits (match ratio below 1) the primer is re-aligned against
    the record to report whether it overhangs either end of the target or
    mismatches at its own ends, and the alignment itself is printed.

    Args:
        path: Sequence file to load with SeqIO.read (which expects the file
            to hold a single record).
        format: Bio.SeqIO format name of the file.
        primers: Primers to search for; defaults to the notebook-level
            `primers` set.

    Returns:
        None. All results are printed.
    """
    # TODO: Try to print a semi-global alignment. Can use global with end_gap_score=0
    record = SeqIO.read(path, format)
    hits = RCUtils.computePrimerHits(record, primers, allowOverlaps=True)
    for hit in sorted(hits, key=lambda h: h.primer.id):
        # Round rather than truncate: float error can leave 100*mr just below
        # a whole number (100*0.57 == 56.99999999999999), and summaryTable
        # rounds its percentages too.
        matchPct = round(100 * hit.mr)
        print ("%s%s len=%d match=%d%% [%d:%d]" % (hit.primer.id, " (rev)" if hit.rev else "", len(hit.primer.seq), matchPct, hit.start+1, hit.end))
        if hit.mr >= 1:
            continue

        # Re-align the primer, in the orientation it hit, to see where it deviates.
        query = hit.primer.rcSeq if hit.rev else hit.primer.seq
        a = aligner.align(record.seq, query)[0]

        # coordinates[0] are target positions, coordinates[1] are primer positions.
        targetStart, targetEnd = a.coordinates[0][0], a.coordinates[0][-1]
        primerStart, primerEnd = a.coordinates[1][0], a.coordinates[1][-1]

        # Primer bases left unaligned at the front.
        if primerStart > 0:
            if targetStart == 0:
                print ("  Primer falls %d bases off the start of the target" % primerStart)
            else:
                print ("  Primer mismatch in first %d bases" % primerStart)

        # Primer bases left unaligned at the back.
        pt = len(hit.primer.seq) - primerEnd
        if pt > 0:
            if targetEnd == len(record):
                print ("  Primer falls %d bases off the end of the target" % pt)
            else:
                print ("  Primer mismatch in the last %d bases" % pt)

        # NOTE(review): presumably widens the alignment so the printout covers
        # the whole primer, including any overhang - confirm in RCUtils.
        RCUtils.extendAlignment(a)
        print(a)

# Show a table of primer match scores for each sequence
def summaryTable(pathGlob, format="fastq", primers=primers):
    """Build a table of primer-set match percentages, one row per sequence file.

    Each file matching `pathGlob` is read as a single record and searched for
    primer hits. Hits are grouped by primer-set name (the primer id with its
    last "-suffix" removed) and the match ratios of all hits in a group are
    multiplied together, so one poor primer drags down the whole set.

    Args:
        pathGlob: Glob pattern selecting the sequence files.
        format: Bio.SeqIO format name of the files.
        primers: Primers to search for; defaults to the notebook-level
            `primers` set.

    Returns:
        pandas.DataFrame of integer percentages (0-100), indexed by file stem
        with one column per primer set. A set with no hits in a file scores 0.
    """
    table = dict()
    # glob returns matches in arbitrary filesystem order; sort so the row
    # order is reproducible across machines and runs.
    for path in sorted(glob.glob(pathGlob)):
        record = SeqIO.read(path, format)
        hits = RCUtils.computePrimerHits(record, primers, allowOverlaps=True)
        scores = dict()
        for hit in hits:
            # Get the primer name without the suffix
            pname = hit.primer.id.rsplit("-",1)[0]
            scores[pname] = scores.get(pname, 1) * hit.mr

        # Key each row by the base filename without the suffix
        table[Path(path).stem] = scores

    df = pd.DataFrame.from_dict(table, orient='index')
    # Missing primer sets become 0, then ratios are scaled to whole percentages.
    return df.fillna(0).mul(100).round(0).astype(int)

Reading primers: qPCRPrimers.fasta
  ENTng-f (2 variations)
  ENTng-r
  ENTng-p (8 variations)
  ENTrc-f1
  ENTrc-f2
  ENTrc-r
  HRVma-f
  HRVma-r
  HRVma-p
  HRVkaV-fo (2 variations)
  HRVkaV-fi
  HRVkaV-r (768 variations)
  HRVka5-f
  HRVka5-ro
  HRVka5-ri
Read 791 primers


## Summary


In [3]:
# My samples: display() is needed because only a cell's last expression is rendered automatically.
display(summaryTable("myseqs/*.fastq"))

# Reference genomes are GenBank files, so the "gb" format is passed explicitly.
summaryTable("refseq/Rhinovirus-*.gb", "gb")

Unnamed: 0,ENTrc,HRVka5,HRVma,HRVkaV,ENTng
S44-RVA-56,95,95,85,100,100
S48-RVC-1,100,100,80,100,100
S28-RVA-23,100,96,95,95,95


Unnamed: 0,ENTrc,HRVka5,HRVma,HRVkaV,ENTng
Rhinovirus-A89,100,100,85,100,100
Rhinovirus-C1,100,100,86,100,100
Rhinovirus-A56,95,95,80,100,100
Rhinovirus-A23,100,100,81,100,100
Rhinovirus-A77,100,100,85,83,100


Overall we see that HRVma is a pretty poor match for everything except S28, which matches the qPCR experimental results. ENTng and ENTrc are generally both good.

## S28 - Rhinovirus A-23

In [11]:
# Per-primer binding detail for S28 (FASTQ is printSeqBinding's default format).
printSeqBinding("myseqs/S28-RVA-23.fastq")

ENTng-f.1 len=19 match=100% [280:298]
ENTng-p.6 len=18 match=100% [362:379]
ENTng-r (rev) len=22 match=95% [372:392]
  Primer falls 1 bases off the end of the target
target          371 CTACTTTGGGTGTCCGTGTTT- 392
                  0 |||||||||||||||||||||-  22
query             0 CTACTTTGGGTGTCCGTGTTTC  22

ENTrc-f1 len=21 match=100% [2:22]
ENTrc-r (rev) len=21 match=100% [371:391]
HRVka5-f len=21 match=100% [2:22]
HRVka5-ri (rev) len=18 match=100% [274:291]
HRVka5-ro (rev) len=23 match=96% [371:392]
  Primer falls 1 bases off the end of the target
target          370 ACTACTTTGGGTGTCCGTGTTT- 392
                  0 ||||||||||||||||||||||-  23
query             0 ACTACTTTGGGTGTCCGTGTTTC  23

HRVkaV-fi len=22 match=95% [372:392]
  Primer falls 1 bases off the end of the target
target          371 CTACTTTGGGTGTCCGTGTTT- 392
                  0 |||||||||||||||||||||-  22
query             0 CTACTTTGGGTGTCCGTGTTTC  22

HRVkaV-fo.1 len=20 match=100% [279:298]
HRVma-f len=19 match=100% [236:25

## S44 - Rhinovirus A-56

In [5]:
# Per-primer binding detail for S44 (FASTQ is printSeqBinding's default format).
printSeqBinding("myseqs/S44-RVA-56.fastq")

ENTng-f.1 len=19 match=100% [304:322]
ENTng-p.6 len=18 match=100% [386:403]
ENTng-r (rev) len=22 match=100% [396:417]
ENTrc-f1 len=21 match=95% [24:44]
target           23 CAAGCACTTCTGTCTCCCCGG 44
                  0 |||||||||||||.||||||| 21
query             0 CAAGCACTTCTGTTTCCCCGG 21

ENTrc-r (rev) len=21 match=100% [395:415]
HRVka5-f len=21 match=95% [24:44]
target           23 CAAGCACTTCTGTCTCCCCGG 44
                  0 |||||||||||||.||||||| 21
query             0 CAAGCACTTCTGTTTCCCCGG 21

HRVka5-ri (rev) len=18 match=100% [298:315]
HRVka5-ro (rev) len=23 match=100% [395:417]
HRVkaV-fi len=22 match=100% [396:417]
HRVkaV-fo.1 len=20 match=100% [303:322]
HRVma-f len=19 match=89% [260:278]
target          259 TTGACAAGGTGTGAAGAGC 278
                  0 |.||||.||||||||||||  19
query             0 TGGACAGGGTGTGAAGAGC  19

HRVma-p len=18 match=100% [298:315]
HRVma-r (rev) len=20 match=95% [384:403]
target          383 GGATGGGACCAACTACTTTG 403
                  0 ||||||||||.|||||||||  20

## S48 - Rhinovirus C-1

In [6]:
# Per-primer binding detail for S48 (FASTQ is printSeqBinding's default format).
printSeqBinding("myseqs/S48-RVC-1.fastq")

ENTng-f.1 len=19 match=100% [284:302]
ENTng-p.4 len=18 match=100% [366:383]
ENTng-r (rev) len=22 match=100% [376:397]
ENTrc-f1 len=21 match=100% [2:22]
ENTrc-r (rev) len=21 match=100% [375:395]
HRVka5-f len=21 match=100% [2:22]
HRVka5-ri (rev) len=18 match=100% [278:295]
HRVka5-ro (rev) len=23 match=100% [375:397]
HRVkaV-fi len=22 match=100% [376:397]
HRVkaV-fo.1 len=20 match=100% [283:302]
HRVma-f len=19 match=89% [240:257]
  Primer mismatch in the last 1 bases
target          239 TGGACAAGGTGTGAAGAGT 258
                  0 ||||||.|||||||||||.  19
query             0 TGGACAGGGTGTGAAGAGC  19

HRVma-p len=18 match=100% [278:295]
HRVma-r (rev) len=20 match=90% [364:383]
target          363 GGATGGAACCAACTACTTTG 383
                  0 ||||||.|||.|||||||||  20
query             0 GGATGGGACCGACTACTTTG  20



Here we see that the critical final base of the HRVma-f primer mismatches our S48 RVC-1 sequence, and there are three other mismatches across HRVma-f and HRVma-r. This explains why I couldn't get S48 to test positive via HRV qPCR. However, S48 does seem to be a perfect match for ENTng, so I still don't know why that assay was so unreliable for this sample.

## RefSeq Rhinovirus C-1

Test against the full C-1 genome since earlier primers and probes weren't designed for Rhinovirus C.

In [7]:
# GenBank reference record, so the "gb" format is passed explicitly.
printSeqBinding("refseq/Rhinovirus-C1.gb", "gb")

ENTng-f.1 len=19 match=100% [310:328]
ENTng-p.4 len=18 match=100% [392:409]
ENTng-r (rev) len=22 match=100% [402:423]
ENTrc-f1 len=21 match=100% [28:48]
ENTrc-r (rev) len=21 match=100% [401:421]
HRVka5-f len=21 match=100% [28:48]
HRVka5-ri (rev) len=18 match=100% [304:321]
HRVka5-ro (rev) len=23 match=100% [401:423]
HRVkaV-fi len=22 match=100% [402:423]
HRVkaV-fo.1 len=20 match=100% [309:328]
HRVkaV-r.102 (rev) len=23 match=100% [908:930]
HRVma-f len=19 match=95% [266:284]
target          265 TGGACAAGGTGTGAAGAGC 284
                  0 ||||||.||||||||||||  19
query             0 TGGACAGGGTGTGAAGAGC  19

HRVma-p len=18 match=100% [304:321]
HRVma-r (rev) len=20 match=90% [390:409]
target          389 GGATGGAACCAACTACTTTG 409
                  0 ||||||.|||.|||||||||  20
query             0 GGATGGGACCGACTACTTTG  20



The HRVka primer sets (HRVka5 and HRVkaV) match perfectly.

## RefSeq Rhinovirus A-89

Test against the full A-89 genome since this is what's often used as the common coordinate system.

In [8]:
# GenBank reference record, so the "gb" format is passed explicitly.
printSeqBinding("refseq/Rhinovirus-A89.gb", "gb")

ENTng-f.1 len=19 match=100% [448:466]
ENTng-p.6 len=18 match=100% [530:547]
ENTng-r (rev) len=22 match=100% [540:561]
ENTrc-f1 len=21 match=100% [165:185]
ENTrc-r (rev) len=21 match=100% [539:559]
HRVka5-f len=21 match=100% [165:185]
HRVka5-ri (rev) len=18 match=100% [442:459]
HRVka5-ro (rev) len=23 match=100% [539:561]
HRVkaV-fi len=22 match=100% [540:561]
HRVkaV-fo.1 len=20 match=100% [447:466]
HRVkaV-r.354 (rev) len=23 match=100% [1054:1076]
HRVma-f len=19 match=89% [404:422]
target          403 TTGACAAGGTGTGAAGAGC 422
                  0 |.||||.||||||||||||  19
query             0 TGGACAGGGTGTGAAGAGC  19

HRVma-p len=18 match=100% [442:459]
HRVma-r (rev) len=20 match=95% [528:547]
target          527 GGATGGGACCAACTACTTTG 547
                  0 ||||||||||.|||||||||  20
query             0 GGATGGGACCGACTACTTTG  20



## Wisdom primers

See how the primers from this older paper compare: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2786677/



In [9]:
# Load the Wisdom primer set under its own name so the default `primers` stays untouched.
wisdomPrimers = RCUtils.readPrimers("HRV-Wisdom-primers.fasta", display=True)
# Detailed binding against the A-89 reference (GenBank, hence "gb").
printSeqBinding("refseq/Rhinovirus-A89.gb", "gb", wisdomPrimers)

# Per-primer-set summary across all the reference genomes, for comparison with the table in the Summary section.
summaryTable("refseq/Rhinovirus-*.gb", "gb", wisdomPrimers)

Reading primers: HRV-Wisdom-primers.fasta
  HRV-Wi-5UTR-fo (48 variations)
  HRV-Wi-5UTR-ro
  HRV-Wi-5UTR-f1i (16 variations)
  HRV-Wi-5UTR-f2i (4 variations)
  HRV-Wi-5UTR-f3i (4 variations)
  HRV-Wi-5UTR-ri (2 variations)
  HRV-Wi-VP4VP2-fo (2 variations)
  HRV-Wi-VP4VP2-fi (2 variations)
  HRV-Wi-VP4VP2-ri (192 variations)
  HRV-Wi-VP4VP2-ro (768 variations)
  HRV-Wi-VP1-fo (96 variations)
  HRV-Wi-VP1-fi (64 variations)
  HRV-Wi-VP1-ri (96 variations)
  HRV-Wi-VP1-ro (48 variations)
Read 1343 primers
HRV-Wi-5UTR-f1i.7 len=21 match=100% [354:374]
HRV-Wi-5UTR-fo.39 len=22 match=100% [164:185]
HRV-Wi-5UTR-ri.0 (rev) len=21 match=100% [446:466]
HRV-Wi-5UTR-ro (rev) len=23 match=100% [539:561]
HRV-Wi-VP1-fi.56 len=19 match=89% [2441:2459]
target         2440 TGGATGCTGCGGAAACCGG 2459
                  0 ||||||||||.|||||.||   19
query             0 TGGATGCTGCAGAAACAGG   19

HRV-Wi-VP1-fo.75 len=23 match=100% [1966:1988]
HRV-Wi-VP1-ri.94 (rev) len=26 match=96% [3319:3344]
target         33

Unnamed: 0,HRV-Wi-5UTR,HRV-Wi-VP4VP2,HRV-Wi-VP1
Rhinovirus-A89,100,100,85
Rhinovirus-C1,100,96,83
Rhinovirus-A56,100,77,100
Rhinovirus-A23,100,100,89
Rhinovirus-A77,95,100,96


Overall these primers aren't anything too special; there's no compelling reason to switch from the HRVka primers.