# Assay Binding analysis
Analyze how well my PCR primers and probes match the sequences I've found in my samples.

## Initialization, configuration and utility functions

In [1]:
# Enable IPython's autoreload extension in mode 1: only modules registered with %aimport
# (here just RCUtils) are reloaded before each cell runs, so edits to RCUtils are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 1
%aimport RCUtils

In [2]:
from Bio.Seq import Seq
from Bio import SeqIO
from Bio import Align
import RCUtils
import glob
from pathlib import Path
from collections import defaultdict
import pandas as pd

# For now this is just Enterovirus primers, but could expand it to other species
# NOTE: `primers` is captured as the default argument of printSeqBinding and summaryTable at
# the time those functions are defined, so re-assigning it later does not change their default.
# display=True presumably produces the "Reading primers" listing shown in this cell's output.
primers = RCUtils.readPrimers("HRVPrimers.fasta", display=True)

# Shared aligner; printSeqBinding uses it to show the alignment for any imperfect primer hit
aligner = RCUtils.getPrimerAligner()

def printSeqBinding(path, format="fastq", primers=primers):
    """Print the primer hits found in a single-record sequence file.

    One line is printed per hit (sorted by primer id) giving the primer id, a
    " (rev)" marker for reverse hits, the primer length, the match percentage
    and the hit range. For any hit whose match ratio is below 1, the primer
    (or its reverse complement for reverse hits) is aligned against the whole
    record with the module-level `aligner`, a note is printed when the ends
    of the primer are not covered by the alignment, and the alignment itself
    is printed.

    Parameters:
        path: path of the sequence file; it is read with SeqIO.read, which
            expects exactly one record.
        format: SeqIO format name of the file.
        primers: primers to search for; defaults to the module-level set.
    """
    print("Sequence: %s" % path)
    record = SeqIO.read(path, format)
    primerHits = RCUtils.computePrimerHits(record, primers, allowOverlaps=True)

    for hit in sorted(primerHits, key=lambda h: h.primer.id):
        strand = " (rev)" if hit.rev else ""
        summary = (hit.primer.id, strand, len(hit.primer.seq), 100*hit.mr, hit.start+1, hit.end)
        print(" %s%s len=%d match=%d%% [%d:%d]" % summary)

        # Perfect matches need no further explanation
        if not hit.mr < 1:
            continue

        if hit.rev:
            query = hit.primer.rcSeq
        else:
            query = hit.primer.seq
        alignment = aligner.align(record.seq, query)[0]

        # Primer bases before the first aligned query position
        leading = alignment.coordinates[1][0]
        if leading > 0:
            atTargetStart = alignment.coordinates[0][0] == 0
            message = ("  Primer falls %d bases off the start of the target" if atTargetStart
                       else "  Primer mismatch in first %d bases")
            print(message % leading)

        # Primer bases after the last aligned query position
        trailing = len(hit.primer.seq) - alignment.coordinates[1][-1]
        if trailing > 0:
            atTargetEnd = alignment.coordinates[0][-1] == len(record)
            message = ("  Primer falls %d bases off the end of the target" if atTargetEnd
                       else "  Primer mismatch in the last %d bases")
            print(message % trailing)

        # The coordinates above are read before extendAlignment modifies the alignment
        RCUtils.extendAlignment(alignment)
        print(alignment)
    print()

# Show a table of primer match scores for each sequence
def summaryTable(pathGlob, format="fastq", primers=primers):
    """Build a table of per-primer-set match scores for every file matching a glob.

    Each matching file is read as a single record and searched for primer
    hits. Hits are grouped by primer id with everything after the last "-"
    removed (e.g. "HRVma-f" and "HRVma-p" both count towards "HRVma"), and the
    match ratios of all hits in a group are multiplied together.

    Parameters:
        pathGlob: glob pattern selecting the sequence files.
        format: SeqIO format name of the files.
        primers: primers to search for; defaults to the module-level set.

    Returns:
        A DataFrame of integer percentages with one row per file (indexed by
        the file name without its extension, in sorted path order) and one
        column per primer group. Groups with no hit in a file score 0.
    """
    table = dict()
    # glob.glob returns matches in arbitrary (filesystem-dependent) order; sort the
    # paths so the rows come out in the same order on every run and every machine.
    for path in sorted(glob.glob(pathGlob)):
        record = SeqIO.read(path, format)
        hits = RCUtils.computePrimerHits(record, primers, allowOverlaps=True)
        scores = dict()
        for hit in hits:
            # Get the primer name without the suffix
            pname = hit.primer.id.rsplit("-",1)[0]
            # Start each group at 1 so its score is the product of its hits' match ratios
            scores[pname] = scores.get(pname, 1) * hit.mr

        # Key each row by the base filename without the suffix
        table[Path(path).stem] = scores

    df = pd.DataFrame.from_dict(table, orient='index')
    # Missing groups become 0, then ratios are converted to whole-number percentages
    return df.fillna(0).mul(100).round(0).astype(int)

Reading primers: HRVPrimers.fasta
  ENTng-f (2 variations)
  ENTng-r
  ENTng-p (8 variations)
  ENTrc-f1
  ENTrc-f2
  ENTrc-r
  HRVma-f
  HRVma-r
  HRVma-p
  HRVkaV-fo (2 variations)
  HRVkaV-fi
  HRVkaV-r (768 variations)
  HRVka5-f
  HRVka5-ro
  HRVka5-ri
  HRVbo-f (4 variations)
  HRVbo-r
  HRVbo-p
Read 797 primers


## Summary

Note that the myseqs sequences are amplicons generated from these same (or similar) primers, and so often will not have hits for many of the primers.

In [3]:
# Score table for my own sequences (FASTQ); display() is needed because this is not the cell's last expression
display(summaryTable("myseqs/*.fastq"))

# Same table for the RefSeq GenBank records; as the cell's last expression it is rendered automatically
summaryTable("refseq/Rhinovirus-*.gb", "gb")

Unnamed: 0,HRVbo,HRVma,HRVka5,HRVkaV,ENTng
S59-RVA-77,100,89,100,100,100
S44-RVA-56,100,89,100,100,100
S48-RVC-1,100,89,100,100,100
S28-RVA-23,100,100,100,100,100


Unnamed: 0,ENTrc,HRVka5,HRVbo,HRVma,HRVkaV,ENTng,random
Rhinovirus-A89,100,100,100,85,100,100,80
Rhinovirus-C1,100,100,100,86,100,100,0
Rhinovirus-A56,95,95,100,80,100,100,0
Rhinovirus-A23,100,100,100,81,100,100,0
Rhinovirus-A77,100,100,100,85,83,100,0


Overall we see that HRVma is a pretty poor match for everything except S28, which is consistent with my qPCR experimental results. Everything else is generally good.

## S28 - Rhinovirus A-23

In [4]:
# Primer binding for my S28 sequence, then for the RefSeq Rhinovirus A23 record (GenBank format)
printSeqBinding("myseqs/S28-RVA-23.fastq")
printSeqBinding("refseq/Rhinovirus-A23.gb", "gb")

Sequence: myseqs/S28-RVA-23.fastq
 ENTng-f.1 len=19 match=100% [258:276]
 HRVbo-f.3 len=15 match=100% [169:183]
 HRVbo-p len=18 match=100% [252:269]
 HRVka5-ri (rev) len=18 match=100% [252:269]
 HRVkaV-fo.1 len=20 match=100% [257:276]
 HRVma-f len=19 match=100% [214:232]
 HRVma-p len=18 match=100% [252:269]

Sequence: refseq/Rhinovirus-A23.gb
 ENTng-f.1 len=19 match=100% [365:383]
 ENTng-p.2 (rev) len=18 match=100% [2:19]
 ENTng-p.2 len=18 match=100% [447:464]
 ENTng-r (rev) len=22 match=100% [457:478]
 ENTrc-f1 len=21 match=100% [86:106]
 ENTrc-r (rev) len=21 match=100% [456:476]
 HRVbo-f.3 len=15 match=100% [276:290]
 HRVbo-p len=18 match=100% [359:376]
 HRVbo-r (rev) len=19 match=100% [460:478]
 HRVka5-f len=21 match=100% [86:106]
 HRVka5-ri (rev) len=18 match=100% [359:376]
 HRVka5-ro (rev) len=23 match=100% [456:478]
 HRVkaV-fi len=22 match=100% [457:478]
 HRVkaV-fo.1 len=20 match=100% [364:383]
 HRVkaV-r.521 (rev) len=23 match=100% [970:992]
 HRVma-f len=19 match=100% [321:339]
 

## S44 - Rhinovirus A-56

In [5]:
# Primer binding for my S44 sequence, then for the RefSeq Rhinovirus A56 record (GenBank format)
printSeqBinding("myseqs/S44-RVA-56.fastq")
printSeqBinding("refseq/Rhinovirus-A56.gb", "gb")

Sequence: myseqs/S44-RVA-56.fastq
 ENTng-f.1 len=19 match=100% [260:278]
 HRVbo-f.3 len=15 match=100% [171:185]
 HRVbo-p len=18 match=100% [254:271]
 HRVka5-ri (rev) len=18 match=100% [254:271]
 HRVkaV-fo.1 len=20 match=100% [259:278]
 HRVma-f len=19 match=89% [216:234]
target          215 TTGACAAGGTGTGAAGAGC 234
                  0 |.||||.||||||||||||  19
query             0 TGGACAGGGTGTGAAGAGC  19

 HRVma-p len=18 match=100% [254:271]

Sequence: refseq/Rhinovirus-A56.gb
 ENTng-f.1 len=19 match=100% [441:459]
 ENTng-p.6 len=18 match=100% [523:540]
 ENTng-r (rev) len=22 match=100% [533:554]
 ENTrc-f1 len=21 match=95% [161:181]
target          160 CAAGCACTTCTGTCTCCCCGG 181
                  0 |||||||||||||.|||||||  21
query             0 CAAGCACTTCTGTTTCCCCGG  21

 ENTrc-r (rev) len=21 match=100% [532:552]
 HRVbo-f.3 len=15 match=100% [352:366]
 HRVbo-p len=18 match=100% [435:452]
 HRVbo-r (rev) len=19 match=100% [536:554]
 HRVka5-f len=21 match=95% [161:181]
target          160 CAAGCAC

## S48 - Rhinovirus C-1

In [6]:
# Primer binding for my S48 sequence, then for the RefSeq Rhinovirus C1 record (GenBank format)
printSeqBinding("myseqs/S48-RVC-1.fastq")
printSeqBinding("refseq/Rhinovirus-C1.gb", "gb")

Sequence: myseqs/S48-RVC-1.fastq
 ENTng-f.1 len=19 match=100% [262:280]
 HRVbo-f.3 len=15 match=100% [172:186]
 HRVbo-p len=18 match=100% [256:273]
 HRVka5-ri (rev) len=18 match=100% [256:273]
 HRVkaV-fo.1 len=20 match=100% [261:280]
 HRVma-f len=19 match=89% [218:235]
  Primer mismatch in the last 1 bases
target          217 TGGACAAGGTGTGAAGAGT 236
                  0 ||||||.|||||||||||.  19
query             0 TGGACAGGGTGTGAAGAGC  19

 HRVma-p len=18 match=100% [256:273]

Sequence: refseq/Rhinovirus-C1.gb
 ENTng-f.1 len=19 match=100% [310:328]
 ENTng-p.4 len=18 match=100% [392:409]
 ENTng-r (rev) len=22 match=100% [402:423]
 ENTrc-f1 len=21 match=100% [28:48]
 ENTrc-r (rev) len=21 match=100% [401:421]
 HRVbo-f.3 len=15 match=100% [220:234]
 HRVbo-p len=18 match=100% [304:321]
 HRVbo-r (rev) len=19 match=100% [405:423]
 HRVka5-f len=21 match=100% [28:48]
 HRVka5-ri (rev) len=18 match=100% [304:321]
 HRVka5-ro (rev) len=23 match=100% [401:423]
 HRVkaV-fi len=22 match=100% [402:423]
 HR

Here we see the critical final base of the HRVma-f primer mismatches our S48 RVC-1 sequence, and the alignment above shows one other mismatch (at base 7). This explains why I couldn't get S48 to test positive via HRV qPCR. However, it does seem to be a perfect match for ENTng, so I still don't know why I had so much trouble with that assay being unreliable for this sample. The ENTrc and HRVka primer sets match perfectly.

## S59 - Rhinovirus A-77

In [7]:
# Primer binding for my S59 sequence, then for the RefSeq Rhinovirus A77 record (GenBank format)
printSeqBinding("myseqs/S59-RVA-77.fastq")
printSeqBinding("refseq/Rhinovirus-A77.gb", "gb")

Sequence: myseqs/S59-RVA-77.fastq
 ENTng-f.1 len=19 match=100% [261:279]
 HRVbo-f.3 len=15 match=100% [171:185]
 HRVbo-p len=18 match=100% [255:272]
 HRVka5-ri (rev) len=18 match=100% [255:272]
 HRVkaV-fo.1 len=20 match=100% [260:279]
 HRVma-f len=19 match=89% [217:235]
target          216 TTGACAAGGTGTGAAGAGC 235
                  0 |.||||.||||||||||||  19
query             0 TGGACAGGGTGTGAAGAGC  19

 HRVma-p len=18 match=100% [255:272]

Sequence: refseq/Rhinovirus-A77.gb
 ENTng-f.1 len=19 match=100% [444:462]
 ENTng-p.6 len=18 match=100% [526:543]
 ENTng-r (rev) len=22 match=100% [536:557]
 ENTrc-f1 len=21 match=100% [163:183]
 ENTrc-r (rev) len=21 match=100% [535:555]
 HRVbo-f.3 len=15 match=100% [354:368]
 HRVbo-p len=18 match=100% [438:455]
 HRVbo-r (rev) len=19 match=100% [539:557]
 HRVka5-f len=21 match=100% [163:183]
 HRVka5-ri (rev) len=18 match=100% [438:455]
 HRVka5-ro (rev) len=23 match=100% [535:557]
 HRVkaV-fi len=22 match=100% [536:557]
 HRVkaV-fo.1 len=20 match=100% [443

## RefSeq Rhinovirus A-89

Test against the full A-89 genome, since this is what's often used as the common coordinate system.

In [8]:
# Primer binding against the RefSeq Rhinovirus A89 record (GenBank format), using the default primer set
printSeqBinding("refseq/Rhinovirus-A89.gb", "gb")

Sequence: refseq/Rhinovirus-A89.gb
 ENTng-f.1 len=19 match=100% [448:466]
 ENTng-p.6 len=18 match=100% [530:547]
 ENTng-r (rev) len=22 match=100% [540:561]
 ENTrc-f1 len=21 match=100% [165:185]
 ENTrc-r (rev) len=21 match=100% [539:559]
 HRVbo-f.3 len=15 match=100% [357:371]
 HRVbo-p len=18 match=100% [442:459]
 HRVbo-r (rev) len=19 match=100% [543:561]
 HRVka5-f len=21 match=100% [165:185]
 HRVka5-ri (rev) len=18 match=100% [442:459]
 HRVka5-ro (rev) len=23 match=100% [539:561]
 HRVkaV-fi len=22 match=100% [540:561]
 HRVkaV-fo.1 len=20 match=100% [447:466]
 HRVkaV-r.354 (rev) len=23 match=100% [1054:1076]
 HRVma-f len=19 match=89% [404:422]
target          403 TTGACAAGGTGTGAAGAGC 422
                  0 |.||||.||||||||||||  19
query             0 TGGACAGGGTGTGAAGAGC  19

 HRVma-p len=18 match=100% [442:459]
 HRVma-r (rev) len=20 match=95% [528:547]
target          527 GGATGGGACCAACTACTTTG 547
                  0 ||||||||||.|||||||||  20
query             0 GGATGGGACCGACTACTTTG  20

 r

## Wisdom primers

See how the primers from this older paper compare: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2786677/



In [9]:
# Load the alternative primer set into its own variable so the default `primers` stays untouched
wisdomPrimers = RCUtils.readPrimers("HRV-Wisdom-primers.fasta", display=True)
# Detailed hits for the A89 RefSeq record only, passing the alternative primers explicitly
printSeqBinding("refseq/Rhinovirus-A89.gb", "gb", wisdomPrimers)

# Score table across all RefSeq records for the same primer set (rendered as the cell's last expression)
summaryTable("refseq/Rhinovirus-*.gb", "gb", wisdomPrimers)

Reading primers: HRV-Wisdom-primers.fasta
  HRV-Wi-5UTR-fo (48 variations)
  HRV-Wi-5UTR-ro
  HRV-Wi-5UTR-f1i (16 variations)
  HRV-Wi-5UTR-f2i (4 variations)
  HRV-Wi-5UTR-f3i (4 variations)
  HRV-Wi-5UTR-ri (2 variations)
  HRV-Wi-VP4VP2-fo (2 variations)
  HRV-Wi-VP4VP2-fi (2 variations)
  HRV-Wi-VP4VP2-ri (192 variations)
  HRV-Wi-VP4VP2-ro (768 variations)
  HRV-Wi-VP1-fo (96 variations)
  HRV-Wi-VP1-fi (64 variations)
  HRV-Wi-VP1-ri (96 variations)
  HRV-Wi-VP1-ro (48 variations)
Read 1343 primers
Sequence: refseq/Rhinovirus-A89.gb
 HRV-Wi-5UTR-f1i.7 len=21 match=100% [354:374]
 HRV-Wi-5UTR-fo.39 len=22 match=100% [164:185]
 HRV-Wi-5UTR-ri.0 (rev) len=21 match=100% [446:466]
 HRV-Wi-5UTR-ro (rev) len=23 match=100% [539:561]
 HRV-Wi-VP1-fi.56 len=19 match=89% [2441:2459]
target         2440 TGGATGCTGCGGAAACCGG 2459
                  0 ||||||||||.|||||.||   19
query             0 TGGATGCTGCAGAAACAGG   19

 HRV-Wi-VP1-fo.75 len=23 match=100% [1966:1988]
 HRV-Wi-VP1-ri.94 (rev) len=

Unnamed: 0,HRV-Wi-5UTR,HRV-Wi-VP4VP2,HRV-Wi-VP1
Rhinovirus-A89,100,100,85
Rhinovirus-C1,100,96,83
Rhinovirus-A56,100,77,100
Rhinovirus-A23,100,100,89
Rhinovirus-A77,95,100,96


Overall this isn't anything too special; there's no compelling reason to switch from the HRVka primers.