# Assay Binding analysis
Analyze how well my PCR primers and probes match the sequences I've found in my samples.

## Initialization, configuration and utility functions

In [1]:
# Autoreload mode 1 only reloads modules registered via %aimport, so edits to
# RCUtils are picked up on the next cell execution without a kernel restart.
%load_ext autoreload
%autoreload 1
%aimport RCUtils

In [65]:
from IPython.display import display, HTML
from Bio.Seq import Seq
from Bio import SeqIO
from Bio import Align
from Bio.Align import PairwiseAligner, substitution_matrices
from Bio.Data import IUPACData
import numpy as np
import RCUtils
import glob
from pathlib import Path
from collections import defaultdict
import pandas as pd

# For now this is just Enterovirus primers, but could expand it to other species.
# We don't want to expand ambiguities because the HRVkaV primers blow up into a huge number
# of variants. Instead the aligner below uses a custom substitution matrix that gives a match
# if the sets of bases overlap, so each degenerate primer is kept as a single sequence.
primers = RCUtils.readPrimers("HRVPrimers.fasta", display=True, expandAmbiguous=False)

# Build a substitution matrix where score = match if the base sets intersect, else mismatch
def iupac_subst(match=1, mismatch=0):
    """Return a substitution matrix that scores IUPAC ambiguity codes by overlap.

    Two symbols score `match` when the sets of bases they stand for share at
    least one base, or when either symbol is 'I' (inosine, treated as matching
    anything). Every other pair scores `mismatch`. 'X' is a masking symbol: it
    never matches anything, including itself or 'I'.

    Parameters:
        match: score assigned to compatible symbol pairs (default 1).
        mismatch: score assigned to incompatible symbol pairs (default 0).

    Returns:
        A 2-D substitution_matrices.Array of floats indexed by the alphabet
        "ACGTRYSWKMBDHVNXI".
    """
    alphabet = "ACGTRYSWKMBDHVNXI" # Use X to prevent any matching (for masking)

    # Bases represented by each symbol; symbols missing from the IUPAC table stand for themselves
    base_sets = {sym: set(IUPACData.ambiguous_dna_values.get(sym, sym)) for sym in alphabet}

    # Start with everything scored as a mismatch, then mark the compatible pairs
    scores = np.full((len(alphabet), len(alphabet)), mismatch, float)
    matrix = substitution_matrices.Array(alphabet, dims=2, data=scores)
    for row, a in enumerate(alphabet):
        for col, b in enumerate(alphabet):
            if 'X' in (a, b):
                # Masked positions keep the mismatch score no matter what they face
                continue
            if a == 'I' or b == 'I' or (base_sets[a] & base_sets[b]):
                matrix[row, col] = match
    return matrix

# Take the primer aligner from RCUtils and replace its scoring with the
# IUPAC-overlap matrix so degenerate primer bases are scored by base-set overlap.
aligner = RCUtils.getPrimerAligner()
aligner.substitution_matrix = iupac_subst()

def printAlignment(ref, seq, printAlignment=False):
    """Display where `seq` aligns on `ref`, with a match percentage.

    Uses the first alignment returned by the module-level `aligner` and passes
    it through RCUtils.extendAlignment (called for its effect on the alignment
    object; the return value is not used).

    Parameters:
        ref: record supplying the target sequence (`.seq`) and `.name`.
        seq: record supplying the query sequence (`.seq`) and `.name`.
        printAlignment: when true, also print the alignment itself.

    The percentage shown is 100 * alignment score / len(seq), truncated to an
    integer by the %d format.
    """
    best = aligner.align(ref.seq, seq.seq)[0]
    RCUtils.extendAlignment(best)

    # Row 0 of the coordinates holds the positions on the target (ref)
    ref_coords = best.coordinates[0]
    percent = 100 * best.score / len(seq)
    summary = "<b>%s aligns to %s[%d:%d]</b> match %d%%" % (
        seq.name, ref.name, ref_coords[0], ref_coords[-1], percent)
    display(HTML(summary))

    if printAlignment:
        print(best)

def printSeqBinding(path, format=None, primers=primers):
    """Print every primer hit on the single sequence stored in `path`.

    Parameters:
        path: path string of the sequence file to read.
        format: SeqIO format name; when None, the text after the last "." in
            `path` is used.
        primers: primers to search for (defaults to the module-level set as it
            was when this function was defined).

    Returns:
        The record read from `path`.

    Hits are listed in order of their start position. For any hit that is not
    a perfect match, the primer is re-aligned and the alignment printed, along
    with a note when the primer's first or last bases are not covered.
    """
    if format is None:
        # Fall back to the file extension as the format name
        format = path.rsplit(".",1)[-1]
    record = SeqIO.read(path, format)
    display(HTML("<b>Sequence: %s</b>  len=%d" % (path, len(record))))

    hits = RCUtils.computePrimerHits(record, primers, allowOverlaps=True, aligner=aligner)
    for hit in sorted(hits, key=lambda h: h.start):
        primer = hit.primer
        strand = " (rev)" if hit.rev else ""
        print(" %s%s len=%d match=%d%% [%d:%d]" % (primer.id, strand, len(primer.seq), 100*hit.mr, hit.start+1, hit.end))
        if hit.mr >= 1:
            continue

        # NOTE(review): this aligns against the whole record and takes the first
        # alignment, so it is not restricted to [hit.start:hit.end]. If the same
        # primer has a better match elsewhere on the record, the details printed
        # here may describe that other location - confirm.
        query = primer.rcSeq if hit.rev else primer.seq
        aln = aligner.align(record.seq, query)[0]
        target_coords = aln.coordinates[0]
        query_coords = aln.coordinates[1]

        # Bases at the start of the primer that the alignment does not cover
        lead = query_coords[0]
        if lead > 0:
            if target_coords[0] == 0:
                print("  Primer falls %d bases off the start of the target" % lead)
            else:
                print("  Primer mismatch in first %d bases" % lead)

        # Bases at the end of the primer that the alignment does not cover
        tail = len(primer.seq) - query_coords[-1]
        if tail > 0:
            if target_coords[-1] == len(record):
                print("  Primer falls %d bases off the end of the target" % tail)
            else:
                print("  Primer mismatch in the last %d bases" % tail)

        RCUtils.extendAlignment(aln)
        print(aln)
    return record

# Show a table of primer match scores for each sequence
# If collapse is True, then include fwd, rev and probe in the same
# group. May be confusing when only some of the set are present.
def summaryTable(pathGlob, format="fastq", collapse=True, primers=primers):
    """Build a table of primer match scores (in percent) for each sequence file.

    Parameters:
        pathGlob: glob pattern selecting the sequence files; each file is read
            with SeqIO.read.
        format: SeqIO format name of the files.
        collapse: when True, primer IDs are grouped by the text before their
            last "-" (so fwd, rev and probe share a column); when False, by
            the text before their last "." (one column per primer).
        primers: primers to search for (defaults to the module-level set as it
            was when this function was defined).

    Returns:
        An integer DataFrame with one row per file (named by the file stem) and
        one column per primer group. Each cell is the product of the match
        ratios of all hits in that group, times 100 and rounded; groups with no
        hit in a file are 0.

    Raises:
        ValueError: if a file cannot be read by SeqIO.read; the message names
            the offending path.

    Side effect: sets the pandas option display.max_rows to 500.
    """
    # Primer IDs are cut at the last occurrence of this separator to form the group name
    sep = "-" if collapse else "."

    table = {}
    for path in sorted(glob.glob(pathGlob)):
        try:
            record = SeqIO.read(path, format)
        except ValueError as e:
            raise ValueError(f"Error processing file {path}") from e

        # NOTE(review): the stated intent for this call is to keep only the best
        # score among variants such as ENTrc-f1 and ENTrc-f2, while still
        # multiplying scores for distinct primers such as ENTrc-f1 and ENTrc-r,
        # by relying on computePrimerHits' default removal of redundant hits.
        # allowOverlaps=True is passed explicitly, though - confirm that
        # redundant variant hits are still removed with this setting.
        hits = RCUtils.computePrimerHits(record, primers, allowOverlaps=True, aligner=aligner)

        # Combine scores for multiple primers in the same group by multiplying them
        scores = {}
        for hit in hits:
            group = hit.primer.id.rsplit(sep, 1)[0]
            scores[group] = scores[group] * hit.mr if group in scores else hit.mr

        # Rows are keyed by the base filename without its suffix
        table[Path(path).stem] = scores

    pd.set_option('display.max_rows', 500)
    df = pd.DataFrame.from_dict(table, orient='index')
    return (df.fillna(0) * 100).round(0).astype(int)

Reading primers: HRVPrimers.fasta
  ENTng-f (degeneracy 2)
  ENTng-r
  ENTng-p (degeneracy 8)
  ENTrc-f1
  ENTrc-f2
  ENTrc-r
  HRVma-f
  HRVma-r
  HRVma-p
  HRVkaV-fo (degeneracy 2)
  HRVkaV-fi
  HRVkaV-r (degeneracy 768)
  HRVka5-f
  HRVka5-ro
  HRVka5-ri
  HRVbo-f (degeneracy 4)
  HRVbo-r
  HRVbo-p
Read 18 primers


## Summary

Note that the myseqs sequences are amplicons generated from these same (or similar) primers. They should have the primers trimmed, and so will not match the primers they were generated from.

In [60]:
# collapse=False: one column per individual primer/probe rather than per primer set
summaryTable("myseqs/*-RV*.fastq", collapse=False)


Unnamed: 0,HRVbo-f,HRVma-f,HRVma-p,HRVka5-ri,HRVbo-p,HRVkaV-fo,ENTng-f
S130-RVA-58,100,89,100,100,100,100,100
S142-RVA-62,100,89,100,100,100,100,100
S147-RVC-44,100,89,100,100,100,100,100
S148-RVA-68,100,89,100,100,100,100,100
S153-RVB-27,100,89,100,100,100,100,100
S28-RVA-23,100,100,100,100,100,100,100
S44-RVA-56,100,89,100,100,100,100,100
S48-RVC-1,100,89,100,100,100,100,100
S59-RVA-77,100,89,100,100,100,100,100
S65-RVA-54,100,89,100,100,100,100,100


Overall we see HRVma is a pretty poor match for everything except S28, which matches the qPCR experimental results. Everything else is generally good.

In [63]:
# Default collapse=True: one column per primer set (the ID text before the last "-")
summaryTable("refseq/Rhinovirus-*.fasta", "fasta")

Unnamed: 0,ENTrc,HRVka5,HRVbo,HRVma,HRVkaV,ENTng,random
Rhinovirus-A1,100,100,100,95,100,100,0
Rhinovirus-A10,100,100,100,80,100,100,80
Rhinovirus-A100,100,100,100,85,100,100,0
Rhinovirus-A101,95,95,100,80,100,100,0
Rhinovirus-A102,95,95,100,86,100,100,0
Rhinovirus-A103,100,100,100,86,100,100,0
Rhinovirus-A104,100,100,100,85,100,100,0
Rhinovirus-A105,82,73,93,71,95,83,86
Rhinovirus-A106,78,100,41,80,61,66,61
Rhinovirus-A11,95,95,100,85,100,100,80


# My sequence details
## Rhinovirus A-23: S28

In [66]:
# Primer hits on the A23 reference, then on my S28 read; finally locate the read on the reference
ref = printSeqBinding("refseq/Rhinovirus-A23.gb", "gb")
seq = printSeqBinding("myseqs/S28-RVA-23.fastq")
printAlignment(ref, seq)

 ENTng-p (rev) len=18 match=100% [2:19]
 HRVma-r len=20 match=90% [2:21]
target            1 CAAAGTAGTTGGTCCCGTCC 21
                  0 |||||||||.||||||.||| 20
query             0 CAAAGTAGTCGGTCCCATCC 20

 ENTrc-f1 len=21 match=100% [86:106]
 HRVka5-f len=21 match=100% [86:106]
 HRVbo-f len=15 match=100% [276:290]
 HRVma-f len=19 match=100% [321:339]
 HRVma-p len=18 match=100% [359:376]
 HRVka5-ri (rev) len=18 match=100% [359:376]
 HRVbo-p len=18 match=100% [359:376]
 HRVkaV-fo len=20 match=100% [364:383]
 ENTng-f len=19 match=100% [365:383]
 HRVma-r (rev) len=20 match=90% [445:464]
target          444 GGACGGGACCAACTACTTTG 464
                  0 |||.||||||.|||||||||  20
query             0 GGATGGGACCGACTACTTTG  20

 ENTng-p len=18 match=100% [447:464]
 ENTrc-r (rev) len=21 match=100% [456:476]
 HRVka5-ro (rev) len=23 match=100% [456:478]
 ENTng-r (rev) len=22 match=100% [457:478]
 HRVkaV-fi len=22 match=100% [457:478]
 HRVbo-r (rev) len=19 match=100% [460:478]
 HRVkaV-r (rev) len=23 

 HRVbo-f len=15 match=100% [169:183]
 HRVma-f len=19 match=100% [214:232]
 HRVma-p len=18 match=100% [252:269]
 HRVka5-ri (rev) len=18 match=100% [252:269]
 HRVbo-p len=18 match=100% [252:269]
 HRVkaV-fo len=20 match=100% [257:276]
 ENTng-f len=19 match=100% [258:276]


## Rhinovirus A-31: S81

In [6]:
# Primer hits on my S81 read, then on the A31 reference; finally locate the read on the reference
seq = printSeqBinding("myseqs/S81-RVA-31.fastq")
ref = printSeqBinding("refseq/Rhinovirus-A31.fasta", "fasta")
printAlignment(ref, seq)

 HRVbo-f len=15 match=100% [170:184]
 HRVma-f len=19 match=89% [216:234]
target          215 TTGACAAGGTGTGAAGAGC 234
                  0 |.||||.||||||||||||  19
query             0 TGGACAGGGTGTGAAGAGC  19

 HRVma-p len=18 match=100% [254:271]
 HRVka5-ri (rev) len=18 match=100% [254:271]
 HRVbo-p len=18 match=100% [254:271]
 HRVkaV-fo len=20 match=100% [259:278]
 ENTng-f len=19 match=100% [260:278]


 ENTrc-f1 len=21 match=100% [162:182]
 HRVka5-f len=21 match=100% [162:182]
 HRVbo-f len=15 match=100% [352:366]
 HRVma-f len=19 match=89% [398:416]
target          397 TTGACAAGGTGTGAAGAGC 416
                  0 |.||||.||||||||||||  19
query             0 TGGACAGGGTGTGAAGAGC  19

 HRVma-p len=18 match=100% [436:453]
 HRVka5-ri (rev) len=18 match=100% [436:453]
 HRVbo-p len=18 match=100% [436:453]
 HRVkaV-fo len=20 match=100% [441:460]
 ENTng-f len=19 match=100% [442:460]
 HRVma-r (rev) len=20 match=90% [522:541]
target          521 GGACGGGACCAACTACTTTG 541
                  0 |||.||||||.|||||||||  20
query             0 GGATGGGACCGACTACTTTG  20

 ENTng-p len=18 match=100% [524:541]
 ENTrc-r (rev) len=21 match=100% [533:553]
 HRVka5-ro (rev) len=23 match=100% [533:555]
 ENTng-r (rev) len=22 match=100% [534:555]
 HRVkaV-fi len=22 match=100% [534:555]
 HRVbo-r (rev) len=19 match=100% [537:555]
 random (rev) len=15 match=80% [817:831]
target          816 AATCTCCTACAGTAG 831
              

## Rhinovirus A-56: S44

In [7]:
# Primer hits on my S44 read, then on the A56 reference; finally locate the read on the reference
seq = printSeqBinding("myseqs/S44-RVA-56.fastq")
ref = printSeqBinding("refseq/Rhinovirus-A56.gb", "gb")
printAlignment(ref, seq)

 HRVbo-f len=15 match=100% [171:185]
 HRVma-f len=19 match=89% [216:234]
target          215 TTGACAAGGTGTGAAGAGC 234
                  0 |.||||.||||||||||||  19
query             0 TGGACAGGGTGTGAAGAGC  19

 HRVma-p len=18 match=100% [254:271]
 HRVka5-ri (rev) len=18 match=100% [254:271]
 HRVbo-p len=18 match=100% [254:271]
 HRVkaV-fo len=20 match=100% [259:278]
 ENTng-f len=19 match=100% [260:278]


 ENTrc-f1 len=21 match=95% [161:181]
target          160 CAAGCACTTCTGTCTCCCCGG 181
                  0 |||||||||||||.|||||||  21
query             0 CAAGCACTTCTGTTTCCCCGG  21

 HRVka5-f len=21 match=95% [161:181]
target          160 CAAGCACTTCTGTCTCCCCGG 181
                  0 |||||||||||||.|||||||  21
query             0 CAAGCACTTCTGTTTCCCCGG  21

 HRVbo-f len=15 match=100% [352:366]
 HRVma-f len=19 match=89% [397:415]
target          396 TTGACAAGGTGTGAARAGC 415
                  0 |.||||.||||||||.|||  19
query             0 TGGACAGGGTGTGAAGAGC  19

 HRVma-p len=18 match=100% [435:452]
 HRVka5-ri (rev) len=18 match=100% [435:452]
 HRVbo-p len=18 match=100% [435:452]
 HRVkaV-fo len=20 match=100% [440:459]
 ENTng-f len=19 match=100% [441:459]
 HRVma-r (rev) len=20 match=95% [521:540]
target          520 GGATGGGACCAACTACTTTG 540
                  0 ||||||||||.|||||||||  20
query             0 GGATGGGACCGACTACTTTG  20

 ENTng-p len=18 match=100% [523:540]
 ENTrc-r (rev) len=21 match=100%

Here we see that the critical final base of the HRVma-f primer mismatches our S48 RVC-1 sequence, and there are three other mismatches. This explains why I couldn't get S48 to test positive via HRV qPCR. However, it does seem to be a perfect match for ENTng, so I still don't know why that assay was so unreliable for this sample. The ENTrc and HRVka primer sets match perfectly.

## Rhinovirus A-54: S65

In [8]:
# Primer hits on my S65 read, then on the A54 reference; finally locate the read on the reference
seq = printSeqBinding("myseqs/S65-RVA-54.fastq")
ref = printSeqBinding("refseq/Rhinovirus-A54.fasta", "fasta")
printAlignment(ref, seq)

 HRVbo-f len=15 match=100% [170:184]
 HRVma-f len=19 match=89% [216:234]
target          215 TTGACAAGGTGTGAAGAGC 234
                  0 |.||||.||||||||||||  19
query             0 TGGACAGGGTGTGAAGAGC  19

 HRVma-p len=18 match=100% [254:271]
 HRVka5-ri (rev) len=18 match=100% [254:271]
 HRVbo-p len=18 match=100% [254:271]
 HRVkaV-fo len=20 match=100% [259:278]
 ENTng-f len=19 match=100% [260:278]


 ENTrc-f1 len=21 match=100% [162:182]
 HRVka5-f len=21 match=100% [162:182]
 HRVbo-f len=15 match=100% [352:366]
 HRVma-f len=19 match=89% [398:416]
target          397 TTGACAAGGTGTGAAGAGC 416
                  0 |.||||.||||||||||||  19
query             0 TGGACAGGGTGTGAAGAGC  19

 HRVma-p len=18 match=100% [436:453]
 HRVka5-ri (rev) len=18 match=100% [436:453]
 HRVbo-p len=18 match=100% [436:453]
 HRVkaV-fo len=20 match=100% [441:460]
 ENTng-f len=19 match=100% [442:460]
 HRVma-r (rev) len=20 match=95% [522:541]
target          521 GGATGGGACCAACTACTTTG 541
                  0 ||||||||||.|||||||||  20
query             0 GGATGGGACCGACTACTTTG  20

 ENTng-p len=18 match=100% [524:541]
 ENTrc-r (rev) len=21 match=100% [533:553]
 HRVka5-ro (rev) len=23 match=100% [533:555]
 ENTng-r (rev) len=22 match=100% [534:555]
 HRVkaV-fi len=22 match=100% [534:555]
 HRVbo-r (rev) len=19 match=100% [537:555]
 HRVkaV-r (rev) len=23 match=100% [1047:1069]


## Rhinovirus A-58: S130

In [9]:
# Primer hits on the A58 reference, then on my S130 read; finally locate the read on the reference
ref = printSeqBinding("refseq/Rhinovirus-A58.fasta", "fasta")
seq = printSeqBinding("myseqs/S130-RVA-58.fastq")
printAlignment(ref, seq)

 ENTrc-f1 len=21 match=100% [164:184]
 HRVka5-f len=21 match=100% [164:184]
 HRVbo-f len=15 match=100% [356:370]
 HRVma-f len=19 match=89% [402:420]
target          401 TTGACATGGTGTGAAGAGC 420
                  0 |.||||.||||||||||||  19
query             0 TGGACAGGGTGTGAAGAGC  19

 HRVma-p len=18 match=100% [440:457]
 HRVka5-ri (rev) len=18 match=100% [440:457]
 HRVbo-p len=18 match=100% [440:457]
 HRVkaV-fo len=20 match=100% [445:464]
 ENTng-f len=19 match=100% [446:464]
 HRVma-r (rev) len=20 match=95% [526:545]
target          525 GGATGGGACCAACTACTTTG 545
                  0 ||||||||||.|||||||||  20
query             0 GGATGGGACCGACTACTTTG  20

 ENTng-p len=18 match=100% [528:545]
 ENTrc-r (rev) len=21 match=100% [537:557]
 HRVka5-ro (rev) len=23 match=100% [537:559]
 ENTng-r (rev) len=22 match=100% [538:559]
 HRVkaV-fi len=22 match=100% [538:559]
 HRVbo-r (rev) len=19 match=100% [541:559]
 HRVkaV-r (rev) len=23 match=100% [1054:1076]
 random (rev) len=15 match=80% [2774:2788]
  Prim

 HRVbo-f len=15 match=100% [172:186]
 HRVma-f len=19 match=89% [218:236]
target          217 TTGACAAGGTGTGAAGAGC 236
                  0 |.||||.||||||||||||  19
query             0 TGGACAGGGTGTGAAGAGC  19

 HRVma-p len=18 match=100% [256:273]
 HRVka5-ri (rev) len=18 match=100% [256:273]
 HRVbo-p len=18 match=100% [256:273]
 HRVkaV-fo len=20 match=100% [261:280]
 ENTng-f len=19 match=100% [262:280]


## Rhinovirus A-62: S142

In [10]:
# Primer hits on the A62 reference, then on my S142 read; the third argument to
# printAlignment also prints the full read-vs-reference alignment
ref = printSeqBinding("refseq/Rhinovirus-A62.fasta", "fasta")
seq = printSeqBinding("myseqs/S142-RVA-62.fastq")
printAlignment(ref, seq, True)

 ENTrc-f1 len=21 match=100% [163:183]
 HRVka5-f len=21 match=100% [163:183]
 HRVbo-f len=15 match=100% [353:367]
 HRVma-f len=19 match=89% [400:418]
target          399 TTGACAAGGTGTGAAGAGC 418
                  0 |.||||.||||||||||||  19
query             0 TGGACAGGGTGTGAAGAGC  19

 HRVma-p len=18 match=100% [438:455]
 HRVka5-ri (rev) len=18 match=100% [438:455]
 HRVbo-p len=18 match=100% [438:455]
 HRVkaV-fo len=20 match=100% [443:462]
 ENTng-f len=19 match=100% [444:462]
 HRVma-r (rev) len=20 match=95% [524:543]
target          523 GGATGGGACCAACTACTTTG 543
                  0 ||||||||||.|||||||||  20
query             0 GGATGGGACCGACTACTTTG  20

 ENTng-p len=18 match=100% [526:543]
 ENTrc-r (rev) len=21 match=100% [535:555]
 HRVka5-ro (rev) len=23 match=100% [535:557]
 ENTng-r (rev) len=22 match=100% [536:557]
 HRVkaV-fi len=22 match=100% [536:557]
 HRVbo-r (rev) len=19 match=100% [539:557]
 HRVkaV-r (rev) len=23 match=100% [1050:1072]


 HRVbo-f len=15 match=100% [163:177]
 HRVma-f len=19 match=89% [209:227]
target          208 TTGACAAGGTGTGAAGAGC 227
                  0 |.||||.||||||||||||  19
query             0 TGGACAGGGTGTGAAGAGC  19

 HRVma-p len=18 match=100% [247:264]
 HRVka5-ri (rev) len=18 match=100% [247:264]
 HRVbo-p len=18 match=100% [247:264]
 HRVkaV-fo len=20 match=100% [252:271]
 ENTng-f len=19 match=100% [253:271]


target          191 GATACGCTCCAACAGGGCGAAAACAATYTAAATCGTTAACCGCAAAGTGACTACGCAAAG
                  0 .||||||||||||||||||||||||.|..|.|||||||||||||||||||||||||||||
query             0 TATACGCTCCAACAGGGCGAAAACAGTCAAGATCGTTAACCGCAAAGTGACTACGCAAAG

target          251 CTTAGTAATACCTTGAAGAATTTATGGCTGGTCGTTCCGCTATAACCCC-TAGTAGACCT
                 60 |||||||||.|||||||||.|.|||||||||||||||||||||||||||-||||||||||
query            60 CTTAGTAATGCCTTGAAGAGTCTATGGCTGGTCGTTCCGCTATAACCCCCTAGTAGACCT

target          310 GGCAGATGAGGCTAGAAATACCCCACTGGTAACAGTGTTCTAGCCTGCGTGGCTGCCTGC
                120 ||||||||||||||||||.|||||||||||.|||||||||||||||||||||||||||||
query           120 GGCAGATGAGGCTAGAAACACCCCACTGGTGACAGTGTTCTAGCCTGCGTGGCTGCCTGC

target          370 ACACCCTTTTTGGGTGTGAAGCCATATATTTGACAAGGTGTGAAGAGCCCCGTGTGCTCA
                180 |||||||||.-||||||||||||.||||||||||||||||||||||||||||||||||||
query           180 ACACCCTTTC-GGGTGTGAAGCCGTATATTTGACAAGGTGTGAAGAGCCCCGTGTGCTCA

target          430 CTTT

This is my first Plasmidsaurus sequence. It's interesting that about 8 bases are missing at the start in addition to the whole primer sequence, and that just a couple of bases overlap with the primer at the end. This is likely due to the use of rapid ONT chemistry, which breaks the amplicons randomly with a transposon. There are also some very low-quality bases at either end, which I may need to trim for some applications.

## Rhinovirus A-68: S148

In [11]:
# Primer hits on the A68 reference, then on my S148 read; finally locate the read on the reference
ref = printSeqBinding("refseq/Rhinovirus-A68.fasta", "fasta")
seq = printSeqBinding("myseqs/S148-RVA-68.fastq")
printAlignment(ref, seq)

 ENTrc-f1 len=21 match=95% [165:185]
target          164 CAAGCACTTCTGTTACCCCGG 185
                  0 ||||||||||||||.||||||  21
query             0 CAAGCACTTCTGTTTCCCCGG  21

 HRVka5-f len=21 match=95% [165:185]
target          164 CAAGCACTTCTGTTACCCCGG 185
                  0 ||||||||||||||.||||||  21
query             0 CAAGCACTTCTGTTTCCCCGG  21

 HRVbo-f len=15 match=100% [357:371]
 HRVma-f len=19 match=89% [405:423]
target          404 TTGACAAGGTGTGAAGAGC 423
                  0 |.||||.||||||||||||  19
query             0 TGGACAGGGTGTGAAGAGC  19

 HRVma-p len=18 match=100% [443:460]
 HRVka5-ri (rev) len=18 match=100% [443:460]
 HRVbo-p len=18 match=100% [443:460]
 HRVkaV-fo len=20 match=100% [448:467]
 ENTng-f len=19 match=100% [449:467]
 HRVma-r (rev) len=20 match=95% [529:548]
target          528 GGATGGGACCAACTACTTTG 548
                  0 ||||||||||.|||||||||  20
query             0 GGATGGGACCGACTACTTTG  20

 ENTng-p len=18 match=100% [531:548]
 ENTrc-r (rev) len=21 match=100%

 HRVbo-f len=15 match=100% [178:192]
 HRVma-f len=19 match=89% [226:244]
target          225 TTGACAAGGTGTGAAGAGC 244
                  0 |.||||.||||||||||||  19
query             0 TGGACAGGGTGTGAAGAGC  19

 HRVma-p len=18 match=100% [264:281]
 HRVka5-ri (rev) len=18 match=100% [264:281]
 HRVbo-p len=18 match=100% [264:281]
 HRVkaV-fo len=20 match=100% [269:288]
 ENTng-f len=19 match=100% [270:288]


## Rhinovirus A-77: S59

In [12]:
# Primer hits on the A77 reference, then on my S59 read; finally locate the read on the reference
ref = printSeqBinding("refseq/Rhinovirus-A77.gb", "gb")
seq = printSeqBinding("myseqs/S59-RVA-77.fastq")
printAlignment(ref, seq)

 ENTrc-f1 len=21 match=100% [163:183]
 HRVka5-f len=21 match=100% [163:183]
 HRVbo-f len=15 match=100% [354:368]
 HRVma-f len=19 match=89% [400:418]
target          399 TTGACAAGGTGTGAAGAGC 418
                  0 |.||||.||||||||||||  19
query             0 TGGACAGGGTGTGAAGAGC  19

 HRVma-p len=18 match=100% [438:455]
 HRVka5-ri (rev) len=18 match=100% [438:455]
 HRVbo-p len=18 match=100% [438:455]
 HRVkaV-fo len=20 match=100% [443:462]
 ENTng-f len=19 match=100% [444:462]
 HRVma-r (rev) len=20 match=95% [524:543]
target          523 GGATGGGACCAACTACTTTG 543
                  0 ||||||||||.|||||||||  20
query             0 GGATGGGACCGACTACTTTG  20

 ENTng-p len=18 match=100% [526:543]
 ENTrc-r (rev) len=21 match=100% [535:555]
 HRVka5-ro (rev) len=23 match=100% [535:557]
 ENTng-r (rev) len=22 match=100% [536:557]
 HRVkaV-fi len=22 match=100% [536:557]
 HRVbo-r (rev) len=19 match=100% [539:557]
 HRVkaV-r (rev) len=23 match=100% [1049:1071]
 HRVkaV-r (rev) len=23 match=83% [4273:4295]
targ

 HRVbo-f len=15 match=100% [171:185]
 HRVma-f len=19 match=89% [217:235]
target          216 TTGACAAGGTGTGAAGAGC 235
                  0 |.||||.||||||||||||  19
query             0 TGGACAGGGTGTGAAGAGC  19

 HRVma-p len=18 match=100% [255:272]
 HRVka5-ri (rev) len=18 match=100% [255:272]
 HRVbo-p len=18 match=100% [255:272]
 HRVkaV-fo len=20 match=100% [260:279]
 ENTng-f len=19 match=100% [261:279]


## Rhinovirus B-27: S153

In [13]:
# Primer hits on the B27 reference, then on my S153 read; finally locate the read on the reference
ref = printSeqBinding("refseq/Rhinovirus-B27.fasta", "fasta")
seq = printSeqBinding("myseqs/S153-RVB-27.fastq")
printAlignment(ref, seq)

 ENTrc-f1 len=21 match=100% [180:200]
 HRVka5-f len=21 match=100% [180:200]
 HRVbo-f len=15 match=100% [368:382]
 HRVma-p len=18 match=100% [451:468]
 HRVka5-ri (rev) len=18 match=100% [451:468]
 HRVbo-p len=18 match=100% [451:468]
 HRVkaV-fo len=20 match=100% [456:475]
 ENTng-f len=19 match=100% [457:475]
 HRVma-r (rev) len=20 match=100% [537:556]
 ENTng-p len=18 match=100% [539:556]
 ENTrc-r (rev) len=21 match=100% [548:568]
 HRVka5-ro (rev) len=23 match=100% [548:570]
 ENTng-r (rev) len=22 match=100% [549:570]
 HRVkaV-fi len=22 match=100% [549:570]
 HRVbo-r (rev) len=19 match=100% [552:570]
 HRVkaV-r (rev) len=23 match=100% [1063:1085]


 HRVbo-f len=15 match=100% [173:187]
 HRVma-f len=19 match=89% [217:235]
target          216 TGGACATGGTGTGAAGACC 235
                  0 ||||||.||||||||||.|  19
query             0 TGGACAGGGTGTGAAGAGC  19

 HRVma-p len=18 match=100% [256:273]
 HRVka5-ri (rev) len=18 match=100% [256:273]
 HRVbo-p len=18 match=100% [256:273]
 HRVkaV-fo len=20 match=100% [261:280]
 ENTng-f len=19 match=100% [262:280]


## Rhinovirus C-1: S48, S94

In [14]:
# Primer hits on the C1 reference and on both of my C1 reads (S48, S94)
ref = printSeqBinding("refseq/Rhinovirus-C1.gb", "gb")
seq1 = printSeqBinding("myseqs/S48-RVC-1.fastq")
seq2 = printSeqBinding("myseqs/S94-RVC-1.fastq")

# Locate each read on the reference, then print the alignment of the two reads against each other
printAlignment(ref, seq1)
printAlignment(ref, seq2)
printAlignment(seq1, seq2, printAlignment=True)


 ENTrc-f1 len=21 match=100% [28:48]
 HRVka5-f len=21 match=100% [28:48]
 HRVbo-f len=15 match=100% [220:234]
 HRVma-f len=19 match=95% [266:284]
target          265 TGGACAAGGTGTGAAGAGC 284
                  0 ||||||.||||||||||||  19
query             0 TGGACAGGGTGTGAAGAGC  19

 HRVma-p len=18 match=100% [304:321]
 HRVka5-ri (rev) len=18 match=100% [304:321]
 HRVbo-p len=18 match=100% [304:321]
 HRVkaV-fo len=20 match=100% [309:328]
 ENTng-f len=19 match=100% [310:328]
 HRVma-r (rev) len=20 match=90% [390:409]
target          389 GGATGGAACCAACTACTTTG 409
                  0 ||||||.|||.|||||||||  20
query             0 GGATGGGACCGACTACTTTG  20

 ENTng-p len=18 match=100% [392:409]
 ENTrc-r (rev) len=21 match=100% [401:421]
 HRVka5-ro (rev) len=23 match=100% [401:423]
 ENTng-r (rev) len=22 match=100% [402:423]
 HRVkaV-fi len=22 match=100% [402:423]
 HRVbo-r (rev) len=19 match=100% [405:423]
 HRVkaV-r (rev) len=23 match=100% [908:930]


 HRVbo-f len=15 match=100% [172:186]
 HRVma-f len=19 match=89% [218:235]
  Primer mismatch in the last 1 bases
target          217 TGGACAAGGTGTGAAGAGT 236
                  0 ||||||.|||||||||||.  19
query             0 TGGACAGGGTGTGAAGAGC  19

 HRVma-p len=18 match=100% [256:273]
 HRVka5-ri (rev) len=18 match=100% [256:273]
 HRVbo-p len=18 match=100% [256:273]
 HRVkaV-fo len=20 match=100% [261:280]
 ENTng-f len=19 match=100% [262:280]


 HRVbo-f len=15 match=100% [172:186]
 HRVma-f len=19 match=89% [218:235]
  Primer mismatch in the last 1 bases
target          217 TGGACAAGGTGTGAAGAGT 236
                  0 ||||||.|||||||||||.  19
query             0 TGGACAGGGTGTGAAGAGC  19

 HRVma-p len=18 match=100% [256:273]
 HRVka5-ri (rev) len=18 match=100% [256:273]
 HRVbo-p len=18 match=100% [256:273]
 HRVkaV-fo len=20 match=100% [261:280]
 ENTng-f len=19 match=100% [262:280]


target            0 TACCCTCGTATACGCTTCACCCGAGGCGAAAAATGAGGTTATCGTTACCCGCAAAGTGCC
                  0 ||||||.|||||||||||||||||||||||||||||||||||||||||||||||||||||
query             0 TACCCTTGTATACGCTTCACCCGAGGCGAAAAATGAGGTTATCGTTACCCGCAAAGTGCC

target           60 TACGAGAAACCTAGTAGCATTTTTGAAGCCTATGGTTGGTCGCTCAACTGTTTACCCAGC
                 60 ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
query            60 TACGAGAAACCTAGTAGCATTTTTGAAGCCTATGGTTGGTCGCTCAACTGTTTACCCAGC

target          120 AGTAGACCTGGCAGATGAGGCTAGATGTTCCCCACCAGCGATGGTGATCTAGCCTGCGTG
                120 ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
query           120 AGTAGACCTGGCAGATGAGGCTAGATGTTCCCCACCAGCGATGGTGATCTAGCCTGCGTG

target          180 GCTGCCTGCACACTCTATTGAGTGTGAAGCCAGAAAGTGGACAAGGTGTGAAGAGTCTAT
                180 ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
query           180 GCTGCCTGCACACTCTATTGAGTGTGAAGCCAGAAAGTGGACAAGGTGTGAAGAGTCTAT

target          240 TGTG

## Rhinovirus C-44: S147

In [15]:
# Primer hits on the C44 reference, then on my S147 read; the third argument to
# printAlignment also prints the full read-vs-reference alignment
ref = printSeqBinding("refseq/Rhinovirus-C44.fasta", "fasta")
seq = printSeqBinding("myseqs/S147-RVC-44.fastq")
printAlignment(ref, seq, True)


 ENTrc-f1 len=21 match=95% [135:155]
target          134 CAAGCACTTCTGTTACCCCGG 155
                  0 ||||||||||||||.||||||  21
query             0 CAAGCACTTCTGTTTCCCCGG  21

 HRVka5-f len=21 match=95% [135:155]
target          134 CAAGCACTTCTGTTACCCCGG 155
                  0 ||||||||||||||.||||||  21
query             0 CAAGCACTTCTGTTTCCCCGG  21

 HRVbo-f len=15 match=100% [331:345]
 HRVma-f len=19 match=89% [373:391]
target          372 TTGACAAGGTGTGAAGAGC 391
                  0 |.||||.||||||||||||  19
query             0 TGGACAGGGTGTGAAGAGC  19

 HRVma-p len=18 match=100% [411:428]
 HRVka5-ri (rev) len=18 match=100% [411:428]
 HRVbo-p len=18 match=100% [411:428]
 HRVkaV-fo len=20 match=100% [416:435]
 ENTng-f len=19 match=100% [417:435]
 HRVma-r (rev) len=20 match=90% [496:515]
target          495 GGACGGAACCGACTACTTTG 515
                  0 |||.||.|||||||||||||  20
query             0 GGATGGGACCGACTACTTTG  20

 ENTng-p len=18 match=100% [498:515]
 ENTrc-r (rev) len=21 match=100%

 HRVbo-f len=15 match=100% [158:172]
 HRVma-f len=19 match=89% [200:218]
target          199 TTGACAAGGTGTGAAGAGC 218
                  0 |.||||.||||||||||||  19
query             0 TGGACAGGGTGTGAAGAGC  19

 HRVma-p len=18 match=100% [238:255]
 HRVka5-ri (rev) len=18 match=100% [238:255]
 HRVbo-p len=18 match=100% [238:255]
 HRVkaV-fo len=20 match=100% [243:262]
 ENTng-f len=19 match=100% [244:262]


target          173 TTCCCCAAGACCGAAGCCTTGACTGTCGTTACCCGCACAACTACTGGACAAAGCCTAGTA
                  0 .|||||||||||||||||||||.||||||||||||||||.||||||||||||||||||||
query             0 GTCCCCAAGACCGAAGCCTTGATTGTCGTTACCCGCACAGCTACTGGACAAAGCCTAGTA

target          233 ATACCTGACGATATAGTGTGGTTGGTCGCTCCACCAGTGAACCCCTGGTAGACCTGGCAG
                 60 ||||||||||||||||.|||||||||||||||||||||||||||||||||||||||||||
query            60 ATACCTGACGATATAGCGTGGTTGGTCGCTCCACCAGTGAACCCCTGGTAGACCTGGCAG

target          293 ATGAGGCTGGAGTTTCCCCACTGGTAACAGTGTTCCAGCCTGCGTGGCTGCCTGCTCCCT
                120 ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
query           120 ATGAGGCTGGAGTTTCCCCACTGGTAACAGTGTTCCAGCCTGCGTGGCTGCCTGCTCCCT

target          353 CACGGGAGAAGCCATTCTATTGACAAGGTGTGAAGAGCCCCGTACGCTAGTTGTGAGTCC
                180 ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
query           180 CACGGGAGAAGCCATTCTATTGACAAGGTGTGAAGAGCCCCGTACGCTAGTTGTGAGTCC

target          413 TCCG

## Rhinovirus C-45: S69, S91

In [16]:
# Primer hits on the C45 reference and on both of my C45 reads (S69, S91)
ref = printSeqBinding("refseq/Rhinovirus-C45.fasta", "fasta")
seq1 = printSeqBinding("myseqs/S69-RVC-45.fastq")
seq2 = printSeqBinding("myseqs/S91-RVC-45.fastq")

# Locate each read on the reference, then print the alignment of the two reads against each other
printAlignment(ref, seq1)
printAlignment(ref, seq2)
printAlignment(seq1, seq2, printAlignment=True)


 ENTrc-f1 len=21 match=90% [26:46]
target           25 CAAACACTTCTGTTCCCCCGG 46
                  0 |||.||||||||||.|||||| 21
query             0 CAAGCACTTCTGTTTCCCCGG 21

 HRVka5-f len=21 match=90% [26:46]
target           25 CAAACACTTCTGTTCCCCCGG 46
                  0 |||.||||||||||.|||||| 21
query             0 CAAGCACTTCTGTTTCCCCGG 21

 HRVbo-f len=15 match=100% [220:234]
 HRVma-f len=19 match=95% [267:285]
target          266 TAGACAGGGTGTGAAGAGC 285
                  0 |.|||||||||||||||||  19
query             0 TGGACAGGGTGTGAAGAGC  19

 HRVma-p len=18 match=100% [305:322]
 HRVka5-ri (rev) len=18 match=100% [305:322]
 HRVbo-p len=18 match=100% [305:322]
 HRVkaV-fo len=20 match=100% [310:329]
 ENTng-f len=19 match=100% [311:329]
 HRVma-r (rev) len=20 match=95% [390:409]
target          389 GGATGGAACCGACTACTTTG 409
                  0 ||||||.|||||||||||||  20
query             0 GGATGGGACCGACTACTTTG  20

 ENTng-p len=18 match=100% [392:409]
 ENTrc-r (rev) len=21 match=100% [401:421]

 HRVbo-f len=15 match=100% [174:188]
 HRVma-f len=19 match=95% [221:239]
target          220 TAGACAGGGTGTGAAGAGC 239
                  0 |.|||||||||||||||||  19
query             0 TGGACAGGGTGTGAAGAGC  19

 HRVma-p len=18 match=100% [259:276]
 HRVka5-ri (rev) len=18 match=100% [259:276]
 HRVbo-p len=18 match=100% [259:276]
 HRVkaV-fo len=20 match=100% [264:283]
 ENTng-f len=19 match=100% [265:283]


 HRVbo-f len=15 match=100% [174:188]
 HRVma-f len=19 match=95% [221:239]
target          220 TAGACAGGGTGTGAAGAGC 239
                  0 |.|||||||||||||||||  19
query             0 TGGACAGGGTGTGAAGAGC  19

 HRVma-p len=18 match=100% [259:276]
 HRVka5-ri (rev) len=18 match=100% [259:276]
 HRVbo-p len=18 match=100% [259:276]
 HRVkaV-fo len=20 match=100% [264:283]
 ENTng-f len=19 match=100% [265:283]


target            0 CATGTGAGGAATAGGCTCCAAAAGGGCTAAAGCCACTAGTGTCGTTATCCGCATTGGTAC
                  0 ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
query             0 CATGTGAGGAATAGGCTCCAAAAGGGCTAAAGCCACTAGTGTCGTTATCCGCATTGGTAC

target           60 TACGCAAAGCCTAGTATCACTCTGGAAGTCTCTCGGTTGGTCGCTCCACCAGCTACCCCA
                 60 ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
query            60 TACGCAAAGCCTAGTATCACTCTGGAAGTCTCTCGGTTGGTCGCTCCACCAGCTACCCCA

target          120 CTGGTAGACCTGGCAGATGAGGCAGGACTCTCCCCACTGGTGACAGTGGTCCTGCCTGCG
                120 ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
query           120 CTGGTAGACCTGGCAGATGAGGCAGGACTCTCCCCACTGGTGACAGTGGTCCTGCCTGCG

target          180 TGGCTGCCTGCACACCCCTACGGGGTGTGAAGCCTAGAGATAGACAGGGTGTGAAGAGCC
                180 |||||||||||||||||||||.||||||||||||||||||||||||||||||||||||||
query           180 TGGCTGCCTGCACACCCCTACAGGGTGTGAAGCCTAGAGATAGACAGGGTGTGAAGAGCC

target          240 CCGT

# Other reference sequences
## RefSeq Rhinovirus A-89

Test against the full A-89 genome since this is what's often used as the common coordinate system

In [17]:
# Trailing ';' keeps the returned record from being echoed as the cell result
printSeqBinding("refseq/Rhinovirus-A89.gb", "gb");

 ENTrc-f1 len=21 match=100% [165:185]
 HRVka5-f len=21 match=100% [165:185]
 HRVbo-f len=15 match=100% [357:371]
 HRVma-f len=19 match=89% [404:422]
target          403 TTGACAAGGTGTGAAGAGC 422
                  0 |.||||.||||||||||||  19
query             0 TGGACAGGGTGTGAAGAGC  19

 HRVma-p len=18 match=100% [442:459]
 HRVka5-ri (rev) len=18 match=100% [442:459]
 HRVbo-p len=18 match=100% [442:459]
 HRVkaV-fo len=20 match=100% [447:466]
 ENTng-f len=19 match=100% [448:466]
 HRVma-r (rev) len=20 match=95% [528:547]
target          527 GGATGGGACCAACTACTTTG 547
                  0 ||||||||||.|||||||||  20
query             0 GGATGGGACCGACTACTTTG  20

 ENTng-p len=18 match=100% [530:547]
 ENTrc-r (rev) len=21 match=100% [539:559]
 HRVka5-ro (rev) len=23 match=100% [539:561]
 ENTng-r (rev) len=22 match=100% [540:561]
 HRVkaV-fi len=22 match=100% [540:561]
 HRVbo-r (rev) len=19 match=100% [543:561]
 HRVkaV-r (rev) len=23 match=100% [1054:1076]
 random (rev) len=15 match=80% [2786:2800]
  Prim

Overall this isn't anything too special, no compelling reason to switch from HRV Ka.

## RefSeq Rhinovirus C-11

Test against the full C-11 genome since I have several hits to it. There are a few mismatches.

In [18]:
# Trailing ';' keeps the returned record from being echoed as the cell result
printSeqBinding("refseq/Rhinovirus-C11.fasta", "fasta");

 ENTrc-f1 len=21 match=90% [163:183]
target          162 CAAATACTTCTGTTTCCCCGG 183
                  0 |||..||||||||||||||||  21
query             0 CAAGCACTTCTGTTTCCCCGG  21

 HRVka5-f len=21 match=90% [163:183]
target          162 CAAATACTTCTGTTTCCCCGG 183
                  0 |||..||||||||||||||||  21
query             0 CAAGCACTTCTGTTTCCCCGG  21

 HRVbo-f len=15 match=100% [357:371]
 HRVma-f len=19 match=84% [407:423]
  Primer mismatch in first 2 bases
target          404 CAGACAAGGTGTGAAGAGC 423
                  0 ..||||.||||||||||||  19
query             0 TGGACAGGGTGTGAAGAGC  19

 HRVma-p len=18 match=100% [443:460]
 HRVka5-ri (rev) len=18 match=100% [443:460]
 HRVbo-p len=18 match=100% [443:460]
 HRVkaV-fo len=20 match=100% [448:467]
 ENTng-f len=19 match=100% [449:467]
 HRVma-r (rev) len=20 match=95% [528:547]
target          527 GGATGGAACCGACTACTTTG 547
                  0 ||||||.|||||||||||||  20
query             0 GGATGGGACCGACTACTTTG  20

 ENTng-p len=18 match=100% [530:54

# Wisdom primers

See how the primers from this older paper compare: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2786677/



In [19]:
# Unlike the HRVPrimers.fasta load, expandAmbiguous is not passed here; the recorded
# output for this cell reports per-primer variation counts ("Read 1343 primers").
wisdomPrimers = RCUtils.readPrimers("HRV-Wisdom-primers.fasta", display=True)
printSeqBinding("refseq/Rhinovirus-A89.gb", "gb", primers=wisdomPrimers)

# One column per Wisdom primer set across the GenBank-format references
summaryTable("refseq/Rhinovirus-*.gb", "gb", primers=wisdomPrimers)

Reading primers: HRV-Wisdom-primers.fasta
  HRV-Wi-5UTR-fo (48 variations)
  HRV-Wi-5UTR-ro
  HRV-Wi-5UTR-f1i (16 variations)
  HRV-Wi-5UTR-f2i (4 variations)
  HRV-Wi-5UTR-f3i (4 variations)
  HRV-Wi-5UTR-ri (2 variations)
  HRV-Wi-VP4VP2-fo (2 variations)
  HRV-Wi-VP4VP2-fi (2 variations)
  HRV-Wi-VP4VP2-ri (192 variations)
  HRV-Wi-VP4VP2-ro (768 variations)
  HRV-Wi-VP1-fo (96 variations)
  HRV-Wi-VP1-fi (64 variations)
  HRV-Wi-VP1-ri (96 variations)
  HRV-Wi-VP1-ro (48 variations)
Read 1343 primers


 HRV-Wi-5UTR-fo.39 len=22 match=100% [164:185]
 HRV-Wi-5UTR-f1i.7 len=21 match=100% [354:374]
 HRV-Wi-5UTR-ri.0 (rev) len=21 match=100% [446:466]
 HRV-Wi-5UTR-ro (rev) len=23 match=100% [539:561]
 HRV-Wi-VP4VP2-ri.46 (rev) len=23 match=100% [1054:1076]
 HRV-Wi-VP4VP2-ro.231 (rev) len=25 match=100% [1090:1114]
 HRV-Wi-VP1-fo.75 len=23 match=100% [1966:1988]
 HRV-Wi-VP1-fi.56 len=19 match=89% [2441:2459]
target         2440 TGGATGCTGCGGAAACCGG 2459
                  0 ||||||||||.|||||.||   19
query             0 TGGATGCTGCAGAAACAGG   19

 HRV-Wi-VP1-ri.94 (rev) len=26 match=96% [3319:3344]
target         3318 GATCTAATCATATATCGAACAAACAC 3344
                  0 ||||||||||||||.|||||||||||   26
query             0 GATCTAATCATATACCGAACAAACAC   26

 HRV-Wi-VP1-ro.21 (rev) len=23 match=100% [3514:3536]


Unnamed: 0,HRV-Wi-5UTR,HRV-Wi-VP4VP2,HRV-Wi-VP1
Rhinovirus-A89,100,100,85
Rhinovirus-C1,100,96,83
Rhinovirus-A56,100,77,100
Rhinovirus-A23,100,100,89
Rhinovirus-A77,95,100,96


# Bochkov primers

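See how the primers from this 2014 paper compare: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2786677/ (**TODO:** this is the same link as the one given for the Wisdom paper above — replace it with the link to the Bochkov 2014 paper.)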



In [67]:
# Load the Bochkov primers without expanding ambiguity codes (they are scored by the IUPAC matrix)
bochkovPrimers = RCUtils.readPrimers("HRV-Bochkov-primers.fasta", display=True, expandAmbiguous=False)
# Trailing ';' keeps the returned SeqRecord from being echoed as the cell result,
# matching the other standalone printSeqBinding cells in this notebook
printSeqBinding("refseq/Rhinovirus-C1.fasta", "fasta", primers=bochkovPrimers);

#summaryTable("refseq/Rhinovirus-*.fasta", "fasta", primers=bochkovPrimers)

Reading primers: HRV-Bochkov-primers.fasta
  HRV-Bv-5UTRn-A1 (degeneracy 2)
  HRV-Bv-5UTRn-A2
  HRV-Bv-5UTRn-A3
  HRV-Bv-5UTRn-B1
  HRV-Bv-5UTRn-B2
  HRV-Bv-5UTRn-Cc (degeneracy 4)
  HRV-Bv-5UTR-rev
  HRV-Bv-5UTR-revseq
  HRV-Bv-A53-f
  HRV-Bv-B6-f
  HRV-Bv-A53-r
  HRV-Bv-B6-R1164-r
  HRV-Bv-A82-f
  HRV-Bv-B27-f
  HRV-Bv-A82-r
  HRV-Bv-B27-r
  HRV-Bv-A21-f
  HRV-Bv-A21-r
  HRV-Bv-B6-R1912-r
  HRV-Bv-5UTR-f (degeneracy 2)
  HRV-Bv-VP2-C252-5r (degeneracy 2)
Read 21 primers


 HRV-Bv-5UTRn-A1 len=19 match=100% [31:49]
 HRV-Bv-A21-f len=20 match=95% [89:107]
  Primer mismatch in first 1 bases
target           87 TATCGTTATCCGCAAAGTGC 107
                  0 .|||||||||||||||||||  20
query             0 GATCGTTATCCGCAAAGTGC  20

 HRV-Bv-5UTR-revseq (rev) len=15 match=100% [304:318]
 HRV-Bv-5UTR-rev (rev) len=18 match=100% [401:418]
 HRV-Bv-VP2-C252-5r (rev) len=21 match=100% [710:730]


SeqRecord(seq=Seq('GTGACAATCAGCCAGATTGTTAACGGTCAAGCACTTCTGTTTCCCCGGTACCCT...TTT'), id='EF077279.1', name='EF077279.1', description='EF077279.1 Rhinovirus C1 isolate NAT001 polyprotein gene, complete cds', dbxrefs=[])

In [38]:
# Spot check of the substitution matrix: inosine (I) against a plain base should score as a match
m = iupac_subst();
m["I"]["C"]

1.0