In [1]:
from collections import defaultdict
from typing import Iterable, Dict, List, Tuple

import numpy as np

from nupack import Complex as NupackComplex
from nupack import Model as NupackModel
from nupack import ComplexSet as NupackComplexSet
from nupack import Strand as NupackStrand
from nupack import SetSpec as NupackSetSpec
from nupack import complex_analysis as nupack_complex_analysis
from nupack import PairsMatrix as NupackPairsMatrix


import dsd.vienna_nupack as dv
import dsd.constraints as dc

# constants
# Length in bases of every domain; used both for the pool and for each random sequence drawn.
DOMAIN_LENGTH = 15
# Pool object attached to every domain before random sequences are assigned.
DOMAIN_POOL: dc.DomainPool = dc.DomainPool('DOMAIN_POOL', DOMAIN_LENGTH)
# Temperature in degrees Celsius handed to the NUPACK model.
TEMPERATURE = 37.0
# Model (DNA material at TEMPERATURE) passed to every NUPACK complex_analysis call.
NUPACK_MODEL = NupackModel(material='dna', celsius=TEMPERATURE)
# Number of random sequence assignments evaluated per design.
NUMBER_OF_TRIALS = 1000

def nupack_base_pair_probabilities(strands: Iterable[dc.Strand], base_index1: int, base_index2: int) -> Tuple[np.ndarray, Dict[str, List[float]]]:
    """Given a design and a specific base pair, assigns random DNA sequences to
    the design and computes the equilibrium base-pairing probability for each
    DNA sequence assignment.

    Base indices are global: they count bases across all strands concatenated
    in the order given, starting at 0.

    Side effects: every domain of every strand has its ``pool`` set to
    ``DOMAIN_POOL`` and its ``sequence`` overwritten on each trial (the
    sequences of the last trial remain assigned on return).

    :param strands: The strands that make up the design.
    :type strands: Iterable[dc.Strand]
    :param base_index1: The index of one of the bases that form the base pair
    :type base_index1: int
    :param base_index2: The index of the other base that forms the base pair
    :type base_index2: int

    :return: An array of NUMBER_OF_TRIALS base pair probabilities and a
        dictionary which sorts the results by base (base located at
        base_index1)
    :rtype: Tuple[np.ndarray, Dict[str, List[float]]]
    :raises ValueError: if either index is outside ``0 <= index < total
        number of bases``
    """
    # The strands are iterated once per trial and also indexed below, so a
    # one-shot iterable (e.g. a generator) must be materialized first.
    strands = list(strands)

    # Set of dc.Domain objects that receive a fresh random sequence each trial.
    domains_to_assign = set()
    base_index1_strand_idx = None
    base_index1_strand_base_idx = None

    strand_start = 0
    for strand_idx, s in enumerate(strands):
        for d in s.domains:
            d.pool = DOMAIN_POOL
            domains_to_assign.add(d)

        # Each strand owns the half-open index range [strand_start, strand_end),
        # so exactly one strand can contain base_index1. (The previous version
        # kept re-testing later strands with partially advanced counters and
        # could overwrite a correct match.)
        strand_end = strand_start + s.length()
        if strand_start <= base_index1 < strand_end:
            base_index1_strand_idx = strand_idx
            base_index1_strand_base_idx = base_index1 - strand_start
        strand_start = strand_end

    total_bases = strand_start
    for label, index in (('base_index1', base_index1), ('base_index2', base_index2)):
        if not 0 <= index < total_bases:
            raise ValueError(
                f'{label}={index} is out of range for a design with {total_bases} bases')

    print('DEBUG: Domains to assign:', domains_to_assign)
    print('DEBUG: Using strand index', base_index1_strand_idx, 'and base index', base_index1_strand_base_idx)

    strand_with_base1 = strands[base_index1_strand_idx]

    base_pair_probabilities = np.zeros(NUMBER_OF_TRIALS)
    base_pair_probabilities_by_base: Dict[str, List[float]] = defaultdict(list)
    for i in range(NUMBER_OF_TRIALS):
        # Assign random DNA sequence
        for d in domains_to_assign:
            d.sequence = dv.random_dna_seq(DOMAIN_LENGTH)

        # Initialize NUPACK NupackComplexSet containing the single complex
        # formed by all strands in the given order
        nupack_strands = [NupackStrand(strand.sequence(), name=strand.name) for strand in strands]
        nupack_complex: NupackComplex = NupackComplex(nupack_strands)
        nupack_complex_set = NupackComplexSet(nupack_strands, complexes=(nupack_complex,))

        # Call NUPACK complex_analysis and extract the pair-probability matrix
        nupack_complex_result: np.ndarray = nupack_complex_analysis(nupack_complex_set, compute=['pairs'], model=NUPACK_MODEL)[nupack_complex].pairs.to_array()

        # Collect results; the pair matrix is expected to be symmetric
        assert nupack_complex_result[base_index1][base_index2] == nupack_complex_result[base_index2][base_index1]
        p = nupack_complex_result[base_index1][base_index2]
        base_pair_probabilities[i] = p
        # Group the probability under the base currently at base_index1
        base_at_base_index1: str = strand_with_base1.sequence()[base_index1_strand_base_idx]
        base_pair_probabilities_by_base[base_at_base_index1].append(p)

    return base_pair_probabilities, base_pair_probabilities_by_base

def summarize_bpps(bpps: np.ndarray, verbose: bool = False) -> None:
    """Print the mean and standard deviation of base pair probabilities.

    :param bpps: The base pair probabilities to summarize.
    :type bpps: np.ndarray
    :param verbose: When true, the raw values are printed first.
    :type verbose: bool
    """
    if verbose:
        print('bpps:', bpps)
    # Each statistic is computed and printed in turn, followed by a blank
    # separator line.
    for label, statistic in (('mean', np.mean), ('standard deviation:', np.std)):
        print(label, statistic(bpps))
    print()
    
def summarize_bpps_by_base(bpps_by_base: Dict[str, List[float]], verbose: bool = False) -> None:
    """Print sample size, mean and standard deviation for each base A, C, G, T.

    Bases are looked up with ``.get`` so that a plain ``dict`` lacking a base
    does not raise ``KeyError`` and a ``defaultdict`` is not mutated by the
    lookup. A base with no samples prints only its header line (sample size 0)
    and a blank separator, since mean/std of an empty sample are undefined.

    :param bpps_by_base: Maps a base to the base pair probabilities observed
        for that base.
    :type bpps_by_base: Dict[str, List[float]]
    :param verbose: Forwarded to ``summarize_bpps``.
    :type verbose: bool
    """
    for base in 'ACGT':
        bpps = bpps_by_base.get(base, [])
        print('base:', base, 'sample size:', len(bpps))
        if len(bpps) == 0:
            # Nothing to average; keep the blank line that separates bases.
            print()
            continue
        summarize_bpps(np.array(bpps), verbose=verbose)

In [2]:
# Scenario: the probed base pair lies between two paired domains, with neither
# strand nicked next to it (see diagram).
print('INTERIOR_TO_STRAND')
# INTERIOR_TO_STRAND
#                       a      b
#                     0  14 15  29
#                     |   |  |   |
#                    [-----##----->
#                     |||||  |||||
#                    <-----##-----]
#                     |   |  |   |
#                     59 45  44  30
#                       a*^   b*
#                         |
#                     base pair
top_strand: dc.Strand = dc.Strand(['a', 'b'], name='top strand')
bot_strand: dc.Strand = dc.Strand(['b*', 'a*'], name='bot strand')

# Indices count bases across the strands in the order passed: 14 is the last
# base of a, 45 is the first base of a*.
interior_to_strand_bpps, interior_to_strand_bpps_by_base = nupack_base_pair_probabilities((top_strand, bot_strand), 14, 45)
summarize_bpps(interior_to_strand_bpps)
summarize_bpps_by_base(interior_to_strand_bpps_by_base)

INTERIOR_TO_STRAND
DEBUG: Domains to assign: {b, a}
DEBUG: Using strand index 0 and base index 14
mean 0.996139864044728
standard deviation: 0.004231660501159015

base: A sample size: 248
mean 0.9935249351570609
standard deviation: 0.004340098677019922

base: C sample size: 257
mean 0.9989495829272403
standard deviation: 0.0007747872665446448

base: G sample size: 240
mean 0.9989600520844661
standard deviation: 0.0007668391426108173

base: T sample size: 255
mean 0.9931969600517819
standard deviation: 0.004471311228479622



In [3]:
# Scenario: the probed base pair is the terminal pair of a single-domain
# duplex, with nothing beyond it on either strand (see diagram).
print('BLUNT_END')
#                       a
#                     0  14
#                     |   |
#                    [----->
#                     |||||
#                    <-----]
#                     |   |
#                     29  15
#
#                         ^
#                         |
#                     base pair

top_strand: dc.Strand = dc.Strand(['a'], name='top strand')
bot_strand: dc.Strand = dc.Strand(['a*'], name='bot strand')

# 14 is the last base of a (top strand); 15 is the first base of a* (bottom strand).
blunt_end_bpps, blunt_end_bpps_by_base = nupack_base_pair_probabilities((top_strand, bot_strand), 14, 15)
summarize_bpps(blunt_end_bpps)
summarize_bpps_by_base(blunt_end_bpps_by_base)

BLUNT_END
DEBUG: Domains to assign: {a}
DEBUG: Using strand index 0 and base index 14
mean 0.4733166229374988
standard deviation: 0.19127905042615134

base: A sample size: 259
mean 0.2938893694410884
standard deviation: 0.09825986585636196

base: C sample size: 235
mean 0.6452566935726917
standard deviation: 0.12252186002559047

base: G sample size: 242
mean 0.6364867088367911
standard deviation: 0.13467265087307204

base: T sample size: 264
mean 0.3467203398642841
standard deviation: 0.02822973808321057



In [4]:
# Scenario: the top side is split into two strands right after base 14, while
# the bottom strand is continuous (see diagram).
print('NICK_3P')
#                       a      b
#                     0  14 15  29
#                     |   |  |   |
#                    [----->[----->
#                     |||||  |||||
#                    <-----##-----]
#                     |   |  |   |
#                     59 45  44  30
#                       a*    b*
#                         ^
#                         |
#                     base pair
top_strand_left = dc.Strand(['a'], name='top strand left')
top_strand_right = dc.Strand(['b'], name='top strand right')
bot_strand = dc.Strand(['b*', 'a*'], name='bot strand')
strands = (top_strand_left, top_strand_right, bot_strand)

# 14 is the last base of a; 45 is the first base of a* on the bottom strand.
nick_3p_bpps, nick_3p_bpps_by_base = nupack_base_pair_probabilities(strands, 14, 45)
summarize_bpps(nick_3p_bpps)
summarize_bpps_by_base(nick_3p_bpps_by_base)

# Disabled consistency check, kept for reference: rerun with the two indices swapped.
# Test if swapping base index leads to consistent results
# Expected: Means and Standard deviations for A and T should swap as should G and C
# nick_3p_bpps, nick_3p_bpps_by_base = nupack_base_pair_probabilities(strands, 45, 14)
# summarize_bpps(nick_3p_bpps)
# summarize_bpps_by_base(nick_3p_bpps_by_base)

NICK_3P
DEBUG: Domains to assign: {b, a}
DEBUG: Using strand index 0 and base index 14
mean 0.8745329341891518
standard deviation: 0.10086994798477672

base: A sample size: 261
mean 0.871364354562543
standard deviation: 0.08801782201329217

base: C sample size: 260
mean 0.8751411265155076
standard deviation: 0.069979802497329

base: G sample size: 239
mean 0.967143184352861
standard deviation: 0.03750701828347114

base: T sample size: 240
mean 0.7850955153915095
standard deviation: 0.10350017644920348



In [5]:
# Scenario: the bottom side is split into two strands next to the probed pair,
# while the top strand is continuous (see diagram).
print('NICK_5P')
#                       a      b
#                     0  14 15  29
#                     |   |  |   |
#                    [-----##----->
#                     |||||  |||||
#                    <-----]<-----]
#                     |   |  |   |
#                     59 45  44  30
#                       a*    b*
#                         ^
#                         |
#                     base pair
top_strand: dc.Strand = dc.Strand(['a', 'b'], name='top strand')
bot_strand_right: dc.Strand = dc.Strand(['b*'], name='bot strand right')
bot_strand_left: dc.Strand = dc.Strand(['a*'], name='bot strand left')
strands = (top_strand, bot_strand_right, bot_strand_left)

# 14 is the last base of a; 45 is the first base of the a* strand.
nick_5p_bpps, nick_5p_bpps_by_base = nupack_base_pair_probabilities(strands, 14, 45)
summarize_bpps(nick_5p_bpps)
summarize_bpps_by_base(nick_5p_bpps_by_base)

NICK_5P
DEBUG: Domains to assign: {b, a}
DEBUG: Using strand index 0 and base index 14
mean 0.8409437282759049
standard deviation: 0.14986277526944322

base: A sample size: 240
mean 0.6653598641230015
standard deviation: 0.13812206280298805

base: C sample size: 252
mean 0.9107106640159925
standard deviation: 0.0830886107987958

base: G sample size: 257
mean 0.9568783529615285
standard deviation: 0.04815530989063456

base: T sample size: 251
mean 0.8200818200925963
standard deviation: 0.11561798059337085



In [6]:
# Scenario: the top strand continues past the probed pair into domain b, which
# has no complement in the design; the bottom strand ends at the pair.
print('DANGLE_3P')
#                       a      b
#                     0  14 15  29
#                     |   |  |   |
#                    [-----##----->
#                     |||||
#                    <-----]
#                     |   |
#                     44 30
#                       a*
#                         ^
#                         |
#                     base pair
top_strand: dc.Strand = dc.Strand(['a', 'b'], name='top strand')
bot_strand: dc.Strand = dc.Strand(['a*'], name='bot strand')
strands = (top_strand, bot_strand)

# 14 is the last base of a; 30 is the first base of a* (the bottom strand starts at 30).
dangle_3p_bpps, dangle_3p_bpps_by_base = nupack_base_pair_probabilities(strands, 14, 30)
summarize_bpps(dangle_3p_bpps)
summarize_bpps_by_base(dangle_3p_bpps_by_base)

DANGLE_3P
DEBUG: Domains to assign: {b, a}
DEBUG: Using strand index 0 and base index 14
mean 0.654700146871485
standard deviation: 0.20544362789599882

base: A sample size: 256
mean 0.42119212208884804
standard deviation: 0.1213167616936697

base: C sample size: 257
mean 0.7845312045792187
standard deviation: 0.15006973684273464

base: G sample size: 242
mean 0.8274828789474443
standard deviation: 0.13309740026067055

base: T sample size: 245
mean 0.5918350503453028
standard deviation: 0.08761917340751331



In [7]:
# Scenario: the bottom strand continues past the probed pair into domain b*,
# which has no complement in the design; the top strand ends at the pair.
print('DANGLE_5P')
#                       a
#                     0  14
#                     |   |
#                    [----->
#                     |||||
#                    <-----##-----]
#                     |   |  |   |
#                     44 30  29  15
#                       a*    b*
#                         ^
#                         |
#                     base pair
top_strand: dc.Strand = dc.Strand(['a'], name='top strand')
bot_strand: dc.Strand = dc.Strand(['b*', 'a*'], name='bot strand')
strands = (top_strand, bot_strand)

# 14 is the last base of a; 30 is the first base of a* (b* occupies 15-29).
dangle_5p_bpps, dangle_5p_bpps_by_base = nupack_base_pair_probabilities(strands, 14, 30)
summarize_bpps(dangle_5p_bpps)
summarize_bpps_by_base(dangle_5p_bpps_by_base)

DANGLE_5P
DEBUG: Domains to assign: {b, a}
DEBUG: Using strand index 0 and base index 14
mean 0.6783246680712568
standard deviation: 0.18532563210274025

base: A sample size: 255
mean 0.5057445899385706
standard deviation: 0.14733023377077312

base: C sample size: 245
mean 0.7446469407612092
standard deviation: 0.17560696985700794

base: G sample size: 240
mean 0.8631021892850903
standard deviation: 0.08665539105324747

base: T sample size: 260
mean 0.6145260450846287
standard deviation: 0.07666025042200283



In [8]:
# Scenario: both strands continue past the probed pair into domains (b on top,
# c on the bottom) that are not complementary to each other by design.
print('DANGLE_5P_3P')
#                       a      b
#                     0  14 15  29
#                     |   |  |   |
#                    [-----##---->
#                     |||||
#                    <-----##----]
#                     |   |  |   |
#                     59 45  44  30
#                       a*    c
#                         ^
#                         |
#                     base pair
top_strand: dc.Strand = dc.Strand(['a', 'b'], name='top strand')
bot_strand: dc.Strand = dc.Strand(['c', 'a*'], name='bot strand')
strands = (top_strand, bot_strand)

# 14 is the last base of a; 45 is the first base of a* (c occupies 30-44).
dangle_5p_3p_bpps, dangle_5p_3p_bpps_by_base = nupack_base_pair_probabilities(strands, 14, 45)
summarize_bpps(dangle_5p_3p_bpps)
summarize_bpps_by_base(dangle_5p_3p_bpps_by_base)

DANGLE_5P_3P
DEBUG: Domains to assign: {c, b, a}
DEBUG: Using strand index 0 and base index 14
mean 0.8263392892048792
standard deviation: 0.1705027089001471

base: A sample size: 241
mean 0.7318440944454684
standard deviation: 0.19179983168624098

base: C sample size: 241
mean 0.8560361248225861
standard deviation: 0.1779914048427839

base: G sample size: 252
mean 0.9165546347001837
standard deviation: 0.09269235332371747

base: T sample size: 266
mean 0.7995804075820746
standard deviation: 0.14765396534219646



In [9]:
# Scenario: nick on the top side, where the strand holding base 14 continues
# into an overhang domain b that has no complement in the design.
print('OVERHANG_ON_THIS_STRAND_3P')
#                          ^
#                          |-29
#                          |   b
#                          |-15
#                          #
#                          #
#                       a  #    c
#                     0  14#  30  44
#                     |   |#  |   |
#                    [-----# [----->
#                     |||||   |||||
#                    <-----###-----]
#                     |   |   |   |
#                     74  60  59  45
#                       a*      c*
#                         ^
#                         |
#                     base pair
top_strand_left: dc.Strand = dc.Strand(['a', 'b'], name='top strand left')
top_strand_right: dc.Strand = dc.Strand(['c'], name='top strand right')
bot_strand: dc.Strand = dc.Strand(['c*', 'a*'], name='bot strand')
strands = (top_strand_left, top_strand_right, bot_strand)

# 14 is the last base of a; 60 is the first base of a* (c* occupies 45-59).
overhang_on_this_strand_3p_bpps, overhang_on_this_strand_3p_bpps_by_base = nupack_base_pair_probabilities(strands, 14, 60)
summarize_bpps(overhang_on_this_strand_3p_bpps)
summarize_bpps_by_base(overhang_on_this_strand_3p_bpps_by_base)

OVERHANG_ON_THIS_STRAND_3P
DEBUG: Domains to assign: {c, b, a}
DEBUG: Using strand index 0 and base index 14
mean 0.8709961268508676
standard deviation: 0.12577237257663734

base: A sample size: 235
mean 0.8607754333015771
standard deviation: 0.1049531255485245

base: C sample size: 254
mean 0.8666935306531415
standard deviation: 0.13986840417747567

base: G sample size: 238
mean 0.9659393164527569
standard deviation: 0.04964336629859248

base: T sample size: 273
mean 0.8010263220635269
standard deviation: 0.12349253541739529



In [10]:
# Scenario: nick on the bottom side, where the strand holding a* begins with an
# overhang domain c that has no complement in the design.
print('OVERHANG_ON_THIS_STRAND_5P')
#                     base pair
#                         |
#                         v
#                       a       b
#                     0   14  15  29
#                     |   |   |   |
#                    [-----###----->
#                     |||||   |||||
#                    <-----# <-----]
#                     |   |#  |   |
#                     74 60#  44  30
#                       a* #    b*
#                          #
#                          #
#                          |-59
#                          |    c
#                          |-45
#                          ]
top_strand: dc.Strand = dc.Strand(['a', 'b'], name='top strand')
bot_strand_right: dc.Strand = dc.Strand(['b*'], name='bot strand right')
bot_strand_left: dc.Strand = dc.Strand(['c', 'a*'], name='bot strand left')
strands = (top_strand, bot_strand_right, bot_strand_left)

# 14 is the last base of a; 60 is the first base of a* (c occupies 45-59).
overhang_on_this_strand_5p_bpps, overhang_on_this_strand_5p_bpps_by_base = nupack_base_pair_probabilities(strands, 14, 60)
summarize_bpps(overhang_on_this_strand_5p_bpps)
summarize_bpps_by_base(overhang_on_this_strand_5p_bpps_by_base)

OVERHANG_ON_THIS_STRAND_5P
DEBUG: Domains to assign: {c, b, a}
DEBUG: Using strand index 0 and base index 14
mean 0.8543380093819098
standard deviation: 0.15265644623208846

base: A sample size: 237
mean 0.70598980370478
standard deviation: 0.16076688410334453

base: C sample size: 251
mean 0.9027180148829604
standard deviation: 0.12613355333062898

base: G sample size: 252
mean 0.9530741058103965
standard deviation: 0.0851049555949977

base: T sample size: 260
mean 0.8471597288616695
standard deviation: 0.10895771403577921



In [11]:
# Scenario: nick on the top side, where the neighbouring top strand (not the
# one holding base 14) begins with an overhang domain b that has no complement.
print('OVERHANG_ON_ADJACENT_STRAND_3P')
#                            [
#                            |-15
#                            |    b
#                            |-29
#                            #
#                            #
#                       a    #  c
#                     0   14 #30  44
#                     |   |  #|   |
#                    [-----> #----->
#                     |||||   |||||
#                    <-----###-----]
#                     |   |   |   |
#                     74  60  59  45
#                       a*      c*
#                         ^
#                         |
#                     base pair
top_strand_left: dc.Strand = dc.Strand(['a'], name='top strand left')
top_strand_right: dc.Strand = dc.Strand(['b', 'c'], name='top strand right')
bot_strand: dc.Strand = dc.Strand(['c*', 'a*'], name='bot strand')
strands = (top_strand_left, top_strand_right, bot_strand)

# 14 is the last base of a; 60 is the first base of a* (c* occupies 45-59).
overhang_on_adj_strand_3p_bpps, overhang_on_adj_strand_3p_bpps_by_base = nupack_base_pair_probabilities(strands, 14, 60)
summarize_bpps(overhang_on_adj_strand_3p_bpps)
summarize_bpps_by_base(overhang_on_adj_strand_3p_bpps_by_base)

OVERHANG_ON_ADJACENT_STRAND_3P
DEBUG: Domains to assign: {c, b, a}
DEBUG: Using strand index 0 and base index 14
mean 0.677918132459771
standard deviation: 0.2305652054914311

base: A sample size: 249
mean 0.6677236124232251
standard deviation: 0.19134784560720866

base: C sample size: 257
mean 0.6404521084971335
standard deviation: 0.2724992089685542

base: G sample size: 227
mean 0.8578163933891318
standard deviation: 0.19075432190158378

base: T sample size: 267
mean 0.5705409729711297
standard deviation: 0.14850310416255283



In [12]:
# Scenario: nick on the bottom side, where the neighbouring bottom strand (the
# one holding b*, not a*) ends with an overhang domain c that has no complement.
print('OVERHANG_ON_ADJACENT_STRAND_5P')
#                     base pair
#                         |
#                         v
#                       a       b
#                     0   14  15  29
#                     |   |   |   |
#                    [-----###----->
#                     |||||   |||||
#                    <-----] #-----]
#                     |   |  #|   |
#                     74  60 #44  30
#                       a*   #  b*
#                            #
#                            #
#                            |-45
#                            |   c
#                            |-59
#                            v
top_strand: dc.Strand = dc.Strand(['a', 'b'], name='top strand')
bot_strand_right: dc.Strand = dc.Strand(['b*', 'c'], name='bot strand right')
bot_strand_left: dc.Strand = dc.Strand(['a*'], name='bot strand left')
strands = (top_strand, bot_strand_right, bot_strand_left)

# 14 is the last base of a; 60 is the first base of the a* strand.
overhang_on_adj_strand_5p_bpps, overhang_on_adj_strand_5p_bpps_by_base = nupack_base_pair_probabilities(strands, 14, 60)
summarize_bpps(overhang_on_adj_strand_5p_bpps)
summarize_bpps_by_base(overhang_on_adj_strand_5p_bpps_by_base)

OVERHANG_ON_ADJACENT_STRAND_5P
DEBUG: Domains to assign: {c, b, a}
DEBUG: Using strand index 0 and base index 14
mean 0.6612378977853949
standard deviation: 0.26739477348342183

base: A sample size: 250
mean 0.4279570442800491
standard deviation: 0.1921257062559456

base: C sample size: 245
mean 0.7021323234291046
standard deviation: 0.2834369086014489

base: G sample size: 263
mean 0.8672037532071045
standard deviation: 0.15782221423505458

base: T sample size: 242
mean 0.6369902081891877
standard deviation: 0.2105161103482709



In [13]:
# Scenario: nick on the top side with an overhang on both top strands; the
# overhang domains b and c are not complementary to each other by design.
print('OVERHANG_ON_BOTH_STRAND_3P')
#                          ^ [
#                       29-| |-30
#                     b    | |    c
#                       15-| |-44
#                          # #
#                       a  # #   d
#                     0  14# #45  59
#                     |   |   |   |
#                    [-----# #----->
#                     |||||   |||||
#                    <-----###-----]
#                     |   |   |   |
#                     89 75   74  60
#                       a*      d*
#                         ^
#                         |
#                     base pair
top_strand_left: dc.Strand = dc.Strand(['a', 'b'], name='top strand left')
top_strand_right: dc.Strand = dc.Strand(['c', 'd'], name='top strand right')
bot_strand: dc.Strand = dc.Strand(['d*', 'a*'], name='bot strand')
strands = (top_strand_left, top_strand_right, bot_strand)

# 14 is the last base of a; 75 is the first base of a* (d* occupies 60-74).
overhang_on_both_strand_3p_bpps, overhang_on_both_strand_3p_bpps_by_base = nupack_base_pair_probabilities(strands, 14, 75)
summarize_bpps(overhang_on_both_strand_3p_bpps)
summarize_bpps_by_base(overhang_on_both_strand_3p_bpps_by_base)

OVERHANG_ON_BOTH_STRAND_3P
DEBUG: Domains to assign: {c, d, b, a}
DEBUG: Using strand index 0 and base index 14
mean 0.72325467443002
standard deviation: 0.23870524339716756

base: A sample size: 230
mean 0.6971261473001897
standard deviation: 0.20918040158649245

base: C sample size: 272
mean 0.6866306246297865
standard deviation: 0.2866168616709433

base: G sample size: 258
mean 0.8433488683756104
standard deviation: 0.20642639137178664

base: T sample size: 240
mean 0.6607005108781956
standard deviation: 0.1870354615322912



In [14]:
# Scenario: nick on the bottom side with an overhang on both bottom strands;
# the overhang domains c and d are not complementary to each other by design.
print('OVERHANG_ON_BOTH_STRAND_5P')
#
#
#                     base pair
#                         |
#                         v
#                       a       b
#                     0   14  15  29
#                     |   |   |   |
#                    [-----###----->
#                     |||||   |||||
#                    <-----# #-----]
#                     |   |# #|   |
#                     89 75# #44  30
#                       a* # #  b*
#                          # #
#                          # #
#                       74-| |-45
#                      d   | |   c
#                       60-| |-59
#                          ] v
top_strand: dc.Strand = dc.Strand(['a', 'b'], name='top strand')
bot_strand_right: dc.Strand = dc.Strand(['b*', 'c'], name='bot strand right')
bot_strand_left: dc.Strand = dc.Strand(['d', 'a*'], name='bot strand left')
strands = (top_strand, bot_strand_right, bot_strand_left)

# 14 is the last base of a; 75 is the first base of a* (d occupies 60-74).
overhang_on_both_strand_5p_bpps, overhang_on_both_strand_5p_bpps_by_base = nupack_base_pair_probabilities(strands, 14, 75)
summarize_bpps(overhang_on_both_strand_5p_bpps)
summarize_bpps_by_base(overhang_on_both_strand_5p_bpps_by_base)

OVERHANG_ON_BOTH_STRAND_5P
DEBUG: Domains to assign: {c, d, b, a}
DEBUG: Using strand index 0 and base index 14
mean 0.7237435563453959
standard deviation: 0.24173076469853497

base: A sample size: 259
mean 0.5933384951163806
standard deviation: 0.22297472473703503

base: C sample size: 258
mean 0.7388118065280196
standard deviation: 0.26627798628287547

base: G sample size: 231
mean 0.8656700567860921
standard deviation: 0.19057585756313497

base: T sample size: 252
mean 0.712244670271575
standard deviation: 0.19644195978038356



In [15]:
# Scenario: three strands whose domains pair as a/a*, b/b* and c/c*, so the
# two upward overhangs are complementary and form a third arm (see diagram).
print('THREE_ARM_JUNCTION_3P')
#                          ^ [
#                       29-|-|-30
#                     b    |-|    b*
#                       15-|-|-44
#                          # #
#                       a  # #   c
#                     0  14# #45  59
#                     |   |   |   |
#                    [-----# #----->
#                     |||||   |||||
#                    <-----###-----]
#                     |   |   |   |
#                     89 75   74  60
#                       a*      c*
#                         ^
#                         |
#                     base pair
top_strand_left: dc.Strand = dc.Strand(['a', 'b'], name='top strand left')
top_strand_right: dc.Strand = dc.Strand(['b*', 'c'], name='top strand right')
bot_strand: dc.Strand = dc.Strand(['c*', 'a*'], name='bot strand')
strands = (top_strand_left, top_strand_right, bot_strand)

# 14 is the last base of a; 75 is the first base of a* (c* occupies 60-74).
three_arm_junction_3p_bpps, three_arm_junction_3p_bpps_by_base = nupack_base_pair_probabilities(strands, 14, 75)
summarize_bpps(three_arm_junction_3p_bpps)
summarize_bpps_by_base(three_arm_junction_3p_bpps_by_base)

THREE_ARM_JUNCTION_3P
DEBUG: Domains to assign: {c, b, a}
DEBUG: Using strand index 0 and base index 14
mean 0.7932873725912214
standard deviation: 0.16434888832699426

base: A sample size: 271
mean 0.7004845935374057
standard deviation: 0.14790563548811772

base: C sample size: 235
mean 0.791716782970993
standard deviation: 0.17551135143826002

base: G sample size: 249
mean 0.9491319360025092
standard deviation: 0.04745777315726583

base: T sample size: 245
mean 0.7390561293052095
standard deviation: 0.1306133558511293



In [16]:
# EQUIVALENT TO THREE_ARM_JUNCTION_3P
# Same three-arm topology, drawn with the third arm (c/c*) hanging below the
# bottom strands instead of above the top strands.
print('THREE_ARM_JUNCTION_5P')
#
#
#                     base pair
#                         |
#                         v
#                       a       b
#                     0   14  15  29
#                     |   |   |   |
#                    [-----###----->
#                     |||||   |||||
#                    <-----# #-----]
#                     |   |# #|   |
#                     89 75# #44  30
#                       a* # #  b*
#                          # #
#                          # #
#                       74-|-|-45
#                      c*  |-|   c
#                       60-|-|-59
#                          ] v
top_strand: dc.Strand = dc.Strand(['a', 'b'], name='top strand')
bot_strand_right: dc.Strand = dc.Strand(['b*', 'c'], name='bot strand right')
bot_strand_left: dc.Strand = dc.Strand(['c*', 'a*'], name='bot strand left')
strands = (top_strand, bot_strand_right, bot_strand_left)

# 14 is the last base of a; 75 is the first base of a* (c* occupies 60-74).
three_arm_junction_5p_bpps, three_arm_junction_5p_bpps_by_base = nupack_base_pair_probabilities(strands, 14, 75)
summarize_bpps(three_arm_junction_5p_bpps)
summarize_bpps_by_base(three_arm_junction_5p_bpps_by_base)

THREE_ARM_JUNCTION_5P
DEBUG: Domains to assign: {c, b, a}
DEBUG: Using strand index 0 and base index 14
mean 0.7989352188852504
standard deviation: 0.1643243914087958

base: A sample size: 270
mean 0.7094327147237839
standard deviation: 0.15515540784247872

base: C sample size: 233
mean 0.8000221587217748
standard deviation: 0.17376043136422178

base: G sample size: 253
mean 0.9528928129464077
standard deviation: 0.04416486458076376

base: T sample size: 244
mean 0.7373005789025163
standard deviation: 0.12622251019758576



In [17]:
# Scenario: four strands whose domains pair as a/a*, b/b*, c/c* and d/d*,
# forming four arms around one junction. The pair at the junction end of each
# arm is probed in turn.
print('FOUR_ARM_JUNCTION')
#                          ^ [
#                       29-|-|-30
#                      b   |-|   b*
#                       15-|-|-44
#                          # #
#                          # #
#                       a  # #  c
#                     0  14# #45  59
#                     |   |# #|   |
#                    [-----# #----->
#                     |||||   |||||
#                    <-----# #-----]
#                     |   |# #|   |
#                   119 105# #74  60
#                      a*  # #  c*
#                          # #
#                          # #
#                      104-|-|-75
#                     d*   |-|    d
#                       90-|-|-89
#                          ] v
top_strand_left: dc.Strand = dc.Strand(['a', 'b'], name='top strand left')
top_strand_right: dc.Strand = dc.Strand(['b*', 'c'], name='top strand right')
bot_strand_right: dc.Strand = dc.Strand(['c*', 'd'], name='bot strand right')
bot_strand_left: dc.Strand = dc.Strand(['d*', 'a*'], name='bot strand left')
strands = (top_strand_left, top_strand_right, bot_strand_right, bot_strand_left)

# NOTE(review): each of the four calls below rebinds four_arm_junction_bpps and
# four_arm_junction_bpps_by_base, so after this cell only the results of the
# last call (104, 75) remain available to later cells.

# Arm a/a*: last base of a (14) with first base of a* (105).
four_arm_junction_bpps, four_arm_junction_bpps_by_base = nupack_base_pair_probabilities(strands, 14, 105)
summarize_bpps(four_arm_junction_bpps)
summarize_bpps_by_base(four_arm_junction_bpps_by_base)

print('------------------------------------------------')

# Arm b/b*: last base of b* (44) with first base of b (15).
four_arm_junction_bpps, four_arm_junction_bpps_by_base = nupack_base_pair_probabilities(strands, 44, 15)
summarize_bpps(four_arm_junction_bpps)
summarize_bpps_by_base(four_arm_junction_bpps_by_base)

print('------------------------------------------------')

# Arm c/c*: last base of c* (74) with first base of c (45).
four_arm_junction_bpps, four_arm_junction_bpps_by_base = nupack_base_pair_probabilities(strands, 74, 45)
summarize_bpps(four_arm_junction_bpps)
summarize_bpps_by_base(four_arm_junction_bpps_by_base)

print('------------------------------------------------')

# Arm d/d*: last base of d* (104) with first base of d (75).
four_arm_junction_bpps, four_arm_junction_bpps_by_base = nupack_base_pair_probabilities(strands, 104, 75)
summarize_bpps(four_arm_junction_bpps)
summarize_bpps_by_base(four_arm_junction_bpps_by_base)

FOUR_ARM_JUNCTION
DEBUG: Domains to assign: {c, d, b, a}
DEBUG: Using strand index 0 and base index 14
mean 0.8438606570623834
standard deviation: 0.24513255194445582

base: A sample size: 271
mean 0.8251777178480196
standard deviation: 0.19225563655813616

base: C sample size: 212
mean 0.763731150345522
standard deviation: 0.3491110986087579

base: G sample size: 274
mean 0.9431889267434718
standard deviation: 0.1602209647054263

base: T sample size: 243
mean 0.8226038095662888
standard deviation: 0.23069280910560017

------------------------------------------------
DEBUG: Domains to assign: {c, d, b, a}
DEBUG: Using strand index 1 and base index 14
mean 0.8228769009823952
standard deviation: 0.2677841932812746

base: A sample size: 262
mean 0.8157969834519879
standard deviation: 0.20779463464422052

base: C sample size: 258
mean 0.7642366607116827
standard deviation: 0.35827181584161544

base: G sample size: 236
mean 0.9498834518767884
standard deviation: 0.1279462346453522

base: T 

In [18]:
# Scenario: five strands whose domains pair as a/a*, b/b*, c/c*, d/d* and
# e/e*, extending the four-arm construction by one more arm.
print('FIVE_ARM_JUNCTION')
top_strand_left: dc.Strand = dc.Strand(['a', 'b'], name='top strand left')
top_strand_right: dc.Strand = dc.Strand(['b*', 'c'], name='top strand right')
bot_strand_right: dc.Strand = dc.Strand(['c*', 'd'], name='bot strand right')
bot_strand_left: dc.Strand = dc.Strand(['d*', 'e'], name='bot strand left')
other_strand: dc.Strand = dc.Strand(['e*', 'a*'], name='other strand')
strands = (top_strand_left, top_strand_right, bot_strand_right, bot_strand_left, other_strand)

# 14 is the last base of a; 135 is the first base of a* (the first four strands
# cover 0-119 and e* occupies 120-134).
five_arm_junction_bpps, five_arm_junction_bpps_by_base = nupack_base_pair_probabilities(strands, 14, 135)
summarize_bpps(five_arm_junction_bpps)
summarize_bpps_by_base(five_arm_junction_bpps_by_base)

FIVE_ARM_JUNCTION
DEBUG: Domains to assign: {b, a, e, d, c}
DEBUG: Using strand index 0 and base index 14
mean 0.8332362060397392
standard deviation: 0.1674862095792048

base: A sample size: 230
mean 0.7881105937298064
standard deviation: 0.13938527970567274

base: C sample size: 280
mean 0.8049183404113645
standard deviation: 0.21269601798335197

base: G sample size: 235
mean 0.9711428130037553
standard deviation: 0.02912298836333044

base: T sample size: 255
mean 0.7779414631796837
standard deviation: 0.13683959686596217



In [20]:
from IPython.display import display
import pandas as pd

# Constants
# Table styles handed to the pandas Styler: one rule targeting the table
# caption (centered, 120% size, bold, black).
DATAFRAME_STYLE = [
    {
        "selector": "caption",
        "props": [
            ("text-align", "center"),
            ("font-size", "120%"),
            ("font-weight", "bold"),
            ("color", 'black'),
        ],
    },
]

# Utility function for displaying data in a pandas DataFrame
def display_data(data: Dict, caption: str) -> None:
    """Render ``data`` as a captioned, styled pandas table in the notebook.

    :param data: Anything accepted by the ``pd.DataFrame`` constructor.
    :param caption: Caption text shown with the table.
    """
    styler = pd.DataFrame(data).style
    styler = styler.set_caption(caption)
    styler = styler.set_table_styles(DATAFRAME_STYLE)
    display(styler)

class BasePairProbabilityResult:
    """Named set of base pair probability samples, looked up by base.

    ``bpps_by_base`` is indexed with a base (the notebook uses the characters
    of ``'ACGT'``) and each entry is handed to ``np.mean`` / ``np.std``.
    """

    def __init__(self, name, bpps_by_base) -> None:
        """Store the label of this result and its per-base samples."""
        self.name = name
        self.bpps_by_base = bpps_by_base

    def mean_by_base(self, base) -> float:
        """Return ``np.mean`` of the samples stored under ``base``."""
        samples = self.bpps_by_base[base]
        return np.mean(samples)

    def std_by_base(self, base) -> float:
        """Return ``np.std`` of the samples stored under ``base``.

        ``np.std`` is called with its defaults, i.e. the population standard
        deviation (``ddof=0``).
        """
        samples = self.bpps_by_base[base]
        return np.std(samples)
        
# Collect all results
# One BasePairProbabilityResult per base pair type, pairing its display label
# with the per-base probabilities computed in the earlier cells.
base_pair_probability_results = [
    BasePairProbabilityResult(label, bpps_by_base)
    for label, bpps_by_base in (
        ('INTERIOR_TO_STRAND', interior_to_strand_bpps_by_base),
        ('BLUNT_END', blunt_end_bpps_by_base),
        ('NICK_3P', nick_3p_bpps_by_base),
        ('NICK_5P', nick_5p_bpps_by_base),
        ('DANGLE_3P', dangle_3p_bpps_by_base),
        ('DANGLE_5P', dangle_5p_bpps_by_base),
        ('DANGLE_5P_3P', dangle_5p_3p_bpps_by_base),
        ('OVERHANG_ON_THIS_STRAND_3P', overhang_on_this_strand_3p_bpps_by_base),
        ('OVERHANG_ON_THIS_STRAND_5P', overhang_on_this_strand_5p_bpps_by_base),
        ('OVERHANG_ON_ADJACENT_STRAND_3P', overhang_on_adj_strand_3p_bpps_by_base),
        ('OVERHANG_ON_ADJACENT_STRAND_5P', overhang_on_adj_strand_5p_bpps_by_base),
        ('OVERHANG_ON_BOTH_STRAND_3P', overhang_on_both_strand_3p_bpps_by_base),
        ('OVERHANG_ON_BOTH_STRAND_5P', overhang_on_both_strand_5p_bpps_by_base),
        ('THREE_ARM_JUNCTION_3P', three_arm_junction_3p_bpps_by_base),
        ('THREE_ARM_JUNCTION_5P', three_arm_junction_5p_bpps_by_base),
        ('FOUR_ARM_JUNCTION', four_arm_junction_bpps_by_base),
        ('FIVE_ARM_JUNCTION', five_arm_junction_bpps_by_base),
    )
]

# Maps each base to the base pair type names, ordered from highest to lowest
# mean base pair probability for that base
sorted_base_pair_type_by_base: Dict[str, List[str]] = {}

for base in 'ACGT':
    # Rank the results by descending mean probability for this base. sorted()
    # is stable with reverse=True, so results with equal means keep their
    # current relative order. The module-level list is deliberately rebound,
    # so after the loop it is left in the order of the last base ('T').
    base_pair_probability_results = sorted(
        base_pair_probability_results,
        key=lambda bppr: bppr.mean_by_base(base),
        reverse=True,
    )

    # Name column, shared by the per-base table and the summary table below
    ranked_names = [bppr.name for bppr in base_pair_probability_results]

    display_data(
        {
            "Base Pair Type": ranked_names,
            "Mean Probability": [bppr.mean_by_base(base) for bppr in base_pair_probability_results],
            "Standard Deviation": [bppr.std_by_base(base) for bppr in base_pair_probability_results],
        },
        caption=f'Equilibrium Base Pair Probability (Base {base})'
    )

    # Collect sorted Base Pair Type for this base
    sorted_base_pair_type_by_base[base] = ranked_names

# Side-by-side summary: one column per base, rows ordered by rank
display_data(sorted_base_pair_type_by_base, 'Most Probable Base Pair Type By Base')

Unnamed: 0,Base Pair Type,Mean Probability,Standard Deviation
0,INTERIOR_TO_STRAND,0.993525,0.00434
1,NICK_3P,0.871364,0.088018
2,OVERHANG_ON_THIS_STRAND_3P,0.860775,0.104953
3,FOUR_ARM_JUNCTION,0.820307,0.193592
4,FIVE_ARM_JUNCTION,0.788111,0.139385
5,DANGLE_5P_3P,0.731844,0.1918
6,THREE_ARM_JUNCTION_5P,0.709433,0.155155
7,OVERHANG_ON_THIS_STRAND_5P,0.70599,0.160767
8,THREE_ARM_JUNCTION_3P,0.700485,0.147906
9,OVERHANG_ON_BOTH_STRAND_3P,0.697126,0.20918


Unnamed: 0,Base Pair Type,Mean Probability,Standard Deviation
0,INTERIOR_TO_STRAND,0.99895,0.000775
1,NICK_5P,0.910711,0.083089
2,OVERHANG_ON_THIS_STRAND_5P,0.902718,0.126134
3,NICK_3P,0.875141,0.06998
4,OVERHANG_ON_THIS_STRAND_3P,0.866694,0.139868
5,DANGLE_5P_3P,0.856036,0.177991
6,FIVE_ARM_JUNCTION,0.804918,0.212696
7,THREE_ARM_JUNCTION_5P,0.800022,0.17376
8,THREE_ARM_JUNCTION_3P,0.791717,0.175511
9,DANGLE_3P,0.784531,0.15007


Unnamed: 0,Base Pair Type,Mean Probability,Standard Deviation
0,INTERIOR_TO_STRAND,0.99896,0.000767
1,FIVE_ARM_JUNCTION,0.971143,0.029123
2,NICK_3P,0.967143,0.037507
3,OVERHANG_ON_THIS_STRAND_3P,0.965939,0.049643
4,NICK_5P,0.956878,0.048155
5,OVERHANG_ON_THIS_STRAND_5P,0.953074,0.085105
6,THREE_ARM_JUNCTION_5P,0.952893,0.044165
7,THREE_ARM_JUNCTION_3P,0.949132,0.047458
8,FOUR_ARM_JUNCTION,0.940986,0.149247
9,DANGLE_5P_3P,0.916555,0.092692


Unnamed: 0,Base Pair Type,Mean Probability,Standard Deviation
0,INTERIOR_TO_STRAND,0.993197,0.004471
1,OVERHANG_ON_THIS_STRAND_5P,0.84716,0.108958
2,NICK_5P,0.820082,0.115618
3,FOUR_ARM_JUNCTION,0.81708,0.238912
4,OVERHANG_ON_THIS_STRAND_3P,0.801026,0.123493
5,DANGLE_5P_3P,0.79958,0.147654
6,NICK_3P,0.785096,0.1035
7,FIVE_ARM_JUNCTION,0.777941,0.13684
8,THREE_ARM_JUNCTION_3P,0.739056,0.130613
9,THREE_ARM_JUNCTION_5P,0.737301,0.126223


Unnamed: 0,A,C,G,T
0,INTERIOR_TO_STRAND,INTERIOR_TO_STRAND,INTERIOR_TO_STRAND,INTERIOR_TO_STRAND
1,NICK_3P,NICK_5P,FIVE_ARM_JUNCTION,OVERHANG_ON_THIS_STRAND_5P
2,OVERHANG_ON_THIS_STRAND_3P,OVERHANG_ON_THIS_STRAND_5P,NICK_3P,NICK_5P
3,FOUR_ARM_JUNCTION,NICK_3P,OVERHANG_ON_THIS_STRAND_3P,FOUR_ARM_JUNCTION
4,FIVE_ARM_JUNCTION,OVERHANG_ON_THIS_STRAND_3P,NICK_5P,OVERHANG_ON_THIS_STRAND_3P
5,DANGLE_5P_3P,DANGLE_5P_3P,OVERHANG_ON_THIS_STRAND_5P,DANGLE_5P_3P
6,THREE_ARM_JUNCTION_5P,FIVE_ARM_JUNCTION,THREE_ARM_JUNCTION_5P,NICK_3P
7,OVERHANG_ON_THIS_STRAND_5P,THREE_ARM_JUNCTION_5P,THREE_ARM_JUNCTION_3P,FIVE_ARM_JUNCTION
8,THREE_ARM_JUNCTION_3P,THREE_ARM_JUNCTION_3P,FOUR_ARM_JUNCTION,THREE_ARM_JUNCTION_3P
9,OVERHANG_ON_BOTH_STRAND_3P,DANGLE_3P,DANGLE_5P_3P,THREE_ARM_JUNCTION_5P
