In [1]:
import pandas as pd

In [2]:
import copy

In [3]:
import dgutils.pandas as dgp

  from tqdm.autonotebook import tqdm


In [4]:
import sys
sys.path.insert(0, '../../rna_ss/')
from local_struct_utils import LocalStructureParser

In [5]:
df = pd.read_pickle('../2020_09_22/data/rand_s1_bb_0p1.pkl.gz')

In [6]:
row = df.iloc[4]
row

ensemble_diversity                                                 4.15
free_energy                                                        -3.9
len                                                                  45
mfe_frequency                                                  0.229379
one_idx               ([13, 14, 15, 16, 17, 19, 20, 24, 25, 32, 33, ...
seq                       ACUACUCACUAAGAUGACUUUUUGCCGCACUAGGAACGACGUUAU
bounding_boxes        [((13, 40), (5, 5), stem), ((19, 37), (2, 2), ...
bb_stem               [{'bb_x': 5, 'bb_y': 16, 'siz_x': 3, 'siz_y': ...
bb_iloop              [{'bb_x': 17, 'bb_y': 33, 'siz_x': 5, 'siz_y':...
bb_hloop              [{'bb_x': 7, 'bb_y': 14, 'siz_x': 8, 'siz_y': ...
Name: 4, dtype: object

In [7]:
bb_name_mapping = {
    'hairpin_loop': 'hloop',
    'hloop': 'hloop',
    'internal_loop': 'iloop',
    'iloop': 'iloop',
    'bulge': 'iloop',
    'stem': 'stem',
    # not local ss
    'pesudo_knot': 'pknot',
    'pseudo_knot': 'pknot',
}


def process_bb_old_to_new(old_bb):
    # old_bb: list of:
    # (top_left_x, top_left_y), (siz_x, siz_y), bb_type
    df = []
    for (top_left_x, top_left_y), (siz_x, siz_y), bb_type in old_bb:
        bb_x = top_left_x
        bb_y = top_left_y + siz_y - 1
        df.append({
            'bb_x': bb_x,
            'bb_y': bb_y,
            'siz_x': siz_x,
            'siz_y': siz_y,
            'bb_type': bb_name_mapping[bb_type]
        })
    return pd.DataFrame(df)

In [8]:
df_target = process_bb_old_to_new(row['bounding_boxes'])

In [9]:
df_target

Unnamed: 0,bb_x,bb_y,siz_x,siz_y,bb_type
0,13,44,5,5,stem
1,19,38,2,2,stem
2,24,33,2,2,stem
3,17,40,3,3,iloop
4,20,37,5,5,iloop
5,25,32,8,8,hloop


In [10]:
def add_bb_bottom_left(df):
    # add bottom left reference point
    # df: requires columns bb_x, bb_y, siz_x, siz_y
    
    def add_bl(bb_x, bb_y, siz_x, siz_y):
        x = bb_x + siz_x - 1
        y = bb_y - siz_y + 1
        return x, y
    
    df = dgp.add_columns(df, ['bl_x', 'bl_y'], ['bb_x', 'bb_y', 'siz_x', 'siz_y'], add_bl)
    return df

In [11]:
df_stem = pd.DataFrame(row['bb_stem'])
df_stem = add_bb_bottom_left(df_stem)

df_iloop = pd.DataFrame(row['bb_iloop'])
df_iloop = add_bb_bottom_left(df_iloop)

df_hloop = pd.DataFrame(row['bb_hloop'])
df_hloop = add_bb_bottom_left(df_hloop)

In [12]:
df_stem

Unnamed: 0,bb_x,bb_y,siz_x,siz_y,prob,bl_x,bl_y
0,5,16,3,3,"[0.1490665853886835, 0.24864900273776647, 0.29...",7,14
1,8,32,3,3,"[0.16853989897361105, 0.14421550251522583, 0.1...",10,30
2,10,19,3,3,[0.12216360047266443],12,17
3,13,44,5,5,"[0.7305797787502836, 0.7035426244401864, 0.812...",17,40
4,19,38,2,2,"[0.7279137845500362, 0.829618027530799, 0.7814...",20,37
5,21,35,2,2,"[0.18925865017357865, 0.25012681908297557]",22,34
6,24,33,2,2,"[0.9167302891209327, 0.9266829697603598, 0.764...",25,32


In [13]:
df_iloop

Unnamed: 0,bb_x,bb_y,siz_x,siz_y,prob,bl_x,bl_y
0,17,33,5,5,"[0.15545129202512073, 0.05504290146882441, 0.1...",21,29
1,17,33,8,8,[0.1144296437188239],24,26
2,17,34,5,5,[0.1452210171965283],21,30
3,17,34,8,8,[0.16347625528339768],24,27
4,17,35,8,8,"[0.24376617371504158, 0.11478901300030442]",24,28
5,17,36,8,8,"[0.26832040616423486, 0.06279155022198056]",24,29
6,17,38,3,3,"[0.589986535355823, 0.649057071655922, 0.68693...",19,36
7,17,38,8,8,"[0.08544047568930521, 0.11084255769946931]",24,31
8,17,39,3,3,"[0.5317535184193329, 0.7241452935434343, 0.597...",19,37
9,17,40,3,3,"[0.4094213981394772, 0.6459865894325693, 0.435...",19,38


In [14]:
df_hloop

Unnamed: 0,bb_x,bb_y,siz_x,siz_y,prob,bl_x,bl_y
0,7,14,8,8,"[0.12397423149413879, 0.23361334980278547, 0.1...",14,7
1,25,32,8,8,"[0.8107588025173745, 0.8814300326341995, 0.842...",32,25


In [15]:
def compatible_counts(df1, df2, col1, col2, out_name):
    # join df1 and df2 on col1/col2, count the number of compatible entries
    # this is equivalent to:
    # for each row of df1, find how many rows there are in df2 that's compatible
    
    if isinstance(col1, str):
        col1 = [col1]
    if isinstance(col2, str):
        col2 = [col2]
    
    assert out_name not in col1 + col2
    
    # first aggregate count on df2 using col2
    df2_ct = df2[col2].groupby(col2).size().reset_index(name=out_name)
    # hack col name, to avoid duplication
    df2_ct = df2_ct.rename(columns={a: b for a, b in zip(col2, col1)})
    # join to df1
    df = pd.merge(df1, df2_ct, left_on=col1, right_on=col1, how='outer')
    # replace missing entry with 0 (count)
    df = df.fillna(0)
    
    return df

In [16]:
# for each iloop, check:
# how many compatible outer stems (stem.bottom_left == iloop.top_right)
df_iloop_cleanup = compatible_counts(df_iloop, df_stem, col1=['bb_x', 'bb_y'], col2=['bl_x', 'bl_y'], out_name='num_compatible_stem_outer')
# how many compatible inner stems (stem.top_right == iloop.bottom_left)
df_iloop_cleanup = compatible_counts(df_iloop_cleanup, df_stem, col1=['bl_x', 'bl_y'], col2=['bb_x', 'bb_y'], out_name='num_compatible_stem_inner')
# drop those rows without compatible stems on both ends
df_iloop_cleanup = df_iloop_cleanup[(df_iloop_cleanup['num_compatible_stem_inner'] > 0) & (df_iloop_cleanup['num_compatible_stem_outer'] > 0)]

In [17]:
df_iloop_cleanup

Unnamed: 0,bb_x,bb_y,siz_x,siz_y,prob,bl_x,bl_y,num_compatible_stem_outer,num_compatible_stem_inner
10,17.0,40.0,3.0,3.0,"[0.4094213981394772, 0.6459865894325693, 0.435...",19.0,38.0,1.0,1.0
11,17.0,40.0,8.0,8.0,[0.11125106289936942],24.0,33.0,1.0,1.0
12,20.0,37.0,5.0,5.0,"[0.4656916699489385, 0.24332181886780138, 0.26...",24.0,33.0,1.0,1.0


In [18]:
# for each hloop, check:
# how many compatible outer stems (stem.bottom_left == iloop.top_right)
df_hloop_cleanup = compatible_counts(df_hloop, df_stem, col1=['bb_x', 'bb_y'], col2=['bl_x', 'bl_y'], out_name='num_compatible_stem_outer')
# drop those rows without compatible stem
df_hloop_cleanup = df_hloop_cleanup[df_hloop_cleanup['num_compatible_stem_outer'] > 0]
# drop those not symmetric & across diagonal
df_hloop_cleanup = df_hloop_cleanup[(df_hloop_cleanup['bb_x'] == df_hloop_cleanup['bl_y']) & (df_hloop_cleanup['bb_y'] == df_hloop_cleanup['bl_x']) & (df_hloop_cleanup['siz_x'] == df_hloop_cleanup['siz_y'])]

In [19]:
df_hloop_cleanup

Unnamed: 0,bb_x,bb_y,siz_x,siz_y,prob,bl_x,bl_y,num_compatible_stem_outer
0,7,14,8.0,8.0,"[0.12397423149413879, 0.23361334980278547, 0.1...",14.0,7.0,1
1,25,32,8.0,8.0,"[0.8107588025173745, 0.8814300326341995, 0.842...",32.0,25.0,1


In [20]:
df_stem

Unnamed: 0,bb_x,bb_y,siz_x,siz_y,prob,bl_x,bl_y
0,5,16,3,3,"[0.1490665853886835, 0.24864900273776647, 0.29...",7,14
1,8,32,3,3,"[0.16853989897361105, 0.14421550251522583, 0.1...",10,30
2,10,19,3,3,[0.12216360047266443],12,17
3,13,44,5,5,"[0.7305797787502836, 0.7035426244401864, 0.812...",17,40
4,19,38,2,2,"[0.7279137845500362, 0.829618027530799, 0.7814...",20,37
5,21,35,2,2,"[0.18925865017357865, 0.25012681908297557]",22,34
6,24,33,2,2,"[0.9167302891209327, 0.9266829697603598, 0.764...",25,32


In [21]:
class LocalStructureBb(object):
    
    def __init__(self, top_right_x, top_right_y, size_x, size_y, bb_id, bb_type):
        self.id = bb_id
        assert bb_type in ['stem', 'iloop', 'hloop']
        self.type = bb_type
        self.tr_x = top_right_x
        self.tr_y = top_right_y
        self.size_x = size_x
        self.size_y = size_y
        # also store bottom left
        self.bl_x = top_right_x + size_x - 1
        self.bl_y = top_right_y - size_y + 1

    def share_top_right_corner(self, another_bb):
        # check if self.top_right == another_bb.bottom_left
        if self.tr_x == another_bb.bl_x and self.tr_y == another_bb.bl_y:
            return True
        else:
            return False
        
    def share_bottom_left_corner(self, another_bb):
        # check if self.bottom_left == another_bb.top_right
        if self.bl_x == another_bb.tr_x and self.bl_y == another_bb.tr_y:
            return True
        else:
            return False
        
    def overlaps(self, another_bb):
        raise NotImplementedError
        
    def bp_conflict(self, another_bb):
        # only makes sense for stem bb comparison
        # where the base pair ranges are in conflict with each other
        # i.e. if another_bb cannot be included if self is included in global structure
        # x range
        x1_1, x1_2 = self.tr_x, self.bl_x
        x2_1, x2_2 = another_bb.tr_x, another_bb.bl_x
        x_range_conflict = (x1_1 <= x2_1 <= x1_2) or (x2_1 <= x1_1 <= x2_2)
        # y range
        y1_1, y1_2 = self.bl_y, self.tr_y
        y2_1, y2_2 = another_bb.bl_y, another_bb.tr_y
        y_range_conflict = (y1_1 <= y2_1 <= y1_2) or (y2_1 <= y1_1 <= y2_2)
        return x_range_conflict or y_range_conflict
    
    def __repr__(self):
        return f"{self.type} {self.id} top right ({self.tr_x}, {self.tr_y}), bottom left ({self.bl_x}, {self.bl_y})"

In [22]:
# bb objects
# use enumerate on df since we want to contiguous ids, not the original df index
stems = [LocalStructureBb(row['bb_x'], row['bb_y'], row['siz_x'], row['siz_y'], f'stem_{idx}', 'stem') for idx, (_, row) in enumerate(df_stem.iterrows())]
iloops = [LocalStructureBb(row['bb_x'], row['bb_y'], row['siz_x'], row['siz_y'], f'iloop_{idx}', 'iloop') for idx, (_, row) in enumerate(df_iloop_cleanup.iterrows())]
hloops = [LocalStructureBb(row['bb_x'], row['bb_y'], row['siz_x'], row['siz_y'], f'hloop_{idx}', 'hloop') for idx, (_, row) in enumerate(df_hloop_cleanup.iterrows())]


In [23]:
class OneStepChain(object):
    
    def __init__(self, bb, next_bb=None):
        self.bb = bb
        self.next_bb = next_bb
        
    def clear_next_bb(self):
        self.next_bb = None
    
    def add_next_bb(self, next_bb, validate=True):
        if validate:
            assert self.bb.share_top_right_corner(next_bb)
        if self.next_bb is None:
            self.next_bb = []
        self.next_bb.append(next_bb)
        
    def __repr__(self):
        if self.next_bb:
            tmp = [x.id for x in self.next_bb]
        else:
            tmp = 'N/A'
        return f"{self.bb}. Next: {tmp}"

In [24]:
class GlobalConstraint(object):
    
    def __init__(self, stems, iloops, hloops):
        raise NotImplementedError
    
    def conflict(self, bb1, bb2):
        # check bb1 and bb2 are in the known bb's
        # same type?
        # check overlap and bp conflict
        raise NotImplementedError

In [25]:
# find next compatible, start with iloop
iloop_os_chain = []
for iloop in iloops:
    iloop_os = OneStepChain(iloop)
    for stem in stems:
        if iloop.share_top_right_corner(stem):
            iloop_os.add_next_bb(stem)
    iloop_os_chain.append(iloop_os)

In [26]:
# find next compatible, start with stem
stem_os_chain = []
for stem in stems:
    stem_os = OneStepChain(stem)
    for iloop in iloops:
        if stem.share_top_right_corner(iloop):
            stem_os.add_next_bb(iloop)
    stem_os_chain.append(stem_os)

In [27]:
# find next compatible, start with hloop
hloop_os_chain = []
for hloop in hloops:
    hloop_os = OneStepChain(hloop)
    for stem in stems:
        if hloop.share_top_right_corner(stem):
            hloop_os.add_next_bb(stem)
    hloop_os_chain.append(hloop_os)

In [28]:
iloop_os_chain

[iloop iloop_0 top right (17.0, 40.0), bottom left (19.0, 38.0). Next: ['stem_3'],
 iloop iloop_1 top right (17.0, 40.0), bottom left (24.0, 33.0). Next: ['stem_3'],
 iloop iloop_2 top right (20.0, 37.0), bottom left (24.0, 33.0). Next: ['stem_4']]

In [29]:
stem_os_chain

[stem stem_0 top right (5, 16), bottom left (7, 14). Next: N/A,
 stem stem_1 top right (8, 32), bottom left (10, 30). Next: N/A,
 stem stem_2 top right (10, 19), bottom left (12, 17). Next: N/A,
 stem stem_3 top right (13, 44), bottom left (17, 40). Next: N/A,
 stem stem_4 top right (19, 38), bottom left (20, 37). Next: ['iloop_0'],
 stem stem_5 top right (21, 35), bottom left (22, 34). Next: N/A,
 stem stem_6 top right (24, 33), bottom left (25, 32). Next: ['iloop_1', 'iloop_2']]

In [30]:
hloop_os_chain

[hloop hloop_0 top right (7, 14), bottom left (14.0, 7.0). Next: ['stem_0'],
 hloop hloop_1 top right (25, 32), bottom left (32.0, 25.0). Next: ['stem_6']]

In [31]:
# for convenience
os_chain = {x.bb.id: x for x in iloop_os_chain + stem_os_chain + hloop_os_chain}

In [32]:
os_chain

{'iloop_0': iloop iloop_0 top right (17.0, 40.0), bottom left (19.0, 38.0). Next: ['stem_3'],
 'iloop_1': iloop iloop_1 top right (17.0, 40.0), bottom left (24.0, 33.0). Next: ['stem_3'],
 'iloop_2': iloop iloop_2 top right (20.0, 37.0), bottom left (24.0, 33.0). Next: ['stem_4'],
 'stem_0': stem stem_0 top right (5, 16), bottom left (7, 14). Next: N/A,
 'stem_1': stem stem_1 top right (8, 32), bottom left (10, 30). Next: N/A,
 'stem_2': stem stem_2 top right (10, 19), bottom left (12, 17). Next: N/A,
 'stem_3': stem stem_3 top right (13, 44), bottom left (17, 40). Next: N/A,
 'stem_4': stem stem_4 top right (19, 38), bottom left (20, 37). Next: ['iloop_0'],
 'stem_5': stem stem_5 top right (21, 35), bottom left (22, 34). Next: N/A,
 'stem_6': stem stem_6 top right (24, 33), bottom left (25, 32). Next: ['iloop_1', 'iloop_2'],
 'hloop_0': hloop hloop_0 top right (7, 14), bottom left (14.0, 7.0). Next: ['stem_0'],
 'hloop_1': hloop hloop_1 top right (25, 32), bottom left (32.0, 25.0). Ne

In [33]:
class FullChain(object):
    
    def __init__(self, start):
        # make sure start is either stem or hloop
        assert start.type in ['stem', 'hloop']
        self.chain = (start, )
        self.start = start
        self.completed = False
        self.id = None
    
    def add_bb(self, bb):
        last_bb = self.chain[-1]
        assert last_bb.share_top_right_corner(bb), f"{last_bb} {bb}"
        self.chain = self.chain + (bb,)
    
    def complete(self, validate=True):
        if validate:
            end = self.chain[-1]
            assert end.type == 'stem'
        self.end = end
        self.completed = True
    
    def merge_chain(self, another_chain):
        # merge another (completed) chain
        assert another_chain.completed
        # make sure they share the same bb
        assert self.chain[-1] == another_chain.chain[0]
        self.chain = self.chain + another_chain.chain[1:]
        self.end = another_chain.end
        self.completed == True
        
    def __repr__(self):
        status = "Completed" if self.completed else "Incomplete"
        return f"FullChain {self.id} {self.chain} {status}"
#         return f"FullChain {[x.id for x in self.chain]} {status}"

In [34]:
# for convenience, keep track of chains starting with each bb
# not used for now, will need this to improve efficiency
bb2chain = {bb: [] for bb in stems + iloops + hloops} 
# all chains (some might be subset of others)
full_chains = []


class ChainIdCounter(object):
    prefix = 'chain_'

    def __init__(self):
        self.ct = 0
    
    def get_id(self):
        ct = self.ct
        self.ct += 1
        return f"{self.prefix}{ct}"
    

def grow_chain(chain):        
    # look up last item in chain
    this_bb = chain.chain[-1]
    # check its next compatible elements
    next_bbs = os_chain[this_bb.id].next_bb
    # finish if empty
    # otherwise add next item & recursion
    if next_bbs is None:
        tmp = copy.copy(chain)
        tmp.complete()
        tmp.id = chain_id_counter.get_id()  # assign ID
        full_chains.append(tmp)
        return
    else:
#         print(next_bbs, '\n')
        # if end with stem, make a copy, add to full chain list
        if chain.chain[-1].type == 'stem':
            tmp = copy.copy(chain)
            tmp.complete()
            tmp.id = chain_id_counter.get_id()  # assign ID
            full_chains.append(tmp)
        
#         for i in range(len(os_chain[chain.chain[-1].id].next_bb)):
        for i in range(len(next_bbs)):
            # operate on copy before 'branching out' to avoid messing up with a global 'chain'
            tmp = copy.copy(chain)
            tmp.add_bb(next_bbs[i])
            grow_chain(tmp)


# global id counter
chain_id_counter = ChainIdCounter()
            
# start with stem or hloop
for x in stems + hloops:
#     print(x)
    fc = FullChain(x)
    grow_chain(fc)




In [35]:
# full_chains

In [36]:
len(full_chains)

16

In [37]:
import numpy as np

In [38]:
# from scipy.spatial.distance import pdist

In [39]:
# all pairwise compatibility of stems
# dm = pdist(pd.DataFrame({'stem': stems}).values, lambda x, y: x.bp_conflict(y))

distances = np.zeros((len(stems), len(stems)), dtype=object)
for i in range(len(stems)):
    for j in range(len(stems)):
        d = stems[i].bp_conflict(stems[j])
        distances[i, j] = d
        distances[j, i] = d

        
stem_ids = [x.id for x in stems]
df_stem_conflict = pd.DataFrame(distances, index=stem_ids, columns=stem_ids)
        

In [40]:
df_stem_conflict

Unnamed: 0,stem_0,stem_1,stem_2,stem_3,stem_4,stem_5,stem_6
stem_0,True,False,False,False,False,False,False
stem_1,False,True,True,False,False,False,True
stem_2,False,True,True,False,False,False,False
stem_3,False,False,False,True,False,False,False
stem_4,False,False,False,False,True,False,False
stem_5,False,False,False,False,False,True,False
stem_6,False,True,False,False,False,False,True


In [41]:
df_stem_conflict['stem_0']['stem_3']

False

In [42]:
# two chains A & B are compatible if all stems in A is compatible (no bp conflict) with all stems in B


def chain_compatible(c1, c2):
    stems_1 = [bb for bb in c1.chain if bb.type =='stem']
    stems_2 = [bb for bb in c2.chain if bb.type =='stem']
    for s1 in stems_1:
        for s2 in stems_2:
            if df_stem_conflict[s1.id][s2.id]:
                return False
    return True
            

In [43]:
chain_compatible(full_chains[0], full_chains[0])


False

In [44]:
# all pairwise compatibility of chains

distances = np.zeros((len(full_chains), len(full_chains)), dtype=object)
for i in range(len(full_chains)):
    for j in range(len(full_chains)):
        d = chain_compatible(full_chains[i], full_chains[j])
        distances[i, j] = d
        distances[j, i] = d

        
chain_ids = [x.id for x in full_chains]
df_chain_compatibility = pd.DataFrame(distances, index=chain_ids, columns=chain_ids)

In [45]:
df_chain_compatibility

Unnamed: 0,chain_0,chain_1,chain_2,chain_3,chain_4,chain_5,chain_6,chain_7,chain_8,chain_9,chain_10,chain_11,chain_12,chain_13,chain_14,chain_15
chain_0,False,True,True,True,True,True,True,True,True,True,True,False,True,True,True,True
chain_1,True,False,False,True,True,True,True,False,False,False,False,True,False,False,False,False
chain_2,True,False,False,True,True,True,True,True,True,True,True,True,True,True,True,True
chain_3,True,True,True,False,True,False,True,True,False,True,False,True,True,False,True,False
chain_4,True,True,True,True,False,False,True,True,True,False,False,True,True,True,False,False
chain_5,True,True,True,False,False,False,True,True,False,False,False,True,True,False,False,False
chain_6,True,True,True,True,True,True,False,True,True,True,True,True,True,True,True,True
chain_7,True,False,True,True,True,True,True,False,False,False,False,True,False,False,False,False
chain_8,True,False,True,False,True,False,True,False,False,False,False,True,False,False,False,False
chain_9,True,False,True,True,False,False,True,False,False,False,False,True,False,False,False,False


In [46]:
global_structs = []
compatible_chains = copy.copy(full_chains)


def grow_global_struct(struct, comp_chains):
    # global assembly can terminate any time (even for the empty one)
    global_structs.append(copy.copy(struct))
    
    # if no more chain in the compatible list, return
    if len(comp_chains) == 0:
        return

    # add one chain from the list of compatible ones
    for chain in comp_chains:
        struct_new = copy.copy(struct)  # important to make copy inside loop
        struct_new.append(copy.copy(chain))
        # update compatibility list
        chain_id_compatible = set(df_chain_compatibility[chain.id].index[df_chain_compatibility[chain.id]].to_list())
        
#         print(chain, chain_id_compatible)
        comp_chains = [x for x in comp_chains if x.id in chain_id_compatible]
#         print([x.id for x in struct_new], chain.id, chain_id_compatible, [x.id for x in comp_chains])
#         print(comp_chains)
        grow_global_struct(copy.copy(struct_new), copy.copy(comp_chains))


grow_global_struct([], compatible_chains)

In [47]:
# global_structs

In [48]:
def validate_global_struct(global_struct):
    # global_struct: list of chains
    # empty structure is always valid
    if len(global_struct) == 0:
        return True
    
    # find all stems, collect all base pairs
    all_stems = []
    for chain in global_struct:
        all_stems.extend([x for x in chain.chain if x.type == 'stem'])
    bps = []  # list of (i, j) tuple
    for s in all_stems:
        for i, j in zip(range(s.tr_x, s.bl_x+1), range(s.bl_y, s.tr_y+1)[::-1]):
            bps.append((i, j))
    bps = sorted(bps)
    
    # bps can not contain duplicated pairs
    # this should not happen
    assert len(set(bps)) == len(bps)
        
    # parse local structure implied by these base pairing
    # FIXME for now skip cases we cannot parse
    try:
        lsp = LocalStructureParser(bps)
        lss = process_bb_old_to_new(lsp.local_structure_bounding_boxes)
    except ValueError:
        return False
    # ignore pseudoknot
    lss = lss[lss['bb_type'].isin(['stem', 'iloop', 'hloop'])]
    
    # check if the bounding boxes are the same after conversion
    df_before = []
    for chain in global_struct:
        for struct in chain.chain:
            df_before.append({
                'bb_x': int(struct.tr_x),
                'bb_y': int(struct.tr_y),
                'siz_x': int(struct.size_x),
                'siz_y': int(struct.size_y),
                'bb_type': struct.type,
            })
    df_before = pd.DataFrame(df_before)
    
    # check the two dfs are equal (up to row/col swap)
    # col should be same order, no need to check
    # sort rows
    df_before = df_before.sort_values(by=df_before.columns.tolist())
    lss = lss.sort_values(by=lss.columns.tolist())
    return np.array_equal(df_before.values, lss.values)   # use np so we don't compare index
    
    

In [49]:
valid_global_structs = [x for x in global_structs if validate_global_struct(x)]

In [50]:
print(len(global_structs), len(valid_global_structs))

77 20


In [51]:
valid_global_structs[2]

[FullChain chain_0 (stem stem_0 top right (5, 16), bottom left (7, 14),) Completed,
 FullChain chain_1 (stem stem_1 top right (8, 32), bottom left (10, 30),) Completed,
 FullChain chain_3 (stem stem_3 top right (13, 44), bottom left (17, 40),) Completed]

In [52]:
# [[y.id for y in x] for x in valid_global_structs]
valid_global_structs

[[],
 [FullChain chain_0 (stem stem_0 top right (5, 16), bottom left (7, 14),) Completed,
  FullChain chain_1 (stem stem_1 top right (8, 32), bottom left (10, 30),) Completed],
 [FullChain chain_0 (stem stem_0 top right (5, 16), bottom left (7, 14),) Completed,
  FullChain chain_1 (stem stem_1 top right (8, 32), bottom left (10, 30),) Completed,
  FullChain chain_3 (stem stem_3 top right (13, 44), bottom left (17, 40),) Completed],
 [FullChain chain_0 (stem stem_0 top right (5, 16), bottom left (7, 14),) Completed,
  FullChain chain_1 (stem stem_1 top right (8, 32), bottom left (10, 30),) Completed,
  FullChain chain_4 (stem stem_4 top right (19, 38), bottom left (20, 37),) Completed],
 [FullChain chain_0 (stem stem_0 top right (5, 16), bottom left (7, 14),) Completed,
  FullChain chain_1 (stem stem_1 top right (8, 32), bottom left (10, 30),) Completed,
  FullChain chain_5 (stem stem_4 top right (19, 38), bottom left (20, 37), iloop iloop_0 top right (17.0, 40.0), bottom left (19.0, 38

In [53]:
df_target

Unnamed: 0,bb_x,bb_y,siz_x,siz_y,bb_type
0,13,44,5,5,stem
1,19,38,2,2,stem
2,24,33,2,2,stem
3,17,40,3,3,iloop
4,20,37,5,5,iloop
5,25,32,8,8,hloop


In [54]:
# lookup their probabilities

In [55]:
# validate global structure:
# find chain where inner-most bounding box is stem, make sure some base pairing exit within i-j 

In [56]:
# each 'chain' can start with stem or hloop (although there will be further implication if starting with stem),
# and can only end with stem
# to combine multiple chains in the same global structure, global constraints need to be satisfied (no overlapping OS, no bp conflict between stems)



In [57]:
# DP on growing the 'chain'
# e.g. if we already know the tree 'growing' from stem_a we don't need to recompute it

In [58]:
# incompatible stems (local constraints)



In [59]:
# 'implied' loops:
# once we put together a 'global structure', any implicit loop (from choice of stems)
# should also be included
# this can be done by naively:
# apply stems -> binary array -> run script to find local bb -> check it's the same

In [60]:
# assembly of 'stretches':
# can start with stem or hloop (starting with stem implies it's connected to non-local structure <- to be verified later?)

In [61]:
# clean up

# add bottom left corner (1-index) for easy comparison

# any bb out of range



# rank by: number of proposals, probabilities