In [None]:
import numpy as np
import pandas as pd

In [None]:
from collections import namedtuple

In [None]:
from utils.util_global_struct import process_bb_old_to_new
from utils.rna_ss_utils import arr2db, one_idx2arr, compute_fe

In [None]:
from utils.misc import add_column

In [None]:
import plotly.express as px

In [None]:
def stem_bbs2db(bbs, seq_len):  # TODO move to utils
    """Turn a collection of stem bounding boxes into a single structure string.

    Each box contributes ``siz_x`` index pairs, starting at ``(bb_x, bb_y)``
    and stepping x up / y down by one per pair. The pairs of all boxes are
    handed to ``one_idx2arr`` and the resulting array to ``arr2db``.

    Args:
        bbs: iterable of objects exposing ``bb_x``, ``bb_y`` and ``siz_x``.
        seq_len: sequence length, forwarded unchanged to ``one_idx2arr``.

    Returns:
        The string produced by ``arr2db`` (presumably dot-bracket notation,
        going by the helper's name -- TODO confirm).
    """
    # TODO validate there's no conflict!
    # walk every box along its diagonal: x increases while y decreases
    index_pairs = [(bb.bb_x + step, bb.bb_y - step)
                   for bb in bbs
                   for step in range(bb.siz_x)]
    # transpose [(x, y), ...] into [(x, ...), (y, ...)]
    transposed = list(zip(*index_pairs))

    # only the array half of the helper's result is needed here
    _, contact_arr = one_idx2arr(transposed, seq_len)

    db_str, is_ambuguious = arr2db(contact_arr)

    # TODO check is_ambuguious
    if is_ambuguious:
        print(f'ambuguious db_str: {db_str}')

    return db_str

In [None]:
# Lightweight record for one bounding box: its (bb_x, bb_y) position and its
# extent along each axis (siz_x, siz_y). Being a tuple, it compares by value.
BoundingBox = namedtuple("BoundingBox", "bb_x bb_y siz_x siz_y")

In [None]:
# Load the example table from a gzip-compressed pickle (path is relative to the
# working directory). Later cells read the columns `seq`, `bounding_boxes`,
# `pred_stem_bb` and `valid_combos` from each row.
# NOTE(review): unpickling can execute arbitrary code -- only load this file
# from a trusted source.
df = pd.read_pickle('data/data_len40_1000_s1_stem_bb_le10_combos.pkl.gz')

In [None]:
# df

In [None]:
def get_bps_fe(row):
    """Build the structure string and ``fe`` value for every non-empty stem combo of one example.

    Args:
        row: one dataset row. Reads ``row.seq``, ``row.pred_stem_bb``
            (anything ``pd.DataFrame`` accepts, with columns ``bb_x``,
            ``bb_y``, ``siz_x``, ``siz_y``) and ``row.valid_combos`` (records
            carrying at least ``bb_inc`` and ``total_bps``).

    Returns:
        DataFrame with columns ``bb_inc``, ``db_str``, ``total_bps`` and
        ``fe``, one row per combo whose ``total_bps`` is positive. ``db_str``
        comes from ``stem_bbs2db`` and ``fe`` from ``compute_fe(seq, db_str)``.

    Raises:
        AssertionError: if the index of the predicted-stem frame is not
            exactly ``0..n-1``.
    """
    seq = row.seq

    df_stem = pd.DataFrame(row.pred_stem_bb)
    # combos refer to boxes by df_stem index label, so the labels must be
    # exactly 0..n-1 (this form of the check is also safe on an empty frame)
    assert list(df_stem.index) == list(range(len(df_stem)))

    # Iterate column-wise instead of with iterrows(): iterrows() packs each row
    # into one Series, which upcasts the integer coordinates to float as soon
    # as the frame holds any float column, and stem_bbs2db feeds siz_x to range().
    bbs = {
        idx: BoundingBox(bb_x=bb_x, bb_y=bb_y, siz_x=siz_x, siz_y=siz_y)
        for idx, bb_x, bb_y, siz_x, siz_y in zip(
            df_stem.index, df_stem['bb_x'], df_stem['bb_y'],
            df_stem['siz_x'], df_stem['siz_y'])
    }

    df_valid_combos = pd.DataFrame(row.valid_combos)
    # get rid of no structure
    df_valid_combos = df_valid_combos[df_valid_combos['total_bps'] > 0]

    # bb_inc lists the keys (into bbs) of the boxes that make up a combo
    df_valid_combos = add_column(df_valid_combos, 'db_str', ['bb_inc'],
                                 lambda bb_idx: stem_bbs2db([bbs[i] for i in bb_idx], len(seq)))
    df_valid_combos = add_column(df_valid_combos, 'fe', ['db_str'],
                                 lambda db_str: compute_fe(seq, db_str))

    return df_valid_combos[['bb_inc', 'db_str', 'total_bps', 'fe']]  # TODO add in seq len?

In [None]:
def get_combo_fe_and_rank(row):
    """Rank the ground-truth stem combo among all valid combos of one example.

    The combos from ``get_bps_fe(row)`` are sorted by ``total_bps``
    (descending) and re-indexed from 0. The ground-truth stems (rows of
    ``process_bb_old_to_new(row['bounding_boxes'])`` whose ``bb_type`` is
    ``'stem'``) are mapped to their indices in the predicted-stem table, and
    the rank is the 1-based position of the combo containing exactly those
    indices. The comparison ignores the order in which indices are listed.

    Args:
        row: one dataset row. Reads ``row['bounding_boxes']`` and
            ``row.pred_stem_bb`` here, plus whatever ``get_bps_fe`` reads.

    Returns:
        ``(df_fe, rank)``: the sorted combo DataFrame and the 1-based rank of
        the ground-truth combo within it.

    Raises:
        AssertionError: if the index of the predicted-stem frame is not
            exactly ``0..n-1``.
        ValueError: if a ground-truth stem is not among the predicted stem
            boxes, or if no combo consists of exactly the ground-truth stems.
    """
    # sort by num bps, descending, then reset the index so index == rank - 1
    df_fe = (get_bps_fe(row)
             .sort_values(by='total_bps', ascending=False)
             .reset_index(drop=True))

    # ground-truth stems
    df_target = process_bb_old_to_new(row['bounding_boxes'])
    df_target = df_target[df_target['bb_type'] == 'stem']

    df_stem = pd.DataFrame(row.pred_stem_bb)
    # combos refer to boxes by df_stem index label, so the labels must be
    # exactly 0..n-1 (this form of the check is also safe on an empty frame)
    assert list(df_stem.index) == list(range(len(df_stem)))

    bb_cols = ['bb_x', 'bb_y', 'siz_x', 'siz_y']

    # predicted box -> index of its first occurrence (setdefault keeps the first)
    idx_by_bb = {}
    for idx, bb_x, bb_y, siz_x, siz_y in zip(df_stem.index,
                                             *(df_stem[c] for c in bb_cols)):
        idx_by_bb.setdefault(
            BoundingBox(bb_x=bb_x, bb_y=bb_y, siz_x=siz_x, siz_y=siz_y), idx)

    # indices (into the predicted boxes) of the ground-truth stems
    bb_idx_tgt = []
    for bb_x, bb_y, siz_x, siz_y in zip(*(df_target[c] for c in bb_cols)):
        tgt_bb = BoundingBox(bb_x=bb_x, bb_y=bb_y, siz_x=siz_x, siz_y=siz_y)
        if tgt_bb not in idx_by_bb:
            raise ValueError(
                f'ground-truth stem {tgt_bb} is not among the predicted stem bounding boxes')
        bb_idx_tgt.append(idx_by_bb[tgt_bb])

    # A combo is a set of boxes: compare as sets so the match does not depend
    # on the order in which bb_inc or the ground-truth stems happen to be listed.
    tgt_set = set(bb_idx_tgt)
    rank = next((pos + 1
                 for pos, bb_inc in enumerate(df_fe['bb_inc'])
                 if set(bb_inc) == tgt_set),
                None)
    if rank is None:
        raise ValueError(
            f'no valid combo matches the ground-truth stem indices {sorted(tgt_set)}')

    return df_fe, rank

In [None]:
# Inspect one example: `fe` against number of base pairs for every valid combo.
# Pick the row by position and report it by its index label. Feeding a label
# from df.index into .iloc only works while the index is a default RangeIndex.
row = df.iloc[256]
example_id = row.name

df_fe, rank = get_combo_fe_and_rank(row)

fig = px.scatter(df_fe, 
           x='total_bps', y='fe')
# reuse the row fetched above instead of looking it up a second time
num_bbs = len(row['pred_stem_bb']['bb_x'])
num_combo_all = len(df_fe)
# num_combo_neg_fe = len(df_fe[df_fe['fe'] < 0])
min_fe = df_fe['fe'].min()
# fe values (ascending) of the combos tied for the most base pairs;
# df_fe is sorted by total_bps descending, so row 0 carries the maximum
max_bp_fe = sorted(df_fe[df_fe['total_bps'] == df_fe.iloc[0]['total_bps']]['fe'].tolist())
# lowest fe among the first ten rows
top_10_fe = df_fe[:10]['fe'].min()
fig.update_layout(title=f"idx {example_id}, #bbs {num_bbs}, #combos {num_combo_all}. Min FE {min_fe}, rank {rank}. Max bp FE {max_bp_fe}. Best FE in top 10: {top_10_fe}")

In [None]:
# For every example, record the lowest fe over all combos next to the lowest fe
# among the first ten rows of the bps-sorted combo table.
data_fe_vs_top10 = []
for example_id, row in df.iterrows():
    df_fe, rank = get_combo_fe_and_rank(row)
    # lowest fe over every combo
    min_fe = df_fe['fe'].min()
    # fe values (ascending) of the combos tied for the most base pairs;
    # df_fe is sorted by total_bps descending, so row 0 carries the maximum
    max_bp_fe = sorted(
        df_fe.loc[df_fe['total_bps'] == df_fe['total_bps'].iloc[0], 'fe'])
    # lowest fe among the first ten rows
    top_10_fe = df_fe['fe'].iloc[:10].min()
    data_fe_vs_top10.append(dict(
        example_id=example_id,
        min_fe=min_fe,
        max_bp_fe=max_bp_fe,
        top_10_fe=top_10_fe,
    ))
# one row per example
data_fe_vs_top10 = pd.DataFrame(data_fe_vs_top10)

In [None]:
# One point per example: lowest fe overall (x) against lowest fe among the
# first ten rows of the bps-sorted combo table (y).
px.scatter(
    data_frame=data_fe_vs_top10,
    x='min_fe',
    y='top_10_fe',
)

In [None]:
# example_id is an index label (it comes from df.index / df.iterrows()), so use
# label-based .loc; .iloc would treat it as a position and only agrees with the
# label on a default RangeIndex.
df.loc[example_id].seq

In [None]:
# Display the combo table left over from the most recent
# get_combo_fe_and_rank call.
df_fe