In [None]:
import pandas as pd
from utils.rna_ss_utils import one_idx2arr, sort_pairs, LocalStructureParser, make_target_pixel_bb
from utils.inference_s1 import Predictor, Evaluator
from utils.util_global_struct import process_bb_old_to_new

In [None]:
import numpy as np
from utils.inference_s1 import DataEncoder

In [None]:
import dgutils.pandas as dgp

In [None]:
def filter_by_n_proposal(df_bb, threshold):
    """Keep bounding boxes whose area-normalised proposal count exceeds ``threshold``.

    For every row the number of entries in ``prob_other_sm`` and in
    ``prob_other_sl`` is divided by the box area (``siz_x * siz_y``).  A row is
    kept when either normalised count is strictly greater than ``threshold``.

    Args:
        df_bb: DataFrame with one row per bounding box.  Must contain ``siz_x``
            and ``siz_y``.  ``prob_other_sm`` / ``prob_other_sl`` hold one
            list-like per row and are optional: when only the softmax or only
            the scalar predictions are present, the missing column is treated
            as "no proposals" for every row.
        threshold: exclusive lower bound on the normalised proposal count.

    Returns:
        ``df_bb`` itself when it has no rows.  Otherwise a filtered copy that
        additionally carries ``n_proposal_norm_sm`` and ``n_proposal_norm_sl``
        (plus any ``prob_other_*`` column that had to be filled in).  The input
        frame is never modified.

    Raises:
        ZeroDivisionError: if a row has ``siz_x * siz_y == 0``.
    """
    if len(df_bb) == 0:
        return df_bb

    df_bb = df_bb.copy()

    # Handle cases where only the softmax or only the scalar output was
    # predicted.  The filled-in column must carry the *same* name that is read
    # below (the previous version created 'prob_sm' / 'prob_sl', which left
    # 'prob_other_sm' / 'prob_other_sl' missing and raised a KeyError).
    for col in ('prob_other_sm', 'prob_other_sl'):
        if col not in df_bb.columns:
            df_bb[col] = pd.Series([[] for _ in range(len(df_bb))],
                                   index=df_bb.index, dtype=object)

    # Computed row by row in plain Python so a zero-area box still raises.
    areas = [float(x * y) for x, y in zip(df_bb['siz_x'], df_bb['siz_y'])]
    df_bb['n_proposal_norm_sm'] = [len(p) / a for p, a in zip(df_bb['prob_other_sm'], areas)]
    df_bb['n_proposal_norm_sl'] = [len(p) / a for p, a in zip(df_bb['prob_other_sl'], areas)]

    keep = (df_bb['n_proposal_norm_sm'] > threshold) | (df_bb['n_proposal_norm_sl'] > threshold)
    return df_bb[keep]


In [None]:
def pred_threshold_on_n_proposal(seq, predictor, threshold):
    """Predict bounding boxes for ``seq`` and keep those with enough proposals.

    ``predictor.predict_bb`` is called with ``threshold=0``, ``topk=1`` and
    ``perc_cutoff=0``; each of its three outputs is turned into a DataFrame and
    passed through ``filter_by_n_proposal``.

    Args:
        seq: sequence handed to ``predictor.predict_bb``.
        predictor: object exposing ``predict_bb``.
        threshold: n_proposal threshold for stems and internal loops; hairpin
            loops are filtered with ``threshold / 2``.

    Returns:
        Tuple ``(stems, iloops, hloops)`` of filtered DataFrames.
    """
    raw_stems, raw_iloops, raw_hloops = predictor.predict_bb(seq, threshold=0, topk=1, perc_cutoff=0)

    df_stems = pd.DataFrame(raw_stems)
    df_iloops = pd.DataFrame(raw_iloops)
    df_hloops = pd.DataFrame(raw_hloops)

    return (
        filter_by_n_proposal(df_stems, threshold),
        filter_by_n_proposal(df_iloops, threshold),
        # /2 threshold due to /2 upper bound
        filter_by_n_proposal(df_hloops, threshold / 2),
    )

In [None]:
_BB_METRIC_COLS = ['bb_x', 'bb_y', 'siz_x', 'siz_y']


def _bb_columns_only(pred_bb):
    """Return a DataFrame with just the four bounding-box columns of ``pred_bb``."""
    frame = pd.DataFrame(pred_bb)
    if len(frame) == 0:
        # pd.DataFrame([]) has no columns at all, so a plain column selection
        # would raise KeyError; hand back an empty frame with the right schema.
        return frame.reindex(columns=_BB_METRIC_COLS)
    return frame[_BB_METRIC_COLS]


def _sensitivity(metric):
    """Fraction of target boxes recovered exactly, or NaN when there are no targets."""
    n_total = metric['n_target_total']
    if n_total == 0:
        return float('nan')
    return metric['n_target_identical'] / float(n_total)


def compute_sensitivities(df_target, pred_bb_stem, pred_bb_iloop, pred_bb_hloop, bb_evaluator=None):
    """Compute per-type bounding-box sensitivity for one example.

    For each of the types ``stem``, ``iloop`` and ``hloop`` the target boxes of
    that type (rows of ``df_target`` whose ``bb_type`` matches) are compared
    with the predicted boxes through ``calculate_bb_metrics``; sensitivity is
    ``n_target_identical / n_target_total``.

    Args:
        df_target: DataFrame with columns ``bb_type``, ``bb_x``, ``bb_y``,
            ``siz_x`` and ``siz_y``.
        pred_bb_stem: predicted stem boxes (DataFrame, or anything accepted by
            ``pd.DataFrame``).  May be empty.
        pred_bb_iloop: predicted internal-loop boxes, same format.
        pred_bb_hloop: predicted hairpin-loop boxes, same format.
        bb_evaluator: object exposing ``calculate_bb_metrics(df_target=..., df_pred=...)``
            returning a mapping with ``n_target_total`` and ``n_target_identical``.
            Defaults to the notebook-level ``evaluator``.

    Returns:
        Tuple ``(sensitivity_stem, sensitivity_iloop, sensitivity_hloop)``; an
        entry is NaN when ``df_target`` holds no box of that type.
    """
    if bb_evaluator is None:
        # Fall back to the module-level instance created further down the notebook.
        bb_evaluator = evaluator

    sensitivities = []
    for bb_type, pred_bb in (('stem', pred_bb_stem),
                             ('iloop', pred_bb_iloop),
                             ('hloop', pred_bb_hloop)):
        metric = bb_evaluator.calculate_bb_metrics(
            df_target=df_target[df_target['bb_type'] == bb_type][_BB_METRIC_COLS],
            df_pred=_bb_columns_only(pred_bb))
        sensitivities.append(_sensitivity(metric))

    return tuple(sensitivities)

In [None]:
# Evaluation set: a gzip-compressed pickled DataFrame, path relative to this notebook.
# The evaluation loop below reads the 'seq', 'bounding_boxes' and 'mfe_freq' columns.
# NOTE(review): unpickling can execute arbitrary code -- only load files from a trusted source.
DATASET_PATH = '../2021_03_16/data/human_transcriptome_segment_high_mfe_freq_testing_len64_100.pkl.gz'
df = pd.read_pickle(DATASET_PATH)

In [None]:
# Checkpoint used for all predictions in this notebook.
model_path = '../2021_03_23/s1_training/result/run_7/model_ckpt_ep_17.pth'  # best model

# NOTE(review): the architecture arguments presumably have to match the ones
# the checkpoint was trained with -- confirm against the run_7 training config.
predictor = Predictor(
    model_ckpt=model_path,
    num_filters=[32, 32, 64, 64, 64, 128, 128],
    filter_width=[9] * 7,
    dropout=0.0,
)

In [None]:
# Module-level evaluator looked up by compute_sensitivities, which only calls
# its calculate_bb_metrics method on already-computed predictions.
# NOTE(review): predictor=None presumably suffices because no prediction is run
# through the evaluator here -- confirm against the Evaluator implementation.
evaluator = Evaluator(predictor=None)

In [None]:
# Columns that identify a bounding box, and the box types evaluated below.
BB_COLS = ['bb_x', 'bb_y', 'siz_x', 'siz_y']
BB_TYPES = ('stem', 'iloop', 'hloop')

# Thresholds of the two proposal-selection methods being compared.
P_ON_THRESHOLD = 0.1
N_PROPOSAL_THRESHOLD = 0.5


def _as_bb_frame(pred_bb):
    """Wrap predicted boxes in a DataFrame that always has the ``BB_COLS`` columns.

    ``pd.DataFrame([])`` has no columns, which would make the column selections
    and ``drop_duplicates(subset=BB_COLS)`` below raise ``KeyError`` whenever a
    method proposes no box of some type.
    """
    frame = pd.DataFrame(pred_bb)
    if len(frame) == 0:
        missing = [c for c in BB_COLS if c not in frame.columns]
        frame = frame.reindex(columns=list(frame.columns) + missing)
    return frame


def _frames_by_type(stems, iloops, hloops):
    """Map each bounding-box type name to its normalised prediction frame."""
    return {
        'stem': _as_bb_frame(stems),
        'iloop': _as_bb_frame(iloops),
        'hloop': _as_bb_frame(hloops),
    }


# metrics for each example
metric_rows = []

# number of bb by the two methods
n_bb_rows = []

for _, row in df.iterrows():
    seq = row['seq']
    df_target = process_bb_old_to_new(row['bounding_boxes'])

    # method 1: threshold on p_on
    pred_p_on = _frames_by_type(
        *predictor.predict_bb(seq=seq, threshold=P_ON_THRESHOLD, topk=1, perc_cutoff=0))

    # method 2: threshold on n_proposal
    pred_n_prop = _frames_by_type(
        *pred_threshold_on_n_proposal(seq, predictor, threshold=N_PROPOSAL_THRESHOLD))

    # combined: union of both methods, each distinct box counted once
    pred_union = {
        bb_type: pd.concat([pred_p_on[bb_type], pred_n_prop[bb_type]]).drop_duplicates(subset=BB_COLS)
        for bb_type in BB_TYPES
    }

    # intersection (just for plotting)
    pred_inter = {
        bb_type: pd.merge(pred_p_on[bb_type][BB_COLS], pred_n_prop[bb_type][BB_COLS], how='inner')
        for bb_type in BB_TYPES
    }

    # Sensitivity per method; suffixes: 1 = p_on, 2 = n_proposal, c = combined.
    sens = {}
    for suffix, preds in (('1', pred_p_on), ('2', pred_n_prop), ('c', pred_union)):
        values = compute_sensitivities(df_target, preds['stem'], preds['iloop'], preds['hloop'])
        sens[suffix] = dict(zip(BB_TYPES, values))

    metric_row = {
        '{}_{}'.format(bb_type, suffix): sens[suffix][bb_type]
        for bb_type in BB_TYPES
        for suffix in ('1', '2', 'c')
    }
    metric_row['mfe_freq'] = row['mfe_freq']
    metric_rows.append(metric_row)

    # Box counts per method; the extra suffix i = intersection.
    n_bb_rows.append({
        'n_{}_{}'.format(bb_type, suffix): len(frames[bb_type])
        for bb_type in BB_TYPES
        for suffix, frames in (('1', pred_p_on), ('2', pred_n_prop),
                               ('i', pred_inter), ('c', pred_union))
    })

df_metric = pd.DataFrame(metric_rows)
df_n_bbs = pd.DataFrame(n_bb_rows)

In [None]:
# Per-example sensitivities for each bounding-box type and method
# (_1: p_on threshold, _2: n_proposal threshold, _c: union of both), plus mfe_freq.
# An entry is NaN when the example has no target box of that type.
df_metric

In [None]:
import plotly.express as px

In [None]:
# Distribution of per-example sensitivities for method 1 (boxes selected by the
# p_on threshold), one bar group per bounding-box type.
fig = px.histogram(
    df_metric.loc[:, ['stem_1', 'iloop_1', 'hloop_1']],
    barmode='group',
)
fig.update_layout(
    title='threshold by p_on>=0.1',
    height=400,
    xaxis_range=[-0.1, 1.1],
    yaxis_range=[0, 100],
)

In [None]:
# Distribution of per-example sensitivities for method 2 (boxes selected by the
# normalised proposal count), one bar group per bounding-box type.
# The title states the filter that is actually applied: filter_by_n_proposal
# keeps rows strictly above the threshold, and pred_threshold_on_n_proposal
# halves the threshold for hairpin loops.
fig = px.histogram(df_metric[['stem_2', 'iloop_2', 'hloop_2']], barmode='group')
fig.update_layout(yaxis_range=[0, 100],
                  xaxis_range=[-0.1, 1.1],
                  height=400,
                  title='threshold by n_proposal>0.5 (hloop: >0.25)')

In [None]:
# Distribution of per-example sensitivities when the boxes proposed by both
# methods are pooled (columns *_c), one bar group per bounding-box type.
fig = px.histogram(
    df_metric.loc[:, ['stem_c', 'iloop_c', 'hloop_c']],
    barmode='group',
)
fig.update_layout(
    title='union of two methods',
    height=400,
    xaxis_range=[-0.1, 1.1],
    yaxis_range=[0, 100],
)

In [None]:
# Number of stem boxes proposed per example: method 1 (x) against method 2 (y),
# both on log axes; hovering shows the size of the union (n_stem_c).
fig = px.scatter(
    df_n_bbs,
    x="n_stem_1",
    y="n_stem_2",
    hover_name="n_stem_c",
    log_x=True,
    log_y=True,
)
fig.update_layout(
    # Reference diagonal drawn in paper coordinates (corner to corner of the
    # plotting area); it coincides with y=x only when both axes span the same range.
    shapes=[dict(type='line', yref='paper', xref='paper', y0=0, y1=1, x0=0, x1=1)],
    title='Number of bb proposal (stem) by two methods',
    height=500,
    width=500,
)
fig.show()

In [None]:
# Number of internal-loop boxes proposed per example: method 1 (x) against
# method 2 (y), both on log axes; hovering shows the size of the union (n_iloop_c).
fig = px.scatter(
    df_n_bbs,
    x="n_iloop_1",
    y="n_iloop_2",
    hover_name="n_iloop_c",
    log_x=True,
    log_y=True,
)
fig.update_layout(
    # Reference diagonal drawn in paper coordinates (corner to corner of the
    # plotting area); it coincides with y=x only when both axes span the same range.
    shapes=[dict(type='line', yref='paper', xref='paper', y0=0, y1=1, x0=0, x1=1)],
    title='Number of bb proposal (iloop) by two methods',
    height=500,
    width=500,
)
fig.show()

In [None]:
# Number of hairpin-loop boxes proposed per example: method 1 (x) against
# method 2 (y), both on log axes; hovering shows the size of the union (n_hloop_c).
fig = px.scatter(
    df_n_bbs,
    x="n_hloop_1",
    y="n_hloop_2",
    hover_name="n_hloop_c",
    log_x=True,
    log_y=True,
)
fig.update_layout(
    # Reference diagonal drawn in paper coordinates (corner to corner of the
    # plotting area); it coincides with y=x only when both axes span the same range.
    shapes=[dict(type='line', yref='paper', xref='paper', y0=0, y1=1, x0=0, x1=1)],
    title='Number of bb proposal (hloop) by two methods',
    height=500,
    width=500,
)
fig.show()

In [None]:
# Combined-method stem sensitivity (x) against the example's mfe_freq (y).
fig = px.scatter(
    df_metric,
    x="stem_c",
    y="mfe_freq",
)
fig.update_layout(
    height=400,
    title='Stem sensitivity v.s. MFE frequency',
)
fig.show()

In [None]:
# Combined-method internal-loop sensitivity (x) against the example's mfe_freq (y).
fig = px.scatter(
    df_metric,
    x="iloop_c",
    y="mfe_freq",
)
fig.update_layout(
    height=400,
    title='iloop sensitivity v.s. MFE frequency',
)
fig.show()

In [None]:
# Combined-method hairpin-loop sensitivity (x) against the example's mfe_freq (y).
fig = px.scatter(
    df_metric,
    x="hloop_c",
    y="mfe_freq",
)
fig.update_layout(
    height=400,
    title='hloop sensitivity v.s. MFE frequency',
)
fig.show()