In [None]:
import pandas as pd
from model_utils.util_global_struct import process_bb_old_to_new
import model_utils.utils_s2 as us2 # TODO merge s2 util
from model_utils.utils_nn_s2 import predict_wrapper, stem2db_str

In [None]:
from model_utils.utils_model import Evaluator

In [None]:
import numpy as np
import dgutils.pandas as dgp

In [None]:
import plotly.express as px

In [None]:
def add_s2_pred(bb_stem, bb_iloop, bb_hloop, seq):
    """Run the S2 predictor on one sequence's bounding-box proposals.

    ``bb_stem``, ``bb_iloop`` and ``bb_hloop`` are each wrapped in a
    ``pandas.DataFrame`` and passed, together with ``seq``, to
    ``predict_wrapper`` using the notebook-level ``predictor_s2``.

    Returns:
        Whatever ``predict_wrapper`` returns for this sequence.
    """
    stem_df, iloop_df, hloop_df = (
        pd.DataFrame(boxes) for boxes in (bb_stem, bb_iloop, bb_hloop)
    )
    # Inference settings are fixed for the whole notebook.
    return predict_wrapper(
        stem_df,
        iloop_df,
        hloop_df,
        discard_ns_stem=True,
        min_hloop_size=2,
        seq=seq,
        m_factor=1,
        predictor=predictor_s2,
    )

In [None]:
def stem2arr(df_stem, seq_len):
    """Rasterise stem bounding boxes into a symmetric base-pair matrix.

    Every row of ``df_stem`` supplies ``bb_x``, ``bb_y``, ``siz_x`` and
    ``siz_y`` (each cast to ``int``). A stem of size ``k`` marks the cells
    ``(bb_x + i, bb_y - i)`` and their mirror images for ``i`` in
    ``range(k)``.

    Returns:
        A ``(seq_len, seq_len)`` float array holding 1 at paired positions
        and 0 elsewhere.

    Raises:
        AssertionError: if a row's ``siz_x`` and ``siz_y`` differ.
    """
    pair_matrix = np.zeros((seq_len, seq_len))

    for _, stem in df_stem.iterrows():
        first_row = int(stem['bb_x'])
        first_col = int(stem['bb_y'])
        length = int(stem['siz_x'])
        length_y = int(stem['siz_y'])
        assert length == length_y
        # Walk the anti-diagonal: rows increase while columns decrease.
        steps = np.arange(length)
        rows = first_row + steps
        cols = first_col - steps
        pair_matrix[rows, cols] = 1
        pair_matrix[cols, rows] = 1
    return pair_matrix

In [None]:
def calculate_metric(df_stem_pred, df_stem_target, seq_len):
    """Score predicted stems against target stems at the base-pair level.

    Both stem tables are converted to binary ``seq_len x seq_len`` arrays
    with ``stem2arr``; the upper triangles (diagonal included) are flattened
    and compared element-wise via ``perf_measure``.

    Args:
        df_stem_pred: predicted stem bounding boxes (input to ``stem2arr``).
        df_stem_target: target stem bounding boxes (input to ``stem2arr``).
        seq_len: side length of the base-pair matrix.

    Returns:
        Tuple ``(sensitivity, ppv, f1_score)``.
        * ``sensitivity`` (recall) is NaN when the target has no pairs.
        * ``ppv`` (precision) is NaN when the prediction has no pairs.
        * ``f1_score`` is NaN when both are empty, and 0 when there is no
          true positive but at least one side is non-empty.
    """
    # convert bounding boxes to binary array
    arr_pred = stem2arr(df_stem_pred, seq_len)
    arr_target = stem2arr(df_stem_target, seq_len)

    # pick upper triangular, and flatten
    idx = np.triu_indices(seq_len)
    vals_pred = arr_pred[idx]
    vals_target = arr_target[idx]

    # compute TP, FP, FN (TN is not needed by any of the metrics below)
    TP, FP, _, FN = perf_measure(vals_target, vals_pred)

    nan = float('nan')
    n_target_pairs = TP + FN
    n_pred_pairs = TP + FP

    # The counts are plain ints, so an empty target or prediction would
    # otherwise raise ZeroDivisionError; report the undefined ratio as NaN.
    sensitivity = TP / n_target_pairs if n_target_pairs > 0 else nan  # recall
    ppv = TP / n_pred_pairs if n_pred_pairs > 0 else nan  # precision

    if n_target_pairs == 0 and n_pred_pairs == 0:
        # Nothing to find and nothing predicted: F1 is undefined.
        f1_score = nan
    elif TP == 0:
        # No overlap at all; also covers the case where one ratio is NaN.
        f1_score = 0
    else:
        f1_score = 2 * sensitivity * ppv / (sensitivity + ppv)

    return sensitivity, ppv, f1_score
    

In [None]:
def perf_measure(y_actual, y_hat):
    """Count confusion-matrix entries for two aligned label sequences.

    Positions are visited by index over ``len(y_hat)``. A prediction of 1 is
    a true positive when it equals the actual label and a false positive
    otherwise; a prediction of 0 is a true negative when it equals the actual
    label and a false negative otherwise. Predictions that are neither 0 nor
    1 are not counted.

    Returns:
        Tuple ``(TP, FP, TN, FN)`` of integer counts.
    """
    # thanks to https://stackoverflow.com/questions/31324218/scikit-learn-how-to-obtain-true-positive-true-negative-false-positive-and-fal
    TP = FP = TN = FN = 0

    for position in range(len(y_hat)):
        truth = y_actual[position]
        guess = y_hat[position]
        if guess == 1:
            if truth == guess:
                TP += 1
            else:
                FP += 1
        elif guess == 0:
            if truth == guess:
                TN += 1
            else:
                FN += 1

    return TP, FP, TN, FN

In [None]:
# Notebook-level S2 predictor, read as a global by add_s2_pred.
# 'v0.2' is the argument handed to us2.Predictor (presumably a model version
# tag — confirm against model_utils.utils_s2).
predictor_s2 = us2.Predictor('v0.2')

In [None]:
# Dataset evaluated by the rest of the notebook; the path is also embedded in
# the figure titles further down.
path_dataset = 'data/bprna_t0p1_k1.pkl.gz'
# NOTE(review): unpickling can execute arbitrary code — only load pickle
# files that come from a trusted source.
df_full = pd.read_pickle(path_dataset)

In [None]:
# Sanity check: number of rows in the full dataset.
print(len(df_full))

In [None]:
# FIXME debug
# Evaluate on (at most) 100 examples to keep the notebook fast while debugging.
# random_state pins the draw so Restart & Run All scores the same rows each
# time; min() avoids the ValueError sample() raises when the dataset has fewer
# than 100 rows.
df = df_full.sample(n=min(100, len(df_full)), random_state=0)

In [None]:
# Attach a 'df_pred' column produced by add_s2_pred from each row's
# 'bb_stem', 'bb_iloop', 'bb_hloop' and 'seq' values.
# NOTE(review): pbar=True presumably shows a progress bar — confirm against
# dgutils.pandas.add_column. Rebinding `df` means this cell needs the sampling
# cell re-run first if it is executed again.
df = dgp.add_column(df, 'df_pred', ['bb_stem', 'bb_iloop', 'bb_hloop', 'seq'], add_s2_pred, pbar=True)

In [None]:
# Collect one record of base-pair metrics per sequence; the list is turned
# into a DataFrame in a later cell.
df_metric = []
for _, row in df.iterrows():
    seq_len = row['len']
    df_pred = row['df_pred']
    # Target structure, converted from the stored 'bounding_boxes' field.
    df_target = process_bb_old_to_new(row['bounding_boxes'])
    
    # Only stems are compared here; other bb_type rows are filtered out.
    df_stem_pred = df_pred[df_pred['bb_type'] == 'stem']
    df_stem_target = df_target[df_target['bb_type'] == 'stem']
    
    sensitivity, ppv, f1_score = calculate_metric(df_stem_pred, df_stem_target, seq_len)
    
    df_metric.append({
        'seq_id': row['seq_id'],
        'len': seq_len,
        'sensitivity': sensitivity,
        'ppv': ppv,
        'f1_score': f1_score,
    })

In [None]:
# Turn the list of per-sequence records into a DataFrame for plotting.
# NOTE(review): this rebinds `df_metric` from a list to a DataFrame.
df_metric = pd.DataFrame(df_metric)

In [None]:
# Per-sequence base-pair sensitivity against sequence length; hovering a point
# shows its seq_id, f1_score and ppv.
fig = px.scatter(df_metric, x='len', y='sensitivity', hover_data=['seq_id', 'f1_score', 'ppv'])
fig.update_layout(title='Base pair metric of S2 prediction. Dataset: {}'.format(path_dataset))
fig.show()

In [None]:
# Per-sequence base-pair F1 score against sequence length; hovering a point
# shows its seq_id, sensitivity and ppv.
fig = px.scatter(df_metric, x='len', y='f1_score', hover_data=['seq_id', 'sensitivity', 'ppv'])
fig.update_layout(title='Base pair metric of S2 prediction. Dataset: {}'.format(path_dataset))
fig.show()

In [None]:
# S1 eval - bounding box
# The evaluator is built without a predictor (predictor=None); the cells below
# only call its calculate_bb_metrics method.
evaluator = Evaluator(predictor=None)

In [None]:
def _bb_sensitivity(df_target, df_pred):
    """Fraction of target boxes that the prediction reproduces exactly.

    Both frames are reduced to the ``bb_x``/``bb_y``/``siz_x``/``siz_y``
    columns and compared by the notebook-level
    ``evaluator.calculate_bb_metrics``. Returns NaN when there are no target
    boxes.
    """
    cols = ['bb_x', 'bb_y', 'siz_x', 'siz_y']
    if df_pred.empty:
        # An empty/None prediction list yields a frame with no columns, and
        # selecting `cols` from it would raise KeyError; give it the expected
        # (empty) columns instead.
        df_pred = df_pred.reindex(columns=cols)
    result = evaluator.calculate_bb_metrics(df_target[cols], df_pred[cols])
    if result['n_target_total'] > 0:
        return result['n_target_identical'] / result['n_target_total']
    return float('nan')


def compute_metric_bb(bounding_boxes, bb_stem, bb_iloop, bb_hloop):
    """Bounding-box sensitivity of the S1 proposals, per box type.

    Args:
        bounding_boxes: target boxes, converted with ``process_bb_old_to_new``
            into a frame that carries a ``bb_type`` column.
        bb_stem: predicted stem boxes (anything ``pandas.DataFrame`` accepts).
        bb_iloop: predicted internal-loop boxes.
        bb_hloop: predicted hairpin-loop boxes.

    Returns:
        Tuple ``(sensitivity_stem_bb, sensitivity_iloop_bb,
        sensitivity_hloop_bb)``; an entry is NaN when the target contains no
        box of that type.
    """
    df_target = process_bb_old_to_new(bounding_boxes)
    # Order matters: it fixes the order of the returned tuple.
    predictions = (('stem', bb_stem), ('iloop', bb_iloop), ('hloop', bb_hloop))
    return tuple(
        _bb_sensitivity(df_target[df_target['bb_type'] == bb_type], pd.DataFrame(boxes))
        for bb_type, boxes in predictions
    )

In [None]:
# Add the three bounding-box sensitivity columns returned by compute_metric_bb,
# computed from each row's 'bounding_boxes', 'bb_stem', 'bb_iloop' and
# 'bb_hloop' values. The result is bound to a new name, so `df` is not rebound.
df_metric_bb = dgp.add_columns(df, ['sensitivity_stem_bb', 'sensitivity_iloop_bb', 'sensitivity_hloop_bb'],
                    ['bounding_boxes', 'bb_stem', 'bb_iloop', 'bb_hloop'], compute_metric_bb)

In [None]:
# Stem bounding-box sensitivity against sequence length (hover shows seq_id).
fig = px.scatter(df_metric_bb, x='len', y='sensitivity_stem_bb', hover_data=['seq_id'])
fig.update_layout(title='Bounding box metric of S1 prediction. Dataset: {}'.format(path_dataset))
fig.show()

In [None]:
# Internal-loop bounding-box sensitivity against sequence length (hover shows
# seq_id).
fig = px.scatter(df_metric_bb, x='len', y='sensitivity_iloop_bb', hover_data=['seq_id'])
fig.update_layout(title='Bounding box metric of S1 prediction. Dataset: {}'.format(path_dataset))
fig.show()

In [None]:
# Hairpin-loop bounding-box sensitivity against sequence length (hover shows
# seq_id).
fig = px.scatter(df_metric_bb, x='len', y='sensitivity_hloop_bb', hover_data=['seq_id'])
fig.update_layout(title='Bounding box metric of S1 prediction. Dataset: {}'.format(path_dataset))
fig.show()