In [None]:
import pandas as pd
import numpy as np

In [None]:
import plotly.express as px

In [None]:
from model_utils.utils_s2 import Predictor

In [None]:
import dgutils.pandas as dgp

In [None]:
from model_utils.utils_nn_s2 import add_bb_bottom_left, greedy_sample

In [None]:
# Build the Predictor from a saved checkpoint path. This module-level object is
# read as a global inside tmp_eval, which passes it on to greedy_sample, so this
# cell must run before any evaluation cell.
predictor = Predictor('../2020_12_08/result/synthetic_s2_training_2/model_ckpt_ep_12.pth')

In [None]:
def summarize_df(df, hloop=False):
    """Reduce each bounding-box row to a median probability and a normalized proposal count.

    Two columns are added through ``dgp.add_columns`` from the row's
    ``siz_x``, ``siz_y`` and ``prob`` values:

    * ``prob_median``     -- ``np.median(prob)``
    * ``n_proposal_norm`` -- ``len(prob) / (siz_x * siz_y)``, multiplied by 2
      when ``hloop`` is True.

    Args:
        df: frame providing at least the columns ``bb_x``, ``bb_y``, ``siz_x``,
            ``siz_y`` and ``prob`` (``prob`` must support ``len`` and
            ``np.median``).
        hloop: when True, the normalized proposal count is doubled.

    Returns:
        The frame restricted to ``bb_x``, ``bb_y``, ``siz_x``, ``siz_y``,
        ``prob_median`` and ``n_proposal_norm``.
    """
    kept_columns = ['bb_x', 'bb_y', 'siz_x', 'siz_y', 'prob_median', 'n_proposal_norm']

    def _median_and_density(siz_x, siz_y, prob):
        # median first, then proposals per unit of box area
        median_prob = np.median(prob)
        box_area = float(siz_x * siz_y)
        density = len(prob) / box_area
        return median_prob, (2 * density if hloop else density)

    with_summary = dgp.add_columns(
        df,
        ['prob_median', 'n_proposal_norm'],
        ['siz_x', 'siz_y', 'prob'],
        _median_and_density,
    )
    # keep only the coordinates, sizes and the two summary columns
    return with_summary[kept_columns]

In [None]:
def bb_df2arr(df, seq_len):
    """Rasterize stem bounding boxes into a symmetric binary pairing matrix.

    Only rows whose ``bb_type`` equals ``'stem'`` are used. For a stem with
    corner ``(bb_x, bb_y)`` and size ``s`` the cells ``(bb_x + k, bb_y - k)``
    for ``k = 0 .. s - 1`` are set to 1, together with their transposes.

    Args:
        df: frame with columns ``bb_x``, ``bb_y``, ``siz_x``, ``siz_y`` and
            ``bb_type``.
        seq_len: side length of the square output array.

    Returns:
        ``np.ndarray`` of shape ``(seq_len, seq_len)`` holding 0.0 / 1.0.

    Raises:
        AssertionError: if a stem is not square or has non-integer
            coordinates / size.
        IndexError: if any cell of a stem lies outside
            ``[0, seq_len)``. Negative indices are rejected explicitly;
            numpy would otherwise wrap them around to the end of the array
            and silently mark the wrong pairs.
    """
    x = np.zeros((seq_len, seq_len))
    # subset to stems
    df = df[df['bb_type'] == 'stem']
    for _, row in df.iterrows():
        bb_x, bb_y = row['bb_x'], row['bb_y']
        siz_x, siz_y = row['siz_x'], row['siz_y']
        assert siz_x == siz_y
        assert int(siz_x) == siz_x
        assert int(bb_x) == bb_x
        assert int(bb_y) == bb_y
        siz, bb_x, bb_y = int(siz_x), int(bb_x), int(bb_y)
        if siz <= 0:
            # nothing to draw (matches an empty range in a plain loop)
            continue
        # rows span [bb_x, bb_x + siz - 1]; columns span [bb_y - siz + 1, bb_y]
        if bb_x < 0 or bb_x + siz > seq_len or bb_y - siz + 1 < 0 or bb_y >= seq_len:
            raise IndexError(
                'stem bb (bb_x={}, bb_y={}, siz={}) is out of bounds for seq_len={}'.format(
                    bb_x, bb_y, siz, seq_len))
        offsets = np.arange(siz)
        idx_x = bb_x + offsets
        idx_y = bb_y - offsets
        x[idx_x, idx_y] = 1
        x[idx_y, idx_x] = 1
    return x

In [None]:
def tmp_eval(data):
    """Run greedy sampling on one example and score predicted stems against the target.

    Reads ``bb_stem``, ``bb_iloop``, ``bb_hloop``, ``df_target``, ``len`` and
    ``n_bb_found`` from ``data`` and uses the module-level ``predictor``.
    Predicted and target stems are both rasterized with ``bb_df2arr`` and
    compared cell by cell.

    Args:
        data: mapping-like record (e.g. a DataFrame row) with the keys above.

    Returns:
        Tuple ``(TPR, TNR, PPV, s1_ss)``: sensitivity, specificity and
        precision of the predicted pairing matrix, plus a flag that is True
        when ``len(data['df_target'])`` equals ``data['n_bb_found']``.
    """
    # FIXME handle cases where some are None
    bb_sources = (('bb_stem', False), ('bb_iloop', False), ('bb_hloop', True))
    summaries = [
        summarize_df(pd.DataFrame(data[key]), hloop=is_hloop)
        for key, is_hloop in bb_sources
    ]
    # add bottom left coord
    df_stem, df_iloop, df_hloop = [add_bb_bottom_left(summary) for summary in summaries]

    picked_bb, df_data = greedy_sample(df_stem, df_iloop, df_hloop, predictor)

    is_picked = df_data['id_bb'].isin(picked_bb)
    df_picked = df_data[is_picked][['bb_x', 'bb_y', 'siz_x', 'siz_y', 'pred', 'id_bb']]
    # add bb type (using id, hacky)
    df_picked = dgp.add_column(df_picked, 'bb_type', ['id_bb'], lambda x: x.split('_')[0])

    seq_len = data['len']
    arr_pred = bb_df2arr(df_picked, seq_len)
    arr_target = bb_df2arr(pd.DataFrame(data['df_target']), seq_len)

    # complements of the two 0/1 matrices
    pred_off = 1 - arr_pred
    target_off = 1 - arr_target

    true_pos = np.sum(arr_pred * arr_target)    # pred = target = 1
    false_pos = np.sum(arr_pred * target_off)   # pred = 1, target = 0
    true_neg = np.sum(pred_off * target_off)    # pred = target = 0
    false_neg = np.sum(pred_off * arr_target)   # pred = 0, target = 1

    # Sensitivity, hit rate, recall, or true positive rate
    TPR = true_pos / (true_pos + false_neg)
    # Specificity or true negative rate
    TNR = true_neg / (true_neg + false_pos)
    # Precision or positive predictive value
    PPV = true_pos / (true_pos + false_pos)

    # whether S1 bb sensitivity is 100%
    s1_ss = len(data['df_target']) == data['n_bb_found']

    return TPR, TNR, PPV, s1_ss

In [None]:
# Load the pickled dataset of examples. Each row is later passed to tmp_eval,
# which reads the keys 'bb_stem', 'bb_iloop', 'bb_hloop', 'df_target', 'len'
# and 'n_bb_found'.
# NOTE(review): unpickling can execute arbitrary code -- only load trusted files.
df = pd.read_pickle('data/synthetic_s1_pruned.pkl.gz')

In [None]:
# Evaluate examples in random order and keep the first 1000 that evaluate
# without error (so this is NOT a uniform sample of 1000 rows: failing rows
# are skipped and replaced by later ones).
# NOTE(review): the shuffle is unseeded, so results differ between runs;
# pass random_state to df.sample if reproducibility is needed.
df_result = []
n_skipped = 0  # number of examples dropped because tmp_eval raised

df = df.sample(frac=1)  # shuffle

for _, row in df.iterrows():
    try:  # some corner cases need fix (no bb of certain type, out of bound, etc.)
        TPR, TNR, PPV, s1_ss = tmp_eval(row)
    except Exception:
        # Best-effort skip of examples that fail to evaluate. Catching
        # Exception (not a bare except) lets KeyboardInterrupt / SystemExit
        # propagate so the loop can still be interrupted.
        n_skipped += 1
        continue
    df_result.append({
        'tpr': TPR,
        'tnr': TNR,
        'ppv': PPV,
        's1_ss': s1_ss,
        'seq_len': row['len'],
    })
    if len(df_result) >= 1000:
        break

df_result = pd.DataFrame(df_result)

In [None]:
# Show the per-example metrics table (columns: tpr, tnr, ppv, s1_ss, seq_len).
df_result

In [None]:
# TPR vs PPV for examples where s1_ss is True, i.e. the number of target
# bounding boxes equals n_bb_found; violin marginals show each distribution.
px.scatter(df_result[df_result['s1_ss']], x='tpr', y='ppv', hover_data=['seq_len'], marginal_x='violin', marginal_y='violin')

In [None]:
# Summary statistics restricted to the examples where s1_ss is True.
df_result[df_result['s1_ss']].describe()

In [None]:
# Same TPR vs PPV plot for the complementary subset: examples where s1_ss is
# False (number of target bounding boxes differs from n_bb_found).
px.scatter(df_result[~df_result['s1_ss']], x='tpr', y='ppv', hover_data=['seq_len'], marginal_x='violin', marginal_y='violin')

In [None]:
# # synthetic
# df = pd.read_pickle('data/synthetic_s1_pruned.pkl.gz')

# data = df.iloc[0]

In [None]:
# data = df.iloc[111]

In [None]:
# # rfam
# df = pd.read_pickle('../2020_11_24/data/rfam151_s1_pruned.pkl.gz')

# data = df.iloc[4]
# # data = df.iloc[6]

# # data = df.iloc[26]

# # data = df.iloc[55]


In [None]:
# # FIXME handle cases where some are None
# df_stem = summarize_df(pd.DataFrame(data['bb_stem']))
# df_iloop = summarize_df(pd.DataFrame(data['bb_iloop']))
# df_hloop = summarize_df(pd.DataFrame(data['bb_hloop']), hloop=True)
# # add bottom left coord
# df_stem = add_bb_bottom_left(df_stem)
# df_iloop = add_bb_bottom_left(df_iloop)
# df_hloop = add_bb_bottom_left(df_hloop)

In [None]:
# picked_bb, df_data = greedy_sample(df_stem, df_iloop, df_hloop, predictor)

In [None]:
# picked_bb

In [None]:
# df_picked = df_data[df_data['id_bb'].isin(picked_bb)][['bb_x', 'bb_y', 'siz_x', 'siz_y', 'pred', 'id_bb']]
# # add bb type (using id, hacky)
# df_picked = dgp.add_column(df_picked, 'bb_type', ['id_bb'], lambda x: x.split('_')[0])

In [None]:
# df_picked

In [None]:
# pd.DataFrame(data['df_target'])

In [None]:
# seq_len = data['len']

In [None]:
# arr_pred = bb_df2arr(df_picked, seq_len)
# arr_target = bb_df2arr(pd.DataFrame(data['df_target']), seq_len)

In [None]:

# TP = np.sum(arr_pred * arr_target)   # pred = target = 1
# FP = np.sum(arr_pred * (1 - arr_target))   # pred = 1, target = 0
# TN = np.sum((1 - arr_pred) * (1 - arr_target))  # pred = target = 0
# FN = np.sum((1 - arr_pred) * arr_target) # pred = 0, target = 1
        


# # Sensitivity, hit rate, recall, or true positive rate
# TPR = TP/(TP+FN)
# # Specificity or true negative rate
# TNR = TN/(TN+FP) 
# # Precision or positive predictive value
# PPV = TP/(TP+FP)

In [None]:
# print(TPR, TNR, PPV)

In [None]:
# print(len(data['df_target']), data['n_bb_found'])