In [None]:
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go

In [None]:
from collections import namedtuple

In [None]:
from scipy.stats import pearsonr

In [None]:
import sys
sys.path.insert(0, '../../rna_ss/')
from utils import pairs2idx, idx2arr, arr2db

In [None]:
def stem_bb_to_db_str(stem_bbs, seq_len):
    """Convert stem bounding boxes to a dot-bracket string.

    Hacky implementation, no pseudoknot support: every paired position is
    written as a plain '(' or ')', and a later stem overwrites whatever an
    earlier stem wrote at the same position.

    Args:
        stem_bbs: iterable of objects exposing integer attributes ``tr_x``,
            ``tr_y``, ``bl_x`` and ``bl_y``. For each stem, positions
            ``tr_x .. bl_x`` (ascending) are paired one-to-one with positions
            ``tr_y .. bl_y`` (descending).
        seq_len: length of the sequence, i.e. of the returned string.

    Returns:
        A string of length ``seq_len`` with '(' at the first position of each
        pair, ')' at the second position, and '.' everywhere else.
    """
    db_str = ['.'] * seq_len
    for s in stem_bbs:
        for i, j in zip(range(s.tr_x, s.bl_x + 1), range(s.bl_y, s.tr_y + 1)[::-1]):
            # Corner case: skip any pair with a position outside the sequence.
            # This can happen in rare cases (which should have been cleaned up
            # after stage 1 <- to be fixed). Negative positions must be rejected
            # too, otherwise Python's negative indexing would silently mark a
            # position counted from the end of the string.
            if not (0 <= i < seq_len and 0 <= j < seq_len):
                continue
            db_str[i] = '('
            db_str[j] = ')'
    return ''.join(db_str)


def struct_df_to_db_str(df_struct, seq_len):
    """Render a predicted structure as a dot-bracket string.

    Args:
        df_struct: a DataFrame with columns ``bb_type``, ``bb_x``, ``bb_y``,
            ``siz_x`` and ``siz_y`` describing bounding boxes, or ``None``
            when there is no structure.
        seq_len: length of the sequence, i.e. of the returned string.

    Returns:
        All dots when ``df_struct`` is ``None``; otherwise the dot-bracket
        string built by ``stem_bb_to_db_str`` from the rows whose ``bb_type``
        is ``'stem'``.
    """
    # structure: a df, no structure: None
    if not isinstance(df_struct, pd.DataFrame):
        assert df_struct is None
        return '.' * seq_len

    BoundingBox = namedtuple('BoundingBox', ['tr_x', 'tr_y', 'bl_x', 'bl_y'])

    # Only stem boxes contribute base pairs.
    stem_rows = df_struct[df_struct['bb_type'] == 'stem']

    # (bb_x, bb_y) is used as the top-right corner; the bottom-left corner is
    # derived from it by moving (siz_x - 1) along x and (siz_y - 1) back along y.
    stems = [
        BoundingBox(tr_x=x, tr_y=y, bl_x=x + size_x - 1, bl_y=y - size_y + 1)
        for x, y, size_x, size_y in zip(
            stem_rows['bb_x'], stem_rows['bb_y'], stem_rows['siz_x'], stem_rows['siz_y']
        )
    ]

    return stem_bb_to_db_str(stems, seq_len)

In [None]:
# Load the table analysed in the rest of this notebook (later cells read the
# columns pred_fe, free_energy, seq, len, one_idx and best_struct).
# NOTE(review): unpickling can execute arbitrary code — only load this file
# if it comes from a trusted source.
df = pd.read_pickle('data/rand_s1_bb_0p1_global_structs_60_fe.pkl.gz')

In [None]:
# Report the total number of rows and how many of them have a negative
# predicted free energy (pred_fe < 0).
print("Total: {}\nWith negative free energy (as defined by RNAfold): {}".format(len(df), (df['pred_fe']<0).sum()))

In [None]:
# Pearson correlation between the free energy of the predicted structure
# (pred_fe) and of the RNAfold-generated structure (free_energy).
# Free energy is computed using RNAfold in both cases, so in theory
# pred_fe should always be >= free_energy.
# Only rows with a negative predicted free energy are plotted.
df_plot = df.loc[df['pred_fe'] < 0]
corr, pval = pearsonr(df_plot['free_energy'], df_plot['pred_fe'])

fig = px.scatter(
    df_plot,
    x='free_energy',
    y='pred_fe',
    width=600,
    height=600,
    title="Pearson corr: {:.2f} ({:.2e})".format(corr, pval),
)
fig.update_traces(mode='markers', marker_size=2)
fig.show()

In [None]:
# Scatter of pred_fe against free_energy again, this time with the perfectly
# predicted cases (pred_fe equal to free_energy) removed in addition to the
# pred_fe < 0 filter.
df_plot = df.loc[(df['pred_fe'] < 0) & (df['pred_fe'] != df['free_energy'])]
corr, pval = pearsonr(df_plot['free_energy'], df_plot['pred_fe'])

fig = px.scatter(
    df_plot,
    x='free_energy',
    y='pred_fe',
    width=600,
    height=600,
    title="[without perfect hits] Pearson corr: {:.2f} ({:.2e})".format(corr, pval),
)
fig.update_traces(mode='markers', marker_size=2)
fig.show()

In [None]:
# Distribution of the relative free-energy difference
# (free_energy - pred_fe) / free_energy for rows with pred_fe < 0.
# Note: despite the column name, perc_diff is a fraction, not a percentage
# (0 means pred_fe equals free_energy).
# The filtered rows are copied explicitly before the derived column is added:
# assigning a new column into a filtered slice of df raises pandas'
# SettingWithCopyWarning.
df_plot = df[df['pred_fe'] < 0].copy()
df_plot['perc_diff'] = (df_plot['free_energy'] - df_plot['pred_fe']) / df_plot['free_energy']

px.histogram(df_plot, x='perc_diff',
             histnorm='probability density',
             width=600, height=600)

In [None]:
# Distribution of the relative free-energy difference
# (free_energy - pred_fe) / free_energy, with the perfectly predicted cases
# (pred_fe equal to free_energy) removed in addition to the pred_fe < 0 filter.
# Note: despite the column name, perc_diff is a fraction, not a percentage.
# The filtered rows are copied explicitly before the derived column is added:
# assigning a new column into a filtered slice of df raises pandas'
# SettingWithCopyWarning.
df_plot = df[(df['pred_fe'] < 0) & (df['pred_fe'] != df['free_energy'])].copy()
df_plot['perc_diff'] = (df_plot['free_energy'] - df_plot['pred_fe']) / df_plot['free_energy']

px.histogram(df_plot, x='perc_diff',
             histnorm='probability density',
             title='[without perfect hits]',
             width=600, height=600)

In [None]:
# Print every example whose predicted free energy is lower than RNAfold's.
# These should not happen? (in theory pred_fe is always >= free_energy)
# For each one, show both energies, the sequence with its length, and the
# RNAfold and predicted structures as dot-bracket strings for comparison.
for _, row in df.loc[df['pred_fe'] < df['free_energy']].iterrows():
    fe_rnafold, fe_pred = row['free_energy'], row['pred_fe']
    seq, seq_len = row['seq'], row['len']
    one_idx, pred_struct = row['one_idx'], row['best_struct']

    print("FE: rnafold {}, pred {}".format(fe_rnafold, fe_pred))
    print("{} length {}".format(seq, seq_len))

    # arr2db returns a second value (bound to pk_exist) that is not used here.
    db_str_rnafold, pk_exist = arr2db(idx2arr(one_idx, seq_len))
    db_str_pred = struct_df_to_db_str(pred_struct, seq_len)

    print("{} [RNAfold]".format(db_str_rnafold))
    print("{} [pred]".format(db_str_pred))
    print('')

In [None]:
# Display the first row of df to inspect the available fields.
df.iloc[0]