In [1]:
import pandas as pd
import numpy as np
from dgutils.pandas import add_columns

  from tqdm.autonotebook import tqdm


In [2]:
df_dataset = pd.read_pickle('/Users/alicegao/work/psi-lab-sandbox/rna_ss/data_processing/rna_cg/data/rfam.pkl')
df_e2efold = pd.read_pickle('/Users/alicegao/work/psi-lab-sandbox/rna_ss/tools/e2efold_2/e2efold_productive/tmp/rfam151.pkl')
df_spotrna = pd.read_pickle('/Users/alicegao/work/psi-lab-sandbox/rna_ss/tools/SPOT-RNA-master/tmp/rfam151.pkl')
df_rnafold = pd.read_pickle('/Users/alicegao/work/psi-lab-sandbox/meetings/2020_05_26/data/rfam151.pkl')

In [3]:
df_e2efold = df_e2efold.rename(columns={'pred_idx': 'pred_idx_e2efold'})
df = pd.merge(df_dataset, df_e2efold[['seq', 'pred_idx_e2efold']], on='seq')
df_spotrna = df_spotrna.rename(columns={'pred_idx': 'pred_idx_spotrna'})
df = pd.merge(df, df_spotrna[['seq', 'pred_idx_spotrna']], on='seq')
df_rnafold = df_rnafold.rename(columns={'pred_idx': 'pred_idx_rnafold'})
df = pd.merge(df, df_rnafold[['seq', 'pred_idx_rnafold']], on='seq')

In [4]:
# load dataset overlap, so that we can check performance on held-out portion

# SPOt-RNA was training on bpRNA & PDB20
df_overlap_spotrna1 = pd.read_csv('/Users/alicegao/work/psi-lab-sandbox/rna_ss/data_processing/dataset_overlap/data/rfam151_bprna_overlap.csv.gz')
df_overlap_spotrna2 = pd.read_csv('/Users/alicegao/work/psi-lab-sandbox/rna_ss/data_processing/dataset_overlap/data/rfam151_pdb250_overlap.csv.gz')
df_overlap_spotrna = pd.concat([df_overlap_spotrna1, df_overlap_spotrna2])

# E2Efold trained on rnastralign
df_overlap_e2efold = pd.read_csv('/Users/alicegao/work/psi-lab-sandbox/rna_ss/data_processing/dataset_overlap/data/rfam151_rnastralign_overlap.csv.gz')


In [5]:
rfam_ids_in_spotrna = df_overlap_spotrna['seq_id1'].unique()
rfam_ids_in_e2efold = df_overlap_e2efold['seq_id1'].unique()


print(len(rfam_ids_in_spotrna))
print(len(rfam_ids_in_e2efold))

91
10


In [6]:
class EvalMetric(object):

#     @staticmethod
#     def _check_arr(arr):
#         assert len(arr.shape) == 2
#         assert arr.shape[0] == arr.shape[1]
#         assert np.all((arr == 0) | (arr == 1))
#         assert np.max(np.sum(arr, axis=0)) <= 1
#         assert np.max(np.sum(arr, axis=1)) <= 1

    @staticmethod
    def sensitivity(_pred, _target):
        # numerator: number of correct predicted base pairs
        # denominator: number of true base pairs
        assert _pred.shape[0] == _target.shape[0]
        n = _pred.shape[0]
        # set lower triangular to be all 0's
        pred = _pred.copy()
        target = _target.copy()
        pred[np.tril_indices(n)] = 0
        target[np.tril_indices(n)] = 0
        # checks
#         EvalMetric._check_arr(pred)
#         EvalMetric._check_arr(target)
        # metric
        idx_true_base_pair = np.where(target == 1)
        return float(np.sum(pred[idx_true_base_pair]))/np.sum(target)

    @staticmethod
    def ppv(_pred, _target):
        # numerator: number of correct predicted base pairs
        # denominator: number of predicted base pairs
        assert _pred.shape[0] == _target.shape[0]
        n = _pred.shape[0]
        # set lower triangular to be all 0's
        pred = _pred.copy()
        target = _target.copy()
        pred[np.tril_indices(n)] = 0
        target[np.tril_indices(n)] = 0
        # checks
#         EvalMetric._check_arr(pred)
#         EvalMetric._check_arr(target)
        # metric
        idx_predicted_base_pair = np.where(pred == 1)
        return float(np.sum(target[idx_predicted_base_pair])/np.sum(pred))

    @staticmethod
    def f_measure(sensitivity, ppv):
        return (2 * sensitivity * ppv)/(sensitivity + ppv)

In [7]:
eval_metric = EvalMetric()

In [8]:
def add_metrics(seq_len, one_idx, pred_idx):
    arr_target = np.zeros((seq_len, seq_len))
    arr_target[one_idx] = 1
    arr_pred = np.zeros((seq_len, seq_len))
    arr_pred[pred_idx] = 1

    sensitivity = eval_metric.sensitivity(arr_pred, arr_target)
    ppv = eval_metric.ppv(arr_pred, arr_target)
    f_measure = eval_metric.f_measure(sensitivity, ppv)
    
    return sensitivity, ppv, f_measure

In [9]:
for model_name in ['e2efold', 'spotrna', 'rnafold']:
    df = add_columns(df, ['sensitivity_{}'.format(model_name),
                          'ppv_{}'.format(model_name),
                          'f_measure_{}'.format(model_name)], 
                     ['len', 'one_idx', 'pred_idx_{}'.format(model_name)], add_metrics)

  """


In [10]:
df.describe()

Unnamed: 0,f_measure_e2efold,f_measure_rnafold,f_measure_spotrna,len,ppv_e2efold,ppv_rnafold,ppv_spotrna,sensitivity_e2efold,sensitivity_rnafold,sensitivity_spotrna
count,75.0,142.0,151.0,151.0,151.0,151.0,151.0,151.0,151.0,151.0
mean,0.132609,0.659045,0.670549,136.298013,0.088511,0.573774,0.684496,0.059149,0.695547,0.693567
std,0.169848,0.256173,0.230741,102.036451,0.16459,0.29579,0.224764,0.136486,0.302021,0.269894
min,0.012158,0.082192,0.076336,23.0,0.0,0.0,0.108696,0.0,0.0,0.053763
25%,0.041241,0.455164,0.485569,67.0,0.0,0.348573,0.555556,0.0,0.50625,0.49
50%,0.066667,0.713164,0.716049,104.0,0.0,0.574468,0.735849,0.0,0.8,0.72973
75%,0.147099,0.86646,0.865766,158.5,0.102632,0.792892,0.870266,0.053813,0.986726,0.954545
max,1.0,1.0,1.0,568.0,1.0,1.0,1.0,1.0,1.0,1.0


In [12]:
# subset to sequences not similar/identical to SPOT-RNA training data
df[~df['seq_id'].isin(rfam_ids_in_spotrna)].describe()

Unnamed: 0,f_measure_e2efold,f_measure_rnafold,f_measure_spotrna,len,ppv_e2efold,ppv_rnafold,ppv_spotrna,sensitivity_e2efold,sensitivity_rnafold,sensitivity_spotrna
count,28.0,57.0,60.0,60.0,60.0,60.0,60.0,60.0,60.0,60.0
mean,0.120733,0.659181,0.675827,139.183333,0.078074,0.569718,0.678067,0.047569,0.722145,0.714313
std,0.145842,0.234129,0.215863,111.491961,0.1419,0.278502,0.227523,0.107816,0.26814,0.257576
min,0.015385,0.184615,0.111111,23.0,0.0,0.0,0.108696,0.0,0.0,0.113636
25%,0.042332,0.461538,0.542424,68.75,0.0,0.380211,0.55303,0.0,0.559524,0.558029
50%,0.066912,0.725,0.715168,108.5,0.0,0.616369,0.736346,0.0,0.820856,0.78869
75%,0.131245,0.807018,0.840711,157.5,0.098182,0.761979,0.848901,0.049107,0.901563,0.931609
max,0.716418,1.0,1.0,568.0,0.705882,1.0,1.0,0.727273,1.0,1.0


In [13]:
# subset to sequences not similar/identical to E2Efold training data
df[~df['seq_id'].isin(rfam_ids_in_e2efold)].describe()

Unnamed: 0,f_measure_e2efold,f_measure_rnafold,f_measure_spotrna,len,ppv_e2efold,ppv_rnafold,ppv_spotrna,sensitivity_e2efold,sensitivity_rnafold,sensitivity_spotrna
count,65.0,132.0,141.0,141.0,141.0,141.0,141.0,141.0,141.0,141.0
mean,0.1011,0.672039,0.682383,126.390071,0.068867,0.584182,0.69688,0.040344,0.703694,0.706447
std,0.092868,0.25738,0.226985,92.544952,0.127414,0.30078,0.22075,0.078287,0.30755,0.266505
min,0.012158,0.082192,0.076336,23.0,0.0,0.0,0.131579,0.0,0.0,0.053763
25%,0.036364,0.494505,0.509091,65.0,0.0,0.351351,0.555556,0.0,0.521739,0.517241
50%,0.057143,0.727875,0.735849,102.0,0.0,0.628571,0.736842,0.0,0.810811,0.75
75%,0.125,0.88,0.869565,151.0,0.076923,0.810811,0.88,0.045455,1.0,0.956522
max,0.352941,1.0,1.0,568.0,0.75,1.0,1.0,0.6,1.0,1.0
