In [2]:
import pandas as pd
import numpy as np
import os

In [8]:
df = pd.read_csv('~/git/RTLib/fido_out.txt', sep='\t')
df

Unnamed: 0,prob,prot
0,1.000000,sp_P36578
1,0.827963,sp_A6NHY2
2,0.727791,REV__sp_Q9NX20
3,1.000000,sp_P10412
4,0.583840,sp_P22492
5,0.835864,REV__sp_P10412
6,0.867493,REV__sp_P16402
7,0.999995,sp_P16403
8,0.999998,sp_P16402
9,0.625358,sp_Q02539


In [14]:
dfa = df.sort_values('prob', ascending=False).reset_index(drop=True)
dff = dfa.groupby('prob')['prot'].apply(lambda x: x.values).sort_index(ascending=False)
dff

prob
1.000000         [sp_P19338, sp_P10809, sp_P26038, sp_P07237]
1.000000                                        [CON__P34955]
1.000000                                        [CON__P02769]
1.000000                                          [sp_P13639]
1.000000                                          [sp_P50990]
1.000000                    [CON__ENSEMBL:ENSBTAP00000024146]
1.000000                                          [sp_P41250]
1.000000                                          [sp_P00558]
1.000000                                          [sp_P14625]
1.000000                                          [sp_P36578]
1.000000                                          [sp_P06733]
1.000000                                          [sp_P22314]
1.000000                                          [sp_Q02878]
1.000000                                          [sp_P26641]
1.000000                                          [sp_P16401]
1.000000                                          [sp_P07195]
1.0

In [36]:
fps = [0]
tps = [0]
est_fdr_list = [0]
emp_fdr_list = [0]

fp_count = 0
tp_count = 0
total_fdr = 0.0
est_fdr = 0.0
emp_fdr = 0.0

num_decoys = 0
num_targets = 0

for i in range(0, dff.shape[0]):
    prob = dff.index[i]
    prots = dff.iloc[i]
    #print(prots)
    fp_change = np.sum('REV' in prot for prot in prots)
    tp_change = len(prots) - fp_change
    #print(fp_change, tp_change)
    
    num_decoys += fp_change
    num_targets += tp_change
    
    if tp_change > 0 and fp_change > 0:
        tp_change = fp_change = 0
    elif tp_change > 0:
        tp_change = 1
    elif fp_change > 0:
        fp_change = 1
    
    if i != 0:
        fps.append(fp_count)
        tps.append(tp_count)
        
        if est_fdr > est_fdr_list[len(est_fdr_list)-1]:
            est_fdr_list.append(est_fdr)
            emp_fdr_list.append(emp_fdr)
    
    fp_count += fp_change
    tp_count += tp_change
    
    total_fdr += (1-prob) * (fp_change + tp_change)
    est_fdr = total_fdr / (fp_count + tp_count)
    emp_fdr = float(fp_count) / (fp_count + tp_count)

fps.append(fp_count)
tps.append(tp_count)

fps.append(num_decoys)
tps.append(num_targets)

print(len(est_fdr_list))
print(len(fps))

est_fdr_list = np.array(est_fdr_list)
emp_fdr_list = np.array(emp_fdr_list)

2971
3194


In [51]:
def antiderivative_at(m, b, xVal):
    return (m * xVal * xVal / 2.0) + (b * xVal)

def squared_antiderivative_at(m, b, xVal):
    u = m*m
    v = 2*m*b
    t = b*b
    
    return (u * xVal * xVal / 3.0) + (v * xVal * xVal / 2.0) + (t * xVal)

def area(x1, y1, x2, y2, max_x):
    m = (y2-y1) / (x2-x1)
    b = y1-(m*x1)
    result = antiderivative_at(m, b, np.min([max_x, x2])) - antiderivative_at(m, b, x1)
    if result < 0.0:
        print('area: {}\n{} {} {} {}'.format(result, m, b, x1, x2))
    return result

def squared_area(x1, y1, x2, y2, max_x):
    if x2 < x1: return 0.0
    
    m = (y2-y1) / (x2-x1)
    b = y1-(m*x1)
    
    result = squared_antiderivative_at(m, b, np.min([max_x, x2])) - squared_antiderivative_at(m, b, x1)
    return result

# ROC_N
roc_N = 0.0
N = 50

if fps[-1] < N:
    print('Warning: There are not enough false positives; needed {} and was only given {}. Will proceed using largest available value'.format(N, fps[-1]))
    N = fps[-1]
    
for i in range(0, len(fps)-1):
    if fps[i] >= N: break
    if fps[i] != fps[i+1]:
        current_area = area(fps[i], tps[i], fps[i+1], tps[i+1], N)
        roc_N += current_area
    
roc_N / (N * tps[-1])

0.04872871987618837

In [53]:
# FDR divergence

diff = est_fdr_list - emp_fdr_list
tot = 0.0
thresh = 0.1

for i in range(0, len(diff)-1):
    if est_fdr_list[i] >= thresh:
        if i == 0:
            tot = np.inf
        break
    
    tot += squared_area(est_fdr_list[i], diff[i], est_fdr_list[i+1], diff[i+1], est_fdr_list[i+1])

x_range = np.min([thresh, est_fdr_list[-2]]) - est_fdr_list[0]

if np.isinf(tot):
    #return tot
    print('inf')
    tot

#return tot / x_range
tot / x_range

2.376667854082819

In [54]:
(0.15) * (roc_N / (N * tps[-1])) - ((1-0.15) * (tot / x_range))

-2.012858367988968

In [61]:
np.arange(10, 100, 10)

array([10, 20, 30, 40, 50, 60, 70, 80, 90])

In [62]:
np.append([], 'a')

array(['a'], dtype='<U32')