#### Controlling the False Omission Rate (FOR)

In [1]:
from sklearn.ensemble import RandomForestRegressor
import numpy as np
import pandas as pd
import sys
import os
import random
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, accuracy_score
import argparse
from utility import conf_pval, BH, eval, dice_sim, thresholds_map, Timer

In [2]:
# Choose the dataset by name and read its CSV from the local `data` directory.
dataset_name = 'PGP'
dataset_file = '{}_training_disguised.csv'.format(dataset_name)
dataset_path = os.path.join('data', dataset_file)

dataset = pd.read_csv(dataset_path)

In [3]:
# Response is the 'Act' column; features are every column except the
# 'MOLECULE' and 'Act' columns.
total_Y = dataset['Act'].to_numpy()
total_X = dataset.drop(columns=['MOLECULE', 'Act']).to_numpy()

# Fixed seed so that the shuffled splits (and all downstream numbers) are
# reproduced by Restart Kernel -> Run All. Change it to draw another split.
split_seed = 0

# First hold out 15% as the test set, then divide the remaining 85% into
# 50/85 for training and 35/85 for calibration (50/35/15 overall).
Xtc, Xtest, Ytc, Ytest = train_test_split(
    total_X, total_Y, test_size=15/100, shuffle=True, random_state=split_seed
)
Xtrain, Xcalib, Ytrain, Ycalib = train_test_split(
    Xtc, Ytc, train_size=50/85, shuffle=True, random_state=split_seed
)

In [4]:
# Activity cut-off for this dataset, looked up in the project's thresholds_map.
threshold = thresholds_map[dataset_name]

# The regressor is trained on the boolean indicator `Ytrain < threshold`, so
# its predictions serve as a score for the event "response below threshold".
# random_state is fixed so the fitted forest is reproducible on re-run.
rf = RandomForestRegressor(
    n_estimators=100, max_depth=20, max_features='sqrt', random_state=0
)
rf.fit(Xtrain, Ytrain < threshold)

In [6]:
# forward selection
# Targets test points whose response lies below `threshold`.

# Calibration score: negated prediction, shifted up by 1000 wherever the
# observed calibration response is below the threshold.
calib_shift = np.where(Ycalib < threshold, 1000, 0)
calib_scores = calib_shift - rf.predict(Xcalib)
# Test responses are not used here; the score is the negated prediction only.
test_scores = np.negative(rf.predict(Xtest))

# p-values from the project's conf_pval helper, then selection via BH with 0.3
# (presumably Benjamini-Hochberg at nominal level 0.3 -- see utility.py).
pvals = conf_pval(calib_scores, test_scores)
sel_forward = BH(pvals, 0.3)

# eval returns a triple; only the first and third entries are kept here.
fdp_forward, _, power_forward = eval(
    Ytest, sel_forward, -np.inf, threshold
)
fdp_forward, power_forward

(0.30864197530864196, 0.3916083916083916)

In [11]:
# backward selection
# Mirror image of the forward procedure: negate both the response and the
# prediction. `-Y < -threshold` is the same as `Y > threshold`, and subtracting
# a negated prediction is the same as adding it, so the scores are written in
# that simplified form below.

calib_shift = np.where(Ycalib > threshold, 1000, 0)
calib_scores = calib_shift + rf.predict(Xcalib)
test_scores = rf.predict(Xtest)

# Same helpers as the forward cell, this time with 0.1 passed to BH.
pvals = conf_pval(calib_scores, test_scores)
sel_backward = BH(pvals, 0.1)

# eval returns a triple; only the first and third entries are kept here.
fdp_backward, _, power_backward = eval(
    Ytest, sel_backward, threshold, np.inf
)
fdp_backward, power_backward

(0.10514541387024609, 0.9791921664626683)

In [12]:
# check the intersection

forward_set = set(sel_forward)
backward_set = set(sel_backward)

# Test indices that the backward procedure did NOT select.
sel_backward_c = set(range(len(Ytest))) - backward_set

# green: picked by forward and left out by backward.
green = forward_set & sel_backward_c
# grey: in exactly one of (forward selection, backward complement), i.e. the
# symmetric difference of the two sets.
grey = forward_set ^ sel_backward_c
# red: picked by backward and not by forward.
red = backward_set - forward_set

len(sel_forward), len(sel_backward), len(green), len(grey), len(red)

(81, 894, 66, 15, 879)

In [13]:
# Score the green set against the below-threshold range and the red set
# against the above-threshold range, using the project's eval helper.
green_metrics = eval(Ytest, list(green), -np.inf, threshold)
red_metrics = eval(Ytest, list(red), threshold, np.inf)
green_metrics, red_metrics

((0.25757575757575757, 0.017708333333333333, 0.34265734265734266),
 (0.09897610921501707, 0.090625, 0.9694002447980417))

In [3]:
# Average the saved per-run result tables (runs 1..100) by nominal level.
#
# The path is relative rather than an absolute machine-specific one.
# NOTE(review): this assumes the notebook is run from the repository root, as
# the relative 'data' path in the loading cell also does -- confirm.
result_dir = os.path.join("result", "fop PGP 0.10")

# Read every run into a list and concatenate once; growing a DataFrame with
# pd.concat inside the loop re-copies all earlier rows on every iteration.
run_frames = [
    pd.read_csv(os.path.join(result_dir, f"fop PGP 0.10 {run}.csv"))
    for run in range(1, 101)
]

# Mean of every remaining column within each `fop_nominal` group.
df = pd.concat(run_frames).groupby("fop_nominal", as_index=False).mean()
df

Unnamed: 0.1,fop_nominal,Unnamed: 0,fops,powers
0,0.02,0.0,0.01968,0.055945
1,0.04,1.0,0.032165,0.250809
2,0.06,2.0,0.052158,0.460021
3,0.08,3.0,0.068129,0.668809
4,0.1,4.0,0.094682,0.824818
5,0.12,5.0,0.11543,0.92817
6,0.14,6.0,0.132402,0.97849
7,0.16,7.0,0.141952,0.995731
8,0.18,8.0,0.145762,0.9995
9,0.2,9.0,0.146875,1.0
