In [1]:
from protosc.simulation import create_categorical_data, compare_models, create_simulation_data, create_correlated_data
from protosc.wrapper import Wrapper
from protosc.filter_model import filter_model, calc_chisquare, compute_pval, create_clusters, select_features
from protosc.feature_matrix import FeatureMatrix
import pandas as pd

In [7]:
# Simulate a categorical data set and keep its ground truth for the model
# comparison at the end of the notebook.
# NOTE(review): the meaning of min_dev / max_dev / n_categories is defined in
# protosc.simulation and is not visible here — confirm before tuning them.
X, y, ground_truth = create_categorical_data(
    min_dev=10,
    max_dev=20,
    n_categories=3,
)
print(f'ground truth #features: {len(ground_truth["selected_features"])}')
# Alternative data source, kept disabled:
# X, y, ground_truth = create_correlated_data()

ground truth #features: 25


### Filter model

In [8]:
# Run the filter method with a fixed fold_seed.
filter_res = filter_model(X, y, fold_seed=1234)

# Tabulate the result; positional columns 0 and 1 are labelled
# 'Features' and 'Accuracy'.
df = pd.DataFrame(filter_res).rename(columns={0: 'Features', 1: 'Accuracy'})
df

Unnamed: 0,Features,Accuracy
0,"[327, 409, 171, 57, 3, 258, 392, 139, 397, 90,...",0.355556
1,"[171, 3, 327, 409, 88, 57, 46, 218, 74, 27, 39...",0.240741
2,"[327, 409, 3, 171, 166, 57, 139, 397, 88, 46, ...",0.203704
3,"[40, 394, 32, 257, 76, 360, 18, 128, 416, 488,...",0.333333
4,"[171, 3, 350, 88, 327, 409, 258, 218, 46, 166,...",0.37037
5,"[81, 223, 326, 15, 76, 416, 114, 305, 390, 493...",0.407407
6,"[171, 327, 3, 409, 166, 74, 27, 139, 392, 397,...",0.388889
7,"[327, 409, 3, 171, 57, 88, 218, 258, 166, 350,...",0.314815


### Wrapper model
#### Fast method (add immediately when cluster increases accuracy)

In [9]:
# Run the fast wrapper variant (add_im=True) with a fixed fold_seed.
fast = Wrapper(X, y, add_im=True, fold_seed=1234)
out_fast = fast.wrapper(n_jobs=-1)

# Tabulate every entry of the result except 'recurring'; the transpose turns
# each remaining entry into one column.
# NOTE(review): keys 2 and 3 both map to 'Accuracy'. With the three columns
# shown in this run key 3 is a no-op — confirm whether a fourth entry can occur,
# since it would produce a duplicate column label.
df = (
    pd.DataFrame([val for name, val in out_fast.items() if name != 'recurring'])
    .T
    .rename(columns={0: 'Model', 1: 'Features', 2: 'Accuracy', 3: 'Accuracy'})
)

# 'recurring' is optional: when present, repeat the same value on every row.
if 'recurring' in out_fast:
    df['Recurring features'] = [out_fast['recurring']] * len(df)
df

100%|████████████████████████████████████████████████████████████████████████████████████| 8/8 [00:28<00:00,  3.59s/it]


Unnamed: 0,Model,Features,Accuracy,Recurring features
0,"[[327, 409, 171, 57, 3, 258, 392, 139, 397, 90...","[327, 409, 171, 57, 3, 258, 392, 139, 397, 90,...",0.488889,[]
1,"[[171, 3, 327, 409, 88, 57, 46, 218, 74, 27, 3...","[171, 3, 327, 409, 88, 57, 46, 218, 74, 27, 39...",0.407407,[]
2,"[[327, 409, 3, 171, 166, 57, 139, 397, 88, 46,...","[327, 409, 3, 171, 166, 57, 139, 397, 88, 46, ...",0.351852,[]
3,"[[40], [394], [76], [416], [488], [223], [205]...","[40, 394, 76, 416, 488, 223, 205, 493, 305, 24...",0.596491,[]
4,"[[171, 3, 350, 88, 327, 409, 258, 218, 46, 166...","[171, 3, 350, 88, 327, 409, 258, 218, 46, 166,...",0.555556,[]
5,"[[81], [223], [326], [15], [32], [237], [488],...","[81, 223, 326, 15, 32, 237, 488, 107, 308, 305...",0.574074,[]
6,"[[171, 327, 3, 409, 166, 74, 27, 139, 392, 397...","[171, 327, 3, 409, 166, 74, 27, 139, 392, 397,...",0.537037,[]
7,"[[327, 409, 3, 171, 57, 88, 218, 258, 166, 350...","[327, 409, 3, 171, 57, 88, 218, 258, 166, 350,...",0.407407,[]


#### Slow method (add cluster with highest accuracy)

In [10]:
# Run the slow wrapper variant: add_im is not passed, so Wrapper falls back to
# its default (presumably add_im=False — confirm against protosc.wrapper).
slow = Wrapper(X, y, fold_seed=1234)
out_slow = slow.wrapper(n_jobs=-1)

# Tabulate every entry of the result except 'recurring'; the transpose turns
# each remaining entry into one column.
# NOTE(review): keys 2 and 3 both map to 'Accuracy'. With the three columns
# shown in this run key 3 is a no-op — confirm whether a fourth entry can occur,
# since it would produce a duplicate column label.
df = (
    pd.DataFrame([val for name, val in out_slow.items() if name != 'recurring'])
    .T
    .rename(columns={0: 'Model', 1: 'Features', 2: 'Accuracy', 3: 'Accuracy'})
)

# 'recurring' is optional: when present, repeat the same value on every row.
if 'recurring' in out_slow:
    df['Recurring features'] = [out_slow['recurring']] * len(df)
df

100%|████████████████████████████████████████████████████████████████████████████████████| 8/8 [00:27<00:00,  3.47s/it]


Unnamed: 0,Model,Features,Accuracy,Recurring features
0,"[[15], [114], [460], [412], [457]]","[15, 114, 460, 412, 457]",0.755556,[]
1,[[178]],[178],0.462963,[]
2,"[[91], [360], [163], [306], [463]]","[91, 360, 163, 306, 463]",0.574074,[]
3,"[[493], [107]]","[493, 107]",0.491228,[]
4,"[[488], [148], [365], [460]]","[488, 148, 365, 460]",0.611111,[]
5,"[[40], [178], [107]]","[40, 178, 107]",0.574074,[]
6,"[[76], [81], [178]]","[76, 81, 178]",0.574074,[]
7,"[[40], [394], [81], [18], [463], [449]]","[40, 394, 81, 18, 463, 449]",0.685185,[]


## Compare models
* Fraction of selected features that are correct (`%corr_feat`): n_correct_selected/(n_correct_selected+n_false_selected)
* Fraction of features found (`%feat_found`): n_correct_selected/n_total_features
* Fraction of bias found (`%bias_found`): selected_bias/total_bias

In [11]:
# Collect each method's output under the label that names its column in the
# comparison table.
models = {
    'filter': filter_res,
    'wrapper_fast': out_fast,
    'wrapper_slow': out_slow,
}

# Score every model against the simulated ground truth (mean=True).
results = compare_models(models, ground_truth, mean=True)
pd.DataFrame(results)

Unnamed: 0,filter,wrapper_fast,wrapper_slow
%corr_feat,0.327705,0.590348,0.208333
%feat_found,0.52,0.49,0.015
%bias_found,0.561944,0.534444,0.011667
mean_acc,0.326852,0.489839,0.591033
