In [1]:
from protosc.simulation import create_categorical_data, compare_models, create_simulation_data, create_correlated_data
from protosc.wrapper import Wrapper
from protosc.filter_model import filter_model, train_xvalidate, select_features
from protosc.feature_matrix import FeatureMatrix
from protosc.parallel import execute_parallel
from protosc.random_models import execute
import pandas as pd
import numpy as np
import random

In [2]:
# Build the simulated categorical dataset (three categories) that the cells
# below operate on, then report how many entries the ground truth lists under
# "selected_features".
X, y, ground_truth = create_categorical_data(
    n_categories=3,
    min_dev=10,
    max_dev=20,
)
print(f'ground truth #features: {len(ground_truth["selected_features"])}')
# Alternative dataset: X, y, ground_truth = create_correlated_data()

ground truth #features: 25


### Run all models

In [3]:
# Run all models in one call via protosc.random_models.execute.
# In the recorded output the result is keyed by model name (filter,
# fast_wrapper, slow_wrapper, random, pseudo); each entry holds a 'features'
# list of feature lists and an 'accuracy' list of floats.
out = execute(X, y)
# Show the nested result as a table with one column per model.
pd.DataFrame(out)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 8/8 [01:11<00:00,  8.89s/it]


Unnamed: 0,filter,fast_wrapper,slow_wrapper,random,pseudo
features,"[[377, 163, 192, 127, 45, 496, 17, 315, 454, 2...","[[377, 163, 192, 127, 45, 496, 17, 315, 454, 1...","[[213, 179, 169, 187], [446], [250, 247, 172, ...","[[369, 413, 271, 15, 108, 69, 276, 175, 5, 334...","[[369, 413, 271, 15, 108, 69, 276, 175, 5, 334..."
accuracy,"[0.3958333333333333, 0.3508771929824561, 0.294...","[0.6458333333333334, 0.5263157894736842, 0.607...","[0.6458333333333334, 0.49122807017543857, 0.58...","[0.3541666666666667, 0.17543859649122806, 0.35...","[0.20833333333333334, 0.38596491228070173, 0.3..."


In [18]:
# Extend each model's result in place with two summary entries:
#   'un_feat' - sorted unique feature indices pooled over all feature lists
#   'av_acc'  - mean of the model's accuracy values
for model_res in out.values():
    pooled_features = np.concatenate(model_res['features'])
    model_res['un_feat'] = np.unique(pooled_features)
    model_res['av_acc'] = np.mean(model_res['accuracy'])
pd.DataFrame(out)

Unnamed: 0,filter,fast_wrapper,slow_wrapper,random,pseudo
features,"[[377, 163, 192, 127, 45, 496, 17, 315, 454, 2...","[[377, 163, 192, 127, 45, 496, 17, 315, 454, 1...","[[213, 179, 169, 187], [446], [250, 247, 172, ...","[[369, 413, 271, 15, 108, 69, 276, 175, 5, 334...","[[369, 413, 271, 15, 108, 69, 276, 175, 5, 334..."
accuracy,"[0.3958333333333333, 0.3508771929824561, 0.294...","[0.6458333333333334, 0.5263157894736842, 0.607...","[0.6458333333333334, 0.49122807017543857, 0.58...","[0.3541666666666667, 0.17543859649122806, 0.35...","[0.20833333333333334, 0.38596491228070173, 0.3..."
av_acc,0.315748,0.565585,0.601385,0.332677,0.360558
un_feat,"[12, 17, 19, 34, 38, 39, 45, 51, 53, 57, 60, 6...","[4, 12, 17, 34, 45, 53, 68, 84, 91, 94, 96, 99...","[12, 17, 39, 45, 53, 84, 91, 99, 127, 133, 163...","[0, 2, 3, 5, 8, 10, 12, 15, 16, 20, 24, 25, 26...","[0, 2, 3, 5, 8, 10, 15, 16, 20, 24, 25, 26, 27..."


### Filter model

In [None]:
# Fit the filter model with a fixed fold seed, then tabulate the result with
# its two positional columns given readable labels.
filter_res = filter_model(X, y, fold_seed=1234)
column_labels = {0: 'Features', 1: 'Accuracy'}
df = pd.DataFrame(filter_res).rename(columns=column_labels)
df

### Wrapper model
#### Fast method (add a cluster immediately when it increases accuracy)

In [6]:
# Fast wrapper: add_im=True, i.e. "add immediately".
fast = Wrapper(X, y, n=10, add_im=True, fold_seed=1234)
out_fast = fast.wrapper(n_jobs=-1)

# Tabulate every result entry except 'recurring': each entry becomes a column.
# NOTE(review): positions 2 and 3 both map to 'Accuracy'; the recorded output
# shows only three such columns, so the entry for 3 looks unused - confirm.
tabulated_values = [value for key, value in out_fast.items() if key != 'recurring']
column_labels = {0: 'Model', 1: 'Features', 2: 'Accuracy', 3: 'Accuracy'}
df = pd.DataFrame(tabulated_values).transpose().rename(columns=column_labels)

# 'recurring' is treated as optional: when present, repeat it on every row.
try:
    recurring = out_fast['recurring']
except KeyError:
    pass
else:
    df['Recurring features'] = [recurring] * len(df)
df

100%|████████████████████████████████████████████████████████████████████████████████████| 8/8 [00:14<00:00,  1.87s/it]


Unnamed: 0,Model,Features,Accuracy,Recurring features
0,"[[126, 85, 301, 190, 69, 254, 302, 37, 157, 40...","[126, 85, 301, 190, 69, 254, 302, 37, 157, 402...",0.622222,[]
1,"[[466], [350], [256], [13], [72], [173], [355]...","[466, 350, 256, 13, 72, 173, 355, 274, 49]",0.54386,[]
2,"[[190, 37, 254, 402, 157, 302, 126, 85, 69, 26...","[190, 37, 254, 402, 157, 302, 126, 85, 69, 265...",0.416667,[]
3,"[[254, 190, 301, 302, 85, 157, 37, 265, 126, 1...","[254, 190, 301, 302, 85, 157, 37, 265, 126, 11...",0.55,[]
4,"[[126, 112, 254, 85, 69, 37, 302, 157, 265, 19...","[126, 112, 254, 85, 69, 37, 302, 157, 265, 190...",0.518519,[]
5,"[[350], [215], [237]]","[350, 215, 237]",0.5625,[]
6,"[[301, 302, 157, 126, 254, 85, 59], [252], [44...","[301, 302, 157, 126, 254, 85, 59, 252, 449, 117]",0.537037,[]
7,"[[190, 126, 37, 254, 85, 69, 324, 157, 131, 30...","[190, 126, 37, 254, 85, 69, 324, 157, 131, 302...",0.574074,[]


#### Slow method (add the cluster with the highest accuracy)

In [8]:
# Slow wrapper: add_im is not passed here (the fast cell passes add_im=True),
# i.e. "add immediately = False".
slow = Wrapper(X, y, n=10, fold_seed=1234)
out_slow = slow.wrapper(n_jobs=-1)

# Tabulate every result entry except 'recurring': each entry becomes a column.
# NOTE(review): positions 2 and 3 both map to 'Accuracy'; the recorded output
# shows only three such columns, so the entry for 3 looks unused - confirm.
tabulated_values = [value for key, value in out_slow.items() if key != 'recurring']
column_labels = {0: 'Model', 1: 'Features', 2: 'Accuracy', 3: 'Accuracy'}
df = pd.DataFrame(tabulated_values).transpose().rename(columns=column_labels)

# 'recurring' is treated as optional: when present, repeat it on every row.
try:
    recurring = out_slow['recurring']
except KeyError:
    pass
else:
    df['Recurring features'] = [recurring] * len(df)
df

100%|████████████████████████████████████████████████████████████████████████████████████| 8/8 [00:19<00:00,  2.47s/it]


Unnamed: 0,Model,Features,Accuracy,Recurring features
0,"[[220], [163], [49]]","[220, 163, 49]",0.622222,[]
1,"[[350], [483], [13], [220], [449]]","[350, 483, 13, 220, 449]",0.578947,[]
2,"[[192], [421], [276], [441]]","[192, 421, 276, 441]",0.666667,[]
3,"[[117], [192], [350], [167], [252]]","[117, 192, 350, 167, 252]",0.633333,[]
4,"[[13], [167], [405], [355]]","[13, 167, 405, 355]",0.685185,[]
5,"[[192], [76]]","[192, 76]",0.541667,[]
6,"[[117], [350]]","[117, 350]",0.555556,[]
7,[[237]],[237],0.462963,[]


## Compare models
* Percentage of features correct: n_correct_selected/(n_correct_selected+n_false_selected)
* Percentage of features found: n_correct_selected/n_total_features
* Percentage of bias found: selected_bias/total_bias

In [None]:
# Collect the filter and both wrapper results under the labels used in the
# comparison table, score them against the simulation's ground truth, and
# display the outcome.
models = {
    'filter': filter_res,
    'wrapper_fast': out_fast,
    'wrapper_slow': out_slow,
}
results = compare_models(models, ground_truth, mean=True)
pd.DataFrame(results)