In [1]:
from protosc.simulation import create_categorical_data, compare_models, create_simulation_data, create_correlated_data
from protosc.model.wrapper import Wrapper
from protosc.model.filter import filter_model, train_xvalidate, select_features
from protosc.feature_matrix import FeatureMatrix
from protosc.parallel import execute_parallel
from protosc.model.shell_all_models import execute_all_models
import pandas as pd
import numpy as np
import random

In [2]:
# Build the categorical simulation set; the seed is fixed (presumably so reruns
# draw the same data — confirm against create_categorical_data).
X, y, ground_truth = create_categorical_data(
    min_dev=10,
    max_dev=20,
    n_categories=3,
    seed=1234,
)
# Disabled alternative data source:
# X, y, ground_truth = create_correlated_data()

# Report how many features the ground truth lists under "selected_features".
print(f'ground truth #features: {len(ground_truth["selected_features"])}')

ground truth #features: 25


### Run all models

In [3]:
# Run every model variant on the simulated data in a single call.
out = execute_all_models(X, y)

# Render the collected results as a table for side-by-side inspection.
pd.DataFrame(data=out)

100%|██████████| 8/8 [00:18<00:00,  2.25s/it]


Unnamed: 0,filter,fast_wrapper,slow_wrapper,random,pseudo_random
features,"[[176, 288, 91, 42, 379, 80, 167, 487, 403, 16...","[[176, 91, 167, 426, 99, 31], [451, 274, 74, 1...","[[91, 171], [196], [74, 423, 274, 226, 451, 33...","[[403, 309, 364, 499, 138, 474, 58, 71, 46, 31...","[[309, 364, 499, 138, 474, 58, 46, 315, 370, 3..."
accuracy,"[0.29411764705882354, 0.2807017543859649, 0.37...","[0.47058823529411764, 0.5087719298245614, 0.53...","[0.5490196078431373, 0.45614035087719296, 0.55...","[0.27450980392156865, 0.22807017543859648, 0.4...","[0.2549019607843137, 0.22807017543859648, 0.29..."
recurring,"[91, 288, 423, 451]",[],[],[],[]


## Compare models
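* Percentage of features correct (`%corr_feat`): n_correct_selected/(n_correct_selected+n_false_selected)
* Percentage of features found (`%feat_found`): n_correct_selected/n_total_features
* Percentage of bias found (`%bias_found`): selected_bias/total_bias

All three are reported as fractions between 0 and 1 (not multiplied by 100).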

In [4]:
# Score each model's output against the simulation ground truth
# (mean=True is passed straight through to compare_models).
compare = compare_models(
    out,
    ground_truth,
    mean=True,
)

# Show the comparison metrics as a table.
pd.DataFrame(data=compare)

Unnamed: 0,filter,fast_wrapper,slow_wrapper,random,pseudo_random
%corr_feat,0.318705,0.543885,0.118056,0.119847,0.031563
%feat_found,0.515,0.465,0.085,0.215,0.05
%bias_found,0.569306,0.520694,0.094444,0.224167,0.04375
mean_acc,0.319107,0.540785,0.560448,0.322802,0.30742


### Filter model

In [5]:
# Apply the filter method with a fixed fold seed.
filter_res = filter_model(X, y, fold_seed=1234)

# Tabulate the result and label its two positional columns in one step.
df = pd.DataFrame(data=filter_res).rename(columns={0: 'Features', 1: 'Accuracy'})
df

Unnamed: 0,Features,Accuracy
0,"[176, 288, 91, 42, 379, 80, 167, 487, 403, 16,...",0.294118
1,"[451, 274, 74, 122, 185, 231, 281, 423, 83, 22...",0.280702
2,"[74, 423, 274, 226, 451, 334, 185, 281, 83, 12...",0.37037
3,"[42, 91, 288, 218, 229, 407, 274, 151, 304, 27...",0.395833
4,"[423, 274, 451, 74, 185, 226, 281, 83, 122, 33...",0.333333
5,"[185, 274, 451, 423, 334, 240, 281, 74, 226, 3...",0.27451
6,"[74, 451, 185, 274, 423, 226, 334, 231, 122, 2...",0.307692
7,"[423, 274, 226, 194, 451, 185, 83, 74, 334, 28...",0.296296


### Wrapper model
#### Fast method (add immediately when cluster increases accuracy)

In [None]:
# Fast wrapper run: add_im=True selects the add-immediately variant.
fast = Wrapper(n=10, add_im=True)
out_fast = fast.execute(X, y, fold_seed=1234)

# Tabulate every entry of the result except 'recurring': each entry becomes
# one column after the transpose, then the positional columns are labelled.
df = (
    pd.DataFrame([entry for label, entry in out_fast.items() if label != 'recurring'])
    .T
    .rename(columns={0: 'Model', 1: 'Features', 2: 'Accuracy', 3: 'Accuracy'})
)

# 'recurring' may be absent from the result; when present it is repeated on
# every row, otherwise the column is simply left out.
try:
    df['Recurring features'] = [out_fast['recurring']] * len(df)
except KeyError:
    pass
df

In [None]:
# Display the raw result object returned by the fast wrapper's execute() call.
out_fast

#### Slow method (add cluster with highest accuracy)

In [None]:
# Run the slow wrapper method (add immediately = False: the cluster with the
# highest accuracy is added instead of the first one that improves accuracy).
# This follows the same Wrapper(...) / execute(X, y, fold_seed=...) call
# pattern as the fast wrapper cell, so both cells target one API.
# NOTE(review): the previous version of this cell passed n_jobs=-1 to a
# `wrapper()` method; if execute() accepts n_jobs, pass it here to restore
# parallel execution — TODO confirm against the Wrapper class.
slow = Wrapper(n=10, add_im=False)
out_slow = slow.execute(X, y, fold_seed=1234)

# Print outcome in dataframe: every result entry except 'recurring' becomes a column.
df = pd.DataFrame([value for key, value in out_slow.items() if key != 'recurring']).T
df = df.rename(columns={0: 'Model', 1: 'Features', 2: 'Accuracy', 3: 'Accuracy'})

# Only add the column when the result actually contains 'recurring'; an explicit
# membership test avoids masking unrelated KeyErrors behind a silent `pass`.
if 'recurring' in out_slow:
    df['Recurring features'] = [out_slow['recurring']] * len(df)
df