In [1]:
from protosc.simulation import create_categorical_data, compare_models, create_simulation_data, create_correlated_data
from protosc.wrapper import Wrapper
from protosc.filter_model import filter_model, train_xvalidate, select_features
from protosc.feature_matrix import FeatureMatrix
from protosc.parallel import execute_parallel
from protosc.shell_all_models import execute
import pandas as pd
import numpy as np
import random

In [2]:
# Simulate a categorical dataset; the seed is pinned to 1234.
# Alternative data source (left disabled):
#     X, y, ground_truth = create_correlated_data()
X, y, ground_truth = create_categorical_data(
    min_dev=10,
    max_dev=20,
    n_categories=3,
    seed=1234,
)

# Report how many entries the ground truth lists under "selected_features".
print(f'ground truth #features: {len(ground_truth["selected_features"])}')

ground truth #features: 25


### Run all models

In [None]:
# Run protosc's `execute` helper on the simulated data and show its raw
# result as a table. `out` is reused by the comparison cell that follows.
# NOTE(review): the saved progress bar reports roughly 20 s per iteration
# over 8 iterations, so expect this cell to take a few minutes.
out = execute(X, y)
pd.DataFrame(out)

 38%|████████████████████████████████████████████████████████▎                                                                                             | 3/8 [01:19<01:53, 22.73s/it]

In [None]:
# Score the `execute` output against the ground truth returned by the
# simulation and display the comparison as a table.
# NOTE(review): `mean=True` presumably averages the scores per model --
# confirm against `compare_models`.
compare = compare_models(out, ground_truth, mean=True)
pd.DataFrame(compare)

### Filter model

In [None]:
# Filter method: `filter_res` is kept as-is because the final comparison
# cell consumes it; `df` is only a labelled view for display.
filter_res = filter_model(X, y, fold_seed=1234)

# Build the table and label its two positional columns in one chain.
df = pd.DataFrame(filter_res).rename(columns={0: 'Features', 1: 'Accuracy'})
df

### Wrapper model
#### Fast method (add a cluster immediately when it increases accuracy)

In [None]:
# Run the fast wrapper method (add immediately = True).
fast = Wrapper(X, y, n=10, add_im=True, fold_seed=1234)
out_fast = fast.wrapper(n_jobs=-1)

# Tabulate every entry of the result except 'recurring', which is attached
# separately below as a constant column. After the transpose, each remaining
# entry becomes one positional column (0, 1, 2, ...).
df = pd.DataFrame([value for key, value in out_fast.items() if key != 'recurring']).T
# NOTE(review): positions 2 and 3 are both labelled 'Accuracy'; if a fourth
# entry exists this produces duplicate column labels -- confirm the intended
# name for position 3.
df = df.rename(columns={0: 'Model', 1: 'Features', 2: 'Accuracy', 3: 'Accuracy'})

# Attach the recurring features only when the wrapper reported them. An
# explicit membership test is used instead of catching KeyError so that an
# unrelated KeyError raised during the assignment is not silently hidden.
if 'recurring' in out_fast:
    df['Recurring features'] = [out_fast['recurring']] * len(df)
df

#### Slow method (add the cluster with the highest accuracy)

In [None]:
# Run the slow wrapper method (add immediately = False).
# `add_im` is not passed here, so the Wrapper's default is used.
slow = Wrapper(X, y, n=10, fold_seed=1234)
out_slow = slow.wrapper(n_jobs=-1)

# Tabulate every entry of the result except 'recurring', which is attached
# separately below as a constant column. After the transpose, each remaining
# entry becomes one positional column (0, 1, 2, ...).
df = pd.DataFrame([value for key, value in out_slow.items() if key != 'recurring']).T
# NOTE(review): positions 2 and 3 are both labelled 'Accuracy'; if a fourth
# entry exists this produces duplicate column labels -- confirm the intended
# name for position 3.
df = df.rename(columns={0: 'Model', 1: 'Features', 2: 'Accuracy', 3: 'Accuracy'})

# Attach the recurring features only when the wrapper reported them. An
# explicit membership test is used instead of catching KeyError so that an
# unrelated KeyError raised during the assignment is not silently hidden.
if 'recurring' in out_slow:
    df['Recurring features'] = [out_slow['recurring']] * len(df)
df

## Compare models
* Percentage of features correct: n_correct_selected/(n_correct_selected+n_false_selected)
* Percentage of features found: n_correct_selected/n_total_features
* Percentage of bias found: selected_bias/total_bias

In [None]:
# Gather the three result objects under the labels used in the comparison
# table, then score them against the simulated ground truth.
models = {
    'filter': filter_res,
    'wrapper_fast': out_fast,
    'wrapper_slow': out_slow,
}
results = compare_models(models, ground_truth, mean=True)
pd.DataFrame(results)