In [1]:
from protosc import filter_model
from protosc.simulation import create_simulation_data, create_correlated_data, slightly_correlated_data
from protosc.filter_model import train_xvalidate, select_features
from protosc.final_selection import final_selection
from protosc.wrapper import Wrapper
from collections import defaultdict
import numpy as np
from sklearn.model_selection import train_test_split
import pandas as pd
from protosc.parallel import execute_parallel

In [2]:
# Generate the correlated example data set: features (X) and categories (y).
# The third return value is kept as `ground_truth`.
np.random.seed(1928374)  # fixed seed for NumPy's global RNG (presumably so the generated data is repeatable)
X, y, ground_truth = create_correlated_data()

# Preview the first five rows of the features and the first five categories.
print(
    f'features: {X[:5]}',
    f'categories: {y[:5]}',
    sep='\n',
)

features: [[ 1.35946304 -0.05495788 -1.25547915 ... -0.29250157  0.71268609
  -1.15811878]
 [ 1.16041638  0.11066386  0.96355358 ... -0.74329004  0.1515906
  -0.07321478]
 [ 0.30436521  0.61188844 -0.84293664 ... -0.04545197  1.07970547
  -1.35302247]
 [-0.44400432 -0.55848696  0.62296112 ...  0.09136625 -0.86405888
  -1.28561021]
 [-1.91110816 -2.27302481 -1.0157041  ... -0.01774011  0.13587279
  -0.88981648]]
categories: [0 0 0 1 1]


In [3]:
# Filter-model step: group correlating features into clusters and pick the
# features selected from those clusters (chisq_threshold=0.25).
selected_features, clusters = select_features(X, y, chisq_threshold=0.25)

# Report how many clusters / selected features there are; only the first ten
# clusters are printed to keep the output short.
print(
    f'Clusters (n={len(clusters)}): {clusters[:10]}',
    f'Selected features (n={len(selected_features)}): {selected_features}',
    sep='\n',
)

Clusters (n=200): [[485, 862, 788, 497, 473], [175, 81, 486, 344, 591], [635, 769, 682, 693, 863], [451, 251, 468, 176, 766], [597, 684, 12, 984, 950], [367, 983, 0, 215, 702], [191, 495, 60, 455, 261], [496, 955, 193, 903, 934], [479, 567, 107, 834, 511], [302, 124, 732, 899, 867]]
Selected features (n=20): [485, 862, 788, 497, 473, 175, 81, 486, 344, 591, 635, 769, 682, 693, 863, 451, 251, 468, 176, 766]


In [4]:
# Wrapper, slow method (add immediately = False): `add_im` is not passed here.
slow = Wrapper(X, y, clusters, fold_seed=123)
out_slow = slow.wrapper(n_rounds=4, n_jobs=-1)

# Show the outcome as a table: every entry of the result except 'recurring'
# becomes one positional column after the transpose, and positions 0-3 are
# then given readable names.
df = (
    pd.DataFrame([entry for name, entry in out_slow.items() if name != 'recurring'])
    .transpose()
    .rename(columns=dict(enumerate(['Model', 'Features', 'Clusters', 'Accuracy'])))
)

# Repeat the 'recurring' entry on every row; if the lookup raises KeyError the
# column is simply left out.
try:
    df['Recurring clusters'] = [out_slow['recurring']] * df.shape[0]
except KeyError:
    pass
df

100%|████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:52<00:00, 13.22s/it]


Unnamed: 0,Model,Features,Clusters,Accuracy,Recurring clusters
0,"[[635, 769, 682, 693, 863], [663, 106, 27, 371...","[635, 769, 682, 693, 863, 663, 106, 27, 371, 5...","[2, 12, 3]",0.8,[]
1,"[[485, 862, 788, 497, 473], [424, 381, 563, 46...","[485, 862, 788, 497, 473, 424, 381, 563, 465, ...","[0, 10, 22]",0.72,[]
2,"[[302, 124, 732, 899, 867], [773, 452, 612, 27...","[302, 124, 732, 899, 867, 773, 452, 612, 279, ...","[9, 22, 23]",0.76,[]
3,"[[635, 769, 682, 693, 863], [175, 81, 486, 344...","[635, 769, 682, 693, 863, 175, 81, 486, 344, 5...","[2, 1, 10, 8]",0.78,[]


In [5]:
# Wrapper, fast method (add immediately = True): same call as the slow run but
# with add_im=True.
fast = Wrapper(X, y, clusters, add_im=True, fold_seed=123)
out_fast = fast.wrapper(n_rounds=4, n_jobs=-1)

# Show the outcome as a table: every entry of the result except 'recurring'
# becomes one positional column after the transpose, and positions 0-3 are
# then given readable names.
df = (
    pd.DataFrame([entry for name, entry in out_fast.items() if name != 'recurring'])
    .transpose()
    .rename(columns=dict(enumerate(['Model', 'Features', 'Clusters', 'Accuracy'])))
)

# Repeat the 'recurring' entry on every row; if the lookup raises KeyError the
# column is simply left out.
try:
    df['Recurring clusters'] = [out_fast['recurring']] * df.shape[0]
except KeyError:
    pass
df

100%|████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:50<00:00, 12.74s/it]


Unnamed: 0,Model,Features,Clusters,Accuracy,Recurring clusters
0,"[[485, 862, 788, 497, 473], [635, 769, 682, 69...","[485, 862, 788, 497, 473, 635, 769, 682, 693, ...","[0, 2, 3, 7, 9, 13, 30]",0.86,"[0, 2]"
1,"[[485, 862, 788, 497, 473], [175, 81, 486, 344...","[485, 862, 788, 497, 473, 175, 81, 486, 344, 5...","[0, 1, 2, 7, 15]",0.8,"[0, 2]"
2,"[[485, 862, 788, 497, 473], [175, 81, 486, 344...","[485, 862, 788, 497, 473, 175, 81, 486, 344, 5...","[0, 1, 2, 7, 16]",0.8,"[0, 2]"
3,"[[485, 862, 788, 497, 473], [175, 81, 486, 344...","[485, 862, 788, 497, 473, 175, 81, 486, 344, 5...","[0, 1, 2, 12]",0.76,"[0, 2]"


In [6]:
# Wrapper, fast method (add immediately = True) with exclusion switched on
# (excl=True).
fast_excl = Wrapper(X, y, clusters, add_im=True, excl=True, fold_seed=123)
out_fast_excl = fast_excl.wrapper(n_rounds=4, n_jobs=-1)

# Show the outcome as a table: every entry of the result except 'recurring'
# becomes one positional column after the transpose, and positions 0-3 are
# then given readable names.
df = (
    pd.DataFrame([entry for name, entry in out_fast_excl.items() if name != 'recurring'])
    .transpose()
    .rename(columns=dict(enumerate(['Model', 'Features', 'Clusters', 'Accuracy'])))
)

# Repeat the 'recurring' entry on every row; if the lookup raises KeyError the
# column is simply left out.
try:
    df['Recurring clusters'] = [out_fast_excl['recurring']] * df.shape[0]
except KeyError:
    pass
df

100%|████████████████████████████████████████████████████████████████████████████████████| 4/4 [02:35<00:00, 38.82s/it]


Unnamed: 0,Model,Features,Clusters,Accuracy,Recurring clusters
0,"[[485, 862, 788, 497, 473], [635, 769, 682, 69...","[485, 862, 788, 497, 473, 635, 769, 682, 693, ...","[0, 2, 8, 7, 9, 13, 30]",0.82,"[0, 2]"
1,"[[485, 862, 788, 497, 473], [175, 81, 486, 344...","[485, 862, 788, 497, 473, 175, 81, 486, 344, 5...","[0, 1, 2, 7, 15]",0.8,"[0, 2]"
2,"[[485, 862, 788, 497, 473], [175, 81, 486, 344...","[485, 862, 788, 497, 473, 175, 81, 486, 344, 5...","[0, 1, 2, 7, 16]",0.8,"[0, 2]"
3,"[[485, 862, 788, 497, 473], [175, 81, 486, 344...","[485, 862, 788, 497, 473, 175, 81, 486, 344, 5...","[0, 1, 2, 12]",0.76,"[0, 2]"
