In [139]:
from protosc import filter_model
from protosc.simulation import create_simulation_data, create_correlated_data, slightly_correlated_data
from protosc.filter_model import train_xvalidate, select_features
from protosc.final_selection import final_selection
from protosc.wrapper import Wrapper
from collections import defaultdict
import numpy as np
from sklearn.model_selection import train_test_split
import pandas as pd
from protosc.parallel import execute_parallel

In [140]:
# Build the data set: features (X), categories (y) and the accompanying
# ground truth returned by the generator.
# The seed is fixed first, presumably so the generated data is reproducible
# (assumes create_correlated_data draws from NumPy's global RNG - TODO confirm).
np.random.seed(1928374)
X, y, ground_truth = create_correlated_data()

# Quick sanity check: show the first five rows of X and the first five labels.
print(f'features: {X[:5]}', f'categories: {y[:5]}', sep='\n')

features: [[ 1.35946304 -0.05495788 -1.25547915 ... -0.29250157  0.71268609
  -1.15811878]
 [ 1.16041638  0.11066386  0.96355358 ... -0.74329004  0.1515906
  -0.07321478]
 [ 0.30436521  0.61188844 -0.84293664 ... -0.04545197  1.07970547
  -1.35302247]
 [-0.44400432 -0.55848696  0.62296112 ...  0.09136625 -0.86405888
  -1.28561021]
 [-1.91110816 -2.27302481 -1.0157041  ... -0.01774011  0.13587279
  -0.88981648]]
categories: [0 0 0 1 1]


In [141]:
# Group correlating features into clusters and pick n features from those
# clusters for the filter model.
selected_features, clusters = select_features(X, y, chisq_threshold=0.25)

# Report how many clusters / features came back; only the first ten clusters
# are shown to keep the output short.
print(
    f'Clusters (n={len(clusters)}): {clusters[:10]}',
    f'Selected features (n={len(selected_features)}): {selected_features}',
    sep='\n',
)

Clusters (n=200): [[485, 862, 788, 497, 473], [175, 81, 486, 344, 591], [635, 769, 682, 693, 863], [451, 251, 468, 176, 766], [597, 684, 12, 984, 950], [367, 983, 0, 215, 702], [191, 495, 60, 455, 261], [496, 955, 193, 903, 934], [479, 567, 107, 834, 511], [302, 124, 732, 899, 867]]
Selected features (n=20): [485, 862, 788, 497, 473, 175, 81, 486, 344, 591, 635, 769, 682, 693, 863, 451, 251, 468, 176, 766]


In [142]:
# Slow method: add_im is left at its default here (contrast with the
# add_im=True "fast" cell), and the search is run for n=2 rounds.
# NOTE(review): presumably the default is add_im=False - confirm in Wrapper.
slow = Wrapper(X, y, clusters, n=2)
out_slow = slow.wrapper()

# Print outcome in dataframe. Every entry of the result except 'recurring'
# is stacked and transposed, then the positional columns 0..3 are given
# readable names.
df = pd.DataFrame([value for key, value in out_slow.items() if key != 'recurring']).T
df = df.rename(columns={0: 'Model', 1: 'Features', 2: 'Clusters', 3: 'Accuracy'})
# 'recurring' is not always present in the result. Check for it explicitly
# instead of swallowing KeyError, so unrelated lookup errors stay visible.
if 'recurring' in out_slow:
    # The recurring entries index into `clusters`; the looked-up clusters are
    # repeated on every row of the summary.
    df['Recurring features'] = [np.array(clusters)[out_slow['recurring']]] * len(df)
df

-- Round 1 of 2 --

                    added cluster 0, new accuracy = 0.66

                    added cluster 10, new accuracy = 0.7

                    added cluster 22, new accuracy = 0.76

                    nothing added

                    nothing added

                    nothing added

                    nothing added

                    No features were added in 4 rounds.
                    Stop searching for new clusters.
-- Round 2 of 2 --

                    added cluster 2, new accuracy = 0.74

                    added cluster 9, new accuracy = 0.78

                    nothing added

                    nothing added

                    nothing added

                    nothing added

                    No features were added in 4 rounds.
                    Stop searching for new clusters.


Unnamed: 0,Model,Features,Clusters,Accuracy,Recurring features
0,"[[485, 862, 788, 497, 473], [424, 381, 563, 46...","[485, 862, 788, 497, 473, 424, 381, 563, 465, ...","[0, 10, 22]",0.76,[]
1,"[[635, 769, 682, 693, 863], [302, 124, 732, 89...","[635, 769, 682, 693, 863, 302, 124, 732, 899, ...","[2, 9]",0.78,[]


In [143]:
# Fast method (add immediately = True)
fast = Wrapper(X, y, clusters, add_im=True)
out_fast = fast.wrapper()

# Print outcome in dataframe. Every entry of the result except 'recurring'
# is stacked and transposed, then the positional columns 0..3 are given
# readable names.
df = pd.DataFrame([value for key, value in out_fast.items() if key != 'recurring']).T
df = df.rename(columns={0: 'Model', 1: 'Features', 2: 'Clusters', 3: 'Accuracy'})
# 'recurring' is not always present in the result. Check for it explicitly
# instead of swallowing KeyError, so unrelated lookup errors stay visible.
if 'recurring' in out_fast:
    # The recurring entries index into `clusters`; the looked-up clusters are
    # repeated on every row of the summary.
    df['Recurring features'] = [np.array(clusters)[out_fast['recurring']]] * len(df)
df


                            added cluster 0, new accuracy = 0.58

                            added cluster 1, new accuracy = 0.64

                            added cluster 2, new accuracy = 0.76

                            added cluster 7, new accuracy = 0.78

                            added cluster 15, new accuracy = 0.8

                            added cluster 9, new accuracy = 0.84

                    nothing added

                    nothing added

                    nothing added

                    nothing added

                    No features were added in 4 rounds.
                    Stop searching for new clusters.


Unnamed: 0,Model,Features,Clusters,Accuracy
0,"[[485, 862, 788, 497, 473], [175, 81, 486, 344...","[485, 862, 788, 497, 473, 175, 81, 486, 344, 5...","[0, 1, 2, 7, 15, 9]",0.84


In [145]:
# Fast method (add immediately = True) with exclusion (excl=True), run for
# n=2 rounds.
fast_excl = Wrapper(X, y, clusters, excl=True, add_im=True, n=2)
out_fast_excl = fast_excl.wrapper()

# Print outcome in dataframe. Every entry of the result except 'recurring'
# is stacked and transposed, then the positional columns 0..3 are given
# readable names.
df = pd.DataFrame([value for key, value in out_fast_excl.items() if key != 'recurring']).T
df = df.rename(columns={0: 'Model', 1: 'Features', 2: 'Clusters', 3: 'Accuracy'})
# 'recurring' is not always present in the result. Check for it explicitly
# instead of swallowing KeyError, so unrelated lookup errors stay visible.
if 'recurring' in out_fast_excl:
    # The recurring entries index into `clusters`; the looked-up clusters are
    # repeated on every row of the summary.
    df['Recurring features'] = [np.array(clusters)[out_fast_excl['recurring']]] * len(df)
df

-- Round 1 of 2 --

                            added cluster 0, new accuracy = 0.56

                            added cluster 1, new accuracy = 0.66

                            added cluster 2, new accuracy = 0.76

                            added cluster 3, new accuracy = 0.8

                            added cluster 17, new accuracy = 0.82

                            added cluster 14, new accuracy = 0.84

                            added cluster 25, new accuracy = 0.86

                    nothing added

                    nothing added

                    nothing added

                    nothing added

                    No features were added in 4 rounds.
                    Stop searching for new clusters.

        Trying to increase accuracy by removing/replacing clusters...
        

                    Removal/replacement of clusters did not increase accuracy.
                    
-- Round 2 of 2 --

                            added cluster 0, new accuracy = 0.6

 