In [1]:
from protosc import filter_model
from protosc.simulation import create_simulation_data, create_correlated_data, compare_results
from protosc.filter_model import train_xvalidate, select_features
from protosc.final_selection import final_selection
from protosc.wrapper import wrapper, calc_accuracy
from collections import defaultdict
import numpy as np
from sklearn.model_selection import train_test_split
import pandas as pd

In [2]:
# Build the example data set: features (X), categories (y) and the ground truth.
# NumPy's global RNG is seeded first (presumably so that create_correlated_data
# gives the same data on every run -- TODO confirm it draws from np.random).
np.random.seed(1928374)
X, y, ground_truth = create_correlated_data()

# Preview only the first five rows of X and the first five entries of y.
print(
    f'features: {X[:5]}',
    f'categories: {y[:5]}',
    sep='\n',
)

features: [[ 1.35946304 -0.05495788 -1.25547915 ... -0.29250157  0.71268609
  -1.15811878]
 [ 1.16041638  0.11066386  0.96355358 ... -0.74329004  0.1515906
  -0.07321478]
 [ 0.30436521  0.61188844 -0.84293664 ... -0.04545197  1.07970547
  -1.35302247]
 [-0.44400432 -0.55848696  0.62296112 ...  0.09136625 -0.86405888
  -1.28561021]
 [-1.91110816 -2.27302481 -1.0157041  ... -0.01774011  0.13587279
  -0.88981648]]
categories: [0 0 0 1 1]


In [3]:
# Filter step: select_features hands back the features chosen for the filter
# model together with the clusters of correlating features they were taken from.
selected_features, clusters = select_features(X, y, chisq_threshold=0.25)

# Report the sizes; only the first ten clusters are shown to keep the output short.
report = [
    f'Clusters (n={len(clusters)}): {clusters[:10]}',
    f'Selected features (n={len(selected_features)}): {selected_features}',
]
print('\n'.join(report))

Clusters (n=200): [[485, 862, 788, 497, 473], [175, 81, 486, 344, 591], [635, 769, 682, 693, 863], [451, 251, 468, 176, 766], [597, 684, 12, 984, 950], [367, 983, 0, 215, 702], [191, 495, 60, 455, 261], [496, 955, 193, 903, 934], [479, 567, 107, 834, 511], [302, 124, 732, 899, 867]]
Selected features (n=20): [485, 862, 788, 497, 473, 175, 81, 486, 344, 591, 635, 769, 682, 693, 863, 451, 251, 468, 176, 766]


In [4]:
# Check average accuracy with selected_features (the features returned by
# select_features above). The call is the last expression of the cell, so its
# return value is shown as the cell output.
calc_accuracy(X, y, selected_features)

0.78

Options for wrapper():
* X: np.array, features
* y: np.array, categories (1/0)
* clusters: np.array, clusters of correlating features
* decrease: boolean, if True clusters are ranked from high to low chi-square scores, if False from low to high
* add_im: boolean, if True clusters are immediately added to model if they increase the accuracy, if False it only adds the cluster with the highest accuracy increase
* excl: boolean, if True clusters are removed from the final model one by one to see if accuracy increases, if False this step is skipped.
* search_space: float, fraction of the clusters (e.g. 0.15 = 15%) from which clusters are selected
* stop: int, max number of rounds where no clusters can be added, after which looping will stop
* n_fold: int, number of folds (used for calculating accuracy)
* n: int, number of times the whole search is repeated (each repetition is reported as one "Round" in the output)

### 'Slow' method
Select clusters that increase accuracy, but per loop only add the cluster with *highest* impact.
* decrease: True
* add_im: False
* search_space: 0.15
* stop: 4
* n_fold: 8
* n: 3

In [5]:
# 'Slow' search without the exclusion step (add_im=False, excl=False), n=3 runs.
output_slow = wrapper(
    X, y, clusters,
    decrease=True, add_im=False, excl=False,
    search_space=0.15, stop=4, n_fold=8, n=3,
)

# Summarise the runs in a dataframe: every entry of the output except
# 'recurring' becomes one column after the transpose (one row per run), and the
# positional column labels 0-3 are replaced by readable names.
df = (
    pd.DataFrame([entry for name, entry in output_slow.items() if name != 'recurring'])
    .T
    .rename(columns=dict(enumerate(['Model', 'Features', 'Clusters', 'Accuracy'])))
)
# 'recurring' is used to index into clusters; the same selection is repeated on every row.
df['Recurring features'] = [np.array(clusters)[output_slow['recurring']]] * df.shape[0]
df

-- Round 1 of 3 --
added cluster 0, new accuracy = 0.64
added cluster 3, new accuracy = 0.7
added cluster 17, new accuracy = 0.74
nothing added
added cluster 1, new accuracy = 0.76
added cluster 2, new accuracy = 0.8
added cluster 10, new accuracy = 0.82
nothing added
added cluster 7, new accuracy = 0.86
nothing added
nothing added
nothing added
nothing added
No features were added in 4 rounds. Stop searching for new clusters.
-- Round 2 of 3 --
added cluster 2, new accuracy = 0.72
added cluster 1, new accuracy = 0.8
added cluster 29, new accuracy = 0.82
nothing added
nothing added
nothing added
nothing added
No features were added in 4 rounds. Stop searching for new clusters.
-- Round 3 of 3 --
added cluster 2, new accuracy = 0.72
added cluster 12, new accuracy = 0.76
nothing added
added cluster 10, new accuracy = 0.78
nothing added
nothing added
added cluster 26, new accuracy = 0.8
nothing added
nothing added
nothing added
nothing added
No features were added in 4 rounds. Stop search

Unnamed: 0,Model,Features,Clusters,Accuracy,Recurring features
0,"[[485, 862, 788, 497, 473], [451, 251, 468, 17...","[485, 862, 788, 497, 473, 451, 251, 468, 176, ...","[0, 3, 17, 1, 2, 10, 7]",0.86,"[[635, 769, 682, 693, 863]]"
1,"[[635, 769, 682, 693, 863], [175, 81, 486, 344...","[635, 769, 682, 693, 863, 175, 81, 486, 344, 5...","[2, 1, 29]",0.82,"[[635, 769, 682, 693, 863]]"
2,"[[635, 769, 682, 693, 863], [663, 106, 27, 371...","[635, 769, 682, 693, 863, 663, 106, 27, 371, 5...","[2, 12, 10, 26]",0.8,"[[635, 769, 682, 693, 863]]"


### 'Fast' method
Select clusters that increase accuracy; when a cluster increases accuracy, immediately add it to the model and continue with this new model.
* decrease: True
* add_im: True
* search_space: 0.15
* stop: 4
* n_fold: 8
* n: 3

In [6]:
# 'Fast' search without the exclusion step (add_im=True, excl=False), n=3 runs.
output_fast = wrapper(
    X, y, clusters,
    decrease=True, add_im=True, excl=False,
    search_space=0.15, stop=4, n_fold=8, n=3,
)

# Summarise the runs in a dataframe: every entry of the output except
# 'recurring' becomes one column after the transpose (one row per run), and the
# positional column labels 0-3 are replaced by readable names.
df = (
    pd.DataFrame([entry for name, entry in output_fast.items() if name != 'recurring'])
    .T
    .rename(columns=dict(enumerate(['Model', 'Features', 'Clusters', 'Accuracy'])))
)
# 'recurring' is used to index into clusters; the same selection is repeated on every row.
df['Recurring features'] = [np.array(clusters)[output_fast['recurring']]] * df.shape[0]
df

-- Round 1 of 3 --
added cluster 0, new accuracy = 0.62
added cluster 1, new accuracy = 0.66
added cluster 2, new accuracy = 0.74
added cluster 7, new accuracy = 0.76
added cluster 8, new accuracy = 0.78
added cluster 15, new accuracy = 0.82
added cluster 21, new accuracy = 0.84
nothing added
nothing added
nothing added
nothing added
No features were added in 4 rounds. Stop searching for new clusters.
-- Round 2 of 3 --
added cluster 0, new accuracy = 0.58
added cluster 1, new accuracy = 0.64
added cluster 2, new accuracy = 0.82
nothing added
nothing added
nothing added
nothing added
No features were added in 4 rounds. Stop searching for new clusters.
-- Round 3 of 3 --
added cluster 0, new accuracy = 0.64
added cluster 2, new accuracy = 0.66
added cluster 3, new accuracy = 0.72
added cluster 4, new accuracy = 0.76
nothing added
nothing added
nothing added
added cluster 7, new accuracy = 0.78
added cluster 10, new accuracy = 0.82
added cluster 23, new accuracy = 0.84
nothing added
noth

Unnamed: 0,Model,Features,Clusters,Accuracy,Recurring features
0,"[[485, 862, 788, 497, 473], [175, 81, 486, 344...","[485, 862, 788, 497, 473, 175, 81, 486, 344, 5...","[0, 1, 2, 7, 8, 15, 21]",0.84,"[[485, 862, 788, 497, 473], [635, 769, 682, 69..."
1,"[[485, 862, 788, 497, 473], [175, 81, 486, 344...","[485, 862, 788, 497, 473, 175, 81, 486, 344, 5...","[0, 1, 2]",0.82,"[[485, 862, 788, 497, 473], [635, 769, 682, 69..."
2,"[[485, 862, 788, 497, 473], [635, 769, 682, 69...","[485, 862, 788, 497, 473, 635, 769, 682, 693, ...","[0, 2, 3, 4, 7, 10, 23]",0.84,"[[485, 862, 788, 497, 473], [635, 769, 682, 69..."


In [7]:
# 'Fast' search with the exclusion step enabled (add_im=True, excl=True), n=3 runs.
output_fast_excl = wrapper(
    X, y, clusters,
    decrease=True, add_im=True, excl=True,
    search_space=0.15, stop=4, n_fold=8, n=3,
)

# Summarise the runs as a dataframe: every entry of the output except
# 'recurring' becomes one column after the transpose (one row per run), and the
# positional column labels 0-3 are replaced by readable names.
df = (
    pd.DataFrame([entry for name, entry in output_fast_excl.items() if name != 'recurring'])
    .T
    .rename(columns=dict(enumerate(['Model', 'Features', 'Clusters', 'Accuracy'])))
)
# 'recurring' is used to index into clusters; the same selection is repeated on every row.
df['Recurring features'] = [np.array(clusters)[output_fast_excl['recurring']]] * df.shape[0]
df

-- Round 1 of 3 --
added cluster 0, new accuracy = 0.6
added cluster 1, new accuracy = 0.66
added cluster 2, new accuracy = 0.76
added cluster 7, new accuracy = 0.78
added cluster 10, new accuracy = 0.8
added cluster 5, new accuracy = 0.82
nothing added
nothing added
nothing added
added cluster 12, new accuracy = 0.86
nothing added
nothing added
nothing added
nothing added
No features were added in 4 rounds. Stop searching for new clusters.
Trying to increase accuracy by removing/replacing clusters...
Removal/replacement of clusters did not increase accuracy.
-- Round 2 of 3 --
added cluster 0, new accuracy = 0.6
added cluster 1, new accuracy = 0.68
added cluster 2, new accuracy = 0.74
added cluster 9, new accuracy = 0.76
added cluster 10, new accuracy = 0.78
nothing added
added cluster 7, new accuracy = 0.8
nothing added
added cluster 15, new accuracy = 0.84
added cluster 13, new accuracy = 0.86
nothing added
nothing added
nothing added
nothing added
No features were added in 4 rounds

Unnamed: 0,Model,Features,Clusters,Accuracy,Recurring features
0,"[[485, 862, 788, 497, 473], [175, 81, 486, 344...","[485, 862, 788, 497, 473, 175, 81, 486, 344, 5...","[0, 1, 2, 7, 10, 5, 12]",0.86,"[[485, 862, 788, 497, 473], [635, 769, 682, 69..."
1,"[[485, 862, 788, 497, 473], [175, 81, 486, 344...","[485, 862, 788, 497, 473, 175, 81, 486, 344, 5...","[0, 1, 2, 9, 10, 7, 15, 13]",0.86,"[[485, 862, 788, 497, 473], [635, 769, 682, 69..."
2,"[[485, 862, 788, 497, 473], [424, 381, 563, 46...","[485, 862, 788, 497, 473, 424, 381, 563, 465, ...","[0, 10, 14, 22, 13, 2, 21, 4, 35]",0.82,"[[485, 862, 788, 497, 473], [635, 769, 682, 69..."
