In [1]:
from protosc import filter_model
from protosc.simulation import create_simulation_data, create_correlated_data, compare_results
from protosc.filter_model import train_xvalidate, select_features
from protosc.final_selection import final_selection
from protosc.wrapper import wrapper, calc_accuracy
from collections import defaultdict
import numpy as np
from sklearn.model_selection import train_test_split

In [2]:
# Generate the example data set: features (X), categories (y) and the
# ground truth that compare_results() is given further down.
# The global NumPy seed is fixed first (presumably so the simulated data is
# the same on every run).
np.random.seed(1928374)
X, y, ground_truth = create_correlated_data()

# Show only the first five rows/labels to keep the output short.
print(f'features: {X[:5]}', f'categories: {y[:5]}', sep='\n')

features: [[ 1.35946304 -0.05495788 -1.25547915 ... -0.29250157  0.71268609
  -1.15811878]
 [ 1.16041638  0.11066386  0.96355358 ... -0.74329004  0.1515906
  -0.07321478]
 [ 0.30436521  0.61188844 -0.84293664 ... -0.04545197  1.07970547
  -1.35302247]
 [-0.44400432 -0.55848696  0.62296112 ...  0.09136625 -0.86405888
  -1.28561021]
 [-1.91110816 -2.27302481 -1.0157041  ... -0.01774011  0.13587279
  -0.88981648]]
categories: [0 0 0 1 1]


In [3]:
# Group correlating features into clusters and pick the features for the filter model.
# select_features() hands back the selection first and the full cluster list second.
selected_clusters, clusters = select_features(X, y, chisq_threshold=0.25)

# Report the number of clusters (showing the first ten) and the selected features.
print(
    f'Clusters (n={len(clusters)}): {clusters[:10]}',
    f'Selected features (n={len(selected_clusters)}): {selected_clusters}',
    sep='\n',
)

Clusters (n=200): [[485, 862, 788, 497, 473], [175, 81, 486, 344, 591], [635, 769, 682, 693, 863], [451, 251, 468, 176, 766], [597, 684, 12, 984, 950], [367, 983, 0, 215, 702], [191, 495, 60, 455, 261], [496, 955, 193, 903, 934], [479, 567, 107, 834, 511], [302, 124, 732, 899, 867]]
Selected features (n=20): [485, 862, 788, 497, 473, 175, 81, 486, 344, 591, 635, 769, 682, 693, 863, 451, 251, 468, 176, 766]


In [4]:
# Split data into training data (80%) and test data (20%).
# NOTE(review): no random_state is passed, so the split relies on NumPy's
# global RNG state (seeded in the data-creation cell) — it is only reproducible
# when the notebook is run top to bottom.
# NOTE(review): the wrapper() calls visible in this notebook receive the full
# X, y rather than X_train, y_train — confirm whether this split is meant to be used.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

Options for wrapper():
* X: np.array, features
* y: np.array, categories (1/0)
* clusters: np.array, clusters of correlating features
* decrease: boolean, if True clusters are ranked from high to low chi-square scores, if False from low to high
* add_im: boolean, if True clusters are immediately added to model if they increase the accuracy, if False it only adds the cluster with the highest accuracy increase
* search_space: float, fraction of the clusters that will be used to select clusters from (e.g. 0.15 = 15% of the clusters)
* stop: int, max number of rounds where no clusters can be added, after which looping will stop
* n_fold: int, number of folds (used for calculating accuracy)

### 'Slow' method
Select clusters that increase accuracy, but per loop only add the cluster with the *highest* impact.
* decrease: True
* add_im: False
* search_space: 0.15
* stop: 4
* n_fold: 3

In [5]:
# 'Slow' search: per loop, only the cluster with the highest impact is added.
model, selected, final_accuracy = wrapper(
    X,
    y,
    clusters,
    decrease=True,      # rank clusters from high to low chi-square score
    add_im=False,       # add only the cluster with the highest accuracy increase
    search_space=0.15,  # share of the clusters to select from
    stop=4,             # max number of rounds in which no cluster is added
    n_fold=3,           # number of folds used for calculating accuracy
)

selected clusters: [0]
added cluster 1, new accuracy = 0.6851851851851852
selected clusters: [0, 1]
added cluster 6, new accuracy = 0.7345679012345679
selected clusters: [0, 1, 6]
selected clusters: [0, 1, 6]
selected clusters: [0, 1, 6]
selected clusters: [0, 1, 6]
added cluster 17, new accuracy = 0.7530864197530864
selected clusters: [0, 1, 6, 17]
selected clusters: [0, 1, 6, 17]
added cluster 19, new accuracy = 0.7592592592592593
selected clusters: [0, 1, 6, 17, 19]
selected clusters: [0, 1, 6, 17, 19]
selected clusters: [0, 1, 6, 17, 19]
selected clusters: [0, 1, 6, 17, 19]
No features were added in 4 rounds. Stop searching.


In [6]:
# Summarise the outcome of the 'slow' search in one go.
print(
    f'Selected clusters = {selected}',
    f'Selected features = {model}',
    f'Final accuracy = {final_accuracy}',
    sep='\n',
)

# Compare the selected features against the ground truth of the generated data.
compare_results(model, ground_truth)

Selected clusters = [0, 1, 6, 17, 19]
Selected features = [485 862 788 497 473 175  81 486 344 591 191 495  60 455 261 895 555  19
 768 860 811  84 722 117  28 424 381 563 465 203 216  80 293 379 836 663
 106  27 371 528  38 587 324 521 407]
Final accuracy = 0.7592592592592593
Percentage of features correct: 0.3333333333333333
Percentage of features found: 0.3
Percentage of bias found: 0.36296296296296293


### 'Fast' method
Select clusters that increase accuracy; when one cluster increases accuracy, immediately add it to the model and continue with this new model.
* decrease: True
* add_im: True
* search_space: 0.15
* stop: 4
* n_fold: 3

In [7]:
# 'Fast' search: a cluster is added as soon as it increases the accuracy.
model_f, selected_f, final_accuracy_f = wrapper(
    X,
    y,
    clusters,
    decrease=True,      # rank clusters from high to low chi-square score
    add_im=True,        # immediately add any cluster that increases accuracy
    search_space=0.15,  # share of the clusters to select from
    stop=4,             # max number of rounds in which no cluster is added
    n_fold=3,           # number of folds used for calculating accuracy
)

selected clusters: [0]
added cluster 1, new accuracy = 0.6666666666666666
selected clusters: [0, 1]
added cluster 2, new accuracy = 0.6975308641975309
selected clusters: [0, 1, 2]
added cluster 4, new accuracy = 0.7160493827160493
selected clusters: [0, 1, 2, 4]
added cluster 9, new accuracy = 0.7283950617283951
selected clusters: [0, 1, 2, 4, 9]
added cluster 11, new accuracy = 0.7345679012345679
selected clusters: [0, 1, 2, 4, 9, 11]
added cluster 13, new accuracy = 0.7469135802469136
selected clusters: [0, 1, 2, 4, 9, 11, 13]
added cluster 29, new accuracy = 0.7716049382716049
selected clusters: [0, 1, 2, 4, 9, 11, 13, 29]
selected clusters: [0, 1, 2, 4, 9, 11, 13, 29]
added cluster 7, new accuracy = 0.7777777777777778
selected clusters: [0, 1, 2, 4, 9, 11, 13, 29, 7]
selected clusters: [0, 1, 2, 4, 9, 11, 13, 29, 7]
selected clusters: [0, 1, 2, 4, 9, 11, 13, 29, 7]
selected clusters: [0, 1, 2, 4, 9, 11, 13, 29, 7]
selected clusters: [0, 1, 2, 4, 9, 11, 13, 29, 7]
No features were a

In [8]:
# Summarise the outcome of the 'fast' search in one go.
print(
    f'Selected clusters = {selected_f}',
    f'Selected features = {model_f}',
    f'Final accuracy = {final_accuracy_f}',
    sep='\n',
)

Selected clusters = [0, 1, 2, 4, 9, 11, 13, 29, 7]
Selected features = [485 862 788 497 473 175  81 486 344 591 635 769 682 693 863 597 684  12
 984 950 302 124 732 899 867 216  80 293 379 836  38 587 324 521 407 830
 915 631 842 432 496 955 193 903 934 367 983   0 215 702 191 495  60 455
 261 479 567 107 834 511 424 381 563 465 203]
Final accuracy = 0.7777777777777778


In [9]:
# Compare the features selected by the 'fast' method against the ground truth
# returned by create_correlated_data().
compare_results(model_f, ground_truth)

Percentage of features correct: 0.6923076923076923
Percentage of features found: 0.9
Percentage of bias found: 0.8814814814814815
