In [11]:
from protosc import filter_model
from protosc.simulation import create_simulation_data, create_correlated_data
from protosc.filter_model import select_fold, train_xvalidate, select_features
from protosc.final_selection import final_selection
from protosc.wrapper import wrapper, calc_accuracy
from collections import defaultdict
import numpy as np
from sklearn.model_selection import train_test_split

In [12]:
# Seed NumPy's global RNG before generating data (presumably so that
# create_correlated_data() yields the same data on every run -- confirm that it
# draws from the global RNG).
np.random.seed(1928374)
# Simulated data set: features (X), categories (y), and the ground truth
# returned by the generator.
X, y, ground_truth = create_correlated_data()
# Preview only the first five rows of X and the first five entries of y.
print(f'features: {X[:5]}', f'categories: {y[:5]}', sep='\n')

features: [[ 1.35946304 -0.05495788 -1.25547915 ... -0.29250157  0.71268609
  -1.15811878]
 [ 1.16041638  0.11066386  0.96355358 ... -0.74329004  0.1515906
  -0.07321478]
 [ 0.30436521  0.61188844 -0.84293664 ... -0.04545197  1.07970547
  -1.35302247]
 [-0.44400432 -0.55848696  0.62296112 ...  0.09136625 -0.86405888
  -1.28561021]
 [-1.91110816 -2.27302481 -1.0157041  ... -0.01774011  0.13587279
  -0.88981648]]
categories: [0 0 0 1 1]


In [13]:
# Filter step: calculate chi-square scores, rank them, take the cumulative sum
# and scale it, then keep features up to a cut-off on that cumulative score.
# NOTE(review): this comment previously said the cut-off was 95%, but the call
# passes chisq_threshold=0.25 -- confirm which value is intended and what the
# threshold means.
selected_clusters = select_features(
    X,
    y,
    chisq_threshold=0.25,
)
# Report how many clusters were selected and show at most the first ten.
print(f'Selected clusters (n={len(selected_clusters)}): {selected_clusters[:10]}')

Selected clusters (n=15): [485, 862, 788, 497, 473, 175, 81, 486, 344, 591]


In [14]:
# Hold out 20% of the samples as a test set.
# random_state is pinned so that re-running this cell (or running cells out of
# order) always produces the same split; without it the split depends on how
# much of NumPy's global RNG state earlier cells have consumed.
# NOTE(review): none of the cells visible below use X_train/X_test/y_train/
# y_test -- the wrapper is fitted on the full X, y. Confirm whether that is
# intended.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=1928374
)

In [9]:
# Wrapper step: run the wrapper over the clusters kept by the filter step,
# with n_fold=3. It returns the selected features (`model`) and the accuracy
# reached with them (`final_accuracy`).
# Judging by the log it prints, it appears to try adding clusters one at a time
# and keep those that raise the accuracy -- see protosc.wrapper to confirm.
# NOTE(review): the full X, y are passed here, not X_train/y_train from the
# split cell -- confirm this is intended.
model, final_accuracy = wrapper(
    X,
    y,
    selected_clusters,
    n_fold=3,
)

model: 485, selected [0]
rest = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14]
selection = [485 862]
added 1, new accuracy = 0.5862068965517241
selection = [485 788]
selection = [485 497]
selection = [485 473]
added 4, new accuracy = 0.603448275862069
selection = [485 175]
added 5, new accuracy = 0.6724137931034483
selection = [485  81]
added 6, new accuracy = 0.6896551724137931
selection = [485 486]
selection = [485 344]
selection = [485 591]
selection = [485 635]
selection = [485 769]
selection = [485 682]
selection = [485 693]
selection = [485 863]
model: [485 862 473 175  81 788], selected [0, 1, 4, 5, 6]
rest = [3, 7, 8, 9, 10, 11, 12, 13, 14]
selection = [485 862 473 175  81 788 497]
selection = [485 862 473 175  81 788 486]
added 7, new accuracy = 0.7068965517241379
selection = [485 862 473 175  81 788 344]
selection = [485 862 473 175  81 788 591]
selection = [485 862 473 175  81 788 635]
added 10, new accuracy = 0.7413793103448276
selection = [485 862 473 175  81 788 769]
sel

In [10]:
# Report the outcome of the wrapper step: the final accuracy and the selected
# features, one per line.
print(
    f'Final accuracy = {final_accuracy}',
    f'Selected features = {model}',
    sep='\n',
)

Final accuracy = 0.7931034482758621
Selected features = [485 862 473 175  81 486 635 863 497]
